summaryrefslogtreecommitdiffstats
path: root/drivers/vdpa
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/vdpa')
-rw-r--r--drivers/vdpa/Kconfig129
-rw-r--r--drivers/vdpa/Makefile10
-rw-r--r--drivers/vdpa/alibaba/Makefile3
-rw-r--r--drivers/vdpa/alibaba/eni_vdpa.c551
-rw-r--r--drivers/vdpa/ifcvf/Makefile3
-rw-r--r--drivers/vdpa/ifcvf/ifcvf_base.c432
-rw-r--r--drivers/vdpa/ifcvf/ifcvf_base.h134
-rw-r--r--drivers/vdpa/ifcvf/ifcvf_main.c882
-rw-r--r--drivers/vdpa/mlx5/Makefile4
-rw-r--r--drivers/vdpa/mlx5/core/mlx5_vdpa.h136
-rw-r--r--drivers/vdpa/mlx5/core/mr.c621
-rw-r--r--drivers/vdpa/mlx5/core/resources.c323
-rw-r--r--drivers/vdpa/mlx5/net/debug.c153
-rw-r--r--drivers/vdpa/mlx5/net/mlx5_vnet.c3620
-rw-r--r--drivers/vdpa/mlx5/net/mlx5_vnet.h118
-rw-r--r--drivers/vdpa/pds/Makefile9
-rw-r--r--drivers/vdpa/pds/aux_drv.c140
-rw-r--r--drivers/vdpa/pds/aux_drv.h26
-rw-r--r--drivers/vdpa/pds/cmds.c185
-rw-r--r--drivers/vdpa/pds/cmds.h18
-rw-r--r--drivers/vdpa/pds/debugfs.c286
-rw-r--r--drivers/vdpa/pds/debugfs.h17
-rw-r--r--drivers/vdpa/pds/vdpa_dev.c844
-rw-r--r--drivers/vdpa/pds/vdpa_dev.h50
-rw-r--r--drivers/vdpa/solidrun/Makefile7
-rw-r--r--drivers/vdpa/solidrun/snet_ctrl.c336
-rw-r--r--drivers/vdpa/solidrun/snet_hwmon.c188
-rw-r--r--drivers/vdpa/solidrun/snet_main.c1132
-rw-r--r--drivers/vdpa/solidrun/snet_vdpa.h209
-rw-r--r--drivers/vdpa/vdpa.c1329
-rw-r--r--drivers/vdpa/vdpa_sim/Makefile4
-rw-r--r--drivers/vdpa/vdpa_sim/vdpa_sim.c807
-rw-r--r--drivers/vdpa/vdpa_sim/vdpa_sim.h121
-rw-r--r--drivers/vdpa/vdpa_sim/vdpa_sim_blk.c527
-rw-r--r--drivers/vdpa/vdpa_sim/vdpa_sim_net.c564
-rw-r--r--drivers/vdpa/vdpa_user/Makefile5
-rw-r--r--drivers/vdpa/vdpa_user/iova_domain.c624
-rw-r--r--drivers/vdpa/vdpa_user/iova_domain.h80
-rw-r--r--drivers/vdpa/vdpa_user/vduse_dev.c2171
-rw-r--r--drivers/vdpa/virtio_pci/Makefile2
-rw-r--r--drivers/vdpa/virtio_pci/vp_vdpa.c667
41 files changed, 17467 insertions, 0 deletions
diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
new file mode 100644
index 0000000000..656c1cb541
--- /dev/null
+++ b/drivers/vdpa/Kconfig
@@ -0,0 +1,129 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menuconfig VDPA
+ tristate "vDPA drivers"
+ depends on NET
+ help
+ Enable this module to support vDPA device that uses a
+ datapath which complies with virtio specifications with
+ vendor specific control path.
+
+if VDPA
+
+config VDPA_SIM
+ tristate "vDPA device simulator core"
+ depends on RUNTIME_TESTING_MENU && HAS_DMA
+ select DMA_OPS
+ select VHOST_RING
+ select IOMMU_IOVA
+ help
+ Enable this module to support vDPA device simulators. These devices
+ are used for testing, prototyping and development of vDPA.
+
+config VDPA_SIM_NET
+ tristate "vDPA simulator for networking device"
+ depends on VDPA_SIM
+ select GENERIC_NET_UTILS
+ help
+ vDPA networking device simulator which loops TX traffic back to RX.
+
+config VDPA_SIM_BLOCK
+ tristate "vDPA simulator for block device"
+ depends on VDPA_SIM
+ help
+ vDPA block device simulator which terminates IO request in a
+ memory buffer.
+
+config VDPA_USER
+ tristate "VDUSE (vDPA Device in Userspace) support"
+ depends on EVENTFD && MMU && HAS_DMA
+ select DMA_OPS
+ select VHOST_IOTLB
+ select IOMMU_IOVA
+ help
+ With VDUSE it is possible to emulate a vDPA Device
+ in a userspace program.
+
+config IFCVF
+ tristate "Intel IFC VF vDPA driver"
+ depends on PCI_MSI
+ help
+ This kernel module can drive Intel IFC VF NIC to offload
+ virtio dataplane traffic to hardware.
+ To compile this driver as a module, choose M here: the module will
+ be called ifcvf.
+
+config MLX5_VDPA
+ bool
+ select VHOST_IOTLB
+ help
+ Support library for Mellanox VDPA drivers. Provides code that is
+ common for all types of VDPA drivers. The following drivers are planned:
+ net, block.
+
+config MLX5_VDPA_NET
+ tristate "vDPA driver for ConnectX devices"
+ select MLX5_VDPA
+ select VHOST_RING
+ depends on MLX5_CORE
+ help
+ VDPA network driver for ConnectX6 and newer. Provides offloading
+ of virtio net datapath such that descriptors put on the ring will
+ be executed by the hardware. It also supports a variety of stateless
+ offloads depending on the actual device used and firmware version.
+
+config MLX5_VDPA_STEERING_DEBUG
+ bool "expose steering counters on debugfs"
+ select MLX5_VDPA
+ help
+ Expose RX steering counters in debugfs to aid in debugging. For each VLAN
+ or non VLAN interface, two hardware counters are added to the RX flow
+ table: one for unicast and one for multicast.
+ The counters count the number of packets and bytes and expose them in
+ debugfs. One can read the counters using, e.g.:
+ cat /sys/kernel/debug/mlx5/mlx5_core.sf.1/vdpa-0/rx/untagged/ucast/packets
+ cat /sys/kernel/debug/mlx5/mlx5_core.sf.1/vdpa-0/rx/untagged/mcast/bytes
+
+config VP_VDPA
+ tristate "Virtio PCI bridge vDPA driver"
+ select VIRTIO_PCI_LIB
+ depends on PCI_MSI
+ help
+ This kernel module bridges virtio PCI device to vDPA bus.
+
+config ALIBABA_ENI_VDPA
+ tristate "vDPA driver for Alibaba ENI"
+ select VIRTIO_PCI_LIB_LEGACY
+ depends on PCI_MSI && X86
+ help
+ VDPA driver for Alibaba ENI (Elastic Network Interface) which is built upon
+ virtio 0.9.5 specification.
+
+ config SNET_VDPA
+ tristate "SolidRun's vDPA driver for SolidNET"
+ depends on PCI_MSI && PCI_IOV && (HWMON || HWMON=n)
+
+ # This driver MAY create a HWMON device.
+ # Depending on (HWMON || HWMON=n) ensures that:
+ # If HWMON=n the driver can be compiled either as a module or built-in.
+ # If HWMON=y the driver can be compiled either as a module or built-in.
+ # If HWMON=m the driver is forced to be compiled as a module.
+ # By doing so, IS_ENABLED can be used instead of IS_REACHABLE
+
+ help
+ vDPA driver for SolidNET DPU.
+ With this driver, the VirtIO dataplane can be
+ offloaded to a SolidNET DPU.
+ This driver includes a HW monitor device that
+ reads health values from the DPU.
+
+config PDS_VDPA
+ tristate "vDPA driver for AMD/Pensando DSC devices"
+ select VIRTIO_PCI_LIB
+ depends on PCI_MSI
+ depends on PDS_CORE
+ help
+ vDPA network driver for AMD/Pensando's PDS Core devices.
+ With this driver, the VirtIO dataplane can be
+ offloaded to an AMD/Pensando DSC device.
+
+endif # VDPA
diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile
new file mode 100644
index 0000000000..8f53c6f3cc
--- /dev/null
+++ b/drivers/vdpa/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_VDPA) += vdpa.o
+obj-$(CONFIG_VDPA_SIM) += vdpa_sim/
+obj-$(CONFIG_VDPA_USER) += vdpa_user/
+obj-$(CONFIG_IFCVF) += ifcvf/
+obj-$(CONFIG_MLX5_VDPA) += mlx5/
+obj-$(CONFIG_VP_VDPA) += virtio_pci/
+obj-$(CONFIG_ALIBABA_ENI_VDPA) += alibaba/
+obj-$(CONFIG_SNET_VDPA) += solidrun/
+obj-$(CONFIG_PDS_VDPA) += pds/
diff --git a/drivers/vdpa/alibaba/Makefile b/drivers/vdpa/alibaba/Makefile
new file mode 100644
index 0000000000..ef4aae69f8
--- /dev/null
+++ b/drivers/vdpa/alibaba/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_ALIBABA_ENI_VDPA) += eni_vdpa.o
+
diff --git a/drivers/vdpa/alibaba/eni_vdpa.c b/drivers/vdpa/alibaba/eni_vdpa.c
new file mode 100644
index 0000000000..cce3d18371
--- /dev/null
+++ b/drivers/vdpa/alibaba/eni_vdpa.c
@@ -0,0 +1,551 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vDPA bridge driver for Alibaba ENI(Elastic Network Interface)
+ *
+ * Copyright (c) 2021, Alibaba Inc. All rights reserved.
+ * Author: Wu Zongyong <wuzongyong@linux.alibaba.com>
+ *
+ */
+
+#include "linux/bits.h"
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/vdpa.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_pci.h>
+#include <linux/virtio_pci_legacy.h>
+#include <uapi/linux/virtio_net.h>
+
+#define ENI_MSIX_NAME_SIZE 256
+
+#define ENI_ERR(pdev, fmt, ...) \
+ dev_err(&pdev->dev, "%s"fmt, "eni_vdpa: ", ##__VA_ARGS__)
+#define ENI_DBG(pdev, fmt, ...) \
+ dev_dbg(&pdev->dev, "%s"fmt, "eni_vdpa: ", ##__VA_ARGS__)
+#define ENI_INFO(pdev, fmt, ...) \
+ dev_info(&pdev->dev, "%s"fmt, "eni_vdpa: ", ##__VA_ARGS__)
+
+/* Per-virtqueue state kept by the driver. */
+struct eni_vring {
+ void __iomem *notify; /* address the queue index is written to in order to kick the queue */
+ char msix_name[ENI_MSIX_NAME_SIZE]; /* name passed to devm_request_irq() for this queue */
+ struct vdpa_callback cb; /* callback invoked from the queue's interrupt handler */
+ int irq; /* IRQ number, or VIRTIO_MSI_NO_VECTOR while none is requested */
+};
+
+/* Driver state for one ENI device; embeds its vDPA and legacy virtio-pci devices. */
+struct eni_vdpa {
+ struct vdpa_device vdpa;
+ struct virtio_pci_legacy_device ldev;
+ struct eni_vring *vring; /* array of 'queues' entries, allocated in eni_vdpa_probe() */
+ struct vdpa_callback config_cb; /* callback invoked from the config interrupt handler */
+ char msix_name[ENI_MSIX_NAME_SIZE]; /* name passed to devm_request_irq() for the config IRQ */
+ int config_irq; /* config IRQ number, or VIRTIO_MSI_NO_VECTOR while none is requested */
+ int queues; /* number of virtqueues, computed by eni_vdpa_get_num_queues() */
+ int vectors; /* number of allocated MSI-X vectors, 0 when none are allocated */
+};
+
+/* Map an embedded vdpa_device back to the eni_vdpa that contains it. */
+static struct eni_vdpa *vdpa_to_eni(struct vdpa_device *vdpa)
+{
+	struct eni_vdpa *eni = container_of(vdpa, struct eni_vdpa, vdpa);
+
+	return eni;
+}
+
+/* Return the legacy virtio-pci device of the eni_vdpa that embeds @vdpa. */
+static struct virtio_pci_legacy_device *vdpa_to_ldev(struct vdpa_device *vdpa)
+{
+	return &vdpa_to_eni(vdpa)->ldev;
+}
+
+/*
+ * Report the feature bits read from the legacy device, with
+ * VIRTIO_F_ACCESS_PLATFORM and VIRTIO_F_ORDER_PLATFORM always set.
+ */
+static u64 eni_vdpa_get_device_features(struct vdpa_device *vdpa)
+{
+	u64 hw_features = vp_legacy_get_features(vdpa_to_ldev(vdpa));
+
+	return hw_features |
+	       BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |
+	       BIT_ULL(VIRTIO_F_ORDER_PLATFORM);
+}
+
+/*
+ * Write the low 32 bits of the driver's feature set to the legacy device.
+ *
+ * Returns -EINVAL when a non-empty feature set lacks VIRTIO_NET_F_MRG_RXBUF,
+ * 0 otherwise.
+ */
+static int eni_vdpa_set_driver_features(struct vdpa_device *vdpa, u64 features)
+{
+	struct virtio_pci_legacy_device *legacy = vdpa_to_ldev(vdpa);
+	bool has_mrg_rxbuf = features & BIT_ULL(VIRTIO_NET_F_MRG_RXBUF);
+
+	/* An all-zero feature set is let through unchecked. */
+	if (features && !has_mrg_rxbuf) {
+		ENI_ERR(legacy->pci_dev,
+			"VIRTIO_NET_F_MRG_RXBUF is not negotiated\n");
+		return -EINVAL;
+	}
+
+	vp_legacy_set_features(legacy, (u32)features);
+
+	return 0;
+}
+
+/* Return the driver feature bits as reported by the legacy virtio-pci helper. */
+static u64 eni_vdpa_get_driver_features(struct vdpa_device *vdpa)
+{
+	return vp_legacy_get_driver_features(vdpa_to_ldev(vdpa));
+}
+
+/* Return the device status as reported by the legacy virtio-pci helper. */
+static u8 eni_vdpa_get_status(struct vdpa_device *vdpa)
+{
+	return vp_legacy_get_status(vdpa_to_ldev(vdpa));
+}
+
+/*
+ * Return the IRQ number recorded for virtqueue @idx, or -EINVAL when the
+ * queue currently has no IRQ (its irq field is VIRTIO_MSI_NO_VECTOR).
+ */
+static int eni_vdpa_get_vq_irq(struct vdpa_device *vdpa, u16 idx)
+{
+	int irq = vdpa_to_eni(vdpa)->vring[idx].irq;
+
+	return irq == VIRTIO_MSI_NO_VECTOR ? -EINVAL : irq;
+}
+
+/*
+ * Undo eni_vdpa_request_irq(): for every queue that holds an IRQ, unbind its
+ * vector in the device and free the IRQ; do the same for the config IRQ;
+ * finally release the PCI IRQ vectors if any were allocated.
+ * Entries that hold no IRQ are skipped, so partial setups are handled.
+ */
+static void eni_vdpa_free_irq(struct eni_vdpa *eni_vdpa)
+{
+	struct virtio_pci_legacy_device *legacy = &eni_vdpa->ldev;
+	struct pci_dev *pdev = legacy->pci_dev;
+	int qid;
+
+	for (qid = 0; qid < eni_vdpa->queues; qid++) {
+		struct eni_vring *vq = &eni_vdpa->vring[qid];
+
+		if (vq->irq == VIRTIO_MSI_NO_VECTOR)
+			continue;
+
+		vp_legacy_queue_vector(legacy, qid, VIRTIO_MSI_NO_VECTOR);
+		devm_free_irq(&pdev->dev, vq->irq, vq);
+		vq->irq = VIRTIO_MSI_NO_VECTOR;
+	}
+
+	if (eni_vdpa->config_irq != VIRTIO_MSI_NO_VECTOR) {
+		vp_legacy_config_vector(legacy, VIRTIO_MSI_NO_VECTOR);
+		devm_free_irq(&pdev->dev, eni_vdpa->config_irq, eni_vdpa);
+		eni_vdpa->config_irq = VIRTIO_MSI_NO_VECTOR;
+	}
+
+	if (eni_vdpa->vectors) {
+		pci_free_irq_vectors(pdev);
+		eni_vdpa->vectors = 0;
+	}
+}
+
+/*
+ * Virtqueue interrupt handler; @arg is the queue's struct eni_vring.
+ * Forwards to the registered callback and returns its result, or returns
+ * IRQ_HANDLED when no callback is set.
+ */
+static irqreturn_t eni_vdpa_vq_handler(int irq, void *arg)
+{
+	struct eni_vring *vq = arg;
+
+	if (!vq->cb.callback)
+		return IRQ_HANDLED;
+
+	return vq->cb.callback(vq->cb.private);
+}
+
+/*
+ * Config interrupt handler; @arg is the struct eni_vdpa.
+ * Forwards to the registered config callback and returns its result, or
+ * returns IRQ_HANDLED when no callback is set.
+ */
+static irqreturn_t eni_vdpa_config_handler(int irq, void *arg)
+{
+	struct eni_vdpa *eni = arg;
+
+	if (!eni->config_cb.callback)
+		return IRQ_HANDLED;
+
+	return eni->config_cb.callback(eni->config_cb.private);
+}
+
+/*
+ * Allocate one MSI-X vector per virtqueue plus one for config changes,
+ * request an IRQ for each and bind the vectors in the device.
+ *
+ * Vector i serves virtqueue i; the last vector (index 'queues') serves the
+ * config interrupt. On any failure everything set up so far is released
+ * through eni_vdpa_free_irq().
+ *
+ * Returns 0 on success, otherwise the value returned by the failing call.
+ */
+static int eni_vdpa_request_irq(struct eni_vdpa *eni_vdpa)
+{
+	struct virtio_pci_legacy_device *ldev = &eni_vdpa->ldev;
+	struct pci_dev *pdev = ldev->pci_dev;
+	int i, ret, irq;
+	int queues = eni_vdpa->queues;
+	int vectors = queues + 1;
+
+	ret = pci_alloc_irq_vectors(pdev, vectors, vectors, PCI_IRQ_MSIX);
+	if (ret != vectors) {
+		ENI_ERR(pdev,
+			"failed to allocate irq vectors want %d but %d\n",
+			vectors, ret);
+		return ret;
+	}
+
+	eni_vdpa->vectors = vectors;
+
+	for (i = 0; i < queues; i++) {
+		/*
+		 * No trailing newline: this string is the IRQ's name, not a
+		 * log line.
+		 */
+		snprintf(eni_vdpa->vring[i].msix_name, ENI_MSIX_NAME_SIZE,
+			 "eni-vdpa[%s]-%d", pci_name(pdev), i);
+		irq = pci_irq_vector(pdev, i);
+		ret = devm_request_irq(&pdev->dev, irq,
+				       eni_vdpa_vq_handler,
+				       0, eni_vdpa->vring[i].msix_name,
+				       &eni_vdpa->vring[i]);
+		if (ret) {
+			ENI_ERR(pdev, "failed to request irq for vq %d\n", i);
+			goto err;
+		}
+		vp_legacy_queue_vector(ldev, i, i);
+		eni_vdpa->vring[i].irq = irq;
+	}
+
+	snprintf(eni_vdpa->msix_name, ENI_MSIX_NAME_SIZE, "eni-vdpa[%s]-config",
+		 pci_name(pdev));
+	irq = pci_irq_vector(pdev, queues);
+	ret = devm_request_irq(&pdev->dev, irq, eni_vdpa_config_handler, 0,
+			       eni_vdpa->msix_name, eni_vdpa);
+	if (ret) {
+		ENI_ERR(pdev, "failed to request irq for config vq %d\n", i);
+		goto err;
+	}
+	vp_legacy_config_vector(ldev, queues);
+	eni_vdpa->config_irq = irq;
+
+	return 0;
+err:
+	eni_vdpa_free_irq(eni_vdpa);
+	return ret;
+}
+
+/*
+ * Write @status to the device, managing IRQs around DRIVER_OK transitions:
+ * IRQs are requested before the write when DRIVER_OK becomes set, and freed
+ * after the write when DRIVER_OK becomes clear.
+ * The result of eni_vdpa_request_irq() is not checked here.
+ */
+static void eni_vdpa_set_status(struct vdpa_device *vdpa, u8 status)
+{
+	struct eni_vdpa *eni = vdpa_to_eni(vdpa);
+	bool was_ok = eni_vdpa_get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK;
+	bool will_be_ok = status & VIRTIO_CONFIG_S_DRIVER_OK;
+
+	if (will_be_ok && !was_ok)
+		eni_vdpa_request_irq(eni);
+
+	vp_legacy_set_status(&eni->ldev, status);
+
+	if (was_ok && !will_be_ok)
+		eni_vdpa_free_irq(eni);
+}
+
+/*
+ * Reset the device by writing status 0, then free the IRQs if the device
+ * was in DRIVER_OK before the reset. Always returns 0.
+ */
+static int eni_vdpa_reset(struct vdpa_device *vdpa)
+{
+	struct eni_vdpa *eni = vdpa_to_eni(vdpa);
+	bool was_ok = eni_vdpa_get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK;
+
+	vp_legacy_set_status(&eni->ldev, 0);
+
+	if (was_ok)
+		eni_vdpa_free_irq(eni);
+
+	return 0;
+}
+
+/* Maximum virtqueue size: the size the device reports for queue 0. */
+static u16 eni_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
+{
+	return vp_legacy_get_queue_size(vdpa_to_ldev(vdpa), 0);
+}
+
+/* Minimum virtqueue size: the size the device reports for queue 0. */
+static u16 eni_vdpa_get_vq_num_min(struct vdpa_device *vdpa)
+{
+	return vp_legacy_get_queue_size(vdpa_to_ldev(vdpa), 0);
+}
+
+/* Reading back virtqueue state is not implemented; always returns -EOPNOTSUPP. */
+static int eni_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 qid,
+ struct vdpa_vq_state *state)
+{
+ return -EOPNOTSUPP;
+}
+
+/*
+ * Accept a virtqueue state only when it cannot differ from the initial one:
+ * the queue is not enabled and the requested split avail index is 0.
+ * Returns 0 in that case, -EOPNOTSUPP otherwise.
+ */
+static int eni_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 qid,
+				 const struct vdpa_vq_state *state)
+{
+	struct virtio_pci_legacy_device *legacy = vdpa_to_ldev(vdpa);
+
+	/* ENI is built upon the virtio-pci specification, which does not
+	 * support setting the state of a virtqueue. But if the state happens
+	 * to equal the device's initial state, we can let it go.
+	 */
+	if (vp_legacy_get_queue_enable(legacy, qid))
+		return -EOPNOTSUPP;
+
+	if (state->split.avail_index != 0)
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+
+/* Store a copy of @cb as the interrupt callback of virtqueue @qid. */
+static void eni_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 qid,
+			       struct vdpa_callback *cb)
+{
+	struct eni_vring *vq = &vdpa_to_eni(vdpa)->vring[qid];
+
+	vq->cb = *cb;
+}
+
+/*
+ * Only the "not ready" direction does anything: the queue is disabled by
+ * setting its address to 0. A request to mark the queue ready is a no-op.
+ */
+static void eni_vdpa_set_vq_ready(struct vdpa_device *vdpa, u16 qid,
+				  bool ready)
+{
+	if (ready)
+		return;
+
+	/* ENI is a legacy virtio-pci device, and the specification has no
+	 * queue-ready control. A virtqueue can still be disabled by setting
+	 * its address to 0.
+	 */
+	vp_legacy_set_queue_address(vdpa_to_ldev(vdpa), qid, 0);
+}
+
+/* Report whether virtqueue @qid is enabled according to the legacy helper. */
+static bool eni_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 qid)
+{
+	return vp_legacy_get_queue_enable(vdpa_to_ldev(vdpa), qid);
+}
+
+/*
+ * The queue size cannot be changed; this only logs an error when @num
+ * differs from the size the device reports for @qid.
+ */
+static void eni_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 qid,
+				u32 num)
+{
+	struct virtio_pci_legacy_device *legacy = vdpa_to_ldev(vdpa);
+	struct pci_dev *pdev = legacy->pci_dev;
+	u16 fixed = vp_legacy_get_queue_size(legacy, qid);
+
+	if (num == fixed)
+		return;
+
+	/* ENI is a legacy virtio-pci device, which does not allow the
+	 * virtqueue size to be changed. Just report an error if someone
+	 * tries to change it.
+	 */
+	ENI_ERR(pdev,
+		"not support to set vq %u fixed num %u to %u\n",
+		qid, fixed, num);
+}
+
+/*
+ * Program the queue address from @desc_area only: it is shifted right by
+ * VIRTIO_PCI_QUEUE_ADDR_SHIFT and truncated to 32 bits. @driver_area and
+ * @device_area are not used. Always returns 0.
+ */
+static int eni_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 qid,
+				   u64 desc_area, u64 driver_area,
+				   u64 device_area)
+{
+	u32 queue_pfn = desc_area >> VIRTIO_PCI_QUEUE_ADDR_SHIFT;
+
+	vp_legacy_set_queue_address(vdpa_to_ldev(vdpa), qid, queue_pfn);
+
+	return 0;
+}
+
+/* Notify the device about virtqueue @qid by writing @qid to its notify address. */
+static void eni_vdpa_kick_vq(struct vdpa_device *vdpa, u16 qid)
+{
+	struct eni_vring *vq = &vdpa_to_eni(vdpa)->vring[qid];
+
+	iowrite16(qid, vq->notify);
+}
+
+/* Return the device ID recorded in the legacy device's id field. */
+static u32 eni_vdpa_get_device_id(struct vdpa_device *vdpa)
+{
+	return vdpa_to_ldev(vdpa)->id.device;
+}
+
+/* Return the vendor ID recorded in the legacy device's id field. */
+static u32 eni_vdpa_get_vendor_id(struct vdpa_device *vdpa)
+{
+	return vdpa_to_ldev(vdpa)->id.vendor;
+}
+
+/* Virtqueue alignment is the fixed VIRTIO_PCI_VRING_ALIGN. */
+static u32 eni_vdpa_get_vq_align(struct vdpa_device *vdpa)
+{
+ return VIRTIO_PCI_VRING_ALIGN;
+}
+
+/* The device config space is reported as sizeof(struct virtio_net_config). */
+static size_t eni_vdpa_get_config_size(struct vdpa_device *vdpa)
+{
+ return sizeof(struct virtio_net_config);
+}
+
+
+/*
+ * Copy @len bytes of device config, starting at @offset, into @buf one byte
+ * at a time. The config area starts at VIRTIO_PCI_CONFIG_OFF(vectors) in the
+ * legacy I/O region, so its position depends on the current vector count.
+ */
+static void eni_vdpa_get_config(struct vdpa_device *vdpa,
+				unsigned int offset,
+				void *buf, unsigned int len)
+{
+	struct eni_vdpa *eni = vdpa_to_eni(vdpa);
+	void __iomem *src = eni->ldev.ioaddr +
+			    VIRTIO_PCI_CONFIG_OFF(eni->vectors) +
+			    offset;
+	u8 *dst = buf;
+	int i;
+
+	for (i = 0; i < len; i++)
+		dst[i] = ioread8(src + i);
+}
+
+/*
+ * Write @len bytes from @buf into device config, starting at @offset, one
+ * byte at a time. The config area starts at VIRTIO_PCI_CONFIG_OFF(vectors)
+ * in the legacy I/O region.
+ */
+static void eni_vdpa_set_config(struct vdpa_device *vdpa,
+				unsigned int offset, const void *buf,
+				unsigned int len)
+{
+	struct eni_vdpa *eni = vdpa_to_eni(vdpa);
+	void __iomem *dst = eni->ldev.ioaddr +
+			    VIRTIO_PCI_CONFIG_OFF(eni->vectors) +
+			    offset;
+	const u8 *src = buf;
+	int i;
+
+	for (i = 0; i < len; i++)
+		iowrite8(src[i], dst + i);
+}
+
+/* Store a copy of @cb as the callback run by the config interrupt handler. */
+static void eni_vdpa_set_config_cb(struct vdpa_device *vdpa,
+				   struct vdpa_callback *cb)
+{
+	vdpa_to_eni(vdpa)->config_cb = *cb;
+}
+
+/*
+ * vDPA config operations implemented by this driver; handed to
+ * vdpa_alloc_device() in eni_vdpa_probe().
+ */
+static const struct vdpa_config_ops eni_vdpa_ops = {
+ .get_device_features = eni_vdpa_get_device_features,
+ .set_driver_features = eni_vdpa_set_driver_features,
+ .get_driver_features = eni_vdpa_get_driver_features,
+ .get_status = eni_vdpa_get_status,
+ .set_status = eni_vdpa_set_status,
+ .reset = eni_vdpa_reset,
+ .get_vq_num_max = eni_vdpa_get_vq_num_max,
+ .get_vq_num_min = eni_vdpa_get_vq_num_min,
+ .get_vq_state = eni_vdpa_get_vq_state,
+ .set_vq_state = eni_vdpa_set_vq_state,
+ .set_vq_cb = eni_vdpa_set_vq_cb,
+ .set_vq_ready = eni_vdpa_set_vq_ready,
+ .get_vq_ready = eni_vdpa_get_vq_ready,
+ .set_vq_num = eni_vdpa_set_vq_num,
+ .set_vq_address = eni_vdpa_set_vq_address,
+ .kick_vq = eni_vdpa_kick_vq,
+ .get_device_id = eni_vdpa_get_device_id,
+ .get_vendor_id = eni_vdpa_get_vendor_id,
+ .get_vq_align = eni_vdpa_get_vq_align,
+ .get_config_size = eni_vdpa_get_config_size,
+ .get_config = eni_vdpa_get_config,
+ .set_config = eni_vdpa_set_config,
+ .set_config_cb = eni_vdpa_set_config_cb,
+ .get_vq_irq = eni_vdpa_get_vq_irq,
+};
+
+
+/*
+ * Compute the number of virtqueues from the device features:
+ * two data queues, or 2 * max_virtqueue_pairs (read from the net config
+ * space) when VIRTIO_NET_F_MQ is offered, plus one more queue when
+ * VIRTIO_NET_F_CTRL_VQ is offered.
+ */
+static u16 eni_vdpa_get_num_queues(struct eni_vdpa *eni_vdpa)
+{
+	u32 features = vp_legacy_get_features(&eni_vdpa->ldev);
+	u16 count;
+
+	if (features & BIT_ULL(VIRTIO_NET_F_MQ)) {
+		__virtio16 pairs;
+
+		eni_vdpa_get_config(&eni_vdpa->vdpa,
+				    offsetof(struct virtio_net_config, max_virtqueue_pairs),
+				    &pairs,
+				    sizeof(pairs));
+		count = 2 * __virtio16_to_cpu(virtio_legacy_is_little_endian(),
+					      pairs);
+	} else {
+		count = 2;
+	}
+
+	if (features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ))
+		count += 1;
+
+	return count;
+}
+
+/*
+ * PCI probe: enable the device, allocate the vDPA device, probe the legacy
+ * virtio-pci layer, size and allocate the vring array, and register the
+ * device on the vDPA bus with eni_vdpa->queues virtqueues.
+ *
+ * Returns 0 on success, otherwise the error of the failing step.
+ */
+static int eni_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ struct device *dev = &pdev->dev;
+ struct eni_vdpa *eni_vdpa;
+ struct virtio_pci_legacy_device *ldev;
+ int ret, i;
+
+ ret = pcim_enable_device(pdev);
+ if (ret)
+ return ret;
+
+ eni_vdpa = vdpa_alloc_device(struct eni_vdpa, vdpa,
+ dev, &eni_vdpa_ops, 1, 1, NULL, false);
+ if (IS_ERR(eni_vdpa)) {
+ ENI_ERR(pdev, "failed to allocate vDPA structure\n");
+ return PTR_ERR(eni_vdpa);
+ }
+
+ ldev = &eni_vdpa->ldev;
+ ldev->pci_dev = pdev;
+
+ ret = vp_legacy_probe(ldev);
+ if (ret) {
+ ENI_ERR(pdev, "failed to probe legacy PCI device\n");
+ goto err;
+ }
+
+ pci_set_master(pdev);
+ pci_set_drvdata(pdev, eni_vdpa);
+
+ eni_vdpa->vdpa.dma_dev = &pdev->dev;
+ /* The queue count is derived from device features and config space. */
+ eni_vdpa->queues = eni_vdpa_get_num_queues(eni_vdpa);
+
+ eni_vdpa->vring = devm_kcalloc(&pdev->dev, eni_vdpa->queues,
+ sizeof(*eni_vdpa->vring),
+ GFP_KERNEL);
+ if (!eni_vdpa->vring) {
+ ret = -ENOMEM;
+ ENI_ERR(pdev, "failed to allocate virtqueues\n");
+ goto err_remove_vp_legacy;
+ }
+
+ /*
+ * No IRQ is requested here; eni_vdpa_set_status() requests them when
+ * DRIVER_OK gets set. All queues share the same notify address.
+ */
+ for (i = 0; i < eni_vdpa->queues; i++) {
+ eni_vdpa->vring[i].irq = VIRTIO_MSI_NO_VECTOR;
+ eni_vdpa->vring[i].notify = ldev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY;
+ }
+ eni_vdpa->config_irq = VIRTIO_MSI_NO_VECTOR;
+
+ ret = vdpa_register_device(&eni_vdpa->vdpa, eni_vdpa->queues);
+ if (ret) {
+ ENI_ERR(pdev, "failed to register to vdpa bus\n");
+ goto err_remove_vp_legacy;
+ }
+
+ return 0;
+
+err_remove_vp_legacy:
+ /* undo vp_legacy_probe() */
+ vp_legacy_remove(&eni_vdpa->ldev);
+err:
+ /* NOTE(review): presumably this releases the vdpa_alloc_device() allocation - confirm */
+ put_device(&eni_vdpa->vdpa.dev);
+ return ret;
+}
+
+/*
+ * PCI remove: unregister the vDPA device stored as driver data by
+ * eni_vdpa_probe(), then tear down the legacy virtio-pci layer.
+ */
+static void eni_vdpa_remove(struct pci_dev *pdev)
+{
+ struct eni_vdpa *eni_vdpa = pci_get_drvdata(pdev);
+
+ vdpa_unregister_device(&eni_vdpa->vdpa);
+ vp_legacy_remove(&eni_vdpa->ldev);
+}
+
+/*
+ * PCI IDs this driver binds to: a single vendor/device/subvendor/subdevice
+ * entry, followed by the zero terminator.
+ */
+static struct pci_device_id eni_pci_ids[] = {
+ { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET,
+ VIRTIO_TRANS_ID_NET,
+ PCI_SUBVENDOR_ID_REDHAT_QUMRANET,
+ VIRTIO_ID_NET) },
+ { 0 },
+};
+
+/* PCI driver definition, registered through module_pci_driver(). */
+static struct pci_driver eni_vdpa_driver = {
+ .name = "alibaba-eni-vdpa",
+ .id_table = eni_pci_ids,
+ .probe = eni_vdpa_probe,
+ .remove = eni_vdpa_remove,
+};
+
+module_pci_driver(eni_vdpa_driver);
+
+MODULE_AUTHOR("Wu Zongyong <wuzongyong@linux.alibaba.com>");
+MODULE_DESCRIPTION("Alibaba ENI vDPA driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/vdpa/ifcvf/Makefile b/drivers/vdpa/ifcvf/Makefile
new file mode 100644
index 0000000000..d709915995
--- /dev/null
+++ b/drivers/vdpa/ifcvf/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_IFCVF) += ifcvf.o
+ifcvf-$(CONFIG_IFCVF) += ifcvf_main.o ifcvf_base.o
diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c
new file mode 100644
index 0000000000..060f837a4f
--- /dev/null
+++ b/drivers/vdpa/ifcvf/ifcvf_base.c
@@ -0,0 +1,432 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel IFC VF NIC driver for virtio dataplane offloading
+ *
+ * Copyright (C) 2020 Intel Corporation.
+ *
+ * Author: Zhu Lingshan <lingshan.zhu@intel.com>
+ *
+ */
+
+#include "ifcvf_base.h"
+
+/*
+ * Select queue @qid, write @vector as its MSI-X vector, and return the
+ * value read back from the same register.
+ */
+u16 ifcvf_set_vq_vector(struct ifcvf_hw *hw, u16 qid, int vector)
+{
+	struct virtio_pci_common_cfg __iomem *common = hw->common_cfg;
+	u16 readback;
+
+	vp_iowrite16(qid, &common->queue_select);
+	vp_iowrite16(vector, &common->queue_msix_vector);
+	readback = vp_ioread16(&common->queue_msix_vector);
+
+	return readback;
+}
+
+/*
+ * Write @vector as the config MSI-X vector and return the value read back
+ * from the same register.
+ */
+u16 ifcvf_set_config_vector(struct ifcvf_hw *hw, int vector)
+{
+	struct virtio_pci_common_cfg __iomem *common = hw->common_cfg;
+	u16 readback;
+
+	vp_iowrite16(vector, &common->msix_config);
+	readback = vp_ioread16(&common->msix_config);
+
+	return readback;
+}
+
+/*
+ * Translate a virtio PCI capability into an address inside the BAR it
+ * names.
+ *
+ * Returns hw->base[bar] + offset, or NULL when the BAR index is out of
+ * range or the [offset, offset + length) window does not fit in the BAR.
+ */
+static void __iomem *get_cap_addr(struct ifcvf_hw *hw,
+				  struct virtio_pci_cap *cap)
+{
+	u32 length, offset;
+	u8 bar;
+
+	length = le32_to_cpu(cap->length);
+	offset = le32_to_cpu(cap->offset);
+	bar = cap->bar;
+
+	if (bar >= IFCVF_PCI_MAX_RESOURCE) {
+		IFCVF_DBG(hw->pdev,
+			  "Invalid bar number %u to get capabilities\n", bar);
+		return NULL;
+	}
+
+	/*
+	 * Add in 64 bits: both values come from the device, and a 32-bit
+	 * sum could wrap around and slip past the bounds check.
+	 */
+	if ((u64)offset + length > pci_resource_len(hw->pdev, bar)) {
+		IFCVF_DBG(hw->pdev,
+			  "offset(%u) + len(%u) overflows bar%u's capability\n",
+			  offset, length, bar);
+		return NULL;
+	}
+
+	return hw->base[bar] + offset;
+}
+
+/*
+ * Read @size bytes of PCI config space starting at @where into @val, one
+ * dword at a time.
+ *
+ * Returns 0 on success or a negative errno when a read fails.
+ */
+static int ifcvf_read_config_range(struct pci_dev *dev,
+				   uint32_t *val, int size, int where)
+{
+	int ret, i;
+
+	for (i = 0; i < size; i += 4) {
+		ret = pci_read_config_dword(dev, where + i, val + i / 4);
+		/*
+		 * Failures are reported as positive PCIBIOS_* codes, so a
+		 * "< 0" test would miss them; convert to a negative errno.
+		 */
+		if (ret)
+			return pcibios_err_to_errno(ret);
+	}
+
+	return 0;
+}
+
+/* Select queue @qid and return the queue size the device reports for it. */
+static u16 ifcvf_get_vq_size(struct ifcvf_hw *hw, u16 qid)
+{
+	vp_iowrite16(qid, &hw->common_cfg->queue_select);
+
+	return vp_ioread16(&hw->common_cfg->queue_size);
+}
+
+/* This function returns the max allowed safe size for
+ * all virtqueues. It is the minimal size that can be
+ * supported by all virtqueues: queue 0 seeds the result and
+ * later queues that report a size of 0 are skipped.
+ */
+u16 ifcvf_get_max_vq_size(struct ifcvf_hw *hw)
+{
+	u16 smallest = ifcvf_get_vq_size(hw, 0);
+	u16 qid;
+
+	for (qid = 1; qid < hw->nr_vring; qid++) {
+		u16 size = ifcvf_get_vq_size(hw, qid);
+
+		/* 0 means the queue is unavailable */
+		if (size)
+			smallest = min(size, smallest);
+	}
+
+	return smallest;
+}
+
+/*
+ * Discover the virtio PCI capabilities of @pdev and initialise @hw.
+ *
+ * Walks the PCI capability list, records the common, notify, ISR and
+ * device config regions, then allocates and fills the per-queue vring
+ * array (notify addresses, irq = -EINVAL).
+ *
+ * Returns 0 on success, -EIO when the capability list cannot be read or
+ * one of the four regions is missing, -ENOMEM when the vring array cannot
+ * be allocated.
+ */
+int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *pdev)
+{
+	struct virtio_pci_cap cap;
+	u16 notify_off;
+	int ret;
+	u8 pos;
+	u32 i;
+
+	/*
+	 * PCI config accessors report failure with a positive PCIBIOS_*
+	 * code, so test for non-zero rather than negative.
+	 */
+	ret = pci_read_config_byte(pdev, PCI_CAPABILITY_LIST, &pos);
+	if (ret) {
+		IFCVF_ERR(pdev, "Failed to read PCI capability list\n");
+		return -EIO;
+	}
+	hw->pdev = pdev;
+
+	while (pos) {
+		ret = ifcvf_read_config_range(pdev, (u32 *)&cap,
+					      sizeof(cap), pos);
+		if (ret) {
+			IFCVF_ERR(pdev,
+				  "Failed to get PCI capability at %x\n", pos);
+			break;
+		}
+
+		if (cap.cap_vndr != PCI_CAP_ID_VNDR)
+			goto next;
+
+		switch (cap.cfg_type) {
+		case VIRTIO_PCI_CAP_COMMON_CFG:
+			hw->common_cfg = get_cap_addr(hw, &cap);
+			IFCVF_DBG(pdev, "hw->common_cfg = %p\n",
+				  hw->common_cfg);
+			break;
+		case VIRTIO_PCI_CAP_NOTIFY_CFG:
+			/* the multiplier dword directly follows the capability */
+			pci_read_config_dword(pdev, pos + sizeof(cap),
+					      &hw->notify_off_multiplier);
+			hw->notify_bar = cap.bar;
+			hw->notify_base = get_cap_addr(hw, &cap);
+			hw->notify_base_pa = pci_resource_start(pdev, cap.bar) +
+					le32_to_cpu(cap.offset);
+			IFCVF_DBG(pdev, "hw->notify_base = %p\n",
+				  hw->notify_base);
+			break;
+		case VIRTIO_PCI_CAP_ISR_CFG:
+			hw->isr = get_cap_addr(hw, &cap);
+			IFCVF_DBG(pdev, "hw->isr = %p\n", hw->isr);
+			break;
+		case VIRTIO_PCI_CAP_DEVICE_CFG:
+			hw->dev_cfg = get_cap_addr(hw, &cap);
+			hw->cap_dev_config_size = le32_to_cpu(cap.length);
+			IFCVF_DBG(pdev, "hw->dev_cfg = %p\n", hw->dev_cfg);
+			break;
+		}
+
+next:
+		pos = cap.cap_next;
+	}
+
+	if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+	    hw->isr == NULL || hw->dev_cfg == NULL) {
+		IFCVF_ERR(pdev, "Incomplete PCI capabilities\n");
+		return -EIO;
+	}
+
+	hw->nr_vring = vp_ioread16(&hw->common_cfg->num_queues);
+	/* kcalloc() instead of an open-coded count * size multiplication */
+	hw->vring = kcalloc(hw->nr_vring, sizeof(struct vring_info), GFP_KERNEL);
+	if (!hw->vring)
+		return -ENOMEM;
+
+	for (i = 0; i < hw->nr_vring; i++) {
+		vp_iowrite16(i, &hw->common_cfg->queue_select);
+		notify_off = vp_ioread16(&hw->common_cfg->queue_notify_off);
+		hw->vring[i].notify_addr = hw->notify_base +
+			notify_off * hw->notify_off_multiplier;
+		hw->vring[i].notify_pa = hw->notify_base_pa +
+			notify_off * hw->notify_off_multiplier;
+		hw->vring[i].irq = -EINVAL;
+	}
+
+	hw->lm_cfg = hw->base[IFCVF_LM_BAR];
+
+	IFCVF_DBG(pdev,
+		  "PCI capability mapping: common cfg: %p, notify base: %p\n, isr cfg: %p, device cfg: %p, multiplier: %u\n",
+		  hw->common_cfg, hw->notify_base, hw->isr,
+		  hw->dev_cfg, hw->notify_off_multiplier);
+
+	hw->vqs_reused_irq = -EINVAL;
+	hw->config_irq = -EINVAL;
+
+	return 0;
+}
+
+/* Read the device_status register from the common config region. */
+u8 ifcvf_get_status(struct ifcvf_hw *hw)
+{
+	struct virtio_pci_common_cfg __iomem *common = hw->common_cfg;
+
+	return vp_ioread8(&common->device_status);
+}
+
+/* Write @status to the device_status register of the common config region. */
+void ifcvf_set_status(struct ifcvf_hw *hw, u8 status)
+{
+	struct virtio_pci_common_cfg __iomem *common = hw->common_cfg;
+
+	vp_iowrite8(status, &common->device_status);
+}
+
+/*
+ * Reset the device by writing status 0, then poll once per millisecond
+ * until the status reads back as 0. The wait has no upper bound.
+ */
+void ifcvf_reset(struct ifcvf_hw *hw)
+{
+	ifcvf_set_status(hw, 0);
+
+	for (;;) {
+		if (!ifcvf_get_status(hw))
+			return;
+
+		msleep(1);
+	}
+}
+
+/*
+ * Read the 64-bit device feature set from hardware: select dword 0 for the
+ * low half, dword 1 for the high half, and combine them.
+ */
+u64 ifcvf_get_hw_features(struct ifcvf_hw *hw)
+{
+	struct virtio_pci_common_cfg __iomem *common = hw->common_cfg;
+	u64 features;
+
+	vp_iowrite32(0, &common->device_feature_select);
+	features = vp_ioread32(&common->device_feature);
+
+	vp_iowrite32(1, &common->device_feature_select);
+	features |= (u64)vp_ioread32(&common->device_feature) << 32;
+
+	return features;
+}
+
+/* Return the provisioned vDPA device features cached in hw->dev_features;
+ * no hardware access is performed.
+ */
+u64 ifcvf_get_dev_features(struct ifcvf_hw *hw)
+{
+ return hw->dev_features;
+}
+
+/*
+ * Read back the 64-bit driver (guest) feature set from hardware.
+ *
+ * The dword exposed through guest_feature is chosen by
+ * guest_feature_select, the same selector ifcvf_set_driver_features()
+ * writes; device_feature_select only affects device_feature.
+ */
+u64 ifcvf_get_driver_features(struct ifcvf_hw *hw)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
+	u32 features_lo, features_hi;
+	u64 features;
+
+	vp_iowrite32(0, &cfg->guest_feature_select);
+	features_lo = vp_ioread32(&cfg->guest_feature);
+
+	vp_iowrite32(1, &cfg->guest_feature_select);
+	features_hi = vp_ioread32(&cfg->guest_feature);
+
+	features = ((u64)features_hi << 32) | features_lo;
+
+	return features;
+}
+
+/*
+ * Check that a non-empty feature set contains VIRTIO_F_ACCESS_PLATFORM.
+ * Returns 0 when it does (or when @features is 0), -EINVAL otherwise.
+ */
+int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features)
+{
+	bool has_access_platform = features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
+
+	if (!features || has_access_platform)
+		return 0;
+
+	IFCVF_ERR(hw->pdev, "VIRTIO_F_ACCESS_PLATFORM is not negotiated\n");
+
+	return -EINVAL;
+}
+
+/*
+ * Return the usable device config size: the smaller of the size advertised
+ * by the device config capability and the size of the config structure for
+ * hw->dev_type (net or block). Unsupported device types log an error and
+ * yield 0.
+ */
+u32 ifcvf_get_config_size(struct ifcvf_hw *hw)
+{
+	u32 cap_size = hw->cap_dev_config_size;
+	u32 spec_size;
+
+	/* If the onboard device config space is larger than struct
+	 * virtio_net/blk_config, only the size the spec defines is
+	 * returned. This is very unlikely; defensive programming.
+	 */
+	switch (hw->dev_type) {
+	case VIRTIO_ID_NET:
+		spec_size = sizeof(struct virtio_net_config);
+		break;
+	case VIRTIO_ID_BLOCK:
+		spec_size = sizeof(struct virtio_blk_config);
+		break;
+	default:
+		IFCVF_ERR(hw->pdev, "VIRTIO ID %u not supported\n", hw->dev_type);
+		return 0;
+	}
+
+	return min(cap_size, spec_size);
+}
+
+/*
+ * Copy @length bytes of device config, starting at @offset, into @dst.
+ *
+ * The copy is repeated until config_generation reads the same before and
+ * after it, so the result does not mix two generations. Warns when the
+ * range exceeds hw->config_size but still performs the read.
+ */
+void ifcvf_read_dev_config(struct ifcvf_hw *hw, u64 offset,
+			   void *dst, int length)
+{
+	u8 gen_before, gen_after;
+	u8 *out = dst;
+	int i;
+
+	WARN_ON(offset + length > hw->config_size);
+	do {
+		gen_before = vp_ioread8(&hw->common_cfg->config_generation);
+
+		for (i = 0; i < length; i++)
+			out[i] = vp_ioread8(hw->dev_cfg + offset + i);
+
+		gen_after = vp_ioread8(&hw->common_cfg->config_generation);
+	} while (gen_before != gen_after);
+}
+
+/*
+ * Write @length bytes from @src into device config, starting at @offset,
+ * one byte at a time. Warns when the range exceeds hw->config_size but
+ * still performs the write.
+ */
+void ifcvf_write_dev_config(struct ifcvf_hw *hw, u64 offset,
+			    const void *src, int length)
+{
+	const u8 *in = src;
+	int i;
+
+	WARN_ON(offset + length > hw->config_size);
+
+	for (i = 0; i < length; i++)
+		vp_iowrite8(in[i], hw->dev_cfg + offset + i);
+}
+
+/*
+ * Write the 64-bit driver feature set to hardware: low dword with selector
+ * 0, then high dword with selector 1.
+ */
+void ifcvf_set_driver_features(struct ifcvf_hw *hw, u64 features)
+{
+	struct virtio_pci_common_cfg __iomem *common = hw->common_cfg;
+	u32 lo = (u32)features;
+	u32 hi = features >> 32;
+
+	vp_iowrite32(0, &common->guest_feature_select);
+	vp_iowrite32(lo, &common->guest_feature);
+
+	vp_iowrite32(1, &common->guest_feature_select);
+	vp_iowrite32(hi, &common->guest_feature);
+}
+
+/*
+ * Read the 16-bit state of queue @qid from the live-migration config region.
+ * The entry is addressed as vq_state_region plus qid * 2 __le16 elements.
+ */
+u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid)
+{
+	struct ifcvf_lm_cfg __iomem *lm = hw->lm_cfg;
+
+	return vp_ioread16(&lm->vq_state_region + qid * 2);
+}
+
+/*
+ * Write @num as the 16-bit state of queue @qid in the live-migration config
+ * region, using the same addressing as ifcvf_get_vq_state(). Always
+ * returns 0.
+ */
+int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u16 num)
+{
+	struct ifcvf_lm_cfg __iomem *lm = hw->lm_cfg;
+	__le16 __iomem *slot = &lm->vq_state_region + qid * 2;
+
+	vp_iowrite16(num, slot);
+
+	return 0;
+}
+
+/* Select queue @qid and write @num to its queue_size register. */
+void ifcvf_set_vq_num(struct ifcvf_hw *hw, u16 qid, u32 num)
+{
+	struct virtio_pci_common_cfg __iomem *common = hw->common_cfg;
+
+	vp_iowrite16(qid, &common->queue_select);
+	vp_iowrite16(num, &common->queue_size);
+}
+
+/*
+ * Select queue @qid and program its three ring addresses: @desc_area into
+ * queue_desc, @driver_area into queue_avail and @device_area into
+ * queue_used, each as a lo/hi register pair. Always returns 0.
+ */
+int ifcvf_set_vq_address(struct ifcvf_hw *hw, u16 qid, u64 desc_area,
+ u64 driver_area, u64 device_area)
+{
+ struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
+
+ vp_iowrite16(qid, &cfg->queue_select);
+ vp_iowrite64_twopart(desc_area, &cfg->queue_desc_lo,
+ &cfg->queue_desc_hi);
+ vp_iowrite64_twopart(driver_area, &cfg->queue_avail_lo,
+ &cfg->queue_avail_hi);
+ vp_iowrite64_twopart(device_area, &cfg->queue_used_lo,
+ &cfg->queue_used_hi);
+
+ return 0;
+}
+
+/* Select queue @qid and report whether its queue_enable register is non-zero. */
+bool ifcvf_get_vq_ready(struct ifcvf_hw *hw, u16 qid)
+{
+	struct virtio_pci_common_cfg __iomem *common = hw->common_cfg;
+
+	vp_iowrite16(qid, &common->queue_select);
+
+	return vp_ioread16(&common->queue_enable) != 0;
+}
+
+/* Select queue @qid and write @ready to its queue_enable register. */
+void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, bool ready)
+{
+	struct virtio_pci_common_cfg __iomem *common = hw->common_cfg;
+
+	vp_iowrite16(qid, &common->queue_select);
+	vp_iowrite16(ready, &common->queue_enable);
+}
+
+/*
+ * For every virtqueue, clear the registered callback and set the queue's
+ * MSI-X vector to VIRTIO_MSI_NO_VECTOR.
+ */
+static void ifcvf_reset_vring(struct ifcvf_hw *hw)
+{
+	u16 qid;
+
+	for (qid = 0; qid < hw->nr_vring; qid++) {
+		struct vdpa_callback *cb = &hw->vring[qid].cb;
+
+		cb->callback = NULL;
+		cb->private = NULL;
+		ifcvf_set_vq_vector(hw, qid, VIRTIO_MSI_NO_VECTOR);
+	}
+}
+
+/*
+ * Clear the registered config callback and set the config MSI-X vector to
+ * VIRTIO_MSI_NO_VECTOR.
+ */
+static void ifcvf_reset_config_handler(struct ifcvf_hw *hw)
+{
+ hw->config_cb.callback = NULL;
+ hw->config_cb.private = NULL;
+ ifcvf_set_config_vector(hw, VIRTIO_MSI_NO_VECTOR);
+}
+
+/*
+ * Call synchronize_irq() on each of the hw->num_msix_vectors MSI-X vectors
+ * for which pci_irq_vector() returns a valid (non-negative) IRQ number.
+ */
+static void ifcvf_synchronize_irq(struct ifcvf_hw *hw)
+{
+	u32 nvectors = hw->num_msix_vectors;
+	int i;
+
+	for (i = 0; i < nvectors; i++) {
+		int irq = pci_irq_vector(hw->pdev, i);
+
+		if (irq < 0)
+			continue;
+
+		synchronize_irq(irq);
+	}
+}
+
+/*
+ * Stop the device's interrupt handling: synchronize all MSI-X IRQs first,
+ * then clear every virtqueue callback and the config callback and set their
+ * vectors to VIRTIO_MSI_NO_VECTOR.
+ */
+void ifcvf_stop(struct ifcvf_hw *hw)
+{
+ ifcvf_synchronize_irq(hw);
+ ifcvf_reset_vring(hw);
+ ifcvf_reset_config_handler(hw);
+}
+
+/* Kick virtqueue @qid by writing @qid to the queue's notify address. */
+void ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid)
+{
+ vp_iowrite16(qid, hw->vring[qid].notify_addr);
+}
diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h
new file mode 100644
index 0000000000..b57849c643
--- /dev/null
+++ b/drivers/vdpa/ifcvf/ifcvf_base.h
@@ -0,0 +1,134 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Intel IFC VF NIC driver for virtio dataplane offloading
+ *
+ * Copyright (C) 2020 Intel Corporation.
+ *
+ * Author: Zhu Lingshan <lingshan.zhu@intel.com>
+ *
+ */
+
+#ifndef _IFCVF_H_
+#define _IFCVF_H_
+
+#include <linux/pci.h>
+#include <linux/pci_regs.h>
+#include <linux/vdpa.h>
+#include <linux/virtio_pci_modern.h>
+#include <uapi/linux/virtio_net.h>
+#include <uapi/linux/virtio_blk.h>
+#include <uapi/linux/virtio_config.h>
+#include <uapi/linux/virtio_pci.h>
+#include <uapi/linux/vdpa.h>
+
+#define N3000_DEVICE_ID 0x1041
+#define N3000_SUBSYS_DEVICE_ID 0x001A
+
+#define IFCVF_QUEUE_ALIGNMENT PAGE_SIZE
+#define IFCVF_PCI_MAX_RESOURCE 6
+
+#define IFCVF_LM_BAR 4
+
+/* Logging helpers keyed on a struct pci_dev pointer. The argument is
+ * parenthesized so that any pointer expression can be passed safely.
+ */
+#define IFCVF_ERR(pdev, fmt, ...)	dev_err(&(pdev)->dev, fmt, ##__VA_ARGS__)
+#define IFCVF_DBG(pdev, fmt, ...)	dev_dbg(&(pdev)->dev, fmt, ##__VA_ARGS__)
+#define IFCVF_INFO(pdev, fmt, ...)	dev_info(&(pdev)->dev, fmt, ##__VA_ARGS__)
+
+/* each vq and the config interrupt has its own vector */
+#define MSIX_VECTOR_PER_VQ_AND_CONFIG 1
+/* all vqs share a vector, and config interrupt has a separate vector */
+#define MSIX_VECTOR_SHARED_VQ_AND_CONFIG 2
+/* all vqs and config interrupt share a vector */
+#define MSIX_VECTOR_DEV_SHARED 3
+
+/* Per-virtqueue driver state. */
+struct vring_info {
+ u16 last_avail_idx;
+ /* mapped notify address; ifcvf_notify_queue() writes the queue index here */
+ void __iomem *notify_addr;
+ /* physical notify address, reported through get_vq_notification */
+ phys_addr_t notify_pa;
+ /* Linux irq number of this vq; holds -EINVAL when no per-vq irq is requested */
+ u32 irq;
+ /* callback invoked by the vq interrupt handlers */
+ struct vdpa_callback cb;
+ /* name buffer handed to devm_request_irq() */
+ char msix_name[256];
+};
+
+/* Live-migration config layout, reached through the __iomem lm_cfg
+ * pointer in struct ifcvf_hw. All fields are little-endian.
+ */
+struct ifcvf_lm_cfg {
+ __le64 control;
+ __le64 status;
+ __le64 lm_mem_log_start_addr;
+ __le64 lm_mem_log_end_addr;
+ __le16 vq_state_region;
+};
+
+/* Hardware state of one IFC VF. */
+struct ifcvf_hw {
+ /* ISR status register, read by the device-shared interrupt handler */
+ u8 __iomem *isr;
+ /* Live migration */
+ struct ifcvf_lm_cfg __iomem *lm_cfg;
+ /* Notification bar number */
+ u8 notify_bar;
+ /* one of the MSIX_VECTOR_* modes, chosen when irqs are requested */
+ u8 msix_vector_status;
+ /* virtio-net or virtio-blk device config size */
+ u32 config_size;
+ /* Notification bar address */
+ void __iomem *notify_base;
+ phys_addr_t notify_base_pa;
+ u32 notify_off_multiplier;
+ /* virtio device ID (VIRTIO_ID_NET or VIRTIO_ID_BLOCK are supported) */
+ u32 dev_type;
+ u64 hw_features;
+ /* provisioned device features */
+ u64 dev_features;
+ struct virtio_pci_common_cfg __iomem *common_cfg;
+ void __iomem *dev_cfg;
+ /* array of nr_vring entries */
+ struct vring_info *vring;
+ /* BAR mapping table obtained from pcim_iomap_table() */
+ void __iomem * const *base;
+ char config_msix_name[256];
+ struct vdpa_callback config_cb;
+ /* Linux irq of the config interrupt, -EINVAL when not requested */
+ int config_irq;
+ /* Linux irq shared by all vqs, -EINVAL when per-vq irqs are used */
+ int vqs_reused_irq;
+ u16 nr_vring;
+ /* number of MSI-X vectors in use; reset to 0 by ifcvf_free_irq() */
+ u32 num_msix_vectors;
+ /* VIRTIO_PCI_CAP_DEVICE_CFG size */
+ u32 cap_dev_config_size;
+ struct pci_dev *pdev;
+};
+
+/* vDPA device wrapper: embeds the vdpa_device (recovered with
+ * container_of()) and points at the owning PCI device and its hw state.
+ */
+struct ifcvf_adapter {
+ struct vdpa_device vdpa;
+ struct pci_dev *pdev;
+ struct ifcvf_hw *vf;
+};
+
+/* Per-PCI-function management device; allocated in probe and stored as
+ * the PCI driver data.
+ */
+struct ifcvf_vdpa_mgmt_dev {
+ struct vdpa_mgmt_dev mdev;
+ /* hw state lives here; ifcvf_adapter::vf points at it */
+ struct ifcvf_hw vf;
+ /* set by dev_add, cleared to NULL by dev_del */
+ struct ifcvf_adapter *adapter;
+ struct pci_dev *pdev;
+};
+
+/* Hardware access helpers shared with ifcvf_main.c.
+ * NOTE(review): no definition or caller of io_write64_twopart() is
+ * visible in the reviewed portion of the driver - confirm it is still
+ * needed.
+ */
+int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *dev);
+void ifcvf_stop(struct ifcvf_hw *hw);
+void ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid);
+void ifcvf_read_dev_config(struct ifcvf_hw *hw, u64 offset,
+ void *dst, int length);
+void ifcvf_write_dev_config(struct ifcvf_hw *hw, u64 offset,
+ const void *src, int length);
+u8 ifcvf_get_status(struct ifcvf_hw *hw);
+void ifcvf_set_status(struct ifcvf_hw *hw, u8 status);
+void io_write64_twopart(u64 val, u32 *lo, u32 *hi);
+void ifcvf_reset(struct ifcvf_hw *hw);
+u64 ifcvf_get_dev_features(struct ifcvf_hw *hw);
+u64 ifcvf_get_hw_features(struct ifcvf_hw *hw);
+int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features);
+u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid);
+int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u16 num);
+struct ifcvf_adapter *vf_to_adapter(struct ifcvf_hw *hw);
+int ifcvf_probed_virtio_net(struct ifcvf_hw *hw);
+u32 ifcvf_get_config_size(struct ifcvf_hw *hw);
+u16 ifcvf_set_vq_vector(struct ifcvf_hw *hw, u16 qid, int vector);
+u16 ifcvf_set_config_vector(struct ifcvf_hw *hw, int vector);
+void ifcvf_set_vq_num(struct ifcvf_hw *hw, u16 qid, u32 num);
+int ifcvf_set_vq_address(struct ifcvf_hw *hw, u16 qid, u64 desc_area,
+ u64 driver_area, u64 device_area);
+bool ifcvf_get_vq_ready(struct ifcvf_hw *hw, u16 qid);
+void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, bool ready);
+void ifcvf_set_driver_features(struct ifcvf_hw *hw, u64 features);
+u64 ifcvf_get_driver_features(struct ifcvf_hw *hw);
+u16 ifcvf_get_max_vq_size(struct ifcvf_hw *hw);
+#endif /* _IFCVF_H_ */
diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
new file mode 100644
index 0000000000..e98fa8100f
--- /dev/null
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -0,0 +1,882 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel IFC VF NIC driver for virtio dataplane offloading
+ *
+ * Copyright (C) 2020 Intel Corporation.
+ *
+ * Author: Zhu Lingshan <lingshan.zhu@intel.com>
+ *
+ */
+
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/sysfs.h>
+#include "ifcvf_base.h"
+
+#define DRIVER_AUTHOR "Intel Corporation"
+#define IFCVF_DRIVER_NAME "ifcvf"
+
+/* Config-change interrupt handler: forward to the registered config
+ * callback when there is one, otherwise just report IRQ_HANDLED.
+ */
+static irqreturn_t ifcvf_config_changed(int irq, void *arg)
+{
+	struct ifcvf_hw *hw = arg;
+	struct vdpa_callback *cfg_cb = &hw->config_cb;
+
+	if (!cfg_cb->callback)
+		return IRQ_HANDLED;
+
+	return cfg_cb->callback(cfg_cb->private);
+}
+
+/* Per-virtqueue interrupt handler; @arg is the queue's vring_info.
+ * Forwards to the queue callback when set, else reports IRQ_HANDLED.
+ */
+static irqreturn_t ifcvf_vq_intr_handler(int irq, void *arg)
+{
+	struct vring_info *ring = arg;
+	struct vdpa_callback *vq_cb = &ring->cb;
+
+	if (!vq_cb->callback)
+		return IRQ_HANDLED;
+
+	return vq_cb->callback(vq_cb->private);
+}
+
+/* Handler for the vector shared by all virtqueues: invokes every
+ * registered queue callback and always reports IRQ_HANDLED.
+ */
+static irqreturn_t ifcvf_vqs_reused_intr_handler(int irq, void *arg)
+{
+	struct ifcvf_hw *hw = arg;
+	int qid;
+
+	for (qid = 0; qid < hw->nr_vring; qid++) {
+		struct vdpa_callback *vq_cb = &hw->vring[qid].cb;
+
+		if (!vq_cb->callback)
+			continue;
+
+		vq_cb->callback(vq_cb->private);
+	}
+
+	return IRQ_HANDLED;
+}
+
+/* Handler for the single vector shared by the config interrupt and all
+ * virtqueues: runs the config handler when the ISR flags a config
+ * change, then services every virtqueue.
+ */
+static irqreturn_t ifcvf_dev_intr_handler(int irq, void *arg)
+{
+	struct ifcvf_hw *hw = arg;
+	u8 pending = vp_ioread8(hw->isr);
+
+	if (pending & VIRTIO_PCI_ISR_CONFIG)
+		ifcvf_config_changed(irq, arg);
+
+	return ifcvf_vqs_reused_intr_handler(irq, arg);
+}
+
+/* void-pointer wrapper around pci_free_irq_vectors(); @data is the
+ * struct pci_dev. Also registered as a devres action in probe.
+ */
+static void ifcvf_free_irq_vectors(void *data)
+{
+	struct pci_dev *pdev = data;
+
+	pci_free_irq_vectors(pdev);
+}
+
+/* Free every per-virtqueue irq that is currently requested and mark it
+ * as unused (-EINVAL).
+ */
+static void ifcvf_free_per_vq_irq(struct ifcvf_hw *vf)
+{
+	struct device *dev = &vf->pdev->dev;
+	struct vring_info *ring;
+	int qid;
+
+	for (qid = 0; qid < vf->nr_vring; qid++) {
+		ring = &vf->vring[qid];
+		if (ring->irq == -EINVAL)
+			continue;
+
+		devm_free_irq(dev, ring->irq, ring);
+		ring->irq = -EINVAL;
+	}
+}
+
+/* Free the irq shared by all virtqueues, if one is requested, and mark
+ * it as unused (-EINVAL).
+ */
+static void ifcvf_free_vqs_reused_irq(struct ifcvf_hw *vf)
+{
+	struct device *dev = &vf->pdev->dev;
+
+	if (vf->vqs_reused_irq == -EINVAL)
+		return;
+
+	devm_free_irq(dev, vf->vqs_reused_irq, vf);
+	vf->vqs_reused_irq = -EINVAL;
+}
+
+/* Free the virtqueue irq(s): per-queue irqs in the per-vq mode, the
+ * shared irq in every other mode.
+ */
+static void ifcvf_free_vq_irq(struct ifcvf_hw *vf)
+{
+	if (vf->msix_vector_status != MSIX_VECTOR_PER_VQ_AND_CONFIG) {
+		ifcvf_free_vqs_reused_irq(vf);
+		return;
+	}
+
+	ifcvf_free_per_vq_irq(vf);
+}
+
+/* Release the config interrupt and mark config_irq as unused. */
+static void ifcvf_free_config_irq(struct ifcvf_hw *vf)
+{
+	struct pci_dev *pdev = vf->pdev;
+
+	if (vf->config_irq == -EINVAL)
+		return;
+
+	/* If the irq is shared by all vqs and the config interrupt,
+	 * it is already freed in ifcvf_free_vq_irq, so here only
+	 * need to free config irq when msix_vector_status != MSIX_VECTOR_DEV_SHARED
+	 */
+	if (vf->msix_vector_status != MSIX_VECTOR_DEV_SHARED)
+		devm_free_irq(&pdev->dev, vf->config_irq, vf);
+
+	/* Invalidate in every mode so no freed irq number is left behind */
+	vf->config_irq = -EINVAL;
+}
+
+/* Tear down all interrupt resources: free the vq irq(s) first, then the
+ * config irq (which relies on the vq step having freed a device-shared
+ * irq), release the MSI-X vectors and zero the vector count.
+ */
+static void ifcvf_free_irq(struct ifcvf_hw *vf)
+{
+ struct pci_dev *pdev = vf->pdev;
+
+ ifcvf_free_vq_irq(vf);
+ ifcvf_free_config_irq(vf);
+ ifcvf_free_irq_vectors(pdev);
+ vf->num_msix_vectors = 0;
+}
+
+/* ifcvf MSIX vectors allocator, this helper tries to allocate
+ * vectors for all virtqueues and the config interrupt.
+ * It asks for between 1 and nr_vring + 1 vectors and returns the number
+ * of allocated vectors, or a negative errno when allocation fails.
+ */
+static int ifcvf_alloc_vectors(struct ifcvf_hw *vf)
+{
+	struct pci_dev *pdev = vf->pdev;
+	int max_intr, ret;
+
+	/* all queues and config interrupt */
+	max_intr = vf->nr_vring + 1;
+	ret = pci_alloc_irq_vectors(pdev, 1, max_intr, PCI_IRQ_MSIX | PCI_IRQ_AFFINITY);
+
+	if (ret < 0) {
+		IFCVF_ERR(pdev, "Failed to alloc IRQ vectors\n");
+		return ret;
+	}
+
+	/* both values are int, so print them with %d */
+	if (ret < max_intr)
+		IFCVF_INFO(pdev,
+			   "Requested %d vectors, however only %d allocated, lower performance\n",
+			   max_intr, ret);
+
+	return ret;
+}
+
+/* Request one irq per virtqueue (vector i serves vq i) and program the
+ * vector into the device.
+ *
+ * Returns 0 on success. On any failure all irq resources are released
+ * through ifcvf_free_irq() and -EFAULT is returned.
+ */
+static int ifcvf_request_per_vq_irq(struct ifcvf_hw *vf)
+{
+	struct pci_dev *pdev = vf->pdev;
+	int i, vector, ret, irq;
+
+	/* no shared vq irq in this mode */
+	vf->vqs_reused_irq = -EINVAL;
+	for (i = 0; i < vf->nr_vring; i++) {
+		/* irq names must not end in a newline; they are listed as-is */
+		snprintf(vf->vring[i].msix_name, sizeof(vf->vring[i].msix_name),
+			 "ifcvf[%s]-%d", pci_name(pdev), i);
+		vector = i;
+		irq = pci_irq_vector(pdev, vector);
+		ret = devm_request_irq(&pdev->dev, irq,
+				       ifcvf_vq_intr_handler, 0,
+				       vf->vring[i].msix_name,
+				       &vf->vring[i]);
+		if (ret) {
+			IFCVF_ERR(pdev, "Failed to request irq for vq %d\n", i);
+			goto err;
+		}
+
+		/* record only after a successful request so cleanup frees it */
+		vf->vring[i].irq = irq;
+		ret = ifcvf_set_vq_vector(vf, i, vector);
+		if (ret == VIRTIO_MSI_NO_VECTOR) {
+			IFCVF_ERR(pdev, "No msix vector for vq %d\n", i);
+			goto err;
+		}
+	}
+
+	return 0;
+err:
+	ifcvf_free_irq(vf);
+
+	return -EFAULT;
+}
+
+/* Request a single irq on vector 0 that is shared by all virtqueues and
+ * program vector 0 into every queue.
+ *
+ * Returns 0 on success. On any failure all irq resources are released
+ * through ifcvf_free_irq() and -EFAULT is returned.
+ */
+static int ifcvf_request_vqs_reused_irq(struct ifcvf_hw *vf)
+{
+	struct pci_dev *pdev = vf->pdev;
+	int i, vector, ret, irq;
+
+	vector = 0;
+	/* the name is kept in vring[0]; no trailing newline in irq names */
+	snprintf(vf->vring[0].msix_name, sizeof(vf->vring[0].msix_name),
+		 "ifcvf[%s]-vqs-reused-irq", pci_name(pdev));
+	irq = pci_irq_vector(pdev, vector);
+	ret = devm_request_irq(&pdev->dev, irq,
+			       ifcvf_vqs_reused_intr_handler, 0,
+			       vf->vring[0].msix_name, vf);
+	if (ret) {
+		IFCVF_ERR(pdev, "Failed to request reused irq for the device\n");
+		goto err;
+	}
+
+	vf->vqs_reused_irq = irq;
+	for (i = 0; i < vf->nr_vring; i++) {
+		/* queues own no individual irq in this mode */
+		vf->vring[i].irq = -EINVAL;
+		ret = ifcvf_set_vq_vector(vf, i, vector);
+		if (ret == VIRTIO_MSI_NO_VECTOR) {
+			IFCVF_ERR(pdev, "No msix vector for vq %d\n", i);
+			goto err;
+		}
+	}
+
+	return 0;
+err:
+	ifcvf_free_irq(vf);
+
+	return -EFAULT;
+}
+
+/* Request a single irq on vector 0 shared by all virtqueues and the
+ * config interrupt, and program vector 0 for every queue and for the
+ * config interrupt.
+ *
+ * Returns 0 on success. On any failure all irq resources are released
+ * through ifcvf_free_irq() and -EFAULT is returned.
+ */
+static int ifcvf_request_dev_irq(struct ifcvf_hw *vf)
+{
+	struct pci_dev *pdev = vf->pdev;
+	int i, vector, ret, irq;
+
+	vector = 0;
+	/* the name is kept in vring[0]; no trailing newline in irq names */
+	snprintf(vf->vring[0].msix_name, sizeof(vf->vring[0].msix_name),
+		 "ifcvf[%s]-dev-irq", pci_name(pdev));
+	irq = pci_irq_vector(pdev, vector);
+	ret = devm_request_irq(&pdev->dev, irq,
+			       ifcvf_dev_intr_handler, 0,
+			       vf->vring[0].msix_name, vf);
+	if (ret) {
+		IFCVF_ERR(pdev, "Failed to request irq for the device\n");
+		goto err;
+	}
+
+	vf->vqs_reused_irq = irq;
+	for (i = 0; i < vf->nr_vring; i++) {
+		/* queues own no individual irq in this mode */
+		vf->vring[i].irq = -EINVAL;
+		ret = ifcvf_set_vq_vector(vf, i, vector);
+		if (ret == VIRTIO_MSI_NO_VECTOR) {
+			IFCVF_ERR(pdev, "No msix vector for vq %d\n", i);
+			goto err;
+		}
+	}
+
+	/* the config interrupt rides on the same irq */
+	vf->config_irq = irq;
+	ret = ifcvf_set_config_vector(vf, vector);
+	if (ret == VIRTIO_MSI_NO_VECTOR) {
+		IFCVF_ERR(pdev, "No msix vector for device config\n");
+		goto err;
+	}
+
+	return 0;
+err:
+	ifcvf_free_irq(vf);
+
+	return -EFAULT;
+
+}
+
+/* Request the virtqueue irq(s) according to msix_vector_status: one per
+ * queue in the per-vq mode, otherwise a single shared one.
+ */
+static int ifcvf_request_vq_irq(struct ifcvf_hw *vf)
+{
+	if (vf->msix_vector_status == MSIX_VECTOR_PER_VQ_AND_CONFIG)
+		return ifcvf_request_per_vq_irq(vf);
+
+	return ifcvf_request_vqs_reused_irq(vf);
+}
+
+/* Request the config-change irq on the vector implied by
+ * msix_vector_status and program that vector into the device.
+ *
+ * Returns 0 on success (including the device-shared mode, where there
+ * is nothing to do), -EINVAL for an unknown mode, or -EFAULT after
+ * releasing all irq resources when the request or programming fails.
+ */
+static int ifcvf_request_config_irq(struct ifcvf_hw *vf)
+{
+	struct pci_dev *pdev = vf->pdev;
+	int config_vector, ret;
+
+	if (vf->msix_vector_status == MSIX_VECTOR_PER_VQ_AND_CONFIG)
+		/* vectors 0..nr_vring-1 belong to the vqs */
+		config_vector = vf->nr_vring;
+	else if (vf->msix_vector_status == MSIX_VECTOR_SHARED_VQ_AND_CONFIG)
+		/* vector 0 for vqs and 1 for config interrupt */
+		config_vector = 1;
+	else if (vf->msix_vector_status == MSIX_VECTOR_DEV_SHARED)
+		/* re-use the vqs vector */
+		return 0;
+	else
+		return -EINVAL;
+
+	/* irq names must not end in a newline; they are listed as-is */
+	snprintf(vf->config_msix_name, sizeof(vf->config_msix_name),
+		 "ifcvf[%s]-config", pci_name(pdev));
+	vf->config_irq = pci_irq_vector(pdev, config_vector);
+	ret = devm_request_irq(&pdev->dev, vf->config_irq,
+			       ifcvf_config_changed, 0,
+			       vf->config_msix_name, vf);
+	if (ret) {
+		IFCVF_ERR(pdev, "Failed to request config irq\n");
+		goto err;
+	}
+
+	ret = ifcvf_set_config_vector(vf, config_vector);
+	if (ret == VIRTIO_MSI_NO_VECTOR) {
+		IFCVF_ERR(pdev, "No msix vector for device config\n");
+		goto err;
+	}
+
+	return 0;
+err:
+	ifcvf_free_irq(vf);
+
+	return -EFAULT;
+}
+
+/* Allocate MSI-X vectors and request the irqs in the best mode the
+ * number of allocated vectors allows:
+ *   nr_vring + 1 vectors - one per vq plus one for config;
+ *   fewer, but > 1       - vector 0 shared by all vqs, vector 1 for config;
+ *   exactly 1            - one vector shared by all vqs and config.
+ *
+ * Returns 0 on success and a negative errno otherwise. On success
+ * num_msix_vectors records the allocated vector count in every mode,
+ * so that ifcvf_synchronize_irq() also covers the single shared vector.
+ */
+static int ifcvf_request_irq(struct ifcvf_hw *vf)
+{
+	int nvectors, ret, max_intr;
+
+	nvectors = ifcvf_alloc_vectors(vf);
+	if (nvectors <= 0)
+		return -EFAULT;
+
+	vf->msix_vector_status = MSIX_VECTOR_PER_VQ_AND_CONFIG;
+	max_intr = vf->nr_vring + 1;
+	if (nvectors < max_intr)
+		vf->msix_vector_status = MSIX_VECTOR_SHARED_VQ_AND_CONFIG;
+
+	if (nvectors == 1) {
+		vf->msix_vector_status = MSIX_VECTOR_DEV_SHARED;
+		ret = ifcvf_request_dev_irq(vf);
+		if (ret)
+			return ret;
+
+		vf->num_msix_vectors = nvectors;
+
+		return 0;
+	}
+
+	/* the request helpers free all irq resources on failure */
+	ret = ifcvf_request_vq_irq(vf);
+	if (ret)
+		return ret;
+
+	ret = ifcvf_request_config_irq(vf);
+	if (ret)
+		return ret;
+
+	vf->num_msix_vectors = nvectors;
+
+	return 0;
+}
+
+/* Recover the ifcvf_adapter that embeds @vdpa_dev as its vdpa member. */
+static struct ifcvf_adapter *vdpa_to_adapter(struct vdpa_device *vdpa_dev)
+{
+ return container_of(vdpa_dev, struct ifcvf_adapter, vdpa);
+}
+
+/* Return the hardware state referenced by the adapter behind @vdpa_dev. */
+static struct ifcvf_hw *vdpa_to_vf(struct vdpa_device *vdpa_dev)
+{
+	return vdpa_to_adapter(vdpa_dev)->vf;
+}
+
+/* Report the device features for net and block devices; any other
+ * device type is logged as unsupported and yields 0.
+ */
+static u64 ifcvf_vdpa_get_device_features(struct vdpa_device *vdpa_dev)
+{
+	struct pci_dev *pdev = vdpa_to_adapter(vdpa_dev)->pdev;
+	struct ifcvf_hw *hw = vdpa_to_vf(vdpa_dev);
+
+	switch (hw->dev_type) {
+	case VIRTIO_ID_NET:
+	case VIRTIO_ID_BLOCK:
+		return ifcvf_get_dev_features(hw);
+	default:
+		break;
+	}
+
+	IFCVF_ERR(pdev, "VIRTIO ID %u not supported\n", hw->dev_type);
+
+	return 0;
+}
+
+/* Program the driver features after ifcvf_verify_min_features() has
+ * accepted them; its error code is returned unchanged otherwise.
+ */
+static int ifcvf_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 features)
+{
+	struct ifcvf_hw *hw = vdpa_to_vf(vdpa_dev);
+	int err = ifcvf_verify_min_features(hw, features);
+
+	if (!err)
+		ifcvf_set_driver_features(hw, features);
+
+	return err ? err : 0;
+}
+
+/* vDPA op: return the driver features as read by the hw helper. */
+static u64 ifcvf_vdpa_get_driver_features(struct vdpa_device *vdpa_dev)
+{
+	return ifcvf_get_driver_features(vdpa_to_vf(vdpa_dev));
+}
+
+/* vDPA op: return the device status as read by the hw helper. */
+static u8 ifcvf_vdpa_get_status(struct vdpa_device *vdpa_dev)
+{
+	return ifcvf_get_status(vdpa_to_vf(vdpa_dev));
+}
+
+/* vDPA op: write a new device status. When DRIVER_OK goes from clear to
+ * set, the irqs are requested first; if that fails the status is left
+ * untouched. A write of the current status is a no-op.
+ */
+static void ifcvf_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
+{
+	struct ifcvf_hw *hw = vdpa_to_vf(vdpa_dev);
+	u8 prev = ifcvf_get_status(hw);
+	bool driver_ok_rising;
+	int err;
+
+	if (prev == status)
+		return;
+
+	driver_ok_rising = (status & VIRTIO_CONFIG_S_DRIVER_OK) &&
+			   !(prev & VIRTIO_CONFIG_S_DRIVER_OK);
+	if (driver_ok_rising) {
+		err = ifcvf_request_irq(hw);
+		if (err) {
+			IFCVF_ERR(hw->pdev, "failed to request irq with error %d\n", err);
+			return;
+		}
+	}
+
+	ifcvf_set_status(hw, status);
+}
+
+/* vDPA op: stop the device, free the irqs if the driver had reached
+ * DRIVER_OK, then reset the hardware. Always returns 0.
+ */
+static int ifcvf_vdpa_reset(struct vdpa_device *vdpa_dev)
+{
+	struct ifcvf_hw *hw = vdpa_to_vf(vdpa_dev);
+	bool was_driver_ok;
+
+	/* sample the status before anything is torn down */
+	was_driver_ok = ifcvf_get_status(hw) & VIRTIO_CONFIG_S_DRIVER_OK;
+
+	ifcvf_stop(hw);
+
+	if (was_driver_ok)
+		ifcvf_free_irq(hw);
+
+	ifcvf_reset(hw);
+
+	return 0;
+}
+
+/* vDPA op: return the maximum virtqueue size reported by the hw helper. */
+static u16 ifcvf_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev)
+{
+	return ifcvf_get_max_vq_size(vdpa_to_vf(vdpa_dev));
+}
+
+/* vDPA op: store the hw-reported state of queue @qid as the split-ring
+ * avail index in @state. Always returns 0.
+ */
+static int ifcvf_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
+				   struct vdpa_vq_state *state)
+{
+	u16 avail = ifcvf_get_vq_state(vdpa_to_vf(vdpa_dev), qid);
+
+	state->split.avail_index = avail;
+
+	return 0;
+}
+
+/* vDPA op: hand the split-ring avail index from @state to the hw helper
+ * for queue @qid and return its result.
+ */
+static int ifcvf_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
+				   const struct vdpa_vq_state *state)
+{
+	u16 avail = state->split.avail_index;
+
+	return ifcvf_set_vq_state(vdpa_to_vf(vdpa_dev), qid, avail);
+}
+
+/* vDPA op: copy the whole callback descriptor @cb into queue @qid. */
+static void ifcvf_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid,
+				 struct vdpa_callback *cb)
+{
+	struct vring_info *ring = &vdpa_to_vf(vdpa_dev)->vring[qid];
+
+	ring->cb = *cb;
+}
+
+/* vDPA op: forward the ready flag of queue @qid to the hw helper. */
+static void ifcvf_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev,
+				    u16 qid, bool ready)
+{
+	ifcvf_set_vq_ready(vdpa_to_vf(vdpa_dev), qid, ready);
+}
+
+/* vDPA op: report whether queue @qid is enabled, per the hw helper. */
+static bool ifcvf_vdpa_get_vq_ready(struct vdpa_device *vdpa_dev, u16 qid)
+{
+	return ifcvf_get_vq_ready(vdpa_to_vf(vdpa_dev), qid);
+}
+
+/* vDPA op: forward the size @num of queue @qid to the hw helper. */
+static void ifcvf_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16 qid,
+				  u32 num)
+{
+	ifcvf_set_vq_num(vdpa_to_vf(vdpa_dev), qid, num);
+}
+
+/* vDPA op: forward the descriptor, driver and device area addresses of
+ * queue @qid to the hw helper and return its result.
+ */
+static int ifcvf_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid,
+				     u64 desc_area, u64 driver_area,
+				     u64 device_area)
+{
+	return ifcvf_set_vq_address(vdpa_to_vf(vdpa_dev), qid, desc_area,
+				    driver_area, device_area);
+}
+
+/* vDPA op: notify the device about queue @qid. */
+static void ifcvf_vdpa_kick_vq(struct vdpa_device *vdpa_dev, u16 qid)
+{
+	ifcvf_notify_queue(vdpa_to_vf(vdpa_dev), qid);
+}
+
+/* vDPA op: read the 8-bit config_generation field of the common config. */
+static u32 ifcvf_vdpa_get_generation(struct vdpa_device *vdpa_dev)
+{
+	struct virtio_pci_common_cfg __iomem *common;
+
+	common = vdpa_to_vf(vdpa_dev)->common_cfg;
+
+	return vp_ioread8(&common->config_generation);
+}
+
+/* vDPA op: return the device type recorded at probe time. */
+static u32 ifcvf_vdpa_get_device_id(struct vdpa_device *vdpa_dev)
+{
+	return vdpa_to_vf(vdpa_dev)->dev_type;
+}
+
+/* vDPA op: return the PCI subsystem vendor ID of the backing device. */
+static u32 ifcvf_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev)
+{
+	return vdpa_to_adapter(vdpa_dev)->pdev->subsystem_vendor;
+}
+
+/* vDPA op: virtqueue alignment; IFCVF_QUEUE_ALIGNMENT is PAGE_SIZE. */
+static u32 ifcvf_vdpa_get_vq_align(struct vdpa_device *vdpa_dev)
+{
+ return IFCVF_QUEUE_ALIGNMENT;
+}
+
+/* vDPA op: return the device config size cached at probe time. */
+static size_t ifcvf_vdpa_get_config_size(struct vdpa_device *vdpa_dev)
+{
+	return vdpa_to_vf(vdpa_dev)->config_size;
+}
+
+/* vDPA op: every virtqueue is reported as belonging to group 0. */
+static u32 ifcvf_vdpa_get_vq_group(struct vdpa_device *vdpa, u16 idx)
+{
+ return 0;
+}
+
+/* vDPA op: read @len bytes of device config at @offset into @buf. */
+static void ifcvf_vdpa_get_config(struct vdpa_device *vdpa_dev,
+				  unsigned int offset,
+				  void *buf, unsigned int len)
+{
+	ifcvf_read_dev_config(vdpa_to_vf(vdpa_dev), offset, buf, len);
+}
+
+/* vDPA op: write @len bytes from @buf to device config at @offset. */
+static void ifcvf_vdpa_set_config(struct vdpa_device *vdpa_dev,
+				  unsigned int offset, const void *buf,
+				  unsigned int len)
+{
+	ifcvf_write_dev_config(vdpa_to_vf(vdpa_dev), offset, buf, len);
+}
+
+/* vDPA op: record the config-change callback. Only the callback and
+ * private members of @cb are copied.
+ */
+static void ifcvf_vdpa_set_config_cb(struct vdpa_device *vdpa_dev,
+				     struct vdpa_callback *cb)
+{
+	struct vdpa_callback *cfg_cb = &vdpa_to_vf(vdpa_dev)->config_cb;
+
+	cfg_cb->callback = cb->callback;
+	cfg_cb->private = cb->private;
+}
+
+/* vDPA op: return the irq of queue @qid, or -EINVAL while the queues
+ * share one irq (vqs_reused_irq is non-negative).
+ */
+static int ifcvf_vdpa_get_vq_irq(struct vdpa_device *vdpa_dev,
+				 u16 qid)
+{
+	struct ifcvf_hw *hw = vdpa_to_vf(vdpa_dev);
+
+	if (hw->vqs_reused_irq >= 0)
+		return -EINVAL;
+
+	return hw->vring[qid].irq;
+}
+
+/* vDPA op: describe the notification area of queue @idx. The size is
+ * notify_off_multiplier, or PAGE_SIZE when the multiplier is zero.
+ */
+static struct vdpa_notification_area ifcvf_get_vq_notification(struct vdpa_device *vdpa_dev,
+							       u16 idx)
+{
+	struct ifcvf_hw *hw = vdpa_to_vf(vdpa_dev);
+	struct vdpa_notification_area area;
+
+	area.addr = hw->vring[idx].notify_pa;
+	area.size = hw->notify_off_multiplier ? hw->notify_off_multiplier : PAGE_SIZE;
+
+	return area;
+}
+
+/*
+ * vDPA config operations implemented by this driver.
+ *
+ * IFCVF currently doesn't have on-chip IOMMU, so set_map(), dma_map()
+ * and dma_unmap() are not implemented.
+ */
+static const struct vdpa_config_ops ifc_vdpa_ops = {
+ .get_device_features = ifcvf_vdpa_get_device_features,
+ .set_driver_features = ifcvf_vdpa_set_driver_features,
+ .get_driver_features = ifcvf_vdpa_get_driver_features,
+ .get_status = ifcvf_vdpa_get_status,
+ .set_status = ifcvf_vdpa_set_status,
+ .reset = ifcvf_vdpa_reset,
+ .get_vq_num_max = ifcvf_vdpa_get_vq_num_max,
+ .get_vq_state = ifcvf_vdpa_get_vq_state,
+ .set_vq_state = ifcvf_vdpa_set_vq_state,
+ .set_vq_cb = ifcvf_vdpa_set_vq_cb,
+ .set_vq_ready = ifcvf_vdpa_set_vq_ready,
+ .get_vq_ready = ifcvf_vdpa_get_vq_ready,
+ .set_vq_num = ifcvf_vdpa_set_vq_num,
+ .set_vq_address = ifcvf_vdpa_set_vq_address,
+ .get_vq_irq = ifcvf_vdpa_get_vq_irq,
+ .kick_vq = ifcvf_vdpa_kick_vq,
+ .get_generation = ifcvf_vdpa_get_generation,
+ .get_device_id = ifcvf_vdpa_get_device_id,
+ .get_vendor_id = ifcvf_vdpa_get_vendor_id,
+ .get_vq_align = ifcvf_vdpa_get_vq_align,
+ .get_vq_group = ifcvf_vdpa_get_vq_group,
+ .get_config_size = ifcvf_vdpa_get_config_size,
+ .get_config = ifcvf_vdpa_get_config,
+ .set_config = ifcvf_vdpa_set_config,
+ .set_config_cb = ifcvf_vdpa_set_config_cb,
+ .get_vq_notification = ifcvf_get_vq_notification,
+};
+
+/* virtio ID table installed in the management device by probe for a
+ * net device; terminated by a zeroed entry.
+ */
+static struct virtio_device_id id_table_net[] = {
+ {VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID},
+ {0},
+};
+
+/* virtio ID table installed in the management device by probe for a
+ * block device; terminated by a zeroed entry.
+ */
+static struct virtio_device_id id_table_blk[] = {
+ {VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID},
+ {0},
+};
+
+/* Derive the virtio device type from the PCI IDs of @pdev.
+ *
+ * This driver drives both modern virtio devices and transitional
+ * devices in modern mode.
+ * vDPA requires feature bit VIRTIO_F_ACCESS_PLATFORM,
+ * so legacy devices and transitional devices in legacy
+ * mode will not work for vDPA, this driver will not
+ * drive devices with legacy interface.
+ */
+static u32 get_dev_type(struct pci_dev *pdev)
+{
+	/* device IDs below 0x1040 carry the type in the subsystem device ID */
+	if (pdev->device < 0x1040)
+		return pdev->subsystem_device;
+
+	return pdev->device - 0x1040;
+}
+
+/* Management op: create and register a vDPA device on top of the VF.
+ *
+ * @mdev:   management device embedded in struct ifcvf_vdpa_mgmt_dev
+ * @name:   requested device name, or NULL to use "vdpa<index>"
+ * @config: provisioning attributes; only VDPA_ATTR_DEV_FEATURES is
+ *          looked at and it must be a subset of the hardware features
+ *
+ * Returns 0 on success or a negative errno. On every failure after the
+ * allocation the adapter reference is dropped with put_device() and the
+ * management device's adapter pointer is cleared, so nothing is leaked
+ * or left dangling.
+ */
+static int ifcvf_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
+			      const struct vdpa_dev_set_config *config)
+{
+	struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev;
+	struct ifcvf_adapter *adapter;
+	struct vdpa_device *vdpa_dev;
+	struct pci_dev *pdev;
+	struct ifcvf_hw *vf;
+	u64 device_features;
+	int ret;
+
+	ifcvf_mgmt_dev = container_of(mdev, struct ifcvf_vdpa_mgmt_dev, mdev);
+	vf = &ifcvf_mgmt_dev->vf;
+	pdev = vf->pdev;
+	adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa,
+				    &pdev->dev, &ifc_vdpa_ops, 1, 1, NULL, false);
+	if (IS_ERR(adapter)) {
+		IFCVF_ERR(pdev, "Failed to allocate vDPA structure");
+		return PTR_ERR(adapter);
+	}
+
+	ifcvf_mgmt_dev->adapter = adapter;
+	adapter->pdev = pdev;
+	adapter->vdpa.dma_dev = &pdev->dev;
+	adapter->vdpa.mdev = mdev;
+	adapter->vf = vf;
+	vdpa_dev = &adapter->vdpa;
+
+	device_features = vf->hw_features;
+	if (config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
+		if (config->device_features & ~device_features) {
+			IFCVF_ERR(pdev, "The provisioned features 0x%llx are not supported by this device with features 0x%llx\n",
+				  config->device_features, device_features);
+			ret = -EINVAL;
+			goto err;
+		}
+		device_features &= config->device_features;
+	}
+	vf->dev_features = device_features;
+
+	if (name)
+		ret = dev_set_name(&vdpa_dev->dev, "%s", name);
+	else
+		ret = dev_set_name(&vdpa_dev->dev, "vdpa%u", vdpa_dev->index);
+	if (ret) {
+		IFCVF_ERR(pdev, "Failed to set the vDPA device name\n");
+		goto err;
+	}
+
+	ret = _vdpa_register_device(&adapter->vdpa, vf->nr_vring);
+	if (ret) {
+		IFCVF_ERR(pdev, "Failed to register to vDPA bus");
+		goto err;
+	}
+
+	return 0;
+
+err:
+	/* drops the reference taken by vdpa_alloc_device() */
+	put_device(&adapter->vdpa.dev);
+	ifcvf_mgmt_dev->adapter = NULL;
+
+	return ret;
+}
+
+/* Management op: unregister the vDPA device and forget the adapter. */
+static void ifcvf_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
+{
+	struct ifcvf_vdpa_mgmt_dev *mgmt =
+		container_of(mdev, struct ifcvf_vdpa_mgmt_dev, mdev);
+
+	_vdpa_unregister_device(dev);
+	mgmt->adapter = NULL;
+}
+
+/* Management-device operations; installed as mdev.ops in probe. */
+static const struct vdpa_mgmtdev_ops ifcvf_vdpa_mgmt_dev_ops = {
+ .dev_add = ifcvf_vdpa_dev_add,
+ .dev_del = ifcvf_vdpa_dev_del
+};
+
+/* PCI probe: enable the device, map BARs 0, 2 and 4, set a 64-bit DMA
+ * mask, initialise the hardware state and register a vDPA management
+ * device for the function.
+ *
+ * Returns 0 on success or a negative errno. The management device and
+ * its vring array are freed on failures after their allocation; the
+ * PCI resources are device-managed (pcim_ and devm_ helpers).
+ */
+static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev;
+	struct device *dev = &pdev->dev;
+	struct ifcvf_hw *vf;
+	int ret, i;
+
+	ret = pcim_enable_device(pdev);
+	if (ret) {
+		IFCVF_ERR(pdev, "Failed to enable device\n");
+		return ret;
+	}
+	ret = pcim_iomap_regions(pdev, BIT(0) | BIT(2) | BIT(4),
+				 IFCVF_DRIVER_NAME);
+	if (ret) {
+		IFCVF_ERR(pdev, "Failed to request MMIO region\n");
+		return ret;
+	}
+
+	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
+	if (ret) {
+		IFCVF_ERR(pdev, "No usable DMA configuration\n");
+		return ret;
+	}
+
+	/* make sure the irq vectors are released when the device goes away */
+	ret = devm_add_action_or_reset(dev, ifcvf_free_irq_vectors, pdev);
+	if (ret) {
+		IFCVF_ERR(pdev,
+			  "Failed for adding devres for freeing irq vectors\n");
+		return ret;
+	}
+
+	pci_set_master(pdev);
+	ifcvf_mgmt_dev = kzalloc(sizeof(*ifcvf_mgmt_dev), GFP_KERNEL);
+	if (!ifcvf_mgmt_dev) {
+		IFCVF_ERR(pdev, "Failed to alloc memory for the vDPA management device\n");
+		return -ENOMEM;
+	}
+
+	vf = &ifcvf_mgmt_dev->vf;
+	vf->dev_type = get_dev_type(pdev);
+	vf->base = pcim_iomap_table(pdev);
+	vf->pdev = pdev;
+
+	ret = ifcvf_init_hw(vf, pdev);
+	if (ret) {
+		IFCVF_ERR(pdev, "Failed to init IFCVF hw\n");
+		goto err;
+	}
+
+	/* no per-vq irq is requested yet */
+	for (i = 0; i < vf->nr_vring; i++)
+		vf->vring[i].irq = -EINVAL;
+
+	vf->hw_features = ifcvf_get_hw_features(vf);
+	vf->config_size = ifcvf_get_config_size(vf);
+
+	/* the device type was already derived from the PCI IDs above */
+	switch (vf->dev_type) {
+	case VIRTIO_ID_NET:
+		ifcvf_mgmt_dev->mdev.id_table = id_table_net;
+		break;
+	case VIRTIO_ID_BLOCK:
+		ifcvf_mgmt_dev->mdev.id_table = id_table_blk;
+		break;
+	default:
+		IFCVF_ERR(pdev, "VIRTIO ID %u not supported\n", vf->dev_type);
+		ret = -EOPNOTSUPP;
+		goto err;
+	}
+
+	ifcvf_mgmt_dev->mdev.ops = &ifcvf_vdpa_mgmt_dev_ops;
+	ifcvf_mgmt_dev->mdev.device = dev;
+	ifcvf_mgmt_dev->mdev.max_supported_vqs = vf->nr_vring;
+	ifcvf_mgmt_dev->mdev.supported_features = vf->hw_features;
+	/* same bit helper as the mask test in ifcvf_vdpa_dev_add() */
+	ifcvf_mgmt_dev->mdev.config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_FEATURES);
+
+	ret = vdpa_mgmtdev_register(&ifcvf_mgmt_dev->mdev);
+	if (ret) {
+		IFCVF_ERR(pdev,
+			  "Failed to initialize the management interfaces\n");
+		goto err;
+	}
+
+	pci_set_drvdata(pdev, ifcvf_mgmt_dev);
+
+	return 0;
+
+err:
+	kfree(ifcvf_mgmt_dev->vf.vring);
+	kfree(ifcvf_mgmt_dev);
+	return ret;
+}
+
+/* PCI remove: unregister the management device, then free the vring
+ * array and the management device itself.
+ */
+static void ifcvf_remove(struct pci_dev *pdev)
+{
+	struct ifcvf_vdpa_mgmt_dev *mgmt = pci_get_drvdata(pdev);
+
+	vdpa_mgmtdev_unregister(&mgmt->mdev);
+	kfree(mgmt->vf.vring);
+	kfree(mgmt);
+}
+
+/* PCI IDs claimed by this driver. Every entry pairs the Red Hat/Qumranet
+ * vendor ID with an Intel subsystem vendor ID; the list ends with a
+ * zeroed entry.
+ */
+static struct pci_device_id ifcvf_pci_ids[] = {
+ /* N3000 network device */
+ { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET,
+ N3000_DEVICE_ID,
+ PCI_VENDOR_ID_INTEL,
+ N3000_SUBSYS_DEVICE_ID) },
+ /* C5000X-PL network device
+ * F2000X-PL network device
+ */
+ { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET,
+ VIRTIO_TRANS_ID_NET,
+ PCI_VENDOR_ID_INTEL,
+ VIRTIO_ID_NET) },
+ /* C5000X-PL block device */
+ { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET,
+ VIRTIO_TRANS_ID_BLOCK,
+ PCI_VENDOR_ID_INTEL,
+ VIRTIO_ID_BLOCK) },
+
+ { 0 },
+};
+MODULE_DEVICE_TABLE(pci, ifcvf_pci_ids);
+
+/* PCI driver descriptor; module_pci_driver() generates the module
+ * init/exit code that registers and unregisters it.
+ */
+static struct pci_driver ifcvf_driver = {
+ .name = IFCVF_DRIVER_NAME,
+ .id_table = ifcvf_pci_ids,
+ .probe = ifcvf_probe,
+ .remove = ifcvf_remove,
+};
+
+module_pci_driver(ifcvf_driver);
+
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/vdpa/mlx5/Makefile b/drivers/vdpa/mlx5/Makefile
new file mode 100644
index 0000000000..e791394c33
--- /dev/null
+++ b/drivers/vdpa/mlx5/Makefile
@@ -0,0 +1,4 @@
+# Let sources in sub-directories include headers from core/.
+subdir-ccflags-y += -I$(srctree)/drivers/vdpa/mlx5/core
+
+# mlx5_vdpa is built from the net and core objects when
+# CONFIG_MLX5_VDPA_NET is enabled.
+obj-$(CONFIG_MLX5_VDPA_NET) += mlx5_vdpa.o
+mlx5_vdpa-$(CONFIG_MLX5_VDPA_NET) += net/mlx5_vnet.o core/resources.o core/mr.o net/debug.o
diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
new file mode 100644
index 0000000000..ca56242972
--- /dev/null
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2020 Mellanox Technologies Ltd. */
+
+#ifndef __MLX5_VDPA_H__
+#define __MLX5_VDPA_H__
+
+#include <linux/etherdevice.h>
+#include <linux/vringh.h>
+#include <linux/vdpa.h>
+#include <linux/mlx5/driver.h>
+
+#define MLX5V_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
+
+/* One direct memory region; see populate_mtts() and create_direct_mr()
+ * in mr.c for how the fields are consumed.
+ */
+struct mlx5_vdpa_direct_mr {
+ u64 start;
+ u64 end;
+ /* VHOST_MAP_RO / VHOST_MAP_WO bits, mapped to the mkey lr/lw fields */
+ u32 perm;
+ u32 mr;
+ /* scatterlist table walked when filling the MTT array */
+ struct sg_table sg_head;
+ /* log2 of the byte stride between consecutive MTT entries */
+ int log_size;
+ /* number of MTT entries; also sizes the create_mkey command */
+ int nsg;
+ /* number of scatterlist entries iterated in sg_head */
+ int nent;
+ struct list_head list;
+ /* programmed as the mkey start_addr */
+ u64 offset;
+};
+
+/* Memory-region bookkeeping embedded in struct mlx5_vdpa_dev. */
+struct mlx5_vdpa_mr {
+ u32 mkey;
+
+ /* list of direct MRs descendants of this indirect mr */
+ struct list_head head;
+ unsigned long num_directs;
+ unsigned long num_klms;
+ /* state of dvq mr */
+ bool initialized;
+
+ /* serialize mkey creation and destruction */
+ struct mutex mkey_mtx;
+ bool user_mr;
+};
+
+/* Device-level resources embedded in struct mlx5_vdpa_dev. */
+struct mlx5_vdpa_resources {
+ /* written to the mkey context "pd" field by create_direct_mr() */
+ u32 pdn;
+ struct mlx5_uars_page *uar;
+ void __iomem *kick_addr;
+ u64 phys_kick_addr;
+ /* written to the create_mkey command "uid" field */
+ u16 uid;
+ u32 null_mkey;
+ bool valid;
+};
+
+/* State of the control virtqueue, embedded in struct mlx5_vdpa_dev. */
+struct mlx5_control_vq {
+ struct vhost_iotlb *iotlb;
+ /* spinlock to synchronize iommu table */
+ spinlock_t iommu_lock;
+ struct vringh vring;
+ bool ready;
+ u64 desc_addr;
+ u64 device_addr;
+ u64 driver_addr;
+ struct vdpa_callback event_cb;
+ struct vringh_kiov riov;
+ struct vringh_kiov wiov;
+ unsigned short head;
+ unsigned int received_desc;
+ unsigned int completed_desc;
+};
+
+/* Work item that carries a back-pointer to its mlx5_vdpa_dev. */
+struct mlx5_vdpa_wq_ent {
+ struct work_struct work;
+ struct mlx5_vdpa_dev *mvdev;
+};
+
+/* Virtqueue groups: one for the data virtqueues, one for the control
+ * virtqueue. MLX5_VDPA_NUMVQ_GROUPS is the number of groups.
+ */
+enum {
+ MLX5_VDPA_DATAVQ_GROUP,
+ MLX5_VDPA_CVQ_GROUP,
+ MLX5_VDPA_NUMVQ_GROUPS
+};
+
+/* The number of address spaces equals the number of virtqueue groups. */
+enum {
+ MLX5_VDPA_NUM_AS = MLX5_VDPA_NUMVQ_GROUPS
+};
+
+/* mlx5 vDPA device: embeds the generic vdpa_device together with the
+ * memory-region, resource and control-virtqueue state.
+ */
+struct mlx5_vdpa_dev {
+ struct vdpa_device vdev;
+ struct mlx5_core_dev *mdev;
+ struct mlx5_vdpa_resources res;
+
+ u64 mlx_features;
+ u64 actual_features;
+ u8 status;
+ u32 max_vqs;
+ u16 max_idx;
+ u32 generation;
+
+ struct mlx5_vdpa_mr mr;
+ struct mlx5_control_vq cvq;
+ struct workqueue_struct *wq;
+ /* one entry per virtqueue group */
+ unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS];
+ bool suspended;
+};
+
+/* Resource and memory-region helpers.
+ * NOTE(review): the definitions are not visible here; presumably they
+ * live in core/resources.c and core/mr.c - confirm.
+ */
+int mlx5_vdpa_create_tis(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tisn);
+void mlx5_vdpa_destroy_tis(struct mlx5_vdpa_dev *mvdev, u32 tisn);
+int mlx5_vdpa_create_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 *rqtn);
+int mlx5_vdpa_modify_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 rqtn);
+void mlx5_vdpa_destroy_rqt(struct mlx5_vdpa_dev *mvdev, u32 rqtn);
+int mlx5_vdpa_create_tir(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tirn);
+void mlx5_vdpa_destroy_tir(struct mlx5_vdpa_dev *mvdev, u32 tirn);
+int mlx5_vdpa_alloc_transport_domain(struct mlx5_vdpa_dev *mvdev, u32 *tdn);
+void mlx5_vdpa_dealloc_transport_domain(struct mlx5_vdpa_dev *mvdev, u32 tdn);
+int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev);
+void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev);
+int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, u32 *mkey, u32 *in,
+ int inlen);
+int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, u32 mkey);
+int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb,
+ bool *change_map, unsigned int asid);
+int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb,
+ unsigned int asid);
+void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev);
+void mlx5_vdpa_destroy_mr_asid(struct mlx5_vdpa_dev *mvdev, unsigned int asid);
+
+/* Logging helpers: prefix each message with the calling function, line
+ * number and current pid, and log against the mlx5 core device.
+ */
+#define mlx5_vdpa_warn(__dev, format, ...) \
+	dev_warn((__dev)->mdev->device, "%s:%d:(pid %d) warning: " format, __func__, __LINE__, \
+		 current->pid, ##__VA_ARGS__)
+
+#define mlx5_vdpa_info(__dev, format, ...) \
+	dev_info((__dev)->mdev->device, "%s:%d:(pid %d): " format, __func__, __LINE__, \
+		 current->pid, ##__VA_ARGS__)
+
+/* dev_dbg() is the kernel's debug-level device logger (dev_debug() does not exist) */
+#define mlx5_vdpa_dbg(__dev, format, ...) \
+	dev_dbg((__dev)->mdev->device, "%s:%d:(pid %d): " format, __func__, __LINE__, \
+		current->pid, ##__VA_ARGS__)
+
+#endif /* __MLX5_VDPA_H__ */
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
new file mode 100644
index 0000000000..5a1971fcd8
--- /dev/null
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -0,0 +1,621 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies Ltd. */
+
+#include <linux/vhost_types.h>
+#include <linux/vdpa.h>
+#include <linux/gcd.h>
+#include <linux/string.h>
+#include <linux/mlx5/qp.h>
+#include "mlx5_vdpa.h"
+
+/*
+ * DIV_ROUND_UP where the divisor is a power of 2 given by its log base 2
+ * value. The rounding term is built from a 64-bit constant so that a shift
+ * count of 31 or more does not overflow an int; results for smaller shift
+ * counts are unchanged.
+ */
+#define MLX5_DIV_ROUND_UP_POW2(_n, _s) \
+({ \
+	u64 __s = (_s); \
+	u64 _res; \
+	_res = (((_n) + (1ULL << (__s)) - 1) >> (__s)); \
+	_res; \
+})
+
+/*
+ * Convert a byte length into a translation size: the number of pages of
+ * size 2^page_shift needed to cover @len, halved and rounded up (two page
+ * entries per returned unit).
+ */
+static int get_octo_len(u64 len, int page_shift)
+{
+	u64 pg_sz = 1ULL << page_shift;
+	int pages = ALIGN(len, pg_sz) >> page_shift;
+
+	return (pages + 1) / 2;
+}
+
+/*
+ * Store @mode in the mkey context: the two low bits go to access_mode_1_0
+ * and the remaining upper bits to access_mode_4_2.
+ */
+static void mlx5_set_access_mode(void *mkc, int mode)
+{
+	int low_bits = mode & 0x3;
+	int high_bits = mode >> 2;
+
+	MLX5_SET(mkc, mkc, access_mode_4_2, high_bits);
+	MLX5_SET(mkc, mkc, access_mode_1_0, low_bits);
+}
+
+/*
+ * Fill @mtt with one big-endian DMA address per translation entry of @mr.
+ *
+ * Each of the mr->nent DMA-mapped scatterlist segments is walked in steps
+ * of BIT(mr->log_size) bytes and the address of every step is stored. At
+ * most mr->nsg entries are written in total.
+ */
+static void populate_mtts(struct mlx5_vdpa_direct_mr *mr, __be64 *mtt)
+{
+ struct scatterlist *sg;
+ int nsg = mr->nsg; /* entries that may still be written */
+ u64 dma_addr;
+ u64 dma_len;
+ int j = 0; /* next free slot in mtt[] */
+ int i;
+
+ for_each_sg(mr->sg_head.sgl, sg, mr->nent, i) {
+ /* Leave the segment once it is consumed or the entry budget is spent. */
+ for (dma_addr = sg_dma_address(sg), dma_len = sg_dma_len(sg);
+ nsg && dma_len;
+ nsg--, dma_addr += BIT(mr->log_size), dma_len -= BIT(mr->log_size))
+ mtt[j++] = cpu_to_be64(dma_addr);
+ }
+}
+
+/*
+ * Build and execute the create-mkey command for one direct (MTT access
+ * mode) memory region. The new key is stored in mr->mr on success.
+ *
+ * Returns 0 on success, -ENOMEM if the command buffer cannot be allocated,
+ * or the error reported by mlx5_vdpa_create_mkey().
+ */
+static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr)
+{
+	u64 span = mr->end - mr->start;
+	int octo_len = get_octo_len(span, mr->log_size);
+	void *cmd;
+	void *mkc;
+	int inlen;
+	int err;
+
+	/* Command header followed by the MTT array, padded to 16 bytes. */
+	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + roundup(MLX5_ST_SZ_BYTES(mtt) * mr->nsg, 16);
+	cmd = kvzalloc(inlen, GFP_KERNEL);
+	if (!cmd)
+		return -ENOMEM;
+
+	MLX5_SET(create_mkey_in, cmd, uid, mvdev->res.uid);
+	MLX5_SET(create_mkey_in, cmd, translations_octword_actual_size, octo_len);
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, cmd, memory_key_mkey_entry);
+	mlx5_set_access_mode(mkc, MLX5_MKC_ACCESS_MODE_MTT);
+	MLX5_SET(mkc, mkc, lr, !!(mr->perm & VHOST_MAP_RO));
+	MLX5_SET(mkc, mkc, lw, !!(mr->perm & VHOST_MAP_WO));
+	MLX5_SET(mkc, mkc, pd, mvdev->res.pdn);
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+	MLX5_SET(mkc, mkc, log_page_size, mr->log_size);
+	MLX5_SET(mkc, mkc, translations_octword_size, octo_len);
+	MLX5_SET64(mkc, mkc, start_addr, mr->offset);
+	MLX5_SET64(mkc, mkc, len, span);
+
+	populate_mtts(mr, MLX5_ADDR_OF(create_mkey_in, cmd, klm_pas_mtt));
+
+	err = mlx5_vdpa_create_mkey(mvdev, &mr->mr, cmd, inlen);
+	kvfree(cmd);
+	if (err)
+		mlx5_vdpa_warn(mvdev, "Failed to create direct MR\n");
+
+	return err;
+}
+
+/* Destroy the mkey stored in mr->mr; the command status is not checked. */
+static void destroy_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr)
+{
+	u32 key = mr->mr;
+
+	mlx5_vdpa_destroy_mkey(mvdev, key);
+}
+
+/* First address of @map that lies inside @mr: the larger of the two starts. */
+static u64 map_start(struct vhost_iotlb_map *map, struct mlx5_vdpa_direct_mr *mr)
+{
+	u64 map_lo = map->start;
+	u64 mr_lo = mr->start;
+
+	return map_lo > mr_lo ? map_lo : mr_lo;
+}
+
+/* Exclusive end of the part of @map inside @mr: the smaller of the two ends. */
+static u64 map_end(struct vhost_iotlb_map *map, struct mlx5_vdpa_direct_mr *mr)
+{
+	u64 map_hi = map->last + 1;
+	u64 mr_hi = mr->end;
+
+	return map_hi < mr_hi ? map_hi : mr_hi;
+}
+
+/* Number of bytes of @map that fall within the range covered by @mr. */
+static u64 maplen(struct vhost_iotlb_map *map, struct mlx5_vdpa_direct_mr *mr)
+{
+	u64 lo = map_start(map, mr);
+	u64 hi = map_end(map, mr);
+
+	return hi - lo;
+}
+
+/*
+ * Sentinels returned by indir_start_addr() and indir_len() when the list of
+ * direct MRs is empty; create_indirect_key() turns either into -EINVAL.
+ */
+#define MLX5_VDPA_INVALID_START_ADDR ((u64)-1)
+#define MLX5_VDPA_INVALID_LEN ((u64)-1)
+
+/*
+ * Start address of the first direct MR on mkey->head, or
+ * MLX5_VDPA_INVALID_START_ADDR when the list is empty.
+ */
+static u64 indir_start_addr(struct mlx5_vdpa_mr *mkey)
+{
+	struct mlx5_vdpa_direct_mr *first;
+
+	first = list_first_entry_or_null(&mkey->head, struct mlx5_vdpa_direct_mr, list);
+
+	return first ? first->start : MLX5_VDPA_INVALID_START_ADDR;
+}
+
+/*
+ * Distance from the start of the first direct MR on mkey->head to the end
+ * of the last one, or MLX5_VDPA_INVALID_LEN when the list is empty.
+ */
+static u64 indir_len(struct mlx5_vdpa_mr *mkey)
+{
+	struct mlx5_vdpa_direct_mr *first;
+	struct mlx5_vdpa_direct_mr *last;
+
+	first = list_first_entry_or_null(&mkey->head, struct mlx5_vdpa_direct_mr, list);
+	if (first) {
+		last = list_last_entry(&mkey->head, struct mlx5_vdpa_direct_mr, list);
+		return last->end - first->start;
+	}
+
+	return MLX5_VDPA_INVALID_LEN;
+}
+
+/*
+ * Largest range handled per KLM entry, as a log2 and as a byte count
+ * (2^30). add_direct_chain() caps each direct MR at MAX_KLM_SIZE and
+ * create_user_mr() uses the log2 form to count null keys for a hole.
+ */
+#define LOG_MAX_KLM_SIZE 30
+#define MAX_KLM_SIZE BIT(LOG_MAX_KLM_SIZE)
+
+/* Narrow a 64-bit size to the 32-bit byte count carried by a KLM entry. */
+static u32 klm_bcount(u64 size)
+{
+	u32 bcount = size;
+
+	return bcount;
+}
+
+/*
+ * Populate the KLM array of an indirect mkey creation command.
+ *
+ * Walks the direct MRs on mkey->head in list order and emits one KLM entry
+ * per direct MR. When a direct MR does not start where the previous one
+ * ended, the gap is covered by entries that reference the null mkey.
+ *
+ * A gap is emitted in pieces of at most MAX_KLM_SIZE bytes. This matches
+ * how create_user_mr() budgets num_klms for a hole, and keeps each length
+ * within the 32-bit bcount field; a single entry for the whole gap would
+ * be silently truncated by klm_bcount() for holes of 4GB or more.
+ *
+ * @mvdev: device providing the null mkey
+ * @mkey:  indirect MR whose direct MR list is described
+ * @in:    create_mkey_in command buffer with room for mkey->num_klms KLMs
+ */
+static void fill_indir(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mkey, void *in)
+{
+	struct mlx5_vdpa_direct_mr *dmr;
+	struct mlx5_klm *klmarr;
+	struct mlx5_klm *klm;
+	bool first = true;
+	u64 preve;
+	int i;
+
+	klmarr = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
+	i = 0;
+	list_for_each_entry(dmr, &mkey->head, list) {
+again:
+		klm = &klmarr[i++];
+		if (first) {
+			/* Nothing precedes the first direct MR, so no gap. */
+			preve = dmr->start;
+			first = false;
+		}
+
+		if (preve == dmr->start) {
+			klm->key = cpu_to_be32(dmr->mr);
+			klm->bcount = cpu_to_be32(klm_bcount(dmr->end - dmr->start));
+			preve = dmr->end;
+		} else {
+			/* Cover the next piece of the hole, then re-check. */
+			u64 bcount = min_t(u64, dmr->start - preve, MAX_KLM_SIZE);
+
+			klm->key = cpu_to_be32(mvdev->res.null_mkey);
+			klm->bcount = cpu_to_be32(klm_bcount(bcount));
+			preve += bcount;
+			goto again;
+		}
+	}
+}
+
+/* Bytes taken by @nklms KLM entries: count rounded up to a multiple of 4, 16 bytes each. */
+static int klm_byte_size(int nklms)
+{
+	int padded = ALIGN(nklms, 4);
+
+	return padded * 16;
+}
+
+/*
+ * Create the indirect (KLM access mode) mkey that spans all direct MRs on
+ * mr->head. The new key is stored in mr->mkey.
+ *
+ * Returns -EINVAL when there are no direct MRs, -ENOMEM when the command
+ * buffer cannot be allocated, otherwise the result of
+ * mlx5_vdpa_create_mkey().
+ */
+static int create_indirect_key(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr)
+{
+	u64 base = indir_start_addr(mr);
+	u64 span = indir_len(mr);
+	int klm_bytes;
+	void *cmd;
+	void *mkc;
+	int inlen;
+	int err;
+
+	if (base == MLX5_VDPA_INVALID_START_ADDR || span == MLX5_VDPA_INVALID_LEN)
+		return -EINVAL;
+
+	klm_bytes = klm_byte_size(mr->num_klms);
+	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + klm_bytes;
+	cmd = kzalloc(inlen, GFP_KERNEL);
+	if (!cmd)
+		return -ENOMEM;
+
+	MLX5_SET(create_mkey_in, cmd, uid, mvdev->res.uid);
+	MLX5_SET(create_mkey_in, cmd, translations_octword_actual_size, mr->num_klms);
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, cmd, memory_key_mkey_entry);
+	mlx5_set_access_mode(mkc, MLX5_MKC_ACCESS_MODE_KLMS);
+	MLX5_SET(mkc, mkc, lr, 1);
+	MLX5_SET(mkc, mkc, lw, 1);
+	MLX5_SET(mkc, mkc, pd, mvdev->res.pdn);
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+	MLX5_SET(mkc, mkc, translations_octword_size, klm_bytes / 16);
+	MLX5_SET64(mkc, mkc, start_addr, base);
+	MLX5_SET64(mkc, mkc, len, span);
+
+	fill_indir(mvdev, mr, cmd);
+
+	err = mlx5_vdpa_create_mkey(mvdev, &mr->mkey, cmd, inlen);
+	kfree(cmd);
+
+	return err;
+}
+
+/* Destroy the mkey stored in mkey->mkey; the command status is not checked. */
+static void destroy_indirect_key(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mkey)
+{
+	u32 key = mkey->mkey;
+
+	mlx5_vdpa_destroy_mkey(mvdev, key);
+}
+
+/*
+ * Build the scatterlist for one direct MR, DMA-map it and create its mkey.
+ *
+ * The entry size is the largest power of two not exceeding the GCD of the
+ * lengths of all iotlb maps intersecting [mr->start, mr->end). The range is
+ * then described with entries of that size, DMA-mapped, and handed to
+ * create_direct_mr().
+ *
+ * Two details differ from the previous version:
+ *  - both iotlb walks use [mr->start, mr->end - 1] as the search range; the
+ *    first walk used to pass an unrelated running sum of sizes as the start
+ *  - when a map begins before mr->start, the physical address is advanced
+ *    by that offset, so the pages mapped are the ones inside the MR rather
+ *    than the head of the map
+ *
+ * @mvdev: device whose DMA device and PD are used
+ * @mr:    direct MR with start/end/perm already set; log_size, nsg, nent,
+ *         sg_head and the mkey are filled in here
+ * @iotlb: translation table providing the IOVA to PA maps
+ *
+ * Returns 0 on success, the sg_alloc_table() error, -ENOMEM when the
+ * scatterlist runs out of entries or DMA mapping fails, or the
+ * create_direct_mr() error.
+ */
+static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr,
+			 struct vhost_iotlb *iotlb)
+{
+	struct vhost_iotlb_map *map;
+	unsigned long lgcd = 0;
+	int log_entity_size;
+	unsigned long size;
+	int err;
+	struct page *pg;
+	unsigned int nsg;
+	int sglen;
+	u64 offset;
+	u64 pa;
+	u64 paend;
+	struct scatterlist *sg;
+	struct device *dma = mvdev->vdev.dma_dev;
+
+	for (map = vhost_iotlb_itree_first(iotlb, mr->start, mr->end - 1);
+	     map; map = vhost_iotlb_itree_next(map, mr->start, mr->end - 1)) {
+		size = maplen(map, mr);
+		lgcd = gcd(lgcd, size);
+	}
+	log_entity_size = ilog2(lgcd);
+
+	sglen = 1 << log_entity_size;
+	nsg = MLX5_DIV_ROUND_UP_POW2(mr->end - mr->start, log_entity_size);
+
+	err = sg_alloc_table(&mr->sg_head, nsg, GFP_KERNEL);
+	if (err)
+		return err;
+
+	sg = mr->sg_head.sgl;
+	for (map = vhost_iotlb_itree_first(iotlb, mr->start, mr->end - 1);
+	     map; map = vhost_iotlb_itree_next(map, mr->start, mr->end - 1)) {
+		/* Skip the part of the map that precedes this MR. */
+		offset = mr->start > map->start ? mr->start - map->start : 0;
+		pa = map->addr + offset;
+		paend = pa + maplen(map, mr);
+		for (; pa < paend; pa += sglen) {
+			pg = pfn_to_page(__phys_to_pfn(pa));
+			if (!sg) {
+				mlx5_vdpa_warn(mvdev, "sg null. start 0x%llx, end 0x%llx\n",
+					       map->start, map->last + 1);
+				err = -ENOMEM;
+				goto err_map;
+			}
+			sg_set_page(sg, pg, sglen, 0);
+			sg = sg_next(sg);
+			if (!sg)
+				goto done;
+		}
+	}
+done:
+	mr->log_size = log_entity_size;
+	mr->nsg = nsg;
+	mr->nent = dma_map_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0);
+	if (!mr->nent) {
+		err = -ENOMEM;
+		goto err_map;
+	}
+
+	err = create_direct_mr(mvdev, mr);
+	if (err)
+		goto err_direct;
+
+	return 0;
+
+err_direct:
+	dma_unmap_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0);
+err_map:
+	sg_free_table(&mr->sg_head);
+	return err;
+}
+
+/*
+ * Undo map_direct_mr(): destroy the mkey first, then drop the DMA mapping
+ * and free the scatterlist table.
+ */
+static void unmap_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr)
+{
+	struct device *dma_dev = mvdev->vdev.dma_dev;
+	struct sg_table *sgt = &mr->sg_head;
+
+	destroy_direct_mr(mvdev, mr);
+	dma_unmap_sg_attrs(dma_dev, sgt->sgl, mr->nsg, DMA_BIDIRECTIONAL, 0);
+	sg_free_table(sgt);
+}
+
+/*
+ * Cover [start, start + size) with direct MRs of at most MAX_KLM_SIZE bytes
+ * each and append them to mvdev->mr.head.
+ *
+ * New direct MRs are collected on a private list and spliced onto the main
+ * list only after the whole range has been mapped. mr->num_directs and
+ * mr->num_klms are incremented once per direct MR created.
+ *
+ * On failure, direct MRs created by this call are released as well as those
+ * already on mvdev->mr.head. The private list used to be skipped, which
+ * leaked its entries together with their DMA mappings and mkeys.
+ * NOTE(review): the counters are not rolled back on failure, as before.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error from
+ * map_direct_mr().
+ */
+static int add_direct_chain(struct mlx5_vdpa_dev *mvdev, u64 start, u64 size, u8 perm,
+			    struct vhost_iotlb *iotlb)
+{
+	struct mlx5_vdpa_mr *mr = &mvdev->mr;
+	struct mlx5_vdpa_direct_mr *dmr;
+	struct mlx5_vdpa_direct_mr *n;
+	LIST_HEAD(tmp);
+	u64 st;
+	u64 sz;
+	int err;
+
+	st = start;
+	while (size) {
+		sz = (u32)min_t(u64, MAX_KLM_SIZE, size);
+		dmr = kzalloc(sizeof(*dmr), GFP_KERNEL);
+		if (!dmr) {
+			err = -ENOMEM;
+			goto err_alloc;
+		}
+
+		dmr->start = st;
+		dmr->end = st + sz;
+		dmr->perm = perm;
+		err = map_direct_mr(mvdev, dmr, iotlb);
+		if (err) {
+			kfree(dmr);
+			goto err_alloc;
+		}
+
+		list_add_tail(&dmr->list, &tmp);
+		size -= sz;
+		mr->num_directs++;
+		mr->num_klms++;
+		st += sz;
+	}
+	list_splice_tail(&tmp, &mr->head);
+	return 0;
+
+err_alloc:
+	/* Not yet spliced, so nothing else can reach these entries. */
+	list_for_each_entry_safe(dmr, n, &tmp, list) {
+		list_del_init(&dmr->list);
+		unmap_direct_mr(mvdev, dmr);
+		kfree(dmr);
+	}
+	list_for_each_entry_safe(dmr, n, &mr->head, list) {
+		list_del_init(&dmr->list);
+		unmap_direct_mr(mvdev, dmr);
+		kfree(dmr);
+	}
+	return err;
+}
+
+/* The iotlb pointer contains a list of maps. Go over the maps, possibly
+ * merging mergeable maps, and create direct memory keys that provide the
+ * device access to memory. The direct mkeys are then referred to by the
+ * indirect memory key that provides access to the entire address space given
+ * by iotlb.
+ */
+static int create_user_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
+{
+ struct mlx5_vdpa_mr *mr = &mvdev->mr;
+ struct mlx5_vdpa_direct_mr *dmr;
+ struct mlx5_vdpa_direct_mr *n;
+ struct vhost_iotlb_map *map;
+ u32 pperm = U16_MAX; /* permissions of the run being accumulated */
+ u64 last = U64_MAX;
+ u64 ps = U64_MAX; /* start of the current run; U64_MAX means no run yet */
+ u64 pe = U64_MAX; /* exclusive end of the current run */
+ u64 start = 0;
+ int err = 0;
+ int nnuls;
+
+ INIT_LIST_HEAD(&mr->head);
+ for (map = vhost_iotlb_itree_first(iotlb, start, last); map;
+ map = vhost_iotlb_itree_next(map, start, last)) {
+ start = map->start;
+ /* Extend the run when this map is contiguous and has equal permissions. */
+ if (pe == map->start && pperm == map->perm) {
+ pe = map->last + 1;
+ } else {
+ /* Flush the previous run, if any, before starting a new one. */
+ if (ps != U64_MAX) {
+ if (pe < map->start) {
+ /* We have a hole in the map. Check how
+ * many null keys are required to fill it.
+ */
+ nnuls = MLX5_DIV_ROUND_UP_POW2(map->start - pe,
+ LOG_MAX_KLM_SIZE);
+ mr->num_klms += nnuls;
+ }
+ err = add_direct_chain(mvdev, ps, pe - ps, pperm, iotlb);
+ if (err)
+ goto err_chain;
+ }
+ ps = map->start;
+ pe = map->last + 1;
+ pperm = map->perm;
+ }
+ }
+ /* Flush the run that was still open when the walk ended. */
+ err = add_direct_chain(mvdev, ps, pe - ps, pperm, iotlb);
+ if (err)
+ goto err_chain;
+
+ /* Create the memory key that defines the guest's address space. This
+ * memory key refers to the direct keys that contain the MTT
+ * translations
+ */
+ err = create_indirect_key(mvdev, mr);
+ if (err)
+ goto err_chain;
+
+ mr->user_mr = true;
+ return 0;
+
+err_chain:
+ /* Release every direct MR on the list, walking it from the tail. */
+ list_for_each_entry_safe_reverse(dmr, n, &mr->head, list) {
+ list_del_init(&dmr->list);
+ unmap_direct_mr(mvdev, dmr);
+ kfree(dmr);
+ }
+ return err;
+}
+
+/*
+ * Create a physical-address (PA access mode) mkey with local read/write
+ * access and length64 set, storing it in mr->mkey. mr->user_mr is cleared
+ * on success.
+ *
+ * Returns 0, -ENOMEM, or the mlx5_vdpa_create_mkey() error.
+ */
+static int create_dma_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+	void *mkc;
+	u32 *cmd;
+	int err;
+
+	cmd = kzalloc(inlen, GFP_KERNEL);
+	if (!cmd)
+		return -ENOMEM;
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, cmd, memory_key_mkey_entry);
+	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
+	MLX5_SET(mkc, mkc, lr, 1);
+	MLX5_SET(mkc, mkc, lw, 1);
+	MLX5_SET(mkc, mkc, length64, 1);
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+	MLX5_SET(mkc, mkc, pd, mvdev->res.pdn);
+
+	err = mlx5_vdpa_create_mkey(mvdev, &mr->mkey, cmd, inlen);
+	kfree(cmd);
+	if (err)
+		return err;
+
+	mr->user_mr = false;
+	return 0;
+}
+
+/* Destroy the mkey stored in mr->mkey; the command status is not checked. */
+static void destroy_dma_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr)
+{
+	u32 key = mr->mkey;
+
+	mlx5_vdpa_destroy_mkey(mvdev, key);
+}
+
+/*
+ * Copy every map of @src into the control VQ iotlb. With a NULL @src a
+ * single read/write identity range covering [0, ULLONG_MAX] is added.
+ *
+ * Returns 0 or the first vhost_iotlb_add_range() error; ranges added before
+ * the failure are left in place.
+ */
+static int dup_iotlb(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *src)
+{
+	struct vhost_iotlb *dst = mvdev->cvq.iotlb;
+	u64 lo = 0, hi = ULLONG_MAX;
+	struct vhost_iotlb_map *map;
+	int err;
+
+	if (!src)
+		return vhost_iotlb_add_range(dst, lo, hi, lo, VHOST_ACCESS_RW);
+
+	map = vhost_iotlb_itree_first(src, lo, hi);
+	while (map) {
+		err = vhost_iotlb_add_range(dst, map->start, map->last,
+					    map->addr, map->perm);
+		if (err)
+			return err;
+
+		map = vhost_iotlb_itree_next(map, lo, hi);
+	}
+
+	return 0;
+}
+
+/* Remove every range from the control VQ iotlb. */
+static void prune_iotlb(struct mlx5_vdpa_dev *mvdev)
+{
+	struct vhost_iotlb *cvq_iotlb = mvdev->cvq.iotlb;
+
+	vhost_iotlb_del_range(cvq_iotlb, 0, ULLONG_MAX);
+}
+
+/*
+ * Tear down a user MR: destroy the indirect key first, then unmap and free
+ * every direct MR, walking the list from its tail.
+ */
+static void destroy_user_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr)
+{
+	struct mlx5_vdpa_direct_mr *cur;
+	struct mlx5_vdpa_direct_mr *prev;
+
+	destroy_indirect_key(mvdev, mr);
+
+	list_for_each_entry_safe_reverse(cur, prev, &mr->head, list) {
+		list_del_init(&cur->list);
+		unmap_direct_mr(mvdev, cur);
+		kfree(cur);
+	}
+}
+
+/* Empty the control VQ iotlb, but only when @asid is the one bound to the CVQ group. */
+static void _mlx5_vdpa_destroy_cvq_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid)
+{
+	if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] == asid)
+		prune_iotlb(mvdev);
+}
+
+/*
+ * Destroy the data VQ memory region when @asid is the one bound to the data
+ * VQ group and the region is initialized. User and DMA regions take their
+ * respective teardown paths; the initialized flag is cleared afterwards.
+ */
+static void _mlx5_vdpa_destroy_dvq_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid)
+{
+	struct mlx5_vdpa_mr *mr = &mvdev->mr;
+
+	if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] != asid || !mr->initialized)
+		return;
+
+	if (!mr->user_mr)
+		destroy_dma_mr(mvdev, mr);
+	else
+		destroy_user_mr(mvdev, mr);
+
+	mr->initialized = false;
+}
+
+/*
+ * Destroy whatever memory state is bound to @asid: the data VQ region, then
+ * the control VQ iotlb. Both steps run under mkey_mtx.
+ */
+void mlx5_vdpa_destroy_mr_asid(struct mlx5_vdpa_dev *mvdev, unsigned int asid)
+{
+	struct mutex *lock = &mvdev->mr.mkey_mtx;
+
+	mutex_lock(lock);
+	_mlx5_vdpa_destroy_dvq_mr(mvdev, asid);
+	_mlx5_vdpa_destroy_cvq_mr(mvdev, asid);
+	mutex_unlock(lock);
+}
+
+/*
+ * Destroy the memory state of both groups: first the address space bound
+ * to the control VQ group, then the one bound to the data VQ group.
+ */
+void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev)
+{
+	unsigned int cvq_asid = mvdev->group2asid[MLX5_VDPA_CVQ_GROUP];
+
+	mlx5_vdpa_destroy_mr_asid(mvdev, cvq_asid);
+	/* Read the data VQ mapping only after the first call, as before. */
+	mlx5_vdpa_destroy_mr_asid(mvdev, mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]);
+}
+
+/*
+ * Populate the control VQ iotlb from @iotlb when @asid is the one bound to
+ * the CVQ group; otherwise do nothing and report success.
+ */
+static int _mlx5_vdpa_create_cvq_mr(struct mlx5_vdpa_dev *mvdev,
+				    struct vhost_iotlb *iotlb,
+				    unsigned int asid)
+{
+	if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] == asid)
+		return dup_iotlb(mvdev, iotlb);
+
+	return 0;
+}
+
+/*
+ * Create the data VQ memory region when @asid is the one bound to the data
+ * VQ group and no region is initialized yet. A non-NULL @iotlb yields a
+ * user MR, a NULL one a DMA MR. The initialized flag is set on success.
+ *
+ * Returns 0 when nothing had to be done or creation succeeded, otherwise
+ * the creation error.
+ */
+static int _mlx5_vdpa_create_dvq_mr(struct mlx5_vdpa_dev *mvdev,
+				    struct vhost_iotlb *iotlb,
+				    unsigned int asid)
+{
+	struct mlx5_vdpa_mr *mr = &mvdev->mr;
+	int err;
+
+	if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] != asid || mr->initialized)
+		return 0;
+
+	err = iotlb ? create_user_mr(mvdev, iotlb) : create_dma_mr(mvdev, mr);
+	if (!err)
+		mr->initialized = true;
+
+	return err;
+}
+
+/*
+ * Create the data VQ region and then the control VQ iotlb for @asid. If the
+ * control VQ step fails, the data VQ region is destroyed again.
+ */
+static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev,
+				struct vhost_iotlb *iotlb, unsigned int asid)
+{
+	int err = _mlx5_vdpa_create_dvq_mr(mvdev, iotlb, asid);
+
+	if (err)
+		return err;
+
+	err = _mlx5_vdpa_create_cvq_mr(mvdev, iotlb, asid);
+	if (err)
+		_mlx5_vdpa_destroy_dvq_mr(mvdev, asid);
+
+	return err;
+}
+
+/* Locked wrapper: runs _mlx5_vdpa_create_mr() while holding mkey_mtx. */
+int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb,
+			unsigned int asid)
+{
+	struct mutex *lock = &mvdev->mr.mkey_mtx;
+	int ret;
+
+	mutex_lock(lock);
+	ret = _mlx5_vdpa_create_mr(mvdev, iotlb, asid);
+	mutex_unlock(lock);
+
+	return ret;
+}
+
+/*
+ * Handle a new memory map for @asid.
+ *
+ * If a data VQ region is already initialized, nothing is created and
+ * *change_map is set to true to report that an existing map is being
+ * updated. Otherwise *change_map is false and the regions are created
+ * under mkey_mtx.
+ *
+ * Returns 0, or the error from _mlx5_vdpa_create_mr().
+ */
+int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb,
+			     bool *change_map, unsigned int asid)
+{
+	struct mlx5_vdpa_mr *mr = &mvdev->mr;
+	int err = 0;
+
+	*change_map = false;
+
+	mutex_lock(&mr->mkey_mtx);
+	if (mr->initialized) {
+		mlx5_vdpa_info(mvdev, "memory map update\n");
+		*change_map = true;
+	} else {
+		err = _mlx5_vdpa_create_mr(mvdev, iotlb, asid);
+	}
+	mutex_unlock(&mr->mkey_mtx);
+
+	return err;
+}
diff --git a/drivers/vdpa/mlx5/core/resources.c b/drivers/vdpa/mlx5/core/resources.c
new file mode 100644
index 0000000000..d5a59c9035
--- /dev/null
+++ b/drivers/vdpa/mlx5/core/resources.c
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies Ltd. */
+
+#include <linux/iova.h>
+#include <linux/mlx5/driver.h>
+#include "mlx5_vdpa.h"
+
+/*
+ * Issue ALLOC_PD on behalf of @uid. On success the returned pd field is
+ * stored in @pdn; on failure @pdn is left untouched.
+ */
+static int alloc_pd(struct mlx5_vdpa_dev *dev, u32 *pdn, u16 uid)
+{
+	u32 cmd_out[MLX5_ST_SZ_DW(alloc_pd_out)] = {};
+	u32 cmd_in[MLX5_ST_SZ_DW(alloc_pd_in)] = {};
+	int err;
+
+	MLX5_SET(alloc_pd_in, cmd_in, uid, uid);
+	MLX5_SET(alloc_pd_in, cmd_in, opcode, MLX5_CMD_OP_ALLOC_PD);
+
+	err = mlx5_cmd_exec_inout(dev->mdev, alloc_pd, cmd_in, cmd_out);
+	if (err)
+		return err;
+
+	*pdn = MLX5_GET(alloc_pd_out, cmd_out, pd);
+	return 0;
+}
+
+/* Issue DEALLOC_PD for @pdn on behalf of @uid and return the command status. */
+static int dealloc_pd(struct mlx5_vdpa_dev *dev, u32 pdn, u16 uid)
+{
+	u32 cmd_in[MLX5_ST_SZ_DW(dealloc_pd_in)] = {};
+
+	MLX5_SET(dealloc_pd_in, cmd_in, uid, uid);
+	MLX5_SET(dealloc_pd_in, cmd_in, pd, pdn);
+	MLX5_SET(dealloc_pd_in, cmd_in, opcode, MLX5_CMD_OP_DEALLOC_PD);
+
+	return mlx5_cmd_exec_in(dev->mdev, dealloc_pd, cmd_in);
+}
+
+/*
+ * Issue QUERY_SPECIAL_CONTEXTS and store the returned null_mkey field in
+ * @null_mkey. The output is left untouched when the command fails.
+ */
+static int get_null_mkey(struct mlx5_vdpa_dev *dev, u32 *null_mkey)
+{
+	u32 cmd_out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {};
+	u32 cmd_in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {};
+	int err;
+
+	MLX5_SET(query_special_contexts_in, cmd_in, opcode, MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS);
+
+	err = mlx5_cmd_exec_inout(dev->mdev, query_special_contexts, cmd_in, cmd_out);
+	if (err)
+		return err;
+
+	*null_mkey = MLX5_GET(query_special_contexts_out, cmd_out, null_mkey);
+	return 0;
+}
+
+/*
+ * Create a user context with the RAW_TX capability and store its uid in
+ * @uid.
+ *
+ * When the umem_uid_0 capability is set, nothing is created: 0 is returned
+ * and @uid is not written. Returns -EOPNOTSUPP when log_max_uctx is zero,
+ * -ENOMEM on allocation failure, otherwise the command status.
+ */
+static int create_uctx(struct mlx5_vdpa_dev *mvdev, u16 *uid)
+{
+	u32 cmd_out[MLX5_ST_SZ_DW(create_uctx_out)] = {};
+	int inlen = MLX5_ST_SZ_BYTES(create_uctx_in);
+	void *cmd_in;
+	int err;
+
+	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0))
+		return 0;
+
+	/* 0 means not supported */
+	if (!MLX5_CAP_GEN(mvdev->mdev, log_max_uctx))
+		return -EOPNOTSUPP;
+
+	cmd_in = kzalloc(inlen, GFP_KERNEL);
+	if (!cmd_in)
+		return -ENOMEM;
+
+	MLX5_SET(create_uctx_in, cmd_in, uctx.cap, MLX5_UCTX_CAP_RAW_TX);
+	MLX5_SET(create_uctx_in, cmd_in, opcode, MLX5_CMD_OP_CREATE_UCTX);
+
+	err = mlx5_cmd_exec(mvdev->mdev, cmd_in, inlen, cmd_out, sizeof(cmd_out));
+	kfree(cmd_in);
+	if (err)
+		return err;
+
+	*uid = MLX5_GET(create_uctx_out, cmd_out, uid);
+	return 0;
+}
+
+/* Destroy the user context @uid; a uid of 0 is ignored. The command status is dropped. */
+static void destroy_uctx(struct mlx5_vdpa_dev *mvdev, u32 uid)
+{
+	u32 cmd_out[MLX5_ST_SZ_DW(destroy_uctx_out)] = {};
+	u32 cmd_in[MLX5_ST_SZ_DW(destroy_uctx_in)] = {};
+
+	if (uid) {
+		MLX5_SET(destroy_uctx_in, cmd_in, uid, uid);
+		MLX5_SET(destroy_uctx_in, cmd_in, opcode, MLX5_CMD_OP_DESTROY_UCTX);
+
+		mlx5_cmd_exec(mvdev->mdev, cmd_in, sizeof(cmd_in), cmd_out, sizeof(cmd_out));
+	}
+}
+
+/*
+ * Execute CREATE_TIS using the caller-prepared command buffer @in; the
+ * opcode and the device uid are filled in here. The new TIS number is
+ * stored in @tisn on success only.
+ */
+int mlx5_vdpa_create_tis(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tisn)
+{
+	u32 cmd_out[MLX5_ST_SZ_DW(create_tis_out)] = {};
+	int err;
+
+	MLX5_SET(create_tis_in, in, uid, mvdev->res.uid);
+	MLX5_SET(create_tis_in, in, opcode, MLX5_CMD_OP_CREATE_TIS);
+
+	err = mlx5_cmd_exec_inout(mvdev->mdev, create_tis, in, cmd_out);
+	if (err)
+		return err;
+
+	*tisn = MLX5_GET(create_tis_out, cmd_out, tisn);
+	return 0;
+}
+
+/* Execute DESTROY_TIS for @tisn under the device uid; the status is dropped. */
+void mlx5_vdpa_destroy_tis(struct mlx5_vdpa_dev *mvdev, u32 tisn)
+{
+	u32 cmd_in[MLX5_ST_SZ_DW(destroy_tis_in)] = {};
+
+	MLX5_SET(destroy_tis_in, cmd_in, tisn, tisn);
+	MLX5_SET(destroy_tis_in, cmd_in, uid, mvdev->res.uid);
+	MLX5_SET(destroy_tis_in, cmd_in, opcode, MLX5_CMD_OP_DESTROY_TIS);
+
+	mlx5_cmd_exec_in(mvdev->mdev, destroy_tis, cmd_in);
+}
+
+/*
+ * Execute CREATE_RQT using the caller-prepared buffer @in of @inlen bytes.
+ * Only the opcode is filled in here; the uid field is not touched. The new
+ * RQT number is stored in @rqtn on success only.
+ */
+int mlx5_vdpa_create_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 *rqtn)
+{
+	u32 cmd_out[MLX5_ST_SZ_DW(create_rqt_out)] = {};
+	int err;
+
+	MLX5_SET(create_rqt_in, in, opcode, MLX5_CMD_OP_CREATE_RQT);
+
+	err = mlx5_cmd_exec(mvdev->mdev, in, inlen, cmd_out, sizeof(cmd_out));
+	if (err)
+		return err;
+
+	*rqtn = MLX5_GET(create_rqt_out, cmd_out, rqtn);
+	return 0;
+}
+
+/*
+ * Execute MODIFY_RQT for @rqtn using the caller-prepared buffer @in of
+ * @inlen bytes; opcode, uid and rqtn are filled in here. The output buffer
+ * is sized from the create_rqt_out layout.
+ */
+int mlx5_vdpa_modify_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 rqtn)
+{
+	u32 cmd_out[MLX5_ST_SZ_DW(create_rqt_out)] = {};
+
+	MLX5_SET(modify_rqt_in, in, opcode, MLX5_CMD_OP_MODIFY_RQT);
+	MLX5_SET(modify_rqt_in, in, rqtn, rqtn);
+	MLX5_SET(modify_rqt_in, in, uid, mvdev->res.uid);
+
+	return mlx5_cmd_exec(mvdev->mdev, in, inlen, cmd_out, sizeof(cmd_out));
+}
+
+/* Execute DESTROY_RQT for @rqtn under the device uid; the status is dropped. */
+void mlx5_vdpa_destroy_rqt(struct mlx5_vdpa_dev *mvdev, u32 rqtn)
+{
+	u32 cmd_in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {};
+
+	MLX5_SET(destroy_rqt_in, cmd_in, rqtn, rqtn);
+	MLX5_SET(destroy_rqt_in, cmd_in, uid, mvdev->res.uid);
+	MLX5_SET(destroy_rqt_in, cmd_in, opcode, MLX5_CMD_OP_DESTROY_RQT);
+
+	mlx5_cmd_exec_in(mvdev->mdev, destroy_rqt, cmd_in);
+}
+
+/*
+ * Execute CREATE_TIR using the caller-prepared buffer @in. Only the opcode
+ * is filled in here; the uid field is not touched. The new TIR number is
+ * stored in @tirn on success only.
+ */
+int mlx5_vdpa_create_tir(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tirn)
+{
+	u32 cmd_out[MLX5_ST_SZ_DW(create_tir_out)] = {};
+	int err;
+
+	MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR);
+
+	err = mlx5_cmd_exec_inout(mvdev->mdev, create_tir, in, cmd_out);
+	if (err)
+		return err;
+
+	*tirn = MLX5_GET(create_tir_out, cmd_out, tirn);
+	return 0;
+}
+
+/* Execute DESTROY_TIR for @tirn under the device uid; the status is dropped. */
+void mlx5_vdpa_destroy_tir(struct mlx5_vdpa_dev *mvdev, u32 tirn)
+{
+	u32 cmd_in[MLX5_ST_SZ_DW(destroy_tir_in)] = {};
+
+	MLX5_SET(destroy_tir_in, cmd_in, tirn, tirn);
+	MLX5_SET(destroy_tir_in, cmd_in, uid, mvdev->res.uid);
+	MLX5_SET(destroy_tir_in, cmd_in, opcode, MLX5_CMD_OP_DESTROY_TIR);
+
+	mlx5_cmd_exec_in(mvdev->mdev, destroy_tir, cmd_in);
+}
+
+/*
+ * Execute ALLOC_TRANSPORT_DOMAIN under the device uid. The returned
+ * transport_domain field is stored in @tdn on success only.
+ */
+int mlx5_vdpa_alloc_transport_domain(struct mlx5_vdpa_dev *mvdev, u32 *tdn)
+{
+	u32 cmd_out[MLX5_ST_SZ_DW(alloc_transport_domain_out)] = {};
+	u32 cmd_in[MLX5_ST_SZ_DW(alloc_transport_domain_in)] = {};
+	int err;
+
+	MLX5_SET(alloc_transport_domain_in, cmd_in, uid, mvdev->res.uid);
+	MLX5_SET(alloc_transport_domain_in, cmd_in, opcode, MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN);
+
+	err = mlx5_cmd_exec_inout(mvdev->mdev, alloc_transport_domain, cmd_in, cmd_out);
+	if (err)
+		return err;
+
+	*tdn = MLX5_GET(alloc_transport_domain_out, cmd_out, transport_domain);
+	return 0;
+}
+
+/* Execute DEALLOC_TRANSPORT_DOMAIN for @tdn under the device uid; the status is dropped. */
+void mlx5_vdpa_dealloc_transport_domain(struct mlx5_vdpa_dev *mvdev, u32 tdn)
+{
+	u32 cmd_in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)] = {};
+
+	MLX5_SET(dealloc_transport_domain_in, cmd_in, transport_domain, tdn);
+	MLX5_SET(dealloc_transport_domain_in, cmd_in, uid, mvdev->res.uid);
+	MLX5_SET(dealloc_transport_domain_in, cmd_in, opcode, MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN);
+
+	mlx5_cmd_exec_in(mvdev->mdev, dealloc_transport_domain, cmd_in);
+}
+
+/*
+ * Execute CREATE_MKEY using the caller-prepared buffer @in of @inlen bytes.
+ * The opcode and the device uid are filled in here, overwriting any uid the
+ * caller set. On success the returned mkey_index is converted with
+ * mlx5_idx_to_mkey() and stored in @mkey.
+ */
+int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, u32 *mkey, u32 *in,
+			  int inlen)
+{
+	u32 cmd_out[MLX5_ST_SZ_DW(create_mkey_out)] = {};
+	int err;
+
+	MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid);
+	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
+
+	err = mlx5_cmd_exec(mvdev->mdev, in, inlen, cmd_out, sizeof(cmd_out));
+	if (err)
+		return err;
+
+	*mkey = mlx5_idx_to_mkey(MLX5_GET(create_mkey_out, cmd_out, mkey_index));
+	return 0;
+}
+
+/*
+ * Execute DESTROY_MKEY for @mkey under the device uid. The key is converted
+ * to its index with mlx5_mkey_to_idx(). Returns the command status.
+ */
+int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, u32 mkey)
+{
+	u32 cmd_in[MLX5_ST_SZ_DW(destroy_mkey_in)] = {};
+
+	MLX5_SET(destroy_mkey_in, cmd_in, mkey_index, mlx5_mkey_to_idx(mkey));
+	MLX5_SET(destroy_mkey_in, cmd_in, opcode, MLX5_CMD_OP_DESTROY_MKEY);
+	MLX5_SET(destroy_mkey_in, cmd_in, uid, mvdev->res.uid);
+
+	return mlx5_cmd_exec_in(mvdev->mdev, destroy_mkey, cmd_in);
+}
+
+/*
+ * Allocate the control VQ iotlb, initialize its spinlock and attach both to
+ * the control VQ vring. Returns -ENOMEM if the iotlb cannot be allocated.
+ */
+static int init_ctrl_vq(struct mlx5_vdpa_dev *mvdev)
+{
+	struct vhost_iotlb *iotlb = vhost_iotlb_alloc(0, 0);
+
+	mvdev->cvq.iotlb = iotlb;
+	if (!iotlb)
+		return -ENOMEM;
+
+	spin_lock_init(&mvdev->cvq.iommu_lock);
+	vringh_set_iotlb(&mvdev->cvq.vring, iotlb, &mvdev->cvq.iommu_lock);
+
+	return 0;
+}
+
+/* Free the control VQ iotlb allocated by init_ctrl_vq(). */
+static void cleanup_ctrl_vq(struct mlx5_vdpa_dev *mvdev)
+{
+	struct vhost_iotlb *iotlb = mvdev->cvq.iotlb;
+
+	vhost_iotlb_free(iotlb);
+}
+
+/*
+ * Acquire the per-device resources kept in mvdev->res: UAR page, user
+ * context, protection domain, null mkey, the ioremapped doorbell (kick)
+ * page and the control VQ iotlb. The mkey mutex is initialized here too.
+ *
+ * Returns -EINVAL if the resources are already marked valid, otherwise 0
+ * or the error of the step that failed. On failure every step completed so
+ * far is undone in reverse order.
+ */
+int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev)
+{
+ u64 offset = MLX5_CAP64_DEV_VDPA_EMULATION(mvdev->mdev, doorbell_bar_offset);
+ struct mlx5_vdpa_resources *res = &mvdev->res;
+ struct mlx5_core_dev *mdev = mvdev->mdev;
+ u64 kick_addr;
+ int err;
+
+ if (res->valid) {
+ mlx5_vdpa_warn(mvdev, "resources already allocated\n");
+ return -EINVAL;
+ }
+ mutex_init(&mvdev->mr.mkey_mtx);
+ res->uar = mlx5_get_uars_page(mdev);
+ if (IS_ERR(res->uar)) {
+ err = PTR_ERR(res->uar);
+ goto err_uars;
+ }
+
+ err = create_uctx(mvdev, &res->uid);
+ if (err)
+ goto err_uctx;
+
+ err = alloc_pd(mvdev, &res->pdn, res->uid);
+ if (err)
+ goto err_pd;
+
+ err = get_null_mkey(mvdev, &res->null_mkey);
+ if (err)
+ goto err_key;
+
+ /* The doorbell page sits at the capability-reported offset in the BAR. */
+ kick_addr = mdev->bar_addr + offset;
+ res->phys_kick_addr = kick_addr;
+
+ res->kick_addr = ioremap(kick_addr, PAGE_SIZE);
+ if (!res->kick_addr) {
+ err = -ENOMEM;
+ goto err_key;
+ }
+
+ err = init_ctrl_vq(mvdev);
+ if (err)
+ goto err_ctrl;
+
+ res->valid = true;
+
+ return 0;
+
+ /* Unwind: each label undoes the step before the one that failed. */
+err_ctrl:
+ iounmap(res->kick_addr);
+err_key:
+ dealloc_pd(mvdev, res->pdn, res->uid);
+err_pd:
+ destroy_uctx(mvdev, res->uid);
+err_uctx:
+ mlx5_put_uars_page(mdev, res->uar);
+err_uars:
+ mutex_destroy(&mvdev->mr.mkey_mtx);
+ return err;
+}
+
+/*
+ * Release everything mlx5_vdpa_alloc_resources() acquired, in reverse
+ * order, and mark the resources invalid. Does nothing when they are not
+ * currently valid.
+ */
+void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev)
+{
+	struct mlx5_vdpa_resources *res = &mvdev->res;
+
+	if (res->valid) {
+		cleanup_ctrl_vq(mvdev);
+
+		iounmap(res->kick_addr);
+		res->kick_addr = NULL;
+
+		dealloc_pd(mvdev, res->pdn, res->uid);
+		destroy_uctx(mvdev, res->uid);
+		mlx5_put_uars_page(mvdev->mdev, res->uar);
+		mutex_destroy(&mvdev->mr.mkey_mtx);
+
+		res->valid = false;
+	}
+}
diff --git a/drivers/vdpa/mlx5/net/debug.c b/drivers/vdpa/mlx5/net/debug.c
new file mode 100644
index 0000000000..9c85162c19
--- /dev/null
+++ b/drivers/vdpa/mlx5/net/debug.c
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+
+#include <linux/debugfs.h>
+#include <linux/mlx5/fs.h>
+#include "mlx5_vnet.h"
+
+/* seq_file show callback: print the TIR number of the net device in hex. */
+static int tirn_show(struct seq_file *file, void *priv)
+{
+	struct mlx5_vdpa_net *net = file->private;
+
+	seq_printf(file, "0x%x\n", net->res.tirn);
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(tirn);
+
+/* Remove the "tirn" debugfs entry; skipped when ndev->debugfs is not set. */
+void mlx5_vdpa_remove_tirn(struct mlx5_vdpa_net *ndev)
+{
+	if (!ndev->debugfs)
+		return;
+
+	debugfs_remove(ndev->res.tirn_dent);
+}
+
+/* Create the read-only "tirn" debugfs file under the rx directory. */
+void mlx5_vdpa_add_tirn(struct mlx5_vdpa_net *ndev)
+{
+	struct dentry *parent = ndev->rx_dent;
+
+	ndev->res.tirn_dent = debugfs_create_file("tirn", 0444, parent, ndev,
+						  &tirn_fops);
+}
+
+/* seq_file show callback: print the id of the RX flow table in hex. */
+static int rx_flow_table_show(struct seq_file *file, void *priv)
+{
+	struct mlx5_vdpa_net *net = file->private;
+
+	seq_printf(file, "0x%x\n", mlx5_flow_table_id(net->rxft));
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(rx_flow_table);
+
+/* Remove the "table_id" debugfs entry; skipped when ndev->debugfs is not set. */
+void mlx5_vdpa_remove_rx_flow_table(struct mlx5_vdpa_net *ndev)
+{
+	if (!ndev->debugfs)
+		return;
+
+	debugfs_remove(ndev->rx_table_dent);
+}
+
+/* Create the read-only "table_id" debugfs file under the rx directory. */
+void mlx5_vdpa_add_rx_flow_table(struct mlx5_vdpa_net *ndev)
+{
+	struct dentry *parent = ndev->rx_dent;
+
+	ndev->rx_table_dent = debugfs_create_file("table_id", 0444, parent,
+						  ndev, &rx_flow_table_fops);
+}
+
+#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
+/*
+ * seq_file show callback: query the flow counter and print its packet count
+ * in hex. Returns the mlx5_fc_query() error without printing on failure.
+ */
+static int packets_show(struct seq_file *file, void *priv)
+{
+	struct mlx5_vdpa_counter *cnt = file->private;
+	u64 npackets;
+	u64 nbytes;
+	int err;
+
+	err = mlx5_fc_query(cnt->mdev, cnt->counter, &npackets, &nbytes);
+	if (!err)
+		seq_printf(file, "0x%llx\n", npackets);
+
+	return err;
+}
+
+/*
+ * seq_file show callback: query the flow counter and print its byte count
+ * in hex. Returns the mlx5_fc_query() error without printing on failure.
+ */
+static int bytes_show(struct seq_file *file, void *priv)
+{
+	struct mlx5_vdpa_counter *cnt = file->private;
+	u64 npackets;
+	u64 nbytes;
+	int err;
+
+	err = mlx5_fc_query(cnt->mdev, cnt->counter, &npackets, &nbytes);
+	if (!err)
+		seq_printf(file, "0x%llx\n", nbytes);
+
+	return err;
+}
+
+DEFINE_SHOW_ATTRIBUTE(packets);
+DEFINE_SHOW_ATTRIBUTE(bytes);
+
+/* Expose @counter as read-only "packets" and "bytes" files under @parent. */
+static void add_counter_node(struct mlx5_vdpa_counter *counter,
+			     struct dentry *parent)
+{
+	const umode_t mode = 0444;
+
+	debugfs_create_file("packets", mode, parent, counter, &packets_fops);
+	debugfs_create_file("bytes", mode, parent, counter, &bytes_fops);
+}
+
+/*
+ * Create the debugfs directory for one MAC/VLAN node under the rx directory
+ * and populate it with "ucast" and "mcast" counter directories.
+ *
+ * The node directory is named after the VLAN id in hex for tagged nodes and
+ * "untagged" otherwise. If it cannot be created, node->dent is reset to
+ * NULL. If a counter directory cannot be created, setup stops there.
+ */
+void mlx5_vdpa_add_rx_counters(struct mlx5_vdpa_net *ndev,
+			       struct macvlan_node *node)
+{
+	static const char *ut = "untagged";
+	char vidstr[9];
+	u16 vid;
+
+	node->ucast_counter.mdev = ndev->mvdev.mdev;
+	node->mcast_counter.mdev = ndev->mvdev.mdev;
+
+	if (!node->tagged) {
+		strcpy(vidstr, ut);
+	} else {
+		vid = key2vid(node->macvlan);
+		snprintf(vidstr, sizeof(vidstr), "0x%x", vid);
+	}
+
+	node->dent = debugfs_create_dir(vidstr, ndev->rx_dent);
+	if (IS_ERR(node->dent)) {
+		node->dent = NULL;
+		return;
+	}
+
+	node->ucast_counter.dent = debugfs_create_dir("ucast", node->dent);
+	if (IS_ERR(node->ucast_counter.dent))
+		return;
+	add_counter_node(&node->ucast_counter, node->ucast_counter.dent);
+
+	node->mcast_counter.dent = debugfs_create_dir("mcast", node->dent);
+	if (!IS_ERR(node->mcast_counter.dent))
+		add_counter_node(&node->mcast_counter, node->mcast_counter.dent);
+}
+
+/*
+ * Remove the node's debugfs directory and everything below it. Skipped when
+ * the node has no directory or ndev->debugfs is not set.
+ */
+void mlx5_vdpa_remove_rx_counters(struct mlx5_vdpa_net *ndev,
+				  struct macvlan_node *node)
+{
+	if (!node->dent || !ndev->debugfs)
+		return;
+
+	debugfs_remove_recursive(node->dent);
+}
+#endif
+
+/*
+ * Create the device's debugfs directory, named after the vdpa device, under
+ * the mlx5 core device root, and an "rx" subdirectory inside it. The "rx"
+ * directory is only created when the parent was created successfully.
+ */
+void mlx5_vdpa_add_debugfs(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
+
+	ndev->debugfs = debugfs_create_dir(dev_name(&ndev->mvdev.vdev.dev),
+					   mlx5_debugfs_get_dev_root(mdev));
+	if (IS_ERR(ndev->debugfs))
+		return;
+
+	ndev->rx_dent = debugfs_create_dir("rx", ndev->debugfs);
+}
+
+/*
+ * Remove the device's debugfs tree and clear the pointer, which the other
+ * remove helpers in this file test before touching their own entries.
+ */
+void mlx5_vdpa_remove_debugfs(struct mlx5_vdpa_net *ndev)
+{
+	struct dentry *root = ndev->debugfs;
+
+	debugfs_remove_recursive(root);
+	ndev->debugfs = NULL;
+}
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
new file mode 100644
index 0000000000..ca972af3c8
--- /dev/null
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -0,0 +1,3620 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies Ltd. */
+
+#include <linux/module.h>
+#include <linux/vdpa.h>
+#include <linux/vringh.h>
+#include <uapi/linux/virtio_net.h>
+#include <uapi/linux/virtio_ids.h>
+#include <uapi/linux/vdpa.h>
+#include <linux/virtio_config.h>
+#include <linux/auxiliary_bus.h>
+#include <linux/mlx5/cq.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/device.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/vport.h>
+#include <linux/mlx5/fs.h>
+#include <linux/mlx5/mlx5_ifc_vdpa.h>
+#include <linux/mlx5/mpfs.h>
+#include "mlx5_vdpa.h"
+#include "mlx5_vnet.h"
+
+/* Module metadata */
+MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
+MODULE_DESCRIPTION("Mellanox VDPA driver");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/* Every virtio feature bit this file knows by name. print_features() warns
+ * about any bit that falls outside this mask.
+ */
+#define VALID_FEATURES_MASK \
+ (BIT_ULL(VIRTIO_NET_F_CSUM) | BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) | \
+ BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | BIT_ULL(VIRTIO_NET_F_MTU) | BIT_ULL(VIRTIO_NET_F_MAC) | \
+ BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) | \
+ BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | \
+ BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | BIT_ULL(VIRTIO_NET_F_HOST_ECN) | BIT_ULL(VIRTIO_NET_F_HOST_UFO) | \
+ BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | BIT_ULL(VIRTIO_NET_F_STATUS) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ) | \
+ BIT_ULL(VIRTIO_NET_F_CTRL_RX) | BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) | \
+ BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) | BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE) | \
+ BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | BIT_ULL(VIRTIO_NET_F_HASH_REPORT) | \
+ BIT_ULL(VIRTIO_NET_F_RSS) | BIT_ULL(VIRTIO_NET_F_RSC_EXT) | BIT_ULL(VIRTIO_NET_F_STANDBY) | \
+ BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX) | BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY) | \
+ BIT_ULL(VIRTIO_F_ANY_LAYOUT) | BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) | \
+ BIT_ULL(VIRTIO_F_RING_PACKED) | BIT_ULL(VIRTIO_F_ORDER_PLATFORM) | BIT_ULL(VIRTIO_F_SR_IOV))
+
+/* Every virtio status bit this file knows by name. print_status() warns
+ * about any bit that falls outside this mask.
+ */
+#define VALID_STATUS_MASK \
+ (VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK | \
+ VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)
+
+/* Evaluates to 1 when bit @_feature is set in (_mvdev)->actual_features,
+ * otherwise 0.
+ */
+#define MLX5_FEATURE(_mvdev, _feature) (!!((_mvdev)->actual_features & BIT_ULL(_feature)))
+
+/* NOTE(review): presumably the key used for untagged traffic; 0x1000 lies
+ * just above the 12-bit range 0..0xfff. Not used in this part of the file,
+ * confirm against its users.
+ */
+#define MLX5V_UNTAGGED 0x1000
+
+/* Memory backing a completion queue; filled in by cq_frag_buf_alloc(). */
+struct mlx5_vdpa_cq_buf {
+ /* fragment-buffer control, initialised over frag_buf by mlx5_init_fbc() */
+ struct mlx5_frag_buf_ctrl fbc;
+ struct mlx5_frag_buf frag_buf;
+ /* size of one CQE in bytes (set to MLX5_VDPA_CQE_SIZE) */
+ int cqe_size;
+ /* number of CQEs the buffer was allocated for */
+ int nent;
+};
+
+/* Completion queue state of one virtqueue; set up by cq_create(). */
+struct mlx5_vdpa_cq {
+ struct mlx5_core_cq mcq;
+ struct mlx5_vdpa_cq_buf buf;
+ /* doorbell record: db.db is the set_ci doorbell, db.db + 1 the arm one */
+ struct mlx5_db db;
+ /* number of entries; get_sw_cqe() uses (cqe - 1) as an index mask */
+ int cqe;
+};
+
+/* One of the three umem buffers handed to the device per virtqueue. */
+struct mlx5_vdpa_umem {
+ struct mlx5_frag_buf_ctrl fbc;
+ struct mlx5_frag_buf frag_buf;
+ /* size in bytes, computed by set_umem_size() */
+ int size;
+ /* umem_id returned by the CREATE_UMEM command */
+ u32 id;
+};
+
+/* One end of the QP pair used for a virtqueue's notification channel. */
+struct mlx5_vdpa_qp {
+ struct mlx5_core_qp mqp;
+ /* receive queue buffer; only allocated when !fw (see qp_create()) */
+ struct mlx5_frag_buf frag_buf;
+ /* doorbell record; only allocated when !fw */
+ struct mlx5_db db;
+ /* running count of posted receive entries, written to db by rx_post() */
+ u16 head;
+ /* true for the firmware-side QP, false for the driver-side one */
+ bool fw;
+};
+
+/* Copy of the virtqueue parameters that also exist in struct
+ * mlx5_vdpa_virtqueue. NOTE(review): presumably a snapshot taken so a
+ * virtqueue can be re-created later; the users are not in this part of the
+ * file, confirm there.
+ */
+struct mlx5_vq_restore_info {
+ u32 num_ent;
+ u64 desc_addr;
+ u64 device_addr;
+ u64 driver_addr;
+ u16 avail_index;
+ u16 used_index;
+ struct msi_map map;
+ bool ready;
+ bool restore;
+};
+
+/* Driver-side state of a single virtqueue. */
+struct mlx5_vdpa_virtqueue {
+ bool ready;
+ /* ring addresses and size, programmed into the device by
+ * create_virtqueue()
+ */
+ u64 desc_addr;
+ u64 device_addr;
+ u64 driver_addr;
+ u32 num_ent;
+
+ /* Resources for implementing the notification channel from the device
+ * to the driver. fwqp is the firmware end of an RC connection; the
+ * other end is vqqp used by the driver. cq is where completions are
+ * reported.
+ */
+ struct mlx5_vdpa_cq cq;
+ struct mlx5_vdpa_qp fwqp;
+ struct mlx5_vdpa_qp vqqp;
+
+ /* umem resources are required for the virtqueue operation. Their use
+ * is internal and they must be provided by the driver.
+ */
+ struct mlx5_vdpa_umem umem1;
+ struct mlx5_vdpa_umem umem2;
+ struct mlx5_vdpa_umem umem3;
+
+ u32 counter_set_id;
+ bool initialized;
+ /* virtio queue index; odd indices are TX queues (see vq_is_tx()) */
+ int index;
+ /* object id returned by the device when the virtqueue object is created */
+ u32 virtq_id;
+ struct mlx5_vdpa_net *ndev;
+ u16 avail_idx;
+ u16 used_idx;
+ /* last known firmware object state (MLX5_VIRTIO_NET_Q_OBJECT_*) */
+ int fw_state;
+ /* when map.virq is non-zero the queue is created in MSI-X event mode */
+ struct msi_map map;
+
+ /* keep last in the struct */
+ struct mlx5_vq_restore_info ri;
+};
+
+/* Tell whether @idx is a usable virtqueue index for the negotiated
+ * features.
+ *
+ * With VIRTIO_NET_F_MQ any index up to and including mvdev->max_idx is
+ * valid. Without it only indices 0 and 1 exist, plus index 2 when
+ * VIRTIO_NET_F_CTRL_VQ was negotiated.
+ */
+static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx)
+{
+	bool has_mq = mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ);
+	bool has_cvq = mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ);
+
+	if (has_mq)
+		return idx <= mvdev->max_idx;
+
+	return idx < (has_cvq ? 3 : 2);
+}
+
+/* Forward declarations of helpers defined later in the file */
+static void free_resources(struct mlx5_vdpa_net *ndev);
+static void init_mvqs(struct mlx5_vdpa_net *ndev);
+static int setup_driver(struct mlx5_vdpa_dev *mvdev);
+static void teardown_driver(struct mlx5_vdpa_net *ndev);
+
+/* When set, print_status() and print_features() dump the individual bits */
+static bool mlx5_vdpa_debug;
+
+/* NOTE(review): presumably the number of control virtqueue entries; not
+ * used in this part of the file, confirm against its users.
+ */
+#define MLX5_CVQ_MAX_ENT 16
+
+/* Log the name of @_feature if that bit is set. Relies on variables named
+ * "features" and "mvdev" being in scope at the point of use.
+ */
+#define MLX5_LOG_VIO_FLAG(_feature) \
+ do { \
+ if (features & BIT_ULL(_feature)) \
+ mlx5_vdpa_info(mvdev, "%s\n", #_feature); \
+ } while (0)
+
+/* Log the name of @_status if that bit is set. Relies on variables named
+ * "status" and "mvdev" being in scope at the point of use.
+ */
+#define MLX5_LOG_VIO_STAT(_status) \
+ do { \
+ if (status & (_status)) \
+ mlx5_vdpa_info(mvdev, "%s\n", #_status); \
+ } while (0)
+
+/* TODO: cross-endian support */
+/* True when virtio fields are little endian: either the legacy layout is
+ * little endian on this host, or VIRTIO_F_VERSION_1 was negotiated.
+ */
+static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
+{
+	if (virtio_legacy_is_little_endian())
+		return true;
+
+	return (mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1)) != 0;
+}
+
+/* Convert a 16-bit virtio value to CPU byte order, using the endianness
+ * that applies to @mvdev.
+ */
+static u16 mlx5vdpa16_to_cpu(struct mlx5_vdpa_dev *mvdev, __virtio16 val)
+{
+	bool le = mlx5_vdpa_is_little_endian(mvdev);
+
+	return __virtio16_to_cpu(le, val);
+}
+
+/* Convert a 16-bit CPU-order value to the virtio byte order that applies
+ * to @mvdev.
+ */
+static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
+{
+	bool le = mlx5_vdpa_is_little_endian(mvdev);
+
+	return __cpu_to_virtio16(le, val);
+}
+
+/* Index of the control virtqueue: 2 when VIRTIO_NET_F_MQ was not
+ * negotiated, mvdev->max_vqs otherwise.
+ */
+static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
+{
+	if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ))
+		return mvdev->max_vqs;
+
+	return 2;
+}
+
+/* True when @idx is the control virtqueue index reported by ctrl_vq_idx() */
+static bool is_ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev, u16 idx)
+{
+	u16 cvq_idx = ctrl_vq_idx(mvdev);
+
+	return cvq_idx == idx;
+}
+
+/* Log a virtio device status byte.
+ *
+ * Bits outside VALID_STATUS_MASK always produce a warning. The per-bit dump
+ * is emitted only when mlx5_vdpa_debug is set: @set selects the "set"/"get"
+ * wording, and a zero @status on the set path is reported as a device reset.
+ */
+static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
+{
+	if (status & ~VALID_STATUS_MASK)
+		mlx5_vdpa_warn(mvdev, "Warning: there are invalid status bits 0x%x\n",
+			       status & ~VALID_STATUS_MASK);
+
+	if (!mlx5_vdpa_debug)
+		return;
+
+	/* terminate the line like every other message in this file */
+	mlx5_vdpa_info(mvdev, "driver status %s\n", set ? "set" : "get");
+	if (set && !status) {
+		mlx5_vdpa_info(mvdev, "driver resets the device\n");
+		return;
+	}
+
+	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_ACKNOWLEDGE);
+	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER);
+	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER_OK);
+	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FEATURES_OK);
+	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_NEEDS_RESET);
+	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FAILED);
+}
+
+/* Log a virtio feature bitmap.
+ *
+ * Bits outside VALID_FEATURES_MASK always produce a warning. The per-bit
+ * dump is emitted only when mlx5_vdpa_debug is set; @set selects the
+ * "sets"/"reads" wording. Each MLX5_LOG_VIO_FLAG() line prints the name of
+ * one feature if it is present in @features.
+ */
+static void print_features(struct mlx5_vdpa_dev *mvdev, u64 features, bool set)
+{
+ if (features & ~VALID_FEATURES_MASK)
+ mlx5_vdpa_warn(mvdev, "There are invalid feature bits 0x%llx\n",
+ features & ~VALID_FEATURES_MASK);
+
+ if (!mlx5_vdpa_debug)
+ return;
+
+ mlx5_vdpa_info(mvdev, "driver %s feature bits:\n", set ? "sets" : "reads");
+ if (!features)
+ mlx5_vdpa_info(mvdev, "all feature bits are cleared\n");
+
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CSUM);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_CSUM);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MTU);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MAC);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO4);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO6);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ECN);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_UFO);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO4);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO6);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_ECN);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_UFO);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MRG_RXBUF);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STATUS);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VQ);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VLAN);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX_EXTRA);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ANNOUNCE);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MQ);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_MAC_ADDR);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HASH_REPORT);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSS);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSC_EXT);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STANDBY);
+ MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_SPEED_DUPLEX);
+ MLX5_LOG_VIO_FLAG(VIRTIO_F_NOTIFY_ON_EMPTY);
+ MLX5_LOG_VIO_FLAG(VIRTIO_F_ANY_LAYOUT);
+ MLX5_LOG_VIO_FLAG(VIRTIO_F_VERSION_1);
+ MLX5_LOG_VIO_FLAG(VIRTIO_F_ACCESS_PLATFORM);
+ MLX5_LOG_VIO_FLAG(VIRTIO_F_RING_PACKED);
+ MLX5_LOG_VIO_FLAG(VIRTIO_F_ORDER_PLATFORM);
+ MLX5_LOG_VIO_FLAG(VIRTIO_F_SR_IOV);
+}
+
+/* Create a TIS in the device's transport domain (ndev->res.tdn) and store
+ * its number in ndev->res.tisn.
+ *
+ * Returns 0 on success; on failure logs a warning and returns the error
+ * from mlx5_vdpa_create_tis().
+ */
+static int create_tis(struct mlx5_vdpa_net *ndev)
+{
+	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
+	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
+	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
+	int err;
+
+	MLX5_SET(tisc, tisc, transport_domain, ndev->res.tdn);
+
+	err = mlx5_vdpa_create_tis(mvdev, in, &ndev->res.tisn);
+	if (!err)
+		return 0;
+
+	mlx5_vdpa_warn(mvdev, "create TIS (%d)\n", err);
+	return err;
+}
+
+/* Destroy the TIS whose number create_tis() stored in ndev->res.tisn */
+static void destroy_tis(struct mlx5_vdpa_net *ndev)
+{
+ mlx5_vdpa_destroy_tis(&ndev->mvdev, ndev->res.tisn);
+}
+
+/* Size in bytes of one completion queue entry, and its base-2 logarithm */
+#define MLX5_VDPA_CQE_SIZE 64
+#define MLX5_VDPA_LOG_CQE_SIZE ilog2(MLX5_VDPA_CQE_SIZE)
+
+/* Allocate the fragmented buffer backing a CQ of @nent entries of
+ * MLX5_VDPA_CQE_SIZE bytes each, on the device's NUMA node, and initialise
+ * buf->fbc over it. Also records the CQE size and entry count in @buf.
+ *
+ * Returns 0 on success or the error from mlx5_frag_buf_alloc_node().
+ */
+static int cq_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf, int nent)
+{
+ struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
+ u8 log_wq_stride = MLX5_VDPA_LOG_CQE_SIZE;
+ /* NOTE(review): the queue-size log is derived from the CQE size rather
+ * than from @nent - confirm this is intended.
+ */
+ u8 log_wq_sz = MLX5_VDPA_LOG_CQE_SIZE;
+ int err;
+
+ err = mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, nent * MLX5_VDPA_CQE_SIZE, frag_buf,
+ ndev->mvdev.mdev->priv.numa_node);
+ if (err)
+ return err;
+
+ mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
+
+ buf->cqe_size = MLX5_VDPA_CQE_SIZE;
+ buf->nent = nent;
+
+ return 0;
+}
+
+/* Allocate @size bytes of fragmented buffer for @umem on the device's NUMA
+ * node. Returns the result of mlx5_frag_buf_alloc_node().
+ */
+static int umem_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem, int size)
+{
+	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
+
+	return mlx5_frag_buf_alloc_node(mdev, size, &umem->frag_buf,
+					mdev->priv.numa_node);
+}
+
+/* Release the buffer allocated by cq_frag_buf_alloc() */
+static void cq_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf)
+{
+ mlx5_frag_buf_free(ndev->mvdev.mdev, &buf->frag_buf);
+}
+
+/* Return the address of entry @n in the CQ's fragmented buffer */
+static void *get_cqe(struct mlx5_vdpa_cq *vcq, int n)
+{
+ return mlx5_frag_buf_get_wqe(&vcq->buf.fbc, n);
+}
+
+/* Mark every one of the buf->nent CQEs as invalid by writing
+ * MLX5_CQE_INVALID into the opcode nibble of op_own.
+ */
+static void cq_frag_buf_init(struct mlx5_vdpa_cq *vcq, struct mlx5_vdpa_cq_buf *buf)
+{
+	int i;
+
+	for (i = 0; i < buf->nent; i++) {
+		struct mlx5_cqe64 *cqe64 = get_cqe(vcq, i);
+
+		cqe64->op_own = MLX5_CQE_INVALID << 4;
+	}
+}
+
+/* Return the CQE at consumer position @n if software may consume it,
+ * otherwise NULL.
+ *
+ * @n is reduced to a ring slot with the mask (cq->cqe - 1). The entry is
+ * accepted when its opcode is not MLX5_CQE_INVALID and its owner bit equals
+ * the (n & cq->cqe) bit of the index.
+ */
+static void *get_sw_cqe(struct mlx5_vdpa_cq *cq, int n)
+{
+ struct mlx5_cqe64 *cqe64 = get_cqe(cq, n & (cq->cqe - 1));
+
+ if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
+ !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & cq->cqe)))
+ return cqe64;
+
+ return NULL;
+}
+
+/* Advance the receive head by @n entries and publish the new value, in
+ * big-endian form, in the first word of the QP's doorbell record.
+ */
+static void rx_post(struct mlx5_vdpa_qp *vqp, int n)
+{
+ vqp->head += n;
+ vqp->db.db[0] = cpu_to_be32(vqp->head);
+}
+
+/* Fill the CREATE_QP input mailbox @in for one of @mvq's two QPs.
+ *
+ * @fw selects the QP: mvq->fwqp when true, mvq->vqqp otherwise. For a QP
+ * flagged as firmware-owned only the uid, a zero-length RQ and no_sq are
+ * set. For the driver QP the full receive-only RC context is programmed,
+ * including the RQ size derived from @num_ent and the page list of the
+ * QP's receive buffer.
+ */
+static void qp_prepare(struct mlx5_vdpa_net *ndev, bool fw, void *in,
+ struct mlx5_vdpa_virtqueue *mvq, u32 num_ent)
+{
+ struct mlx5_vdpa_qp *vqp;
+ __be64 *pas;
+ void *qpc;
+
+ vqp = fw ? &mvq->fwqp : &mvq->vqqp;
+ MLX5_SET(create_qp_in, in, uid, ndev->mvdev.res.uid);
+ qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+ if (vqp->fw) {
+ /* Firmware QP is allocated by the driver for the firmware's
+ * use so we can skip part of the params as they will be chosen by firmware
+ */
+ qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+ MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
+ MLX5_SET(qpc, qpc, no_sq, 1);
+ return;
+ }
+
+ MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
+ MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
+ MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
+ MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
+ MLX5_SET(qpc, qpc, uar_page, ndev->mvdev.res.uar->index);
+ MLX5_SET(qpc, qpc, log_page_size, vqp->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
+ MLX5_SET(qpc, qpc, no_sq, 1);
+ /* receive completions are reported on this virtqueue's CQ */
+ MLX5_SET(qpc, qpc, cqn_rcv, mvq->cq.mcq.cqn);
+ MLX5_SET(qpc, qpc, log_rq_size, ilog2(num_ent));
+ MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
+ pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas);
+ mlx5_fill_page_frag_array(&vqp->frag_buf, pas);
+}
+
+/* Allocate the receive queue buffer of @vqp: @num_ent entries of
+ * sizeof(struct mlx5_wqe_data_seg) bytes each, on the device's NUMA node.
+ * Returns the result of mlx5_frag_buf_alloc_node().
+ */
+static int rq_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp, u32 num_ent)
+{
+	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
+
+	return mlx5_frag_buf_alloc_node(mdev,
+					num_ent * sizeof(struct mlx5_wqe_data_seg),
+					&vqp->frag_buf, mdev->priv.numa_node);
+}
+
+/* Release the receive queue buffer allocated by rq_buf_alloc() */
+static void rq_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
+{
+ mlx5_frag_buf_free(ndev->mvdev.mdev, &vqp->frag_buf);
+}
+
+/* Create one of the two QPs of @mvq.
+ *
+ * For the driver QP (!vqp->fw) a receive buffer and a doorbell record are
+ * allocated first and the command mailbox is enlarged to hold the buffer's
+ * page list; after the QP exists, all mvq->num_ent receive entries are
+ * posted. The firmware QP needs none of that.
+ *
+ * On success the QP number and uid are stored in vqp->mqp. Returns 0 on
+ * success or a negative errno; on failure everything allocated here is
+ * released again.
+ */
+static int qp_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
+ struct mlx5_vdpa_qp *vqp)
+{
+ struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
+ int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
+ u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
+ void *qpc;
+ void *in;
+ int err;
+
+ if (!vqp->fw) {
+ vqp = &mvq->vqqp;
+ err = rq_buf_alloc(ndev, vqp, mvq->num_ent);
+ if (err)
+ return err;
+
+ err = mlx5_db_alloc(ndev->mvdev.mdev, &vqp->db);
+ if (err)
+ goto err_db;
+ /* room for one page address per page of the receive buffer */
+ inlen += vqp->frag_buf.npages * sizeof(__be64);
+ }
+
+ in = kzalloc(inlen, GFP_KERNEL);
+ if (!in) {
+ err = -ENOMEM;
+ goto err_kzalloc;
+ }
+
+ qp_prepare(ndev, vqp->fw, in, mvq, mvq->num_ent);
+ qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+ MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
+ MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
+ MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
+ MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
+ if (!vqp->fw)
+ MLX5_SET64(qpc, qpc, dbr_addr, vqp->db.dma);
+ MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
+ err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
+ kfree(in);
+ if (err)
+ goto err_kzalloc;
+
+ vqp->mqp.uid = ndev->mvdev.res.uid;
+ vqp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn);
+
+ if (!vqp->fw)
+ rx_post(vqp, mvq->num_ent);
+
+ return 0;
+
+/* unwind in reverse order of allocation; only the driver QP owns resources */
+err_kzalloc:
+ if (!vqp->fw)
+ mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
+err_db:
+ if (!vqp->fw)
+ rq_buf_free(ndev, vqp);
+
+ return err;
+}
+
+/* Destroy @vqp in the device and, for the driver QP, free its doorbell
+ * record and receive buffer. A failing DESTROY_QP command is only logged;
+ * the local resources are released regardless.
+ */
+static void qp_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
+{
+ u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
+
+ MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
+ MLX5_SET(destroy_qp_in, in, qpn, vqp->mqp.qpn);
+ MLX5_SET(destroy_qp_in, in, uid, ndev->mvdev.res.uid);
+ if (mlx5_cmd_exec_in(ndev->mvdev.mdev, destroy_qp, in))
+ mlx5_vdpa_warn(&ndev->mvdev, "destroy qp 0x%x\n", vqp->mqp.qpn);
+ if (!vqp->fw) {
+ mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
+ rq_buf_free(ndev, vqp);
+ }
+}
+
+/* Return the CQE at the current consumer index if software may consume it,
+ * otherwise NULL (see get_sw_cqe()).
+ */
+static void *next_cqe_sw(struct mlx5_vdpa_cq *cq)
+{
+ return get_sw_cqe(cq, cq->mcq.cons_index);
+}
+
+/* Consume one completion from @vcq.
+ *
+ * Returns 0 after advancing the consumer index past one software-owned
+ * CQE, or -EAGAIN when no such CQE is available.
+ */
+static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
+{
+	if (!next_cqe_sw(vcq))
+		return -EAGAIN;
+
+	vcq->mcq.cons_index++;
+	return 0;
+}
+
+/* Acknowledge @num consumed completions on @mvq: publish the CQ consumer
+ * index, re-post @num receive entries on the driver QP, and invoke the
+ * event callback registered for this virtqueue index, if any.
+ */
+static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
+{
+ struct mlx5_vdpa_net *ndev = mvq->ndev;
+ struct vdpa_callback *event_cb;
+
+ event_cb = &ndev->event_cbs[mvq->index];
+ mlx5_cq_set_ci(&mvq->cq.mcq);
+
+ /* make sure CQ consumer update is visible to the hardware before updating
+ * RX doorbell record.
+ */
+ dma_wmb();
+ rx_post(&mvq->vqqp, num);
+ if (event_cb->callback)
+ event_cb->callback(event_cb->private);
+}
+
+/* CQ completion handler, installed as mcq.comp by cq_create().
+ *
+ * Drains all available completions of the virtqueue that embeds @mcq,
+ * acknowledging them in batches, then re-arms the CQ. @eqe is not used.
+ */
+static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
+{
+ struct mlx5_vdpa_virtqueue *mvq = container_of(mcq, struct mlx5_vdpa_virtqueue, cq.mcq);
+ struct mlx5_vdpa_net *ndev = mvq->ndev;
+ void __iomem *uar_page = ndev->mvdev.res.uar->map;
+ int num = 0;
+
+ while (!mlx5_vdpa_poll_one(&mvq->cq)) {
+ num++;
+ if (num > mvq->num_ent / 2) {
+ /* If completions keep coming while we poll, we want to
+ * let the hardware know that we consumed them by
+ * updating the doorbell record. We also let vdpa core
+ * know about this so it passes it on the virtio driver
+ * on the guest.
+ */
+ mlx5_vdpa_handle_completions(mvq, num);
+ num = 0;
+ }
+ }
+
+ /* acknowledge whatever is left of the last, partial batch */
+ if (num)
+ mlx5_vdpa_handle_completions(mvq, num);
+
+ mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
+}
+
+/* Create and arm the completion queue of virtqueue @idx with @num_ent
+ * entries.
+ *
+ * Allocates the doorbell record and the CQE buffer, marks all CQEs invalid,
+ * builds the CREATE_CQ mailbox (page list, CQ size, UAR, completion EQ of
+ * vector 0, doorbell address) and creates the CQ. On success the completion
+ * handler is installed and the CQ is armed.
+ *
+ * Returns 0 on success or a negative errno, with every allocation undone.
+ */
+static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
+{
+ struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+ struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
+ void __iomem *uar_page = ndev->mvdev.res.uar->map;
+ u32 out[MLX5_ST_SZ_DW(create_cq_out)];
+ struct mlx5_vdpa_cq *vcq = &mvq->cq;
+ __be64 *pas;
+ int inlen;
+ void *cqc;
+ void *in;
+ int err;
+ int eqn;
+
+ err = mlx5_db_alloc(mdev, &vcq->db);
+ if (err)
+ return err;
+
+ vcq->mcq.set_ci_db = vcq->db.db;
+ vcq->mcq.arm_db = vcq->db.db + 1;
+ vcq->mcq.cqe_sz = 64;
+
+ err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent);
+ if (err)
+ goto err_db;
+
+ cq_frag_buf_init(vcq, &vcq->buf);
+
+ /* fixed part of the command plus one page address per buffer page */
+ inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
+ MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * vcq->buf.frag_buf.npages;
+ in = kzalloc(inlen, GFP_KERNEL);
+ if (!in) {
+ err = -ENOMEM;
+ goto err_vzalloc;
+ }
+
+ MLX5_SET(create_cq_in, in, uid, ndev->mvdev.res.uid);
+ pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
+ mlx5_fill_page_frag_array(&vcq->buf.frag_buf, pas);
+
+ cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
+ MLX5_SET(cqc, cqc, log_page_size, vcq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
+
+ /* Use vector 0 by default. Consider adding code to choose least used
+ * vector.
+ */
+ err = mlx5_comp_eqn_get(mdev, 0, &eqn);
+ if (err)
+ goto err_vec;
+
+ cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
+ MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent));
+ MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index);
+ MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
+ MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma);
+
+ err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out));
+ if (err)
+ goto err_vec;
+
+ vcq->mcq.comp = mlx5_vdpa_cq_comp;
+ vcq->cqe = num_ent;
+ vcq->mcq.set_ci_db = vcq->db.db;
+ vcq->mcq.arm_db = vcq->db.db + 1;
+ mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
+ kfree(in);
+ return 0;
+
+/* unwind in reverse order of allocation */
+err_vec:
+ kfree(in);
+err_vzalloc:
+ cq_frag_buf_free(ndev, &vcq->buf);
+err_db:
+ mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
+ return err;
+}
+
+/* Destroy the completion queue of virtqueue @idx and free its buffer and
+ * doorbell record. If the device refuses to destroy the CQ, a warning is
+ * logged and the buffer and doorbell record are left allocated.
+ */
+static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx)
+{
+ struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+ struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
+ struct mlx5_vdpa_cq *vcq = &mvq->cq;
+
+ if (mlx5_core_destroy_cq(mdev, &vcq->mcq)) {
+ mlx5_vdpa_warn(&ndev->mvdev, "destroy CQ 0x%x\n", vcq->mcq.cqn);
+ return;
+ }
+ cq_frag_buf_free(ndev, &vcq->buf);
+ mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
+}
+
+/* Query the device's current VDPA emulation capabilities and cache the
+ * umem sizing parameters (param_a / param_b for umem 1..3) in @ndev.
+ * set_umem_size() later uses them to size each virtqueue's umems.
+ *
+ * Returns 0 on success, -ENOMEM if the output mailbox cannot be allocated,
+ * or the error from the QUERY_HCA_CAP command. On a command failure the
+ * cached parameters are left untouched.
+ */
+static int read_umem_params(struct mlx5_vdpa_net *ndev)
+{
+	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
+	u16 opmod = (MLX5_CAP_VDPA_EMULATION << 1) | (HCA_CAP_OPMOD_GET_CUR & 0x01);
+	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
+	int out_size;
+	void *caps;
+	void *out;
+	int err;
+
+	out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
+	out = kzalloc(out_size, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
+	MLX5_SET(query_hca_cap_in, in, op_mod, opmod);
+	err = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
+	if (err) {
+		mlx5_vdpa_warn(&ndev->mvdev,
+			       "Failed reading vdpa umem capabilities with err %d\n", err);
+		goto out;
+	}
+
+	caps = MLX5_ADDR_OF(query_hca_cap_out, out, capability);
+
+	ndev->umem_1_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_1_buffer_param_a);
+	ndev->umem_1_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_1_buffer_param_b);
+
+	ndev->umem_2_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_2_buffer_param_a);
+	ndev->umem_2_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_2_buffer_param_b);
+
+	ndev->umem_3_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_3_buffer_param_a);
+	ndev->umem_3_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_3_buffer_param_b);
+
+out:
+	kfree(out);
+	/* propagate a command failure instead of reporting success */
+	return err;
+}
+
+/* Select umem @num (1, 2 or 3) of @mvq, return it through @umemp and set
+ * its size to param_a * mvq->num_ent + param_b, using the parameters cached
+ * in @ndev for that umem.
+ *
+ * NOTE(review): there is no default case, so a @num outside 1..3 would
+ * leave p_a, p_b and *umemp unset before they are used.
+ */
+static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num,
+ struct mlx5_vdpa_umem **umemp)
+{
+ u32 p_a;
+ u32 p_b;
+
+ switch (num) {
+ case 1:
+ p_a = ndev->umem_1_buffer_param_a;
+ p_b = ndev->umem_1_buffer_param_b;
+ *umemp = &mvq->umem1;
+ break;
+ case 2:
+ p_a = ndev->umem_2_buffer_param_a;
+ p_b = ndev->umem_2_buffer_param_b;
+ *umemp = &mvq->umem2;
+ break;
+ case 3:
+ p_a = ndev->umem_3_buffer_param_a;
+ p_b = ndev->umem_3_buffer_param_b;
+ *umemp = &mvq->umem3;
+ break;
+ }
+
+ (*umemp)->size = p_a * mvq->num_ent + p_b;
+}
+
+/* Release the buffer allocated by umem_frag_buf_alloc() */
+static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem)
+{
+ mlx5_frag_buf_free(ndev->mvdev.mdev, &umem->frag_buf);
+}
+
+/* Allocate umem @num (1..3) of @mvq and register it with the device.
+ *
+ * The size comes from set_umem_size(). The buffer's pages are passed to the
+ * CREATE_UMEM command with read/write permission, and the returned umem id
+ * is stored in the umem.
+ *
+ * Returns 0 on success or a negative errno; on failure the buffer is freed.
+ */
+static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
+{
+ int inlen;
+ u32 out[MLX5_ST_SZ_DW(create_umem_out)] = {};
+ void *um;
+ void *in;
+ int err;
+ __be64 *pas;
+ struct mlx5_vdpa_umem *umem;
+
+ set_umem_size(ndev, mvq, num, &umem);
+ err = umem_frag_buf_alloc(ndev, umem, umem->size);
+ if (err)
+ return err;
+
+ /* fixed part of the command plus one MTT entry per buffer page */
+ inlen = MLX5_ST_SZ_BYTES(create_umem_in) + MLX5_ST_SZ_BYTES(mtt) * umem->frag_buf.npages;
+
+ in = kzalloc(inlen, GFP_KERNEL);
+ if (!in) {
+ err = -ENOMEM;
+ goto err_in;
+ }
+
+ MLX5_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM);
+ MLX5_SET(create_umem_in, in, uid, ndev->mvdev.res.uid);
+ um = MLX5_ADDR_OF(create_umem_in, in, umem);
+ MLX5_SET(umem, um, log_page_size, umem->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
+ MLX5_SET64(umem, um, num_of_mtt, umem->frag_buf.npages);
+
+ pas = (__be64 *)MLX5_ADDR_OF(umem, um, mtt[0]);
+ mlx5_fill_page_frag_array_perm(&umem->frag_buf, pas, MLX5_MTT_PERM_RW);
+
+ err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
+ if (err) {
+ mlx5_vdpa_warn(&ndev->mvdev, "create umem(%d)\n", err);
+ goto err_cmd;
+ }
+
+ kfree(in);
+ umem->id = MLX5_GET(create_umem_out, out, umem_id);
+
+ return 0;
+
+err_cmd:
+ kfree(in);
+err_in:
+ umem_frag_buf_free(ndev, umem);
+ return err;
+}
+
+/* Destroy umem @num (1..3) of @mvq in the device and free its buffer. If
+ * the DESTROY_UMEM command fails the buffer is left allocated.
+ *
+ * NOTE(review): there is no default case, so a @num outside 1..3 would
+ * leave umem unset before it is used.
+ */
+static void umem_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
+{
+ u32 in[MLX5_ST_SZ_DW(destroy_umem_in)] = {};
+ u32 out[MLX5_ST_SZ_DW(destroy_umem_out)] = {};
+ struct mlx5_vdpa_umem *umem;
+
+ switch (num) {
+ case 1:
+ umem = &mvq->umem1;
+ break;
+ case 2:
+ umem = &mvq->umem2;
+ break;
+ case 3:
+ umem = &mvq->umem3;
+ break;
+ }
+
+ MLX5_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM);
+ MLX5_SET(destroy_umem_in, in, umem_id, umem->id);
+ if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
+ return;
+
+ umem_frag_buf_free(ndev, umem);
+}
+
+/* Create umems 1, 2 and 3 of @mvq in that order.
+ *
+ * Returns 0 when all three exist. If one fails, the umems created before it
+ * are destroyed in reverse order and the error is returned.
+ */
+static int umems_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+	int num;
+	int err;
+
+	for (num = 1; num <= 3; num++) {
+		err = create_umem(ndev, mvq, num);
+		if (!err)
+			continue;
+
+		/* roll back the umems that preceded the failing one */
+		while (--num > 0)
+			umem_destroy(ndev, mvq, num);
+
+		return err;
+	}
+
+	return 0;
+}
+
+/* Destroy umems 3, 2 and 1 of @mvq, in that order */
+static void umems_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+	int num = 3;
+
+	while (num > 0) {
+		umem_destroy(ndev, mvq, num);
+		num--;
+	}
+}
+
+/* Choose the virtqueue layout to request from the device.
+ *
+ * Split queues are preferred whenever the device advertises them. Otherwise
+ * the packed type is returned, with a WARN if the device does not advertise
+ * packed queues either.
+ */
+static int get_queue_type(struct mlx5_vdpa_net *ndev)
+{
+	u32 type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type);
+
+	if (!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) {
+		WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED));
+		return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
+	}
+
+	/* prefer split queue */
+	return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;
+}
+
+/* Odd virtqueue indices are treated as TX queues, even ones are not */
+static bool vq_is_tx(u16 idx)
+{
+	return (idx & 1) != 0;
+}
+
+/* Bit positions within the 16-bit feature word built by get_features().
+ * create_virtqueue() splits that word into queue_feature_bit_mask_2_0 and
+ * queue_feature_bit_mask_12_3.
+ */
+enum {
+ MLX5_VIRTIO_NET_F_MRG_RXBUF = 2,
+ MLX5_VIRTIO_NET_F_HOST_ECN = 4,
+ MLX5_VIRTIO_NET_F_GUEST_ECN = 6,
+ MLX5_VIRTIO_NET_F_GUEST_TSO6 = 7,
+ MLX5_VIRTIO_NET_F_GUEST_TSO4 = 8,
+ MLX5_VIRTIO_NET_F_GUEST_CSUM = 9,
+ MLX5_VIRTIO_NET_F_CSUM = 10,
+ MLX5_VIRTIO_NET_F_HOST_TSO6 = 11,
+ MLX5_VIRTIO_NET_F_HOST_TSO4 = 12,
+};
+
+/* Translate the virtio feature bitmap @features into the device's own
+ * feature word: each listed VIRTIO_NET_F_* bit that is set is moved to the
+ * matching MLX5_VIRTIO_NET_F_* position.
+ *
+ * NOTE(review): the enum defines MLX5_VIRTIO_NET_F_GUEST_CSUM, but
+ * VIRTIO_NET_F_GUEST_CSUM is not translated here - confirm whether that
+ * omission is intentional.
+ */
+static u16 get_features(u64 features)
+{
+ return (!!(features & BIT_ULL(VIRTIO_NET_F_MRG_RXBUF)) << MLX5_VIRTIO_NET_F_MRG_RXBUF) |
+ (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_ECN)) << MLX5_VIRTIO_NET_F_HOST_ECN) |
+ (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_ECN)) << MLX5_VIRTIO_NET_F_GUEST_ECN) |
+ (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO6)) << MLX5_VIRTIO_NET_F_GUEST_TSO6) |
+ (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO4)) << MLX5_VIRTIO_NET_F_GUEST_TSO4) |
+ (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << MLX5_VIRTIO_NET_F_CSUM) |
+ (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << MLX5_VIRTIO_NET_F_HOST_TSO6) |
+ (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << MLX5_VIRTIO_NET_F_HOST_TSO4);
+}
+
+/* True when the device lists MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS among its
+ * supported general object types.
+ */
+static bool counters_supported(const struct mlx5_vdpa_dev *mvdev)
+{
+	u64 obj_types = MLX5_CAP_GEN_64(mvdev->mdev, general_obj_types);
+
+	return (obj_types & BIT_ULL(MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS)) != 0;
+}
+
+/* True when the device advertises the MSI-X virtqueue event mode and the
+ * underlying PCI device can allocate MSI-X vectors dynamically.
+ */
+static bool msix_mode_supported(struct mlx5_vdpa_dev *mvdev)
+{
+	if (!(MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, event_mode) &
+	      (1 << MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE)))
+		return false;
+
+	return pci_msix_can_alloc_dyn(mvdev->mdev->pdev);
+}
+
+/* Create the device-side virtio-net queue object for @mvq.
+ *
+ * The three umems are created first. The object context is then filled
+ * from @mvq (indices, ring addresses, size, umem ids and sizes), from the
+ * negotiated features, and from per-device resources (mkey, PD, TIS for TX
+ * queues). The event mode is MSI-X when mvq->map.virq is set, otherwise the
+ * firmware QP is used.
+ *
+ * On success the returned object id is stored in mvq->virtq_id and
+ * mvq->fw_state becomes MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT. Returns 0 or a
+ * negative errno; on failure the umems are destroyed again.
+ */
+static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+ int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in);
+ u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {};
+ void *obj_context;
+ u16 mlx_features;
+ void *cmd_hdr;
+ void *vq_ctx;
+ void *in;
+ int err;
+
+ err = umems_create(ndev, mvq);
+ if (err)
+ return err;
+
+ in = kzalloc(inlen, GFP_KERNEL);
+ if (!in) {
+ err = -ENOMEM;
+ goto err_alloc;
+ }
+
+ mlx_features = get_features(ndev->mvdev.actual_features);
+ cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr);
+
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
+
+ obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
+ MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
+ MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
+ /* the feature word is split: bits 12..3 and bits 2..0 go to separate fields */
+ MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
+ mlx_features >> 3);
+ MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_2_0,
+ mlx_features & 7);
+ vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
+ MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev));
+
+ if (vq_is_tx(mvq->index))
+ MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn);
+
+ if (mvq->map.virq) {
+ MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE);
+ MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->map.index);
+ } else {
+ MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE);
+ MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn);
+ }
+
+ MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index);
+ MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent);
+ MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
+ !!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1)));
+ MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
+ MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
+ MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
+ MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, ndev->mvdev.mr.mkey);
+ MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id);
+ MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size);
+ MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id);
+ MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size);
+ MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id);
+ MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size);
+ MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn);
+ if (counters_supported(&ndev->mvdev))
+ MLX5_SET(virtio_q, vq_ctx, counter_set_id, mvq->counter_set_id);
+
+ err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
+ if (err)
+ goto err_cmd;
+
+ mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT;
+ kfree(in);
+ mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
+
+ return 0;
+
+err_cmd:
+ kfree(in);
+err_alloc:
+ umems_destroy(ndev, mvq);
+ return err;
+}
+
+/* Destroy the device-side virtio-net queue object of @mvq, then its umems.
+ * If the destroy command fails, a warning is logged and both fw_state and
+ * the umems are left as they are.
+ */
+static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+ u32 in[MLX5_ST_SZ_DW(destroy_virtio_net_q_in)] = {};
+ u32 out[MLX5_ST_SZ_DW(destroy_virtio_net_q_out)] = {};
+
+ MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.opcode,
+ MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
+ MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_id, mvq->virtq_id);
+ MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.uid, ndev->mvdev.res.uid);
+ MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_type,
+ MLX5_OBJ_TYPE_VIRTIO_NET_Q);
+ if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) {
+ mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
+ return;
+ }
+ mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
+ umems_destroy(ndev, mvq);
+}
+
+/* Return the QP number of the remote end: the driver QP's number when @fw
+ * is true, the firmware QP's number otherwise.
+ */
+static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
+{
+	if (fw)
+		return mvq->vqqp.mqp.qpn;
+
+	return mvq->fwqp.mqp.qpn;
+}
+
+/* Return the QP number of the selected end: the firmware QP's number when
+ * @fw is true, the driver QP's number otherwise.
+ */
+static u32 get_qpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
+{
+	if (fw)
+		return mvq->fwqp.mqp.qpn;
+
+	return mvq->vqqp.mqp.qpn;
+}
+
+/* Allocate and pre-fill the command mailboxes for one QP state transition.
+ *
+ * @cmd selects the transition: MLX5_CMD_OP_2RST_QP, _RST2INIT_QP,
+ * _INIT2RTR_QP or _RTR2RTS_QP. @qpn is the QP being modified and @rqpn the
+ * remote QP number programmed where the transition needs it.
+ *
+ * On success *in and *out point to zeroed buffers of *inlen and *outlen
+ * bytes; the caller releases them with free_inout(). If an allocation fails
+ * or @cmd is not one of the above, both *in and *out are set to NULL and
+ * the caller detects the failure by testing them. *inlen and *outlen are
+ * not written for an unsupported @cmd.
+ */
+static void alloc_inout(struct mlx5_vdpa_net *ndev, int cmd, void **in, int *inlen, void **out,
+			int *outlen, u32 qpn, u32 rqpn)
+{
+	void *qpc;
+	void *pp;
+
+	switch (cmd) {
+	case MLX5_CMD_OP_2RST_QP:
+		*inlen = MLX5_ST_SZ_BYTES(qp_2rst_in);
+		*outlen = MLX5_ST_SZ_BYTES(qp_2rst_out);
+		*in = kzalloc(*inlen, GFP_KERNEL);
+		*out = kzalloc(*outlen, GFP_KERNEL);
+		if (!*in || !*out)
+			goto outerr;
+
+		MLX5_SET(qp_2rst_in, *in, opcode, cmd);
+		MLX5_SET(qp_2rst_in, *in, uid, ndev->mvdev.res.uid);
+		MLX5_SET(qp_2rst_in, *in, qpn, qpn);
+		break;
+	case MLX5_CMD_OP_RST2INIT_QP:
+		*inlen = MLX5_ST_SZ_BYTES(rst2init_qp_in);
+		*outlen = MLX5_ST_SZ_BYTES(rst2init_qp_out);
+		*in = kzalloc(*inlen, GFP_KERNEL);
+		*out = kzalloc(*outlen, GFP_KERNEL);
+		if (!*in || !*out)
+			goto outerr;
+
+		MLX5_SET(rst2init_qp_in, *in, opcode, cmd);
+		MLX5_SET(rst2init_qp_in, *in, uid, ndev->mvdev.res.uid);
+		MLX5_SET(rst2init_qp_in, *in, qpn, qpn);
+		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
+		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
+		MLX5_SET(qpc, qpc, rwe, 1);
+		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
+		MLX5_SET(ads, pp, vhca_port_num, 1);
+		break;
+	case MLX5_CMD_OP_INIT2RTR_QP:
+		*inlen = MLX5_ST_SZ_BYTES(init2rtr_qp_in);
+		*outlen = MLX5_ST_SZ_BYTES(init2rtr_qp_out);
+		*in = kzalloc(*inlen, GFP_KERNEL);
+		*out = kzalloc(*outlen, GFP_KERNEL);
+		if (!*in || !*out)
+			goto outerr;
+
+		MLX5_SET(init2rtr_qp_in, *in, opcode, cmd);
+		MLX5_SET(init2rtr_qp_in, *in, uid, ndev->mvdev.res.uid);
+		MLX5_SET(init2rtr_qp_in, *in, qpn, qpn);
+		/* locate qpc through this command's own layout */
+		qpc = MLX5_ADDR_OF(init2rtr_qp_in, *in, qpc);
+		MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
+		MLX5_SET(qpc, qpc, log_msg_max, 30);
+		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
+		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
+		MLX5_SET(ads, pp, fl, 1);
+		break;
+	case MLX5_CMD_OP_RTR2RTS_QP:
+		*inlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_in);
+		*outlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_out);
+		*in = kzalloc(*inlen, GFP_KERNEL);
+		*out = kzalloc(*outlen, GFP_KERNEL);
+		if (!*in || !*out)
+			goto outerr;
+
+		MLX5_SET(rtr2rts_qp_in, *in, opcode, cmd);
+		MLX5_SET(rtr2rts_qp_in, *in, uid, ndev->mvdev.res.uid);
+		MLX5_SET(rtr2rts_qp_in, *in, qpn, qpn);
+		/* locate qpc through this command's own layout */
+		qpc = MLX5_ADDR_OF(rtr2rts_qp_in, *in, qpc);
+		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
+		MLX5_SET(ads, pp, ack_timeout, 14);
+		MLX5_SET(qpc, qpc, retry_count, 7);
+		MLX5_SET(qpc, qpc, rnr_retry, 7);
+		break;
+	default:
+		goto outerr_nullify;
+	}
+
+	return;
+
+outerr:
+	/* kfree(NULL) is a no-op, so whichever allocation succeeded is freed */
+	kfree(*in);
+	kfree(*out);
+outerr_nullify:
+	*in = NULL;
+	*out = NULL;
+}
+
+/* Free the pair of mailboxes handed out by alloc_inout() */
+static void free_inout(void *in, void *out)
+{
+ kfree(in);
+ kfree(out);
+}
+
+/* Two QPs are used by each virtqueue. One is used by the driver and one by
+ * firmware. The fw argument indicates whether the subjected QP is the one used
+ * by firmware.
+ *
+ * Executes the state transition @cmd on the selected QP. Returns -ENOMEM
+ * when alloc_inout() hands back no mailboxes (allocation failure or
+ * unsupported @cmd), otherwise the result of the firmware command.
+ */
+static int modify_qp(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, bool fw, int cmd)
+{
+	void *out;
+	void *in;
+	int outlen;
+	int inlen;
+	int err;
+
+	alloc_inout(ndev, cmd, &in, &inlen, &out, &outlen,
+		    get_qpn(mvq, fw), get_rqpn(mvq, fw));
+	if (!in || !out)
+		return -ENOMEM;
+
+	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, outlen);
+
+	free_inout(in, out);
+	return err;
+}
+
+/* Walk both QPs of @mvq through the state transitions needed to connect them.
+ * The steps run strictly in table order and the first failure aborts the
+ * sequence; only the firmware QP receives the final RTR2RTS transition.
+ */
+static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+	static const struct {
+		bool fw;
+		int cmd;
+	} steps[] = {
+		{ true,  MLX5_CMD_OP_2RST_QP },
+		{ false, MLX5_CMD_OP_2RST_QP },
+		{ true,  MLX5_CMD_OP_RST2INIT_QP },
+		{ false, MLX5_CMD_OP_RST2INIT_QP },
+		{ true,  MLX5_CMD_OP_INIT2RTR_QP },
+		{ false, MLX5_CMD_OP_INIT2RTR_QP },
+		{ true,  MLX5_CMD_OP_RTR2RTS_QP },
+	};
+	int err;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(steps); i++) {
+		err = modify_qp(ndev, mvq, steps[i].fw, steps[i].cmd);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/* Values read back from a virtqueue object by query_virtqueue(): the object
+ * state plus the hardware available and used indices.
+ */
+struct mlx5_virtq_attr {
+ u8 state;
+ u16 available_index;
+ u16 used_index;
+};
+
+/* Query the virtqueue object of @mvq and report its state and hardware
+ * indices through @attr. @attr is written only when the command succeeds.
+ *
+ * Returns 0 on success, -ENOMEM if the output buffer cannot be allocated, or
+ * the error reported by mlx5_cmd_exec().
+ */
+static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
+			   struct mlx5_virtq_attr *attr)
+{
+	u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {};
+	int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out);
+	void *hdr;
+	void *ctx;
+	void *out;
+	int err;
+
+	out = kzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr);
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_id, mvq->virtq_id);
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, uid, ndev->mvdev.res.uid);
+
+	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);
+	if (!err) {
+		ctx = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);
+		memset(attr, 0, sizeof(*attr));
+		attr->state = MLX5_GET(virtio_net_q_object, ctx, state);
+		attr->available_index = MLX5_GET(virtio_net_q_object, ctx, hw_available_index);
+		attr->used_index = MLX5_GET(virtio_net_q_object, ctx, hw_used_index);
+	}
+
+	kfree(out);
+	return err;
+}
+
+/* Tell whether a virtqueue object may move from @oldstate to @newstate.
+ * Only INIT -> RDY and RDY -> SUSPEND are accepted.
+ */
+static bool is_valid_state_change(int oldstate, int newstate)
+{
+	if (oldstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT)
+		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY;
+
+	if (oldstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
+		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND;
+
+	/* SUSPEND, ERR and any other value allow no transition. */
+	return false;
+}
+
+/* Move the virtqueue object of @mvq to @state.
+ *
+ * Returns 0 without issuing a command when fw_state is
+ * MLX5_VIRTIO_NET_Q_OBJECT_NONE, -EINVAL when is_valid_state_change() rejects
+ * the transition, -ENOMEM on allocation failure, or the result of
+ * mlx5_cmd_exec(). mvq->fw_state is updated only when the command succeeds.
+ */
+static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
+	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
+	void *hdr;
+	void *ctx;
+	void *in;
+	int err;
+
+	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_NONE)
+		return 0;
+
+	if (!is_valid_state_change(mvq->fw_state, state))
+		return -EINVAL;
+
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr);
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_id, mvq->virtq_id);
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, uid, ndev->mvdev.res.uid);
+
+	ctx = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
+	MLX5_SET64(virtio_net_q_object, ctx, modify_field_select,
+		   MLX5_VIRTQ_MODIFY_MASK_STATE);
+	MLX5_SET(virtio_net_q_object, ctx, state, state);
+
+	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
+	kfree(in);
+	if (err)
+		return err;
+
+	mvq->fw_state = state;
+	return 0;
+}
+
+/* Create a virtio queue counters object for @mvq and store its id in
+ * mvq->counter_set_id. Returns 0 without doing anything when
+ * counters_supported() reports no support.
+ */
+static int counter_set_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+	u32 out[MLX5_ST_SZ_DW(create_virtio_q_counters_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(create_virtio_q_counters_in)] = {};
+	void *hdr;
+	int err;
+
+	if (!counters_supported(&ndev->mvdev))
+		return 0;
+
+	hdr = MLX5_ADDR_OF(create_virtio_q_counters_in, in, hdr);
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, uid, ndev->mvdev.res.uid);
+
+	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out));
+	if (!err)
+		mvq->counter_set_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
+
+	return err;
+}
+
+/* Destroy the counters object identified by mvq->counter_set_id. A failure is
+ * only logged. Nothing is done when counters are not supported.
+ */
+static void counter_set_dealloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+	u32 out[MLX5_ST_SZ_DW(destroy_virtio_q_counters_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(destroy_virtio_q_counters_in)] = {};
+	int err;
+
+	if (!counters_supported(&ndev->mvdev))
+		return;
+
+	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
+	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
+	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.obj_id, mvq->counter_set_id);
+	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.uid, ndev->mvdev.res.uid);
+
+	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out));
+	if (err)
+		mlx5_vdpa_warn(&ndev->mvdev, "dealloc counter set 0x%x\n", mvq->counter_set_id);
+}
+
+/* Interrupt handler for a virtqueue vector. @priv is the vdpa_callback
+ * registered for the queue; the interrupt is forwarded to it when a callback
+ * is set and is otherwise reported as handled.
+ */
+static irqreturn_t mlx5_vdpa_int_handler(int irq, void *priv)
+{
+	struct vdpa_callback *event_cb = priv;
+
+	if (!event_cb->callback)
+		return IRQ_HANDLED;
+
+	return event_cb->callback(event_cb->private);
+}
+
+/* Try to give @mvq an interrupt vector from the device's IRQ pool.
+ *
+ * The first unused pool entry is named, bound to the queue's event callback
+ * slot and requested. mvq->map is only written when request_irq() succeeds;
+ * with no free entry, or on request failure, the function returns silently
+ * and leaves mvq->map untouched.
+ */
+static void alloc_vector(struct mlx5_vdpa_net *ndev,
+			 struct mlx5_vdpa_virtqueue *mvq)
+{
+	struct mlx5_vdpa_irq_pool *irqp = &ndev->irqp;
+	struct mlx5_vdpa_irq_pool_entry *ent;
+	int i;
+
+	for (i = 0; i < irqp->num_ent; i++)
+		if (!irqp->entries[i].used)
+			break;
+
+	if (i >= irqp->num_ent)
+		return;
+
+	ent = &irqp->entries[i];
+	snprintf(ent->name, MLX5_VDPA_IRQ_NAME_LEN, "%s-vq-%d",
+		 dev_name(&ndev->mvdev.vdev.dev), mvq->index);
+	ent->dev_id = &ndev->event_cbs[mvq->index];
+	if (request_irq(ent->map.virq, mlx5_vdpa_int_handler, 0,
+			ent->name, ent->dev_id))
+		return;
+
+	ent->used = true;
+	mvq->map = ent->map;
+}
+
+/* Return the interrupt vector held by @mvq, if any, to the device's IRQ pool.
+ *
+ * The pool entry whose virq equals mvq->map.virq is released with free_irq()
+ * and marked unused. mvq->map.virq is then cleared: alloc_vector() leaves
+ * mvq->map untouched when it fails, so a stale virq would otherwise make the
+ * queue appear to still own a vector and lead to a second free_irq() on a
+ * vector that may since have been handed to another queue.
+ */
+static void dealloc_vector(struct mlx5_vdpa_net *ndev,
+			   struct mlx5_vdpa_virtqueue *mvq)
+{
+	struct mlx5_vdpa_irq_pool *irqp = &ndev->irqp;
+	int i;
+
+	for (i = 0; i < irqp->num_ent; i++) {
+		if (mvq->map.virq != irqp->entries[i].map.virq)
+			continue;
+
+		free_irq(mvq->map.virq, irqp->entries[i].dev_id);
+		irqp->entries[i].used = false;
+		/* Forget the vector so it cannot be released twice. */
+		mvq->map.virq = 0;
+		return;
+	}
+}
+
+/* Create the resources backing @mvq in this order: CQ, the two QPs (fwqp then
+ * vqqp), the QP connection, the counter set, an interrupt vector and the
+ * virtqueue object; the object is moved to RDY when mvq->ready is set.
+ *
+ * Returns 0 immediately when the queue has no entries configured (num_ent is
+ * zero) or is already initialized. On failure everything created so far is
+ * released in reverse order through the labels at the end and the error is
+ * returned.
+ */
+static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+ u16 idx = mvq->index;
+ int err;
+
+ if (!mvq->num_ent)
+ return 0;
+
+ if (mvq->initialized)
+ return 0;
+
+ err = cq_create(ndev, idx, mvq->num_ent);
+ if (err)
+ return err;
+
+ err = qp_create(ndev, mvq, &mvq->fwqp);
+ if (err)
+ goto err_fwqp;
+
+ err = qp_create(ndev, mvq, &mvq->vqqp);
+ if (err)
+ goto err_vqqp;
+
+ err = connect_qps(ndev, mvq);
+ if (err)
+ goto err_connect;
+
+ err = counter_set_alloc(ndev, mvq);
+ if (err)
+ goto err_connect;
+
+ /* alloc_vector() returns nothing; failing to get a vector is not an error here. */
+ alloc_vector(ndev, mvq);
+ err = create_virtqueue(ndev, mvq);
+ if (err)
+ goto err_vq;
+
+ if (mvq->ready) {
+ err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
+ if (err) {
+ mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n",
+ idx, err);
+ goto err_modify;
+ }
+ }
+
+ mvq->initialized = true;
+ return 0;
+
+ /* Unwind ladder: each label undoes the steps that succeeded before the failing one. */
+err_modify:
+ destroy_virtqueue(ndev, mvq);
+err_vq:
+ dealloc_vector(ndev, mvq);
+ counter_set_dealloc(ndev, mvq);
+err_connect:
+ qp_destroy(ndev, &mvq->vqqp);
+err_vqqp:
+ qp_destroy(ndev, &mvq->fwqp);
+err_fwqp:
+ cq_destroy(ndev, idx);
+ return err;
+}
+
+/* Suspend @mvq and record its hardware indices in mvq->avail_idx and
+ * mvq->used_idx. Only acts on an initialized queue whose firmware state is
+ * RDY. A failed suspend is logged and the indices are still queried; a failed
+ * query is logged and leaves the saved indices unchanged.
+ */
+static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+	struct mlx5_virtq_attr attr;
+	int err;
+
+	if (!mvq->initialized || mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
+		return;
+
+	err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND);
+	if (err)
+		mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
+
+	err = query_virtqueue(ndev, mvq, &attr);
+	if (err) {
+		mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
+		return;
+	}
+
+	mvq->avail_idx = attr.available_index;
+	mvq->used_idx = attr.used_index;
+}
+
+/* Apply suspend_vq() to every virtqueue slot, from index 0 up to max_vqs. */
+static void suspend_vqs(struct mlx5_vdpa_net *ndev)
+{
+	int idx = 0;
+
+	while (idx < ndev->mvdev.max_vqs) {
+		suspend_vq(ndev, &ndev->vqs[idx]);
+		idx++;
+	}
+}
+
+/* Undo setup_vq(): suspend the queue, then release its resources in the
+ * reverse order of their creation. Does nothing for an uninitialized queue.
+ */
+static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+	if (mvq->initialized) {
+		suspend_vq(ndev, mvq);
+		destroy_virtqueue(ndev, mvq);
+		dealloc_vector(ndev, mvq);
+		counter_set_dealloc(ndev, mvq);
+		qp_destroy(ndev, &mvq->vqqp);
+		qp_destroy(ndev, &mvq->fwqp);
+		cq_destroy(ndev, mvq->index);
+		mvq->initialized = false;
+	}
+}
+
+/* Create the RQ table and store its number in ndev->res.rqtn.
+ *
+ * The table is sized to rqt_size rounded up to a power of two. Its active
+ * part has one entry per current queue pair (also rounded up to a power of
+ * two) and is filled with the ids of the even-indexed virtqueues, wrapping
+ * around cur_num_vqs.
+ */
+static int create_rqt(struct mlx5_vdpa_net *ndev)
+{
+	int act_sz = roundup_pow_of_two(ndev->cur_num_vqs / 2);
+	int rqt_table_size = roundup_pow_of_two(ndev->rqt_size);
+	__be32 *list;
+	void *rqtc;
+	void *in;
+	int inlen;
+	int err;
+	int i;
+
+	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + rqt_table_size * MLX5_ST_SZ_BYTES(rq_num);
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(create_rqt_in, in, uid, ndev->mvdev.res.uid);
+
+	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
+	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
+	MLX5_SET(rqtc, rqtc, rqt_max_size, rqt_table_size);
+	MLX5_SET(rqtc, rqtc, rqt_actual_size, act_sz);
+
+	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
+	for (i = 0; i < act_sz; i++)
+		list[i] = cpu_to_be32(ndev->vqs[(2 * i) % ndev->cur_num_vqs].virtq_id);
+
+	err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
+	kfree(in);
+
+	return err;
+}
+
+/* Bitmask value passed to MODIFY_RQT by modify_rqt(). */
+#define MLX5_MODIFY_RQT_NUM_RQS ((u64)1)
+
+/* Rewrite the active part of the RQ table for @num virtqueues: one entry per
+ * queue pair (rounded up to a power of two), filled with the ids of the
+ * even-indexed virtqueues, wrapping around @num.
+ */
+static int modify_rqt(struct mlx5_vdpa_net *ndev, int num)
+{
+	int act_sz = roundup_pow_of_two(num / 2);
+	__be32 *list;
+	void *rqtc;
+	void *in;
+	int inlen;
+	int err;
+	int i;
+
+	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + act_sz * MLX5_ST_SZ_BYTES(rq_num);
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(modify_rqt_in, in, uid, ndev->mvdev.res.uid);
+	MLX5_SET64(modify_rqt_in, in, bitmask, MLX5_MODIFY_RQT_NUM_RQS);
+
+	rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
+	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
+	MLX5_SET(rqtc, rqtc, rqt_actual_size, act_sz);
+
+	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
+	for (i = 0; i < act_sz; i++)
+		list[i] = cpu_to_be32(ndev->vqs[(2 * i) % num].virtq_id);
+
+	err = mlx5_vdpa_modify_rqt(&ndev->mvdev, in, inlen, ndev->res.rqtn);
+	kfree(in);
+
+	return err;
+}
+
+/* Destroy the RQ table whose number is stored in ndev->res.rqtn. */
+static void destroy_rqt(struct mlx5_vdpa_net *ndev)
+{
+	u32 rqtn = ndev->res.rqtn;
+
+	mlx5_vdpa_destroy_rqt(&ndev->mvdev, rqtn);
+}
+
+/* Create the TIR that spreads received traffic over the RQ table.
+ *
+ * The TIR is indirect, points at ndev->res.rqtn and uses a symmetric Toeplitz
+ * hash with a fixed key over the outer IPv4/TCP source and destination
+ * addresses and ports. On success the TIR number is stored in ndev->res.tirn
+ * and passed to mlx5_vdpa_add_tirn().
+ */
+static int create_tir(struct mlx5_vdpa_net *ndev)
+{
+#define HASH_IP_L4PORTS \
+	(MLX5_HASH_FIELD_SEL_SRC_IP | MLX5_HASH_FIELD_SEL_DST_IP | MLX5_HASH_FIELD_SEL_L4_SPORT | \
+	 MLX5_HASH_FIELD_SEL_L4_DPORT)
+	static const u8 rx_hash_toeplitz_key[] = { 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7,
+						   0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94,
+						   0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1,
+						   0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59,
+						   0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a };
+	void *hash_sel;
+	void *key;
+	void *tirc;
+	void *in;
+	int err;
+
+	in = kzalloc(MLX5_ST_SZ_BYTES(create_tir_in), GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(create_tir_in, in, uid, ndev->mvdev.res.uid);
+
+	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
+	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
+	MLX5_SET(tirc, tirc, indirect_table, ndev->res.rqtn);
+	MLX5_SET(tirc, tirc, transport_domain, ndev->res.tdn);
+
+	MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
+	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
+	key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
+	memcpy(key, rx_hash_toeplitz_key, sizeof(rx_hash_toeplitz_key));
+
+	hash_sel = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
+	MLX5_SET(rx_hash_field_select, hash_sel, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4);
+	MLX5_SET(rx_hash_field_select, hash_sel, l4_prot_type, MLX5_L4_PROT_TYPE_TCP);
+	MLX5_SET(rx_hash_field_select, hash_sel, selected_fields, HASH_IP_L4PORTS);
+
+	err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn);
+	kfree(in);
+	if (err)
+		return err;
+
+	mlx5_vdpa_add_tirn(ndev);
+	return 0;
+}
+
+/* Counterpart of create_tir(): mlx5_vdpa_remove_tirn() is called first, then
+ * the TIR itself is destroyed.
+ */
+static void destroy_tir(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
+
+	mlx5_vdpa_remove_tirn(ndev);
+	mlx5_vdpa_destroy_tir(mvdev, ndev->res.tirn);
+}
+
+/* Limits for the RX flow table created by setup_steering(): maximum number of
+ * flow table entries and maximum number of auto-created groups.
+ */
+#define MAX_STEERING_ENT 0x8000
+#define MAX_STEERING_GROUPS 2
+
+/* Destinations per steering rule: the TIR, plus a flow counter when steering
+ * debug is enabled.
+ */
+#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
+ #define NUM_DESTS 2
+#else
+ #define NUM_DESTS 1
+#endif
+
+/* With CONFIG_MLX5_VDPA_STEERING_DEBUG, create the unicast and multicast flow
+ * counters of @node, mark dests[1] as a counter destination and add the COUNT
+ * action to @flow_act. If the multicast counter cannot be created the unicast
+ * one is destroyed again. Without the option this is a no-op returning 0.
+ */
+static int add_steering_counters(struct mlx5_vdpa_net *ndev,
+				 struct macvlan_node *node,
+				 struct mlx5_flow_act *flow_act,
+				 struct mlx5_flow_destination *dests)
+{
+#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
+	node->ucast_counter.counter = mlx5_fc_create(ndev->mvdev.mdev, false);
+	if (IS_ERR(node->ucast_counter.counter))
+		return PTR_ERR(node->ucast_counter.counter);
+
+	node->mcast_counter.counter = mlx5_fc_create(ndev->mvdev.mdev, false);
+	if (IS_ERR(node->mcast_counter.counter)) {
+		int err = PTR_ERR(node->mcast_counter.counter);
+
+		mlx5_fc_destroy(ndev->mvdev.mdev, node->ucast_counter.counter);
+		return err;
+	}
+
+	dests[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+	flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
+#endif
+	return 0;
+}
+
+/* Destroy the multicast and unicast flow counters of @node. Compiles to an
+ * empty function unless CONFIG_MLX5_VDPA_STEERING_DEBUG is defined.
+ */
+static void remove_steering_counters(struct mlx5_vdpa_net *ndev,
+ struct macvlan_node *node)
+{
+#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
+ mlx5_fc_destroy(ndev->mvdev.mdev, node->mcast_counter.counter);
+ mlx5_fc_destroy(ndev->mvdev.mdev, node->ucast_counter.counter);
+#endif
+}
+
+/* Install the two RX steering rules of @node in ndev->rxft, both forwarding
+ * to the TIR: a unicast rule matching destination MAC @mac exactly, and a
+ * multicast rule matching only the first destination MAC byte against 0x01.
+ * When VIRTIO_NET_F_CTRL_VLAN is negotiated the rules also match on the VLAN
+ * tag and id; for a tagged node the id is derived from node->macvlan.
+ *
+ * Returns 0 on success or a negative errno; on failure the rule and counters
+ * created so far are removed.
+ */
+static int mlx5_vdpa_add_mac_vlan_rules(struct mlx5_vdpa_net *ndev, u8 *mac,
+ struct macvlan_node *node)
+{
+ struct mlx5_flow_destination dests[NUM_DESTS] = {};
+ struct mlx5_flow_act flow_act = {};
+ struct mlx5_flow_spec *spec;
+ void *headers_c;
+ void *headers_v;
+ u8 *dmac_c;
+ u8 *dmac_v;
+ int err;
+ u16 vid;
+
+ spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+ if (!spec)
+ return -ENOMEM;
+
+ vid = key2vid(node->macvlan);
+ spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+ headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, outer_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers);
+ dmac_c = MLX5_ADDR_OF(fte_match_param, headers_c, outer_headers.dmac_47_16);
+ dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, outer_headers.dmac_47_16);
+ /* Unicast rule: all destination MAC bits are significant and must equal @mac. */
+ eth_broadcast_addr(dmac_c);
+ ether_addr_copy(dmac_v, mac);
+ if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN)) {
+ MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 1);
+ MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, first_vid);
+ }
+ if (node->tagged) {
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag, 1);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, vid);
+ }
+ flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+ dests[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
+ dests[0].tir_num = ndev->res.tirn;
+ err = add_steering_counters(ndev, node, &flow_act, dests);
+ if (err)
+ goto out_free;
+
+#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
+ dests[1].counter_id = mlx5_fc_id(node->ucast_counter.counter);
+#endif
+ node->ucast_rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dests, NUM_DESTS);
+ if (IS_ERR(node->ucast_rule)) {
+ err = PTR_ERR(node->ucast_rule);
+ goto err_ucast;
+ }
+
+#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
+ dests[1].counter_id = mlx5_fc_id(node->mcast_counter.counter);
+#endif
+
+ /* Reuse the same spec for the multicast rule: match only the first MAC byte against 0x01. */
+ memset(dmac_c, 0, ETH_ALEN);
+ memset(dmac_v, 0, ETH_ALEN);
+ dmac_c[0] = 1;
+ dmac_v[0] = 1;
+ node->mcast_rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dests, NUM_DESTS);
+ if (IS_ERR(node->mcast_rule)) {
+ err = PTR_ERR(node->mcast_rule);
+ goto err_mcast;
+ }
+ kvfree(spec);
+ mlx5_vdpa_add_rx_counters(ndev, node);
+ return 0;
+
+err_mcast:
+ mlx5_del_flow_rules(node->ucast_rule);
+err_ucast:
+ remove_steering_counters(ndev, node);
+out_free:
+ kvfree(spec);
+ return err;
+}
+
+/* Remove what mlx5_vdpa_add_mac_vlan_rules() installed for @node: the RX
+ * counters are removed first, then the unicast and multicast flow rules are
+ * deleted. The steering counters are not touched here.
+ */
+static void mlx5_vdpa_del_mac_vlan_rules(struct mlx5_vdpa_net *ndev,
+ struct macvlan_node *node)
+{
+ mlx5_vdpa_remove_rx_counters(ndev, node);
+ mlx5_del_flow_rules(node->ucast_rule);
+ mlx5_del_flow_rules(node->mcast_rule);
+}
+
+/* Build the 64-bit MAC/VLAN table key: the VLAN id (or MLX5V_UNTAGGED when
+ * @tagged is false) in bits 63:48, followed by the six MAC bytes with mac[0]
+ * as the most significant.
+ */
+static u64 search_val(u8 *mac, u16 vlan, bool tagged)
+{
+	u16 tag = tagged ? vlan : MLX5V_UNTAGGED;
+	u64 key = tag;
+	int i;
+
+	for (i = 0; i < ETH_ALEN; i++)
+		key = key << 8 | mac[i];
+
+	return key;
+}
+
+/* Find the MAC/VLAN node whose key equals @value. The bucket is selected with
+ * an 8-bit hash of the key. Returns NULL when no node matches.
+ */
+static struct macvlan_node *mac_vlan_lookup(struct mlx5_vdpa_net *ndev, u64 value)
+{
+	u32 bucket = hash_64(value, 8);
+	struct macvlan_node *node;
+
+	hlist_for_each_entry(node, &ndev->macvlan_hash[bucket], hlist) {
+		if (node->macvlan != value)
+			continue;
+
+		return node;
+	}
+
+	return NULL;
+}
+
+/* Add a MAC/VLAN entry and install its steering rules.
+ *
+ * The table key is built from @mac, @vid and @tagged, while the rules are
+ * installed for ndev->config.mac. Returns -EEXIST if the key is already
+ * present, -ENOMEM on allocation failure, or the error from
+ * mlx5_vdpa_add_mac_vlan_rules(); the node joins the hash table only after
+ * its rules were installed.
+ */
+static int mac_vlan_add(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vid, bool tagged)
+{
+	u64 key = search_val(mac, vid, tagged);
+	struct macvlan_node *node;
+	int err;
+
+	if (mac_vlan_lookup(ndev, key))
+		return -EEXIST;
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
+
+	node->ndev = ndev;
+	node->macvlan = key;
+	node->tagged = tagged;
+
+	err = mlx5_vdpa_add_mac_vlan_rules(ndev, ndev->config.mac, node);
+	if (err) {
+		kfree(node);
+		return err;
+	}
+
+	hlist_add_head(&node->hlist, &ndev->macvlan_hash[hash_64(key, 8)]);
+	return 0;
+}
+
+/* Remove the MAC/VLAN entry matching @mac, @vlan and @tagged, if present:
+ * unlink it, delete its rules and counters and free the node.
+ */
+static void mac_vlan_del(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vlan, bool tagged)
+{
+	u64 key = search_val(mac, vlan, tagged);
+	struct macvlan_node *node = mac_vlan_lookup(ndev, key);
+
+	if (node) {
+		hlist_del(&node->hlist);
+		mlx5_vdpa_del_mac_vlan_rules(ndev, node);
+		remove_steering_counters(ndev, node);
+		kfree(node);
+	}
+}
+
+/* Empty the MAC/VLAN hash table: every node in every bucket is unlinked, has
+ * its rules and counters removed, and is freed.
+ */
+static void clear_mac_vlan_table(struct mlx5_vdpa_net *ndev)
+{
+	struct macvlan_node *node;
+	struct hlist_node *next;
+	int bucket;
+
+	for (bucket = 0; bucket < MLX5V_MACVLAN_SIZE; bucket++)
+		hlist_for_each_entry_safe(node, next, &ndev->macvlan_hash[bucket], hlist) {
+			hlist_del(&node->hlist);
+			mlx5_vdpa_del_mac_vlan_rules(ndev, node);
+			remove_steering_counters(ndev, node);
+			kfree(node);
+		}
+}
+
+/* Create the auto-grouped RX flow table in the BYPASS namespace and add the
+ * untagged entry for the device's own MAC address.
+ *
+ * Returns -EOPNOTSUPP when the namespace is unavailable, the flow table
+ * creation error, or the mac_vlan_add() error (in which case the flow table
+ * is destroyed again).
+ */
+static int setup_steering(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_flow_table_attr ft_attr = {};
+	struct mlx5_flow_namespace *ns;
+	int err;
+
+	ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS);
+	if (!ns) {
+		mlx5_vdpa_warn(&ndev->mvdev, "failed to get flow namespace\n");
+		return -EOPNOTSUPP;
+	}
+
+	ft_attr.autogroup.max_num_groups = MAX_STEERING_GROUPS;
+	ft_attr.max_fte = MAX_STEERING_ENT;
+	ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
+	if (IS_ERR(ndev->rxft)) {
+		mlx5_vdpa_warn(&ndev->mvdev, "failed to create flow table\n");
+		return PTR_ERR(ndev->rxft);
+	}
+	mlx5_vdpa_add_rx_flow_table(ndev);
+
+	err = mac_vlan_add(ndev, ndev->config.mac, 0, false);
+	if (!err)
+		return 0;
+
+	mlx5_vdpa_remove_rx_flow_table(ndev);
+	mlx5_destroy_flow_table(ndev->rxft);
+	return err;
+}
+
+/* Undo setup_steering(): drop every MAC/VLAN entry (and its rules) first,
+ * then remove and destroy the RX flow table.
+ */
+static void teardown_steering(struct mlx5_vdpa_net *ndev)
+{
+ clear_mac_vlan_table(ndev);
+ mlx5_vdpa_remove_rx_flow_table(ndev);
+ mlx5_destroy_flow_table(ndev->rxft);
+}
+
+/* Handle a VIRTIO_NET_CTRL_MAC class command from the control virtqueue.
+ *
+ * Only VIRTIO_NET_CTRL_MAC_ADDR_SET is supported. The new address is pulled
+ * from the request, the MPFS table is updated (old address removed, new one
+ * added), ndev->config.mac is replaced and the untagged forwarding rules are
+ * recreated for the new address. If installing the rules fails, a best-effort
+ * attempt is made to restore the previous address in both MPFS and the
+ * forwarding rules.
+ *
+ * Returns VIRTIO_NET_OK on success (including when the address is unchanged)
+ * and VIRTIO_NET_ERR otherwise.
+ */
+static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd)
+{
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	struct mlx5_control_vq *cvq = &mvdev->cvq;
+	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
+	struct mlx5_core_dev *pfmdev;
+	size_t read;
+	u8 mac[ETH_ALEN], mac_back[ETH_ALEN];
+
+	pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
+	switch (cmd) {
+	case VIRTIO_NET_CTRL_MAC_ADDR_SET:
+		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN);
+		if (read != ETH_ALEN)
+			break;
+
+		/* Setting the address that is already configured is a successful no-op. */
+		if (!memcmp(ndev->config.mac, mac, ETH_ALEN)) {
+			status = VIRTIO_NET_OK;
+			break;
+		}
+
+		if (is_zero_ether_addr(mac))
+			break;
+
+		if (!is_zero_ether_addr(ndev->config.mac)) {
+			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
+				mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n",
+					       ndev->config.mac);
+				break;
+			}
+		}
+
+		if (mlx5_mpfs_add_mac(pfmdev, mac)) {
+			mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n",
+				       mac);
+			break;
+		}
+
+		/* Back up the original MAC address so that it can be restored if
+		 * adding the forward rules fails.
+		 */
+		memcpy(mac_back, ndev->config.mac, ETH_ALEN);
+
+		memcpy(ndev->config.mac, mac, ETH_ALEN);
+
+		/* The flow table entry must be recreated so that packets are
+		 * forwarded for the new address.
+		 */
+		mac_vlan_del(ndev, mac_back, 0, false);
+
+		if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) {
+			mlx5_vdpa_warn(mvdev, "failed to insert forward rules, try to restore\n");
+
+			/* Unlikely to get here with a zero original MAC, but double check. */
+			if (is_zero_ether_addr(mac_back)) {
+				mlx5_vdpa_warn(mvdev, "restore mac failed: Original MAC is zero\n");
+				break;
+			}
+
+			/* Try to restore the original MAC address in the MPFS table,
+			 * and try to restore the forward rule entry.
+			 */
+			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
+				mlx5_vdpa_warn(mvdev, "restore mac failed: delete MAC %pM from MPFS table failed\n",
+					       ndev->config.mac);
+			}
+
+			if (mlx5_mpfs_add_mac(pfmdev, mac_back)) {
+				mlx5_vdpa_warn(mvdev, "restore mac failed: insert old MAC %pM into MPFS table failed\n",
+					       mac_back);
+			}
+
+			memcpy(ndev->config.mac, mac_back, ETH_ALEN);
+
+			if (mac_vlan_add(ndev, ndev->config.mac, 0, false))
+				mlx5_vdpa_warn(mvdev, "restore forward rules failed: insert forward rules failed\n");
+
+			break;
+		}
+
+		status = VIRTIO_NET_OK;
+		break;
+
+	default:
+		break;
+	}
+
+	return status;
+}
+
+/* Change the number of active queue pairs to @newqps (two virtqueues each).
+ *
+ * Shrinking: the RQ table is modified first, then the surplus virtqueues are
+ * torn down from the highest index down. Growing: the new virtqueues are set
+ * up, then the RQ table is modified; on any failure the virtqueues added by
+ * this call are torn down and cur_num_vqs is restored to its previous value.
+ */
+static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps)
+{
+ struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+ int cur_qps = ndev->cur_num_vqs / 2;
+ int err;
+ int i;
+
+ if (cur_qps > newqps) {
+ err = modify_rqt(ndev, 2 * newqps);
+ if (err)
+ return err;
+
+ for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--)
+ teardown_vq(ndev, &ndev->vqs[i]);
+
+ ndev->cur_num_vqs = 2 * newqps;
+ } else {
+ ndev->cur_num_vqs = 2 * newqps;
+ for (i = cur_qps * 2; i < 2 * newqps; i++) {
+ err = setup_vq(ndev, &ndev->vqs[i]);
+ if (err)
+ goto clean_added;
+ }
+ err = modify_rqt(ndev, 2 * newqps);
+ if (err)
+ goto clean_added;
+ }
+ return 0;
+
+clean_added:
+ /* i is the index that failed (or one past the last new vq); unwind below it. */
+ for (--i; i >= 2 * cur_qps; --i)
+ teardown_vq(ndev, &ndev->vqs[i]);
+
+ ndev->cur_num_vqs = 2 * cur_qps;
+
+ return err;
+}
+
+/* Handle a VIRTIO_NET_CTRL_MQ class command from the control virtqueue.
+ *
+ * Only VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET is supported. Returns VIRTIO_NET_OK
+ * when the requested number of queue pairs is already active or was applied
+ * by change_num_qps(), and VIRTIO_NET_ERR in every other case.
+ */
+static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd)
+{
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	struct mlx5_control_vq *cvq = &mvdev->cvq;
+	struct virtio_net_ctrl_mq mq;
+	size_t read;
+	u16 newqps;
+
+	if (cmd != VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET)
+		return VIRTIO_NET_ERR;
+
+	/* This mq feature check aligns with pre-existing userspace
+	 * implementation.
+	 *
+	 * Without it, an untrusted driver could fake a multiqueue config
+	 * request down to a non-mq device that may cause kernel to
+	 * panic due to uninitialized resources for extra vqs. Even with
+	 * a well behaving guest driver, it is not expected to allow
+	 * changing the number of vqs on a non-mq device.
+	 */
+	if (!MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ))
+		return VIRTIO_NET_ERR;
+
+	read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)&mq, sizeof(mq));
+	if (read != sizeof(mq))
+		return VIRTIO_NET_ERR;
+
+	newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs);
+	if (newqps < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
+	    newqps > ndev->rqt_size)
+		return VIRTIO_NET_ERR;
+
+	/* Nothing to change. */
+	if (ndev->cur_num_vqs == 2 * newqps)
+		return VIRTIO_NET_OK;
+
+	return change_num_qps(mvdev, newqps) ? VIRTIO_NET_ERR : VIRTIO_NET_OK;
+}
+
+/* Handle a VIRTIO_NET_CTRL_VLAN class command from the control virtqueue.
+ *
+ * Requires VIRTIO_NET_F_CTRL_VLAN to be negotiated. VLAN_ADD installs a
+ * tagged entry for the device MAC and fails if mac_vlan_add() fails; VLAN_DEL
+ * removes the entry and always succeeds once the id has been read. Any other
+ * command, or a short read of the VLAN id, yields VIRTIO_NET_ERR.
+ */
+static virtio_net_ctrl_ack handle_ctrl_vlan(struct mlx5_vdpa_dev *mvdev, u8 cmd)
+{
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	struct mlx5_control_vq *cvq = &mvdev->cvq;
+	__virtio16 vlan;
+	size_t read;
+	u16 id;
+
+	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN)))
+		return VIRTIO_NET_ERR;
+
+	if (cmd != VIRTIO_NET_CTRL_VLAN_ADD && cmd != VIRTIO_NET_CTRL_VLAN_DEL)
+		return VIRTIO_NET_ERR;
+
+	read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &vlan, sizeof(vlan));
+	if (read != sizeof(vlan))
+		return VIRTIO_NET_ERR;
+
+	id = mlx5vdpa16_to_cpu(mvdev, vlan);
+	if (cmd == VIRTIO_NET_CTRL_VLAN_ADD) {
+		if (mac_vlan_add(ndev, ndev->config.mac, id, true))
+			return VIRTIO_NET_ERR;
+	} else {
+		mac_vlan_del(ndev, ndev->config.mac, id, true);
+	}
+
+	return VIRTIO_NET_OK;
+}
+
+/* Work item servicing the control virtqueue.
+ *
+ * Runs with ndev->reslock held for writing. It does nothing unless DRIVER_OK
+ * is set, VIRTIO_NET_F_CTRL_VQ is negotiated and the control vq is ready. At
+ * most one descriptor is handled per invocation: the command is dispatched by
+ * class, the ack status is written back, the descriptor is completed, and the
+ * work is queued again so that any further descriptors are picked up.
+ */
+static void mlx5_cvq_kick_handler(struct work_struct *work)
+{
+ virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
+ struct virtio_net_ctrl_hdr ctrl;
+ struct mlx5_vdpa_wq_ent *wqent;
+ struct mlx5_vdpa_dev *mvdev;
+ struct mlx5_control_vq *cvq;
+ struct mlx5_vdpa_net *ndev;
+ size_t read, write;
+ int err;
+
+ wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
+ mvdev = wqent->mvdev;
+ ndev = to_mlx5_vdpa_ndev(mvdev);
+ cvq = &mvdev->cvq;
+
+ down_write(&ndev->reslock);
+
+ if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
+ goto out;
+
+ if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
+ goto out;
+
+ if (!cvq->ready)
+ goto out;
+
+ /* Every path through the loop body ends in a break, so it runs at most once. */
+ while (true) {
+ err = vringh_getdesc_iotlb(&cvq->vring, &cvq->riov, &cvq->wiov, &cvq->head,
+ GFP_ATOMIC);
+ if (err <= 0)
+ break;
+
+ read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &ctrl, sizeof(ctrl));
+ if (read != sizeof(ctrl))
+ break;
+
+ cvq->received_desc++;
+ /* Unknown classes leave status at VIRTIO_NET_ERR. */
+ switch (ctrl.class) {
+ case VIRTIO_NET_CTRL_MAC:
+ status = handle_ctrl_mac(mvdev, ctrl.cmd);
+ break;
+ case VIRTIO_NET_CTRL_MQ:
+ status = handle_ctrl_mq(mvdev, ctrl.cmd);
+ break;
+ case VIRTIO_NET_CTRL_VLAN:
+ status = handle_ctrl_vlan(mvdev, ctrl.cmd);
+ break;
+ default:
+ break;
+ }
+
+ /* Make sure data is written before advancing index */
+ smp_wmb();
+
+ write = vringh_iov_push_iotlb(&cvq->vring, &cvq->wiov, &status, sizeof(status));
+ vringh_complete_iotlb(&cvq->vring, cvq->head, write);
+ vringh_kiov_cleanup(&cvq->riov);
+ vringh_kiov_cleanup(&cvq->wiov);
+
+ if (vringh_need_notify_iotlb(&cvq->vring))
+ vringh_notify(&cvq->vring);
+
+ cvq->completed_desc++;
+ /* Requeue ourselves instead of looping to handle the next descriptor. */
+ queue_work(mvdev->wq, &wqent->work);
+ break;
+ }
+
+out:
+ up_write(&ndev->reslock);
+}
+
+/* Kick virtqueue @idx. A control vq kick queues the control vq work item when
+ * the workqueue exists and the control vq is ready. A data vq kick writes the
+ * queue index to the kick address when the queue is ready. Invalid indices
+ * are ignored.
+ */
+static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+
+	if (!is_index_valid(mvdev, idx))
+		return;
+
+	if (unlikely(is_ctrl_vq_idx(mvdev, idx))) {
+		if (mvdev->wq && mvdev->cvq.ready)
+			queue_work(mvdev->wq, &ndev->cvq_ent.work);
+		return;
+	}
+
+	if (unlikely(!ndev->vqs[idx].ready))
+		return;
+
+	iowrite16(idx, ndev->mvdev.res.kick_addr);
+}
+
+/* Record the descriptor, driver and device area addresses of virtqueue @idx,
+ * in either the control vq or the data vq state. Returns -EINVAL for an
+ * invalid index and 0 otherwise.
+ */
+static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area,
+				    u64 driver_area, u64 device_area)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+
+	if (!is_index_valid(mvdev, idx))
+		return -EINVAL;
+
+	if (is_ctrl_vq_idx(mvdev, idx)) {
+		struct mlx5_control_vq *cvq = &mvdev->cvq;
+
+		cvq->desc_addr = desc_area;
+		cvq->device_addr = device_area;
+		cvq->driver_addr = driver_area;
+	} else {
+		struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+
+		mvq->desc_addr = desc_area;
+		mvq->device_addr = device_area;
+		mvq->driver_addr = driver_area;
+	}
+
+	return 0;
+}
+
+/* Set the number of entries of data virtqueue @idx. Invalid indices and the
+ * control vq index are ignored.
+ */
+static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+
+	if (!is_index_valid(mvdev, idx))
+		return;
+
+	if (is_ctrl_vq_idx(mvdev, idx))
+		return;
+
+	ndev->vqs[idx].num_ent = num;
+}
+
+/* Store the event callback of virtqueue @idx in ndev->event_cbs[]. For the
+ * control vq index a copy is also kept in the control vq state.
+ */
+static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+
+	ndev->event_cbs[idx] = *cb;
+	if (!is_ctrl_vq_idx(mvdev, idx))
+		return;
+
+	mvdev->cvq.event_cb = *cb;
+}
+
+/* vringh notify hook of the control vq: invoke the stored event callback, if
+ * one is set.
+ */
+static void mlx5_cvq_notify(struct vringh *vring)
+{
+	struct mlx5_control_vq *cvq = container_of(vring, struct mlx5_control_vq, vring);
+
+	if (cvq->event_cb.callback)
+		cvq->event_cb.callback(cvq->event_cb.private);
+}
+
+/* Update the ready flag of the control vq. When it becomes ready, the vring
+ * notify hook is pointed at mlx5_cvq_notify().
+ */
+static void set_cvq_ready(struct mlx5_vdpa_dev *mvdev, bool ready)
+{
+	struct mlx5_control_vq *cvq = &mvdev->cvq;
+
+	cvq->ready = ready;
+	if (ready)
+		cvq->vring.notify = mlx5_cvq_notify;
+}
+
+/* Set the ready state of virtqueue @idx.
+ *
+ * Ignored while no features are negotiated or for an invalid index. The
+ * control vq only has its software state updated. A data vq is suspended when
+ * @ready is false and moved to RDY otherwise; if that transition fails, the
+ * failure is logged and the queue is recorded as not ready.
+ */
+static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	struct mlx5_vdpa_virtqueue *mvq;
+	int err;
+
+	if (!mvdev->actual_features || !is_index_valid(mvdev, idx))
+		return;
+
+	if (is_ctrl_vq_idx(mvdev, idx)) {
+		set_cvq_ready(mvdev, ready);
+		return;
+	}
+
+	mvq = &ndev->vqs[idx];
+	if (ready) {
+		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
+		if (err) {
+			mlx5_vdpa_warn(mvdev, "modify VQ %d to ready failed (%d)\n", idx, err);
+			ready = false;
+		}
+	} else {
+		suspend_vq(ndev, mvq);
+	}
+
+	mvq->ready = ready;
+}
+
+/* Report the ready flag of virtqueue @idx; false for an invalid index. */
+static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+
+	if (!is_index_valid(mvdev, idx))
+		return false;
+
+	return is_ctrl_vq_idx(mvdev, idx) ? mvdev->cvq.ready : ndev->vqs[idx].ready;
+}
+
+/* Set the starting index of virtqueue @idx from state->split.avail_index.
+ *
+ * For the control vq the vring's last_avail_idx is updated. For a data vq
+ * both the saved used and available indices are set to the given value, which
+ * is refused with -EINVAL while the firmware object is in RDY state. Returns
+ * -EINVAL for an invalid index.
+ */
+static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
+				  const struct vdpa_vq_state *state)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	struct mlx5_vdpa_virtqueue *mvq;
+
+	if (!is_index_valid(mvdev, idx))
+		return -EINVAL;
+
+	if (is_ctrl_vq_idx(mvdev, idx)) {
+		mvdev->cvq.vring.last_avail_idx = state->split.avail_index;
+		return 0;
+	}
+
+	mvq = &ndev->vqs[idx];
+	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
+		mvq->used_idx = state->split.avail_index;
+		mvq->avail_idx = state->split.avail_index;
+		return 0;
+	}
+
+	mlx5_vdpa_warn(mvdev, "can't modify available index\n");
+	return -EINVAL;
+}
+
+/* Report the current index of virtqueue @idx in state->split.avail_index.
+ *
+ * The control vq reports the vring's last_avail_idx. An initialized data vq
+ * is queried from firmware; an uninitialized one reports the value saved by
+ * suspend_vq(). In both data vq cases the used index is what gets reported.
+ */
+static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	struct mlx5_vdpa_virtqueue *mvq;
+	struct mlx5_virtq_attr attr;
+	int err;
+
+	if (!is_index_valid(mvdev, idx))
+		return -EINVAL;
+
+	if (is_ctrl_vq_idx(mvdev, idx)) {
+		state->split.avail_index = mvdev->cvq.vring.last_avail_idx;
+		return 0;
+	}
+
+	mvq = &ndev->vqs[idx];
+	if (mvq->initialized) {
+		err = query_virtqueue(ndev, mvq, &attr);
+		if (err) {
+			mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
+			return err;
+		}
+		state->split.avail_index = attr.used_index;
+		return 0;
+	}
+
+	/* If the virtq object was destroyed, use the value saved at
+	 * the last minute of suspend_vq. This caters for userspace
+	 * that cares about emulating the index after vq is stopped.
+	 *
+	 * Firmware returns a wrong value for the available index.
+	 * Since both values should be identical, we take the value of
+	 * used_idx which is reported correctly.
+	 */
+	state->split.avail_index = mvq->used_idx;
+	return 0;
+}
+
+/* vdpa op: the virtqueue alignment reported by this driver is PAGE_SIZE. */
+static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev)
+{
+	return PAGE_SIZE;
+}
+
+/*
+ * vdpa op: map virtqueue @idx to its group. The control VQ belongs to
+ * MLX5_VDPA_CVQ_GROUP, every other index to MLX5_VDPA_DATAVQ_GROUP.
+ */
+static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdev, u16 idx)
+{
+	return is_ctrl_vq_idx(to_mvdev(vdev), idx) ? MLX5_VDPA_CVQ_GROUP :
+						      MLX5_VDPA_DATAVQ_GROUP;
+}
+
+/*
+ * Translate the device feature mask @dev_features (MLX5_VIRTIO_NET_F_* bit
+ * positions) into the corresponding VIRTIO_NET_F_* feature bits.
+ */
+static u64 mlx_to_vritio_features(u16 dev_features)
+{
+	static const struct {
+		unsigned int mlx_bit;
+		unsigned int virtio_bit;
+	} xlate[] = {
+		{ MLX5_VIRTIO_NET_F_MRG_RXBUF,	VIRTIO_NET_F_MRG_RXBUF },
+		{ MLX5_VIRTIO_NET_F_HOST_ECN,	VIRTIO_NET_F_HOST_ECN },
+		{ MLX5_VIRTIO_NET_F_GUEST_ECN,	VIRTIO_NET_F_GUEST_ECN },
+		{ MLX5_VIRTIO_NET_F_GUEST_TSO6,	VIRTIO_NET_F_GUEST_TSO6 },
+		{ MLX5_VIRTIO_NET_F_GUEST_TSO4,	VIRTIO_NET_F_GUEST_TSO4 },
+		{ MLX5_VIRTIO_NET_F_GUEST_CSUM,	VIRTIO_NET_F_GUEST_CSUM },
+		{ MLX5_VIRTIO_NET_F_CSUM,	VIRTIO_NET_F_CSUM },
+		{ MLX5_VIRTIO_NET_F_HOST_TSO6,	VIRTIO_NET_F_HOST_TSO6 },
+		{ MLX5_VIRTIO_NET_F_HOST_TSO4,	VIRTIO_NET_F_HOST_TSO4 },
+	};
+	u64 virtio_features = 0;
+	int n;
+
+	for (n = 0; n < ARRAY_SIZE(xlate); n++) {
+		if (dev_features & BIT_ULL(xlate[n].mlx_bit))
+			virtio_features |= BIT_ULL(xlate[n].virtio_bit);
+	}
+
+	return virtio_features;
+}
+
+/*
+ * Build the set of virtio features this driver can offer on @mdev: the
+ * translated device capability mask, VERSION_1 when the device reports
+ * virtio_version_1_0, plus a fixed set that is always added.
+ */
+static u64 get_supported_features(struct mlx5_core_dev *mdev)
+{
+	u16 hw_mask = MLX5_CAP_DEV_VDPA_EMULATION(mdev, device_features_bits_mask);
+	u64 features = mlx_to_vritio_features(hw_mask);
+
+	if (MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_version_1_0))
+		features |= BIT_ULL(VIRTIO_F_VERSION_1);
+
+	/* Features added regardless of the capability mask. */
+	features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |
+		    BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |
+		    BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) |
+		    BIT_ULL(VIRTIO_NET_F_MQ) |
+		    BIT_ULL(VIRTIO_NET_F_STATUS) |
+		    BIT_ULL(VIRTIO_NET_F_MTU) |
+		    BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |
+		    BIT_ULL(VIRTIO_NET_F_MAC);
+
+	return features;
+}
+
+/* vdpa op: log and return the feature set stored in mlx_features. */
+static u64 mlx5_vdpa_get_device_features(struct vdpa_device *vdev)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+
+	print_features(mvdev, mvdev->mlx_features, false);
+	return mvdev->mlx_features;
+}
+
+/*
+ * Validate the feature set sent down by the driver.
+ *
+ * Returns -EOPNOTSUPP when VIRTIO_F_ACCESS_PLATFORM is missing, -EINVAL when
+ * VIRTIO_NET_F_MQ is requested without VIRTIO_NET_F_CTRL_VQ, 0 otherwise.
+ */
+static int verify_driver_features(struct mlx5_vdpa_dev *mvdev, u64 features)
+{
+	const u64 mq_bit = BIT_ULL(VIRTIO_NET_F_MQ);
+	const u64 cvq_bit = BIT_ULL(VIRTIO_NET_F_CTRL_VQ);
+
+	/* Minimum features to expect */
+	if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
+		return -EOPNOTSUPP;
+
+	/* Double check the feature combination sent down by the driver and
+	 * fail it when a depended-upon feature is absent.
+	 *
+	 * Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit
+	 * requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ".
+	 * Rejecting invalid features from untrusted drivers keeps the
+	 * assumptions made by is_index_valid() and is_ctrl_vq_idx() intact.
+	 */
+	if ((features & (mq_bit | cvq_bit)) == mq_bit)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Set up every data virtqueue of the device. If one of them fails, the
+ * queues set up before it are torn down in reverse order and the error is
+ * returned.
+ */
+static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev)
+{
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	int rc = 0;
+	int n;
+
+	for (n = 0; n < mvdev->max_vqs; n++) {
+		rc = setup_vq(ndev, &ndev->vqs[n]);
+		if (rc)
+			break;
+	}
+
+	if (!rc)
+		return 0;
+
+	/* Unwind the queues that were set up before the failing one. */
+	while (--n >= 0)
+		teardown_vq(ndev, &ndev->vqs[n]);
+
+	return rc;
+}
+
+/* Tear down, from the highest index down, every virtqueue that is initialized. */
+static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
+{
+	int n;
+
+	for (n = ndev->mvdev.max_vqs - 1; n >= 0; n--) {
+		struct mlx5_vdpa_virtqueue *vq = &ndev->vqs[n];
+
+		if (vq->initialized)
+			teardown_vq(ndev, vq);
+	}
+}
+
+/*
+ * Recompute mvdev->max_idx, the highest valid virtqueue index, from the
+ * CTRL_VQ and MQ feature bits.
+ */
+static void update_cvq_info(struct mlx5_vdpa_dev *mvdev)
+{
+	if (!MLX5_FEATURE(mvdev, VIRTIO_NET_F_CTRL_VQ)) {
+		/* Two data virtqueues only: one for rx and one for tx */
+		mvdev->max_idx = 1;
+		return;
+	}
+
+	if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) {
+		/* MQ supported. CVQ index is right above the last data virtqueue's */
+		mvdev->max_idx = mvdev->max_vqs;
+		return;
+	}
+
+	/* Only CVQ supported. Data virtqueues occupy indices 0 and 1;
+	 * CVQ gets index 2.
+	 */
+	mvdev->max_idx = 2;
+}
+
+/*
+ * Issue QUERY_VPORT_STATE for @vport with op_mod @opmod and return the
+ * "state" field of the reply. A failed command is reported as 0.
+ */
+static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
+{
+	u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
+	u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
+
+	MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE);
+	MLX5_SET(query_vport_state_in, in, op_mod, opmod);
+	MLX5_SET(query_vport_state_in, in, vport_number, vport);
+	/* other_vport is only set for a non-zero vport number. */
+	if (vport)
+		MLX5_SET(query_vport_state_in, in, other_vport, 1);
+
+	if (mlx5_cmd_exec_inout(mdev, query_vport_state, in, out))
+		return 0;
+
+	return MLX5_GET(query_vport_state_out, out, state);
+}
+
+/* Return true when the VNIC vport state of vport 0 is VPORT_STATE_UP. */
+static bool get_link_state(struct mlx5_vdpa_dev *mvdev)
+{
+	u8 state = query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0);
+
+	return state == VPORT_STATE_UP;
+}
+
+/*
+ * Work handler: refresh VIRTIO_NET_S_LINK_UP in the config space from the
+ * current link state, invoke the config callback if one is set, and free
+ * the work entry.
+ */
+static void update_carrier(struct work_struct *work)
+{
+	struct mlx5_vdpa_wq_ent *ent = container_of(work, struct mlx5_vdpa_wq_ent, work);
+	struct mlx5_vdpa_dev *mvdev = ent->mvdev;
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	struct vdpa_callback *cb = &ndev->config_cb;
+
+	if (!get_link_state(mvdev))
+		ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
+	else
+		ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
+
+	if (cb->callback)
+		cb->callback(cb->private);
+
+	kfree(ent);
+}
+
+/*
+ * Allocate a work entry (GFP_ATOMIC) and queue update_carrier() on the
+ * device workqueue. The entry is freed by update_carrier().
+ *
+ * Returns 0 on success or -ENOMEM when the allocation fails.
+ */
+static int queue_link_work(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
+	struct mlx5_vdpa_wq_ent *ent;
+
+	ent = kzalloc(sizeof(*ent), GFP_ATOMIC);
+	if (!ent)
+		return -ENOMEM;
+
+	ent->mvdev = mvdev;
+	INIT_WORK(&ent->work, update_carrier);
+	queue_work(mvdev->wq, &ent->work);
+	return 0;
+}
+
+/*
+ * mlx5 notifier callback. Port-change events with subtype DOWN or ACTIVE
+ * schedule a link state refresh and return NOTIFY_OK; everything else,
+ * including a failure to queue the work, returns NOTIFY_DONE.
+ */
+static int event_handler(struct notifier_block *nb, unsigned long event, void *param)
+{
+	struct mlx5_vdpa_net *ndev = container_of(nb, struct mlx5_vdpa_net, nb);
+	struct mlx5_eqe *eqe = param;
+
+	if (event != MLX5_EVENT_TYPE_PORT_CHANGE)
+		return NOTIFY_DONE;
+
+	switch (eqe->sub_type) {
+	case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
+	case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
+		if (queue_link_work(ndev))
+			return NOTIFY_DONE;
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+/*
+ * Register the link event notifier and queue an initial link state refresh.
+ * Does nothing unless VIRTIO_NET_F_STATUS was negotiated.
+ */
+static void register_link_notifier(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
+
+	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_STATUS)))
+		return;
+
+	ndev->nb.notifier_call = event_handler;
+	mlx5_notifier_register(mvdev->mdev, &ndev->nb);
+	ndev->nb_registered = true;
+	/* The result of queueing the initial refresh is not checked. */
+	queue_link_work(ndev);
+}
+
+/*
+ * Undo register_link_notifier(): unregister the notifier and, when the
+ * workqueue still exists, flush it. No-op when no notifier is registered.
+ */
+static void unregister_link_notifier(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
+
+	if (!ndev->nb_registered)
+		return;
+
+	ndev->nb_registered = false;
+	mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
+	if (mvdev->wq)
+		flush_workqueue(mvdev->wq);
+}
+
+/*
+ * vdpa op: record the features negotiated by the driver.
+ *
+ * The accepted set is the intersection of @features with mlx_features.
+ * rqt_size follows the MQ bit, the current number of data virtqueues is
+ * reset to 2, and the control VQ index information is refreshed.
+ *
+ * Returns 0 on success or the error from verify_driver_features().
+ */
+static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	int rc;
+
+	print_features(mvdev, features, true);
+
+	rc = verify_driver_features(mvdev, features);
+	if (rc)
+		return rc;
+
+	mvdev->actual_features = features & mvdev->mlx_features;
+
+	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
+		ndev->rqt_size = 1;
+	else
+		ndev->rqt_size = mlx5vdpa16_to_cpu(mvdev, ndev->config.max_virtqueue_pairs);
+
+	/* Device must start with 1 queue pair, as per VIRTIO v1.2 spec, section
+	 * 5.1.6.5.5 "Device operation in multiqueue mode":
+	 *
+	 * Multiqueue is disabled by default.
+	 * The driver enables multiqueue by sending a command using class
+	 * VIRTIO_NET_CTRL_MQ. The command selects the mode of multiqueue
+	 * operation, as follows: ...
+	 */
+	ndev->cur_num_vqs = 2;
+
+	update_cvq_info(mvdev);
+	return 0;
+}
+
+/* vdpa op: store a copy of the config-change callback @cb in the net device. */
+static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb)
+{
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(to_mvdev(vdev));
+
+	ndev->config_cb = *cb;
+}
+
+/* Value reported by the get_vq_num_max op. */
+#define MLX5_VDPA_MAX_VQ_ENTRIES 256
+
+/* vdpa op: report MLX5_VDPA_MAX_VQ_ENTRIES. */
+static u16 mlx5_vdpa_get_vq_num_max(struct vdpa_device *vdev)
+{
+	return MLX5_VDPA_MAX_VQ_ENTRIES;
+}
+
+/* vdpa op: the device id reported is VIRTIO_ID_NET. */
+static u32 mlx5_vdpa_get_device_id(struct vdpa_device *vdev)
+{
+	return VIRTIO_ID_NET;
+}
+
+/* vdpa op: the vendor id reported is PCI_VENDOR_ID_MELLANOX. */
+static u32 mlx5_vdpa_get_vendor_id(struct vdpa_device *vdev)
+{
+	return PCI_VENDOR_ID_MELLANOX;
+}
+
+/* vdpa op: log and return the stored device status byte. */
+static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+
+	print_status(mvdev, mvdev->status, false);
+	return mvdev->status;
+}
+
+/*
+ * Snapshot the state of @mvq into its restore-info record.
+ *
+ * An initialized queue is queried for its hardware indices; otherwise the
+ * indices are recorded as zero. On a query failure the record is left
+ * untouched (restore stays as it was) and the error is returned.
+ */
+static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+	struct mlx5_virtq_attr hw_attr = {};
+	struct mlx5_vq_restore_info *saved = &mvq->ri;
+
+	if (mvq->initialized) {
+		int rc = query_virtqueue(ndev, mvq, &hw_attr);
+
+		if (rc)
+			return rc;
+	}
+
+	saved->avail_index = hw_attr.available_index;
+	saved->used_index = hw_attr.used_index;
+	saved->ready = mvq->ready;
+	saved->num_ent = mvq->num_ent;
+	saved->desc_addr = mvq->desc_addr;
+	saved->device_addr = mvq->device_addr;
+	saved->driver_addr = mvq->driver_addr;
+	saved->map = mvq->map;
+	saved->restore = true;
+	return 0;
+}
+
+/*
+ * Clear and then refill the restore-info record of every virtqueue.
+ *
+ * A failure to save an individual queue is not propagated: that queue is
+ * simply left with a cleared record. Always returns 0.
+ */
+static int save_channels_info(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_vdpa_virtqueue *vq;
+	int n;
+
+	for (n = 0; n < ndev->mvdev.max_vqs; n++) {
+		vq = &ndev->vqs[n];
+		memset(&vq->ri, 0, sizeof(vq->ri));
+		save_channel_info(ndev, vq);
+	}
+	return 0;
+}
+
+/*
+ * Zero every virtqueue structure up to, but not including, its restore-info
+ * member (ri), so saved state survives the wipe.
+ */
+static void mlx5_clear_vqs(struct mlx5_vdpa_net *ndev)
+{
+	const size_t live_len = offsetof(struct mlx5_vdpa_virtqueue, ri);
+	int n;
+
+	for (n = 0; n < ndev->mvdev.max_vqs; n++)
+		memset(&ndev->vqs[n], 0, live_len);
+}
+
+/*
+ * Reinitialize all virtqueue structures and reapply the state captured by
+ * save_channel_info() to every queue whose record is marked for restore.
+ */
+static void restore_channels_info(struct mlx5_vdpa_net *ndev)
+{
+	int n;
+
+	mlx5_clear_vqs(ndev);
+	init_mvqs(ndev);
+
+	for (n = 0; n < ndev->mvdev.max_vqs; n++) {
+		struct mlx5_vdpa_virtqueue *vq = &ndev->vqs[n];
+		const struct mlx5_vq_restore_info *saved = &vq->ri;
+
+		if (saved->restore) {
+			vq->avail_idx = saved->avail_index;
+			vq->used_idx = saved->used_index;
+			vq->ready = saved->ready;
+			vq->num_ent = saved->num_ent;
+			vq->desc_addr = saved->desc_addr;
+			vq->device_addr = saved->device_addr;
+			vq->driver_addr = saved->driver_addr;
+			vq->map = saved->map;
+		}
+	}
+}
+
+/*
+ * Replace the memory region of address space @asid with one built from
+ * @iotlb and, when the device is running, recreate the datapath on top of it.
+ *
+ * The virtqueues are suspended and their state saved before the driver
+ * resources are torn down, so that the state can be reapplied ahead of
+ * setup_driver(). teardown_driver() and setup_driver() both warn when
+ * reslock is not held, so the caller is expected to hold it.
+ *
+ * Returns 0 on success, and also when the resources are deliberately not
+ * recreated (DRIVER_OK not set, or the device is suspended). Otherwise
+ * returns the error from saving state, creating the MR or setup_driver().
+ */
+static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
+ struct vhost_iotlb *iotlb, unsigned int asid)
+{
+ struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+ int err;
+
+ suspend_vqs(ndev);
+ err = save_channels_info(ndev);
+ if (err)
+ goto err_mr;
+
+ teardown_driver(ndev);
+ mlx5_vdpa_destroy_mr_asid(mvdev, asid);
+ err = mlx5_vdpa_create_mr(mvdev, iotlb, asid);
+ if (err)
+ goto err_mr;
+
+ /* Not an error: err is 0 here, the label is only used as the exit path. */
+ if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) || mvdev->suspended)
+ goto err_mr;
+
+ restore_channels_info(ndev);
+ err = setup_driver(mvdev);
+ if (err)
+ goto err_setup;
+
+ return 0;
+
+err_setup:
+ /* The new MR is dropped again when the datapath cannot be rebuilt. */
+ mlx5_vdpa_destroy_mr_asid(mvdev, asid);
+err_mr:
+ return err;
+}
+
+/*
+ * Create the datapath resources of the device, in this order: debugfs
+ * entries, umem parameters, virtqueues, RQT, TIR and steering. On failure
+ * whatever was created so far is released in reverse order and the error is
+ * returned. Calling this on a device that is already set up only logs a
+ * warning and returns 0.
+ *
+ * reslock must be held for this function.
+ */
+static int setup_driver(struct mlx5_vdpa_dev *mvdev)
+{
+ struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+ int err;
+
+ WARN_ON(!rwsem_is_locked(&ndev->reslock));
+
+ if (ndev->setup) {
+ mlx5_vdpa_warn(mvdev, "setup driver called for already setup driver\n");
+ err = 0;
+ goto out;
+ }
+ mlx5_vdpa_add_debugfs(ndev);
+
+ err = read_umem_params(ndev);
+ if (err)
+ goto err_setup;
+
+ err = setup_virtqueues(mvdev);
+ if (err) {
+ mlx5_vdpa_warn(mvdev, "setup_virtqueues\n");
+ goto err_setup;
+ }
+
+ err = create_rqt(ndev);
+ if (err) {
+ mlx5_vdpa_warn(mvdev, "create_rqt\n");
+ goto err_rqt;
+ }
+
+ err = create_tir(ndev);
+ if (err) {
+ mlx5_vdpa_warn(mvdev, "create_tir\n");
+ goto err_tir;
+ }
+
+ err = setup_steering(ndev);
+ if (err) {
+ mlx5_vdpa_warn(mvdev, "setup_steering\n");
+ goto err_fwd;
+ }
+ /* Marks the resources as live; checked by teardown_driver(). */
+ ndev->setup = true;
+
+ return 0;
+
+ /* Error unwinding: each label releases what was created before the failing step. */
+err_fwd:
+ destroy_tir(ndev);
+err_tir:
+ destroy_rqt(ndev);
+err_rqt:
+ teardown_virtqueues(ndev);
+err_setup:
+ mlx5_vdpa_remove_debugfs(ndev);
+out:
+ return err;
+}
+
+/*
+ * Release the datapath resources created by setup_driver(): debugfs,
+ * steering, TIR, RQT and virtqueues. No-op when the driver is not set up.
+ *
+ * reslock must be held for this function.
+ */
+static void teardown_driver(struct mlx5_vdpa_net *ndev)
+{
+	WARN_ON(!rwsem_is_locked(&ndev->reslock));
+
+	if (ndev->setup) {
+		mlx5_vdpa_remove_debugfs(ndev);
+		teardown_steering(ndev);
+		destroy_tir(ndev);
+		destroy_rqt(ndev);
+		teardown_virtqueues(ndev);
+		ndev->setup = false;
+	}
+}
+
+/* Mark every data virtqueue and the control virtqueue as not ready. */
+static void clear_vqs_ready(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
+	int n;
+
+	for (n = 0; n < mvdev->max_vqs; n++)
+		ndev->vqs[n].ready = false;
+
+	mvdev->cvq.ready = false;
+}
+
+/*
+ * (Re)initialize the control VQ vringh from the stored ring addresses,
+ * carrying the previous last_avail_idx over into both the available and
+ * used indices. Does nothing when VIRTIO_NET_F_CTRL_VQ is not negotiated.
+ *
+ * Returns 0 on success or the error from vringh_init_iotlb().
+ */
+static int setup_cvq_vring(struct mlx5_vdpa_dev *mvdev)
+{
+	struct mlx5_control_vq *cvq = &mvdev->cvq;
+	u16 saved_idx;
+	int rc;
+
+	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
+		return 0;
+
+	/* Read before the init call so the value can be put back afterwards. */
+	saved_idx = cvq->vring.last_avail_idx;
+
+	rc = vringh_init_iotlb(&cvq->vring, mvdev->actual_features,
+			       MLX5_CVQ_MAX_ENT, false,
+			       (struct vring_desc *)(uintptr_t)cvq->desc_addr,
+			       (struct vring_avail *)(uintptr_t)cvq->driver_addr,
+			       (struct vring_used *)(uintptr_t)cvq->device_addr);
+	if (rc)
+		return rc;
+
+	cvq->vring.last_used_idx = saved_idx;
+	cvq->vring.last_avail_idx = saved_idx;
+	return 0;
+}
+
+/*
+ * vdpa op: update the virtio device status byte, under reslock (write).
+ *
+ * When DRIVER_OK goes from clear to set, the control VQ vring is
+ * initialized, the link notifier is registered and the datapath is created
+ * with setup_driver(). If one of those steps fails, the MR is destroyed and
+ * VIRTIO_CONFIG_S_FAILED is set in the stored status instead of @status.
+ * A request that clears DRIVER_OK is not expected here: it is logged and
+ * @status is not stored.
+ */
+static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
+{
+ struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+ struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+ int err;
+
+ print_status(mvdev, status, true);
+
+ down_write(&ndev->reslock);
+
+ /* Only act when the DRIVER_OK bit differs from the stored status. */
+ if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
+ if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+ err = setup_cvq_vring(mvdev);
+ if (err) {
+ mlx5_vdpa_warn(mvdev, "failed to setup control VQ vring\n");
+ goto err_setup;
+ }
+ register_link_notifier(ndev);
+ err = setup_driver(mvdev);
+ if (err) {
+ mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
+ goto err_driver;
+ }
+ } else {
+ mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n");
+ goto err_clear;
+ }
+ }
+
+ ndev->mvdev.status = status;
+ up_write(&ndev->reslock);
+ return;
+
+err_driver:
+ unregister_link_notifier(ndev);
+err_setup:
+ mlx5_vdpa_destroy_mr(&ndev->mvdev);
+ ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
+err_clear:
+ up_write(&ndev->reslock);
+}
+
+/* Reset the group-to-ASID table: every virtqueue group maps to ASID 0. */
+static void init_group_to_asid_map(struct mlx5_vdpa_dev *mvdev)
+{
+	int group;
+
+	for (group = 0; group < MLX5_VDPA_NUMVQ_GROUPS; group++)
+		mvdev->group2asid[group] = 0;
+}
+
+/*
+ * vdpa op: reset the device, under reslock (write).
+ *
+ * Unregisters the link notifier, tears down the datapath, clears the ready
+ * flags, destroys the MR and clears the negotiated state (status, suspended
+ * flag, queue count, control VQ counters, event callbacks, features and the
+ * group-to-ASID map). The config generation is incremented. When the
+ * umem_uid_0 capability is set, an MR is created again with a NULL iotlb; a
+ * failure there is only logged.
+ *
+ * Always returns 0.
+ */
+static int mlx5_vdpa_reset(struct vdpa_device *vdev)
+{
+ struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+ struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+
+ print_status(mvdev, 0, true);
+ mlx5_vdpa_info(mvdev, "performing device reset\n");
+
+ down_write(&ndev->reslock);
+ unregister_link_notifier(ndev);
+ teardown_driver(ndev);
+ clear_vqs_ready(ndev);
+ mlx5_vdpa_destroy_mr(&ndev->mvdev);
+ ndev->mvdev.status = 0;
+ ndev->mvdev.suspended = false;
+ ndev->cur_num_vqs = 0;
+ ndev->mvdev.cvq.received_desc = 0;
+ ndev->mvdev.cvq.completed_desc = 0;
+ /* event_cbs holds max_vqs + 1 entries (see the allocation in dev_add). */
+ memset(ndev->event_cbs, 0, sizeof(*ndev->event_cbs) * (mvdev->max_vqs + 1));
+ ndev->mvdev.actual_features = 0;
+ init_group_to_asid_map(mvdev);
+ ++mvdev->generation;
+
+ if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
+ if (mlx5_vdpa_create_mr(mvdev, NULL, 0))
+ mlx5_vdpa_warn(mvdev, "create MR failed\n");
+ }
+ up_write(&ndev->reslock);
+
+ return 0;
+}
+
+/* vdpa op: the config space is exactly one struct virtio_net_config. */
+static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)
+{
+	return sizeof(struct virtio_net_config);
+}
+
+/*
+ * vdpa op: copy @len bytes of the config space, starting at @offset, into
+ * @buf. Requests that do not fit entirely inside struct virtio_net_config
+ * are silently ignored and @buf is left untouched.
+ */
+static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf,
+				 unsigned int len)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	const unsigned int cfg_size = sizeof(struct virtio_net_config);
+
+	/* Compare without forming offset + len: that sum is computed in
+	 * unsigned int and could wrap around, letting an out-of-range
+	 * request pass the check.
+	 */
+	if (offset > cfg_size || len > cfg_size - offset)
+		return;
+
+	memcpy(buf, (u8 *)&ndev->config + offset, len);
+}
+
+/* vdpa op: writing the config space is not supported; the request is ignored. */
+static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf,
+				 unsigned int len)
+{
+}
+
+/* vdpa op: return the generation counter, which mlx5_vdpa_reset() increments. */
+static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
+{
+	return to_mvdev(vdev)->generation;
+}
+
+/*
+ * Apply a new mapping @iotlb to address space @asid. When
+ * mlx5_vdpa_handle_set_map() reports that the map changed, the MR and the
+ * datapath are rebuilt through mlx5_vdpa_change_map().
+ *
+ * Returns 0 on success or the error of the failing step.
+ */
+static int set_map_data(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb,
+			unsigned int asid)
+{
+	bool change_map;
+	int rc;
+
+	rc = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map, asid);
+	if (rc) {
+		mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", rc);
+		return rc;
+	}
+
+	if (!change_map)
+		return 0;
+
+	return mlx5_vdpa_change_map(mvdev, iotlb, asid);
+}
+
+/* vdpa op: install mapping @iotlb for address space @asid under reslock (write). */
+static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid,
+			     struct vhost_iotlb *iotlb)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	int rc;
+
+	down_write(&ndev->reslock);
+	rc = set_map_data(mvdev, iotlb, asid);
+	up_write(&ndev->reslock);
+
+	return rc;
+}
+
+/*
+ * vdpa op: return the device to use for DMA on virtqueue @idx. The control
+ * VQ uses the vdpa device itself; all other queues use vdev.dma_dev.
+ */
+static struct device *mlx5_get_vq_dma_dev(struct vdpa_device *vdev, u16 idx)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+
+	if (!is_ctrl_vq_idx(mvdev, idx))
+		return mvdev->vdev.dma_dev;
+
+	return &vdev->dev;
+}
+
+/*
+ * Release the MSI-X vectors held in the IRQ pool, from the last entry to the
+ * first, then free the pool array. No-op when MSI-X mode is not supported or
+ * the pool was never allocated.
+ */
+static void free_irqs(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_vdpa_irq_pool *pool = &ndev->irqp;
+	int n;
+
+	if (!msix_mode_supported(&ndev->mvdev) || !pool->entries)
+		return;
+
+	n = pool->num_ent;
+	while (n-- > 0) {
+		struct mlx5_vdpa_irq_pool_entry *ent = &pool->entries[n];
+
+		if (ent->map.virq)
+			pci_msix_free_irq(ndev->mvdev.mdev->pdev, ent->map);
+	}
+	kfree(pool->entries);
+}
+
+/*
+ * vdpa op: final release of the device. Frees the net resources and the MR,
+ * removes the MAC from the MPFS table when a non-zero MAC is configured,
+ * then frees the core resources, the IRQ pool and the per-queue arrays.
+ */
+static void mlx5_vdpa_free(struct vdpa_device *vdev)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+
+	free_resources(ndev);
+	mlx5_vdpa_destroy_mr(mvdev);
+
+	if (!is_zero_ether_addr(ndev->config.mac)) {
+		struct mlx5_core_dev *pfmdev;
+
+		pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
+		mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
+	}
+
+	mlx5_vdpa_free_resources(mvdev);
+	free_irqs(ndev);
+	kfree(ndev->event_cbs);
+	kfree(ndev->vqs);
+}
+
+/*
+ * vdpa op: describe the doorbell area of data virtqueue @idx.
+ *
+ * Returns a zeroed area for an invalid index, for the control VQ, or when
+ * the SF BAR is smaller than a page; otherwise one page at the physical
+ * kick address.
+ */
+static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct vdpa_notification_area area = {};
+
+	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
+		return area;
+
+	/* If SF BAR size is smaller than PAGE_SIZE, do not use direct
+	 * notification to avoid the risk of mapping pages that contain BAR of more
+	 * than one SF
+	 */
+	if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT)
+		return area;
+
+	area.addr = (phys_addr_t)mvdev->res.phys_kick_addr;
+	area.size = PAGE_SIZE;
+	return area;
+}
+
+/*
+ * vdpa op: return the Linux IRQ number mapped to data virtqueue @idx.
+ *
+ * Returns -EINVAL for an invalid index and -EOPNOTSUPP for the control VQ or
+ * a queue without a mapped vector.
+ */
+static int mlx5_get_vq_irq(struct vdpa_device *vdev, u16 idx)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_virtqueue *vq;
+
+	if (!is_index_valid(mvdev, idx))
+		return -EINVAL;
+
+	if (is_ctrl_vq_idx(mvdev, idx))
+		return -EOPNOTSUPP;
+
+	vq = &to_mlx5_vdpa_ndev(mvdev)->vqs[idx];
+	if (vq->map.virq)
+		return vq->map.virq;
+
+	return -EOPNOTSUPP;
+}
+
+/* vdpa op: return the feature set stored by mlx5_vdpa_set_driver_features(). */
+static u64 mlx5_vdpa_get_driver_features(struct vdpa_device *vdev)
+{
+	return to_mvdev(vdev)->actual_features;
+}
+
+/*
+ * Query the VIRTIO_Q_COUNTERS object attached to @mvq and report its
+ * received and completed descriptor counts.
+ *
+ * Returns -EOPNOTSUPP when counters are not supported, -EAGAIN when the
+ * queue's firmware object is not in the RDY state, the command error on
+ * failure, or 0 with both outputs filled in.
+ */
+static int counter_set_query(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
+			     u64 *received_desc, u64 *completed_desc)
+{
+	u32 out[MLX5_ST_SZ_DW(query_virtio_q_counters_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(query_virtio_q_counters_in)] = {};
+	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
+	void *counters;
+	void *hdr;
+	int rc;
+
+	if (!counters_supported(mvdev))
+		return -EOPNOTSUPP;
+
+	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
+		return -EAGAIN;
+
+	hdr = MLX5_ADDR_OF(query_virtio_q_counters_in, in, hdr);
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, uid, mvdev->res.uid);
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_id, mvq->counter_set_id);
+
+	rc = mlx5_cmd_exec(mvdev->mdev, in, sizeof(in), out, sizeof(out));
+	if (rc)
+		return rc;
+
+	counters = MLX5_ADDR_OF(query_virtio_q_counters_out, out, counters);
+	*received_desc = MLX5_GET64(virtio_q_counters, counters, received_desc);
+	*completed_desc = MLX5_GET64(virtio_q_counters, counters, completed_desc);
+	return 0;
+}
+
+/*
+ * vdpa op: emit the vendor statistics of virtqueue @idx into netlink
+ * message @msg as two name/value attribute pairs, "received_desc" and
+ * "completed_desc". Runs under reslock (read).
+ *
+ * The control VQ reports the counters kept in mvdev->cvq; data VQs are
+ * queried with counter_set_query().
+ *
+ * Returns 0 on success, -EINVAL for an invalid index, the error from
+ * counter_set_query(), or -EMSGSIZE when an attribute does not fit in @msg.
+ * @extack receives a message for the first two failure cases.
+ */
+static int mlx5_vdpa_get_vendor_vq_stats(struct vdpa_device *vdev, u16 idx,
+ struct sk_buff *msg,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+ struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+ struct mlx5_vdpa_virtqueue *mvq;
+ struct mlx5_control_vq *cvq;
+ u64 received_desc;
+ u64 completed_desc;
+ int err = 0;
+
+ down_read(&ndev->reslock);
+ if (!is_index_valid(mvdev, idx)) {
+ NL_SET_ERR_MSG_MOD(extack, "virtqueue index is not valid");
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ if (idx == ctrl_vq_idx(mvdev)) {
+ cvq = &mvdev->cvq;
+ received_desc = cvq->received_desc;
+ completed_desc = cvq->completed_desc;
+ goto out;
+ }
+
+ mvq = &ndev->vqs[idx];
+ err = counter_set_query(ndev, mvq, &received_desc, &completed_desc);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(extack, "failed to query hardware");
+ goto out_err;
+ }
+
+out:
+ /* Pre-set the error so each failing nla_put below can jump straight out. */
+ err = -EMSGSIZE;
+ if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, "received_desc"))
+ goto out_err;
+
+ if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, received_desc,
+ VDPA_ATTR_PAD))
+ goto out_err;
+
+ if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, "completed_desc"))
+ goto out_err;
+
+ if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, completed_desc,
+ VDPA_ATTR_PAD))
+ goto out_err;
+
+ err = 0;
+out_err:
+ up_read(&ndev->reslock);
+ return err;
+}
+
+/* Mark the control VQ as not ready, if VIRTIO_NET_F_CTRL_VQ was negotiated. */
+static void mlx5_vdpa_cvq_suspend(struct mlx5_vdpa_dev *mvdev)
+{
+	if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ))
+		mvdev->cvq.ready = false;
+}
+
+/*
+ * vdpa op: suspend the device under reslock (write). The link notifier is
+ * unregistered, the first cur_num_vqs data virtqueues and the control VQ are
+ * suspended, and the device is flagged as suspended. Always returns 0.
+ */
+static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	int n;
+
+	mlx5_vdpa_info(mvdev, "suspending device\n");
+
+	down_write(&ndev->reslock);
+	unregister_link_notifier(ndev);
+	for (n = 0; n < ndev->cur_num_vqs; n++)
+		suspend_vq(ndev, &ndev->vqs[n]);
+
+	mlx5_vdpa_cvq_suspend(mvdev);
+	mvdev->suspended = true;
+	up_write(&ndev->reslock);
+
+	return 0;
+}
+
+/*
+ * vdpa op: bind virtqueue group @group to address space @asid.
+ * Returns -EINVAL when @group is out of range, 0 otherwise.
+ */
+static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group,
+			       unsigned int asid)
+{
+	if (group >= MLX5_VDPA_NUMVQ_GROUPS)
+		return -EINVAL;
+
+	to_mvdev(vdev)->group2asid[group] = asid;
+	return 0;
+}
+
+/*
+ * vdpa config operations implemented by this driver. The table is passed to
+ * vdpa_alloc_device() in mlx5_vdpa_dev_add().
+ */
+static const struct vdpa_config_ops mlx5_vdpa_ops = {
+ .set_vq_address = mlx5_vdpa_set_vq_address,
+ .set_vq_num = mlx5_vdpa_set_vq_num,
+ .kick_vq = mlx5_vdpa_kick_vq,
+ .set_vq_cb = mlx5_vdpa_set_vq_cb,
+ .set_vq_ready = mlx5_vdpa_set_vq_ready,
+ .get_vq_ready = mlx5_vdpa_get_vq_ready,
+ .set_vq_state = mlx5_vdpa_set_vq_state,
+ .get_vq_state = mlx5_vdpa_get_vq_state,
+ .get_vendor_vq_stats = mlx5_vdpa_get_vendor_vq_stats,
+ .get_vq_notification = mlx5_get_vq_notification,
+ .get_vq_irq = mlx5_get_vq_irq,
+ .get_vq_align = mlx5_vdpa_get_vq_align,
+ .get_vq_group = mlx5_vdpa_get_vq_group,
+ .get_device_features = mlx5_vdpa_get_device_features,
+ .set_driver_features = mlx5_vdpa_set_driver_features,
+ .get_driver_features = mlx5_vdpa_get_driver_features,
+ .set_config_cb = mlx5_vdpa_set_config_cb,
+ .get_vq_num_max = mlx5_vdpa_get_vq_num_max,
+ .get_device_id = mlx5_vdpa_get_device_id,
+ .get_vendor_id = mlx5_vdpa_get_vendor_id,
+ .get_status = mlx5_vdpa_get_status,
+ .set_status = mlx5_vdpa_set_status,
+ .reset = mlx5_vdpa_reset,
+ .get_config_size = mlx5_vdpa_get_config_size,
+ .get_config = mlx5_vdpa_get_config,
+ .set_config = mlx5_vdpa_set_config,
+ .get_generation = mlx5_vdpa_get_generation,
+ .set_map = mlx5_vdpa_set_map,
+ .set_group_asid = mlx5_set_group_asid,
+ .get_vq_dma_dev = mlx5_get_vq_dma_dev,
+ .free = mlx5_vdpa_free,
+ .suspend = mlx5_vdpa_suspend,
+};
+
+/*
+ * Read the NIC vport MTU and store it in @mtu with MLX5V_ETH_HARD_MTU
+ * subtracted. Returns 0 on success or the query error; @mtu is only written
+ * on success.
+ */
+static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
+{
+	u16 vport_mtu;
+	int rc = mlx5_query_nic_vport_mtu(mdev, &vport_mtu);
+
+	if (rc)
+		return rc;
+
+	*mtu = vport_mtu - MLX5V_ETH_HARD_MTU;
+	return 0;
+}
+
+/*
+ * Allocate the net-level resources: a transport domain and a TIS. The
+ * transport domain is released again if the TIS cannot be created.
+ *
+ * Returns 0 on success, -EEXIST when the resources are already allocated,
+ * or the error of the failing step.
+ */
+static int alloc_resources(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_vdpa_net_resources *res = &ndev->res;
+	int rc;
+
+	if (res->valid) {
+		mlx5_vdpa_warn(&ndev->mvdev, "resources already allocated\n");
+		return -EEXIST;
+	}
+
+	rc = mlx5_vdpa_alloc_transport_domain(&ndev->mvdev, &res->tdn);
+	if (rc)
+		return rc;
+
+	rc = create_tis(ndev);
+	if (rc) {
+		mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
+		return rc;
+	}
+
+	res->valid = true;
+	return 0;
+}
+
+/* Release what alloc_resources() created; no-op when nothing is allocated. */
+static void free_resources(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_vdpa_net_resources *res = &ndev->res;
+
+	if (res->valid) {
+		destroy_tis(ndev);
+		mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
+		res->valid = false;
+	}
+}
+
+/*
+ * Put every virtqueue structure into its initial state: all members in
+ * front of the restore-info record (ri) are zeroed, then the index, the
+ * back pointer to @ndev, the firmware QP flag and the "no firmware object"
+ * state are set. The restore-info record itself is preserved.
+ */
+static void init_mvqs(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_vdpa_virtqueue *mvq;
+	int i;
+
+	/* A second loop used to follow this one, but it started at
+	 * i == max_vqs and therefore never executed; it has been dropped.
+	 */
+	for (i = 0; i < ndev->mvdev.max_vqs; ++i) {
+		mvq = &ndev->vqs[i];
+		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
+		mvq->index = i;
+		mvq->ndev = ndev;
+		mvq->fwqp.fw = true;
+		mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
+	}
+}
+
+/* Per auxiliary device management state, allocated in mlx5v_probe(). */
+struct mlx5_vdpa_mgmtdev {
+ /* Management device registered with the vdpa core. */
+ struct vdpa_mgmt_dev mgtdev;
+ /* Auxiliary device this management device was probed from. */
+ struct mlx5_adev *madev;
+ /* The vdpa net device created by dev_add, or NULL; dev_add refuses a second one. */
+ struct mlx5_vdpa_net *ndev;
+};
+
+/*
+ * Program the NIC vport MTU to @mtu plus MLX5V_ETH_HARD_MTU through
+ * MODIFY_NIC_VPORT_CONTEXT.
+ *
+ * Returns 0 on success, -ENOMEM when the command buffer cannot be allocated,
+ * or the command error.
+ */
+static int config_func_mtu(struct mlx5_core_dev *mdev, u16 mtu)
+{
+	void *in = kvzalloc(MLX5_ST_SZ_BYTES(modify_nic_vport_context_in), GFP_KERNEL);
+	int rc;
+
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(modify_nic_vport_context_in, in, opcode,
+		 MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT);
+	MLX5_SET(modify_nic_vport_context_in, in, field_select.mtu, 1);
+	MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.mtu,
+		 mtu + MLX5V_ETH_HARD_MTU);
+
+	rc = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in);
+	kvfree(in);
+
+	return rc;
+}
+
+/*
+ * Best-effort allocation of one MSI-X vector per virtqueue into the IRQ
+ * pool. Allocation stops at the first vector that cannot be obtained;
+ * num_ent counts the vectors that were obtained. Nothing is done when MSI-X
+ * mode is not supported, there is no PCI device, or the pool array cannot
+ * be allocated.
+ */
+static void allocate_irqs(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_vdpa_irq_pool *pool = &ndev->irqp;
+	struct mlx5_vdpa_irq_pool_entry *ent;
+	struct pci_dev *pdev;
+	int n;
+
+	if (!msix_mode_supported(&ndev->mvdev))
+		return;
+
+	pdev = ndev->mvdev.mdev->pdev;
+	if (!pdev)
+		return;
+
+	pool->entries = kcalloc(ndev->mvdev.max_vqs, sizeof(*pool->entries), GFP_KERNEL);
+	if (!pool->entries)
+		return;
+
+	for (n = 0; n < ndev->mvdev.max_vqs; n++) {
+		ent = &pool->entries[n];
+		snprintf(ent->name, MLX5_VDPA_IRQ_NAME_LEN, "%s-vq-%d",
+			 dev_name(&ndev->mvdev.vdev.dev), n);
+		ent->map = pci_msix_alloc_irq_at(pdev, MSI_ANY_INDEX, NULL);
+		if (!ent->map.virq)
+			return;
+
+		pool->num_ent++;
+	}
+}
+
+/*
+ * Management op: create and register the vdpa net device @name on top of
+ * management device @v_mdev, provisioned according to @add_config.
+ *
+ * Only one vdpa device per management device is allowed (-ENOSPC
+ * otherwise). The provisioned features must be a subset of the supported
+ * ones and must include VERSION_1 and ACCESS_PLATFORM; the device must
+ * support split virtqueues and at least two virtqueues.
+ *
+ * Returns 0 on success or a negative errno. On failure everything acquired
+ * so far is released; the device itself is dropped with put_device().
+ */
+static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
+			     const struct vdpa_dev_set_config *add_config)
+{
+	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
+	struct virtio_net_config *config;
+	struct mlx5_core_dev *pfmdev;
+	struct mlx5_vdpa_dev *mvdev;
+	struct mlx5_vdpa_net *ndev;
+	struct mlx5_core_dev *mdev;
+	u64 device_features;
+	u32 max_vqs;
+	u16 mtu;
+	int err;
+
+	if (mgtdev->ndev)
+		return -ENOSPC;
+
+	mdev = mgtdev->madev->mdev;
+	device_features = mgtdev->mgtdev.supported_features;
+	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
+		if (add_config->device_features & ~device_features) {
+			dev_warn(mdev->device,
+				 "The provisioned features 0x%llx are not supported by this device with features 0x%llx\n",
+				 add_config->device_features, device_features);
+			return -EINVAL;
+		}
+		device_features &= add_config->device_features;
+	} else {
+		/* Without explicit provisioning, MRG_RXBUF is not offered. */
+		device_features &= ~BIT_ULL(VIRTIO_NET_F_MRG_RXBUF);
+	}
+	if (!(device_features & BIT_ULL(VIRTIO_F_VERSION_1) &&
+	      device_features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM))) {
+		dev_warn(mdev->device,
+			 "Must provision minimum features 0x%llx for this device\n",
+			 BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM));
+		return -EOPNOTSUPP;
+	}
+
+	if (!(MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_queue_type) &
+	      MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) {
+		dev_warn(mdev->device, "missing support for split virtqueues\n");
+		return -EOPNOTSUPP;
+	}
+
+	max_vqs = min_t(int, MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues),
+			1 << MLX5_CAP_GEN(mdev, log_max_rqt_size));
+	if (max_vqs < 2) {
+		dev_warn(mdev->device,
+			 "%u virtqueues are supported. At least 2 are required\n",
+			 max_vqs);
+		return -EAGAIN;
+	}
+
+	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
+		if (add_config->net.max_vq_pairs > max_vqs / 2)
+			return -EINVAL;
+		max_vqs = min_t(u32, max_vqs, 2 * add_config->net.max_vq_pairs);
+	} else {
+		/* Default: a single queue pair. */
+		max_vqs = 2;
+	}
+
+	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
+				 MLX5_VDPA_NUMVQ_GROUPS, MLX5_VDPA_NUM_AS, name, false);
+	if (IS_ERR(ndev))
+		return PTR_ERR(ndev);
+
+	ndev->mvdev.max_vqs = max_vqs;
+	mvdev = &ndev->mvdev;
+	mvdev->mdev = mdev;
+
+	/* event_cbs has one extra slot compared to vqs. */
+	ndev->vqs = kcalloc(max_vqs, sizeof(*ndev->vqs), GFP_KERNEL);
+	ndev->event_cbs = kcalloc(max_vqs + 1, sizeof(*ndev->event_cbs), GFP_KERNEL);
+	if (!ndev->vqs || !ndev->event_cbs) {
+		err = -ENOMEM;
+		goto err_alloc;
+	}
+
+	init_mvqs(ndev);
+	allocate_irqs(ndev);
+	init_rwsem(&ndev->reslock);
+	config = &ndev->config;
+
+	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) {
+		err = config_func_mtu(mdev, add_config->net.mtu);
+		if (err)
+			goto err_alloc;
+	}
+
+	if (device_features & BIT_ULL(VIRTIO_NET_F_MTU)) {
+		err = query_mtu(mdev, &mtu);
+		if (err)
+			goto err_alloc;
+
+		ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, mtu);
+	}
+
+	if (device_features & BIT_ULL(VIRTIO_NET_F_STATUS)) {
+		if (get_link_state(mvdev))
+			ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
+		else
+			ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
+	}
+
+	/* The mask is 64 bits wide: test it with BIT_ULL like every other
+	 * attribute check in this function.
+	 */
+	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
+		memcpy(ndev->config.mac, add_config->net.mac, ETH_ALEN);
+	/* No bother setting mac address in config if not going to provision _F_MAC */
+	} else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0 ||
+		   device_features & BIT_ULL(VIRTIO_NET_F_MAC)) {
+		err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
+		if (err)
+			goto err_alloc;
+	}
+
+	if (!is_zero_ether_addr(config->mac)) {
+		pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
+		err = mlx5_mpfs_add_mac(pfmdev, config->mac);
+		if (err)
+			goto err_alloc;
+	} else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0) {
+		/*
+		 * We used to clear _F_MAC feature bit if seeing
+		 * zero mac address when device features are not
+		 * specifically provisioned. Keep the behaviour
+		 * so old scripts do not break.
+		 */
+		device_features &= ~BIT_ULL(VIRTIO_NET_F_MAC);
+	} else if (device_features & BIT_ULL(VIRTIO_NET_F_MAC)) {
+		/* Don't provision zero mac address for _F_MAC */
+		mlx5_vdpa_warn(&ndev->mvdev,
+			       "No mac address provisioned?\n");
+		err = -EINVAL;
+		goto err_alloc;
+	}
+
+	if (device_features & BIT_ULL(VIRTIO_NET_F_MQ))
+		config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, max_vqs / 2);
+
+	ndev->mvdev.mlx_features = device_features;
+	mvdev->vdev.dma_dev = &mdev->pdev->dev;
+	err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
+	if (err)
+		goto err_mpfs;
+
+	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
+		err = mlx5_vdpa_create_mr(mvdev, NULL, 0);
+		if (err)
+			goto err_res;
+	}
+
+	err = alloc_resources(ndev);
+	if (err)
+		goto err_mr;
+
+	ndev->cvq_ent.mvdev = mvdev;
+	INIT_WORK(&ndev->cvq_ent.work, mlx5_cvq_kick_handler);
+	mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_wq");
+	if (!mvdev->wq) {
+		err = -ENOMEM;
+		goto err_res2;
+	}
+
+	mvdev->vdev.mdev = &mgtdev->mgtdev;
+	/* The extra virtqueue accounts for the control VQ. */
+	err = _vdpa_register_device(&mvdev->vdev, max_vqs + 1);
+	if (err)
+		goto err_reg;
+
+	mgtdev->ndev = ndev;
+	return 0;
+
+err_reg:
+	destroy_workqueue(mvdev->wq);
+err_res2:
+	free_resources(ndev);
+err_mr:
+	mlx5_vdpa_destroy_mr(mvdev);
+err_res:
+	mlx5_vdpa_free_resources(&ndev->mvdev);
+err_mpfs:
+	/* pfmdev was assigned whenever the configured MAC is non-zero. */
+	if (!is_zero_ether_addr(config->mac))
+		mlx5_mpfs_del_mac(pfmdev, config->mac);
+err_alloc:
+	put_device(&mvdev->vdev.dev);
+	return err;
+}
+
+/*
+ * Management op: remove vdpa device @dev. The link notifier is
+ * unregistered, the device is unregistered from the vdpa core, and the
+ * workqueue is destroyed after its pointer has been cleared from the device.
+ */
+static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev)
+{
+	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(dev);
+	struct workqueue_struct *old_wq;
+
+	unregister_link_notifier(to_mlx5_vdpa_ndev(mvdev));
+	_vdpa_unregister_device(dev);
+
+	/* Clear the pointer first so nothing picks up the dying workqueue. */
+	old_wq = mvdev->wq;
+	mvdev->wq = NULL;
+	destroy_workqueue(old_wq);
+
+	mgtdev->ndev = NULL;
+}
+
+/* Management device operations; installed on the mgmt device in mlx5v_probe(). */
+static const struct vdpa_mgmtdev_ops mdev_ops = {
+ .dev_add = mlx5_vdpa_dev_add,
+ .dev_del = mlx5_vdpa_dev_del,
+};
+
+/* Virtio device ids handled by the management device: network devices only. */
+static struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+
+/*
+ * Auxiliary driver probe: allocate the management device state, describe
+ * what can be provisioned on it, and register it with the vdpa core.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the registration
+ * error (the state is freed in that case).
+ */
+static int mlx5v_probe(struct auxiliary_device *adev,
+		       const struct auxiliary_device_id *id)
+{
+	struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev);
+	struct mlx5_core_dev *mdev = madev->mdev;
+	struct mlx5_vdpa_mgmtdev *mgtdev;
+	struct vdpa_mgmt_dev *vmgmt;
+	int rc;
+
+	mgtdev = kzalloc(sizeof(*mgtdev), GFP_KERNEL);
+	if (!mgtdev)
+		return -ENOMEM;
+
+	vmgmt = &mgtdev->mgtdev;
+	vmgmt->ops = &mdev_ops;
+	vmgmt->device = mdev->device;
+	vmgmt->id_table = id_table;
+	vmgmt->config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) |
+				  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP) |
+				  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU) |
+				  BIT_ULL(VDPA_ATTR_DEV_FEATURES);
+	/* One more than the device's virtqueue capability. */
+	vmgmt->max_supported_vqs =
+		MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues) + 1;
+	vmgmt->supported_features = get_supported_features(mdev);
+	mgtdev->madev = madev;
+
+	rc = vdpa_mgmtdev_register(vmgmt);
+	if (rc) {
+		kfree(mgtdev);
+		return rc;
+	}
+
+	auxiliary_set_drvdata(adev, mgtdev);
+	return 0;
+}
+
+/* Auxiliary driver remove: unregister the management device and free its state. */
+static void mlx5v_remove(struct auxiliary_device *adev)
+{
+	struct mlx5_vdpa_mgmtdev *mgtdev = auxiliary_get_drvdata(adev);
+
+	vdpa_mgmtdev_unregister(&mgtdev->mgtdev);
+	kfree(mgtdev);
+}
+
+/* Auxiliary devices this driver binds to: the ".vnet" device of MLX5_ADEV_NAME. */
+static const struct auxiliary_device_id mlx5v_id_table[] = {
+ { .name = MLX5_ADEV_NAME ".vnet", },
+ {},
+};
+
+MODULE_DEVICE_TABLE(auxiliary, mlx5v_id_table);
+
+/* Auxiliary driver definition, tying probe/remove to the id table above it. */
+static struct auxiliary_driver mlx5v_driver = {
+ .name = "vnet",
+ .probe = mlx5v_probe,
+ .remove = mlx5v_remove,
+ .id_table = mlx5v_id_table,
+};
+
+module_auxiliary_driver(mlx5v_driver);
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.h b/drivers/vdpa/mlx5/net/mlx5_vnet.h
new file mode 100644
index 0000000000..90b556a579
--- /dev/null
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.h
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+
+#ifndef __MLX5_VNET_H__
+#define __MLX5_VNET_H__
+
+#include "mlx5_vdpa.h"
+
+#define to_mlx5_vdpa_ndev(__mvdev) \
+ container_of(__mvdev, struct mlx5_vdpa_net, mvdev) /* pointer to the 'mvdev' member -> enclosing struct mlx5_vdpa_net */
+#define to_mvdev(__vdev) container_of((__vdev), struct mlx5_vdpa_dev, vdev) /* pointer to the 'vdev' member -> enclosing struct mlx5_vdpa_dev */
+
+struct mlx5_vdpa_net_resources { /* embedded in struct mlx5_vdpa_net as 'res' */
+ u32 tisn;
+ u32 tdn;
+ u32 tirn;
+ u32 rqtn;
+ bool valid;
+ struct dentry *tirn_dent; /* NOTE(review): presumably the debugfs entry handled by mlx5_vdpa_add_tirn()/mlx5_vdpa_remove_tirn() - confirm */
+};
+
+#define MLX5V_MACVLAN_SIZE 256
+
+/* Return the 12-bit value stored in bits 48..59 of @key. */
+static inline u16 key2vid(u64 key)
+{
+	u16 upper = (u16)(key >> 48);
+
+	return upper & 0xfff;
+}
+
+#define MLX5_VDPA_IRQ_NAME_LEN 32
+
+struct mlx5_vdpa_irq_pool_entry { /* element type of mlx5_vdpa_irq_pool.entries */
+ struct msi_map map;
+ bool used;
+ char name[MLX5_VDPA_IRQ_NAME_LEN]; /* MLX5_VDPA_IRQ_NAME_LEN is defined as 32 above */
+ void *dev_id;
+};
+
+struct mlx5_vdpa_irq_pool { /* embedded in struct mlx5_vdpa_net as 'irqp' */
+ int num_ent; /* NOTE(review): presumably the number of elements in 'entries' - confirm */
+ struct mlx5_vdpa_irq_pool_entry *entries;
+};
+
+struct mlx5_vdpa_net {
+ struct mlx5_vdpa_dev mvdev; /* to_mlx5_vdpa_ndev() maps a pointer to this member back to the enclosing struct */
+ struct mlx5_vdpa_net_resources res;
+ struct virtio_net_config config;
+ struct mlx5_vdpa_virtqueue *vqs;
+ struct vdpa_callback *event_cbs;
+
+ /* Serialize vq resource creation and destruction. This is required
+ * since the memory map might change and we need to destroy and create
+ * resources while the driver is operational.
+ */
+ struct rw_semaphore reslock;
+ struct mlx5_flow_table *rxft;
+ struct dentry *rx_dent;
+ struct dentry *rx_table_dent;
+ bool setup;
+ u32 cur_num_vqs;
+ u32 rqt_size;
+ bool nb_registered;
+ struct notifier_block nb;
+ struct vdpa_callback config_cb;
+ struct mlx5_vdpa_wq_ent cvq_ent;
+ struct hlist_head macvlan_hash[MLX5V_MACVLAN_SIZE]; /* MLX5V_MACVLAN_SIZE (256) list heads */
+ struct mlx5_vdpa_irq_pool irqp;
+ struct dentry *debugfs;
+
+ u32 umem_1_buffer_param_a;
+ u32 umem_1_buffer_param_b;
+
+ u32 umem_2_buffer_param_a;
+ u32 umem_2_buffer_param_b;
+
+ u32 umem_3_buffer_param_a;
+ u32 umem_3_buffer_param_b;
+};
+
+struct mlx5_vdpa_counter { /* type of macvlan_node.ucast_counter / mcast_counter */
+ struct mlx5_fc *counter;
+ struct dentry *dent;
+ struct mlx5_core_dev *mdev;
+};
+
+struct macvlan_node {
+ struct hlist_node hlist; /* NOTE(review): presumably links the node into mlx5_vdpa_net.macvlan_hash - confirm */
+ struct mlx5_flow_handle *ucast_rule;
+ struct mlx5_flow_handle *mcast_rule;
+ u64 macvlan;
+ struct mlx5_vdpa_net *ndev;
+ bool tagged;
+#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG) /* the members below exist only in steering-debug builds */
+ struct dentry *dent;
+ struct mlx5_vdpa_counter ucast_counter;
+ struct mlx5_vdpa_counter mcast_counter;
+#endif
+};
+
+void mlx5_vdpa_add_debugfs(struct mlx5_vdpa_net *ndev);
+void mlx5_vdpa_remove_debugfs(struct mlx5_vdpa_net *ndev);
+void mlx5_vdpa_add_rx_flow_table(struct mlx5_vdpa_net *ndev);
+void mlx5_vdpa_remove_rx_flow_table(struct mlx5_vdpa_net *ndev);
+void mlx5_vdpa_add_tirn(struct mlx5_vdpa_net *ndev);
+void mlx5_vdpa_remove_tirn(struct mlx5_vdpa_net *ndev);
+#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
+void mlx5_vdpa_add_rx_counters(struct mlx5_vdpa_net *ndev,
+ struct macvlan_node *node);
+void mlx5_vdpa_remove_rx_counters(struct mlx5_vdpa_net *ndev,
+ struct macvlan_node *node);
+#else /* !CONFIG_MLX5_VDPA_STEERING_DEBUG: both helpers become empty inline stubs */
+static inline void mlx5_vdpa_add_rx_counters(struct mlx5_vdpa_net *ndev,
+ struct macvlan_node *node) {}
+static inline void mlx5_vdpa_remove_rx_counters(struct mlx5_vdpa_net *ndev,
+ struct macvlan_node *node) {}
+#endif /* CONFIG_MLX5_VDPA_STEERING_DEBUG */
+
+
+#endif /* __MLX5_VNET_H__ */
diff --git a/drivers/vdpa/pds/Makefile b/drivers/vdpa/pds/Makefile
new file mode 100644
index 0000000000..c2d314d461
--- /dev/null
+++ b/drivers/vdpa/pds/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Copyright(c) 2023 Advanced Micro Devices, Inc
+
+obj-$(CONFIG_PDS_VDPA) := pds_vdpa.o
+
+pds_vdpa-y := aux_drv.o \
+ cmds.o \
+ debugfs.o \
+ vdpa_dev.o
diff --git a/drivers/vdpa/pds/aux_drv.c b/drivers/vdpa/pds/aux_drv.c
new file mode 100644
index 0000000000..186e9ee22e
--- /dev/null
+++ b/drivers/vdpa/pds/aux_drv.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#include <linux/auxiliary_bus.h>
+#include <linux/pci.h>
+#include <linux/vdpa.h>
+#include <linux/virtio_pci_modern.h>
+
+#include <linux/pds/pds_common.h>
+#include <linux/pds/pds_core_if.h>
+#include <linux/pds/pds_adminq.h>
+#include <linux/pds/pds_auxbus.h>
+
+#include "aux_drv.h"
+#include "debugfs.h"
+#include "vdpa_dev.h"
+
+static const struct auxiliary_device_id pds_vdpa_id_table[] = { /* referenced by pds_vdpa_driver.id_table */
+ { .name = PDS_VDPA_DEV_NAME, },
+ {}, /* empty last entry */
+};
+
+/*
+ * Device-id check installed in vd_mdev.device_id_check by pds_vdpa_probe().
+ *
+ * Returns PCI_DEVICE_ID_PENSANDO_VDPA_VF when @pdev carries the Pensando
+ * vendor id and the vDPA VF device id, -ENODEV otherwise.
+ */
+static int pds_vdpa_device_id_check(struct pci_dev *pdev)
+{
+	if (pdev->vendor == PCI_VENDOR_ID_PENSANDO &&
+	    pdev->device == PCI_DEVICE_ID_PENSANDO_VDPA_VF)
+		return PCI_DEVICE_ID_PENSANDO_VDPA_VF;
+
+	return -ENODEV;
+}
+
+/*
+ * Auxiliary-bus probe for a pds vDPA device.
+ *
+ * Allocates the per-device struct pds_vdpa_aux, stores it as driver data,
+ * fetches the management info, probes the VF's virtio configuration with
+ * vp_modern_probe(), registers the vdpa management device and finally
+ * creates the debugfs entries.
+ *
+ * Returns 0 on success or a negative errno; on failure everything set up
+ * so far is unwound and the driver data is cleared.
+ */
+static int pds_vdpa_probe(struct auxiliary_device *aux_dev,
+			  const struct auxiliary_device_id *id)
+{
+	struct device *dev = &aux_dev->dev;
+	struct pds_auxiliary_dev *padev;
+	struct pds_vdpa_aux *vdpa_aux;
+	int err;
+
+	padev = container_of(aux_dev, struct pds_auxiliary_dev, aux_dev);
+
+	vdpa_aux = kzalloc(sizeof(*vdpa_aux), GFP_KERNEL);
+	if (!vdpa_aux)
+		return -ENOMEM;
+
+	vdpa_aux->vf_id = pci_iov_vf_id(padev->vf_pdev);
+	vdpa_aux->padev = padev;
+	auxiliary_set_drvdata(aux_dev, vdpa_aux);
+
+	/* Get device ident info and set up the vdpa_mgmt_dev */
+	err = pds_vdpa_get_mgmt_info(vdpa_aux);
+	if (err)
+		goto unwind_alloc;
+
+	/* Find the virtio configuration */
+	vdpa_aux->vd_mdev.dma_mask = DMA_BIT_MASK(PDS_CORE_ADDR_LEN);
+	vdpa_aux->vd_mdev.device_id_check = pds_vdpa_device_id_check;
+	vdpa_aux->vd_mdev.pci_dev = padev->vf_pdev;
+	err = vp_modern_probe(&vdpa_aux->vd_mdev);
+	if (err) {
+		dev_err(dev, "Unable to probe for virtio configuration: %pe\n",
+			ERR_PTR(err));
+		goto unwind_mgmt_info;
+	}
+
+	/* Let vdpa know that we can provide devices */
+	err = vdpa_mgmtdev_register(&vdpa_aux->vdpa_mdev);
+	if (err) {
+		dev_err(dev, "%s: Failed to initialize vdpa_mgmt interface: %pe\n",
+			__func__, ERR_PTR(err));
+		goto unwind_virtio;
+	}
+
+	pds_vdpa_debugfs_add_pcidev(vdpa_aux);
+	pds_vdpa_debugfs_add_ident(vdpa_aux);
+
+	return 0;
+
+unwind_virtio:
+	vp_modern_remove(&vdpa_aux->vd_mdev);
+unwind_mgmt_info:
+	pci_free_irq_vectors(padev->vf_pdev);
+unwind_alloc:
+	auxiliary_set_drvdata(aux_dev, NULL);
+	kfree(vdpa_aux);
+
+	return err;
+}
+
+/*
+ * Auxiliary-bus remove: undo pds_vdpa_probe().  Unregisters the management
+ * device, releases the virtio mapping and the VF's irq vectors, removes the
+ * debugfs directory, then frees the per-device state and clears the driver
+ * data.
+ */
+static void pds_vdpa_remove(struct auxiliary_device *aux_dev)
+{
+	struct pds_vdpa_aux *vdpa_aux;
+
+	vdpa_aux = auxiliary_get_drvdata(aux_dev);
+
+	vdpa_mgmtdev_unregister(&vdpa_aux->vdpa_mdev);
+	vp_modern_remove(&vdpa_aux->vd_mdev);
+	pci_free_irq_vectors(vdpa_aux->padev->vf_pdev);
+
+	pds_vdpa_debugfs_del_vdpadev(vdpa_aux);
+	auxiliary_set_drvdata(aux_dev, NULL);
+	kfree(vdpa_aux);
+
+	dev_info(&aux_dev->dev, "Removed\n");
+}
+
+static struct auxiliary_driver pds_vdpa_driver = { /* registered in pds_vdpa_init(), unregistered in pds_vdpa_cleanup() */
+ .name = PDS_DEV_TYPE_VDPA_STR,
+ .probe = pds_vdpa_probe,
+ .remove = pds_vdpa_remove,
+ .id_table = pds_vdpa_id_table,
+};
+
+static void __exit pds_vdpa_cleanup(void) /* module exit: reverse of pds_vdpa_init() */
+{
+ auxiliary_driver_unregister(&pds_vdpa_driver);
+
+ pds_vdpa_debugfs_destroy(); /* debugfs root goes last, after the driver is unregistered */
+}
+module_exit(pds_vdpa_cleanup);
+
+/*
+ * Module init: create the driver's debugfs root, then register the
+ * auxiliary driver.  If registration fails the debugfs root is removed
+ * again and the error is returned.
+ */
+static int __init pds_vdpa_init(void)
+{
+	int err;
+
+	pds_vdpa_debugfs_create();
+
+	err = auxiliary_driver_register(&pds_vdpa_driver);
+	if (!err)
+		return 0;
+
+	pr_err("%s: aux driver register failed: %pe\n",
+	       PDS_VDPA_DRV_NAME, ERR_PTR(err));
+	pds_vdpa_debugfs_destroy();
+
+	return err;
+}
+module_init(pds_vdpa_init);
+
+MODULE_DESCRIPTION(PDS_VDPA_DRV_DESCRIPTION);
+MODULE_AUTHOR("Advanced Micro Devices, Inc");
+MODULE_LICENSE("GPL");
diff --git a/drivers/vdpa/pds/aux_drv.h b/drivers/vdpa/pds/aux_drv.h
new file mode 100644
index 0000000000..26b7534415
--- /dev/null
+++ b/drivers/vdpa/pds/aux_drv.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#ifndef _AUX_DRV_H_
+#define _AUX_DRV_H_
+
+#include <linux/virtio_pci_modern.h>
+
+#define PDS_VDPA_DRV_DESCRIPTION "AMD/Pensando vDPA VF Device Driver"
+#define PDS_VDPA_DRV_NAME KBUILD_MODNAME
+
+struct pds_vdpa_aux { /* per auxiliary device state, allocated in pds_vdpa_probe() */
+ struct pds_auxiliary_dev *padev; /* auxiliary device this state belongs to */
+
+ struct vdpa_mgmt_dev vdpa_mdev; /* registered with vdpa_mgmtdev_register() */
+ struct pds_vdpa_device *pdsv;
+
+ struct pds_vdpa_ident ident; /* hw_features is stored little-endian (read with le64_to_cpu()) */
+
+ int vf_id; /* pci_iov_vf_id() of padev->vf_pdev */
+ struct dentry *dentry; /* per-device debugfs directory */
+ struct virtio_pci_modern_device vd_mdev; /* set up by vp_modern_probe() */
+
+ int nintrs;
+};
+#endif /* _AUX_DRV_H_ */
diff --git a/drivers/vdpa/pds/cmds.c b/drivers/vdpa/pds/cmds.c
new file mode 100644
index 0000000000..80863a41c3
--- /dev/null
+++ b/drivers/vdpa/pds/cmds.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#include <linux/vdpa.h>
+#include <linux/virtio_pci_modern.h>
+
+#include <linux/pds/pds_common.h>
+#include <linux/pds/pds_core_if.h>
+#include <linux/pds/pds_adminq.h>
+#include <linux/pds/pds_auxbus.h>
+
+#include "vdpa_dev.h"
+#include "aux_drv.h"
+#include "cmds.h"
+
+/*
+ * Send a PDS_VDPA_CMD_INIT adminq command for @pdsv.
+ *
+ * Returns 0 on success or the error from pds_client_adminq_cmd(); failures
+ * are only reported at debug level.
+ */
+int pds_vdpa_init_hw(struct pds_vdpa_device *pdsv)
+{
+	struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev;
+	union pds_core_adminq_comp comp = {};
+	union pds_core_adminq_cmd cmd = {
+		.vdpa_init.opcode = PDS_VDPA_CMD_INIT,
+		.vdpa_init.vdpa_index = pdsv->vdpa_index,
+		.vdpa_init.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id),
+	};
+	int err;
+
+	/* Initialize the vdpa/virtio device */
+	err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_init),
+				    &comp, 0);
+	if (!err)
+		return 0;
+
+	dev_dbg(&padev->aux_dev.dev, "Failed to init hw, status %d: %pe\n",
+		comp.status, ERR_PTR(err));
+
+	return err;
+}
+
+/*
+ * Send a PDS_VDPA_CMD_RESET adminq command for @pdsv.
+ *
+ * Returns 0 on success or the error from pds_client_adminq_cmd().
+ */
+int pds_vdpa_cmd_reset(struct pds_vdpa_device *pdsv)
+{
+	struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev;
+	union pds_core_adminq_comp comp = {};
+	union pds_core_adminq_cmd cmd = {
+		.vdpa.opcode = PDS_VDPA_CMD_RESET,
+		.vdpa.vdpa_index = pdsv->vdpa_index,
+		.vdpa.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id),
+	};
+	int err;
+
+	err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa), &comp, 0);
+	if (!err)
+		return 0;
+
+	dev_dbg(&padev->aux_dev.dev, "Failed to reset hw, status %d: %pe\n",
+		comp.status, ERR_PTR(err));
+
+	return err;
+}
+
+/*
+ * Send a PDS_VDPA_CMD_STATUS_UPDATE adminq command carrying @status.
+ *
+ * Returns 0 on success or the error from pds_client_adminq_cmd().
+ */
+int pds_vdpa_cmd_set_status(struct pds_vdpa_device *pdsv, u8 status)
+{
+	struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev;
+	union pds_core_adminq_comp comp = {};
+	union pds_core_adminq_cmd cmd = {
+		.vdpa_status.opcode = PDS_VDPA_CMD_STATUS_UPDATE,
+		.vdpa_status.vdpa_index = pdsv->vdpa_index,
+		.vdpa_status.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id),
+		.vdpa_status.status = status,
+	};
+	int err;
+
+	err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_status), &comp, 0);
+	if (!err)
+		return 0;
+
+	dev_dbg(&padev->aux_dev.dev,
+		"Failed to set status to %#x, error status %d: %pe\n",
+		status, comp.status, ERR_PTR(err));
+
+	return err;
+}
+
+/*
+ * Send a PDS_VDPA_CMD_SET_ATTR adminq command with attribute
+ * PDS_VDPA_ATTR_MAC, copying @mac into the command.
+ *
+ * Returns 0 on success or the error from pds_client_adminq_cmd().
+ */
+int pds_vdpa_cmd_set_mac(struct pds_vdpa_device *pdsv, u8 *mac)
+{
+	struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev;
+	union pds_core_adminq_comp comp = {};
+	union pds_core_adminq_cmd cmd = {
+		.vdpa_setattr.opcode = PDS_VDPA_CMD_SET_ATTR,
+		.vdpa_setattr.vdpa_index = pdsv->vdpa_index,
+		.vdpa_setattr.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id),
+		.vdpa_setattr.attr = PDS_VDPA_ATTR_MAC,
+	};
+	int err;
+
+	ether_addr_copy(cmd.vdpa_setattr.mac, mac);
+
+	err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_setattr),
+				    &comp, 0);
+	if (!err)
+		return 0;
+
+	dev_dbg(&padev->aux_dev.dev,
+		"Failed to set mac address %pM, status %d: %pe\n",
+		mac, comp.status, ERR_PTR(err));
+
+	return err;
+}
+
+/*
+ * Send a PDS_VDPA_CMD_SET_ATTR adminq command with attribute
+ * PDS_VDPA_ATTR_MAX_VQ_PAIRS set to @max_vqp.
+ *
+ * Returns 0 on success or the error from pds_client_adminq_cmd().
+ */
+int pds_vdpa_cmd_set_max_vq_pairs(struct pds_vdpa_device *pdsv, u16 max_vqp)
+{
+	struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev;
+	union pds_core_adminq_comp comp = {};
+	union pds_core_adminq_cmd cmd = {
+		.vdpa_setattr.opcode = PDS_VDPA_CMD_SET_ATTR,
+		.vdpa_setattr.vdpa_index = pdsv->vdpa_index,
+		.vdpa_setattr.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id),
+		.vdpa_setattr.attr = PDS_VDPA_ATTR_MAX_VQ_PAIRS,
+		.vdpa_setattr.max_vq_pairs = cpu_to_le16(max_vqp),
+	};
+	int err;
+
+	err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_setattr),
+				    &comp, 0);
+	if (!err)
+		return 0;
+
+	dev_dbg(&padev->aux_dev.dev,
+		"Failed to set max vq pairs %u, status %d: %pe\n",
+		max_vqp, comp.status, ERR_PTR(err));
+
+	return err;
+}
+
+/*
+ * Send a PDS_VDPA_CMD_VQ_INIT adminq command for queue @qid.
+ *
+ * The ring addresses and the log2 of the queue length are taken from
+ * @vq_info; the avail/used indexes are XORed with @invert_idx before being
+ * sent, and @qid is also used as the interrupt index.
+ *
+ * Returns 0 on success or the error from pds_client_adminq_cmd().
+ */
+int pds_vdpa_cmd_init_vq(struct pds_vdpa_device *pdsv, u16 qid, u16 invert_idx,
+			 struct pds_vdpa_vq_info *vq_info)
+{
+	struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev;
+	struct device *dev = &padev->aux_dev.dev;
+	union pds_core_adminq_comp comp = {};
+	union pds_core_adminq_cmd cmd = {
+		.vdpa_vq_init.opcode = PDS_VDPA_CMD_VQ_INIT,
+		.vdpa_vq_init.vdpa_index = pdsv->vdpa_index,
+		.vdpa_vq_init.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id),
+		.vdpa_vq_init.qid = cpu_to_le16(qid),
+		.vdpa_vq_init.len = cpu_to_le16(ilog2(vq_info->q_len)),
+		.vdpa_vq_init.desc_addr = cpu_to_le64(vq_info->desc_addr),
+		.vdpa_vq_init.avail_addr = cpu_to_le64(vq_info->avail_addr),
+		.vdpa_vq_init.used_addr = cpu_to_le64(vq_info->used_addr),
+		.vdpa_vq_init.intr_index = cpu_to_le16(qid),
+		.vdpa_vq_init.avail_index = cpu_to_le16(vq_info->avail_idx ^ invert_idx),
+		.vdpa_vq_init.used_index = cpu_to_le16(vq_info->used_idx ^ invert_idx),
+	};
+	int err;
+
+	dev_dbg(dev, "%s: qid %d len %d desc_addr %#llx avail_addr %#llx used_addr %#llx\n",
+		__func__, qid, ilog2(vq_info->q_len),
+		vq_info->desc_addr, vq_info->avail_addr, vq_info->used_addr);
+
+	err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_vq_init),
+				    &comp, 0);
+	if (!err)
+		return 0;
+
+	dev_dbg(dev, "Failed to init vq %d, status %d: %pe\n",
+		qid, comp.status, ERR_PTR(err));
+
+	return err;
+}
+
+/*
+ * Send a PDS_VDPA_CMD_VQ_RESET adminq command for queue @qid.
+ *
+ * On success the avail/used indexes reported in the completion are XORed
+ * with @invert_idx and stored in @vq_info.  On failure @vq_info is left
+ * untouched.
+ *
+ * Returns 0 on success or the error from pds_client_adminq_cmd().
+ */
+int pds_vdpa_cmd_reset_vq(struct pds_vdpa_device *pdsv, u16 qid, u16 invert_idx,
+			  struct pds_vdpa_vq_info *vq_info)
+{
+	struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev;
+	union pds_core_adminq_comp comp = {};
+	union pds_core_adminq_cmd cmd = {
+		.vdpa_vq_reset.opcode = PDS_VDPA_CMD_VQ_RESET,
+		.vdpa_vq_reset.vdpa_index = pdsv->vdpa_index,
+		.vdpa_vq_reset.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id),
+		.vdpa_vq_reset.qid = cpu_to_le16(qid),
+	};
+	int err;
+
+	err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_vq_reset),
+				    &comp, 0);
+	if (!err) {
+		vq_info->avail_idx =
+			le16_to_cpu(comp.vdpa_vq_reset.avail_index) ^ invert_idx;
+		vq_info->used_idx =
+			le16_to_cpu(comp.vdpa_vq_reset.used_index) ^ invert_idx;
+		return 0;
+	}
+
+	dev_dbg(&padev->aux_dev.dev, "Failed to reset vq %d, status %d: %pe\n",
+		qid, comp.status, ERR_PTR(err));
+
+	return err;
+}
diff --git a/drivers/vdpa/pds/cmds.h b/drivers/vdpa/pds/cmds.h
new file mode 100644
index 0000000000..e24d85cb8f
--- /dev/null
+++ b/drivers/vdpa/pds/cmds.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#ifndef _VDPA_CMDS_H_
+#define _VDPA_CMDS_H_
+
+int pds_vdpa_init_hw(struct pds_vdpa_device *pdsv);
+
+int pds_vdpa_cmd_reset(struct pds_vdpa_device *pdsv);
+int pds_vdpa_cmd_set_status(struct pds_vdpa_device *pdsv, u8 status);
+int pds_vdpa_cmd_set_mac(struct pds_vdpa_device *pdsv, u8 *mac);
+int pds_vdpa_cmd_set_max_vq_pairs(struct pds_vdpa_device *pdsv, u16 max_vqp);
+int pds_vdpa_cmd_init_vq(struct pds_vdpa_device *pdsv, u16 qid, u16 invert_idx,
+ struct pds_vdpa_vq_info *vq_info);
+int pds_vdpa_cmd_reset_vq(struct pds_vdpa_device *pdsv, u16 qid, u16 invert_idx,
+ struct pds_vdpa_vq_info *vq_info);
+int pds_vdpa_cmd_set_features(struct pds_vdpa_device *pdsv, u64 features);
+#endif /* _VDPA_CMDS_H_ */
diff --git a/drivers/vdpa/pds/debugfs.c b/drivers/vdpa/pds/debugfs.c
new file mode 100644
index 0000000000..c328e694f6
--- /dev/null
+++ b/drivers/vdpa/pds/debugfs.c
@@ -0,0 +1,286 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#include <linux/pci.h>
+#include <linux/vdpa.h>
+
+#include <linux/pds/pds_common.h>
+#include <linux/pds/pds_core_if.h>
+#include <linux/pds/pds_adminq.h>
+#include <linux/pds/pds_auxbus.h>
+
+#include "aux_drv.h"
+#include "vdpa_dev.h"
+#include "debugfs.h"
+
+static struct dentry *dbfs_dir;
+
+void pds_vdpa_debugfs_create(void) /* create the driver-wide debugfs root, named PDS_VDPA_DRV_NAME */
+{
+ dbfs_dir = debugfs_create_dir(PDS_VDPA_DRV_NAME, NULL);
+}
+
+void pds_vdpa_debugfs_destroy(void) /* remove the driver-wide debugfs root and everything below it */
+{
+ debugfs_remove_recursive(dbfs_dir);
+ dbfs_dir = NULL; /* forget the removed dentry */
+}
+
+#define PRINT_SBIT_NAME(__seq, __f, __name) \
+ do { \
+ if ((__f) & (__name)) \
+ seq_printf(__seq, " %s", &#__name[16]); \
+ } while (0) /* prints the stringified __name minus its first 16 characters, i.e. the "VIRTIO_CONFIG_S_" prefix of the names used by print_status_bits() */
+
+static void print_status_bits(struct seq_file *seq, u8 status) /* emit one "status:" line naming every VIRTIO_CONFIG_S_* bit set in @status */
+{
+ seq_puts(seq, "status:");
+ PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+ PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_DRIVER);
+ PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_DRIVER_OK);
+ PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_FEATURES_OK);
+ PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_NEEDS_RESET);
+ PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_FAILED);
+ seq_puts(seq, "\n");
+}
+
+static void print_feature_bits_all(struct seq_file *seq, u64 features)
+{
+ int i;
+
+ seq_puts(seq, "features:");
+
+ for (i = 0; i < (sizeof(u64) * 8); i++) {
+ u64 mask = BIT_ULL(i);
+
+ switch (features & mask) {
+ case BIT_ULL(VIRTIO_NET_F_CSUM):
+ seq_puts(seq, " VIRTIO_NET_F_CSUM");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_GUEST_CSUM):
+ seq_puts(seq, " VIRTIO_NET_F_GUEST_CSUM");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS):
+ seq_puts(seq, " VIRTIO_NET_F_CTRL_GUEST_OFFLOADS");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_MTU):
+ seq_puts(seq, " VIRTIO_NET_F_MTU");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_MAC):
+ seq_puts(seq, " VIRTIO_NET_F_MAC");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_GUEST_TSO4):
+ seq_puts(seq, " VIRTIO_NET_F_GUEST_TSO4");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_GUEST_TSO6):
+ seq_puts(seq, " VIRTIO_NET_F_GUEST_TSO6");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_GUEST_ECN):
+ seq_puts(seq, " VIRTIO_NET_F_GUEST_ECN");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_GUEST_UFO):
+ seq_puts(seq, " VIRTIO_NET_F_GUEST_UFO");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_HOST_TSO4):
+ seq_puts(seq, " VIRTIO_NET_F_HOST_TSO4");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_HOST_TSO6):
+ seq_puts(seq, " VIRTIO_NET_F_HOST_TSO6");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_HOST_ECN):
+ seq_puts(seq, " VIRTIO_NET_F_HOST_ECN");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_HOST_UFO):
+ seq_puts(seq, " VIRTIO_NET_F_HOST_UFO");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_MRG_RXBUF):
+ seq_puts(seq, " VIRTIO_NET_F_MRG_RXBUF");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_STATUS):
+ seq_puts(seq, " VIRTIO_NET_F_STATUS");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_CTRL_VQ):
+ seq_puts(seq, " VIRTIO_NET_F_CTRL_VQ");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_CTRL_RX):
+ seq_puts(seq, " VIRTIO_NET_F_CTRL_RX");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_CTRL_VLAN):
+ seq_puts(seq, " VIRTIO_NET_F_CTRL_VLAN");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA):
+ seq_puts(seq, " VIRTIO_NET_F_CTRL_RX_EXTRA");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE):
+ seq_puts(seq, " VIRTIO_NET_F_GUEST_ANNOUNCE");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_MQ):
+ seq_puts(seq, " VIRTIO_NET_F_MQ");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR):
+ seq_puts(seq, " VIRTIO_NET_F_CTRL_MAC_ADDR");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_HASH_REPORT):
+ seq_puts(seq, " VIRTIO_NET_F_HASH_REPORT");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_RSS):
+ seq_puts(seq, " VIRTIO_NET_F_RSS");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_RSC_EXT):
+ seq_puts(seq, " VIRTIO_NET_F_RSC_EXT");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_STANDBY):
+ seq_puts(seq, " VIRTIO_NET_F_STANDBY");
+ break;
+ case BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX):
+ seq_puts(seq, " VIRTIO_NET_F_SPEED_DUPLEX");
+ break;
+ case BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY):
+ seq_puts(seq, " VIRTIO_F_NOTIFY_ON_EMPTY");
+ break;
+ case BIT_ULL(VIRTIO_F_ANY_LAYOUT):
+ seq_puts(seq, " VIRTIO_F_ANY_LAYOUT");
+ break;
+ case BIT_ULL(VIRTIO_F_VERSION_1):
+ seq_puts(seq, " VIRTIO_F_VERSION_1");
+ break;
+ case BIT_ULL(VIRTIO_F_ACCESS_PLATFORM):
+ seq_puts(seq, " VIRTIO_F_ACCESS_PLATFORM");
+ break;
+ case BIT_ULL(VIRTIO_F_RING_PACKED):
+ seq_puts(seq, " VIRTIO_F_RING_PACKED");
+ break;
+ case BIT_ULL(VIRTIO_F_ORDER_PLATFORM):
+ seq_puts(seq, " VIRTIO_F_ORDER_PLATFORM");
+ break;
+ case BIT_ULL(VIRTIO_F_SR_IOV):
+ seq_puts(seq, " VIRTIO_F_SR_IOV");
+ break;
+ case 0:
+ break;
+ default:
+ seq_printf(seq, " bit_%d", i);
+ break;
+ }
+ }
+
+ seq_puts(seq, "\n");
+}
+
+void pds_vdpa_debugfs_add_pcidev(struct pds_vdpa_aux *vdpa_aux) /* create the per-device directory, named after the VF's PCI name, under the driver root */
+{
+ vdpa_aux->dentry = debugfs_create_dir(pci_name(vdpa_aux->padev->vf_pdev), dbfs_dir);
+}
+
+/*
+ * seq_file show routine behind the "identity" debugfs file: prints the
+ * auxiliary device name, the management device limits and the hardware
+ * feature bits of the pds_vdpa_aux stored in seq->private.
+ */
+static int identity_show(struct seq_file *seq, void *v)
+{
+	struct pds_vdpa_aux *vdpa_aux = seq->private;
+	struct vdpa_mgmt_dev *mgmt = &vdpa_aux->vdpa_mdev;
+	u64 hw_features = le64_to_cpu(vdpa_aux->ident.hw_features);
+
+	seq_printf(seq, "aux_dev: %s\n",
+		   dev_name(&vdpa_aux->padev->aux_dev.dev));
+
+	seq_printf(seq, "max_vqs: %d\n", mgmt->max_supported_vqs);
+	seq_printf(seq, "config_attr_mask: %#llx\n", mgmt->config_attr_mask);
+	seq_printf(seq, "hw_features: %#llx\n", hw_features);
+	print_feature_bits_all(seq, hw_features);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(identity);
+
+void pds_vdpa_debugfs_add_ident(struct pds_vdpa_aux *vdpa_aux) /* add the read-only (0400) "identity" file to the per-device directory */
+{
+ debugfs_create_file("identity", 0400, vdpa_aux->dentry,
+ vdpa_aux, &identity_fops);
+}
+
+/*
+ * seq_file show routine behind the "config" debugfs file.
+ *
+ * Copies a struct virtio_net_config out of the device's mapped config
+ * space, prints its fields, then prints the device status, the negotiated
+ * features and the vdpa index / vq count of the pds_vdpa_device stored in
+ * seq->private.
+ */
+static int config_show(struct seq_file *seq, void *v)
+{
+	struct pds_vdpa_device *pdsv = seq->private;
+	struct virtio_pci_modern_device *vd_mdev = &pdsv->vdpa_aux->vd_mdev;
+	struct virtio_net_config vc;
+	u8 status;
+
+	memcpy_fromio(&vc, vd_mdev->device, sizeof(struct virtio_net_config));
+
+	seq_printf(seq, "mac: %pM\n", vc.mac);
+	seq_printf(seq, "max_virtqueue_pairs: %d\n",
+		   __virtio16_to_cpu(true, vc.max_virtqueue_pairs));
+	seq_printf(seq, "mtu: %d\n", __virtio16_to_cpu(true, vc.mtu));
+	seq_printf(seq, "speed: %d\n", le32_to_cpu(vc.speed));
+	seq_printf(seq, "duplex: %d\n", vc.duplex);
+	seq_printf(seq, "rss_max_key_size: %d\n", vc.rss_max_key_size);
+	seq_printf(seq, "rss_max_indirection_table_length: %d\n",
+		   le16_to_cpu(vc.rss_max_indirection_table_length));
+	seq_printf(seq, "supported_hash_types: %#x\n",
+		   le32_to_cpu(vc.supported_hash_types));
+	seq_printf(seq, "vn_status: %#x\n",
+		   __virtio16_to_cpu(true, vc.status));
+
+	status = vp_modern_get_status(vd_mdev);
+	seq_printf(seq, "dev_status: %#x\n", status);
+	print_status_bits(seq, status);
+
+	seq_printf(seq, "negotiated_features: %#llx\n", pdsv->negotiated_features);
+	print_feature_bits_all(seq, pdsv->negotiated_features);
+
+	seq_printf(seq, "vdpa_index: %d\n", pdsv->vdpa_index);
+	seq_printf(seq, "num_vqs: %d\n", pdsv->num_vqs);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(config);
+
+/*
+ * seq_file show routine behind the per-queue "vqNN" debugfs files: dumps
+ * the cached state of the pds_vdpa_vq_info stored in seq->private.
+ */
+static int vq_show(struct seq_file *seq, void *v)
+{
+	const struct pds_vdpa_vq_info *vq = seq->private;
+
+	/* ring setup */
+	seq_printf(seq, "ready: %d\n", vq->ready);
+	seq_printf(seq, "desc_addr: %#llx\n", vq->desc_addr);
+	seq_printf(seq, "avail_addr: %#llx\n", vq->avail_addr);
+	seq_printf(seq, "used_addr: %#llx\n", vq->used_addr);
+	seq_printf(seq, "q_len: %d\n", vq->q_len);
+	seq_printf(seq, "qid: %d\n", vq->qid);
+
+	/* doorbell, indexes and interrupt */
+	seq_printf(seq, "doorbell: %#llx\n", vq->doorbell);
+	seq_printf(seq, "avail_idx: %d\n", vq->avail_idx);
+	seq_printf(seq, "used_idx: %d\n", vq->used_idx);
+	seq_printf(seq, "irq: %d\n", vq->irq);
+	seq_printf(seq, "irq-name: %s\n", vq->irq_name);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(vq);
+
+/*
+ * Populate the per-device debugfs directory for a vdpa device: one
+ * read-only "config" file plus one read-only "vqNN" file for each of the
+ * device's num_vqs queues.
+ */
+void pds_vdpa_debugfs_add_vdpadev(struct pds_vdpa_aux *vdpa_aux)
+{
+	char name[16];
+	int i;
+
+	debugfs_create_file("config", 0400, vdpa_aux->dentry, vdpa_aux->pdsv, &config_fops);
+
+	for (i = 0; i < vdpa_aux->pdsv->num_vqs; i++) {
+		snprintf(name, sizeof(name), "vq%02d", i);
+		debugfs_create_file(name, 0400, vdpa_aux->dentry,
+				    &vdpa_aux->pdsv->vqs[i], &vq_fops);
+	}
+}
+
+void pds_vdpa_debugfs_del_vdpadev(struct pds_vdpa_aux *vdpa_aux) /* remove the whole per-device directory, including every file in it */
+{
+ debugfs_remove_recursive(vdpa_aux->dentry);
+ vdpa_aux->dentry = NULL; /* forget the removed dentry */
+}
+
+void pds_vdpa_debugfs_reset_vdpadev(struct pds_vdpa_aux *vdpa_aux) /* leaves only the directory and "identity", as after pds_vdpa_probe() */
+{
+ /* we don't keep track of the individual entries, so remove them all
+ * and then rebuild the basics
+ */
+ pds_vdpa_debugfs_del_vdpadev(vdpa_aux);
+ pds_vdpa_debugfs_add_pcidev(vdpa_aux);
+ pds_vdpa_debugfs_add_ident(vdpa_aux);
+}
diff --git a/drivers/vdpa/pds/debugfs.h b/drivers/vdpa/pds/debugfs.h
new file mode 100644
index 0000000000..c088a4e8f1
--- /dev/null
+++ b/drivers/vdpa/pds/debugfs.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#ifndef _PDS_VDPA_DEBUGFS_H_
+#define _PDS_VDPA_DEBUGFS_H_
+
+#include <linux/debugfs.h>
+
+void pds_vdpa_debugfs_create(void);
+void pds_vdpa_debugfs_destroy(void);
+void pds_vdpa_debugfs_add_pcidev(struct pds_vdpa_aux *vdpa_aux);
+void pds_vdpa_debugfs_add_ident(struct pds_vdpa_aux *vdpa_aux);
+void pds_vdpa_debugfs_add_vdpadev(struct pds_vdpa_aux *vdpa_aux);
+void pds_vdpa_debugfs_del_vdpadev(struct pds_vdpa_aux *vdpa_aux);
+void pds_vdpa_debugfs_reset_vdpadev(struct pds_vdpa_aux *vdpa_aux);
+
+#endif /* _PDS_VDPA_DEBUGFS_H_ */
diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c
new file mode 100644
index 0000000000..25c0fe5ec3
--- /dev/null
+++ b/drivers/vdpa/pds/vdpa_dev.c
@@ -0,0 +1,844 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#include <linux/pci.h>
+#include <linux/vdpa.h>
+#include <uapi/linux/vdpa.h>
+#include <linux/virtio_pci_modern.h>
+
+#include <linux/pds/pds_common.h>
+#include <linux/pds/pds_core_if.h>
+#include <linux/pds/pds_adminq.h>
+#include <linux/pds/pds_auxbus.h>
+
+#include "vdpa_dev.h"
+#include "aux_drv.h"
+#include "cmds.h"
+#include "debugfs.h"
+
+static u64 pds_vdpa_get_driver_features(struct vdpa_device *vdpa_dev);
+
+static struct pds_vdpa_device *vdpa_to_pdsv(struct vdpa_device *vdpa_dev) /* map the embedded 'vdpa_dev' member back to its pds_vdpa_device */
+{
+ return container_of(vdpa_dev, struct pds_vdpa_device, vdpa_dev);
+}
+
+/*
+ * Notifier callback for pds events.
+ *
+ * For PDS_EVENT_RESET and PDS_EVENT_LINK_CHANGE the config callback of the
+ * owning pds_vdpa_device is invoked, if one is set; every other event code
+ * is only logged at debug level.  Always returns 0.
+ */
+static int pds_vdpa_notify_handler(struct notifier_block *nb,
+				   unsigned long ecode,
+				   void *data)
+{
+	struct pds_vdpa_device *pdsv = container_of(nb, struct pds_vdpa_device, nb);
+	struct device *dev = &pdsv->vdpa_aux->padev->aux_dev.dev;
+
+	dev_dbg(dev, "%s: event code %lu\n", __func__, ecode);
+
+	if (ecode != PDS_EVENT_RESET && ecode != PDS_EVENT_LINK_CHANGE)
+		return 0;
+
+	if (pdsv->config_cb.callback)
+		pdsv->config_cb.callback(pdsv->config_cb.private);
+
+	return 0;
+}
+
+/*
+ * Register pds_vdpa_notify_handler() for pds events, at most once.
+ *
+ * nb->notifier_call doubles as the "already registered" marker: it is set
+ * before pdsc_register_notify() is called and cleared again if that call
+ * fails.
+ *
+ * Returns 0 on success or when already registered, -EINVAL when
+ * pdsc_register_notify() fails.
+ */
+static int pds_vdpa_register_event_handler(struct pds_vdpa_device *pdsv)
+{
+	struct device *dev = &pdsv->vdpa_aux->padev->aux_dev.dev;
+	struct notifier_block *nb = &pdsv->nb;
+	int err;
+
+	/* a non-NULL notifier_call means we are already registered */
+	if (nb->notifier_call)
+		return 0;
+
+	nb->notifier_call = pds_vdpa_notify_handler;
+	err = pdsc_register_notify(nb);
+	if (err) {
+		nb->notifier_call = NULL;
+		/* %pe prints the errno name of an ERR_PTR; %ps would try to
+		 * resolve the value as a symbol address
+		 */
+		dev_err(dev, "failed to register pds event handler: %pe\n",
+			ERR_PTR(err));
+		return -EINVAL;
+	}
+	dev_dbg(dev, "pds event handler registered\n");
+
+	return 0;
+}
+
+/*
+ * Unregister the pds event notifier if it is currently registered
+ * (nb.notifier_call set) and clear the marker.
+ */
+static void pds_vdpa_unregister_event_handler(struct pds_vdpa_device *pdsv)
+{
+	if (!pdsv->nb.notifier_call)
+		return;
+
+	pdsc_unregister_notify(&pdsv->nb);
+	pdsv->nb.notifier_call = NULL;
+}
+
+/*
+ * Cache the ring addresses of queue @qid: @desc_addr as the descriptor
+ * address, @driver_addr as the avail address and @device_addr as the used
+ * address.  Only driver state is updated here.  Always returns 0.
+ */
+static int pds_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid,
+				   u64 desc_addr, u64 driver_addr, u64 device_addr)
+{
+	struct pds_vdpa_vq_info *vq = &vdpa_to_pdsv(vdpa_dev)->vqs[qid];
+
+	vq->desc_addr = desc_addr;
+	vq->avail_addr = driver_addr;
+	vq->used_addr = device_addr;
+
+	return 0;
+}
+
+/* Cache @num as the length (q_len) of queue @qid. */
+static void pds_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16 qid, u32 num)
+{
+	vdpa_to_pdsv(vdpa_dev)->vqs[qid].q_len = num;
+}
+
+/* Kick queue @qid by writing its index to the queue's notify address. */
+static void pds_vdpa_kick_vq(struct vdpa_device *vdpa_dev, u16 qid)
+{
+	iowrite16(qid, vdpa_to_pdsv(vdpa_dev)->vqs[qid].notify);
+}
+
+/* Store a copy of @cb as the event callback of queue @qid. */
+static void pds_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid,
+			       struct vdpa_callback *cb)
+{
+	vdpa_to_pdsv(vdpa_dev)->vqs[qid].event_cb = *cb;
+}
+
+/*
+ * Per-queue interrupt handler.  @data is the queue's pds_vdpa_vq_info; the
+ * queue's event callback is invoked if one is set.  Always returns
+ * IRQ_HANDLED.
+ */
+static irqreturn_t pds_vdpa_isr(int irq, void *data)
+{
+	struct pds_vdpa_vq_info *vq = data;
+
+	if (vq->event_cb.callback)
+		vq->event_cb.callback(vq->event_cb.private);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Free the irq of queue @qid, using the queue's vq info as the dev_id
+ * cookie, and mark the queue as having no vector.  Does nothing when the
+ * queue's irq is already VIRTIO_MSI_NO_VECTOR.
+ */
+static void pds_vdpa_release_irq(struct pds_vdpa_device *pdsv, int qid)
+{
+	struct pds_vdpa_vq_info *vq = &pdsv->vqs[qid];
+
+	if (vq->irq == VIRTIO_MSI_NO_VECTOR)
+		return;
+
+	free_irq(vq->irq, vq);
+	vq->irq = VIRTIO_MSI_NO_VECTOR;
+}
+
+/*
+ * Change the ready state of queue @qid.
+ *
+ * Nothing is done when the state is unchanged.  Enabling sends the cached
+ * queue setup to the device via pds_vdpa_cmd_init_vq(); if that fails the
+ * queue stays not-ready.  Disabling issues pds_vdpa_cmd_reset_vq() and the
+ * queue is marked not-ready even if the command fails.  When
+ * VIRTIO_F_RING_PACKED is among the driver features, the indexes are passed
+ * with PDS_VDPA_PACKED_INVERT_IDX as the inversion mask.
+ */
+static void pds_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16 qid, bool ready)
+{
+	struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev);
+	struct pds_vdpa_vq_info *vq = &pdsv->vqs[qid];
+	struct device *dev = &pdsv->vdpa_dev.dev;
+	u16 invert_idx = 0;
+	int err;
+
+	dev_dbg(dev, "%s: qid %d ready %d => %d\n",
+		__func__, qid, vq->ready, ready);
+	if (ready == vq->ready)
+		return;
+
+	if (pds_vdpa_get_driver_features(vdpa_dev) & BIT_ULL(VIRTIO_F_RING_PACKED))
+		invert_idx = PDS_VDPA_PACKED_INVERT_IDX;
+
+	if (!ready) {
+		err = pds_vdpa_cmd_reset_vq(pdsv, qid, invert_idx, vq);
+		if (err)
+			dev_err(dev, "%s: reset_vq failed qid %d: %pe\n",
+				__func__, qid, ERR_PTR(err));
+	} else {
+		/* Pass vq setup info to DSC using adminq to gather up and
+		 * send all info at once so FW can do its full set up in
+		 * one easy operation
+		 */
+		err = pds_vdpa_cmd_init_vq(pdsv, qid, invert_idx, vq);
+		if (err) {
+			dev_err(dev, "Failed to init vq %d: %pe\n",
+				qid, ERR_PTR(err));
+			ready = false;
+		}
+	}
+
+	vq->ready = ready;
+}
+
+/* Return the cached ready state of queue @qid. */
+static bool pds_vdpa_get_vq_ready(struct vdpa_device *vdpa_dev, u16 qid)
+{
+	return vdpa_to_pdsv(vdpa_dev)->vqs[qid].ready;
+}
+
+/*
+ * Cache the avail/used position of queue @qid from @state.
+ *
+ * Rejected with -EINVAL while the queue is ready.  For packed rings the
+ * index and wrap counter are merged into one 16-bit value (counter in bit
+ * 15) and XORed with PDS_VDPA_PACKED_INVERT_IDX; for split rings only the
+ * avail index is provided.  In both cases the used index ends up equal to
+ * the avail index.  Returns 0 on success.
+ */
+static int pds_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
+				 const struct vdpa_vq_state *state)
+{
+	struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev);
+	struct device *dev = &pdsv->vdpa_aux->padev->aux_dev.dev;
+	struct pds_vdpa_vq_info *vq = &pdsv->vqs[qid];
+	u64 driver_features;
+	u16 avail;
+	u16 used;
+
+	if (vq->ready) {
+		dev_err(dev, "Setting device position is denied while vq is enabled\n");
+		return -EINVAL;
+	}
+
+	driver_features = pds_vdpa_get_driver_features(vdpa_dev);
+	if (!(driver_features & BIT_ULL(VIRTIO_F_RING_PACKED))) {
+		/* state->split does not provide a used_index:
+		 * the vq will be set to "empty" here, and the vq will read
+		 * the current used index the next time the vq is kicked.
+		 */
+		avail = state->split.avail_index;
+		used = avail;
+	} else {
+		/* The avail and used index are stored with the packed wrap
+		 * counter bit inverted. This way, in case set_vq_state is
+		 * not called, the initial value can be set to zero prior to
+		 * feature negotiation, and it is good for both packed and
+		 * split vq.
+		 */
+		avail = (state->packed.last_avail_idx |
+			 (state->packed.last_avail_counter << 15)) ^
+			PDS_VDPA_PACKED_INVERT_IDX;
+		used = (state->packed.last_used_idx |
+			(state->packed.last_used_counter << 15)) ^
+		       PDS_VDPA_PACKED_INVERT_IDX;
+	}
+
+	if (used != avail) {
+		dev_dbg(dev, "Setting used equal to avail, for interoperability\n");
+		used = avail;
+	}
+
+	vq->avail_idx = avail;
+	vq->used_idx = used;
+
+	return 0;
+}
+
+/*
+ * Report the cached avail/used position of queue @qid in @state.
+ *
+ * Rejected with -EINVAL while the queue is ready.  For packed rings the
+ * cached values are XORed with PDS_VDPA_PACKED_INVERT_IDX and split into a
+ * 15-bit index and a wrap counter (bit 15); for split rings only the avail
+ * index is reported.  Returns 0 on success.
+ */
+static int pds_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
+				 struct vdpa_vq_state *state)
+{
+	struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev);
+	struct device *dev = &pdsv->vdpa_aux->padev->aux_dev.dev;
+	struct pds_vdpa_vq_info *vq = &pdsv->vqs[qid];
+	u16 avail;
+	u16 used;
+
+	if (vq->ready) {
+		dev_err(dev, "Getting device position is denied while vq is enabled\n");
+		return -EINVAL;
+	}
+
+	avail = vq->avail_idx;
+	used = vq->used_idx;
+
+	if (!(pds_vdpa_get_driver_features(vdpa_dev) & BIT_ULL(VIRTIO_F_RING_PACKED))) {
+		/* state->split does not provide a used_index. */
+		state->split.avail_index = avail;
+		return 0;
+	}
+
+	avail ^= PDS_VDPA_PACKED_INVERT_IDX;
+	used ^= PDS_VDPA_PACKED_INVERT_IDX;
+
+	state->packed.last_avail_idx = avail & 0x7fff;
+	state->packed.last_avail_counter = avail >> 15;
+	state->packed.last_used_idx = used & 0x7fff;
+	state->packed.last_used_counter = used >> 15;
+
+	return 0;
+}
+
+/*
+ * Describe the notification area of queue @qid: the address is the queue's
+ * notify_pa; the size is the device's notify_offset_multiplier, or
+ * PDS_PAGE_SIZE when that multiplier is zero.
+ */
+static struct vdpa_notification_area
+pds_vdpa_get_vq_notification(struct vdpa_device *vdpa_dev, u16 qid)
+{
+	struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev);
+	struct virtio_pci_modern_device *vd_mdev = &pdsv->vdpa_aux->vd_mdev;
+	struct vdpa_notification_area area;
+
+	area.addr = pdsv->vqs[qid].notify_pa;
+
+	if (vd_mdev->notify_offset_multiplier)
+		area.size = vd_mdev->notify_offset_multiplier;
+	else
+		area.size = PDS_PAGE_SIZE;
+
+	return area;
+}
+
+/* Return the irq number recorded for virtqueue @qid. */
+static int pds_vdpa_get_vq_irq(struct vdpa_device *vdpa_dev, u16 qid)
+{
+	return vdpa_to_pdsv(vdpa_dev)->vqs[qid].irq;
+}
+
+/* Virtqueue alignment reported to the vDPA core: always PDS_PAGE_SIZE. */
+static u32 pds_vdpa_get_vq_align(struct vdpa_device *vdpa_dev)
+{
+ return PDS_PAGE_SIZE;
+}
+
+/* Every virtqueue index is reported as belonging to group 0. */
+static u32 pds_vdpa_get_vq_group(struct vdpa_device *vdpa_dev, u16 idx)
+{
+ return 0;
+}
+
+/* Return the feature bits this vDPA device offers to drivers. */
+static u64 pds_vdpa_get_device_features(struct vdpa_device *vdpa_dev)
+{
+	return vdpa_to_pdsv(vdpa_dev)->supported_features;
+}
+
+/* Record the driver's feature selection and push it to the device.
+ *
+ * A non-zero @features without VIRTIO_F_ACCESS_PLATFORM, or one containing
+ * bits outside supported_features, is rejected with -EOPNOTSUPP.  The
+ * accepted set is stored in negotiated_features.  VIRTIO_NET_F_MAC is
+ * stripped from what is written to the device when the hardware does not
+ * really offer it, and the write is skipped when the value to write equals
+ * the previously negotiated set.
+ */
+static int pds_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 features)
+{
+	struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev);
+	struct device *dev = &pdsv->vdpa_dev.dev;
+	u64 unsupported;
+	u64 hw_features;
+	u64 accepted;
+	u64 previous;
+
+	if (features && !(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM))) {
+		dev_err(dev, "VIRTIO_F_ACCESS_PLATFORM is not negotiated\n");
+		return -EOPNOTSUPP;
+	}
+
+	/* Check for valid feature bits */
+	accepted = features & pdsv->supported_features;
+	unsupported = features & ~pdsv->supported_features;
+	if (unsupported) {
+		dev_err(dev, "Can't support all requested features in %#llx, missing %#llx features\n",
+			features, unsupported);
+		return -EOPNOTSUPP;
+	}
+
+	previous = pds_vdpa_get_driver_features(vdpa_dev);
+	pdsv->negotiated_features = accepted;
+	dev_dbg(dev, "%s: %#llx => %#llx\n",
+		__func__, previous, accepted);
+
+	/* if we're faking the F_MAC, strip it before writing to device */
+	hw_features = le64_to_cpu(pdsv->vdpa_aux->ident.hw_features);
+	if (!(hw_features & BIT_ULL(VIRTIO_NET_F_MAC)))
+		accepted &= ~BIT_ULL(VIRTIO_NET_F_MAC);
+
+	if (previous != accepted)
+		vp_modern_set_features(&pdsv->vdpa_aux->vd_mdev, accepted);
+
+	return 0;
+}
+
+/* Return the feature set stored by the last successful
+ * pds_vdpa_set_driver_features() call.
+ */
+static u64 pds_vdpa_get_driver_features(struct vdpa_device *vdpa_dev)
+{
+	return vdpa_to_pdsv(vdpa_dev)->negotiated_features;
+}
+
+/* Remember the config-change callback and its private cookie.
+ * Only the callback and private members of @cb are copied.
+ */
+static void pds_vdpa_set_config_cb(struct vdpa_device *vdpa_dev,
+				   struct vdpa_callback *cb)
+{
+	struct vdpa_callback *saved = &vdpa_to_pdsv(vdpa_dev)->config_cb;
+
+	saved->callback = cb->callback;
+	saved->private = cb->private;
+}
+
+/* Return the largest queue length to advertise.
+ *
+ * ident.max_qlen is used as a power-of-two exponent.  The result is
+ * capped at 1024, so any exponent of 10 or more yields 1024.  Comparing
+ * the exponent instead of BIT(exponent) avoids truncating the shifted
+ * value to 16 bits (which would turn BIT(16) into 0) and avoids shifting
+ * by an out-of-range amount if the firmware reports a bogus exponent.
+ */
+static u16 pds_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev)
+{
+	struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev);
+	u16 qlen_log2 = le16_to_cpu(pdsv->vdpa_aux->ident.max_qlen);
+
+	/* qemu has assert() that vq_num_max <= VIRTQUEUE_MAX_SIZE (1024) */
+	if (qlen_log2 >= 10)
+		return 1024;
+
+	return BIT(qlen_log2);
+}
+
+/* The virtio device type exposed by this driver is always VIRTIO_ID_NET. */
+static u32 pds_vdpa_get_device_id(struct vdpa_device *vdpa_dev)
+{
+ return VIRTIO_ID_NET;
+}
+
+/* The vendor id reported to the vDPA core is always PCI_VENDOR_ID_PENSANDO. */
+static u32 pds_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev)
+{
+ return PCI_VENDOR_ID_PENSANDO;
+}
+
+/* Read the virtio status byte from the device via vp_modern_get_status(). */
+static u8 pds_vdpa_get_status(struct vdpa_device *vdpa_dev)
+{
+	struct virtio_pci_modern_device *vd_mdev;
+
+	vd_mdev = &vdpa_to_pdsv(vdpa_dev)->vdpa_aux->vd_mdev;
+
+	return vp_modern_get_status(vd_mdev);
+}
+
+/* Allocate MSI-X vectors on the VF and attach pds_vdpa_isr to one vector
+ * per virtqueue in use.
+ *
+ * Exactly max_supported_vqs vectors are requested from the PCI core, but
+ * only the first num_vqs are wired to a handler.  On failure every irq
+ * requested so far is released, the vectors are freed and nintrs is reset
+ * to 0.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int pds_vdpa_request_irqs(struct pds_vdpa_device *pdsv)
+{
+	struct pds_vdpa_aux *vdpa_aux = pdsv->vdpa_aux;
+	struct pci_dev *pdev = vdpa_aux->padev->vf_pdev;
+	struct device *dev = &pdsv->vdpa_dev.dev;
+	int nvec_wanted = vdpa_aux->vdpa_mdev.max_supported_vqs;
+	int nvec, qid, err;
+
+	nvec = pci_alloc_irq_vectors(pdev, nvec_wanted, nvec_wanted, PCI_IRQ_MSIX);
+	if (nvec < 0) {
+		dev_err(dev, "Couldn't get %d msix vectors: %pe\n",
+			nvec_wanted, ERR_PTR(nvec));
+		return nvec;
+	}
+
+	for (qid = 0; qid < pdsv->num_vqs; ++qid) {
+		struct pds_vdpa_vq_info *vq = &pdsv->vqs[qid];
+		int irq = pci_irq_vector(pdev, qid);
+
+		snprintf(vq->irq_name, sizeof(vq->irq_name),
+			 "vdpa-%s-%d", dev_name(dev), qid);
+
+		err = request_irq(irq, pds_vdpa_isr, 0, vq->irq_name, vq);
+		if (err) {
+			dev_err(dev, "%s: no irq for qid %d: %pe\n",
+				__func__, qid, ERR_PTR(err));
+			goto err_release;
+		}
+
+		vq->irq = irq;
+	}
+
+	vdpa_aux->nintrs = nvec;
+
+	return 0;
+
+err_release:
+	/* unwind only the queues whose irq was successfully requested */
+	while (qid-- > 0)
+		pds_vdpa_release_irq(pdsv, qid);
+
+	pci_free_irq_vectors(pdev);
+
+	vdpa_aux->nintrs = 0;
+
+	return err;
+}
+
+/* Undo pds_vdpa_request_irqs(): release the irq of every virtqueue in
+ * use and free the MSI-X vectors.  Does nothing when nintrs is 0.
+ */
+static void pds_vdpa_release_irqs(struct pds_vdpa_device *pdsv)
+{
+	struct pds_vdpa_aux *vdpa_aux = pdsv->vdpa_aux;
+	struct pci_dev *pdev = vdpa_aux->padev->vf_pdev;
+	int qid = 0;
+
+	if (vdpa_aux->nintrs == 0)
+		return;
+
+	while (qid < pdsv->num_vqs) {
+		pds_vdpa_release_irq(pdsv, qid);
+		qid++;
+	}
+
+	pci_free_irq_vectors(pdev);
+
+	vdpa_aux->nintrs = 0;
+}
+
+/* Apply a new virtio status byte to the device.
+ *
+ * Transitions are detected by comparing @status with the status read back
+ * from the device:
+ *  - DRIVER_OK newly set: irqs are requested first; if that fails the
+ *    status actually written becomes old_status | FAILED.
+ *  - status == 0: the config callback is cleared, the device is reset,
+ *    the saved ring indexes are zeroed and the MAC is programmed again.
+ *  - FEATURES_OK newly set: the notify area of every vq is (re)mapped.
+ *  - DRIVER_OK cleared: the irqs are released.
+ *
+ * Note that the later checks use @status as possibly rewritten by the
+ * irq-failure path above, so the order of the steps matters.
+ */
+static void pds_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
+{
+ struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev);
+ struct device *dev = &pdsv->vdpa_dev.dev;
+ u8 old_status;
+ int i;
+
+ old_status = pds_vdpa_get_status(vdpa_dev);
+ dev_dbg(dev, "%s: old %#x new %#x\n", __func__, old_status, status);
+
+ /* DRIVER_OK is being turned on: irqs must be in place before the write */
+ if (status & ~old_status & VIRTIO_CONFIG_S_DRIVER_OK) {
+ if (pds_vdpa_request_irqs(pdsv))
+ status = old_status | VIRTIO_CONFIG_S_FAILED;
+ }
+
+ pds_vdpa_cmd_set_status(pdsv, status);
+
+ if (status == 0) {
+ struct vdpa_callback null_cb = { };
+
+ pds_vdpa_set_config_cb(vdpa_dev, &null_cb);
+ pds_vdpa_cmd_reset(pdsv);
+
+ for (i = 0; i < pdsv->num_vqs; i++) {
+ pdsv->vqs[i].avail_idx = 0;
+ pdsv->vqs[i].used_idx = 0;
+ }
+
+ /* re-apply the MAC chosen at dev_add time after the reset */
+ pds_vdpa_cmd_set_mac(pdsv, pdsv->mac);
+ }
+
+ if (status & ~old_status & VIRTIO_CONFIG_S_FEATURES_OK) {
+ for (i = 0; i < pdsv->num_vqs; i++) {
+ pdsv->vqs[i].notify =
+ vp_modern_map_vq_notify(&pdsv->vdpa_aux->vd_mdev,
+ i, &pdsv->vqs[i].notify_pa);
+ }
+ }
+
+ /* DRIVER_OK is being turned off */
+ if (old_status & ~status & VIRTIO_CONFIG_S_DRIVER_OK)
+ pds_vdpa_release_irqs(pdsv);
+}
+
+/* Reset the bookkeeping of virtqueue @qid to its initial state: all
+ * fields zeroed, then qid, back-pointer, "not ready", "no irq"
+ * (VIRTIO_MSI_NO_VECTOR) and the given @notify mapping are filled in.
+ */
+static void pds_vdpa_init_vqs_entry(struct pds_vdpa_device *pdsv, int qid,
+				    void __iomem *notify)
+{
+	struct pds_vdpa_vq_info *vq = &pdsv->vqs[qid];
+
+	memset(vq, 0, sizeof(*vq));
+	vq->qid = qid;
+	vq->pdsv = pdsv;
+	vq->ready = false;
+	vq->irq = VIRTIO_MSI_NO_VECTOR;
+	vq->notify = notify;
+}
+
+/* Reset the vDPA device.
+ *
+ * Nothing is done when the device status is already 0.  If the device was
+ * in DRIVER_OK, each vq is reset in firmware first (stopping at the first
+ * failure, which is logged), then status 0 is written, and finally the vq
+ * bookkeeping is re-initialized while keeping each vq's notify mapping.
+ * The re-initialization is skipped when a firmware vq reset failed.
+ *
+ * Always returns 0.
+ */
+static int pds_vdpa_reset(struct vdpa_device *vdpa_dev)
+{
+	struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev);
+	struct device *dev = &pdsv->vdpa_aux->padev->aux_dev.dev;
+	u8 status = pds_vdpa_get_status(vdpa_dev);
+	bool was_running;
+	int err = 0;
+	int qid;
+
+	if (!status)
+		return 0;
+
+	was_running = !!(status & VIRTIO_CONFIG_S_DRIVER_OK);
+
+	if (was_running) {
+		/* Reset the vqs */
+		for (qid = 0; qid < pdsv->num_vqs; qid++) {
+			err = pds_vdpa_cmd_reset_vq(pdsv, qid, 0, &pdsv->vqs[qid]);
+			if (err) {
+				dev_err(dev, "%s: reset_vq failed qid %d: %pe\n",
+					__func__, qid, ERR_PTR(err));
+				break;
+			}
+		}
+	}
+
+	pds_vdpa_set_status(vdpa_dev, 0);
+
+	if (was_running && !err) {
+		/* Reset the vq info */
+		for (qid = 0; qid < pdsv->num_vqs; qid++)
+			pds_vdpa_init_vqs_entry(pdsv, qid, pdsv->vqs[qid].notify);
+	}
+
+	return 0;
+}
+
+/* The device config space is exactly one struct virtio_net_config. */
+static size_t pds_vdpa_get_config_size(struct vdpa_device *vdpa_dev)
+{
+ return sizeof(struct virtio_net_config);
+}
+
+/* Copy @len bytes of device config space, starting at @offset, into @buf.
+ *
+ * The request must lie entirely inside struct virtio_net_config; anything
+ * else triggers a WARN and is ignored.  The bounds test is written so
+ * that a huge @offset or @len cannot wrap the unsigned sum and slip past
+ * the check.
+ */
+static void pds_vdpa_get_config(struct vdpa_device *vdpa_dev,
+				unsigned int offset,
+				void *buf, unsigned int len)
+{
+	struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev);
+	const unsigned int cfg_size = sizeof(struct virtio_net_config);
+	void __iomem *device;
+
+	if (offset > cfg_size || len > cfg_size - offset) {
+		WARN(true, "%s: bad read, offset %u len %u\n", __func__, offset, len);
+		return;
+	}
+
+	device = pdsv->vdpa_aux->vd_mdev.device;
+	memcpy_fromio(buf, device + offset, len);
+}
+
+/* Copy @len bytes from @buf into device config space at @offset.
+ *
+ * The request must lie entirely inside struct virtio_net_config; anything
+ * else triggers a WARN and is ignored.  The bounds test is written so
+ * that a huge @offset or @len cannot wrap the unsigned sum and slip past
+ * the check.
+ */
+static void pds_vdpa_set_config(struct vdpa_device *vdpa_dev,
+				unsigned int offset, const void *buf,
+				unsigned int len)
+{
+	struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev);
+	const unsigned int cfg_size = sizeof(struct virtio_net_config);
+	void __iomem *device;
+
+	if (offset > cfg_size || len > cfg_size - offset) {
+		WARN(true, "%s: bad write, offset %u len %u\n", __func__, offset, len);
+		return;
+	}
+
+	device = pdsv->vdpa_aux->vd_mdev.device;
+	memcpy_toio(device + offset, buf, len);
+}
+
+/* vDPA config operations for the pds device, handed to
+ * vdpa_alloc_device() in pds_vdpa_dev_add().  The first group holds the
+ * per-virtqueue callbacks, the second group the device-wide ones.
+ */
+static const struct vdpa_config_ops pds_vdpa_ops = {
+ .set_vq_address = pds_vdpa_set_vq_address,
+ .set_vq_num = pds_vdpa_set_vq_num,
+ .kick_vq = pds_vdpa_kick_vq,
+ .set_vq_cb = pds_vdpa_set_vq_cb,
+ .set_vq_ready = pds_vdpa_set_vq_ready,
+ .get_vq_ready = pds_vdpa_get_vq_ready,
+ .set_vq_state = pds_vdpa_set_vq_state,
+ .get_vq_state = pds_vdpa_get_vq_state,
+ .get_vq_notification = pds_vdpa_get_vq_notification,
+ .get_vq_irq = pds_vdpa_get_vq_irq,
+ .get_vq_align = pds_vdpa_get_vq_align,
+ .get_vq_group = pds_vdpa_get_vq_group,
+
+ .get_device_features = pds_vdpa_get_device_features,
+ .set_driver_features = pds_vdpa_set_driver_features,
+ .get_driver_features = pds_vdpa_get_driver_features,
+ .set_config_cb = pds_vdpa_set_config_cb,
+ .get_vq_num_max = pds_vdpa_get_vq_num_max,
+ .get_device_id = pds_vdpa_get_device_id,
+ .get_vendor_id = pds_vdpa_get_vendor_id,
+ .get_status = pds_vdpa_get_status,
+ .set_status = pds_vdpa_set_status,
+ .reset = pds_vdpa_reset,
+ .get_config_size = pds_vdpa_get_config_size,
+ .get_config = pds_vdpa_get_config,
+ .set_config = pds_vdpa_set_config,
+};
+/* Virtio device ids served by this management device: virtio-net from
+ * any vendor.  The zero entry terminates the table.
+ */
+static struct virtio_device_id pds_vdpa_id_table[] = {
+ {VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID},
+ {0},
+};
+
+/* Create the single vDPA device of this VF (vdpa_mgmtdev_ops.dev_add).
+ *
+ * @mdev:       management device embedded in struct pds_vdpa_aux
+ * @name:       name for the new vDPA device
+ * @add_config: optional user settings (features, max vq pairs, MAC)
+ *
+ * Steps: allocate the vdpa device, validate the requested features, reset
+ * and init the hardware, size the queue set, choose and program a MAC,
+ * map the notify area of every vq, register for PDS events and finally
+ * register on the vDPA bus.
+ *
+ * The queue count is computed in a 32-bit local and range-checked before
+ * it is stored in the 8-bit pdsv->num_vqs, so an oversized user-supplied
+ * max_vq_pairs is rejected instead of silently wrapping.
+ *
+ * Returns 0 on success or a negative errno; on failure the device
+ * reference is dropped and vdpa_aux->pdsv is cleared.
+ */
+static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
+			    const struct vdpa_dev_set_config *add_config)
+{
+	struct pds_vdpa_aux *vdpa_aux;
+	struct pds_vdpa_device *pdsv;
+	struct vdpa_mgmt_dev *mgmt;
+	u16 fw_max_vqs, vq_pairs;
+	struct device *dma_dev;
+	struct pci_dev *pdev;
+	struct device *dev;
+	u32 num_vqs;
+	int err;
+	int i;
+
+	vdpa_aux = container_of(mdev, struct pds_vdpa_aux, vdpa_mdev);
+	dev = &vdpa_aux->padev->aux_dev.dev;
+	mgmt = &vdpa_aux->vdpa_mdev;
+
+	if (vdpa_aux->pdsv) {
+		dev_warn(dev, "Multiple vDPA devices on a VF is not supported.\n");
+		return -EOPNOTSUPP;
+	}
+
+	pdsv = vdpa_alloc_device(struct pds_vdpa_device, vdpa_dev,
+				 dev, &pds_vdpa_ops, 1, 1, name, false);
+	if (IS_ERR(pdsv)) {
+		dev_err(dev, "Failed to allocate vDPA structure: %pe\n", pdsv);
+		return PTR_ERR(pdsv);
+	}
+
+	vdpa_aux->pdsv = pdsv;
+	pdsv->vdpa_aux = vdpa_aux;
+
+	pdev = vdpa_aux->padev->vf_pdev;
+	dma_dev = &pdev->dev;
+	pdsv->vdpa_dev.dma_dev = dma_dev;
+
+	pdsv->supported_features = mgmt->supported_features;
+
+	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
+		u64 unsupp_features =
+			add_config->device_features & ~pdsv->supported_features;
+
+		if (unsupp_features) {
+			dev_err(dev, "Unsupported features: %#llx\n", unsupp_features);
+			err = -EOPNOTSUPP;
+			goto err_unmap;
+		}
+
+		pdsv->supported_features = add_config->device_features;
+	}
+
+	err = pds_vdpa_cmd_reset(pdsv);
+	if (err) {
+		dev_err(dev, "Failed to reset hw: %pe\n", ERR_PTR(err));
+		goto err_unmap;
+	}
+
+	err = pds_vdpa_init_hw(pdsv);
+	if (err) {
+		dev_err(dev, "Failed to init hw: %pe\n", ERR_PTR(err));
+		goto err_unmap;
+	}
+
+	fw_max_vqs = le16_to_cpu(pdsv->vdpa_aux->ident.max_vqs);
+	vq_pairs = fw_max_vqs / 2;
+
+	/* Make sure we have the queues being requested */
+	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP))
+		vq_pairs = add_config->net.max_vq_pairs;
+
+	/* Compute in a wide type: 2 * vq_pairs may not fit pdsv->num_vqs (u8) */
+	num_vqs = 2 * (u32)vq_pairs;
+	if (pdsv->supported_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ))
+		num_vqs++;
+
+	if (num_vqs > fw_max_vqs) {
+		dev_err(dev, "%s: queue count requested %u greater than max %u\n",
+			__func__, num_vqs, fw_max_vqs);
+		err = -ENOSPC;
+		goto err_unmap;
+	}
+	pdsv->num_vqs = num_vqs;
+
+	if (pdsv->num_vqs != fw_max_vqs) {
+		err = pds_vdpa_cmd_set_max_vq_pairs(pdsv, vq_pairs);
+		if (err) {
+			dev_err(dev, "Failed to set max_vq_pairs: %pe\n",
+				ERR_PTR(err));
+			goto err_unmap;
+		}
+	}
+
+	/* Set a mac, either from the user config if provided
+	 * or use the device's mac if not 00:..:00
+	 * or set a random mac
+	 */
+	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
+		ether_addr_copy(pdsv->mac, add_config->net.mac);
+	} else {
+		struct virtio_net_config __iomem *vc;
+
+		vc = pdsv->vdpa_aux->vd_mdev.device;
+		memcpy_fromio(pdsv->mac, vc->mac, sizeof(pdsv->mac));
+		if (is_zero_ether_addr(pdsv->mac) &&
+		    (pdsv->supported_features & BIT_ULL(VIRTIO_NET_F_MAC))) {
+			eth_random_addr(pdsv->mac);
+			dev_info(dev, "setting random mac %pM\n", pdsv->mac);
+		}
+	}
+	pds_vdpa_cmd_set_mac(pdsv, pdsv->mac);
+
+	for (i = 0; i < pdsv->num_vqs; i++) {
+		void __iomem *notify;
+
+		notify = vp_modern_map_vq_notify(&pdsv->vdpa_aux->vd_mdev,
+						 i, &pdsv->vqs[i].notify_pa);
+		pds_vdpa_init_vqs_entry(pdsv, i, notify);
+	}
+
+	pdsv->vdpa_dev.mdev = &vdpa_aux->vdpa_mdev;
+
+	err = pds_vdpa_register_event_handler(pdsv);
+	if (err) {
+		dev_err(dev, "Failed to register for PDS events: %pe\n", ERR_PTR(err));
+		goto err_unmap;
+	}
+
+	/* We use the _vdpa_register_device() call rather than the
+	 * vdpa_register_device() to avoid a deadlock because our
+	 * dev_add() is called with the vdpa_dev_lock already set
+	 * by vdpa_nl_cmd_dev_add_set_doit()
+	 */
+	err = _vdpa_register_device(&pdsv->vdpa_dev, pdsv->num_vqs);
+	if (err) {
+		dev_err(dev, "Failed to register to vDPA bus: %pe\n", ERR_PTR(err));
+		goto err_unevent;
+	}
+
+	pds_vdpa_debugfs_add_vdpadev(vdpa_aux);
+
+	return 0;
+
+err_unevent:
+	pds_vdpa_unregister_event_handler(pdsv);
+err_unmap:
+	put_device(&pdsv->vdpa_dev.dev);
+	vdpa_aux->pdsv = NULL;
+	return err;
+}
+
+/* Tear down the vDPA device of this VF (vdpa_mgmtdev_ops.dev_del):
+ * stop PDS event delivery, unregister from the vDPA bus, reset the
+ * hardware, reset the debugfs view and forget the device pointer.
+ */
+static void pds_vdpa_dev_del(struct vdpa_mgmt_dev *mdev,
+			     struct vdpa_device *vdpa_dev)
+{
+	struct pds_vdpa_aux *vdpa_aux =
+		container_of(mdev, struct pds_vdpa_aux, vdpa_mdev);
+
+	pds_vdpa_unregister_event_handler(vdpa_to_pdsv(vdpa_dev));
+
+	_vdpa_unregister_device(vdpa_dev);
+
+	pds_vdpa_cmd_reset(vdpa_aux->pdsv);
+	pds_vdpa_debugfs_reset_vdpadev(vdpa_aux);
+
+	vdpa_aux->pdsv = NULL;
+
+	dev_info(&vdpa_aux->padev->aux_dev.dev, "Removed vdpa device\n");
+}
+
+/* Management-device callbacks, installed in pds_vdpa_get_mgmt_info(). */
+static const struct vdpa_mgmtdev_ops pds_vdpa_mgmt_dev_ops = {
+ .dev_add = pds_vdpa_dev_add,
+ .dev_del = pds_vdpa_dev_del
+};
+
+/* Query the firmware identity block for this VF and fill in the vDPA
+ * management device description (ops, id table, supported features,
+ * configurable attributes and max_supported_vqs).
+ *
+ * max_supported_vqs is the smallest of the firmware's max_vqs, the VF's
+ * MSI-X vector count and PDS_VDPA_MAX_QUEUES.
+ *
+ * Returns 0 on success, -ENOMEM if the identity buffer cannot be DMA
+ * mapped, or the error from the adminq command.
+ */
+int pds_vdpa_get_mgmt_info(struct pds_vdpa_aux *vdpa_aux)
+{
+	struct device *dev = &vdpa_aux->padev->aux_dev.dev;
+	struct pci_dev *vf_pdev = vdpa_aux->padev->vf_pdev;
+	struct vdpa_mgmt_dev *mgmt = &vdpa_aux->vdpa_mdev;
+	union pds_core_adminq_comp comp = {};
+	union pds_core_adminq_cmd cmd = {
+		.vdpa_ident.opcode = PDS_VDPA_CMD_IDENT,
+		.vdpa_ident.vf_id = cpu_to_le16(vdpa_aux->vf_id),
+	};
+	struct device *pf_dev;
+	dma_addr_t ident_pa;
+	u16 msix_count;
+	u16 fw_vqs;
+	int err;
+
+	/* Get resource info through the PF's adminq.  It is a block of info,
+	 * so we need to map some memory for PF to make available to the
+	 * firmware for writing the data.
+	 */
+	pf_dev = &pci_physfn(vf_pdev)->dev;
+	ident_pa = dma_map_single(pf_dev, &vdpa_aux->ident,
+				  sizeof(vdpa_aux->ident), DMA_FROM_DEVICE);
+	if (dma_mapping_error(pf_dev, ident_pa)) {
+		dev_err(dev, "Failed to map ident space\n");
+		return -ENOMEM;
+	}
+
+	cmd.vdpa_ident.ident_pa = cpu_to_le64(ident_pa);
+	cmd.vdpa_ident.len = cpu_to_le32(sizeof(vdpa_aux->ident));
+	err = pds_client_adminq_cmd(vdpa_aux->padev, &cmd,
+				    sizeof(cmd.vdpa_ident), &comp, 0);
+	/* the mapping is dropped whether or not the command succeeded */
+	dma_unmap_single(pf_dev, ident_pa,
+			 sizeof(vdpa_aux->ident), DMA_FROM_DEVICE);
+	if (err) {
+		dev_err(dev, "Failed to ident hw, status %d: %pe\n",
+			comp.status, ERR_PTR(err));
+		return err;
+	}
+
+	fw_vqs = le16_to_cpu(vdpa_aux->ident.max_vqs);
+	msix_count = pci_msix_vec_count(vf_pdev);
+	dev_dbg(dev, "ident.max_vqs %d dev_intrs %d\n", fw_vqs, msix_count);
+
+	fw_vqs = min_t(u16, msix_count, fw_vqs);
+	mgmt->max_supported_vqs = min_t(u16, PDS_VDPA_MAX_QUEUES, fw_vqs);
+	vdpa_aux->nintrs = 0;
+
+	mgmt->ops = &pds_vdpa_mgmt_dev_ops;
+	mgmt->id_table = pds_vdpa_id_table;
+	mgmt->device = dev;
+
+	/* advertise F_MAC even if the device doesn't */
+	mgmt->supported_features = le64_to_cpu(vdpa_aux->ident.hw_features) |
+				   BIT_ULL(VIRTIO_NET_F_MAC);
+
+	mgmt->config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) |
+				 BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP) |
+				 BIT_ULL(VDPA_ATTR_DEV_FEATURES);
+
+	return 0;
+}
diff --git a/drivers/vdpa/pds/vdpa_dev.h b/drivers/vdpa/pds/vdpa_dev.h
new file mode 100644
index 0000000000..d984ba24a7
--- /dev/null
+++ b/drivers/vdpa/pds/vdpa_dev.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#ifndef _VDPA_DEV_H_
+#define _VDPA_DEV_H_
+
+#include <linux/pci.h>
+#include <linux/vdpa.h>
+
+/* Per-virtqueue bookkeeping; one entry per slot of pds_vdpa_device.vqs[]. */
+struct pds_vdpa_vq_info {
+ /* ring position may only be queried while this is false */
+ bool ready;
+ /* ring addresses and length; presumably filled by the set_vq_address /
+  * set_vq_num callbacks -- TODO confirm against vdpa_dev.c
+  */
+ u64 desc_addr;
+ u64 avail_addr;
+ u64 used_addr;
+ u32 q_len;
+ /* index of this entry in pds_vdpa_device.vqs[] */
+ u16 qid;
+ /* irq number from pci_irq_vector(), VIRTIO_MSI_NO_VECTOR after init */
+ int irq;
+ /* name passed to request_irq(), formatted as "vdpa-<dev>-<qid>" */
+ char irq_name[32];
+
+ /* notify (doorbell) mapping and its physical address, both obtained
+  * from vp_modern_map_vq_notify()
+  */
+ void __iomem *notify;
+ dma_addr_t notify_pa;
+
+ u64 doorbell;
+ /* saved ring indexes; for packed rings kept with bit 15 inverted,
+  * see PDS_VDPA_PACKED_INVERT_IDX
+  */
+ u16 avail_idx;
+ u16 used_idx;
+
+ struct vdpa_callback event_cb;
+ /* back-pointer to the owning device */
+ struct pds_vdpa_device *pdsv;
+};
+
+/* Size of the vqs[] array below; also caps max_supported_vqs. */
+#define PDS_VDPA_MAX_QUEUES 65
+/* NOTE(review): presumably the largest supported queue length; no user is
+ * visible in this header -- confirm.
+ */
+#define PDS_VDPA_MAX_QLEN 32768
+/* Driver state of one pds vDPA device.  vdpa_dev is the embedded generic
+ * vDPA device that vdpa_alloc_device() allocates this structure around.
+ */
+struct pds_vdpa_device {
+ struct vdpa_device vdpa_dev;
+ struct pds_vdpa_aux *vdpa_aux;
+
+ struct pds_vdpa_vq_info vqs[PDS_VDPA_MAX_QUEUES];
+ u64 supported_features; /* supported device features */
+ u64 negotiated_features; /* negotiated features */
+ u8 vdpa_index; /* rsvd for future subdevice use */
+ u8 num_vqs; /* num vqs in use */
+ u8 mac[ETH_ALEN]; /* mac selected when the device was added */
+ struct vdpa_callback config_cb;
+ struct notifier_block nb;
+};
+
+/* Bit 15 (the packed-ring wrap counter position) is XORed with this mask
+ * whenever an avail/used index is stored or reported for a packed ring.
+ */
+#define PDS_VDPA_PACKED_INVERT_IDX 0x8000
+
+/* Fill in vdpa_aux->vdpa_mdev from the firmware identity data. */
+int pds_vdpa_get_mgmt_info(struct pds_vdpa_aux *vdpa_aux);
+#endif /* _VDPA_DEV_H_ */
diff --git a/drivers/vdpa/solidrun/Makefile b/drivers/vdpa/solidrun/Makefile
new file mode 100644
index 0000000000..9116252cd5
--- /dev/null
+++ b/drivers/vdpa/solidrun/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_SNET_VDPA) += snet_vdpa.o
+snet_vdpa-$(CONFIG_SNET_VDPA) += snet_main.o
+snet_vdpa-$(CONFIG_SNET_VDPA) += snet_ctrl.o
+ifdef CONFIG_HWMON
+snet_vdpa-$(CONFIG_SNET_VDPA) += snet_hwmon.o
+endif
diff --git a/drivers/vdpa/solidrun/snet_ctrl.c b/drivers/vdpa/solidrun/snet_ctrl.c
new file mode 100644
index 0000000000..3cef2571d1
--- /dev/null
+++ b/drivers/vdpa/solidrun/snet_ctrl.c
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * SolidRun DPU driver for control plane
+ *
+ * Copyright (C) 2022-2023 SolidRun
+ *
+ * Author: Alvaro Karsz <alvaro.karsz@solid-run.com>
+ *
+ */
+
+#include <linux/iopoll.h>
+
+#include "snet_vdpa.h"
+
+/* Opcodes written to the low 16 bits of the opcode register.
+ * Numbering starts at 1; an opcode register value of 0 means "empty".
+ */
+enum snet_ctrl_opcodes {
+ SNET_CTRL_OP_DESTROY = 1,
+ SNET_CTRL_OP_READ_VQ_STATE,
+ SNET_CTRL_OP_SUSPEND,
+ SNET_CTRL_OP_RESUME,
+};
+
+/* Timeout handed to readx_poll_timeout() for every control-plane wait */
+#define SNET_CTRL_TIMEOUT 2000000
+
+/* Field masks of the 32-bit control register */
+#define SNET_CTRL_DATA_SIZE_MASK 0x0000FFFF
+#define SNET_CTRL_IN_PROCESS_MASK 0x00010000
+#define SNET_CTRL_CHUNK_RDY_MASK 0x00020000
+#define SNET_CTRL_ERROR_MASK 0x0FFC0000
+
+/* Extract the error field (bits 18..27) and negate it */
+#define SNET_VAL_TO_ERR(val) (-(((val) & SNET_CTRL_ERROR_MASK) >> 18))
+/* Control register is free: an error is latched or nothing is in process */
+#define SNET_EMPTY_CTRL(val) (((val) & SNET_CTRL_ERROR_MASK) || \
+ !((val) & SNET_CTRL_IN_PROCESS_MASK))
+/* Something to consume: an error is latched or a chunk is ready */
+#define SNET_DATA_READY(val) ((val) & (SNET_CTRL_ERROR_MASK | SNET_CTRL_CHUNK_RDY_MASK))
+
+/* Control register used to read data from the DPU.
+ * The code in this file accesses the register as one 32-bit word through
+ * the SNET_CTRL_*_MASK macros; this structure describes the same layout.
+ */
+struct snet_ctrl_reg_ctrl {
+ /* Chunk size in 4B words */
+ u16 data_size;
+ /* We are in the middle of a command */
+ u16 in_process:1;
+ /* A data chunk is ready and can be consumed */
+ u16 chunk_ready:1;
+ /* Error code */
+ u16 error:10;
+ /* Reserved for future use */
+ u16 rsvd:4;
+};
+
+/* Opcode register.
+ * Written as one 32-bit word: opcode | (vq_idx << 16).
+ */
+struct snet_ctrl_reg_op {
+ u16 opcode;
+ /* Only if VQ index is relevant for the command */
+ u16 vq_idx;
+};
+
+/* Layout of the control area located at cfg.ctrl_off inside the BAR:
+ * opcode register, control register, a reserved word, then the data
+ * words that are read back chunk by chunk.
+ */
+struct snet_ctrl_regs {
+ struct snet_ctrl_reg_op op;
+ struct snet_ctrl_reg_ctrl ctrl;
+ u32 rsvd;
+ u32 data[];
+};
+
+/* Return the address of the control area: BAR base plus cfg.ctrl_off. */
+static struct snet_ctrl_regs __iomem *snet_get_ctrl(struct snet *snet)
+{
+ return snet->bar + snet->psnet->cfg.ctrl_off;
+}
+
+static int snet_wait_for_empty_ctrl(struct snet_ctrl_regs __iomem *regs)
+{
+ u32 val;
+
+ return readx_poll_timeout(ioread32, &regs->ctrl, val, SNET_EMPTY_CTRL(val), 10,
+ SNET_CTRL_TIMEOUT);
+}
+
+/* Poll the opcode register until it reads back as zero.  Returns the
+ * result of readx_poll_timeout().
+ */
+static int snet_wait_for_empty_op(struct snet_ctrl_regs __iomem *regs)
+{
+	u32 op;
+
+	return readx_poll_timeout(ioread32, &regs->op, op,
+				  !op, 10, SNET_CTRL_TIMEOUT);
+}
+
+/* Poll the control register until SNET_DATA_READY() holds, i.e. a chunk
+ * is ready or an error is latched.  Returns the result of
+ * readx_poll_timeout().
+ */
+static int snet_wait_for_data(struct snet_ctrl_regs __iomem *regs)
+{
+	u32 ctrl;
+
+	return readx_poll_timeout(ioread32, &regs->ctrl, ctrl,
+				  SNET_DATA_READY(ctrl), 10,
+				  SNET_CTRL_TIMEOUT);
+}
+
+/* Read data word number @word_idx of the control area. */
+static u32 snet_read32_word(struct snet_ctrl_regs __iomem *ctrl_regs, u16 word_idx)
+{
+ return ioread32(&ctrl_regs->data[word_idx]);
+}
+
+/* Read the control register as a single 32-bit word. */
+static u32 snet_read_ctrl(struct snet_ctrl_regs __iomem *ctrl_regs)
+{
+ return ioread32(&ctrl_regs->ctrl);
+}
+
+/* Write the control register as a single 32-bit word. */
+static void snet_write_ctrl(struct snet_ctrl_regs __iomem *ctrl_regs, u32 val)
+{
+ iowrite32(val, &ctrl_regs->ctrl);
+}
+
+/* Write the opcode register as a single 32-bit word. */
+static void snet_write_op(struct snet_ctrl_regs __iomem *ctrl_regs, u32 val)
+{
+ iowrite32(val, &ctrl_regs->op);
+}
+
+/* Wait for the DPU to finish the current command.
+ * Returns the result of snet_wait_for_empty_op().
+ */
+static int snet_wait_for_dpu_completion(struct snet_ctrl_regs __iomem *ctrl_regs)
+{
+ /* Wait until the DPU finishes completely.
+ * It will clear the opcode register.
+ */
+ return snet_wait_for_empty_op(ctrl_regs);
+}
+
+/* Reading ctrl from the DPU:
+ * buf_size must be 4B aligned
+ *
+ * Steps:
+ *
+ * (1) Verify that the DPU is not in the middle of another operation by
+ * reading the in_process and error bits in the control register.
+ * (2) Write the request opcode and the VQ idx in the opcode register
+ * and write the buffer size in the control register.
+ * (3) Start reading chunks of data, chunk_ready bit indicates that a
+ * data chunk is available, we signal that we read the data by clearing the bit.
+ * (4) Detect that the transfer is completed when the in_process bit
+ * in the control register is cleared or when an error appears.
+ *
+ * Returns 0 on success, -EOPNOTSUPP for config versions below 2, -EINVAL
+ * for an unaligned buf_size, the negated DPU error code, or the error of
+ * a timed-out wait.  The whole exchange runs under snet->ctrl_lock.
+ */
+static int snet_ctrl_read_from_dpu(struct snet *snet, u16 opcode, u16 vq_idx, void *buffer,
+ u32 buf_size)
+{
+ struct pci_dev *pdev = snet->pdev;
+ struct snet_ctrl_regs __iomem *regs = snet_get_ctrl(snet);
+ u32 *bfr_ptr = (u32 *)buffer;
+ u32 val;
+ u16 buf_words;
+ int ret;
+ u16 words, i, tot_words = 0;
+
+ /* Supported for config 2+ */
+ if (!SNET_CFG_VER(snet, 2))
+ return -EOPNOTSUPP;
+
+ if (!IS_ALIGNED(buf_size, 4))
+ return -EINVAL;
+
+ mutex_lock(&snet->ctrl_lock);
+
+ buf_words = buf_size / 4;
+
+ /* Make sure control register is empty */
+ ret = snet_wait_for_empty_ctrl(regs);
+ if (ret) {
+ SNET_WARN(pdev, "Timeout waiting for previous control data to be consumed\n");
+ goto exit;
+ }
+
+ /* We need to write the buffer size in the control register, and the opcode + vq index in
+ * the opcode register.
+ * We use a spinlock to serialize the writes.
+ */
+ spin_lock(&snet->ctrl_spinlock);
+
+ snet_write_ctrl(regs, buf_words);
+ snet_write_op(regs, opcode | (vq_idx << 16));
+
+ spin_unlock(&snet->ctrl_spinlock);
+
+ while (buf_words != tot_words) {
+ ret = snet_wait_for_data(regs);
+ if (ret) {
+ SNET_WARN(pdev, "Timeout waiting for control data\n");
+ goto exit;
+ }
+
+ val = snet_read_ctrl(regs);
+
+ /* Error? */
+ if (val & SNET_CTRL_ERROR_MASK) {
+ ret = SNET_VAL_TO_ERR(val);
+ SNET_WARN(pdev, "Error while reading control data from DPU, err %d\n", ret);
+ goto exit;
+ }
+
+ /* Never copy more than the space left in the caller's buffer */
+ words = min_t(u16, val & SNET_CTRL_DATA_SIZE_MASK, buf_words - tot_words);
+
+ /* Each chunk is read starting from data word 0 */
+ for (i = 0; i < words; i++) {
+ *bfr_ptr = snet_read32_word(regs, i);
+ bfr_ptr++;
+ }
+
+ tot_words += words;
+
+ /* Is the job completed? */
+ if (!(val & SNET_CTRL_IN_PROCESS_MASK))
+ break;
+
+ /* Clear the chunk ready bit and continue */
+ val &= ~SNET_CTRL_CHUNK_RDY_MASK;
+ snet_write_ctrl(regs, val);
+ }
+
+ ret = snet_wait_for_dpu_completion(regs);
+ if (ret)
+ SNET_WARN(pdev, "Timeout waiting for the DPU to complete a control command\n");
+
+exit:
+ mutex_unlock(&snet->ctrl_lock);
+ return ret;
+}
+
+/* Legacy control path for config version 1.
+ *
+ * Only the opcode register is involved: wait for it to be empty, write
+ * @opcode, then wait for the DPU to clear it again as acknowledgment.
+ * Runs under snet->ctrl_lock.  Returns 0 or the error of a timed-out
+ * wait.
+ */
+static int snet_send_ctrl_msg_old(struct snet *snet, u32 opcode)
+{
+	struct snet_ctrl_regs __iomem *regs = snet_get_ctrl(snet);
+	struct pci_dev *pdev = snet->pdev;
+	int ret;
+
+	mutex_lock(&snet->ctrl_lock);
+
+	/* The DPU must have consumed any previous message before we write */
+	ret = snet_wait_for_empty_op(regs);
+	if (ret) {
+		SNET_WARN(pdev, "Timeout waiting for previous control message to be ACKed\n");
+	} else {
+		/* Write the message */
+		snet_write_op(regs, opcode);
+
+		/* DPU ACKs the message by clearing the opcode register */
+		ret = snet_wait_for_empty_op(regs);
+		if (ret)
+			SNET_WARN(pdev, "Timeout waiting for a control message to be ACKed\n");
+	}
+
+	mutex_unlock(&snet->ctrl_lock);
+	return ret;
+}
+
+/* Send a control message to the DPU.
+ * A control message is a message without payload.
+ *
+ * Config versions below 2 are handled by snet_send_ctrl_msg_old().
+ * Otherwise: wait for an empty control register, clear it and write
+ * opcode | (vq_idx << 16) to the opcode register, wait for the DPU's
+ * acknowledgment, decode the error field, clear the chunk ready bit and
+ * wait for the DPU to clear the opcode register.
+ *
+ * Returns 0 on success, the negated DPU error code when the DPU reported
+ * one, or the error of a timed-out wait.  An error reported by the DPU
+ * takes precedence over a later completion timeout, so it is no longer
+ * overwritten by the result of the final wait.
+ */
+static int snet_send_ctrl_msg(struct snet *snet, u16 opcode, u16 vq_idx)
+{
+	struct pci_dev *pdev = snet->pdev;
+	struct snet_ctrl_regs __iomem *regs = snet_get_ctrl(snet);
+	u32 val;
+	int ret;
+	int rc;
+
+	/* If config version is not 2+, use the old mechanism */
+	if (!SNET_CFG_VER(snet, 2))
+		return snet_send_ctrl_msg_old(snet, opcode);
+
+	mutex_lock(&snet->ctrl_lock);
+
+	/* Make sure control register is empty */
+	ret = snet_wait_for_empty_ctrl(regs);
+	if (ret) {
+		SNET_WARN(pdev, "Timeout waiting for previous control data to be consumed\n");
+		goto exit;
+	}
+
+	/* We need to clear the control register and write the opcode + vq index in the opcode
+	 * register.
+	 * We use a spinlock to serialize the writes.
+	 */
+	spin_lock(&snet->ctrl_spinlock);
+
+	snet_write_ctrl(regs, 0);
+	snet_write_op(regs, opcode | (vq_idx << 16));
+
+	spin_unlock(&snet->ctrl_spinlock);
+
+	/* The DPU ACKs control messages by setting the chunk ready bit
+	 * without data.
+	 */
+	ret = snet_wait_for_data(regs);
+	if (ret) {
+		SNET_WARN(pdev, "Timeout waiting for control message to be ACKed\n");
+		goto exit;
+	}
+
+	/* Check for errors */
+	val = snet_read_ctrl(regs);
+	ret = SNET_VAL_TO_ERR(val);
+
+	/* Clear the chunk ready bit */
+	val &= ~SNET_CTRL_CHUNK_RDY_MASK;
+	snet_write_ctrl(regs, val);
+
+	rc = snet_wait_for_dpu_completion(regs);
+	if (rc) {
+		SNET_WARN(pdev, "Timeout waiting for DPU to complete a control command, err %d\n",
+			  rc);
+		/* keep the DPU's own error code if it reported one */
+		if (!ret)
+			ret = rc;
+	}
+
+exit:
+	mutex_unlock(&snet->ctrl_lock);
+	return ret;
+}
+
+/* Write 0 to the opcode register.  No lock is taken here. */
+void snet_ctrl_clear(struct snet *snet)
+{
+ struct snet_ctrl_regs __iomem *regs = snet_get_ctrl(snet);
+
+ snet_write_op(regs, 0);
+}
+
+/* Send SNET_CTRL_OP_DESTROY to the DPU; returns snet_send_ctrl_msg()'s result. */
+int snet_destroy_dev(struct snet *snet)
+{
+ return snet_send_ctrl_msg(snet, SNET_CTRL_OP_DESTROY, 0);
+}
+
+/* Read the state of virtqueue @idx from the DPU straight into @state,
+ * using SNET_CTRL_OP_READ_VQ_STATE and sizeof(*state) as buffer size.
+ */
+int snet_read_vq_state(struct snet *snet, u16 idx, struct vdpa_vq_state *state)
+{
+ return snet_ctrl_read_from_dpu(snet, SNET_CTRL_OP_READ_VQ_STATE, idx, state,
+ sizeof(*state));
+}
+
+/* Send SNET_CTRL_OP_SUSPEND to the DPU; returns snet_send_ctrl_msg()'s result. */
+int snet_suspend_dev(struct snet *snet)
+{
+ return snet_send_ctrl_msg(snet, SNET_CTRL_OP_SUSPEND, 0);
+}
+
+/* Send SNET_CTRL_OP_RESUME to the DPU; returns snet_send_ctrl_msg()'s result. */
+int snet_resume_dev(struct snet *snet)
+{
+ return snet_send_ctrl_msg(snet, SNET_CTRL_OP_RESUME, 0);
+}
diff --git a/drivers/vdpa/solidrun/snet_hwmon.c b/drivers/vdpa/solidrun/snet_hwmon.c
new file mode 100644
index 0000000000..af531a3390
--- /dev/null
+++ b/drivers/vdpa/solidrun/snet_hwmon.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * SolidRun DPU driver for control plane
+ *
+ * Copyright (C) 2022-2023 SolidRun
+ *
+ * Author: Alvaro Karsz <alvaro.karsz@solid-run.com>
+ *
+ */
+#include <linux/hwmon.h>
+
+#include "snet_vdpa.h"
+
+/* Monitor offsets.
+ * Each one is added to cfg.hwmon_off and read with psnet_read64(), so
+ * consecutive sensors are 8 bytes apart.
+ */
+#define SNET_MON_TMP0_IN_OFF 0x00
+#define SNET_MON_TMP0_MAX_OFF 0x08
+#define SNET_MON_TMP0_CRIT_OFF 0x10
+#define SNET_MON_TMP1_IN_OFF 0x18
+#define SNET_MON_TMP1_CRIT_OFF 0x20
+#define SNET_MON_CURR_IN_OFF 0x28
+#define SNET_MON_CURR_MAX_OFF 0x30
+#define SNET_MON_CURR_CRIT_OFF 0x38
+#define SNET_MON_PWR_IN_OFF 0x40
+#define SNET_MON_VOLT_IN_OFF 0x48
+#define SNET_MON_VOLT_CRIT_OFF 0x50
+#define SNET_MON_VOLT_LCRIT_OFF 0x58
+
+/* Read the 64-bit monitor register at cfg.hwmon_off + @reg into *@out. */
+static void snet_hwmon_read_reg(struct psnet *psnet, u32 reg, long *out)
+{
+ *out = psnet_read64(psnet, psnet->cfg.hwmon_off + reg);
+}
+
+/* hwmon .is_visible callback: every attribute is reported as 0444. */
+static umode_t snet_howmon_is_visible(const void *data,
+ enum hwmon_sensor_types type,
+ u32 attr, int channel)
+{
+ return 0444;
+}
+
+/* hwmon .read callback.
+ *
+ * Maps (type, attr, channel) to a monitor register offset and reads that
+ * register into *@val.  For temperature attributes channel 0 selects the
+ * TMP0 registers and any other channel the TMP1 registers; TMP1 has no
+ * "max" register.
+ *
+ * Returns 0 on success or -EOPNOTSUPP for an unknown combination, in
+ * which case *@val is left untouched.
+ */
+static int snet_howmon_read(struct device *dev, enum hwmon_sensor_types type,
+			    u32 attr, int channel, long *val)
+{
+	struct psnet *psnet = dev_get_drvdata(dev);
+	bool chan0 = (channel == 0);
+	int reg = -1;
+
+	switch (type) {
+	case hwmon_in:
+		if (attr == hwmon_in_lcrit)
+			reg = SNET_MON_VOLT_LCRIT_OFF;
+		else if (attr == hwmon_in_crit)
+			reg = SNET_MON_VOLT_CRIT_OFF;
+		else if (attr == hwmon_in_input)
+			reg = SNET_MON_VOLT_IN_OFF;
+		break;
+
+	case hwmon_power:
+		if (attr == hwmon_power_input)
+			reg = SNET_MON_PWR_IN_OFF;
+		break;
+
+	case hwmon_curr:
+		if (attr == hwmon_curr_input)
+			reg = SNET_MON_CURR_IN_OFF;
+		else if (attr == hwmon_curr_max)
+			reg = SNET_MON_CURR_MAX_OFF;
+		else if (attr == hwmon_curr_crit)
+			reg = SNET_MON_CURR_CRIT_OFF;
+		break;
+
+	case hwmon_temp:
+		if (attr == hwmon_temp_input)
+			reg = chan0 ? SNET_MON_TMP0_IN_OFF : SNET_MON_TMP1_IN_OFF;
+		else if (attr == hwmon_temp_max && chan0)
+			reg = SNET_MON_TMP0_MAX_OFF;
+		else if (attr == hwmon_temp_crit)
+			reg = chan0 ? SNET_MON_TMP0_CRIT_OFF : SNET_MON_TMP1_CRIT_OFF;
+		break;
+
+	default:
+		break;
+	}
+
+	if (reg < 0)
+		return -EOPNOTSUPP;
+
+	snet_hwmon_read_reg(psnet, reg, val);
+	return 0;
+}
+
+/* hwmon .read_string callback: store the label for a sensor in *@str.
+ *
+ * The label depends only on the sensor type, except for temperature
+ * where channel 0 and the other channel have different names.
+ * Returns 0 or -EOPNOTSUPP for an unknown sensor type.
+ */
+static int snet_hwmon_read_string(struct device *dev,
+				  enum hwmon_sensor_types type, u32 attr,
+				  int channel, const char **str)
+{
+	switch (type) {
+	case hwmon_in:
+		*str = "main_vin";
+		return 0;
+	case hwmon_power:
+		*str = "soc_pin";
+		return 0;
+	case hwmon_curr:
+		*str = "soc_iin";
+		return 0;
+	case hwmon_temp:
+		*str = (channel == 0) ? "power_stage_temp" : "ic_junction_temp";
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+/* hwmon callbacks of the SNET monitor; all three are read-only paths. */
+static const struct hwmon_ops snet_hwmon_ops = {
+ .is_visible = snet_howmon_is_visible,
+ .read = snet_howmon_read,
+ .read_string = snet_hwmon_read_string
+};
+
+/* Sensor channels exposed through hwmon: two temperature channels (the
+ * second one without a "max" attribute), and one channel each for power,
+ * current and voltage.
+ */
+static const struct hwmon_channel_info * const snet_hwmon_info[] = {
+ HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_CRIT | HWMON_T_LABEL,
+ HWMON_T_INPUT | HWMON_T_CRIT | HWMON_T_LABEL),
+ HWMON_CHANNEL_INFO(power, HWMON_P_INPUT | HWMON_P_LABEL),
+ HWMON_CHANNEL_INFO(curr, HWMON_C_INPUT | HWMON_C_MAX | HWMON_C_CRIT | HWMON_C_LABEL),
+ HWMON_CHANNEL_INFO(in, HWMON_I_INPUT | HWMON_I_CRIT | HWMON_I_LCRIT | HWMON_I_LABEL),
+ NULL
+};
+
+/* Chip description passed to devm_hwmon_device_register_with_info(). */
+static const struct hwmon_chip_info snet_hwmono_info = {
+ .ops = &snet_hwmon_ops,
+ .info = snet_hwmon_info,
+};
+
+/* Register a hwmon device named "snet_<pci name>" for @pdev.
+ *
+ * The registration is device-managed (devm).  Failure is not fatal: it
+ * is only reported with a warning.
+ */
+void psnet_create_hwmon(struct pci_dev *pdev)
+{
+	struct psnet *psnet = pci_get_drvdata(pdev);
+	struct device *hwmon_dev;
+
+	snprintf(psnet->hwmon_name, SNET_NAME_SIZE, "snet_%s", pci_name(pdev));
+
+	hwmon_dev = devm_hwmon_device_register_with_info(&pdev->dev,
+							  psnet->hwmon_name,
+							  psnet,
+							  &snet_hwmono_info,
+							  NULL);
+	if (!IS_ERR(hwmon_dev))
+		return;
+
+	/* The monitor is not mandatory, Just alert user in case of an error */
+	SNET_WARN(pdev, "Failed to create SNET hwmon, error %ld\n", PTR_ERR(hwmon_dev));
+}
diff --git a/drivers/vdpa/solidrun/snet_main.c b/drivers/vdpa/solidrun/snet_main.c
new file mode 100644
index 0000000000..99428a0406
--- /dev/null
+++ b/drivers/vdpa/solidrun/snet_main.c
@@ -0,0 +1,1132 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * SolidRun DPU driver for control plane
+ *
+ * Copyright (C) 2022-2023 SolidRun
+ *
+ * Author: Alvaro Karsz <alvaro.karsz@solid-run.com>
+ *
+ */
+#include <linux/iopoll.h>
+
+#include "snet_vdpa.h"
+
+/* SNET DPU PCI device ID (also used as the subsystem device ID in the id table) */
+#define SNET_DEVICE_ID 0x1000
+/* SNET signature: read from the BAR to detect the device, written last to publish our config */
+#define SNET_SIGNATURE 0xD0D06363
+/* Max. config version that we can work with */
+#define SNET_CFG_VERSION 0x2
+/* Queue alignment reported to the vDPA bus */
+#define SNET_QUEUE_ALIGNMENT PAGE_SIZE
+/* Kick value to notify that new data is available */
+#define SNET_KICK_VAL 0x1
+/* Offset in the BAR where the SNET config starts */
+#define SNET_CONFIG_OFF 0x0
+/* How long we are willing to wait for a SNET device, in microseconds */
+#define SNET_DETECT_TIMEOUT 5000000
+/* How long should we wait for the DPU to read our config, in microseconds */
+#define SNET_READ_CFG_TIMEOUT 3000000
+/* Size in bytes of the configs written to the DPU: general part, then one entry per VQ */
+#define SNET_GENERAL_CFG_LEN 36
+#define SNET_GENERAL_CFG_VQ_LEN 40
+
+/* Get the snet device that embeds the given vdpa device */
+static struct snet *vdpa_to_snet(struct vdpa_device *vdpa)
+{
+	struct snet *snet = container_of(vdpa, struct snet, vdpa);
+
+	return snet;
+}
+
+/* Config change interrupt: forward it to the registered config callback */
+static irqreturn_t snet_cfg_irq_hndlr(int irq, void *data)
+{
+	struct snet *snet = data;
+
+	/* Nothing to call, just acknowledge the interrupt */
+	if (unlikely(!snet->cb.callback))
+		return IRQ_HANDLED;
+
+	return snet->cb.callback(snet->cb.private);
+}
+
+/* VQ interrupt: forward it to the callback registered for this VQ */
+static irqreturn_t snet_vq_irq_hndlr(int irq, void *data)
+{
+	struct snet_vq *vq = data;
+
+	/* Nothing to call, just acknowledge the interrupt */
+	if (unlikely(!vq->cb.callback))
+		return IRQ_HANDLED;
+
+	return vq->cb.callback(vq->cb.private);
+}
+
+/* Release the config IRQ and every VQ IRQ that is currently requested */
+static void snet_free_irqs(struct snet *snet)
+{
+	struct psnet *psnet = snet->psnet;
+	struct pci_dev *irq_pdev = snet->pdev;
+	struct snet_vq *vq;
+	u32 q;
+
+	/* Which device allocated the IRQs? The PF when SNET_CFG_FLAG_IRQ_PF is set */
+	if (PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_IRQ_PF))
+		irq_pdev = snet->pdev->physfn;
+
+	/* Config change IRQ, -1 means it is not requested */
+	if (snet->cfg_irq != -1) {
+		devm_free_irq(&irq_pdev->dev, snet->cfg_irq, snet);
+		snet->cfg_irq = -1;
+	}
+
+	/* VQ IRQs, skip VQs that do not exist or hold no IRQ */
+	for (q = 0; q < snet->cfg->vq_num; q++) {
+		vq = snet->vqs[q];
+		if (!vq || vq->irq == -1)
+			continue;
+
+		devm_free_irq(&irq_pdev->dev, vq->irq, vq);
+		vq->irq = -1;
+	}
+
+	/* IRQ vectors are freed when the pci remove callback is called */
+}
+
+/* Remember the ring addresses of a VQ, they are sent to the DPU later on */
+static int snet_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area,
+			       u64 driver_area, u64 device_area)
+{
+	struct snet_vq *vq = vdpa_to_snet(vdev)->vqs[idx];
+
+	/* Save the received parameters in the vqueue struct */
+	vq->desc_area = desc_area;
+	vq->driver_area = driver_area;
+	vq->device_area = device_area;
+
+	return 0;
+}
+
+/* Remember the size of a VQ */
+static void snet_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
+{
+	struct snet_vq *vq = vdpa_to_snet(vdev)->vqs[idx];
+
+	vq->num = num;
+}
+
+/* Notify the device that a VQ has new data; a VQ that is not ready is ignored */
+static void snet_kick_vq(struct vdpa_device *vdev, u16 idx)
+{
+	struct snet_vq *vq = vdpa_to_snet(vdev)->vqs[idx];
+
+	if (likely(vq->ready))
+		iowrite32(SNET_KICK_VAL, vq->kick_ptr);
+}
+
+/*
+ * Kick a VQ with notification data.
+ * The low 16 bits of @data select the VQ, the high 16 bits are passed to the
+ * device together with the kick value.
+ */
+static void snet_kick_vq_with_data(struct vdpa_device *vdev, u32 data)
+{
+	struct snet *snet = vdpa_to_snet(vdev);
+	u32 payload = data & 0xFFFF0000;
+	u16 idx = data & 0xFFFF;
+	struct snet_vq *vq = snet->vqs[idx];
+
+	/* not ready - ignore */
+	if (unlikely(!vq->ready))
+		return;
+
+	iowrite32(payload | SNET_KICK_VAL, vq->kick_ptr);
+}
+
+/* Register the callback (and its private data) invoked from the VQ interrupt */
+static void snet_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb)
+{
+	struct snet_vq *vq = vdpa_to_snet(vdev)->vqs[idx];
+
+	vq->cb.callback = cb->callback;
+	vq->cb.private = cb->private;
+}
+
+/* Mark a VQ as ready / not ready */
+static void snet_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
+{
+	struct snet_vq *vq = vdpa_to_snet(vdev)->vqs[idx];
+
+	vq->ready = ready;
+}
+
+/* Report whether a VQ was marked as ready */
+static bool snet_get_vq_ready(struct vdpa_device *vdev, u16 idx)
+{
+	struct snet_vq *vq = vdpa_to_snet(vdev)->vqs[idx];
+
+	return vq->ready;
+}
+
+/*
+ * Check whether @state is the initial VQ state.
+ * Packed ring: both wrap counters are 1 and both indexes are 0.
+ * Split ring: the available index is 0.
+ */
+static bool snet_vq_state_is_initial(struct snet *snet, const struct vdpa_vq_state *state)
+{
+	if (SNET_HAS_FEATURE(snet, VIRTIO_F_RING_PACKED)) {
+		const struct vdpa_vq_state_packed *packed = &state->packed;
+
+		return packed->last_avail_counter == 1 &&
+		       packed->last_used_counter == 1 &&
+		       packed->last_avail_idx == 0 &&
+		       packed->last_used_idx == 0;
+	}
+
+	return state->split.avail_index == 0;
+}
+
+/*
+ * Set the state of a VQ.
+ * Returns 0 on success, -EOPNOTSUPP when the negotiated config version can't
+ * carry the requested state.
+ */
+static int snet_set_vq_state(struct vdpa_device *vdev, u16 idx, const struct vdpa_vq_state *state)
+{
+	struct snet *snet = vdpa_to_snet(vdev);
+
+	/* Older config - we can't set the VQ state.
+	 * Accept the request only if this is the initial state we use in the DPU.
+	 */
+	if (!SNET_CFG_VER(snet, 2))
+		return snet_vq_state_is_initial(snet, state) ? 0 : -EOPNOTSUPP;
+
+	/* We can set any state for config version 2+ */
+	memcpy(&snet->vqs[idx]->vq_state, state, sizeof(*state));
+
+	return 0;
+}
+
+/* Read the state of a VQ through snet_read_vq_state() */
+static int snet_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state)
+{
+	return snet_read_vq_state(vdpa_to_snet(vdev), idx, state);
+}
+
+/* Report the IRQ number stored for a VQ */
+static int snet_get_vq_irq(struct vdpa_device *vdev, u16 idx)
+{
+	struct snet_vq *vq = vdpa_to_snet(vdev)->vqs[idx];
+
+	return vq->irq;
+}
+
+/* Report the alignment required for the VQs */
+static u32 snet_get_vq_align(struct vdpa_device *vdev)
+{
+	u32 align = (u32)SNET_QUEUE_ALIGNMENT;
+
+	return align;
+}
+
+/*
+ * Reset a SNET device: destroy it in the DPU if it was started, clear the
+ * VQs and the callbacks, free the IRQs and clear the status.
+ * Always returns 0, a failure to destroy the device is only logged.
+ */
+static int snet_reset_dev(struct snet *snet)
+{
+	struct pci_dev *pdev = snet->pdev;
+	struct snet_vq *vq;
+	int err = 0;
+	u32 q;
+
+	/* If status is 0, nothing to do */
+	if (!snet->status)
+		return 0;
+
+	/* If DPU started, destroy it */
+	if (snet->status & VIRTIO_CONFIG_S_DRIVER_OK)
+		err = snet_destroy_dev(snet);
+
+	/* Clear VQs */
+	for (q = 0; q < snet->cfg->vq_num; q++) {
+		vq = snet->vqs[q];
+		if (!vq)
+			continue;
+
+		vq->cb.callback = NULL;
+		vq->cb.private = NULL;
+		vq->desc_area = 0;
+		vq->device_area = 0;
+		vq->driver_area = 0;
+		vq->ready = false;
+	}
+
+	/* Clear config callback */
+	snet->cb.callback = NULL;
+	snet->cb.private = NULL;
+
+	/* Free IRQs */
+	snet_free_irqs(snet);
+
+	/* Reset status */
+	snet->status = 0;
+	snet->dpu_ready = false;
+
+	if (!err)
+		SNET_DBG(pdev, "Reset SNET[%u] device\n", snet->sid);
+	else
+		SNET_WARN(pdev, "Incomplete reset to SNET[%u] device, err: %d\n", snet->sid, err);
+
+	return 0;
+}
+
+/* vDPA reset callback, see snet_reset_dev() */
+static int snet_reset(struct vdpa_device *vdev)
+{
+	return snet_reset_dev(vdpa_to_snet(vdev));
+}
+
+/* Report the size of the VirtIO device specific config */
+static size_t snet_get_config_size(struct vdpa_device *vdev)
+{
+	const struct snet_dev_cfg *dev_cfg = vdpa_to_snet(vdev)->cfg;
+
+	return (size_t)dev_cfg->cfg_size;
+}
+
+/* Report the features offered by the device */
+static u64 snet_get_features(struct vdpa_device *vdev)
+{
+	const struct snet_dev_cfg *dev_cfg = vdpa_to_snet(vdev)->cfg;
+
+	return dev_cfg->features;
+}
+
+/* Store the negotiated features: the driver's features masked by the device's */
+static int snet_set_drv_features(struct vdpa_device *vdev, u64 features)
+{
+	struct snet *snet = vdpa_to_snet(vdev);
+	u64 offered = snet->cfg->features;
+
+	snet->negotiated_features = offered & features;
+
+	return 0;
+}
+
+/* Report the negotiated features */
+static u64 snet_get_drv_features(struct vdpa_device *vdev)
+{
+	return vdpa_to_snet(vdev)->negotiated_features;
+}
+
+/* Report the VQ size from the device config, truncated to 16 bits */
+static u16 snet_get_vq_num_max(struct vdpa_device *vdev)
+{
+	const struct snet_dev_cfg *dev_cfg = vdpa_to_snet(vdev)->cfg;
+
+	return (u16)dev_cfg->vq_size;
+}
+
+/* Register the callback (and its private data) invoked from the config interrupt */
+static void snet_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb)
+{
+	struct vdpa_callback *cfg_cb = &vdpa_to_snet(vdev)->cb;
+
+	cfg_cb->callback = cb->callback;
+	cfg_cb->private = cb->private;
+}
+
+/* Report the VirtIO device ID from the device config */
+static u32 snet_get_device_id(struct vdpa_device *vdev)
+{
+	const struct snet_dev_cfg *dev_cfg = vdpa_to_snet(vdev)->cfg;
+
+	return dev_cfg->virtio_id;
+}
+
+/* Report the vendor ID, always SolidRun's PCI vendor ID */
+static u32 snet_get_vendor_id(struct vdpa_device *vdev)
+{
+	u32 vendor = (u32)PCI_VENDOR_ID_SOLIDRUN;
+
+	return vendor;
+}
+
+/* Report the device status saved by the driver */
+static u8 snet_get_status(struct vdpa_device *vdev)
+{
+	return vdpa_to_snet(vdev)->status;
+}
+
+/*
+ * Write the SNET config (general data + one entry per VQ) to the DPU and wait
+ * for the DPU to acknowledge it.
+ *
+ * Despite the int return type, the result is a boolean: true when the config
+ * was already written or was acknowledged by the DPU, false on timeout.
+ */
+static int snet_write_conf(struct snet *snet)
+{
+ u32 off, i, tmp;
+ int ret;
+
+ /* No need to write the config twice */
+ if (snet->dpu_ready)
+ return true;
+
+ /* Snet data :
+ *
+ * General data: SNET_GENERAL_CFG_LEN bytes long
+ * 0 0x4 0x8 0xC 0x10 0x14 0x1C 0x24
+ * | MAGIC NUMBER | CFG VER | SNET SID | NUMBER OF QUEUES | IRQ IDX | FEATURES | RSVD |
+ *
+ * For every VQ: SNET_GENERAL_CFG_VQ_LEN bytes long
+ * 0 0x4 0x8
+ * | VQ SID AND QUEUE SIZE | IRQ Index |
+ * | DESC AREA |
+ * | DEVICE AREA |
+ * | DRIVER AREA |
+ * | VQ STATE (CFG 2+) | RSVD |
+ *
+ * Magic number should be written last, this is the DPU indication that the data is ready
+ */
+
+ /* Init offset */
+ off = snet->psnet->cfg.host_cfg_off;
+
+ /* Ignore magic number for now */
+ off += 4;
+ snet_write32(snet, off, snet->psnet->negotiated_cfg_ver);
+ off += 4;
+ snet_write32(snet, off, snet->sid);
+ off += 4;
+ snet_write32(snet, off, snet->cfg->vq_num);
+ off += 4;
+ snet_write32(snet, off, snet->cfg_irq_idx);
+ off += 4;
+ snet_write64(snet, off, snet->negotiated_features);
+ off += 8;
+ /* Ignore reserved */
+ off += 8;
+ /* Write VQs */
+ for (i = 0 ; i < snet->cfg->vq_num ; i++) {
+ /* VQ index in the high 16 bits, queue size in the low 16 bits */
+ tmp = (i << 16) | (snet->vqs[i]->num & 0xFFFF);
+ snet_write32(snet, off, tmp);
+ off += 4;
+ snet_write32(snet, off, snet->vqs[i]->irq_idx);
+ off += 4;
+ snet_write64(snet, off, snet->vqs[i]->desc_area);
+ off += 8;
+ snet_write64(snet, off, snet->vqs[i]->device_area);
+ off += 8;
+ snet_write64(snet, off, snet->vqs[i]->driver_area);
+ off += 8;
+ /* Write VQ state if config version is 2+ */
+ if (SNET_CFG_VER(snet, 2))
+ snet_write32(snet, off, *(u32 *)&snet->vqs[i]->vq_state);
+ /* The state slot is skipped even when nothing was written to it */
+ off += 4;
+
+ /* Ignore reserved */
+ off += 4;
+ }
+
+ /* Write magic number - data is ready */
+ snet_write32(snet, snet->psnet->cfg.host_cfg_off, SNET_SIGNATURE);
+
+ /* The DPU will ACK the config by clearing the signature */
+ ret = readx_poll_timeout(ioread32, snet->bar + snet->psnet->cfg.host_cfg_off,
+ tmp, !tmp, 10, SNET_READ_CFG_TIMEOUT);
+ if (ret) {
+ SNET_ERR(snet->pdev, "Timeout waiting for the DPU to read the config\n");
+ return false;
+ }
+
+ /* set DPU flag */
+ snet->dpu_ready = true;
+
+ return true;
+}
+
+/*
+ * Request the config IRQ and one IRQ per VQ from @pdev.
+ * Returns 0 on success or the error of the first request that failed; IRQs
+ * requested before the failure are left requested.
+ */
+static int snet_request_irqs(struct pci_dev *pdev, struct snet *snet)
+{
+	struct snet_vq *vq;
+	int err, q, vector;
+
+	/* Request config IRQ */
+	vector = pci_irq_vector(pdev, snet->cfg_irq_idx);
+	err = devm_request_irq(&pdev->dev, vector, snet_cfg_irq_hndlr, 0,
+			       snet->cfg_irq_name, snet);
+	if (err)
+		goto request_failed;
+	snet->cfg_irq = vector;
+
+	/* Request IRQ for every VQ */
+	for (q = 0; q < snet->cfg->vq_num; q++) {
+		vq = snet->vqs[q];
+		vector = pci_irq_vector(pdev, vq->irq_idx);
+		err = devm_request_irq(&pdev->dev, vector, snet_vq_irq_hndlr, 0,
+				       vq->irq_name, vq);
+		if (err)
+			goto request_failed;
+		vq->irq = vector;
+	}
+
+	return 0;
+
+request_failed:
+	SNET_ERR(pdev, "Failed to request IRQ\n");
+	return err;
+}
+
+/*
+ * vDPA set_status callback.
+ *
+ * On the transition to DRIVER_OK the IRQs are requested and the config is
+ * written to the DPU. If any of these steps fails, every IRQ that was already
+ * requested is released and VIRTIO_CONFIG_S_FAILED is set in the saved status
+ * instead of storing @status.
+ */
+static void snet_set_status(struct vdpa_device *vdev, u8 status)
+{
+	struct snet *snet = vdpa_to_snet(vdev);
+	struct psnet *psnet = snet->psnet;
+	struct pci_dev *pdev = snet->pdev;
+	int ret;
+	bool pf_irqs;
+
+	if (status == snet->status)
+		return;
+
+	if ((status & VIRTIO_CONFIG_S_DRIVER_OK) &&
+	    !(snet->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
+		/* Request IRQs from the device that owns the MSI-X vectors */
+		pf_irqs = PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_IRQ_PF);
+		ret = snet_request_irqs(pf_irqs ? pdev->physfn : pdev, snet);
+		if (ret)
+			goto free_irqs;
+
+		/* Write config to the DPU */
+		if (!snet_write_conf(snet))
+			goto free_irqs;
+
+		SNET_INFO(pdev, "Create SNET[%u] device\n", snet->sid);
+	}
+
+	/* Save the new status */
+	snet->status = status;
+	return;
+
+free_irqs:
+	/* snet_request_irqs() may fail part-way, so release whatever it got.
+	 * snet_free_irqs() skips IRQs that were never requested.
+	 */
+	snet_free_irqs(snet);
+	snet->status |= VIRTIO_CONFIG_S_FAILED;
+}
+
+/*
+ * Copy @len bytes of the VirtIO device specific config, starting at @offset,
+ * from the PCI BAR into @buf. An access that does not fit inside the config
+ * space is silently ignored.
+ */
+static void snet_get_config(struct vdpa_device *vdev, unsigned int offset,
+			    void *buf, unsigned int len)
+{
+	struct snet *snet = vdpa_to_snet(vdev);
+	u32 cfg_size = snet->cfg->cfg_size;
+	void __iomem *cfg_ptr;
+	u8 *buf_ptr = buf;
+	u32 i;
+
+	/* Check for offset error; written so that offset + len can't wrap around */
+	if (offset > cfg_size || len > cfg_size - offset)
+		return;
+
+	cfg_ptr = snet->cfg->virtio_cfg + offset;
+
+	/* Write into buffer */
+	for (i = 0; i < len; i++)
+		buf_ptr[i] = ioread8(cfg_ptr + i);
+}
+
+/*
+ * Copy @len bytes from @buf into the VirtIO device specific config in the
+ * PCI BAR, starting at @offset. An access that does not fit inside the config
+ * space is silently ignored.
+ */
+static void snet_set_config(struct vdpa_device *vdev, unsigned int offset,
+			    const void *buf, unsigned int len)
+{
+	struct snet *snet = vdpa_to_snet(vdev);
+	u32 cfg_size = snet->cfg->cfg_size;
+	void __iomem *cfg_ptr;
+	const u8 *buf_ptr = buf;
+	u32 i;
+
+	/* Check for offset error; written so that offset + len can't wrap around */
+	if (offset > cfg_size || len > cfg_size - offset)
+		return;
+
+	cfg_ptr = snet->cfg->virtio_cfg + offset;
+
+	/* Write into PCI BAR */
+	for (i = 0; i < len; i++)
+		iowrite8(buf_ptr[i], cfg_ptr + i);
+}
+
+/* vDPA suspend callback: suspend the device and log the outcome */
+static int snet_suspend(struct vdpa_device *vdev)
+{
+	struct snet *snet = vdpa_to_snet(vdev);
+	int err = snet_suspend_dev(snet);
+
+	if (err) {
+		SNET_ERR(snet->pdev, "SNET[%u] suspend failed, err: %d\n", snet->sid, err);
+		return err;
+	}
+
+	SNET_DBG(snet->pdev, "Suspend SNET[%u] device\n", snet->sid);
+
+	return err;
+}
+
+/* vDPA resume callback: resume the device and log the outcome */
+static int snet_resume(struct vdpa_device *vdev)
+{
+	struct snet *snet = vdpa_to_snet(vdev);
+	int err = snet_resume_dev(snet);
+
+	if (err) {
+		SNET_ERR(snet->pdev, "SNET[%u] resume failed, err: %d\n", snet->sid, err);
+		return err;
+	}
+
+	SNET_DBG(snet->pdev, "Resume SNET[%u] device\n", snet->sid);
+
+	return err;
+}
+
+/* vDPA config operations implemented by the SNET device */
+static const struct vdpa_config_ops snet_config_ops = {
+ .set_vq_address = snet_set_vq_address,
+ .set_vq_num = snet_set_vq_num,
+ .kick_vq = snet_kick_vq,
+ .kick_vq_with_data = snet_kick_vq_with_data,
+ .set_vq_cb = snet_set_vq_cb,
+ .set_vq_ready = snet_set_vq_ready,
+ .get_vq_ready = snet_get_vq_ready,
+ .set_vq_state = snet_set_vq_state,
+ .get_vq_state = snet_get_vq_state,
+ .get_vq_irq = snet_get_vq_irq,
+ .get_vq_align = snet_get_vq_align,
+ .reset = snet_reset,
+ .get_config_size = snet_get_config_size,
+ .get_device_features = snet_get_features,
+ .set_driver_features = snet_set_drv_features,
+ .get_driver_features = snet_get_drv_features,
+ /* Min and max use the same helper, so both report cfg->vq_size */
+ .get_vq_num_min = snet_get_vq_num_max,
+ .get_vq_num_max = snet_get_vq_num_max,
+ .set_config_cb = snet_set_config_cb,
+ .get_device_id = snet_get_device_id,
+ .get_vendor_id = snet_get_vendor_id,
+ .get_status = snet_get_status,
+ .set_status = snet_set_status,
+ .get_config = snet_get_config,
+ .set_config = snet_set_config,
+ .suspend = snet_suspend,
+ .resume = snet_resume,
+};
+
+/*
+ * Request and map every PF BAR with a non-zero length.
+ * Returns 0 on success, -ENODEV if no BAR can be used, or the error of the
+ * mapping request.
+ */
+static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet)
+{
+	char region_name[50];
+	int bar, err, bar_mask = 0;
+
+	/* We don't know which BAR will be used to communicate..
+	 * We will map every bar with len > 0.
+	 *
+	 * Later, we will discover the BAR and unmap all other BARs.
+	 */
+	for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) {
+		if (!pci_resource_len(pdev, bar))
+			continue;
+
+		bar_mask |= (1 << bar);
+	}
+
+	/* No BAR can be used.. */
+	if (!bar_mask) {
+		SNET_ERR(pdev, "Failed to find a PCI BAR\n");
+		return -ENODEV;
+	}
+
+	snprintf(region_name, sizeof(region_name), "psnet[%s]-bars", pci_name(pdev));
+	err = pcim_iomap_regions(pdev, bar_mask, region_name);
+	if (err) {
+		SNET_ERR(pdev, "Failed to request and map PCI BARs\n");
+		return err;
+	}
+
+	/* Save the mapped addresses */
+	for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) {
+		if (!(bar_mask & (1 << bar)))
+			continue;
+
+		psnet->bars[bar] = pcim_iomap_table(pdev)[bar];
+	}
+
+	return 0;
+}
+
+/* Request and map the VF BAR named by the PF config (cfg.vf_bar) */
+static int snet_open_vf_bar(struct pci_dev *pdev, struct snet *snet)
+{
+	char region_name[50];
+	int err;
+
+	snprintf(region_name, sizeof(region_name), "snet[%s]-bar", pci_name(pdev));
+
+	/* Request and map BAR */
+	err = pcim_iomap_regions(pdev, BIT(snet->psnet->cfg.vf_bar), region_name);
+	if (err) {
+		SNET_ERR(pdev, "Failed to request and map PCI BAR for a VF\n");
+		return err;
+	}
+
+	snet->bar = pcim_iomap_table(pdev)[snet->psnet->cfg.vf_bar];
+
+	return 0;
+}
+
+/*
+ * Free the device configs held by @cfg and the array pointing to them.
+ * The walk stops at the first NULL entry, so a partially filled array is
+ * handled as well.
+ */
+static void snet_free_cfg(struct snet_cfg *cfg)
+{
+	struct snet_dev_cfg **devs = cfg->devs;
+	u32 i;
+
+	if (!devs)
+		return;
+
+	/* Free devices */
+	for (i = 0; i < cfg->devices_num && devs[i]; i++)
+		kfree(devs[i]);
+
+	/* Free pointers to devices */
+	kfree(devs);
+}
+
+/*
+ * Detect which BAR is used for communication with the device.
+ * Polls every mapped BAR at offset @off until one of them holds SNET's
+ * signature. Returns the BAR number, or -ENODEV once SNET_DETECT_TIMEOUT
+ * expires.
+ */
+static int psnet_detect_bar(struct psnet *psnet, u32 off)
+{
+	unsigned long deadline = jiffies + usecs_to_jiffies(SNET_DETECT_TIMEOUT);
+	int bar;
+
+	/* SNET DPU will write SNET's signature when the config is ready. */
+	while (time_before(jiffies, deadline)) {
+		for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) {
+			/* Only mapped BARs can be read */
+			if (psnet->bars[bar] &&
+			    ioread32(psnet->bars[bar] + off) == SNET_SIGNATURE)
+				return bar;
+		}
+		usleep_range(1000, 10000);
+	}
+
+	return -ENODEV;
+}
+
+/* Unmap every mapped BAR except the one used to communicate with the device */
+static void psnet_unmap_unused_bars(struct pci_dev *pdev, struct psnet *psnet)
+{
+	int bar, unused_mask = 0;
+
+	for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) {
+		if (!psnet->bars[bar] || bar == psnet->barno)
+			continue;
+
+		unused_mask |= (1 << bar);
+	}
+
+	if (!unused_mask)
+		return;
+
+	pcim_iounmap_regions(pdev, unused_mask);
+}
+
+/*
+ * Read SNET config from PCI BAR.
+ *
+ * Detects the BAR used for communication, unmaps the other BARs, and loads the
+ * general config followed by one config per device into psnet->cfg.
+ * Returns 0 on success, a negative error if no BAR holds the signature,
+ * -ENOMEM on allocation failure, or -EINVAL if the config that would be
+ * written to the DPU for a device exceeds max_size_host_cfg.
+ * On failure, everything allocated here is freed before returning.
+ */
+static int psnet_read_cfg(struct pci_dev *pdev, struct psnet *psnet)
+{
+ struct snet_cfg *cfg = &psnet->cfg;
+ u32 i, off;
+ int barno;
+
+ /* Move to where the config starts */
+ off = SNET_CONFIG_OFF;
+
+ /* Find BAR used for communication */
+ barno = psnet_detect_bar(psnet, off);
+ if (barno < 0) {
+ SNET_ERR(pdev, "SNET config is not ready.\n");
+ return barno;
+ }
+
+ /* Save used BAR number and unmap all other BARs */
+ psnet->barno = barno;
+ SNET_DBG(pdev, "Using BAR number %d\n", barno);
+
+ psnet_unmap_unused_bars(pdev, psnet);
+
+ /* load config from BAR */
+ cfg->key = psnet_read32(psnet, off);
+ off += 4;
+ cfg->cfg_size = psnet_read32(psnet, off);
+ off += 4;
+ cfg->cfg_ver = psnet_read32(psnet, off);
+ off += 4;
+ /* The negotiated config version is the lower one between this driver's config
+ * and the DPU's.
+ */
+ psnet->negotiated_cfg_ver = min_t(u32, cfg->cfg_ver, SNET_CFG_VERSION);
+ SNET_DBG(pdev, "SNET config version %u\n", psnet->negotiated_cfg_ver);
+
+ cfg->vf_num = psnet_read32(psnet, off);
+ off += 4;
+ cfg->vf_bar = psnet_read32(psnet, off);
+ off += 4;
+ cfg->host_cfg_off = psnet_read32(psnet, off);
+ off += 4;
+ cfg->max_size_host_cfg = psnet_read32(psnet, off);
+ off += 4;
+ cfg->virtio_cfg_off = psnet_read32(psnet, off);
+ off += 4;
+ cfg->kick_off = psnet_read32(psnet, off);
+ off += 4;
+ cfg->hwmon_off = psnet_read32(psnet, off);
+ off += 4;
+ cfg->ctrl_off = psnet_read32(psnet, off);
+ off += 4;
+ cfg->flags = psnet_read32(psnet, off);
+ off += 4;
+ /* Ignore Reserved */
+ off += sizeof(cfg->rsvd);
+
+ cfg->devices_num = psnet_read32(psnet, off);
+ off += 4;
+ /* Allocate memory to hold pointer to the devices */
+ cfg->devs = kcalloc(cfg->devices_num, sizeof(void *), GFP_KERNEL);
+ if (!cfg->devs)
+ return -ENOMEM;
+
+ /* Load device configuration from BAR */
+ for (i = 0; i < cfg->devices_num; i++) {
+ cfg->devs[i] = kzalloc(sizeof(*cfg->devs[i]), GFP_KERNEL);
+ if (!cfg->devs[i]) {
+ snet_free_cfg(cfg);
+ return -ENOMEM;
+ }
+ /* Read device config */
+ cfg->devs[i]->virtio_id = psnet_read32(psnet, off);
+ off += 4;
+ cfg->devs[i]->vq_num = psnet_read32(psnet, off);
+ off += 4;
+ cfg->devs[i]->vq_size = psnet_read32(psnet, off);
+ off += 4;
+ cfg->devs[i]->vfid = psnet_read32(psnet, off);
+ off += 4;
+ cfg->devs[i]->features = psnet_read64(psnet, off);
+ off += 8;
+ /* Ignore Reserved */
+ off += sizeof(cfg->devs[i]->rsvd);
+
+ cfg->devs[i]->cfg_size = psnet_read32(psnet, off);
+ off += 4;
+
+ /* Is the config written to the DPU going to be too big? */
+ if (SNET_GENERAL_CFG_LEN + SNET_GENERAL_CFG_VQ_LEN * cfg->devs[i]->vq_num >
+ cfg->max_size_host_cfg) {
+ SNET_ERR(pdev, "Failed to read SNET config, the config is too big..\n");
+ snet_free_cfg(cfg);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/* Allocate from the PF the MSI-X vectors needed by all the SNET devices */
+static int psnet_alloc_irq_vector(struct pci_dev *pdev, struct psnet *psnet)
+{
+	u32 dev, vectors = 0;
+	int allocated;
+
+	/* Let's count how many IRQs we need, 1 for every VQ + 1 for config change */
+	for (dev = 0; dev < psnet->cfg.devices_num; dev++)
+		vectors += psnet->cfg.devs[dev]->vq_num + 1;
+
+	/* Exactly this number of vectors is requested (min == max) */
+	allocated = pci_alloc_irq_vectors(pdev, vectors, vectors, PCI_IRQ_MSIX);
+	if (allocated != vectors) {
+		SNET_ERR(pdev, "Failed to allocate IRQ vectors\n");
+		return allocated;
+	}
+
+	SNET_DBG(pdev, "Allocated %u IRQ vectors from physical function\n", vectors);
+
+	return 0;
+}
+
+/* Allocate from a VF the MSI-X vectors needed by its SNET device */
+static int snet_alloc_irq_vector(struct pci_dev *pdev, struct snet_dev_cfg *snet_cfg)
+{
+	/* We want 1 IRQ for every VQ + 1 for config change events */
+	u32 vectors = snet_cfg->vq_num + 1;
+	int allocated;
+
+	allocated = pci_alloc_irq_vectors(pdev, vectors, vectors, PCI_IRQ_MSIX);
+	if (allocated > 0)
+		return 0;
+
+	SNET_ERR(pdev, "Failed to allocate IRQ vectors\n");
+
+	return allocated;
+}
+
+/*
+ * Free the VQs and the array pointing to them.
+ * The walk stops at the first NULL entry, so a partially built array is
+ * handled as well.
+ */
+static void snet_free_vqs(struct snet *snet)
+{
+	struct snet_vq **vqs = snet->vqs;
+	u32 q;
+
+	if (!vqs)
+		return;
+
+	for (q = 0; q < snet->cfg->vq_num && vqs[q]; q++)
+		kfree(vqs[q]);
+
+	kfree(vqs);
+}
+
+/*
+ * Allocate and initialize the VQs of a SNET device.
+ * Returns 0 on success or -ENOMEM; on failure everything allocated here is
+ * freed.
+ */
+static int snet_build_vqs(struct snet *snet)
+{
+	struct snet_vq *vq;
+	u32 q;
+
+	/* Allocate the VQ pointers array */
+	snet->vqs = kcalloc(snet->cfg->vq_num, sizeof(void *), GFP_KERNEL);
+	if (!snet->vqs)
+		return -ENOMEM;
+
+	/* Allocate the VQs */
+	for (q = 0; q < snet->cfg->vq_num; q++) {
+		vq = kzalloc(sizeof(*vq), GFP_KERNEL);
+		snet->vqs[q] = vq;
+		if (!vq) {
+			snet_free_vqs(snet);
+			return -ENOMEM;
+		}
+
+		/* No IRQ is requested yet */
+		vq->irq = -1;
+		/* VQ serial ID */
+		vq->sid = q;
+		/* Kick address - every VQ gets 4B */
+		vq->kick_ptr = snet->bar + snet->psnet->cfg.kick_off + vq->sid * 4;
+		/* Clear kick address for this VQ */
+		iowrite32(0, vq->kick_ptr);
+	}
+
+	return 0;
+}
+
+/* Hand out the next IRQ index of the PF; the counter is advanced under psnet->lock */
+static int psnet_get_next_irq_num(struct psnet *psnet)
+{
+	int idx;
+
+	spin_lock(&psnet->lock);
+	idx = psnet->next_irq;
+	psnet->next_irq++;
+	spin_unlock(&psnet->lock);
+
+	return idx;
+}
+
+/*
+ * Reserve the IRQ indexes of a SNET device and build the IRQ names.
+ * The names embed the PCI name of @pdev.
+ */
+static void snet_reserve_irq_idx(struct pci_dev *pdev, struct snet *snet)
+{
+	struct psnet *psnet = snet->psnet;
+	struct snet_vq *vq;
+	int q;
+
+	/* one IRQ for every VQ, and one for config changes */
+	snet->cfg_irq_idx = psnet_get_next_irq_num(psnet);
+	snprintf(snet->cfg_irq_name, SNET_NAME_SIZE, "snet[%s]-cfg[%d]",
+		 pci_name(pdev), snet->cfg_irq_idx);
+
+	for (q = 0; q < snet->cfg->vq_num; q++) {
+		vq = snet->vqs[q];
+		/* Get next free IRQ ID */
+		vq->irq_idx = psnet_get_next_irq_num(psnet);
+		/* Write IRQ name */
+		snprintf(vq->irq_name, SNET_NAME_SIZE, "snet[%s]-vq[%d]",
+			 pci_name(pdev), vq->irq_idx);
+	}
+}
+
+/*
+ * Find a device config based on virtual function id.
+ * Returns NULL if no device config matches @vfid.
+ */
+static struct snet_dev_cfg *snet_find_dev_cfg(struct snet_cfg *cfg, u32 vfid)
+{
+	struct snet_dev_cfg *dev_cfg;
+	u32 i;
+
+	for (i = 0; i < cfg->devices_num; i++) {
+		dev_cfg = cfg->devs[i];
+		if (dev_cfg->vfid == vfid)
+			return dev_cfg;
+	}
+
+	/* No config found */
+	return NULL;
+}
+
+/*
+ * Probe function for a physical PCI function.
+ *
+ * Maps the BARs, reads the SNET config from the device, optionally allocates
+ * the MSI-X vectors from the PF, enables SR-IOV and optionally creates the
+ * hwmon device. Returns 0 on success or a negative error code; on failure the
+ * resources allocated here are released in reverse order.
+ */
+static int snet_vdpa_probe_pf(struct pci_dev *pdev)
+{
+ struct psnet *psnet;
+ int ret = 0;
+ bool pf_irqs = false;
+
+ ret = pcim_enable_device(pdev);
+ if (ret) {
+ SNET_ERR(pdev, "Failed to enable PCI device\n");
+ return ret;
+ }
+
+ /* Allocate a PCI physical function device */
+ psnet = kzalloc(sizeof(*psnet), GFP_KERNEL);
+ if (!psnet)
+ return -ENOMEM;
+
+ /* Init PSNET spinlock */
+ spin_lock_init(&psnet->lock);
+
+ pci_set_master(pdev);
+ /* The VF probe and the hwmon code fetch psnet from the PF drvdata */
+ pci_set_drvdata(pdev, psnet);
+
+ /* Open SNET MAIN BAR */
+ ret = psnet_open_pf_bar(pdev, psnet);
+ if (ret)
+ goto free_psnet;
+
+ /* Try to read SNET's config from PCI BAR */
+ ret = psnet_read_cfg(pdev, psnet);
+ /* psnet_read_cfg() frees the config itself on failure, so skip free_cfg */
+ if (ret)
+ goto free_psnet;
+
+ /* If SNET_CFG_FLAG_IRQ_PF flag is set, we should use
+ * PF MSI-X vectors
+ */
+ pf_irqs = PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_IRQ_PF);
+
+ if (pf_irqs) {
+ ret = psnet_alloc_irq_vector(pdev, psnet);
+ if (ret)
+ goto free_cfg;
+ }
+
+ SNET_DBG(pdev, "Enable %u virtual functions\n", psnet->cfg.vf_num);
+ ret = pci_enable_sriov(pdev, psnet->cfg.vf_num);
+ if (ret) {
+ SNET_ERR(pdev, "Failed to enable SR-IOV\n");
+ goto free_irq;
+ }
+
+ /* Create HW monitor device */
+ if (PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_HWMON)) {
+#if IS_ENABLED(CONFIG_HWMON)
+ psnet_create_hwmon(pdev);
+#else
+ SNET_WARN(pdev, "Can't start HWMON, CONFIG_HWMON is not enabled\n");
+#endif
+ }
+
+ return 0;
+
+free_irq:
+ if (pf_irqs)
+ pci_free_irq_vectors(pdev);
+free_cfg:
+ snet_free_cfg(&psnet->cfg);
+free_psnet:
+ kfree(psnet);
+ return ret;
+}
+
+/*
+ * Probe function for a virtual PCI function.
+ *
+ * Looks up the device config matching this VF in the config read by the PF,
+ * allocates the MSI-X vectors (unless the PF owns them), allocates and
+ * initializes the vdpa device, maps the VF BAR, builds the VQs, reserves the
+ * IRQ indexes and registers the device on the vDPA bus.
+ * Returns 0 on success or a negative error code.
+ */
+static int snet_vdpa_probe_vf(struct pci_dev *pdev)
+{
+	struct pci_dev *pdev_pf = pdev->physfn;
+	struct psnet *psnet = pci_get_drvdata(pdev_pf);
+	struct snet_dev_cfg *dev_cfg;
+	struct snet *snet;
+	u32 vfid;
+	int ret;
+	bool pf_irqs = false;
+
+	/* Get virtual function id.
+	 * (the DPU counts the VFs from 1)
+	 */
+	ret = pci_iov_vf_id(pdev);
+	if (ret < 0) {
+		SNET_ERR(pdev, "Failed to find a VF id\n");
+		return ret;
+	}
+	vfid = ret + 1;
+
+	/* Find the snet_dev_cfg based on vfid */
+	dev_cfg = snet_find_dev_cfg(&psnet->cfg, vfid);
+	if (!dev_cfg) {
+		SNET_WARN(pdev, "Failed to find a VF config..\n");
+		return -ENODEV;
+	}
+
+	/* Which PCI device should allocate the IRQs?
+	 * If the SNET_CFG_FLAG_IRQ_PF flag set, the PF device allocates the IRQs
+	 */
+	pf_irqs = PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_IRQ_PF);
+
+	ret = pcim_enable_device(pdev);
+	if (ret) {
+		SNET_ERR(pdev, "Failed to enable PCI VF device\n");
+		return ret;
+	}
+
+	/* Request for MSI-X IRQs */
+	if (!pf_irqs) {
+		ret = snet_alloc_irq_vector(pdev, dev_cfg);
+		if (ret)
+			return ret;
+	}
+
+	/* Allocate vdpa device.
+	 * vdpa_alloc_device() reports a failure with an ERR_PTR, never with NULL.
+	 */
+	snet = vdpa_alloc_device(struct snet, vdpa, &pdev->dev, &snet_config_ops, 1, 1, NULL,
+				 false);
+	if (IS_ERR(snet)) {
+		SNET_ERR(pdev, "Failed to allocate a vdpa device\n");
+		ret = PTR_ERR(snet);
+		goto free_irqs;
+	}
+
+	/* Init control mutex and spinlock */
+	mutex_init(&snet->ctrl_lock);
+	spin_lock_init(&snet->ctrl_spinlock);
+
+	/* Save pci device pointer */
+	snet->pdev = pdev;
+	snet->psnet = psnet;
+	snet->cfg = dev_cfg;
+	snet->dpu_ready = false;
+	snet->sid = vfid;
+	/* Reset IRQ value */
+	snet->cfg_irq = -1;
+
+	ret = snet_open_vf_bar(pdev, snet);
+	if (ret)
+		goto put_device;
+
+	/* Create a VirtIO config pointer */
+	snet->cfg->virtio_cfg = snet->bar + snet->psnet->cfg.virtio_cfg_off;
+
+	/* Clear control registers */
+	snet_ctrl_clear(snet);
+
+	pci_set_master(pdev);
+	pci_set_drvdata(pdev, snet);
+
+	/* snet_build_vqs() frees the VQs itself on failure */
+	ret = snet_build_vqs(snet);
+	if (ret)
+		goto put_device;
+
+	/* Reserve IRQ indexes,
+	 * The IRQs may be requested and freed multiple times,
+	 * but the indexes won't change.
+	 */
+	snet_reserve_irq_idx(pf_irqs ? pdev_pf : pdev, snet);
+
+	/* Set DMA device */
+	snet->vdpa.dma_dev = &pdev->dev;
+
+	/* Register VDPA device */
+	ret = vdpa_register_device(&snet->vdpa, snet->cfg->vq_num);
+	if (ret) {
+		SNET_ERR(pdev, "Failed to register vdpa device\n");
+		goto free_vqs;
+	}
+
+	return 0;
+
+free_vqs:
+	snet_free_vqs(snet);
+put_device:
+	put_device(&snet->vdpa.dev);
+free_irqs:
+	if (!pf_irqs)
+		pci_free_irq_vectors(pdev);
+	return ret;
+}
+
+/* PCI probe callback: dispatch to the VF or the PF probe function */
+static int snet_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	return pdev->is_virtfn ? snet_vdpa_probe_vf(pdev) : snet_vdpa_probe_pf(pdev);
+}
+
+/* Remove a physical function: disable SR-IOV and release what the PF probe allocated */
+static void snet_vdpa_remove_pf(struct pci_dev *pdev)
+{
+	struct psnet *psnet = pci_get_drvdata(pdev);
+	bool pf_irqs = PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_IRQ_PF);
+
+	pci_disable_sriov(pdev);
+
+	/* If IRQs are allocated from the PF, we should free the IRQs */
+	if (pf_irqs)
+		pci_free_irq_vectors(pdev);
+
+	snet_free_cfg(&psnet->cfg);
+	kfree(psnet);
+}
+
+/* Remove a virtual function: unregister the vdpa device and release its VQs and IRQs */
+static void snet_vdpa_remove_vf(struct pci_dev *pdev)
+{
+	struct snet *snet = pci_get_drvdata(pdev);
+	struct psnet *psnet = snet->psnet;
+
+	vdpa_unregister_device(&snet->vdpa);
+	snet_free_vqs(snet);
+
+	/* The vectors belong to the PF, nothing to free here */
+	if (PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_IRQ_PF))
+		return;
+
+	/* IRQs are allocated from the VF, we should free them */
+	pci_free_irq_vectors(pdev);
+}
+
+/* PCI remove callback: dispatch to the VF or the PF remove function */
+static void snet_vdpa_remove(struct pci_dev *pdev)
+{
+	if (pdev->is_virtfn) {
+		snet_vdpa_remove_vf(pdev);
+		return;
+	}
+
+	snet_vdpa_remove_pf(pdev);
+}
+
+/* PCI devices handled by this driver; the table is never modified, so it is const */
+static const struct pci_device_id snet_driver_pci_ids[] = {
+	{ PCI_DEVICE_SUB(PCI_VENDOR_ID_SOLIDRUN, SNET_DEVICE_ID,
+			 PCI_VENDOR_ID_SOLIDRUN, SNET_DEVICE_ID) },
+	{ 0 },
+};
+
+MODULE_DEVICE_TABLE(pci, snet_driver_pci_ids);
+
+/* PCI driver; the same probe/remove callbacks serve both PFs and VFs */
+static struct pci_driver snet_vdpa_driver = {
+ .name = "snet-vdpa-driver",
+ .id_table = snet_driver_pci_ids,
+ .probe = snet_vdpa_probe,
+ .remove = snet_vdpa_remove,
+};
+
+module_pci_driver(snet_vdpa_driver);
+
+MODULE_AUTHOR("Alvaro Karsz <alvaro.karsz@solid-run.com>");
+MODULE_DESCRIPTION("SolidRun vDPA driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/vdpa/solidrun/snet_vdpa.h b/drivers/vdpa/solidrun/snet_vdpa.h
new file mode 100644
index 0000000000..36ac285835
--- /dev/null
+++ b/drivers/vdpa/solidrun/snet_vdpa.h
@@ -0,0 +1,209 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SolidRun DPU driver for control plane
+ *
+ * Copyright (C) 2022-2023 SolidRun
+ *
+ * Author: Alvaro Karsz <alvaro.karsz@solid-run.com>
+ *
+ */
+#ifndef _SNET_VDPA_H_
+#define _SNET_VDPA_H_
+
+#include <linux/vdpa.h>
+#include <linux/pci.h>
+
+#define SNET_NAME_SIZE 256
+
+/* Logging helpers: dev_*() on the PCI device, every message prefixed with "snet_vdpa: " */
+#define SNET_ERR(pdev, fmt, ...) dev_err(&(pdev)->dev, "%s"fmt, "snet_vdpa: ", ##__VA_ARGS__)
+#define SNET_WARN(pdev, fmt, ...) dev_warn(&(pdev)->dev, "%s"fmt, "snet_vdpa: ", ##__VA_ARGS__)
+#define SNET_INFO(pdev, fmt, ...) dev_info(&(pdev)->dev, "%s"fmt, "snet_vdpa: ", ##__VA_ARGS__)
+#define SNET_DBG(pdev, fmt, ...) dev_dbg(&(pdev)->dev, "%s"fmt, "snet_vdpa: ", ##__VA_ARGS__)
+/* Check if feature bit number @f was negotiated for snet device @s */
+#define SNET_HAS_FEATURE(s, f) ((s)->negotiated_features & BIT_ULL(f))
+/* Check if negotiated config version is at least @ver */
+#define SNET_CFG_VER(snet, ver) ((snet)->psnet->negotiated_cfg_ver >= (ver))
+
+/* VQ struct, one per virtqueue of a SNET device */
+struct snet_vq {
+ /* VQ callback */
+ struct vdpa_callback cb;
+ /* VQ state received from bus */
+ struct vdpa_vq_state vq_state;
+ /* desc base address */
+ u64 desc_area;
+ /* device base address */
+ u64 device_area;
+ /* driver base address */
+ u64 driver_area;
+ /* Queue size */
+ u32 num;
+ /* Serial ID for VQ */
+ u32 sid;
+ /* is ready flag */
+ bool ready;
+ /* IRQ number; the driver stores -1 here while no IRQ is requested */
+ u32 irq;
+ /* IRQ index, DPU uses this to parse data from MSI-X table */
+ u32 irq_idx;
+ /* IRQ name */
+ char irq_name[SNET_NAME_SIZE];
+ /* pointer to mapped PCI BAR register used by this VQ to kick */
+ void __iomem *kick_ptr;
+};
+
+/* SNET device, one per PCIe virtual function */
+struct snet {
+ /* vdpa device */
+ struct vdpa_device vdpa;
+ /* Config callback */
+ struct vdpa_callback cb;
+ /* To lock the control mechanism */
+ struct mutex ctrl_lock;
+ /* Spinlock to protect critical parts in the control mechanism */
+ spinlock_t ctrl_spinlock;
+ /* array of virtqueues */
+ struct snet_vq **vqs;
+ /* Used features */
+ u64 negotiated_features;
+ /* Device serial ID */
+ u32 sid;
+ /* device status */
+ u8 status;
+ /* boolean indicating if snet config was passed to the device */
+ bool dpu_ready;
+ /* IRQ number; the driver stores -1 here while no IRQ is requested */
+ u32 cfg_irq;
+ /* IRQ index, DPU uses this to parse data from MSI-X table */
+ u32 cfg_irq_idx;
+ /* IRQ name */
+ char cfg_irq_name[SNET_NAME_SIZE];
+ /* BAR to access the VF */
+ void __iomem *bar;
+ /* PCI device */
+ struct pci_dev *pdev;
+ /* Pointer to snet pdev parent device */
+ struct psnet *psnet;
+ /* Pointer to snet config device */
+ struct snet_dev_cfg *cfg;
+};
+
+/* Config of a single SNET device, read from the PF BAR (except virtio_cfg) */
+struct snet_dev_cfg {
+ /* Device ID following VirtIO spec. */
+ u32 virtio_id;
+ /* Number of VQs for this device */
+ u32 vq_num;
+ /* Size of every VQ */
+ u32 vq_size;
+ /* Virtual Function id */
+ u32 vfid;
+ /* Device features, following VirtIO spec */
+ u64 features;
+ /* Reserved for future usage */
+ u32 rsvd[6];
+ /* VirtIO device specific config size */
+ u32 cfg_size;
+ /* VirtIO device specific config address, set by the driver when the VF is probed */
+ void __iomem *virtio_cfg;
+} __packed;
+
+/* General SNET config, read from the PF BAR (except devs) */
+struct snet_cfg {
+ /* Magic key */
+ u32 key;
+ /* Size of total config in bytes */
+ u32 cfg_size;
+ /* Config version */
+ u32 cfg_ver;
+ /* Number of Virtual Functions to create */
+ u32 vf_num;
+ /* BAR to use for the VFs */
+ u32 vf_bar;
+ /* Where should we write the SNET's config */
+ u32 host_cfg_off;
+ /* Max. allowed size for a SNET's config */
+ u32 max_size_host_cfg;
+ /* VirtIO config offset in BAR */
+ u32 virtio_cfg_off;
+ /* Offset in PCI BAR for VQ kicks */
+ u32 kick_off;
+ /* Offset in PCI BAR for HW monitoring */
+ u32 hwmon_off;
+ /* Offset in PCI BAR for Control mechanism */
+ u32 ctrl_off;
+ /* Config general flags - enum snet_cfg_flags */
+ u32 flags;
+ /* Reserved for future usage */
+ u32 rsvd[6];
+ /* Number of snet devices */
+ u32 devices_num;
+ /* The actual devices: array of devices_num pointers allocated by the driver */
+ struct snet_dev_cfg **devs;
+} __packed;
+
+/* SolidNET PCIe device, one device per PCIe physical function */
+struct psnet {
+ /* PCI BARs */
+ void __iomem *bars[PCI_STD_NUM_BARS];
+ /* Negotiated config version */
+ u32 negotiated_cfg_ver;
+ /* Next IRQ index to use in case when the IRQs are allocated from this device */
+ u32 next_irq;
+ /* BAR number used to communicate with the device */
+ u8 barno;
+ /* spinlock to protect data that can be changed by SNET devices */
+ spinlock_t lock;
+ /* The device's config read from BAR (embedded, not a pointer) */
+ struct snet_cfg cfg;
+ /* Name of monitor device */
+ char hwmon_name[SNET_NAME_SIZE];
+};
+
+/* Bits of snet_cfg.flags */
+enum snet_cfg_flags {
+ /* Create a HWMON device */
+ SNET_CFG_FLAG_HWMON = BIT(0),
+ /* USE IRQs from the physical function */
+ SNET_CFG_FLAG_IRQ_PF = BIT(1),
+};
+
+/* Non-zero if flag @f is set in the config of psnet @p */
+#define PSNET_FLAG_ON(p, f) ((p)->cfg.flags & (f))
+
+/* Read 32 bits at offset @off of the BAR used to communicate with the device */
+static inline u32 psnet_read32(struct psnet *psnet, u32 off)
+{
+	void __iomem *base = psnet->bars[psnet->barno];
+
+	return ioread32(base + off);
+}
+
+/* Read 32 bits at offset @off of the VF BAR */
+static inline u32 snet_read32(struct snet *snet, u32 off)
+{
+	void __iomem *addr = snet->bar + off;
+
+	return ioread32(addr);
+}
+
+/* Write 32 bits at offset @off of the VF BAR */
+static inline void snet_write32(struct snet *snet, u32 off, u32 val)
+{
+	void __iomem *addr = snet->bar + off;
+
+	iowrite32(val, addr);
+}
+
+/* Read 64 bits at offset @off of the PF BAR as two 32 bit reads */
+static inline u64 psnet_read64(struct psnet *psnet, u32 off)
+{
+	/* 64bits are written in 2 halves, low part first */
+	u64 low = psnet_read32(psnet, off);
+	u64 high = psnet_read32(psnet, off + 4);
+
+	return low | (high << 32);
+}
+
+/* Write 64 bits at offset @off of the VF BAR as two 32 bit writes */
+static inline void snet_write64(struct snet *snet, u32 off, u64 val)
+{
+	u32 low = (u32)val;
+	u32 high = (u32)(val >> 32);
+
+	/* The DPU expects a 64bit integer in 2 halves, the low part first */
+	snet_write32(snet, off, low);
+	snet_write32(snet, off + 4, high);
+}
+
+/* Only declared when CONFIG_HWMON is enabled, callers must use the same guard */
+#if IS_ENABLED(CONFIG_HWMON)
+void psnet_create_hwmon(struct pci_dev *pdev);
+#endif
+
+/* Helpers used by snet_main.c and implemented outside of it */
+void snet_ctrl_clear(struct snet *snet);
+int snet_destroy_dev(struct snet *snet);
+int snet_read_vq_state(struct snet *snet, u16 idx, struct vdpa_vq_state *state);
+int snet_suspend_dev(struct snet *snet);
+int snet_resume_dev(struct snet *snet);
+
+#endif //_SNET_VDPA_H_
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
new file mode 100644
index 0000000000..a7612e0783
--- /dev/null
+++ b/drivers/vdpa/vdpa.c
@@ -0,0 +1,1329 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vDPA bus.
+ *
+ * Copyright (c) 2020, Red Hat. All rights reserved.
+ * Author: Jason Wang <jasowang@redhat.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/vdpa.h>
+#include <uapi/linux/vdpa.h>
+#include <net/genetlink.h>
+#include <linux/mod_devicetable.h>
+#include <linux/virtio_ids.h>
+
+static LIST_HEAD(mdev_head);
+/* A global rw-semaphore that protects vdpa management device and device level operations. */
+static DECLARE_RWSEM(vdpa_dev_lock);
+static DEFINE_IDA(vdpa_index_ida);
+
+/**
+ * vdpa_set_status - set the device status through the config ops
+ * @vdev: vdpa device to operate on
+ * @status: status value handed to the set_status() op
+ *
+ * The op is invoked with the device's cf_lock held for writing.
+ */
+void vdpa_set_status(struct vdpa_device *vdev, u8 status)
+{
+	const struct vdpa_config_ops *ops;
+
+	down_write(&vdev->cf_lock);
+	ops = vdev->config;
+	ops->set_status(vdev, status);
+	up_write(&vdev->cf_lock);
+}
+EXPORT_SYMBOL(vdpa_set_status);
+
+static struct genl_family vdpa_nl_family;
+
+/*
+ * Bus probe callback: set up a 64-bit DMA mask, sanity check the virtqueue
+ * size range reported by the device, then hand over to the bound driver's
+ * probe() if it has one.
+ */
+static int vdpa_dev_probe(struct device *d)
+{
+	struct vdpa_device *vdev = dev_to_vdpa(d);
+	struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver);
+	const struct vdpa_config_ops *ops = vdev->config;
+	u32 vq_max;
+	u32 vq_min = 1;
+	int rc;
+
+	d->dma_mask = &d->coherent_dma_mask;
+	rc = dma_set_mask_and_coherent(d, DMA_BIT_MASK(64));
+	if (rc)
+		return rc;
+
+	vq_max = ops->get_vq_num_max(vdev);
+	/* get_vq_num_min is optional; the minimum defaults to 1 */
+	if (ops->get_vq_num_min)
+		vq_min = ops->get_vq_num_min(vdev);
+	if (vq_min > vq_max)
+		return -EINVAL;
+
+	if (!drv || !drv->probe)
+		return 0;
+
+	return drv->probe(vdev);
+}
+
+/* Bus remove callback: forward to the bound driver's remove(), if any */
+static void vdpa_dev_remove(struct device *d)
+{
+	struct vdpa_device *vdev = dev_to_vdpa(d);
+	struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver);
+
+	if (!drv || !drv->remove)
+		return;
+
+	drv->remove(vdev);
+}
+
+/*
+ * Bus match callback. Without a driver_override every vDPA bus driver
+ * matches every device; with one, only the driver of that name matches.
+ */
+static int vdpa_dev_match(struct device *dev, struct device_driver *drv)
+{
+	struct vdpa_device *vdev = dev_to_vdpa(dev);
+
+	/* Currently devices must be supported by all vDPA bus drivers */
+	if (!vdev->driver_override)
+		return 1;
+
+	/* An override is set: only the named driver may bind */
+	return !strcmp(vdev->driver_override, drv->name);
+}
+
+/*
+ * sysfs store handler for driver_override: delegate to driver_set_override()
+ * and report the whole input as consumed on success.
+ */
+static ssize_t driver_override_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t count)
+{
+	struct vdpa_device *vdev = dev_to_vdpa(dev);
+	int rc = driver_set_override(dev, &vdev->driver_override, buf, count);
+
+	return rc ? rc : (ssize_t)count;
+}
+
+/*
+ * sysfs show handler for driver_override. The device lock is held while the
+ * string is formatted. sysfs_emit() is used instead of snprintf() so the
+ * returned length is the number of bytes actually placed in @buf, never the
+ * untruncated length.
+ */
+static ssize_t driver_override_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct vdpa_device *vdev = dev_to_vdpa(dev);
+	ssize_t len;
+
+	device_lock(dev);
+	len = sysfs_emit(buf, "%s\n", vdev->driver_override);
+	device_unlock(dev);
+
+	return len;
+}
+static DEVICE_ATTR_RW(driver_override);
+
+/* NULL-terminated list of per-device sysfs attributes (driver_override only) */
+static struct attribute *vdpa_dev_attrs[] = {
+ &dev_attr_driver_override.attr,
+ NULL,
+};
+
+static const struct attribute_group vdpa_dev_group = {
+ .attrs = vdpa_dev_attrs,
+};
+/* presumably emits the vdpa_dev_groups array referenced by vdpa_bus.dev_groups */
+__ATTRIBUTE_GROUPS(vdpa_dev);
+
+/* The vDPA bus: wires the match/probe/remove callbacks and sysfs groups defined in this file */
+static struct bus_type vdpa_bus = {
+ .name = "vdpa",
+ .dev_groups = vdpa_dev_groups,
+ .match = vdpa_dev_match,
+ .probe = vdpa_dev_probe,
+ .remove = vdpa_dev_remove,
+};
+
+/*
+ * Device release callback (installed as dev.release by __vdpa_alloc_device()).
+ * Gives the parent driver a chance to free its private state through the
+ * optional free() op, then returns the index to the IDA and frees the
+ * driver_override string and the device itself.
+ */
+static void vdpa_release_dev(struct device *d)
+{
+	struct vdpa_device *vdev = dev_to_vdpa(d);
+	const struct vdpa_config_ops *ops = vdev->config;
+
+	if (ops->free)
+		ops->free(vdev);
+
+	/* Counterpart of ida_alloc(); ida_simple_remove() is the deprecated spelling */
+	ida_free(&vdpa_index_ida, vdev->index);
+	kfree(vdev->driver_override);
+	kfree(vdev);
+}
+
+/**
+ * __vdpa_alloc_device - allocate and initialize a vDPA device
+ * This allows driver to do some preparation after device is
+ * initialized but before registered.
+ * @parent: the parent device
+ * @config: the bus operations that is supported by this device
+ * @ngroups: number of groups supported by this device
+ * @nas: number of address spaces supported by this device
+ * @size: size of the parent structure that contains private data
+ * @name: name of the vdpa device; optional.
+ * @use_va: indicate whether virtual address must be used by this device
+ *
+ * Driver should use vdpa_alloc_device() wrapper macro instead of
+ * using this directly.
+ *
+ * Return: the new device on success, otherwise an ERR_PTR(): -EINVAL when
+ * @config is NULL, when only one of dma_map/dma_unmap is provided, or when
+ * @use_va is set without a dma_map or set_map op; -ENOMEM when the
+ * allocation fails; or the error returned by ida_alloc() / dev_set_name().
+ */
+struct vdpa_device *__vdpa_alloc_device(struct device *parent,
+					const struct vdpa_config_ops *config,
+					unsigned int ngroups, unsigned int nas,
+					size_t size, const char *name,
+					bool use_va)
+{
+	struct vdpa_device *vdev;
+	int err = -EINVAL;
+
+	if (!config)
+		goto err;
+
+	/* dma_map and dma_unmap must be provided together or not at all */
+	if (!!config->dma_map != !!config->dma_unmap)
+		goto err;
+
+	/* It should only work for the device that use on-chip IOMMU */
+	if (use_va && !(config->dma_map || config->set_map))
+		goto err;
+
+	err = -ENOMEM;
+	vdev = kzalloc(size, GFP_KERNEL);
+	if (!vdev)
+		goto err;
+
+	/* On success err temporarily carries the allocated index */
+	err = ida_alloc(&vdpa_index_ida, GFP_KERNEL);
+	if (err < 0)
+		goto err_ida;
+
+	vdev->dev.bus = &vdpa_bus;
+	vdev->dev.parent = parent;
+	vdev->dev.release = vdpa_release_dev;
+	vdev->index = err;
+	vdev->config = config;
+	vdev->features_valid = false;
+	vdev->use_va = use_va;
+	vdev->ngroups = ngroups;
+	vdev->nas = nas;
+
+	if (name)
+		err = dev_set_name(&vdev->dev, "%s", name);
+	else
+		err = dev_set_name(&vdev->dev, "vdpa%u", vdev->index);
+	if (err)
+		goto err_name;
+
+	init_rwsem(&vdev->cf_lock);
+	device_initialize(&vdev->dev);
+
+	return vdev;
+
+err_name:
+	/* Counterpart of ida_alloc(); ida_simple_remove() is the deprecated spelling */
+	ida_free(&vdpa_index_ida, vdev->index);
+err_ida:
+	kfree(vdev);
+err:
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(__vdpa_alloc_device);
+
+/* bus_find_device() predicate: non-zero when the device name equals the string in @data */
+static int vdpa_name_match(struct device *dev, const void *data)
+{
+	struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev);
+	const char *wanted = data;
+
+	return !strcmp(dev_name(&vdev->dev), wanted);
+}
+
+/*
+ * Record the virtqueue count and add the device to the vDPA bus.
+ * Must be called with vdpa_dev_lock held. Fails with -EEXIST when a device
+ * of the same name is already present on the bus.
+ */
+static int __vdpa_register_device(struct vdpa_device *vdev, u32 nvqs)
+{
+	struct device *existing;
+
+	vdev->nvqs = nvqs;
+
+	lockdep_assert_held(&vdpa_dev_lock);
+	existing = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev),
+				   vdpa_name_match);
+	if (!existing)
+		return device_add(&vdev->dev);
+
+	/* Drop the reference taken by bus_find_device() */
+	put_device(existing);
+	return -EEXIST;
+}
+
+/**
+ * _vdpa_register_device - register a vDPA device with vdpa lock held
+ * Caller must have a successful call of vdpa_alloc_device() before.
+ * Caller must invoke this routine in the management device dev_add()
+ * callback after setting up valid mgmtdev for this vdpa device.
+ * @vdev: the vdpa device to be registered to vDPA bus
+ * @nvqs: number of virtqueues supported by this device
+ *
+ * Return: -EINVAL when @vdev has no management device set, otherwise the
+ * result of adding the device to the vDPA bus
+ */
+int _vdpa_register_device(struct vdpa_device *vdev, u32 nvqs)
+{
+	return vdev->mdev ? __vdpa_register_device(vdev, nvqs) : -EINVAL;
+}
+EXPORT_SYMBOL_GPL(_vdpa_register_device);
+
+/**
+ * vdpa_register_device - register a vDPA device
+ * Callers must have a successful call of vdpa_alloc_device() before.
+ * @vdev: the vdpa device to be registered to vDPA bus
+ * @nvqs: number of virtqueues supported by this device
+ *
+ * Takes vdpa_dev_lock for writing around the registration.
+ *
+ * Return: Returns an error when fail to add to vDPA bus
+ */
+int vdpa_register_device(struct vdpa_device *vdev, u32 nvqs)
+{
+	int ret;
+
+	down_write(&vdpa_dev_lock);
+	ret = __vdpa_register_device(vdev, nvqs);
+	up_write(&vdpa_dev_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vdpa_register_device);
+
+/**
+ * _vdpa_unregister_device - unregister a vDPA device with vdpa lock held
+ * Caller must invoke this routine as part of management device dev_del()
+ * callback.
+ * @vdev: the vdpa device to be unregistered from vDPA bus
+ *
+ * Warns when @vdev has no management device set.
+ */
+void _vdpa_unregister_device(struct vdpa_device *vdev)
+{
+	struct device *dev = &vdev->dev;
+
+	lockdep_assert_held(&vdpa_dev_lock);
+	WARN_ON(!vdev->mdev);
+	device_unregister(dev);
+}
+EXPORT_SYMBOL_GPL(_vdpa_unregister_device);
+
+/**
+ * vdpa_unregister_device - unregister a vDPA device
+ * @vdev: the vdpa device to be unregistered from vDPA bus
+ *
+ * Takes vdpa_dev_lock for writing around the unregistration.
+ */
+void vdpa_unregister_device(struct vdpa_device *vdev)
+{
+	struct device *dev = &vdev->dev;
+
+	down_write(&vdpa_dev_lock);
+	device_unregister(dev);
+	up_write(&vdpa_dev_lock);
+}
+EXPORT_SYMBOL_GPL(vdpa_unregister_device);
+
+/**
+ * __vdpa_register_driver - register a vDPA device driver
+ * @drv: the vdpa device driver to be registered
+ * @owner: module owner of the driver
+ *
+ * Binds the driver to the vDPA bus and records its owner before handing it
+ * to the driver core.
+ *
+ * Return: the result of driver_register()
+ */
+int __vdpa_register_driver(struct vdpa_driver *drv, struct module *owner)
+{
+	struct device_driver *core = &drv->driver;
+
+	core->bus = &vdpa_bus;
+	core->owner = owner;
+
+	return driver_register(core);
+}
+EXPORT_SYMBOL_GPL(__vdpa_register_driver);
+
+/**
+ * vdpa_unregister_driver - unregister a vDPA device driver
+ * @drv: the vdpa device driver to be unregistered
+ */
+void vdpa_unregister_driver(struct vdpa_driver *drv)
+{
+	struct device_driver *core = &drv->driver;
+
+	driver_unregister(core);
+}
+EXPORT_SYMBOL_GPL(vdpa_unregister_driver);
+
+/**
+ * vdpa_mgmtdev_register - register a vdpa management device
+ *
+ * @mdev: Pointer to vdpa management device
+ * vdpa_mgmtdev_register() registers a vdpa management device which supports
+ * vdpa device management by appending it to the global list under
+ * vdpa_dev_lock.
+ * Return: 0 on success, or -EINVAL when the device pointer or one of the
+ *         required callback ops (dev_add, dev_del) is not initialized.
+ */
+int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev)
+{
+	if (!mdev->device)
+		return -EINVAL;
+	if (!mdev->ops || !mdev->ops->dev_add || !mdev->ops->dev_del)
+		return -EINVAL;
+
+	INIT_LIST_HEAD(&mdev->list);
+
+	down_write(&vdpa_dev_lock);
+	list_add_tail(&mdev->list, &mdev_head);
+	up_write(&vdpa_dev_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vdpa_mgmtdev_register);
+
+/*
+ * bus_for_each_dev() callback: delete the device through its management
+ * device's dev_del() when it belongs to the management device in @data.
+ * Always returns 0 so the iteration continues.
+ */
+static int vdpa_match_remove(struct device *dev, void *data)
+{
+	struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev);
+	struct vdpa_mgmt_dev *owner = vdev->mdev;
+
+	if (owner != data)
+		return 0;
+
+	owner->ops->dev_del(owner, vdev);
+	return 0;
+}
+
+/**
+ * vdpa_mgmtdev_unregister - unregister a vdpa management device
+ * @mdev: Pointer to vdpa management device
+ *
+ * Removes @mdev from the global list and deletes every device on the vDPA
+ * bus that belongs to it, all under vdpa_dev_lock held for writing.
+ */
+void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev)
+{
+	down_write(&vdpa_dev_lock);
+	list_del(&mdev->list);
+	/* Walk the bus and delete each device owned by this management device */
+	bus_for_each_dev(&vdpa_bus, NULL, mdev, vdpa_match_remove);
+	up_write(&vdpa_dev_lock);
+}
+EXPORT_SYMBOL_GPL(vdpa_mgmtdev_unregister);
+
+/*
+ * Read @len bytes of device config starting at @offset into @buf through the
+ * get_config() op. Takes no lock itself; the callers in this file hold
+ * cf_lock.
+ */
+static void vdpa_get_config_unlocked(struct vdpa_device *vdev,
+				     unsigned int offset,
+				     void *buf, unsigned int len)
+{
+	/*
+	 * Config accesses aren't supposed to trigger before features are set.
+	 * If it does happen we assume a legacy guest.
+	 */
+	if (!vdev->features_valid)
+		vdpa_set_features_unlocked(vdev, 0);
+
+	vdev->config->get_config(vdev, offset, buf, len);
+}
+
+/**
+ * vdpa_get_config - Get one or more device configuration fields.
+ * @vdev: vdpa device to operate on
+ * @offset: starting byte offset of the field
+ * @buf: buffer pointer to read to
+ * @len: length of the configuration fields in bytes
+ *
+ * The read is performed with the device's cf_lock held for reading.
+ */
+void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset,
+		     void *buf, unsigned int len)
+{
+	struct rw_semaphore *lock = &vdev->cf_lock;
+
+	down_read(lock);
+	vdpa_get_config_unlocked(vdev, offset, buf, len);
+	up_read(lock);
+}
+EXPORT_SYMBOL_GPL(vdpa_get_config);
+
+/**
+ * vdpa_set_config - Set one or more device configuration fields.
+ * @vdev: vdpa device to operate on
+ * @offset: starting byte offset of the field
+ * @buf: buffer pointer to read from
+ * @length: length of the configuration fields in bytes
+ *
+ * The write is performed with the device's cf_lock held for writing.
+ */
+void vdpa_set_config(struct vdpa_device *vdev, unsigned int offset,
+		     const void *buf, unsigned int length)
+{
+	struct rw_semaphore *lock = &vdev->cf_lock;
+
+	down_write(lock);
+	vdev->config->set_config(vdev, offset, buf, length);
+	up_write(lock);
+}
+EXPORT_SYMBOL_GPL(vdpa_set_config);
+
+/*
+ * Decide whether @mdev is the management device addressed by the
+ * (@busname, @devname) handle. @busname may be NULL.
+ */
+static bool mgmtdev_handle_match(const struct vdpa_mgmt_dev *mdev,
+				 const char *busname, const char *devname)
+{
+	bool has_bus = mdev->device->bus != NULL;
+
+	/* Bus name is optional for simulated management device, so ignore the
+	 * device with bus if bus attribute is provided.
+	 * A handle with a bus name only matches a device on a bus, and a
+	 * handle without one only matches a device without a bus.
+	 */
+	if (has_bus != (busname != NULL))
+		return false;
+
+	if (busname && strcmp(mdev->device->bus->name, busname) != 0)
+		return false;
+
+	return strcmp(dev_name(mdev->device), devname) == 0;
+}
+
+/*
+ * Look up a registered management device from the netlink handle attributes.
+ * Returns ERR_PTR(-EINVAL) when the device name attribute is missing and
+ * ERR_PTR(-ENODEV) when no registered management device matches.
+ */
+static struct vdpa_mgmt_dev *vdpa_mgmtdev_get_from_attr(struct nlattr **attrs)
+{
+	struct nlattr *dev_attr = attrs[VDPA_ATTR_MGMTDEV_DEV_NAME];
+	struct nlattr *bus_attr = attrs[VDPA_ATTR_MGMTDEV_BUS_NAME];
+	struct vdpa_mgmt_dev *mdev;
+	const char *busname;
+	const char *devname;
+
+	if (!dev_attr)
+		return ERR_PTR(-EINVAL);
+
+	devname = nla_data(dev_attr);
+	/* The bus name is optional */
+	busname = bus_attr ? nla_data(bus_attr) : NULL;
+
+	list_for_each_entry(mdev, &mdev_head, list)
+		if (mgmtdev_handle_match(mdev, busname, devname))
+			return mdev;
+
+	return ERR_PTR(-ENODEV);
+}
+
+/*
+ * Append the management device handle to @msg: the bus name (only when the
+ * device sits on a bus) followed by the device name.
+ * Returns 0 on success or -EMSGSIZE when an attribute does not fit.
+ */
+static int vdpa_nl_mgmtdev_handle_fill(struct sk_buff *msg, const struct vdpa_mgmt_dev *mdev)
+{
+	const struct device *dev = mdev->device;
+
+	if (dev->bus &&
+	    nla_put_string(msg, VDPA_ATTR_MGMTDEV_BUS_NAME, dev->bus->name))
+		return -EMSGSIZE;
+
+	if (nla_put_string(msg, VDPA_ATTR_MGMTDEV_DEV_NAME, dev_name(dev)))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+/*
+ * Build a bitmask of the device ids listed in @mdev's id_table (bit N set for
+ * device id N). The table is walked until an entry with a zero device id;
+ * ids above 63 cannot be represented and are skipped. When @nclasses is not
+ * NULL it receives the number of ids that were counted into the mask.
+ */
+static u64 vdpa_mgmtdev_get_classes(const struct vdpa_mgmt_dev *mdev,
+				    unsigned int *nclasses)
+{
+	unsigned int counted = 0;
+	u64 mask = 0;
+	int i = 0;
+
+	while (mdev->id_table[i].device) {
+		if (mdev->id_table[i].device <= 63) {
+			mask |= BIT_ULL(mdev->id_table[i].device);
+			counted++;
+		}
+		i++;
+	}
+
+	if (nclasses)
+		*nclasses = counted;
+
+	return mask;
+}
+
+/*
+ * Emit one VDPA_CMD_MGMTDEV_NEW message describing @mdev: its handle,
+ * supported classes, maximum supported VQs and supported features.
+ * Returns 0 on success; on failure the partially built message is cancelled
+ * and a negative errno is returned.
+ */
+static int vdpa_mgmtdev_fill(const struct vdpa_mgmt_dev *mdev, struct sk_buff *msg,
+			     u32 portid, u32 seq, int flags)
+{
+	void *hdr;
+	int err;
+
+	hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_MGMTDEV_NEW);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	err = vdpa_nl_mgmtdev_handle_fill(msg, mdev);
+	if (err)
+		goto cancel;
+
+	/* Every remaining failure is an attribute that did not fit */
+	err = -EMSGSIZE;
+	if (nla_put_u64_64bit(msg, VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES,
+			      vdpa_mgmtdev_get_classes(mdev, NULL),
+			      VDPA_ATTR_UNSPEC))
+		goto cancel;
+	if (nla_put_u32(msg, VDPA_ATTR_DEV_MGMTDEV_MAX_VQS,
+			mdev->max_supported_vqs))
+		goto cancel;
+	if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_SUPPORTED_FEATURES,
+			      mdev->supported_features, VDPA_ATTR_PAD))
+		goto cancel;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+cancel:
+	genlmsg_cancel(msg, hdr);
+	return err;
+}
+
+/*
+ * VDPA_CMD_MGMTDEV_GET doit handler: look up the management device named by
+ * the request attributes and reply with its description. The lookup and the
+ * fill run under vdpa_dev_lock held for reading.
+ */
+static int vdpa_nl_cmd_mgmtdev_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct vdpa_mgmt_dev *mdev;
+	struct sk_buff *msg;
+	int ret;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	down_read(&vdpa_dev_lock);
+	mdev = vdpa_mgmtdev_get_from_attr(info->attrs);
+	if (IS_ERR(mdev)) {
+		NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified mgmt device");
+		ret = PTR_ERR(mdev);
+	} else {
+		ret = vdpa_mgmtdev_fill(mdev, msg, info->snd_portid,
+					info->snd_seq, 0);
+	}
+	up_read(&vdpa_dev_lock);
+
+	if (ret) {
+		nlmsg_free(msg);
+		return ret;
+	}
+
+	return genlmsg_reply(msg, info);
+}
+
+/*
+ * VDPA_CMD_MGMTDEV_GET dumpit handler: emit one message per registered
+ * management device, resuming at the index saved in cb->args[0]. The walk
+ * stops at the first device that cannot be filled, and that device's index
+ * is saved so the next call starts there.
+ */
+static int
+vdpa_nl_cmd_mgmtdev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct vdpa_mgmt_dev *mdev;
+	int resume_at = cb->args[0];
+	int pos = 0;
+
+	down_read(&vdpa_dev_lock);
+	list_for_each_entry(mdev, &mdev_head, list) {
+		/* Entries before resume_at were dumped by an earlier call */
+		if (pos >= resume_at &&
+		    vdpa_mgmtdev_fill(mdev, msg, NETLINK_CB(cb->skb).portid,
+				      cb->nlh->nlmsg_seq, NLM_F_MULTI))
+			break;
+		pos++;
+	}
+	up_read(&vdpa_dev_lock);
+
+	cb->args[0] = pos;
+	return msg->len;
+}
+
+/* Mask of the config attribute bits that are specific to network devices (MAC, MTU, max VQ pairs) */
+#define VDPA_DEV_NET_ATTRS_MASK (BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) | \
+ BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU) | \
+ BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP))
+
+/*
+ * Bitmask for all per-device features: feature bits VIRTIO_TRANSPORT_F_START
+ * through VIRTIO_TRANSPORT_F_END are unset, i.e. 0xfffffc000fffffff for
+ * all 64bit features. If the features are extended beyond 64 bits, or new
+ * "holes" are reserved for other type of features than per-device, this
+ * macro would have to be updated.
+ */
+#define VIRTIO_DEVICE_F_MASK (~0ULL << (VIRTIO_TRANSPORT_F_END + 1) | \
+ ((1ULL << VIRTIO_TRANSPORT_F_START) - 1))
+
+/*
+ * VDPA_CMD_DEV_NEW doit handler: create a vdpa device through a management
+ * device's dev_add() op.
+ *
+ * The optional MAC, MTU, max VQ pairs and device features attributes are
+ * collected into a struct vdpa_dev_set_config, with one bit recorded in
+ * config.mask per attribute provided. The request is rejected before
+ * dev_add() is called when the device name is missing, when max VQ pairs is
+ * zero, when the provided features lack a bit required by a provided net
+ * attribute, when net attributes are provided without CAP_NET_ADMIN, when the
+ * management device cannot be found or does not support a provided
+ * attribute, or when the class checks fail.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct vdpa_dev_set_config config = {};
+ struct nlattr **nl_attrs = info->attrs;
+ struct vdpa_mgmt_dev *mdev;
+ unsigned int ncls = 0;
+ const u8 *macaddr;
+ const char *name;
+ u64 classes;
+ int err = 0;
+
+ if (!info->attrs[VDPA_ATTR_DEV_NAME])
+ return -EINVAL;
+
+ name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]);
+
+ /* Collect the optional attributes, recording each one in config.mask */
+ if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]) {
+ macaddr = nla_data(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]);
+ memcpy(config.net.mac, macaddr, sizeof(config.net.mac));
+ config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR);
+ }
+ if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU]) {
+ config.net.mtu =
+ nla_get_u16(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU]);
+ config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU);
+ }
+ if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP]) {
+ config.net.max_vq_pairs =
+ nla_get_u16(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP]);
+ if (!config.net.max_vq_pairs) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "At least one pair of VQs is required");
+ return -EINVAL;
+ }
+ config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP);
+ }
+ if (nl_attrs[VDPA_ATTR_DEV_FEATURES]) {
+ u64 missing = 0x0ULL;
+
+ config.device_features =
+ nla_get_u64(nl_attrs[VDPA_ATTR_DEV_FEATURES]);
+ /*
+ * Each provided net attribute needs its matching feature bit in
+ * the provided features; MQ is only needed for more than one pair.
+ */
+ if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR] &&
+ !(config.device_features & BIT_ULL(VIRTIO_NET_F_MAC)))
+ missing |= BIT_ULL(VIRTIO_NET_F_MAC);
+ if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU] &&
+ !(config.device_features & BIT_ULL(VIRTIO_NET_F_MTU)))
+ missing |= BIT_ULL(VIRTIO_NET_F_MTU);
+ if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP] &&
+ config.net.max_vq_pairs > 1 &&
+ !(config.device_features & BIT_ULL(VIRTIO_NET_F_MQ)))
+ missing |= BIT_ULL(VIRTIO_NET_F_MQ);
+ if (missing) {
+ NL_SET_ERR_MSG_FMT_MOD(info->extack,
+ "Missing features 0x%llx for provided attributes",
+ missing);
+ return -EINVAL;
+ }
+ config.mask |= BIT_ULL(VDPA_ATTR_DEV_FEATURES);
+ }
+
+ /* Skip checking capability if user didn't prefer to configure any
+ * device networking attributes. It is likely that user might have used
+ * a device specific method to configure such attributes or using device
+ * default attributes.
+ */
+ if ((config.mask & VDPA_DEV_NET_ATTRS_MASK) &&
+ !netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ /* The lookup and dev_add() run under vdpa_dev_lock held for writing */
+ down_write(&vdpa_dev_lock);
+ mdev = vdpa_mgmtdev_get_from_attr(info->attrs);
+ if (IS_ERR(mdev)) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified management device");
+ err = PTR_ERR(mdev);
+ goto err;
+ }
+
+ /* Every provided attribute must be in the management device's config_attr_mask */
+ if ((config.mask & mdev->config_attr_mask) != config.mask) {
+ NL_SET_ERR_MSG_FMT_MOD(info->extack,
+ "Some provided attributes are not supported: 0x%llx",
+ config.mask & ~mdev->config_attr_mask);
+ err = -EOPNOTSUPP;
+ goto err;
+ }
+
+ classes = vdpa_mgmtdev_get_classes(mdev, &ncls);
+ if (config.mask & VDPA_DEV_NET_ATTRS_MASK &&
+ !(classes & BIT_ULL(VIRTIO_ID_NET))) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Network class attributes provided on unsupported management device");
+ err = -EINVAL;
+ goto err;
+ }
+ /*
+ * Features given without any net attribute on a management device that
+ * supports net plus at least one other class: reject when any bit of
+ * VIRTIO_DEVICE_F_MASK is set in them.
+ */
+ if (!(config.mask & VDPA_DEV_NET_ATTRS_MASK) &&
+ config.mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES) &&
+ classes & BIT_ULL(VIRTIO_ID_NET) && ncls > 1 &&
+ config.device_features & VIRTIO_DEVICE_F_MASK) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Management device supports multi-class while device features specified are ambiguous");
+ err = -EINVAL;
+ goto err;
+ }
+
+ err = mdev->ops->dev_add(mdev, name, &config);
+err:
+ up_write(&vdpa_dev_lock);
+ return err;
+}
+
+/*
+ * VDPA_CMD_DEV_DEL doit handler: find the named device on the vDPA bus and
+ * delete it through its management device's dev_del() op. Devices without a
+ * management device are refused with -EINVAL. Runs under vdpa_dev_lock held
+ * for writing.
+ */
+static int vdpa_nl_cmd_dev_del_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct vdpa_mgmt_dev *mdev;
+	struct vdpa_device *vdev;
+	struct device *dev;
+	const char *name;
+	int ret = 0;
+
+	if (!info->attrs[VDPA_ATTR_DEV_NAME])
+		return -EINVAL;
+	name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]);
+
+	down_write(&vdpa_dev_lock);
+	dev = bus_find_device(&vdpa_bus, NULL, name, vdpa_name_match);
+	if (!dev) {
+		NL_SET_ERR_MSG_MOD(info->extack, "device not found");
+		ret = -ENODEV;
+		goto unlock;
+	}
+
+	vdev = container_of(dev, struct vdpa_device, dev);
+	mdev = vdev->mdev;
+	if (mdev) {
+		mdev->ops->dev_del(mdev, vdev);
+	} else {
+		NL_SET_ERR_MSG_MOD(info->extack, "Only user created device can be deleted by user");
+		ret = -EINVAL;
+	}
+
+	/* Drop the reference taken by bus_find_device() */
+	put_device(dev);
+unlock:
+	up_write(&vdpa_dev_lock);
+	return ret;
+}
+
+/*
+ * Emit one VDPA_CMD_DEV_NEW message describing @vdev: management device
+ * handle, device name, device and vendor ids, number of VQs and the
+ * maximum/minimum VQ size (the minimum defaults to 1 when the device has no
+ * get_vq_num_min op). @extack is accepted but not used here.
+ * Returns 0 on success; on failure the message is cancelled and a negative
+ * errno is returned.
+ */
+static int
+vdpa_dev_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq,
+	      int flags, struct netlink_ext_ack *extack)
+{
+	const struct vdpa_config_ops *ops = vdev->config;
+	u16 min_vq_size = 1;
+	u16 max_vq_size;
+	u32 device_id;
+	u32 vendor_id;
+	void *hdr;
+	int err;
+
+	hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_NEW);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	err = vdpa_nl_mgmtdev_handle_fill(msg, vdev->mdev);
+	if (err)
+		goto cancel;
+
+	device_id = ops->get_device_id(vdev);
+	vendor_id = ops->get_vendor_id(vdev);
+	max_vq_size = ops->get_vq_num_max(vdev);
+	if (ops->get_vq_num_min)
+		min_vq_size = ops->get_vq_num_min(vdev);
+
+	if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev)) ||
+	    nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id) ||
+	    nla_put_u32(msg, VDPA_ATTR_DEV_VENDOR_ID, vendor_id) ||
+	    nla_put_u32(msg, VDPA_ATTR_DEV_MAX_VQS, vdev->nvqs) ||
+	    nla_put_u16(msg, VDPA_ATTR_DEV_MAX_VQ_SIZE, max_vq_size) ||
+	    nla_put_u16(msg, VDPA_ATTR_DEV_MIN_VQ_SIZE, min_vq_size)) {
+		err = -EMSGSIZE;
+		goto cancel;
+	}
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+cancel:
+	genlmsg_cancel(msg, hdr);
+	return err;
+}
+
+/*
+ * VDPA_CMD_DEV_GET doit handler: find the named device on the vDPA bus and
+ * reply with its description. Only devices that have a management device are
+ * reported; others fail with -EINVAL and an extack message, matching the
+ * config and stats handlers. The lookup and fill run under vdpa_dev_lock
+ * held for reading.
+ */
+static int vdpa_nl_cmd_dev_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct vdpa_device *vdev;
+	struct sk_buff *msg;
+	const char *devname;
+	struct device *dev;
+	int err;
+
+	if (!info->attrs[VDPA_ATTR_DEV_NAME])
+		return -EINVAL;
+	devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]);
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	down_read(&vdpa_dev_lock);
+	dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match);
+	if (!dev) {
+		NL_SET_ERR_MSG_MOD(info->extack, "device not found");
+		err = -ENODEV;
+		goto err;
+	}
+	vdev = container_of(dev, struct vdpa_device, dev);
+	if (!vdev->mdev) {
+		/* Tell the user why, as the config/stats handlers do */
+		NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device");
+		err = -EINVAL;
+		goto mdev_err;
+	}
+	err = vdpa_dev_fill(vdev, msg, info->snd_portid, info->snd_seq, 0, info->extack);
+	if (err)
+		goto mdev_err;
+
+	/* msg is handed over to the reply path from here on; do not free it */
+	err = genlmsg_reply(msg, info);
+	put_device(dev);
+	up_read(&vdpa_dev_lock);
+	return err;
+
+mdev_err:
+	put_device(dev);
+err:
+	up_read(&vdpa_dev_lock);
+	nlmsg_free(msg);
+	return err;
+}
+
+/* State passed from the netlink dump handlers to their per-device bus callbacks */
+struct vdpa_dev_dump_info {
+ /* Message being filled */
+ struct sk_buff *msg;
+ /* Netlink dump callback context */
+ struct netlink_callback *cb;
+ /* Index to resume from; the dump handlers load it from cb->args[0] */
+ int start_idx;
+ /* Running index of managed devices; the dump handlers store it back to cb->args[0] */
+ int idx;
+};
+
+/*
+ * bus_for_each_dev() callback for the device dump. Devices without a
+ * management device are skipped without being counted. Devices before
+ * start_idx are counted only; the rest are filled into the message. A fill
+ * error is returned as-is, leaving idx at the failing device.
+ */
+static int vdpa_dev_dump(struct device *dev, void *data)
+{
+	struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev);
+	struct vdpa_dev_dump_info *info = data;
+
+	if (!vdev->mdev)
+		return 0;
+
+	if (info->idx >= info->start_idx) {
+		int err = vdpa_dev_fill(vdev, info->msg,
+					NETLINK_CB(info->cb->skb).portid,
+					info->cb->nlh->nlmsg_seq, NLM_F_MULTI,
+					info->cb->extack);
+
+		if (err)
+			return err;
+	}
+
+	info->idx++;
+	return 0;
+}
+
+/*
+ * VDPA_CMD_DEV_GET dumpit handler: walk the vDPA bus under vdpa_dev_lock
+ * (read) and dump every managed device, resuming from cb->args[0] and saving
+ * the new position back into it.
+ */
+static int vdpa_nl_cmd_dev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct vdpa_dev_dump_info info = {
+		.msg = msg,
+		.cb = cb,
+		.start_idx = cb->args[0],
+		.idx = 0,
+	};
+
+	down_read(&vdpa_dev_lock);
+	bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_dump);
+	up_read(&vdpa_dev_lock);
+
+	cb->args[0] = info.idx;
+	return msg->len;
+}
+
+/*
+ * Add the max virtqueue pairs attribute, but only when @features has
+ * VIRTIO_NET_F_MQ or VIRTIO_NET_F_RSS set; otherwise do nothing and return 0.
+ */
+static int vdpa_dev_net_mq_config_fill(struct sk_buff *msg, u64 features,
+				       const struct virtio_net_config *config)
+{
+	const u64 mq_or_rss = BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_RSS);
+	u16 max_vqp;
+
+	if (!(features & mq_or_rss))
+		return 0;
+
+	max_vqp = __virtio16_to_cpu(true, config->max_virtqueue_pairs);
+	return nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MAX_VQP, max_vqp);
+}
+
+/* Add the MTU attribute when @features has VIRTIO_NET_F_MTU set; otherwise return 0 */
+static int vdpa_dev_net_mtu_config_fill(struct sk_buff *msg, u64 features,
+					const struct virtio_net_config *config)
+{
+	u16 mtu;
+
+	if (!(features & BIT_ULL(VIRTIO_NET_F_MTU)))
+		return 0;
+
+	mtu = __virtio16_to_cpu(true, config->mtu);
+	return nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MTU, mtu);
+}
+
+/* Add the MAC address attribute when @features has VIRTIO_NET_F_MAC set; otherwise return 0 */
+static int vdpa_dev_net_mac_config_fill(struct sk_buff *msg, u64 features,
+					const struct virtio_net_config *config)
+{
+	if (!(features & BIT_ULL(VIRTIO_NET_F_MAC)))
+		return 0;
+
+	return nla_put(msg, VDPA_ATTR_DEV_NET_CFG_MACADDR,
+		       sizeof(config->mac), config->mac);
+}
+
+/* Add the net status attribute when @features has VIRTIO_NET_F_STATUS set; otherwise return 0 */
+static int vdpa_dev_net_status_config_fill(struct sk_buff *msg, u64 features,
+					   const struct virtio_net_config *config)
+{
+	u16 net_status;
+
+	if (!(features & BIT_ULL(VIRTIO_NET_F_STATUS)))
+		return 0;
+
+	net_status = __virtio16_to_cpu(true, config->status);
+	return nla_put_u16(msg, VDPA_ATTR_DEV_NET_STATUS, net_status);
+}
+
+/*
+ * Read the net config space and the device features of @vdev and add the
+ * device features plus the MTU, MAC, status and max VQ pairs attributes to
+ * @msg; each config attribute is only added when its feature bit is offered.
+ */
+static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *msg)
+{
+	const struct vdpa_config_ops *ops = vdev->config;
+	struct virtio_net_config config = {};
+	u64 features_device;
+
+	ops->get_config(vdev, 0, &config, sizeof(config));
+	features_device = ops->get_device_features(vdev);
+
+	if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_FEATURES, features_device,
+			      VDPA_ATTR_PAD) ||
+	    vdpa_dev_net_mtu_config_fill(msg, features_device, &config) ||
+	    vdpa_dev_net_mac_config_fill(msg, features_device, &config) ||
+	    vdpa_dev_net_status_config_fill(msg, features_device, &config))
+		return -EMSGSIZE;
+
+	return vdpa_dev_net_mq_config_fill(msg, features_device, &config);
+}
+
+/*
+ * Emit one VDPA_CMD_DEV_CONFIG_GET message for @vdev with cf_lock held for
+ * reading: device name, device id, the negotiated features (only once the
+ * status has FEATURES_OK) and the class specific config. Only VIRTIO_ID_NET
+ * is supported; other device ids yield -EOPNOTSUPP. @extack is accepted but
+ * not used here.
+ * Returns 0 on success; on failure the message is cancelled and a negative
+ * errno is returned.
+ */
+static int
+vdpa_dev_config_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq,
+		     int flags, struct netlink_ext_ack *extack)
+{
+	const struct vdpa_config_ops *ops = vdev->config;
+	int err = -EMSGSIZE;
+	u32 device_id;
+	void *hdr;
+	u8 status;
+
+	down_read(&vdev->cf_lock);
+	hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags,
+			  VDPA_CMD_DEV_CONFIG_GET);
+	if (!hdr)
+		goto unlock;
+
+	if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev)))
+		goto cancel;
+
+	device_id = ops->get_device_id(vdev);
+	if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id))
+		goto cancel;
+
+	/* only read driver features after the feature negotiation is done */
+	status = ops->get_status(vdev);
+	if ((status & VIRTIO_CONFIG_S_FEATURES_OK) &&
+	    nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES,
+			      ops->get_driver_features(vdev), VDPA_ATTR_PAD))
+		goto cancel;
+
+	if (device_id == VIRTIO_ID_NET)
+		err = vdpa_dev_net_config_fill(vdev, msg);
+	else
+		err = -EOPNOTSUPP;
+	if (err)
+		goto cancel;
+
+	up_read(&vdev->cf_lock);
+	genlmsg_end(msg, hdr);
+	return 0;
+
+cancel:
+	genlmsg_cancel(msg, hdr);
+unlock:
+	up_read(&vdev->cf_lock);
+	return err;
+}
+
+/*
+ * Fill the body of a vendor stats reply for virtqueue @index: negotiated
+ * features, max VQ pairs (when applicable), the queue index and finally the
+ * vendor specific statistics from get_vendor_vq_stats().
+ * Returns -EAGAIN when feature negotiation has not completed yet.
+ * Does not take cf_lock; the caller in this file holds it.
+ */
+static int vdpa_fill_stats_rec(struct vdpa_device *vdev, struct sk_buff *msg,
+			       struct genl_info *info, u32 index)
+{
+	const struct vdpa_config_ops *ops = vdev->config;
+	struct virtio_net_config config = {};
+	u64 features;
+	int err;
+
+	if (!(ops->get_status(vdev) & VIRTIO_CONFIG_S_FEATURES_OK)) {
+		NL_SET_ERR_MSG_MOD(info->extack, "feature negotiation not complete");
+		return -EAGAIN;
+	}
+	vdpa_get_config_unlocked(vdev, 0, &config, sizeof(config));
+
+	features = ops->get_driver_features(vdev);
+	if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES,
+			      features, VDPA_ATTR_PAD))
+		return -EMSGSIZE;
+
+	err = vdpa_dev_net_mq_config_fill(msg, features, &config);
+	if (err)
+		return err;
+
+	if (nla_put_u32(msg, VDPA_ATTR_DEV_QUEUE_INDEX, index))
+		return -EMSGSIZE;
+
+	return ops->get_vendor_vq_stats(vdev, index, msg, info->extack);
+}
+
+/*
+ * Fill vendor statistics for virtqueue @index with cf_lock held for reading.
+ * Returns -EOPNOTSUPP when the device has no get_vendor_vq_stats op.
+ */
+static int vendor_stats_fill(struct vdpa_device *vdev, struct sk_buff *msg,
+			     struct genl_info *info, u32 index)
+{
+	int ret;
+
+	down_read(&vdev->cf_lock);
+	if (vdev->config->get_vendor_vq_stats)
+		ret = vdpa_fill_stats_rec(vdev, msg, info, index);
+	else
+		ret = -EOPNOTSUPP;
+	up_read(&vdev->cf_lock);
+
+	return ret;
+}
+
+/*
+ * Emit one VDPA_CMD_DEV_VSTATS_GET message for virtqueue @index of @vdev:
+ * device name, device id and the vendor statistics. Only VIRTIO_ID_NET is
+ * supported (-EOPNOTSUPP otherwise), and @index must not exceed
+ * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX (-ERANGE otherwise).
+ *
+ * Returns 0 on success. On any failure the partially built message is
+ * cancelled, rather than finalized with genlmsg_end(), and a negative errno
+ * is returned.
+ */
+static int vdpa_dev_vendor_stats_fill(struct vdpa_device *vdev,
+				      struct sk_buff *msg,
+				      struct genl_info *info, u32 index)
+{
+	u32 device_id;
+	void *hdr;
+	int err;
+	u32 portid = info->snd_portid;
+	u32 seq = info->snd_seq;
+	u32 flags = 0;
+
+	hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags,
+			  VDPA_CMD_DEV_VSTATS_GET);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) {
+		err = -EMSGSIZE;
+		goto undo_msg;
+	}
+
+	device_id = vdev->config->get_device_id(vdev);
+	if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) {
+		err = -EMSGSIZE;
+		goto undo_msg;
+	}
+
+	switch (device_id) {
+	case VIRTIO_ID_NET:
+		if (index > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX) {
+			NL_SET_ERR_MSG_MOD(info->extack, "queue index exceeds max value");
+			err = -ERANGE;
+			break;
+		}
+
+		err = vendor_stats_fill(vdev, msg, info, index);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+	/* Do not finalize a message whose payload could not be produced */
+	if (err)
+		goto undo_msg;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+undo_msg:
+	genlmsg_cancel(msg, hdr);
+	return err;
+}
+
+/*
+ * VDPA_CMD_DEV_CONFIG_GET doit handler: find the named device on the vDPA
+ * bus and reply with its configuration. Devices without a management device
+ * are refused with -EINVAL. The lookup and fill run under vdpa_dev_lock held
+ * for reading.
+ *
+ * The reply message is freed here only on the paths where it was never
+ * handed to genlmsg_reply(); once it has been handed over it must not be
+ * freed again, even when the reply fails.
+ */
+static int vdpa_nl_cmd_dev_config_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct vdpa_device *vdev;
+	struct sk_buff *msg;
+	const char *devname;
+	struct device *dev;
+	int err;
+
+	if (!info->attrs[VDPA_ATTR_DEV_NAME])
+		return -EINVAL;
+	devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]);
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	down_read(&vdpa_dev_lock);
+	dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match);
+	if (!dev) {
+		NL_SET_ERR_MSG_MOD(info->extack, "device not found");
+		err = -ENODEV;
+		goto dev_err;
+	}
+	vdev = container_of(dev, struct vdpa_device, dev);
+	if (!vdev->mdev) {
+		NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device");
+		err = -EINVAL;
+		goto mdev_err;
+	}
+	err = vdpa_dev_config_fill(vdev, msg, info->snd_portid, info->snd_seq,
+				   0, info->extack);
+	if (err)
+		goto mdev_err;
+
+	/* msg is consumed by the reply path whether or not the reply succeeds */
+	err = genlmsg_reply(msg, info);
+	put_device(dev);
+	up_read(&vdpa_dev_lock);
+	return err;
+
+mdev_err:
+	put_device(dev);
+dev_err:
+	up_read(&vdpa_dev_lock);
+	nlmsg_free(msg);
+	return err;
+}
+
+/*
+ * bus_for_each_dev() callback for the device config dump. Devices without a
+ * management device are skipped without being counted. Devices before
+ * start_idx are counted only; the rest have their config filled into the
+ * message. A fill error is returned as-is, leaving idx at the failing device.
+ */
+static int vdpa_dev_config_dump(struct device *dev, void *data)
+{
+	struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev);
+	struct vdpa_dev_dump_info *info = data;
+
+	if (!vdev->mdev)
+		return 0;
+
+	if (info->idx >= info->start_idx) {
+		int err = vdpa_dev_config_fill(vdev, info->msg,
+					       NETLINK_CB(info->cb->skb).portid,
+					       info->cb->nlh->nlmsg_seq,
+					       NLM_F_MULTI, info->cb->extack);
+
+		if (err)
+			return err;
+	}
+
+	info->idx++;
+	return 0;
+}
+
+/*
+ * VDPA_CMD_DEV_CONFIG_GET dumpit handler: walk the vDPA bus under
+ * vdpa_dev_lock (read) and dump the config of every managed device,
+ * resuming from cb->args[0] and saving the new position back into it.
+ */
+static int
+vdpa_nl_cmd_dev_config_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct vdpa_dev_dump_info info = {
+		.msg = msg,
+		.cb = cb,
+		.start_idx = cb->args[0],
+		.idx = 0,
+	};
+
+	down_read(&vdpa_dev_lock);
+	bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_config_dump);
+	up_read(&vdpa_dev_lock);
+
+	cb->args[0] = info.idx;
+	return msg->len;
+}
+
+/*
+ * VDPA_CMD_DEV_VSTATS_GET doit handler: find the named device on the vDPA
+ * bus and reply with the vendor statistics of the requested virtqueue.
+ * Both the device name and the queue index attributes are mandatory.
+ * Devices without a management device are refused with -EINVAL.
+ */
+static int vdpa_nl_cmd_dev_stats_get_doit(struct sk_buff *skb,
+					  struct genl_info *info)
+{
+	struct vdpa_device *vdev;
+	struct sk_buff *msg;
+	const char *devname;
+	struct device *dev;
+	u32 index;
+	int err;
+
+	if (!info->attrs[VDPA_ATTR_DEV_NAME] ||
+	    !info->attrs[VDPA_ATTR_DEV_QUEUE_INDEX])
+		return -EINVAL;
+
+	devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]);
+	index = nla_get_u32(info->attrs[VDPA_ATTR_DEV_QUEUE_INDEX]);
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	down_read(&vdpa_dev_lock);
+	dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match);
+	if (!dev) {
+		NL_SET_ERR_MSG_MOD(info->extack, "device not found");
+		err = -ENODEV;
+		goto free_msg;
+	}
+
+	vdev = container_of(dev, struct vdpa_device, dev);
+	if (!vdev->mdev) {
+		NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device");
+		err = -EINVAL;
+		goto put_dev;
+	}
+
+	err = vdpa_dev_vendor_stats_fill(vdev, msg, info, index);
+	if (err)
+		goto put_dev;
+
+	err = genlmsg_reply(msg, info);
+	put_device(dev);
+	up_read(&vdpa_dev_lock);
+	return err;
+
+put_dev:
+	put_device(dev);
+free_msg:
+	nlmsg_free(msg);
+	up_read(&vdpa_dev_lock);
+	return err;
+}
+
+/*
+ * Netlink attribute validation policy.
+ *
+ * The name attributes are read with nla_data() and compared with strcmp() by
+ * the handlers in this file, so they must be NUL-terminated. NLA_STRING does
+ * not guarantee a terminator; NLA_NUL_STRING makes the netlink core reject
+ * unterminated names before a handler sees them.
+ */
+static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = {
+	[VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING },
+	[VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_NUL_STRING },
+	[VDPA_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
+	[VDPA_ATTR_DEV_NET_CFG_MACADDR] = NLA_POLICY_ETH_ADDR,
+	[VDPA_ATTR_DEV_NET_CFG_MAX_VQP] = { .type = NLA_U16 },
+	/* virtio spec 1.1 section 5.1.4.1 for valid MTU range */
+	[VDPA_ATTR_DEV_NET_CFG_MTU] = NLA_POLICY_MIN(NLA_U16, 68),
+	[VDPA_ATTR_DEV_QUEUE_INDEX] = { .type = NLA_U32 },
+	[VDPA_ATTR_DEV_FEATURES] = { .type = NLA_U64 },
+};
+
+/*
+ * Generic netlink command table. The GET commands for management devices,
+ * devices and device config provide both doit and dumpit handlers; device
+ * add/delete and vendor stats are doit-only and flagged GENL_ADMIN_PERM.
+ */
+static const struct genl_ops vdpa_nl_ops[] = {
+ {
+ .cmd = VDPA_CMD_MGMTDEV_GET,
+ .doit = vdpa_nl_cmd_mgmtdev_get_doit,
+ .dumpit = vdpa_nl_cmd_mgmtdev_get_dumpit,
+ },
+ {
+ .cmd = VDPA_CMD_DEV_NEW,
+ .doit = vdpa_nl_cmd_dev_add_set_doit,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = VDPA_CMD_DEV_DEL,
+ .doit = vdpa_nl_cmd_dev_del_set_doit,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = VDPA_CMD_DEV_GET,
+ .doit = vdpa_nl_cmd_dev_get_doit,
+ .dumpit = vdpa_nl_cmd_dev_get_dumpit,
+ },
+ {
+ .cmd = VDPA_CMD_DEV_CONFIG_GET,
+ .doit = vdpa_nl_cmd_dev_config_get_doit,
+ .dumpit = vdpa_nl_cmd_dev_config_get_dumpit,
+ },
+ {
+ .cmd = VDPA_CMD_DEV_VSTATS_GET,
+ .doit = vdpa_nl_cmd_dev_stats_get_doit,
+ .flags = GENL_ADMIN_PERM,
+ },
+};
+
+/* The vdpa generic netlink family: ties the attribute policy and command table above together */
+static struct genl_family vdpa_nl_family __ro_after_init = {
+ .name = VDPA_GENL_NAME,
+ .version = VDPA_GENL_VERSION,
+ .maxattr = VDPA_ATTR_MAX,
+ .policy = vdpa_nl_policy,
+ .netnsok = false,
+ .module = THIS_MODULE,
+ .ops = vdpa_nl_ops,
+ .n_ops = ARRAY_SIZE(vdpa_nl_ops),
+ .resv_start_op = VDPA_CMD_DEV_VSTATS_GET + 1,
+};
+
+/*
+ * Module initialization: register the vDPA bus, then the generic netlink
+ * family. If the family cannot be registered the bus is unregistered again.
+ * Only referenced from the initcall below, hence __init (vdpa_exit() is
+ * likewise marked __exit).
+ */
+static int __init vdpa_init(void)
+{
+	int err;
+
+	err = bus_register(&vdpa_bus);
+	if (err)
+		return err;
+
+	err = genl_register_family(&vdpa_nl_family);
+	if (err)
+		bus_unregister(&vdpa_bus);
+
+	return err;
+}
+
+/*
+ * Module teardown: undo vdpa_init() in reverse order (netlink family first,
+ * then the bus) and finally release the index IDA.
+ */
+static void __exit vdpa_exit(void)
+{
+ genl_unregister_family(&vdpa_nl_family);
+ bus_unregister(&vdpa_bus);
+ ida_destroy(&vdpa_index_ida);
+}
+core_initcall(vdpa_init);
+module_exit(vdpa_exit);
+
+MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/vdpa/vdpa_sim/Makefile b/drivers/vdpa/vdpa_sim/Makefile
new file mode 100644
index 0000000000..d458103302
--- /dev/null
+++ b/drivers/vdpa/vdpa_sim/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_VDPA_SIM) += vdpa_sim.o
+obj-$(CONFIG_VDPA_SIM_NET) += vdpa_sim_net.o
+obj-$(CONFIG_VDPA_SIM_BLOCK) += vdpa_sim_blk.o
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
new file mode 100644
index 0000000000..76d41058ad
--- /dev/null
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -0,0 +1,807 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VDPA device simulator core.
+ *
+ * Copyright (c) 2020, Red Hat Inc. All rights reserved.
+ * Author: Jason Wang <jasowang@redhat.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include <linux/dma-map-ops.h>
+#include <linux/vringh.h>
+#include <linux/vdpa.h>
+#include <linux/vhost_iotlb.h>
+#include <uapi/linux/vdpa.h>
+#include <uapi/linux/vhost_types.h>
+
+#include "vdpa_sim.h"
+
+#define DRV_VERSION "0.1"
+#define DRV_AUTHOR "Jason Wang <jasowang@redhat.com>"
+#define DRV_DESC "vDPA Device Simulator core"
+#define DRV_LICENSE "GPL v2"
+/* Non-zero makes vdpasim_create() pick vdpasim_batch_config_ops (set_map). */
+static int batch_mapping = 1;
+module_param(batch_mapping, int, 0444);
+MODULE_PARM_DESC(batch_mapping, "Batched mapping 1 -Enable; 0 - Disable");
+/* Limit handed to vhost_iotlb_init() for every address space. */
+static int max_iotlb_entries = 2048;
+module_param(max_iotlb_entries, int, 0444);
+MODULE_PARM_DESC(max_iotlb_entries,
+ "Maximum number of iotlb entries for each address space. 0 means unlimited. (default: 2048)");
+/*
+ * Passed to __vdpa_alloc_device(); when set and an mm is bound, the vrings
+ * are initialised with vringh_init_iotlb_va() and the worker adopts that mm.
+ */
+static bool use_va = true;
+module_param(use_va, bool, 0444);
+MODULE_PARM_DESC(use_va, "Enable/disable the device's ability to use VA");
+
+#define VDPASIM_QUEUE_ALIGN PAGE_SIZE
+#define VDPASIM_QUEUE_MAX 256
+#define VDPASIM_VENDOR_ID 0
+/* Request run on the simulator worker to change the mm the device is bound to. */
+struct vdpasim_mm_work {
+ struct kthread_work work; /* work item queued on vdpasim->worker */
+ struct vdpasim *vdpasim; /* device whose mm_bound is updated */
+ struct mm_struct *mm_to_bind; /* new mm_bound value; NULL when unbinding */
+ int ret; /* result filled in by vdpasim_mm_work_fn() */
+};
+
+/*
+ * Worker-side handler of a struct vdpasim_mm_work request: store the
+ * requested mm in the device and report success through ->ret.
+ */
+static void vdpasim_mm_work_fn(struct kthread_work *work)
+{
+	struct vdpasim_mm_work *req;
+
+	req = container_of(work, struct vdpasim_mm_work, work);
+
+	//TODO: should we attach the cgroup of the mm owner?
+	req->vdpasim->mm_bound = req->mm_to_bind;
+	req->ret = 0;
+}
+
+/*
+ * Queue @mm_work on the device worker and wait until it has been
+ * processed, so the caller may read mm_work->ret right after returning.
+ */
+static void vdpasim_worker_change_mm_sync(struct vdpasim *vdpasim,
+					  struct vdpasim_mm_work *mm_work)
+{
+	kthread_init_work(&mm_work->work, vdpasim_mm_work_fn);
+	kthread_queue_work(vdpasim->worker, &mm_work->work);
+	kthread_flush_work(&mm_work->work);
+}
+
+/* Map an embedded struct vdpa_device back to its enclosing struct vdpasim. */
+static struct vdpasim *vdpa_to_sim(struct vdpa_device *vdpa)
+{
+	struct vdpasim *sim = container_of(vdpa, struct vdpasim, vdpa);
+
+	return sim;
+}
+
+/*
+ * vringh notify hook: invoke the callback registered for the virtqueue
+ * that owns @vring, if one has been set.
+ */
+static void vdpasim_vq_notify(struct vringh *vring)
+{
+	struct vdpasim_virtqueue *owner;
+
+	owner = container_of(vring, struct vdpasim_virtqueue, vring);
+	if (owner->cb)
+		owner->cb(owner->private);
+}
+
+/*
+ * (Re)initialise the vring of virtqueue @idx from the addresses and size
+ * previously programmed by the driver, keeping the last_avail_idx that was
+ * in place before the initialisation.
+ */
+static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx)
+{
+	struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+	struct vringh *vrh = &vq->vring;
+	const uint16_t saved_avail = vrh->last_avail_idx;
+	struct vring_desc *desc;
+	struct vring_avail *avail;
+	struct vring_used *used;
+
+	desc = (struct vring_desc *)(uintptr_t)vq->desc_addr;
+	avail = (struct vring_avail *)(uintptr_t)vq->driver_addr;
+	used = (struct vring_used *)(uintptr_t)vq->device_addr;
+
+	/* The VA flavour is only used when enabled and an mm is bound. */
+	if (!use_va || !vdpasim->mm_bound)
+		vringh_init_iotlb(vrh, vdpasim->features, vq->num,
+				  true, desc, avail, used);
+	else
+		vringh_init_iotlb_va(vrh, vdpasim->features, vq->num,
+				     true, desc, avail, used);
+
+	vrh->last_avail_idx = saved_avail;
+
+	/*
+	 * Since vdpa_sim does not support receive inflight descriptors as a
+	 * destination of a migration, let's set both avail_idx and used_idx
+	 * the same at vq start. This is how vhost-user works in a
+	 * VHOST_SET_VRING_BASE call.
+	 *
+	 * Although the simple fix is to set last_used_idx at
+	 * vdpasim_set_vq_state, it would be reset at vdpasim_queue_ready.
+	 */
+	vrh->last_used_idx = saved_avail;
+	vrh->notify = vdpasim_vq_notify;
+}
+
+/*
+ * Return @vq to its pristine state: not ready, no ring addresses, no
+ * callback, and a vring re-initialised with the device's supported
+ * features, VDPASIM_QUEUE_MAX entries and no ring pointers.
+ */
+static void vdpasim_vq_reset(struct vdpasim *vdpasim,
+			     struct vdpasim_virtqueue *vq)
+{
+	struct vringh *vrh = &vq->vring;
+
+	vq->cb = NULL;
+	vq->private = NULL;
+	vq->ready = false;
+
+	vq->desc_addr = 0;
+	vq->driver_addr = 0;
+	vq->device_addr = 0;
+
+	vringh_init_iotlb(vrh, vdpasim->dev_attr.supported_features,
+			  VDPASIM_QUEUE_MAX, false, NULL, NULL, NULL);
+	vrh->notify = NULL;
+}
+
+/*
+ * Reset the whole device: every virtqueue is reset and attached to address
+ * space 0, every address space gets a single 0..ULONG_MAX read/write
+ * mapping (flagged in iommu_pt), the device is marked running, and the
+ * negotiated features and status are cleared. The generation counter is
+ * bumped as the last step.
+ */
+static void vdpasim_do_reset(struct vdpasim *vdpasim)
+{
+	struct vhost_iotlb *iotlb;
+	struct vringh *vrh;
+	int n;
+
+	spin_lock(&vdpasim->iommu_lock);
+
+	for (n = 0; n < vdpasim->dev_attr.nvqs; n++) {
+		vrh = &vdpasim->vqs[n].vring;
+		vdpasim_vq_reset(vdpasim, &vdpasim->vqs[n]);
+		vringh_set_iotlb(vrh, &vdpasim->iommu[0],
+				 &vdpasim->iommu_lock);
+	}
+
+	for (n = 0; n < vdpasim->dev_attr.nas; n++) {
+		iotlb = &vdpasim->iommu[n];
+		vhost_iotlb_reset(iotlb);
+		vhost_iotlb_add_range(iotlb, 0, ULONG_MAX,
+				      0, VHOST_MAP_RW);
+		vdpasim->iommu_pt[n] = true;
+	}
+
+	vdpasim->running = true;
+	spin_unlock(&vdpasim->iommu_lock);
+
+	vdpasim->features = 0;
+	vdpasim->status = 0;
+	vdpasim->generation++;
+}
+
+static const struct vdpa_config_ops vdpasim_config_ops;
+static const struct vdpa_config_ops vdpasim_batch_config_ops;
+
+/*
+ * Worker entry point: run the device-type specific work function. When VA
+ * mode is enabled and an mm is bound, the worker takes a reference on that
+ * mm and adopts it for the duration of the call; if the reference cannot
+ * be taken the work is silently dropped.
+ */
+static void vdpasim_work_fn(struct kthread_work *work)
+{
+	struct vdpasim *vdpasim = container_of(work, struct vdpasim, work);
+	struct mm_struct *mm = vdpasim->mm_bound;
+	const bool borrow_mm = use_va && mm;
+
+	if (borrow_mm) {
+		if (!mmget_not_zero(mm))
+			return;
+		kthread_use_mm(mm);
+	}
+
+	vdpasim->dev_attr.work_fn(vdpasim);
+
+	if (!borrow_mm)
+		return;
+
+	kthread_unuse_mm(mm);
+	mmput(mm);
+}
+
+/*
+ * Allocate and set up a simulator device.
+ *
+ * @dev_attr: device-type description (sizes, queue/group/AS counts,
+ *            callbacks). If @config carries VDPA_ATTR_DEV_FEATURES,
+ *            dev_attr->supported_features is narrowed to that set.
+ * @config:   provisioning attributes supplied by the management layer.
+ *
+ * Returns the new device or an ERR_PTR(): -EINVAL for a zero alloc_size or
+ * a feature request outside the supported set, the error reported by the
+ * device/worker/DMA-mask helpers, or -ENOMEM when an allocation fails.
+ * After the vdpa device exists, failures are unwound with put_device().
+ */
+struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr,
+			       const struct vdpa_dev_set_config *config)
+{
+	const struct vdpa_config_ops *ops;
+	struct vdpa_device *vdpa;
+	struct vdpasim *vdpasim;
+	struct device *dev;
+	int i, ret;
+
+	if (!dev_attr->alloc_size)
+		return ERR_PTR(-EINVAL);
+
+	if (config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
+		/* Only a subset of the supported features may be provisioned. */
+		if (config->device_features &
+		    ~dev_attr->supported_features)
+			return ERR_PTR(-EINVAL);
+		dev_attr->supported_features =
+			config->device_features;
+	}
+
+	if (batch_mapping)
+		ops = &vdpasim_batch_config_ops;
+	else
+		ops = &vdpasim_config_ops;
+
+	vdpa = __vdpa_alloc_device(NULL, ops,
+				   dev_attr->ngroups, dev_attr->nas,
+				   dev_attr->alloc_size,
+				   dev_attr->name, use_va);
+	if (IS_ERR(vdpa)) {
+		ret = PTR_ERR(vdpa);
+		goto err_alloc;
+	}
+
+	vdpasim = vdpa_to_sim(vdpa);
+	vdpasim->dev_attr = *dev_attr;
+	dev = &vdpasim->vdpa.dev;
+
+	kthread_init_work(&vdpasim->work, vdpasim_work_fn);
+	vdpasim->worker = kthread_create_worker(0, "vDPA sim worker: %s",
+						dev_attr->name);
+	if (IS_ERR(vdpasim->worker)) {
+		/* Report the real cause rather than a blanket -ENOMEM. */
+		ret = PTR_ERR(vdpasim->worker);
+		goto err_iommu;
+	}
+
+	mutex_init(&vdpasim->mutex);
+	spin_lock_init(&vdpasim->iommu_lock);
+
+	dev->dma_mask = &dev->coherent_dma_mask;
+	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
+	if (ret)
+		goto err_iommu;
+	vdpasim->vdpa.mdev = dev_attr->mgmt_dev;
+
+	/* Everything that can fail from here on is a memory allocation. */
+	ret = -ENOMEM;
+
+	vdpasim->config = kzalloc(dev_attr->config_size, GFP_KERNEL);
+	if (!vdpasim->config)
+		goto err_iommu;
+
+	vdpasim->vqs = kcalloc(dev_attr->nvqs, sizeof(struct vdpasim_virtqueue),
+			       GFP_KERNEL);
+	if (!vdpasim->vqs)
+		goto err_iommu;
+
+	vdpasim->iommu = kmalloc_array(vdpasim->dev_attr.nas,
+				       sizeof(*vdpasim->iommu), GFP_KERNEL);
+	if (!vdpasim->iommu)
+		goto err_iommu;
+
+	vdpasim->iommu_pt = kmalloc_array(vdpasim->dev_attr.nas,
+					  sizeof(*vdpasim->iommu_pt), GFP_KERNEL);
+	if (!vdpasim->iommu_pt)
+		goto err_iommu;
+
+	for (i = 0; i < vdpasim->dev_attr.nas; i++)
+		vhost_iotlb_init(&vdpasim->iommu[i], max_iotlb_entries, 0);
+
+	/* Every virtqueue starts out attached to address space 0. */
+	for (i = 0; i < dev_attr->nvqs; i++)
+		vringh_set_iotlb(&vdpasim->vqs[i].vring, &vdpasim->iommu[0],
+				 &vdpasim->iommu_lock);
+
+	vdpasim->vdpa.dma_dev = dev;
+
+	return vdpasim;
+
+err_iommu:
+	put_device(dev);
+err_alloc:
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(vdpasim_create);
+/* Queue the device's work item on its dedicated kthread worker. */
+void vdpasim_schedule_work(struct vdpasim *vdpasim)
+{
+ kthread_queue_work(vdpasim->worker, &vdpasim->work);
+}
+EXPORT_SYMBOL_GPL(vdpasim_schedule_work);
+
+/*
+ * Record the descriptor, driver and device area addresses of virtqueue
+ * @idx. They are consumed when the queue becomes ready. Always returns 0.
+ */
+static int vdpasim_set_vq_address(struct vdpa_device *vdpa, u16 idx,
+				  u64 desc_area, u64 driver_area,
+				  u64 device_area)
+{
+	struct vdpasim_virtqueue *vq = vdpa_to_sim(vdpa)->vqs + idx;
+
+	vq->device_addr = device_area;
+	vq->driver_addr = driver_area;
+	vq->desc_addr = desc_area;
+
+	return 0;
+}
+
+/* Store the ring size chosen by the driver for virtqueue @idx. */
+static void vdpasim_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num)
+{
+	vdpa_to_sim(vdpa)->vqs[idx].num = num;
+}
+
+/*
+ * Driver kick for virtqueue @idx. While the device is not running but the
+ * driver has already set DRIVER_OK, the kick is only remembered in
+ * pending_kick; otherwise the worker is scheduled if the queue is ready.
+ */
+static void vdpasim_kick_vq(struct vdpa_device *vdpa, u16 idx)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	bool driver_ok = vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK;
+
+	if (!vdpasim->running && driver_ok) {
+		vdpasim->pending_kick = true;
+		return;
+	}
+
+	if (!vdpasim->vqs[idx].ready)
+		return;
+
+	vdpasim_schedule_work(vdpasim);
+}
+
+/* Install the callback (and its private cookie) used to notify virtqueue @idx. */
+static void vdpasim_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
+			      struct vdpa_callback *cb)
+{
+	struct vdpasim_virtqueue *vq = vdpa_to_sim(vdpa)->vqs + idx;
+
+	vq->private = cb->private;
+	vq->cb = cb->callback;
+}
+
+/*
+ * Update the ready flag of virtqueue @idx under the device mutex. A
+ * not-ready to ready transition (re)initialises the vring.
+ */
+static void vdpasim_set_vq_ready(struct vdpa_device *vdpa, u16 idx, bool ready)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+	bool was_ready;
+
+	mutex_lock(&vdpasim->mutex);
+
+	was_ready = vq->ready;
+	vq->ready = ready;
+	if (ready && !was_ready)
+		vdpasim_queue_ready(vdpasim, idx);
+
+	mutex_unlock(&vdpasim->mutex);
+}
+
+/* Report the ready flag of virtqueue @idx. */
+static bool vdpasim_get_vq_ready(struct vdpa_device *vdpa, u16 idx)
+{
+	return vdpa_to_sim(vdpa)->vqs[idx].ready;
+}
+
+/*
+ * Load the split-ring available index of virtqueue @idx from @state,
+ * serialised by the device mutex. Always returns 0.
+ */
+static int vdpasim_set_vq_state(struct vdpa_device *vdpa, u16 idx,
+				const struct vdpa_vq_state *state)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+	mutex_lock(&vdpasim->mutex);
+	vdpasim->vqs[idx].vring.last_avail_idx = state->split.avail_index;
+	mutex_unlock(&vdpasim->mutex);
+
+	return 0;
+}
+
+/*
+ * Copy the current last_avail_idx of virtqueue @idx into @state as the
+ * split-ring available index. Always returns 0.
+ */
+static int vdpasim_get_vq_state(struct vdpa_device *vdpa, u16 idx,
+				struct vdpa_vq_state *state)
+{
+	const struct vringh *vrh = &vdpa_to_sim(vdpa)->vqs[idx].vring;
+
+	state->split.avail_index = vrh->last_avail_idx;
+
+	return 0;
+}
+
+/*
+ * Forward a vendor statistics request for virtqueue @idx to the
+ * device-type specific get_stats hook. Returns the hook's result, or
+ * -EOPNOTSUPP when the device type provides no such hook.
+ */
+static int vdpasim_get_vq_stats(struct vdpa_device *vdpa, u16 idx,
+				struct sk_buff *msg,
+				struct netlink_ext_ack *extack)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+	if (!vdpasim->dev_attr.get_stats)
+		return -EOPNOTSUPP;
+
+	return vdpasim->dev_attr.get_stats(vdpasim, idx, msg, extack);
+}
+/* Virtqueue alignment requirement: VDPASIM_QUEUE_ALIGN (defined as PAGE_SIZE). */
+static u32 vdpasim_get_vq_align(struct vdpa_device *vdpa)
+{
+ return VDPASIM_QUEUE_ALIGN;
+}
+
+/*
+ * Map a virtqueue index to its group: index 2 is in group 1, every other
+ * index is in group 0.
+ */
+static u32 vdpasim_get_vq_group(struct vdpa_device *vdpa, u16 idx)
+{
+	/* RX and TX belongs to group 0, CVQ belongs to group 1 */
+	return idx == 2 ? 1 : 0;
+}
+
+/* Return the feature bits this device instance offers to the driver. */
+static u64 vdpasim_get_device_features(struct vdpa_device *vdpa)
+{
+	return vdpa_to_sim(vdpa)->dev_attr.supported_features;
+}
+/* Backend features: only VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK is advertised. */
+static u64 vdpasim_get_backend_features(const struct vdpa_device *vdpa)
+{
+ return BIT_ULL(VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK);
+}
+
+/*
+ * Accept the driver's feature selection. VIRTIO_F_ACCESS_PLATFORM is
+ * mandatory (-EINVAL without it); the stored set is the intersection of
+ * @features and what the device supports. Returns 0 on success.
+ */
+static int vdpasim_set_driver_features(struct vdpa_device *vdpa, u64 features)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	const u64 offered = vdpasim->dev_attr.supported_features;
+
+	/* DMA mapping must be done by driver */
+	if ((features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)) == 0)
+		return -EINVAL;
+
+	vdpasim->features = offered & features;
+
+	return 0;
+}
+
+/* Return the feature set stored by vdpasim_set_driver_features(). */
+static u64 vdpasim_get_driver_features(struct vdpa_device *vdpa)
+{
+	return vdpa_to_sim(vdpa)->features;
+}
+/* Config-change callback registration: intentionally a no-op, @cb is ignored. */
+static void vdpasim_set_config_cb(struct vdpa_device *vdpa,
+ struct vdpa_callback *cb)
+{
+ /* We don't support config interrupt */
+}
+/* Largest supported ring size: VDPASIM_QUEUE_MAX (256). */
+static u16 vdpasim_get_vq_num_max(struct vdpa_device *vdpa)
+{
+ return VDPASIM_QUEUE_MAX;
+}
+
+/* Return the device id supplied by the device type at creation. */
+static u32 vdpasim_get_device_id(struct vdpa_device *vdpa)
+{
+	return vdpa_to_sim(vdpa)->dev_attr.id;
+}
+/* Vendor id of the simulator: VDPASIM_VENDOR_ID (0). */
+static u32 vdpasim_get_vendor_id(struct vdpa_device *vdpa)
+{
+ return VDPASIM_VENDOR_ID;
+}
+
+/* Read the device status under the device mutex. */
+static u8 vdpasim_get_status(struct vdpa_device *vdpa)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	struct mutex *lock = &vdpasim->mutex;
+	u8 snapshot;
+
+	mutex_lock(lock);
+	snapshot = vdpasim->status;
+	mutex_unlock(lock);
+
+	return snapshot;
+}
+
+/* Store a new device status under the device mutex. */
+static void vdpasim_set_status(struct vdpa_device *vdpa, u8 status)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	struct mutex *lock = &vdpasim->mutex;
+
+	mutex_lock(lock);
+	vdpasim->status = status;
+	mutex_unlock(lock);
+}
+
+/*
+ * Device reset requested by the vdpa core: clear the status and reset the
+ * whole device state while holding the device mutex. Always returns 0.
+ */
+static int vdpasim_reset(struct vdpa_device *vdpa)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	struct mutex *lock = &vdpasim->mutex;
+
+	mutex_lock(lock);
+	vdpasim->status = 0;
+	vdpasim_do_reset(vdpasim);
+	mutex_unlock(lock);
+
+	return 0;
+}
+
+/* Suspend the device by clearing its running flag. Always returns 0. */
+static int vdpasim_suspend(struct vdpa_device *vdpa)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	struct mutex *lock = &vdpasim->mutex;
+
+	mutex_lock(lock);
+	vdpasim->running = false;
+	mutex_unlock(lock);
+
+	return 0;
+}
+
+/*
+ * Resume a suspended device. If a kick arrived while suspended, every
+ * virtqueue is kicked once so pending descriptors get processed, and the
+ * pending flag is cleared. Always returns 0.
+ */
+static int vdpasim_resume(struct vdpa_device *vdpa)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	int qidx;
+
+	mutex_lock(&vdpasim->mutex);
+	vdpasim->running = true;
+
+	if (!vdpasim->pending_kick)
+		goto unlock;
+
+	/* Process pending descriptors */
+	for (qidx = 0; qidx < vdpasim->dev_attr.nvqs; ++qidx)
+		vdpasim_kick_vq(vdpa, qidx);
+
+	vdpasim->pending_kick = false;
+
+unlock:
+	mutex_unlock(&vdpasim->mutex);
+
+	return 0;
+}
+
+/* Return the size in bytes of the device-type specific config space. */
+static size_t vdpasim_get_config_size(struct vdpa_device *vdpa)
+{
+	return vdpa_to_sim(vdpa)->dev_attr.config_size;
+}
+
+/*
+ * Copy @len bytes of the config space starting at @offset into @buf.
+ * Requests that do not fit inside the config space are silently ignored.
+ * The device-type get_config hook, when present, refreshes the cached
+ * config before the copy.
+ */
+static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset,
+			       void *buf, unsigned int len)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	size_t size = vdpasim->dev_attr.config_size;
+
+	/*
+	 * Same test as "offset + len > size", but phrased so that a large
+	 * offset/len pair cannot wrap the unsigned sum and slip through.
+	 */
+	if (offset > size || len > size - offset)
+		return;
+
+	if (vdpasim->dev_attr.get_config)
+		vdpasim->dev_attr.get_config(vdpasim, vdpasim->config);
+
+	memcpy(buf, vdpasim->config + offset, len);
+}
+
+/*
+ * Copy @len bytes from @buf into the config space at @offset, then let the
+ * device-type set_config hook (if any) act on the updated config.
+ * Requests that do not fit inside the config space are silently ignored.
+ */
+static void vdpasim_set_config(struct vdpa_device *vdpa, unsigned int offset,
+			       const void *buf, unsigned int len)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	size_t size = vdpasim->dev_attr.config_size;
+
+	/*
+	 * Same test as "offset + len > size", but phrased so that a large
+	 * offset/len pair cannot wrap the unsigned sum and slip through.
+	 */
+	if (offset > size || len > size - offset)
+		return;
+
+	memcpy(vdpasim->config + offset, buf, len);
+
+	if (vdpasim->dev_attr.set_config)
+		vdpasim->dev_attr.set_config(vdpasim, vdpasim->config);
+}
+
+/* Return the generation counter, which is incremented on every device reset. */
+static u32 vdpasim_get_generation(struct vdpa_device *vdpa)
+{
+	return vdpa_to_sim(vdpa)->generation;
+}
+
+/* The simulator accepts the full 64-bit IOVA range. */
+static struct vdpa_iova_range vdpasim_get_iova_range(struct vdpa_device *vdpa)
+{
+	return (struct vdpa_iova_range) {
+		.first = 0ULL,
+		.last = ULLONG_MAX,
+	};
+}
+
+/*
+ * Attach every virtqueue that belongs to @group to address space @asid.
+ *
+ * Returns 0 on success, -EINVAL when @group or @asid is out of range.
+ */
+static int vdpasim_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
+				  unsigned int asid)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	struct vhost_iotlb *iommu;
+	int i;
+
+	/* Valid groups are 0 .. ngroups - 1, mirroring the asid check below. */
+	if (group >= vdpasim->dev_attr.ngroups)
+		return -EINVAL;
+
+	if (asid >= vdpasim->dev_attr.nas)
+		return -EINVAL;
+
+	iommu = &vdpasim->iommu[asid];
+
+	mutex_lock(&vdpasim->mutex);
+
+	for (i = 0; i < vdpasim->dev_attr.nvqs; i++)
+		if (vdpasim_get_vq_group(vdpa, i) == group)
+			vringh_set_iotlb(&vdpasim->vqs[i].vring, iommu,
+					 &vdpasim->iommu_lock);
+
+	mutex_unlock(&vdpasim->mutex);
+
+	return 0;
+}
+
+/*
+ * Replace the whole translation table of address space @asid with a copy
+ * of @iotlb. On a failed insertion the table is emptied again and the
+ * error is returned; -EINVAL is returned for an out-of-range @asid.
+ */
+static int vdpasim_set_map(struct vdpa_device *vdpa, unsigned int asid,
+			   struct vhost_iotlb *iotlb)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	const u64 lo = 0ULL, hi = ULLONG_MAX;
+	struct vhost_iotlb_map *entry;
+	struct vhost_iotlb *table;
+	int ret = 0;
+
+	if (asid >= vdpasim->dev_attr.nas)
+		return -EINVAL;
+
+	spin_lock(&vdpasim->iommu_lock);
+
+	table = &vdpasim->iommu[asid];
+	vhost_iotlb_reset(table);
+	vdpasim->iommu_pt[asid] = false;
+
+	entry = vhost_iotlb_itree_first(iotlb, lo, hi);
+	while (entry) {
+		ret = vhost_iotlb_add_range(table, entry->start, entry->last,
+					    entry->addr, entry->perm);
+		if (ret) {
+			/* Do not leave a half-populated table behind. */
+			vhost_iotlb_reset(table);
+			break;
+		}
+		entry = vhost_iotlb_itree_next(entry, lo, hi);
+	}
+
+	spin_unlock(&vdpasim->iommu_lock);
+
+	return ret;
+}
+
+/*
+ * Bind the device to @mm. The change is carried out on the worker thread
+ * and waited for; the worker's result is returned.
+ */
+static int vdpasim_bind_mm(struct vdpa_device *vdpa, struct mm_struct *mm)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	struct vdpasim_mm_work request = {
+		.vdpasim = vdpasim,
+		.mm_to_bind = mm,
+	};
+
+	vdpasim_worker_change_mm_sync(vdpasim, &request);
+
+	return request.ret;
+}
+
+/* Drop the mm binding by asking the worker to set mm_bound to NULL. */
+static void vdpasim_unbind_mm(struct vdpa_device *vdpa)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	struct vdpasim_mm_work request = {
+		.vdpasim = vdpasim,
+		.mm_to_bind = NULL,
+	};
+
+	vdpasim_worker_change_mm_sync(vdpasim, &request);
+}
+
+/*
+ * Add one mapping [@iova, @iova + @size - 1] -> @pa to address space
+ * @asid. If the address space still holds the catch-all mapping installed
+ * at reset (iommu_pt set), that mapping is dropped first. Returns the
+ * result of the insertion, or -EINVAL for an out-of-range @asid.
+ */
+static int vdpasim_dma_map(struct vdpa_device *vdpa, unsigned int asid,
+			   u64 iova, u64 size,
+			   u64 pa, u32 perm, void *opaque)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	struct vhost_iotlb *table;
+	int ret;
+
+	if (asid >= vdpasim->dev_attr.nas)
+		return -EINVAL;
+
+	table = &vdpasim->iommu[asid];
+
+	spin_lock(&vdpasim->iommu_lock);
+
+	if (vdpasim->iommu_pt[asid]) {
+		vhost_iotlb_reset(table);
+		vdpasim->iommu_pt[asid] = false;
+	}
+	ret = vhost_iotlb_add_range_ctx(table, iova, iova + size - 1,
+					pa, perm, opaque);
+
+	spin_unlock(&vdpasim->iommu_lock);
+
+	return ret;
+}
+
+/*
+ * Remove the mappings covering [@iova, @iova + @size - 1] from address
+ * space @asid. If the address space still holds the catch-all mapping
+ * installed at reset (iommu_pt set), that mapping is dropped first.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range @asid.
+ */
+static int vdpasim_dma_unmap(struct vdpa_device *vdpa, unsigned int asid,
+			     u64 iova, u64 size)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+	if (asid >= vdpasim->dev_attr.nas)
+		return -EINVAL;
+
+	spin_lock(&vdpasim->iommu_lock);
+
+	/*
+	 * The table and its iommu_pt flag are protected by iommu_lock, so
+	 * the passthrough teardown must happen inside the critical section,
+	 * exactly as vdpasim_dma_map() and vdpasim_set_map() do.
+	 */
+	if (vdpasim->iommu_pt[asid]) {
+		vhost_iotlb_reset(&vdpasim->iommu[asid]);
+		vdpasim->iommu_pt[asid] = false;
+	}
+
+	vhost_iotlb_del_range(&vdpasim->iommu[asid], iova, iova + size - 1);
+	spin_unlock(&vdpasim->iommu_lock);
+
+	return 0;
+}
+
+/*
+ * Release everything owned by the simulator device.
+ *
+ * This also runs for a device that vdpasim_create() only partially set up
+ * (its error path drops the device reference), so each resource is checked
+ * before it is torn down:
+ *  - the worker may be an ERR_PTR if its creation failed;
+ *  - vqs / iommu / iommu_pt may still be NULL;
+ *  - the iotlbs are only initialised once both iommu and iommu_pt have
+ *    been allocated, so they are reset only in that case.
+ */
+static void vdpasim_free(struct vdpa_device *vdpa)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	int i;
+
+	kthread_cancel_work_sync(&vdpasim->work);
+	if (!IS_ERR_OR_NULL(vdpasim->worker))
+		kthread_destroy_worker(vdpasim->worker);
+
+	if (vdpasim->vqs) {
+		for (i = 0; i < vdpasim->dev_attr.nvqs; i++) {
+			vringh_kiov_cleanup(&vdpasim->vqs[i].out_iov);
+			vringh_kiov_cleanup(&vdpasim->vqs[i].in_iov);
+		}
+	}
+
+	vdpasim->dev_attr.free(vdpasim);
+
+	if (vdpasim->iommu && vdpasim->iommu_pt) {
+		for (i = 0; i < vdpasim->dev_attr.nas; i++)
+			vhost_iotlb_reset(&vdpasim->iommu[i]);
+	}
+	kfree(vdpasim->iommu);
+	kfree(vdpasim->iommu_pt);
+	kfree(vdpasim->vqs);
+	kfree(vdpasim->config);
+}
+/*
+ * Config ops used when the batch_mapping module parameter is 0: mappings
+ * are added and removed one at a time through .dma_map / .dma_unmap.
+ */
+static const struct vdpa_config_ops vdpasim_config_ops = {
+ .set_vq_address = vdpasim_set_vq_address,
+ .set_vq_num = vdpasim_set_vq_num,
+ .kick_vq = vdpasim_kick_vq,
+ .set_vq_cb = vdpasim_set_vq_cb,
+ .set_vq_ready = vdpasim_set_vq_ready,
+ .get_vq_ready = vdpasim_get_vq_ready,
+ .set_vq_state = vdpasim_set_vq_state,
+ .get_vendor_vq_stats = vdpasim_get_vq_stats,
+ .get_vq_state = vdpasim_get_vq_state,
+ .get_vq_align = vdpasim_get_vq_align,
+ .get_vq_group = vdpasim_get_vq_group,
+ .get_device_features = vdpasim_get_device_features,
+ .get_backend_features = vdpasim_get_backend_features,
+ .set_driver_features = vdpasim_set_driver_features,
+ .get_driver_features = vdpasim_get_driver_features,
+ .set_config_cb = vdpasim_set_config_cb,
+ .get_vq_num_max = vdpasim_get_vq_num_max,
+ .get_device_id = vdpasim_get_device_id,
+ .get_vendor_id = vdpasim_get_vendor_id,
+ .get_status = vdpasim_get_status,
+ .set_status = vdpasim_set_status,
+ .reset = vdpasim_reset,
+ .suspend = vdpasim_suspend,
+ .resume = vdpasim_resume,
+ .get_config_size = vdpasim_get_config_size,
+ .get_config = vdpasim_get_config,
+ .set_config = vdpasim_set_config,
+ .get_generation = vdpasim_get_generation,
+ .get_iova_range = vdpasim_get_iova_range,
+ .set_group_asid = vdpasim_set_group_asid,
+ .dma_map = vdpasim_dma_map,
+ .dma_unmap = vdpasim_dma_unmap,
+ .bind_mm = vdpasim_bind_mm,
+ .unbind_mm = vdpasim_unbind_mm,
+ .free = vdpasim_free,
+};
+/*
+ * Config ops used when the batch_mapping module parameter is non-zero:
+ * identical to vdpasim_config_ops except that the whole translation table
+ * is replaced through .set_map instead of .dma_map / .dma_unmap.
+ */
+static const struct vdpa_config_ops vdpasim_batch_config_ops = {
+ .set_vq_address = vdpasim_set_vq_address,
+ .set_vq_num = vdpasim_set_vq_num,
+ .kick_vq = vdpasim_kick_vq,
+ .set_vq_cb = vdpasim_set_vq_cb,
+ .set_vq_ready = vdpasim_set_vq_ready,
+ .get_vq_ready = vdpasim_get_vq_ready,
+ .set_vq_state = vdpasim_set_vq_state,
+ .get_vendor_vq_stats = vdpasim_get_vq_stats,
+ .get_vq_state = vdpasim_get_vq_state,
+ .get_vq_align = vdpasim_get_vq_align,
+ .get_vq_group = vdpasim_get_vq_group,
+ .get_device_features = vdpasim_get_device_features,
+ .get_backend_features = vdpasim_get_backend_features,
+ .set_driver_features = vdpasim_set_driver_features,
+ .get_driver_features = vdpasim_get_driver_features,
+ .set_config_cb = vdpasim_set_config_cb,
+ .get_vq_num_max = vdpasim_get_vq_num_max,
+ .get_device_id = vdpasim_get_device_id,
+ .get_vendor_id = vdpasim_get_vendor_id,
+ .get_status = vdpasim_get_status,
+ .set_status = vdpasim_set_status,
+ .reset = vdpasim_reset,
+ .suspend = vdpasim_suspend,
+ .resume = vdpasim_resume,
+ .get_config_size = vdpasim_get_config_size,
+ .get_config = vdpasim_get_config,
+ .set_config = vdpasim_set_config,
+ .get_generation = vdpasim_get_generation,
+ .get_iova_range = vdpasim_get_iova_range,
+ .set_group_asid = vdpasim_set_group_asid,
+ .set_map = vdpasim_set_map,
+ .bind_mm = vdpasim_bind_mm,
+ .unbind_mm = vdpasim_unbind_mm,
+ .free = vdpasim_free,
+};
+
+MODULE_VERSION(DRV_VERSION);
+MODULE_LICENSE(DRV_LICENSE);
+MODULE_AUTHOR(DRV_AUTHOR);
+MODULE_DESCRIPTION(DRV_DESC);
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h
new file mode 100644
index 0000000000..bb137e4797
--- /dev/null
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h
@@ -0,0 +1,121 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2020, Red Hat Inc. All rights reserved.
+ */
+
+#ifndef _VDPA_SIM_H
+#define _VDPA_SIM_H
+
+#include <linux/iova.h>
+#include <linux/vringh.h>
+#include <linux/vdpa.h>
+#include <linux/virtio_byteorder.h>
+#include <linux/vhost_iotlb.h>
+#include <uapi/linux/virtio_config.h>
+
+#define VDPASIM_FEATURES ((1ULL << VIRTIO_F_ANY_LAYOUT) | \
+ (1ULL << VIRTIO_F_VERSION_1) | \
+ (1ULL << VIRTIO_F_ACCESS_PLATFORM))
+
+struct vdpasim;
+/* Per-virtqueue state of a simulated device. */
+struct vdpasim_virtqueue {
+ struct vringh vring; /* host-side ring accessor */
+ struct vringh_kiov in_iov; /* iov that receives device-written data and the status byte */
+ struct vringh_kiov out_iov; /* iov the request header/payload is pulled from */
+ unsigned short head; /* head of the descriptor chain being processed */
+ bool ready; /* set through set_vq_ready */
+ u64 desc_addr; /* descriptor area address from set_vq_address */
+ u64 device_addr; /* device (used) area address from set_vq_address */
+ u64 driver_addr; /* driver (avail) area address from set_vq_address */
+ u32 num; /* ring size from set_vq_num */
+ void *private; /* argument passed to cb */
+ irqreturn_t (*cb)(void *data); /* notification callback; may be NULL */
+};
+/* Device-type description handed to vdpasim_create() and copied into the device. */
+struct vdpasim_dev_attr {
+ struct vdpa_mgmt_dev *mgmt_dev; /* stored as the vdpa device's mdev */
+ const char *name; /* device name; also used in the worker thread name */
+ u64 supported_features; /* features offered to the driver */
+ size_t alloc_size; /* allocation size passed to __vdpa_alloc_device(); must be non-zero */
+ size_t config_size; /* size of the config space buffer */
+ int nvqs; /* number of virtqueues */
+ u32 id; /* value returned by get_device_id */
+ u32 ngroups; /* number of virtqueue groups */
+ u32 nas; /* number of address spaces */
+
+ void (*work_fn)(struct vdpasim *vdpasim); /* run by the worker thread */
+ void (*get_config)(struct vdpasim *vdpasim, void *config); /* optional: refresh config before a read */
+ void (*set_config)(struct vdpasim *vdpasim, const void *config); /* optional: called after a config write */
+ int (*get_stats)(struct vdpasim *vdpasim, u16 idx,
+ struct sk_buff *msg,
+ struct netlink_ext_ack *extack); /* optional: vendor vq statistics */
+ void (*free)(struct vdpasim *vdpasim); /* called unconditionally when the device is released */
+};
+
+/*
+ * State of each vdpasim device. The vdpa member is what container_of()
+ * uses to get from a struct vdpa_device back to this structure.
+ */
+struct vdpasim {
+ struct vdpa_device vdpa;
+ struct vdpasim_virtqueue *vqs; /* array of dev_attr.nvqs entries */
+ struct kthread_worker *worker; /* dedicated worker thread */
+ struct kthread_work work; /* work item that runs dev_attr.work_fn */
+ struct mm_struct *mm_bound; /* mm set by bind_mm, NULL when unbound */
+ struct vdpasim_dev_attr dev_attr; /* copy of the creation attributes */
+ /* mutex to synchronize virtqueue state */
+ struct mutex mutex;
+ /* virtio config according to device type */
+ void *config;
+ struct vhost_iotlb *iommu; /* one translation table per address space */
+ bool *iommu_pt; /* per address space: table holds the reset-time catch-all mapping */
+ u32 status;
+ u32 generation; /* incremented on every reset */
+ u64 features; /* features accepted from the driver */
+ u32 groups;
+ bool running; /* cleared by suspend, set by resume and reset */
+ bool pending_kick; /* a kick arrived while suspended */
+ /* spinlock to synchronize iommu table */
+ spinlock_t iommu_lock;
+};
+
+struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *attr,
+ const struct vdpa_dev_set_config *config);
+void vdpasim_schedule_work(struct vdpasim *vdpasim);
+
+/*
+ * Tell whether virtio fields of this device are little endian: true when
+ * legacy virtio is little endian on this build or VIRTIO_F_VERSION_1 has
+ * been negotiated.
+ *
+ * TODO: cross-endian support
+ */
+static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim)
+{
+	if (virtio_legacy_is_little_endian())
+		return true;
+
+	return vdpasim->features & (1ULL << VIRTIO_F_VERSION_1);
+}
+
+/* Convert a 16-bit virtio field of this device to CPU byte order. */
+static inline u16 vdpasim16_to_cpu(struct vdpasim *vdpasim, __virtio16 val)
+{
+	bool little_endian = vdpasim_is_little_endian(vdpasim);
+
+	return __virtio16_to_cpu(little_endian, val);
+}
+
+/* Convert a 16-bit CPU value to this device's virtio byte order. */
+static inline __virtio16 cpu_to_vdpasim16(struct vdpasim *vdpasim, u16 val)
+{
+	bool little_endian = vdpasim_is_little_endian(vdpasim);
+
+	return __cpu_to_virtio16(little_endian, val);
+}
+
+/* Convert a 32-bit virtio field of this device to CPU byte order. */
+static inline u32 vdpasim32_to_cpu(struct vdpasim *vdpasim, __virtio32 val)
+{
+	bool little_endian = vdpasim_is_little_endian(vdpasim);
+
+	return __virtio32_to_cpu(little_endian, val);
+}
+
+/* Convert a 32-bit CPU value to this device's virtio byte order. */
+static inline __virtio32 cpu_to_vdpasim32(struct vdpasim *vdpasim, u32 val)
+{
+	bool little_endian = vdpasim_is_little_endian(vdpasim);
+
+	return __cpu_to_virtio32(little_endian, val);
+}
+
+/* Convert a 64-bit virtio field of this device to CPU byte order. */
+static inline u64 vdpasim64_to_cpu(struct vdpasim *vdpasim, __virtio64 val)
+{
+	bool little_endian = vdpasim_is_little_endian(vdpasim);
+
+	return __virtio64_to_cpu(little_endian, val);
+}
+
+/* Convert a 64-bit CPU value to this device's virtio byte order. */
+static inline __virtio64 cpu_to_vdpasim64(struct vdpasim *vdpasim, u64 val)
+{
+	bool little_endian = vdpasim_is_little_endian(vdpasim);
+
+	return __cpu_to_virtio64(little_endian, val);
+}
+
+#endif
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
new file mode 100644
index 0000000000..b137f36793
--- /dev/null
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
@@ -0,0 +1,527 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VDPA simulator for block device.
+ *
+ * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2021, Red Hat Inc. All rights reserved.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/vringh.h>
+#include <linux/vdpa.h>
+#include <uapi/linux/virtio_blk.h>
+
+#include "vdpa_sim.h"
+
+#define DRV_VERSION "0.1"
+#define DRV_AUTHOR "Max Gurtovoy <mgurtovoy@nvidia.com>"
+#define DRV_DESC "vDPA Device Simulator for block device"
+#define DRV_LICENSE "GPL v2"
+
+#define VDPASIM_BLK_FEATURES (VDPASIM_FEATURES | \
+ (1ULL << VIRTIO_BLK_F_FLUSH) | \
+ (1ULL << VIRTIO_BLK_F_SIZE_MAX) | \
+ (1ULL << VIRTIO_BLK_F_SEG_MAX) | \
+ (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \
+ (1ULL << VIRTIO_BLK_F_TOPOLOGY) | \
+ (1ULL << VIRTIO_BLK_F_MQ) | \
+ (1ULL << VIRTIO_BLK_F_DISCARD) | \
+ (1ULL << VIRTIO_BLK_F_WRITE_ZEROES))
+
+#define VDPASIM_BLK_CAPACITY 0x40000
+#define VDPASIM_BLK_SIZE_MAX 0x1000
+#define VDPASIM_BLK_SEG_MAX 32
+#define VDPASIM_BLK_DWZ_MAX_SECTORS UINT_MAX
+
+/* 1 virtqueue, 1 address space, 1 virtqueue group */
+#define VDPASIM_BLK_VQ_NUM 1
+#define VDPASIM_BLK_AS_NUM 1
+#define VDPASIM_BLK_GROUP_NUM 1
+/* Block-device flavour of the simulator; vdpasim is what sim_to_blk() resolves from. */
+struct vdpasim_blk {
+ struct vdpasim vdpasim;
+ void *buffer; /* backing storage read and written by requests */
+ bool shared_backend; /* when set, buffer accesses take shared_buffer_mutex */
+};
+
+/* Map an embedded struct vdpasim back to its enclosing struct vdpasim_blk. */
+static struct vdpasim_blk *sim_to_blk(struct vdpasim *vdpasim)
+{
+	struct vdpasim_blk *blk;
+
+	blk = container_of(vdpasim, struct vdpasim_blk, vdpasim);
+
+	return blk;
+}
+
+static char vdpasim_blk_id[VIRTIO_BLK_ID_BYTES] = "vdpa_blk_sim";
+
+static bool shared_backend;
+module_param(shared_backend, bool, 0444);
+MODULE_PARM_DESC(shared_backend, "Enable the shared backend between virtio-blk devices");
+
+static void *shared_buffer;
+/* mutex to synchronize shared_buffer access */
+static DEFINE_MUTEX(shared_buffer_mutex);
+
+/* Take the shared-buffer mutex, but only for devices using the shared backend. */
+static void vdpasim_blk_buffer_lock(struct vdpasim_blk *blk)
+{
+	if (!blk->shared_backend)
+		return;
+
+	mutex_lock(&shared_buffer_mutex);
+}
+
+/* Release the shared-buffer mutex, but only for devices using the shared backend. */
+static void vdpasim_blk_buffer_unlock(struct vdpasim_blk *blk)
+{
+	if (!blk->shared_backend)
+		return;
+
+	mutex_unlock(&shared_buffer_mutex);
+}
+
+/*
+ * Check that a request covering @num_sectors sectors starting at
+ * @start_sector respects the per-request limit @max_sectors and lies
+ * entirely inside the simulated disk (VDPASIM_BLK_CAPACITY sectors).
+ *
+ * Returns true when the range is acceptable; otherwise a debug message is
+ * logged and false is returned.
+ */
+static bool vdpasim_blk_check_range(struct vdpasim *vdpasim, u64 start_sector,
+				    u64 num_sectors, u64 max_sectors)
+{
+	if (start_sector > VDPASIM_BLK_CAPACITY) {
+		dev_dbg(&vdpasim->vdpa.dev,
+			"starting sector exceeds the capacity - start: 0x%llx capacity: 0x%x\n",
+			start_sector, VDPASIM_BLK_CAPACITY);
+		/*
+		 * Reject right away: with start_sector beyond the capacity
+		 * the unsigned subtraction in the last check below wraps
+		 * around and would accept the out-of-range request.
+		 */
+		return false;
+	}
+
+	if (num_sectors > max_sectors) {
+		dev_dbg(&vdpasim->vdpa.dev,
+			"number of sectors exceeds the max allowed in a request - num: 0x%llx max: 0x%llx\n",
+			num_sectors, max_sectors);
+		return false;
+	}
+
+	if (num_sectors > VDPASIM_BLK_CAPACITY - start_sector) {
+		dev_dbg(&vdpasim->vdpa.dev,
+			"request exceeds the capacity - start: 0x%llx num: 0x%llx capacity: 0x%x\n",
+			start_sector, num_sectors, VDPASIM_BLK_CAPACITY);
+		return false;
+	}
+
+	return true;
+}
+
+/* Returns 'true' if the request is handled (with or without an I/O error)
+ * and the status is correctly written in the last byte of the 'in iov',
+ * 'false' otherwise.
+ */
+static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim,
+ struct vdpasim_virtqueue *vq)
+{
+ struct vdpasim_blk *blk = sim_to_blk(vdpasim);
+ size_t pushed = 0, to_pull, to_push;
+ struct virtio_blk_outhdr hdr;
+ bool handled = false;
+ ssize_t bytes;
+ loff_t offset;
+ u64 sector;
+ u8 status;
+ u32 type;
+ int ret;
+
+ ret = vringh_getdesc_iotlb(&vq->vring, &vq->out_iov, &vq->in_iov,
+ &vq->head, GFP_ATOMIC);
+ if (ret != 1)
+ return false;
+
+ if (vq->out_iov.used < 1 || vq->in_iov.used < 1) {
+ dev_dbg(&vdpasim->vdpa.dev, "missing headers - out_iov: %u in_iov %u\n",
+ vq->out_iov.used, vq->in_iov.used);
+ goto err;
+ }
+
+ if (vq->in_iov.iov[vq->in_iov.used - 1].iov_len < 1) {
+ dev_dbg(&vdpasim->vdpa.dev, "request in header too short\n");
+ goto err;
+ }
+
+ /* The last byte is the status and we checked if the last iov has
+ * enough room for it.
+ */
+ to_push = vringh_kiov_length(&vq->in_iov) - 1;
+
+ to_pull = vringh_kiov_length(&vq->out_iov);
+
+ bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->out_iov, &hdr,
+ sizeof(hdr));
+ if (bytes != sizeof(hdr)) {
+ dev_dbg(&vdpasim->vdpa.dev, "request out header too short\n");
+ goto err;
+ }
+
+ to_pull -= bytes;
+
+ type = vdpasim32_to_cpu(vdpasim, hdr.type);
+ sector = vdpasim64_to_cpu(vdpasim, hdr.sector);
+ offset = sector << SECTOR_SHIFT;
+ status = VIRTIO_BLK_S_OK;
+
+ if (type != VIRTIO_BLK_T_IN && type != VIRTIO_BLK_T_OUT &&
+ sector != 0) {
+ dev_dbg(&vdpasim->vdpa.dev,
+ "sector must be 0 for %u request - sector: 0x%llx\n",
+ type, sector);
+ status = VIRTIO_BLK_S_IOERR;
+ goto err_status;
+ }
+
+ switch (type) {
+ case VIRTIO_BLK_T_IN:
+ if (!vdpasim_blk_check_range(vdpasim, sector,
+ to_push >> SECTOR_SHIFT,
+ VDPASIM_BLK_SIZE_MAX * VDPASIM_BLK_SEG_MAX)) {
+ status = VIRTIO_BLK_S_IOERR;
+ break;
+ }
+
+ vdpasim_blk_buffer_lock(blk);
+ bytes = vringh_iov_push_iotlb(&vq->vring, &vq->in_iov,
+ blk->buffer + offset, to_push);
+ vdpasim_blk_buffer_unlock(blk);
+ if (bytes < 0) {
+ dev_dbg(&vdpasim->vdpa.dev,
+ "vringh_iov_push_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n",
+ bytes, offset, to_push);
+ status = VIRTIO_BLK_S_IOERR;
+ break;
+ }
+
+ pushed += bytes;
+ break;
+
+ case VIRTIO_BLK_T_OUT:
+ if (!vdpasim_blk_check_range(vdpasim, sector,
+ to_pull >> SECTOR_SHIFT,
+ VDPASIM_BLK_SIZE_MAX * VDPASIM_BLK_SEG_MAX)) {
+ status = VIRTIO_BLK_S_IOERR;
+ break;
+ }
+
+ vdpasim_blk_buffer_lock(blk);
+ bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->out_iov,
+ blk->buffer + offset, to_pull);
+ vdpasim_blk_buffer_unlock(blk);
+ if (bytes < 0) {
+ dev_dbg(&vdpasim->vdpa.dev,
+ "vringh_iov_pull_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n",
+ bytes, offset, to_pull);
+ status = VIRTIO_BLK_S_IOERR;
+ break;
+ }
+ break;
+
+ case VIRTIO_BLK_T_GET_ID:
+ bytes = vringh_iov_push_iotlb(&vq->vring, &vq->in_iov,
+ vdpasim_blk_id,
+ VIRTIO_BLK_ID_BYTES);
+ if (bytes < 0) {
+ dev_dbg(&vdpasim->vdpa.dev,
+ "vringh_iov_push_iotlb() error: %zd\n", bytes);
+ status = VIRTIO_BLK_S_IOERR;
+ break;
+ }
+
+ pushed += bytes;
+ break;
+
+ case VIRTIO_BLK_T_FLUSH:
+ /* nothing to do */
+ break;
+
+ case VIRTIO_BLK_T_DISCARD:
+ case VIRTIO_BLK_T_WRITE_ZEROES: {
+ struct virtio_blk_discard_write_zeroes range;
+ u32 num_sectors, flags;
+
+ if (to_pull != sizeof(range)) {
+ dev_dbg(&vdpasim->vdpa.dev,
+ "discard/write_zeroes header len: 0x%zx [expected: 0x%zx]\n",
+ to_pull, sizeof(range));
+ status = VIRTIO_BLK_S_IOERR;
+ break;
+ }
+
+ bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->out_iov, &range,
+ to_pull);
+ if (bytes < 0) {
+ dev_dbg(&vdpasim->vdpa.dev,
+ "vringh_iov_pull_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n",
+ bytes, offset, to_pull);
+ status = VIRTIO_BLK_S_IOERR;
+ break;
+ }
+
+ sector = le64_to_cpu(range.sector);
+ offset = sector << SECTOR_SHIFT;
+ num_sectors = le32_to_cpu(range.num_sectors);
+ flags = le32_to_cpu(range.flags);
+
+ if (type == VIRTIO_BLK_T_DISCARD && flags != 0) {
+ dev_dbg(&vdpasim->vdpa.dev,
+ "discard unexpected flags set - flags: 0x%x\n",
+ flags);
+ status = VIRTIO_BLK_S_UNSUPP;
+ break;
+ }
+
+ if (type == VIRTIO_BLK_T_WRITE_ZEROES &&
+ flags & ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
+ dev_dbg(&vdpasim->vdpa.dev,
+ "write_zeroes unexpected flags set - flags: 0x%x\n",
+ flags);
+ status = VIRTIO_BLK_S_UNSUPP;
+ break;
+ }
+
+ if (!vdpasim_blk_check_range(vdpasim, sector, num_sectors,
+ VDPASIM_BLK_DWZ_MAX_SECTORS)) {
+ status = VIRTIO_BLK_S_IOERR;
+ break;
+ }
+
+ if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
+ vdpasim_blk_buffer_lock(blk);
+ memset(blk->buffer + offset, 0,
+ num_sectors << SECTOR_SHIFT);
+ vdpasim_blk_buffer_unlock(blk);
+ }
+
+ break;
+ }
+ default:
+ dev_dbg(&vdpasim->vdpa.dev,
+ "Unsupported request type %d\n", type);
+ status = VIRTIO_BLK_S_IOERR;
+ break;
+ }
+
+err_status:
+ /* If some operations fail, we need to skip the remaining bytes
+ * to put the status in the last byte
+ */
+ if (to_push - pushed > 0)
+ vringh_kiov_advance(&vq->in_iov, to_push - pushed);
+
+ /* Last byte is the status */
+ bytes = vringh_iov_push_iotlb(&vq->vring, &vq->in_iov, &status, 1);
+ if (bytes != 1)
+ goto err;
+
+ pushed += bytes;
+
+ /* Make sure data is written before advancing index */
+ smp_wmb();
+
+ handled = true;
+
+err:
+ vringh_complete_iotlb(&vq->vring, vq->head, pushed);
+
+ return handled;
+}
+
+/*
+ * Work handler of the block simulator.
+ *
+ * With the device mutex held, handle pending requests on every ready
+ * virtqueue and notify the driver after each handled request. Once a queue
+ * has handled more than four requests in this pass it is left for later and
+ * the work is scheduled again after the mutex has been dropped. Nothing is
+ * done unless DRIVER_OK is set and the device is running.
+ */
+static void vdpasim_blk_work(struct vdpasim *vdpasim)
+{
+	bool again = false;
+	int qidx;
+
+	mutex_lock(&vdpasim->mutex);
+
+	if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK) || !vdpasim->running)
+		goto unlock;
+
+	for (qidx = 0; qidx < VDPASIM_BLK_VQ_NUM; qidx++) {
+		struct vdpasim_virtqueue *vq = &vdpasim->vqs[qidx];
+		int handled;
+
+		if (!vq->ready)
+			continue;
+
+		for (handled = 1; vdpasim_blk_handle_req(vdpasim, vq);
+		     handled++) {
+			/* Make sure used is visible before raising the interrupt. */
+			smp_wmb();
+
+			local_bh_disable();
+			if (vringh_need_notify_iotlb(&vq->vring) > 0)
+				vringh_notify(&vq->vring);
+			local_bh_enable();
+
+			/* Per-queue budget used up: come back later. */
+			if (handled > 4) {
+				again = true;
+				break;
+			}
+		}
+	}
+
+unlock:
+	mutex_unlock(&vdpasim->mutex);
+
+	if (again)
+		vdpasim_schedule_work(vdpasim);
+}
+
+/*
+ * Fill @config, interpreted as a struct virtio_blk_config, with the
+ * simulator's fixed block-device parameters. The structure is zeroed first
+ * and every value is converted with the cpu_to_vdpasim*() helpers.
+ */
+static void vdpasim_blk_get_config(struct vdpasim *vdpasim, void *config)
+{
+	struct virtio_blk_config *cfg = config;
+
+	memset(cfg, 0, sizeof(*cfg));
+
+	cfg->capacity = cpu_to_vdpasim64(vdpasim, VDPASIM_BLK_CAPACITY);
+	cfg->size_max = cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_SIZE_MAX);
+	cfg->seg_max = cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_SEG_MAX);
+	cfg->num_queues = cpu_to_vdpasim16(vdpasim, VDPASIM_BLK_VQ_NUM);
+	cfg->min_io_size = cpu_to_vdpasim16(vdpasim, 1);
+	cfg->opt_io_size = cpu_to_vdpasim32(vdpasim, 1);
+	cfg->blk_size = cpu_to_vdpasim32(vdpasim, SECTOR_SIZE);
+
+	/* VIRTIO_BLK_F_DISCARD */
+	cfg->discard_sector_alignment = cpu_to_vdpasim32(vdpasim, SECTOR_SIZE);
+	cfg->max_discard_sectors =
+		cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_DWZ_MAX_SECTORS);
+	cfg->max_discard_seg = cpu_to_vdpasim32(vdpasim, 1);
+
+	/* VIRTIO_BLK_F_WRITE_ZEROES */
+	cfg->max_write_zeroes_sectors =
+		cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_DWZ_MAX_SECTORS);
+	cfg->max_write_zeroes_seg = cpu_to_vdpasim32(vdpasim, 1);
+}
+
+/*
+ * Free callback of the block simulator: release the per-device backing
+ * buffer. A device using the shared backend does not own its buffer, so
+ * nothing is freed in that case.
+ */
+static void vdpasim_blk_free(struct vdpasim *vdpasim)
+{
+	struct vdpasim_blk *blk = sim_to_blk(vdpasim);
+
+	if (blk->shared_backend)
+		return;
+
+	kvfree(blk->buffer);
+}
+
+/*
+ * Release callback for vdpasim_blk_mgmtdev. The device below is a static
+ * object of this file, so there is nothing to free here.
+ */
+static void vdpasim_blk_mgmtdev_release(struct device *dev)
+{
+}
+
+/*
+ * Device registered by vdpasim_blk_init() and referenced through
+ * mgmt_dev.device.
+ */
+static struct device vdpasim_blk_mgmtdev = {
+ .init_name = "vdpasim_blk",
+ .release = vdpasim_blk_mgmtdev_release,
+};
+
+/*
+ * dev_add management operation: create one simulated block device.
+ *
+ * The simulator core creates the device from the attributes below. The
+ * backing buffer is then either the module-wide shared buffer or a freshly
+ * zeroed private allocation, after which the device is registered with
+ * VDPASIM_BLK_VQ_NUM virtqueues.
+ *
+ * Returns 0 on success or a negative errno. On failure after creation the
+ * device reference is dropped with put_device().
+ */
+static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
+			       const struct vdpa_dev_set_config *config)
+{
+	struct vdpasim_dev_attr dev_attr = {
+		.mgmt_dev = mdev,
+		.name = name,
+		.id = VIRTIO_ID_BLOCK,
+		.supported_features = VDPASIM_BLK_FEATURES,
+		.nvqs = VDPASIM_BLK_VQ_NUM,
+		.ngroups = VDPASIM_BLK_GROUP_NUM,
+		.nas = VDPASIM_BLK_AS_NUM,
+		.alloc_size = sizeof(struct vdpasim_blk),
+		.config_size = sizeof(struct virtio_blk_config),
+		.get_config = vdpasim_blk_get_config,
+		.work_fn = vdpasim_blk_work,
+		.free = vdpasim_blk_free,
+	};
+	struct vdpasim_blk *blk;
+	struct vdpasim *simdev;
+	int ret;
+
+	simdev = vdpasim_create(&dev_attr, config);
+	if (IS_ERR(simdev))
+		return PTR_ERR(simdev);
+
+	blk = sim_to_blk(simdev);
+	blk->shared_backend = shared_backend;
+
+	if (!blk->shared_backend) {
+		blk->buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT,
+				       GFP_KERNEL);
+		if (!blk->buffer) {
+			ret = -ENOMEM;
+			goto drop_ref;
+		}
+	} else {
+		blk->buffer = shared_buffer;
+	}
+
+	ret = _vdpa_register_device(&simdev->vdpa, VDPASIM_BLK_VQ_NUM);
+	if (!ret)
+		return 0;
+
+drop_ref:
+	put_device(&simdev->vdpa.dev);
+	return ret;
+}
+
+/* dev_del management operation: unregister the simulator device behind @dev. */
+static void vdpasim_blk_dev_del(struct vdpa_mgmt_dev *mdev,
+				struct vdpa_device *dev)
+{
+	struct vdpasim *sim;
+
+	sim = container_of(dev, struct vdpasim, vdpa);
+	_vdpa_unregister_device(&sim->vdpa);
+}
+
+/* Add/delete callbacks installed in mgmt_dev.ops below. */
+static const struct vdpa_mgmtdev_ops vdpasim_blk_mgmtdev_ops = {
+ .dev_add = vdpasim_blk_dev_add,
+ .dev_del = vdpasim_blk_dev_del
+};
+
+/* Device IDs for this management device: virtio block, any vendor. */
+static struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+
+/* Management device registered/unregistered by the module init/exit code. */
+static struct vdpa_mgmt_dev mgmt_dev = {
+ .device = &vdpasim_blk_mgmtdev,
+ .id_table = id_table,
+ .ops = &vdpasim_blk_mgmtdev_ops,
+};
+
+/*
+ * Module init: register the parent device, allocate the shared backing
+ * buffer when the shared backend is enabled, then register the management
+ * device.
+ *
+ * The shared buffer is allocated before vdpa_mgmtdev_register() because
+ * vdpasim_blk_dev_add() can be invoked as soon as the management device is
+ * registered and copies shared_buffer into the new device; allocating it
+ * afterwards would leave a window in which a device gets a NULL buffer.
+ *
+ * Returns 0 on success or a negative errno, with everything undone.
+ */
+static int __init vdpasim_blk_init(void)
+{
+	int ret;
+
+	ret = device_register(&vdpasim_blk_mgmtdev);
+	if (ret) {
+		put_device(&vdpasim_blk_mgmtdev);
+		return ret;
+	}
+
+	if (shared_backend) {
+		shared_buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT,
+					 GFP_KERNEL);
+		if (!shared_buffer) {
+			ret = -ENOMEM;
+			goto parent_err;
+		}
+	}
+
+	ret = vdpa_mgmtdev_register(&mgmt_dev);
+	if (ret)
+		goto buffer_err;
+
+	return 0;
+
+buffer_err:
+	/* kvfree(NULL) is a no-op, so this is safe without shared_backend. */
+	kvfree(shared_buffer);
+	shared_buffer = NULL;
+parent_err:
+	device_unregister(&vdpasim_blk_mgmtdev);
+	return ret;
+}
+
+/*
+ * Module exit: tear down in the reverse order of vdpasim_blk_init().
+ *
+ * The shared buffer is freed only after the management device has been
+ * unregistered, so that no device created through it can still be using
+ * the buffer when it is released.
+ */
+static void __exit vdpasim_blk_exit(void)
+{
+	vdpa_mgmtdev_unregister(&mgmt_dev);
+	device_unregister(&vdpasim_blk_mgmtdev);
+	kvfree(shared_buffer);
+}
+
+module_init(vdpasim_blk_init)
+module_exit(vdpasim_blk_exit)
+
+MODULE_VERSION(DRV_VERSION);
+MODULE_LICENSE(DRV_LICENSE);
+MODULE_AUTHOR(DRV_AUTHOR);
+MODULE_DESCRIPTION(DRV_DESC);
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
new file mode 100644
index 0000000000..cfe9629118
--- /dev/null
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
@@ -0,0 +1,564 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VDPA simulator for networking device.
+ *
+ * Copyright (c) 2020, Red Hat Inc. All rights reserved.
+ * Author: Jason Wang <jasowang@redhat.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/etherdevice.h>
+#include <linux/vringh.h>
+#include <linux/vdpa.h>
+#include <net/netlink.h>
+#include <uapi/linux/virtio_net.h>
+#include <uapi/linux/vdpa.h>
+
+#include "vdpa_sim.h"
+
+#define DRV_VERSION "0.1"
+#define DRV_AUTHOR "Jason Wang <jasowang@redhat.com>"
+#define DRV_DESC "vDPA Device Simulator for networking device"
+#define DRV_LICENSE "GPL v2"
+
+#define VDPASIM_NET_FEATURES (VDPASIM_FEATURES | \
+ (1ULL << VIRTIO_NET_F_MAC) | \
+ (1ULL << VIRTIO_NET_F_STATUS) | \
+ (1ULL << VIRTIO_NET_F_MTU) | \
+ (1ULL << VIRTIO_NET_F_CTRL_VQ) | \
+ (1ULL << VIRTIO_NET_F_CTRL_MAC_ADDR))
+
+/* 3 virtqueues, 2 address spaces, 2 virtqueue groups */
+#define VDPASIM_NET_VQ_NUM 3
+#define VDPASIM_NET_AS_NUM 2
+#define VDPASIM_NET_GROUP_NUM 2
+
+/*
+ * Counters for one data virtqueue direction (used for both TX and RX in
+ * struct vdpasim_net). vdpasim_net_work() updates them inside a
+ * u64_stats_update_begin()/end() section on @syncp; vdpasim_net_get_stats()
+ * reads them with the matching fetch/retry loop.
+ */
+struct vdpasim_dataq_stats {
+ struct u64_stats_sync syncp;
+ u64 pkts;
+ u64 bytes;
+ u64 drops;
+ u64 errors;
+ u64 overruns;
+};
+
+/*
+ * Control virtqueue counters, updated by vdpasim_handle_cvq() and reported
+ * by vdpasim_net_get_stats(); @syncp guards them the u64_stats way.
+ */
+struct vdpasim_cq_stats {
+ struct u64_stats_sync syncp;
+ u64 requests;
+ u64 successes;
+ u64 errors;
+};
+
+/*
+ * Network simulator device. Embeds the generic struct vdpasim (see
+ * sim_to_net()) and adds per-queue statistics plus @buffer, the PAGE_SIZE
+ * scratch area allocated in vdpasim_net_dev_add() that vdpasim_net_work()
+ * uses to carry a packet from the TX queue to the RX queue.
+ */
+struct vdpasim_net{
+ struct vdpasim vdpasim;
+ struct vdpasim_dataq_stats tx_stats;
+ struct vdpasim_dataq_stats rx_stats;
+ struct vdpasim_cq_stats cq_stats;
+ void *buffer;
+};
+
+/* Map the embedded generic simulator back to its enclosing vdpasim_net. */
+static struct vdpasim_net *sim_to_net(struct vdpasim *vdpasim)
+{
+	struct vdpasim_net *net;
+
+	net = container_of(vdpasim, struct vdpasim_net, vdpasim);
+
+	return net;
+}
+
+/*
+ * Complete the descriptor at vq->head with @len bytes and, with bottom
+ * halves disabled, call vringh_notify() when vringh_need_notify_iotlb()
+ * returns a positive value.
+ */
+static void vdpasim_net_complete(struct vdpasim_virtqueue *vq, size_t len)
+{
+ /* Make sure data is written before advancing index */
+ smp_wmb();
+
+ vringh_complete_iotlb(&vq->vring, vq->head, len);
+
+ /* Make sure used is visible before raising the interrupt. */
+ smp_wmb();
+
+ local_bh_disable();
+ if (vringh_need_notify_iotlb(&vq->vring) > 0)
+ vringh_notify(&vq->vring);
+ local_bh_enable();
+}
+
+/*
+ * Decide whether the packet currently held in net->buffer is accepted.
+ *
+ * @len is the number of valid bytes in the buffer, including the virtio-net
+ * header whose size depends on whether VIRTIO_F_VERSION_1 was negotiated.
+ * A packet is accepted when its destination address, the first ETH_ALEN
+ * bytes after the header, is broadcast, multicast, or equal to the MAC in
+ * the device config space. Packets too short to hold a destination address
+ * are rejected.
+ */
+static bool receive_filter(struct vdpasim *vdpasim, size_t len)
+{
+	bool modern = vdpasim->features & (1ULL << VIRTIO_F_VERSION_1);
+	size_t hdr_len = modern ? sizeof(struct virtio_net_hdr_v1) :
+				  sizeof(struct virtio_net_hdr);
+	struct virtio_net_config *vio_config = vdpasim->config;
+	struct vdpasim_net *net = sim_to_net(vdpasim);
+	const u8 *dst_mac = net->buffer + hdr_len;
+
+	if (len < ETH_ALEN + hdr_len)
+		return false;
+
+	if (is_broadcast_ether_addr(dst_mac) ||
+	    is_multicast_ether_addr(dst_mac))
+		return true;
+
+	/*
+	 * MAC addresses are binary data, not strings: strncmp() would stop
+	 * at the first zero byte both addresses share and report a match
+	 * even when later bytes differ. Compare all ETH_ALEN bytes.
+	 */
+	return !memcmp(dst_mac, vio_config->mac, ETH_ALEN);
+}
+
+/*
+ * Handle a VIRTIO_NET_CTRL_MAC class command from the control virtqueue.
+ *
+ * Only VIRTIO_NET_CTRL_MAC_ADDR_SET is implemented: the new address is
+ * pulled from the control queue's in_iov straight into the config space.
+ * Returns VIRTIO_NET_OK when exactly ETH_ALEN bytes were pulled, and
+ * VIRTIO_NET_ERR otherwise or for any other command.
+ */
+static virtio_net_ctrl_ack vdpasim_handle_ctrl_mac(struct vdpasim *vdpasim,
+						   u8 cmd)
+{
+	struct virtio_net_config *vio_config = vdpasim->config;
+	struct vdpasim_virtqueue *cvq = &vdpasim->vqs[2];
+	size_t pulled;
+
+	if (cmd != VIRTIO_NET_CTRL_MAC_ADDR_SET)
+		return VIRTIO_NET_ERR;
+
+	pulled = vringh_iov_pull_iotlb(&cvq->vring, &cvq->in_iov,
+				       vio_config->mac, ETH_ALEN);
+	if (pulled != ETH_ALEN)
+		return VIRTIO_NET_ERR;
+
+	return VIRTIO_NET_OK;
+}
+
+/*
+ * Service the control virtqueue (vqs[2]).
+ *
+ * Does nothing unless VIRTIO_NET_F_CTRL_VQ was negotiated and the queue is
+ * ready. Each request consists of a struct virtio_net_ctrl_hdr followed by
+ * command data; the one-byte ack status is pushed back, the descriptor is
+ * completed and the queue callback, if any, is invoked. Request, success
+ * and error counts are folded into net->cq_stats once the loop ends.
+ */
+static void vdpasim_handle_cvq(struct vdpasim *vdpasim)
+{
+	struct vdpasim_virtqueue *cvq = &vdpasim->vqs[2];
+	struct vdpasim_net *net = sim_to_net(vdpasim);
+	virtio_net_ctrl_ack status;
+	struct virtio_net_ctrl_hdr ctrl;
+	size_t read, write;
+	u64 requests = 0, errors = 0, successes = 0;
+	int err;
+
+	if (!(vdpasim->features & (1ULL << VIRTIO_NET_F_CTRL_VQ)))
+		return;
+
+	if (!cvq->ready)
+		return;
+
+	while (true) {
+		err = vringh_getdesc_iotlb(&cvq->vring, &cvq->in_iov,
+					   &cvq->out_iov,
+					   &cvq->head, GFP_ATOMIC);
+		if (err <= 0)
+			break;
+
+		/*
+		 * Start every request from the error state. Otherwise a
+		 * request with an unknown class would be acknowledged with
+		 * whatever status the previous request left behind.
+		 */
+		status = VIRTIO_NET_ERR;
+
+		++requests;
+		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->in_iov, &ctrl,
+					     sizeof(ctrl));
+		if (read != sizeof(ctrl)) {
+			/*
+			 * NOTE(review): the descriptor fetched above is not
+			 * completed on this path - confirm this is intended.
+			 */
+			++errors;
+			break;
+		}
+
+		switch (ctrl.class) {
+		case VIRTIO_NET_CTRL_MAC:
+			status = vdpasim_handle_ctrl_mac(vdpasim, ctrl.cmd);
+			break;
+		default:
+			break;
+		}
+
+		if (status == VIRTIO_NET_OK)
+			++successes;
+		else
+			++errors;
+
+		/* Make sure data is written before advancing index */
+		smp_wmb();
+
+		write = vringh_iov_push_iotlb(&cvq->vring, &cvq->out_iov,
+					      &status, sizeof(status));
+		vringh_complete_iotlb(&cvq->vring, cvq->head, write);
+		vringh_kiov_cleanup(&cvq->in_iov);
+		vringh_kiov_cleanup(&cvq->out_iov);
+
+		/* Make sure used is visible before raising the interrupt. */
+		smp_wmb();
+
+		local_bh_disable();
+		if (cvq->cb)
+			cvq->cb(cvq->private);
+		local_bh_enable();
+	}
+
+	u64_stats_update_begin(&net->cq_stats.syncp);
+	net->cq_stats.requests += requests;
+	net->cq_stats.errors += errors;
+	net->cq_stats.successes += successes;
+	u64_stats_update_end(&net->cq_stats.syncp);
+}
+
+/*
+ * Work handler of the network simulator.
+ *
+ * With the device mutex held, first service the control virtqueue, then
+ * loop packets from the TX queue (vqs[1]) back to the RX queue (vqs[0])
+ * through net->buffer. Packets rejected by receive_filter() are completed
+ * on TX and counted as RX drops. Once more than four TX packets have been
+ * taken and one has been forwarded, the work is rescheduled and the loop
+ * ends. The locally accumulated counters are folded into the TX/RX
+ * statistics after the mutex is released.
+ */
+static void vdpasim_net_work(struct vdpasim *vdpasim)
+{
+	struct vdpasim_virtqueue *txq = &vdpasim->vqs[1];
+	struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0];
+	struct vdpasim_net *net = sim_to_net(vdpasim);
+	ssize_t read, write;
+	u64 tx_pkts = 0, rx_pkts = 0, tx_bytes = 0, rx_bytes = 0;
+	u64 rx_drops = 0, rx_overruns = 0, rx_errors = 0, tx_errors = 0;
+	int err;
+
+	mutex_lock(&vdpasim->mutex);
+
+	if (!vdpasim->running)
+		goto out;
+
+	if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK))
+		goto out;
+
+	vdpasim_handle_cvq(vdpasim);
+
+	if (!txq->ready || !rxq->ready)
+		goto out;
+
+	while (true) {
+		err = vringh_getdesc_iotlb(&txq->vring, &txq->out_iov, NULL,
+					   &txq->head, GFP_ATOMIC);
+		if (err <= 0) {
+			if (err)
+				++tx_errors;
+			break;
+		}
+
+		++tx_pkts;
+		read = vringh_iov_pull_iotlb(&txq->vring, &txq->out_iov,
+					     net->buffer, PAGE_SIZE);
+		if (read < 0) {
+			/*
+			 * The pull failed, so the buffer holds no packet.
+			 * A negative value must not reach tx_bytes or be
+			 * used as a length below, where it would turn into
+			 * a huge size_t. Count the error and give the
+			 * descriptor back.
+			 */
+			++tx_errors;
+			vdpasim_net_complete(txq, 0);
+			continue;
+		}
+
+		tx_bytes += read;
+
+		if (!receive_filter(vdpasim, read)) {
+			++rx_drops;
+			vdpasim_net_complete(txq, 0);
+			continue;
+		}
+
+		err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->in_iov,
+					   &rxq->head, GFP_ATOMIC);
+		if (err <= 0) {
+			++rx_overruns;
+			vdpasim_net_complete(txq, 0);
+			break;
+		}
+
+		write = vringh_iov_push_iotlb(&rxq->vring, &rxq->in_iov,
+					      net->buffer, read);
+		if (write <= 0) {
+			/*
+			 * NOTE(review): neither the TX nor the RX descriptor
+			 * is completed on this path - confirm this is
+			 * intended.
+			 */
+			++rx_errors;
+			break;
+		}
+
+		++rx_pkts;
+		rx_bytes += write;
+
+		vdpasim_net_complete(txq, 0);
+		vdpasim_net_complete(rxq, write);
+
+		/* Budget used up: let the rest be handled by another run. */
+		if (tx_pkts > 4) {
+			vdpasim_schedule_work(vdpasim);
+			goto out;
+		}
+	}
+
+out:
+	mutex_unlock(&vdpasim->mutex);
+
+	u64_stats_update_begin(&net->tx_stats.syncp);
+	net->tx_stats.pkts += tx_pkts;
+	net->tx_stats.bytes += tx_bytes;
+	net->tx_stats.errors += tx_errors;
+	u64_stats_update_end(&net->tx_stats.syncp);
+
+	u64_stats_update_begin(&net->rx_stats.syncp);
+	net->rx_stats.pkts += rx_pkts;
+	net->rx_stats.bytes += rx_bytes;
+	net->rx_stats.drops += rx_drops;
+	net->rx_stats.errors += rx_errors;
+	net->rx_stats.overruns += rx_overruns;
+	u64_stats_update_end(&net->rx_stats.syncp);
+}
+
+/*
+ * Report the statistics of virtqueue @idx into netlink message @msg.
+ *
+ * @idx 0 selects the RX counters, 1 the TX counters and 2 the control
+ * queue counters. A consistent snapshot is taken with the u64_stats
+ * fetch/retry loop, then every counter is emitted as a
+ * VDPA_ATTR_DEV_VENDOR_ATTR_NAME / VDPA_ATTR_DEV_VENDOR_ATTR_VALUE pair.
+ *
+ * Returns 0 on success, -EINVAL for any other @idx and -EMSGSIZE when an
+ * attribute could not be added to @msg.
+ */
+static int vdpasim_net_get_stats(struct vdpasim *vdpasim, u16 idx,
+				 struct sk_buff *msg,
+				 struct netlink_ext_ack *extack)
+{
+	struct vdpasim_net *net = sim_to_net(vdpasim);
+	const char *names[5];
+	u64 values[5];
+	unsigned int start, count, i;
+
+	switch (idx) {
+	case 0:
+		do {
+			start = u64_stats_fetch_begin(&net->rx_stats.syncp);
+			values[0] = net->rx_stats.pkts;
+			values[1] = net->rx_stats.bytes;
+			values[2] = net->rx_stats.errors;
+			values[3] = net->rx_stats.overruns;
+			values[4] = net->rx_stats.drops;
+		} while (u64_stats_fetch_retry(&net->rx_stats.syncp, start));
+
+		names[0] = "rx packets";
+		names[1] = "rx bytes";
+		names[2] = "rx errors";
+		names[3] = "rx overruns";
+		names[4] = "rx drops";
+		count = 5;
+		break;
+	case 1:
+		do {
+			start = u64_stats_fetch_begin(&net->tx_stats.syncp);
+			values[0] = net->tx_stats.pkts;
+			values[1] = net->tx_stats.bytes;
+			values[2] = net->tx_stats.errors;
+			values[3] = net->tx_stats.drops;
+		} while (u64_stats_fetch_retry(&net->tx_stats.syncp, start));
+
+		names[0] = "tx packets";
+		names[1] = "tx bytes";
+		names[2] = "tx errors";
+		names[3] = "tx drops";
+		count = 4;
+		break;
+	case 2:
+		do {
+			start = u64_stats_fetch_begin(&net->cq_stats.syncp);
+			values[0] = net->cq_stats.requests;
+			values[1] = net->cq_stats.successes;
+			values[2] = net->cq_stats.errors;
+		} while (u64_stats_fetch_retry(&net->cq_stats.syncp, start));
+
+		names[0] = "cvq requests";
+		names[1] = "cvq successes";
+		names[2] = "cvq errors";
+		count = 3;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* Emit name/value pairs in order; stop at the first one that fails. */
+	for (i = 0; i < count; i++) {
+		if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME,
+				   names[i]))
+			return -EMSGSIZE;
+		if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE,
+				      values[i], VDPA_ATTR_PAD))
+			return -EMSGSIZE;
+	}
+
+	return 0;
+}
+
+/*
+ * get_config callback: only the status field of the virtio-net config in
+ * @config is written, and it always reports the link as up.
+ */
+static void vdpasim_net_get_config(struct vdpasim *vdpasim, void *config)
+{
+	struct virtio_net_config *cfg = config;
+
+	cfg->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP);
+}
+
+/*
+ * Apply the attributes passed at device creation to the config space:
+ * the MAC address when present in @config->mask, and the MTU, which
+ * defaults to 1500 when not provided.
+ */
+static void vdpasim_net_setup_config(struct vdpasim *vdpasim,
+				     const struct vdpa_dev_set_config *config)
+{
+	struct virtio_net_config *vio_config = vdpasim->config;
+	u16 mtu = 1500;
+
+	if (config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR))
+		memcpy(vio_config->mac, config->net.mac, ETH_ALEN);
+
+	if (config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MTU))
+		mtu = config->net.mtu;
+
+	vio_config->mtu = cpu_to_vdpasim16(vdpasim, mtu);
+}
+
+/* Free callback of the network simulator: release the packet buffer. */
+static void vdpasim_net_free(struct vdpasim *vdpasim)
+{
+	kvfree(sim_to_net(vdpasim)->buffer);
+}
+
+/*
+ * Release callback for vdpasim_net_mgmtdev. The device below is a static
+ * object of this file, so there is nothing to free here.
+ */
+static void vdpasim_net_mgmtdev_release(struct device *dev)
+{
+}
+
+/*
+ * Device registered by vdpasim_net_init() and referenced through
+ * mgmt_dev.device.
+ */
+static struct device vdpasim_net_mgmtdev = {
+ .init_name = "vdpasim_net",
+ .release = vdpasim_net_mgmtdev_release,
+};
+
+/*
+ * dev_add management operation: create one simulated network device.
+ *
+ * The simulator core creates the device from the attributes below; the
+ * config space, the statistics sync points and the PAGE_SIZE packet buffer
+ * are set up next, and only then is the device registered.
+ *
+ * Returns 0 on success or a negative errno. On failure after creation the
+ * device reference is dropped with put_device().
+ */
+static int vdpasim_net_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
+			       const struct vdpa_dev_set_config *config)
+{
+	struct vdpasim_dev_attr dev_attr = {
+		.mgmt_dev = mdev,
+		.name = name,
+		.id = VIRTIO_ID_NET,
+		.supported_features = VDPASIM_NET_FEATURES,
+		.nvqs = VDPASIM_NET_VQ_NUM,
+		.ngroups = VDPASIM_NET_GROUP_NUM,
+		.nas = VDPASIM_NET_AS_NUM,
+		.alloc_size = sizeof(struct vdpasim_net),
+		.config_size = sizeof(struct virtio_net_config),
+		.get_config = vdpasim_net_get_config,
+		.work_fn = vdpasim_net_work,
+		.get_stats = vdpasim_net_get_stats,
+		.free = vdpasim_net_free,
+	};
+	struct vdpasim_net *net;
+	struct vdpasim *simdev;
+	int ret;
+
+	simdev = vdpasim_create(&dev_attr, config);
+	if (IS_ERR(simdev))
+		return PTR_ERR(simdev);
+
+	vdpasim_net_setup_config(simdev, config);
+
+	net = sim_to_net(simdev);
+	u64_stats_init(&net->tx_stats.syncp);
+	u64_stats_init(&net->rx_stats.syncp);
+	u64_stats_init(&net->cq_stats.syncp);
+
+	net->buffer = kvmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!net->buffer) {
+		ret = -ENOMEM;
+		goto drop_ref;
+	}
+
+	/*
+	 * Everything must be initialized by now: registering can connect
+	 * the device to the vDPA bus, so requests may arrive as soon as
+	 * this call is made.
+	 */
+	ret = _vdpa_register_device(&simdev->vdpa, VDPASIM_NET_VQ_NUM);
+	if (!ret)
+		return 0;
+
+drop_ref:
+	put_device(&simdev->vdpa.dev);
+	return ret;
+}
+
+/* dev_del management operation: unregister the simulator device behind @dev. */
+static void vdpasim_net_dev_del(struct vdpa_mgmt_dev *mdev,
+				struct vdpa_device *dev)
+{
+	struct vdpasim *sim;
+
+	sim = container_of(dev, struct vdpasim, vdpa);
+	_vdpa_unregister_device(&sim->vdpa);
+}
+
+/* Add/delete callbacks installed in mgmt_dev.ops below. */
+static const struct vdpa_mgmtdev_ops vdpasim_net_mgmtdev_ops = {
+ .dev_add = vdpasim_net_dev_add,
+ .dev_del = vdpasim_net_dev_del
+};
+
+/* Device IDs for this management device: virtio net, any vendor. */
+static struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+
+/*
+ * Management device registered/unregistered by the module init/exit code.
+ * config_attr_mask lists the MAC address, MTU and device features
+ * attributes.
+ */
+static struct vdpa_mgmt_dev mgmt_dev = {
+ .device = &vdpasim_net_mgmtdev,
+ .id_table = id_table,
+ .ops = &vdpasim_net_mgmtdev_ops,
+ .config_attr_mask = (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR |
+ 1 << VDPA_ATTR_DEV_NET_CFG_MTU |
+ 1 << VDPA_ATTR_DEV_FEATURES),
+ .max_supported_vqs = VDPASIM_NET_VQ_NUM,
+ .supported_features = VDPASIM_NET_FEATURES,
+};
+
+/*
+ * Module init: register the parent device, then the management device.
+ * If the second step fails the parent device is unregistered again.
+ * Returns 0 on success or a negative errno.
+ */
+static int __init vdpasim_net_init(void)
+{
+	int ret = device_register(&vdpasim_net_mgmtdev);
+
+	if (ret) {
+		put_device(&vdpasim_net_mgmtdev);
+		return ret;
+	}
+
+	ret = vdpa_mgmtdev_register(&mgmt_dev);
+	if (ret)
+		device_unregister(&vdpasim_net_mgmtdev);
+
+	return ret;
+}
+
+/*
+ * Module exit: undo vdpasim_net_init() in reverse order - management
+ * device first, then the parent device.
+ */
+static void __exit vdpasim_net_exit(void)
+{
+ vdpa_mgmtdev_unregister(&mgmt_dev);
+ device_unregister(&vdpasim_net_mgmtdev);
+}
+
+module_init(vdpasim_net_init);
+module_exit(vdpasim_net_exit);
+
+MODULE_VERSION(DRV_VERSION);
+MODULE_LICENSE(DRV_LICENSE);
+MODULE_AUTHOR(DRV_AUTHOR);
+MODULE_DESCRIPTION(DRV_DESC);
diff --git a/drivers/vdpa/vdpa_user/Makefile b/drivers/vdpa/vdpa_user/Makefile
new file mode 100644
index 0000000000..260e0b26af
--- /dev/null
+++ b/drivers/vdpa/vdpa_user/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+vduse-y := vduse_dev.o iova_domain.o
+
+obj-$(CONFIG_VDPA_USER) += vduse.o
diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
new file mode 100644
index 0000000000..5e4a77b9ba
--- /dev/null
+++ b/drivers/vdpa/vdpa_user/iova_domain.c
@@ -0,0 +1,624 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * MMU-based software IOTLB.
+ *
+ * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xie Yongji <xieyongji@bytedance.com>
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/vdpa.h>
+
+#include "iova_domain.h"
+
+/*
+ * Add the range [@start, @last] -> @addr with permission @perm to the
+ * domain's IOTLB, tagging it with a vdpa_map_file that holds a new
+ * reference on @file plus @offset. The tag is allocated with GFP_ATOMIC.
+ *
+ * Returns 0 on success, -ENOMEM when the tag cannot be allocated, or the
+ * error from vhost_iotlb_add_range_ctx(), in which case the file reference
+ * and the tag are released again.
+ */
+static int vduse_iotlb_add_range(struct vduse_iova_domain *domain,
+				 u64 start, u64 last,
+				 u64 addr, unsigned int perm,
+				 struct file *file, u64 offset)
+{
+	struct vdpa_map_file *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
+	int err;
+
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->file = get_file(file);
+	ctx->offset = offset;
+
+	err = vhost_iotlb_add_range_ctx(domain->iotlb, start, last,
+					addr, perm, ctx);
+	if (!err)
+		return 0;
+
+	fput(ctx->file);
+	kfree(ctx);
+	return err;
+}
+
+/*
+ * Remove every IOTLB mapping overlapping [@start, @last], dropping the file
+ * reference and freeing the vdpa_map_file attached to each of them.
+ */
+static void vduse_iotlb_del_range(struct vduse_iova_domain *domain,
+				  u64 start, u64 last)
+{
+	for (;;) {
+		struct vhost_iotlb_map *map;
+		struct vdpa_map_file *ctx;
+
+		map = vhost_iotlb_itree_first(domain->iotlb, start, last);
+		if (!map)
+			break;
+
+		ctx = map->opaque;
+		fput(ctx->file);
+		kfree(ctx);
+		vhost_iotlb_map_free(domain->iotlb, map);
+	}
+}
+
+/*
+ * Replace the domain's IOTLB contents with a copy of @iotlb.
+ *
+ * Under iotlb_lock, all existing mappings are dropped and every mapping of
+ * @iotlb is re-added together with its file and offset. If one of them
+ * cannot be added, everything added so far is removed again and the error
+ * is returned; otherwise 0 is returned.
+ */
+int vduse_domain_set_map(struct vduse_iova_domain *domain,
+			 struct vhost_iotlb *iotlb)
+{
+	const u64 start = 0ULL, last = ULLONG_MAX;
+	struct vhost_iotlb_map *map;
+	int ret = 0;
+
+	spin_lock(&domain->iotlb_lock);
+	vduse_iotlb_del_range(domain, start, last);
+
+	map = vhost_iotlb_itree_first(iotlb, start, last);
+	while (map) {
+		struct vdpa_map_file *src = map->opaque;
+
+		ret = vduse_iotlb_add_range(domain, map->start, map->last,
+					    map->addr, map->perm,
+					    src->file, src->offset);
+		if (ret) {
+			/* Roll back to an empty table. */
+			vduse_iotlb_del_range(domain, start, last);
+			break;
+		}
+		map = vhost_iotlb_itree_next(map, start, last);
+	}
+	spin_unlock(&domain->iotlb_lock);
+
+	return ret;
+}
+
+/*
+ * For every mapping present in @iotlb, remove the overlapping range from
+ * the domain's own IOTLB. Runs under iotlb_lock.
+ */
+void vduse_domain_clear_map(struct vduse_iova_domain *domain,
+			    struct vhost_iotlb *iotlb)
+{
+	const u64 start = 0ULL, last = ULLONG_MAX;
+	struct vhost_iotlb_map *map;
+
+	spin_lock(&domain->iotlb_lock);
+	map = vhost_iotlb_itree_first(iotlb, start, last);
+	while (map) {
+		vduse_iotlb_del_range(domain, map->start, map->last);
+		map = vhost_iotlb_itree_next(map, start, last);
+	}
+	spin_unlock(&domain->iotlb_lock);
+}
+
+/*
+ * Associate the bounce slots covering [@iova, @iova + @size) with the
+ * original physical range starting at @paddr, one page per slot. A slot
+ * without a bounce page gets one allocated with GFP_ATOMIC.
+ *
+ * Returns 0 on success or -ENOMEM when a page cannot be allocated.
+ */
+static int vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
+					u64 iova, u64 size, u64 paddr)
+{
+	u64 last = iova + size - 1;
+
+	for (; iova <= last; iova += PAGE_SIZE, paddr += PAGE_SIZE) {
+		struct vduse_bounce_map *map;
+
+		map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+		if (!map->bounce_page) {
+			map->bounce_page = alloc_page(GFP_ATOMIC);
+			if (!map->bounce_page)
+				return -ENOMEM;
+		}
+		map->orig_phys = paddr;
+	}
+
+	return 0;
+}
+
+/*
+ * Mark the bounce slots covering [@iova, @iova + @size) as unused by
+ * resetting their original physical address. The bounce pages stay
+ * allocated.
+ */
+static void vduse_domain_unmap_bounce_page(struct vduse_iova_domain *domain,
+					   u64 iova, u64 size)
+{
+	u64 last = iova + size - 1;
+
+	for (; iova <= last; iova += PAGE_SIZE)
+		domain->bounce_maps[iova >> PAGE_SHIFT].orig_phys =
+			INVALID_PHYS_ADDR;
+}
+
+/*
+ * Copy @size bytes between the original memory at physical address @orig
+ * and the bounce memory at @addr, page by page. DMA_TO_DEVICE copies from
+ * the original pages into @addr; any other direction copies from @addr
+ * back into the original pages.
+ */
+static void do_bounce(phys_addr_t orig, void *addr, size_t size,
+		      enum dma_data_direction dir)
+{
+	unsigned long pfn = PFN_DOWN(orig);
+	unsigned int offset = offset_in_page(orig);
+
+	/* Only the first page can start at a non-zero offset. */
+	for (; size; pfn++, offset = 0) {
+		unsigned int chunk = min_t(size_t, PAGE_SIZE - offset, size);
+		struct page *page = pfn_to_page(pfn);
+
+		if (dir != DMA_TO_DEVICE)
+			memcpy_to_page(page, offset, addr, chunk);
+		else
+			memcpy_from_page(addr, page, offset, chunk);
+
+		size -= chunk;
+		addr += chunk;
+	}
+}
+
+/*
+ * Synchronize [@iova, @iova + @size) between the bounce pages and the
+ * original memory in direction @dir. Nothing is done when @iova lies at or
+ * beyond bounce_size. The copy stops with a warning at the first slot that
+ * has no bounce page or no original address.
+ */
+static void vduse_domain_bounce(struct vduse_iova_domain *domain,
+				dma_addr_t iova, size_t size,
+				enum dma_data_direction dir)
+{
+	size_t chunk;
+
+	if (iova >= domain->bounce_size)
+		return;
+
+	for (; size; size -= chunk, iova += chunk) {
+		struct vduse_bounce_map *map;
+		unsigned int offset = offset_in_page(iova);
+		void *vaddr;
+
+		map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+		chunk = min_t(size_t, PAGE_SIZE - offset, size);
+
+		if (WARN_ON(!map->bounce_page ||
+			    map->orig_phys == INVALID_PHYS_ADDR))
+			return;
+
+		vaddr = kmap_local_page(map->bounce_page);
+		do_bounce(map->orig_phys + offset, vaddr + offset, chunk, dir);
+		kunmap_local(vaddr);
+	}
+}
+
+/*
+ * Look up the page backing @iova in the domain's IOTLB.
+ *
+ * Returns the page with an extra reference taken, or NULL when no mapping
+ * overlaps the page-aligned range containing @iova.
+ */
+static struct page *
+vduse_domain_get_coherent_page(struct vduse_iova_domain *domain, u64 iova)
+{
+	u64 first = iova & PAGE_MASK;
+	u64 last = first + PAGE_SIZE - 1;
+	struct vhost_iotlb_map *map;
+	struct page *page = NULL;
+
+	spin_lock(&domain->iotlb_lock);
+	map = vhost_iotlb_itree_first(domain->iotlb, first, last);
+	if (map) {
+		page = pfn_to_page((map->addr + iova - map->start) >>
+				   PAGE_SHIFT);
+		get_page(page);
+	}
+	spin_unlock(&domain->iotlb_lock);
+
+	return page;
+}
+
+/*
+ * Return the kernel bounce page of the slot containing @iova with an extra
+ * reference taken. NULL is returned when user bounce pages are in use or
+ * the slot has no page.
+ */
+static struct page *
+vduse_domain_get_bounce_page(struct vduse_iova_domain *domain, u64 iova)
+{
+	struct vduse_bounce_map *map;
+	struct page *page = NULL;
+
+	read_lock(&domain->bounce_lock);
+	map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+	if (!domain->user_bounce_pages && map->bounce_page) {
+		page = map->bounce_page;
+		get_page(page);
+	}
+	read_unlock(&domain->bounce_lock);
+
+	return page;
+}
+
+/*
+ * Free every allocated bounce page of the domain. A slot still carrying an
+ * original address triggers a warning and is skipped.
+ */
+static void
+vduse_domain_free_kernel_bounce_pages(struct vduse_iova_domain *domain)
+{
+	unsigned long nr_slots = domain->bounce_size >> PAGE_SHIFT;
+	unsigned long pfn;
+
+	for (pfn = 0; pfn < nr_slots; pfn++) {
+		struct vduse_bounce_map *map = &domain->bounce_maps[pfn];
+
+		if (WARN_ON(map->orig_phys != INVALID_PHYS_ADDR) ||
+		    !map->bounce_page)
+			continue;
+
+		__free_page(map->bounce_page);
+		map->bounce_page = NULL;
+	}
+}
+
+/*
+ * Switch the domain from kernel bounce pages to the caller-supplied
+ * @pages.
+ *
+ * @count must cover the whole bounce area. Under the bounce write lock,
+ * each slot's kernel page (if any) is freed after its contents have been
+ * copied to the user page when the slot is in use; the user page is then
+ * installed with an extra reference.
+ *
+ * Returns 0 on success, -EINVAL when @count does not match the bounce area
+ * and -EEXIST when user bounce pages are already installed.
+ */
+int vduse_domain_add_user_bounce_pages(struct vduse_iova_domain *domain,
+				       struct page **pages, int count)
+{
+	int ret = -EEXIST;
+	int i;
+
+	/* Now we don't support partial mapping */
+	if (count != (domain->bounce_size >> PAGE_SHIFT))
+		return -EINVAL;
+
+	write_lock(&domain->bounce_lock);
+	if (!domain->user_bounce_pages) {
+		for (i = 0; i < count; i++) {
+			struct vduse_bounce_map *map = &domain->bounce_maps[i];
+			struct page *kpage = map->bounce_page;
+
+			if (kpage) {
+				/* Copy kernel page to user page if it's in use */
+				if (map->orig_phys != INVALID_PHYS_ADDR)
+					memcpy_to_page(pages[i], 0,
+						       page_address(kpage),
+						       PAGE_SIZE);
+				__free_page(kpage);
+			}
+			map->bounce_page = pages[i];
+			get_page(pages[i]);
+		}
+		domain->user_bounce_pages = true;
+		ret = 0;
+	}
+	write_unlock(&domain->bounce_lock);
+
+	return ret;
+}
+
+/*
+ * Switch the domain back from user bounce pages to kernel bounce pages.
+ *
+ * Under the bounce write lock, the reference on each slot's user page is
+ * dropped. A slot that is still in use first gets a new kernel page holding
+ * a copy of the user page's contents; an unused slot is left without a
+ * page. Does nothing when no user bounce pages are installed.
+ */
+void vduse_domain_remove_user_bounce_pages(struct vduse_iova_domain *domain)
+{
+ struct vduse_bounce_map *map;
+ unsigned long i, count;
+
+ write_lock(&domain->bounce_lock);
+ if (!domain->user_bounce_pages)
+ goto out;
+
+ count = domain->bounce_size >> PAGE_SHIFT;
+ for (i = 0; i < count; i++) {
+ struct page *page = NULL;
+
+ map = &domain->bounce_maps[i];
+ if (WARN_ON(!map->bounce_page))
+ continue;
+
+ /* Copy user page to kernel page if it's in use */
+ if (map->orig_phys != INVALID_PHYS_ADDR) {
+ /*
+ * NOTE(review): GFP_ATOMIC combined with __GFP_NOFAIL
+ * while holding the write lock - presumably the
+ * allocator cannot honour "no fail" without being
+ * allowed to sleep; confirm against the GFP flag
+ * documentation. The result is used unchecked below.
+ */
+ page = alloc_page(GFP_ATOMIC | __GFP_NOFAIL);
+ memcpy_from_page(page_address(page),
+ map->bounce_page, 0, PAGE_SIZE);
+ }
+ put_page(map->bounce_page);
+ map->bounce_page = page;
+ }
+ domain->user_bounce_pages = false;
+out:
+ write_unlock(&domain->bounce_lock);
+}
+
+/*
+ * Remove the IOTLB entry covering the bounce area, if one is installed.
+ * The bounce_map flag is checked once without the lock as a fast path and
+ * again under iotlb_lock before the entry is removed.
+ */
+void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain)
+{
+	if (!domain->bounce_map)
+		return;
+
+	spin_lock(&domain->iotlb_lock);
+	if (domain->bounce_map) {
+		vduse_iotlb_del_range(domain, 0, domain->bounce_size - 1);
+		domain->bounce_map = 0;
+	}
+	spin_unlock(&domain->iotlb_lock);
+}
+
+/*
+ * Install the IOTLB entry covering the bounce area unless it is already
+ * there. The bounce_map flag is checked once without the lock as a fast
+ * path and again under iotlb_lock.
+ *
+ * Returns 0 on success (or when already installed) or the error from
+ * vduse_iotlb_add_range().
+ */
+static int vduse_domain_init_bounce_map(struct vduse_iova_domain *domain)
+{
+	int ret = 0;
+
+	if (domain->bounce_map)
+		return 0;
+
+	spin_lock(&domain->iotlb_lock);
+	if (!domain->bounce_map) {
+		ret = vduse_iotlb_add_range(domain, 0, domain->bounce_size - 1,
+					    0, VHOST_MAP_RW, domain->file, 0);
+		if (!ret)
+			domain->bounce_map = 1;
+	}
+	spin_unlock(&domain->iotlb_lock);
+
+	return ret;
+}
+
+/*
+ * Allocate an IOVA range of at least @size bytes below @limit from @iovad.
+ * Returns the start address, i.e. the allocated pfn shifted back by the
+ * domain's granule shift; callers in this file treat 0 as failure.
+ */
+static dma_addr_t
+vduse_domain_alloc_iova(struct iova_domain *iovad,
+			unsigned long size, unsigned long limit)
+{
+	unsigned long shift = iova_shift(iovad);
+	unsigned long nr_granules = iova_align(iovad, size) >> shift;
+	unsigned long pfn;
+
+	pfn = alloc_iova_fast(iovad, nr_granules, limit >> shift, true);
+
+	return (dma_addr_t)pfn << shift;
+}
+
+/* Give the IOVA range [@iova, @iova + @size) back to @iovad. */
+static void vduse_domain_free_iova(struct iova_domain *iovad,
+				   dma_addr_t iova, size_t size)
+{
+	unsigned long shift = iova_shift(iovad);
+	unsigned long nr_granules = iova_align(iovad, size) >> shift;
+
+	free_iova_fast(iovad, iova >> shift, nr_granules);
+}
+
+/*
+ * Map @size bytes at @offset within @page for streaming DMA through the
+ * bounce area.
+ *
+ * An IOVA is allocated inside the bounce area, the bounce IOTLB entry is
+ * installed on first use, and the covered bounce slots are bound to the
+ * original physical memory. For DMA_TO_DEVICE and DMA_BIDIRECTIONAL the
+ * data is copied into the bounce pages right away.
+ *
+ * Returns the IOVA, or DMA_MAPPING_ERROR with the IOVA released again.
+ */
+dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain,
+				 struct page *page, unsigned long offset,
+				 size_t size, enum dma_data_direction dir,
+				 unsigned long attrs)
+{
+	struct iova_domain *iovad = &domain->stream_iovad;
+	unsigned long limit = domain->bounce_size - 1;
+	phys_addr_t pa = page_to_phys(page) + offset;
+	dma_addr_t iova;
+
+	iova = vduse_domain_alloc_iova(iovad, size, limit);
+	if (!iova)
+		return DMA_MAPPING_ERROR;
+
+	if (vduse_domain_init_bounce_map(domain))
+		goto release_iova;
+
+	read_lock(&domain->bounce_lock);
+	if (vduse_domain_map_bounce_page(domain, (u64)iova, (u64)size, pa)) {
+		read_unlock(&domain->bounce_lock);
+		goto release_iova;
+	}
+
+	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+		vduse_domain_bounce(domain, iova, size, DMA_TO_DEVICE);
+	read_unlock(&domain->bounce_lock);
+
+	return iova;
+
+release_iova:
+	vduse_domain_free_iova(iovad, iova, size);
+	return DMA_MAPPING_ERROR;
+}
+
+/*
+ * Undo vduse_domain_map_page(): for DMA_FROM_DEVICE and DMA_BIDIRECTIONAL
+ * copy the data back from the bounce pages, unbind the bounce slots and
+ * release the IOVA range.
+ */
+void vduse_domain_unmap_page(struct vduse_iova_domain *domain,
+			     dma_addr_t dma_addr, size_t size,
+			     enum dma_data_direction dir, unsigned long attrs)
+{
+	struct iova_domain *iovad = &domain->stream_iovad;
+	bool copy_back = dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL;
+
+	read_lock(&domain->bounce_lock);
+	if (copy_back)
+		vduse_domain_bounce(domain, dma_addr, size, DMA_FROM_DEVICE);
+	vduse_domain_unmap_bounce_page(domain, (u64)dma_addr, (u64)size);
+	read_unlock(&domain->bounce_lock);
+
+	vduse_domain_free_iova(iovad, dma_addr, size);
+}
+
+/*
+ * Allocate @size bytes of coherent memory and map it in the domain.
+ *
+ * An IOVA below iova_limit and the backing pages are allocated, and the
+ * range is entered into the IOTLB with read/write permission.
+ *
+ * Returns the kernel address and stores the IOVA in @dma_addr. On failure
+ * NULL is returned, @dma_addr is set to DMA_MAPPING_ERROR and whatever had
+ * been allocated is released.
+ */
+void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain,
+				  size_t size, dma_addr_t *dma_addr,
+				  gfp_t flag, unsigned long attrs)
+{
+	struct iova_domain *iovad = &domain->consistent_iovad;
+	unsigned long limit = domain->iova_limit;
+	dma_addr_t iova = vduse_domain_alloc_iova(iovad, size, limit);
+	void *orig = alloc_pages_exact(size, flag);
+	int ret;
+
+	if (!iova || !orig)
+		goto fail;
+
+	spin_lock(&domain->iotlb_lock);
+	ret = vduse_iotlb_add_range(domain, (u64)iova, (u64)iova + size - 1,
+				    virt_to_phys(orig), VHOST_MAP_RW,
+				    domain->file, (u64)iova);
+	spin_unlock(&domain->iotlb_lock);
+	if (ret)
+		goto fail;
+
+	*dma_addr = iova;
+
+	return orig;
+
+fail:
+	*dma_addr = DMA_MAPPING_ERROR;
+	if (orig)
+		free_pages_exact(orig, size);
+	if (iova)
+		vduse_domain_free_iova(iovad, iova, size);
+
+	return NULL;
+}
+
+/*
+ * Release a coherent allocation identified by @dma_addr and @size.
+ *
+ * The first IOTLB entry overlapping [@dma_addr, @dma_addr + @size - 1] is
+ * looked up under iotlb_lock; if none exists a warning is raised and nothing
+ * is freed. Otherwise the file reference held in the entry's opaque
+ * vdpa_map_file is dropped, the entry is removed, and the IOVA range and the
+ * pages are freed. The pages are located through the physical address stored
+ * in the entry; @vaddr and @attrs are not used here.
+ */
+void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size,
+ void *vaddr, dma_addr_t dma_addr,
+ unsigned long attrs)
+{
+ struct iova_domain *iovad = &domain->consistent_iovad;
+ struct vhost_iotlb_map *map;
+ struct vdpa_map_file *map_file;
+ phys_addr_t pa;
+
+ spin_lock(&domain->iotlb_lock);
+ map = vhost_iotlb_itree_first(domain->iotlb, (u64)dma_addr,
+ (u64)dma_addr + size - 1);
+ if (WARN_ON(!map)) {
+ spin_unlock(&domain->iotlb_lock);
+ return;
+ }
+ map_file = (struct vdpa_map_file *)map->opaque;
+ fput(map_file->file);
+ kfree(map_file);
+ /* Save the physical address before the map entry is freed. */
+ pa = map->addr;
+ vhost_iotlb_map_free(domain->iotlb, map);
+ spin_unlock(&domain->iotlb_lock);
+
+ vduse_domain_free_iova(iovad, dma_addr, size);
+ free_pages_exact(phys_to_virt(pa), size);
+}
+
+/*
+ * Page-fault handler for mappings of the domain file.
+ *
+ * The faulting page offset is turned into an IOVA. IOVAs below bounce_size
+ * are resolved through vduse_domain_get_bounce_page(), all others through
+ * vduse_domain_get_coherent_page(). VM_FAULT_SIGBUS is returned when the VMA
+ * has no domain attached or no page is found.
+ */
+static vm_fault_t vduse_domain_mmap_fault(struct vm_fault *vmf)
+{
+	struct vduse_iova_domain *dom = vmf->vma->vm_private_data;
+	unsigned long addr = vmf->pgoff << PAGE_SHIFT;
+	struct page *pg;
+
+	if (!dom)
+		return VM_FAULT_SIGBUS;
+
+	pg = (addr < dom->bounce_size) ?
+		vduse_domain_get_bounce_page(dom, addr) :
+		vduse_domain_get_coherent_page(dom, addr);
+	if (!pg)
+		return VM_FAULT_SIGBUS;
+
+	vmf->page = pg;
+
+	return 0;
+}
+
+/* VMA operations for mappings of the domain file: only a fault handler. */
+static const struct vm_operations_struct vduse_domain_mmap_ops = {
+ .fault = vduse_domain_mmap_fault,
+};
+
+/*
+ * mmap handler for the domain file: flags the VMA as VM_DONTDUMP and
+ * VM_DONTEXPAND, attaches the domain stored in file->private_data and
+ * installs the fault-driven VMA operations. Always returns 0.
+ */
+static int vduse_domain_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vm_flags_set(vma, VM_DONTDUMP | VM_DONTEXPAND);
+	vma->vm_ops = &vduse_domain_mmap_ops;
+	vma->vm_private_data = file->private_data;
+
+	return 0;
+}
+
+/*
+ * ->release handler of the domain file; frees the whole domain.
+ *
+ * Under iotlb_lock the full IOTLB range is deleted and both the user and the
+ * kernel bounce pages are dropped. Afterwards both IOVA allocators, the
+ * IOTLB, the bounce map array and the domain structure itself are released.
+ * Always returns 0.
+ */
+static int vduse_domain_release(struct inode *inode, struct file *file)
+{
+ struct vduse_iova_domain *domain = file->private_data;
+
+ spin_lock(&domain->iotlb_lock);
+ vduse_iotlb_del_range(domain, 0, ULLONG_MAX);
+ vduse_domain_remove_user_bounce_pages(domain);
+ vduse_domain_free_kernel_bounce_pages(domain);
+ spin_unlock(&domain->iotlb_lock);
+ put_iova_domain(&domain->stream_iovad);
+ put_iova_domain(&domain->consistent_iovad);
+ vhost_iotlb_free(domain->iotlb);
+ vfree(domain->bounce_maps);
+ kfree(domain);
+
+ return 0;
+}
+
+/* File operations of the anonymous domain file: mmap and release only. */
+static const struct file_operations vduse_domain_fops = {
+ .owner = THIS_MODULE,
+ .mmap = vduse_domain_mmap,
+ .release = vduse_domain_release,
+};
+
+/*
+ * Drop the creator's reference on the domain file. The domain memory is
+ * freed by vduse_domain_release(), the file's ->release handler.
+ */
+void vduse_domain_destroy(struct vduse_iova_domain *domain)
+{
+ fput(domain->file);
+}
+
+/*
+ * Allocate and initialise an IOVA domain.
+ *
+ * @iova_limit: upper limit later used for coherent IOVA allocations; must be
+ *              greater than @bounce_size.
+ * @bounce_size: size of the bounce region; stored rounded up with
+ *               PAGE_ALIGN().
+ *
+ * Creates the IOTLB, the per-page bounce map array (every entry starts with
+ * orig_phys = INVALID_PHYS_ADDR), the anonymous "[vduse-domain]" file, the
+ * two locks and both IOVA allocators. The stream allocator starts at
+ * IOVA_START_PFN, the consistent allocator at the first page frame after the
+ * bounce region.
+ *
+ * Returns the new domain, or NULL on any failure.
+ */
+struct vduse_iova_domain *
+vduse_domain_create(unsigned long iova_limit, size_t bounce_size)
+{
+ struct vduse_iova_domain *domain;
+ struct file *file;
+ struct vduse_bounce_map *map;
+ unsigned long pfn, bounce_pfns;
+ int ret;
+
+ bounce_pfns = PAGE_ALIGN(bounce_size) >> PAGE_SHIFT;
+ /* The comparison uses the caller's value, not the page-aligned one. */
+ if (iova_limit <= bounce_size)
+ return NULL;
+
+ domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+ if (!domain)
+ return NULL;
+
+ domain->iotlb = vhost_iotlb_alloc(0, 0);
+ if (!domain->iotlb)
+ goto err_iotlb;
+
+ domain->iova_limit = iova_limit;
+ domain->bounce_size = PAGE_ALIGN(bounce_size);
+ domain->bounce_maps = vzalloc(bounce_pfns *
+ sizeof(struct vduse_bounce_map));
+ if (!domain->bounce_maps)
+ goto err_map;
+
+ for (pfn = 0; pfn < bounce_pfns; pfn++) {
+ map = &domain->bounce_maps[pfn];
+ map->orig_phys = INVALID_PHYS_ADDR;
+ }
+ file = anon_inode_getfile("[vduse-domain]", &vduse_domain_fops,
+ domain, O_RDWR);
+ if (IS_ERR(file))
+ goto err_file;
+
+ domain->file = file;
+ rwlock_init(&domain->bounce_lock);
+ spin_lock_init(&domain->iotlb_lock);
+ init_iova_domain(&domain->stream_iovad,
+ PAGE_SIZE, IOVA_START_PFN);
+ ret = iova_domain_init_rcaches(&domain->stream_iovad);
+ if (ret)
+ goto err_iovad_stream;
+ init_iova_domain(&domain->consistent_iovad,
+ PAGE_SIZE, bounce_pfns);
+ ret = iova_domain_init_rcaches(&domain->consistent_iovad);
+ if (ret)
+ goto err_iovad_consistent;
+
+ return domain;
+err_iovad_consistent:
+ put_iova_domain(&domain->stream_iovad);
+err_iovad_stream:
+ /*
+ * NOTE(review): this fput() appears to drop the only reference to a file
+ * whose ->release is vduse_domain_release(), which frees bounce_maps,
+ * iotlb and domain itself. The labels below free the same objects again;
+ * confirm this path cannot double free / use after free.
+ */
+ fput(file);
+err_file:
+ vfree(domain->bounce_maps);
+err_map:
+ vhost_iotlb_free(domain->iotlb);
+err_iotlb:
+ kfree(domain);
+ return NULL;
+}
+
+/* Module-level setup: returns the result of iova_cache_get(). */
+int vduse_domain_init(void)
+{
+ return iova_cache_get();
+}
+
+/* Module-level teardown: calls iova_cache_put(), pairing with vduse_domain_init(). */
+void vduse_domain_exit(void)
+{
+ iova_cache_put();
+}
diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h
new file mode 100644
index 0000000000..173e979b84
--- /dev/null
+++ b/drivers/vdpa/vdpa_user/iova_domain.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * MMU-based software IOTLB.
+ *
+ * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xie Yongji <xieyongji@bytedance.com>
+ *
+ */
+
+#ifndef _VDUSE_IOVA_DOMAIN_H
+#define _VDUSE_IOVA_DOMAIN_H
+
+#include <linux/iova.h>
+#include <linux/dma-mapping.h>
+#include <linux/vhost_iotlb.h>
+
+#define IOVA_START_PFN 1
+
+#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
+
+/*
+ * Per-page state of the bounce region.
+ * NOTE(review): field meanings below are inferred from names and from the
+ * INVALID_PHYS_ADDR initialisation done at domain creation - confirm.
+ */
+struct vduse_bounce_map {
+ struct page *bounce_page; /* presumably the page backing this bounce slot */
+ u64 orig_phys; /* INVALID_PHYS_ADDR until bound to an original buffer */
+};
+
+/*
+ * IOVA domain shared between the kernel DMA path and the userspace device.
+ */
+struct vduse_iova_domain {
+ struct iova_domain stream_iovad; /* allocator for streaming (bounce) IOVAs */
+ struct iova_domain consistent_iovad; /* allocator for coherent IOVAs */
+ struct vduse_bounce_map *bounce_maps; /* one entry per bounce page frame */
+ size_t bounce_size; /* page-aligned size of the bounce region */
+ unsigned long iova_limit; /* upper limit for coherent IOVA allocation */
+ int bounce_map; /* tested as a boolean by callers; presumably "bounce map initialised" - confirm */
+ struct vhost_iotlb *iotlb;
+ spinlock_t iotlb_lock; /* held around IOTLB updates */
+ struct file *file; /* anonymous file exposing the domain via mmap */
+ bool user_bounce_pages; /* presumably set while user pages back the bounce region - confirm */
+ rwlock_t bounce_lock; /* read-held around bounce map/unmap/copy */
+};
+
+int vduse_domain_set_map(struct vduse_iova_domain *domain,
+ struct vhost_iotlb *iotlb);
+
+void vduse_domain_clear_map(struct vduse_iova_domain *domain,
+ struct vhost_iotlb *iotlb);
+
+dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain,
+ struct page *page, unsigned long offset,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs);
+
+void vduse_domain_unmap_page(struct vduse_iova_domain *domain,
+ dma_addr_t dma_addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs);
+
+void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain,
+ size_t size, dma_addr_t *dma_addr,
+ gfp_t flag, unsigned long attrs);
+
+void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size,
+ void *vaddr, dma_addr_t dma_addr,
+ unsigned long attrs);
+
+void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain);
+
+int vduse_domain_add_user_bounce_pages(struct vduse_iova_domain *domain,
+ struct page **pages, int count);
+
+void vduse_domain_remove_user_bounce_pages(struct vduse_iova_domain *domain);
+
+void vduse_domain_destroy(struct vduse_iova_domain *domain);
+
+struct vduse_iova_domain *vduse_domain_create(unsigned long iova_limit,
+ size_t bounce_size);
+
+int vduse_domain_init(void);
+
+void vduse_domain_exit(void);
+
+#endif /* _VDUSE_IOVA_DOMAIN_H */
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
new file mode 100644
index 0000000000..df7869537e
--- /dev/null
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -0,0 +1,2171 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VDUSE: vDPA Device in Userspace
+ *
+ * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xie Yongji <xieyongji@bytedance.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/eventfd.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/dma-map-ops.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/vdpa.h>
+#include <linux/nospec.h>
+#include <linux/vmalloc.h>
+#include <linux/sched/mm.h>
+#include <uapi/linux/vduse.h>
+#include <uapi/linux/vdpa.h>
+#include <uapi/linux/virtio_config.h>
+#include <uapi/linux/virtio_ids.h>
+#include <uapi/linux/virtio_blk.h>
+#include <linux/mod_devicetable.h>
+
+#include "iova_domain.h"
+
+#define DRV_AUTHOR "Yongji Xie <xieyongji@bytedance.com>"
+#define DRV_DESC "vDPA Device in Userspace"
+#define DRV_LICENSE "GPL v2"
+
+#define VDUSE_DEV_MAX (1U << MINORBITS)
+#define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
+#define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
+#define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
+/* 128 MB reserved for virtqueue creation */
+#define VDUSE_IOVA_SIZE (VDUSE_MAX_BOUNCE_SIZE + 128 * 1024 * 1024)
+#define VDUSE_MSG_DEFAULT_TIMEOUT 30
+
+#define IRQ_UNBOUND -1
+
+/* Kernel-side state of one virtqueue of a VDUSE device. */
+struct vduse_virtqueue {
+ u16 index;
+ u16 num_max; /* zero means not configured yet (see vduse_dev_is_ready()) */
+ u32 num;
+ u64 desc_addr;
+ u64 driver_addr;
+ u64 device_addr;
+ struct vdpa_vq_state state;
+ bool ready;
+ bool kicked; /* a kick arrived while no kickfd was installed */
+ spinlock_t kick_lock; /* protects kicked and kickfd */
+ spinlock_t irq_lock; /* protects cb */
+ struct eventfd_ctx *kickfd;
+ struct vdpa_callback cb;
+ struct work_struct inject; /* runs vduse_vq_irq_inject() */
+ struct work_struct kick; /* runs vduse_vq_kick_work() */
+ int irq_effective_cpu; /* CPU used for bound irq work, or IRQ_UNBOUND */
+ struct cpumask irq_affinity;
+ struct kobject kobj; /* NOTE(review): usage is outside this part of the file */
+};
+
+struct vduse_dev;
+
+/* Wrapper tying a vdpa_device to its owning vduse_dev (see vdpa_to_vduse()). */
+struct vduse_vdpa {
+ struct vdpa_device vdpa;
+ struct vduse_dev *dev;
+};
+
+/* Bookkeeping for userspace memory registered as bounce pages. */
+struct vduse_umem {
+ unsigned long iova; /* IOVA the registration was made for */
+ unsigned long npages; /* number of pinned pages in @pages */
+ struct page **pages; /* vmalloc'ed array of pinned pages */
+ struct mm_struct *mm; /* mm charged in pinned_vm; held with mmgrab() */
+};
+
+/* State of one VDUSE device. */
+struct vduse_dev {
+ struct vduse_vdpa *vdev; /* cleared by vduse_vdpa_free() */
+ struct device *dev;
+ struct vduse_virtqueue **vqs; /* array of vq_num virtqueue pointers */
+ struct vduse_iova_domain *domain;
+ char *name;
+ struct mutex lock;
+ spinlock_t msg_lock; /* protects send_list, recv_list and msg_unique */
+ u64 msg_unique; /* source of request ids */
+ u32 msg_timeout; /* in seconds; 0 waits without timeout */
+ wait_queue_head_t waitq; /* woken when a request is queued or the device breaks */
+ struct list_head send_list; /* requests not yet read by userspace */
+ struct list_head recv_list; /* requests read by userspace, awaiting a response */
+ struct vdpa_callback config_cb;
+ struct work_struct inject; /* runs vduse_dev_irq_inject() */
+ spinlock_t irq_lock; /* protects config_cb */
+ struct rw_semaphore rwsem; /* write-held during reset, read-held when queueing irq work */
+ int minor;
+ bool broken; /* set once a request timed out */
+ bool connected;
+ u64 api_version;
+ u64 device_features;
+ u64 driver_features;
+ u32 device_id;
+ u32 vendor_id;
+ u32 generation; /* incremented on every reset */
+ u32 config_size;
+ void *config;
+ u8 status;
+ u32 vq_num;
+ u32 vq_align;
+ struct vduse_umem *umem;
+ struct mutex mem_lock; /* protects umem */
+ unsigned int bounce_size;
+ struct mutex domain_lock;
+};
+
+/* One request/response exchange between the kernel and the userspace device. */
+struct vduse_dev_msg {
+ struct vduse_dev_request req; /* copied to userspace by read */
+ struct vduse_dev_response resp; /* filled from userspace by write */
+ struct list_head list; /* link in send_list or recv_list */
+ wait_queue_head_t waitq; /* the sender sleeps here until completed */
+ bool completed;
+};
+
+/* Holds a single API version value; its users are outside this part of the file. */
+struct vduse_control {
+ u64 api_version;
+};
+
+static DEFINE_MUTEX(vduse_lock);
+static DEFINE_IDR(vduse_idr);
+
+static dev_t vduse_major;
+static struct class *vduse_class;
+static struct cdev vduse_ctrl_cdev;
+static struct cdev vduse_cdev;
+static struct workqueue_struct *vduse_irq_wq;
+static struct workqueue_struct *vduse_irq_bound_wq;
+
+static u32 allowed_device_id[] = {
+ VIRTIO_ID_BLOCK,
+};
+
+/* Map a vdpa_device embedded in a vduse_vdpa to the owning vduse_dev. */
+static inline struct vduse_dev *vdpa_to_vduse(struct vdpa_device *vdpa)
+{
+	return container_of(vdpa, struct vduse_vdpa, vdpa)->dev;
+}
+
+/* Map a generic struct device to the owning vduse_dev via its vdpa_device. */
+static inline struct vduse_dev *dev_to_vduse(struct device *dev)
+{
+	return vdpa_to_vduse(dev_to_vdpa(dev));
+}
+
+/*
+ * Search @head for the message whose request id equals @request_id.
+ * A match is unlinked from the list and returned; NULL means no match.
+ */
+static struct vduse_dev_msg *vduse_find_msg(struct list_head *head,
+					    uint32_t request_id)
+{
+	struct vduse_dev_msg *cur;
+
+	list_for_each_entry(cur, head, list) {
+		if (cur->req.request_id != request_id)
+			continue;
+
+		list_del(&cur->list);
+		return cur;
+	}
+
+	return NULL;
+}
+
+/* Unlink and return the first message on @head, or NULL if the list is empty. */
+static struct vduse_dev_msg *vduse_dequeue_msg(struct list_head *head)
+{
+	struct vduse_dev_msg *first;
+
+	if (list_empty(head))
+		return NULL;
+
+	first = list_first_entry(head, struct vduse_dev_msg, list);
+	list_del(&first->list);
+
+	return first;
+}
+
+/* Append @msg to the tail of @head. */
+static void vduse_enqueue_msg(struct list_head *head,
+ struct vduse_dev_msg *msg)
+{
+ list_add_tail(&msg->list, head);
+}
+
+/*
+ * Mark the device as broken and fail every outstanding request.
+ *
+ * Messages on recv_list are moved onto send_list, then each message is
+ * unlinked, flagged completed with VDUSE_REQ_RESULT_FAILED and its waiter is
+ * woken. Finally dev->broken is set and dev->waitq is woken. Does nothing
+ * if the device is already broken.
+ *
+ * The only caller in this part of the file holds dev->msg_lock.
+ */
+static void vduse_dev_broken(struct vduse_dev *dev)
+{
+ struct vduse_dev_msg *msg, *tmp;
+
+ if (unlikely(dev->broken))
+ return;
+
+ list_splice_init(&dev->recv_list, &dev->send_list);
+ list_for_each_entry_safe(msg, tmp, &dev->send_list, list) {
+ list_del(&msg->list);
+ msg->completed = 1;
+ msg->resp.result = VDUSE_REQ_RESULT_FAILED;
+ wake_up(&msg->waitq);
+ }
+ dev->broken = true;
+ wake_up(&dev->waitq);
+}
+
+/*
+ * Send @msg to the userspace device and wait for its response.
+ *
+ * The message gets a fresh request id, is queued on send_list and readers of
+ * dev->waitq are woken. The caller then sleeps (killable) until the message
+ * is completed; if dev->msg_timeout is non-zero the sleep is limited to that
+ * many seconds.
+ *
+ * Returns 0 if the response result is VDUSE_REQ_RESULT_OK, -EIO otherwise
+ * (including when the device is already broken).
+ */
+static int vduse_dev_msg_sync(struct vduse_dev *dev,
+ struct vduse_dev_msg *msg)
+{
+ int ret;
+
+ /* Unlocked fast-path check; repeated under msg_lock below. */
+ if (unlikely(dev->broken))
+ return -EIO;
+
+ init_waitqueue_head(&msg->waitq);
+ spin_lock(&dev->msg_lock);
+ if (unlikely(dev->broken)) {
+ spin_unlock(&dev->msg_lock);
+ return -EIO;
+ }
+ msg->req.request_id = dev->msg_unique++;
+ vduse_enqueue_msg(&dev->send_list, msg);
+ wake_up(&dev->waitq);
+ spin_unlock(&dev->msg_lock);
+ if (dev->msg_timeout)
+ ret = wait_event_killable_timeout(msg->waitq, msg->completed,
+ (long)dev->msg_timeout * HZ);
+ else
+ ret = wait_event_killable(msg->waitq, msg->completed);
+
+ spin_lock(&dev->msg_lock);
+ if (!msg->completed) {
+ /* Still queued: take it off whichever list it is on and fail it. */
+ list_del(&msg->list);
+ msg->resp.result = VDUSE_REQ_RESULT_FAILED;
+ /* Mark the device as malfunction when there is a timeout */
+ if (!ret)
+ vduse_dev_broken(dev);
+ }
+ ret = (msg->resp.result == VDUSE_REQ_RESULT_OK) ? 0 : -EIO;
+ spin_unlock(&dev->msg_lock);
+
+ return ret;
+}
+
+/*
+ * Ask userspace for the packed-ring state of @vq.
+ *
+ * Sends a VDUSE_GET_VQ_STATE request and, on success, copies the four
+ * packed-ring fields into @packed. Counters are masked to 1 bit and indexes
+ * to 15 bits before being stored.
+ *
+ * Returns 0 on success or the error from vduse_dev_msg_sync().
+ */
+static int vduse_dev_get_vq_state_packed(struct vduse_dev *dev,
+ struct vduse_virtqueue *vq,
+ struct vdpa_vq_state_packed *packed)
+{
+ struct vduse_dev_msg msg = { 0 };
+ int ret;
+
+ msg.req.type = VDUSE_GET_VQ_STATE;
+ msg.req.vq_state.index = vq->index;
+
+ ret = vduse_dev_msg_sync(dev, &msg);
+ if (ret)
+ return ret;
+
+ packed->last_avail_counter =
+ msg.resp.vq_state.packed.last_avail_counter & 0x0001;
+ packed->last_avail_idx =
+ msg.resp.vq_state.packed.last_avail_idx & 0x7FFF;
+ packed->last_used_counter =
+ msg.resp.vq_state.packed.last_used_counter & 0x0001;
+ packed->last_used_idx =
+ msg.resp.vq_state.packed.last_used_idx & 0x7FFF;
+
+ return 0;
+}
+
+/*
+ * Ask userspace for the split-ring state of @vq.
+ *
+ * Sends a VDUSE_GET_VQ_STATE request and, on success, stores the reported
+ * avail index in @split. Returns 0 on success or the error from
+ * vduse_dev_msg_sync().
+ */
+static int vduse_dev_get_vq_state_split(struct vduse_dev *dev,
+					struct vduse_virtqueue *vq,
+					struct vdpa_vq_state_split *split)
+{
+	struct vduse_dev_msg msg = { 0 };
+	int err;
+
+	msg.req.type = VDUSE_GET_VQ_STATE;
+	msg.req.vq_state.index = vq->index;
+
+	err = vduse_dev_msg_sync(dev, &msg);
+	if (!err)
+		split->avail_index = msg.resp.vq_state.split.avail_index;
+
+	return err;
+}
+
+/*
+ * Send a VDUSE_SET_STATUS request carrying @status to userspace.
+ * Returns the result of vduse_dev_msg_sync().
+ */
+static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
+{
+ struct vduse_dev_msg msg = { 0 };
+
+ msg.req.type = VDUSE_SET_STATUS;
+ msg.req.s.status = status;
+
+ return vduse_dev_msg_sync(dev, &msg);
+}
+
+/*
+ * Send a VDUSE_UPDATE_IOTLB request for the inclusive range [@start, @last].
+ * Returns -EINVAL for an inverted range, otherwise the result of
+ * vduse_dev_msg_sync().
+ */
+static int vduse_dev_update_iotlb(struct vduse_dev *dev,
+				  u64 start, u64 last)
+{
+	struct vduse_dev_msg msg = { 0 };
+
+	if (start > last)
+		return -EINVAL;
+
+	msg.req.type = VDUSE_UPDATE_IOTLB;
+	msg.req.iova.last = last;
+	msg.req.iova.start = start;
+
+	return vduse_dev_msg_sync(dev, &msg);
+}
+
+/*
+ * read handler: hand the next pending request to userspace.
+ *
+ * The destination must have room for a whole struct vduse_dev_request,
+ * otherwise -EINVAL is returned. If send_list is empty the call returns
+ * -EAGAIN for O_NONBLOCK files and otherwise sleeps (interruptible,
+ * exclusive) on dev->waitq until a request is queued.
+ *
+ * On a successful copy the message moves to recv_list and the request size
+ * is returned. On a short copy it is put back on send_list and -EFAULT is
+ * returned.
+ */
+static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *file = iocb->ki_filp;
+ struct vduse_dev *dev = file->private_data;
+ struct vduse_dev_msg *msg;
+ int size = sizeof(struct vduse_dev_request);
+ ssize_t ret;
+
+ if (iov_iter_count(to) < size)
+ return -EINVAL;
+
+ spin_lock(&dev->msg_lock);
+ while (1) {
+ msg = vduse_dequeue_msg(&dev->send_list);
+ if (msg)
+ break;
+
+ ret = -EAGAIN;
+ if (file->f_flags & O_NONBLOCK)
+ goto unlock;
+
+ /* Drop the lock while sleeping; re-take it before retrying. */
+ spin_unlock(&dev->msg_lock);
+ ret = wait_event_interruptible_exclusive(dev->waitq,
+ !list_empty(&dev->send_list));
+ if (ret)
+ return ret;
+
+ spin_lock(&dev->msg_lock);
+ }
+ /* The copy to userspace is done without holding msg_lock. */
+ spin_unlock(&dev->msg_lock);
+ ret = copy_to_iter(&msg->req, size, to);
+ spin_lock(&dev->msg_lock);
+ if (ret != size) {
+ ret = -EFAULT;
+ vduse_enqueue_msg(&dev->send_list, msg);
+ goto unlock;
+ }
+ vduse_enqueue_msg(&dev->recv_list, msg);
+unlock:
+ spin_unlock(&dev->msg_lock);
+
+ return ret;
+}
+
+/*
+ * Return true when the first @size bytes at @ptr are all zero.
+ * A @size of zero or less yields true.
+ */
+static bool is_mem_zero(const char *ptr, int size)
+{
+	while (size-- > 0) {
+		if (*ptr++)
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * write handler: accept a response from userspace.
+ *
+ * Exactly one struct vduse_dev_response is consumed. The reserved bytes must
+ * be zero. The response is matched by request id against recv_list; the
+ * matching message is completed and its waiter woken.
+ *
+ * Returns sizeof(struct vduse_dev_response) on success, -EINVAL for a short
+ * or malformed response, and -ENOENT when no pending request matches.
+ */
+static ssize_t vduse_dev_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct vduse_dev *dev = file->private_data;
+	struct vduse_dev_response resp;
+	struct vduse_dev_msg *msg;
+	/* Signed, so the negative errno below is stored without wrap-around. */
+	ssize_t ret = sizeof(resp);
+
+	if (copy_from_iter(&resp, sizeof(resp), from) != sizeof(resp))
+		return -EINVAL;
+
+	if (!is_mem_zero((const char *)resp.reserved, sizeof(resp.reserved)))
+		return -EINVAL;
+
+	spin_lock(&dev->msg_lock);
+	msg = vduse_find_msg(&dev->recv_list, resp.request_id);
+	if (!msg) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+
+	memcpy(&msg->resp, &resp, sizeof(resp));
+	msg->completed = 1;
+	wake_up(&msg->waitq);
+unlock:
+	spin_unlock(&dev->msg_lock);
+
+	return ret;
+}
+
+/*
+ * poll handler. Reports EPOLLERR when the device is broken, readability
+ * when send_list is non-empty and writability when recv_list is non-empty.
+ * The three conditions are sampled together under msg_lock.
+ */
+static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
+{
+	struct vduse_dev *dev = file->private_data;
+	bool is_broken, has_request, awaits_response;
+	__poll_t events = 0;
+
+	poll_wait(file, &dev->waitq, wait);
+
+	spin_lock(&dev->msg_lock);
+	is_broken = dev->broken;
+	has_request = !list_empty(&dev->send_list);
+	awaits_response = !list_empty(&dev->recv_list);
+	spin_unlock(&dev->msg_lock);
+
+	if (is_broken)
+		events |= EPOLLERR;
+	if (has_request)
+		events |= EPOLLIN | EPOLLRDNORM;
+	if (awaits_response)
+		events |= EPOLLOUT | EPOLLWRNORM;
+
+	return events;
+}
+
+/*
+ * Return the device and all of its virtqueues to their initial state.
+ *
+ * The bounce map is reset first (if a domain with an active bounce map
+ * exists). Then, with dev->rwsem held for writing, status and driver
+ * features are cleared, the generation counter is bumped, the config
+ * callback is removed and pending config-injection work is flushed. Each
+ * virtqueue has its addresses, size, state, kick eventfd and callbacks
+ * cleared and its pending inject/kick work flushed.
+ */
+static void vduse_dev_reset(struct vduse_dev *dev)
+{
+ int i;
+ struct vduse_iova_domain *domain = dev->domain;
+
+ /* The coherent mappings are handled in vduse_dev_free_coherent() */
+ if (domain && domain->bounce_map)
+ vduse_domain_reset_bounce_map(domain);
+
+ down_write(&dev->rwsem);
+
+ dev->status = 0;
+ dev->driver_features = 0;
+ dev->generation++;
+ spin_lock(&dev->irq_lock);
+ dev->config_cb.callback = NULL;
+ dev->config_cb.private = NULL;
+ spin_unlock(&dev->irq_lock);
+ /* Flush only after the callback is cleared. */
+ flush_work(&dev->inject);
+
+ for (i = 0; i < dev->vq_num; i++) {
+ struct vduse_virtqueue *vq = dev->vqs[i];
+
+ vq->ready = false;
+ vq->desc_addr = 0;
+ vq->driver_addr = 0;
+ vq->device_addr = 0;
+ vq->num = 0;
+ memset(&vq->state, 0, sizeof(vq->state));
+
+ spin_lock(&vq->kick_lock);
+ vq->kicked = false;
+ if (vq->kickfd)
+ eventfd_ctx_put(vq->kickfd);
+ vq->kickfd = NULL;
+ spin_unlock(&vq->kick_lock);
+
+ spin_lock(&vq->irq_lock);
+ vq->cb.callback = NULL;
+ vq->cb.private = NULL;
+ vq->cb.trigger = NULL;
+ spin_unlock(&vq->irq_lock);
+ flush_work(&vq->inject);
+ flush_work(&vq->kick);
+ }
+
+ up_write(&dev->rwsem);
+}
+
+/* Record the three ring addresses for virtqueue @idx. Always returns 0. */
+static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
+				     u64 desc_area, u64 driver_area,
+				     u64 device_area)
+{
+	struct vduse_virtqueue *queue = vdpa_to_vduse(vdpa)->vqs[idx];
+
+	queue->desc_addr = desc_area;
+	queue->driver_addr = driver_area;
+	queue->device_addr = device_area;
+
+	return 0;
+}
+
+/*
+ * Deliver a kick for @vq. Under kick_lock: a queue that is not ready is
+ * ignored; otherwise the kick eventfd is signalled if one is installed, or
+ * the kick is remembered in vq->kicked.
+ */
+static void vduse_vq_kick(struct vduse_virtqueue *vq)
+{
+	spin_lock(&vq->kick_lock);
+	if (vq->ready) {
+		if (vq->kickfd)
+			eventfd_signal(vq->kickfd, 1);
+		else
+			vq->kicked = true;
+	}
+	spin_unlock(&vq->kick_lock);
+}
+
+/* Work handler for vq->kick: kicks the virtqueue that embeds @work. */
+static void vduse_vq_kick_work(struct work_struct *work)
+{
+	vduse_vq_kick(container_of(work, struct vduse_virtqueue, kick));
+}
+
+/*
+ * Kick virtqueue @idx. The kick is delivered directly when
+ * eventfd_signal_allowed() permits it and is deferred to the vq's kick work
+ * item otherwise.
+ */
+static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx)
+{
+	struct vduse_virtqueue *queue = vdpa_to_vduse(vdpa)->vqs[idx];
+
+	if (eventfd_signal_allowed())
+		vduse_vq_kick(queue);
+	else
+		schedule_work(&queue->kick);
+}
+
+/* Install the callback, private data and trigger of @cb on virtqueue @idx under irq_lock. */
+static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
+				 struct vdpa_callback *cb)
+{
+	struct vduse_virtqueue *queue = vdpa_to_vduse(vdpa)->vqs[idx];
+	struct vdpa_callback *slot = &queue->cb;
+
+	spin_lock(&queue->irq_lock);
+	slot->callback = cb->callback;
+	slot->private = cb->private;
+	slot->trigger = cb->trigger;
+	spin_unlock(&queue->irq_lock);
+}
+
+/* Store @num as the size of virtqueue @idx. */
+static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num)
+{
+	vdpa_to_vduse(vdpa)->vqs[idx]->num = num;
+}
+
+/* Set the ready flag of virtqueue @idx. */
+static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa,
+				    u16 idx, bool ready)
+{
+	vdpa_to_vduse(vdpa)->vqs[idx]->ready = ready;
+}
+
+/* Report the ready flag of virtqueue @idx. */
+static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx)
+{
+	return vdpa_to_vduse(vdpa)->vqs[idx]->ready;
+}
+
+/*
+ * Copy @state into the saved state of virtqueue @idx. The packed-ring
+ * fields are copied when VIRTIO_F_RING_PACKED is among the driver features,
+ * the split-ring avail index otherwise. Always returns 0.
+ */
+static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx,
+				   const struct vdpa_vq_state *state)
+{
+	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+	struct vdpa_vq_state *saved = &dev->vqs[idx]->state;
+
+	if (!(dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED))) {
+		saved->split.avail_index = state->split.avail_index;
+		return 0;
+	}
+
+	saved->packed.last_avail_counter = state->packed.last_avail_counter;
+	saved->packed.last_avail_idx = state->packed.last_avail_idx;
+	saved->packed.last_used_counter = state->packed.last_used_counter;
+	saved->packed.last_used_idx = state->packed.last_used_idx;
+
+	return 0;
+}
+
+/*
+ * Fetch the state of virtqueue @idx from userspace, using the packed or the
+ * split variant depending on whether VIRTIO_F_RING_PACKED was negotiated.
+ */
+static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
+				   struct vdpa_vq_state *state)
+{
+	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+	struct vduse_virtqueue *queue = dev->vqs[idx];
+	bool is_packed = dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED);
+
+	return is_packed ?
+		vduse_dev_get_vq_state_packed(dev, queue, &state->packed) :
+		vduse_dev_get_vq_state_split(dev, queue, &state->split);
+}
+
+/* Return the device's virtqueue alignment. */
+static u32 vduse_vdpa_get_vq_align(struct vdpa_device *vdpa)
+{
+	return vdpa_to_vduse(vdpa)->vq_align;
+}
+
+/* Return the feature bits stored in the device. */
+static u64 vduse_vdpa_get_device_features(struct vdpa_device *vdpa)
+{
+	return vdpa_to_vduse(vdpa)->device_features;
+}
+
+/* Store the driver's feature bits. Always returns 0. */
+static int vduse_vdpa_set_driver_features(struct vdpa_device *vdpa, u64 features)
+{
+	vdpa_to_vduse(vdpa)->driver_features = features;
+
+	return 0;
+}
+
+/* Return the feature bits previously stored for the driver. */
+static u64 vduse_vdpa_get_driver_features(struct vdpa_device *vdpa)
+{
+	return vdpa_to_vduse(vdpa)->driver_features;
+}
+
+/* Install the config-change callback and its private data under irq_lock. */
+static void vduse_vdpa_set_config_cb(struct vdpa_device *vdpa,
+				     struct vdpa_callback *cb)
+{
+	struct vduse_dev *vduse = vdpa_to_vduse(vdpa);
+	struct vdpa_callback *slot = &vduse->config_cb;
+
+	spin_lock(&vduse->irq_lock);
+	slot->callback = cb->callback;
+	slot->private = cb->private;
+	spin_unlock(&vduse->irq_lock);
+}
+
+/* Return the largest num_max over all virtqueues, or 0 if there are none. */
+static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
+{
+	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+	u16 largest = 0;
+	int i;
+
+	for (i = 0; i < dev->vq_num; i++) {
+		u16 candidate = dev->vqs[i]->num_max;
+
+		if (candidate > largest)
+			largest = candidate;
+	}
+
+	return largest;
+}
+
+/* Return the device id stored in the device. */
+static u32 vduse_vdpa_get_device_id(struct vdpa_device *vdpa)
+{
+	return vdpa_to_vduse(vdpa)->device_id;
+}
+
+/* Return the vendor id stored in the device. */
+static u32 vduse_vdpa_get_vendor_id(struct vdpa_device *vdpa)
+{
+	return vdpa_to_vduse(vdpa)->vendor_id;
+}
+
+/* Return the cached device status byte. */
+static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa)
+{
+	return vdpa_to_vduse(vdpa)->status;
+}
+
+/*
+ * Forward @status to userspace; the cached status is only updated when the
+ * request succeeds.
+ */
+static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status)
+{
+	struct vduse_dev *vduse = vdpa_to_vduse(vdpa);
+
+	if (!vduse_dev_set_status(vduse, status))
+		vduse->status = status;
+}
+
+/* Return the size of the device configuration space. */
+static size_t vduse_vdpa_get_config_size(struct vdpa_device *vdpa)
+{
+	return vdpa_to_vduse(vdpa)->config_size;
+}
+
+/*
+ * Copy up to @len bytes of configuration space starting at @offset into
+ * @buf. @buf is zero-filled first, so bytes beyond the end of the
+ * configuration space read as zero. An @offset past the end copies nothing.
+ */
+static void vduse_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset,
+				  void *buf, unsigned int len)
+{
+	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+	unsigned int avail;
+
+	/* Initialize the buffer in case of partial copy. */
+	memset(buf, 0, len);
+
+	if (offset > dev->config_size)
+		return;
+
+	avail = dev->config_size - offset;
+	if (len > avail)
+		len = avail;
+
+	memcpy(buf, dev->config + offset, len);
+}
+
+/* Writes to the configuration space are silently ignored. */
+static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset,
+ const void *buf, unsigned int len)
+{
+ /* Now we only support read-only configuration space */
+}
+
+/*
+ * Reset the device: status 0 is sent to userspace, then the kernel-side
+ * state is reset regardless of the outcome. Returns the result of the
+ * status request.
+ */
+static int vduse_vdpa_reset(struct vdpa_device *vdpa)
+{
+	struct vduse_dev *vduse = vdpa_to_vduse(vdpa);
+	int status_ret;
+
+	status_ret = vduse_dev_set_status(vduse, 0);
+	vduse_dev_reset(vduse);
+
+	return status_ret;
+}
+
+/* Return the generation counter of the device. */
+static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
+{
+	return vdpa_to_vduse(vdpa)->generation;
+}
+
+/*
+ * Set the irq affinity mask of virtqueue @idx. A NULL @cpu_mask selects all
+ * CPUs. Always returns 0.
+ */
+static int vduse_vdpa_set_vq_affinity(struct vdpa_device *vdpa, u16 idx,
+				      const struct cpumask *cpu_mask)
+{
+	struct cpumask *affinity = &vdpa_to_vduse(vdpa)->vqs[idx]->irq_affinity;
+
+	if (!cpu_mask)
+		cpumask_setall(affinity);
+	else
+		cpumask_copy(affinity, cpu_mask);
+
+	return 0;
+}
+
+/* Return a pointer to the irq affinity mask of virtqueue @idx. */
+static const struct cpumask *
+vduse_vdpa_get_vq_affinity(struct vdpa_device *vdpa, u16 idx)
+{
+	return &vdpa_to_vduse(vdpa)->vqs[idx]->irq_affinity;
+}
+
+/*
+ * Install @iotlb as the domain's mapping and tell userspace to refresh the
+ * whole IOVA range. If the notification fails the mapping is cleared again
+ * and the error is returned. @asid is not used here.
+ */
+static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
+			      unsigned int asid,
+			      struct vhost_iotlb *iotlb)
+{
+	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+	int err;
+
+	err = vduse_domain_set_map(dev->domain, iotlb);
+	if (err)
+		return err;
+
+	err = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
+	if (err)
+		vduse_domain_clear_map(dev->domain, iotlb);
+
+	return err;
+}
+
+/* vdpa ->free hook: detach the vdpa wrapper from its vduse_dev. */
+static void vduse_vdpa_free(struct vdpa_device *vdpa)
+{
+	vdpa_to_vduse(vdpa)->vdev = NULL;
+}
+
+/* vDPA configuration operations implemented by the VDUSE device. */
+static const struct vdpa_config_ops vduse_vdpa_config_ops = {
+ .set_vq_address = vduse_vdpa_set_vq_address,
+ .kick_vq = vduse_vdpa_kick_vq,
+ .set_vq_cb = vduse_vdpa_set_vq_cb,
+ .set_vq_num = vduse_vdpa_set_vq_num,
+ .set_vq_ready = vduse_vdpa_set_vq_ready,
+ .get_vq_ready = vduse_vdpa_get_vq_ready,
+ .set_vq_state = vduse_vdpa_set_vq_state,
+ .get_vq_state = vduse_vdpa_get_vq_state,
+ .get_vq_align = vduse_vdpa_get_vq_align,
+ .get_device_features = vduse_vdpa_get_device_features,
+ .set_driver_features = vduse_vdpa_set_driver_features,
+ .get_driver_features = vduse_vdpa_get_driver_features,
+ .set_config_cb = vduse_vdpa_set_config_cb,
+ .get_vq_num_max = vduse_vdpa_get_vq_num_max,
+ .get_device_id = vduse_vdpa_get_device_id,
+ .get_vendor_id = vduse_vdpa_get_vendor_id,
+ .get_status = vduse_vdpa_get_status,
+ .set_status = vduse_vdpa_set_status,
+ .get_config_size = vduse_vdpa_get_config_size,
+ .get_config = vduse_vdpa_get_config,
+ .set_config = vduse_vdpa_set_config,
+ .get_generation = vduse_vdpa_get_generation,
+ .set_vq_affinity = vduse_vdpa_set_vq_affinity,
+ .get_vq_affinity = vduse_vdpa_get_vq_affinity,
+ .reset = vduse_vdpa_reset,
+ .set_map = vduse_vdpa_set_map,
+ .free = vduse_vdpa_free,
+};
+
+/* dma_map_ops ->map_page: forwards to the IOVA domain of the owning device. */
+static dma_addr_t vduse_dev_map_page(struct device *dev, struct page *page,
+				     unsigned long offset, size_t size,
+				     enum dma_data_direction dir,
+				     unsigned long attrs)
+{
+	struct vduse_iova_domain *iova_dom = dev_to_vduse(dev)->domain;
+
+	return vduse_domain_map_page(iova_dom, page, offset, size, dir, attrs);
+}
+
+/*
+ * dma_map_ops ->unmap_page: forwards to the IOVA domain of the owning
+ * device.
+ */
+static void vduse_dev_unmap_page(struct device *dev, dma_addr_t dma_addr,
+				 size_t size, enum dma_data_direction dir,
+				 unsigned long attrs)
+{
+	struct vduse_dev *vdev = dev_to_vduse(dev);
+	struct vduse_iova_domain *domain = vdev->domain;
+
+	/* Plain call: a void function must not "return" an expression. */
+	vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
+}
+
+/*
+ * dma_map_ops ->alloc: allocate coherent memory from the IOVA domain of the
+ * owning device.
+ *
+ * Returns the kernel virtual address and stores the IOVA in *@dma_addr, or
+ * returns NULL with *@dma_addr set to DMA_MAPPING_ERROR on failure.
+ */
+static void *vduse_dev_alloc_coherent(struct device *dev, size_t size,
+				      dma_addr_t *dma_addr, gfp_t flag,
+				      unsigned long attrs)
+{
+	struct vduse_dev *vdev = dev_to_vduse(dev);
+	struct vduse_iova_domain *domain = vdev->domain;
+	/*
+	 * Use the callee's own type for the out-parameter. Passing a casted
+	 * pointer to an unsigned long would let the callee write beyond the
+	 * local whenever dma_addr_t is wider than unsigned long.
+	 */
+	dma_addr_t iova;
+	void *addr;
+
+	*dma_addr = DMA_MAPPING_ERROR;
+	addr = vduse_domain_alloc_coherent(domain, size, &iova, flag, attrs);
+	if (!addr)
+		return NULL;
+
+	*dma_addr = iova;
+
+	return addr;
+}
+
+/* dma_map_ops ->free: forwards to the IOVA domain of the owning device. */
+static void vduse_dev_free_coherent(struct device *dev, size_t size,
+				    void *vaddr, dma_addr_t dma_addr,
+				    unsigned long attrs)
+{
+	struct vduse_iova_domain *iova_dom = dev_to_vduse(dev)->domain;
+
+	vduse_domain_free_coherent(iova_dom, size, vaddr, dma_addr, attrs);
+}
+
+/* dma_map_ops ->max_mapping_size: the bounce region size of the device's domain. */
+static size_t vduse_dev_max_mapping_size(struct device *dev)
+{
+	return dev_to_vduse(dev)->domain->bounce_size;
+}
+
+/* DMA mapping operations backed by the VDUSE IOVA domain. */
+static const struct dma_map_ops vduse_dev_dma_ops = {
+ .map_page = vduse_dev_map_page,
+ .unmap_page = vduse_dev_unmap_page,
+ .alloc = vduse_dev_alloc_coherent,
+ .free = vduse_dev_free_coherent,
+ .max_mapping_size = vduse_dev_max_mapping_size,
+};
+
+/*
+ * Translate a VDUSE_ACCESS_* permission into the matching O_* access mode.
+ * An unknown permission triggers a warning and yields 0.
+ */
+static unsigned int perm_to_file_flags(u8 perm)
+{
+	unsigned int flags = 0;
+
+	switch (perm) {
+	case VDUSE_ACCESS_WO:
+		flags |= O_WRONLY;
+		break;
+	case VDUSE_ACCESS_RO:
+		flags |= O_RDONLY;
+		break;
+	case VDUSE_ACCESS_RW:
+		flags |= O_RDWR;
+		break;
+	default:
+		/* The message used to read "invalidate", a typo for "invalid". */
+		WARN(1, "invalid vhost IOTLB permission\n");
+		break;
+	}
+
+	return flags;
+}
+
+/*
+ * Install, replace or remove the kick eventfd of a virtqueue.
+ *
+ * A non-negative eventfd->fd installs that eventfd; VDUSE_EVENTFD_DEASSIGN
+ * removes the current one; any other negative value is accepted and ignored.
+ * If a kick was recorded while no eventfd was installed and the queue is
+ * ready, the new eventfd is signalled immediately.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range queue index, or the
+ * error from eventfd_ctx_fdget().
+ */
+static int vduse_kickfd_setup(struct vduse_dev *dev,
+ struct vduse_vq_eventfd *eventfd)
+{
+ struct eventfd_ctx *ctx = NULL;
+ struct vduse_virtqueue *vq;
+ u32 index;
+
+ if (eventfd->index >= dev->vq_num)
+ return -EINVAL;
+
+ /* Clamp the index under speculation before using it as an array index. */
+ index = array_index_nospec(eventfd->index, dev->vq_num);
+ vq = dev->vqs[index];
+ if (eventfd->fd >= 0) {
+ ctx = eventfd_ctx_fdget(eventfd->fd);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ } else if (eventfd->fd != VDUSE_EVENTFD_DEASSIGN)
+ return 0;
+
+ spin_lock(&vq->kick_lock);
+ /* Drop the previous eventfd, if any; ctx is NULL when deassigning. */
+ if (vq->kickfd)
+ eventfd_ctx_put(vq->kickfd);
+ vq->kickfd = ctx;
+ if (vq->ready && vq->kicked && vq->kickfd) {
+ eventfd_signal(vq->kickfd, 1);
+ vq->kicked = false;
+ }
+ spin_unlock(&vq->kick_lock);
+
+ return 0;
+}
+
+/* Return true only when every virtqueue has a non-zero num_max. */
+static bool vduse_dev_is_ready(struct vduse_dev *dev)
+{
+	int i;
+
+	for (i = 0; i < dev->vq_num; i++) {
+		if (dev->vqs[i]->num_max == 0)
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Work handler for dev->inject: invokes the config-change callback, if one
+ * is installed, while holding irq_lock with bottom halves disabled.
+ */
+static void vduse_dev_irq_inject(struct work_struct *work)
+{
+ struct vduse_dev *dev = container_of(work, struct vduse_dev, inject);
+
+ spin_lock_bh(&dev->irq_lock);
+ if (dev->config_cb.callback)
+ dev->config_cb.callback(dev->config_cb.private);
+ spin_unlock_bh(&dev->irq_lock);
+}
+
+/*
+ * Work handler for vq->inject: invokes the virtqueue callback when the
+ * queue is ready and a callback is installed, while holding irq_lock with
+ * bottom halves disabled.
+ */
+static void vduse_vq_irq_inject(struct work_struct *work)
+{
+ struct vduse_virtqueue *vq = container_of(work,
+ struct vduse_virtqueue, inject);
+
+ spin_lock_bh(&vq->irq_lock);
+ if (vq->ready && vq->cb.callback)
+ vq->cb.callback(vq->cb.private);
+ spin_unlock_bh(&vq->irq_lock);
+}
+
+/*
+ * Signal the virtqueue's trigger eventfd directly, if it has one.
+ * Returns true when the eventfd was signalled, false otherwise.
+ */
+static bool vduse_vq_signal_irqfd(struct vduse_virtqueue *vq)
+{
+ bool signal = false;
+
+ /* Unlocked early-out; the trigger is checked again under irq_lock. */
+ if (!vq->cb.trigger)
+ return false;
+
+ spin_lock_irq(&vq->irq_lock);
+ if (vq->ready && vq->cb.trigger) {
+ eventfd_signal(vq->cb.trigger, 1);
+ signal = true;
+ }
+ spin_unlock_irq(&vq->irq_lock);
+
+ return signal;
+}
+
+/*
+ * Queue @irq_work for interrupt injection.
+ *
+ * With dev->rwsem held for reading, the work is queued only if the device
+ * status has VIRTIO_CONFIG_S_DRIVER_OK set: on the unbound workqueue when
+ * @irq_effective_cpu is IRQ_UNBOUND, on the given CPU otherwise.
+ *
+ * Returns 0 when the work was handed to a workqueue, -EINVAL otherwise.
+ */
+static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
+				    struct work_struct *irq_work,
+				    int irq_effective_cpu)
+{
+	int ret = -EINVAL;
+
+	down_read(&dev->rwsem);
+	if (dev->status & VIRTIO_CONFIG_S_DRIVER_OK) {
+		ret = 0;
+		if (irq_effective_cpu != IRQ_UNBOUND)
+			queue_work_on(irq_effective_cpu,
+				      vduse_irq_bound_wq, irq_work);
+		else
+			queue_work(vduse_irq_wq, irq_work);
+	}
+	up_read(&dev->rwsem);
+
+	return ret;
+}
+
+/*
+ * Unregister the userspace memory previously registered as bounce pages.
+ *
+ * @iova must equal the registered IOVA and @size must equal the domain's
+ * bounce size. The user bounce pages are removed from the domain, the pages
+ * are unpinned (marked dirty), the pinned_vm accounting of the registering
+ * mm is reduced and the mm reference is dropped.
+ *
+ * Returns 0 on success, -ENOENT if nothing is registered, -EINVAL if there
+ * is no domain or the arguments do not match.
+ */
+static int vduse_dev_dereg_umem(struct vduse_dev *dev,
+ u64 iova, u64 size)
+{
+ int ret;
+
+ mutex_lock(&dev->mem_lock);
+ ret = -ENOENT;
+ if (!dev->umem)
+ goto unlock;
+
+ ret = -EINVAL;
+ if (!dev->domain)
+ goto unlock;
+
+ if (dev->umem->iova != iova || size != dev->domain->bounce_size)
+ goto unlock;
+
+ vduse_domain_remove_user_bounce_pages(dev->domain);
+ unpin_user_pages_dirty_lock(dev->umem->pages,
+ dev->umem->npages, true);
+ atomic64_sub(dev->umem->npages, &dev->umem->mm->pinned_vm);
+ mmdrop(dev->umem->mm);
+ vfree(dev->umem->pages);
+ kfree(dev->umem);
+ dev->umem = NULL;
+ ret = 0;
+unlock:
+ mutex_unlock(&dev->mem_lock);
+ return ret;
+}
+
+/*
+ * Register userspace memory to back the bounce region.
+ *
+ * Requirements checked up front (-EINVAL otherwise): a domain with an
+ * active bounce map exists, @size equals the domain's bounce size, @iova is
+ * 0 and @uaddr is page aligned.
+ *
+ * The pages are pinned long-term for writing, handed to the domain as user
+ * bounce pages and accounted in the caller's mm->pinned_vm. A reference on
+ * the mm is kept for later unaccounting.
+ *
+ * Returns 0 on success, -EEXIST if memory is already registered, -ENOMEM on
+ * allocation failure, when the RLIMIT_MEMLOCK limit would be exceeded or
+ * when fewer pages than requested could be pinned, or the error from
+ * pin_user_pages() / vduse_domain_add_user_bounce_pages().
+ */
+static int vduse_dev_reg_umem(struct vduse_dev *dev,
+ u64 iova, u64 uaddr, u64 size)
+{
+ struct page **page_list = NULL;
+ struct vduse_umem *umem = NULL;
+ long pinned = 0;
+ unsigned long npages, lock_limit;
+ int ret;
+
+ if (!dev->domain || !dev->domain->bounce_map ||
+ size != dev->domain->bounce_size ||
+ iova != 0 || uaddr & ~PAGE_MASK)
+ return -EINVAL;
+
+ mutex_lock(&dev->mem_lock);
+ ret = -EEXIST;
+ if (dev->umem)
+ goto unlock;
+
+ /* ret stays -ENOMEM for the allocation and lock-limit failures below. */
+ ret = -ENOMEM;
+ npages = size >> PAGE_SHIFT;
+ page_list = __vmalloc(array_size(npages, sizeof(struct page *)),
+ GFP_KERNEL_ACCOUNT);
+ umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+ if (!page_list || !umem)
+ goto unlock;
+
+ mmap_read_lock(current->mm);
+
+ lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
+ if (npages + atomic64_read(&current->mm->pinned_vm) > lock_limit)
+ goto out;
+
+ pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE,
+ page_list);
+ if (pinned != npages) {
+ ret = pinned < 0 ? pinned : -ENOMEM;
+ goto out;
+ }
+
+ ret = vduse_domain_add_user_bounce_pages(dev->domain,
+ page_list, pinned);
+ if (ret)
+ goto out;
+
+ atomic64_add(npages, &current->mm->pinned_vm);
+
+ umem->pages = page_list;
+ umem->npages = pinned;
+ umem->iova = iova;
+ umem->mm = current->mm;
+ mmgrab(current->mm);
+
+ dev->umem = umem;
+out:
+ /* On failure, undo a full or partial pin. */
+ if (ret && pinned > 0)
+ unpin_user_pages(page_list, pinned);
+
+ mmap_read_unlock(current->mm);
+unlock:
+ if (ret) {
+ vfree(page_list);
+ kfree(umem);
+ }
+ mutex_unlock(&dev->mem_lock);
+ return ret;
+}
+
+/*
+ * Advance the virtqueue's effective interrupt CPU to the next online CPU
+ * in its affinity mask, wrapping around at the end of the mask.
+ *
+ * The range check is done before cpu_online() so that it is never queried
+ * with an out-of-range CPU id. If a complete pass over the mask finds no
+ * online CPU (e.g. every CPU in the mask was hot-unplugged after the
+ * affinity was stored), the effective CPU falls back to IRQ_UNBOUND
+ * instead of looping forever.
+ */
+static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)
+{
+	int curr_cpu = vq->irq_effective_cpu;
+	bool wrapped = false;
+
+	while (true) {
+		curr_cpu = cpumask_next(curr_cpu, &vq->irq_affinity);
+		if (curr_cpu >= nr_cpu_ids) {
+			/* End of mask: restart the scan from the beginning. */
+			curr_cpu = IRQ_UNBOUND;
+			if (wrapped)
+				break;
+			wrapped = true;
+			continue;
+		}
+
+		if (cpu_online(curr_cpu))
+			break;
+	}
+
+	vq->irq_effective_cpu = curr_cpu;
+}
+
+/*
+ * ioctl handler for a per-device /dev/vduse/$DEVICE file.
+ *
+ * Every command fails with -EPERM once the device is marked broken.
+ * Unknown commands return -ENOIOCTLCMD. Each case follows the same
+ * pattern: preset ret to the error of the next step, break on failure.
+ */
+static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct vduse_dev *dev = file->private_data;
+ void __user *argp = (void __user *)arg;
+ int ret;
+
+ if (unlikely(dev->broken))
+ return -EPERM;
+
+ switch (cmd) {
+ /* Look up the first IOTLB mapping overlapping [start, last] and hand
+ * back an fd for its backing file together with the mapping range.
+ */
+ case VDUSE_IOTLB_GET_FD: {
+ struct vduse_iotlb_entry entry;
+ struct vhost_iotlb_map *map;
+ struct vdpa_map_file *map_file;
+ struct file *f = NULL;
+
+ ret = -EFAULT;
+ if (copy_from_user(&entry, argp, sizeof(entry)))
+ break;
+
+ ret = -EINVAL;
+ if (entry.start > entry.last)
+ break;
+
+ mutex_lock(&dev->domain_lock);
+ if (!dev->domain) {
+ mutex_unlock(&dev->domain_lock);
+ break;
+ }
+ spin_lock(&dev->domain->iotlb_lock);
+ map = vhost_iotlb_itree_first(dev->domain->iotlb,
+ entry.start, entry.last);
+ if (map) {
+ map_file = (struct vdpa_map_file *)map->opaque;
+ f = get_file(map_file->file);
+ entry.offset = map_file->offset;
+ entry.start = map->start;
+ entry.last = map->last;
+ entry.perm = map->perm;
+ }
+ spin_unlock(&dev->domain->iotlb_lock);
+ mutex_unlock(&dev->domain_lock);
+ ret = -EINVAL;
+ if (!f)
+ break;
+
+ ret = -EFAULT;
+ if (copy_to_user(argp, &entry, sizeof(entry))) {
+ fput(f);
+ break;
+ }
+ /* The file reference taken above is dropped on both paths. */
+ ret = receive_fd(f, perm_to_file_flags(entry.perm));
+ fput(f);
+ break;
+ }
+ case VDUSE_DEV_GET_FEATURES:
+ /*
+ * Just mirror what driver wrote here.
+ * The driver is expected to check FEATURE_OK later.
+ */
+ ret = put_user(dev->driver_features, (u64 __user *)argp);
+ break;
+ /* Overwrite part of the config space; offset/length are range-checked
+ * against config_size before the payload is copied in.
+ */
+ case VDUSE_DEV_SET_CONFIG: {
+ struct vduse_config_data config;
+ unsigned long size = offsetof(struct vduse_config_data,
+ buffer);
+
+ ret = -EFAULT;
+ if (copy_from_user(&config, argp, size))
+ break;
+
+ ret = -EINVAL;
+ if (config.offset > dev->config_size ||
+ config.length == 0 ||
+ config.length > dev->config_size - config.offset)
+ break;
+
+ ret = -EFAULT;
+ if (copy_from_user(dev->config + config.offset, argp + size,
+ config.length))
+ break;
+
+ ret = 0;
+ break;
+ }
+ case VDUSE_DEV_INJECT_CONFIG_IRQ:
+ ret = vduse_dev_queue_irq_work(dev, &dev->inject, IRQ_UNBOUND);
+ break;
+ /* Set the maximum size of one virtqueue. */
+ case VDUSE_VQ_SETUP: {
+ struct vduse_vq_config config;
+ u32 index;
+
+ ret = -EFAULT;
+ if (copy_from_user(&config, argp, sizeof(config)))
+ break;
+
+ ret = -EINVAL;
+ if (config.index >= dev->vq_num)
+ break;
+
+ if (!is_mem_zero((const char *)config.reserved,
+ sizeof(config.reserved)))
+ break;
+
+ index = array_index_nospec(config.index, dev->vq_num);
+ dev->vqs[index]->num_max = config.max_size;
+ ret = 0;
+ break;
+ }
+ /* Report addresses, size, ring state and readiness of one virtqueue. */
+ case VDUSE_VQ_GET_INFO: {
+ struct vduse_vq_info vq_info;
+ struct vduse_virtqueue *vq;
+ u32 index;
+
+ ret = -EFAULT;
+ if (copy_from_user(&vq_info, argp, sizeof(vq_info)))
+ break;
+
+ ret = -EINVAL;
+ if (vq_info.index >= dev->vq_num)
+ break;
+
+ index = array_index_nospec(vq_info.index, dev->vq_num);
+ vq = dev->vqs[index];
+ vq_info.desc_addr = vq->desc_addr;
+ vq_info.driver_addr = vq->driver_addr;
+ vq_info.device_addr = vq->device_addr;
+ vq_info.num = vq->num;
+
+ if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
+ vq_info.packed.last_avail_counter =
+ vq->state.packed.last_avail_counter;
+ vq_info.packed.last_avail_idx =
+ vq->state.packed.last_avail_idx;
+ vq_info.packed.last_used_counter =
+ vq->state.packed.last_used_counter;
+ vq_info.packed.last_used_idx =
+ vq->state.packed.last_used_idx;
+ } else
+ vq_info.split.avail_index =
+ vq->state.split.avail_index;
+
+ vq_info.ready = vq->ready;
+
+ ret = -EFAULT;
+ if (copy_to_user(argp, &vq_info, sizeof(vq_info)))
+ break;
+
+ ret = 0;
+ break;
+ }
+ case VDUSE_VQ_SETUP_KICKFD: {
+ struct vduse_vq_eventfd eventfd;
+
+ ret = -EFAULT;
+ if (copy_from_user(&eventfd, argp, sizeof(eventfd)))
+ break;
+
+ ret = vduse_kickfd_setup(dev, &eventfd);
+ break;
+ }
+ /* Inject a virtqueue interrupt: try the eventfd first, otherwise
+ * pick the next effective CPU and queue the injection work.
+ */
+ case VDUSE_VQ_INJECT_IRQ: {
+ u32 index;
+
+ ret = -EFAULT;
+ if (get_user(index, (u32 __user *)argp))
+ break;
+
+ ret = -EINVAL;
+ if (index >= dev->vq_num)
+ break;
+
+ ret = 0;
+ index = array_index_nospec(index, dev->vq_num);
+ if (!vduse_vq_signal_irqfd(dev->vqs[index])) {
+ vduse_vq_update_effective_cpu(dev->vqs[index]);
+ ret = vduse_dev_queue_irq_work(dev,
+ &dev->vqs[index]->inject,
+ dev->vqs[index]->irq_effective_cpu);
+ }
+ break;
+ }
+ case VDUSE_IOTLB_REG_UMEM: {
+ struct vduse_iova_umem umem;
+
+ ret = -EFAULT;
+ if (copy_from_user(&umem, argp, sizeof(umem)))
+ break;
+
+ ret = -EINVAL;
+ if (!is_mem_zero((const char *)umem.reserved,
+ sizeof(umem.reserved)))
+ break;
+
+ mutex_lock(&dev->domain_lock);
+ ret = vduse_dev_reg_umem(dev, umem.iova,
+ umem.uaddr, umem.size);
+ mutex_unlock(&dev->domain_lock);
+ break;
+ }
+ case VDUSE_IOTLB_DEREG_UMEM: {
+ struct vduse_iova_umem umem;
+
+ ret = -EFAULT;
+ if (copy_from_user(&umem, argp, sizeof(umem)))
+ break;
+
+ ret = -EINVAL;
+ if (!is_mem_zero((const char *)umem.reserved,
+ sizeof(umem.reserved)))
+ break;
+ mutex_lock(&dev->domain_lock);
+ ret = vduse_dev_dereg_umem(dev, umem.iova,
+ umem.size);
+ mutex_unlock(&dev->domain_lock);
+ break;
+ }
+ /* Report the range of the first mapping overlapping [start, last] and
+ * whether it is the bounce region (which can be backed by umem).
+ */
+ case VDUSE_IOTLB_GET_INFO: {
+ struct vduse_iova_info info;
+ struct vhost_iotlb_map *map;
+
+ ret = -EFAULT;
+ if (copy_from_user(&info, argp, sizeof(info)))
+ break;
+
+ ret = -EINVAL;
+ if (info.start > info.last)
+ break;
+
+ if (!is_mem_zero((const char *)info.reserved,
+ sizeof(info.reserved)))
+ break;
+
+ mutex_lock(&dev->domain_lock);
+ if (!dev->domain) {
+ mutex_unlock(&dev->domain_lock);
+ break;
+ }
+ spin_lock(&dev->domain->iotlb_lock);
+ map = vhost_iotlb_itree_first(dev->domain->iotlb,
+ info.start, info.last);
+ if (map) {
+ info.start = map->start;
+ info.last = map->last;
+ info.capability = 0;
+ if (dev->domain->bounce_map && map->start == 0 &&
+ map->last == dev->domain->bounce_size - 1)
+ info.capability |= VDUSE_IOVA_CAP_UMEM;
+ }
+ spin_unlock(&dev->domain->iotlb_lock);
+ mutex_unlock(&dev->domain_lock);
+ if (!map)
+ break;
+
+ ret = -EFAULT;
+ if (copy_to_user(argp, &info, sizeof(info)))
+ break;
+
+ ret = 0;
+ break;
+ }
+ default:
+ ret = -ENOIOCTLCMD;
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Release handler for a per-device file: drops any registered umem,
+ * requeues in-flight messages and marks the device as disconnected so it
+ * can be opened again.
+ */
+static int vduse_dev_release(struct inode *inode, struct file *file)
+{
+ struct vduse_dev *dev = file->private_data;
+
+ mutex_lock(&dev->domain_lock);
+ if (dev->domain)
+ vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size);
+ mutex_unlock(&dev->domain_lock);
+ spin_lock(&dev->msg_lock);
+ /* Make sure the inflight messages can be processed after reconnection */
+ list_splice_init(&dev->recv_list, &dev->send_list);
+ spin_unlock(&dev->msg_lock);
+ dev->connected = false;
+
+ return 0;
+}
+
+/* Look up the VDUSE device registered under @minor; NULL if there is none. */
+static struct vduse_dev *vduse_dev_get_from_minor(int minor)
+{
+	struct vduse_dev *found;
+
+	mutex_lock(&vduse_lock);
+	found = idr_find(&vduse_idr, minor);
+	mutex_unlock(&vduse_lock);
+
+	return found;
+}
+
+/*
+ * Open handler for a per-device file. Only one opener is allowed at a
+ * time: returns -ENODEV if the minor is unknown and -EBUSY if the device
+ * is already connected.
+ */
+static int vduse_dev_open(struct inode *inode, struct file *file)
+{
+	struct vduse_dev *dev = vduse_dev_get_from_minor(iminor(inode));
+	int ret = 0;
+
+	if (!dev)
+		return -ENODEV;
+
+	mutex_lock(&dev->lock);
+	if (dev->connected) {
+		ret = -EBUSY;
+	} else {
+		dev->connected = true;
+		file->private_data = dev;
+	}
+	mutex_unlock(&dev->lock);
+
+	return ret;
+}
+
+/* File operations of the per-device character device (/dev/vduse/$DEVICE). */
+static const struct file_operations vduse_dev_fops = {
+ .owner = THIS_MODULE,
+ .open = vduse_dev_open,
+ .release = vduse_dev_release,
+ .read_iter = vduse_dev_read_iter,
+ .write_iter = vduse_dev_write_iter,
+ .poll = vduse_dev_poll,
+ .unlocked_ioctl = vduse_dev_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+ .llseek = noop_llseek,
+};
+
+/*
+ * sysfs show callback: print the virtqueue's interrupt callback affinity
+ * as a CPU bitmap. Uses sysfs_emit() so the output is bounded to the
+ * sysfs page, matching the other show callbacks in this file.
+ */
+static ssize_t irq_cb_affinity_show(struct vduse_virtqueue *vq, char *buf)
+{
+	return sysfs_emit(buf, "%*pb\n", cpumask_pr_args(&vq->irq_affinity));
+}
+
+/*
+ * sysfs store callback: parse a CPU mask from @buf and install it as the
+ * virtqueue's interrupt callback affinity. The mask must contain at least
+ * one online CPU, otherwise -EINVAL is returned.
+ */
+static ssize_t irq_cb_affinity_store(struct vduse_virtqueue *vq,
+				     const char *buf, size_t count)
+{
+	cpumask_var_t mask;
+	int ret;
+
+	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = cpumask_parse(buf, mask);
+	if (!ret) {
+		if (cpumask_intersects(mask, cpu_online_mask)) {
+			cpumask_copy(&vq->irq_affinity, mask);
+			ret = count;
+		} else {
+			ret = -EINVAL;
+		}
+	}
+
+	free_cpumask_var(mask);
+	return ret;
+}
+
+/*
+ * A sysfs attribute of a virtqueue kobject, with show/store callbacks that
+ * take the virtqueue directly (dispatched by vq_attr_show/vq_attr_store).
+ */
+struct vq_sysfs_entry {
+ struct attribute attr;
+ ssize_t (*show)(struct vduse_virtqueue *vq, char *buf);
+ ssize_t (*store)(struct vduse_virtqueue *vq, const char *buf,
+ size_t count);
+};
+
+/* Read/write "irq_cb_affinity" attribute exposed for each virtqueue. */
+static struct vq_sysfs_entry irq_cb_affinity_attr = __ATTR_RW(irq_cb_affinity);
+
+/* Default attributes of a virtqueue kobject (referenced as vq_groups). */
+static struct attribute *vq_attrs[] = {
+ &irq_cb_affinity_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(vq);
+
+/*
+ * Generic sysfs show dispatcher for virtqueue kobjects: forwards to the
+ * attribute's own show callback, or returns -EIO if it has none.
+ */
+static ssize_t vq_attr_show(struct kobject *kobj, struct attribute *attr,
+			    char *buf)
+{
+	struct vq_sysfs_entry *entry;
+
+	entry = container_of(attr, struct vq_sysfs_entry, attr);
+	if (!entry->show)
+		return -EIO;
+
+	return entry->show(container_of(kobj, struct vduse_virtqueue, kobj),
+			   buf);
+}
+
+/*
+ * Generic sysfs store dispatcher for virtqueue kobjects: forwards to the
+ * attribute's own store callback, or returns -EIO if it has none.
+ */
+static ssize_t vq_attr_store(struct kobject *kobj, struct attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct vq_sysfs_entry *entry;
+
+	entry = container_of(attr, struct vq_sysfs_entry, attr);
+	if (!entry->store)
+		return -EIO;
+
+	return entry->store(container_of(kobj, struct vduse_virtqueue, kobj),
+			    buf, count);
+}
+
+/* sysfs operations shared by all virtqueue kobject attributes. */
+static const struct sysfs_ops vq_sysfs_ops = {
+ .show = vq_attr_show,
+ .store = vq_attr_store,
+};
+
+/* kobject release callback: frees the virtqueue embedding @kobj. */
+static void vq_release(struct kobject *kobj)
+{
+	kfree(container_of(kobj, struct vduse_virtqueue, kobj));
+}
+
+/* kobject type of a virtqueue: release hook, sysfs ops and default attributes. */
+static const struct kobj_type vq_type = {
+ .release = vq_release,
+ .sysfs_ops = &vq_sysfs_ops,
+ .default_groups = vq_groups,
+};
+
+/*
+ * Drop the reference on every virtqueue kobject of @dev and free the
+ * pointer array. Does nothing if the array was never allocated.
+ */
+static void vduse_dev_deinit_vqs(struct vduse_dev *dev)
+{
+	int idx = 0;
+
+	if (!dev->vqs)
+		return;
+
+	while (idx < dev->vq_num) {
+		kobject_put(&dev->vqs[idx]->kobj);
+		idx++;
+	}
+
+	kfree(dev->vqs);
+}
+
+/*
+ * Allocate and initialise @vq_num virtqueues for @dev and register a
+ * "vq%d" kobject for each one below the device's kobject.
+ *
+ * Returns 0 on success or a negative errno. On failure every virtqueue
+ * created so far is released and dev->vqs is reset to NULL.
+ */
+static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
+{
+	int ret, i;
+
+	dev->vq_align = vq_align;
+	dev->vq_num = vq_num;
+	dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL);
+	if (!dev->vqs)
+		return -ENOMEM;
+
+	for (i = 0; i < vq_num; i++) {
+		dev->vqs[i] = kzalloc(sizeof(*dev->vqs[i]), GFP_KERNEL);
+		if (!dev->vqs[i]) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		dev->vqs[i]->index = i;
+		dev->vqs[i]->irq_effective_cpu = IRQ_UNBOUND;
+		INIT_WORK(&dev->vqs[i]->inject, vduse_vq_irq_inject);
+		INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work);
+		spin_lock_init(&dev->vqs[i]->kick_lock);
+		spin_lock_init(&dev->vqs[i]->irq_lock);
+		cpumask_setall(&dev->vqs[i]->irq_affinity);
+
+		kobject_init(&dev->vqs[i]->kobj, &vq_type);
+		ret = kobject_add(&dev->vqs[i]->kobj,
+				  &dev->dev->kobj, "vq%d", i);
+		if (ret) {
+			/*
+			 * Once kobject_init() has run, the object must be
+			 * released with kobject_put(): it frees the name
+			 * kobject_add() may have allocated and then calls
+			 * vq_release(), which frees the virtqueue itself.
+			 */
+			kobject_put(&dev->vqs[i]->kobj);
+			goto err;
+		}
+	}
+
+	return 0;
+err:
+	/* Entry i was already handled above; unwind entries [0, i). */
+	while (i--)
+		kobject_put(&dev->vqs[i]->kobj);
+	kfree(dev->vqs);
+	dev->vqs = NULL;
+	return ret;
+}
+
+/*
+ * Allocate a zeroed vduse_dev and initialise its locks, message lists,
+ * config-interrupt work item and wait queue. Returns NULL on allocation
+ * failure.
+ */
+static struct vduse_dev *vduse_dev_create(void)
+{
+	struct vduse_dev *dev;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (dev) {
+		mutex_init(&dev->lock);
+		mutex_init(&dev->mem_lock);
+		mutex_init(&dev->domain_lock);
+		spin_lock_init(&dev->msg_lock);
+		INIT_LIST_HEAD(&dev->send_list);
+		INIT_LIST_HEAD(&dev->recv_list);
+		spin_lock_init(&dev->irq_lock);
+		init_rwsem(&dev->rwsem);
+
+		INIT_WORK(&dev->inject, vduse_dev_irq_inject);
+		init_waitqueue_head(&dev->waitq);
+	}
+
+	return dev;
+}
+
+/*
+ * Free the vduse_dev structure itself. Members such as the name, config
+ * buffer and virtqueues are not released here.
+ */
+static void vduse_dev_destroy(struct vduse_dev *dev)
+{
+ kfree(dev);
+}
+
+/*
+ * Find a VDUSE device by name in vduse_idr; returns NULL if no device
+ * matches. No locking is done here.
+ */
+static struct vduse_dev *vduse_find_dev(const char *name)
+{
+	struct vduse_dev *dev;
+	int id;
+
+	idr_for_each_entry(&vduse_idr, dev, id) {
+		if (strcmp(dev->name, name) == 0)
+			return dev;
+	}
+
+	return NULL;
+}
+
+/*
+ * Destroy the VDUSE device called @name.
+ *
+ * Returns -EINVAL if no such device exists and -EBUSY if it still has a
+ * vDPA device attached or is currently opened by userspace.
+ */
+static int vduse_destroy_dev(char *name)
+{
+ struct vduse_dev *dev = vduse_find_dev(name);
+
+ if (!dev)
+ return -EINVAL;
+
+ mutex_lock(&dev->lock);
+ if (dev->vdev || dev->connected) {
+ mutex_unlock(&dev->lock);
+ return -EBUSY;
+ }
+ /* Mark as connected so a concurrent open() fails with -EBUSY. */
+ dev->connected = true;
+ mutex_unlock(&dev->lock);
+
+ vduse_dev_reset(dev);
+ device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
+ idr_remove(&vduse_idr, dev->minor);
+ kvfree(dev->config);
+ vduse_dev_deinit_vqs(dev);
+ if (dev->domain)
+ vduse_domain_destroy(dev->domain);
+ kfree(dev->name);
+ vduse_dev_destroy(dev);
+ /* Pairs with __module_get() in vduse_create_dev(). */
+ module_put(THIS_MODULE);
+
+ return 0;
+}
+
+/* Return true if @device_id is listed in allowed_device_id[]. */
+static bool device_is_allowed(u32 device_id)
+{
+	int idx = 0;
+
+	while (idx < ARRAY_SIZE(allowed_device_id)) {
+		if (allowed_device_id[idx] == device_id)
+			return true;
+		idx++;
+	}
+
+	return false;
+}
+
+/*
+ * Validate the feature bits offered by a new device:
+ * VIRTIO_F_ACCESS_PLATFORM is mandatory, and VIRTIO_BLK_F_CONFIG_WCE is
+ * rejected because only a read-only configuration space is supported.
+ */
+static bool features_is_valid(u64 features)
+{
+	const u64 required = 1ULL << VIRTIO_F_ACCESS_PLATFORM;
+	const u64 unsupported = 1ULL << VIRTIO_BLK_F_CONFIG_WCE;
+
+	return (features & required) && !(features & unsupported);
+}
+
+/*
+ * Sanity-check a device configuration passed in from userspace: reserved
+ * bytes must be zero, vq_align and config_size may not exceed PAGE_SIZE,
+ * vq_num may not exceed 0xffff, the name must be non-empty, and both the
+ * device id and the feature bits must be acceptable.
+ */
+static bool vduse_validate_config(struct vduse_dev_config *config)
+{
+	if (!is_mem_zero((const char *)config->reserved,
+			 sizeof(config->reserved)))
+		return false;
+
+	if (config->vq_align > PAGE_SIZE ||
+	    config->config_size > PAGE_SIZE ||
+	    config->vq_num > 0xffff)
+		return false;
+
+	if (!config->name[0])
+		return false;
+
+	return device_is_allowed(config->device_id) &&
+	       features_is_valid(config->features);
+}
+
+/* sysfs show callback for "msg_timeout": prints the current value. */
+static ssize_t msg_timeout_show(struct device *device,
+				struct device_attribute *attr, char *buf)
+{
+	const struct vduse_dev *vduse = dev_get_drvdata(device);
+
+	return sysfs_emit(buf, "%u\n", vduse->msg_timeout);
+}
+
+/*
+ * sysfs store callback for "msg_timeout": parses an unsigned decimal value
+ * and stores it in the device. The field is left untouched on parse error.
+ */
+static ssize_t msg_timeout_store(struct device *device,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	struct vduse_dev *vduse = dev_get_drvdata(device);
+	int err;
+
+	err = kstrtouint(buf, 10, &vduse->msg_timeout);
+	if (err < 0)
+		return err;
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(msg_timeout);
+
+/* sysfs show callback for "bounce_size": prints the configured size. */
+static ssize_t bounce_size_show(struct device *device,
+				struct device_attribute *attr, char *buf)
+{
+	const struct vduse_dev *vduse = dev_get_drvdata(device);
+
+	return sysfs_emit(buf, "%u\n", vduse->bounce_size);
+}
+
+/*
+ * sysfs store callback for "bounce_size".
+ *
+ * The size can only be changed while no IOVA domain exists (-EPERM
+ * otherwise). The value must lie within [VDUSE_MIN_BOUNCE_SIZE,
+ * VDUSE_MAX_BOUNCE_SIZE] and is rounded down to a page boundary.
+ */
+static ssize_t bounce_size_store(struct device *device,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	struct vduse_dev *dev = dev_get_drvdata(device);
+	unsigned int requested;
+	int ret;
+
+	mutex_lock(&dev->domain_lock);
+	if (dev->domain) {
+		ret = -EPERM;
+		goto unlock;
+	}
+
+	ret = kstrtouint(buf, 10, &requested);
+	if (ret < 0)
+		goto unlock;
+
+	if (requested < VDUSE_MIN_BOUNCE_SIZE ||
+	    requested > VDUSE_MAX_BOUNCE_SIZE) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	dev->bounce_size = requested & PAGE_MASK;
+	ret = count;
+unlock:
+	mutex_unlock(&dev->domain_lock);
+	return ret;
+}
+
+static DEVICE_ATTR_RW(bounce_size);
+
+/* sysfs attributes attached to every VDUSE device (see vduse_dev_groups). */
+static struct attribute *vduse_dev_attrs[] = {
+ &dev_attr_msg_timeout.attr,
+ &dev_attr_bounce_size.attr,
+ NULL
+};
+
+ATTRIBUTE_GROUPS(vduse_dev);
+
+/*
+ * Create a new VDUSE device from a validated userspace configuration.
+ *
+ * @config_buf becomes the device's config space on success; on failure it
+ * is not freed here (the visible caller, vduse_ioctl(), frees it).
+ *
+ * Returns 0 on success, -EEXIST if the name is already in use, or another
+ * negative errno.
+ */
+static int vduse_create_dev(struct vduse_dev_config *config,
+ void *config_buf, u64 api_version)
+{
+ int ret;
+ struct vduse_dev *dev;
+
+ ret = -EEXIST;
+ if (vduse_find_dev(config->name))
+ goto err;
+
+ ret = -ENOMEM;
+ dev = vduse_dev_create();
+ if (!dev)
+ goto err;
+
+ dev->api_version = api_version;
+ dev->device_features = config->features;
+ dev->device_id = config->device_id;
+ dev->vendor_id = config->vendor_id;
+ dev->name = kstrdup(config->name, GFP_KERNEL);
+ if (!dev->name)
+ goto err_str;
+
+ dev->bounce_size = VDUSE_BOUNCE_SIZE;
+ dev->config = config_buf;
+ dev->config_size = config->config_size;
+
+ /* Minor numbers for devices start at 1. */
+ ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL);
+ if (ret < 0)
+ goto err_idr;
+
+ dev->minor = ret;
+ dev->msg_timeout = VDUSE_MSG_DEFAULT_TIMEOUT;
+ dev->dev = device_create_with_groups(vduse_class, NULL,
+ MKDEV(MAJOR(vduse_major), dev->minor),
+ dev, vduse_dev_groups, "%s", config->name);
+ if (IS_ERR(dev->dev)) {
+ ret = PTR_ERR(dev->dev);
+ goto err_dev;
+ }
+
+ ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num);
+ if (ret)
+ goto err_vqs;
+
+ /* Pin the module while the device exists; dropped on destroy. */
+ __module_get(THIS_MODULE);
+
+ return 0;
+err_vqs:
+ device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
+err_dev:
+ idr_remove(&vduse_idr, dev->minor);
+err_idr:
+ kfree(dev->name);
+err_str:
+ vduse_dev_destroy(dev);
+err:
+ return ret;
+}
+
+/*
+ * ioctl handler for the control device (/dev/vduse/control).
+ *
+ * All commands run under vduse_lock. Unknown commands return -EINVAL.
+ */
+static long vduse_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ int ret;
+ void __user *argp = (void __user *)arg;
+ struct vduse_control *control = file->private_data;
+
+ mutex_lock(&vduse_lock);
+ switch (cmd) {
+ case VDUSE_GET_API_VERSION:
+ ret = put_user(control->api_version, (u64 __user *)argp);
+ break;
+ case VDUSE_SET_API_VERSION: {
+ u64 api_version;
+
+ ret = -EFAULT;
+ if (get_user(api_version, (u64 __user *)argp))
+ break;
+
+ ret = -EINVAL;
+ if (api_version > VDUSE_API_VERSION)
+ break;
+
+ ret = 0;
+ control->api_version = api_version;
+ break;
+ }
+ case VDUSE_CREATE_DEV: {
+ struct vduse_dev_config config;
+ unsigned long size = offsetof(struct vduse_dev_config, config);
+ void *buf;
+
+ /* Copy only the fixed header; the config space follows it. */
+ ret = -EFAULT;
+ if (copy_from_user(&config, argp, size))
+ break;
+
+ ret = -EINVAL;
+ if (vduse_validate_config(&config) == false)
+ break;
+
+ buf = vmemdup_user(argp + size, config.config_size);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ break;
+ }
+ /* Force NUL termination of the user-supplied name. */
+ config.name[VDUSE_NAME_MAX - 1] = '\0';
+ ret = vduse_create_dev(&config, buf, control->api_version);
+ if (ret)
+ kvfree(buf);
+ break;
+ }
+ case VDUSE_DESTROY_DEV: {
+ char name[VDUSE_NAME_MAX];
+
+ ret = -EFAULT;
+ if (copy_from_user(name, argp, VDUSE_NAME_MAX))
+ break;
+
+ name[VDUSE_NAME_MAX - 1] = '\0';
+ ret = vduse_destroy_dev(name);
+ break;
+ }
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ mutex_unlock(&vduse_lock);
+
+ return ret;
+}
+
+/* Release handler for the control device: frees the per-open state. */
+static int vduse_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+/*
+ * Open handler for the control device: allocates per-open state whose API
+ * version starts at VDUSE_API_VERSION.
+ */
+static int vduse_open(struct inode *inode, struct file *file)
+{
+	struct vduse_control *control = kmalloc(sizeof(*control), GFP_KERNEL);
+
+	if (!control)
+		return -ENOMEM;
+
+	control->api_version = VDUSE_API_VERSION;
+	file->private_data = control;
+
+	return 0;
+}
+
+/* File operations of the control character device (/dev/vduse/control). */
+static const struct file_operations vduse_ctrl_fops = {
+ .owner = THIS_MODULE,
+ .open = vduse_open,
+ .release = vduse_release,
+ .unlocked_ioctl = vduse_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+ .llseek = noop_llseek,
+};
+
+/*
+ * Class devnode callback: places device nodes under "vduse/<name>".
+ * @mode is not modified. Returns a kasprintf()-allocated string (NULL on
+ * allocation failure).
+ */
+static char *vduse_devnode(const struct device *dev, umode_t *mode)
+{
+ return kasprintf(GFP_KERNEL, "vduse/%s", dev_name(dev));
+}
+
+/*
+ * vDPA management device for VDUSE together with the struct device that
+ * backs it; freed from the device's release callback.
+ */
+struct vduse_mgmt_dev {
+ struct vdpa_mgmt_dev mgmt_dev;
+ struct device dev;
+};
+
+/* Single management device instance, allocated in vduse_mgmtdev_init(). */
+static struct vduse_mgmt_dev *vduse_mgmt;
+
+/*
+ * Allocate the vDPA device for @dev and set up its DMA configuration
+ * (64-bit mask, VDUSE DMA ops, itself as the DMA device).
+ *
+ * Returns -EEXIST if @dev already has a vDPA device, otherwise 0 or a
+ * negative errno.
+ */
+static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name)
+{
+ struct vduse_vdpa *vdev;
+ int ret;
+
+ if (dev->vdev)
+ return -EEXIST;
+
+ vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev,
+ &vduse_vdpa_config_ops, 1, 1, name, true);
+ if (IS_ERR(vdev))
+ return PTR_ERR(vdev);
+
+ dev->vdev = vdev;
+ vdev->dev = dev;
+ vdev->vdpa.dev.dma_mask = &vdev->vdpa.dev.coherent_dma_mask;
+ ret = dma_set_mask_and_coherent(&vdev->vdpa.dev, DMA_BIT_MASK(64));
+ if (ret) {
+ /* NOTE(review): dev->vdev is presumably cleared by the vdpa free callback, which is not visible here; confirm. */
+ put_device(&vdev->vdpa.dev);
+ return ret;
+ }
+ set_dma_ops(&vdev->vdpa.dev, &vduse_dev_dma_ops);
+ vdev->vdpa.dma_dev = &vdev->vdpa.dev;
+ vdev->vdpa.mdev = &vduse_mgmt->mgmt_dev;
+
+ return 0;
+}
+
+/*
+ * Management-device .dev_add callback: attach a vDPA device to the
+ * existing VDUSE device called @name.
+ *
+ * Returns -EINVAL if the VDUSE device does not exist or is not ready,
+ * -ENOMEM if the IOVA domain cannot be created, or the error from vDPA
+ * initialisation/registration.
+ */
+static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
+ const struct vdpa_dev_set_config *config)
+{
+ struct vduse_dev *dev;
+ int ret;
+
+ mutex_lock(&vduse_lock);
+ dev = vduse_find_dev(name);
+ if (!dev || !vduse_dev_is_ready(dev)) {
+ mutex_unlock(&vduse_lock);
+ return -EINVAL;
+ }
+ ret = vduse_dev_init_vdpa(dev, name);
+ mutex_unlock(&vduse_lock);
+ if (ret)
+ return ret;
+
+ /* Create the IOVA domain lazily, using the configured bounce size. */
+ mutex_lock(&dev->domain_lock);
+ if (!dev->domain)
+ dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
+ dev->bounce_size);
+ mutex_unlock(&dev->domain_lock);
+ if (!dev->domain) {
+ put_device(&dev->vdev->vdpa.dev);
+ return -ENOMEM;
+ }
+
+ ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num);
+ if (ret) {
+ put_device(&dev->vdev->vdpa.dev);
+ mutex_lock(&dev->domain_lock);
+ vduse_domain_destroy(dev->domain);
+ dev->domain = NULL;
+ mutex_unlock(&dev->domain_lock);
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Management-device .dev_del callback: unregister the vDPA device. */
+static void vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
+{
+ _vdpa_unregister_device(dev);
+}
+
+/* Add/delete callbacks registered with the vDPA management core. */
+static const struct vdpa_mgmtdev_ops vdpa_dev_mgmtdev_ops = {
+ .dev_add = vdpa_dev_add,
+ .dev_del = vdpa_dev_del,
+};
+
+/* Device ids advertised by the management device; zero-terminated. */
+static struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+
+/* Device release callback: frees the vduse_mgmt_dev embedding @dev. */
+static void vduse_mgmtdev_release(struct device *dev)
+{
+	kfree(container_of(dev, struct vduse_mgmt_dev, dev));
+}
+
+/*
+ * Allocate and register the "vduse" management device with the driver
+ * core and the vDPA management layer. Returns 0 or a negative errno.
+ */
+static int vduse_mgmtdev_init(void)
+{
+ int ret;
+
+ vduse_mgmt = kzalloc(sizeof(*vduse_mgmt), GFP_KERNEL);
+ if (!vduse_mgmt)
+ return -ENOMEM;
+
+ ret = dev_set_name(&vduse_mgmt->dev, "vduse");
+ if (ret) {
+ kfree(vduse_mgmt);
+ return ret;
+ }
+
+ vduse_mgmt->dev.release = vduse_mgmtdev_release;
+
+ ret = device_register(&vduse_mgmt->dev);
+ if (ret)
+ goto dev_reg_err;
+
+ vduse_mgmt->mgmt_dev.id_table = id_table;
+ vduse_mgmt->mgmt_dev.ops = &vdpa_dev_mgmtdev_ops;
+ vduse_mgmt->mgmt_dev.device = &vduse_mgmt->dev;
+ ret = vdpa_mgmtdev_register(&vduse_mgmt->mgmt_dev);
+ if (ret)
+ device_unregister(&vduse_mgmt->dev);
+
+ return ret;
+
+dev_reg_err:
+ /* After device_register() the memory is freed via the release callback. */
+ put_device(&vduse_mgmt->dev);
+ return ret;
+}
+
+/* Undo vduse_mgmtdev_init(): unregister from vDPA, then the device itself. */
+static void vduse_mgmtdev_exit(void)
+{
+ vdpa_mgmtdev_unregister(&vduse_mgmt->mgmt_dev);
+ device_unregister(&vduse_mgmt->dev);
+}
+
+/*
+ * Module init: create the device class, reserve the chrdev region,
+ * register the control and per-device cdevs, allocate the interrupt
+ * workqueues and initialise the IOVA domain code and management device.
+ * Failures unwind in reverse order through the error labels.
+ */
+static int vduse_init(void)
+{
+ int ret;
+ struct device *dev;
+
+ vduse_class = class_create("vduse");
+ if (IS_ERR(vduse_class))
+ return PTR_ERR(vduse_class);
+
+ vduse_class->devnode = vduse_devnode;
+
+ ret = alloc_chrdev_region(&vduse_major, 0, VDUSE_DEV_MAX, "vduse");
+ if (ret)
+ goto err_chardev_region;
+
+ /* /dev/vduse/control */
+ cdev_init(&vduse_ctrl_cdev, &vduse_ctrl_fops);
+ vduse_ctrl_cdev.owner = THIS_MODULE;
+ ret = cdev_add(&vduse_ctrl_cdev, vduse_major, 1);
+ if (ret)
+ goto err_ctrl_cdev;
+
+ dev = device_create(vduse_class, NULL, vduse_major, NULL, "control");
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ goto err_device;
+ }
+
+ /* /dev/vduse/$DEVICE */
+ cdev_init(&vduse_cdev, &vduse_dev_fops);
+ vduse_cdev.owner = THIS_MODULE;
+ ret = cdev_add(&vduse_cdev, MKDEV(MAJOR(vduse_major), 1),
+ VDUSE_DEV_MAX - 1);
+ if (ret)
+ goto err_cdev;
+
+ ret = -ENOMEM;
+ vduse_irq_wq = alloc_workqueue("vduse-irq",
+ WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0);
+ if (!vduse_irq_wq)
+ goto err_wq;
+
+ /* Per-CPU workqueue used when a virtqueue has an effective CPU. */
+ vduse_irq_bound_wq = alloc_workqueue("vduse-irq-bound", WQ_HIGHPRI, 0);
+ if (!vduse_irq_bound_wq)
+ goto err_bound_wq;
+
+ ret = vduse_domain_init();
+ if (ret)
+ goto err_domain;
+
+ ret = vduse_mgmtdev_init();
+ if (ret)
+ goto err_mgmtdev;
+
+ return 0;
+err_mgmtdev:
+ vduse_domain_exit();
+err_domain:
+ destroy_workqueue(vduse_irq_bound_wq);
+err_bound_wq:
+ destroy_workqueue(vduse_irq_wq);
+err_wq:
+ cdev_del(&vduse_cdev);
+err_cdev:
+ device_destroy(vduse_class, vduse_major);
+err_device:
+ cdev_del(&vduse_ctrl_cdev);
+err_ctrl_cdev:
+ unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
+err_chardev_region:
+ class_destroy(vduse_class);
+ return ret;
+}
+module_init(vduse_init);
+
+/* Module exit: tear everything down in the reverse order of vduse_init(). */
+static void vduse_exit(void)
+{
+ vduse_mgmtdev_exit();
+ vduse_domain_exit();
+ destroy_workqueue(vduse_irq_bound_wq);
+ destroy_workqueue(vduse_irq_wq);
+ cdev_del(&vduse_cdev);
+ device_destroy(vduse_class, vduse_major);
+ cdev_del(&vduse_ctrl_cdev);
+ unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
+ class_destroy(vduse_class);
+}
+module_exit(vduse_exit);
+
+MODULE_LICENSE(DRV_LICENSE);
+MODULE_AUTHOR(DRV_AUTHOR);
+MODULE_DESCRIPTION(DRV_DESC);
diff --git a/drivers/vdpa/virtio_pci/Makefile b/drivers/vdpa/virtio_pci/Makefile
new file mode 100644
index 0000000000..231088d3af
--- /dev/null
+++ b/drivers/vdpa/virtio_pci/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_VP_VDPA) += vp_vdpa.o
diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c
new file mode 100644
index 0000000000..281287fae8
--- /dev/null
+++ b/drivers/vdpa/virtio_pci/vp_vdpa.c
@@ -0,0 +1,667 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vDPA bridge driver for modern virtio-pci device
+ *
+ * Copyright (c) 2020, Red Hat Inc. All rights reserved.
+ * Author: Jason Wang <jasowang@redhat.com>
+ *
+ * Based on virtio_pci_modern.c.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/vdpa.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_pci.h>
+#include <linux/virtio_pci_modern.h>
+#include <uapi/linux/vdpa.h>
+
+/* Maximum virtqueue size reported by vp_vdpa_get_vq_num_max(). */
+#define VP_VDPA_QUEUE_MAX 256
+#define VP_VDPA_DRIVER_NAME "vp_vdpa"
+/* Size of the buffers holding MSI-X interrupt names. */
+#define VP_VDPA_NAME_SIZE 256
+
+/* Per-virtqueue state of a vp_vdpa device. */
+struct vp_vring {
+ /* NOTE(review): presumably the mapped notification area of this queue; its users are outside this view. */
+ void __iomem *notify;
+ /* Name passed to devm_request_irq() for this queue's interrupt. */
+ char msix_name[VP_VDPA_NAME_SIZE];
+ /* Callback invoked from vp_vdpa_vq_handler(). */
+ struct vdpa_callback cb;
+ resource_size_t notify_pa;
+ /* Linux irq number, or VIRTIO_MSI_NO_VECTOR when none is requested. */
+ int irq;
+};
+
+/* A vDPA device backed by a modern virtio-pci device. */
+struct vp_vdpa {
+ struct vdpa_device vdpa;
+ struct virtio_pci_modern_device *mdev;
+ /* Array of per-queue state, indexed by queue id. */
+ struct vp_vring *vring;
+ /* Callback invoked from vp_vdpa_config_handler(). */
+ struct vdpa_callback config_cb;
+ /* Value returned by vp_vdpa_get_device_features(). */
+ u64 device_features;
+ /* Name passed to devm_request_irq() for the config interrupt. */
+ char msix_name[VP_VDPA_NAME_SIZE];
+ /* Linux irq number, or VIRTIO_MSI_NO_VECTOR when none is requested. */
+ int config_irq;
+ /* Number of virtqueues. */
+ int queues;
+ /* Number of allocated MSI-X vectors; 0 when none are allocated. */
+ int vectors;
+};
+
+/*
+ * Management device wrapper tying a vDPA management device to its
+ * virtio-pci device and the vp_vdpa instance.
+ * NOTE(review): the users of this struct are outside this view.
+ */
+struct vp_vdpa_mgmtdev {
+ struct vdpa_mgmt_dev mgtdev;
+ struct virtio_pci_modern_device *mdev;
+ struct vp_vdpa *vp_vdpa;
+};
+
+/* Convert an embedded vdpa_device pointer back to its vp_vdpa container. */
+static struct vp_vdpa *vdpa_to_vp(struct vdpa_device *vdpa)
+{
+ return container_of(vdpa, struct vp_vdpa, vdpa);
+}
+
+/* Return the modern virtio-pci device behind a vdpa_device. */
+static struct virtio_pci_modern_device *vdpa_to_mdev(struct vdpa_device *vdpa)
+{
+	return vdpa_to_vp(vdpa)->mdev;
+}
+
+/* Return the modern virtio-pci device behind a vp_vdpa instance. */
+static struct virtio_pci_modern_device *vp_vdpa_to_mdev(struct vp_vdpa *vp_vdpa)
+{
+ return vp_vdpa->mdev;
+}
+
+/* Return the device feature bits stored in the vp_vdpa instance. */
+static u64 vp_vdpa_get_device_features(struct vdpa_device *vdpa)
+{
+	return vdpa_to_vp(vdpa)->device_features;
+}
+
+/* Write the driver feature bits to the device; always returns 0. */
+static int vp_vdpa_set_driver_features(struct vdpa_device *vdpa, u64 features)
+{
+	vp_modern_set_features(vdpa_to_mdev(vdpa), features);
+
+	return 0;
+}
+
+/* Read back the driver feature bits from the device. */
+static u64 vp_vdpa_get_driver_features(struct vdpa_device *vdpa)
+{
+	return vp_modern_get_driver_features(vdpa_to_mdev(vdpa));
+}
+
+/* Read the virtio device status byte. */
+static u8 vp_vdpa_get_status(struct vdpa_device *vdpa)
+{
+	return vp_modern_get_status(vdpa_to_mdev(vdpa));
+}
+
+/*
+ * Return the irq number of virtqueue @idx, or -EINVAL if no interrupt is
+ * currently set up for it.
+ */
+static int vp_vdpa_get_vq_irq(struct vdpa_device *vdpa, u16 idx)
+{
+	int irq = vdpa_to_vp(vdpa)->vring[idx].irq;
+
+	return irq == VIRTIO_MSI_NO_VECTOR ? -EINVAL : irq;
+}
+
+/*
+ * Release every interrupt requested for the device: detach and free each
+ * virtqueue irq and the config irq, then free the MSI-X vectors. Entries
+ * that were never requested are skipped.
+ */
+static void vp_vdpa_free_irq(struct vp_vdpa *vp_vdpa)
+{
+	struct virtio_pci_modern_device *mdev = vp_vdpa_to_mdev(vp_vdpa);
+	struct pci_dev *pdev = mdev->pci_dev;
+	int qid;
+
+	for (qid = 0; qid < vp_vdpa->queues; qid++) {
+		struct vp_vring *vring = &vp_vdpa->vring[qid];
+
+		if (vring->irq == VIRTIO_MSI_NO_VECTOR)
+			continue;
+
+		vp_modern_queue_vector(mdev, qid, VIRTIO_MSI_NO_VECTOR);
+		devm_free_irq(&pdev->dev, vring->irq, vring);
+		vring->irq = VIRTIO_MSI_NO_VECTOR;
+	}
+
+	if (vp_vdpa->config_irq != VIRTIO_MSI_NO_VECTOR) {
+		vp_modern_config_vector(mdev, VIRTIO_MSI_NO_VECTOR);
+		devm_free_irq(&pdev->dev, vp_vdpa->config_irq, vp_vdpa);
+		vp_vdpa->config_irq = VIRTIO_MSI_NO_VECTOR;
+	}
+
+	if (vp_vdpa->vectors) {
+		pci_free_irq_vectors(pdev);
+		vp_vdpa->vectors = 0;
+	}
+}
+
+/*
+ * Virtqueue interrupt handler: forwards to the callback registered for
+ * the ring, or reports IRQ_HANDLED if none is set.
+ */
+static irqreturn_t vp_vdpa_vq_handler(int irq, void *arg)
+{
+	struct vp_vring *vring = arg;
+
+	if (!vring->cb.callback)
+		return IRQ_HANDLED;
+
+	return vring->cb.callback(vring->cb.private);
+}
+
+/*
+ * Config-change interrupt handler: forwards to the registered config
+ * callback, or reports IRQ_HANDLED if none is set.
+ */
+static irqreturn_t vp_vdpa_config_handler(int irq, void *arg)
+{
+	struct vp_vdpa *vp_vdpa = arg;
+
+	if (!vp_vdpa->config_cb.callback)
+		return IRQ_HANDLED;
+
+	return vp_vdpa->config_cb.callback(vp_vdpa->config_cb.private);
+}
+
+/*
+ * Allocate one MSI-X vector per virtqueue plus one for config changes,
+ * request the corresponding interrupts and program the vectors into the
+ * device.
+ *
+ * Returns 0 on success or a negative errno; on failure everything that
+ * was set up so far is released again.
+ */
+static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa)
+{
+	struct virtio_pci_modern_device *mdev = vp_vdpa_to_mdev(vp_vdpa);
+	struct pci_dev *pdev = mdev->pci_dev;
+	int i, ret, irq;
+	int queues = vp_vdpa->queues;
+	int vectors = queues + 1;
+
+	ret = pci_alloc_irq_vectors(pdev, vectors, vectors, PCI_IRQ_MSIX);
+	if (ret != vectors) {
+		dev_err(&pdev->dev,
+			"vp_vdpa: fail to allocate irq vectors want %d but %d\n",
+			vectors, ret);
+		return ret;
+	}
+
+	vp_vdpa->vectors = vectors;
+
+	for (i = 0; i < queues; i++) {
+		/*
+		 * No trailing newline: the name is used as the interrupt's
+		 * label (e.g. in /proc/interrupts).
+		 */
+		snprintf(vp_vdpa->vring[i].msix_name, VP_VDPA_NAME_SIZE,
+			 "vp-vdpa[%s]-%d", pci_name(pdev), i);
+		irq = pci_irq_vector(pdev, i);
+		ret = devm_request_irq(&pdev->dev, irq,
+				       vp_vdpa_vq_handler,
+				       0, vp_vdpa->vring[i].msix_name,
+				       &vp_vdpa->vring[i]);
+		if (ret) {
+			dev_err(&pdev->dev,
+				"vp_vdpa: fail to request irq for vq %d\n", i);
+			goto err;
+		}
+		vp_modern_queue_vector(mdev, i, i);
+		/* Recorded only after success so the error path skips it. */
+		vp_vdpa->vring[i].irq = irq;
+	}
+
+	snprintf(vp_vdpa->msix_name, VP_VDPA_NAME_SIZE, "vp-vdpa[%s]-config",
+		 pci_name(pdev));
+	irq = pci_irq_vector(pdev, queues);
+	ret = devm_request_irq(&pdev->dev, irq, vp_vdpa_config_handler, 0,
+			       vp_vdpa->msix_name, vp_vdpa);
+	if (ret) {
+		dev_err(&pdev->dev,
+			"vp_vdpa: fail to request irq for config: %d\n", ret);
+		goto err;
+	}
+	vp_modern_config_vector(mdev, queues);
+	vp_vdpa->config_irq = irq;
+
+	return 0;
+err:
+	vp_vdpa_free_irq(vp_vdpa);
+	return ret;
+}
+
+/*
+ * Write the virtio status byte to the device.
+ *
+ * On the transition to DRIVER_OK the interrupts are requested first. If
+ * that fails the status is not written, so the device is never started
+ * without working interrupts.
+ */
+static void vp_vdpa_set_status(struct vdpa_device *vdpa, u8 status)
+{
+	struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa);
+	struct virtio_pci_modern_device *mdev = vp_vdpa_to_mdev(vp_vdpa);
+	u8 s = vp_vdpa_get_status(vdpa);
+
+	if (status & VIRTIO_CONFIG_S_DRIVER_OK &&
+	    !(s & VIRTIO_CONFIG_S_DRIVER_OK)) {
+		int ret = vp_vdpa_request_irq(vp_vdpa);
+
+		if (ret) {
+			dev_err(&mdev->pci_dev->dev,
+				"vp_vdpa: fail to set up irqs (%d), status not updated\n",
+				ret);
+			return;
+		}
+	}
+
+	vp_modern_set_status(mdev, status);
+}
+
+/*
+ * Reset the device by writing status 0. The interrupts are released only
+ * if the device was in DRIVER_OK before the reset. Always returns 0.
+ */
+static int vp_vdpa_reset(struct vdpa_device *vdpa)
+{
+ struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa);
+ struct virtio_pci_modern_device *mdev = vp_vdpa_to_mdev(vp_vdpa);
+ /* Sample the status before it is cleared below. */
+ u8 s = vp_vdpa_get_status(vdpa);
+
+ vp_modern_set_status(mdev, 0);
+
+ if (s & VIRTIO_CONFIG_S_DRIVER_OK)
+ vp_vdpa_free_irq(vp_vdpa);
+
+ return 0;
+}
+
+/* Report the fixed maximum virtqueue size, VP_VDPA_QUEUE_MAX. */
+static u16 vp_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
+{
+ return VP_VDPA_QUEUE_MAX;
+}
+
+/* Reading back virtqueue state is not supported; always -EOPNOTSUPP. */
+static int vp_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 qid,
+ struct vdpa_vq_state *state)
+{
+ /* Note that this is not supported by virtio specification, so
+ * we return -EOPNOTSUPP here. This means we can't support live
+ * migration, vhost device start/stop.
+ */
+ return -EOPNOTSUPP;
+}
+
+/*
+ * Accept a split-ring state only when its avail index is 0; anything else
+ * is rejected with -EOPNOTSUPP.
+ */
+static int vp_vdpa_set_vq_state_split(struct vdpa_device *vdpa,
+				      const struct vdpa_vq_state *state)
+{
+	if (state->split.avail_index != 0)
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+/*
+ * Accept a packed-ring state only when both wrap counters are 1 and both
+ * indices are 0; anything else yields -EOPNOTSUPP.
+ */
+static int vp_vdpa_set_vq_state_packed(struct vdpa_device *vdpa,
+				       const struct vdpa_vq_state *state)
+{
+	const struct vdpa_vq_state_packed *ring = &state->packed;
+
+	if (ring->last_avail_counter != 1 || ring->last_avail_idx != 0)
+		return -EOPNOTSUPP;
+
+	if (ring->last_used_counter != 1 || ring->last_used_idx != 0)
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+/*
+ * Set the state of virtqueue @qid.
+ *
+ * The request is only considered when FEATURES_OK is set in the device
+ * status and the queue is not enabled; it is then checked by the packed
+ * or split helper depending on whether VIRTIO_F_RING_PACKED is among the
+ * driver features. In every other situation -EOPNOTSUPP is returned.
+ */
+static int vp_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 qid,
+				const struct vdpa_vq_state *state)
+{
+	struct virtio_pci_modern_device *modern = vdpa_to_mdev(vdpa);
+
+	/* The virtio specification does not support setting the state,
+	 * but a state that happens to match the device's initial state
+	 * can be let through.
+	 */
+	if (!(vp_modern_get_status(modern) & VIRTIO_CONFIG_S_FEATURES_OK))
+		return -EOPNOTSUPP;
+
+	if (vp_modern_get_queue_enable(modern, qid))
+		return -EOPNOTSUPP;
+
+	if (vp_modern_get_driver_features(modern) &
+	    BIT_ULL(VIRTIO_F_RING_PACKED))
+		return vp_vdpa_set_vq_state_packed(vdpa, state);
+
+	return vp_vdpa_set_vq_state_split(vdpa, state);
+}
+
+/* Store a copy of @cb in the vring slot of virtqueue @qid. */
+static void vp_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 qid,
+			      struct vdpa_callback *cb)
+{
+	struct vp_vdpa *vp = vdpa_to_vp(vdpa);
+	struct vdpa_callback *slot = &vp->vring[qid].cb;
+
+	*slot = *cb;
+}
+
+/* Forward the @ready flag for virtqueue @qid to the queue-enable setter. */
+static void vp_vdpa_set_vq_ready(struct vdpa_device *vdpa,
+				 u16 qid, bool ready)
+{
+	vp_modern_set_queue_enable(vdpa_to_mdev(vdpa), qid, ready);
+}
+
+/* Return the queue-enable value read for virtqueue @qid. */
+static bool vp_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 qid)
+{
+	return vp_modern_get_queue_enable(vdpa_to_mdev(vdpa), qid);
+}
+
+/* Program @num as the queue size of virtqueue @qid. */
+static void vp_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 qid,
+			       u32 num)
+{
+	vp_modern_set_queue_size(vdpa_to_mdev(vdpa), qid, num);
+}
+
+/*
+ * Hand the descriptor, driver and device area addresses of virtqueue
+ * @qid to vp_modern_queue_address(). Always returns 0.
+ */
+static int vp_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 qid,
+				  u64 desc_area, u64 driver_area,
+				  u64 device_area)
+{
+	struct virtio_pci_modern_device *modern = vdpa_to_mdev(vdpa);
+
+	vp_modern_queue_address(modern, qid,
+				desc_area, driver_area, device_area);
+	return 0;
+}
+
+/* Kick virtqueue @qid by writing its index to the queue's notify address. */
+static void vp_vdpa_kick_vq(struct vdpa_device *vdpa, u16 qid)
+{
+	struct vp_vdpa *vp = vdpa_to_vp(vdpa);
+
+	vp_iowrite16(qid, vp->vring[qid].notify);
+}
+
+/* Return the value reported by vp_modern_generation() for this device. */
+static u32 vp_vdpa_get_generation(struct vdpa_device *vdpa)
+{
+	return vp_modern_generation(vdpa_to_mdev(vdpa));
+}
+
+/* Return the device id recorded in the modern device's id field. */
+static u32 vp_vdpa_get_device_id(struct vdpa_device *vdpa)
+{
+	struct virtio_pci_modern_device *modern = vdpa_to_mdev(vdpa);
+	u32 device_id = modern->id.device;
+
+	return device_id;
+}
+
+/* Return the vendor id recorded in the modern device's id field. */
+static u32 vp_vdpa_get_vendor_id(struct vdpa_device *vdpa)
+{
+	struct virtio_pci_modern_device *modern = vdpa_to_mdev(vdpa);
+	u32 vendor_id = modern->id.vendor;
+
+	return vendor_id;
+}
+
+/* Report PAGE_SIZE as the virtqueue alignment. */
+static u32 vp_vdpa_get_vq_align(struct vdpa_device *vdpa)
+{
+	const u32 align = PAGE_SIZE;
+
+	return align;
+}
+
+/* Report the modern device's device_len as the config space size. */
+static size_t vp_vdpa_get_config_size(struct vdpa_device *vdpa)
+{
+	return vdpa_to_mdev(vdpa)->device_len;
+}
+
+/*
+ * Copy @len bytes of device config space, starting at @offset, into @buf.
+ *
+ * The bytes are read one at a time. The config generation is sampled
+ * before and after the copy, and the whole copy is repeated until both
+ * samples agree.
+ */
+static void vp_vdpa_get_config(struct vdpa_device *vdpa,
+			       unsigned int offset,
+			       void *buf, unsigned int len)
+{
+	struct vp_vdpa *vp = vdpa_to_vp(vdpa);
+	struct virtio_pci_modern_device *modern = vp_vdpa_to_mdev(vp);
+	u8 gen_before, gen_after;
+	u8 *dst;
+	int idx;
+
+	do {
+		gen_before = vp_ioread8(&modern->common->config_generation);
+
+		/* Restart from the beginning of @buf on every attempt. */
+		dst = buf;
+		for (idx = 0; idx < len; idx++)
+			dst[idx] = vp_ioread8(modern->device + offset + idx);
+
+		gen_after = vp_ioread8(&modern->common->config_generation);
+	} while (gen_before != gen_after);
+}
+
+/*
+ * Write @len bytes from @buf into the device config space starting at
+ * @offset, one byte at a time.
+ */
+static void vp_vdpa_set_config(struct vdpa_device *vdpa,
+			       unsigned int offset, const void *buf,
+			       unsigned int len)
+{
+	struct vp_vdpa *vp = vdpa_to_vp(vdpa);
+	struct virtio_pci_modern_device *modern = vp_vdpa_to_mdev(vp);
+	const u8 *src = buf;
+	int idx;
+
+	for (idx = 0; idx < len; idx++)
+		vp_iowrite8(src[idx], modern->device + offset + idx);
+}
+
+/* Store a copy of @cb as the device's config callback. */
+static void vp_vdpa_set_config_cb(struct vdpa_device *vdpa,
+				  struct vdpa_callback *cb)
+{
+	struct vp_vdpa *vp = vdpa_to_vp(vdpa);
+	struct vdpa_callback *slot = &vp->config_cb;
+
+	*slot = *cb;
+}
+
+/*
+ * Describe the notification area of virtqueue @qid: the address is the
+ * notify_pa stored for that vring, the size is the device's
+ * notify_offset_multiplier.
+ */
+static struct vdpa_notification_area
+vp_vdpa_get_vq_notification(struct vdpa_device *vdpa, u16 qid)
+{
+	struct vp_vdpa *vp = vdpa_to_vp(vdpa);
+	struct virtio_pci_modern_device *modern = vp_vdpa_to_mdev(vp);
+	struct vdpa_notification_area area;
+
+	area.size = modern->notify_offset_multiplier;
+	area.addr = vp->vring[qid].notify_pa;
+
+	return area;
+}
+
+/* vDPA config operations implemented by this driver. */
+static const struct vdpa_config_ops vp_vdpa_ops = {
+	/* Identification */
+	.get_device_id		= vp_vdpa_get_device_id,
+	.get_vendor_id		= vp_vdpa_get_vendor_id,
+
+	/* Features and status */
+	.get_device_features	= vp_vdpa_get_device_features,
+	.get_driver_features	= vp_vdpa_get_driver_features,
+	.set_driver_features	= vp_vdpa_set_driver_features,
+	.get_status		= vp_vdpa_get_status,
+	.set_status		= vp_vdpa_set_status,
+	.reset			= vp_vdpa_reset,
+
+	/* Virtqueues */
+	.get_vq_num_max		= vp_vdpa_get_vq_num_max,
+	.get_vq_align		= vp_vdpa_get_vq_align,
+	.set_vq_num		= vp_vdpa_set_vq_num,
+	.set_vq_address		= vp_vdpa_set_vq_address,
+	.get_vq_state		= vp_vdpa_get_vq_state,
+	.set_vq_state		= vp_vdpa_set_vq_state,
+	.get_vq_ready		= vp_vdpa_get_vq_ready,
+	.set_vq_ready		= vp_vdpa_set_vq_ready,
+	.set_vq_cb		= vp_vdpa_set_vq_cb,
+	.kick_vq		= vp_vdpa_kick_vq,
+	.get_vq_notification	= vp_vdpa_get_vq_notification,
+	.get_vq_irq		= vp_vdpa_get_vq_irq,
+
+	/* Config space */
+	.get_generation		= vp_vdpa_get_generation,
+	.get_config_size	= vp_vdpa_get_config_size,
+	.get_config		= vp_vdpa_get_config,
+	.set_config		= vp_vdpa_set_config,
+	.set_config_cb		= vp_vdpa_set_config_cb,
+};
+
+/*
+ * Release action taking an opaque pointer: @data is passed straight to
+ * pci_free_irq_vectors().
+ */
+static void vp_vdpa_free_irq_vectors(void *data)
+{
+	struct pci_dev *pdev = data;
+
+	pci_free_irq_vectors(pdev);
+}
+
+/*
+ * Management-device callback that creates and registers the vDPA device.
+ *
+ * @v_mdev:     management device embedded in a struct vp_vdpa_mgmtdev
+ * @name:       name handed to vdpa_alloc_device()
+ * @add_config: provisioning attributes; only VDPA_ATTR_DEV_FEATURES is
+ *              examined here
+ *
+ * If features are provisioned they must be a subset of what the device
+ * offers, otherwise -EINVAL is returned. The function then registers the
+ * irq-vector release action, allocates the vring array, maps every queue's
+ * notify area and finally registers the device on the vdpa bus.
+ *
+ * Returns 0 on success or a negative errno. On failure the reference on
+ * the new device is dropped and the management device's vp_vdpa pointer
+ * is cleared so it does not keep referring to that device.
+ */
+static int vp_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
+			   const struct vdpa_dev_set_config *add_config)
+{
+	struct vp_vdpa_mgmtdev *vp_vdpa_mgtdev =
+		container_of(v_mdev, struct vp_vdpa_mgmtdev, mgtdev);
+
+	struct virtio_pci_modern_device *mdev = vp_vdpa_mgtdev->mdev;
+	struct pci_dev *pdev = mdev->pci_dev;
+	struct device *dev = &pdev->dev;
+	struct vp_vdpa *vp_vdpa = NULL;
+	u64 device_features;
+	int ret, i;
+
+	vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa,
+				    dev, &vp_vdpa_ops, 1, 1, name, false);
+
+	if (IS_ERR(vp_vdpa)) {
+		dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n");
+		return PTR_ERR(vp_vdpa);
+	}
+
+	vp_vdpa_mgtdev->vp_vdpa = vp_vdpa;
+
+	vp_vdpa->vdpa.dma_dev = &pdev->dev;
+	vp_vdpa->queues = vp_modern_get_num_queues(mdev);
+	vp_vdpa->mdev = mdev;
+
+	/* Provisioned features may only narrow what the device offers. */
+	device_features = vp_modern_get_features(mdev);
+	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
+		if (add_config->device_features & ~device_features) {
+			ret = -EINVAL;
+			dev_err(&pdev->dev, "Try to provision features "
+				"that are not supported by the device: "
+				"device_features 0x%llx provisioned 0x%llx\n",
+				device_features, add_config->device_features);
+			goto err;
+		}
+		device_features = add_config->device_features;
+	}
+	vp_vdpa->device_features = device_features;
+
+	ret = devm_add_action_or_reset(dev, vp_vdpa_free_irq_vectors, pdev);
+	if (ret) {
+		dev_err(&pdev->dev,
+			"Failed for adding devres for freeing irq vectors\n");
+		goto err;
+	}
+
+	vp_vdpa->vring = devm_kcalloc(&pdev->dev, vp_vdpa->queues,
+				      sizeof(*vp_vdpa->vring),
+				      GFP_KERNEL);
+	if (!vp_vdpa->vring) {
+		ret = -ENOMEM;
+		dev_err(&pdev->dev, "Fail to allocate virtqueues\n");
+		goto err;
+	}
+
+	/* No interrupt is assigned yet; only the notify areas are mapped. */
+	for (i = 0; i < vp_vdpa->queues; i++) {
+		vp_vdpa->vring[i].irq = VIRTIO_MSI_NO_VECTOR;
+		vp_vdpa->vring[i].notify =
+			vp_modern_map_vq_notify(mdev, i,
+						&vp_vdpa->vring[i].notify_pa);
+		if (!vp_vdpa->vring[i].notify) {
+			ret = -EINVAL;
+			dev_warn(&pdev->dev, "Fail to map vq notify %d\n", i);
+			goto err;
+		}
+	}
+	vp_vdpa->config_irq = VIRTIO_MSI_NO_VECTOR;
+
+	vp_vdpa->vdpa.mdev = &vp_vdpa_mgtdev->mgtdev;
+	ret = _vdpa_register_device(&vp_vdpa->vdpa, vp_vdpa->queues);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to register to vdpa bus\n");
+		goto err;
+	}
+
+	return 0;
+
+err:
+	/* The reference is dropped below; do not keep a stale pointer. */
+	vp_vdpa_mgtdev->vp_vdpa = NULL;
+	put_device(&vp_vdpa->vdpa.dev);
+	return ret;
+}
+
+/*
+ * Management-device callback that removes the vDPA device.
+ *
+ * The device to unregister is taken from the management device's vp_vdpa
+ * pointer (the @dev argument is not consulted); the pointer is cleared
+ * afterwards.
+ */
+static void vp_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev,
+			    struct vdpa_device *dev)
+{
+	struct vp_vdpa_mgmtdev *mgmt =
+		container_of(v_mdev, struct vp_vdpa_mgmtdev, mgtdev);
+
+	_vdpa_unregister_device(&mgmt->vp_vdpa->vdpa);
+	mgmt->vp_vdpa = NULL;
+}
+
+/* Management-device operations: device creation and removal. */
+static const struct vdpa_mgmtdev_ops vp_vdpa_mdev_ops = {
+	.dev_del	= vp_vdpa_dev_del,
+	.dev_add	= vp_vdpa_dev_add,
+};
+
+/*
+ * PCI probe: set up and register the vDPA management device for @pdev.
+ *
+ * Allocates the management wrapper, the modern-device state and the id
+ * table, enables the PCI device, probes it as a modern virtio device and
+ * registers the management device. The wrapper is stored as the PCI
+ * driver data.
+ *
+ * The id table is allocated with two entries: the first describes this
+ * device and the second stays zeroed as the end-of-table marker, since
+ * an id table is an array and needs a terminating entry.
+ *
+ * Returns 0 on success or a negative errno; every allocation made here is
+ * freed on failure.
+ */
+static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct vp_vdpa_mgmtdev *vp_vdpa_mgtdev = NULL;
+	struct vdpa_mgmt_dev *mgtdev;
+	struct device *dev = &pdev->dev;
+	struct virtio_pci_modern_device *mdev = NULL;
+	struct virtio_device_id *mdev_id = NULL;
+	int err;
+
+	vp_vdpa_mgtdev = kzalloc(sizeof(*vp_vdpa_mgtdev), GFP_KERNEL);
+	if (!vp_vdpa_mgtdev)
+		return -ENOMEM;
+
+	mgtdev = &vp_vdpa_mgtdev->mgtdev;
+	mgtdev->ops = &vp_vdpa_mdev_ops;
+	mgtdev->device = dev;
+
+	mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
+	if (!mdev) {
+		err = -ENOMEM;
+		goto mdev_err;
+	}
+
+	/* One real entry plus a zeroed terminator. */
+	mdev_id = kcalloc(2, sizeof(*mdev_id), GFP_KERNEL);
+	if (!mdev_id) {
+		err = -ENOMEM;
+		goto mdev_id_err;
+	}
+
+	vp_vdpa_mgtdev->mdev = mdev;
+	mdev->pci_dev = pdev;
+
+	err = pcim_enable_device(pdev);
+	if (err)
+		goto probe_err;
+
+	err = vp_modern_probe(mdev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to probe modern PCI device\n");
+		goto probe_err;
+	}
+
+	mdev_id[0].device = mdev->id.device;
+	mdev_id[0].vendor = mdev->id.vendor;
+	mgtdev->id_table = mdev_id;
+	mgtdev->max_supported_vqs = vp_modern_get_num_queues(mdev);
+	mgtdev->supported_features = vp_modern_get_features(mdev);
+	mgtdev->config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_FEATURES);
+	pci_set_master(pdev);
+	pci_set_drvdata(pdev, vp_vdpa_mgtdev);
+
+	err = vdpa_mgmtdev_register(mgtdev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to register vdpa mgmtdev device\n");
+		goto register_err;
+	}
+
+	return 0;
+
+register_err:
+	vp_modern_remove(vp_vdpa_mgtdev->mdev);
+probe_err:
+	kfree(mdev_id);
+mdev_id_err:
+	kfree(mdev);
+mdev_err:
+	kfree(vp_vdpa_mgtdev);
+	return err;
+}
+
+/*
+ * PCI remove: unregister the management device, tear down the modern
+ * device and free the id table, the modern-device state and the
+ * management wrapper found in the PCI driver data.
+ */
+static void vp_vdpa_remove(struct pci_dev *pdev)
+{
+	struct vp_vdpa_mgmtdev *mgmt = pci_get_drvdata(pdev);
+	struct virtio_pci_modern_device *modern = mgmt->mdev;
+
+	vdpa_mgmtdev_unregister(&mgmt->mgtdev);
+	vp_modern_remove(modern);
+
+	kfree(mgmt->mgtdev.id_table);
+	kfree(modern);
+	kfree(mgmt);
+}
+
+/* PCI driver description; no static id table, only dynamic ids. */
+static struct pci_driver vp_vdpa_driver = {
+	.name		= "vp-vdpa",
+	.probe		= vp_vdpa_probe,
+	.remove		= vp_vdpa_remove,
+	.id_table	= NULL, /* only dynamic ids */
+};
+
+/* Module registration for vp_vdpa_driver, followed by module metadata. */
+module_pci_driver(vp_vdpa_driver);
+
+MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>");
+MODULE_DESCRIPTION("vp-vdpa");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1");