diff options
Diffstat (limited to 'drivers/vdpa')
41 files changed, 17467 insertions, 0 deletions
diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig new file mode 100644 index 0000000000..656c1cb541 --- /dev/null +++ b/drivers/vdpa/Kconfig @@ -0,0 +1,129 @@ +# SPDX-License-Identifier: GPL-2.0-only +menuconfig VDPA + tristate "vDPA drivers" + depends on NET + help + Enable this module to support vDPA device that uses a + datapath which complies with virtio specifications with + vendor specific control path. + +if VDPA + +config VDPA_SIM + tristate "vDPA device simulator core" + depends on RUNTIME_TESTING_MENU && HAS_DMA + select DMA_OPS + select VHOST_RING + select IOMMU_IOVA + help + Enable this module to support vDPA device simulators. These devices + are used for testing, prototyping and development of vDPA. + +config VDPA_SIM_NET + tristate "vDPA simulator for networking device" + depends on VDPA_SIM + select GENERIC_NET_UTILS + help + vDPA networking device simulator which loops TX traffic back to RX. + +config VDPA_SIM_BLOCK + tristate "vDPA simulator for block device" + depends on VDPA_SIM + help + vDPA block device simulator which terminates IO request in a + memory buffer. + +config VDPA_USER + tristate "VDUSE (vDPA Device in Userspace) support" + depends on EVENTFD && MMU && HAS_DMA + select DMA_OPS + select VHOST_IOTLB + select IOMMU_IOVA + help + With VDUSE it is possible to emulate a vDPA Device + in a userspace program. + +config IFCVF + tristate "Intel IFC VF vDPA driver" + depends on PCI_MSI + help + This kernel module can drive Intel IFC VF NIC to offload + virtio dataplane traffic to hardware. + To compile this driver as a module, choose M here: the module will + be called ifcvf. + +config MLX5_VDPA + bool + select VHOST_IOTLB + help + Support library for Mellanox VDPA drivers. Provides code that is + common for all types of VDPA drivers. The following drivers are planned: + net, block. + +config MLX5_VDPA_NET + tristate "vDPA driver for ConnectX devices" + select MLX5_VDPA + select VHOST_RING + depends on MLX5_CORE + help + VDPA network driver for ConnectX6 and newer. Provides offloading + of virtio net datapath such that descriptors put on the ring will + be executed by the hardware. It also supports a variety of stateless + offloads depending on the actual device used and firmware version. + +config MLX5_VDPA_STEERING_DEBUG + bool "expose steering counters on debugfs" + select MLX5_VDPA + help + Expose RX steering counters in debugfs to aid in debugging. For each VLAN + or non VLAN interface, two hardware counters are added to the RX flow + table: one for unicast and one for multicast. + The counters counts the number of packets and bytes and exposes them in + debugfs. Once can read the counters using, e.g.: + cat /sys/kernel/debug/mlx5/mlx5_core.sf.1/vdpa-0/rx/untagged/ucast/packets + cat /sys/kernel/debug/mlx5/mlx5_core.sf.1/vdpa-0/rx/untagged/mcast/bytes + +config VP_VDPA + tristate "Virtio PCI bridge vDPA driver" + select VIRTIO_PCI_LIB + depends on PCI_MSI + help + This kernel module bridges virtio PCI device to vDPA bus. + +config ALIBABA_ENI_VDPA + tristate "vDPA driver for Alibaba ENI" + select VIRTIO_PCI_LIB_LEGACY + depends on PCI_MSI && X86 + help + VDPA driver for Alibaba ENI (Elastic Network Interface) which is built upon + virtio 0.9.5 specification. + + config SNET_VDPA + tristate "SolidRun's vDPA driver for SolidNET" + depends on PCI_MSI && PCI_IOV && (HWMON || HWMON=n) + + # This driver MAY create a HWMON device. + # Depending on (HWMON || HWMON=n) ensures that: + # If HWMON=n the driver can be compiled either as a module or built-in. + # If HWMON=y the driver can be compiled either as a module or built-in. + # If HWMON=m the driver is forced to be compiled as a module. + # By doing so, IS_ENABLED can be used instead of IS_REACHABLE + + help + vDPA driver for SolidNET DPU. + With this driver, the VirtIO dataplane can be + offloaded to a SolidNET DPU. + This driver includes a HW monitor device that + reads health values from the DPU. + +config PDS_VDPA + tristate "vDPA driver for AMD/Pensando DSC devices" + select VIRTIO_PCI_LIB + depends on PCI_MSI + depends on PDS_CORE + help + vDPA network driver for AMD/Pensando's PDS Core devices. + With this driver, the VirtIO dataplane can be + offloaded to an AMD/Pensando DSC device. + +endif # VDPA diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile new file mode 100644 index 0000000000..8f53c6f3cc --- /dev/null +++ b/drivers/vdpa/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_VDPA) += vdpa.o +obj-$(CONFIG_VDPA_SIM) += vdpa_sim/ +obj-$(CONFIG_VDPA_USER) += vdpa_user/ +obj-$(CONFIG_IFCVF) += ifcvf/ +obj-$(CONFIG_MLX5_VDPA) += mlx5/ +obj-$(CONFIG_VP_VDPA) += virtio_pci/ +obj-$(CONFIG_ALIBABA_ENI_VDPA) += alibaba/ +obj-$(CONFIG_SNET_VDPA) += solidrun/ +obj-$(CONFIG_PDS_VDPA) += pds/ diff --git a/drivers/vdpa/alibaba/Makefile b/drivers/vdpa/alibaba/Makefile new file mode 100644 index 0000000000..ef4aae69f8 --- /dev/null +++ b/drivers/vdpa/alibaba/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_ALIBABA_ENI_VDPA) += eni_vdpa.o + diff --git a/drivers/vdpa/alibaba/eni_vdpa.c b/drivers/vdpa/alibaba/eni_vdpa.c new file mode 100644 index 0000000000..cce3d18371 --- /dev/null +++ b/drivers/vdpa/alibaba/eni_vdpa.c @@ -0,0 +1,551 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vDPA bridge driver for Alibaba ENI(Elastic Network Interface) + * + * Copyright (c) 2021, Alibaba Inc. All rights reserved. + * Author: Wu Zongyong <wuzongyong@linux.alibaba.com> + * + */ + +#include "linux/bits.h" +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/vdpa.h> +#include <linux/virtio.h> +#include <linux/virtio_config.h> +#include <linux/virtio_ring.h> +#include <linux/virtio_pci.h> +#include <linux/virtio_pci_legacy.h> +#include <uapi/linux/virtio_net.h> + +#define ENI_MSIX_NAME_SIZE 256 + +#define ENI_ERR(pdev, fmt, ...) \ + dev_err(&pdev->dev, "%s"fmt, "eni_vdpa: ", ##__VA_ARGS__) +#define ENI_DBG(pdev, fmt, ...) \ + dev_dbg(&pdev->dev, "%s"fmt, "eni_vdpa: ", ##__VA_ARGS__) +#define ENI_INFO(pdev, fmt, ...) \ + dev_info(&pdev->dev, "%s"fmt, "eni_vdpa: ", ##__VA_ARGS__) + +struct eni_vring { + void __iomem *notify; + char msix_name[ENI_MSIX_NAME_SIZE]; + struct vdpa_callback cb; + int irq; +}; + +struct eni_vdpa { + struct vdpa_device vdpa; + struct virtio_pci_legacy_device ldev; + struct eni_vring *vring; + struct vdpa_callback config_cb; + char msix_name[ENI_MSIX_NAME_SIZE]; + int config_irq; + int queues; + int vectors; +}; + +static struct eni_vdpa *vdpa_to_eni(struct vdpa_device *vdpa) +{ + return container_of(vdpa, struct eni_vdpa, vdpa); +} + +static struct virtio_pci_legacy_device *vdpa_to_ldev(struct vdpa_device *vdpa) +{ + struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa); + + return &eni_vdpa->ldev; +} + +static u64 eni_vdpa_get_device_features(struct vdpa_device *vdpa) +{ + struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa); + u64 features = vp_legacy_get_features(ldev); + + features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM); + features |= BIT_ULL(VIRTIO_F_ORDER_PLATFORM); + + return features; +} + +static int eni_vdpa_set_driver_features(struct vdpa_device *vdpa, u64 features) +{ + struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa); + + if (!(features & BIT_ULL(VIRTIO_NET_F_MRG_RXBUF)) && features) { + ENI_ERR(ldev->pci_dev, + "VIRTIO_NET_F_MRG_RXBUF is not negotiated\n"); + return -EINVAL; + } + + vp_legacy_set_features(ldev, (u32)features); + + return 0; +} + +static u64 eni_vdpa_get_driver_features(struct vdpa_device *vdpa) +{ + struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa); + + return vp_legacy_get_driver_features(ldev); +} + +static u8 eni_vdpa_get_status(struct vdpa_device *vdpa) +{ + struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa); + + return vp_legacy_get_status(ldev); +} + +static int eni_vdpa_get_vq_irq(struct vdpa_device *vdpa, u16 idx) +{ + struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa); + int irq = eni_vdpa->vring[idx].irq; + + if (irq == VIRTIO_MSI_NO_VECTOR) + return -EINVAL; + + return irq; +} + +static void eni_vdpa_free_irq(struct eni_vdpa *eni_vdpa) +{ + struct virtio_pci_legacy_device *ldev = &eni_vdpa->ldev; + struct pci_dev *pdev = ldev->pci_dev; + int i; + + for (i = 0; i < eni_vdpa->queues; i++) { + if (eni_vdpa->vring[i].irq != VIRTIO_MSI_NO_VECTOR) { + vp_legacy_queue_vector(ldev, i, VIRTIO_MSI_NO_VECTOR); + devm_free_irq(&pdev->dev, eni_vdpa->vring[i].irq, + &eni_vdpa->vring[i]); + eni_vdpa->vring[i].irq = VIRTIO_MSI_NO_VECTOR; + } + } + + if (eni_vdpa->config_irq != VIRTIO_MSI_NO_VECTOR) { + vp_legacy_config_vector(ldev, VIRTIO_MSI_NO_VECTOR); + devm_free_irq(&pdev->dev, eni_vdpa->config_irq, eni_vdpa); + eni_vdpa->config_irq = VIRTIO_MSI_NO_VECTOR; + } + + if (eni_vdpa->vectors) { + pci_free_irq_vectors(pdev); + eni_vdpa->vectors = 0; + } +} + +static irqreturn_t eni_vdpa_vq_handler(int irq, void *arg) +{ + struct eni_vring *vring = arg; + + if (vring->cb.callback) + return vring->cb.callback(vring->cb.private); + + return IRQ_HANDLED; +} + +static irqreturn_t eni_vdpa_config_handler(int irq, void *arg) +{ + struct eni_vdpa *eni_vdpa = arg; + + if (eni_vdpa->config_cb.callback) + return eni_vdpa->config_cb.callback(eni_vdpa->config_cb.private); + + return IRQ_HANDLED; +} + +static int eni_vdpa_request_irq(struct eni_vdpa *eni_vdpa) +{ + struct virtio_pci_legacy_device *ldev = &eni_vdpa->ldev; + struct pci_dev *pdev = ldev->pci_dev; + int i, ret, irq; + int queues = eni_vdpa->queues; + int vectors = queues + 1; + + ret = pci_alloc_irq_vectors(pdev, vectors, vectors, PCI_IRQ_MSIX); + if (ret != vectors) { + ENI_ERR(pdev, + "failed to allocate irq vectors want %d but %d\n", + vectors, ret); + return ret; + } + + eni_vdpa->vectors = vectors; + + for (i = 0; i < queues; i++) { + snprintf(eni_vdpa->vring[i].msix_name, ENI_MSIX_NAME_SIZE, + "eni-vdpa[%s]-%d\n", pci_name(pdev), i); + irq = pci_irq_vector(pdev, i); + ret = devm_request_irq(&pdev->dev, irq, + eni_vdpa_vq_handler, + 0, eni_vdpa->vring[i].msix_name, + &eni_vdpa->vring[i]); + if (ret) { + ENI_ERR(pdev, "failed to request irq for vq %d\n", i); + goto err; + } + vp_legacy_queue_vector(ldev, i, i); + eni_vdpa->vring[i].irq = irq; + } + + snprintf(eni_vdpa->msix_name, ENI_MSIX_NAME_SIZE, "eni-vdpa[%s]-config\n", + pci_name(pdev)); + irq = pci_irq_vector(pdev, queues); + ret = devm_request_irq(&pdev->dev, irq, eni_vdpa_config_handler, 0, + eni_vdpa->msix_name, eni_vdpa); + if (ret) { + ENI_ERR(pdev, "failed to request irq for config vq %d\n", i); + goto err; + } + vp_legacy_config_vector(ldev, queues); + eni_vdpa->config_irq = irq; + + return 0; +err: + eni_vdpa_free_irq(eni_vdpa); + return ret; +} + +static void eni_vdpa_set_status(struct vdpa_device *vdpa, u8 status) +{ + struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa); + struct virtio_pci_legacy_device *ldev = &eni_vdpa->ldev; + u8 s = eni_vdpa_get_status(vdpa); + + if (status & VIRTIO_CONFIG_S_DRIVER_OK && + !(s & VIRTIO_CONFIG_S_DRIVER_OK)) { + eni_vdpa_request_irq(eni_vdpa); + } + + vp_legacy_set_status(ldev, status); + + if (!(status & VIRTIO_CONFIG_S_DRIVER_OK) && + (s & VIRTIO_CONFIG_S_DRIVER_OK)) + eni_vdpa_free_irq(eni_vdpa); +} + +static int eni_vdpa_reset(struct vdpa_device *vdpa) +{ + struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa); + struct virtio_pci_legacy_device *ldev = &eni_vdpa->ldev; + u8 s = eni_vdpa_get_status(vdpa); + + vp_legacy_set_status(ldev, 0); + + if (s & VIRTIO_CONFIG_S_DRIVER_OK) + eni_vdpa_free_irq(eni_vdpa); + + return 0; +} + +static u16 eni_vdpa_get_vq_num_max(struct vdpa_device *vdpa) +{ + struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa); + + return vp_legacy_get_queue_size(ldev, 0); +} + +static u16 eni_vdpa_get_vq_num_min(struct vdpa_device *vdpa) +{ + struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa); + + return vp_legacy_get_queue_size(ldev, 0); +} + +static int eni_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 qid, + struct vdpa_vq_state *state) +{ + return -EOPNOTSUPP; +} + +static int eni_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 qid, + const struct vdpa_vq_state *state) +{ + struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa); + const struct vdpa_vq_state_split *split = &state->split; + + /* ENI is build upon virtio-pci specfication which not support + * to set state of virtqueue. But if the state is equal to the + * device initial state by chance, we can let it go. + */ + if (!vp_legacy_get_queue_enable(ldev, qid) + && split->avail_index == 0) + return 0; + + return -EOPNOTSUPP; +} + + +static void eni_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 qid, + struct vdpa_callback *cb) +{ + struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa); + + eni_vdpa->vring[qid].cb = *cb; +} + +static void eni_vdpa_set_vq_ready(struct vdpa_device *vdpa, u16 qid, + bool ready) +{ + struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa); + + /* ENI is a legacy virtio-pci device. This is not supported + * by specification. But we can disable virtqueue by setting + * address to 0. + */ + if (!ready) + vp_legacy_set_queue_address(ldev, qid, 0); +} + +static bool eni_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 qid) +{ + struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa); + + return vp_legacy_get_queue_enable(ldev, qid); +} + +static void eni_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 qid, + u32 num) +{ + struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa); + struct pci_dev *pdev = ldev->pci_dev; + u16 n = vp_legacy_get_queue_size(ldev, qid); + + /* ENI is a legacy virtio-pci device which not allow to change + * virtqueue size. Just report a error if someone tries to + * change it. + */ + if (num != n) + ENI_ERR(pdev, + "not support to set vq %u fixed num %u to %u\n", + qid, n, num); +} + +static int eni_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 qid, + u64 desc_area, u64 driver_area, + u64 device_area) +{ + struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa); + u32 pfn = desc_area >> VIRTIO_PCI_QUEUE_ADDR_SHIFT; + + vp_legacy_set_queue_address(ldev, qid, pfn); + + return 0; +} + +static void eni_vdpa_kick_vq(struct vdpa_device *vdpa, u16 qid) +{ + struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa); + + iowrite16(qid, eni_vdpa->vring[qid].notify); +} + +static u32 eni_vdpa_get_device_id(struct vdpa_device *vdpa) +{ + struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa); + + return ldev->id.device; +} + +static u32 eni_vdpa_get_vendor_id(struct vdpa_device *vdpa) +{ + struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa); + + return ldev->id.vendor; +} + +static u32 eni_vdpa_get_vq_align(struct vdpa_device *vdpa) +{ + return VIRTIO_PCI_VRING_ALIGN; +} + +static size_t eni_vdpa_get_config_size(struct vdpa_device *vdpa) +{ + return sizeof(struct virtio_net_config); +} + + +static void eni_vdpa_get_config(struct vdpa_device *vdpa, + unsigned int offset, + void *buf, unsigned int len) +{ + struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa); + struct virtio_pci_legacy_device *ldev = &eni_vdpa->ldev; + void __iomem *ioaddr = ldev->ioaddr + + VIRTIO_PCI_CONFIG_OFF(eni_vdpa->vectors) + + offset; + u8 *p = buf; + int i; + + for (i = 0; i < len; i++) + *p++ = ioread8(ioaddr + i); +} + +static void eni_vdpa_set_config(struct vdpa_device *vdpa, + unsigned int offset, const void *buf, + unsigned int len) +{ + struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa); + struct virtio_pci_legacy_device *ldev = &eni_vdpa->ldev; + void __iomem *ioaddr = ldev->ioaddr + + VIRTIO_PCI_CONFIG_OFF(eni_vdpa->vectors) + + offset; + const u8 *p = buf; + int i; + + for (i = 0; i < len; i++) + iowrite8(*p++, ioaddr + i); +} + +static void eni_vdpa_set_config_cb(struct vdpa_device *vdpa, + struct vdpa_callback *cb) +{ + struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa); + + eni_vdpa->config_cb = *cb; +} + +static const struct vdpa_config_ops eni_vdpa_ops = { + .get_device_features = eni_vdpa_get_device_features, + .set_driver_features = eni_vdpa_set_driver_features, + .get_driver_features = eni_vdpa_get_driver_features, + .get_status = eni_vdpa_get_status, + .set_status = eni_vdpa_set_status, + .reset = eni_vdpa_reset, + .get_vq_num_max = eni_vdpa_get_vq_num_max, + .get_vq_num_min = eni_vdpa_get_vq_num_min, + .get_vq_state = eni_vdpa_get_vq_state, + .set_vq_state = eni_vdpa_set_vq_state, + .set_vq_cb = eni_vdpa_set_vq_cb, + .set_vq_ready = eni_vdpa_set_vq_ready, + .get_vq_ready = eni_vdpa_get_vq_ready, + .set_vq_num = eni_vdpa_set_vq_num, + .set_vq_address = eni_vdpa_set_vq_address, + .kick_vq = eni_vdpa_kick_vq, + .get_device_id = eni_vdpa_get_device_id, + .get_vendor_id = eni_vdpa_get_vendor_id, + .get_vq_align = eni_vdpa_get_vq_align, + .get_config_size = eni_vdpa_get_config_size, + .get_config = eni_vdpa_get_config, + .set_config = eni_vdpa_set_config, + .set_config_cb = eni_vdpa_set_config_cb, + .get_vq_irq = eni_vdpa_get_vq_irq, +}; + + +static u16 eni_vdpa_get_num_queues(struct eni_vdpa *eni_vdpa) +{ + struct virtio_pci_legacy_device *ldev = &eni_vdpa->ldev; + u32 features = vp_legacy_get_features(ldev); + u16 num = 2; + + if (features & BIT_ULL(VIRTIO_NET_F_MQ)) { + __virtio16 max_virtqueue_pairs; + + eni_vdpa_get_config(&eni_vdpa->vdpa, + offsetof(struct virtio_net_config, max_virtqueue_pairs), + &max_virtqueue_pairs, + sizeof(max_virtqueue_pairs)); + num = 2 * __virtio16_to_cpu(virtio_legacy_is_little_endian(), + max_virtqueue_pairs); + } + + if (features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) + num += 1; + + return num; +} + +static int eni_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct device *dev = &pdev->dev; + struct eni_vdpa *eni_vdpa; + struct virtio_pci_legacy_device *ldev; + int ret, i; + + ret = pcim_enable_device(pdev); + if (ret) + return ret; + + eni_vdpa = vdpa_alloc_device(struct eni_vdpa, vdpa, + dev, &eni_vdpa_ops, 1, 1, NULL, false); + if (IS_ERR(eni_vdpa)) { + ENI_ERR(pdev, "failed to allocate vDPA structure\n"); + return PTR_ERR(eni_vdpa); + } + + ldev = &eni_vdpa->ldev; + ldev->pci_dev = pdev; + + ret = vp_legacy_probe(ldev); + if (ret) { + ENI_ERR(pdev, "failed to probe legacy PCI device\n"); + goto err; + } + + pci_set_master(pdev); + pci_set_drvdata(pdev, eni_vdpa); + + eni_vdpa->vdpa.dma_dev = &pdev->dev; + eni_vdpa->queues = eni_vdpa_get_num_queues(eni_vdpa); + + eni_vdpa->vring = devm_kcalloc(&pdev->dev, eni_vdpa->queues, + sizeof(*eni_vdpa->vring), + GFP_KERNEL); + if (!eni_vdpa->vring) { + ret = -ENOMEM; + ENI_ERR(pdev, "failed to allocate virtqueues\n"); + goto err_remove_vp_legacy; + } + + for (i = 0; i < eni_vdpa->queues; i++) { + eni_vdpa->vring[i].irq = VIRTIO_MSI_NO_VECTOR; + eni_vdpa->vring[i].notify = ldev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY; + } + eni_vdpa->config_irq = VIRTIO_MSI_NO_VECTOR; + + ret = vdpa_register_device(&eni_vdpa->vdpa, eni_vdpa->queues); + if (ret) { + ENI_ERR(pdev, "failed to register to vdpa bus\n"); + goto err_remove_vp_legacy; + } + + return 0; + +err_remove_vp_legacy: + vp_legacy_remove(&eni_vdpa->ldev); +err: + put_device(&eni_vdpa->vdpa.dev); + return ret; +} + +static void eni_vdpa_remove(struct pci_dev *pdev) +{ + struct eni_vdpa *eni_vdpa = pci_get_drvdata(pdev); + + vdpa_unregister_device(&eni_vdpa->vdpa); + vp_legacy_remove(&eni_vdpa->ldev); +} + +static struct pci_device_id eni_pci_ids[] = { + { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET, + VIRTIO_TRANS_ID_NET, + PCI_SUBVENDOR_ID_REDHAT_QUMRANET, + VIRTIO_ID_NET) }, + { 0 }, +}; + +static struct pci_driver eni_vdpa_driver = { + .name = "alibaba-eni-vdpa", + .id_table = eni_pci_ids, + .probe = eni_vdpa_probe, + .remove = eni_vdpa_remove, +}; + +module_pci_driver(eni_vdpa_driver); + +MODULE_AUTHOR("Wu Zongyong <wuzongyong@linux.alibaba.com>"); +MODULE_DESCRIPTION("Alibaba ENI vDPA driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/vdpa/ifcvf/Makefile b/drivers/vdpa/ifcvf/Makefile new file mode 100644 index 0000000000..d709915995 --- /dev/null +++ b/drivers/vdpa/ifcvf/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_IFCVF) += ifcvf.o +ifcvf-$(CONFIG_IFCVF) += ifcvf_main.o ifcvf_base.o diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c new file mode 100644 index 0000000000..060f837a4f --- /dev/null +++ b/drivers/vdpa/ifcvf/ifcvf_base.c @@ -0,0 +1,432 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Intel IFC VF NIC driver for virtio dataplane offloading + * + * Copyright (C) 2020 Intel Corporation. + * + * Author: Zhu Lingshan <lingshan.zhu@intel.com> + * + */ + +#include "ifcvf_base.h" + +u16 ifcvf_set_vq_vector(struct ifcvf_hw *hw, u16 qid, int vector) +{ + struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; + + vp_iowrite16(qid, &cfg->queue_select); + vp_iowrite16(vector, &cfg->queue_msix_vector); + + return vp_ioread16(&cfg->queue_msix_vector); +} + +u16 ifcvf_set_config_vector(struct ifcvf_hw *hw, int vector) +{ + struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; + + vp_iowrite16(vector, &cfg->msix_config); + + return vp_ioread16(&cfg->msix_config); +} + +static void __iomem *get_cap_addr(struct ifcvf_hw *hw, + struct virtio_pci_cap *cap) +{ + u32 length, offset; + u8 bar; + + length = le32_to_cpu(cap->length); + offset = le32_to_cpu(cap->offset); + bar = cap->bar; + + if (bar >= IFCVF_PCI_MAX_RESOURCE) { + IFCVF_DBG(hw->pdev, + "Invalid bar number %u to get capabilities\n", bar); + return NULL; + } + + if (offset + length > pci_resource_len(hw->pdev, bar)) { + IFCVF_DBG(hw->pdev, + "offset(%u) + len(%u) overflows bar%u's capability\n", + offset, length, bar); + return NULL; + } + + return hw->base[bar] + offset; +} + +static int ifcvf_read_config_range(struct pci_dev *dev, + uint32_t *val, int size, int where) +{ + int ret, i; + + for (i = 0; i < size; i += 4) { + ret = pci_read_config_dword(dev, where + i, val + i / 4); + if (ret < 0) + return ret; + } + + return 0; +} + +static u16 ifcvf_get_vq_size(struct ifcvf_hw *hw, u16 qid) +{ + u16 queue_size; + + vp_iowrite16(qid, &hw->common_cfg->queue_select); + queue_size = vp_ioread16(&hw->common_cfg->queue_size); + + return queue_size; +} + +/* This function returns the max allowed safe size for + * all virtqueues. It is the minimal size that can be + * suppprted by all virtqueues. + */ +u16 ifcvf_get_max_vq_size(struct ifcvf_hw *hw) +{ + u16 queue_size, max_size, qid; + + max_size = ifcvf_get_vq_size(hw, 0); + for (qid = 1; qid < hw->nr_vring; qid++) { + queue_size = ifcvf_get_vq_size(hw, qid); + /* 0 means the queue is unavailable */ + if (!queue_size) + continue; + + max_size = min(queue_size, max_size); + } + + return max_size; +} + +int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *pdev) +{ + struct virtio_pci_cap cap; + u16 notify_off; + int ret; + u8 pos; + u32 i; + + ret = pci_read_config_byte(pdev, PCI_CAPABILITY_LIST, &pos); + if (ret < 0) { + IFCVF_ERR(pdev, "Failed to read PCI capability list\n"); + return -EIO; + } + hw->pdev = pdev; + + while (pos) { + ret = ifcvf_read_config_range(pdev, (u32 *)&cap, + sizeof(cap), pos); + if (ret < 0) { + IFCVF_ERR(pdev, + "Failed to get PCI capability at %x\n", pos); + break; + } + + if (cap.cap_vndr != PCI_CAP_ID_VNDR) + goto next; + + switch (cap.cfg_type) { + case VIRTIO_PCI_CAP_COMMON_CFG: + hw->common_cfg = get_cap_addr(hw, &cap); + IFCVF_DBG(pdev, "hw->common_cfg = %p\n", + hw->common_cfg); + break; + case VIRTIO_PCI_CAP_NOTIFY_CFG: + pci_read_config_dword(pdev, pos + sizeof(cap), + &hw->notify_off_multiplier); + hw->notify_bar = cap.bar; + hw->notify_base = get_cap_addr(hw, &cap); + hw->notify_base_pa = pci_resource_start(pdev, cap.bar) + + le32_to_cpu(cap.offset); + IFCVF_DBG(pdev, "hw->notify_base = %p\n", + hw->notify_base); + break; + case VIRTIO_PCI_CAP_ISR_CFG: + hw->isr = get_cap_addr(hw, &cap); + IFCVF_DBG(pdev, "hw->isr = %p\n", hw->isr); + break; + case VIRTIO_PCI_CAP_DEVICE_CFG: + hw->dev_cfg = get_cap_addr(hw, &cap); + hw->cap_dev_config_size = le32_to_cpu(cap.length); + IFCVF_DBG(pdev, "hw->dev_cfg = %p\n", hw->dev_cfg); + break; + } + +next: + pos = cap.cap_next; + } + + if (hw->common_cfg == NULL || hw->notify_base == NULL || + hw->isr == NULL || hw->dev_cfg == NULL) { + IFCVF_ERR(pdev, "Incomplete PCI capabilities\n"); + return -EIO; + } + + hw->nr_vring = vp_ioread16(&hw->common_cfg->num_queues); + hw->vring = kzalloc(sizeof(struct vring_info) * hw->nr_vring, GFP_KERNEL); + if (!hw->vring) + return -ENOMEM; + + for (i = 0; i < hw->nr_vring; i++) { + vp_iowrite16(i, &hw->common_cfg->queue_select); + notify_off = vp_ioread16(&hw->common_cfg->queue_notify_off); + hw->vring[i].notify_addr = hw->notify_base + + notify_off * hw->notify_off_multiplier; + hw->vring[i].notify_pa = hw->notify_base_pa + + notify_off * hw->notify_off_multiplier; + hw->vring[i].irq = -EINVAL; + } + + hw->lm_cfg = hw->base[IFCVF_LM_BAR]; + + IFCVF_DBG(pdev, + "PCI capability mapping: common cfg: %p, notify base: %p\n, isr cfg: %p, device cfg: %p, multiplier: %u\n", + hw->common_cfg, hw->notify_base, hw->isr, + hw->dev_cfg, hw->notify_off_multiplier); + + hw->vqs_reused_irq = -EINVAL; + hw->config_irq = -EINVAL; + + return 0; +} + +u8 ifcvf_get_status(struct ifcvf_hw *hw) +{ + return vp_ioread8(&hw->common_cfg->device_status); +} + +void ifcvf_set_status(struct ifcvf_hw *hw, u8 status) +{ + vp_iowrite8(status, &hw->common_cfg->device_status); +} + +void ifcvf_reset(struct ifcvf_hw *hw) +{ + ifcvf_set_status(hw, 0); + while (ifcvf_get_status(hw)) + msleep(1); +} + +u64 ifcvf_get_hw_features(struct ifcvf_hw *hw) +{ + struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; + u32 features_lo, features_hi; + u64 features; + + vp_iowrite32(0, &cfg->device_feature_select); + features_lo = vp_ioread32(&cfg->device_feature); + + vp_iowrite32(1, &cfg->device_feature_select); + features_hi = vp_ioread32(&cfg->device_feature); + + features = ((u64)features_hi << 32) | features_lo; + + return features; +} + +/* return provisioned vDPA dev features */ +u64 ifcvf_get_dev_features(struct ifcvf_hw *hw) +{ + return hw->dev_features; +} + +u64 ifcvf_get_driver_features(struct ifcvf_hw *hw) +{ + struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; + u32 features_lo, features_hi; + u64 features; + + vp_iowrite32(0, &cfg->device_feature_select); + features_lo = vp_ioread32(&cfg->guest_feature); + + vp_iowrite32(1, &cfg->device_feature_select); + features_hi = vp_ioread32(&cfg->guest_feature); + + features = ((u64)features_hi << 32) | features_lo; + + return features; +} + +int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features) +{ + if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)) && features) { + IFCVF_ERR(hw->pdev, "VIRTIO_F_ACCESS_PLATFORM is not negotiated\n"); + return -EINVAL; + } + + return 0; +} + +u32 ifcvf_get_config_size(struct ifcvf_hw *hw) +{ + u32 net_config_size = sizeof(struct virtio_net_config); + u32 blk_config_size = sizeof(struct virtio_blk_config); + u32 cap_size = hw->cap_dev_config_size; + u32 config_size; + + /* If the onboard device config space size is greater than + * the size of struct virtio_net/blk_config, only the spec + * implementing contents size is returned, this is very + * unlikely, defensive programming. + */ + switch (hw->dev_type) { + case VIRTIO_ID_NET: + config_size = min(cap_size, net_config_size); + break; + case VIRTIO_ID_BLOCK: + config_size = min(cap_size, blk_config_size); + break; + default: + config_size = 0; + IFCVF_ERR(hw->pdev, "VIRTIO ID %u not supported\n", hw->dev_type); + } + + return config_size; +} + +void ifcvf_read_dev_config(struct ifcvf_hw *hw, u64 offset, + void *dst, int length) +{ + u8 old_gen, new_gen, *p; + int i; + + WARN_ON(offset + length > hw->config_size); + do { + old_gen = vp_ioread8(&hw->common_cfg->config_generation); + p = dst; + for (i = 0; i < length; i++) + *p++ = vp_ioread8(hw->dev_cfg + offset + i); + + new_gen = vp_ioread8(&hw->common_cfg->config_generation); + } while (old_gen != new_gen); +} + +void ifcvf_write_dev_config(struct ifcvf_hw *hw, u64 offset, + const void *src, int length) +{ + const u8 *p; + int i; + + p = src; + WARN_ON(offset + length > hw->config_size); + for (i = 0; i < length; i++) + vp_iowrite8(*p++, hw->dev_cfg + offset + i); +} + +void ifcvf_set_driver_features(struct ifcvf_hw *hw, u64 features) +{ + struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; + + vp_iowrite32(0, &cfg->guest_feature_select); + vp_iowrite32((u32)features, &cfg->guest_feature); + + vp_iowrite32(1, &cfg->guest_feature_select); + vp_iowrite32(features >> 32, &cfg->guest_feature); +} + +u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid) +{ + struct ifcvf_lm_cfg __iomem *lm_cfg = hw->lm_cfg; + u16 last_avail_idx; + + last_avail_idx = vp_ioread16(&lm_cfg->vq_state_region + qid * 2); + + return last_avail_idx; +} + +int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u16 num) +{ + struct ifcvf_lm_cfg __iomem *lm_cfg = hw->lm_cfg; + + vp_iowrite16(num, &lm_cfg->vq_state_region + qid * 2); + + return 0; +} + +void ifcvf_set_vq_num(struct ifcvf_hw *hw, u16 qid, u32 num) +{ + struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; + + vp_iowrite16(qid, &cfg->queue_select); + vp_iowrite16(num, &cfg->queue_size); +} + +int ifcvf_set_vq_address(struct ifcvf_hw *hw, u16 qid, u64 desc_area, + u64 driver_area, u64 device_area) +{ + struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; + + vp_iowrite16(qid, &cfg->queue_select); + vp_iowrite64_twopart(desc_area, &cfg->queue_desc_lo, + &cfg->queue_desc_hi); + vp_iowrite64_twopart(driver_area, &cfg->queue_avail_lo, + &cfg->queue_avail_hi); + vp_iowrite64_twopart(device_area, &cfg->queue_used_lo, + &cfg->queue_used_hi); + + return 0; +} + +bool ifcvf_get_vq_ready(struct ifcvf_hw *hw, u16 qid) +{ + struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; + u16 queue_enable; + + vp_iowrite16(qid, &cfg->queue_select); + queue_enable = vp_ioread16(&cfg->queue_enable); + + return (bool)queue_enable; +} + +void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, bool ready) +{ + struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; + + vp_iowrite16(qid, &cfg->queue_select); + vp_iowrite16(ready, &cfg->queue_enable); +} + +static void ifcvf_reset_vring(struct ifcvf_hw *hw) +{ + u16 qid; + + for (qid = 0; qid < hw->nr_vring; qid++) { + hw->vring[qid].cb.callback = NULL; + hw->vring[qid].cb.private = NULL; + ifcvf_set_vq_vector(hw, qid, VIRTIO_MSI_NO_VECTOR); + } +} + +static void ifcvf_reset_config_handler(struct ifcvf_hw *hw) +{ + hw->config_cb.callback = NULL; + hw->config_cb.private = NULL; + ifcvf_set_config_vector(hw, VIRTIO_MSI_NO_VECTOR); +} + +static void ifcvf_synchronize_irq(struct ifcvf_hw *hw) +{ + u32 nvectors = hw->num_msix_vectors; + struct pci_dev *pdev = hw->pdev; + int i, irq; + + for (i = 0; i < nvectors; i++) { + irq = pci_irq_vector(pdev, i); + if (irq >= 0) + synchronize_irq(irq); + } +} + +void ifcvf_stop(struct ifcvf_hw *hw) +{ + ifcvf_synchronize_irq(hw); + ifcvf_reset_vring(hw); + ifcvf_reset_config_handler(hw); +} + +void ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid) +{ + vp_iowrite16(qid, hw->vring[qid].notify_addr); +} diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h new file mode 100644 index 0000000000..b57849c643 --- /dev/null +++ b/drivers/vdpa/ifcvf/ifcvf_base.h @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Intel IFC VF NIC driver for virtio dataplane offloading + * + * Copyright (C) 2020 Intel Corporation. + * + * Author: Zhu Lingshan <lingshan.zhu@intel.com> + * + */ + +#ifndef _IFCVF_H_ +#define _IFCVF_H_ + +#include <linux/pci.h> +#include <linux/pci_regs.h> +#include <linux/vdpa.h> +#include <linux/virtio_pci_modern.h> +#include <uapi/linux/virtio_net.h> +#include <uapi/linux/virtio_blk.h> +#include <uapi/linux/virtio_config.h> +#include <uapi/linux/virtio_pci.h> +#include <uapi/linux/vdpa.h> + +#define N3000_DEVICE_ID 0x1041 +#define N3000_SUBSYS_DEVICE_ID 0x001A + +#define IFCVF_QUEUE_ALIGNMENT PAGE_SIZE +#define IFCVF_PCI_MAX_RESOURCE 6 + +#define IFCVF_LM_BAR 4 + +#define IFCVF_ERR(pdev, fmt, ...) dev_err(&pdev->dev, fmt, ##__VA_ARGS__) +#define IFCVF_DBG(pdev, fmt, ...) dev_dbg(&pdev->dev, fmt, ##__VA_ARGS__) +#define IFCVF_INFO(pdev, fmt, ...) dev_info(&pdev->dev, fmt, ##__VA_ARGS__) + +/* all vqs and config interrupt has its own vector */ +#define MSIX_VECTOR_PER_VQ_AND_CONFIG 1 +/* all vqs share a vector, and config interrupt has a separate vector */ +#define MSIX_VECTOR_SHARED_VQ_AND_CONFIG 2 +/* all vqs and config interrupt share a vector */ +#define MSIX_VECTOR_DEV_SHARED 3 + +struct vring_info { + u16 last_avail_idx; + void __iomem *notify_addr; + phys_addr_t notify_pa; + u32 irq; + struct vdpa_callback cb; + char msix_name[256]; +}; + +struct ifcvf_lm_cfg { + __le64 control; + __le64 status; + __le64 lm_mem_log_start_addr; + __le64 lm_mem_log_end_addr; + __le16 vq_state_region; +}; + +struct ifcvf_hw { + u8 __iomem *isr; + /* Live migration */ + struct ifcvf_lm_cfg __iomem *lm_cfg; + /* Notification bar number */ + u8 notify_bar; + u8 msix_vector_status; + /* virtio-net or virtio-blk device config size */ + u32 config_size; + /* Notificaiton bar address */ + void __iomem *notify_base; + phys_addr_t notify_base_pa; + u32 notify_off_multiplier; + u32 dev_type; + u64 hw_features; + /* provisioned device features */ + u64 dev_features; + struct virtio_pci_common_cfg __iomem *common_cfg; + void __iomem *dev_cfg; + struct vring_info *vring; + void __iomem * const *base; + char config_msix_name[256]; + struct vdpa_callback config_cb; + int config_irq; + int vqs_reused_irq; + u16 nr_vring; + /* VIRTIO_PCI_CAP_DEVICE_CFG size */ + u32 num_msix_vectors; + u32 cap_dev_config_size; + struct pci_dev *pdev; +}; + +struct ifcvf_adapter { + struct vdpa_device vdpa; + struct pci_dev *pdev; + struct ifcvf_hw *vf; +}; + +struct ifcvf_vdpa_mgmt_dev { + struct vdpa_mgmt_dev mdev; + struct ifcvf_hw vf; + struct ifcvf_adapter *adapter; + struct pci_dev *pdev; +}; + +int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *dev); +void ifcvf_stop(struct ifcvf_hw *hw); +void ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid); +void ifcvf_read_dev_config(struct ifcvf_hw *hw, u64 offset, + void *dst, int length); +void ifcvf_write_dev_config(struct ifcvf_hw *hw, u64 offset, + const void *src, int length); +u8 ifcvf_get_status(struct ifcvf_hw *hw); +void ifcvf_set_status(struct ifcvf_hw *hw, u8 status); +void io_write64_twopart(u64 val, u32 *lo, u32 *hi); +void ifcvf_reset(struct ifcvf_hw *hw); +u64 ifcvf_get_dev_features(struct ifcvf_hw *hw); +u64 ifcvf_get_hw_features(struct ifcvf_hw *hw); +int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features); +u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid); +int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u16 num); +struct ifcvf_adapter *vf_to_adapter(struct ifcvf_hw *hw); +int ifcvf_probed_virtio_net(struct ifcvf_hw *hw); +u32 ifcvf_get_config_size(struct ifcvf_hw *hw); +u16 ifcvf_set_vq_vector(struct ifcvf_hw *hw, u16 qid, int vector); +u16 ifcvf_set_config_vector(struct ifcvf_hw *hw, int vector); +void ifcvf_set_vq_num(struct ifcvf_hw *hw, u16 qid, u32 num); +int ifcvf_set_vq_address(struct ifcvf_hw *hw, u16 qid, u64 desc_area, + u64 driver_area, u64 device_area); +bool ifcvf_get_vq_ready(struct ifcvf_hw *hw, u16 qid); +void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, bool ready); +void ifcvf_set_driver_features(struct ifcvf_hw *hw, u64 features); +u64 ifcvf_get_driver_features(struct ifcvf_hw *hw); +u16 ifcvf_get_max_vq_size(struct ifcvf_hw *hw); +#endif /* _IFCVF_H_ */ diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c new file mode 100644 index 0000000000..e98fa8100f --- /dev/null +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -0,0 +1,882 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Intel IFC VF NIC driver for virtio dataplane offloading + * + * Copyright (C) 2020 Intel Corporation. + * + * Author: Zhu Lingshan <lingshan.zhu@intel.com> + * + */ + +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/sysfs.h> +#include "ifcvf_base.h" + +#define DRIVER_AUTHOR "Intel Corporation" +#define IFCVF_DRIVER_NAME "ifcvf" + +static irqreturn_t ifcvf_config_changed(int irq, void *arg) +{ + struct ifcvf_hw *vf = arg; + + if (vf->config_cb.callback) + return vf->config_cb.callback(vf->config_cb.private); + + return IRQ_HANDLED; +} + +static irqreturn_t ifcvf_vq_intr_handler(int irq, void *arg) +{ + struct vring_info *vring = arg; + + if (vring->cb.callback) + return vring->cb.callback(vring->cb.private); + + return IRQ_HANDLED; +} + +static irqreturn_t ifcvf_vqs_reused_intr_handler(int irq, void *arg) +{ + struct ifcvf_hw *vf = arg; + struct vring_info *vring; + int i; + + for (i = 0; i < vf->nr_vring; i++) { + vring = &vf->vring[i]; + if (vring->cb.callback) + vring->cb.callback(vring->cb.private); + } + + return IRQ_HANDLED; +} + +static irqreturn_t ifcvf_dev_intr_handler(int irq, void *arg) +{ + struct ifcvf_hw *vf = arg; + u8 isr; + + isr = vp_ioread8(vf->isr); + if (isr & VIRTIO_PCI_ISR_CONFIG) + ifcvf_config_changed(irq, arg); + + return ifcvf_vqs_reused_intr_handler(irq, arg); +} + +static void ifcvf_free_irq_vectors(void *data) +{ + pci_free_irq_vectors(data); +} + +static void ifcvf_free_per_vq_irq(struct ifcvf_hw *vf) +{ + struct pci_dev *pdev = vf->pdev; + int i; + + for (i = 0; i < vf->nr_vring; i++) { + if (vf->vring[i].irq != -EINVAL) { + devm_free_irq(&pdev->dev, vf->vring[i].irq, &vf->vring[i]); + vf->vring[i].irq = -EINVAL; + } + } +} + +static void ifcvf_free_vqs_reused_irq(struct ifcvf_hw *vf) +{ + struct pci_dev *pdev = vf->pdev; + + if (vf->vqs_reused_irq != -EINVAL) { + devm_free_irq(&pdev->dev, vf->vqs_reused_irq, vf); + vf->vqs_reused_irq = -EINVAL; + } + +} + +static void ifcvf_free_vq_irq(struct ifcvf_hw *vf) +{ + if (vf->msix_vector_status == MSIX_VECTOR_PER_VQ_AND_CONFIG) + ifcvf_free_per_vq_irq(vf); + else + ifcvf_free_vqs_reused_irq(vf); +} + +static void ifcvf_free_config_irq(struct ifcvf_hw *vf) +{ + struct pci_dev *pdev = vf->pdev; + + if (vf->config_irq == -EINVAL) + return; + + /* If the irq is shared by all vqs and the config interrupt, + * it is already freed in ifcvf_free_vq_irq, so here only + * need to free config irq when msix_vector_status != MSIX_VECTOR_DEV_SHARED + */ + if (vf->msix_vector_status != MSIX_VECTOR_DEV_SHARED) { + devm_free_irq(&pdev->dev, vf->config_irq, vf); + vf->config_irq = -EINVAL; + } +} + +static void ifcvf_free_irq(struct ifcvf_hw *vf) +{ + struct pci_dev *pdev = vf->pdev; + + ifcvf_free_vq_irq(vf); + ifcvf_free_config_irq(vf); + ifcvf_free_irq_vectors(pdev); + vf->num_msix_vectors = 0; +} + +/* ifcvf MSIX vectors allocator, this helper tries to allocate + * vectors for all virtqueues and the config interrupt. + * It returns the number of allocated vectors, negative + * return value when fails. + */ +static int ifcvf_alloc_vectors(struct ifcvf_hw *vf) +{ + struct pci_dev *pdev = vf->pdev; + int max_intr, ret; + + /* all queues and config interrupt */ + max_intr = vf->nr_vring + 1; + ret = pci_alloc_irq_vectors(pdev, 1, max_intr, PCI_IRQ_MSIX | PCI_IRQ_AFFINITY); + + if (ret < 0) { + IFCVF_ERR(pdev, "Failed to alloc IRQ vectors\n"); + return ret; + } + + if (ret < max_intr) + IFCVF_INFO(pdev, + "Requested %u vectors, however only %u allocated, lower performance\n", + max_intr, ret); + + return ret; +} + +static int ifcvf_request_per_vq_irq(struct ifcvf_hw *vf) +{ + struct pci_dev *pdev = vf->pdev; + int i, vector, ret, irq; + + vf->vqs_reused_irq = -EINVAL; + for (i = 0; i < vf->nr_vring; i++) { + snprintf(vf->vring[i].msix_name, 256, "ifcvf[%s]-%d\n", pci_name(pdev), i); + vector = i; + irq = pci_irq_vector(pdev, vector); + ret = devm_request_irq(&pdev->dev, irq, + ifcvf_vq_intr_handler, 0, + vf->vring[i].msix_name, + &vf->vring[i]); + if (ret) { + IFCVF_ERR(pdev, "Failed to request irq for vq %d\n", i); + goto err; + } + + vf->vring[i].irq = irq; + ret = ifcvf_set_vq_vector(vf, i, vector); + if (ret == VIRTIO_MSI_NO_VECTOR) { + IFCVF_ERR(pdev, "No msix vector for vq %u\n", i); + goto err; + } + } + + return 0; +err: + ifcvf_free_irq(vf); + + return -EFAULT; +} + +static int ifcvf_request_vqs_reused_irq(struct ifcvf_hw *vf) +{ + struct pci_dev *pdev = vf->pdev; + int i, vector, ret, irq; + + vector = 0; + snprintf(vf->vring[0].msix_name, 256, "ifcvf[%s]-vqs-reused-irq\n", pci_name(pdev)); + irq = pci_irq_vector(pdev, vector); + ret = devm_request_irq(&pdev->dev, irq, + ifcvf_vqs_reused_intr_handler, 0, + vf->vring[0].msix_name, vf); + if (ret) { + IFCVF_ERR(pdev, "Failed to request reused irq for the device\n"); + goto err; + } + + vf->vqs_reused_irq = irq; + for (i = 0; i < vf->nr_vring; i++) { + vf->vring[i].irq = -EINVAL; + ret = ifcvf_set_vq_vector(vf, i, vector); + if (ret == VIRTIO_MSI_NO_VECTOR) { + IFCVF_ERR(pdev, "No msix vector for vq %u\n", i); + goto err; + } + } + + return 0; +err: + ifcvf_free_irq(vf); + + return -EFAULT; +} + +static int ifcvf_request_dev_irq(struct ifcvf_hw *vf) +{ + struct pci_dev *pdev = vf->pdev; + int i, vector, ret, irq; + + vector = 0; + snprintf(vf->vring[0].msix_name, 256, "ifcvf[%s]-dev-irq\n", pci_name(pdev)); + irq = pci_irq_vector(pdev, vector); + ret = devm_request_irq(&pdev->dev, irq, + ifcvf_dev_intr_handler, 0, + vf->vring[0].msix_name, vf); + if (ret) { + IFCVF_ERR(pdev, "Failed to request irq for the device\n"); + goto err; + } + + vf->vqs_reused_irq = irq; + for (i = 0; i < vf->nr_vring; i++) { + vf->vring[i].irq = -EINVAL; + ret = ifcvf_set_vq_vector(vf, i, vector); + if (ret == VIRTIO_MSI_NO_VECTOR) { + IFCVF_ERR(pdev, "No msix vector for vq %u\n", i); + goto err; + } + } + + vf->config_irq = irq; + ret = ifcvf_set_config_vector(vf, vector); + if (ret == VIRTIO_MSI_NO_VECTOR) { + IFCVF_ERR(pdev, "No msix vector for device config\n"); + goto err; + } + + return 0; +err: + ifcvf_free_irq(vf); + + return -EFAULT; + +} + +static int ifcvf_request_vq_irq(struct ifcvf_hw *vf) +{ + int ret; + + if (vf->msix_vector_status == MSIX_VECTOR_PER_VQ_AND_CONFIG) + ret = ifcvf_request_per_vq_irq(vf); + else + ret = ifcvf_request_vqs_reused_irq(vf); + + return ret; +} + +static int ifcvf_request_config_irq(struct ifcvf_hw *vf) +{ + struct pci_dev *pdev = vf->pdev; + int config_vector, ret; + + if (vf->msix_vector_status == MSIX_VECTOR_PER_VQ_AND_CONFIG) + config_vector = vf->nr_vring; + else if (vf->msix_vector_status == MSIX_VECTOR_SHARED_VQ_AND_CONFIG) + /* vector 0 for vqs and 1 for config interrupt */ + config_vector = 1; + else if (vf->msix_vector_status == MSIX_VECTOR_DEV_SHARED) + /* re-use the vqs vector */ + return 0; + else + return -EINVAL; + + snprintf(vf->config_msix_name, 256, "ifcvf[%s]-config\n", + pci_name(pdev)); + vf->config_irq = pci_irq_vector(pdev, config_vector); + ret = devm_request_irq(&pdev->dev, vf->config_irq, + ifcvf_config_changed, 0, + vf->config_msix_name, vf); + if (ret) { + IFCVF_ERR(pdev, "Failed to request config irq\n"); + goto err; + } + + ret = ifcvf_set_config_vector(vf, config_vector); + if (ret == VIRTIO_MSI_NO_VECTOR) { + IFCVF_ERR(pdev, "No msix vector for device config\n"); + goto err; + } + + return 0; +err: + ifcvf_free_irq(vf); + + return -EFAULT; +} + +static int ifcvf_request_irq(struct ifcvf_hw *vf) +{ + int nvectors, ret, max_intr; + + nvectors = ifcvf_alloc_vectors(vf); + if (nvectors <= 0) + return -EFAULT; + + vf->msix_vector_status = MSIX_VECTOR_PER_VQ_AND_CONFIG; + max_intr = vf->nr_vring + 1; + if (nvectors < max_intr) + vf->msix_vector_status = MSIX_VECTOR_SHARED_VQ_AND_CONFIG; + + if (nvectors == 1) { + vf->msix_vector_status = MSIX_VECTOR_DEV_SHARED; + ret = ifcvf_request_dev_irq(vf); + + return ret; + } + + ret = ifcvf_request_vq_irq(vf); + if (ret) + return ret; + + ret = ifcvf_request_config_irq(vf); + + if (ret) + return ret; + + vf->num_msix_vectors = nvectors; + + return 0; +} + +static struct ifcvf_adapter *vdpa_to_adapter(struct vdpa_device *vdpa_dev) +{ + return container_of(vdpa_dev, struct ifcvf_adapter, vdpa); +} + +static struct ifcvf_hw *vdpa_to_vf(struct vdpa_device *vdpa_dev) +{ + struct ifcvf_adapter *adapter = vdpa_to_adapter(vdpa_dev); + + return adapter->vf; +} + +static u64 ifcvf_vdpa_get_device_features(struct vdpa_device *vdpa_dev) +{ + struct ifcvf_adapter *adapter = vdpa_to_adapter(vdpa_dev); + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + struct pci_dev *pdev = adapter->pdev; + u32 type = vf->dev_type; + u64 features; + + if (type == VIRTIO_ID_NET || type == VIRTIO_ID_BLOCK) + features = ifcvf_get_dev_features(vf); + else { + features = 0; + IFCVF_ERR(pdev, "VIRTIO ID %u not supported\n", vf->dev_type); + } + + return features; +} + +static int ifcvf_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 features) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + int ret; + + ret = ifcvf_verify_min_features(vf, features); + if (ret) + return ret; + + ifcvf_set_driver_features(vf, features); + + return 0; +} + +static u64 ifcvf_vdpa_get_driver_features(struct vdpa_device *vdpa_dev) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + u64 features; + + features = ifcvf_get_driver_features(vf); + + return features; +} + +static u8 ifcvf_vdpa_get_status(struct vdpa_device *vdpa_dev) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + + return ifcvf_get_status(vf); +} + +static void ifcvf_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status) +{ + struct ifcvf_hw *vf; + u8 status_old; + int ret; + + vf = vdpa_to_vf(vdpa_dev); + status_old = ifcvf_get_status(vf); + + if (status_old == status) + return; + + if ((status & VIRTIO_CONFIG_S_DRIVER_OK) && + !(status_old & VIRTIO_CONFIG_S_DRIVER_OK)) { + ret = ifcvf_request_irq(vf); + if (ret) { + IFCVF_ERR(vf->pdev, "failed to request irq with error %d\n", ret); + return; + } + } + + ifcvf_set_status(vf, status); +} + +static int ifcvf_vdpa_reset(struct vdpa_device *vdpa_dev) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + u8 status = ifcvf_get_status(vf); + + ifcvf_stop(vf); + + if (status & VIRTIO_CONFIG_S_DRIVER_OK) + ifcvf_free_irq(vf); + + ifcvf_reset(vf); + + return 0; +} + +static u16 ifcvf_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + + return ifcvf_get_max_vq_size(vf); +} + +static int ifcvf_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid, + struct vdpa_vq_state *state) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + + state->split.avail_index = ifcvf_get_vq_state(vf, qid); + return 0; +} + +static int ifcvf_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid, + const struct vdpa_vq_state *state) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + + return ifcvf_set_vq_state(vf, qid, state->split.avail_index); +} + +static void ifcvf_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid, + struct vdpa_callback *cb) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + + vf->vring[qid].cb = *cb; +} + +static void ifcvf_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, + u16 qid, bool ready) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + + ifcvf_set_vq_ready(vf, qid, ready); +} + +static bool ifcvf_vdpa_get_vq_ready(struct vdpa_device *vdpa_dev, u16 qid) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + + return ifcvf_get_vq_ready(vf, qid); +} + +static void ifcvf_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16 qid, + u32 num) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + + ifcvf_set_vq_num(vf, qid, num); +} + +static int ifcvf_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid, + u64 desc_area, u64 driver_area, + u64 device_area) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + + return ifcvf_set_vq_address(vf, qid, desc_area, driver_area, device_area); +} + +static void ifcvf_vdpa_kick_vq(struct vdpa_device *vdpa_dev, u16 qid) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + + ifcvf_notify_queue(vf, qid); +} + +static u32 ifcvf_vdpa_get_generation(struct vdpa_device *vdpa_dev) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + + return vp_ioread8(&vf->common_cfg->config_generation); +} + +static u32 ifcvf_vdpa_get_device_id(struct vdpa_device *vdpa_dev) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + + return vf->dev_type; +} + +static u32 ifcvf_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev) +{ + struct ifcvf_adapter *adapter = vdpa_to_adapter(vdpa_dev); + struct pci_dev *pdev = adapter->pdev; + + return pdev->subsystem_vendor; +} + +static u32 ifcvf_vdpa_get_vq_align(struct vdpa_device *vdpa_dev) +{ + return IFCVF_QUEUE_ALIGNMENT; +} + +static size_t ifcvf_vdpa_get_config_size(struct vdpa_device *vdpa_dev) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + + return vf->config_size; +} + +static u32 ifcvf_vdpa_get_vq_group(struct vdpa_device *vdpa, u16 idx) +{ + return 0; +} + +static void ifcvf_vdpa_get_config(struct vdpa_device *vdpa_dev, + unsigned int offset, + void *buf, unsigned int len) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + + ifcvf_read_dev_config(vf, offset, buf, len); +} + +static void ifcvf_vdpa_set_config(struct vdpa_device *vdpa_dev, + unsigned int offset, const void *buf, + unsigned int len) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + + ifcvf_write_dev_config(vf, offset, buf, len); +} + +static void ifcvf_vdpa_set_config_cb(struct vdpa_device *vdpa_dev, + struct vdpa_callback *cb) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + + vf->config_cb.callback = cb->callback; + vf->config_cb.private = cb->private; +} + +static int ifcvf_vdpa_get_vq_irq(struct vdpa_device *vdpa_dev, + u16 qid) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + + if (vf->vqs_reused_irq < 0) + return vf->vring[qid].irq; + else + return -EINVAL; +} + +static struct vdpa_notification_area ifcvf_get_vq_notification(struct vdpa_device *vdpa_dev, + u16 idx) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + struct vdpa_notification_area area; + + area.addr = vf->vring[idx].notify_pa; + if (!vf->notify_off_multiplier) + area.size = PAGE_SIZE; + else + area.size = vf->notify_off_multiplier; + + return area; +} + +/* + * IFCVF currently doesn't have on-chip IOMMU, so not + * implemented set_map()/dma_map()/dma_unmap() + */ +static const struct vdpa_config_ops ifc_vdpa_ops = { + .get_device_features = ifcvf_vdpa_get_device_features, + .set_driver_features = ifcvf_vdpa_set_driver_features, + .get_driver_features = ifcvf_vdpa_get_driver_features, + .get_status = ifcvf_vdpa_get_status, + .set_status = ifcvf_vdpa_set_status, + .reset = ifcvf_vdpa_reset, + .get_vq_num_max = ifcvf_vdpa_get_vq_num_max, + .get_vq_state = ifcvf_vdpa_get_vq_state, + .set_vq_state = ifcvf_vdpa_set_vq_state, + .set_vq_cb = ifcvf_vdpa_set_vq_cb, + .set_vq_ready = ifcvf_vdpa_set_vq_ready, + .get_vq_ready = ifcvf_vdpa_get_vq_ready, + .set_vq_num = ifcvf_vdpa_set_vq_num, + .set_vq_address = ifcvf_vdpa_set_vq_address, + .get_vq_irq = ifcvf_vdpa_get_vq_irq, + .kick_vq = ifcvf_vdpa_kick_vq, + .get_generation = ifcvf_vdpa_get_generation, + .get_device_id = ifcvf_vdpa_get_device_id, + .get_vendor_id = ifcvf_vdpa_get_vendor_id, + .get_vq_align = ifcvf_vdpa_get_vq_align, + .get_vq_group = ifcvf_vdpa_get_vq_group, + .get_config_size = ifcvf_vdpa_get_config_size, + .get_config = ifcvf_vdpa_get_config, + .set_config = ifcvf_vdpa_set_config, + .set_config_cb = ifcvf_vdpa_set_config_cb, + .get_vq_notification = ifcvf_get_vq_notification, +}; + +static struct virtio_device_id id_table_net[] = { + {VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID}, + {0}, +}; + +static struct virtio_device_id id_table_blk[] = { + {VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID}, + {0}, +}; + +static u32 get_dev_type(struct pci_dev *pdev) +{ + u32 dev_type; + + /* This drirver drives both modern virtio devices and transitional + * devices in modern mode. + * vDPA requires feature bit VIRTIO_F_ACCESS_PLATFORM, + * so legacy devices and transitional devices in legacy + * mode will not work for vDPA, this driver will not + * drive devices with legacy interface. + */ + + if (pdev->device < 0x1040) + dev_type = pdev->subsystem_device; + else + dev_type = pdev->device - 0x1040; + + return dev_type; +} + +static int ifcvf_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, + const struct vdpa_dev_set_config *config) +{ + struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev; + struct ifcvf_adapter *adapter; + struct vdpa_device *vdpa_dev; + struct pci_dev *pdev; + struct ifcvf_hw *vf; + u64 device_features; + int ret; + + ifcvf_mgmt_dev = container_of(mdev, struct ifcvf_vdpa_mgmt_dev, mdev); + vf = &ifcvf_mgmt_dev->vf; + pdev = vf->pdev; + adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa, + &pdev->dev, &ifc_vdpa_ops, 1, 1, NULL, false); + if (IS_ERR(adapter)) { + IFCVF_ERR(pdev, "Failed to allocate vDPA structure"); + return PTR_ERR(adapter); + } + + ifcvf_mgmt_dev->adapter = adapter; + adapter->pdev = pdev; + adapter->vdpa.dma_dev = &pdev->dev; + adapter->vdpa.mdev = mdev; + adapter->vf = vf; + vdpa_dev = &adapter->vdpa; + + device_features = vf->hw_features; + if (config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) { + if (config->device_features & ~device_features) { + IFCVF_ERR(pdev, "The provisioned features 0x%llx are not supported by this device with features 0x%llx\n", + config->device_features, device_features); + return -EINVAL; + } + device_features &= config->device_features; + } + vf->dev_features = device_features; + + if (name) + ret = dev_set_name(&vdpa_dev->dev, "%s", name); + else + ret = dev_set_name(&vdpa_dev->dev, "vdpa%u", vdpa_dev->index); + + ret = _vdpa_register_device(&adapter->vdpa, vf->nr_vring); + if (ret) { + put_device(&adapter->vdpa.dev); + IFCVF_ERR(pdev, "Failed to register to vDPA bus"); + return ret; + } + + return 0; +} + +static void ifcvf_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev) +{ + struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev; + + ifcvf_mgmt_dev = container_of(mdev, struct ifcvf_vdpa_mgmt_dev, mdev); + _vdpa_unregister_device(dev); + ifcvf_mgmt_dev->adapter = NULL; +} + +static const struct vdpa_mgmtdev_ops ifcvf_vdpa_mgmt_dev_ops = { + .dev_add = ifcvf_vdpa_dev_add, + .dev_del = ifcvf_vdpa_dev_del +}; + +static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev; + struct device *dev = &pdev->dev; + struct ifcvf_hw *vf; + u32 dev_type; + int ret, i; + + ret = pcim_enable_device(pdev); + if (ret) { + IFCVF_ERR(pdev, "Failed to enable device\n"); + return ret; + } + ret = pcim_iomap_regions(pdev, BIT(0) | BIT(2) | BIT(4), + IFCVF_DRIVER_NAME); + if (ret) { + IFCVF_ERR(pdev, "Failed to request MMIO region\n"); + return ret; + } + + ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); + if (ret) { + IFCVF_ERR(pdev, "No usable DMA configuration\n"); + return ret; + } + + ret = devm_add_action_or_reset(dev, ifcvf_free_irq_vectors, pdev); + if (ret) { + IFCVF_ERR(pdev, + "Failed for adding devres for freeing irq vectors\n"); + return ret; + } + + pci_set_master(pdev); + ifcvf_mgmt_dev = kzalloc(sizeof(struct ifcvf_vdpa_mgmt_dev), GFP_KERNEL); + if (!ifcvf_mgmt_dev) { + IFCVF_ERR(pdev, "Failed to alloc memory for the vDPA management device\n"); + return -ENOMEM; + } + + vf = &ifcvf_mgmt_dev->vf; + vf->dev_type = get_dev_type(pdev); + vf->base = pcim_iomap_table(pdev); + vf->pdev = pdev; + + ret = ifcvf_init_hw(vf, pdev); + if (ret) { + IFCVF_ERR(pdev, "Failed to init IFCVF hw\n"); + goto err; + } + + for (i = 0; i < vf->nr_vring; i++) + vf->vring[i].irq = -EINVAL; + + vf->hw_features = ifcvf_get_hw_features(vf); + vf->config_size = ifcvf_get_config_size(vf); + + dev_type = get_dev_type(pdev); + switch (dev_type) { + case VIRTIO_ID_NET: + ifcvf_mgmt_dev->mdev.id_table = id_table_net; + break; + case VIRTIO_ID_BLOCK: + ifcvf_mgmt_dev->mdev.id_table = id_table_blk; + break; + default: + IFCVF_ERR(pdev, "VIRTIO ID %u not supported\n", dev_type); + ret = -EOPNOTSUPP; + goto err; + } + + ifcvf_mgmt_dev->mdev.ops = &ifcvf_vdpa_mgmt_dev_ops; + ifcvf_mgmt_dev->mdev.device = dev; + ifcvf_mgmt_dev->mdev.max_supported_vqs = vf->nr_vring; + ifcvf_mgmt_dev->mdev.supported_features = vf->hw_features; + ifcvf_mgmt_dev->mdev.config_attr_mask = (1 << VDPA_ATTR_DEV_FEATURES); + + ret = vdpa_mgmtdev_register(&ifcvf_mgmt_dev->mdev); + if (ret) { + IFCVF_ERR(pdev, + "Failed to initialize the management interfaces\n"); + goto err; + } + + pci_set_drvdata(pdev, ifcvf_mgmt_dev); + + return 0; + +err: + kfree(ifcvf_mgmt_dev->vf.vring); + kfree(ifcvf_mgmt_dev); + return ret; +} + +static void ifcvf_remove(struct pci_dev *pdev) +{ + struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev; + + ifcvf_mgmt_dev = pci_get_drvdata(pdev); + vdpa_mgmtdev_unregister(&ifcvf_mgmt_dev->mdev); + kfree(ifcvf_mgmt_dev->vf.vring); + kfree(ifcvf_mgmt_dev); +} + +static struct pci_device_id ifcvf_pci_ids[] = { + /* N3000 network device */ + { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET, + N3000_DEVICE_ID, + PCI_VENDOR_ID_INTEL, + N3000_SUBSYS_DEVICE_ID) }, + /* C5000X-PL network device + * F2000X-PL network device + */ + { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET, + VIRTIO_TRANS_ID_NET, + PCI_VENDOR_ID_INTEL, + VIRTIO_ID_NET) }, + /* C5000X-PL block device */ + { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET, + VIRTIO_TRANS_ID_BLOCK, + PCI_VENDOR_ID_INTEL, + VIRTIO_ID_BLOCK) }, + + { 0 }, +}; +MODULE_DEVICE_TABLE(pci, ifcvf_pci_ids); + +static struct pci_driver ifcvf_driver = { + .name = IFCVF_DRIVER_NAME, + .id_table = ifcvf_pci_ids, + .probe = ifcvf_probe, + .remove = ifcvf_remove, +}; + +module_pci_driver(ifcvf_driver); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/vdpa/mlx5/Makefile b/drivers/vdpa/mlx5/Makefile new file mode 100644 index 0000000000..e791394c33 --- /dev/null +++ b/drivers/vdpa/mlx5/Makefile @@ -0,0 +1,4 @@ +subdir-ccflags-y += -I$(srctree)/drivers/vdpa/mlx5/core + +obj-$(CONFIG_MLX5_VDPA_NET) += mlx5_vdpa.o +mlx5_vdpa-$(CONFIG_MLX5_VDPA_NET) += net/mlx5_vnet.o core/resources.o core/mr.o net/debug.o diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h new file mode 100644 index 0000000000..ca56242972 --- /dev/null +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -0,0 +1,136 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Ltd. */ + +#ifndef __MLX5_VDPA_H__ +#define __MLX5_VDPA_H__ + +#include <linux/etherdevice.h> +#include <linux/vringh.h> +#include <linux/vdpa.h> +#include <linux/mlx5/driver.h> + +#define MLX5V_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN) + +struct mlx5_vdpa_direct_mr { + u64 start; + u64 end; + u32 perm; + u32 mr; + struct sg_table sg_head; + int log_size; + int nsg; + int nent; + struct list_head list; + u64 offset; +}; + +struct mlx5_vdpa_mr { + u32 mkey; + + /* list of direct MRs descendants of this indirect mr */ + struct list_head head; + unsigned long num_directs; + unsigned long num_klms; + /* state of dvq mr */ + bool initialized; + + /* serialize mkey creation and destruction */ + struct mutex mkey_mtx; + bool user_mr; +}; + +struct mlx5_vdpa_resources { + u32 pdn; + struct mlx5_uars_page *uar; + void __iomem *kick_addr; + u64 phys_kick_addr; + u16 uid; + u32 null_mkey; + bool valid; +}; + +struct mlx5_control_vq { + struct vhost_iotlb *iotlb; + /* spinlock to synchronize iommu table */ + spinlock_t iommu_lock; + struct vringh vring; + bool ready; + u64 desc_addr; + u64 device_addr; + u64 driver_addr; + struct vdpa_callback event_cb; + struct vringh_kiov riov; + struct vringh_kiov wiov; + unsigned short head; + unsigned int received_desc; + unsigned int completed_desc; +}; + +struct mlx5_vdpa_wq_ent { + struct work_struct work; + struct mlx5_vdpa_dev *mvdev; +}; + +enum { + MLX5_VDPA_DATAVQ_GROUP, + MLX5_VDPA_CVQ_GROUP, + MLX5_VDPA_NUMVQ_GROUPS +}; + +enum { + MLX5_VDPA_NUM_AS = MLX5_VDPA_NUMVQ_GROUPS +}; + +struct mlx5_vdpa_dev { + struct vdpa_device vdev; + struct mlx5_core_dev *mdev; + struct mlx5_vdpa_resources res; + + u64 mlx_features; + u64 actual_features; + u8 status; + u32 max_vqs; + u16 max_idx; + u32 generation; + + struct mlx5_vdpa_mr mr; + struct mlx5_control_vq cvq; + struct workqueue_struct *wq; + unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS]; + bool suspended; +}; + +int mlx5_vdpa_create_tis(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tisn); +void mlx5_vdpa_destroy_tis(struct mlx5_vdpa_dev *mvdev, u32 tisn); +int mlx5_vdpa_create_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 *rqtn); +int mlx5_vdpa_modify_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 rqtn); +void mlx5_vdpa_destroy_rqt(struct mlx5_vdpa_dev *mvdev, u32 rqtn); +int mlx5_vdpa_create_tir(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tirn); +void mlx5_vdpa_destroy_tir(struct mlx5_vdpa_dev *mvdev, u32 tirn); +int mlx5_vdpa_alloc_transport_domain(struct mlx5_vdpa_dev *mvdev, u32 *tdn); +void mlx5_vdpa_dealloc_transport_domain(struct mlx5_vdpa_dev *mvdev, u32 tdn); +int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev); +void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev); +int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, u32 *mkey, u32 *in, + int inlen); +int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, u32 mkey); +int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, + bool *change_map, unsigned int asid); +int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, + unsigned int asid); +void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev); +void mlx5_vdpa_destroy_mr_asid(struct mlx5_vdpa_dev *mvdev, unsigned int asid); + +#define mlx5_vdpa_warn(__dev, format, ...) \ + dev_warn((__dev)->mdev->device, "%s:%d:(pid %d) warning: " format, __func__, __LINE__, \ + current->pid, ##__VA_ARGS__) + +#define mlx5_vdpa_info(__dev, format, ...) \ + dev_info((__dev)->mdev->device, "%s:%d:(pid %d): " format, __func__, __LINE__, \ + current->pid, ##__VA_ARGS__) + +#define mlx5_vdpa_dbg(__dev, format, ...) \ + dev_debug((__dev)->mdev->device, "%s:%d:(pid %d): " format, __func__, __LINE__, \ + current->pid, ##__VA_ARGS__) + +#endif /* __MLX5_VDPA_H__ */ diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c new file mode 100644 index 0000000000..5a1971fcd8 --- /dev/null +++ b/drivers/vdpa/mlx5/core/mr.c @@ -0,0 +1,621 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Ltd. */ + +#include <linux/vhost_types.h> +#include <linux/vdpa.h> +#include <linux/gcd.h> +#include <linux/string.h> +#include <linux/mlx5/qp.h> +#include "mlx5_vdpa.h" + +/* DIV_ROUND_UP where the divider is a power of 2 give by its log base 2 value */ +#define MLX5_DIV_ROUND_UP_POW2(_n, _s) \ +({ \ + u64 __s = _s; \ + u64 _res; \ + _res = (((_n) + (1 << (__s)) - 1) >> (__s)); \ + _res; \ +}) + +static int get_octo_len(u64 len, int page_shift) +{ + u64 page_size = 1ULL << page_shift; + int npages; + + npages = ALIGN(len, page_size) >> page_shift; + return (npages + 1) / 2; +} + +static void mlx5_set_access_mode(void *mkc, int mode) +{ + MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, mode >> 2); +} + +static void populate_mtts(struct mlx5_vdpa_direct_mr *mr, __be64 *mtt) +{ + struct scatterlist *sg; + int nsg = mr->nsg; + u64 dma_addr; + u64 dma_len; + int j = 0; + int i; + + for_each_sg(mr->sg_head.sgl, sg, mr->nent, i) { + for (dma_addr = sg_dma_address(sg), dma_len = sg_dma_len(sg); + nsg && dma_len; + nsg--, dma_addr += BIT(mr->log_size), dma_len -= BIT(mr->log_size)) + mtt[j++] = cpu_to_be64(dma_addr); + } +} + +static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr) +{ + int inlen; + void *mkc; + void *in; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + roundup(MLX5_ST_SZ_BYTES(mtt) * mr->nsg, 16); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid); + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, lw, !!(mr->perm & VHOST_MAP_WO)); + MLX5_SET(mkc, mkc, lr, !!(mr->perm & VHOST_MAP_RO)); + mlx5_set_access_mode(mkc, MLX5_MKC_ACCESS_MODE_MTT); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, pd, mvdev->res.pdn); + MLX5_SET64(mkc, mkc, start_addr, mr->offset); + MLX5_SET64(mkc, mkc, len, mr->end - mr->start); + MLX5_SET(mkc, mkc, log_page_size, mr->log_size); + MLX5_SET(mkc, mkc, translations_octword_size, + get_octo_len(mr->end - mr->start, mr->log_size)); + MLX5_SET(create_mkey_in, in, translations_octword_actual_size, + get_octo_len(mr->end - mr->start, mr->log_size)); + populate_mtts(mr, MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt)); + err = mlx5_vdpa_create_mkey(mvdev, &mr->mr, in, inlen); + kvfree(in); + if (err) { + mlx5_vdpa_warn(mvdev, "Failed to create direct MR\n"); + return err; + } + + return 0; +} + +static void destroy_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr) +{ + mlx5_vdpa_destroy_mkey(mvdev, mr->mr); +} + +static u64 map_start(struct vhost_iotlb_map *map, struct mlx5_vdpa_direct_mr *mr) +{ + return max_t(u64, map->start, mr->start); +} + +static u64 map_end(struct vhost_iotlb_map *map, struct mlx5_vdpa_direct_mr *mr) +{ + return min_t(u64, map->last + 1, mr->end); +} + +static u64 maplen(struct vhost_iotlb_map *map, struct mlx5_vdpa_direct_mr *mr) +{ + return map_end(map, mr) - map_start(map, mr); +} + +#define MLX5_VDPA_INVALID_START_ADDR ((u64)-1) +#define MLX5_VDPA_INVALID_LEN ((u64)-1) + +static u64 indir_start_addr(struct mlx5_vdpa_mr *mkey) +{ + struct mlx5_vdpa_direct_mr *s; + + s = list_first_entry_or_null(&mkey->head, struct mlx5_vdpa_direct_mr, list); + if (!s) + return MLX5_VDPA_INVALID_START_ADDR; + + return s->start; +} + +static u64 indir_len(struct mlx5_vdpa_mr *mkey) +{ + struct mlx5_vdpa_direct_mr *s; + struct mlx5_vdpa_direct_mr *e; + + s = list_first_entry_or_null(&mkey->head, struct mlx5_vdpa_direct_mr, list); + if (!s) + return MLX5_VDPA_INVALID_LEN; + + e = list_last_entry(&mkey->head, struct mlx5_vdpa_direct_mr, list); + + return e->end - s->start; +} + +#define LOG_MAX_KLM_SIZE 30 +#define MAX_KLM_SIZE BIT(LOG_MAX_KLM_SIZE) + +static u32 klm_bcount(u64 size) +{ + return (u32)size; +} + +static void fill_indir(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mkey, void *in) +{ + struct mlx5_vdpa_direct_mr *dmr; + struct mlx5_klm *klmarr; + struct mlx5_klm *klm; + bool first = true; + u64 preve; + int i; + + klmarr = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); + i = 0; + list_for_each_entry(dmr, &mkey->head, list) { +again: + klm = &klmarr[i++]; + if (first) { + preve = dmr->start; + first = false; + } + + if (preve == dmr->start) { + klm->key = cpu_to_be32(dmr->mr); + klm->bcount = cpu_to_be32(klm_bcount(dmr->end - dmr->start)); + preve = dmr->end; + } else { + klm->key = cpu_to_be32(mvdev->res.null_mkey); + klm->bcount = cpu_to_be32(klm_bcount(dmr->start - preve)); + preve = dmr->start; + goto again; + } + } +} + +static int klm_byte_size(int nklms) +{ + return 16 * ALIGN(nklms, 4); +} + +static int create_indirect_key(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) +{ + int inlen; + void *mkc; + void *in; + int err; + u64 start; + u64 len; + + start = indir_start_addr(mr); + len = indir_len(mr); + if (start == MLX5_VDPA_INVALID_START_ADDR || len == MLX5_VDPA_INVALID_LEN) + return -EINVAL; + + inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + klm_byte_size(mr->num_klms); + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid); + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + mlx5_set_access_mode(mkc, MLX5_MKC_ACCESS_MODE_KLMS); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, pd, mvdev->res.pdn); + MLX5_SET64(mkc, mkc, start_addr, start); + MLX5_SET64(mkc, mkc, len, len); + MLX5_SET(mkc, mkc, translations_octword_size, klm_byte_size(mr->num_klms) / 16); + MLX5_SET(create_mkey_in, in, translations_octword_actual_size, mr->num_klms); + fill_indir(mvdev, mr, in); + err = mlx5_vdpa_create_mkey(mvdev, &mr->mkey, in, inlen); + kfree(in); + return err; +} + +static void destroy_indirect_key(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mkey) +{ + mlx5_vdpa_destroy_mkey(mvdev, mkey->mkey); +} + +static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr, + struct vhost_iotlb *iotlb) +{ + struct vhost_iotlb_map *map; + unsigned long lgcd = 0; + int log_entity_size; + unsigned long size; + u64 start = 0; + int err; + struct page *pg; + unsigned int nsg; + int sglen; + u64 pa; + u64 paend; + struct scatterlist *sg; + struct device *dma = mvdev->vdev.dma_dev; + + for (map = vhost_iotlb_itree_first(iotlb, mr->start, mr->end - 1); + map; map = vhost_iotlb_itree_next(map, start, mr->end - 1)) { + size = maplen(map, mr); + lgcd = gcd(lgcd, size); + start += size; + } + log_entity_size = ilog2(lgcd); + + sglen = 1 << log_entity_size; + nsg = MLX5_DIV_ROUND_UP_POW2(mr->end - mr->start, log_entity_size); + + err = sg_alloc_table(&mr->sg_head, nsg, GFP_KERNEL); + if (err) + return err; + + sg = mr->sg_head.sgl; + for (map = vhost_iotlb_itree_first(iotlb, mr->start, mr->end - 1); + map; map = vhost_iotlb_itree_next(map, mr->start, mr->end - 1)) { + paend = map->addr + maplen(map, mr); + for (pa = map->addr; pa < paend; pa += sglen) { + pg = pfn_to_page(__phys_to_pfn(pa)); + if (!sg) { + mlx5_vdpa_warn(mvdev, "sg null. start 0x%llx, end 0x%llx\n", + map->start, map->last + 1); + err = -ENOMEM; + goto err_map; + } + sg_set_page(sg, pg, sglen, 0); + sg = sg_next(sg); + if (!sg) + goto done; + } + } +done: + mr->log_size = log_entity_size; + mr->nsg = nsg; + mr->nent = dma_map_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0); + if (!mr->nent) { + err = -ENOMEM; + goto err_map; + } + + err = create_direct_mr(mvdev, mr); + if (err) + goto err_direct; + + return 0; + +err_direct: + dma_unmap_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0); +err_map: + sg_free_table(&mr->sg_head); + return err; +} + +static void unmap_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr) +{ + struct device *dma = mvdev->vdev.dma_dev; + + destroy_direct_mr(mvdev, mr); + dma_unmap_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0); + sg_free_table(&mr->sg_head); +} + +static int add_direct_chain(struct mlx5_vdpa_dev *mvdev, u64 start, u64 size, u8 perm, + struct vhost_iotlb *iotlb) +{ + struct mlx5_vdpa_mr *mr = &mvdev->mr; + struct mlx5_vdpa_direct_mr *dmr; + struct mlx5_vdpa_direct_mr *n; + LIST_HEAD(tmp); + u64 st; + u64 sz; + int err; + + st = start; + while (size) { + sz = (u32)min_t(u64, MAX_KLM_SIZE, size); + dmr = kzalloc(sizeof(*dmr), GFP_KERNEL); + if (!dmr) { + err = -ENOMEM; + goto err_alloc; + } + + dmr->start = st; + dmr->end = st + sz; + dmr->perm = perm; + err = map_direct_mr(mvdev, dmr, iotlb); + if (err) { + kfree(dmr); + goto err_alloc; + } + + list_add_tail(&dmr->list, &tmp); + size -= sz; + mr->num_directs++; + mr->num_klms++; + st += sz; + } + list_splice_tail(&tmp, &mr->head); + return 0; + +err_alloc: + list_for_each_entry_safe(dmr, n, &mr->head, list) { + list_del_init(&dmr->list); + unmap_direct_mr(mvdev, dmr); + kfree(dmr); + } + return err; +} + +/* The iotlb pointer contains a list of maps. Go over the maps, possibly + * merging mergeable maps, and create direct memory keys that provide the + * device access to memory. The direct mkeys are then referred to by the + * indirect memory key that provides access to the enitre address space given + * by iotlb. + */ +static int create_user_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) +{ + struct mlx5_vdpa_mr *mr = &mvdev->mr; + struct mlx5_vdpa_direct_mr *dmr; + struct mlx5_vdpa_direct_mr *n; + struct vhost_iotlb_map *map; + u32 pperm = U16_MAX; + u64 last = U64_MAX; + u64 ps = U64_MAX; + u64 pe = U64_MAX; + u64 start = 0; + int err = 0; + int nnuls; + + INIT_LIST_HEAD(&mr->head); + for (map = vhost_iotlb_itree_first(iotlb, start, last); map; + map = vhost_iotlb_itree_next(map, start, last)) { + start = map->start; + if (pe == map->start && pperm == map->perm) { + pe = map->last + 1; + } else { + if (ps != U64_MAX) { + if (pe < map->start) { + /* We have a hole in the map. Check how + * many null keys are required to fill it. + */ + nnuls = MLX5_DIV_ROUND_UP_POW2(map->start - pe, + LOG_MAX_KLM_SIZE); + mr->num_klms += nnuls; + } + err = add_direct_chain(mvdev, ps, pe - ps, pperm, iotlb); + if (err) + goto err_chain; + } + ps = map->start; + pe = map->last + 1; + pperm = map->perm; + } + } + err = add_direct_chain(mvdev, ps, pe - ps, pperm, iotlb); + if (err) + goto err_chain; + + /* Create the memory key that defines the guests's address space. This + * memory key refers to the direct keys that contain the MTT + * translations + */ + err = create_indirect_key(mvdev, mr); + if (err) + goto err_chain; + + mr->user_mr = true; + return 0; + +err_chain: + list_for_each_entry_safe_reverse(dmr, n, &mr->head, list) { + list_del_init(&dmr->list); + unmap_direct_mr(mvdev, dmr); + kfree(dmr); + } + return err; +} + +static int create_dma_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) +{ + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + void *mkc; + u32 *in; + int err; + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, pd, mvdev->res.pdn); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + err = mlx5_vdpa_create_mkey(mvdev, &mr->mkey, in, inlen); + if (!err) + mr->user_mr = false; + + kfree(in); + return err; +} + +static void destroy_dma_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) +{ + mlx5_vdpa_destroy_mkey(mvdev, mr->mkey); +} + +static int dup_iotlb(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *src) +{ + struct vhost_iotlb_map *map; + u64 start = 0, last = ULLONG_MAX; + int err; + + if (!src) { + err = vhost_iotlb_add_range(mvdev->cvq.iotlb, start, last, start, VHOST_ACCESS_RW); + return err; + } + + for (map = vhost_iotlb_itree_first(src, start, last); map; + map = vhost_iotlb_itree_next(map, start, last)) { + err = vhost_iotlb_add_range(mvdev->cvq.iotlb, map->start, map->last, + map->addr, map->perm); + if (err) + return err; + } + return 0; +} + +static void prune_iotlb(struct mlx5_vdpa_dev *mvdev) +{ + vhost_iotlb_del_range(mvdev->cvq.iotlb, 0, ULLONG_MAX); +} + +static void destroy_user_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) +{ + struct mlx5_vdpa_direct_mr *dmr; + struct mlx5_vdpa_direct_mr *n; + + destroy_indirect_key(mvdev, mr); + list_for_each_entry_safe_reverse(dmr, n, &mr->head, list) { + list_del_init(&dmr->list); + unmap_direct_mr(mvdev, dmr); + kfree(dmr); + } +} + +static void _mlx5_vdpa_destroy_cvq_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid) +{ + if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] != asid) + return; + + prune_iotlb(mvdev); +} + +static void _mlx5_vdpa_destroy_dvq_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid) +{ + struct mlx5_vdpa_mr *mr = &mvdev->mr; + + if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] != asid) + return; + + if (!mr->initialized) + return; + + if (mr->user_mr) + destroy_user_mr(mvdev, mr); + else + destroy_dma_mr(mvdev, mr); + + mr->initialized = false; +} + +void mlx5_vdpa_destroy_mr_asid(struct mlx5_vdpa_dev *mvdev, unsigned int asid) +{ + struct mlx5_vdpa_mr *mr = &mvdev->mr; + + mutex_lock(&mr->mkey_mtx); + + _mlx5_vdpa_destroy_dvq_mr(mvdev, asid); + _mlx5_vdpa_destroy_cvq_mr(mvdev, asid); + + mutex_unlock(&mr->mkey_mtx); +} + +void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev) +{ + mlx5_vdpa_destroy_mr_asid(mvdev, mvdev->group2asid[MLX5_VDPA_CVQ_GROUP]); + mlx5_vdpa_destroy_mr_asid(mvdev, mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]); +} + +static int _mlx5_vdpa_create_cvq_mr(struct mlx5_vdpa_dev *mvdev, + struct vhost_iotlb *iotlb, + unsigned int asid) +{ + if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] != asid) + return 0; + + return dup_iotlb(mvdev, iotlb); +} + +static int _mlx5_vdpa_create_dvq_mr(struct mlx5_vdpa_dev *mvdev, + struct vhost_iotlb *iotlb, + unsigned int asid) +{ + struct mlx5_vdpa_mr *mr = &mvdev->mr; + int err; + + if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] != asid) + return 0; + + if (mr->initialized) + return 0; + + if (iotlb) + err = create_user_mr(mvdev, iotlb); + else + err = create_dma_mr(mvdev, mr); + + if (err) + return err; + + mr->initialized = true; + + return 0; +} + +static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, + struct vhost_iotlb *iotlb, unsigned int asid) +{ + int err; + + err = _mlx5_vdpa_create_dvq_mr(mvdev, iotlb, asid); + if (err) + return err; + + err = _mlx5_vdpa_create_cvq_mr(mvdev, iotlb, asid); + if (err) + goto out_err; + + return 0; + +out_err: + _mlx5_vdpa_destroy_dvq_mr(mvdev, asid); + + return err; +} + +int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, + unsigned int asid) +{ + int err; + + mutex_lock(&mvdev->mr.mkey_mtx); + err = _mlx5_vdpa_create_mr(mvdev, iotlb, asid); + mutex_unlock(&mvdev->mr.mkey_mtx); + return err; +} + +int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, + bool *change_map, unsigned int asid) +{ + struct mlx5_vdpa_mr *mr = &mvdev->mr; + int err = 0; + + *change_map = false; + mutex_lock(&mr->mkey_mtx); + if (mr->initialized) { + mlx5_vdpa_info(mvdev, "memory map update\n"); + *change_map = true; + } + if (!*change_map) + err = _mlx5_vdpa_create_mr(mvdev, iotlb, asid); + mutex_unlock(&mr->mkey_mtx); + + return err; +} diff --git a/drivers/vdpa/mlx5/core/resources.c b/drivers/vdpa/mlx5/core/resources.c new file mode 100644 index 0000000000..d5a59c9035 --- /dev/null +++ b/drivers/vdpa/mlx5/core/resources.c @@ -0,0 +1,323 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Ltd. */ + +#include <linux/iova.h> +#include <linux/mlx5/driver.h> +#include "mlx5_vdpa.h" + +static int alloc_pd(struct mlx5_vdpa_dev *dev, u32 *pdn, u16 uid) +{ + struct mlx5_core_dev *mdev = dev->mdev; + + u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {}; + int err; + + MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD); + MLX5_SET(alloc_pd_in, in, uid, uid); + + err = mlx5_cmd_exec_inout(mdev, alloc_pd, in, out); + if (!err) + *pdn = MLX5_GET(alloc_pd_out, out, pd); + + return err; +} + +static int dealloc_pd(struct mlx5_vdpa_dev *dev, u32 pdn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_pd_in)] = {}; + struct mlx5_core_dev *mdev = dev->mdev; + + MLX5_SET(dealloc_pd_in, in, opcode, MLX5_CMD_OP_DEALLOC_PD); + MLX5_SET(dealloc_pd_in, in, pd, pdn); + MLX5_SET(dealloc_pd_in, in, uid, uid); + return mlx5_cmd_exec_in(mdev, dealloc_pd, in); +} + +static int get_null_mkey(struct mlx5_vdpa_dev *dev, u32 *null_mkey) +{ + u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {}; + struct mlx5_core_dev *mdev = dev->mdev; + int err; + + MLX5_SET(query_special_contexts_in, in, opcode, MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); + err = mlx5_cmd_exec_inout(mdev, query_special_contexts, in, out); + if (!err) + *null_mkey = MLX5_GET(query_special_contexts_out, out, null_mkey); + return err; +} + +static int create_uctx(struct mlx5_vdpa_dev *mvdev, u16 *uid) +{ + u32 out[MLX5_ST_SZ_DW(create_uctx_out)] = {}; + int inlen; + void *in; + int err; + + if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) + return 0; + + /* 0 means not supported */ + if (!MLX5_CAP_GEN(mvdev->mdev, log_max_uctx)) + return -EOPNOTSUPP; + + inlen = MLX5_ST_SZ_BYTES(create_uctx_in); + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(create_uctx_in, in, opcode, MLX5_CMD_OP_CREATE_UCTX); + MLX5_SET(create_uctx_in, in, uctx.cap, MLX5_UCTX_CAP_RAW_TX); + + err = mlx5_cmd_exec(mvdev->mdev, in, inlen, out, sizeof(out)); + kfree(in); + if (!err) + *uid = MLX5_GET(create_uctx_out, out, uid); + + return err; +} + +static void destroy_uctx(struct mlx5_vdpa_dev *mvdev, u32 uid) +{ + u32 out[MLX5_ST_SZ_DW(destroy_uctx_out)] = {}; + u32 in[MLX5_ST_SZ_DW(destroy_uctx_in)] = {}; + + if (!uid) + return; + + MLX5_SET(destroy_uctx_in, in, opcode, MLX5_CMD_OP_DESTROY_UCTX); + MLX5_SET(destroy_uctx_in, in, uid, uid); + + mlx5_cmd_exec(mvdev->mdev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_vdpa_create_tis(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tisn) +{ + u32 out[MLX5_ST_SZ_DW(create_tis_out)] = {}; + int err; + + MLX5_SET(create_tis_in, in, opcode, MLX5_CMD_OP_CREATE_TIS); + MLX5_SET(create_tis_in, in, uid, mvdev->res.uid); + err = mlx5_cmd_exec_inout(mvdev->mdev, create_tis, in, out); + if (!err) + *tisn = MLX5_GET(create_tis_out, out, tisn); + + return err; +} + +void mlx5_vdpa_destroy_tis(struct mlx5_vdpa_dev *mvdev, u32 tisn) +{ + u32 in[MLX5_ST_SZ_DW(destroy_tis_in)] = {}; + + MLX5_SET(destroy_tis_in, in, opcode, MLX5_CMD_OP_DESTROY_TIS); + MLX5_SET(destroy_tis_in, in, uid, mvdev->res.uid); + MLX5_SET(destroy_tis_in, in, tisn, tisn); + mlx5_cmd_exec_in(mvdev->mdev, destroy_tis, in); +} + +int mlx5_vdpa_create_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 *rqtn) +{ + u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {}; + int err; + + MLX5_SET(create_rqt_in, in, opcode, MLX5_CMD_OP_CREATE_RQT); + err = mlx5_cmd_exec(mvdev->mdev, in, inlen, out, sizeof(out)); + if (!err) + *rqtn = MLX5_GET(create_rqt_out, out, rqtn); + + return err; +} + +int mlx5_vdpa_modify_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 rqtn) +{ + u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {}; + + MLX5_SET(modify_rqt_in, in, uid, mvdev->res.uid); + MLX5_SET(modify_rqt_in, in, rqtn, rqtn); + MLX5_SET(modify_rqt_in, in, opcode, MLX5_CMD_OP_MODIFY_RQT); + return mlx5_cmd_exec(mvdev->mdev, in, inlen, out, sizeof(out)); +} + +void mlx5_vdpa_destroy_rqt(struct mlx5_vdpa_dev *mvdev, u32 rqtn) +{ + u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {}; + + MLX5_SET(destroy_rqt_in, in, opcode, MLX5_CMD_OP_DESTROY_RQT); + MLX5_SET(destroy_rqt_in, in, uid, mvdev->res.uid); + MLX5_SET(destroy_rqt_in, in, rqtn, rqtn); + mlx5_cmd_exec_in(mvdev->mdev, destroy_rqt, in); +} + +int mlx5_vdpa_create_tir(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tirn) +{ + u32 out[MLX5_ST_SZ_DW(create_tir_out)] = {}; + int err; + + MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR); + err = mlx5_cmd_exec_inout(mvdev->mdev, create_tir, in, out); + if (!err) + *tirn = MLX5_GET(create_tir_out, out, tirn); + + return err; +} + +void mlx5_vdpa_destroy_tir(struct mlx5_vdpa_dev *mvdev, u32 tirn) +{ + u32 in[MLX5_ST_SZ_DW(destroy_tir_in)] = {}; + + MLX5_SET(destroy_tir_in, in, opcode, MLX5_CMD_OP_DESTROY_TIR); + MLX5_SET(destroy_tir_in, in, uid, mvdev->res.uid); + MLX5_SET(destroy_tir_in, in, tirn, tirn); + mlx5_cmd_exec_in(mvdev->mdev, destroy_tir, in); +} + +int mlx5_vdpa_alloc_transport_domain(struct mlx5_vdpa_dev *mvdev, u32 *tdn) +{ + u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)] = {}; + int err; + + MLX5_SET(alloc_transport_domain_in, in, opcode, MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN); + MLX5_SET(alloc_transport_domain_in, in, uid, mvdev->res.uid); + + err = mlx5_cmd_exec_inout(mvdev->mdev, alloc_transport_domain, in, out); + if (!err) + *tdn = MLX5_GET(alloc_transport_domain_out, out, transport_domain); + + return err; +} + +void mlx5_vdpa_dealloc_transport_domain(struct mlx5_vdpa_dev *mvdev, u32 tdn) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)] = {}; + + MLX5_SET(dealloc_transport_domain_in, in, opcode, MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN); + MLX5_SET(dealloc_transport_domain_in, in, uid, mvdev->res.uid); + MLX5_SET(dealloc_transport_domain_in, in, transport_domain, tdn); + mlx5_cmd_exec_in(mvdev->mdev, dealloc_transport_domain, in); +} + +int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, u32 *mkey, u32 *in, + int inlen) +{ + u32 lout[MLX5_ST_SZ_DW(create_mkey_out)] = {}; + u32 mkey_index; + int err; + + MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY); + MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid); + + err = mlx5_cmd_exec(mvdev->mdev, in, inlen, lout, sizeof(lout)); + if (err) + return err; + + mkey_index = MLX5_GET(create_mkey_out, lout, mkey_index); + *mkey = mlx5_idx_to_mkey(mkey_index); + return 0; +} + +int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, u32 mkey) +{ + u32 in[MLX5_ST_SZ_DW(destroy_mkey_in)] = {}; + + MLX5_SET(destroy_mkey_in, in, uid, mvdev->res.uid); + MLX5_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY); + MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey)); + return mlx5_cmd_exec_in(mvdev->mdev, destroy_mkey, in); +} + +static int init_ctrl_vq(struct mlx5_vdpa_dev *mvdev) +{ + mvdev->cvq.iotlb = vhost_iotlb_alloc(0, 0); + if (!mvdev->cvq.iotlb) + return -ENOMEM; + + spin_lock_init(&mvdev->cvq.iommu_lock); + vringh_set_iotlb(&mvdev->cvq.vring, mvdev->cvq.iotlb, &mvdev->cvq.iommu_lock); + + return 0; +} + +static void cleanup_ctrl_vq(struct mlx5_vdpa_dev *mvdev) +{ + vhost_iotlb_free(mvdev->cvq.iotlb); +} + +int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev) +{ + u64 offset = MLX5_CAP64_DEV_VDPA_EMULATION(mvdev->mdev, doorbell_bar_offset); + struct mlx5_vdpa_resources *res = &mvdev->res; + struct mlx5_core_dev *mdev = mvdev->mdev; + u64 kick_addr; + int err; + + if (res->valid) { + mlx5_vdpa_warn(mvdev, "resources already allocated\n"); + return -EINVAL; + } + mutex_init(&mvdev->mr.mkey_mtx); + res->uar = mlx5_get_uars_page(mdev); + if (IS_ERR(res->uar)) { + err = PTR_ERR(res->uar); + goto err_uars; + } + + err = create_uctx(mvdev, &res->uid); + if (err) + goto err_uctx; + + err = alloc_pd(mvdev, &res->pdn, res->uid); + if (err) + goto err_pd; + + err = get_null_mkey(mvdev, &res->null_mkey); + if (err) + goto err_key; + + kick_addr = mdev->bar_addr + offset; + res->phys_kick_addr = kick_addr; + + res->kick_addr = ioremap(kick_addr, PAGE_SIZE); + if (!res->kick_addr) { + err = -ENOMEM; + goto err_key; + } + + err = init_ctrl_vq(mvdev); + if (err) + goto err_ctrl; + + res->valid = true; + + return 0; + +err_ctrl: + iounmap(res->kick_addr); +err_key: + dealloc_pd(mvdev, res->pdn, res->uid); +err_pd: + destroy_uctx(mvdev, res->uid); +err_uctx: + mlx5_put_uars_page(mdev, res->uar); +err_uars: + mutex_destroy(&mvdev->mr.mkey_mtx); + return err; +} + +void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev) +{ + struct mlx5_vdpa_resources *res = &mvdev->res; + + if (!res->valid) + return; + + cleanup_ctrl_vq(mvdev); + iounmap(res->kick_addr); + res->kick_addr = NULL; + dealloc_pd(mvdev, res->pdn, res->uid); + destroy_uctx(mvdev, res->uid); + mlx5_put_uars_page(mvdev->mdev, res->uar); + mutex_destroy(&mvdev->mr.mkey_mtx); + res->valid = false; +} diff --git a/drivers/vdpa/mlx5/net/debug.c b/drivers/vdpa/mlx5/net/debug.c new file mode 100644 index 0000000000..9c85162c19 --- /dev/null +++ b/drivers/vdpa/mlx5/net/debug.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#include <linux/debugfs.h> +#include <linux/mlx5/fs.h> +#include "mlx5_vnet.h" + +static int tirn_show(struct seq_file *file, void *priv) +{ + struct mlx5_vdpa_net *ndev = file->private; + + seq_printf(file, "0x%x\n", ndev->res.tirn); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(tirn); + +void mlx5_vdpa_remove_tirn(struct mlx5_vdpa_net *ndev) +{ + if (ndev->debugfs) + debugfs_remove(ndev->res.tirn_dent); +} + +void mlx5_vdpa_add_tirn(struct mlx5_vdpa_net *ndev) +{ + ndev->res.tirn_dent = debugfs_create_file("tirn", 0444, ndev->rx_dent, + ndev, &tirn_fops); +} + +static int rx_flow_table_show(struct seq_file *file, void *priv) +{ + struct mlx5_vdpa_net *ndev = file->private; + + seq_printf(file, "0x%x\n", mlx5_flow_table_id(ndev->rxft)); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(rx_flow_table); + +void mlx5_vdpa_remove_rx_flow_table(struct mlx5_vdpa_net *ndev) +{ + if (ndev->debugfs) + debugfs_remove(ndev->rx_table_dent); +} + +void mlx5_vdpa_add_rx_flow_table(struct mlx5_vdpa_net *ndev) +{ + ndev->rx_table_dent = debugfs_create_file("table_id", 0444, ndev->rx_dent, + ndev, &rx_flow_table_fops); +} + +#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG) +static int packets_show(struct seq_file *file, void *priv) +{ + struct mlx5_vdpa_counter *counter = file->private; + u64 packets; + u64 bytes; + int err; + + err = mlx5_fc_query(counter->mdev, counter->counter, &packets, &bytes); + if (err) + return err; + + seq_printf(file, "0x%llx\n", packets); + return 0; +} + +static int bytes_show(struct seq_file *file, void *priv) +{ + struct mlx5_vdpa_counter *counter = file->private; + u64 packets; + u64 bytes; + int err; + + err = mlx5_fc_query(counter->mdev, counter->counter, &packets, &bytes); + if (err) + return err; + + seq_printf(file, "0x%llx\n", bytes); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(packets); +DEFINE_SHOW_ATTRIBUTE(bytes); + +static void add_counter_node(struct mlx5_vdpa_counter *counter, + struct dentry *parent) +{ + debugfs_create_file("packets", 0444, parent, counter, + &packets_fops); + debugfs_create_file("bytes", 0444, parent, counter, + &bytes_fops); +} + +void mlx5_vdpa_add_rx_counters(struct mlx5_vdpa_net *ndev, + struct macvlan_node *node) +{ + static const char *ut = "untagged"; + char vidstr[9]; + u16 vid; + + node->ucast_counter.mdev = ndev->mvdev.mdev; + node->mcast_counter.mdev = ndev->mvdev.mdev; + if (node->tagged) { + vid = key2vid(node->macvlan); + snprintf(vidstr, sizeof(vidstr), "0x%x", vid); + } else { + strcpy(vidstr, ut); + } + + node->dent = debugfs_create_dir(vidstr, ndev->rx_dent); + if (IS_ERR(node->dent)) { + node->dent = NULL; + return; + } + + node->ucast_counter.dent = debugfs_create_dir("ucast", node->dent); + if (IS_ERR(node->ucast_counter.dent)) + return; + + add_counter_node(&node->ucast_counter, node->ucast_counter.dent); + + node->mcast_counter.dent = debugfs_create_dir("mcast", node->dent); + if (IS_ERR(node->mcast_counter.dent)) + return; + + add_counter_node(&node->mcast_counter, node->mcast_counter.dent); +} + +void mlx5_vdpa_remove_rx_counters(struct mlx5_vdpa_net *ndev, + struct macvlan_node *node) +{ + if (node->dent && ndev->debugfs) + debugfs_remove_recursive(node->dent); +} +#endif + +void mlx5_vdpa_add_debugfs(struct mlx5_vdpa_net *ndev) +{ + struct mlx5_core_dev *mdev; + + mdev = ndev->mvdev.mdev; + ndev->debugfs = debugfs_create_dir(dev_name(&ndev->mvdev.vdev.dev), + mlx5_debugfs_get_dev_root(mdev)); + if (!IS_ERR(ndev->debugfs)) + ndev->rx_dent = debugfs_create_dir("rx", ndev->debugfs); +} + +void mlx5_vdpa_remove_debugfs(struct mlx5_vdpa_net *ndev) +{ + debugfs_remove_recursive(ndev->debugfs); + ndev->debugfs = NULL; +} diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c new file mode 100644 index 0000000000..ca972af3c8 --- /dev/null +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -0,0 +1,3620 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Ltd. */ + +#include <linux/module.h> +#include <linux/vdpa.h> +#include <linux/vringh.h> +#include <uapi/linux/virtio_net.h> +#include <uapi/linux/virtio_ids.h> +#include <uapi/linux/vdpa.h> +#include <linux/virtio_config.h> +#include <linux/auxiliary_bus.h> +#include <linux/mlx5/cq.h> +#include <linux/mlx5/qp.h> +#include <linux/mlx5/device.h> +#include <linux/mlx5/driver.h> +#include <linux/mlx5/vport.h> +#include <linux/mlx5/fs.h> +#include <linux/mlx5/mlx5_ifc_vdpa.h> +#include <linux/mlx5/mpfs.h> +#include "mlx5_vdpa.h" +#include "mlx5_vnet.h" + +MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>"); +MODULE_DESCRIPTION("Mellanox VDPA driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +#define VALID_FEATURES_MASK \ + (BIT_ULL(VIRTIO_NET_F_CSUM) | BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) | \ + BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | BIT_ULL(VIRTIO_NET_F_MTU) | BIT_ULL(VIRTIO_NET_F_MAC) | \ + BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) | \ + BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | \ + BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | BIT_ULL(VIRTIO_NET_F_HOST_ECN) | BIT_ULL(VIRTIO_NET_F_HOST_UFO) | \ + BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | BIT_ULL(VIRTIO_NET_F_STATUS) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ) | \ + BIT_ULL(VIRTIO_NET_F_CTRL_RX) | BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) | \ + BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) | BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE) | \ + BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | BIT_ULL(VIRTIO_NET_F_HASH_REPORT) | \ + BIT_ULL(VIRTIO_NET_F_RSS) | BIT_ULL(VIRTIO_NET_F_RSC_EXT) | BIT_ULL(VIRTIO_NET_F_STANDBY) | \ + BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX) | BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY) | \ + BIT_ULL(VIRTIO_F_ANY_LAYOUT) | BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) | \ + BIT_ULL(VIRTIO_F_RING_PACKED) | BIT_ULL(VIRTIO_F_ORDER_PLATFORM) | BIT_ULL(VIRTIO_F_SR_IOV)) + +#define VALID_STATUS_MASK \ + (VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK | \ + VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED) + +#define MLX5_FEATURE(_mvdev, _feature) (!!((_mvdev)->actual_features & BIT_ULL(_feature))) + +#define MLX5V_UNTAGGED 0x1000 + +struct mlx5_vdpa_cq_buf { + struct mlx5_frag_buf_ctrl fbc; + struct mlx5_frag_buf frag_buf; + int cqe_size; + int nent; +}; + +struct mlx5_vdpa_cq { + struct mlx5_core_cq mcq; + struct mlx5_vdpa_cq_buf buf; + struct mlx5_db db; + int cqe; +}; + +struct mlx5_vdpa_umem { + struct mlx5_frag_buf_ctrl fbc; + struct mlx5_frag_buf frag_buf; + int size; + u32 id; +}; + +struct mlx5_vdpa_qp { + struct mlx5_core_qp mqp; + struct mlx5_frag_buf frag_buf; + struct mlx5_db db; + u16 head; + bool fw; +}; + +struct mlx5_vq_restore_info { + u32 num_ent; + u64 desc_addr; + u64 device_addr; + u64 driver_addr; + u16 avail_index; + u16 used_index; + struct msi_map map; + bool ready; + bool restore; +}; + +struct mlx5_vdpa_virtqueue { + bool ready; + u64 desc_addr; + u64 device_addr; + u64 driver_addr; + u32 num_ent; + + /* Resources for implementing the notification channel from the device + * to the driver. fwqp is the firmware end of an RC connection; the + * other end is vqqp used by the driver. cq is where completions are + * reported. + */ + struct mlx5_vdpa_cq cq; + struct mlx5_vdpa_qp fwqp; + struct mlx5_vdpa_qp vqqp; + + /* umem resources are required for the virtqueue operation. They're use + * is internal and they must be provided by the driver. + */ + struct mlx5_vdpa_umem umem1; + struct mlx5_vdpa_umem umem2; + struct mlx5_vdpa_umem umem3; + + u32 counter_set_id; + bool initialized; + int index; + u32 virtq_id; + struct mlx5_vdpa_net *ndev; + u16 avail_idx; + u16 used_idx; + int fw_state; + struct msi_map map; + + /* keep last in the struct */ + struct mlx5_vq_restore_info ri; +}; + +static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx) +{ + if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ))) { + if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) + return idx < 2; + else + return idx < 3; + } + + return idx <= mvdev->max_idx; +} + +static void free_resources(struct mlx5_vdpa_net *ndev); +static void init_mvqs(struct mlx5_vdpa_net *ndev); +static int setup_driver(struct mlx5_vdpa_dev *mvdev); +static void teardown_driver(struct mlx5_vdpa_net *ndev); + +static bool mlx5_vdpa_debug; + +#define MLX5_CVQ_MAX_ENT 16 + +#define MLX5_LOG_VIO_FLAG(_feature) \ + do { \ + if (features & BIT_ULL(_feature)) \ + mlx5_vdpa_info(mvdev, "%s\n", #_feature); \ + } while (0) + +#define MLX5_LOG_VIO_STAT(_status) \ + do { \ + if (status & (_status)) \ + mlx5_vdpa_info(mvdev, "%s\n", #_status); \ + } while (0) + +/* TODO: cross-endian support */ +static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev) +{ + return virtio_legacy_is_little_endian() || + (mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1)); +} + +static u16 mlx5vdpa16_to_cpu(struct mlx5_vdpa_dev *mvdev, __virtio16 val) +{ + return __virtio16_to_cpu(mlx5_vdpa_is_little_endian(mvdev), val); +} + +static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val) +{ + return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val); +} + +static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev) +{ + if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ))) + return 2; + + return mvdev->max_vqs; +} + +static bool is_ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev, u16 idx) +{ + return idx == ctrl_vq_idx(mvdev); +} + +static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set) +{ + if (status & ~VALID_STATUS_MASK) + mlx5_vdpa_warn(mvdev, "Warning: there are invalid status bits 0x%x\n", + status & ~VALID_STATUS_MASK); + + if (!mlx5_vdpa_debug) + return; + + mlx5_vdpa_info(mvdev, "driver status %s", set ? "set" : "get"); + if (set && !status) { + mlx5_vdpa_info(mvdev, "driver resets the device\n"); + return; + } + + MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_ACKNOWLEDGE); + MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER); + MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER_OK); + MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FEATURES_OK); + MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_NEEDS_RESET); + MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FAILED); +} + +static void print_features(struct mlx5_vdpa_dev *mvdev, u64 features, bool set) +{ + if (features & ~VALID_FEATURES_MASK) + mlx5_vdpa_warn(mvdev, "There are invalid feature bits 0x%llx\n", + features & ~VALID_FEATURES_MASK); + + if (!mlx5_vdpa_debug) + return; + + mlx5_vdpa_info(mvdev, "driver %s feature bits:\n", set ? "sets" : "reads"); + if (!features) + mlx5_vdpa_info(mvdev, "all feature bits are cleared\n"); + + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CSUM); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_CSUM); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MTU); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MAC); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO4); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO6); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ECN); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_UFO); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO4); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO6); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_ECN); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_UFO); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MRG_RXBUF); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STATUS); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VQ); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VLAN); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX_EXTRA); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ANNOUNCE); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MQ); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_MAC_ADDR); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HASH_REPORT); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSS); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSC_EXT); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STANDBY); + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_SPEED_DUPLEX); + MLX5_LOG_VIO_FLAG(VIRTIO_F_NOTIFY_ON_EMPTY); + MLX5_LOG_VIO_FLAG(VIRTIO_F_ANY_LAYOUT); + MLX5_LOG_VIO_FLAG(VIRTIO_F_VERSION_1); + MLX5_LOG_VIO_FLAG(VIRTIO_F_ACCESS_PLATFORM); + MLX5_LOG_VIO_FLAG(VIRTIO_F_RING_PACKED); + MLX5_LOG_VIO_FLAG(VIRTIO_F_ORDER_PLATFORM); + MLX5_LOG_VIO_FLAG(VIRTIO_F_SR_IOV); +} + +static int create_tis(struct mlx5_vdpa_net *ndev) +{ + struct mlx5_vdpa_dev *mvdev = &ndev->mvdev; + u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {}; + void *tisc; + int err; + + tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); + MLX5_SET(tisc, tisc, transport_domain, ndev->res.tdn); + err = mlx5_vdpa_create_tis(mvdev, in, &ndev->res.tisn); + if (err) + mlx5_vdpa_warn(mvdev, "create TIS (%d)\n", err); + + return err; +} + +static void destroy_tis(struct mlx5_vdpa_net *ndev) +{ + mlx5_vdpa_destroy_tis(&ndev->mvdev, ndev->res.tisn); +} + +#define MLX5_VDPA_CQE_SIZE 64 +#define MLX5_VDPA_LOG_CQE_SIZE ilog2(MLX5_VDPA_CQE_SIZE) + +static int cq_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf, int nent) +{ + struct mlx5_frag_buf *frag_buf = &buf->frag_buf; + u8 log_wq_stride = MLX5_VDPA_LOG_CQE_SIZE; + u8 log_wq_sz = MLX5_VDPA_LOG_CQE_SIZE; + int err; + + err = mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, nent * MLX5_VDPA_CQE_SIZE, frag_buf, + ndev->mvdev.mdev->priv.numa_node); + if (err) + return err; + + mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc); + + buf->cqe_size = MLX5_VDPA_CQE_SIZE; + buf->nent = nent; + + return 0; +} + +static int umem_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem, int size) +{ + struct mlx5_frag_buf *frag_buf = &umem->frag_buf; + + return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, size, frag_buf, + ndev->mvdev.mdev->priv.numa_node); +} + +static void cq_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf) +{ + mlx5_frag_buf_free(ndev->mvdev.mdev, &buf->frag_buf); +} + +static void *get_cqe(struct mlx5_vdpa_cq *vcq, int n) +{ + return mlx5_frag_buf_get_wqe(&vcq->buf.fbc, n); +} + +static void cq_frag_buf_init(struct mlx5_vdpa_cq *vcq, struct mlx5_vdpa_cq_buf *buf) +{ + struct mlx5_cqe64 *cqe64; + void *cqe; + int i; + + for (i = 0; i < buf->nent; i++) { + cqe = get_cqe(vcq, i); + cqe64 = cqe; + cqe64->op_own = MLX5_CQE_INVALID << 4; + } +} + +static void *get_sw_cqe(struct mlx5_vdpa_cq *cq, int n) +{ + struct mlx5_cqe64 *cqe64 = get_cqe(cq, n & (cq->cqe - 1)); + + if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) && + !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & cq->cqe))) + return cqe64; + + return NULL; +} + +static void rx_post(struct mlx5_vdpa_qp *vqp, int n) +{ + vqp->head += n; + vqp->db.db[0] = cpu_to_be32(vqp->head); +} + +static void qp_prepare(struct mlx5_vdpa_net *ndev, bool fw, void *in, + struct mlx5_vdpa_virtqueue *mvq, u32 num_ent) +{ + struct mlx5_vdpa_qp *vqp; + __be64 *pas; + void *qpc; + + vqp = fw ? &mvq->fwqp : &mvq->vqqp; + MLX5_SET(create_qp_in, in, uid, ndev->mvdev.res.uid); + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + if (vqp->fw) { + /* Firmware QP is allocated by the driver for the firmware's + * use so we can skip part of the params as they will be chosen by firmware + */ + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ); + MLX5_SET(qpc, qpc, no_sq, 1); + return; + } + + MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn); + MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES); + MLX5_SET(qpc, qpc, uar_page, ndev->mvdev.res.uar->index); + MLX5_SET(qpc, qpc, log_page_size, vqp->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(qpc, qpc, no_sq, 1); + MLX5_SET(qpc, qpc, cqn_rcv, mvq->cq.mcq.cqn); + MLX5_SET(qpc, qpc, log_rq_size, ilog2(num_ent)); + MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ); + pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas); + mlx5_fill_page_frag_array(&vqp->frag_buf, pas); +} + +static int rq_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp, u32 num_ent) +{ + return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, + num_ent * sizeof(struct mlx5_wqe_data_seg), &vqp->frag_buf, + ndev->mvdev.mdev->priv.numa_node); +} + +static void rq_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp) +{ + mlx5_frag_buf_free(ndev->mvdev.mdev, &vqp->frag_buf); +} + +static int qp_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, + struct mlx5_vdpa_qp *vqp) +{ + struct mlx5_core_dev *mdev = ndev->mvdev.mdev; + int inlen = MLX5_ST_SZ_BYTES(create_qp_in); + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; + void *qpc; + void *in; + int err; + + if (!vqp->fw) { + vqp = &mvq->vqqp; + err = rq_buf_alloc(ndev, vqp, mvq->num_ent); + if (err) + return err; + + err = mlx5_db_alloc(ndev->mvdev.mdev, &vqp->db); + if (err) + goto err_db; + inlen += vqp->frag_buf.npages * sizeof(__be64); + } + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_kzalloc; + } + + qp_prepare(ndev, vqp->fw, in, mvq, mvq->num_ent); + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn); + MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES); + if (!vqp->fw) + MLX5_SET64(qpc, qpc, dbr_addr, vqp->db.dma); + MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); + err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); + kfree(in); + if (err) + goto err_kzalloc; + + vqp->mqp.uid = ndev->mvdev.res.uid; + vqp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn); + + if (!vqp->fw) + rx_post(vqp, mvq->num_ent); + + return 0; + +err_kzalloc: + if (!vqp->fw) + mlx5_db_free(ndev->mvdev.mdev, &vqp->db); +err_db: + if (!vqp->fw) + rq_buf_free(ndev, vqp); + + return err; +} + +static void qp_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp) +{ + u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; + + MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, in, qpn, vqp->mqp.qpn); + MLX5_SET(destroy_qp_in, in, uid, ndev->mvdev.res.uid); + if (mlx5_cmd_exec_in(ndev->mvdev.mdev, destroy_qp, in)) + mlx5_vdpa_warn(&ndev->mvdev, "destroy qp 0x%x\n", vqp->mqp.qpn); + if (!vqp->fw) { + mlx5_db_free(ndev->mvdev.mdev, &vqp->db); + rq_buf_free(ndev, vqp); + } +} + +static void *next_cqe_sw(struct mlx5_vdpa_cq *cq) +{ + return get_sw_cqe(cq, cq->mcq.cons_index); +} + +static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq) +{ + struct mlx5_cqe64 *cqe64; + + cqe64 = next_cqe_sw(vcq); + if (!cqe64) + return -EAGAIN; + + vcq->mcq.cons_index++; + return 0; +} + +static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num) +{ + struct mlx5_vdpa_net *ndev = mvq->ndev; + struct vdpa_callback *event_cb; + + event_cb = &ndev->event_cbs[mvq->index]; + mlx5_cq_set_ci(&mvq->cq.mcq); + + /* make sure CQ cosumer update is visible to the hardware before updating + * RX doorbell record. + */ + dma_wmb(); + rx_post(&mvq->vqqp, num); + if (event_cb->callback) + event_cb->callback(event_cb->private); +} + +static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe) +{ + struct mlx5_vdpa_virtqueue *mvq = container_of(mcq, struct mlx5_vdpa_virtqueue, cq.mcq); + struct mlx5_vdpa_net *ndev = mvq->ndev; + void __iomem *uar_page = ndev->mvdev.res.uar->map; + int num = 0; + + while (!mlx5_vdpa_poll_one(&mvq->cq)) { + num++; + if (num > mvq->num_ent / 2) { + /* If completions keep coming while we poll, we want to + * let the hardware know that we consumed them by + * updating the doorbell record. We also let vdpa core + * know about this so it passes it on the virtio driver + * on the guest. + */ + mlx5_vdpa_handle_completions(mvq, num); + num = 0; + } + } + + if (num) + mlx5_vdpa_handle_completions(mvq, num); + + mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index); +} + +static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent) +{ + struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx]; + struct mlx5_core_dev *mdev = ndev->mvdev.mdev; + void __iomem *uar_page = ndev->mvdev.res.uar->map; + u32 out[MLX5_ST_SZ_DW(create_cq_out)]; + struct mlx5_vdpa_cq *vcq = &mvq->cq; + __be64 *pas; + int inlen; + void *cqc; + void *in; + int err; + int eqn; + + err = mlx5_db_alloc(mdev, &vcq->db); + if (err) + return err; + + vcq->mcq.set_ci_db = vcq->db.db; + vcq->mcq.arm_db = vcq->db.db + 1; + vcq->mcq.cqe_sz = 64; + + err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent); + if (err) + goto err_db; + + cq_frag_buf_init(vcq, &vcq->buf); + + inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * vcq->buf.frag_buf.npages; + in = kzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_vzalloc; + } + + MLX5_SET(create_cq_in, in, uid, ndev->mvdev.res.uid); + pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas); + mlx5_fill_page_frag_array(&vcq->buf.frag_buf, pas); + + cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); + MLX5_SET(cqc, cqc, log_page_size, vcq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); + + /* Use vector 0 by default. Consider adding code to choose least used + * vector. + */ + err = mlx5_comp_eqn_get(mdev, 0, &eqn); + if (err) + goto err_vec; + + cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); + MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent)); + MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index); + MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); + MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma); + + err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out)); + if (err) + goto err_vec; + + vcq->mcq.comp = mlx5_vdpa_cq_comp; + vcq->cqe = num_ent; + vcq->mcq.set_ci_db = vcq->db.db; + vcq->mcq.arm_db = vcq->db.db + 1; + mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index); + kfree(in); + return 0; + +err_vec: + kfree(in); +err_vzalloc: + cq_frag_buf_free(ndev, &vcq->buf); +err_db: + mlx5_db_free(ndev->mvdev.mdev, &vcq->db); + return err; +} + +static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx) +{ + struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx]; + struct mlx5_core_dev *mdev = ndev->mvdev.mdev; + struct mlx5_vdpa_cq *vcq = &mvq->cq; + + if (mlx5_core_destroy_cq(mdev, &vcq->mcq)) { + mlx5_vdpa_warn(&ndev->mvdev, "destroy CQ 0x%x\n", vcq->mcq.cqn); + return; + } + cq_frag_buf_free(ndev, &vcq->buf); + mlx5_db_free(ndev->mvdev.mdev, &vcq->db); +} + +static int read_umem_params(struct mlx5_vdpa_net *ndev) +{ + u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; + u16 opmod = (MLX5_CAP_VDPA_EMULATION << 1) | (HCA_CAP_OPMOD_GET_CUR & 0x01); + struct mlx5_core_dev *mdev = ndev->mvdev.mdev; + int out_size; + void *caps; + void *out; + int err; + + out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out); + out = kzalloc(out_size, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + MLX5_SET(query_hca_cap_in, in, op_mod, opmod); + err = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out); + if (err) { + mlx5_vdpa_warn(&ndev->mvdev, + "Failed reading vdpa umem capabilities with err %d\n", err); + goto out; + } + + caps = MLX5_ADDR_OF(query_hca_cap_out, out, capability); + + ndev->umem_1_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_1_buffer_param_a); + ndev->umem_1_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_1_buffer_param_b); + + ndev->umem_2_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_2_buffer_param_a); + ndev->umem_2_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_2_buffer_param_b); + + ndev->umem_3_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_3_buffer_param_a); + ndev->umem_3_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_3_buffer_param_b); + +out: + kfree(out); + return 0; +} + +static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num, + struct mlx5_vdpa_umem **umemp) +{ + u32 p_a; + u32 p_b; + + switch (num) { + case 1: + p_a = ndev->umem_1_buffer_param_a; + p_b = ndev->umem_1_buffer_param_b; + *umemp = &mvq->umem1; + break; + case 2: + p_a = ndev->umem_2_buffer_param_a; + p_b = ndev->umem_2_buffer_param_b; + *umemp = &mvq->umem2; + break; + case 3: + p_a = ndev->umem_3_buffer_param_a; + p_b = ndev->umem_3_buffer_param_b; + *umemp = &mvq->umem3; + break; + } + + (*umemp)->size = p_a * mvq->num_ent + p_b; +} + +static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem) +{ + mlx5_frag_buf_free(ndev->mvdev.mdev, &umem->frag_buf); +} + +static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num) +{ + int inlen; + u32 out[MLX5_ST_SZ_DW(create_umem_out)] = {}; + void *um; + void *in; + int err; + __be64 *pas; + struct mlx5_vdpa_umem *umem; + + set_umem_size(ndev, mvq, num, &umem); + err = umem_frag_buf_alloc(ndev, umem, umem->size); + if (err) + return err; + + inlen = MLX5_ST_SZ_BYTES(create_umem_in) + MLX5_ST_SZ_BYTES(mtt) * umem->frag_buf.npages; + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_in; + } + + MLX5_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM); + MLX5_SET(create_umem_in, in, uid, ndev->mvdev.res.uid); + um = MLX5_ADDR_OF(create_umem_in, in, umem); + MLX5_SET(umem, um, log_page_size, umem->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET64(umem, um, num_of_mtt, umem->frag_buf.npages); + + pas = (__be64 *)MLX5_ADDR_OF(umem, um, mtt[0]); + mlx5_fill_page_frag_array_perm(&umem->frag_buf, pas, MLX5_MTT_PERM_RW); + + err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out)); + if (err) { + mlx5_vdpa_warn(&ndev->mvdev, "create umem(%d)\n", err); + goto err_cmd; + } + + kfree(in); + umem->id = MLX5_GET(create_umem_out, out, umem_id); + + return 0; + +err_cmd: + kfree(in); +err_in: + umem_frag_buf_free(ndev, umem); + return err; +} + +static void umem_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num) +{ + u32 in[MLX5_ST_SZ_DW(destroy_umem_in)] = {}; + u32 out[MLX5_ST_SZ_DW(destroy_umem_out)] = {}; + struct mlx5_vdpa_umem *umem; + + switch (num) { + case 1: + umem = &mvq->umem1; + break; + case 2: + umem = &mvq->umem2; + break; + case 3: + umem = &mvq->umem3; + break; + } + + MLX5_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM); + MLX5_SET(destroy_umem_in, in, umem_id, umem->id); + if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) + return; + + umem_frag_buf_free(ndev, umem); +} + +static int umems_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +{ + int num; + int err; + + for (num = 1; num <= 3; num++) { + err = create_umem(ndev, mvq, num); + if (err) + goto err_umem; + } + return 0; + +err_umem: + for (num--; num > 0; num--) + umem_destroy(ndev, mvq, num); + + return err; +} + +static void umems_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +{ + int num; + + for (num = 3; num > 0; num--) + umem_destroy(ndev, mvq, num); +} + +static int get_queue_type(struct mlx5_vdpa_net *ndev) +{ + u32 type_mask; + + type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type); + + /* prefer split queue */ + if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT) + return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT; + + WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED)); + + return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED; +} + +static bool vq_is_tx(u16 idx) +{ + return idx % 2; +} + +enum { + MLX5_VIRTIO_NET_F_MRG_RXBUF = 2, + MLX5_VIRTIO_NET_F_HOST_ECN = 4, + MLX5_VIRTIO_NET_F_GUEST_ECN = 6, + MLX5_VIRTIO_NET_F_GUEST_TSO6 = 7, + MLX5_VIRTIO_NET_F_GUEST_TSO4 = 8, + MLX5_VIRTIO_NET_F_GUEST_CSUM = 9, + MLX5_VIRTIO_NET_F_CSUM = 10, + MLX5_VIRTIO_NET_F_HOST_TSO6 = 11, + MLX5_VIRTIO_NET_F_HOST_TSO4 = 12, +}; + +static u16 get_features(u64 features) +{ + return (!!(features & BIT_ULL(VIRTIO_NET_F_MRG_RXBUF)) << MLX5_VIRTIO_NET_F_MRG_RXBUF) | + (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_ECN)) << MLX5_VIRTIO_NET_F_HOST_ECN) | + (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_ECN)) << MLX5_VIRTIO_NET_F_GUEST_ECN) | + (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO6)) << MLX5_VIRTIO_NET_F_GUEST_TSO6) | + (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO4)) << MLX5_VIRTIO_NET_F_GUEST_TSO4) | + (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << MLX5_VIRTIO_NET_F_CSUM) | + (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << MLX5_VIRTIO_NET_F_HOST_TSO6) | + (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << MLX5_VIRTIO_NET_F_HOST_TSO4); +} + +static bool counters_supported(const struct mlx5_vdpa_dev *mvdev) +{ + return MLX5_CAP_GEN_64(mvdev->mdev, general_obj_types) & + BIT_ULL(MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS); +} + +static bool msix_mode_supported(struct mlx5_vdpa_dev *mvdev) +{ + return MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, event_mode) & + (1 << MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE) && + pci_msix_can_alloc_dyn(mvdev->mdev->pdev); +} + +static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +{ + int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in); + u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {}; + void *obj_context; + u16 mlx_features; + void *cmd_hdr; + void *vq_ctx; + void *in; + int err; + + err = umems_create(ndev, mvq); + if (err) + return err; + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_alloc; + } + + mlx_features = get_features(ndev->mvdev.actual_features); + cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr); + + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid); + + obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context); + MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx); + MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx); + MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3, + mlx_features >> 3); + MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_2_0, + mlx_features & 7); + vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context); + MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev)); + + if (vq_is_tx(mvq->index)) + MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn); + + if (mvq->map.virq) { + MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE); + MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->map.index); + } else { + MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE); + MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn); + } + + MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index); + MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent); + MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0, + !!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1))); + MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr); + MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr); + MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr); + MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, ndev->mvdev.mr.mkey); + MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id); + MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size); + MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id); + MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size); + MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id); + MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size); + MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn); + if (counters_supported(&ndev->mvdev)) + MLX5_SET(virtio_q, vq_ctx, counter_set_id, mvq->counter_set_id); + + err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out)); + if (err) + goto err_cmd; + + mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT; + kfree(in); + mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + + return 0; + +err_cmd: + kfree(in); +err_alloc: + umems_destroy(ndev, mvq); + return err; +} + +static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +{ + u32 in[MLX5_ST_SZ_DW(destroy_virtio_net_q_in)] = {}; + u32 out[MLX5_ST_SZ_DW(destroy_virtio_net_q_out)] = {}; + + MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.opcode, + MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_id, mvq->virtq_id); + MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.uid, ndev->mvdev.res.uid); + MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_type, + MLX5_OBJ_TYPE_VIRTIO_NET_Q); + if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) { + mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id); + return; + } + mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE; + umems_destroy(ndev, mvq); +} + +static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw) +{ + return fw ? mvq->vqqp.mqp.qpn : mvq->fwqp.mqp.qpn; +} + +static u32 get_qpn(struct mlx5_vdpa_virtqueue *mvq, bool fw) +{ + return fw ? mvq->fwqp.mqp.qpn : mvq->vqqp.mqp.qpn; +} + +static void alloc_inout(struct mlx5_vdpa_net *ndev, int cmd, void **in, int *inlen, void **out, + int *outlen, u32 qpn, u32 rqpn) +{ + void *qpc; + void *pp; + + switch (cmd) { + case MLX5_CMD_OP_2RST_QP: + *inlen = MLX5_ST_SZ_BYTES(qp_2rst_in); + *outlen = MLX5_ST_SZ_BYTES(qp_2rst_out); + *in = kzalloc(*inlen, GFP_KERNEL); + *out = kzalloc(*outlen, GFP_KERNEL); + if (!*in || !*out) + goto outerr; + + MLX5_SET(qp_2rst_in, *in, opcode, cmd); + MLX5_SET(qp_2rst_in, *in, uid, ndev->mvdev.res.uid); + MLX5_SET(qp_2rst_in, *in, qpn, qpn); + break; + case MLX5_CMD_OP_RST2INIT_QP: + *inlen = MLX5_ST_SZ_BYTES(rst2init_qp_in); + *outlen = MLX5_ST_SZ_BYTES(rst2init_qp_out); + *in = kzalloc(*inlen, GFP_KERNEL); + *out = kzalloc(MLX5_ST_SZ_BYTES(rst2init_qp_out), GFP_KERNEL); + if (!*in || !*out) + goto outerr; + + MLX5_SET(rst2init_qp_in, *in, opcode, cmd); + MLX5_SET(rst2init_qp_in, *in, uid, ndev->mvdev.res.uid); + MLX5_SET(rst2init_qp_in, *in, qpn, qpn); + qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc); + MLX5_SET(qpc, qpc, remote_qpn, rqpn); + MLX5_SET(qpc, qpc, rwe, 1); + pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path); + MLX5_SET(ads, pp, vhca_port_num, 1); + break; + case MLX5_CMD_OP_INIT2RTR_QP: + *inlen = MLX5_ST_SZ_BYTES(init2rtr_qp_in); + *outlen = MLX5_ST_SZ_BYTES(init2rtr_qp_out); + *in = kzalloc(*inlen, GFP_KERNEL); + *out = kzalloc(MLX5_ST_SZ_BYTES(init2rtr_qp_out), GFP_KERNEL); + if (!*in || !*out) + goto outerr; + + MLX5_SET(init2rtr_qp_in, *in, opcode, cmd); + MLX5_SET(init2rtr_qp_in, *in, uid, ndev->mvdev.res.uid); + MLX5_SET(init2rtr_qp_in, *in, qpn, qpn); + qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc); + MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES); + MLX5_SET(qpc, qpc, log_msg_max, 30); + MLX5_SET(qpc, qpc, remote_qpn, rqpn); + pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path); + MLX5_SET(ads, pp, fl, 1); + break; + case MLX5_CMD_OP_RTR2RTS_QP: + *inlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_in); + *outlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_out); + *in = kzalloc(*inlen, GFP_KERNEL); + *out = kzalloc(MLX5_ST_SZ_BYTES(rtr2rts_qp_out), GFP_KERNEL); + if (!*in || !*out) + goto outerr; + + MLX5_SET(rtr2rts_qp_in, *in, opcode, cmd); + MLX5_SET(rtr2rts_qp_in, *in, uid, ndev->mvdev.res.uid); + MLX5_SET(rtr2rts_qp_in, *in, qpn, qpn); + qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc); + pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path); + MLX5_SET(ads, pp, ack_timeout, 14); + MLX5_SET(qpc, qpc, retry_count, 7); + MLX5_SET(qpc, qpc, rnr_retry, 7); + break; + default: + goto outerr_nullify; + } + + return; + +outerr: + kfree(*in); + kfree(*out); +outerr_nullify: + *in = NULL; + *out = NULL; +} + +static void free_inout(void *in, void *out) +{ + kfree(in); + kfree(out); +} + +/* Two QPs are used by each virtqueue. One is used by the driver and one by + * firmware. The fw argument indicates whether the subjected QP is the one used + * by firmware. + */ +static int modify_qp(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, bool fw, int cmd) +{ + int outlen; + int inlen; + void *out; + void *in; + int err; + + alloc_inout(ndev, cmd, &in, &inlen, &out, &outlen, get_qpn(mvq, fw), get_rqpn(mvq, fw)); + if (!in || !out) + return -ENOMEM; + + err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, outlen); + free_inout(in, out); + return err; +} + +static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +{ + int err; + + err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_2RST_QP); + if (err) + return err; + + err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_2RST_QP); + if (err) + return err; + + err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_RST2INIT_QP); + if (err) + return err; + + err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_RST2INIT_QP); + if (err) + return err; + + err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_INIT2RTR_QP); + if (err) + return err; + + err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_INIT2RTR_QP); + if (err) + return err; + + return modify_qp(ndev, mvq, true, MLX5_CMD_OP_RTR2RTS_QP); +} + +struct mlx5_virtq_attr { + u8 state; + u16 available_index; + u16 used_index; +}; + +static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, + struct mlx5_virtq_attr *attr) +{ + int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out); + u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {}; + void *out; + void *obj_context; + void *cmd_hdr; + int err; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr); + + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid); + err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen); + if (err) + goto err_cmd; + + obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context); + memset(attr, 0, sizeof(*attr)); + attr->state = MLX5_GET(virtio_net_q_object, obj_context, state); + attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index); + attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index); + kfree(out); + return 0; + +err_cmd: + kfree(out); + return err; +} + +static bool is_valid_state_change(int oldstate, int newstate) +{ + switch (oldstate) { + case MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT: + return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY; + case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY: + return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND; + case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND: + case MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR: + default: + return false; + } +} + +static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in); + u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {}; + void *obj_context; + void *cmd_hdr; + void *in; + int err; + + if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_NONE) + return 0; + + if (!is_valid_state_change(mvq->fw_state, state)) + return -EINVAL; + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr); + + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid); + + obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context); + MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select, + MLX5_VIRTQ_MODIFY_MASK_STATE); + MLX5_SET(virtio_net_q_object, obj_context, state, state); + err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out)); + kfree(in); + if (!err) + mvq->fw_state = state; + + return err; +} + +static int counter_set_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +{ + u32 in[MLX5_ST_SZ_DW(create_virtio_q_counters_in)] = {}; + u32 out[MLX5_ST_SZ_DW(create_virtio_q_counters_out)] = {}; + void *cmd_hdr; + int err; + + if (!counters_supported(&ndev->mvdev)) + return 0; + + cmd_hdr = MLX5_ADDR_OF(create_virtio_q_counters_in, in, hdr); + + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid); + + err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + mvq->counter_set_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + + return 0; +} + +static void counter_set_dealloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +{ + u32 in[MLX5_ST_SZ_DW(destroy_virtio_q_counters_in)] = {}; + u32 out[MLX5_ST_SZ_DW(destroy_virtio_q_counters_out)] = {}; + + if (!counters_supported(&ndev->mvdev)) + return; + + MLX5_SET(destroy_virtio_q_counters_in, in, hdr.opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(destroy_virtio_q_counters_in, in, hdr.obj_id, mvq->counter_set_id); + MLX5_SET(destroy_virtio_q_counters_in, in, hdr.uid, ndev->mvdev.res.uid); + MLX5_SET(destroy_virtio_q_counters_in, in, hdr.obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS); + if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) + mlx5_vdpa_warn(&ndev->mvdev, "dealloc counter set 0x%x\n", mvq->counter_set_id); +} + +static irqreturn_t mlx5_vdpa_int_handler(int irq, void *priv) +{ + struct vdpa_callback *cb = priv; + + if (cb->callback) + return cb->callback(cb->private); + + return IRQ_HANDLED; +} + +static void alloc_vector(struct mlx5_vdpa_net *ndev, + struct mlx5_vdpa_virtqueue *mvq) +{ + struct mlx5_vdpa_irq_pool *irqp = &ndev->irqp; + struct mlx5_vdpa_irq_pool_entry *ent; + int err; + int i; + + for (i = 0; i < irqp->num_ent; i++) { + ent = &irqp->entries[i]; + if (!ent->used) { + snprintf(ent->name, MLX5_VDPA_IRQ_NAME_LEN, "%s-vq-%d", + dev_name(&ndev->mvdev.vdev.dev), mvq->index); + ent->dev_id = &ndev->event_cbs[mvq->index]; + err = request_irq(ent->map.virq, mlx5_vdpa_int_handler, 0, + ent->name, ent->dev_id); + if (err) + return; + + ent->used = true; + mvq->map = ent->map; + return; + } + } +} + +static void dealloc_vector(struct mlx5_vdpa_net *ndev, + struct mlx5_vdpa_virtqueue *mvq) +{ + struct mlx5_vdpa_irq_pool *irqp = &ndev->irqp; + int i; + + for (i = 0; i < irqp->num_ent; i++) + if (mvq->map.virq == irqp->entries[i].map.virq) { + free_irq(mvq->map.virq, irqp->entries[i].dev_id); + irqp->entries[i].used = false; + return; + } +} + +static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +{ + u16 idx = mvq->index; + int err; + + if (!mvq->num_ent) + return 0; + + if (mvq->initialized) + return 0; + + err = cq_create(ndev, idx, mvq->num_ent); + if (err) + return err; + + err = qp_create(ndev, mvq, &mvq->fwqp); + if (err) + goto err_fwqp; + + err = qp_create(ndev, mvq, &mvq->vqqp); + if (err) + goto err_vqqp; + + err = connect_qps(ndev, mvq); + if (err) + goto err_connect; + + err = counter_set_alloc(ndev, mvq); + if (err) + goto err_connect; + + alloc_vector(ndev, mvq); + err = create_virtqueue(ndev, mvq); + if (err) + goto err_vq; + + if (mvq->ready) { + err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY); + if (err) { + mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n", + idx, err); + goto err_modify; + } + } + + mvq->initialized = true; + return 0; + +err_modify: + destroy_virtqueue(ndev, mvq); +err_vq: + dealloc_vector(ndev, mvq); + counter_set_dealloc(ndev, mvq); +err_connect: + qp_destroy(ndev, &mvq->vqqp); +err_vqqp: + qp_destroy(ndev, &mvq->fwqp); +err_fwqp: + cq_destroy(ndev, idx); + return err; +} + +static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +{ + struct mlx5_virtq_attr attr; + + if (!mvq->initialized) + return; + + if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) + return; + + if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND)) + mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n"); + + if (query_virtqueue(ndev, mvq, &attr)) { + mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n"); + return; + } + mvq->avail_idx = attr.available_index; + mvq->used_idx = attr.used_index; +} + +static void suspend_vqs(struct mlx5_vdpa_net *ndev) +{ + int i; + + for (i = 0; i < ndev->mvdev.max_vqs; i++) + suspend_vq(ndev, &ndev->vqs[i]); +} + +static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +{ + if (!mvq->initialized) + return; + + suspend_vq(ndev, mvq); + destroy_virtqueue(ndev, mvq); + dealloc_vector(ndev, mvq); + counter_set_dealloc(ndev, mvq); + qp_destroy(ndev, &mvq->vqqp); + qp_destroy(ndev, &mvq->fwqp); + cq_destroy(ndev, mvq->index); + mvq->initialized = false; +} + +static int create_rqt(struct mlx5_vdpa_net *ndev) +{ + int rqt_table_size = roundup_pow_of_two(ndev->rqt_size); + int act_sz = roundup_pow_of_two(ndev->cur_num_vqs / 2); + __be32 *list; + void *rqtc; + int inlen; + void *in; + int i, j; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + rqt_table_size * MLX5_ST_SZ_BYTES(rq_num); + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(create_rqt_in, in, uid, ndev->mvdev.res.uid); + rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context); + + MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q); + MLX5_SET(rqtc, rqtc, rqt_max_size, rqt_table_size); + list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]); + for (i = 0, j = 0; i < act_sz; i++, j += 2) + list[i] = cpu_to_be32(ndev->vqs[j % ndev->cur_num_vqs].virtq_id); + + MLX5_SET(rqtc, rqtc, rqt_actual_size, act_sz); + err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn); + kfree(in); + if (err) + return err; + + return 0; +} + +#define MLX5_MODIFY_RQT_NUM_RQS ((u64)1) + +static int modify_rqt(struct mlx5_vdpa_net *ndev, int num) +{ + int act_sz = roundup_pow_of_two(num / 2); + __be32 *list; + void *rqtc; + int inlen; + void *in; + int i, j; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + act_sz * MLX5_ST_SZ_BYTES(rq_num); + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_rqt_in, in, uid, ndev->mvdev.res.uid); + MLX5_SET64(modify_rqt_in, in, bitmask, MLX5_MODIFY_RQT_NUM_RQS); + rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx); + MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q); + + list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]); + for (i = 0, j = 0; i < act_sz; i++, j = j + 2) + list[i] = cpu_to_be32(ndev->vqs[j % num].virtq_id); + + MLX5_SET(rqtc, rqtc, rqt_actual_size, act_sz); + err = mlx5_vdpa_modify_rqt(&ndev->mvdev, in, inlen, ndev->res.rqtn); + kfree(in); + if (err) + return err; + + return 0; +} + +static void destroy_rqt(struct mlx5_vdpa_net *ndev) +{ + mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn); +} + +static int create_tir(struct mlx5_vdpa_net *ndev) +{ +#define HASH_IP_L4PORTS \ + (MLX5_HASH_FIELD_SEL_SRC_IP | MLX5_HASH_FIELD_SEL_DST_IP | MLX5_HASH_FIELD_SEL_L4_SPORT | \ + MLX5_HASH_FIELD_SEL_L4_DPORT) + static const u8 rx_hash_toeplitz_key[] = { 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7, + 0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94, + 0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1, + 0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59, + 0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a }; + void *rss_key; + void *outer; + void *tirc; + void *in; + int err; + + in = kzalloc(MLX5_ST_SZ_BYTES(create_tir_in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(create_tir_in, in, uid, ndev->mvdev.res.uid); + tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); + MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT); + + MLX5_SET(tirc, tirc, rx_hash_symmetric, 1); + MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ); + rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key); + memcpy(rss_key, rx_hash_toeplitz_key, sizeof(rx_hash_toeplitz_key)); + + outer = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); + MLX5_SET(rx_hash_field_select, outer, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4); + MLX5_SET(rx_hash_field_select, outer, l4_prot_type, MLX5_L4_PROT_TYPE_TCP); + MLX5_SET(rx_hash_field_select, outer, selected_fields, HASH_IP_L4PORTS); + + MLX5_SET(tirc, tirc, indirect_table, ndev->res.rqtn); + MLX5_SET(tirc, tirc, transport_domain, ndev->res.tdn); + + err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn); + kfree(in); + if (err) + return err; + + mlx5_vdpa_add_tirn(ndev); + return err; +} + +static void destroy_tir(struct mlx5_vdpa_net *ndev) +{ + mlx5_vdpa_remove_tirn(ndev); + mlx5_vdpa_destroy_tir(&ndev->mvdev, ndev->res.tirn); +} + +#define MAX_STEERING_ENT 0x8000 +#define MAX_STEERING_GROUPS 2 + +#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG) + #define NUM_DESTS 2 +#else + #define NUM_DESTS 1 +#endif + +static int add_steering_counters(struct mlx5_vdpa_net *ndev, + struct macvlan_node *node, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_destination *dests) +{ +#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG) + int err; + + node->ucast_counter.counter = mlx5_fc_create(ndev->mvdev.mdev, false); + if (IS_ERR(node->ucast_counter.counter)) + return PTR_ERR(node->ucast_counter.counter); + + node->mcast_counter.counter = mlx5_fc_create(ndev->mvdev.mdev, false); + if (IS_ERR(node->mcast_counter.counter)) { + err = PTR_ERR(node->mcast_counter.counter); + goto err_mcast_counter; + } + + dests[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + return 0; + +err_mcast_counter: + mlx5_fc_destroy(ndev->mvdev.mdev, node->ucast_counter.counter); + return err; +#else + return 0; +#endif +} + +static void remove_steering_counters(struct mlx5_vdpa_net *ndev, + struct macvlan_node *node) +{ +#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG) + mlx5_fc_destroy(ndev->mvdev.mdev, node->mcast_counter.counter); + mlx5_fc_destroy(ndev->mvdev.mdev, node->ucast_counter.counter); +#endif +} + +static int mlx5_vdpa_add_mac_vlan_rules(struct mlx5_vdpa_net *ndev, u8 *mac, + struct macvlan_node *node) +{ + struct mlx5_flow_destination dests[NUM_DESTS] = {}; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_spec *spec; + void *headers_c; + void *headers_v; + u8 *dmac_c; + u8 *dmac_v; + int err; + u16 vid; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + vid = key2vid(node->macvlan); + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers); + dmac_c = MLX5_ADDR_OF(fte_match_param, headers_c, outer_headers.dmac_47_16); + dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, outer_headers.dmac_47_16); + eth_broadcast_addr(dmac_c); + ether_addr_copy(dmac_v, mac); + if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN)) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 1); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, first_vid); + } + if (node->tagged) { + MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, vid); + } + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + dests[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR; + dests[0].tir_num = ndev->res.tirn; + err = add_steering_counters(ndev, node, &flow_act, dests); + if (err) + goto out_free; + +#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG) + dests[1].counter_id = mlx5_fc_id(node->ucast_counter.counter); +#endif + node->ucast_rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dests, NUM_DESTS); + if (IS_ERR(node->ucast_rule)) { + err = PTR_ERR(node->ucast_rule); + goto err_ucast; + } + +#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG) + dests[1].counter_id = mlx5_fc_id(node->mcast_counter.counter); +#endif + + memset(dmac_c, 0, ETH_ALEN); + memset(dmac_v, 0, ETH_ALEN); + dmac_c[0] = 1; + dmac_v[0] = 1; + node->mcast_rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dests, NUM_DESTS); + if (IS_ERR(node->mcast_rule)) { + err = PTR_ERR(node->mcast_rule); + goto err_mcast; + } + kvfree(spec); + mlx5_vdpa_add_rx_counters(ndev, node); + return 0; + +err_mcast: + mlx5_del_flow_rules(node->ucast_rule); +err_ucast: + remove_steering_counters(ndev, node); +out_free: + kvfree(spec); + return err; +} + +static void mlx5_vdpa_del_mac_vlan_rules(struct mlx5_vdpa_net *ndev, + struct macvlan_node *node) +{ + mlx5_vdpa_remove_rx_counters(ndev, node); + mlx5_del_flow_rules(node->ucast_rule); + mlx5_del_flow_rules(node->mcast_rule); +} + +static u64 search_val(u8 *mac, u16 vlan, bool tagged) +{ + u64 val; + + if (!tagged) + vlan = MLX5V_UNTAGGED; + + val = (u64)vlan << 48 | + (u64)mac[0] << 40 | + (u64)mac[1] << 32 | + (u64)mac[2] << 24 | + (u64)mac[3] << 16 | + (u64)mac[4] << 8 | + (u64)mac[5]; + + return val; +} + +static struct macvlan_node *mac_vlan_lookup(struct mlx5_vdpa_net *ndev, u64 value) +{ + struct macvlan_node *pos; + u32 idx; + + idx = hash_64(value, 8); // tbd 8 + hlist_for_each_entry(pos, &ndev->macvlan_hash[idx], hlist) { + if (pos->macvlan == value) + return pos; + } + return NULL; +} + +static int mac_vlan_add(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vid, bool tagged) +{ + struct macvlan_node *ptr; + u64 val; + u32 idx; + int err; + + val = search_val(mac, vid, tagged); + if (mac_vlan_lookup(ndev, val)) + return -EEXIST; + + ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + ptr->tagged = tagged; + ptr->macvlan = val; + ptr->ndev = ndev; + err = mlx5_vdpa_add_mac_vlan_rules(ndev, ndev->config.mac, ptr); + if (err) + goto err_add; + + idx = hash_64(val, 8); + hlist_add_head(&ptr->hlist, &ndev->macvlan_hash[idx]); + return 0; + +err_add: + kfree(ptr); + return err; +} + +static void mac_vlan_del(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vlan, bool tagged) +{ + struct macvlan_node *ptr; + + ptr = mac_vlan_lookup(ndev, search_val(mac, vlan, tagged)); + if (!ptr) + return; + + hlist_del(&ptr->hlist); + mlx5_vdpa_del_mac_vlan_rules(ndev, ptr); + remove_steering_counters(ndev, ptr); + kfree(ptr); +} + +static void clear_mac_vlan_table(struct mlx5_vdpa_net *ndev) +{ + struct macvlan_node *pos; + struct hlist_node *n; + int i; + + for (i = 0; i < MLX5V_MACVLAN_SIZE; i++) { + hlist_for_each_entry_safe(pos, n, &ndev->macvlan_hash[i], hlist) { + hlist_del(&pos->hlist); + mlx5_vdpa_del_mac_vlan_rules(ndev, pos); + remove_steering_counters(ndev, pos); + kfree(pos); + } + } +} + +static int setup_steering(struct mlx5_vdpa_net *ndev) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_namespace *ns; + int err; + + ft_attr.max_fte = MAX_STEERING_ENT; + ft_attr.autogroup.max_num_groups = MAX_STEERING_GROUPS; + + ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS); + if (!ns) { + mlx5_vdpa_warn(&ndev->mvdev, "failed to get flow namespace\n"); + return -EOPNOTSUPP; + } + + ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); + if (IS_ERR(ndev->rxft)) { + mlx5_vdpa_warn(&ndev->mvdev, "failed to create flow table\n"); + return PTR_ERR(ndev->rxft); + } + mlx5_vdpa_add_rx_flow_table(ndev); + + err = mac_vlan_add(ndev, ndev->config.mac, 0, false); + if (err) + goto err_add; + + return 0; + +err_add: + mlx5_vdpa_remove_rx_flow_table(ndev); + mlx5_destroy_flow_table(ndev->rxft); + return err; +} + +static void teardown_steering(struct mlx5_vdpa_net *ndev) +{ + clear_mac_vlan_table(ndev); + mlx5_vdpa_remove_rx_flow_table(ndev); + mlx5_destroy_flow_table(ndev->rxft); +} + +static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd) +{ + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + struct mlx5_control_vq *cvq = &mvdev->cvq; + virtio_net_ctrl_ack status = VIRTIO_NET_ERR; + struct mlx5_core_dev *pfmdev; + size_t read; + u8 mac[ETH_ALEN], mac_back[ETH_ALEN]; + + pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev)); + switch (cmd) { + case VIRTIO_NET_CTRL_MAC_ADDR_SET: + read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN); + if (read != ETH_ALEN) + break; + + if (!memcmp(ndev->config.mac, mac, 6)) { + status = VIRTIO_NET_OK; + break; + } + + if (is_zero_ether_addr(mac)) + break; + + if (!is_zero_ether_addr(ndev->config.mac)) { + if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) { + mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n", + ndev->config.mac); + break; + } + } + + if (mlx5_mpfs_add_mac(pfmdev, mac)) { + mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n", + mac); + break; + } + + /* backup the original mac address so that if failed to add the forward rules + * we could restore it + */ + memcpy(mac_back, ndev->config.mac, ETH_ALEN); + + memcpy(ndev->config.mac, mac, ETH_ALEN); + + /* Need recreate the flow table entry, so that the packet could forward back + */ + mac_vlan_del(ndev, mac_back, 0, false); + + if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) { + mlx5_vdpa_warn(mvdev, "failed to insert forward rules, try to restore\n"); + + /* Although it hardly run here, we still need double check */ + if (is_zero_ether_addr(mac_back)) { + mlx5_vdpa_warn(mvdev, "restore mac failed: Original MAC is zero\n"); + break; + } + + /* Try to restore original mac address to MFPS table, and try to restore + * the forward rule entry. + */ + if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) { + mlx5_vdpa_warn(mvdev, "restore mac failed: delete MAC %pM from MPFS table failed\n", + ndev->config.mac); + } + + if (mlx5_mpfs_add_mac(pfmdev, mac_back)) { + mlx5_vdpa_warn(mvdev, "restore mac failed: insert old MAC %pM into MPFS table failed\n", + mac_back); + } + + memcpy(ndev->config.mac, mac_back, ETH_ALEN); + + if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) + mlx5_vdpa_warn(mvdev, "restore forward rules failed: insert forward rules failed\n"); + + break; + } + + status = VIRTIO_NET_OK; + break; + + default: + break; + } + + return status; +} + +static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps) +{ + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + int cur_qps = ndev->cur_num_vqs / 2; + int err; + int i; + + if (cur_qps > newqps) { + err = modify_rqt(ndev, 2 * newqps); + if (err) + return err; + + for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--) + teardown_vq(ndev, &ndev->vqs[i]); + + ndev->cur_num_vqs = 2 * newqps; + } else { + ndev->cur_num_vqs = 2 * newqps; + for (i = cur_qps * 2; i < 2 * newqps; i++) { + err = setup_vq(ndev, &ndev->vqs[i]); + if (err) + goto clean_added; + } + err = modify_rqt(ndev, 2 * newqps); + if (err) + goto clean_added; + } + return 0; + +clean_added: + for (--i; i >= 2 * cur_qps; --i) + teardown_vq(ndev, &ndev->vqs[i]); + + ndev->cur_num_vqs = 2 * cur_qps; + + return err; +} + +static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd) +{ + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + virtio_net_ctrl_ack status = VIRTIO_NET_ERR; + struct mlx5_control_vq *cvq = &mvdev->cvq; + struct virtio_net_ctrl_mq mq; + size_t read; + u16 newqps; + + switch (cmd) { + case VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET: + /* This mq feature check aligns with pre-existing userspace + * implementation. + * + * Without it, an untrusted driver could fake a multiqueue config + * request down to a non-mq device that may cause kernel to + * panic due to uninitialized resources for extra vqs. Even with + * a well behaving guest driver, it is not expected to allow + * changing the number of vqs on a non-mq device. + */ + if (!MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) + break; + + read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)&mq, sizeof(mq)); + if (read != sizeof(mq)) + break; + + newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs); + if (newqps < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN || + newqps > ndev->rqt_size) + break; + + if (ndev->cur_num_vqs == 2 * newqps) { + status = VIRTIO_NET_OK; + break; + } + + if (!change_num_qps(mvdev, newqps)) + status = VIRTIO_NET_OK; + + break; + default: + break; + } + + return status; +} + +static virtio_net_ctrl_ack handle_ctrl_vlan(struct mlx5_vdpa_dev *mvdev, u8 cmd) +{ + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + virtio_net_ctrl_ack status = VIRTIO_NET_ERR; + struct mlx5_control_vq *cvq = &mvdev->cvq; + __virtio16 vlan; + size_t read; + u16 id; + + if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN))) + return status; + + switch (cmd) { + case VIRTIO_NET_CTRL_VLAN_ADD: + read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &vlan, sizeof(vlan)); + if (read != sizeof(vlan)) + break; + + id = mlx5vdpa16_to_cpu(mvdev, vlan); + if (mac_vlan_add(ndev, ndev->config.mac, id, true)) + break; + + status = VIRTIO_NET_OK; + break; + case VIRTIO_NET_CTRL_VLAN_DEL: + read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &vlan, sizeof(vlan)); + if (read != sizeof(vlan)) + break; + + id = mlx5vdpa16_to_cpu(mvdev, vlan); + mac_vlan_del(ndev, ndev->config.mac, id, true); + status = VIRTIO_NET_OK; + break; + default: + break; + } + + return status; +} + +static void mlx5_cvq_kick_handler(struct work_struct *work) +{ + virtio_net_ctrl_ack status = VIRTIO_NET_ERR; + struct virtio_net_ctrl_hdr ctrl; + struct mlx5_vdpa_wq_ent *wqent; + struct mlx5_vdpa_dev *mvdev; + struct mlx5_control_vq *cvq; + struct mlx5_vdpa_net *ndev; + size_t read, write; + int err; + + wqent = container_of(work, struct mlx5_vdpa_wq_ent, work); + mvdev = wqent->mvdev; + ndev = to_mlx5_vdpa_ndev(mvdev); + cvq = &mvdev->cvq; + + down_write(&ndev->reslock); + + if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) + goto out; + + if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) + goto out; + + if (!cvq->ready) + goto out; + + while (true) { + err = vringh_getdesc_iotlb(&cvq->vring, &cvq->riov, &cvq->wiov, &cvq->head, + GFP_ATOMIC); + if (err <= 0) + break; + + read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &ctrl, sizeof(ctrl)); + if (read != sizeof(ctrl)) + break; + + cvq->received_desc++; + switch (ctrl.class) { + case VIRTIO_NET_CTRL_MAC: + status = handle_ctrl_mac(mvdev, ctrl.cmd); + break; + case VIRTIO_NET_CTRL_MQ: + status = handle_ctrl_mq(mvdev, ctrl.cmd); + break; + case VIRTIO_NET_CTRL_VLAN: + status = handle_ctrl_vlan(mvdev, ctrl.cmd); + break; + default: + break; + } + + /* Make sure data is written before advancing index */ + smp_wmb(); + + write = vringh_iov_push_iotlb(&cvq->vring, &cvq->wiov, &status, sizeof(status)); + vringh_complete_iotlb(&cvq->vring, cvq->head, write); + vringh_kiov_cleanup(&cvq->riov); + vringh_kiov_cleanup(&cvq->wiov); + + if (vringh_need_notify_iotlb(&cvq->vring)) + vringh_notify(&cvq->vring); + + cvq->completed_desc++; + queue_work(mvdev->wq, &wqent->work); + break; + } + +out: + up_write(&ndev->reslock); +} + +static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + struct mlx5_vdpa_virtqueue *mvq; + + if (!is_index_valid(mvdev, idx)) + return; + + if (unlikely(is_ctrl_vq_idx(mvdev, idx))) { + if (!mvdev->wq || !mvdev->cvq.ready) + return; + + queue_work(mvdev->wq, &ndev->cvq_ent.work); + return; + } + + mvq = &ndev->vqs[idx]; + if (unlikely(!mvq->ready)) + return; + + iowrite16(idx, ndev->mvdev.res.kick_addr); +} + +static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area, + u64 driver_area, u64 device_area) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + struct mlx5_vdpa_virtqueue *mvq; + + if (!is_index_valid(mvdev, idx)) + return -EINVAL; + + if (is_ctrl_vq_idx(mvdev, idx)) { + mvdev->cvq.desc_addr = desc_area; + mvdev->cvq.device_addr = device_area; + mvdev->cvq.driver_addr = driver_area; + return 0; + } + + mvq = &ndev->vqs[idx]; + mvq->desc_addr = desc_area; + mvq->device_addr = device_area; + mvq->driver_addr = driver_area; + return 0; +} + +static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + struct mlx5_vdpa_virtqueue *mvq; + + if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx)) + return; + + mvq = &ndev->vqs[idx]; + mvq->num_ent = num; +} + +static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + + ndev->event_cbs[idx] = *cb; + if (is_ctrl_vq_idx(mvdev, idx)) + mvdev->cvq.event_cb = *cb; +} + +static void mlx5_cvq_notify(struct vringh *vring) +{ + struct mlx5_control_vq *cvq = container_of(vring, struct mlx5_control_vq, vring); + + if (!cvq->event_cb.callback) + return; + + cvq->event_cb.callback(cvq->event_cb.private); +} + +static void set_cvq_ready(struct mlx5_vdpa_dev *mvdev, bool ready) +{ + struct mlx5_control_vq *cvq = &mvdev->cvq; + + cvq->ready = ready; + if (!ready) + return; + + cvq->vring.notify = mlx5_cvq_notify; +} + +static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + struct mlx5_vdpa_virtqueue *mvq; + int err; + + if (!mvdev->actual_features) + return; + + if (!is_index_valid(mvdev, idx)) + return; + + if (is_ctrl_vq_idx(mvdev, idx)) { + set_cvq_ready(mvdev, ready); + return; + } + + mvq = &ndev->vqs[idx]; + if (!ready) { + suspend_vq(ndev, mvq); + } else { + err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY); + if (err) { + mlx5_vdpa_warn(mvdev, "modify VQ %d to ready failed (%d)\n", idx, err); + ready = false; + } + } + + + mvq->ready = ready; +} + +static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + + if (!is_index_valid(mvdev, idx)) + return false; + + if (is_ctrl_vq_idx(mvdev, idx)) + return mvdev->cvq.ready; + + return ndev->vqs[idx].ready; +} + +static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx, + const struct vdpa_vq_state *state) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + struct mlx5_vdpa_virtqueue *mvq; + + if (!is_index_valid(mvdev, idx)) + return -EINVAL; + + if (is_ctrl_vq_idx(mvdev, idx)) { + mvdev->cvq.vring.last_avail_idx = state->split.avail_index; + return 0; + } + + mvq = &ndev->vqs[idx]; + if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) { + mlx5_vdpa_warn(mvdev, "can't modify available index\n"); + return -EINVAL; + } + + mvq->used_idx = state->split.avail_index; + mvq->avail_idx = state->split.avail_index; + return 0; +} + +static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + struct mlx5_vdpa_virtqueue *mvq; + struct mlx5_virtq_attr attr; + int err; + + if (!is_index_valid(mvdev, idx)) + return -EINVAL; + + if (is_ctrl_vq_idx(mvdev, idx)) { + state->split.avail_index = mvdev->cvq.vring.last_avail_idx; + return 0; + } + + mvq = &ndev->vqs[idx]; + /* If the virtq object was destroyed, use the value saved at + * the last minute of suspend_vq. This caters for userspace + * that cares about emulating the index after vq is stopped. + */ + if (!mvq->initialized) { + /* Firmware returns a wrong value for the available index. + * Since both values should be identical, we take the value of + * used_idx which is reported correctly. + */ + state->split.avail_index = mvq->used_idx; + return 0; + } + + err = query_virtqueue(ndev, mvq, &attr); + if (err) { + mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n"); + return err; + } + state->split.avail_index = attr.used_index; + return 0; +} + +static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev) +{ + return PAGE_SIZE; +} + +static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdev, u16 idx) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + + if (is_ctrl_vq_idx(mvdev, idx)) + return MLX5_VDPA_CVQ_GROUP; + + return MLX5_VDPA_DATAVQ_GROUP; +} + +static u64 mlx_to_vritio_features(u16 dev_features) +{ + u64 result = 0; + + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_MRG_RXBUF)) + result |= BIT_ULL(VIRTIO_NET_F_MRG_RXBUF); + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_ECN)) + result |= BIT_ULL(VIRTIO_NET_F_HOST_ECN); + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_ECN)) + result |= BIT_ULL(VIRTIO_NET_F_GUEST_ECN); + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_TSO6)) + result |= BIT_ULL(VIRTIO_NET_F_GUEST_TSO6); + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_TSO4)) + result |= BIT_ULL(VIRTIO_NET_F_GUEST_TSO4); + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_CSUM)) + result |= BIT_ULL(VIRTIO_NET_F_GUEST_CSUM); + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_CSUM)) + result |= BIT_ULL(VIRTIO_NET_F_CSUM); + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_TSO6)) + result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO6); + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_TSO4)) + result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO4); + + return result; +} + +static u64 get_supported_features(struct mlx5_core_dev *mdev) +{ + u64 mlx_vdpa_features = 0; + u16 dev_features; + + dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mdev, device_features_bits_mask); + mlx_vdpa_features |= mlx_to_vritio_features(dev_features); + if (MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_version_1_0)) + mlx_vdpa_features |= BIT_ULL(VIRTIO_F_VERSION_1); + mlx_vdpa_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM); + mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VQ); + mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR); + mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MQ); + mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_STATUS); + mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MTU); + mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VLAN); + mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MAC); + + return mlx_vdpa_features; +} + +static u64 mlx5_vdpa_get_device_features(struct vdpa_device *vdev) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + + print_features(mvdev, ndev->mvdev.mlx_features, false); + return ndev->mvdev.mlx_features; +} + +static int verify_driver_features(struct mlx5_vdpa_dev *mvdev, u64 features) +{ + /* Minimum features to expect */ + if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM))) + return -EOPNOTSUPP; + + /* Double check features combination sent down by the driver. + * Fail invalid features due to absence of the depended feature. + * + * Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit + * requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ". + * By failing the invalid features sent down by untrusted drivers, + * we're assured the assumption made upon is_index_valid() and + * is_ctrl_vq_idx() will not be compromised. + */ + if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) == + BIT_ULL(VIRTIO_NET_F_MQ)) + return -EINVAL; + + return 0; +} + +static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev) +{ + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + int err; + int i; + + for (i = 0; i < mvdev->max_vqs; i++) { + err = setup_vq(ndev, &ndev->vqs[i]); + if (err) + goto err_vq; + } + + return 0; + +err_vq: + for (--i; i >= 0; i--) + teardown_vq(ndev, &ndev->vqs[i]); + + return err; +} + +static void teardown_virtqueues(struct mlx5_vdpa_net *ndev) +{ + struct mlx5_vdpa_virtqueue *mvq; + int i; + + for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) { + mvq = &ndev->vqs[i]; + if (!mvq->initialized) + continue; + + teardown_vq(ndev, mvq); + } +} + +static void update_cvq_info(struct mlx5_vdpa_dev *mvdev) +{ + if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_CTRL_VQ)) { + if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) { + /* MQ supported. CVQ index is right above the last data virtqueue's */ + mvdev->max_idx = mvdev->max_vqs; + } else { + /* Only CVQ supportted. data virtqueues occupy indices 0 and 1. + * CVQ gets index 2 + */ + mvdev->max_idx = 2; + } + } else { + /* Two data virtqueues only: one for rx and one for tx */ + mvdev->max_idx = 1; + } +} + +static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) +{ + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {}; + int err; + + MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE); + MLX5_SET(query_vport_state_in, in, op_mod, opmod); + MLX5_SET(query_vport_state_in, in, vport_number, vport); + if (vport) + MLX5_SET(query_vport_state_in, in, other_vport, 1); + + err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out); + if (err) + return 0; + + return MLX5_GET(query_vport_state_out, out, state); +} + +static bool get_link_state(struct mlx5_vdpa_dev *mvdev) +{ + if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0) == + VPORT_STATE_UP) + return true; + + return false; +} + +static void update_carrier(struct work_struct *work) +{ + struct mlx5_vdpa_wq_ent *wqent; + struct mlx5_vdpa_dev *mvdev; + struct mlx5_vdpa_net *ndev; + + wqent = container_of(work, struct mlx5_vdpa_wq_ent, work); + mvdev = wqent->mvdev; + ndev = to_mlx5_vdpa_ndev(mvdev); + if (get_link_state(mvdev)) + ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP); + else + ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP); + + if (ndev->config_cb.callback) + ndev->config_cb.callback(ndev->config_cb.private); + + kfree(wqent); +} + +static int queue_link_work(struct mlx5_vdpa_net *ndev) +{ + struct mlx5_vdpa_wq_ent *wqent; + + wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC); + if (!wqent) + return -ENOMEM; + + wqent->mvdev = &ndev->mvdev; + INIT_WORK(&wqent->work, update_carrier); + queue_work(ndev->mvdev.wq, &wqent->work); + return 0; +} + +static int event_handler(struct notifier_block *nb, unsigned long event, void *param) +{ + struct mlx5_vdpa_net *ndev = container_of(nb, struct mlx5_vdpa_net, nb); + struct mlx5_eqe *eqe = param; + int ret = NOTIFY_DONE; + + if (event == MLX5_EVENT_TYPE_PORT_CHANGE) { + switch (eqe->sub_type) { + case MLX5_PORT_CHANGE_SUBTYPE_DOWN: + case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: + if (queue_link_work(ndev)) + return NOTIFY_DONE; + + ret = NOTIFY_OK; + break; + default: + return NOTIFY_DONE; + } + return ret; + } + return ret; +} + +static void register_link_notifier(struct mlx5_vdpa_net *ndev) +{ + if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_STATUS))) + return; + + ndev->nb.notifier_call = event_handler; + mlx5_notifier_register(ndev->mvdev.mdev, &ndev->nb); + ndev->nb_registered = true; + queue_link_work(ndev); +} + +static void unregister_link_notifier(struct mlx5_vdpa_net *ndev) +{ + if (!ndev->nb_registered) + return; + + ndev->nb_registered = false; + mlx5_notifier_unregister(ndev->mvdev.mdev, &ndev->nb); + if (ndev->mvdev.wq) + flush_workqueue(ndev->mvdev.wq); +} + +static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + int err; + + print_features(mvdev, features, true); + + err = verify_driver_features(mvdev, features); + if (err) + return err; + + ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features; + if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_MQ)) + ndev->rqt_size = mlx5vdpa16_to_cpu(mvdev, ndev->config.max_virtqueue_pairs); + else + ndev->rqt_size = 1; + + /* Device must start with 1 queue pair, as per VIRTIO v1.2 spec, section + * 5.1.6.5.5 "Device operation in multiqueue mode": + * + * Multiqueue is disabled by default. + * The driver enables multiqueue by sending a command using class + * VIRTIO_NET_CTRL_MQ. The command selects the mode of multiqueue + * operation, as follows: ... + */ + ndev->cur_num_vqs = 2; + + update_cvq_info(mvdev); + return err; +} + +static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + + ndev->config_cb = *cb; +} + +#define MLX5_VDPA_MAX_VQ_ENTRIES 256 +static u16 mlx5_vdpa_get_vq_num_max(struct vdpa_device *vdev) +{ + return MLX5_VDPA_MAX_VQ_ENTRIES; +} + +static u32 mlx5_vdpa_get_device_id(struct vdpa_device *vdev) +{ + return VIRTIO_ID_NET; +} + +static u32 mlx5_vdpa_get_vendor_id(struct vdpa_device *vdev) +{ + return PCI_VENDOR_ID_MELLANOX; +} + +static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + + print_status(mvdev, ndev->mvdev.status, false); + return ndev->mvdev.status; +} + +static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +{ + struct mlx5_vq_restore_info *ri = &mvq->ri; + struct mlx5_virtq_attr attr = {}; + int err; + + if (mvq->initialized) { + err = query_virtqueue(ndev, mvq, &attr); + if (err) + return err; + } + + ri->avail_index = attr.available_index; + ri->used_index = attr.used_index; + ri->ready = mvq->ready; + ri->num_ent = mvq->num_ent; + ri->desc_addr = mvq->desc_addr; + ri->device_addr = mvq->device_addr; + ri->driver_addr = mvq->driver_addr; + ri->map = mvq->map; + ri->restore = true; + return 0; +} + +static int save_channels_info(struct mlx5_vdpa_net *ndev) +{ + int i; + + for (i = 0; i < ndev->mvdev.max_vqs; i++) { + memset(&ndev->vqs[i].ri, 0, sizeof(ndev->vqs[i].ri)); + save_channel_info(ndev, &ndev->vqs[i]); + } + return 0; +} + +static void mlx5_clear_vqs(struct mlx5_vdpa_net *ndev) +{ + int i; + + for (i = 0; i < ndev->mvdev.max_vqs; i++) + memset(&ndev->vqs[i], 0, offsetof(struct mlx5_vdpa_virtqueue, ri)); +} + +static void restore_channels_info(struct mlx5_vdpa_net *ndev) +{ + struct mlx5_vdpa_virtqueue *mvq; + struct mlx5_vq_restore_info *ri; + int i; + + mlx5_clear_vqs(ndev); + init_mvqs(ndev); + for (i = 0; i < ndev->mvdev.max_vqs; i++) { + mvq = &ndev->vqs[i]; + ri = &mvq->ri; + if (!ri->restore) + continue; + + mvq->avail_idx = ri->avail_index; + mvq->used_idx = ri->used_index; + mvq->ready = ri->ready; + mvq->num_ent = ri->num_ent; + mvq->desc_addr = ri->desc_addr; + mvq->device_addr = ri->device_addr; + mvq->driver_addr = ri->driver_addr; + mvq->map = ri->map; + } +} + +static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev, + struct vhost_iotlb *iotlb, unsigned int asid) +{ + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + int err; + + suspend_vqs(ndev); + err = save_channels_info(ndev); + if (err) + goto err_mr; + + teardown_driver(ndev); + mlx5_vdpa_destroy_mr_asid(mvdev, asid); + err = mlx5_vdpa_create_mr(mvdev, iotlb, asid); + if (err) + goto err_mr; + + if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) || mvdev->suspended) + goto err_mr; + + restore_channels_info(ndev); + err = setup_driver(mvdev); + if (err) + goto err_setup; + + return 0; + +err_setup: + mlx5_vdpa_destroy_mr_asid(mvdev, asid); +err_mr: + return err; +} + +/* reslock must be held for this function */ +static int setup_driver(struct mlx5_vdpa_dev *mvdev) +{ + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + int err; + + WARN_ON(!rwsem_is_locked(&ndev->reslock)); + + if (ndev->setup) { + mlx5_vdpa_warn(mvdev, "setup driver called for already setup driver\n"); + err = 0; + goto out; + } + mlx5_vdpa_add_debugfs(ndev); + + err = read_umem_params(ndev); + if (err) + goto err_setup; + + err = setup_virtqueues(mvdev); + if (err) { + mlx5_vdpa_warn(mvdev, "setup_virtqueues\n"); + goto err_setup; + } + + err = create_rqt(ndev); + if (err) { + mlx5_vdpa_warn(mvdev, "create_rqt\n"); + goto err_rqt; + } + + err = create_tir(ndev); + if (err) { + mlx5_vdpa_warn(mvdev, "create_tir\n"); + goto err_tir; + } + + err = setup_steering(ndev); + if (err) { + mlx5_vdpa_warn(mvdev, "setup_steering\n"); + goto err_fwd; + } + ndev->setup = true; + + return 0; + +err_fwd: + destroy_tir(ndev); +err_tir: + destroy_rqt(ndev); +err_rqt: + teardown_virtqueues(ndev); +err_setup: + mlx5_vdpa_remove_debugfs(ndev); +out: + return err; +} + +/* reslock must be held for this function */ +static void teardown_driver(struct mlx5_vdpa_net *ndev) +{ + + WARN_ON(!rwsem_is_locked(&ndev->reslock)); + + if (!ndev->setup) + return; + + mlx5_vdpa_remove_debugfs(ndev); + teardown_steering(ndev); + destroy_tir(ndev); + destroy_rqt(ndev); + teardown_virtqueues(ndev); + ndev->setup = false; +} + +static void clear_vqs_ready(struct mlx5_vdpa_net *ndev) +{ + int i; + + for (i = 0; i < ndev->mvdev.max_vqs; i++) + ndev->vqs[i].ready = false; + + ndev->mvdev.cvq.ready = false; +} + +static int setup_cvq_vring(struct mlx5_vdpa_dev *mvdev) +{ + struct mlx5_control_vq *cvq = &mvdev->cvq; + int err = 0; + + if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) { + u16 idx = cvq->vring.last_avail_idx; + + err = vringh_init_iotlb(&cvq->vring, mvdev->actual_features, + MLX5_CVQ_MAX_ENT, false, + (struct vring_desc *)(uintptr_t)cvq->desc_addr, + (struct vring_avail *)(uintptr_t)cvq->driver_addr, + (struct vring_used *)(uintptr_t)cvq->device_addr); + + if (!err) + cvq->vring.last_avail_idx = cvq->vring.last_used_idx = idx; + } + return err; +} + +static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + int err; + + print_status(mvdev, status, true); + + down_write(&ndev->reslock); + + if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) { + if (status & VIRTIO_CONFIG_S_DRIVER_OK) { + err = setup_cvq_vring(mvdev); + if (err) { + mlx5_vdpa_warn(mvdev, "failed to setup control VQ vring\n"); + goto err_setup; + } + register_link_notifier(ndev); + err = setup_driver(mvdev); + if (err) { + mlx5_vdpa_warn(mvdev, "failed to setup driver\n"); + goto err_driver; + } + } else { + mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n"); + goto err_clear; + } + } + + ndev->mvdev.status = status; + up_write(&ndev->reslock); + return; + +err_driver: + unregister_link_notifier(ndev); +err_setup: + mlx5_vdpa_destroy_mr(&ndev->mvdev); + ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED; +err_clear: + up_write(&ndev->reslock); +} + +static void init_group_to_asid_map(struct mlx5_vdpa_dev *mvdev) +{ + int i; + + /* default mapping all groups are mapped to asid 0 */ + for (i = 0; i < MLX5_VDPA_NUMVQ_GROUPS; i++) + mvdev->group2asid[i] = 0; +} + +static int mlx5_vdpa_reset(struct vdpa_device *vdev) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + + print_status(mvdev, 0, true); + mlx5_vdpa_info(mvdev, "performing device reset\n"); + + down_write(&ndev->reslock); + unregister_link_notifier(ndev); + teardown_driver(ndev); + clear_vqs_ready(ndev); + mlx5_vdpa_destroy_mr(&ndev->mvdev); + ndev->mvdev.status = 0; + ndev->mvdev.suspended = false; + ndev->cur_num_vqs = 0; + ndev->mvdev.cvq.received_desc = 0; + ndev->mvdev.cvq.completed_desc = 0; + memset(ndev->event_cbs, 0, sizeof(*ndev->event_cbs) * (mvdev->max_vqs + 1)); + ndev->mvdev.actual_features = 0; + init_group_to_asid_map(mvdev); + ++mvdev->generation; + + if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { + if (mlx5_vdpa_create_mr(mvdev, NULL, 0)) + mlx5_vdpa_warn(mvdev, "create MR failed\n"); + } + up_write(&ndev->reslock); + + return 0; +} + +static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev) +{ + return sizeof(struct virtio_net_config); +} + +static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf, + unsigned int len) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + + if (offset + len <= sizeof(struct virtio_net_config)) + memcpy(buf, (u8 *)&ndev->config + offset, len); +} + +static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf, + unsigned int len) +{ + /* not supported */ +} + +static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + + return mvdev->generation; +} + +static int set_map_data(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, + unsigned int asid) +{ + bool change_map; + int err; + + err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map, asid); + if (err) { + mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", err); + return err; + } + + if (change_map) + err = mlx5_vdpa_change_map(mvdev, iotlb, asid); + + return err; +} + +static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid, + struct vhost_iotlb *iotlb) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + int err = -EINVAL; + + down_write(&ndev->reslock); + err = set_map_data(mvdev, iotlb, asid); + up_write(&ndev->reslock); + return err; +} + +static struct device *mlx5_get_vq_dma_dev(struct vdpa_device *vdev, u16 idx) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + + if (is_ctrl_vq_idx(mvdev, idx)) + return &vdev->dev; + + return mvdev->vdev.dma_dev; +} + +static void free_irqs(struct mlx5_vdpa_net *ndev) +{ + struct mlx5_vdpa_irq_pool_entry *ent; + int i; + + if (!msix_mode_supported(&ndev->mvdev)) + return; + + if (!ndev->irqp.entries) + return; + + for (i = ndev->irqp.num_ent - 1; i >= 0; i--) { + ent = ndev->irqp.entries + i; + if (ent->map.virq) + pci_msix_free_irq(ndev->mvdev.mdev->pdev, ent->map); + } + kfree(ndev->irqp.entries); +} + +static void mlx5_vdpa_free(struct vdpa_device *vdev) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_core_dev *pfmdev; + struct mlx5_vdpa_net *ndev; + + ndev = to_mlx5_vdpa_ndev(mvdev); + + free_resources(ndev); + mlx5_vdpa_destroy_mr(mvdev); + if (!is_zero_ether_addr(ndev->config.mac)) { + pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev)); + mlx5_mpfs_del_mac(pfmdev, ndev->config.mac); + } + mlx5_vdpa_free_resources(&ndev->mvdev); + free_irqs(ndev); + kfree(ndev->event_cbs); + kfree(ndev->vqs); +} + +static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct vdpa_notification_area ret = {}; + struct mlx5_vdpa_net *ndev; + phys_addr_t addr; + + if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx)) + return ret; + + /* If SF BAR size is smaller than PAGE_SIZE, do not use direct + * notification to avoid the risk of mapping pages that contain BAR of more + * than one SF + */ + if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT) + return ret; + + ndev = to_mlx5_vdpa_ndev(mvdev); + addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr; + ret.addr = addr; + ret.size = PAGE_SIZE; + return ret; +} + +static int mlx5_get_vq_irq(struct vdpa_device *vdev, u16 idx) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + struct mlx5_vdpa_virtqueue *mvq; + + if (!is_index_valid(mvdev, idx)) + return -EINVAL; + + if (is_ctrl_vq_idx(mvdev, idx)) + return -EOPNOTSUPP; + + mvq = &ndev->vqs[idx]; + if (!mvq->map.virq) + return -EOPNOTSUPP; + + return mvq->map.virq; +} + +static u64 mlx5_vdpa_get_driver_features(struct vdpa_device *vdev) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + + return mvdev->actual_features; +} + +static int counter_set_query(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, + u64 *received_desc, u64 *completed_desc) +{ + u32 in[MLX5_ST_SZ_DW(query_virtio_q_counters_in)] = {}; + u32 out[MLX5_ST_SZ_DW(query_virtio_q_counters_out)] = {}; + void *cmd_hdr; + void *ctx; + int err; + + if (!counters_supported(&ndev->mvdev)) + return -EOPNOTSUPP; + + if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) + return -EAGAIN; + + cmd_hdr = MLX5_ADDR_OF(query_virtio_q_counters_in, in, hdr); + + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->counter_set_id); + + err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + ctx = MLX5_ADDR_OF(query_virtio_q_counters_out, out, counters); + *received_desc = MLX5_GET64(virtio_q_counters, ctx, received_desc); + *completed_desc = MLX5_GET64(virtio_q_counters, ctx, completed_desc); + return 0; +} + +static int mlx5_vdpa_get_vendor_vq_stats(struct vdpa_device *vdev, u16 idx, + struct sk_buff *msg, + struct netlink_ext_ack *extack) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + struct mlx5_vdpa_virtqueue *mvq; + struct mlx5_control_vq *cvq; + u64 received_desc; + u64 completed_desc; + int err = 0; + + down_read(&ndev->reslock); + if (!is_index_valid(mvdev, idx)) { + NL_SET_ERR_MSG_MOD(extack, "virtqueue index is not valid"); + err = -EINVAL; + goto out_err; + } + + if (idx == ctrl_vq_idx(mvdev)) { + cvq = &mvdev->cvq; + received_desc = cvq->received_desc; + completed_desc = cvq->completed_desc; + goto out; + } + + mvq = &ndev->vqs[idx]; + err = counter_set_query(ndev, mvq, &received_desc, &completed_desc); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "failed to query hardware"); + goto out_err; + } + +out: + err = -EMSGSIZE; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, "received_desc")) + goto out_err; + + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, received_desc, + VDPA_ATTR_PAD)) + goto out_err; + + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, "completed_desc")) + goto out_err; + + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, completed_desc, + VDPA_ATTR_PAD)) + goto out_err; + + err = 0; +out_err: + up_read(&ndev->reslock); + return err; +} + +static void mlx5_vdpa_cvq_suspend(struct mlx5_vdpa_dev *mvdev) +{ + struct mlx5_control_vq *cvq; + + if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) + return; + + cvq = &mvdev->cvq; + cvq->ready = false; +} + +static int mlx5_vdpa_suspend(struct vdpa_device *vdev) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + struct mlx5_vdpa_virtqueue *mvq; + int i; + + mlx5_vdpa_info(mvdev, "suspending device\n"); + + down_write(&ndev->reslock); + unregister_link_notifier(ndev); + for (i = 0; i < ndev->cur_num_vqs; i++) { + mvq = &ndev->vqs[i]; + suspend_vq(ndev, mvq); + } + mlx5_vdpa_cvq_suspend(mvdev); + mvdev->suspended = true; + up_write(&ndev->reslock); + return 0; +} + +static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group, + unsigned int asid) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + + if (group >= MLX5_VDPA_NUMVQ_GROUPS) + return -EINVAL; + + mvdev->group2asid[group] = asid; + return 0; +} + +static const struct vdpa_config_ops mlx5_vdpa_ops = { + .set_vq_address = mlx5_vdpa_set_vq_address, + .set_vq_num = mlx5_vdpa_set_vq_num, + .kick_vq = mlx5_vdpa_kick_vq, + .set_vq_cb = mlx5_vdpa_set_vq_cb, + .set_vq_ready = mlx5_vdpa_set_vq_ready, + .get_vq_ready = mlx5_vdpa_get_vq_ready, + .set_vq_state = mlx5_vdpa_set_vq_state, + .get_vq_state = mlx5_vdpa_get_vq_state, + .get_vendor_vq_stats = mlx5_vdpa_get_vendor_vq_stats, + .get_vq_notification = mlx5_get_vq_notification, + .get_vq_irq = mlx5_get_vq_irq, + .get_vq_align = mlx5_vdpa_get_vq_align, + .get_vq_group = mlx5_vdpa_get_vq_group, + .get_device_features = mlx5_vdpa_get_device_features, + .set_driver_features = mlx5_vdpa_set_driver_features, + .get_driver_features = mlx5_vdpa_get_driver_features, + .set_config_cb = mlx5_vdpa_set_config_cb, + .get_vq_num_max = mlx5_vdpa_get_vq_num_max, + .get_device_id = mlx5_vdpa_get_device_id, + .get_vendor_id = mlx5_vdpa_get_vendor_id, + .get_status = mlx5_vdpa_get_status, + .set_status = mlx5_vdpa_set_status, + .reset = mlx5_vdpa_reset, + .get_config_size = mlx5_vdpa_get_config_size, + .get_config = mlx5_vdpa_get_config, + .set_config = mlx5_vdpa_set_config, + .get_generation = mlx5_vdpa_get_generation, + .set_map = mlx5_vdpa_set_map, + .set_group_asid = mlx5_set_group_asid, + .get_vq_dma_dev = mlx5_get_vq_dma_dev, + .free = mlx5_vdpa_free, + .suspend = mlx5_vdpa_suspend, +}; + +static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu) +{ + u16 hw_mtu; + int err; + + err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu); + if (err) + return err; + + *mtu = hw_mtu - MLX5V_ETH_HARD_MTU; + return 0; +} + +static int alloc_resources(struct mlx5_vdpa_net *ndev) +{ + struct mlx5_vdpa_net_resources *res = &ndev->res; + int err; + + if (res->valid) { + mlx5_vdpa_warn(&ndev->mvdev, "resources already allocated\n"); + return -EEXIST; + } + + err = mlx5_vdpa_alloc_transport_domain(&ndev->mvdev, &res->tdn); + if (err) + return err; + + err = create_tis(ndev); + if (err) + goto err_tis; + + res->valid = true; + + return 0; + +err_tis: + mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn); + return err; +} + +static void free_resources(struct mlx5_vdpa_net *ndev) +{ + struct mlx5_vdpa_net_resources *res = &ndev->res; + + if (!res->valid) + return; + + destroy_tis(ndev); + mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn); + res->valid = false; +} + +static void init_mvqs(struct mlx5_vdpa_net *ndev) +{ + struct mlx5_vdpa_virtqueue *mvq; + int i; + + for (i = 0; i < ndev->mvdev.max_vqs; ++i) { + mvq = &ndev->vqs[i]; + memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri)); + mvq->index = i; + mvq->ndev = ndev; + mvq->fwqp.fw = true; + mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE; + } + for (; i < ndev->mvdev.max_vqs; i++) { + mvq = &ndev->vqs[i]; + memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri)); + mvq->index = i; + mvq->ndev = ndev; + } +} + +struct mlx5_vdpa_mgmtdev { + struct vdpa_mgmt_dev mgtdev; + struct mlx5_adev *madev; + struct mlx5_vdpa_net *ndev; +}; + +static int config_func_mtu(struct mlx5_core_dev *mdev, u16 mtu) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, field_select.mtu, 1); + MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.mtu, + mtu + MLX5V_ETH_HARD_MTU); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + + err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); + + kvfree(in); + return err; +} + +static void allocate_irqs(struct mlx5_vdpa_net *ndev) +{ + struct mlx5_vdpa_irq_pool_entry *ent; + int i; + + if (!msix_mode_supported(&ndev->mvdev)) + return; + + if (!ndev->mvdev.mdev->pdev) + return; + + ndev->irqp.entries = kcalloc(ndev->mvdev.max_vqs, sizeof(*ndev->irqp.entries), GFP_KERNEL); + if (!ndev->irqp.entries) + return; + + + for (i = 0; i < ndev->mvdev.max_vqs; i++) { + ent = ndev->irqp.entries + i; + snprintf(ent->name, MLX5_VDPA_IRQ_NAME_LEN, "%s-vq-%d", + dev_name(&ndev->mvdev.vdev.dev), i); + ent->map = pci_msix_alloc_irq_at(ndev->mvdev.mdev->pdev, MSI_ANY_INDEX, NULL); + if (!ent->map.virq) + return; + + ndev->irqp.num_ent++; + } +} + +static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, + const struct vdpa_dev_set_config *add_config) +{ + struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev); + struct virtio_net_config *config; + struct mlx5_core_dev *pfmdev; + struct mlx5_vdpa_dev *mvdev; + struct mlx5_vdpa_net *ndev; + struct mlx5_core_dev *mdev; + u64 device_features; + u32 max_vqs; + u16 mtu; + int err; + + if (mgtdev->ndev) + return -ENOSPC; + + mdev = mgtdev->madev->mdev; + device_features = mgtdev->mgtdev.supported_features; + if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) { + if (add_config->device_features & ~device_features) { + dev_warn(mdev->device, + "The provisioned features 0x%llx are not supported by this device with features 0x%llx\n", + add_config->device_features, device_features); + return -EINVAL; + } + device_features &= add_config->device_features; + } else { + device_features &= ~BIT_ULL(VIRTIO_NET_F_MRG_RXBUF); + } + if (!(device_features & BIT_ULL(VIRTIO_F_VERSION_1) && + device_features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM))) { + dev_warn(mdev->device, + "Must provision minimum features 0x%llx for this device", + BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)); + return -EOPNOTSUPP; + } + + if (!(MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_queue_type) & + MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) { + dev_warn(mdev->device, "missing support for split virtqueues\n"); + return -EOPNOTSUPP; + } + + max_vqs = min_t(int, MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues), + 1 << MLX5_CAP_GEN(mdev, log_max_rqt_size)); + if (max_vqs < 2) { + dev_warn(mdev->device, + "%d virtqueues are supported. At least 2 are required\n", + max_vqs); + return -EAGAIN; + } + + if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) { + if (add_config->net.max_vq_pairs > max_vqs / 2) + return -EINVAL; + max_vqs = min_t(u32, max_vqs, 2 * add_config->net.max_vq_pairs); + } else { + max_vqs = 2; + } + + ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops, + MLX5_VDPA_NUMVQ_GROUPS, MLX5_VDPA_NUM_AS, name, false); + if (IS_ERR(ndev)) + return PTR_ERR(ndev); + + ndev->mvdev.max_vqs = max_vqs; + mvdev = &ndev->mvdev; + mvdev->mdev = mdev; + + ndev->vqs = kcalloc(max_vqs, sizeof(*ndev->vqs), GFP_KERNEL); + ndev->event_cbs = kcalloc(max_vqs + 1, sizeof(*ndev->event_cbs), GFP_KERNEL); + if (!ndev->vqs || !ndev->event_cbs) { + err = -ENOMEM; + goto err_alloc; + } + + init_mvqs(ndev); + allocate_irqs(ndev); + init_rwsem(&ndev->reslock); + config = &ndev->config; + + if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) { + err = config_func_mtu(mdev, add_config->net.mtu); + if (err) + goto err_alloc; + } + + if (device_features & BIT_ULL(VIRTIO_NET_F_MTU)) { + err = query_mtu(mdev, &mtu); + if (err) + goto err_alloc; + + ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, mtu); + } + + if (device_features & BIT_ULL(VIRTIO_NET_F_STATUS)) { + if (get_link_state(mvdev)) + ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP); + else + ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP); + } + + if (add_config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR)) { + memcpy(ndev->config.mac, add_config->net.mac, ETH_ALEN); + /* No bother setting mac address in config if not going to provision _F_MAC */ + } else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0 || + device_features & BIT_ULL(VIRTIO_NET_F_MAC)) { + err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac); + if (err) + goto err_alloc; + } + + if (!is_zero_ether_addr(config->mac)) { + pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev)); + err = mlx5_mpfs_add_mac(pfmdev, config->mac); + if (err) + goto err_alloc; + } else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0) { + /* + * We used to clear _F_MAC feature bit if seeing + * zero mac address when device features are not + * specifically provisioned. Keep the behaviour + * so old scripts do not break. + */ + device_features &= ~BIT_ULL(VIRTIO_NET_F_MAC); + } else if (device_features & BIT_ULL(VIRTIO_NET_F_MAC)) { + /* Don't provision zero mac address for _F_MAC */ + mlx5_vdpa_warn(&ndev->mvdev, + "No mac address provisioned?\n"); + err = -EINVAL; + goto err_alloc; + } + + if (device_features & BIT_ULL(VIRTIO_NET_F_MQ)) + config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, max_vqs / 2); + + ndev->mvdev.mlx_features = device_features; + mvdev->vdev.dma_dev = &mdev->pdev->dev; + err = mlx5_vdpa_alloc_resources(&ndev->mvdev); + if (err) + goto err_mpfs; + + if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { + err = mlx5_vdpa_create_mr(mvdev, NULL, 0); + if (err) + goto err_res; + } + + err = alloc_resources(ndev); + if (err) + goto err_mr; + + ndev->cvq_ent.mvdev = mvdev; + INIT_WORK(&ndev->cvq_ent.work, mlx5_cvq_kick_handler); + mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_wq"); + if (!mvdev->wq) { + err = -ENOMEM; + goto err_res2; + } + + mvdev->vdev.mdev = &mgtdev->mgtdev; + err = _vdpa_register_device(&mvdev->vdev, max_vqs + 1); + if (err) + goto err_reg; + + mgtdev->ndev = ndev; + return 0; + +err_reg: + destroy_workqueue(mvdev->wq); +err_res2: + free_resources(ndev); +err_mr: + mlx5_vdpa_destroy_mr(mvdev); +err_res: + mlx5_vdpa_free_resources(&ndev->mvdev); +err_mpfs: + if (!is_zero_ether_addr(config->mac)) + mlx5_mpfs_del_mac(pfmdev, config->mac); +err_alloc: + put_device(&mvdev->vdev.dev); + return err; +} + +static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev) +{ + struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev); + struct mlx5_vdpa_dev *mvdev = to_mvdev(dev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + struct workqueue_struct *wq; + + unregister_link_notifier(ndev); + _vdpa_unregister_device(dev); + wq = mvdev->wq; + mvdev->wq = NULL; + destroy_workqueue(wq); + mgtdev->ndev = NULL; +} + +static const struct vdpa_mgmtdev_ops mdev_ops = { + .dev_add = mlx5_vdpa_dev_add, + .dev_del = mlx5_vdpa_dev_del, +}; + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static int mlx5v_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) + +{ + struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev); + struct mlx5_core_dev *mdev = madev->mdev; + struct mlx5_vdpa_mgmtdev *mgtdev; + int err; + + mgtdev = kzalloc(sizeof(*mgtdev), GFP_KERNEL); + if (!mgtdev) + return -ENOMEM; + + mgtdev->mgtdev.ops = &mdev_ops; + mgtdev->mgtdev.device = mdev->device; + mgtdev->mgtdev.id_table = id_table; + mgtdev->mgtdev.config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) | + BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP) | + BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU) | + BIT_ULL(VDPA_ATTR_DEV_FEATURES); + mgtdev->mgtdev.max_supported_vqs = + MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues) + 1; + mgtdev->mgtdev.supported_features = get_supported_features(mdev); + mgtdev->madev = madev; + + err = vdpa_mgmtdev_register(&mgtdev->mgtdev); + if (err) + goto reg_err; + + auxiliary_set_drvdata(adev, mgtdev); + + return 0; + +reg_err: + kfree(mgtdev); + return err; +} + +static void mlx5v_remove(struct auxiliary_device *adev) +{ + struct mlx5_vdpa_mgmtdev *mgtdev; + + mgtdev = auxiliary_get_drvdata(adev); + vdpa_mgmtdev_unregister(&mgtdev->mgtdev); + kfree(mgtdev); +} + +static const struct auxiliary_device_id mlx5v_id_table[] = { + { .name = MLX5_ADEV_NAME ".vnet", }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary, mlx5v_id_table); + +static struct auxiliary_driver mlx5v_driver = { + .name = "vnet", + .probe = mlx5v_probe, + .remove = mlx5v_remove, + .id_table = mlx5v_id_table, +}; + +module_auxiliary_driver(mlx5v_driver); diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.h b/drivers/vdpa/mlx5/net/mlx5_vnet.h new file mode 100644 index 0000000000..90b556a579 --- /dev/null +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.h @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#ifndef __MLX5_VNET_H__ +#define __MLX5_VNET_H__ + +#include "mlx5_vdpa.h" + +#define to_mlx5_vdpa_ndev(__mvdev) \ + container_of(__mvdev, struct mlx5_vdpa_net, mvdev) +#define to_mvdev(__vdev) container_of((__vdev), struct mlx5_vdpa_dev, vdev) + +struct mlx5_vdpa_net_resources { + u32 tisn; + u32 tdn; + u32 tirn; + u32 rqtn; + bool valid; + struct dentry *tirn_dent; +}; + +#define MLX5V_MACVLAN_SIZE 256 + +static inline u16 key2vid(u64 key) +{ + return (u16)(key >> 48) & 0xfff; +} + +#define MLX5_VDPA_IRQ_NAME_LEN 32 + +struct mlx5_vdpa_irq_pool_entry { + struct msi_map map; + bool used; + char name[MLX5_VDPA_IRQ_NAME_LEN]; + void *dev_id; +}; + +struct mlx5_vdpa_irq_pool { + int num_ent; + struct mlx5_vdpa_irq_pool_entry *entries; +}; + +struct mlx5_vdpa_net { + struct mlx5_vdpa_dev mvdev; + struct mlx5_vdpa_net_resources res; + struct virtio_net_config config; + struct mlx5_vdpa_virtqueue *vqs; + struct vdpa_callback *event_cbs; + + /* Serialize vq resources creation and destruction. This is required + * since memory map might change and we need to destroy and create + * resources while driver in operational. + */ + struct rw_semaphore reslock; + struct mlx5_flow_table *rxft; + struct dentry *rx_dent; + struct dentry *rx_table_dent; + bool setup; + u32 cur_num_vqs; + u32 rqt_size; + bool nb_registered; + struct notifier_block nb; + struct vdpa_callback config_cb; + struct mlx5_vdpa_wq_ent cvq_ent; + struct hlist_head macvlan_hash[MLX5V_MACVLAN_SIZE]; + struct mlx5_vdpa_irq_pool irqp; + struct dentry *debugfs; + + u32 umem_1_buffer_param_a; + u32 umem_1_buffer_param_b; + + u32 umem_2_buffer_param_a; + u32 umem_2_buffer_param_b; + + u32 umem_3_buffer_param_a; + u32 umem_3_buffer_param_b; +}; + +struct mlx5_vdpa_counter { + struct mlx5_fc *counter; + struct dentry *dent; + struct mlx5_core_dev *mdev; +}; + +struct macvlan_node { + struct hlist_node hlist; + struct mlx5_flow_handle *ucast_rule; + struct mlx5_flow_handle *mcast_rule; + u64 macvlan; + struct mlx5_vdpa_net *ndev; + bool tagged; +#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG) + struct dentry *dent; + struct mlx5_vdpa_counter ucast_counter; + struct mlx5_vdpa_counter mcast_counter; +#endif +}; + +void mlx5_vdpa_add_debugfs(struct mlx5_vdpa_net *ndev); +void mlx5_vdpa_remove_debugfs(struct mlx5_vdpa_net *ndev); +void mlx5_vdpa_add_rx_flow_table(struct mlx5_vdpa_net *ndev); +void mlx5_vdpa_remove_rx_flow_table(struct mlx5_vdpa_net *ndev); +void mlx5_vdpa_add_tirn(struct mlx5_vdpa_net *ndev); +void mlx5_vdpa_remove_tirn(struct mlx5_vdpa_net *ndev); +#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG) +void mlx5_vdpa_add_rx_counters(struct mlx5_vdpa_net *ndev, + struct macvlan_node *node); +void mlx5_vdpa_remove_rx_counters(struct mlx5_vdpa_net *ndev, + struct macvlan_node *node); +#else +static inline void mlx5_vdpa_add_rx_counters(struct mlx5_vdpa_net *ndev, + struct macvlan_node *node) {} +static inline void mlx5_vdpa_remove_rx_counters(struct mlx5_vdpa_net *ndev, + struct macvlan_node *node) {} +#endif + + +#endif /* __MLX5_VNET_H__ */ diff --git a/drivers/vdpa/pds/Makefile b/drivers/vdpa/pds/Makefile new file mode 100644 index 0000000000..c2d314d461 --- /dev/null +++ b/drivers/vdpa/pds/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Copyright(c) 2023 Advanced Micro Devices, Inc + +obj-$(CONFIG_PDS_VDPA) := pds_vdpa.o + +pds_vdpa-y := aux_drv.o \ + cmds.o \ + debugfs.o \ + vdpa_dev.o diff --git a/drivers/vdpa/pds/aux_drv.c b/drivers/vdpa/pds/aux_drv.c new file mode 100644 index 0000000000..186e9ee22e --- /dev/null +++ b/drivers/vdpa/pds/aux_drv.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2023 Advanced Micro Devices, Inc */ + +#include <linux/auxiliary_bus.h> +#include <linux/pci.h> +#include <linux/vdpa.h> +#include <linux/virtio_pci_modern.h> + +#include <linux/pds/pds_common.h> +#include <linux/pds/pds_core_if.h> +#include <linux/pds/pds_adminq.h> +#include <linux/pds/pds_auxbus.h> + +#include "aux_drv.h" +#include "debugfs.h" +#include "vdpa_dev.h" + +static const struct auxiliary_device_id pds_vdpa_id_table[] = { + { .name = PDS_VDPA_DEV_NAME, }, + {}, +}; + +static int pds_vdpa_device_id_check(struct pci_dev *pdev) +{ + if (pdev->device != PCI_DEVICE_ID_PENSANDO_VDPA_VF || + pdev->vendor != PCI_VENDOR_ID_PENSANDO) + return -ENODEV; + + return PCI_DEVICE_ID_PENSANDO_VDPA_VF; +} + +static int pds_vdpa_probe(struct auxiliary_device *aux_dev, + const struct auxiliary_device_id *id) + +{ + struct pds_auxiliary_dev *padev = + container_of(aux_dev, struct pds_auxiliary_dev, aux_dev); + struct device *dev = &aux_dev->dev; + struct pds_vdpa_aux *vdpa_aux; + int err; + + vdpa_aux = kzalloc(sizeof(*vdpa_aux), GFP_KERNEL); + if (!vdpa_aux) + return -ENOMEM; + + vdpa_aux->padev = padev; + vdpa_aux->vf_id = pci_iov_vf_id(padev->vf_pdev); + auxiliary_set_drvdata(aux_dev, vdpa_aux); + + /* Get device ident info and set up the vdpa_mgmt_dev */ + err = pds_vdpa_get_mgmt_info(vdpa_aux); + if (err) + goto err_free_mem; + + /* Find the virtio configuration */ + vdpa_aux->vd_mdev.pci_dev = padev->vf_pdev; + vdpa_aux->vd_mdev.device_id_check = pds_vdpa_device_id_check; + vdpa_aux->vd_mdev.dma_mask = DMA_BIT_MASK(PDS_CORE_ADDR_LEN); + err = vp_modern_probe(&vdpa_aux->vd_mdev); + if (err) { + dev_err(dev, "Unable to probe for virtio configuration: %pe\n", + ERR_PTR(err)); + goto err_free_mgmt_info; + } + + /* Let vdpa know that we can provide devices */ + err = vdpa_mgmtdev_register(&vdpa_aux->vdpa_mdev); + if (err) { + dev_err(dev, "%s: Failed to initialize vdpa_mgmt interface: %pe\n", + __func__, ERR_PTR(err)); + goto err_free_virtio; + } + + pds_vdpa_debugfs_add_pcidev(vdpa_aux); + pds_vdpa_debugfs_add_ident(vdpa_aux); + + return 0; + +err_free_virtio: + vp_modern_remove(&vdpa_aux->vd_mdev); +err_free_mgmt_info: + pci_free_irq_vectors(padev->vf_pdev); +err_free_mem: + kfree(vdpa_aux); + auxiliary_set_drvdata(aux_dev, NULL); + + return err; +} + +static void pds_vdpa_remove(struct auxiliary_device *aux_dev) +{ + struct pds_vdpa_aux *vdpa_aux = auxiliary_get_drvdata(aux_dev); + struct device *dev = &aux_dev->dev; + + vdpa_mgmtdev_unregister(&vdpa_aux->vdpa_mdev); + vp_modern_remove(&vdpa_aux->vd_mdev); + pci_free_irq_vectors(vdpa_aux->padev->vf_pdev); + + pds_vdpa_debugfs_del_vdpadev(vdpa_aux); + kfree(vdpa_aux); + auxiliary_set_drvdata(aux_dev, NULL); + + dev_info(dev, "Removed\n"); +} + +static struct auxiliary_driver pds_vdpa_driver = { + .name = PDS_DEV_TYPE_VDPA_STR, + .probe = pds_vdpa_probe, + .remove = pds_vdpa_remove, + .id_table = pds_vdpa_id_table, +}; + +static void __exit pds_vdpa_cleanup(void) +{ + auxiliary_driver_unregister(&pds_vdpa_driver); + + pds_vdpa_debugfs_destroy(); +} +module_exit(pds_vdpa_cleanup); + +static int __init pds_vdpa_init(void) +{ + int err; + + pds_vdpa_debugfs_create(); + + err = auxiliary_driver_register(&pds_vdpa_driver); + if (err) { + pr_err("%s: aux driver register failed: %pe\n", + PDS_VDPA_DRV_NAME, ERR_PTR(err)); + pds_vdpa_debugfs_destroy(); + } + + return err; +} +module_init(pds_vdpa_init); + +MODULE_DESCRIPTION(PDS_VDPA_DRV_DESCRIPTION); +MODULE_AUTHOR("Advanced Micro Devices, Inc"); +MODULE_LICENSE("GPL"); diff --git a/drivers/vdpa/pds/aux_drv.h b/drivers/vdpa/pds/aux_drv.h new file mode 100644 index 0000000000..26b7534415 --- /dev/null +++ b/drivers/vdpa/pds/aux_drv.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc */ + +#ifndef _AUX_DRV_H_ +#define _AUX_DRV_H_ + +#include <linux/virtio_pci_modern.h> + +#define PDS_VDPA_DRV_DESCRIPTION "AMD/Pensando vDPA VF Device Driver" +#define PDS_VDPA_DRV_NAME KBUILD_MODNAME + +struct pds_vdpa_aux { + struct pds_auxiliary_dev *padev; + + struct vdpa_mgmt_dev vdpa_mdev; + struct pds_vdpa_device *pdsv; + + struct pds_vdpa_ident ident; + + int vf_id; + struct dentry *dentry; + struct virtio_pci_modern_device vd_mdev; + + int nintrs; +}; +#endif /* _AUX_DRV_H_ */ diff --git a/drivers/vdpa/pds/cmds.c b/drivers/vdpa/pds/cmds.c new file mode 100644 index 0000000000..80863a41c3 --- /dev/null +++ b/drivers/vdpa/pds/cmds.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2023 Advanced Micro Devices, Inc */ + +#include <linux/vdpa.h> +#include <linux/virtio_pci_modern.h> + +#include <linux/pds/pds_common.h> +#include <linux/pds/pds_core_if.h> +#include <linux/pds/pds_adminq.h> +#include <linux/pds/pds_auxbus.h> + +#include "vdpa_dev.h" +#include "aux_drv.h" +#include "cmds.h" + +int pds_vdpa_init_hw(struct pds_vdpa_device *pdsv) +{ + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; + struct device *dev = &padev->aux_dev.dev; + union pds_core_adminq_cmd cmd = { + .vdpa_init.opcode = PDS_VDPA_CMD_INIT, + .vdpa_init.vdpa_index = pdsv->vdpa_index, + .vdpa_init.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id), + }; + union pds_core_adminq_comp comp = {}; + int err; + + /* Initialize the vdpa/virtio device */ + err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_init), + &comp, 0); + if (err) + dev_dbg(dev, "Failed to init hw, status %d: %pe\n", + comp.status, ERR_PTR(err)); + + return err; +} + +int pds_vdpa_cmd_reset(struct pds_vdpa_device *pdsv) +{ + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; + struct device *dev = &padev->aux_dev.dev; + union pds_core_adminq_cmd cmd = { + .vdpa.opcode = PDS_VDPA_CMD_RESET, + .vdpa.vdpa_index = pdsv->vdpa_index, + .vdpa.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id), + }; + union pds_core_adminq_comp comp = {}; + int err; + + err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa), &comp, 0); + if (err) + dev_dbg(dev, "Failed to reset hw, status %d: %pe\n", + comp.status, ERR_PTR(err)); + + return err; +} + +int pds_vdpa_cmd_set_status(struct pds_vdpa_device *pdsv, u8 status) +{ + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; + struct device *dev = &padev->aux_dev.dev; + union pds_core_adminq_cmd cmd = { + .vdpa_status.opcode = PDS_VDPA_CMD_STATUS_UPDATE, + .vdpa_status.vdpa_index = pdsv->vdpa_index, + .vdpa_status.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id), + .vdpa_status.status = status, + }; + union pds_core_adminq_comp comp = {}; + int err; + + err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_status), &comp, 0); + if (err) + dev_dbg(dev, "Failed to set status to %#x, error status %d: %pe\n", + status, comp.status, ERR_PTR(err)); + + return err; +} + +int pds_vdpa_cmd_set_mac(struct pds_vdpa_device *pdsv, u8 *mac) +{ + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; + struct device *dev = &padev->aux_dev.dev; + union pds_core_adminq_cmd cmd = { + .vdpa_setattr.opcode = PDS_VDPA_CMD_SET_ATTR, + .vdpa_setattr.vdpa_index = pdsv->vdpa_index, + .vdpa_setattr.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id), + .vdpa_setattr.attr = PDS_VDPA_ATTR_MAC, + }; + union pds_core_adminq_comp comp = {}; + int err; + + ether_addr_copy(cmd.vdpa_setattr.mac, mac); + err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_setattr), + &comp, 0); + if (err) + dev_dbg(dev, "Failed to set mac address %pM, status %d: %pe\n", + mac, comp.status, ERR_PTR(err)); + + return err; +} + +int pds_vdpa_cmd_set_max_vq_pairs(struct pds_vdpa_device *pdsv, u16 max_vqp) +{ + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; + struct device *dev = &padev->aux_dev.dev; + union pds_core_adminq_cmd cmd = { + .vdpa_setattr.opcode = PDS_VDPA_CMD_SET_ATTR, + .vdpa_setattr.vdpa_index = pdsv->vdpa_index, + .vdpa_setattr.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id), + .vdpa_setattr.attr = PDS_VDPA_ATTR_MAX_VQ_PAIRS, + .vdpa_setattr.max_vq_pairs = cpu_to_le16(max_vqp), + }; + union pds_core_adminq_comp comp = {}; + int err; + + err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_setattr), + &comp, 0); + if (err) + dev_dbg(dev, "Failed to set max vq pairs %u, status %d: %pe\n", + max_vqp, comp.status, ERR_PTR(err)); + + return err; +} + +int pds_vdpa_cmd_init_vq(struct pds_vdpa_device *pdsv, u16 qid, u16 invert_idx, + struct pds_vdpa_vq_info *vq_info) +{ + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; + struct device *dev = &padev->aux_dev.dev; + union pds_core_adminq_cmd cmd = { + .vdpa_vq_init.opcode = PDS_VDPA_CMD_VQ_INIT, + .vdpa_vq_init.vdpa_index = pdsv->vdpa_index, + .vdpa_vq_init.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id), + .vdpa_vq_init.qid = cpu_to_le16(qid), + .vdpa_vq_init.len = cpu_to_le16(ilog2(vq_info->q_len)), + .vdpa_vq_init.desc_addr = cpu_to_le64(vq_info->desc_addr), + .vdpa_vq_init.avail_addr = cpu_to_le64(vq_info->avail_addr), + .vdpa_vq_init.used_addr = cpu_to_le64(vq_info->used_addr), + .vdpa_vq_init.intr_index = cpu_to_le16(qid), + .vdpa_vq_init.avail_index = cpu_to_le16(vq_info->avail_idx ^ invert_idx), + .vdpa_vq_init.used_index = cpu_to_le16(vq_info->used_idx ^ invert_idx), + }; + union pds_core_adminq_comp comp = {}; + int err; + + dev_dbg(dev, "%s: qid %d len %d desc_addr %#llx avail_addr %#llx used_addr %#llx\n", + __func__, qid, ilog2(vq_info->q_len), + vq_info->desc_addr, vq_info->avail_addr, vq_info->used_addr); + + err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_vq_init), + &comp, 0); + if (err) + dev_dbg(dev, "Failed to init vq %d, status %d: %pe\n", + qid, comp.status, ERR_PTR(err)); + + return err; +} + +int pds_vdpa_cmd_reset_vq(struct pds_vdpa_device *pdsv, u16 qid, u16 invert_idx, + struct pds_vdpa_vq_info *vq_info) +{ + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; + struct device *dev = &padev->aux_dev.dev; + union pds_core_adminq_cmd cmd = { + .vdpa_vq_reset.opcode = PDS_VDPA_CMD_VQ_RESET, + .vdpa_vq_reset.vdpa_index = pdsv->vdpa_index, + .vdpa_vq_reset.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id), + .vdpa_vq_reset.qid = cpu_to_le16(qid), + }; + union pds_core_adminq_comp comp = {}; + int err; + + err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_vq_reset), + &comp, 0); + if (err) { + dev_dbg(dev, "Failed to reset vq %d, status %d: %pe\n", + qid, comp.status, ERR_PTR(err)); + return err; + } + + vq_info->avail_idx = le16_to_cpu(comp.vdpa_vq_reset.avail_index) ^ invert_idx; + vq_info->used_idx = le16_to_cpu(comp.vdpa_vq_reset.used_index) ^ invert_idx; + + return 0; +} diff --git a/drivers/vdpa/pds/cmds.h b/drivers/vdpa/pds/cmds.h new file mode 100644 index 0000000000..e24d85cb8f --- /dev/null +++ b/drivers/vdpa/pds/cmds.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc */ + +#ifndef _VDPA_CMDS_H_ +#define _VDPA_CMDS_H_ + +int pds_vdpa_init_hw(struct pds_vdpa_device *pdsv); + +int pds_vdpa_cmd_reset(struct pds_vdpa_device *pdsv); +int pds_vdpa_cmd_set_status(struct pds_vdpa_device *pdsv, u8 status); +int pds_vdpa_cmd_set_mac(struct pds_vdpa_device *pdsv, u8 *mac); +int pds_vdpa_cmd_set_max_vq_pairs(struct pds_vdpa_device *pdsv, u16 max_vqp); +int pds_vdpa_cmd_init_vq(struct pds_vdpa_device *pdsv, u16 qid, u16 invert_idx, + struct pds_vdpa_vq_info *vq_info); +int pds_vdpa_cmd_reset_vq(struct pds_vdpa_device *pdsv, u16 qid, u16 invert_idx, + struct pds_vdpa_vq_info *vq_info); +int pds_vdpa_cmd_set_features(struct pds_vdpa_device *pdsv, u64 features); +#endif /* _VDPA_CMDS_H_ */ diff --git a/drivers/vdpa/pds/debugfs.c b/drivers/vdpa/pds/debugfs.c new file mode 100644 index 0000000000..c328e694f6 --- /dev/null +++ b/drivers/vdpa/pds/debugfs.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2023 Advanced Micro Devices, Inc */ + +#include <linux/pci.h> +#include <linux/vdpa.h> + +#include <linux/pds/pds_common.h> +#include <linux/pds/pds_core_if.h> +#include <linux/pds/pds_adminq.h> +#include <linux/pds/pds_auxbus.h> + +#include "aux_drv.h" +#include "vdpa_dev.h" +#include "debugfs.h" + +static struct dentry *dbfs_dir; + +void pds_vdpa_debugfs_create(void) +{ + dbfs_dir = debugfs_create_dir(PDS_VDPA_DRV_NAME, NULL); +} + +void pds_vdpa_debugfs_destroy(void) +{ + debugfs_remove_recursive(dbfs_dir); + dbfs_dir = NULL; +} + +#define PRINT_SBIT_NAME(__seq, __f, __name) \ + do { \ + if ((__f) & (__name)) \ + seq_printf(__seq, " %s", &#__name[16]); \ + } while (0) + +static void print_status_bits(struct seq_file *seq, u8 status) +{ + seq_puts(seq, "status:"); + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_ACKNOWLEDGE); + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_DRIVER); + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_DRIVER_OK); + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_FEATURES_OK); + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_NEEDS_RESET); + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_FAILED); + seq_puts(seq, "\n"); +} + +static void print_feature_bits_all(struct seq_file *seq, u64 features) +{ + int i; + + seq_puts(seq, "features:"); + + for (i = 0; i < (sizeof(u64) * 8); i++) { + u64 mask = BIT_ULL(i); + + switch (features & mask) { + case BIT_ULL(VIRTIO_NET_F_CSUM): + seq_puts(seq, " VIRTIO_NET_F_CSUM"); + break; + case BIT_ULL(VIRTIO_NET_F_GUEST_CSUM): + seq_puts(seq, " VIRTIO_NET_F_GUEST_CSUM"); + break; + case BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS): + seq_puts(seq, " VIRTIO_NET_F_CTRL_GUEST_OFFLOADS"); + break; + case BIT_ULL(VIRTIO_NET_F_MTU): + seq_puts(seq, " VIRTIO_NET_F_MTU"); + break; + case BIT_ULL(VIRTIO_NET_F_MAC): + seq_puts(seq, " VIRTIO_NET_F_MAC"); + break; + case BIT_ULL(VIRTIO_NET_F_GUEST_TSO4): + seq_puts(seq, " VIRTIO_NET_F_GUEST_TSO4"); + break; + case BIT_ULL(VIRTIO_NET_F_GUEST_TSO6): + seq_puts(seq, " VIRTIO_NET_F_GUEST_TSO6"); + break; + case BIT_ULL(VIRTIO_NET_F_GUEST_ECN): + seq_puts(seq, " VIRTIO_NET_F_GUEST_ECN"); + break; + case BIT_ULL(VIRTIO_NET_F_GUEST_UFO): + seq_puts(seq, " VIRTIO_NET_F_GUEST_UFO"); + break; + case BIT_ULL(VIRTIO_NET_F_HOST_TSO4): + seq_puts(seq, " VIRTIO_NET_F_HOST_TSO4"); + break; + case BIT_ULL(VIRTIO_NET_F_HOST_TSO6): + seq_puts(seq, " VIRTIO_NET_F_HOST_TSO6"); + break; + case BIT_ULL(VIRTIO_NET_F_HOST_ECN): + seq_puts(seq, " VIRTIO_NET_F_HOST_ECN"); + break; + case BIT_ULL(VIRTIO_NET_F_HOST_UFO): + seq_puts(seq, " VIRTIO_NET_F_HOST_UFO"); + break; + case BIT_ULL(VIRTIO_NET_F_MRG_RXBUF): + seq_puts(seq, " VIRTIO_NET_F_MRG_RXBUF"); + break; + case BIT_ULL(VIRTIO_NET_F_STATUS): + seq_puts(seq, " VIRTIO_NET_F_STATUS"); + break; + case BIT_ULL(VIRTIO_NET_F_CTRL_VQ): + seq_puts(seq, " VIRTIO_NET_F_CTRL_VQ"); + break; + case BIT_ULL(VIRTIO_NET_F_CTRL_RX): + seq_puts(seq, " VIRTIO_NET_F_CTRL_RX"); + break; + case BIT_ULL(VIRTIO_NET_F_CTRL_VLAN): + seq_puts(seq, " VIRTIO_NET_F_CTRL_VLAN"); + break; + case BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA): + seq_puts(seq, " VIRTIO_NET_F_CTRL_RX_EXTRA"); + break; + case BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE): + seq_puts(seq, " VIRTIO_NET_F_GUEST_ANNOUNCE"); + break; + case BIT_ULL(VIRTIO_NET_F_MQ): + seq_puts(seq, " VIRTIO_NET_F_MQ"); + break; + case BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR): + seq_puts(seq, " VIRTIO_NET_F_CTRL_MAC_ADDR"); + break; + case BIT_ULL(VIRTIO_NET_F_HASH_REPORT): + seq_puts(seq, " VIRTIO_NET_F_HASH_REPORT"); + break; + case BIT_ULL(VIRTIO_NET_F_RSS): + seq_puts(seq, " VIRTIO_NET_F_RSS"); + break; + case BIT_ULL(VIRTIO_NET_F_RSC_EXT): + seq_puts(seq, " VIRTIO_NET_F_RSC_EXT"); + break; + case BIT_ULL(VIRTIO_NET_F_STANDBY): + seq_puts(seq, " VIRTIO_NET_F_STANDBY"); + break; + case BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX): + seq_puts(seq, " VIRTIO_NET_F_SPEED_DUPLEX"); + break; + case BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY): + seq_puts(seq, " VIRTIO_F_NOTIFY_ON_EMPTY"); + break; + case BIT_ULL(VIRTIO_F_ANY_LAYOUT): + seq_puts(seq, " VIRTIO_F_ANY_LAYOUT"); + break; + case BIT_ULL(VIRTIO_F_VERSION_1): + seq_puts(seq, " VIRTIO_F_VERSION_1"); + break; + case BIT_ULL(VIRTIO_F_ACCESS_PLATFORM): + seq_puts(seq, " VIRTIO_F_ACCESS_PLATFORM"); + break; + case BIT_ULL(VIRTIO_F_RING_PACKED): + seq_puts(seq, " VIRTIO_F_RING_PACKED"); + break; + case BIT_ULL(VIRTIO_F_ORDER_PLATFORM): + seq_puts(seq, " VIRTIO_F_ORDER_PLATFORM"); + break; + case BIT_ULL(VIRTIO_F_SR_IOV): + seq_puts(seq, " VIRTIO_F_SR_IOV"); + break; + case 0: + break; + default: + seq_printf(seq, " bit_%d", i); + break; + } + } + + seq_puts(seq, "\n"); +} + +void pds_vdpa_debugfs_add_pcidev(struct pds_vdpa_aux *vdpa_aux) +{ + vdpa_aux->dentry = debugfs_create_dir(pci_name(vdpa_aux->padev->vf_pdev), dbfs_dir); +} + +static int identity_show(struct seq_file *seq, void *v) +{ + struct pds_vdpa_aux *vdpa_aux = seq->private; + struct vdpa_mgmt_dev *mgmt; + u64 hw_features; + + seq_printf(seq, "aux_dev: %s\n", + dev_name(&vdpa_aux->padev->aux_dev.dev)); + + mgmt = &vdpa_aux->vdpa_mdev; + seq_printf(seq, "max_vqs: %d\n", mgmt->max_supported_vqs); + seq_printf(seq, "config_attr_mask: %#llx\n", mgmt->config_attr_mask); + hw_features = le64_to_cpu(vdpa_aux->ident.hw_features); + seq_printf(seq, "hw_features: %#llx\n", hw_features); + print_feature_bits_all(seq, hw_features); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(identity); + +void pds_vdpa_debugfs_add_ident(struct pds_vdpa_aux *vdpa_aux) +{ + debugfs_create_file("identity", 0400, vdpa_aux->dentry, + vdpa_aux, &identity_fops); +} + +static int config_show(struct seq_file *seq, void *v) +{ + struct pds_vdpa_device *pdsv = seq->private; + struct virtio_net_config vc; + u8 status; + + memcpy_fromio(&vc, pdsv->vdpa_aux->vd_mdev.device, + sizeof(struct virtio_net_config)); + + seq_printf(seq, "mac: %pM\n", vc.mac); + seq_printf(seq, "max_virtqueue_pairs: %d\n", + __virtio16_to_cpu(true, vc.max_virtqueue_pairs)); + seq_printf(seq, "mtu: %d\n", __virtio16_to_cpu(true, vc.mtu)); + seq_printf(seq, "speed: %d\n", le32_to_cpu(vc.speed)); + seq_printf(seq, "duplex: %d\n", vc.duplex); + seq_printf(seq, "rss_max_key_size: %d\n", vc.rss_max_key_size); + seq_printf(seq, "rss_max_indirection_table_length: %d\n", + le16_to_cpu(vc.rss_max_indirection_table_length)); + seq_printf(seq, "supported_hash_types: %#x\n", + le32_to_cpu(vc.supported_hash_types)); + seq_printf(seq, "vn_status: %#x\n", + __virtio16_to_cpu(true, vc.status)); + + status = vp_modern_get_status(&pdsv->vdpa_aux->vd_mdev); + seq_printf(seq, "dev_status: %#x\n", status); + print_status_bits(seq, status); + seq_printf(seq, "negotiated_features: %#llx\n", pdsv->negotiated_features); + print_feature_bits_all(seq, pdsv->negotiated_features); + seq_printf(seq, "vdpa_index: %d\n", pdsv->vdpa_index); + seq_printf(seq, "num_vqs: %d\n", pdsv->num_vqs); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(config); + +static int vq_show(struct seq_file *seq, void *v) +{ + struct pds_vdpa_vq_info *vq = seq->private; + + seq_printf(seq, "ready: %d\n", vq->ready); + seq_printf(seq, "desc_addr: %#llx\n", vq->desc_addr); + seq_printf(seq, "avail_addr: %#llx\n", vq->avail_addr); + seq_printf(seq, "used_addr: %#llx\n", vq->used_addr); + seq_printf(seq, "q_len: %d\n", vq->q_len); + seq_printf(seq, "qid: %d\n", vq->qid); + + seq_printf(seq, "doorbell: %#llx\n", vq->doorbell); + seq_printf(seq, "avail_idx: %d\n", vq->avail_idx); + seq_printf(seq, "used_idx: %d\n", vq->used_idx); + seq_printf(seq, "irq: %d\n", vq->irq); + seq_printf(seq, "irq-name: %s\n", vq->irq_name); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(vq); + +void pds_vdpa_debugfs_add_vdpadev(struct pds_vdpa_aux *vdpa_aux) +{ + int i; + + debugfs_create_file("config", 0400, vdpa_aux->dentry, vdpa_aux->pdsv, &config_fops); + + for (i = 0; i < vdpa_aux->pdsv->num_vqs; i++) { + char name[16]; + + snprintf(name, sizeof(name), "vq%02d", i); + debugfs_create_file(name, 0400, vdpa_aux->dentry, + &vdpa_aux->pdsv->vqs[i], &vq_fops); + } +} + +void pds_vdpa_debugfs_del_vdpadev(struct pds_vdpa_aux *vdpa_aux) +{ + debugfs_remove_recursive(vdpa_aux->dentry); + vdpa_aux->dentry = NULL; +} + +void pds_vdpa_debugfs_reset_vdpadev(struct pds_vdpa_aux *vdpa_aux) +{ + /* we don't keep track of the entries, so remove it all + * then rebuild the basics + */ + pds_vdpa_debugfs_del_vdpadev(vdpa_aux); + pds_vdpa_debugfs_add_pcidev(vdpa_aux); + pds_vdpa_debugfs_add_ident(vdpa_aux); +} diff --git a/drivers/vdpa/pds/debugfs.h b/drivers/vdpa/pds/debugfs.h new file mode 100644 index 0000000000..c088a4e8f1 --- /dev/null +++ b/drivers/vdpa/pds/debugfs.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc */ + +#ifndef _PDS_VDPA_DEBUGFS_H_ +#define _PDS_VDPA_DEBUGFS_H_ + +#include <linux/debugfs.h> + +void pds_vdpa_debugfs_create(void); +void pds_vdpa_debugfs_destroy(void); +void pds_vdpa_debugfs_add_pcidev(struct pds_vdpa_aux *vdpa_aux); +void pds_vdpa_debugfs_add_ident(struct pds_vdpa_aux *vdpa_aux); +void pds_vdpa_debugfs_add_vdpadev(struct pds_vdpa_aux *vdpa_aux); +void pds_vdpa_debugfs_del_vdpadev(struct pds_vdpa_aux *vdpa_aux); +void pds_vdpa_debugfs_reset_vdpadev(struct pds_vdpa_aux *vdpa_aux); + +#endif /* _PDS_VDPA_DEBUGFS_H_ */ diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c new file mode 100644 index 0000000000..25c0fe5ec3 --- /dev/null +++ b/drivers/vdpa/pds/vdpa_dev.c @@ -0,0 +1,844 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2023 Advanced Micro Devices, Inc */ + +#include <linux/pci.h> +#include <linux/vdpa.h> +#include <uapi/linux/vdpa.h> +#include <linux/virtio_pci_modern.h> + +#include <linux/pds/pds_common.h> +#include <linux/pds/pds_core_if.h> +#include <linux/pds/pds_adminq.h> +#include <linux/pds/pds_auxbus.h> + +#include "vdpa_dev.h" +#include "aux_drv.h" +#include "cmds.h" +#include "debugfs.h" + +static u64 pds_vdpa_get_driver_features(struct vdpa_device *vdpa_dev); + +static struct pds_vdpa_device *vdpa_to_pdsv(struct vdpa_device *vdpa_dev) +{ + return container_of(vdpa_dev, struct pds_vdpa_device, vdpa_dev); +} + +static int pds_vdpa_notify_handler(struct notifier_block *nb, + unsigned long ecode, + void *data) +{ + struct pds_vdpa_device *pdsv = container_of(nb, struct pds_vdpa_device, nb); + struct device *dev = &pdsv->vdpa_aux->padev->aux_dev.dev; + + dev_dbg(dev, "%s: event code %lu\n", __func__, ecode); + + if (ecode == PDS_EVENT_RESET || ecode == PDS_EVENT_LINK_CHANGE) { + if (pdsv->config_cb.callback) + pdsv->config_cb.callback(pdsv->config_cb.private); + } + + return 0; +} + +static int pds_vdpa_register_event_handler(struct pds_vdpa_device *pdsv) +{ + struct device *dev = &pdsv->vdpa_aux->padev->aux_dev.dev; + struct notifier_block *nb = &pdsv->nb; + int err; + + if (!nb->notifier_call) { + nb->notifier_call = pds_vdpa_notify_handler; + err = pdsc_register_notify(nb); + if (err) { + nb->notifier_call = NULL; + dev_err(dev, "failed to register pds event handler: %ps\n", + ERR_PTR(err)); + return -EINVAL; + } + dev_dbg(dev, "pds event handler registered\n"); + } + + return 0; +} + +static void pds_vdpa_unregister_event_handler(struct pds_vdpa_device *pdsv) +{ + if (pdsv->nb.notifier_call) { + pdsc_unregister_notify(&pdsv->nb); + pdsv->nb.notifier_call = NULL; + } +} + +static int pds_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid, + u64 desc_addr, u64 driver_addr, u64 device_addr) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + pdsv->vqs[qid].desc_addr = desc_addr; + pdsv->vqs[qid].avail_addr = driver_addr; + pdsv->vqs[qid].used_addr = device_addr; + + return 0; +} + +static void pds_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16 qid, u32 num) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + pdsv->vqs[qid].q_len = num; +} + +static void pds_vdpa_kick_vq(struct vdpa_device *vdpa_dev, u16 qid) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + iowrite16(qid, pdsv->vqs[qid].notify); +} + +static void pds_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid, + struct vdpa_callback *cb) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + pdsv->vqs[qid].event_cb = *cb; +} + +static irqreturn_t pds_vdpa_isr(int irq, void *data) +{ + struct pds_vdpa_vq_info *vq; + + vq = data; + if (vq->event_cb.callback) + vq->event_cb.callback(vq->event_cb.private); + + return IRQ_HANDLED; +} + +static void pds_vdpa_release_irq(struct pds_vdpa_device *pdsv, int qid) +{ + if (pdsv->vqs[qid].irq == VIRTIO_MSI_NO_VECTOR) + return; + + free_irq(pdsv->vqs[qid].irq, &pdsv->vqs[qid]); + pdsv->vqs[qid].irq = VIRTIO_MSI_NO_VECTOR; +} + +static void pds_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16 qid, bool ready) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + struct device *dev = &pdsv->vdpa_dev.dev; + u64 driver_features; + u16 invert_idx = 0; + int err; + + dev_dbg(dev, "%s: qid %d ready %d => %d\n", + __func__, qid, pdsv->vqs[qid].ready, ready); + if (ready == pdsv->vqs[qid].ready) + return; + + driver_features = pds_vdpa_get_driver_features(vdpa_dev); + if (driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) + invert_idx = PDS_VDPA_PACKED_INVERT_IDX; + + if (ready) { + /* Pass vq setup info to DSC using adminq to gather up and + * send all info at once so FW can do its full set up in + * one easy operation + */ + err = pds_vdpa_cmd_init_vq(pdsv, qid, invert_idx, &pdsv->vqs[qid]); + if (err) { + dev_err(dev, "Failed to init vq %d: %pe\n", + qid, ERR_PTR(err)); + ready = false; + } + } else { + err = pds_vdpa_cmd_reset_vq(pdsv, qid, invert_idx, &pdsv->vqs[qid]); + if (err) + dev_err(dev, "%s: reset_vq failed qid %d: %pe\n", + __func__, qid, ERR_PTR(err)); + } + + pdsv->vqs[qid].ready = ready; +} + +static bool pds_vdpa_get_vq_ready(struct vdpa_device *vdpa_dev, u16 qid) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + return pdsv->vqs[qid].ready; +} + +static int pds_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid, + const struct vdpa_vq_state *state) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; + struct device *dev = &padev->aux_dev.dev; + u64 driver_features; + u16 avail; + u16 used; + + if (pdsv->vqs[qid].ready) { + dev_err(dev, "Setting device position is denied while vq is enabled\n"); + return -EINVAL; + } + + driver_features = pds_vdpa_get_driver_features(vdpa_dev); + if (driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { + avail = state->packed.last_avail_idx | + (state->packed.last_avail_counter << 15); + used = state->packed.last_used_idx | + (state->packed.last_used_counter << 15); + + /* The avail and used index are stored with the packed wrap + * counter bit inverted. This way, in case set_vq_state is + * not called, the initial value can be set to zero prior to + * feature negotiation, and it is good for both packed and + * split vq. + */ + avail ^= PDS_VDPA_PACKED_INVERT_IDX; + used ^= PDS_VDPA_PACKED_INVERT_IDX; + } else { + avail = state->split.avail_index; + /* state->split does not provide a used_index: + * the vq will be set to "empty" here, and the vq will read + * the current used index the next time the vq is kicked. + */ + used = avail; + } + + if (used != avail) { + dev_dbg(dev, "Setting used equal to avail, for interoperability\n"); + used = avail; + } + + pdsv->vqs[qid].avail_idx = avail; + pdsv->vqs[qid].used_idx = used; + + return 0; +} + +static int pds_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid, + struct vdpa_vq_state *state) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; + struct device *dev = &padev->aux_dev.dev; + u64 driver_features; + u16 avail; + u16 used; + + if (pdsv->vqs[qid].ready) { + dev_err(dev, "Getting device position is denied while vq is enabled\n"); + return -EINVAL; + } + + avail = pdsv->vqs[qid].avail_idx; + used = pdsv->vqs[qid].used_idx; + + driver_features = pds_vdpa_get_driver_features(vdpa_dev); + if (driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { + avail ^= PDS_VDPA_PACKED_INVERT_IDX; + used ^= PDS_VDPA_PACKED_INVERT_IDX; + + state->packed.last_avail_idx = avail & 0x7fff; + state->packed.last_avail_counter = avail >> 15; + state->packed.last_used_idx = used & 0x7fff; + state->packed.last_used_counter = used >> 15; + } else { + state->split.avail_index = avail; + /* state->split does not provide a used_index. */ + } + + return 0; +} + +static struct vdpa_notification_area +pds_vdpa_get_vq_notification(struct vdpa_device *vdpa_dev, u16 qid) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + struct virtio_pci_modern_device *vd_mdev; + struct vdpa_notification_area area; + + area.addr = pdsv->vqs[qid].notify_pa; + + vd_mdev = &pdsv->vdpa_aux->vd_mdev; + if (!vd_mdev->notify_offset_multiplier) + area.size = PDS_PAGE_SIZE; + else + area.size = vd_mdev->notify_offset_multiplier; + + return area; +} + +static int pds_vdpa_get_vq_irq(struct vdpa_device *vdpa_dev, u16 qid) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + return pdsv->vqs[qid].irq; +} + +static u32 pds_vdpa_get_vq_align(struct vdpa_device *vdpa_dev) +{ + return PDS_PAGE_SIZE; +} + +static u32 pds_vdpa_get_vq_group(struct vdpa_device *vdpa_dev, u16 idx) +{ + return 0; +} + +static u64 pds_vdpa_get_device_features(struct vdpa_device *vdpa_dev) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + return pdsv->supported_features; +} + +static int pds_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 features) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + struct device *dev = &pdsv->vdpa_dev.dev; + u64 driver_features; + u64 nego_features; + u64 hw_features; + u64 missing; + + if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)) && features) { + dev_err(dev, "VIRTIO_F_ACCESS_PLATFORM is not negotiated\n"); + return -EOPNOTSUPP; + } + + /* Check for valid feature bits */ + nego_features = features & pdsv->supported_features; + missing = features & ~nego_features; + if (missing) { + dev_err(dev, "Can't support all requested features in %#llx, missing %#llx features\n", + features, missing); + return -EOPNOTSUPP; + } + + driver_features = pds_vdpa_get_driver_features(vdpa_dev); + pdsv->negotiated_features = nego_features; + dev_dbg(dev, "%s: %#llx => %#llx\n", + __func__, driver_features, nego_features); + + /* if we're faking the F_MAC, strip it before writing to device */ + hw_features = le64_to_cpu(pdsv->vdpa_aux->ident.hw_features); + if (!(hw_features & BIT_ULL(VIRTIO_NET_F_MAC))) + nego_features &= ~BIT_ULL(VIRTIO_NET_F_MAC); + + if (driver_features == nego_features) + return 0; + + vp_modern_set_features(&pdsv->vdpa_aux->vd_mdev, nego_features); + + return 0; +} + +static u64 pds_vdpa_get_driver_features(struct vdpa_device *vdpa_dev) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + return pdsv->negotiated_features; +} + +static void pds_vdpa_set_config_cb(struct vdpa_device *vdpa_dev, + struct vdpa_callback *cb) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + pdsv->config_cb.callback = cb->callback; + pdsv->config_cb.private = cb->private; +} + +static u16 pds_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + /* qemu has assert() that vq_num_max <= VIRTQUEUE_MAX_SIZE (1024) */ + return min_t(u16, 1024, BIT(le16_to_cpu(pdsv->vdpa_aux->ident.max_qlen))); +} + +static u32 pds_vdpa_get_device_id(struct vdpa_device *vdpa_dev) +{ + return VIRTIO_ID_NET; +} + +static u32 pds_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev) +{ + return PCI_VENDOR_ID_PENSANDO; +} + +static u8 pds_vdpa_get_status(struct vdpa_device *vdpa_dev) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + return vp_modern_get_status(&pdsv->vdpa_aux->vd_mdev); +} + +static int pds_vdpa_request_irqs(struct pds_vdpa_device *pdsv) +{ + struct pci_dev *pdev = pdsv->vdpa_aux->padev->vf_pdev; + struct pds_vdpa_aux *vdpa_aux = pdsv->vdpa_aux; + struct device *dev = &pdsv->vdpa_dev.dev; + int max_vq, nintrs, qid, err; + + max_vq = vdpa_aux->vdpa_mdev.max_supported_vqs; + + nintrs = pci_alloc_irq_vectors(pdev, max_vq, max_vq, PCI_IRQ_MSIX); + if (nintrs < 0) { + dev_err(dev, "Couldn't get %d msix vectors: %pe\n", + max_vq, ERR_PTR(nintrs)); + return nintrs; + } + + for (qid = 0; qid < pdsv->num_vqs; ++qid) { + int irq = pci_irq_vector(pdev, qid); + + snprintf(pdsv->vqs[qid].irq_name, sizeof(pdsv->vqs[qid].irq_name), + "vdpa-%s-%d", dev_name(dev), qid); + + err = request_irq(irq, pds_vdpa_isr, 0, + pdsv->vqs[qid].irq_name, + &pdsv->vqs[qid]); + if (err) { + dev_err(dev, "%s: no irq for qid %d: %pe\n", + __func__, qid, ERR_PTR(err)); + goto err_release; + } + + pdsv->vqs[qid].irq = irq; + } + + vdpa_aux->nintrs = nintrs; + + return 0; + +err_release: + while (qid--) + pds_vdpa_release_irq(pdsv, qid); + + pci_free_irq_vectors(pdev); + + vdpa_aux->nintrs = 0; + + return err; +} + +static void pds_vdpa_release_irqs(struct pds_vdpa_device *pdsv) +{ + struct pci_dev *pdev = pdsv->vdpa_aux->padev->vf_pdev; + struct pds_vdpa_aux *vdpa_aux = pdsv->vdpa_aux; + int qid; + + if (!vdpa_aux->nintrs) + return; + + for (qid = 0; qid < pdsv->num_vqs; qid++) + pds_vdpa_release_irq(pdsv, qid); + + pci_free_irq_vectors(pdev); + + vdpa_aux->nintrs = 0; +} + +static void pds_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + struct device *dev = &pdsv->vdpa_dev.dev; + u8 old_status; + int i; + + old_status = pds_vdpa_get_status(vdpa_dev); + dev_dbg(dev, "%s: old %#x new %#x\n", __func__, old_status, status); + + if (status & ~old_status & VIRTIO_CONFIG_S_DRIVER_OK) { + if (pds_vdpa_request_irqs(pdsv)) + status = old_status | VIRTIO_CONFIG_S_FAILED; + } + + pds_vdpa_cmd_set_status(pdsv, status); + + if (status == 0) { + struct vdpa_callback null_cb = { }; + + pds_vdpa_set_config_cb(vdpa_dev, &null_cb); + pds_vdpa_cmd_reset(pdsv); + + for (i = 0; i < pdsv->num_vqs; i++) { + pdsv->vqs[i].avail_idx = 0; + pdsv->vqs[i].used_idx = 0; + } + + pds_vdpa_cmd_set_mac(pdsv, pdsv->mac); + } + + if (status & ~old_status & VIRTIO_CONFIG_S_FEATURES_OK) { + for (i = 0; i < pdsv->num_vqs; i++) { + pdsv->vqs[i].notify = + vp_modern_map_vq_notify(&pdsv->vdpa_aux->vd_mdev, + i, &pdsv->vqs[i].notify_pa); + } + } + + if (old_status & ~status & VIRTIO_CONFIG_S_DRIVER_OK) + pds_vdpa_release_irqs(pdsv); +} + +static void pds_vdpa_init_vqs_entry(struct pds_vdpa_device *pdsv, int qid, + void __iomem *notify) +{ + memset(&pdsv->vqs[qid], 0, sizeof(pdsv->vqs[0])); + pdsv->vqs[qid].qid = qid; + pdsv->vqs[qid].pdsv = pdsv; + pdsv->vqs[qid].ready = false; + pdsv->vqs[qid].irq = VIRTIO_MSI_NO_VECTOR; + pdsv->vqs[qid].notify = notify; +} + +static int pds_vdpa_reset(struct vdpa_device *vdpa_dev) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + struct device *dev; + int err = 0; + u8 status; + int i; + + dev = &pdsv->vdpa_aux->padev->aux_dev.dev; + status = pds_vdpa_get_status(vdpa_dev); + + if (status == 0) + return 0; + + if (status & VIRTIO_CONFIG_S_DRIVER_OK) { + /* Reset the vqs */ + for (i = 0; i < pdsv->num_vqs && !err; i++) { + err = pds_vdpa_cmd_reset_vq(pdsv, i, 0, &pdsv->vqs[i]); + if (err) + dev_err(dev, "%s: reset_vq failed qid %d: %pe\n", + __func__, i, ERR_PTR(err)); + } + } + + pds_vdpa_set_status(vdpa_dev, 0); + + if (status & VIRTIO_CONFIG_S_DRIVER_OK) { + /* Reset the vq info */ + for (i = 0; i < pdsv->num_vqs && !err; i++) + pds_vdpa_init_vqs_entry(pdsv, i, pdsv->vqs[i].notify); + } + + return 0; +} + +static size_t pds_vdpa_get_config_size(struct vdpa_device *vdpa_dev) +{ + return sizeof(struct virtio_net_config); +} + +static void pds_vdpa_get_config(struct vdpa_device *vdpa_dev, + unsigned int offset, + void *buf, unsigned int len) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + void __iomem *device; + + if (offset + len > sizeof(struct virtio_net_config)) { + WARN(true, "%s: bad read, offset %d len %d\n", __func__, offset, len); + return; + } + + device = pdsv->vdpa_aux->vd_mdev.device; + memcpy_fromio(buf, device + offset, len); +} + +static void pds_vdpa_set_config(struct vdpa_device *vdpa_dev, + unsigned int offset, const void *buf, + unsigned int len) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + void __iomem *device; + + if (offset + len > sizeof(struct virtio_net_config)) { + WARN(true, "%s: bad read, offset %d len %d\n", __func__, offset, len); + return; + } + + device = pdsv->vdpa_aux->vd_mdev.device; + memcpy_toio(device + offset, buf, len); +} + +static const struct vdpa_config_ops pds_vdpa_ops = { + .set_vq_address = pds_vdpa_set_vq_address, + .set_vq_num = pds_vdpa_set_vq_num, + .kick_vq = pds_vdpa_kick_vq, + .set_vq_cb = pds_vdpa_set_vq_cb, + .set_vq_ready = pds_vdpa_set_vq_ready, + .get_vq_ready = pds_vdpa_get_vq_ready, + .set_vq_state = pds_vdpa_set_vq_state, + .get_vq_state = pds_vdpa_get_vq_state, + .get_vq_notification = pds_vdpa_get_vq_notification, + .get_vq_irq = pds_vdpa_get_vq_irq, + .get_vq_align = pds_vdpa_get_vq_align, + .get_vq_group = pds_vdpa_get_vq_group, + + .get_device_features = pds_vdpa_get_device_features, + .set_driver_features = pds_vdpa_set_driver_features, + .get_driver_features = pds_vdpa_get_driver_features, + .set_config_cb = pds_vdpa_set_config_cb, + .get_vq_num_max = pds_vdpa_get_vq_num_max, + .get_device_id = pds_vdpa_get_device_id, + .get_vendor_id = pds_vdpa_get_vendor_id, + .get_status = pds_vdpa_get_status, + .set_status = pds_vdpa_set_status, + .reset = pds_vdpa_reset, + .get_config_size = pds_vdpa_get_config_size, + .get_config = pds_vdpa_get_config, + .set_config = pds_vdpa_set_config, +}; +static struct virtio_device_id pds_vdpa_id_table[] = { + {VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID}, + {0}, +}; + +static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, + const struct vdpa_dev_set_config *add_config) +{ + struct pds_vdpa_aux *vdpa_aux; + struct pds_vdpa_device *pdsv; + struct vdpa_mgmt_dev *mgmt; + u16 fw_max_vqs, vq_pairs; + struct device *dma_dev; + struct pci_dev *pdev; + struct device *dev; + int err; + int i; + + vdpa_aux = container_of(mdev, struct pds_vdpa_aux, vdpa_mdev); + dev = &vdpa_aux->padev->aux_dev.dev; + mgmt = &vdpa_aux->vdpa_mdev; + + if (vdpa_aux->pdsv) { + dev_warn(dev, "Multiple vDPA devices on a VF is not supported.\n"); + return -EOPNOTSUPP; + } + + pdsv = vdpa_alloc_device(struct pds_vdpa_device, vdpa_dev, + dev, &pds_vdpa_ops, 1, 1, name, false); + if (IS_ERR(pdsv)) { + dev_err(dev, "Failed to allocate vDPA structure: %pe\n", pdsv); + return PTR_ERR(pdsv); + } + + vdpa_aux->pdsv = pdsv; + pdsv->vdpa_aux = vdpa_aux; + + pdev = vdpa_aux->padev->vf_pdev; + dma_dev = &pdev->dev; + pdsv->vdpa_dev.dma_dev = dma_dev; + + pdsv->supported_features = mgmt->supported_features; + + if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) { + u64 unsupp_features = + add_config->device_features & ~pdsv->supported_features; + + if (unsupp_features) { + dev_err(dev, "Unsupported features: %#llx\n", unsupp_features); + err = -EOPNOTSUPP; + goto err_unmap; + } + + pdsv->supported_features = add_config->device_features; + } + + err = pds_vdpa_cmd_reset(pdsv); + if (err) { + dev_err(dev, "Failed to reset hw: %pe\n", ERR_PTR(err)); + goto err_unmap; + } + + err = pds_vdpa_init_hw(pdsv); + if (err) { + dev_err(dev, "Failed to init hw: %pe\n", ERR_PTR(err)); + goto err_unmap; + } + + fw_max_vqs = le16_to_cpu(pdsv->vdpa_aux->ident.max_vqs); + vq_pairs = fw_max_vqs / 2; + + /* Make sure we have the queues being requested */ + if (add_config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) + vq_pairs = add_config->net.max_vq_pairs; + + pdsv->num_vqs = 2 * vq_pairs; + if (pdsv->supported_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) + pdsv->num_vqs++; + + if (pdsv->num_vqs > fw_max_vqs) { + dev_err(dev, "%s: queue count requested %u greater than max %u\n", + __func__, pdsv->num_vqs, fw_max_vqs); + err = -ENOSPC; + goto err_unmap; + } + + if (pdsv->num_vqs != fw_max_vqs) { + err = pds_vdpa_cmd_set_max_vq_pairs(pdsv, vq_pairs); + if (err) { + dev_err(dev, "Failed to set max_vq_pairs: %pe\n", + ERR_PTR(err)); + goto err_unmap; + } + } + + /* Set a mac, either from the user config if provided + * or use the device's mac if not 00:..:00 + * or set a random mac + */ + if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR)) { + ether_addr_copy(pdsv->mac, add_config->net.mac); + } else { + struct virtio_net_config __iomem *vc; + + vc = pdsv->vdpa_aux->vd_mdev.device; + memcpy_fromio(pdsv->mac, vc->mac, sizeof(pdsv->mac)); + if (is_zero_ether_addr(pdsv->mac) && + (pdsv->supported_features & BIT_ULL(VIRTIO_NET_F_MAC))) { + eth_random_addr(pdsv->mac); + dev_info(dev, "setting random mac %pM\n", pdsv->mac); + } + } + pds_vdpa_cmd_set_mac(pdsv, pdsv->mac); + + for (i = 0; i < pdsv->num_vqs; i++) { + void __iomem *notify; + + notify = vp_modern_map_vq_notify(&pdsv->vdpa_aux->vd_mdev, + i, &pdsv->vqs[i].notify_pa); + pds_vdpa_init_vqs_entry(pdsv, i, notify); + } + + pdsv->vdpa_dev.mdev = &vdpa_aux->vdpa_mdev; + + err = pds_vdpa_register_event_handler(pdsv); + if (err) { + dev_err(dev, "Failed to register for PDS events: %pe\n", ERR_PTR(err)); + goto err_unmap; + } + + /* We use the _vdpa_register_device() call rather than the + * vdpa_register_device() to avoid a deadlock because our + * dev_add() is called with the vdpa_dev_lock already set + * by vdpa_nl_cmd_dev_add_set_doit() + */ + err = _vdpa_register_device(&pdsv->vdpa_dev, pdsv->num_vqs); + if (err) { + dev_err(dev, "Failed to register to vDPA bus: %pe\n", ERR_PTR(err)); + goto err_unevent; + } + + pds_vdpa_debugfs_add_vdpadev(vdpa_aux); + + return 0; + +err_unevent: + pds_vdpa_unregister_event_handler(pdsv); +err_unmap: + put_device(&pdsv->vdpa_dev.dev); + vdpa_aux->pdsv = NULL; + return err; +} + +static void pds_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, + struct vdpa_device *vdpa_dev) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + struct pds_vdpa_aux *vdpa_aux; + + pds_vdpa_unregister_event_handler(pdsv); + + vdpa_aux = container_of(mdev, struct pds_vdpa_aux, vdpa_mdev); + _vdpa_unregister_device(vdpa_dev); + + pds_vdpa_cmd_reset(vdpa_aux->pdsv); + pds_vdpa_debugfs_reset_vdpadev(vdpa_aux); + + vdpa_aux->pdsv = NULL; + + dev_info(&vdpa_aux->padev->aux_dev.dev, "Removed vdpa device\n"); +} + +static const struct vdpa_mgmtdev_ops pds_vdpa_mgmt_dev_ops = { + .dev_add = pds_vdpa_dev_add, + .dev_del = pds_vdpa_dev_del +}; + +int pds_vdpa_get_mgmt_info(struct pds_vdpa_aux *vdpa_aux) +{ + union pds_core_adminq_cmd cmd = { + .vdpa_ident.opcode = PDS_VDPA_CMD_IDENT, + .vdpa_ident.vf_id = cpu_to_le16(vdpa_aux->vf_id), + }; + union pds_core_adminq_comp comp = {}; + struct vdpa_mgmt_dev *mgmt; + struct pci_dev *pf_pdev; + struct device *pf_dev; + struct pci_dev *pdev; + dma_addr_t ident_pa; + struct device *dev; + u16 dev_intrs; + u16 max_vqs; + int err; + + dev = &vdpa_aux->padev->aux_dev.dev; + pdev = vdpa_aux->padev->vf_pdev; + mgmt = &vdpa_aux->vdpa_mdev; + + /* Get resource info through the PF's adminq. It is a block of info, + * so we need to map some memory for PF to make available to the + * firmware for writing the data. + */ + pf_pdev = pci_physfn(vdpa_aux->padev->vf_pdev); + pf_dev = &pf_pdev->dev; + ident_pa = dma_map_single(pf_dev, &vdpa_aux->ident, + sizeof(vdpa_aux->ident), DMA_FROM_DEVICE); + if (dma_mapping_error(pf_dev, ident_pa)) { + dev_err(dev, "Failed to map ident space\n"); + return -ENOMEM; + } + + cmd.vdpa_ident.ident_pa = cpu_to_le64(ident_pa); + cmd.vdpa_ident.len = cpu_to_le32(sizeof(vdpa_aux->ident)); + err = pds_client_adminq_cmd(vdpa_aux->padev, &cmd, + sizeof(cmd.vdpa_ident), &comp, 0); + dma_unmap_single(pf_dev, ident_pa, + sizeof(vdpa_aux->ident), DMA_FROM_DEVICE); + if (err) { + dev_err(dev, "Failed to ident hw, status %d: %pe\n", + comp.status, ERR_PTR(err)); + return err; + } + + max_vqs = le16_to_cpu(vdpa_aux->ident.max_vqs); + dev_intrs = pci_msix_vec_count(pdev); + dev_dbg(dev, "ident.max_vqs %d dev_intrs %d\n", max_vqs, dev_intrs); + + max_vqs = min_t(u16, dev_intrs, max_vqs); + mgmt->max_supported_vqs = min_t(u16, PDS_VDPA_MAX_QUEUES, max_vqs); + vdpa_aux->nintrs = 0; + + mgmt->ops = &pds_vdpa_mgmt_dev_ops; + mgmt->id_table = pds_vdpa_id_table; + mgmt->device = dev; + mgmt->supported_features = le64_to_cpu(vdpa_aux->ident.hw_features); + + /* advertise F_MAC even if the device doesn't */ + mgmt->supported_features |= BIT_ULL(VIRTIO_NET_F_MAC); + + mgmt->config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR); + mgmt->config_attr_mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP); + mgmt->config_attr_mask |= BIT_ULL(VDPA_ATTR_DEV_FEATURES); + + return 0; +} diff --git a/drivers/vdpa/pds/vdpa_dev.h b/drivers/vdpa/pds/vdpa_dev.h new file mode 100644 index 0000000000..d984ba24a7 --- /dev/null +++ b/drivers/vdpa/pds/vdpa_dev.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc */ + +#ifndef _VDPA_DEV_H_ +#define _VDPA_DEV_H_ + +#include <linux/pci.h> +#include <linux/vdpa.h> + +struct pds_vdpa_vq_info { + bool ready; + u64 desc_addr; + u64 avail_addr; + u64 used_addr; + u32 q_len; + u16 qid; + int irq; + char irq_name[32]; + + void __iomem *notify; + dma_addr_t notify_pa; + + u64 doorbell; + u16 avail_idx; + u16 used_idx; + + struct vdpa_callback event_cb; + struct pds_vdpa_device *pdsv; +}; + +#define PDS_VDPA_MAX_QUEUES 65 +#define PDS_VDPA_MAX_QLEN 32768 +struct pds_vdpa_device { + struct vdpa_device vdpa_dev; + struct pds_vdpa_aux *vdpa_aux; + + struct pds_vdpa_vq_info vqs[PDS_VDPA_MAX_QUEUES]; + u64 supported_features; /* supported device features */ + u64 negotiated_features; /* negotiated features */ + u8 vdpa_index; /* rsvd for future subdevice use */ + u8 num_vqs; /* num vqs in use */ + u8 mac[ETH_ALEN]; /* mac selected when the device was added */ + struct vdpa_callback config_cb; + struct notifier_block nb; +}; + +#define PDS_VDPA_PACKED_INVERT_IDX 0x8000 + +int pds_vdpa_get_mgmt_info(struct pds_vdpa_aux *vdpa_aux); +#endif /* _VDPA_DEV_H_ */ diff --git a/drivers/vdpa/solidrun/Makefile b/drivers/vdpa/solidrun/Makefile new file mode 100644 index 0000000000..9116252cd5 --- /dev/null +++ b/drivers/vdpa/solidrun/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_SNET_VDPA) += snet_vdpa.o +snet_vdpa-$(CONFIG_SNET_VDPA) += snet_main.o +snet_vdpa-$(CONFIG_SNET_VDPA) += snet_ctrl.o +ifdef CONFIG_HWMON +snet_vdpa-$(CONFIG_SNET_VDPA) += snet_hwmon.o +endif diff --git a/drivers/vdpa/solidrun/snet_ctrl.c b/drivers/vdpa/solidrun/snet_ctrl.c new file mode 100644 index 0000000000..3cef2571d1 --- /dev/null +++ b/drivers/vdpa/solidrun/snet_ctrl.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * SolidRun DPU driver for control plane + * + * Copyright (C) 2022-2023 SolidRun + * + * Author: Alvaro Karsz <alvaro.karsz@solid-run.com> + * + */ + +#include <linux/iopoll.h> + +#include "snet_vdpa.h" + +enum snet_ctrl_opcodes { + SNET_CTRL_OP_DESTROY = 1, + SNET_CTRL_OP_READ_VQ_STATE, + SNET_CTRL_OP_SUSPEND, + SNET_CTRL_OP_RESUME, +}; + +#define SNET_CTRL_TIMEOUT 2000000 + +#define SNET_CTRL_DATA_SIZE_MASK 0x0000FFFF +#define SNET_CTRL_IN_PROCESS_MASK 0x00010000 +#define SNET_CTRL_CHUNK_RDY_MASK 0x00020000 +#define SNET_CTRL_ERROR_MASK 0x0FFC0000 + +#define SNET_VAL_TO_ERR(val) (-(((val) & SNET_CTRL_ERROR_MASK) >> 18)) +#define SNET_EMPTY_CTRL(val) (((val) & SNET_CTRL_ERROR_MASK) || \ + !((val) & SNET_CTRL_IN_PROCESS_MASK)) +#define SNET_DATA_READY(val) ((val) & (SNET_CTRL_ERROR_MASK | SNET_CTRL_CHUNK_RDY_MASK)) + +/* Control register used to read data from the DPU */ +struct snet_ctrl_reg_ctrl { + /* Chunk size in 4B words */ + u16 data_size; + /* We are in the middle of a command */ + u16 in_process:1; + /* A data chunk is ready and can be consumed */ + u16 chunk_ready:1; + /* Error code */ + u16 error:10; + /* Saved for future usage */ + u16 rsvd:4; +}; + +/* Opcode register */ +struct snet_ctrl_reg_op { + u16 opcode; + /* Only if VQ index is relevant for the command */ + u16 vq_idx; +}; + +struct snet_ctrl_regs { + struct snet_ctrl_reg_op op; + struct snet_ctrl_reg_ctrl ctrl; + u32 rsvd; + u32 data[]; +}; + +static struct snet_ctrl_regs __iomem *snet_get_ctrl(struct snet *snet) +{ + return snet->bar + snet->psnet->cfg.ctrl_off; +} + +static int snet_wait_for_empty_ctrl(struct snet_ctrl_regs __iomem *regs) +{ + u32 val; + + return readx_poll_timeout(ioread32, ®s->ctrl, val, SNET_EMPTY_CTRL(val), 10, + SNET_CTRL_TIMEOUT); +} + +static int snet_wait_for_empty_op(struct snet_ctrl_regs __iomem *regs) +{ + u32 val; + + return readx_poll_timeout(ioread32, ®s->op, val, !val, 10, SNET_CTRL_TIMEOUT); +} + +static int snet_wait_for_data(struct snet_ctrl_regs __iomem *regs) +{ + u32 val; + + return readx_poll_timeout(ioread32, ®s->ctrl, val, SNET_DATA_READY(val), 10, + SNET_CTRL_TIMEOUT); +} + +static u32 snet_read32_word(struct snet_ctrl_regs __iomem *ctrl_regs, u16 word_idx) +{ + return ioread32(&ctrl_regs->data[word_idx]); +} + +static u32 snet_read_ctrl(struct snet_ctrl_regs __iomem *ctrl_regs) +{ + return ioread32(&ctrl_regs->ctrl); +} + +static void snet_write_ctrl(struct snet_ctrl_regs __iomem *ctrl_regs, u32 val) +{ + iowrite32(val, &ctrl_regs->ctrl); +} + +static void snet_write_op(struct snet_ctrl_regs __iomem *ctrl_regs, u32 val) +{ + iowrite32(val, &ctrl_regs->op); +} + +static int snet_wait_for_dpu_completion(struct snet_ctrl_regs __iomem *ctrl_regs) +{ + /* Wait until the DPU finishes completely. + * It will clear the opcode register. + */ + return snet_wait_for_empty_op(ctrl_regs); +} + +/* Reading ctrl from the DPU: + * buf_size must be 4B aligned + * + * Steps: + * + * (1) Verify that the DPU is not in the middle of another operation by + * reading the in_process and error bits in the control register. + * (2) Write the request opcode and the VQ idx in the opcode register + * and write the buffer size in the control register. + * (3) Start readind chunks of data, chunk_ready bit indicates that a + * data chunk is available, we signal that we read the data by clearing the bit. + * (4) Detect that the transfer is completed when the in_process bit + * in the control register is cleared or when the an error appears. + */ +static int snet_ctrl_read_from_dpu(struct snet *snet, u16 opcode, u16 vq_idx, void *buffer, + u32 buf_size) +{ + struct pci_dev *pdev = snet->pdev; + struct snet_ctrl_regs __iomem *regs = snet_get_ctrl(snet); + u32 *bfr_ptr = (u32 *)buffer; + u32 val; + u16 buf_words; + int ret; + u16 words, i, tot_words = 0; + + /* Supported for config 2+ */ + if (!SNET_CFG_VER(snet, 2)) + return -EOPNOTSUPP; + + if (!IS_ALIGNED(buf_size, 4)) + return -EINVAL; + + mutex_lock(&snet->ctrl_lock); + + buf_words = buf_size / 4; + + /* Make sure control register is empty */ + ret = snet_wait_for_empty_ctrl(regs); + if (ret) { + SNET_WARN(pdev, "Timeout waiting for previous control data to be consumed\n"); + goto exit; + } + + /* We need to write the buffer size in the control register, and the opcode + vq index in + * the opcode register. + * We use a spinlock to serialize the writes. + */ + spin_lock(&snet->ctrl_spinlock); + + snet_write_ctrl(regs, buf_words); + snet_write_op(regs, opcode | (vq_idx << 16)); + + spin_unlock(&snet->ctrl_spinlock); + + while (buf_words != tot_words) { + ret = snet_wait_for_data(regs); + if (ret) { + SNET_WARN(pdev, "Timeout waiting for control data\n"); + goto exit; + } + + val = snet_read_ctrl(regs); + + /* Error? */ + if (val & SNET_CTRL_ERROR_MASK) { + ret = SNET_VAL_TO_ERR(val); + SNET_WARN(pdev, "Error while reading control data from DPU, err %d\n", ret); + goto exit; + } + + words = min_t(u16, val & SNET_CTRL_DATA_SIZE_MASK, buf_words - tot_words); + + for (i = 0; i < words; i++) { + *bfr_ptr = snet_read32_word(regs, i); + bfr_ptr++; + } + + tot_words += words; + + /* Is the job completed? */ + if (!(val & SNET_CTRL_IN_PROCESS_MASK)) + break; + + /* Clear the chunk ready bit and continue */ + val &= ~SNET_CTRL_CHUNK_RDY_MASK; + snet_write_ctrl(regs, val); + } + + ret = snet_wait_for_dpu_completion(regs); + if (ret) + SNET_WARN(pdev, "Timeout waiting for the DPU to complete a control command\n"); + +exit: + mutex_unlock(&snet->ctrl_lock); + return ret; +} + +/* Send a control message to the DPU using the old mechanism + * used with config version 1. + */ +static int snet_send_ctrl_msg_old(struct snet *snet, u32 opcode) +{ + struct pci_dev *pdev = snet->pdev; + struct snet_ctrl_regs __iomem *regs = snet_get_ctrl(snet); + int ret; + + mutex_lock(&snet->ctrl_lock); + + /* Old mechanism uses just 1 register, the opcode register. + * Make sure that the opcode register is empty, and that the DPU isn't + * processing an old message. + */ + ret = snet_wait_for_empty_op(regs); + if (ret) { + SNET_WARN(pdev, "Timeout waiting for previous control message to be ACKed\n"); + goto exit; + } + + /* Write the message */ + snet_write_op(regs, opcode); + + /* DPU ACKs the message by clearing the opcode register */ + ret = snet_wait_for_empty_op(regs); + if (ret) + SNET_WARN(pdev, "Timeout waiting for a control message to be ACKed\n"); + +exit: + mutex_unlock(&snet->ctrl_lock); + return ret; +} + +/* Send a control message to the DPU. + * A control message is a message without payload. + */ +static int snet_send_ctrl_msg(struct snet *snet, u16 opcode, u16 vq_idx) +{ + struct pci_dev *pdev = snet->pdev; + struct snet_ctrl_regs __iomem *regs = snet_get_ctrl(snet); + u32 val; + int ret; + + /* If config version is not 2+, use the old mechanism */ + if (!SNET_CFG_VER(snet, 2)) + return snet_send_ctrl_msg_old(snet, opcode); + + mutex_lock(&snet->ctrl_lock); + + /* Make sure control register is empty */ + ret = snet_wait_for_empty_ctrl(regs); + if (ret) { + SNET_WARN(pdev, "Timeout waiting for previous control data to be consumed\n"); + goto exit; + } + + /* We need to clear the control register and write the opcode + vq index in the opcode + * register. + * We use a spinlock to serialize the writes. + */ + spin_lock(&snet->ctrl_spinlock); + + snet_write_ctrl(regs, 0); + snet_write_op(regs, opcode | (vq_idx << 16)); + + spin_unlock(&snet->ctrl_spinlock); + + /* The DPU ACKs control messages by setting the chunk ready bit + * without data. + */ + ret = snet_wait_for_data(regs); + if (ret) { + SNET_WARN(pdev, "Timeout waiting for control message to be ACKed\n"); + goto exit; + } + + /* Check for errors */ + val = snet_read_ctrl(regs); + ret = SNET_VAL_TO_ERR(val); + + /* Clear the chunk ready bit */ + val &= ~SNET_CTRL_CHUNK_RDY_MASK; + snet_write_ctrl(regs, val); + + ret = snet_wait_for_dpu_completion(regs); + if (ret) + SNET_WARN(pdev, "Timeout waiting for DPU to complete a control command, err %d\n", + ret); + +exit: + mutex_unlock(&snet->ctrl_lock); + return ret; +} + +void snet_ctrl_clear(struct snet *snet) +{ + struct snet_ctrl_regs __iomem *regs = snet_get_ctrl(snet); + + snet_write_op(regs, 0); +} + +int snet_destroy_dev(struct snet *snet) +{ + return snet_send_ctrl_msg(snet, SNET_CTRL_OP_DESTROY, 0); +} + +int snet_read_vq_state(struct snet *snet, u16 idx, struct vdpa_vq_state *state) +{ + return snet_ctrl_read_from_dpu(snet, SNET_CTRL_OP_READ_VQ_STATE, idx, state, + sizeof(*state)); +} + +int snet_suspend_dev(struct snet *snet) +{ + return snet_send_ctrl_msg(snet, SNET_CTRL_OP_SUSPEND, 0); +} + +int snet_resume_dev(struct snet *snet) +{ + return snet_send_ctrl_msg(snet, SNET_CTRL_OP_RESUME, 0); +} diff --git a/drivers/vdpa/solidrun/snet_hwmon.c b/drivers/vdpa/solidrun/snet_hwmon.c new file mode 100644 index 0000000000..af531a3390 --- /dev/null +++ b/drivers/vdpa/solidrun/snet_hwmon.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * SolidRun DPU driver for control plane + * + * Copyright (C) 2022-2023 SolidRun + * + * Author: Alvaro Karsz <alvaro.karsz@solid-run.com> + * + */ +#include <linux/hwmon.h> + +#include "snet_vdpa.h" + +/* Monitor offsets */ +#define SNET_MON_TMP0_IN_OFF 0x00 +#define SNET_MON_TMP0_MAX_OFF 0x08 +#define SNET_MON_TMP0_CRIT_OFF 0x10 +#define SNET_MON_TMP1_IN_OFF 0x18 +#define SNET_MON_TMP1_CRIT_OFF 0x20 +#define SNET_MON_CURR_IN_OFF 0x28 +#define SNET_MON_CURR_MAX_OFF 0x30 +#define SNET_MON_CURR_CRIT_OFF 0x38 +#define SNET_MON_PWR_IN_OFF 0x40 +#define SNET_MON_VOLT_IN_OFF 0x48 +#define SNET_MON_VOLT_CRIT_OFF 0x50 +#define SNET_MON_VOLT_LCRIT_OFF 0x58 + +static void snet_hwmon_read_reg(struct psnet *psnet, u32 reg, long *out) +{ + *out = psnet_read64(psnet, psnet->cfg.hwmon_off + reg); +} + +static umode_t snet_howmon_is_visible(const void *data, + enum hwmon_sensor_types type, + u32 attr, int channel) +{ + return 0444; +} + +static int snet_howmon_read(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, long *val) +{ + struct psnet *psnet = dev_get_drvdata(dev); + int ret = 0; + + switch (type) { + case hwmon_in: + switch (attr) { + case hwmon_in_lcrit: + snet_hwmon_read_reg(psnet, SNET_MON_VOLT_LCRIT_OFF, val); + break; + case hwmon_in_crit: + snet_hwmon_read_reg(psnet, SNET_MON_VOLT_CRIT_OFF, val); + break; + case hwmon_in_input: + snet_hwmon_read_reg(psnet, SNET_MON_VOLT_IN_OFF, val); + break; + default: + ret = -EOPNOTSUPP; + break; + } + break; + + case hwmon_power: + switch (attr) { + case hwmon_power_input: + snet_hwmon_read_reg(psnet, SNET_MON_PWR_IN_OFF, val); + break; + + default: + ret = -EOPNOTSUPP; + break; + } + break; + + case hwmon_curr: + switch (attr) { + case hwmon_curr_input: + snet_hwmon_read_reg(psnet, SNET_MON_CURR_IN_OFF, val); + break; + case hwmon_curr_max: + snet_hwmon_read_reg(psnet, SNET_MON_CURR_MAX_OFF, val); + break; + case hwmon_curr_crit: + snet_hwmon_read_reg(psnet, SNET_MON_CURR_CRIT_OFF, val); + break; + default: + ret = -EOPNOTSUPP; + break; + } + break; + + case hwmon_temp: + switch (attr) { + case hwmon_temp_input: + if (channel == 0) + snet_hwmon_read_reg(psnet, SNET_MON_TMP0_IN_OFF, val); + else + snet_hwmon_read_reg(psnet, SNET_MON_TMP1_IN_OFF, val); + break; + case hwmon_temp_max: + if (channel == 0) + snet_hwmon_read_reg(psnet, SNET_MON_TMP0_MAX_OFF, val); + else + ret = -EOPNOTSUPP; + break; + case hwmon_temp_crit: + if (channel == 0) + snet_hwmon_read_reg(psnet, SNET_MON_TMP0_CRIT_OFF, val); + else + snet_hwmon_read_reg(psnet, SNET_MON_TMP1_CRIT_OFF, val); + break; + + default: + ret = -EOPNOTSUPP; + break; + } + break; + + default: + ret = -EOPNOTSUPP; + break; + } + return ret; +} + +static int snet_hwmon_read_string(struct device *dev, + enum hwmon_sensor_types type, u32 attr, + int channel, const char **str) +{ + int ret = 0; + + switch (type) { + case hwmon_in: + *str = "main_vin"; + break; + case hwmon_power: + *str = "soc_pin"; + break; + case hwmon_curr: + *str = "soc_iin"; + break; + case hwmon_temp: + if (channel == 0) + *str = "power_stage_temp"; + else + *str = "ic_junction_temp"; + break; + default: + ret = -EOPNOTSUPP; + break; + } + return ret; +} + +static const struct hwmon_ops snet_hwmon_ops = { + .is_visible = snet_howmon_is_visible, + .read = snet_howmon_read, + .read_string = snet_hwmon_read_string +}; + +static const struct hwmon_channel_info * const snet_hwmon_info[] = { + HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_CRIT | HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_CRIT | HWMON_T_LABEL), + HWMON_CHANNEL_INFO(power, HWMON_P_INPUT | HWMON_P_LABEL), + HWMON_CHANNEL_INFO(curr, HWMON_C_INPUT | HWMON_C_MAX | HWMON_C_CRIT | HWMON_C_LABEL), + HWMON_CHANNEL_INFO(in, HWMON_I_INPUT | HWMON_I_CRIT | HWMON_I_LCRIT | HWMON_I_LABEL), + NULL +}; + +static const struct hwmon_chip_info snet_hwmono_info = { + .ops = &snet_hwmon_ops, + .info = snet_hwmon_info, +}; + +/* Create an HW monitor device */ +void psnet_create_hwmon(struct pci_dev *pdev) +{ + struct device *hwmon; + struct psnet *psnet = pci_get_drvdata(pdev); + + snprintf(psnet->hwmon_name, SNET_NAME_SIZE, "snet_%s", pci_name(pdev)); + hwmon = devm_hwmon_device_register_with_info(&pdev->dev, psnet->hwmon_name, psnet, + &snet_hwmono_info, NULL); + /* The monitor is not mandatory, Just alert user in case of an error */ + if (IS_ERR(hwmon)) + SNET_WARN(pdev, "Failed to create SNET hwmon, error %ld\n", PTR_ERR(hwmon)); +} diff --git a/drivers/vdpa/solidrun/snet_main.c b/drivers/vdpa/solidrun/snet_main.c new file mode 100644 index 0000000000..99428a0406 --- /dev/null +++ b/drivers/vdpa/solidrun/snet_main.c @@ -0,0 +1,1132 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * SolidRun DPU driver for control plane + * + * Copyright (C) 2022-2023 SolidRun + * + * Author: Alvaro Karsz <alvaro.karsz@solid-run.com> + * + */ +#include <linux/iopoll.h> + +#include "snet_vdpa.h" + +/* SNET DPU device ID */ +#define SNET_DEVICE_ID 0x1000 +/* SNET signature */ +#define SNET_SIGNATURE 0xD0D06363 +/* Max. config version that we can work with */ +#define SNET_CFG_VERSION 0x2 +/* Queue align */ +#define SNET_QUEUE_ALIGNMENT PAGE_SIZE +/* Kick value to notify that new data is available */ +#define SNET_KICK_VAL 0x1 +#define SNET_CONFIG_OFF 0x0 +/* How long we are willing to wait for a SNET device */ +#define SNET_DETECT_TIMEOUT 5000000 +/* How long should we wait for the DPU to read our config */ +#define SNET_READ_CFG_TIMEOUT 3000000 +/* Size of configs written to the DPU */ +#define SNET_GENERAL_CFG_LEN 36 +#define SNET_GENERAL_CFG_VQ_LEN 40 + +static struct snet *vdpa_to_snet(struct vdpa_device *vdpa) +{ + return container_of(vdpa, struct snet, vdpa); +} + +static irqreturn_t snet_cfg_irq_hndlr(int irq, void *data) +{ + struct snet *snet = data; + /* Call callback if any */ + if (likely(snet->cb.callback)) + return snet->cb.callback(snet->cb.private); + + return IRQ_HANDLED; +} + +static irqreturn_t snet_vq_irq_hndlr(int irq, void *data) +{ + struct snet_vq *vq = data; + /* Call callback if any */ + if (likely(vq->cb.callback)) + return vq->cb.callback(vq->cb.private); + + return IRQ_HANDLED; +} + +static void snet_free_irqs(struct snet *snet) +{ + struct psnet *psnet = snet->psnet; + struct pci_dev *pdev; + u32 i; + + /* Which Device allcoated the IRQs? */ + if (PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_IRQ_PF)) + pdev = snet->pdev->physfn; + else + pdev = snet->pdev; + + /* Free config's IRQ */ + if (snet->cfg_irq != -1) { + devm_free_irq(&pdev->dev, snet->cfg_irq, snet); + snet->cfg_irq = -1; + } + /* Free VQ IRQs */ + for (i = 0; i < snet->cfg->vq_num; i++) { + if (snet->vqs[i] && snet->vqs[i]->irq != -1) { + devm_free_irq(&pdev->dev, snet->vqs[i]->irq, snet->vqs[i]); + snet->vqs[i]->irq = -1; + } + } + + /* IRQ vectors are freed when the pci remove callback is called */ +} + +static int snet_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area, + u64 driver_area, u64 device_area) +{ + struct snet *snet = vdpa_to_snet(vdev); + /* save received parameters in vqueue sturct */ + snet->vqs[idx]->desc_area = desc_area; + snet->vqs[idx]->driver_area = driver_area; + snet->vqs[idx]->device_area = device_area; + + return 0; +} + +static void snet_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num) +{ + struct snet *snet = vdpa_to_snet(vdev); + /* save num in vqueue */ + snet->vqs[idx]->num = num; +} + +static void snet_kick_vq(struct vdpa_device *vdev, u16 idx) +{ + struct snet *snet = vdpa_to_snet(vdev); + /* not ready - ignore */ + if (unlikely(!snet->vqs[idx]->ready)) + return; + + iowrite32(SNET_KICK_VAL, snet->vqs[idx]->kick_ptr); +} + +static void snet_kick_vq_with_data(struct vdpa_device *vdev, u32 data) +{ + struct snet *snet = vdpa_to_snet(vdev); + u16 idx = data & 0xFFFF; + + /* not ready - ignore */ + if (unlikely(!snet->vqs[idx]->ready)) + return; + + iowrite32((data & 0xFFFF0000) | SNET_KICK_VAL, snet->vqs[idx]->kick_ptr); +} + +static void snet_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb) +{ + struct snet *snet = vdpa_to_snet(vdev); + + snet->vqs[idx]->cb.callback = cb->callback; + snet->vqs[idx]->cb.private = cb->private; +} + +static void snet_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready) +{ + struct snet *snet = vdpa_to_snet(vdev); + + snet->vqs[idx]->ready = ready; +} + +static bool snet_get_vq_ready(struct vdpa_device *vdev, u16 idx) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return snet->vqs[idx]->ready; +} + +static bool snet_vq_state_is_initial(struct snet *snet, const struct vdpa_vq_state *state) +{ + if (SNET_HAS_FEATURE(snet, VIRTIO_F_RING_PACKED)) { + const struct vdpa_vq_state_packed *p = &state->packed; + + if (p->last_avail_counter == 1 && p->last_used_counter == 1 && + p->last_avail_idx == 0 && p->last_used_idx == 0) + return true; + } else { + const struct vdpa_vq_state_split *s = &state->split; + + if (s->avail_index == 0) + return true; + } + + return false; +} + +static int snet_set_vq_state(struct vdpa_device *vdev, u16 idx, const struct vdpa_vq_state *state) +{ + struct snet *snet = vdpa_to_snet(vdev); + + /* We can set any state for config version 2+ */ + if (SNET_CFG_VER(snet, 2)) { + memcpy(&snet->vqs[idx]->vq_state, state, sizeof(*state)); + return 0; + } + + /* Older config - we can't set the VQ state. + * Return 0 only if this is the initial state we use in the DPU. + */ + if (snet_vq_state_is_initial(snet, state)) + return 0; + + return -EOPNOTSUPP; +} + +static int snet_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return snet_read_vq_state(snet, idx, state); +} + +static int snet_get_vq_irq(struct vdpa_device *vdev, u16 idx) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return snet->vqs[idx]->irq; +} + +static u32 snet_get_vq_align(struct vdpa_device *vdev) +{ + return (u32)SNET_QUEUE_ALIGNMENT; +} + +static int snet_reset_dev(struct snet *snet) +{ + struct pci_dev *pdev = snet->pdev; + int ret = 0; + u32 i; + + /* If status is 0, nothing to do */ + if (!snet->status) + return 0; + + /* If DPU started, destroy it */ + if (snet->status & VIRTIO_CONFIG_S_DRIVER_OK) + ret = snet_destroy_dev(snet); + + /* Clear VQs */ + for (i = 0; i < snet->cfg->vq_num; i++) { + if (!snet->vqs[i]) + continue; + snet->vqs[i]->cb.callback = NULL; + snet->vqs[i]->cb.private = NULL; + snet->vqs[i]->desc_area = 0; + snet->vqs[i]->device_area = 0; + snet->vqs[i]->driver_area = 0; + snet->vqs[i]->ready = false; + } + + /* Clear config callback */ + snet->cb.callback = NULL; + snet->cb.private = NULL; + /* Free IRQs */ + snet_free_irqs(snet); + /* Reset status */ + snet->status = 0; + snet->dpu_ready = false; + + if (ret) + SNET_WARN(pdev, "Incomplete reset to SNET[%u] device, err: %d\n", snet->sid, ret); + else + SNET_DBG(pdev, "Reset SNET[%u] device\n", snet->sid); + + return 0; +} + +static int snet_reset(struct vdpa_device *vdev) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return snet_reset_dev(snet); +} + +static size_t snet_get_config_size(struct vdpa_device *vdev) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return (size_t)snet->cfg->cfg_size; +} + +static u64 snet_get_features(struct vdpa_device *vdev) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return snet->cfg->features; +} + +static int snet_set_drv_features(struct vdpa_device *vdev, u64 features) +{ + struct snet *snet = vdpa_to_snet(vdev); + + snet->negotiated_features = snet->cfg->features & features; + return 0; +} + +static u64 snet_get_drv_features(struct vdpa_device *vdev) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return snet->negotiated_features; +} + +static u16 snet_get_vq_num_max(struct vdpa_device *vdev) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return (u16)snet->cfg->vq_size; +} + +static void snet_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb) +{ + struct snet *snet = vdpa_to_snet(vdev); + + snet->cb.callback = cb->callback; + snet->cb.private = cb->private; +} + +static u32 snet_get_device_id(struct vdpa_device *vdev) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return snet->cfg->virtio_id; +} + +static u32 snet_get_vendor_id(struct vdpa_device *vdev) +{ + return (u32)PCI_VENDOR_ID_SOLIDRUN; +} + +static u8 snet_get_status(struct vdpa_device *vdev) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return snet->status; +} + +static int snet_write_conf(struct snet *snet) +{ + u32 off, i, tmp; + int ret; + + /* No need to write the config twice */ + if (snet->dpu_ready) + return true; + + /* Snet data : + * + * General data: SNET_GENERAL_CFG_LEN bytes long + * 0 0x4 0x8 0xC 0x10 0x14 0x1C 0x24 + * | MAGIC NUMBER | CFG VER | SNET SID | NUMBER OF QUEUES | IRQ IDX | FEATURES | RSVD | + * + * For every VQ: SNET_GENERAL_CFG_VQ_LEN bytes long + * 0 0x4 0x8 + * | VQ SID AND QUEUE SIZE | IRQ Index | + * | DESC AREA | + * | DEVICE AREA | + * | DRIVER AREA | + * | VQ STATE (CFG 2+) | RSVD | + * + * Magic number should be written last, this is the DPU indication that the data is ready + */ + + /* Init offset */ + off = snet->psnet->cfg.host_cfg_off; + + /* Ignore magic number for now */ + off += 4; + snet_write32(snet, off, snet->psnet->negotiated_cfg_ver); + off += 4; + snet_write32(snet, off, snet->sid); + off += 4; + snet_write32(snet, off, snet->cfg->vq_num); + off += 4; + snet_write32(snet, off, snet->cfg_irq_idx); + off += 4; + snet_write64(snet, off, snet->negotiated_features); + off += 8; + /* Ignore reserved */ + off += 8; + /* Write VQs */ + for (i = 0 ; i < snet->cfg->vq_num ; i++) { + tmp = (i << 16) | (snet->vqs[i]->num & 0xFFFF); + snet_write32(snet, off, tmp); + off += 4; + snet_write32(snet, off, snet->vqs[i]->irq_idx); + off += 4; + snet_write64(snet, off, snet->vqs[i]->desc_area); + off += 8; + snet_write64(snet, off, snet->vqs[i]->device_area); + off += 8; + snet_write64(snet, off, snet->vqs[i]->driver_area); + off += 8; + /* Write VQ state if config version is 2+ */ + if (SNET_CFG_VER(snet, 2)) + snet_write32(snet, off, *(u32 *)&snet->vqs[i]->vq_state); + off += 4; + + /* Ignore reserved */ + off += 4; + } + + /* Write magic number - data is ready */ + snet_write32(snet, snet->psnet->cfg.host_cfg_off, SNET_SIGNATURE); + + /* The DPU will ACK the config by clearing the signature */ + ret = readx_poll_timeout(ioread32, snet->bar + snet->psnet->cfg.host_cfg_off, + tmp, !tmp, 10, SNET_READ_CFG_TIMEOUT); + if (ret) { + SNET_ERR(snet->pdev, "Timeout waiting for the DPU to read the config\n"); + return false; + } + + /* set DPU flag */ + snet->dpu_ready = true; + + return true; +} + +static int snet_request_irqs(struct pci_dev *pdev, struct snet *snet) +{ + int ret, i, irq; + + /* Request config IRQ */ + irq = pci_irq_vector(pdev, snet->cfg_irq_idx); + ret = devm_request_irq(&pdev->dev, irq, snet_cfg_irq_hndlr, 0, + snet->cfg_irq_name, snet); + if (ret) { + SNET_ERR(pdev, "Failed to request IRQ\n"); + return ret; + } + snet->cfg_irq = irq; + + /* Request IRQ for every VQ */ + for (i = 0; i < snet->cfg->vq_num; i++) { + irq = pci_irq_vector(pdev, snet->vqs[i]->irq_idx); + ret = devm_request_irq(&pdev->dev, irq, snet_vq_irq_hndlr, 0, + snet->vqs[i]->irq_name, snet->vqs[i]); + if (ret) { + SNET_ERR(pdev, "Failed to request IRQ\n"); + return ret; + } + snet->vqs[i]->irq = irq; + } + return 0; +} + +static void snet_set_status(struct vdpa_device *vdev, u8 status) +{ + struct snet *snet = vdpa_to_snet(vdev); + struct psnet *psnet = snet->psnet; + struct pci_dev *pdev = snet->pdev; + int ret; + bool pf_irqs; + + if (status == snet->status) + return; + + if ((status & VIRTIO_CONFIG_S_DRIVER_OK) && + !(snet->status & VIRTIO_CONFIG_S_DRIVER_OK)) { + /* Request IRQs */ + pf_irqs = PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_IRQ_PF); + ret = snet_request_irqs(pf_irqs ? pdev->physfn : pdev, snet); + if (ret) + goto set_err; + + /* Write config to the DPU */ + if (snet_write_conf(snet)) { + SNET_INFO(pdev, "Create SNET[%u] device\n", snet->sid); + } else { + snet_free_irqs(snet); + goto set_err; + } + } + + /* Save the new status */ + snet->status = status; + return; + +set_err: + snet->status |= VIRTIO_CONFIG_S_FAILED; +} + +static void snet_get_config(struct vdpa_device *vdev, unsigned int offset, + void *buf, unsigned int len) +{ + struct snet *snet = vdpa_to_snet(vdev); + void __iomem *cfg_ptr = snet->cfg->virtio_cfg + offset; + u8 *buf_ptr = buf; + u32 i; + + /* check for offset error */ + if (offset + len > snet->cfg->cfg_size) + return; + + /* Write into buffer */ + for (i = 0; i < len; i++) + *buf_ptr++ = ioread8(cfg_ptr + i); +} + +static void snet_set_config(struct vdpa_device *vdev, unsigned int offset, + const void *buf, unsigned int len) +{ + struct snet *snet = vdpa_to_snet(vdev); + void __iomem *cfg_ptr = snet->cfg->virtio_cfg + offset; + const u8 *buf_ptr = buf; + u32 i; + + /* check for offset error */ + if (offset + len > snet->cfg->cfg_size) + return; + + /* Write into PCI BAR */ + for (i = 0; i < len; i++) + iowrite8(*buf_ptr++, cfg_ptr + i); +} + +static int snet_suspend(struct vdpa_device *vdev) +{ + struct snet *snet = vdpa_to_snet(vdev); + int ret; + + ret = snet_suspend_dev(snet); + if (ret) + SNET_ERR(snet->pdev, "SNET[%u] suspend failed, err: %d\n", snet->sid, ret); + else + SNET_DBG(snet->pdev, "Suspend SNET[%u] device\n", snet->sid); + + return ret; +} + +static int snet_resume(struct vdpa_device *vdev) +{ + struct snet *snet = vdpa_to_snet(vdev); + int ret; + + ret = snet_resume_dev(snet); + if (ret) + SNET_ERR(snet->pdev, "SNET[%u] resume failed, err: %d\n", snet->sid, ret); + else + SNET_DBG(snet->pdev, "Resume SNET[%u] device\n", snet->sid); + + return ret; +} + +static const struct vdpa_config_ops snet_config_ops = { + .set_vq_address = snet_set_vq_address, + .set_vq_num = snet_set_vq_num, + .kick_vq = snet_kick_vq, + .kick_vq_with_data = snet_kick_vq_with_data, + .set_vq_cb = snet_set_vq_cb, + .set_vq_ready = snet_set_vq_ready, + .get_vq_ready = snet_get_vq_ready, + .set_vq_state = snet_set_vq_state, + .get_vq_state = snet_get_vq_state, + .get_vq_irq = snet_get_vq_irq, + .get_vq_align = snet_get_vq_align, + .reset = snet_reset, + .get_config_size = snet_get_config_size, + .get_device_features = snet_get_features, + .set_driver_features = snet_set_drv_features, + .get_driver_features = snet_get_drv_features, + .get_vq_num_min = snet_get_vq_num_max, + .get_vq_num_max = snet_get_vq_num_max, + .set_config_cb = snet_set_config_cb, + .get_device_id = snet_get_device_id, + .get_vendor_id = snet_get_vendor_id, + .get_status = snet_get_status, + .set_status = snet_set_status, + .get_config = snet_get_config, + .set_config = snet_set_config, + .suspend = snet_suspend, + .resume = snet_resume, +}; + +static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet) +{ + char name[50]; + int ret, i, mask = 0; + /* We don't know which BAR will be used to communicate.. + * We will map every bar with len > 0. + * + * Later, we will discover the BAR and unmap all other BARs. + */ + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + if (pci_resource_len(pdev, i)) + mask |= (1 << i); + } + + /* No BAR can be used.. */ + if (!mask) { + SNET_ERR(pdev, "Failed to find a PCI BAR\n"); + return -ENODEV; + } + + snprintf(name, sizeof(name), "psnet[%s]-bars", pci_name(pdev)); + ret = pcim_iomap_regions(pdev, mask, name); + if (ret) { + SNET_ERR(pdev, "Failed to request and map PCI BARs\n"); + return ret; + } + + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + if (mask & (1 << i)) + psnet->bars[i] = pcim_iomap_table(pdev)[i]; + } + + return 0; +} + +static int snet_open_vf_bar(struct pci_dev *pdev, struct snet *snet) +{ + char name[50]; + int ret; + + snprintf(name, sizeof(name), "snet[%s]-bar", pci_name(pdev)); + /* Request and map BAR */ + ret = pcim_iomap_regions(pdev, BIT(snet->psnet->cfg.vf_bar), name); + if (ret) { + SNET_ERR(pdev, "Failed to request and map PCI BAR for a VF\n"); + return ret; + } + + snet->bar = pcim_iomap_table(pdev)[snet->psnet->cfg.vf_bar]; + + return 0; +} + +static void snet_free_cfg(struct snet_cfg *cfg) +{ + u32 i; + + if (!cfg->devs) + return; + + /* Free devices */ + for (i = 0; i < cfg->devices_num; i++) { + if (!cfg->devs[i]) + break; + + kfree(cfg->devs[i]); + } + /* Free pointers to devices */ + kfree(cfg->devs); +} + +/* Detect which BAR is used for communication with the device. */ +static int psnet_detect_bar(struct psnet *psnet, u32 off) +{ + unsigned long exit_time; + int i; + + exit_time = jiffies + usecs_to_jiffies(SNET_DETECT_TIMEOUT); + + /* SNET DPU will write SNET's signature when the config is ready. */ + while (time_before(jiffies, exit_time)) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + /* Is this BAR mapped? */ + if (!psnet->bars[i]) + continue; + + if (ioread32(psnet->bars[i] + off) == SNET_SIGNATURE) + return i; + } + usleep_range(1000, 10000); + } + + return -ENODEV; +} + +static void psnet_unmap_unused_bars(struct pci_dev *pdev, struct psnet *psnet) +{ + int i, mask = 0; + + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + if (psnet->bars[i] && i != psnet->barno) + mask |= (1 << i); + } + + if (mask) + pcim_iounmap_regions(pdev, mask); +} + +/* Read SNET config from PCI BAR */ +static int psnet_read_cfg(struct pci_dev *pdev, struct psnet *psnet) +{ + struct snet_cfg *cfg = &psnet->cfg; + u32 i, off; + int barno; + + /* Move to where the config starts */ + off = SNET_CONFIG_OFF; + + /* Find BAR used for communication */ + barno = psnet_detect_bar(psnet, off); + if (barno < 0) { + SNET_ERR(pdev, "SNET config is not ready.\n"); + return barno; + } + + /* Save used BAR number and unmap all other BARs */ + psnet->barno = barno; + SNET_DBG(pdev, "Using BAR number %d\n", barno); + + psnet_unmap_unused_bars(pdev, psnet); + + /* load config from BAR */ + cfg->key = psnet_read32(psnet, off); + off += 4; + cfg->cfg_size = psnet_read32(psnet, off); + off += 4; + cfg->cfg_ver = psnet_read32(psnet, off); + off += 4; + /* The negotiated config version is the lower one between this driver's config + * and the DPU's. + */ + psnet->negotiated_cfg_ver = min_t(u32, cfg->cfg_ver, SNET_CFG_VERSION); + SNET_DBG(pdev, "SNET config version %u\n", psnet->negotiated_cfg_ver); + + cfg->vf_num = psnet_read32(psnet, off); + off += 4; + cfg->vf_bar = psnet_read32(psnet, off); + off += 4; + cfg->host_cfg_off = psnet_read32(psnet, off); + off += 4; + cfg->max_size_host_cfg = psnet_read32(psnet, off); + off += 4; + cfg->virtio_cfg_off = psnet_read32(psnet, off); + off += 4; + cfg->kick_off = psnet_read32(psnet, off); + off += 4; + cfg->hwmon_off = psnet_read32(psnet, off); + off += 4; + cfg->ctrl_off = psnet_read32(psnet, off); + off += 4; + cfg->flags = psnet_read32(psnet, off); + off += 4; + /* Ignore Reserved */ + off += sizeof(cfg->rsvd); + + cfg->devices_num = psnet_read32(psnet, off); + off += 4; + /* Allocate memory to hold pointer to the devices */ + cfg->devs = kcalloc(cfg->devices_num, sizeof(void *), GFP_KERNEL); + if (!cfg->devs) + return -ENOMEM; + + /* Load device configuration from BAR */ + for (i = 0; i < cfg->devices_num; i++) { + cfg->devs[i] = kzalloc(sizeof(*cfg->devs[i]), GFP_KERNEL); + if (!cfg->devs[i]) { + snet_free_cfg(cfg); + return -ENOMEM; + } + /* Read device config */ + cfg->devs[i]->virtio_id = psnet_read32(psnet, off); + off += 4; + cfg->devs[i]->vq_num = psnet_read32(psnet, off); + off += 4; + cfg->devs[i]->vq_size = psnet_read32(psnet, off); + off += 4; + cfg->devs[i]->vfid = psnet_read32(psnet, off); + off += 4; + cfg->devs[i]->features = psnet_read64(psnet, off); + off += 8; + /* Ignore Reserved */ + off += sizeof(cfg->devs[i]->rsvd); + + cfg->devs[i]->cfg_size = psnet_read32(psnet, off); + off += 4; + + /* Is the config witten to the DPU going to be too big? */ + if (SNET_GENERAL_CFG_LEN + SNET_GENERAL_CFG_VQ_LEN * cfg->devs[i]->vq_num > + cfg->max_size_host_cfg) { + SNET_ERR(pdev, "Failed to read SNET config, the config is too big..\n"); + snet_free_cfg(cfg); + return -EINVAL; + } + } + return 0; +} + +static int psnet_alloc_irq_vector(struct pci_dev *pdev, struct psnet *psnet) +{ + int ret = 0; + u32 i, irq_num = 0; + + /* Let's count how many IRQs we need, 1 for every VQ + 1 for config change */ + for (i = 0; i < psnet->cfg.devices_num; i++) + irq_num += psnet->cfg.devs[i]->vq_num + 1; + + ret = pci_alloc_irq_vectors(pdev, irq_num, irq_num, PCI_IRQ_MSIX); + if (ret != irq_num) { + SNET_ERR(pdev, "Failed to allocate IRQ vectors\n"); + return ret; + } + SNET_DBG(pdev, "Allocated %u IRQ vectors from physical function\n", irq_num); + + return 0; +} + +static int snet_alloc_irq_vector(struct pci_dev *pdev, struct snet_dev_cfg *snet_cfg) +{ + int ret = 0; + u32 irq_num; + + /* We want 1 IRQ for every VQ + 1 for config change events */ + irq_num = snet_cfg->vq_num + 1; + + ret = pci_alloc_irq_vectors(pdev, irq_num, irq_num, PCI_IRQ_MSIX); + if (ret <= 0) { + SNET_ERR(pdev, "Failed to allocate IRQ vectors\n"); + return ret; + } + + return 0; +} + +static void snet_free_vqs(struct snet *snet) +{ + u32 i; + + if (!snet->vqs) + return; + + for (i = 0 ; i < snet->cfg->vq_num ; i++) { + if (!snet->vqs[i]) + break; + + kfree(snet->vqs[i]); + } + kfree(snet->vqs); +} + +static int snet_build_vqs(struct snet *snet) +{ + u32 i; + /* Allocate the VQ pointers array */ + snet->vqs = kcalloc(snet->cfg->vq_num, sizeof(void *), GFP_KERNEL); + if (!snet->vqs) + return -ENOMEM; + + /* Allocate the VQs */ + for (i = 0; i < snet->cfg->vq_num; i++) { + snet->vqs[i] = kzalloc(sizeof(*snet->vqs[i]), GFP_KERNEL); + if (!snet->vqs[i]) { + snet_free_vqs(snet); + return -ENOMEM; + } + /* Reset IRQ num */ + snet->vqs[i]->irq = -1; + /* VQ serial ID */ + snet->vqs[i]->sid = i; + /* Kick address - every VQ gets 4B */ + snet->vqs[i]->kick_ptr = snet->bar + snet->psnet->cfg.kick_off + + snet->vqs[i]->sid * 4; + /* Clear kick address for this VQ */ + iowrite32(0, snet->vqs[i]->kick_ptr); + } + return 0; +} + +static int psnet_get_next_irq_num(struct psnet *psnet) +{ + int irq; + + spin_lock(&psnet->lock); + irq = psnet->next_irq++; + spin_unlock(&psnet->lock); + + return irq; +} + +static void snet_reserve_irq_idx(struct pci_dev *pdev, struct snet *snet) +{ + struct psnet *psnet = snet->psnet; + int i; + + /* one IRQ for every VQ, and one for config changes */ + snet->cfg_irq_idx = psnet_get_next_irq_num(psnet); + snprintf(snet->cfg_irq_name, SNET_NAME_SIZE, "snet[%s]-cfg[%d]", + pci_name(pdev), snet->cfg_irq_idx); + + for (i = 0; i < snet->cfg->vq_num; i++) { + /* Get next free IRQ ID */ + snet->vqs[i]->irq_idx = psnet_get_next_irq_num(psnet); + /* Write IRQ name */ + snprintf(snet->vqs[i]->irq_name, SNET_NAME_SIZE, "snet[%s]-vq[%d]", + pci_name(pdev), snet->vqs[i]->irq_idx); + } +} + +/* Find a device config based on virtual function id */ +static struct snet_dev_cfg *snet_find_dev_cfg(struct snet_cfg *cfg, u32 vfid) +{ + u32 i; + + for (i = 0; i < cfg->devices_num; i++) { + if (cfg->devs[i]->vfid == vfid) + return cfg->devs[i]; + } + /* Oppss.. no config found.. */ + return NULL; +} + +/* Probe function for a physical PCI function */ +static int snet_vdpa_probe_pf(struct pci_dev *pdev) +{ + struct psnet *psnet; + int ret = 0; + bool pf_irqs = false; + + ret = pcim_enable_device(pdev); + if (ret) { + SNET_ERR(pdev, "Failed to enable PCI device\n"); + return ret; + } + + /* Allocate a PCI physical function device */ + psnet = kzalloc(sizeof(*psnet), GFP_KERNEL); + if (!psnet) + return -ENOMEM; + + /* Init PSNET spinlock */ + spin_lock_init(&psnet->lock); + + pci_set_master(pdev); + pci_set_drvdata(pdev, psnet); + + /* Open SNET MAIN BAR */ + ret = psnet_open_pf_bar(pdev, psnet); + if (ret) + goto free_psnet; + + /* Try to read SNET's config from PCI BAR */ + ret = psnet_read_cfg(pdev, psnet); + if (ret) + goto free_psnet; + + /* If SNET_CFG_FLAG_IRQ_PF flag is set, we should use + * PF MSI-X vectors + */ + pf_irqs = PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_IRQ_PF); + + if (pf_irqs) { + ret = psnet_alloc_irq_vector(pdev, psnet); + if (ret) + goto free_cfg; + } + + SNET_DBG(pdev, "Enable %u virtual functions\n", psnet->cfg.vf_num); + ret = pci_enable_sriov(pdev, psnet->cfg.vf_num); + if (ret) { + SNET_ERR(pdev, "Failed to enable SR-IOV\n"); + goto free_irq; + } + + /* Create HW monitor device */ + if (PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_HWMON)) { +#if IS_ENABLED(CONFIG_HWMON) + psnet_create_hwmon(pdev); +#else + SNET_WARN(pdev, "Can't start HWMON, CONFIG_HWMON is not enabled\n"); +#endif + } + + return 0; + +free_irq: + if (pf_irqs) + pci_free_irq_vectors(pdev); +free_cfg: + snet_free_cfg(&psnet->cfg); +free_psnet: + kfree(psnet); + return ret; +} + +/* Probe function for a virtual PCI function */ +static int snet_vdpa_probe_vf(struct pci_dev *pdev) +{ + struct pci_dev *pdev_pf = pdev->physfn; + struct psnet *psnet = pci_get_drvdata(pdev_pf); + struct snet_dev_cfg *dev_cfg; + struct snet *snet; + u32 vfid; + int ret; + bool pf_irqs = false; + + /* Get virtual function id. + * (the DPU counts the VFs from 1) + */ + ret = pci_iov_vf_id(pdev); + if (ret < 0) { + SNET_ERR(pdev, "Failed to find a VF id\n"); + return ret; + } + vfid = ret + 1; + + /* Find the snet_dev_cfg based on vfid */ + dev_cfg = snet_find_dev_cfg(&psnet->cfg, vfid); + if (!dev_cfg) { + SNET_WARN(pdev, "Failed to find a VF config..\n"); + return -ENODEV; + } + + /* Which PCI device should allocate the IRQs? + * If the SNET_CFG_FLAG_IRQ_PF flag set, the PF device allocates the IRQs + */ + pf_irqs = PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_IRQ_PF); + + ret = pcim_enable_device(pdev); + if (ret) { + SNET_ERR(pdev, "Failed to enable PCI VF device\n"); + return ret; + } + + /* Request for MSI-X IRQs */ + if (!pf_irqs) { + ret = snet_alloc_irq_vector(pdev, dev_cfg); + if (ret) + return ret; + } + + /* Allocate vdpa device */ + snet = vdpa_alloc_device(struct snet, vdpa, &pdev->dev, &snet_config_ops, 1, 1, NULL, + false); + if (!snet) { + SNET_ERR(pdev, "Failed to allocate a vdpa device\n"); + ret = -ENOMEM; + goto free_irqs; + } + + /* Init control mutex and spinlock */ + mutex_init(&snet->ctrl_lock); + spin_lock_init(&snet->ctrl_spinlock); + + /* Save pci device pointer */ + snet->pdev = pdev; + snet->psnet = psnet; + snet->cfg = dev_cfg; + snet->dpu_ready = false; + snet->sid = vfid; + /* Reset IRQ value */ + snet->cfg_irq = -1; + + ret = snet_open_vf_bar(pdev, snet); + if (ret) + goto put_device; + + /* Create a VirtIO config pointer */ + snet->cfg->virtio_cfg = snet->bar + snet->psnet->cfg.virtio_cfg_off; + + /* Clear control registers */ + snet_ctrl_clear(snet); + + pci_set_master(pdev); + pci_set_drvdata(pdev, snet); + + ret = snet_build_vqs(snet); + if (ret) + goto put_device; + + /* Reserve IRQ indexes, + * The IRQs may be requested and freed multiple times, + * but the indexes won't change. + */ + snet_reserve_irq_idx(pf_irqs ? pdev_pf : pdev, snet); + + /*set DMA device*/ + snet->vdpa.dma_dev = &pdev->dev; + + /* Register VDPA device */ + ret = vdpa_register_device(&snet->vdpa, snet->cfg->vq_num); + if (ret) { + SNET_ERR(pdev, "Failed to register vdpa device\n"); + goto free_vqs; + } + + return 0; + +free_vqs: + snet_free_vqs(snet); +put_device: + put_device(&snet->vdpa.dev); +free_irqs: + if (!pf_irqs) + pci_free_irq_vectors(pdev); + return ret; +} + +static int snet_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + if (pdev->is_virtfn) + return snet_vdpa_probe_vf(pdev); + else + return snet_vdpa_probe_pf(pdev); +} + +static void snet_vdpa_remove_pf(struct pci_dev *pdev) +{ + struct psnet *psnet = pci_get_drvdata(pdev); + + pci_disable_sriov(pdev); + /* If IRQs are allocated from the PF, we should free the IRQs */ + if (PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_IRQ_PF)) + pci_free_irq_vectors(pdev); + + snet_free_cfg(&psnet->cfg); + kfree(psnet); +} + +static void snet_vdpa_remove_vf(struct pci_dev *pdev) +{ + struct snet *snet = pci_get_drvdata(pdev); + struct psnet *psnet = snet->psnet; + + vdpa_unregister_device(&snet->vdpa); + snet_free_vqs(snet); + /* If IRQs are allocated from the VF, we should free the IRQs */ + if (!PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_IRQ_PF)) + pci_free_irq_vectors(pdev); +} + +static void snet_vdpa_remove(struct pci_dev *pdev) +{ + if (pdev->is_virtfn) + snet_vdpa_remove_vf(pdev); + else + snet_vdpa_remove_pf(pdev); +} + +static struct pci_device_id snet_driver_pci_ids[] = { + { PCI_DEVICE_SUB(PCI_VENDOR_ID_SOLIDRUN, SNET_DEVICE_ID, + PCI_VENDOR_ID_SOLIDRUN, SNET_DEVICE_ID) }, + { 0 }, +}; + +MODULE_DEVICE_TABLE(pci, snet_driver_pci_ids); + +static struct pci_driver snet_vdpa_driver = { + .name = "snet-vdpa-driver", + .id_table = snet_driver_pci_ids, + .probe = snet_vdpa_probe, + .remove = snet_vdpa_remove, +}; + +module_pci_driver(snet_vdpa_driver); + +MODULE_AUTHOR("Alvaro Karsz <alvaro.karsz@solid-run.com>"); +MODULE_DESCRIPTION("SolidRun vDPA driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/vdpa/solidrun/snet_vdpa.h b/drivers/vdpa/solidrun/snet_vdpa.h new file mode 100644 index 0000000000..36ac285835 --- /dev/null +++ b/drivers/vdpa/solidrun/snet_vdpa.h @@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * SolidRun DPU driver for control plane + * + * Copyright (C) 2022-2023 SolidRun + * + * Author: Alvaro Karsz <alvaro.karsz@solid-run.com> + * + */ +#ifndef _SNET_VDPA_H_ +#define _SNET_VDPA_H_ + +#include <linux/vdpa.h> +#include <linux/pci.h> + +#define SNET_NAME_SIZE 256 + +#define SNET_ERR(pdev, fmt, ...) dev_err(&(pdev)->dev, "%s"fmt, "snet_vdpa: ", ##__VA_ARGS__) +#define SNET_WARN(pdev, fmt, ...) dev_warn(&(pdev)->dev, "%s"fmt, "snet_vdpa: ", ##__VA_ARGS__) +#define SNET_INFO(pdev, fmt, ...) dev_info(&(pdev)->dev, "%s"fmt, "snet_vdpa: ", ##__VA_ARGS__) +#define SNET_DBG(pdev, fmt, ...) dev_dbg(&(pdev)->dev, "%s"fmt, "snet_vdpa: ", ##__VA_ARGS__) +#define SNET_HAS_FEATURE(s, f) ((s)->negotiated_features & BIT_ULL(f)) +/* Check if negotiated config version is at least @ver */ +#define SNET_CFG_VER(snet, ver) ((snet)->psnet->negotiated_cfg_ver >= (ver)) + +/* VQ struct */ +struct snet_vq { + /* VQ callback */ + struct vdpa_callback cb; + /* VQ state received from bus */ + struct vdpa_vq_state vq_state; + /* desc base address */ + u64 desc_area; + /* device base address */ + u64 device_area; + /* driver base address */ + u64 driver_area; + /* Queue size */ + u32 num; + /* Serial ID for VQ */ + u32 sid; + /* is ready flag */ + bool ready; + /* IRQ number */ + u32 irq; + /* IRQ index, DPU uses this to parse data from MSI-X table */ + u32 irq_idx; + /* IRQ name */ + char irq_name[SNET_NAME_SIZE]; + /* pointer to mapped PCI BAR register used by this VQ to kick */ + void __iomem *kick_ptr; +}; + +struct snet { + /* vdpa device */ + struct vdpa_device vdpa; + /* Config callback */ + struct vdpa_callback cb; + /* To lock the control mechanism */ + struct mutex ctrl_lock; + /* Spinlock to protect critical parts in the control mechanism */ + spinlock_t ctrl_spinlock; + /* array of virqueues */ + struct snet_vq **vqs; + /* Used features */ + u64 negotiated_features; + /* Device serial ID */ + u32 sid; + /* device status */ + u8 status; + /* boolean indicating if snet config was passed to the device */ + bool dpu_ready; + /* IRQ number */ + u32 cfg_irq; + /* IRQ index, DPU uses this to parse data from MSI-X table */ + u32 cfg_irq_idx; + /* IRQ name */ + char cfg_irq_name[SNET_NAME_SIZE]; + /* BAR to access the VF */ + void __iomem *bar; + /* PCI device */ + struct pci_dev *pdev; + /* Pointer to snet pdev parent device */ + struct psnet *psnet; + /* Pointer to snet config device */ + struct snet_dev_cfg *cfg; +}; + +struct snet_dev_cfg { + /* Device ID following VirtIO spec. */ + u32 virtio_id; + /* Number of VQs for this device */ + u32 vq_num; + /* Size of every VQ */ + u32 vq_size; + /* Virtual Function id */ + u32 vfid; + /* Device features, following VirtIO spec */ + u64 features; + /* Reserved for future usage */ + u32 rsvd[6]; + /* VirtIO device specific config size */ + u32 cfg_size; + /* VirtIO device specific config address */ + void __iomem *virtio_cfg; +} __packed; + +struct snet_cfg { + /* Magic key */ + u32 key; + /* Size of total config in bytes */ + u32 cfg_size; + /* Config version */ + u32 cfg_ver; + /* Number of Virtual Functions to create */ + u32 vf_num; + /* BAR to use for the VFs */ + u32 vf_bar; + /* Where should we write the SNET's config */ + u32 host_cfg_off; + /* Max. allowed size for a SNET's config */ + u32 max_size_host_cfg; + /* VirtIO config offset in BAR */ + u32 virtio_cfg_off; + /* Offset in PCI BAR for VQ kicks */ + u32 kick_off; + /* Offset in PCI BAR for HW monitoring */ + u32 hwmon_off; + /* Offset in PCI BAR for Control mechanism */ + u32 ctrl_off; + /* Config general flags - enum snet_cfg_flags */ + u32 flags; + /* Reserved for future usage */ + u32 rsvd[6]; + /* Number of snet devices */ + u32 devices_num; + /* The actual devices */ + struct snet_dev_cfg **devs; +} __packed; + +/* SolidNET PCIe device, one device per PCIe physical function */ +struct psnet { + /* PCI BARs */ + void __iomem *bars[PCI_STD_NUM_BARS]; + /* Negotiated config version */ + u32 negotiated_cfg_ver; + /* Next IRQ index to use in case when the IRQs are allocated from this device */ + u32 next_irq; + /* BAR number used to communicate with the device */ + u8 barno; + /* spinlock to protect data that can be changed by SNET devices */ + spinlock_t lock; + /* Pointer to the device's config read from BAR */ + struct snet_cfg cfg; + /* Name of monitor device */ + char hwmon_name[SNET_NAME_SIZE]; +}; + +enum snet_cfg_flags { + /* Create a HWMON device */ + SNET_CFG_FLAG_HWMON = BIT(0), + /* USE IRQs from the physical function */ + SNET_CFG_FLAG_IRQ_PF = BIT(1), +}; + +#define PSNET_FLAG_ON(p, f) ((p)->cfg.flags & (f)) + +static inline u32 psnet_read32(struct psnet *psnet, u32 off) +{ + return ioread32(psnet->bars[psnet->barno] + off); +} + +static inline u32 snet_read32(struct snet *snet, u32 off) +{ + return ioread32(snet->bar + off); +} + +static inline void snet_write32(struct snet *snet, u32 off, u32 val) +{ + iowrite32(val, snet->bar + off); +} + +static inline u64 psnet_read64(struct psnet *psnet, u32 off) +{ + u64 val; + /* 64bits are written in 2 halves, low part first */ + val = (u64)psnet_read32(psnet, off); + val |= ((u64)psnet_read32(psnet, off + 4) << 32); + return val; +} + +static inline void snet_write64(struct snet *snet, u32 off, u64 val) +{ + /* The DPU expects a 64bit integer in 2 halves, the low part first */ + snet_write32(snet, off, (u32)val); + snet_write32(snet, off + 4, (u32)(val >> 32)); +} + +#if IS_ENABLED(CONFIG_HWMON) +void psnet_create_hwmon(struct pci_dev *pdev); +#endif + +void snet_ctrl_clear(struct snet *snet); +int snet_destroy_dev(struct snet *snet); +int snet_read_vq_state(struct snet *snet, u16 idx, struct vdpa_vq_state *state); +int snet_suspend_dev(struct snet *snet); +int snet_resume_dev(struct snet *snet); + +#endif //_SNET_VDPA_H_ diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c new file mode 100644 index 0000000000..a7612e0783 --- /dev/null +++ b/drivers/vdpa/vdpa.c @@ -0,0 +1,1329 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vDPA bus. + * + * Copyright (c) 2020, Red Hat. All rights reserved. + * Author: Jason Wang <jasowang@redhat.com> + * + */ + +#include <linux/module.h> +#include <linux/idr.h> +#include <linux/slab.h> +#include <linux/vdpa.h> +#include <uapi/linux/vdpa.h> +#include <net/genetlink.h> +#include <linux/mod_devicetable.h> +#include <linux/virtio_ids.h> + +static LIST_HEAD(mdev_head); +/* A global mutex that protects vdpa management device and device level operations. */ +static DECLARE_RWSEM(vdpa_dev_lock); +static DEFINE_IDA(vdpa_index_ida); + +void vdpa_set_status(struct vdpa_device *vdev, u8 status) +{ + down_write(&vdev->cf_lock); + vdev->config->set_status(vdev, status); + up_write(&vdev->cf_lock); +} +EXPORT_SYMBOL(vdpa_set_status); + +static struct genl_family vdpa_nl_family; + +static int vdpa_dev_probe(struct device *d) +{ + struct vdpa_device *vdev = dev_to_vdpa(d); + struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver); + const struct vdpa_config_ops *ops = vdev->config; + u32 max_num, min_num = 1; + int ret = 0; + + d->dma_mask = &d->coherent_dma_mask; + ret = dma_set_mask_and_coherent(d, DMA_BIT_MASK(64)); + if (ret) + return ret; + + max_num = ops->get_vq_num_max(vdev); + if (ops->get_vq_num_min) + min_num = ops->get_vq_num_min(vdev); + if (max_num < min_num) + return -EINVAL; + + if (drv && drv->probe) + ret = drv->probe(vdev); + + return ret; +} + +static void vdpa_dev_remove(struct device *d) +{ + struct vdpa_device *vdev = dev_to_vdpa(d); + struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver); + + if (drv && drv->remove) + drv->remove(vdev); +} + +static int vdpa_dev_match(struct device *dev, struct device_driver *drv) +{ + struct vdpa_device *vdev = dev_to_vdpa(dev); + + /* Check override first, and if set, only use the named driver */ + if (vdev->driver_override) + return strcmp(vdev->driver_override, drv->name) == 0; + + /* Currently devices must be supported by all vDPA bus drivers */ + return 1; +} + +static ssize_t driver_override_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct vdpa_device *vdev = dev_to_vdpa(dev); + int ret; + + ret = driver_set_override(dev, &vdev->driver_override, buf, count); + if (ret) + return ret; + + return count; +} + +static ssize_t driver_override_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct vdpa_device *vdev = dev_to_vdpa(dev); + ssize_t len; + + device_lock(dev); + len = snprintf(buf, PAGE_SIZE, "%s\n", vdev->driver_override); + device_unlock(dev); + + return len; +} +static DEVICE_ATTR_RW(driver_override); + +static struct attribute *vdpa_dev_attrs[] = { + &dev_attr_driver_override.attr, + NULL, +}; + +static const struct attribute_group vdpa_dev_group = { + .attrs = vdpa_dev_attrs, +}; +__ATTRIBUTE_GROUPS(vdpa_dev); + +static struct bus_type vdpa_bus = { + .name = "vdpa", + .dev_groups = vdpa_dev_groups, + .match = vdpa_dev_match, + .probe = vdpa_dev_probe, + .remove = vdpa_dev_remove, +}; + +static void vdpa_release_dev(struct device *d) +{ + struct vdpa_device *vdev = dev_to_vdpa(d); + const struct vdpa_config_ops *ops = vdev->config; + + if (ops->free) + ops->free(vdev); + + ida_simple_remove(&vdpa_index_ida, vdev->index); + kfree(vdev->driver_override); + kfree(vdev); +} + +/** + * __vdpa_alloc_device - allocate and initilaize a vDPA device + * This allows driver to some prepartion after device is + * initialized but before registered. + * @parent: the parent device + * @config: the bus operations that is supported by this device + * @ngroups: number of groups supported by this device + * @nas: number of address spaces supported by this device + * @size: size of the parent structure that contains private data + * @name: name of the vdpa device; optional. + * @use_va: indicate whether virtual address must be used by this device + * + * Driver should use vdpa_alloc_device() wrapper macro instead of + * using this directly. + * + * Return: Returns an error when parent/config/dma_dev is not set or fail to get + * ida. + */ +struct vdpa_device *__vdpa_alloc_device(struct device *parent, + const struct vdpa_config_ops *config, + unsigned int ngroups, unsigned int nas, + size_t size, const char *name, + bool use_va) +{ + struct vdpa_device *vdev; + int err = -EINVAL; + + if (!config) + goto err; + + if (!!config->dma_map != !!config->dma_unmap) + goto err; + + /* It should only work for the device that use on-chip IOMMU */ + if (use_va && !(config->dma_map || config->set_map)) + goto err; + + err = -ENOMEM; + vdev = kzalloc(size, GFP_KERNEL); + if (!vdev) + goto err; + + err = ida_alloc(&vdpa_index_ida, GFP_KERNEL); + if (err < 0) + goto err_ida; + + vdev->dev.bus = &vdpa_bus; + vdev->dev.parent = parent; + vdev->dev.release = vdpa_release_dev; + vdev->index = err; + vdev->config = config; + vdev->features_valid = false; + vdev->use_va = use_va; + vdev->ngroups = ngroups; + vdev->nas = nas; + + if (name) + err = dev_set_name(&vdev->dev, "%s", name); + else + err = dev_set_name(&vdev->dev, "vdpa%u", vdev->index); + if (err) + goto err_name; + + init_rwsem(&vdev->cf_lock); + device_initialize(&vdev->dev); + + return vdev; + +err_name: + ida_simple_remove(&vdpa_index_ida, vdev->index); +err_ida: + kfree(vdev); +err: + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(__vdpa_alloc_device); + +static int vdpa_name_match(struct device *dev, const void *data) +{ + struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); + + return (strcmp(dev_name(&vdev->dev), data) == 0); +} + +static int __vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) +{ + struct device *dev; + + vdev->nvqs = nvqs; + + lockdep_assert_held(&vdpa_dev_lock); + dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match); + if (dev) { + put_device(dev); + return -EEXIST; + } + return device_add(&vdev->dev); +} + +/** + * _vdpa_register_device - register a vDPA device with vdpa lock held + * Caller must have a succeed call of vdpa_alloc_device() before. + * Caller must invoke this routine in the management device dev_add() + * callback after setting up valid mgmtdev for this vdpa device. + * @vdev: the vdpa device to be registered to vDPA bus + * @nvqs: number of virtqueues supported by this device + * + * Return: Returns an error when fail to add device to vDPA bus + */ +int _vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) +{ + if (!vdev->mdev) + return -EINVAL; + + return __vdpa_register_device(vdev, nvqs); +} +EXPORT_SYMBOL_GPL(_vdpa_register_device); + +/** + * vdpa_register_device - register a vDPA device + * Callers must have a succeed call of vdpa_alloc_device() before. + * @vdev: the vdpa device to be registered to vDPA bus + * @nvqs: number of virtqueues supported by this device + * + * Return: Returns an error when fail to add to vDPA bus + */ +int vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) +{ + int err; + + down_write(&vdpa_dev_lock); + err = __vdpa_register_device(vdev, nvqs); + up_write(&vdpa_dev_lock); + return err; +} +EXPORT_SYMBOL_GPL(vdpa_register_device); + +/** + * _vdpa_unregister_device - unregister a vDPA device + * Caller must invoke this routine as part of management device dev_del() + * callback. + * @vdev: the vdpa device to be unregisted from vDPA bus + */ +void _vdpa_unregister_device(struct vdpa_device *vdev) +{ + lockdep_assert_held(&vdpa_dev_lock); + WARN_ON(!vdev->mdev); + device_unregister(&vdev->dev); +} +EXPORT_SYMBOL_GPL(_vdpa_unregister_device); + +/** + * vdpa_unregister_device - unregister a vDPA device + * @vdev: the vdpa device to be unregisted from vDPA bus + */ +void vdpa_unregister_device(struct vdpa_device *vdev) +{ + down_write(&vdpa_dev_lock); + device_unregister(&vdev->dev); + up_write(&vdpa_dev_lock); +} +EXPORT_SYMBOL_GPL(vdpa_unregister_device); + +/** + * __vdpa_register_driver - register a vDPA device driver + * @drv: the vdpa device driver to be registered + * @owner: module owner of the driver + * + * Return: Returns an err when fail to do the registration + */ +int __vdpa_register_driver(struct vdpa_driver *drv, struct module *owner) +{ + drv->driver.bus = &vdpa_bus; + drv->driver.owner = owner; + + return driver_register(&drv->driver); +} +EXPORT_SYMBOL_GPL(__vdpa_register_driver); + +/** + * vdpa_unregister_driver - unregister a vDPA device driver + * @drv: the vdpa device driver to be unregistered + */ +void vdpa_unregister_driver(struct vdpa_driver *drv) +{ + driver_unregister(&drv->driver); +} +EXPORT_SYMBOL_GPL(vdpa_unregister_driver); + +/** + * vdpa_mgmtdev_register - register a vdpa management device + * + * @mdev: Pointer to vdpa management device + * vdpa_mgmtdev_register() register a vdpa management device which supports + * vdpa device management. + * Return: Returns 0 on success or failure when required callback ops are not + * initialized. + */ +int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev) +{ + if (!mdev->device || !mdev->ops || !mdev->ops->dev_add || !mdev->ops->dev_del) + return -EINVAL; + + INIT_LIST_HEAD(&mdev->list); + down_write(&vdpa_dev_lock); + list_add_tail(&mdev->list, &mdev_head); + up_write(&vdpa_dev_lock); + return 0; +} +EXPORT_SYMBOL_GPL(vdpa_mgmtdev_register); + +static int vdpa_match_remove(struct device *dev, void *data) +{ + struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); + struct vdpa_mgmt_dev *mdev = vdev->mdev; + + if (mdev == data) + mdev->ops->dev_del(mdev, vdev); + return 0; +} + +void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev) +{ + down_write(&vdpa_dev_lock); + + list_del(&mdev->list); + + /* Filter out all the entries belong to this management device and delete it. */ + bus_for_each_dev(&vdpa_bus, NULL, mdev, vdpa_match_remove); + + up_write(&vdpa_dev_lock); +} +EXPORT_SYMBOL_GPL(vdpa_mgmtdev_unregister); + +static void vdpa_get_config_unlocked(struct vdpa_device *vdev, + unsigned int offset, + void *buf, unsigned int len) +{ + const struct vdpa_config_ops *ops = vdev->config; + + /* + * Config accesses aren't supposed to trigger before features are set. + * If it does happen we assume a legacy guest. + */ + if (!vdev->features_valid) + vdpa_set_features_unlocked(vdev, 0); + ops->get_config(vdev, offset, buf, len); +} + +/** + * vdpa_get_config - Get one or more device configuration fields. + * @vdev: vdpa device to operate on + * @offset: starting byte offset of the field + * @buf: buffer pointer to read to + * @len: length of the configuration fields in bytes + */ +void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, + void *buf, unsigned int len) +{ + down_read(&vdev->cf_lock); + vdpa_get_config_unlocked(vdev, offset, buf, len); + up_read(&vdev->cf_lock); +} +EXPORT_SYMBOL_GPL(vdpa_get_config); + +/** + * vdpa_set_config - Set one or more device configuration fields. + * @vdev: vdpa device to operate on + * @offset: starting byte offset of the field + * @buf: buffer pointer to read from + * @length: length of the configuration fields in bytes + */ +void vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, + const void *buf, unsigned int length) +{ + down_write(&vdev->cf_lock); + vdev->config->set_config(vdev, offset, buf, length); + up_write(&vdev->cf_lock); +} +EXPORT_SYMBOL_GPL(vdpa_set_config); + +static bool mgmtdev_handle_match(const struct vdpa_mgmt_dev *mdev, + const char *busname, const char *devname) +{ + /* Bus name is optional for simulated management device, so ignore the + * device with bus if bus attribute is provided. + */ + if ((busname && !mdev->device->bus) || (!busname && mdev->device->bus)) + return false; + + if (!busname && strcmp(dev_name(mdev->device), devname) == 0) + return true; + + if (busname && (strcmp(mdev->device->bus->name, busname) == 0) && + (strcmp(dev_name(mdev->device), devname) == 0)) + return true; + + return false; +} + +static struct vdpa_mgmt_dev *vdpa_mgmtdev_get_from_attr(struct nlattr **attrs) +{ + struct vdpa_mgmt_dev *mdev; + const char *busname = NULL; + const char *devname; + + if (!attrs[VDPA_ATTR_MGMTDEV_DEV_NAME]) + return ERR_PTR(-EINVAL); + devname = nla_data(attrs[VDPA_ATTR_MGMTDEV_DEV_NAME]); + if (attrs[VDPA_ATTR_MGMTDEV_BUS_NAME]) + busname = nla_data(attrs[VDPA_ATTR_MGMTDEV_BUS_NAME]); + + list_for_each_entry(mdev, &mdev_head, list) { + if (mgmtdev_handle_match(mdev, busname, devname)) + return mdev; + } + return ERR_PTR(-ENODEV); +} + +static int vdpa_nl_mgmtdev_handle_fill(struct sk_buff *msg, const struct vdpa_mgmt_dev *mdev) +{ + if (mdev->device->bus && + nla_put_string(msg, VDPA_ATTR_MGMTDEV_BUS_NAME, mdev->device->bus->name)) + return -EMSGSIZE; + if (nla_put_string(msg, VDPA_ATTR_MGMTDEV_DEV_NAME, dev_name(mdev->device))) + return -EMSGSIZE; + return 0; +} + +static u64 vdpa_mgmtdev_get_classes(const struct vdpa_mgmt_dev *mdev, + unsigned int *nclasses) +{ + u64 supported_classes = 0; + unsigned int n = 0; + + for (int i = 0; mdev->id_table[i].device; i++) { + if (mdev->id_table[i].device > 63) + continue; + supported_classes |= BIT_ULL(mdev->id_table[i].device); + n++; + } + if (nclasses) + *nclasses = n; + + return supported_classes; +} + +static int vdpa_mgmtdev_fill(const struct vdpa_mgmt_dev *mdev, struct sk_buff *msg, + u32 portid, u32 seq, int flags) +{ + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_MGMTDEV_NEW); + if (!hdr) + return -EMSGSIZE; + err = vdpa_nl_mgmtdev_handle_fill(msg, mdev); + if (err) + goto msg_err; + + if (nla_put_u64_64bit(msg, VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES, + vdpa_mgmtdev_get_classes(mdev, NULL), + VDPA_ATTR_UNSPEC)) { + err = -EMSGSIZE; + goto msg_err; + } + if (nla_put_u32(msg, VDPA_ATTR_DEV_MGMTDEV_MAX_VQS, + mdev->max_supported_vqs)) { + err = -EMSGSIZE; + goto msg_err; + } + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_SUPPORTED_FEATURES, + mdev->supported_features, VDPA_ATTR_PAD)) { + err = -EMSGSIZE; + goto msg_err; + } + + genlmsg_end(msg, hdr); + return 0; + +msg_err: + genlmsg_cancel(msg, hdr); + return err; +} + +static int vdpa_nl_cmd_mgmtdev_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct vdpa_mgmt_dev *mdev; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + down_read(&vdpa_dev_lock); + mdev = vdpa_mgmtdev_get_from_attr(info->attrs); + if (IS_ERR(mdev)) { + up_read(&vdpa_dev_lock); + NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified mgmt device"); + err = PTR_ERR(mdev); + goto out; + } + + err = vdpa_mgmtdev_fill(mdev, msg, info->snd_portid, info->snd_seq, 0); + up_read(&vdpa_dev_lock); + if (err) + goto out; + err = genlmsg_reply(msg, info); + return err; + +out: + nlmsg_free(msg); + return err; +} + +static int +vdpa_nl_cmd_mgmtdev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct vdpa_mgmt_dev *mdev; + int start = cb->args[0]; + int idx = 0; + int err; + + down_read(&vdpa_dev_lock); + list_for_each_entry(mdev, &mdev_head, list) { + if (idx < start) { + idx++; + continue; + } + err = vdpa_mgmtdev_fill(mdev, msg, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); + if (err) + goto out; + idx++; + } +out: + up_read(&vdpa_dev_lock); + cb->args[0] = idx; + return msg->len; +} + +#define VDPA_DEV_NET_ATTRS_MASK (BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) | \ + BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU) | \ + BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) + +/* + * Bitmask for all per-device features: feature bits VIRTIO_TRANSPORT_F_START + * through VIRTIO_TRANSPORT_F_END are unset, i.e. 0xfffffc000fffffff for + * all 64bit features. If the features are extended beyond 64 bits, or new + * "holes" are reserved for other type of features than per-device, this + * macro would have to be updated. + */ +#define VIRTIO_DEVICE_F_MASK (~0ULL << (VIRTIO_TRANSPORT_F_END + 1) | \ + ((1ULL << VIRTIO_TRANSPORT_F_START) - 1)) + +static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct vdpa_dev_set_config config = {}; + struct nlattr **nl_attrs = info->attrs; + struct vdpa_mgmt_dev *mdev; + unsigned int ncls = 0; + const u8 *macaddr; + const char *name; + u64 classes; + int err = 0; + + if (!info->attrs[VDPA_ATTR_DEV_NAME]) + return -EINVAL; + + name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); + + if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]) { + macaddr = nla_data(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]); + memcpy(config.net.mac, macaddr, sizeof(config.net.mac)); + config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR); + } + if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU]) { + config.net.mtu = + nla_get_u16(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU]); + config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU); + } + if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP]) { + config.net.max_vq_pairs = + nla_get_u16(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP]); + if (!config.net.max_vq_pairs) { + NL_SET_ERR_MSG_MOD(info->extack, + "At least one pair of VQs is required"); + return -EINVAL; + } + config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP); + } + if (nl_attrs[VDPA_ATTR_DEV_FEATURES]) { + u64 missing = 0x0ULL; + + config.device_features = + nla_get_u64(nl_attrs[VDPA_ATTR_DEV_FEATURES]); + if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR] && + !(config.device_features & BIT_ULL(VIRTIO_NET_F_MAC))) + missing |= BIT_ULL(VIRTIO_NET_F_MAC); + if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU] && + !(config.device_features & BIT_ULL(VIRTIO_NET_F_MTU))) + missing |= BIT_ULL(VIRTIO_NET_F_MTU); + if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP] && + config.net.max_vq_pairs > 1 && + !(config.device_features & BIT_ULL(VIRTIO_NET_F_MQ))) + missing |= BIT_ULL(VIRTIO_NET_F_MQ); + if (missing) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "Missing features 0x%llx for provided attributes", + missing); + return -EINVAL; + } + config.mask |= BIT_ULL(VDPA_ATTR_DEV_FEATURES); + } + + /* Skip checking capability if user didn't prefer to configure any + * device networking attributes. It is likely that user might have used + * a device specific method to configure such attributes or using device + * default attributes. + */ + if ((config.mask & VDPA_DEV_NET_ATTRS_MASK) && + !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + + down_write(&vdpa_dev_lock); + mdev = vdpa_mgmtdev_get_from_attr(info->attrs); + if (IS_ERR(mdev)) { + NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified management device"); + err = PTR_ERR(mdev); + goto err; + } + + if ((config.mask & mdev->config_attr_mask) != config.mask) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "Some provided attributes are not supported: 0x%llx", + config.mask & ~mdev->config_attr_mask); + err = -EOPNOTSUPP; + goto err; + } + + classes = vdpa_mgmtdev_get_classes(mdev, &ncls); + if (config.mask & VDPA_DEV_NET_ATTRS_MASK && + !(classes & BIT_ULL(VIRTIO_ID_NET))) { + NL_SET_ERR_MSG_MOD(info->extack, + "Network class attributes provided on unsupported management device"); + err = -EINVAL; + goto err; + } + if (!(config.mask & VDPA_DEV_NET_ATTRS_MASK) && + config.mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES) && + classes & BIT_ULL(VIRTIO_ID_NET) && ncls > 1 && + config.device_features & VIRTIO_DEVICE_F_MASK) { + NL_SET_ERR_MSG_MOD(info->extack, + "Management device supports multi-class while device features specified are ambiguous"); + err = -EINVAL; + goto err; + } + + err = mdev->ops->dev_add(mdev, name, &config); +err: + up_write(&vdpa_dev_lock); + return err; +} + +static int vdpa_nl_cmd_dev_del_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct vdpa_mgmt_dev *mdev; + struct vdpa_device *vdev; + struct device *dev; + const char *name; + int err = 0; + + if (!info->attrs[VDPA_ATTR_DEV_NAME]) + return -EINVAL; + name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); + + down_write(&vdpa_dev_lock); + dev = bus_find_device(&vdpa_bus, NULL, name, vdpa_name_match); + if (!dev) { + NL_SET_ERR_MSG_MOD(info->extack, "device not found"); + err = -ENODEV; + goto dev_err; + } + vdev = container_of(dev, struct vdpa_device, dev); + if (!vdev->mdev) { + NL_SET_ERR_MSG_MOD(info->extack, "Only user created device can be deleted by user"); + err = -EINVAL; + goto mdev_err; + } + mdev = vdev->mdev; + mdev->ops->dev_del(mdev, vdev); +mdev_err: + put_device(dev); +dev_err: + up_write(&vdpa_dev_lock); + return err; +} + +static int +vdpa_dev_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq, + int flags, struct netlink_ext_ack *extack) +{ + u16 max_vq_size; + u16 min_vq_size = 1; + u32 device_id; + u32 vendor_id; + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_NEW); + if (!hdr) + return -EMSGSIZE; + + err = vdpa_nl_mgmtdev_handle_fill(msg, vdev->mdev); + if (err) + goto msg_err; + + device_id = vdev->config->get_device_id(vdev); + vendor_id = vdev->config->get_vendor_id(vdev); + max_vq_size = vdev->config->get_vq_num_max(vdev); + if (vdev->config->get_vq_num_min) + min_vq_size = vdev->config->get_vq_num_min(vdev); + + err = -EMSGSIZE; + if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) + goto msg_err; + if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) + goto msg_err; + if (nla_put_u32(msg, VDPA_ATTR_DEV_VENDOR_ID, vendor_id)) + goto msg_err; + if (nla_put_u32(msg, VDPA_ATTR_DEV_MAX_VQS, vdev->nvqs)) + goto msg_err; + if (nla_put_u16(msg, VDPA_ATTR_DEV_MAX_VQ_SIZE, max_vq_size)) + goto msg_err; + if (nla_put_u16(msg, VDPA_ATTR_DEV_MIN_VQ_SIZE, min_vq_size)) + goto msg_err; + + genlmsg_end(msg, hdr); + return 0; + +msg_err: + genlmsg_cancel(msg, hdr); + return err; +} + +static int vdpa_nl_cmd_dev_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct vdpa_device *vdev; + struct sk_buff *msg; + const char *devname; + struct device *dev; + int err; + + if (!info->attrs[VDPA_ATTR_DEV_NAME]) + return -EINVAL; + devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + down_read(&vdpa_dev_lock); + dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); + if (!dev) { + NL_SET_ERR_MSG_MOD(info->extack, "device not found"); + err = -ENODEV; + goto err; + } + vdev = container_of(dev, struct vdpa_device, dev); + if (!vdev->mdev) { + err = -EINVAL; + goto mdev_err; + } + err = vdpa_dev_fill(vdev, msg, info->snd_portid, info->snd_seq, 0, info->extack); + if (err) + goto mdev_err; + + err = genlmsg_reply(msg, info); + put_device(dev); + up_read(&vdpa_dev_lock); + return err; + +mdev_err: + put_device(dev); +err: + up_read(&vdpa_dev_lock); + nlmsg_free(msg); + return err; +} + +struct vdpa_dev_dump_info { + struct sk_buff *msg; + struct netlink_callback *cb; + int start_idx; + int idx; +}; + +static int vdpa_dev_dump(struct device *dev, void *data) +{ + struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); + struct vdpa_dev_dump_info *info = data; + int err; + + if (!vdev->mdev) + return 0; + if (info->idx < info->start_idx) { + info->idx++; + return 0; + } + err = vdpa_dev_fill(vdev, info->msg, NETLINK_CB(info->cb->skb).portid, + info->cb->nlh->nlmsg_seq, NLM_F_MULTI, info->cb->extack); + if (err) + return err; + + info->idx++; + return 0; +} + +static int vdpa_nl_cmd_dev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct vdpa_dev_dump_info info; + + info.msg = msg; + info.cb = cb; + info.start_idx = cb->args[0]; + info.idx = 0; + + down_read(&vdpa_dev_lock); + bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_dump); + up_read(&vdpa_dev_lock); + cb->args[0] = info.idx; + return msg->len; +} + +static int vdpa_dev_net_mq_config_fill(struct sk_buff *msg, u64 features, + const struct virtio_net_config *config) +{ + u16 val_u16; + + if ((features & BIT_ULL(VIRTIO_NET_F_MQ)) == 0 && + (features & BIT_ULL(VIRTIO_NET_F_RSS)) == 0) + return 0; + + val_u16 = __virtio16_to_cpu(true, config->max_virtqueue_pairs); + + return nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MAX_VQP, val_u16); +} + +static int vdpa_dev_net_mtu_config_fill(struct sk_buff *msg, u64 features, + const struct virtio_net_config *config) +{ + u16 val_u16; + + if ((features & BIT_ULL(VIRTIO_NET_F_MTU)) == 0) + return 0; + + val_u16 = __virtio16_to_cpu(true, config->mtu); + + return nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MTU, val_u16); +} + +static int vdpa_dev_net_mac_config_fill(struct sk_buff *msg, u64 features, + const struct virtio_net_config *config) +{ + if ((features & BIT_ULL(VIRTIO_NET_F_MAC)) == 0) + return 0; + + return nla_put(msg, VDPA_ATTR_DEV_NET_CFG_MACADDR, + sizeof(config->mac), config->mac); +} + +static int vdpa_dev_net_status_config_fill(struct sk_buff *msg, u64 features, + const struct virtio_net_config *config) +{ + u16 val_u16; + + if ((features & BIT_ULL(VIRTIO_NET_F_STATUS)) == 0) + return 0; + + val_u16 = __virtio16_to_cpu(true, config->status); + return nla_put_u16(msg, VDPA_ATTR_DEV_NET_STATUS, val_u16); +} + +static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *msg) +{ + struct virtio_net_config config = {}; + u64 features_device; + + vdev->config->get_config(vdev, 0, &config, sizeof(config)); + + features_device = vdev->config->get_device_features(vdev); + + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_FEATURES, features_device, + VDPA_ATTR_PAD)) + return -EMSGSIZE; + + if (vdpa_dev_net_mtu_config_fill(msg, features_device, &config)) + return -EMSGSIZE; + + if (vdpa_dev_net_mac_config_fill(msg, features_device, &config)) + return -EMSGSIZE; + + if (vdpa_dev_net_status_config_fill(msg, features_device, &config)) + return -EMSGSIZE; + + return vdpa_dev_net_mq_config_fill(msg, features_device, &config); +} + +static int +vdpa_dev_config_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq, + int flags, struct netlink_ext_ack *extack) +{ + u64 features_driver; + u8 status = 0; + u32 device_id; + void *hdr; + int err; + + down_read(&vdev->cf_lock); + hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, + VDPA_CMD_DEV_CONFIG_GET); + if (!hdr) { + err = -EMSGSIZE; + goto out; + } + + if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) { + err = -EMSGSIZE; + goto msg_err; + } + + device_id = vdev->config->get_device_id(vdev); + if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) { + err = -EMSGSIZE; + goto msg_err; + } + + /* only read driver features after the feature negotiation is done */ + status = vdev->config->get_status(vdev); + if (status & VIRTIO_CONFIG_S_FEATURES_OK) { + features_driver = vdev->config->get_driver_features(vdev); + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, features_driver, + VDPA_ATTR_PAD)) { + err = -EMSGSIZE; + goto msg_err; + } + } + + switch (device_id) { + case VIRTIO_ID_NET: + err = vdpa_dev_net_config_fill(vdev, msg); + break; + default: + err = -EOPNOTSUPP; + break; + } + if (err) + goto msg_err; + + up_read(&vdev->cf_lock); + genlmsg_end(msg, hdr); + return 0; + +msg_err: + genlmsg_cancel(msg, hdr); +out: + up_read(&vdev->cf_lock); + return err; +} + +static int vdpa_fill_stats_rec(struct vdpa_device *vdev, struct sk_buff *msg, + struct genl_info *info, u32 index) +{ + struct virtio_net_config config = {}; + u64 features; + u8 status; + int err; + + status = vdev->config->get_status(vdev); + if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) { + NL_SET_ERR_MSG_MOD(info->extack, "feature negotiation not complete"); + return -EAGAIN; + } + vdpa_get_config_unlocked(vdev, 0, &config, sizeof(config)); + + features = vdev->config->get_driver_features(vdev); + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, + features, VDPA_ATTR_PAD)) + return -EMSGSIZE; + + err = vdpa_dev_net_mq_config_fill(msg, features, &config); + if (err) + return err; + + if (nla_put_u32(msg, VDPA_ATTR_DEV_QUEUE_INDEX, index)) + return -EMSGSIZE; + + err = vdev->config->get_vendor_vq_stats(vdev, index, msg, info->extack); + if (err) + return err; + + return 0; +} + +static int vendor_stats_fill(struct vdpa_device *vdev, struct sk_buff *msg, + struct genl_info *info, u32 index) +{ + int err; + + down_read(&vdev->cf_lock); + if (!vdev->config->get_vendor_vq_stats) { + err = -EOPNOTSUPP; + goto out; + } + + err = vdpa_fill_stats_rec(vdev, msg, info, index); +out: + up_read(&vdev->cf_lock); + return err; +} + +static int vdpa_dev_vendor_stats_fill(struct vdpa_device *vdev, + struct sk_buff *msg, + struct genl_info *info, u32 index) +{ + u32 device_id; + void *hdr; + int err; + u32 portid = info->snd_portid; + u32 seq = info->snd_seq; + u32 flags = 0; + + hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, + VDPA_CMD_DEV_VSTATS_GET); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) { + err = -EMSGSIZE; + goto undo_msg; + } + + device_id = vdev->config->get_device_id(vdev); + if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) { + err = -EMSGSIZE; + goto undo_msg; + } + + switch (device_id) { + case VIRTIO_ID_NET: + if (index > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX) { + NL_SET_ERR_MSG_MOD(info->extack, "queue index exceeds max value"); + err = -ERANGE; + break; + } + + err = vendor_stats_fill(vdev, msg, info, index); + break; + default: + err = -EOPNOTSUPP; + break; + } + genlmsg_end(msg, hdr); + + return err; + +undo_msg: + genlmsg_cancel(msg, hdr); + return err; +} + +static int vdpa_nl_cmd_dev_config_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct vdpa_device *vdev; + struct sk_buff *msg; + const char *devname; + struct device *dev; + int err; + + if (!info->attrs[VDPA_ATTR_DEV_NAME]) + return -EINVAL; + devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + down_read(&vdpa_dev_lock); + dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); + if (!dev) { + NL_SET_ERR_MSG_MOD(info->extack, "device not found"); + err = -ENODEV; + goto dev_err; + } + vdev = container_of(dev, struct vdpa_device, dev); + if (!vdev->mdev) { + NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device"); + err = -EINVAL; + goto mdev_err; + } + err = vdpa_dev_config_fill(vdev, msg, info->snd_portid, info->snd_seq, + 0, info->extack); + if (!err) + err = genlmsg_reply(msg, info); + +mdev_err: + put_device(dev); +dev_err: + up_read(&vdpa_dev_lock); + if (err) + nlmsg_free(msg); + return err; +} + +static int vdpa_dev_config_dump(struct device *dev, void *data) +{ + struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); + struct vdpa_dev_dump_info *info = data; + int err; + + if (!vdev->mdev) + return 0; + if (info->idx < info->start_idx) { + info->idx++; + return 0; + } + err = vdpa_dev_config_fill(vdev, info->msg, NETLINK_CB(info->cb->skb).portid, + info->cb->nlh->nlmsg_seq, NLM_F_MULTI, + info->cb->extack); + if (err) + return err; + + info->idx++; + return 0; +} + +static int +vdpa_nl_cmd_dev_config_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct vdpa_dev_dump_info info; + + info.msg = msg; + info.cb = cb; + info.start_idx = cb->args[0]; + info.idx = 0; + + down_read(&vdpa_dev_lock); + bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_config_dump); + up_read(&vdpa_dev_lock); + cb->args[0] = info.idx; + return msg->len; +} + +static int vdpa_nl_cmd_dev_stats_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct vdpa_device *vdev; + struct sk_buff *msg; + const char *devname; + struct device *dev; + u32 index; + int err; + + if (!info->attrs[VDPA_ATTR_DEV_NAME]) + return -EINVAL; + + if (!info->attrs[VDPA_ATTR_DEV_QUEUE_INDEX]) + return -EINVAL; + + devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + index = nla_get_u32(info->attrs[VDPA_ATTR_DEV_QUEUE_INDEX]); + down_read(&vdpa_dev_lock); + dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); + if (!dev) { + NL_SET_ERR_MSG_MOD(info->extack, "device not found"); + err = -ENODEV; + goto dev_err; + } + vdev = container_of(dev, struct vdpa_device, dev); + if (!vdev->mdev) { + NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device"); + err = -EINVAL; + goto mdev_err; + } + err = vdpa_dev_vendor_stats_fill(vdev, msg, info, index); + if (err) + goto mdev_err; + + err = genlmsg_reply(msg, info); + + put_device(dev); + up_read(&vdpa_dev_lock); + + return err; + +mdev_err: + put_device(dev); +dev_err: + nlmsg_free(msg); + up_read(&vdpa_dev_lock); + return err; +} + +static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = { + [VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING }, + [VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING }, + [VDPA_ATTR_DEV_NAME] = { .type = NLA_STRING }, + [VDPA_ATTR_DEV_NET_CFG_MACADDR] = NLA_POLICY_ETH_ADDR, + [VDPA_ATTR_DEV_NET_CFG_MAX_VQP] = { .type = NLA_U16 }, + /* virtio spec 1.1 section 5.1.4.1 for valid MTU range */ + [VDPA_ATTR_DEV_NET_CFG_MTU] = NLA_POLICY_MIN(NLA_U16, 68), + [VDPA_ATTR_DEV_QUEUE_INDEX] = { .type = NLA_U32 }, + [VDPA_ATTR_DEV_FEATURES] = { .type = NLA_U64 }, +}; + +static const struct genl_ops vdpa_nl_ops[] = { + { + .cmd = VDPA_CMD_MGMTDEV_GET, + .doit = vdpa_nl_cmd_mgmtdev_get_doit, + .dumpit = vdpa_nl_cmd_mgmtdev_get_dumpit, + }, + { + .cmd = VDPA_CMD_DEV_NEW, + .doit = vdpa_nl_cmd_dev_add_set_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = VDPA_CMD_DEV_DEL, + .doit = vdpa_nl_cmd_dev_del_set_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = VDPA_CMD_DEV_GET, + .doit = vdpa_nl_cmd_dev_get_doit, + .dumpit = vdpa_nl_cmd_dev_get_dumpit, + }, + { + .cmd = VDPA_CMD_DEV_CONFIG_GET, + .doit = vdpa_nl_cmd_dev_config_get_doit, + .dumpit = vdpa_nl_cmd_dev_config_get_dumpit, + }, + { + .cmd = VDPA_CMD_DEV_VSTATS_GET, + .doit = vdpa_nl_cmd_dev_stats_get_doit, + .flags = GENL_ADMIN_PERM, + }, +}; + +static struct genl_family vdpa_nl_family __ro_after_init = { + .name = VDPA_GENL_NAME, + .version = VDPA_GENL_VERSION, + .maxattr = VDPA_ATTR_MAX, + .policy = vdpa_nl_policy, + .netnsok = false, + .module = THIS_MODULE, + .ops = vdpa_nl_ops, + .n_ops = ARRAY_SIZE(vdpa_nl_ops), + .resv_start_op = VDPA_CMD_DEV_VSTATS_GET + 1, +}; + +static int vdpa_init(void) +{ + int err; + + err = bus_register(&vdpa_bus); + if (err) + return err; + err = genl_register_family(&vdpa_nl_family); + if (err) + goto err; + return 0; + +err: + bus_unregister(&vdpa_bus); + return err; +} + +static void __exit vdpa_exit(void) +{ + genl_unregister_family(&vdpa_nl_family); + bus_unregister(&vdpa_bus); + ida_destroy(&vdpa_index_ida); +} +core_initcall(vdpa_init); +module_exit(vdpa_exit); + +MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/vdpa/vdpa_sim/Makefile b/drivers/vdpa/vdpa_sim/Makefile new file mode 100644 index 0000000000..d458103302 --- /dev/null +++ b/drivers/vdpa/vdpa_sim/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_VDPA_SIM) += vdpa_sim.o +obj-$(CONFIG_VDPA_SIM_NET) += vdpa_sim_net.o +obj-$(CONFIG_VDPA_SIM_BLOCK) += vdpa_sim_blk.o diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c new file mode 100644 index 0000000000..76d41058ad --- /dev/null +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -0,0 +1,807 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VDPA device simulator core. + * + * Copyright (c) 2020, Red Hat Inc. All rights reserved. + * Author: Jason Wang <jasowang@redhat.com> + * + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/kthread.h> +#include <linux/slab.h> +#include <linux/dma-map-ops.h> +#include <linux/vringh.h> +#include <linux/vdpa.h> +#include <linux/vhost_iotlb.h> +#include <uapi/linux/vdpa.h> +#include <uapi/linux/vhost_types.h> + +#include "vdpa_sim.h" + +#define DRV_VERSION "0.1" +#define DRV_AUTHOR "Jason Wang <jasowang@redhat.com>" +#define DRV_DESC "vDPA Device Simulator core" +#define DRV_LICENSE "GPL v2" + +static int batch_mapping = 1; +module_param(batch_mapping, int, 0444); +MODULE_PARM_DESC(batch_mapping, "Batched mapping 1 -Enable; 0 - Disable"); + +static int max_iotlb_entries = 2048; +module_param(max_iotlb_entries, int, 0444); +MODULE_PARM_DESC(max_iotlb_entries, + "Maximum number of iotlb entries for each address space. 0 means unlimited. (default: 2048)"); + +static bool use_va = true; +module_param(use_va, bool, 0444); +MODULE_PARM_DESC(use_va, "Enable/disable the device's ability to use VA"); + +#define VDPASIM_QUEUE_ALIGN PAGE_SIZE +#define VDPASIM_QUEUE_MAX 256 +#define VDPASIM_VENDOR_ID 0 + +struct vdpasim_mm_work { + struct kthread_work work; + struct vdpasim *vdpasim; + struct mm_struct *mm_to_bind; + int ret; +}; + +static void vdpasim_mm_work_fn(struct kthread_work *work) +{ + struct vdpasim_mm_work *mm_work = + container_of(work, struct vdpasim_mm_work, work); + struct vdpasim *vdpasim = mm_work->vdpasim; + + mm_work->ret = 0; + + //TODO: should we attach the cgroup of the mm owner? + vdpasim->mm_bound = mm_work->mm_to_bind; +} + +static void vdpasim_worker_change_mm_sync(struct vdpasim *vdpasim, + struct vdpasim_mm_work *mm_work) +{ + struct kthread_work *work = &mm_work->work; + + kthread_init_work(work, vdpasim_mm_work_fn); + kthread_queue_work(vdpasim->worker, work); + + kthread_flush_work(work); +} + +static struct vdpasim *vdpa_to_sim(struct vdpa_device *vdpa) +{ + return container_of(vdpa, struct vdpasim, vdpa); +} + +static void vdpasim_vq_notify(struct vringh *vring) +{ + struct vdpasim_virtqueue *vq = + container_of(vring, struct vdpasim_virtqueue, vring); + + if (!vq->cb) + return; + + vq->cb(vq->private); +} + +static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx) +{ + struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; + uint16_t last_avail_idx = vq->vring.last_avail_idx; + struct vring_desc *desc = (struct vring_desc *) + (uintptr_t)vq->desc_addr; + struct vring_avail *avail = (struct vring_avail *) + (uintptr_t)vq->driver_addr; + struct vring_used *used = (struct vring_used *) + (uintptr_t)vq->device_addr; + + if (use_va && vdpasim->mm_bound) { + vringh_init_iotlb_va(&vq->vring, vdpasim->features, vq->num, + true, desc, avail, used); + } else { + vringh_init_iotlb(&vq->vring, vdpasim->features, vq->num, + true, desc, avail, used); + } + + vq->vring.last_avail_idx = last_avail_idx; + + /* + * Since vdpa_sim does not support receive inflight descriptors as a + * destination of a migration, let's set both avail_idx and used_idx + * the same at vq start. This is how vhost-user works in a + * VHOST_SET_VRING_BASE call. + * + * Although the simple fix is to set last_used_idx at + * vdpasim_set_vq_state, it would be reset at vdpasim_queue_ready. + */ + vq->vring.last_used_idx = last_avail_idx; + vq->vring.notify = vdpasim_vq_notify; +} + +static void vdpasim_vq_reset(struct vdpasim *vdpasim, + struct vdpasim_virtqueue *vq) +{ + vq->ready = false; + vq->desc_addr = 0; + vq->driver_addr = 0; + vq->device_addr = 0; + vq->cb = NULL; + vq->private = NULL; + vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features, + VDPASIM_QUEUE_MAX, false, NULL, NULL, NULL); + + vq->vring.notify = NULL; +} + +static void vdpasim_do_reset(struct vdpasim *vdpasim) +{ + int i; + + spin_lock(&vdpasim->iommu_lock); + + for (i = 0; i < vdpasim->dev_attr.nvqs; i++) { + vdpasim_vq_reset(vdpasim, &vdpasim->vqs[i]); + vringh_set_iotlb(&vdpasim->vqs[i].vring, &vdpasim->iommu[0], + &vdpasim->iommu_lock); + } + + for (i = 0; i < vdpasim->dev_attr.nas; i++) { + vhost_iotlb_reset(&vdpasim->iommu[i]); + vhost_iotlb_add_range(&vdpasim->iommu[i], 0, ULONG_MAX, + 0, VHOST_MAP_RW); + vdpasim->iommu_pt[i] = true; + } + + vdpasim->running = true; + spin_unlock(&vdpasim->iommu_lock); + + vdpasim->features = 0; + vdpasim->status = 0; + ++vdpasim->generation; +} + +static const struct vdpa_config_ops vdpasim_config_ops; +static const struct vdpa_config_ops vdpasim_batch_config_ops; + +static void vdpasim_work_fn(struct kthread_work *work) +{ + struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); + struct mm_struct *mm = vdpasim->mm_bound; + + if (use_va && mm) { + if (!mmget_not_zero(mm)) + return; + kthread_use_mm(mm); + } + + vdpasim->dev_attr.work_fn(vdpasim); + + if (use_va && mm) { + kthread_unuse_mm(mm); + mmput(mm); + } +} + +struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, + const struct vdpa_dev_set_config *config) +{ + const struct vdpa_config_ops *ops; + struct vdpa_device *vdpa; + struct vdpasim *vdpasim; + struct device *dev; + int i, ret = -ENOMEM; + + if (!dev_attr->alloc_size) + return ERR_PTR(-EINVAL); + + if (config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) { + if (config->device_features & + ~dev_attr->supported_features) + return ERR_PTR(-EINVAL); + dev_attr->supported_features = + config->device_features; + } + + if (batch_mapping) + ops = &vdpasim_batch_config_ops; + else + ops = &vdpasim_config_ops; + + vdpa = __vdpa_alloc_device(NULL, ops, + dev_attr->ngroups, dev_attr->nas, + dev_attr->alloc_size, + dev_attr->name, use_va); + if (IS_ERR(vdpa)) { + ret = PTR_ERR(vdpa); + goto err_alloc; + } + + vdpasim = vdpa_to_sim(vdpa); + vdpasim->dev_attr = *dev_attr; + dev = &vdpasim->vdpa.dev; + + kthread_init_work(&vdpasim->work, vdpasim_work_fn); + vdpasim->worker = kthread_create_worker(0, "vDPA sim worker: %s", + dev_attr->name); + if (IS_ERR(vdpasim->worker)) + goto err_iommu; + + mutex_init(&vdpasim->mutex); + spin_lock_init(&vdpasim->iommu_lock); + + dev->dma_mask = &dev->coherent_dma_mask; + if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64))) + goto err_iommu; + vdpasim->vdpa.mdev = dev_attr->mgmt_dev; + + vdpasim->config = kzalloc(dev_attr->config_size, GFP_KERNEL); + if (!vdpasim->config) + goto err_iommu; + + vdpasim->vqs = kcalloc(dev_attr->nvqs, sizeof(struct vdpasim_virtqueue), + GFP_KERNEL); + if (!vdpasim->vqs) + goto err_iommu; + + vdpasim->iommu = kmalloc_array(vdpasim->dev_attr.nas, + sizeof(*vdpasim->iommu), GFP_KERNEL); + if (!vdpasim->iommu) + goto err_iommu; + + vdpasim->iommu_pt = kmalloc_array(vdpasim->dev_attr.nas, + sizeof(*vdpasim->iommu_pt), GFP_KERNEL); + if (!vdpasim->iommu_pt) + goto err_iommu; + + for (i = 0; i < vdpasim->dev_attr.nas; i++) + vhost_iotlb_init(&vdpasim->iommu[i], max_iotlb_entries, 0); + + for (i = 0; i < dev_attr->nvqs; i++) + vringh_set_iotlb(&vdpasim->vqs[i].vring, &vdpasim->iommu[0], + &vdpasim->iommu_lock); + + vdpasim->vdpa.dma_dev = dev; + + return vdpasim; + +err_iommu: + put_device(dev); +err_alloc: + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(vdpasim_create); + +void vdpasim_schedule_work(struct vdpasim *vdpasim) +{ + kthread_queue_work(vdpasim->worker, &vdpasim->work); +} +EXPORT_SYMBOL_GPL(vdpasim_schedule_work); + +static int vdpasim_set_vq_address(struct vdpa_device *vdpa, u16 idx, + u64 desc_area, u64 driver_area, + u64 device_area) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; + + vq->desc_addr = desc_area; + vq->driver_addr = driver_area; + vq->device_addr = device_area; + + return 0; +} + +static void vdpasim_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; + + vq->num = num; +} + +static void vdpasim_kick_vq(struct vdpa_device *vdpa, u16 idx) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; + + if (!vdpasim->running && + (vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) { + vdpasim->pending_kick = true; + return; + } + + if (vq->ready) + vdpasim_schedule_work(vdpasim); +} + +static void vdpasim_set_vq_cb(struct vdpa_device *vdpa, u16 idx, + struct vdpa_callback *cb) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; + + vq->cb = cb->callback; + vq->private = cb->private; +} + +static void vdpasim_set_vq_ready(struct vdpa_device *vdpa, u16 idx, bool ready) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; + bool old_ready; + + mutex_lock(&vdpasim->mutex); + old_ready = vq->ready; + vq->ready = ready; + if (vq->ready && !old_ready) { + vdpasim_queue_ready(vdpasim, idx); + } + mutex_unlock(&vdpasim->mutex); +} + +static bool vdpasim_get_vq_ready(struct vdpa_device *vdpa, u16 idx) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; + + return vq->ready; +} + +static int vdpasim_set_vq_state(struct vdpa_device *vdpa, u16 idx, + const struct vdpa_vq_state *state) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; + struct vringh *vrh = &vq->vring; + + mutex_lock(&vdpasim->mutex); + vrh->last_avail_idx = state->split.avail_index; + mutex_unlock(&vdpasim->mutex); + + return 0; +} + +static int vdpasim_get_vq_state(struct vdpa_device *vdpa, u16 idx, + struct vdpa_vq_state *state) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; + struct vringh *vrh = &vq->vring; + + state->split.avail_index = vrh->last_avail_idx; + return 0; +} + +static int vdpasim_get_vq_stats(struct vdpa_device *vdpa, u16 idx, + struct sk_buff *msg, + struct netlink_ext_ack *extack) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + if (vdpasim->dev_attr.get_stats) + return vdpasim->dev_attr.get_stats(vdpasim, idx, + msg, extack); + return -EOPNOTSUPP; +} + +static u32 vdpasim_get_vq_align(struct vdpa_device *vdpa) +{ + return VDPASIM_QUEUE_ALIGN; +} + +static u32 vdpasim_get_vq_group(struct vdpa_device *vdpa, u16 idx) +{ + /* RX and TX belongs to group 0, CVQ belongs to group 1 */ + if (idx == 2) + return 1; + else + return 0; +} + +static u64 vdpasim_get_device_features(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + return vdpasim->dev_attr.supported_features; +} + +static u64 vdpasim_get_backend_features(const struct vdpa_device *vdpa) +{ + return BIT_ULL(VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK); +} + +static int vdpasim_set_driver_features(struct vdpa_device *vdpa, u64 features) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + /* DMA mapping must be done by driver */ + if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) + return -EINVAL; + + vdpasim->features = features & vdpasim->dev_attr.supported_features; + + return 0; +} + +static u64 vdpasim_get_driver_features(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + return vdpasim->features; +} + +static void vdpasim_set_config_cb(struct vdpa_device *vdpa, + struct vdpa_callback *cb) +{ + /* We don't support config interrupt */ +} + +static u16 vdpasim_get_vq_num_max(struct vdpa_device *vdpa) +{ + return VDPASIM_QUEUE_MAX; +} + +static u32 vdpasim_get_device_id(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + return vdpasim->dev_attr.id; +} + +static u32 vdpasim_get_vendor_id(struct vdpa_device *vdpa) +{ + return VDPASIM_VENDOR_ID; +} + +static u8 vdpasim_get_status(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + u8 status; + + mutex_lock(&vdpasim->mutex); + status = vdpasim->status; + mutex_unlock(&vdpasim->mutex); + + return status; +} + +static void vdpasim_set_status(struct vdpa_device *vdpa, u8 status) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + mutex_lock(&vdpasim->mutex); + vdpasim->status = status; + mutex_unlock(&vdpasim->mutex); +} + +static int vdpasim_reset(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + mutex_lock(&vdpasim->mutex); + vdpasim->status = 0; + vdpasim_do_reset(vdpasim); + mutex_unlock(&vdpasim->mutex); + + return 0; +} + +static int vdpasim_suspend(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + mutex_lock(&vdpasim->mutex); + vdpasim->running = false; + mutex_unlock(&vdpasim->mutex); + + return 0; +} + +static int vdpasim_resume(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + int i; + + mutex_lock(&vdpasim->mutex); + vdpasim->running = true; + + if (vdpasim->pending_kick) { + /* Process pending descriptors */ + for (i = 0; i < vdpasim->dev_attr.nvqs; ++i) + vdpasim_kick_vq(vdpa, i); + + vdpasim->pending_kick = false; + } + + mutex_unlock(&vdpasim->mutex); + + return 0; +} + +static size_t vdpasim_get_config_size(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + return vdpasim->dev_attr.config_size; +} + +static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset, + void *buf, unsigned int len) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + if (offset + len > vdpasim->dev_attr.config_size) + return; + + if (vdpasim->dev_attr.get_config) + vdpasim->dev_attr.get_config(vdpasim, vdpasim->config); + + memcpy(buf, vdpasim->config + offset, len); +} + +static void vdpasim_set_config(struct vdpa_device *vdpa, unsigned int offset, + const void *buf, unsigned int len) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + if (offset + len > vdpasim->dev_attr.config_size) + return; + + memcpy(vdpasim->config + offset, buf, len); + + if (vdpasim->dev_attr.set_config) + vdpasim->dev_attr.set_config(vdpasim, vdpasim->config); +} + +static u32 vdpasim_get_generation(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + return vdpasim->generation; +} + +static struct vdpa_iova_range vdpasim_get_iova_range(struct vdpa_device *vdpa) +{ + struct vdpa_iova_range range = { + .first = 0ULL, + .last = ULLONG_MAX, + }; + + return range; +} + +static int vdpasim_set_group_asid(struct vdpa_device *vdpa, unsigned int group, + unsigned int asid) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + struct vhost_iotlb *iommu; + int i; + + if (group > vdpasim->dev_attr.ngroups) + return -EINVAL; + + if (asid >= vdpasim->dev_attr.nas) + return -EINVAL; + + iommu = &vdpasim->iommu[asid]; + + mutex_lock(&vdpasim->mutex); + + for (i = 0; i < vdpasim->dev_attr.nvqs; i++) + if (vdpasim_get_vq_group(vdpa, i) == group) + vringh_set_iotlb(&vdpasim->vqs[i].vring, iommu, + &vdpasim->iommu_lock); + + mutex_unlock(&vdpasim->mutex); + + return 0; +} + +static int vdpasim_set_map(struct vdpa_device *vdpa, unsigned int asid, + struct vhost_iotlb *iotlb) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + struct vhost_iotlb_map *map; + struct vhost_iotlb *iommu; + u64 start = 0ULL, last = 0ULL - 1; + int ret; + + if (asid >= vdpasim->dev_attr.nas) + return -EINVAL; + + spin_lock(&vdpasim->iommu_lock); + + iommu = &vdpasim->iommu[asid]; + vhost_iotlb_reset(iommu); + vdpasim->iommu_pt[asid] = false; + + for (map = vhost_iotlb_itree_first(iotlb, start, last); map; + map = vhost_iotlb_itree_next(map, start, last)) { + ret = vhost_iotlb_add_range(iommu, map->start, + map->last, map->addr, map->perm); + if (ret) + goto err; + } + spin_unlock(&vdpasim->iommu_lock); + return 0; + +err: + vhost_iotlb_reset(iommu); + spin_unlock(&vdpasim->iommu_lock); + return ret; +} + +static int vdpasim_bind_mm(struct vdpa_device *vdpa, struct mm_struct *mm) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + struct vdpasim_mm_work mm_work; + + mm_work.vdpasim = vdpasim; + mm_work.mm_to_bind = mm; + + vdpasim_worker_change_mm_sync(vdpasim, &mm_work); + + return mm_work.ret; +} + +static void vdpasim_unbind_mm(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + struct vdpasim_mm_work mm_work; + + mm_work.vdpasim = vdpasim; + mm_work.mm_to_bind = NULL; + + vdpasim_worker_change_mm_sync(vdpasim, &mm_work); +} + +static int vdpasim_dma_map(struct vdpa_device *vdpa, unsigned int asid, + u64 iova, u64 size, + u64 pa, u32 perm, void *opaque) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + int ret; + + if (asid >= vdpasim->dev_attr.nas) + return -EINVAL; + + spin_lock(&vdpasim->iommu_lock); + if (vdpasim->iommu_pt[asid]) { + vhost_iotlb_reset(&vdpasim->iommu[asid]); + vdpasim->iommu_pt[asid] = false; + } + ret = vhost_iotlb_add_range_ctx(&vdpasim->iommu[asid], iova, + iova + size - 1, pa, perm, opaque); + spin_unlock(&vdpasim->iommu_lock); + + return ret; +} + +static int vdpasim_dma_unmap(struct vdpa_device *vdpa, unsigned int asid, + u64 iova, u64 size) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + if (asid >= vdpasim->dev_attr.nas) + return -EINVAL; + + if (vdpasim->iommu_pt[asid]) { + vhost_iotlb_reset(&vdpasim->iommu[asid]); + vdpasim->iommu_pt[asid] = false; + } + + spin_lock(&vdpasim->iommu_lock); + vhost_iotlb_del_range(&vdpasim->iommu[asid], iova, iova + size - 1); + spin_unlock(&vdpasim->iommu_lock); + + return 0; +} + +static void vdpasim_free(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + int i; + + kthread_cancel_work_sync(&vdpasim->work); + kthread_destroy_worker(vdpasim->worker); + + for (i = 0; i < vdpasim->dev_attr.nvqs; i++) { + vringh_kiov_cleanup(&vdpasim->vqs[i].out_iov); + vringh_kiov_cleanup(&vdpasim->vqs[i].in_iov); + } + + vdpasim->dev_attr.free(vdpasim); + + for (i = 0; i < vdpasim->dev_attr.nas; i++) + vhost_iotlb_reset(&vdpasim->iommu[i]); + kfree(vdpasim->iommu); + kfree(vdpasim->iommu_pt); + kfree(vdpasim->vqs); + kfree(vdpasim->config); +} + +static const struct vdpa_config_ops vdpasim_config_ops = { + .set_vq_address = vdpasim_set_vq_address, + .set_vq_num = vdpasim_set_vq_num, + .kick_vq = vdpasim_kick_vq, + .set_vq_cb = vdpasim_set_vq_cb, + .set_vq_ready = vdpasim_set_vq_ready, + .get_vq_ready = vdpasim_get_vq_ready, + .set_vq_state = vdpasim_set_vq_state, + .get_vendor_vq_stats = vdpasim_get_vq_stats, + .get_vq_state = vdpasim_get_vq_state, + .get_vq_align = vdpasim_get_vq_align, + .get_vq_group = vdpasim_get_vq_group, + .get_device_features = vdpasim_get_device_features, + .get_backend_features = vdpasim_get_backend_features, + .set_driver_features = vdpasim_set_driver_features, + .get_driver_features = vdpasim_get_driver_features, + .set_config_cb = vdpasim_set_config_cb, + .get_vq_num_max = vdpasim_get_vq_num_max, + .get_device_id = vdpasim_get_device_id, + .get_vendor_id = vdpasim_get_vendor_id, + .get_status = vdpasim_get_status, + .set_status = vdpasim_set_status, + .reset = vdpasim_reset, + .suspend = vdpasim_suspend, + .resume = vdpasim_resume, + .get_config_size = vdpasim_get_config_size, + .get_config = vdpasim_get_config, + .set_config = vdpasim_set_config, + .get_generation = vdpasim_get_generation, + .get_iova_range = vdpasim_get_iova_range, + .set_group_asid = vdpasim_set_group_asid, + .dma_map = vdpasim_dma_map, + .dma_unmap = vdpasim_dma_unmap, + .bind_mm = vdpasim_bind_mm, + .unbind_mm = vdpasim_unbind_mm, + .free = vdpasim_free, +}; + +static const struct vdpa_config_ops vdpasim_batch_config_ops = { + .set_vq_address = vdpasim_set_vq_address, + .set_vq_num = vdpasim_set_vq_num, + .kick_vq = vdpasim_kick_vq, + .set_vq_cb = vdpasim_set_vq_cb, + .set_vq_ready = vdpasim_set_vq_ready, + .get_vq_ready = vdpasim_get_vq_ready, + .set_vq_state = vdpasim_set_vq_state, + .get_vendor_vq_stats = vdpasim_get_vq_stats, + .get_vq_state = vdpasim_get_vq_state, + .get_vq_align = vdpasim_get_vq_align, + .get_vq_group = vdpasim_get_vq_group, + .get_device_features = vdpasim_get_device_features, + .get_backend_features = vdpasim_get_backend_features, + .set_driver_features = vdpasim_set_driver_features, + .get_driver_features = vdpasim_get_driver_features, + .set_config_cb = vdpasim_set_config_cb, + .get_vq_num_max = vdpasim_get_vq_num_max, + .get_device_id = vdpasim_get_device_id, + .get_vendor_id = vdpasim_get_vendor_id, + .get_status = vdpasim_get_status, + .set_status = vdpasim_set_status, + .reset = vdpasim_reset, + .suspend = vdpasim_suspend, + .resume = vdpasim_resume, + .get_config_size = vdpasim_get_config_size, + .get_config = vdpasim_get_config, + .set_config = vdpasim_set_config, + .get_generation = vdpasim_get_generation, + .get_iova_range = vdpasim_get_iova_range, + .set_group_asid = vdpasim_set_group_asid, + .set_map = vdpasim_set_map, + .bind_mm = vdpasim_bind_mm, + .unbind_mm = vdpasim_unbind_mm, + .free = vdpasim_free, +}; + +MODULE_VERSION(DRV_VERSION); +MODULE_LICENSE(DRV_LICENSE); +MODULE_AUTHOR(DRV_AUTHOR); +MODULE_DESCRIPTION(DRV_DESC); diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h new file mode 100644 index 0000000000..bb137e4797 --- /dev/null +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020, Red Hat Inc. All rights reserved. + */ + +#ifndef _VDPA_SIM_H +#define _VDPA_SIM_H + +#include <linux/iova.h> +#include <linux/vringh.h> +#include <linux/vdpa.h> +#include <linux/virtio_byteorder.h> +#include <linux/vhost_iotlb.h> +#include <uapi/linux/virtio_config.h> + +#define VDPASIM_FEATURES ((1ULL << VIRTIO_F_ANY_LAYOUT) | \ + (1ULL << VIRTIO_F_VERSION_1) | \ + (1ULL << VIRTIO_F_ACCESS_PLATFORM)) + +struct vdpasim; + +struct vdpasim_virtqueue { + struct vringh vring; + struct vringh_kiov in_iov; + struct vringh_kiov out_iov; + unsigned short head; + bool ready; + u64 desc_addr; + u64 device_addr; + u64 driver_addr; + u32 num; + void *private; + irqreturn_t (*cb)(void *data); +}; + +struct vdpasim_dev_attr { + struct vdpa_mgmt_dev *mgmt_dev; + const char *name; + u64 supported_features; + size_t alloc_size; + size_t config_size; + int nvqs; + u32 id; + u32 ngroups; + u32 nas; + + void (*work_fn)(struct vdpasim *vdpasim); + void (*get_config)(struct vdpasim *vdpasim, void *config); + void (*set_config)(struct vdpasim *vdpasim, const void *config); + int (*get_stats)(struct vdpasim *vdpasim, u16 idx, + struct sk_buff *msg, + struct netlink_ext_ack *extack); + void (*free)(struct vdpasim *vdpasim); +}; + +/* State of each vdpasim device */ +struct vdpasim { + struct vdpa_device vdpa; + struct vdpasim_virtqueue *vqs; + struct kthread_worker *worker; + struct kthread_work work; + struct mm_struct *mm_bound; + struct vdpasim_dev_attr dev_attr; + /* mutex to synchronize virtqueue state */ + struct mutex mutex; + /* virtio config according to device type */ + void *config; + struct vhost_iotlb *iommu; + bool *iommu_pt; + u32 status; + u32 generation; + u64 features; + u32 groups; + bool running; + bool pending_kick; + /* spinlock to synchronize iommu table */ + spinlock_t iommu_lock; +}; + +struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *attr, + const struct vdpa_dev_set_config *config); +void vdpasim_schedule_work(struct vdpasim *vdpasim); + +/* TODO: cross-endian support */ +static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim) +{ + return virtio_legacy_is_little_endian() || + (vdpasim->features & (1ULL << VIRTIO_F_VERSION_1)); +} + +static inline u16 vdpasim16_to_cpu(struct vdpasim *vdpasim, __virtio16 val) +{ + return __virtio16_to_cpu(vdpasim_is_little_endian(vdpasim), val); +} + +static inline __virtio16 cpu_to_vdpasim16(struct vdpasim *vdpasim, u16 val) +{ + return __cpu_to_virtio16(vdpasim_is_little_endian(vdpasim), val); +} + +static inline u32 vdpasim32_to_cpu(struct vdpasim *vdpasim, __virtio32 val) +{ + return __virtio32_to_cpu(vdpasim_is_little_endian(vdpasim), val); +} + +static inline __virtio32 cpu_to_vdpasim32(struct vdpasim *vdpasim, u32 val) +{ + return __cpu_to_virtio32(vdpasim_is_little_endian(vdpasim), val); +} + +static inline u64 vdpasim64_to_cpu(struct vdpasim *vdpasim, __virtio64 val) +{ + return __virtio64_to_cpu(vdpasim_is_little_endian(vdpasim), val); +} + +static inline __virtio64 cpu_to_vdpasim64(struct vdpasim *vdpasim, u64 val) +{ + return __cpu_to_virtio64(vdpasim_is_little_endian(vdpasim), val); +} + +#endif diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c new file mode 100644 index 0000000000..b137f36793 --- /dev/null +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -0,0 +1,527 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VDPA simulator for block device. + * + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021, Red Hat Inc. All rights reserved. + * + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/blkdev.h> +#include <linux/vringh.h> +#include <linux/vdpa.h> +#include <uapi/linux/virtio_blk.h> + +#include "vdpa_sim.h" + +#define DRV_VERSION "0.1" +#define DRV_AUTHOR "Max Gurtovoy <mgurtovoy@nvidia.com>" +#define DRV_DESC "vDPA Device Simulator for block device" +#define DRV_LICENSE "GPL v2" + +#define VDPASIM_BLK_FEATURES (VDPASIM_FEATURES | \ + (1ULL << VIRTIO_BLK_F_FLUSH) | \ + (1ULL << VIRTIO_BLK_F_SIZE_MAX) | \ + (1ULL << VIRTIO_BLK_F_SEG_MAX) | \ + (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \ + (1ULL << VIRTIO_BLK_F_TOPOLOGY) | \ + (1ULL << VIRTIO_BLK_F_MQ) | \ + (1ULL << VIRTIO_BLK_F_DISCARD) | \ + (1ULL << VIRTIO_BLK_F_WRITE_ZEROES)) + +#define VDPASIM_BLK_CAPACITY 0x40000 +#define VDPASIM_BLK_SIZE_MAX 0x1000 +#define VDPASIM_BLK_SEG_MAX 32 +#define VDPASIM_BLK_DWZ_MAX_SECTORS UINT_MAX + +/* 1 virtqueue, 1 address space, 1 virtqueue group */ +#define VDPASIM_BLK_VQ_NUM 1 +#define VDPASIM_BLK_AS_NUM 1 +#define VDPASIM_BLK_GROUP_NUM 1 + +struct vdpasim_blk { + struct vdpasim vdpasim; + void *buffer; + bool shared_backend; +}; + +static struct vdpasim_blk *sim_to_blk(struct vdpasim *vdpasim) +{ + return container_of(vdpasim, struct vdpasim_blk, vdpasim); +} + +static char vdpasim_blk_id[VIRTIO_BLK_ID_BYTES] = "vdpa_blk_sim"; + +static bool shared_backend; +module_param(shared_backend, bool, 0444); +MODULE_PARM_DESC(shared_backend, "Enable the shared backend between virtio-blk devices"); + +static void *shared_buffer; +/* mutex to synchronize shared_buffer access */ +static DEFINE_MUTEX(shared_buffer_mutex); + +static void vdpasim_blk_buffer_lock(struct vdpasim_blk *blk) +{ + if (blk->shared_backend) + mutex_lock(&shared_buffer_mutex); +} + +static void vdpasim_blk_buffer_unlock(struct vdpasim_blk *blk) +{ + if (blk->shared_backend) + mutex_unlock(&shared_buffer_mutex); +} + +static bool vdpasim_blk_check_range(struct vdpasim *vdpasim, u64 start_sector, + u64 num_sectors, u64 max_sectors) +{ + if (start_sector > VDPASIM_BLK_CAPACITY) { + dev_dbg(&vdpasim->vdpa.dev, + "starting sector exceeds the capacity - start: 0x%llx capacity: 0x%x\n", + start_sector, VDPASIM_BLK_CAPACITY); + } + + if (num_sectors > max_sectors) { + dev_dbg(&vdpasim->vdpa.dev, + "number of sectors exceeds the max allowed in a request - num: 0x%llx max: 0x%llx\n", + num_sectors, max_sectors); + return false; + } + + if (num_sectors > VDPASIM_BLK_CAPACITY - start_sector) { + dev_dbg(&vdpasim->vdpa.dev, + "request exceeds the capacity - start: 0x%llx num: 0x%llx capacity: 0x%x\n", + start_sector, num_sectors, VDPASIM_BLK_CAPACITY); + return false; + } + + return true; +} + +/* Returns 'true' if the request is handled (with or without an I/O error) + * and the status is correctly written in the last byte of the 'in iov', + * 'false' otherwise. + */ +static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, + struct vdpasim_virtqueue *vq) +{ + struct vdpasim_blk *blk = sim_to_blk(vdpasim); + size_t pushed = 0, to_pull, to_push; + struct virtio_blk_outhdr hdr; + bool handled = false; + ssize_t bytes; + loff_t offset; + u64 sector; + u8 status; + u32 type; + int ret; + + ret = vringh_getdesc_iotlb(&vq->vring, &vq->out_iov, &vq->in_iov, + &vq->head, GFP_ATOMIC); + if (ret != 1) + return false; + + if (vq->out_iov.used < 1 || vq->in_iov.used < 1) { + dev_dbg(&vdpasim->vdpa.dev, "missing headers - out_iov: %u in_iov %u\n", + vq->out_iov.used, vq->in_iov.used); + goto err; + } + + if (vq->in_iov.iov[vq->in_iov.used - 1].iov_len < 1) { + dev_dbg(&vdpasim->vdpa.dev, "request in header too short\n"); + goto err; + } + + /* The last byte is the status and we checked if the last iov has + * enough room for it. + */ + to_push = vringh_kiov_length(&vq->in_iov) - 1; + + to_pull = vringh_kiov_length(&vq->out_iov); + + bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->out_iov, &hdr, + sizeof(hdr)); + if (bytes != sizeof(hdr)) { + dev_dbg(&vdpasim->vdpa.dev, "request out header too short\n"); + goto err; + } + + to_pull -= bytes; + + type = vdpasim32_to_cpu(vdpasim, hdr.type); + sector = vdpasim64_to_cpu(vdpasim, hdr.sector); + offset = sector << SECTOR_SHIFT; + status = VIRTIO_BLK_S_OK; + + if (type != VIRTIO_BLK_T_IN && type != VIRTIO_BLK_T_OUT && + sector != 0) { + dev_dbg(&vdpasim->vdpa.dev, + "sector must be 0 for %u request - sector: 0x%llx\n", + type, sector); + status = VIRTIO_BLK_S_IOERR; + goto err_status; + } + + switch (type) { + case VIRTIO_BLK_T_IN: + if (!vdpasim_blk_check_range(vdpasim, sector, + to_push >> SECTOR_SHIFT, + VDPASIM_BLK_SIZE_MAX * VDPASIM_BLK_SEG_MAX)) { + status = VIRTIO_BLK_S_IOERR; + break; + } + + vdpasim_blk_buffer_lock(blk); + bytes = vringh_iov_push_iotlb(&vq->vring, &vq->in_iov, + blk->buffer + offset, to_push); + vdpasim_blk_buffer_unlock(blk); + if (bytes < 0) { + dev_dbg(&vdpasim->vdpa.dev, + "vringh_iov_push_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n", + bytes, offset, to_push); + status = VIRTIO_BLK_S_IOERR; + break; + } + + pushed += bytes; + break; + + case VIRTIO_BLK_T_OUT: + if (!vdpasim_blk_check_range(vdpasim, sector, + to_pull >> SECTOR_SHIFT, + VDPASIM_BLK_SIZE_MAX * VDPASIM_BLK_SEG_MAX)) { + status = VIRTIO_BLK_S_IOERR; + break; + } + + vdpasim_blk_buffer_lock(blk); + bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->out_iov, + blk->buffer + offset, to_pull); + vdpasim_blk_buffer_unlock(blk); + if (bytes < 0) { + dev_dbg(&vdpasim->vdpa.dev, + "vringh_iov_pull_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n", + bytes, offset, to_pull); + status = VIRTIO_BLK_S_IOERR; + break; + } + break; + + case VIRTIO_BLK_T_GET_ID: + bytes = vringh_iov_push_iotlb(&vq->vring, &vq->in_iov, + vdpasim_blk_id, + VIRTIO_BLK_ID_BYTES); + if (bytes < 0) { + dev_dbg(&vdpasim->vdpa.dev, + "vringh_iov_push_iotlb() error: %zd\n", bytes); + status = VIRTIO_BLK_S_IOERR; + break; + } + + pushed += bytes; + break; + + case VIRTIO_BLK_T_FLUSH: + /* nothing to do */ + break; + + case VIRTIO_BLK_T_DISCARD: + case VIRTIO_BLK_T_WRITE_ZEROES: { + struct virtio_blk_discard_write_zeroes range; + u32 num_sectors, flags; + + if (to_pull != sizeof(range)) { + dev_dbg(&vdpasim->vdpa.dev, + "discard/write_zeroes header len: 0x%zx [expected: 0x%zx]\n", + to_pull, sizeof(range)); + status = VIRTIO_BLK_S_IOERR; + break; + } + + bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->out_iov, &range, + to_pull); + if (bytes < 0) { + dev_dbg(&vdpasim->vdpa.dev, + "vringh_iov_pull_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n", + bytes, offset, to_pull); + status = VIRTIO_BLK_S_IOERR; + break; + } + + sector = le64_to_cpu(range.sector); + offset = sector << SECTOR_SHIFT; + num_sectors = le32_to_cpu(range.num_sectors); + flags = le32_to_cpu(range.flags); + + if (type == VIRTIO_BLK_T_DISCARD && flags != 0) { + dev_dbg(&vdpasim->vdpa.dev, + "discard unexpected flags set - flags: 0x%x\n", + flags); + status = VIRTIO_BLK_S_UNSUPP; + break; + } + + if (type == VIRTIO_BLK_T_WRITE_ZEROES && + flags & ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) { + dev_dbg(&vdpasim->vdpa.dev, + "write_zeroes unexpected flags set - flags: 0x%x\n", + flags); + status = VIRTIO_BLK_S_UNSUPP; + break; + } + + if (!vdpasim_blk_check_range(vdpasim, sector, num_sectors, + VDPASIM_BLK_DWZ_MAX_SECTORS)) { + status = VIRTIO_BLK_S_IOERR; + break; + } + + if (type == VIRTIO_BLK_T_WRITE_ZEROES) { + vdpasim_blk_buffer_lock(blk); + memset(blk->buffer + offset, 0, + num_sectors << SECTOR_SHIFT); + vdpasim_blk_buffer_unlock(blk); + } + + break; + } + default: + dev_dbg(&vdpasim->vdpa.dev, + "Unsupported request type %d\n", type); + status = VIRTIO_BLK_S_IOERR; + break; + } + +err_status: + /* If some operations fail, we need to skip the remaining bytes + * to put the status in the last byte + */ + if (to_push - pushed > 0) + vringh_kiov_advance(&vq->in_iov, to_push - pushed); + + /* Last byte is the status */ + bytes = vringh_iov_push_iotlb(&vq->vring, &vq->in_iov, &status, 1); + if (bytes != 1) + goto err; + + pushed += bytes; + + /* Make sure data is wrote before advancing index */ + smp_wmb(); + + handled = true; + +err: + vringh_complete_iotlb(&vq->vring, vq->head, pushed); + + return handled; +} + +static void vdpasim_blk_work(struct vdpasim *vdpasim) +{ + bool reschedule = false; + int i; + + mutex_lock(&vdpasim->mutex); + + if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) + goto out; + + if (!vdpasim->running) + goto out; + + for (i = 0; i < VDPASIM_BLK_VQ_NUM; i++) { + struct vdpasim_virtqueue *vq = &vdpasim->vqs[i]; + int reqs = 0; + + if (!vq->ready) + continue; + + while (vdpasim_blk_handle_req(vdpasim, vq)) { + /* Make sure used is visible before rasing the interrupt. */ + smp_wmb(); + + local_bh_disable(); + if (vringh_need_notify_iotlb(&vq->vring) > 0) + vringh_notify(&vq->vring); + local_bh_enable(); + + if (++reqs > 4) { + reschedule = true; + break; + } + } + } +out: + mutex_unlock(&vdpasim->mutex); + + if (reschedule) + vdpasim_schedule_work(vdpasim); +} + +static void vdpasim_blk_get_config(struct vdpasim *vdpasim, void *config) +{ + struct virtio_blk_config *blk_config = config; + + memset(config, 0, sizeof(struct virtio_blk_config)); + + blk_config->capacity = cpu_to_vdpasim64(vdpasim, VDPASIM_BLK_CAPACITY); + blk_config->size_max = cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_SIZE_MAX); + blk_config->seg_max = cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_SEG_MAX); + blk_config->num_queues = cpu_to_vdpasim16(vdpasim, VDPASIM_BLK_VQ_NUM); + blk_config->min_io_size = cpu_to_vdpasim16(vdpasim, 1); + blk_config->opt_io_size = cpu_to_vdpasim32(vdpasim, 1); + blk_config->blk_size = cpu_to_vdpasim32(vdpasim, SECTOR_SIZE); + /* VIRTIO_BLK_F_DISCARD */ + blk_config->discard_sector_alignment = + cpu_to_vdpasim32(vdpasim, SECTOR_SIZE); + blk_config->max_discard_sectors = + cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_DWZ_MAX_SECTORS); + blk_config->max_discard_seg = cpu_to_vdpasim32(vdpasim, 1); + /* VIRTIO_BLK_F_WRITE_ZEROES */ + blk_config->max_write_zeroes_sectors = + cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_DWZ_MAX_SECTORS); + blk_config->max_write_zeroes_seg = cpu_to_vdpasim32(vdpasim, 1); + +} + +static void vdpasim_blk_free(struct vdpasim *vdpasim) +{ + struct vdpasim_blk *blk = sim_to_blk(vdpasim); + + if (!blk->shared_backend) + kvfree(blk->buffer); +} + +static void vdpasim_blk_mgmtdev_release(struct device *dev) +{ +} + +static struct device vdpasim_blk_mgmtdev = { + .init_name = "vdpasim_blk", + .release = vdpasim_blk_mgmtdev_release, +}; + +static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, + const struct vdpa_dev_set_config *config) +{ + struct vdpasim_dev_attr dev_attr = {}; + struct vdpasim_blk *blk; + struct vdpasim *simdev; + int ret; + + dev_attr.mgmt_dev = mdev; + dev_attr.name = name; + dev_attr.id = VIRTIO_ID_BLOCK; + dev_attr.supported_features = VDPASIM_BLK_FEATURES; + dev_attr.nvqs = VDPASIM_BLK_VQ_NUM; + dev_attr.ngroups = VDPASIM_BLK_GROUP_NUM; + dev_attr.nas = VDPASIM_BLK_AS_NUM; + dev_attr.alloc_size = sizeof(struct vdpasim_blk); + dev_attr.config_size = sizeof(struct virtio_blk_config); + dev_attr.get_config = vdpasim_blk_get_config; + dev_attr.work_fn = vdpasim_blk_work; + dev_attr.free = vdpasim_blk_free; + + simdev = vdpasim_create(&dev_attr, config); + if (IS_ERR(simdev)) + return PTR_ERR(simdev); + + blk = sim_to_blk(simdev); + blk->shared_backend = shared_backend; + + if (blk->shared_backend) { + blk->buffer = shared_buffer; + } else { + blk->buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, + GFP_KERNEL); + if (!blk->buffer) { + ret = -ENOMEM; + goto put_dev; + } + } + + ret = _vdpa_register_device(&simdev->vdpa, VDPASIM_BLK_VQ_NUM); + if (ret) + goto put_dev; + + return 0; + +put_dev: + put_device(&simdev->vdpa.dev); + return ret; +} + +static void vdpasim_blk_dev_del(struct vdpa_mgmt_dev *mdev, + struct vdpa_device *dev) +{ + struct vdpasim *simdev = container_of(dev, struct vdpasim, vdpa); + + _vdpa_unregister_device(&simdev->vdpa); +} + +static const struct vdpa_mgmtdev_ops vdpasim_blk_mgmtdev_ops = { + .dev_add = vdpasim_blk_dev_add, + .dev_del = vdpasim_blk_dev_del +}; + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static struct vdpa_mgmt_dev mgmt_dev = { + .device = &vdpasim_blk_mgmtdev, + .id_table = id_table, + .ops = &vdpasim_blk_mgmtdev_ops, +}; + +static int __init vdpasim_blk_init(void) +{ + int ret; + + ret = device_register(&vdpasim_blk_mgmtdev); + if (ret) { + put_device(&vdpasim_blk_mgmtdev); + return ret; + } + + ret = vdpa_mgmtdev_register(&mgmt_dev); + if (ret) + goto parent_err; + + if (shared_backend) { + shared_buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, + GFP_KERNEL); + if (!shared_buffer) { + ret = -ENOMEM; + goto mgmt_dev_err; + } + } + + return 0; +mgmt_dev_err: + vdpa_mgmtdev_unregister(&mgmt_dev); +parent_err: + device_unregister(&vdpasim_blk_mgmtdev); + return ret; +} + +static void __exit vdpasim_blk_exit(void) +{ + kvfree(shared_buffer); + vdpa_mgmtdev_unregister(&mgmt_dev); + device_unregister(&vdpasim_blk_mgmtdev); +} + +module_init(vdpasim_blk_init) +module_exit(vdpasim_blk_exit) + +MODULE_VERSION(DRV_VERSION); +MODULE_LICENSE(DRV_LICENSE); +MODULE_AUTHOR(DRV_AUTHOR); +MODULE_DESCRIPTION(DRV_DESC); diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c new file mode 100644 index 0000000000..cfe9629118 --- /dev/null +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -0,0 +1,564 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VDPA simulator for networking device. + * + * Copyright (c) 2020, Red Hat Inc. All rights reserved. + * Author: Jason Wang <jasowang@redhat.com> + * + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/etherdevice.h> +#include <linux/vringh.h> +#include <linux/vdpa.h> +#include <net/netlink.h> +#include <uapi/linux/virtio_net.h> +#include <uapi/linux/vdpa.h> + +#include "vdpa_sim.h" + +#define DRV_VERSION "0.1" +#define DRV_AUTHOR "Jason Wang <jasowang@redhat.com>" +#define DRV_DESC "vDPA Device Simulator for networking device" +#define DRV_LICENSE "GPL v2" + +#define VDPASIM_NET_FEATURES (VDPASIM_FEATURES | \ + (1ULL << VIRTIO_NET_F_MAC) | \ + (1ULL << VIRTIO_NET_F_STATUS) | \ + (1ULL << VIRTIO_NET_F_MTU) | \ + (1ULL << VIRTIO_NET_F_CTRL_VQ) | \ + (1ULL << VIRTIO_NET_F_CTRL_MAC_ADDR)) + +/* 3 virtqueues, 2 address spaces, 2 virtqueue groups */ +#define VDPASIM_NET_VQ_NUM 3 +#define VDPASIM_NET_AS_NUM 2 +#define VDPASIM_NET_GROUP_NUM 2 + +struct vdpasim_dataq_stats { + struct u64_stats_sync syncp; + u64 pkts; + u64 bytes; + u64 drops; + u64 errors; + u64 overruns; +}; + +struct vdpasim_cq_stats { + struct u64_stats_sync syncp; + u64 requests; + u64 successes; + u64 errors; +}; + +struct vdpasim_net{ + struct vdpasim vdpasim; + struct vdpasim_dataq_stats tx_stats; + struct vdpasim_dataq_stats rx_stats; + struct vdpasim_cq_stats cq_stats; + void *buffer; +}; + +static struct vdpasim_net *sim_to_net(struct vdpasim *vdpasim) +{ + return container_of(vdpasim, struct vdpasim_net, vdpasim); +} + +static void vdpasim_net_complete(struct vdpasim_virtqueue *vq, size_t len) +{ + /* Make sure data is wrote before advancing index */ + smp_wmb(); + + vringh_complete_iotlb(&vq->vring, vq->head, len); + + /* Make sure used is visible before rasing the interrupt. */ + smp_wmb(); + + local_bh_disable(); + if (vringh_need_notify_iotlb(&vq->vring) > 0) + vringh_notify(&vq->vring); + local_bh_enable(); +} + +static bool receive_filter(struct vdpasim *vdpasim, size_t len) +{ + bool modern = vdpasim->features & (1ULL << VIRTIO_F_VERSION_1); + size_t hdr_len = modern ? sizeof(struct virtio_net_hdr_v1) : + sizeof(struct virtio_net_hdr); + struct virtio_net_config *vio_config = vdpasim->config; + struct vdpasim_net *net = sim_to_net(vdpasim); + + if (len < ETH_ALEN + hdr_len) + return false; + + if (is_broadcast_ether_addr(net->buffer + hdr_len) || + is_multicast_ether_addr(net->buffer + hdr_len)) + return true; + if (!strncmp(net->buffer + hdr_len, vio_config->mac, ETH_ALEN)) + return true; + + return false; +} + +static virtio_net_ctrl_ack vdpasim_handle_ctrl_mac(struct vdpasim *vdpasim, + u8 cmd) +{ + struct virtio_net_config *vio_config = vdpasim->config; + struct vdpasim_virtqueue *cvq = &vdpasim->vqs[2]; + virtio_net_ctrl_ack status = VIRTIO_NET_ERR; + size_t read; + + switch (cmd) { + case VIRTIO_NET_CTRL_MAC_ADDR_SET: + read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->in_iov, + vio_config->mac, ETH_ALEN); + if (read == ETH_ALEN) + status = VIRTIO_NET_OK; + break; + default: + break; + } + + return status; +} + +static void vdpasim_handle_cvq(struct vdpasim *vdpasim) +{ + struct vdpasim_virtqueue *cvq = &vdpasim->vqs[2]; + struct vdpasim_net *net = sim_to_net(vdpasim); + virtio_net_ctrl_ack status = VIRTIO_NET_ERR; + struct virtio_net_ctrl_hdr ctrl; + size_t read, write; + u64 requests = 0, errors = 0, successes = 0; + int err; + + if (!(vdpasim->features & (1ULL << VIRTIO_NET_F_CTRL_VQ))) + return; + + if (!cvq->ready) + return; + + while (true) { + err = vringh_getdesc_iotlb(&cvq->vring, &cvq->in_iov, + &cvq->out_iov, + &cvq->head, GFP_ATOMIC); + if (err <= 0) + break; + + ++requests; + read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->in_iov, &ctrl, + sizeof(ctrl)); + if (read != sizeof(ctrl)) { + ++errors; + break; + } + + switch (ctrl.class) { + case VIRTIO_NET_CTRL_MAC: + status = vdpasim_handle_ctrl_mac(vdpasim, ctrl.cmd); + break; + default: + break; + } + + if (status == VIRTIO_NET_OK) + ++successes; + else + ++errors; + + /* Make sure data is wrote before advancing index */ + smp_wmb(); + + write = vringh_iov_push_iotlb(&cvq->vring, &cvq->out_iov, + &status, sizeof(status)); + vringh_complete_iotlb(&cvq->vring, cvq->head, write); + vringh_kiov_cleanup(&cvq->in_iov); + vringh_kiov_cleanup(&cvq->out_iov); + + /* Make sure used is visible before rasing the interrupt. */ + smp_wmb(); + + local_bh_disable(); + if (cvq->cb) + cvq->cb(cvq->private); + local_bh_enable(); + } + + u64_stats_update_begin(&net->cq_stats.syncp); + net->cq_stats.requests += requests; + net->cq_stats.errors += errors; + net->cq_stats.successes += successes; + u64_stats_update_end(&net->cq_stats.syncp); +} + +static void vdpasim_net_work(struct vdpasim *vdpasim) +{ + struct vdpasim_virtqueue *txq = &vdpasim->vqs[1]; + struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0]; + struct vdpasim_net *net = sim_to_net(vdpasim); + ssize_t read, write; + u64 tx_pkts = 0, rx_pkts = 0, tx_bytes = 0, rx_bytes = 0; + u64 rx_drops = 0, rx_overruns = 0, rx_errors = 0, tx_errors = 0; + int err; + + mutex_lock(&vdpasim->mutex); + + if (!vdpasim->running) + goto out; + + if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) + goto out; + + vdpasim_handle_cvq(vdpasim); + + if (!txq->ready || !rxq->ready) + goto out; + + while (true) { + err = vringh_getdesc_iotlb(&txq->vring, &txq->out_iov, NULL, + &txq->head, GFP_ATOMIC); + if (err <= 0) { + if (err) + ++tx_errors; + break; + } + + ++tx_pkts; + read = vringh_iov_pull_iotlb(&txq->vring, &txq->out_iov, + net->buffer, PAGE_SIZE); + + tx_bytes += read; + + if (!receive_filter(vdpasim, read)) { + ++rx_drops; + vdpasim_net_complete(txq, 0); + continue; + } + + err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->in_iov, + &rxq->head, GFP_ATOMIC); + if (err <= 0) { + ++rx_overruns; + vdpasim_net_complete(txq, 0); + break; + } + + write = vringh_iov_push_iotlb(&rxq->vring, &rxq->in_iov, + net->buffer, read); + if (write <= 0) { + ++rx_errors; + break; + } + + ++rx_pkts; + rx_bytes += write; + + vdpasim_net_complete(txq, 0); + vdpasim_net_complete(rxq, write); + + if (tx_pkts > 4) { + vdpasim_schedule_work(vdpasim); + goto out; + } + } + +out: + mutex_unlock(&vdpasim->mutex); + + u64_stats_update_begin(&net->tx_stats.syncp); + net->tx_stats.pkts += tx_pkts; + net->tx_stats.bytes += tx_bytes; + net->tx_stats.errors += tx_errors; + u64_stats_update_end(&net->tx_stats.syncp); + + u64_stats_update_begin(&net->rx_stats.syncp); + net->rx_stats.pkts += rx_pkts; + net->rx_stats.bytes += rx_bytes; + net->rx_stats.drops += rx_drops; + net->rx_stats.errors += rx_errors; + net->rx_stats.overruns += rx_overruns; + u64_stats_update_end(&net->rx_stats.syncp); +} + +static int vdpasim_net_get_stats(struct vdpasim *vdpasim, u16 idx, + struct sk_buff *msg, + struct netlink_ext_ack *extack) +{ + struct vdpasim_net *net = sim_to_net(vdpasim); + u64 rx_pkts, rx_bytes, rx_errors, rx_overruns, rx_drops; + u64 tx_pkts, tx_bytes, tx_errors, tx_drops; + u64 cq_requests, cq_successes, cq_errors; + unsigned int start; + int err = -EMSGSIZE; + + switch(idx) { + case 0: + do { + start = u64_stats_fetch_begin(&net->rx_stats.syncp); + rx_pkts = net->rx_stats.pkts; + rx_bytes = net->rx_stats.bytes; + rx_errors = net->rx_stats.errors; + rx_overruns = net->rx_stats.overruns; + rx_drops = net->rx_stats.drops; + } while (u64_stats_fetch_retry(&net->rx_stats.syncp, start)); + + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "rx packets")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + rx_pkts, VDPA_ATTR_PAD)) + break; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "rx bytes")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + rx_bytes, VDPA_ATTR_PAD)) + break; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "rx errors")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + rx_errors, VDPA_ATTR_PAD)) + break; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "rx overruns")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + rx_overruns, VDPA_ATTR_PAD)) + break; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "rx drops")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + rx_drops, VDPA_ATTR_PAD)) + break; + err = 0; + break; + case 1: + do { + start = u64_stats_fetch_begin(&net->tx_stats.syncp); + tx_pkts = net->tx_stats.pkts; + tx_bytes = net->tx_stats.bytes; + tx_errors = net->tx_stats.errors; + tx_drops = net->tx_stats.drops; + } while (u64_stats_fetch_retry(&net->tx_stats.syncp, start)); + + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "tx packets")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + tx_pkts, VDPA_ATTR_PAD)) + break; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "tx bytes")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + tx_bytes, VDPA_ATTR_PAD)) + break; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "tx errors")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + tx_errors, VDPA_ATTR_PAD)) + break; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "tx drops")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + tx_drops, VDPA_ATTR_PAD)) + break; + err = 0; + break; + case 2: + do { + start = u64_stats_fetch_begin(&net->cq_stats.syncp); + cq_requests = net->cq_stats.requests; + cq_successes = net->cq_stats.successes; + cq_errors = net->cq_stats.errors; + } while (u64_stats_fetch_retry(&net->cq_stats.syncp, start)); + + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "cvq requests")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + cq_requests, VDPA_ATTR_PAD)) + break; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "cvq successes")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + cq_successes, VDPA_ATTR_PAD)) + break; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "cvq errors")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + cq_errors, VDPA_ATTR_PAD)) + break; + err = 0; + break; + default: + err = -EINVAL; + break; + } + + return err; +} + +static void vdpasim_net_get_config(struct vdpasim *vdpasim, void *config) +{ + struct virtio_net_config *net_config = config; + + net_config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP); +} + +static void vdpasim_net_setup_config(struct vdpasim *vdpasim, + const struct vdpa_dev_set_config *config) +{ + struct virtio_net_config *vio_config = vdpasim->config; + + if (config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR)) + memcpy(vio_config->mac, config->net.mac, ETH_ALEN); + if (config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MTU)) + vio_config->mtu = cpu_to_vdpasim16(vdpasim, config->net.mtu); + else + /* Setup default MTU to be 1500 */ + vio_config->mtu = cpu_to_vdpasim16(vdpasim, 1500); +} + +static void vdpasim_net_free(struct vdpasim *vdpasim) +{ + struct vdpasim_net *net = sim_to_net(vdpasim); + + kvfree(net->buffer); +} + +static void vdpasim_net_mgmtdev_release(struct device *dev) +{ +} + +static struct device vdpasim_net_mgmtdev = { + .init_name = "vdpasim_net", + .release = vdpasim_net_mgmtdev_release, +}; + +static int vdpasim_net_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, + const struct vdpa_dev_set_config *config) +{ + struct vdpasim_dev_attr dev_attr = {}; + struct vdpasim_net *net; + struct vdpasim *simdev; + int ret; + + dev_attr.mgmt_dev = mdev; + dev_attr.name = name; + dev_attr.id = VIRTIO_ID_NET; + dev_attr.supported_features = VDPASIM_NET_FEATURES; + dev_attr.nvqs = VDPASIM_NET_VQ_NUM; + dev_attr.ngroups = VDPASIM_NET_GROUP_NUM; + dev_attr.nas = VDPASIM_NET_AS_NUM; + dev_attr.alloc_size = sizeof(struct vdpasim_net); + dev_attr.config_size = sizeof(struct virtio_net_config); + dev_attr.get_config = vdpasim_net_get_config; + dev_attr.work_fn = vdpasim_net_work; + dev_attr.get_stats = vdpasim_net_get_stats; + dev_attr.free = vdpasim_net_free; + + simdev = vdpasim_create(&dev_attr, config); + if (IS_ERR(simdev)) + return PTR_ERR(simdev); + + vdpasim_net_setup_config(simdev, config); + + net = sim_to_net(simdev); + + u64_stats_init(&net->tx_stats.syncp); + u64_stats_init(&net->rx_stats.syncp); + u64_stats_init(&net->cq_stats.syncp); + + net->buffer = kvmalloc(PAGE_SIZE, GFP_KERNEL); + if (!net->buffer) { + ret = -ENOMEM; + goto reg_err; + } + + /* + * Initialization must be completed before this call, since it can + * connect the device to the vDPA bus, so requests can arrive after + * this call. + */ + ret = _vdpa_register_device(&simdev->vdpa, VDPASIM_NET_VQ_NUM); + if (ret) + goto reg_err; + + return 0; + +reg_err: + put_device(&simdev->vdpa.dev); + return ret; +} + +static void vdpasim_net_dev_del(struct vdpa_mgmt_dev *mdev, + struct vdpa_device *dev) +{ + struct vdpasim *simdev = container_of(dev, struct vdpasim, vdpa); + + _vdpa_unregister_device(&simdev->vdpa); +} + +static const struct vdpa_mgmtdev_ops vdpasim_net_mgmtdev_ops = { + .dev_add = vdpasim_net_dev_add, + .dev_del = vdpasim_net_dev_del +}; + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static struct vdpa_mgmt_dev mgmt_dev = { + .device = &vdpasim_net_mgmtdev, + .id_table = id_table, + .ops = &vdpasim_net_mgmtdev_ops, + .config_attr_mask = (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR | + 1 << VDPA_ATTR_DEV_NET_CFG_MTU | + 1 << VDPA_ATTR_DEV_FEATURES), + .max_supported_vqs = VDPASIM_NET_VQ_NUM, + .supported_features = VDPASIM_NET_FEATURES, +}; + +static int __init vdpasim_net_init(void) +{ + int ret; + + ret = device_register(&vdpasim_net_mgmtdev); + if (ret) { + put_device(&vdpasim_net_mgmtdev); + return ret; + } + + ret = vdpa_mgmtdev_register(&mgmt_dev); + if (ret) + goto parent_err; + return 0; + +parent_err: + device_unregister(&vdpasim_net_mgmtdev); + return ret; +} + +static void __exit vdpasim_net_exit(void) +{ + vdpa_mgmtdev_unregister(&mgmt_dev); + device_unregister(&vdpasim_net_mgmtdev); +} + +module_init(vdpasim_net_init); +module_exit(vdpasim_net_exit); + +MODULE_VERSION(DRV_VERSION); +MODULE_LICENSE(DRV_LICENSE); +MODULE_AUTHOR(DRV_AUTHOR); +MODULE_DESCRIPTION(DRV_DESC); diff --git a/drivers/vdpa/vdpa_user/Makefile b/drivers/vdpa/vdpa_user/Makefile new file mode 100644 index 0000000000..260e0b26af --- /dev/null +++ b/drivers/vdpa/vdpa_user/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +vduse-y := vduse_dev.o iova_domain.o + +obj-$(CONFIG_VDPA_USER) += vduse.o diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c new file mode 100644 index 0000000000..5e4a77b9ba --- /dev/null +++ b/drivers/vdpa/vdpa_user/iova_domain.c @@ -0,0 +1,624 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * MMU-based software IOTLB. + * + * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved. + * + * Author: Xie Yongji <xieyongji@bytedance.com> + * + */ + +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/anon_inodes.h> +#include <linux/highmem.h> +#include <linux/vmalloc.h> +#include <linux/vdpa.h> + +#include "iova_domain.h" + +static int vduse_iotlb_add_range(struct vduse_iova_domain *domain, + u64 start, u64 last, + u64 addr, unsigned int perm, + struct file *file, u64 offset) +{ + struct vdpa_map_file *map_file; + int ret; + + map_file = kmalloc(sizeof(*map_file), GFP_ATOMIC); + if (!map_file) + return -ENOMEM; + + map_file->file = get_file(file); + map_file->offset = offset; + + ret = vhost_iotlb_add_range_ctx(domain->iotlb, start, last, + addr, perm, map_file); + if (ret) { + fput(map_file->file); + kfree(map_file); + return ret; + } + return 0; +} + +static void vduse_iotlb_del_range(struct vduse_iova_domain *domain, + u64 start, u64 last) +{ + struct vdpa_map_file *map_file; + struct vhost_iotlb_map *map; + + while ((map = vhost_iotlb_itree_first(domain->iotlb, start, last))) { + map_file = (struct vdpa_map_file *)map->opaque; + fput(map_file->file); + kfree(map_file); + vhost_iotlb_map_free(domain->iotlb, map); + } +} + +int vduse_domain_set_map(struct vduse_iova_domain *domain, + struct vhost_iotlb *iotlb) +{ + struct vdpa_map_file *map_file; + struct vhost_iotlb_map *map; + u64 start = 0ULL, last = ULLONG_MAX; + int ret; + + spin_lock(&domain->iotlb_lock); + vduse_iotlb_del_range(domain, start, last); + + for (map = vhost_iotlb_itree_first(iotlb, start, last); map; + map = vhost_iotlb_itree_next(map, start, last)) { + map_file = (struct vdpa_map_file *)map->opaque; + ret = vduse_iotlb_add_range(domain, map->start, map->last, + map->addr, map->perm, + map_file->file, + map_file->offset); + if (ret) + goto err; + } + spin_unlock(&domain->iotlb_lock); + + return 0; +err: + vduse_iotlb_del_range(domain, start, last); + spin_unlock(&domain->iotlb_lock); + return ret; +} + +void vduse_domain_clear_map(struct vduse_iova_domain *domain, + struct vhost_iotlb *iotlb) +{ + struct vhost_iotlb_map *map; + u64 start = 0ULL, last = ULLONG_MAX; + + spin_lock(&domain->iotlb_lock); + for (map = vhost_iotlb_itree_first(iotlb, start, last); map; + map = vhost_iotlb_itree_next(map, start, last)) { + vduse_iotlb_del_range(domain, map->start, map->last); + } + spin_unlock(&domain->iotlb_lock); +} + +static int vduse_domain_map_bounce_page(struct vduse_iova_domain *domain, + u64 iova, u64 size, u64 paddr) +{ + struct vduse_bounce_map *map; + u64 last = iova + size - 1; + + while (iova <= last) { + map = &domain->bounce_maps[iova >> PAGE_SHIFT]; + if (!map->bounce_page) { + map->bounce_page = alloc_page(GFP_ATOMIC); + if (!map->bounce_page) + return -ENOMEM; + } + map->orig_phys = paddr; + paddr += PAGE_SIZE; + iova += PAGE_SIZE; + } + return 0; +} + +static void vduse_domain_unmap_bounce_page(struct vduse_iova_domain *domain, + u64 iova, u64 size) +{ + struct vduse_bounce_map *map; + u64 last = iova + size - 1; + + while (iova <= last) { + map = &domain->bounce_maps[iova >> PAGE_SHIFT]; + map->orig_phys = INVALID_PHYS_ADDR; + iova += PAGE_SIZE; + } +} + +static void do_bounce(phys_addr_t orig, void *addr, size_t size, + enum dma_data_direction dir) +{ + unsigned long pfn = PFN_DOWN(orig); + unsigned int offset = offset_in_page(orig); + struct page *page; + unsigned int sz = 0; + + while (size) { + sz = min_t(size_t, PAGE_SIZE - offset, size); + + page = pfn_to_page(pfn); + if (dir == DMA_TO_DEVICE) + memcpy_from_page(addr, page, offset, sz); + else + memcpy_to_page(page, offset, addr, sz); + + size -= sz; + pfn++; + addr += sz; + offset = 0; + } +} + +static void vduse_domain_bounce(struct vduse_iova_domain *domain, + dma_addr_t iova, size_t size, + enum dma_data_direction dir) +{ + struct vduse_bounce_map *map; + unsigned int offset; + void *addr; + size_t sz; + + if (iova >= domain->bounce_size) + return; + + while (size) { + map = &domain->bounce_maps[iova >> PAGE_SHIFT]; + offset = offset_in_page(iova); + sz = min_t(size_t, PAGE_SIZE - offset, size); + + if (WARN_ON(!map->bounce_page || + map->orig_phys == INVALID_PHYS_ADDR)) + return; + + addr = kmap_local_page(map->bounce_page); + do_bounce(map->orig_phys + offset, addr + offset, sz, dir); + kunmap_local(addr); + size -= sz; + iova += sz; + } +} + +static struct page * +vduse_domain_get_coherent_page(struct vduse_iova_domain *domain, u64 iova) +{ + u64 start = iova & PAGE_MASK; + u64 last = start + PAGE_SIZE - 1; + struct vhost_iotlb_map *map; + struct page *page = NULL; + + spin_lock(&domain->iotlb_lock); + map = vhost_iotlb_itree_first(domain->iotlb, start, last); + if (!map) + goto out; + + page = pfn_to_page((map->addr + iova - map->start) >> PAGE_SHIFT); + get_page(page); +out: + spin_unlock(&domain->iotlb_lock); + + return page; +} + +static struct page * +vduse_domain_get_bounce_page(struct vduse_iova_domain *domain, u64 iova) +{ + struct vduse_bounce_map *map; + struct page *page = NULL; + + read_lock(&domain->bounce_lock); + map = &domain->bounce_maps[iova >> PAGE_SHIFT]; + if (domain->user_bounce_pages || !map->bounce_page) + goto out; + + page = map->bounce_page; + get_page(page); +out: + read_unlock(&domain->bounce_lock); + + return page; +} + +static void +vduse_domain_free_kernel_bounce_pages(struct vduse_iova_domain *domain) +{ + struct vduse_bounce_map *map; + unsigned long pfn, bounce_pfns; + + bounce_pfns = domain->bounce_size >> PAGE_SHIFT; + + for (pfn = 0; pfn < bounce_pfns; pfn++) { + map = &domain->bounce_maps[pfn]; + if (WARN_ON(map->orig_phys != INVALID_PHYS_ADDR)) + continue; + + if (!map->bounce_page) + continue; + + __free_page(map->bounce_page); + map->bounce_page = NULL; + } +} + +int vduse_domain_add_user_bounce_pages(struct vduse_iova_domain *domain, + struct page **pages, int count) +{ + struct vduse_bounce_map *map; + int i, ret; + + /* Now we don't support partial mapping */ + if (count != (domain->bounce_size >> PAGE_SHIFT)) + return -EINVAL; + + write_lock(&domain->bounce_lock); + ret = -EEXIST; + if (domain->user_bounce_pages) + goto out; + + for (i = 0; i < count; i++) { + map = &domain->bounce_maps[i]; + if (map->bounce_page) { + /* Copy kernel page to user page if it's in use */ + if (map->orig_phys != INVALID_PHYS_ADDR) + memcpy_to_page(pages[i], 0, + page_address(map->bounce_page), + PAGE_SIZE); + __free_page(map->bounce_page); + } + map->bounce_page = pages[i]; + get_page(pages[i]); + } + domain->user_bounce_pages = true; + ret = 0; +out: + write_unlock(&domain->bounce_lock); + + return ret; +} + +void vduse_domain_remove_user_bounce_pages(struct vduse_iova_domain *domain) +{ + struct vduse_bounce_map *map; + unsigned long i, count; + + write_lock(&domain->bounce_lock); + if (!domain->user_bounce_pages) + goto out; + + count = domain->bounce_size >> PAGE_SHIFT; + for (i = 0; i < count; i++) { + struct page *page = NULL; + + map = &domain->bounce_maps[i]; + if (WARN_ON(!map->bounce_page)) + continue; + + /* Copy user page to kernel page if it's in use */ + if (map->orig_phys != INVALID_PHYS_ADDR) { + page = alloc_page(GFP_ATOMIC | __GFP_NOFAIL); + memcpy_from_page(page_address(page), + map->bounce_page, 0, PAGE_SIZE); + } + put_page(map->bounce_page); + map->bounce_page = page; + } + domain->user_bounce_pages = false; +out: + write_unlock(&domain->bounce_lock); +} + +void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain) +{ + if (!domain->bounce_map) + return; + + spin_lock(&domain->iotlb_lock); + if (!domain->bounce_map) + goto unlock; + + vduse_iotlb_del_range(domain, 0, domain->bounce_size - 1); + domain->bounce_map = 0; +unlock: + spin_unlock(&domain->iotlb_lock); +} + +static int vduse_domain_init_bounce_map(struct vduse_iova_domain *domain) +{ + int ret = 0; + + if (domain->bounce_map) + return 0; + + spin_lock(&domain->iotlb_lock); + if (domain->bounce_map) + goto unlock; + + ret = vduse_iotlb_add_range(domain, 0, domain->bounce_size - 1, + 0, VHOST_MAP_RW, domain->file, 0); + if (ret) + goto unlock; + + domain->bounce_map = 1; +unlock: + spin_unlock(&domain->iotlb_lock); + return ret; +} + +static dma_addr_t +vduse_domain_alloc_iova(struct iova_domain *iovad, + unsigned long size, unsigned long limit) +{ + unsigned long shift = iova_shift(iovad); + unsigned long iova_len = iova_align(iovad, size) >> shift; + unsigned long iova_pfn; + + iova_pfn = alloc_iova_fast(iovad, iova_len, limit >> shift, true); + + return (dma_addr_t)iova_pfn << shift; +} + +static void vduse_domain_free_iova(struct iova_domain *iovad, + dma_addr_t iova, size_t size) +{ + unsigned long shift = iova_shift(iovad); + unsigned long iova_len = iova_align(iovad, size) >> shift; + + free_iova_fast(iovad, iova >> shift, iova_len); +} + +dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain, + struct page *page, unsigned long offset, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + struct iova_domain *iovad = &domain->stream_iovad; + unsigned long limit = domain->bounce_size - 1; + phys_addr_t pa = page_to_phys(page) + offset; + dma_addr_t iova = vduse_domain_alloc_iova(iovad, size, limit); + + if (!iova) + return DMA_MAPPING_ERROR; + + if (vduse_domain_init_bounce_map(domain)) + goto err; + + read_lock(&domain->bounce_lock); + if (vduse_domain_map_bounce_page(domain, (u64)iova, (u64)size, pa)) + goto err_unlock; + + if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) + vduse_domain_bounce(domain, iova, size, DMA_TO_DEVICE); + + read_unlock(&domain->bounce_lock); + + return iova; +err_unlock: + read_unlock(&domain->bounce_lock); +err: + vduse_domain_free_iova(iovad, iova, size); + return DMA_MAPPING_ERROR; +} + +void vduse_domain_unmap_page(struct vduse_iova_domain *domain, + dma_addr_t dma_addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + struct iova_domain *iovad = &domain->stream_iovad; + + read_lock(&domain->bounce_lock); + if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) + vduse_domain_bounce(domain, dma_addr, size, DMA_FROM_DEVICE); + + vduse_domain_unmap_bounce_page(domain, (u64)dma_addr, (u64)size); + read_unlock(&domain->bounce_lock); + vduse_domain_free_iova(iovad, dma_addr, size); +} + +void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, + size_t size, dma_addr_t *dma_addr, + gfp_t flag, unsigned long attrs) +{ + struct iova_domain *iovad = &domain->consistent_iovad; + unsigned long limit = domain->iova_limit; + dma_addr_t iova = vduse_domain_alloc_iova(iovad, size, limit); + void *orig = alloc_pages_exact(size, flag); + + if (!iova || !orig) + goto err; + + spin_lock(&domain->iotlb_lock); + if (vduse_iotlb_add_range(domain, (u64)iova, (u64)iova + size - 1, + virt_to_phys(orig), VHOST_MAP_RW, + domain->file, (u64)iova)) { + spin_unlock(&domain->iotlb_lock); + goto err; + } + spin_unlock(&domain->iotlb_lock); + + *dma_addr = iova; + + return orig; +err: + *dma_addr = DMA_MAPPING_ERROR; + if (orig) + free_pages_exact(orig, size); + if (iova) + vduse_domain_free_iova(iovad, iova, size); + + return NULL; +} + +void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size, + void *vaddr, dma_addr_t dma_addr, + unsigned long attrs) +{ + struct iova_domain *iovad = &domain->consistent_iovad; + struct vhost_iotlb_map *map; + struct vdpa_map_file *map_file; + phys_addr_t pa; + + spin_lock(&domain->iotlb_lock); + map = vhost_iotlb_itree_first(domain->iotlb, (u64)dma_addr, + (u64)dma_addr + size - 1); + if (WARN_ON(!map)) { + spin_unlock(&domain->iotlb_lock); + return; + } + map_file = (struct vdpa_map_file *)map->opaque; + fput(map_file->file); + kfree(map_file); + pa = map->addr; + vhost_iotlb_map_free(domain->iotlb, map); + spin_unlock(&domain->iotlb_lock); + + vduse_domain_free_iova(iovad, dma_addr, size); + free_pages_exact(phys_to_virt(pa), size); +} + +static vm_fault_t vduse_domain_mmap_fault(struct vm_fault *vmf) +{ + struct vduse_iova_domain *domain = vmf->vma->vm_private_data; + unsigned long iova = vmf->pgoff << PAGE_SHIFT; + struct page *page; + + if (!domain) + return VM_FAULT_SIGBUS; + + if (iova < domain->bounce_size) + page = vduse_domain_get_bounce_page(domain, iova); + else + page = vduse_domain_get_coherent_page(domain, iova); + + if (!page) + return VM_FAULT_SIGBUS; + + vmf->page = page; + + return 0; +} + +static const struct vm_operations_struct vduse_domain_mmap_ops = { + .fault = vduse_domain_mmap_fault, +}; + +static int vduse_domain_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct vduse_iova_domain *domain = file->private_data; + + vm_flags_set(vma, VM_DONTDUMP | VM_DONTEXPAND); + vma->vm_private_data = domain; + vma->vm_ops = &vduse_domain_mmap_ops; + + return 0; +} + +static int vduse_domain_release(struct inode *inode, struct file *file) +{ + struct vduse_iova_domain *domain = file->private_data; + + spin_lock(&domain->iotlb_lock); + vduse_iotlb_del_range(domain, 0, ULLONG_MAX); + vduse_domain_remove_user_bounce_pages(domain); + vduse_domain_free_kernel_bounce_pages(domain); + spin_unlock(&domain->iotlb_lock); + put_iova_domain(&domain->stream_iovad); + put_iova_domain(&domain->consistent_iovad); + vhost_iotlb_free(domain->iotlb); + vfree(domain->bounce_maps); + kfree(domain); + + return 0; +} + +static const struct file_operations vduse_domain_fops = { + .owner = THIS_MODULE, + .mmap = vduse_domain_mmap, + .release = vduse_domain_release, +}; + +void vduse_domain_destroy(struct vduse_iova_domain *domain) +{ + fput(domain->file); +} + +struct vduse_iova_domain * +vduse_domain_create(unsigned long iova_limit, size_t bounce_size) +{ + struct vduse_iova_domain *domain; + struct file *file; + struct vduse_bounce_map *map; + unsigned long pfn, bounce_pfns; + int ret; + + bounce_pfns = PAGE_ALIGN(bounce_size) >> PAGE_SHIFT; + if (iova_limit <= bounce_size) + return NULL; + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return NULL; + + domain->iotlb = vhost_iotlb_alloc(0, 0); + if (!domain->iotlb) + goto err_iotlb; + + domain->iova_limit = iova_limit; + domain->bounce_size = PAGE_ALIGN(bounce_size); + domain->bounce_maps = vzalloc(bounce_pfns * + sizeof(struct vduse_bounce_map)); + if (!domain->bounce_maps) + goto err_map; + + for (pfn = 0; pfn < bounce_pfns; pfn++) { + map = &domain->bounce_maps[pfn]; + map->orig_phys = INVALID_PHYS_ADDR; + } + file = anon_inode_getfile("[vduse-domain]", &vduse_domain_fops, + domain, O_RDWR); + if (IS_ERR(file)) + goto err_file; + + domain->file = file; + rwlock_init(&domain->bounce_lock); + spin_lock_init(&domain->iotlb_lock); + init_iova_domain(&domain->stream_iovad, + PAGE_SIZE, IOVA_START_PFN); + ret = iova_domain_init_rcaches(&domain->stream_iovad); + if (ret) + goto err_iovad_stream; + init_iova_domain(&domain->consistent_iovad, + PAGE_SIZE, bounce_pfns); + ret = iova_domain_init_rcaches(&domain->consistent_iovad); + if (ret) + goto err_iovad_consistent; + + return domain; +err_iovad_consistent: + put_iova_domain(&domain->stream_iovad); +err_iovad_stream: + fput(file); +err_file: + vfree(domain->bounce_maps); +err_map: + vhost_iotlb_free(domain->iotlb); +err_iotlb: + kfree(domain); + return NULL; +} + +int vduse_domain_init(void) +{ + return iova_cache_get(); +} + +void vduse_domain_exit(void) +{ + iova_cache_put(); +} diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h new file mode 100644 index 0000000000..173e979b84 --- /dev/null +++ b/drivers/vdpa/vdpa_user/iova_domain.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * MMU-based software IOTLB. + * + * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved. + * + * Author: Xie Yongji <xieyongji@bytedance.com> + * + */ + +#ifndef _VDUSE_IOVA_DOMAIN_H +#define _VDUSE_IOVA_DOMAIN_H + +#include <linux/iova.h> +#include <linux/dma-mapping.h> +#include <linux/vhost_iotlb.h> + +#define IOVA_START_PFN 1 + +#define INVALID_PHYS_ADDR (~(phys_addr_t)0) + +struct vduse_bounce_map { + struct page *bounce_page; + u64 orig_phys; +}; + +struct vduse_iova_domain { + struct iova_domain stream_iovad; + struct iova_domain consistent_iovad; + struct vduse_bounce_map *bounce_maps; + size_t bounce_size; + unsigned long iova_limit; + int bounce_map; + struct vhost_iotlb *iotlb; + spinlock_t iotlb_lock; + struct file *file; + bool user_bounce_pages; + rwlock_t bounce_lock; +}; + +int vduse_domain_set_map(struct vduse_iova_domain *domain, + struct vhost_iotlb *iotlb); + +void vduse_domain_clear_map(struct vduse_iova_domain *domain, + struct vhost_iotlb *iotlb); + +dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain, + struct page *page, unsigned long offset, + size_t size, enum dma_data_direction dir, + unsigned long attrs); + +void vduse_domain_unmap_page(struct vduse_iova_domain *domain, + dma_addr_t dma_addr, size_t size, + enum dma_data_direction dir, unsigned long attrs); + +void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, + size_t size, dma_addr_t *dma_addr, + gfp_t flag, unsigned long attrs); + +void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size, + void *vaddr, dma_addr_t dma_addr, + unsigned long attrs); + +void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain); + +int vduse_domain_add_user_bounce_pages(struct vduse_iova_domain *domain, + struct page **pages, int count); + +void vduse_domain_remove_user_bounce_pages(struct vduse_iova_domain *domain); + +void vduse_domain_destroy(struct vduse_iova_domain *domain); + +struct vduse_iova_domain *vduse_domain_create(unsigned long iova_limit, + size_t bounce_size); + +int vduse_domain_init(void); + +void vduse_domain_exit(void); + +#endif /* _VDUSE_IOVA_DOMAIN_H */ diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c new file mode 100644 index 0000000000..df7869537e --- /dev/null +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -0,0 +1,2171 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VDUSE: vDPA Device in Userspace + * + * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved. + * + * Author: Xie Yongji <xieyongji@bytedance.com> + * + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/cdev.h> +#include <linux/device.h> +#include <linux/eventfd.h> +#include <linux/slab.h> +#include <linux/wait.h> +#include <linux/dma-map-ops.h> +#include <linux/poll.h> +#include <linux/file.h> +#include <linux/uio.h> +#include <linux/vdpa.h> +#include <linux/nospec.h> +#include <linux/vmalloc.h> +#include <linux/sched/mm.h> +#include <uapi/linux/vduse.h> +#include <uapi/linux/vdpa.h> +#include <uapi/linux/virtio_config.h> +#include <uapi/linux/virtio_ids.h> +#include <uapi/linux/virtio_blk.h> +#include <linux/mod_devicetable.h> + +#include "iova_domain.h" + +#define DRV_AUTHOR "Yongji Xie <xieyongji@bytedance.com>" +#define DRV_DESC "vDPA Device in Userspace" +#define DRV_LICENSE "GPL v2" + +#define VDUSE_DEV_MAX (1U << MINORBITS) +#define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024) +#define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024) +#define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024) +/* 128 MB reserved for virtqueue creation */ +#define VDUSE_IOVA_SIZE (VDUSE_MAX_BOUNCE_SIZE + 128 * 1024 * 1024) +#define VDUSE_MSG_DEFAULT_TIMEOUT 30 + +#define IRQ_UNBOUND -1 + +struct vduse_virtqueue { + u16 index; + u16 num_max; + u32 num; + u64 desc_addr; + u64 driver_addr; + u64 device_addr; + struct vdpa_vq_state state; + bool ready; + bool kicked; + spinlock_t kick_lock; + spinlock_t irq_lock; + struct eventfd_ctx *kickfd; + struct vdpa_callback cb; + struct work_struct inject; + struct work_struct kick; + int irq_effective_cpu; + struct cpumask irq_affinity; + struct kobject kobj; +}; + +struct vduse_dev; + +struct vduse_vdpa { + struct vdpa_device vdpa; + struct vduse_dev *dev; +}; + +struct vduse_umem { + unsigned long iova; + unsigned long npages; + struct page **pages; + struct mm_struct *mm; +}; + +struct vduse_dev { + struct vduse_vdpa *vdev; + struct device *dev; + struct vduse_virtqueue **vqs; + struct vduse_iova_domain *domain; + char *name; + struct mutex lock; + spinlock_t msg_lock; + u64 msg_unique; + u32 msg_timeout; + wait_queue_head_t waitq; + struct list_head send_list; + struct list_head recv_list; + struct vdpa_callback config_cb; + struct work_struct inject; + spinlock_t irq_lock; + struct rw_semaphore rwsem; + int minor; + bool broken; + bool connected; + u64 api_version; + u64 device_features; + u64 driver_features; + u32 device_id; + u32 vendor_id; + u32 generation; + u32 config_size; + void *config; + u8 status; + u32 vq_num; + u32 vq_align; + struct vduse_umem *umem; + struct mutex mem_lock; + unsigned int bounce_size; + struct mutex domain_lock; +}; + +struct vduse_dev_msg { + struct vduse_dev_request req; + struct vduse_dev_response resp; + struct list_head list; + wait_queue_head_t waitq; + bool completed; +}; + +struct vduse_control { + u64 api_version; +}; + +static DEFINE_MUTEX(vduse_lock); +static DEFINE_IDR(vduse_idr); + +static dev_t vduse_major; +static struct class *vduse_class; +static struct cdev vduse_ctrl_cdev; +static struct cdev vduse_cdev; +static struct workqueue_struct *vduse_irq_wq; +static struct workqueue_struct *vduse_irq_bound_wq; + +static u32 allowed_device_id[] = { + VIRTIO_ID_BLOCK, +}; + +static inline struct vduse_dev *vdpa_to_vduse(struct vdpa_device *vdpa) +{ + struct vduse_vdpa *vdev = container_of(vdpa, struct vduse_vdpa, vdpa); + + return vdev->dev; +} + +static inline struct vduse_dev *dev_to_vduse(struct device *dev) +{ + struct vdpa_device *vdpa = dev_to_vdpa(dev); + + return vdpa_to_vduse(vdpa); +} + +static struct vduse_dev_msg *vduse_find_msg(struct list_head *head, + uint32_t request_id) +{ + struct vduse_dev_msg *msg; + + list_for_each_entry(msg, head, list) { + if (msg->req.request_id == request_id) { + list_del(&msg->list); + return msg; + } + } + + return NULL; +} + +static struct vduse_dev_msg *vduse_dequeue_msg(struct list_head *head) +{ + struct vduse_dev_msg *msg = NULL; + + if (!list_empty(head)) { + msg = list_first_entry(head, struct vduse_dev_msg, list); + list_del(&msg->list); + } + + return msg; +} + +static void vduse_enqueue_msg(struct list_head *head, + struct vduse_dev_msg *msg) +{ + list_add_tail(&msg->list, head); +} + +static void vduse_dev_broken(struct vduse_dev *dev) +{ + struct vduse_dev_msg *msg, *tmp; + + if (unlikely(dev->broken)) + return; + + list_splice_init(&dev->recv_list, &dev->send_list); + list_for_each_entry_safe(msg, tmp, &dev->send_list, list) { + list_del(&msg->list); + msg->completed = 1; + msg->resp.result = VDUSE_REQ_RESULT_FAILED; + wake_up(&msg->waitq); + } + dev->broken = true; + wake_up(&dev->waitq); +} + +static int vduse_dev_msg_sync(struct vduse_dev *dev, + struct vduse_dev_msg *msg) +{ + int ret; + + if (unlikely(dev->broken)) + return -EIO; + + init_waitqueue_head(&msg->waitq); + spin_lock(&dev->msg_lock); + if (unlikely(dev->broken)) { + spin_unlock(&dev->msg_lock); + return -EIO; + } + msg->req.request_id = dev->msg_unique++; + vduse_enqueue_msg(&dev->send_list, msg); + wake_up(&dev->waitq); + spin_unlock(&dev->msg_lock); + if (dev->msg_timeout) + ret = wait_event_killable_timeout(msg->waitq, msg->completed, + (long)dev->msg_timeout * HZ); + else + ret = wait_event_killable(msg->waitq, msg->completed); + + spin_lock(&dev->msg_lock); + if (!msg->completed) { + list_del(&msg->list); + msg->resp.result = VDUSE_REQ_RESULT_FAILED; + /* Mark the device as malfunction when there is a timeout */ + if (!ret) + vduse_dev_broken(dev); + } + ret = (msg->resp.result == VDUSE_REQ_RESULT_OK) ? 0 : -EIO; + spin_unlock(&dev->msg_lock); + + return ret; +} + +static int vduse_dev_get_vq_state_packed(struct vduse_dev *dev, + struct vduse_virtqueue *vq, + struct vdpa_vq_state_packed *packed) +{ + struct vduse_dev_msg msg = { 0 }; + int ret; + + msg.req.type = VDUSE_GET_VQ_STATE; + msg.req.vq_state.index = vq->index; + + ret = vduse_dev_msg_sync(dev, &msg); + if (ret) + return ret; + + packed->last_avail_counter = + msg.resp.vq_state.packed.last_avail_counter & 0x0001; + packed->last_avail_idx = + msg.resp.vq_state.packed.last_avail_idx & 0x7FFF; + packed->last_used_counter = + msg.resp.vq_state.packed.last_used_counter & 0x0001; + packed->last_used_idx = + msg.resp.vq_state.packed.last_used_idx & 0x7FFF; + + return 0; +} + +static int vduse_dev_get_vq_state_split(struct vduse_dev *dev, + struct vduse_virtqueue *vq, + struct vdpa_vq_state_split *split) +{ + struct vduse_dev_msg msg = { 0 }; + int ret; + + msg.req.type = VDUSE_GET_VQ_STATE; + msg.req.vq_state.index = vq->index; + + ret = vduse_dev_msg_sync(dev, &msg); + if (ret) + return ret; + + split->avail_index = msg.resp.vq_state.split.avail_index; + + return 0; +} + +static int vduse_dev_set_status(struct vduse_dev *dev, u8 status) +{ + struct vduse_dev_msg msg = { 0 }; + + msg.req.type = VDUSE_SET_STATUS; + msg.req.s.status = status; + + return vduse_dev_msg_sync(dev, &msg); +} + +static int vduse_dev_update_iotlb(struct vduse_dev *dev, + u64 start, u64 last) +{ + struct vduse_dev_msg msg = { 0 }; + + if (last < start) + return -EINVAL; + + msg.req.type = VDUSE_UPDATE_IOTLB; + msg.req.iova.start = start; + msg.req.iova.last = last; + + return vduse_dev_msg_sync(dev, &msg); +} + +static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct vduse_dev *dev = file->private_data; + struct vduse_dev_msg *msg; + int size = sizeof(struct vduse_dev_request); + ssize_t ret; + + if (iov_iter_count(to) < size) + return -EINVAL; + + spin_lock(&dev->msg_lock); + while (1) { + msg = vduse_dequeue_msg(&dev->send_list); + if (msg) + break; + + ret = -EAGAIN; + if (file->f_flags & O_NONBLOCK) + goto unlock; + + spin_unlock(&dev->msg_lock); + ret = wait_event_interruptible_exclusive(dev->waitq, + !list_empty(&dev->send_list)); + if (ret) + return ret; + + spin_lock(&dev->msg_lock); + } + spin_unlock(&dev->msg_lock); + ret = copy_to_iter(&msg->req, size, to); + spin_lock(&dev->msg_lock); + if (ret != size) { + ret = -EFAULT; + vduse_enqueue_msg(&dev->send_list, msg); + goto unlock; + } + vduse_enqueue_msg(&dev->recv_list, msg); +unlock: + spin_unlock(&dev->msg_lock); + + return ret; +} + +static bool is_mem_zero(const char *ptr, int size) +{ + int i; + + for (i = 0; i < size; i++) { + if (ptr[i]) + return false; + } + return true; +} + +static ssize_t vduse_dev_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct vduse_dev *dev = file->private_data; + struct vduse_dev_response resp; + struct vduse_dev_msg *msg; + size_t ret; + + ret = copy_from_iter(&resp, sizeof(resp), from); + if (ret != sizeof(resp)) + return -EINVAL; + + if (!is_mem_zero((const char *)resp.reserved, sizeof(resp.reserved))) + return -EINVAL; + + spin_lock(&dev->msg_lock); + msg = vduse_find_msg(&dev->recv_list, resp.request_id); + if (!msg) { + ret = -ENOENT; + goto unlock; + } + + memcpy(&msg->resp, &resp, sizeof(resp)); + msg->completed = 1; + wake_up(&msg->waitq); +unlock: + spin_unlock(&dev->msg_lock); + + return ret; +} + +static __poll_t vduse_dev_poll(struct file *file, poll_table *wait) +{ + struct vduse_dev *dev = file->private_data; + __poll_t mask = 0; + + poll_wait(file, &dev->waitq, wait); + + spin_lock(&dev->msg_lock); + + if (unlikely(dev->broken)) + mask |= EPOLLERR; + if (!list_empty(&dev->send_list)) + mask |= EPOLLIN | EPOLLRDNORM; + if (!list_empty(&dev->recv_list)) + mask |= EPOLLOUT | EPOLLWRNORM; + + spin_unlock(&dev->msg_lock); + + return mask; +} + +static void vduse_dev_reset(struct vduse_dev *dev) +{ + int i; + struct vduse_iova_domain *domain = dev->domain; + + /* The coherent mappings are handled in vduse_dev_free_coherent() */ + if (domain && domain->bounce_map) + vduse_domain_reset_bounce_map(domain); + + down_write(&dev->rwsem); + + dev->status = 0; + dev->driver_features = 0; + dev->generation++; + spin_lock(&dev->irq_lock); + dev->config_cb.callback = NULL; + dev->config_cb.private = NULL; + spin_unlock(&dev->irq_lock); + flush_work(&dev->inject); + + for (i = 0; i < dev->vq_num; i++) { + struct vduse_virtqueue *vq = dev->vqs[i]; + + vq->ready = false; + vq->desc_addr = 0; + vq->driver_addr = 0; + vq->device_addr = 0; + vq->num = 0; + memset(&vq->state, 0, sizeof(vq->state)); + + spin_lock(&vq->kick_lock); + vq->kicked = false; + if (vq->kickfd) + eventfd_ctx_put(vq->kickfd); + vq->kickfd = NULL; + spin_unlock(&vq->kick_lock); + + spin_lock(&vq->irq_lock); + vq->cb.callback = NULL; + vq->cb.private = NULL; + vq->cb.trigger = NULL; + spin_unlock(&vq->irq_lock); + flush_work(&vq->inject); + flush_work(&vq->kick); + } + + up_write(&dev->rwsem); +} + +static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx, + u64 desc_area, u64 driver_area, + u64 device_area) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = dev->vqs[idx]; + + vq->desc_addr = desc_area; + vq->driver_addr = driver_area; + vq->device_addr = device_area; + + return 0; +} + +static void vduse_vq_kick(struct vduse_virtqueue *vq) +{ + spin_lock(&vq->kick_lock); + if (!vq->ready) + goto unlock; + + if (vq->kickfd) + eventfd_signal(vq->kickfd, 1); + else + vq->kicked = true; +unlock: + spin_unlock(&vq->kick_lock); +} + +static void vduse_vq_kick_work(struct work_struct *work) +{ + struct vduse_virtqueue *vq = container_of(work, + struct vduse_virtqueue, kick); + + vduse_vq_kick(vq); +} + +static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = dev->vqs[idx]; + + if (!eventfd_signal_allowed()) { + schedule_work(&vq->kick); + return; + } + vduse_vq_kick(vq); +} + +static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx, + struct vdpa_callback *cb) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = dev->vqs[idx]; + + spin_lock(&vq->irq_lock); + vq->cb.callback = cb->callback; + vq->cb.private = cb->private; + vq->cb.trigger = cb->trigger; + spin_unlock(&vq->irq_lock); +} + +static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = dev->vqs[idx]; + + vq->num = num; +} + +static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa, + u16 idx, bool ready) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = dev->vqs[idx]; + + vq->ready = ready; +} + +static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = dev->vqs[idx]; + + return vq->ready; +} + +static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx, + const struct vdpa_vq_state *state) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = dev->vqs[idx]; + + if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { + vq->state.packed.last_avail_counter = + state->packed.last_avail_counter; + vq->state.packed.last_avail_idx = state->packed.last_avail_idx; + vq->state.packed.last_used_counter = + state->packed.last_used_counter; + vq->state.packed.last_used_idx = state->packed.last_used_idx; + } else + vq->state.split.avail_index = state->split.avail_index; + + return 0; +} + +static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, + struct vdpa_vq_state *state) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = dev->vqs[idx]; + + if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) + return vduse_dev_get_vq_state_packed(dev, vq, &state->packed); + + return vduse_dev_get_vq_state_split(dev, vq, &state->split); +} + +static u32 vduse_vdpa_get_vq_align(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->vq_align; +} + +static u64 vduse_vdpa_get_device_features(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->device_features; +} + +static int vduse_vdpa_set_driver_features(struct vdpa_device *vdpa, u64 features) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + dev->driver_features = features; + return 0; +} + +static u64 vduse_vdpa_get_driver_features(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->driver_features; +} + +static void vduse_vdpa_set_config_cb(struct vdpa_device *vdpa, + struct vdpa_callback *cb) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + spin_lock(&dev->irq_lock); + dev->config_cb.callback = cb->callback; + dev->config_cb.private = cb->private; + spin_unlock(&dev->irq_lock); +} + +static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + u16 num_max = 0; + int i; + + for (i = 0; i < dev->vq_num; i++) + if (num_max < dev->vqs[i]->num_max) + num_max = dev->vqs[i]->num_max; + + return num_max; +} + +static u32 vduse_vdpa_get_device_id(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->device_id; +} + +static u32 vduse_vdpa_get_vendor_id(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->vendor_id; +} + +static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->status; +} + +static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + if (vduse_dev_set_status(dev, status)) + return; + + dev->status = status; +} + +static size_t vduse_vdpa_get_config_size(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->config_size; +} + +static void vduse_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset, + void *buf, unsigned int len) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + /* Initialize the buffer in case of partial copy. */ + memset(buf, 0, len); + + if (offset > dev->config_size) + return; + + if (len > dev->config_size - offset) + len = dev->config_size - offset; + + memcpy(buf, dev->config + offset, len); +} + +static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset, + const void *buf, unsigned int len) +{ + /* Now we only support read-only configuration space */ +} + +static int vduse_vdpa_reset(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + int ret = vduse_dev_set_status(dev, 0); + + vduse_dev_reset(dev); + + return ret; +} + +static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->generation; +} + +static int vduse_vdpa_set_vq_affinity(struct vdpa_device *vdpa, u16 idx, + const struct cpumask *cpu_mask) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + if (cpu_mask) + cpumask_copy(&dev->vqs[idx]->irq_affinity, cpu_mask); + else + cpumask_setall(&dev->vqs[idx]->irq_affinity); + + return 0; +} + +static const struct cpumask * +vduse_vdpa_get_vq_affinity(struct vdpa_device *vdpa, u16 idx) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return &dev->vqs[idx]->irq_affinity; +} + +static int vduse_vdpa_set_map(struct vdpa_device *vdpa, + unsigned int asid, + struct vhost_iotlb *iotlb) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + int ret; + + ret = vduse_domain_set_map(dev->domain, iotlb); + if (ret) + return ret; + + ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX); + if (ret) { + vduse_domain_clear_map(dev->domain, iotlb); + return ret; + } + + return 0; +} + +static void vduse_vdpa_free(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + dev->vdev = NULL; +} + +static const struct vdpa_config_ops vduse_vdpa_config_ops = { + .set_vq_address = vduse_vdpa_set_vq_address, + .kick_vq = vduse_vdpa_kick_vq, + .set_vq_cb = vduse_vdpa_set_vq_cb, + .set_vq_num = vduse_vdpa_set_vq_num, + .set_vq_ready = vduse_vdpa_set_vq_ready, + .get_vq_ready = vduse_vdpa_get_vq_ready, + .set_vq_state = vduse_vdpa_set_vq_state, + .get_vq_state = vduse_vdpa_get_vq_state, + .get_vq_align = vduse_vdpa_get_vq_align, + .get_device_features = vduse_vdpa_get_device_features, + .set_driver_features = vduse_vdpa_set_driver_features, + .get_driver_features = vduse_vdpa_get_driver_features, + .set_config_cb = vduse_vdpa_set_config_cb, + .get_vq_num_max = vduse_vdpa_get_vq_num_max, + .get_device_id = vduse_vdpa_get_device_id, + .get_vendor_id = vduse_vdpa_get_vendor_id, + .get_status = vduse_vdpa_get_status, + .set_status = vduse_vdpa_set_status, + .get_config_size = vduse_vdpa_get_config_size, + .get_config = vduse_vdpa_get_config, + .set_config = vduse_vdpa_set_config, + .get_generation = vduse_vdpa_get_generation, + .set_vq_affinity = vduse_vdpa_set_vq_affinity, + .get_vq_affinity = vduse_vdpa_get_vq_affinity, + .reset = vduse_vdpa_reset, + .set_map = vduse_vdpa_set_map, + .free = vduse_vdpa_free, +}; + +static dma_addr_t vduse_dev_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + struct vduse_dev *vdev = dev_to_vduse(dev); + struct vduse_iova_domain *domain = vdev->domain; + + return vduse_domain_map_page(domain, page, offset, size, dir, attrs); +} + +static void vduse_dev_unmap_page(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + struct vduse_dev *vdev = dev_to_vduse(dev); + struct vduse_iova_domain *domain = vdev->domain; + + return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs); +} + +static void *vduse_dev_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_addr, gfp_t flag, + unsigned long attrs) +{ + struct vduse_dev *vdev = dev_to_vduse(dev); + struct vduse_iova_domain *domain = vdev->domain; + unsigned long iova; + void *addr; + + *dma_addr = DMA_MAPPING_ERROR; + addr = vduse_domain_alloc_coherent(domain, size, + (dma_addr_t *)&iova, flag, attrs); + if (!addr) + return NULL; + + *dma_addr = (dma_addr_t)iova; + + return addr; +} + +static void vduse_dev_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_addr, + unsigned long attrs) +{ + struct vduse_dev *vdev = dev_to_vduse(dev); + struct vduse_iova_domain *domain = vdev->domain; + + vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs); +} + +static size_t vduse_dev_max_mapping_size(struct device *dev) +{ + struct vduse_dev *vdev = dev_to_vduse(dev); + struct vduse_iova_domain *domain = vdev->domain; + + return domain->bounce_size; +} + +static const struct dma_map_ops vduse_dev_dma_ops = { + .map_page = vduse_dev_map_page, + .unmap_page = vduse_dev_unmap_page, + .alloc = vduse_dev_alloc_coherent, + .free = vduse_dev_free_coherent, + .max_mapping_size = vduse_dev_max_mapping_size, +}; + +static unsigned int perm_to_file_flags(u8 perm) +{ + unsigned int flags = 0; + + switch (perm) { + case VDUSE_ACCESS_WO: + flags |= O_WRONLY; + break; + case VDUSE_ACCESS_RO: + flags |= O_RDONLY; + break; + case VDUSE_ACCESS_RW: + flags |= O_RDWR; + break; + default: + WARN(1, "invalidate vhost IOTLB permission\n"); + break; + } + + return flags; +} + +static int vduse_kickfd_setup(struct vduse_dev *dev, + struct vduse_vq_eventfd *eventfd) +{ + struct eventfd_ctx *ctx = NULL; + struct vduse_virtqueue *vq; + u32 index; + + if (eventfd->index >= dev->vq_num) + return -EINVAL; + + index = array_index_nospec(eventfd->index, dev->vq_num); + vq = dev->vqs[index]; + if (eventfd->fd >= 0) { + ctx = eventfd_ctx_fdget(eventfd->fd); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + } else if (eventfd->fd != VDUSE_EVENTFD_DEASSIGN) + return 0; + + spin_lock(&vq->kick_lock); + if (vq->kickfd) + eventfd_ctx_put(vq->kickfd); + vq->kickfd = ctx; + if (vq->ready && vq->kicked && vq->kickfd) { + eventfd_signal(vq->kickfd, 1); + vq->kicked = false; + } + spin_unlock(&vq->kick_lock); + + return 0; +} + +static bool vduse_dev_is_ready(struct vduse_dev *dev) +{ + int i; + + for (i = 0; i < dev->vq_num; i++) + if (!dev->vqs[i]->num_max) + return false; + + return true; +} + +static void vduse_dev_irq_inject(struct work_struct *work) +{ + struct vduse_dev *dev = container_of(work, struct vduse_dev, inject); + + spin_lock_bh(&dev->irq_lock); + if (dev->config_cb.callback) + dev->config_cb.callback(dev->config_cb.private); + spin_unlock_bh(&dev->irq_lock); +} + +static void vduse_vq_irq_inject(struct work_struct *work) +{ + struct vduse_virtqueue *vq = container_of(work, + struct vduse_virtqueue, inject); + + spin_lock_bh(&vq->irq_lock); + if (vq->ready && vq->cb.callback) + vq->cb.callback(vq->cb.private); + spin_unlock_bh(&vq->irq_lock); +} + +static bool vduse_vq_signal_irqfd(struct vduse_virtqueue *vq) +{ + bool signal = false; + + if (!vq->cb.trigger) + return false; + + spin_lock_irq(&vq->irq_lock); + if (vq->ready && vq->cb.trigger) { + eventfd_signal(vq->cb.trigger, 1); + signal = true; + } + spin_unlock_irq(&vq->irq_lock); + + return signal; +} + +static int vduse_dev_queue_irq_work(struct vduse_dev *dev, + struct work_struct *irq_work, + int irq_effective_cpu) +{ + int ret = -EINVAL; + + down_read(&dev->rwsem); + if (!(dev->status & VIRTIO_CONFIG_S_DRIVER_OK)) + goto unlock; + + ret = 0; + if (irq_effective_cpu == IRQ_UNBOUND) + queue_work(vduse_irq_wq, irq_work); + else + queue_work_on(irq_effective_cpu, + vduse_irq_bound_wq, irq_work); +unlock: + up_read(&dev->rwsem); + + return ret; +} + +static int vduse_dev_dereg_umem(struct vduse_dev *dev, + u64 iova, u64 size) +{ + int ret; + + mutex_lock(&dev->mem_lock); + ret = -ENOENT; + if (!dev->umem) + goto unlock; + + ret = -EINVAL; + if (!dev->domain) + goto unlock; + + if (dev->umem->iova != iova || size != dev->domain->bounce_size) + goto unlock; + + vduse_domain_remove_user_bounce_pages(dev->domain); + unpin_user_pages_dirty_lock(dev->umem->pages, + dev->umem->npages, true); + atomic64_sub(dev->umem->npages, &dev->umem->mm->pinned_vm); + mmdrop(dev->umem->mm); + vfree(dev->umem->pages); + kfree(dev->umem); + dev->umem = NULL; + ret = 0; +unlock: + mutex_unlock(&dev->mem_lock); + return ret; +} + +static int vduse_dev_reg_umem(struct vduse_dev *dev, + u64 iova, u64 uaddr, u64 size) +{ + struct page **page_list = NULL; + struct vduse_umem *umem = NULL; + long pinned = 0; + unsigned long npages, lock_limit; + int ret; + + if (!dev->domain || !dev->domain->bounce_map || + size != dev->domain->bounce_size || + iova != 0 || uaddr & ~PAGE_MASK) + return -EINVAL; + + mutex_lock(&dev->mem_lock); + ret = -EEXIST; + if (dev->umem) + goto unlock; + + ret = -ENOMEM; + npages = size >> PAGE_SHIFT; + page_list = __vmalloc(array_size(npages, sizeof(struct page *)), + GFP_KERNEL_ACCOUNT); + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!page_list || !umem) + goto unlock; + + mmap_read_lock(current->mm); + + lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK)); + if (npages + atomic64_read(¤t->mm->pinned_vm) > lock_limit) + goto out; + + pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE, + page_list); + if (pinned != npages) { + ret = pinned < 0 ? pinned : -ENOMEM; + goto out; + } + + ret = vduse_domain_add_user_bounce_pages(dev->domain, + page_list, pinned); + if (ret) + goto out; + + atomic64_add(npages, ¤t->mm->pinned_vm); + + umem->pages = page_list; + umem->npages = pinned; + umem->iova = iova; + umem->mm = current->mm; + mmgrab(current->mm); + + dev->umem = umem; +out: + if (ret && pinned > 0) + unpin_user_pages(page_list, pinned); + + mmap_read_unlock(current->mm); +unlock: + if (ret) { + vfree(page_list); + kfree(umem); + } + mutex_unlock(&dev->mem_lock); + return ret; +} + +static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq) +{ + int curr_cpu = vq->irq_effective_cpu; + + while (true) { + curr_cpu = cpumask_next(curr_cpu, &vq->irq_affinity); + if (cpu_online(curr_cpu)) + break; + + if (curr_cpu >= nr_cpu_ids) + curr_cpu = IRQ_UNBOUND; + } + + vq->irq_effective_cpu = curr_cpu; +} + +static long vduse_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct vduse_dev *dev = file->private_data; + void __user *argp = (void __user *)arg; + int ret; + + if (unlikely(dev->broken)) + return -EPERM; + + switch (cmd) { + case VDUSE_IOTLB_GET_FD: { + struct vduse_iotlb_entry entry; + struct vhost_iotlb_map *map; + struct vdpa_map_file *map_file; + struct file *f = NULL; + + ret = -EFAULT; + if (copy_from_user(&entry, argp, sizeof(entry))) + break; + + ret = -EINVAL; + if (entry.start > entry.last) + break; + + mutex_lock(&dev->domain_lock); + if (!dev->domain) { + mutex_unlock(&dev->domain_lock); + break; + } + spin_lock(&dev->domain->iotlb_lock); + map = vhost_iotlb_itree_first(dev->domain->iotlb, + entry.start, entry.last); + if (map) { + map_file = (struct vdpa_map_file *)map->opaque; + f = get_file(map_file->file); + entry.offset = map_file->offset; + entry.start = map->start; + entry.last = map->last; + entry.perm = map->perm; + } + spin_unlock(&dev->domain->iotlb_lock); + mutex_unlock(&dev->domain_lock); + ret = -EINVAL; + if (!f) + break; + + ret = -EFAULT; + if (copy_to_user(argp, &entry, sizeof(entry))) { + fput(f); + break; + } + ret = receive_fd(f, perm_to_file_flags(entry.perm)); + fput(f); + break; + } + case VDUSE_DEV_GET_FEATURES: + /* + * Just mirror what driver wrote here. + * The driver is expected to check FEATURE_OK later. + */ + ret = put_user(dev->driver_features, (u64 __user *)argp); + break; + case VDUSE_DEV_SET_CONFIG: { + struct vduse_config_data config; + unsigned long size = offsetof(struct vduse_config_data, + buffer); + + ret = -EFAULT; + if (copy_from_user(&config, argp, size)) + break; + + ret = -EINVAL; + if (config.offset > dev->config_size || + config.length == 0 || + config.length > dev->config_size - config.offset) + break; + + ret = -EFAULT; + if (copy_from_user(dev->config + config.offset, argp + size, + config.length)) + break; + + ret = 0; + break; + } + case VDUSE_DEV_INJECT_CONFIG_IRQ: + ret = vduse_dev_queue_irq_work(dev, &dev->inject, IRQ_UNBOUND); + break; + case VDUSE_VQ_SETUP: { + struct vduse_vq_config config; + u32 index; + + ret = -EFAULT; + if (copy_from_user(&config, argp, sizeof(config))) + break; + + ret = -EINVAL; + if (config.index >= dev->vq_num) + break; + + if (!is_mem_zero((const char *)config.reserved, + sizeof(config.reserved))) + break; + + index = array_index_nospec(config.index, dev->vq_num); + dev->vqs[index]->num_max = config.max_size; + ret = 0; + break; + } + case VDUSE_VQ_GET_INFO: { + struct vduse_vq_info vq_info; + struct vduse_virtqueue *vq; + u32 index; + + ret = -EFAULT; + if (copy_from_user(&vq_info, argp, sizeof(vq_info))) + break; + + ret = -EINVAL; + if (vq_info.index >= dev->vq_num) + break; + + index = array_index_nospec(vq_info.index, dev->vq_num); + vq = dev->vqs[index]; + vq_info.desc_addr = vq->desc_addr; + vq_info.driver_addr = vq->driver_addr; + vq_info.device_addr = vq->device_addr; + vq_info.num = vq->num; + + if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { + vq_info.packed.last_avail_counter = + vq->state.packed.last_avail_counter; + vq_info.packed.last_avail_idx = + vq->state.packed.last_avail_idx; + vq_info.packed.last_used_counter = + vq->state.packed.last_used_counter; + vq_info.packed.last_used_idx = + vq->state.packed.last_used_idx; + } else + vq_info.split.avail_index = + vq->state.split.avail_index; + + vq_info.ready = vq->ready; + + ret = -EFAULT; + if (copy_to_user(argp, &vq_info, sizeof(vq_info))) + break; + + ret = 0; + break; + } + case VDUSE_VQ_SETUP_KICKFD: { + struct vduse_vq_eventfd eventfd; + + ret = -EFAULT; + if (copy_from_user(&eventfd, argp, sizeof(eventfd))) + break; + + ret = vduse_kickfd_setup(dev, &eventfd); + break; + } + case VDUSE_VQ_INJECT_IRQ: { + u32 index; + + ret = -EFAULT; + if (get_user(index, (u32 __user *)argp)) + break; + + ret = -EINVAL; + if (index >= dev->vq_num) + break; + + ret = 0; + index = array_index_nospec(index, dev->vq_num); + if (!vduse_vq_signal_irqfd(dev->vqs[index])) { + vduse_vq_update_effective_cpu(dev->vqs[index]); + ret = vduse_dev_queue_irq_work(dev, + &dev->vqs[index]->inject, + dev->vqs[index]->irq_effective_cpu); + } + break; + } + case VDUSE_IOTLB_REG_UMEM: { + struct vduse_iova_umem umem; + + ret = -EFAULT; + if (copy_from_user(&umem, argp, sizeof(umem))) + break; + + ret = -EINVAL; + if (!is_mem_zero((const char *)umem.reserved, + sizeof(umem.reserved))) + break; + + mutex_lock(&dev->domain_lock); + ret = vduse_dev_reg_umem(dev, umem.iova, + umem.uaddr, umem.size); + mutex_unlock(&dev->domain_lock); + break; + } + case VDUSE_IOTLB_DEREG_UMEM: { + struct vduse_iova_umem umem; + + ret = -EFAULT; + if (copy_from_user(&umem, argp, sizeof(umem))) + break; + + ret = -EINVAL; + if (!is_mem_zero((const char *)umem.reserved, + sizeof(umem.reserved))) + break; + mutex_lock(&dev->domain_lock); + ret = vduse_dev_dereg_umem(dev, umem.iova, + umem.size); + mutex_unlock(&dev->domain_lock); + break; + } + case VDUSE_IOTLB_GET_INFO: { + struct vduse_iova_info info; + struct vhost_iotlb_map *map; + + ret = -EFAULT; + if (copy_from_user(&info, argp, sizeof(info))) + break; + + ret = -EINVAL; + if (info.start > info.last) + break; + + if (!is_mem_zero((const char *)info.reserved, + sizeof(info.reserved))) + break; + + mutex_lock(&dev->domain_lock); + if (!dev->domain) { + mutex_unlock(&dev->domain_lock); + break; + } + spin_lock(&dev->domain->iotlb_lock); + map = vhost_iotlb_itree_first(dev->domain->iotlb, + info.start, info.last); + if (map) { + info.start = map->start; + info.last = map->last; + info.capability = 0; + if (dev->domain->bounce_map && map->start == 0 && + map->last == dev->domain->bounce_size - 1) + info.capability |= VDUSE_IOVA_CAP_UMEM; + } + spin_unlock(&dev->domain->iotlb_lock); + mutex_unlock(&dev->domain_lock); + if (!map) + break; + + ret = -EFAULT; + if (copy_to_user(argp, &info, sizeof(info))) + break; + + ret = 0; + break; + } + default: + ret = -ENOIOCTLCMD; + break; + } + + return ret; +} + +static int vduse_dev_release(struct inode *inode, struct file *file) +{ + struct vduse_dev *dev = file->private_data; + + mutex_lock(&dev->domain_lock); + if (dev->domain) + vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size); + mutex_unlock(&dev->domain_lock); + spin_lock(&dev->msg_lock); + /* Make sure the inflight messages can processed after reconncection */ + list_splice_init(&dev->recv_list, &dev->send_list); + spin_unlock(&dev->msg_lock); + dev->connected = false; + + return 0; +} + +static struct vduse_dev *vduse_dev_get_from_minor(int minor) +{ + struct vduse_dev *dev; + + mutex_lock(&vduse_lock); + dev = idr_find(&vduse_idr, minor); + mutex_unlock(&vduse_lock); + + return dev; +} + +static int vduse_dev_open(struct inode *inode, struct file *file) +{ + int ret; + struct vduse_dev *dev = vduse_dev_get_from_minor(iminor(inode)); + + if (!dev) + return -ENODEV; + + ret = -EBUSY; + mutex_lock(&dev->lock); + if (dev->connected) + goto unlock; + + ret = 0; + dev->connected = true; + file->private_data = dev; +unlock: + mutex_unlock(&dev->lock); + + return ret; +} + +static const struct file_operations vduse_dev_fops = { + .owner = THIS_MODULE, + .open = vduse_dev_open, + .release = vduse_dev_release, + .read_iter = vduse_dev_read_iter, + .write_iter = vduse_dev_write_iter, + .poll = vduse_dev_poll, + .unlocked_ioctl = vduse_dev_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .llseek = noop_llseek, +}; + +static ssize_t irq_cb_affinity_show(struct vduse_virtqueue *vq, char *buf) +{ + return sprintf(buf, "%*pb\n", cpumask_pr_args(&vq->irq_affinity)); +} + +static ssize_t irq_cb_affinity_store(struct vduse_virtqueue *vq, + const char *buf, size_t count) +{ + cpumask_var_t new_value; + int ret; + + if (!zalloc_cpumask_var(&new_value, GFP_KERNEL)) + return -ENOMEM; + + ret = cpumask_parse(buf, new_value); + if (ret) + goto free_mask; + + ret = -EINVAL; + if (!cpumask_intersects(new_value, cpu_online_mask)) + goto free_mask; + + cpumask_copy(&vq->irq_affinity, new_value); + ret = count; +free_mask: + free_cpumask_var(new_value); + return ret; +} + +struct vq_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct vduse_virtqueue *vq, char *buf); + ssize_t (*store)(struct vduse_virtqueue *vq, const char *buf, + size_t count); +}; + +static struct vq_sysfs_entry irq_cb_affinity_attr = __ATTR_RW(irq_cb_affinity); + +static struct attribute *vq_attrs[] = { + &irq_cb_affinity_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(vq); + +static ssize_t vq_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct vduse_virtqueue *vq = container_of(kobj, + struct vduse_virtqueue, kobj); + struct vq_sysfs_entry *entry = container_of(attr, + struct vq_sysfs_entry, attr); + + if (!entry->show) + return -EIO; + + return entry->show(vq, buf); +} + +static ssize_t vq_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct vduse_virtqueue *vq = container_of(kobj, + struct vduse_virtqueue, kobj); + struct vq_sysfs_entry *entry = container_of(attr, + struct vq_sysfs_entry, attr); + + if (!entry->store) + return -EIO; + + return entry->store(vq, buf, count); +} + +static const struct sysfs_ops vq_sysfs_ops = { + .show = vq_attr_show, + .store = vq_attr_store, +}; + +static void vq_release(struct kobject *kobj) +{ + struct vduse_virtqueue *vq = container_of(kobj, + struct vduse_virtqueue, kobj); + kfree(vq); +} + +static const struct kobj_type vq_type = { + .release = vq_release, + .sysfs_ops = &vq_sysfs_ops, + .default_groups = vq_groups, +}; + +static void vduse_dev_deinit_vqs(struct vduse_dev *dev) +{ + int i; + + if (!dev->vqs) + return; + + for (i = 0; i < dev->vq_num; i++) + kobject_put(&dev->vqs[i]->kobj); + kfree(dev->vqs); +} + +static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num) +{ + int ret, i; + + dev->vq_align = vq_align; + dev->vq_num = vq_num; + dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL); + if (!dev->vqs) + return -ENOMEM; + + for (i = 0; i < vq_num; i++) { + dev->vqs[i] = kzalloc(sizeof(*dev->vqs[i]), GFP_KERNEL); + if (!dev->vqs[i]) { + ret = -ENOMEM; + goto err; + } + + dev->vqs[i]->index = i; + dev->vqs[i]->irq_effective_cpu = IRQ_UNBOUND; + INIT_WORK(&dev->vqs[i]->inject, vduse_vq_irq_inject); + INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work); + spin_lock_init(&dev->vqs[i]->kick_lock); + spin_lock_init(&dev->vqs[i]->irq_lock); + cpumask_setall(&dev->vqs[i]->irq_affinity); + + kobject_init(&dev->vqs[i]->kobj, &vq_type); + ret = kobject_add(&dev->vqs[i]->kobj, + &dev->dev->kobj, "vq%d", i); + if (ret) { + kfree(dev->vqs[i]); + goto err; + } + } + + return 0; +err: + while (i--) + kobject_put(&dev->vqs[i]->kobj); + kfree(dev->vqs); + dev->vqs = NULL; + return ret; +} + +static struct vduse_dev *vduse_dev_create(void) +{ + struct vduse_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL); + + if (!dev) + return NULL; + + mutex_init(&dev->lock); + mutex_init(&dev->mem_lock); + mutex_init(&dev->domain_lock); + spin_lock_init(&dev->msg_lock); + INIT_LIST_HEAD(&dev->send_list); + INIT_LIST_HEAD(&dev->recv_list); + spin_lock_init(&dev->irq_lock); + init_rwsem(&dev->rwsem); + + INIT_WORK(&dev->inject, vduse_dev_irq_inject); + init_waitqueue_head(&dev->waitq); + + return dev; +} + +static void vduse_dev_destroy(struct vduse_dev *dev) +{ + kfree(dev); +} + +static struct vduse_dev *vduse_find_dev(const char *name) +{ + struct vduse_dev *dev; + int id; + + idr_for_each_entry(&vduse_idr, dev, id) + if (!strcmp(dev->name, name)) + return dev; + + return NULL; +} + +static int vduse_destroy_dev(char *name) +{ + struct vduse_dev *dev = vduse_find_dev(name); + + if (!dev) + return -EINVAL; + + mutex_lock(&dev->lock); + if (dev->vdev || dev->connected) { + mutex_unlock(&dev->lock); + return -EBUSY; + } + dev->connected = true; + mutex_unlock(&dev->lock); + + vduse_dev_reset(dev); + device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor)); + idr_remove(&vduse_idr, dev->minor); + kvfree(dev->config); + vduse_dev_deinit_vqs(dev); + if (dev->domain) + vduse_domain_destroy(dev->domain); + kfree(dev->name); + vduse_dev_destroy(dev); + module_put(THIS_MODULE); + + return 0; +} + +static bool device_is_allowed(u32 device_id) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(allowed_device_id); i++) + if (allowed_device_id[i] == device_id) + return true; + + return false; +} + +static bool features_is_valid(u64 features) +{ + if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) + return false; + + /* Now we only support read-only configuration space */ + if (features & (1ULL << VIRTIO_BLK_F_CONFIG_WCE)) + return false; + + return true; +} + +static bool vduse_validate_config(struct vduse_dev_config *config) +{ + if (!is_mem_zero((const char *)config->reserved, + sizeof(config->reserved))) + return false; + + if (config->vq_align > PAGE_SIZE) + return false; + + if (config->config_size > PAGE_SIZE) + return false; + + if (config->vq_num > 0xffff) + return false; + + if (!config->name[0]) + return false; + + if (!device_is_allowed(config->device_id)) + return false; + + if (!features_is_valid(config->features)) + return false; + + return true; +} + +static ssize_t msg_timeout_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct vduse_dev *dev = dev_get_drvdata(device); + + return sysfs_emit(buf, "%u\n", dev->msg_timeout); +} + +static ssize_t msg_timeout_store(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct vduse_dev *dev = dev_get_drvdata(device); + int ret; + + ret = kstrtouint(buf, 10, &dev->msg_timeout); + if (ret < 0) + return ret; + + return count; +} + +static DEVICE_ATTR_RW(msg_timeout); + +static ssize_t bounce_size_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct vduse_dev *dev = dev_get_drvdata(device); + + return sysfs_emit(buf, "%u\n", dev->bounce_size); +} + +static ssize_t bounce_size_store(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct vduse_dev *dev = dev_get_drvdata(device); + unsigned int bounce_size; + int ret; + + ret = -EPERM; + mutex_lock(&dev->domain_lock); + if (dev->domain) + goto unlock; + + ret = kstrtouint(buf, 10, &bounce_size); + if (ret < 0) + goto unlock; + + ret = -EINVAL; + if (bounce_size > VDUSE_MAX_BOUNCE_SIZE || + bounce_size < VDUSE_MIN_BOUNCE_SIZE) + goto unlock; + + dev->bounce_size = bounce_size & PAGE_MASK; + ret = count; +unlock: + mutex_unlock(&dev->domain_lock); + return ret; +} + +static DEVICE_ATTR_RW(bounce_size); + +static struct attribute *vduse_dev_attrs[] = { + &dev_attr_msg_timeout.attr, + &dev_attr_bounce_size.attr, + NULL +}; + +ATTRIBUTE_GROUPS(vduse_dev); + +static int vduse_create_dev(struct vduse_dev_config *config, + void *config_buf, u64 api_version) +{ + int ret; + struct vduse_dev *dev; + + ret = -EEXIST; + if (vduse_find_dev(config->name)) + goto err; + + ret = -ENOMEM; + dev = vduse_dev_create(); + if (!dev) + goto err; + + dev->api_version = api_version; + dev->device_features = config->features; + dev->device_id = config->device_id; + dev->vendor_id = config->vendor_id; + dev->name = kstrdup(config->name, GFP_KERNEL); + if (!dev->name) + goto err_str; + + dev->bounce_size = VDUSE_BOUNCE_SIZE; + dev->config = config_buf; + dev->config_size = config->config_size; + + ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL); + if (ret < 0) + goto err_idr; + + dev->minor = ret; + dev->msg_timeout = VDUSE_MSG_DEFAULT_TIMEOUT; + dev->dev = device_create_with_groups(vduse_class, NULL, + MKDEV(MAJOR(vduse_major), dev->minor), + dev, vduse_dev_groups, "%s", config->name); + if (IS_ERR(dev->dev)) { + ret = PTR_ERR(dev->dev); + goto err_dev; + } + + ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num); + if (ret) + goto err_vqs; + + __module_get(THIS_MODULE); + + return 0; +err_vqs: + device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor)); +err_dev: + idr_remove(&vduse_idr, dev->minor); +err_idr: + kfree(dev->name); +err_str: + vduse_dev_destroy(dev); +err: + return ret; +} + +static long vduse_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int ret; + void __user *argp = (void __user *)arg; + struct vduse_control *control = file->private_data; + + mutex_lock(&vduse_lock); + switch (cmd) { + case VDUSE_GET_API_VERSION: + ret = put_user(control->api_version, (u64 __user *)argp); + break; + case VDUSE_SET_API_VERSION: { + u64 api_version; + + ret = -EFAULT; + if (get_user(api_version, (u64 __user *)argp)) + break; + + ret = -EINVAL; + if (api_version > VDUSE_API_VERSION) + break; + + ret = 0; + control->api_version = api_version; + break; + } + case VDUSE_CREATE_DEV: { + struct vduse_dev_config config; + unsigned long size = offsetof(struct vduse_dev_config, config); + void *buf; + + ret = -EFAULT; + if (copy_from_user(&config, argp, size)) + break; + + ret = -EINVAL; + if (vduse_validate_config(&config) == false) + break; + + buf = vmemdup_user(argp + size, config.config_size); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + break; + } + config.name[VDUSE_NAME_MAX - 1] = '\0'; + ret = vduse_create_dev(&config, buf, control->api_version); + if (ret) + kvfree(buf); + break; + } + case VDUSE_DESTROY_DEV: { + char name[VDUSE_NAME_MAX]; + + ret = -EFAULT; + if (copy_from_user(name, argp, VDUSE_NAME_MAX)) + break; + + name[VDUSE_NAME_MAX - 1] = '\0'; + ret = vduse_destroy_dev(name); + break; + } + default: + ret = -EINVAL; + break; + } + mutex_unlock(&vduse_lock); + + return ret; +} + +static int vduse_release(struct inode *inode, struct file *file) +{ + struct vduse_control *control = file->private_data; + + kfree(control); + return 0; +} + +static int vduse_open(struct inode *inode, struct file *file) +{ + struct vduse_control *control; + + control = kmalloc(sizeof(struct vduse_control), GFP_KERNEL); + if (!control) + return -ENOMEM; + + control->api_version = VDUSE_API_VERSION; + file->private_data = control; + + return 0; +} + +static const struct file_operations vduse_ctrl_fops = { + .owner = THIS_MODULE, + .open = vduse_open, + .release = vduse_release, + .unlocked_ioctl = vduse_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .llseek = noop_llseek, +}; + +static char *vduse_devnode(const struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "vduse/%s", dev_name(dev)); +} + +struct vduse_mgmt_dev { + struct vdpa_mgmt_dev mgmt_dev; + struct device dev; +}; + +static struct vduse_mgmt_dev *vduse_mgmt; + +static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) +{ + struct vduse_vdpa *vdev; + int ret; + + if (dev->vdev) + return -EEXIST; + + vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev, + &vduse_vdpa_config_ops, 1, 1, name, true); + if (IS_ERR(vdev)) + return PTR_ERR(vdev); + + dev->vdev = vdev; + vdev->dev = dev; + vdev->vdpa.dev.dma_mask = &vdev->vdpa.dev.coherent_dma_mask; + ret = dma_set_mask_and_coherent(&vdev->vdpa.dev, DMA_BIT_MASK(64)); + if (ret) { + put_device(&vdev->vdpa.dev); + return ret; + } + set_dma_ops(&vdev->vdpa.dev, &vduse_dev_dma_ops); + vdev->vdpa.dma_dev = &vdev->vdpa.dev; + vdev->vdpa.mdev = &vduse_mgmt->mgmt_dev; + + return 0; +} + +static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, + const struct vdpa_dev_set_config *config) +{ + struct vduse_dev *dev; + int ret; + + mutex_lock(&vduse_lock); + dev = vduse_find_dev(name); + if (!dev || !vduse_dev_is_ready(dev)) { + mutex_unlock(&vduse_lock); + return -EINVAL; + } + ret = vduse_dev_init_vdpa(dev, name); + mutex_unlock(&vduse_lock); + if (ret) + return ret; + + mutex_lock(&dev->domain_lock); + if (!dev->domain) + dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1, + dev->bounce_size); + mutex_unlock(&dev->domain_lock); + if (!dev->domain) { + put_device(&dev->vdev->vdpa.dev); + return -ENOMEM; + } + + ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num); + if (ret) { + put_device(&dev->vdev->vdpa.dev); + mutex_lock(&dev->domain_lock); + vduse_domain_destroy(dev->domain); + dev->domain = NULL; + mutex_unlock(&dev->domain_lock); + return ret; + } + + return 0; +} + +static void vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev) +{ + _vdpa_unregister_device(dev); +} + +static const struct vdpa_mgmtdev_ops vdpa_dev_mgmtdev_ops = { + .dev_add = vdpa_dev_add, + .dev_del = vdpa_dev_del, +}; + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static void vduse_mgmtdev_release(struct device *dev) +{ + struct vduse_mgmt_dev *mgmt_dev; + + mgmt_dev = container_of(dev, struct vduse_mgmt_dev, dev); + kfree(mgmt_dev); +} + +static int vduse_mgmtdev_init(void) +{ + int ret; + + vduse_mgmt = kzalloc(sizeof(*vduse_mgmt), GFP_KERNEL); + if (!vduse_mgmt) + return -ENOMEM; + + ret = dev_set_name(&vduse_mgmt->dev, "vduse"); + if (ret) { + kfree(vduse_mgmt); + return ret; + } + + vduse_mgmt->dev.release = vduse_mgmtdev_release; + + ret = device_register(&vduse_mgmt->dev); + if (ret) + goto dev_reg_err; + + vduse_mgmt->mgmt_dev.id_table = id_table; + vduse_mgmt->mgmt_dev.ops = &vdpa_dev_mgmtdev_ops; + vduse_mgmt->mgmt_dev.device = &vduse_mgmt->dev; + ret = vdpa_mgmtdev_register(&vduse_mgmt->mgmt_dev); + if (ret) + device_unregister(&vduse_mgmt->dev); + + return ret; + +dev_reg_err: + put_device(&vduse_mgmt->dev); + return ret; +} + +static void vduse_mgmtdev_exit(void) +{ + vdpa_mgmtdev_unregister(&vduse_mgmt->mgmt_dev); + device_unregister(&vduse_mgmt->dev); +} + +static int vduse_init(void) +{ + int ret; + struct device *dev; + + vduse_class = class_create("vduse"); + if (IS_ERR(vduse_class)) + return PTR_ERR(vduse_class); + + vduse_class->devnode = vduse_devnode; + + ret = alloc_chrdev_region(&vduse_major, 0, VDUSE_DEV_MAX, "vduse"); + if (ret) + goto err_chardev_region; + + /* /dev/vduse/control */ + cdev_init(&vduse_ctrl_cdev, &vduse_ctrl_fops); + vduse_ctrl_cdev.owner = THIS_MODULE; + ret = cdev_add(&vduse_ctrl_cdev, vduse_major, 1); + if (ret) + goto err_ctrl_cdev; + + dev = device_create(vduse_class, NULL, vduse_major, NULL, "control"); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto err_device; + } + + /* /dev/vduse/$DEVICE */ + cdev_init(&vduse_cdev, &vduse_dev_fops); + vduse_cdev.owner = THIS_MODULE; + ret = cdev_add(&vduse_cdev, MKDEV(MAJOR(vduse_major), 1), + VDUSE_DEV_MAX - 1); + if (ret) + goto err_cdev; + + ret = -ENOMEM; + vduse_irq_wq = alloc_workqueue("vduse-irq", + WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0); + if (!vduse_irq_wq) + goto err_wq; + + vduse_irq_bound_wq = alloc_workqueue("vduse-irq-bound", WQ_HIGHPRI, 0); + if (!vduse_irq_bound_wq) + goto err_bound_wq; + + ret = vduse_domain_init(); + if (ret) + goto err_domain; + + ret = vduse_mgmtdev_init(); + if (ret) + goto err_mgmtdev; + + return 0; +err_mgmtdev: + vduse_domain_exit(); +err_domain: + destroy_workqueue(vduse_irq_bound_wq); +err_bound_wq: + destroy_workqueue(vduse_irq_wq); +err_wq: + cdev_del(&vduse_cdev); +err_cdev: + device_destroy(vduse_class, vduse_major); +err_device: + cdev_del(&vduse_ctrl_cdev); +err_ctrl_cdev: + unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX); +err_chardev_region: + class_destroy(vduse_class); + return ret; +} +module_init(vduse_init); + +static void vduse_exit(void) +{ + vduse_mgmtdev_exit(); + vduse_domain_exit(); + destroy_workqueue(vduse_irq_bound_wq); + destroy_workqueue(vduse_irq_wq); + cdev_del(&vduse_cdev); + device_destroy(vduse_class, vduse_major); + cdev_del(&vduse_ctrl_cdev); + unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX); + class_destroy(vduse_class); +} +module_exit(vduse_exit); + +MODULE_LICENSE(DRV_LICENSE); +MODULE_AUTHOR(DRV_AUTHOR); +MODULE_DESCRIPTION(DRV_DESC); diff --git a/drivers/vdpa/virtio_pci/Makefile b/drivers/vdpa/virtio_pci/Makefile new file mode 100644 index 0000000000..231088d3af --- /dev/null +++ b/drivers/vdpa/virtio_pci/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_VP_VDPA) += vp_vdpa.o diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c new file mode 100644 index 0000000000..281287fae8 --- /dev/null +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -0,0 +1,667 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vDPA bridge driver for modern virtio-pci device + * + * Copyright (c) 2020, Red Hat Inc. All rights reserved. + * Author: Jason Wang <jasowang@redhat.com> + * + * Based on virtio_pci_modern.c. + */ + +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/vdpa.h> +#include <linux/virtio.h> +#include <linux/virtio_config.h> +#include <linux/virtio_ring.h> +#include <linux/virtio_pci.h> +#include <linux/virtio_pci_modern.h> +#include <uapi/linux/vdpa.h> + +#define VP_VDPA_QUEUE_MAX 256 +#define VP_VDPA_DRIVER_NAME "vp_vdpa" +#define VP_VDPA_NAME_SIZE 256 + +struct vp_vring { + void __iomem *notify; + char msix_name[VP_VDPA_NAME_SIZE]; + struct vdpa_callback cb; + resource_size_t notify_pa; + int irq; +}; + +struct vp_vdpa { + struct vdpa_device vdpa; + struct virtio_pci_modern_device *mdev; + struct vp_vring *vring; + struct vdpa_callback config_cb; + u64 device_features; + char msix_name[VP_VDPA_NAME_SIZE]; + int config_irq; + int queues; + int vectors; +}; + +struct vp_vdpa_mgmtdev { + struct vdpa_mgmt_dev mgtdev; + struct virtio_pci_modern_device *mdev; + struct vp_vdpa *vp_vdpa; +}; + +static struct vp_vdpa *vdpa_to_vp(struct vdpa_device *vdpa) +{ + return container_of(vdpa, struct vp_vdpa, vdpa); +} + +static struct virtio_pci_modern_device *vdpa_to_mdev(struct vdpa_device *vdpa) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + + return vp_vdpa->mdev; +} + +static struct virtio_pci_modern_device *vp_vdpa_to_mdev(struct vp_vdpa *vp_vdpa) +{ + return vp_vdpa->mdev; +} + +static u64 vp_vdpa_get_device_features(struct vdpa_device *vdpa) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + + return vp_vdpa->device_features; +} + +static int vp_vdpa_set_driver_features(struct vdpa_device *vdpa, u64 features) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + vp_modern_set_features(mdev, features); + + return 0; +} + +static u64 vp_vdpa_get_driver_features(struct vdpa_device *vdpa) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + return vp_modern_get_driver_features(mdev); +} + +static u8 vp_vdpa_get_status(struct vdpa_device *vdpa) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + return vp_modern_get_status(mdev); +} + +static int vp_vdpa_get_vq_irq(struct vdpa_device *vdpa, u16 idx) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + int irq = vp_vdpa->vring[idx].irq; + + if (irq == VIRTIO_MSI_NO_VECTOR) + return -EINVAL; + + return irq; +} + +static void vp_vdpa_free_irq(struct vp_vdpa *vp_vdpa) +{ + struct virtio_pci_modern_device *mdev = vp_vdpa_to_mdev(vp_vdpa); + struct pci_dev *pdev = mdev->pci_dev; + int i; + + for (i = 0; i < vp_vdpa->queues; i++) { + if (vp_vdpa->vring[i].irq != VIRTIO_MSI_NO_VECTOR) { + vp_modern_queue_vector(mdev, i, VIRTIO_MSI_NO_VECTOR); + devm_free_irq(&pdev->dev, vp_vdpa->vring[i].irq, + &vp_vdpa->vring[i]); + vp_vdpa->vring[i].irq = VIRTIO_MSI_NO_VECTOR; + } + } + + if (vp_vdpa->config_irq != VIRTIO_MSI_NO_VECTOR) { + vp_modern_config_vector(mdev, VIRTIO_MSI_NO_VECTOR); + devm_free_irq(&pdev->dev, vp_vdpa->config_irq, vp_vdpa); + vp_vdpa->config_irq = VIRTIO_MSI_NO_VECTOR; + } + + if (vp_vdpa->vectors) { + pci_free_irq_vectors(pdev); + vp_vdpa->vectors = 0; + } +} + +static irqreturn_t vp_vdpa_vq_handler(int irq, void *arg) +{ + struct vp_vring *vring = arg; + + if (vring->cb.callback) + return vring->cb.callback(vring->cb.private); + + return IRQ_HANDLED; +} + +static irqreturn_t vp_vdpa_config_handler(int irq, void *arg) +{ + struct vp_vdpa *vp_vdpa = arg; + + if (vp_vdpa->config_cb.callback) + return vp_vdpa->config_cb.callback(vp_vdpa->config_cb.private); + + return IRQ_HANDLED; +} + +static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa) +{ + struct virtio_pci_modern_device *mdev = vp_vdpa_to_mdev(vp_vdpa); + struct pci_dev *pdev = mdev->pci_dev; + int i, ret, irq; + int queues = vp_vdpa->queues; + int vectors = queues + 1; + + ret = pci_alloc_irq_vectors(pdev, vectors, vectors, PCI_IRQ_MSIX); + if (ret != vectors) { + dev_err(&pdev->dev, + "vp_vdpa: fail to allocate irq vectors want %d but %d\n", + vectors, ret); + return ret; + } + + vp_vdpa->vectors = vectors; + + for (i = 0; i < queues; i++) { + snprintf(vp_vdpa->vring[i].msix_name, VP_VDPA_NAME_SIZE, + "vp-vdpa[%s]-%d\n", pci_name(pdev), i); + irq = pci_irq_vector(pdev, i); + ret = devm_request_irq(&pdev->dev, irq, + vp_vdpa_vq_handler, + 0, vp_vdpa->vring[i].msix_name, + &vp_vdpa->vring[i]); + if (ret) { + dev_err(&pdev->dev, + "vp_vdpa: fail to request irq for vq %d\n", i); + goto err; + } + vp_modern_queue_vector(mdev, i, i); + vp_vdpa->vring[i].irq = irq; + } + + snprintf(vp_vdpa->msix_name, VP_VDPA_NAME_SIZE, "vp-vdpa[%s]-config\n", + pci_name(pdev)); + irq = pci_irq_vector(pdev, queues); + ret = devm_request_irq(&pdev->dev, irq, vp_vdpa_config_handler, 0, + vp_vdpa->msix_name, vp_vdpa); + if (ret) { + dev_err(&pdev->dev, + "vp_vdpa: fail to request irq for vq %d\n", i); + goto err; + } + vp_modern_config_vector(mdev, queues); + vp_vdpa->config_irq = irq; + + return 0; +err: + vp_vdpa_free_irq(vp_vdpa); + return ret; +} + +static void vp_vdpa_set_status(struct vdpa_device *vdpa, u8 status) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + struct virtio_pci_modern_device *mdev = vp_vdpa_to_mdev(vp_vdpa); + u8 s = vp_vdpa_get_status(vdpa); + + if (status & VIRTIO_CONFIG_S_DRIVER_OK && + !(s & VIRTIO_CONFIG_S_DRIVER_OK)) { + vp_vdpa_request_irq(vp_vdpa); + } + + vp_modern_set_status(mdev, status); +} + +static int vp_vdpa_reset(struct vdpa_device *vdpa) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + struct virtio_pci_modern_device *mdev = vp_vdpa_to_mdev(vp_vdpa); + u8 s = vp_vdpa_get_status(vdpa); + + vp_modern_set_status(mdev, 0); + + if (s & VIRTIO_CONFIG_S_DRIVER_OK) + vp_vdpa_free_irq(vp_vdpa); + + return 0; +} + +static u16 vp_vdpa_get_vq_num_max(struct vdpa_device *vdpa) +{ + return VP_VDPA_QUEUE_MAX; +} + +static int vp_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 qid, + struct vdpa_vq_state *state) +{ + /* Note that this is not supported by virtio specification, so + * we return -EOPNOTSUPP here. This means we can't support live + * migration, vhost device start/stop. + */ + return -EOPNOTSUPP; +} + +static int vp_vdpa_set_vq_state_split(struct vdpa_device *vdpa, + const struct vdpa_vq_state *state) +{ + const struct vdpa_vq_state_split *split = &state->split; + + if (split->avail_index == 0) + return 0; + + return -EOPNOTSUPP; +} + +static int vp_vdpa_set_vq_state_packed(struct vdpa_device *vdpa, + const struct vdpa_vq_state *state) +{ + const struct vdpa_vq_state_packed *packed = &state->packed; + + if (packed->last_avail_counter == 1 && + packed->last_avail_idx == 0 && + packed->last_used_counter == 1 && + packed->last_used_idx == 0) + return 0; + + return -EOPNOTSUPP; +} + +static int vp_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 qid, + const struct vdpa_vq_state *state) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + /* Note that this is not supported by virtio specification. + * But if the state is by chance equal to the device initial + * state, we can let it go. + */ + if ((vp_modern_get_status(mdev) & VIRTIO_CONFIG_S_FEATURES_OK) && + !vp_modern_get_queue_enable(mdev, qid)) { + if (vp_modern_get_driver_features(mdev) & + BIT_ULL(VIRTIO_F_RING_PACKED)) + return vp_vdpa_set_vq_state_packed(vdpa, state); + else + return vp_vdpa_set_vq_state_split(vdpa, state); + } + + return -EOPNOTSUPP; +} + +static void vp_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 qid, + struct vdpa_callback *cb) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + + vp_vdpa->vring[qid].cb = *cb; +} + +static void vp_vdpa_set_vq_ready(struct vdpa_device *vdpa, + u16 qid, bool ready) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + vp_modern_set_queue_enable(mdev, qid, ready); +} + +static bool vp_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 qid) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + return vp_modern_get_queue_enable(mdev, qid); +} + +static void vp_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 qid, + u32 num) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + vp_modern_set_queue_size(mdev, qid, num); +} + +static int vp_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 qid, + u64 desc_area, u64 driver_area, + u64 device_area) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + vp_modern_queue_address(mdev, qid, desc_area, + driver_area, device_area); + + return 0; +} + +static void vp_vdpa_kick_vq(struct vdpa_device *vdpa, u16 qid) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + + vp_iowrite16(qid, vp_vdpa->vring[qid].notify); +} + +static u32 vp_vdpa_get_generation(struct vdpa_device *vdpa) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + return vp_modern_generation(mdev); +} + +static u32 vp_vdpa_get_device_id(struct vdpa_device *vdpa) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + return mdev->id.device; +} + +static u32 vp_vdpa_get_vendor_id(struct vdpa_device *vdpa) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + return mdev->id.vendor; +} + +static u32 vp_vdpa_get_vq_align(struct vdpa_device *vdpa) +{ + return PAGE_SIZE; +} + +static size_t vp_vdpa_get_config_size(struct vdpa_device *vdpa) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + return mdev->device_len; +} + +static void vp_vdpa_get_config(struct vdpa_device *vdpa, + unsigned int offset, + void *buf, unsigned int len) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + struct virtio_pci_modern_device *mdev = vp_vdpa_to_mdev(vp_vdpa); + u8 old, new; + u8 *p; + int i; + + do { + old = vp_ioread8(&mdev->common->config_generation); + p = buf; + for (i = 0; i < len; i++) + *p++ = vp_ioread8(mdev->device + offset + i); + + new = vp_ioread8(&mdev->common->config_generation); + } while (old != new); +} + +static void vp_vdpa_set_config(struct vdpa_device *vdpa, + unsigned int offset, const void *buf, + unsigned int len) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + struct virtio_pci_modern_device *mdev = vp_vdpa_to_mdev(vp_vdpa); + const u8 *p = buf; + int i; + + for (i = 0; i < len; i++) + vp_iowrite8(*p++, mdev->device + offset + i); +} + +static void vp_vdpa_set_config_cb(struct vdpa_device *vdpa, + struct vdpa_callback *cb) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + + vp_vdpa->config_cb = *cb; +} + +static struct vdpa_notification_area +vp_vdpa_get_vq_notification(struct vdpa_device *vdpa, u16 qid) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + struct virtio_pci_modern_device *mdev = vp_vdpa_to_mdev(vp_vdpa); + struct vdpa_notification_area notify; + + notify.addr = vp_vdpa->vring[qid].notify_pa; + notify.size = mdev->notify_offset_multiplier; + + return notify; +} + +static const struct vdpa_config_ops vp_vdpa_ops = { + .get_device_features = vp_vdpa_get_device_features, + .set_driver_features = vp_vdpa_set_driver_features, + .get_driver_features = vp_vdpa_get_driver_features, + .get_status = vp_vdpa_get_status, + .set_status = vp_vdpa_set_status, + .reset = vp_vdpa_reset, + .get_vq_num_max = vp_vdpa_get_vq_num_max, + .get_vq_state = vp_vdpa_get_vq_state, + .get_vq_notification = vp_vdpa_get_vq_notification, + .set_vq_state = vp_vdpa_set_vq_state, + .set_vq_cb = vp_vdpa_set_vq_cb, + .set_vq_ready = vp_vdpa_set_vq_ready, + .get_vq_ready = vp_vdpa_get_vq_ready, + .set_vq_num = vp_vdpa_set_vq_num, + .set_vq_address = vp_vdpa_set_vq_address, + .kick_vq = vp_vdpa_kick_vq, + .get_generation = vp_vdpa_get_generation, + .get_device_id = vp_vdpa_get_device_id, + .get_vendor_id = vp_vdpa_get_vendor_id, + .get_vq_align = vp_vdpa_get_vq_align, + .get_config_size = vp_vdpa_get_config_size, + .get_config = vp_vdpa_get_config, + .set_config = vp_vdpa_set_config, + .set_config_cb = vp_vdpa_set_config_cb, + .get_vq_irq = vp_vdpa_get_vq_irq, +}; + +static void vp_vdpa_free_irq_vectors(void *data) +{ + pci_free_irq_vectors(data); +} + +static int vp_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, + const struct vdpa_dev_set_config *add_config) +{ + struct vp_vdpa_mgmtdev *vp_vdpa_mgtdev = + container_of(v_mdev, struct vp_vdpa_mgmtdev, mgtdev); + + struct virtio_pci_modern_device *mdev = vp_vdpa_mgtdev->mdev; + struct pci_dev *pdev = mdev->pci_dev; + struct device *dev = &pdev->dev; + struct vp_vdpa *vp_vdpa = NULL; + u64 device_features; + int ret, i; + + vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa, + dev, &vp_vdpa_ops, 1, 1, name, false); + + if (IS_ERR(vp_vdpa)) { + dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n"); + return PTR_ERR(vp_vdpa); + } + + vp_vdpa_mgtdev->vp_vdpa = vp_vdpa; + + vp_vdpa->vdpa.dma_dev = &pdev->dev; + vp_vdpa->queues = vp_modern_get_num_queues(mdev); + vp_vdpa->mdev = mdev; + + device_features = vp_modern_get_features(mdev); + if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) { + if (add_config->device_features & ~device_features) { + ret = -EINVAL; + dev_err(&pdev->dev, "Try to provision features " + "that are not supported by the device: " + "device_features 0x%llx provisioned 0x%llx\n", + device_features, add_config->device_features); + goto err; + } + device_features = add_config->device_features; + } + vp_vdpa->device_features = device_features; + + ret = devm_add_action_or_reset(dev, vp_vdpa_free_irq_vectors, pdev); + if (ret) { + dev_err(&pdev->dev, + "Failed for adding devres for freeing irq vectors\n"); + goto err; + } + + vp_vdpa->vring = devm_kcalloc(&pdev->dev, vp_vdpa->queues, + sizeof(*vp_vdpa->vring), + GFP_KERNEL); + if (!vp_vdpa->vring) { + ret = -ENOMEM; + dev_err(&pdev->dev, "Fail to allocate virtqueues\n"); + goto err; + } + + for (i = 0; i < vp_vdpa->queues; i++) { + vp_vdpa->vring[i].irq = VIRTIO_MSI_NO_VECTOR; + vp_vdpa->vring[i].notify = + vp_modern_map_vq_notify(mdev, i, + &vp_vdpa->vring[i].notify_pa); + if (!vp_vdpa->vring[i].notify) { + ret = -EINVAL; + dev_warn(&pdev->dev, "Fail to map vq notify %d\n", i); + goto err; + } + } + vp_vdpa->config_irq = VIRTIO_MSI_NO_VECTOR; + + vp_vdpa->vdpa.mdev = &vp_vdpa_mgtdev->mgtdev; + ret = _vdpa_register_device(&vp_vdpa->vdpa, vp_vdpa->queues); + if (ret) { + dev_err(&pdev->dev, "Failed to register to vdpa bus\n"); + goto err; + } + + return 0; + +err: + put_device(&vp_vdpa->vdpa.dev); + return ret; +} + +static void vp_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, + struct vdpa_device *dev) +{ + struct vp_vdpa_mgmtdev *vp_vdpa_mgtdev = + container_of(v_mdev, struct vp_vdpa_mgmtdev, mgtdev); + + struct vp_vdpa *vp_vdpa = vp_vdpa_mgtdev->vp_vdpa; + + _vdpa_unregister_device(&vp_vdpa->vdpa); + vp_vdpa_mgtdev->vp_vdpa = NULL; +} + +static const struct vdpa_mgmtdev_ops vp_vdpa_mdev_ops = { + .dev_add = vp_vdpa_dev_add, + .dev_del = vp_vdpa_dev_del, +}; + +static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct vp_vdpa_mgmtdev *vp_vdpa_mgtdev = NULL; + struct vdpa_mgmt_dev *mgtdev; + struct device *dev = &pdev->dev; + struct virtio_pci_modern_device *mdev = NULL; + struct virtio_device_id *mdev_id = NULL; + int err; + + vp_vdpa_mgtdev = kzalloc(sizeof(*vp_vdpa_mgtdev), GFP_KERNEL); + if (!vp_vdpa_mgtdev) + return -ENOMEM; + + mgtdev = &vp_vdpa_mgtdev->mgtdev; + mgtdev->ops = &vp_vdpa_mdev_ops; + mgtdev->device = dev; + + mdev = kzalloc(sizeof(struct virtio_pci_modern_device), GFP_KERNEL); + if (!mdev) { + err = -ENOMEM; + goto mdev_err; + } + + mdev_id = kzalloc(sizeof(struct virtio_device_id), GFP_KERNEL); + if (!mdev_id) { + err = -ENOMEM; + goto mdev_id_err; + } + + vp_vdpa_mgtdev->mdev = mdev; + mdev->pci_dev = pdev; + + err = pcim_enable_device(pdev); + if (err) { + goto probe_err; + } + + err = vp_modern_probe(mdev); + if (err) { + dev_err(&pdev->dev, "Failed to probe modern PCI device\n"); + goto probe_err; + } + + mdev_id->device = mdev->id.device; + mdev_id->vendor = mdev->id.vendor; + mgtdev->id_table = mdev_id; + mgtdev->max_supported_vqs = vp_modern_get_num_queues(mdev); + mgtdev->supported_features = vp_modern_get_features(mdev); + mgtdev->config_attr_mask = (1 << VDPA_ATTR_DEV_FEATURES); + pci_set_master(pdev); + pci_set_drvdata(pdev, vp_vdpa_mgtdev); + + err = vdpa_mgmtdev_register(mgtdev); + if (err) { + dev_err(&pdev->dev, "Failed to register vdpa mgmtdev device\n"); + goto register_err; + } + + return 0; + +register_err: + vp_modern_remove(vp_vdpa_mgtdev->mdev); +probe_err: + kfree(mdev_id); +mdev_id_err: + kfree(mdev); +mdev_err: + kfree(vp_vdpa_mgtdev); + return err; +} + +static void vp_vdpa_remove(struct pci_dev *pdev) +{ + struct vp_vdpa_mgmtdev *vp_vdpa_mgtdev = pci_get_drvdata(pdev); + struct virtio_pci_modern_device *mdev = NULL; + + mdev = vp_vdpa_mgtdev->mdev; + vdpa_mgmtdev_unregister(&vp_vdpa_mgtdev->mgtdev); + vp_modern_remove(mdev); + kfree(vp_vdpa_mgtdev->mgtdev.id_table); + kfree(mdev); + kfree(vp_vdpa_mgtdev); +} + +static struct pci_driver vp_vdpa_driver = { + .name = "vp-vdpa", + .id_table = NULL, /* only dynamic ids */ + .probe = vp_vdpa_probe, + .remove = vp_vdpa_remove, +}; + +module_pci_driver(vp_vdpa_driver); + +MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>"); +MODULE_DESCRIPTION("vp-vdpa"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1"); |