summaryrefslogtreecommitdiffstats
path: root/drivers/virtio
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-11 08:27:49 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-11 08:27:49 +0000
commitace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch)
treeb2d64bc10158fdd5497876388cd68142ca374ed3 /drivers/virtio
parentInitial commit. (diff)
downloadlinux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz
linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip
Adding upstream version 6.6.15.upstream/6.6.15
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/virtio')
-rw-r--r--drivers/virtio/Kconfig176
-rw-r--r--drivers/virtio/Makefile14
-rw-r--r--drivers/virtio/virtio.c571
-rw-r--r--drivers/virtio/virtio_anchor.c18
-rw-r--r--drivers/virtio/virtio_balloon.c1132
-rw-r--r--drivers/virtio/virtio_dma_buf.c89
-rw-r--r--drivers/virtio/virtio_input.c414
-rw-r--r--drivers/virtio/virtio_mem.c3008
-rw-r--r--drivers/virtio/virtio_mmio.c877
-rw-r--r--drivers/virtio/virtio_pci_common.c650
-rw-r--r--drivers/virtio/virtio_pci_common.h142
-rw-r--r--drivers/virtio/virtio_pci_legacy.c236
-rw-r--r--drivers/virtio/virtio_pci_legacy_dev.c222
-rw-r--r--drivers/virtio/virtio_pci_modern.c566
-rw-r--r--drivers/virtio/virtio_pci_modern_dev.c720
-rw-r--r--drivers/virtio/virtio_ring.c3252
-rw-r--r--drivers/virtio/virtio_vdpa.c548
17 files changed, 12635 insertions, 0 deletions
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
new file mode 100644
index 000000000..0a53a6123
--- /dev/null
+++ b/drivers/virtio/Kconfig
@@ -0,0 +1,176 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config VIRTIO_ANCHOR
+ bool
+
+config VIRTIO
+ tristate
+ select VIRTIO_ANCHOR
+ help
+ This option is selected by any driver which implements the virtio
+ bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG
+ or CONFIG_S390_GUEST.
+
+config VIRTIO_PCI_LIB
+ tristate
+ help
+ Modern PCI device implementation. This module implements the
+ basic probe and control for devices which are based on modern
+ PCI device with possible vendor specific extensions. Any
+ module that selects this module must depend on PCI.
+
+config VIRTIO_PCI_LIB_LEGACY
+ tristate
+ help
+ Legacy PCI device (Virtio PCI Card 0.9.x Draft and older device)
+ implementation.
+ This module implements the basic probe and control for devices
+ which are based on legacy PCI device. Any module that selects this
+ module must depend on PCI.
+
+menuconfig VIRTIO_MENU
+ bool "Virtio drivers"
+ default y
+
+if VIRTIO_MENU
+
+config VIRTIO_HARDEN_NOTIFICATION
+ bool "Harden virtio notification"
+ depends on BROKEN
+ help
+ Enable this to harden the device notifications and suppress
+ those that happen at a time where notifications are illegal.
+
+ Experimental: Note that several drivers still have issues that
+ may cause crashes or hangs when correct handling of
+ notifications is enforced; depending on the subset of
+ drivers and devices you use, this may or may not work.
+
+ If unsure, say N.
+
+config VIRTIO_PCI
+ tristate "PCI driver for virtio devices"
+ depends on PCI
+ select VIRTIO_PCI_LIB
+ select VIRTIO
+ help
+ This driver provides support for virtio based paravirtual device
+ drivers over PCI. This requires that your VMM has appropriate PCI
+ virtio backends. Most QEMU based VMMs should support these devices
+ (like KVM or Xen).
+
+ If unsure, say M.
+
+config VIRTIO_PCI_LEGACY
+ bool "Support for legacy virtio draft 0.9.X and older devices"
+ default y
+ depends on VIRTIO_PCI
+ select VIRTIO_PCI_LIB_LEGACY
+ help
+ Virtio PCI Card 0.9.X Draft (circa 2014) and older device support.
+
+ This option enables building a transitional driver, supporting
+ both devices conforming to Virtio 1 specification, and legacy devices.
+ If disabled, you get a slightly smaller, non-transitional driver,
+ with no legacy compatibility.
+
+ So look out into your driveway. Do you have a flying car? If
+ so, you can happily disable this option and virtio will not
+ break. Otherwise, leave it set. Unless you're testing what
+ life will be like in The Future.
+
+ If unsure, say Y.
+
+config VIRTIO_VDPA
+ tristate "vDPA driver for virtio devices"
+ depends on VDPA
+ select VIRTIO
+ help
+ This driver provides support for virtio based paravirtual
+ device driver over vDPA bus. For this to be useful, you need
+ an appropriate vDPA device implementation that operates on a
+ physical device to allow the datapath of virtio to be
+ offloaded to hardware.
+
+ If unsure, say M.
+
+config VIRTIO_PMEM
+ tristate "Support for virtio pmem driver"
+ depends on VIRTIO
+ depends on LIBNVDIMM
+ help
+ This driver provides access to virtio-pmem devices, storage devices
+ that are mapped into the physical address space - similar to NVDIMMs
+ - with a virtio-based flushing interface.
+
+ If unsure, say Y.
+
+config VIRTIO_BALLOON
+ tristate "Virtio balloon driver"
+ depends on VIRTIO
+ select MEMORY_BALLOON
+ select PAGE_REPORTING
+ help
+ This driver supports increasing and decreasing the amount
+ of memory within a KVM guest.
+
+ If unsure, say M.
+
+config VIRTIO_MEM
+ tristate "Virtio mem driver"
+ depends on X86_64 || ARM64
+ depends on VIRTIO
+ depends on MEMORY_HOTPLUG
+ depends on MEMORY_HOTREMOVE
+ depends on CONTIG_ALLOC
+ depends on EXCLUSIVE_SYSTEM_RAM
+ help
+ This driver provides access to virtio-mem paravirtualized memory
+ devices, allowing to hotplug and hotunplug memory.
+
+ This driver currently only supports x86-64 and arm64. Although it
+ should compile on other architectures that implement memory
+ hot(un)plug, architecture-specific and/or common
+ code changes may be required for virtio-mem, kdump and kexec to work as
+ expected.
+
+ If unsure, say M.
+
+config VIRTIO_INPUT
+ tristate "Virtio input driver"
+ depends on VIRTIO
+ depends on INPUT
+ help
+ This driver supports virtio input devices such as
+ keyboards, mice and tablets.
+
+ If unsure, say M.
+
+config VIRTIO_MMIO
+ tristate "Platform bus driver for memory mapped virtio devices"
+ depends on HAS_IOMEM && HAS_DMA
+ select VIRTIO
+ help
+ This driver provides support for memory mapped virtio
+ platform device driver.
+
+ If unsure, say N.
+
+config VIRTIO_MMIO_CMDLINE_DEVICES
+ bool "Memory mapped virtio devices parameter parsing"
+ depends on VIRTIO_MMIO
+ help
+ Allow virtio-mmio device instantiation via the kernel command line
+ or module parameters. Be aware that using incorrect parameters (base
+ address in particular) can crash your system - you have been warned.
+ See Documentation/admin-guide/kernel-parameters.rst for details.
+
+ If unsure, say 'N'.
+
+config VIRTIO_DMA_SHARED_BUFFER
+ tristate
+ depends on DMA_SHARED_BUFFER
+ help
+ This option adds a flavor of dma buffers that are backed by
+ virtio resources.
+
+endif # VIRTIO_MENU
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
new file mode 100644
index 000000000..8e98d2491
--- /dev/null
+++ b/drivers/virtio/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o
+obj-$(CONFIG_VIRTIO_ANCHOR) += virtio_anchor.o
+obj-$(CONFIG_VIRTIO_PCI_LIB) += virtio_pci_modern_dev.o
+obj-$(CONFIG_VIRTIO_PCI_LIB_LEGACY) += virtio_pci_legacy_dev.o
+obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o
+obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o
+virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
+virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
+obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
+obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o
+obj-$(CONFIG_VIRTIO_VDPA) += virtio_vdpa.o
+obj-$(CONFIG_VIRTIO_MEM) += virtio_mem.o
+obj-$(CONFIG_VIRTIO_DMA_SHARED_BUFFER) += virtio_dma_buf.o
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
new file mode 100644
index 000000000..3893dc29e
--- /dev/null
+++ b/drivers/virtio/virtio.c
@@ -0,0 +1,571 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/virtio.h>
+#include <linux/spinlock.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_anchor.h>
+#include <linux/module.h>
+#include <linux/idr.h>
+#include <linux/of.h>
+#include <uapi/linux/virtio_ids.h>
+
+/* Unique numbering for virtio devices. */
+static DEFINE_IDA(virtio_index_ida);
+
+static ssize_t device_show(struct device *_d,
+ struct device_attribute *attr, char *buf)
+{
+ struct virtio_device *dev = dev_to_virtio(_d);
+ return sysfs_emit(buf, "0x%04x\n", dev->id.device);
+}
+static DEVICE_ATTR_RO(device);
+
+static ssize_t vendor_show(struct device *_d,
+ struct device_attribute *attr, char *buf)
+{
+ struct virtio_device *dev = dev_to_virtio(_d);
+ return sysfs_emit(buf, "0x%04x\n", dev->id.vendor);
+}
+static DEVICE_ATTR_RO(vendor);
+
+static ssize_t status_show(struct device *_d,
+ struct device_attribute *attr, char *buf)
+{
+ struct virtio_device *dev = dev_to_virtio(_d);
+ return sysfs_emit(buf, "0x%08x\n", dev->config->get_status(dev));
+}
+static DEVICE_ATTR_RO(status);
+
+static ssize_t modalias_show(struct device *_d,
+ struct device_attribute *attr, char *buf)
+{
+ struct virtio_device *dev = dev_to_virtio(_d);
+ return sysfs_emit(buf, "virtio:d%08Xv%08X\n",
+ dev->id.device, dev->id.vendor);
+}
+static DEVICE_ATTR_RO(modalias);
+
+static ssize_t features_show(struct device *_d,
+ struct device_attribute *attr, char *buf)
+{
+ struct virtio_device *dev = dev_to_virtio(_d);
+ unsigned int i;
+ ssize_t len = 0;
+
+ /* We actually represent this as a bitstring, as it could be
+ * arbitrary length in future. */
+ for (i = 0; i < sizeof(dev->features)*8; i++)
+ len += sysfs_emit_at(buf, len, "%c",
+ __virtio_test_bit(dev, i) ? '1' : '0');
+ len += sysfs_emit_at(buf, len, "\n");
+ return len;
+}
+static DEVICE_ATTR_RO(features);
+
+static struct attribute *virtio_dev_attrs[] = {
+ &dev_attr_device.attr,
+ &dev_attr_vendor.attr,
+ &dev_attr_status.attr,
+ &dev_attr_modalias.attr,
+ &dev_attr_features.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(virtio_dev);
+
+static inline int virtio_id_match(const struct virtio_device *dev,
+ const struct virtio_device_id *id)
+{
+ if (id->device != dev->id.device && id->device != VIRTIO_DEV_ANY_ID)
+ return 0;
+
+ return id->vendor == VIRTIO_DEV_ANY_ID || id->vendor == dev->id.vendor;
+}
+
+/* This looks through all the IDs a driver claims to support. If any of them
+ * match, we return 1 and the kernel will call virtio_dev_probe(). */
+static int virtio_dev_match(struct device *_dv, struct device_driver *_dr)
+{
+ unsigned int i;
+ struct virtio_device *dev = dev_to_virtio(_dv);
+ const struct virtio_device_id *ids;
+
+ ids = drv_to_virtio(_dr)->id_table;
+ for (i = 0; ids[i].device; i++)
+ if (virtio_id_match(dev, &ids[i]))
+ return 1;
+ return 0;
+}
+
+static int virtio_uevent(const struct device *_dv, struct kobj_uevent_env *env)
+{
+ const struct virtio_device *dev = dev_to_virtio(_dv);
+
+ return add_uevent_var(env, "MODALIAS=virtio:d%08Xv%08X",
+ dev->id.device, dev->id.vendor);
+}
+
+void virtio_check_driver_offered_feature(const struct virtio_device *vdev,
+ unsigned int fbit)
+{
+ unsigned int i;
+ struct virtio_driver *drv = drv_to_virtio(vdev->dev.driver);
+
+ for (i = 0; i < drv->feature_table_size; i++)
+ if (drv->feature_table[i] == fbit)
+ return;
+
+ if (drv->feature_table_legacy) {
+ for (i = 0; i < drv->feature_table_size_legacy; i++)
+ if (drv->feature_table_legacy[i] == fbit)
+ return;
+ }
+
+ BUG();
+}
+EXPORT_SYMBOL_GPL(virtio_check_driver_offered_feature);
+
+static void __virtio_config_changed(struct virtio_device *dev)
+{
+ struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);
+
+ if (!dev->config_enabled)
+ dev->config_change_pending = true;
+ else if (drv && drv->config_changed)
+ drv->config_changed(dev);
+}
+
+void virtio_config_changed(struct virtio_device *dev)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->config_lock, flags);
+ __virtio_config_changed(dev);
+ spin_unlock_irqrestore(&dev->config_lock, flags);
+}
+EXPORT_SYMBOL_GPL(virtio_config_changed);
+
+static void virtio_config_disable(struct virtio_device *dev)
+{
+ spin_lock_irq(&dev->config_lock);
+ dev->config_enabled = false;
+ spin_unlock_irq(&dev->config_lock);
+}
+
+static void virtio_config_enable(struct virtio_device *dev)
+{
+ spin_lock_irq(&dev->config_lock);
+ dev->config_enabled = true;
+ if (dev->config_change_pending)
+ __virtio_config_changed(dev);
+ dev->config_change_pending = false;
+ spin_unlock_irq(&dev->config_lock);
+}
+
+void virtio_add_status(struct virtio_device *dev, unsigned int status)
+{
+ might_sleep();
+ dev->config->set_status(dev, dev->config->get_status(dev) | status);
+}
+EXPORT_SYMBOL_GPL(virtio_add_status);
+
+/* Do some validation, then set FEATURES_OK */
+static int virtio_features_ok(struct virtio_device *dev)
+{
+ unsigned int status;
+
+ might_sleep();
+
+ if (virtio_check_mem_acc_cb(dev)) {
+ if (!virtio_has_feature(dev, VIRTIO_F_VERSION_1)) {
+ dev_warn(&dev->dev,
+ "device must provide VIRTIO_F_VERSION_1\n");
+ return -ENODEV;
+ }
+
+ if (!virtio_has_feature(dev, VIRTIO_F_ACCESS_PLATFORM)) {
+ dev_warn(&dev->dev,
+ "device must provide VIRTIO_F_ACCESS_PLATFORM\n");
+ return -ENODEV;
+ }
+ }
+
+ if (!virtio_has_feature(dev, VIRTIO_F_VERSION_1))
+ return 0;
+
+ virtio_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
+ status = dev->config->get_status(dev);
+ if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) {
+ dev_err(&dev->dev, "virtio: device refuses features: %x\n",
+ status);
+ return -ENODEV;
+ }
+ return 0;
+}
+
+/**
+ * virtio_reset_device - quiesce device for removal
+ * @dev: the device to reset
+ *
+ * Prevents device from sending interrupts and accessing memory.
+ *
+ * Generally used for cleanup during driver / device removal.
+ *
+ * Once this has been invoked, caller must ensure that
+ * virtqueue_notify / virtqueue_kick are not in progress.
+ *
+ * Note: this guarantees that vq callbacks are not in progress, however caller
+ * is responsible for preventing access from other contexts, such as a system
+ * call/workqueue/bh. Invoking virtio_break_device then flushing any such
+ * contexts is one way to handle that.
+ * */
+void virtio_reset_device(struct virtio_device *dev)
+{
+#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
+ /*
+ * The below virtio_synchronize_cbs() guarantees that any
+ * interrupt for this line arriving after
+ * virtio_synchronize_cbs() has completed is guaranteed to see
+ * vq->broken as true.
+ */
+ virtio_break_device(dev);
+ virtio_synchronize_cbs(dev);
+#endif
+
+ dev->config->reset(dev);
+}
+EXPORT_SYMBOL_GPL(virtio_reset_device);
+
+static int virtio_dev_probe(struct device *_d)
+{
+ int err, i;
+ struct virtio_device *dev = dev_to_virtio(_d);
+ struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);
+ u64 device_features;
+ u64 driver_features;
+ u64 driver_features_legacy;
+
+ /* We have a driver! */
+ virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);
+
+ /* Figure out what features the device supports. */
+ device_features = dev->config->get_features(dev);
+
+ /* Figure out what features the driver supports. */
+ driver_features = 0;
+ for (i = 0; i < drv->feature_table_size; i++) {
+ unsigned int f = drv->feature_table[i];
+ BUG_ON(f >= 64);
+ driver_features |= (1ULL << f);
+ }
+
+ /* Some drivers have a separate feature table for legacy (pre-1.0) devices */
+ if (drv->feature_table_legacy) {
+ driver_features_legacy = 0;
+ for (i = 0; i < drv->feature_table_size_legacy; i++) {
+ unsigned int f = drv->feature_table_legacy[i];
+ BUG_ON(f >= 64);
+ driver_features_legacy |= (1ULL << f);
+ }
+ } else {
+ driver_features_legacy = driver_features;
+ }
+
+ if (device_features & (1ULL << VIRTIO_F_VERSION_1))
+ dev->features = driver_features & device_features;
+ else
+ dev->features = driver_features_legacy & device_features;
+
+ /* Transport features always preserved to pass to finalize_features. */
+ for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++)
+ if (device_features & (1ULL << i))
+ __virtio_set_bit(dev, i);
+
+ err = dev->config->finalize_features(dev);
+ if (err)
+ goto err;
+
+ if (drv->validate) {
+ u64 features = dev->features;
+
+ err = drv->validate(dev);
+ if (err)
+ goto err;
+
+ /* Did validation change any features? Then write them again. */
+ if (features != dev->features) {
+ err = dev->config->finalize_features(dev);
+ if (err)
+ goto err;
+ }
+ }
+
+ err = virtio_features_ok(dev);
+ if (err)
+ goto err;
+
+ err = drv->probe(dev);
+ if (err)
+ goto err;
+
+ /* If probe didn't do it, mark device DRIVER_OK ourselves. */
+ if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK))
+ virtio_device_ready(dev);
+
+ if (drv->scan)
+ drv->scan(dev);
+
+ virtio_config_enable(dev);
+
+ return 0;
+err:
+ virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
+ return err;
+
+}
+
+static void virtio_dev_remove(struct device *_d)
+{
+ struct virtio_device *dev = dev_to_virtio(_d);
+ struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);
+
+ virtio_config_disable(dev);
+
+ drv->remove(dev);
+
+ /* Driver should have reset device. */
+ WARN_ON_ONCE(dev->config->get_status(dev));
+
+ /* Acknowledge the device's existence again. */
+ virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+
+ of_node_put(dev->dev.of_node);
+}
+
+static struct bus_type virtio_bus = {
+ .name = "virtio",
+ .match = virtio_dev_match,
+ .dev_groups = virtio_dev_groups,
+ .uevent = virtio_uevent,
+ .probe = virtio_dev_probe,
+ .remove = virtio_dev_remove,
+};
+
+int register_virtio_driver(struct virtio_driver *driver)
+{
+ /* Catch this early. */
+ BUG_ON(driver->feature_table_size && !driver->feature_table);
+ driver->driver.bus = &virtio_bus;
+ return driver_register(&driver->driver);
+}
+EXPORT_SYMBOL_GPL(register_virtio_driver);
+
+void unregister_virtio_driver(struct virtio_driver *driver)
+{
+ driver_unregister(&driver->driver);
+}
+EXPORT_SYMBOL_GPL(unregister_virtio_driver);
+
+static int virtio_device_of_init(struct virtio_device *dev)
+{
+ struct device_node *np, *pnode = dev_of_node(dev->dev.parent);
+ char compat[] = "virtio,deviceXXXXXXXX";
+ int ret, count;
+
+ if (!pnode)
+ return 0;
+
+ count = of_get_available_child_count(pnode);
+ if (!count)
+ return 0;
+
+ /* There can be only 1 child node */
+ if (WARN_ON(count > 1))
+ return -EINVAL;
+
+ np = of_get_next_available_child(pnode, NULL);
+ if (WARN_ON(!np))
+ return -ENODEV;
+
+ ret = snprintf(compat, sizeof(compat), "virtio,device%x", dev->id.device);
+ BUG_ON(ret >= sizeof(compat));
+
+ /*
+ * On powerpc/pseries virtio devices are PCI devices so PCI
+ * vendor/device ids play the role of the "compatible" property.
+ * Simply don't init of_node in this case.
+ */
+ if (!of_device_is_compatible(np, compat)) {
+ ret = 0;
+ goto out;
+ }
+
+ dev->dev.of_node = np;
+ return 0;
+
+out:
+ of_node_put(np);
+ return ret;
+}
+
+/**
+ * register_virtio_device - register virtio device
+ * @dev : virtio device to be registered
+ *
+ * On error, the caller must call put_device on &@dev->dev (and not kfree),
+ * as another code path may have obtained a reference to @dev.
+ *
+ * Returns: 0 on success, -error on failure
+ */
+int register_virtio_device(struct virtio_device *dev)
+{
+ int err;
+
+ dev->dev.bus = &virtio_bus;
+ device_initialize(&dev->dev);
+
+ /* Assign a unique device index and hence name. */
+ err = ida_alloc(&virtio_index_ida, GFP_KERNEL);
+ if (err < 0)
+ goto out;
+
+ dev->index = err;
+ err = dev_set_name(&dev->dev, "virtio%u", dev->index);
+ if (err)
+ goto out_ida_remove;
+
+ err = virtio_device_of_init(dev);
+ if (err)
+ goto out_ida_remove;
+
+ spin_lock_init(&dev->config_lock);
+ dev->config_enabled = false;
+ dev->config_change_pending = false;
+
+ INIT_LIST_HEAD(&dev->vqs);
+ spin_lock_init(&dev->vqs_list_lock);
+
+ /* We always start by resetting the device, in case a previous
+ * driver messed it up. This also tests that code path a little. */
+ virtio_reset_device(dev);
+
+ /* Acknowledge that we've seen the device. */
+ virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+
+ /*
+ * device_add() causes the bus infrastructure to look for a matching
+ * driver.
+ */
+ err = device_add(&dev->dev);
+ if (err)
+ goto out_of_node_put;
+
+ return 0;
+
+out_of_node_put:
+ of_node_put(dev->dev.of_node);
+out_ida_remove:
+ ida_free(&virtio_index_ida, dev->index);
+out:
+ virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
+ return err;
+}
+EXPORT_SYMBOL_GPL(register_virtio_device);
+
+bool is_virtio_device(struct device *dev)
+{
+ return dev->bus == &virtio_bus;
+}
+EXPORT_SYMBOL_GPL(is_virtio_device);
+
+void unregister_virtio_device(struct virtio_device *dev)
+{
+ int index = dev->index; /* save for after device release */
+
+ device_unregister(&dev->dev);
+ ida_free(&virtio_index_ida, index);
+}
+EXPORT_SYMBOL_GPL(unregister_virtio_device);
+
+#ifdef CONFIG_PM_SLEEP
+int virtio_device_freeze(struct virtio_device *dev)
+{
+ struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);
+
+ virtio_config_disable(dev);
+
+ dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED;
+
+ if (drv && drv->freeze)
+ return drv->freeze(dev);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_device_freeze);
+
+int virtio_device_restore(struct virtio_device *dev)
+{
+ struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);
+ int ret;
+
+ /* We always start by resetting the device, in case a previous
+ * driver messed it up. */
+ virtio_reset_device(dev);
+
+ /* Acknowledge that we've seen the device. */
+ virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+
+ /* Maybe driver failed before freeze.
+ * Restore the failed status, for debugging. */
+ if (dev->failed)
+ virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
+
+ if (!drv)
+ return 0;
+
+ /* We have a driver! */
+ virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);
+
+ ret = dev->config->finalize_features(dev);
+ if (ret)
+ goto err;
+
+ ret = virtio_features_ok(dev);
+ if (ret)
+ goto err;
+
+ if (drv->restore) {
+ ret = drv->restore(dev);
+ if (ret)
+ goto err;
+ }
+
+ /* If restore didn't do it, mark device DRIVER_OK ourselves. */
+ if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK))
+ virtio_device_ready(dev);
+
+ virtio_config_enable(dev);
+
+ return 0;
+
+err:
+ virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(virtio_device_restore);
+#endif
+
+static int virtio_init(void)
+{
+ if (bus_register(&virtio_bus) != 0)
+ panic("virtio bus registration failed");
+ return 0;
+}
+
+static void __exit virtio_exit(void)
+{
+ bus_unregister(&virtio_bus);
+ ida_destroy(&virtio_index_ida);
+}
+core_initcall(virtio_init);
+module_exit(virtio_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/virtio/virtio_anchor.c b/drivers/virtio/virtio_anchor.c
new file mode 100644
index 000000000..4d6a5d269
--- /dev/null
+++ b/drivers/virtio/virtio_anchor.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/virtio.h>
+#include <linux/virtio_anchor.h>
+
+bool virtio_require_restricted_mem_acc(struct virtio_device *dev)
+{
+ return true;
+}
+EXPORT_SYMBOL_GPL(virtio_require_restricted_mem_acc);
+
+static bool virtio_no_restricted_mem_acc(struct virtio_device *dev)
+{
+ return false;
+}
+
+bool (*virtio_check_mem_acc_cb)(struct virtio_device *dev) =
+ virtio_no_restricted_mem_acc;
+EXPORT_SYMBOL_GPL(virtio_check_mem_acc_cb);
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
new file mode 100644
index 000000000..2d5d252ef
--- /dev/null
+++ b/drivers/virtio/virtio_balloon.c
@@ -0,0 +1,1132 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Virtio balloon implementation, inspired by Dor Laor and Marcelo
+ * Tosatti's implementations.
+ *
+ * Copyright 2008 Rusty Russell IBM Corporation
+ */
+
+#include <linux/virtio.h>
+#include <linux/virtio_balloon.h>
+#include <linux/swap.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/balloon_compaction.h>
+#include <linux/oom.h>
+#include <linux/wait.h>
+#include <linux/mm.h>
+#include <linux/page_reporting.h>
+
+/*
+ * Balloon device works in 4K page units. So each page is pointed to by
+ * multiple balloon pages. All memory counters in this driver are in balloon
+ * page units.
+ */
+#define VIRTIO_BALLOON_PAGES_PER_PAGE (unsigned int)(PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT)
+#define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256
+/* Maximum number of (4k) pages to deflate on OOM notifications. */
+#define VIRTIO_BALLOON_OOM_NR_PAGES 256
+#define VIRTIO_BALLOON_OOM_NOTIFY_PRIORITY 80
+
+#define VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG (__GFP_NORETRY | __GFP_NOWARN | \
+ __GFP_NOMEMALLOC)
+/* The order of free page blocks to report to host */
+#define VIRTIO_BALLOON_HINT_BLOCK_ORDER MAX_ORDER
+/* The size of a free page block in bytes */
+#define VIRTIO_BALLOON_HINT_BLOCK_BYTES \
+ (1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT))
+#define VIRTIO_BALLOON_HINT_BLOCK_PAGES (1 << VIRTIO_BALLOON_HINT_BLOCK_ORDER)
+
+enum virtio_balloon_vq {
+ VIRTIO_BALLOON_VQ_INFLATE,
+ VIRTIO_BALLOON_VQ_DEFLATE,
+ VIRTIO_BALLOON_VQ_STATS,
+ VIRTIO_BALLOON_VQ_FREE_PAGE,
+ VIRTIO_BALLOON_VQ_REPORTING,
+ VIRTIO_BALLOON_VQ_MAX
+};
+
+enum virtio_balloon_config_read {
+ VIRTIO_BALLOON_CONFIG_READ_CMD_ID = 0,
+};
+
+struct virtio_balloon {
+ struct virtio_device *vdev;
+ struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
+
+ /* Balloon's own wq for cpu-intensive work items */
+ struct workqueue_struct *balloon_wq;
+ /* The free page reporting work item submitted to the balloon wq */
+ struct work_struct report_free_page_work;
+
+ /* The balloon servicing is delegated to a freezable workqueue. */
+ struct work_struct update_balloon_stats_work;
+ struct work_struct update_balloon_size_work;
+
+ /* Prevent updating balloon when it is being canceled. */
+ spinlock_t stop_update_lock;
+ bool stop_update;
+ /* Bitmap to indicate if reading the related config fields is needed */
+ unsigned long config_read_bitmap;
+
+ /* The list of allocated free pages, waiting to be given back to mm */
+ struct list_head free_page_list;
+ spinlock_t free_page_list_lock;
+ /* The number of free page blocks on the above list */
+ unsigned long num_free_page_blocks;
+ /*
+ * The cmd id received from host.
+ * Read it via virtio_balloon_cmd_id_received to get the latest value
+ * sent from host.
+ */
+ u32 cmd_id_received_cache;
+ /* The cmd id that is actively in use */
+ __virtio32 cmd_id_active;
+ /* Buffer to store the stop sign */
+ __virtio32 cmd_id_stop;
+
+ /* Waiting for host to ack the pages we released. */
+ wait_queue_head_t acked;
+
+ /* Number of balloon pages we've told the Host we're not using. */
+ unsigned int num_pages;
+ /*
+ * The pages we've told the Host we're not using are enqueued
+ * at vb_dev_info->pages list.
+ * Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE
+ * to num_pages above.
+ */
+ struct balloon_dev_info vb_dev_info;
+
+ /* Synchronize access/update to this struct virtio_balloon elements */
+ struct mutex balloon_lock;
+
+ /* The array of pfns we tell the Host about. */
+ unsigned int num_pfns;
+ __virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
+
+ /* Memory statistics */
+ struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
+
+ /* Shrinker to return free pages - VIRTIO_BALLOON_F_FREE_PAGE_HINT */
+ struct shrinker shrinker;
+
+ /* OOM notifier to deflate on OOM - VIRTIO_BALLOON_F_DEFLATE_ON_OOM */
+ struct notifier_block oom_nb;
+
+ /* Free page reporting device */
+ struct virtqueue *reporting_vq;
+ struct page_reporting_dev_info pr_dev_info;
+};
+
+static const struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_BALLOON, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+
+static u32 page_to_balloon_pfn(struct page *page)
+{
+ unsigned long pfn = page_to_pfn(page);
+
+ BUILD_BUG_ON(PAGE_SHIFT < VIRTIO_BALLOON_PFN_SHIFT);
+ /* Convert pfn from Linux page size to balloon page size. */
+ return pfn * VIRTIO_BALLOON_PAGES_PER_PAGE;
+}
+
+static void balloon_ack(struct virtqueue *vq)
+{
+ struct virtio_balloon *vb = vq->vdev->priv;
+
+ wake_up(&vb->acked);
+}
+
+static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
+{
+ struct scatterlist sg;
+ unsigned int len;
+
+ sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
+
+ /* We should always be able to add one buffer to an empty queue. */
+ virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL);
+ virtqueue_kick(vq);
+
+ /* When host has read buffer, this completes via balloon_ack */
+ wait_event(vb->acked, virtqueue_get_buf(vq, &len));
+
+}
+
+static int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_info,
+ struct scatterlist *sg, unsigned int nents)
+{
+ struct virtio_balloon *vb =
+ container_of(pr_dev_info, struct virtio_balloon, pr_dev_info);
+ struct virtqueue *vq = vb->reporting_vq;
+ unsigned int unused, err;
+
+ /* We should always be able to add these buffers to an empty queue. */
+ err = virtqueue_add_inbuf(vq, sg, nents, vb, GFP_NOWAIT | __GFP_NOWARN);
+
+ /*
+ * In the extremely unlikely case that something has occurred and we
+ * are able to trigger an error we will simply display a warning
+ * and exit without actually processing the pages.
+ */
+ if (WARN_ON_ONCE(err))
+ return err;
+
+ virtqueue_kick(vq);
+
+ /* When host has read buffer, this completes via balloon_ack */
+ wait_event(vb->acked, virtqueue_get_buf(vq, &unused));
+
+ return 0;
+}
+
+static void set_page_pfns(struct virtio_balloon *vb,
+ __virtio32 pfns[], struct page *page)
+{
+ unsigned int i;
+
+ BUILD_BUG_ON(VIRTIO_BALLOON_PAGES_PER_PAGE > VIRTIO_BALLOON_ARRAY_PFNS_MAX);
+
+ /*
+ * Set balloon pfns pointing at this page.
+ * Note that the first pfn points at start of the page.
+ */
+ for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE; i++)
+ pfns[i] = cpu_to_virtio32(vb->vdev,
+ page_to_balloon_pfn(page) + i);
+}
+
+static unsigned int fill_balloon(struct virtio_balloon *vb, size_t num)
+{
+ unsigned int num_allocated_pages;
+ unsigned int num_pfns;
+ struct page *page;
+ LIST_HEAD(pages);
+
+ /* We can only do one array worth at a time. */
+ num = min(num, ARRAY_SIZE(vb->pfns));
+
+ for (num_pfns = 0; num_pfns < num;
+ num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
+ struct page *page = balloon_page_alloc();
+
+ if (!page) {
+ dev_info_ratelimited(&vb->vdev->dev,
+ "Out of puff! Can't get %u pages\n",
+ VIRTIO_BALLOON_PAGES_PER_PAGE);
+ /* Sleep for at least 1/5 of a second before retry. */
+ msleep(200);
+ break;
+ }
+
+ balloon_page_push(&pages, page);
+ }
+
+ mutex_lock(&vb->balloon_lock);
+
+ vb->num_pfns = 0;
+
+ while ((page = balloon_page_pop(&pages))) {
+ balloon_page_enqueue(&vb->vb_dev_info, page);
+
+ set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
+ vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
+ if (!virtio_has_feature(vb->vdev,
+ VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
+ adjust_managed_page_count(page, -1);
+ vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE;
+ }
+
+ num_allocated_pages = vb->num_pfns;
+ /* Did we get any? */
+ if (vb->num_pfns != 0)
+ tell_host(vb, vb->inflate_vq);
+ mutex_unlock(&vb->balloon_lock);
+
+ return num_allocated_pages;
+}
+
+static void release_pages_balloon(struct virtio_balloon *vb,
+ struct list_head *pages)
+{
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next, pages, lru) {
+ if (!virtio_has_feature(vb->vdev,
+ VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
+ adjust_managed_page_count(page, 1);
+ list_del(&page->lru);
+ put_page(page); /* balloon reference */
+ }
+}
+
+/*
+ * Deflate the balloon by up to @num pages: dequeue pages from the balloon
+ * device info, record their pfns in vb->pfns, report them to the host on the
+ * deflate queue and only then release them. The request is capped both at
+ * ARRAY_SIZE(vb->pfns) and at vb->num_pages. The dequeue, host notification
+ * and release all happen under vb->balloon_lock.
+ *
+ * Returns the number of pfn entries that were reported (vb->num_pfns).
+ */
+static unsigned int leak_balloon(struct virtio_balloon *vb, size_t num)
+{
+ unsigned int num_freed_pages;
+ struct page *page;
+ struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info;
+ LIST_HEAD(pages);
+
+ /* We can only do one array worth at a time. */
+ num = min(num, ARRAY_SIZE(vb->pfns));
+
+ mutex_lock(&vb->balloon_lock);
+ /* We can't release more pages than taken */
+ num = min(num, (size_t)vb->num_pages);
+ for (vb->num_pfns = 0; vb->num_pfns < num;
+ vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
+ page = balloon_page_dequeue(vb_dev_info);
+ if (!page)
+ break;
+ set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
+ /* Collect locally; the pages are released after the host was told. */
+ list_add(&page->lru, &pages);
+ vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
+ }
+
+ num_freed_pages = vb->num_pfns;
+ /*
+ * Note that if
+ * virtio_has_feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST);
+ * is true, we *have* to do it in this order
+ */
+ if (vb->num_pfns != 0)
+ tell_host(vb, vb->deflate_vq);
+ release_pages_balloon(vb, &pages);
+ mutex_unlock(&vb->balloon_lock);
+ return num_freed_pages;
+}
+
+/*
+ * Store one statistic in slot @idx of vb->stats, converting @tag and @val to
+ * the device's virtio byte order. An @idx of VIRTIO_BALLOON_S_NR or more
+ * triggers BUG_ON().
+ */
+static inline void update_stat(struct virtio_balloon *vb, int idx,
+ u16 tag, u64 val)
+{
+ BUG_ON(idx >= VIRTIO_BALLOON_S_NR);
+ vb->stats[idx].tag = cpu_to_virtio16(vb->vdev, tag);
+ vb->stats[idx].val = cpu_to_virtio64(vb->vdev, val);
+}
+
+/* Convert a page count to bytes; widened to u64 before the shift. */
+#define pages_to_bytes(x) ((u64)(x) << PAGE_SHIFT)
+
+/*
+ * Refresh vb->stats from the current VM event counters and memory info.
+ *
+ * Which entries are written depends on the kernel configuration: the swap,
+ * fault and hugetlb entries are only emitted with CONFIG_VM_EVENT_COUNTERS
+ * (hugetlb additionally needs CONFIG_HUGETLB_PAGE); the free/total/available/
+ * cache entries are always emitted.
+ *
+ * Returns the number of entries written, starting at vb->stats[0].
+ */
+static unsigned int update_balloon_stats(struct virtio_balloon *vb)
+{
+ unsigned long events[NR_VM_EVENT_ITEMS];
+ struct sysinfo i;
+ unsigned int idx = 0;
+ long available;
+ unsigned long caches;
+
+ all_vm_events(events);
+ si_meminfo(&i);
+
+ available = si_mem_available();
+ caches = global_node_page_state(NR_FILE_PAGES);
+
+#ifdef CONFIG_VM_EVENT_COUNTERS
+ /* events[] is only read inside this #ifdef. */
+ update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN,
+ pages_to_bytes(events[PSWPIN]));
+ update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_OUT,
+ pages_to_bytes(events[PSWPOUT]));
+ update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]);
+ update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]);
+#ifdef CONFIG_HUGETLB_PAGE
+ update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC,
+ events[HTLB_BUDDY_PGALLOC]);
+ update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL,
+ events[HTLB_BUDDY_PGALLOC_FAIL]);
+#endif
+#endif
+ update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE,
+ pages_to_bytes(i.freeram));
+ update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMTOT,
+ pages_to_bytes(i.totalram));
+ update_stat(vb, idx++, VIRTIO_BALLOON_S_AVAIL,
+ pages_to_bytes(available));
+ update_stat(vb, idx++, VIRTIO_BALLOON_S_CACHES,
+ pages_to_bytes(caches));
+
+ return idx;
+}
+
+/*
+ * While most virtqueues communicate guest-initiated requests to the hypervisor,
+ * the stats queue operates in reverse. The driver initializes the virtqueue
+ * with a single buffer. From that point forward, all conversations consist of
+ * a hypervisor request (a call to this function) which directs us to refill
+ * the virtqueue with a fresh stats buffer. Since stats collection can sleep,
+ * we delegate the job to a freezable workqueue that will do the actual work via
+ * stats_handle_request().
+ *
+ * The work is only queued while vb->stop_update is clear; the flag is checked
+ * under vb->stop_update_lock.
+ */
+static void stats_request(struct virtqueue *vq)
+{
+ struct virtio_balloon *vb = vq->vdev->priv;
+
+ spin_lock(&vb->stop_update_lock);
+ if (!vb->stop_update)
+ queue_work(system_freezable_wq, &vb->update_balloon_stats_work);
+ spin_unlock(&vb->stop_update_lock);
+}
+
+/*
+ * Refresh vb->stats and hand the buffer back to the host on the stats queue.
+ *
+ * The statistics are regenerated first; if the stats queue holds no used
+ * buffer to reclaim, the function returns without queueing anything.
+ */
+static void stats_handle_request(struct virtio_balloon *vb)
+{
+ struct virtqueue *vq;
+ struct scatterlist sg;
+ unsigned int len, num_stats;
+
+ num_stats = update_balloon_stats(vb);
+
+ vq = vb->stats_vq;
+ if (!virtqueue_get_buf(vq, &len))
+ return;
+ /* Only the num_stats entries that were just written are exposed. */
+ sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
+ /* NOTE(review): the result of virtqueue_add_outbuf() is not checked here. */
+ virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL);
+ virtqueue_kick(vq);
+}
+
+/*
+ * Compute how far the balloon is from the size requested by the host.
+ *
+ * Reads num_pages from the device config space, rounds it up to a multiple of
+ * VIRTIO_BALLOON_PAGES_PER_PAGE and subtracts the current vb->num_pages.
+ * A positive result asks for inflation, a negative one for deflation.
+ */
+static inline s64 towards_target(struct virtio_balloon *vb)
+{
+	u32 requested;
+	s64 aligned;
+
+	/* Legacy balloon config space is LE, unlike all other devices. */
+	virtio_cread_le(vb->vdev, struct virtio_balloon_config, num_pages,
+			&requested);
+
+	/*
+	 * Rounding up to the guest page size keeps the balloon from
+	 * inflating and deflating endlessly.
+	 */
+	aligned = ALIGN(requested, VIRTIO_BALLOON_PAGES_PER_PAGE);
+
+	return aligned - vb->num_pages;
+}
+
+/*
+ * Give up to @num_to_return free page blocks from vb->free_page_list back to
+ * mm. Stops early when the list runs empty. The list and the
+ * vb->num_free_page_blocks counter are updated under
+ * vb->free_page_list_lock.
+ *
+ * Returns the number of blocks actually freed.
+ */
+static unsigned long return_free_pages_to_mm(struct virtio_balloon *vb,
+					     unsigned long num_to_return)
+{
+	unsigned long num_returned = 0;
+	struct page *block;
+
+	spin_lock_irq(&vb->free_page_list_lock);
+	while (num_returned < num_to_return) {
+		block = balloon_page_pop(&vb->free_page_list);
+		if (!block)
+			break;
+		free_pages((unsigned long)page_address(block),
+			   VIRTIO_BALLOON_HINT_BLOCK_ORDER);
+		num_returned++;
+	}
+	vb->num_free_page_blocks -= num_returned;
+	spin_unlock_irq(&vb->free_page_list_lock);
+
+	return num_returned;
+}
+
+/*
+ * Schedule the free page reporting work on vb->balloon_wq.
+ *
+ * Nothing happens without VIRTIO_BALLOON_F_FREE_PAGE_HINT. Otherwise the
+ * VIRTIO_BALLOON_CONFIG_READ_CMD_ID bit is set, and the work is queued only
+ * if that bit was previously clear.
+ */
+static void virtio_balloon_queue_free_page_work(struct virtio_balloon *vb)
+{
+	/* The bit is only touched when the feature is present. */
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT) &&
+	    !test_and_set_bit(VIRTIO_BALLOON_CONFIG_READ_CMD_ID,
+			      &vb->config_read_bitmap))
+		queue_work(vb->balloon_wq, &vb->report_free_page_work);
+}
+
+/*
+ * Config-change handler: unless updates have been stopped, queue the balloon
+ * resize work and (when applicable) the free page reporting work. The
+ * stop_update flag is checked under vb->stop_update_lock.
+ */
+static void virtballoon_changed(struct virtio_device *vdev)
+{
+	struct virtio_balloon *vb = vdev->priv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&vb->stop_update_lock, flags);
+	if (vb->stop_update)
+		goto unlock;
+
+	queue_work(system_freezable_wq, &vb->update_balloon_size_work);
+	virtio_balloon_queue_free_page_work(vb);
+unlock:
+	spin_unlock_irqrestore(&vb->stop_update_lock, flags);
+}
+
+/* Publish the current balloon size (vb->num_pages) in the config "actual" field. */
+static void update_balloon_size(struct virtio_balloon *vb)
+{
+ u32 actual = vb->num_pages;
+
+ /* Legacy balloon config space is LE, unlike all other devices. */
+ virtio_cwrite_le(vb->vdev, struct virtio_balloon_config, actual,
+ &actual);
+}
+
+/* Work handler for update_balloon_stats_work: refills the stats virtqueue. */
+static void update_balloon_stats_func(struct work_struct *work)
+{
+	stats_handle_request(container_of(work, struct virtio_balloon,
+					  update_balloon_stats_work));
+}
+
+/*
+ * Work handler for update_balloon_size_work.
+ *
+ * Moves the balloon one step towards the host's target (inflating for a
+ * positive difference, deflating for a negative one), publishes the new size
+ * and requeues itself while a difference remains.
+ */
+static void update_balloon_size_func(struct work_struct *work)
+{
+	struct virtio_balloon *vb = container_of(work, struct virtio_balloon,
+						 update_balloon_size_work);
+	s64 remaining = towards_target(vb);
+
+	if (remaining == 0)
+		return;
+
+	if (remaining > 0)
+		remaining -= fill_balloon(vb, remaining);
+	else
+		remaining += leak_balloon(vb, -remaining);
+	update_balloon_size(vb);
+
+	/* Not there yet: take another step later. */
+	if (remaining != 0)
+		queue_work(system_freezable_wq, work);
+}
+
+/*
+ * Discover the balloon virtqueues and prime the stats queue.
+ *
+ * The inflate and deflate queues are always requested; the stats, free page
+ * hint and reporting queues are requested only when the matching feature is
+ * set. With VIRTIO_BALLOON_F_STATS_VQ the stats queue is primed with one
+ * buffer of fresh statistics.
+ *
+ * Returns 0 on success or a negative errno. On failure no virtqueues are left
+ * behind: neither virtballoon_probe() nor virtballoon_restore() deletes them
+ * when this function fails, so the stats priming error path does it here.
+ */
+static int init_vqs(struct virtio_balloon *vb)
+{
+	struct virtqueue *vqs[VIRTIO_BALLOON_VQ_MAX];
+	vq_callback_t *callbacks[VIRTIO_BALLOON_VQ_MAX];
+	const char *names[VIRTIO_BALLOON_VQ_MAX];
+	int err;
+
+	/*
+	 * Inflateq and deflateq are used unconditionally. The names[]
+	 * will be NULL if the related feature is not enabled, which will
+	 * cause no allocation for the corresponding virtqueue in find_vqs.
+	 */
+	callbacks[VIRTIO_BALLOON_VQ_INFLATE] = balloon_ack;
+	names[VIRTIO_BALLOON_VQ_INFLATE] = "inflate";
+	callbacks[VIRTIO_BALLOON_VQ_DEFLATE] = balloon_ack;
+	names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
+	callbacks[VIRTIO_BALLOON_VQ_STATS] = NULL;
+	names[VIRTIO_BALLOON_VQ_STATS] = NULL;
+	callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
+	names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
+	/* Keep every slot defined, like the stats and free page entries. */
+	callbacks[VIRTIO_BALLOON_VQ_REPORTING] = NULL;
+	names[VIRTIO_BALLOON_VQ_REPORTING] = NULL;
+
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
+		names[VIRTIO_BALLOON_VQ_STATS] = "stats";
+		callbacks[VIRTIO_BALLOON_VQ_STATS] = stats_request;
+	}
+
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+		names[VIRTIO_BALLOON_VQ_FREE_PAGE] = "free_page_vq";
+		callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
+	}
+
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
+		names[VIRTIO_BALLOON_VQ_REPORTING] = "reporting_vq";
+		callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack;
+	}
+
+	err = virtio_find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX, vqs,
+			      callbacks, names, NULL);
+	if (err)
+		return err;
+
+	vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
+	vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
+		struct scatterlist sg;
+		unsigned int num_stats;
+		vb->stats_vq = vqs[VIRTIO_BALLOON_VQ_STATS];
+
+		/*
+		 * Prime this virtqueue with one buffer so the hypervisor can
+		 * use it to signal us later (it can't be broken yet!).
+		 */
+		num_stats = update_balloon_stats(vb);
+
+		sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
+		err = virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb,
+					   GFP_KERNEL);
+		if (err) {
+			dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n",
+				 __func__);
+			/*
+			 * The queues were already set up by find_vqs and the
+			 * callers will not delete them on this error.
+			 */
+			vb->vdev->config->del_vqs(vb->vdev);
+			return err;
+		}
+		virtqueue_kick(vb->stats_vq);
+	}
+
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
+		vb->free_page_vq = vqs[VIRTIO_BALLOON_VQ_FREE_PAGE];
+
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
+		vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING];
+
+	return 0;
+}
+
+/*
+ * Return the most recent free page hint command id.
+ *
+ * Config space is only read when the VIRTIO_BALLOON_CONFIG_READ_CMD_ID bit
+ * was set (the bit is cleared in the process); otherwise the value cached in
+ * vb->cmd_id_received_cache is returned unchanged.
+ */
+static u32 virtio_balloon_cmd_id_received(struct virtio_balloon *vb)
+{
+ if (test_and_clear_bit(VIRTIO_BALLOON_CONFIG_READ_CMD_ID,
+ &vb->config_read_bitmap)) {
+ /* Legacy balloon config space is LE, unlike all other devices. */
+ virtio_cread_le(vb->vdev, struct virtio_balloon_config,
+ free_page_hint_cmd_id,
+ &vb->cmd_id_received_cache);
+ }
+
+ return vb->cmd_id_received_cache;
+}
+
+/*
+ * Announce the start of a free page hint round to the host: detach all used
+ * buffers from free_page_vq, latch the most recently received command id into
+ * vb->cmd_id_active and post it as an out buffer.
+ *
+ * Returns 0 on success or the error from virtqueue_add_outbuf(); the queue is
+ * kicked only on success.
+ */
+static int send_cmd_id_start(struct virtio_balloon *vb)
+{
+	struct scatterlist sg;
+	struct virtqueue *vq = vb->free_page_vq;
+	/* Length out-parameter; unsigned like the other virtqueue_get_buf() users. */
+	unsigned int unused;
+	int err;
+
+	/* Detach all the used buffers from the vq */
+	while (virtqueue_get_buf(vq, &unused))
+		;
+
+	vb->cmd_id_active = cpu_to_virtio32(vb->vdev,
+					virtio_balloon_cmd_id_received(vb));
+	sg_init_one(&sg, &vb->cmd_id_active, sizeof(vb->cmd_id_active));
+	err = virtqueue_add_outbuf(vq, &sg, 1, &vb->cmd_id_active, GFP_KERNEL);
+	if (!err)
+		virtqueue_kick(vq);
+	return err;
+}
+
+/*
+ * Tell the host that the current free page hint round is over: detach all
+ * used buffers from free_page_vq and post vb->cmd_id_stop as an out buffer.
+ *
+ * Returns 0 on success or the error from virtqueue_add_outbuf(); the queue is
+ * kicked only on success.
+ */
+static int send_cmd_id_stop(struct virtio_balloon *vb)
+{
+	struct scatterlist sg;
+	struct virtqueue *vq = vb->free_page_vq;
+	/* Length out-parameter; unsigned like the other virtqueue_get_buf() users. */
+	unsigned int unused;
+	int err;
+
+	/* Detach all the used buffers from the vq */
+	while (virtqueue_get_buf(vq, &unused))
+		;
+
+	sg_init_one(&sg, &vb->cmd_id_stop, sizeof(vb->cmd_id_stop));
+	err = virtqueue_add_outbuf(vq, &sg, 1, &vb->cmd_id_stop, GFP_KERNEL);
+	if (!err)
+		virtqueue_kick(vq);
+	return err;
+}
+
+/*
+ * Allocate one free page block and offer it to the host as a hint.
+ *
+ * Used buffers are detached from free_page_vq first. If the queue has more
+ * than one free entry, the block is added as an in buffer, the queue is
+ * kicked and the block is tracked on vb->free_page_list; otherwise the block
+ * is freed again straight away.
+ *
+ * Returns 0 on success (including the "queue full" case), -EINTR when no
+ * block could be allocated, or the error from virtqueue_add_inbuf().
+ */
+static int get_free_page_and_send(struct virtio_balloon *vb)
+{
+	struct virtqueue *vq = vb->free_page_vq;
+	struct page *page;
+	struct scatterlist sg;
+	/* Length out-parameter; unsigned like the other virtqueue_get_buf() users. */
+	unsigned int unused;
+	int err;
+	void *p;
+
+	/* Detach all the used buffers from the vq */
+	while (virtqueue_get_buf(vq, &unused))
+		;
+
+	page = alloc_pages(VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG,
+			   VIRTIO_BALLOON_HINT_BLOCK_ORDER);
+	/*
+	 * When the allocation returns NULL, it indicates that we have got all
+	 * the possible free pages, so return -EINTR to stop.
+	 */
+	if (!page)
+		return -EINTR;
+
+	p = page_address(page);
+	sg_init_one(&sg, p, VIRTIO_BALLOON_HINT_BLOCK_BYTES);
+	/* There is always 1 entry reserved for the cmd id to use. */
+	if (vq->num_free > 1) {
+		err = virtqueue_add_inbuf(vq, &sg, 1, p, GFP_KERNEL);
+		if (unlikely(err)) {
+			free_pages((unsigned long)p,
+				   VIRTIO_BALLOON_HINT_BLOCK_ORDER);
+			return err;
+		}
+		virtqueue_kick(vq);
+		spin_lock_irq(&vb->free_page_list_lock);
+		balloon_page_push(&vb->free_page_list, page);
+		vb->num_free_page_blocks++;
+		spin_unlock_irq(&vb->free_page_list_lock);
+	} else {
+		/*
+		 * The vq has no available entry to add this page block, so
+		 * just free it.
+		 */
+		free_pages((unsigned long)p, VIRTIO_BALLOON_HINT_BLOCK_ORDER);
+	}
+
+	return 0;
+}
+
+/*
+ * Keep hinting free page blocks to the host, one block per iteration.
+ *
+ * The loop ends successfully when the command id received from the host no
+ * longer matches the active one, or when get_free_page_and_send() reports
+ * -EINTR. Any other error is returned to the caller.
+ */
+static int send_free_pages(struct virtio_balloon *vb)
+{
+	for (;;) {
+		u32 active;
+		int err;
+
+		/*
+		 * If a stop id or a new cmd id was just received from host,
+		 * stop the reporting.
+		 */
+		active = virtio32_to_cpu(vb->vdev, vb->cmd_id_active);
+		if (unlikely(active != virtio_balloon_cmd_id_received(vb)))
+			break;
+
+		/* Allocate the next block and send it to the host. */
+		err = get_free_page_and_send(vb);
+		if (err == -EINTR)
+			break;
+		if (unlikely(err))
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Run one complete free page hint round: start id, free page blocks, stop id.
+ *
+ * A failure in any step is logged with dev_err() but does not abort the
+ * sequence; the following steps are still attempted.
+ */
+static void virtio_balloon_report_free_page(struct virtio_balloon *vb)
+{
+ int err;
+ struct device *dev = &vb->vdev->dev;
+
+ /* Start by sending the received cmd id to host with an outbuf. */
+ err = send_cmd_id_start(vb);
+ if (unlikely(err))
+ dev_err(dev, "Failed to send a start id, err = %d\n", err);
+
+ err = send_free_pages(vb);
+ if (unlikely(err))
+ dev_err(dev, "Failed to send a free page, err = %d\n", err);
+
+ /* End by sending a stop id to host with an outbuf. */
+ err = send_cmd_id_stop(vb);
+ if (unlikely(err))
+ dev_err(dev, "Failed to send a stop id, err = %d\n", err);
+}
+
+/*
+ * Work handler for report_free_page_work.
+ *
+ * Acts on the latest command id from the host: DONE releases all hinted
+ * blocks back to mm, STOP or an id equal to the active one does nothing, and
+ * any other id starts a new reporting round.
+ */
+static void report_free_page_func(struct work_struct *work)
+{
+	struct virtio_balloon *vb = container_of(work, struct virtio_balloon,
+						 report_free_page_work);
+	u32 received = virtio_balloon_cmd_id_received(vb);
+
+	if (received == VIRTIO_BALLOON_CMD_ID_DONE) {
+		/* Pass ULONG_MAX to give back all the free pages */
+		return_free_pages_to_mm(vb, ULONG_MAX);
+		return;
+	}
+
+	if (received == VIRTIO_BALLOON_CMD_ID_STOP)
+		return;
+
+	/* Already reporting for this id. */
+	if (received == virtio32_to_cpu(vb->vdev, vb->cmd_id_active))
+		return;
+
+	virtio_balloon_report_free_page(vb);
+}
+
+#ifdef CONFIG_BALLOON_COMPACTION
+/*
+ * virtballoon_migratepage - perform the balloon page migration on behalf of
+ * a compaction thread. (called under page lock)
+ * @vb_dev_info: the balloon device
+ * @newpage: page that will replace the isolated page after migration finishes.
+ * @page : the isolated (old) page that is about to be migrated to newpage.
+ * @mode : compaction mode -- not used for balloon page migration.
+ *
+ * After a ballooned page gets isolated by compaction procedures, this is the
+ * function that performs the page migration on behalf of a compaction thread
+ * The page migration for virtio balloon is done in a simple swap fashion which
+ * follows these two macro steps:
+ * 1) insert newpage into vb->pages list and update the host about it;
+ * 2) update the host about the old page removed from vb->pages list;
+ *
+ * This function performs the balloon page migration task.
+ * Called through balloon_mapping->a_ops->migratepage
+ *
+ * Returns MIGRATEPAGE_SUCCESS, or -EAGAIN when vb->balloon_lock could not be
+ * taken without blocking.
+ */
+static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
+ struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+ struct virtio_balloon *vb = container_of(vb_dev_info,
+ struct virtio_balloon, vb_dev_info);
+ unsigned long flags;
+
+ /*
+ * In order to avoid lock contention while migrating pages concurrently
+ * to leak_balloon() or fill_balloon() we just give up the balloon_lock
+ * this turn, as it is easier to retry the page migration later.
+ * This also prevents fill_balloon() getting stuck into a mutex
+ * recursion in the case it ends up triggering memory compaction
+ * while it is attempting to inflate the balloon.
+ */
+ if (!mutex_trylock(&vb->balloon_lock))
+ return -EAGAIN;
+
+ get_page(newpage); /* balloon reference */
+
+ /*
+ * When we migrate a page to a different zone and adjusted the
+ * managed page count when inflating, we have to fixup the count of
+ * both involved zones.
+ */
+ if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM) &&
+ page_zone(page) != page_zone(newpage)) {
+ adjust_managed_page_count(page, 1);
+ adjust_managed_page_count(newpage, -1);
+ }
+
+ /* balloon's page migration 1st step -- inflate "newpage" */
+ spin_lock_irqsave(&vb_dev_info->pages_lock, flags);
+ balloon_page_insert(vb_dev_info, newpage);
+ vb_dev_info->isolated_pages--;
+ __count_vm_event(BALLOON_MIGRATE);
+ spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
+ vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
+ set_page_pfns(vb, vb->pfns, newpage);
+ tell_host(vb, vb->inflate_vq);
+
+ /* balloon's page migration 2nd step -- deflate "page" */
+ spin_lock_irqsave(&vb_dev_info->pages_lock, flags);
+ balloon_page_delete(page);
+ spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
+ vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
+ set_page_pfns(vb, vb->pfns, page);
+ tell_host(vb, vb->deflate_vq);
+
+ mutex_unlock(&vb->balloon_lock);
+
+ put_page(page); /* balloon reference */
+
+ return MIGRATEPAGE_SUCCESS;
+}
+#endif /* CONFIG_BALLOON_COMPACTION */
+
+/*
+ * Release hinted free page blocks covering at least @pages_to_free pages.
+ *
+ * The request is rounded up to whole blocks of
+ * VIRTIO_BALLOON_HINT_BLOCK_PAGES pages. Returns the number of pages that
+ * were actually given back, which is a multiple of the block size.
+ */
+static unsigned long shrink_free_pages(struct virtio_balloon *vb,
+				       unsigned long pages_to_free)
+{
+	unsigned long nr_blocks;
+
+	nr_blocks = round_up(pages_to_free, VIRTIO_BALLOON_HINT_BLOCK_PAGES) /
+		    VIRTIO_BALLOON_HINT_BLOCK_PAGES;
+
+	return return_free_pages_to_mm(vb, nr_blocks) *
+	       VIRTIO_BALLOON_HINT_BLOCK_PAGES;
+}
+
+/*
+ * Shrinker scan callback: try to release sc->nr_to_scan pages worth of hinted
+ * free page blocks and return the number of pages actually released.
+ */
+static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ struct virtio_balloon *vb = container_of(shrinker,
+ struct virtio_balloon, shrinker);
+
+ return shrink_free_pages(vb, sc->nr_to_scan);
+}
+
+/*
+ * Shrinker count callback: report how many pages are currently held as hinted
+ * free page blocks. vb->num_free_page_blocks is read without taking
+ * free_page_list_lock here.
+ */
+static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ struct virtio_balloon *vb = container_of(shrinker,
+ struct virtio_balloon, shrinker);
+
+ return vb->num_free_page_blocks * VIRTIO_BALLOON_HINT_BLOCK_PAGES;
+}
+
+/*
+ * OOM notifier: deflate the balloon by up to VIRTIO_BALLOON_OOM_NR_PAGES and
+ * publish the new size. The count of released pages, divided by
+ * VIRTIO_BALLOON_PAGES_PER_PAGE, is added to the counter passed in @parm.
+ *
+ * Always returns NOTIFY_OK.
+ */
+static int virtio_balloon_oom_notify(struct notifier_block *nb,
+				     unsigned long dummy, void *parm)
+{
+	struct virtio_balloon *vb = container_of(nb, struct virtio_balloon,
+						 oom_nb);
+	unsigned long *freed = parm;
+	unsigned int leaked;
+
+	leaked = leak_balloon(vb, VIRTIO_BALLOON_OOM_NR_PAGES);
+	*freed += leaked / VIRTIO_BALLOON_PAGES_PER_PAGE;
+	update_balloon_size(vb);
+
+	return NOTIFY_OK;
+}
+
+/* Unregister the shrinker embedded in @vb. */
+static void virtio_balloon_unregister_shrinker(struct virtio_balloon *vb)
+{
+ unregister_shrinker(&vb->shrinker);
+}
+
+/*
+ * Fill in the scan/count callbacks of the shrinker embedded in @vb and
+ * register it under the name "virtio-balloon".
+ *
+ * Returns the result of register_shrinker().
+ */
+static int virtio_balloon_register_shrinker(struct virtio_balloon *vb)
+{
+ vb->shrinker.scan_objects = virtio_balloon_shrinker_scan;
+ vb->shrinker.count_objects = virtio_balloon_shrinker_count;
+ vb->shrinker.seeks = DEFAULT_SEEKS;
+
+ return register_shrinker(&vb->shrinker, "virtio-balloon");
+}
+
+/*
+ * Probe a virtio balloon device.
+ *
+ * Allocates the driver state, sets up the virtqueues and then, depending on
+ * the features present, the free page hint workqueue and shrinker, the OOM
+ * notifier, the page poison value and page reporting. Finally the device is
+ * marked ready and a resize is triggered if the balloon is not at its target.
+ *
+ * Returns 0 on success or a negative errno. On failure everything set up so
+ * far is undone through the out_* labels, in reverse order of setup.
+ */
+static int virtballoon_probe(struct virtio_device *vdev)
+{
+ struct virtio_balloon *vb;
+ int err;
+
+ /* The driver depends on reading the device config space. */
+ if (!vdev->config->get) {
+ dev_err(&vdev->dev, "%s failure: config access disabled\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ vdev->priv = vb = kzalloc(sizeof(*vb), GFP_KERNEL);
+ if (!vb) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ INIT_WORK(&vb->update_balloon_stats_work, update_balloon_stats_func);
+ INIT_WORK(&vb->update_balloon_size_work, update_balloon_size_func);
+ spin_lock_init(&vb->stop_update_lock);
+ mutex_init(&vb->balloon_lock);
+ init_waitqueue_head(&vb->acked);
+ vb->vdev = vdev;
+
+ balloon_devinfo_init(&vb->vb_dev_info);
+
+ err = init_vqs(vb);
+ if (err)
+ goto out_free_vb;
+
+#ifdef CONFIG_BALLOON_COMPACTION
+ vb->vb_dev_info.migratepage = virtballoon_migratepage;
+#endif
+ if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+ /*
+ * There is always one entry reserved for cmd id, so the ring
+ * size needs to be at least two to report free page hints.
+ */
+ if (virtqueue_get_vring_size(vb->free_page_vq) < 2) {
+ err = -ENOSPC;
+ goto out_del_vqs;
+ }
+ vb->balloon_wq = alloc_workqueue("balloon-wq",
+ WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0);
+ if (!vb->balloon_wq) {
+ err = -ENOMEM;
+ goto out_del_vqs;
+ }
+ INIT_WORK(&vb->report_free_page_work, report_free_page_func);
+ /* Start out in the "stopped" state until the host sends a cmd id. */
+ vb->cmd_id_received_cache = VIRTIO_BALLOON_CMD_ID_STOP;
+ vb->cmd_id_active = cpu_to_virtio32(vb->vdev,
+ VIRTIO_BALLOON_CMD_ID_STOP);
+ vb->cmd_id_stop = cpu_to_virtio32(vb->vdev,
+ VIRTIO_BALLOON_CMD_ID_STOP);
+ spin_lock_init(&vb->free_page_list_lock);
+ INIT_LIST_HEAD(&vb->free_page_list);
+ /*
+ * We're allowed to reuse any free pages, even if they are
+ * still to be processed by the host.
+ */
+ err = virtio_balloon_register_shrinker(vb);
+ if (err)
+ goto out_del_balloon_wq;
+ }
+
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) {
+ vb->oom_nb.notifier_call = virtio_balloon_oom_notify;
+ vb->oom_nb.priority = VIRTIO_BALLOON_OOM_NOTIFY_PRIORITY;
+ err = register_oom_notifier(&vb->oom_nb);
+ if (err < 0)
+ goto out_unregister_shrinker;
+ }
+
+ if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) {
+ /* Start with poison val of 0 representing general init */
+ __u32 poison_val = 0;
+
+ /*
+ * Let the hypervisor know that we are expecting a
+ * specific value to be written back in balloon pages.
+ *
+ * If the PAGE_POISON value was larger than a byte we would
+ * need to byte swap poison_val here to guarantee it is
+ * little-endian. However for now it is a single byte so we
+ * can pass it as-is.
+ */
+ if (!want_init_on_free())
+ memset(&poison_val, PAGE_POISON, sizeof(poison_val));
+
+ virtio_cwrite_le(vb->vdev, struct virtio_balloon_config,
+ poison_val, &poison_val);
+ }
+
+ vb->pr_dev_info.report = virtballoon_free_page_report;
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
+ unsigned int capacity;
+
+ capacity = virtqueue_get_vring_size(vb->reporting_vq);
+ if (capacity < PAGE_REPORTING_CAPACITY) {
+ err = -ENOSPC;
+ goto out_unregister_oom;
+ }
+
+ /*
+ * The default page reporting order is @pageblock_order, which
+ * corresponds to 512MB in size on ARM64 when 64KB base page
+ * size is used. The page reporting won't be triggered if the
+ * freeing page can't come up with a free area like that huge.
+ * So we specify the page reporting order to 5, corresponding
+ * to 2MB. It helps to avoid THP splitting if 4KB base page
+ * size is used by host.
+ *
+ * Ideally, the page reporting order is selected based on the
+ * host's base page size. However, it needs more work to report
+ * that value. The hard-coded order would be fine currently.
+ */
+#if defined(CONFIG_ARM64) && defined(CONFIG_ARM64_64K_PAGES)
+ vb->pr_dev_info.order = 5;
+#endif
+
+ err = page_reporting_register(&vb->pr_dev_info);
+ if (err)
+ goto out_unregister_oom;
+ }
+
+ virtio_device_ready(vdev);
+
+ /* Kick off an initial resize if the host already wants one. */
+ if (towards_target(vb))
+ virtballoon_changed(vdev);
+ return 0;
+
+ /*
+ * Error unwinding. Each label re-checks the feature bit because the
+ * corresponding setup step was conditional on it.
+ */
+out_unregister_oom:
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
+ unregister_oom_notifier(&vb->oom_nb);
+out_unregister_shrinker:
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
+ virtio_balloon_unregister_shrinker(vb);
+out_del_balloon_wq:
+ if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
+ destroy_workqueue(vb->balloon_wq);
+out_del_vqs:
+ vdev->config->del_vqs(vdev);
+out_free_vb:
+ kfree(vb);
+out:
+ return err;
+}
+
+/*
+ * Teardown shared by virtballoon_remove() and virtballoon_freeze(): empty the
+ * balloon, publish the resulting size, release any hinted free page blocks,
+ * reset the device and delete its virtqueues.
+ */
+static void remove_common(struct virtio_balloon *vb)
+{
+ /* There might be pages left in the balloon: free them. */
+ while (vb->num_pages)
+ leak_balloon(vb, vb->num_pages);
+ update_balloon_size(vb);
+
+ /* There might be free pages that are being reported: release them. */
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
+ return_free_pages_to_mm(vb, ULONG_MAX);
+
+ /* Now we reset the device so we can clean up the queues. */
+ virtio_reset_device(vb->vdev);
+
+ vb->vdev->config->del_vqs(vb->vdev);
+}
+
+/*
+ * Remove callback: unregister page reporting, the OOM notifier and the
+ * shrinker (each only if its feature is set), stop further work from being
+ * queued, wait for pending work, then tear down the device and free @vb.
+ */
+static void virtballoon_remove(struct virtio_device *vdev)
+{
+ struct virtio_balloon *vb = vdev->priv;
+
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
+ page_reporting_unregister(&vb->pr_dev_info);
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
+ unregister_oom_notifier(&vb->oom_nb);
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
+ virtio_balloon_unregister_shrinker(vb);
+ /* After this, stats_request() and virtballoon_changed() queue nothing. */
+ spin_lock_irq(&vb->stop_update_lock);
+ vb->stop_update = true;
+ spin_unlock_irq(&vb->stop_update_lock);
+ cancel_work_sync(&vb->update_balloon_size_work);
+ cancel_work_sync(&vb->update_balloon_stats_work);
+
+ if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+ cancel_work_sync(&vb->report_free_page_work);
+ destroy_workqueue(vb->balloon_wq);
+ }
+
+ remove_common(vb);
+ kfree(vb);
+}
+
+#ifdef CONFIG_PM_SLEEP
+/*
+ * PM freeze callback: empty the balloon, reset the device and delete its
+ * virtqueues via remove_common(). The driver state in vdev->priv is kept.
+ * Always returns 0.
+ */
+static int virtballoon_freeze(struct virtio_device *vdev)
+{
+ struct virtio_balloon *vb = vdev->priv;
+
+ /*
+ * The workqueue is already frozen by the PM core before this
+ * function is called.
+ */
+ remove_common(vb);
+ return 0;
+}
+
+/*
+ * PM restore callback: set the virtqueues up again, mark the device ready,
+ * trigger a resize if the balloon is off target and publish the current size.
+ *
+ * Returns 0 on success or the error from init_vqs().
+ */
+static int virtballoon_restore(struct virtio_device *vdev)
+{
+	struct virtio_balloon *vb = vdev->priv;
+	int err = init_vqs(vb);
+
+	if (err)
+		return err;
+
+	virtio_device_ready(vdev);
+
+	if (towards_target(vb))
+		virtballoon_changed(vdev);
+	update_balloon_size(vb);
+	return 0;
+}
+#endif
+
+/*
+ * Validate callback: adjust the feature bits before they are finalized.
+ *
+ * - With neither init-on-free nor page poisoning enabled,
+ *   VIRTIO_BALLOON_F_PAGE_POISON is cleared.
+ * - Otherwise, if VIRTIO_BALLOON_F_PAGE_POISON is not set on the device,
+ *   VIRTIO_BALLOON_F_REPORTING is cleared.
+ * - VIRTIO_F_ACCESS_PLATFORM is always cleared.
+ *
+ * Always returns 0.
+ */
+static int virtballoon_validate(struct virtio_device *vdev)
+{
+ /*
+ * Inform the hypervisor that our pages are poisoned or
+ * initialized. If we cannot do that then we should disable
+ * page reporting as it could potentially change the contents
+ * of our free pages.
+ */
+ if (!want_init_on_free() && !page_poisoning_enabled_static())
+ __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_POISON);
+ else if (!virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON))
+ __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_REPORTING);
+
+ __virtio_clear_bit(vdev, VIRTIO_F_ACCESS_PLATFORM);
+ return 0;
+}
+
+/* Feature bits listed in the driver's feature_table. */
+static unsigned int features[] = {
+ VIRTIO_BALLOON_F_MUST_TELL_HOST,
+ VIRTIO_BALLOON_F_STATS_VQ,
+ VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
+ VIRTIO_BALLOON_F_FREE_PAGE_HINT,
+ VIRTIO_BALLOON_F_PAGE_POISON,
+ VIRTIO_BALLOON_F_REPORTING,
+};
+
+/*
+ * Driver descriptor tying the callbacks above together. The freeze/restore
+ * hooks are only present with CONFIG_PM_SLEEP.
+ */
+static struct virtio_driver virtio_balloon_driver = {
+ .feature_table = features,
+ .feature_table_size = ARRAY_SIZE(features),
+ .driver.name = KBUILD_MODNAME,
+ .driver.owner = THIS_MODULE,
+ .id_table = id_table,
+ .validate = virtballoon_validate,
+ .probe = virtballoon_probe,
+ .remove = virtballoon_remove,
+ .config_changed = virtballoon_changed,
+#ifdef CONFIG_PM_SLEEP
+ .freeze = virtballoon_freeze,
+ .restore = virtballoon_restore,
+#endif
+};
+
+module_virtio_driver(virtio_balloon_driver);
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_DESCRIPTION("Virtio balloon driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/virtio/virtio_dma_buf.c b/drivers/virtio/virtio_dma_buf.c
new file mode 100644
index 000000000..2521a7500
--- /dev/null
+++ b/drivers/virtio/virtio_dma_buf.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * dma-bufs for virtio exported objects
+ *
+ * Copyright (C) 2020 Google, Inc.
+ */
+
+#include <linux/module.h>
+#include <linux/virtio_dma_buf.h>
+
+/**
+ * virtio_dma_buf_export - Creates a new dma-buf for a virtio exported object
+ * @exp_info: [in] see dma_buf_export(). ops MUST refer to a dma_buf_ops
+ *	struct embedded in a virtio_dma_buf_ops.
+ *
+ * This wraps dma_buf_export() to allow virtio drivers to create a dma-buf
+ * for a virtio exported object that can be queried by other virtio drivers
+ * for the object's UUID.
+ *
+ * Returns: the result of dma_buf_export(), or ERR_PTR(-EINVAL) when
+ * @exp_info->ops is NULL, its attach callback is not
+ * virtio_dma_buf_attach(), or the get_uuid callback is missing.
+ */
+struct dma_buf *virtio_dma_buf_export(const struct dma_buf_export_info *exp_info)
+{
+	/* Address computation only; nothing is dereferenced before the checks. */
+	const struct virtio_dma_buf_ops *virtio_ops =
+		container_of(exp_info->ops,
+			     const struct virtio_dma_buf_ops, ops);
+
+	if (!exp_info->ops)
+		return ERR_PTR(-EINVAL);
+
+	if (exp_info->ops->attach != &virtio_dma_buf_attach)
+		return ERR_PTR(-EINVAL);
+
+	if (!virtio_ops->get_uuid)
+		return ERR_PTR(-EINVAL);
+
+	return dma_buf_export(exp_info);
+}
+EXPORT_SYMBOL(virtio_dma_buf_export);
+
+/**
+ * virtio_dma_buf_attach - mandatory attach callback for virtio dma-bufs
+ * @dma_buf: buffer being attached to
+ * @attach: the attachment being set up
+ *
+ * Forwards to the optional device_attach callback of the enclosing
+ * virtio_dma_buf_ops.
+ *
+ * Returns: 0 when no device_attach callback is set, otherwise its result.
+ */
+int virtio_dma_buf_attach(struct dma_buf *dma_buf,
+			  struct dma_buf_attachment *attach)
+{
+	const struct virtio_dma_buf_ops *ops =
+		container_of(dma_buf->ops,
+			     const struct virtio_dma_buf_ops, ops);
+
+	if (!ops->device_attach)
+		return 0;
+
+	return ops->device_attach(dma_buf, attach);
+}
+EXPORT_SYMBOL(virtio_dma_buf_attach);
+
+/**
+ * is_virtio_dma_buf - returns true if the given dma-buf is a virtio dma-buf
+ * @dma_buf: buffer to query
+ *
+ * A dma-buf counts as a virtio dma-buf when its attach callback is
+ * virtio_dma_buf_attach(). @dma_buf and its ops are dereferenced without a
+ * NULL check.
+ */
+bool is_virtio_dma_buf(struct dma_buf *dma_buf)
+{
+ return dma_buf->ops->attach == &virtio_dma_buf_attach;
+}
+EXPORT_SYMBOL(is_virtio_dma_buf);
+
+/**
+ * virtio_dma_buf_get_uuid - gets a virtio dma-buf's exported object's uuid
+ * @dma_buf: [in] buffer to query
+ * @uuid: [out] the uuid
+ *
+ * Returns: 0 on success, negative on failure. -EINVAL is returned when
+ * @dma_buf is not a virtio dma-buf; otherwise the result of the get_uuid
+ * callback is passed through.
+ */
+int virtio_dma_buf_get_uuid(struct dma_buf *dma_buf,
+ uuid_t *uuid)
+{
+ /* Address computation only; ops is not dereferenced before the check. */
+ const struct virtio_dma_buf_ops *ops =
+ container_of(dma_buf->ops,
+ const struct virtio_dma_buf_ops, ops);
+
+ if (!is_virtio_dma_buf(dma_buf))
+ return -EINVAL;
+
+ return ops->get_uuid(dma_buf, uuid);
+}
+EXPORT_SYMBOL(virtio_dma_buf_get_uuid);
+
+MODULE_LICENSE("GPL");
+MODULE_IMPORT_NS(DMA_BUF);
diff --git a/drivers/virtio/virtio_input.c b/drivers/virtio/virtio_input.c
new file mode 100644
index 000000000..3aa467038
--- /dev/null
+++ b/drivers/virtio/virtio_input.c
@@ -0,0 +1,414 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/input.h>
+#include <linux/slab.h>
+
+#include <uapi/linux/virtio_ids.h>
+#include <uapi/linux/virtio_input.h>
+#include <linux/input/mt.h>
+
+/* Per-device driver state; stored in vdev->priv by virtinput_probe(). */
+struct virtio_input {
+ /* The virtio device this state belongs to. */
+ struct virtio_device *vdev;
+ /* The input device allocated and registered in virtinput_probe(). */
+ struct input_dev *idev;
+ /* Backing storage for idev->name, read from the device config. */
+ char name[64];
+ /* Backing storage for idev->uniq, read from the device config. */
+ char serial[64];
+ /* Backing storage for idev->phys ("virtio%d/input0"). */
+ char phys[64];
+ /* The "events" (device -> kernel) and "status" (kernel -> device) queues. */
+ struct virtqueue *evt, *sts;
+ /* Event buffers that are queued on (and re-queued to) the evt queue. */
+ struct virtio_input_event evts[64];
+ /* Taken around the ready flag and around virtqueue operations. */
+ spinlock_t lock;
+ /* Cleared on remove/freeze; events and status are only handled if set. */
+ bool ready;
+};
+
+/*
+ * Hand a single event buffer to the device on the event queue. The buffer
+ * itself is used as the token returned by virtqueue_get_buf().
+ */
+static void virtinput_queue_evtbuf(struct virtio_input *vi,
+				   struct virtio_input_event *evtbuf)
+{
+	struct scatterlist entry;
+
+	sg_init_one(&entry, evtbuf, sizeof(*evtbuf));
+	virtqueue_add_inbuf(vi->evt, &entry, 1, evtbuf, GFP_ATOMIC);
+}
+
+/*
+ * Callback of the event queue: report every completed event buffer to the
+ * input core and give the buffer back to the device afterwards.
+ */
+static void virtinput_recv_events(struct virtqueue *vq)
+{
+	struct virtio_input *vi = vq->vdev->priv;
+	struct virtio_input_event *ev;
+	unsigned long irqflags;
+	unsigned int used_len;
+
+	spin_lock_irqsave(&vi->lock, irqflags);
+	if (!vi->ready)
+		goto unlock;
+
+	for (;;) {
+		ev = virtqueue_get_buf(vi->evt, &used_len);
+		if (!ev)
+			break;
+
+		/* The lock is dropped while the event is reported. */
+		spin_unlock_irqrestore(&vi->lock, irqflags);
+		input_event(vi->idev,
+			    le16_to_cpu(ev->type),
+			    le16_to_cpu(ev->code),
+			    le32_to_cpu(ev->value));
+		spin_lock_irqsave(&vi->lock, irqflags);
+
+		virtinput_queue_evtbuf(vi, ev);
+	}
+	virtqueue_kick(vq);
+
+unlock:
+	spin_unlock_irqrestore(&vi->lock, irqflags);
+}
+
+/*
+ * Queue one status event (kernel -> device) on the status queue.
+ *
+ * On error we are losing the status update, which isn't critical as
+ * this is typically used for stuff like keyboard leds.
+ *
+ * Returns 0 on success or when the event is deliberately dropped, -ENOMEM
+ * if no buffer could be allocated, -ENODEV if the device is not ready, or
+ * the error reported by virtqueue_add_outbuf().
+ */
+static int virtinput_send_status(struct virtio_input *vi,
+				 u16 type, u16 code, s32 value)
+{
+	struct virtio_input_event *ev;
+	struct scatterlist sg[1];
+	unsigned long irqflags;
+	int rc;
+
+	/*
+	 * Since 29cc309d8bf1 (HID: hid-multitouch: forward MSC_TIMESTAMP),
+	 * EV_MSC/MSC_TIMESTAMP is added before each EV_SYN event.
+	 * EV_MSC is configured as INPUT_PASS_TO_ALL.
+	 * In case of a touch device:
+	 *   BE passes EV_MSC/MSC_TIMESTAMP to FE on receiving event from evdev.
+	 *   FE passes EV_MSC/MSC_TIMESTAMP back to BE.
+	 *   BE writes EV_MSC/MSC_TIMESTAMP to evdev due to INPUT_PASS_TO_ALL.
+	 *   BE receives extra EV_MSC/MSC_TIMESTAMP and passes it to FE.
+	 *   >>> Each new frame becomes larger and larger.
+	 * Disable EV_MSC/MSC_TIMESTAMP forwarding for MT.
+	 */
+	if (vi->idev->mt && type == EV_MSC && code == MSC_TIMESTAMP)
+		return 0;
+
+	ev = kzalloc(sizeof(*ev), GFP_ATOMIC);
+	if (!ev)
+		return -ENOMEM;
+
+	ev->type = cpu_to_le16(type);
+	ev->code = cpu_to_le16(code);
+	ev->value = cpu_to_le32(value);
+	sg_init_one(sg, ev, sizeof(*ev));
+
+	spin_lock_irqsave(&vi->lock, irqflags);
+	if (!vi->ready) {
+		rc = -ENODEV;
+	} else {
+		rc = virtqueue_add_outbuf(vi->sts, sg, 1, ev, GFP_ATOMIC);
+		virtqueue_kick(vi->sts);
+	}
+	spin_unlock_irqrestore(&vi->lock, irqflags);
+
+	/* The buffer is only freed here if it was not queued. */
+	if (rc)
+		kfree(ev);
+	return rc;
+}
+
+/*
+ * Callback of the status queue: free every status buffer the device has
+ * finished with.
+ */
+static void virtinput_recv_status(struct virtqueue *vq)
+{
+	struct virtio_input *vi = vq->vdev->priv;
+	struct virtio_input_event *done;
+	unsigned long irqflags;
+	unsigned int used_len;
+
+	spin_lock_irqsave(&vi->lock, irqflags);
+	for (;;) {
+		done = virtqueue_get_buf(vi->sts, &used_len);
+		if (!done)
+			break;
+		kfree(done);
+	}
+	spin_unlock_irqrestore(&vi->lock, irqflags);
+}
+
+/*
+ * input_dev->event hook: forward an event from the input core to the
+ * device via the status queue.
+ */
+static int virtinput_status(struct input_dev *idev, unsigned int type,
+			    unsigned int code, int value)
+{
+	return virtinput_send_status(input_get_drvdata(idev),
+				     type, code, value);
+}
+
+/*
+ * Write the select/subsel pair to the device config space and return the
+ * size the device reports for that selection.
+ */
+static u8 virtinput_cfg_select(struct virtio_input *vi,
+			       u8 select, u8 subsel)
+{
+	struct virtio_device *vdev = vi->vdev;
+	u8 reported;
+
+	virtio_cwrite_le(vdev, struct virtio_input_config, select, &select);
+	virtio_cwrite_le(vdev, struct virtio_input_config, subsel, &subsel);
+	virtio_cread_le(vdev, struct virtio_input_config, size, &reported);
+
+	return reported;
+}
+
+/*
+ * Read the bitmap selected by select/subsel from the device config space
+ * and set the corresponding bits (at most @bitcount) in @bits. When an
+ * event-type bitmap was read, the event type itself (@subsel) is also
+ * marked in idev->evbit. Does nothing if the device reports size 0 or the
+ * temporary buffer cannot be allocated.
+ */
+static void virtinput_cfg_bits(struct virtio_input *vi, int select, int subsel,
+			       unsigned long *bits, unsigned int bitcount)
+{
+	unsigned int nr;
+	u8 *raw;
+	u8 len;
+
+	len = virtinput_cfg_select(vi, select, subsel);
+	if (len == 0)
+		return;
+	if (len * 8 < bitcount)
+		bitcount = len * 8;
+
+	/*
+	 * Bitmap in virtio config space is a simple stream of bytes,
+	 * with the first byte carrying bits 0-7, second bits 8-15 and
+	 * so on.
+	 */
+	raw = kzalloc(len, GFP_KERNEL);
+	if (!raw)
+		return;
+	virtio_cread_bytes(vi->vdev,
+			   offsetof(struct virtio_input_config, u.bitmap),
+			   raw, len);
+	for (nr = 0; nr < bitcount; nr++) {
+		if (!(raw[nr / 8] & (1 << (nr % 8))))
+			continue;
+		__set_bit(nr, bits);
+	}
+	kfree(raw);
+
+	if (select == VIRTIO_INPUT_CFG_EV_BITS)
+		__set_bit(subsel, vi->idev->evbit);
+}
+
+/*
+ * Read the parameters of absolute axis @abs from the device config space
+ * and apply them to the input device.
+ */
+static void virtinput_cfg_abs(struct virtio_input *vi, int abs)
+{
+	struct virtio_device *vdev = vi->vdev;
+	u32 lo, hi, res, fuzz, flat;
+
+	virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_ABS_INFO, abs);
+	virtio_cread_le(vdev, struct virtio_input_config, u.abs.min, &lo);
+	virtio_cread_le(vdev, struct virtio_input_config, u.abs.max, &hi);
+	virtio_cread_le(vdev, struct virtio_input_config, u.abs.res, &res);
+	virtio_cread_le(vdev, struct virtio_input_config, u.abs.fuzz, &fuzz);
+	virtio_cread_le(vdev, struct virtio_input_config, u.abs.flat, &flat);
+
+	input_set_abs_params(vi->idev, abs, lo, hi, fuzz, flat);
+	input_abs_set_res(vi->idev, abs, res);
+}
+
+/*
+ * Look up the "events" and "status" virtqueues and remember them in @vi.
+ * Returns 0 on success or the error from virtio_find_vqs().
+ */
+static int virtinput_init_vqs(struct virtio_input *vi)
+{
+	static const char * const names[] = { "events", "status" };
+	vq_callback_t *cbs[] = { virtinput_recv_events,
+				 virtinput_recv_status };
+	struct virtqueue *vqs[2];
+	int rc;
+
+	rc = virtio_find_vqs(vi->vdev, 2, vqs, cbs, names, NULL);
+	if (!rc) {
+		vi->evt = vqs[0];
+		vi->sts = vqs[1];
+	}
+
+	return rc;
+}
+
+/*
+ * Queue event buffers on the event queue: as many as the ring has entries,
+ * capped at the number of buffers in vi->evts, then notify the device.
+ */
+static void virtinput_fill_evt(struct virtio_input *vi)
+{
+	unsigned long irqflags;
+	int idx, count;
+
+	spin_lock_irqsave(&vi->lock, irqflags);
+
+	count = virtqueue_get_vring_size(vi->evt);
+	if (count > ARRAY_SIZE(vi->evts))
+		count = ARRAY_SIZE(vi->evts);
+
+	idx = 0;
+	while (idx < count) {
+		virtinput_queue_evtbuf(vi, &vi->evts[idx]);
+		idx++;
+	}
+	virtqueue_kick(vi->evt);
+
+	spin_unlock_irqrestore(&vi->lock, irqflags);
+}
+
+/*
+ * Probe a virtio-input device.
+ *
+ * Allocates the driver state, sets up the virtqueues, builds an input
+ * device from the identification, property, event and axis information in
+ * the device config space, registers it and finally fills the event queue.
+ *
+ * Returns 0 on success, -ENODEV if the device does not offer
+ * VIRTIO_F_VERSION_1, -ENOMEM on allocation failure, or the error of the
+ * step that failed. On failure everything set up so far is torn down.
+ */
+static int virtinput_probe(struct virtio_device *vdev)
+{
+	struct virtio_input *vi;
+	unsigned long flags;
+	size_t size;
+	int abs, err, nslots;
+
+	if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
+		return -ENODEV;
+
+	vi = kzalloc(sizeof(*vi), GFP_KERNEL);
+	if (!vi)
+		return -ENOMEM;
+
+	vdev->priv = vi;
+	vi->vdev = vdev;
+	spin_lock_init(&vi->lock);
+
+	err = virtinput_init_vqs(vi);
+	if (err)
+		goto err_init_vq;
+
+	vi->idev = input_allocate_device();
+	if (!vi->idev) {
+		err = -ENOMEM;
+		goto err_input_alloc;
+	}
+	input_set_drvdata(vi->idev, vi);
+
+	/*
+	 * name and serial are handed to the input core as C strings. vi was
+	 * zero-allocated, so copying at most sizeof() - 1 bytes keeps them
+	 * NUL-terminated even if the device reports a string that would
+	 * fill (or exceed) the buffer.
+	 */
+	size = virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_ID_NAME, 0);
+	virtio_cread_bytes(vi->vdev, offsetof(struct virtio_input_config,
+					      u.string),
+			   vi->name, min(size, sizeof(vi->name) - 1));
+	size = virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_ID_SERIAL, 0);
+	virtio_cread_bytes(vi->vdev, offsetof(struct virtio_input_config,
+					      u.string),
+			   vi->serial, min(size, sizeof(vi->serial) - 1));
+	snprintf(vi->phys, sizeof(vi->phys),
+		 "virtio%d/input0", vdev->index);
+	vi->idev->name = vi->name;
+	vi->idev->phys = vi->phys;
+	vi->idev->uniq = vi->serial;
+
+	/* Use the device supplied ids if complete, else a virtual bus id. */
+	size = virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_ID_DEVIDS, 0);
+	if (size >= sizeof(struct virtio_input_devids)) {
+		virtio_cread_le(vi->vdev, struct virtio_input_config,
+				u.ids.bustype, &vi->idev->id.bustype);
+		virtio_cread_le(vi->vdev, struct virtio_input_config,
+				u.ids.vendor, &vi->idev->id.vendor);
+		virtio_cread_le(vi->vdev, struct virtio_input_config,
+				u.ids.product, &vi->idev->id.product);
+		virtio_cread_le(vi->vdev, struct virtio_input_config,
+				u.ids.version, &vi->idev->id.version);
+	} else {
+		vi->idev->id.bustype = BUS_VIRTUAL;
+	}
+
+	virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_PROP_BITS, 0,
+			   vi->idev->propbit, INPUT_PROP_CNT);
+	size = virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_REP);
+	if (size)
+		__set_bit(EV_REP, vi->idev->evbit);
+
+	vi->idev->dev.parent = &vdev->dev;
+	vi->idev->event = virtinput_status;
+
+	/* device -> kernel */
+	virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_KEY,
+			   vi->idev->keybit, KEY_CNT);
+	virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_REL,
+			   vi->idev->relbit, REL_CNT);
+	virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_ABS,
+			   vi->idev->absbit, ABS_CNT);
+	virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_MSC,
+			   vi->idev->mscbit, MSC_CNT);
+	virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_SW,
+			   vi->idev->swbit, SW_CNT);
+
+	/* kernel -> device */
+	virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_LED,
+			   vi->idev->ledbit, LED_CNT);
+	virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_SND,
+			   vi->idev->sndbit, SND_CNT);
+
+	if (test_bit(EV_ABS, vi->idev->evbit)) {
+		for (abs = 0; abs < ABS_CNT; abs++) {
+			if (!test_bit(abs, vi->idev->absbit))
+				continue;
+			virtinput_cfg_abs(vi, abs);
+		}
+
+		if (test_bit(ABS_MT_SLOT, vi->idev->absbit)) {
+			nslots = input_abs_get_max(vi->idev, ABS_MT_SLOT) + 1;
+			err = input_mt_init_slots(vi->idev, nslots, 0);
+			if (err)
+				goto err_mt_init_slots;
+		}
+	}
+
+	virtio_device_ready(vdev);
+	vi->ready = true;
+	err = input_register_device(vi->idev);
+	if (err)
+		goto err_input_register;
+
+	virtinput_fill_evt(vi);
+	return 0;
+
+err_input_register:
+	spin_lock_irqsave(&vi->lock, flags);
+	vi->ready = false;
+	spin_unlock_irqrestore(&vi->lock, flags);
+err_mt_init_slots:
+	input_free_device(vi->idev);
+err_input_alloc:
+	vdev->config->del_vqs(vdev);
+err_init_vq:
+	kfree(vi);
+	return err;
+}
+
+/*
+ * Tear down a virtio-input device: stop event/status processing,
+ * unregister the input device, reset the virtio device, free status
+ * buffers that are still queued and release the virtqueues and state.
+ */
+static void virtinput_remove(struct virtio_device *vdev)
+{
+	struct virtio_input *vi = vdev->priv;
+	unsigned long irqflags;
+	void *pending;
+
+	spin_lock_irqsave(&vi->lock, irqflags);
+	vi->ready = false;
+	spin_unlock_irqrestore(&vi->lock, irqflags);
+
+	input_unregister_device(vi->idev);
+	virtio_reset_device(vdev);
+
+	for (;;) {
+		pending = virtqueue_detach_unused_buf(vi->sts);
+		if (!pending)
+			break;
+		kfree(pending);
+	}
+
+	vdev->config->del_vqs(vdev);
+	kfree(vi);
+}
+
+#ifdef CONFIG_PM_SLEEP
+/*
+ * Freeze hook: mark the device as not ready and delete the virtqueues.
+ * Always returns 0.
+ */
+static int virtinput_freeze(struct virtio_device *vdev)
+{
+	struct virtio_input *vi = vdev->priv;
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&vi->lock, irqflags);
+	vi->ready = false;
+	spin_unlock_irqrestore(&vi->lock, irqflags);
+
+	vdev->config->del_vqs(vdev);
+
+	return 0;
+}
+
+/*
+ * Restore hook: look up the virtqueues again, mark the device ready and
+ * refill the event queue. Returns 0 on success or the error from
+ * virtinput_init_vqs().
+ */
+static int virtinput_restore(struct virtio_device *vdev)
+{
+	struct virtio_input *vi = vdev->priv;
+	int rc = virtinput_init_vqs(vi);
+
+	if (rc)
+		return rc;
+
+	virtio_device_ready(vdev);
+	vi->ready = true;
+	virtinput_fill_evt(vi);
+
+	return 0;
+}
+#endif
+
+/* Feature bits handled by this driver: the table is intentionally empty. */
+static unsigned int features[] = {
+ /* none */
+};
+/* Match any vendor's virtio input device; terminated by a zero entry. */
+static const struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_INPUT, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+
+/*
+ * Driver description registered via module_virtio_driver(). The
+ * freeze/restore hooks are only provided when CONFIG_PM_SLEEP is enabled.
+ */
+static struct virtio_driver virtio_input_driver = {
+ .driver.name = KBUILD_MODNAME,
+ .driver.owner = THIS_MODULE,
+ .feature_table = features,
+ .feature_table_size = ARRAY_SIZE(features),
+ .id_table = id_table,
+ .probe = virtinput_probe,
+ .remove = virtinput_remove,
+#ifdef CONFIG_PM_SLEEP
+ .freeze = virtinput_freeze,
+ .restore = virtinput_restore,
+#endif
+};
+
+module_virtio_driver(virtio_input_driver);
+MODULE_DEVICE_TABLE(virtio, id_table);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Virtio input device driver");
+MODULE_AUTHOR("Gerd Hoffmann <kraxel@redhat.com>");
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
new file mode 100644
index 000000000..fa5226c19
--- /dev/null
+++ b/drivers/virtio/virtio_mem.c
@@ -0,0 +1,3008 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Virtio-mem device driver.
+ *
+ * Copyright Red Hat, Inc. 2020
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+
+#include <linux/virtio.h>
+#include <linux/virtio_mem.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/memory_hotplug.h>
+#include <linux/memory.h>
+#include <linux/hrtimer.h>
+#include <linux/crash_dump.h>
+#include <linux/mutex.h>
+#include <linux/bitmap.h>
+#include <linux/lockdep.h>
+#include <linux/log2.h>
+
+#include <acpi/acpi_numa.h>
+
+/* Whether to try unplugging online memory; enabled by default (mode 0644). */
+static bool unplug_online = true;
+module_param(unplug_online, bool, 0644);
+MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");
+
+/* Force Big Block Mode instead of auto-selecting the mode (mode 0444). */
+static bool force_bbm;
+module_param(force_bbm, bool, 0444);
+MODULE_PARM_DESC(force_bbm,
+ "Force Big Block Mode. Default is 0 (auto-selection)");
+
+/* Big Block size in bytes; 0 requests auto-detection (mode 0444). */
+static unsigned long bbm_block_size;
+module_param(bbm_block_size, ulong, 0444);
+MODULE_PARM_DESC(bbm_block_size,
+ "Big Block size in bytes. Default is 0 (auto-detection).");
+
+/*
+ * virtio-mem currently supports the following modes of operation:
+ *
+ * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
+ * size of a Sub Block (SB) is determined based on the device block size, the
+ * pageblock size, and the maximum allocation granularity of the buddy.
+ * Subblocks within a Linux memory block might either be plugged or unplugged.
+ * Memory is added/removed to Linux MM in Linux memory block granularity.
+ *
+ * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
+ * Memory is added/removed to Linux MM in Big Block granularity.
+ *
+ * The mode is determined automatically based on the Linux memory block size
+ * and the device block size.
+ *
+ * User space / core MM (auto onlining) is responsible for onlining added
+ * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are
+ * always onlined separately, and all memory within a Linux memory block is
+ * onlined to the same zone - virtio-mem relies on this behavior.
+ */
+
+/*
+ * State of a Linux memory block in SBM.
+ *
+ * The values are used as indices into sbm.mb_count[] and are stored, one
+ * byte per memory block, in sbm.mb_states.
+ */
+enum virtio_mem_sbm_mb_state {
+ /* Unplugged, not added to Linux. Can be reused later. */
+ VIRTIO_MEM_SBM_MB_UNUSED = 0,
+ /* (Partially) plugged, not added to Linux. Error on add_memory(). */
+ VIRTIO_MEM_SBM_MB_PLUGGED,
+ /* Fully plugged, fully added to Linux, offline. */
+ VIRTIO_MEM_SBM_MB_OFFLINE,
+ /* Partially plugged, fully added to Linux, offline. */
+ VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
+ /* Fully plugged, fully added to Linux, onlined to a kernel zone. */
+ VIRTIO_MEM_SBM_MB_KERNEL,
+ /* Partially plugged, fully added to Linux, onlined to a kernel zone. */
+ VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
+ /* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
+ VIRTIO_MEM_SBM_MB_MOVABLE,
+ /* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
+ VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
+ /* Number of states; sizes sbm.mb_count[]. */
+ VIRTIO_MEM_SBM_MB_COUNT
+};
+
+/*
+ * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks.
+ *
+ * The values are used as indices into bbm.bb_count[] and are stored, one
+ * byte per big block, in bbm.bb_states.
+ */
+enum virtio_mem_bbm_bb_state {
+ /* Unplugged, not added to Linux. Can be reused later. */
+ VIRTIO_MEM_BBM_BB_UNUSED = 0,
+ /* Plugged, not added to Linux. Error on add_memory(). */
+ VIRTIO_MEM_BBM_BB_PLUGGED,
+ /* Plugged and added to Linux. */
+ VIRTIO_MEM_BBM_BB_ADDED,
+ /* All online parts are fake-offline, ready to remove. */
+ VIRTIO_MEM_BBM_BB_FAKE_OFFLINE,
+ /* Number of states; sizes bbm.bb_count[]. */
+ VIRTIO_MEM_BBM_BB_COUNT
+};
+
+/* Driver state of one virtio-mem device. */
+struct virtio_mem {
+ struct virtio_device *vdev;
+
+ /* We might first have to unplug all memory when starting up. */
+ bool unplug_all_required;
+
+ /* Workqueue that processes the plug/unplug requests. */
+ struct work_struct wq;
+ atomic_t wq_active;
+ atomic_t config_changed;
+
+ /* Virtqueue for guest->host requests. */
+ struct virtqueue *vq;
+
+ /* Wait for a host response to a guest request. */
+ wait_queue_head_t host_resp;
+
+ /* Space for one guest request and the host response. */
+ struct virtio_mem_req req;
+ struct virtio_mem_resp resp;
+
+ /* The current size of the device. */
+ uint64_t plugged_size;
+ /* The requested size of the device. */
+ uint64_t requested_size;
+
+ /* The device block size (for communicating with the device). */
+ uint64_t device_block_size;
+ /* The determined node id for all memory of the device. */
+ int nid;
+ /* Physical start address of the memory region. */
+ uint64_t addr;
+ /* Maximum region size in bytes. */
+ uint64_t region_size;
+
+ /* The parent resource for all memory added via this device. */
+ struct resource *parent_resource;
+ /*
+ * Copy of "System RAM (virtio_mem)" to be used for
+ * add_memory_driver_managed().
+ */
+ const char *resource_name;
+ /* Memory group identification. */
+ int mgid;
+
+ /*
+ * We don't want to add too much memory if it's not getting onlined,
+ * to avoid running OOM. Besides this threshold, we allow to have at
+ * least two offline blocks at a time (whatever is bigger).
+ */
+#define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD (1024 * 1024 * 1024)
+ atomic64_t offline_size;
+ uint64_t offline_threshold;
+
+ /* If set, the driver is in SBM, otherwise in BBM. */
+ bool in_sbm;
+
+ union {
+ struct {
+ /* Id of the first memory block of this device. */
+ unsigned long first_mb_id;
+ /* Id of the last usable memory block of this device. */
+ unsigned long last_usable_mb_id;
+ /* Id of the next memory block to prepare when needed. */
+ unsigned long next_mb_id;
+
+ /* The subblock size. */
+ uint64_t sb_size;
+ /* The number of subblocks per Linux memory block. */
+ uint32_t sbs_per_mb;
+
+ /*
+ * Some of the Linux memory blocks tracked as "partially
+ * plugged" are completely unplugged and can be offlined
+ * and removed -- which previously failed.
+ */
+ bool have_unplugged_mb;
+
+ /* Summary of all memory block states. */
+ unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];
+
+ /*
+ * One byte state per memory block. Allocated via
+ * vmalloc(). Resized (alloc+copy+free) on demand.
+ *
+ * With 128 MiB memory blocks, we have states for 512
+ * GiB of memory in one 4 KiB page.
+ */
+ uint8_t *mb_states;
+
+ /*
+ * Bitmap: one bit per subblock. Allocated similar to
+ * sbm.mb_states.
+ *
+ * A set bit means the corresponding subblock is
+ * plugged, otherwise it's unplugged.
+ *
+ * With 4 MiB subblocks, we manage 128 GiB of memory
+ * in one 4 KiB page.
+ */
+ unsigned long *sb_states;
+ } sbm;
+
+ struct {
+ /* Id of the first big block of this device. */
+ unsigned long first_bb_id;
+ /* Id of the last usable big block of this device. */
+ unsigned long last_usable_bb_id;
+ /* Id of the next device block to prepare when needed. */
+ unsigned long next_bb_id;
+
+ /* Summary of all big block states. */
+ unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];
+
+ /* One byte state per big block. See sbm.mb_states. */
+ uint8_t *bb_states;
+
+ /* The block size used for plugging/adding/removing. */
+ uint64_t bb_size;
+ } bbm;
+ };
+
+ /*
+ * Mutex that protects the sbm.mb_count, sbm.mb_states,
+ * sbm.sb_states, bbm.bb_count, and bbm.bb_states
+ *
+ * When this lock is held the pointers can't change, ONLINE and
+ * OFFLINE blocks can't change the state and no subblocks will get
+ * plugged/unplugged.
+ *
+ * In kdump mode, used to serialize requests, last_block_addr and
+ * last_block_plugged.
+ */
+ struct mutex hotplug_mutex;
+ bool hotplug_active;
+
+ /* An error occurred we cannot handle - stop processing requests. */
+ bool broken;
+
+ /* Cached value of is_kdump_kernel() when the device was probed. */
+ bool in_kdump;
+
+ /* The driver is being removed. */
+ spinlock_t removal_lock;
+ bool removing;
+
+ /* Timer for retrying to plug/unplug memory. */
+ struct hrtimer retry_timer;
+ unsigned int retry_timer_ms;
+#define VIRTIO_MEM_RETRY_TIMER_MIN_MS 50000
+#define VIRTIO_MEM_RETRY_TIMER_MAX_MS 300000
+
+ /* Memory notifier (online/offline events). */
+ struct notifier_block memory_notifier;
+
+#ifdef CONFIG_PROC_VMCORE
+ /* vmcore callback for /proc/vmcore handling in kdump mode */
+ struct vmcore_cb vmcore_cb;
+ uint64_t last_block_addr;
+ bool last_block_plugged;
+#endif /* CONFIG_PROC_VMCORE */
+
+ /* Next device in the list of virtio-mem devices. */
+ struct list_head next;
+};
+
+/*
+ * We have to share a single online_page callback among all virtio-mem
+ * devices. We use RCU to iterate the list in the callback.
+ *
+ * virtio_mem_mutex is held while devices are added to or removed from
+ * virtio_mem_devices and while the callback is set or restored.
+ */
+static DEFINE_MUTEX(virtio_mem_mutex);
+static LIST_HEAD(virtio_mem_devices);
+
+static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
+static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
+ unsigned long nr_pages);
+static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
+ unsigned long nr_pages);
+static void virtio_mem_retry(struct virtio_mem *vm);
+static int virtio_mem_create_resource(struct virtio_mem *vm);
+static void virtio_mem_delete_resource(struct virtio_mem *vm);
+
+/*
+ * Add a virtio-mem device to the list of devices considered by the
+ * online_page callback. The callback is installed when the list was empty
+ * before; if installing it fails, the device is not added and the error
+ * is returned.
+ */
+static int register_virtio_mem_device(struct virtio_mem *vm)
+{
+	int rc;
+
+	mutex_lock(&virtio_mem_mutex);
+
+	/* First device registers the callback. */
+	rc = list_empty(&virtio_mem_devices) ?
+	     set_online_page_callback(&virtio_mem_online_page_cb) : 0;
+	if (rc)
+		goto unlock;
+
+	list_add_rcu(&vm->next, &virtio_mem_devices);
+unlock:
+	mutex_unlock(&virtio_mem_mutex);
+	return rc;
+}
+
+/*
+ * Remove a virtio-mem device from the list of devices considered by the
+ * online_page callback and wait for an RCU grace period afterwards. The
+ * callback is restored once the list became empty.
+ */
+static void unregister_virtio_mem_device(struct virtio_mem *vm)
+{
+	bool was_last;
+
+	mutex_lock(&virtio_mem_mutex);
+	list_del_rcu(&vm->next);
+
+	/* Last device unregisters the callback. */
+	was_last = list_empty(&virtio_mem_devices);
+	if (was_last)
+		restore_online_page_callback(&virtio_mem_online_page_cb);
+	mutex_unlock(&virtio_mem_mutex);
+
+	synchronize_rcu();
+}
+
+/*
+ * Translate a physical address into the id of the Linux memory block
+ * that contains it.
+ */
+static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr)
+{
+	const unsigned long mb_size = memory_block_size_bytes();
+
+	return addr / mb_size;
+}
+
+/*
+ * Translate a Linux memory block id into the physical start address of
+ * that memory block.
+ */
+static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
+{
+	const unsigned long mb_size = memory_block_size_bytes();
+
+	return mb_id * mb_size;
+}
+
+/*
+ * Translate a physical address into the id of the big block that
+ * contains it.
+ */
+static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm,
+					      uint64_t addr)
+{
+	const uint64_t bb_size = vm->bbm.bb_size;
+
+	return addr / bb_size;
+}
+
+/*
+ * Translate a big block id into the physical start address of that big
+ * block.
+ */
+static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm,
+					 unsigned long bb_id)
+{
+	const uint64_t bb_size = vm->bbm.bb_size;
+
+	return bb_id * bb_size;
+}
+
+/*
+ * Translate a physical address into the id of the subblock that contains
+ * it, counted from the start of its Linux memory block.
+ */
+static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
+					      unsigned long addr)
+{
+	/* Start address of the memory block that contains addr. */
+	const unsigned long mb_start =
+		virtio_mem_mb_id_to_phys(virtio_mem_phys_to_mb_id(addr));
+	const unsigned long offset = addr - mb_start;
+
+	return offset / vm->sbm.sb_size;
+}
+
+/*
+ * Change the state of a big block and move one count from the old state's
+ * counter to the new state's counter.
+ */
+static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm,
+					unsigned long bb_id,
+					enum virtio_mem_bbm_bb_state state)
+{
+	uint8_t *slot = &vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id];
+	const enum virtio_mem_bbm_bb_state prev = *slot;
+
+	*slot = state;
+
+	/* The old state must have been accounted before. */
+	BUG_ON(vm->bbm.bb_count[prev] == 0);
+	vm->bbm.bb_count[prev]--;
+	vm->bbm.bb_count[state]++;
+}
+
+/*
+ * Look up the recorded state of a big block.
+ */
+static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm,
+								unsigned long bb_id)
+{
+	const unsigned long idx = bb_id - vm->bbm.first_bb_id;
+
+	return vm->bbm.bb_states[idx];
+}
+
+/*
+ * Make sure the big block state array has room for the state of the next
+ * big block, growing it by reallocating (alloc + copy + free) whenever
+ * another page is needed.
+ *
+ * Returns 0 on success and -ENOMEM if the new array cannot be allocated.
+ */
+static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm)
+{
+	const unsigned long cur_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id;
+	const unsigned long want_bytes = cur_bytes + 1;
+	const int cur_pages = PFN_UP(cur_bytes);
+	const int want_pages = PFN_UP(want_bytes);
+	uint8_t *grown;
+
+	/* The existing array still has room for one more state byte. */
+	if (vm->bbm.bb_states && cur_pages == want_pages)
+		return 0;
+
+	grown = vzalloc(want_pages * PAGE_SIZE);
+	if (!grown)
+		return -ENOMEM;
+
+	/* Swap the arrays under the mutex that protects bbm.bb_states. */
+	mutex_lock(&vm->hotplug_mutex);
+	if (vm->bbm.bb_states)
+		memcpy(grown, vm->bbm.bb_states, cur_pages * PAGE_SIZE);
+	vfree(vm->bbm.bb_states);
+	vm->bbm.bb_states = grown;
+	mutex_unlock(&vm->hotplug_mutex);
+
+	return 0;
+}
+
+/*
+ * Iterate over all big blocks of _vm that are in state _state, in
+ * ascending (virtio_mem_bbm_for_each_bb) or descending
+ * (virtio_mem_bbm_for_each_bb_rev) id order. _bb_id is the iteration
+ * variable. Iteration stops early once no big block is in _state anymore.
+ *
+ * Only the _vm argument is used to reach the device state; the macros do
+ * not depend on a variable named "vm" being in scope at the call site.
+ */
+#define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \
+	for (_bb_id = (_vm)->bbm.first_bb_id; \
+	     _bb_id < (_vm)->bbm.next_bb_id && (_vm)->bbm.bb_count[_state]; \
+	     _bb_id++) \
+		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
+
+#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \
+	for (_bb_id = (_vm)->bbm.next_bb_id - 1; \
+	     _bb_id >= (_vm)->bbm.first_bb_id && (_vm)->bbm.bb_count[_state]; \
+	     _bb_id--) \
+		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
+
+/*
+ * Change the state of a memory block and move one count from the old
+ * state's counter to the new state's counter.
+ */
+static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm,
+					unsigned long mb_id, uint8_t state)
+{
+	uint8_t *slot = &vm->sbm.mb_states[mb_id - vm->sbm.first_mb_id];
+	const uint8_t prev = *slot;
+
+	*slot = state;
+
+	/* The old state must have been accounted before. */
+	BUG_ON(vm->sbm.mb_count[prev] == 0);
+	vm->sbm.mb_count[prev]--;
+	vm->sbm.mb_count[state]++;
+}
+
+/*
+ * Look up the recorded state of a memory block.
+ */
+static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm,
+					   unsigned long mb_id)
+{
+	return vm->sbm.mb_states[mb_id - vm->sbm.first_mb_id];
+}
+
+/*
+ * Make sure the memory block state array has room for the state of the
+ * next memory block, growing it by reallocating (alloc + copy + free)
+ * whenever another page is needed.
+ *
+ * Returns 0 on success and -ENOMEM if the new array cannot be allocated.
+ */
+static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
+{
+	const int cur_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id);
+	const int want_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1);
+	uint8_t *grown;
+
+	/* The existing array still has room for one more state byte. */
+	if (vm->sbm.mb_states && cur_pages == want_pages)
+		return 0;
+
+	grown = vzalloc(want_pages * PAGE_SIZE);
+	if (!grown)
+		return -ENOMEM;
+
+	/* Swap the arrays under the mutex that protects sbm.mb_states. */
+	mutex_lock(&vm->hotplug_mutex);
+	if (vm->sbm.mb_states)
+		memcpy(grown, vm->sbm.mb_states, cur_pages * PAGE_SIZE);
+	vfree(vm->sbm.mb_states);
+	vm->sbm.mb_states = grown;
+	mutex_unlock(&vm->hotplug_mutex);
+
+	return 0;
+}
+
+/*
+ * Iterate, in ascending id order, over all memory blocks of _vm that are
+ * in state _state. _mb_id is the iteration variable. Iteration stops
+ * early once no memory block is in _state anymore.
+ */
+#define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \
+ for (_mb_id = _vm->sbm.first_mb_id; \
+ _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \
+ _mb_id++) \
+ if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
+
+/*
+ * Same as virtio_mem_sbm_for_each_mb(), but in descending id order,
+ * starting at the memory block before sbm.next_mb_id.
+ */
+#define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \
+ for (_mb_id = _vm->sbm.next_mb_id - 1; \
+ _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \
+ _mb_id--) \
+ if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
+
+/*
+ * Return the position in the subblock bitmap of subblock sb_id of memory
+ * block mb_id.
+ */
+static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm,
+					  unsigned long mb_id, int sb_id)
+{
+	const unsigned long mb_idx = mb_id - vm->sbm.first_mb_id;
+
+	return mb_idx * vm->sbm.sbs_per_mb + sb_id;
+}
+
+/*
+ * Record count subblocks, starting at sb_id of memory block mb_id, as
+ * plugged in the subblock bitmap.
+ *
+ * The state of the memory block is left untouched.
+ */
+static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm,
+					  unsigned long mb_id, int sb_id,
+					  int count)
+{
+	const int first = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
+
+	__bitmap_set(vm->sbm.sb_states, first, count);
+}
+
+/*
+ * Record count subblocks, starting at sb_id of memory block mb_id, as
+ * unplugged in the subblock bitmap.
+ *
+ * The state of the memory block is left untouched.
+ */
+static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm,
+					    unsigned long mb_id, int sb_id,
+					    int count)
+{
+	const int first = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
+
+	__bitmap_clear(vm->sbm.sb_states, first, count);
+}
+
+/*
+ * Return true if all count subblocks, starting at sb_id of memory block
+ * mb_id, are plugged.
+ */
+static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm,
+					   unsigned long mb_id, int sb_id,
+					   int count)
+{
+	const int first = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
+	const int end = first + count;
+
+	/* A single subblock only needs a single bit test. */
+	if (count == 1)
+		return test_bit(first, vm->sbm.sb_states);
+
+	/* TODO: Helper similar to bitmap_set() */
+	/* All plugged means: no zero bit inside [first, end). */
+	return find_next_zero_bit(vm->sbm.sb_states, end, first) >= end;
+}
+
+/*
+ * Return true if all count subblocks, starting at sb_id of memory block
+ * mb_id, are unplugged.
+ */
+static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm,
+					     unsigned long mb_id, int sb_id,
+					     int count)
+{
+	const int first = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
+	const int end = first + count;
+
+	/* TODO: Helper similar to bitmap_set() */
+	/* All unplugged means: no set bit inside [first, end). */
+	return find_next_bit(vm->sbm.sb_states, end, first) >= end;
+}
+
+/*
+ * Return the id of the first unplugged subblock of memory block mb_id, or
+ * vm->sbm.sbs_per_mb if all of its subblocks are plugged.
+ */
+static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm,
+					     unsigned long mb_id)
+{
+	const int first = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0);
+	const unsigned long first_zero =
+		find_next_zero_bit(vm->sbm.sb_states,
+				   first + vm->sbm.sbs_per_mb, first);
+
+	/* Convert the bitmap position back into a subblock id. */
+	return first_zero - first;
+}
+
+/*
+ * Make sure the subblock bitmap has room for the subblocks of the next
+ * memory block, growing it by reallocating (alloc + copy + free) whenever
+ * another page is needed.
+ *
+ * Returns 0 on success and -ENOMEM if the new bitmap cannot be allocated.
+ */
+static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm)
+{
+	const unsigned long nr_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id;
+	const unsigned long cur_bits = nr_mb * vm->sbm.sbs_per_mb;
+	const unsigned long want_bits = (nr_mb + 1) * vm->sbm.sbs_per_mb;
+	const int cur_pages = PFN_UP(BITS_TO_LONGS(cur_bits) * sizeof(long));
+	const int want_pages = PFN_UP(BITS_TO_LONGS(want_bits) * sizeof(long));
+	unsigned long *grown, *stale;
+
+	/* The existing bitmap still has room for one more memory block. */
+	if (vm->sbm.sb_states && cur_pages == want_pages)
+		return 0;
+
+	grown = vzalloc(want_pages * PAGE_SIZE);
+	if (!grown)
+		return -ENOMEM;
+
+	/* Swap the bitmaps under the mutex that protects sbm.sb_states. */
+	mutex_lock(&vm->hotplug_mutex);
+	stale = vm->sbm.sb_states;
+	if (stale)
+		memcpy(grown, stale, cur_pages * PAGE_SIZE);
+	vm->sbm.sb_states = grown;
+	mutex_unlock(&vm->hotplug_mutex);
+
+	vfree(stale);
+	return 0;
+}
+
+/*
+ * Check whether adding size bytes would keep the amount of offline memory
+ * within vm->offline_threshold - to avoid running OOM if memory is getting
+ * onlined deferred. Warns once and returns false if size alone already
+ * exceeds the threshold.
+ */
+static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size)
+{
+	uint64_t offline_after;
+
+	if (WARN_ON_ONCE(size > vm->offline_threshold))
+		return false;
+
+	offline_after = atomic64_read(&vm->offline_size) + size;
+	return offline_after <= vm->offline_threshold;
+}
+
+/*
+ * Try adding memory to Linux. Will usually only fail if out of memory.
+ *
+ * The size is added to vm->offline_size before the memory is added and
+ * subtracted again if adding fails.
+ *
+ * Must not be called with the vm->hotplug_mutex held (possible deadlock with
+ * onlining code).
+ *
+ * Will not modify the state of memory blocks in virtio-mem.
+ *
+ * Returns 0 on success, -ENOMEM if the resource name cannot be allocated,
+ * or the error returned by add_memory_driver_managed().
+ */
+static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
+				 uint64_t size)
+{
+	int rc;
+
+	/*
+	 * When force-unloading the driver and we still have memory added to
+	 * Linux, the resource name has to stay.
+	 */
+	if (!vm->resource_name) {
+		vm->resource_name = kstrdup_const("System RAM (virtio_mem)",
+						  GFP_KERNEL);
+		if (!vm->resource_name)
+			return -ENOMEM;
+	}
+
+	dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr,
+		addr + size - 1);
+
+	/* Memory might get onlined immediately. */
+	atomic64_add(size, &vm->offline_size);
+	rc = add_memory_driver_managed(vm->mgid, addr, size, vm->resource_name,
+				       MHP_MERGE_RESOURCE | MHP_NID_IS_MGID);
+	if (!rc)
+		return 0;
+
+	atomic64_sub(size, &vm->offline_size);
+	dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
+	/*
+	 * TODO: Linux MM does not properly clean up yet in all cases
+	 * where adding of memory failed - especially on -ENOMEM.
+	 */
+	return rc;
+}
+
+/*
+ * See virtio_mem_add_memory(): Try adding the single Linux memory block
+ * with the given id.
+ */
+static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id)
+{
+	const uint64_t mb_start = virtio_mem_mb_id_to_phys(mb_id);
+	const uint64_t mb_size = memory_block_size_bytes();
+
+	return virtio_mem_add_memory(vm, mb_start, mb_size);
+}
+
+/*
+ * See virtio_mem_add_memory(): Try adding the big block with the given
+ * id.
+ */
+static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id)
+{
+	const uint64_t bb_start = virtio_mem_bb_id_to_phys(vm, bb_id);
+	const uint64_t bb_size = vm->bbm.bb_size;
+
+	return virtio_mem_add_memory(vm, bb_start, bb_size);
+}
+
+/*
+ * Try removing memory from Linux. Will only fail if memory blocks aren't
+ * offline.
+ *
+ * On success the size is subtracted from vm->offline_size and a retry of
+ * the plug/unplug processing is triggered.
+ *
+ * Must not be called with the vm->hotplug_mutex held (possible deadlock with
+ * onlining code).
+ *
+ * Will not modify the state of memory blocks in virtio-mem.
+ *
+ * Returns the result of remove_memory().
+ */
+static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
+				    uint64_t size)
+{
+	int rc;
+
+	dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
+		addr + size - 1);
+
+	rc = remove_memory(addr, size);
+	if (rc) {
+		dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc);
+		return rc;
+	}
+
+	atomic64_sub(size, &vm->offline_size);
+	/*
+	 * We might have freed up memory we can now unplug, retry
+	 * immediately instead of waiting.
+	 */
+	virtio_mem_retry(vm);
+	return rc;
+}
+
+/*
+ * Remove the single Linux memory block identified by @mb_id. Thin wrapper
+ * around virtio_mem_remove_memory(); the same calling rules apply.
+ */
+static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id)
+{
+	const uint64_t mb_start = virtio_mem_mb_id_to_phys(mb_id);
+	const uint64_t mb_len = memory_block_size_bytes();
+
+	return virtio_mem_remove_memory(vm, mb_start, mb_len);
+}
+
+/*
+ * Try offlining and removing memory from Linux.
+ *
+ * Must not be called with the vm->hotplug_mutex held (possible deadlock with
+ * onlining code).
+ *
+ * Will not modify the state of memory blocks in virtio-mem.
+ *
+ * Returns 0 on success. On failure only -ENOMEM or -EBUSY is returned: any
+ * other error is warned about once and reported as -EBUSY.
+ */
+static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
+						uint64_t addr,
+						uint64_t size)
+{
+	int rc;
+
+	dev_dbg(&vm->vdev->dev,
+		"offlining and removing memory: 0x%llx - 0x%llx\n", addr,
+		addr + size - 1);
+
+	rc = offline_and_remove_memory(addr, size);
+	if (rc) {
+		dev_dbg(&vm->vdev->dev,
+			"offlining and removing memory failed: %d\n", rc);
+		/*
+		 * We don't really expect this to fail, because we fake-offlined
+		 * all memory already. But it could fail in corner cases.
+		 */
+		WARN_ON_ONCE(rc != -ENOMEM && rc != -EBUSY);
+		if (rc == -ENOMEM)
+			return -ENOMEM;
+		return -EBUSY;
+	}
+
+	atomic64_sub(size, &vm->offline_size);
+	/*
+	 * We might have freed up memory we can now unplug, retry
+	 * immediately instead of waiting.
+	 */
+	virtio_mem_retry(vm);
+	return 0;
+}
+
+/*
+ * Offline and remove the single Linux memory block identified by @mb_id.
+ * Thin wrapper around virtio_mem_offline_and_remove_memory(); the same
+ * calling rules apply.
+ */
+static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm,
+						unsigned long mb_id)
+{
+	const uint64_t mb_start = virtio_mem_mb_id_to_phys(mb_id);
+	const uint64_t mb_len = memory_block_size_bytes();
+
+	return virtio_mem_offline_and_remove_memory(vm, mb_start, mb_len);
+}
+
+/*
+ * Try (offlining and) removing memory from Linux in case all subblocks are
+ * unplugged. Can be called on online and offline memory blocks.
+ *
+ * May modify the state of memory blocks in virtio-mem: on successful removal
+ * the block is marked VIRTIO_MEM_SBM_MB_UNUSED.
+ *
+ * Temporarily releases vm->hotplug_mutex around the removal and re-acquires
+ * it before returning.
+ */
+static int virtio_mem_sbm_try_remove_unplugged_mb(struct virtio_mem *vm,
+						  unsigned long mb_id)
+{
+	int rc;
+
+	/* Nothing to do while at least one subblock is still plugged. */
+	if (!virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
+		return 0;
+
+	/* offline_and_remove_memory() works for online and offline memory. */
+	mutex_unlock(&vm->hotplug_mutex);
+	rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id);
+	mutex_lock(&vm->hotplug_mutex);
+	if (rc)
+		return rc;
+
+	virtio_mem_sbm_set_mb_state(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED);
+	return 0;
+}
+
+/*
+ * See virtio_mem_offline_and_remove_memory(): Try to offline and remove
+ * all Linux memory blocks covered by the big block.
+ */
+static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm,
+						unsigned long bb_id)
+{
+	const uint64_t bb_start = virtio_mem_bb_id_to_phys(vm, bb_id);
+	const uint64_t bb_len = vm->bbm.bb_size;
+
+	return virtio_mem_offline_and_remove_memory(vm, bb_start, bb_len);
+}
+
+/*
+ * Trigger the workqueue so the device can perform its magic: queue vm->wq
+ * on system_freezable_wq, unless the device is being removed. The
+ * vm->removing flag is checked under vm->removal_lock.
+ */
+static void virtio_mem_retry(struct virtio_mem *vm)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&vm->removal_lock, irq_flags);
+	if (!vm->removing)
+		queue_work(system_freezable_wq, &vm->wq);
+	spin_unlock_irqrestore(&vm->removal_lock, irq_flags);
+}
+
+/*
+ * Translate the node id provided by the device into a Linux NUMA node.
+ * Returns NUMA_NO_NODE unless CONFIG_ACPI_NUMA is enabled and the device
+ * offers VIRTIO_MEM_F_ACPI_PXM, in which case @node_id is mapped via
+ * pxm_to_node().
+ */
+static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id)
+{
+#if defined(CONFIG_ACPI_NUMA)
+	if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM))
+		return pxm_to_node(node_id);
+#endif
+	return NUMA_NO_NODE;
+}
+
+/*
+ * Test if a virtio-mem device overlaps with the given range, i.e., whether
+ * [start, start + size) intersects [vm->addr, vm->addr + vm->region_size).
+ * Can be called from (notifier) callbacks lockless.
+ */
+static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start,
+				      uint64_t size)
+{
+	const uint64_t range_end = start + size;
+	const uint64_t region_end = vm->addr + vm->region_size;
+
+	return start < region_end && vm->addr < range_end;
+}
+
+/*
+ * Test if a virtio-mem device contains a given range, i.e., whether
+ * [start, start + size) lies completely within
+ * [vm->addr, vm->addr + vm->region_size). Can be called from (notifier)
+ * callbacks lockless.
+ */
+static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start,
+				      uint64_t size)
+{
+	const uint64_t range_end = start + size;
+	const uint64_t region_end = vm->addr + vm->region_size;
+
+	return start >= vm->addr && range_end <= region_end;
+}
+
+/*
+ * Decide whether a memory block may be onlined: only blocks in state
+ * OFFLINE or OFFLINE_PARTIAL are accepted (NOTIFY_OK). Anything else is
+ * denied with a rate-limited warning (NOTIFY_BAD).
+ */
+static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm,
+					      unsigned long mb_id)
+{
+	const int mb_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
+
+	if (mb_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL ||
+	    mb_state == VIRTIO_MEM_SBM_MB_OFFLINE)
+		return NOTIFY_OK;
+
+	dev_warn_ratelimited(&vm->vdev->dev,
+			     "memory block onlining denied\n");
+	return NOTIFY_BAD;
+}
+
+/*
+ * A memory block went offline: move it from its online state
+ * (KERNEL/MOVABLE, possibly *_PARTIAL) to the matching offline state.
+ * Any other current state is a bug.
+ */
+static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm,
+					  unsigned long mb_id)
+{
+	int offline_state;
+
+	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
+	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
+	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
+		offline_state = VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL;
+		break;
+	case VIRTIO_MEM_SBM_MB_KERNEL:
+	case VIRTIO_MEM_SBM_MB_MOVABLE:
+		offline_state = VIRTIO_MEM_SBM_MB_OFFLINE;
+		break;
+	default:
+		BUG();
+		return;
+	}
+	virtio_mem_sbm_set_mb_state(vm, mb_id, offline_state);
+}
+
+/*
+ * A memory block went online: move it from OFFLINE/OFFLINE_PARTIAL to the
+ * matching online state. The MOVABLE variants are used when the first page
+ * of the block sits in ZONE_MOVABLE, the KERNEL variants otherwise. Any
+ * other current state is a bug.
+ */
+static void virtio_mem_sbm_notify_online(struct virtio_mem *vm,
+					 unsigned long mb_id,
+					 unsigned long start_pfn)
+{
+	const bool movable = is_zone_movable_page(pfn_to_page(start_pfn));
+	int online_state;
+
+	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
+	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
+		online_state = movable ? VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL :
+					 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL;
+		break;
+	case VIRTIO_MEM_SBM_MB_OFFLINE:
+		online_state = movable ? VIRTIO_MEM_SBM_MB_MOVABLE :
+					 VIRTIO_MEM_SBM_MB_KERNEL;
+		break;
+	default:
+		BUG();
+		break;
+	}
+	virtio_mem_sbm_set_mb_state(vm, mb_id, online_state);
+}
+
+/*
+ * A memory block is about to go offline: for every subblock that is not
+ * plugged, hand its pages to virtio_mem_fake_offline_going_offline().
+ */
+static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm,
+						unsigned long mb_id)
+{
+	const unsigned long pages_per_sb = PFN_DOWN(vm->sbm.sb_size);
+	int sb_id;
+
+	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
+		unsigned long sb_pfn;
+
+		/* Plugged subblocks need no special treatment here. */
+		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
+			continue;
+
+		sb_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
+				  sb_id * vm->sbm.sb_size);
+		virtio_mem_fake_offline_going_offline(sb_pfn, pages_per_sb);
+	}
+}
+
+/*
+ * Offlining of a memory block was canceled: for every subblock that is not
+ * plugged, hand its pages to virtio_mem_fake_offline_cancel_offline().
+ */
+static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm,
+						 unsigned long mb_id)
+{
+	const unsigned long pages_per_sb = PFN_DOWN(vm->sbm.sb_size);
+	int sb_id;
+
+	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
+		unsigned long sb_pfn;
+
+		/* Plugged subblocks need no special treatment here. */
+		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
+			continue;
+
+		sb_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
+				  sb_id * vm->sbm.sb_size);
+		virtio_mem_fake_offline_cancel_offline(sb_pfn, pages_per_sb);
+	}
+}
+
+/*
+ * Part of a big block is about to go offline. Only a big block in state
+ * FAKE_OFFLINE needs handling of the affected page range.
+ */
+static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm,
+						unsigned long bb_id,
+						unsigned long pfn,
+						unsigned long nr_pages)
+{
+	/*
+	 * When marked as "fake-offline", all online memory of this device block
+	 * is allocated by us. Otherwise, we don't have any memory allocated.
+	 */
+	if (virtio_mem_bbm_get_bb_state(vm, bb_id) ==
+	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
+		virtio_mem_fake_offline_going_offline(pfn, nr_pages);
+}
+
+/*
+ * Offlining of part of a big block was canceled. Only a big block in state
+ * FAKE_OFFLINE needs handling of the affected page range.
+ */
+static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm,
+						 unsigned long bb_id,
+						 unsigned long pfn,
+						 unsigned long nr_pages)
+{
+	if (virtio_mem_bbm_get_bb_state(vm, bb_id) ==
+	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
+		virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
+}
+
+/*
+ * Memory hotplug notifier callback.
+ *
+ * This callback will either be called synchronously from add_memory() or
+ * asynchronously (e.g., triggered via user space). We have to be careful
+ * with locking when calling add_memory().
+ *
+ * Ranges that do not overlap with this device are ignored (NOTIFY_DONE).
+ * In SBM, the range must be exactly one aligned Linux memory block; in BBM,
+ * it must not cross a big block boundary. Otherwise NOTIFY_BAD is returned.
+ *
+ * Locking: MEM_GOING_OFFLINE and MEM_GOING_ONLINE take vm->hotplug_mutex and
+ * set vm->hotplug_active (unless the device is being removed, in which case
+ * -EBUSY is signaled and the mutex is dropped again). The mutex stays held
+ * across notifier invocations and is released by the matching
+ * MEM_OFFLINE/MEM_ONLINE or MEM_CANCEL_* action. The MEM_CANCEL_* actions
+ * only unlock if vm->hotplug_active is set.
+ */
+static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
+ unsigned long action, void *arg)
+{
+ struct virtio_mem *vm = container_of(nb, struct virtio_mem,
+ memory_notifier);
+ struct memory_notify *mhp = arg;
+ const unsigned long start = PFN_PHYS(mhp->start_pfn);
+ const unsigned long size = PFN_PHYS(mhp->nr_pages);
+ int rc = NOTIFY_OK;
+ unsigned long id;
+
+ if (!virtio_mem_overlaps_range(vm, start, size))
+ return NOTIFY_DONE;
+
+ if (vm->in_sbm) {
+ id = virtio_mem_phys_to_mb_id(start);
+ /*
+ * In SBM, we add memory in separate memory blocks - we expect
+ * it to be onlined/offlined in the same granularity. Bail out
+ * if this ever changes.
+ */
+ if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
+ !IS_ALIGNED(start, memory_block_size_bytes())))
+ return NOTIFY_BAD;
+ } else {
+ id = virtio_mem_phys_to_bb_id(vm, start);
+ /*
+ * In BBM, we only care about onlining/offlining happening
+ * within a single big block, we don't care about the
+ * actual granularity as we don't track individual Linux
+ * memory blocks.
+ */
+ if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1)))
+ return NOTIFY_BAD;
+ }
+
+ /*
+ * Avoid circular locking lockdep warnings. We lock the mutex
+ * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The
+ * blocking_notifier_call_chain() has its own lock, which gets unlocked
+ * between both notifier calls and will bail out. False positive.
+ */
+ lockdep_off();
+
+ switch (action) {
+ case MEM_GOING_OFFLINE:
+ mutex_lock(&vm->hotplug_mutex);
+ if (vm->removing) {
+ rc = notifier_from_errno(-EBUSY);
+ mutex_unlock(&vm->hotplug_mutex);
+ break;
+ }
+ vm->hotplug_active = true;
+ if (vm->in_sbm)
+ virtio_mem_sbm_notify_going_offline(vm, id);
+ else
+ virtio_mem_bbm_notify_going_offline(vm, id,
+ mhp->start_pfn,
+ mhp->nr_pages);
+ break;
+ case MEM_GOING_ONLINE:
+ mutex_lock(&vm->hotplug_mutex);
+ if (vm->removing) {
+ rc = notifier_from_errno(-EBUSY);
+ mutex_unlock(&vm->hotplug_mutex);
+ break;
+ }
+ vm->hotplug_active = true;
+ if (vm->in_sbm)
+ rc = virtio_mem_sbm_notify_going_online(vm, id);
+ break;
+ case MEM_OFFLINE:
+ if (vm->in_sbm)
+ virtio_mem_sbm_notify_offline(vm, id);
+
+ atomic64_add(size, &vm->offline_size);
+ /*
+ * Trigger the workqueue. Now that we have some offline memory,
+ * maybe we can handle pending unplug requests.
+ */
+ if (!unplug_online)
+ virtio_mem_retry(vm);
+
+ vm->hotplug_active = false;
+ mutex_unlock(&vm->hotplug_mutex);
+ break;
+ case MEM_ONLINE:
+ if (vm->in_sbm)
+ virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn);
+
+ atomic64_sub(size, &vm->offline_size);
+ /*
+ * Start adding more memory once we onlined half of our
+ * threshold. Don't trigger if it's possibly due to our action
+ * (e.g., us adding memory which gets onlined immediately from
+ * the core).
+ */
+ if (!atomic_read(&vm->wq_active) &&
+ virtio_mem_could_add_memory(vm, vm->offline_threshold / 2))
+ virtio_mem_retry(vm);
+
+ vm->hotplug_active = false;
+ mutex_unlock(&vm->hotplug_mutex);
+ break;
+ case MEM_CANCEL_OFFLINE:
+ if (!vm->hotplug_active)
+ break;
+ if (vm->in_sbm)
+ virtio_mem_sbm_notify_cancel_offline(vm, id);
+ else
+ virtio_mem_bbm_notify_cancel_offline(vm, id,
+ mhp->start_pfn,
+ mhp->nr_pages);
+ vm->hotplug_active = false;
+ mutex_unlock(&vm->hotplug_mutex);
+ break;
+ case MEM_CANCEL_ONLINE:
+ if (!vm->hotplug_active)
+ break;
+ vm->hotplug_active = false;
+ mutex_unlock(&vm->hotplug_mutex);
+ break;
+ default:
+ break;
+ }
+
+ lockdep_on();
+
+ return rc;
+}
+
+/*
+ * Mark every page in [pfn, pfn + nr_pages) PG_offline, bracketed by
+ * page_offline_begin()/page_offline_end(). Pages that were never onlined
+ * (via generic_online_page()) are additionally flagged PageDirty() so they
+ * can be told apart later, and get PG_reserved cleared.
+ */
+static void virtio_mem_set_fake_offline(unsigned long pfn,
+					unsigned long nr_pages, bool onlined)
+{
+	unsigned long i;
+
+	page_offline_begin();
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page = pfn_to_page(pfn + i);
+
+		__SetPageOffline(page);
+		if (onlined)
+			continue;
+
+		SetPageDirty(page);
+		/* FIXME: remove after cleanups */
+		ClearPageReserved(page);
+	}
+	page_offline_end();
+}
+
+/*
+ * Clear PG_offline on every page in [pfn, pfn + nr_pages). For pages that
+ * were never onlined (via generic_online_page()), also clear PageDirty().
+ */
+static void virtio_mem_clear_fake_offline(unsigned long pfn,
+					  unsigned long nr_pages, bool onlined)
+{
+	unsigned long i;
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page = pfn_to_page(pfn + i);
+
+		__ClearPageOffline(page);
+		if (onlined)
+			continue;
+		ClearPageDirty(page);
+	}
+}
+
+/*
+ * Release a range of fake-offline pages to the buddy, effectively
+ * fake-onlining them. The range is processed in chunks of the largest
+ * order (at most MAX_ORDER) to which both @pfn and @nr_pages are aligned.
+ */
+static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
+{
+	unsigned long order = MAX_ORDER;
+	unsigned long chunk, off;
+
+	/*
+	 * We might get called for ranges that don't cover properly aligned
+	 * MAX_ORDER pages; however, we can only online properly aligned
+	 * pages with an order of MAX_ORDER at maximum.
+	 */
+	while (!IS_ALIGNED(pfn | nr_pages, 1 << order))
+		order--;
+	chunk = 1 << order;
+
+	for (off = 0; off < nr_pages; off += chunk) {
+		struct page *page = pfn_to_page(pfn + off);
+
+		/*
+		 * If the page is PageDirty(), it was kept fake-offline when
+		 * onlining the memory block. Otherwise, it was allocated
+		 * using alloc_contig_range(). All pages in a subblock are
+		 * alike.
+		 */
+		if (PageDirty(page)) {
+			/* Never onlined: hand it to the generic onlining path. */
+			virtio_mem_clear_fake_offline(pfn + off, chunk, false);
+			generic_online_page(page, order);
+			continue;
+		}
+
+		/* Previously allocated by us: free it and fix up accounting. */
+		virtio_mem_clear_fake_offline(pfn + off, chunk, true);
+		free_contig_range(pfn + off, chunk);
+		adjust_managed_page_count(page, chunk);
+	}
+}
+
+/*
+ * Try to allocate a range, marking pages fake-offline, effectively
+ * fake-offlining them.
+ *
+ * Returns 0 on success, -EAGAIN if the device config changed, -ENOMEM if
+ * the allocation ran out of memory and -EBUSY if the range could not be
+ * allocated otherwise.
+ */
+static int virtio_mem_fake_offline(struct virtio_mem *vm, unsigned long pfn,
+				   unsigned long nr_pages)
+{
+	const bool is_movable = is_zone_movable_page(pfn_to_page(pfn));
+	int attempt, rc;
+
+	/*
+	 * TODO: We want an alloc_contig_range() mode that tries to allocate
+	 * harder (e.g., dealing with temporarily pinned pages, PCP), especially
+	 * with ZONE_MOVABLE. So for now, retry a couple of times with
+	 * ZONE_MOVABLE before giving up - because that zone is supposed to give
+	 * some guarantees.
+	 */
+	for (attempt = 0; attempt < 5; attempt++) {
+		/*
+		 * If the config changed, stop immediately and go back to the
+		 * main loop: avoid trying to keep unplugging if the device
+		 * might have decided to not remove any more memory.
+		 */
+		if (atomic_read(&vm->config_changed))
+			return -EAGAIN;
+
+		rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE,
+					GFP_KERNEL);
+		if (!rc) {
+			/* Got the range: mark it and adjust the accounting. */
+			virtio_mem_set_fake_offline(pfn, nr_pages, true);
+			adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
+			return 0;
+		}
+
+		/* whoops, out of memory */
+		if (rc == -ENOMEM)
+			return rc;
+
+		/* Only retry for ZONE_MOVABLE. */
+		if (!is_movable)
+			break;
+	}
+
+	return -EBUSY;
+}
+
+/*
+ * Handle fake-offline pages when memory is going offline - such that the
+ * pages can be skipped by mm-core when offlining.
+ */
+static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
+						  unsigned long nr_pages)
+{
+	unsigned long i;
+
+	/*
+	 * Add the unplugged pages to the managed page counters, so the
+	 * offlining code can correctly subtract them again.
+	 */
+	adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
+
+	/* Drop our reference to the pages so the memory can get offlined. */
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page = pfn_to_page(pfn + i);
+
+		if (WARN_ON(!page_ref_dec_and_test(page)))
+			dump_page(page, "fake-offline page referenced");
+	}
+}
+
+/*
+ * Handle fake-offline pages when memory offlining is canceled - to undo
+ * what we did in virtio_mem_fake_offline_going_offline().
+ */
+static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
+						   unsigned long nr_pages)
+{
+	unsigned long i;
+
+	/* Subtract the unplugged pages from the managed page counters again. */
+	adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
+
+	/* Re-take the reference we dropped when going offline. */
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page = pfn_to_page(pfn + i);
+
+		page_ref_inc(page);
+	}
+}
+
+/*
+ * Online a page of the given order that belongs to this device: ranges
+ * backed by plugged memory are handed to generic_online_page(), all other
+ * ranges are kept fake-offline via virtio_mem_set_fake_offline().
+ */
+static void virtio_mem_online_page(struct virtio_mem *vm,
+ struct page *page, unsigned int order)
+{
+ const unsigned long start = page_to_phys(page);
+ const unsigned long end = start + PFN_PHYS(1 << order);
+ unsigned long addr, next, id, sb_id, count;
+ bool do_online;
+
+ /*
+ * We can get called with any order up to MAX_ORDER. If our subblock
+ * size is smaller than that and we have a mixture of plugged and
+ * unplugged subblocks within such a page, we have to process in
+ * smaller granularity. In that case we'll adjust the order exactly once
+ * within the loop.
+ */
+ for (addr = start; addr < end; ) {
+ next = addr + PFN_PHYS(1 << order);
+
+ if (vm->in_sbm) {
+ id = virtio_mem_phys_to_mb_id(addr);
+ sb_id = virtio_mem_phys_to_sb_id(vm, addr);
+ count = virtio_mem_phys_to_sb_id(vm, next - 1) - sb_id + 1;
+
+ if (virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, count)) {
+ /* Fully plugged. */
+ do_online = true;
+ } else if (count == 1 ||
+ virtio_mem_sbm_test_sb_unplugged(vm, id, sb_id, count)) {
+ /* Fully unplugged. */
+ do_online = false;
+ } else {
+ /*
+ * Mixture, process sub-blocks instead. This
+ * will be at least the size of a pageblock.
+ * We'll run into this case exactly once.
+ *
+ * The loop is restarted for the same addr with
+ * the reduced order. NOTE(review): the value
+ * stored to do_online here is never read, as it
+ * is recomputed on the next iteration before
+ * being used.
+ */
+ order = ilog2(vm->sbm.sb_size) - PAGE_SHIFT;
+ do_online = virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, 1);
+ continue;
+ }
+ } else {
+ /*
+ * If the whole block is marked fake offline, keep
+ * everything that way.
+ */
+ id = virtio_mem_phys_to_bb_id(vm, addr);
+ do_online = virtio_mem_bbm_get_bb_state(vm, id) !=
+ VIRTIO_MEM_BBM_BB_FAKE_OFFLINE;
+ }
+
+ if (do_online)
+ generic_online_page(pfn_to_page(PFN_DOWN(addr)), order);
+ else
+ virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order,
+ false);
+ addr = next;
+ }
+}
+
+/*
+ * Page onlining callback: find the virtio-mem device that contains the page
+ * and let virtio_mem_online_page() handle it. Pages not contained in any
+ * device on the virtio_mem_devices list are onlined via
+ * generic_online_page().
+ */
+static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
+{
+ const unsigned long addr = page_to_phys(page);
+ struct virtio_mem *vm;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(vm, &virtio_mem_devices, next) {
+ /*
+ * Pages we're onlining will never cross memory blocks and,
+ * therefore, not virtio-mem devices.
+ */
+ if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order)))
+ continue;
+
+ /*
+ * virtio_mem_set_fake_offline() might sleep. We can safely
+ * drop the RCU lock at this point because the device
+ * cannot go away. See virtio_mem_remove() how races
+ * between memory onlining and device removal are handled.
+ */
+ rcu_read_unlock();
+
+ virtio_mem_online_page(vm, page, order);
+ return;
+ }
+ rcu_read_unlock();
+
+ /* not virtio-mem memory, but e.g., a DIMM. online it */
+ generic_online_page(page, order);
+}
+
+/*
+ * Send a request to the device and wait for the response.
+ *
+ * The request is copied into vm->req and the response is received into
+ * vm->resp. Returns the response type converted to CPU byte order.
+ *
+ * NOTE(review): if virtqueue_add_sgs() fails, the negative error code is
+ * returned through the uint64_t return type. The callers in this file
+ * switch on the VIRTIO_MEM_RESP_* values and let everything else fall into
+ * their default case.
+ */
+static uint64_t virtio_mem_send_request(struct virtio_mem *vm,
+ const struct virtio_mem_req *req)
+{
+ struct scatterlist *sgs[2], sg_req, sg_resp;
+ unsigned int len;
+ int rc;
+
+ /* don't use the request residing on the stack (vaddr) */
+ vm->req = *req;
+
+ /* out: buffer for request */
+ sg_init_one(&sg_req, &vm->req, sizeof(vm->req));
+ sgs[0] = &sg_req;
+
+ /* in: buffer for response */
+ sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp));
+ sgs[1] = &sg_resp;
+
+ rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL);
+ if (rc < 0)
+ return rc;
+
+ virtqueue_kick(vm->vq);
+
+ /* wait for a response */
+ wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len));
+
+ return virtio16_to_cpu(vm->vdev, vm->resp.type);
+}
+
+/*
+ * Ask the device to plug [addr, addr + size).
+ *
+ * Returns 0 and adds @size to vm->plugged_size on ACK. Otherwise returns
+ * -EAGAIN (config changed, or NACK), -ETXTBSY (BUSY), -EINVAL (ERROR) or
+ * -ENOMEM (any other response).
+ */
+static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
+					uint64_t size)
+{
+	const uint64_t nb_vm_blocks = size / vm->device_block_size;
+	const struct virtio_mem_req req = {
+		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG),
+		.u.plug.addr = cpu_to_virtio64(vm->vdev, addr),
+		.u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
+	};
+	int rc;
+
+	/* Don't bother the device while a config update is pending. */
+	if (atomic_read(&vm->config_changed))
+		return -EAGAIN;
+
+	dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr,
+		addr + size - 1);
+
+	switch (virtio_mem_send_request(vm, &req)) {
+	case VIRTIO_MEM_RESP_ACK:
+		vm->plugged_size += size;
+		return 0;
+	case VIRTIO_MEM_RESP_NACK:
+		rc = -EAGAIN;
+		break;
+	case VIRTIO_MEM_RESP_BUSY:
+		rc = -ETXTBSY;
+		break;
+	case VIRTIO_MEM_RESP_ERROR:
+		rc = -EINVAL;
+		break;
+	default:
+		rc = -ENOMEM;
+		break;
+	}
+
+	dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc);
+	return rc;
+}
+
+/*
+ * Ask the device to unplug [addr, addr + size).
+ *
+ * Returns 0 and subtracts @size from vm->plugged_size on ACK. Otherwise
+ * returns -EAGAIN (config changed), -ETXTBSY (BUSY), -EINVAL (ERROR) or
+ * -ENOMEM (any other response).
+ */
+static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
+					  uint64_t size)
+{
+	const uint64_t nb_vm_blocks = size / vm->device_block_size;
+	const struct virtio_mem_req req = {
+		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG),
+		.u.unplug.addr = cpu_to_virtio64(vm->vdev, addr),
+		.u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
+	};
+	int rc;
+
+	/* Don't bother the device while a config update is pending. */
+	if (atomic_read(&vm->config_changed))
+		return -EAGAIN;
+
+	dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr,
+		addr + size - 1);
+
+	switch (virtio_mem_send_request(vm, &req)) {
+	case VIRTIO_MEM_RESP_ACK:
+		vm->plugged_size -= size;
+		return 0;
+	case VIRTIO_MEM_RESP_BUSY:
+		rc = -ETXTBSY;
+		break;
+	case VIRTIO_MEM_RESP_ERROR:
+		rc = -EINVAL;
+		break;
+	default:
+		rc = -ENOMEM;
+		break;
+	}
+
+	dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc);
+	return rc;
+}
+
+/*
+ * Ask the device to unplug all memory.
+ *
+ * On ACK, clears vm->unplug_all_required, resets vm->plugged_size to 0,
+ * flags a config change (the usable region might have shrunk) and returns
+ * 0. Otherwise returns -ETXTBSY (BUSY) or -ENOMEM (any other response).
+ */
+static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
+{
+	const struct virtio_mem_req req = {
+		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL),
+	};
+	int rc = -ENOMEM;
+
+	/* Terminate the message with a newline like all other messages. */
+	dev_dbg(&vm->vdev->dev, "unplugging all memory\n");
+
+	switch (virtio_mem_send_request(vm, &req)) {
+	case VIRTIO_MEM_RESP_ACK:
+		vm->unplug_all_required = false;
+		vm->plugged_size = 0;
+		/* usable region might have shrunk */
+		atomic_set(&vm->config_changed, 1);
+		return 0;
+	case VIRTIO_MEM_RESP_BUSY:
+		rc = -ETXTBSY;
+		break;
+	default:
+		break;
+	}
+
+	dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc);
+	return rc;
+}
+
+/*
+ * Plug @count subblocks of memory block @mb_id, starting at subblock
+ * @sb_id. On success the subblocks are marked plugged; the state of the
+ * memory block itself is not touched.
+ */
+static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
+				  int sb_id, int count)
+{
+	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
+			      sb_id * vm->sbm.sb_size;
+	const uint64_t size = count * vm->sbm.sb_size;
+	int rc;
+
+	rc = virtio_mem_send_plug_request(vm, addr, size);
+	if (rc)
+		return rc;
+
+	virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
+	return 0;
+}
+
+/*
+ * Unplug @count subblocks of memory block @mb_id, starting at subblock
+ * @sb_id. On success the subblocks are marked unplugged; the state of the
+ * memory block itself is not touched.
+ */
+static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
+				    int sb_id, int count)
+{
+	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
+			      sb_id * vm->sbm.sb_size;
+	const uint64_t size = count * vm->sbm.sb_size;
+	int rc;
+
+	rc = virtio_mem_send_unplug_request(vm, addr, size);
+	if (rc)
+		return rc;
+
+	virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count);
+	return 0;
+}
+
+/*
+ * Request the device to unplug the whole big block @bb_id.
+ *
+ * Will not modify the state of the big block.
+ */
+static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id)
+{
+	const uint64_t bb_start = virtio_mem_bb_id_to_phys(vm, bb_id);
+	const uint64_t bb_len = vm->bbm.bb_size;
+
+	return virtio_mem_send_unplug_request(vm, bb_start, bb_len);
+}
+
+/*
+ * Request the device to plug the whole big block @bb_id.
+ *
+ * Will not modify the state of the big block.
+ */
+static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id)
+{
+	const uint64_t bb_start = virtio_mem_bb_id_to_phys(vm, bb_id);
+	const uint64_t bb_len = vm->bbm.bb_size;
+
+	return virtio_mem_send_plug_request(vm, bb_start, bb_len);
+}
+
+/*
+ * Unplug the desired number of plugged subblocks of an offline or not-added
+ * memory block. Will fail if any subblock cannot get unplugged (instead of
+ * skipping it).
+ *
+ * Subblocks are processed from the highest subblock id downwards, and runs
+ * of consecutive plugged subblocks are unplugged with a single request.
+ * *nb_sb is decremented by the number of subblocks actually unplugged.
+ *
+ * Will not modify the state of the memory block.
+ *
+ * Note: can fail after some subblocks were unplugged.
+ */
+static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm,
+ unsigned long mb_id, uint64_t *nb_sb)
+{
+ int sb_id, count;
+ int rc;
+
+ sb_id = vm->sbm.sbs_per_mb - 1;
+ while (*nb_sb) {
+ /* Find the next candidate subblock */
+ while (sb_id >= 0 &&
+ virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1))
+ sb_id--;
+ if (sb_id < 0)
+ break;
+ /*
+ * Try to unplug multiple subblocks at a time: extend the run
+ * downwards, so sb_id ends up at the lowest subblock of the run.
+ */
+ count = 1;
+ while (count < *nb_sb && sb_id > 0 &&
+ virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
+ count++;
+ sb_id--;
+ }
+
+ rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
+ if (rc)
+ return rc;
+ *nb_sb -= count;
+ sb_id--;
+ }
+
+ return 0;
+}
+
+/*
+ * Unplug all plugged subblocks of an offline or not-added memory block, by
+ * requesting to unplug as many subblocks as the memory block has.
+ *
+ * Will not modify the state of the memory block.
+ *
+ * Note: can fail after some subblocks were unplugged.
+ */
+static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id)
+{
+	uint64_t remaining = vm->sbm.sbs_per_mb;
+
+	return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &remaining);
+}
+
+/*
+ * Prepare tracking data for the next memory block.
+ *
+ * On success, the new block is accounted as VIRTIO_MEM_SBM_MB_UNUSED, its
+ * id is stored in *mb_id and vm->sbm.next_mb_id is advanced. Returns
+ * -ENOSPC if no usable memory block id is left, or the error of a failed
+ * tracking-data preparation.
+ */
+static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm,
+					  unsigned long *mb_id)
+{
+	int rc;
+
+	if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id)
+		return -ENOSPC;
+
+	/* Resize the state array if required. */
+	rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm);
+	/* Resize the subblock bitmap if required. */
+	if (!rc)
+		rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm);
+	if (rc)
+		return rc;
+
+	vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++;
+	*mb_id = vm->sbm.next_mb_id;
+	vm->sbm.next_mb_id++;
+	return 0;
+}
+
+/*
+ * Try to plug the desired number of subblocks and add the memory block
+ * to Linux. At most one memory block worth of subblocks is plugged; *nb_sb
+ * is reduced by the number of plugged subblocks on success.
+ *
+ * Will modify the state of the memory block.
+ */
+static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
+					  unsigned long mb_id, uint64_t *nb_sb)
+{
+	const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb);
+	int fallback_state;
+	int rc;
+
+	if (WARN_ON_ONCE(!count))
+		return -EINVAL;
+
+	/*
+	 * Plug the requested number of subblocks before adding it to linux,
+	 * so that onlining will directly online all plugged subblocks.
+	 */
+	rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count);
+	if (rc)
+		return rc;
+
+	/*
+	 * Mark the block properly offline before adding it to Linux,
+	 * so the memory notifiers will find the block in the right state.
+	 */
+	if (count != vm->sbm.sbs_per_mb)
+		virtio_mem_sbm_set_mb_state(vm, mb_id,
+					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
+	else
+		virtio_mem_sbm_set_mb_state(vm, mb_id,
+					    VIRTIO_MEM_SBM_MB_OFFLINE);
+
+	/* Add the memory block to linux - if that fails, try to unplug. */
+	rc = virtio_mem_sbm_add_mb(vm, mb_id);
+	if (!rc) {
+		*nb_sb -= count;
+		return 0;
+	}
+
+	/* Block is UNUSED again if unplugging worked, PLUGGED otherwise. */
+	fallback_state = VIRTIO_MEM_SBM_MB_UNUSED;
+	if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count))
+		fallback_state = VIRTIO_MEM_SBM_MB_PLUGGED;
+	virtio_mem_sbm_set_mb_state(vm, mb_id, fallback_state);
+	return rc;
+}
+
+/*
+ * Try to plug the desired number of subblocks of a memory block that
+ * is already added to Linux.
+ *
+ * Runs of consecutive unplugged subblocks are plugged with a single
+ * request, starting at the first unplugged subblock. *nb_sb is decremented
+ * by the number of subblocks actually plugged. Unless the memory block is
+ * in state OFFLINE_PARTIAL, the newly plugged pages are fake-onlined.
+ *
+ * Will modify the state of the memory block.
+ *
+ * Note: Can fail after some subblocks were successfully plugged.
+ */
+static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
+ unsigned long mb_id, uint64_t *nb_sb)
+{
+ const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
+ unsigned long pfn, nr_pages;
+ int sb_id, count;
+ int rc;
+
+ if (WARN_ON_ONCE(!*nb_sb))
+ return -EINVAL;
+
+ while (*nb_sb) {
+ sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id);
+ if (sb_id >= vm->sbm.sbs_per_mb)
+ break;
+ count = 1;
+ while (count < *nb_sb &&
+ sb_id + count < vm->sbm.sbs_per_mb &&
+ !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1))
+ count++;
+
+ rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count);
+ if (rc)
+ return rc;
+ *nb_sb -= count;
+ if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL)
+ continue;
+
+ /* fake-online the pages if the memory block is online */
+ pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
+ sb_id * vm->sbm.sb_size);
+ nr_pages = PFN_DOWN(count * vm->sbm.sb_size);
+ virtio_mem_fake_online(pfn, nr_pages);
+ }
+
+ /*
+ * All subblocks are plugged now: leave the *_PARTIAL state.
+ * NOTE(review): "old_state - 1" presumably relies on each *_PARTIAL
+ * state directly following its non-partial counterpart in the state
+ * enum - confirm against the enum definition.
+ */
+ if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
+ virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1);
+
+ return 0;
+}
+
+/*
+ * Try to plug @diff bytes in SBM, rounded down to whole subblocks.
+ *
+ * Order of attempts: first fill up partially plugged memory blocks that are
+ * already added to Linux (under vm->hotplug_mutex), then plug and add
+ * unused memory blocks, and finally prepare, plug and add new memory
+ * blocks. Returns 0 once the requested number of subblocks was plugged,
+ * -ENOSPC if no more memory can be added right now, or the error of a
+ * failed step.
+ */
+static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
+{
+ const int mb_states[] = {
+ VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
+ VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
+ VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
+ };
+ uint64_t nb_sb = diff / vm->sbm.sb_size;
+ unsigned long mb_id;
+ int rc, i;
+
+ if (!nb_sb)
+ return 0;
+
+ /* Don't race with onlining/offlining */
+ mutex_lock(&vm->hotplug_mutex);
+
+ for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
+ virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) {
+ rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb);
+ if (rc || !nb_sb)
+ goto out_unlock;
+ cond_resched();
+ }
+ }
+
+ /*
+ * We won't be working on online/offline memory blocks from this point,
+ * so we can't race with memory onlining/offlining. Drop the mutex.
+ */
+ mutex_unlock(&vm->hotplug_mutex);
+
+ /* Try to plug and add unused blocks */
+ virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) {
+ if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
+ return -ENOSPC;
+
+ rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
+ if (rc || !nb_sb)
+ return rc;
+ cond_resched();
+ }
+
+ /* Try to prepare, plug and add new blocks */
+ while (nb_sb) {
+ if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
+ return -ENOSPC;
+
+ rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id);
+ if (rc)
+ return rc;
+ rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
+ if (rc)
+ return rc;
+ cond_resched();
+ }
+
+ return 0;
+out_unlock:
+ mutex_unlock(&vm->hotplug_mutex);
+ return rc;
+}
+
+/*
+ * Plug a big block and add it to Linux. The big block must be in state
+ * VIRTIO_MEM_BBM_BB_UNUSED, otherwise -EINVAL is returned.
+ *
+ * Will modify the state of the big block.
+ */
+static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm,
+					  unsigned long bb_id)
+{
+	int rc;
+
+	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
+			 VIRTIO_MEM_BBM_BB_UNUSED))
+		return -EINVAL;
+
+	rc = virtio_mem_bbm_plug_bb(vm, bb_id);
+	if (rc)
+		return rc;
+	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
+
+	rc = virtio_mem_bbm_add_bb(vm, bb_id);
+	if (!rc)
+		return 0;
+
+	/* Adding failed: try to give the big block back to the device. */
+	if (virtio_mem_bbm_unplug_bb(vm, bb_id))
+		/* Retry from the main loop. */
+		virtio_mem_bbm_set_bb_state(vm, bb_id,
+					    VIRTIO_MEM_BBM_BB_PLUGGED);
+	else
+		virtio_mem_bbm_set_bb_state(vm, bb_id,
+					    VIRTIO_MEM_BBM_BB_UNUSED);
+	return rc;
+}
+
+/*
+ * Prepare tracking data for the next big block.
+ *
+ * On success, the new big block is accounted as VIRTIO_MEM_BBM_BB_UNUSED,
+ * its id is stored in *bb_id and vm->bbm.next_bb_id is advanced. Returns
+ * -ENOSPC if no usable big block id is left, or the error of the failed
+ * state array preparation.
+ */
+static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm,
+					  unsigned long *bb_id)
+{
+	int rc;
+
+	if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id)
+		return -ENOSPC;
+
+	/* Resize the big block state array if required. */
+	rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm);
+	if (rc)
+		return rc;
+
+	vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++;
+	*bb_id = vm->bbm.next_bb_id++;
+	return 0;
+}
+
+/*
+ * Try to plug @diff bytes in BBM, rounded down to whole big blocks: first
+ * reuse unused big blocks, then prepare new ones. Returns 0 once the
+ * requested number of big blocks was plugged and added, -ENOSPC if no more
+ * memory can be added right now, or the error of a failed step.
+ */
+static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff)
+{
+	uint64_t nb_bb = diff / vm->bbm.bb_size;
+	unsigned long bb_id;
+	int rc;
+
+	if (!nb_bb)
+		return 0;
+
+	/* Try to plug and add unused big blocks */
+	virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) {
+		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
+			return -ENOSPC;
+
+		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
+		if (rc)
+			return rc;
+		nb_bb--;
+		if (!nb_bb)
+			return 0;
+		cond_resched();
+	}
+
+	/* Try to prepare, plug and add new big blocks */
+	while (nb_bb) {
+		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
+			return -ENOSPC;
+
+		rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id);
+		if (rc)
+			return rc;
+		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
+		if (rc)
+			return rc;
+		nb_bb--;
+		cond_resched();
+	}
+
+	return 0;
+}
+
+/*
+ * Try to plug the requested amount of memory, dispatching to the SBM or
+ * BBM implementation depending on the mode the device operates in.
+ */
+static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
+{
+	return vm->in_sbm ? virtio_mem_sbm_plug_request(vm, diff) :
+			    virtio_mem_bbm_plug_request(vm, diff);
+}
+
+/*
+ * Unplug the desired number of plugged subblocks of an offline memory block.
+ * Will fail if any subblock cannot get unplugged (instead of skipping it).
+ *
+ * Will modify the state of the memory block. Might temporarily drop the
+ * hotplug_mutex.
+ *
+ * Note: Can fail after some subblocks were successfully unplugged.
+ *
+ * @vm: the virtio-mem device
+ * @mb_id: id of the memory block to unplug subblocks from
+ * @nb_sb: subblock counter, passed on to virtio_mem_sbm_unplug_any_sb_raw()
+ *         (presumably decremented there for each unplugged subblock)
+ *
+ * Returns 0 on success, otherwise the error reported by
+ * virtio_mem_sbm_unplug_any_sb_raw().
+ */
+static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm,
+ unsigned long mb_id,
+ uint64_t *nb_sb)
+{
+ int rc;
+
+ rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb);
+
+ /* some subblocks might have been unplugged even on failure */
+ if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
+ virtio_mem_sbm_set_mb_state(vm, mb_id,
+ VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
+ if (rc)
+ return rc;
+
+ /* All subblocks are unplugged: the whole memory block can be removed. */
+ if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
+ /*
+ * Remove the block from Linux - this should never fail.
+ * Hinder the block from getting onlined by marking it
+ * unplugged. Temporarily drop the mutex, so
+ * any pending GOING_ONLINE requests can be serviced/rejected.
+ */
+ virtio_mem_sbm_set_mb_state(vm, mb_id,
+ VIRTIO_MEM_SBM_MB_UNUSED);
+
+ mutex_unlock(&vm->hotplug_mutex);
+ rc = virtio_mem_sbm_remove_mb(vm, mb_id);
+ BUG_ON(rc);
+ mutex_lock(&vm->hotplug_mutex);
+ }
+ return 0;
+}
+
+/*
+ * Unplug @count plugged subblocks, starting at @sb_id, of an online memory
+ * block.
+ *
+ * The range is fake-offlined first; if the unplug request fails afterwards,
+ * the pages are handed back via virtio_mem_fake_online(). On success, a
+ * KERNEL or MOVABLE memory block is switched to the matching *_PARTIAL state.
+ *
+ * Returns 0 on success, otherwise the error of the failing helper.
+ */
+static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm,
+					   unsigned long mb_id, int sb_id,
+					   int count)
+{
+	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count;
+	const int state_before = virtio_mem_sbm_get_mb_state(vm, mb_id);
+	const unsigned long start_pfn =
+		PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
+			 sb_id * vm->sbm.sb_size);
+	int rc;
+
+	rc = virtio_mem_fake_offline(vm, start_pfn, nr_pages);
+	if (rc)
+		return rc;
+
+	/* The memory is allocated now, try to unplug it. */
+	rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
+	if (rc) {
+		/* Unplug failed: give the memory back to the buddy. */
+		virtio_mem_fake_online(start_pfn, nr_pages);
+		return rc;
+	}
+
+	/* Only fully plugged online blocks change their state here. */
+	if (state_before == VIRTIO_MEM_SBM_MB_KERNEL)
+		virtio_mem_sbm_set_mb_state(vm, mb_id,
+					    VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL);
+	else if (state_before == VIRTIO_MEM_SBM_MB_MOVABLE)
+		virtio_mem_sbm_set_mb_state(vm, mb_id,
+					    VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL);
+
+	return 0;
+}
+
+/*
+ * Unplug the desired number of plugged subblocks of an online memory block.
+ * Will skip subblocks that are busy.
+ *
+ * Will modify the state of the memory block. Might temporarily drop the
+ * hotplug_mutex.
+ *
+ * Note: Can fail after some subblocks were successfully unplugged. Can
+ * return 0 even if subblocks were busy and could not get unplugged.
+ *
+ * @vm: the virtio-mem device
+ * @mb_id: id of the online memory block
+ * @nb_sb: number of subblocks still to unplug; decremented for each subblock
+ *         that got unplugged
+ *
+ * Returns 0, or an error other than -EBUSY reported by
+ * virtio_mem_sbm_unplug_sb_online().
+ */
+static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm,
+ unsigned long mb_id,
+ uint64_t *nb_sb)
+{
+ int rc, sb_id;
+
+ /* If possible, try to unplug the complete block in one shot. */
+ if (*nb_sb >= vm->sbm.sbs_per_mb &&
+ virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
+ rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0,
+ vm->sbm.sbs_per_mb);
+ if (!rc) {
+ *nb_sb -= vm->sbm.sbs_per_mb;
+ goto unplugged;
+ } else if (rc != -EBUSY)
+ return rc;
+ }
+
+ /* Fallback to single subblocks. */
+ /* Walk from the last subblock of the memory block towards the first. */
+ for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
+ /* Find the next candidate subblock */
+ while (sb_id >= 0 &&
+ !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
+ sb_id--;
+ if (sb_id < 0)
+ break;
+
+ rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1);
+ /* A busy subblock is skipped, any other error aborts. */
+ if (rc == -EBUSY)
+ continue;
+ else if (rc)
+ return rc;
+ *nb_sb -= 1;
+ }
+
+unplugged:
+ rc = virtio_mem_sbm_try_remove_unplugged_mb(vm, mb_id);
+ /* Remember the failure, so the removal gets retried later. */
+ if (rc)
+ vm->sbm.have_unplugged_mb = 1;
+ /* Ignore errors, this is not critical. We'll retry later. */
+ return 0;
+}
+
+/*
+ * Unplug the desired number of plugged subblocks of a memory block that is
+ * already added to Linux, picking the online or the offline variant based
+ * on the current state of the memory block.
+ *
+ * Busy subblocks of online memory blocks are skipped; any subblock that is
+ * not busy but cannot get unplugged makes the call fail.
+ *
+ * Will modify the state of the memory block. Might temporarily drop the
+ * hotplug_mutex.
+ *
+ * Note: Can fail after some subblocks were successfully unplugged. Can
+ * return 0 even if subblocks were busy and could not get unplugged.
+ * Returns -EINVAL for a memory block in any other state.
+ */
+static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
+					unsigned long mb_id,
+					uint64_t *nb_sb)
+{
+	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
+	case VIRTIO_MEM_SBM_MB_OFFLINE:
+	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
+		return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb);
+	case VIRTIO_MEM_SBM_MB_KERNEL:
+	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
+	case VIRTIO_MEM_SBM_MB_MOVABLE:
+	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
+		return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb);
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * SBM: try to unplug diff / sb_size subblocks.
+ *
+ * Memory blocks are visited state by state, in the order given by
+ * mb_states[]. Takes the hotplug_mutex and drops it around cond_resched().
+ *
+ * Returns 0 if all requested subblocks were unplugged (or if only offline
+ * memory blocks were considered because unplug_online is not set), -EBUSY if
+ * subblocks remain after visiting all states, or the error reported by
+ * virtio_mem_sbm_unplug_any_sb().
+ */
+static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
+{
+ /* Order in which memory block states are considered for unplug. */
+ const int mb_states[] = {
+ VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
+ VIRTIO_MEM_SBM_MB_OFFLINE,
+ VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
+ VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
+ VIRTIO_MEM_SBM_MB_MOVABLE,
+ VIRTIO_MEM_SBM_MB_KERNEL,
+ };
+ uint64_t nb_sb = diff / vm->sbm.sb_size;
+ unsigned long mb_id;
+ int rc, i;
+
+ if (!nb_sb)
+ return 0;
+
+ /*
+ * We'll drop the mutex a couple of times when it is safe to do so.
+ * This might result in some blocks switching the state (online/offline)
+ * and we could miss them in this run - we will retry again later.
+ */
+ mutex_lock(&vm->hotplug_mutex);
+
+ /*
+ * We try unplug from partially plugged blocks first, to try removing
+ * whole memory blocks along with metadata. We prioritize ZONE_MOVABLE
+ * as it's more reliable to unplug memory and remove whole memory
+ * blocks, and we don't want to trigger zone imbalances by
+ * accidentally removing too much kernel memory.
+ */
+ for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
+ virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) {
+ rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb);
+ if (rc || !nb_sb)
+ goto out_unlock;
+ mutex_unlock(&vm->hotplug_mutex);
+ cond_resched();
+ mutex_lock(&vm->hotplug_mutex);
+ }
+ /*
+ * mb_states[0] and mb_states[1] are the offline states. Without
+ * unplug_online, stop here and report success, even if nb_sb is
+ * not zero yet.
+ */
+ if (!unplug_online && i == 1) {
+ mutex_unlock(&vm->hotplug_mutex);
+ return 0;
+ }
+ }
+
+ mutex_unlock(&vm->hotplug_mutex);
+ return nb_sb ? -EBUSY : 0;
+out_unlock:
+ mutex_unlock(&vm->hotplug_mutex);
+ return rc;
+}
+
+/*
+ * Try to offline and remove a big block from Linux and unplug it. Will fail
+ * with -EBUSY if some memory is busy and cannot get unplugged.
+ *
+ * Will modify the state of the memory block. Might temporarily drop the
+ * hotplug_mutex.
+ *
+ * The big block has to be in state VIRTIO_MEM_BBM_BB_ADDED, otherwise
+ * -EINVAL is returned (with a one-time warning). If fake-offlining or
+ * offlining+removing fails, the already fake-offlined sections are
+ * fake-onlined again and the state is set back to ADDED. If only the final
+ * unplug fails, the big block is left in state PLUGGED.
+ */
+static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
+ unsigned long bb_id)
+{
+ const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
+ const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
+ unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn;
+ struct page *page;
+ int rc;
+
+ if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
+ VIRTIO_MEM_BBM_BB_ADDED))
+ return -EINVAL;
+
+ /*
+ * Start by fake-offlining all memory. Once we marked the device
+ * block as fake-offline, all newly onlined memory will
+ * automatically be kept fake-offline. Protect from concurrent
+ * onlining/offlining until we have a consistent state.
+ */
+ mutex_lock(&vm->hotplug_mutex);
+ virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);
+
+ /* Process the big block section by section, skipping offline sections. */
+ for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ page = pfn_to_online_page(pfn);
+ if (!page)
+ continue;
+
+ rc = virtio_mem_fake_offline(vm, pfn, PAGES_PER_SECTION);
+ if (rc) {
+ /* Limit the rollback to the sections handled so far. */
+ end_pfn = pfn;
+ goto rollback;
+ }
+ }
+ mutex_unlock(&vm->hotplug_mutex);
+
+ rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
+ if (rc) {
+ /* The rollback path expects the mutex to be held. */
+ mutex_lock(&vm->hotplug_mutex);
+ goto rollback;
+ }
+
+ rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
+ if (rc)
+ virtio_mem_bbm_set_bb_state(vm, bb_id,
+ VIRTIO_MEM_BBM_BB_PLUGGED);
+ else
+ virtio_mem_bbm_set_bb_state(vm, bb_id,
+ VIRTIO_MEM_BBM_BB_UNUSED);
+ return rc;
+
+rollback:
+ /* Undo the fake-offlining for all sections in [start_pfn, end_pfn). */
+ for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ page = pfn_to_online_page(pfn);
+ if (!page)
+ continue;
+ virtio_mem_fake_online(pfn, PAGES_PER_SECTION);
+ }
+ virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
+ mutex_unlock(&vm->hotplug_mutex);
+ return rc;
+}
+
+/*
+ * Check whether a big block is completely offline, i.e., whether
+ * pfn_to_online_page() finds no online page at any section-sized step
+ * through the big block.
+ */
+static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm,
+					 unsigned long bb_id)
+{
+	unsigned long pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
+	const unsigned long end_pfn = pfn + PFN_DOWN(vm->bbm.bb_size);
+
+	while (pfn < end_pfn) {
+		if (pfn_to_online_page(pfn))
+			return false;
+		pfn += PAGES_PER_SECTION;
+	}
+
+	return true;
+}
+
+/*
+ * Check whether every online section of a big block belongs to ZONE_MOVABLE.
+ * Offline sections are ignored, so a completely offline big block passes
+ * as well.
+ */
+static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm,
+					 unsigned long bb_id)
+{
+	const unsigned long first_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
+	const unsigned long end_pfn = first_pfn + PFN_DOWN(vm->bbm.bb_size);
+	unsigned long pfn;
+
+	for (pfn = first_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		struct page *page = pfn_to_online_page(pfn);
+
+		if (page && page_zonenum(page) != ZONE_MOVABLE)
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * BBM: try to unplug diff / bb_size big blocks.
+ *
+ * Runs up to three passes over all big blocks in state ADDED: pass 0 only
+ * considers completely offline big blocks, pass 1 only big blocks that are
+ * movable (or offline), pass 2 considers all of them.
+ *
+ * Returns 0 if all requested big blocks were unplugged (or if unplug_online
+ * is not set and pass 0 completed), -EBUSY if big blocks remain after all
+ * passes, or the error (other than -EBUSY) reported by
+ * virtio_mem_bbm_offline_remove_and_unplug_bb().
+ */
+static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
+{
+ uint64_t nb_bb = diff / vm->bbm.bb_size;
+ uint64_t bb_id;
+ int rc, i;
+
+ if (!nb_bb)
+ return 0;
+
+ /*
+ * Try to unplug big blocks. Similar to SBM, start with offline
+ * big blocks.
+ */
+ for (i = 0; i < 3; i++) {
+ virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
+ cond_resched();
+
+ /*
+ * As we're holding no locks, these checks are racy,
+ * but we don't care.
+ */
+ if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id))
+ continue;
+ if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id))
+ continue;
+ rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id);
+ /* Busy big blocks are skipped, other errors abort. */
+ if (rc == -EBUSY)
+ continue;
+ if (!rc)
+ nb_bb--;
+ if (rc || !nb_bb)
+ return rc;
+ }
+ /* Without unplug_online, only offline big blocks are touched. */
+ if (i == 0 && !unplug_online)
+ return 0;
+ }
+
+ return nb_bb ? -EBUSY : 0;
+}
+
+/*
+ * Try to unplug the requested amount of memory. Dispatches to the SBM or the
+ * BBM implementation, depending on vm->in_sbm.
+ */
+static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
+{
+	return vm->in_sbm ? virtio_mem_sbm_unplug_request(vm, diff) :
+			    virtio_mem_bbm_unplug_request(vm, diff);
+}
+
+/*
+ * Try to unplug all blocks that couldn't be unplugged before, for example,
+ * because the hypervisor was busy. Further, offline and remove any memory
+ * blocks where we previously failed.
+ *
+ * Returns 0 on success, or the error of the first failing unplug request.
+ * Failures to remove unplugged memory blocks are not reported; they only set
+ * vm->sbm.have_unplugged_mb again.
+ */
+static int virtio_mem_cleanup_pending_mb(struct virtio_mem *vm)
+{
+ unsigned long id;
+ int rc = 0;
+
+ /* BBM: retry unplugging big blocks that are stuck in state PLUGGED. */
+ if (!vm->in_sbm) {
+ virtio_mem_bbm_for_each_bb(vm, id,
+ VIRTIO_MEM_BBM_BB_PLUGGED) {
+ rc = virtio_mem_bbm_unplug_bb(vm, id);
+ if (rc)
+ return rc;
+ virtio_mem_bbm_set_bb_state(vm, id,
+ VIRTIO_MEM_BBM_BB_UNUSED);
+ }
+ return 0;
+ }
+
+ /* SBM: retry unplugging memory blocks that are stuck in state PLUGGED. */
+ virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) {
+ rc = virtio_mem_sbm_unplug_mb(vm, id);
+ if (rc)
+ return rc;
+ virtio_mem_sbm_set_mb_state(vm, id,
+ VIRTIO_MEM_SBM_MB_UNUSED);
+ }
+
+ if (!vm->sbm.have_unplugged_mb)
+ return 0;
+
+ /*
+ * Let's retry (offlining and) removing completely unplugged Linux
+ * memory blocks.
+ */
+ vm->sbm.have_unplugged_mb = false;
+
+ mutex_lock(&vm->hotplug_mutex);
+ /* rc only accumulates "something failed"; its value is not returned. */
+ virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL)
+ rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id);
+ virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL)
+ rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id);
+ virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL)
+ rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id);
+ mutex_unlock(&vm->hotplug_mutex);
+
+ if (rc)
+ vm->sbm.have_unplugged_mb = true;
+ /* Ignore errors, this is not critical. We'll retry later. */
+ return 0;
+}
+
+/*
+ * Update all parts of the config that could have changed.
+ *
+ * Re-reads plugged_size, usable_region_size and requested_size from the
+ * device config and recomputes the last usable memory block id (SBM) or
+ * big block id (BBM).
+ */
+static void virtio_mem_refresh_config(struct virtio_mem *vm)
+{
+	const struct range pluggable_range = mhp_get_pluggable_range(true);
+	uint64_t new_plugged_size, usable_region_size, end_addr;
+
+	/* the plugged_size is just a reflection of what _we_ did previously */
+	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
+			&new_plugged_size);
+	if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size))
+		vm->plugged_size = new_plugged_size;
+
+	/* calculate the last usable memory block id */
+	virtio_cread_le(vm->vdev, struct virtio_mem_config,
+			usable_region_size, &usable_region_size);
+	end_addr = min(vm->addr + usable_region_size - 1,
+		       pluggable_range.end);
+
+	/* A block that is only partially usable at the end does not count. */
+	if (vm->in_sbm) {
+		vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr);
+		if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes()))
+			vm->sbm.last_usable_mb_id--;
+	} else {
+		vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm,
+								     end_addr);
+		if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size))
+			vm->bbm.last_usable_bb_id--;
+	}
+	/*
+	 * If we cannot plug any of our device memory (e.g., nothing in the
+	 * usable region is addressable), the last usable memory block id will
+	 * be smaller than the first usable memory block id. We'll stop
+	 * attempting to add memory with -ENOSPC from our main loop.
+	 */
+
+	/* see if there is a request to change the size */
+	virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size,
+			&vm->requested_size);
+
+	/*
+	 * Terminate the messages with a newline, like the other messages in
+	 * this file, so the log records are complete right away.
+	 */
+	dev_info(&vm->vdev->dev, "plugged size: 0x%llx\n", vm->plugged_size);
+	dev_info(&vm->vdev->dev, "requested size: 0x%llx\n", vm->requested_size);
+}
+
+/*
+ * Workqueue function for handling plug/unplug requests and config updates.
+ *
+ * Does nothing in a kdump kernel or once the device was marked broken.
+ * Otherwise: optionally unplug all memory, refresh the config if it
+ * changed, clean up leftovers, and then plug or unplug the difference
+ * between requested_size and plugged_size. The resulting error code decides
+ * whether to stop, to arm the retry timer, to retry immediately or to mark
+ * the device broken.
+ */
+static void virtio_mem_run_wq(struct work_struct *work)
+{
+ struct virtio_mem *vm = container_of(work, struct virtio_mem, wq);
+ uint64_t diff;
+ int rc;
+
+ if (unlikely(vm->in_kdump)) {
+ dev_warn_once(&vm->vdev->dev,
+ "unexpected workqueue run in kdump kernel\n");
+ return;
+ }
+
+ /* We are running now; a pending retry timer is no longer needed. */
+ hrtimer_cancel(&vm->retry_timer);
+
+ if (vm->broken)
+ return;
+
+ atomic_set(&vm->wq_active, 1);
+retry:
+ rc = 0;
+
+ /* Make sure we start with a clean state if there are leftovers. */
+ if (unlikely(vm->unplug_all_required))
+ rc = virtio_mem_send_unplug_all_request(vm);
+
+ /* Clear the flag before reading, so a concurrent change is not lost. */
+ if (atomic_read(&vm->config_changed)) {
+ atomic_set(&vm->config_changed, 0);
+ virtio_mem_refresh_config(vm);
+ }
+
+ /* Cleanup any leftovers from previous runs */
+ if (!rc)
+ rc = virtio_mem_cleanup_pending_mb(vm);
+
+ if (!rc && vm->requested_size != vm->plugged_size) {
+ if (vm->requested_size > vm->plugged_size) {
+ diff = vm->requested_size - vm->plugged_size;
+ rc = virtio_mem_plug_request(vm, diff);
+ } else {
+ diff = vm->plugged_size - vm->requested_size;
+ rc = virtio_mem_unplug_request(vm, diff);
+ }
+ }
+
+ /*
+ * Keep retrying to offline and remove completely unplugged Linux
+ * memory blocks.
+ */
+ if (!rc && vm->in_sbm && vm->sbm.have_unplugged_mb)
+ rc = -EBUSY;
+
+ switch (rc) {
+ case 0:
+ /* Success: reset the retry interval to its minimum. */
+ vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
+ break;
+ case -ENOSPC:
+ /*
+ * We cannot add any more memory (alignment, physical limit)
+ * or we have too many offline memory blocks.
+ */
+ break;
+ case -ETXTBSY:
+ /*
+ * The hypervisor cannot process our request right now
+ * (e.g., out of memory, migrating);
+ */
+ case -EBUSY:
+ /*
+ * We cannot free up any memory to unplug it (all plugged memory
+ * is busy).
+ */
+ case -ENOMEM:
+ /* Out of memory, try again later. */
+ /* -ETXTBSY, -EBUSY and -ENOMEM all arm the retry timer. */
+ hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms),
+ HRTIMER_MODE_REL);
+ break;
+ case -EAGAIN:
+ /* Retry immediately (e.g., the config changed). */
+ goto retry;
+ default:
+ /* Unknown error, mark as broken */
+ dev_err(&vm->vdev->dev,
+ "unknown error, marking device broken: %d\n", rc);
+ vm->broken = true;
+ }
+
+ atomic_set(&vm->wq_active, 0);
+}
+
+/*
+ * Retry timer callback: trigger a retry and double the retry interval for
+ * the next time, capped at VIRTIO_MEM_RETRY_TIMER_MAX_MS. The timer is not
+ * restarted from here.
+ */
+static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
+{
+	struct virtio_mem *vm;
+
+	vm = container_of(timer, struct virtio_mem, retry_timer);
+	virtio_mem_retry(vm);
+
+	vm->retry_timer_ms = min_t(unsigned int, 2 * vm->retry_timer_ms,
+				   VIRTIO_MEM_RETRY_TIMER_MAX_MS);
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * Virtqueue callback: wake up whoever is waiting on vm->host_resp.
+ */
+static void virtio_mem_handle_response(struct virtqueue *vq)
+{
+	struct virtio_device *vdev = vq->vdev;
+	struct virtio_mem *vm = vdev->priv;
+
+	wake_up(&vm->host_resp);
+}
+
+/*
+ * Look up the single "guest-request" virtqueue and remember it in vm->vq.
+ * Returns 0 on success, or the error encoded in the returned pointer.
+ */
+static int virtio_mem_init_vq(struct virtio_mem *vm)
+{
+	struct virtqueue *req_vq = virtio_find_single_vq(vm->vdev,
+							 virtio_mem_handle_response,
+							 "guest-request");
+
+	if (IS_ERR(req_vq))
+		return PTR_ERR(req_vq);
+
+	vm->vq = req_vq;
+	return 0;
+}
+
+/*
+ * Set up memory hot(un)plug for the device (non-kdump case).
+ *
+ * Selects SBM or BBM and the matching (sub)block size, computes the first
+ * usable block id, creates the parent resource, registers a dynamic memory
+ * group, the memory notifier and the device itself.
+ *
+ * Returns 0 on success. On failure, everything registered so far is torn
+ * down again and the error is returned.
+ */
+static int virtio_mem_init_hotplug(struct virtio_mem *vm)
+{
+	const struct range pluggable_range = mhp_get_pluggable_range(true);
+	uint64_t unit_pages, sb_size, addr;
+	int rc;
+
+	/* bad device setup - warn only */
+	if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
+		dev_warn(&vm->vdev->dev,
+			 "The alignment of the physical start address can make some memory unusable.\n");
+	if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes()))
+		dev_warn(&vm->vdev->dev,
+			 "The alignment of the physical end address can make some memory unusable.\n");
+	if (vm->addr < pluggable_range.start ||
+	    vm->addr + vm->region_size - 1 > pluggable_range.end)
+		dev_warn(&vm->vdev->dev,
+			 "Some device memory is not addressable/pluggable. This can make some memory unusable.\n");
+
+	/* Prepare the offline threshold - make sure we can add two blocks. */
+	vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(),
+				      VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);
+
+	/*
+	 * alloc_contig_range() works reliably with pageblock
+	 * granularity on ZONE_NORMAL, use pageblock_nr_pages.
+	 */
+	sb_size = PAGE_SIZE * pageblock_nr_pages;
+	sb_size = max_t(uint64_t, vm->device_block_size, sb_size);
+
+	if (sb_size < memory_block_size_bytes() && !force_bbm) {
+		/* SBM: At least two subblocks per Linux memory block. */
+		vm->in_sbm = true;
+		vm->sbm.sb_size = sb_size;
+		vm->sbm.sbs_per_mb = memory_block_size_bytes() /
+				     vm->sbm.sb_size;
+
+		/* Round up to the next full memory block */
+		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
+		       memory_block_size_bytes() - 1;
+		vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr);
+		vm->sbm.next_mb_id = vm->sbm.first_mb_id;
+	} else {
+		/* BBM: At least one Linux memory block. */
+		vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size,
+					memory_block_size_bytes());
+
+		/* An invalid bbm_block_size is ignored with a warning. */
+		if (bbm_block_size) {
+			if (!is_power_of_2(bbm_block_size)) {
+				dev_warn(&vm->vdev->dev,
+					 "bbm_block_size is not a power of 2\n");
+			} else if (bbm_block_size < vm->bbm.bb_size) {
+				dev_warn(&vm->vdev->dev,
+					 "bbm_block_size is too small\n");
+			} else {
+				vm->bbm.bb_size = bbm_block_size;
+			}
+		}
+
+		/* Round up to the next aligned big block */
+		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
+		       vm->bbm.bb_size - 1;
+		vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr);
+		vm->bbm.next_bb_id = vm->bbm.first_bb_id;
+
+		/* Make sure we can add two big blocks. */
+		vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size,
+					      vm->offline_threshold);
+	}
+
+	/* Newline-terminated, like the other messages in this file. */
+	dev_info(&vm->vdev->dev, "memory block size: 0x%lx\n",
+		 memory_block_size_bytes());
+	if (vm->in_sbm)
+		dev_info(&vm->vdev->dev, "subblock size: 0x%llx\n",
+			 (unsigned long long)vm->sbm.sb_size);
+	else
+		dev_info(&vm->vdev->dev, "big block size: 0x%llx\n",
+			 (unsigned long long)vm->bbm.bb_size);
+
+	/* create the parent resource for all memory */
+	rc = virtio_mem_create_resource(vm);
+	if (rc)
+		return rc;
+
+	/* use a single dynamic memory group to cover the whole memory device */
+	if (vm->in_sbm)
+		unit_pages = PHYS_PFN(memory_block_size_bytes());
+	else
+		unit_pages = PHYS_PFN(vm->bbm.bb_size);
+	rc = memory_group_register_dynamic(vm->nid, unit_pages);
+	if (rc < 0)
+		goto out_del_resource;
+	/* A non-negative return value is the memory group id. */
+	vm->mgid = rc;
+
+	/*
+	 * If we still have memory plugged, we have to unplug all memory first.
+	 * Registering our parent resource makes sure that this memory isn't
+	 * actually in use (e.g., trying to reload the driver).
+	 */
+	if (vm->plugged_size) {
+		vm->unplug_all_required = true;
+		dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
+	}
+
+	/* register callbacks */
+	vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
+	rc = register_memory_notifier(&vm->memory_notifier);
+	if (rc)
+		goto out_unreg_group;
+	rc = register_virtio_mem_device(vm);
+	if (rc)
+		goto out_unreg_mem;
+
+	return 0;
+out_unreg_mem:
+	unregister_memory_notifier(&vm->memory_notifier);
+out_unreg_group:
+	memory_group_unregister(vm->mgid);
+out_del_resource:
+	virtio_mem_delete_resource(vm);
+	return rc;
+}
+
+#ifdef CONFIG_PROC_VMCORE
+/*
+ * Ask the device for the state of the range [addr, addr + size).
+ *
+ * Returns the state reported by the device on VIRTIO_MEM_RESP_ACK, -EINVAL
+ * on VIRTIO_MEM_RESP_ERROR and -ENOMEM for any other response.
+ */
+static int virtio_mem_send_state_request(struct virtio_mem *vm, uint64_t addr,
+					 uint64_t size)
+{
+	const uint64_t nb_vm_blocks = size / vm->device_block_size;
+	const struct virtio_mem_req req = {
+		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_STATE),
+		.u.state.addr = cpu_to_virtio64(vm->vdev, addr),
+		.u.state.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
+	};
+	int rc;
+
+	dev_dbg(&vm->vdev->dev, "requesting state: 0x%llx - 0x%llx\n", addr,
+		addr + size - 1);
+
+	switch (virtio_mem_send_request(vm, &req)) {
+	case VIRTIO_MEM_RESP_ACK:
+		return virtio16_to_cpu(vm->vdev, vm->resp.u.state.state);
+	case VIRTIO_MEM_RESP_ERROR:
+		rc = -EINVAL;
+		break;
+	default:
+		rc = -ENOMEM;
+		break;
+	}
+
+	dev_dbg(&vm->vdev->dev, "requesting state failed: %d\n", rc);
+	return rc;
+}
+
+/*
+ * vmcore callback: tell whether @pfn is RAM.
+ *
+ * Pages outside the device range are always reported as RAM. Inside the
+ * range, the device is asked for the state of the containing device block;
+ * the answer for the block queried last is cached in the device structure.
+ */
+static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb,
+					 unsigned long pfn)
+{
+	struct virtio_mem *vm = container_of(cb, struct virtio_mem,
+					     vmcore_cb);
+	const uint64_t page_addr = PFN_PHYS(pfn);
+	uint64_t block_addr;
+	bool plugged;
+	int state;
+
+	if (!virtio_mem_contains_range(vm, page_addr, PAGE_SIZE))
+		return true;
+	if (!vm->plugged_size)
+		return false;
+
+	/*
+	 * Device requests and the cached information about the last queried
+	 * block both have to be serialized.
+	 */
+	mutex_lock(&vm->hotplug_mutex);
+
+	block_addr = ALIGN_DOWN(page_addr, vm->device_block_size);
+	if (block_addr != vm->last_block_addr) {
+		state = virtio_mem_send_state_request(vm, block_addr,
+						      vm->device_block_size);
+		/* Anything but "plugged", including errors, means !ram. */
+		vm->last_block_plugged = (state == VIRTIO_MEM_STATE_PLUGGED);
+		vm->last_block_addr = block_addr;
+	}
+
+	plugged = vm->last_block_plugged;
+	mutex_unlock(&vm->hotplug_mutex);
+	return plugged;
+}
+#endif /* CONFIG_PROC_VMCORE */
+
+/*
+ * Initialize the device for use in a kdump kernel.
+ *
+ * With CONFIG_PROC_VMCORE, only the vmcore callback is registered and 0 is
+ * returned. Without it, a warning is printed and -EBUSY is returned.
+ */
+static int virtio_mem_init_kdump(struct virtio_mem *vm)
+{
+#ifdef CONFIG_PROC_VMCORE
+ dev_info(&vm->vdev->dev, "memory hot(un)plug disabled in kdump kernel\n");
+ vm->vmcore_cb.pfn_is_ram = virtio_mem_vmcore_pfn_is_ram;
+ register_vmcore_cb(&vm->vmcore_cb);
+ return 0;
+#else /* CONFIG_PROC_VMCORE */
+ dev_warn(&vm->vdev->dev, "disabled in kdump kernel without vmcore\n");
+ return -EBUSY;
+#endif /* CONFIG_PROC_VMCORE */
+}
+
+/*
+ * Read the static device configuration and continue with the kdump or the
+ * hotplug specific initialization.
+ *
+ * Returns -EINVAL if the device offers no config access, otherwise the
+ * result of virtio_mem_init_kdump() or virtio_mem_init_hotplug().
+ */
+static int virtio_mem_init(struct virtio_mem *vm)
+{
+	uint16_t node_id;
+
+	if (!vm->vdev->config->get) {
+		dev_err(&vm->vdev->dev, "config access disabled\n");
+		return -EINVAL;
+	}
+
+	/* Fetch all properties that can't change. */
+	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
+			&vm->plugged_size);
+	virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
+			&vm->device_block_size);
+	virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
+			&node_id);
+	vm->nid = virtio_mem_translate_node_id(vm, node_id);
+	virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
+	virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
+			&vm->region_size);
+
+	/* Determine the nid for the device based on the lowest address. */
+	if (vm->nid == NUMA_NO_NODE)
+		vm->nid = memory_add_physaddr_to_nid(vm->addr);
+
+	/* Newline-terminated, like the other messages in this file. */
+	dev_info(&vm->vdev->dev, "start address: 0x%llx\n", vm->addr);
+	dev_info(&vm->vdev->dev, "region size: 0x%llx\n", vm->region_size);
+	dev_info(&vm->vdev->dev, "device block size: 0x%llx\n",
+		 (unsigned long long)vm->device_block_size);
+	if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
+		dev_info(&vm->vdev->dev, "nid: %d\n", vm->nid);
+
+	/*
+	 * We don't want to (un)plug or reuse any memory when in kdump. The
+	 * memory is still accessible (but not exposed to Linux).
+	 */
+	if (vm->in_kdump)
+		return virtio_mem_init_kdump(vm);
+	return virtio_mem_init_hotplug(vm);
+}
+
+/*
+ * Reserve the whole device memory region as the parent resource.
+ *
+ * Returns 0 on success, -ENOMEM if the resource name cannot be duplicated
+ * and -EBUSY if the region cannot be reserved.
+ */
+static int virtio_mem_create_resource(struct virtio_mem *vm)
+{
+	/*
+	 * The device name pointer could turn into garbage when the driver is
+	 * force-unloaded and the device removed, so keep a private copy.
+	 */
+	const char *res_name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL);
+
+	if (!res_name)
+		return -ENOMEM;
+
+	/* Disallow mapping device memory via /dev/mem completely. */
+	vm->parent_resource = __request_mem_region(vm->addr, vm->region_size,
+						   res_name,
+						   IORESOURCE_SYSTEM_RAM |
+						   IORESOURCE_EXCLUSIVE);
+	if (vm->parent_resource) {
+		/* The memory is not actually busy - make add_memory() work. */
+		vm->parent_resource->flags &= ~IORESOURCE_BUSY;
+		return 0;
+	}
+
+	kfree(res_name);
+	dev_warn(&vm->vdev->dev, "could not reserve device region\n");
+	dev_info(&vm->vdev->dev,
+		 "reloading the driver is not supported\n");
+	return -EBUSY;
+}
+
+/*
+ * Release and free the parent resource together with its duplicated name.
+ * Does nothing if there is no parent resource.
+ */
+static void virtio_mem_delete_resource(struct virtio_mem *vm)
+{
+	struct resource *res = vm->parent_resource;
+	const char *res_name;
+
+	if (!res)
+		return;
+
+	/* Grab the name before the resource itself is freed. */
+	res_name = res->name;
+	release_resource(res);
+	kfree(res);
+	kfree(res_name);
+	vm->parent_resource = NULL;
+}
+
+/*
+ * Callback for walk_iomem_res_desc(): unconditionally returns 1; both
+ * arguments are unused. virtio_mem_has_memory_added() compares the result
+ * of the walk against 1.
+ */
+static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
+{
+ return 1;
+}
+
+/*
+ * Check whether a busy System RAM resource is found in the device memory
+ * region, i.e., whether the resource walk reports a match.
+ */
+static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
+{
+	const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+	int found;
+
+	found = walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr,
+				    vm->addr + vm->region_size, NULL,
+				    virtio_mem_range_has_system_ram);
+	return found == 1;
+}
+
+/*
+ * Probe callback: allocate and initialize the driver state, set up the
+ * virtqueue, read the device config and mark the device ready.
+ *
+ * Returns 0 on success; on failure, the virtqueue and the driver state are
+ * cleaned up again and the error is returned.
+ */
+static int virtio_mem_probe(struct virtio_device *vdev)
+{
+ struct virtio_mem *vm;
+ int rc;
+
+ /* Compile-time check of the request/response structure sizes. */
+ BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
+ BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10);
+
+ vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL);
+ if (!vm)
+ return -ENOMEM;
+
+ init_waitqueue_head(&vm->host_resp);
+ vm->vdev = vdev;
+ INIT_WORK(&vm->wq, virtio_mem_run_wq);
+ mutex_init(&vm->hotplug_mutex);
+ INIT_LIST_HEAD(&vm->next);
+ spin_lock_init(&vm->removal_lock);
+ hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ vm->retry_timer.function = virtio_mem_timer_expired;
+ vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
+ vm->in_kdump = is_kdump_kernel();
+
+ /* register the virtqueue */
+ rc = virtio_mem_init_vq(vm);
+ if (rc)
+ goto out_free_vm;
+
+ /* initialize the device by querying the config */
+ rc = virtio_mem_init(vm);
+ if (rc)
+ goto out_del_vq;
+
+ virtio_device_ready(vdev);
+
+ /* trigger a config update to start processing the requested_size */
+ if (!vm->in_kdump) {
+ atomic_set(&vm->config_changed, 1);
+ queue_work(system_freezable_wq, &vm->wq);
+ }
+
+ return 0;
+out_del_vq:
+ vdev->config->del_vqs(vdev);
+out_free_vm:
+ kfree(vm);
+ vdev->priv = NULL;
+
+ return rc;
+}
+
+/*
+ * Tear down memory hot(un)plug for the device (non-kdump case): stop the
+ * workqueue and the retry timer, remove partially plugged offline memory
+ * blocks (SBM), unregister the callbacks and free the tracking data.
+ *
+ * The parent resource and the memory group are only released if no system
+ * memory is added anymore; otherwise a warning is printed.
+ */
+static void virtio_mem_deinit_hotplug(struct virtio_mem *vm)
+{
+ unsigned long mb_id;
+ int rc;
+
+ /*
+ * Make sure the workqueue won't be triggered anymore and no memory
+ * blocks can be onlined/offlined until we're finished here.
+ */
+ mutex_lock(&vm->hotplug_mutex);
+ spin_lock_irq(&vm->removal_lock);
+ vm->removing = true;
+ spin_unlock_irq(&vm->removal_lock);
+ mutex_unlock(&vm->hotplug_mutex);
+
+ /* wait until the workqueue stopped */
+ cancel_work_sync(&vm->wq);
+ hrtimer_cancel(&vm->retry_timer);
+
+ if (vm->in_sbm) {
+ /*
+ * After we unregistered our callbacks, user space can online
+ * partially plugged offline blocks. Make sure to remove them.
+ */
+ virtio_mem_sbm_for_each_mb(vm, mb_id,
+ VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
+ rc = virtio_mem_sbm_remove_mb(vm, mb_id);
+ BUG_ON(rc);
+ virtio_mem_sbm_set_mb_state(vm, mb_id,
+ VIRTIO_MEM_SBM_MB_UNUSED);
+ }
+ /*
+ * After we unregistered our callbacks, user space can no longer
+ * offline partially plugged online memory blocks. No need to
+ * worry about them.
+ */
+ }
+
+ /* unregister callbacks */
+ unregister_virtio_mem_device(vm);
+ unregister_memory_notifier(&vm->memory_notifier);
+
+ /*
+ * There is no way we could reliably remove all memory we have added to
+ * the system. And there is no way to stop the driver/device from going
+ * away. Warn at least.
+ */
+ if (virtio_mem_has_memory_added(vm)) {
+ dev_warn(&vm->vdev->dev,
+ "device still has system memory added\n");
+ } else {
+ virtio_mem_delete_resource(vm);
+ kfree_const(vm->resource_name);
+ memory_group_unregister(vm->mgid);
+ }
+
+ /* remove all tracking data - no locking needed */
+ if (vm->in_sbm) {
+ vfree(vm->sbm.mb_states);
+ vfree(vm->sbm.sb_states);
+ } else {
+ vfree(vm->bbm.bb_states);
+ }
+}
+
+/*
+ * Undo virtio_mem_init_kdump(): unregister the vmcore callback. Does nothing
+ * without CONFIG_PROC_VMCORE.
+ */
+static void virtio_mem_deinit_kdump(struct virtio_mem *vm)
+{
+#ifdef CONFIG_PROC_VMCORE
+ unregister_vmcore_cb(&vm->vmcore_cb);
+#endif /* CONFIG_PROC_VMCORE */
+}
+
+/*
+ * Remove callback: run the kdump or hotplug specific teardown, reset the
+ * device, delete the virtqueues and free the driver state.
+ */
+static void virtio_mem_remove(struct virtio_device *vdev)
+{
+	struct virtio_mem *vm = vdev->priv;
+
+	if (!vm->in_kdump)
+		virtio_mem_deinit_hotplug(vm);
+	else
+		virtio_mem_deinit_kdump(vm);
+
+	/* Reset the device first, then get rid of its queues. */
+	virtio_reset_device(vdev);
+	vdev->config->del_vqs(vdev);
+
+	kfree(vm);
+	vdev->priv = NULL;
+}
+
+/*
+ * Config change callback: flag the config as changed and trigger a retry.
+ * Config changes are ignored in a kdump kernel.
+ */
+static void virtio_mem_config_changed(struct virtio_device *vdev)
+{
+	struct virtio_mem *vm = vdev->priv;
+
+	if (!unlikely(vm->in_kdump)) {
+		atomic_set(&vm->config_changed, 1);
+		virtio_mem_retry(vm);
+	}
+}
+
+#ifdef CONFIG_PM_SLEEP
+/*
+ * Freeze callback: always fails with -EPERM after logging an error, as
+ * save/restore is not supported by this driver.
+ */
+static int virtio_mem_freeze(struct virtio_device *vdev)
+{
+ /*
+ * When restarting the VM, all memory is usually unplugged. Don't
+ * allow to suspend/hibernate.
+ */
+ dev_err(&vdev->dev, "save/restore not supported.\n");
+ return -EPERM;
+}
+
+/*
+ * Restore callback: always fails with -EPERM, matching virtio_mem_freeze().
+ */
+static int virtio_mem_restore(struct virtio_device *vdev)
+{
+ return -EPERM;
+}
+#endif
+
+/*
+ * Feature bits listed in the driver's feature table. VIRTIO_MEM_F_ACPI_PXM
+ * is only listed when both CONFIG_NUMA and CONFIG_ACPI_NUMA are enabled.
+ */
+static unsigned int virtio_mem_features[] = {
+#if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA)
+ VIRTIO_MEM_F_ACPI_PXM,
+#endif
+ VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE,
+};
+
+/* Device id table: VIRTIO_ID_MEM devices of any vendor; zero-terminated. */
+static const struct virtio_device_id virtio_mem_id_table[] = {
+ { VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+
+/*
+ * The virtio driver description, wiring up the feature table, the id table
+ * and the callbacks defined in this file. The freeze/restore callbacks are
+ * only set with CONFIG_PM_SLEEP.
+ */
+static struct virtio_driver virtio_mem_driver = {
+ .feature_table = virtio_mem_features,
+ .feature_table_size = ARRAY_SIZE(virtio_mem_features),
+ .driver.name = KBUILD_MODNAME,
+ .driver.owner = THIS_MODULE,
+ .id_table = virtio_mem_id_table,
+ .probe = virtio_mem_probe,
+ .remove = virtio_mem_remove,
+ .config_changed = virtio_mem_config_changed,
+#ifdef CONFIG_PM_SLEEP
+ .freeze = virtio_mem_freeze,
+ .restore = virtio_mem_restore,
+#endif
+};
+
+/* Module boilerplate: driver registration, id table export and metadata. */
+module_virtio_driver(virtio_mem_driver);
+MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table);
+MODULE_AUTHOR("David Hildenbrand <david@redhat.com>");
+MODULE_DESCRIPTION("Virtio-mem driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
new file mode 100644
index 000000000..59892a31c
--- /dev/null
+++ b/drivers/virtio/virtio_mmio.c
@@ -0,0 +1,877 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Virtio memory mapped device driver
+ *
+ * Copyright 2011-2014, ARM Ltd.
+ *
+ * This module allows virtio devices to be used over a virtual, memory mapped
+ * platform device.
+ *
+ * The guest device(s) may be instantiated in one of three equivalent ways:
+ *
+ * 1. Static platform device in board's code, eg.:
+ *
+ * static struct platform_device v2m_virtio_device = {
+ * .name = "virtio-mmio",
+ * .id = -1,
+ * .num_resources = 2,
+ * .resource = (struct resource []) {
+ * {
+ * .start = 0x1001e000,
+ * .end = 0x1001e0ff,
+ * .flags = IORESOURCE_MEM,
+ * }, {
+ * .start = 42 + 32,
+ * .end = 42 + 32,
+ * .flags = IORESOURCE_IRQ,
+ * },
+ * }
+ * };
+ *
+ * 2. Device Tree node, eg.:
+ *
+ * virtio_block@1e000 {
+ * compatible = "virtio,mmio";
+ * reg = <0x1e000 0x100>;
+ * interrupts = <42>;
+ * }
+ *
+ * 3. Kernel module (or command line) parameter. Can be used more than once -
+ * one device will be created for each one. Syntax:
+ *
+ * [virtio_mmio.]device=<size>@<baseaddr>:<irq>[:<id>]
+ * where:
+ * <size> := size (can use standard suffixes like K, M or G)
+ * <baseaddr> := physical base address
+ * <irq> := interrupt number (as passed to request_irq())
+ * <id> := (optional) platform device id
+ * eg.:
+ * virtio_mmio.device=0x100@0x100b0000:48 \
+ * virtio_mmio.device=1K@0x1001e000:74
+ *
+ * Based on Virtio PCI driver by Anthony Liguori, copyright IBM Corp. 2007
+ */
+
+#define pr_fmt(fmt) "virtio-mmio: " fmt
+
+#include <linux/acpi.h>
+#include <linux/dma-mapping.h>
+#include <linux/highmem.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <uapi/linux/virtio_mmio.h>
+#include <linux/virtio_ring.h>
+
+
+
+/*
+ * The alignment to use between consumer and producer parts of vring.
+ * Currently hardcoded to the page size.
+ */
+#define VIRTIO_MMIO_VRING_ALIGN PAGE_SIZE
+
+
+
+/*
+ * Map a pointer to the embedded struct virtio_device back to the
+ * struct virtio_mmio_device that contains it.
+ */
+#define to_virtio_mmio_device(_vdev) \
+	container_of(_vdev, struct virtio_mmio_device, vdev)
+
+/* State of one virtio device reached through the MMIO transport. */
+struct virtio_mmio_device {
+ /* embedded virtio device; to_virtio_mmio_device() maps back from it */
+ struct virtio_device vdev;
+ /* platform device this instance was probed from */
+ struct platform_device *pdev;
+
+ /* mapped register window; all VIRTIO_MMIO_* offsets are relative to it */
+ void __iomem *base;
+ /* value read from VIRTIO_MMIO_VERSION; probe only accepts 1 or 2 */
+ unsigned long version;
+
+ /* a list of queues so we can dispatch IRQs */
+ /* lock is taken around every access to the virtqueues list */
+ spinlock_t lock;
+ struct list_head virtqueues;
+};
+
+/*
+ * Per-virtqueue bookkeeping. One instance is allocated for each queue
+ * that is set up, stored in vq->priv and linked into the owning device's
+ * virtqueues list.
+ */
+struct virtio_mmio_vq_info {
+ /* the actual virtqueue */
+ struct virtqueue *vq;
+
+ /* the list node for the virtqueues list */
+ struct list_head node;
+};
+
+
+
+/* Configuration interface */
+
+/*
+ * Read the 64-bit device feature set. The device exposes it 32 bits at a
+ * time through DEVICE_FEATURES, banked by DEVICE_FEATURES_SEL: bank 1 is
+ * read first and forms the upper half, bank 0 the lower half.
+ */
+static u64 vm_get_features(struct virtio_device *vdev)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+	void __iomem *regs = vm_dev->base;
+	u64 hi, lo;
+
+	writel(1, regs + VIRTIO_MMIO_DEVICE_FEATURES_SEL);
+	hi = readl(regs + VIRTIO_MMIO_DEVICE_FEATURES);
+
+	writel(0, regs + VIRTIO_MMIO_DEVICE_FEATURES_SEL);
+	lo = readl(regs + VIRTIO_MMIO_DEVICE_FEATURES);
+
+	return (hi << 32) | lo;
+}
+
+/*
+ * Write the driver's accepted feature set to the device.
+ *
+ * Returns 0 on success, or -EINVAL when a version 2 device ends up without
+ * VIRTIO_F_VERSION_1 (in which case nothing is written).
+ */
+static int vm_finalize_features(struct virtio_device *vdev)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+	void __iomem *regs = vm_dev->base;
+	u32 hi, lo;
+
+	/* Give virtio_ring a chance to accept features. */
+	vring_transport_features(vdev);
+
+	/* Make sure there are no mixed devices */
+	if (vm_dev->version == 2 &&
+	    !__virtio_test_bit(vdev, VIRTIO_F_VERSION_1)) {
+		dev_err(&vdev->dev, "New virtio-mmio devices (version 2) must provide VIRTIO_F_VERSION_1 feature!\n");
+		return -EINVAL;
+	}
+
+	hi = (u32)(vdev->features >> 32);
+	lo = (u32)vdev->features;
+
+	/* Upper bank first, then the lower one. */
+	writel(1, regs + VIRTIO_MMIO_DRIVER_FEATURES_SEL);
+	writel(hi, regs + VIRTIO_MMIO_DRIVER_FEATURES);
+
+	writel(0, regs + VIRTIO_MMIO_DRIVER_FEATURES_SEL);
+	writel(lo, regs + VIRTIO_MMIO_DRIVER_FEATURES);
+
+	return 0;
+}
+
+/*
+ * Read @len bytes of device configuration space at @offset into @buf.
+ *
+ * Version 1 (legacy) devices are read one byte at a time with no
+ * conversion. For other versions only 1, 2, 4 and 8 byte accesses are
+ * accepted and the value is stored into @buf as little-endian; an 8 byte
+ * field is fetched as two 32-bit reads, low word first. Any other length
+ * hits BUG().
+ */
+static void vm_get(struct virtio_device *vdev, unsigned int offset,
+		   void *buf, unsigned int len)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+	void __iomem *base = vm_dev->base + VIRTIO_MMIO_CONFIG;
+	u8 b;
+	__le16 w;
+	__le32 l;
+
+	if (vm_dev->version == 1) {
+		u8 *ptr = buf;
+		unsigned int i;	/* same type as len: no signed/unsigned compare */
+
+		for (i = 0; i < len; i++)
+			ptr[i] = readb(base + offset + i);
+		return;
+	}
+
+	switch (len) {
+	case 1:
+		b = readb(base + offset);
+		memcpy(buf, &b, sizeof b);
+		break;
+	case 2:
+		w = cpu_to_le16(readw(base + offset));
+		memcpy(buf, &w, sizeof w);
+		break;
+	case 4:
+		l = cpu_to_le32(readl(base + offset));
+		memcpy(buf, &l, sizeof l);
+		break;
+	case 8:
+		/* Both halves use readl(), matching the writel() pair in vm_set(). */
+		l = cpu_to_le32(readl(base + offset));
+		memcpy(buf, &l, sizeof l);
+		l = cpu_to_le32(readl(base + offset + sizeof l));
+		memcpy(buf + sizeof l, &l, sizeof l);
+		break;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Write @len bytes from @buf into device configuration space at @offset.
+ *
+ * Version 1 (legacy) devices are written one byte at a time with no
+ * conversion. For other versions only 1, 2, 4 and 8 byte accesses are
+ * accepted and @buf is interpreted as little-endian; an 8 byte field is
+ * written as two 32-bit writes, low word first. Any other length hits
+ * BUG().
+ */
+static void vm_set(struct virtio_device *vdev, unsigned int offset,
+		   const void *buf, unsigned int len)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+	void __iomem *base = vm_dev->base + VIRTIO_MMIO_CONFIG;
+	u8 b;
+	__le16 w;
+	__le32 l;
+
+	if (vm_dev->version == 1) {
+		const u8 *ptr = buf;
+		unsigned int i;	/* same type as len: no signed/unsigned compare */
+
+		for (i = 0; i < len; i++)
+			writeb(ptr[i], base + offset + i);
+
+		return;
+	}
+
+	switch (len) {
+	case 1:
+		memcpy(&b, buf, sizeof b);
+		writeb(b, base + offset);
+		break;
+	case 2:
+		memcpy(&w, buf, sizeof w);
+		writew(le16_to_cpu(w), base + offset);
+		break;
+	case 4:
+		memcpy(&l, buf, sizeof l);
+		writel(le32_to_cpu(l), base + offset);
+		break;
+	case 8:
+		memcpy(&l, buf, sizeof l);
+		writel(le32_to_cpu(l), base + offset);
+		memcpy(&l, buf + sizeof l, sizeof l);
+		writel(le32_to_cpu(l), base + offset + sizeof l);
+		break;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Return the config generation counter. Version 1 devices are reported as
+ * generation 0 without touching the device; otherwise the value comes from
+ * the CONFIG_GENERATION register.
+ */
+static u32 vm_generation(struct virtio_device *vdev)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+
+	if (vm_dev->version != 1)
+		return readl(vm_dev->base + VIRTIO_MMIO_CONFIG_GENERATION);
+
+	return 0;
+}
+
+/* Return the low 8 bits of the device STATUS register. */
+static u8 vm_get_status(struct virtio_device *vdev)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+	u32 raw = readl(vm_dev->base + VIRTIO_MMIO_STATUS);
+
+	return raw & 0xff;
+}
+
+/*
+ * Write @status to the device STATUS register. A zero status is a bug
+ * here: writing 0 is how vm_reset() resets the device.
+ */
+static void vm_set_status(struct virtio_device *vdev, u8 status)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+	void __iomem *status_reg = vm_dev->base + VIRTIO_MMIO_STATUS;
+
+	/* We should never be setting status to 0. */
+	BUG_ON(!status);
+
+	/*
+	 * Per memory-barriers.txt, wmb() is not needed to guarantee
+	 * that the cache coherent memory writes have completed
+	 * before writing to the MMIO region.
+	 */
+	writel(status, status_reg);
+}
+
+/* Reset the device by writing a status of 0. */
+static void vm_reset(struct virtio_device *vdev)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+	void __iomem *status_reg = vm_dev->base + VIRTIO_MMIO_STATUS;
+
+	/* 0 status means a reset. */
+	writel(0, status_reg);
+}
+
+
+
+/* Transport interface */
+
+/*
+ * Notify callback used when creating a virtqueue without
+ * VIRTIO_F_NOTIFICATION_DATA: signal the other end by writing the queue's
+ * index into the notification register. Always returns true.
+ */
+static bool vm_notify(struct virtqueue *vq)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev);
+	void __iomem *notify_reg = vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY;
+
+	writel(vq->index, notify_reg);
+
+	return true;
+}
+
+/*
+ * Notify callback used when VIRTIO_F_NOTIFICATION_DATA was negotiated:
+ * write the value produced by vring_notification_data() into the
+ * notification register. Always returns true.
+ */
+static bool vm_notify_with_data(struct virtqueue *vq)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev);
+	void __iomem *notify_reg = vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY;
+	u32 payload = vring_notification_data(vq);
+
+	writel(payload, notify_reg);
+
+	return true;
+}
+
+/*
+ * Device interrupt handler. Acknowledges everything the device reports as
+ * pending, forwards a config-change event to the virtio core and calls
+ * vring_interrupt() for every registered virtqueue.
+ *
+ * Returns IRQ_NONE when nothing was handled, otherwise the OR of the
+ * individual results.
+ */
+static irqreturn_t vm_interrupt(int irq, void *opaque)
+{
+	struct virtio_mmio_device *vm_dev = opaque;
+	struct virtio_mmio_vq_info *qi;
+	irqreturn_t handled = IRQ_NONE;
+	unsigned long pending;
+	unsigned long irqflags;
+
+	/* Read and acknowledge interrupts */
+	pending = readl(vm_dev->base + VIRTIO_MMIO_INTERRUPT_STATUS);
+	writel(pending, vm_dev->base + VIRTIO_MMIO_INTERRUPT_ACK);
+
+	if (unlikely(pending & VIRTIO_MMIO_INT_CONFIG)) {
+		virtio_config_changed(&vm_dev->vdev);
+		handled = IRQ_HANDLED;
+	}
+
+	if (likely(pending & VIRTIO_MMIO_INT_VRING)) {
+		/* The queue list is only walked with the device lock held. */
+		spin_lock_irqsave(&vm_dev->lock, irqflags);
+		list_for_each_entry(qi, &vm_dev->virtqueues, node)
+			handled |= vring_interrupt(irq, qi->vq);
+		spin_unlock_irqrestore(&vm_dev->lock, irqflags);
+	}
+
+	return handled;
+}
+
+
+
+/*
+ * Tear down one virtqueue: unlink it from the device's IRQ dispatch list,
+ * deactivate it in the device, destroy the vring and free its bookkeeping.
+ */
+static void vm_del_vq(struct virtqueue *vq)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev);
+	struct virtio_mmio_vq_info *qi = vq->priv;
+	void __iomem *regs = vm_dev->base;
+	unsigned int qidx = vq->index;
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&vm_dev->lock, irqflags);
+	list_del(&qi->node);
+	spin_unlock_irqrestore(&vm_dev->lock, irqflags);
+
+	/* Select and deactivate the queue */
+	writel(qidx, regs + VIRTIO_MMIO_QUEUE_SEL);
+	if (vm_dev->version != 1) {
+		writel(0, regs + VIRTIO_MMIO_QUEUE_READY);
+		WARN_ON(readl(regs + VIRTIO_MMIO_QUEUE_READY));
+	} else {
+		writel(0, regs + VIRTIO_MMIO_QUEUE_PFN);
+	}
+
+	vring_del_virtqueue(vq);
+
+	kfree(qi);
+}
+
+/*
+ * config->del_vqs() implementation: delete every virtqueue of the device,
+ * then release the device interrupt.
+ */
+static void vm_del_vqs(struct virtio_device *vdev)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+	struct virtqueue *cur, *next;
+	int irq;
+
+	/* _safe variant: vm_del_vq() destroys the entry being visited. */
+	list_for_each_entry_safe(cur, next, &vdev->vqs, list)
+		vm_del_vq(cur);
+
+	irq = platform_get_irq(vm_dev->pdev, 0);
+	free_irq(irq, vm_dev);
+}
+
+/* config->synchronize_cbs(): wait for handlers of the device IRQ. */
+static void vm_synchronize_cbs(struct virtio_device *vdev)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+	int irq = platform_get_irq(vm_dev->pdev, 0);
+
+	synchronize_irq(irq);
+}
+
+/*
+ * Create and activate virtqueue @index of @vdev.
+ *
+ * Returns the new virtqueue, NULL when @name is NULL, or an ERR_PTR():
+ * -ENOENT if the queue is already active or the device reports a maximum
+ * size of 0, -ENOMEM if an allocation fails, -E2BIG if a legacy (version 1)
+ * device would need a queue PFN that does not fit in 32 bits.
+ */
+static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int index,
+ void (*callback)(struct virtqueue *vq),
+ const char *name, bool ctx)
+{
+ struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+ bool (*notify)(struct virtqueue *vq);
+ struct virtio_mmio_vq_info *info;
+ struct virtqueue *vq;
+ unsigned long flags;
+ unsigned int num;
+ int err;
+
+ /* Pick the notify callback matching the negotiated features */
+ if (__virtio_test_bit(vdev, VIRTIO_F_NOTIFICATION_DATA))
+ notify = vm_notify_with_data;
+ else
+ notify = vm_notify;
+
+ if (!name)
+ return NULL;
+
+ /* Select the queue we're interested in */
+ writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL);
+
+ /* Queue shouldn't already be set up. */
+ if (readl(vm_dev->base + (vm_dev->version == 1 ?
+ VIRTIO_MMIO_QUEUE_PFN : VIRTIO_MMIO_QUEUE_READY))) {
+ err = -ENOENT;
+ goto error_available;
+ }
+
+ /* Allocate and fill out our active queue description */
+ info = kmalloc(sizeof(*info), GFP_KERNEL);
+ if (!info) {
+ err = -ENOMEM;
+ goto error_kmalloc;
+ }
+
+ /* A maximum size of 0 is treated as "queue not available" */
+ num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX);
+ if (num == 0) {
+ err = -ENOENT;
+ goto error_new_virtqueue;
+ }
+
+ /* Create the vring */
+ vq = vring_create_virtqueue(index, num, VIRTIO_MMIO_VRING_ALIGN, vdev,
+ true, true, ctx, notify, callback, name);
+ if (!vq) {
+ err = -ENOMEM;
+ goto error_new_virtqueue;
+ }
+
+ vq->num_max = num;
+
+ /* Activate the queue */
+ writel(virtqueue_get_vring_size(vq), vm_dev->base + VIRTIO_MMIO_QUEUE_NUM);
+ if (vm_dev->version == 1) {
+ u64 q_pfn = virtqueue_get_desc_addr(vq) >> PAGE_SHIFT;
+
+ /*
+ * virtio-mmio v1 uses a 32bit QUEUE PFN. If we have something
+ * that doesn't fit in 32bit, fail the setup rather than
+ * pretending to be successful.
+ */
+ if (q_pfn >> 32) {
+ dev_err(&vdev->dev,
+ "platform bug: legacy virtio-mmio must not be used with RAM above 0x%llxGB\n",
+ 0x1ULL << (32 + PAGE_SHIFT - 30));
+ err = -E2BIG;
+ goto error_bad_pfn;
+ }
+
+ writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_QUEUE_ALIGN);
+ writel(q_pfn, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN);
+ } else {
+ u64 addr;
+
+ /* Each 64-bit ring address is programmed as a LOW/HIGH pair */
+ addr = virtqueue_get_desc_addr(vq);
+ writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_LOW);
+ writel((u32)(addr >> 32),
+ vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_HIGH);
+
+ addr = virtqueue_get_avail_addr(vq);
+ writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_LOW);
+ writel((u32)(addr >> 32),
+ vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_HIGH);
+
+ addr = virtqueue_get_used_addr(vq);
+ writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_USED_LOW);
+ writel((u32)(addr >> 32),
+ vm_dev->base + VIRTIO_MMIO_QUEUE_USED_HIGH);
+
+ writel(1, vm_dev->base + VIRTIO_MMIO_QUEUE_READY);
+ }
+
+ vq->priv = info;
+ info->vq = vq;
+
+ /* Make the queue visible to vm_interrupt() */
+ spin_lock_irqsave(&vm_dev->lock, flags);
+ list_add(&info->node, &vm_dev->virtqueues);
+ spin_unlock_irqrestore(&vm_dev->lock, flags);
+
+ return vq;
+
+ /* Unwind in reverse order of setup; later labels undo less */
+error_bad_pfn:
+ vring_del_virtqueue(vq);
+error_new_virtqueue:
+ if (vm_dev->version == 1) {
+ writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN);
+ } else {
+ writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY);
+ WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY));
+ }
+ kfree(info);
+error_kmalloc:
+error_available:
+ return ERR_PTR(err);
+}
+
+/*
+ * config->find_vqs() implementation: request the device interrupt and set
+ * up @nvqs virtqueues. Entries with a NULL name get a NULL slot in @vqs and
+ * do not consume a device queue index.
+ *
+ * Returns 0 on success or a negative errno; if a queue fails to set up,
+ * everything created so far is torn down through vm_del_vqs(). @desc is
+ * not used by this transport.
+ */
+static int vm_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
+		       struct virtqueue *vqs[],
+		       vq_callback_t *callbacks[],
+		       const char * const names[],
+		       const bool *ctx,
+		       struct irq_affinity *desc)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+	int irq = platform_get_irq(vm_dev->pdev, 0);
+	int i, err, queue_idx = 0;
+
+	if (irq < 0)
+		return irq;
+
+	err = request_irq(irq, vm_interrupt, IRQF_SHARED,
+			  dev_name(&vdev->dev), vm_dev);
+	if (err)
+		return err;
+
+	if (of_property_read_bool(vm_dev->pdev->dev.of_node, "wakeup-source"))
+		enable_irq_wake(irq);
+
+	for (i = 0; i < nvqs; ++i) {
+		struct virtqueue *vq;
+
+		if (!names[i]) {
+			vqs[i] = NULL;
+			continue;
+		}
+
+		vq = vm_setup_vq(vdev, queue_idx++, callbacks[i], names[i],
+				 ctx ? ctx[i] : false);
+		vqs[i] = vq;
+		if (IS_ERR(vq)) {
+			vm_del_vqs(vdev);
+			return PTR_ERR(vq);
+		}
+	}
+
+	return 0;
+}
+
+/* config->bus_name(): report the name of the backing platform device. */
+static const char *vm_bus_name(struct virtio_device *vdev)
+{
+	const struct platform_device *pdev = to_virtio_mmio_device(vdev)->pdev;
+
+	return pdev->name;
+}
+
+/*
+ * config->get_shm_region(): look up shared memory region @id.
+ *
+ * @region->len is always filled in. Returns false when the device reports
+ * a length of all ones (region does not exist), in which case
+ * @region->addr is left untouched; otherwise fills in @region->addr and
+ * returns true.
+ */
+static bool vm_get_shm_region(struct virtio_device *vdev,
+			      struct virtio_shm_region *region, u8 id)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+	void __iomem *regs = vm_dev->base;
+	u64 lo, hi, size;
+
+	/* Select the region we're interested in */
+	writel(id, regs + VIRTIO_MMIO_SHM_SEL);
+
+	/* Read the region size */
+	lo = readl(regs + VIRTIO_MMIO_SHM_LEN_LOW);
+	hi = readl(regs + VIRTIO_MMIO_SHM_LEN_HIGH);
+	size = lo | (hi << 32);
+
+	region->len = size;
+
+	/*
+	 * A length of -1 means the shared memory region does not exist, so
+	 * there is no base address to read.
+	 */
+	if (size == ~(u64)0)
+		return false;
+
+	/* Read the region base address */
+	lo = readl(regs + VIRTIO_MMIO_SHM_BASE_LOW);
+	hi = readl(regs + VIRTIO_MMIO_SHM_BASE_HIGH);
+
+	region->addr = lo | (hi << 32);
+
+	return true;
+}
+
+/*
+ * Transport operations installed as vdev.config by virtio_mmio_probe();
+ * every callback is one of the vm_*() functions of this file.
+ */
+static const struct virtio_config_ops virtio_mmio_config_ops = {
+ .get = vm_get,
+ .set = vm_set,
+ .generation = vm_generation,
+ .get_status = vm_get_status,
+ .set_status = vm_set_status,
+ .reset = vm_reset,
+ .find_vqs = vm_find_vqs,
+ .del_vqs = vm_del_vqs,
+ .get_features = vm_get_features,
+ .finalize_features = vm_finalize_features,
+ .bus_name = vm_bus_name,
+ .get_shm_region = vm_get_shm_region,
+ .synchronize_cbs = vm_synchronize_cbs,
+};
+
+#ifdef CONFIG_PM_SLEEP
+/* System sleep: hand the freeze request to the virtio core. */
+static int virtio_mmio_freeze(struct device *dev)
+{
+	struct virtio_mmio_device *vm_dev = dev_get_drvdata(dev);
+	struct virtio_device *vdev = &vm_dev->vdev;
+
+	return virtio_device_freeze(vdev);
+}
+
+/*
+ * System resume: for version 1 devices write the guest page size again
+ * (as probe does), then let the virtio core restore the device.
+ */
+static int virtio_mmio_restore(struct device *dev)
+{
+	struct virtio_mmio_device *vm_dev = dev_get_drvdata(dev);
+	struct virtio_device *vdev = &vm_dev->vdev;
+
+	if (vm_dev->version == 1)
+		writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_GUEST_PAGE_SIZE);
+
+	return virtio_device_restore(vdev);
+}
+
+/* System sleep callbacks, referenced by virtio_mmio_driver.driver.pm. */
+static const struct dev_pm_ops virtio_mmio_pm_ops = {
+ SET_SYSTEM_SLEEP_PM_OPS(virtio_mmio_freeze, virtio_mmio_restore)
+};
+#endif
+
+/*
+ * Release callback of the embedded struct device: frees the
+ * virtio_mmio_device that contains it.
+ */
+static void virtio_mmio_release_dev(struct device *_d)
+{
+	struct virtio_device *vdev;
+
+	vdev = container_of(_d, struct virtio_device, dev);
+	kfree(to_virtio_mmio_device(vdev));
+}
+
+/* Platform device */
+
+/*
+ * Probe one virtio-mmio platform device.
+ *
+ * Maps the register window, validates the magic value and version, reads
+ * the device and vendor ids, configures the DMA masks and registers the
+ * virtio device.
+ *
+ * Returns 0 on success or a negative errno: -ENOMEM on allocation failure,
+ * the ioremap error, -ENODEV for a wrong magic value or a device id of 0,
+ * -ENXIO for an unsupported version, or the error returned by
+ * register_virtio_device().
+ *
+ * Until register_virtio_device() is called the structure is freed with
+ * kfree(); afterwards it is released through put_device(), which ends up in
+ * virtio_mmio_release_dev().
+ */
+static int virtio_mmio_probe(struct platform_device *pdev)
+{
+	struct virtio_mmio_device *vm_dev;
+	unsigned long magic;
+	int rc;
+
+	vm_dev = kzalloc(sizeof(*vm_dev), GFP_KERNEL);
+	if (!vm_dev)
+		return -ENOMEM;
+
+	vm_dev->vdev.dev.parent = &pdev->dev;
+	vm_dev->vdev.dev.release = virtio_mmio_release_dev;
+	vm_dev->vdev.config = &virtio_mmio_config_ops;
+	vm_dev->pdev = pdev;
+	INIT_LIST_HEAD(&vm_dev->virtqueues);
+	spin_lock_init(&vm_dev->lock);
+
+	vm_dev->base = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(vm_dev->base)) {
+		rc = PTR_ERR(vm_dev->base);
+		goto free_vm_dev;
+	}
+
+	/* Check magic value */
+	magic = readl(vm_dev->base + VIRTIO_MMIO_MAGIC_VALUE);
+	if (magic != ('v' | 'i' << 8 | 'r' << 16 | 't' << 24)) {
+		dev_warn(&pdev->dev, "Wrong magic value 0x%08lx!\n", magic);
+		rc = -ENODEV;
+		goto free_vm_dev;
+	}
+
+	/* Check device version */
+	vm_dev->version = readl(vm_dev->base + VIRTIO_MMIO_VERSION);
+	if (vm_dev->version < 1 || vm_dev->version > 2) {
+		/* version is unsigned long, so it is printed with %lu */
+		dev_err(&pdev->dev, "Version %lu not supported!\n",
+			vm_dev->version);
+		rc = -ENXIO;
+		goto free_vm_dev;
+	}
+
+	vm_dev->vdev.id.device = readl(vm_dev->base + VIRTIO_MMIO_DEVICE_ID);
+	if (vm_dev->vdev.id.device == 0) {
+		/*
+		 * virtio-mmio device with an ID 0 is a (dummy) placeholder
+		 * with no function. End probing now with no error reported.
+		 */
+		rc = -ENODEV;
+		goto free_vm_dev;
+	}
+	vm_dev->vdev.id.vendor = readl(vm_dev->base + VIRTIO_MMIO_VENDOR_ID);
+
+	if (vm_dev->version == 1) {
+		writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_GUEST_PAGE_SIZE);
+
+		rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
+		/*
+		 * In the legacy case, ensure our coherently-allocated virtio
+		 * ring will be at an address expressible as a 32-bit PFN.
+		 */
+		if (!rc)
+			dma_set_coherent_mask(&pdev->dev,
+					      DMA_BIT_MASK(32 + PAGE_SHIFT));
+	} else {
+		rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
+	}
+	/* Fall back to 32-bit DMA; a failure here is only warned about. */
+	if (rc)
+		rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+	if (rc)
+		dev_warn(&pdev->dev, "Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.\n");
+
+	platform_set_drvdata(pdev, vm_dev);
+
+	rc = register_virtio_device(&vm_dev->vdev);
+	if (rc)
+		put_device(&vm_dev->vdev.dev);
+
+	return rc;
+
+free_vm_dev:
+	kfree(vm_dev);
+	return rc;
+}
+
+/* Platform driver remove callback: unregister the virtio device. */
+static int virtio_mmio_remove(struct platform_device *pdev)
+{
+	struct virtio_mmio_device *vm_dev = platform_get_drvdata(pdev);
+	struct virtio_device *vdev = &vm_dev->vdev;
+
+	unregister_virtio_device(vdev);
+
+	return 0;
+}
+
+
+
+/* Devices list parameter */
+
+#if defined(CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES)
+
+/* Parent device under which all command-line devices are registered. */
+static struct device vm_cmdline_parent = {
+ .init_name = "virtio-mmio-cmdline",
+};
+
+/* Non-zero once vm_cmdline_parent has been registered. */
+static int vm_cmdline_parent_registered;
+/* Platform device id used for the next command-line device. */
+static int vm_cmdline_id;
+
+/*
+ * "device" parameter setter: parse "<size>@<baseaddr>:<irq>[:<id>]" and
+ * register a matching "virtio-mmio" platform device below
+ * vm_cmdline_parent, registering the parent first if needed.
+ *
+ * An explicit <id> is parsed straight into vm_cmdline_id, which is then
+ * post-incremented, so later devices without an id continue from there.
+ *
+ * Returns 0 on success, -EINVAL for a malformed string or an irq of 0, or
+ * the error from registering the parent or the platform device.
+ */
+static int vm_cmdline_set(const char *device,
+ const struct kernel_param *kp)
+{
+ int err;
+ struct resource resources[2] = {};
+ char *str;
+ long long base, size;
+ unsigned int irq;
+ int processed, consumed = 0;
+ struct platform_device *pdev;
+
+ /* Consume "size" part of the command line parameter */
+ size = memparse(device, &str);
+
+ /* Get "@<base>:<irq>[:<id>]" chunks */
+ /* Each %n records how far parsing got; the later one wins if reached */
+ processed = sscanf(str, "@%lli:%u%n:%d%n",
+ &base, &irq, &consumed,
+ &vm_cmdline_id, &consumed);
+
+ /*
+ * sscanf() must process at least 2 chunks; also there
+ * must be no extra characters after the last chunk, so
+ * str[consumed] must be '\0'
+ */
+ if (processed < 2 || str[consumed] || irq == 0)
+ return -EINVAL;
+
+ resources[0].flags = IORESOURCE_MEM;
+ resources[0].start = base;
+ resources[0].end = base + size - 1;
+
+ resources[1].flags = IORESOURCE_IRQ;
+ resources[1].start = resources[1].end = irq;
+
+ /* The parent device is registered lazily, on first use */
+ if (!vm_cmdline_parent_registered) {
+ err = device_register(&vm_cmdline_parent);
+ if (err) {
+ put_device(&vm_cmdline_parent);
+ pr_err("Failed to register parent device!\n");
+ return err;
+ }
+ vm_cmdline_parent_registered = 1;
+ }
+
+ pr_info("Registering device virtio-mmio.%d at 0x%llx-0x%llx, IRQ %d.\n",
+ vm_cmdline_id,
+ (unsigned long long)resources[0].start,
+ (unsigned long long)resources[0].end,
+ (int)resources[1].start);
+
+ pdev = platform_device_register_resndata(&vm_cmdline_parent,
+ "virtio-mmio", vm_cmdline_id++,
+ resources, ARRAY_SIZE(resources), NULL, 0);
+
+ return PTR_ERR_OR_ZERO(pdev);
+}
+
+/*
+ * device_for_each_child() callback: append one line describing @dev, in
+ * "<size>@<baseaddr>:<irq>:<id>" form, to the string in @data. Output is
+ * bounded to what is left of a PAGE_SIZE buffer. Always returns 0 so the
+ * iteration continues.
+ */
+static int vm_cmdline_get_device(struct device *dev, void *data)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	const struct resource *mem = &pdev->resource[0];
+	const struct resource *irq = &pdev->resource[1];
+	char *buffer = data;
+	unsigned int used = strlen(buffer);
+
+	snprintf(buffer + used, PAGE_SIZE - used, "0x%llx@0x%llx:%llu:%d\n",
+		 mem->end - mem->start + 1ULL,
+		 (unsigned long long)mem->start,
+		 (unsigned long long)irq->start,
+		 pdev->id);
+
+	return 0;
+}
+
+/*
+ * "device" parameter getter: list every registered command-line device,
+ * one per line, in @buffer. Returns the string length plus one.
+ */
+static int vm_cmdline_get(char *buffer, const struct kernel_param *kp)
+{
+	/* Start from an empty string; each child appends its own line. */
+	*buffer = '\0';
+
+	device_for_each_child(&vm_cmdline_parent, buffer,
+			      vm_cmdline_get_device);
+
+	return strlen(buffer) + 1;
+}
+
+/* Accessors behind the "device" parameter. */
+static const struct kernel_param_ops vm_cmdline_param_ops = {
+ .set = vm_cmdline_set,
+ .get = vm_cmdline_get,
+};
+
+/* Expose "device" with owner-read-only (S_IRUSR) permissions. */
+device_param_cb(device, &vm_cmdline_param_ops, NULL, S_IRUSR);
+
+/*
+ * device_for_each_child() callback: unregister the platform device behind
+ * @dev. @data is unused. Always returns 0 so the iteration continues.
+ */
+static int vm_unregister_cmdline_device(struct device *dev,
+					void *data)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+
+	platform_device_unregister(pdev);
+
+	return 0;
+}
+
+/*
+ * Unregister every command-line device and then their common parent.
+ * Does nothing if the parent was never registered.
+ */
+static void vm_unregister_cmdline_devices(void)
+{
+	if (!vm_cmdline_parent_registered)
+		return;
+
+	device_for_each_child(&vm_cmdline_parent, NULL,
+			      vm_unregister_cmdline_device);
+	device_unregister(&vm_cmdline_parent);
+	vm_cmdline_parent_registered = 0;
+}
+
+#else
+
+/* No command-line devices without CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES. */
+static void vm_unregister_cmdline_devices(void)
+{
+}
+
+#endif
+
+/* Platform driver */
+
+/* Device Tree match table: nodes with compatible = "virtio,mmio". */
+static const struct of_device_id virtio_mmio_match[] = {
+ { .compatible = "virtio,mmio", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, virtio_mmio_match);
+
+#ifdef CONFIG_ACPI
+/* ACPI match table: devices with the "LNRO0005" id. */
+static const struct acpi_device_id virtio_mmio_acpi_match[] = {
+ { "LNRO0005", },
+ { }
+};
+MODULE_DEVICE_TABLE(acpi, virtio_mmio_acpi_match);
+#endif
+
+/*
+ * Platform driver description. The "virtio-mmio" name is the same one
+ * vm_cmdline_set() registers its platform devices under; the PM callbacks
+ * are only wired up with CONFIG_PM_SLEEP.
+ */
+static struct platform_driver virtio_mmio_driver = {
+ .probe = virtio_mmio_probe,
+ .remove = virtio_mmio_remove,
+ .driver = {
+ .name = "virtio-mmio",
+ .of_match_table = virtio_mmio_match,
+ .acpi_match_table = ACPI_PTR(virtio_mmio_acpi_match),
+#ifdef CONFIG_PM_SLEEP
+ .pm = &virtio_mmio_pm_ops,
+#endif
+ },
+};
+
+/* Module init: register the platform driver and return its result. */
+static int __init virtio_mmio_init(void)
+{
+	int rc = platform_driver_register(&virtio_mmio_driver);
+
+	return rc;
+}
+
+/*
+ * Module exit: unregister the platform driver first, then the devices that
+ * were created from the command line.
+ */
+static void __exit virtio_mmio_exit(void)
+{
+	platform_driver_unregister(&virtio_mmio_driver);
+
+	vm_unregister_cmdline_devices();
+}
+
+/* Module entry points and metadata. */
+module_init(virtio_mmio_init);
+module_exit(virtio_mmio_exit);
+
+MODULE_AUTHOR("Pawel Moll <pawel.moll@arm.com>");
+MODULE_DESCRIPTION("Platform bus driver for memory mapped virtio devices");
+MODULE_LICENSE("GPL");
diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
new file mode 100644
index 000000000..c2524a720
--- /dev/null
+++ b/drivers/virtio/virtio_pci_common.c
@@ -0,0 +1,650 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Virtio PCI driver - common functionality for all device versions
+ *
+ * This module allows virtio devices to be used over a virtual PCI device.
+ * This can be used with QEMU based VMMs like KVM or Xen.
+ *
+ * Copyright IBM Corp. 2007
+ * Copyright Red Hat, Inc. 2014
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Rusty Russell <rusty@rustcorp.com.au>
+ * Michael S. Tsirkin <mst@redhat.com>
+ */
+
+#include "virtio_pci_common.h"
+
+/*
+ * Off by default. Only exposed as a read-only (0444) module parameter when
+ * CONFIG_VIRTIO_PCI_LEGACY is enabled.
+ */
+static bool force_legacy = false;
+
+#if IS_ENABLED(CONFIG_VIRTIO_PCI_LEGACY)
+module_param(force_legacy, bool, 0444);
+MODULE_PARM_DESC(force_legacy,
+ "Force legacy mode for transitional virtio 1 devices");
+#endif
+
+/*
+ * Wait for pending irq handlers: the INTx line if it is in use, and every
+ * MSI-X vector counted in msix_vectors.
+ */
+void vp_synchronize_vectors(struct virtio_device *vdev)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	int vec;
+
+	if (vp_dev->intx_enabled)
+		synchronize_irq(vp_dev->pci_dev->irq);
+
+	for (vec = 0; vec < vp_dev->msix_vectors; ++vec) {
+		int irq = pci_irq_vector(vp_dev->pci_dev, vec);
+
+		synchronize_irq(irq);
+	}
+}
+
+/*
+ * Notify function used when creating a virtqueue: signal the other end by
+ * writing the queue's index to the notification address kept in vq->priv.
+ * Always returns true.
+ */
+bool vp_notify(struct virtqueue *vq)
+{
+	void __iomem *notify_addr = (void __iomem *)vq->priv;
+
+	iowrite16(vq->index, notify_addr);
+
+	return true;
+}
+
+/*
+ * Configuration-change interrupt: pass the event on to the virtio core,
+ * which tells the driver if it wants to know. Always IRQ_HANDLED.
+ */
+static irqreturn_t vp_config_changed(int irq, void *opaque)
+{
+	struct virtio_pci_device *vp_dev = opaque;
+	struct virtio_device *vdev = &vp_dev->vdev;
+
+	virtio_config_changed(vdev);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Notify all virtqueues on an interrupt. Returns IRQ_HANDLED if at least
+ * one queue on the device's list reported IRQ_HANDLED, IRQ_NONE otherwise.
+ */
+static irqreturn_t vp_vring_interrupt(int irq, void *opaque)
+{
+	struct virtio_pci_device *vp_dev = opaque;
+	struct virtio_pci_vq_info *qi;
+	irqreturn_t result = IRQ_NONE;
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&vp_dev->lock, irqflags);
+	list_for_each_entry(qi, &vp_dev->virtqueues, node) {
+		irqreturn_t one = vring_interrupt(irq, qi->vq);
+
+		if (one == IRQ_HANDLED)
+			result = IRQ_HANDLED;
+	}
+	spin_unlock_irqrestore(&vp_dev->lock, irqflags);
+
+	return result;
+}
+
+/*
+ * INTx handler: a small wrapper that also acknowledges the interrupt when
+ * it is handled.
+ *
+ * I really need an EIO hook for the vring so I can ack the interrupt once we
+ * know that we'll be handling the IRQ but before we invoke the callback since
+ * the callback may notify the host which results in the host attempting to
+ * raise an interrupt that we would then mask once we acknowledged the
+ * interrupt.
+ */
+static irqreturn_t vp_interrupt(int irq, void *opaque)
+{
+	struct virtio_pci_device *vp_dev = opaque;
+	u8 cause;
+
+	/*
+	 * Reading the ISR also clears it, so the value has to be saved off
+	 * before it is examined.
+	 */
+	cause = ioread8(vp_dev->isr);
+
+	/* It's definitely not us if the ISR was not high */
+	if (cause == 0)
+		return IRQ_NONE;
+
+	/* Configuration change? Tell driver if it wants to know. */
+	if (cause & VIRTIO_PCI_ISR_CONFIG)
+		vp_config_changed(irq, opaque);
+
+	return vp_vring_interrupt(irq, opaque);
+}
+
+/*
+ * Allocate exactly @nvectors MSI-X vectors and request the irq for the
+ * configuration vector; when @per_vq_vectors is false, also request one
+ * irq shared by all virtqueues.
+ *
+ * Returns 0 on success or a negative errno. Nothing is released here on
+ * failure: the error label only returns, and the caller in this file,
+ * vp_find_vqs_msix(), cleans up through vp_del_vqs().
+ *
+ * NOTE(review): when @desc is given, desc->pre_vectors is incremented in
+ * the caller's structure and is not restored on a later failure.
+ */
+static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
+ bool per_vq_vectors, struct irq_affinity *desc)
+{
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+ const char *name = dev_name(&vp_dev->vdev.dev);
+ unsigned int flags = PCI_IRQ_MSIX;
+ unsigned int i, v;
+ int err = -ENOMEM;
+
+ /* Recorded up front; vp_del_vqs() uses it to free the affinity masks */
+ vp_dev->msix_vectors = nvectors;
+
+ vp_dev->msix_names = kmalloc_array(nvectors,
+ sizeof(*vp_dev->msix_names),
+ GFP_KERNEL);
+ if (!vp_dev->msix_names)
+ goto error;
+ vp_dev->msix_affinity_masks
+ = kcalloc(nvectors, sizeof(*vp_dev->msix_affinity_masks),
+ GFP_KERNEL);
+ if (!vp_dev->msix_affinity_masks)
+ goto error;
+ for (i = 0; i < nvectors; ++i)
+ if (!alloc_cpumask_var(&vp_dev->msix_affinity_masks[i],
+ GFP_KERNEL))
+ goto error;
+
+ if (desc) {
+ flags |= PCI_IRQ_AFFINITY;
+ desc->pre_vectors++; /* virtio config vector */
+ }
+
+ /* min == max: either all @nvectors are granted or the call fails */
+ err = pci_alloc_irq_vectors_affinity(vp_dev->pci_dev, nvectors,
+ nvectors, flags, desc);
+ if (err < 0)
+ goto error;
+ vp_dev->msix_enabled = 1;
+
+ /* Set the vector used for configuration */
+ v = vp_dev->msix_used_vectors;
+ snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
+ "%s-config", name);
+ err = request_irq(pci_irq_vector(vp_dev->pci_dev, v),
+ vp_config_changed, 0, vp_dev->msix_names[v],
+ vp_dev);
+ if (err)
+ goto error;
+ ++vp_dev->msix_used_vectors;
+
+ v = vp_dev->config_vector(vp_dev, v);
+ /* Verify we had enough resources to assign the vector */
+ if (v == VIRTIO_MSI_NO_VECTOR) {
+ err = -EBUSY;
+ goto error;
+ }
+
+ if (!per_vq_vectors) {
+ /* Shared vector for all VQs */
+ v = vp_dev->msix_used_vectors;
+ snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
+ "%s-virtqueues", name);
+ err = request_irq(pci_irq_vector(vp_dev->pci_dev, v),
+ vp_vring_interrupt, 0, vp_dev->msix_names[v],
+ vp_dev);
+ if (err)
+ goto error;
+ ++vp_dev->msix_used_vectors;
+ }
+ return 0;
+error:
+ return err;
+}
+
+/*
+ * Create virtqueue @index through the device's setup_vq() hook and record
+ * it in vp_dev->vqs[]. Only queues that have a callback are put on the
+ * device's virtqueues list; the others just get an initialised, unlinked
+ * node.
+ *
+ * Returns the virtqueue, ERR_PTR(-ENOMEM) if the bookkeeping allocation
+ * fails, or the error pointer returned by setup_vq().
+ */
+static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int index,
+				     void (*callback)(struct virtqueue *vq),
+				     const char *name,
+				     bool ctx,
+				     u16 msix_vec)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	struct virtio_pci_vq_info *qi;
+	struct virtqueue *vq;
+	unsigned long irqflags;
+
+	/* fill out our structure that represents an active queue */
+	qi = kmalloc(sizeof *qi, GFP_KERNEL);
+	if (!qi)
+		return ERR_PTR(-ENOMEM);
+
+	vq = vp_dev->setup_vq(vp_dev, qi, index, callback, name, ctx,
+			      msix_vec);
+	if (IS_ERR(vq)) {
+		kfree(qi);
+		return vq;
+	}
+
+	qi->vq = vq;
+	if (!callback) {
+		INIT_LIST_HEAD(&qi->node);
+	} else {
+		spin_lock_irqsave(&vp_dev->lock, irqflags);
+		list_add(&qi->node, &vp_dev->virtqueues);
+		spin_unlock_irqrestore(&vp_dev->lock, irqflags);
+	}
+
+	vp_dev->vqs[index] = qi;
+
+	return vq;
+}
+
+/*
+ * Delete one virtqueue: unlink its bookkeeping from the device list
+ * (unless the queue is marked as reset), let the del_vq() hook tear it
+ * down and free the bookkeeping.
+ */
+static void vp_del_vq(struct virtqueue *vq)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
+	struct virtio_pci_vq_info *qi = vp_dev->vqs[vq->index];
+	unsigned long irqflags;
+
+	/*
+	 * If it fails during re-enable reset vq. This way we won't rejoin
+	 * info->node to the queue. Prevent unexpected irqs.
+	 */
+	if (!vq->reset) {
+		spin_lock_irqsave(&vp_dev->lock, irqflags);
+		list_del(&qi->node);
+		spin_unlock_irqrestore(&vp_dev->lock, irqflags);
+	}
+
+	vp_dev->del_vq(qi);
+
+	kfree(qi);
+}
+
+/*
+ * the config->del_vqs() implementation
+ *
+ * Deletes every virtqueue of @vdev, frees all requested interrupts, gives
+ * the MSI-X vectors back and resets the interrupt bookkeeping in the
+ * virtio_pci_device. The find_vqs error paths in this file also call it
+ * after a partial setup.
+ */
+void vp_del_vqs(struct virtio_device *vdev)
+{
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+ struct virtqueue *vq, *n;
+ int i;
+
+ list_for_each_entry_safe(vq, n, &vdev->vqs, list) {
+ if (vp_dev->per_vq_vectors) {
+ int v = vp_dev->vqs[vq->index]->msix_vector;
+
+ /* Per-queue irqs were requested with the vq as cookie */
+ if (v != VIRTIO_MSI_NO_VECTOR) {
+ int irq = pci_irq_vector(vp_dev->pci_dev, v);
+
+ irq_set_affinity_hint(irq, NULL);
+ free_irq(irq, vq);
+ }
+ }
+ vp_del_vq(vq);
+ }
+ vp_dev->per_vq_vectors = false;
+
+ if (vp_dev->intx_enabled) {
+ free_irq(vp_dev->pci_dev->irq, vp_dev);
+ vp_dev->intx_enabled = 0;
+ }
+
+ /* Vectors counted here were requested with vp_dev as cookie */
+ for (i = 0; i < vp_dev->msix_used_vectors; ++i)
+ free_irq(pci_irq_vector(vp_dev->pci_dev, i), vp_dev);
+
+ if (vp_dev->msix_affinity_masks) {
+ for (i = 0; i < vp_dev->msix_vectors; i++)
+ free_cpumask_var(vp_dev->msix_affinity_masks[i]);
+ }
+
+ if (vp_dev->msix_enabled) {
+ /* Disable the vector used for configuration */
+ vp_dev->config_vector(vp_dev, VIRTIO_MSI_NO_VECTOR);
+
+ pci_free_irq_vectors(vp_dev->pci_dev);
+ vp_dev->msix_enabled = 0;
+ }
+
+ /* Pointers are cleared after freeing so a later call sees a clean state */
+ vp_dev->msix_vectors = 0;
+ vp_dev->msix_used_vectors = 0;
+ kfree(vp_dev->msix_names);
+ vp_dev->msix_names = NULL;
+ kfree(vp_dev->msix_affinity_masks);
+ vp_dev->msix_affinity_masks = NULL;
+ kfree(vp_dev->vqs);
+ vp_dev->vqs = NULL;
+}
+
+/*
+ * Set up @nvqs virtqueues using MSI-X.
+ *
+ * With @per_vq_vectors, one vector is used for config changes plus one for
+ * every named queue that has a callback; otherwise two vectors are used,
+ * one for config changes and one shared by all queues. Entries with a NULL
+ * name get a NULL slot in @vqs and do not consume a device queue index.
+ *
+ * Returns 0 on success or a negative errno; on failure everything set up
+ * so far is undone through vp_del_vqs().
+ */
+static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs,
+ struct virtqueue *vqs[], vq_callback_t *callbacks[],
+ const char * const names[], bool per_vq_vectors,
+ const bool *ctx,
+ struct irq_affinity *desc)
+{
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+ u16 msix_vec;
+ int i, err, nvectors, allocated_vectors, queue_idx = 0;
+
+ vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL);
+ if (!vp_dev->vqs)
+ return -ENOMEM;
+
+ if (per_vq_vectors) {
+ /* Best option: one for change interrupt, one per vq. */
+ nvectors = 1;
+ for (i = 0; i < nvqs; ++i)
+ if (names[i] && callbacks[i])
+ ++nvectors;
+ } else {
+ /* Second best: one for change, shared for all vqs. */
+ nvectors = 2;
+ }
+
+ /* The affinity descriptor is only passed on for per-queue vectors */
+ err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors,
+ per_vq_vectors ? desc : NULL);
+ if (err)
+ goto error_find;
+
+ vp_dev->per_vq_vectors = per_vq_vectors;
+ /* Per-queue vectors are numbered after the ones already in use */
+ allocated_vectors = vp_dev->msix_used_vectors;
+ for (i = 0; i < nvqs; ++i) {
+ if (!names[i]) {
+ vqs[i] = NULL;
+ continue;
+ }
+
+ if (!callbacks[i])
+ msix_vec = VIRTIO_MSI_NO_VECTOR;
+ else if (vp_dev->per_vq_vectors)
+ msix_vec = allocated_vectors++;
+ else
+ msix_vec = VP_MSIX_VQ_VECTOR;
+ vqs[i] = vp_setup_vq(vdev, queue_idx++, callbacks[i], names[i],
+ ctx ? ctx[i] : false,
+ msix_vec);
+ if (IS_ERR(vqs[i])) {
+ err = PTR_ERR(vqs[i]);
+ goto error_find;
+ }
+
+ if (!vp_dev->per_vq_vectors || msix_vec == VIRTIO_MSI_NO_VECTOR)
+ continue;
+
+ /* allocate per-vq irq if available and necessary */
+ snprintf(vp_dev->msix_names[msix_vec],
+ sizeof *vp_dev->msix_names,
+ "%s-%s",
+ dev_name(&vp_dev->vdev.dev), names[i]);
+ err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec),
+ vring_interrupt, 0,
+ vp_dev->msix_names[msix_vec],
+ vqs[i]);
+ /*
+ * NOTE(review): if this fails, the queue was already created, and
+ * vp_del_vqs() may call free_irq() for an irq that was never
+ * requested -- confirm against setup_vq().
+ */
+ if (err)
+ goto error_find;
+ }
+ return 0;
+
+error_find:
+ vp_del_vqs(vdev);
+ return err;
+}
+
+static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs,
+ struct virtqueue *vqs[], vq_callback_t *callbacks[],
+ const char * const names[], const bool *ctx)
+{
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+ int i, err, queue_idx = 0;
+
+ vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL);
+ if (!vp_dev->vqs)
+ return -ENOMEM;
+
+ err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED,
+ dev_name(&vdev->dev), vp_dev);
+ if (err)
+ goto out_del_vqs;
+
+ vp_dev->intx_enabled = 1;
+ vp_dev->per_vq_vectors = false;
+ for (i = 0; i < nvqs; ++i) {
+ if (!names[i]) {
+ vqs[i] = NULL;
+ continue;
+ }
+ vqs[i] = vp_setup_vq(vdev, queue_idx++, callbacks[i], names[i],
+ ctx ? ctx[i] : false,
+ VIRTIO_MSI_NO_VECTOR);
+ if (IS_ERR(vqs[i])) {
+ err = PTR_ERR(vqs[i]);
+ goto out_del_vqs;
+ }
+ }
+
+ return 0;
+out_del_vqs:
+ vp_del_vqs(vdev);
+ return err;
+}
+
+/* the config->find_vqs() implementation */
+int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
+ struct virtqueue *vqs[], vq_callback_t *callbacks[],
+ const char * const names[], const bool *ctx,
+ struct irq_affinity *desc)
+{
+ int err;
+
+ /* Try MSI-X with one vector per queue. */
+ err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, true, ctx, desc);
+ if (!err)
+ return 0;
+ /* Fallback: MSI-X with one vector for config, one shared for queues. */
+ err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, false, ctx, desc);
+ if (!err)
+ return 0;
+ /* Is there an interrupt? If not give up. */
+ if (!(to_vp_device(vdev)->pci_dev->irq))
+ return err;
+ /* Finally fall back to regular interrupts. */
+ return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names, ctx);
+}
+
+const char *vp_bus_name(struct virtio_device *vdev)
+{
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+
+ return pci_name(vp_dev->pci_dev);
+}
+
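+/* Setup the affinity for a virtqueue (-EINVAL if the vq has no callback):
+ * - MSI-X: copy cpu_mask to the vector's mask and use it as affinity hint
+ *   (NULL cpu_mask: NULL hint); a shared vector keeps only the last request
+ * - ignore the affinity request if we're using INTX
+ */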
+int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask)
+{
+ struct virtio_device *vdev = vq->vdev;
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+ struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index];
+ struct cpumask *mask;
+ unsigned int irq;
+
+ if (!vq->callback)
+ return -EINVAL;
+
+ if (vp_dev->msix_enabled) {
+ mask = vp_dev->msix_affinity_masks[info->msix_vector];
+ irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector);
+ if (!cpu_mask)
+ irq_set_affinity_hint(irq, NULL);
+ else {
+ cpumask_copy(mask, cpu_mask);
+ irq_set_affinity_hint(irq, mask);
+ }
+ }
+ return 0;
+}
+
+const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index)
+{
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+
+ if (!vp_dev->per_vq_vectors ||
+ vp_dev->vqs[index]->msix_vector == VIRTIO_MSI_NO_VECTOR)
+ return NULL;
+
+ return pci_irq_get_affinity(vp_dev->pci_dev,
+ vp_dev->vqs[index]->msix_vector);
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int virtio_pci_freeze(struct device *dev)
+{
+ struct pci_dev *pci_dev = to_pci_dev(dev);
+ struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
+ int ret;
+
+ ret = virtio_device_freeze(&vp_dev->vdev);
+
+ if (!ret)
+ pci_disable_device(pci_dev);
+ return ret;
+}
+
+static int virtio_pci_restore(struct device *dev)
+{
+ struct pci_dev *pci_dev = to_pci_dev(dev);
+ struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
+ int ret;
+
+ ret = pci_enable_device(pci_dev);
+ if (ret)
+ return ret;
+
+ pci_set_master(pci_dev);
+ return virtio_device_restore(&vp_dev->vdev);
+}
+
+static const struct dev_pm_ops virtio_pci_pm_ops = {
+ SET_SYSTEM_SLEEP_PM_OPS(virtio_pci_freeze, virtio_pci_restore)
+};
+#endif
+
+
+/* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */
+static const struct pci_device_id virtio_pci_id_table[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) },
+ { 0 }
+};
+
+MODULE_DEVICE_TABLE(pci, virtio_pci_id_table);
+
+static void virtio_pci_release_dev(struct device *_d)
+{
+ struct virtio_device *vdev = dev_to_virtio(_d);
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+
+ /* As struct device is a kobject, it's not safe to
+ * free the memory (including the reference counter itself)
+ * until its release callback. */
+ kfree(vp_dev);
+}
+
+static int virtio_pci_probe(struct pci_dev *pci_dev,
+ const struct pci_device_id *id)
+{
+ struct virtio_pci_device *vp_dev, *reg_dev = NULL;
+ int rc;
+
+ /* allocate our structure and fill it out */
+ vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL);
+ if (!vp_dev)
+ return -ENOMEM;
+
+ pci_set_drvdata(pci_dev, vp_dev);
+ vp_dev->vdev.dev.parent = &pci_dev->dev;
+ vp_dev->vdev.dev.release = virtio_pci_release_dev;
+ vp_dev->pci_dev = pci_dev;
+ INIT_LIST_HEAD(&vp_dev->virtqueues);
+ spin_lock_init(&vp_dev->lock);
+
+ /* enable the device */
+ rc = pci_enable_device(pci_dev);
+ if (rc)
+ goto err_enable_device;
+
+ if (force_legacy) {
+ rc = virtio_pci_legacy_probe(vp_dev);
+ /* Also try modern mode if we can't map BAR0 (no IO space). */
+ if (rc == -ENODEV || rc == -ENOMEM)
+ rc = virtio_pci_modern_probe(vp_dev);
+ if (rc)
+ goto err_probe;
+ } else {
+ rc = virtio_pci_modern_probe(vp_dev);
+ if (rc == -ENODEV)
+ rc = virtio_pci_legacy_probe(vp_dev);
+ if (rc)
+ goto err_probe;
+ }
+
+ pci_set_master(pci_dev);
+
+ rc = register_virtio_device(&vp_dev->vdev);
+ reg_dev = vp_dev;
+ if (rc)
+ goto err_register;
+
+ return 0;
+
+err_register:
+ if (vp_dev->is_legacy)
+ virtio_pci_legacy_remove(vp_dev);
+ else
+ virtio_pci_modern_remove(vp_dev);
+err_probe:
+ pci_disable_device(pci_dev);
+err_enable_device:
+ if (reg_dev)
+ put_device(&vp_dev->vdev.dev);
+ else
+ kfree(vp_dev);
+ return rc;
+}
+
+static void virtio_pci_remove(struct pci_dev *pci_dev)
+{
+ struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
+ struct device *dev = get_device(&vp_dev->vdev.dev);
+
+ /*
+ * Device is marked broken on surprise removal so that virtio upper
+ * layers can abort any ongoing operation.
+ */
+ if (!pci_device_is_present(pci_dev))
+ virtio_break_device(&vp_dev->vdev);
+
+ pci_disable_sriov(pci_dev);
+
+ unregister_virtio_device(&vp_dev->vdev);
+
+ if (vp_dev->is_legacy)
+ virtio_pci_legacy_remove(vp_dev);
+ else
+ virtio_pci_modern_remove(vp_dev);
+
+ pci_disable_device(pci_dev);
+ put_device(dev);
+}
+
+static int virtio_pci_sriov_configure(struct pci_dev *pci_dev, int num_vfs)
+{
+ struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
+ struct virtio_device *vdev = &vp_dev->vdev;
+ int ret;
+
+ if (!(vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK))
+ return -EBUSY;
+
+ if (!__virtio_test_bit(vdev, VIRTIO_F_SR_IOV))
+ return -EINVAL;
+
+ if (pci_vfs_assigned(pci_dev))
+ return -EPERM;
+
+ if (num_vfs == 0) {
+ pci_disable_sriov(pci_dev);
+ return 0;
+ }
+
+ ret = pci_enable_sriov(pci_dev, num_vfs);
+ if (ret < 0)
+ return ret;
+
+ return num_vfs;
+}
+
+static struct pci_driver virtio_pci_driver = {
+ .name = "virtio-pci",
+ .id_table = virtio_pci_id_table,
+ .probe = virtio_pci_probe,
+ .remove = virtio_pci_remove,
+#ifdef CONFIG_PM_SLEEP
+ .driver.pm = &virtio_pci_pm_ops,
+#endif
+ .sriov_configure = virtio_pci_sriov_configure,
+};
+
+module_pci_driver(virtio_pci_driver);
+
+MODULE_AUTHOR("Anthony Liguori <aliguori@us.ibm.com>");
+MODULE_DESCRIPTION("virtio-pci");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1");
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
new file mode 100644
index 000000000..4b773bd7c
--- /dev/null
+++ b/drivers/virtio/virtio_pci_common.h
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _DRIVERS_VIRTIO_VIRTIO_PCI_COMMON_H
+#define _DRIVERS_VIRTIO_VIRTIO_PCI_COMMON_H
+/*
+ * Virtio PCI driver - APIs for common functionality for all device versions
+ *
+ * This module allows virtio devices to be used over a virtual PCI device.
+ * This can be used with QEMU based VMMs like KVM or Xen.
+ *
+ * Copyright IBM Corp. 2007
+ * Copyright Red Hat, Inc. 2014
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Rusty Russell <rusty@rustcorp.com.au>
+ * Michael S. Tsirkin <mst@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_pci.h>
+#include <linux/virtio_pci_legacy.h>
+#include <linux/virtio_pci_modern.h>
+#include <linux/highmem.h>
+#include <linux/spinlock.h>
+
+struct virtio_pci_vq_info {
+ /* the actual virtqueue */
+ struct virtqueue *vq;
+
+ /* the list node for the virtqueues list */
+ struct list_head node;
+
+ /* MSI-X vector (or none) */
+ unsigned int msix_vector;
+};
+
+/* Our device structure */
+struct virtio_pci_device {
+ struct virtio_device vdev;
+ struct pci_dev *pci_dev;
+ union {
+ struct virtio_pci_legacy_device ldev;
+ struct virtio_pci_modern_device mdev;
+ };
+ bool is_legacy;
+
+ /* Where to read and clear interrupt */
+ u8 __iomem *isr;
+
+ /* a list of queues so we can dispatch IRQs */
+ spinlock_t lock;
+ struct list_head virtqueues;
+
+ /* array of all queues for house-keeping */
+ struct virtio_pci_vq_info **vqs;
+
+ /* MSI-X support */
+ int msix_enabled;
+ int intx_enabled;
+ cpumask_var_t *msix_affinity_masks;
+ /* Name strings for interrupts. This size should be enough,
+ * and I'm too lazy to allocate each name separately. */
+ char (*msix_names)[256];
+ /* Number of available vectors */
+ unsigned int msix_vectors;
+ /* Vectors allocated, excluding per-vq vectors if any */
+ unsigned int msix_used_vectors;
+
+ /* Whether we have vector per vq */
+ bool per_vq_vectors;
+
+ struct virtqueue *(*setup_vq)(struct virtio_pci_device *vp_dev,
+ struct virtio_pci_vq_info *info,
+ unsigned int idx,
+ void (*callback)(struct virtqueue *vq),
+ const char *name,
+ bool ctx,
+ u16 msix_vec);
+ void (*del_vq)(struct virtio_pci_vq_info *info);
+
+ u16 (*config_vector)(struct virtio_pci_device *vp_dev, u16 vector);
+};
+
+/* Constants for MSI-X */
+/* Use first vector for configuration changes, second and the rest for
+ * virtqueues. Thus, we need at least 2 vectors for MSI-X. */
+enum {
+ VP_MSIX_CONFIG_VECTOR = 0,
+ VP_MSIX_VQ_VECTOR = 1,
+};
+
+/* Convert a generic virtio device to our structure */
+static struct virtio_pci_device *to_vp_device(struct virtio_device *vdev)
+{
+ return container_of(vdev, struct virtio_pci_device, vdev);
+}
+
+/* wait for pending irq handlers */
+void vp_synchronize_vectors(struct virtio_device *vdev);
+/* the notify function used when creating a virt queue */
+bool vp_notify(struct virtqueue *vq);
+/* the config->del_vqs() implementation */
+void vp_del_vqs(struct virtio_device *vdev);
+/* the config->find_vqs() implementation */
+int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
+ struct virtqueue *vqs[], vq_callback_t *callbacks[],
+ const char * const names[], const bool *ctx,
+ struct irq_affinity *desc);
+const char *vp_bus_name(struct virtio_device *vdev);
+
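+/* Setup the affinity for a virtqueue (-EINVAL if the vq has no callback):
+ * - MSI-X: copy cpu_mask to the vector's mask and use it as affinity hint
+ *   (NULL cpu_mask: NULL hint); a shared vector keeps only the last request
+ * - ignore the affinity request if we're using INTX
+ */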
+int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask);
+
+const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index);
+
+#if IS_ENABLED(CONFIG_VIRTIO_PCI_LEGACY)
+int virtio_pci_legacy_probe(struct virtio_pci_device *);
+void virtio_pci_legacy_remove(struct virtio_pci_device *);
+#else
+static inline int virtio_pci_legacy_probe(struct virtio_pci_device *vp_dev)
+{
+ return -ENODEV;
+}
+static inline void virtio_pci_legacy_remove(struct virtio_pci_device *vp_dev)
+{
+}
+#endif
+int virtio_pci_modern_probe(struct virtio_pci_device *);
+void virtio_pci_modern_remove(struct virtio_pci_device *);
+
+#endif
diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c
new file mode 100644
index 000000000..d9cbb02b3
--- /dev/null
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Virtio PCI driver - legacy device support
+ *
+ * This module allows virtio devices to be used over a virtual PCI device.
+ * This can be used with QEMU based VMMs like KVM or Xen.
+ *
+ * Copyright IBM Corp. 2007
+ * Copyright Red Hat, Inc. 2014
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Rusty Russell <rusty@rustcorp.com.au>
+ * Michael S. Tsirkin <mst@redhat.com>
+ */
+
+#include "linux/virtio_pci_legacy.h"
+#include "virtio_pci_common.h"
+
+/* virtio config->get_features() implementation */
+static u64 vp_get_features(struct virtio_device *vdev)
+{
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+
+ /* When someone needs more than 32 feature bits, we'll need to
+ * steal a bit to indicate that the rest are somewhere else. */
+ return vp_legacy_get_features(&vp_dev->ldev);
+}
+
+/* virtio config->finalize_features() implementation */
+static int vp_finalize_features(struct virtio_device *vdev)
+{
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+
+ /* Give virtio_ring a chance to accept features. */
+ vring_transport_features(vdev);
+
+ /* Make sure we don't have any features > 32 bits! */
+ BUG_ON((u32)vdev->features != vdev->features);
+
+ /* We only support 32 feature bits. */
+ vp_legacy_set_features(&vp_dev->ldev, vdev->features);
+
+ return 0;
+}
+
+/* virtio config->get() implementation */
+static void vp_get(struct virtio_device *vdev, unsigned int offset,
+ void *buf, unsigned int len)
+{
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+ void __iomem *ioaddr = vp_dev->ldev.ioaddr +
+ VIRTIO_PCI_CONFIG_OFF(vp_dev->msix_enabled) +
+ offset;
+ u8 *ptr = buf;
+ int i;
+
+ for (i = 0; i < len; i++)
+ ptr[i] = ioread8(ioaddr + i);
+}
+
+/* the config->set() implementation. it's symmetric to the config->get()
+ * implementation */
+static void vp_set(struct virtio_device *vdev, unsigned int offset,
+ const void *buf, unsigned int len)
+{
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+ void __iomem *ioaddr = vp_dev->ldev.ioaddr +
+ VIRTIO_PCI_CONFIG_OFF(vp_dev->msix_enabled) +
+ offset;
+ const u8 *ptr = buf;
+ int i;
+
+ for (i = 0; i < len; i++)
+ iowrite8(ptr[i], ioaddr + i);
+}
+
+/* config->{get,set}_status() implementations */
+static u8 vp_get_status(struct virtio_device *vdev)
+{
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+ return vp_legacy_get_status(&vp_dev->ldev);
+}
+
+static void vp_set_status(struct virtio_device *vdev, u8 status)
+{
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+ /* We should never be setting status to 0. */
+ BUG_ON(status == 0);
+ vp_legacy_set_status(&vp_dev->ldev, status);
+}
+
+static void vp_reset(struct virtio_device *vdev)
+{
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+ /* 0 status means a reset. */
+ vp_legacy_set_status(&vp_dev->ldev, 0);
+ /* Flush out the status write, and flush in device writes,
+ * including MSI-X interrupts, if any. */
+ vp_legacy_get_status(&vp_dev->ldev);
+ /* Flush pending VQ/configuration callbacks. */
+ vp_synchronize_vectors(vdev);
+}
+
+static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector)
+{
+ return vp_legacy_config_vector(&vp_dev->ldev, vector);
+}
+
+static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
+ struct virtio_pci_vq_info *info,
+ unsigned int index,
+ void (*callback)(struct virtqueue *vq),
+ const char *name,
+ bool ctx,
+ u16 msix_vec)
+{
+ struct virtqueue *vq;
+ u16 num;
+ int err;
+ u64 q_pfn;
+
+ /* Check if queue is either not available or already active. */
+ num = vp_legacy_get_queue_size(&vp_dev->ldev, index);
+ if (!num || vp_legacy_get_queue_enable(&vp_dev->ldev, index))
+ return ERR_PTR(-ENOENT);
+
+ info->msix_vector = msix_vec;
+
+ /* create the vring */
+ vq = vring_create_virtqueue(index, num,
+ VIRTIO_PCI_VRING_ALIGN, &vp_dev->vdev,
+ true, false, ctx,
+ vp_notify, callback, name);
+ if (!vq)
+ return ERR_PTR(-ENOMEM);
+
+ vq->num_max = num;
+
+ q_pfn = virtqueue_get_desc_addr(vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT;
+ if (q_pfn >> 32) {
+ dev_err(&vp_dev->pci_dev->dev,
+ "platform bug: legacy virtio-pci must not be used with RAM above 0x%llxGB\n",
+ 0x1ULL << (32 + PAGE_SHIFT - 30));
+ err = -E2BIG;
+ goto out_del_vq;
+ }
+
+ /* activate the queue */
+ vp_legacy_set_queue_address(&vp_dev->ldev, index, q_pfn);
+
+ vq->priv = (void __force *)vp_dev->ldev.ioaddr + VIRTIO_PCI_QUEUE_NOTIFY;
+
+ if (msix_vec != VIRTIO_MSI_NO_VECTOR) {
+ msix_vec = vp_legacy_queue_vector(&vp_dev->ldev, index, msix_vec);
+ if (msix_vec == VIRTIO_MSI_NO_VECTOR) {
+ err = -EBUSY;
+ goto out_deactivate;
+ }
+ }
+
+ return vq;
+
+out_deactivate:
+ vp_legacy_set_queue_address(&vp_dev->ldev, index, 0);
+out_del_vq:
+ vring_del_virtqueue(vq);
+ return ERR_PTR(err);
+}
+
+static void del_vq(struct virtio_pci_vq_info *info)
+{
+ struct virtqueue *vq = info->vq;
+ struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
+
+ if (vp_dev->msix_enabled) {
+ vp_legacy_queue_vector(&vp_dev->ldev, vq->index,
+ VIRTIO_MSI_NO_VECTOR);
+ /* Flush the write out to device */
+ ioread8(vp_dev->ldev.ioaddr + VIRTIO_PCI_ISR);
+ }
+
+ /* Select and deactivate the queue */
+ vp_legacy_set_queue_address(&vp_dev->ldev, vq->index, 0);
+
+ vring_del_virtqueue(vq);
+}
+
+static const struct virtio_config_ops virtio_pci_config_ops = {
+ .get = vp_get,
+ .set = vp_set,
+ .get_status = vp_get_status,
+ .set_status = vp_set_status,
+ .reset = vp_reset,
+ .find_vqs = vp_find_vqs,
+ .del_vqs = vp_del_vqs,
+ .synchronize_cbs = vp_synchronize_vectors,
+ .get_features = vp_get_features,
+ .finalize_features = vp_finalize_features,
+ .bus_name = vp_bus_name,
+ .set_vq_affinity = vp_set_vq_affinity,
+ .get_vq_affinity = vp_get_vq_affinity,
+};
+
+/* the PCI probing function */
+int virtio_pci_legacy_probe(struct virtio_pci_device *vp_dev)
+{
+ struct virtio_pci_legacy_device *ldev = &vp_dev->ldev;
+ struct pci_dev *pci_dev = vp_dev->pci_dev;
+ int rc;
+
+ ldev->pci_dev = pci_dev;
+
+ rc = vp_legacy_probe(ldev);
+ if (rc)
+ return rc;
+
+ vp_dev->isr = ldev->isr;
+ vp_dev->vdev.id = ldev->id;
+
+ vp_dev->vdev.config = &virtio_pci_config_ops;
+
+ vp_dev->config_vector = vp_config_vector;
+ vp_dev->setup_vq = setup_vq;
+ vp_dev->del_vq = del_vq;
+ vp_dev->is_legacy = true;
+
+ return 0;
+}
+
+void virtio_pci_legacy_remove(struct virtio_pci_device *vp_dev)
+{
+ struct virtio_pci_legacy_device *ldev = &vp_dev->ldev;
+
+ vp_legacy_remove(ldev);
+}
diff --git a/drivers/virtio/virtio_pci_legacy_dev.c b/drivers/virtio/virtio_pci_legacy_dev.c
new file mode 100644
index 000000000..677d1f68b
--- /dev/null
+++ b/drivers/virtio/virtio_pci_legacy_dev.c
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "linux/virtio_pci.h"
+#include <linux/virtio_pci_legacy.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+
+/*
+ * vp_legacy_probe: probe the legacy virtio pci device, note that the
+ * caller is required to enable PCI device before calling this function.
+ * @ldev: the legacy virtio-pci device
+ *
+ * Returns 0 on success, or a negative error code on failure
+ */
+int vp_legacy_probe(struct virtio_pci_legacy_device *ldev)
+{
+ struct pci_dev *pci_dev = ldev->pci_dev;
+ int rc;
+
+ /* We only own devices >= 0x1000 and <= 0x103f: leave the rest. */
+ if (pci_dev->device < 0x1000 || pci_dev->device > 0x103f)
+ return -ENODEV;
+
+ if (pci_dev->revision != VIRTIO_PCI_ABI_VERSION)
+ return -ENODEV;
+
+ rc = dma_set_mask(&pci_dev->dev, DMA_BIT_MASK(64));
+ if (rc) {
+ rc = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(32));
+ } else {
+ /*
+ * The virtio ring base address is expressed as a 32-bit PFN,
+ * with a page size of 1 << VIRTIO_PCI_QUEUE_ADDR_SHIFT.
+ */
+ dma_set_coherent_mask(&pci_dev->dev,
+ DMA_BIT_MASK(32 + VIRTIO_PCI_QUEUE_ADDR_SHIFT));
+ }
+
+ if (rc)
+ dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.\n");
+
+ rc = pci_request_region(pci_dev, 0, "virtio-pci-legacy");
+ if (rc)
+ return rc;
+
+ ldev->ioaddr = pci_iomap(pci_dev, 0, 0);
+ if (!ldev->ioaddr) {
+ rc = -EIO;
+ goto err_iomap;
+ }
+
+ ldev->isr = ldev->ioaddr + VIRTIO_PCI_ISR;
+
+ ldev->id.vendor = pci_dev->subsystem_vendor;
+ ldev->id.device = pci_dev->subsystem_device;
+
+ return 0;
+err_iomap:
+ pci_release_region(pci_dev, 0);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(vp_legacy_probe);
+
+/*
+ * vp_legacy_remove: remove and cleanup the legacy virtio pci device
+ * @ldev: the legacy virtio-pci device
+ */
+void vp_legacy_remove(struct virtio_pci_legacy_device *ldev)
+{
+ struct pci_dev *pci_dev = ldev->pci_dev;
+
+ pci_iounmap(pci_dev, ldev->ioaddr);
+ pci_release_region(pci_dev, 0);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_remove);
+
+/*
+ * vp_legacy_get_features - get features from device
+ * @ldev: the legacy virtio-pci device
+ *
+ * Returns the features read from the device
+ */
+u64 vp_legacy_get_features(struct virtio_pci_legacy_device *ldev)
+{
+
+ return ioread32(ldev->ioaddr + VIRTIO_PCI_HOST_FEATURES);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_get_features);
+
+/*
+ * vp_legacy_get_driver_features - get driver features from device
+ * @ldev: the legacy virtio-pci device
+ *
+ * Returns the driver features read from the device
+ */
+u64 vp_legacy_get_driver_features(struct virtio_pci_legacy_device *ldev)
+{
+ return ioread32(ldev->ioaddr + VIRTIO_PCI_GUEST_FEATURES);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_get_driver_features);
+
+/*
+ * vp_legacy_set_features - set features to device
+ * @ldev: the legacy virtio-pci device
+ * @features: the features set to device
+ */
+void vp_legacy_set_features(struct virtio_pci_legacy_device *ldev,
+ u32 features)
+{
+ iowrite32(features, ldev->ioaddr + VIRTIO_PCI_GUEST_FEATURES);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_set_features);
+
+/*
+ * vp_legacy_get_status - get the device status
+ * @ldev: the legacy virtio-pci device
+ *
+ * Returns the status read from device
+ */
+u8 vp_legacy_get_status(struct virtio_pci_legacy_device *ldev)
+{
+ return ioread8(ldev->ioaddr + VIRTIO_PCI_STATUS);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_get_status);
+
+/*
+ * vp_legacy_set_status - set status to device
+ * @ldev: the legacy virtio-pci device
+ * @status: the status set to device
+ */
+void vp_legacy_set_status(struct virtio_pci_legacy_device *ldev,
+ u8 status)
+{
+ iowrite8(status, ldev->ioaddr + VIRTIO_PCI_STATUS);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_set_status);
+
+/*
+ * vp_legacy_queue_vector - set the MSIX vector for a specific virtqueue
+ * @ldev: the legacy virtio-pci device
+ * @index: queue index
+ * @vector: the MSI-X vector to assign to the queue
+ *
+ * Returns the queue vector read back from the device
+ */
+u16 vp_legacy_queue_vector(struct virtio_pci_legacy_device *ldev,
+ u16 index, u16 vector)
+{
+ iowrite16(index, ldev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
+ iowrite16(vector, ldev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
+ /* Flush the write out to device */
+ return ioread16(ldev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_queue_vector);
+
+/*
+ * vp_legacy_config_vector - set the vector for config interrupt
+ * @ldev: the legacy virtio-pci device
+ * @vector: the config vector
+ *
+ * Returns the config vector read from the device
+ */
+u16 vp_legacy_config_vector(struct virtio_pci_legacy_device *ldev,
+ u16 vector)
+{
+ /* Setup the vector used for configuration events */
+ iowrite16(vector, ldev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
+ /* Verify we had enough resources to assign the vector */
+ /* Will also flush the write out to device */
+ return ioread16(ldev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_config_vector);
+
+/*
+ * vp_legacy_set_queue_address - set the virtqueue address
+ * @ldev: the legacy virtio-pci device
+ * @index: the queue index
+ * @queue_pfn: pfn of the virtqueue
+ */
+void vp_legacy_set_queue_address(struct virtio_pci_legacy_device *ldev,
+ u16 index, u32 queue_pfn)
+{
+ iowrite16(index, ldev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
+ iowrite32(queue_pfn, ldev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_set_queue_address);
+
+/*
+ * vp_legacy_get_queue_enable - check whether a virtqueue is enabled
+ * @ldev: the legacy virtio-pci device
+ * @index: the queue index
+ *
+ * Returns whether a virtqueue is enabled or not
+ */
+bool vp_legacy_get_queue_enable(struct virtio_pci_legacy_device *ldev,
+ u16 index)
+{
+ iowrite16(index, ldev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
+ return ioread32(ldev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_get_queue_enable);
+
+/*
+ * vp_legacy_get_queue_size - get size for a virtqueue
+ * @ldev: the legacy virtio-pci device
+ * @index: the queue index
+ *
+ * Returns the size of the virtqueue
+ */
+u16 vp_legacy_get_queue_size(struct virtio_pci_legacy_device *ldev,
+ u16 index)
+{
+ iowrite16(index, ldev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
+ return ioread16(ldev->ioaddr + VIRTIO_PCI_QUEUE_NUM);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_get_queue_size);
+
+MODULE_VERSION("0.1");
+MODULE_DESCRIPTION("Legacy Virtio PCI Device");
+MODULE_AUTHOR("Wu Zongyong <wuzongyong@linux.alibaba.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
new file mode 100644
index 000000000..d6bb68ba8
--- /dev/null
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -0,0 +1,566 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Virtio PCI driver - modern (virtio 1.0) device support
+ *
+ * This module allows virtio devices to be used over a virtual PCI device.
+ * This can be used with QEMU based VMMs like KVM or Xen.
+ *
+ * Copyright IBM Corp. 2007
+ * Copyright Red Hat, Inc. 2014
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Rusty Russell <rusty@rustcorp.com.au>
+ * Michael S. Tsirkin <mst@redhat.com>
+ */
+
+#include <linux/delay.h>
+#define VIRTIO_PCI_NO_LEGACY
+#define VIRTIO_RING_NO_LEGACY
+#include "virtio_pci_common.h"
+
+/* virtio config->get_features() implementation */
+static u64 vp_get_features(struct virtio_device *vdev)
+{
+	struct virtio_pci_modern_device *mdev = &to_vp_device(vdev)->mdev;
+
+	return vp_modern_get_features(mdev);
+}
+
+/*
+ * Accept the transport features handled by virtio_pci itself.
+ *
+ * SR_IOV is set on @vdev only when it is present in @features and the PCI
+ * function exposes the SR-IOV extended capability; RING_RESET is set
+ * whenever it is present in @features.
+ */
+static void vp_transport_features(struct virtio_device *vdev, u64 features)
+{
+	struct pci_dev *pci_dev = to_vp_device(vdev)->pci_dev;
+
+	if (features & BIT_ULL(VIRTIO_F_SR_IOV)) {
+		if (pci_find_ext_capability(pci_dev, PCI_EXT_CAP_ID_SRIOV))
+			__virtio_set_bit(vdev, VIRTIO_F_SR_IOV);
+	}
+
+	if (features & BIT_ULL(VIRTIO_F_RING_RESET))
+		__virtio_set_bit(vdev, VIRTIO_F_RING_RESET);
+}
+
+/*
+ * virtio config->finalize_features() implementation
+ *
+ * Lets virtio_ring and virtio_pci filter the transport feature bits, then
+ * writes the resulting feature set to the device. Fails with -EINVAL when
+ * VIRTIO_F_VERSION_1 is not set afterwards.
+ */
+static int vp_finalize_features(struct virtio_device *vdev)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	/* Snapshot taken before vring_transport_features() runs on vdev. */
+	const u64 snapshot = vdev->features;
+
+	/* virtio_ring goes first, then virtio_pci, each accepting its bits. */
+	vring_transport_features(vdev);
+	vp_transport_features(vdev, snapshot);
+
+	if (__virtio_test_bit(vdev, VIRTIO_F_VERSION_1)) {
+		vp_modern_set_features(&vp_dev->mdev, vdev->features);
+		return 0;
+	}
+
+	dev_err(&vdev->dev, "virtio: device uses modern interface "
+		"but does not have VIRTIO_F_VERSION_1\n");
+	return -EINVAL;
+}
+
+/*
+ * virtio config->get() implementation
+ *
+ * Copies @len bytes of the device-specific configuration region, starting
+ * at @offset, into @buf. Only accesses of 1, 2, 4 or 8 bytes are supported;
+ * an 8-byte access is performed as two 32-bit reads, lower address first.
+ * Multi-byte values are stored in @buf in little-endian byte order.
+ *
+ * BUGs when the access does not fit in the mapped region or when @len is
+ * not one of the supported widths.
+ */
+static void vp_get(struct virtio_device *vdev, unsigned int offset,
+		   void *buf, unsigned int len)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
+	void __iomem *device = mdev->device;
+	u8 b;
+	__le16 w;
+	__le32 l;
+
+	/*
+	 * Wrap-safe range check: "offset + len" is computed in unsigned int
+	 * and could wrap around for a huge offset, slipping past the check.
+	 */
+	BUG_ON(len > mdev->device_len || offset > mdev->device_len - len);
+
+	switch (len) {
+	case 1:
+		b = ioread8(device + offset);
+		memcpy(buf, &b, sizeof b);
+		break;
+	case 2:
+		w = cpu_to_le16(ioread16(device + offset));
+		memcpy(buf, &w, sizeof w);
+		break;
+	case 4:
+		l = cpu_to_le32(ioread32(device + offset));
+		memcpy(buf, &l, sizeof l);
+		break;
+	case 8:
+		/* No 64-bit accessor is used: read the two halves in turn. */
+		l = cpu_to_le32(ioread32(device + offset));
+		memcpy(buf, &l, sizeof l);
+		l = cpu_to_le32(ioread32(device + offset + sizeof l));
+		memcpy(buf + sizeof l, &l, sizeof l);
+		break;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * virtio config->set() implementation, symmetric to config->get()
+ *
+ * Writes @len bytes from @buf to the device-specific configuration region,
+ * starting at @offset. Only accesses of 1, 2, 4 or 8 bytes are supported;
+ * an 8-byte access is performed as two 32-bit writes, lower address first.
+ * Multi-byte values in @buf are interpreted as little-endian.
+ *
+ * BUGs when the access does not fit in the mapped region or when @len is
+ * not one of the supported widths.
+ */
+static void vp_set(struct virtio_device *vdev, unsigned int offset,
+		   const void *buf, unsigned int len)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
+	void __iomem *device = mdev->device;
+	u8 b;
+	__le16 w;
+	__le32 l;
+
+	/*
+	 * Wrap-safe range check: "offset + len" is computed in unsigned int
+	 * and could wrap around for a huge offset, slipping past the check.
+	 */
+	BUG_ON(len > mdev->device_len || offset > mdev->device_len - len);
+
+	switch (len) {
+	case 1:
+		memcpy(&b, buf, sizeof b);
+		iowrite8(b, device + offset);
+		break;
+	case 2:
+		memcpy(&w, buf, sizeof w);
+		iowrite16(le16_to_cpu(w), device + offset);
+		break;
+	case 4:
+		memcpy(&l, buf, sizeof l);
+		iowrite32(le32_to_cpu(l), device + offset);
+		break;
+	case 8:
+		/* No 64-bit accessor is used: write the two halves in turn. */
+		memcpy(&l, buf, sizeof l);
+		iowrite32(le32_to_cpu(l), device + offset);
+		memcpy(&l, buf + sizeof l, sizeof l);
+		iowrite32(le32_to_cpu(l), device + offset + sizeof l);
+		break;
+	default:
+		BUG();
+	}
+}
+
+/* virtio config->generation() implementation */
+static u32 vp_generation(struct virtio_device *vdev)
+{
+	struct virtio_pci_modern_device *mdev = &to_vp_device(vdev)->mdev;
+
+	return vp_modern_generation(mdev);
+}
+
+/* virtio config->get_status() implementation */
+static u8 vp_get_status(struct virtio_device *vdev)
+{
+	struct virtio_pci_modern_device *mdev = &to_vp_device(vdev)->mdev;
+
+	return vp_modern_get_status(mdev);
+}
+
+/* virtio config->set_status() implementation; @status must be non-zero */
+static void vp_set_status(struct virtio_device *vdev, u8 status)
+{
+	struct virtio_pci_modern_device *mdev = &to_vp_device(vdev)->mdev;
+
+	/* Status must never be set to 0 through this path. */
+	BUG_ON(!status);
+	vp_modern_set_status(mdev, status);
+}
+
+/*
+ * virtio config->reset() implementation
+ *
+ * Writes 0 to the device status, polls until the status reads back as 0,
+ * then synchronizes with any interrupt handlers still running.
+ */
+static void vp_reset(struct virtio_device *vdev)
+{
+	struct virtio_pci_modern_device *mdev = &to_vp_device(vdev)->mdev;
+
+	/* A status of 0 means reset. */
+	vp_modern_set_status(mdev, 0);
+
+	/*
+	 * After writing 0 to device_status, the driver MUST wait for a read
+	 * of device_status to return 0 before reinitializing the device.
+	 * The read also flushes out the status write and flushes in device
+	 * writes, including MSI-X interrupts, if any.
+	 */
+	for (;;) {
+		if (!vp_modern_get_status(mdev))
+			break;
+		msleep(1);
+	}
+
+	/* Flush pending VQ/configuration callbacks. */
+	vp_synchronize_vectors(vdev);
+}
+
+/*
+ * Program the ring size, the ring addresses and, unless @msix_vec is
+ * VIRTIO_MSI_NO_VECTOR, the MSI-X vector of @vq into the device. The queue
+ * enable register is not touched here.
+ *
+ * Returns 0 on success, -EBUSY when the device reads back
+ * VIRTIO_MSI_NO_VECTOR after the vector was written.
+ */
+static int vp_active_vq(struct virtqueue *vq, u16 msix_vec)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
+	struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
+	const unsigned long index = vq->index;
+	u16 readback;
+
+	/* activate the queue */
+	vp_modern_set_queue_size(mdev, index, virtqueue_get_vring_size(vq));
+	vp_modern_queue_address(mdev, index, virtqueue_get_desc_addr(vq),
+				virtqueue_get_avail_addr(vq),
+				virtqueue_get_used_addr(vq));
+
+	if (msix_vec == VIRTIO_MSI_NO_VECTOR)
+		return 0;
+
+	readback = vp_modern_queue_vector(mdev, index, msix_vec);
+
+	return readback == VIRTIO_MSI_NO_VECTOR ? -EBUSY : 0;
+}
+
+/*
+ * virtio config->disable_vq_and_reset() implementation
+ *
+ * Resets a single virtqueue in the device, unlinks it from the device's
+ * virtqueue list and marks it as reset.
+ *
+ * Returns 0 on success, -ENOENT when VIRTIO_F_RING_RESET is not negotiated.
+ */
+static int vp_modern_disable_vq_and_reset(struct virtqueue *vq)
+{
+	struct virtio_device *vdev = vq->vdev;
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	struct virtio_pci_vq_info *info;
+	unsigned long flags;
+
+	if (!virtio_has_feature(vdev, VIRTIO_F_RING_RESET))
+		return -ENOENT;
+
+	vp_modern_set_queue_reset(&vp_dev->mdev, vq->index);
+
+	info = vp_dev->vqs[vq->index];
+
+	/* delete vq from irq handler */
+	spin_lock_irqsave(&vp_dev->lock, flags);
+	list_del(&info->node);
+	spin_unlock_irqrestore(&vp_dev->lock, flags);
+
+	/* Leave the node as an empty list head after the removal. */
+	INIT_LIST_HEAD(&info->node);
+
+#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
+	__virtqueue_break(vq);
+#endif
+
+	/*
+	 * When the vq has an exclusive irq, wait for a running handler to
+	 * complete with synchronize_irq().
+	 *
+	 * note: disable_irq() can't be used here since it conflicts with the
+	 * affinity managed IRQ that is used by some drivers.
+	 */
+	if (vp_dev->per_vq_vectors &&
+	    info->msix_vector != VIRTIO_MSI_NO_VECTOR) {
+		int irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector);
+
+		synchronize_irq(irq);
+	}
+
+	vq->reset = true;
+
+	return 0;
+}
+
+/*
+ * virtio config->enable_vq_after_reset() implementation
+ *
+ * Reprograms a virtqueue that was previously reset, relinks it into the
+ * device's virtqueue list when it has a callback, and enables it.
+ *
+ * Returns 0 on success; -EBUSY when the vq is not marked as reset, when the
+ * device still reports it as being reset or as enabled; or the error
+ * returned by vp_active_vq().
+ */
+static int vp_modern_enable_vq_after_reset(struct virtqueue *vq)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
+	struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
+	struct virtio_pci_vq_info *info;
+	unsigned long index;
+	unsigned long flags;
+	int err;
+
+	if (!vq->reset)
+		return -EBUSY;
+
+	index = vq->index;
+	info = vp_dev->vqs[index];
+
+	/* The device must report the queue as neither in reset nor enabled. */
+	if (vp_modern_get_queue_reset(mdev, index) ||
+	    vp_modern_get_queue_enable(mdev, index))
+		return -EBUSY;
+
+	err = vp_active_vq(vq, info->msix_vector);
+	if (err)
+		return err;
+
+	if (!vq->callback) {
+		INIT_LIST_HEAD(&info->node);
+	} else {
+		spin_lock_irqsave(&vp_dev->lock, flags);
+		list_add(&info->node, &vp_dev->virtqueues);
+		spin_unlock_irqrestore(&vp_dev->lock, flags);
+	}
+
+#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
+	__virtqueue_unbreak(vq);
+#endif
+
+	vp_modern_set_queue_enable(mdev, index, true);
+	vq->reset = false;
+
+	return 0;
+}
+
+/* Set the config interrupt vector; returns what vp_modern_config_vector() reads back. */
+static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector)
+{
+	struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
+
+	return vp_modern_config_vector(mdev, vector);
+}
+
+/*
+ * Notify callback that writes the 32-bit value produced by
+ * vring_notification_data() to the address stored in vq->priv.
+ * Always returns true.
+ */
+static bool vp_notify_with_data(struct virtqueue *vq)
+{
+	const u32 payload = vring_notification_data(vq);
+	void __iomem *notify_addr = (void __iomem *)vq->priv;
+
+	iowrite32(payload, notify_addr);
+
+	return true;
+}
+
+/*
+ * Create and program one virtqueue.
+ *
+ * Returns the new virtqueue, or an ERR_PTR():
+ *   -EINVAL  @index is not below the device's number of queues
+ *   -ENOENT  the queue size reads as 0, or the queue is already enabled
+ *   -ENOMEM  the vring could not be created or the notify area not mapped
+ *   or the error returned by vp_active_vq()
+ */
+static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
+				  struct virtio_pci_vq_info *info,
+				  unsigned int index,
+				  void (*callback)(struct virtqueue *vq),
+				  const char *name,
+				  bool ctx,
+				  u16 msix_vec)
+{
+	struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
+	bool (*notify)(struct virtqueue *vq) = vp_notify;
+	struct virtqueue *vq;
+	u16 num;
+	int err;
+
+	/* With NOTIFICATION_DATA the notify write carries extra data. */
+	if (__virtio_test_bit(&vp_dev->vdev, VIRTIO_F_NOTIFICATION_DATA))
+		notify = vp_notify_with_data;
+
+	if (index >= vp_modern_get_num_queues(mdev))
+		return ERR_PTR(-EINVAL);
+
+	/* Check if queue is either not available or already active. */
+	num = vp_modern_get_queue_size(mdev, index);
+	if (!num)
+		return ERR_PTR(-ENOENT);
+	if (vp_modern_get_queue_enable(mdev, index))
+		return ERR_PTR(-ENOENT);
+
+	info->msix_vector = msix_vec;
+
+	/* create the vring */
+	vq = vring_create_virtqueue(index, num,
+				    SMP_CACHE_BYTES, &vp_dev->vdev,
+				    true, true, ctx,
+				    notify, callback, name);
+	if (!vq)
+		return ERR_PTR(-ENOMEM);
+
+	vq->num_max = num;
+
+	err = vp_active_vq(vq, msix_vec);
+	if (err)
+		goto err_del_vq;
+
+	vq->priv = (void __force *)vp_modern_map_vq_notify(mdev, index, NULL);
+	if (!vq->priv) {
+		err = -ENOMEM;
+		goto err_del_vq;
+	}
+
+	return vq;
+
+err_del_vq:
+	vring_del_virtqueue(vq);
+	return ERR_PTR(err);
+}
+
+/*
+ * virtio config->find_vqs() implementation
+ *
+ * Sets the queues up through vp_find_vqs() and, on success, enables every
+ * virtqueue on the device's list. Returns 0 or the vp_find_vqs() error.
+ */
+static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
+			      struct virtqueue *vqs[],
+			      vq_callback_t *callbacks[],
+			      const char * const names[], const bool *ctx,
+			      struct irq_affinity *desc)
+{
+	struct virtio_pci_modern_device *mdev = &to_vp_device(vdev)->mdev;
+	struct virtqueue *vq;
+	int rc;
+
+	rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, desc);
+	if (rc)
+		return rc;
+
+	/*
+	 * Select and activate all queues. Has to be done last: once we do
+	 * this, there's no way to go back except reset.
+	 */
+	list_for_each_entry(vq, &vdev->vqs, list)
+		vp_modern_set_queue_enable(mdev, vq->index, true);
+
+	return 0;
+}
+
+/*
+ * Tear down one virtqueue: detach its MSI-X vector when MSI-X is enabled,
+ * unmap vq->priv when the device has no notify_base mapping, then delete
+ * the vring.
+ */
+static void del_vq(struct virtio_pci_vq_info *info)
+{
+	struct virtqueue *vq = info->vq;
+	struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
+	struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
+
+	if (vp_dev->msix_enabled) {
+		vp_modern_queue_vector(mdev, vq->index,
+				       VIRTIO_MSI_NO_VECTOR);
+	}
+
+	/*
+	 * NOTE(review): presumably vq->priv is a per-queue mapping created
+	 * in setup_vq() when notify_base is unset - confirm against
+	 * vp_modern_map_vq_notify().
+	 */
+	if (!mdev->notify_base) {
+		void __iomem *notify_addr = (void __force __iomem *)vq->priv;
+
+		pci_iounmap(mdev->pci_dev, notify_addr);
+	}
+
+	vring_del_virtqueue(vq);
+}
+
+/*
+ * Walk the vendor-specific PCI capabilities looking for a shared memory
+ * capability whose id equals @required_id.
+ *
+ * On a match, *@bar, *@offset and *@len are filled from the capability
+ * (offset and length are 64-bit, assembled from their low and high dwords)
+ * and the capability position is returned. Returns 0 when nothing matches.
+ */
+static int virtio_pci_find_shm_cap(struct pci_dev *dev, u8 required_id,
+				   u8 *bar, u64 *offset, u64 *len)
+{
+	int pos;
+
+	for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
+	     pos > 0;
+	     pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) {
+		u8 cfg_type, cap_len, shm_id, res_bar;
+		u64 res_offset, res_length;
+		u32 dword;
+
+		pci_read_config_byte(dev,
+				     pos + offsetof(struct virtio_pci_cap,
+						    cfg_type),
+				     &cfg_type);
+		if (cfg_type != VIRTIO_PCI_CAP_SHARED_MEMORY_CFG)
+			continue;
+
+		/* A shm capability must have the size of the 64-bit layout. */
+		pci_read_config_byte(dev,
+				     pos + offsetof(struct virtio_pci_cap,
+						    cap_len),
+				     &cap_len);
+		if (cap_len != sizeof(struct virtio_pci_cap64)) {
+			dev_err(&dev->dev, "%s: shm cap with bad size offset:"
+				" %d size: %d\n", __func__, pos, cap_len);
+			continue;
+		}
+
+		pci_read_config_byte(dev,
+				     pos + offsetof(struct virtio_pci_cap, id),
+				     &shm_id);
+		if (shm_id != required_id)
+			continue;
+
+		/* Skip capabilities with a reserved BAR value. */
+		pci_read_config_byte(dev,
+				     pos + offsetof(struct virtio_pci_cap, bar),
+				     &res_bar);
+		if (res_bar >= PCI_STD_NUM_BARS)
+			continue;
+
+		/* Type, size, id and BAR all check out: read the region. */
+
+		/* Low 32 bits of offset, then of length. */
+		pci_read_config_dword(dev,
+				      pos + offsetof(struct virtio_pci_cap,
+						     offset),
+				      &dword);
+		res_offset = dword;
+		pci_read_config_dword(dev,
+				      pos + offsetof(struct virtio_pci_cap,
+						     length),
+				      &dword);
+		res_length = dword;
+
+		/* High 32 bits of offset, then of length. */
+		pci_read_config_dword(dev,
+				      pos + offsetof(struct virtio_pci_cap64,
+						     offset_hi),
+				      &dword);
+		res_offset |= ((u64)dword) << 32;
+		pci_read_config_dword(dev,
+				      pos + offsetof(struct virtio_pci_cap64,
+						     length_hi),
+				      &dword);
+		res_length |= ((u64)dword) << 32;
+
+		*bar = res_bar;
+		*offset = res_offset;
+		*len = res_length;
+
+		return pos;
+	}
+
+	return 0;
+}
+
+/*
+ * virtio config->get_shm_region() implementation
+ *
+ * Looks up the shared memory capability with the given @id and, when the
+ * region it describes lies entirely within its BAR, fills @region with the
+ * region's physical address and length.
+ *
+ * Returns true when @region was filled in; false when no matching
+ * capability exists, when offset + length wraps around, or when the region
+ * extends past the end of the BAR.
+ */
+static bool vp_get_shm_region(struct virtio_device *vdev,
+			      struct virtio_shm_region *region, u8 id)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	struct pci_dev *pci_dev = vp_dev->pci_dev;
+	u8 bar;
+	u64 offset, len;
+	phys_addr_t phys_addr;
+	/*
+	 * resource_size_t rather than size_t: on a 32-bit kernel with 64-bit
+	 * resources, size_t would truncate the BAR length and make the range
+	 * check below reject (or mis-judge) large BARs.
+	 */
+	resource_size_t bar_len;
+
+	if (!virtio_pci_find_shm_cap(pci_dev, id, &bar, &offset, &len))
+		return false;
+
+	phys_addr = pci_resource_start(pci_dev, bar);
+	bar_len = pci_resource_len(pci_dev, bar);
+
+	/* offset and len come from the device: guard against wrap-around. */
+	if ((offset + len) < offset) {
+		dev_err(&pci_dev->dev, "%s: cap offset+len overflow detected\n",
+			__func__);
+		return false;
+	}
+
+	if (offset + len > bar_len) {
+		dev_err(&pci_dev->dev, "%s: bar shorter than cap offset+len\n",
+			__func__);
+		return false;
+	}
+
+	region->len = len;
+	region->addr = (u64) phys_addr + offset;
+
+	return true;
+}
+
+/*
+ * Config ops installed when no device-specific configuration region was
+ * mapped: identical to virtio_pci_config_ops except that the config space
+ * accessors are left NULL.
+ */
+static const struct virtio_config_ops virtio_pci_config_nodev_ops = {
+	/* device-specific configuration space */
+	.get			= NULL,
+	.set			= NULL,
+	.generation		= vp_generation,
+
+	/* features */
+	.get_features		= vp_get_features,
+	.finalize_features	= vp_finalize_features,
+
+	/* status and reset */
+	.get_status		= vp_get_status,
+	.set_status		= vp_set_status,
+	.reset			= vp_reset,
+
+	/* virtqueues */
+	.find_vqs		= vp_modern_find_vqs,
+	.del_vqs		= vp_del_vqs,
+	.synchronize_cbs	= vp_synchronize_vectors,
+	.set_vq_affinity	= vp_set_vq_affinity,
+	.get_vq_affinity	= vp_get_vq_affinity,
+	.disable_vq_and_reset	= vp_modern_disable_vq_and_reset,
+	.enable_vq_after_reset	= vp_modern_enable_vq_after_reset,
+
+	/* miscellaneous */
+	.bus_name		= vp_bus_name,
+	.get_shm_region		= vp_get_shm_region,
+};
+
+/*
+ * Config ops installed when a device-specific configuration region was
+ * mapped, so the config space accessors are available.
+ */
+static const struct virtio_config_ops virtio_pci_config_ops = {
+	/* device-specific configuration space */
+	.get			= vp_get,
+	.set			= vp_set,
+	.generation		= vp_generation,
+
+	/* features */
+	.get_features		= vp_get_features,
+	.finalize_features	= vp_finalize_features,
+
+	/* status and reset */
+	.get_status		= vp_get_status,
+	.set_status		= vp_set_status,
+	.reset			= vp_reset,
+
+	/* virtqueues */
+	.find_vqs		= vp_modern_find_vqs,
+	.del_vqs		= vp_del_vqs,
+	.synchronize_cbs	= vp_synchronize_vectors,
+	.set_vq_affinity	= vp_set_vq_affinity,
+	.get_vq_affinity	= vp_get_vq_affinity,
+	.disable_vq_and_reset	= vp_modern_disable_vq_and_reset,
+	.enable_vq_after_reset	= vp_modern_enable_vq_after_reset,
+
+	/* miscellaneous */
+	.bus_name		= vp_bus_name,
+	.get_shm_region		= vp_get_shm_region,
+};
+
+/*
+ * the PCI probing function
+ *
+ * Probes the modern device and, on success, installs the modern callbacks
+ * and config ops on @vp_dev. Returns 0 or the vp_modern_probe() error.
+ */
+int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev)
+{
+	struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
+	int err;
+
+	mdev->pci_dev = vp_dev->pci_dev;
+
+	err = vp_modern_probe(mdev);
+	if (err)
+		return err;
+
+	/* Config space accessors are only offered when a region is mapped. */
+	vp_dev->vdev.config = mdev->device ? &virtio_pci_config_ops :
+					     &virtio_pci_config_nodev_ops;
+
+	vp_dev->config_vector = vp_config_vector;
+	vp_dev->setup_vq = setup_vq;
+	vp_dev->del_vq = del_vq;
+	vp_dev->isr = mdev->isr;
+	vp_dev->vdev.id = mdev->id;
+
+	return 0;
+}
+
+/* Release what virtio_pci_modern_probe() set up via vp_modern_probe(). */
+void virtio_pci_modern_remove(struct virtio_pci_device *vp_dev)
+{
+	vp_modern_remove(&vp_dev->mdev);
+}
diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c
new file mode 100644
index 000000000..9cb601e16
--- /dev/null
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -0,0 +1,720 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/virtio_pci_modern.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+
+/*
+ * vp_modern_map_capability - map a part of virtio pci capability
+ * @mdev: the modern virtio-pci device
+ * @off: offset of the capability in PCI configuration space
+ * @minlen: minimal length that must remain in the capability after @start
+ * @align: alignment required of the mapped offset (tested as align - 1 mask)
+ * @start: offset into the capability at which the mapping starts
+ * @size: upper bound on the number of bytes to map
+ * @len: if non-NULL, set to the (clamped) length requested for mapping
+ * @pa: if non-NULL, set on success to the physical address of the mapping
+ *
+ * Returns the io address of the mapped part of the capability, NULL on error
+ */
+static void __iomem *
+vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off,
+ size_t minlen, u32 align, u32 start, u32 size,
+ size_t *len, resource_size_t *pa)
+{
+ struct pci_dev *dev = mdev->pci_dev;
+ u8 bar;
+ u32 offset, length;
+ void __iomem *p;
+
+ pci_read_config_byte(dev, off + offsetof(struct virtio_pci_cap,
+ bar),
+ &bar);
+ pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, offset),
+ &offset);
+ pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, length),
+ &length);
+
+ /* Check if the BAR may have changed since we requested the region. */
+ if (bar >= PCI_STD_NUM_BARS || !(mdev->modern_bars & (1 << bar))) {
+ dev_err(&dev->dev,
+ "virtio_pci: bar unexpectedly changed to %u\n", bar);
+ return NULL;
+ }
+
+ if (length <= start) {
+ dev_err(&dev->dev,
+ "virtio_pci: bad capability len %u (>%u expected)\n",
+ length, start);
+ return NULL;
+ }
+
+ if (length - start < minlen) {
+ dev_err(&dev->dev,
+ "virtio_pci: bad capability len %u (>=%zu expected)\n",
+ length, minlen);
+ return NULL;
+ }
+
+ length -= start;
+
+ if (start + offset < offset) {
+ dev_err(&dev->dev,
+ "virtio_pci: map wrap-around %u+%u\n",
+ start, offset);
+ return NULL;
+ }
+
+ offset += start;
+
+ if (offset & (align - 1)) {
+ dev_err(&dev->dev,
+ "virtio_pci: offset %u not aligned to %u\n",
+ offset, align);
+ return NULL;
+ }
+
+ if (length > size)
+ length = size;
+
+ if (len)
+ *len = length;
+
+ if (minlen + offset < minlen ||
+ minlen + offset > pci_resource_len(dev, bar)) {
+ dev_err(&dev->dev,
+ "virtio_pci: map virtio %zu@%u "
+ "out of range on bar %i length %lu\n",
+ minlen, offset,
+ bar, (unsigned long)pci_resource_len(dev, bar));
+ return NULL;
+ }
+
+ p = pci_iomap_range(dev, bar, offset, length);
+ if (!p)
+ dev_err(&dev->dev,
+ "virtio_pci: unable to map virtio %u@%u on bar %i\n",
+ length, offset, bar);
+ else if (pa)
+ *pa = pci_resource_start(dev, bar) + offset;
+
+ return p;
+}
+
+/**
+ * virtio_pci_find_capability - walk capabilities to find device info.
+ * @dev: the pci device
+ * @cfg_type: the VIRTIO_PCI_CAP_* value we seek
+ * @ioresource_types: IORESOURCE_MEM and/or IORESOURCE_IO.
+ * @bars: bitmask of BARs; the bit of the matching capability's BAR is set
+ *
+ * A capability matches when its type equals @cfg_type and its BAR is not
+ * reserved, has a non-zero length and carries one of @ioresource_types.
+ *
+ * Returns offset of the capability, or 0.
+ */
+static inline int virtio_pci_find_capability(struct pci_dev *dev, u8 cfg_type,
+					     u32 ioresource_types, int *bars)
+{
+	int pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
+
+	for (; pos > 0;
+	     pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) {
+		u8 type, bar;
+
+		pci_read_config_byte(dev,
+				     pos + offsetof(struct virtio_pci_cap,
+						    cfg_type),
+				     &type);
+		pci_read_config_byte(dev,
+				     pos + offsetof(struct virtio_pci_cap, bar),
+				     &bar);
+
+		/* Ignore structures with reserved BAR values */
+		if (bar >= PCI_STD_NUM_BARS)
+			continue;
+
+		if (type != cfg_type)
+			continue;
+
+		/* The BAR must exist and be of a requested resource type. */
+		if (!pci_resource_len(dev, bar))
+			continue;
+		if (!(pci_resource_flags(dev, bar) & ioresource_types))
+			continue;
+
+		*bars |= (1 << bar);
+		return pos;
+	}
+
+	return 0;
+}
+
+/* Build-time check that each VIRTIO_PCI_* offset macro equals its struct field offset. This is part of the ABI. Don't screw with it. */
+static inline void check_offsets(void)
+{
+ /* Note: the body consists solely of BUILD_BUG_ON() assertions. */
+ BUILD_BUG_ON(VIRTIO_PCI_CAP_VNDR !=
+ offsetof(struct virtio_pci_cap, cap_vndr));
+ BUILD_BUG_ON(VIRTIO_PCI_CAP_NEXT !=
+ offsetof(struct virtio_pci_cap, cap_next));
+ BUILD_BUG_ON(VIRTIO_PCI_CAP_LEN !=
+ offsetof(struct virtio_pci_cap, cap_len));
+ BUILD_BUG_ON(VIRTIO_PCI_CAP_CFG_TYPE !=
+ offsetof(struct virtio_pci_cap, cfg_type));
+ BUILD_BUG_ON(VIRTIO_PCI_CAP_BAR !=
+ offsetof(struct virtio_pci_cap, bar));
+ BUILD_BUG_ON(VIRTIO_PCI_CAP_OFFSET !=
+ offsetof(struct virtio_pci_cap, offset));
+ BUILD_BUG_ON(VIRTIO_PCI_CAP_LENGTH !=
+ offsetof(struct virtio_pci_cap, length));
+ BUILD_BUG_ON(VIRTIO_PCI_NOTIFY_CAP_MULT !=
+ offsetof(struct virtio_pci_notify_cap,
+ notify_off_multiplier));
+ BUILD_BUG_ON(VIRTIO_PCI_COMMON_DFSELECT !=
+ offsetof(struct virtio_pci_common_cfg,
+ device_feature_select));
+ BUILD_BUG_ON(VIRTIO_PCI_COMMON_DF !=
+ offsetof(struct virtio_pci_common_cfg, device_feature));
+ BUILD_BUG_ON(VIRTIO_PCI_COMMON_GFSELECT !=
+ offsetof(struct virtio_pci_common_cfg,
+ guest_feature_select));
+ BUILD_BUG_ON(VIRTIO_PCI_COMMON_GF !=
+ offsetof(struct virtio_pci_common_cfg, guest_feature));
+ BUILD_BUG_ON(VIRTIO_PCI_COMMON_MSIX !=
+ offsetof(struct virtio_pci_common_cfg, msix_config));
+ BUILD_BUG_ON(VIRTIO_PCI_COMMON_NUMQ !=
+ offsetof(struct virtio_pci_common_cfg, num_queues));
+ BUILD_BUG_ON(VIRTIO_PCI_COMMON_STATUS !=
+ offsetof(struct virtio_pci_common_cfg, device_status));
+ BUILD_BUG_ON(VIRTIO_PCI_COMMON_CFGGENERATION !=
+ offsetof(struct virtio_pci_common_cfg, config_generation));
+ BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SELECT !=
+ offsetof(struct virtio_pci_common_cfg, queue_select));
+ BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SIZE !=
+ offsetof(struct virtio_pci_common_cfg, queue_size));
+ BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_MSIX !=
+ offsetof(struct virtio_pci_common_cfg, queue_msix_vector));
+ BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_ENABLE !=
+ offsetof(struct virtio_pci_common_cfg, queue_enable));
+ BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_NOFF !=
+ offsetof(struct virtio_pci_common_cfg, queue_notify_off));
+ BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCLO !=
+ offsetof(struct virtio_pci_common_cfg, queue_desc_lo));
+ BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCHI !=
+ offsetof(struct virtio_pci_common_cfg, queue_desc_hi));
+ BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILLO !=
+ offsetof(struct virtio_pci_common_cfg, queue_avail_lo));
+ BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILHI !=
+ offsetof(struct virtio_pci_common_cfg, queue_avail_hi));
+ BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDLO !=
+ offsetof(struct virtio_pci_common_cfg, queue_used_lo));
+ BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDHI !=
+ offsetof(struct virtio_pci_common_cfg, queue_used_hi));
+}
+
+/*
+ * vp_modern_probe: probe the modern virtio pci device, note that the
+ * caller is required to enable PCI device before calling this function.
+ * @mdev: the modern virtio-pci device
+ *
+ * Returns 0 on success, a negative error code otherwise
+ */
+int vp_modern_probe(struct virtio_pci_modern_device *mdev)
+{
+ struct pci_dev *pci_dev = mdev->pci_dev;
+ int err, common, isr, notify, device;
+ u32 notify_length;
+ u32 notify_offset;
+ int devid;
+
+ check_offsets();
+
+ if (mdev->device_id_check) {
+ devid = mdev->device_id_check(pci_dev);
+ if (devid < 0)
+ return devid;
+ mdev->id.device = devid;
+ } else {
+ /* We only own devices >= 0x1000 and <= 0x107f: leave the rest. */
+ if (pci_dev->device < 0x1000 || pci_dev->device > 0x107f)
+ return -ENODEV;
+
+ if (pci_dev->device < 0x1040) {
+ /* Transitional devices: use the PCI subsystem device id as
+ * virtio device id, same as legacy driver always did.
+ */
+ mdev->id.device = pci_dev->subsystem_device;
+ } else {
+ /* Modern devices: simply use PCI device id, but start from 0x1040. */
+ mdev->id.device = pci_dev->device - 0x1040;
+ }
+ }
+ mdev->id.vendor = pci_dev->subsystem_vendor;
+
+ /* Check for a common config capability: if absent, leave the device to the legacy driver. */
+ common = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_COMMON_CFG,
+ IORESOURCE_IO | IORESOURCE_MEM,
+ &mdev->modern_bars);
+ if (!common) {
+ dev_info(&pci_dev->dev,
+ "virtio_pci: leaving for legacy driver\n");
+ return -ENODEV;
+ }
+
+ /* If common is there, the ISR and notify capabilities should be too... */
+ isr = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_ISR_CFG,
+ IORESOURCE_IO | IORESOURCE_MEM,
+ &mdev->modern_bars);
+ notify = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_NOTIFY_CFG,
+ IORESOURCE_IO | IORESOURCE_MEM,
+ &mdev->modern_bars);
+ if (!isr || !notify) {
+ dev_err(&pci_dev->dev,
+ "virtio_pci: missing capabilities %i/%i/%i\n",
+ common, isr, notify);
+ return -EINVAL;
+ }
+
+ err = dma_set_mask_and_coherent(&pci_dev->dev,
+ mdev->dma_mask ? : DMA_BIT_MASK(64));
+ if (err)
+ err = dma_set_mask_and_coherent(&pci_dev->dev,
+ DMA_BIT_MASK(32));
+ if (err)
+ dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.\n");
+
+ /* Device capability is only mandatory for devices that have
+ * device-specific configuration.
+ */
+ device = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_DEVICE_CFG,
+ IORESOURCE_IO | IORESOURCE_MEM,
+ &mdev->modern_bars);
+
+ err = pci_request_selected_regions(pci_dev, mdev->modern_bars,
+ "virtio-pci-modern");
+ if (err)
+ return err;
+
+ err = -EINVAL;
+ mdev->common = vp_modern_map_capability(mdev, common,
+ sizeof(struct virtio_pci_common_cfg), 4,
+ 0, sizeof(struct virtio_pci_modern_common_cfg),
+ NULL, NULL);
+ if (!mdev->common)
+ goto err_map_common;
+ mdev->isr = vp_modern_map_capability(mdev, isr, sizeof(u8), 1,
+ 0, 1,
+ NULL, NULL);
+ if (!mdev->isr)
+ goto err_map_isr;
+
+ /* Read notify_off_multiplier from config space. */
+ pci_read_config_dword(pci_dev,
+ notify + offsetof(struct virtio_pci_notify_cap,
+ notify_off_multiplier),
+ &mdev->notify_offset_multiplier);
+ /* Read notify length and offset from config space. */
+ pci_read_config_dword(pci_dev,
+ notify + offsetof(struct virtio_pci_notify_cap,
+ cap.length),
+ &notify_length);
+
+ pci_read_config_dword(pci_dev,
+ notify + offsetof(struct virtio_pci_notify_cap,
+ cap.offset),
+ &notify_offset);
+
+ /* We don't know how many VQs we'll map, ahead of time.
+ * If the notify area fits within a single page, map it all now.
+ * Otherwise, map each VQ individually later.
+ */
+ if ((u64)notify_length + (notify_offset % PAGE_SIZE) <= PAGE_SIZE) {
+ mdev->notify_base = vp_modern_map_capability(mdev, notify,
+ 2, 2,
+ 0, notify_length,
+ &mdev->notify_len,
+ &mdev->notify_pa);
+ if (!mdev->notify_base)
+ goto err_map_notify;
+ } else {
+ mdev->notify_map_cap = notify;
+ }
+
+ /* Again, we don't know how much we should map, but PAGE_SIZE
+ * is more than enough for all existing devices.
+ */
+ if (device) {
+ mdev->device = vp_modern_map_capability(mdev, device, 0, 4,
+ 0, PAGE_SIZE,
+ &mdev->device_len,
+ NULL);
+ if (!mdev->device)
+ goto err_map_device;
+ }
+
+ return 0;
+
+err_map_device:
+ if (mdev->notify_base)
+ pci_iounmap(pci_dev, mdev->notify_base);
+err_map_notify:
+ pci_iounmap(pci_dev, mdev->isr);
+err_map_isr:
+ pci_iounmap(pci_dev, mdev->common);
+err_map_common:
+ pci_release_selected_regions(pci_dev, mdev->modern_bars);
+ return err;
+}
+EXPORT_SYMBOL_GPL(vp_modern_probe);
+
+/*
+ * vp_modern_remove: remove and cleanup the modern virtio pci device
+ * @mdev: the modern virtio-pci device
+ *
+ * Unmaps the device config and notify areas when they are mapped, then the
+ * ISR and common config areas, and finally releases the requested BARs.
+ */
+void vp_modern_remove(struct virtio_pci_modern_device *mdev)
+{
+	struct pci_dev *pdev = mdev->pci_dev;
+
+	/* These two mappings are optional, hence the checks. */
+	if (mdev->device)
+		pci_iounmap(pdev, mdev->device);
+	if (mdev->notify_base)
+		pci_iounmap(pdev, mdev->notify_base);
+
+	pci_iounmap(pdev, mdev->isr);
+	pci_iounmap(pdev, mdev->common);
+
+	pci_release_selected_regions(pdev, mdev->modern_bars);
+}
+EXPORT_SYMBOL_GPL(vp_modern_remove);
+
+/*
+ * vp_modern_get_features - get features from device
+ * @mdev: the modern virtio-pci device
+ *
+ * The 64-bit value is read as two 32-bit halves, selected by writing 0 and
+ * then 1 to device_feature_select.
+ *
+ * Returns the features read from the device
+ */
+u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+	u32 lo, hi;
+
+	vp_iowrite32(0, &cfg->device_feature_select);
+	lo = vp_ioread32(&cfg->device_feature);
+
+	vp_iowrite32(1, &cfg->device_feature_select);
+	hi = vp_ioread32(&cfg->device_feature);
+
+	return ((u64)hi << 32) | lo;
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_features);
+
+/*
+ * vp_modern_get_driver_features - get driver features from device
+ * @mdev: the modern virtio-pci device
+ *
+ * The 64-bit value is read as two 32-bit halves, selected by writing 0 and
+ * then 1 to guest_feature_select.
+ *
+ * Returns the driver features read from the device
+ */
+u64 vp_modern_get_driver_features(struct virtio_pci_modern_device *mdev)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+	u32 lo, hi;
+
+	vp_iowrite32(0, &cfg->guest_feature_select);
+	lo = vp_ioread32(&cfg->guest_feature);
+
+	vp_iowrite32(1, &cfg->guest_feature_select);
+	hi = vp_ioread32(&cfg->guest_feature);
+
+	return ((u64)hi << 32) | lo;
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_driver_features);
+
+/*
+ * vp_modern_set_features - set features to device
+ * @mdev: the modern virtio-pci device
+ * @features: the features set to device
+ *
+ * The 64-bit value is written as two 32-bit halves, selected by writing 0
+ * and then 1 to guest_feature_select.
+ */
+void vp_modern_set_features(struct virtio_pci_modern_device *mdev,
+			    u64 features)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+	const u32 lo = (u32)features;
+	const u32 hi = features >> 32;
+
+	vp_iowrite32(0, &cfg->guest_feature_select);
+	vp_iowrite32(lo, &cfg->guest_feature);
+
+	vp_iowrite32(1, &cfg->guest_feature_select);
+	vp_iowrite32(hi, &cfg->guest_feature);
+}
+EXPORT_SYMBOL_GPL(vp_modern_set_features);
+
+/*
+ * vp_modern_generation - get the device generation
+ * @mdev: the modern virtio-pci device
+ *
+ * Returns the generation read from the device's config_generation field
+ */
+u32 vp_modern_generation(struct virtio_pci_modern_device *mdev)
+{
+	struct virtio_pci_common_cfg __iomem *common_cfg = mdev->common;
+	u32 generation = vp_ioread8(&common_cfg->config_generation);
+
+	return generation;
+}
+EXPORT_SYMBOL_GPL(vp_modern_generation);
+
+/*
+ * vp_modern_get_status - get the device status
+ * @mdev: the modern virtio-pci device
+ *
+ * Returns the value read from the device_status field
+ */
+u8 vp_modern_get_status(struct virtio_pci_modern_device *mdev)
+{
+	struct virtio_pci_common_cfg __iomem *common_cfg = mdev->common;
+	u8 status = vp_ioread8(&common_cfg->device_status);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_status);
+
+/*
+ * vp_modern_set_status - set status to device
+ * @mdev: the modern virtio-pci device
+ * @status: the status set to device
+ */
+void vp_modern_set_status(struct virtio_pci_modern_device *mdev,
+			  u8 status)
+{
+	struct virtio_pci_common_cfg __iomem *common_cfg = mdev->common;
+
+	/*
+	 * No wmb() before the MMIO write: per memory-barriers.txt it is not
+	 * needed to guarantee that cache coherent memory writes have
+	 * completed before writing to the MMIO region.
+	 */
+	vp_iowrite8(status, &common_cfg->device_status);
+}
+EXPORT_SYMBOL_GPL(vp_modern_set_status);
+
+/*
+ * vp_modern_get_queue_reset - get the queue reset status
+ * @mdev: the modern virtio-pci device
+ * @index: queue index
+ *
+ * Returns the value read from queue_reset after selecting queue @index
+ */
+int vp_modern_get_queue_reset(struct virtio_pci_modern_device *mdev, u16 index)
+{
+	struct virtio_pci_modern_common_cfg __iomem *cfg =
+		(struct virtio_pci_modern_common_cfg __iomem *)mdev->common;
+
+	vp_iowrite16(index, &cfg->cfg.queue_select);
+
+	return vp_ioread16(&cfg->queue_reset);
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_queue_reset);
+
+/*
+ * vp_modern_set_queue_reset - reset the queue
+ * @mdev: the modern virtio-pci device
+ * @index: queue index
+ *
+ * Writes 1 to queue_reset for queue @index, then sleeps in 1 ms steps until
+ * first queue_reset and then queue_enable read back as 0.
+ */
+void vp_modern_set_queue_reset(struct virtio_pci_modern_device *mdev, u16 index)
+{
+	struct virtio_pci_modern_common_cfg __iomem *cfg =
+		(struct virtio_pci_modern_common_cfg __iomem *)mdev->common;
+
+	vp_iowrite16(index, &cfg->cfg.queue_select);
+	vp_iowrite16(1, &cfg->queue_reset);
+
+	/* Wait for queue_reset to read back as 0. */
+	while (vp_ioread16(&cfg->queue_reset) != 0)
+		msleep(1);
+
+	/* Then wait for the queue to be reported as disabled. */
+	while (vp_ioread16(&cfg->cfg.queue_enable) != 0)
+		msleep(1);
+}
+EXPORT_SYMBOL_GPL(vp_modern_set_queue_reset);
+
+/*
+ * vp_modern_queue_vector - set the MSIX vector for a specific virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: queue index
+ * @vector: the MSIX vector to assign to the queue
+ *
+ * Returns the queue vector read back from the device
+ */
+u16 vp_modern_queue_vector(struct virtio_pci_modern_device *mdev,
+			   u16 index, u16 vector)
+{
+	struct virtio_pci_common_cfg __iomem *common_cfg = mdev->common;
+	u16 readback;
+
+	vp_iowrite16(index, &common_cfg->queue_select);
+	vp_iowrite16(vector, &common_cfg->queue_msix_vector);
+
+	/* The read back also flushes the write out to the device. */
+	readback = vp_ioread16(&common_cfg->queue_msix_vector);
+
+	return readback;
+}
+EXPORT_SYMBOL_GPL(vp_modern_queue_vector);
+
+/*
+ * vp_modern_config_vector - set the vector for config interrupt
+ * @mdev: the modern virtio-pci device
+ * @vector: the config vector
+ *
+ * Returns the config vector read back from the device
+ */
+u16 vp_modern_config_vector(struct virtio_pci_modern_device *mdev,
+			    u16 vector)
+{
+	struct virtio_pci_common_cfg __iomem *common_cfg = mdev->common;
+	u16 readback;
+
+	/* Setup the vector used for configuration events */
+	vp_iowrite16(vector, &common_cfg->msix_config);
+
+	/*
+	 * Reading back verifies we had enough resources to assign the vector
+	 * and also flushes the write out to the device.
+	 */
+	readback = vp_ioread16(&common_cfg->msix_config);
+
+	return readback;
+}
+EXPORT_SYMBOL_GPL(vp_modern_config_vector);
+
+/*
+ * vp_modern_queue_address - set the virtqueue address
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ * @desc_addr: address of the descriptor area
+ * @driver_addr: address of the driver area
+ * @device_addr: address of the device area
+ *
+ * Each 64-bit address is written through a lo/hi register pair of the
+ * queue selected by @index.
+ */
+void vp_modern_queue_address(struct virtio_pci_modern_device *mdev,
+			     u16 index, u64 desc_addr, u64 driver_addr,
+			     u64 device_addr)
+{
+	struct virtio_pci_common_cfg __iomem *common_cfg = mdev->common;
+
+	vp_iowrite16(index, &common_cfg->queue_select);
+
+	/* descriptor area */
+	vp_iowrite64_twopart(desc_addr,
+			     &common_cfg->queue_desc_lo,
+			     &common_cfg->queue_desc_hi);
+	/* driver area */
+	vp_iowrite64_twopart(driver_addr,
+			     &common_cfg->queue_avail_lo,
+			     &common_cfg->queue_avail_hi);
+	/* device area */
+	vp_iowrite64_twopart(device_addr,
+			     &common_cfg->queue_used_lo,
+			     &common_cfg->queue_used_hi);
+}
+EXPORT_SYMBOL_GPL(vp_modern_queue_address);
+
+/*
+ * vp_modern_set_queue_enable - enable or disable a virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ * @enable: value written to the queue_enable register of the queue
+ */
+void vp_modern_set_queue_enable(struct virtio_pci_modern_device *mdev,
+				u16 index, bool enable)
+{
+	struct virtio_pci_common_cfg __iomem *common = mdev->common;
+
+	vp_iowrite16(index, &common->queue_select);
+	vp_iowrite16(enable, &common->queue_enable);
+}
+EXPORT_SYMBOL_GPL(vp_modern_set_queue_enable);
+
+/*
+ * vp_modern_get_queue_enable - query whether a virtqueue is enabled
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ *
+ * Returns whether a virtqueue is enabled or not
+ */
+bool vp_modern_get_queue_enable(struct virtio_pci_modern_device *mdev,
+				u16 index)
+{
+	struct virtio_pci_common_cfg __iomem *common = mdev->common;
+
+	vp_iowrite16(index, &common->queue_select);
+
+	/* Any non-zero register value is reported as "enabled". */
+	return vp_ioread16(&common->queue_enable);
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_queue_enable);
+
+/*
+ * vp_modern_set_queue_size - set size for a virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ * @size: the size of the virtqueue
+ */
+void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev,
+			      u16 index, u16 size)
+{
+	struct virtio_pci_common_cfg __iomem *common = mdev->common;
+
+	vp_iowrite16(index, &common->queue_select);
+	vp_iowrite16(size, &common->queue_size);
+}
+EXPORT_SYMBOL_GPL(vp_modern_set_queue_size);
+
+/*
+ * vp_modern_get_queue_size - get size for a virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ *
+ * Returns the size of the virtqueue
+ */
+u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev,
+			     u16 index)
+{
+	struct virtio_pci_common_cfg __iomem *common = mdev->common;
+
+	vp_iowrite16(index, &common->queue_select);
+
+	return vp_ioread16(&common->queue_size);
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_queue_size);
+
+/*
+ * vp_modern_get_num_queues - get the number of virtqueues
+ * @mdev: the modern virtio-pci device
+ *
+ * Returns the value of the num_queues register in the common config
+ */
+u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev)
+{
+	struct virtio_pci_common_cfg __iomem *common = mdev->common;
+
+	return vp_ioread16(&common->num_queues);
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_num_queues);
+
+/*
+ * vp_modern_get_queue_notify_off - get notification offset for a virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ *
+ * Returns the raw queue_notify_off register value for the selected queue;
+ * the caller scales it by the notify offset multiplier.
+ */
+static u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev,
+					  u16 index)
+{
+	struct virtio_pci_common_cfg __iomem *common = mdev->common;
+
+	vp_iowrite16(index, &common->queue_select);
+
+	return vp_ioread16(&common->queue_notify_off);
+}
+
+/*
+ * vp_modern_map_vq_notify - map notification area for a
+ * specific virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ * @pa: the pointer to the physical address of the notify area
+ *      (may be NULL when the notify area is already mapped)
+ *
+ * When the whole notify area has already been mapped (notify_base is set)
+ * the per-queue address is computed inside that mapping after checking
+ * that the 2-byte notify register fits within notify_len. Otherwise a
+ * dedicated 2-byte mapping is created through vp_modern_map_capability().
+ *
+ * Returns the address of the notification area, or NULL if the offset
+ * reported by the device is out of range.
+ */
+void __iomem *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev,
+				      u16 index, resource_size_t *pa)
+{
+	u16 off = vp_modern_get_queue_notify_off(mdev, index);
+
+	if (mdev->notify_base) {
+		/* offset should not wrap: do the range check in 64 bits */
+		if ((u64)off * mdev->notify_offset_multiplier + 2
+			> mdev->notify_len) {
+			/* notify_len is a size_t, so print it with %zu */
+			dev_warn(&mdev->pci_dev->dev,
+				 "bad notification offset %u (x %u) "
+				 "for queue %u > %zu",
+				 off, mdev->notify_offset_multiplier,
+				 index, mdev->notify_len);
+			return NULL;
+		}
+		if (pa)
+			*pa = mdev->notify_pa +
+			      off * mdev->notify_offset_multiplier;
+		return mdev->notify_base + off * mdev->notify_offset_multiplier;
+	} else {
+		return vp_modern_map_capability(mdev,
+				       mdev->notify_map_cap, 2, 2,
+				       off * mdev->notify_offset_multiplier, 2,
+				       NULL, pa);
+	}
+}
+EXPORT_SYMBOL_GPL(vp_modern_map_vq_notify);
+
+MODULE_VERSION("0.1");
+MODULE_DESCRIPTION("Modern Virtio PCI Device");
+MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
new file mode 100644
index 000000000..49299b1f9
--- /dev/null
+++ b/drivers/virtio/virtio_ring.c
@@ -0,0 +1,3252 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Virtio ring implementation.
+ *
+ * Copyright 2007 Rusty Russell IBM Corporation
+ */
+#include <linux/virtio.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_config.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/hrtimer.h>
+#include <linux/dma-mapping.h>
+#include <linux/kmsan.h>
+#include <linux/spinlock.h>
+#include <xen/xen.h>
+
+/*
+ * Ring sanity helpers.
+ *
+ * With DEBUG defined, a corrupted ring or a reentrant use of a virtqueue
+ * crashes the kernel, and the time between adding a buffer and the next
+ * kick/get is checked. Without DEBUG, BAD_RING() only logs the problem and
+ * marks the queue broken, and the remaining helpers expand to nothing.
+ */
+#ifdef DEBUG
+/* For development, we want to crash whenever the ring is screwed. */
+#define BAD_RING(_vq, fmt, args...)				\
+	do {							\
+		dev_err(&(_vq)->vq.vdev->dev,			\
+			"%s:"fmt, (_vq)->vq.name, ##args);	\
+		BUG();						\
+	} while (0)
+/* Caller is supposed to guarantee no reentry. */
+#define START_USE(_vq)						\
+	do {							\
+		if ((_vq)->in_use)				\
+			panic("%s:in_use = %i\n",		\
+			      (_vq)->vq.name, (_vq)->in_use);	\
+		(_vq)->in_use = __LINE__;			\
+	} while (0)
+#define END_USE(_vq) \
+	do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; } while(0)
+#define LAST_ADD_TIME_UPDATE(_vq)				\
+	do {							\
+		ktime_t now = ktime_get();			\
+								\
+		/* No kick or get, with .1 second between?  Warn. */ \
+		if ((_vq)->last_add_time_valid)			\
+			WARN_ON(ktime_to_ms(ktime_sub(now,	\
+				(_vq)->last_add_time)) > 100);	\
+		(_vq)->last_add_time = now;			\
+		(_vq)->last_add_time_valid = true;		\
+	} while (0)
+#define LAST_ADD_TIME_CHECK(_vq)				\
+	do {							\
+		if ((_vq)->last_add_time_valid) {		\
+			WARN_ON(ktime_to_ms(ktime_sub(ktime_get(), \
+				      (_vq)->last_add_time)) > 100); \
+		}						\
+	} while (0)
+#define LAST_ADD_TIME_INVALID(_vq)				\
+	((_vq)->last_add_time_valid = false)
+#else
+/* The macro argument is parenthesised so any pointer expression is safe. */
+#define BAD_RING(_vq, fmt, args...)				\
+	do {							\
+		dev_err(&(_vq)->vq.vdev->dev,			\
+			"%s:"fmt, (_vq)->vq.name, ##args);	\
+		(_vq)->broken = true;				\
+	} while (0)
+#define START_USE(vq)
+#define END_USE(vq)
+#define LAST_ADD_TIME_UPDATE(vq)
+#define LAST_ADD_TIME_CHECK(vq)
+#define LAST_ADD_TIME_INVALID(vq)
+#endif
+
+/*
+ * Per-descriptor driver state for a split ring, indexed by the head
+ * descriptor of a buffer.
+ */
+struct vring_desc_state_split {
+ void *data; /* Data for callback. */
+ /*
+  * Indirect descriptor table, if any. When the buffer was added without
+  * an indirect table, virtqueue_add_split() stores the caller's ctx
+  * pointer here instead.
+  */
+ struct vring_desc *indir_desc; /* Indirect descriptor, if any. */
+};
+
+/* Per-descriptor driver state for a packed ring. */
+struct vring_desc_state_packed {
+ void *data; /* Data for callback. */
+ struct vring_packed_desc *indir_desc; /* Indirect descriptor, if any. */
+ u16 num; /* Descriptor list length. */
+ u16 last; /* The last desc state in a list. */
+};
+
+/*
+ * Driver-private copy of descriptor fields. The split-ring code unmaps
+ * buffers and walks descriptor chains from these values rather than from
+ * the descriptor table itself.
+ */
+struct vring_desc_extra {
+ dma_addr_t addr; /* Descriptor DMA addr. */
+ u32 len; /* Descriptor length. */
+ u16 flags; /* Descriptor flags. */
+ u16 next; /* The next desc state in a list. */
+};
+
+/* State that only exists for a split-ring virtqueue. */
+struct vring_virtqueue_split {
+ /* Actual memory layout for this queue. */
+ struct vring vring;
+
+ /* Last written value to avail->flags */
+ u16 avail_flags_shadow;
+
+ /*
+ * Last written value to avail->idx in
+ * guest byte order.
+ */
+ u16 avail_idx_shadow;
+
+ /* Per-descriptor state. */
+ struct vring_desc_state_split *desc_state;
+ struct vring_desc_extra *desc_extra;
+
+ /* DMA address and size information */
+ dma_addr_t queue_dma_addr;
+ size_t queue_size_in_bytes;
+
+ /*
+ * The parameters the vring was created with, kept so that a new vring
+ * can be created with the same settings.
+ */
+ u32 vring_align;
+ bool may_reduce_num;
+};
+
+/* State that only exists for a packed-ring virtqueue. */
+struct vring_virtqueue_packed {
+ /* Actual memory layout for this queue. */
+ struct {
+ unsigned int num;
+ struct vring_packed_desc *desc;
+ struct vring_packed_desc_event *driver;
+ struct vring_packed_desc_event *device;
+ } vring;
+
+ /* Driver ring wrap counter. */
+ bool avail_wrap_counter;
+
+ /* Avail used flags. */
+ u16 avail_used_flags;
+
+ /* Index of the next avail descriptor. */
+ u16 next_avail_idx;
+
+ /*
+ * Last written value to driver->flags in
+ * guest byte order.
+ */
+ u16 event_flags_shadow;
+
+ /* Per-descriptor state. */
+ struct vring_desc_state_packed *desc_state;
+ struct vring_desc_extra *desc_extra;
+
+ /* DMA address and size information */
+ dma_addr_t ring_dma_addr;
+ dma_addr_t driver_event_dma_addr;
+ dma_addr_t device_event_dma_addr;
+ size_t ring_size_in_bytes;
+ size_t event_size_in_bytes;
+};
+
+/*
+ * The ring implementation's view of a virtqueue. The public struct
+ * virtqueue is embedded as @vq and converted back with to_vvq().
+ */
+struct vring_virtqueue {
+ struct virtqueue vq;
+
+ /* Is this a packed ring? */
+ bool packed_ring;
+
+ /* Is DMA API used? */
+ bool use_dma_api;
+
+ /* Can we use weak barriers? */
+ bool weak_barriers;
+
+ /* Other side has made a mess, don't try any more. */
+ bool broken;
+
+ /* Host supports indirect buffers */
+ bool indirect;
+
+ /* Host publishes avail event idx */
+ bool event;
+
+ /* Do DMA mapping by driver */
+ bool premapped;
+
+ /* Whether descriptors must be unmapped by this code. This is true
+ * only when premapped is false and use_dma_api is true.
+ */
+ bool do_unmap;
+
+ /* Head of free buffer list. */
+ unsigned int free_head;
+ /* Number we've added since last sync. */
+ unsigned int num_added;
+
+ /* Last used index we've seen.
+ * for split ring, it just contains last used index
+ * for packed ring:
+ * bits up to VRING_PACKED_EVENT_F_WRAP_CTR include the last used index.
+ * bits from VRING_PACKED_EVENT_F_WRAP_CTR include the used wrap counter.
+ */
+ u16 last_used_idx;
+
+ /* Hint for event idx: already triggered no need to disable. */
+ bool event_triggered;
+
+ /* Ring-format specific state; which member is valid follows packed_ring. */
+ union {
+ /* Available for split ring */
+ struct vring_virtqueue_split split;
+
+ /* Available for packed ring */
+ struct vring_virtqueue_packed packed;
+ };
+
+ /* How to notify other side. FIXME: commonalize hcalls! */
+ bool (*notify)(struct virtqueue *vq);
+
+ /* DMA, allocation, and size information */
+ bool we_own_ring;
+
+ /* Device used for doing DMA */
+ struct device *dma_dev;
+
+#ifdef DEBUG
+ /* They're supposed to lock for us. */
+ unsigned int in_use;
+
+ /* Figure out if their kicks are too delayed. */
+ bool last_add_time_valid;
+ ktime_t last_add_time;
+#endif
+};
+
+/* Forward declarations for helpers that are defined later in this file. */
+static struct virtqueue *__vring_new_virtqueue(unsigned int index,
+ struct vring_virtqueue_split *vring_split,
+ struct virtio_device *vdev,
+ bool weak_barriers,
+ bool context,
+ bool (*notify)(struct virtqueue *),
+ void (*callback)(struct virtqueue *),
+ const char *name,
+ struct device *dma_dev);
+static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num);
+static void vring_free(struct virtqueue *_vq);
+
+/*
+ * Helpers.
+ */
+
+/* Convert a struct virtqueue pointer to its enclosing struct vring_virtqueue. */
+#define to_vvq(_vq) container_of_const(_vq, struct vring_virtqueue, vq)
+
+/*
+ * Decide whether a buffer of @total_sg entries should be added through an
+ * indirect descriptor table.
+ */
+static bool virtqueue_use_indirect(const struct vring_virtqueue *vq,
+				   unsigned int total_sg)
+{
+	/* The host must support indirect descriptor tables at all. */
+	if (!vq->indirect)
+		return false;
+
+	/*
+	 * Go indirect when there are multiple buffers and at least one free
+	 * descriptor is left. FIXME: tune this threshold
+	 */
+	return total_sg > 1 && vq->vq.num_free != 0;
+}
+
+/*
+ * Modern virtio devices have feature bits to specify whether they need a
+ * quirk and bypass the IOMMU. If not there, just use the DMA API.
+ *
+ * If there, the interaction between virtio and DMA API is messy.
+ *
+ * On most systems with virtio, physical addresses match bus addresses,
+ * and it doesn't particularly matter whether we use the DMA API.
+ *
+ * On some systems, including Xen and any system with a physical device
+ * that speaks virtio behind a physical IOMMU, we must use the DMA API
+ * for virtio DMA to work at all.
+ *
+ * On other systems, including SPARC and PPC64, virtio-pci devices are
+ * enumerated as though they are behind an IOMMU, but the virtio host
+ * ignores the IOMMU, so we must either pretend that the IOMMU isn't
+ * there or somehow map everything as the identity.
+ *
+ * For the time being, we preserve historic behavior and bypass the DMA
+ * API.
+ *
+ * TODO: install a per-device DMA ops structure that does the right thing
+ * taking into account all the above quirks, and use the DMA API
+ * unconditionally on data path.
+ */
+
+/* Returns true if rings and buffers for @vdev must go through the DMA API. */
+static bool vring_use_dma_api(const struct virtio_device *vdev)
+{
+	/* Devices without the DMA quirk always use the DMA API. */
+	if (!virtio_has_dma_quirk(vdev))
+		return true;
+
+	/*
+	 * Otherwise, we are left to guess.
+	 *
+	 * In theory, it's possible to have a buggy QEMU-supplied
+	 * emulated Q35 IOMMU and Xen enabled at the same time.  On
+	 * such a configuration, virtio has never worked and will
+	 * not work without an even larger kludge.  Instead, enable
+	 * the DMA API if we're a Xen guest, which at least allows
+	 * all of the sensible Xen configurations to work correctly.
+	 */
+	return xen_domain();
+}
+
+/*
+ * Returns the largest size of a single DMA mapping for @vdev: the limit
+ * reported by dma_max_mapping_size() for the parent device when the DMA
+ * API is in use, SIZE_MAX otherwise.
+ */
+size_t virtio_max_dma_size(const struct virtio_device *vdev)
+{
+	if (!vring_use_dma_api(vdev))
+		return SIZE_MAX;
+
+	return dma_max_mapping_size(vdev->dev.parent);
+}
+EXPORT_SYMBOL_GPL(virtio_max_dma_size);
+
+/*
+ * Allocate @size bytes of ring memory and store its device-visible address
+ * in @dma_handle. Uses dma_alloc_coherent() on @dma_dev when the DMA API
+ * is in use, and page-aligned pages addressed by physical address
+ * otherwise. Returns NULL on failure.
+ */
+static void *vring_alloc_queue(struct virtio_device *vdev, size_t size,
+			       dma_addr_t *dma_handle, gfp_t flag,
+			       struct device *dma_dev)
+{
+	phys_addr_t phys_addr;
+	void *queue;
+
+	if (vring_use_dma_api(vdev))
+		return dma_alloc_coherent(dma_dev, size, dma_handle, flag);
+
+	queue = alloc_pages_exact(PAGE_ALIGN(size), flag);
+	if (!queue)
+		return NULL;
+
+	phys_addr = virt_to_phys(queue);
+	*dma_handle = (dma_addr_t)phys_addr;
+
+	/*
+	 * Sanity check: make sure we didn't truncate the address.  The
+	 * only arches I can find that have 64-bit phys_addr_t but 32-bit
+	 * dma_addr_t are certain non-highmem MIPS and x86 configurations,
+	 * but these configurations should never allocate physical pages
+	 * above 32 bits, so this is fine.  Just in case, throw a warning
+	 * and abort if we end up with an unrepresentable address.
+	 */
+	if (WARN_ON_ONCE(*dma_handle != phys_addr)) {
+		free_pages_exact(queue, PAGE_ALIGN(size));
+		return NULL;
+	}
+
+	return queue;
+}
+
+/*
+ * Free ring memory obtained from vring_alloc_queue(), using the release
+ * routine that matches the way it was allocated.
+ */
+static void vring_free_queue(struct virtio_device *vdev, size_t size,
+			     void *queue, dma_addr_t dma_handle,
+			     struct device *dma_dev)
+{
+	if (!vring_use_dma_api(vdev)) {
+		free_pages_exact(queue, PAGE_ALIGN(size));
+		return;
+	}
+
+	dma_free_coherent(dma_dev, size, queue, dma_handle);
+}
+
+/*
+ * The DMA ops on various arches are rather gnarly right now, and
+ * making all of the arch DMA ops work on the vring device itself
+ * is a mess.
+ *
+ * Returns the device recorded in vq->dma_dev, which the mapping helpers
+ * in this file pass to the DMA API.
+ */
+static struct device *vring_dma_dev(const struct vring_virtqueue *vq)
+{
+ return vq->dma_dev;
+}
+
+/*
+ * Map one sg entry.
+ *
+ * Stores the address to place in the descriptor in @addr. Returns 0 on
+ * success or -ENOMEM if the DMA mapping failed.
+ */
+static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg,
+			    enum dma_data_direction direction, dma_addr_t *addr)
+{
+	struct device *dev;
+
+	/* The driver already mapped the buffer: just pick up its address. */
+	if (vq->premapped) {
+		*addr = sg_dma_address(sg);
+		return 0;
+	}
+
+	if (!vq->use_dma_api) {
+		/*
+		 * If DMA is not used, KMSAN doesn't know that the scatterlist
+		 * is initialized by the hardware. Explicitly check/unpoison it
+		 * depending on the direction.
+		 */
+		kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, direction);
+		*addr = (dma_addr_t)sg_phys(sg);
+		return 0;
+	}
+
+	dev = vring_dma_dev(vq);
+
+	/*
+	 * We can't use dma_map_sg, because we don't use scatterlists in
+	 * the way it expects (we don't guarantee that the scatterlist
+	 * will exist for the lifetime of the mapping).
+	 */
+	*addr = dma_map_page(dev, sg_page(sg), sg->offset, sg->length,
+			     direction);
+
+	return dma_mapping_error(dev, *addr) ? -ENOMEM : 0;
+}
+
+/*
+ * Map a kernel virtual buffer for the device. Without the DMA API the
+ * physical address is used directly.
+ */
+static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
+				   void *cpu_addr, size_t size,
+				   enum dma_data_direction direction)
+{
+	if (vq->use_dma_api)
+		return dma_map_single(vring_dma_dev(vq),
+				      cpu_addr, size, direction);
+
+	return (dma_addr_t)virt_to_phys(cpu_addr);
+}
+
+/*
+ * Check an address returned by vring_map_single(). Without the DMA API
+ * no mapping took place, so this always reports success (0).
+ */
+static int vring_mapping_error(const struct vring_virtqueue *vq,
+			       dma_addr_t addr)
+{
+	if (vq->use_dma_api)
+		return dma_mapping_error(vring_dma_dev(vq), addr);
+
+	return 0;
+}
+
+/*
+ * Reset the ring-independent bookkeeping of @vq for a ring with @num
+ * free descriptors.
+ */
+static void virtqueue_init(struct vring_virtqueue *vq, u32 num)
+{
+	vq->vq.num_free = num;
+
+	/* A packed ring starts at index 0 with the used wrap counter bit set. */
+	vq->last_used_idx = vq->packed_ring ?
+			    (0 | (1 << VRING_PACKED_EVENT_F_WRAP_CTR)) : 0;
+
+	vq->event_triggered = false;
+	vq->num_added = 0;
+
+#ifdef DEBUG
+	vq->in_use = false;
+	vq->last_add_time_valid = false;
+#endif
+}
+
+
+/*
+ * Split ring specific functions - *_split().
+ */
+
+/*
+ * Unmap the buffer described by one entry of an indirect descriptor
+ * table. Does nothing unless this code is responsible for unmapping.
+ */
+static void vring_unmap_one_split_indirect(const struct vring_virtqueue *vq,
+					   const struct vring_desc *desc)
+{
+	enum dma_data_direction dir;
+	u16 flags;
+
+	if (!vq->do_unmap)
+		return;
+
+	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
+	/* Device-writable buffers were mapped DMA_FROM_DEVICE. */
+	dir = (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+
+	dma_unmap_page(vring_dma_dev(vq),
+		       virtio64_to_cpu(vq->vq.vdev, desc->addr),
+		       virtio32_to_cpu(vq->vq.vdev, desc->len),
+		       dir);
+}
+
+/*
+ * Unmap ring descriptor @i using the driver-private copy of its fields.
+ *
+ * Returns the index of the next descriptor in the chain as recorded in
+ * desc_extra.
+ */
+static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
+					  unsigned int i)
+{
+	const struct vring_desc_extra *entry = &vq->split.desc_extra[i];
+	u16 flags = entry->flags;
+	enum dma_data_direction dir =
+		(flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+
+	if (flags & VRING_DESC_F_INDIRECT) {
+		/* The indirect table itself is unmapped whenever the DMA API is used. */
+		if (vq->use_dma_api)
+			dma_unmap_single(vring_dma_dev(vq),
+					 entry->addr, entry->len, dir);
+	} else if (vq->do_unmap) {
+		dma_unmap_page(vring_dma_dev(vq),
+			       entry->addr, entry->len, dir);
+	}
+
+	return entry->next;
+}
+
+/*
+ * Allocate an indirect descriptor table with @total_sg entries and chain
+ * the entries together through their next fields. Returns NULL if the
+ * allocation fails.
+ */
+static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
+					       unsigned int total_sg,
+					       gfp_t gfp)
+{
+	struct vring_desc *table;
+	unsigned int idx;
+
+	/*
+	 * We require lowmem mappings for the descriptors because
+	 * otherwise virt_to_phys will give us bogus addresses in the
+	 * virtqueue.
+	 */
+	table = kmalloc_array(total_sg, sizeof(struct vring_desc),
+			      gfp & ~__GFP_HIGHMEM);
+	if (!table)
+		return NULL;
+
+	/* Entry idx links to idx + 1. */
+	for (idx = 0; idx < total_sg; idx++)
+		table[idx].next = cpu_to_virtio16(_vq->vdev, idx + 1);
+
+	return table;
+}
+
+/*
+ * Fill in descriptor @i of @desc, which is either the ring's descriptor
+ * table or an indirect table (@indirect).
+ *
+ * For ring descriptors the next link comes from desc_extra and the
+ * address, length and flags are mirrored there as well; for indirect
+ * entries the pre-initialised next field of the entry is used.
+ *
+ * Returns the index of the next descriptor to fill.
+ */
+static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq,
+						    struct vring_desc *desc,
+						    unsigned int i,
+						    dma_addr_t addr,
+						    unsigned int len,
+						    u16 flags,
+						    bool indirect)
+{
+	struct vring_desc_extra *shadow;
+
+	desc[i].flags = cpu_to_virtio16(vq->vdev, flags);
+	desc[i].addr = cpu_to_virtio64(vq->vdev, addr);
+	desc[i].len = cpu_to_virtio32(vq->vdev, len);
+
+	if (indirect)
+		return virtio16_to_cpu(vq->vdev, desc[i].next);
+
+	shadow = &to_vvq(vq)->split.desc_extra[i];
+
+	desc[i].next = cpu_to_virtio16(vq->vdev, shadow->next);
+
+	shadow->addr = addr;
+	shadow->len = len;
+	shadow->flags = flags;
+
+	return shadow->next;
+}
+
+/*
+ * Add a buffer described by @sgs to a split virtqueue.
+ *
+ * The first @out_sgs scatterlists are mapped DMA_TO_DEVICE; the following
+ * @in_sgs are mapped DMA_FROM_DEVICE and flagged VRING_DESC_F_WRITE. When
+ * virtqueue_use_indirect() allows it and the table allocation succeeds,
+ * all entries go into a newly allocated indirect table that occupies a
+ * single ring descriptor; otherwise one ring descriptor is used per
+ * scatterlist entry.
+ *
+ * @data is stored as the buffer's token and must not be NULL. @ctx is
+ * stored in the indir_desc slot when no indirect table is used, and must
+ * be NULL on queues with indirect support.
+ *
+ * Returns 0 on success, -EIO if the queue is marked broken, -ENOSPC if
+ * there are not enough free descriptors, or -ENOMEM if a DMA mapping
+ * fails.
+ */
+static inline int virtqueue_add_split(struct virtqueue *_vq,
+ struct scatterlist *sgs[],
+ unsigned int total_sg,
+ unsigned int out_sgs,
+ unsigned int in_sgs,
+ void *data,
+ void *ctx,
+ gfp_t gfp)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ struct scatterlist *sg;
+ struct vring_desc *desc;
+ unsigned int i, n, avail, descs_used, prev, err_idx;
+ int head;
+ bool indirect;
+
+ START_USE(vq);
+
+ BUG_ON(data == NULL);
+ BUG_ON(ctx && vq->indirect);
+
+ if (unlikely(vq->broken)) {
+ END_USE(vq);
+ return -EIO;
+ }
+
+ LAST_ADD_TIME_UPDATE(vq);
+
+ BUG_ON(total_sg == 0);
+
+ head = vq->free_head;
+
+ /* A failed indirect allocation falls back to direct descriptors. */
+ if (virtqueue_use_indirect(vq, total_sg))
+ desc = alloc_indirect_split(_vq, total_sg, gfp);
+ else {
+ desc = NULL;
+ WARN_ON_ONCE(total_sg > vq->split.vring.num && !vq->indirect);
+ }
+
+ if (desc) {
+ /* Use a single buffer which doesn't continue */
+ indirect = true;
+ /* Set up rest to use this indirect table. */
+ i = 0;
+ descs_used = 1;
+ } else {
+ indirect = false;
+ desc = vq->split.vring.desc;
+ i = head;
+ descs_used = total_sg;
+ }
+
+ if (unlikely(vq->vq.num_free < descs_used)) {
+ pr_debug("Can't add buf len %i - avail = %i\n",
+ descs_used, vq->vq.num_free);
+ /* FIXME: for historical reasons, we force a notify here if
+ * there are outgoing parts to the buffer. Presumably the
+ * host should service the ring ASAP. */
+ if (out_sgs)
+ vq->notify(&vq->vq);
+ if (indirect)
+ kfree(desc);
+ END_USE(vq);
+ return -ENOSPC;
+ }
+
+ /* Device-readable (out) buffers come first. */
+ for (n = 0; n < out_sgs; n++) {
+ for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+ dma_addr_t addr;
+
+ if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr))
+ goto unmap_release;
+
+ prev = i;
+ /* Note that we trust the indirect descriptor
+ * table since it uses a streaming DMA mapping.
+ */
+ i = virtqueue_add_desc_split(_vq, desc, i, addr, sg->length,
+ VRING_DESC_F_NEXT,
+ indirect);
+ }
+ }
+ /* Device-writable (in) buffers follow. */
+ for (; n < (out_sgs + in_sgs); n++) {
+ for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+ dma_addr_t addr;
+
+ if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr))
+ goto unmap_release;
+
+ prev = i;
+ /* Note that we trust the indirect descriptor
+ * table since it uses a streaming DMA mapping.
+ */
+ i = virtqueue_add_desc_split(_vq, desc, i, addr,
+ sg->length,
+ VRING_DESC_F_NEXT |
+ VRING_DESC_F_WRITE,
+ indirect);
+ }
+ }
+ /* Last one doesn't continue. */
+ desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
+ if (!indirect && vq->do_unmap)
+ vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags &=
+ ~VRING_DESC_F_NEXT;
+
+ if (indirect) {
+ /* Now that the indirect table is filled in, map it. */
+ dma_addr_t addr = vring_map_single(
+ vq, desc, total_sg * sizeof(struct vring_desc),
+ DMA_TO_DEVICE);
+ if (vring_mapping_error(vq, addr)) {
+ /* Premapped buffers are not ours to unmap. */
+ if (vq->premapped)
+ goto free_indirect;
+
+ goto unmap_release;
+ }
+
+ virtqueue_add_desc_split(_vq, vq->split.vring.desc,
+ head, addr,
+ total_sg * sizeof(struct vring_desc),
+ VRING_DESC_F_INDIRECT,
+ false);
+ }
+
+ /* We're using some buffers from the free list. */
+ vq->vq.num_free -= descs_used;
+
+ /* Update free pointer */
+ if (indirect)
+ vq->free_head = vq->split.desc_extra[head].next;
+ else
+ vq->free_head = i;
+
+ /* Store token and indirect buffer state. */
+ vq->split.desc_state[head].data = data;
+ if (indirect)
+ vq->split.desc_state[head].indir_desc = desc;
+ else
+ vq->split.desc_state[head].indir_desc = ctx;
+
+ /* Put entry in available array (but don't update avail->idx until they
+ * do sync). */
+ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
+ vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);
+
+ /* Descriptors and available array need to be set before we expose the
+ * new available array entries. */
+ virtio_wmb(vq->weak_barriers);
+ vq->split.avail_idx_shadow++;
+ vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
+ vq->split.avail_idx_shadow);
+ vq->num_added++;
+
+ pr_debug("Added buffer head %i to %p\n", head, vq);
+ END_USE(vq);
+
+ /* This is very unlikely, but theoretically possible. Kick
+ * just in case. */
+ if (unlikely(vq->num_added == (1 << 16) - 1))
+ virtqueue_kick(_vq);
+
+ return 0;
+
+unmap_release:
+ /* Undo the mappings made so far, stopping at the entry that failed. */
+ err_idx = i;
+
+ if (indirect)
+ i = 0;
+ else
+ i = head;
+
+ for (n = 0; n < total_sg; n++) {
+ if (i == err_idx)
+ break;
+ if (indirect) {
+ vring_unmap_one_split_indirect(vq, &desc[i]);
+ i = virtio16_to_cpu(_vq->vdev, desc[i].next);
+ } else
+ i = vring_unmap_one_split(vq, i);
+ }
+
+free_indirect:
+ if (indirect)
+ kfree(desc);
+
+ END_USE(vq);
+ return -ENOMEM;
+}
+
+/*
+ * Decide whether the device has to be notified about the buffers added
+ * since the last call, and reset the num_added counter.
+ *
+ * Returns true if a notification is needed.
+ */
+static bool virtqueue_kick_prepare_split(struct virtqueue *_vq)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ u16 new, old;
+ bool needs_kick;
+
+ START_USE(vq);
+ /* We need to expose available array entries before checking avail
+ * event. */
+ virtio_mb(vq->weak_barriers);
+
+ /* old/new are the avail index before and after the pending additions. */
+ old = vq->split.avail_idx_shadow - vq->num_added;
+ new = vq->split.avail_idx_shadow;
+ vq->num_added = 0;
+
+ LAST_ADD_TIME_CHECK(vq);
+ LAST_ADD_TIME_INVALID(vq);
+
+ if (vq->event) {
+ /* Event index in use: compare against the published avail event. */
+ needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev,
+ vring_avail_event(&vq->split.vring)),
+ new, old);
+ } else {
+ /* Otherwise kick unless the used ring flags say not to. */
+ needs_kick = !(vq->split.vring.used->flags &
+ cpu_to_virtio16(_vq->vdev,
+ VRING_USED_F_NO_NOTIFY));
+ }
+ END_USE(vq);
+ return needs_kick;
+}
+
+/*
+ * Release the descriptor chain starting at @head: unmap every descriptor
+ * in the chain, put the chain back at the front of the free list and free
+ * the indirect table, if there is one.
+ *
+ * On queues without indirect support, the ctx pointer stored when the
+ * buffer was added is returned through @ctx if @ctx is non-NULL.
+ */
+static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
+ void **ctx)
+{
+ unsigned int i, j;
+ __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
+
+ /* Clear data ptr. */
+ vq->split.desc_state[head].data = NULL;
+
+ /* Put back on free list: unmap first-level descriptors and find end */
+ i = head;
+
+ while (vq->split.vring.desc[i].flags & nextflag) {
+ vring_unmap_one_split(vq, i);
+ i = vq->split.desc_extra[i].next;
+ vq->vq.num_free++;
+ }
+
+ /* i is now the last descriptor: link it to the old free list head. */
+ vring_unmap_one_split(vq, i);
+ vq->split.desc_extra[i].next = vq->free_head;
+ vq->free_head = head;
+
+ /* Plus final descriptor */
+ vq->vq.num_free++;
+
+ if (vq->indirect) {
+ struct vring_desc *indir_desc =
+ vq->split.desc_state[head].indir_desc;
+ u32 len;
+
+ /* Free the indirect table, if any, now that it's unmapped. */
+ if (!indir_desc)
+ return;
+
+ /* len is the byte size of the indirect table. */
+ len = vq->split.desc_extra[head].len;
+
+ BUG_ON(!(vq->split.desc_extra[head].flags &
+ VRING_DESC_F_INDIRECT));
+ BUG_ON(len == 0 || len % sizeof(struct vring_desc));
+
+ if (vq->do_unmap) {
+ for (j = 0; j < len / sizeof(struct vring_desc); j++)
+ vring_unmap_one_split_indirect(vq, &indir_desc[j]);
+ }
+
+ kfree(indir_desc);
+ vq->split.desc_state[head].indir_desc = NULL;
+ } else if (ctx) {
+ *ctx = vq->split.desc_state[head].indir_desc;
+ }
+}
+
+/*
+ * Returns true if the used index in the ring differs from the last used
+ * index the driver has processed.
+ */
+static bool more_used_split(const struct vring_virtqueue *vq)
+{
+	u16 used_idx = virtio16_to_cpu(vq->vq.vdev,
+				       vq->split.vring.used->idx);
+
+	return vq->last_used_idx != used_idx;
+}
+
+/*
+ * Fetch the next used buffer from a split virtqueue.
+ *
+ * On success the buffer's token is returned, the length reported in the
+ * used ring is stored in @len and the descriptors are released via
+ * detach_buf_split(). Returns NULL if the queue is broken, if there is
+ * nothing new in the used ring, or if the used entry is invalid (in which
+ * case BAD_RING() is invoked).
+ */
+static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
+ unsigned int *len,
+ void **ctx)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ void *ret;
+ unsigned int i;
+ u16 last_used;
+
+ START_USE(vq);
+
+ if (unlikely(vq->broken)) {
+ END_USE(vq);
+ return NULL;
+ }
+
+ if (!more_used_split(vq)) {
+ pr_debug("No more buffers in queue\n");
+ END_USE(vq);
+ return NULL;
+ }
+
+ /* Only get used array entries after they have been exposed by host. */
+ virtio_rmb(vq->weak_barriers);
+
+ last_used = (vq->last_used_idx & (vq->split.vring.num - 1));
+ i = virtio32_to_cpu(_vq->vdev,
+ vq->split.vring.used->ring[last_used].id);
+ *len = virtio32_to_cpu(_vq->vdev,
+ vq->split.vring.used->ring[last_used].len);
+
+ /*
+ * The two error paths below return without END_USE(): with DEBUG,
+ * BAD_RING() calls BUG(); without DEBUG, END_USE() is empty.
+ */
+ if (unlikely(i >= vq->split.vring.num)) {
+ BAD_RING(vq, "id %u out of range\n", i);
+ return NULL;
+ }
+ if (unlikely(!vq->split.desc_state[i].data)) {
+ BAD_RING(vq, "id %u is not a head!\n", i);
+ return NULL;
+ }
+
+ /* detach_buf_split clears data, so grab it now. */
+ ret = vq->split.desc_state[i].data;
+ detach_buf_split(vq, i, ctx);
+ vq->last_used_idx++;
+ /* If we expect an interrupt for the next entry, tell host
+ * by writing event index and flush out the write before
+ * the read in the next get_buf call. */
+ if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
+ virtio_store_mb(vq->weak_barriers,
+ &vring_used_event(&vq->split.vring),
+ cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
+
+ LAST_ADD_TIME_INVALID(vq);
+
+ END_USE(vq);
+ return ret;
+}
+
+/*
+ * Ask the device not to interrupt us for used buffers. The request is
+ * recorded in the shadow flags; the ring itself is only written when
+ * needed.
+ */
+static void virtqueue_disable_cb_split(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	/* Already disabled: nothing to do. */
+	if (vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)
+		return;
+
+	vq->split.avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
+
+	/*
+	 * If device triggered an event already it won't trigger one again:
+	 * no need to disable.
+	 */
+	if (vq->event_triggered)
+		return;
+
+	if (vq->event) {
+		/* TODO: this is a hack. Figure out a cleaner value to write. */
+		vring_used_event(&vq->split.vring) = 0x0;
+	} else {
+		vq->split.vring.avail->flags =
+			cpu_to_virtio16(_vq->vdev,
+					vq->split.avail_flags_shadow);
+	}
+}
+
+/*
+ * Re-enable used buffer notifications.
+ *
+ * Returns the last used index at the time notifications were enabled,
+ * for the caller to pass to virtqueue_poll_split().
+ */
+static unsigned int virtqueue_enable_cb_prepare_split(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 last_used_idx;
+
+	START_USE(vq);
+
+	/*
+	 * We optimistically turn back on interrupts, then check if there was
+	 * more to do.
+	 *
+	 * Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
+	 * either clear the flags bit or point the event index at the next
+	 * entry. Always do both to keep code simple.
+	 */
+	if (vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
+		vq->split.avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
+		if (!vq->event)
+			vq->split.vring.avail->flags =
+				cpu_to_virtio16(_vq->vdev,
+						vq->split.avail_flags_shadow);
+	}
+
+	last_used_idx = vq->last_used_idx;
+	vring_used_event(&vq->split.vring) =
+		cpu_to_virtio16(_vq->vdev, last_used_idx);
+
+	END_USE(vq);
+	return last_used_idx;
+}
+
+/*
+ * Returns true if the ring's used index differs from @last_used_idx
+ * (truncated to 16 bits).
+ */
+static bool virtqueue_poll_split(struct virtqueue *_vq, unsigned int last_used_idx)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 used_idx = virtio16_to_cpu(_vq->vdev, vq->split.vring.used->idx);
+
+	return (u16)last_used_idx != used_idx;
+}
+
+/*
+ * Re-enable used buffer notifications, but ask to be notified only after
+ * about three quarters of the outstanding buffers have been used.
+ *
+ * Returns false if the device has already used more buffers than that
+ * threshold by the time the event index was written, true otherwise.
+ */
+static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ u16 bufs;
+
+ START_USE(vq);
+
+ /* We optimistically turn back on interrupts, then check if there was
+ * more to do. */
+ /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
+ * either clear the flags bit or point the event index at the next
+ * entry. Always update the event index to keep code simple. */
+ if (vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
+ vq->split.avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
+ if (!vq->event)
+ vq->split.vring.avail->flags =
+ cpu_to_virtio16(_vq->vdev,
+ vq->split.avail_flags_shadow);
+ }
+ /* TODO: tune this threshold */
+ /* The u16 cast keeps the index difference correct across wrap-around. */
+ bufs = (u16)(vq->split.avail_idx_shadow - vq->last_used_idx) * 3 / 4;
+
+ /* Publish the event index, with a barrier before used->idx is read. */
+ virtio_store_mb(vq->weak_barriers,
+ &vring_used_event(&vq->split.vring),
+ cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));
+
+ if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->split.vring.used->idx)
+ - vq->last_used_idx) > bufs)) {
+ END_USE(vq);
+ return false;
+ }
+
+ END_USE(vq);
+ return true;
+}
+
+/*
+ * Detach one buffer that is still outstanding in the ring and return its
+ * token, rewinding the avail index by one. Returns NULL once no buffer is
+ * left.
+ */
+static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	unsigned int head;
+	void *token;
+
+	START_USE(vq);
+
+	for (head = 0; head < vq->split.vring.num; head++) {
+		/* detach_buf_split clears data, so grab it now. */
+		token = vq->split.desc_state[head].data;
+		if (!token)
+			continue;
+
+		detach_buf_split(vq, head, NULL);
+		vq->split.avail_idx_shadow--;
+		vq->split.vring.avail->idx =
+			cpu_to_virtio16(_vq->vdev,
+					vq->split.avail_idx_shadow);
+		END_USE(vq);
+		return token;
+	}
+
+	/* That should have freed everything. */
+	BUG_ON(vq->vq.num_free != vq->split.vring.num);
+
+	END_USE(vq);
+	return NULL;
+}
+
+/*
+ * Reset the avail shadow state of @vring_split. If @vq has no callback,
+ * interrupts are suppressed from the start.
+ */
+static void virtqueue_vring_init_split(struct vring_virtqueue_split *vring_split,
+				       struct vring_virtqueue *vq)
+{
+	vring_split->avail_flags_shadow = 0;
+	vring_split->avail_idx_shadow = 0;
+
+	if (vq->vq.callback)
+		return;
+
+	/* No callback?  Tell other side not to bother us. */
+	vring_split->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
+	if (!vq->event)
+		vring_split->vring.avail->flags =
+			cpu_to_virtio16(vq->vq.vdev,
+					vring_split->avail_flags_shadow);
+}
+
+/*
+ * Bring an existing split ring back to its initial state: clear the
+ * avail/used flags, indices and event fields, then redo the common and
+ * split-specific initialisation.
+ */
+static void virtqueue_reinit_split(struct vring_virtqueue *vq)
+{
+	struct vring *ring = &vq->split.vring;
+	int num = ring->num;
+
+	ring->avail->flags = 0;
+	ring->avail->idx = 0;
+
+	/* reset avail event */
+	ring->avail->ring[num] = 0;
+
+	ring->used->flags = 0;
+	ring->used->idx = 0;
+
+	/* reset used event */
+	*(__virtio16 *)&(ring->used->ring[num]) = 0;
+
+	virtqueue_init(vq, num);
+
+	virtqueue_vring_init_split(&vq->split, vq);
+}
+
+/*
+ * Install @vring_split as the split ring state of @vq (by structure copy)
+ * and restart the free list at descriptor 0.
+ */
+static void virtqueue_vring_attach_split(struct vring_virtqueue *vq,
+ struct vring_virtqueue_split *vring_split)
+{
+ vq->split = *vring_split;
+
+ /* Put everything in free lists. */
+ vq->free_head = 0;
+}
+
+/*
+ * Allocate the per-descriptor state and extra arrays for a split ring
+ * with vring_split->vring.num entries and store them in @vring_split.
+ * The state array is zero-initialised.
+ *
+ * Returns 0 on success or -ENOMEM, in which case nothing is left
+ * allocated and @vring_split is not modified.
+ */
+static int vring_alloc_state_extra_split(struct vring_virtqueue_split *vring_split)
+{
+	struct vring_desc_state_split *state;
+	struct vring_desc_extra *extra;
+	u32 num = vring_split->vring.num;
+
+	/* kcalloc() zeroes the array and checks the size multiplication. */
+	state = kcalloc(num, sizeof(struct vring_desc_state_split), GFP_KERNEL);
+	if (!state)
+		goto err_state;
+
+	extra = vring_alloc_desc_extra(num);
+	if (!extra)
+		goto err_extra;
+
+	vring_split->desc_state = state;
+	vring_split->desc_extra = extra;
+	return 0;
+
+err_extra:
+	kfree(state);
+err_state:
+	return -ENOMEM;
+}
+
+/*
+ * Release everything hanging off a split-ring description: the ring memory
+ * itself and the two driver-side per-descriptor arrays.
+ */
+static void vring_free_split(struct vring_virtqueue_split *vring_split,
+			     struct virtio_device *vdev, struct device *dma_dev)
+{
+	/* Ring memory, described by its size, address and DMA address. */
+	vring_free_queue(vdev,
+			 vring_split->queue_size_in_bytes,
+			 vring_split->vring.desc,
+			 vring_split->queue_dma_addr,
+			 dma_dev);
+
+	/* Driver-only bookkeeping arrays. */
+	kfree(vring_split->desc_state);
+	kfree(vring_split->desc_extra);
+}
+
+/*
+ * Allocate the ring memory for a split virtqueue of (at most) @num entries
+ * and describe it in @vring_split.
+ *
+ * While the ring would be larger than a page, allocation is attempted
+ * without warnings; each failure halves @num if @may_reduce_num allows it.
+ * Once the ring fits in a page a final attempt is made.
+ *
+ * Returns 0 on success, -EINVAL if @num is not a power of 2, or -ENOMEM.
+ */
+static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
+				   struct virtio_device *vdev,
+				   u32 num,
+				   unsigned int vring_align,
+				   bool may_reduce_num,
+				   struct device *dma_dev)
+{
+	dma_addr_t dma_addr;
+	void *queue = NULL;
+
+	/* We assume num is a power of 2. */
+	if (!is_power_of_2(num)) {
+		dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num);
+		return -EINVAL;
+	}
+
+	/* TODO: allocate each queue chunk individually */
+	while (num && vring_size(num, vring_align) > PAGE_SIZE) {
+		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
+					  &dma_addr,
+					  GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
+					  dma_dev);
+		if (queue)
+			break;
+
+		/* The caller insists on the requested size. */
+		if (!may_reduce_num)
+			return -ENOMEM;
+
+		num /= 2;
+	}
+
+	if (!num)
+		return -ENOMEM;
+
+	if (!queue) {
+		/* Try to get a single page. You are my only hope! */
+		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
+					  &dma_addr, GFP_KERNEL | __GFP_ZERO,
+					  dma_dev);
+		if (!queue)
+			return -ENOMEM;
+	}
+
+	vring_init(&vring_split->vring, num, queue, vring_align);
+
+	vring_split->queue_dma_addr = dma_addr;
+	vring_split->queue_size_in_bytes = vring_size(num, vring_align);
+
+	/* Remembered so that a later resize can reuse the same policy. */
+	vring_split->vring_align = vring_align;
+	vring_split->may_reduce_num = may_reduce_num;
+
+	return 0;
+}
+
+/*
+ * Allocate ring memory for a split virtqueue and build the virtqueue on top
+ * of it.  The resulting queue is marked as owning its ring.
+ *
+ * Returns the new virtqueue, or NULL if either step fails.
+ */
+static struct virtqueue *vring_create_virtqueue_split(
+	unsigned int index,
+	unsigned int num,
+	unsigned int vring_align,
+	struct virtio_device *vdev,
+	bool weak_barriers,
+	bool may_reduce_num,
+	bool context,
+	bool (*notify)(struct virtqueue *),
+	void (*callback)(struct virtqueue *),
+	const char *name,
+	struct device *dma_dev)
+{
+	struct vring_virtqueue_split vring_split = {};
+	struct virtqueue *vq;
+
+	if (vring_alloc_queue_split(&vring_split, vdev, num, vring_align,
+				    may_reduce_num, dma_dev))
+		return NULL;
+
+	vq = __vring_new_virtqueue(index, &vring_split, vdev, weak_barriers,
+				   context, notify, callback, name, dma_dev);
+	if (vq) {
+		to_vvq(vq)->we_own_ring = true;
+		return vq;
+	}
+
+	/* Building the virtqueue failed: give the ring memory back. */
+	vring_free_split(&vring_split, vdev, dma_dev);
+	return NULL;
+}
+
+/*
+ * Replace the split ring behind @_vq with a newly allocated one of @num
+ * entries, reusing the alignment and may_reduce_num policy recorded in the
+ * current ring.
+ *
+ * If any allocation fails the existing ring is kept and re-initialised.
+ *
+ * Returns 0 on success, -ENOMEM on any failure.
+ */
+static int virtqueue_resize_split(struct virtqueue *_vq, u32 num)
+{
+	struct vring_virtqueue_split vring_split = {};
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	struct virtio_device *vdev = _vq->vdev;
+
+	if (vring_alloc_queue_split(&vring_split, vdev, num,
+				    vq->split.vring_align,
+				    vq->split.may_reduce_num,
+				    vring_dma_dev(vq)))
+		goto keep_old_ring;
+
+	if (vring_alloc_state_extra_split(&vring_split)) {
+		vring_free_split(&vring_split, vdev, vring_dma_dev(vq));
+		goto keep_old_ring;
+	}
+
+	/* Everything needed is in hand; now the old ring can go. */
+	vring_free(&vq->vq);
+
+	virtqueue_vring_init_split(&vring_split, vq);
+
+	virtqueue_init(vq, vring_split.vring.num);
+	virtqueue_vring_attach_split(vq, &vring_split);
+
+	return 0;
+
+keep_old_ring:
+	virtqueue_reinit_split(vq);
+	return -ENOMEM;
+}
+
+
+/*
+ * Packed ring specific functions - *_packed().
+ */
+/* Extract the wrap-counter bit from a combined last_used_idx value. */
+static bool packed_used_wrap_counter(u16 last_used_idx)
+{
+	return (last_used_idx & (1 << VRING_PACKED_EVENT_F_WRAP_CTR)) != 0;
+}
+
+/*
+ * Extract the index part of a combined last_used_idx value, i.e. every bit
+ * below the wrap-counter bit.
+ */
+static u16 packed_last_used(u16 last_used_idx)
+{
+	return last_used_idx & ((1 << VRING_PACKED_EVENT_F_WRAP_CTR) - 1);
+}
+
+/*
+ * Undo the DMA mapping recorded in a desc_extra entry.
+ *
+ * Indirect tables are unmapped with dma_unmap_single() whenever the DMA API
+ * is in use; ordinary buffers are unmapped with dma_unmap_page() only when
+ * vq->do_unmap is set.  The direction follows VRING_DESC_F_WRITE.
+ */
+static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
+				     const struct vring_desc_extra *extra)
+{
+	u16 flags = extra->flags;
+	enum dma_data_direction dir = (flags & VRING_DESC_F_WRITE) ?
+				      DMA_FROM_DEVICE : DMA_TO_DEVICE;
+
+	if (flags & VRING_DESC_F_INDIRECT) {
+		if (vq->use_dma_api)
+			dma_unmap_single(vring_dma_dev(vq),
+					 extra->addr, extra->len, dir);
+		return;
+	}
+
+	if (vq->do_unmap)
+		dma_unmap_page(vring_dma_dev(vq),
+			       extra->addr, extra->len, dir);
+}
+
+/*
+ * Undo the DMA mapping described by a packed descriptor, reading address,
+ * length and flags (little-endian) from the descriptor itself.  Does nothing
+ * unless vq->do_unmap is set.
+ */
+static void vring_unmap_desc_packed(const struct vring_virtqueue *vq,
+				    const struct vring_packed_desc *desc)
+{
+	enum dma_data_direction dir;
+
+	if (!vq->do_unmap)
+		return;
+
+	dir = (le16_to_cpu(desc->flags) & VRING_DESC_F_WRITE) ?
+	      DMA_FROM_DEVICE : DMA_TO_DEVICE;
+
+	dma_unmap_page(vring_dma_dev(vq),
+		       le64_to_cpu(desc->addr),
+		       le32_to_cpu(desc->len),
+		       dir);
+}
+
+/*
+ * Allocate an (uninitialised) indirect descriptor table with @total_sg
+ * entries.  Returns NULL on allocation failure.
+ */
+static struct vring_packed_desc *alloc_indirect_packed(unsigned int total_sg,
+							 gfp_t gfp)
+{
+	/*
+	 * We require lowmem mappings for the descriptors because
+	 * otherwise virt_to_phys will give us bogus addresses in the
+	 * virtqueue.
+	 */
+	gfp &= ~__GFP_HIGHMEM;
+
+	return kmalloc_array(total_sg, sizeof(struct vring_packed_desc), gfp);
+}
+
+/*
+ * Add a buffer to a packed ring through a single indirect descriptor.
+ *
+ * All @total_sg scatterlist entries are written into a freshly allocated
+ * descriptor table; the ring itself only receives one descriptor (at
+ * next_avail_idx) that points at the table.
+ *
+ * @vq:       the virtqueue
+ * @sgs:      @out_sgs + @in_sgs scatterlists, readable ones first
+ * @total_sg: total number of entries across all lists
+ * @data:     token stored for the buffer
+ * @gfp:      allocation flags for the table
+ *
+ * Returns 0 on success, -ENOSPC if the ring has no free descriptor, or
+ * -ENOMEM if the table could not be allocated or a DMA mapping failed.
+ *
+ * The START_USE()/END_USE() section belongs to the caller: it is opened in
+ * virtqueue_add_packed() before this helper runs and is closed there on
+ * every path, including the fall back to direct descriptors after -ENOMEM.
+ * This helper therefore must not call END_USE() itself, otherwise the
+ * section would be closed twice for a single START_USE().
+ */
+static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
+					 struct scatterlist *sgs[],
+					 unsigned int total_sg,
+					 unsigned int out_sgs,
+					 unsigned int in_sgs,
+					 void *data,
+					 gfp_t gfp)
+{
+	struct vring_packed_desc *desc;
+	struct scatterlist *sg;
+	unsigned int i, n, err_idx;
+	u16 head, id;
+	dma_addr_t addr;
+
+	head = vq->packed.next_avail_idx;
+	desc = alloc_indirect_packed(total_sg, gfp);
+	if (!desc)
+		return -ENOMEM;
+
+	if (unlikely(vq->vq.num_free < 1)) {
+		pr_debug("Can't add buf len 1 - avail = 0\n");
+		kfree(desc);
+		return -ENOSPC;
+	}
+
+	i = 0;
+	id = vq->free_head;
+	BUG_ON(id == vq->packed.vring.num);
+
+	/* Map every sg entry and record it in the indirect table. */
+	for (n = 0; n < out_sgs + in_sgs; n++) {
+		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+			if (vring_map_one_sg(vq, sg, n < out_sgs ?
+					     DMA_TO_DEVICE : DMA_FROM_DEVICE, &addr))
+				goto unmap_release;
+
+			desc[i].flags = cpu_to_le16(n < out_sgs ?
+						0 : VRING_DESC_F_WRITE);
+			desc[i].addr = cpu_to_le64(addr);
+			desc[i].len = cpu_to_le32(sg->length);
+			i++;
+		}
+	}
+
+	/* Now that the indirect table is filled in, map it. */
+	addr = vring_map_single(vq, desc,
+			total_sg * sizeof(struct vring_packed_desc),
+			DMA_TO_DEVICE);
+	if (vring_mapping_error(vq, addr)) {
+		/* Premapped buffers were not mapped here: nothing to undo. */
+		if (vq->premapped)
+			goto free_desc;
+
+		goto unmap_release;
+	}
+
+	vq->packed.vring.desc[head].addr = cpu_to_le64(addr);
+	vq->packed.vring.desc[head].len = cpu_to_le32(total_sg *
+				sizeof(struct vring_packed_desc));
+	vq->packed.vring.desc[head].id = cpu_to_le16(id);
+
+	/* Remember the table mapping so it can be undone on detach. */
+	if (vq->do_unmap) {
+		vq->packed.desc_extra[id].addr = addr;
+		vq->packed.desc_extra[id].len = total_sg *
+				sizeof(struct vring_packed_desc);
+		vq->packed.desc_extra[id].flags = VRING_DESC_F_INDIRECT |
+						  vq->packed.avail_used_flags;
+	}
+
+	/*
+	 * A driver MUST NOT make the first descriptor in the list
+	 * available before all subsequent descriptors comprising
+	 * the list are made available.
+	 */
+	virtio_wmb(vq->weak_barriers);
+	vq->packed.vring.desc[head].flags = cpu_to_le16(VRING_DESC_F_INDIRECT |
+						vq->packed.avail_used_flags);
+
+	/* We're using some buffers from the free list. */
+	vq->vq.num_free -= 1;
+
+	/* Update free pointer */
+	n = head + 1;
+	if (n >= vq->packed.vring.num) {
+		/* Ran off the end of the ring: wrap and flip the avail state. */
+		n = 0;
+		vq->packed.avail_wrap_counter ^= 1;
+		vq->packed.avail_used_flags ^=
+				1 << VRING_PACKED_DESC_F_AVAIL |
+				1 << VRING_PACKED_DESC_F_USED;
+	}
+	vq->packed.next_avail_idx = n;
+	vq->free_head = vq->packed.desc_extra[id].next;
+
+	/* Store token and indirect buffer state. */
+	vq->packed.desc_state[id].num = 1;
+	vq->packed.desc_state[id].data = data;
+	vq->packed.desc_state[id].indir_desc = desc;
+	vq->packed.desc_state[id].last = id;
+
+	vq->num_added += 1;
+
+	pr_debug("Added buffer head %i to %p\n", head, vq);
+
+	return 0;
+
+unmap_release:
+	/* Undo the mappings of the entries filled in so far. */
+	err_idx = i;
+
+	for (i = 0; i < err_idx; i++)
+		vring_unmap_desc_packed(vq, &desc[i]);
+
+free_desc:
+	kfree(desc);
+
+	return -ENOMEM;
+}
+
+/*
+ * Add a buffer to a packed virtqueue.
+ *
+ * An indirect descriptor is tried first when virtqueue_use_indirect() says
+ * so; if that attempt returns -ENOMEM the buffer is added with one direct
+ * descriptor per scatterlist entry instead.
+ *
+ * Returns 0 on success, -EIO if the queue is broken or a DMA mapping fails
+ * on the direct path, -ENOSPC if there are not enough free descriptors, or
+ * whatever non -ENOMEM value the indirect attempt returned.
+ */
+static inline int virtqueue_add_packed(struct virtqueue *_vq,
+				       struct scatterlist *sgs[],
+				       unsigned int total_sg,
+				       unsigned int out_sgs,
+				       unsigned int in_sgs,
+				       void *data,
+				       void *ctx,
+				       gfp_t gfp)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	struct vring_packed_desc *desc;
+	struct scatterlist *sg;
+	unsigned int slot, grp, filled, descs_used, stop;
+	__le16 head_flags, flags;
+	u16 head, id, prev, curr, saved_flags;
+	int err;
+
+	START_USE(vq);
+
+	BUG_ON(data == NULL);
+	BUG_ON(ctx && vq->indirect);
+
+	if (unlikely(vq->broken)) {
+		END_USE(vq);
+		return -EIO;
+	}
+
+	LAST_ADD_TIME_UPDATE(vq);
+
+	BUG_ON(total_sg == 0);
+
+	if (virtqueue_use_indirect(vq, total_sg)) {
+		err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs,
+						    in_sgs, data, gfp);
+		if (err != -ENOMEM) {
+			END_USE(vq);
+			return err;
+		}
+
+		/* fall back on direct */
+	}
+
+	head = vq->packed.next_avail_idx;
+	/* Kept so the error path can roll avail_used_flags back. */
+	saved_flags = vq->packed.avail_used_flags;
+
+	WARN_ON_ONCE(total_sg > vq->packed.vring.num && !vq->indirect);
+
+	desc = vq->packed.vring.desc;
+	slot = head;
+	descs_used = total_sg;
+
+	if (unlikely(vq->vq.num_free < descs_used)) {
+		pr_debug("Can't add buf len %i - avail = %i\n",
+			 descs_used, vq->vq.num_free);
+		END_USE(vq);
+		return -ENOSPC;
+	}
+
+	id = vq->free_head;
+	BUG_ON(id == vq->packed.vring.num);
+
+	curr = id;
+	filled = 0;
+	for (grp = 0; grp < out_sgs + in_sgs; grp++) {
+		/* Lists after the first out_sgs are written by the device. */
+		bool device_writes = grp >= out_sgs;
+
+		for (sg = sgs[grp]; sg; sg = sg_next(sg)) {
+			dma_addr_t addr;
+			u16 bits;
+
+			if (vring_map_one_sg(vq, sg, device_writes ?
+					     DMA_FROM_DEVICE : DMA_TO_DEVICE,
+					     &addr))
+				goto unmap_release;
+
+			bits = vq->packed.avail_used_flags;
+			if (++filled != total_sg)
+				bits |= VRING_DESC_F_NEXT;
+			if (device_writes)
+				bits |= VRING_DESC_F_WRITE;
+			flags = cpu_to_le16(bits);
+
+			/* The head's flags are held back and written last. */
+			if (slot == head)
+				head_flags = flags;
+			else
+				desc[slot].flags = flags;
+
+			desc[slot].addr = cpu_to_le64(addr);
+			desc[slot].len = cpu_to_le32(sg->length);
+			desc[slot].id = cpu_to_le16(id);
+
+			if (unlikely(vq->do_unmap)) {
+				vq->packed.desc_extra[curr].addr = addr;
+				vq->packed.desc_extra[curr].len = sg->length;
+				vq->packed.desc_extra[curr].flags =
+					le16_to_cpu(flags);
+			}
+			prev = curr;
+			curr = vq->packed.desc_extra[curr].next;
+
+			slot++;
+			if (unlikely(slot >= vq->packed.vring.num)) {
+				/* Wrapped: flip both avail and used bits. */
+				slot = 0;
+				vq->packed.avail_used_flags ^=
+					1 << VRING_PACKED_DESC_F_AVAIL |
+					1 << VRING_PACKED_DESC_F_USED;
+			}
+		}
+	}
+
+	if (slot <= head)
+		vq->packed.avail_wrap_counter ^= 1;
+
+	/* We're using some buffers from the free list. */
+	vq->vq.num_free -= descs_used;
+
+	/* Update free pointer */
+	vq->packed.next_avail_idx = slot;
+	vq->free_head = curr;
+
+	/* Store token. */
+	vq->packed.desc_state[id].num = descs_used;
+	vq->packed.desc_state[id].data = data;
+	vq->packed.desc_state[id].indir_desc = ctx;
+	vq->packed.desc_state[id].last = prev;
+
+	/*
+	 * A driver MUST NOT make the first descriptor in the list
+	 * available before all subsequent descriptors comprising
+	 * the list are made available.
+	 */
+	virtio_wmb(vq->weak_barriers);
+	vq->packed.vring.desc[head].flags = head_flags;
+	vq->num_added += descs_used;
+
+	pr_debug("Added buffer head %i to %p\n", head, vq);
+	END_USE(vq);
+
+	return 0;
+
+unmap_release:
+	/* Walk from the head up to the slot that failed and unmap each one. */
+	stop = slot;
+	slot = head;
+	curr = vq->free_head;
+
+	vq->packed.avail_used_flags = saved_flags;
+
+	for (grp = 0; grp < total_sg && slot != stop; grp++) {
+		vring_unmap_extra_packed(vq, &vq->packed.desc_extra[curr]);
+		curr = vq->packed.desc_extra[curr].next;
+		slot++;
+		if (slot >= vq->packed.vring.num)
+			slot = 0;
+	}
+
+	END_USE(vq);
+	return -EIO;
+}
+
+/*
+ * Decide whether the device needs a notification for the buffers added
+ * since the last call, based on the device event suppression area.
+ * Resets vq->num_added.
+ *
+ * Returns true if the caller should notify the device.
+ */
+static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 new, old, off_wrap, flags, wrap_counter, event_idx;
+	bool needs_kick;
+	/* Lets both 16-bit fields be read with one 32-bit load. */
+	union {
+		struct {
+			__le16 off_wrap;
+			__le16 flags;
+		};
+		u32 u32;
+	} snapshot;
+
+	START_USE(vq);
+
+	/*
+	 * We need to expose the new flags value before checking notification
+	 * suppressions.
+	 */
+	virtio_mb(vq->weak_barriers);
+
+	new = vq->packed.next_avail_idx;
+	old = new - vq->num_added;
+	vq->num_added = 0;
+
+	snapshot.u32 = *(u32 *)vq->packed.vring.device;
+	flags = le16_to_cpu(snapshot.flags);
+
+	LAST_ADD_TIME_CHECK(vq);
+	LAST_ADD_TIME_INVALID(vq);
+
+	if (flags == VRING_PACKED_EVENT_FLAG_DESC) {
+		off_wrap = le16_to_cpu(snapshot.off_wrap);
+
+		wrap_counter = off_wrap >> VRING_PACKED_EVENT_F_WRAP_CTR;
+		event_idx = off_wrap & ~(1 << VRING_PACKED_EVENT_F_WRAP_CTR);
+		/* Event index refers to the other lap of the ring. */
+		if (wrap_counter != vq->packed.avail_wrap_counter)
+			event_idx -= vq->packed.vring.num;
+
+		needs_kick = vring_need_event(event_idx, new, old);
+	} else {
+		/* Anything but "disable" means: always notify. */
+		needs_kick = (flags != VRING_PACKED_EVENT_FLAG_DISABLE);
+	}
+
+	END_USE(vq);
+	return needs_kick;
+}
+
+/*
+ * Return the descriptors of buffer @id to the free list, undo their DMA
+ * mappings and release the indirect table if there is one.
+ *
+ * When the queue does not use indirect descriptors and @ctx is non-NULL,
+ * the context pointer stored with the buffer is handed back through @ctx.
+ */
+static void detach_buf_packed(struct vring_virtqueue *vq,
+			      unsigned int id, void **ctx)
+{
+	struct vring_desc_state_packed *state = &vq->packed.desc_state[id];
+	struct vring_packed_desc *desc;
+	unsigned int i, curr, count;
+
+	/* Clear data ptr. */
+	state->data = NULL;
+
+	/* Splice the whole chain onto the front of the free list. */
+	vq->packed.desc_extra[state->last].next = vq->free_head;
+	vq->free_head = id;
+	vq->vq.num_free += state->num;
+
+	if (unlikely(vq->do_unmap)) {
+		curr = id;
+		for (i = 0; i < state->num; i++) {
+			vring_unmap_extra_packed(vq,
+						 &vq->packed.desc_extra[curr]);
+			curr = vq->packed.desc_extra[curr].next;
+		}
+	}
+
+	if (!vq->indirect) {
+		/* Here indir_desc carries the caller's context instead. */
+		if (ctx)
+			*ctx = state->indir_desc;
+		return;
+	}
+
+	/* Free the indirect table, if any, now that it's unmapped. */
+	desc = state->indir_desc;
+	if (!desc)
+		return;
+
+	if (vq->do_unmap) {
+		/* The recorded length is that of the table in bytes. */
+		count = vq->packed.desc_extra[id].len /
+			sizeof(struct vring_packed_desc);
+		for (i = 0; i < count; i++)
+			vring_unmap_desc_packed(vq, &desc[i]);
+	}
+	kfree(desc);
+	state->indir_desc = NULL;
+}
+
+/*
+ * A descriptor counts as used when its AVAIL and USED flag bits are equal
+ * to each other and to @used_wrap_counter.
+ */
+static inline bool is_used_desc_packed(const struct vring_virtqueue *vq,
+				       u16 idx, bool used_wrap_counter)
+{
+	u16 flags = le16_to_cpu(vq->packed.vring.desc[idx].flags);
+	bool avail = (flags & (1 << VRING_PACKED_DESC_F_AVAIL)) != 0;
+	bool used = (flags & (1 << VRING_PACKED_DESC_F_USED)) != 0;
+
+	if (avail != used)
+		return false;
+
+	return used == used_wrap_counter;
+}
+
+/* Is the descriptor at the driver's last-used position marked as used? */
+static bool more_used_packed(const struct vring_virtqueue *vq)
+{
+	/* One snapshot, then split into index and wrap counter. */
+	u16 snapshot = READ_ONCE(vq->last_used_idx);
+
+	return is_used_desc_packed(vq, packed_last_used(snapshot),
+				   packed_used_wrap_counter(snapshot));
+}
+
+/*
+ * Fetch the next used buffer from a packed virtqueue.
+ *
+ * On success the buffer's token is returned, *@len receives the length
+ * field of the used descriptor, and the buffer is detached (see
+ * detach_buf_packed() for what happens to @ctx).
+ *
+ * Returns NULL if the queue is broken, nothing is pending, or the id in
+ * the used descriptor is out of range or not an outstanding head.
+ */
+static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
+					  unsigned int *len,
+					  void **ctx)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 last_used, id, last_used_idx;
+	bool used_wrap_counter;
+	void *token;
+
+	START_USE(vq);
+
+	if (unlikely(vq->broken)) {
+		END_USE(vq);
+		return NULL;
+	}
+
+	if (!more_used_packed(vq)) {
+		pr_debug("No more buffers in queue\n");
+		END_USE(vq);
+		return NULL;
+	}
+
+	/* Only get used elements after they have been exposed by host. */
+	virtio_rmb(vq->weak_barriers);
+
+	last_used_idx = READ_ONCE(vq->last_used_idx);
+	used_wrap_counter = packed_used_wrap_counter(last_used_idx);
+	last_used = packed_last_used(last_used_idx);
+	id = le16_to_cpu(vq->packed.vring.desc[last_used].id);
+	*len = le32_to_cpu(vq->packed.vring.desc[last_used].len);
+
+	if (unlikely(id >= vq->packed.vring.num)) {
+		BAD_RING(vq, "id %u out of range\n", id);
+		return NULL;
+	}
+	if (unlikely(!vq->packed.desc_state[id].data)) {
+		BAD_RING(vq, "id %u is not a head!\n", id);
+		return NULL;
+	}
+
+	/* detach_buf_packed clears data, so grab it now. */
+	token = vq->packed.desc_state[id].data;
+	detach_buf_packed(vq, id, ctx);
+
+	/* Step over every descriptor of this buffer, wrapping if needed. */
+	last_used += vq->packed.desc_state[id].num;
+	if (unlikely(last_used >= vq->packed.vring.num)) {
+		last_used -= vq->packed.vring.num;
+		used_wrap_counter ^= 1;
+	}
+
+	/* Fold index and wrap counter back into a single value. */
+	last_used = (last_used | (used_wrap_counter << VRING_PACKED_EVENT_F_WRAP_CTR));
+	WRITE_ONCE(vq->last_used_idx, last_used);
+
+	/*
+	 * If we expect an interrupt for the next entry, tell host
+	 * by writing event index and flush out the write before
+	 * the read in the next get_buf call.
+	 */
+	if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DESC)
+		virtio_store_mb(vq->weak_barriers,
+				&vq->packed.vring.driver->off_wrap,
+				cpu_to_le16(vq->last_used_idx));
+
+	LAST_ADD_TIME_INVALID(vq);
+
+	END_USE(vq);
+	return token;
+}
+
+/*
+ * Switch the driver event flags to "disable".  The shadow copy is always
+ * updated; the write to the ring is skipped when an event has already been
+ * triggered.
+ */
+static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	/* Already disabled: nothing to do. */
+	if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DISABLE)
+		return;
+
+	vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE;
+
+	/*
+	 * If device triggered an event already it won't trigger one again:
+	 * no need to disable.
+	 */
+	if (vq->event_triggered)
+		return;
+
+	vq->packed.vring.driver->flags =
+		cpu_to_le16(vq->packed.event_flags_shadow);
+}
+
+/*
+ * Re-enable callbacks on a packed virtqueue.
+ *
+ * Returns the current last_used_idx value (index plus wrap counter), which
+ * virtqueue_poll_packed() accepts as its off_wrap argument.
+ */
+static unsigned int virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	START_USE(vq);
+
+	/*
+	 * We optimistically turn back on interrupts, then check if there was
+	 * more to do.
+	 */
+
+	if (vq->event) {
+		vq->packed.vring.driver->off_wrap =
+			cpu_to_le16(vq->last_used_idx);
+		/*
+		 * We need to update event offset and event wrap
+		 * counter first before updating event flags.
+		 */
+		virtio_wmb(vq->weak_barriers);
+	}
+
+	if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DISABLE) {
+		/* Descriptor-based events if negotiated, plain enable if not. */
+		if (vq->event)
+			vq->packed.event_flags_shadow =
+				VRING_PACKED_EVENT_FLAG_DESC;
+		else
+			vq->packed.event_flags_shadow =
+				VRING_PACKED_EVENT_FLAG_ENABLE;
+
+		vq->packed.vring.driver->flags =
+			cpu_to_le16(vq->packed.event_flags_shadow);
+	}
+
+	END_USE(vq);
+	return vq->last_used_idx;
+}
+
+/*
+ * Check whether the descriptor identified by @off_wrap (index in the low
+ * bits, wrap counter in bit VRING_PACKED_EVENT_F_WRAP_CTR) has been used.
+ */
+static bool virtqueue_poll_packed(struct virtqueue *_vq, u16 off_wrap)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	bool wrap_counter = off_wrap >> VRING_PACKED_EVENT_F_WRAP_CTR;
+	u16 used_idx = off_wrap & ~(1 << VRING_PACKED_EVENT_F_WRAP_CTR);
+
+	return is_used_desc_packed(vq, used_idx, wrap_counter);
+}
+
+/*
+ * Re-enable callbacks, but (with event index support) ask for the event
+ * only after about 3/4 of the outstanding descriptors have been used.
+ *
+ * Returns false if a used buffer is already pending at the current
+ * last-used position, true otherwise.
+ */
+static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 used_idx, wrap_counter, last_used_idx;
+	u16 bufs;
+	bool pending;
+
+	START_USE(vq);
+
+	/*
+	 * We optimistically turn back on interrupts, then check if there was
+	 * more to do.
+	 */
+
+	if (vq->event) {
+		/* TODO: tune this threshold */
+		bufs = (vq->packed.vring.num - vq->vq.num_free) * 3 / 4;
+		last_used_idx = READ_ONCE(vq->last_used_idx);
+		wrap_counter = packed_used_wrap_counter(last_used_idx);
+
+		/* Target position, wrapped back into the ring if needed. */
+		used_idx = packed_last_used(last_used_idx) + bufs;
+		if (used_idx >= vq->packed.vring.num) {
+			used_idx -= vq->packed.vring.num;
+			wrap_counter ^= 1;
+		}
+
+		vq->packed.vring.driver->off_wrap = cpu_to_le16(used_idx |
+			(wrap_counter << VRING_PACKED_EVENT_F_WRAP_CTR));
+
+		/*
+		 * We need to update event offset and event wrap
+		 * counter first before updating event flags.
+		 */
+		virtio_wmb(vq->weak_barriers);
+	}
+
+	if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DISABLE) {
+		if (vq->event)
+			vq->packed.event_flags_shadow =
+				VRING_PACKED_EVENT_FLAG_DESC;
+		else
+			vq->packed.event_flags_shadow =
+				VRING_PACKED_EVENT_FLAG_ENABLE;
+
+		vq->packed.vring.driver->flags =
+			cpu_to_le16(vq->packed.event_flags_shadow);
+	}
+
+	/*
+	 * We need to update event suppression structure first
+	 * before re-checking for more used buffers.
+	 */
+	virtio_mb(vq->weak_barriers);
+
+	last_used_idx = READ_ONCE(vq->last_used_idx);
+	wrap_counter = packed_used_wrap_counter(last_used_idx);
+	used_idx = packed_last_used(last_used_idx);
+	pending = is_used_desc_packed(vq, used_idx, wrap_counter);
+
+	END_USE(vq);
+	return !pending;
+}
+
+/*
+ * Detach and return the token of the first buffer that is still
+ * outstanding, or NULL once none is left (at which point every descriptor
+ * must be free again).
+ */
+static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	unsigned int i;
+	void *buf;
+
+	START_USE(vq);
+
+	for (i = 0; i < vq->packed.vring.num; i++) {
+		/* detach_buf clears data, so grab it now. */
+		buf = vq->packed.desc_state[i].data;
+		if (!buf)
+			continue;
+
+		detach_buf_packed(vq, i, NULL);
+		END_USE(vq);
+		return buf;
+	}
+
+	/* That should have freed everything. */
+	BUG_ON(vq->vq.num_free != vq->packed.vring.num);
+
+	END_USE(vq);
+	return NULL;
+}
+
+/*
+ * Allocate a zeroed array of @num desc_extra entries and chain them through
+ * their ->next fields: entry i points at i + 1, and the last entry keeps
+ * ->next == 0 from the zeroing.
+ *
+ * Returns the array, or NULL if the allocation fails.
+ */
+static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num)
+{
+	struct vring_desc_extra *desc_extra;
+	unsigned int i;
+
+	/* kcalloc() does the overflow-checked allocation and the zeroing. */
+	desc_extra = kcalloc(num, sizeof(*desc_extra), GFP_KERNEL);
+	if (!desc_extra)
+		return NULL;
+
+	/*
+	 * "i + 1 < num" rather than "i < num - 1": num is unsigned, so the
+	 * latter would wrap for num == 0 and walk far past the allocation.
+	 */
+	for (i = 0; i + 1 < num; i++)
+		desc_extra[i].next = i + 1;
+
+	return desc_extra;
+}
+
+/*
+ * Release whatever parts of a packed ring have been allocated so far: the
+ * descriptor ring, the driver and device event areas, and the driver-side
+ * per-descriptor arrays.  Each DMA area is freed only if its pointer is set,
+ * so this is usable on a partially constructed ring.
+ */
+static void vring_free_packed(struct vring_virtqueue_packed *vring_packed,
+			      struct virtio_device *vdev,
+			      struct device *dma_dev)
+{
+	/* Descriptor ring. */
+	if (vring_packed->vring.desc)
+		vring_free_queue(vdev,
+				 vring_packed->ring_size_in_bytes,
+				 vring_packed->vring.desc,
+				 vring_packed->ring_dma_addr,
+				 dma_dev);
+
+	/* Driver event suppression area. */
+	if (vring_packed->vring.driver)
+		vring_free_queue(vdev,
+				 vring_packed->event_size_in_bytes,
+				 vring_packed->vring.driver,
+				 vring_packed->driver_event_dma_addr,
+				 dma_dev);
+
+	/* Device event suppression area. */
+	if (vring_packed->vring.device)
+		vring_free_queue(vdev,
+				 vring_packed->event_size_in_bytes,
+				 vring_packed->vring.device,
+				 vring_packed->device_event_dma_addr,
+				 dma_dev);
+
+	kfree(vring_packed->desc_state);
+	kfree(vring_packed->desc_extra);
+}
+
+/*
+ * Allocate the three DMA areas of a packed ring with @num descriptors (the
+ * descriptor ring and the driver/device event areas) and record them in
+ * @vring_packed.
+ *
+ * Returns 0 on success.  On any allocation failure everything allocated so
+ * far is released again and -ENOMEM is returned.
+ */
+static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed,
+				    struct virtio_device *vdev,
+				    u32 num, struct device *dma_dev)
+{
+	const gfp_t gfp = GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO;
+	size_t ring_bytes = num * sizeof(struct vring_packed_desc);
+	size_t event_bytes = sizeof(struct vring_packed_desc_event);
+	struct vring_packed_desc_event *driver, *device;
+	dma_addr_t ring_dma, driver_dma, device_dma;
+	struct vring_packed_desc *ring;
+
+	/* Descriptor ring. */
+	ring = vring_alloc_queue(vdev, ring_bytes, &ring_dma, gfp, dma_dev);
+	if (!ring)
+		goto err;
+
+	vring_packed->vring.desc = ring;
+	vring_packed->ring_dma_addr = ring_dma;
+	vring_packed->ring_size_in_bytes = ring_bytes;
+
+	/* Driver event suppression area. */
+	driver = vring_alloc_queue(vdev, event_bytes, &driver_dma, gfp,
+				   dma_dev);
+	if (!driver)
+		goto err;
+
+	vring_packed->vring.driver = driver;
+	vring_packed->event_size_in_bytes = event_bytes;
+	vring_packed->driver_event_dma_addr = driver_dma;
+
+	/* Device event suppression area. */
+	device = vring_alloc_queue(vdev, event_bytes, &device_dma, gfp,
+				   dma_dev);
+	if (!device)
+		goto err;
+
+	vring_packed->vring.device = device;
+	vring_packed->device_event_dma_addr = device_dma;
+
+	vring_packed->vring.num = num;
+
+	return 0;
+
+err:
+	/* Frees only the areas whose pointers were stored above. */
+	vring_free_packed(vring_packed, vdev, dma_dev);
+	return -ENOMEM;
+}
+
+/*
+ * Allocate the per-descriptor driver state for a packed ring.
+ *
+ * Both arrays have vring_packed->vring.num entries.  The desc_state array
+ * is returned zeroed; the desc_extra array comes from
+ * vring_alloc_desc_extra().  On success the two pointers are stored in
+ * @vring_packed; on failure @vring_packed is left untouched.
+ *
+ * Returns 0 on success or -ENOMEM if either allocation fails.
+ */
+static int vring_alloc_state_extra_packed(struct vring_virtqueue_packed *vring_packed)
+{
+	struct vring_desc_state_packed *state;
+	struct vring_desc_extra *extra;
+	u32 num = vring_packed->vring.num;
+
+	/* kcalloc() does the overflow-checked allocation and the zeroing. */
+	state = kcalloc(num, sizeof(*state), GFP_KERNEL);
+	if (!state)
+		return -ENOMEM;
+
+	extra = vring_alloc_desc_extra(num);
+	if (!extra) {
+		kfree(state);
+		return -ENOMEM;
+	}
+
+	vring_packed->desc_state = state;
+	vring_packed->desc_extra = extra;
+
+	return 0;
+}
+
+/*
+ * Put the driver-side state of a packed ring into its starting position:
+ * next available index 0, wrap counter 1, AVAIL bit set in the flags used
+ * for new descriptors.  Without a callback, events are disabled both in the
+ * shadow and in the driver event area.
+ */
+static void virtqueue_vring_init_packed(struct vring_virtqueue_packed *vring_packed,
+					bool callback)
+{
+	vring_packed->next_avail_idx = 0;
+	vring_packed->avail_wrap_counter = 1;
+	vring_packed->event_flags_shadow = 0;
+	vring_packed->avail_used_flags = 1 << VRING_PACKED_DESC_F_AVAIL;
+
+	if (callback)
+		return;
+
+	/* No callback? Tell other side not to bother us. */
+	vring_packed->event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE;
+	vring_packed->vring.driver->flags =
+		cpu_to_le16(vring_packed->event_flags_shadow);
+}
+
+/*
+ * Install a prepared packed-ring description into @vq (by struct copy) and
+ * restart the free list at descriptor 0.
+ */
+static void virtqueue_vring_attach_packed(struct vring_virtqueue *vq,
+					  struct vring_virtqueue_packed *vring_packed)
+{
+	/* Put everything in free lists. */
+	vq->free_head = 0;
+
+	vq->packed = *vring_packed;
+}
+
+/*
+ * Bring an already-allocated packed ring back to its initial state: zero
+ * both event areas and the descriptor ring, then rebuild the driver-side
+ * bookkeeping.
+ */
+static void virtqueue_reinit_packed(struct vring_virtqueue *vq)
+{
+	size_t event_bytes = vq->packed.event_size_in_bytes;
+
+	memset(vq->packed.vring.device, 0, event_bytes);
+	memset(vq->packed.vring.driver, 0, event_bytes);
+
+	/* we need to reset the desc.flags. For more, see is_used_desc_packed() */
+	memset(vq->packed.vring.desc, 0, vq->packed.ring_size_in_bytes);
+
+	virtqueue_init(vq, vq->packed.vring.num);
+	virtqueue_vring_init_packed(&vq->packed, !!vq->vq.callback);
+}
+
+/*
+ * Allocate and set up a packed virtqueue that owns its ring, and add it to
+ * the device's list of virtqueues.
+ *
+ * Returns the new virtqueue, or NULL if any allocation fails (everything
+ * allocated up to that point is released).
+ */
+static struct virtqueue *vring_create_virtqueue_packed(
+	unsigned int index,
+	unsigned int num,
+	unsigned int vring_align,
+	struct virtio_device *vdev,
+	bool weak_barriers,
+	bool may_reduce_num,
+	bool context,
+	bool (*notify)(struct virtqueue *),
+	void (*callback)(struct virtqueue *),
+	const char *name,
+	struct device *dma_dev)
+{
+	struct vring_virtqueue_packed vring_packed = {};
+	struct vring_virtqueue *vq;
+
+	if (vring_alloc_queue_packed(&vring_packed, vdev, num, dma_dev))
+		return NULL;
+
+	vq = kmalloc(sizeof(*vq), GFP_KERNEL);
+	if (!vq)
+		goto err_vq;
+
+	/* Generic virtqueue fields. */
+	vq->vq.callback = callback;
+	vq->vq.vdev = vdev;
+	vq->vq.name = name;
+	vq->vq.index = index;
+	vq->vq.reset = false;
+
+	/* Ring-implementation fields. */
+	vq->we_own_ring = true;
+	vq->notify = notify;
+	vq->weak_barriers = weak_barriers;
+#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
+	vq->broken = true;
+#else
+	vq->broken = false;
+#endif
+	vq->packed_ring = true;
+	vq->dma_dev = dma_dev;
+	vq->use_dma_api = vring_use_dma_api(vdev);
+	vq->premapped = false;
+	vq->do_unmap = vq->use_dma_api;
+
+	/* Indirect descriptors are not used together with per-buffer context. */
+	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
+		!context;
+	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
+
+	if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM))
+		vq->weak_barriers = false;
+
+	if (vring_alloc_state_extra_packed(&vring_packed))
+		goto err_state_extra;
+
+	virtqueue_vring_init_packed(&vring_packed, !!callback);
+
+	virtqueue_init(vq, num);
+	virtqueue_vring_attach_packed(vq, &vring_packed);
+
+	spin_lock(&vdev->vqs_list_lock);
+	list_add_tail(&vq->vq.list, &vdev->vqs);
+	spin_unlock(&vdev->vqs_list_lock);
+	return &vq->vq;
+
+err_state_extra:
+	kfree(vq);
+err_vq:
+	vring_free_packed(&vring_packed, vdev, dma_dev);
+	return NULL;
+}
+
+/*
+ * Replace the packed ring behind @_vq with a newly allocated one of @num
+ * descriptors.
+ *
+ * If any allocation fails the existing ring is kept and re-initialised.
+ *
+ * Returns 0 on success, -ENOMEM on any failure.
+ */
+static int virtqueue_resize_packed(struct virtqueue *_vq, u32 num)
+{
+	struct vring_virtqueue_packed vring_packed = {};
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	struct virtio_device *vdev = _vq->vdev;
+
+	if (vring_alloc_queue_packed(&vring_packed, vdev, num, vring_dma_dev(vq)))
+		goto keep_old_ring;
+
+	if (vring_alloc_state_extra_packed(&vring_packed)) {
+		vring_free_packed(&vring_packed, vdev, vring_dma_dev(vq));
+		goto keep_old_ring;
+	}
+
+	/* Everything needed is in hand; now the old ring can go. */
+	vring_free(&vq->vq);
+
+	virtqueue_vring_init_packed(&vring_packed, !!vq->vq.callback);
+
+	virtqueue_init(vq, vring_packed.vring.num);
+	virtqueue_vring_attach_packed(vq, &vring_packed);
+
+	return 0;
+
+keep_old_ring:
+	virtqueue_reinit_packed(vq);
+	return -ENOMEM;
+}
+
+/*
+ * Disable and reset @_vq through the transport, then hand every buffer that
+ * is still outstanding to @recycle.
+ *
+ * Returns 0 on success, -EPERM if the queue does not own its ring, -ENOENT
+ * if the transport lacks either the disable_vq_and_reset or the
+ * enable_vq_after_reset operation, or the error reported by
+ * disable_vq_and_reset.
+ */
+static int virtqueue_disable_and_recycle(struct virtqueue *_vq,
+					 void (*recycle)(struct virtqueue *vq, void *buf))
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	struct virtio_device *vdev = vq->vq.vdev;
+	void *buf;
+	int err;
+
+	if (!vq->we_own_ring)
+		return -EPERM;
+
+	/* Both halves of the reset protocol must be available up front. */
+	if (!vdev->config->disable_vq_and_reset ||
+	    !vdev->config->enable_vq_after_reset)
+		return -ENOENT;
+
+	err = vdev->config->disable_vq_and_reset(_vq);
+	if (err)
+		return err;
+
+	for (buf = virtqueue_detach_unused_buf(_vq); buf != NULL;
+	     buf = virtqueue_detach_unused_buf(_vq))
+		recycle(_vq, buf);
+
+	return 0;
+}
+
+/*
+ * Ask the transport to re-enable @_vq after a reset.  Any failure reported
+ * by the transport is turned into -EBUSY; returns 0 on success.
+ */
+static int virtqueue_enable_after_reset(struct virtqueue *_vq)
+{
+	struct virtio_device *vdev = to_vvq(_vq)->vq.vdev;
+
+	return vdev->config->enable_vq_after_reset(_vq) ? -EBUSY : 0;
+}
+
+/*
+ * Generic functions and exported symbols.
+ */
+
+/* Dispatch an add request to the packed or split ring implementation. */
+static inline int virtqueue_add(struct virtqueue *_vq,
+				struct scatterlist *sgs[],
+				unsigned int total_sg,
+				unsigned int out_sgs,
+				unsigned int in_sgs,
+				void *data,
+				void *ctx,
+				gfp_t gfp)
+{
+	if (to_vvq(_vq)->packed_ring)
+		return virtqueue_add_packed(_vq, sgs, total_sg,
+					    out_sgs, in_sgs, data, ctx, gfp);
+
+	return virtqueue_add_split(_vq, sgs, total_sg,
+				   out_sgs, in_sgs, data, ctx, gfp);
+}
+
+/**
+ * virtqueue_add_sgs - expose buffers to other end
+ * @_vq: the struct virtqueue we're talking about.
+ * @sgs: array of terminated scatterlists.
+ * @out_sgs: the number of scatterlists readable by other side
+ * @in_sgs: the number of scatterlists which are writable (after readable ones)
+ * @data: the token identifying the buffer.
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * The entries of all @out_sgs + @in_sgs lists are counted before the
+ * buffer is added.
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error (e.g. ENOSPC, ENOMEM, EIO).
+ */
+int virtqueue_add_sgs(struct virtqueue *_vq,
+		      struct scatterlist *sgs[],
+		      unsigned int out_sgs,
+		      unsigned int in_sgs,
+		      void *data,
+		      gfp_t gfp)
+{
+	unsigned int list, total_sg = 0;
+
+	/* Count them first. */
+	for (list = 0; list < out_sgs + in_sgs; list++) {
+		struct scatterlist *sg = sgs[list];
+
+		while (sg) {
+			total_sg++;
+			sg = sg_next(sg);
+		}
+	}
+
+	return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs,
+			     data, NULL, gfp);
+}
+EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
+
+/**
+ * virtqueue_add_outbuf - expose output buffers to other end
+ * @vq: the struct virtqueue we're talking about.
+ * @sg: scatterlist (must be well-formed and terminated!)
+ * @num: the number of entries in @sg readable by other side
+ * @data: the token identifying the buffer.
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * The buffer is added as a single readable scatterlist with no writable
+ * part and no extra context.
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error (e.g. ENOSPC, ENOMEM, EIO).
+ */
+int virtqueue_add_outbuf(struct virtqueue *vq,
+			 struct scatterlist *sg, unsigned int num,
+			 void *data,
+			 gfp_t gfp)
+{
+	/* One list, readable by the other side. */
+	struct scatterlist *sgs[] = { sg };
+
+	return virtqueue_add(vq, sgs, num, 1, 0, data, NULL, gfp);
+}
+EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
+
+/**
+ * virtqueue_add_inbuf - expose input buffers to other end
+ * @vq: the struct virtqueue we're talking about.
+ * @sg: scatterlist (must be well-formed and terminated!)
+ * @num: the number of entries in @sg writable by other side
+ * @data: the token identifying the buffer.
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * The buffer is added as a single writable scatterlist with no readable
+ * part and no extra context.
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error (e.g. ENOSPC, ENOMEM, EIO).
+ */
+int virtqueue_add_inbuf(struct virtqueue *vq,
+			struct scatterlist *sg, unsigned int num,
+			void *data,
+			gfp_t gfp)
+{
+	/* One list, writable by the other side. */
+	struct scatterlist *sgs[] = { sg };
+
+	return virtqueue_add(vq, sgs, num, 0, 1, data, NULL, gfp);
+}
+EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
+
+/**
+ * virtqueue_add_inbuf_ctx - expose input buffers to other end
+ * @vq: the struct virtqueue we're talking about.
+ * @sg: scatterlist (must be well-formed and terminated!)
+ * @num: the number of entries in @sg writable by other side
+ * @data: the token identifying the buffer.
+ * @ctx: extra context for the token
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * Same as virtqueue_add_inbuf(), except that @ctx is stored alongside
+ * @data.
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error (e.g. ENOSPC, ENOMEM, EIO).
+ */
+int virtqueue_add_inbuf_ctx(struct virtqueue *vq,
+			    struct scatterlist *sg, unsigned int num,
+			    void *data,
+			    void *ctx,
+			    gfp_t gfp)
+{
+	/* One list, writable by the other side. */
+	struct scatterlist *sgs[] = { sg };
+
+	return virtqueue_add(vq, sgs, num, 0, 1, data, ctx, gfp);
+}
+EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx);
+
+/**
+ * virtqueue_dma_dev - get the dma dev
+ * @_vq: the struct virtqueue we're talking about.
+ *
+ * Returns the device to pass to the DMA API for this virtqueue, or NULL if
+ * the virtqueue does not use the DMA API.
+ */
+struct device *virtqueue_dma_dev(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	return vq->use_dma_api ? vring_dma_dev(vq) : NULL;
+}
+EXPORT_SYMBOL_GPL(virtqueue_dma_dev);
+
+/**
+ * virtqueue_kick_prepare - first half of split virtqueue_kick call.
+ * @_vq: the struct virtqueue
+ *
+ * Instead of virtqueue_kick(), you can do:
+ * if (virtqueue_kick_prepare(vq))
+ * virtqueue_notify(vq);
+ *
+ * This is sometimes useful because the virtqueue_kick_prepare() needs
+ * to be serialized, but the actual virtqueue_notify() call does not.
+ */
+bool virtqueue_kick_prepare(struct virtqueue *_vq)
+{
+	/* Dispatch on the ring layout. */
+	if (to_vvq(_vq)->packed_ring)
+		return virtqueue_kick_prepare_packed(_vq);
+
+	return virtqueue_kick_prepare_split(_vq);
+}
+EXPORT_SYMBOL_GPL(virtqueue_kick_prepare);
+
+/**
+ * virtqueue_notify - second half of split virtqueue_kick call.
+ * @_vq: the struct virtqueue
+ *
+ * This does not need to be serialized.
+ *
+ * Returns false if host notify failed or queue is broken, otherwise true.
+ */
+bool virtqueue_notify(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vvq = to_vvq(_vq);
+
+	/* A queue already marked broken is never notified. */
+	if (unlikely(vvq->broken))
+		return false;
+
+	/* Prod other side to tell it about changes. */
+	if (vvq->notify(_vq))
+		return true;
+
+	/* The notify hook reported failure: mark the queue broken. */
+	vvq->broken = true;
+	return false;
+}
+EXPORT_SYMBOL_GPL(virtqueue_notify);
+
+/**
+ * virtqueue_kick - update after add_buf
+ * @vq: the struct virtqueue
+ *
+ * After one or more virtqueue_add_* calls, invoke this to kick
+ * the other side.
+ *
+ * Caller must ensure we don't call this with other virtqueue
+ * operations at the same time (except where noted).
+ *
+ * Returns false if kick failed, otherwise true.
+ */
+bool virtqueue_kick(struct virtqueue *vq)
+{
+	/* If prepare says no notification is needed, that counts as success. */
+	if (!virtqueue_kick_prepare(vq))
+		return true;
+
+	return virtqueue_notify(vq);
+}
+EXPORT_SYMBOL_GPL(virtqueue_kick);
+
+/**
+ * virtqueue_get_buf_ctx - get the next used buffer
+ * @_vq: the struct virtqueue we're talking about.
+ * @len: the length written into the buffer
+ * @ctx: extra context for the token
+ *
+ * If the device wrote data into the buffer, @len will be set to the
+ * amount written. This means you don't need to clear the buffer
+ * beforehand to ensure there's no data leakage in the case of short
+ * writes.
+ *
+ * Caller must ensure we don't call this with other virtqueue
+ * operations at the same time (except where noted).
+ *
+ * Returns NULL if there are no used buffers, or the "data" token
+ * handed to virtqueue_add_*().
+ */
+void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
+			    void **ctx)
+{
+	/* Dispatch on the ring layout. */
+	if (to_vvq(_vq)->packed_ring)
+		return virtqueue_get_buf_ctx_packed(_vq, len, ctx);
+
+	return virtqueue_get_buf_ctx_split(_vq, len, ctx);
+}
+EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx);
+
+/**
+ * virtqueue_get_buf - get the next used buffer
+ * @_vq: the struct virtqueue we're talking about.
+ * @len: the length written into the buffer
+ *
+ * Same as virtqueue_get_buf_ctx() called with a NULL @ctx argument.
+ */
+void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
+{
+ return virtqueue_get_buf_ctx(_vq, len, NULL);
+}
+EXPORT_SYMBOL_GPL(virtqueue_get_buf);
+/**
+ * virtqueue_disable_cb - disable callbacks
+ * @_vq: the struct virtqueue we're talking about.
+ *
+ * Note that this is not necessarily synchronous, hence unreliable and only
+ * useful as an optimization.
+ *
+ * Unlike other operations, this need not be serialized.
+ */
+void virtqueue_disable_cb(struct virtqueue *_vq)
+{
+	/* Split ring is handled first; everything else is a packed ring. */
+	if (!to_vvq(_vq)->packed_ring) {
+		virtqueue_disable_cb_split(_vq);
+		return;
+	}
+
+	virtqueue_disable_cb_packed(_vq);
+}
+EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
+
+/**
+ * virtqueue_enable_cb_prepare - restart callbacks after disable_cb
+ * @_vq: the struct virtqueue we're talking about.
+ *
+ * This re-enables callbacks; it returns current queue state
+ * in an opaque unsigned value. This value should be later tested by
+ * virtqueue_poll, to detect a possible race between the driver checking for
+ * more work, and enabling callbacks.
+ *
+ * Caller must ensure we don't call this with other virtqueue
+ * operations at the same time (except where noted).
+ */
+unsigned int virtqueue_enable_cb_prepare(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vvq = to_vvq(_vq);
+
+	/* Clear the hint only when it is set, so an unset flag is not rewritten. */
+	if (vvq->event_triggered)
+		vvq->event_triggered = false;
+
+	if (vvq->packed_ring)
+		return virtqueue_enable_cb_prepare_packed(_vq);
+
+	return virtqueue_enable_cb_prepare_split(_vq);
+}
+EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare);
+
+/**
+ * virtqueue_poll - query pending used buffers
+ * @_vq: the struct virtqueue we're talking about.
+ * @last_used_idx: virtqueue state (from call to virtqueue_enable_cb_prepare).
+ *
+ * Returns "true" if there are pending used buffers in the queue.
+ *
+ * This does not need to be serialized.
+ */
+bool virtqueue_poll(struct virtqueue *_vq, unsigned int last_used_idx)
+{
+	struct vring_virtqueue *vvq = to_vvq(_vq);
+
+	/* A broken queue never reports pending buffers. */
+	if (unlikely(vvq->broken))
+		return false;
+
+	/* Barrier before the layout-specific poll inspects the ring. */
+	virtio_mb(vvq->weak_barriers);
+
+	if (vvq->packed_ring)
+		return virtqueue_poll_packed(_vq, last_used_idx);
+
+	return virtqueue_poll_split(_vq, last_used_idx);
+}
+EXPORT_SYMBOL_GPL(virtqueue_poll);
+
+/**
+ * virtqueue_enable_cb - restart callbacks after disable_cb.
+ * @_vq: the struct virtqueue we're talking about.
+ *
+ * This re-enables callbacks; it returns "false" if there are pending
+ * buffers in the queue, to detect a possible race between the driver
+ * checking for more work, and enabling callbacks.
+ *
+ * Caller must ensure we don't call this with other virtqueue
+ * operations at the same time (except where noted).
+ */
+bool virtqueue_enable_cb(struct virtqueue *_vq)
+{
+	/* Re-enable first, then poll with the state captured at that point. */
+	unsigned int opaque = virtqueue_enable_cb_prepare(_vq);
+	bool pending = virtqueue_poll(_vq, opaque);
+
+	return !pending;
+}
+EXPORT_SYMBOL_GPL(virtqueue_enable_cb);
+
+/**
+ * virtqueue_enable_cb_delayed - restart callbacks after disable_cb.
+ * @_vq: the struct virtqueue we're talking about.
+ *
+ * This re-enables callbacks but hints to the other side to delay
+ * interrupts until most of the available buffers have been processed;
+ * it returns "false" if there are many pending buffers in the queue,
+ * to detect a possible race between the driver checking for more work,
+ * and enabling callbacks.
+ *
+ * Caller must ensure we don't call this with other virtqueue
+ * operations at the same time (except where noted).
+ */
+bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vvq = to_vvq(_vq);
+
+	/* Clear the hint only when it is set, so an unset flag is not rewritten. */
+	if (vvq->event_triggered)
+		vvq->event_triggered = false;
+
+	if (vvq->packed_ring)
+		return virtqueue_enable_cb_delayed_packed(_vq);
+
+	return virtqueue_enable_cb_delayed_split(_vq);
+}
+EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
+
+/**
+ * virtqueue_detach_unused_buf - detach first unused buffer
+ * @_vq: the struct virtqueue we're talking about.
+ *
+ * Returns NULL or the "data" token handed to virtqueue_add_*().
+ * This is not valid on an active queue; it is useful for device
+ * shutdown or the reset queue.
+ */
+void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
+{
+	/* Dispatch on the ring layout. */
+	if (to_vvq(_vq)->packed_ring)
+		return virtqueue_detach_unused_buf_packed(_vq);
+
+	return virtqueue_detach_unused_buf_split(_vq);
+}
+EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf);
+
+static inline bool more_used(const struct vring_virtqueue *vq)
+{
+	/* Dispatch on the ring layout. */
+	if (vq->packed_ring)
+		return more_used_packed(vq);
+
+	return more_used_split(vq);
+}
+
+/**
+ * vring_interrupt - notify a virtqueue on an interrupt
+ * @irq: the IRQ number (ignored)
+ * @_vq: the struct virtqueue to notify
+ *
+ * Calls the callback function of @_vq to process the virtqueue
+ * notification.
+ */
+irqreturn_t vring_interrupt(int irq, void *_vq)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+
+ /* Nothing reported by more_used(): this interrupt is not handled here. */
+ if (!more_used(vq)) {
+ pr_debug("virtqueue interrupt with no work for %p\n", vq);
+ return IRQ_NONE;
+ }
+
+ /*
+ * A broken queue never runs its callback. With
+ * CONFIG_VIRTIO_HARDEN_NOTIFICATION this warns once and returns
+ * IRQ_NONE; otherwise the interrupt is reported as handled.
+ */
+ if (unlikely(vq->broken)) {
+#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
+ dev_warn_once(&vq->vq.vdev->dev,
+ "virtio vring IRQ raised before DRIVER_OK");
+ return IRQ_NONE;
+#else
+ return IRQ_HANDLED;
+#endif
+ }
+
+ /* Just a hint for performance: so it's ok that this can be racy! */
+ if (vq->event)
+ vq->event_triggered = true;
+
+ /* The callback is optional; the interrupt is handled either way. */
+ pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback);
+ if (vq->vq.callback)
+ vq->vq.callback(&vq->vq);
+
+ return IRQ_HANDLED;
+}
+EXPORT_SYMBOL_GPL(vring_interrupt);
+
+/* Only available for split ring */
+/*
+ * Allocate a vring_virtqueue, initialise it around the caller-provided split
+ * ring description and add it to @vdev's list of virtqueues.
+ *
+ * Returns the embedded struct virtqueue, or NULL if the device negotiated
+ * VIRTIO_F_RING_PACKED or an allocation failed.
+ */
+static struct virtqueue *__vring_new_virtqueue(unsigned int index,
+ struct vring_virtqueue_split *vring_split,
+ struct virtio_device *vdev,
+ bool weak_barriers,
+ bool context,
+ bool (*notify)(struct virtqueue *),
+ void (*callback)(struct virtqueue *),
+ const char *name,
+ struct device *dma_dev)
+{
+ struct vring_virtqueue *vq;
+ int err;
+
+ /* This constructor only builds split rings. */
+ if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED))
+ return NULL;
+
+ vq = kmalloc(sizeof(*vq), GFP_KERNEL);
+ if (!vq)
+ return NULL;
+
+ vq->packed_ring = false;
+ vq->vq.callback = callback;
+ vq->vq.vdev = vdev;
+ vq->vq.name = name;
+ vq->vq.index = index;
+ vq->vq.reset = false;
+ /* Cleared here; vring_free() frees the ring memory only when this is set. */
+ vq->we_own_ring = false;
+ vq->notify = notify;
+ vq->weak_barriers = weak_barriers;
+ /* With notification hardening the queue starts out marked broken. */
+#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
+ vq->broken = true;
+#else
+ vq->broken = false;
+#endif
+ vq->dma_dev = dma_dev;
+ vq->use_dma_api = vring_use_dma_api(vdev);
+ vq->premapped = false;
+ vq->do_unmap = vq->use_dma_api;
+
+ /* Indirect descriptors need the feature bit and no per-buffer context. */
+ vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
+ !context;
+ vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
+
+ /* VIRTIO_F_ORDER_PLATFORM overrides the caller's weak_barriers request. */
+ if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM))
+ vq->weak_barriers = false;
+
+ err = vring_alloc_state_extra_split(vring_split);
+ if (err) {
+ kfree(vq);
+ return NULL;
+ }
+
+ virtqueue_vring_init_split(vring_split, vq);
+
+ virtqueue_init(vq, vring_split->vring.num);
+ virtqueue_vring_attach_split(vq, vring_split);
+
+ /* Add the new queue to the device's vq list under vqs_list_lock. */
+ spin_lock(&vdev->vqs_list_lock);
+ list_add_tail(&vq->vq.list, &vdev->vqs);
+ spin_unlock(&vdev->vqs_list_lock);
+ return &vq->vq;
+}
+
+struct virtqueue *vring_create_virtqueue(
+	unsigned int index,
+	unsigned int num,
+	unsigned int vring_align,
+	struct virtio_device *vdev,
+	bool weak_barriers,
+	bool may_reduce_num,
+	bool context,
+	bool (*notify)(struct virtqueue *),
+	void (*callback)(struct virtqueue *),
+	const char *name)
+{
+	/* This variant always uses the virtio device's parent as the DMA device. */
+	struct device *dma_dev = vdev->dev.parent;
+
+	if (!virtio_has_feature(vdev, VIRTIO_F_RING_PACKED))
+		return vring_create_virtqueue_split(index, num, vring_align,
+				vdev, weak_barriers, may_reduce_num,
+				context, notify, callback, name, dma_dev);
+
+	return vring_create_virtqueue_packed(index, num, vring_align,
+			vdev, weak_barriers, may_reduce_num,
+			context, notify, callback, name, dma_dev);
+}
+EXPORT_SYMBOL_GPL(vring_create_virtqueue);
+
+struct virtqueue *vring_create_virtqueue_dma(
+	unsigned int index,
+	unsigned int num,
+	unsigned int vring_align,
+	struct virtio_device *vdev,
+	bool weak_barriers,
+	bool may_reduce_num,
+	bool context,
+	bool (*notify)(struct virtqueue *),
+	void (*callback)(struct virtqueue *),
+	const char *name,
+	struct device *dma_dev)
+{
+	/* The negotiated ring layout selects the constructor. */
+	if (!virtio_has_feature(vdev, VIRTIO_F_RING_PACKED))
+		return vring_create_virtqueue_split(index, num, vring_align,
+				vdev, weak_barriers, may_reduce_num,
+				context, notify, callback, name, dma_dev);
+
+	return vring_create_virtqueue_packed(index, num, vring_align,
+			vdev, weak_barriers, may_reduce_num,
+			context, notify, callback, name, dma_dev);
+}
+EXPORT_SYMBOL_GPL(vring_create_virtqueue_dma);
+
+/**
+ * virtqueue_resize - resize the vring of vq
+ * @_vq: the struct virtqueue we're talking about.
+ * @num: new ring num
+ * @recycle: callback to recycle unused buffers
+ *
+ * When it is really necessary to create a new vring, it will set the current vq
+ * into the reset state. Then call the passed callback to recycle the buffer
+ * that is no longer used. Only after the new vring is successfully created, the
+ * old vring will be released.
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error.
+ * 0: success.
+ * -ENOMEM: Failed to allocate a new ring, fall back to the original ring size.
+ * vq can still work normally
+ * -EBUSY: Failed to sync with device, vq may not work properly
+ * -ENOENT: Transport or device not supported
+ * -E2BIG/-EINVAL: num error
+ * -EPERM: Operation not permitted
+ *
+ */
+int virtqueue_resize(struct virtqueue *_vq, u32 num,
+		     void (*recycle)(struct virtqueue *vq, void *buf))
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	int err, err_reset;
+
+	if (num > vq->vq.num_max)
+		return -E2BIG;
+
+	if (!num)
+		return -EINVAL;
+
+	/* Nothing to do when the ring already has the requested size. */
+	if ((vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num) == num)
+		return 0;
+
+	err = virtqueue_disable_and_recycle(_vq, recycle);
+	if (err)
+		return err;
+
+	if (vq->packed_ring)
+		err = virtqueue_resize_packed(_vq, num);
+	else
+		err = virtqueue_resize_split(_vq, num);
+
+	/*
+	 * The queue has to be re-enabled whether or not the resize worked.
+	 * A re-enable failure takes precedence; otherwise report the resize
+	 * result so that a failed resize is not silently returned as 0.
+	 */
+	err_reset = virtqueue_enable_after_reset(_vq);
+	if (err_reset)
+		return err_reset;
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(virtqueue_resize);
+
+/**
+ * virtqueue_set_dma_premapped - set the vring premapped mode
+ * @_vq: the struct virtqueue we're talking about.
+ *
+ * Enable the premapped mode of the vq.
+ *
+ * The vring in premapped mode does not do dma internally, so the driver must
+ * do dma mapping in advance. The driver must pass the dma_address through
+ * dma_address of scatterlist. When the driver got a used buffer from
+ * the vring, it has to unmap the dma address.
+ *
+ * This function must be called immediately after creating the vq, or after vq
+ * reset, and before adding any buffers to it.
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error.
+ * 0: success.
+ * -EINVAL: vring does not use the dma api, so we can not enable premapped mode.
+ */
+int virtqueue_set_dma_premapped(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vvq = to_vvq(_vq);
+	int ret = -EINVAL;
+	u32 ring_size;
+
+	START_USE(vvq);
+
+	if (vvq->packed_ring)
+		ring_size = vvq->packed.vring.num;
+	else
+		ring_size = vvq->split.vring.num;
+
+	/*
+	 * Only switch modes when every descriptor is still free and the ring
+	 * uses the DMA API; both failures yield -EINVAL.
+	 */
+	if (ring_size == vvq->vq.num_free && vvq->use_dma_api) {
+		vvq->premapped = true;
+		vvq->do_unmap = false;
+		ret = 0;
+	}
+
+	END_USE(vvq);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(virtqueue_set_dma_premapped);
+
+/**
+ * virtqueue_reset - detach and recycle all unused buffers
+ * @_vq: the struct virtqueue we're talking about.
+ * @recycle: callback to recycle unused buffers
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error.
+ * 0: success.
+ * -EBUSY: Failed to sync with device, vq may not work properly
+ * -ENOENT: Transport or device not supported
+ * -EPERM: Operation not permitted
+ */
+int virtqueue_reset(struct virtqueue *_vq,
+		    void (*recycle)(struct virtqueue *vq, void *buf))
+{
+	struct vring_virtqueue *vvq = to_vvq(_vq);
+	int rc = virtqueue_disable_and_recycle(_vq, recycle);
+
+	if (rc)
+		return rc;
+
+	/* Reinitialise the existing ring in place, per layout. */
+	if (!vvq->packed_ring)
+		virtqueue_reinit_split(vvq);
+	else
+		virtqueue_reinit_packed(vvq);
+
+	return virtqueue_enable_after_reset(_vq);
+}
+EXPORT_SYMBOL_GPL(virtqueue_reset);
+
+/* Only available for split ring */
+struct virtqueue *vring_new_virtqueue(unsigned int index,
+				      unsigned int num,
+				      unsigned int vring_align,
+				      struct virtio_device *vdev,
+				      bool weak_barriers,
+				      bool context,
+				      void *pages,
+				      bool (*notify)(struct virtqueue *vq),
+				      void (*callback)(struct virtqueue *vq),
+				      const char *name)
+{
+	struct vring_virtqueue_split vring_split = {};
+	struct device *dma_dev;
+
+	/* Packed rings are not supported by this constructor. */
+	if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED))
+		return NULL;
+
+	/* Lay the split ring out over the caller-provided pages. */
+	vring_init(&vring_split.vring, num, pages, vring_align);
+	dma_dev = vdev->dev.parent;
+
+	return __vring_new_virtqueue(index, &vring_split, vdev, weak_barriers,
+				     context, notify, callback, name,
+				     dma_dev);
+}
+EXPORT_SYMBOL_GPL(vring_new_virtqueue);
+
+static void vring_free(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vvq = to_vvq(_vq);
+
+	if (vvq->packed_ring) {
+		/* Packed ring: nothing is released unless we own the ring. */
+		if (!vvq->we_own_ring)
+			return;
+
+		vring_free_queue(vvq->vq.vdev,
+				 vvq->packed.ring_size_in_bytes,
+				 vvq->packed.vring.desc,
+				 vvq->packed.ring_dma_addr,
+				 vring_dma_dev(vvq));
+
+		vring_free_queue(vvq->vq.vdev,
+				 vvq->packed.event_size_in_bytes,
+				 vvq->packed.vring.driver,
+				 vvq->packed.driver_event_dma_addr,
+				 vring_dma_dev(vvq));
+
+		vring_free_queue(vvq->vq.vdev,
+				 vvq->packed.event_size_in_bytes,
+				 vvq->packed.vring.device,
+				 vvq->packed.device_event_dma_addr,
+				 vring_dma_dev(vvq));
+
+		kfree(vvq->packed.desc_state);
+		kfree(vvq->packed.desc_extra);
+		return;
+	}
+
+	/* Split ring: ring memory is freed only if we own it ... */
+	if (vvq->we_own_ring)
+		vring_free_queue(vvq->vq.vdev,
+				 vvq->split.queue_size_in_bytes,
+				 vvq->split.vring.desc,
+				 vvq->split.queue_dma_addr,
+				 vring_dma_dev(vvq));
+
+	/* ... but the state/extra arrays are always freed. */
+	kfree(vvq->split.desc_state);
+	kfree(vvq->split.desc_extra);
+}
+
+void vring_del_virtqueue(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vvq = to_vvq(_vq);
+	struct virtio_device *vdev = _vq->vdev;
+
+	/* Remove the queue from the device's list before tearing it down. */
+	spin_lock(&vdev->vqs_list_lock);
+	list_del(&_vq->list);
+	spin_unlock(&vdev->vqs_list_lock);
+
+	vring_free(_vq);
+
+	kfree(vvq);
+}
+EXPORT_SYMBOL_GPL(vring_del_virtqueue);
+
+/*
+ * Build the 32-bit notification value for @_vq: the next available index in
+ * the upper 16 bits, OR-ed with the virtqueue index.
+ *
+ * For a packed ring the top bit of the 16-bit index field carries the avail
+ * wrap counter; for a split ring the field is the shadowed avail index.
+ */
+u32 vring_notification_data(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 next;
+
+	if (vq->packed_ring)
+		next = (vq->packed.next_avail_idx &
+			~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR))) |
+			vq->packed.avail_wrap_counter <<
+			VRING_PACKED_EVENT_F_WRAP_CTR;
+	else
+		next = vq->split.avail_idx_shadow;
+
+	/*
+	 * Widen before shifting: a u16 promotes to int, and shifting a value
+	 * with bit 15 set left by 16 would overflow a signed int.
+	 */
+	return (u32)next << 16 | _vq->index;
+}
+EXPORT_SYMBOL_GPL(vring_notification_data);
+
+/* Manipulates transport-specific feature bits. */
+void vring_transport_features(struct virtio_device *vdev)
+{
+	unsigned int bit;
+
+	for (bit = VIRTIO_TRANSPORT_F_START; bit < VIRTIO_TRANSPORT_F_END; bit++) {
+		switch (bit) {
+		/* Transport feature bits that are left untouched. */
+		case VIRTIO_RING_F_INDIRECT_DESC:
+		case VIRTIO_RING_F_EVENT_IDX:
+		case VIRTIO_F_VERSION_1:
+		case VIRTIO_F_ACCESS_PLATFORM:
+		case VIRTIO_F_RING_PACKED:
+		case VIRTIO_F_ORDER_PLATFORM:
+		case VIRTIO_F_NOTIFICATION_DATA:
+			break;
+		default:
+			/* We don't understand this bit. */
+			__virtio_clear_bit(vdev, bit);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(vring_transport_features);
+
+/**
+ * virtqueue_get_vring_size - return the size of the virtqueue's vring
+ * @_vq: the struct virtqueue containing the vring of interest.
+ *
+ * Returns the size of the vring. This is mainly used for boasting to
+ * userspace. Unlike other operations, this need not be serialized.
+ */
+unsigned int virtqueue_get_vring_size(const struct virtqueue *_vq)
+{
+	const struct vring_virtqueue *vvq = to_vvq(_vq);
+
+	/* The size is stored per ring layout. */
+	if (vvq->packed_ring)
+		return vvq->packed.vring.num;
+
+	return vvq->split.vring.num;
+}
+EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
+
+/*
+ * This function should only be called by the core, not directly by the driver.
+ */
+void __virtqueue_break(struct virtqueue *_vq)
+{
+	/* Pairs with READ_ONCE() in virtqueue_is_broken(). */
+	WRITE_ONCE(to_vvq(_vq)->broken, true);
+}
+EXPORT_SYMBOL_GPL(__virtqueue_break);
+
+/*
+ * This function should only be called by the core, not directly by the driver.
+ */
+void __virtqueue_unbreak(struct virtqueue *_vq)
+{
+	/* Pairs with READ_ONCE() in virtqueue_is_broken(). */
+	WRITE_ONCE(to_vvq(_vq)->broken, false);
+}
+EXPORT_SYMBOL_GPL(__virtqueue_unbreak);
+
+bool virtqueue_is_broken(const struct virtqueue *_vq)
+{
+	/* Single read of the flag via READ_ONCE(). */
+	return READ_ONCE(to_vvq(_vq)->broken);
+}
+EXPORT_SYMBOL_GPL(virtqueue_is_broken);
+
+/*
+ * This should prevent the device from being used, allowing drivers to
+ * recover. You may need to grab appropriate locks to flush.
+ */
+void virtio_break_device(struct virtio_device *dev)
+{
+	struct virtqueue *cur;
+
+	spin_lock(&dev->vqs_list_lock);
+	/* Pairs with READ_ONCE() in virtqueue_is_broken(). */
+	list_for_each_entry(cur, &dev->vqs, list)
+		WRITE_ONCE(to_vvq(cur)->broken, true);
+	spin_unlock(&dev->vqs_list_lock);
+}
+EXPORT_SYMBOL_GPL(virtio_break_device);
+
+/*
+ * This should allow the device to be used by the driver. You may
+ * need to grab appropriate locks to flush the write to
+ * vq->broken. This should only be used in some specific cases, e.g.
+ * probing and restoring. This function should only be called by the
+ * core, not directly by the driver.
+ */
+void __virtio_unbreak_device(struct virtio_device *dev)
+{
+	struct virtqueue *cur;
+
+	spin_lock(&dev->vqs_list_lock);
+	/* Pairs with READ_ONCE() in virtqueue_is_broken(). */
+	list_for_each_entry(cur, &dev->vqs, list)
+		WRITE_ONCE(to_vvq(cur)->broken, false);
+	spin_unlock(&dev->vqs_list_lock);
+}
+EXPORT_SYMBOL_GPL(__virtio_unbreak_device);
+
+dma_addr_t virtqueue_get_desc_addr(const struct virtqueue *_vq)
+{
+	const struct vring_virtqueue *vvq = to_vvq(_vq);
+
+	/* Calling this on a ring we do not own is treated as a bug. */
+	BUG_ON(!vvq->we_own_ring);
+
+	return vvq->packed_ring ? vvq->packed.ring_dma_addr :
+				  vvq->split.queue_dma_addr;
+}
+EXPORT_SYMBOL_GPL(virtqueue_get_desc_addr);
+
+dma_addr_t virtqueue_get_avail_addr(const struct virtqueue *_vq)
+{
+	const struct vring_virtqueue *vvq = to_vvq(_vq);
+
+	BUG_ON(!vvq->we_own_ring);
+
+	if (!vvq->packed_ring) {
+		/* Split ring: base DMA address plus avail's byte offset from desc. */
+		const char *ring_base = (char *)vvq->split.vring.desc;
+		const char *avail = (char *)vvq->split.vring.avail;
+
+		return vvq->split.queue_dma_addr + (avail - ring_base);
+	}
+
+	return vvq->packed.driver_event_dma_addr;
+}
+EXPORT_SYMBOL_GPL(virtqueue_get_avail_addr);
+
+dma_addr_t virtqueue_get_used_addr(const struct virtqueue *_vq)
+{
+	const struct vring_virtqueue *vvq = to_vvq(_vq);
+
+	BUG_ON(!vvq->we_own_ring);
+
+	if (!vvq->packed_ring) {
+		/* Split ring: base DMA address plus used's byte offset from desc. */
+		const char *ring_base = (char *)vvq->split.vring.desc;
+		const char *used = (char *)vvq->split.vring.used;
+
+		return vvq->split.queue_dma_addr + (used - ring_base);
+	}
+
+	return vvq->packed.device_event_dma_addr;
+}
+EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
+
+/* Only available for split ring */
+const struct vring *virtqueue_get_vring(const struct virtqueue *vq)
+{
+	const struct vring_virtqueue *vvq = to_vvq(vq);
+
+	/* Always the split-ring view; the layout is not checked here. */
+	return &vvq->split.vring;
+}
+EXPORT_SYMBOL_GPL(virtqueue_get_vring);
+
+/**
+ * virtqueue_dma_map_single_attrs - map DMA for _vq
+ * @_vq: the struct virtqueue we're talking about.
+ * @ptr: the pointer of the buffer to do dma
+ * @size: the size of the buffer to do dma
+ * @dir: DMA direction
+ * @attrs: DMA Attrs
+ *
+ * The caller calls this to do dma mapping in advance. The DMA address can be
+ * passed to this _vq when it is in pre-mapped mode.
+ *
+ * Returns the DMA address. The caller should check it with
+ * virtqueue_dma_mapping_error().
+ */
+dma_addr_t virtqueue_dma_map_single_attrs(struct virtqueue *_vq, void *ptr,
+					  size_t size,
+					  enum dma_data_direction dir,
+					  unsigned long attrs)
+{
+	struct vring_virtqueue *vvq = to_vvq(_vq);
+
+	if (vvq->use_dma_api)
+		return dma_map_single_attrs(vring_dma_dev(vvq), ptr, size,
+					    dir, attrs);
+
+	/* Without the DMA API the physical address is returned directly. */
+	return (dma_addr_t)virt_to_phys(ptr);
+}
+EXPORT_SYMBOL_GPL(virtqueue_dma_map_single_attrs);
+
+/**
+ * virtqueue_dma_unmap_single_attrs - unmap DMA for _vq
+ * @_vq: the struct virtqueue we're talking about.
+ * @addr: the dma address to unmap
+ * @size: the size of the buffer
+ * @dir: DMA direction
+ * @attrs: DMA Attrs
+ *
+ * Unmap the address that is mapped by the virtqueue_dma_map_* APIs.
+ *
+ */
+void virtqueue_dma_unmap_single_attrs(struct virtqueue *_vq, dma_addr_t addr,
+				      size_t size, enum dma_data_direction dir,
+				      unsigned long attrs)
+{
+	struct vring_virtqueue *vvq = to_vvq(_vq);
+
+	/* Nothing to unmap when the ring does not use the DMA API. */
+	if (vvq->use_dma_api)
+		dma_unmap_single_attrs(vring_dma_dev(vvq), addr, size, dir,
+				       attrs);
+}
+EXPORT_SYMBOL_GPL(virtqueue_dma_unmap_single_attrs);
+
+/**
+ * virtqueue_dma_mapping_error - check dma address
+ * @_vq: the struct virtqueue we're talking about.
+ * @addr: DMA address
+ *
+ * Returns 0 if the DMA address is valid, non-zero if it is invalid.
+ */
+int virtqueue_dma_mapping_error(struct virtqueue *_vq, dma_addr_t addr)
+{
+	struct vring_virtqueue *vvq = to_vvq(_vq);
+
+	if (vvq->use_dma_api)
+		return dma_mapping_error(vring_dma_dev(vvq), addr);
+
+	/* Without the DMA API no mapping error is ever reported. */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtqueue_dma_mapping_error);
+
+/**
+ * virtqueue_dma_need_sync - check a dma address needs sync
+ * @_vq: the struct virtqueue we're talking about.
+ * @addr: DMA address
+ *
+ * Check if the dma address mapped by the virtqueue_dma_map_* APIs needs to be
+ * synchronized
+ *
+ * Returns true if the address needs to be synchronized, false otherwise.
+ */
+bool virtqueue_dma_need_sync(struct virtqueue *_vq, dma_addr_t addr)
+{
+	struct vring_virtqueue *vvq = to_vvq(_vq);
+
+	if (vvq->use_dma_api)
+		return dma_need_sync(vring_dma_dev(vvq), addr);
+
+	/* Without the DMA API no sync is ever needed. */
+	return false;
+}
+EXPORT_SYMBOL_GPL(virtqueue_dma_need_sync);
+
+/**
+ * virtqueue_dma_sync_single_range_for_cpu - dma sync for cpu
+ * @_vq: the struct virtqueue we're talking about.
+ * @addr: DMA address
+ * @offset: DMA address offset
+ * @size: buf size for sync
+ * @dir: DMA direction
+ *
+ * Before calling this function, use virtqueue_dma_need_sync() to confirm that
+ * the DMA address really needs to be synchronized
+ *
+ */
+void virtqueue_dma_sync_single_range_for_cpu(struct virtqueue *_vq,
+					     dma_addr_t addr,
+					     unsigned long offset, size_t size,
+					     enum dma_data_direction dir)
+{
+	struct vring_virtqueue *vvq = to_vvq(_vq);
+	struct device *dma_dev = vring_dma_dev(vvq);
+
+	/* Only rings that use the DMA API are synced. */
+	if (vvq->use_dma_api)
+		dma_sync_single_range_for_cpu(dma_dev, addr, offset, size, dir);
+}
+EXPORT_SYMBOL_GPL(virtqueue_dma_sync_single_range_for_cpu);
+
+/**
+ * virtqueue_dma_sync_single_range_for_device - dma sync for device
+ * @_vq: the struct virtqueue we're talking about.
+ * @addr: DMA address
+ * @offset: DMA address offset
+ * @size: buf size for sync
+ * @dir: DMA direction
+ *
+ * Before calling this function, use virtqueue_dma_need_sync() to confirm that
+ * the DMA address really needs to be synchronized
+ */
+void virtqueue_dma_sync_single_range_for_device(struct virtqueue *_vq,
+						dma_addr_t addr,
+						unsigned long offset, size_t size,
+						enum dma_data_direction dir)
+{
+	struct vring_virtqueue *vvq = to_vvq(_vq);
+	struct device *dma_dev = vring_dma_dev(vvq);
+
+	/* Only rings that use the DMA API are synced. */
+	if (vvq->use_dma_api)
+		dma_sync_single_range_for_device(dma_dev, addr, offset, size,
+						 dir);
+}
+EXPORT_SYMBOL_GPL(virtqueue_dma_sync_single_range_for_device);
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
new file mode 100644
index 000000000..06ce6d8c2
--- /dev/null
+++ b/drivers/virtio/virtio_vdpa.c
@@ -0,0 +1,548 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VIRTIO based driver for vDPA device
+ *
+ * Copyright (c) 2020, Red Hat. All rights reserved.
+ * Author: Jason Wang <jasowang@redhat.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uuid.h>
+#include <linux/group_cpus.h>
+#include <linux/virtio.h>
+#include <linux/vdpa.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_ring.h>
+
+/* Module metadata; consumed by the MODULE_*() macros at the end of this file. */
+#define MOD_VERSION "0.1"
+#define MOD_AUTHOR "Jason Wang <jasowang@redhat.com>"
+#define MOD_DESC "vDPA bus driver for virtio devices"
+#define MOD_LICENSE "GPL v2"
+
+/*
+ * Per-device state of the virtio-over-vDPA transport: one instance is
+ * allocated in virtio_vdpa_probe() for each vDPA device bound to this driver.
+ */
+struct virtio_vdpa_device {
+ /* Embedded virtio device; container_of() on it recovers this structure */
+ struct virtio_device vdev;
+ /* The vDPA device this virtio device sits on top of */
+ struct vdpa_device *vdpa;
+ /* NOTE(review): never read or written in this file - confirm still needed */
+ u64 features;
+
+ /* The lock to protect virtqueue list */
+ spinlock_t lock;
+ /* List of virtio_vdpa_vq_info */
+ struct list_head virtqueues;
+};
+
+/*
+ * Per-virtqueue bookkeeping. It is stored in vq->priv, linked on
+ * virtio_vdpa_device.virtqueues, and passed to the vDPA device as the
+ * private data of the virtqueue callback.
+ */
+struct virtio_vdpa_vq_info {
+ /* the actual virtqueue */
+ struct virtqueue *vq;
+
+ /* the list node for the virtqueues list */
+ struct list_head node;
+};
+
+/* Map an embedded struct virtio_device back to its virtio_vdpa_device. */
+static inline struct virtio_vdpa_device *
+to_virtio_vdpa_device(struct virtio_device *dev)
+{
+	struct virtio_vdpa_device *vd_dev;
+
+	vd_dev = container_of(dev, struct virtio_vdpa_device, vdev);
+
+	return vd_dev;
+}
+
+/* Return the vDPA device backing @vdev. */
+static struct vdpa_device *vd_get_vdpa(struct virtio_device *vdev)
+{
+	struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
+
+	return vd_dev->vdpa;
+}
+
+/* virtio_config_ops.get: read @len bytes of config space at @offset into @buf. */
+static void virtio_vdpa_get(struct virtio_device *vdev, unsigned int offset,
+			    void *buf, unsigned int len)
+{
+	vdpa_get_config(vd_get_vdpa(vdev), offset, buf, len);
+}
+
+/* virtio_config_ops.set: write @len bytes from @buf to config space at @offset. */
+static void virtio_vdpa_set(struct virtio_device *vdev, unsigned int offset,
+			    const void *buf, unsigned int len)
+{
+	vdpa_set_config(vd_get_vdpa(vdev), offset, buf, len);
+}
+
+/*
+ * virtio_config_ops.generation: report the config generation of the vDPA
+ * device, or 0 when the device does not implement get_generation.
+ */
+static u32 virtio_vdpa_generation(struct virtio_device *vdev)
+{
+	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+
+	if (!vdpa->config->get_generation)
+		return 0;
+
+	return vdpa->config->get_generation(vdpa);
+}
+
+/* virtio_config_ops.get_status: fetch the status byte from the vDPA device. */
+static u8 virtio_vdpa_get_status(struct virtio_device *vdev)
+{
+	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+
+	return vdpa->config->get_status(vdpa);
+}
+
+/* virtio_config_ops.set_status: forward the new status byte to the vDPA device. */
+static void virtio_vdpa_set_status(struct virtio_device *vdev, u8 status)
+{
+	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+
+	/* This function is void, so the call must not be the operand of "return". */
+	vdpa_set_status(vdpa, status);
+}
+
+/* virtio_config_ops.reset: reset the underlying vDPA device. */
+static void virtio_vdpa_reset(struct virtio_device *vdev)
+{
+	vdpa_reset(vd_get_vdpa(vdev));
+}
+
+/*
+ * Default vring notify hook: kick the vDPA device for this queue index.
+ * Always reports success.
+ */
+static bool virtio_vdpa_notify(struct virtqueue *vq)
+{
+	struct vdpa_device *vdpa = vd_get_vdpa(vq->vdev);
+
+	vdpa->config->kick_vq(vdpa, vq->index);
+
+	return true;
+}
+
+/*
+ * Notify hook installed by virtio_vdpa_setup_vq() when
+ * VIRTIO_F_NOTIFICATION_DATA is set and the device implements
+ * kick_vq_with_data. Always reports success.
+ */
+static bool virtio_vdpa_notify_with_data(struct virtqueue *vq)
+{
+	struct vdpa_device *vdpa = vd_get_vdpa(vq->vdev);
+	u32 notification = vring_notification_data(vq);
+
+	vdpa->config->kick_vq_with_data(vdpa, notification);
+
+	return true;
+}
+
+/*
+ * Config-change callback registered with the vDPA device in
+ * virtio_vdpa_find_vqs(); @private is the struct virtio_vdpa_device.
+ */
+static irqreturn_t virtio_vdpa_config_cb(void *private)
+{
+	struct virtio_vdpa_device *vd_dev = private;
+	struct virtio_device *vdev = &vd_dev->vdev;
+
+	virtio_config_changed(vdev);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Per-virtqueue callback registered with the vDPA device; @private is the
+ * struct virtio_vdpa_vq_info of the queue. The event is handed to the vring
+ * interrupt handler, whose result is returned.
+ */
+static irqreturn_t virtio_vdpa_virtqueue_cb(void *private)
+{
+	struct virtio_vdpa_vq_info *vq_info = private;
+	struct virtqueue *vq = vq_info->vq;
+
+	return vring_interrupt(0, vq);
+}
+
+/*
+ * Create virtqueue @index of @vdev and program it into the vDPA device.
+ *
+ * @vdev:     virtio device the queue belongs to
+ * @index:    queue index on the vDPA device
+ * @callback: driver callback for the queue, may be NULL (no device callback
+ *            is registered in that case)
+ * @name:     queue name; a NULL name means "no queue wanted"
+ * @ctx:      passed through to vring_create_virtqueue_dma()
+ *
+ * Returns the new virtqueue, NULL when @name is NULL, or an ERR_PTR():
+ * -ENOENT if @index is out of range, the queue is already ready, or the
+ * device reports a maximum size of 0; -ENOMEM if an allocation fails;
+ * -EINVAL if the device rejects the ring addresses; or the error returned by
+ * the device's set_vq_state().
+ */
+static struct virtqueue *
+virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
+		     void (*callback)(struct virtqueue *vq),
+		     const char *name, bool ctx)
+{
+	struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
+	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+	struct device *dma_dev;
+	const struct vdpa_config_ops *ops = vdpa->config;
+	struct virtio_vdpa_vq_info *info;
+	bool (*notify)(struct virtqueue *vq) = virtio_vdpa_notify;
+	struct vdpa_callback cb;
+	struct virtqueue *vq;
+	u64 desc_addr, driver_addr, device_addr;
+	/* Assume split virtqueue, switch to packed if necessary */
+	struct vdpa_vq_state state = {0};
+	unsigned long flags;
+	u32 align, max_num, min_num = 1;
+	bool may_reduce_num;
+	int err;
+
+	if (!name)
+		return NULL;
+
+	if (index >= vdpa->nvqs)
+		return ERR_PTR(-ENOENT);
+
+	/* We cannot accept VIRTIO_F_NOTIFICATION_DATA without kick_vq_with_data */
+	if (__virtio_test_bit(vdev, VIRTIO_F_NOTIFICATION_DATA)) {
+		if (ops->kick_vq_with_data)
+			notify = virtio_vdpa_notify_with_data;
+		else
+			__virtio_clear_bit(vdev, VIRTIO_F_NOTIFICATION_DATA);
+	}
+
+	/* Queue shouldn't already be set up. */
+	if (ops->get_vq_ready(vdpa, index))
+		return ERR_PTR(-ENOENT);
+
+	/* Allocate and fill out our active queue description */
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return ERR_PTR(-ENOMEM);
+
+	max_num = ops->get_vq_num_max(vdpa);
+	if (max_num == 0) {
+		err = -ENOENT;
+		goto error_new_virtqueue;
+	}
+
+	if (ops->get_vq_num_min)
+		min_num = ops->get_vq_num_min(vdpa);
+
+	/* The ring size may only shrink when the device allows a range. */
+	may_reduce_num = max_num != min_num;
+
+	/* Create the vring */
+	align = ops->get_vq_align(vdpa);
+
+	if (ops->get_vq_dma_dev)
+		dma_dev = ops->get_vq_dma_dev(vdpa, index);
+	else
+		dma_dev = vdpa_get_dma_dev(vdpa);
+	vq = vring_create_virtqueue_dma(index, max_num, align, vdev,
+					true, may_reduce_num, ctx,
+					notify, callback, name, dma_dev);
+	if (!vq) {
+		err = -ENOMEM;
+		goto error_new_virtqueue;
+	}
+
+	vq->num_max = max_num;
+
+	/*
+	 * Link info and vq before info is handed to the device as callback
+	 * private data: info comes from kmalloc(), so info->vq would otherwise
+	 * be uninitialised if the callback ran early.
+	 */
+	vq->priv = info;
+	info->vq = vq;
+
+	/* Setup virtqueue callback */
+	cb.callback = callback ? virtio_vdpa_virtqueue_cb : NULL;
+	cb.private = info;
+	cb.trigger = NULL;
+	ops->set_vq_cb(vdpa, index, &cb);
+	ops->set_vq_num(vdpa, index, virtqueue_get_vring_size(vq));
+
+	desc_addr = virtqueue_get_desc_addr(vq);
+	driver_addr = virtqueue_get_avail_addr(vq);
+	device_addr = virtqueue_get_used_addr(vq);
+
+	if (ops->set_vq_address(vdpa, index,
+				desc_addr, driver_addr,
+				device_addr)) {
+		err = -EINVAL;
+		goto err_vq;
+	}
+
+	/* reset virtqueue state index */
+	if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
+		struct vdpa_vq_state_packed *s = &state.packed;
+
+		s->last_avail_counter = 1;
+		s->last_avail_idx = 0;
+		s->last_used_counter = 1;
+		s->last_used_idx = 0;
+	}
+	err = ops->set_vq_state(vdpa, index, &state);
+	if (err)
+		goto err_vq;
+
+	ops->set_vq_ready(vdpa, index, 1);
+
+	spin_lock_irqsave(&vd_dev->lock, flags);
+	list_add(&info->node, &vd_dev->virtqueues);
+	spin_unlock_irqrestore(&vd_dev->lock, flags);
+
+	return vq;
+
+err_vq:
+	vring_del_virtqueue(vq);
+error_new_virtqueue:
+	ops->set_vq_ready(vdpa, index, 0);
+	/* VDPA driver should make sure vq is stopped here */
+	WARN_ON(ops->get_vq_ready(vdpa, index));
+	kfree(info);
+	return ERR_PTR(err);
+}
+
+/*
+ * Tear down one virtqueue: drop its bookkeeping entry from the device list,
+ * ask the vDPA device to stop the queue, destroy the vring and free the
+ * bookkeeping entry.
+ */
+static void virtio_vdpa_del_vq(struct virtqueue *vq)
+{
+	struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vq->vdev);
+	struct virtio_vdpa_vq_info *vq_info = vq->priv;
+	struct vdpa_device *vdpa = vd_dev->vdpa;
+	unsigned int qidx = vq->index;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&vd_dev->lock, irq_flags);
+	list_del(&vq_info->node);
+	spin_unlock_irqrestore(&vd_dev->lock, irq_flags);
+
+	/* Select and deactivate the queue (best effort) */
+	vdpa->config->set_vq_ready(vdpa, qidx, 0);
+
+	vring_del_virtqueue(vq);
+
+	kfree(vq_info);
+}
+
+/* virtio_config_ops.del_vqs: tear down every virtqueue on @vdev's list. */
+static void virtio_vdpa_del_vqs(struct virtio_device *vdev)
+{
+	struct virtqueue *cur, *next;
+
+	/* Each call destroys the queue being visited, hence the _safe iterator. */
+	list_for_each_entry_safe(cur, next, &vdev->vqs, list) {
+		virtio_vdpa_del_vq(cur);
+	}
+}
+
+/*
+ * Fallback calc_sets hook used by create_affinity_masks(): put all @affvecs
+ * vectors into a single set.
+ */
+static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
+{
+	affd->set_size[0] = affvecs;
+	affd->nr_sets = 1;
+}
+
+/*
+ * Build an array of @nvecs CPU masks described by @affd.
+ *
+ * The first affd->pre_vectors and the trailing entries get an all-CPUs mask;
+ * the entries in between are filled set by set from group_cpus_evenly().
+ * A missing affd->calc_sets hook is replaced by default_calc_sets().
+ *
+ * Returns a kcalloc()ed array the caller must kfree(), or NULL when no vector
+ * is left for affinity spreading or when an allocation fails.
+ */
+static struct cpumask *
+create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
+{
+	unsigned int reserved = affd->pre_vectors + affd->post_vectors;
+	unsigned int affvecs, curvec, usedvecs = 0, set;
+	struct cpumask *masks;
+
+	/* Vectors left over once the pre/post vectors are taken out */
+	affvecs = nvecs > reserved ? nvecs - reserved : 0;
+
+	if (!affd->calc_sets)
+		affd->calc_sets = default_calc_sets;
+
+	affd->calc_sets(affd, affvecs);
+
+	if (affvecs == 0)
+		return NULL;
+
+	masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
+	if (!masks)
+		return NULL;
+
+	/* Fill out vectors at the beginning that don't need affinity */
+	for (curvec = 0; curvec < affd->pre_vectors; curvec++)
+		cpumask_setall(&masks[curvec]);
+
+	/* One group_cpus_evenly() spread per set, copied in after the last one */
+	for (set = 0; set < affd->nr_sets; set++) {
+		unsigned int set_vecs = affd->set_size[set];
+		struct cpumask *grouped = group_cpus_evenly(set_vecs);
+		unsigned int v;
+
+		if (!grouped) {
+			kfree(masks);
+			return NULL;
+		}
+
+		for (v = 0; v < set_vecs; v++)
+			cpumask_copy(&masks[curvec + v], &grouped[v]);
+		kfree(grouped);
+
+		curvec += set_vecs;
+		usedvecs += set_vecs;
+	}
+
+	/* Fill out vectors at the end that don't need affinity */
+	curvec = affd->pre_vectors +
+		 (usedvecs >= affvecs ? affvecs : usedvecs);
+	while (curvec < nvecs) {
+		cpumask_setall(&masks[curvec]);
+		curvec++;
+	}
+
+	return masks;
+}
+
+/*
+ * virtio_config_ops.find_vqs: create the virtqueues requested by a driver.
+ *
+ * @vdev:      the virtio device
+ * @nvqs:      number of slots in @vqs, @callbacks and @names
+ * @vqs:       filled with the created queues; slots whose name is NULL are
+ *             set to NULL and do not consume a device queue index
+ * @callbacks: per-slot driver callbacks
+ * @names:     per-slot queue names
+ * @ctx:       optional per-slot context flags; NULL means false for all
+ * @desc:      optional affinity description; affinity is only programmed when
+ *             it is given and the device implements set_vq_affinity
+ *
+ * Finally registers the config-change callback with the vDPA device.
+ *
+ * Returns 0 on success, -ENOMEM when the affinity masks cannot be built
+ * (create_affinity_masks() returned NULL), or the error from
+ * virtio_vdpa_setup_vq(), in which case all queues created so far are
+ * deleted again.
+ */
+static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
+				struct virtqueue *vqs[],
+				vq_callback_t *callbacks[],
+				const char * const names[],
+				const bool *ctx,
+				struct irq_affinity *desc)
+{
+	struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
+	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+	const struct vdpa_config_ops *ops = vdpa->config;
+	struct cpumask *masks = NULL;
+	struct vdpa_callback cb;
+	bool has_affinity = desc && ops->set_vq_affinity;
+	unsigned int i, queue_idx = 0;
+	int err;
+
+	if (has_affinity) {
+		/* has_affinity implies desc != NULL, no fallback needed */
+		masks = create_affinity_masks(nvqs, desc);
+		if (!masks)
+			return -ENOMEM;
+	}
+
+	for (i = 0; i < nvqs; ++i) {
+		if (!names[i]) {
+			vqs[i] = NULL;
+			continue;
+		}
+
+		vqs[i] = virtio_vdpa_setup_vq(vdev, queue_idx++,
+					      callbacks[i], names[i], ctx ?
+					      ctx[i] : false);
+		if (IS_ERR(vqs[i])) {
+			err = PTR_ERR(vqs[i]);
+			goto err_setup_vq;
+		}
+
+		/*
+		 * Address the queue by the index it was created with: that
+		 * index lags behind the slot number i once a NULL name has
+		 * been skipped.
+		 */
+		if (has_affinity)
+			ops->set_vq_affinity(vdpa, vqs[i]->index, &masks[i]);
+	}
+
+	cb.callback = virtio_vdpa_config_cb;
+	cb.private = vd_dev;
+	/* Don't hand an uninitialised stack value to the device */
+	cb.trigger = NULL;
+	ops->set_config_cb(vdpa, &cb);
+
+	/* kfree(NULL) is a no-op, so no has_affinity guard is needed */
+	kfree(masks);
+
+	return 0;
+
+err_setup_vq:
+	virtio_vdpa_del_vqs(vdev);
+	kfree(masks);
+	return err;
+}
+
+/* virtio_config_ops.get_features: return the feature bits the device offers. */
+static u64 virtio_vdpa_get_features(struct virtio_device *vdev)
+{
+	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+
+	return vdpa->config->get_device_features(vdpa);
+}
+
+/*
+ * virtio_config_ops.finalize_features: let the vring code adjust
+ * vdev->features, then push the result to the vDPA device. Returns the
+ * result of vdpa_set_features().
+ */
+static int virtio_vdpa_finalize_features(struct virtio_device *vdev)
+{
+	/* Give virtio_ring a chance to accept features. */
+	vring_transport_features(vdev);
+
+	return vdpa_set_features(vd_get_vdpa(vdev), vdev->features);
+}
+
+/* virtio_config_ops.bus_name: the name of the underlying vDPA device. */
+static const char *virtio_vdpa_bus_name(struct virtio_device *vdev)
+{
+	struct vdpa_device *vdpa = to_virtio_vdpa_device(vdev)->vdpa;
+
+	return dev_name(&vdpa->dev);
+}
+
+/*
+ * virtio_config_ops.set_vq_affinity: forward @cpu_mask for @vq to the vDPA
+ * device. Returns the device's result, or 0 when the device does not
+ * implement set_vq_affinity.
+ */
+static int virtio_vdpa_set_vq_affinity(struct virtqueue *vq,
+				       const struct cpumask *cpu_mask)
+{
+	struct vdpa_device *vdpa = to_virtio_vdpa_device(vq->vdev)->vdpa;
+	const struct vdpa_config_ops *ops = vdpa->config;
+
+	if (!ops->set_vq_affinity)
+		return 0;
+
+	return ops->set_vq_affinity(vdpa, vq->index, cpu_mask);
+}
+
+/*
+ * virtio_config_ops.get_vq_affinity: query the affinity mask of queue @index
+ * from the vDPA device; NULL when the device does not implement
+ * get_vq_affinity.
+ */
+static const struct cpumask *
+virtio_vdpa_get_vq_affinity(struct virtio_device *vdev, int index)
+{
+	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+
+	if (!vdpa->config->get_vq_affinity)
+		return NULL;
+
+	return vdpa->config->get_vq_affinity(vdpa, index);
+}
+
+/*
+ * virtio transport operations implemented on top of a vDPA device; installed
+ * as vdev.config in virtio_vdpa_probe().
+ */
+static const struct virtio_config_ops virtio_vdpa_config_ops = {
+ .get = virtio_vdpa_get,
+ .set = virtio_vdpa_set,
+ .generation = virtio_vdpa_generation,
+ .get_status = virtio_vdpa_get_status,
+ .set_status = virtio_vdpa_set_status,
+ .reset = virtio_vdpa_reset,
+ .find_vqs = virtio_vdpa_find_vqs,
+ .del_vqs = virtio_vdpa_del_vqs,
+ .get_features = virtio_vdpa_get_features,
+ .finalize_features = virtio_vdpa_finalize_features,
+ .bus_name = virtio_vdpa_bus_name,
+ .set_vq_affinity = virtio_vdpa_set_vq_affinity,
+ .get_vq_affinity = virtio_vdpa_get_vq_affinity,
+};
+
+/*
+ * Device release hook installed in virtio_vdpa_probe(): frees the
+ * virtio_vdpa_device that embeds @_d.
+ */
+static void virtio_vdpa_release_dev(struct device *_d)
+{
+	struct virtio_device *vdev = container_of(_d, struct virtio_device, dev);
+
+	kfree(to_virtio_vdpa_device(vdev));
+}
+
+/*
+ * Bind to a vDPA device: allocate the virtio_vdpa_device wrapper, fill in the
+ * virtio identity from the vDPA device and register it on the virtio bus.
+ *
+ * Returns 0 on success, -ENOMEM if the wrapper cannot be allocated, -EINVAL
+ * if the vDPA device reports device id 0, or the error returned by
+ * register_virtio_device().
+ */
+static int virtio_vdpa_probe(struct vdpa_device *vdpa)
+{
+	const struct vdpa_config_ops *ops = vdpa->config;
+	struct virtio_vdpa_device *vd_dev;
+	int ret;
+
+	vd_dev = kzalloc(sizeof(*vd_dev), GFP_KERNEL);
+	if (!vd_dev)
+		return -ENOMEM;
+
+	vd_dev->vdev.dev.parent = vdpa_get_dma_dev(vdpa);
+	vd_dev->vdev.dev.release = virtio_vdpa_release_dev;
+	vd_dev->vdev.config = &virtio_vdpa_config_ops;
+	vd_dev->vdpa = vdpa;
+	INIT_LIST_HEAD(&vd_dev->virtqueues);
+	spin_lock_init(&vd_dev->lock);
+
+	vd_dev->vdev.id.device = ops->get_device_id(vdpa);
+	if (vd_dev->vdev.id.device == 0) {
+		/* Registration was never attempted: a plain kfree() is enough. */
+		kfree(vd_dev);
+		return -EINVAL;
+	}
+
+	vd_dev->vdev.id.vendor = ops->get_vendor_id(vdpa);
+	ret = register_virtio_device(&vd_dev->vdev);
+	if (ret) {
+		/*
+		 * Once registration has been attempted, drop the reference
+		 * instead of freeing directly; the release hook set above
+		 * frees vd_dev.
+		 */
+		put_device(&vd_dev->vdev.dev);
+		return ret;
+	}
+
+	vdpa_set_drvdata(vdpa, vd_dev);
+
+	return 0;
+}
+
+/*
+ * Unbind from a vDPA device: unregister the virtio device that
+ * virtio_vdpa_probe() stored as driver data.
+ */
+static void virtio_vdpa_remove(struct vdpa_device *vdpa)
+{
+	struct virtio_vdpa_device *vd_dev = vdpa_get_drvdata(vdpa);
+	struct virtio_device *vdev = &vd_dev->vdev;
+
+	unregister_virtio_device(vdev);
+}
+
+/* vDPA bus driver description: name plus the probe/remove entry points above. */
+static struct vdpa_driver virtio_vdpa_driver = {
+ .driver = {
+ .name = "virtio_vdpa",
+ },
+ .probe = virtio_vdpa_probe,
+ .remove = virtio_vdpa_remove,
+};
+
+/* NOTE(review): presumably expands to the module init/exit boilerplate - confirm */
+module_vdpa_driver(virtio_vdpa_driver);
+
+/* Module metadata, taken from the MOD_* macros near the top of this file. */
+MODULE_VERSION(MOD_VERSION);
+MODULE_LICENSE(MOD_LICENSE);
+MODULE_AUTHOR(MOD_AUTHOR);
+MODULE_DESCRIPTION(MOD_DESC);