Diffstat (limited to 'drivers/virtio')
-rw-r--r-- | drivers/virtio/Kconfig | 176
-rw-r--r-- | drivers/virtio/Makefile | 14
-rw-r--r-- | drivers/virtio/virtio.c | 571
-rw-r--r-- | drivers/virtio/virtio_anchor.c | 18
-rw-r--r-- | drivers/virtio/virtio_balloon.c | 1132
-rw-r--r-- | drivers/virtio/virtio_dma_buf.c | 89
-rw-r--r-- | drivers/virtio/virtio_input.c | 414
-rw-r--r-- | drivers/virtio/virtio_mem.c | 3008
-rw-r--r-- | drivers/virtio/virtio_mmio.c | 877
-rw-r--r-- | drivers/virtio/virtio_pci_common.c | 650
-rw-r--r-- | drivers/virtio/virtio_pci_common.h | 142
-rw-r--r-- | drivers/virtio/virtio_pci_legacy.c | 236
-rw-r--r-- | drivers/virtio/virtio_pci_legacy_dev.c | 222
-rw-r--r-- | drivers/virtio/virtio_pci_modern.c | 566
-rw-r--r-- | drivers/virtio/virtio_pci_modern_dev.c | 720
-rw-r--r-- | drivers/virtio/virtio_ring.c | 3252
-rw-r--r-- | drivers/virtio/virtio_vdpa.c | 548
17 files changed, 12635 insertions, 0 deletions
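The patch below adds the virtio core (virtio.c), the ring implementation, the PCI/MMIO/vDPA transports and several device drivers. For orientation, here is a minimal, hypothetical driver skeleton showing how a module would hook into the bus code introduced by virtio.c. It is an illustrative sketch only, not part of the patch: the example_* names and the choice of the balloon device ID are placeholders, and it uses only interfaces that appear in the files below (struct virtio_driver, module_virtio_driver(), virtio_device_ready(), virtio_reset_device(), config->del_vqs()).

/* Hypothetical skeleton of a virtio driver built on the core added below. */
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_config.h>
#include <linux/virtio_ids.h>

static const struct virtio_device_id example_id_table[] = {
	/* Placeholder device/vendor pair; a real driver claims its own ID. */
	{ VIRTIO_ID_BALLOON, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

static int example_probe(struct virtio_device *vdev)
{
	/*
	 * A real driver would negotiate features and set up its virtqueues
	 * here. virtio_dev_probe() in virtio.c sets DRIVER_OK itself if the
	 * driver does not call virtio_device_ready().
	 */
	virtio_device_ready(vdev);
	return 0;
}

static void example_remove(struct virtio_device *vdev)
{
	/* Quiesce the device, then tear down any virtqueues probe created. */
	virtio_reset_device(vdev);
	vdev->config->del_vqs(vdev);
}

static struct virtio_driver example_driver = {
	.driver.name	= KBUILD_MODNAME,
	.driver.owner	= THIS_MODULE,
	.id_table	= example_id_table,
	.probe		= example_probe,
	.remove		= example_remove,
};
module_virtio_driver(example_driver);
MODULE_DEVICE_TABLE(virtio, example_id_table);
MODULE_LICENSE("GPL");

In the core below, virtio_dev_match() walks the driver's id_table, and virtio_dev_probe() builds dev->features from the driver's feature_table before calling .probe; the balloon driver at the end of this patch is a complete real-world instance of the same pattern.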
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig new file mode 100644 index 0000000000..0a53a61231 --- /dev/null +++ b/drivers/virtio/Kconfig @@ -0,0 +1,176 @@ +# SPDX-License-Identifier: GPL-2.0-only +config VIRTIO_ANCHOR + bool + +config VIRTIO + tristate + select VIRTIO_ANCHOR + help + This option is selected by any driver which implements the virtio + bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG + or CONFIG_S390_GUEST. + +config VIRTIO_PCI_LIB + tristate + help + Modern PCI device implementation. This module implements the + basic probe and control for devices which are based on modern + PCI device with possible vendor specific extensions. Any + module that selects this module must depend on PCI. + +config VIRTIO_PCI_LIB_LEGACY + tristate + help + Legacy PCI device (Virtio PCI Card 0.9.x Draft and older device) + implementation. + This module implements the basic probe and control for devices + which are based on legacy PCI device. Any module that selects this + module must depend on PCI. + +menuconfig VIRTIO_MENU + bool "Virtio drivers" + default y + +if VIRTIO_MENU + +config VIRTIO_HARDEN_NOTIFICATION + bool "Harden virtio notification" + depends on BROKEN + help + Enable this to harden the device notifications and suppress + those that happen at a time where notifications are illegal. + + Experimental: Note that several drivers still have issues that + may cause crashes or hangs when correct handling of + notifications is enforced; depending on the subset of + drivers and devices you use, this may or may not work. + + If unsure, say N. + +config VIRTIO_PCI + tristate "PCI driver for virtio devices" + depends on PCI + select VIRTIO_PCI_LIB + select VIRTIO + help + This driver provides support for virtio based paravirtual device + drivers over PCI. This requires that your VMM has appropriate PCI + virtio backends. Most QEMU based VMMs should support these devices + (like KVM or Xen). + + If unsure, say M. + +config VIRTIO_PCI_LEGACY + bool "Support for legacy virtio draft 0.9.X and older devices" + default y + depends on VIRTIO_PCI + select VIRTIO_PCI_LIB_LEGACY + help + Virtio PCI Card 0.9.X Draft (circa 2014) and older device support. + + This option enables building a transitional driver, supporting + both devices conforming to Virtio 1 specification, and legacy devices. + If disabled, you get a slightly smaller, non-transitional driver, + with no legacy compatibility. + + So look out into your driveway. Do you have a flying car? If + so, you can happily disable this option and virtio will not + break. Otherwise, leave it set. Unless you're testing what + life will be like in The Future. + + If unsure, say Y. + +config VIRTIO_VDPA + tristate "vDPA driver for virtio devices" + depends on VDPA + select VIRTIO + help + This driver provides support for virtio based paravirtual + device driver over vDPA bus. For this to be useful, you need + an appropriate vDPA device implementation that operates on a + physical device to allow the datapath of virtio to be + offloaded to hardware. + + If unsure, say M. + +config VIRTIO_PMEM + tristate "Support for virtio pmem driver" + depends on VIRTIO + depends on LIBNVDIMM + help + This driver provides access to virtio-pmem devices, storage devices + that are mapped into the physical address space - similar to NVDIMMs + - with a virtio-based flushing interface. + + If unsure, say Y. 
+ +config VIRTIO_BALLOON + tristate "Virtio balloon driver" + depends on VIRTIO + select MEMORY_BALLOON + select PAGE_REPORTING + help + This driver supports increasing and decreasing the amount + of memory within a KVM guest. + + If unsure, say M. + +config VIRTIO_MEM + tristate "Virtio mem driver" + depends on X86_64 || ARM64 + depends on VIRTIO + depends on MEMORY_HOTPLUG + depends on MEMORY_HOTREMOVE + depends on CONTIG_ALLOC + depends on EXCLUSIVE_SYSTEM_RAM + help + This driver provides access to virtio-mem paravirtualized memory + devices, allowing to hotplug and hotunplug memory. + + This driver currently only supports x86-64 and arm64. Although it + should compile on other architectures that implement memory + hot(un)plug, architecture-specific and/or common + code changes may be required for virtio-mem, kdump and kexec to work as + expected. + + If unsure, say M. + +config VIRTIO_INPUT + tristate "Virtio input driver" + depends on VIRTIO + depends on INPUT + help + This driver supports virtio input devices such as + keyboards, mice and tablets. + + If unsure, say M. + +config VIRTIO_MMIO + tristate "Platform bus driver for memory mapped virtio devices" + depends on HAS_IOMEM && HAS_DMA + select VIRTIO + help + This drivers provides support for memory mapped virtio + platform device driver. + + If unsure, say N. + +config VIRTIO_MMIO_CMDLINE_DEVICES + bool "Memory mapped virtio devices parameter parsing" + depends on VIRTIO_MMIO + help + Allow virtio-mmio devices instantiation via the kernel command line + or module parameters. Be aware that using incorrect parameters (base + address in particular) can crash your system - you have been warned. + See Documentation/admin-guide/kernel-parameters.rst for details. + + If unsure, say 'N'. + +config VIRTIO_DMA_SHARED_BUFFER + tristate + depends on DMA_SHARED_BUFFER + help + This option adds a flavor of dma buffers that are backed by + virtio resources. + +endif # VIRTIO_MENU diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile new file mode 100644 index 0000000000..8e98d24917 --- /dev/null +++ b/drivers/virtio/Makefile @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o +obj-$(CONFIG_VIRTIO_ANCHOR) += virtio_anchor.o +obj-$(CONFIG_VIRTIO_PCI_LIB) += virtio_pci_modern_dev.o +obj-$(CONFIG_VIRTIO_PCI_LIB_LEGACY) += virtio_pci_legacy_dev.o +obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o +obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o +virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o +virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o +obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o +obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o +obj-$(CONFIG_VIRTIO_VDPA) += virtio_vdpa.o +obj-$(CONFIG_VIRTIO_MEM) += virtio_mem.o +obj-$(CONFIG_VIRTIO_DMA_SHARED_BUFFER) += virtio_dma_buf.o diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c new file mode 100644 index 0000000000..3893dc29eb --- /dev/null +++ b/drivers/virtio/virtio.c @@ -0,0 +1,571 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/virtio.h> +#include <linux/spinlock.h> +#include <linux/virtio_config.h> +#include <linux/virtio_anchor.h> +#include <linux/module.h> +#include <linux/idr.h> +#include <linux/of.h> +#include <uapi/linux/virtio_ids.h> + +/* Unique numbering for virtio devices. 
*/ +static DEFINE_IDA(virtio_index_ida); + +static ssize_t device_show(struct device *_d, + struct device_attribute *attr, char *buf) +{ + struct virtio_device *dev = dev_to_virtio(_d); + return sysfs_emit(buf, "0x%04x\n", dev->id.device); +} +static DEVICE_ATTR_RO(device); + +static ssize_t vendor_show(struct device *_d, + struct device_attribute *attr, char *buf) +{ + struct virtio_device *dev = dev_to_virtio(_d); + return sysfs_emit(buf, "0x%04x\n", dev->id.vendor); +} +static DEVICE_ATTR_RO(vendor); + +static ssize_t status_show(struct device *_d, + struct device_attribute *attr, char *buf) +{ + struct virtio_device *dev = dev_to_virtio(_d); + return sysfs_emit(buf, "0x%08x\n", dev->config->get_status(dev)); +} +static DEVICE_ATTR_RO(status); + +static ssize_t modalias_show(struct device *_d, + struct device_attribute *attr, char *buf) +{ + struct virtio_device *dev = dev_to_virtio(_d); + return sysfs_emit(buf, "virtio:d%08Xv%08X\n", + dev->id.device, dev->id.vendor); +} +static DEVICE_ATTR_RO(modalias); + +static ssize_t features_show(struct device *_d, + struct device_attribute *attr, char *buf) +{ + struct virtio_device *dev = dev_to_virtio(_d); + unsigned int i; + ssize_t len = 0; + + /* We actually represent this as a bitstring, as it could be + * arbitrary length in future. */ + for (i = 0; i < sizeof(dev->features)*8; i++) + len += sysfs_emit_at(buf, len, "%c", + __virtio_test_bit(dev, i) ? '1' : '0'); + len += sysfs_emit_at(buf, len, "\n"); + return len; +} +static DEVICE_ATTR_RO(features); + +static struct attribute *virtio_dev_attrs[] = { + &dev_attr_device.attr, + &dev_attr_vendor.attr, + &dev_attr_status.attr, + &dev_attr_modalias.attr, + &dev_attr_features.attr, + NULL, +}; +ATTRIBUTE_GROUPS(virtio_dev); + +static inline int virtio_id_match(const struct virtio_device *dev, + const struct virtio_device_id *id) +{ + if (id->device != dev->id.device && id->device != VIRTIO_DEV_ANY_ID) + return 0; + + return id->vendor == VIRTIO_DEV_ANY_ID || id->vendor == dev->id.vendor; +} + +/* This looks through all the IDs a driver claims to support. If any of them + * match, we return 1 and the kernel will call virtio_dev_probe(). 
*/ +static int virtio_dev_match(struct device *_dv, struct device_driver *_dr) +{ + unsigned int i; + struct virtio_device *dev = dev_to_virtio(_dv); + const struct virtio_device_id *ids; + + ids = drv_to_virtio(_dr)->id_table; + for (i = 0; ids[i].device; i++) + if (virtio_id_match(dev, &ids[i])) + return 1; + return 0; +} + +static int virtio_uevent(const struct device *_dv, struct kobj_uevent_env *env) +{ + const struct virtio_device *dev = dev_to_virtio(_dv); + + return add_uevent_var(env, "MODALIAS=virtio:d%08Xv%08X", + dev->id.device, dev->id.vendor); +} + +void virtio_check_driver_offered_feature(const struct virtio_device *vdev, + unsigned int fbit) +{ + unsigned int i; + struct virtio_driver *drv = drv_to_virtio(vdev->dev.driver); + + for (i = 0; i < drv->feature_table_size; i++) + if (drv->feature_table[i] == fbit) + return; + + if (drv->feature_table_legacy) { + for (i = 0; i < drv->feature_table_size_legacy; i++) + if (drv->feature_table_legacy[i] == fbit) + return; + } + + BUG(); +} +EXPORT_SYMBOL_GPL(virtio_check_driver_offered_feature); + +static void __virtio_config_changed(struct virtio_device *dev) +{ + struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); + + if (!dev->config_enabled) + dev->config_change_pending = true; + else if (drv && drv->config_changed) + drv->config_changed(dev); +} + +void virtio_config_changed(struct virtio_device *dev) +{ + unsigned long flags; + + spin_lock_irqsave(&dev->config_lock, flags); + __virtio_config_changed(dev); + spin_unlock_irqrestore(&dev->config_lock, flags); +} +EXPORT_SYMBOL_GPL(virtio_config_changed); + +static void virtio_config_disable(struct virtio_device *dev) +{ + spin_lock_irq(&dev->config_lock); + dev->config_enabled = false; + spin_unlock_irq(&dev->config_lock); +} + +static void virtio_config_enable(struct virtio_device *dev) +{ + spin_lock_irq(&dev->config_lock); + dev->config_enabled = true; + if (dev->config_change_pending) + __virtio_config_changed(dev); + dev->config_change_pending = false; + spin_unlock_irq(&dev->config_lock); +} + +void virtio_add_status(struct virtio_device *dev, unsigned int status) +{ + might_sleep(); + dev->config->set_status(dev, dev->config->get_status(dev) | status); +} +EXPORT_SYMBOL_GPL(virtio_add_status); + +/* Do some validation, then set FEATURES_OK */ +static int virtio_features_ok(struct virtio_device *dev) +{ + unsigned int status; + + might_sleep(); + + if (virtio_check_mem_acc_cb(dev)) { + if (!virtio_has_feature(dev, VIRTIO_F_VERSION_1)) { + dev_warn(&dev->dev, + "device must provide VIRTIO_F_VERSION_1\n"); + return -ENODEV; + } + + if (!virtio_has_feature(dev, VIRTIO_F_ACCESS_PLATFORM)) { + dev_warn(&dev->dev, + "device must provide VIRTIO_F_ACCESS_PLATFORM\n"); + return -ENODEV; + } + } + + if (!virtio_has_feature(dev, VIRTIO_F_VERSION_1)) + return 0; + + virtio_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK); + status = dev->config->get_status(dev); + if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) { + dev_err(&dev->dev, "virtio: device refuses features: %x\n", + status); + return -ENODEV; + } + return 0; +} + +/** + * virtio_reset_device - quiesce device for removal + * @dev: the device to reset + * + * Prevents device from sending interrupts and accessing memory. + * + * Generally used for cleanup during driver / device removal. + * + * Once this has been invoked, caller must ensure that + * virtqueue_notify / virtqueue_kick are not in progress. 
+ * + * Note: this guarantees that vq callbacks are not in progress, however caller + * is responsible for preventing access from other contexts, such as a system + * call/workqueue/bh. Invoking virtio_break_device then flushing any such + * contexts is one way to handle that. + * */ +void virtio_reset_device(struct virtio_device *dev) +{ +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION + /* + * The below virtio_synchronize_cbs() guarantees that any + * interrupt for this line arriving after + * virtio_synchronize_vqs() has completed is guaranteed to see + * vq->broken as true. + */ + virtio_break_device(dev); + virtio_synchronize_cbs(dev); +#endif + + dev->config->reset(dev); +} +EXPORT_SYMBOL_GPL(virtio_reset_device); + +static int virtio_dev_probe(struct device *_d) +{ + int err, i; + struct virtio_device *dev = dev_to_virtio(_d); + struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); + u64 device_features; + u64 driver_features; + u64 driver_features_legacy; + + /* We have a driver! */ + virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER); + + /* Figure out what features the device supports. */ + device_features = dev->config->get_features(dev); + + /* Figure out what features the driver supports. */ + driver_features = 0; + for (i = 0; i < drv->feature_table_size; i++) { + unsigned int f = drv->feature_table[i]; + BUG_ON(f >= 64); + driver_features |= (1ULL << f); + } + + /* Some drivers have a separate feature table for virtio v1.0 */ + if (drv->feature_table_legacy) { + driver_features_legacy = 0; + for (i = 0; i < drv->feature_table_size_legacy; i++) { + unsigned int f = drv->feature_table_legacy[i]; + BUG_ON(f >= 64); + driver_features_legacy |= (1ULL << f); + } + } else { + driver_features_legacy = driver_features; + } + + if (device_features & (1ULL << VIRTIO_F_VERSION_1)) + dev->features = driver_features & device_features; + else + dev->features = driver_features_legacy & device_features; + + /* Transport features always preserved to pass to finalize_features. */ + for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) + if (device_features & (1ULL << i)) + __virtio_set_bit(dev, i); + + err = dev->config->finalize_features(dev); + if (err) + goto err; + + if (drv->validate) { + u64 features = dev->features; + + err = drv->validate(dev); + if (err) + goto err; + + /* Did validation change any features? Then write them again. */ + if (features != dev->features) { + err = dev->config->finalize_features(dev); + if (err) + goto err; + } + } + + err = virtio_features_ok(dev); + if (err) + goto err; + + err = drv->probe(dev); + if (err) + goto err; + + /* If probe didn't do it, mark device DRIVER_OK ourselves. */ + if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK)) + virtio_device_ready(dev); + + if (drv->scan) + drv->scan(dev); + + virtio_config_enable(dev); + + return 0; +err: + virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); + return err; + +} + +static void virtio_dev_remove(struct device *_d) +{ + struct virtio_device *dev = dev_to_virtio(_d); + struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); + + virtio_config_disable(dev); + + drv->remove(dev); + + /* Driver should have reset device. */ + WARN_ON_ONCE(dev->config->get_status(dev)); + + /* Acknowledge the device's existence again. 
*/ + virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); + + of_node_put(dev->dev.of_node); +} + +static struct bus_type virtio_bus = { + .name = "virtio", + .match = virtio_dev_match, + .dev_groups = virtio_dev_groups, + .uevent = virtio_uevent, + .probe = virtio_dev_probe, + .remove = virtio_dev_remove, +}; + +int register_virtio_driver(struct virtio_driver *driver) +{ + /* Catch this early. */ + BUG_ON(driver->feature_table_size && !driver->feature_table); + driver->driver.bus = &virtio_bus; + return driver_register(&driver->driver); +} +EXPORT_SYMBOL_GPL(register_virtio_driver); + +void unregister_virtio_driver(struct virtio_driver *driver) +{ + driver_unregister(&driver->driver); +} +EXPORT_SYMBOL_GPL(unregister_virtio_driver); + +static int virtio_device_of_init(struct virtio_device *dev) +{ + struct device_node *np, *pnode = dev_of_node(dev->dev.parent); + char compat[] = "virtio,deviceXXXXXXXX"; + int ret, count; + + if (!pnode) + return 0; + + count = of_get_available_child_count(pnode); + if (!count) + return 0; + + /* There can be only 1 child node */ + if (WARN_ON(count > 1)) + return -EINVAL; + + np = of_get_next_available_child(pnode, NULL); + if (WARN_ON(!np)) + return -ENODEV; + + ret = snprintf(compat, sizeof(compat), "virtio,device%x", dev->id.device); + BUG_ON(ret >= sizeof(compat)); + + /* + * On powerpc/pseries virtio devices are PCI devices so PCI + * vendor/device ids play the role of the "compatible" property. + * Simply don't init of_node in this case. + */ + if (!of_device_is_compatible(np, compat)) { + ret = 0; + goto out; + } + + dev->dev.of_node = np; + return 0; + +out: + of_node_put(np); + return ret; +} + +/** + * register_virtio_device - register virtio device + * @dev : virtio device to be registered + * + * On error, the caller must call put_device on &@dev->dev (and not kfree), + * as another code path may have obtained a reference to @dev. + * + * Returns: 0 on suceess, -error on failure + */ +int register_virtio_device(struct virtio_device *dev) +{ + int err; + + dev->dev.bus = &virtio_bus; + device_initialize(&dev->dev); + + /* Assign a unique device index and hence name. */ + err = ida_alloc(&virtio_index_ida, GFP_KERNEL); + if (err < 0) + goto out; + + dev->index = err; + err = dev_set_name(&dev->dev, "virtio%u", dev->index); + if (err) + goto out_ida_remove; + + err = virtio_device_of_init(dev); + if (err) + goto out_ida_remove; + + spin_lock_init(&dev->config_lock); + dev->config_enabled = false; + dev->config_change_pending = false; + + INIT_LIST_HEAD(&dev->vqs); + spin_lock_init(&dev->vqs_list_lock); + + /* We always start by resetting the device, in case a previous + * driver messed it up. This also tests that code path a little. */ + virtio_reset_device(dev); + + /* Acknowledge that we've seen the device. */ + virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); + + /* + * device_add() causes the bus infrastructure to look for a matching + * driver. 
+ */ + err = device_add(&dev->dev); + if (err) + goto out_of_node_put; + + return 0; + +out_of_node_put: + of_node_put(dev->dev.of_node); +out_ida_remove: + ida_free(&virtio_index_ida, dev->index); +out: + virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); + return err; +} +EXPORT_SYMBOL_GPL(register_virtio_device); + +bool is_virtio_device(struct device *dev) +{ + return dev->bus == &virtio_bus; +} +EXPORT_SYMBOL_GPL(is_virtio_device); + +void unregister_virtio_device(struct virtio_device *dev) +{ + int index = dev->index; /* save for after device release */ + + device_unregister(&dev->dev); + ida_free(&virtio_index_ida, index); +} +EXPORT_SYMBOL_GPL(unregister_virtio_device); + +#ifdef CONFIG_PM_SLEEP +int virtio_device_freeze(struct virtio_device *dev) +{ + struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); + + virtio_config_disable(dev); + + dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED; + + if (drv && drv->freeze) + return drv->freeze(dev); + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_device_freeze); + +int virtio_device_restore(struct virtio_device *dev) +{ + struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); + int ret; + + /* We always start by resetting the device, in case a previous + * driver messed it up. */ + virtio_reset_device(dev); + + /* Acknowledge that we've seen the device. */ + virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); + + /* Maybe driver failed before freeze. + * Restore the failed status, for debugging. */ + if (dev->failed) + virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); + + if (!drv) + return 0; + + /* We have a driver! */ + virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER); + + ret = dev->config->finalize_features(dev); + if (ret) + goto err; + + ret = virtio_features_ok(dev); + if (ret) + goto err; + + if (drv->restore) { + ret = drv->restore(dev); + if (ret) + goto err; + } + + /* If restore didn't do it, mark device DRIVER_OK ourselves. 
*/ + if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK)) + virtio_device_ready(dev); + + virtio_config_enable(dev); + + return 0; + +err: + virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); + return ret; +} +EXPORT_SYMBOL_GPL(virtio_device_restore); +#endif + +static int virtio_init(void) +{ + if (bus_register(&virtio_bus) != 0) + panic("virtio bus registration failed"); + return 0; +} + +static void __exit virtio_exit(void) +{ + bus_unregister(&virtio_bus); + ida_destroy(&virtio_index_ida); +} +core_initcall(virtio_init); +module_exit(virtio_exit); + +MODULE_LICENSE("GPL"); diff --git a/drivers/virtio/virtio_anchor.c b/drivers/virtio/virtio_anchor.c new file mode 100644 index 0000000000..4d6a5d269b --- /dev/null +++ b/drivers/virtio/virtio_anchor.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/virtio.h> +#include <linux/virtio_anchor.h> + +bool virtio_require_restricted_mem_acc(struct virtio_device *dev) +{ + return true; +} +EXPORT_SYMBOL_GPL(virtio_require_restricted_mem_acc); + +static bool virtio_no_restricted_mem_acc(struct virtio_device *dev) +{ + return false; +} + +bool (*virtio_check_mem_acc_cb)(struct virtio_device *dev) = + virtio_no_restricted_mem_acc; +EXPORT_SYMBOL_GPL(virtio_check_mem_acc_cb); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c new file mode 100644 index 0000000000..2d5d252ef4 --- /dev/null +++ b/drivers/virtio/virtio_balloon.c @@ -0,0 +1,1132 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Virtio balloon implementation, inspired by Dor Laor and Marcelo + * Tosatti's implementations. + * + * Copyright 2008 Rusty Russell IBM Corporation + */ + +#include <linux/virtio.h> +#include <linux/virtio_balloon.h> +#include <linux/swap.h> +#include <linux/workqueue.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/balloon_compaction.h> +#include <linux/oom.h> +#include <linux/wait.h> +#include <linux/mm.h> +#include <linux/page_reporting.h> + +/* + * Balloon device works in 4K page units. So each page is pointed to by + * multiple balloon pages. All memory counters in this driver are in balloon + * page units. + */ +#define VIRTIO_BALLOON_PAGES_PER_PAGE (unsigned int)(PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT) +#define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256 +/* Maximum number of (4k) pages to deflate on OOM notifications. 
*/ +#define VIRTIO_BALLOON_OOM_NR_PAGES 256 +#define VIRTIO_BALLOON_OOM_NOTIFY_PRIORITY 80 + +#define VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG (__GFP_NORETRY | __GFP_NOWARN | \ + __GFP_NOMEMALLOC) +/* The order of free page blocks to report to host */ +#define VIRTIO_BALLOON_HINT_BLOCK_ORDER MAX_ORDER +/* The size of a free page block in bytes */ +#define VIRTIO_BALLOON_HINT_BLOCK_BYTES \ + (1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT)) +#define VIRTIO_BALLOON_HINT_BLOCK_PAGES (1 << VIRTIO_BALLOON_HINT_BLOCK_ORDER) + +enum virtio_balloon_vq { + VIRTIO_BALLOON_VQ_INFLATE, + VIRTIO_BALLOON_VQ_DEFLATE, + VIRTIO_BALLOON_VQ_STATS, + VIRTIO_BALLOON_VQ_FREE_PAGE, + VIRTIO_BALLOON_VQ_REPORTING, + VIRTIO_BALLOON_VQ_MAX +}; + +enum virtio_balloon_config_read { + VIRTIO_BALLOON_CONFIG_READ_CMD_ID = 0, +}; + +struct virtio_balloon { + struct virtio_device *vdev; + struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq; + + /* Balloon's own wq for cpu-intensive work items */ + struct workqueue_struct *balloon_wq; + /* The free page reporting work item submitted to the balloon wq */ + struct work_struct report_free_page_work; + + /* The balloon servicing is delegated to a freezable workqueue. */ + struct work_struct update_balloon_stats_work; + struct work_struct update_balloon_size_work; + + /* Prevent updating balloon when it is being canceled. */ + spinlock_t stop_update_lock; + bool stop_update; + /* Bitmap to indicate if reading the related config fields are needed */ + unsigned long config_read_bitmap; + + /* The list of allocated free pages, waiting to be given back to mm */ + struct list_head free_page_list; + spinlock_t free_page_list_lock; + /* The number of free page blocks on the above list */ + unsigned long num_free_page_blocks; + /* + * The cmd id received from host. + * Read it via virtio_balloon_cmd_id_received to get the latest value + * sent from host. + */ + u32 cmd_id_received_cache; + /* The cmd id that is actively in use */ + __virtio32 cmd_id_active; + /* Buffer to store the stop sign */ + __virtio32 cmd_id_stop; + + /* Waiting for host to ack the pages we released. */ + wait_queue_head_t acked; + + /* Number of balloon pages we've told the Host we're not using. */ + unsigned int num_pages; + /* + * The pages we've told the Host we're not using are enqueued + * at vb_dev_info->pages list. + * Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE + * to num_pages above. + */ + struct balloon_dev_info vb_dev_info; + + /* Synchronize access/update to this struct virtio_balloon elements */ + struct mutex balloon_lock; + + /* The array of pfns we tell the Host about. */ + unsigned int num_pfns; + __virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX]; + + /* Memory statistics */ + struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR]; + + /* Shrinker to return free pages - VIRTIO_BALLOON_F_FREE_PAGE_HINT */ + struct shrinker shrinker; + + /* OOM notifier to deflate on OOM - VIRTIO_BALLOON_F_DEFLATE_ON_OOM */ + struct notifier_block oom_nb; + + /* Free page reporting device */ + struct virtqueue *reporting_vq; + struct page_reporting_dev_info pr_dev_info; +}; + +static const struct virtio_device_id id_table[] = { + { VIRTIO_ID_BALLOON, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static u32 page_to_balloon_pfn(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + + BUILD_BUG_ON(PAGE_SHIFT < VIRTIO_BALLOON_PFN_SHIFT); + /* Convert pfn from Linux page size to balloon page size. 
*/ + return pfn * VIRTIO_BALLOON_PAGES_PER_PAGE; +} + +static void balloon_ack(struct virtqueue *vq) +{ + struct virtio_balloon *vb = vq->vdev->priv; + + wake_up(&vb->acked); +} + +static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) +{ + struct scatterlist sg; + unsigned int len; + + sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); + + /* We should always be able to add one buffer to an empty queue. */ + virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL); + virtqueue_kick(vq); + + /* When host has read buffer, this completes via balloon_ack */ + wait_event(vb->acked, virtqueue_get_buf(vq, &len)); + +} + +static int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_info, + struct scatterlist *sg, unsigned int nents) +{ + struct virtio_balloon *vb = + container_of(pr_dev_info, struct virtio_balloon, pr_dev_info); + struct virtqueue *vq = vb->reporting_vq; + unsigned int unused, err; + + /* We should always be able to add these buffers to an empty queue. */ + err = virtqueue_add_inbuf(vq, sg, nents, vb, GFP_NOWAIT | __GFP_NOWARN); + + /* + * In the extremely unlikely case that something has occurred and we + * are able to trigger an error we will simply display a warning + * and exit without actually processing the pages. + */ + if (WARN_ON_ONCE(err)) + return err; + + virtqueue_kick(vq); + + /* When host has read buffer, this completes via balloon_ack */ + wait_event(vb->acked, virtqueue_get_buf(vq, &unused)); + + return 0; +} + +static void set_page_pfns(struct virtio_balloon *vb, + __virtio32 pfns[], struct page *page) +{ + unsigned int i; + + BUILD_BUG_ON(VIRTIO_BALLOON_PAGES_PER_PAGE > VIRTIO_BALLOON_ARRAY_PFNS_MAX); + + /* + * Set balloon pfns pointing at this page. + * Note that the first pfn points at start of the page. + */ + for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE; i++) + pfns[i] = cpu_to_virtio32(vb->vdev, + page_to_balloon_pfn(page) + i); +} + +static unsigned int fill_balloon(struct virtio_balloon *vb, size_t num) +{ + unsigned int num_allocated_pages; + unsigned int num_pfns; + struct page *page; + LIST_HEAD(pages); + + /* We can only do one array worth at a time. */ + num = min(num, ARRAY_SIZE(vb->pfns)); + + for (num_pfns = 0; num_pfns < num; + num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { + struct page *page = balloon_page_alloc(); + + if (!page) { + dev_info_ratelimited(&vb->vdev->dev, + "Out of puff! Can't get %u pages\n", + VIRTIO_BALLOON_PAGES_PER_PAGE); + /* Sleep for at least 1/5 of a second before retry. */ + msleep(200); + break; + } + + balloon_page_push(&pages, page); + } + + mutex_lock(&vb->balloon_lock); + + vb->num_pfns = 0; + + while ((page = balloon_page_pop(&pages))) { + balloon_page_enqueue(&vb->vb_dev_info, page); + + set_page_pfns(vb, vb->pfns + vb->num_pfns, page); + vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; + if (!virtio_has_feature(vb->vdev, + VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) + adjust_managed_page_count(page, -1); + vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE; + } + + num_allocated_pages = vb->num_pfns; + /* Did we get any? 
*/ + if (vb->num_pfns != 0) + tell_host(vb, vb->inflate_vq); + mutex_unlock(&vb->balloon_lock); + + return num_allocated_pages; +} + +static void release_pages_balloon(struct virtio_balloon *vb, + struct list_head *pages) +{ + struct page *page, *next; + + list_for_each_entry_safe(page, next, pages, lru) { + if (!virtio_has_feature(vb->vdev, + VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) + adjust_managed_page_count(page, 1); + list_del(&page->lru); + put_page(page); /* balloon reference */ + } +} + +static unsigned int leak_balloon(struct virtio_balloon *vb, size_t num) +{ + unsigned int num_freed_pages; + struct page *page; + struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; + LIST_HEAD(pages); + + /* We can only do one array worth at a time. */ + num = min(num, ARRAY_SIZE(vb->pfns)); + + mutex_lock(&vb->balloon_lock); + /* We can't release more pages than taken */ + num = min(num, (size_t)vb->num_pages); + for (vb->num_pfns = 0; vb->num_pfns < num; + vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { + page = balloon_page_dequeue(vb_dev_info); + if (!page) + break; + set_page_pfns(vb, vb->pfns + vb->num_pfns, page); + list_add(&page->lru, &pages); + vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE; + } + + num_freed_pages = vb->num_pfns; + /* + * Note that if + * virtio_has_feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST); + * is true, we *have* to do it in this order + */ + if (vb->num_pfns != 0) + tell_host(vb, vb->deflate_vq); + release_pages_balloon(vb, &pages); + mutex_unlock(&vb->balloon_lock); + return num_freed_pages; +} + +static inline void update_stat(struct virtio_balloon *vb, int idx, + u16 tag, u64 val) +{ + BUG_ON(idx >= VIRTIO_BALLOON_S_NR); + vb->stats[idx].tag = cpu_to_virtio16(vb->vdev, tag); + vb->stats[idx].val = cpu_to_virtio64(vb->vdev, val); +} + +#define pages_to_bytes(x) ((u64)(x) << PAGE_SHIFT) + +static unsigned int update_balloon_stats(struct virtio_balloon *vb) +{ + unsigned long events[NR_VM_EVENT_ITEMS]; + struct sysinfo i; + unsigned int idx = 0; + long available; + unsigned long caches; + + all_vm_events(events); + si_meminfo(&i); + + available = si_mem_available(); + caches = global_node_page_state(NR_FILE_PAGES); + +#ifdef CONFIG_VM_EVENT_COUNTERS + update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN, + pages_to_bytes(events[PSWPIN])); + update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_OUT, + pages_to_bytes(events[PSWPOUT])); + update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]); + update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]); +#ifdef CONFIG_HUGETLB_PAGE + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, + events[HTLB_BUDDY_PGALLOC]); + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL, + events[HTLB_BUDDY_PGALLOC_FAIL]); +#endif +#endif + update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE, + pages_to_bytes(i.freeram)); + update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMTOT, + pages_to_bytes(i.totalram)); + update_stat(vb, idx++, VIRTIO_BALLOON_S_AVAIL, + pages_to_bytes(available)); + update_stat(vb, idx++, VIRTIO_BALLOON_S_CACHES, + pages_to_bytes(caches)); + + return idx; +} + +/* + * While most virtqueues communicate guest-initiated requests to the hypervisor, + * the stats queue operates in reverse. The driver initializes the virtqueue + * with a single buffer. From that point forward, all conversations consist of + * a hypervisor request (a call to this function) which directs us to refill + * the virtqueue with a fresh stats buffer. 
Since stats collection can sleep, + * we delegate the job to a freezable workqueue that will do the actual work via + * stats_handle_request(). + */ +static void stats_request(struct virtqueue *vq) +{ + struct virtio_balloon *vb = vq->vdev->priv; + + spin_lock(&vb->stop_update_lock); + if (!vb->stop_update) + queue_work(system_freezable_wq, &vb->update_balloon_stats_work); + spin_unlock(&vb->stop_update_lock); +} + +static void stats_handle_request(struct virtio_balloon *vb) +{ + struct virtqueue *vq; + struct scatterlist sg; + unsigned int len, num_stats; + + num_stats = update_balloon_stats(vb); + + vq = vb->stats_vq; + if (!virtqueue_get_buf(vq, &len)) + return; + sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats); + virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL); + virtqueue_kick(vq); +} + +static inline s64 towards_target(struct virtio_balloon *vb) +{ + s64 target; + u32 num_pages; + + /* Legacy balloon config space is LE, unlike all other devices. */ + virtio_cread_le(vb->vdev, struct virtio_balloon_config, num_pages, + &num_pages); + + /* + * Aligned up to guest page size to avoid inflating and deflating + * balloon endlessly. + */ + target = ALIGN(num_pages, VIRTIO_BALLOON_PAGES_PER_PAGE); + return target - vb->num_pages; +} + +/* Gives back @num_to_return blocks of free pages to mm. */ +static unsigned long return_free_pages_to_mm(struct virtio_balloon *vb, + unsigned long num_to_return) +{ + struct page *page; + unsigned long num_returned; + + spin_lock_irq(&vb->free_page_list_lock); + for (num_returned = 0; num_returned < num_to_return; num_returned++) { + page = balloon_page_pop(&vb->free_page_list); + if (!page) + break; + free_pages((unsigned long)page_address(page), + VIRTIO_BALLOON_HINT_BLOCK_ORDER); + } + vb->num_free_page_blocks -= num_returned; + spin_unlock_irq(&vb->free_page_list_lock); + + return num_returned; +} + +static void virtio_balloon_queue_free_page_work(struct virtio_balloon *vb) +{ + if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) + return; + + /* No need to queue the work if the bit was already set. */ + if (test_and_set_bit(VIRTIO_BALLOON_CONFIG_READ_CMD_ID, + &vb->config_read_bitmap)) + return; + + queue_work(vb->balloon_wq, &vb->report_free_page_work); +} + +static void virtballoon_changed(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + unsigned long flags; + + spin_lock_irqsave(&vb->stop_update_lock, flags); + if (!vb->stop_update) { + queue_work(system_freezable_wq, + &vb->update_balloon_size_work); + virtio_balloon_queue_free_page_work(vb); + } + spin_unlock_irqrestore(&vb->stop_update_lock, flags); +} + +static void update_balloon_size(struct virtio_balloon *vb) +{ + u32 actual = vb->num_pages; + + /* Legacy balloon config space is LE, unlike all other devices. 
*/ + virtio_cwrite_le(vb->vdev, struct virtio_balloon_config, actual, + &actual); +} + +static void update_balloon_stats_func(struct work_struct *work) +{ + struct virtio_balloon *vb; + + vb = container_of(work, struct virtio_balloon, + update_balloon_stats_work); + stats_handle_request(vb); +} + +static void update_balloon_size_func(struct work_struct *work) +{ + struct virtio_balloon *vb; + s64 diff; + + vb = container_of(work, struct virtio_balloon, + update_balloon_size_work); + diff = towards_target(vb); + + if (!diff) + return; + + if (diff > 0) + diff -= fill_balloon(vb, diff); + else + diff += leak_balloon(vb, -diff); + update_balloon_size(vb); + + if (diff) + queue_work(system_freezable_wq, work); +} + +static int init_vqs(struct virtio_balloon *vb) +{ + struct virtqueue *vqs[VIRTIO_BALLOON_VQ_MAX]; + vq_callback_t *callbacks[VIRTIO_BALLOON_VQ_MAX]; + const char *names[VIRTIO_BALLOON_VQ_MAX]; + int err; + + /* + * Inflateq and deflateq are used unconditionally. The names[] + * will be NULL if the related feature is not enabled, which will + * cause no allocation for the corresponding virtqueue in find_vqs. + */ + callbacks[VIRTIO_BALLOON_VQ_INFLATE] = balloon_ack; + names[VIRTIO_BALLOON_VQ_INFLATE] = "inflate"; + callbacks[VIRTIO_BALLOON_VQ_DEFLATE] = balloon_ack; + names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate"; + callbacks[VIRTIO_BALLOON_VQ_STATS] = NULL; + names[VIRTIO_BALLOON_VQ_STATS] = NULL; + callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; + names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; + names[VIRTIO_BALLOON_VQ_REPORTING] = NULL; + + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) { + names[VIRTIO_BALLOON_VQ_STATS] = "stats"; + callbacks[VIRTIO_BALLOON_VQ_STATS] = stats_request; + } + + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { + names[VIRTIO_BALLOON_VQ_FREE_PAGE] = "free_page_vq"; + callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; + } + + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) { + names[VIRTIO_BALLOON_VQ_REPORTING] = "reporting_vq"; + callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack; + } + + err = virtio_find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX, vqs, + callbacks, names, NULL); + if (err) + return err; + + vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE]; + vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE]; + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) { + struct scatterlist sg; + unsigned int num_stats; + vb->stats_vq = vqs[VIRTIO_BALLOON_VQ_STATS]; + + /* + * Prime this virtqueue with one buffer so the hypervisor can + * use it to signal us later (it can't be broken yet!). + */ + num_stats = update_balloon_stats(vb); + + sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats); + err = virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, + GFP_KERNEL); + if (err) { + dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n", + __func__); + return err; + } + virtqueue_kick(vb->stats_vq); + } + + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) + vb->free_page_vq = vqs[VIRTIO_BALLOON_VQ_FREE_PAGE]; + + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) + vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING]; + + return 0; +} + +static u32 virtio_balloon_cmd_id_received(struct virtio_balloon *vb) +{ + if (test_and_clear_bit(VIRTIO_BALLOON_CONFIG_READ_CMD_ID, + &vb->config_read_bitmap)) { + /* Legacy balloon config space is LE, unlike all other devices. 
*/ + virtio_cread_le(vb->vdev, struct virtio_balloon_config, + free_page_hint_cmd_id, + &vb->cmd_id_received_cache); + } + + return vb->cmd_id_received_cache; +} + +static int send_cmd_id_start(struct virtio_balloon *vb) +{ + struct scatterlist sg; + struct virtqueue *vq = vb->free_page_vq; + int err, unused; + + /* Detach all the used buffers from the vq */ + while (virtqueue_get_buf(vq, &unused)) + ; + + vb->cmd_id_active = cpu_to_virtio32(vb->vdev, + virtio_balloon_cmd_id_received(vb)); + sg_init_one(&sg, &vb->cmd_id_active, sizeof(vb->cmd_id_active)); + err = virtqueue_add_outbuf(vq, &sg, 1, &vb->cmd_id_active, GFP_KERNEL); + if (!err) + virtqueue_kick(vq); + return err; +} + +static int send_cmd_id_stop(struct virtio_balloon *vb) +{ + struct scatterlist sg; + struct virtqueue *vq = vb->free_page_vq; + int err, unused; + + /* Detach all the used buffers from the vq */ + while (virtqueue_get_buf(vq, &unused)) + ; + + sg_init_one(&sg, &vb->cmd_id_stop, sizeof(vb->cmd_id_stop)); + err = virtqueue_add_outbuf(vq, &sg, 1, &vb->cmd_id_stop, GFP_KERNEL); + if (!err) + virtqueue_kick(vq); + return err; +} + +static int get_free_page_and_send(struct virtio_balloon *vb) +{ + struct virtqueue *vq = vb->free_page_vq; + struct page *page; + struct scatterlist sg; + int err, unused; + void *p; + + /* Detach all the used buffers from the vq */ + while (virtqueue_get_buf(vq, &unused)) + ; + + page = alloc_pages(VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG, + VIRTIO_BALLOON_HINT_BLOCK_ORDER); + /* + * When the allocation returns NULL, it indicates that we have got all + * the possible free pages, so return -EINTR to stop. + */ + if (!page) + return -EINTR; + + p = page_address(page); + sg_init_one(&sg, p, VIRTIO_BALLOON_HINT_BLOCK_BYTES); + /* There is always 1 entry reserved for the cmd id to use. */ + if (vq->num_free > 1) { + err = virtqueue_add_inbuf(vq, &sg, 1, p, GFP_KERNEL); + if (unlikely(err)) { + free_pages((unsigned long)p, + VIRTIO_BALLOON_HINT_BLOCK_ORDER); + return err; + } + virtqueue_kick(vq); + spin_lock_irq(&vb->free_page_list_lock); + balloon_page_push(&vb->free_page_list, page); + vb->num_free_page_blocks++; + spin_unlock_irq(&vb->free_page_list_lock); + } else { + /* + * The vq has no available entry to add this page block, so + * just free it. + */ + free_pages((unsigned long)p, VIRTIO_BALLOON_HINT_BLOCK_ORDER); + } + + return 0; +} + +static int send_free_pages(struct virtio_balloon *vb) +{ + int err; + u32 cmd_id_active; + + while (1) { + /* + * If a stop id or a new cmd id was just received from host, + * stop the reporting. + */ + cmd_id_active = virtio32_to_cpu(vb->vdev, vb->cmd_id_active); + if (unlikely(cmd_id_active != + virtio_balloon_cmd_id_received(vb))) + break; + + /* + * The free page blocks are allocated and sent to host one by + * one. + */ + err = get_free_page_and_send(vb); + if (err == -EINTR) + break; + else if (unlikely(err)) + return err; + } + + return 0; +} + +static void virtio_balloon_report_free_page(struct virtio_balloon *vb) +{ + int err; + struct device *dev = &vb->vdev->dev; + + /* Start by sending the received cmd id to host with an outbuf. */ + err = send_cmd_id_start(vb); + if (unlikely(err)) + dev_err(dev, "Failed to send a start id, err = %d\n", err); + + err = send_free_pages(vb); + if (unlikely(err)) + dev_err(dev, "Failed to send a free page, err = %d\n", err); + + /* End by sending a stop id to host with an outbuf. 
*/ + err = send_cmd_id_stop(vb); + if (unlikely(err)) + dev_err(dev, "Failed to send a stop id, err = %d\n", err); +} + +static void report_free_page_func(struct work_struct *work) +{ + struct virtio_balloon *vb = container_of(work, struct virtio_balloon, + report_free_page_work); + u32 cmd_id_received; + + cmd_id_received = virtio_balloon_cmd_id_received(vb); + if (cmd_id_received == VIRTIO_BALLOON_CMD_ID_DONE) { + /* Pass ULONG_MAX to give back all the free pages */ + return_free_pages_to_mm(vb, ULONG_MAX); + } else if (cmd_id_received != VIRTIO_BALLOON_CMD_ID_STOP && + cmd_id_received != + virtio32_to_cpu(vb->vdev, vb->cmd_id_active)) { + virtio_balloon_report_free_page(vb); + } +} + +#ifdef CONFIG_BALLOON_COMPACTION +/* + * virtballoon_migratepage - perform the balloon page migration on behalf of + * a compaction thread. (called under page lock) + * @vb_dev_info: the balloon device + * @newpage: page that will replace the isolated page after migration finishes. + * @page : the isolated (old) page that is about to be migrated to newpage. + * @mode : compaction mode -- not used for balloon page migration. + * + * After a ballooned page gets isolated by compaction procedures, this is the + * function that performs the page migration on behalf of a compaction thread + * The page migration for virtio balloon is done in a simple swap fashion which + * follows these two macro steps: + * 1) insert newpage into vb->pages list and update the host about it; + * 2) update the host about the old page removed from vb->pages list; + * + * This function preforms the balloon page migration task. + * Called through balloon_mapping->a_ops->migratepage + */ +static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + struct virtio_balloon *vb = container_of(vb_dev_info, + struct virtio_balloon, vb_dev_info); + unsigned long flags; + + /* + * In order to avoid lock contention while migrating pages concurrently + * to leak_balloon() or fill_balloon() we just give up the balloon_lock + * this turn, as it is easier to retry the page migration later. + * This also prevents fill_balloon() getting stuck into a mutex + * recursion in the case it ends up triggering memory compaction + * while it is attempting to inflate the ballon. + */ + if (!mutex_trylock(&vb->balloon_lock)) + return -EAGAIN; + + get_page(newpage); /* balloon reference */ + + /* + * When we migrate a page to a different zone and adjusted the + * managed page count when inflating, we have to fixup the count of + * both involved zones. 
+ */ + if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM) && + page_zone(page) != page_zone(newpage)) { + adjust_managed_page_count(page, 1); + adjust_managed_page_count(newpage, -1); + } + + /* balloon's page migration 1st step -- inflate "newpage" */ + spin_lock_irqsave(&vb_dev_info->pages_lock, flags); + balloon_page_insert(vb_dev_info, newpage); + vb_dev_info->isolated_pages--; + __count_vm_event(BALLOON_MIGRATE); + spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags); + vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; + set_page_pfns(vb, vb->pfns, newpage); + tell_host(vb, vb->inflate_vq); + + /* balloon's page migration 2nd step -- deflate "page" */ + spin_lock_irqsave(&vb_dev_info->pages_lock, flags); + balloon_page_delete(page); + spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags); + vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; + set_page_pfns(vb, vb->pfns, page); + tell_host(vb, vb->deflate_vq); + + mutex_unlock(&vb->balloon_lock); + + put_page(page); /* balloon reference */ + + return MIGRATEPAGE_SUCCESS; +} +#endif /* CONFIG_BALLOON_COMPACTION */ + +static unsigned long shrink_free_pages(struct virtio_balloon *vb, + unsigned long pages_to_free) +{ + unsigned long blocks_to_free, blocks_freed; + + pages_to_free = round_up(pages_to_free, + VIRTIO_BALLOON_HINT_BLOCK_PAGES); + blocks_to_free = pages_to_free / VIRTIO_BALLOON_HINT_BLOCK_PAGES; + blocks_freed = return_free_pages_to_mm(vb, blocks_to_free); + + return blocks_freed * VIRTIO_BALLOON_HINT_BLOCK_PAGES; +} + +static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct virtio_balloon *vb = container_of(shrinker, + struct virtio_balloon, shrinker); + + return shrink_free_pages(vb, sc->nr_to_scan); +} + +static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct virtio_balloon *vb = container_of(shrinker, + struct virtio_balloon, shrinker); + + return vb->num_free_page_blocks * VIRTIO_BALLOON_HINT_BLOCK_PAGES; +} + +static int virtio_balloon_oom_notify(struct notifier_block *nb, + unsigned long dummy, void *parm) +{ + struct virtio_balloon *vb = container_of(nb, + struct virtio_balloon, oom_nb); + unsigned long *freed = parm; + + *freed += leak_balloon(vb, VIRTIO_BALLOON_OOM_NR_PAGES) / + VIRTIO_BALLOON_PAGES_PER_PAGE; + update_balloon_size(vb); + + return NOTIFY_OK; +} + +static void virtio_balloon_unregister_shrinker(struct virtio_balloon *vb) +{ + unregister_shrinker(&vb->shrinker); +} + +static int virtio_balloon_register_shrinker(struct virtio_balloon *vb) +{ + vb->shrinker.scan_objects = virtio_balloon_shrinker_scan; + vb->shrinker.count_objects = virtio_balloon_shrinker_count; + vb->shrinker.seeks = DEFAULT_SEEKS; + + return register_shrinker(&vb->shrinker, "virtio-balloon"); +} + +static int virtballoon_probe(struct virtio_device *vdev) +{ + struct virtio_balloon *vb; + int err; + + if (!vdev->config->get) { + dev_err(&vdev->dev, "%s failure: config access disabled\n", + __func__); + return -EINVAL; + } + + vdev->priv = vb = kzalloc(sizeof(*vb), GFP_KERNEL); + if (!vb) { + err = -ENOMEM; + goto out; + } + + INIT_WORK(&vb->update_balloon_stats_work, update_balloon_stats_func); + INIT_WORK(&vb->update_balloon_size_work, update_balloon_size_func); + spin_lock_init(&vb->stop_update_lock); + mutex_init(&vb->balloon_lock); + init_waitqueue_head(&vb->acked); + vb->vdev = vdev; + + balloon_devinfo_init(&vb->vb_dev_info); + + err = init_vqs(vb); + if (err) + goto out_free_vb; + +#ifdef 
CONFIG_BALLOON_COMPACTION + vb->vb_dev_info.migratepage = virtballoon_migratepage; +#endif + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { + /* + * There is always one entry reserved for cmd id, so the ring + * size needs to be at least two to report free page hints. + */ + if (virtqueue_get_vring_size(vb->free_page_vq) < 2) { + err = -ENOSPC; + goto out_del_vqs; + } + vb->balloon_wq = alloc_workqueue("balloon-wq", + WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0); + if (!vb->balloon_wq) { + err = -ENOMEM; + goto out_del_vqs; + } + INIT_WORK(&vb->report_free_page_work, report_free_page_func); + vb->cmd_id_received_cache = VIRTIO_BALLOON_CMD_ID_STOP; + vb->cmd_id_active = cpu_to_virtio32(vb->vdev, + VIRTIO_BALLOON_CMD_ID_STOP); + vb->cmd_id_stop = cpu_to_virtio32(vb->vdev, + VIRTIO_BALLOON_CMD_ID_STOP); + spin_lock_init(&vb->free_page_list_lock); + INIT_LIST_HEAD(&vb->free_page_list); + /* + * We're allowed to reuse any free pages, even if they are + * still to be processed by the host. + */ + err = virtio_balloon_register_shrinker(vb); + if (err) + goto out_del_balloon_wq; + } + + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) { + vb->oom_nb.notifier_call = virtio_balloon_oom_notify; + vb->oom_nb.priority = VIRTIO_BALLOON_OOM_NOTIFY_PRIORITY; + err = register_oom_notifier(&vb->oom_nb); + if (err < 0) + goto out_unregister_shrinker; + } + + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) { + /* Start with poison val of 0 representing general init */ + __u32 poison_val = 0; + + /* + * Let the hypervisor know that we are expecting a + * specific value to be written back in balloon pages. + * + * If the PAGE_POISON value was larger than a byte we would + * need to byte swap poison_val here to guarantee it is + * little-endian. However for now it is a single byte so we + * can pass it as-is. + */ + if (!want_init_on_free()) + memset(&poison_val, PAGE_POISON, sizeof(poison_val)); + + virtio_cwrite_le(vb->vdev, struct virtio_balloon_config, + poison_val, &poison_val); + } + + vb->pr_dev_info.report = virtballoon_free_page_report; + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) { + unsigned int capacity; + + capacity = virtqueue_get_vring_size(vb->reporting_vq); + if (capacity < PAGE_REPORTING_CAPACITY) { + err = -ENOSPC; + goto out_unregister_oom; + } + + /* + * The default page reporting order is @pageblock_order, which + * corresponds to 512MB in size on ARM64 when 64KB base page + * size is used. The page reporting won't be triggered if the + * freeing page can't come up with a free area like that huge. + * So we specify the page reporting order to 5, corresponding + * to 2MB. It helps to avoid THP splitting if 4KB base page + * size is used by host. + * + * Ideally, the page reporting order is selected based on the + * host's base page size. However, it needs more work to report + * that value. The hard-coded order would be fine currently. 
+ */ +#if defined(CONFIG_ARM64) && defined(CONFIG_ARM64_64K_PAGES) + vb->pr_dev_info.order = 5; +#endif + + err = page_reporting_register(&vb->pr_dev_info); + if (err) + goto out_unregister_oom; + } + + virtio_device_ready(vdev); + + if (towards_target(vb)) + virtballoon_changed(vdev); + return 0; + +out_unregister_oom: + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) + unregister_oom_notifier(&vb->oom_nb); +out_unregister_shrinker: + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) + virtio_balloon_unregister_shrinker(vb); +out_del_balloon_wq: + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) + destroy_workqueue(vb->balloon_wq); +out_del_vqs: + vdev->config->del_vqs(vdev); +out_free_vb: + kfree(vb); +out: + return err; +} + +static void remove_common(struct virtio_balloon *vb) +{ + /* There might be pages left in the balloon: free them. */ + while (vb->num_pages) + leak_balloon(vb, vb->num_pages); + update_balloon_size(vb); + + /* There might be free pages that are being reported: release them. */ + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) + return_free_pages_to_mm(vb, ULONG_MAX); + + /* Now we reset the device so we can clean up the queues. */ + virtio_reset_device(vb->vdev); + + vb->vdev->config->del_vqs(vb->vdev); +} + +static void virtballoon_remove(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) + page_reporting_unregister(&vb->pr_dev_info); + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) + unregister_oom_notifier(&vb->oom_nb); + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) + virtio_balloon_unregister_shrinker(vb); + spin_lock_irq(&vb->stop_update_lock); + vb->stop_update = true; + spin_unlock_irq(&vb->stop_update_lock); + cancel_work_sync(&vb->update_balloon_size_work); + cancel_work_sync(&vb->update_balloon_stats_work); + + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { + cancel_work_sync(&vb->report_free_page_work); + destroy_workqueue(vb->balloon_wq); + } + + remove_common(vb); + kfree(vb); +} + +#ifdef CONFIG_PM_SLEEP +static int virtballoon_freeze(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + + /* + * The workqueue is already frozen by the PM core before this + * function is called. + */ + remove_common(vb); + return 0; +} + +static int virtballoon_restore(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + int ret; + + ret = init_vqs(vdev->priv); + if (ret) + return ret; + + virtio_device_ready(vdev); + + if (towards_target(vb)) + virtballoon_changed(vdev); + update_balloon_size(vb); + return 0; +} +#endif + +static int virtballoon_validate(struct virtio_device *vdev) +{ + /* + * Inform the hypervisor that our pages are poisoned or + * initialized. If we cannot do that then we should disable + * page reporting as it could potentially change the contents + * of our free pages. 
+ */ + if (!want_init_on_free() && !page_poisoning_enabled_static()) + __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_POISON); + else if (!virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) + __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_REPORTING); + + __virtio_clear_bit(vdev, VIRTIO_F_ACCESS_PLATFORM); + return 0; +} + +static unsigned int features[] = { + VIRTIO_BALLOON_F_MUST_TELL_HOST, + VIRTIO_BALLOON_F_STATS_VQ, + VIRTIO_BALLOON_F_DEFLATE_ON_OOM, + VIRTIO_BALLOON_F_FREE_PAGE_HINT, + VIRTIO_BALLOON_F_PAGE_POISON, + VIRTIO_BALLOON_F_REPORTING, +}; + +static struct virtio_driver virtio_balloon_driver = { + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .validate = virtballoon_validate, + .probe = virtballoon_probe, + .remove = virtballoon_remove, + .config_changed = virtballoon_changed, +#ifdef CONFIG_PM_SLEEP + .freeze = virtballoon_freeze, + .restore = virtballoon_restore, +#endif +}; + +module_virtio_driver(virtio_balloon_driver); +MODULE_DEVICE_TABLE(virtio, id_table); +MODULE_DESCRIPTION("Virtio balloon driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/virtio/virtio_dma_buf.c b/drivers/virtio/virtio_dma_buf.c new file mode 100644 index 0000000000..2521a75009 --- /dev/null +++ b/drivers/virtio/virtio_dma_buf.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * dma-bufs for virtio exported objects + * + * Copyright (C) 2020 Google, Inc. + */ + +#include <linux/module.h> +#include <linux/virtio_dma_buf.h> + +/** + * virtio_dma_buf_export - Creates a new dma-buf for a virtio exported object + * @exp_info: [in] see dma_buf_export(). ops MUST refer to a dma_buf_ops + * struct embedded in a virtio_dma_buf_ops. + * + * This wraps dma_buf_export() to allow virtio drivers to create a dma-buf + * for an virtio exported object that can be queried by other virtio drivers + * for the object's UUID. + */ +struct dma_buf *virtio_dma_buf_export + (const struct dma_buf_export_info *exp_info) +{ + const struct virtio_dma_buf_ops *virtio_ops = + container_of(exp_info->ops, + const struct virtio_dma_buf_ops, ops); + + if (!exp_info->ops || + exp_info->ops->attach != &virtio_dma_buf_attach || + !virtio_ops->get_uuid) { + return ERR_PTR(-EINVAL); + } + + return dma_buf_export(exp_info); +} +EXPORT_SYMBOL(virtio_dma_buf_export); + +/** + * virtio_dma_buf_attach - mandatory attach callback for virtio dma-bufs + */ +int virtio_dma_buf_attach(struct dma_buf *dma_buf, + struct dma_buf_attachment *attach) +{ + int ret; + const struct virtio_dma_buf_ops *ops = + container_of(dma_buf->ops, + const struct virtio_dma_buf_ops, ops); + + if (ops->device_attach) { + ret = ops->device_attach(dma_buf, attach); + if (ret) + return ret; + } + return 0; +} +EXPORT_SYMBOL(virtio_dma_buf_attach); + +/** + * is_virtio_dma_buf - returns true if the given dma-buf is a virtio dma-buf + * @dma_buf: buffer to query + */ +bool is_virtio_dma_buf(struct dma_buf *dma_buf) +{ + return dma_buf->ops->attach == &virtio_dma_buf_attach; +} +EXPORT_SYMBOL(is_virtio_dma_buf); + +/** + * virtio_dma_buf_get_uuid - gets a virtio dma-buf's exported object's uuid + * @dma_buf: [in] buffer to query + * @uuid: [out] the uuid + * + * Returns: 0 on success, negative on failure. 
+ */ +int virtio_dma_buf_get_uuid(struct dma_buf *dma_buf, + uuid_t *uuid) +{ + const struct virtio_dma_buf_ops *ops = + container_of(dma_buf->ops, + const struct virtio_dma_buf_ops, ops); + + if (!is_virtio_dma_buf(dma_buf)) + return -EINVAL; + + return ops->get_uuid(dma_buf, uuid); +} +EXPORT_SYMBOL(virtio_dma_buf_get_uuid); + +MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(DMA_BUF); diff --git a/drivers/virtio/virtio_input.c b/drivers/virtio/virtio_input.c new file mode 100644 index 0000000000..3aa4670387 --- /dev/null +++ b/drivers/virtio/virtio_input.c @@ -0,0 +1,414 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> +#include <linux/virtio.h> +#include <linux/virtio_config.h> +#include <linux/input.h> +#include <linux/slab.h> + +#include <uapi/linux/virtio_ids.h> +#include <uapi/linux/virtio_input.h> +#include <linux/input/mt.h> + +struct virtio_input { + struct virtio_device *vdev; + struct input_dev *idev; + char name[64]; + char serial[64]; + char phys[64]; + struct virtqueue *evt, *sts; + struct virtio_input_event evts[64]; + spinlock_t lock; + bool ready; +}; + +static void virtinput_queue_evtbuf(struct virtio_input *vi, + struct virtio_input_event *evtbuf) +{ + struct scatterlist sg[1]; + + sg_init_one(sg, evtbuf, sizeof(*evtbuf)); + virtqueue_add_inbuf(vi->evt, sg, 1, evtbuf, GFP_ATOMIC); +} + +static void virtinput_recv_events(struct virtqueue *vq) +{ + struct virtio_input *vi = vq->vdev->priv; + struct virtio_input_event *event; + unsigned long flags; + unsigned int len; + + spin_lock_irqsave(&vi->lock, flags); + if (vi->ready) { + while ((event = virtqueue_get_buf(vi->evt, &len)) != NULL) { + spin_unlock_irqrestore(&vi->lock, flags); + input_event(vi->idev, + le16_to_cpu(event->type), + le16_to_cpu(event->code), + le32_to_cpu(event->value)); + spin_lock_irqsave(&vi->lock, flags); + virtinput_queue_evtbuf(vi, event); + } + virtqueue_kick(vq); + } + spin_unlock_irqrestore(&vi->lock, flags); +} + +/* + * On error we are losing the status update, which isn't critical as + * this is typically used for stuff like keyboard leds. + */ +static int virtinput_send_status(struct virtio_input *vi, + u16 type, u16 code, s32 value) +{ + struct virtio_input_event *stsbuf; + struct scatterlist sg[1]; + unsigned long flags; + int rc; + + /* + * Since 29cc309d8bf1 (HID: hid-multitouch: forward MSC_TIMESTAMP), + * EV_MSC/MSC_TIMESTAMP is added to each before EV_SYN event. + * EV_MSC is configured as INPUT_PASS_TO_ALL. + * In case of touch device: + * BE pass EV_MSC/MSC_TIMESTAMP to FE on receiving event from evdev. + * FE pass EV_MSC/MSC_TIMESTAMP back to BE. + * BE writes EV_MSC/MSC_TIMESTAMP to evdev due to INPUT_PASS_TO_ALL. + * BE receives extra EV_MSC/MSC_TIMESTAMP and pass to FE. + * >>> Each new frame becomes larger and larger. + * Disable EV_MSC/MSC_TIMESTAMP forwarding for MT. 
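The input driver below exchanges fixed 8-byte events with the device: 16-bit type, 16-bit code and 32-bit value, all little-endian, which is why every field goes through le16/le32 conversions. A standalone sketch of that wire layout, assuming the glibc <endian.h> helpers and using it purely as an illustration, not the UAPI definition:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Standalone mirror of the 8-byte event layout carried on the virtqueues. */
struct evt_le {
	uint16_t type;
	uint16_t code;
	uint32_t value;
};

static struct evt_le pack_event(uint16_t type, uint16_t code, int32_t value)
{
	struct evt_le e = {
		.type  = htole16(type),
		.code  = htole16(code),
		.value = htole32((uint32_t)value),
	};
	return e;
}

int main(void)
{
	/* EV_KEY (0x01) / KEY_A (30) / pressed (1), values from input-event-codes.h */
	struct evt_le e = pack_event(0x01, 30, 1);
	unsigned char raw[8];

	memcpy(raw, &e, sizeof(raw));
	for (size_t i = 0; i < sizeof(raw); i++)
		printf("%02x ", raw[i]);
	printf("\n");
	return 0;
}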
+ */ + if (vi->idev->mt && type == EV_MSC && code == MSC_TIMESTAMP) + return 0; + + stsbuf = kzalloc(sizeof(*stsbuf), GFP_ATOMIC); + if (!stsbuf) + return -ENOMEM; + + stsbuf->type = cpu_to_le16(type); + stsbuf->code = cpu_to_le16(code); + stsbuf->value = cpu_to_le32(value); + sg_init_one(sg, stsbuf, sizeof(*stsbuf)); + + spin_lock_irqsave(&vi->lock, flags); + if (vi->ready) { + rc = virtqueue_add_outbuf(vi->sts, sg, 1, stsbuf, GFP_ATOMIC); + virtqueue_kick(vi->sts); + } else { + rc = -ENODEV; + } + spin_unlock_irqrestore(&vi->lock, flags); + + if (rc != 0) + kfree(stsbuf); + return rc; +} + +static void virtinput_recv_status(struct virtqueue *vq) +{ + struct virtio_input *vi = vq->vdev->priv; + struct virtio_input_event *stsbuf; + unsigned long flags; + unsigned int len; + + spin_lock_irqsave(&vi->lock, flags); + while ((stsbuf = virtqueue_get_buf(vi->sts, &len)) != NULL) + kfree(stsbuf); + spin_unlock_irqrestore(&vi->lock, flags); +} + +static int virtinput_status(struct input_dev *idev, unsigned int type, + unsigned int code, int value) +{ + struct virtio_input *vi = input_get_drvdata(idev); + + return virtinput_send_status(vi, type, code, value); +} + +static u8 virtinput_cfg_select(struct virtio_input *vi, + u8 select, u8 subsel) +{ + u8 size; + + virtio_cwrite_le(vi->vdev, struct virtio_input_config, select, &select); + virtio_cwrite_le(vi->vdev, struct virtio_input_config, subsel, &subsel); + virtio_cread_le(vi->vdev, struct virtio_input_config, size, &size); + return size; +} + +static void virtinput_cfg_bits(struct virtio_input *vi, int select, int subsel, + unsigned long *bits, unsigned int bitcount) +{ + unsigned int bit; + u8 *virtio_bits; + u8 bytes; + + bytes = virtinput_cfg_select(vi, select, subsel); + if (!bytes) + return; + if (bitcount > bytes * 8) + bitcount = bytes * 8; + + /* + * Bitmap in virtio config space is a simple stream of bytes, + * with the first byte carrying bits 0-7, second bits 8-15 and + * so on. 
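The config-space bitmaps read by virtinput_cfg_bits() are a plain byte stream: byte 0 carries bits 0-7, byte 1 carries bits 8-15, and so on. A minimal userspace sketch of the same walk, with a callback standing in for __set_bit():

#include <stdint.h>
#include <stdio.h>

/* Expand a byte-stream bitmap (byte 0 = bits 0..7, byte 1 = bits 8..15, ...)
 * into individual bit numbers, the same walk virtinput_cfg_bits() performs. */
static void for_each_set_bit_in_bytes(const uint8_t *bytes, unsigned int bitcount,
				      void (*cb)(unsigned int bit))
{
	for (unsigned int bit = 0; bit < bitcount; bit++)
		if (bytes[bit / 8] & (1u << (bit % 8)))
			cb(bit);
}

static void print_bit(unsigned int bit)
{
	printf("bit %u set\n", bit);
}

int main(void)
{
	/* Example bitmap: bits 0, 9 and 17 set. */
	const uint8_t bitmap[3] = { 0x01, 0x02, 0x02 };

	for_each_set_bit_in_bytes(bitmap, sizeof(bitmap) * 8, print_bit);
	return 0;
}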
+ */ + virtio_bits = kzalloc(bytes, GFP_KERNEL); + if (!virtio_bits) + return; + virtio_cread_bytes(vi->vdev, offsetof(struct virtio_input_config, + u.bitmap), + virtio_bits, bytes); + for (bit = 0; bit < bitcount; bit++) { + if (virtio_bits[bit / 8] & (1 << (bit % 8))) + __set_bit(bit, bits); + } + kfree(virtio_bits); + + if (select == VIRTIO_INPUT_CFG_EV_BITS) + __set_bit(subsel, vi->idev->evbit); +} + +static void virtinput_cfg_abs(struct virtio_input *vi, int abs) +{ + u32 mi, ma, re, fu, fl; + + virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_ABS_INFO, abs); + virtio_cread_le(vi->vdev, struct virtio_input_config, u.abs.min, &mi); + virtio_cread_le(vi->vdev, struct virtio_input_config, u.abs.max, &ma); + virtio_cread_le(vi->vdev, struct virtio_input_config, u.abs.res, &re); + virtio_cread_le(vi->vdev, struct virtio_input_config, u.abs.fuzz, &fu); + virtio_cread_le(vi->vdev, struct virtio_input_config, u.abs.flat, &fl); + input_set_abs_params(vi->idev, abs, mi, ma, fu, fl); + input_abs_set_res(vi->idev, abs, re); +} + +static int virtinput_init_vqs(struct virtio_input *vi) +{ + struct virtqueue *vqs[2]; + vq_callback_t *cbs[] = { virtinput_recv_events, + virtinput_recv_status }; + static const char * const names[] = { "events", "status" }; + int err; + + err = virtio_find_vqs(vi->vdev, 2, vqs, cbs, names, NULL); + if (err) + return err; + vi->evt = vqs[0]; + vi->sts = vqs[1]; + + return 0; +} + +static void virtinput_fill_evt(struct virtio_input *vi) +{ + unsigned long flags; + int i, size; + + spin_lock_irqsave(&vi->lock, flags); + size = virtqueue_get_vring_size(vi->evt); + if (size > ARRAY_SIZE(vi->evts)) + size = ARRAY_SIZE(vi->evts); + for (i = 0; i < size; i++) + virtinput_queue_evtbuf(vi, &vi->evts[i]); + virtqueue_kick(vi->evt); + spin_unlock_irqrestore(&vi->lock, flags); +} + +static int virtinput_probe(struct virtio_device *vdev) +{ + struct virtio_input *vi; + unsigned long flags; + size_t size; + int abs, err, nslots; + + if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) + return -ENODEV; + + vi = kzalloc(sizeof(*vi), GFP_KERNEL); + if (!vi) + return -ENOMEM; + + vdev->priv = vi; + vi->vdev = vdev; + spin_lock_init(&vi->lock); + + err = virtinput_init_vqs(vi); + if (err) + goto err_init_vq; + + vi->idev = input_allocate_device(); + if (!vi->idev) { + err = -ENOMEM; + goto err_input_alloc; + } + input_set_drvdata(vi->idev, vi); + + size = virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_ID_NAME, 0); + virtio_cread_bytes(vi->vdev, offsetof(struct virtio_input_config, + u.string), + vi->name, min(size, sizeof(vi->name))); + size = virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_ID_SERIAL, 0); + virtio_cread_bytes(vi->vdev, offsetof(struct virtio_input_config, + u.string), + vi->serial, min(size, sizeof(vi->serial))); + snprintf(vi->phys, sizeof(vi->phys), + "virtio%d/input0", vdev->index); + vi->idev->name = vi->name; + vi->idev->phys = vi->phys; + vi->idev->uniq = vi->serial; + + size = virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_ID_DEVIDS, 0); + if (size >= sizeof(struct virtio_input_devids)) { + virtio_cread_le(vi->vdev, struct virtio_input_config, + u.ids.bustype, &vi->idev->id.bustype); + virtio_cread_le(vi->vdev, struct virtio_input_config, + u.ids.vendor, &vi->idev->id.vendor); + virtio_cread_le(vi->vdev, struct virtio_input_config, + u.ids.product, &vi->idev->id.product); + virtio_cread_le(vi->vdev, struct virtio_input_config, + u.ids.version, &vi->idev->id.version); + } else { + vi->idev->id.bustype = BUS_VIRTUAL; + } + + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_PROP_BITS, 0, + 
vi->idev->propbit, INPUT_PROP_CNT); + size = virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_REP); + if (size) + __set_bit(EV_REP, vi->idev->evbit); + + vi->idev->dev.parent = &vdev->dev; + vi->idev->event = virtinput_status; + + /* device -> kernel */ + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_KEY, + vi->idev->keybit, KEY_CNT); + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_REL, + vi->idev->relbit, REL_CNT); + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_ABS, + vi->idev->absbit, ABS_CNT); + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_MSC, + vi->idev->mscbit, MSC_CNT); + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_SW, + vi->idev->swbit, SW_CNT); + + /* kernel -> device */ + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_LED, + vi->idev->ledbit, LED_CNT); + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_SND, + vi->idev->sndbit, SND_CNT); + + if (test_bit(EV_ABS, vi->idev->evbit)) { + for (abs = 0; abs < ABS_CNT; abs++) { + if (!test_bit(abs, vi->idev->absbit)) + continue; + virtinput_cfg_abs(vi, abs); + } + + if (test_bit(ABS_MT_SLOT, vi->idev->absbit)) { + nslots = input_abs_get_max(vi->idev, ABS_MT_SLOT) + 1; + err = input_mt_init_slots(vi->idev, nslots, 0); + if (err) + goto err_mt_init_slots; + } + } + + virtio_device_ready(vdev); + vi->ready = true; + err = input_register_device(vi->idev); + if (err) + goto err_input_register; + + virtinput_fill_evt(vi); + return 0; + +err_input_register: + spin_lock_irqsave(&vi->lock, flags); + vi->ready = false; + spin_unlock_irqrestore(&vi->lock, flags); +err_mt_init_slots: + input_free_device(vi->idev); +err_input_alloc: + vdev->config->del_vqs(vdev); +err_init_vq: + kfree(vi); + return err; +} + +static void virtinput_remove(struct virtio_device *vdev) +{ + struct virtio_input *vi = vdev->priv; + void *buf; + unsigned long flags; + + spin_lock_irqsave(&vi->lock, flags); + vi->ready = false; + spin_unlock_irqrestore(&vi->lock, flags); + + input_unregister_device(vi->idev); + virtio_reset_device(vdev); + while ((buf = virtqueue_detach_unused_buf(vi->sts)) != NULL) + kfree(buf); + vdev->config->del_vqs(vdev); + kfree(vi); +} + +#ifdef CONFIG_PM_SLEEP +static int virtinput_freeze(struct virtio_device *vdev) +{ + struct virtio_input *vi = vdev->priv; + unsigned long flags; + + spin_lock_irqsave(&vi->lock, flags); + vi->ready = false; + spin_unlock_irqrestore(&vi->lock, flags); + + vdev->config->del_vqs(vdev); + return 0; +} + +static int virtinput_restore(struct virtio_device *vdev) +{ + struct virtio_input *vi = vdev->priv; + int err; + + err = virtinput_init_vqs(vi); + if (err) + return err; + + virtio_device_ready(vdev); + vi->ready = true; + virtinput_fill_evt(vi); + return 0; +} +#endif + +static unsigned int features[] = { + /* none */ +}; +static const struct virtio_device_id id_table[] = { + { VIRTIO_ID_INPUT, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static struct virtio_driver virtio_input_driver = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .id_table = id_table, + .probe = virtinput_probe, + .remove = virtinput_remove, +#ifdef CONFIG_PM_SLEEP + .freeze = virtinput_freeze, + .restore = virtinput_restore, +#endif +}; + +module_virtio_driver(virtio_input_driver); +MODULE_DEVICE_TABLE(virtio, id_table); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Virtio input device driver"); +MODULE_AUTHOR("Gerd Hoffmann <kraxel@redhat.com>"); diff --git a/drivers/virtio/virtio_mem.c 
b/drivers/virtio/virtio_mem.c new file mode 100644 index 0000000000..fa5226c198 --- /dev/null +++ b/drivers/virtio/virtio_mem.c @@ -0,0 +1,3008 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Virtio-mem device driver. + * + * Copyright Red Hat, Inc. 2020 + * + * Author(s): David Hildenbrand <david@redhat.com> + */ + +#include <linux/virtio.h> +#include <linux/virtio_mem.h> +#include <linux/workqueue.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/memory_hotplug.h> +#include <linux/memory.h> +#include <linux/hrtimer.h> +#include <linux/crash_dump.h> +#include <linux/mutex.h> +#include <linux/bitmap.h> +#include <linux/lockdep.h> +#include <linux/log2.h> + +#include <acpi/acpi_numa.h> + +static bool unplug_online = true; +module_param(unplug_online, bool, 0644); +MODULE_PARM_DESC(unplug_online, "Try to unplug online memory"); + +static bool force_bbm; +module_param(force_bbm, bool, 0444); +MODULE_PARM_DESC(force_bbm, + "Force Big Block Mode. Default is 0 (auto-selection)"); + +static unsigned long bbm_block_size; +module_param(bbm_block_size, ulong, 0444); +MODULE_PARM_DESC(bbm_block_size, + "Big Block size in bytes. Default is 0 (auto-detection)."); + +/* + * virtio-mem currently supports the following modes of operation: + * + * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The + * size of a Sub Block (SB) is determined based on the device block size, the + * pageblock size, and the maximum allocation granularity of the buddy. + * Subblocks within a Linux memory block might either be plugged or unplugged. + * Memory is added/removed to Linux MM in Linux memory block granularity. + * + * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks. + * Memory is added/removed to Linux MM in Big Block granularity. + * + * The mode is determined automatically based on the Linux memory block size + * and the device block size. + * + * User space / core MM (auto onlining) is responsible for onlining added + * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are + * always onlined separately, and all memory within a Linux memory block is + * onlined to the same zone - virtio-mem relies on this behavior. + */ + +/* + * State of a Linux memory block in SBM. + */ +enum virtio_mem_sbm_mb_state { + /* Unplugged, not added to Linux. Can be reused later. */ + VIRTIO_MEM_SBM_MB_UNUSED = 0, + /* (Partially) plugged, not added to Linux. Error on add_memory(). */ + VIRTIO_MEM_SBM_MB_PLUGGED, + /* Fully plugged, fully added to Linux, offline. */ + VIRTIO_MEM_SBM_MB_OFFLINE, + /* Partially plugged, fully added to Linux, offline. */ + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, + /* Fully plugged, fully added to Linux, onlined to a kernel zone. */ + VIRTIO_MEM_SBM_MB_KERNEL, + /* Partially plugged, fully added to Linux, online to a kernel zone */ + VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, + /* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */ + VIRTIO_MEM_SBM_MB_MOVABLE, + /* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */ + VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, + VIRTIO_MEM_SBM_MB_COUNT +}; + +/* + * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks. + */ +enum virtio_mem_bbm_bb_state { + /* Unplugged, not added to Linux. Can be reused later. */ + VIRTIO_MEM_BBM_BB_UNUSED = 0, + /* Plugged, not added to Linux. Error on add_memory(). */ + VIRTIO_MEM_BBM_BB_PLUGGED, + /* Plugged and added to Linux. 
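Both the SBM memory-block states and the BBM big-block states are tracked the same way: one state byte per block plus a per-state counter that is updated on every transition (see virtio_mem_sbm_set_mb_state() and virtio_mem_bbm_set_bb_state() further down). A minimal userspace model of that bookkeeping, with made-up state names:

#include <assert.h>
#include <stdint.h>

/* Illustrative model: one byte of state per block plus per-state counters
 * kept in sync on every transition. */
enum blk_state { BLK_UNUSED = 0, BLK_PLUGGED, BLK_OFFLINE, BLK_ONLINE, BLK_COUNT };

#define NR_BLOCKS 8

static uint8_t blk_states[NR_BLOCKS];                       /* all BLK_UNUSED */
static unsigned long blk_count[BLK_COUNT] = { NR_BLOCKS };  /* BLK_UNUSED == 0 */

static void set_blk_state(unsigned int idx, enum blk_state state)
{
	enum blk_state old = blk_states[idx];

	blk_states[idx] = state;
	assert(blk_count[old] > 0);
	blk_count[old]--;
	blk_count[state]++;
}

int main(void)
{
	set_blk_state(3, BLK_PLUGGED);
	set_blk_state(3, BLK_OFFLINE);

	assert(blk_count[BLK_UNUSED] == NR_BLOCKS - 1);
	assert(blk_count[BLK_OFFLINE] == 1 && blk_count[BLK_PLUGGED] == 0);
	return 0;
}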
*/ + VIRTIO_MEM_BBM_BB_ADDED, + /* All online parts are fake-offline, ready to remove. */ + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE, + VIRTIO_MEM_BBM_BB_COUNT +}; + +struct virtio_mem { + struct virtio_device *vdev; + + /* We might first have to unplug all memory when starting up. */ + bool unplug_all_required; + + /* Workqueue that processes the plug/unplug requests. */ + struct work_struct wq; + atomic_t wq_active; + atomic_t config_changed; + + /* Virtqueue for guest->host requests. */ + struct virtqueue *vq; + + /* Wait for a host response to a guest request. */ + wait_queue_head_t host_resp; + + /* Space for one guest request and the host response. */ + struct virtio_mem_req req; + struct virtio_mem_resp resp; + + /* The current size of the device. */ + uint64_t plugged_size; + /* The requested size of the device. */ + uint64_t requested_size; + + /* The device block size (for communicating with the device). */ + uint64_t device_block_size; + /* The determined node id for all memory of the device. */ + int nid; + /* Physical start address of the memory region. */ + uint64_t addr; + /* Maximum region size in bytes. */ + uint64_t region_size; + + /* The parent resource for all memory added via this device. */ + struct resource *parent_resource; + /* + * Copy of "System RAM (virtio_mem)" to be used for + * add_memory_driver_managed(). + */ + const char *resource_name; + /* Memory group identification. */ + int mgid; + + /* + * We don't want to add too much memory if it's not getting onlined, + * to avoid running OOM. Besides this threshold, we allow to have at + * least two offline blocks at a time (whatever is bigger). + */ +#define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD (1024 * 1024 * 1024) + atomic64_t offline_size; + uint64_t offline_threshold; + + /* If set, the driver is in SBM, otherwise in BBM. */ + bool in_sbm; + + union { + struct { + /* Id of the first memory block of this device. */ + unsigned long first_mb_id; + /* Id of the last usable memory block of this device. */ + unsigned long last_usable_mb_id; + /* Id of the next memory bock to prepare when needed. */ + unsigned long next_mb_id; + + /* The subblock size. */ + uint64_t sb_size; + /* The number of subblocks per Linux memory block. */ + uint32_t sbs_per_mb; + + /* + * Some of the Linux memory blocks tracked as "partially + * plugged" are completely unplugged and can be offlined + * and removed -- which previously failed. + */ + bool have_unplugged_mb; + + /* Summary of all memory block states. */ + unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT]; + + /* + * One byte state per memory block. Allocated via + * vmalloc(). Resized (alloc+copy+free) on demand. + * + * With 128 MiB memory blocks, we have states for 512 + * GiB of memory in one 4 KiB page. + */ + uint8_t *mb_states; + + /* + * Bitmap: one bit per subblock. Allocated similar to + * sbm.mb_states. + * + * A set bit means the corresponding subblock is + * plugged, otherwise it's unblocked. + * + * With 4 MiB subblocks, we manage 128 GiB of memory + * in one 4 KiB page. + */ + unsigned long *sb_states; + } sbm; + + struct { + /* Id of the first big block of this device. */ + unsigned long first_bb_id; + /* Id of the last usable big block of this device. */ + unsigned long last_usable_bb_id; + /* Id of the next device bock to prepare when needed. */ + unsigned long next_bb_id; + + /* Summary of all big block states. */ + unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT]; + + /* One byte state per big block. See sbm.mb_states. 
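The sizing claims in the comments above (one 4 KiB page of byte states covers 512 GiB with 128 MiB memory blocks; one 4 KiB page of subblock bits covers 128 GiB with 4 MiB subblocks) can be checked with a few lines of arithmetic. A tiny standalone program that reproduces both numbers; the block and subblock sizes are the example values from the comments:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t page = 4096;
	const uint64_t mib = 1024ull * 1024;
	const uint64_t memory_block = 128 * mib;   /* example Linux memory block size */
	const uint64_t subblock = 4 * mib;         /* example subblock size */

	/* one byte of state per memory block */
	printf("byte states: %llu GiB tracked per 4 KiB page\n",
	       (unsigned long long)(page * memory_block / (1024 * mib)));
	/* one bit per subblock */
	printf("sb bitmap:   %llu GiB tracked per 4 KiB page\n",
	       (unsigned long long)(page * 8 * subblock / (1024 * mib)));
	return 0;
}

Output is 512 GiB and 128 GiB, matching the comments.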
*/ + uint8_t *bb_states; + + /* The block size used for plugging/adding/removing. */ + uint64_t bb_size; + } bbm; + }; + + /* + * Mutex that protects the sbm.mb_count, sbm.mb_states, + * sbm.sb_states, bbm.bb_count, and bbm.bb_states + * + * When this lock is held the pointers can't change, ONLINE and + * OFFLINE blocks can't change the state and no subblocks will get + * plugged/unplugged. + * + * In kdump mode, used to serialize requests, last_block_addr and + * last_block_plugged. + */ + struct mutex hotplug_mutex; + bool hotplug_active; + + /* An error occurred we cannot handle - stop processing requests. */ + bool broken; + + /* Cached valued of is_kdump_kernel() when the device was probed. */ + bool in_kdump; + + /* The driver is being removed. */ + spinlock_t removal_lock; + bool removing; + + /* Timer for retrying to plug/unplug memory. */ + struct hrtimer retry_timer; + unsigned int retry_timer_ms; +#define VIRTIO_MEM_RETRY_TIMER_MIN_MS 50000 +#define VIRTIO_MEM_RETRY_TIMER_MAX_MS 300000 + + /* Memory notifier (online/offline events). */ + struct notifier_block memory_notifier; + +#ifdef CONFIG_PROC_VMCORE + /* vmcore callback for /proc/vmcore handling in kdump mode */ + struct vmcore_cb vmcore_cb; + uint64_t last_block_addr; + bool last_block_plugged; +#endif /* CONFIG_PROC_VMCORE */ + + /* Next device in the list of virtio-mem devices. */ + struct list_head next; +}; + +/* + * We have to share a single online_page callback among all virtio-mem + * devices. We use RCU to iterate the list in the callback. + */ +static DEFINE_MUTEX(virtio_mem_mutex); +static LIST_HEAD(virtio_mem_devices); + +static void virtio_mem_online_page_cb(struct page *page, unsigned int order); +static void virtio_mem_fake_offline_going_offline(unsigned long pfn, + unsigned long nr_pages); +static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, + unsigned long nr_pages); +static void virtio_mem_retry(struct virtio_mem *vm); +static int virtio_mem_create_resource(struct virtio_mem *vm); +static void virtio_mem_delete_resource(struct virtio_mem *vm); + +/* + * Register a virtio-mem device so it will be considered for the online_page + * callback. + */ +static int register_virtio_mem_device(struct virtio_mem *vm) +{ + int rc = 0; + + /* First device registers the callback. */ + mutex_lock(&virtio_mem_mutex); + if (list_empty(&virtio_mem_devices)) + rc = set_online_page_callback(&virtio_mem_online_page_cb); + if (!rc) + list_add_rcu(&vm->next, &virtio_mem_devices); + mutex_unlock(&virtio_mem_mutex); + + return rc; +} + +/* + * Unregister a virtio-mem device so it will no longer be considered for the + * online_page callback. + */ +static void unregister_virtio_mem_device(struct virtio_mem *vm) +{ + /* Last device unregisters the callback. */ + mutex_lock(&virtio_mem_mutex); + list_del_rcu(&vm->next); + if (list_empty(&virtio_mem_devices)) + restore_online_page_callback(&virtio_mem_online_page_cb); + mutex_unlock(&virtio_mem_mutex); + + synchronize_rcu(); +} + +/* + * Calculate the memory block id of a given address. + */ +static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr) +{ + return addr / memory_block_size_bytes(); +} + +/* + * Calculate the physical start address of a given memory block id. + */ +static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id) +{ + return mb_id * memory_block_size_bytes(); +} + +/* + * Calculate the big block id of a given address. 
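All of the id helpers that follow are simple integer division: a memory-block or big-block id is the physical address divided by the respective block size, and a subblock id is the offset within its memory block divided by the subblock size. A standalone sketch with illustrative sizes (128 MiB memory blocks, 4 MiB subblocks):

#include <assert.h>
#include <stdint.h>

#define MB_SIZE (128ull << 20)   /* example memory block size */
#define SB_SIZE (4ull << 20)     /* example subblock size */

static uint64_t phys_to_mb_id(uint64_t addr)  { return addr / MB_SIZE; }
static uint64_t mb_id_to_phys(uint64_t mb_id) { return mb_id * MB_SIZE; }

static uint64_t phys_to_sb_id(uint64_t addr)
{
	return (addr - mb_id_to_phys(phys_to_mb_id(addr))) / SB_SIZE;
}

int main(void)
{
	uint64_t addr = 3 * MB_SIZE + 5 * SB_SIZE + 4096;

	assert(phys_to_mb_id(addr) == 3);   /* fourth memory block */
	assert(phys_to_sb_id(addr) == 5);   /* sixth subblock within it */
	return 0;
}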
+ */ +static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm, + uint64_t addr) +{ + return addr / vm->bbm.bb_size; +} + +/* + * Calculate the physical start address of a given big block id. + */ +static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm, + unsigned long bb_id) +{ + return bb_id * vm->bbm.bb_size; +} + +/* + * Calculate the subblock id of a given address. + */ +static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm, + unsigned long addr) +{ + const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr); + const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id); + + return (addr - mb_addr) / vm->sbm.sb_size; +} + +/* + * Set the state of a big block, taking care of the state counter. + */ +static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm, + unsigned long bb_id, + enum virtio_mem_bbm_bb_state state) +{ + const unsigned long idx = bb_id - vm->bbm.first_bb_id; + enum virtio_mem_bbm_bb_state old_state; + + old_state = vm->bbm.bb_states[idx]; + vm->bbm.bb_states[idx] = state; + + BUG_ON(vm->bbm.bb_count[old_state] == 0); + vm->bbm.bb_count[old_state]--; + vm->bbm.bb_count[state]++; +} + +/* + * Get the state of a big block. + */ +static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm, + unsigned long bb_id) +{ + return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id]; +} + +/* + * Prepare the big block state array for the next big block. + */ +static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm) +{ + unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id; + unsigned long new_bytes = old_bytes + 1; + int old_pages = PFN_UP(old_bytes); + int new_pages = PFN_UP(new_bytes); + uint8_t *new_array; + + if (vm->bbm.bb_states && old_pages == new_pages) + return 0; + + new_array = vzalloc(new_pages * PAGE_SIZE); + if (!new_array) + return -ENOMEM; + + mutex_lock(&vm->hotplug_mutex); + if (vm->bbm.bb_states) + memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE); + vfree(vm->bbm.bb_states); + vm->bbm.bb_states = new_array; + mutex_unlock(&vm->hotplug_mutex); + + return 0; +} + +#define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \ + for (_bb_id = vm->bbm.first_bb_id; \ + _bb_id < vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \ + _bb_id++) \ + if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) + +#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \ + for (_bb_id = vm->bbm.next_bb_id - 1; \ + _bb_id >= vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \ + _bb_id--) \ + if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) + +/* + * Set the state of a memory block, taking care of the state counter. + */ +static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm, + unsigned long mb_id, uint8_t state) +{ + const unsigned long idx = mb_id - vm->sbm.first_mb_id; + uint8_t old_state; + + old_state = vm->sbm.mb_states[idx]; + vm->sbm.mb_states[idx] = state; + + BUG_ON(vm->sbm.mb_count[old_state] == 0); + vm->sbm.mb_count[old_state]--; + vm->sbm.mb_count[state]++; +} + +/* + * Get the state of a memory block. + */ +static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm, + unsigned long mb_id) +{ + const unsigned long idx = mb_id - vm->sbm.first_mb_id; + + return vm->sbm.mb_states[idx]; +} + +/* + * Prepare the state array for the next memory block. 
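The *_prepare_next_* helpers grow their tracking arrays a page at a time: allocate a zeroed replacement, copy the old contents, then swap the pointer (the driver does the swap under hotplug_mutex). A single-threaded userspace sketch of the same pattern, using calloc/free in place of vzalloc/vfree:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SZ 4096ul

static uint8_t *states;
static unsigned long states_pages;

/* Ensure there is room for one more state byte, growing in page units. */
static int states_prepare_next(unsigned long nr_used)
{
	unsigned long new_pages = (nr_used + 1 + PAGE_SZ - 1) / PAGE_SZ;
	uint8_t *new_array;

	if (states && new_pages == states_pages)
		return 0;                       /* still fits, nothing to do */

	new_array = calloc(new_pages, PAGE_SZ);
	if (!new_array)
		return -1;

	if (states)
		memcpy(new_array, states, states_pages * PAGE_SZ);
	free(states);
	states = new_array;
	states_pages = new_pages;
	return 0;
}

int main(void)
{
	for (unsigned long i = 0; i < 10000; i++)
		if (states_prepare_next(i))
			return 1;
	free(states);
	return 0;
}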
+ */ +static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm) +{ + int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id); + int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1); + uint8_t *new_array; + + if (vm->sbm.mb_states && old_pages == new_pages) + return 0; + + new_array = vzalloc(new_pages * PAGE_SIZE); + if (!new_array) + return -ENOMEM; + + mutex_lock(&vm->hotplug_mutex); + if (vm->sbm.mb_states) + memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE); + vfree(vm->sbm.mb_states); + vm->sbm.mb_states = new_array; + mutex_unlock(&vm->hotplug_mutex); + + return 0; +} + +#define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \ + for (_mb_id = _vm->sbm.first_mb_id; \ + _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \ + _mb_id++) \ + if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) + +#define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \ + for (_mb_id = _vm->sbm.next_mb_id - 1; \ + _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \ + _mb_id--) \ + if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) + +/* + * Calculate the bit number in the subblock bitmap for the given subblock + * inside the given memory block. + */ +static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm, + unsigned long mb_id, int sb_id) +{ + return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id; +} + +/* + * Mark all selected subblocks plugged. + * + * Will not modify the state of the memory block. + */ +static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) +{ + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); + + __bitmap_set(vm->sbm.sb_states, bit, count); +} + +/* + * Mark all selected subblocks unplugged. + * + * Will not modify the state of the memory block. + */ +static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) +{ + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); + + __bitmap_clear(vm->sbm.sb_states, bit, count); +} + +/* + * Test if all selected subblocks are plugged. + */ +static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) +{ + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); + + if (count == 1) + return test_bit(bit, vm->sbm.sb_states); + + /* TODO: Helper similar to bitmap_set() */ + return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >= + bit + count; +} + +/* + * Test if all selected subblocks are unplugged. + */ +static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) +{ + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); + + /* TODO: Helper similar to bitmap_set() */ + return find_next_bit(vm->sbm.sb_states, bit + count, bit) >= + bit + count; +} + +/* + * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is + * none. + */ +static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm, + unsigned long mb_id) +{ + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0); + + return find_next_zero_bit(vm->sbm.sb_states, + bit + vm->sbm.sbs_per_mb, bit) - bit; +} + +/* + * Prepare the subblock bitmap for the next memory block. 
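The subblock bitmap helpers below boil down to range operations on one shared bitmap: each memory block owns sbs_per_mb consecutive bits starting at (mb_id - first_mb_id) * sbs_per_mb, and "all plugged" / "all unplugged" are range scans. A compact userspace model (a bool per bit for clarity, sbs_per_mb chosen arbitrarily):

#include <assert.h>
#include <stdbool.h>

#define SBS_PER_MB 32
#define NR_MB      4

static bool sb_plugged[NR_MB * SBS_PER_MB];

static unsigned int sb_bit_nr(unsigned int mb_idx, unsigned int sb_id)
{
	return mb_idx * SBS_PER_MB + sb_id;
}

static void sb_set(unsigned int mb_idx, unsigned int sb_id, unsigned int count, bool val)
{
	for (unsigned int i = 0; i < count; i++)
		sb_plugged[sb_bit_nr(mb_idx, sb_id + i)] = val;
}

/* True if every subblock in the range has the given plugged state. */
static bool sb_all(unsigned int mb_idx, unsigned int sb_id, unsigned int count, bool val)
{
	for (unsigned int i = 0; i < count; i++)
		if (sb_plugged[sb_bit_nr(mb_idx, sb_id + i)] != val)
			return false;
	return true;
}

int main(void)
{
	sb_set(1, 0, 8, true);                        /* plug sb 0..7 of block 1 */
	assert(sb_all(1, 0, 8, true));                /* that range is fully plugged */
	assert(!sb_all(1, 0, SBS_PER_MB, true));      /* block is not fully plugged */
	assert(sb_all(1, 8, SBS_PER_MB - 8, false));  /* the rest is unplugged */
	return 0;
}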
+ */ +static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm) +{ + const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id; + const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb; + const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb; + int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long)); + int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long)); + unsigned long *new_bitmap, *old_bitmap; + + if (vm->sbm.sb_states && old_pages == new_pages) + return 0; + + new_bitmap = vzalloc(new_pages * PAGE_SIZE); + if (!new_bitmap) + return -ENOMEM; + + mutex_lock(&vm->hotplug_mutex); + if (vm->sbm.sb_states) + memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE); + + old_bitmap = vm->sbm.sb_states; + vm->sbm.sb_states = new_bitmap; + mutex_unlock(&vm->hotplug_mutex); + + vfree(old_bitmap); + return 0; +} + +/* + * Test if we could add memory without creating too much offline memory - + * to avoid running OOM if memory is getting onlined deferred. + */ +static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size) +{ + if (WARN_ON_ONCE(size > vm->offline_threshold)) + return false; + + return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold; +} + +/* + * Try adding memory to Linux. Will usually only fail if out of memory. + * + * Must not be called with the vm->hotplug_mutex held (possible deadlock with + * onlining code). + * + * Will not modify the state of memory blocks in virtio-mem. + */ +static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr, + uint64_t size) +{ + int rc; + + /* + * When force-unloading the driver and we still have memory added to + * Linux, the resource name has to stay. + */ + if (!vm->resource_name) { + vm->resource_name = kstrdup_const("System RAM (virtio_mem)", + GFP_KERNEL); + if (!vm->resource_name) + return -ENOMEM; + } + + dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr, + addr + size - 1); + /* Memory might get onlined immediately. */ + atomic64_add(size, &vm->offline_size); + rc = add_memory_driver_managed(vm->mgid, addr, size, vm->resource_name, + MHP_MERGE_RESOURCE | MHP_NID_IS_MGID); + if (rc) { + atomic64_sub(size, &vm->offline_size); + dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc); + /* + * TODO: Linux MM does not properly clean up yet in all cases + * where adding of memory failed - especially on -ENOMEM. + */ + } + return rc; +} + +/* + * See virtio_mem_add_memory(): Try adding a single Linux memory block. + */ +static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id) +{ + const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); + const uint64_t size = memory_block_size_bytes(); + + return virtio_mem_add_memory(vm, addr, size); +} + +/* + * See virtio_mem_add_memory(): Try adding a big block. + */ +static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id) +{ + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); + const uint64_t size = vm->bbm.bb_size; + + return virtio_mem_add_memory(vm, addr, size); +} + +/* + * Try removing memory from Linux. Will only fail if memory blocks aren't + * offline. + * + * Must not be called with the vm->hotplug_mutex held (possible deadlock with + * onlining code). + * + * Will not modify the state of memory blocks in virtio-mem. 
+ */ +static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr, + uint64_t size) +{ + int rc; + + dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr, + addr + size - 1); + rc = remove_memory(addr, size); + if (!rc) { + atomic64_sub(size, &vm->offline_size); + /* + * We might have freed up memory we can now unplug, retry + * immediately instead of waiting. + */ + virtio_mem_retry(vm); + } else { + dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc); + } + return rc; +} + +/* + * See virtio_mem_remove_memory(): Try removing a single Linux memory block. + */ +static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id) +{ + const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); + const uint64_t size = memory_block_size_bytes(); + + return virtio_mem_remove_memory(vm, addr, size); +} + +/* + * Try offlining and removing memory from Linux. + * + * Must not be called with the vm->hotplug_mutex held (possible deadlock with + * onlining code). + * + * Will not modify the state of memory blocks in virtio-mem. + */ +static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm, + uint64_t addr, + uint64_t size) +{ + int rc; + + dev_dbg(&vm->vdev->dev, + "offlining and removing memory: 0x%llx - 0x%llx\n", addr, + addr + size - 1); + + rc = offline_and_remove_memory(addr, size); + if (!rc) { + atomic64_sub(size, &vm->offline_size); + /* + * We might have freed up memory we can now unplug, retry + * immediately instead of waiting. + */ + virtio_mem_retry(vm); + return 0; + } + dev_dbg(&vm->vdev->dev, "offlining and removing memory failed: %d\n", rc); + /* + * We don't really expect this to fail, because we fake-offlined all + * memory already. But it could fail in corner cases. + */ + WARN_ON_ONCE(rc != -ENOMEM && rc != -EBUSY); + return rc == -ENOMEM ? -ENOMEM : -EBUSY; +} + +/* + * See virtio_mem_offline_and_remove_memory(): Try offlining and removing + * a single Linux memory block. + */ +static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm, + unsigned long mb_id) +{ + const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); + const uint64_t size = memory_block_size_bytes(); + + return virtio_mem_offline_and_remove_memory(vm, addr, size); +} + +/* + * Try (offlining and) removing memory from Linux in case all subblocks are + * unplugged. Can be called on online and offline memory blocks. + * + * May modify the state of memory blocks in virtio-mem. + */ +static int virtio_mem_sbm_try_remove_unplugged_mb(struct virtio_mem *vm, + unsigned long mb_id) +{ + int rc; + + /* + * Once all subblocks of a memory block were unplugged, offline and + * remove it. + */ + if (!virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) + return 0; + + /* offline_and_remove_memory() works for online and offline memory. */ + mutex_unlock(&vm->hotplug_mutex); + rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id); + mutex_lock(&vm->hotplug_mutex); + if (!rc) + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_UNUSED); + return rc; +} + +/* + * See virtio_mem_offline_and_remove_memory(): Try to offline and remove a + * all Linux memory blocks covered by the big block. + */ +static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm, + unsigned long bb_id) +{ + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); + const uint64_t size = vm->bbm.bb_size; + + return virtio_mem_offline_and_remove_memory(vm, addr, size); +} + +/* + * Trigger the workqueue so the device can perform its magic. 
+ */ +static void virtio_mem_retry(struct virtio_mem *vm) +{ + unsigned long flags; + + spin_lock_irqsave(&vm->removal_lock, flags); + if (!vm->removing) + queue_work(system_freezable_wq, &vm->wq); + spin_unlock_irqrestore(&vm->removal_lock, flags); +} + +static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id) +{ + int node = NUMA_NO_NODE; + +#if defined(CONFIG_ACPI_NUMA) + if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM)) + node = pxm_to_node(node_id); +#endif + return node; +} + +/* + * Test if a virtio-mem device overlaps with the given range. Can be called + * from (notifier) callbacks lockless. + */ +static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start, + uint64_t size) +{ + return start < vm->addr + vm->region_size && vm->addr < start + size; +} + +/* + * Test if a virtio-mem device contains a given range. Can be called from + * (notifier) callbacks lockless. + */ +static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start, + uint64_t size) +{ + return start >= vm->addr && start + size <= vm->addr + vm->region_size; +} + +static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm, + unsigned long mb_id) +{ + switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { + case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: + case VIRTIO_MEM_SBM_MB_OFFLINE: + return NOTIFY_OK; + default: + break; + } + dev_warn_ratelimited(&vm->vdev->dev, + "memory block onlining denied\n"); + return NOTIFY_BAD; +} + +static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm, + unsigned long mb_id) +{ + switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { + case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: + case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); + break; + case VIRTIO_MEM_SBM_MB_KERNEL: + case VIRTIO_MEM_SBM_MB_MOVABLE: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE); + break; + default: + BUG(); + break; + } +} + +static void virtio_mem_sbm_notify_online(struct virtio_mem *vm, + unsigned long mb_id, + unsigned long start_pfn) +{ + const bool is_movable = is_zone_movable_page(pfn_to_page(start_pfn)); + int new_state; + + switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { + case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: + new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL; + if (is_movable) + new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL; + break; + case VIRTIO_MEM_SBM_MB_OFFLINE: + new_state = VIRTIO_MEM_SBM_MB_KERNEL; + if (is_movable) + new_state = VIRTIO_MEM_SBM_MB_MOVABLE; + break; + default: + BUG(); + break; + } + virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); +} + +static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm, + unsigned long mb_id) +{ + const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); + unsigned long pfn; + int sb_id; + + for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { + if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) + continue; + pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + + sb_id * vm->sbm.sb_size); + virtio_mem_fake_offline_going_offline(pfn, nr_pages); + } +} + +static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm, + unsigned long mb_id) +{ + const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); + unsigned long pfn; + int sb_id; + + for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { + if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) + continue; + pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + + sb_id * vm->sbm.sb_size); + 
virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); + } +} + +static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm, + unsigned long bb_id, + unsigned long pfn, + unsigned long nr_pages) +{ + /* + * When marked as "fake-offline", all online memory of this device block + * is allocated by us. Otherwise, we don't have any memory allocated. + */ + if (virtio_mem_bbm_get_bb_state(vm, bb_id) != + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) + return; + virtio_mem_fake_offline_going_offline(pfn, nr_pages); +} + +static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm, + unsigned long bb_id, + unsigned long pfn, + unsigned long nr_pages) +{ + if (virtio_mem_bbm_get_bb_state(vm, bb_id) != + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) + return; + virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); +} + +/* + * This callback will either be called synchronously from add_memory() or + * asynchronously (e.g., triggered via user space). We have to be careful + * with locking when calling add_memory(). + */ +static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, + unsigned long action, void *arg) +{ + struct virtio_mem *vm = container_of(nb, struct virtio_mem, + memory_notifier); + struct memory_notify *mhp = arg; + const unsigned long start = PFN_PHYS(mhp->start_pfn); + const unsigned long size = PFN_PHYS(mhp->nr_pages); + int rc = NOTIFY_OK; + unsigned long id; + + if (!virtio_mem_overlaps_range(vm, start, size)) + return NOTIFY_DONE; + + if (vm->in_sbm) { + id = virtio_mem_phys_to_mb_id(start); + /* + * In SBM, we add memory in separate memory blocks - we expect + * it to be onlined/offlined in the same granularity. Bail out + * if this ever changes. + */ + if (WARN_ON_ONCE(size != memory_block_size_bytes() || + !IS_ALIGNED(start, memory_block_size_bytes()))) + return NOTIFY_BAD; + } else { + id = virtio_mem_phys_to_bb_id(vm, start); + /* + * In BBM, we only care about onlining/offlining happening + * within a single big block, we don't care about the + * actual granularity as we don't track individual Linux + * memory blocks. + */ + if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1))) + return NOTIFY_BAD; + } + + /* + * Avoid circular locking lockdep warnings. We lock the mutex + * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The + * blocking_notifier_call_chain() has it's own lock, which gets unlocked + * between both notifier calls and will bail out. False positive. + */ + lockdep_off(); + + switch (action) { + case MEM_GOING_OFFLINE: + mutex_lock(&vm->hotplug_mutex); + if (vm->removing) { + rc = notifier_from_errno(-EBUSY); + mutex_unlock(&vm->hotplug_mutex); + break; + } + vm->hotplug_active = true; + if (vm->in_sbm) + virtio_mem_sbm_notify_going_offline(vm, id); + else + virtio_mem_bbm_notify_going_offline(vm, id, + mhp->start_pfn, + mhp->nr_pages); + break; + case MEM_GOING_ONLINE: + mutex_lock(&vm->hotplug_mutex); + if (vm->removing) { + rc = notifier_from_errno(-EBUSY); + mutex_unlock(&vm->hotplug_mutex); + break; + } + vm->hotplug_active = true; + if (vm->in_sbm) + rc = virtio_mem_sbm_notify_going_online(vm, id); + break; + case MEM_OFFLINE: + if (vm->in_sbm) + virtio_mem_sbm_notify_offline(vm, id); + + atomic64_add(size, &vm->offline_size); + /* + * Trigger the workqueue. Now that we have some offline memory, + * maybe we can handle pending unplug requests. 
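The notifier callback above sanity-checks the granularity of the event: in SBM the onlined/offlined range must be exactly one aligned Linux memory block, while in BBM it only has to stay within a single big block. A small standalone model of those two checks, with example sizes:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define MEMORY_BLOCK_SIZE (128ull << 20)   /* example Linux memory block size */

static bool sbm_range_ok(uint64_t start, uint64_t size)
{
	return size == MEMORY_BLOCK_SIZE && (start % MEMORY_BLOCK_SIZE) == 0;
}

static bool bbm_range_ok(uint64_t start, uint64_t size, uint64_t bb_size)
{
	/* start and end must map to the same big block id */
	return start / bb_size == (start + size - 1) / bb_size;
}

int main(void)
{
	assert(sbm_range_ok(4 * MEMORY_BLOCK_SIZE, MEMORY_BLOCK_SIZE));
	assert(!sbm_range_ok(4 * MEMORY_BLOCK_SIZE + 4096, MEMORY_BLOCK_SIZE));

	/* with 2 GiB big blocks, a 128 MiB range at 3 GiB stays in one block */
	assert(bbm_range_ok(3ull << 30, 128ull << 20, 2ull << 30));
	assert(!bbm_range_ok(3ull << 30, 2ull << 30, 2ull << 30));
	return 0;
}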
+ */ + if (!unplug_online) + virtio_mem_retry(vm); + + vm->hotplug_active = false; + mutex_unlock(&vm->hotplug_mutex); + break; + case MEM_ONLINE: + if (vm->in_sbm) + virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn); + + atomic64_sub(size, &vm->offline_size); + /* + * Start adding more memory once we onlined half of our + * threshold. Don't trigger if it's possibly due to our actipn + * (e.g., us adding memory which gets onlined immediately from + * the core). + */ + if (!atomic_read(&vm->wq_active) && + virtio_mem_could_add_memory(vm, vm->offline_threshold / 2)) + virtio_mem_retry(vm); + + vm->hotplug_active = false; + mutex_unlock(&vm->hotplug_mutex); + break; + case MEM_CANCEL_OFFLINE: + if (!vm->hotplug_active) + break; + if (vm->in_sbm) + virtio_mem_sbm_notify_cancel_offline(vm, id); + else + virtio_mem_bbm_notify_cancel_offline(vm, id, + mhp->start_pfn, + mhp->nr_pages); + vm->hotplug_active = false; + mutex_unlock(&vm->hotplug_mutex); + break; + case MEM_CANCEL_ONLINE: + if (!vm->hotplug_active) + break; + vm->hotplug_active = false; + mutex_unlock(&vm->hotplug_mutex); + break; + default: + break; + } + + lockdep_on(); + + return rc; +} + +/* + * Set a range of pages PG_offline. Remember pages that were never onlined + * (via generic_online_page()) using PageDirty(). + */ +static void virtio_mem_set_fake_offline(unsigned long pfn, + unsigned long nr_pages, bool onlined) +{ + page_offline_begin(); + for (; nr_pages--; pfn++) { + struct page *page = pfn_to_page(pfn); + + __SetPageOffline(page); + if (!onlined) { + SetPageDirty(page); + /* FIXME: remove after cleanups */ + ClearPageReserved(page); + } + } + page_offline_end(); +} + +/* + * Clear PG_offline from a range of pages. If the pages were never onlined, + * (via generic_online_page()), clear PageDirty(). + */ +static void virtio_mem_clear_fake_offline(unsigned long pfn, + unsigned long nr_pages, bool onlined) +{ + for (; nr_pages--; pfn++) { + struct page *page = pfn_to_page(pfn); + + __ClearPageOffline(page); + if (!onlined) + ClearPageDirty(page); + } +} + +/* + * Release a range of fake-offline pages to the buddy, effectively + * fake-onlining them. + */ +static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) +{ + unsigned long order = MAX_ORDER; + unsigned long i; + + /* + * We might get called for ranges that don't cover properly aligned + * MAX_ORDER pages; however, we can only online properly aligned + * pages with an order of MAX_ORDER at maximum. + */ + while (!IS_ALIGNED(pfn | nr_pages, 1 << order)) + order--; + + for (i = 0; i < nr_pages; i += 1 << order) { + struct page *page = pfn_to_page(pfn + i); + + /* + * If the page is PageDirty(), it was kept fake-offline when + * onlining the memory block. Otherwise, it was allocated + * using alloc_contig_range(). All pages in a subblock are + * alike. + */ + if (PageDirty(page)) { + virtio_mem_clear_fake_offline(pfn + i, 1 << order, false); + generic_online_page(page, order); + } else { + virtio_mem_clear_fake_offline(pfn + i, 1 << order, true); + free_contig_range(pfn + i, 1 << order); + adjust_managed_page_count(page, 1 << order); + } + } +} + +/* + * Try to allocate a range, marking pages fake-offline, effectively + * fake-offlining them. 
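virtio_mem_fake_online() above picks the largest order for which both the starting pfn and the number of pages are aligned, then walks the range in 1 << order chunks. The alignment trick (testing pfn | nr_pages) works because a bit is aligned only if it is clear in both values. A standalone sketch; the maximum order is an example value, not the kernel's MAX_ORDER:

#include <assert.h>
#include <stdio.h>

#define MAX_ORDER_MODEL 10

/* Shrink the order until both pfn and nr_pages are aligned to 1 << order. */
static unsigned int pick_order(unsigned long pfn, unsigned long nr_pages)
{
	unsigned int order = MAX_ORDER_MODEL;

	while ((pfn | nr_pages) & ((1ul << order) - 1))
		order--;
	return order;
}

int main(void)
{
	unsigned long pfn = 0x1200, nr_pages = 0x200;   /* both 512-page aligned */
	unsigned int order = pick_order(pfn, nr_pages);

	assert(order == 9);
	for (unsigned long i = 0; i < nr_pages; i += 1ul << order)
		printf("chunk at pfn 0x%lx, order %u\n", pfn + i, order);
	return 0;
}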
+ */ +static int virtio_mem_fake_offline(struct virtio_mem *vm, unsigned long pfn, + unsigned long nr_pages) +{ + const bool is_movable = is_zone_movable_page(pfn_to_page(pfn)); + int rc, retry_count; + + /* + * TODO: We want an alloc_contig_range() mode that tries to allocate + * harder (e.g., dealing with temporarily pinned pages, PCP), especially + * with ZONE_MOVABLE. So for now, retry a couple of times with + * ZONE_MOVABLE before giving up - because that zone is supposed to give + * some guarantees. + */ + for (retry_count = 0; retry_count < 5; retry_count++) { + /* + * If the config changed, stop immediately and go back to the + * main loop: avoid trying to keep unplugging if the device + * might have decided to not remove any more memory. + */ + if (atomic_read(&vm->config_changed)) + return -EAGAIN; + + rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE, + GFP_KERNEL); + if (rc == -ENOMEM) + /* whoops, out of memory */ + return rc; + else if (rc && !is_movable) + break; + else if (rc) + continue; + + virtio_mem_set_fake_offline(pfn, nr_pages, true); + adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); + return 0; + } + + return -EBUSY; +} + +/* + * Handle fake-offline pages when memory is going offline - such that the + * pages can be skipped by mm-core when offlining. + */ +static void virtio_mem_fake_offline_going_offline(unsigned long pfn, + unsigned long nr_pages) +{ + struct page *page; + unsigned long i; + + /* + * Drop our reference to the pages so the memory can get offlined + * and add the unplugged pages to the managed page counters (so + * offlining code can correctly subtract them again). + */ + adjust_managed_page_count(pfn_to_page(pfn), nr_pages); + /* Drop our reference to the pages so the memory can get offlined. */ + for (i = 0; i < nr_pages; i++) { + page = pfn_to_page(pfn + i); + if (WARN_ON(!page_ref_dec_and_test(page))) + dump_page(page, "fake-offline page referenced"); + } +} + +/* + * Handle fake-offline pages when memory offlining is canceled - to undo + * what we did in virtio_mem_fake_offline_going_offline(). + */ +static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, + unsigned long nr_pages) +{ + unsigned long i; + + /* + * Get the reference we dropped when going offline and subtract the + * unplugged pages from the managed page counters. + */ + adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); + for (i = 0; i < nr_pages; i++) + page_ref_inc(pfn_to_page(pfn + i)); +} + +static void virtio_mem_online_page(struct virtio_mem *vm, + struct page *page, unsigned int order) +{ + const unsigned long start = page_to_phys(page); + const unsigned long end = start + PFN_PHYS(1 << order); + unsigned long addr, next, id, sb_id, count; + bool do_online; + + /* + * We can get called with any order up to MAX_ORDER. If our subblock + * size is smaller than that and we have a mixture of plugged and + * unplugged subblocks within such a page, we have to process in + * smaller granularity. In that case we'll adjust the order exactly once + * within the loop. + */ + for (addr = start; addr < end; ) { + next = addr + PFN_PHYS(1 << order); + + if (vm->in_sbm) { + id = virtio_mem_phys_to_mb_id(addr); + sb_id = virtio_mem_phys_to_sb_id(vm, addr); + count = virtio_mem_phys_to_sb_id(vm, next - 1) - sb_id + 1; + + if (virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, count)) { + /* Fully plugged. */ + do_online = true; + } else if (count == 1 || + virtio_mem_sbm_test_sb_unplugged(vm, id, sb_id, count)) { + /* Fully unplugged. 
*/ + do_online = false; + } else { + /* + * Mixture, process sub-blocks instead. This + * will be at least the size of a pageblock. + * We'll run into this case exactly once. + */ + order = ilog2(vm->sbm.sb_size) - PAGE_SHIFT; + do_online = virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, 1); + continue; + } + } else { + /* + * If the whole block is marked fake offline, keep + * everything that way. + */ + id = virtio_mem_phys_to_bb_id(vm, addr); + do_online = virtio_mem_bbm_get_bb_state(vm, id) != + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE; + } + + if (do_online) + generic_online_page(pfn_to_page(PFN_DOWN(addr)), order); + else + virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order, + false); + addr = next; + } +} + +static void virtio_mem_online_page_cb(struct page *page, unsigned int order) +{ + const unsigned long addr = page_to_phys(page); + struct virtio_mem *vm; + + rcu_read_lock(); + list_for_each_entry_rcu(vm, &virtio_mem_devices, next) { + /* + * Pages we're onlining will never cross memory blocks and, + * therefore, not virtio-mem devices. + */ + if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order))) + continue; + + /* + * virtio_mem_set_fake_offline() might sleep. We can safely + * drop the RCU lock at this point because the device + * cannot go away. See virtio_mem_remove() how races + * between memory onlining and device removal are handled. + */ + rcu_read_unlock(); + + virtio_mem_online_page(vm, page, order); + return; + } + rcu_read_unlock(); + + /* not virtio-mem memory, but e.g., a DIMM. online it */ + generic_online_page(page, order); +} + +static uint64_t virtio_mem_send_request(struct virtio_mem *vm, + const struct virtio_mem_req *req) +{ + struct scatterlist *sgs[2], sg_req, sg_resp; + unsigned int len; + int rc; + + /* don't use the request residing on the stack (vaddr) */ + vm->req = *req; + + /* out: buffer for request */ + sg_init_one(&sg_req, &vm->req, sizeof(vm->req)); + sgs[0] = &sg_req; + + /* in: buffer for response */ + sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp)); + sgs[1] = &sg_resp; + + rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL); + if (rc < 0) + return rc; + + virtqueue_kick(vm->vq); + + /* wait for a response */ + wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len)); + + return virtio16_to_cpu(vm->vdev, vm->resp.type); +} + +static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr, + uint64_t size) +{ + const uint64_t nb_vm_blocks = size / vm->device_block_size; + const struct virtio_mem_req req = { + .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG), + .u.plug.addr = cpu_to_virtio64(vm->vdev, addr), + .u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), + }; + int rc = -ENOMEM; + + if (atomic_read(&vm->config_changed)) + return -EAGAIN; + + dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr, + addr + size - 1); + + switch (virtio_mem_send_request(vm, &req)) { + case VIRTIO_MEM_RESP_ACK: + vm->plugged_size += size; + return 0; + case VIRTIO_MEM_RESP_NACK: + rc = -EAGAIN; + break; + case VIRTIO_MEM_RESP_BUSY: + rc = -ETXTBSY; + break; + case VIRTIO_MEM_RESP_ERROR: + rc = -EINVAL; + break; + default: + break; + } + + dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc); + return rc; +} + +static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr, + uint64_t size) +{ + const uint64_t nb_vm_blocks = size / vm->device_block_size; + const struct virtio_mem_req req = { + .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG), + .u.unplug.addr = 
cpu_to_virtio64(vm->vdev, addr), + .u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), + }; + int rc = -ENOMEM; + + if (atomic_read(&vm->config_changed)) + return -EAGAIN; + + dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr, + addr + size - 1); + + switch (virtio_mem_send_request(vm, &req)) { + case VIRTIO_MEM_RESP_ACK: + vm->plugged_size -= size; + return 0; + case VIRTIO_MEM_RESP_BUSY: + rc = -ETXTBSY; + break; + case VIRTIO_MEM_RESP_ERROR: + rc = -EINVAL; + break; + default: + break; + } + + dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc); + return rc; +} + +static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm) +{ + const struct virtio_mem_req req = { + .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL), + }; + int rc = -ENOMEM; + + dev_dbg(&vm->vdev->dev, "unplugging all memory"); + + switch (virtio_mem_send_request(vm, &req)) { + case VIRTIO_MEM_RESP_ACK: + vm->unplug_all_required = false; + vm->plugged_size = 0; + /* usable region might have shrunk */ + atomic_set(&vm->config_changed, 1); + return 0; + case VIRTIO_MEM_RESP_BUSY: + rc = -ETXTBSY; + break; + default: + break; + } + + dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc); + return rc; +} + +/* + * Plug selected subblocks. Updates the plugged state, but not the state + * of the memory block. + */ +static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id, + int sb_id, int count) +{ + const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + + sb_id * vm->sbm.sb_size; + const uint64_t size = count * vm->sbm.sb_size; + int rc; + + rc = virtio_mem_send_plug_request(vm, addr, size); + if (!rc) + virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count); + return rc; +} + +/* + * Unplug selected subblocks. Updates the plugged state, but not the state + * of the memory block. + */ +static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, + int sb_id, int count) +{ + const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + + sb_id * vm->sbm.sb_size; + const uint64_t size = count * vm->sbm.sb_size; + int rc; + + rc = virtio_mem_send_unplug_request(vm, addr, size); + if (!rc) + virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count); + return rc; +} + +/* + * Request to unplug a big block. + * + * Will not modify the state of the big block. + */ +static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id) +{ + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); + const uint64_t size = vm->bbm.bb_size; + + return virtio_mem_send_unplug_request(vm, addr, size); +} + +/* + * Request to plug a big block. + * + * Will not modify the state of the big block. + */ +static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id) +{ + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); + const uint64_t size = vm->bbm.bb_size; + + return virtio_mem_send_plug_request(vm, addr, size); +} + +/* + * Unplug the desired number of plugged subblocks of a offline or not-added + * memory block. Will fail if any subblock cannot get unplugged (instead of + * skipping it). + * + * Will not modify the state of the memory block. + * + * Note: can fail after some subblocks were unplugged. 
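The plug/unplug request helpers above all fold the device's response into an errno-style return code, which the main loop then uses to decide between retrying immediately, retrying via the timer, or giving up. A compact sketch of that mapping; the RESP_* names are local stand-ins, not the UAPI constants:

#include <errno.h>
#include <stdio.h>

enum { RESP_ACK, RESP_NACK, RESP_BUSY, RESP_ERROR };

static int resp_to_errno(int resp)
{
	switch (resp) {
	case RESP_ACK:   return 0;
	case RESP_NACK:  return -EAGAIN;   /* e.g. requested size changed, retry */
	case RESP_BUSY:  return -ETXTBSY;  /* device busy, retried via timer */
	case RESP_ERROR: return -EINVAL;
	default:         return -ENOMEM;   /* unexpected response, treat as fatal */
	}
}

int main(void)
{
	printf("BUSY -> %d\n", resp_to_errno(RESP_BUSY));
	return 0;
}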
+ */ +static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm, + unsigned long mb_id, uint64_t *nb_sb) +{ + int sb_id, count; + int rc; + + sb_id = vm->sbm.sbs_per_mb - 1; + while (*nb_sb) { + /* Find the next candidate subblock */ + while (sb_id >= 0 && + virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1)) + sb_id--; + if (sb_id < 0) + break; + /* Try to unplug multiple subblocks at a time */ + count = 1; + while (count < *nb_sb && sb_id > 0 && + virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) { + count++; + sb_id--; + } + + rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); + if (rc) + return rc; + *nb_sb -= count; + sb_id--; + } + + return 0; +} + +/* + * Unplug all plugged subblocks of an offline or not-added memory block. + * + * Will not modify the state of the memory block. + * + * Note: can fail after some subblocks were unplugged. + */ +static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id) +{ + uint64_t nb_sb = vm->sbm.sbs_per_mb; + + return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb); +} + +/* + * Prepare tracking data for the next memory block. + */ +static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm, + unsigned long *mb_id) +{ + int rc; + + if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id) + return -ENOSPC; + + /* Resize the state array if required. */ + rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm); + if (rc) + return rc; + + /* Resize the subblock bitmap if required. */ + rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm); + if (rc) + return rc; + + vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++; + *mb_id = vm->sbm.next_mb_id++; + return 0; +} + +/* + * Try to plug the desired number of subblocks and add the memory block + * to Linux. + * + * Will modify the state of the memory block. + */ +static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm, + unsigned long mb_id, uint64_t *nb_sb) +{ + const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb); + int rc; + + if (WARN_ON_ONCE(!count)) + return -EINVAL; + + /* + * Plug the requested number of subblocks before adding it to linux, + * so that onlining will directly online all plugged subblocks. + */ + rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count); + if (rc) + return rc; + + /* + * Mark the block properly offline before adding it to Linux, + * so the memory notifiers will find the block in the right state. + */ + if (count == vm->sbm.sbs_per_mb) + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE); + else + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); + + /* Add the memory block to linux - if that fails, try to unplug. */ + rc = virtio_mem_sbm_add_mb(vm, mb_id); + if (rc) { + int new_state = VIRTIO_MEM_SBM_MB_UNUSED; + + if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count)) + new_state = VIRTIO_MEM_SBM_MB_PLUGGED; + virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); + return rc; + } + + *nb_sb -= count; + return 0; +} + +/* + * Try to plug the desired number of subblocks of a memory block that + * is already added to Linux. + * + * Will modify the state of the memory block. + * + * Note: Can fail after some subblocks were successfully plugged. 
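virtio_mem_sbm_unplug_any_sb_raw() above scans the subblocks from the highest id downwards, skips the ones that are already unplugged, and unplugs contiguous plugged runs in a single request until the budget (*nb_sb) is exhausted. A userspace model of that batching over a plain bool array:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SBS_PER_MB 16

static bool plugged[SBS_PER_MB];

static void unplug_range(int sb_id, int count)
{
	printf("unplug sb %d..%d\n", sb_id, sb_id + count - 1);
	for (int i = 0; i < count; i++)
		plugged[sb_id + i] = false;
}

static void unplug_any_sb(uint64_t *nb_sb)
{
	int sb_id = SBS_PER_MB - 1;

	while (*nb_sb) {
		/* find the next plugged candidate from the top */
		while (sb_id >= 0 && !plugged[sb_id])
			sb_id--;
		if (sb_id < 0)
			break;
		/* extend the batch downwards while plugged and within budget */
		int count = 1;
		while ((uint64_t)count < *nb_sb && sb_id > 0 && plugged[sb_id - 1]) {
			count++;
			sb_id--;
		}
		unplug_range(sb_id, count);
		*nb_sb -= count;
		sb_id--;
	}
}

int main(void)
{
	for (int i = 4; i < 12; i++)    /* subblocks 4..11 start out plugged */
		plugged[i] = true;

	uint64_t budget = 5;
	unplug_any_sb(&budget);         /* unplugs 7..11 in one batch */
	assert(budget == 0 && !plugged[11] && plugged[6]);
	return 0;
}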
+ */ +static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, + unsigned long mb_id, uint64_t *nb_sb) +{ + const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); + unsigned long pfn, nr_pages; + int sb_id, count; + int rc; + + if (WARN_ON_ONCE(!*nb_sb)) + return -EINVAL; + + while (*nb_sb) { + sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id); + if (sb_id >= vm->sbm.sbs_per_mb) + break; + count = 1; + while (count < *nb_sb && + sb_id + count < vm->sbm.sbs_per_mb && + !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1)) + count++; + + rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count); + if (rc) + return rc; + *nb_sb -= count; + if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) + continue; + + /* fake-online the pages if the memory block is online */ + pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + + sb_id * vm->sbm.sb_size); + nr_pages = PFN_DOWN(count * vm->sbm.sb_size); + virtio_mem_fake_online(pfn, nr_pages); + } + + if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) + virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1); + + return 0; +} + +static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) +{ + const int mb_states[] = { + VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, + VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, + }; + uint64_t nb_sb = diff / vm->sbm.sb_size; + unsigned long mb_id; + int rc, i; + + if (!nb_sb) + return 0; + + /* Don't race with onlining/offlining */ + mutex_lock(&vm->hotplug_mutex); + + for (i = 0; i < ARRAY_SIZE(mb_states); i++) { + virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) { + rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb); + if (rc || !nb_sb) + goto out_unlock; + cond_resched(); + } + } + + /* + * We won't be working on online/offline memory blocks from this point, + * so we can't race with memory onlining/offlining. Drop the mutex. + */ + mutex_unlock(&vm->hotplug_mutex); + + /* Try to plug and add unused blocks */ + virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) { + if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) + return -ENOSPC; + + rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); + if (rc || !nb_sb) + return rc; + cond_resched(); + } + + /* Try to prepare, plug and add new blocks */ + while (nb_sb) { + if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) + return -ENOSPC; + + rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id); + if (rc) + return rc; + rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); + if (rc) + return rc; + cond_resched(); + } + + return 0; +out_unlock: + mutex_unlock(&vm->hotplug_mutex); + return rc; +} + +/* + * Plug a big block and add it to Linux. + * + * Will modify the state of the big block. + */ +static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm, + unsigned long bb_id) +{ + int rc; + + if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != + VIRTIO_MEM_BBM_BB_UNUSED)) + return -EINVAL; + + rc = virtio_mem_bbm_plug_bb(vm, bb_id); + if (rc) + return rc; + virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); + + rc = virtio_mem_bbm_add_bb(vm, bb_id); + if (rc) { + if (!virtio_mem_bbm_unplug_bb(vm, bb_id)) + virtio_mem_bbm_set_bb_state(vm, bb_id, + VIRTIO_MEM_BBM_BB_UNUSED); + else + /* Retry from the main loop. */ + virtio_mem_bbm_set_bb_state(vm, bb_id, + VIRTIO_MEM_BBM_BB_PLUGGED); + return rc; + } + return 0; +} + +/* + * Prepare tracking data for the next big block. 
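+ *
+ * Mirrors virtio_mem_sbm_prepare_next_mb(): grow the state array if
+ * required and hand out the next usable big block id.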
+ */ +static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm, + unsigned long *bb_id) +{ + int rc; + + if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id) + return -ENOSPC; + + /* Resize the big block state array if required. */ + rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm); + if (rc) + return rc; + + vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++; + *bb_id = vm->bbm.next_bb_id; + vm->bbm.next_bb_id++; + return 0; +} + +static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff) +{ + uint64_t nb_bb = diff / vm->bbm.bb_size; + unsigned long bb_id; + int rc; + + if (!nb_bb) + return 0; + + /* Try to plug and add unused big blocks */ + virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) { + if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) + return -ENOSPC; + + rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); + if (!rc) + nb_bb--; + if (rc || !nb_bb) + return rc; + cond_resched(); + } + + /* Try to prepare, plug and add new big blocks */ + while (nb_bb) { + if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) + return -ENOSPC; + + rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id); + if (rc) + return rc; + rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); + if (!rc) + nb_bb--; + if (rc) + return rc; + cond_resched(); + } + + return 0; +} + +/* + * Try to plug the requested amount of memory. + */ +static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) +{ + if (vm->in_sbm) + return virtio_mem_sbm_plug_request(vm, diff); + return virtio_mem_bbm_plug_request(vm, diff); +} + +/* + * Unplug the desired number of plugged subblocks of an offline memory block. + * Will fail if any subblock cannot get unplugged (instead of skipping it). + * + * Will modify the state of the memory block. Might temporarily drop the + * hotplug_mutex. + * + * Note: Can fail after some subblocks were successfully unplugged. + */ +static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm, + unsigned long mb_id, + uint64_t *nb_sb) +{ + int rc; + + rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb); + + /* some subblocks might have been unplugged even on failure */ + if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); + if (rc) + return rc; + + if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { + /* + * Remove the block from Linux - this should never fail. + * Hinder the block from getting onlined by marking it + * unplugged. Temporarily drop the mutex, so + * any pending GOING_ONLINE requests can be serviced/rejected. + */ + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_UNUSED); + + mutex_unlock(&vm->hotplug_mutex); + rc = virtio_mem_sbm_remove_mb(vm, mb_id); + BUG_ON(rc); + mutex_lock(&vm->hotplug_mutex); + } + return 0; +} + +/* + * Unplug the given plugged subblocks of an online memory block. + * + * Will modify the state of the memory block. 
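+ *
+ * The pages are fake-offlined first; only if that succeeds is the unplug
+ * request sent to the device. If the device refuses, the pages are
+ * fake-onlined again, i.e., returned to the page allocator.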
+ */ +static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) +{ + const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count; + const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); + unsigned long start_pfn; + int rc; + + start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + + sb_id * vm->sbm.sb_size); + + rc = virtio_mem_fake_offline(vm, start_pfn, nr_pages); + if (rc) + return rc; + + /* Try to unplug the allocated memory */ + rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); + if (rc) { + /* Return the memory to the buddy. */ + virtio_mem_fake_online(start_pfn, nr_pages); + return rc; + } + + switch (old_state) { + case VIRTIO_MEM_SBM_MB_KERNEL: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL); + break; + case VIRTIO_MEM_SBM_MB_MOVABLE: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL); + break; + } + + return 0; +} + +/* + * Unplug the desired number of plugged subblocks of an online memory block. + * Will skip subblock that are busy. + * + * Will modify the state of the memory block. Might temporarily drop the + * hotplug_mutex. + * + * Note: Can fail after some subblocks were successfully unplugged. Can + * return 0 even if subblocks were busy and could not get unplugged. + */ +static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm, + unsigned long mb_id, + uint64_t *nb_sb) +{ + int rc, sb_id; + + /* If possible, try to unplug the complete block in one shot. */ + if (*nb_sb >= vm->sbm.sbs_per_mb && + virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { + rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0, + vm->sbm.sbs_per_mb); + if (!rc) { + *nb_sb -= vm->sbm.sbs_per_mb; + goto unplugged; + } else if (rc != -EBUSY) + return rc; + } + + /* Fallback to single subblocks. */ + for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) { + /* Find the next candidate subblock */ + while (sb_id >= 0 && + !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) + sb_id--; + if (sb_id < 0) + break; + + rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1); + if (rc == -EBUSY) + continue; + else if (rc) + return rc; + *nb_sb -= 1; + } + +unplugged: + rc = virtio_mem_sbm_try_remove_unplugged_mb(vm, mb_id); + if (rc) + vm->sbm.have_unplugged_mb = 1; + /* Ignore errors, this is not critical. We'll retry later. */ + return 0; +} + +/* + * Unplug the desired number of plugged subblocks of a memory block that is + * already added to Linux. Will skip subblock of online memory blocks that are + * busy (by the OS). Will fail if any subblock that's not busy cannot get + * unplugged. + * + * Will modify the state of the memory block. Might temporarily drop the + * hotplug_mutex. + * + * Note: Can fail after some subblocks were successfully unplugged. Can + * return 0 even if subblocks were busy and could not get unplugged. 
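+ *
+ * Dispatches to the online or offline variant based on the current
+ * memory block state.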
+ */ +static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm, + unsigned long mb_id, + uint64_t *nb_sb) +{ + const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); + + switch (old_state) { + case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: + case VIRTIO_MEM_SBM_MB_KERNEL: + case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: + case VIRTIO_MEM_SBM_MB_MOVABLE: + return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb); + case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: + case VIRTIO_MEM_SBM_MB_OFFLINE: + return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb); + } + return -EINVAL; +} + +static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff) +{ + const int mb_states[] = { + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, + VIRTIO_MEM_SBM_MB_OFFLINE, + VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, + VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, + VIRTIO_MEM_SBM_MB_MOVABLE, + VIRTIO_MEM_SBM_MB_KERNEL, + }; + uint64_t nb_sb = diff / vm->sbm.sb_size; + unsigned long mb_id; + int rc, i; + + if (!nb_sb) + return 0; + + /* + * We'll drop the mutex a couple of times when it is safe to do so. + * This might result in some blocks switching the state (online/offline) + * and we could miss them in this run - we will retry again later. + */ + mutex_lock(&vm->hotplug_mutex); + + /* + * We try unplug from partially plugged blocks first, to try removing + * whole memory blocks along with metadata. We prioritize ZONE_MOVABLE + * as it's more reliable to unplug memory and remove whole memory + * blocks, and we don't want to trigger a zone imbalances by + * accidentially removing too much kernel memory. + */ + for (i = 0; i < ARRAY_SIZE(mb_states); i++) { + virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) { + rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb); + if (rc || !nb_sb) + goto out_unlock; + mutex_unlock(&vm->hotplug_mutex); + cond_resched(); + mutex_lock(&vm->hotplug_mutex); + } + if (!unplug_online && i == 1) { + mutex_unlock(&vm->hotplug_mutex); + return 0; + } + } + + mutex_unlock(&vm->hotplug_mutex); + return nb_sb ? -EBUSY : 0; +out_unlock: + mutex_unlock(&vm->hotplug_mutex); + return rc; +} + +/* + * Try to offline and remove a big block from Linux and unplug it. Will fail + * with -EBUSY if some memory is busy and cannot get unplugged. + * + * Will modify the state of the memory block. Might temporarily drop the + * hotplug_mutex. + */ +static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm, + unsigned long bb_id) +{ + const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); + const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); + unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn; + struct page *page; + int rc; + + if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != + VIRTIO_MEM_BBM_BB_ADDED)) + return -EINVAL; + + /* + * Start by fake-offlining all memory. Once we marked the device + * block as fake-offline, all newly onlined memory will + * automatically be kept fake-offline. Protect from concurrent + * onlining/offlining until we have a consistent state. 
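+ *
+ * If fake-offlining any section fails, all sections processed so far are
+ * fake-onlined again and the big block is moved back to the ADDED state.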
+ */ + mutex_lock(&vm->hotplug_mutex); + virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_FAKE_OFFLINE); + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + page = pfn_to_online_page(pfn); + if (!page) + continue; + + rc = virtio_mem_fake_offline(vm, pfn, PAGES_PER_SECTION); + if (rc) { + end_pfn = pfn; + goto rollback; + } + } + mutex_unlock(&vm->hotplug_mutex); + + rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id); + if (rc) { + mutex_lock(&vm->hotplug_mutex); + goto rollback; + } + + rc = virtio_mem_bbm_unplug_bb(vm, bb_id); + if (rc) + virtio_mem_bbm_set_bb_state(vm, bb_id, + VIRTIO_MEM_BBM_BB_PLUGGED); + else + virtio_mem_bbm_set_bb_state(vm, bb_id, + VIRTIO_MEM_BBM_BB_UNUSED); + return rc; + +rollback: + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + page = pfn_to_online_page(pfn); + if (!page) + continue; + virtio_mem_fake_online(pfn, PAGES_PER_SECTION); + } + virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); + mutex_unlock(&vm->hotplug_mutex); + return rc; +} + +/* + * Test if a big block is completely offline. + */ +static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm, + unsigned long bb_id) +{ + const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); + const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); + unsigned long pfn; + + for (pfn = start_pfn; pfn < start_pfn + nr_pages; + pfn += PAGES_PER_SECTION) { + if (pfn_to_online_page(pfn)) + return false; + } + + return true; +} + +/* + * Test if a big block is completely onlined to ZONE_MOVABLE (or offline). + */ +static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm, + unsigned long bb_id) +{ + const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); + const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); + struct page *page; + unsigned long pfn; + + for (pfn = start_pfn; pfn < start_pfn + nr_pages; + pfn += PAGES_PER_SECTION) { + page = pfn_to_online_page(pfn); + if (!page) + continue; + if (page_zonenum(page) != ZONE_MOVABLE) + return false; + } + + return true; +} + +static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff) +{ + uint64_t nb_bb = diff / vm->bbm.bb_size; + uint64_t bb_id; + int rc, i; + + if (!nb_bb) + return 0; + + /* + * Try to unplug big blocks. Similar to SBM, start with offline + * big blocks. + */ + for (i = 0; i < 3; i++) { + virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { + cond_resched(); + + /* + * As we're holding no locks, these checks are racy, + * but we don't care. + */ + if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id)) + continue; + if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id)) + continue; + rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id); + if (rc == -EBUSY) + continue; + if (!rc) + nb_bb--; + if (rc || !nb_bb) + return rc; + } + if (i == 0 && !unplug_online) + return 0; + } + + return nb_bb ? -EBUSY : 0; +} + +/* + * Try to unplug the requested amount of memory. + */ +static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) +{ + if (vm->in_sbm) + return virtio_mem_sbm_unplug_request(vm, diff); + return virtio_mem_bbm_unplug_request(vm, diff); +} + +/* + * Try to unplug all blocks that couldn't be unplugged before, for example, + * because the hypervisor was busy. Further, offline and remove any memory + * blocks where we previously failed. 
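+ *
+ * Failures while retrying to remove unplugged memory blocks are not fatal:
+ * sbm.have_unplugged_mb remains set and a later workqueue run retries.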
+ */ +static int virtio_mem_cleanup_pending_mb(struct virtio_mem *vm) +{ + unsigned long id; + int rc = 0; + + if (!vm->in_sbm) { + virtio_mem_bbm_for_each_bb(vm, id, + VIRTIO_MEM_BBM_BB_PLUGGED) { + rc = virtio_mem_bbm_unplug_bb(vm, id); + if (rc) + return rc; + virtio_mem_bbm_set_bb_state(vm, id, + VIRTIO_MEM_BBM_BB_UNUSED); + } + return 0; + } + + virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) { + rc = virtio_mem_sbm_unplug_mb(vm, id); + if (rc) + return rc; + virtio_mem_sbm_set_mb_state(vm, id, + VIRTIO_MEM_SBM_MB_UNUSED); + } + + if (!vm->sbm.have_unplugged_mb) + return 0; + + /* + * Let's retry (offlining and) removing completely unplugged Linux + * memory blocks. + */ + vm->sbm.have_unplugged_mb = false; + + mutex_lock(&vm->hotplug_mutex); + virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL) + rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id); + virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL) + rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id); + virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) + rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id); + mutex_unlock(&vm->hotplug_mutex); + + if (rc) + vm->sbm.have_unplugged_mb = true; + /* Ignore errors, this is not critical. We'll retry later. */ + return 0; +} + +/* + * Update all parts of the config that could have changed. + */ +static void virtio_mem_refresh_config(struct virtio_mem *vm) +{ + const struct range pluggable_range = mhp_get_pluggable_range(true); + uint64_t new_plugged_size, usable_region_size, end_addr; + + /* the plugged_size is just a reflection of what _we_ did previously */ + virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size, + &new_plugged_size); + if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size)) + vm->plugged_size = new_plugged_size; + + /* calculate the last usable memory block id */ + virtio_cread_le(vm->vdev, struct virtio_mem_config, + usable_region_size, &usable_region_size); + end_addr = min(vm->addr + usable_region_size - 1, + pluggable_range.end); + + if (vm->in_sbm) { + vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr); + if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes())) + vm->sbm.last_usable_mb_id--; + } else { + vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm, + end_addr); + if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size)) + vm->bbm.last_usable_bb_id--; + } + /* + * If we cannot plug any of our device memory (e.g., nothing in the + * usable region is addressable), the last usable memory block id will + * be smaller than the first usable memory block id. We'll stop + * attempting to add memory with -ENOSPC from our main loop. + */ + + /* see if there is a request to change the size */ + virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size, + &vm->requested_size); + + dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size); + dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size); +} + +/* + * Workqueue function for handling plug/unplug requests and config updates. 
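+ *
+ * Rough flow: process a pending unplug-all request, refresh the config if
+ * it changed, clean up leftovers from previous runs and then plug or
+ * unplug memory until plugged_size matches requested_size. Transient
+ * errors (-EBUSY, -ETXTBSY, -ENOMEM) arm the retry timer, whose timeout
+ * doubles up to VIRTIO_MEM_RETRY_TIMER_MAX_MS; -EAGAIN retries right away.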
+ */ +static void virtio_mem_run_wq(struct work_struct *work) +{ + struct virtio_mem *vm = container_of(work, struct virtio_mem, wq); + uint64_t diff; + int rc; + + if (unlikely(vm->in_kdump)) { + dev_warn_once(&vm->vdev->dev, + "unexpected workqueue run in kdump kernel\n"); + return; + } + + hrtimer_cancel(&vm->retry_timer); + + if (vm->broken) + return; + + atomic_set(&vm->wq_active, 1); +retry: + rc = 0; + + /* Make sure we start with a clean state if there are leftovers. */ + if (unlikely(vm->unplug_all_required)) + rc = virtio_mem_send_unplug_all_request(vm); + + if (atomic_read(&vm->config_changed)) { + atomic_set(&vm->config_changed, 0); + virtio_mem_refresh_config(vm); + } + + /* Cleanup any leftovers from previous runs */ + if (!rc) + rc = virtio_mem_cleanup_pending_mb(vm); + + if (!rc && vm->requested_size != vm->plugged_size) { + if (vm->requested_size > vm->plugged_size) { + diff = vm->requested_size - vm->plugged_size; + rc = virtio_mem_plug_request(vm, diff); + } else { + diff = vm->plugged_size - vm->requested_size; + rc = virtio_mem_unplug_request(vm, diff); + } + } + + /* + * Keep retrying to offline and remove completely unplugged Linux + * memory blocks. + */ + if (!rc && vm->in_sbm && vm->sbm.have_unplugged_mb) + rc = -EBUSY; + + switch (rc) { + case 0: + vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS; + break; + case -ENOSPC: + /* + * We cannot add any more memory (alignment, physical limit) + * or we have too many offline memory blocks. + */ + break; + case -ETXTBSY: + /* + * The hypervisor cannot process our request right now + * (e.g., out of memory, migrating); + */ + case -EBUSY: + /* + * We cannot free up any memory to unplug it (all plugged memory + * is busy). + */ + case -ENOMEM: + /* Out of memory, try again later. */ + hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms), + HRTIMER_MODE_REL); + break; + case -EAGAIN: + /* Retry immediately (e.g., the config changed). 
*/ + goto retry; + default: + /* Unknown error, mark as broken */ + dev_err(&vm->vdev->dev, + "unknown error, marking device broken: %d\n", rc); + vm->broken = true; + } + + atomic_set(&vm->wq_active, 0); +} + +static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer) +{ + struct virtio_mem *vm = container_of(timer, struct virtio_mem, + retry_timer); + + virtio_mem_retry(vm); + vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2, + VIRTIO_MEM_RETRY_TIMER_MAX_MS); + return HRTIMER_NORESTART; +} + +static void virtio_mem_handle_response(struct virtqueue *vq) +{ + struct virtio_mem *vm = vq->vdev->priv; + + wake_up(&vm->host_resp); +} + +static int virtio_mem_init_vq(struct virtio_mem *vm) +{ + struct virtqueue *vq; + + vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response, + "guest-request"); + if (IS_ERR(vq)) + return PTR_ERR(vq); + vm->vq = vq; + + return 0; +} + +static int virtio_mem_init_hotplug(struct virtio_mem *vm) +{ + const struct range pluggable_range = mhp_get_pluggable_range(true); + uint64_t unit_pages, sb_size, addr; + int rc; + + /* bad device setup - warn only */ + if (!IS_ALIGNED(vm->addr, memory_block_size_bytes())) + dev_warn(&vm->vdev->dev, + "The alignment of the physical start address can make some memory unusable.\n"); + if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes())) + dev_warn(&vm->vdev->dev, + "The alignment of the physical end address can make some memory unusable.\n"); + if (vm->addr < pluggable_range.start || + vm->addr + vm->region_size - 1 > pluggable_range.end) + dev_warn(&vm->vdev->dev, + "Some device memory is not addressable/pluggable. This can make some memory unusable.\n"); + + /* Prepare the offline threshold - make sure we can add two blocks. */ + vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(), + VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD); + + /* + * alloc_contig_range() works reliably with pageblock + * granularity on ZONE_NORMAL, use pageblock_nr_pages. + */ + sb_size = PAGE_SIZE * pageblock_nr_pages; + sb_size = max_t(uint64_t, vm->device_block_size, sb_size); + + if (sb_size < memory_block_size_bytes() && !force_bbm) { + /* SBM: At least two subblocks per Linux memory block. */ + vm->in_sbm = true; + vm->sbm.sb_size = sb_size; + vm->sbm.sbs_per_mb = memory_block_size_bytes() / + vm->sbm.sb_size; + + /* Round up to the next full memory block */ + addr = max_t(uint64_t, vm->addr, pluggable_range.start) + + memory_block_size_bytes() - 1; + vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr); + vm->sbm.next_mb_id = vm->sbm.first_mb_id; + } else { + /* BBM: At least one Linux memory block. */ + vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size, + memory_block_size_bytes()); + + if (bbm_block_size) { + if (!is_power_of_2(bbm_block_size)) { + dev_warn(&vm->vdev->dev, + "bbm_block_size is not a power of 2"); + } else if (bbm_block_size < vm->bbm.bb_size) { + dev_warn(&vm->vdev->dev, + "bbm_block_size is too small"); + } else { + vm->bbm.bb_size = bbm_block_size; + } + } + + /* Round up to the next aligned big block */ + addr = max_t(uint64_t, vm->addr, pluggable_range.start) + + vm->bbm.bb_size - 1; + vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr); + vm->bbm.next_bb_id = vm->bbm.first_bb_id; + + /* Make sure we can add two big blocks. 
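+ * With a 2 GiB big block size, for example, the offline threshold grows
+ * to at least 4 GiB (illustrative sizes).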
*/ + vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size, + vm->offline_threshold); + } + + dev_info(&vm->vdev->dev, "memory block size: 0x%lx", + memory_block_size_bytes()); + if (vm->in_sbm) + dev_info(&vm->vdev->dev, "subblock size: 0x%llx", + (unsigned long long)vm->sbm.sb_size); + else + dev_info(&vm->vdev->dev, "big block size: 0x%llx", + (unsigned long long)vm->bbm.bb_size); + + /* create the parent resource for all memory */ + rc = virtio_mem_create_resource(vm); + if (rc) + return rc; + + /* use a single dynamic memory group to cover the whole memory device */ + if (vm->in_sbm) + unit_pages = PHYS_PFN(memory_block_size_bytes()); + else + unit_pages = PHYS_PFN(vm->bbm.bb_size); + rc = memory_group_register_dynamic(vm->nid, unit_pages); + if (rc < 0) + goto out_del_resource; + vm->mgid = rc; + + /* + * If we still have memory plugged, we have to unplug all memory first. + * Registering our parent resource makes sure that this memory isn't + * actually in use (e.g., trying to reload the driver). + */ + if (vm->plugged_size) { + vm->unplug_all_required = true; + dev_info(&vm->vdev->dev, "unplugging all memory is required\n"); + } + + /* register callbacks */ + vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb; + rc = register_memory_notifier(&vm->memory_notifier); + if (rc) + goto out_unreg_group; + rc = register_virtio_mem_device(vm); + if (rc) + goto out_unreg_mem; + + return 0; +out_unreg_mem: + unregister_memory_notifier(&vm->memory_notifier); +out_unreg_group: + memory_group_unregister(vm->mgid); +out_del_resource: + virtio_mem_delete_resource(vm); + return rc; +} + +#ifdef CONFIG_PROC_VMCORE +static int virtio_mem_send_state_request(struct virtio_mem *vm, uint64_t addr, + uint64_t size) +{ + const uint64_t nb_vm_blocks = size / vm->device_block_size; + const struct virtio_mem_req req = { + .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_STATE), + .u.state.addr = cpu_to_virtio64(vm->vdev, addr), + .u.state.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), + }; + int rc = -ENOMEM; + + dev_dbg(&vm->vdev->dev, "requesting state: 0x%llx - 0x%llx\n", addr, + addr + size - 1); + + switch (virtio_mem_send_request(vm, &req)) { + case VIRTIO_MEM_RESP_ACK: + return virtio16_to_cpu(vm->vdev, vm->resp.u.state.state); + case VIRTIO_MEM_RESP_ERROR: + rc = -EINVAL; + break; + default: + break; + } + + dev_dbg(&vm->vdev->dev, "requesting state failed: %d\n", rc); + return rc; +} + +static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb, + unsigned long pfn) +{ + struct virtio_mem *vm = container_of(cb, struct virtio_mem, + vmcore_cb); + uint64_t addr = PFN_PHYS(pfn); + bool is_ram; + int rc; + + if (!virtio_mem_contains_range(vm, addr, PAGE_SIZE)) + return true; + if (!vm->plugged_size) + return false; + + /* + * We have to serialize device requests and access to the information + * about the block queried last. + */ + mutex_lock(&vm->hotplug_mutex); + + addr = ALIGN_DOWN(addr, vm->device_block_size); + if (addr != vm->last_block_addr) { + rc = virtio_mem_send_state_request(vm, addr, + vm->device_block_size); + /* On any kind of error, we're going to signal !ram. 
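+ * The queried device block is cached in last_block_addr /
+ * last_block_plugged, so consecutive PFNs within the same device block
+ * only trigger a single state request.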
*/ + if (rc == VIRTIO_MEM_STATE_PLUGGED) + vm->last_block_plugged = true; + else + vm->last_block_plugged = false; + vm->last_block_addr = addr; + } + + is_ram = vm->last_block_plugged; + mutex_unlock(&vm->hotplug_mutex); + return is_ram; +} +#endif /* CONFIG_PROC_VMCORE */ + +static int virtio_mem_init_kdump(struct virtio_mem *vm) +{ +#ifdef CONFIG_PROC_VMCORE + dev_info(&vm->vdev->dev, "memory hot(un)plug disabled in kdump kernel\n"); + vm->vmcore_cb.pfn_is_ram = virtio_mem_vmcore_pfn_is_ram; + register_vmcore_cb(&vm->vmcore_cb); + return 0; +#else /* CONFIG_PROC_VMCORE */ + dev_warn(&vm->vdev->dev, "disabled in kdump kernel without vmcore\n"); + return -EBUSY; +#endif /* CONFIG_PROC_VMCORE */ +} + +static int virtio_mem_init(struct virtio_mem *vm) +{ + uint16_t node_id; + + if (!vm->vdev->config->get) { + dev_err(&vm->vdev->dev, "config access disabled\n"); + return -EINVAL; + } + + /* Fetch all properties that can't change. */ + virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size, + &vm->plugged_size); + virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size, + &vm->device_block_size); + virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id, + &node_id); + vm->nid = virtio_mem_translate_node_id(vm, node_id); + virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr); + virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size, + &vm->region_size); + + /* Determine the nid for the device based on the lowest address. */ + if (vm->nid == NUMA_NO_NODE) + vm->nid = memory_add_physaddr_to_nid(vm->addr); + + dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr); + dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size); + dev_info(&vm->vdev->dev, "device block size: 0x%llx", + (unsigned long long)vm->device_block_size); + if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA)) + dev_info(&vm->vdev->dev, "nid: %d", vm->nid); + + /* + * We don't want to (un)plug or reuse any memory when in kdump. The + * memory is still accessible (but not exposed to Linux). + */ + if (vm->in_kdump) + return virtio_mem_init_kdump(vm); + return virtio_mem_init_hotplug(vm); +} + +static int virtio_mem_create_resource(struct virtio_mem *vm) +{ + /* + * When force-unloading the driver and removing the device, we + * could have a garbage pointer. Duplicate the string. + */ + const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL); + + if (!name) + return -ENOMEM; + + /* Disallow mapping device memory via /dev/mem completely. */ + vm->parent_resource = __request_mem_region(vm->addr, vm->region_size, + name, IORESOURCE_SYSTEM_RAM | + IORESOURCE_EXCLUSIVE); + if (!vm->parent_resource) { + kfree(name); + dev_warn(&vm->vdev->dev, "could not reserve device region\n"); + dev_info(&vm->vdev->dev, + "reloading the driver is not supported\n"); + return -EBUSY; + } + + /* The memory is not actually busy - make add_memory() work. 
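+ * Clearing IORESOURCE_BUSY only affects the parent resource; memory
+ * added later is expected to get its own (busy) child resources.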
*/ + vm->parent_resource->flags &= ~IORESOURCE_BUSY; + return 0; +} + +static void virtio_mem_delete_resource(struct virtio_mem *vm) +{ + const char *name; + + if (!vm->parent_resource) + return; + + name = vm->parent_resource->name; + release_resource(vm->parent_resource); + kfree(vm->parent_resource); + kfree(name); + vm->parent_resource = NULL; +} + +static int virtio_mem_range_has_system_ram(struct resource *res, void *arg) +{ + return 1; +} + +static bool virtio_mem_has_memory_added(struct virtio_mem *vm) +{ + const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + + return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr, + vm->addr + vm->region_size, NULL, + virtio_mem_range_has_system_ram) == 1; +} + +static int virtio_mem_probe(struct virtio_device *vdev) +{ + struct virtio_mem *vm; + int rc; + + BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24); + BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10); + + vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL); + if (!vm) + return -ENOMEM; + + init_waitqueue_head(&vm->host_resp); + vm->vdev = vdev; + INIT_WORK(&vm->wq, virtio_mem_run_wq); + mutex_init(&vm->hotplug_mutex); + INIT_LIST_HEAD(&vm->next); + spin_lock_init(&vm->removal_lock); + hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + vm->retry_timer.function = virtio_mem_timer_expired; + vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS; + vm->in_kdump = is_kdump_kernel(); + + /* register the virtqueue */ + rc = virtio_mem_init_vq(vm); + if (rc) + goto out_free_vm; + + /* initialize the device by querying the config */ + rc = virtio_mem_init(vm); + if (rc) + goto out_del_vq; + + virtio_device_ready(vdev); + + /* trigger a config update to start processing the requested_size */ + if (!vm->in_kdump) { + atomic_set(&vm->config_changed, 1); + queue_work(system_freezable_wq, &vm->wq); + } + + return 0; +out_del_vq: + vdev->config->del_vqs(vdev); +out_free_vm: + kfree(vm); + vdev->priv = NULL; + + return rc; +} + +static void virtio_mem_deinit_hotplug(struct virtio_mem *vm) +{ + unsigned long mb_id; + int rc; + + /* + * Make sure the workqueue won't be triggered anymore and no memory + * blocks can be onlined/offlined until we're finished here. + */ + mutex_lock(&vm->hotplug_mutex); + spin_lock_irq(&vm->removal_lock); + vm->removing = true; + spin_unlock_irq(&vm->removal_lock); + mutex_unlock(&vm->hotplug_mutex); + + /* wait until the workqueue stopped */ + cancel_work_sync(&vm->wq); + hrtimer_cancel(&vm->retry_timer); + + if (vm->in_sbm) { + /* + * After we unregistered our callbacks, user space can online + * partially plugged offline blocks. Make sure to remove them. + */ + virtio_mem_sbm_for_each_mb(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { + rc = virtio_mem_sbm_remove_mb(vm, mb_id); + BUG_ON(rc); + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_UNUSED); + } + /* + * After we unregistered our callbacks, user space can no longer + * offline partially plugged online memory blocks. No need to + * worry about them. + */ + } + + /* unregister callbacks */ + unregister_virtio_mem_device(vm); + unregister_memory_notifier(&vm->memory_notifier); + + /* + * There is no way we could reliably remove all memory we have added to + * the system. And there is no way to stop the driver/device from going + * away. Warn at least. 
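+	 * If memory is still added, the parent resource and the dynamic
+	 * memory group are left in place; otherwise they are torn down below.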
+ */ + if (virtio_mem_has_memory_added(vm)) { + dev_warn(&vm->vdev->dev, + "device still has system memory added\n"); + } else { + virtio_mem_delete_resource(vm); + kfree_const(vm->resource_name); + memory_group_unregister(vm->mgid); + } + + /* remove all tracking data - no locking needed */ + if (vm->in_sbm) { + vfree(vm->sbm.mb_states); + vfree(vm->sbm.sb_states); + } else { + vfree(vm->bbm.bb_states); + } +} + +static void virtio_mem_deinit_kdump(struct virtio_mem *vm) +{ +#ifdef CONFIG_PROC_VMCORE + unregister_vmcore_cb(&vm->vmcore_cb); +#endif /* CONFIG_PROC_VMCORE */ +} + +static void virtio_mem_remove(struct virtio_device *vdev) +{ + struct virtio_mem *vm = vdev->priv; + + if (vm->in_kdump) + virtio_mem_deinit_kdump(vm); + else + virtio_mem_deinit_hotplug(vm); + + /* reset the device and cleanup the queues */ + virtio_reset_device(vdev); + vdev->config->del_vqs(vdev); + + kfree(vm); + vdev->priv = NULL; +} + +static void virtio_mem_config_changed(struct virtio_device *vdev) +{ + struct virtio_mem *vm = vdev->priv; + + if (unlikely(vm->in_kdump)) + return; + + atomic_set(&vm->config_changed, 1); + virtio_mem_retry(vm); +} + +#ifdef CONFIG_PM_SLEEP +static int virtio_mem_freeze(struct virtio_device *vdev) +{ + /* + * When restarting the VM, all memory is usually unplugged. Don't + * allow to suspend/hibernate. + */ + dev_err(&vdev->dev, "save/restore not supported.\n"); + return -EPERM; +} + +static int virtio_mem_restore(struct virtio_device *vdev) +{ + return -EPERM; +} +#endif + +static unsigned int virtio_mem_features[] = { +#if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA) + VIRTIO_MEM_F_ACPI_PXM, +#endif + VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE, +}; + +static const struct virtio_device_id virtio_mem_id_table[] = { + { VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static struct virtio_driver virtio_mem_driver = { + .feature_table = virtio_mem_features, + .feature_table_size = ARRAY_SIZE(virtio_mem_features), + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = virtio_mem_id_table, + .probe = virtio_mem_probe, + .remove = virtio_mem_remove, + .config_changed = virtio_mem_config_changed, +#ifdef CONFIG_PM_SLEEP + .freeze = virtio_mem_freeze, + .restore = virtio_mem_restore, +#endif +}; + +module_virtio_driver(virtio_mem_driver); +MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table); +MODULE_AUTHOR("David Hildenbrand <david@redhat.com>"); +MODULE_DESCRIPTION("Virtio-mem driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c new file mode 100644 index 0000000000..59892a31cf --- /dev/null +++ b/drivers/virtio/virtio_mmio.c @@ -0,0 +1,877 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Virtio memory mapped device driver + * + * Copyright 2011-2014, ARM Ltd. + * + * This module allows virtio devices to be used over a virtual, memory mapped + * platform device. + * + * The guest device(s) may be instantiated in one of three equivalent ways: + * + * 1. Static platform device in board's code, eg.: + * + * static struct platform_device v2m_virtio_device = { + * .name = "virtio-mmio", + * .id = -1, + * .num_resources = 2, + * .resource = (struct resource []) { + * { + * .start = 0x1001e000, + * .end = 0x1001e0ff, + * .flags = IORESOURCE_MEM, + * }, { + * .start = 42 + 32, + * .end = 42 + 32, + * .flags = IORESOURCE_IRQ, + * }, + * } + * }; + * + * 2. Device Tree node, eg.: + * + * virtio_block@1e000 { + * compatible = "virtio,mmio"; + * reg = <0x1e000 0x100>; + * interrupts = <42>; + * } + * + * 3. 
Kernel module (or command line) parameter. Can be used more than once - + * one device will be created for each one. Syntax: + * + * [virtio_mmio.]device=<size>@<baseaddr>:<irq>[:<id>] + * where: + * <size> := size (can use standard suffixes like K, M or G) + * <baseaddr> := physical base address + * <irq> := interrupt number (as passed to request_irq()) + * <id> := (optional) platform device id + * eg.: + * virtio_mmio.device=0x100@0x100b0000:48 \ + * virtio_mmio.device=1K@0x1001e000:74 + * + * Based on Virtio PCI driver by Anthony Liguori, copyright IBM Corp. 2007 + */ + +#define pr_fmt(fmt) "virtio-mmio: " fmt + +#include <linux/acpi.h> +#include <linux/dma-mapping.h> +#include <linux/highmem.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/platform_device.h> +#include <linux/pm.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/virtio.h> +#include <linux/virtio_config.h> +#include <uapi/linux/virtio_mmio.h> +#include <linux/virtio_ring.h> + + + +/* The alignment to use between consumer and producer parts of vring. + * Currently hardcoded to the page size. */ +#define VIRTIO_MMIO_VRING_ALIGN PAGE_SIZE + + + +#define to_virtio_mmio_device(_plat_dev) \ + container_of(_plat_dev, struct virtio_mmio_device, vdev) + +struct virtio_mmio_device { + struct virtio_device vdev; + struct platform_device *pdev; + + void __iomem *base; + unsigned long version; + + /* a list of queues so we can dispatch IRQs */ + spinlock_t lock; + struct list_head virtqueues; +}; + +struct virtio_mmio_vq_info { + /* the actual virtqueue */ + struct virtqueue *vq; + + /* the list node for the virtqueues list */ + struct list_head node; +}; + + + +/* Configuration interface */ + +static u64 vm_get_features(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + u64 features; + + writel(1, vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES_SEL); + features = readl(vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES); + features <<= 32; + + writel(0, vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES_SEL); + features |= readl(vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES); + + return features; +} + +static int vm_finalize_features(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + + /* Give virtio_ring a chance to accept features. 
*/ + vring_transport_features(vdev); + + /* Make sure there are no mixed devices */ + if (vm_dev->version == 2 && + !__virtio_test_bit(vdev, VIRTIO_F_VERSION_1)) { + dev_err(&vdev->dev, "New virtio-mmio devices (version 2) must provide VIRTIO_F_VERSION_1 feature!\n"); + return -EINVAL; + } + + writel(1, vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES_SEL); + writel((u32)(vdev->features >> 32), + vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES); + + writel(0, vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES_SEL); + writel((u32)vdev->features, + vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES); + + return 0; +} + +static void vm_get(struct virtio_device *vdev, unsigned int offset, + void *buf, unsigned int len) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + void __iomem *base = vm_dev->base + VIRTIO_MMIO_CONFIG; + u8 b; + __le16 w; + __le32 l; + + if (vm_dev->version == 1) { + u8 *ptr = buf; + int i; + + for (i = 0; i < len; i++) + ptr[i] = readb(base + offset + i); + return; + } + + switch (len) { + case 1: + b = readb(base + offset); + memcpy(buf, &b, sizeof b); + break; + case 2: + w = cpu_to_le16(readw(base + offset)); + memcpy(buf, &w, sizeof w); + break; + case 4: + l = cpu_to_le32(readl(base + offset)); + memcpy(buf, &l, sizeof l); + break; + case 8: + l = cpu_to_le32(readl(base + offset)); + memcpy(buf, &l, sizeof l); + l = cpu_to_le32(ioread32(base + offset + sizeof l)); + memcpy(buf + sizeof l, &l, sizeof l); + break; + default: + BUG(); + } +} + +static void vm_set(struct virtio_device *vdev, unsigned int offset, + const void *buf, unsigned int len) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + void __iomem *base = vm_dev->base + VIRTIO_MMIO_CONFIG; + u8 b; + __le16 w; + __le32 l; + + if (vm_dev->version == 1) { + const u8 *ptr = buf; + int i; + + for (i = 0; i < len; i++) + writeb(ptr[i], base + offset + i); + + return; + } + + switch (len) { + case 1: + memcpy(&b, buf, sizeof b); + writeb(b, base + offset); + break; + case 2: + memcpy(&w, buf, sizeof w); + writew(le16_to_cpu(w), base + offset); + break; + case 4: + memcpy(&l, buf, sizeof l); + writel(le32_to_cpu(l), base + offset); + break; + case 8: + memcpy(&l, buf, sizeof l); + writel(le32_to_cpu(l), base + offset); + memcpy(&l, buf + sizeof l, sizeof l); + writel(le32_to_cpu(l), base + offset + sizeof l); + break; + default: + BUG(); + } +} + +static u32 vm_generation(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + + if (vm_dev->version == 1) + return 0; + else + return readl(vm_dev->base + VIRTIO_MMIO_CONFIG_GENERATION); +} + +static u8 vm_get_status(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + + return readl(vm_dev->base + VIRTIO_MMIO_STATUS) & 0xff; +} + +static void vm_set_status(struct virtio_device *vdev, u8 status) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + + /* We should never be setting status to 0. */ + BUG_ON(status == 0); + + /* + * Per memory-barriers.txt, wmb() is not needed to guarantee + * that the cache coherent memory writes have completed + * before writing to the MMIO region. + */ + writel(status, vm_dev->base + VIRTIO_MMIO_STATUS); +} + +static void vm_reset(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + + /* 0 status means a reset. 
*/ + writel(0, vm_dev->base + VIRTIO_MMIO_STATUS); +} + + + +/* Transport interface */ + +/* the notify function used when creating a virt queue */ +static bool vm_notify(struct virtqueue *vq) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); + + /* We write the queue's selector into the notification register to + * signal the other end */ + writel(vq->index, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY); + return true; +} + +static bool vm_notify_with_data(struct virtqueue *vq) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); + u32 data = vring_notification_data(vq); + + writel(data, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY); + + return true; +} + +/* Notify all virtqueues on an interrupt. */ +static irqreturn_t vm_interrupt(int irq, void *opaque) +{ + struct virtio_mmio_device *vm_dev = opaque; + struct virtio_mmio_vq_info *info; + unsigned long status; + unsigned long flags; + irqreturn_t ret = IRQ_NONE; + + /* Read and acknowledge interrupts */ + status = readl(vm_dev->base + VIRTIO_MMIO_INTERRUPT_STATUS); + writel(status, vm_dev->base + VIRTIO_MMIO_INTERRUPT_ACK); + + if (unlikely(status & VIRTIO_MMIO_INT_CONFIG)) { + virtio_config_changed(&vm_dev->vdev); + ret = IRQ_HANDLED; + } + + if (likely(status & VIRTIO_MMIO_INT_VRING)) { + spin_lock_irqsave(&vm_dev->lock, flags); + list_for_each_entry(info, &vm_dev->virtqueues, node) + ret |= vring_interrupt(irq, info->vq); + spin_unlock_irqrestore(&vm_dev->lock, flags); + } + + return ret; +} + + + +static void vm_del_vq(struct virtqueue *vq) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); + struct virtio_mmio_vq_info *info = vq->priv; + unsigned long flags; + unsigned int index = vq->index; + + spin_lock_irqsave(&vm_dev->lock, flags); + list_del(&info->node); + spin_unlock_irqrestore(&vm_dev->lock, flags); + + /* Select and deactivate the queue */ + writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); + if (vm_dev->version == 1) { + writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); + } else { + writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); + WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY)); + } + + vring_del_virtqueue(vq); + + kfree(info); +} + +static void vm_del_vqs(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + struct virtqueue *vq, *n; + + list_for_each_entry_safe(vq, n, &vdev->vqs, list) + vm_del_vq(vq); + + free_irq(platform_get_irq(vm_dev->pdev, 0), vm_dev); +} + +static void vm_synchronize_cbs(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + + synchronize_irq(platform_get_irq(vm_dev->pdev, 0)); +} + +static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int index, + void (*callback)(struct virtqueue *vq), + const char *name, bool ctx) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + bool (*notify)(struct virtqueue *vq); + struct virtio_mmio_vq_info *info; + struct virtqueue *vq; + unsigned long flags; + unsigned int num; + int err; + + if (__virtio_test_bit(vdev, VIRTIO_F_NOTIFICATION_DATA)) + notify = vm_notify_with_data; + else + notify = vm_notify; + + if (!name) + return NULL; + + /* Select the queue we're interested in */ + writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); + + /* Queue shouldn't already be set up. */ + if (readl(vm_dev->base + (vm_dev->version == 1 ? 
+ VIRTIO_MMIO_QUEUE_PFN : VIRTIO_MMIO_QUEUE_READY))) { + err = -ENOENT; + goto error_available; + } + + /* Allocate and fill out our active queue description */ + info = kmalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + err = -ENOMEM; + goto error_kmalloc; + } + + num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX); + if (num == 0) { + err = -ENOENT; + goto error_new_virtqueue; + } + + /* Create the vring */ + vq = vring_create_virtqueue(index, num, VIRTIO_MMIO_VRING_ALIGN, vdev, + true, true, ctx, notify, callback, name); + if (!vq) { + err = -ENOMEM; + goto error_new_virtqueue; + } + + vq->num_max = num; + + /* Activate the queue */ + writel(virtqueue_get_vring_size(vq), vm_dev->base + VIRTIO_MMIO_QUEUE_NUM); + if (vm_dev->version == 1) { + u64 q_pfn = virtqueue_get_desc_addr(vq) >> PAGE_SHIFT; + + /* + * virtio-mmio v1 uses a 32bit QUEUE PFN. If we have something + * that doesn't fit in 32bit, fail the setup rather than + * pretending to be successful. + */ + if (q_pfn >> 32) { + dev_err(&vdev->dev, + "platform bug: legacy virtio-mmio must not be used with RAM above 0x%llxGB\n", + 0x1ULL << (32 + PAGE_SHIFT - 30)); + err = -E2BIG; + goto error_bad_pfn; + } + + writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_QUEUE_ALIGN); + writel(q_pfn, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); + } else { + u64 addr; + + addr = virtqueue_get_desc_addr(vq); + writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_LOW); + writel((u32)(addr >> 32), + vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_HIGH); + + addr = virtqueue_get_avail_addr(vq); + writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_LOW); + writel((u32)(addr >> 32), + vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_HIGH); + + addr = virtqueue_get_used_addr(vq); + writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_USED_LOW); + writel((u32)(addr >> 32), + vm_dev->base + VIRTIO_MMIO_QUEUE_USED_HIGH); + + writel(1, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); + } + + vq->priv = info; + info->vq = vq; + + spin_lock_irqsave(&vm_dev->lock, flags); + list_add(&info->node, &vm_dev->virtqueues); + spin_unlock_irqrestore(&vm_dev->lock, flags); + + return vq; + +error_bad_pfn: + vring_del_virtqueue(vq); +error_new_virtqueue: + if (vm_dev->version == 1) { + writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); + } else { + writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); + WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY)); + } + kfree(info); +error_kmalloc: +error_available: + return ERR_PTR(err); +} + +static int vm_find_vqs(struct virtio_device *vdev, unsigned int nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char * const names[], + const bool *ctx, + struct irq_affinity *desc) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + int irq = platform_get_irq(vm_dev->pdev, 0); + int i, err, queue_idx = 0; + + if (irq < 0) + return irq; + + err = request_irq(irq, vm_interrupt, IRQF_SHARED, + dev_name(&vdev->dev), vm_dev); + if (err) + return err; + + if (of_property_read_bool(vm_dev->pdev->dev.of_node, "wakeup-source")) + enable_irq_wake(irq); + + for (i = 0; i < nvqs; ++i) { + if (!names[i]) { + vqs[i] = NULL; + continue; + } + + vqs[i] = vm_setup_vq(vdev, queue_idx++, callbacks[i], names[i], + ctx ? 
ctx[i] : false); + if (IS_ERR(vqs[i])) { + vm_del_vqs(vdev); + return PTR_ERR(vqs[i]); + } + } + + return 0; +} + +static const char *vm_bus_name(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + + return vm_dev->pdev->name; +} + +static bool vm_get_shm_region(struct virtio_device *vdev, + struct virtio_shm_region *region, u8 id) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + u64 len, addr; + + /* Select the region we're interested in */ + writel(id, vm_dev->base + VIRTIO_MMIO_SHM_SEL); + + /* Read the region size */ + len = (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_LEN_LOW); + len |= (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_LEN_HIGH) << 32; + + region->len = len; + + /* Check if region length is -1. If that's the case, the shared memory + * region does not exist and there is no need to proceed further. + */ + if (len == ~(u64)0) + return false; + + /* Read the region base address */ + addr = (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_BASE_LOW); + addr |= (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_BASE_HIGH) << 32; + + region->addr = addr; + + return true; +} + +static const struct virtio_config_ops virtio_mmio_config_ops = { + .get = vm_get, + .set = vm_set, + .generation = vm_generation, + .get_status = vm_get_status, + .set_status = vm_set_status, + .reset = vm_reset, + .find_vqs = vm_find_vqs, + .del_vqs = vm_del_vqs, + .get_features = vm_get_features, + .finalize_features = vm_finalize_features, + .bus_name = vm_bus_name, + .get_shm_region = vm_get_shm_region, + .synchronize_cbs = vm_synchronize_cbs, +}; + +#ifdef CONFIG_PM_SLEEP +static int virtio_mmio_freeze(struct device *dev) +{ + struct virtio_mmio_device *vm_dev = dev_get_drvdata(dev); + + return virtio_device_freeze(&vm_dev->vdev); +} + +static int virtio_mmio_restore(struct device *dev) +{ + struct virtio_mmio_device *vm_dev = dev_get_drvdata(dev); + + if (vm_dev->version == 1) + writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_GUEST_PAGE_SIZE); + + return virtio_device_restore(&vm_dev->vdev); +} + +static const struct dev_pm_ops virtio_mmio_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(virtio_mmio_freeze, virtio_mmio_restore) +}; +#endif + +static void virtio_mmio_release_dev(struct device *_d) +{ + struct virtio_device *vdev = + container_of(_d, struct virtio_device, dev); + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + + kfree(vm_dev); +} + +/* Platform device */ + +static int virtio_mmio_probe(struct platform_device *pdev) +{ + struct virtio_mmio_device *vm_dev; + unsigned long magic; + int rc; + + vm_dev = kzalloc(sizeof(*vm_dev), GFP_KERNEL); + if (!vm_dev) + return -ENOMEM; + + vm_dev->vdev.dev.parent = &pdev->dev; + vm_dev->vdev.dev.release = virtio_mmio_release_dev; + vm_dev->vdev.config = &virtio_mmio_config_ops; + vm_dev->pdev = pdev; + INIT_LIST_HEAD(&vm_dev->virtqueues); + spin_lock_init(&vm_dev->lock); + + vm_dev->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(vm_dev->base)) { + rc = PTR_ERR(vm_dev->base); + goto free_vm_dev; + } + + /* Check magic value */ + magic = readl(vm_dev->base + VIRTIO_MMIO_MAGIC_VALUE); + if (magic != ('v' | 'i' << 8 | 'r' << 16 | 't' << 24)) { + dev_warn(&pdev->dev, "Wrong magic value 0x%08lx!\n", magic); + rc = -ENODEV; + goto free_vm_dev; + } + + /* Check device version */ + vm_dev->version = readl(vm_dev->base + VIRTIO_MMIO_VERSION); + if (vm_dev->version < 1 || vm_dev->version > 2) { + dev_err(&pdev->dev, "Version %ld not supported!\n", + vm_dev->version); + rc = -ENXIO; + goto 
free_vm_dev; + } + + vm_dev->vdev.id.device = readl(vm_dev->base + VIRTIO_MMIO_DEVICE_ID); + if (vm_dev->vdev.id.device == 0) { + /* + * virtio-mmio device with an ID 0 is a (dummy) placeholder + * with no function. End probing now with no error reported. + */ + rc = -ENODEV; + goto free_vm_dev; + } + vm_dev->vdev.id.vendor = readl(vm_dev->base + VIRTIO_MMIO_VENDOR_ID); + + if (vm_dev->version == 1) { + writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_GUEST_PAGE_SIZE); + + rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); + /* + * In the legacy case, ensure our coherently-allocated virtio + * ring will be at an address expressable as a 32-bit PFN. + */ + if (!rc) + dma_set_coherent_mask(&pdev->dev, + DMA_BIT_MASK(32 + PAGE_SHIFT)); + } else { + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + } + if (rc) + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (rc) + dev_warn(&pdev->dev, "Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.\n"); + + platform_set_drvdata(pdev, vm_dev); + + rc = register_virtio_device(&vm_dev->vdev); + if (rc) + put_device(&vm_dev->vdev.dev); + + return rc; + +free_vm_dev: + kfree(vm_dev); + return rc; +} + +static int virtio_mmio_remove(struct platform_device *pdev) +{ + struct virtio_mmio_device *vm_dev = platform_get_drvdata(pdev); + unregister_virtio_device(&vm_dev->vdev); + + return 0; +} + + + +/* Devices list parameter */ + +#if defined(CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES) + +static struct device vm_cmdline_parent = { + .init_name = "virtio-mmio-cmdline", +}; + +static int vm_cmdline_parent_registered; +static int vm_cmdline_id; + +static int vm_cmdline_set(const char *device, + const struct kernel_param *kp) +{ + int err; + struct resource resources[2] = {}; + char *str; + long long base, size; + unsigned int irq; + int processed, consumed = 0; + struct platform_device *pdev; + + /* Consume "size" part of the command line parameter */ + size = memparse(device, &str); + + /* Get "@<base>:<irq>[:<id>]" chunks */ + processed = sscanf(str, "@%lli:%u%n:%d%n", + &base, &irq, &consumed, + &vm_cmdline_id, &consumed); + + /* + * sscanf() must process at least 2 chunks; also there + * must be no extra characters after the last chunk, so + * str[consumed] must be '\0' + */ + if (processed < 2 || str[consumed] || irq == 0) + return -EINVAL; + + resources[0].flags = IORESOURCE_MEM; + resources[0].start = base; + resources[0].end = base + size - 1; + + resources[1].flags = IORESOURCE_IRQ; + resources[1].start = resources[1].end = irq; + + if (!vm_cmdline_parent_registered) { + err = device_register(&vm_cmdline_parent); + if (err) { + put_device(&vm_cmdline_parent); + pr_err("Failed to register parent device!\n"); + return err; + } + vm_cmdline_parent_registered = 1; + } + + pr_info("Registering device virtio-mmio.%d at 0x%llx-0x%llx, IRQ %d.\n", + vm_cmdline_id, + (unsigned long long)resources[0].start, + (unsigned long long)resources[0].end, + (int)resources[1].start); + + pdev = platform_device_register_resndata(&vm_cmdline_parent, + "virtio-mmio", vm_cmdline_id++, + resources, ARRAY_SIZE(resources), NULL, 0); + + return PTR_ERR_OR_ZERO(pdev); +} + +static int vm_cmdline_get_device(struct device *dev, void *data) +{ + char *buffer = data; + unsigned int len = strlen(buffer); + struct platform_device *pdev = to_platform_device(dev); + + snprintf(buffer + len, PAGE_SIZE - len, "0x%llx@0x%llx:%llu:%d\n", + pdev->resource[0].end - pdev->resource[0].start + 1ULL, + (unsigned long long)pdev->resource[0].start, + 
(unsigned long long)pdev->resource[1].start, + pdev->id); + return 0; +} + +static int vm_cmdline_get(char *buffer, const struct kernel_param *kp) +{ + buffer[0] = '\0'; + device_for_each_child(&vm_cmdline_parent, buffer, + vm_cmdline_get_device); + return strlen(buffer) + 1; +} + +static const struct kernel_param_ops vm_cmdline_param_ops = { + .set = vm_cmdline_set, + .get = vm_cmdline_get, +}; + +device_param_cb(device, &vm_cmdline_param_ops, NULL, S_IRUSR); + +static int vm_unregister_cmdline_device(struct device *dev, + void *data) +{ + platform_device_unregister(to_platform_device(dev)); + + return 0; +} + +static void vm_unregister_cmdline_devices(void) +{ + if (vm_cmdline_parent_registered) { + device_for_each_child(&vm_cmdline_parent, NULL, + vm_unregister_cmdline_device); + device_unregister(&vm_cmdline_parent); + vm_cmdline_parent_registered = 0; + } +} + +#else + +static void vm_unregister_cmdline_devices(void) +{ +} + +#endif + +/* Platform driver */ + +static const struct of_device_id virtio_mmio_match[] = { + { .compatible = "virtio,mmio", }, + {}, +}; +MODULE_DEVICE_TABLE(of, virtio_mmio_match); + +#ifdef CONFIG_ACPI +static const struct acpi_device_id virtio_mmio_acpi_match[] = { + { "LNRO0005", }, + { } +}; +MODULE_DEVICE_TABLE(acpi, virtio_mmio_acpi_match); +#endif + +static struct platform_driver virtio_mmio_driver = { + .probe = virtio_mmio_probe, + .remove = virtio_mmio_remove, + .driver = { + .name = "virtio-mmio", + .of_match_table = virtio_mmio_match, + .acpi_match_table = ACPI_PTR(virtio_mmio_acpi_match), +#ifdef CONFIG_PM_SLEEP + .pm = &virtio_mmio_pm_ops, +#endif + }, +}; + +static int __init virtio_mmio_init(void) +{ + return platform_driver_register(&virtio_mmio_driver); +} + +static void __exit virtio_mmio_exit(void) +{ + platform_driver_unregister(&virtio_mmio_driver); + vm_unregister_cmdline_devices(); +} + +module_init(virtio_mmio_init); +module_exit(virtio_mmio_exit); + +MODULE_AUTHOR("Pawel Moll <pawel.moll@arm.com>"); +MODULE_DESCRIPTION("Platform bus driver for memory mapped virtio devices"); +MODULE_LICENSE("GPL"); diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c new file mode 100644 index 0000000000..c2524a7207 --- /dev/null +++ b/drivers/virtio/virtio_pci_common.c @@ -0,0 +1,650 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Virtio PCI driver - common functionality for all device versions + * + * This module allows virtio devices to be used over a virtual PCI device. + * This can be used with QEMU based VMMs like KVM or Xen. + * + * Copyright IBM Corp. 2007 + * Copyright Red Hat, Inc. 2014 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Rusty Russell <rusty@rustcorp.com.au> + * Michael S. 
Tsirkin <mst@redhat.com> + */ + +#include "virtio_pci_common.h" + +static bool force_legacy = false; + +#if IS_ENABLED(CONFIG_VIRTIO_PCI_LEGACY) +module_param(force_legacy, bool, 0444); +MODULE_PARM_DESC(force_legacy, + "Force legacy mode for transitional virtio 1 devices"); +#endif + +/* wait for pending irq handlers */ +void vp_synchronize_vectors(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + int i; + + if (vp_dev->intx_enabled) + synchronize_irq(vp_dev->pci_dev->irq); + + for (i = 0; i < vp_dev->msix_vectors; ++i) + synchronize_irq(pci_irq_vector(vp_dev->pci_dev, i)); +} + +/* the notify function used when creating a virt queue */ +bool vp_notify(struct virtqueue *vq) +{ + /* we write the queue's selector into the notification register to + * signal the other end */ + iowrite16(vq->index, (void __iomem *)vq->priv); + return true; +} + +/* Handle a configuration change: Tell driver if it wants to know. */ +static irqreturn_t vp_config_changed(int irq, void *opaque) +{ + struct virtio_pci_device *vp_dev = opaque; + + virtio_config_changed(&vp_dev->vdev); + return IRQ_HANDLED; +} + +/* Notify all virtqueues on an interrupt. */ +static irqreturn_t vp_vring_interrupt(int irq, void *opaque) +{ + struct virtio_pci_device *vp_dev = opaque; + struct virtio_pci_vq_info *info; + irqreturn_t ret = IRQ_NONE; + unsigned long flags; + + spin_lock_irqsave(&vp_dev->lock, flags); + list_for_each_entry(info, &vp_dev->virtqueues, node) { + if (vring_interrupt(irq, info->vq) == IRQ_HANDLED) + ret = IRQ_HANDLED; + } + spin_unlock_irqrestore(&vp_dev->lock, flags); + + return ret; +} + +/* A small wrapper to also acknowledge the interrupt when it's handled. + * I really need an EIO hook for the vring so I can ack the interrupt once we + * know that we'll be handling the IRQ but before we invoke the callback since + * the callback may notify the host which results in the host attempting to + * raise an interrupt that we would then mask once we acknowledged the + * interrupt. */ +static irqreturn_t vp_interrupt(int irq, void *opaque) +{ + struct virtio_pci_device *vp_dev = opaque; + u8 isr; + + /* reading the ISR has the effect of also clearing it so it's very + * important to save off the value. */ + isr = ioread8(vp_dev->isr); + + /* It's definitely not us if the ISR was not high */ + if (!isr) + return IRQ_NONE; + + /* Configuration change? Tell driver if it wants to know. 
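+	 * With INTx, this single ISR byte reports both configuration changes
+	 * and queue activity, so check the config bit first and then scan
+	 * every virtqueue.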
*/ + if (isr & VIRTIO_PCI_ISR_CONFIG) + vp_config_changed(irq, opaque); + + return vp_vring_interrupt(irq, opaque); +} + +static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, + bool per_vq_vectors, struct irq_affinity *desc) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + const char *name = dev_name(&vp_dev->vdev.dev); + unsigned int flags = PCI_IRQ_MSIX; + unsigned int i, v; + int err = -ENOMEM; + + vp_dev->msix_vectors = nvectors; + + vp_dev->msix_names = kmalloc_array(nvectors, + sizeof(*vp_dev->msix_names), + GFP_KERNEL); + if (!vp_dev->msix_names) + goto error; + vp_dev->msix_affinity_masks + = kcalloc(nvectors, sizeof(*vp_dev->msix_affinity_masks), + GFP_KERNEL); + if (!vp_dev->msix_affinity_masks) + goto error; + for (i = 0; i < nvectors; ++i) + if (!alloc_cpumask_var(&vp_dev->msix_affinity_masks[i], + GFP_KERNEL)) + goto error; + + if (desc) { + flags |= PCI_IRQ_AFFINITY; + desc->pre_vectors++; /* virtio config vector */ + } + + err = pci_alloc_irq_vectors_affinity(vp_dev->pci_dev, nvectors, + nvectors, flags, desc); + if (err < 0) + goto error; + vp_dev->msix_enabled = 1; + + /* Set the vector used for configuration */ + v = vp_dev->msix_used_vectors; + snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, + "%s-config", name); + err = request_irq(pci_irq_vector(vp_dev->pci_dev, v), + vp_config_changed, 0, vp_dev->msix_names[v], + vp_dev); + if (err) + goto error; + ++vp_dev->msix_used_vectors; + + v = vp_dev->config_vector(vp_dev, v); + /* Verify we had enough resources to assign the vector */ + if (v == VIRTIO_MSI_NO_VECTOR) { + err = -EBUSY; + goto error; + } + + if (!per_vq_vectors) { + /* Shared vector for all VQs */ + v = vp_dev->msix_used_vectors; + snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, + "%s-virtqueues", name); + err = request_irq(pci_irq_vector(vp_dev->pci_dev, v), + vp_vring_interrupt, 0, vp_dev->msix_names[v], + vp_dev); + if (err) + goto error; + ++vp_dev->msix_used_vectors; + } + return 0; +error: + return err; +} + +static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int index, + void (*callback)(struct virtqueue *vq), + const char *name, + bool ctx, + u16 msix_vec) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_vq_info *info = kmalloc(sizeof *info, GFP_KERNEL); + struct virtqueue *vq; + unsigned long flags; + + /* fill out our structure that represents an active queue */ + if (!info) + return ERR_PTR(-ENOMEM); + + vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, ctx, + msix_vec); + if (IS_ERR(vq)) + goto out_info; + + info->vq = vq; + if (callback) { + spin_lock_irqsave(&vp_dev->lock, flags); + list_add(&info->node, &vp_dev->virtqueues); + spin_unlock_irqrestore(&vp_dev->lock, flags); + } else { + INIT_LIST_HEAD(&info->node); + } + + vp_dev->vqs[index] = info; + return vq; + +out_info: + kfree(info); + return vq; +} + +static void vp_del_vq(struct virtqueue *vq) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index]; + unsigned long flags; + + /* + * If it fails during re-enable reset vq. This way we won't rejoin + * info->node to the queue. Prevent unexpected irqs. 
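+	 * A queue that is currently reset (vq->reset) was already unlinked
+	 * from the virtqueues list by vp_modern_disable_vq_and_reset(), so it
+	 * must not be removed from the list a second time here.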
+ */ + if (!vq->reset) { + spin_lock_irqsave(&vp_dev->lock, flags); + list_del(&info->node); + spin_unlock_irqrestore(&vp_dev->lock, flags); + } + + vp_dev->del_vq(info); + kfree(info); +} + +/* the config->del_vqs() implementation */ +void vp_del_vqs(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtqueue *vq, *n; + int i; + + list_for_each_entry_safe(vq, n, &vdev->vqs, list) { + if (vp_dev->per_vq_vectors) { + int v = vp_dev->vqs[vq->index]->msix_vector; + + if (v != VIRTIO_MSI_NO_VECTOR) { + int irq = pci_irq_vector(vp_dev->pci_dev, v); + + irq_set_affinity_hint(irq, NULL); + free_irq(irq, vq); + } + } + vp_del_vq(vq); + } + vp_dev->per_vq_vectors = false; + + if (vp_dev->intx_enabled) { + free_irq(vp_dev->pci_dev->irq, vp_dev); + vp_dev->intx_enabled = 0; + } + + for (i = 0; i < vp_dev->msix_used_vectors; ++i) + free_irq(pci_irq_vector(vp_dev->pci_dev, i), vp_dev); + + if (vp_dev->msix_affinity_masks) { + for (i = 0; i < vp_dev->msix_vectors; i++) + free_cpumask_var(vp_dev->msix_affinity_masks[i]); + } + + if (vp_dev->msix_enabled) { + /* Disable the vector used for configuration */ + vp_dev->config_vector(vp_dev, VIRTIO_MSI_NO_VECTOR); + + pci_free_irq_vectors(vp_dev->pci_dev); + vp_dev->msix_enabled = 0; + } + + vp_dev->msix_vectors = 0; + vp_dev->msix_used_vectors = 0; + kfree(vp_dev->msix_names); + vp_dev->msix_names = NULL; + kfree(vp_dev->msix_affinity_masks); + vp_dev->msix_affinity_masks = NULL; + kfree(vp_dev->vqs); + vp_dev->vqs = NULL; +} + +static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, + struct virtqueue *vqs[], vq_callback_t *callbacks[], + const char * const names[], bool per_vq_vectors, + const bool *ctx, + struct irq_affinity *desc) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + u16 msix_vec; + int i, err, nvectors, allocated_vectors, queue_idx = 0; + + vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); + if (!vp_dev->vqs) + return -ENOMEM; + + if (per_vq_vectors) { + /* Best option: one for change interrupt, one per vq. */ + nvectors = 1; + for (i = 0; i < nvqs; ++i) + if (names[i] && callbacks[i]) + ++nvectors; + } else { + /* Second best: one for change, shared for all vqs. */ + nvectors = 2; + } + + err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors, + per_vq_vectors ? desc : NULL); + if (err) + goto error_find; + + vp_dev->per_vq_vectors = per_vq_vectors; + allocated_vectors = vp_dev->msix_used_vectors; + for (i = 0; i < nvqs; ++i) { + if (!names[i]) { + vqs[i] = NULL; + continue; + } + + if (!callbacks[i]) + msix_vec = VIRTIO_MSI_NO_VECTOR; + else if (vp_dev->per_vq_vectors) + msix_vec = allocated_vectors++; + else + msix_vec = VP_MSIX_VQ_VECTOR; + vqs[i] = vp_setup_vq(vdev, queue_idx++, callbacks[i], names[i], + ctx ? 
ctx[i] : false, + msix_vec); + if (IS_ERR(vqs[i])) { + err = PTR_ERR(vqs[i]); + goto error_find; + } + + if (!vp_dev->per_vq_vectors || msix_vec == VIRTIO_MSI_NO_VECTOR) + continue; + + /* allocate per-vq irq if available and necessary */ + snprintf(vp_dev->msix_names[msix_vec], + sizeof *vp_dev->msix_names, + "%s-%s", + dev_name(&vp_dev->vdev.dev), names[i]); + err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec), + vring_interrupt, 0, + vp_dev->msix_names[msix_vec], + vqs[i]); + if (err) + goto error_find; + } + return 0; + +error_find: + vp_del_vqs(vdev); + return err; +} + +static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs, + struct virtqueue *vqs[], vq_callback_t *callbacks[], + const char * const names[], const bool *ctx) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + int i, err, queue_idx = 0; + + vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); + if (!vp_dev->vqs) + return -ENOMEM; + + err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED, + dev_name(&vdev->dev), vp_dev); + if (err) + goto out_del_vqs; + + vp_dev->intx_enabled = 1; + vp_dev->per_vq_vectors = false; + for (i = 0; i < nvqs; ++i) { + if (!names[i]) { + vqs[i] = NULL; + continue; + } + vqs[i] = vp_setup_vq(vdev, queue_idx++, callbacks[i], names[i], + ctx ? ctx[i] : false, + VIRTIO_MSI_NO_VECTOR); + if (IS_ERR(vqs[i])) { + err = PTR_ERR(vqs[i]); + goto out_del_vqs; + } + } + + return 0; +out_del_vqs: + vp_del_vqs(vdev); + return err; +} + +/* the config->find_vqs() implementation */ +int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs, + struct virtqueue *vqs[], vq_callback_t *callbacks[], + const char * const names[], const bool *ctx, + struct irq_affinity *desc) +{ + int err; + + /* Try MSI-X with one vector per queue. */ + err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, true, ctx, desc); + if (!err) + return 0; + /* Fallback: MSI-X with one vector for config, one shared for queues. */ + err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, false, ctx, desc); + if (!err) + return 0; + /* Is there an interrupt? If not give up. */ + if (!(to_vp_device(vdev)->pci_dev->irq)) + return err; + /* Finally fall back to regular interrupts. 
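+	 * A single shared INTx line is then serviced by vp_interrupt(), which
+	 * dispatches both config changes and all virtqueue callbacks.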
*/ + return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names, ctx); +} + +const char *vp_bus_name(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + return pci_name(vp_dev->pci_dev); +} + +/* Setup the affinity for a virtqueue: + * - force the affinity for per vq vector + * - OR over all affinities for shared MSI + * - ignore the affinity request if we're using INTX + */ +int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) +{ + struct virtio_device *vdev = vq->vdev; + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index]; + struct cpumask *mask; + unsigned int irq; + + if (!vq->callback) + return -EINVAL; + + if (vp_dev->msix_enabled) { + mask = vp_dev->msix_affinity_masks[info->msix_vector]; + irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector); + if (!cpu_mask) + irq_set_affinity_hint(irq, NULL); + else { + cpumask_copy(mask, cpu_mask); + irq_set_affinity_hint(irq, mask); + } + } + return 0; +} + +const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + if (!vp_dev->per_vq_vectors || + vp_dev->vqs[index]->msix_vector == VIRTIO_MSI_NO_VECTOR) + return NULL; + + return pci_irq_get_affinity(vp_dev->pci_dev, + vp_dev->vqs[index]->msix_vector); +} + +#ifdef CONFIG_PM_SLEEP +static int virtio_pci_freeze(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + int ret; + + ret = virtio_device_freeze(&vp_dev->vdev); + + if (!ret) + pci_disable_device(pci_dev); + return ret; +} + +static int virtio_pci_restore(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + int ret; + + ret = pci_enable_device(pci_dev); + if (ret) + return ret; + + pci_set_master(pci_dev); + return virtio_device_restore(&vp_dev->vdev); +} + +static const struct dev_pm_ops virtio_pci_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(virtio_pci_freeze, virtio_pci_restore) +}; +#endif + + +/* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */ +static const struct pci_device_id virtio_pci_id_table[] = { + { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) }, + { 0 } +}; + +MODULE_DEVICE_TABLE(pci, virtio_pci_id_table); + +static void virtio_pci_release_dev(struct device *_d) +{ + struct virtio_device *vdev = dev_to_virtio(_d); + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + /* As struct device is a kobject, it's not safe to + * free the memory (including the reference counter itself) + * until it's release callback. */ + kfree(vp_dev); +} + +static int virtio_pci_probe(struct pci_dev *pci_dev, + const struct pci_device_id *id) +{ + struct virtio_pci_device *vp_dev, *reg_dev = NULL; + int rc; + + /* allocate our structure and fill it out */ + vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL); + if (!vp_dev) + return -ENOMEM; + + pci_set_drvdata(pci_dev, vp_dev); + vp_dev->vdev.dev.parent = &pci_dev->dev; + vp_dev->vdev.dev.release = virtio_pci_release_dev; + vp_dev->pci_dev = pci_dev; + INIT_LIST_HEAD(&vp_dev->virtqueues); + spin_lock_init(&vp_dev->lock); + + /* enable the device */ + rc = pci_enable_device(pci_dev); + if (rc) + goto err_enable_device; + + if (force_legacy) { + rc = virtio_pci_legacy_probe(vp_dev); + /* Also try modern mode if we can't map BAR0 (no IO space). 
*/ + if (rc == -ENODEV || rc == -ENOMEM) + rc = virtio_pci_modern_probe(vp_dev); + if (rc) + goto err_probe; + } else { + rc = virtio_pci_modern_probe(vp_dev); + if (rc == -ENODEV) + rc = virtio_pci_legacy_probe(vp_dev); + if (rc) + goto err_probe; + } + + pci_set_master(pci_dev); + + rc = register_virtio_device(&vp_dev->vdev); + reg_dev = vp_dev; + if (rc) + goto err_register; + + return 0; + +err_register: + if (vp_dev->is_legacy) + virtio_pci_legacy_remove(vp_dev); + else + virtio_pci_modern_remove(vp_dev); +err_probe: + pci_disable_device(pci_dev); +err_enable_device: + if (reg_dev) + put_device(&vp_dev->vdev.dev); + else + kfree(vp_dev); + return rc; +} + +static void virtio_pci_remove(struct pci_dev *pci_dev) +{ + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + struct device *dev = get_device(&vp_dev->vdev.dev); + + /* + * Device is marked broken on surprise removal so that virtio upper + * layers can abort any ongoing operation. + */ + if (!pci_device_is_present(pci_dev)) + virtio_break_device(&vp_dev->vdev); + + pci_disable_sriov(pci_dev); + + unregister_virtio_device(&vp_dev->vdev); + + if (vp_dev->is_legacy) + virtio_pci_legacy_remove(vp_dev); + else + virtio_pci_modern_remove(vp_dev); + + pci_disable_device(pci_dev); + put_device(dev); +} + +static int virtio_pci_sriov_configure(struct pci_dev *pci_dev, int num_vfs) +{ + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + struct virtio_device *vdev = &vp_dev->vdev; + int ret; + + if (!(vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK)) + return -EBUSY; + + if (!__virtio_test_bit(vdev, VIRTIO_F_SR_IOV)) + return -EINVAL; + + if (pci_vfs_assigned(pci_dev)) + return -EPERM; + + if (num_vfs == 0) { + pci_disable_sriov(pci_dev); + return 0; + } + + ret = pci_enable_sriov(pci_dev, num_vfs); + if (ret < 0) + return ret; + + return num_vfs; +} + +static struct pci_driver virtio_pci_driver = { + .name = "virtio-pci", + .id_table = virtio_pci_id_table, + .probe = virtio_pci_probe, + .remove = virtio_pci_remove, +#ifdef CONFIG_PM_SLEEP + .driver.pm = &virtio_pci_pm_ops, +#endif + .sriov_configure = virtio_pci_sriov_configure, +}; + +module_pci_driver(virtio_pci_driver); + +MODULE_AUTHOR("Anthony Liguori <aliguori@us.ibm.com>"); +MODULE_DESCRIPTION("virtio-pci"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1"); diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h new file mode 100644 index 0000000000..4b773bd7c5 --- /dev/null +++ b/drivers/virtio/virtio_pci_common.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _DRIVERS_VIRTIO_VIRTIO_PCI_COMMON_H +#define _DRIVERS_VIRTIO_VIRTIO_PCI_COMMON_H +/* + * Virtio PCI driver - APIs for common functionality for all device versions + * + * This module allows virtio devices to be used over a virtual PCI device. + * This can be used with QEMU based VMMs like KVM or Xen. + * + * Copyright IBM Corp. 2007 + * Copyright Red Hat, Inc. 2014 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Rusty Russell <rusty@rustcorp.com.au> + * Michael S. 
Tsirkin <mst@redhat.com> + */ + +#include <linux/module.h> +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/virtio.h> +#include <linux/virtio_config.h> +#include <linux/virtio_ring.h> +#include <linux/virtio_pci.h> +#include <linux/virtio_pci_legacy.h> +#include <linux/virtio_pci_modern.h> +#include <linux/highmem.h> +#include <linux/spinlock.h> + +struct virtio_pci_vq_info { + /* the actual virtqueue */ + struct virtqueue *vq; + + /* the list node for the virtqueues list */ + struct list_head node; + + /* MSI-X vector (or none) */ + unsigned int msix_vector; +}; + +/* Our device structure */ +struct virtio_pci_device { + struct virtio_device vdev; + struct pci_dev *pci_dev; + union { + struct virtio_pci_legacy_device ldev; + struct virtio_pci_modern_device mdev; + }; + bool is_legacy; + + /* Where to read and clear interrupt */ + u8 __iomem *isr; + + /* a list of queues so we can dispatch IRQs */ + spinlock_t lock; + struct list_head virtqueues; + + /* array of all queues for house-keeping */ + struct virtio_pci_vq_info **vqs; + + /* MSI-X support */ + int msix_enabled; + int intx_enabled; + cpumask_var_t *msix_affinity_masks; + /* Name strings for interrupts. This size should be enough, + * and I'm too lazy to allocate each name separately. */ + char (*msix_names)[256]; + /* Number of available vectors */ + unsigned int msix_vectors; + /* Vectors allocated, excluding per-vq vectors if any */ + unsigned int msix_used_vectors; + + /* Whether we have vector per vq */ + bool per_vq_vectors; + + struct virtqueue *(*setup_vq)(struct virtio_pci_device *vp_dev, + struct virtio_pci_vq_info *info, + unsigned int idx, + void (*callback)(struct virtqueue *vq), + const char *name, + bool ctx, + u16 msix_vec); + void (*del_vq)(struct virtio_pci_vq_info *info); + + u16 (*config_vector)(struct virtio_pci_device *vp_dev, u16 vector); +}; + +/* Constants for MSI-X */ +/* Use first vector for configuration changes, second and the rest for + * virtqueues Thus, we need at least 2 vectors for MSI. 
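+ * VP_MSIX_VQ_VECTOR is the vector shared by all virtqueues; when one vector
+ * is used per virtqueue, queue vectors are handed out starting from index 1
+ * instead.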
*/ +enum { + VP_MSIX_CONFIG_VECTOR = 0, + VP_MSIX_VQ_VECTOR = 1, +}; + +/* Convert a generic virtio device to our structure */ +static struct virtio_pci_device *to_vp_device(struct virtio_device *vdev) +{ + return container_of(vdev, struct virtio_pci_device, vdev); +} + +/* wait for pending irq handlers */ +void vp_synchronize_vectors(struct virtio_device *vdev); +/* the notify function used when creating a virt queue */ +bool vp_notify(struct virtqueue *vq); +/* the config->del_vqs() implementation */ +void vp_del_vqs(struct virtio_device *vdev); +/* the config->find_vqs() implementation */ +int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs, + struct virtqueue *vqs[], vq_callback_t *callbacks[], + const char * const names[], const bool *ctx, + struct irq_affinity *desc); +const char *vp_bus_name(struct virtio_device *vdev); + +/* Setup the affinity for a virtqueue: + * - force the affinity for per vq vector + * - OR over all affinities for shared MSI + * - ignore the affinity request if we're using INTX + */ +int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask); + +const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index); + +#if IS_ENABLED(CONFIG_VIRTIO_PCI_LEGACY) +int virtio_pci_legacy_probe(struct virtio_pci_device *); +void virtio_pci_legacy_remove(struct virtio_pci_device *); +#else +static inline int virtio_pci_legacy_probe(struct virtio_pci_device *vp_dev) +{ + return -ENODEV; +} +static inline void virtio_pci_legacy_remove(struct virtio_pci_device *vp_dev) +{ +} +#endif +int virtio_pci_modern_probe(struct virtio_pci_device *); +void virtio_pci_modern_remove(struct virtio_pci_device *); + +#endif diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c new file mode 100644 index 0000000000..d9cbb02b35 --- /dev/null +++ b/drivers/virtio/virtio_pci_legacy.c @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Virtio PCI driver - legacy device support + * + * This module allows virtio devices to be used over a virtual PCI device. + * This can be used with QEMU based VMMs like KVM or Xen. + * + * Copyright IBM Corp. 2007 + * Copyright Red Hat, Inc. 2014 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Rusty Russell <rusty@rustcorp.com.au> + * Michael S. Tsirkin <mst@redhat.com> + */ + +#include "linux/virtio_pci_legacy.h" +#include "virtio_pci_common.h" + +/* virtio config->get_features() implementation */ +static u64 vp_get_features(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + /* When someone needs more than 32 feature bits, we'll need to + * steal a bit to indicate that the rest are somewhere else. */ + return vp_legacy_get_features(&vp_dev->ldev); +} + +/* virtio config->finalize_features() implementation */ +static int vp_finalize_features(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + /* Give virtio_ring a chance to accept features. */ + vring_transport_features(vdev); + + /* Make sure we don't have any features > 32 bits! */ + BUG_ON((u32)vdev->features != vdev->features); + + /* We only support 32 feature bits. 
*/ + vp_legacy_set_features(&vp_dev->ldev, vdev->features); + + return 0; +} + +/* virtio config->get() implementation */ +static void vp_get(struct virtio_device *vdev, unsigned int offset, + void *buf, unsigned int len) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + void __iomem *ioaddr = vp_dev->ldev.ioaddr + + VIRTIO_PCI_CONFIG_OFF(vp_dev->msix_enabled) + + offset; + u8 *ptr = buf; + int i; + + for (i = 0; i < len; i++) + ptr[i] = ioread8(ioaddr + i); +} + +/* the config->set() implementation. it's symmetric to the config->get() + * implementation */ +static void vp_set(struct virtio_device *vdev, unsigned int offset, + const void *buf, unsigned int len) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + void __iomem *ioaddr = vp_dev->ldev.ioaddr + + VIRTIO_PCI_CONFIG_OFF(vp_dev->msix_enabled) + + offset; + const u8 *ptr = buf; + int i; + + for (i = 0; i < len; i++) + iowrite8(ptr[i], ioaddr + i); +} + +/* config->{get,set}_status() implementations */ +static u8 vp_get_status(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + return vp_legacy_get_status(&vp_dev->ldev); +} + +static void vp_set_status(struct virtio_device *vdev, u8 status) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + /* We should never be setting status to 0. */ + BUG_ON(status == 0); + vp_legacy_set_status(&vp_dev->ldev, status); +} + +static void vp_reset(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + /* 0 status means a reset. */ + vp_legacy_set_status(&vp_dev->ldev, 0); + /* Flush out the status write, and flush in device writes, + * including MSi-X interrupts, if any. */ + vp_legacy_get_status(&vp_dev->ldev); + /* Flush pending VQ/configuration callbacks. */ + vp_synchronize_vectors(vdev); +} + +static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector) +{ + return vp_legacy_config_vector(&vp_dev->ldev, vector); +} + +static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, + struct virtio_pci_vq_info *info, + unsigned int index, + void (*callback)(struct virtqueue *vq), + const char *name, + bool ctx, + u16 msix_vec) +{ + struct virtqueue *vq; + u16 num; + int err; + u64 q_pfn; + + /* Check if queue is either not available or already active. 
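+	 * A queue size of zero means the queue does not exist; a non-zero
+	 * queue PFN means it has already been set up.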
*/ + num = vp_legacy_get_queue_size(&vp_dev->ldev, index); + if (!num || vp_legacy_get_queue_enable(&vp_dev->ldev, index)) + return ERR_PTR(-ENOENT); + + info->msix_vector = msix_vec; + + /* create the vring */ + vq = vring_create_virtqueue(index, num, + VIRTIO_PCI_VRING_ALIGN, &vp_dev->vdev, + true, false, ctx, + vp_notify, callback, name); + if (!vq) + return ERR_PTR(-ENOMEM); + + vq->num_max = num; + + q_pfn = virtqueue_get_desc_addr(vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT; + if (q_pfn >> 32) { + dev_err(&vp_dev->pci_dev->dev, + "platform bug: legacy virtio-pci must not be used with RAM above 0x%llxGB\n", + 0x1ULL << (32 + PAGE_SHIFT - 30)); + err = -E2BIG; + goto out_del_vq; + } + + /* activate the queue */ + vp_legacy_set_queue_address(&vp_dev->ldev, index, q_pfn); + + vq->priv = (void __force *)vp_dev->ldev.ioaddr + VIRTIO_PCI_QUEUE_NOTIFY; + + if (msix_vec != VIRTIO_MSI_NO_VECTOR) { + msix_vec = vp_legacy_queue_vector(&vp_dev->ldev, index, msix_vec); + if (msix_vec == VIRTIO_MSI_NO_VECTOR) { + err = -EBUSY; + goto out_deactivate; + } + } + + return vq; + +out_deactivate: + vp_legacy_set_queue_address(&vp_dev->ldev, index, 0); +out_del_vq: + vring_del_virtqueue(vq); + return ERR_PTR(err); +} + +static void del_vq(struct virtio_pci_vq_info *info) +{ + struct virtqueue *vq = info->vq; + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + + if (vp_dev->msix_enabled) { + vp_legacy_queue_vector(&vp_dev->ldev, vq->index, + VIRTIO_MSI_NO_VECTOR); + /* Flush the write out to device */ + ioread8(vp_dev->ldev.ioaddr + VIRTIO_PCI_ISR); + } + + /* Select and deactivate the queue */ + vp_legacy_set_queue_address(&vp_dev->ldev, vq->index, 0); + + vring_del_virtqueue(vq); +} + +static const struct virtio_config_ops virtio_pci_config_ops = { + .get = vp_get, + .set = vp_set, + .get_status = vp_get_status, + .set_status = vp_set_status, + .reset = vp_reset, + .find_vqs = vp_find_vqs, + .del_vqs = vp_del_vqs, + .synchronize_cbs = vp_synchronize_vectors, + .get_features = vp_get_features, + .finalize_features = vp_finalize_features, + .bus_name = vp_bus_name, + .set_vq_affinity = vp_set_vq_affinity, + .get_vq_affinity = vp_get_vq_affinity, +}; + +/* the PCI probing function */ +int virtio_pci_legacy_probe(struct virtio_pci_device *vp_dev) +{ + struct virtio_pci_legacy_device *ldev = &vp_dev->ldev; + struct pci_dev *pci_dev = vp_dev->pci_dev; + int rc; + + ldev->pci_dev = pci_dev; + + rc = vp_legacy_probe(ldev); + if (rc) + return rc; + + vp_dev->isr = ldev->isr; + vp_dev->vdev.id = ldev->id; + + vp_dev->vdev.config = &virtio_pci_config_ops; + + vp_dev->config_vector = vp_config_vector; + vp_dev->setup_vq = setup_vq; + vp_dev->del_vq = del_vq; + vp_dev->is_legacy = true; + + return 0; +} + +void virtio_pci_legacy_remove(struct virtio_pci_device *vp_dev) +{ + struct virtio_pci_legacy_device *ldev = &vp_dev->ldev; + + vp_legacy_remove(ldev); +} diff --git a/drivers/virtio/virtio_pci_legacy_dev.c b/drivers/virtio/virtio_pci_legacy_dev.c new file mode 100644 index 0000000000..677d1f68bc --- /dev/null +++ b/drivers/virtio/virtio_pci_legacy_dev.c @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "linux/virtio_pci.h" +#include <linux/virtio_pci_legacy.h> +#include <linux/module.h> +#include <linux/pci.h> + + +/* + * vp_legacy_probe: probe the legacy virtio pci device, note that the + * caller is required to enable PCI device before calling this function. 
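+ * The caller must also have filled in @ldev->pci_dev.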
+ * @ldev: the legacy virtio-pci device + * + * Return 0 on succeed otherwise fail + */ +int vp_legacy_probe(struct virtio_pci_legacy_device *ldev) +{ + struct pci_dev *pci_dev = ldev->pci_dev; + int rc; + + /* We only own devices >= 0x1000 and <= 0x103f: leave the rest. */ + if (pci_dev->device < 0x1000 || pci_dev->device > 0x103f) + return -ENODEV; + + if (pci_dev->revision != VIRTIO_PCI_ABI_VERSION) + return -ENODEV; + + rc = dma_set_mask(&pci_dev->dev, DMA_BIT_MASK(64)); + if (rc) { + rc = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(32)); + } else { + /* + * The virtio ring base address is expressed as a 32-bit PFN, + * with a page size of 1 << VIRTIO_PCI_QUEUE_ADDR_SHIFT. + */ + dma_set_coherent_mask(&pci_dev->dev, + DMA_BIT_MASK(32 + VIRTIO_PCI_QUEUE_ADDR_SHIFT)); + } + + if (rc) + dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.\n"); + + rc = pci_request_region(pci_dev, 0, "virtio-pci-legacy"); + if (rc) + return rc; + + ldev->ioaddr = pci_iomap(pci_dev, 0, 0); + if (!ldev->ioaddr) { + rc = -EIO; + goto err_iomap; + } + + ldev->isr = ldev->ioaddr + VIRTIO_PCI_ISR; + + ldev->id.vendor = pci_dev->subsystem_vendor; + ldev->id.device = pci_dev->subsystem_device; + + return 0; +err_iomap: + pci_release_region(pci_dev, 0); + return rc; +} +EXPORT_SYMBOL_GPL(vp_legacy_probe); + +/* + * vp_legacy_probe: remove and cleanup the legacy virtio pci device + * @ldev: the legacy virtio-pci device + */ +void vp_legacy_remove(struct virtio_pci_legacy_device *ldev) +{ + struct pci_dev *pci_dev = ldev->pci_dev; + + pci_iounmap(pci_dev, ldev->ioaddr); + pci_release_region(pci_dev, 0); +} +EXPORT_SYMBOL_GPL(vp_legacy_remove); + +/* + * vp_legacy_get_features - get features from device + * @ldev: the legacy virtio-pci device + * + * Returns the features read from the device + */ +u64 vp_legacy_get_features(struct virtio_pci_legacy_device *ldev) +{ + + return ioread32(ldev->ioaddr + VIRTIO_PCI_HOST_FEATURES); +} +EXPORT_SYMBOL_GPL(vp_legacy_get_features); + +/* + * vp_legacy_get_driver_features - get driver features from device + * @ldev: the legacy virtio-pci device + * + * Returns the driver features read from the device + */ +u64 vp_legacy_get_driver_features(struct virtio_pci_legacy_device *ldev) +{ + return ioread32(ldev->ioaddr + VIRTIO_PCI_GUEST_FEATURES); +} +EXPORT_SYMBOL_GPL(vp_legacy_get_driver_features); + +/* + * vp_legacy_set_features - set features to device + * @ldev: the legacy virtio-pci device + * @features: the features set to device + */ +void vp_legacy_set_features(struct virtio_pci_legacy_device *ldev, + u32 features) +{ + iowrite32(features, ldev->ioaddr + VIRTIO_PCI_GUEST_FEATURES); +} +EXPORT_SYMBOL_GPL(vp_legacy_set_features); + +/* + * vp_legacy_get_status - get the device status + * @ldev: the legacy virtio-pci device + * + * Returns the status read from device + */ +u8 vp_legacy_get_status(struct virtio_pci_legacy_device *ldev) +{ + return ioread8(ldev->ioaddr + VIRTIO_PCI_STATUS); +} +EXPORT_SYMBOL_GPL(vp_legacy_get_status); + +/* + * vp_legacy_set_status - set status to device + * @ldev: the legacy virtio-pci device + * @status: the status set to device + */ +void vp_legacy_set_status(struct virtio_pci_legacy_device *ldev, + u8 status) +{ + iowrite8(status, ldev->ioaddr + VIRTIO_PCI_STATUS); +} +EXPORT_SYMBOL_GPL(vp_legacy_set_status); + +/* + * vp_legacy_queue_vector - set the MSIX vector for a specific virtqueue + * @ldev: the legacy virtio-pci device + * @index: queue index + * @vector: the config 
vector + * + * Returns the config vector read from the device + */ +u16 vp_legacy_queue_vector(struct virtio_pci_legacy_device *ldev, + u16 index, u16 vector) +{ + iowrite16(index, ldev->ioaddr + VIRTIO_PCI_QUEUE_SEL); + iowrite16(vector, ldev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); + /* Flush the write out to device */ + return ioread16(ldev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); +} +EXPORT_SYMBOL_GPL(vp_legacy_queue_vector); + +/* + * vp_legacy_config_vector - set the vector for config interrupt + * @ldev: the legacy virtio-pci device + * @vector: the config vector + * + * Returns the config vector read from the device + */ +u16 vp_legacy_config_vector(struct virtio_pci_legacy_device *ldev, + u16 vector) +{ + /* Setup the vector used for configuration events */ + iowrite16(vector, ldev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); + /* Verify we had enough resources to assign the vector */ + /* Will also flush the write out to device */ + return ioread16(ldev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); +} +EXPORT_SYMBOL_GPL(vp_legacy_config_vector); + +/* + * vp_legacy_set_queue_address - set the virtqueue address + * @ldev: the legacy virtio-pci device + * @index: the queue index + * @queue_pfn: pfn of the virtqueue + */ +void vp_legacy_set_queue_address(struct virtio_pci_legacy_device *ldev, + u16 index, u32 queue_pfn) +{ + iowrite16(index, ldev->ioaddr + VIRTIO_PCI_QUEUE_SEL); + iowrite32(queue_pfn, ldev->ioaddr + VIRTIO_PCI_QUEUE_PFN); +} +EXPORT_SYMBOL_GPL(vp_legacy_set_queue_address); + +/* + * vp_legacy_get_queue_enable - enable a virtqueue + * @ldev: the legacy virtio-pci device + * @index: the queue index + * + * Returns whether a virtqueue is enabled or not + */ +bool vp_legacy_get_queue_enable(struct virtio_pci_legacy_device *ldev, + u16 index) +{ + iowrite16(index, ldev->ioaddr + VIRTIO_PCI_QUEUE_SEL); + return ioread32(ldev->ioaddr + VIRTIO_PCI_QUEUE_PFN); +} +EXPORT_SYMBOL_GPL(vp_legacy_get_queue_enable); + +/* + * vp_legacy_get_queue_size - get size for a virtqueue + * @ldev: the legacy virtio-pci device + * @index: the queue index + * + * Returns the size of the virtqueue + */ +u16 vp_legacy_get_queue_size(struct virtio_pci_legacy_device *ldev, + u16 index) +{ + iowrite16(index, ldev->ioaddr + VIRTIO_PCI_QUEUE_SEL); + return ioread16(ldev->ioaddr + VIRTIO_PCI_QUEUE_NUM); +} +EXPORT_SYMBOL_GPL(vp_legacy_get_queue_size); + +MODULE_VERSION("0.1"); +MODULE_DESCRIPTION("Legacy Virtio PCI Device"); +MODULE_AUTHOR("Wu Zongyong <wuzongyong@linux.alibaba.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c new file mode 100644 index 0000000000..d6bb68ba84 --- /dev/null +++ b/drivers/virtio/virtio_pci_modern.c @@ -0,0 +1,566 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Virtio PCI driver - modern (virtio 1.0) device support + * + * This module allows virtio devices to be used over a virtual PCI device. + * This can be used with QEMU based VMMs like KVM or Xen. + * + * Copyright IBM Corp. 2007 + * Copyright Red Hat, Inc. 2014 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Rusty Russell <rusty@rustcorp.com.au> + * Michael S. 
Tsirkin <mst@redhat.com> + */ + +#include <linux/delay.h> +#define VIRTIO_PCI_NO_LEGACY +#define VIRTIO_RING_NO_LEGACY +#include "virtio_pci_common.h" + +static u64 vp_get_features(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + return vp_modern_get_features(&vp_dev->mdev); +} + +static void vp_transport_features(struct virtio_device *vdev, u64 features) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct pci_dev *pci_dev = vp_dev->pci_dev; + + if ((features & BIT_ULL(VIRTIO_F_SR_IOV)) && + pci_find_ext_capability(pci_dev, PCI_EXT_CAP_ID_SRIOV)) + __virtio_set_bit(vdev, VIRTIO_F_SR_IOV); + + if (features & BIT_ULL(VIRTIO_F_RING_RESET)) + __virtio_set_bit(vdev, VIRTIO_F_RING_RESET); +} + +/* virtio config->finalize_features() implementation */ +static int vp_finalize_features(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + u64 features = vdev->features; + + /* Give virtio_ring a chance to accept features. */ + vring_transport_features(vdev); + + /* Give virtio_pci a chance to accept features. */ + vp_transport_features(vdev, features); + + if (!__virtio_test_bit(vdev, VIRTIO_F_VERSION_1)) { + dev_err(&vdev->dev, "virtio: device uses modern interface " + "but does not have VIRTIO_F_VERSION_1\n"); + return -EINVAL; + } + + vp_modern_set_features(&vp_dev->mdev, vdev->features); + + return 0; +} + +/* virtio config->get() implementation */ +static void vp_get(struct virtio_device *vdev, unsigned int offset, + void *buf, unsigned int len) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + void __iomem *device = mdev->device; + u8 b; + __le16 w; + __le32 l; + + BUG_ON(offset + len > mdev->device_len); + + switch (len) { + case 1: + b = ioread8(device + offset); + memcpy(buf, &b, sizeof b); + break; + case 2: + w = cpu_to_le16(ioread16(device + offset)); + memcpy(buf, &w, sizeof w); + break; + case 4: + l = cpu_to_le32(ioread32(device + offset)); + memcpy(buf, &l, sizeof l); + break; + case 8: + l = cpu_to_le32(ioread32(device + offset)); + memcpy(buf, &l, sizeof l); + l = cpu_to_le32(ioread32(device + offset + sizeof l)); + memcpy(buf + sizeof l, &l, sizeof l); + break; + default: + BUG(); + } +} + +/* the config->set() implementation. 
it's symmetric to the config->get() + * implementation */ +static void vp_set(struct virtio_device *vdev, unsigned int offset, + const void *buf, unsigned int len) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + void __iomem *device = mdev->device; + u8 b; + __le16 w; + __le32 l; + + BUG_ON(offset + len > mdev->device_len); + + switch (len) { + case 1: + memcpy(&b, buf, sizeof b); + iowrite8(b, device + offset); + break; + case 2: + memcpy(&w, buf, sizeof w); + iowrite16(le16_to_cpu(w), device + offset); + break; + case 4: + memcpy(&l, buf, sizeof l); + iowrite32(le32_to_cpu(l), device + offset); + break; + case 8: + memcpy(&l, buf, sizeof l); + iowrite32(le32_to_cpu(l), device + offset); + memcpy(&l, buf + sizeof l, sizeof l); + iowrite32(le32_to_cpu(l), device + offset + sizeof l); + break; + default: + BUG(); + } +} + +static u32 vp_generation(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + return vp_modern_generation(&vp_dev->mdev); +} + +/* config->{get,set}_status() implementations */ +static u8 vp_get_status(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + return vp_modern_get_status(&vp_dev->mdev); +} + +static void vp_set_status(struct virtio_device *vdev, u8 status) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + /* We should never be setting status to 0. */ + BUG_ON(status == 0); + vp_modern_set_status(&vp_dev->mdev, status); +} + +static void vp_reset(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + + /* 0 status means a reset. */ + vp_modern_set_status(mdev, 0); + /* After writing 0 to device_status, the driver MUST wait for a read of + * device_status to return 0 before reinitializing the device. + * This will flush out the status write, and flush in device writes, + * including MSI-X interrupts, if any. + */ + while (vp_modern_get_status(mdev)) + msleep(1); + /* Flush pending VQ/configuration callbacks. 
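+	 * vp_synchronize_vectors() waits for any interrupt handlers that were
+	 * already running when the status write landed.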
*/ + vp_synchronize_vectors(vdev); +} + +static int vp_active_vq(struct virtqueue *vq, u16 msix_vec) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + unsigned long index; + + index = vq->index; + + /* activate the queue */ + vp_modern_set_queue_size(mdev, index, virtqueue_get_vring_size(vq)); + vp_modern_queue_address(mdev, index, virtqueue_get_desc_addr(vq), + virtqueue_get_avail_addr(vq), + virtqueue_get_used_addr(vq)); + + if (msix_vec != VIRTIO_MSI_NO_VECTOR) { + msix_vec = vp_modern_queue_vector(mdev, index, msix_vec); + if (msix_vec == VIRTIO_MSI_NO_VECTOR) + return -EBUSY; + } + + return 0; +} + +static int vp_modern_disable_vq_and_reset(struct virtqueue *vq) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + struct virtio_pci_vq_info *info; + unsigned long flags; + + if (!virtio_has_feature(vq->vdev, VIRTIO_F_RING_RESET)) + return -ENOENT; + + vp_modern_set_queue_reset(mdev, vq->index); + + info = vp_dev->vqs[vq->index]; + + /* delete vq from irq handler */ + spin_lock_irqsave(&vp_dev->lock, flags); + list_del(&info->node); + spin_unlock_irqrestore(&vp_dev->lock, flags); + + INIT_LIST_HEAD(&info->node); + +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION + __virtqueue_break(vq); +#endif + + /* For the case where vq has an exclusive irq, call synchronize_irq() to + * wait for completion. + * + * note: We can't use disable_irq() since it conflicts with the affinity + * managed IRQ that is used by some drivers. + */ + if (vp_dev->per_vq_vectors && info->msix_vector != VIRTIO_MSI_NO_VECTOR) + synchronize_irq(pci_irq_vector(vp_dev->pci_dev, info->msix_vector)); + + vq->reset = true; + + return 0; +} + +static int vp_modern_enable_vq_after_reset(struct virtqueue *vq) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + struct virtio_pci_vq_info *info; + unsigned long flags, index; + int err; + + if (!vq->reset) + return -EBUSY; + + index = vq->index; + info = vp_dev->vqs[index]; + + if (vp_modern_get_queue_reset(mdev, index)) + return -EBUSY; + + if (vp_modern_get_queue_enable(mdev, index)) + return -EBUSY; + + err = vp_active_vq(vq, info->msix_vector); + if (err) + return err; + + if (vq->callback) { + spin_lock_irqsave(&vp_dev->lock, flags); + list_add(&info->node, &vp_dev->virtqueues); + spin_unlock_irqrestore(&vp_dev->lock, flags); + } else { + INIT_LIST_HEAD(&info->node); + } + +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION + __virtqueue_unbreak(vq); +#endif + + vp_modern_set_queue_enable(&vp_dev->mdev, index, true); + vq->reset = false; + + return 0; +} + +static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector) +{ + return vp_modern_config_vector(&vp_dev->mdev, vector); +} + +static bool vp_notify_with_data(struct virtqueue *vq) +{ + u32 data = vring_notification_data(vq); + + iowrite32(data, (void __iomem *)vq->priv); + + return true; +} + +static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, + struct virtio_pci_vq_info *info, + unsigned int index, + void (*callback)(struct virtqueue *vq), + const char *name, + bool ctx, + u16 msix_vec) +{ + + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + bool (*notify)(struct virtqueue *vq); + struct virtqueue *vq; + u16 num; + int err; + + if (__virtio_test_bit(&vp_dev->vdev, VIRTIO_F_NOTIFICATION_DATA)) + notify = vp_notify_with_data; + else + notify = vp_notify; + + if (index >= 
vp_modern_get_num_queues(mdev)) + return ERR_PTR(-EINVAL); + + /* Check if queue is either not available or already active. */ + num = vp_modern_get_queue_size(mdev, index); + if (!num || vp_modern_get_queue_enable(mdev, index)) + return ERR_PTR(-ENOENT); + + info->msix_vector = msix_vec; + + /* create the vring */ + vq = vring_create_virtqueue(index, num, + SMP_CACHE_BYTES, &vp_dev->vdev, + true, true, ctx, + notify, callback, name); + if (!vq) + return ERR_PTR(-ENOMEM); + + vq->num_max = num; + + err = vp_active_vq(vq, msix_vec); + if (err) + goto err; + + vq->priv = (void __force *)vp_modern_map_vq_notify(mdev, index, NULL); + if (!vq->priv) { + err = -ENOMEM; + goto err; + } + + return vq; + +err: + vring_del_virtqueue(vq); + return ERR_PTR(err); +} + +static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned int nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char * const names[], const bool *ctx, + struct irq_affinity *desc) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtqueue *vq; + int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, desc); + + if (rc) + return rc; + + /* Select and activate all queues. Has to be done last: once we do + * this, there's no way to go back except reset. + */ + list_for_each_entry(vq, &vdev->vqs, list) + vp_modern_set_queue_enable(&vp_dev->mdev, vq->index, true); + + return 0; +} + +static void del_vq(struct virtio_pci_vq_info *info) +{ + struct virtqueue *vq = info->vq; + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + + if (vp_dev->msix_enabled) + vp_modern_queue_vector(mdev, vq->index, + VIRTIO_MSI_NO_VECTOR); + + if (!mdev->notify_base) + pci_iounmap(mdev->pci_dev, (void __force __iomem *)vq->priv); + + vring_del_virtqueue(vq); +} + +static int virtio_pci_find_shm_cap(struct pci_dev *dev, u8 required_id, + u8 *bar, u64 *offset, u64 *len) +{ + int pos; + + for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR); pos > 0; + pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) { + u8 type, cap_len, id, res_bar; + u32 tmp32; + u64 res_offset, res_length; + + pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, + cfg_type), &type); + if (type != VIRTIO_PCI_CAP_SHARED_MEMORY_CFG) + continue; + + pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, + cap_len), &cap_len); + if (cap_len != sizeof(struct virtio_pci_cap64)) { + dev_err(&dev->dev, "%s: shm cap with bad size offset:" + " %d size: %d\n", __func__, pos, cap_len); + continue; + } + + pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, + id), &id); + if (id != required_id) + continue; + + pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, + bar), &res_bar); + if (res_bar >= PCI_STD_NUM_BARS) + continue; + + /* Type and ID match, and the BAR value isn't reserved. + * Looks good. 
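+		 * The 64-bit offset and length are split between struct
+		 * virtio_pci_cap and the virtio_pci_cap64 high words read
+		 * below.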
+ */ + + /* Read the lower 32bit of length and offset */ + pci_read_config_dword(dev, pos + offsetof(struct virtio_pci_cap, + offset), &tmp32); + res_offset = tmp32; + pci_read_config_dword(dev, pos + offsetof(struct virtio_pci_cap, + length), &tmp32); + res_length = tmp32; + + /* and now the top half */ + pci_read_config_dword(dev, + pos + offsetof(struct virtio_pci_cap64, + offset_hi), &tmp32); + res_offset |= ((u64)tmp32) << 32; + pci_read_config_dword(dev, + pos + offsetof(struct virtio_pci_cap64, + length_hi), &tmp32); + res_length |= ((u64)tmp32) << 32; + + *bar = res_bar; + *offset = res_offset; + *len = res_length; + + return pos; + } + return 0; +} + +static bool vp_get_shm_region(struct virtio_device *vdev, + struct virtio_shm_region *region, u8 id) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct pci_dev *pci_dev = vp_dev->pci_dev; + u8 bar; + u64 offset, len; + phys_addr_t phys_addr; + size_t bar_len; + + if (!virtio_pci_find_shm_cap(pci_dev, id, &bar, &offset, &len)) + return false; + + phys_addr = pci_resource_start(pci_dev, bar); + bar_len = pci_resource_len(pci_dev, bar); + + if ((offset + len) < offset) { + dev_err(&pci_dev->dev, "%s: cap offset+len overflow detected\n", + __func__); + return false; + } + + if (offset + len > bar_len) { + dev_err(&pci_dev->dev, "%s: bar shorter than cap offset+len\n", + __func__); + return false; + } + + region->len = len; + region->addr = (u64) phys_addr + offset; + + return true; +} + +static const struct virtio_config_ops virtio_pci_config_nodev_ops = { + .get = NULL, + .set = NULL, + .generation = vp_generation, + .get_status = vp_get_status, + .set_status = vp_set_status, + .reset = vp_reset, + .find_vqs = vp_modern_find_vqs, + .del_vqs = vp_del_vqs, + .synchronize_cbs = vp_synchronize_vectors, + .get_features = vp_get_features, + .finalize_features = vp_finalize_features, + .bus_name = vp_bus_name, + .set_vq_affinity = vp_set_vq_affinity, + .get_vq_affinity = vp_get_vq_affinity, + .get_shm_region = vp_get_shm_region, + .disable_vq_and_reset = vp_modern_disable_vq_and_reset, + .enable_vq_after_reset = vp_modern_enable_vq_after_reset, +}; + +static const struct virtio_config_ops virtio_pci_config_ops = { + .get = vp_get, + .set = vp_set, + .generation = vp_generation, + .get_status = vp_get_status, + .set_status = vp_set_status, + .reset = vp_reset, + .find_vqs = vp_modern_find_vqs, + .del_vqs = vp_del_vqs, + .synchronize_cbs = vp_synchronize_vectors, + .get_features = vp_get_features, + .finalize_features = vp_finalize_features, + .bus_name = vp_bus_name, + .set_vq_affinity = vp_set_vq_affinity, + .get_vq_affinity = vp_get_vq_affinity, + .get_shm_region = vp_get_shm_region, + .disable_vq_and_reset = vp_modern_disable_vq_and_reset, + .enable_vq_after_reset = vp_modern_enable_vq_after_reset, +}; + +/* the PCI probing function */ +int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) +{ + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + struct pci_dev *pci_dev = vp_dev->pci_dev; + int err; + + mdev->pci_dev = pci_dev; + + err = vp_modern_probe(mdev); + if (err) + return err; + + if (mdev->device) + vp_dev->vdev.config = &virtio_pci_config_ops; + else + vp_dev->vdev.config = &virtio_pci_config_nodev_ops; + + vp_dev->config_vector = vp_config_vector; + vp_dev->setup_vq = setup_vq; + vp_dev->del_vq = del_vq; + vp_dev->isr = mdev->isr; + vp_dev->vdev.id = mdev->id; + + return 0; +} + +void virtio_pci_modern_remove(struct virtio_pci_device *vp_dev) +{ + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; 
+ + vp_modern_remove(mdev); +} diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c new file mode 100644 index 0000000000..9cb601e166 --- /dev/null +++ b/drivers/virtio/virtio_pci_modern_dev.c @@ -0,0 +1,720 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/virtio_pci_modern.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/delay.h> + +/* + * vp_modern_map_capability - map a part of virtio pci capability + * @mdev: the modern virtio-pci device + * @off: offset of the capability + * @minlen: minimal length of the capability + * @align: align requirement + * @start: start from the capability + * @size: map size + * @len: the length that is actually mapped + * @pa: physical address of the capability + * + * Returns the io address of for the part of the capability + */ +static void __iomem * +vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off, + size_t minlen, u32 align, u32 start, u32 size, + size_t *len, resource_size_t *pa) +{ + struct pci_dev *dev = mdev->pci_dev; + u8 bar; + u32 offset, length; + void __iomem *p; + + pci_read_config_byte(dev, off + offsetof(struct virtio_pci_cap, + bar), + &bar); + pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, offset), + &offset); + pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, length), + &length); + + /* Check if the BAR may have changed since we requested the region. */ + if (bar >= PCI_STD_NUM_BARS || !(mdev->modern_bars & (1 << bar))) { + dev_err(&dev->dev, + "virtio_pci: bar unexpectedly changed to %u\n", bar); + return NULL; + } + + if (length <= start) { + dev_err(&dev->dev, + "virtio_pci: bad capability len %u (>%u expected)\n", + length, start); + return NULL; + } + + if (length - start < minlen) { + dev_err(&dev->dev, + "virtio_pci: bad capability len %u (>=%zu expected)\n", + length, minlen); + return NULL; + } + + length -= start; + + if (start + offset < offset) { + dev_err(&dev->dev, + "virtio_pci: map wrap-around %u+%u\n", + start, offset); + return NULL; + } + + offset += start; + + if (offset & (align - 1)) { + dev_err(&dev->dev, + "virtio_pci: offset %u not aligned to %u\n", + offset, align); + return NULL; + } + + if (length > size) + length = size; + + if (len) + *len = length; + + if (minlen + offset < minlen || + minlen + offset > pci_resource_len(dev, bar)) { + dev_err(&dev->dev, + "virtio_pci: map virtio %zu@%u " + "out of range on bar %i length %lu\n", + minlen, offset, + bar, (unsigned long)pci_resource_len(dev, bar)); + return NULL; + } + + p = pci_iomap_range(dev, bar, offset, length); + if (!p) + dev_err(&dev->dev, + "virtio_pci: unable to map virtio %u@%u on bar %i\n", + length, offset, bar); + else if (pa) + *pa = pci_resource_start(dev, bar) + offset; + + return p; +} + +/** + * virtio_pci_find_capability - walk capabilities to find device info. + * @dev: the pci device + * @cfg_type: the VIRTIO_PCI_CAP_* value we seek + * @ioresource_types: IORESOURCE_MEM and/or IORESOURCE_IO. + * @bars: the bitmask of BARs + * + * Returns offset of the capability, or 0. 
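+ * Only capabilities whose BAR index is below PCI_STD_NUM_BARS and whose
+ * resource matches @ioresource_types with a non-zero length are considered.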
+ */ +static inline int virtio_pci_find_capability(struct pci_dev *dev, u8 cfg_type, + u32 ioresource_types, int *bars) +{ + int pos; + + for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR); + pos > 0; + pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) { + u8 type, bar; + pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, + cfg_type), + &type); + pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, + bar), + &bar); + + /* Ignore structures with reserved BAR values */ + if (bar >= PCI_STD_NUM_BARS) + continue; + + if (type == cfg_type) { + if (pci_resource_len(dev, bar) && + pci_resource_flags(dev, bar) & ioresource_types) { + *bars |= (1 << bar); + return pos; + } + } + } + return 0; +} + +/* This is part of the ABI. Don't screw with it. */ +static inline void check_offsets(void) +{ + /* Note: disk space was harmed in compilation of this function. */ + BUILD_BUG_ON(VIRTIO_PCI_CAP_VNDR != + offsetof(struct virtio_pci_cap, cap_vndr)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_NEXT != + offsetof(struct virtio_pci_cap, cap_next)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_LEN != + offsetof(struct virtio_pci_cap, cap_len)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_CFG_TYPE != + offsetof(struct virtio_pci_cap, cfg_type)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_BAR != + offsetof(struct virtio_pci_cap, bar)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_OFFSET != + offsetof(struct virtio_pci_cap, offset)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_LENGTH != + offsetof(struct virtio_pci_cap, length)); + BUILD_BUG_ON(VIRTIO_PCI_NOTIFY_CAP_MULT != + offsetof(struct virtio_pci_notify_cap, + notify_off_multiplier)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_DFSELECT != + offsetof(struct virtio_pci_common_cfg, + device_feature_select)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_DF != + offsetof(struct virtio_pci_common_cfg, device_feature)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_GFSELECT != + offsetof(struct virtio_pci_common_cfg, + guest_feature_select)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_GF != + offsetof(struct virtio_pci_common_cfg, guest_feature)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_MSIX != + offsetof(struct virtio_pci_common_cfg, msix_config)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_NUMQ != + offsetof(struct virtio_pci_common_cfg, num_queues)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_STATUS != + offsetof(struct virtio_pci_common_cfg, device_status)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_CFGGENERATION != + offsetof(struct virtio_pci_common_cfg, config_generation)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SELECT != + offsetof(struct virtio_pci_common_cfg, queue_select)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SIZE != + offsetof(struct virtio_pci_common_cfg, queue_size)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_MSIX != + offsetof(struct virtio_pci_common_cfg, queue_msix_vector)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_ENABLE != + offsetof(struct virtio_pci_common_cfg, queue_enable)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_NOFF != + offsetof(struct virtio_pci_common_cfg, queue_notify_off)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCLO != + offsetof(struct virtio_pci_common_cfg, queue_desc_lo)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCHI != + offsetof(struct virtio_pci_common_cfg, queue_desc_hi)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILLO != + offsetof(struct virtio_pci_common_cfg, queue_avail_lo)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILHI != + offsetof(struct virtio_pci_common_cfg, queue_avail_hi)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDLO != + offsetof(struct virtio_pci_common_cfg, queue_used_lo)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDHI != + offsetof(struct virtio_pci_common_cfg, queue_used_hi)); 
+} + +/* + * vp_modern_probe: probe the modern virtio pci device, note that the + * caller is required to enable PCI device before calling this function. + * @mdev: the modern virtio-pci device + * + * Return 0 on succeed otherwise fail + */ +int vp_modern_probe(struct virtio_pci_modern_device *mdev) +{ + struct pci_dev *pci_dev = mdev->pci_dev; + int err, common, isr, notify, device; + u32 notify_length; + u32 notify_offset; + int devid; + + check_offsets(); + + if (mdev->device_id_check) { + devid = mdev->device_id_check(pci_dev); + if (devid < 0) + return devid; + mdev->id.device = devid; + } else { + /* We only own devices >= 0x1000 and <= 0x107f: leave the rest. */ + if (pci_dev->device < 0x1000 || pci_dev->device > 0x107f) + return -ENODEV; + + if (pci_dev->device < 0x1040) { + /* Transitional devices: use the PCI subsystem device id as + * virtio device id, same as legacy driver always did. + */ + mdev->id.device = pci_dev->subsystem_device; + } else { + /* Modern devices: simply use PCI device id, but start from 0x1040. */ + mdev->id.device = pci_dev->device - 0x1040; + } + } + mdev->id.vendor = pci_dev->subsystem_vendor; + + /* check for a common config: if not, use legacy mode (bar 0). */ + common = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_COMMON_CFG, + IORESOURCE_IO | IORESOURCE_MEM, + &mdev->modern_bars); + if (!common) { + dev_info(&pci_dev->dev, + "virtio_pci: leaving for legacy driver\n"); + return -ENODEV; + } + + /* If common is there, these should be too... */ + isr = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_ISR_CFG, + IORESOURCE_IO | IORESOURCE_MEM, + &mdev->modern_bars); + notify = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_NOTIFY_CFG, + IORESOURCE_IO | IORESOURCE_MEM, + &mdev->modern_bars); + if (!isr || !notify) { + dev_err(&pci_dev->dev, + "virtio_pci: missing capabilities %i/%i/%i\n", + common, isr, notify); + return -EINVAL; + } + + err = dma_set_mask_and_coherent(&pci_dev->dev, + mdev->dma_mask ? : DMA_BIT_MASK(64)); + if (err) + err = dma_set_mask_and_coherent(&pci_dev->dev, + DMA_BIT_MASK(32)); + if (err) + dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.\n"); + + /* Device capability is only mandatory for devices that have + * device-specific configuration. + */ + device = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_DEVICE_CFG, + IORESOURCE_IO | IORESOURCE_MEM, + &mdev->modern_bars); + + err = pci_request_selected_regions(pci_dev, mdev->modern_bars, + "virtio-pci-modern"); + if (err) + return err; + + err = -EINVAL; + mdev->common = vp_modern_map_capability(mdev, common, + sizeof(struct virtio_pci_common_cfg), 4, + 0, sizeof(struct virtio_pci_modern_common_cfg), + NULL, NULL); + if (!mdev->common) + goto err_map_common; + mdev->isr = vp_modern_map_capability(mdev, isr, sizeof(u8), 1, + 0, 1, + NULL, NULL); + if (!mdev->isr) + goto err_map_isr; + + /* Read notify_off_multiplier from config space. */ + pci_read_config_dword(pci_dev, + notify + offsetof(struct virtio_pci_notify_cap, + notify_off_multiplier), + &mdev->notify_offset_multiplier); + /* Read notify length and offset from config space. */ + pci_read_config_dword(pci_dev, + notify + offsetof(struct virtio_pci_notify_cap, + cap.length), + ¬ify_length); + + pci_read_config_dword(pci_dev, + notify + offsetof(struct virtio_pci_notify_cap, + cap.offset), + ¬ify_offset); + + /* We don't know how many VQs we'll map, ahead of the time. + * If notify length is small, map it all now. 
+ * Otherwise, map each VQ individually later. + */ + if ((u64)notify_length + (notify_offset % PAGE_SIZE) <= PAGE_SIZE) { + mdev->notify_base = vp_modern_map_capability(mdev, notify, + 2, 2, + 0, notify_length, + &mdev->notify_len, + &mdev->notify_pa); + if (!mdev->notify_base) + goto err_map_notify; + } else { + mdev->notify_map_cap = notify; + } + + /* Again, we don't know how much we should map, but PAGE_SIZE + * is more than enough for all existing devices. + */ + if (device) { + mdev->device = vp_modern_map_capability(mdev, device, 0, 4, + 0, PAGE_SIZE, + &mdev->device_len, + NULL); + if (!mdev->device) + goto err_map_device; + } + + return 0; + +err_map_device: + if (mdev->notify_base) + pci_iounmap(pci_dev, mdev->notify_base); +err_map_notify: + pci_iounmap(pci_dev, mdev->isr); +err_map_isr: + pci_iounmap(pci_dev, mdev->common); +err_map_common: + pci_release_selected_regions(pci_dev, mdev->modern_bars); + return err; +} +EXPORT_SYMBOL_GPL(vp_modern_probe); + +/* + * vp_modern_remove: remove and cleanup the modern virtio pci device + * @mdev: the modern virtio-pci device + */ +void vp_modern_remove(struct virtio_pci_modern_device *mdev) +{ + struct pci_dev *pci_dev = mdev->pci_dev; + + if (mdev->device) + pci_iounmap(pci_dev, mdev->device); + if (mdev->notify_base) + pci_iounmap(pci_dev, mdev->notify_base); + pci_iounmap(pci_dev, mdev->isr); + pci_iounmap(pci_dev, mdev->common); + pci_release_selected_regions(pci_dev, mdev->modern_bars); +} +EXPORT_SYMBOL_GPL(vp_modern_remove); + +/* + * vp_modern_get_features - get features from device + * @mdev: the modern virtio-pci device + * + * Returns the features read from the device + */ +u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + u64 features; + + vp_iowrite32(0, &cfg->device_feature_select); + features = vp_ioread32(&cfg->device_feature); + vp_iowrite32(1, &cfg->device_feature_select); + features |= ((u64)vp_ioread32(&cfg->device_feature) << 32); + + return features; +} +EXPORT_SYMBOL_GPL(vp_modern_get_features); + +/* + * vp_modern_get_driver_features - get driver features from device + * @mdev: the modern virtio-pci device + * + * Returns the driver features read from the device + */ +u64 vp_modern_get_driver_features(struct virtio_pci_modern_device *mdev) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + u64 features; + + vp_iowrite32(0, &cfg->guest_feature_select); + features = vp_ioread32(&cfg->guest_feature); + vp_iowrite32(1, &cfg->guest_feature_select); + features |= ((u64)vp_ioread32(&cfg->guest_feature) << 32); + + return features; +} +EXPORT_SYMBOL_GPL(vp_modern_get_driver_features); + +/* + * vp_modern_set_features - set features to device + * @mdev: the modern virtio-pci device + * @features: the features set to device + */ +void vp_modern_set_features(struct virtio_pci_modern_device *mdev, + u64 features) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + vp_iowrite32(0, &cfg->guest_feature_select); + vp_iowrite32((u32)features, &cfg->guest_feature); + vp_iowrite32(1, &cfg->guest_feature_select); + vp_iowrite32(features >> 32, &cfg->guest_feature); +} +EXPORT_SYMBOL_GPL(vp_modern_set_features); + +/* + * vp_modern_generation - get the device genreation + * @mdev: the modern virtio-pci device + * + * Returns the genreation read from device + */ +u32 vp_modern_generation(struct virtio_pci_modern_device *mdev) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + return 
vp_ioread8(&cfg->config_generation); +} +EXPORT_SYMBOL_GPL(vp_modern_generation); + +/* + * vp_modern_get_status - get the device status + * @mdev: the modern virtio-pci device + * + * Returns the status read from device + */ +u8 vp_modern_get_status(struct virtio_pci_modern_device *mdev) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + return vp_ioread8(&cfg->device_status); +} +EXPORT_SYMBOL_GPL(vp_modern_get_status); + +/* + * vp_modern_set_status - set status to device + * @mdev: the modern virtio-pci device + * @status: the status set to device + */ +void vp_modern_set_status(struct virtio_pci_modern_device *mdev, + u8 status) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + /* + * Per memory-barriers.txt, wmb() is not needed to guarantee + * that the cache coherent memory writes have completed + * before writing to the MMIO region. + */ + vp_iowrite8(status, &cfg->device_status); +} +EXPORT_SYMBOL_GPL(vp_modern_set_status); + +/* + * vp_modern_get_queue_reset - get the queue reset status + * @mdev: the modern virtio-pci device + * @index: queue index + */ +int vp_modern_get_queue_reset(struct virtio_pci_modern_device *mdev, u16 index) +{ + struct virtio_pci_modern_common_cfg __iomem *cfg; + + cfg = (struct virtio_pci_modern_common_cfg __iomem *)mdev->common; + + vp_iowrite16(index, &cfg->cfg.queue_select); + return vp_ioread16(&cfg->queue_reset); +} +EXPORT_SYMBOL_GPL(vp_modern_get_queue_reset); + +/* + * vp_modern_set_queue_reset - reset the queue + * @mdev: the modern virtio-pci device + * @index: queue index + */ +void vp_modern_set_queue_reset(struct virtio_pci_modern_device *mdev, u16 index) +{ + struct virtio_pci_modern_common_cfg __iomem *cfg; + + cfg = (struct virtio_pci_modern_common_cfg __iomem *)mdev->common; + + vp_iowrite16(index, &cfg->cfg.queue_select); + vp_iowrite16(1, &cfg->queue_reset); + + while (vp_ioread16(&cfg->queue_reset)) + msleep(1); + + while (vp_ioread16(&cfg->cfg.queue_enable)) + msleep(1); +} +EXPORT_SYMBOL_GPL(vp_modern_set_queue_reset); + +/* + * vp_modern_queue_vector - set the MSIX vector for a specific virtqueue + * @mdev: the modern virtio-pci device + * @index: queue index + * @vector: the config vector + * + * Returns the config vector read from the device + */ +u16 vp_modern_queue_vector(struct virtio_pci_modern_device *mdev, + u16 index, u16 vector) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + vp_iowrite16(index, &cfg->queue_select); + vp_iowrite16(vector, &cfg->queue_msix_vector); + /* Flush the write out to device */ + return vp_ioread16(&cfg->queue_msix_vector); +} +EXPORT_SYMBOL_GPL(vp_modern_queue_vector); + +/* + * vp_modern_config_vector - set the vector for config interrupt + * @mdev: the modern virtio-pci device + * @vector: the config vector + * + * Returns the config vector read from the device + */ +u16 vp_modern_config_vector(struct virtio_pci_modern_device *mdev, + u16 vector) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + /* Setup the vector used for configuration events */ + vp_iowrite16(vector, &cfg->msix_config); + /* Verify we had enough resources to assign the vector */ + /* Will also flush the write out to device */ + return vp_ioread16(&cfg->msix_config); +} +EXPORT_SYMBOL_GPL(vp_modern_config_vector); + +/* + * vp_modern_queue_address - set the virtqueue address + * @mdev: the modern virtio-pci device + * @index: the queue index + * @desc_addr: address of the descriptor area + * @driver_addr: address of the driver area + * @device_addr: address 
of the device area + */ +void vp_modern_queue_address(struct virtio_pci_modern_device *mdev, + u16 index, u64 desc_addr, u64 driver_addr, + u64 device_addr) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + vp_iowrite16(index, &cfg->queue_select); + + vp_iowrite64_twopart(desc_addr, &cfg->queue_desc_lo, + &cfg->queue_desc_hi); + vp_iowrite64_twopart(driver_addr, &cfg->queue_avail_lo, + &cfg->queue_avail_hi); + vp_iowrite64_twopart(device_addr, &cfg->queue_used_lo, + &cfg->queue_used_hi); +} +EXPORT_SYMBOL_GPL(vp_modern_queue_address); + +/* + * vp_modern_set_queue_enable - enable a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * @enable: whether the virtqueue is enable or not + */ +void vp_modern_set_queue_enable(struct virtio_pci_modern_device *mdev, + u16 index, bool enable) +{ + vp_iowrite16(index, &mdev->common->queue_select); + vp_iowrite16(enable, &mdev->common->queue_enable); +} +EXPORT_SYMBOL_GPL(vp_modern_set_queue_enable); + +/* + * vp_modern_get_queue_enable - enable a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * + * Returns whether a virtqueue is enabled or not + */ +bool vp_modern_get_queue_enable(struct virtio_pci_modern_device *mdev, + u16 index) +{ + vp_iowrite16(index, &mdev->common->queue_select); + + return vp_ioread16(&mdev->common->queue_enable); +} +EXPORT_SYMBOL_GPL(vp_modern_get_queue_enable); + +/* + * vp_modern_set_queue_size - set size for a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * @size: the size of the virtqueue + */ +void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev, + u16 index, u16 size) +{ + vp_iowrite16(index, &mdev->common->queue_select); + vp_iowrite16(size, &mdev->common->queue_size); + +} +EXPORT_SYMBOL_GPL(vp_modern_set_queue_size); + +/* + * vp_modern_get_queue_size - get size for a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * + * Returns the size of the virtqueue + */ +u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev, + u16 index) +{ + vp_iowrite16(index, &mdev->common->queue_select); + + return vp_ioread16(&mdev->common->queue_size); + +} +EXPORT_SYMBOL_GPL(vp_modern_get_queue_size); + +/* + * vp_modern_get_num_queues - get the number of virtqueues + * @mdev: the modern virtio-pci device + * + * Returns the number of virtqueues + */ +u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev) +{ + return vp_ioread16(&mdev->common->num_queues); +} +EXPORT_SYMBOL_GPL(vp_modern_get_num_queues); + +/* + * vp_modern_get_queue_notify_off - get notification offset for a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * + * Returns the notification offset for a virtqueue + */ +static u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev, + u16 index) +{ + vp_iowrite16(index, &mdev->common->queue_select); + + return vp_ioread16(&mdev->common->queue_notify_off); +} + +/* + * vp_modern_map_vq_notify - map notification area for a + * specific virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * @pa: the pointer to the physical address of the nofity area + * + * Returns the address of the notification area + */ +void __iomem *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev, + u16 index, resource_size_t *pa) +{ + u16 off = vp_modern_get_queue_notify_off(mdev, index); + + if (mdev->notify_base) { + /* offset should not wrap */ + if ((u64)off * 
mdev->notify_offset_multiplier + 2 + > mdev->notify_len) { + dev_warn(&mdev->pci_dev->dev, + "bad notification offset %u (x %u) " + "for queue %u > %zd", + off, mdev->notify_offset_multiplier, + index, mdev->notify_len); + return NULL; + } + if (pa) + *pa = mdev->notify_pa + + off * mdev->notify_offset_multiplier; + return mdev->notify_base + off * mdev->notify_offset_multiplier; + } else { + return vp_modern_map_capability(mdev, + mdev->notify_map_cap, 2, 2, + off * mdev->notify_offset_multiplier, 2, + NULL, pa); + } +} +EXPORT_SYMBOL_GPL(vp_modern_map_vq_notify); + +MODULE_VERSION("0.1"); +MODULE_DESCRIPTION("Modern Virtio PCI Device"); +MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c new file mode 100644 index 0000000000..49299b1f9e --- /dev/null +++ b/drivers/virtio/virtio_ring.c @@ -0,0 +1,3252 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Virtio ring implementation. + * + * Copyright 2007 Rusty Russell IBM Corporation + */ +#include <linux/virtio.h> +#include <linux/virtio_ring.h> +#include <linux/virtio_config.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/hrtimer.h> +#include <linux/dma-mapping.h> +#include <linux/kmsan.h> +#include <linux/spinlock.h> +#include <xen/xen.h> + +#ifdef DEBUG +/* For development, we want to crash whenever the ring is screwed. */ +#define BAD_RING(_vq, fmt, args...) \ + do { \ + dev_err(&(_vq)->vq.vdev->dev, \ + "%s:"fmt, (_vq)->vq.name, ##args); \ + BUG(); \ + } while (0) +/* Caller is supposed to guarantee no reentry. */ +#define START_USE(_vq) \ + do { \ + if ((_vq)->in_use) \ + panic("%s:in_use = %i\n", \ + (_vq)->vq.name, (_vq)->in_use); \ + (_vq)->in_use = __LINE__; \ + } while (0) +#define END_USE(_vq) \ + do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; } while(0) +#define LAST_ADD_TIME_UPDATE(_vq) \ + do { \ + ktime_t now = ktime_get(); \ + \ + /* No kick or get, with .1 second between? Warn. */ \ + if ((_vq)->last_add_time_valid) \ + WARN_ON(ktime_to_ms(ktime_sub(now, \ + (_vq)->last_add_time)) > 100); \ + (_vq)->last_add_time = now; \ + (_vq)->last_add_time_valid = true; \ + } while (0) +#define LAST_ADD_TIME_CHECK(_vq) \ + do { \ + if ((_vq)->last_add_time_valid) { \ + WARN_ON(ktime_to_ms(ktime_sub(ktime_get(), \ + (_vq)->last_add_time)) > 100); \ + } \ + } while (0) +#define LAST_ADD_TIME_INVALID(_vq) \ + ((_vq)->last_add_time_valid = false) +#else +#define BAD_RING(_vq, fmt, args...) \ + do { \ + dev_err(&_vq->vq.vdev->dev, \ + "%s:"fmt, (_vq)->vq.name, ##args); \ + (_vq)->broken = true; \ + } while (0) +#define START_USE(vq) +#define END_USE(vq) +#define LAST_ADD_TIME_UPDATE(vq) +#define LAST_ADD_TIME_CHECK(vq) +#define LAST_ADD_TIME_INVALID(vq) +#endif + +struct vring_desc_state_split { + void *data; /* Data for callback. */ + struct vring_desc *indir_desc; /* Indirect descriptor, if any. */ +}; + +struct vring_desc_state_packed { + void *data; /* Data for callback. */ + struct vring_packed_desc *indir_desc; /* Indirect descriptor, if any. */ + u16 num; /* Descriptor list length. */ + u16 last; /* The last desc state in a list. */ +}; + +struct vring_desc_extra { + dma_addr_t addr; /* Descriptor DMA addr. */ + u32 len; /* Descriptor length. */ + u16 flags; /* Descriptor flags. */ + u16 next; /* The next desc state in a list. */ +}; + +struct vring_virtqueue_split { + /* Actual memory layout for this queue. 
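+ * (Sizing sketch, per vring_size() from the uapi header rather than the
+ * code below: the split layout is descriptor table + avail ring + used
+ * ring, e.g. vring_size(256, 4096) = round_up(16*256 + 2*(3+256), 4096)
+ * + 2*3 + 8*256 = 8192 + 2054 = 10246 bytes, i.e. three pages once
+ * page-aligned.)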
*/ + struct vring vring; + + /* Last written value to avail->flags */ + u16 avail_flags_shadow; + + /* + * Last written value to avail->idx in + * guest byte order. + */ + u16 avail_idx_shadow; + + /* Per-descriptor state. */ + struct vring_desc_state_split *desc_state; + struct vring_desc_extra *desc_extra; + + /* DMA address and size information */ + dma_addr_t queue_dma_addr; + size_t queue_size_in_bytes; + + /* + * The parameters for creating vrings are reserved for creating new + * vring. + */ + u32 vring_align; + bool may_reduce_num; +}; + +struct vring_virtqueue_packed { + /* Actual memory layout for this queue. */ + struct { + unsigned int num; + struct vring_packed_desc *desc; + struct vring_packed_desc_event *driver; + struct vring_packed_desc_event *device; + } vring; + + /* Driver ring wrap counter. */ + bool avail_wrap_counter; + + /* Avail used flags. */ + u16 avail_used_flags; + + /* Index of the next avail descriptor. */ + u16 next_avail_idx; + + /* + * Last written value to driver->flags in + * guest byte order. + */ + u16 event_flags_shadow; + + /* Per-descriptor state. */ + struct vring_desc_state_packed *desc_state; + struct vring_desc_extra *desc_extra; + + /* DMA address and size information */ + dma_addr_t ring_dma_addr; + dma_addr_t driver_event_dma_addr; + dma_addr_t device_event_dma_addr; + size_t ring_size_in_bytes; + size_t event_size_in_bytes; +}; + +struct vring_virtqueue { + struct virtqueue vq; + + /* Is this a packed ring? */ + bool packed_ring; + + /* Is DMA API used? */ + bool use_dma_api; + + /* Can we use weak barriers? */ + bool weak_barriers; + + /* Other side has made a mess, don't try any more. */ + bool broken; + + /* Host supports indirect buffers */ + bool indirect; + + /* Host publishes avail event idx */ + bool event; + + /* Do DMA mapping by driver */ + bool premapped; + + /* Do unmap or not for desc. Just when premapped is False and + * use_dma_api is true, this is true. + */ + bool do_unmap; + + /* Head of free buffer list. */ + unsigned int free_head; + /* Number we've added since last sync. */ + unsigned int num_added; + + /* Last used index we've seen. + * for split ring, it just contains last used index + * for packed ring: + * bits up to VRING_PACKED_EVENT_F_WRAP_CTR include the last used index. + * bits from VRING_PACKED_EVENT_F_WRAP_CTR include the used wrap counter. + */ + u16 last_used_idx; + + /* Hint for event idx: already triggered no need to disable. */ + bool event_triggered; + + union { + /* Available for split ring */ + struct vring_virtqueue_split split; + + /* Available for packed ring */ + struct vring_virtqueue_packed packed; + }; + + /* How to notify other side. FIXME: commonalize hcalls! */ + bool (*notify)(struct virtqueue *vq); + + /* DMA, allocation, and size information */ + bool we_own_ring; + + /* Device used for doing DMA */ + struct device *dma_dev; + +#ifdef DEBUG + /* They're supposed to lock for us. */ + unsigned int in_use; + + /* Figure out if their kicks are too delayed. */ + bool last_add_time_valid; + ktime_t last_add_time; +#endif +}; + +static struct virtqueue *__vring_new_virtqueue(unsigned int index, + struct vring_virtqueue_split *vring_split, + struct virtio_device *vdev, + bool weak_barriers, + bool context, + bool (*notify)(struct virtqueue *), + void (*callback)(struct virtqueue *), + const char *name, + struct device *dma_dev); +static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num); +static void vring_free(struct virtqueue *_vq); + +/* + * Helpers. 
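+ * (Minimal sketch of the pattern, hypothetical helper for illustration:
+ *
+ *     static bool vq_is_broken(struct virtqueue *_vq)
+ *     {
+ *             return to_vvq(_vq)->broken;
+ *     }
+ *
+ * i.e. any public struct virtqueue * can be mapped back to its private
+ * vring_virtqueue via the container_of_const() based to_vvq() below.)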
+ */ + +#define to_vvq(_vq) container_of_const(_vq, struct vring_virtqueue, vq) + +static bool virtqueue_use_indirect(const struct vring_virtqueue *vq, + unsigned int total_sg) +{ + /* + * If the host supports indirect descriptor tables, and we have multiple + * buffers, then go indirect. FIXME: tune this threshold + */ + return (vq->indirect && total_sg > 1 && vq->vq.num_free); +} + +/* + * Modern virtio devices have feature bits to specify whether they need a + * quirk and bypass the IOMMU. If not there, just use the DMA API. + * + * If there, the interaction between virtio and DMA API is messy. + * + * On most systems with virtio, physical addresses match bus addresses, + * and it doesn't particularly matter whether we use the DMA API. + * + * On some systems, including Xen and any system with a physical device + * that speaks virtio behind a physical IOMMU, we must use the DMA API + * for virtio DMA to work at all. + * + * On other systems, including SPARC and PPC64, virtio-pci devices are + * enumerated as though they are behind an IOMMU, but the virtio host + * ignores the IOMMU, so we must either pretend that the IOMMU isn't + * there or somehow map everything as the identity. + * + * For the time being, we preserve historic behavior and bypass the DMA + * API. + * + * TODO: install a per-device DMA ops structure that does the right thing + * taking into account all the above quirks, and use the DMA API + * unconditionally on data path. + */ + +static bool vring_use_dma_api(const struct virtio_device *vdev) +{ + if (!virtio_has_dma_quirk(vdev)) + return true; + + /* Otherwise, we are left to guess. */ + /* + * In theory, it's possible to have a buggy QEMU-supposed + * emulated Q35 IOMMU and Xen enabled at the same time. On + * such a configuration, virtio has never worked and will + * not work without an even larger kludge. Instead, enable + * the DMA API if we're a Xen guest, which at least allows + * all of the sensible Xen configurations to work correctly. + */ + if (xen_domain()) + return true; + + return false; +} + +size_t virtio_max_dma_size(const struct virtio_device *vdev) +{ + size_t max_segment_size = SIZE_MAX; + + if (vring_use_dma_api(vdev)) + max_segment_size = dma_max_mapping_size(vdev->dev.parent); + + return max_segment_size; +} +EXPORT_SYMBOL_GPL(virtio_max_dma_size); + +static void *vring_alloc_queue(struct virtio_device *vdev, size_t size, + dma_addr_t *dma_handle, gfp_t flag, + struct device *dma_dev) +{ + if (vring_use_dma_api(vdev)) { + return dma_alloc_coherent(dma_dev, size, + dma_handle, flag); + } else { + void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag); + + if (queue) { + phys_addr_t phys_addr = virt_to_phys(queue); + *dma_handle = (dma_addr_t)phys_addr; + + /* + * Sanity check: make sure we dind't truncate + * the address. The only arches I can find that + * have 64-bit phys_addr_t but 32-bit dma_addr_t + * are certain non-highmem MIPS and x86 + * configurations, but these configurations + * should never allocate physical pages above 32 + * bits, so this is fine. Just in case, throw a + * warning and abort if we end up with an + * unrepresentable address. 
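+ * (Concrete illustration: a hypothetical buffer at physical address
+ * 0x100000000 squeezed into a 32-bit dma_addr_t would read back as 0,
+ * so the comparison below fires and the allocation is dropped.)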
+ */ + if (WARN_ON_ONCE(*dma_handle != phys_addr)) { + free_pages_exact(queue, PAGE_ALIGN(size)); + return NULL; + } + } + return queue; + } +} + +static void vring_free_queue(struct virtio_device *vdev, size_t size, + void *queue, dma_addr_t dma_handle, + struct device *dma_dev) +{ + if (vring_use_dma_api(vdev)) + dma_free_coherent(dma_dev, size, queue, dma_handle); + else + free_pages_exact(queue, PAGE_ALIGN(size)); +} + +/* + * The DMA ops on various arches are rather gnarly right now, and + * making all of the arch DMA ops work on the vring device itself + * is a mess. + */ +static struct device *vring_dma_dev(const struct vring_virtqueue *vq) +{ + return vq->dma_dev; +} + +/* Map one sg entry. */ +static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg, + enum dma_data_direction direction, dma_addr_t *addr) +{ + if (vq->premapped) { + *addr = sg_dma_address(sg); + return 0; + } + + if (!vq->use_dma_api) { + /* + * If DMA is not used, KMSAN doesn't know that the scatterlist + * is initialized by the hardware. Explicitly check/unpoison it + * depending on the direction. + */ + kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, direction); + *addr = (dma_addr_t)sg_phys(sg); + return 0; + } + + /* + * We can't use dma_map_sg, because we don't use scatterlists in + * the way it expects (we don't guarantee that the scatterlist + * will exist for the lifetime of the mapping). + */ + *addr = dma_map_page(vring_dma_dev(vq), + sg_page(sg), sg->offset, sg->length, + direction); + + if (dma_mapping_error(vring_dma_dev(vq), *addr)) + return -ENOMEM; + + return 0; +} + +static dma_addr_t vring_map_single(const struct vring_virtqueue *vq, + void *cpu_addr, size_t size, + enum dma_data_direction direction) +{ + if (!vq->use_dma_api) + return (dma_addr_t)virt_to_phys(cpu_addr); + + return dma_map_single(vring_dma_dev(vq), + cpu_addr, size, direction); +} + +static int vring_mapping_error(const struct vring_virtqueue *vq, + dma_addr_t addr) +{ + if (!vq->use_dma_api) + return 0; + + return dma_mapping_error(vring_dma_dev(vq), addr); +} + +static void virtqueue_init(struct vring_virtqueue *vq, u32 num) +{ + vq->vq.num_free = num; + + if (vq->packed_ring) + vq->last_used_idx = 0 | (1 << VRING_PACKED_EVENT_F_WRAP_CTR); + else + vq->last_used_idx = 0; + + vq->event_triggered = false; + vq->num_added = 0; + +#ifdef DEBUG + vq->in_use = false; + vq->last_add_time_valid = false; +#endif +} + + +/* + * Split ring specific functions - *_split(). + */ + +static void vring_unmap_one_split_indirect(const struct vring_virtqueue *vq, + const struct vring_desc *desc) +{ + u16 flags; + + if (!vq->do_unmap) + return; + + flags = virtio16_to_cpu(vq->vq.vdev, desc->flags); + + dma_unmap_page(vring_dma_dev(vq), + virtio64_to_cpu(vq->vq.vdev, desc->addr), + virtio32_to_cpu(vq->vq.vdev, desc->len), + (flags & VRING_DESC_F_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE); +} + +static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq, + unsigned int i) +{ + struct vring_desc_extra *extra = vq->split.desc_extra; + u16 flags; + + flags = extra[i].flags; + + if (flags & VRING_DESC_F_INDIRECT) { + if (!vq->use_dma_api) + goto out; + + dma_unmap_single(vring_dma_dev(vq), + extra[i].addr, + extra[i].len, + (flags & VRING_DESC_F_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE); + } else { + if (!vq->do_unmap) + goto out; + + dma_unmap_page(vring_dma_dev(vq), + extra[i].addr, + extra[i].len, + (flags & VRING_DESC_F_WRITE) ? 
+ DMA_FROM_DEVICE : DMA_TO_DEVICE); + } + +out: + return extra[i].next; +} + +static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq, + unsigned int total_sg, + gfp_t gfp) +{ + struct vring_desc *desc; + unsigned int i; + + /* + * We require lowmem mappings for the descriptors because + * otherwise virt_to_phys will give us bogus addresses in the + * virtqueue. + */ + gfp &= ~__GFP_HIGHMEM; + + desc = kmalloc_array(total_sg, sizeof(struct vring_desc), gfp); + if (!desc) + return NULL; + + for (i = 0; i < total_sg; i++) + desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1); + return desc; +} + +static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq, + struct vring_desc *desc, + unsigned int i, + dma_addr_t addr, + unsigned int len, + u16 flags, + bool indirect) +{ + struct vring_virtqueue *vring = to_vvq(vq); + struct vring_desc_extra *extra = vring->split.desc_extra; + u16 next; + + desc[i].flags = cpu_to_virtio16(vq->vdev, flags); + desc[i].addr = cpu_to_virtio64(vq->vdev, addr); + desc[i].len = cpu_to_virtio32(vq->vdev, len); + + if (!indirect) { + next = extra[i].next; + desc[i].next = cpu_to_virtio16(vq->vdev, next); + + extra[i].addr = addr; + extra[i].len = len; + extra[i].flags = flags; + } else + next = virtio16_to_cpu(vq->vdev, desc[i].next); + + return next; +} + +static inline int virtqueue_add_split(struct virtqueue *_vq, + struct scatterlist *sgs[], + unsigned int total_sg, + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + void *ctx, + gfp_t gfp) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + struct scatterlist *sg; + struct vring_desc *desc; + unsigned int i, n, avail, descs_used, prev, err_idx; + int head; + bool indirect; + + START_USE(vq); + + BUG_ON(data == NULL); + BUG_ON(ctx && vq->indirect); + + if (unlikely(vq->broken)) { + END_USE(vq); + return -EIO; + } + + LAST_ADD_TIME_UPDATE(vq); + + BUG_ON(total_sg == 0); + + head = vq->free_head; + + if (virtqueue_use_indirect(vq, total_sg)) + desc = alloc_indirect_split(_vq, total_sg, gfp); + else { + desc = NULL; + WARN_ON_ONCE(total_sg > vq->split.vring.num && !vq->indirect); + } + + if (desc) { + /* Use a single buffer which doesn't continue */ + indirect = true; + /* Set up rest to use this indirect table. */ + i = 0; + descs_used = 1; + } else { + indirect = false; + desc = vq->split.vring.desc; + i = head; + descs_used = total_sg; + } + + if (unlikely(vq->vq.num_free < descs_used)) { + pr_debug("Can't add buf len %i - avail = %i\n", + descs_used, vq->vq.num_free); + /* FIXME: for historical reasons, we force a notify here if + * there are outgoing parts to the buffer. Presumably the + * host should service the ring ASAP. */ + if (out_sgs) + vq->notify(&vq->vq); + if (indirect) + kfree(desc); + END_USE(vq); + return -ENOSPC; + } + + for (n = 0; n < out_sgs; n++) { + for (sg = sgs[n]; sg; sg = sg_next(sg)) { + dma_addr_t addr; + + if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr)) + goto unmap_release; + + prev = i; + /* Note that we trust indirect descriptor + * table since it use stream DMA mapping. + */ + i = virtqueue_add_desc_split(_vq, desc, i, addr, sg->length, + VRING_DESC_F_NEXT, + indirect); + } + } + for (; n < (out_sgs + in_sgs); n++) { + for (sg = sgs[n]; sg; sg = sg_next(sg)) { + dma_addr_t addr; + + if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr)) + goto unmap_release; + + prev = i; + /* Note that we trust indirect descriptor + * table since it use stream DMA mapping. 
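+ * (Illustrative chain for one OUT sg followed by two IN sgs: the head
+ * gets NEXT, the two IN entries get NEXT|WRITE, and the final NEXT bit
+ * is cleared right after these loops by the "Last one doesn't continue"
+ * step.)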
+ */ + i = virtqueue_add_desc_split(_vq, desc, i, addr, + sg->length, + VRING_DESC_F_NEXT | + VRING_DESC_F_WRITE, + indirect); + } + } + /* Last one doesn't continue. */ + desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT); + if (!indirect && vq->do_unmap) + vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags &= + ~VRING_DESC_F_NEXT; + + if (indirect) { + /* Now that the indirect table is filled in, map it. */ + dma_addr_t addr = vring_map_single( + vq, desc, total_sg * sizeof(struct vring_desc), + DMA_TO_DEVICE); + if (vring_mapping_error(vq, addr)) { + if (vq->premapped) + goto free_indirect; + + goto unmap_release; + } + + virtqueue_add_desc_split(_vq, vq->split.vring.desc, + head, addr, + total_sg * sizeof(struct vring_desc), + VRING_DESC_F_INDIRECT, + false); + } + + /* We're using some buffers from the free list. */ + vq->vq.num_free -= descs_used; + + /* Update free pointer */ + if (indirect) + vq->free_head = vq->split.desc_extra[head].next; + else + vq->free_head = i; + + /* Store token and indirect buffer state. */ + vq->split.desc_state[head].data = data; + if (indirect) + vq->split.desc_state[head].indir_desc = desc; + else + vq->split.desc_state[head].indir_desc = ctx; + + /* Put entry in available array (but don't update avail->idx until they + * do sync). */ + avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); + vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); + + /* Descriptors and available array need to be set before we expose the + * new available array entries. */ + virtio_wmb(vq->weak_barriers); + vq->split.avail_idx_shadow++; + vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, + vq->split.avail_idx_shadow); + vq->num_added++; + + pr_debug("Added buffer head %i to %p\n", head, vq); + END_USE(vq); + + /* This is very unlikely, but theoretically possible. Kick + * just in case. */ + if (unlikely(vq->num_added == (1 << 16) - 1)) + virtqueue_kick(_vq); + + return 0; + +unmap_release: + err_idx = i; + + if (indirect) + i = 0; + else + i = head; + + for (n = 0; n < total_sg; n++) { + if (i == err_idx) + break; + if (indirect) { + vring_unmap_one_split_indirect(vq, &desc[i]); + i = virtio16_to_cpu(_vq->vdev, desc[i].next); + } else + i = vring_unmap_one_split(vq, i); + } + +free_indirect: + if (indirect) + kfree(desc); + + END_USE(vq); + return -ENOMEM; +} + +static bool virtqueue_kick_prepare_split(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + u16 new, old; + bool needs_kick; + + START_USE(vq); + /* We need to expose available array entries before checking avail + * event. */ + virtio_mb(vq->weak_barriers); + + old = vq->split.avail_idx_shadow - vq->num_added; + new = vq->split.avail_idx_shadow; + vq->num_added = 0; + + LAST_ADD_TIME_CHECK(vq); + LAST_ADD_TIME_INVALID(vq); + + if (vq->event) { + needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev, + vring_avail_event(&vq->split.vring)), + new, old); + } else { + needs_kick = !(vq->split.vring.used->flags & + cpu_to_virtio16(_vq->vdev, + VRING_USED_F_NO_NOTIFY)); + } + END_USE(vq); + return needs_kick; +} + +static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, + void **ctx) +{ + unsigned int i, j; + __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT); + + /* Clear data ptr. 
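+ * A NULL ->data is what marks a head id as free:
+ * virtqueue_get_buf_ctx_split() below rejects such ids ("id %u is not a
+ * head!") and virtqueue_detach_unused_buf_split() skips them.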
*/ + vq->split.desc_state[head].data = NULL; + + /* Put back on free list: unmap first-level descriptors and find end */ + i = head; + + while (vq->split.vring.desc[i].flags & nextflag) { + vring_unmap_one_split(vq, i); + i = vq->split.desc_extra[i].next; + vq->vq.num_free++; + } + + vring_unmap_one_split(vq, i); + vq->split.desc_extra[i].next = vq->free_head; + vq->free_head = head; + + /* Plus final descriptor */ + vq->vq.num_free++; + + if (vq->indirect) { + struct vring_desc *indir_desc = + vq->split.desc_state[head].indir_desc; + u32 len; + + /* Free the indirect table, if any, now that it's unmapped. */ + if (!indir_desc) + return; + + len = vq->split.desc_extra[head].len; + + BUG_ON(!(vq->split.desc_extra[head].flags & + VRING_DESC_F_INDIRECT)); + BUG_ON(len == 0 || len % sizeof(struct vring_desc)); + + if (vq->do_unmap) { + for (j = 0; j < len / sizeof(struct vring_desc); j++) + vring_unmap_one_split_indirect(vq, &indir_desc[j]); + } + + kfree(indir_desc); + vq->split.desc_state[head].indir_desc = NULL; + } else if (ctx) { + *ctx = vq->split.desc_state[head].indir_desc; + } +} + +static bool more_used_split(const struct vring_virtqueue *vq) +{ + return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, + vq->split.vring.used->idx); +} + +static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, + unsigned int *len, + void **ctx) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + void *ret; + unsigned int i; + u16 last_used; + + START_USE(vq); + + if (unlikely(vq->broken)) { + END_USE(vq); + return NULL; + } + + if (!more_used_split(vq)) { + pr_debug("No more buffers in queue\n"); + END_USE(vq); + return NULL; + } + + /* Only get used array entries after they have been exposed by host. */ + virtio_rmb(vq->weak_barriers); + + last_used = (vq->last_used_idx & (vq->split.vring.num - 1)); + i = virtio32_to_cpu(_vq->vdev, + vq->split.vring.used->ring[last_used].id); + *len = virtio32_to_cpu(_vq->vdev, + vq->split.vring.used->ring[last_used].len); + + if (unlikely(i >= vq->split.vring.num)) { + BAD_RING(vq, "id %u out of range\n", i); + return NULL; + } + if (unlikely(!vq->split.desc_state[i].data)) { + BAD_RING(vq, "id %u is not a head!\n", i); + return NULL; + } + + /* detach_buf_split clears data, so grab it now. */ + ret = vq->split.desc_state[i].data; + detach_buf_split(vq, i, ctx); + vq->last_used_idx++; + /* If we expect an interrupt for the next entry, tell host + * by writing event index and flush out the write before + * the read in the next get_buf call. */ + if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) + virtio_store_mb(vq->weak_barriers, + &vring_used_event(&vq->split.vring), + cpu_to_virtio16(_vq->vdev, vq->last_used_idx)); + + LAST_ADD_TIME_INVALID(vq); + + END_USE(vq); + return ret; +} + +static void virtqueue_disable_cb_split(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) { + vq->split.avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT; + + /* + * If device triggered an event already it won't trigger one again: + * no need to disable. + */ + if (vq->event_triggered) + return; + + if (vq->event) + /* TODO: this is a hack. Figure out a cleaner value to write. 
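+ * (With VIRTIO_RING_F_EVENT_IDX negotiated the device ignores the
+ * NO_INTERRUPT flag, so parking used_event at a stale index is only
+ * best-effort suppression -- in line with disable_cb being an
+ * optimization hint rather than a guarantee.)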
*/ + vring_used_event(&vq->split.vring) = 0x0; + else + vq->split.vring.avail->flags = + cpu_to_virtio16(_vq->vdev, + vq->split.avail_flags_shadow); + } +} + +static unsigned int virtqueue_enable_cb_prepare_split(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + u16 last_used_idx; + + START_USE(vq); + + /* We optimistically turn back on interrupts, then check if there was + * more to do. */ + /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to + * either clear the flags bit or point the event index at the next + * entry. Always do both to keep code simple. */ + if (vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) { + vq->split.avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT; + if (!vq->event) + vq->split.vring.avail->flags = + cpu_to_virtio16(_vq->vdev, + vq->split.avail_flags_shadow); + } + vring_used_event(&vq->split.vring) = cpu_to_virtio16(_vq->vdev, + last_used_idx = vq->last_used_idx); + END_USE(vq); + return last_used_idx; +} + +static bool virtqueue_poll_split(struct virtqueue *_vq, unsigned int last_used_idx) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, + vq->split.vring.used->idx); +} + +static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + u16 bufs; + + START_USE(vq); + + /* We optimistically turn back on interrupts, then check if there was + * more to do. */ + /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to + * either clear the flags bit or point the event index at the next + * entry. Always update the event index to keep code simple. */ + if (vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) { + vq->split.avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT; + if (!vq->event) + vq->split.vring.avail->flags = + cpu_to_virtio16(_vq->vdev, + vq->split.avail_flags_shadow); + } + /* TODO: tune this threshold */ + bufs = (u16)(vq->split.avail_idx_shadow - vq->last_used_idx) * 3 / 4; + + virtio_store_mb(vq->weak_barriers, + &vring_used_event(&vq->split.vring), + cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs)); + + if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->split.vring.used->idx) + - vq->last_used_idx) > bufs)) { + END_USE(vq); + return false; + } + + END_USE(vq); + return true; +} + +static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + unsigned int i; + void *buf; + + START_USE(vq); + + for (i = 0; i < vq->split.vring.num; i++) { + if (!vq->split.desc_state[i].data) + continue; + /* detach_buf_split clears data, so grab it now. */ + buf = vq->split.desc_state[i].data; + detach_buf_split(vq, i, NULL); + vq->split.avail_idx_shadow--; + vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, + vq->split.avail_idx_shadow); + END_USE(vq); + return buf; + } + /* That should have freed everything. */ + BUG_ON(vq->vq.num_free != vq->split.vring.num); + + END_USE(vq); + return NULL; +} + +static void virtqueue_vring_init_split(struct vring_virtqueue_split *vring_split, + struct vring_virtqueue *vq) +{ + struct virtio_device *vdev; + + vdev = vq->vq.vdev; + + vring_split->avail_flags_shadow = 0; + vring_split->avail_idx_shadow = 0; + + /* No callback? Tell other side not to bother us. 
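+ * (A queue created with a NULL callback -- for example one the driver
+ * only ever polls -- never needs used-buffer interrupts, so the shadow
+ * flag is set once here and, when event idx is not in use, published to
+ * avail->flags.)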
*/ + if (!vq->vq.callback) { + vring_split->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT; + if (!vq->event) + vring_split->vring.avail->flags = cpu_to_virtio16(vdev, + vring_split->avail_flags_shadow); + } +} + +static void virtqueue_reinit_split(struct vring_virtqueue *vq) +{ + int num; + + num = vq->split.vring.num; + + vq->split.vring.avail->flags = 0; + vq->split.vring.avail->idx = 0; + + /* reset avail event */ + vq->split.vring.avail->ring[num] = 0; + + vq->split.vring.used->flags = 0; + vq->split.vring.used->idx = 0; + + /* reset used event */ + *(__virtio16 *)&(vq->split.vring.used->ring[num]) = 0; + + virtqueue_init(vq, num); + + virtqueue_vring_init_split(&vq->split, vq); +} + +static void virtqueue_vring_attach_split(struct vring_virtqueue *vq, + struct vring_virtqueue_split *vring_split) +{ + vq->split = *vring_split; + + /* Put everything in free lists. */ + vq->free_head = 0; +} + +static int vring_alloc_state_extra_split(struct vring_virtqueue_split *vring_split) +{ + struct vring_desc_state_split *state; + struct vring_desc_extra *extra; + u32 num = vring_split->vring.num; + + state = kmalloc_array(num, sizeof(struct vring_desc_state_split), GFP_KERNEL); + if (!state) + goto err_state; + + extra = vring_alloc_desc_extra(num); + if (!extra) + goto err_extra; + + memset(state, 0, num * sizeof(struct vring_desc_state_split)); + + vring_split->desc_state = state; + vring_split->desc_extra = extra; + return 0; + +err_extra: + kfree(state); +err_state: + return -ENOMEM; +} + +static void vring_free_split(struct vring_virtqueue_split *vring_split, + struct virtio_device *vdev, struct device *dma_dev) +{ + vring_free_queue(vdev, vring_split->queue_size_in_bytes, + vring_split->vring.desc, + vring_split->queue_dma_addr, + dma_dev); + + kfree(vring_split->desc_state); + kfree(vring_split->desc_extra); +} + +static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split, + struct virtio_device *vdev, + u32 num, + unsigned int vring_align, + bool may_reduce_num, + struct device *dma_dev) +{ + void *queue = NULL; + dma_addr_t dma_addr; + + /* We assume num is a power of 2. */ + if (!is_power_of_2(num)) { + dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num); + return -EINVAL; + } + + /* TODO: allocate each queue chunk individually */ + for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) { + queue = vring_alloc_queue(vdev, vring_size(num, vring_align), + &dma_addr, + GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, + dma_dev); + if (queue) + break; + if (!may_reduce_num) + return -ENOMEM; + } + + if (!num) + return -ENOMEM; + + if (!queue) { + /* Try to get a single page. You are my only hope! 
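+ * This point is reached only when the loop above did not hand back a
+ * queue and vring_size(num, vring_align) fits in a single page, so the
+ * final attempt below drops __GFP_NOWARN and a failure gets reported by
+ * the allocator.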
*/ + queue = vring_alloc_queue(vdev, vring_size(num, vring_align), + &dma_addr, GFP_KERNEL | __GFP_ZERO, + dma_dev); + } + if (!queue) + return -ENOMEM; + + vring_init(&vring_split->vring, num, queue, vring_align); + + vring_split->queue_dma_addr = dma_addr; + vring_split->queue_size_in_bytes = vring_size(num, vring_align); + + vring_split->vring_align = vring_align; + vring_split->may_reduce_num = may_reduce_num; + + return 0; +} + +static struct virtqueue *vring_create_virtqueue_split( + unsigned int index, + unsigned int num, + unsigned int vring_align, + struct virtio_device *vdev, + bool weak_barriers, + bool may_reduce_num, + bool context, + bool (*notify)(struct virtqueue *), + void (*callback)(struct virtqueue *), + const char *name, + struct device *dma_dev) +{ + struct vring_virtqueue_split vring_split = {}; + struct virtqueue *vq; + int err; + + err = vring_alloc_queue_split(&vring_split, vdev, num, vring_align, + may_reduce_num, dma_dev); + if (err) + return NULL; + + vq = __vring_new_virtqueue(index, &vring_split, vdev, weak_barriers, + context, notify, callback, name, dma_dev); + if (!vq) { + vring_free_split(&vring_split, vdev, dma_dev); + return NULL; + } + + to_vvq(vq)->we_own_ring = true; + + return vq; +} + +static int virtqueue_resize_split(struct virtqueue *_vq, u32 num) +{ + struct vring_virtqueue_split vring_split = {}; + struct vring_virtqueue *vq = to_vvq(_vq); + struct virtio_device *vdev = _vq->vdev; + int err; + + err = vring_alloc_queue_split(&vring_split, vdev, num, + vq->split.vring_align, + vq->split.may_reduce_num, + vring_dma_dev(vq)); + if (err) + goto err; + + err = vring_alloc_state_extra_split(&vring_split); + if (err) + goto err_state_extra; + + vring_free(&vq->vq); + + virtqueue_vring_init_split(&vring_split, vq); + + virtqueue_init(vq, vring_split.vring.num); + virtqueue_vring_attach_split(vq, &vring_split); + + return 0; + +err_state_extra: + vring_free_split(&vring_split, vdev, vring_dma_dev(vq)); +err: + virtqueue_reinit_split(vq); + return -ENOMEM; +} + + +/* + * Packed ring specific functions - *_packed(). + */ +static bool packed_used_wrap_counter(u16 last_used_idx) +{ + return !!(last_used_idx & (1 << VRING_PACKED_EVENT_F_WRAP_CTR)); +} + +static u16 packed_last_used(u16 last_used_idx) +{ + return last_used_idx & ~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR)); +} + +static void vring_unmap_extra_packed(const struct vring_virtqueue *vq, + const struct vring_desc_extra *extra) +{ + u16 flags; + + flags = extra->flags; + + if (flags & VRING_DESC_F_INDIRECT) { + if (!vq->use_dma_api) + return; + + dma_unmap_single(vring_dma_dev(vq), + extra->addr, extra->len, + (flags & VRING_DESC_F_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE); + } else { + if (!vq->do_unmap) + return; + + dma_unmap_page(vring_dma_dev(vq), + extra->addr, extra->len, + (flags & VRING_DESC_F_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE); + } +} + +static void vring_unmap_desc_packed(const struct vring_virtqueue *vq, + const struct vring_packed_desc *desc) +{ + u16 flags; + + if (!vq->do_unmap) + return; + + flags = le16_to_cpu(desc->flags); + + dma_unmap_page(vring_dma_dev(vq), + le64_to_cpu(desc->addr), + le32_to_cpu(desc->len), + (flags & VRING_DESC_F_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE); +} + +static struct vring_packed_desc *alloc_indirect_packed(unsigned int total_sg, + gfp_t gfp) +{ + struct vring_packed_desc *desc; + + /* + * We require lowmem mappings for the descriptors because + * otherwise virt_to_phys will give us bogus addresses in the + * virtqueue. 
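+ * (Slab memory such as the kmalloc_array() below never comes from
+ * highmem anyway, so clearing the flag mainly documents the
+ * requirement; alloc_indirect_split() earlier in this file does the
+ * same.)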
+ */ + gfp &= ~__GFP_HIGHMEM; + + desc = kmalloc_array(total_sg, sizeof(struct vring_packed_desc), gfp); + + return desc; +} + +static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, + struct scatterlist *sgs[], + unsigned int total_sg, + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp) +{ + struct vring_packed_desc *desc; + struct scatterlist *sg; + unsigned int i, n, err_idx; + u16 head, id; + dma_addr_t addr; + + head = vq->packed.next_avail_idx; + desc = alloc_indirect_packed(total_sg, gfp); + if (!desc) + return -ENOMEM; + + if (unlikely(vq->vq.num_free < 1)) { + pr_debug("Can't add buf len 1 - avail = 0\n"); + kfree(desc); + END_USE(vq); + return -ENOSPC; + } + + i = 0; + id = vq->free_head; + BUG_ON(id == vq->packed.vring.num); + + for (n = 0; n < out_sgs + in_sgs; n++) { + for (sg = sgs[n]; sg; sg = sg_next(sg)) { + if (vring_map_one_sg(vq, sg, n < out_sgs ? + DMA_TO_DEVICE : DMA_FROM_DEVICE, &addr)) + goto unmap_release; + + desc[i].flags = cpu_to_le16(n < out_sgs ? + 0 : VRING_DESC_F_WRITE); + desc[i].addr = cpu_to_le64(addr); + desc[i].len = cpu_to_le32(sg->length); + i++; + } + } + + /* Now that the indirect table is filled in, map it. */ + addr = vring_map_single(vq, desc, + total_sg * sizeof(struct vring_packed_desc), + DMA_TO_DEVICE); + if (vring_mapping_error(vq, addr)) { + if (vq->premapped) + goto free_desc; + + goto unmap_release; + } + + vq->packed.vring.desc[head].addr = cpu_to_le64(addr); + vq->packed.vring.desc[head].len = cpu_to_le32(total_sg * + sizeof(struct vring_packed_desc)); + vq->packed.vring.desc[head].id = cpu_to_le16(id); + + if (vq->do_unmap) { + vq->packed.desc_extra[id].addr = addr; + vq->packed.desc_extra[id].len = total_sg * + sizeof(struct vring_packed_desc); + vq->packed.desc_extra[id].flags = VRING_DESC_F_INDIRECT | + vq->packed.avail_used_flags; + } + + /* + * A driver MUST NOT make the first descriptor in the list + * available before all subsequent descriptors comprising + * the list are made available. + */ + virtio_wmb(vq->weak_barriers); + vq->packed.vring.desc[head].flags = cpu_to_le16(VRING_DESC_F_INDIRECT | + vq->packed.avail_used_flags); + + /* We're using some buffers from the free list. */ + vq->vq.num_free -= 1; + + /* Update free pointer */ + n = head + 1; + if (n >= vq->packed.vring.num) { + n = 0; + vq->packed.avail_wrap_counter ^= 1; + vq->packed.avail_used_flags ^= + 1 << VRING_PACKED_DESC_F_AVAIL | + 1 << VRING_PACKED_DESC_F_USED; + } + vq->packed.next_avail_idx = n; + vq->free_head = vq->packed.desc_extra[id].next; + + /* Store token and indirect buffer state. 
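+ * Only a single ring slot -- and hence a single id -- is consumed for
+ * the whole indirect list, which is why num_free was decremented by 1
+ * above and ->num is set to 1 below.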
*/ + vq->packed.desc_state[id].num = 1; + vq->packed.desc_state[id].data = data; + vq->packed.desc_state[id].indir_desc = desc; + vq->packed.desc_state[id].last = id; + + vq->num_added += 1; + + pr_debug("Added buffer head %i to %p\n", head, vq); + END_USE(vq); + + return 0; + +unmap_release: + err_idx = i; + + for (i = 0; i < err_idx; i++) + vring_unmap_desc_packed(vq, &desc[i]); + +free_desc: + kfree(desc); + + END_USE(vq); + return -ENOMEM; +} + +static inline int virtqueue_add_packed(struct virtqueue *_vq, + struct scatterlist *sgs[], + unsigned int total_sg, + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + void *ctx, + gfp_t gfp) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + struct vring_packed_desc *desc; + struct scatterlist *sg; + unsigned int i, n, c, descs_used, err_idx; + __le16 head_flags, flags; + u16 head, id, prev, curr, avail_used_flags; + int err; + + START_USE(vq); + + BUG_ON(data == NULL); + BUG_ON(ctx && vq->indirect); + + if (unlikely(vq->broken)) { + END_USE(vq); + return -EIO; + } + + LAST_ADD_TIME_UPDATE(vq); + + BUG_ON(total_sg == 0); + + if (virtqueue_use_indirect(vq, total_sg)) { + err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs, + in_sgs, data, gfp); + if (err != -ENOMEM) { + END_USE(vq); + return err; + } + + /* fall back on direct */ + } + + head = vq->packed.next_avail_idx; + avail_used_flags = vq->packed.avail_used_flags; + + WARN_ON_ONCE(total_sg > vq->packed.vring.num && !vq->indirect); + + desc = vq->packed.vring.desc; + i = head; + descs_used = total_sg; + + if (unlikely(vq->vq.num_free < descs_used)) { + pr_debug("Can't add buf len %i - avail = %i\n", + descs_used, vq->vq.num_free); + END_USE(vq); + return -ENOSPC; + } + + id = vq->free_head; + BUG_ON(id == vq->packed.vring.num); + + curr = id; + c = 0; + for (n = 0; n < out_sgs + in_sgs; n++) { + for (sg = sgs[n]; sg; sg = sg_next(sg)) { + dma_addr_t addr; + + if (vring_map_one_sg(vq, sg, n < out_sgs ? + DMA_TO_DEVICE : DMA_FROM_DEVICE, &addr)) + goto unmap_release; + + flags = cpu_to_le16(vq->packed.avail_used_flags | + (++c == total_sg ? 0 : VRING_DESC_F_NEXT) | + (n < out_sgs ? 0 : VRING_DESC_F_WRITE)); + if (i == head) + head_flags = flags; + else + desc[i].flags = flags; + + desc[i].addr = cpu_to_le64(addr); + desc[i].len = cpu_to_le32(sg->length); + desc[i].id = cpu_to_le16(id); + + if (unlikely(vq->do_unmap)) { + vq->packed.desc_extra[curr].addr = addr; + vq->packed.desc_extra[curr].len = sg->length; + vq->packed.desc_extra[curr].flags = + le16_to_cpu(flags); + } + prev = curr; + curr = vq->packed.desc_extra[curr].next; + + if ((unlikely(++i >= vq->packed.vring.num))) { + i = 0; + vq->packed.avail_used_flags ^= + 1 << VRING_PACKED_DESC_F_AVAIL | + 1 << VRING_PACKED_DESC_F_USED; + } + } + } + + if (i <= head) + vq->packed.avail_wrap_counter ^= 1; + + /* We're using some buffers from the free list. */ + vq->vq.num_free -= descs_used; + + /* Update free pointer */ + vq->packed.next_avail_idx = i; + vq->free_head = curr; + + /* Store token. */ + vq->packed.desc_state[id].num = descs_used; + vq->packed.desc_state[id].data = data; + vq->packed.desc_state[id].indir_desc = ctx; + vq->packed.desc_state[id].last = prev; + + /* + * A driver MUST NOT make the first descriptor in the list + * available before all subsequent descriptors comprising + * the list are made available. 
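+ * That is why head_flags was held back in the loop above: every other
+ * descriptor already carries its flags, and only after the virtio_wmb()
+ * below is the head's flags word stored, publishing the whole chain to
+ * the device in one go.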
+ */ + virtio_wmb(vq->weak_barriers); + vq->packed.vring.desc[head].flags = head_flags; + vq->num_added += descs_used; + + pr_debug("Added buffer head %i to %p\n", head, vq); + END_USE(vq); + + return 0; + +unmap_release: + err_idx = i; + i = head; + curr = vq->free_head; + + vq->packed.avail_used_flags = avail_used_flags; + + for (n = 0; n < total_sg; n++) { + if (i == err_idx) + break; + vring_unmap_extra_packed(vq, &vq->packed.desc_extra[curr]); + curr = vq->packed.desc_extra[curr].next; + i++; + if (i >= vq->packed.vring.num) + i = 0; + } + + END_USE(vq); + return -EIO; +} + +static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + u16 new, old, off_wrap, flags, wrap_counter, event_idx; + bool needs_kick; + union { + struct { + __le16 off_wrap; + __le16 flags; + }; + u32 u32; + } snapshot; + + START_USE(vq); + + /* + * We need to expose the new flags value before checking notification + * suppressions. + */ + virtio_mb(vq->weak_barriers); + + old = vq->packed.next_avail_idx - vq->num_added; + new = vq->packed.next_avail_idx; + vq->num_added = 0; + + snapshot.u32 = *(u32 *)vq->packed.vring.device; + flags = le16_to_cpu(snapshot.flags); + + LAST_ADD_TIME_CHECK(vq); + LAST_ADD_TIME_INVALID(vq); + + if (flags != VRING_PACKED_EVENT_FLAG_DESC) { + needs_kick = (flags != VRING_PACKED_EVENT_FLAG_DISABLE); + goto out; + } + + off_wrap = le16_to_cpu(snapshot.off_wrap); + + wrap_counter = off_wrap >> VRING_PACKED_EVENT_F_WRAP_CTR; + event_idx = off_wrap & ~(1 << VRING_PACKED_EVENT_F_WRAP_CTR); + if (wrap_counter != vq->packed.avail_wrap_counter) + event_idx -= vq->packed.vring.num; + + needs_kick = vring_need_event(event_idx, new, old); +out: + END_USE(vq); + return needs_kick; +} + +static void detach_buf_packed(struct vring_virtqueue *vq, + unsigned int id, void **ctx) +{ + struct vring_desc_state_packed *state = NULL; + struct vring_packed_desc *desc; + unsigned int i, curr; + + state = &vq->packed.desc_state[id]; + + /* Clear data ptr. */ + state->data = NULL; + + vq->packed.desc_extra[state->last].next = vq->free_head; + vq->free_head = id; + vq->vq.num_free += state->num; + + if (unlikely(vq->do_unmap)) { + curr = id; + for (i = 0; i < state->num; i++) { + vring_unmap_extra_packed(vq, + &vq->packed.desc_extra[curr]); + curr = vq->packed.desc_extra[curr].next; + } + } + + if (vq->indirect) { + u32 len; + + /* Free the indirect table, if any, now that it's unmapped. 
*/ + desc = state->indir_desc; + if (!desc) + return; + + if (vq->do_unmap) { + len = vq->packed.desc_extra[id].len; + for (i = 0; i < len / sizeof(struct vring_packed_desc); + i++) + vring_unmap_desc_packed(vq, &desc[i]); + } + kfree(desc); + state->indir_desc = NULL; + } else if (ctx) { + *ctx = state->indir_desc; + } +} + +static inline bool is_used_desc_packed(const struct vring_virtqueue *vq, + u16 idx, bool used_wrap_counter) +{ + bool avail, used; + u16 flags; + + flags = le16_to_cpu(vq->packed.vring.desc[idx].flags); + avail = !!(flags & (1 << VRING_PACKED_DESC_F_AVAIL)); + used = !!(flags & (1 << VRING_PACKED_DESC_F_USED)); + + return avail == used && used == used_wrap_counter; +} + +static bool more_used_packed(const struct vring_virtqueue *vq) +{ + u16 last_used; + u16 last_used_idx; + bool used_wrap_counter; + + last_used_idx = READ_ONCE(vq->last_used_idx); + last_used = packed_last_used(last_used_idx); + used_wrap_counter = packed_used_wrap_counter(last_used_idx); + return is_used_desc_packed(vq, last_used, used_wrap_counter); +} + +static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq, + unsigned int *len, + void **ctx) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + u16 last_used, id, last_used_idx; + bool used_wrap_counter; + void *ret; + + START_USE(vq); + + if (unlikely(vq->broken)) { + END_USE(vq); + return NULL; + } + + if (!more_used_packed(vq)) { + pr_debug("No more buffers in queue\n"); + END_USE(vq); + return NULL; + } + + /* Only get used elements after they have been exposed by host. */ + virtio_rmb(vq->weak_barriers); + + last_used_idx = READ_ONCE(vq->last_used_idx); + used_wrap_counter = packed_used_wrap_counter(last_used_idx); + last_used = packed_last_used(last_used_idx); + id = le16_to_cpu(vq->packed.vring.desc[last_used].id); + *len = le32_to_cpu(vq->packed.vring.desc[last_used].len); + + if (unlikely(id >= vq->packed.vring.num)) { + BAD_RING(vq, "id %u out of range\n", id); + return NULL; + } + if (unlikely(!vq->packed.desc_state[id].data)) { + BAD_RING(vq, "id %u is not a head!\n", id); + return NULL; + } + + /* detach_buf_packed clears data, so grab it now. */ + ret = vq->packed.desc_state[id].data; + detach_buf_packed(vq, id, ctx); + + last_used += vq->packed.desc_state[id].num; + if (unlikely(last_used >= vq->packed.vring.num)) { + last_used -= vq->packed.vring.num; + used_wrap_counter ^= 1; + } + + last_used = (last_used | (used_wrap_counter << VRING_PACKED_EVENT_F_WRAP_CTR)); + WRITE_ONCE(vq->last_used_idx, last_used); + + /* + * If we expect an interrupt for the next entry, tell host + * by writing event index and flush out the write before + * the read in the next get_buf call. + */ + if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DESC) + virtio_store_mb(vq->weak_barriers, + &vq->packed.vring.driver->off_wrap, + cpu_to_le16(vq->last_used_idx)); + + LAST_ADD_TIME_INVALID(vq); + + END_USE(vq); + return ret; +} + +static void virtqueue_disable_cb_packed(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (vq->packed.event_flags_shadow != VRING_PACKED_EVENT_FLAG_DISABLE) { + vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE; + + /* + * If device triggered an event already it won't trigger one again: + * no need to disable. 
+ */ + if (vq->event_triggered) + return; + + vq->packed.vring.driver->flags = + cpu_to_le16(vq->packed.event_flags_shadow); + } +} + +static unsigned int virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + START_USE(vq); + + /* + * We optimistically turn back on interrupts, then check if there was + * more to do. + */ + + if (vq->event) { + vq->packed.vring.driver->off_wrap = + cpu_to_le16(vq->last_used_idx); + /* + * We need to update event offset and event wrap + * counter first before updating event flags. + */ + virtio_wmb(vq->weak_barriers); + } + + if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DISABLE) { + vq->packed.event_flags_shadow = vq->event ? + VRING_PACKED_EVENT_FLAG_DESC : + VRING_PACKED_EVENT_FLAG_ENABLE; + vq->packed.vring.driver->flags = + cpu_to_le16(vq->packed.event_flags_shadow); + } + + END_USE(vq); + return vq->last_used_idx; +} + +static bool virtqueue_poll_packed(struct virtqueue *_vq, u16 off_wrap) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + bool wrap_counter; + u16 used_idx; + + wrap_counter = off_wrap >> VRING_PACKED_EVENT_F_WRAP_CTR; + used_idx = off_wrap & ~(1 << VRING_PACKED_EVENT_F_WRAP_CTR); + + return is_used_desc_packed(vq, used_idx, wrap_counter); +} + +static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + u16 used_idx, wrap_counter, last_used_idx; + u16 bufs; + + START_USE(vq); + + /* + * We optimistically turn back on interrupts, then check if there was + * more to do. + */ + + if (vq->event) { + /* TODO: tune this threshold */ + bufs = (vq->packed.vring.num - vq->vq.num_free) * 3 / 4; + last_used_idx = READ_ONCE(vq->last_used_idx); + wrap_counter = packed_used_wrap_counter(last_used_idx); + + used_idx = packed_last_used(last_used_idx) + bufs; + if (used_idx >= vq->packed.vring.num) { + used_idx -= vq->packed.vring.num; + wrap_counter ^= 1; + } + + vq->packed.vring.driver->off_wrap = cpu_to_le16(used_idx | + (wrap_counter << VRING_PACKED_EVENT_F_WRAP_CTR)); + + /* + * We need to update event offset and event wrap + * counter first before updating event flags. + */ + virtio_wmb(vq->weak_barriers); + } + + if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DISABLE) { + vq->packed.event_flags_shadow = vq->event ? + VRING_PACKED_EVENT_FLAG_DESC : + VRING_PACKED_EVENT_FLAG_ENABLE; + vq->packed.vring.driver->flags = + cpu_to_le16(vq->packed.event_flags_shadow); + } + + /* + * We need to update event suppression structure first + * before re-checking for more used buffers. + */ + virtio_mb(vq->weak_barriers); + + last_used_idx = READ_ONCE(vq->last_used_idx); + wrap_counter = packed_used_wrap_counter(last_used_idx); + used_idx = packed_last_used(last_used_idx); + if (is_used_desc_packed(vq, used_idx, wrap_counter)) { + END_USE(vq); + return false; + } + + END_USE(vq); + return true; +} + +static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + unsigned int i; + void *buf; + + START_USE(vq); + + for (i = 0; i < vq->packed.vring.num; i++) { + if (!vq->packed.desc_state[i].data) + continue; + /* detach_buf clears data, so grab it now. */ + buf = vq->packed.desc_state[i].data; + detach_buf_packed(vq, i, NULL); + END_USE(vq); + return buf; + } + /* That should have freed everything. 
*/ + BUG_ON(vq->vq.num_free != vq->packed.vring.num); + + END_USE(vq); + return NULL; +} + +static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num) +{ + struct vring_desc_extra *desc_extra; + unsigned int i; + + desc_extra = kmalloc_array(num, sizeof(struct vring_desc_extra), + GFP_KERNEL); + if (!desc_extra) + return NULL; + + memset(desc_extra, 0, num * sizeof(struct vring_desc_extra)); + + for (i = 0; i < num - 1; i++) + desc_extra[i].next = i + 1; + + return desc_extra; +} + +static void vring_free_packed(struct vring_virtqueue_packed *vring_packed, + struct virtio_device *vdev, + struct device *dma_dev) +{ + if (vring_packed->vring.desc) + vring_free_queue(vdev, vring_packed->ring_size_in_bytes, + vring_packed->vring.desc, + vring_packed->ring_dma_addr, + dma_dev); + + if (vring_packed->vring.driver) + vring_free_queue(vdev, vring_packed->event_size_in_bytes, + vring_packed->vring.driver, + vring_packed->driver_event_dma_addr, + dma_dev); + + if (vring_packed->vring.device) + vring_free_queue(vdev, vring_packed->event_size_in_bytes, + vring_packed->vring.device, + vring_packed->device_event_dma_addr, + dma_dev); + + kfree(vring_packed->desc_state); + kfree(vring_packed->desc_extra); +} + +static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed, + struct virtio_device *vdev, + u32 num, struct device *dma_dev) +{ + struct vring_packed_desc *ring; + struct vring_packed_desc_event *driver, *device; + dma_addr_t ring_dma_addr, driver_event_dma_addr, device_event_dma_addr; + size_t ring_size_in_bytes, event_size_in_bytes; + + ring_size_in_bytes = num * sizeof(struct vring_packed_desc); + + ring = vring_alloc_queue(vdev, ring_size_in_bytes, + &ring_dma_addr, + GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, + dma_dev); + if (!ring) + goto err; + + vring_packed->vring.desc = ring; + vring_packed->ring_dma_addr = ring_dma_addr; + vring_packed->ring_size_in_bytes = ring_size_in_bytes; + + event_size_in_bytes = sizeof(struct vring_packed_desc_event); + + driver = vring_alloc_queue(vdev, event_size_in_bytes, + &driver_event_dma_addr, + GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, + dma_dev); + if (!driver) + goto err; + + vring_packed->vring.driver = driver; + vring_packed->event_size_in_bytes = event_size_in_bytes; + vring_packed->driver_event_dma_addr = driver_event_dma_addr; + + device = vring_alloc_queue(vdev, event_size_in_bytes, + &device_event_dma_addr, + GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, + dma_dev); + if (!device) + goto err; + + vring_packed->vring.device = device; + vring_packed->device_event_dma_addr = device_event_dma_addr; + + vring_packed->vring.num = num; + + return 0; + +err: + vring_free_packed(vring_packed, vdev, dma_dev); + return -ENOMEM; +} + +static int vring_alloc_state_extra_packed(struct vring_virtqueue_packed *vring_packed) +{ + struct vring_desc_state_packed *state; + struct vring_desc_extra *extra; + u32 num = vring_packed->vring.num; + + state = kmalloc_array(num, sizeof(struct vring_desc_state_packed), GFP_KERNEL); + if (!state) + goto err_desc_state; + + memset(state, 0, num * sizeof(struct vring_desc_state_packed)); + + extra = vring_alloc_desc_extra(num); + if (!extra) + goto err_desc_extra; + + vring_packed->desc_state = state; + vring_packed->desc_extra = extra; + + return 0; + +err_desc_extra: + kfree(state); +err_desc_state: + return -ENOMEM; +} + +static void virtqueue_vring_init_packed(struct vring_virtqueue_packed *vring_packed, + bool callback) +{ + vring_packed->next_avail_idx = 0; + vring_packed->avail_wrap_counter = 1; 
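	/*
	 * Per the packed ring layout, both the driver's avail wrap counter
	 * and the device's used wrap counter start at 1: a descriptor is
	 * available while its AVAIL flag matches the driver's wrap counter
	 * and its USED flag does not, which is why avail_used_flags below
	 * is initialized with AVAIL set and USED clear.
	 */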
+ vring_packed->event_flags_shadow = 0; + vring_packed->avail_used_flags = 1 << VRING_PACKED_DESC_F_AVAIL; + + /* No callback? Tell other side not to bother us. */ + if (!callback) { + vring_packed->event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE; + vring_packed->vring.driver->flags = + cpu_to_le16(vring_packed->event_flags_shadow); + } +} + +static void virtqueue_vring_attach_packed(struct vring_virtqueue *vq, + struct vring_virtqueue_packed *vring_packed) +{ + vq->packed = *vring_packed; + + /* Put everything in free lists. */ + vq->free_head = 0; +} + +static void virtqueue_reinit_packed(struct vring_virtqueue *vq) +{ + memset(vq->packed.vring.device, 0, vq->packed.event_size_in_bytes); + memset(vq->packed.vring.driver, 0, vq->packed.event_size_in_bytes); + + /* we need to reset the desc.flags. For more, see is_used_desc_packed() */ + memset(vq->packed.vring.desc, 0, vq->packed.ring_size_in_bytes); + + virtqueue_init(vq, vq->packed.vring.num); + virtqueue_vring_init_packed(&vq->packed, !!vq->vq.callback); +} + +static struct virtqueue *vring_create_virtqueue_packed( + unsigned int index, + unsigned int num, + unsigned int vring_align, + struct virtio_device *vdev, + bool weak_barriers, + bool may_reduce_num, + bool context, + bool (*notify)(struct virtqueue *), + void (*callback)(struct virtqueue *), + const char *name, + struct device *dma_dev) +{ + struct vring_virtqueue_packed vring_packed = {}; + struct vring_virtqueue *vq; + int err; + + if (vring_alloc_queue_packed(&vring_packed, vdev, num, dma_dev)) + goto err_ring; + + vq = kmalloc(sizeof(*vq), GFP_KERNEL); + if (!vq) + goto err_vq; + + vq->vq.callback = callback; + vq->vq.vdev = vdev; + vq->vq.name = name; + vq->vq.index = index; + vq->vq.reset = false; + vq->we_own_ring = true; + vq->notify = notify; + vq->weak_barriers = weak_barriers; +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION + vq->broken = true; +#else + vq->broken = false; +#endif + vq->packed_ring = true; + vq->dma_dev = dma_dev; + vq->use_dma_api = vring_use_dma_api(vdev); + vq->premapped = false; + vq->do_unmap = vq->use_dma_api; + + vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && + !context; + vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX); + + if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) + vq->weak_barriers = false; + + err = vring_alloc_state_extra_packed(&vring_packed); + if (err) + goto err_state_extra; + + virtqueue_vring_init_packed(&vring_packed, !!callback); + + virtqueue_init(vq, num); + virtqueue_vring_attach_packed(vq, &vring_packed); + + spin_lock(&vdev->vqs_list_lock); + list_add_tail(&vq->vq.list, &vdev->vqs); + spin_unlock(&vdev->vqs_list_lock); + return &vq->vq; + +err_state_extra: + kfree(vq); +err_vq: + vring_free_packed(&vring_packed, vdev, dma_dev); +err_ring: + return NULL; +} + +static int virtqueue_resize_packed(struct virtqueue *_vq, u32 num) +{ + struct vring_virtqueue_packed vring_packed = {}; + struct vring_virtqueue *vq = to_vvq(_vq); + struct virtio_device *vdev = _vq->vdev; + int err; + + if (vring_alloc_queue_packed(&vring_packed, vdev, num, vring_dma_dev(vq))) + goto err_ring; + + err = vring_alloc_state_extra_packed(&vring_packed); + if (err) + goto err_state_extra; + + vring_free(&vq->vq); + + virtqueue_vring_init_packed(&vring_packed, !!vq->vq.callback); + + virtqueue_init(vq, vring_packed.vring.num); + virtqueue_vring_attach_packed(vq, &vring_packed); + + return 0; + +err_state_extra: + vring_free_packed(&vring_packed, vdev, vring_dma_dev(vq)); +err_ring: + 
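	/*
	 * Allocating the replacement ring (or its state) failed. The old
	 * ring is still intact and its buffers were already recycled by the
	 * caller, so reinitialize it and keep working at the previous size.
	 */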
virtqueue_reinit_packed(vq); + return -ENOMEM; +} + +static int virtqueue_disable_and_recycle(struct virtqueue *_vq, + void (*recycle)(struct virtqueue *vq, void *buf)) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + struct virtio_device *vdev = vq->vq.vdev; + void *buf; + int err; + + if (!vq->we_own_ring) + return -EPERM; + + if (!vdev->config->disable_vq_and_reset) + return -ENOENT; + + if (!vdev->config->enable_vq_after_reset) + return -ENOENT; + + err = vdev->config->disable_vq_and_reset(_vq); + if (err) + return err; + + while ((buf = virtqueue_detach_unused_buf(_vq)) != NULL) + recycle(_vq, buf); + + return 0; +} + +static int virtqueue_enable_after_reset(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + struct virtio_device *vdev = vq->vq.vdev; + + if (vdev->config->enable_vq_after_reset(_vq)) + return -EBUSY; + + return 0; +} + +/* + * Generic functions and exported symbols. + */ + +static inline int virtqueue_add(struct virtqueue *_vq, + struct scatterlist *sgs[], + unsigned int total_sg, + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + void *ctx, + gfp_t gfp) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + return vq->packed_ring ? virtqueue_add_packed(_vq, sgs, total_sg, + out_sgs, in_sgs, data, ctx, gfp) : + virtqueue_add_split(_vq, sgs, total_sg, + out_sgs, in_sgs, data, ctx, gfp); +} + +/** + * virtqueue_add_sgs - expose buffers to other end + * @_vq: the struct virtqueue we're talking about. + * @sgs: array of terminated scatterlists. + * @out_sgs: the number of scatterlists readable by other side + * @in_sgs: the number of scatterlists which are writable (after readable ones) + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). + */ +int virtqueue_add_sgs(struct virtqueue *_vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp) +{ + unsigned int i, total_sg = 0; + + /* Count them first. */ + for (i = 0; i < out_sgs + in_sgs; i++) { + struct scatterlist *sg; + + for (sg = sgs[i]; sg; sg = sg_next(sg)) + total_sg++; + } + return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, + data, NULL, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_sgs); + +/** + * virtqueue_add_outbuf - expose output buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sg: scatterlist (must be well-formed and terminated!) + * @num: the number of entries in @sg readable by other side + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). + */ +int virtqueue_add_outbuf(struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_outbuf); + +/** + * virtqueue_add_inbuf - expose input buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sg: scatterlist (must be well-formed and terminated!) + * @num: the number of entries in @sg writable by other side + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). 
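 *
 * Illustrative sketch only; rx_buf and rx_buf_len stand for
 * driver-specific receive buffer state and are not part of this API:
 *
 *	struct scatterlist sg;
 *
 *	sg_init_one(&sg, rx_buf, rx_buf_len);
 *	if (!virtqueue_add_inbuf(vq, &sg, 1, rx_buf, GFP_ATOMIC))
 *		virtqueue_kick(vq);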
+ * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). + */ +int virtqueue_add_inbuf(struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_inbuf); + +/** + * virtqueue_add_inbuf_ctx - expose input buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sg: scatterlist (must be well-formed and terminated!) + * @num: the number of entries in @sg writable by other side + * @data: the token identifying the buffer. + * @ctx: extra context for the token + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). + */ +int virtqueue_add_inbuf_ctx(struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, + void *ctx, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx); + +/** + * virtqueue_dma_dev - get the dma dev + * @_vq: the struct virtqueue we're talking about. + * + * Returns the dma dev. That can been used for dma api. + */ +struct device *virtqueue_dma_dev(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (vq->use_dma_api) + return vring_dma_dev(vq); + else + return NULL; +} +EXPORT_SYMBOL_GPL(virtqueue_dma_dev); + +/** + * virtqueue_kick_prepare - first half of split virtqueue_kick call. + * @_vq: the struct virtqueue + * + * Instead of virtqueue_kick(), you can do: + * if (virtqueue_kick_prepare(vq)) + * virtqueue_notify(vq); + * + * This is sometimes useful because the virtqueue_kick_prepare() needs + * to be serialized, but the actual virtqueue_notify() call does not. + */ +bool virtqueue_kick_prepare(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + return vq->packed_ring ? virtqueue_kick_prepare_packed(_vq) : + virtqueue_kick_prepare_split(_vq); +} +EXPORT_SYMBOL_GPL(virtqueue_kick_prepare); + +/** + * virtqueue_notify - second half of split virtqueue_kick call. + * @_vq: the struct virtqueue + * + * This does not need to be serialized. + * + * Returns false if host notify failed or queue is broken, otherwise true. + */ +bool virtqueue_notify(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (unlikely(vq->broken)) + return false; + + /* Prod other side to tell it about changes. */ + if (!vq->notify(_vq)) { + vq->broken = true; + return false; + } + return true; +} +EXPORT_SYMBOL_GPL(virtqueue_notify); + +/** + * virtqueue_kick - update after add_buf + * @vq: the struct virtqueue + * + * After one or more virtqueue_add_* calls, invoke this to kick + * the other side. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). + * + * Returns false if kick failed, otherwise true. + */ +bool virtqueue_kick(struct virtqueue *vq) +{ + if (virtqueue_kick_prepare(vq)) + return virtqueue_notify(vq); + return true; +} +EXPORT_SYMBOL_GPL(virtqueue_kick); + +/** + * virtqueue_get_buf_ctx - get the next used buffer + * @_vq: the struct virtqueue we're talking about. 
+ * @len: the length written into the buffer + * @ctx: extra context for the token + * + * If the device wrote data into the buffer, @len will be set to the + * amount written. This means you don't need to clear the buffer + * beforehand to ensure there's no data leakage in the case of short + * writes. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). + * + * Returns NULL if there are no used buffers, or the "data" token + * handed to virtqueue_add_*(). + */ +void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len, + void **ctx) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + return vq->packed_ring ? virtqueue_get_buf_ctx_packed(_vq, len, ctx) : + virtqueue_get_buf_ctx_split(_vq, len, ctx); +} +EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx); + +void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) +{ + return virtqueue_get_buf_ctx(_vq, len, NULL); +} +EXPORT_SYMBOL_GPL(virtqueue_get_buf); +/** + * virtqueue_disable_cb - disable callbacks + * @_vq: the struct virtqueue we're talking about. + * + * Note that this is not necessarily synchronous, hence unreliable and only + * useful as an optimization. + * + * Unlike other operations, this need not be serialized. + */ +void virtqueue_disable_cb(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (vq->packed_ring) + virtqueue_disable_cb_packed(_vq); + else + virtqueue_disable_cb_split(_vq); +} +EXPORT_SYMBOL_GPL(virtqueue_disable_cb); + +/** + * virtqueue_enable_cb_prepare - restart callbacks after disable_cb + * @_vq: the struct virtqueue we're talking about. + * + * This re-enables callbacks; it returns current queue state + * in an opaque unsigned value. This value should be later tested by + * virtqueue_poll, to detect a possible race between the driver checking for + * more work, and enabling callbacks. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). + */ +unsigned int virtqueue_enable_cb_prepare(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (vq->event_triggered) + vq->event_triggered = false; + + return vq->packed_ring ? virtqueue_enable_cb_prepare_packed(_vq) : + virtqueue_enable_cb_prepare_split(_vq); +} +EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare); + +/** + * virtqueue_poll - query pending used buffers + * @_vq: the struct virtqueue we're talking about. + * @last_used_idx: virtqueue state (from call to virtqueue_enable_cb_prepare). + * + * Returns "true" if there are pending used buffers in the queue. + * + * This does not need to be serialized. + */ +bool virtqueue_poll(struct virtqueue *_vq, unsigned int last_used_idx) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (unlikely(vq->broken)) + return false; + + virtio_mb(vq->weak_barriers); + return vq->packed_ring ? virtqueue_poll_packed(_vq, last_used_idx) : + virtqueue_poll_split(_vq, last_used_idx); +} +EXPORT_SYMBOL_GPL(virtqueue_poll); + +/** + * virtqueue_enable_cb - restart callbacks after disable_cb. + * @_vq: the struct virtqueue we're talking about. + * + * This re-enables callbacks; it returns "false" if there are pending + * buffers in the queue, to detect a possible race between the driver + * checking for more work, and enabling callbacks. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). 
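 *
 * A typical completion loop, shown as an illustrative sketch only
 * (process_buf() is a hypothetical driver helper):
 *
 *	unsigned int len;
 *	void *buf;
 *
 *	virtqueue_disable_cb(vq);
 *	do {
 *		while ((buf = virtqueue_get_buf(vq, &len)) != NULL)
 *			process_buf(buf, len);
 *	} while (!virtqueue_enable_cb(vq));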
+ */ +bool virtqueue_enable_cb(struct virtqueue *_vq) +{ + unsigned int last_used_idx = virtqueue_enable_cb_prepare(_vq); + + return !virtqueue_poll(_vq, last_used_idx); +} +EXPORT_SYMBOL_GPL(virtqueue_enable_cb); + +/** + * virtqueue_enable_cb_delayed - restart callbacks after disable_cb. + * @_vq: the struct virtqueue we're talking about. + * + * This re-enables callbacks but hints to the other side to delay + * interrupts until most of the available buffers have been processed; + * it returns "false" if there are many pending buffers in the queue, + * to detect a possible race between the driver checking for more work, + * and enabling callbacks. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). + */ +bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (vq->event_triggered) + vq->event_triggered = false; + + return vq->packed_ring ? virtqueue_enable_cb_delayed_packed(_vq) : + virtqueue_enable_cb_delayed_split(_vq); +} +EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed); + +/** + * virtqueue_detach_unused_buf - detach first unused buffer + * @_vq: the struct virtqueue we're talking about. + * + * Returns NULL or the "data" token handed to virtqueue_add_*(). + * This is not valid on an active queue; it is useful for device + * shutdown or the reset queue. + */ +void *virtqueue_detach_unused_buf(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + return vq->packed_ring ? virtqueue_detach_unused_buf_packed(_vq) : + virtqueue_detach_unused_buf_split(_vq); +} +EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf); + +static inline bool more_used(const struct vring_virtqueue *vq) +{ + return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq); +} + +/** + * vring_interrupt - notify a virtqueue on an interrupt + * @irq: the IRQ number (ignored) + * @_vq: the struct virtqueue to notify + * + * Calls the callback function of @_vq to process the virtqueue + * notification. + */ +irqreturn_t vring_interrupt(int irq, void *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (!more_used(vq)) { + pr_debug("virtqueue interrupt with no work for %p\n", vq); + return IRQ_NONE; + } + + if (unlikely(vq->broken)) { +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION + dev_warn_once(&vq->vq.vdev->dev, + "virtio vring IRQ raised before DRIVER_OK"); + return IRQ_NONE; +#else + return IRQ_HANDLED; +#endif + } + + /* Just a hint for performance: so it's ok that this can be racy! 
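	 * While event_triggered is set, virtqueue_disable_cb() skips the
	 * ring-flags update; the flag is cleared again when callbacks are
	 * re-enabled via virtqueue_enable_cb_prepare() or
	 * virtqueue_enable_cb_delayed().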
*/ + if (vq->event) + vq->event_triggered = true; + + pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback); + if (vq->vq.callback) + vq->vq.callback(&vq->vq); + + return IRQ_HANDLED; +} +EXPORT_SYMBOL_GPL(vring_interrupt); + +/* Only available for split ring */ +static struct virtqueue *__vring_new_virtqueue(unsigned int index, + struct vring_virtqueue_split *vring_split, + struct virtio_device *vdev, + bool weak_barriers, + bool context, + bool (*notify)(struct virtqueue *), + void (*callback)(struct virtqueue *), + const char *name, + struct device *dma_dev) +{ + struct vring_virtqueue *vq; + int err; + + if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) + return NULL; + + vq = kmalloc(sizeof(*vq), GFP_KERNEL); + if (!vq) + return NULL; + + vq->packed_ring = false; + vq->vq.callback = callback; + vq->vq.vdev = vdev; + vq->vq.name = name; + vq->vq.index = index; + vq->vq.reset = false; + vq->we_own_ring = false; + vq->notify = notify; + vq->weak_barriers = weak_barriers; +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION + vq->broken = true; +#else + vq->broken = false; +#endif + vq->dma_dev = dma_dev; + vq->use_dma_api = vring_use_dma_api(vdev); + vq->premapped = false; + vq->do_unmap = vq->use_dma_api; + + vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && + !context; + vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX); + + if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) + vq->weak_barriers = false; + + err = vring_alloc_state_extra_split(vring_split); + if (err) { + kfree(vq); + return NULL; + } + + virtqueue_vring_init_split(vring_split, vq); + + virtqueue_init(vq, vring_split->vring.num); + virtqueue_vring_attach_split(vq, vring_split); + + spin_lock(&vdev->vqs_list_lock); + list_add_tail(&vq->vq.list, &vdev->vqs); + spin_unlock(&vdev->vqs_list_lock); + return &vq->vq; +} + +struct virtqueue *vring_create_virtqueue( + unsigned int index, + unsigned int num, + unsigned int vring_align, + struct virtio_device *vdev, + bool weak_barriers, + bool may_reduce_num, + bool context, + bool (*notify)(struct virtqueue *), + void (*callback)(struct virtqueue *), + const char *name) +{ + + if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) + return vring_create_virtqueue_packed(index, num, vring_align, + vdev, weak_barriers, may_reduce_num, + context, notify, callback, name, vdev->dev.parent); + + return vring_create_virtqueue_split(index, num, vring_align, + vdev, weak_barriers, may_reduce_num, + context, notify, callback, name, vdev->dev.parent); +} +EXPORT_SYMBOL_GPL(vring_create_virtqueue); + +struct virtqueue *vring_create_virtqueue_dma( + unsigned int index, + unsigned int num, + unsigned int vring_align, + struct virtio_device *vdev, + bool weak_barriers, + bool may_reduce_num, + bool context, + bool (*notify)(struct virtqueue *), + void (*callback)(struct virtqueue *), + const char *name, + struct device *dma_dev) +{ + + if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) + return vring_create_virtqueue_packed(index, num, vring_align, + vdev, weak_barriers, may_reduce_num, + context, notify, callback, name, dma_dev); + + return vring_create_virtqueue_split(index, num, vring_align, + vdev, weak_barriers, may_reduce_num, + context, notify, callback, name, dma_dev); +} +EXPORT_SYMBOL_GPL(vring_create_virtqueue_dma); + +/** + * virtqueue_resize - resize the vring of vq + * @_vq: the struct virtqueue we're talking about. 
+ * @num: new ring num + * @recycle: callback to recycle unused buffers + * + * When it is really necessary to create a new vring, it will set the current vq + * into the reset state. Then call the passed callback to recycle the buffer + * that is no longer used. Only after the new vring is successfully created, the + * old vring will be released. + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error. + * 0: success. + * -ENOMEM: Failed to allocate a new ring, fall back to the original ring size. + * vq can still work normally + * -EBUSY: Failed to sync with device, vq may not work properly + * -ENOENT: Transport or device not supported + * -E2BIG/-EINVAL: num error + * -EPERM: Operation not permitted + * + */ +int virtqueue_resize(struct virtqueue *_vq, u32 num, + void (*recycle)(struct virtqueue *vq, void *buf)) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + int err; + + if (num > vq->vq.num_max) + return -E2BIG; + + if (!num) + return -EINVAL; + + if ((vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num) == num) + return 0; + + err = virtqueue_disable_and_recycle(_vq, recycle); + if (err) + return err; + + if (vq->packed_ring) + err = virtqueue_resize_packed(_vq, num); + else + err = virtqueue_resize_split(_vq, num); + + return virtqueue_enable_after_reset(_vq); +} +EXPORT_SYMBOL_GPL(virtqueue_resize); + +/** + * virtqueue_set_dma_premapped - set the vring premapped mode + * @_vq: the struct virtqueue we're talking about. + * + * Enable the premapped mode of the vq. + * + * The vring in premapped mode does not do dma internally, so the driver must + * do dma mapping in advance. The driver must pass the dma_address through + * dma_address of scatterlist. When the driver got a used buffer from + * the vring, it has to unmap the dma address. + * + * This function must be called immediately after creating the vq, or after vq + * reset, and before adding any buffers to it. + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error. + * 0: success. + * -EINVAL: vring does not use the dma api, so we can not enable premapped mode. + */ +int virtqueue_set_dma_premapped(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + u32 num; + + START_USE(vq); + + num = vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num; + + if (num != vq->vq.num_free) { + END_USE(vq); + return -EINVAL; + } + + if (!vq->use_dma_api) { + END_USE(vq); + return -EINVAL; + } + + vq->premapped = true; + vq->do_unmap = false; + + END_USE(vq); + + return 0; +} +EXPORT_SYMBOL_GPL(virtqueue_set_dma_premapped); + +/** + * virtqueue_reset - detach and recycle all unused buffers + * @_vq: the struct virtqueue we're talking about. + * @recycle: callback to recycle unused buffers + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error. + * 0: success. 
+ * -EBUSY: Failed to sync with device, vq may not work properly + * -ENOENT: Transport or device not supported + * -EPERM: Operation not permitted + */ +int virtqueue_reset(struct virtqueue *_vq, + void (*recycle)(struct virtqueue *vq, void *buf)) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + int err; + + err = virtqueue_disable_and_recycle(_vq, recycle); + if (err) + return err; + + if (vq->packed_ring) + virtqueue_reinit_packed(vq); + else + virtqueue_reinit_split(vq); + + return virtqueue_enable_after_reset(_vq); +} +EXPORT_SYMBOL_GPL(virtqueue_reset); + +/* Only available for split ring */ +struct virtqueue *vring_new_virtqueue(unsigned int index, + unsigned int num, + unsigned int vring_align, + struct virtio_device *vdev, + bool weak_barriers, + bool context, + void *pages, + bool (*notify)(struct virtqueue *vq), + void (*callback)(struct virtqueue *vq), + const char *name) +{ + struct vring_virtqueue_split vring_split = {}; + + if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) + return NULL; + + vring_init(&vring_split.vring, num, pages, vring_align); + return __vring_new_virtqueue(index, &vring_split, vdev, weak_barriers, + context, notify, callback, name, + vdev->dev.parent); +} +EXPORT_SYMBOL_GPL(vring_new_virtqueue); + +static void vring_free(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (vq->we_own_ring) { + if (vq->packed_ring) { + vring_free_queue(vq->vq.vdev, + vq->packed.ring_size_in_bytes, + vq->packed.vring.desc, + vq->packed.ring_dma_addr, + vring_dma_dev(vq)); + + vring_free_queue(vq->vq.vdev, + vq->packed.event_size_in_bytes, + vq->packed.vring.driver, + vq->packed.driver_event_dma_addr, + vring_dma_dev(vq)); + + vring_free_queue(vq->vq.vdev, + vq->packed.event_size_in_bytes, + vq->packed.vring.device, + vq->packed.device_event_dma_addr, + vring_dma_dev(vq)); + + kfree(vq->packed.desc_state); + kfree(vq->packed.desc_extra); + } else { + vring_free_queue(vq->vq.vdev, + vq->split.queue_size_in_bytes, + vq->split.vring.desc, + vq->split.queue_dma_addr, + vring_dma_dev(vq)); + } + } + if (!vq->packed_ring) { + kfree(vq->split.desc_state); + kfree(vq->split.desc_extra); + } +} + +void vring_del_virtqueue(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + spin_lock(&vq->vq.vdev->vqs_list_lock); + list_del(&_vq->list); + spin_unlock(&vq->vq.vdev->vqs_list_lock); + + vring_free(_vq); + + kfree(vq); +} +EXPORT_SYMBOL_GPL(vring_del_virtqueue); + +u32 vring_notification_data(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + u16 next; + + if (vq->packed_ring) + next = (vq->packed.next_avail_idx & + ~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR))) | + vq->packed.avail_wrap_counter << + VRING_PACKED_EVENT_F_WRAP_CTR; + else + next = vq->split.avail_idx_shadow; + + return next << 16 | _vq->index; +} +EXPORT_SYMBOL_GPL(vring_notification_data); + +/* Manipulates transport-specific feature bits. */ +void vring_transport_features(struct virtio_device *vdev) +{ + unsigned int i; + + for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) { + switch (i) { + case VIRTIO_RING_F_INDIRECT_DESC: + break; + case VIRTIO_RING_F_EVENT_IDX: + break; + case VIRTIO_F_VERSION_1: + break; + case VIRTIO_F_ACCESS_PLATFORM: + break; + case VIRTIO_F_RING_PACKED: + break; + case VIRTIO_F_ORDER_PLATFORM: + break; + case VIRTIO_F_NOTIFICATION_DATA: + break; + default: + /* We don't understand this bit. 
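			 * Clear it so the driver never acknowledges a
			 * transport feature the ring code cannot honour.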
*/ + __virtio_clear_bit(vdev, i); + } + } +} +EXPORT_SYMBOL_GPL(vring_transport_features); + +/** + * virtqueue_get_vring_size - return the size of the virtqueue's vring + * @_vq: the struct virtqueue containing the vring of interest. + * + * Returns the size of the vring. This is mainly used for boasting to + * userspace. Unlike other operations, this need not be serialized. + */ +unsigned int virtqueue_get_vring_size(const struct virtqueue *_vq) +{ + + const struct vring_virtqueue *vq = to_vvq(_vq); + + return vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num; +} +EXPORT_SYMBOL_GPL(virtqueue_get_vring_size); + +/* + * This function should only be called by the core, not directly by the driver. + */ +void __virtqueue_break(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ + WRITE_ONCE(vq->broken, true); +} +EXPORT_SYMBOL_GPL(__virtqueue_break); + +/* + * This function should only be called by the core, not directly by the driver. + */ +void __virtqueue_unbreak(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ + WRITE_ONCE(vq->broken, false); +} +EXPORT_SYMBOL_GPL(__virtqueue_unbreak); + +bool virtqueue_is_broken(const struct virtqueue *_vq) +{ + const struct vring_virtqueue *vq = to_vvq(_vq); + + return READ_ONCE(vq->broken); +} +EXPORT_SYMBOL_GPL(virtqueue_is_broken); + +/* + * This should prevent the device from being used, allowing drivers to + * recover. You may need to grab appropriate locks to flush. + */ +void virtio_break_device(struct virtio_device *dev) +{ + struct virtqueue *_vq; + + spin_lock(&dev->vqs_list_lock); + list_for_each_entry(_vq, &dev->vqs, list) { + struct vring_virtqueue *vq = to_vvq(_vq); + + /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ + WRITE_ONCE(vq->broken, true); + } + spin_unlock(&dev->vqs_list_lock); +} +EXPORT_SYMBOL_GPL(virtio_break_device); + +/* + * This should allow the device to be used by the driver. You may + * need to grab appropriate locks to flush the write to + * vq->broken. This should only be used in some specific case e.g + * (probing and restoring). This function should only be called by the + * core, not directly by the driver. + */ +void __virtio_unbreak_device(struct virtio_device *dev) +{ + struct virtqueue *_vq; + + spin_lock(&dev->vqs_list_lock); + list_for_each_entry(_vq, &dev->vqs, list) { + struct vring_virtqueue *vq = to_vvq(_vq); + + /* Pairs with READ_ONCE() in virtqueue_is_broken(). 
*/ + WRITE_ONCE(vq->broken, false); + } + spin_unlock(&dev->vqs_list_lock); +} +EXPORT_SYMBOL_GPL(__virtio_unbreak_device); + +dma_addr_t virtqueue_get_desc_addr(const struct virtqueue *_vq) +{ + const struct vring_virtqueue *vq = to_vvq(_vq); + + BUG_ON(!vq->we_own_ring); + + if (vq->packed_ring) + return vq->packed.ring_dma_addr; + + return vq->split.queue_dma_addr; +} +EXPORT_SYMBOL_GPL(virtqueue_get_desc_addr); + +dma_addr_t virtqueue_get_avail_addr(const struct virtqueue *_vq) +{ + const struct vring_virtqueue *vq = to_vvq(_vq); + + BUG_ON(!vq->we_own_ring); + + if (vq->packed_ring) + return vq->packed.driver_event_dma_addr; + + return vq->split.queue_dma_addr + + ((char *)vq->split.vring.avail - (char *)vq->split.vring.desc); +} +EXPORT_SYMBOL_GPL(virtqueue_get_avail_addr); + +dma_addr_t virtqueue_get_used_addr(const struct virtqueue *_vq) +{ + const struct vring_virtqueue *vq = to_vvq(_vq); + + BUG_ON(!vq->we_own_ring); + + if (vq->packed_ring) + return vq->packed.device_event_dma_addr; + + return vq->split.queue_dma_addr + + ((char *)vq->split.vring.used - (char *)vq->split.vring.desc); +} +EXPORT_SYMBOL_GPL(virtqueue_get_used_addr); + +/* Only available for split ring */ +const struct vring *virtqueue_get_vring(const struct virtqueue *vq) +{ + return &to_vvq(vq)->split.vring; +} +EXPORT_SYMBOL_GPL(virtqueue_get_vring); + +/** + * virtqueue_dma_map_single_attrs - map DMA for _vq + * @_vq: the struct virtqueue we're talking about. + * @ptr: the pointer of the buffer to do dma + * @size: the size of the buffer to do dma + * @dir: DMA direction + * @attrs: DMA Attrs + * + * The caller calls this to do dma mapping in advance. The DMA address can be + * passed to this _vq when it is in pre-mapped mode. + * + * return DMA address. Caller should check that by virtqueue_dma_mapping_error(). + */ +dma_addr_t virtqueue_dma_map_single_attrs(struct virtqueue *_vq, void *ptr, + size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (!vq->use_dma_api) + return (dma_addr_t)virt_to_phys(ptr); + + return dma_map_single_attrs(vring_dma_dev(vq), ptr, size, dir, attrs); +} +EXPORT_SYMBOL_GPL(virtqueue_dma_map_single_attrs); + +/** + * virtqueue_dma_unmap_single_attrs - unmap DMA for _vq + * @_vq: the struct virtqueue we're talking about. + * @addr: the dma address to unmap + * @size: the size of the buffer + * @dir: DMA direction + * @attrs: DMA Attrs + * + * Unmap the address that is mapped by the virtqueue_dma_map_* APIs. + * + */ +void virtqueue_dma_unmap_single_attrs(struct virtqueue *_vq, dma_addr_t addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (!vq->use_dma_api) + return; + + dma_unmap_single_attrs(vring_dma_dev(vq), addr, size, dir, attrs); +} +EXPORT_SYMBOL_GPL(virtqueue_dma_unmap_single_attrs); + +/** + * virtqueue_dma_mapping_error - check dma address + * @_vq: the struct virtqueue we're talking about. + * @addr: DMA address + * + * Returns 0 means dma valid. Other means invalid dma address. + */ +int virtqueue_dma_mapping_error(struct virtqueue *_vq, dma_addr_t addr) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (!vq->use_dma_api) + return 0; + + return dma_mapping_error(vring_dma_dev(vq), addr); +} +EXPORT_SYMBOL_GPL(virtqueue_dma_mapping_error); + +/** + * virtqueue_dma_need_sync - check a dma address needs sync + * @_vq: the struct virtqueue we're talking about. 
+ * @addr: DMA address + * + * Check if the dma address mapped by the virtqueue_dma_map_* APIs needs to be + * synchronized + * + * return bool + */ +bool virtqueue_dma_need_sync(struct virtqueue *_vq, dma_addr_t addr) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (!vq->use_dma_api) + return false; + + return dma_need_sync(vring_dma_dev(vq), addr); +} +EXPORT_SYMBOL_GPL(virtqueue_dma_need_sync); + +/** + * virtqueue_dma_sync_single_range_for_cpu - dma sync for cpu + * @_vq: the struct virtqueue we're talking about. + * @addr: DMA address + * @offset: DMA address offset + * @size: buf size for sync + * @dir: DMA direction + * + * Before calling this function, use virtqueue_dma_need_sync() to confirm that + * the DMA address really needs to be synchronized + * + */ +void virtqueue_dma_sync_single_range_for_cpu(struct virtqueue *_vq, + dma_addr_t addr, + unsigned long offset, size_t size, + enum dma_data_direction dir) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + struct device *dev = vring_dma_dev(vq); + + if (!vq->use_dma_api) + return; + + dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); +} +EXPORT_SYMBOL_GPL(virtqueue_dma_sync_single_range_for_cpu); + +/** + * virtqueue_dma_sync_single_range_for_device - dma sync for device + * @_vq: the struct virtqueue we're talking about. + * @addr: DMA address + * @offset: DMA address offset + * @size: buf size for sync + * @dir: DMA direction + * + * Before calling this function, use virtqueue_dma_need_sync() to confirm that + * the DMA address really needs to be synchronized + */ +void virtqueue_dma_sync_single_range_for_device(struct virtqueue *_vq, + dma_addr_t addr, + unsigned long offset, size_t size, + enum dma_data_direction dir) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + struct device *dev = vring_dma_dev(vq); + + if (!vq->use_dma_api) + return; + + dma_sync_single_range_for_device(dev, addr, offset, size, dir); +} +EXPORT_SYMBOL_GPL(virtqueue_dma_sync_single_range_for_device); + +MODULE_LICENSE("GPL"); diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c new file mode 100644 index 0000000000..06ce6d8c2e --- /dev/null +++ b/drivers/virtio/virtio_vdpa.c @@ -0,0 +1,548 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VIRTIO based driver for vDPA device + * + * Copyright (c) 2020, Red Hat. All rights reserved. 
+ * Author: Jason Wang <jasowang@redhat.com> + * + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/uuid.h> +#include <linux/group_cpus.h> +#include <linux/virtio.h> +#include <linux/vdpa.h> +#include <linux/virtio_config.h> +#include <linux/virtio_ring.h> + +#define MOD_VERSION "0.1" +#define MOD_AUTHOR "Jason Wang <jasowang@redhat.com>" +#define MOD_DESC "vDPA bus driver for virtio devices" +#define MOD_LICENSE "GPL v2" + +struct virtio_vdpa_device { + struct virtio_device vdev; + struct vdpa_device *vdpa; + u64 features; + + /* The lock to protect virtqueue list */ + spinlock_t lock; + /* List of virtio_vdpa_vq_info */ + struct list_head virtqueues; +}; + +struct virtio_vdpa_vq_info { + /* the actual virtqueue */ + struct virtqueue *vq; + + /* the list node for the virtqueues list */ + struct list_head node; +}; + +static inline struct virtio_vdpa_device * +to_virtio_vdpa_device(struct virtio_device *dev) +{ + return container_of(dev, struct virtio_vdpa_device, vdev); +} + +static struct vdpa_device *vd_get_vdpa(struct virtio_device *vdev) +{ + return to_virtio_vdpa_device(vdev)->vdpa; +} + +static void virtio_vdpa_get(struct virtio_device *vdev, unsigned int offset, + void *buf, unsigned int len) +{ + struct vdpa_device *vdpa = vd_get_vdpa(vdev); + + vdpa_get_config(vdpa, offset, buf, len); +} + +static void virtio_vdpa_set(struct virtio_device *vdev, unsigned int offset, + const void *buf, unsigned int len) +{ + struct vdpa_device *vdpa = vd_get_vdpa(vdev); + + vdpa_set_config(vdpa, offset, buf, len); +} + +static u32 virtio_vdpa_generation(struct virtio_device *vdev) +{ + struct vdpa_device *vdpa = vd_get_vdpa(vdev); + const struct vdpa_config_ops *ops = vdpa->config; + + if (ops->get_generation) + return ops->get_generation(vdpa); + + return 0; +} + +static u8 virtio_vdpa_get_status(struct virtio_device *vdev) +{ + struct vdpa_device *vdpa = vd_get_vdpa(vdev); + const struct vdpa_config_ops *ops = vdpa->config; + + return ops->get_status(vdpa); +} + +static void virtio_vdpa_set_status(struct virtio_device *vdev, u8 status) +{ + struct vdpa_device *vdpa = vd_get_vdpa(vdev); + + return vdpa_set_status(vdpa, status); +} + +static void virtio_vdpa_reset(struct virtio_device *vdev) +{ + struct vdpa_device *vdpa = vd_get_vdpa(vdev); + + vdpa_reset(vdpa); +} + +static bool virtio_vdpa_notify(struct virtqueue *vq) +{ + struct vdpa_device *vdpa = vd_get_vdpa(vq->vdev); + const struct vdpa_config_ops *ops = vdpa->config; + + ops->kick_vq(vdpa, vq->index); + + return true; +} + +static bool virtio_vdpa_notify_with_data(struct virtqueue *vq) +{ + struct vdpa_device *vdpa = vd_get_vdpa(vq->vdev); + const struct vdpa_config_ops *ops = vdpa->config; + u32 data = vring_notification_data(vq); + + ops->kick_vq_with_data(vdpa, data); + + return true; +} + +static irqreturn_t virtio_vdpa_config_cb(void *private) +{ + struct virtio_vdpa_device *vd_dev = private; + + virtio_config_changed(&vd_dev->vdev); + + return IRQ_HANDLED; +} + +static irqreturn_t virtio_vdpa_virtqueue_cb(void *private) +{ + struct virtio_vdpa_vq_info *info = private; + + return vring_interrupt(0, info->vq); +} + +static struct virtqueue * +virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, + void (*callback)(struct virtqueue *vq), + const char *name, bool ctx) +{ + struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev); + struct vdpa_device *vdpa = vd_get_vdpa(vdev); + struct device *dma_dev; + const 
struct vdpa_config_ops *ops = vdpa->config; + struct virtio_vdpa_vq_info *info; + bool (*notify)(struct virtqueue *vq) = virtio_vdpa_notify; + struct vdpa_callback cb; + struct virtqueue *vq; + u64 desc_addr, driver_addr, device_addr; + /* Assume split virtqueue, switch to packed if necessary */ + struct vdpa_vq_state state = {0}; + unsigned long flags; + u32 align, max_num, min_num = 1; + bool may_reduce_num = true; + int err; + + if (!name) + return NULL; + + if (index >= vdpa->nvqs) + return ERR_PTR(-ENOENT); + + /* We cannot accept VIRTIO_F_NOTIFICATION_DATA without kick_vq_with_data */ + if (__virtio_test_bit(vdev, VIRTIO_F_NOTIFICATION_DATA)) { + if (ops->kick_vq_with_data) + notify = virtio_vdpa_notify_with_data; + else + __virtio_clear_bit(vdev, VIRTIO_F_NOTIFICATION_DATA); + } + + /* Queue shouldn't already be set up. */ + if (ops->get_vq_ready(vdpa, index)) + return ERR_PTR(-ENOENT); + + /* Allocate and fill out our active queue description */ + info = kmalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + max_num = ops->get_vq_num_max(vdpa); + if (max_num == 0) { + err = -ENOENT; + goto error_new_virtqueue; + } + + if (ops->get_vq_num_min) + min_num = ops->get_vq_num_min(vdpa); + + may_reduce_num = (max_num == min_num) ? false : true; + + /* Create the vring */ + align = ops->get_vq_align(vdpa); + + if (ops->get_vq_dma_dev) + dma_dev = ops->get_vq_dma_dev(vdpa, index); + else + dma_dev = vdpa_get_dma_dev(vdpa); + vq = vring_create_virtqueue_dma(index, max_num, align, vdev, + true, may_reduce_num, ctx, + notify, callback, name, dma_dev); + if (!vq) { + err = -ENOMEM; + goto error_new_virtqueue; + } + + vq->num_max = max_num; + + /* Setup virtqueue callback */ + cb.callback = callback ? virtio_vdpa_virtqueue_cb : NULL; + cb.private = info; + cb.trigger = NULL; + ops->set_vq_cb(vdpa, index, &cb); + ops->set_vq_num(vdpa, index, virtqueue_get_vring_size(vq)); + + desc_addr = virtqueue_get_desc_addr(vq); + driver_addr = virtqueue_get_avail_addr(vq); + device_addr = virtqueue_get_used_addr(vq); + + if (ops->set_vq_address(vdpa, index, + desc_addr, driver_addr, + device_addr)) { + err = -EINVAL; + goto err_vq; + } + + /* reset virtqueue state index */ + if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + struct vdpa_vq_state_packed *s = &state.packed; + + s->last_avail_counter = 1; + s->last_avail_idx = 0; + s->last_used_counter = 1; + s->last_used_idx = 0; + } + err = ops->set_vq_state(vdpa, index, &state); + if (err) + goto err_vq; + + ops->set_vq_ready(vdpa, index, 1); + + vq->priv = info; + info->vq = vq; + + spin_lock_irqsave(&vd_dev->lock, flags); + list_add(&info->node, &vd_dev->virtqueues); + spin_unlock_irqrestore(&vd_dev->lock, flags); + + return vq; + +err_vq: + vring_del_virtqueue(vq); +error_new_virtqueue: + ops->set_vq_ready(vdpa, index, 0); + /* VDPA driver should make sure vq is stopeed here */ + WARN_ON(ops->get_vq_ready(vdpa, index)); + kfree(info); + return ERR_PTR(err); +} + +static void virtio_vdpa_del_vq(struct virtqueue *vq) +{ + struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vq->vdev); + struct vdpa_device *vdpa = vd_dev->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + struct virtio_vdpa_vq_info *info = vq->priv; + unsigned int index = vq->index; + unsigned long flags; + + spin_lock_irqsave(&vd_dev->lock, flags); + list_del(&info->node); + spin_unlock_irqrestore(&vd_dev->lock, flags); + + /* Select and deactivate the queue (best effort) */ + ops->set_vq_ready(vdpa, index, 0); + + vring_del_virtqueue(vq); + + 
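	/* The vring and its DMA memory are gone; drop our bookkeeping info. */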
kfree(info); +} + +static void virtio_vdpa_del_vqs(struct virtio_device *vdev) +{ + struct virtqueue *vq, *n; + + list_for_each_entry_safe(vq, n, &vdev->vqs, list) + virtio_vdpa_del_vq(vq); +} + +static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) +{ + affd->nr_sets = 1; + affd->set_size[0] = affvecs; +} + +static struct cpumask * +create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) +{ + unsigned int affvecs = 0, curvec, usedvecs, i; + struct cpumask *masks = NULL; + + if (nvecs > affd->pre_vectors + affd->post_vectors) + affvecs = nvecs - affd->pre_vectors - affd->post_vectors; + + if (!affd->calc_sets) + affd->calc_sets = default_calc_sets; + + affd->calc_sets(affd, affvecs); + + if (!affvecs) + return NULL; + + masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); + if (!masks) + return NULL; + + /* Fill out vectors at the beginning that don't need affinity */ + for (curvec = 0; curvec < affd->pre_vectors; curvec++) + cpumask_setall(&masks[curvec]); + + for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) { + unsigned int this_vecs = affd->set_size[i]; + int j; + struct cpumask *result = group_cpus_evenly(this_vecs); + + if (!result) { + kfree(masks); + return NULL; + } + + for (j = 0; j < this_vecs; j++) + cpumask_copy(&masks[curvec + j], &result[j]); + kfree(result); + + curvec += this_vecs; + usedvecs += this_vecs; + } + + /* Fill out vectors at the end that don't need affinity */ + if (usedvecs >= affvecs) + curvec = affd->pre_vectors + affvecs; + else + curvec = affd->pre_vectors + usedvecs; + for (; curvec < nvecs; curvec++) + cpumask_setall(&masks[curvec]); + + return masks; +} + +static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char * const names[], + const bool *ctx, + struct irq_affinity *desc) +{ + struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev); + struct vdpa_device *vdpa = vd_get_vdpa(vdev); + const struct vdpa_config_ops *ops = vdpa->config; + struct irq_affinity default_affd = { 0 }; + struct cpumask *masks; + struct vdpa_callback cb; + bool has_affinity = desc && ops->set_vq_affinity; + int i, err, queue_idx = 0; + + if (has_affinity) { + masks = create_affinity_masks(nvqs, desc ? desc : &default_affd); + if (!masks) + return -ENOMEM; + } + + for (i = 0; i < nvqs; ++i) { + if (!names[i]) { + vqs[i] = NULL; + continue; + } + + vqs[i] = virtio_vdpa_setup_vq(vdev, queue_idx++, + callbacks[i], names[i], ctx ? + ctx[i] : false); + if (IS_ERR(vqs[i])) { + err = PTR_ERR(vqs[i]); + goto err_setup_vq; + } + + if (has_affinity) + ops->set_vq_affinity(vdpa, i, &masks[i]); + } + + cb.callback = virtio_vdpa_config_cb; + cb.private = vd_dev; + ops->set_config_cb(vdpa, &cb); + if (has_affinity) + kfree(masks); + + return 0; + +err_setup_vq: + virtio_vdpa_del_vqs(vdev); + if (has_affinity) + kfree(masks); + return err; +} + +static u64 virtio_vdpa_get_features(struct virtio_device *vdev) +{ + struct vdpa_device *vdpa = vd_get_vdpa(vdev); + const struct vdpa_config_ops *ops = vdpa->config; + + return ops->get_device_features(vdpa); +} + +static int virtio_vdpa_finalize_features(struct virtio_device *vdev) +{ + struct vdpa_device *vdpa = vd_get_vdpa(vdev); + + /* Give virtio_ring a chance to accept features. 
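	 * (it clears any transport feature bits it does not recognize
	 * before the final set is handed to the vDPA device).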
*/ + vring_transport_features(vdev); + + return vdpa_set_features(vdpa, vdev->features); +} + +static const char *virtio_vdpa_bus_name(struct virtio_device *vdev) +{ + struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev); + struct vdpa_device *vdpa = vd_dev->vdpa; + + return dev_name(&vdpa->dev); +} + +static int virtio_vdpa_set_vq_affinity(struct virtqueue *vq, + const struct cpumask *cpu_mask) +{ + struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vq->vdev); + struct vdpa_device *vdpa = vd_dev->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + unsigned int index = vq->index; + + if (ops->set_vq_affinity) + return ops->set_vq_affinity(vdpa, index, cpu_mask); + + return 0; +} + +static const struct cpumask * +virtio_vdpa_get_vq_affinity(struct virtio_device *vdev, int index) +{ + struct vdpa_device *vdpa = vd_get_vdpa(vdev); + const struct vdpa_config_ops *ops = vdpa->config; + + if (ops->get_vq_affinity) + return ops->get_vq_affinity(vdpa, index); + + return NULL; +} + +static const struct virtio_config_ops virtio_vdpa_config_ops = { + .get = virtio_vdpa_get, + .set = virtio_vdpa_set, + .generation = virtio_vdpa_generation, + .get_status = virtio_vdpa_get_status, + .set_status = virtio_vdpa_set_status, + .reset = virtio_vdpa_reset, + .find_vqs = virtio_vdpa_find_vqs, + .del_vqs = virtio_vdpa_del_vqs, + .get_features = virtio_vdpa_get_features, + .finalize_features = virtio_vdpa_finalize_features, + .bus_name = virtio_vdpa_bus_name, + .set_vq_affinity = virtio_vdpa_set_vq_affinity, + .get_vq_affinity = virtio_vdpa_get_vq_affinity, +}; + +static void virtio_vdpa_release_dev(struct device *_d) +{ + struct virtio_device *vdev = + container_of(_d, struct virtio_device, dev); + struct virtio_vdpa_device *vd_dev = + container_of(vdev, struct virtio_vdpa_device, vdev); + + kfree(vd_dev); +} + +static int virtio_vdpa_probe(struct vdpa_device *vdpa) +{ + const struct vdpa_config_ops *ops = vdpa->config; + struct virtio_vdpa_device *vd_dev, *reg_dev = NULL; + int ret = -EINVAL; + + vd_dev = kzalloc(sizeof(*vd_dev), GFP_KERNEL); + if (!vd_dev) + return -ENOMEM; + + vd_dev->vdev.dev.parent = vdpa_get_dma_dev(vdpa); + vd_dev->vdev.dev.release = virtio_vdpa_release_dev; + vd_dev->vdev.config = &virtio_vdpa_config_ops; + vd_dev->vdpa = vdpa; + INIT_LIST_HEAD(&vd_dev->virtqueues); + spin_lock_init(&vd_dev->lock); + + vd_dev->vdev.id.device = ops->get_device_id(vdpa); + if (vd_dev->vdev.id.device == 0) + goto err; + + vd_dev->vdev.id.vendor = ops->get_vendor_id(vdpa); + ret = register_virtio_device(&vd_dev->vdev); + reg_dev = vd_dev; + if (ret) + goto err; + + vdpa_set_drvdata(vdpa, vd_dev); + + return 0; + +err: + if (reg_dev) + put_device(&vd_dev->vdev.dev); + else + kfree(vd_dev); + return ret; +} + +static void virtio_vdpa_remove(struct vdpa_device *vdpa) +{ + struct virtio_vdpa_device *vd_dev = vdpa_get_drvdata(vdpa); + + unregister_virtio_device(&vd_dev->vdev); +} + +static struct vdpa_driver virtio_vdpa_driver = { + .driver = { + .name = "virtio_vdpa", + }, + .probe = virtio_vdpa_probe, + .remove = virtio_vdpa_remove, +}; + +module_vdpa_driver(virtio_vdpa_driver); + +MODULE_VERSION(MOD_VERSION); +MODULE_LICENSE(MOD_LICENSE); +MODULE_AUTHOR(MOD_AUTHOR); +MODULE_DESCRIPTION(MOD_DESC); |