diff options
Diffstat (limited to '')
-rw-r--r-- | drivers/virt/Kconfig | 34 | ||||
-rw-r--r-- | drivers/virt/Makefile | 6 | ||||
-rw-r--r-- | drivers/virt/fsl_hypervisor.c | 935 | ||||
-rw-r--r-- | drivers/virt/vboxguest/Kconfig | 18 | ||||
-rw-r--r-- | drivers/virt/vboxguest/Makefile | 3 | ||||
-rw-r--r-- | drivers/virt/vboxguest/vboxguest_core.c | 1611 | ||||
-rw-r--r-- | drivers/virt/vboxguest/vboxguest_core.h | 198 | ||||
-rw-r--r-- | drivers/virt/vboxguest/vboxguest_linux.c | 482 | ||||
-rw-r--r-- | drivers/virt/vboxguest/vboxguest_utils.c | 813 | ||||
-rw-r--r-- | drivers/virt/vboxguest/vboxguest_version.h | 19 | ||||
-rw-r--r-- | drivers/virt/vboxguest/vmmdev.h | 451 | ||||
-rw-r--r-- | drivers/virtio/Kconfig | 86 | ||||
-rw-r--r-- | drivers/virtio/Makefile | 8 | ||||
-rw-r--r-- | drivers/virtio/virtio.c | 457 | ||||
-rw-r--r-- | drivers/virtio/virtio_balloon.c | 740 | ||||
-rw-r--r-- | drivers/virtio/virtio_input.c | 388 | ||||
-rw-r--r-- | drivers/virtio/virtio_mmio.c | 783 | ||||
-rw-r--r-- | drivers/virtio/virtio_pci_common.c | 644 | ||||
-rw-r--r-- | drivers/virtio/virtio_pci_common.h | 163 | ||||
-rw-r--r-- | drivers/virtio/virtio_pci_legacy.c | 280 | ||||
-rw-r--r-- | drivers/virtio/virtio_pci_modern.c | 737 | ||||
-rw-r--r-- | drivers/virtio/virtio_ring.c | 1259 |
22 files changed, 10115 insertions, 0 deletions
diff --git a/drivers/virt/Kconfig b/drivers/virt/Kconfig new file mode 100644 index 000000000..8d9cdfbd6 --- /dev/null +++ b/drivers/virt/Kconfig @@ -0,0 +1,34 @@ +# +# Virtualization support drivers +# + +menuconfig VIRT_DRIVERS + bool "Virtualization drivers" + ---help--- + Say Y here to get to see options for device drivers that support + virtualization environments. + + If you say N, all options in this submenu will be skipped and disabled. + +if VIRT_DRIVERS + +config FSL_HV_MANAGER + tristate "Freescale hypervisor management driver" + depends on FSL_SOC + select EPAPR_PARAVIRT + help + The Freescale hypervisor management driver provides several services + to drivers and applications related to the Freescale hypervisor: + + 1) An ioctl interface for querying and managing partitions. + + 2) A file interface to reading incoming doorbells. + + 3) An interrupt handler for shutting down the partition upon + receiving the shutdown doorbell from a manager partition. + + 4) A kernel interface for receiving callbacks when a managed + partition shuts down. + +source "drivers/virt/vboxguest/Kconfig" +endif diff --git a/drivers/virt/Makefile b/drivers/virt/Makefile new file mode 100644 index 000000000..d3f7b2540 --- /dev/null +++ b/drivers/virt/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for drivers that support virtualization +# + +obj-$(CONFIG_FSL_HV_MANAGER) += fsl_hypervisor.o +obj-y += vboxguest/ diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c new file mode 100644 index 000000000..2a7f7f47f --- /dev/null +++ b/drivers/virt/fsl_hypervisor.c @@ -0,0 +1,935 @@ +/* + * Freescale Hypervisor Management Driver + + * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. + * Author: Timur Tabi <timur@freescale.com> + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + * + * The Freescale hypervisor management driver provides several services to + * drivers and applications related to the Freescale hypervisor: + * + * 1. An ioctl interface for querying and managing partitions. + * + * 2. A file interface to reading incoming doorbells. + * + * 3. An interrupt handler for shutting down the partition upon receiving the + * shutdown doorbell from a manager partition. + * + * 4. A kernel interface for receiving callbacks when a managed partition + * shuts down. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/err.h> +#include <linux/fs.h> +#include <linux/miscdevice.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/poll.h> +#include <linux/of.h> +#include <linux/of_irq.h> +#include <linux/reboot.h> +#include <linux/uaccess.h> +#include <linux/notifier.h> +#include <linux/interrupt.h> + +#include <linux/io.h> +#include <asm/fsl_hcalls.h> + +#include <linux/fsl_hypervisor.h> + +static BLOCKING_NOTIFIER_HEAD(failover_subscribers); + +/* + * Ioctl interface for FSL_HV_IOCTL_PARTITION_RESTART + * + * Restart a running partition + */ +static long ioctl_restart(struct fsl_hv_ioctl_restart __user *p) +{ + struct fsl_hv_ioctl_restart param; + + /* Get the parameters from the user */ + if (copy_from_user(¶m, p, sizeof(struct fsl_hv_ioctl_restart))) + return -EFAULT; + + param.ret = fh_partition_restart(param.partition); + + if (copy_to_user(&p->ret, ¶m.ret, sizeof(__u32))) + return -EFAULT; + + return 0; +} + +/* + * Ioctl interface for FSL_HV_IOCTL_PARTITION_STATUS + * + * Query the status of a partition + */ +static long ioctl_status(struct fsl_hv_ioctl_status __user *p) +{ + struct fsl_hv_ioctl_status param; + u32 status; + + /* Get the parameters from the user */ + if (copy_from_user(¶m, p, sizeof(struct fsl_hv_ioctl_status))) + return -EFAULT; + + param.ret = fh_partition_get_status(param.partition, &status); + if (!param.ret) + param.status = status; + + if (copy_to_user(p, ¶m, sizeof(struct fsl_hv_ioctl_status))) + return -EFAULT; + + return 0; +} + +/* + * Ioctl interface for FSL_HV_IOCTL_PARTITION_START + * + * Start a stopped partition. + */ +static long ioctl_start(struct fsl_hv_ioctl_start __user *p) +{ + struct fsl_hv_ioctl_start param; + + /* Get the parameters from the user */ + if (copy_from_user(¶m, p, sizeof(struct fsl_hv_ioctl_start))) + return -EFAULT; + + param.ret = fh_partition_start(param.partition, param.entry_point, + param.load); + + if (copy_to_user(&p->ret, ¶m.ret, sizeof(__u32))) + return -EFAULT; + + return 0; +} + +/* + * Ioctl interface for FSL_HV_IOCTL_PARTITION_STOP + * + * Stop a running partition + */ +static long ioctl_stop(struct fsl_hv_ioctl_stop __user *p) +{ + struct fsl_hv_ioctl_stop param; + + /* Get the parameters from the user */ + if (copy_from_user(¶m, p, sizeof(struct fsl_hv_ioctl_stop))) + return -EFAULT; + + param.ret = fh_partition_stop(param.partition); + + if (copy_to_user(&p->ret, ¶m.ret, sizeof(__u32))) + return -EFAULT; + + return 0; +} + +/* + * Ioctl interface for FSL_HV_IOCTL_MEMCPY + * + * The FH_MEMCPY hypercall takes an array of address/address/size structures + * to represent the data being copied. As a convenience to the user, this + * ioctl takes a user-create buffer and a pointer to a guest physically + * contiguous buffer in the remote partition, and creates the + * address/address/size array for the hypercall. + */ +static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p) +{ + struct fsl_hv_ioctl_memcpy param; + + struct page **pages = NULL; + void *sg_list_unaligned = NULL; + struct fh_sg_list *sg_list = NULL; + + unsigned int num_pages; + unsigned long lb_offset; /* Offset within a page of the local buffer */ + + unsigned int i; + long ret = 0; + int num_pinned = 0; /* return value from get_user_pages_fast() */ + phys_addr_t remote_paddr; /* The next address in the remote buffer */ + uint32_t count; /* The number of bytes left to copy */ + + /* Get the parameters from the user */ + if (copy_from_user(¶m, p, sizeof(struct fsl_hv_ioctl_memcpy))) + return -EFAULT; + + /* + * One partition must be local, the other must be remote. In other + * words, if source and target are both -1, or are both not -1, then + * return an error. + */ + if ((param.source == -1) == (param.target == -1)) + return -EINVAL; + + /* + * The array of pages returned by get_user_pages_fast() covers only + * page-aligned memory. Since the user buffer is probably not + * page-aligned, we need to handle the discrepancy. + * + * We calculate the offset within a page of the S/G list, and make + * adjustments accordingly. This will result in a page list that looks + * like this: + * + * ---- <-- first page starts before the buffer + * | | + * |////|-> ---- + * |////| | | + * ---- | | + * | | + * ---- | | + * |////| | | + * |////| | | + * |////| | | + * ---- | | + * | | + * ---- | | + * |////| | | + * |////| | | + * |////| | | + * ---- | | + * | | + * ---- | | + * |////| | | + * |////|-> ---- + * | | <-- last page ends after the buffer + * ---- + * + * The distance between the start of the first page and the start of the + * buffer is lb_offset. The hashed (///) areas are the parts of the + * page list that contain the actual buffer. + * + * The advantage of this approach is that the number of pages is + * equal to the number of entries in the S/G list that we give to the + * hypervisor. + */ + lb_offset = param.local_vaddr & (PAGE_SIZE - 1); + if (param.count == 0 || + param.count > U64_MAX - lb_offset - PAGE_SIZE + 1) + return -EINVAL; + num_pages = (param.count + lb_offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + + /* Allocate the buffers we need */ + + /* + * 'pages' is an array of struct page pointers that's initialized by + * get_user_pages_fast(). + */ + pages = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) { + pr_debug("fsl-hv: could not allocate page list\n"); + return -ENOMEM; + } + + /* + * sg_list is the list of fh_sg_list objects that we pass to the + * hypervisor. + */ + sg_list_unaligned = kmalloc(num_pages * sizeof(struct fh_sg_list) + + sizeof(struct fh_sg_list) - 1, GFP_KERNEL); + if (!sg_list_unaligned) { + pr_debug("fsl-hv: could not allocate S/G list\n"); + ret = -ENOMEM; + goto free_pages; + } + sg_list = PTR_ALIGN(sg_list_unaligned, sizeof(struct fh_sg_list)); + + /* Get the physical addresses of the source buffer */ + num_pinned = get_user_pages_fast(param.local_vaddr - lb_offset, + num_pages, param.source != -1, pages); + + if (num_pinned != num_pages) { + pr_debug("fsl-hv: could not lock source buffer\n"); + ret = (num_pinned < 0) ? num_pinned : -EFAULT; + goto exit; + } + + /* + * Build the fh_sg_list[] array. The first page is special + * because it's misaligned. + */ + if (param.source == -1) { + sg_list[0].source = page_to_phys(pages[0]) + lb_offset; + sg_list[0].target = param.remote_paddr; + } else { + sg_list[0].source = param.remote_paddr; + sg_list[0].target = page_to_phys(pages[0]) + lb_offset; + } + sg_list[0].size = min_t(uint64_t, param.count, PAGE_SIZE - lb_offset); + + remote_paddr = param.remote_paddr + sg_list[0].size; + count = param.count - sg_list[0].size; + + for (i = 1; i < num_pages; i++) { + if (param.source == -1) { + /* local to remote */ + sg_list[i].source = page_to_phys(pages[i]); + sg_list[i].target = remote_paddr; + } else { + /* remote to local */ + sg_list[i].source = remote_paddr; + sg_list[i].target = page_to_phys(pages[i]); + } + sg_list[i].size = min_t(uint64_t, count, PAGE_SIZE); + + remote_paddr += sg_list[i].size; + count -= sg_list[i].size; + } + + param.ret = fh_partition_memcpy(param.source, param.target, + virt_to_phys(sg_list), num_pages); + +exit: + if (pages && (num_pinned > 0)) { + for (i = 0; i < num_pinned; i++) + put_page(pages[i]); + } + + kfree(sg_list_unaligned); +free_pages: + kfree(pages); + + if (!ret) + if (copy_to_user(&p->ret, ¶m.ret, sizeof(__u32))) + return -EFAULT; + + return ret; +} + +/* + * Ioctl interface for FSL_HV_IOCTL_DOORBELL + * + * Ring a doorbell + */ +static long ioctl_doorbell(struct fsl_hv_ioctl_doorbell __user *p) +{ + struct fsl_hv_ioctl_doorbell param; + + /* Get the parameters from the user. */ + if (copy_from_user(¶m, p, sizeof(struct fsl_hv_ioctl_doorbell))) + return -EFAULT; + + param.ret = ev_doorbell_send(param.doorbell); + + if (copy_to_user(&p->ret, ¶m.ret, sizeof(__u32))) + return -EFAULT; + + return 0; +} + +static long ioctl_dtprop(struct fsl_hv_ioctl_prop __user *p, int set) +{ + struct fsl_hv_ioctl_prop param; + char __user *upath, *upropname; + void __user *upropval; + char *path, *propname; + void *propval; + int ret = 0; + + /* Get the parameters from the user. */ + if (copy_from_user(¶m, p, sizeof(struct fsl_hv_ioctl_prop))) + return -EFAULT; + + upath = (char __user *)(uintptr_t)param.path; + upropname = (char __user *)(uintptr_t)param.propname; + upropval = (void __user *)(uintptr_t)param.propval; + + path = strndup_user(upath, FH_DTPROP_MAX_PATHLEN); + if (IS_ERR(path)) + return PTR_ERR(path); + + propname = strndup_user(upropname, FH_DTPROP_MAX_PATHLEN); + if (IS_ERR(propname)) { + ret = PTR_ERR(propname); + goto err_free_path; + } + + if (param.proplen > FH_DTPROP_MAX_PROPLEN) { + ret = -EINVAL; + goto err_free_propname; + } + + propval = kmalloc(param.proplen, GFP_KERNEL); + if (!propval) { + ret = -ENOMEM; + goto err_free_propname; + } + + if (set) { + if (copy_from_user(propval, upropval, param.proplen)) { + ret = -EFAULT; + goto err_free_propval; + } + + param.ret = fh_partition_set_dtprop(param.handle, + virt_to_phys(path), + virt_to_phys(propname), + virt_to_phys(propval), + param.proplen); + } else { + param.ret = fh_partition_get_dtprop(param.handle, + virt_to_phys(path), + virt_to_phys(propname), + virt_to_phys(propval), + ¶m.proplen); + + if (param.ret == 0) { + if (copy_to_user(upropval, propval, param.proplen) || + put_user(param.proplen, &p->proplen)) { + ret = -EFAULT; + goto err_free_propval; + } + } + } + + if (put_user(param.ret, &p->ret)) + ret = -EFAULT; + +err_free_propval: + kfree(propval); +err_free_propname: + kfree(propname); +err_free_path: + kfree(path); + + return ret; +} + +/* + * Ioctl main entry point + */ +static long fsl_hv_ioctl(struct file *file, unsigned int cmd, + unsigned long argaddr) +{ + void __user *arg = (void __user *)argaddr; + long ret; + + switch (cmd) { + case FSL_HV_IOCTL_PARTITION_RESTART: + ret = ioctl_restart(arg); + break; + case FSL_HV_IOCTL_PARTITION_GET_STATUS: + ret = ioctl_status(arg); + break; + case FSL_HV_IOCTL_PARTITION_START: + ret = ioctl_start(arg); + break; + case FSL_HV_IOCTL_PARTITION_STOP: + ret = ioctl_stop(arg); + break; + case FSL_HV_IOCTL_MEMCPY: + ret = ioctl_memcpy(arg); + break; + case FSL_HV_IOCTL_DOORBELL: + ret = ioctl_doorbell(arg); + break; + case FSL_HV_IOCTL_GETPROP: + ret = ioctl_dtprop(arg, 0); + break; + case FSL_HV_IOCTL_SETPROP: + ret = ioctl_dtprop(arg, 1); + break; + default: + pr_debug("fsl-hv: bad ioctl dir=%u type=%u cmd=%u size=%u\n", + _IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), + _IOC_SIZE(cmd)); + return -ENOTTY; + } + + return ret; +} + +/* Linked list of processes that have us open */ +static struct list_head db_list; + +/* spinlock for db_list */ +static DEFINE_SPINLOCK(db_list_lock); + +/* The size of the doorbell event queue. This must be a power of two. */ +#define QSIZE 16 + +/* Returns the next head/tail pointer, wrapping around the queue if necessary */ +#define nextp(x) (((x) + 1) & (QSIZE - 1)) + +/* Per-open data structure */ +struct doorbell_queue { + struct list_head list; + spinlock_t lock; + wait_queue_head_t wait; + unsigned int head; + unsigned int tail; + uint32_t q[QSIZE]; +}; + +/* Linked list of ISRs that we registered */ +struct list_head isr_list; + +/* Per-ISR data structure */ +struct doorbell_isr { + struct list_head list; + unsigned int irq; + uint32_t doorbell; /* The doorbell handle */ + uint32_t partition; /* The partition handle, if used */ +}; + +/* + * Add a doorbell to all of the doorbell queues + */ +static void fsl_hv_queue_doorbell(uint32_t doorbell) +{ + struct doorbell_queue *dbq; + unsigned long flags; + + /* Prevent another core from modifying db_list */ + spin_lock_irqsave(&db_list_lock, flags); + + list_for_each_entry(dbq, &db_list, list) { + if (dbq->head != nextp(dbq->tail)) { + dbq->q[dbq->tail] = doorbell; + /* + * This memory barrier eliminates the need to grab + * the spinlock for dbq. + */ + smp_wmb(); + dbq->tail = nextp(dbq->tail); + wake_up_interruptible(&dbq->wait); + } + } + + spin_unlock_irqrestore(&db_list_lock, flags); +} + +/* + * Interrupt handler for all doorbells + * + * We use the same interrupt handler for all doorbells. Whenever a doorbell + * is rung, and we receive an interrupt, we just put the handle for that + * doorbell (passed to us as *data) into all of the queues. + */ +static irqreturn_t fsl_hv_isr(int irq, void *data) +{ + fsl_hv_queue_doorbell((uintptr_t) data); + + return IRQ_HANDLED; +} + +/* + * State change thread function + * + * The state change notification arrives in an interrupt, but we can't call + * blocking_notifier_call_chain() in an interrupt handler. We could call + * atomic_notifier_call_chain(), but that would require the clients' call-back + * function to run in interrupt context. Since we don't want to impose that + * restriction on the clients, we use a threaded IRQ to process the + * notification in kernel context. + */ +static irqreturn_t fsl_hv_state_change_thread(int irq, void *data) +{ + struct doorbell_isr *dbisr = data; + + blocking_notifier_call_chain(&failover_subscribers, dbisr->partition, + NULL); + + return IRQ_HANDLED; +} + +/* + * Interrupt handler for state-change doorbells + */ +static irqreturn_t fsl_hv_state_change_isr(int irq, void *data) +{ + unsigned int status; + struct doorbell_isr *dbisr = data; + int ret; + + /* It's still a doorbell, so add it to all the queues. */ + fsl_hv_queue_doorbell(dbisr->doorbell); + + /* Determine the new state, and if it's stopped, notify the clients. */ + ret = fh_partition_get_status(dbisr->partition, &status); + if (!ret && (status == FH_PARTITION_STOPPED)) + return IRQ_WAKE_THREAD; + + return IRQ_HANDLED; +} + +/* + * Returns a bitmask indicating whether a read will block + */ +static __poll_t fsl_hv_poll(struct file *filp, struct poll_table_struct *p) +{ + struct doorbell_queue *dbq = filp->private_data; + unsigned long flags; + __poll_t mask; + + spin_lock_irqsave(&dbq->lock, flags); + + poll_wait(filp, &dbq->wait, p); + mask = (dbq->head == dbq->tail) ? 0 : (EPOLLIN | EPOLLRDNORM); + + spin_unlock_irqrestore(&dbq->lock, flags); + + return mask; +} + +/* + * Return the handles for any incoming doorbells + * + * If there are doorbell handles in the queue for this open instance, then + * return them to the caller as an array of 32-bit integers. Otherwise, + * block until there is at least one handle to return. + */ +static ssize_t fsl_hv_read(struct file *filp, char __user *buf, size_t len, + loff_t *off) +{ + struct doorbell_queue *dbq = filp->private_data; + uint32_t __user *p = (uint32_t __user *) buf; /* for put_user() */ + unsigned long flags; + ssize_t count = 0; + + /* Make sure we stop when the user buffer is full. */ + while (len >= sizeof(uint32_t)) { + uint32_t dbell; /* Local copy of doorbell queue data */ + + spin_lock_irqsave(&dbq->lock, flags); + + /* + * If the queue is empty, then either we're done or we need + * to block. If the application specified O_NONBLOCK, then + * we return the appropriate error code. + */ + if (dbq->head == dbq->tail) { + spin_unlock_irqrestore(&dbq->lock, flags); + if (count) + break; + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + if (wait_event_interruptible(dbq->wait, + dbq->head != dbq->tail)) + return -ERESTARTSYS; + continue; + } + + /* + * Even though we have an smp_wmb() in the ISR, the core + * might speculatively execute the "dbell = ..." below while + * it's evaluating the if-statement above. In that case, the + * value put into dbell could be stale if the core accepts the + * speculation. To prevent that, we need a read memory barrier + * here as well. + */ + smp_rmb(); + + /* Copy the data to a temporary local buffer, because + * we can't call copy_to_user() from inside a spinlock + */ + dbell = dbq->q[dbq->head]; + dbq->head = nextp(dbq->head); + + spin_unlock_irqrestore(&dbq->lock, flags); + + if (put_user(dbell, p)) + return -EFAULT; + p++; + count += sizeof(uint32_t); + len -= sizeof(uint32_t); + } + + return count; +} + +/* + * Open the driver and prepare for reading doorbells. + * + * Every time an application opens the driver, we create a doorbell queue + * for that file handle. This queue is used for any incoming doorbells. + */ +static int fsl_hv_open(struct inode *inode, struct file *filp) +{ + struct doorbell_queue *dbq; + unsigned long flags; + int ret = 0; + + dbq = kzalloc(sizeof(struct doorbell_queue), GFP_KERNEL); + if (!dbq) { + pr_err("fsl-hv: out of memory\n"); + return -ENOMEM; + } + + spin_lock_init(&dbq->lock); + init_waitqueue_head(&dbq->wait); + + spin_lock_irqsave(&db_list_lock, flags); + list_add(&dbq->list, &db_list); + spin_unlock_irqrestore(&db_list_lock, flags); + + filp->private_data = dbq; + + return ret; +} + +/* + * Close the driver + */ +static int fsl_hv_close(struct inode *inode, struct file *filp) +{ + struct doorbell_queue *dbq = filp->private_data; + unsigned long flags; + + int ret = 0; + + spin_lock_irqsave(&db_list_lock, flags); + list_del(&dbq->list); + spin_unlock_irqrestore(&db_list_lock, flags); + + kfree(dbq); + + return ret; +} + +static const struct file_operations fsl_hv_fops = { + .owner = THIS_MODULE, + .open = fsl_hv_open, + .release = fsl_hv_close, + .poll = fsl_hv_poll, + .read = fsl_hv_read, + .unlocked_ioctl = fsl_hv_ioctl, + .compat_ioctl = fsl_hv_ioctl, +}; + +static struct miscdevice fsl_hv_misc_dev = { + MISC_DYNAMIC_MINOR, + "fsl-hv", + &fsl_hv_fops +}; + +static irqreturn_t fsl_hv_shutdown_isr(int irq, void *data) +{ + orderly_poweroff(false); + + return IRQ_HANDLED; +} + +/* + * Returns the handle of the parent of the given node + * + * The handle is the value of the 'hv-handle' property + */ +static int get_parent_handle(struct device_node *np) +{ + struct device_node *parent; + const uint32_t *prop; + uint32_t handle; + int len; + + parent = of_get_parent(np); + if (!parent) + /* It's not really possible for this to fail */ + return -ENODEV; + + /* + * The proper name for the handle property is "hv-handle", but some + * older versions of the hypervisor used "reg". + */ + prop = of_get_property(parent, "hv-handle", &len); + if (!prop) + prop = of_get_property(parent, "reg", &len); + + if (!prop || (len != sizeof(uint32_t))) { + /* This can happen only if the node is malformed */ + of_node_put(parent); + return -ENODEV; + } + + handle = be32_to_cpup(prop); + of_node_put(parent); + + return handle; +} + +/* + * Register a callback for failover events + * + * This function is called by device drivers to register their callback + * functions for fail-over events. + */ +int fsl_hv_failover_register(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&failover_subscribers, nb); +} +EXPORT_SYMBOL(fsl_hv_failover_register); + +/* + * Unregister a callback for failover events + */ +int fsl_hv_failover_unregister(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&failover_subscribers, nb); +} +EXPORT_SYMBOL(fsl_hv_failover_unregister); + +/* + * Return TRUE if we're running under FSL hypervisor + * + * This function checks to see if we're running under the Freescale + * hypervisor, and returns zero if we're not, or non-zero if we are. + * + * First, it checks if MSR[GS]==1, which means we're running under some + * hypervisor. Then it checks if there is a hypervisor node in the device + * tree. Currently, that means there needs to be a node in the root called + * "hypervisor" and which has a property named "fsl,hv-version". + */ +static int has_fsl_hypervisor(void) +{ + struct device_node *node; + int ret; + + node = of_find_node_by_path("/hypervisor"); + if (!node) + return 0; + + ret = of_find_property(node, "fsl,hv-version", NULL) != NULL; + + of_node_put(node); + + return ret; +} + +/* + * Freescale hypervisor management driver init + * + * This function is called when this module is loaded. + * + * Register ourselves as a miscellaneous driver. This will register the + * fops structure and create the right sysfs entries for udev. + */ +static int __init fsl_hypervisor_init(void) +{ + struct device_node *np; + struct doorbell_isr *dbisr, *n; + int ret; + + pr_info("Freescale hypervisor management driver\n"); + + if (!has_fsl_hypervisor()) { + pr_info("fsl-hv: no hypervisor found\n"); + return -ENODEV; + } + + ret = misc_register(&fsl_hv_misc_dev); + if (ret) { + pr_err("fsl-hv: cannot register device\n"); + return ret; + } + + INIT_LIST_HEAD(&db_list); + INIT_LIST_HEAD(&isr_list); + + for_each_compatible_node(np, NULL, "epapr,hv-receive-doorbell") { + unsigned int irq; + const uint32_t *handle; + + handle = of_get_property(np, "interrupts", NULL); + irq = irq_of_parse_and_map(np, 0); + if (!handle || (irq == NO_IRQ)) { + pr_err("fsl-hv: no 'interrupts' property in %pOF node\n", + np); + continue; + } + + dbisr = kzalloc(sizeof(*dbisr), GFP_KERNEL); + if (!dbisr) + goto out_of_memory; + + dbisr->irq = irq; + dbisr->doorbell = be32_to_cpup(handle); + + if (of_device_is_compatible(np, "fsl,hv-shutdown-doorbell")) { + /* The shutdown doorbell gets its own ISR */ + ret = request_irq(irq, fsl_hv_shutdown_isr, 0, + np->name, NULL); + } else if (of_device_is_compatible(np, + "fsl,hv-state-change-doorbell")) { + /* + * The state change doorbell triggers a notification if + * the state of the managed partition changes to + * "stopped". We need a separate interrupt handler for + * that, and we also need to know the handle of the + * target partition, not just the handle of the + * doorbell. + */ + dbisr->partition = ret = get_parent_handle(np); + if (ret < 0) { + pr_err("fsl-hv: node %pOF has missing or " + "malformed parent\n", np); + kfree(dbisr); + continue; + } + ret = request_threaded_irq(irq, fsl_hv_state_change_isr, + fsl_hv_state_change_thread, + 0, np->name, dbisr); + } else + ret = request_irq(irq, fsl_hv_isr, 0, np->name, dbisr); + + if (ret < 0) { + pr_err("fsl-hv: could not request irq %u for node %pOF\n", + irq, np); + kfree(dbisr); + continue; + } + + list_add(&dbisr->list, &isr_list); + + pr_info("fsl-hv: registered handler for doorbell %u\n", + dbisr->doorbell); + } + + return 0; + +out_of_memory: + list_for_each_entry_safe(dbisr, n, &isr_list, list) { + free_irq(dbisr->irq, dbisr); + list_del(&dbisr->list); + kfree(dbisr); + } + + misc_deregister(&fsl_hv_misc_dev); + + return -ENOMEM; +} + +/* + * Freescale hypervisor management driver termination + * + * This function is called when this driver is unloaded. + */ +static void __exit fsl_hypervisor_exit(void) +{ + struct doorbell_isr *dbisr, *n; + + list_for_each_entry_safe(dbisr, n, &isr_list, list) { + free_irq(dbisr->irq, dbisr); + list_del(&dbisr->list); + kfree(dbisr); + } + + misc_deregister(&fsl_hv_misc_dev); +} + +module_init(fsl_hypervisor_init); +module_exit(fsl_hypervisor_exit); + +MODULE_AUTHOR("Timur Tabi <timur@freescale.com>"); +MODULE_DESCRIPTION("Freescale hypervisor management driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/virt/vboxguest/Kconfig b/drivers/virt/vboxguest/Kconfig new file mode 100644 index 000000000..fffd318a1 --- /dev/null +++ b/drivers/virt/vboxguest/Kconfig @@ -0,0 +1,18 @@ +config VBOXGUEST + tristate "Virtual Box Guest integration support" + depends on X86 && PCI && INPUT + help + This is a driver for the Virtual Box Guest PCI device used in + Virtual Box virtual machines. Enabling this driver will add + support for Virtual Box Guest integration features such as + copy-and-paste, seamless mode and OpenGL pass-through. + + This driver also offers vboxguest IPC functionality which is needed + for the vboxfs driver which offers folder sharing support. + + If you enable this driver you should also enable the VBOXVIDEO option. + + Although it is possible to build this module in, it is advised + to build this driver as a module, so that it can be updated + independently of the kernel. Select M to build this driver as a + module. diff --git a/drivers/virt/vboxguest/Makefile b/drivers/virt/vboxguest/Makefile new file mode 100644 index 000000000..203b8f465 --- /dev/null +++ b/drivers/virt/vboxguest/Makefile @@ -0,0 +1,3 @@ +vboxguest-y := vboxguest_linux.o vboxguest_core.o vboxguest_utils.o + +obj-$(CONFIG_VBOXGUEST) += vboxguest.o diff --git a/drivers/virt/vboxguest/vboxguest_core.c b/drivers/virt/vboxguest/vboxguest_core.c new file mode 100644 index 000000000..b3bc70a50 --- /dev/null +++ b/drivers/virt/vboxguest/vboxguest_core.c @@ -0,0 +1,1611 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR CDDL-1.0) */ +/* + * vboxguest core guest-device handling code, VBoxGuest.cpp in upstream svn. + * + * Copyright (C) 2007-2016 Oracle Corporation + */ + +#include <linux/device.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/vbox_err.h> +#include <linux/vbox_utils.h> +#include <linux/vmalloc.h> +#include "vboxguest_core.h" +#include "vboxguest_version.h" + +/* Get the pointer to the first HGCM parameter. */ +#define VBG_IOCTL_HGCM_CALL_PARMS(a) \ + ((struct vmmdev_hgcm_function_parameter *)( \ + (u8 *)(a) + sizeof(struct vbg_ioctl_hgcm_call))) +/* Get the pointer to the first HGCM parameter in a 32-bit request. */ +#define VBG_IOCTL_HGCM_CALL_PARMS32(a) \ + ((struct vmmdev_hgcm_function_parameter32 *)( \ + (u8 *)(a) + sizeof(struct vbg_ioctl_hgcm_call))) + +#define GUEST_MAPPINGS_TRIES 5 + +/** + * Reserves memory in which the VMM can relocate any guest mappings + * that are floating around. + * + * This operation is a little bit tricky since the VMM might not accept + * just any address because of address clashes between the three contexts + * it operates in, so we try several times. + * + * Failure to reserve the guest mappings is ignored. + * + * @gdev: The Guest extension device. + */ +static void vbg_guest_mappings_init(struct vbg_dev *gdev) +{ + struct vmmdev_hypervisorinfo *req; + void *guest_mappings[GUEST_MAPPINGS_TRIES]; + struct page **pages = NULL; + u32 size, hypervisor_size; + int i, rc; + + /* Query the required space. */ + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_GET_HYPERVISOR_INFO); + if (!req) + return; + + req->hypervisor_start = 0; + req->hypervisor_size = 0; + rc = vbg_req_perform(gdev, req); + if (rc < 0) + goto out; + + /* + * The VMM will report back if there is nothing it wants to map, like + * for instance in VT-x and AMD-V mode. + */ + if (req->hypervisor_size == 0) + goto out; + + hypervisor_size = req->hypervisor_size; + /* Add 4M so that we can align the vmap to 4MiB as the host requires. */ + size = PAGE_ALIGN(req->hypervisor_size) + SZ_4M; + + pages = kmalloc_array(size >> PAGE_SHIFT, sizeof(*pages), GFP_KERNEL); + if (!pages) + goto out; + + gdev->guest_mappings_dummy_page = alloc_page(GFP_HIGHUSER); + if (!gdev->guest_mappings_dummy_page) + goto out; + + for (i = 0; i < (size >> PAGE_SHIFT); i++) + pages[i] = gdev->guest_mappings_dummy_page; + + /* + * Try several times, the VMM might not accept some addresses because + * of address clashes between the three contexts. + */ + for (i = 0; i < GUEST_MAPPINGS_TRIES; i++) { + guest_mappings[i] = vmap(pages, (size >> PAGE_SHIFT), + VM_MAP, PAGE_KERNEL_RO); + if (!guest_mappings[i]) + break; + + req->header.request_type = VMMDEVREQ_SET_HYPERVISOR_INFO; + req->header.rc = VERR_INTERNAL_ERROR; + req->hypervisor_size = hypervisor_size; + req->hypervisor_start = + (unsigned long)PTR_ALIGN(guest_mappings[i], SZ_4M); + + rc = vbg_req_perform(gdev, req); + if (rc >= 0) { + gdev->guest_mappings = guest_mappings[i]; + break; + } + } + + /* Free vmap's from failed attempts. */ + while (--i >= 0) + vunmap(guest_mappings[i]); + + /* On failure free the dummy-page backing the vmap */ + if (!gdev->guest_mappings) { + __free_page(gdev->guest_mappings_dummy_page); + gdev->guest_mappings_dummy_page = NULL; + } + +out: + vbg_req_free(req, sizeof(*req)); + kfree(pages); +} + +/** + * Undo what vbg_guest_mappings_init did. + * + * @gdev: The Guest extension device. + */ +static void vbg_guest_mappings_exit(struct vbg_dev *gdev) +{ + struct vmmdev_hypervisorinfo *req; + int rc; + + if (!gdev->guest_mappings) + return; + + /* + * Tell the host that we're going to free the memory we reserved for + * it, the free it up. (Leak the memory if anything goes wrong here.) + */ + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_HYPERVISOR_INFO); + if (!req) + return; + + req->hypervisor_start = 0; + req->hypervisor_size = 0; + + rc = vbg_req_perform(gdev, req); + + vbg_req_free(req, sizeof(*req)); + + if (rc < 0) { + vbg_err("%s error: %d\n", __func__, rc); + return; + } + + vunmap(gdev->guest_mappings); + gdev->guest_mappings = NULL; + + __free_page(gdev->guest_mappings_dummy_page); + gdev->guest_mappings_dummy_page = NULL; +} + +/** + * Report the guest information to the host. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + */ +static int vbg_report_guest_info(struct vbg_dev *gdev) +{ + /* + * Allocate and fill in the two guest info reports. + */ + struct vmmdev_guest_info *req1 = NULL; + struct vmmdev_guest_info2 *req2 = NULL; + int rc, ret = -ENOMEM; + + req1 = vbg_req_alloc(sizeof(*req1), VMMDEVREQ_REPORT_GUEST_INFO); + req2 = vbg_req_alloc(sizeof(*req2), VMMDEVREQ_REPORT_GUEST_INFO2); + if (!req1 || !req2) + goto out_free; + + req1->interface_version = VMMDEV_VERSION; + req1->os_type = VMMDEV_OSTYPE_LINUX26; +#if __BITS_PER_LONG == 64 + req1->os_type |= VMMDEV_OSTYPE_X64; +#endif + + req2->additions_major = VBG_VERSION_MAJOR; + req2->additions_minor = VBG_VERSION_MINOR; + req2->additions_build = VBG_VERSION_BUILD; + req2->additions_revision = VBG_SVN_REV; + /* (no features defined yet) */ + req2->additions_features = 0; + strlcpy(req2->name, VBG_VERSION_STRING, + sizeof(req2->name)); + + /* + * There are two protocols here: + * 1. INFO2 + INFO1. Supported by >=3.2.51. + * 2. INFO1 and optionally INFO2. The old protocol. + * + * We try protocol 2 first. It will fail with VERR_NOT_SUPPORTED + * if not supported by the VMMDev (message ordering requirement). + */ + rc = vbg_req_perform(gdev, req2); + if (rc >= 0) { + rc = vbg_req_perform(gdev, req1); + } else if (rc == VERR_NOT_SUPPORTED || rc == VERR_NOT_IMPLEMENTED) { + rc = vbg_req_perform(gdev, req1); + if (rc >= 0) { + rc = vbg_req_perform(gdev, req2); + if (rc == VERR_NOT_IMPLEMENTED) + rc = VINF_SUCCESS; + } + } + ret = vbg_status_code_to_errno(rc); + +out_free: + vbg_req_free(req2, sizeof(*req2)); + vbg_req_free(req1, sizeof(*req1)); + return ret; +} + +/** + * Report the guest driver status to the host. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @active: Flag whether the driver is now active or not. + */ +static int vbg_report_driver_status(struct vbg_dev *gdev, bool active) +{ + struct vmmdev_guest_status *req; + int rc; + + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_REPORT_GUEST_STATUS); + if (!req) + return -ENOMEM; + + req->facility = VBOXGUEST_FACILITY_TYPE_VBOXGUEST_DRIVER; + if (active) + req->status = VBOXGUEST_FACILITY_STATUS_ACTIVE; + else + req->status = VBOXGUEST_FACILITY_STATUS_INACTIVE; + req->flags = 0; + + rc = vbg_req_perform(gdev, req); + if (rc == VERR_NOT_IMPLEMENTED) /* Compatibility with older hosts. */ + rc = VINF_SUCCESS; + + vbg_req_free(req, sizeof(*req)); + + return vbg_status_code_to_errno(rc); +} + +/** + * Inflate the balloon by one chunk. The caller owns the balloon mutex. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @chunk_idx: Index of the chunk. + */ +static int vbg_balloon_inflate(struct vbg_dev *gdev, u32 chunk_idx) +{ + struct vmmdev_memballoon_change *req = gdev->mem_balloon.change_req; + struct page **pages; + int i, rc, ret; + + pages = kmalloc_array(VMMDEV_MEMORY_BALLOON_CHUNK_PAGES, + sizeof(*pages), + GFP_KERNEL | __GFP_NOWARN); + if (!pages) + return -ENOMEM; + + req->header.size = sizeof(*req); + req->inflate = true; + req->pages = VMMDEV_MEMORY_BALLOON_CHUNK_PAGES; + + for (i = 0; i < VMMDEV_MEMORY_BALLOON_CHUNK_PAGES; i++) { + pages[i] = alloc_page(GFP_KERNEL | __GFP_NOWARN); + if (!pages[i]) { + ret = -ENOMEM; + goto out_error; + } + + req->phys_page[i] = page_to_phys(pages[i]); + } + + rc = vbg_req_perform(gdev, req); + if (rc < 0) { + vbg_err("%s error, rc: %d\n", __func__, rc); + ret = vbg_status_code_to_errno(rc); + goto out_error; + } + + gdev->mem_balloon.pages[chunk_idx] = pages; + + return 0; + +out_error: + while (--i >= 0) + __free_page(pages[i]); + kfree(pages); + + return ret; +} + +/** + * Deflate the balloon by one chunk. The caller owns the balloon mutex. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @chunk_idx: Index of the chunk. + */ +static int vbg_balloon_deflate(struct vbg_dev *gdev, u32 chunk_idx) +{ + struct vmmdev_memballoon_change *req = gdev->mem_balloon.change_req; + struct page **pages = gdev->mem_balloon.pages[chunk_idx]; + int i, rc; + + req->header.size = sizeof(*req); + req->inflate = false; + req->pages = VMMDEV_MEMORY_BALLOON_CHUNK_PAGES; + + for (i = 0; i < VMMDEV_MEMORY_BALLOON_CHUNK_PAGES; i++) + req->phys_page[i] = page_to_phys(pages[i]); + + rc = vbg_req_perform(gdev, req); + if (rc < 0) { + vbg_err("%s error, rc: %d\n", __func__, rc); + return vbg_status_code_to_errno(rc); + } + + for (i = 0; i < VMMDEV_MEMORY_BALLOON_CHUNK_PAGES; i++) + __free_page(pages[i]); + kfree(pages); + gdev->mem_balloon.pages[chunk_idx] = NULL; + + return 0; +} + +/** + * Respond to VMMDEV_EVENT_BALLOON_CHANGE_REQUEST events, query the size + * the host wants the balloon to be and adjust accordingly. + */ +static void vbg_balloon_work(struct work_struct *work) +{ + struct vbg_dev *gdev = + container_of(work, struct vbg_dev, mem_balloon.work); + struct vmmdev_memballoon_info *req = gdev->mem_balloon.get_req; + u32 i, chunks; + int rc, ret; + + /* + * Setting this bit means that we request the value from the host and + * change the guest memory balloon according to the returned value. + */ + req->event_ack = VMMDEV_EVENT_BALLOON_CHANGE_REQUEST; + rc = vbg_req_perform(gdev, req); + if (rc < 0) { + vbg_err("%s error, rc: %d)\n", __func__, rc); + return; + } + + /* + * The host always returns the same maximum amount of chunks, so + * we do this once. + */ + if (!gdev->mem_balloon.max_chunks) { + gdev->mem_balloon.pages = + devm_kcalloc(gdev->dev, req->phys_mem_chunks, + sizeof(struct page **), GFP_KERNEL); + if (!gdev->mem_balloon.pages) + return; + + gdev->mem_balloon.max_chunks = req->phys_mem_chunks; + } + + chunks = req->balloon_chunks; + if (chunks > gdev->mem_balloon.max_chunks) { + vbg_err("%s: illegal balloon size %u (max=%u)\n", + __func__, chunks, gdev->mem_balloon.max_chunks); + return; + } + + if (chunks > gdev->mem_balloon.chunks) { + /* inflate */ + for (i = gdev->mem_balloon.chunks; i < chunks; i++) { + ret = vbg_balloon_inflate(gdev, i); + if (ret < 0) + return; + + gdev->mem_balloon.chunks++; + } + } else { + /* deflate */ + for (i = gdev->mem_balloon.chunks; i-- > chunks;) { + ret = vbg_balloon_deflate(gdev, i); + if (ret < 0) + return; + + gdev->mem_balloon.chunks--; + } + } +} + +/** + * Callback for heartbeat timer. + */ +static void vbg_heartbeat_timer(struct timer_list *t) +{ + struct vbg_dev *gdev = from_timer(gdev, t, heartbeat_timer); + + vbg_req_perform(gdev, gdev->guest_heartbeat_req); + mod_timer(&gdev->heartbeat_timer, + msecs_to_jiffies(gdev->heartbeat_interval_ms)); +} + +/** + * Configure the host to check guest's heartbeat + * and get heartbeat interval from the host. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @enabled: Set true to enable guest heartbeat checks on host. + */ +static int vbg_heartbeat_host_config(struct vbg_dev *gdev, bool enabled) +{ + struct vmmdev_heartbeat *req; + int rc; + + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_HEARTBEAT_CONFIGURE); + if (!req) + return -ENOMEM; + + req->enabled = enabled; + req->interval_ns = 0; + rc = vbg_req_perform(gdev, req); + do_div(req->interval_ns, 1000000); /* ns -> ms */ + gdev->heartbeat_interval_ms = req->interval_ns; + vbg_req_free(req, sizeof(*req)); + + return vbg_status_code_to_errno(rc); +} + +/** + * Initializes the heartbeat timer. This feature may be disabled by the host. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + */ +static int vbg_heartbeat_init(struct vbg_dev *gdev) +{ + int ret; + + /* Make sure that heartbeat checking is disabled if we fail. */ + ret = vbg_heartbeat_host_config(gdev, false); + if (ret < 0) + return ret; + + ret = vbg_heartbeat_host_config(gdev, true); + if (ret < 0) + return ret; + + gdev->guest_heartbeat_req = vbg_req_alloc( + sizeof(*gdev->guest_heartbeat_req), + VMMDEVREQ_GUEST_HEARTBEAT); + if (!gdev->guest_heartbeat_req) + return -ENOMEM; + + vbg_info("%s: Setting up heartbeat to trigger every %d milliseconds\n", + __func__, gdev->heartbeat_interval_ms); + mod_timer(&gdev->heartbeat_timer, 0); + + return 0; +} + +/** + * Cleanup hearbeat code, stop HB timer and disable host heartbeat checking. + * @gdev: The Guest extension device. + */ +static void vbg_heartbeat_exit(struct vbg_dev *gdev) +{ + del_timer_sync(&gdev->heartbeat_timer); + vbg_heartbeat_host_config(gdev, false); + vbg_req_free(gdev->guest_heartbeat_req, + sizeof(*gdev->guest_heartbeat_req)); +} + +/** + * Applies a change to the bit usage tracker. + * Return: true if the mask changed, false if not. + * @tracker: The bit usage tracker. + * @changed: The bits to change. + * @previous: The previous value of the bits. + */ +static bool vbg_track_bit_usage(struct vbg_bit_usage_tracker *tracker, + u32 changed, u32 previous) +{ + bool global_change = false; + + while (changed) { + u32 bit = ffs(changed) - 1; + u32 bitmask = BIT(bit); + + if (bitmask & previous) { + tracker->per_bit_usage[bit] -= 1; + if (tracker->per_bit_usage[bit] == 0) { + global_change = true; + tracker->mask &= ~bitmask; + } + } else { + tracker->per_bit_usage[bit] += 1; + if (tracker->per_bit_usage[bit] == 1) { + global_change = true; + tracker->mask |= bitmask; + } + } + + changed &= ~bitmask; + } + + return global_change; +} + +/** + * Init and termination worker for resetting the (host) event filter on the host + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @fixed_events: Fixed events (init time). + */ +static int vbg_reset_host_event_filter(struct vbg_dev *gdev, + u32 fixed_events) +{ + struct vmmdev_mask *req; + int rc; + + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_CTL_GUEST_FILTER_MASK); + if (!req) + return -ENOMEM; + + req->not_mask = U32_MAX & ~fixed_events; + req->or_mask = fixed_events; + rc = vbg_req_perform(gdev, req); + if (rc < 0) + vbg_err("%s error, rc: %d\n", __func__, rc); + + vbg_req_free(req, sizeof(*req)); + return vbg_status_code_to_errno(rc); +} + +/** + * Changes the event filter mask for the given session. + * + * This is called in response to VBG_IOCTL_CHANGE_FILTER_MASK as well as to + * do session cleanup. Takes the session spinlock. + * + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @session: The session. + * @or_mask: The events to add. + * @not_mask: The events to remove. + * @session_termination: Set if we're called by the session cleanup code. + * This tweaks the error handling so we perform + * proper session cleanup even if the host + * misbehaves. + */ +static int vbg_set_session_event_filter(struct vbg_dev *gdev, + struct vbg_session *session, + u32 or_mask, u32 not_mask, + bool session_termination) +{ + struct vmmdev_mask *req; + u32 changed, previous; + int rc, ret = 0; + + /* Allocate a request buffer before taking the spinlock */ + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_CTL_GUEST_FILTER_MASK); + if (!req) { + if (!session_termination) + return -ENOMEM; + /* Ignore allocation failure, we must do session cleanup. */ + } + + mutex_lock(&gdev->session_mutex); + + /* Apply the changes to the session mask. */ + previous = session->event_filter; + session->event_filter |= or_mask; + session->event_filter &= ~not_mask; + + /* If anything actually changed, update the global usage counters. */ + changed = previous ^ session->event_filter; + if (!changed) + goto out; + + vbg_track_bit_usage(&gdev->event_filter_tracker, changed, previous); + or_mask = gdev->fixed_events | gdev->event_filter_tracker.mask; + + if (gdev->event_filter_host == or_mask || !req) + goto out; + + gdev->event_filter_host = or_mask; + req->or_mask = or_mask; + req->not_mask = ~or_mask; + rc = vbg_req_perform(gdev, req); + if (rc < 0) { + ret = vbg_status_code_to_errno(rc); + + /* Failed, roll back (unless it's session termination time). */ + gdev->event_filter_host = U32_MAX; + if (session_termination) + goto out; + + vbg_track_bit_usage(&gdev->event_filter_tracker, changed, + session->event_filter); + session->event_filter = previous; + } + +out: + mutex_unlock(&gdev->session_mutex); + vbg_req_free(req, sizeof(*req)); + + return ret; +} + +/** + * Init and termination worker for set guest capabilities to zero on the host. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + */ +static int vbg_reset_host_capabilities(struct vbg_dev *gdev) +{ + struct vmmdev_mask *req; + int rc; + + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_GUEST_CAPABILITIES); + if (!req) + return -ENOMEM; + + req->not_mask = U32_MAX; + req->or_mask = 0; + rc = vbg_req_perform(gdev, req); + if (rc < 0) + vbg_err("%s error, rc: %d\n", __func__, rc); + + vbg_req_free(req, sizeof(*req)); + return vbg_status_code_to_errno(rc); +} + +/** + * Sets the guest capabilities for a session. Takes the session spinlock. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @session: The session. + * @or_mask: The capabilities to add. + * @not_mask: The capabilities to remove. + * @session_termination: Set if we're called by the session cleanup code. + * This tweaks the error handling so we perform + * proper session cleanup even if the host + * misbehaves. + */ +static int vbg_set_session_capabilities(struct vbg_dev *gdev, + struct vbg_session *session, + u32 or_mask, u32 not_mask, + bool session_termination) +{ + struct vmmdev_mask *req; + u32 changed, previous; + int rc, ret = 0; + + /* Allocate a request buffer before taking the spinlock */ + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_GUEST_CAPABILITIES); + if (!req) { + if (!session_termination) + return -ENOMEM; + /* Ignore allocation failure, we must do session cleanup. */ + } + + mutex_lock(&gdev->session_mutex); + + /* Apply the changes to the session mask. */ + previous = session->guest_caps; + session->guest_caps |= or_mask; + session->guest_caps &= ~not_mask; + + /* If anything actually changed, update the global usage counters. */ + changed = previous ^ session->guest_caps; + if (!changed) + goto out; + + vbg_track_bit_usage(&gdev->guest_caps_tracker, changed, previous); + or_mask = gdev->guest_caps_tracker.mask; + + if (gdev->guest_caps_host == or_mask || !req) + goto out; + + gdev->guest_caps_host = or_mask; + req->or_mask = or_mask; + req->not_mask = ~or_mask; + rc = vbg_req_perform(gdev, req); + if (rc < 0) { + ret = vbg_status_code_to_errno(rc); + + /* Failed, roll back (unless it's session termination time). */ + gdev->guest_caps_host = U32_MAX; + if (session_termination) + goto out; + + vbg_track_bit_usage(&gdev->guest_caps_tracker, changed, + session->guest_caps); + session->guest_caps = previous; + } + +out: + mutex_unlock(&gdev->session_mutex); + vbg_req_free(req, sizeof(*req)); + + return ret; +} + +/** + * vbg_query_host_version get the host feature mask and version information. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + */ +static int vbg_query_host_version(struct vbg_dev *gdev) +{ + struct vmmdev_host_version *req; + int rc, ret; + + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_GET_HOST_VERSION); + if (!req) + return -ENOMEM; + + rc = vbg_req_perform(gdev, req); + ret = vbg_status_code_to_errno(rc); + if (ret) { + vbg_err("%s error: %d\n", __func__, rc); + goto out; + } + + snprintf(gdev->host_version, sizeof(gdev->host_version), "%u.%u.%ur%u", + req->major, req->minor, req->build, req->revision); + gdev->host_features = req->features; + + vbg_info("vboxguest: host-version: %s %#x\n", gdev->host_version, + gdev->host_features); + + if (!(req->features & VMMDEV_HVF_HGCM_PHYS_PAGE_LIST)) { + vbg_err("vboxguest: Error host too old (does not support page-lists)\n"); + ret = -ENODEV; + } + +out: + vbg_req_free(req, sizeof(*req)); + return ret; +} + +/** + * Initializes the VBoxGuest device extension when the + * device driver is loaded. + * + * The native code locates the VMMDev on the PCI bus and retrieve + * the MMIO and I/O port ranges, this function will take care of + * mapping the MMIO memory (if present). Upon successful return + * the native code should set up the interrupt handler. + * + * Return: 0 or negative errno value. + * + * @gdev: The Guest extension device. + * @fixed_events: Events that will be enabled upon init and no client + * will ever be allowed to mask. + */ +int vbg_core_init(struct vbg_dev *gdev, u32 fixed_events) +{ + int ret = -ENOMEM; + + gdev->fixed_events = fixed_events | VMMDEV_EVENT_HGCM; + gdev->event_filter_host = U32_MAX; /* forces a report */ + gdev->guest_caps_host = U32_MAX; /* forces a report */ + + init_waitqueue_head(&gdev->event_wq); + init_waitqueue_head(&gdev->hgcm_wq); + spin_lock_init(&gdev->event_spinlock); + mutex_init(&gdev->session_mutex); + mutex_init(&gdev->cancel_req_mutex); + timer_setup(&gdev->heartbeat_timer, vbg_heartbeat_timer, 0); + INIT_WORK(&gdev->mem_balloon.work, vbg_balloon_work); + + gdev->mem_balloon.get_req = + vbg_req_alloc(sizeof(*gdev->mem_balloon.get_req), + VMMDEVREQ_GET_MEMBALLOON_CHANGE_REQ); + gdev->mem_balloon.change_req = + vbg_req_alloc(sizeof(*gdev->mem_balloon.change_req), + VMMDEVREQ_CHANGE_MEMBALLOON); + gdev->cancel_req = + vbg_req_alloc(sizeof(*(gdev->cancel_req)), + VMMDEVREQ_HGCM_CANCEL2); + gdev->ack_events_req = + vbg_req_alloc(sizeof(*gdev->ack_events_req), + VMMDEVREQ_ACKNOWLEDGE_EVENTS); + gdev->mouse_status_req = + vbg_req_alloc(sizeof(*gdev->mouse_status_req), + VMMDEVREQ_GET_MOUSE_STATUS); + + if (!gdev->mem_balloon.get_req || !gdev->mem_balloon.change_req || + !gdev->cancel_req || !gdev->ack_events_req || + !gdev->mouse_status_req) + goto err_free_reqs; + + ret = vbg_query_host_version(gdev); + if (ret) + goto err_free_reqs; + + ret = vbg_report_guest_info(gdev); + if (ret) { + vbg_err("vboxguest: vbg_report_guest_info error: %d\n", ret); + goto err_free_reqs; + } + + ret = vbg_reset_host_event_filter(gdev, gdev->fixed_events); + if (ret) { + vbg_err("vboxguest: Error setting fixed event filter: %d\n", + ret); + goto err_free_reqs; + } + + ret = vbg_reset_host_capabilities(gdev); + if (ret) { + vbg_err("vboxguest: Error clearing guest capabilities: %d\n", + ret); + goto err_free_reqs; + } + + ret = vbg_core_set_mouse_status(gdev, 0); + if (ret) { + vbg_err("vboxguest: Error clearing mouse status: %d\n", ret); + goto err_free_reqs; + } + + /* These may fail without requiring the driver init to fail. */ + vbg_guest_mappings_init(gdev); + vbg_heartbeat_init(gdev); + + /* All Done! */ + ret = vbg_report_driver_status(gdev, true); + if (ret < 0) + vbg_err("vboxguest: Error reporting driver status: %d\n", ret); + + return 0; + +err_free_reqs: + vbg_req_free(gdev->mouse_status_req, + sizeof(*gdev->mouse_status_req)); + vbg_req_free(gdev->ack_events_req, + sizeof(*gdev->ack_events_req)); + vbg_req_free(gdev->cancel_req, + sizeof(*gdev->cancel_req)); + vbg_req_free(gdev->mem_balloon.change_req, + sizeof(*gdev->mem_balloon.change_req)); + vbg_req_free(gdev->mem_balloon.get_req, + sizeof(*gdev->mem_balloon.get_req)); + return ret; +} + +/** + * Call this on exit to clean-up vboxguest-core managed resources. + * + * The native code should call this before the driver is loaded, + * but don't call this on shutdown. + * @gdev: The Guest extension device. + */ +void vbg_core_exit(struct vbg_dev *gdev) +{ + vbg_heartbeat_exit(gdev); + vbg_guest_mappings_exit(gdev); + + /* Clear the host flags (mouse status etc). */ + vbg_reset_host_event_filter(gdev, 0); + vbg_reset_host_capabilities(gdev); + vbg_core_set_mouse_status(gdev, 0); + + vbg_req_free(gdev->mouse_status_req, + sizeof(*gdev->mouse_status_req)); + vbg_req_free(gdev->ack_events_req, + sizeof(*gdev->ack_events_req)); + vbg_req_free(gdev->cancel_req, + sizeof(*gdev->cancel_req)); + vbg_req_free(gdev->mem_balloon.change_req, + sizeof(*gdev->mem_balloon.change_req)); + vbg_req_free(gdev->mem_balloon.get_req, + sizeof(*gdev->mem_balloon.get_req)); +} + +/** + * Creates a VBoxGuest user session. + * + * vboxguest_linux.c calls this when userspace opens the char-device. + * Return: A pointer to the new session or an ERR_PTR on error. + * @gdev: The Guest extension device. + * @user: Set if this is a session for the vboxuser device. + */ +struct vbg_session *vbg_core_open_session(struct vbg_dev *gdev, bool user) +{ + struct vbg_session *session; + + session = kzalloc(sizeof(*session), GFP_KERNEL); + if (!session) + return ERR_PTR(-ENOMEM); + + session->gdev = gdev; + session->user_session = user; + + return session; +} + +/** + * Closes a VBoxGuest session. + * @session: The session to close (and free). + */ +void vbg_core_close_session(struct vbg_session *session) +{ + struct vbg_dev *gdev = session->gdev; + int i, rc; + + vbg_set_session_capabilities(gdev, session, 0, U32_MAX, true); + vbg_set_session_event_filter(gdev, session, 0, U32_MAX, true); + + for (i = 0; i < ARRAY_SIZE(session->hgcm_client_ids); i++) { + if (!session->hgcm_client_ids[i]) + continue; + + vbg_hgcm_disconnect(gdev, session->hgcm_client_ids[i], &rc); + } + + kfree(session); +} + +static int vbg_ioctl_chk(struct vbg_ioctl_hdr *hdr, size_t in_size, + size_t out_size) +{ + if (hdr->size_in != (sizeof(*hdr) + in_size) || + hdr->size_out != (sizeof(*hdr) + out_size)) + return -EINVAL; + + return 0; +} + +static int vbg_ioctl_driver_version_info( + struct vbg_ioctl_driver_version_info *info) +{ + const u16 vbg_maj_version = VBG_IOC_VERSION >> 16; + u16 min_maj_version, req_maj_version; + + if (vbg_ioctl_chk(&info->hdr, sizeof(info->u.in), sizeof(info->u.out))) + return -EINVAL; + + req_maj_version = info->u.in.req_version >> 16; + min_maj_version = info->u.in.min_version >> 16; + + if (info->u.in.min_version > info->u.in.req_version || + min_maj_version != req_maj_version) + return -EINVAL; + + if (info->u.in.min_version <= VBG_IOC_VERSION && + min_maj_version == vbg_maj_version) { + info->u.out.session_version = VBG_IOC_VERSION; + } else { + info->u.out.session_version = U32_MAX; + info->hdr.rc = VERR_VERSION_MISMATCH; + } + + info->u.out.driver_version = VBG_IOC_VERSION; + info->u.out.driver_revision = 0; + info->u.out.reserved1 = 0; + info->u.out.reserved2 = 0; + + return 0; +} + +static bool vbg_wait_event_cond(struct vbg_dev *gdev, + struct vbg_session *session, + u32 event_mask) +{ + unsigned long flags; + bool wakeup; + u32 events; + + spin_lock_irqsave(&gdev->event_spinlock, flags); + + events = gdev->pending_events & event_mask; + wakeup = events || session->cancel_waiters; + + spin_unlock_irqrestore(&gdev->event_spinlock, flags); + + return wakeup; +} + +/* Must be called with the event_lock held */ +static u32 vbg_consume_events_locked(struct vbg_dev *gdev, + struct vbg_session *session, + u32 event_mask) +{ + u32 events = gdev->pending_events & event_mask; + + gdev->pending_events &= ~events; + return events; +} + +static int vbg_ioctl_wait_for_events(struct vbg_dev *gdev, + struct vbg_session *session, + struct vbg_ioctl_wait_for_events *wait) +{ + u32 timeout_ms = wait->u.in.timeout_ms; + u32 event_mask = wait->u.in.events; + unsigned long flags; + long timeout; + int ret = 0; + + if (vbg_ioctl_chk(&wait->hdr, sizeof(wait->u.in), sizeof(wait->u.out))) + return -EINVAL; + + if (timeout_ms == U32_MAX) + timeout = MAX_SCHEDULE_TIMEOUT; + else + timeout = msecs_to_jiffies(timeout_ms); + + wait->u.out.events = 0; + do { + timeout = wait_event_interruptible_timeout( + gdev->event_wq, + vbg_wait_event_cond(gdev, session, event_mask), + timeout); + + spin_lock_irqsave(&gdev->event_spinlock, flags); + + if (timeout < 0 || session->cancel_waiters) { + ret = -EINTR; + } else if (timeout == 0) { + ret = -ETIMEDOUT; + } else { + wait->u.out.events = + vbg_consume_events_locked(gdev, session, event_mask); + } + + spin_unlock_irqrestore(&gdev->event_spinlock, flags); + + /* + * Someone else may have consumed the event(s) first, in + * which case we go back to waiting. + */ + } while (ret == 0 && wait->u.out.events == 0); + + return ret; +} + +static int vbg_ioctl_interrupt_all_wait_events(struct vbg_dev *gdev, + struct vbg_session *session, + struct vbg_ioctl_hdr *hdr) +{ + unsigned long flags; + + if (hdr->size_in != sizeof(*hdr) || hdr->size_out != sizeof(*hdr)) + return -EINVAL; + + spin_lock_irqsave(&gdev->event_spinlock, flags); + session->cancel_waiters = true; + spin_unlock_irqrestore(&gdev->event_spinlock, flags); + + wake_up(&gdev->event_wq); + + return 0; +} + +/** + * Checks if the VMM request is allowed in the context of the given session. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @session: The calling session. + * @req: The request. + */ +static int vbg_req_allowed(struct vbg_dev *gdev, struct vbg_session *session, + const struct vmmdev_request_header *req) +{ + const struct vmmdev_guest_status *guest_status; + bool trusted_apps_only; + + switch (req->request_type) { + /* Trusted users apps only. */ + case VMMDEVREQ_QUERY_CREDENTIALS: + case VMMDEVREQ_REPORT_CREDENTIALS_JUDGEMENT: + case VMMDEVREQ_REGISTER_SHARED_MODULE: + case VMMDEVREQ_UNREGISTER_SHARED_MODULE: + case VMMDEVREQ_WRITE_COREDUMP: + case VMMDEVREQ_GET_CPU_HOTPLUG_REQ: + case VMMDEVREQ_SET_CPU_HOTPLUG_STATUS: + case VMMDEVREQ_CHECK_SHARED_MODULES: + case VMMDEVREQ_GET_PAGE_SHARING_STATUS: + case VMMDEVREQ_DEBUG_IS_PAGE_SHARED: + case VMMDEVREQ_REPORT_GUEST_STATS: + case VMMDEVREQ_REPORT_GUEST_USER_STATE: + case VMMDEVREQ_GET_STATISTICS_CHANGE_REQ: + trusted_apps_only = true; + break; + + /* Anyone. */ + case VMMDEVREQ_GET_MOUSE_STATUS: + case VMMDEVREQ_SET_MOUSE_STATUS: + case VMMDEVREQ_SET_POINTER_SHAPE: + case VMMDEVREQ_GET_HOST_VERSION: + case VMMDEVREQ_IDLE: + case VMMDEVREQ_GET_HOST_TIME: + case VMMDEVREQ_SET_POWER_STATUS: + case VMMDEVREQ_ACKNOWLEDGE_EVENTS: + case VMMDEVREQ_CTL_GUEST_FILTER_MASK: + case VMMDEVREQ_REPORT_GUEST_STATUS: + case VMMDEVREQ_GET_DISPLAY_CHANGE_REQ: + case VMMDEVREQ_VIDEMODE_SUPPORTED: + case VMMDEVREQ_GET_HEIGHT_REDUCTION: + case VMMDEVREQ_GET_DISPLAY_CHANGE_REQ2: + case VMMDEVREQ_VIDEMODE_SUPPORTED2: + case VMMDEVREQ_VIDEO_ACCEL_ENABLE: + case VMMDEVREQ_VIDEO_ACCEL_FLUSH: + case VMMDEVREQ_VIDEO_SET_VISIBLE_REGION: + case VMMDEVREQ_GET_DISPLAY_CHANGE_REQEX: + case VMMDEVREQ_GET_SEAMLESS_CHANGE_REQ: + case VMMDEVREQ_GET_VRDPCHANGE_REQ: + case VMMDEVREQ_LOG_STRING: + case VMMDEVREQ_GET_SESSION_ID: + trusted_apps_only = false; + break; + + /* Depends on the request parameters... */ + case VMMDEVREQ_REPORT_GUEST_CAPABILITIES: + guest_status = (const struct vmmdev_guest_status *)req; + switch (guest_status->facility) { + case VBOXGUEST_FACILITY_TYPE_ALL: + case VBOXGUEST_FACILITY_TYPE_VBOXGUEST_DRIVER: + vbg_err("Denying userspace vmm report guest cap. call facility %#08x\n", + guest_status->facility); + return -EPERM; + case VBOXGUEST_FACILITY_TYPE_VBOX_SERVICE: + trusted_apps_only = true; + break; + case VBOXGUEST_FACILITY_TYPE_VBOX_TRAY_CLIENT: + case VBOXGUEST_FACILITY_TYPE_SEAMLESS: + case VBOXGUEST_FACILITY_TYPE_GRAPHICS: + default: + trusted_apps_only = false; + break; + } + break; + + /* Anything else is not allowed. */ + default: + vbg_err("Denying userspace vmm call type %#08x\n", + req->request_type); + return -EPERM; + } + + if (trusted_apps_only && session->user_session) { + vbg_err("Denying userspace vmm call type %#08x through vboxuser device node\n", + req->request_type); + return -EPERM; + } + + return 0; +} + +static int vbg_ioctl_vmmrequest(struct vbg_dev *gdev, + struct vbg_session *session, void *data) +{ + struct vbg_ioctl_hdr *hdr = data; + int ret; + + if (hdr->size_in != hdr->size_out) + return -EINVAL; + + if (hdr->size_in > VMMDEV_MAX_VMMDEVREQ_SIZE) + return -E2BIG; + + if (hdr->type == VBG_IOCTL_HDR_TYPE_DEFAULT) + return -EINVAL; + + ret = vbg_req_allowed(gdev, session, data); + if (ret < 0) + return ret; + + vbg_req_perform(gdev, data); + WARN_ON(hdr->rc == VINF_HGCM_ASYNC_EXECUTE); + + return 0; +} + +static int vbg_ioctl_hgcm_connect(struct vbg_dev *gdev, + struct vbg_session *session, + struct vbg_ioctl_hgcm_connect *conn) +{ + u32 client_id; + int i, ret; + + if (vbg_ioctl_chk(&conn->hdr, sizeof(conn->u.in), sizeof(conn->u.out))) + return -EINVAL; + + /* Find a free place in the sessions clients array and claim it */ + mutex_lock(&gdev->session_mutex); + for (i = 0; i < ARRAY_SIZE(session->hgcm_client_ids); i++) { + if (!session->hgcm_client_ids[i]) { + session->hgcm_client_ids[i] = U32_MAX; + break; + } + } + mutex_unlock(&gdev->session_mutex); + + if (i >= ARRAY_SIZE(session->hgcm_client_ids)) + return -EMFILE; + + ret = vbg_hgcm_connect(gdev, &conn->u.in.loc, &client_id, + &conn->hdr.rc); + + mutex_lock(&gdev->session_mutex); + if (ret == 0 && conn->hdr.rc >= 0) { + conn->u.out.client_id = client_id; + session->hgcm_client_ids[i] = client_id; + } else { + conn->u.out.client_id = 0; + session->hgcm_client_ids[i] = 0; + } + mutex_unlock(&gdev->session_mutex); + + return ret; +} + +static int vbg_ioctl_hgcm_disconnect(struct vbg_dev *gdev, + struct vbg_session *session, + struct vbg_ioctl_hgcm_disconnect *disconn) +{ + u32 client_id; + int i, ret; + + if (vbg_ioctl_chk(&disconn->hdr, sizeof(disconn->u.in), 0)) + return -EINVAL; + + client_id = disconn->u.in.client_id; + if (client_id == 0 || client_id == U32_MAX) + return -EINVAL; + + mutex_lock(&gdev->session_mutex); + for (i = 0; i < ARRAY_SIZE(session->hgcm_client_ids); i++) { + if (session->hgcm_client_ids[i] == client_id) { + session->hgcm_client_ids[i] = U32_MAX; + break; + } + } + mutex_unlock(&gdev->session_mutex); + + if (i >= ARRAY_SIZE(session->hgcm_client_ids)) + return -EINVAL; + + ret = vbg_hgcm_disconnect(gdev, client_id, &disconn->hdr.rc); + + mutex_lock(&gdev->session_mutex); + if (ret == 0 && disconn->hdr.rc >= 0) + session->hgcm_client_ids[i] = 0; + else + session->hgcm_client_ids[i] = client_id; + mutex_unlock(&gdev->session_mutex); + + return ret; +} + +static bool vbg_param_valid(enum vmmdev_hgcm_function_parameter_type type) +{ + switch (type) { + case VMMDEV_HGCM_PARM_TYPE_32BIT: + case VMMDEV_HGCM_PARM_TYPE_64BIT: + case VMMDEV_HGCM_PARM_TYPE_LINADDR: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_IN: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_OUT: + return true; + default: + return false; + } +} + +static int vbg_ioctl_hgcm_call(struct vbg_dev *gdev, + struct vbg_session *session, bool f32bit, + struct vbg_ioctl_hgcm_call *call) +{ + size_t actual_size; + u32 client_id; + int i, ret; + + if (call->hdr.size_in < sizeof(*call)) + return -EINVAL; + + if (call->hdr.size_in != call->hdr.size_out) + return -EINVAL; + + if (call->parm_count > VMMDEV_HGCM_MAX_PARMS) + return -E2BIG; + + client_id = call->client_id; + if (client_id == 0 || client_id == U32_MAX) + return -EINVAL; + + actual_size = sizeof(*call); + if (f32bit) + actual_size += call->parm_count * + sizeof(struct vmmdev_hgcm_function_parameter32); + else + actual_size += call->parm_count * + sizeof(struct vmmdev_hgcm_function_parameter); + if (call->hdr.size_in < actual_size) { + vbg_debug("VBG_IOCTL_HGCM_CALL: hdr.size_in %d required size is %zd\n", + call->hdr.size_in, actual_size); + return -EINVAL; + } + call->hdr.size_out = actual_size; + + /* Validate parameter types */ + if (f32bit) { + struct vmmdev_hgcm_function_parameter32 *parm = + VBG_IOCTL_HGCM_CALL_PARMS32(call); + + for (i = 0; i < call->parm_count; i++) + if (!vbg_param_valid(parm[i].type)) + return -EINVAL; + } else { + struct vmmdev_hgcm_function_parameter *parm = + VBG_IOCTL_HGCM_CALL_PARMS(call); + + for (i = 0; i < call->parm_count; i++) + if (!vbg_param_valid(parm[i].type)) + return -EINVAL; + } + + /* + * Validate the client id. + */ + mutex_lock(&gdev->session_mutex); + for (i = 0; i < ARRAY_SIZE(session->hgcm_client_ids); i++) + if (session->hgcm_client_ids[i] == client_id) + break; + mutex_unlock(&gdev->session_mutex); + if (i >= ARRAY_SIZE(session->hgcm_client_ids)) { + vbg_debug("VBG_IOCTL_HGCM_CALL: INVALID handle. u32Client=%#08x\n", + client_id); + return -EINVAL; + } + + if (IS_ENABLED(CONFIG_COMPAT) && f32bit) + ret = vbg_hgcm_call32(gdev, client_id, + call->function, call->timeout_ms, + VBG_IOCTL_HGCM_CALL_PARMS32(call), + call->parm_count, &call->hdr.rc); + else + ret = vbg_hgcm_call(gdev, client_id, + call->function, call->timeout_ms, + VBG_IOCTL_HGCM_CALL_PARMS(call), + call->parm_count, &call->hdr.rc); + + if (ret == -E2BIG) { + /* E2BIG needs to be reported through the hdr.rc field. */ + call->hdr.rc = VERR_OUT_OF_RANGE; + ret = 0; + } + + if (ret && ret != -EINTR && ret != -ETIMEDOUT) + vbg_err("VBG_IOCTL_HGCM_CALL error: %d\n", ret); + + return ret; +} + +static int vbg_ioctl_log(struct vbg_ioctl_log *log) +{ + if (log->hdr.size_out != sizeof(log->hdr)) + return -EINVAL; + + vbg_info("%.*s", (int)(log->hdr.size_in - sizeof(log->hdr)), + log->u.in.msg); + + return 0; +} + +static int vbg_ioctl_change_filter_mask(struct vbg_dev *gdev, + struct vbg_session *session, + struct vbg_ioctl_change_filter *filter) +{ + u32 or_mask, not_mask; + + if (vbg_ioctl_chk(&filter->hdr, sizeof(filter->u.in), 0)) + return -EINVAL; + + or_mask = filter->u.in.or_mask; + not_mask = filter->u.in.not_mask; + + if ((or_mask | not_mask) & ~VMMDEV_EVENT_VALID_EVENT_MASK) + return -EINVAL; + + return vbg_set_session_event_filter(gdev, session, or_mask, not_mask, + false); +} + +static int vbg_ioctl_change_guest_capabilities(struct vbg_dev *gdev, + struct vbg_session *session, struct vbg_ioctl_set_guest_caps *caps) +{ + u32 or_mask, not_mask; + int ret; + + if (vbg_ioctl_chk(&caps->hdr, sizeof(caps->u.in), sizeof(caps->u.out))) + return -EINVAL; + + or_mask = caps->u.in.or_mask; + not_mask = caps->u.in.not_mask; + + if ((or_mask | not_mask) & ~VMMDEV_GUEST_CAPABILITIES_MASK) + return -EINVAL; + + ret = vbg_set_session_capabilities(gdev, session, or_mask, not_mask, + false); + if (ret) + return ret; + + caps->u.out.session_caps = session->guest_caps; + caps->u.out.global_caps = gdev->guest_caps_host; + + return 0; +} + +static int vbg_ioctl_check_balloon(struct vbg_dev *gdev, + struct vbg_ioctl_check_balloon *balloon_info) +{ + if (vbg_ioctl_chk(&balloon_info->hdr, 0, sizeof(balloon_info->u.out))) + return -EINVAL; + + balloon_info->u.out.balloon_chunks = gdev->mem_balloon.chunks; + /* + * Under Linux we handle VMMDEV_EVENT_BALLOON_CHANGE_REQUEST + * events entirely in the kernel, see vbg_core_isr(). + */ + balloon_info->u.out.handle_in_r3 = false; + + return 0; +} + +static int vbg_ioctl_write_core_dump(struct vbg_dev *gdev, + struct vbg_ioctl_write_coredump *dump) +{ + struct vmmdev_write_core_dump *req; + + if (vbg_ioctl_chk(&dump->hdr, sizeof(dump->u.in), 0)) + return -EINVAL; + + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_WRITE_COREDUMP); + if (!req) + return -ENOMEM; + + req->flags = dump->u.in.flags; + dump->hdr.rc = vbg_req_perform(gdev, req); + + vbg_req_free(req, sizeof(*req)); + return 0; +} + +/** + * Common IOCtl for user to kernel communication. + * Return: 0 or negative errno value. + * @session: The client session. + * @req: The requested function. + * @data: The i/o data buffer, minimum size sizeof(struct vbg_ioctl_hdr). + */ +int vbg_core_ioctl(struct vbg_session *session, unsigned int req, void *data) +{ + unsigned int req_no_size = req & ~IOCSIZE_MASK; + struct vbg_dev *gdev = session->gdev; + struct vbg_ioctl_hdr *hdr = data; + bool f32bit = false; + + hdr->rc = VINF_SUCCESS; + if (!hdr->size_out) + hdr->size_out = hdr->size_in; + + /* + * hdr->version and hdr->size_in / hdr->size_out minimum size are + * already checked by vbg_misc_device_ioctl(). + */ + + /* For VMMDEV_REQUEST hdr->type != VBG_IOCTL_HDR_TYPE_DEFAULT */ + if (req_no_size == VBG_IOCTL_VMMDEV_REQUEST(0) || + req == VBG_IOCTL_VMMDEV_REQUEST_BIG || + req == VBG_IOCTL_VMMDEV_REQUEST_BIG_ALT) + return vbg_ioctl_vmmrequest(gdev, session, data); + + if (hdr->type != VBG_IOCTL_HDR_TYPE_DEFAULT) + return -EINVAL; + + /* Fixed size requests. */ + switch (req) { + case VBG_IOCTL_DRIVER_VERSION_INFO: + return vbg_ioctl_driver_version_info(data); + case VBG_IOCTL_HGCM_CONNECT: + return vbg_ioctl_hgcm_connect(gdev, session, data); + case VBG_IOCTL_HGCM_DISCONNECT: + return vbg_ioctl_hgcm_disconnect(gdev, session, data); + case VBG_IOCTL_WAIT_FOR_EVENTS: + return vbg_ioctl_wait_for_events(gdev, session, data); + case VBG_IOCTL_INTERRUPT_ALL_WAIT_FOR_EVENTS: + return vbg_ioctl_interrupt_all_wait_events(gdev, session, data); + case VBG_IOCTL_CHANGE_FILTER_MASK: + return vbg_ioctl_change_filter_mask(gdev, session, data); + case VBG_IOCTL_CHANGE_GUEST_CAPABILITIES: + return vbg_ioctl_change_guest_capabilities(gdev, session, data); + case VBG_IOCTL_CHECK_BALLOON: + return vbg_ioctl_check_balloon(gdev, data); + case VBG_IOCTL_WRITE_CORE_DUMP: + return vbg_ioctl_write_core_dump(gdev, data); + } + + /* Variable sized requests. */ + switch (req_no_size) { +#ifdef CONFIG_COMPAT + case VBG_IOCTL_HGCM_CALL_32(0): + f32bit = true; + /* Fall through */ +#endif + case VBG_IOCTL_HGCM_CALL(0): + return vbg_ioctl_hgcm_call(gdev, session, f32bit, data); + case VBG_IOCTL_LOG(0): + case VBG_IOCTL_LOG_ALT(0): + return vbg_ioctl_log(data); + } + + vbg_debug("VGDrvCommonIoCtl: Unknown req %#08x\n", req); + return -ENOTTY; +} + +/** + * Report guest supported mouse-features to the host. + * + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @features: The set of features to report to the host. + */ +int vbg_core_set_mouse_status(struct vbg_dev *gdev, u32 features) +{ + struct vmmdev_mouse_status *req; + int rc; + + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_MOUSE_STATUS); + if (!req) + return -ENOMEM; + + req->mouse_features = features; + req->pointer_pos_x = 0; + req->pointer_pos_y = 0; + + rc = vbg_req_perform(gdev, req); + if (rc < 0) + vbg_err("%s error, rc: %d\n", __func__, rc); + + vbg_req_free(req, sizeof(*req)); + return vbg_status_code_to_errno(rc); +} + +/** Core interrupt service routine. */ +irqreturn_t vbg_core_isr(int irq, void *dev_id) +{ + struct vbg_dev *gdev = dev_id; + struct vmmdev_events *req = gdev->ack_events_req; + bool mouse_position_changed = false; + unsigned long flags; + u32 events = 0; + int rc; + + if (!gdev->mmio->V.V1_04.have_events) + return IRQ_NONE; + + /* Get and acknowlegde events. */ + req->header.rc = VERR_INTERNAL_ERROR; + req->events = 0; + rc = vbg_req_perform(gdev, req); + if (rc < 0) { + vbg_err("Error performing events req, rc: %d\n", rc); + return IRQ_NONE; + } + + events = req->events; + + if (events & VMMDEV_EVENT_MOUSE_POSITION_CHANGED) { + mouse_position_changed = true; + events &= ~VMMDEV_EVENT_MOUSE_POSITION_CHANGED; + } + + if (events & VMMDEV_EVENT_HGCM) { + wake_up(&gdev->hgcm_wq); + events &= ~VMMDEV_EVENT_HGCM; + } + + if (events & VMMDEV_EVENT_BALLOON_CHANGE_REQUEST) { + schedule_work(&gdev->mem_balloon.work); + events &= ~VMMDEV_EVENT_BALLOON_CHANGE_REQUEST; + } + + if (events) { + spin_lock_irqsave(&gdev->event_spinlock, flags); + gdev->pending_events |= events; + spin_unlock_irqrestore(&gdev->event_spinlock, flags); + + wake_up(&gdev->event_wq); + } + + if (mouse_position_changed) + vbg_linux_mouse_event(gdev); + + return IRQ_HANDLED; +} diff --git a/drivers/virt/vboxguest/vboxguest_core.h b/drivers/virt/vboxguest/vboxguest_core.h new file mode 100644 index 000000000..61059e4d1 --- /dev/null +++ b/drivers/virt/vboxguest/vboxguest_core.h @@ -0,0 +1,198 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR CDDL-1.0) */ +/* Copyright (C) 2010-2016 Oracle Corporation */ + +#ifndef __VBOXGUEST_CORE_H__ +#define __VBOXGUEST_CORE_H__ + +#include <linux/input.h> +#include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/miscdevice.h> +#include <linux/spinlock.h> +#include <linux/wait.h> +#include <linux/workqueue.h> +#include <linux/vboxguest.h> +#include "vmmdev.h" + +/* + * The mainline kernel version (this version) of the vboxguest module + * contained a bug where it defined VBGL_IOCTL_VMMDEV_REQUEST_BIG and + * VBGL_IOCTL_LOG using _IOC(_IOC_READ | _IOC_WRITE, 'V', ...) instead + * of _IO(V, ...) as the out of tree VirtualBox upstream version does. + * + * These _ALT definitions keep compatibility with the wrong defines the + * mainline kernel version used for a while. + * Note the VirtualBox userspace bits have always been built against + * VirtualBox upstream's headers, so this is likely not necessary. But + * we must never break our ABI so we keep these around to be 100% sure. + */ +#define VBG_IOCTL_VMMDEV_REQUEST_BIG_ALT _IOC(_IOC_READ | _IOC_WRITE, 'V', 3, 0) +#define VBG_IOCTL_LOG_ALT(s) _IOC(_IOC_READ | _IOC_WRITE, 'V', 9, s) + +struct vbg_session; + +/** VBox guest memory balloon. */ +struct vbg_mem_balloon { + /** Work handling VMMDEV_EVENT_BALLOON_CHANGE_REQUEST events */ + struct work_struct work; + /** Pre-allocated vmmdev_memballoon_info req for query */ + struct vmmdev_memballoon_info *get_req; + /** Pre-allocated vmmdev_memballoon_change req for inflate / deflate */ + struct vmmdev_memballoon_change *change_req; + /** The current number of chunks in the balloon. */ + u32 chunks; + /** The maximum number of chunks in the balloon. */ + u32 max_chunks; + /** + * Array of pointers to page arrays. A page * array is allocated for + * each chunk when inflating, and freed when the deflating. + */ + struct page ***pages; +}; + +/** + * Per bit usage tracker for a u32 mask. + * + * Used for optimal handling of guest properties and event filter. + */ +struct vbg_bit_usage_tracker { + /** Per bit usage counters. */ + u32 per_bit_usage[32]; + /** The current mask according to per_bit_usage. */ + u32 mask; +}; + +/** VBox guest device (data) extension. */ +struct vbg_dev { + struct device *dev; + /** The base of the adapter I/O ports. */ + u16 io_port; + /** Pointer to the mapping of the VMMDev adapter memory. */ + struct vmmdev_memory *mmio; + /** Host version */ + char host_version[64]; + /** Host features */ + unsigned int host_features; + /** + * Dummy page and vmap address for reserved kernel virtual-address + * space for the guest mappings, only used on hosts lacking vtx. + */ + struct page *guest_mappings_dummy_page; + void *guest_mappings; + /** Spinlock protecting pending_events. */ + spinlock_t event_spinlock; + /** Preallocated struct vmmdev_events for the IRQ handler. */ + struct vmmdev_events *ack_events_req; + /** Wait-for-event list for threads waiting for multiple events. */ + wait_queue_head_t event_wq; + /** Mask of pending events. */ + u32 pending_events; + /** Wait-for-event list for threads waiting on HGCM async completion. */ + wait_queue_head_t hgcm_wq; + /** Pre-allocated hgcm cancel2 req. for cancellation on timeout */ + struct vmmdev_hgcm_cancel2 *cancel_req; + /** Mutex protecting cancel_req accesses */ + struct mutex cancel_req_mutex; + /** Pre-allocated mouse-status request for the input-device handling. */ + struct vmmdev_mouse_status *mouse_status_req; + /** Input device for reporting abs mouse coordinates to the guest. */ + struct input_dev *input; + + /** Memory balloon information. */ + struct vbg_mem_balloon mem_balloon; + + /** Lock for session related items in vbg_dev and vbg_session */ + struct mutex session_mutex; + /** Events we won't permit anyone to filter out. */ + u32 fixed_events; + /** + * Usage counters for the host events (excludes fixed events), + * Protected by session_mutex. + */ + struct vbg_bit_usage_tracker event_filter_tracker; + /** + * The event filter last reported to the host (or UINT32_MAX). + * Protected by session_mutex. + */ + u32 event_filter_host; + + /** + * Usage counters for guest capabilities. Indexed by capability bit + * number, one count per session using a capability. + * Protected by session_mutex. + */ + struct vbg_bit_usage_tracker guest_caps_tracker; + /** + * The guest capabilities last reported to the host (or UINT32_MAX). + * Protected by session_mutex. + */ + u32 guest_caps_host; + + /** + * Heartbeat timer which fires with interval + * cNsHearbeatInterval and its handler sends + * VMMDEVREQ_GUEST_HEARTBEAT to VMMDev. + */ + struct timer_list heartbeat_timer; + /** Heartbeat timer interval in ms. */ + int heartbeat_interval_ms; + /** Preallocated VMMDEVREQ_GUEST_HEARTBEAT request. */ + struct vmmdev_request_header *guest_heartbeat_req; + + /** "vboxguest" char-device */ + struct miscdevice misc_device; + /** "vboxuser" char-device */ + struct miscdevice misc_device_user; +}; + +/** The VBoxGuest per session data. */ +struct vbg_session { + /** Pointer to the device extension. */ + struct vbg_dev *gdev; + + /** + * Array containing HGCM client IDs associated with this session. + * These will be automatically disconnected when the session is closed. + * Protected by vbg_gdev.session_mutex. + */ + u32 hgcm_client_ids[64]; + /** + * Host events requested by the session. + * An event type requested in any guest session will be added to the + * host filter. Protected by vbg_gdev.session_mutex. + */ + u32 event_filter; + /** + * Guest capabilities for this session. + * A capability claimed by any guest session will be reported to the + * host. Protected by vbg_gdev.session_mutex. + */ + u32 guest_caps; + /** Does this session belong to a root process or a user one? */ + bool user_session; + /** Set on CANCEL_ALL_WAITEVENTS, protected by vbg_devevent_spinlock. */ + bool cancel_waiters; +}; + +int vbg_core_init(struct vbg_dev *gdev, u32 fixed_events); +void vbg_core_exit(struct vbg_dev *gdev); +struct vbg_session *vbg_core_open_session(struct vbg_dev *gdev, bool user); +void vbg_core_close_session(struct vbg_session *session); +int vbg_core_ioctl(struct vbg_session *session, unsigned int req, void *data); +int vbg_core_set_mouse_status(struct vbg_dev *gdev, u32 features); + +irqreturn_t vbg_core_isr(int irq, void *dev_id); + +void vbg_linux_mouse_event(struct vbg_dev *gdev); + +/* Private (non exported) functions form vboxguest_utils.c */ +void *vbg_req_alloc(size_t len, enum vmmdev_request_type req_type); +void vbg_req_free(void *req, size_t len); +int vbg_req_perform(struct vbg_dev *gdev, void *req); +int vbg_hgcm_call32( + struct vbg_dev *gdev, u32 client_id, u32 function, u32 timeout_ms, + struct vmmdev_hgcm_function_parameter32 *parm32, u32 parm_count, + int *vbox_status); + +#endif diff --git a/drivers/virt/vboxguest/vboxguest_linux.c b/drivers/virt/vboxguest/vboxguest_linux.c new file mode 100644 index 000000000..94e055ee7 --- /dev/null +++ b/drivers/virt/vboxguest/vboxguest_linux.c @@ -0,0 +1,482 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * vboxguest linux pci driver, char-dev and input-device code, + * + * Copyright (C) 2006-2016 Oracle Corporation + */ + +#include <linux/input.h> +#include <linux/kernel.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/poll.h> +#include <linux/vbox_utils.h> +#include "vboxguest_core.h" + +/** The device name. */ +#define DEVICE_NAME "vboxguest" +/** The device name for the device node open to everyone. */ +#define DEVICE_NAME_USER "vboxuser" +/** VirtualBox PCI vendor ID. */ +#define VBOX_VENDORID 0x80ee +/** VMMDev PCI card product ID. */ +#define VMMDEV_DEVICEID 0xcafe + +/** Mutex protecting the global vbg_gdev pointer used by vbg_get/put_gdev. */ +static DEFINE_MUTEX(vbg_gdev_mutex); +/** Global vbg_gdev pointer used by vbg_get/put_gdev. */ +static struct vbg_dev *vbg_gdev; + +static int vbg_misc_device_open(struct inode *inode, struct file *filp) +{ + struct vbg_session *session; + struct vbg_dev *gdev; + + /* misc_open sets filp->private_data to our misc device */ + gdev = container_of(filp->private_data, struct vbg_dev, misc_device); + + session = vbg_core_open_session(gdev, false); + if (IS_ERR(session)) + return PTR_ERR(session); + + filp->private_data = session; + return 0; +} + +static int vbg_misc_device_user_open(struct inode *inode, struct file *filp) +{ + struct vbg_session *session; + struct vbg_dev *gdev; + + /* misc_open sets filp->private_data to our misc device */ + gdev = container_of(filp->private_data, struct vbg_dev, + misc_device_user); + + session = vbg_core_open_session(gdev, false); + if (IS_ERR(session)) + return PTR_ERR(session); + + filp->private_data = session; + return 0; +} + +/** + * Close device. + * Return: 0 on success, negated errno on failure. + * @inode: Pointer to inode info structure. + * @filp: Associated file pointer. + */ +static int vbg_misc_device_close(struct inode *inode, struct file *filp) +{ + vbg_core_close_session(filp->private_data); + filp->private_data = NULL; + return 0; +} + +/** + * Device I/O Control entry point. + * Return: 0 on success, negated errno on failure. + * @filp: Associated file pointer. + * @req: The request specified to ioctl(). + * @arg: The argument specified to ioctl(). + */ +static long vbg_misc_device_ioctl(struct file *filp, unsigned int req, + unsigned long arg) +{ + struct vbg_session *session = filp->private_data; + size_t returned_size, size; + struct vbg_ioctl_hdr hdr; + bool is_vmmdev_req; + int ret = 0; + void *buf; + + if (copy_from_user(&hdr, (void *)arg, sizeof(hdr))) + return -EFAULT; + + if (hdr.version != VBG_IOCTL_HDR_VERSION) + return -EINVAL; + + if (hdr.size_in < sizeof(hdr) || + (hdr.size_out && hdr.size_out < sizeof(hdr))) + return -EINVAL; + + size = max(hdr.size_in, hdr.size_out); + if (_IOC_SIZE(req) && _IOC_SIZE(req) != size) + return -EINVAL; + if (size > SZ_16M) + return -E2BIG; + + /* + * IOCTL_VMMDEV_REQUEST needs the buffer to be below 4G to avoid + * the need for a bounce-buffer and another copy later on. + */ + is_vmmdev_req = (req & ~IOCSIZE_MASK) == VBG_IOCTL_VMMDEV_REQUEST(0) || + req == VBG_IOCTL_VMMDEV_REQUEST_BIG || + req == VBG_IOCTL_VMMDEV_REQUEST_BIG_ALT; + + if (is_vmmdev_req) + buf = vbg_req_alloc(size, VBG_IOCTL_HDR_TYPE_DEFAULT); + else + buf = kmalloc(size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + *((struct vbg_ioctl_hdr *)buf) = hdr; + if (copy_from_user(buf + sizeof(hdr), (void *)arg + sizeof(hdr), + hdr.size_in - sizeof(hdr))) { + ret = -EFAULT; + goto out; + } + if (hdr.size_in < size) + memset(buf + hdr.size_in, 0, size - hdr.size_in); + + ret = vbg_core_ioctl(session, req, buf); + if (ret) + goto out; + + returned_size = ((struct vbg_ioctl_hdr *)buf)->size_out; + if (returned_size > size) { + vbg_debug("%s: too much output data %zu > %zu\n", + __func__, returned_size, size); + returned_size = size; + } + if (copy_to_user((void *)arg, buf, returned_size) != 0) + ret = -EFAULT; + +out: + if (is_vmmdev_req) + vbg_req_free(buf, size); + else + kfree(buf); + + return ret; +} + +/** The file_operations structures. */ +static const struct file_operations vbg_misc_device_fops = { + .owner = THIS_MODULE, + .open = vbg_misc_device_open, + .release = vbg_misc_device_close, + .unlocked_ioctl = vbg_misc_device_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = vbg_misc_device_ioctl, +#endif +}; +static const struct file_operations vbg_misc_device_user_fops = { + .owner = THIS_MODULE, + .open = vbg_misc_device_user_open, + .release = vbg_misc_device_close, + .unlocked_ioctl = vbg_misc_device_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = vbg_misc_device_ioctl, +#endif +}; + +/** + * Called when the input device is first opened. + * + * Sets up absolute mouse reporting. + */ +static int vbg_input_open(struct input_dev *input) +{ + struct vbg_dev *gdev = input_get_drvdata(input); + u32 feat = VMMDEV_MOUSE_GUEST_CAN_ABSOLUTE | VMMDEV_MOUSE_NEW_PROTOCOL; + int ret; + + ret = vbg_core_set_mouse_status(gdev, feat); + if (ret) + return ret; + + return 0; +} + +/** + * Called if all open handles to the input device are closed. + * + * Disables absolute reporting. + */ +static void vbg_input_close(struct input_dev *input) +{ + struct vbg_dev *gdev = input_get_drvdata(input); + + vbg_core_set_mouse_status(gdev, 0); +} + +/** + * Creates the kernel input device. + * + * Return: 0 on success, negated errno on failure. + */ +static int vbg_create_input_device(struct vbg_dev *gdev) +{ + struct input_dev *input; + + input = devm_input_allocate_device(gdev->dev); + if (!input) + return -ENOMEM; + + input->id.bustype = BUS_PCI; + input->id.vendor = VBOX_VENDORID; + input->id.product = VMMDEV_DEVICEID; + input->open = vbg_input_open; + input->close = vbg_input_close; + input->dev.parent = gdev->dev; + input->name = "VirtualBox mouse integration"; + + input_set_abs_params(input, ABS_X, VMMDEV_MOUSE_RANGE_MIN, + VMMDEV_MOUSE_RANGE_MAX, 0, 0); + input_set_abs_params(input, ABS_Y, VMMDEV_MOUSE_RANGE_MIN, + VMMDEV_MOUSE_RANGE_MAX, 0, 0); + input_set_capability(input, EV_KEY, BTN_MOUSE); + input_set_drvdata(input, gdev); + + gdev->input = input; + + return input_register_device(gdev->input); +} + +static ssize_t host_version_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct vbg_dev *gdev = dev_get_drvdata(dev); + + return sprintf(buf, "%s\n", gdev->host_version); +} + +static ssize_t host_features_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct vbg_dev *gdev = dev_get_drvdata(dev); + + return sprintf(buf, "%#x\n", gdev->host_features); +} + +static DEVICE_ATTR_RO(host_version); +static DEVICE_ATTR_RO(host_features); + +/** + * Does the PCI detection and init of the device. + * + * Return: 0 on success, negated errno on failure. + */ +static int vbg_pci_probe(struct pci_dev *pci, const struct pci_device_id *id) +{ + struct device *dev = &pci->dev; + resource_size_t io, io_len, mmio, mmio_len; + struct vmmdev_memory *vmmdev; + struct vbg_dev *gdev; + int ret; + + gdev = devm_kzalloc(dev, sizeof(*gdev), GFP_KERNEL); + if (!gdev) + return -ENOMEM; + + ret = pci_enable_device(pci); + if (ret != 0) { + vbg_err("vboxguest: Error enabling device: %d\n", ret); + return ret; + } + + ret = -ENODEV; + + io = pci_resource_start(pci, 0); + io_len = pci_resource_len(pci, 0); + if (!io || !io_len) { + vbg_err("vboxguest: Error IO-port resource (0) is missing\n"); + goto err_disable_pcidev; + } + if (devm_request_region(dev, io, io_len, DEVICE_NAME) == NULL) { + vbg_err("vboxguest: Error could not claim IO resource\n"); + ret = -EBUSY; + goto err_disable_pcidev; + } + + mmio = pci_resource_start(pci, 1); + mmio_len = pci_resource_len(pci, 1); + if (!mmio || !mmio_len) { + vbg_err("vboxguest: Error MMIO resource (1) is missing\n"); + goto err_disable_pcidev; + } + + if (devm_request_mem_region(dev, mmio, mmio_len, DEVICE_NAME) == NULL) { + vbg_err("vboxguest: Error could not claim MMIO resource\n"); + ret = -EBUSY; + goto err_disable_pcidev; + } + + vmmdev = devm_ioremap(dev, mmio, mmio_len); + if (!vmmdev) { + vbg_err("vboxguest: Error ioremap failed; MMIO addr=%pap size=%pap\n", + &mmio, &mmio_len); + goto err_disable_pcidev; + } + + /* Validate MMIO region version and size. */ + if (vmmdev->version != VMMDEV_MEMORY_VERSION || + vmmdev->size < 32 || vmmdev->size > mmio_len) { + vbg_err("vboxguest: Bogus VMMDev memory; version=%08x (expected %08x) size=%d (expected <= %d)\n", + vmmdev->version, VMMDEV_MEMORY_VERSION, + vmmdev->size, (int)mmio_len); + goto err_disable_pcidev; + } + + gdev->io_port = io; + gdev->mmio = vmmdev; + gdev->dev = dev; + gdev->misc_device.minor = MISC_DYNAMIC_MINOR; + gdev->misc_device.name = DEVICE_NAME; + gdev->misc_device.fops = &vbg_misc_device_fops; + gdev->misc_device_user.minor = MISC_DYNAMIC_MINOR; + gdev->misc_device_user.name = DEVICE_NAME_USER; + gdev->misc_device_user.fops = &vbg_misc_device_user_fops; + + ret = vbg_core_init(gdev, VMMDEV_EVENT_MOUSE_POSITION_CHANGED); + if (ret) + goto err_disable_pcidev; + + ret = vbg_create_input_device(gdev); + if (ret) { + vbg_err("vboxguest: Error creating input device: %d\n", ret); + goto err_vbg_core_exit; + } + + ret = devm_request_irq(dev, pci->irq, vbg_core_isr, IRQF_SHARED, + DEVICE_NAME, gdev); + if (ret) { + vbg_err("vboxguest: Error requesting irq: %d\n", ret); + goto err_vbg_core_exit; + } + + ret = misc_register(&gdev->misc_device); + if (ret) { + vbg_err("vboxguest: Error misc_register %s failed: %d\n", + DEVICE_NAME, ret); + goto err_vbg_core_exit; + } + + ret = misc_register(&gdev->misc_device_user); + if (ret) { + vbg_err("vboxguest: Error misc_register %s failed: %d\n", + DEVICE_NAME_USER, ret); + goto err_unregister_misc_device; + } + + mutex_lock(&vbg_gdev_mutex); + if (!vbg_gdev) + vbg_gdev = gdev; + else + ret = -EBUSY; + mutex_unlock(&vbg_gdev_mutex); + + if (ret) { + vbg_err("vboxguest: Error more then 1 vbox guest pci device\n"); + goto err_unregister_misc_device_user; + } + + pci_set_drvdata(pci, gdev); + device_create_file(dev, &dev_attr_host_version); + device_create_file(dev, &dev_attr_host_features); + + vbg_info("vboxguest: misc device minor %d, IRQ %d, I/O port %x, MMIO at %pap (size %pap)\n", + gdev->misc_device.minor, pci->irq, gdev->io_port, + &mmio, &mmio_len); + + return 0; + +err_unregister_misc_device_user: + misc_deregister(&gdev->misc_device_user); +err_unregister_misc_device: + misc_deregister(&gdev->misc_device); +err_vbg_core_exit: + vbg_core_exit(gdev); +err_disable_pcidev: + pci_disable_device(pci); + + return ret; +} + +static void vbg_pci_remove(struct pci_dev *pci) +{ + struct vbg_dev *gdev = pci_get_drvdata(pci); + + mutex_lock(&vbg_gdev_mutex); + vbg_gdev = NULL; + mutex_unlock(&vbg_gdev_mutex); + + device_remove_file(gdev->dev, &dev_attr_host_features); + device_remove_file(gdev->dev, &dev_attr_host_version); + misc_deregister(&gdev->misc_device_user); + misc_deregister(&gdev->misc_device); + vbg_core_exit(gdev); + pci_disable_device(pci); +} + +struct vbg_dev *vbg_get_gdev(void) +{ + mutex_lock(&vbg_gdev_mutex); + + /* + * Note on success we keep the mutex locked until vbg_put_gdev(), + * this stops vbg_pci_remove from removing the device from underneath + * vboxsf. vboxsf will only hold a reference for a short while. + */ + if (vbg_gdev) + return vbg_gdev; + + mutex_unlock(&vbg_gdev_mutex); + return ERR_PTR(-ENODEV); +} +EXPORT_SYMBOL(vbg_get_gdev); + +void vbg_put_gdev(struct vbg_dev *gdev) +{ + WARN_ON(gdev != vbg_gdev); + mutex_unlock(&vbg_gdev_mutex); +} +EXPORT_SYMBOL(vbg_put_gdev); + +/** + * Callback for mouse events. + * + * This is called at the end of the ISR, after leaving the event spinlock, if + * VMMDEV_EVENT_MOUSE_POSITION_CHANGED was raised by the host. + * + * @gdev: The device extension. + */ +void vbg_linux_mouse_event(struct vbg_dev *gdev) +{ + int rc; + + /* Report events to the kernel input device */ + gdev->mouse_status_req->mouse_features = 0; + gdev->mouse_status_req->pointer_pos_x = 0; + gdev->mouse_status_req->pointer_pos_y = 0; + rc = vbg_req_perform(gdev, gdev->mouse_status_req); + if (rc >= 0) { + input_report_abs(gdev->input, ABS_X, + gdev->mouse_status_req->pointer_pos_x); + input_report_abs(gdev->input, ABS_Y, + gdev->mouse_status_req->pointer_pos_y); + input_sync(gdev->input); + } +} + +static const struct pci_device_id vbg_pci_ids[] = { + { .vendor = VBOX_VENDORID, .device = VMMDEV_DEVICEID }, + {} +}; +MODULE_DEVICE_TABLE(pci, vbg_pci_ids); + +static struct pci_driver vbg_pci_driver = { + .name = DEVICE_NAME, + .id_table = vbg_pci_ids, + .probe = vbg_pci_probe, + .remove = vbg_pci_remove, +}; + +module_pci_driver(vbg_pci_driver); + +MODULE_AUTHOR("Oracle Corporation"); +MODULE_DESCRIPTION("Oracle VM VirtualBox Guest Additions for Linux Module"); +MODULE_LICENSE("GPL"); diff --git a/drivers/virt/vboxguest/vboxguest_utils.c b/drivers/virt/vboxguest/vboxguest_utils.c new file mode 100644 index 000000000..920910065 --- /dev/null +++ b/drivers/virt/vboxguest/vboxguest_utils.c @@ -0,0 +1,813 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR CDDL-1.0) */ +/* + * vboxguest vmm-req and hgcm-call code, VBoxGuestR0LibHGCMInternal.cpp, + * VBoxGuestR0LibGenericRequest.cpp and RTErrConvertToErrno.cpp in vbox svn. + * + * Copyright (C) 2006-2016 Oracle Corporation + */ + +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/vbox_err.h> +#include <linux/vbox_utils.h> +#include "vboxguest_core.h" + +/* Get the pointer to the first parameter of a HGCM call request. */ +#define VMMDEV_HGCM_CALL_PARMS(a) \ + ((struct vmmdev_hgcm_function_parameter *)( \ + (u8 *)(a) + sizeof(struct vmmdev_hgcm_call))) + +/* The max parameter buffer size for a user request. */ +#define VBG_MAX_HGCM_USER_PARM (24 * SZ_1M) +/* The max parameter buffer size for a kernel request. */ +#define VBG_MAX_HGCM_KERNEL_PARM (16 * SZ_1M) + +#define VBG_DEBUG_PORT 0x504 + +/* This protects vbg_log_buf and serializes VBG_DEBUG_PORT accesses */ +static DEFINE_SPINLOCK(vbg_log_lock); +static char vbg_log_buf[128]; + +#define VBG_LOG(name, pr_func) \ +void name(const char *fmt, ...) \ +{ \ + unsigned long flags; \ + va_list args; \ + int i, count; \ + \ + va_start(args, fmt); \ + spin_lock_irqsave(&vbg_log_lock, flags); \ + \ + count = vscnprintf(vbg_log_buf, sizeof(vbg_log_buf), fmt, args);\ + for (i = 0; i < count; i++) \ + outb(vbg_log_buf[i], VBG_DEBUG_PORT); \ + \ + pr_func("%s", vbg_log_buf); \ + \ + spin_unlock_irqrestore(&vbg_log_lock, flags); \ + va_end(args); \ +} \ +EXPORT_SYMBOL(name) + +VBG_LOG(vbg_info, pr_info); +VBG_LOG(vbg_warn, pr_warn); +VBG_LOG(vbg_err, pr_err); +#if defined(DEBUG) && !defined(CONFIG_DYNAMIC_DEBUG) +VBG_LOG(vbg_debug, pr_debug); +#endif + +void *vbg_req_alloc(size_t len, enum vmmdev_request_type req_type) +{ + struct vmmdev_request_header *req; + int order = get_order(PAGE_ALIGN(len)); + + req = (void *)__get_free_pages(GFP_KERNEL | GFP_DMA32, order); + if (!req) + return NULL; + + memset(req, 0xaa, len); + + req->size = len; + req->version = VMMDEV_REQUEST_HEADER_VERSION; + req->request_type = req_type; + req->rc = VERR_GENERAL_FAILURE; + req->reserved1 = 0; + req->reserved2 = 0; + + return req; +} + +void vbg_req_free(void *req, size_t len) +{ + if (!req) + return; + + free_pages((unsigned long)req, get_order(PAGE_ALIGN(len))); +} + +/* Note this function returns a VBox status code, not a negative errno!! */ +int vbg_req_perform(struct vbg_dev *gdev, void *req) +{ + unsigned long phys_req = virt_to_phys(req); + + outl(phys_req, gdev->io_port + VMMDEV_PORT_OFF_REQUEST); + /* + * The host changes the request as a result of the outl, make sure + * the outl and any reads of the req happen in the correct order. + */ + mb(); + + return ((struct vmmdev_request_header *)req)->rc; +} + +static bool hgcm_req_done(struct vbg_dev *gdev, + struct vmmdev_hgcmreq_header *header) +{ + unsigned long flags; + bool done; + + spin_lock_irqsave(&gdev->event_spinlock, flags); + done = header->flags & VMMDEV_HGCM_REQ_DONE; + spin_unlock_irqrestore(&gdev->event_spinlock, flags); + + return done; +} + +int vbg_hgcm_connect(struct vbg_dev *gdev, + struct vmmdev_hgcm_service_location *loc, + u32 *client_id, int *vbox_status) +{ + struct vmmdev_hgcm_connect *hgcm_connect = NULL; + int rc; + + hgcm_connect = vbg_req_alloc(sizeof(*hgcm_connect), + VMMDEVREQ_HGCM_CONNECT); + if (!hgcm_connect) + return -ENOMEM; + + hgcm_connect->header.flags = 0; + memcpy(&hgcm_connect->loc, loc, sizeof(*loc)); + hgcm_connect->client_id = 0; + + rc = vbg_req_perform(gdev, hgcm_connect); + + if (rc == VINF_HGCM_ASYNC_EXECUTE) + wait_event(gdev->hgcm_wq, + hgcm_req_done(gdev, &hgcm_connect->header)); + + if (rc >= 0) { + *client_id = hgcm_connect->client_id; + rc = hgcm_connect->header.result; + } + + vbg_req_free(hgcm_connect, sizeof(*hgcm_connect)); + + *vbox_status = rc; + return 0; +} +EXPORT_SYMBOL(vbg_hgcm_connect); + +int vbg_hgcm_disconnect(struct vbg_dev *gdev, u32 client_id, int *vbox_status) +{ + struct vmmdev_hgcm_disconnect *hgcm_disconnect = NULL; + int rc; + + hgcm_disconnect = vbg_req_alloc(sizeof(*hgcm_disconnect), + VMMDEVREQ_HGCM_DISCONNECT); + if (!hgcm_disconnect) + return -ENOMEM; + + hgcm_disconnect->header.flags = 0; + hgcm_disconnect->client_id = client_id; + + rc = vbg_req_perform(gdev, hgcm_disconnect); + + if (rc == VINF_HGCM_ASYNC_EXECUTE) + wait_event(gdev->hgcm_wq, + hgcm_req_done(gdev, &hgcm_disconnect->header)); + + if (rc >= 0) + rc = hgcm_disconnect->header.result; + + vbg_req_free(hgcm_disconnect, sizeof(*hgcm_disconnect)); + + *vbox_status = rc; + return 0; +} +EXPORT_SYMBOL(vbg_hgcm_disconnect); + +static u32 hgcm_call_buf_size_in_pages(void *buf, u32 len) +{ + u32 size = PAGE_ALIGN(len + ((unsigned long)buf & ~PAGE_MASK)); + + return size >> PAGE_SHIFT; +} + +static void hgcm_call_add_pagelist_size(void *buf, u32 len, size_t *extra) +{ + u32 page_count; + + page_count = hgcm_call_buf_size_in_pages(buf, len); + *extra += offsetof(struct vmmdev_hgcm_pagelist, pages[page_count]); +} + +static int hgcm_call_preprocess_linaddr( + const struct vmmdev_hgcm_function_parameter *src_parm, + void **bounce_buf_ret, size_t *extra) +{ + void *buf, *bounce_buf; + bool copy_in; + u32 len; + int ret; + + buf = (void *)src_parm->u.pointer.u.linear_addr; + len = src_parm->u.pointer.size; + copy_in = src_parm->type != VMMDEV_HGCM_PARM_TYPE_LINADDR_OUT; + + if (len > VBG_MAX_HGCM_USER_PARM) + return -E2BIG; + + bounce_buf = kvmalloc(len, GFP_KERNEL); + if (!bounce_buf) + return -ENOMEM; + + *bounce_buf_ret = bounce_buf; + + if (copy_in) { + ret = copy_from_user(bounce_buf, (void __user *)buf, len); + if (ret) + return -EFAULT; + } else { + memset(bounce_buf, 0, len); + } + + hgcm_call_add_pagelist_size(bounce_buf, len, extra); + return 0; +} + +/** + * Preprocesses the HGCM call, validate parameters, alloc bounce buffers and + * figure out how much extra storage we need for page lists. + * Return: 0 or negative errno value. + * @src_parm: Pointer to source function call parameters + * @parm_count: Number of function call parameters. + * @bounce_bufs_ret: Where to return the allocated bouncebuffer array + * @extra: Where to return the extra request space needed for + * physical page lists. + */ +static int hgcm_call_preprocess( + const struct vmmdev_hgcm_function_parameter *src_parm, + u32 parm_count, void ***bounce_bufs_ret, size_t *extra) +{ + void *buf, **bounce_bufs = NULL; + u32 i, len; + int ret; + + for (i = 0; i < parm_count; i++, src_parm++) { + switch (src_parm->type) { + case VMMDEV_HGCM_PARM_TYPE_32BIT: + case VMMDEV_HGCM_PARM_TYPE_64BIT: + break; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_IN: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_OUT: + if (!bounce_bufs) { + bounce_bufs = kcalloc(parm_count, + sizeof(void *), + GFP_KERNEL); + if (!bounce_bufs) + return -ENOMEM; + + *bounce_bufs_ret = bounce_bufs; + } + + ret = hgcm_call_preprocess_linaddr(src_parm, + &bounce_bufs[i], + extra); + if (ret) + return ret; + + break; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_OUT: + buf = (void *)src_parm->u.pointer.u.linear_addr; + len = src_parm->u.pointer.size; + if (WARN_ON(len > VBG_MAX_HGCM_KERNEL_PARM)) + return -E2BIG; + + hgcm_call_add_pagelist_size(buf, len, extra); + break; + + default: + return -EINVAL; + } + } + + return 0; +} + +/** + * Translates linear address types to page list direction flags. + * + * Return: page list flags. + * @type: The type. + */ +static u32 hgcm_call_linear_addr_type_to_pagelist_flags( + enum vmmdev_hgcm_function_parameter_type type) +{ + switch (type) { + default: + WARN_ON(1); + /* Fall through */ + case VMMDEV_HGCM_PARM_TYPE_LINADDR: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL: + return VMMDEV_HGCM_F_PARM_DIRECTION_BOTH; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR_IN: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN: + return VMMDEV_HGCM_F_PARM_DIRECTION_TO_HOST; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR_OUT: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_OUT: + return VMMDEV_HGCM_F_PARM_DIRECTION_FROM_HOST; + } +} + +static void hgcm_call_init_linaddr(struct vmmdev_hgcm_call *call, + struct vmmdev_hgcm_function_parameter *dst_parm, void *buf, u32 len, + enum vmmdev_hgcm_function_parameter_type type, u32 *off_extra) +{ + struct vmmdev_hgcm_pagelist *dst_pg_lst; + struct page *page; + bool is_vmalloc; + u32 i, page_count; + + dst_parm->type = type; + + if (len == 0) { + dst_parm->u.pointer.size = 0; + dst_parm->u.pointer.u.linear_addr = 0; + return; + } + + dst_pg_lst = (void *)call + *off_extra; + page_count = hgcm_call_buf_size_in_pages(buf, len); + is_vmalloc = is_vmalloc_addr(buf); + + dst_parm->type = VMMDEV_HGCM_PARM_TYPE_PAGELIST; + dst_parm->u.page_list.size = len; + dst_parm->u.page_list.offset = *off_extra; + dst_pg_lst->flags = hgcm_call_linear_addr_type_to_pagelist_flags(type); + dst_pg_lst->offset_first_page = (unsigned long)buf & ~PAGE_MASK; + dst_pg_lst->page_count = page_count; + + for (i = 0; i < page_count; i++) { + if (is_vmalloc) + page = vmalloc_to_page(buf); + else + page = virt_to_page(buf); + + dst_pg_lst->pages[i] = page_to_phys(page); + buf += PAGE_SIZE; + } + + *off_extra += offsetof(struct vmmdev_hgcm_pagelist, pages[page_count]); +} + +/** + * Initializes the call request that we're sending to the host. + * @call: The call to initialize. + * @client_id: The client ID of the caller. + * @function: The function number of the function to call. + * @src_parm: Pointer to source function call parameters. + * @parm_count: Number of function call parameters. + * @bounce_bufs: The bouncebuffer array. + */ +static void hgcm_call_init_call( + struct vmmdev_hgcm_call *call, u32 client_id, u32 function, + const struct vmmdev_hgcm_function_parameter *src_parm, + u32 parm_count, void **bounce_bufs) +{ + struct vmmdev_hgcm_function_parameter *dst_parm = + VMMDEV_HGCM_CALL_PARMS(call); + u32 i, off_extra = (uintptr_t)(dst_parm + parm_count) - (uintptr_t)call; + void *buf; + + call->header.flags = 0; + call->header.result = VINF_SUCCESS; + call->client_id = client_id; + call->function = function; + call->parm_count = parm_count; + + for (i = 0; i < parm_count; i++, src_parm++, dst_parm++) { + switch (src_parm->type) { + case VMMDEV_HGCM_PARM_TYPE_32BIT: + case VMMDEV_HGCM_PARM_TYPE_64BIT: + *dst_parm = *src_parm; + break; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_IN: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_OUT: + hgcm_call_init_linaddr(call, dst_parm, bounce_bufs[i], + src_parm->u.pointer.size, + src_parm->type, &off_extra); + break; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_OUT: + buf = (void *)src_parm->u.pointer.u.linear_addr; + hgcm_call_init_linaddr(call, dst_parm, buf, + src_parm->u.pointer.size, + src_parm->type, &off_extra); + break; + + default: + WARN_ON(1); + dst_parm->type = VMMDEV_HGCM_PARM_TYPE_INVALID; + } + } +} + +/** + * Tries to cancel a pending HGCM call. + * + * Return: VBox status code + */ +static int hgcm_cancel_call(struct vbg_dev *gdev, struct vmmdev_hgcm_call *call) +{ + int rc; + + /* + * We use a pre-allocated request for cancellations, which is + * protected by cancel_req_mutex. This means that all cancellations + * get serialized, this should be fine since they should be rare. + */ + mutex_lock(&gdev->cancel_req_mutex); + gdev->cancel_req->phys_req_to_cancel = virt_to_phys(call); + rc = vbg_req_perform(gdev, gdev->cancel_req); + mutex_unlock(&gdev->cancel_req_mutex); + + if (rc == VERR_NOT_IMPLEMENTED) { + call->header.flags |= VMMDEV_HGCM_REQ_CANCELLED; + call->header.header.request_type = VMMDEVREQ_HGCM_CANCEL; + + rc = vbg_req_perform(gdev, call); + if (rc == VERR_INVALID_PARAMETER) + rc = VERR_NOT_FOUND; + } + + if (rc >= 0) + call->header.flags |= VMMDEV_HGCM_REQ_CANCELLED; + + return rc; +} + +/** + * Performs the call and completion wait. + * Return: 0 or negative errno value. + * @gdev: The VBoxGuest device extension. + * @call: The call to execute. + * @timeout_ms: Timeout in ms. + * @leak_it: Where to return the leak it / free it, indicator. + * Cancellation fun. + */ +static int vbg_hgcm_do_call(struct vbg_dev *gdev, struct vmmdev_hgcm_call *call, + u32 timeout_ms, bool *leak_it) +{ + int rc, cancel_rc, ret; + long timeout; + + *leak_it = false; + + rc = vbg_req_perform(gdev, call); + + /* + * If the call failed, then pretend success. Upper layers will + * interpret the result code in the packet. + */ + if (rc < 0) { + call->header.result = rc; + return 0; + } + + if (rc != VINF_HGCM_ASYNC_EXECUTE) + return 0; + + /* Host decided to process the request asynchronously, wait for it */ + if (timeout_ms == U32_MAX) + timeout = MAX_SCHEDULE_TIMEOUT; + else + timeout = msecs_to_jiffies(timeout_ms); + + timeout = wait_event_interruptible_timeout( + gdev->hgcm_wq, + hgcm_req_done(gdev, &call->header), + timeout); + + /* timeout > 0 means hgcm_req_done has returned true, so success */ + if (timeout > 0) + return 0; + + if (timeout == 0) + ret = -ETIMEDOUT; + else + ret = -EINTR; + + /* Cancel the request */ + cancel_rc = hgcm_cancel_call(gdev, call); + if (cancel_rc >= 0) + return ret; + + /* + * Failed to cancel, this should mean that the cancel has lost the + * race with normal completion, wait while the host completes it. + */ + if (cancel_rc == VERR_NOT_FOUND || cancel_rc == VERR_SEM_DESTROYED) + timeout = msecs_to_jiffies(500); + else + timeout = msecs_to_jiffies(2000); + + timeout = wait_event_timeout(gdev->hgcm_wq, + hgcm_req_done(gdev, &call->header), + timeout); + + if (WARN_ON(timeout == 0)) { + /* We really should never get here */ + vbg_err("%s: Call timedout and cancellation failed, leaking the request\n", + __func__); + *leak_it = true; + return ret; + } + + /* The call has completed normally after all */ + return 0; +} + +/** + * Copies the result of the call back to the caller info structure and user + * buffers. + * Return: 0 or negative errno value. + * @call: HGCM call request. + * @dst_parm: Pointer to function call parameters destination. + * @parm_count: Number of function call parameters. + * @bounce_bufs: The bouncebuffer array. + */ +static int hgcm_call_copy_back_result( + const struct vmmdev_hgcm_call *call, + struct vmmdev_hgcm_function_parameter *dst_parm, + u32 parm_count, void **bounce_bufs) +{ + const struct vmmdev_hgcm_function_parameter *src_parm = + VMMDEV_HGCM_CALL_PARMS(call); + void __user *p; + int ret; + u32 i; + + /* Copy back parameters. */ + for (i = 0; i < parm_count; i++, src_parm++, dst_parm++) { + switch (dst_parm->type) { + case VMMDEV_HGCM_PARM_TYPE_32BIT: + case VMMDEV_HGCM_PARM_TYPE_64BIT: + *dst_parm = *src_parm; + break; + + case VMMDEV_HGCM_PARM_TYPE_PAGELIST: + dst_parm->u.page_list.size = src_parm->u.page_list.size; + break; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR_IN: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_OUT: + dst_parm->u.pointer.size = src_parm->u.pointer.size; + break; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_OUT: + dst_parm->u.pointer.size = src_parm->u.pointer.size; + + p = (void __user *)dst_parm->u.pointer.u.linear_addr; + ret = copy_to_user(p, bounce_bufs[i], + min(src_parm->u.pointer.size, + dst_parm->u.pointer.size)); + if (ret) + return -EFAULT; + break; + + default: + WARN_ON(1); + return -EINVAL; + } + } + + return 0; +} + +int vbg_hgcm_call(struct vbg_dev *gdev, u32 client_id, u32 function, + u32 timeout_ms, struct vmmdev_hgcm_function_parameter *parms, + u32 parm_count, int *vbox_status) +{ + struct vmmdev_hgcm_call *call; + void **bounce_bufs = NULL; + bool leak_it; + size_t size; + int i, ret; + + size = sizeof(struct vmmdev_hgcm_call) + + parm_count * sizeof(struct vmmdev_hgcm_function_parameter); + /* + * Validate and buffer the parameters for the call. This also increases + * call_size with the amount of extra space needed for page lists. + */ + ret = hgcm_call_preprocess(parms, parm_count, &bounce_bufs, &size); + if (ret) { + /* Even on error bounce bufs may still have been allocated */ + goto free_bounce_bufs; + } + + call = vbg_req_alloc(size, VMMDEVREQ_HGCM_CALL); + if (!call) { + ret = -ENOMEM; + goto free_bounce_bufs; + } + + hgcm_call_init_call(call, client_id, function, parms, parm_count, + bounce_bufs); + + ret = vbg_hgcm_do_call(gdev, call, timeout_ms, &leak_it); + if (ret == 0) { + *vbox_status = call->header.result; + ret = hgcm_call_copy_back_result(call, parms, parm_count, + bounce_bufs); + } + + if (!leak_it) + vbg_req_free(call, size); + +free_bounce_bufs: + if (bounce_bufs) { + for (i = 0; i < parm_count; i++) + kvfree(bounce_bufs[i]); + kfree(bounce_bufs); + } + + return ret; +} +EXPORT_SYMBOL(vbg_hgcm_call); + +#ifdef CONFIG_COMPAT +int vbg_hgcm_call32( + struct vbg_dev *gdev, u32 client_id, u32 function, u32 timeout_ms, + struct vmmdev_hgcm_function_parameter32 *parm32, u32 parm_count, + int *vbox_status) +{ + struct vmmdev_hgcm_function_parameter *parm64 = NULL; + u32 i, size; + int ret = 0; + + /* KISS allocate a temporary request and convert the parameters. */ + size = parm_count * sizeof(struct vmmdev_hgcm_function_parameter); + parm64 = kzalloc(size, GFP_KERNEL); + if (!parm64) + return -ENOMEM; + + for (i = 0; i < parm_count; i++) { + switch (parm32[i].type) { + case VMMDEV_HGCM_PARM_TYPE_32BIT: + parm64[i].type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parm64[i].u.value32 = parm32[i].u.value32; + break; + + case VMMDEV_HGCM_PARM_TYPE_64BIT: + parm64[i].type = VMMDEV_HGCM_PARM_TYPE_64BIT; + parm64[i].u.value64 = parm32[i].u.value64; + break; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR_OUT: + case VMMDEV_HGCM_PARM_TYPE_LINADDR: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_IN: + parm64[i].type = parm32[i].type; + parm64[i].u.pointer.size = parm32[i].u.pointer.size; + parm64[i].u.pointer.u.linear_addr = + parm32[i].u.pointer.u.linear_addr; + break; + + default: + ret = -EINVAL; + } + if (ret < 0) + goto out_free; + } + + ret = vbg_hgcm_call(gdev, client_id, function, timeout_ms, + parm64, parm_count, vbox_status); + if (ret < 0) + goto out_free; + + /* Copy back. */ + for (i = 0; i < parm_count; i++, parm32++, parm64++) { + switch (parm64[i].type) { + case VMMDEV_HGCM_PARM_TYPE_32BIT: + parm32[i].u.value32 = parm64[i].u.value32; + break; + + case VMMDEV_HGCM_PARM_TYPE_64BIT: + parm32[i].u.value64 = parm64[i].u.value64; + break; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR_OUT: + case VMMDEV_HGCM_PARM_TYPE_LINADDR: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_IN: + parm32[i].u.pointer.size = parm64[i].u.pointer.size; + break; + + default: + WARN_ON(1); + ret = -EINVAL; + } + } + +out_free: + kfree(parm64); + return ret; +} +#endif + +static const int vbg_status_code_to_errno_table[] = { + [-VERR_ACCESS_DENIED] = -EPERM, + [-VERR_FILE_NOT_FOUND] = -ENOENT, + [-VERR_PROCESS_NOT_FOUND] = -ESRCH, + [-VERR_INTERRUPTED] = -EINTR, + [-VERR_DEV_IO_ERROR] = -EIO, + [-VERR_TOO_MUCH_DATA] = -E2BIG, + [-VERR_BAD_EXE_FORMAT] = -ENOEXEC, + [-VERR_INVALID_HANDLE] = -EBADF, + [-VERR_TRY_AGAIN] = -EAGAIN, + [-VERR_NO_MEMORY] = -ENOMEM, + [-VERR_INVALID_POINTER] = -EFAULT, + [-VERR_RESOURCE_BUSY] = -EBUSY, + [-VERR_ALREADY_EXISTS] = -EEXIST, + [-VERR_NOT_SAME_DEVICE] = -EXDEV, + [-VERR_NOT_A_DIRECTORY] = -ENOTDIR, + [-VERR_PATH_NOT_FOUND] = -ENOTDIR, + [-VERR_INVALID_NAME] = -ENOENT, + [-VERR_IS_A_DIRECTORY] = -EISDIR, + [-VERR_INVALID_PARAMETER] = -EINVAL, + [-VERR_TOO_MANY_OPEN_FILES] = -ENFILE, + [-VERR_INVALID_FUNCTION] = -ENOTTY, + [-VERR_SHARING_VIOLATION] = -ETXTBSY, + [-VERR_FILE_TOO_BIG] = -EFBIG, + [-VERR_DISK_FULL] = -ENOSPC, + [-VERR_SEEK_ON_DEVICE] = -ESPIPE, + [-VERR_WRITE_PROTECT] = -EROFS, + [-VERR_BROKEN_PIPE] = -EPIPE, + [-VERR_DEADLOCK] = -EDEADLK, + [-VERR_FILENAME_TOO_LONG] = -ENAMETOOLONG, + [-VERR_FILE_LOCK_FAILED] = -ENOLCK, + [-VERR_NOT_IMPLEMENTED] = -ENOSYS, + [-VERR_NOT_SUPPORTED] = -ENOSYS, + [-VERR_DIR_NOT_EMPTY] = -ENOTEMPTY, + [-VERR_TOO_MANY_SYMLINKS] = -ELOOP, + [-VERR_NO_MORE_FILES] = -ENODATA, + [-VERR_NO_DATA] = -ENODATA, + [-VERR_NET_NO_NETWORK] = -ENONET, + [-VERR_NET_NOT_UNIQUE_NAME] = -ENOTUNIQ, + [-VERR_NO_TRANSLATION] = -EILSEQ, + [-VERR_NET_NOT_SOCKET] = -ENOTSOCK, + [-VERR_NET_DEST_ADDRESS_REQUIRED] = -EDESTADDRREQ, + [-VERR_NET_MSG_SIZE] = -EMSGSIZE, + [-VERR_NET_PROTOCOL_TYPE] = -EPROTOTYPE, + [-VERR_NET_PROTOCOL_NOT_AVAILABLE] = -ENOPROTOOPT, + [-VERR_NET_PROTOCOL_NOT_SUPPORTED] = -EPROTONOSUPPORT, + [-VERR_NET_SOCKET_TYPE_NOT_SUPPORTED] = -ESOCKTNOSUPPORT, + [-VERR_NET_OPERATION_NOT_SUPPORTED] = -EOPNOTSUPP, + [-VERR_NET_PROTOCOL_FAMILY_NOT_SUPPORTED] = -EPFNOSUPPORT, + [-VERR_NET_ADDRESS_FAMILY_NOT_SUPPORTED] = -EAFNOSUPPORT, + [-VERR_NET_ADDRESS_IN_USE] = -EADDRINUSE, + [-VERR_NET_ADDRESS_NOT_AVAILABLE] = -EADDRNOTAVAIL, + [-VERR_NET_DOWN] = -ENETDOWN, + [-VERR_NET_UNREACHABLE] = -ENETUNREACH, + [-VERR_NET_CONNECTION_RESET] = -ENETRESET, + [-VERR_NET_CONNECTION_ABORTED] = -ECONNABORTED, + [-VERR_NET_CONNECTION_RESET_BY_PEER] = -ECONNRESET, + [-VERR_NET_NO_BUFFER_SPACE] = -ENOBUFS, + [-VERR_NET_ALREADY_CONNECTED] = -EISCONN, + [-VERR_NET_NOT_CONNECTED] = -ENOTCONN, + [-VERR_NET_SHUTDOWN] = -ESHUTDOWN, + [-VERR_NET_TOO_MANY_REFERENCES] = -ETOOMANYREFS, + [-VERR_TIMEOUT] = -ETIMEDOUT, + [-VERR_NET_CONNECTION_REFUSED] = -ECONNREFUSED, + [-VERR_NET_HOST_DOWN] = -EHOSTDOWN, + [-VERR_NET_HOST_UNREACHABLE] = -EHOSTUNREACH, + [-VERR_NET_ALREADY_IN_PROGRESS] = -EALREADY, + [-VERR_NET_IN_PROGRESS] = -EINPROGRESS, + [-VERR_MEDIA_NOT_PRESENT] = -ENOMEDIUM, + [-VERR_MEDIA_NOT_RECOGNIZED] = -EMEDIUMTYPE, +}; + +int vbg_status_code_to_errno(int rc) +{ + if (rc >= 0) + return 0; + + rc = -rc; + if (rc >= ARRAY_SIZE(vbg_status_code_to_errno_table) || + vbg_status_code_to_errno_table[rc] == 0) { + vbg_warn("%s: Unhandled err %d\n", __func__, -rc); + return -EPROTO; + } + + return vbg_status_code_to_errno_table[rc]; +} +EXPORT_SYMBOL(vbg_status_code_to_errno); diff --git a/drivers/virt/vboxguest/vboxguest_version.h b/drivers/virt/vboxguest/vboxguest_version.h new file mode 100644 index 000000000..77f0c8f8a --- /dev/null +++ b/drivers/virt/vboxguest/vboxguest_version.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR CDDL-1.0) */ +/* + * VBox Guest additions version info, this is used by the host to determine + * supported guest-addition features in some cases. So this will need to be + * synced with vbox upstreams versioning scheme when we implement / port + * new features from the upstream out-of-tree vboxguest driver. + */ + +#ifndef __VBOX_VERSION_H__ +#define __VBOX_VERSION_H__ + +/* Last synced October 4th 2017 */ +#define VBG_VERSION_MAJOR 5 +#define VBG_VERSION_MINOR 2 +#define VBG_VERSION_BUILD 0 +#define VBG_SVN_REV 68940 +#define VBG_VERSION_STRING "5.2.0" + +#endif diff --git a/drivers/virt/vboxguest/vmmdev.h b/drivers/virt/vboxguest/vmmdev.h new file mode 100644 index 000000000..63733ab2a --- /dev/null +++ b/drivers/virt/vboxguest/vmmdev.h @@ -0,0 +1,451 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR CDDL-1.0) */ +/* + * Virtual Device for Guest <-> VMM/Host communication interface + * + * Copyright (C) 2006-2016 Oracle Corporation + */ + +#ifndef __VBOX_VMMDEV_H__ +#define __VBOX_VMMDEV_H__ + +#include <asm/bitsperlong.h> +#include <linux/sizes.h> +#include <linux/types.h> +#include <linux/vbox_vmmdev_types.h> + +/* Port for generic request interface (relative offset). */ +#define VMMDEV_PORT_OFF_REQUEST 0 + +/** Layout of VMMDEV RAM region that contains information for guest. */ +struct vmmdev_memory { + /** The size of this structure. */ + u32 size; + /** The structure version. (VMMDEV_MEMORY_VERSION) */ + u32 version; + + union { + struct { + /** Flag telling that VMMDev has events pending. */ + u8 have_events; + /** Explicit padding, MBZ. */ + u8 padding[3]; + } V1_04; + + struct { + /** Pending events flags, set by host. */ + u32 host_events; + /** Mask of events the guest wants, set by guest. */ + u32 guest_event_mask; + } V1_03; + } V; + + /* struct vbva_memory, not used */ +}; +VMMDEV_ASSERT_SIZE(vmmdev_memory, 8 + 8); + +/** Version of vmmdev_memory structure (vmmdev_memory::version). */ +#define VMMDEV_MEMORY_VERSION (1) + +/* Host mouse capabilities has been changed. */ +#define VMMDEV_EVENT_MOUSE_CAPABILITIES_CHANGED BIT(0) +/* HGCM event. */ +#define VMMDEV_EVENT_HGCM BIT(1) +/* A display change request has been issued. */ +#define VMMDEV_EVENT_DISPLAY_CHANGE_REQUEST BIT(2) +/* Credentials are available for judgement. */ +#define VMMDEV_EVENT_JUDGE_CREDENTIALS BIT(3) +/* The guest has been restored. */ +#define VMMDEV_EVENT_RESTORED BIT(4) +/* Seamless mode state changed. */ +#define VMMDEV_EVENT_SEAMLESS_MODE_CHANGE_REQUEST BIT(5) +/* Memory balloon size changed. */ +#define VMMDEV_EVENT_BALLOON_CHANGE_REQUEST BIT(6) +/* Statistics interval changed. */ +#define VMMDEV_EVENT_STATISTICS_INTERVAL_CHANGE_REQUEST BIT(7) +/* VRDP status changed. */ +#define VMMDEV_EVENT_VRDP BIT(8) +/* New mouse position data available. */ +#define VMMDEV_EVENT_MOUSE_POSITION_CHANGED BIT(9) +/* CPU hotplug event occurred. */ +#define VMMDEV_EVENT_CPU_HOTPLUG BIT(10) +/* The mask of valid events, for sanity checking. */ +#define VMMDEV_EVENT_VALID_EVENT_MASK 0x000007ffU + +/* + * Additions are allowed to work only if additions_major == vmmdev_current && + * additions_minor <= vmmdev_current. Additions version is reported to host + * (VMMDev) by VMMDEVREQ_REPORT_GUEST_INFO. + */ +#define VMMDEV_VERSION 0x00010004 +#define VMMDEV_VERSION_MAJOR (VMMDEV_VERSION >> 16) +#define VMMDEV_VERSION_MINOR (VMMDEV_VERSION & 0xffff) + +/* Maximum request packet size. */ +#define VMMDEV_MAX_VMMDEVREQ_SIZE 1048576 + +/* Version of vmmdev_request_header structure. */ +#define VMMDEV_REQUEST_HEADER_VERSION 0x10001 + +/** struct vmmdev_request_header - Generic VMMDev request header. */ +struct vmmdev_request_header { + /** IN: Size of the structure in bytes (including body). */ + u32 size; + /** IN: Version of the structure. */ + u32 version; + /** IN: Type of the request. */ + enum vmmdev_request_type request_type; + /** OUT: Return code. */ + s32 rc; + /** Reserved field no.1. MBZ. */ + u32 reserved1; + /** Reserved field no.2. MBZ. */ + u32 reserved2; +}; +VMMDEV_ASSERT_SIZE(vmmdev_request_header, 24); + +/** + * struct vmmdev_mouse_status - Mouse status request structure. + * + * Used by VMMDEVREQ_GET_MOUSE_STATUS and VMMDEVREQ_SET_MOUSE_STATUS. + */ +struct vmmdev_mouse_status { + /** header */ + struct vmmdev_request_header header; + /** Mouse feature mask. See VMMDEV_MOUSE_*. */ + u32 mouse_features; + /** Mouse x position. */ + s32 pointer_pos_x; + /** Mouse y position. */ + s32 pointer_pos_y; +}; +VMMDEV_ASSERT_SIZE(vmmdev_mouse_status, 24 + 12); + +/* The guest can (== wants to) handle absolute coordinates. */ +#define VMMDEV_MOUSE_GUEST_CAN_ABSOLUTE BIT(0) +/* + * The host can (== wants to) send absolute coordinates. + * (Input not captured.) + */ +#define VMMDEV_MOUSE_HOST_WANTS_ABSOLUTE BIT(1) +/* + * The guest can *NOT* switch to software cursor and therefore depends on the + * host cursor. + * + * When guest additions are installed and the host has promised to display the + * cursor itself, the guest installs a hardware mouse driver. Don't ask the + * guest to switch to a software cursor then. + */ +#define VMMDEV_MOUSE_GUEST_NEEDS_HOST_CURSOR BIT(2) +/* The host does NOT provide support for drawing the cursor itself. */ +#define VMMDEV_MOUSE_HOST_CANNOT_HWPOINTER BIT(3) +/* The guest can read VMMDev events to find out about pointer movement */ +#define VMMDEV_MOUSE_NEW_PROTOCOL BIT(4) +/* + * If the guest changes the status of the VMMDEV_MOUSE_GUEST_NEEDS_HOST_CURSOR + * bit, the host will honour this. + */ +#define VMMDEV_MOUSE_HOST_RECHECKS_NEEDS_HOST_CURSOR BIT(5) +/* + * The host supplies an absolute pointing device. The Guest Additions may + * wish to use this to decide whether to install their own driver. + */ +#define VMMDEV_MOUSE_HOST_HAS_ABS_DEV BIT(6) + +/* The minimum value our pointing device can return. */ +#define VMMDEV_MOUSE_RANGE_MIN 0 +/* The maximum value our pointing device can return. */ +#define VMMDEV_MOUSE_RANGE_MAX 0xFFFF + +/** + * struct vmmdev_host_version - VirtualBox host version request structure. + * + * VBG uses this to detect the precense of new features in the interface. + */ +struct vmmdev_host_version { + /** Header. */ + struct vmmdev_request_header header; + /** Major version. */ + u16 major; + /** Minor version. */ + u16 minor; + /** Build number. */ + u32 build; + /** SVN revision. */ + u32 revision; + /** Feature mask. */ + u32 features; +}; +VMMDEV_ASSERT_SIZE(vmmdev_host_version, 24 + 16); + +/* Physical page lists are supported by HGCM. */ +#define VMMDEV_HVF_HGCM_PHYS_PAGE_LIST BIT(0) + +/** + * struct vmmdev_mask - Structure to set / clear bits in a mask used for + * VMMDEVREQ_SET_GUEST_CAPABILITIES and VMMDEVREQ_CTL_GUEST_FILTER_MASK. + */ +struct vmmdev_mask { + /** Header. */ + struct vmmdev_request_header header; + /** Mask of bits to be set. */ + u32 or_mask; + /** Mask of bits to be cleared. */ + u32 not_mask; +}; +VMMDEV_ASSERT_SIZE(vmmdev_mask, 24 + 8); + +/* The guest supports seamless display rendering. */ +#define VMMDEV_GUEST_SUPPORTS_SEAMLESS BIT(0) +/* The guest supports mapping guest to host windows. */ +#define VMMDEV_GUEST_SUPPORTS_GUEST_HOST_WINDOW_MAPPING BIT(1) +/* + * The guest graphical additions are active. + * Used for fast activation and deactivation of certain graphical operations + * (e.g. resizing & seamless). The legacy VMMDEVREQ_REPORT_GUEST_CAPABILITIES + * request sets this automatically, but VMMDEVREQ_SET_GUEST_CAPABILITIES does + * not. + */ +#define VMMDEV_GUEST_SUPPORTS_GRAPHICS BIT(2) +/* The mask of valid capabilities, for sanity checking. */ +#define VMMDEV_GUEST_CAPABILITIES_MASK 0x00000007U + +/** struct vmmdev_hypervisorinfo - Hypervisor info structure. */ +struct vmmdev_hypervisorinfo { + /** Header. */ + struct vmmdev_request_header header; + /** + * Guest virtual address of proposed hypervisor start. + * Not used by VMMDEVREQ_GET_HYPERVISOR_INFO. + */ + u32 hypervisor_start; + /** Hypervisor size in bytes. */ + u32 hypervisor_size; +}; +VMMDEV_ASSERT_SIZE(vmmdev_hypervisorinfo, 24 + 8); + +/** struct vmmdev_events - Pending events structure. */ +struct vmmdev_events { + /** Header. */ + struct vmmdev_request_header header; + /** OUT: Pending event mask. */ + u32 events; +}; +VMMDEV_ASSERT_SIZE(vmmdev_events, 24 + 4); + +#define VMMDEV_OSTYPE_LINUX26 0x53000 +#define VMMDEV_OSTYPE_X64 BIT(8) + +/** struct vmmdev_guestinfo - Guest information report. */ +struct vmmdev_guest_info { + /** Header. */ + struct vmmdev_request_header header; + /** + * The VMMDev interface version expected by additions. + * *Deprecated*, do not use anymore! Will be removed. + */ + u32 interface_version; + /** Guest OS type. */ + u32 os_type; +}; +VMMDEV_ASSERT_SIZE(vmmdev_guest_info, 24 + 8); + +/** struct vmmdev_guestinfo2 - Guest information report, version 2. */ +struct vmmdev_guest_info2 { + /** Header. */ + struct vmmdev_request_header header; + /** Major version. */ + u16 additions_major; + /** Minor version. */ + u16 additions_minor; + /** Build number. */ + u32 additions_build; + /** SVN revision. */ + u32 additions_revision; + /** Feature mask, currently unused. */ + u32 additions_features; + /** + * The intentional meaning of this field was: + * Some additional information, for example 'Beta 1' or something like + * that. + * + * The way it was implemented was implemented: VBG_VERSION_STRING. + * + * This means the first three members are duplicated in this field (if + * the guest build config is sane). So, the user must check this and + * chop it off before usage. There is, because of the Main code's blind + * trust in the field's content, no way back. + */ + char name[128]; +}; +VMMDEV_ASSERT_SIZE(vmmdev_guest_info2, 24 + 144); + +enum vmmdev_guest_facility_type { + VBOXGUEST_FACILITY_TYPE_UNKNOWN = 0, + VBOXGUEST_FACILITY_TYPE_VBOXGUEST_DRIVER = 20, + /* VBoxGINA / VBoxCredProv / pam_vbox. */ + VBOXGUEST_FACILITY_TYPE_AUTO_LOGON = 90, + VBOXGUEST_FACILITY_TYPE_VBOX_SERVICE = 100, + /* VBoxTray (Windows), VBoxClient (Linux, Unix). */ + VBOXGUEST_FACILITY_TYPE_VBOX_TRAY_CLIENT = 101, + VBOXGUEST_FACILITY_TYPE_SEAMLESS = 1000, + VBOXGUEST_FACILITY_TYPE_GRAPHICS = 1100, + VBOXGUEST_FACILITY_TYPE_ALL = 0x7ffffffe, + /* Ensure the enum is a 32 bit data-type */ + VBOXGUEST_FACILITY_TYPE_SIZEHACK = 0x7fffffff +}; + +enum vmmdev_guest_facility_status { + VBOXGUEST_FACILITY_STATUS_INACTIVE = 0, + VBOXGUEST_FACILITY_STATUS_PAUSED = 1, + VBOXGUEST_FACILITY_STATUS_PRE_INIT = 20, + VBOXGUEST_FACILITY_STATUS_INIT = 30, + VBOXGUEST_FACILITY_STATUS_ACTIVE = 50, + VBOXGUEST_FACILITY_STATUS_TERMINATING = 100, + VBOXGUEST_FACILITY_STATUS_TERMINATED = 101, + VBOXGUEST_FACILITY_STATUS_FAILED = 800, + VBOXGUEST_FACILITY_STATUS_UNKNOWN = 999, + /* Ensure the enum is a 32 bit data-type */ + VBOXGUEST_FACILITY_STATUS_SIZEHACK = 0x7fffffff +}; + +/** struct vmmdev_guest_status - Guest Additions status structure. */ +struct vmmdev_guest_status { + /** Header. */ + struct vmmdev_request_header header; + /** Facility the status is indicated for. */ + enum vmmdev_guest_facility_type facility; + /** Current guest status. */ + enum vmmdev_guest_facility_status status; + /** Flags, not used at the moment. */ + u32 flags; +}; +VMMDEV_ASSERT_SIZE(vmmdev_guest_status, 24 + 12); + +#define VMMDEV_MEMORY_BALLOON_CHUNK_SIZE (1048576) +#define VMMDEV_MEMORY_BALLOON_CHUNK_PAGES (1048576 / 4096) + +/** struct vmmdev_memballoon_info - Memory-balloon info structure. */ +struct vmmdev_memballoon_info { + /** Header. */ + struct vmmdev_request_header header; + /** Balloon size in megabytes. */ + u32 balloon_chunks; + /** Guest ram size in megabytes. */ + u32 phys_mem_chunks; + /** + * Setting this to VMMDEV_EVENT_BALLOON_CHANGE_REQUEST indicates that + * the request is a response to that event. + * (Don't confuse this with VMMDEVREQ_ACKNOWLEDGE_EVENTS.) + */ + u32 event_ack; +}; +VMMDEV_ASSERT_SIZE(vmmdev_memballoon_info, 24 + 12); + +/** struct vmmdev_memballoon_change - Change the size of the balloon. */ +struct vmmdev_memballoon_change { + /** Header. */ + struct vmmdev_request_header header; + /** The number of pages in the array. */ + u32 pages; + /** true = inflate, false = deflate. */ + u32 inflate; + /** Physical address (u64) of each page. */ + u64 phys_page[VMMDEV_MEMORY_BALLOON_CHUNK_PAGES]; +}; + +/** struct vmmdev_write_core_dump - Write Core Dump request data. */ +struct vmmdev_write_core_dump { + /** Header. */ + struct vmmdev_request_header header; + /** Flags (reserved, MBZ). */ + u32 flags; +}; +VMMDEV_ASSERT_SIZE(vmmdev_write_core_dump, 24 + 4); + +/** struct vmmdev_heartbeat - Heart beat check state structure. */ +struct vmmdev_heartbeat { + /** Header. */ + struct vmmdev_request_header header; + /** OUT: Guest heartbeat interval in nanosec. */ + u64 interval_ns; + /** Heartbeat check flag. */ + u8 enabled; + /** Explicit padding, MBZ. */ + u8 padding[3]; +} __packed; +VMMDEV_ASSERT_SIZE(vmmdev_heartbeat, 24 + 12); + +#define VMMDEV_HGCM_REQ_DONE BIT(0) +#define VMMDEV_HGCM_REQ_CANCELLED BIT(1) + +/** struct vmmdev_hgcmreq_header - vmmdev HGCM requests header. */ +struct vmmdev_hgcmreq_header { + /** Request header. */ + struct vmmdev_request_header header; + + /** HGCM flags. */ + u32 flags; + + /** Result code. */ + s32 result; +}; +VMMDEV_ASSERT_SIZE(vmmdev_hgcmreq_header, 24 + 8); + +/** struct vmmdev_hgcm_connect - HGCM connect request structure. */ +struct vmmdev_hgcm_connect { + /** HGCM request header. */ + struct vmmdev_hgcmreq_header header; + + /** IN: Description of service to connect to. */ + struct vmmdev_hgcm_service_location loc; + + /** OUT: Client identifier assigned by local instance of HGCM. */ + u32 client_id; +}; +VMMDEV_ASSERT_SIZE(vmmdev_hgcm_connect, 32 + 132 + 4); + +/** struct vmmdev_hgcm_disconnect - HGCM disconnect request structure. */ +struct vmmdev_hgcm_disconnect { + /** HGCM request header. */ + struct vmmdev_hgcmreq_header header; + + /** IN: Client identifier. */ + u32 client_id; +}; +VMMDEV_ASSERT_SIZE(vmmdev_hgcm_disconnect, 32 + 4); + +#define VMMDEV_HGCM_MAX_PARMS 32 + +/** struct vmmdev_hgcm_call - HGCM call request structure. */ +struct vmmdev_hgcm_call { + /* request header */ + struct vmmdev_hgcmreq_header header; + + /** IN: Client identifier. */ + u32 client_id; + /** IN: Service function number. */ + u32 function; + /** IN: Number of parameters. */ + u32 parm_count; + /** Parameters follow in form: HGCMFunctionParameter32|64 parms[X]; */ +}; +VMMDEV_ASSERT_SIZE(vmmdev_hgcm_call, 32 + 12); + +/** + * struct vmmdev_hgcm_cancel2 - HGCM cancel request structure, version 2. + * + * After the request header.rc will be: + * + * VINF_SUCCESS when cancelled. + * VERR_NOT_FOUND if the specified request cannot be found. + * VERR_INVALID_PARAMETER if the address is invalid valid. + */ +struct vmmdev_hgcm_cancel2 { + /** Header. */ + struct vmmdev_request_header header; + /** The physical address of the request to cancel. */ + u32 phys_req_to_cancel; +}; +VMMDEV_ASSERT_SIZE(vmmdev_hgcm_cancel2, 24 + 4); + +#endif diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig new file mode 100644 index 000000000..35897649c --- /dev/null +++ b/drivers/virtio/Kconfig @@ -0,0 +1,86 @@ +config VIRTIO + tristate + ---help--- + This option is selected by any driver which implements the virtio + bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG + or CONFIG_S390_GUEST. + +menuconfig VIRTIO_MENU + bool "Virtio drivers" + default y + +if VIRTIO_MENU + +config VIRTIO_PCI + tristate "PCI driver for virtio devices" + depends on PCI + select VIRTIO + ---help--- + This driver provides support for virtio based paravirtual device + drivers over PCI. This requires that your VMM has appropriate PCI + virtio backends. Most QEMU based VMMs should support these devices + (like KVM or Xen). + + If unsure, say M. + +config VIRTIO_PCI_LEGACY + bool "Support for legacy virtio draft 0.9.X and older devices" + default y + depends on VIRTIO_PCI + ---help--- + Virtio PCI Card 0.9.X Draft (circa 2014) and older device support. + + This option enables building a transitional driver, supporting + both devices conforming to Virtio 1 specification, and legacy devices. + If disabled, you get a slightly smaller, non-transitional driver, + with no legacy compatibility. + + So look out into your driveway. Do you have a flying car? If + so, you can happily disable this option and virtio will not + break. Otherwise, leave it set. Unless you're testing what + life will be like in The Future. + + If unsure, say Y. + +config VIRTIO_BALLOON + tristate "Virtio balloon driver" + depends on VIRTIO + select MEMORY_BALLOON + ---help--- + This driver supports increasing and decreasing the amount + of memory within a KVM guest. + + If unsure, say M. + +config VIRTIO_INPUT + tristate "Virtio input driver" + depends on VIRTIO + depends on INPUT + ---help--- + This driver supports virtio input devices such as + keyboards, mice and tablets. + + If unsure, say M. + + config VIRTIO_MMIO + tristate "Platform bus driver for memory mapped virtio devices" + depends on HAS_IOMEM && HAS_DMA + select VIRTIO + ---help--- + This drivers provides support for memory mapped virtio + platform device driver. + + If unsure, say N. + +config VIRTIO_MMIO_CMDLINE_DEVICES + bool "Memory mapped virtio devices parameter parsing" + depends on VIRTIO_MMIO + ---help--- + Allow virtio-mmio devices instantiation via the kernel command line + or module parameters. Be aware that using incorrect parameters (base + address in particular) can crash your system - you have been warned. + See Documentation/admin-guide/kernel-parameters.rst for details. + + If unsure, say 'N'. + +endif # VIRTIO_MENU diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile new file mode 100644 index 000000000..3a2b5c5dc --- /dev/null +++ b/drivers/virtio/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o +obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o +obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o +virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o +virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o +obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o +obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c new file mode 100644 index 000000000..89935105d --- /dev/null +++ b/drivers/virtio/virtio.c @@ -0,0 +1,457 @@ +#include <linux/virtio.h> +#include <linux/spinlock.h> +#include <linux/virtio_config.h> +#include <linux/module.h> +#include <linux/idr.h> +#include <uapi/linux/virtio_ids.h> + +/* Unique numbering for virtio devices. */ +static DEFINE_IDA(virtio_index_ida); + +static ssize_t device_show(struct device *_d, + struct device_attribute *attr, char *buf) +{ + struct virtio_device *dev = dev_to_virtio(_d); + return sprintf(buf, "0x%04x\n", dev->id.device); +} +static DEVICE_ATTR_RO(device); + +static ssize_t vendor_show(struct device *_d, + struct device_attribute *attr, char *buf) +{ + struct virtio_device *dev = dev_to_virtio(_d); + return sprintf(buf, "0x%04x\n", dev->id.vendor); +} +static DEVICE_ATTR_RO(vendor); + +static ssize_t status_show(struct device *_d, + struct device_attribute *attr, char *buf) +{ + struct virtio_device *dev = dev_to_virtio(_d); + return sprintf(buf, "0x%08x\n", dev->config->get_status(dev)); +} +static DEVICE_ATTR_RO(status); + +static ssize_t modalias_show(struct device *_d, + struct device_attribute *attr, char *buf) +{ + struct virtio_device *dev = dev_to_virtio(_d); + return sprintf(buf, "virtio:d%08Xv%08X\n", + dev->id.device, dev->id.vendor); +} +static DEVICE_ATTR_RO(modalias); + +static ssize_t features_show(struct device *_d, + struct device_attribute *attr, char *buf) +{ + struct virtio_device *dev = dev_to_virtio(_d); + unsigned int i; + ssize_t len = 0; + + /* We actually represent this as a bitstring, as it could be + * arbitrary length in future. */ + for (i = 0; i < sizeof(dev->features)*8; i++) + len += sprintf(buf+len, "%c", + __virtio_test_bit(dev, i) ? '1' : '0'); + len += sprintf(buf+len, "\n"); + return len; +} +static DEVICE_ATTR_RO(features); + +static struct attribute *virtio_dev_attrs[] = { + &dev_attr_device.attr, + &dev_attr_vendor.attr, + &dev_attr_status.attr, + &dev_attr_modalias.attr, + &dev_attr_features.attr, + NULL, +}; +ATTRIBUTE_GROUPS(virtio_dev); + +static inline int virtio_id_match(const struct virtio_device *dev, + const struct virtio_device_id *id) +{ + if (id->device != dev->id.device && id->device != VIRTIO_DEV_ANY_ID) + return 0; + + return id->vendor == VIRTIO_DEV_ANY_ID || id->vendor == dev->id.vendor; +} + +/* This looks through all the IDs a driver claims to support. If any of them + * match, we return 1 and the kernel will call virtio_dev_probe(). */ +static int virtio_dev_match(struct device *_dv, struct device_driver *_dr) +{ + unsigned int i; + struct virtio_device *dev = dev_to_virtio(_dv); + const struct virtio_device_id *ids; + + ids = drv_to_virtio(_dr)->id_table; + for (i = 0; ids[i].device; i++) + if (virtio_id_match(dev, &ids[i])) + return 1; + return 0; +} + +static int virtio_uevent(struct device *_dv, struct kobj_uevent_env *env) +{ + struct virtio_device *dev = dev_to_virtio(_dv); + + return add_uevent_var(env, "MODALIAS=virtio:d%08Xv%08X", + dev->id.device, dev->id.vendor); +} + +void virtio_check_driver_offered_feature(const struct virtio_device *vdev, + unsigned int fbit) +{ + unsigned int i; + struct virtio_driver *drv = drv_to_virtio(vdev->dev.driver); + + for (i = 0; i < drv->feature_table_size; i++) + if (drv->feature_table[i] == fbit) + return; + + if (drv->feature_table_legacy) { + for (i = 0; i < drv->feature_table_size_legacy; i++) + if (drv->feature_table_legacy[i] == fbit) + return; + } + + BUG(); +} +EXPORT_SYMBOL_GPL(virtio_check_driver_offered_feature); + +static void __virtio_config_changed(struct virtio_device *dev) +{ + struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); + + if (!dev->config_enabled) + dev->config_change_pending = true; + else if (drv && drv->config_changed) + drv->config_changed(dev); +} + +void virtio_config_changed(struct virtio_device *dev) +{ + unsigned long flags; + + spin_lock_irqsave(&dev->config_lock, flags); + __virtio_config_changed(dev); + spin_unlock_irqrestore(&dev->config_lock, flags); +} +EXPORT_SYMBOL_GPL(virtio_config_changed); + +void virtio_config_disable(struct virtio_device *dev) +{ + spin_lock_irq(&dev->config_lock); + dev->config_enabled = false; + spin_unlock_irq(&dev->config_lock); +} +EXPORT_SYMBOL_GPL(virtio_config_disable); + +void virtio_config_enable(struct virtio_device *dev) +{ + spin_lock_irq(&dev->config_lock); + dev->config_enabled = true; + if (dev->config_change_pending) + __virtio_config_changed(dev); + dev->config_change_pending = false; + spin_unlock_irq(&dev->config_lock); +} +EXPORT_SYMBOL_GPL(virtio_config_enable); + +void virtio_add_status(struct virtio_device *dev, unsigned int status) +{ + dev->config->set_status(dev, dev->config->get_status(dev) | status); +} +EXPORT_SYMBOL_GPL(virtio_add_status); + +/* Do some validation, then set FEATURES_OK */ +static int virtio_features_ok(struct virtio_device *dev) +{ + unsigned status; + + if (!virtio_has_feature(dev, VIRTIO_F_VERSION_1)) + return 0; + + virtio_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK); + status = dev->config->get_status(dev); + if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) { + dev_err(&dev->dev, "virtio: device refuses features: %x\n", + status); + return -ENODEV; + } + return 0; +} + +static int virtio_dev_probe(struct device *_d) +{ + int err, i; + struct virtio_device *dev = dev_to_virtio(_d); + struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); + u64 device_features; + u64 driver_features; + u64 driver_features_legacy; + + /* We have a driver! */ + virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER); + + /* Figure out what features the device supports. */ + device_features = dev->config->get_features(dev); + + /* Figure out what features the driver supports. */ + driver_features = 0; + for (i = 0; i < drv->feature_table_size; i++) { + unsigned int f = drv->feature_table[i]; + BUG_ON(f >= 64); + driver_features |= (1ULL << f); + } + + /* Some drivers have a separate feature table for virtio v1.0 */ + if (drv->feature_table_legacy) { + driver_features_legacy = 0; + for (i = 0; i < drv->feature_table_size_legacy; i++) { + unsigned int f = drv->feature_table_legacy[i]; + BUG_ON(f >= 64); + driver_features_legacy |= (1ULL << f); + } + } else { + driver_features_legacy = driver_features; + } + + if (device_features & (1ULL << VIRTIO_F_VERSION_1)) + dev->features = driver_features & device_features; + else + dev->features = driver_features_legacy & device_features; + + /* Transport features always preserved to pass to finalize_features. */ + for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) + if (device_features & (1ULL << i)) + __virtio_set_bit(dev, i); + + err = dev->config->finalize_features(dev); + if (err) + goto err; + + if (drv->validate) { + u64 features = dev->features; + + err = drv->validate(dev); + if (err) + goto err; + + /* Did validation change any features? Then write them again. */ + if (features != dev->features) { + err = dev->config->finalize_features(dev); + if (err) + goto err; + } + } + + err = virtio_features_ok(dev); + if (err) + goto err; + + err = drv->probe(dev); + if (err) + goto err; + + /* If probe didn't do it, mark device DRIVER_OK ourselves. */ + if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK)) + virtio_device_ready(dev); + + if (drv->scan) + drv->scan(dev); + + virtio_config_enable(dev); + + return 0; +err: + virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); + return err; + +} + +static int virtio_dev_remove(struct device *_d) +{ + struct virtio_device *dev = dev_to_virtio(_d); + struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); + + virtio_config_disable(dev); + + drv->remove(dev); + + /* Driver should have reset device. */ + WARN_ON_ONCE(dev->config->get_status(dev)); + + /* Acknowledge the device's existence again. */ + virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); + return 0; +} + +static struct bus_type virtio_bus = { + .name = "virtio", + .match = virtio_dev_match, + .dev_groups = virtio_dev_groups, + .uevent = virtio_uevent, + .probe = virtio_dev_probe, + .remove = virtio_dev_remove, +}; + +int register_virtio_driver(struct virtio_driver *driver) +{ + /* Catch this early. */ + BUG_ON(driver->feature_table_size && !driver->feature_table); + driver->driver.bus = &virtio_bus; + return driver_register(&driver->driver); +} +EXPORT_SYMBOL_GPL(register_virtio_driver); + +void unregister_virtio_driver(struct virtio_driver *driver) +{ + driver_unregister(&driver->driver); +} +EXPORT_SYMBOL_GPL(unregister_virtio_driver); + +/** + * register_virtio_device - register virtio device + * @dev : virtio device to be registered + * + * On error, the caller must call put_device on &@dev->dev (and not kfree), + * as another code path may have obtained a reference to @dev. + * + * Returns: 0 on suceess, -error on failure + */ +int register_virtio_device(struct virtio_device *dev) +{ + int err; + + dev->dev.bus = &virtio_bus; + device_initialize(&dev->dev); + + /* Assign a unique device index and hence name. */ + err = ida_simple_get(&virtio_index_ida, 0, 0, GFP_KERNEL); + if (err < 0) + goto out; + + dev->index = err; + dev_set_name(&dev->dev, "virtio%u", dev->index); + + spin_lock_init(&dev->config_lock); + dev->config_enabled = false; + dev->config_change_pending = false; + + /* We always start by resetting the device, in case a previous + * driver messed it up. This also tests that code path a little. */ + dev->config->reset(dev); + + /* Acknowledge that we've seen the device. */ + virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); + + INIT_LIST_HEAD(&dev->vqs); + + /* + * device_add() causes the bus infrastructure to look for a matching + * driver. + */ + err = device_add(&dev->dev); + if (err) + ida_simple_remove(&virtio_index_ida, dev->index); +out: + if (err) + virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); + return err; +} +EXPORT_SYMBOL_GPL(register_virtio_device); + +void unregister_virtio_device(struct virtio_device *dev) +{ + int index = dev->index; /* save for after device release */ + + device_unregister(&dev->dev); + ida_simple_remove(&virtio_index_ida, index); +} +EXPORT_SYMBOL_GPL(unregister_virtio_device); + +#ifdef CONFIG_PM_SLEEP +int virtio_device_freeze(struct virtio_device *dev) +{ + struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); + + virtio_config_disable(dev); + + dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED; + + if (drv && drv->freeze) + return drv->freeze(dev); + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_device_freeze); + +int virtio_device_restore(struct virtio_device *dev) +{ + struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); + int ret; + + /* We always start by resetting the device, in case a previous + * driver messed it up. */ + dev->config->reset(dev); + + /* Acknowledge that we've seen the device. */ + virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); + + /* Maybe driver failed before freeze. + * Restore the failed status, for debugging. */ + if (dev->failed) + virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); + + if (!drv) + return 0; + + /* We have a driver! */ + virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER); + + ret = dev->config->finalize_features(dev); + if (ret) + goto err; + + ret = virtio_features_ok(dev); + if (ret) + goto err; + + if (drv->restore) { + ret = drv->restore(dev); + if (ret) + goto err; + } + + /* Finally, tell the device we're all set */ + virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK); + + virtio_config_enable(dev); + + return 0; + +err: + virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); + return ret; +} +EXPORT_SYMBOL_GPL(virtio_device_restore); +#endif + +static int virtio_init(void) +{ + if (bus_register(&virtio_bus) != 0) + panic("virtio bus registration failed"); + return 0; +} + +static void __exit virtio_exit(void) +{ + bus_unregister(&virtio_bus); + ida_destroy(&virtio_index_ida); +} +core_initcall(virtio_init); +module_exit(virtio_exit); + +MODULE_LICENSE("GPL"); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c new file mode 100644 index 000000000..1afcbef39 --- /dev/null +++ b/drivers/virtio/virtio_balloon.c @@ -0,0 +1,740 @@ +/* + * Virtio balloon implementation, inspired by Dor Laor and Marcelo + * Tosatti's implementations. + * + * Copyright 2008 Rusty Russell IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <linux/virtio.h> +#include <linux/virtio_balloon.h> +#include <linux/swap.h> +#include <linux/workqueue.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/balloon_compaction.h> +#include <linux/wait.h> +#include <linux/mm.h> +#include <linux/mount.h> +#include <linux/magic.h> + +/* + * Balloon device works in 4K page units. So each page is pointed to by + * multiple balloon pages. All memory counters in this driver are in balloon + * page units. + */ +#define VIRTIO_BALLOON_PAGES_PER_PAGE (unsigned)(PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT) +#define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256 +#define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80 + +#ifdef CONFIG_BALLOON_COMPACTION +static struct vfsmount *balloon_mnt; +#endif + +struct virtio_balloon { + struct virtio_device *vdev; + struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; + + /* The balloon servicing is delegated to a freezable workqueue. */ + struct work_struct update_balloon_stats_work; + struct work_struct update_balloon_size_work; + + /* Prevent updating balloon when it is being canceled. */ + spinlock_t stop_update_lock; + bool stop_update; + + /* Waiting for host to ack the pages we released. */ + wait_queue_head_t acked; + + /* Number of balloon pages we've told the Host we're not using. */ + unsigned int num_pages; + /* + * The pages we've told the Host we're not using are enqueued + * at vb_dev_info->pages list. + * Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE + * to num_pages above. + */ + struct balloon_dev_info vb_dev_info; + + /* Synchronize access/update to this struct virtio_balloon elements */ + struct mutex balloon_lock; + + /* The array of pfns we tell the Host about. */ + unsigned int num_pfns; + __virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX]; + + /* Memory statistics */ + struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR]; + + /* To register a shrinker to shrink memory upon memory pressure */ + struct shrinker shrinker; +}; + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_BALLOON, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static u32 page_to_balloon_pfn(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + + BUILD_BUG_ON(PAGE_SHIFT < VIRTIO_BALLOON_PFN_SHIFT); + /* Convert pfn from Linux page size to balloon page size. */ + return pfn * VIRTIO_BALLOON_PAGES_PER_PAGE; +} + +static void balloon_ack(struct virtqueue *vq) +{ + struct virtio_balloon *vb = vq->vdev->priv; + + wake_up(&vb->acked); +} + +static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) +{ + struct scatterlist sg; + unsigned int len; + + sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); + + /* We should always be able to add one buffer to an empty queue. */ + virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL); + virtqueue_kick(vq); + + /* When host has read buffer, this completes via balloon_ack */ + wait_event(vb->acked, virtqueue_get_buf(vq, &len)); + +} + +static void set_page_pfns(struct virtio_balloon *vb, + __virtio32 pfns[], struct page *page) +{ + unsigned int i; + + BUILD_BUG_ON(VIRTIO_BALLOON_PAGES_PER_PAGE > VIRTIO_BALLOON_ARRAY_PFNS_MAX); + + /* + * Set balloon pfns pointing at this page. + * Note that the first pfn points at start of the page. + */ + for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE; i++) + pfns[i] = cpu_to_virtio32(vb->vdev, + page_to_balloon_pfn(page) + i); +} + +static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) +{ + unsigned num_allocated_pages; + unsigned num_pfns; + struct page *page; + LIST_HEAD(pages); + + /* We can only do one array worth at a time. */ + num = min(num, ARRAY_SIZE(vb->pfns)); + + for (num_pfns = 0; num_pfns < num; + num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { + struct page *page = balloon_page_alloc(); + + if (!page) { + dev_info_ratelimited(&vb->vdev->dev, + "Out of puff! Can't get %u pages\n", + VIRTIO_BALLOON_PAGES_PER_PAGE); + /* Sleep for at least 1/5 of a second before retry. */ + msleep(200); + break; + } + + balloon_page_push(&pages, page); + } + + mutex_lock(&vb->balloon_lock); + + vb->num_pfns = 0; + + while ((page = balloon_page_pop(&pages))) { + balloon_page_enqueue(&vb->vb_dev_info, page); + + set_page_pfns(vb, vb->pfns + vb->num_pfns, page); + vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; + if (!virtio_has_feature(vb->vdev, + VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) + adjust_managed_page_count(page, -1); + vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE; + } + + num_allocated_pages = vb->num_pfns; + /* Did we get any? */ + if (vb->num_pfns != 0) + tell_host(vb, vb->inflate_vq); + mutex_unlock(&vb->balloon_lock); + + return num_allocated_pages; +} + +static void release_pages_balloon(struct virtio_balloon *vb, + struct list_head *pages) +{ + struct page *page, *next; + + list_for_each_entry_safe(page, next, pages, lru) { + if (!virtio_has_feature(vb->vdev, + VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) + adjust_managed_page_count(page, 1); + list_del(&page->lru); + put_page(page); /* balloon reference */ + } +} + +static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) +{ + unsigned num_freed_pages; + struct page *page; + struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; + LIST_HEAD(pages); + + /* We can only do one array worth at a time. */ + num = min(num, ARRAY_SIZE(vb->pfns)); + + mutex_lock(&vb->balloon_lock); + /* We can't release more pages than taken */ + num = min(num, (size_t)vb->num_pages); + for (vb->num_pfns = 0; vb->num_pfns < num; + vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { + page = balloon_page_dequeue(vb_dev_info); + if (!page) + break; + set_page_pfns(vb, vb->pfns + vb->num_pfns, page); + list_add(&page->lru, &pages); + vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE; + } + + num_freed_pages = vb->num_pfns; + /* + * Note that if + * virtio_has_feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST); + * is true, we *have* to do it in this order + */ + if (vb->num_pfns != 0) + tell_host(vb, vb->deflate_vq); + release_pages_balloon(vb, &pages); + mutex_unlock(&vb->balloon_lock); + return num_freed_pages; +} + +static inline void update_stat(struct virtio_balloon *vb, int idx, + u16 tag, u64 val) +{ + BUG_ON(idx >= VIRTIO_BALLOON_S_NR); + vb->stats[idx].tag = cpu_to_virtio16(vb->vdev, tag); + vb->stats[idx].val = cpu_to_virtio64(vb->vdev, val); +} + +#define pages_to_bytes(x) ((u64)(x) << PAGE_SHIFT) + +static unsigned int update_balloon_stats(struct virtio_balloon *vb) +{ + unsigned long events[NR_VM_EVENT_ITEMS]; + struct sysinfo i; + unsigned int idx = 0; + long available; + unsigned long caches; + + all_vm_events(events); + si_meminfo(&i); + + available = si_mem_available(); + caches = global_node_page_state(NR_FILE_PAGES); + +#ifdef CONFIG_VM_EVENT_COUNTERS + update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN, + pages_to_bytes(events[PSWPIN])); + update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_OUT, + pages_to_bytes(events[PSWPOUT])); + update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]); + update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]); +#ifdef CONFIG_HUGETLB_PAGE + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, + events[HTLB_BUDDY_PGALLOC]); + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL, + events[HTLB_BUDDY_PGALLOC_FAIL]); +#endif +#endif + update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE, + pages_to_bytes(i.freeram)); + update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMTOT, + pages_to_bytes(i.totalram)); + update_stat(vb, idx++, VIRTIO_BALLOON_S_AVAIL, + pages_to_bytes(available)); + update_stat(vb, idx++, VIRTIO_BALLOON_S_CACHES, + pages_to_bytes(caches)); + + return idx; +} + +/* + * While most virtqueues communicate guest-initiated requests to the hypervisor, + * the stats queue operates in reverse. The driver initializes the virtqueue + * with a single buffer. From that point forward, all conversations consist of + * a hypervisor request (a call to this function) which directs us to refill + * the virtqueue with a fresh stats buffer. Since stats collection can sleep, + * we delegate the job to a freezable workqueue that will do the actual work via + * stats_handle_request(). + */ +static void stats_request(struct virtqueue *vq) +{ + struct virtio_balloon *vb = vq->vdev->priv; + + spin_lock(&vb->stop_update_lock); + if (!vb->stop_update) + queue_work(system_freezable_wq, &vb->update_balloon_stats_work); + spin_unlock(&vb->stop_update_lock); +} + +static void stats_handle_request(struct virtio_balloon *vb) +{ + struct virtqueue *vq; + struct scatterlist sg; + unsigned int len, num_stats; + + num_stats = update_balloon_stats(vb); + + vq = vb->stats_vq; + if (!virtqueue_get_buf(vq, &len)) + return; + sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats); + virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL); + virtqueue_kick(vq); +} + +static void virtballoon_changed(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + unsigned long flags; + + spin_lock_irqsave(&vb->stop_update_lock, flags); + if (!vb->stop_update) + queue_work(system_freezable_wq, &vb->update_balloon_size_work); + spin_unlock_irqrestore(&vb->stop_update_lock, flags); +} + +static inline s64 towards_target(struct virtio_balloon *vb) +{ + s64 target; + u32 num_pages; + + virtio_cread(vb->vdev, struct virtio_balloon_config, num_pages, + &num_pages); + + /* Legacy balloon config space is LE, unlike all other devices. */ + if (!virtio_has_feature(vb->vdev, VIRTIO_F_VERSION_1)) + num_pages = le32_to_cpu((__force __le32)num_pages); + + target = num_pages; + return target - vb->num_pages; +} + +static void update_balloon_size(struct virtio_balloon *vb) +{ + u32 actual = vb->num_pages; + + /* Legacy balloon config space is LE, unlike all other devices. */ + if (!virtio_has_feature(vb->vdev, VIRTIO_F_VERSION_1)) + actual = (__force u32)cpu_to_le32(actual); + + virtio_cwrite(vb->vdev, struct virtio_balloon_config, actual, + &actual); +} + +static void update_balloon_stats_func(struct work_struct *work) +{ + struct virtio_balloon *vb; + + vb = container_of(work, struct virtio_balloon, + update_balloon_stats_work); + stats_handle_request(vb); +} + +static void update_balloon_size_func(struct work_struct *work) +{ + struct virtio_balloon *vb; + s64 diff; + + vb = container_of(work, struct virtio_balloon, + update_balloon_size_work); + diff = towards_target(vb); + + if (diff > 0) + diff -= fill_balloon(vb, diff); + else if (diff < 0) + diff += leak_balloon(vb, -diff); + update_balloon_size(vb); + + if (diff) + queue_work(system_freezable_wq, work); +} + +static int init_vqs(struct virtio_balloon *vb) +{ + struct virtqueue *vqs[3]; + vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request }; + static const char * const names[] = { "inflate", "deflate", "stats" }; + int err, nvqs; + + /* + * We expect two virtqueues: inflate and deflate, and + * optionally stat. + */ + nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2; + err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL); + if (err) + return err; + + vb->inflate_vq = vqs[0]; + vb->deflate_vq = vqs[1]; + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) { + struct scatterlist sg; + unsigned int num_stats; + vb->stats_vq = vqs[2]; + + /* + * Prime this virtqueue with one buffer so the hypervisor can + * use it to signal us later (it can't be broken yet!). + */ + num_stats = update_balloon_stats(vb); + + sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats); + err = virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, + GFP_KERNEL); + if (err) { + dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n", + __func__); + return err; + } + virtqueue_kick(vb->stats_vq); + } + return 0; +} + +#ifdef CONFIG_BALLOON_COMPACTION +/* + * virtballoon_migratepage - perform the balloon page migration on behalf of + * a compation thread. (called under page lock) + * @vb_dev_info: the balloon device + * @newpage: page that will replace the isolated page after migration finishes. + * @page : the isolated (old) page that is about to be migrated to newpage. + * @mode : compaction mode -- not used for balloon page migration. + * + * After a ballooned page gets isolated by compaction procedures, this is the + * function that performs the page migration on behalf of a compaction thread + * The page migration for virtio balloon is done in a simple swap fashion which + * follows these two macro steps: + * 1) insert newpage into vb->pages list and update the host about it; + * 2) update the host about the old page removed from vb->pages list; + * + * This function preforms the balloon page migration task. + * Called through balloon_mapping->a_ops->migratepage + */ +static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + struct virtio_balloon *vb = container_of(vb_dev_info, + struct virtio_balloon, vb_dev_info); + unsigned long flags; + + /* + * In order to avoid lock contention while migrating pages concurrently + * to leak_balloon() or fill_balloon() we just give up the balloon_lock + * this turn, as it is easier to retry the page migration later. + * This also prevents fill_balloon() getting stuck into a mutex + * recursion in the case it ends up triggering memory compaction + * while it is attempting to inflate the ballon. + */ + if (!mutex_trylock(&vb->balloon_lock)) + return -EAGAIN; + + get_page(newpage); /* balloon reference */ + + /* + * When we migrate a page to a different zone and adjusted the + * managed page count when inflating, we have to fixup the count of + * both involved zones. + */ + if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM) && + page_zone(page) != page_zone(newpage)) { + adjust_managed_page_count(page, 1); + adjust_managed_page_count(newpage, -1); + } + + /* balloon's page migration 1st step -- inflate "newpage" */ + spin_lock_irqsave(&vb_dev_info->pages_lock, flags); + balloon_page_insert(vb_dev_info, newpage); + vb_dev_info->isolated_pages--; + __count_vm_event(BALLOON_MIGRATE); + spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags); + vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; + set_page_pfns(vb, vb->pfns, newpage); + tell_host(vb, vb->inflate_vq); + + /* balloon's page migration 2nd step -- deflate "page" */ + spin_lock_irqsave(&vb_dev_info->pages_lock, flags); + balloon_page_delete(page); + spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags); + vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; + set_page_pfns(vb, vb->pfns, page); + tell_host(vb, vb->deflate_vq); + + mutex_unlock(&vb->balloon_lock); + + put_page(page); /* balloon reference */ + + return MIGRATEPAGE_SUCCESS; +} + +static struct dentry *balloon_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + static const struct dentry_operations ops = { + .d_dname = simple_dname, + }; + + return mount_pseudo(fs_type, "balloon-kvm:", NULL, &ops, + BALLOON_KVM_MAGIC); +} + +static struct file_system_type balloon_fs = { + .name = "balloon-kvm", + .mount = balloon_mount, + .kill_sb = kill_anon_super, +}; + +#endif /* CONFIG_BALLOON_COMPACTION */ + +static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long pages_to_free, pages_freed = 0; + struct virtio_balloon *vb = container_of(shrinker, + struct virtio_balloon, shrinker); + + pages_to_free = sc->nr_to_scan * VIRTIO_BALLOON_PAGES_PER_PAGE; + + /* + * One invocation of leak_balloon can deflate at most + * VIRTIO_BALLOON_ARRAY_PFNS_MAX balloon pages, so we call it + * multiple times to deflate pages till reaching pages_to_free. + */ + while (vb->num_pages && pages_to_free) { + pages_to_free -= pages_freed; + pages_freed += leak_balloon(vb, pages_to_free); + } + update_balloon_size(vb); + + return pages_freed / VIRTIO_BALLOON_PAGES_PER_PAGE; +} + +static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct virtio_balloon *vb = container_of(shrinker, + struct virtio_balloon, shrinker); + + return vb->num_pages / VIRTIO_BALLOON_PAGES_PER_PAGE; +} + +static void virtio_balloon_unregister_shrinker(struct virtio_balloon *vb) +{ + unregister_shrinker(&vb->shrinker); +} + +static int virtio_balloon_register_shrinker(struct virtio_balloon *vb) +{ + vb->shrinker.scan_objects = virtio_balloon_shrinker_scan; + vb->shrinker.count_objects = virtio_balloon_shrinker_count; + vb->shrinker.seeks = DEFAULT_SEEKS; + + return register_shrinker(&vb->shrinker); +} + +static int virtballoon_probe(struct virtio_device *vdev) +{ + struct virtio_balloon *vb; + int err; + + if (!vdev->config->get) { + dev_err(&vdev->dev, "%s failure: config access disabled\n", + __func__); + return -EINVAL; + } + + vdev->priv = vb = kzalloc(sizeof(*vb), GFP_KERNEL); + if (!vb) { + err = -ENOMEM; + goto out; + } + + INIT_WORK(&vb->update_balloon_stats_work, update_balloon_stats_func); + INIT_WORK(&vb->update_balloon_size_work, update_balloon_size_func); + spin_lock_init(&vb->stop_update_lock); + mutex_init(&vb->balloon_lock); + init_waitqueue_head(&vb->acked); + vb->vdev = vdev; + + balloon_devinfo_init(&vb->vb_dev_info); + + err = init_vqs(vb); + if (err) + goto out_free_vb; + +#ifdef CONFIG_BALLOON_COMPACTION + balloon_mnt = kern_mount(&balloon_fs); + if (IS_ERR(balloon_mnt)) { + err = PTR_ERR(balloon_mnt); + goto out_del_vqs; + } + + vb->vb_dev_info.migratepage = virtballoon_migratepage; + vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb); + if (IS_ERR(vb->vb_dev_info.inode)) { + err = PTR_ERR(vb->vb_dev_info.inode); + kern_unmount(balloon_mnt); + goto out_del_vqs; + } + vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops; +#endif + /* + * We continue to use VIRTIO_BALLOON_F_DEFLATE_ON_OOM to decide if a + * shrinker needs to be registered to relieve memory pressure. + */ + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) { + err = virtio_balloon_register_shrinker(vb); + if (err) + goto out_del_vqs; + } + virtio_device_ready(vdev); + + if (towards_target(vb)) + virtballoon_changed(vdev); + return 0; + +out_del_vqs: + vdev->config->del_vqs(vdev); +out_free_vb: + kfree(vb); +out: + return err; +} + +static void remove_common(struct virtio_balloon *vb) +{ + /* There might be pages left in the balloon: free them. */ + while (vb->num_pages) + leak_balloon(vb, vb->num_pages); + update_balloon_size(vb); + + /* Now we reset the device so we can clean up the queues. */ + vb->vdev->config->reset(vb->vdev); + + vb->vdev->config->del_vqs(vb->vdev); +} + +static void virtballoon_remove(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) + virtio_balloon_unregister_shrinker(vb); + spin_lock_irq(&vb->stop_update_lock); + vb->stop_update = true; + spin_unlock_irq(&vb->stop_update_lock); + cancel_work_sync(&vb->update_balloon_size_work); + cancel_work_sync(&vb->update_balloon_stats_work); + + remove_common(vb); +#ifdef CONFIG_BALLOON_COMPACTION + if (vb->vb_dev_info.inode) + iput(vb->vb_dev_info.inode); + + kern_unmount(balloon_mnt); +#endif + kfree(vb); +} + +#ifdef CONFIG_PM_SLEEP +static int virtballoon_freeze(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + + /* + * The workqueue is already frozen by the PM core before this + * function is called. + */ + remove_common(vb); + return 0; +} + +static int virtballoon_restore(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + int ret; + + ret = init_vqs(vdev->priv); + if (ret) + return ret; + + virtio_device_ready(vdev); + + if (towards_target(vb)) + virtballoon_changed(vdev); + update_balloon_size(vb); + return 0; +} +#endif + +static int virtballoon_validate(struct virtio_device *vdev) +{ + __virtio_clear_bit(vdev, VIRTIO_F_IOMMU_PLATFORM); + return 0; +} + +static unsigned int features[] = { + VIRTIO_BALLOON_F_MUST_TELL_HOST, + VIRTIO_BALLOON_F_STATS_VQ, + VIRTIO_BALLOON_F_DEFLATE_ON_OOM, +}; + +static struct virtio_driver virtio_balloon_driver = { + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .validate = virtballoon_validate, + .probe = virtballoon_probe, + .remove = virtballoon_remove, + .config_changed = virtballoon_changed, +#ifdef CONFIG_PM_SLEEP + .freeze = virtballoon_freeze, + .restore = virtballoon_restore, +#endif +}; + +module_virtio_driver(virtio_balloon_driver); +MODULE_DEVICE_TABLE(virtio, id_table); +MODULE_DESCRIPTION("Virtio balloon driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/virtio/virtio_input.c b/drivers/virtio/virtio_input.c new file mode 100644 index 000000000..3a0468f2c --- /dev/null +++ b/drivers/virtio/virtio_input.c @@ -0,0 +1,388 @@ +#include <linux/module.h> +#include <linux/virtio.h> +#include <linux/virtio_config.h> +#include <linux/input.h> + +#include <uapi/linux/virtio_ids.h> +#include <uapi/linux/virtio_input.h> + +struct virtio_input { + struct virtio_device *vdev; + struct input_dev *idev; + char name[64]; + char serial[64]; + char phys[64]; + struct virtqueue *evt, *sts; + struct virtio_input_event evts[64]; + spinlock_t lock; + bool ready; +}; + +static void virtinput_queue_evtbuf(struct virtio_input *vi, + struct virtio_input_event *evtbuf) +{ + struct scatterlist sg[1]; + + sg_init_one(sg, evtbuf, sizeof(*evtbuf)); + virtqueue_add_inbuf(vi->evt, sg, 1, evtbuf, GFP_ATOMIC); +} + +static void virtinput_recv_events(struct virtqueue *vq) +{ + struct virtio_input *vi = vq->vdev->priv; + struct virtio_input_event *event; + unsigned long flags; + unsigned int len; + + spin_lock_irqsave(&vi->lock, flags); + if (vi->ready) { + while ((event = virtqueue_get_buf(vi->evt, &len)) != NULL) { + spin_unlock_irqrestore(&vi->lock, flags); + input_event(vi->idev, + le16_to_cpu(event->type), + le16_to_cpu(event->code), + le32_to_cpu(event->value)); + spin_lock_irqsave(&vi->lock, flags); + virtinput_queue_evtbuf(vi, event); + } + virtqueue_kick(vq); + } + spin_unlock_irqrestore(&vi->lock, flags); +} + +/* + * On error we are losing the status update, which isn't critical as + * this is typically used for stuff like keyboard leds. + */ +static int virtinput_send_status(struct virtio_input *vi, + u16 type, u16 code, s32 value) +{ + struct virtio_input_event *stsbuf; + struct scatterlist sg[1]; + unsigned long flags; + int rc; + + stsbuf = kzalloc(sizeof(*stsbuf), GFP_ATOMIC); + if (!stsbuf) + return -ENOMEM; + + stsbuf->type = cpu_to_le16(type); + stsbuf->code = cpu_to_le16(code); + stsbuf->value = cpu_to_le32(value); + sg_init_one(sg, stsbuf, sizeof(*stsbuf)); + + spin_lock_irqsave(&vi->lock, flags); + if (vi->ready) { + rc = virtqueue_add_outbuf(vi->sts, sg, 1, stsbuf, GFP_ATOMIC); + virtqueue_kick(vi->sts); + } else { + rc = -ENODEV; + } + spin_unlock_irqrestore(&vi->lock, flags); + + if (rc != 0) + kfree(stsbuf); + return rc; +} + +static void virtinput_recv_status(struct virtqueue *vq) +{ + struct virtio_input *vi = vq->vdev->priv; + struct virtio_input_event *stsbuf; + unsigned long flags; + unsigned int len; + + spin_lock_irqsave(&vi->lock, flags); + while ((stsbuf = virtqueue_get_buf(vi->sts, &len)) != NULL) + kfree(stsbuf); + spin_unlock_irqrestore(&vi->lock, flags); +} + +static int virtinput_status(struct input_dev *idev, unsigned int type, + unsigned int code, int value) +{ + struct virtio_input *vi = input_get_drvdata(idev); + + return virtinput_send_status(vi, type, code, value); +} + +static u8 virtinput_cfg_select(struct virtio_input *vi, + u8 select, u8 subsel) +{ + u8 size; + + virtio_cwrite(vi->vdev, struct virtio_input_config, select, &select); + virtio_cwrite(vi->vdev, struct virtio_input_config, subsel, &subsel); + virtio_cread(vi->vdev, struct virtio_input_config, size, &size); + return size; +} + +static void virtinput_cfg_bits(struct virtio_input *vi, int select, int subsel, + unsigned long *bits, unsigned int bitcount) +{ + unsigned int bit; + u8 *virtio_bits; + u8 bytes; + + bytes = virtinput_cfg_select(vi, select, subsel); + if (!bytes) + return; + if (bitcount > bytes * 8) + bitcount = bytes * 8; + + /* + * Bitmap in virtio config space is a simple stream of bytes, + * with the first byte carrying bits 0-7, second bits 8-15 and + * so on. + */ + virtio_bits = kzalloc(bytes, GFP_KERNEL); + if (!virtio_bits) + return; + virtio_cread_bytes(vi->vdev, offsetof(struct virtio_input_config, + u.bitmap), + virtio_bits, bytes); + for (bit = 0; bit < bitcount; bit++) { + if (virtio_bits[bit / 8] & (1 << (bit % 8))) + __set_bit(bit, bits); + } + kfree(virtio_bits); + + if (select == VIRTIO_INPUT_CFG_EV_BITS) + __set_bit(subsel, vi->idev->evbit); +} + +static void virtinput_cfg_abs(struct virtio_input *vi, int abs) +{ + u32 mi, ma, re, fu, fl; + + virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_ABS_INFO, abs); + virtio_cread(vi->vdev, struct virtio_input_config, u.abs.min, &mi); + virtio_cread(vi->vdev, struct virtio_input_config, u.abs.max, &ma); + virtio_cread(vi->vdev, struct virtio_input_config, u.abs.res, &re); + virtio_cread(vi->vdev, struct virtio_input_config, u.abs.fuzz, &fu); + virtio_cread(vi->vdev, struct virtio_input_config, u.abs.flat, &fl); + input_set_abs_params(vi->idev, abs, mi, ma, fu, fl); + input_abs_set_res(vi->idev, abs, re); +} + +static int virtinput_init_vqs(struct virtio_input *vi) +{ + struct virtqueue *vqs[2]; + vq_callback_t *cbs[] = { virtinput_recv_events, + virtinput_recv_status }; + static const char * const names[] = { "events", "status" }; + int err; + + err = virtio_find_vqs(vi->vdev, 2, vqs, cbs, names, NULL); + if (err) + return err; + vi->evt = vqs[0]; + vi->sts = vqs[1]; + + return 0; +} + +static void virtinput_fill_evt(struct virtio_input *vi) +{ + unsigned long flags; + int i, size; + + spin_lock_irqsave(&vi->lock, flags); + size = virtqueue_get_vring_size(vi->evt); + if (size > ARRAY_SIZE(vi->evts)) + size = ARRAY_SIZE(vi->evts); + for (i = 0; i < size; i++) + virtinput_queue_evtbuf(vi, &vi->evts[i]); + virtqueue_kick(vi->evt); + spin_unlock_irqrestore(&vi->lock, flags); +} + +static int virtinput_probe(struct virtio_device *vdev) +{ + struct virtio_input *vi; + unsigned long flags; + size_t size; + int abs, err; + + if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) + return -ENODEV; + + vi = kzalloc(sizeof(*vi), GFP_KERNEL); + if (!vi) + return -ENOMEM; + + vdev->priv = vi; + vi->vdev = vdev; + spin_lock_init(&vi->lock); + + err = virtinput_init_vqs(vi); + if (err) + goto err_init_vq; + + vi->idev = input_allocate_device(); + if (!vi->idev) { + err = -ENOMEM; + goto err_input_alloc; + } + input_set_drvdata(vi->idev, vi); + + size = virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_ID_NAME, 0); + virtio_cread_bytes(vi->vdev, offsetof(struct virtio_input_config, + u.string), + vi->name, min(size, sizeof(vi->name))); + size = virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_ID_SERIAL, 0); + virtio_cread_bytes(vi->vdev, offsetof(struct virtio_input_config, + u.string), + vi->serial, min(size, sizeof(vi->serial))); + snprintf(vi->phys, sizeof(vi->phys), + "virtio%d/input0", vdev->index); + vi->idev->name = vi->name; + vi->idev->phys = vi->phys; + vi->idev->uniq = vi->serial; + + size = virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_ID_DEVIDS, 0); + if (size >= sizeof(struct virtio_input_devids)) { + virtio_cread(vi->vdev, struct virtio_input_config, + u.ids.bustype, &vi->idev->id.bustype); + virtio_cread(vi->vdev, struct virtio_input_config, + u.ids.vendor, &vi->idev->id.vendor); + virtio_cread(vi->vdev, struct virtio_input_config, + u.ids.product, &vi->idev->id.product); + virtio_cread(vi->vdev, struct virtio_input_config, + u.ids.version, &vi->idev->id.version); + } else { + vi->idev->id.bustype = BUS_VIRTUAL; + } + + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_PROP_BITS, 0, + vi->idev->propbit, INPUT_PROP_CNT); + size = virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_REP); + if (size) + __set_bit(EV_REP, vi->idev->evbit); + + vi->idev->dev.parent = &vdev->dev; + vi->idev->event = virtinput_status; + + /* device -> kernel */ + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_KEY, + vi->idev->keybit, KEY_CNT); + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_REL, + vi->idev->relbit, REL_CNT); + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_ABS, + vi->idev->absbit, ABS_CNT); + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_MSC, + vi->idev->mscbit, MSC_CNT); + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_SW, + vi->idev->swbit, SW_CNT); + + /* kernel -> device */ + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_LED, + vi->idev->ledbit, LED_CNT); + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_SND, + vi->idev->sndbit, SND_CNT); + + if (test_bit(EV_ABS, vi->idev->evbit)) { + for (abs = 0; abs < ABS_CNT; abs++) { + if (!test_bit(abs, vi->idev->absbit)) + continue; + virtinput_cfg_abs(vi, abs); + } + } + + virtio_device_ready(vdev); + vi->ready = true; + err = input_register_device(vi->idev); + if (err) + goto err_input_register; + + virtinput_fill_evt(vi); + return 0; + +err_input_register: + spin_lock_irqsave(&vi->lock, flags); + vi->ready = false; + spin_unlock_irqrestore(&vi->lock, flags); + input_free_device(vi->idev); +err_input_alloc: + vdev->config->del_vqs(vdev); +err_init_vq: + kfree(vi); + return err; +} + +static void virtinput_remove(struct virtio_device *vdev) +{ + struct virtio_input *vi = vdev->priv; + void *buf; + unsigned long flags; + + spin_lock_irqsave(&vi->lock, flags); + vi->ready = false; + spin_unlock_irqrestore(&vi->lock, flags); + + input_unregister_device(vi->idev); + vdev->config->reset(vdev); + while ((buf = virtqueue_detach_unused_buf(vi->sts)) != NULL) + kfree(buf); + vdev->config->del_vqs(vdev); + kfree(vi); +} + +#ifdef CONFIG_PM_SLEEP +static int virtinput_freeze(struct virtio_device *vdev) +{ + struct virtio_input *vi = vdev->priv; + unsigned long flags; + + spin_lock_irqsave(&vi->lock, flags); + vi->ready = false; + spin_unlock_irqrestore(&vi->lock, flags); + + vdev->config->del_vqs(vdev); + return 0; +} + +static int virtinput_restore(struct virtio_device *vdev) +{ + struct virtio_input *vi = vdev->priv; + int err; + + err = virtinput_init_vqs(vi); + if (err) + return err; + + virtio_device_ready(vdev); + vi->ready = true; + virtinput_fill_evt(vi); + return 0; +} +#endif + +static unsigned int features[] = { + /* none */ +}; +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_INPUT, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static struct virtio_driver virtio_input_driver = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .id_table = id_table, + .probe = virtinput_probe, + .remove = virtinput_remove, +#ifdef CONFIG_PM_SLEEP + .freeze = virtinput_freeze, + .restore = virtinput_restore, +#endif +}; + +module_virtio_driver(virtio_input_driver); +MODULE_DEVICE_TABLE(virtio, id_table); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Virtio input device driver"); +MODULE_AUTHOR("Gerd Hoffmann <kraxel@redhat.com>"); diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c new file mode 100644 index 000000000..c69c755bf --- /dev/null +++ b/drivers/virtio/virtio_mmio.c @@ -0,0 +1,783 @@ +/* + * Virtio memory mapped device driver + * + * Copyright 2011-2014, ARM Ltd. + * + * This module allows virtio devices to be used over a virtual, memory mapped + * platform device. + * + * The guest device(s) may be instantiated in one of three equivalent ways: + * + * 1. Static platform device in board's code, eg.: + * + * static struct platform_device v2m_virtio_device = { + * .name = "virtio-mmio", + * .id = -1, + * .num_resources = 2, + * .resource = (struct resource []) { + * { + * .start = 0x1001e000, + * .end = 0x1001e0ff, + * .flags = IORESOURCE_MEM, + * }, { + * .start = 42 + 32, + * .end = 42 + 32, + * .flags = IORESOURCE_IRQ, + * }, + * } + * }; + * + * 2. Device Tree node, eg.: + * + * virtio_block@1e000 { + * compatible = "virtio,mmio"; + * reg = <0x1e000 0x100>; + * interrupts = <42>; + * } + * + * 3. Kernel module (or command line) parameter. Can be used more than once - + * one device will be created for each one. Syntax: + * + * [virtio_mmio.]device=<size>@<baseaddr>:<irq>[:<id>] + * where: + * <size> := size (can use standard suffixes like K, M or G) + * <baseaddr> := physical base address + * <irq> := interrupt number (as passed to request_irq()) + * <id> := (optional) platform device id + * eg.: + * virtio_mmio.device=0x100@0x100b0000:48 \ + * virtio_mmio.device=1K@0x1001e000:74 + * + * + * + * Based on Virtio PCI driver by Anthony Liguori, copyright IBM Corp. 2007 + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#define pr_fmt(fmt) "virtio-mmio: " fmt + +#include <linux/acpi.h> +#include <linux/dma-mapping.h> +#include <linux/highmem.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/virtio.h> +#include <linux/virtio_config.h> +#include <uapi/linux/virtio_mmio.h> +#include <linux/virtio_ring.h> + + + +/* The alignment to use between consumer and producer parts of vring. + * Currently hardcoded to the page size. */ +#define VIRTIO_MMIO_VRING_ALIGN PAGE_SIZE + + + +#define to_virtio_mmio_device(_plat_dev) \ + container_of(_plat_dev, struct virtio_mmio_device, vdev) + +struct virtio_mmio_device { + struct virtio_device vdev; + struct platform_device *pdev; + + void __iomem *base; + unsigned long version; + + /* a list of queues so we can dispatch IRQs */ + spinlock_t lock; + struct list_head virtqueues; +}; + +struct virtio_mmio_vq_info { + /* the actual virtqueue */ + struct virtqueue *vq; + + /* the list node for the virtqueues list */ + struct list_head node; +}; + + + +/* Configuration interface */ + +static u64 vm_get_features(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + u64 features; + + writel(1, vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES_SEL); + features = readl(vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES); + features <<= 32; + + writel(0, vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES_SEL); + features |= readl(vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES); + + return features; +} + +static int vm_finalize_features(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + + /* Give virtio_ring a chance to accept features. */ + vring_transport_features(vdev); + + /* Make sure there is are no mixed devices */ + if (vm_dev->version == 2 && + !__virtio_test_bit(vdev, VIRTIO_F_VERSION_1)) { + dev_err(&vdev->dev, "New virtio-mmio devices (version 2) must provide VIRTIO_F_VERSION_1 feature!\n"); + return -EINVAL; + } + + writel(1, vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES_SEL); + writel((u32)(vdev->features >> 32), + vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES); + + writel(0, vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES_SEL); + writel((u32)vdev->features, + vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES); + + return 0; +} + +static void vm_get(struct virtio_device *vdev, unsigned offset, + void *buf, unsigned len) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + void __iomem *base = vm_dev->base + VIRTIO_MMIO_CONFIG; + u8 b; + __le16 w; + __le32 l; + + if (vm_dev->version == 1) { + u8 *ptr = buf; + int i; + + for (i = 0; i < len; i++) + ptr[i] = readb(base + offset + i); + return; + } + + switch (len) { + case 1: + b = readb(base + offset); + memcpy(buf, &b, sizeof b); + break; + case 2: + w = cpu_to_le16(readw(base + offset)); + memcpy(buf, &w, sizeof w); + break; + case 4: + l = cpu_to_le32(readl(base + offset)); + memcpy(buf, &l, sizeof l); + break; + case 8: + l = cpu_to_le32(readl(base + offset)); + memcpy(buf, &l, sizeof l); + l = cpu_to_le32(ioread32(base + offset + sizeof l)); + memcpy(buf + sizeof l, &l, sizeof l); + break; + default: + BUG(); + } +} + +static void vm_set(struct virtio_device *vdev, unsigned offset, + const void *buf, unsigned len) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + void __iomem *base = vm_dev->base + VIRTIO_MMIO_CONFIG; + u8 b; + __le16 w; + __le32 l; + + if (vm_dev->version == 1) { + const u8 *ptr = buf; + int i; + + for (i = 0; i < len; i++) + writeb(ptr[i], base + offset + i); + + return; + } + + switch (len) { + case 1: + memcpy(&b, buf, sizeof b); + writeb(b, base + offset); + break; + case 2: + memcpy(&w, buf, sizeof w); + writew(le16_to_cpu(w), base + offset); + break; + case 4: + memcpy(&l, buf, sizeof l); + writel(le32_to_cpu(l), base + offset); + break; + case 8: + memcpy(&l, buf, sizeof l); + writel(le32_to_cpu(l), base + offset); + memcpy(&l, buf + sizeof l, sizeof l); + writel(le32_to_cpu(l), base + offset + sizeof l); + break; + default: + BUG(); + } +} + +static u32 vm_generation(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + + if (vm_dev->version == 1) + return 0; + else + return readl(vm_dev->base + VIRTIO_MMIO_CONFIG_GENERATION); +} + +static u8 vm_get_status(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + + return readl(vm_dev->base + VIRTIO_MMIO_STATUS) & 0xff; +} + +static void vm_set_status(struct virtio_device *vdev, u8 status) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + + /* We should never be setting status to 0. */ + BUG_ON(status == 0); + + writel(status, vm_dev->base + VIRTIO_MMIO_STATUS); +} + +static void vm_reset(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + + /* 0 status means a reset. */ + writel(0, vm_dev->base + VIRTIO_MMIO_STATUS); +} + + + +/* Transport interface */ + +/* the notify function used when creating a virt queue */ +static bool vm_notify(struct virtqueue *vq) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); + + /* We write the queue's selector into the notification register to + * signal the other end */ + writel(vq->index, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY); + return true; +} + +/* Notify all virtqueues on an interrupt. */ +static irqreturn_t vm_interrupt(int irq, void *opaque) +{ + struct virtio_mmio_device *vm_dev = opaque; + struct virtio_mmio_vq_info *info; + unsigned long status; + unsigned long flags; + irqreturn_t ret = IRQ_NONE; + + /* Read and acknowledge interrupts */ + status = readl(vm_dev->base + VIRTIO_MMIO_INTERRUPT_STATUS); + writel(status, vm_dev->base + VIRTIO_MMIO_INTERRUPT_ACK); + + if (unlikely(status & VIRTIO_MMIO_INT_CONFIG)) { + virtio_config_changed(&vm_dev->vdev); + ret = IRQ_HANDLED; + } + + if (likely(status & VIRTIO_MMIO_INT_VRING)) { + spin_lock_irqsave(&vm_dev->lock, flags); + list_for_each_entry(info, &vm_dev->virtqueues, node) + ret |= vring_interrupt(irq, info->vq); + spin_unlock_irqrestore(&vm_dev->lock, flags); + } + + return ret; +} + + + +static void vm_del_vq(struct virtqueue *vq) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); + struct virtio_mmio_vq_info *info = vq->priv; + unsigned long flags; + unsigned int index = vq->index; + + spin_lock_irqsave(&vm_dev->lock, flags); + list_del(&info->node); + spin_unlock_irqrestore(&vm_dev->lock, flags); + + /* Select and deactivate the queue */ + writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); + if (vm_dev->version == 1) { + writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); + } else { + writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); + WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY)); + } + + vring_del_virtqueue(vq); + + kfree(info); +} + +static void vm_del_vqs(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + struct virtqueue *vq, *n; + + list_for_each_entry_safe(vq, n, &vdev->vqs, list) + vm_del_vq(vq); + + free_irq(platform_get_irq(vm_dev->pdev, 0), vm_dev); +} + +static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, + void (*callback)(struct virtqueue *vq), + const char *name, bool ctx) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + struct virtio_mmio_vq_info *info; + struct virtqueue *vq; + unsigned long flags; + unsigned int num; + int err; + + if (!name) + return NULL; + + /* Select the queue we're interested in */ + writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); + + /* Queue shouldn't already be set up. */ + if (readl(vm_dev->base + (vm_dev->version == 1 ? + VIRTIO_MMIO_QUEUE_PFN : VIRTIO_MMIO_QUEUE_READY))) { + err = -ENOENT; + goto error_available; + } + + /* Allocate and fill out our active queue description */ + info = kmalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + err = -ENOMEM; + goto error_kmalloc; + } + + num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX); + if (num == 0) { + err = -ENOENT; + goto error_new_virtqueue; + } + + /* Create the vring */ + vq = vring_create_virtqueue(index, num, VIRTIO_MMIO_VRING_ALIGN, vdev, + true, true, ctx, vm_notify, callback, name); + if (!vq) { + err = -ENOMEM; + goto error_new_virtqueue; + } + + /* Activate the queue */ + writel(virtqueue_get_vring_size(vq), vm_dev->base + VIRTIO_MMIO_QUEUE_NUM); + if (vm_dev->version == 1) { + u64 q_pfn = virtqueue_get_desc_addr(vq) >> PAGE_SHIFT; + + /* + * virtio-mmio v1 uses a 32bit QUEUE PFN. If we have something + * that doesn't fit in 32bit, fail the setup rather than + * pretending to be successful. + */ + if (q_pfn >> 32) { + dev_err(&vdev->dev, + "platform bug: legacy virtio-mmio must not be used with RAM above 0x%llxGB\n", + 0x1ULL << (32 + PAGE_SHIFT - 30)); + err = -E2BIG; + goto error_bad_pfn; + } + + writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_QUEUE_ALIGN); + writel(q_pfn, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); + } else { + u64 addr; + + addr = virtqueue_get_desc_addr(vq); + writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_LOW); + writel((u32)(addr >> 32), + vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_HIGH); + + addr = virtqueue_get_avail_addr(vq); + writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_LOW); + writel((u32)(addr >> 32), + vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_HIGH); + + addr = virtqueue_get_used_addr(vq); + writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_USED_LOW); + writel((u32)(addr >> 32), + vm_dev->base + VIRTIO_MMIO_QUEUE_USED_HIGH); + + writel(1, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); + } + + vq->priv = info; + info->vq = vq; + + spin_lock_irqsave(&vm_dev->lock, flags); + list_add(&info->node, &vm_dev->virtqueues); + spin_unlock_irqrestore(&vm_dev->lock, flags); + + return vq; + +error_bad_pfn: + vring_del_virtqueue(vq); +error_new_virtqueue: + if (vm_dev->version == 1) { + writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); + } else { + writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); + WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY)); + } + kfree(info); +error_kmalloc: +error_available: + return ERR_PTR(err); +} + +static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char * const names[], + const bool *ctx, + struct irq_affinity *desc) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + unsigned int irq = platform_get_irq(vm_dev->pdev, 0); + int i, err; + + err = request_irq(irq, vm_interrupt, IRQF_SHARED, + dev_name(&vdev->dev), vm_dev); + if (err) + return err; + + for (i = 0; i < nvqs; ++i) { + vqs[i] = vm_setup_vq(vdev, i, callbacks[i], names[i], + ctx ? ctx[i] : false); + if (IS_ERR(vqs[i])) { + vm_del_vqs(vdev); + return PTR_ERR(vqs[i]); + } + } + + return 0; +} + +static const char *vm_bus_name(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + + return vm_dev->pdev->name; +} + +static const struct virtio_config_ops virtio_mmio_config_ops = { + .get = vm_get, + .set = vm_set, + .generation = vm_generation, + .get_status = vm_get_status, + .set_status = vm_set_status, + .reset = vm_reset, + .find_vqs = vm_find_vqs, + .del_vqs = vm_del_vqs, + .get_features = vm_get_features, + .finalize_features = vm_finalize_features, + .bus_name = vm_bus_name, +}; + + +static void virtio_mmio_release_dev(struct device *_d) +{ + struct virtio_device *vdev = + container_of(_d, struct virtio_device, dev); + struct virtio_mmio_device *vm_dev = + container_of(vdev, struct virtio_mmio_device, vdev); + struct platform_device *pdev = vm_dev->pdev; + + devm_kfree(&pdev->dev, vm_dev); +} + +/* Platform device */ + +static int virtio_mmio_probe(struct platform_device *pdev) +{ + struct virtio_mmio_device *vm_dev; + struct resource *mem; + unsigned long magic; + int rc; + + mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!mem) + return -EINVAL; + + if (!devm_request_mem_region(&pdev->dev, mem->start, + resource_size(mem), pdev->name)) + return -EBUSY; + + vm_dev = devm_kzalloc(&pdev->dev, sizeof(*vm_dev), GFP_KERNEL); + if (!vm_dev) + return -ENOMEM; + + vm_dev->vdev.dev.parent = &pdev->dev; + vm_dev->vdev.dev.release = virtio_mmio_release_dev; + vm_dev->vdev.config = &virtio_mmio_config_ops; + vm_dev->pdev = pdev; + INIT_LIST_HEAD(&vm_dev->virtqueues); + spin_lock_init(&vm_dev->lock); + + vm_dev->base = devm_ioremap(&pdev->dev, mem->start, resource_size(mem)); + if (vm_dev->base == NULL) + return -EFAULT; + + /* Check magic value */ + magic = readl(vm_dev->base + VIRTIO_MMIO_MAGIC_VALUE); + if (magic != ('v' | 'i' << 8 | 'r' << 16 | 't' << 24)) { + dev_warn(&pdev->dev, "Wrong magic value 0x%08lx!\n", magic); + return -ENODEV; + } + + /* Check device version */ + vm_dev->version = readl(vm_dev->base + VIRTIO_MMIO_VERSION); + if (vm_dev->version < 1 || vm_dev->version > 2) { + dev_err(&pdev->dev, "Version %ld not supported!\n", + vm_dev->version); + return -ENXIO; + } + + vm_dev->vdev.id.device = readl(vm_dev->base + VIRTIO_MMIO_DEVICE_ID); + if (vm_dev->vdev.id.device == 0) { + /* + * virtio-mmio device with an ID 0 is a (dummy) placeholder + * with no function. End probing now with no error reported. + */ + return -ENODEV; + } + vm_dev->vdev.id.vendor = readl(vm_dev->base + VIRTIO_MMIO_VENDOR_ID); + + if (vm_dev->version == 1) { + writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_GUEST_PAGE_SIZE); + + rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); + /* + * In the legacy case, ensure our coherently-allocated virtio + * ring will be at an address expressable as a 32-bit PFN. + */ + if (!rc) + dma_set_coherent_mask(&pdev->dev, + DMA_BIT_MASK(32 + PAGE_SHIFT)); + } else { + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + } + if (rc) + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (rc) + dev_warn(&pdev->dev, "Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.\n"); + + platform_set_drvdata(pdev, vm_dev); + + rc = register_virtio_device(&vm_dev->vdev); + if (rc) + put_device(&vm_dev->vdev.dev); + + return rc; +} + +static int virtio_mmio_remove(struct platform_device *pdev) +{ + struct virtio_mmio_device *vm_dev = platform_get_drvdata(pdev); + unregister_virtio_device(&vm_dev->vdev); + + return 0; +} + + + +/* Devices list parameter */ + +#if defined(CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES) + +static struct device vm_cmdline_parent = { + .init_name = "virtio-mmio-cmdline", +}; + +static int vm_cmdline_parent_registered; +static int vm_cmdline_id; + +static int vm_cmdline_set(const char *device, + const struct kernel_param *kp) +{ + int err; + struct resource resources[2] = {}; + char *str; + long long int base, size; + unsigned int irq; + int processed, consumed = 0; + struct platform_device *pdev; + + /* Consume "size" part of the command line parameter */ + size = memparse(device, &str); + + /* Get "@<base>:<irq>[:<id>]" chunks */ + processed = sscanf(str, "@%lli:%u%n:%d%n", + &base, &irq, &consumed, + &vm_cmdline_id, &consumed); + + /* + * sscanf() must processes at least 2 chunks; also there + * must be no extra characters after the last chunk, so + * str[consumed] must be '\0' + */ + if (processed < 2 || str[consumed]) + return -EINVAL; + + resources[0].flags = IORESOURCE_MEM; + resources[0].start = base; + resources[0].end = base + size - 1; + + resources[1].flags = IORESOURCE_IRQ; + resources[1].start = resources[1].end = irq; + + if (!vm_cmdline_parent_registered) { + err = device_register(&vm_cmdline_parent); + if (err) { + put_device(&vm_cmdline_parent); + pr_err("Failed to register parent device!\n"); + return err; + } + vm_cmdline_parent_registered = 1; + } + + pr_info("Registering device virtio-mmio.%d at 0x%llx-0x%llx, IRQ %d.\n", + vm_cmdline_id, + (unsigned long long)resources[0].start, + (unsigned long long)resources[0].end, + (int)resources[1].start); + + pdev = platform_device_register_resndata(&vm_cmdline_parent, + "virtio-mmio", vm_cmdline_id++, + resources, ARRAY_SIZE(resources), NULL, 0); + + return PTR_ERR_OR_ZERO(pdev); +} + +static int vm_cmdline_get_device(struct device *dev, void *data) +{ + char *buffer = data; + unsigned int len = strlen(buffer); + struct platform_device *pdev = to_platform_device(dev); + + snprintf(buffer + len, PAGE_SIZE - len, "0x%llx@0x%llx:%llu:%d\n", + pdev->resource[0].end - pdev->resource[0].start + 1ULL, + (unsigned long long)pdev->resource[0].start, + (unsigned long long)pdev->resource[1].start, + pdev->id); + return 0; +} + +static int vm_cmdline_get(char *buffer, const struct kernel_param *kp) +{ + buffer[0] = '\0'; + device_for_each_child(&vm_cmdline_parent, buffer, + vm_cmdline_get_device); + return strlen(buffer) + 1; +} + +static const struct kernel_param_ops vm_cmdline_param_ops = { + .set = vm_cmdline_set, + .get = vm_cmdline_get, +}; + +device_param_cb(device, &vm_cmdline_param_ops, NULL, S_IRUSR); + +static int vm_unregister_cmdline_device(struct device *dev, + void *data) +{ + platform_device_unregister(to_platform_device(dev)); + + return 0; +} + +static void vm_unregister_cmdline_devices(void) +{ + if (vm_cmdline_parent_registered) { + device_for_each_child(&vm_cmdline_parent, NULL, + vm_unregister_cmdline_device); + device_unregister(&vm_cmdline_parent); + vm_cmdline_parent_registered = 0; + } +} + +#else + +static void vm_unregister_cmdline_devices(void) +{ +} + +#endif + +/* Platform driver */ + +static const struct of_device_id virtio_mmio_match[] = { + { .compatible = "virtio,mmio", }, + {}, +}; +MODULE_DEVICE_TABLE(of, virtio_mmio_match); + +#ifdef CONFIG_ACPI +static const struct acpi_device_id virtio_mmio_acpi_match[] = { + { "LNRO0005", }, + { } +}; +MODULE_DEVICE_TABLE(acpi, virtio_mmio_acpi_match); +#endif + +static struct platform_driver virtio_mmio_driver = { + .probe = virtio_mmio_probe, + .remove = virtio_mmio_remove, + .driver = { + .name = "virtio-mmio", + .of_match_table = virtio_mmio_match, + .acpi_match_table = ACPI_PTR(virtio_mmio_acpi_match), + }, +}; + +static int __init virtio_mmio_init(void) +{ + return platform_driver_register(&virtio_mmio_driver); +} + +static void __exit virtio_mmio_exit(void) +{ + platform_driver_unregister(&virtio_mmio_driver); + vm_unregister_cmdline_devices(); +} + +module_init(virtio_mmio_init); +module_exit(virtio_mmio_exit); + +MODULE_AUTHOR("Pawel Moll <pawel.moll@arm.com>"); +MODULE_DESCRIPTION("Platform bus driver for memory mapped virtio devices"); +MODULE_LICENSE("GPL"); diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c new file mode 100644 index 000000000..40618ccff --- /dev/null +++ b/drivers/virtio/virtio_pci_common.c @@ -0,0 +1,644 @@ +/* + * Virtio PCI driver - common functionality for all device versions + * + * This module allows virtio devices to be used over a virtual PCI device. + * This can be used with QEMU based VMMs like KVM or Xen. + * + * Copyright IBM Corp. 2007 + * Copyright Red Hat, Inc. 2014 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Rusty Russell <rusty@rustcorp.com.au> + * Michael S. Tsirkin <mst@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "virtio_pci_common.h" + +static bool force_legacy = false; + +#if IS_ENABLED(CONFIG_VIRTIO_PCI_LEGACY) +module_param(force_legacy, bool, 0444); +MODULE_PARM_DESC(force_legacy, + "Force legacy mode for transitional virtio 1 devices"); +#endif + +/* wait for pending irq handlers */ +void vp_synchronize_vectors(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + int i; + + if (vp_dev->intx_enabled) + synchronize_irq(vp_dev->pci_dev->irq); + + for (i = 0; i < vp_dev->msix_vectors; ++i) + synchronize_irq(pci_irq_vector(vp_dev->pci_dev, i)); +} + +/* the notify function used when creating a virt queue */ +bool vp_notify(struct virtqueue *vq) +{ + /* we write the queue's selector into the notification register to + * signal the other end */ + iowrite16(vq->index, (void __iomem *)vq->priv); + return true; +} + +/* Handle a configuration change: Tell driver if it wants to know. */ +static irqreturn_t vp_config_changed(int irq, void *opaque) +{ + struct virtio_pci_device *vp_dev = opaque; + + virtio_config_changed(&vp_dev->vdev); + return IRQ_HANDLED; +} + +/* Notify all virtqueues on an interrupt. */ +static irqreturn_t vp_vring_interrupt(int irq, void *opaque) +{ + struct virtio_pci_device *vp_dev = opaque; + struct virtio_pci_vq_info *info; + irqreturn_t ret = IRQ_NONE; + unsigned long flags; + + spin_lock_irqsave(&vp_dev->lock, flags); + list_for_each_entry(info, &vp_dev->virtqueues, node) { + if (vring_interrupt(irq, info->vq) == IRQ_HANDLED) + ret = IRQ_HANDLED; + } + spin_unlock_irqrestore(&vp_dev->lock, flags); + + return ret; +} + +/* A small wrapper to also acknowledge the interrupt when it's handled. + * I really need an EIO hook for the vring so I can ack the interrupt once we + * know that we'll be handling the IRQ but before we invoke the callback since + * the callback may notify the host which results in the host attempting to + * raise an interrupt that we would then mask once we acknowledged the + * interrupt. */ +static irqreturn_t vp_interrupt(int irq, void *opaque) +{ + struct virtio_pci_device *vp_dev = opaque; + u8 isr; + + /* reading the ISR has the effect of also clearing it so it's very + * important to save off the value. */ + isr = ioread8(vp_dev->isr); + + /* It's definitely not us if the ISR was not high */ + if (!isr) + return IRQ_NONE; + + /* Configuration change? Tell driver if it wants to know. */ + if (isr & VIRTIO_PCI_ISR_CONFIG) + vp_config_changed(irq, opaque); + + return vp_vring_interrupt(irq, opaque); +} + +static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, + bool per_vq_vectors, struct irq_affinity *desc) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + const char *name = dev_name(&vp_dev->vdev.dev); + unsigned flags = PCI_IRQ_MSIX; + unsigned i, v; + int err = -ENOMEM; + + vp_dev->msix_vectors = nvectors; + + vp_dev->msix_names = kmalloc_array(nvectors, + sizeof(*vp_dev->msix_names), + GFP_KERNEL); + if (!vp_dev->msix_names) + goto error; + vp_dev->msix_affinity_masks + = kcalloc(nvectors, sizeof(*vp_dev->msix_affinity_masks), + GFP_KERNEL); + if (!vp_dev->msix_affinity_masks) + goto error; + for (i = 0; i < nvectors; ++i) + if (!alloc_cpumask_var(&vp_dev->msix_affinity_masks[i], + GFP_KERNEL)) + goto error; + + if (desc) { + flags |= PCI_IRQ_AFFINITY; + desc->pre_vectors++; /* virtio config vector */ + } + + err = pci_alloc_irq_vectors_affinity(vp_dev->pci_dev, nvectors, + nvectors, flags, desc); + if (err < 0) + goto error; + vp_dev->msix_enabled = 1; + + /* Set the vector used for configuration */ + v = vp_dev->msix_used_vectors; + snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, + "%s-config", name); + err = request_irq(pci_irq_vector(vp_dev->pci_dev, v), + vp_config_changed, 0, vp_dev->msix_names[v], + vp_dev); + if (err) + goto error; + ++vp_dev->msix_used_vectors; + + v = vp_dev->config_vector(vp_dev, v); + /* Verify we had enough resources to assign the vector */ + if (v == VIRTIO_MSI_NO_VECTOR) { + err = -EBUSY; + goto error; + } + + if (!per_vq_vectors) { + /* Shared vector for all VQs */ + v = vp_dev->msix_used_vectors; + snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, + "%s-virtqueues", name); + err = request_irq(pci_irq_vector(vp_dev->pci_dev, v), + vp_vring_interrupt, 0, vp_dev->msix_names[v], + vp_dev); + if (err) + goto error; + ++vp_dev->msix_used_vectors; + } + return 0; +error: + return err; +} + +static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned index, + void (*callback)(struct virtqueue *vq), + const char *name, + bool ctx, + u16 msix_vec) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_vq_info *info = kmalloc(sizeof *info, GFP_KERNEL); + struct virtqueue *vq; + unsigned long flags; + + /* fill out our structure that represents an active queue */ + if (!info) + return ERR_PTR(-ENOMEM); + + vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, ctx, + msix_vec); + if (IS_ERR(vq)) + goto out_info; + + info->vq = vq; + if (callback) { + spin_lock_irqsave(&vp_dev->lock, flags); + list_add(&info->node, &vp_dev->virtqueues); + spin_unlock_irqrestore(&vp_dev->lock, flags); + } else { + INIT_LIST_HEAD(&info->node); + } + + vp_dev->vqs[index] = info; + return vq; + +out_info: + kfree(info); + return vq; +} + +static void vp_del_vq(struct virtqueue *vq) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index]; + unsigned long flags; + + spin_lock_irqsave(&vp_dev->lock, flags); + list_del(&info->node); + spin_unlock_irqrestore(&vp_dev->lock, flags); + + vp_dev->del_vq(info); + kfree(info); +} + +/* the config->del_vqs() implementation */ +void vp_del_vqs(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtqueue *vq, *n; + int i; + + list_for_each_entry_safe(vq, n, &vdev->vqs, list) { + if (vp_dev->per_vq_vectors) { + int v = vp_dev->vqs[vq->index]->msix_vector; + + if (v != VIRTIO_MSI_NO_VECTOR) { + int irq = pci_irq_vector(vp_dev->pci_dev, v); + + irq_set_affinity_hint(irq, NULL); + free_irq(irq, vq); + } + } + vp_del_vq(vq); + } + vp_dev->per_vq_vectors = false; + + if (vp_dev->intx_enabled) { + free_irq(vp_dev->pci_dev->irq, vp_dev); + vp_dev->intx_enabled = 0; + } + + for (i = 0; i < vp_dev->msix_used_vectors; ++i) + free_irq(pci_irq_vector(vp_dev->pci_dev, i), vp_dev); + + if (vp_dev->msix_affinity_masks) { + for (i = 0; i < vp_dev->msix_vectors; i++) + free_cpumask_var(vp_dev->msix_affinity_masks[i]); + } + + if (vp_dev->msix_enabled) { + /* Disable the vector used for configuration */ + vp_dev->config_vector(vp_dev, VIRTIO_MSI_NO_VECTOR); + + pci_free_irq_vectors(vp_dev->pci_dev); + vp_dev->msix_enabled = 0; + } + + vp_dev->msix_vectors = 0; + vp_dev->msix_used_vectors = 0; + kfree(vp_dev->msix_names); + vp_dev->msix_names = NULL; + kfree(vp_dev->msix_affinity_masks); + vp_dev->msix_affinity_masks = NULL; + kfree(vp_dev->vqs); + vp_dev->vqs = NULL; +} + +static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], vq_callback_t *callbacks[], + const char * const names[], bool per_vq_vectors, + const bool *ctx, + struct irq_affinity *desc) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + u16 msix_vec; + int i, err, nvectors, allocated_vectors; + + vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); + if (!vp_dev->vqs) + return -ENOMEM; + + if (per_vq_vectors) { + /* Best option: one for change interrupt, one per vq. */ + nvectors = 1; + for (i = 0; i < nvqs; ++i) + if (callbacks[i]) + ++nvectors; + } else { + /* Second best: one for change, shared for all vqs. */ + nvectors = 2; + } + + err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors, + per_vq_vectors ? desc : NULL); + if (err) + goto error_find; + + vp_dev->per_vq_vectors = per_vq_vectors; + allocated_vectors = vp_dev->msix_used_vectors; + for (i = 0; i < nvqs; ++i) { + if (!names[i]) { + vqs[i] = NULL; + continue; + } + + if (!callbacks[i]) + msix_vec = VIRTIO_MSI_NO_VECTOR; + else if (vp_dev->per_vq_vectors) + msix_vec = allocated_vectors++; + else + msix_vec = VP_MSIX_VQ_VECTOR; + vqs[i] = vp_setup_vq(vdev, i, callbacks[i], names[i], + ctx ? ctx[i] : false, + msix_vec); + if (IS_ERR(vqs[i])) { + err = PTR_ERR(vqs[i]); + goto error_find; + } + + if (!vp_dev->per_vq_vectors || msix_vec == VIRTIO_MSI_NO_VECTOR) + continue; + + /* allocate per-vq irq if available and necessary */ + snprintf(vp_dev->msix_names[msix_vec], + sizeof *vp_dev->msix_names, + "%s-%s", + dev_name(&vp_dev->vdev.dev), names[i]); + err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec), + vring_interrupt, 0, + vp_dev->msix_names[msix_vec], + vqs[i]); + if (err) + goto error_find; + } + return 0; + +error_find: + vp_del_vqs(vdev); + return err; +} + +static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], vq_callback_t *callbacks[], + const char * const names[], const bool *ctx) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + int i, err; + + vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); + if (!vp_dev->vqs) + return -ENOMEM; + + err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED, + dev_name(&vdev->dev), vp_dev); + if (err) + goto out_del_vqs; + + vp_dev->intx_enabled = 1; + vp_dev->per_vq_vectors = false; + for (i = 0; i < nvqs; ++i) { + if (!names[i]) { + vqs[i] = NULL; + continue; + } + vqs[i] = vp_setup_vq(vdev, i, callbacks[i], names[i], + ctx ? ctx[i] : false, + VIRTIO_MSI_NO_VECTOR); + if (IS_ERR(vqs[i])) { + err = PTR_ERR(vqs[i]); + goto out_del_vqs; + } + } + + return 0; +out_del_vqs: + vp_del_vqs(vdev); + return err; +} + +/* the config->find_vqs() implementation */ +int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], vq_callback_t *callbacks[], + const char * const names[], const bool *ctx, + struct irq_affinity *desc) +{ + int err; + + /* Try MSI-X with one vector per queue. */ + err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, true, ctx, desc); + if (!err) + return 0; + /* Fallback: MSI-X with one vector for config, one shared for queues. */ + err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, false, ctx, desc); + if (!err) + return 0; + /* Finally fall back to regular interrupts. */ + return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names, ctx); +} + +const char *vp_bus_name(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + return pci_name(vp_dev->pci_dev); +} + +/* Setup the affinity for a virtqueue: + * - force the affinity for per vq vector + * - OR over all affinities for shared MSI + * - ignore the affinity request if we're using INTX + */ +int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) +{ + struct virtio_device *vdev = vq->vdev; + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index]; + struct cpumask *mask; + unsigned int irq; + + if (!vq->callback) + return -EINVAL; + + if (vp_dev->msix_enabled) { + mask = vp_dev->msix_affinity_masks[info->msix_vector]; + irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector); + if (!cpu_mask) + irq_set_affinity_hint(irq, NULL); + else { + cpumask_copy(mask, cpu_mask); + irq_set_affinity_hint(irq, mask); + } + } + return 0; +} + +const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + if (!vp_dev->per_vq_vectors || + vp_dev->vqs[index]->msix_vector == VIRTIO_MSI_NO_VECTOR) + return NULL; + + return pci_irq_get_affinity(vp_dev->pci_dev, + vp_dev->vqs[index]->msix_vector); +} + +#ifdef CONFIG_PM_SLEEP +static int virtio_pci_freeze(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + int ret; + + ret = virtio_device_freeze(&vp_dev->vdev); + + if (!ret) + pci_disable_device(pci_dev); + return ret; +} + +static int virtio_pci_restore(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + int ret; + + ret = pci_enable_device(pci_dev); + if (ret) + return ret; + + pci_set_master(pci_dev); + return virtio_device_restore(&vp_dev->vdev); +} + +static const struct dev_pm_ops virtio_pci_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(virtio_pci_freeze, virtio_pci_restore) +}; +#endif + + +/* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */ +static const struct pci_device_id virtio_pci_id_table[] = { + { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) }, + { 0 } +}; + +MODULE_DEVICE_TABLE(pci, virtio_pci_id_table); + +static void virtio_pci_release_dev(struct device *_d) +{ + struct virtio_device *vdev = dev_to_virtio(_d); + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + /* As struct device is a kobject, it's not safe to + * free the memory (including the reference counter itself) + * until it's release callback. */ + kfree(vp_dev); +} + +static int virtio_pci_probe(struct pci_dev *pci_dev, + const struct pci_device_id *id) +{ + struct virtio_pci_device *vp_dev, *reg_dev = NULL; + int rc; + + /* allocate our structure and fill it out */ + vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL); + if (!vp_dev) + return -ENOMEM; + + pci_set_drvdata(pci_dev, vp_dev); + vp_dev->vdev.dev.parent = &pci_dev->dev; + vp_dev->vdev.dev.release = virtio_pci_release_dev; + vp_dev->pci_dev = pci_dev; + INIT_LIST_HEAD(&vp_dev->virtqueues); + spin_lock_init(&vp_dev->lock); + + /* enable the device */ + rc = pci_enable_device(pci_dev); + if (rc) + goto err_enable_device; + + if (force_legacy) { + rc = virtio_pci_legacy_probe(vp_dev); + /* Also try modern mode if we can't map BAR0 (no IO space). */ + if (rc == -ENODEV || rc == -ENOMEM) + rc = virtio_pci_modern_probe(vp_dev); + if (rc) + goto err_probe; + } else { + rc = virtio_pci_modern_probe(vp_dev); + if (rc == -ENODEV) + rc = virtio_pci_legacy_probe(vp_dev); + if (rc) + goto err_probe; + } + + pci_set_master(pci_dev); + + rc = register_virtio_device(&vp_dev->vdev); + reg_dev = vp_dev; + if (rc) + goto err_register; + + return 0; + +err_register: + if (vp_dev->ioaddr) + virtio_pci_legacy_remove(vp_dev); + else + virtio_pci_modern_remove(vp_dev); +err_probe: + pci_disable_device(pci_dev); +err_enable_device: + if (reg_dev) + put_device(&vp_dev->vdev.dev); + else + kfree(vp_dev); + return rc; +} + +static void virtio_pci_remove(struct pci_dev *pci_dev) +{ + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + struct device *dev = get_device(&vp_dev->vdev.dev); + + /* + * Device is marked broken on surprise removal so that virtio upper + * layers can abort any ongoing operation. + */ + if (!pci_device_is_present(pci_dev)) + virtio_break_device(&vp_dev->vdev); + + pci_disable_sriov(pci_dev); + + unregister_virtio_device(&vp_dev->vdev); + + if (vp_dev->ioaddr) + virtio_pci_legacy_remove(vp_dev); + else + virtio_pci_modern_remove(vp_dev); + + pci_disable_device(pci_dev); + put_device(dev); +} + +static int virtio_pci_sriov_configure(struct pci_dev *pci_dev, int num_vfs) +{ + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + struct virtio_device *vdev = &vp_dev->vdev; + int ret; + + if (!(vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK)) + return -EBUSY; + + if (!__virtio_test_bit(vdev, VIRTIO_F_SR_IOV)) + return -EINVAL; + + if (pci_vfs_assigned(pci_dev)) + return -EPERM; + + if (num_vfs == 0) { + pci_disable_sriov(pci_dev); + return 0; + } + + ret = pci_enable_sriov(pci_dev, num_vfs); + if (ret < 0) + return ret; + + return num_vfs; +} + +static struct pci_driver virtio_pci_driver = { + .name = "virtio-pci", + .id_table = virtio_pci_id_table, + .probe = virtio_pci_probe, + .remove = virtio_pci_remove, +#ifdef CONFIG_PM_SLEEP + .driver.pm = &virtio_pci_pm_ops, +#endif + .sriov_configure = virtio_pci_sriov_configure, +}; + +module_pci_driver(virtio_pci_driver); + +MODULE_AUTHOR("Anthony Liguori <aliguori@us.ibm.com>"); +MODULE_DESCRIPTION("virtio-pci"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1"); diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h new file mode 100644 index 000000000..02271002c --- /dev/null +++ b/drivers/virtio/virtio_pci_common.h @@ -0,0 +1,163 @@ +#ifndef _DRIVERS_VIRTIO_VIRTIO_PCI_COMMON_H +#define _DRIVERS_VIRTIO_VIRTIO_PCI_COMMON_H +/* + * Virtio PCI driver - APIs for common functionality for all device versions + * + * This module allows virtio devices to be used over a virtual PCI device. + * This can be used with QEMU based VMMs like KVM or Xen. + * + * Copyright IBM Corp. 2007 + * Copyright Red Hat, Inc. 2014 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Rusty Russell <rusty@rustcorp.com.au> + * Michael S. Tsirkin <mst@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include <linux/module.h> +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/virtio.h> +#include <linux/virtio_config.h> +#include <linux/virtio_ring.h> +#include <linux/virtio_pci.h> +#include <linux/highmem.h> +#include <linux/spinlock.h> + +struct virtio_pci_vq_info { + /* the actual virtqueue */ + struct virtqueue *vq; + + /* the list node for the virtqueues list */ + struct list_head node; + + /* MSI-X vector (or none) */ + unsigned msix_vector; +}; + +/* Our device structure */ +struct virtio_pci_device { + struct virtio_device vdev; + struct pci_dev *pci_dev; + + /* In legacy mode, these two point to within ->legacy. */ + /* Where to read and clear interrupt */ + u8 __iomem *isr; + + /* Modern only fields */ + /* The IO mapping for the PCI config space (non-legacy mode) */ + struct virtio_pci_common_cfg __iomem *common; + /* Device-specific data (non-legacy mode) */ + void __iomem *device; + /* Base of vq notifications (non-legacy mode). */ + void __iomem *notify_base; + + /* So we can sanity-check accesses. */ + size_t notify_len; + size_t device_len; + + /* Capability for when we need to map notifications per-vq. */ + int notify_map_cap; + + /* Multiply queue_notify_off by this value. (non-legacy mode). */ + u32 notify_offset_multiplier; + + int modern_bars; + + /* Legacy only field */ + /* the IO mapping for the PCI config space */ + void __iomem *ioaddr; + + /* a list of queues so we can dispatch IRQs */ + spinlock_t lock; + struct list_head virtqueues; + + /* array of all queues for house-keeping */ + struct virtio_pci_vq_info **vqs; + + /* MSI-X support */ + int msix_enabled; + int intx_enabled; + cpumask_var_t *msix_affinity_masks; + /* Name strings for interrupts. This size should be enough, + * and I'm too lazy to allocate each name separately. */ + char (*msix_names)[256]; + /* Number of available vectors */ + unsigned msix_vectors; + /* Vectors allocated, excluding per-vq vectors if any */ + unsigned msix_used_vectors; + + /* Whether we have vector per vq */ + bool per_vq_vectors; + + struct virtqueue *(*setup_vq)(struct virtio_pci_device *vp_dev, + struct virtio_pci_vq_info *info, + unsigned idx, + void (*callback)(struct virtqueue *vq), + const char *name, + bool ctx, + u16 msix_vec); + void (*del_vq)(struct virtio_pci_vq_info *info); + + u16 (*config_vector)(struct virtio_pci_device *vp_dev, u16 vector); +}; + +/* Constants for MSI-X */ +/* Use first vector for configuration changes, second and the rest for + * virtqueues Thus, we need at least 2 vectors for MSI. */ +enum { + VP_MSIX_CONFIG_VECTOR = 0, + VP_MSIX_VQ_VECTOR = 1, +}; + +/* Convert a generic virtio device to our structure */ +static struct virtio_pci_device *to_vp_device(struct virtio_device *vdev) +{ + return container_of(vdev, struct virtio_pci_device, vdev); +} + +/* wait for pending irq handlers */ +void vp_synchronize_vectors(struct virtio_device *vdev); +/* the notify function used when creating a virt queue */ +bool vp_notify(struct virtqueue *vq); +/* the config->del_vqs() implementation */ +void vp_del_vqs(struct virtio_device *vdev); +/* the config->find_vqs() implementation */ +int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], vq_callback_t *callbacks[], + const char * const names[], const bool *ctx, + struct irq_affinity *desc); +const char *vp_bus_name(struct virtio_device *vdev); + +/* Setup the affinity for a virtqueue: + * - force the affinity for per vq vector + * - OR over all affinities for shared MSI + * - ignore the affinity request if we're using INTX + */ +int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask); + +const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index); + +#if IS_ENABLED(CONFIG_VIRTIO_PCI_LEGACY) +int virtio_pci_legacy_probe(struct virtio_pci_device *); +void virtio_pci_legacy_remove(struct virtio_pci_device *); +#else +static inline int virtio_pci_legacy_probe(struct virtio_pci_device *vp_dev) +{ + return -ENODEV; +} +static inline void virtio_pci_legacy_remove(struct virtio_pci_device *vp_dev) +{ +} +#endif +int virtio_pci_modern_probe(struct virtio_pci_device *); +void virtio_pci_modern_remove(struct virtio_pci_device *); + +#endif diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c new file mode 100644 index 000000000..de062fb20 --- /dev/null +++ b/drivers/virtio/virtio_pci_legacy.c @@ -0,0 +1,280 @@ +/* + * Virtio PCI driver - legacy device support + * + * This module allows virtio devices to be used over a virtual PCI device. + * This can be used with QEMU based VMMs like KVM or Xen. + * + * Copyright IBM Corp. 2007 + * Copyright Red Hat, Inc. 2014 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Rusty Russell <rusty@rustcorp.com.au> + * Michael S. Tsirkin <mst@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "virtio_pci_common.h" + +/* virtio config->get_features() implementation */ +static u64 vp_get_features(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + /* When someone needs more than 32 feature bits, we'll need to + * steal a bit to indicate that the rest are somewhere else. */ + return ioread32(vp_dev->ioaddr + VIRTIO_PCI_HOST_FEATURES); +} + +/* virtio config->finalize_features() implementation */ +static int vp_finalize_features(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + /* Give virtio_ring a chance to accept features. */ + vring_transport_features(vdev); + + /* Make sure we don't have any features > 32 bits! */ + BUG_ON((u32)vdev->features != vdev->features); + + /* We only support 32 feature bits. */ + iowrite32(vdev->features, vp_dev->ioaddr + VIRTIO_PCI_GUEST_FEATURES); + + return 0; +} + +/* virtio config->get() implementation */ +static void vp_get(struct virtio_device *vdev, unsigned offset, + void *buf, unsigned len) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + void __iomem *ioaddr = vp_dev->ioaddr + + VIRTIO_PCI_CONFIG(vp_dev) + offset; + u8 *ptr = buf; + int i; + + for (i = 0; i < len; i++) + ptr[i] = ioread8(ioaddr + i); +} + +/* the config->set() implementation. it's symmetric to the config->get() + * implementation */ +static void vp_set(struct virtio_device *vdev, unsigned offset, + const void *buf, unsigned len) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + void __iomem *ioaddr = vp_dev->ioaddr + + VIRTIO_PCI_CONFIG(vp_dev) + offset; + const u8 *ptr = buf; + int i; + + for (i = 0; i < len; i++) + iowrite8(ptr[i], ioaddr + i); +} + +/* config->{get,set}_status() implementations */ +static u8 vp_get_status(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + return ioread8(vp_dev->ioaddr + VIRTIO_PCI_STATUS); +} + +static void vp_set_status(struct virtio_device *vdev, u8 status) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + /* We should never be setting status to 0. */ + BUG_ON(status == 0); + iowrite8(status, vp_dev->ioaddr + VIRTIO_PCI_STATUS); +} + +static void vp_reset(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + /* 0 status means a reset. */ + iowrite8(0, vp_dev->ioaddr + VIRTIO_PCI_STATUS); + /* Flush out the status write, and flush in device writes, + * including MSi-X interrupts, if any. */ + ioread8(vp_dev->ioaddr + VIRTIO_PCI_STATUS); + /* Flush pending VQ/configuration callbacks. */ + vp_synchronize_vectors(vdev); +} + +static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector) +{ + /* Setup the vector used for configuration events */ + iowrite16(vector, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); + /* Verify we had enough resources to assign the vector */ + /* Will also flush the write out to device */ + return ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); +} + +static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, + struct virtio_pci_vq_info *info, + unsigned index, + void (*callback)(struct virtqueue *vq), + const char *name, + bool ctx, + u16 msix_vec) +{ + struct virtqueue *vq; + u16 num; + int err; + u64 q_pfn; + + /* Select the queue we're interested in */ + iowrite16(index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); + + /* Check if queue is either not available or already active. */ + num = ioread16(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NUM); + if (!num || ioread32(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN)) + return ERR_PTR(-ENOENT); + + info->msix_vector = msix_vec; + + /* create the vring */ + vq = vring_create_virtqueue(index, num, + VIRTIO_PCI_VRING_ALIGN, &vp_dev->vdev, + true, false, ctx, + vp_notify, callback, name); + if (!vq) + return ERR_PTR(-ENOMEM); + + q_pfn = virtqueue_get_desc_addr(vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT; + if (q_pfn >> 32) { + dev_err(&vp_dev->pci_dev->dev, + "platform bug: legacy virtio-mmio must not be used with RAM above 0x%llxGB\n", + 0x1ULL << (32 + PAGE_SHIFT - 30)); + err = -E2BIG; + goto out_del_vq; + } + + /* activate the queue */ + iowrite32(q_pfn, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); + + vq->priv = (void __force *)vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY; + + if (msix_vec != VIRTIO_MSI_NO_VECTOR) { + iowrite16(msix_vec, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); + msix_vec = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); + if (msix_vec == VIRTIO_MSI_NO_VECTOR) { + err = -EBUSY; + goto out_deactivate; + } + } + + return vq; + +out_deactivate: + iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); +out_del_vq: + vring_del_virtqueue(vq); + return ERR_PTR(err); +} + +static void del_vq(struct virtio_pci_vq_info *info) +{ + struct virtqueue *vq = info->vq; + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + + iowrite16(vq->index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); + + if (vp_dev->msix_enabled) { + iowrite16(VIRTIO_MSI_NO_VECTOR, + vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); + /* Flush the write out to device */ + ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR); + } + + /* Select and deactivate the queue */ + iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); + + vring_del_virtqueue(vq); +} + +static const struct virtio_config_ops virtio_pci_config_ops = { + .get = vp_get, + .set = vp_set, + .get_status = vp_get_status, + .set_status = vp_set_status, + .reset = vp_reset, + .find_vqs = vp_find_vqs, + .del_vqs = vp_del_vqs, + .get_features = vp_get_features, + .finalize_features = vp_finalize_features, + .bus_name = vp_bus_name, + .set_vq_affinity = vp_set_vq_affinity, + .get_vq_affinity = vp_get_vq_affinity, +}; + +/* the PCI probing function */ +int virtio_pci_legacy_probe(struct virtio_pci_device *vp_dev) +{ + struct pci_dev *pci_dev = vp_dev->pci_dev; + int rc; + + /* We only own devices >= 0x1000 and <= 0x103f: leave the rest. */ + if (pci_dev->device < 0x1000 || pci_dev->device > 0x103f) + return -ENODEV; + + if (pci_dev->revision != VIRTIO_PCI_ABI_VERSION) { + printk(KERN_ERR "virtio_pci: expected ABI version %d, got %d\n", + VIRTIO_PCI_ABI_VERSION, pci_dev->revision); + return -ENODEV; + } + + rc = dma_set_mask(&pci_dev->dev, DMA_BIT_MASK(64)); + if (rc) { + rc = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(32)); + } else { + /* + * The virtio ring base address is expressed as a 32-bit PFN, + * with a page size of 1 << VIRTIO_PCI_QUEUE_ADDR_SHIFT. + */ + dma_set_coherent_mask(&pci_dev->dev, + DMA_BIT_MASK(32 + VIRTIO_PCI_QUEUE_ADDR_SHIFT)); + } + + if (rc) + dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.\n"); + + rc = pci_request_region(pci_dev, 0, "virtio-pci-legacy"); + if (rc) + return rc; + + rc = -ENOMEM; + vp_dev->ioaddr = pci_iomap(pci_dev, 0, 0); + if (!vp_dev->ioaddr) + goto err_iomap; + + vp_dev->isr = vp_dev->ioaddr + VIRTIO_PCI_ISR; + + /* we use the subsystem vendor/device id as the virtio vendor/device + * id. this allows us to use the same PCI vendor/device id for all + * virtio devices and to identify the particular virtio driver by + * the subsystem ids */ + vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor; + vp_dev->vdev.id.device = pci_dev->subsystem_device; + + vp_dev->vdev.config = &virtio_pci_config_ops; + + vp_dev->config_vector = vp_config_vector; + vp_dev->setup_vq = setup_vq; + vp_dev->del_vq = del_vq; + + return 0; + +err_iomap: + pci_release_region(pci_dev, 0); + return rc; +} + +void virtio_pci_legacy_remove(struct virtio_pci_device *vp_dev) +{ + struct pci_dev *pci_dev = vp_dev->pci_dev; + + pci_iounmap(pci_dev, vp_dev->ioaddr); + pci_release_region(pci_dev, 0); +} diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c new file mode 100644 index 000000000..07571dacc --- /dev/null +++ b/drivers/virtio/virtio_pci_modern.c @@ -0,0 +1,737 @@ +/* + * Virtio PCI driver - modern (virtio 1.0) device support + * + * This module allows virtio devices to be used over a virtual PCI device. + * This can be used with QEMU based VMMs like KVM or Xen. + * + * Copyright IBM Corp. 2007 + * Copyright Red Hat, Inc. 2014 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Rusty Russell <rusty@rustcorp.com.au> + * Michael S. Tsirkin <mst@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include <linux/delay.h> +#define VIRTIO_PCI_NO_LEGACY +#include "virtio_pci_common.h" + +/* + * Type-safe wrappers for io accesses. + * Use these to enforce at compile time the following spec requirement: + * + * The driver MUST access each field using the “natural” access + * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses + * for 16-bit fields and 8-bit accesses for 8-bit fields. + */ +static inline u8 vp_ioread8(u8 __iomem *addr) +{ + return ioread8(addr); +} +static inline u16 vp_ioread16 (__le16 __iomem *addr) +{ + return ioread16(addr); +} + +static inline u32 vp_ioread32(__le32 __iomem *addr) +{ + return ioread32(addr); +} + +static inline void vp_iowrite8(u8 value, u8 __iomem *addr) +{ + iowrite8(value, addr); +} + +static inline void vp_iowrite16(u16 value, __le16 __iomem *addr) +{ + iowrite16(value, addr); +} + +static inline void vp_iowrite32(u32 value, __le32 __iomem *addr) +{ + iowrite32(value, addr); +} + +static void vp_iowrite64_twopart(u64 val, + __le32 __iomem *lo, __le32 __iomem *hi) +{ + vp_iowrite32((u32)val, lo); + vp_iowrite32(val >> 32, hi); +} + +static void __iomem *map_capability(struct pci_dev *dev, int off, + size_t minlen, + u32 align, + u32 start, u32 size, + size_t *len) +{ + u8 bar; + u32 offset, length; + void __iomem *p; + + pci_read_config_byte(dev, off + offsetof(struct virtio_pci_cap, + bar), + &bar); + pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, offset), + &offset); + pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, length), + &length); + + if (length <= start) { + dev_err(&dev->dev, + "virtio_pci: bad capability len %u (>%u expected)\n", + length, start); + return NULL; + } + + if (length - start < minlen) { + dev_err(&dev->dev, + "virtio_pci: bad capability len %u (>=%zu expected)\n", + length, minlen); + return NULL; + } + + length -= start; + + if (start + offset < offset) { + dev_err(&dev->dev, + "virtio_pci: map wrap-around %u+%u\n", + start, offset); + return NULL; + } + + offset += start; + + if (offset & (align - 1)) { + dev_err(&dev->dev, + "virtio_pci: offset %u not aligned to %u\n", + offset, align); + return NULL; + } + + if (length > size) + length = size; + + if (len) + *len = length; + + if (minlen + offset < minlen || + minlen + offset > pci_resource_len(dev, bar)) { + dev_err(&dev->dev, + "virtio_pci: map virtio %zu@%u " + "out of range on bar %i length %lu\n", + minlen, offset, + bar, (unsigned long)pci_resource_len(dev, bar)); + return NULL; + } + + p = pci_iomap_range(dev, bar, offset, length); + if (!p) + dev_err(&dev->dev, + "virtio_pci: unable to map virtio %u@%u on bar %i\n", + length, offset, bar); + return p; +} + +/* virtio config->get_features() implementation */ +static u64 vp_get_features(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + u64 features; + + vp_iowrite32(0, &vp_dev->common->device_feature_select); + features = vp_ioread32(&vp_dev->common->device_feature); + vp_iowrite32(1, &vp_dev->common->device_feature_select); + features |= ((u64)vp_ioread32(&vp_dev->common->device_feature) << 32); + + return features; +} + +static void vp_transport_features(struct virtio_device *vdev, u64 features) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct pci_dev *pci_dev = vp_dev->pci_dev; + + if ((features & BIT_ULL(VIRTIO_F_SR_IOV)) && + pci_find_ext_capability(pci_dev, PCI_EXT_CAP_ID_SRIOV)) + __virtio_set_bit(vdev, VIRTIO_F_SR_IOV); +} + +/* virtio config->finalize_features() implementation */ +static int vp_finalize_features(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + u64 features = vdev->features; + + /* Give virtio_ring a chance to accept features. */ + vring_transport_features(vdev); + + /* Give virtio_pci a chance to accept features. */ + vp_transport_features(vdev, features); + + if (!__virtio_test_bit(vdev, VIRTIO_F_VERSION_1)) { + dev_err(&vdev->dev, "virtio: device uses modern interface " + "but does not have VIRTIO_F_VERSION_1\n"); + return -EINVAL; + } + + vp_iowrite32(0, &vp_dev->common->guest_feature_select); + vp_iowrite32((u32)vdev->features, &vp_dev->common->guest_feature); + vp_iowrite32(1, &vp_dev->common->guest_feature_select); + vp_iowrite32(vdev->features >> 32, &vp_dev->common->guest_feature); + + return 0; +} + +/* virtio config->get() implementation */ +static void vp_get(struct virtio_device *vdev, unsigned offset, + void *buf, unsigned len) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + u8 b; + __le16 w; + __le32 l; + + BUG_ON(offset + len > vp_dev->device_len); + + switch (len) { + case 1: + b = ioread8(vp_dev->device + offset); + memcpy(buf, &b, sizeof b); + break; + case 2: + w = cpu_to_le16(ioread16(vp_dev->device + offset)); + memcpy(buf, &w, sizeof w); + break; + case 4: + l = cpu_to_le32(ioread32(vp_dev->device + offset)); + memcpy(buf, &l, sizeof l); + break; + case 8: + l = cpu_to_le32(ioread32(vp_dev->device + offset)); + memcpy(buf, &l, sizeof l); + l = cpu_to_le32(ioread32(vp_dev->device + offset + sizeof l)); + memcpy(buf + sizeof l, &l, sizeof l); + break; + default: + BUG(); + } +} + +/* the config->set() implementation. it's symmetric to the config->get() + * implementation */ +static void vp_set(struct virtio_device *vdev, unsigned offset, + const void *buf, unsigned len) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + u8 b; + __le16 w; + __le32 l; + + BUG_ON(offset + len > vp_dev->device_len); + + switch (len) { + case 1: + memcpy(&b, buf, sizeof b); + iowrite8(b, vp_dev->device + offset); + break; + case 2: + memcpy(&w, buf, sizeof w); + iowrite16(le16_to_cpu(w), vp_dev->device + offset); + break; + case 4: + memcpy(&l, buf, sizeof l); + iowrite32(le32_to_cpu(l), vp_dev->device + offset); + break; + case 8: + memcpy(&l, buf, sizeof l); + iowrite32(le32_to_cpu(l), vp_dev->device + offset); + memcpy(&l, buf + sizeof l, sizeof l); + iowrite32(le32_to_cpu(l), vp_dev->device + offset + sizeof l); + break; + default: + BUG(); + } +} + +static u32 vp_generation(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + return vp_ioread8(&vp_dev->common->config_generation); +} + +/* config->{get,set}_status() implementations */ +static u8 vp_get_status(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + return vp_ioread8(&vp_dev->common->device_status); +} + +static void vp_set_status(struct virtio_device *vdev, u8 status) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + /* We should never be setting status to 0. */ + BUG_ON(status == 0); + vp_iowrite8(status, &vp_dev->common->device_status); +} + +static void vp_reset(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + /* 0 status means a reset. */ + vp_iowrite8(0, &vp_dev->common->device_status); + /* After writing 0 to device_status, the driver MUST wait for a read of + * device_status to return 0 before reinitializing the device. + * This will flush out the status write, and flush in device writes, + * including MSI-X interrupts, if any. + */ + while (vp_ioread8(&vp_dev->common->device_status)) + msleep(1); + /* Flush pending VQ/configuration callbacks. */ + vp_synchronize_vectors(vdev); +} + +static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector) +{ + /* Setup the vector used for configuration events */ + vp_iowrite16(vector, &vp_dev->common->msix_config); + /* Verify we had enough resources to assign the vector */ + /* Will also flush the write out to device */ + return vp_ioread16(&vp_dev->common->msix_config); +} + +static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, + struct virtio_pci_vq_info *info, + unsigned index, + void (*callback)(struct virtqueue *vq), + const char *name, + bool ctx, + u16 msix_vec) +{ + struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; + struct virtqueue *vq; + u16 num, off; + int err; + + if (index >= vp_ioread16(&cfg->num_queues)) + return ERR_PTR(-ENOENT); + + /* Select the queue we're interested in */ + vp_iowrite16(index, &cfg->queue_select); + + /* Check if queue is either not available or already active. */ + num = vp_ioread16(&cfg->queue_size); + if (!num || vp_ioread16(&cfg->queue_enable)) + return ERR_PTR(-ENOENT); + + if (num & (num - 1)) { + dev_warn(&vp_dev->pci_dev->dev, "bad queue size %u", num); + return ERR_PTR(-EINVAL); + } + + /* get offset of notification word for this vq */ + off = vp_ioread16(&cfg->queue_notify_off); + + info->msix_vector = msix_vec; + + /* create the vring */ + vq = vring_create_virtqueue(index, num, + SMP_CACHE_BYTES, &vp_dev->vdev, + true, true, ctx, + vp_notify, callback, name); + if (!vq) + return ERR_PTR(-ENOMEM); + + /* activate the queue */ + vp_iowrite16(virtqueue_get_vring_size(vq), &cfg->queue_size); + vp_iowrite64_twopart(virtqueue_get_desc_addr(vq), + &cfg->queue_desc_lo, &cfg->queue_desc_hi); + vp_iowrite64_twopart(virtqueue_get_avail_addr(vq), + &cfg->queue_avail_lo, &cfg->queue_avail_hi); + vp_iowrite64_twopart(virtqueue_get_used_addr(vq), + &cfg->queue_used_lo, &cfg->queue_used_hi); + + if (vp_dev->notify_base) { + /* offset should not wrap */ + if ((u64)off * vp_dev->notify_offset_multiplier + 2 + > vp_dev->notify_len) { + dev_warn(&vp_dev->pci_dev->dev, + "bad notification offset %u (x %u) " + "for queue %u > %zd", + off, vp_dev->notify_offset_multiplier, + index, vp_dev->notify_len); + err = -EINVAL; + goto err_map_notify; + } + vq->priv = (void __force *)vp_dev->notify_base + + off * vp_dev->notify_offset_multiplier; + } else { + vq->priv = (void __force *)map_capability(vp_dev->pci_dev, + vp_dev->notify_map_cap, 2, 2, + off * vp_dev->notify_offset_multiplier, 2, + NULL); + } + + if (!vq->priv) { + err = -ENOMEM; + goto err_map_notify; + } + + if (msix_vec != VIRTIO_MSI_NO_VECTOR) { + vp_iowrite16(msix_vec, &cfg->queue_msix_vector); + msix_vec = vp_ioread16(&cfg->queue_msix_vector); + if (msix_vec == VIRTIO_MSI_NO_VECTOR) { + err = -EBUSY; + goto err_assign_vector; + } + } + + return vq; + +err_assign_vector: + if (!vp_dev->notify_base) + pci_iounmap(vp_dev->pci_dev, (void __iomem __force *)vq->priv); +err_map_notify: + vring_del_virtqueue(vq); + return ERR_PTR(err); +} + +static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char * const names[], const bool *ctx, + struct irq_affinity *desc) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtqueue *vq; + int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, desc); + + if (rc) + return rc; + + /* Select and activate all queues. Has to be done last: once we do + * this, there's no way to go back except reset. + */ + list_for_each_entry(vq, &vdev->vqs, list) { + vp_iowrite16(vq->index, &vp_dev->common->queue_select); + vp_iowrite16(1, &vp_dev->common->queue_enable); + } + + return 0; +} + +static void del_vq(struct virtio_pci_vq_info *info) +{ + struct virtqueue *vq = info->vq; + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + + vp_iowrite16(vq->index, &vp_dev->common->queue_select); + + if (vp_dev->msix_enabled) { + vp_iowrite16(VIRTIO_MSI_NO_VECTOR, + &vp_dev->common->queue_msix_vector); + /* Flush the write out to device */ + vp_ioread16(&vp_dev->common->queue_msix_vector); + } + + if (!vp_dev->notify_base) + pci_iounmap(vp_dev->pci_dev, (void __force __iomem *)vq->priv); + + vring_del_virtqueue(vq); +} + +static const struct virtio_config_ops virtio_pci_config_nodev_ops = { + .get = NULL, + .set = NULL, + .generation = vp_generation, + .get_status = vp_get_status, + .set_status = vp_set_status, + .reset = vp_reset, + .find_vqs = vp_modern_find_vqs, + .del_vqs = vp_del_vqs, + .get_features = vp_get_features, + .finalize_features = vp_finalize_features, + .bus_name = vp_bus_name, + .set_vq_affinity = vp_set_vq_affinity, + .get_vq_affinity = vp_get_vq_affinity, +}; + +static const struct virtio_config_ops virtio_pci_config_ops = { + .get = vp_get, + .set = vp_set, + .generation = vp_generation, + .get_status = vp_get_status, + .set_status = vp_set_status, + .reset = vp_reset, + .find_vqs = vp_modern_find_vqs, + .del_vqs = vp_del_vqs, + .get_features = vp_get_features, + .finalize_features = vp_finalize_features, + .bus_name = vp_bus_name, + .set_vq_affinity = vp_set_vq_affinity, + .get_vq_affinity = vp_get_vq_affinity, +}; + +/** + * virtio_pci_find_capability - walk capabilities to find device info. + * @dev: the pci device + * @cfg_type: the VIRTIO_PCI_CAP_* value we seek + * @ioresource_types: IORESOURCE_MEM and/or IORESOURCE_IO. + * + * Returns offset of the capability, or 0. + */ +static inline int virtio_pci_find_capability(struct pci_dev *dev, u8 cfg_type, + u32 ioresource_types, int *bars) +{ + int pos; + + for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR); + pos > 0; + pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) { + u8 type, bar; + pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, + cfg_type), + &type); + pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, + bar), + &bar); + + /* Ignore structures with reserved BAR values */ + if (bar > 0x5) + continue; + + if (type == cfg_type) { + if (pci_resource_len(dev, bar) && + pci_resource_flags(dev, bar) & ioresource_types) { + *bars |= (1 << bar); + return pos; + } + } + } + return 0; +} + +/* This is part of the ABI. Don't screw with it. */ +static inline void check_offsets(void) +{ + /* Note: disk space was harmed in compilation of this function. */ + BUILD_BUG_ON(VIRTIO_PCI_CAP_VNDR != + offsetof(struct virtio_pci_cap, cap_vndr)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_NEXT != + offsetof(struct virtio_pci_cap, cap_next)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_LEN != + offsetof(struct virtio_pci_cap, cap_len)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_CFG_TYPE != + offsetof(struct virtio_pci_cap, cfg_type)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_BAR != + offsetof(struct virtio_pci_cap, bar)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_OFFSET != + offsetof(struct virtio_pci_cap, offset)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_LENGTH != + offsetof(struct virtio_pci_cap, length)); + BUILD_BUG_ON(VIRTIO_PCI_NOTIFY_CAP_MULT != + offsetof(struct virtio_pci_notify_cap, + notify_off_multiplier)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_DFSELECT != + offsetof(struct virtio_pci_common_cfg, + device_feature_select)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_DF != + offsetof(struct virtio_pci_common_cfg, device_feature)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_GFSELECT != + offsetof(struct virtio_pci_common_cfg, + guest_feature_select)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_GF != + offsetof(struct virtio_pci_common_cfg, guest_feature)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_MSIX != + offsetof(struct virtio_pci_common_cfg, msix_config)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_NUMQ != + offsetof(struct virtio_pci_common_cfg, num_queues)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_STATUS != + offsetof(struct virtio_pci_common_cfg, device_status)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_CFGGENERATION != + offsetof(struct virtio_pci_common_cfg, config_generation)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SELECT != + offsetof(struct virtio_pci_common_cfg, queue_select)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SIZE != + offsetof(struct virtio_pci_common_cfg, queue_size)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_MSIX != + offsetof(struct virtio_pci_common_cfg, queue_msix_vector)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_ENABLE != + offsetof(struct virtio_pci_common_cfg, queue_enable)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_NOFF != + offsetof(struct virtio_pci_common_cfg, queue_notify_off)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCLO != + offsetof(struct virtio_pci_common_cfg, queue_desc_lo)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCHI != + offsetof(struct virtio_pci_common_cfg, queue_desc_hi)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILLO != + offsetof(struct virtio_pci_common_cfg, queue_avail_lo)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILHI != + offsetof(struct virtio_pci_common_cfg, queue_avail_hi)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDLO != + offsetof(struct virtio_pci_common_cfg, queue_used_lo)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDHI != + offsetof(struct virtio_pci_common_cfg, queue_used_hi)); +} + +/* the PCI probing function */ +int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) +{ + struct pci_dev *pci_dev = vp_dev->pci_dev; + int err, common, isr, notify, device; + u32 notify_length; + u32 notify_offset; + + check_offsets(); + + /* We only own devices >= 0x1000 and <= 0x107f: leave the rest. */ + if (pci_dev->device < 0x1000 || pci_dev->device > 0x107f) + return -ENODEV; + + if (pci_dev->device < 0x1040) { + /* Transitional devices: use the PCI subsystem device id as + * virtio device id, same as legacy driver always did. + */ + vp_dev->vdev.id.device = pci_dev->subsystem_device; + } else { + /* Modern devices: simply use PCI device id, but start from 0x1040. */ + vp_dev->vdev.id.device = pci_dev->device - 0x1040; + } + vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor; + + /* check for a common config: if not, use legacy mode (bar 0). */ + common = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_COMMON_CFG, + IORESOURCE_IO | IORESOURCE_MEM, + &vp_dev->modern_bars); + if (!common) { + dev_info(&pci_dev->dev, + "virtio_pci: leaving for legacy driver\n"); + return -ENODEV; + } + + /* If common is there, these should be too... */ + isr = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_ISR_CFG, + IORESOURCE_IO | IORESOURCE_MEM, + &vp_dev->modern_bars); + notify = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_NOTIFY_CFG, + IORESOURCE_IO | IORESOURCE_MEM, + &vp_dev->modern_bars); + if (!isr || !notify) { + dev_err(&pci_dev->dev, + "virtio_pci: missing capabilities %i/%i/%i\n", + common, isr, notify); + return -EINVAL; + } + + err = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64)); + if (err) + err = dma_set_mask_and_coherent(&pci_dev->dev, + DMA_BIT_MASK(32)); + if (err) + dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.\n"); + + /* Device capability is only mandatory for devices that have + * device-specific configuration. + */ + device = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_DEVICE_CFG, + IORESOURCE_IO | IORESOURCE_MEM, + &vp_dev->modern_bars); + + err = pci_request_selected_regions(pci_dev, vp_dev->modern_bars, + "virtio-pci-modern"); + if (err) + return err; + + err = -EINVAL; + vp_dev->common = map_capability(pci_dev, common, + sizeof(struct virtio_pci_common_cfg), 4, + 0, sizeof(struct virtio_pci_common_cfg), + NULL); + if (!vp_dev->common) + goto err_map_common; + vp_dev->isr = map_capability(pci_dev, isr, sizeof(u8), 1, + 0, 1, + NULL); + if (!vp_dev->isr) + goto err_map_isr; + + /* Read notify_off_multiplier from config space. */ + pci_read_config_dword(pci_dev, + notify + offsetof(struct virtio_pci_notify_cap, + notify_off_multiplier), + &vp_dev->notify_offset_multiplier); + /* Read notify length and offset from config space. */ + pci_read_config_dword(pci_dev, + notify + offsetof(struct virtio_pci_notify_cap, + cap.length), + ¬ify_length); + + pci_read_config_dword(pci_dev, + notify + offsetof(struct virtio_pci_notify_cap, + cap.offset), + ¬ify_offset); + + /* We don't know how many VQs we'll map, ahead of the time. + * If notify length is small, map it all now. + * Otherwise, map each VQ individually later. + */ + if ((u64)notify_length + (notify_offset % PAGE_SIZE) <= PAGE_SIZE) { + vp_dev->notify_base = map_capability(pci_dev, notify, 2, 2, + 0, notify_length, + &vp_dev->notify_len); + if (!vp_dev->notify_base) + goto err_map_notify; + } else { + vp_dev->notify_map_cap = notify; + } + + /* Again, we don't know how much we should map, but PAGE_SIZE + * is more than enough for all existing devices. + */ + if (device) { + vp_dev->device = map_capability(pci_dev, device, 0, 4, + 0, PAGE_SIZE, + &vp_dev->device_len); + if (!vp_dev->device) + goto err_map_device; + + vp_dev->vdev.config = &virtio_pci_config_ops; + } else { + vp_dev->vdev.config = &virtio_pci_config_nodev_ops; + } + + vp_dev->config_vector = vp_config_vector; + vp_dev->setup_vq = setup_vq; + vp_dev->del_vq = del_vq; + + return 0; + +err_map_device: + if (vp_dev->notify_base) + pci_iounmap(pci_dev, vp_dev->notify_base); +err_map_notify: + pci_iounmap(pci_dev, vp_dev->isr); +err_map_isr: + pci_iounmap(pci_dev, vp_dev->common); +err_map_common: + return err; +} + +void virtio_pci_modern_remove(struct virtio_pci_device *vp_dev) +{ + struct pci_dev *pci_dev = vp_dev->pci_dev; + + if (vp_dev->device) + pci_iounmap(pci_dev, vp_dev->device); + if (vp_dev->notify_base) + pci_iounmap(pci_dev, vp_dev->notify_base); + pci_iounmap(pci_dev, vp_dev->isr); + pci_iounmap(pci_dev, vp_dev->common); + pci_release_selected_regions(pci_dev, vp_dev->modern_bars); +} diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c new file mode 100644 index 000000000..0cc0cfd3a --- /dev/null +++ b/drivers/virtio/virtio_ring.c @@ -0,0 +1,1259 @@ +/* Virtio ring implementation. + * + * Copyright 2007 Rusty Russell IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <linux/virtio.h> +#include <linux/virtio_ring.h> +#include <linux/virtio_config.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/hrtimer.h> +#include <linux/dma-mapping.h> +#include <xen/xen.h> + +#ifdef DEBUG +/* For development, we want to crash whenever the ring is screwed. */ +#define BAD_RING(_vq, fmt, args...) \ + do { \ + dev_err(&(_vq)->vq.vdev->dev, \ + "%s:"fmt, (_vq)->vq.name, ##args); \ + BUG(); \ + } while (0) +/* Caller is supposed to guarantee no reentry. */ +#define START_USE(_vq) \ + do { \ + if ((_vq)->in_use) \ + panic("%s:in_use = %i\n", \ + (_vq)->vq.name, (_vq)->in_use); \ + (_vq)->in_use = __LINE__; \ + } while (0) +#define END_USE(_vq) \ + do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; } while(0) +#else +#define BAD_RING(_vq, fmt, args...) \ + do { \ + dev_err(&_vq->vq.vdev->dev, \ + "%s:"fmt, (_vq)->vq.name, ##args); \ + (_vq)->broken = true; \ + } while (0) +#define START_USE(vq) +#define END_USE(vq) +#endif + +struct vring_desc_state { + void *data; /* Data for callback. */ + struct vring_desc *indir_desc; /* Indirect descriptor, if any. */ +}; + +struct vring_virtqueue { + struct virtqueue vq; + + /* Actual memory layout for this queue */ + struct vring vring; + + /* Can we use weak barriers? */ + bool weak_barriers; + + /* Other side has made a mess, don't try any more. */ + bool broken; + + /* Host supports indirect buffers */ + bool indirect; + + /* Host publishes avail event idx */ + bool event; + + /* Head of free buffer list. */ + unsigned int free_head; + /* Number we've added since last sync. */ + unsigned int num_added; + + /* Last used index we've seen. */ + u16 last_used_idx; + + /* Last written value to avail->flags */ + u16 avail_flags_shadow; + + /* Last written value to avail->idx in guest byte order */ + u16 avail_idx_shadow; + + /* How to notify other side. FIXME: commonalize hcalls! */ + bool (*notify)(struct virtqueue *vq); + + /* DMA, allocation, and size information */ + bool we_own_ring; + size_t queue_size_in_bytes; + dma_addr_t queue_dma_addr; + +#ifdef DEBUG + /* They're supposed to lock for us. */ + unsigned int in_use; + + /* Figure out if their kicks are too delayed. */ + bool last_add_time_valid; + ktime_t last_add_time; +#endif + + /* Per-descriptor state. */ + struct vring_desc_state desc_state[]; +}; + +#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) + +/* + * Modern virtio devices have feature bits to specify whether they need a + * quirk and bypass the IOMMU. If not there, just use the DMA API. + * + * If there, the interaction between virtio and DMA API is messy. + * + * On most systems with virtio, physical addresses match bus addresses, + * and it doesn't particularly matter whether we use the DMA API. + * + * On some systems, including Xen and any system with a physical device + * that speaks virtio behind a physical IOMMU, we must use the DMA API + * for virtio DMA to work at all. + * + * On other systems, including SPARC and PPC64, virtio-pci devices are + * enumerated as though they are behind an IOMMU, but the virtio host + * ignores the IOMMU, so we must either pretend that the IOMMU isn't + * there or somehow map everything as the identity. + * + * For the time being, we preserve historic behavior and bypass the DMA + * API. + * + * TODO: install a per-device DMA ops structure that does the right thing + * taking into account all the above quirks, and use the DMA API + * unconditionally on data path. + */ + +static bool vring_use_dma_api(struct virtio_device *vdev) +{ + if (!virtio_has_iommu_quirk(vdev)) + return true; + + /* Otherwise, we are left to guess. */ + /* + * In theory, it's possible to have a buggy QEMU-supposed + * emulated Q35 IOMMU and Xen enabled at the same time. On + * such a configuration, virtio has never worked and will + * not work without an even larger kludge. Instead, enable + * the DMA API if we're a Xen guest, which at least allows + * all of the sensible Xen configurations to work correctly. + */ + if (xen_domain()) + return true; + + return false; +} + +/* + * The DMA ops on various arches are rather gnarly right now, and + * making all of the arch DMA ops work on the vring device itself + * is a mess. For now, we use the parent device for DMA ops. + */ +static inline struct device *vring_dma_dev(const struct vring_virtqueue *vq) +{ + return vq->vq.vdev->dev.parent; +} + +/* Map one sg entry. */ +static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq, + struct scatterlist *sg, + enum dma_data_direction direction) +{ + if (!vring_use_dma_api(vq->vq.vdev)) + return (dma_addr_t)sg_phys(sg); + + /* + * We can't use dma_map_sg, because we don't use scatterlists in + * the way it expects (we don't guarantee that the scatterlist + * will exist for the lifetime of the mapping). + */ + return dma_map_page(vring_dma_dev(vq), + sg_page(sg), sg->offset, sg->length, + direction); +} + +static dma_addr_t vring_map_single(const struct vring_virtqueue *vq, + void *cpu_addr, size_t size, + enum dma_data_direction direction) +{ + if (!vring_use_dma_api(vq->vq.vdev)) + return (dma_addr_t)virt_to_phys(cpu_addr); + + return dma_map_single(vring_dma_dev(vq), + cpu_addr, size, direction); +} + +static void vring_unmap_one(const struct vring_virtqueue *vq, + struct vring_desc *desc) +{ + u16 flags; + + if (!vring_use_dma_api(vq->vq.vdev)) + return; + + flags = virtio16_to_cpu(vq->vq.vdev, desc->flags); + + if (flags & VRING_DESC_F_INDIRECT) { + dma_unmap_single(vring_dma_dev(vq), + virtio64_to_cpu(vq->vq.vdev, desc->addr), + virtio32_to_cpu(vq->vq.vdev, desc->len), + (flags & VRING_DESC_F_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE); + } else { + dma_unmap_page(vring_dma_dev(vq), + virtio64_to_cpu(vq->vq.vdev, desc->addr), + virtio32_to_cpu(vq->vq.vdev, desc->len), + (flags & VRING_DESC_F_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE); + } +} + +static int vring_mapping_error(const struct vring_virtqueue *vq, + dma_addr_t addr) +{ + if (!vring_use_dma_api(vq->vq.vdev)) + return 0; + + return dma_mapping_error(vring_dma_dev(vq), addr); +} + +static struct vring_desc *alloc_indirect(struct virtqueue *_vq, + unsigned int total_sg, gfp_t gfp) +{ + struct vring_desc *desc; + unsigned int i; + + /* + * We require lowmem mappings for the descriptors because + * otherwise virt_to_phys will give us bogus addresses in the + * virtqueue. + */ + gfp &= ~__GFP_HIGHMEM; + + desc = kmalloc_array(total_sg, sizeof(struct vring_desc), gfp); + if (!desc) + return NULL; + + for (i = 0; i < total_sg; i++) + desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1); + return desc; +} + +static inline int virtqueue_add(struct virtqueue *_vq, + struct scatterlist *sgs[], + unsigned int total_sg, + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + void *ctx, + gfp_t gfp) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + struct scatterlist *sg; + struct vring_desc *desc; + unsigned int i, n, avail, descs_used, uninitialized_var(prev), err_idx; + int head; + bool indirect; + + START_USE(vq); + + BUG_ON(data == NULL); + BUG_ON(ctx && vq->indirect); + + if (unlikely(vq->broken)) { + END_USE(vq); + return -EIO; + } + +#ifdef DEBUG + { + ktime_t now = ktime_get(); + + /* No kick or get, with .1 second between? Warn. */ + if (vq->last_add_time_valid) + WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time)) + > 100); + vq->last_add_time = now; + vq->last_add_time_valid = true; + } +#endif + + BUG_ON(total_sg == 0); + + head = vq->free_head; + + /* If the host supports indirect descriptor tables, and we have multiple + * buffers, then go indirect. FIXME: tune this threshold */ + if (vq->indirect && total_sg > 1 && vq->vq.num_free) + desc = alloc_indirect(_vq, total_sg, gfp); + else { + desc = NULL; + WARN_ON_ONCE(total_sg > vq->vring.num && !vq->indirect); + } + + if (desc) { + /* Use a single buffer which doesn't continue */ + indirect = true; + /* Set up rest to use this indirect table. */ + i = 0; + descs_used = 1; + } else { + indirect = false; + desc = vq->vring.desc; + i = head; + descs_used = total_sg; + } + + if (vq->vq.num_free < descs_used) { + pr_debug("Can't add buf len %i - avail = %i\n", + descs_used, vq->vq.num_free); + /* FIXME: for historical reasons, we force a notify here if + * there are outgoing parts to the buffer. Presumably the + * host should service the ring ASAP. */ + if (out_sgs) + vq->notify(&vq->vq); + if (indirect) + kfree(desc); + END_USE(vq); + return -ENOSPC; + } + + for (n = 0; n < out_sgs; n++) { + for (sg = sgs[n]; sg; sg = sg_next(sg)) { + dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE); + if (vring_mapping_error(vq, addr)) + goto unmap_release; + + desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT); + desc[i].addr = cpu_to_virtio64(_vq->vdev, addr); + desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length); + prev = i; + i = virtio16_to_cpu(_vq->vdev, desc[i].next); + } + } + for (; n < (out_sgs + in_sgs); n++) { + for (sg = sgs[n]; sg; sg = sg_next(sg)) { + dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE); + if (vring_mapping_error(vq, addr)) + goto unmap_release; + + desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE); + desc[i].addr = cpu_to_virtio64(_vq->vdev, addr); + desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length); + prev = i; + i = virtio16_to_cpu(_vq->vdev, desc[i].next); + } + } + /* Last one doesn't continue. */ + desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT); + + if (indirect) { + /* Now that the indirect table is filled in, map it. */ + dma_addr_t addr = vring_map_single( + vq, desc, total_sg * sizeof(struct vring_desc), + DMA_TO_DEVICE); + if (vring_mapping_error(vq, addr)) + goto unmap_release; + + vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT); + vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr); + + vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc)); + } + + /* We're using some buffers from the free list. */ + vq->vq.num_free -= descs_used; + + /* Update free pointer */ + if (indirect) + vq->free_head = virtio16_to_cpu(_vq->vdev, vq->vring.desc[head].next); + else + vq->free_head = i; + + /* Store token and indirect buffer state. */ + vq->desc_state[head].data = data; + if (indirect) + vq->desc_state[head].indir_desc = desc; + else + vq->desc_state[head].indir_desc = ctx; + + /* Put entry in available array (but don't update avail->idx until they + * do sync). */ + avail = vq->avail_idx_shadow & (vq->vring.num - 1); + vq->vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); + + /* Descriptors and available array need to be set before we expose the + * new available array entries. */ + virtio_wmb(vq->weak_barriers); + vq->avail_idx_shadow++; + vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow); + vq->num_added++; + + pr_debug("Added buffer head %i to %p\n", head, vq); + END_USE(vq); + + /* This is very unlikely, but theoretically possible. Kick + * just in case. */ + if (unlikely(vq->num_added == (1 << 16) - 1)) + virtqueue_kick(_vq); + + return 0; + +unmap_release: + err_idx = i; + i = head; + + for (n = 0; n < total_sg; n++) { + if (i == err_idx) + break; + vring_unmap_one(vq, &desc[i]); + i = virtio16_to_cpu(_vq->vdev, vq->vring.desc[i].next); + } + + if (indirect) + kfree(desc); + + END_USE(vq); + return -ENOMEM; +} + +/** + * virtqueue_add_sgs - expose buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sgs: array of terminated scatterlists. + * @out_num: the number of scatterlists readable by other side + * @in_num: the number of scatterlists which are writable (after readable ones) + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). + */ +int virtqueue_add_sgs(struct virtqueue *_vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp) +{ + unsigned int i, total_sg = 0; + + /* Count them first. */ + for (i = 0; i < out_sgs + in_sgs; i++) { + struct scatterlist *sg; + for (sg = sgs[i]; sg; sg = sg_next(sg)) + total_sg++; + } + return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, + data, NULL, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_sgs); + +/** + * virtqueue_add_outbuf - expose output buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sg: scatterlist (must be well-formed and terminated!) + * @num: the number of entries in @sg readable by other side + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). + */ +int virtqueue_add_outbuf(struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_outbuf); + +/** + * virtqueue_add_inbuf - expose input buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sg: scatterlist (must be well-formed and terminated!) + * @num: the number of entries in @sg writable by other side + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). + */ +int virtqueue_add_inbuf(struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_inbuf); + +/** + * virtqueue_add_inbuf_ctx - expose input buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sg: scatterlist (must be well-formed and terminated!) + * @num: the number of entries in @sg writable by other side + * @data: the token identifying the buffer. + * @ctx: extra context for the token + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). + */ +int virtqueue_add_inbuf_ctx(struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, + void *ctx, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx); + +/** + * virtqueue_kick_prepare - first half of split virtqueue_kick call. + * @vq: the struct virtqueue + * + * Instead of virtqueue_kick(), you can do: + * if (virtqueue_kick_prepare(vq)) + * virtqueue_notify(vq); + * + * This is sometimes useful because the virtqueue_kick_prepare() needs + * to be serialized, but the actual virtqueue_notify() call does not. + */ +bool virtqueue_kick_prepare(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + u16 new, old; + bool needs_kick; + + START_USE(vq); + /* We need to expose available array entries before checking avail + * event. */ + virtio_mb(vq->weak_barriers); + + old = vq->avail_idx_shadow - vq->num_added; + new = vq->avail_idx_shadow; + vq->num_added = 0; + +#ifdef DEBUG + if (vq->last_add_time_valid) { + WARN_ON(ktime_to_ms(ktime_sub(ktime_get(), + vq->last_add_time)) > 100); + } + vq->last_add_time_valid = false; +#endif + + if (vq->event) { + needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev, vring_avail_event(&vq->vring)), + new, old); + } else { + needs_kick = !(vq->vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY)); + } + END_USE(vq); + return needs_kick; +} +EXPORT_SYMBOL_GPL(virtqueue_kick_prepare); + +/** + * virtqueue_notify - second half of split virtqueue_kick call. + * @vq: the struct virtqueue + * + * This does not need to be serialized. + * + * Returns false if host notify failed or queue is broken, otherwise true. + */ +bool virtqueue_notify(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (unlikely(vq->broken)) + return false; + + /* Prod other side to tell it about changes. */ + if (!vq->notify(_vq)) { + vq->broken = true; + return false; + } + return true; +} +EXPORT_SYMBOL_GPL(virtqueue_notify); + +/** + * virtqueue_kick - update after add_buf + * @vq: the struct virtqueue + * + * After one or more virtqueue_add_* calls, invoke this to kick + * the other side. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). + * + * Returns false if kick failed, otherwise true. + */ +bool virtqueue_kick(struct virtqueue *vq) +{ + if (virtqueue_kick_prepare(vq)) + return virtqueue_notify(vq); + return true; +} +EXPORT_SYMBOL_GPL(virtqueue_kick); + +static void detach_buf(struct vring_virtqueue *vq, unsigned int head, + void **ctx) +{ + unsigned int i, j; + __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT); + + /* Clear data ptr. */ + vq->desc_state[head].data = NULL; + + /* Put back on free list: unmap first-level descriptors and find end */ + i = head; + + while (vq->vring.desc[i].flags & nextflag) { + vring_unmap_one(vq, &vq->vring.desc[i]); + i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next); + vq->vq.num_free++; + } + + vring_unmap_one(vq, &vq->vring.desc[i]); + vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head); + vq->free_head = head; + + /* Plus final descriptor */ + vq->vq.num_free++; + + if (vq->indirect) { + struct vring_desc *indir_desc = vq->desc_state[head].indir_desc; + u32 len; + + /* Free the indirect table, if any, now that it's unmapped. */ + if (!indir_desc) + return; + + len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len); + + BUG_ON(!(vq->vring.desc[head].flags & + cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT))); + BUG_ON(len == 0 || len % sizeof(struct vring_desc)); + + for (j = 0; j < len / sizeof(struct vring_desc); j++) + vring_unmap_one(vq, &indir_desc[j]); + + kfree(indir_desc); + vq->desc_state[head].indir_desc = NULL; + } else if (ctx) { + *ctx = vq->desc_state[head].indir_desc; + } +} + +static inline bool more_used(const struct vring_virtqueue *vq) +{ + return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx); +} + +/** + * virtqueue_get_buf - get the next used buffer + * @vq: the struct virtqueue we're talking about. + * @len: the length written into the buffer + * + * If the device wrote data into the buffer, @len will be set to the + * amount written. This means you don't need to clear the buffer + * beforehand to ensure there's no data leakage in the case of short + * writes. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). + * + * Returns NULL if there are no used buffers, or the "data" token + * handed to virtqueue_add_*(). + */ +void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len, + void **ctx) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + void *ret; + unsigned int i; + u16 last_used; + + START_USE(vq); + + if (unlikely(vq->broken)) { + END_USE(vq); + return NULL; + } + + if (!more_used(vq)) { + pr_debug("No more buffers in queue\n"); + END_USE(vq); + return NULL; + } + + /* Only get used array entries after they have been exposed by host. */ + virtio_rmb(vq->weak_barriers); + + last_used = (vq->last_used_idx & (vq->vring.num - 1)); + i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id); + *len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len); + + if (unlikely(i >= vq->vring.num)) { + BAD_RING(vq, "id %u out of range\n", i); + return NULL; + } + if (unlikely(!vq->desc_state[i].data)) { + BAD_RING(vq, "id %u is not a head!\n", i); + return NULL; + } + + /* detach_buf clears data, so grab it now. */ + ret = vq->desc_state[i].data; + detach_buf(vq, i, ctx); + vq->last_used_idx++; + /* If we expect an interrupt for the next entry, tell host + * by writing event index and flush out the write before + * the read in the next get_buf call. */ + if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) + virtio_store_mb(vq->weak_barriers, + &vring_used_event(&vq->vring), + cpu_to_virtio16(_vq->vdev, vq->last_used_idx)); + +#ifdef DEBUG + vq->last_add_time_valid = false; +#endif + + END_USE(vq); + return ret; +} +EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx); + +void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) +{ + return virtqueue_get_buf_ctx(_vq, len, NULL); +} +EXPORT_SYMBOL_GPL(virtqueue_get_buf); +/** + * virtqueue_disable_cb - disable callbacks + * @vq: the struct virtqueue we're talking about. + * + * Note that this is not necessarily synchronous, hence unreliable and only + * useful as an optimization. + * + * Unlike other operations, this need not be serialized. + */ +void virtqueue_disable_cb(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) { + vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT; + if (!vq->event) + vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow); + } + +} +EXPORT_SYMBOL_GPL(virtqueue_disable_cb); + +/** + * virtqueue_enable_cb_prepare - restart callbacks after disable_cb + * @vq: the struct virtqueue we're talking about. + * + * This re-enables callbacks; it returns current queue state + * in an opaque unsigned value. This value should be later tested by + * virtqueue_poll, to detect a possible race between the driver checking for + * more work, and enabling callbacks. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). + */ +unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + u16 last_used_idx; + + START_USE(vq); + + /* We optimistically turn back on interrupts, then check if there was + * more to do. */ + /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to + * either clear the flags bit or point the event index at the next + * entry. Always do both to keep code simple. */ + if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) { + vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT; + if (!vq->event) + vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow); + } + vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx); + END_USE(vq); + return last_used_idx; +} +EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare); + +/** + * virtqueue_poll - query pending used buffers + * @vq: the struct virtqueue we're talking about. + * @last_used_idx: virtqueue state (from call to virtqueue_enable_cb_prepare). + * + * Returns "true" if there are pending used buffers in the queue. + * + * This does not need to be serialized. + */ +bool virtqueue_poll(struct virtqueue *_vq, unsigned last_used_idx) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (unlikely(vq->broken)) + return false; + + virtio_mb(vq->weak_barriers); + return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx); +} +EXPORT_SYMBOL_GPL(virtqueue_poll); + +/** + * virtqueue_enable_cb - restart callbacks after disable_cb. + * @vq: the struct virtqueue we're talking about. + * + * This re-enables callbacks; it returns "false" if there are pending + * buffers in the queue, to detect a possible race between the driver + * checking for more work, and enabling callbacks. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). + */ +bool virtqueue_enable_cb(struct virtqueue *_vq) +{ + unsigned last_used_idx = virtqueue_enable_cb_prepare(_vq); + return !virtqueue_poll(_vq, last_used_idx); +} +EXPORT_SYMBOL_GPL(virtqueue_enable_cb); + +/** + * virtqueue_enable_cb_delayed - restart callbacks after disable_cb. + * @vq: the struct virtqueue we're talking about. + * + * This re-enables callbacks but hints to the other side to delay + * interrupts until most of the available buffers have been processed; + * it returns "false" if there are many pending buffers in the queue, + * to detect a possible race between the driver checking for more work, + * and enabling callbacks. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). + */ +bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + u16 bufs; + + START_USE(vq); + + /* We optimistically turn back on interrupts, then check if there was + * more to do. */ + /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to + * either clear the flags bit or point the event index at the next + * entry. Always update the event index to keep code simple. */ + if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) { + vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT; + if (!vq->event) + vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow); + } + /* TODO: tune this threshold */ + bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4; + + virtio_store_mb(vq->weak_barriers, + &vring_used_event(&vq->vring), + cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs)); + + if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) { + END_USE(vq); + return false; + } + + END_USE(vq); + return true; +} +EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed); + +/** + * virtqueue_detach_unused_buf - detach first unused buffer + * @vq: the struct virtqueue we're talking about. + * + * Returns NULL or the "data" token handed to virtqueue_add_*(). + * This is not valid on an active queue; it is useful only for device + * shutdown. + */ +void *virtqueue_detach_unused_buf(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + unsigned int i; + void *buf; + + START_USE(vq); + + for (i = 0; i < vq->vring.num; i++) { + if (!vq->desc_state[i].data) + continue; + /* detach_buf clears data, so grab it now. */ + buf = vq->desc_state[i].data; + detach_buf(vq, i, NULL); + vq->avail_idx_shadow--; + vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow); + END_USE(vq); + return buf; + } + /* That should have freed everything. */ + BUG_ON(vq->vq.num_free != vq->vring.num); + + END_USE(vq); + return NULL; +} +EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf); + +irqreturn_t vring_interrupt(int irq, void *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (!more_used(vq)) { + pr_debug("virtqueue interrupt with no work for %p\n", vq); + return IRQ_NONE; + } + + if (unlikely(vq->broken)) + return IRQ_HANDLED; + + pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback); + if (vq->vq.callback) + vq->vq.callback(&vq->vq); + + return IRQ_HANDLED; +} +EXPORT_SYMBOL_GPL(vring_interrupt); + +struct virtqueue *__vring_new_virtqueue(unsigned int index, + struct vring vring, + struct virtio_device *vdev, + bool weak_barriers, + bool context, + bool (*notify)(struct virtqueue *), + void (*callback)(struct virtqueue *), + const char *name) +{ + unsigned int i; + struct vring_virtqueue *vq; + + vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state), + GFP_KERNEL); + if (!vq) + return NULL; + + vq->vring = vring; + vq->vq.callback = callback; + vq->vq.vdev = vdev; + vq->vq.name = name; + vq->vq.num_free = vring.num; + vq->vq.index = index; + vq->we_own_ring = false; + vq->queue_dma_addr = 0; + vq->queue_size_in_bytes = 0; + vq->notify = notify; + vq->weak_barriers = weak_barriers; + vq->broken = false; + vq->last_used_idx = 0; + vq->avail_flags_shadow = 0; + vq->avail_idx_shadow = 0; + vq->num_added = 0; + list_add_tail(&vq->vq.list, &vdev->vqs); +#ifdef DEBUG + vq->in_use = false; + vq->last_add_time_valid = false; +#endif + + vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && + !context; + vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX); + + /* No callback? Tell other side not to bother us. */ + if (!callback) { + vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT; + if (!vq->event) + vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow); + } + + /* Put everything in free lists. */ + vq->free_head = 0; + for (i = 0; i < vring.num-1; i++) + vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1); + memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state)); + + return &vq->vq; +} +EXPORT_SYMBOL_GPL(__vring_new_virtqueue); + +static void *vring_alloc_queue(struct virtio_device *vdev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + if (vring_use_dma_api(vdev)) { + return dma_alloc_coherent(vdev->dev.parent, size, + dma_handle, flag); + } else { + void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag); + if (queue) { + phys_addr_t phys_addr = virt_to_phys(queue); + *dma_handle = (dma_addr_t)phys_addr; + + /* + * Sanity check: make sure we dind't truncate + * the address. The only arches I can find that + * have 64-bit phys_addr_t but 32-bit dma_addr_t + * are certain non-highmem MIPS and x86 + * configurations, but these configurations + * should never allocate physical pages above 32 + * bits, so this is fine. Just in case, throw a + * warning and abort if we end up with an + * unrepresentable address. + */ + if (WARN_ON_ONCE(*dma_handle != phys_addr)) { + free_pages_exact(queue, PAGE_ALIGN(size)); + return NULL; + } + } + return queue; + } +} + +static void vring_free_queue(struct virtio_device *vdev, size_t size, + void *queue, dma_addr_t dma_handle) +{ + if (vring_use_dma_api(vdev)) { + dma_free_coherent(vdev->dev.parent, size, queue, dma_handle); + } else { + free_pages_exact(queue, PAGE_ALIGN(size)); + } +} + +struct virtqueue *vring_create_virtqueue( + unsigned int index, + unsigned int num, + unsigned int vring_align, + struct virtio_device *vdev, + bool weak_barriers, + bool may_reduce_num, + bool context, + bool (*notify)(struct virtqueue *), + void (*callback)(struct virtqueue *), + const char *name) +{ + struct virtqueue *vq; + void *queue = NULL; + dma_addr_t dma_addr; + size_t queue_size_in_bytes; + struct vring vring; + + /* We assume num is a power of 2. */ + if (num & (num - 1)) { + dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num); + return NULL; + } + + /* TODO: allocate each queue chunk individually */ + for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) { + queue = vring_alloc_queue(vdev, vring_size(num, vring_align), + &dma_addr, + GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO); + if (queue) + break; + if (!may_reduce_num) + return NULL; + } + + if (!num) + return NULL; + + if (!queue) { + /* Try to get a single page. You are my only hope! */ + queue = vring_alloc_queue(vdev, vring_size(num, vring_align), + &dma_addr, GFP_KERNEL|__GFP_ZERO); + } + if (!queue) + return NULL; + + queue_size_in_bytes = vring_size(num, vring_align); + vring_init(&vring, num, queue, vring_align); + + vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context, + notify, callback, name); + if (!vq) { + vring_free_queue(vdev, queue_size_in_bytes, queue, + dma_addr); + return NULL; + } + + to_vvq(vq)->queue_dma_addr = dma_addr; + to_vvq(vq)->queue_size_in_bytes = queue_size_in_bytes; + to_vvq(vq)->we_own_ring = true; + + return vq; +} +EXPORT_SYMBOL_GPL(vring_create_virtqueue); + +struct virtqueue *vring_new_virtqueue(unsigned int index, + unsigned int num, + unsigned int vring_align, + struct virtio_device *vdev, + bool weak_barriers, + bool context, + void *pages, + bool (*notify)(struct virtqueue *vq), + void (*callback)(struct virtqueue *vq), + const char *name) +{ + struct vring vring; + vring_init(&vring, num, pages, vring_align); + return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context, + notify, callback, name); +} +EXPORT_SYMBOL_GPL(vring_new_virtqueue); + +void vring_del_virtqueue(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (vq->we_own_ring) { + vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes, + vq->vring.desc, vq->queue_dma_addr); + } + list_del(&_vq->list); + kfree(vq); +} +EXPORT_SYMBOL_GPL(vring_del_virtqueue); + +/* Manipulates transport-specific feature bits. */ +void vring_transport_features(struct virtio_device *vdev) +{ + unsigned int i; + + for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) { + switch (i) { + case VIRTIO_RING_F_INDIRECT_DESC: + break; + case VIRTIO_RING_F_EVENT_IDX: + break; + case VIRTIO_F_VERSION_1: + break; + case VIRTIO_F_IOMMU_PLATFORM: + break; + default: + /* We don't understand this bit. */ + __virtio_clear_bit(vdev, i); + } + } +} +EXPORT_SYMBOL_GPL(vring_transport_features); + +/** + * virtqueue_get_vring_size - return the size of the virtqueue's vring + * @vq: the struct virtqueue containing the vring of interest. + * + * Returns the size of the vring. This is mainly used for boasting to + * userspace. Unlike other operations, this need not be serialized. + */ +unsigned int virtqueue_get_vring_size(struct virtqueue *_vq) +{ + + struct vring_virtqueue *vq = to_vvq(_vq); + + return vq->vring.num; +} +EXPORT_SYMBOL_GPL(virtqueue_get_vring_size); + +bool virtqueue_is_broken(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + return READ_ONCE(vq->broken); +} +EXPORT_SYMBOL_GPL(virtqueue_is_broken); + +/* + * This should prevent the device from being used, allowing drivers to + * recover. You may need to grab appropriate locks to flush. + */ +void virtio_break_device(struct virtio_device *dev) +{ + struct virtqueue *_vq; + + list_for_each_entry(_vq, &dev->vqs, list) { + struct vring_virtqueue *vq = to_vvq(_vq); + + /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ + WRITE_ONCE(vq->broken, true); + } +} +EXPORT_SYMBOL_GPL(virtio_break_device); + +dma_addr_t virtqueue_get_desc_addr(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + BUG_ON(!vq->we_own_ring); + + return vq->queue_dma_addr; +} +EXPORT_SYMBOL_GPL(virtqueue_get_desc_addr); + +dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + BUG_ON(!vq->we_own_ring); + + return vq->queue_dma_addr + + ((char *)vq->vring.avail - (char *)vq->vring.desc); +} +EXPORT_SYMBOL_GPL(virtqueue_get_avail_addr); + +dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + BUG_ON(!vq->we_own_ring); + + return vq->queue_dma_addr + + ((char *)vq->vring.used - (char *)vq->vring.desc); +} +EXPORT_SYMBOL_GPL(virtqueue_get_used_addr); + +const struct vring *virtqueue_get_vring(struct virtqueue *vq) +{ + return &to_vvq(vq)->vring; +} +EXPORT_SYMBOL_GPL(virtqueue_get_vring); + +MODULE_LICENSE("GPL"); |