diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:49:45 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:49:45 +0000 |
commit | 2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch) | |
tree | 848558de17fb3008cdf4d861b01ac7781903ce39 /drivers/virt | |
parent | Initial commit. (diff) | |
download | linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.tar.xz linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.zip |
Adding upstream version 6.1.76.upstream/6.1.76
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/virt')
36 files changed, 12025 insertions, 0 deletions
diff --git a/drivers/virt/Kconfig b/drivers/virt/Kconfig new file mode 100644 index 000000000..87ef258ce --- /dev/null +++ b/drivers/virt/Kconfig @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Virtualization support drivers +# + +menuconfig VIRT_DRIVERS + bool "Virtualization drivers" + help + Say Y here to get to see options for device drivers that support + virtualization environments. + + If you say N, all options in this submenu will be skipped and disabled. + +if VIRT_DRIVERS + +config VMGENID + tristate "Virtual Machine Generation ID driver" + default y + depends on ACPI + help + Say Y here to use the hypervisor-provided Virtual Machine Generation ID + to reseed the RNG when the VM is cloned. This is highly recommended if + you intend to do any rollback / cloning / snapshotting of VMs. + + Prefer Y to M so that this protection is activated very early. + +config FSL_HV_MANAGER + tristate "Freescale hypervisor management driver" + depends on FSL_SOC + select EPAPR_PARAVIRT + help + The Freescale hypervisor management driver provides several services + to drivers and applications related to the Freescale hypervisor: + + 1) An ioctl interface for querying and managing partitions. + + 2) A file interface to reading incoming doorbells. + + 3) An interrupt handler for shutting down the partition upon + receiving the shutdown doorbell from a manager partition. + + 4) A kernel interface for receiving callbacks when a managed + partition shuts down. + +source "drivers/virt/vboxguest/Kconfig" + +source "drivers/virt/nitro_enclaves/Kconfig" + +source "drivers/virt/acrn/Kconfig" + +source "drivers/virt/coco/efi_secret/Kconfig" + +source "drivers/virt/coco/sev-guest/Kconfig" + +endif diff --git a/drivers/virt/Makefile b/drivers/virt/Makefile new file mode 100644 index 000000000..093674e05 --- /dev/null +++ b/drivers/virt/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for drivers that support virtualization +# + +obj-$(CONFIG_FSL_HV_MANAGER) += fsl_hypervisor.o +obj-$(CONFIG_VMGENID) += vmgenid.o +obj-y += vboxguest/ + +obj-$(CONFIG_NITRO_ENCLAVES) += nitro_enclaves/ +obj-$(CONFIG_ACRN_HSM) += acrn/ +obj-$(CONFIG_EFI_SECRET) += coco/efi_secret/ +obj-$(CONFIG_SEV_GUEST) += coco/sev-guest/ diff --git a/drivers/virt/acrn/Kconfig b/drivers/virt/acrn/Kconfig new file mode 100644 index 000000000..3e1a61c9d --- /dev/null +++ b/drivers/virt/acrn/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0 +config ACRN_HSM + tristate "ACRN Hypervisor Service Module" + depends on ACRN_GUEST + select EVENTFD + help + ACRN Hypervisor Service Module (HSM) is a kernel module which + communicates with ACRN userspace through ioctls and talks to + the ACRN Hypervisor through hypercalls. HSM will only run in + a privileged management VM, called Service VM, to manage User + VMs and do I/O emulation. Not required for simply running + under ACRN as a User VM. + + To compile as a module, choose M, the module will be called + acrn. If unsure, say N. diff --git a/drivers/virt/acrn/Makefile b/drivers/virt/acrn/Makefile new file mode 100644 index 000000000..08ce641dc --- /dev/null +++ b/drivers/virt/acrn/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_ACRN_HSM) := acrn.o +acrn-y := hsm.o vm.o mm.o ioreq.o ioeventfd.o irqfd.o diff --git a/drivers/virt/acrn/acrn_drv.h b/drivers/virt/acrn/acrn_drv.h new file mode 100644 index 000000000..5663c17ad --- /dev/null +++ b/drivers/virt/acrn/acrn_drv.h @@ -0,0 +1,229 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ACRN_HSM_DRV_H +#define __ACRN_HSM_DRV_H + +#include <linux/acrn.h> +#include <linux/dev_printk.h> +#include <linux/miscdevice.h> +#include <linux/types.h> + +#include "hypercall.h" + +extern struct miscdevice acrn_dev; + +#define ACRN_NAME_LEN 16 +#define ACRN_MEM_MAPPING_MAX 256 + +#define ACRN_MEM_REGION_ADD 0 +#define ACRN_MEM_REGION_DEL 2 + +struct acrn_vm; +struct acrn_ioreq_client; + +/** + * struct vm_memory_region_op - Hypervisor memory operation + * @type: Operation type (ACRN_MEM_REGION_*) + * @attr: Memory attribute (ACRN_MEM_TYPE_* | ACRN_MEM_ACCESS_*) + * @user_vm_pa: Physical address of User VM to be mapped. + * @service_vm_pa: Physical address of Service VM to be mapped. + * @size: Size of this region. + * + * Structure containing needed information that is provided to ACRN Hypervisor + * to manage the EPT mappings of a single memory region of the User VM. Several + * &struct vm_memory_region_op can be batched to ACRN Hypervisor, see &struct + * vm_memory_region_batch. + */ +struct vm_memory_region_op { + u32 type; + u32 attr; + u64 user_vm_pa; + u64 service_vm_pa; + u64 size; +}; + +/** + * struct vm_memory_region_batch - A batch of vm_memory_region_op. + * @vmid: A User VM ID. + * @reserved: Reserved. + * @regions_num: The number of vm_memory_region_op. + * @regions_gpa: Physical address of a vm_memory_region_op array. + * @regions_op: Flexible array of vm_memory_region_op. + * + * HC_VM_SET_MEMORY_REGIONS uses this structure to manage EPT mappings of + * multiple memory regions of a User VM. A &struct vm_memory_region_batch + * contains multiple &struct vm_memory_region_op for batch processing in the + * ACRN Hypervisor. + */ +struct vm_memory_region_batch { + u16 vmid; + u16 reserved[3]; + u32 regions_num; + u64 regions_gpa; + struct vm_memory_region_op regions_op[]; +}; + +/** + * struct vm_memory_mapping - Memory map between a User VM and the Service VM + * @pages: Pages in Service VM kernel. + * @npages: Number of pages. + * @service_vm_va: Virtual address in Service VM kernel. + * @user_vm_pa: Physical address in User VM. + * @size: Size of this memory region. + * + * HSM maintains memory mappings between a User VM GPA and the Service VM + * kernel VA for accelerating the User VM GPA translation. + */ +struct vm_memory_mapping { + struct page **pages; + int npages; + void *service_vm_va; + u64 user_vm_pa; + size_t size; +}; + +/** + * struct acrn_ioreq_buffer - Data for setting the ioreq buffer of User VM + * @ioreq_buf: The GPA of the IO request shared buffer of a VM + * + * The parameter for the HC_SET_IOREQ_BUFFER hypercall used to set up + * the shared I/O request buffer between Service VM and ACRN hypervisor. + */ +struct acrn_ioreq_buffer { + u64 ioreq_buf; +}; + +struct acrn_ioreq_range { + struct list_head list; + u32 type; + u64 start; + u64 end; +}; + +#define ACRN_IOREQ_CLIENT_DESTROYING 0U +typedef int (*ioreq_handler_t)(struct acrn_ioreq_client *client, + struct acrn_io_request *req); +/** + * struct acrn_ioreq_client - Structure of I/O client. + * @name: Client name + * @vm: The VM that the client belongs to + * @list: List node for this acrn_ioreq_client + * @is_default: If this client is the default one + * @flags: Flags (ACRN_IOREQ_CLIENT_*) + * @range_list: I/O ranges + * @range_lock: Lock to protect range_list + * @ioreqs_map: The pending I/O requests bitmap. + * @handler: I/O requests handler of this client + * @thread: The thread which executes the handler + * @wq: The wait queue for the handler thread parking + * @priv: Data for the thread + */ +struct acrn_ioreq_client { + char name[ACRN_NAME_LEN]; + struct acrn_vm *vm; + struct list_head list; + bool is_default; + unsigned long flags; + struct list_head range_list; + rwlock_t range_lock; + DECLARE_BITMAP(ioreqs_map, ACRN_IO_REQUEST_MAX); + ioreq_handler_t handler; + struct task_struct *thread; + wait_queue_head_t wq; + void *priv; +}; + +#define ACRN_INVALID_VMID (0xffffU) + +#define ACRN_VM_FLAG_DESTROYED 0U +#define ACRN_VM_FLAG_CLEARING_IOREQ 1U +extern struct list_head acrn_vm_list; +extern rwlock_t acrn_vm_list_lock; +/** + * struct acrn_vm - Properties of ACRN User VM. + * @list: Entry within global list of all VMs. + * @vmid: User VM ID. + * @vcpu_num: Number of virtual CPUs in the VM. + * @flags: Flags (ACRN_VM_FLAG_*) of the VM. This is VM + * flag management in HSM which is different + * from the &acrn_vm_creation.vm_flag. + * @regions_mapping_lock: Lock to protect &acrn_vm.regions_mapping and + * &acrn_vm.regions_mapping_count. + * @regions_mapping: Memory mappings of this VM. + * @regions_mapping_count: Number of memory mapping of this VM. + * @ioreq_clients_lock: Lock to protect ioreq_clients and default_client + * @ioreq_clients: The I/O request clients list of this VM + * @default_client: The default I/O request client + * @ioreq_buf: I/O request shared buffer + * @ioreq_page: The page of the I/O request shared buffer + * @pci_conf_addr: Address of a PCI configuration access emulation + * @monitor_page: Page of interrupt statistics of User VM + * @ioeventfds_lock: Lock to protect ioeventfds list + * @ioeventfds: List to link all hsm_ioeventfd + * @ioeventfd_client: I/O client for ioeventfds of the VM + * @irqfds_lock: Lock to protect irqfds list + * @irqfds: List to link all hsm_irqfd + * @irqfd_wq: Workqueue for irqfd async shutdown + */ +struct acrn_vm { + struct list_head list; + u16 vmid; + int vcpu_num; + unsigned long flags; + struct mutex regions_mapping_lock; + struct vm_memory_mapping regions_mapping[ACRN_MEM_MAPPING_MAX]; + int regions_mapping_count; + spinlock_t ioreq_clients_lock; + struct list_head ioreq_clients; + struct acrn_ioreq_client *default_client; + struct acrn_io_request_buffer *ioreq_buf; + struct page *ioreq_page; + u32 pci_conf_addr; + struct page *monitor_page; + struct mutex ioeventfds_lock; + struct list_head ioeventfds; + struct acrn_ioreq_client *ioeventfd_client; + struct mutex irqfds_lock; + struct list_head irqfds; + struct workqueue_struct *irqfd_wq; +}; + +struct acrn_vm *acrn_vm_create(struct acrn_vm *vm, + struct acrn_vm_creation *vm_param); +int acrn_vm_destroy(struct acrn_vm *vm); +int acrn_mm_region_add(struct acrn_vm *vm, u64 user_gpa, u64 service_gpa, + u64 size, u32 mem_type, u32 mem_access_right); +int acrn_mm_region_del(struct acrn_vm *vm, u64 user_gpa, u64 size); +int acrn_vm_memseg_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap); +int acrn_vm_memseg_unmap(struct acrn_vm *vm, struct acrn_vm_memmap *memmap); +int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap); +void acrn_vm_all_ram_unmap(struct acrn_vm *vm); + +int acrn_ioreq_init(struct acrn_vm *vm, u64 buf_vma); +void acrn_ioreq_deinit(struct acrn_vm *vm); +int acrn_ioreq_intr_setup(void); +void acrn_ioreq_intr_remove(void); +void acrn_ioreq_request_clear(struct acrn_vm *vm); +int acrn_ioreq_client_wait(struct acrn_ioreq_client *client); +int acrn_ioreq_request_default_complete(struct acrn_vm *vm, u16 vcpu); +struct acrn_ioreq_client *acrn_ioreq_client_create(struct acrn_vm *vm, + ioreq_handler_t handler, + void *data, bool is_default, + const char *name); +void acrn_ioreq_client_destroy(struct acrn_ioreq_client *client); +int acrn_ioreq_range_add(struct acrn_ioreq_client *client, + u32 type, u64 start, u64 end); +void acrn_ioreq_range_del(struct acrn_ioreq_client *client, + u32 type, u64 start, u64 end); + +int acrn_msi_inject(struct acrn_vm *vm, u64 msi_addr, u64 msi_data); + +int acrn_ioeventfd_init(struct acrn_vm *vm); +int acrn_ioeventfd_config(struct acrn_vm *vm, struct acrn_ioeventfd *args); +void acrn_ioeventfd_deinit(struct acrn_vm *vm); + +int acrn_irqfd_init(struct acrn_vm *vm); +int acrn_irqfd_config(struct acrn_vm *vm, struct acrn_irqfd *args); +void acrn_irqfd_deinit(struct acrn_vm *vm); + +#endif /* __ACRN_HSM_DRV_H */ diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c new file mode 100644 index 000000000..423ea888d --- /dev/null +++ b/drivers/virt/acrn/hsm.c @@ -0,0 +1,538 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ACRN Hypervisor Service Module (HSM) + * + * Copyright (C) 2020 Intel Corporation. All rights reserved. + * + * Authors: + * Fengwei Yin <fengwei.yin@intel.com> + * Yakui Zhao <yakui.zhao@intel.com> + */ + +#include <linux/cpu.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/slab.h> + +#include <asm/acrn.h> +#include <asm/hypervisor.h> + +#include "acrn_drv.h" + +/* + * When /dev/acrn_hsm is opened, a 'struct acrn_vm' object is created to + * represent a VM instance and continues to be associated with the opened file + * descriptor. All ioctl operations on this file descriptor will be targeted to + * the VM instance. Release of this file descriptor will destroy the object. + */ +static int acrn_dev_open(struct inode *inode, struct file *filp) +{ + struct acrn_vm *vm; + + vm = kzalloc(sizeof(*vm), GFP_KERNEL); + if (!vm) + return -ENOMEM; + + vm->vmid = ACRN_INVALID_VMID; + filp->private_data = vm; + return 0; +} + +static int pmcmd_ioctl(u64 cmd, void __user *uptr) +{ + struct acrn_pstate_data *px_data; + struct acrn_cstate_data *cx_data; + u64 *pm_info; + int ret = 0; + + switch (cmd & PMCMD_TYPE_MASK) { + case ACRN_PMCMD_GET_PX_CNT: + case ACRN_PMCMD_GET_CX_CNT: + pm_info = kmalloc(sizeof(u64), GFP_KERNEL); + if (!pm_info) + return -ENOMEM; + + ret = hcall_get_cpu_state(cmd, virt_to_phys(pm_info)); + if (ret < 0) { + kfree(pm_info); + break; + } + + if (copy_to_user(uptr, pm_info, sizeof(u64))) + ret = -EFAULT; + kfree(pm_info); + break; + case ACRN_PMCMD_GET_PX_DATA: + px_data = kmalloc(sizeof(*px_data), GFP_KERNEL); + if (!px_data) + return -ENOMEM; + + ret = hcall_get_cpu_state(cmd, virt_to_phys(px_data)); + if (ret < 0) { + kfree(px_data); + break; + } + + if (copy_to_user(uptr, px_data, sizeof(*px_data))) + ret = -EFAULT; + kfree(px_data); + break; + case ACRN_PMCMD_GET_CX_DATA: + cx_data = kmalloc(sizeof(*cx_data), GFP_KERNEL); + if (!cx_data) + return -ENOMEM; + + ret = hcall_get_cpu_state(cmd, virt_to_phys(cx_data)); + if (ret < 0) { + kfree(cx_data); + break; + } + + if (copy_to_user(uptr, cx_data, sizeof(*cx_data))) + ret = -EFAULT; + kfree(cx_data); + break; + default: + break; + } + + return ret; +} + +/* + * HSM relies on hypercall layer of the ACRN hypervisor to do the + * sanity check against the input parameters. + */ +static long acrn_dev_ioctl(struct file *filp, unsigned int cmd, + unsigned long ioctl_param) +{ + struct acrn_vm *vm = filp->private_data; + struct acrn_vm_creation *vm_param; + struct acrn_vcpu_regs *cpu_regs; + struct acrn_ioreq_notify notify; + struct acrn_ptdev_irq *irq_info; + struct acrn_ioeventfd ioeventfd; + struct acrn_vm_memmap memmap; + struct acrn_mmiodev *mmiodev; + struct acrn_msi_entry *msi; + struct acrn_pcidev *pcidev; + struct acrn_irqfd irqfd; + struct acrn_vdev *vdev; + struct page *page; + u64 cstate_cmd; + int i, ret = 0; + + if (vm->vmid == ACRN_INVALID_VMID && cmd != ACRN_IOCTL_CREATE_VM) { + dev_dbg(acrn_dev.this_device, + "ioctl 0x%x: Invalid VM state!\n", cmd); + return -EINVAL; + } + + switch (cmd) { + case ACRN_IOCTL_CREATE_VM: + vm_param = memdup_user((void __user *)ioctl_param, + sizeof(struct acrn_vm_creation)); + if (IS_ERR(vm_param)) + return PTR_ERR(vm_param); + + if ((vm_param->reserved0 | vm_param->reserved1) != 0) { + kfree(vm_param); + return -EINVAL; + } + + vm = acrn_vm_create(vm, vm_param); + if (!vm) { + ret = -EINVAL; + kfree(vm_param); + break; + } + + if (copy_to_user((void __user *)ioctl_param, vm_param, + sizeof(struct acrn_vm_creation))) { + acrn_vm_destroy(vm); + ret = -EFAULT; + } + + kfree(vm_param); + break; + case ACRN_IOCTL_START_VM: + ret = hcall_start_vm(vm->vmid); + if (ret < 0) + dev_dbg(acrn_dev.this_device, + "Failed to start VM %u!\n", vm->vmid); + break; + case ACRN_IOCTL_PAUSE_VM: + ret = hcall_pause_vm(vm->vmid); + if (ret < 0) + dev_dbg(acrn_dev.this_device, + "Failed to pause VM %u!\n", vm->vmid); + break; + case ACRN_IOCTL_RESET_VM: + ret = hcall_reset_vm(vm->vmid); + if (ret < 0) + dev_dbg(acrn_dev.this_device, + "Failed to restart VM %u!\n", vm->vmid); + break; + case ACRN_IOCTL_DESTROY_VM: + ret = acrn_vm_destroy(vm); + break; + case ACRN_IOCTL_SET_VCPU_REGS: + cpu_regs = memdup_user((void __user *)ioctl_param, + sizeof(struct acrn_vcpu_regs)); + if (IS_ERR(cpu_regs)) + return PTR_ERR(cpu_regs); + + for (i = 0; i < ARRAY_SIZE(cpu_regs->reserved); i++) + if (cpu_regs->reserved[i]) { + kfree(cpu_regs); + return -EINVAL; + } + + for (i = 0; i < ARRAY_SIZE(cpu_regs->vcpu_regs.reserved_32); i++) + if (cpu_regs->vcpu_regs.reserved_32[i]) { + kfree(cpu_regs); + return -EINVAL; + } + + for (i = 0; i < ARRAY_SIZE(cpu_regs->vcpu_regs.reserved_64); i++) + if (cpu_regs->vcpu_regs.reserved_64[i]) { + kfree(cpu_regs); + return -EINVAL; + } + + for (i = 0; i < ARRAY_SIZE(cpu_regs->vcpu_regs.gdt.reserved); i++) + if (cpu_regs->vcpu_regs.gdt.reserved[i] | + cpu_regs->vcpu_regs.idt.reserved[i]) { + kfree(cpu_regs); + return -EINVAL; + } + + ret = hcall_set_vcpu_regs(vm->vmid, virt_to_phys(cpu_regs)); + if (ret < 0) + dev_dbg(acrn_dev.this_device, + "Failed to set regs state of VM%u!\n", + vm->vmid); + kfree(cpu_regs); + break; + case ACRN_IOCTL_SET_MEMSEG: + if (copy_from_user(&memmap, (void __user *)ioctl_param, + sizeof(memmap))) + return -EFAULT; + + ret = acrn_vm_memseg_map(vm, &memmap); + break; + case ACRN_IOCTL_UNSET_MEMSEG: + if (copy_from_user(&memmap, (void __user *)ioctl_param, + sizeof(memmap))) + return -EFAULT; + + ret = acrn_vm_memseg_unmap(vm, &memmap); + break; + case ACRN_IOCTL_ASSIGN_MMIODEV: + mmiodev = memdup_user((void __user *)ioctl_param, + sizeof(struct acrn_mmiodev)); + if (IS_ERR(mmiodev)) + return PTR_ERR(mmiodev); + + ret = hcall_assign_mmiodev(vm->vmid, virt_to_phys(mmiodev)); + if (ret < 0) + dev_dbg(acrn_dev.this_device, + "Failed to assign MMIO device!\n"); + kfree(mmiodev); + break; + case ACRN_IOCTL_DEASSIGN_MMIODEV: + mmiodev = memdup_user((void __user *)ioctl_param, + sizeof(struct acrn_mmiodev)); + if (IS_ERR(mmiodev)) + return PTR_ERR(mmiodev); + + ret = hcall_deassign_mmiodev(vm->vmid, virt_to_phys(mmiodev)); + if (ret < 0) + dev_dbg(acrn_dev.this_device, + "Failed to deassign MMIO device!\n"); + kfree(mmiodev); + break; + case ACRN_IOCTL_ASSIGN_PCIDEV: + pcidev = memdup_user((void __user *)ioctl_param, + sizeof(struct acrn_pcidev)); + if (IS_ERR(pcidev)) + return PTR_ERR(pcidev); + + ret = hcall_assign_pcidev(vm->vmid, virt_to_phys(pcidev)); + if (ret < 0) + dev_dbg(acrn_dev.this_device, + "Failed to assign pci device!\n"); + kfree(pcidev); + break; + case ACRN_IOCTL_DEASSIGN_PCIDEV: + pcidev = memdup_user((void __user *)ioctl_param, + sizeof(struct acrn_pcidev)); + if (IS_ERR(pcidev)) + return PTR_ERR(pcidev); + + ret = hcall_deassign_pcidev(vm->vmid, virt_to_phys(pcidev)); + if (ret < 0) + dev_dbg(acrn_dev.this_device, + "Failed to deassign pci device!\n"); + kfree(pcidev); + break; + case ACRN_IOCTL_CREATE_VDEV: + vdev = memdup_user((void __user *)ioctl_param, + sizeof(struct acrn_vdev)); + if (IS_ERR(vdev)) + return PTR_ERR(vdev); + + ret = hcall_create_vdev(vm->vmid, virt_to_phys(vdev)); + if (ret < 0) + dev_dbg(acrn_dev.this_device, + "Failed to create virtual device!\n"); + kfree(vdev); + break; + case ACRN_IOCTL_DESTROY_VDEV: + vdev = memdup_user((void __user *)ioctl_param, + sizeof(struct acrn_vdev)); + if (IS_ERR(vdev)) + return PTR_ERR(vdev); + ret = hcall_destroy_vdev(vm->vmid, virt_to_phys(vdev)); + if (ret < 0) + dev_dbg(acrn_dev.this_device, + "Failed to destroy virtual device!\n"); + kfree(vdev); + break; + case ACRN_IOCTL_SET_PTDEV_INTR: + irq_info = memdup_user((void __user *)ioctl_param, + sizeof(struct acrn_ptdev_irq)); + if (IS_ERR(irq_info)) + return PTR_ERR(irq_info); + + ret = hcall_set_ptdev_intr(vm->vmid, virt_to_phys(irq_info)); + if (ret < 0) + dev_dbg(acrn_dev.this_device, + "Failed to configure intr for ptdev!\n"); + kfree(irq_info); + break; + case ACRN_IOCTL_RESET_PTDEV_INTR: + irq_info = memdup_user((void __user *)ioctl_param, + sizeof(struct acrn_ptdev_irq)); + if (IS_ERR(irq_info)) + return PTR_ERR(irq_info); + + ret = hcall_reset_ptdev_intr(vm->vmid, virt_to_phys(irq_info)); + if (ret < 0) + dev_dbg(acrn_dev.this_device, + "Failed to reset intr for ptdev!\n"); + kfree(irq_info); + break; + case ACRN_IOCTL_SET_IRQLINE: + ret = hcall_set_irqline(vm->vmid, ioctl_param); + if (ret < 0) + dev_dbg(acrn_dev.this_device, + "Failed to set interrupt line!\n"); + break; + case ACRN_IOCTL_INJECT_MSI: + msi = memdup_user((void __user *)ioctl_param, + sizeof(struct acrn_msi_entry)); + if (IS_ERR(msi)) + return PTR_ERR(msi); + + ret = hcall_inject_msi(vm->vmid, virt_to_phys(msi)); + if (ret < 0) + dev_dbg(acrn_dev.this_device, + "Failed to inject MSI!\n"); + kfree(msi); + break; + case ACRN_IOCTL_VM_INTR_MONITOR: + ret = pin_user_pages_fast(ioctl_param, 1, + FOLL_WRITE | FOLL_LONGTERM, &page); + if (unlikely(ret != 1)) { + dev_dbg(acrn_dev.this_device, + "Failed to pin intr hdr buffer!\n"); + return -EFAULT; + } + + ret = hcall_vm_intr_monitor(vm->vmid, page_to_phys(page)); + if (ret < 0) { + unpin_user_page(page); + dev_dbg(acrn_dev.this_device, + "Failed to monitor intr data!\n"); + return ret; + } + if (vm->monitor_page) + unpin_user_page(vm->monitor_page); + vm->monitor_page = page; + break; + case ACRN_IOCTL_CREATE_IOREQ_CLIENT: + if (vm->default_client) + return -EEXIST; + if (!acrn_ioreq_client_create(vm, NULL, NULL, true, "acrndm")) + ret = -EINVAL; + break; + case ACRN_IOCTL_DESTROY_IOREQ_CLIENT: + if (vm->default_client) + acrn_ioreq_client_destroy(vm->default_client); + break; + case ACRN_IOCTL_ATTACH_IOREQ_CLIENT: + if (vm->default_client) + ret = acrn_ioreq_client_wait(vm->default_client); + else + ret = -ENODEV; + break; + case ACRN_IOCTL_NOTIFY_REQUEST_FINISH: + if (copy_from_user(¬ify, (void __user *)ioctl_param, + sizeof(struct acrn_ioreq_notify))) + return -EFAULT; + + if (notify.reserved != 0) + return -EINVAL; + + ret = acrn_ioreq_request_default_complete(vm, notify.vcpu); + break; + case ACRN_IOCTL_CLEAR_VM_IOREQ: + acrn_ioreq_request_clear(vm); + break; + case ACRN_IOCTL_PM_GET_CPU_STATE: + if (copy_from_user(&cstate_cmd, (void __user *)ioctl_param, + sizeof(cstate_cmd))) + return -EFAULT; + + ret = pmcmd_ioctl(cstate_cmd, (void __user *)ioctl_param); + break; + case ACRN_IOCTL_IOEVENTFD: + if (copy_from_user(&ioeventfd, (void __user *)ioctl_param, + sizeof(ioeventfd))) + return -EFAULT; + + if (ioeventfd.reserved != 0) + return -EINVAL; + + ret = acrn_ioeventfd_config(vm, &ioeventfd); + break; + case ACRN_IOCTL_IRQFD: + if (copy_from_user(&irqfd, (void __user *)ioctl_param, + sizeof(irqfd))) + return -EFAULT; + ret = acrn_irqfd_config(vm, &irqfd); + break; + default: + dev_dbg(acrn_dev.this_device, "Unknown IOCTL 0x%x!\n", cmd); + ret = -ENOTTY; + } + + return ret; +} + +static int acrn_dev_release(struct inode *inode, struct file *filp) +{ + struct acrn_vm *vm = filp->private_data; + + acrn_vm_destroy(vm); + kfree(vm); + return 0; +} + +static ssize_t remove_cpu_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + u64 cpu, lapicid; + int ret; + + if (kstrtoull(buf, 0, &cpu) < 0) + return -EINVAL; + + if (cpu >= num_possible_cpus() || cpu == 0 || !cpu_is_hotpluggable(cpu)) + return -EINVAL; + + if (cpu_online(cpu)) + remove_cpu(cpu); + + lapicid = cpu_data(cpu).apicid; + dev_dbg(dev, "Try to remove cpu %lld with lapicid %lld\n", cpu, lapicid); + ret = hcall_sos_remove_cpu(lapicid); + if (ret < 0) { + dev_err(dev, "Failed to remove cpu %lld!\n", cpu); + goto fail_remove; + } + + return count; + +fail_remove: + add_cpu(cpu); + return ret; +} +static DEVICE_ATTR_WO(remove_cpu); + +static umode_t acrn_attr_visible(struct kobject *kobj, struct attribute *a, int n) +{ + if (a == &dev_attr_remove_cpu.attr) + return IS_ENABLED(CONFIG_HOTPLUG_CPU) ? a->mode : 0; + + return a->mode; +} + +static struct attribute *acrn_attrs[] = { + &dev_attr_remove_cpu.attr, + NULL +}; + +static struct attribute_group acrn_attr_group = { + .attrs = acrn_attrs, + .is_visible = acrn_attr_visible, +}; + +static const struct attribute_group *acrn_attr_groups[] = { + &acrn_attr_group, + NULL +}; + +static const struct file_operations acrn_fops = { + .owner = THIS_MODULE, + .open = acrn_dev_open, + .release = acrn_dev_release, + .unlocked_ioctl = acrn_dev_ioctl, +}; + +struct miscdevice acrn_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "acrn_hsm", + .fops = &acrn_fops, + .groups = acrn_attr_groups, +}; + +static int __init hsm_init(void) +{ + int ret; + + if (x86_hyper_type != X86_HYPER_ACRN) + return -ENODEV; + + if (!(cpuid_eax(ACRN_CPUID_FEATURES) & ACRN_FEATURE_PRIVILEGED_VM)) + return -EPERM; + + ret = misc_register(&acrn_dev); + if (ret) { + pr_err("Create misc dev failed!\n"); + return ret; + } + + ret = acrn_ioreq_intr_setup(); + if (ret) { + pr_err("Setup I/O request handler failed!\n"); + misc_deregister(&acrn_dev); + return ret; + } + return 0; +} + +static void __exit hsm_exit(void) +{ + acrn_ioreq_intr_remove(); + misc_deregister(&acrn_dev); +} +module_init(hsm_init); +module_exit(hsm_exit); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ACRN Hypervisor Service Module (HSM)"); diff --git a/drivers/virt/acrn/hypercall.h b/drivers/virt/acrn/hypercall.h new file mode 100644 index 000000000..71d300821 --- /dev/null +++ b/drivers/virt/acrn/hypercall.h @@ -0,0 +1,306 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ACRN HSM: hypercalls of ACRN Hypervisor + */ +#ifndef __ACRN_HSM_HYPERCALL_H +#define __ACRN_HSM_HYPERCALL_H +#include <asm/acrn.h> + +/* + * Hypercall IDs of the ACRN Hypervisor + */ +#define _HC_ID(x, y) (((x) << 24) | (y)) + +#define HC_ID 0x80UL + +#define HC_ID_GEN_BASE 0x0UL +#define HC_SOS_REMOVE_CPU _HC_ID(HC_ID, HC_ID_GEN_BASE + 0x01) + +#define HC_ID_VM_BASE 0x10UL +#define HC_CREATE_VM _HC_ID(HC_ID, HC_ID_VM_BASE + 0x00) +#define HC_DESTROY_VM _HC_ID(HC_ID, HC_ID_VM_BASE + 0x01) +#define HC_START_VM _HC_ID(HC_ID, HC_ID_VM_BASE + 0x02) +#define HC_PAUSE_VM _HC_ID(HC_ID, HC_ID_VM_BASE + 0x03) +#define HC_RESET_VM _HC_ID(HC_ID, HC_ID_VM_BASE + 0x05) +#define HC_SET_VCPU_REGS _HC_ID(HC_ID, HC_ID_VM_BASE + 0x06) + +#define HC_ID_IRQ_BASE 0x20UL +#define HC_INJECT_MSI _HC_ID(HC_ID, HC_ID_IRQ_BASE + 0x03) +#define HC_VM_INTR_MONITOR _HC_ID(HC_ID, HC_ID_IRQ_BASE + 0x04) +#define HC_SET_IRQLINE _HC_ID(HC_ID, HC_ID_IRQ_BASE + 0x05) + +#define HC_ID_IOREQ_BASE 0x30UL +#define HC_SET_IOREQ_BUFFER _HC_ID(HC_ID, HC_ID_IOREQ_BASE + 0x00) +#define HC_NOTIFY_REQUEST_FINISH _HC_ID(HC_ID, HC_ID_IOREQ_BASE + 0x01) + +#define HC_ID_MEM_BASE 0x40UL +#define HC_VM_SET_MEMORY_REGIONS _HC_ID(HC_ID, HC_ID_MEM_BASE + 0x02) + +#define HC_ID_PCI_BASE 0x50UL +#define HC_SET_PTDEV_INTR _HC_ID(HC_ID, HC_ID_PCI_BASE + 0x03) +#define HC_RESET_PTDEV_INTR _HC_ID(HC_ID, HC_ID_PCI_BASE + 0x04) +#define HC_ASSIGN_PCIDEV _HC_ID(HC_ID, HC_ID_PCI_BASE + 0x05) +#define HC_DEASSIGN_PCIDEV _HC_ID(HC_ID, HC_ID_PCI_BASE + 0x06) +#define HC_ASSIGN_MMIODEV _HC_ID(HC_ID, HC_ID_PCI_BASE + 0x07) +#define HC_DEASSIGN_MMIODEV _HC_ID(HC_ID, HC_ID_PCI_BASE + 0x08) +#define HC_CREATE_VDEV _HC_ID(HC_ID, HC_ID_PCI_BASE + 0x09) +#define HC_DESTROY_VDEV _HC_ID(HC_ID, HC_ID_PCI_BASE + 0x0A) + +#define HC_ID_PM_BASE 0x80UL +#define HC_PM_GET_CPU_STATE _HC_ID(HC_ID, HC_ID_PM_BASE + 0x00) + +/** + * hcall_sos_remove_cpu() - Remove a vCPU of Service VM + * @cpu: The vCPU to be removed + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_sos_remove_cpu(u64 cpu) +{ + return acrn_hypercall1(HC_SOS_REMOVE_CPU, cpu); +} + +/** + * hcall_create_vm() - Create a User VM + * @vminfo: Service VM GPA of info of User VM creation + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_create_vm(u64 vminfo) +{ + return acrn_hypercall1(HC_CREATE_VM, vminfo); +} + +/** + * hcall_start_vm() - Start a User VM + * @vmid: User VM ID + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_start_vm(u64 vmid) +{ + return acrn_hypercall1(HC_START_VM, vmid); +} + +/** + * hcall_pause_vm() - Pause a User VM + * @vmid: User VM ID + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_pause_vm(u64 vmid) +{ + return acrn_hypercall1(HC_PAUSE_VM, vmid); +} + +/** + * hcall_destroy_vm() - Destroy a User VM + * @vmid: User VM ID + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_destroy_vm(u64 vmid) +{ + return acrn_hypercall1(HC_DESTROY_VM, vmid); +} + +/** + * hcall_reset_vm() - Reset a User VM + * @vmid: User VM ID + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_reset_vm(u64 vmid) +{ + return acrn_hypercall1(HC_RESET_VM, vmid); +} + +/** + * hcall_set_vcpu_regs() - Set up registers of virtual CPU of a User VM + * @vmid: User VM ID + * @regs_state: Service VM GPA of registers state + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_set_vcpu_regs(u64 vmid, u64 regs_state) +{ + return acrn_hypercall2(HC_SET_VCPU_REGS, vmid, regs_state); +} + +/** + * hcall_inject_msi() - Deliver a MSI interrupt to a User VM + * @vmid: User VM ID + * @msi: Service VM GPA of MSI message + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_inject_msi(u64 vmid, u64 msi) +{ + return acrn_hypercall2(HC_INJECT_MSI, vmid, msi); +} + +/** + * hcall_vm_intr_monitor() - Set a shared page for User VM interrupt statistics + * @vmid: User VM ID + * @addr: Service VM GPA of the shared page + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_vm_intr_monitor(u64 vmid, u64 addr) +{ + return acrn_hypercall2(HC_VM_INTR_MONITOR, vmid, addr); +} + +/** + * hcall_set_irqline() - Set or clear an interrupt line + * @vmid: User VM ID + * @op: Service VM GPA of interrupt line operations + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_set_irqline(u64 vmid, u64 op) +{ + return acrn_hypercall2(HC_SET_IRQLINE, vmid, op); +} + +/** + * hcall_set_ioreq_buffer() - Set up the shared buffer for I/O Requests. + * @vmid: User VM ID + * @buffer: Service VM GPA of the shared buffer + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_set_ioreq_buffer(u64 vmid, u64 buffer) +{ + return acrn_hypercall2(HC_SET_IOREQ_BUFFER, vmid, buffer); +} + +/** + * hcall_notify_req_finish() - Notify ACRN Hypervisor of I/O request completion. + * @vmid: User VM ID + * @vcpu: The vCPU which initiated the I/O request + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_notify_req_finish(u64 vmid, u64 vcpu) +{ + return acrn_hypercall2(HC_NOTIFY_REQUEST_FINISH, vmid, vcpu); +} + +/** + * hcall_set_memory_regions() - Inform the hypervisor to set up EPT mappings + * @regions_pa: Service VM GPA of &struct vm_memory_region_batch + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_set_memory_regions(u64 regions_pa) +{ + return acrn_hypercall1(HC_VM_SET_MEMORY_REGIONS, regions_pa); +} + +/** + * hcall_create_vdev() - Create a virtual device for a User VM + * @vmid: User VM ID + * @addr: Service VM GPA of the &struct acrn_vdev + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_create_vdev(u64 vmid, u64 addr) +{ + return acrn_hypercall2(HC_CREATE_VDEV, vmid, addr); +} + +/** + * hcall_destroy_vdev() - Destroy a virtual device of a User VM + * @vmid: User VM ID + * @addr: Service VM GPA of the &struct acrn_vdev + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_destroy_vdev(u64 vmid, u64 addr) +{ + return acrn_hypercall2(HC_DESTROY_VDEV, vmid, addr); +} + +/** + * hcall_assign_mmiodev() - Assign a MMIO device to a User VM + * @vmid: User VM ID + * @addr: Service VM GPA of the &struct acrn_mmiodev + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_assign_mmiodev(u64 vmid, u64 addr) +{ + return acrn_hypercall2(HC_ASSIGN_MMIODEV, vmid, addr); +} + +/** + * hcall_deassign_mmiodev() - De-assign a PCI device from a User VM + * @vmid: User VM ID + * @addr: Service VM GPA of the &struct acrn_mmiodev + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_deassign_mmiodev(u64 vmid, u64 addr) +{ + return acrn_hypercall2(HC_DEASSIGN_MMIODEV, vmid, addr); +} + +/** + * hcall_assign_pcidev() - Assign a PCI device to a User VM + * @vmid: User VM ID + * @addr: Service VM GPA of the &struct acrn_pcidev + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_assign_pcidev(u64 vmid, u64 addr) +{ + return acrn_hypercall2(HC_ASSIGN_PCIDEV, vmid, addr); +} + +/** + * hcall_deassign_pcidev() - De-assign a PCI device from a User VM + * @vmid: User VM ID + * @addr: Service VM GPA of the &struct acrn_pcidev + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_deassign_pcidev(u64 vmid, u64 addr) +{ + return acrn_hypercall2(HC_DEASSIGN_PCIDEV, vmid, addr); +} + +/** + * hcall_set_ptdev_intr() - Configure an interrupt for an assigned PCI device. + * @vmid: User VM ID + * @irq: Service VM GPA of the &struct acrn_ptdev_irq + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_set_ptdev_intr(u64 vmid, u64 irq) +{ + return acrn_hypercall2(HC_SET_PTDEV_INTR, vmid, irq); +} + +/** + * hcall_reset_ptdev_intr() - Reset an interrupt for an assigned PCI device. + * @vmid: User VM ID + * @irq: Service VM GPA of the &struct acrn_ptdev_irq + * + * Return: 0 on success, <0 on failure + */ +static inline long hcall_reset_ptdev_intr(u64 vmid, u64 irq) +{ + return acrn_hypercall2(HC_RESET_PTDEV_INTR, vmid, irq); +} + +/* + * hcall_get_cpu_state() - Get P-states and C-states info from the hypervisor + * @state: Service VM GPA of buffer of P-states and C-states + */ +static inline long hcall_get_cpu_state(u64 cmd, u64 state) +{ + return acrn_hypercall2(HC_PM_GET_CPU_STATE, cmd, state); +} + +#endif /* __ACRN_HSM_HYPERCALL_H */ diff --git a/drivers/virt/acrn/ioeventfd.c b/drivers/virt/acrn/ioeventfd.c new file mode 100644 index 000000000..ac4037e9f --- /dev/null +++ b/drivers/virt/acrn/ioeventfd.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ACRN HSM eventfd - use eventfd objects to signal expected I/O requests + * + * Copyright (C) 2020 Intel Corporation. All rights reserved. + * + * Authors: + * Shuo Liu <shuo.a.liu@intel.com> + * Yakui Zhao <yakui.zhao@intel.com> + */ + +#include <linux/eventfd.h> +#include <linux/slab.h> + +#include "acrn_drv.h" + +/** + * struct hsm_ioeventfd - Properties of HSM ioeventfd + * @list: Entry within &acrn_vm.ioeventfds of ioeventfds of a VM + * @eventfd: Eventfd of the HSM ioeventfd + * @addr: Address of I/O range + * @data: Data for matching + * @length: Length of I/O range + * @type: Type of I/O range (ACRN_IOREQ_TYPE_MMIO/ACRN_IOREQ_TYPE_PORTIO) + * @wildcard: Data matching or not + */ +struct hsm_ioeventfd { + struct list_head list; + struct eventfd_ctx *eventfd; + u64 addr; + u64 data; + int length; + int type; + bool wildcard; +}; + +static inline int ioreq_type_from_flags(int flags) +{ + return flags & ACRN_IOEVENTFD_FLAG_PIO ? + ACRN_IOREQ_TYPE_PORTIO : ACRN_IOREQ_TYPE_MMIO; +} + +static void acrn_ioeventfd_shutdown(struct acrn_vm *vm, struct hsm_ioeventfd *p) +{ + lockdep_assert_held(&vm->ioeventfds_lock); + + eventfd_ctx_put(p->eventfd); + list_del(&p->list); + kfree(p); +} + +static bool hsm_ioeventfd_is_conflict(struct acrn_vm *vm, + struct hsm_ioeventfd *ioeventfd) +{ + struct hsm_ioeventfd *p; + + lockdep_assert_held(&vm->ioeventfds_lock); + + /* Either one is wildcard, the data matching will be skipped. */ + list_for_each_entry(p, &vm->ioeventfds, list) + if (p->eventfd == ioeventfd->eventfd && + p->addr == ioeventfd->addr && + p->type == ioeventfd->type && + (p->wildcard || ioeventfd->wildcard || + p->data == ioeventfd->data)) + return true; + + return false; +} + +/* + * Assign an eventfd to a VM and create a HSM ioeventfd associated with the + * eventfd. The properties of the HSM ioeventfd are built from a &struct + * acrn_ioeventfd. + */ +static int acrn_ioeventfd_assign(struct acrn_vm *vm, + struct acrn_ioeventfd *args) +{ + struct eventfd_ctx *eventfd; + struct hsm_ioeventfd *p; + int ret; + + /* Check for range overflow */ + if (args->addr + args->len < args->addr) + return -EINVAL; + + /* + * Currently, acrn_ioeventfd is used to support vhost. 1,2,4,8 width + * accesses can cover vhost's requirements. + */ + if (!(args->len == 1 || args->len == 2 || + args->len == 4 || args->len == 8)) + return -EINVAL; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + ret = -ENOMEM; + goto fail; + } + + INIT_LIST_HEAD(&p->list); + p->addr = args->addr; + p->length = args->len; + p->eventfd = eventfd; + p->type = ioreq_type_from_flags(args->flags); + + /* + * ACRN_IOEVENTFD_FLAG_DATAMATCH flag is set in virtio 1.0 support, the + * writing of notification register of each virtqueue may trigger the + * notification. There is no data matching requirement. + */ + if (args->flags & ACRN_IOEVENTFD_FLAG_DATAMATCH) + p->data = args->data; + else + p->wildcard = true; + + mutex_lock(&vm->ioeventfds_lock); + + if (hsm_ioeventfd_is_conflict(vm, p)) { + ret = -EEXIST; + goto unlock_fail; + } + + /* register the I/O range into ioreq client */ + ret = acrn_ioreq_range_add(vm->ioeventfd_client, p->type, + p->addr, p->addr + p->length - 1); + if (ret < 0) + goto unlock_fail; + + list_add_tail(&p->list, &vm->ioeventfds); + mutex_unlock(&vm->ioeventfds_lock); + + return 0; + +unlock_fail: + mutex_unlock(&vm->ioeventfds_lock); + kfree(p); +fail: + eventfd_ctx_put(eventfd); + return ret; +} + +static int acrn_ioeventfd_deassign(struct acrn_vm *vm, + struct acrn_ioeventfd *args) +{ + struct hsm_ioeventfd *p; + struct eventfd_ctx *eventfd; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + mutex_lock(&vm->ioeventfds_lock); + list_for_each_entry(p, &vm->ioeventfds, list) { + if (p->eventfd != eventfd) + continue; + + acrn_ioreq_range_del(vm->ioeventfd_client, p->type, + p->addr, p->addr + p->length - 1); + acrn_ioeventfd_shutdown(vm, p); + break; + } + mutex_unlock(&vm->ioeventfds_lock); + + eventfd_ctx_put(eventfd); + return 0; +} + +static struct hsm_ioeventfd *hsm_ioeventfd_match(struct acrn_vm *vm, u64 addr, + u64 data, int len, int type) +{ + struct hsm_ioeventfd *p = NULL; + + lockdep_assert_held(&vm->ioeventfds_lock); + + list_for_each_entry(p, &vm->ioeventfds, list) { + if (p->type == type && p->addr == addr && p->length >= len && + (p->wildcard || p->data == data)) + return p; + } + + return NULL; +} + +static int acrn_ioeventfd_handler(struct acrn_ioreq_client *client, + struct acrn_io_request *req) +{ + struct hsm_ioeventfd *p; + u64 addr, val; + int size; + + if (req->type == ACRN_IOREQ_TYPE_MMIO) { + /* + * I/O requests are dispatched by range check only, so a + * acrn_ioreq_client need process both READ and WRITE accesses + * of same range. READ accesses are safe to be ignored here + * because virtio PCI devices write the notify registers for + * notification. + */ + if (req->reqs.mmio_request.direction == ACRN_IOREQ_DIR_READ) { + /* reading does nothing and return 0 */ + req->reqs.mmio_request.value = 0; + return 0; + } + addr = req->reqs.mmio_request.address; + size = req->reqs.mmio_request.size; + val = req->reqs.mmio_request.value; + } else { + if (req->reqs.pio_request.direction == ACRN_IOREQ_DIR_READ) { + /* reading does nothing and return 0 */ + req->reqs.pio_request.value = 0; + return 0; + } + addr = req->reqs.pio_request.address; + size = req->reqs.pio_request.size; + val = req->reqs.pio_request.value; + } + + mutex_lock(&client->vm->ioeventfds_lock); + p = hsm_ioeventfd_match(client->vm, addr, val, size, req->type); + if (p) + eventfd_signal(p->eventfd, 1); + mutex_unlock(&client->vm->ioeventfds_lock); + + return 0; +} + +int acrn_ioeventfd_config(struct acrn_vm *vm, struct acrn_ioeventfd *args) +{ + int ret; + + if (args->flags & ACRN_IOEVENTFD_FLAG_DEASSIGN) + ret = acrn_ioeventfd_deassign(vm, args); + else + ret = acrn_ioeventfd_assign(vm, args); + + return ret; +} + +int acrn_ioeventfd_init(struct acrn_vm *vm) +{ + char name[ACRN_NAME_LEN]; + + mutex_init(&vm->ioeventfds_lock); + INIT_LIST_HEAD(&vm->ioeventfds); + snprintf(name, sizeof(name), "ioeventfd-%u", vm->vmid); + vm->ioeventfd_client = acrn_ioreq_client_create(vm, + acrn_ioeventfd_handler, + NULL, false, name); + if (!vm->ioeventfd_client) { + dev_err(acrn_dev.this_device, "Failed to create ioeventfd ioreq client!\n"); + return -EINVAL; + } + + dev_dbg(acrn_dev.this_device, "VM %u ioeventfd init.\n", vm->vmid); + return 0; +} + +void acrn_ioeventfd_deinit(struct acrn_vm *vm) +{ + struct hsm_ioeventfd *p, *next; + + dev_dbg(acrn_dev.this_device, "VM %u ioeventfd deinit.\n", vm->vmid); + acrn_ioreq_client_destroy(vm->ioeventfd_client); + mutex_lock(&vm->ioeventfds_lock); + list_for_each_entry_safe(p, next, &vm->ioeventfds, list) + acrn_ioeventfd_shutdown(vm, p); + mutex_unlock(&vm->ioeventfds_lock); +} diff --git a/drivers/virt/acrn/ioreq.c b/drivers/virt/acrn/ioreq.c new file mode 100644 index 000000000..d75ab3f66 --- /dev/null +++ b/drivers/virt/acrn/ioreq.c @@ -0,0 +1,652 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ACRN_HSM: Handle I/O requests + * + * Copyright (C) 2020 Intel Corporation. All rights reserved. + * + * Authors: + * Jason Chen CJ <jason.cj.chen@intel.com> + * Fengwei Yin <fengwei.yin@intel.com> + */ + +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/kthread.h> +#include <linux/mm.h> +#include <linux/slab.h> + +#include <asm/acrn.h> + +#include "acrn_drv.h" + +static void ioreq_pause(void); +static void ioreq_resume(void); + +static void ioreq_dispatcher(struct work_struct *work); +static struct workqueue_struct *ioreq_wq; +static DECLARE_WORK(ioreq_work, ioreq_dispatcher); + +static inline bool has_pending_request(struct acrn_ioreq_client *client) +{ + return !bitmap_empty(client->ioreqs_map, ACRN_IO_REQUEST_MAX); +} + +static inline bool is_destroying(struct acrn_ioreq_client *client) +{ + return test_bit(ACRN_IOREQ_CLIENT_DESTROYING, &client->flags); +} + +static int ioreq_complete_request(struct acrn_vm *vm, u16 vcpu, + struct acrn_io_request *acrn_req) +{ + bool polling_mode; + int ret = 0; + + polling_mode = acrn_req->completion_polling; + /* Add barrier() to make sure the writes are done before completion */ + smp_store_release(&acrn_req->processed, ACRN_IOREQ_STATE_COMPLETE); + + /* + * To fulfill the requirement of real-time in several industry + * scenarios, like automotive, ACRN can run under the partition mode, + * in which User VMs and Service VM are bound to dedicated CPU cores. + * Polling mode of handling the I/O request is introduced to achieve a + * faster I/O request handling. In polling mode, the hypervisor polls + * I/O request's completion. Once an I/O request is marked as + * ACRN_IOREQ_STATE_COMPLETE, hypervisor resumes from the polling point + * to continue the I/O request flow. Thus, the completion notification + * from HSM of I/O request is not needed. Please note, + * completion_polling needs to be read before the I/O request being + * marked as ACRN_IOREQ_STATE_COMPLETE to avoid racing with the + * hypervisor. + */ + if (!polling_mode) { + ret = hcall_notify_req_finish(vm->vmid, vcpu); + if (ret < 0) + dev_err(acrn_dev.this_device, + "Notify I/O request finished failed!\n"); + } + + return ret; +} + +static int acrn_ioreq_complete_request(struct acrn_ioreq_client *client, + u16 vcpu, + struct acrn_io_request *acrn_req) +{ + int ret; + + if (vcpu >= client->vm->vcpu_num) + return -EINVAL; + + clear_bit(vcpu, client->ioreqs_map); + if (!acrn_req) { + acrn_req = (struct acrn_io_request *)client->vm->ioreq_buf; + acrn_req += vcpu; + } + + ret = ioreq_complete_request(client->vm, vcpu, acrn_req); + + return ret; +} + +int acrn_ioreq_request_default_complete(struct acrn_vm *vm, u16 vcpu) +{ + int ret = 0; + + spin_lock_bh(&vm->ioreq_clients_lock); + if (vm->default_client) + ret = acrn_ioreq_complete_request(vm->default_client, + vcpu, NULL); + spin_unlock_bh(&vm->ioreq_clients_lock); + + return ret; +} + +/** + * acrn_ioreq_range_add() - Add an iorange monitored by an ioreq client + * @client: The ioreq client + * @type: Type (ACRN_IOREQ_TYPE_MMIO or ACRN_IOREQ_TYPE_PORTIO) + * @start: Start address of iorange + * @end: End address of iorange + * + * Return: 0 on success, <0 on error + */ +int acrn_ioreq_range_add(struct acrn_ioreq_client *client, + u32 type, u64 start, u64 end) +{ + struct acrn_ioreq_range *range; + + if (end < start) { + dev_err(acrn_dev.this_device, + "Invalid IO range [0x%llx,0x%llx]\n", start, end); + return -EINVAL; + } + + range = kzalloc(sizeof(*range), GFP_KERNEL); + if (!range) + return -ENOMEM; + + range->type = type; + range->start = start; + range->end = end; + + write_lock_bh(&client->range_lock); + list_add(&range->list, &client->range_list); + write_unlock_bh(&client->range_lock); + + return 0; +} + +/** + * acrn_ioreq_range_del() - Del an iorange monitored by an ioreq client + * @client: The ioreq client + * @type: Type (ACRN_IOREQ_TYPE_MMIO or ACRN_IOREQ_TYPE_PORTIO) + * @start: Start address of iorange + * @end: End address of iorange + */ +void acrn_ioreq_range_del(struct acrn_ioreq_client *client, + u32 type, u64 start, u64 end) +{ + struct acrn_ioreq_range *range; + + write_lock_bh(&client->range_lock); + list_for_each_entry(range, &client->range_list, list) { + if (type == range->type && + start == range->start && + end == range->end) { + list_del(&range->list); + kfree(range); + break; + } + } + write_unlock_bh(&client->range_lock); +} + +/* + * ioreq_task() is the execution entity of handler thread of an I/O client. + * The handler callback of the I/O client is called within the handler thread. + */ +static int ioreq_task(void *data) +{ + struct acrn_ioreq_client *client = data; + struct acrn_io_request *req; + unsigned long *ioreqs_map; + int vcpu, ret; + + /* + * Lockless access to ioreqs_map is safe, because + * 1) set_bit() and clear_bit() are atomic operations. + * 2) I/O requests arrives serialized. The access flow of ioreqs_map is: + * set_bit() - in ioreq_work handler + * Handler callback handles corresponding I/O request + * clear_bit() - in handler thread (include ACRN userspace) + * Mark corresponding I/O request completed + * Loop again if a new I/O request occurs + */ + ioreqs_map = client->ioreqs_map; + while (!kthread_should_stop()) { + acrn_ioreq_client_wait(client); + while (has_pending_request(client)) { + vcpu = find_first_bit(ioreqs_map, client->vm->vcpu_num); + req = client->vm->ioreq_buf->req_slot + vcpu; + ret = client->handler(client, req); + if (ret < 0) { + dev_err(acrn_dev.this_device, + "IO handle failure: %d\n", ret); + break; + } + acrn_ioreq_complete_request(client, vcpu, req); + } + } + + return 0; +} + +/* + * For the non-default I/O clients, give them chance to complete the current + * I/O requests if there are any. For the default I/O client, it is safe to + * clear all pending I/O requests because the clearing request is from ACRN + * userspace. + */ +void acrn_ioreq_request_clear(struct acrn_vm *vm) +{ + struct acrn_ioreq_client *client; + bool has_pending = false; + unsigned long vcpu; + int retry = 10; + + /* + * IO requests of this VM will be completed directly in + * acrn_ioreq_dispatch if ACRN_VM_FLAG_CLEARING_IOREQ flag is set. + */ + set_bit(ACRN_VM_FLAG_CLEARING_IOREQ, &vm->flags); + + /* + * acrn_ioreq_request_clear is only called in VM reset case. Simply + * wait 100ms in total for the IO requests' completion. + */ + do { + spin_lock_bh(&vm->ioreq_clients_lock); + list_for_each_entry(client, &vm->ioreq_clients, list) { + has_pending = has_pending_request(client); + if (has_pending) + break; + } + spin_unlock_bh(&vm->ioreq_clients_lock); + + if (has_pending) + schedule_timeout_interruptible(HZ / 100); + } while (has_pending && --retry > 0); + if (retry == 0) + dev_warn(acrn_dev.this_device, + "%s cannot flush pending request!\n", client->name); + + /* Clear all ioreqs belonging to the default client */ + spin_lock_bh(&vm->ioreq_clients_lock); + client = vm->default_client; + if (client) { + for_each_set_bit(vcpu, client->ioreqs_map, ACRN_IO_REQUEST_MAX) + acrn_ioreq_complete_request(client, vcpu, NULL); + } + spin_unlock_bh(&vm->ioreq_clients_lock); + + /* Clear ACRN_VM_FLAG_CLEARING_IOREQ flag after the clearing */ + clear_bit(ACRN_VM_FLAG_CLEARING_IOREQ, &vm->flags); +} + +int acrn_ioreq_client_wait(struct acrn_ioreq_client *client) +{ + if (client->is_default) { + /* + * In the default client, a user space thread waits on the + * waitqueue. The is_destroying() check is used to notify user + * space the client is going to be destroyed. + */ + wait_event_interruptible(client->wq, + has_pending_request(client) || + is_destroying(client)); + if (is_destroying(client)) + return -ENODEV; + } else { + wait_event_interruptible(client->wq, + has_pending_request(client) || + kthread_should_stop()); + } + + return 0; +} + +static bool is_cfg_addr(struct acrn_io_request *req) +{ + return ((req->type == ACRN_IOREQ_TYPE_PORTIO) && + (req->reqs.pio_request.address == 0xcf8)); +} + +static bool is_cfg_data(struct acrn_io_request *req) +{ + return ((req->type == ACRN_IOREQ_TYPE_PORTIO) && + ((req->reqs.pio_request.address >= 0xcfc) && + (req->reqs.pio_request.address < (0xcfc + 4)))); +} + +/* The low 8-bit of supported pci_reg addr.*/ +#define PCI_LOWREG_MASK 0xFC +/* The high 4-bit of supported pci_reg addr */ +#define PCI_HIGHREG_MASK 0xF00 +/* Max number of supported functions */ +#define PCI_FUNCMAX 7 +/* Max number of supported slots */ +#define PCI_SLOTMAX 31 +/* Max number of supported buses */ +#define PCI_BUSMAX 255 +#define CONF1_ENABLE 0x80000000UL +/* + * A PCI configuration space access via PIO 0xCF8 and 0xCFC normally has two + * following steps: + * 1) writes address into 0xCF8 port + * 2) accesses data in/from 0xCFC + * This function combines such paired PCI configuration space I/O requests into + * one ACRN_IOREQ_TYPE_PCICFG type I/O request and continues the processing. + */ +static bool handle_cf8cfc(struct acrn_vm *vm, + struct acrn_io_request *req, u16 vcpu) +{ + int offset, pci_cfg_addr, pci_reg; + bool is_handled = false; + + if (is_cfg_addr(req)) { + WARN_ON(req->reqs.pio_request.size != 4); + if (req->reqs.pio_request.direction == ACRN_IOREQ_DIR_WRITE) + vm->pci_conf_addr = req->reqs.pio_request.value; + else + req->reqs.pio_request.value = vm->pci_conf_addr; + is_handled = true; + } else if (is_cfg_data(req)) { + if (!(vm->pci_conf_addr & CONF1_ENABLE)) { + if (req->reqs.pio_request.direction == + ACRN_IOREQ_DIR_READ) + req->reqs.pio_request.value = 0xffffffff; + is_handled = true; + } else { + offset = req->reqs.pio_request.address - 0xcfc; + + req->type = ACRN_IOREQ_TYPE_PCICFG; + pci_cfg_addr = vm->pci_conf_addr; + req->reqs.pci_request.bus = + (pci_cfg_addr >> 16) & PCI_BUSMAX; + req->reqs.pci_request.dev = + (pci_cfg_addr >> 11) & PCI_SLOTMAX; + req->reqs.pci_request.func = + (pci_cfg_addr >> 8) & PCI_FUNCMAX; + pci_reg = (pci_cfg_addr & PCI_LOWREG_MASK) + + ((pci_cfg_addr >> 16) & PCI_HIGHREG_MASK); + req->reqs.pci_request.reg = pci_reg + offset; + } + } + + if (is_handled) + ioreq_complete_request(vm, vcpu, req); + + return is_handled; +} + +static bool in_range(struct acrn_ioreq_range *range, + struct acrn_io_request *req) +{ + bool ret = false; + + if (range->type == req->type) { + switch (req->type) { + case ACRN_IOREQ_TYPE_MMIO: + if (req->reqs.mmio_request.address >= range->start && + (req->reqs.mmio_request.address + + req->reqs.mmio_request.size - 1) <= range->end) + ret = true; + break; + case ACRN_IOREQ_TYPE_PORTIO: + if (req->reqs.pio_request.address >= range->start && + (req->reqs.pio_request.address + + req->reqs.pio_request.size - 1) <= range->end) + ret = true; + break; + default: + break; + } + } + + return ret; +} + +static struct acrn_ioreq_client *find_ioreq_client(struct acrn_vm *vm, + struct acrn_io_request *req) +{ + struct acrn_ioreq_client *client, *found = NULL; + struct acrn_ioreq_range *range; + + lockdep_assert_held(&vm->ioreq_clients_lock); + + list_for_each_entry(client, &vm->ioreq_clients, list) { + read_lock_bh(&client->range_lock); + list_for_each_entry(range, &client->range_list, list) { + if (in_range(range, req)) { + found = client; + break; + } + } + read_unlock_bh(&client->range_lock); + if (found) + break; + } + return found ? found : vm->default_client; +} + +/** + * acrn_ioreq_client_create() - Create an ioreq client + * @vm: The VM that this client belongs to + * @handler: The ioreq_handler of ioreq client acrn_hsm will create a kernel + * thread and call the handler to handle I/O requests. + * @priv: Private data for the handler + * @is_default: If it is the default client + * @name: The name of ioreq client + * + * Return: acrn_ioreq_client pointer on success, NULL on error + */ +struct acrn_ioreq_client *acrn_ioreq_client_create(struct acrn_vm *vm, + ioreq_handler_t handler, + void *priv, bool is_default, + const char *name) +{ + struct acrn_ioreq_client *client; + + if (!handler && !is_default) { + dev_dbg(acrn_dev.this_device, + "Cannot create non-default client w/o handler!\n"); + return NULL; + } + client = kzalloc(sizeof(*client), GFP_KERNEL); + if (!client) + return NULL; + + client->handler = handler; + client->vm = vm; + client->priv = priv; + client->is_default = is_default; + if (name) + strncpy(client->name, name, sizeof(client->name) - 1); + rwlock_init(&client->range_lock); + INIT_LIST_HEAD(&client->range_list); + init_waitqueue_head(&client->wq); + + if (client->handler) { + client->thread = kthread_run(ioreq_task, client, "VM%u-%s", + client->vm->vmid, client->name); + if (IS_ERR(client->thread)) { + kfree(client); + return NULL; + } + } + + spin_lock_bh(&vm->ioreq_clients_lock); + if (is_default) + vm->default_client = client; + else + list_add(&client->list, &vm->ioreq_clients); + spin_unlock_bh(&vm->ioreq_clients_lock); + + dev_dbg(acrn_dev.this_device, "Created ioreq client %s.\n", name); + return client; +} + +/** + * acrn_ioreq_client_destroy() - Destroy an ioreq client + * @client: The ioreq client + */ +void acrn_ioreq_client_destroy(struct acrn_ioreq_client *client) +{ + struct acrn_ioreq_range *range, *next; + struct acrn_vm *vm = client->vm; + + dev_dbg(acrn_dev.this_device, + "Destroy ioreq client %s.\n", client->name); + ioreq_pause(); + set_bit(ACRN_IOREQ_CLIENT_DESTROYING, &client->flags); + if (client->is_default) + wake_up_interruptible(&client->wq); + else + kthread_stop(client->thread); + + spin_lock_bh(&vm->ioreq_clients_lock); + if (client->is_default) + vm->default_client = NULL; + else + list_del(&client->list); + spin_unlock_bh(&vm->ioreq_clients_lock); + + write_lock_bh(&client->range_lock); + list_for_each_entry_safe(range, next, &client->range_list, list) { + list_del(&range->list); + kfree(range); + } + write_unlock_bh(&client->range_lock); + kfree(client); + + ioreq_resume(); +} + +static int acrn_ioreq_dispatch(struct acrn_vm *vm) +{ + struct acrn_ioreq_client *client; + struct acrn_io_request *req; + int i; + + for (i = 0; i < vm->vcpu_num; i++) { + req = vm->ioreq_buf->req_slot + i; + + /* barrier the read of processed of acrn_io_request */ + if (smp_load_acquire(&req->processed) == + ACRN_IOREQ_STATE_PENDING) { + /* Complete the IO request directly in clearing stage */ + if (test_bit(ACRN_VM_FLAG_CLEARING_IOREQ, &vm->flags)) { + ioreq_complete_request(vm, i, req); + continue; + } + if (handle_cf8cfc(vm, req, i)) + continue; + + spin_lock_bh(&vm->ioreq_clients_lock); + client = find_ioreq_client(vm, req); + if (!client) { + dev_err(acrn_dev.this_device, + "Failed to find ioreq client!\n"); + spin_unlock_bh(&vm->ioreq_clients_lock); + return -EINVAL; + } + if (!client->is_default) + req->kernel_handled = 1; + else + req->kernel_handled = 0; + /* + * Add barrier() to make sure the writes are done + * before setting ACRN_IOREQ_STATE_PROCESSING + */ + smp_store_release(&req->processed, + ACRN_IOREQ_STATE_PROCESSING); + set_bit(i, client->ioreqs_map); + wake_up_interruptible(&client->wq); + spin_unlock_bh(&vm->ioreq_clients_lock); + } + } + + return 0; +} + +static void ioreq_dispatcher(struct work_struct *work) +{ + struct acrn_vm *vm; + + read_lock(&acrn_vm_list_lock); + list_for_each_entry(vm, &acrn_vm_list, list) { + if (!vm->ioreq_buf) + break; + acrn_ioreq_dispatch(vm); + } + read_unlock(&acrn_vm_list_lock); +} + +static void ioreq_intr_handler(void) +{ + queue_work(ioreq_wq, &ioreq_work); +} + +static void ioreq_pause(void) +{ + /* Flush and unarm the handler to ensure no I/O requests pending */ + acrn_remove_intr_handler(); + drain_workqueue(ioreq_wq); +} + +static void ioreq_resume(void) +{ + /* Schedule after enabling in case other clients miss interrupt */ + acrn_setup_intr_handler(ioreq_intr_handler); + queue_work(ioreq_wq, &ioreq_work); +} + +int acrn_ioreq_intr_setup(void) +{ + acrn_setup_intr_handler(ioreq_intr_handler); + ioreq_wq = alloc_workqueue("ioreq_wq", + WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!ioreq_wq) { + dev_err(acrn_dev.this_device, "Failed to alloc workqueue!\n"); + acrn_remove_intr_handler(); + return -ENOMEM; + } + return 0; +} + +void acrn_ioreq_intr_remove(void) +{ + if (ioreq_wq) + destroy_workqueue(ioreq_wq); + acrn_remove_intr_handler(); +} + +int acrn_ioreq_init(struct acrn_vm *vm, u64 buf_vma) +{ + struct acrn_ioreq_buffer *set_buffer; + struct page *page; + int ret; + + if (vm->ioreq_buf) + return -EEXIST; + + set_buffer = kzalloc(sizeof(*set_buffer), GFP_KERNEL); + if (!set_buffer) + return -ENOMEM; + + ret = pin_user_pages_fast(buf_vma, 1, + FOLL_WRITE | FOLL_LONGTERM, &page); + if (unlikely(ret != 1) || !page) { + dev_err(acrn_dev.this_device, "Failed to pin ioreq page!\n"); + ret = -EFAULT; + goto free_buf; + } + + vm->ioreq_buf = page_address(page); + vm->ioreq_page = page; + set_buffer->ioreq_buf = page_to_phys(page); + ret = hcall_set_ioreq_buffer(vm->vmid, virt_to_phys(set_buffer)); + if (ret < 0) { + dev_err(acrn_dev.this_device, "Failed to init ioreq buffer!\n"); + unpin_user_page(page); + vm->ioreq_buf = NULL; + goto free_buf; + } + + dev_dbg(acrn_dev.this_device, + "Init ioreq buffer %pK!\n", vm->ioreq_buf); + ret = 0; +free_buf: + kfree(set_buffer); + return ret; +} + +void acrn_ioreq_deinit(struct acrn_vm *vm) +{ + struct acrn_ioreq_client *client, *next; + + dev_dbg(acrn_dev.this_device, + "Deinit ioreq buffer %pK!\n", vm->ioreq_buf); + /* Destroy all clients belonging to this VM */ + list_for_each_entry_safe(client, next, &vm->ioreq_clients, list) + acrn_ioreq_client_destroy(client); + if (vm->default_client) + acrn_ioreq_client_destroy(vm->default_client); + + if (vm->ioreq_buf && vm->ioreq_page) { + unpin_user_page(vm->ioreq_page); + vm->ioreq_buf = NULL; + } +} diff --git a/drivers/virt/acrn/irqfd.c b/drivers/virt/acrn/irqfd.c new file mode 100644 index 000000000..d4ad211dc --- /dev/null +++ b/drivers/virt/acrn/irqfd.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ACRN HSM irqfd: use eventfd objects to inject virtual interrupts + * + * Copyright (C) 2020 Intel Corporation. All rights reserved. + * + * Authors: + * Shuo Liu <shuo.a.liu@intel.com> + * Yakui Zhao <yakui.zhao@intel.com> + */ + +#include <linux/eventfd.h> +#include <linux/file.h> +#include <linux/poll.h> +#include <linux/slab.h> + +#include "acrn_drv.h" + +static LIST_HEAD(acrn_irqfd_clients); + +/** + * struct hsm_irqfd - Properties of HSM irqfd + * @vm: Associated VM pointer + * @wait: Entry of wait-queue + * @shutdown: Async shutdown work + * @eventfd: Associated eventfd + * @list: Entry within &acrn_vm.irqfds of irqfds of a VM + * @pt: Structure for select/poll on the associated eventfd + * @msi: MSI data + */ +struct hsm_irqfd { + struct acrn_vm *vm; + wait_queue_entry_t wait; + struct work_struct shutdown; + struct eventfd_ctx *eventfd; + struct list_head list; + poll_table pt; + struct acrn_msi_entry msi; +}; + +static void acrn_irqfd_inject(struct hsm_irqfd *irqfd) +{ + struct acrn_vm *vm = irqfd->vm; + + acrn_msi_inject(vm, irqfd->msi.msi_addr, + irqfd->msi.msi_data); +} + +static void hsm_irqfd_shutdown(struct hsm_irqfd *irqfd) +{ + u64 cnt; + + lockdep_assert_held(&irqfd->vm->irqfds_lock); + + /* remove from wait queue */ + list_del_init(&irqfd->list); + eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt); + eventfd_ctx_put(irqfd->eventfd); + kfree(irqfd); +} + +static void hsm_irqfd_shutdown_work(struct work_struct *work) +{ + struct hsm_irqfd *irqfd; + struct acrn_vm *vm; + + irqfd = container_of(work, struct hsm_irqfd, shutdown); + vm = irqfd->vm; + mutex_lock(&vm->irqfds_lock); + if (!list_empty(&irqfd->list)) + hsm_irqfd_shutdown(irqfd); + mutex_unlock(&vm->irqfds_lock); +} + +/* Called with wqh->lock held and interrupts disabled */ +static int hsm_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, + int sync, void *key) +{ + unsigned long poll_bits = (unsigned long)key; + struct hsm_irqfd *irqfd; + struct acrn_vm *vm; + + irqfd = container_of(wait, struct hsm_irqfd, wait); + vm = irqfd->vm; + if (poll_bits & POLLIN) + /* An event has been signaled, inject an interrupt */ + acrn_irqfd_inject(irqfd); + + if (poll_bits & POLLHUP) + /* Do shutdown work in thread to hold wqh->lock */ + queue_work(vm->irqfd_wq, &irqfd->shutdown); + + return 0; +} + +static void hsm_irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, + poll_table *pt) +{ + struct hsm_irqfd *irqfd; + + irqfd = container_of(pt, struct hsm_irqfd, pt); + add_wait_queue(wqh, &irqfd->wait); +} + +/* + * Assign an eventfd to a VM and create a HSM irqfd associated with the + * eventfd. The properties of the HSM irqfd are built from a &struct + * acrn_irqfd. + */ +static int acrn_irqfd_assign(struct acrn_vm *vm, struct acrn_irqfd *args) +{ + struct eventfd_ctx *eventfd = NULL; + struct hsm_irqfd *irqfd, *tmp; + __poll_t events; + struct fd f; + int ret = 0; + + irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); + if (!irqfd) + return -ENOMEM; + + irqfd->vm = vm; + memcpy(&irqfd->msi, &args->msi, sizeof(args->msi)); + INIT_LIST_HEAD(&irqfd->list); + INIT_WORK(&irqfd->shutdown, hsm_irqfd_shutdown_work); + + f = fdget(args->fd); + if (!f.file) { + ret = -EBADF; + goto out; + } + + eventfd = eventfd_ctx_fileget(f.file); + if (IS_ERR(eventfd)) { + ret = PTR_ERR(eventfd); + goto fail; + } + + irqfd->eventfd = eventfd; + + /* + * Install custom wake-up handling to be notified whenever underlying + * eventfd is signaled. + */ + init_waitqueue_func_entry(&irqfd->wait, hsm_irqfd_wakeup); + init_poll_funcptr(&irqfd->pt, hsm_irqfd_poll_func); + + mutex_lock(&vm->irqfds_lock); + list_for_each_entry(tmp, &vm->irqfds, list) { + if (irqfd->eventfd != tmp->eventfd) + continue; + ret = -EBUSY; + mutex_unlock(&vm->irqfds_lock); + goto fail; + } + list_add_tail(&irqfd->list, &vm->irqfds); + mutex_unlock(&vm->irqfds_lock); + + /* Check the pending event in this stage */ + events = vfs_poll(f.file, &irqfd->pt); + + if (events & EPOLLIN) + acrn_irqfd_inject(irqfd); + + fdput(f); + return 0; +fail: + if (eventfd && !IS_ERR(eventfd)) + eventfd_ctx_put(eventfd); + + fdput(f); +out: + kfree(irqfd); + return ret; +} + +static int acrn_irqfd_deassign(struct acrn_vm *vm, + struct acrn_irqfd *args) +{ + struct hsm_irqfd *irqfd, *tmp; + struct eventfd_ctx *eventfd; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + mutex_lock(&vm->irqfds_lock); + list_for_each_entry_safe(irqfd, tmp, &vm->irqfds, list) { + if (irqfd->eventfd == eventfd) { + hsm_irqfd_shutdown(irqfd); + break; + } + } + mutex_unlock(&vm->irqfds_lock); + eventfd_ctx_put(eventfd); + + return 0; +} + +int acrn_irqfd_config(struct acrn_vm *vm, struct acrn_irqfd *args) +{ + int ret; + + if (args->flags & ACRN_IRQFD_FLAG_DEASSIGN) + ret = acrn_irqfd_deassign(vm, args); + else + ret = acrn_irqfd_assign(vm, args); + + return ret; +} + +int acrn_irqfd_init(struct acrn_vm *vm) +{ + INIT_LIST_HEAD(&vm->irqfds); + mutex_init(&vm->irqfds_lock); + vm->irqfd_wq = alloc_workqueue("acrn_irqfd-%u", 0, 0, vm->vmid); + if (!vm->irqfd_wq) + return -ENOMEM; + + dev_dbg(acrn_dev.this_device, "VM %u irqfd init.\n", vm->vmid); + return 0; +} + +void acrn_irqfd_deinit(struct acrn_vm *vm) +{ + struct hsm_irqfd *irqfd, *next; + + dev_dbg(acrn_dev.this_device, "VM %u irqfd deinit.\n", vm->vmid); + destroy_workqueue(vm->irqfd_wq); + mutex_lock(&vm->irqfds_lock); + list_for_each_entry_safe(irqfd, next, &vm->irqfds, list) + hsm_irqfd_shutdown(irqfd); + mutex_unlock(&vm->irqfds_lock); +} diff --git a/drivers/virt/acrn/mm.c b/drivers/virt/acrn/mm.c new file mode 100644 index 000000000..b4ad8d452 --- /dev/null +++ b/drivers/virt/acrn/mm.c @@ -0,0 +1,329 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ACRN: Memory mapping management + * + * Copyright (C) 2020 Intel Corporation. All rights reserved. + * + * Authors: + * Fei Li <lei1.li@intel.com> + * Shuo Liu <shuo.a.liu@intel.com> + */ + +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/slab.h> + +#include "acrn_drv.h" + +static int modify_region(struct acrn_vm *vm, struct vm_memory_region_op *region) +{ + struct vm_memory_region_batch *regions; + int ret; + + regions = kzalloc(sizeof(*regions), GFP_KERNEL); + if (!regions) + return -ENOMEM; + + regions->vmid = vm->vmid; + regions->regions_num = 1; + regions->regions_gpa = virt_to_phys(region); + + ret = hcall_set_memory_regions(virt_to_phys(regions)); + if (ret < 0) + dev_dbg(acrn_dev.this_device, + "Failed to set memory region for VM[%u]!\n", vm->vmid); + + kfree(regions); + return ret; +} + +/** + * acrn_mm_region_add() - Set up the EPT mapping of a memory region. + * @vm: User VM. + * @user_gpa: A GPA of User VM. + * @service_gpa: A GPA of Service VM. + * @size: Size of the region. + * @mem_type: Combination of ACRN_MEM_TYPE_*. + * @mem_access_right: Combination of ACRN_MEM_ACCESS_*. + * + * Return: 0 on success, <0 on error. + */ +int acrn_mm_region_add(struct acrn_vm *vm, u64 user_gpa, u64 service_gpa, + u64 size, u32 mem_type, u32 mem_access_right) +{ + struct vm_memory_region_op *region; + int ret = 0; + + region = kzalloc(sizeof(*region), GFP_KERNEL); + if (!region) + return -ENOMEM; + + region->type = ACRN_MEM_REGION_ADD; + region->user_vm_pa = user_gpa; + region->service_vm_pa = service_gpa; + region->size = size; + region->attr = ((mem_type & ACRN_MEM_TYPE_MASK) | + (mem_access_right & ACRN_MEM_ACCESS_RIGHT_MASK)); + ret = modify_region(vm, region); + + dev_dbg(acrn_dev.this_device, + "%s: user-GPA[%pK] service-GPA[%pK] size[0x%llx].\n", + __func__, (void *)user_gpa, (void *)service_gpa, size); + kfree(region); + return ret; +} + +/** + * acrn_mm_region_del() - Del the EPT mapping of a memory region. + * @vm: User VM. + * @user_gpa: A GPA of the User VM. + * @size: Size of the region. + * + * Return: 0 on success, <0 for error. + */ +int acrn_mm_region_del(struct acrn_vm *vm, u64 user_gpa, u64 size) +{ + struct vm_memory_region_op *region; + int ret = 0; + + region = kzalloc(sizeof(*region), GFP_KERNEL); + if (!region) + return -ENOMEM; + + region->type = ACRN_MEM_REGION_DEL; + region->user_vm_pa = user_gpa; + region->service_vm_pa = 0UL; + region->size = size; + region->attr = 0U; + + ret = modify_region(vm, region); + + dev_dbg(acrn_dev.this_device, "%s: user-GPA[%pK] size[0x%llx].\n", + __func__, (void *)user_gpa, size); + kfree(region); + return ret; +} + +int acrn_vm_memseg_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap) +{ + int ret; + + if (memmap->type == ACRN_MEMMAP_RAM) + return acrn_vm_ram_map(vm, memmap); + + if (memmap->type != ACRN_MEMMAP_MMIO) { + dev_dbg(acrn_dev.this_device, + "Invalid memmap type: %u\n", memmap->type); + return -EINVAL; + } + + ret = acrn_mm_region_add(vm, memmap->user_vm_pa, + memmap->service_vm_pa, memmap->len, + ACRN_MEM_TYPE_UC, memmap->attr); + if (ret < 0) + dev_dbg(acrn_dev.this_device, + "Add memory region failed, VM[%u]!\n", vm->vmid); + + return ret; +} + +int acrn_vm_memseg_unmap(struct acrn_vm *vm, struct acrn_vm_memmap *memmap) +{ + int ret; + + if (memmap->type != ACRN_MEMMAP_MMIO) { + dev_dbg(acrn_dev.this_device, + "Invalid memmap type: %u\n", memmap->type); + return -EINVAL; + } + + ret = acrn_mm_region_del(vm, memmap->user_vm_pa, memmap->len); + if (ret < 0) + dev_dbg(acrn_dev.this_device, + "Del memory region failed, VM[%u]!\n", vm->vmid); + + return ret; +} + +/** + * acrn_vm_ram_map() - Create a RAM EPT mapping of User VM. + * @vm: The User VM pointer + * @memmap: Info of the EPT mapping + * + * Return: 0 on success, <0 for error. + */ +int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap) +{ + struct vm_memory_region_batch *regions_info; + int nr_pages, i = 0, order, nr_regions = 0; + struct vm_memory_mapping *region_mapping; + struct vm_memory_region_op *vm_region; + struct page **pages = NULL, *page; + void *remap_vaddr; + int ret, pinned; + u64 user_vm_pa; + unsigned long pfn; + struct vm_area_struct *vma; + + if (!vm || !memmap) + return -EINVAL; + + mmap_read_lock(current->mm); + vma = vma_lookup(current->mm, memmap->vma_base); + if (vma && ((vma->vm_flags & VM_PFNMAP) != 0)) { + if ((memmap->vma_base + memmap->len) > vma->vm_end) { + mmap_read_unlock(current->mm); + return -EINVAL; + } + + ret = follow_pfn(vma, memmap->vma_base, &pfn); + mmap_read_unlock(current->mm); + if (ret < 0) { + dev_dbg(acrn_dev.this_device, + "Failed to lookup PFN at VMA:%pK.\n", (void *)memmap->vma_base); + return ret; + } + + return acrn_mm_region_add(vm, memmap->user_vm_pa, + PFN_PHYS(pfn), memmap->len, + ACRN_MEM_TYPE_WB, memmap->attr); + } + mmap_read_unlock(current->mm); + + /* Get the page number of the map region */ + nr_pages = memmap->len >> PAGE_SHIFT; + pages = vzalloc(array_size(nr_pages, sizeof(*pages))); + if (!pages) + return -ENOMEM; + + /* Lock the pages of user memory map region */ + pinned = pin_user_pages_fast(memmap->vma_base, + nr_pages, FOLL_WRITE | FOLL_LONGTERM, + pages); + if (pinned < 0) { + ret = pinned; + goto free_pages; + } else if (pinned != nr_pages) { + ret = -EFAULT; + goto put_pages; + } + + /* Create a kernel map for the map region */ + remap_vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (!remap_vaddr) { + ret = -ENOMEM; + goto put_pages; + } + + /* Record Service VM va <-> User VM pa mapping */ + mutex_lock(&vm->regions_mapping_lock); + region_mapping = &vm->regions_mapping[vm->regions_mapping_count]; + if (vm->regions_mapping_count < ACRN_MEM_MAPPING_MAX) { + region_mapping->pages = pages; + region_mapping->npages = nr_pages; + region_mapping->size = memmap->len; + region_mapping->service_vm_va = remap_vaddr; + region_mapping->user_vm_pa = memmap->user_vm_pa; + vm->regions_mapping_count++; + } else { + dev_warn(acrn_dev.this_device, + "Run out of memory mapping slots!\n"); + ret = -ENOMEM; + mutex_unlock(&vm->regions_mapping_lock); + goto unmap_no_count; + } + mutex_unlock(&vm->regions_mapping_lock); + + /* Calculate count of vm_memory_region_op */ + while (i < nr_pages) { + page = pages[i]; + VM_BUG_ON_PAGE(PageTail(page), page); + order = compound_order(page); + nr_regions++; + i += 1 << order; + } + + /* Prepare the vm_memory_region_batch */ + regions_info = kzalloc(struct_size(regions_info, regions_op, + nr_regions), GFP_KERNEL); + if (!regions_info) { + ret = -ENOMEM; + goto unmap_kernel_map; + } + + /* Fill each vm_memory_region_op */ + vm_region = regions_info->regions_op; + regions_info->vmid = vm->vmid; + regions_info->regions_num = nr_regions; + regions_info->regions_gpa = virt_to_phys(vm_region); + user_vm_pa = memmap->user_vm_pa; + i = 0; + while (i < nr_pages) { + u32 region_size; + + page = pages[i]; + VM_BUG_ON_PAGE(PageTail(page), page); + order = compound_order(page); + region_size = PAGE_SIZE << order; + vm_region->type = ACRN_MEM_REGION_ADD; + vm_region->user_vm_pa = user_vm_pa; + vm_region->service_vm_pa = page_to_phys(page); + vm_region->size = region_size; + vm_region->attr = (ACRN_MEM_TYPE_WB & ACRN_MEM_TYPE_MASK) | + (memmap->attr & ACRN_MEM_ACCESS_RIGHT_MASK); + + vm_region++; + user_vm_pa += region_size; + i += 1 << order; + } + + /* Inform the ACRN Hypervisor to set up EPT mappings */ + ret = hcall_set_memory_regions(virt_to_phys(regions_info)); + if (ret < 0) { + dev_dbg(acrn_dev.this_device, + "Failed to set regions, VM[%u]!\n", vm->vmid); + goto unset_region; + } + kfree(regions_info); + + dev_dbg(acrn_dev.this_device, + "%s: VM[%u] service-GVA[%pK] user-GPA[%pK] size[0x%llx]\n", + __func__, vm->vmid, + remap_vaddr, (void *)memmap->user_vm_pa, memmap->len); + return ret; + +unset_region: + kfree(regions_info); +unmap_kernel_map: + mutex_lock(&vm->regions_mapping_lock); + vm->regions_mapping_count--; + mutex_unlock(&vm->regions_mapping_lock); +unmap_no_count: + vunmap(remap_vaddr); +put_pages: + for (i = 0; i < pinned; i++) + unpin_user_page(pages[i]); +free_pages: + vfree(pages); + return ret; +} + +/** + * acrn_vm_all_ram_unmap() - Destroy a RAM EPT mapping of User VM. + * @vm: The User VM + */ +void acrn_vm_all_ram_unmap(struct acrn_vm *vm) +{ + struct vm_memory_mapping *region_mapping; + int i, j; + + mutex_lock(&vm->regions_mapping_lock); + for (i = 0; i < vm->regions_mapping_count; i++) { + region_mapping = &vm->regions_mapping[i]; + vunmap(region_mapping->service_vm_va); + for (j = 0; j < region_mapping->npages; j++) + unpin_user_page(region_mapping->pages[j]); + vfree(region_mapping->pages); + } + mutex_unlock(&vm->regions_mapping_lock); +} diff --git a/drivers/virt/acrn/vm.c b/drivers/virt/acrn/vm.c new file mode 100644 index 000000000..fbc9f1042 --- /dev/null +++ b/drivers/virt/acrn/vm.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ACRN_HSM: Virtual Machine management + * + * Copyright (C) 2020 Intel Corporation. All rights reserved. + * + * Authors: + * Jason Chen CJ <jason.cj.chen@intel.com> + * Yakui Zhao <yakui.zhao@intel.com> + */ +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/slab.h> + +#include "acrn_drv.h" + +/* List of VMs */ +LIST_HEAD(acrn_vm_list); +/* + * acrn_vm_list is read in a worker thread which dispatch I/O requests and + * is wrote in VM creation ioctl. Use the rwlock mechanism to protect it. + */ +DEFINE_RWLOCK(acrn_vm_list_lock); + +struct acrn_vm *acrn_vm_create(struct acrn_vm *vm, + struct acrn_vm_creation *vm_param) +{ + int ret; + + ret = hcall_create_vm(virt_to_phys(vm_param)); + if (ret < 0 || vm_param->vmid == ACRN_INVALID_VMID) { + dev_err(acrn_dev.this_device, + "Failed to create VM! Error: %d\n", ret); + return NULL; + } + + mutex_init(&vm->regions_mapping_lock); + INIT_LIST_HEAD(&vm->ioreq_clients); + spin_lock_init(&vm->ioreq_clients_lock); + vm->vmid = vm_param->vmid; + vm->vcpu_num = vm_param->vcpu_num; + + if (acrn_ioreq_init(vm, vm_param->ioreq_buf) < 0) { + hcall_destroy_vm(vm_param->vmid); + vm->vmid = ACRN_INVALID_VMID; + return NULL; + } + + write_lock_bh(&acrn_vm_list_lock); + list_add(&vm->list, &acrn_vm_list); + write_unlock_bh(&acrn_vm_list_lock); + + acrn_ioeventfd_init(vm); + acrn_irqfd_init(vm); + dev_dbg(acrn_dev.this_device, "VM %u created.\n", vm->vmid); + return vm; +} + +int acrn_vm_destroy(struct acrn_vm *vm) +{ + int ret; + + if (vm->vmid == ACRN_INVALID_VMID || + test_and_set_bit(ACRN_VM_FLAG_DESTROYED, &vm->flags)) + return 0; + + ret = hcall_destroy_vm(vm->vmid); + if (ret < 0) { + dev_err(acrn_dev.this_device, + "Failed to destroy VM %u\n", vm->vmid); + clear_bit(ACRN_VM_FLAG_DESTROYED, &vm->flags); + return ret; + } + + /* Remove from global VM list */ + write_lock_bh(&acrn_vm_list_lock); + list_del_init(&vm->list); + write_unlock_bh(&acrn_vm_list_lock); + + acrn_ioeventfd_deinit(vm); + acrn_irqfd_deinit(vm); + acrn_ioreq_deinit(vm); + + if (vm->monitor_page) { + put_page(vm->monitor_page); + vm->monitor_page = NULL; + } + + acrn_vm_all_ram_unmap(vm); + + dev_dbg(acrn_dev.this_device, "VM %u destroyed.\n", vm->vmid); + vm->vmid = ACRN_INVALID_VMID; + return 0; +} + +/** + * acrn_msi_inject() - Inject a MSI interrupt into a User VM + * @vm: User VM + * @msi_addr: The MSI address + * @msi_data: The MSI data + * + * Return: 0 on success, <0 on error + */ +int acrn_msi_inject(struct acrn_vm *vm, u64 msi_addr, u64 msi_data) +{ + struct acrn_msi_entry *msi; + int ret; + + /* might be used in interrupt context, so use GFP_ATOMIC */ + msi = kzalloc(sizeof(*msi), GFP_ATOMIC); + if (!msi) + return -ENOMEM; + + /* + * msi_addr: addr[19:12] with dest vcpu id + * msi_data: data[7:0] with vector + */ + msi->msi_addr = msi_addr; + msi->msi_data = msi_data; + ret = hcall_inject_msi(vm->vmid, virt_to_phys(msi)); + if (ret < 0) + dev_err(acrn_dev.this_device, + "Failed to inject MSI to VM %u!\n", vm->vmid); + kfree(msi); + return ret; +} diff --git a/drivers/virt/coco/efi_secret/Kconfig b/drivers/virt/coco/efi_secret/Kconfig new file mode 100644 index 000000000..4404d198f --- /dev/null +++ b/drivers/virt/coco/efi_secret/Kconfig @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-only +config EFI_SECRET + tristate "EFI secret area securityfs support" + depends on EFI && X86_64 + select EFI_COCO_SECRET + select SECURITYFS + help + This is a driver for accessing the EFI secret area via securityfs. + The EFI secret area is a memory area designated by the firmware for + confidential computing secret injection (for example for AMD SEV + guests). The driver exposes the secrets as files in + <securityfs>/secrets/coco. Files can be read and deleted (deleting + a file wipes the secret from memory). + + To compile this driver as a module, choose M here. + The module will be called efi_secret. diff --git a/drivers/virt/coco/efi_secret/Makefile b/drivers/virt/coco/efi_secret/Makefile new file mode 100644 index 000000000..c7047ce80 --- /dev/null +++ b/drivers/virt/coco/efi_secret/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_EFI_SECRET) += efi_secret.o diff --git a/drivers/virt/coco/efi_secret/efi_secret.c b/drivers/virt/coco/efi_secret/efi_secret.c new file mode 100644 index 000000000..e700a5ef7 --- /dev/null +++ b/drivers/virt/coco/efi_secret/efi_secret.c @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * efi_secret module + * + * Copyright (C) 2022 IBM Corporation + * Author: Dov Murik <dovmurik@linux.ibm.com> + */ + +/** + * DOC: efi_secret: Allow reading EFI confidential computing (coco) secret area + * via securityfs interface. + * + * When the module is loaded (and securityfs is mounted, typically under + * /sys/kernel/security), a "secrets/coco" directory is created in securityfs. + * In it, a file is created for each secret entry. The name of each such file + * is the GUID of the secret entry, and its content is the secret data. + */ + +#include <linux/platform_device.h> +#include <linux/seq_file.h> +#include <linux/fs.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/io.h> +#include <linux/security.h> +#include <linux/efi.h> +#include <linux/cacheflush.h> + +#define EFI_SECRET_NUM_FILES 64 + +struct efi_secret { + struct dentry *secrets_dir; + struct dentry *fs_dir; + struct dentry *fs_files[EFI_SECRET_NUM_FILES]; + void __iomem *secret_data; + u64 secret_data_len; +}; + +/* + * Structure of the EFI secret area + * + * Offset Length + * (bytes) (bytes) Usage + * ------- ------- ----- + * 0 16 Secret table header GUID (must be 1e74f542-71dd-4d66-963e-ef4287ff173b) + * 16 4 Length of bytes of the entire secret area + * + * 20 16 First secret entry's GUID + * 36 4 First secret entry's length in bytes (= 16 + 4 + x) + * 40 x First secret entry's data + * + * 40+x 16 Second secret entry's GUID + * 56+x 4 Second secret entry's length in bytes (= 16 + 4 + y) + * 60+x y Second secret entry's data + * + * (... and so on for additional entries) + * + * The GUID of each secret entry designates the usage of the secret data. + */ + +/** + * struct secret_header - Header of entire secret area; this should be followed + * by instances of struct secret_entry. + * @guid: Must be EFI_SECRET_TABLE_HEADER_GUID + * @len: Length in bytes of entire secret area, including header + */ +struct secret_header { + efi_guid_t guid; + u32 len; +} __attribute((packed)); + +/** + * struct secret_entry - Holds one secret entry + * @guid: Secret-specific GUID (or NULL_GUID if this secret entry was deleted) + * @len: Length of secret entry, including its guid and len fields + * @data: The secret data (full of zeros if this secret entry was deleted) + */ +struct secret_entry { + efi_guid_t guid; + u32 len; + u8 data[]; +} __attribute((packed)); + +static size_t secret_entry_data_len(struct secret_entry *e) +{ + return e->len - sizeof(*e); +} + +static struct efi_secret the_efi_secret; + +static inline struct efi_secret *efi_secret_get(void) +{ + return &the_efi_secret; +} + +static int efi_secret_bin_file_show(struct seq_file *file, void *data) +{ + struct secret_entry *e = file->private; + + if (e) + seq_write(file, e->data, secret_entry_data_len(e)); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(efi_secret_bin_file); + +/* + * Overwrite memory content with zeroes, and ensure that dirty cache lines are + * actually written back to memory, to clear out the secret. + */ +static void wipe_memory(void *addr, size_t size) +{ + memzero_explicit(addr, size); +#ifdef CONFIG_X86 + clflush_cache_range(addr, size); +#endif +} + +static int efi_secret_unlink(struct inode *dir, struct dentry *dentry) +{ + struct efi_secret *s = efi_secret_get(); + struct inode *inode = d_inode(dentry); + struct secret_entry *e = (struct secret_entry *)inode->i_private; + int i; + + if (e) { + /* Zero out the secret data */ + wipe_memory(e->data, secret_entry_data_len(e)); + e->guid = NULL_GUID; + } + + inode->i_private = NULL; + + for (i = 0; i < EFI_SECRET_NUM_FILES; i++) + if (s->fs_files[i] == dentry) + s->fs_files[i] = NULL; + + /* + * securityfs_remove tries to lock the directory's inode, but we reach + * the unlink callback when it's already locked + */ + inode_unlock(dir); + securityfs_remove(dentry); + inode_lock(dir); + + return 0; +} + +static const struct inode_operations efi_secret_dir_inode_operations = { + .lookup = simple_lookup, + .unlink = efi_secret_unlink, +}; + +static int efi_secret_map_area(struct platform_device *dev) +{ + int ret; + struct efi_secret *s = efi_secret_get(); + struct linux_efi_coco_secret_area *secret_area; + + if (efi.coco_secret == EFI_INVALID_TABLE_ADDR) { + dev_err(&dev->dev, "Secret area address is not available\n"); + return -EINVAL; + } + + secret_area = memremap(efi.coco_secret, sizeof(*secret_area), MEMREMAP_WB); + if (secret_area == NULL) { + dev_err(&dev->dev, "Could not map secret area EFI config entry\n"); + return -ENOMEM; + } + if (!secret_area->base_pa || secret_area->size < sizeof(struct secret_header)) { + dev_err(&dev->dev, + "Invalid secret area memory location (base_pa=0x%llx size=0x%llx)\n", + secret_area->base_pa, secret_area->size); + ret = -EINVAL; + goto unmap; + } + + s->secret_data = ioremap_encrypted(secret_area->base_pa, secret_area->size); + if (s->secret_data == NULL) { + dev_err(&dev->dev, "Could not map secret area\n"); + ret = -ENOMEM; + goto unmap; + } + + s->secret_data_len = secret_area->size; + ret = 0; + +unmap: + memunmap(secret_area); + return ret; +} + +static void efi_secret_securityfs_teardown(struct platform_device *dev) +{ + struct efi_secret *s = efi_secret_get(); + int i; + + for (i = (EFI_SECRET_NUM_FILES - 1); i >= 0; i--) { + securityfs_remove(s->fs_files[i]); + s->fs_files[i] = NULL; + } + + securityfs_remove(s->fs_dir); + s->fs_dir = NULL; + + securityfs_remove(s->secrets_dir); + s->secrets_dir = NULL; + + dev_dbg(&dev->dev, "Removed securityfs entries\n"); +} + +static int efi_secret_securityfs_setup(struct platform_device *dev) +{ + struct efi_secret *s = efi_secret_get(); + int ret = 0, i = 0, bytes_left; + unsigned char *ptr; + struct secret_header *h; + struct secret_entry *e; + struct dentry *dent; + char guid_str[EFI_VARIABLE_GUID_LEN + 1]; + + ptr = (void __force *)s->secret_data; + h = (struct secret_header *)ptr; + if (efi_guidcmp(h->guid, EFI_SECRET_TABLE_HEADER_GUID)) { + /* + * This is not an error: it just means that EFI defines secret + * area but it was not populated by the Guest Owner. + */ + dev_dbg(&dev->dev, "EFI secret area does not start with correct GUID\n"); + return -ENODEV; + } + if (h->len < sizeof(*h)) { + dev_err(&dev->dev, "EFI secret area reported length is too small\n"); + return -EINVAL; + } + if (h->len > s->secret_data_len) { + dev_err(&dev->dev, "EFI secret area reported length is too big\n"); + return -EINVAL; + } + + s->secrets_dir = NULL; + s->fs_dir = NULL; + memset(s->fs_files, 0, sizeof(s->fs_files)); + + dent = securityfs_create_dir("secrets", NULL); + if (IS_ERR(dent)) { + dev_err(&dev->dev, "Error creating secrets securityfs directory entry err=%ld\n", + PTR_ERR(dent)); + return PTR_ERR(dent); + } + s->secrets_dir = dent; + + dent = securityfs_create_dir("coco", s->secrets_dir); + if (IS_ERR(dent)) { + dev_err(&dev->dev, "Error creating coco securityfs directory entry err=%ld\n", + PTR_ERR(dent)); + return PTR_ERR(dent); + } + d_inode(dent)->i_op = &efi_secret_dir_inode_operations; + s->fs_dir = dent; + + bytes_left = h->len - sizeof(*h); + ptr += sizeof(*h); + while (bytes_left >= (int)sizeof(*e) && i < EFI_SECRET_NUM_FILES) { + e = (struct secret_entry *)ptr; + if (e->len < sizeof(*e) || e->len > (unsigned int)bytes_left) { + dev_err(&dev->dev, "EFI secret area is corrupted\n"); + ret = -EINVAL; + goto err_cleanup; + } + + /* Skip deleted entries (which will have NULL_GUID) */ + if (efi_guidcmp(e->guid, NULL_GUID)) { + efi_guid_to_str(&e->guid, guid_str); + + dent = securityfs_create_file(guid_str, 0440, s->fs_dir, (void *)e, + &efi_secret_bin_file_fops); + if (IS_ERR(dent)) { + dev_err(&dev->dev, "Error creating efi_secret securityfs entry\n"); + ret = PTR_ERR(dent); + goto err_cleanup; + } + + s->fs_files[i++] = dent; + } + ptr += e->len; + bytes_left -= e->len; + } + + dev_info(&dev->dev, "Created %d entries in securityfs secrets/coco\n", i); + return 0; + +err_cleanup: + efi_secret_securityfs_teardown(dev); + return ret; +} + +static void efi_secret_unmap_area(void) +{ + struct efi_secret *s = efi_secret_get(); + + if (s->secret_data) { + iounmap(s->secret_data); + s->secret_data = NULL; + s->secret_data_len = 0; + } +} + +static int efi_secret_probe(struct platform_device *dev) +{ + int ret; + + ret = efi_secret_map_area(dev); + if (ret) + return ret; + + ret = efi_secret_securityfs_setup(dev); + if (ret) + goto err_unmap; + + return ret; + +err_unmap: + efi_secret_unmap_area(); + return ret; +} + +static int efi_secret_remove(struct platform_device *dev) +{ + efi_secret_securityfs_teardown(dev); + efi_secret_unmap_area(); + return 0; +} + +static struct platform_driver efi_secret_driver = { + .probe = efi_secret_probe, + .remove = efi_secret_remove, + .driver = { + .name = "efi_secret", + }, +}; + +module_platform_driver(efi_secret_driver); + +MODULE_DESCRIPTION("Confidential computing EFI secret area access"); +MODULE_AUTHOR("IBM"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:efi_secret"); diff --git a/drivers/virt/coco/sev-guest/Kconfig b/drivers/virt/coco/sev-guest/Kconfig new file mode 100644 index 000000000..da2d7ca53 --- /dev/null +++ b/drivers/virt/coco/sev-guest/Kconfig @@ -0,0 +1,15 @@ +config SEV_GUEST + tristate "AMD SEV Guest driver" + default m + depends on AMD_MEM_ENCRYPT + select CRYPTO + select CRYPTO_AEAD2 + select CRYPTO_GCM + help + SEV-SNP firmware provides the guest a mechanism to communicate with + the PSP without risk from a malicious hypervisor who wishes to read, + alter, drop or replay the messages sent. The driver provides + userspace interface to communicate with the PSP to request the + attestation report and more. + + If you choose 'M' here, this module will be called sev-guest. diff --git a/drivers/virt/coco/sev-guest/Makefile b/drivers/virt/coco/sev-guest/Makefile new file mode 100644 index 000000000..63d67c277 --- /dev/null +++ b/drivers/virt/coco/sev-guest/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_SEV_GUEST) += sev-guest.o diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c new file mode 100644 index 000000000..c47e54b2a --- /dev/null +++ b/drivers/virt/coco/sev-guest/sev-guest.c @@ -0,0 +1,875 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Secure Encrypted Virtualization (SEV) guest driver interface + * + * Copyright (C) 2021 Advanced Micro Devices, Inc. + * + * Author: Brijesh Singh <brijesh.singh@amd.com> + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/mutex.h> +#include <linux/io.h> +#include <linux/platform_device.h> +#include <linux/miscdevice.h> +#include <linux/set_memory.h> +#include <linux/fs.h> +#include <crypto/aead.h> +#include <linux/scatterlist.h> +#include <linux/psp-sev.h> +#include <uapi/linux/sev-guest.h> +#include <uapi/linux/psp-sev.h> + +#include <asm/svm.h> +#include <asm/sev.h> + +#include "sev-guest.h" + +#define DEVICE_NAME "sev-guest" +#define AAD_LEN 48 +#define MSG_HDR_VER 1 + +#define SNP_REQ_MAX_RETRY_DURATION (60*HZ) +#define SNP_REQ_RETRY_DELAY (2*HZ) + +struct snp_guest_crypto { + struct crypto_aead *tfm; + u8 *iv, *authtag; + int iv_len, a_len; +}; + +struct snp_guest_dev { + struct device *dev; + struct miscdevice misc; + + void *certs_data; + struct snp_guest_crypto *crypto; + /* request and response are in unencrypted memory */ + struct snp_guest_msg *request, *response; + + /* + * Avoid information leakage by double-buffering shared messages + * in fields that are in regular encrypted memory. + */ + struct snp_guest_msg secret_request, secret_response; + + struct snp_secrets_page_layout *layout; + struct snp_req_data input; + union { + struct snp_report_req report; + struct snp_derived_key_req derived_key; + struct snp_ext_report_req ext_report; + } req; + u32 *os_area_msg_seqno; + u8 *vmpck; +}; + +static u32 vmpck_id; +module_param(vmpck_id, uint, 0444); +MODULE_PARM_DESC(vmpck_id, "The VMPCK ID to use when communicating with the PSP."); + +/* Mutex to serialize the shared buffer access and command handling. */ +static DEFINE_MUTEX(snp_cmd_mutex); + +static bool is_vmpck_empty(struct snp_guest_dev *snp_dev) +{ + char zero_key[VMPCK_KEY_LEN] = {0}; + + if (snp_dev->vmpck) + return !memcmp(snp_dev->vmpck, zero_key, VMPCK_KEY_LEN); + + return true; +} + +/* + * If an error is received from the host or AMD Secure Processor (ASP) there + * are two options. Either retry the exact same encrypted request or discontinue + * using the VMPCK. + * + * This is because in the current encryption scheme GHCB v2 uses AES-GCM to + * encrypt the requests. The IV for this scheme is the sequence number. GCM + * cannot tolerate IV reuse. + * + * The ASP FW v1.51 only increments the sequence numbers on a successful + * guest<->ASP back and forth and only accepts messages at its exact sequence + * number. + * + * So if the sequence number were to be reused the encryption scheme is + * vulnerable. If the sequence number were incremented for a fresh IV the ASP + * will reject the request. + */ +static void snp_disable_vmpck(struct snp_guest_dev *snp_dev) +{ + dev_alert(snp_dev->dev, "Disabling vmpck_id %d to prevent IV reuse.\n", + vmpck_id); + memzero_explicit(snp_dev->vmpck, VMPCK_KEY_LEN); + snp_dev->vmpck = NULL; +} + +static inline u64 __snp_get_msg_seqno(struct snp_guest_dev *snp_dev) +{ + u64 count; + + lockdep_assert_held(&snp_cmd_mutex); + + /* Read the current message sequence counter from secrets pages */ + count = *snp_dev->os_area_msg_seqno; + + return count + 1; +} + +/* Return a non-zero on success */ +static u64 snp_get_msg_seqno(struct snp_guest_dev *snp_dev) +{ + u64 count = __snp_get_msg_seqno(snp_dev); + + /* + * The message sequence counter for the SNP guest request is a 64-bit + * value but the version 2 of GHCB specification defines a 32-bit storage + * for it. If the counter exceeds the 32-bit value then return zero. + * The caller should check the return value, but if the caller happens to + * not check the value and use it, then the firmware treats zero as an + * invalid number and will fail the message request. + */ + if (count >= UINT_MAX) { + dev_err(snp_dev->dev, "request message sequence counter overflow\n"); + return 0; + } + + return count; +} + +static void snp_inc_msg_seqno(struct snp_guest_dev *snp_dev) +{ + /* + * The counter is also incremented by the PSP, so increment it by 2 + * and save in secrets page. + */ + *snp_dev->os_area_msg_seqno += 2; +} + +static inline struct snp_guest_dev *to_snp_dev(struct file *file) +{ + struct miscdevice *dev = file->private_data; + + return container_of(dev, struct snp_guest_dev, misc); +} + +static struct snp_guest_crypto *init_crypto(struct snp_guest_dev *snp_dev, u8 *key, size_t keylen) +{ + struct snp_guest_crypto *crypto; + + crypto = kzalloc(sizeof(*crypto), GFP_KERNEL_ACCOUNT); + if (!crypto) + return NULL; + + crypto->tfm = crypto_alloc_aead("gcm(aes)", 0, 0); + if (IS_ERR(crypto->tfm)) + goto e_free; + + if (crypto_aead_setkey(crypto->tfm, key, keylen)) + goto e_free_crypto; + + crypto->iv_len = crypto_aead_ivsize(crypto->tfm); + crypto->iv = kmalloc(crypto->iv_len, GFP_KERNEL_ACCOUNT); + if (!crypto->iv) + goto e_free_crypto; + + if (crypto_aead_authsize(crypto->tfm) > MAX_AUTHTAG_LEN) { + if (crypto_aead_setauthsize(crypto->tfm, MAX_AUTHTAG_LEN)) { + dev_err(snp_dev->dev, "failed to set authsize to %d\n", MAX_AUTHTAG_LEN); + goto e_free_iv; + } + } + + crypto->a_len = crypto_aead_authsize(crypto->tfm); + crypto->authtag = kmalloc(crypto->a_len, GFP_KERNEL_ACCOUNT); + if (!crypto->authtag) + goto e_free_auth; + + return crypto; + +e_free_auth: + kfree(crypto->authtag); +e_free_iv: + kfree(crypto->iv); +e_free_crypto: + crypto_free_aead(crypto->tfm); +e_free: + kfree(crypto); + + return NULL; +} + +static void deinit_crypto(struct snp_guest_crypto *crypto) +{ + crypto_free_aead(crypto->tfm); + kfree(crypto->iv); + kfree(crypto->authtag); + kfree(crypto); +} + +static int enc_dec_message(struct snp_guest_crypto *crypto, struct snp_guest_msg *msg, + u8 *src_buf, u8 *dst_buf, size_t len, bool enc) +{ + struct snp_guest_msg_hdr *hdr = &msg->hdr; + struct scatterlist src[3], dst[3]; + DECLARE_CRYPTO_WAIT(wait); + struct aead_request *req; + int ret; + + req = aead_request_alloc(crypto->tfm, GFP_KERNEL); + if (!req) + return -ENOMEM; + + /* + * AEAD memory operations: + * +------ AAD -------+------- DATA -----+---- AUTHTAG----+ + * | msg header | plaintext | hdr->authtag | + * | bytes 30h - 5Fh | or | | + * | | cipher | | + * +------------------+------------------+----------------+ + */ + sg_init_table(src, 3); + sg_set_buf(&src[0], &hdr->algo, AAD_LEN); + sg_set_buf(&src[1], src_buf, hdr->msg_sz); + sg_set_buf(&src[2], hdr->authtag, crypto->a_len); + + sg_init_table(dst, 3); + sg_set_buf(&dst[0], &hdr->algo, AAD_LEN); + sg_set_buf(&dst[1], dst_buf, hdr->msg_sz); + sg_set_buf(&dst[2], hdr->authtag, crypto->a_len); + + aead_request_set_ad(req, AAD_LEN); + aead_request_set_tfm(req, crypto->tfm); + aead_request_set_callback(req, 0, crypto_req_done, &wait); + + aead_request_set_crypt(req, src, dst, len, crypto->iv); + ret = crypto_wait_req(enc ? crypto_aead_encrypt(req) : crypto_aead_decrypt(req), &wait); + + aead_request_free(req); + return ret; +} + +static int __enc_payload(struct snp_guest_dev *snp_dev, struct snp_guest_msg *msg, + void *plaintext, size_t len) +{ + struct snp_guest_crypto *crypto = snp_dev->crypto; + struct snp_guest_msg_hdr *hdr = &msg->hdr; + + memset(crypto->iv, 0, crypto->iv_len); + memcpy(crypto->iv, &hdr->msg_seqno, sizeof(hdr->msg_seqno)); + + return enc_dec_message(crypto, msg, plaintext, msg->payload, len, true); +} + +static int dec_payload(struct snp_guest_dev *snp_dev, struct snp_guest_msg *msg, + void *plaintext, size_t len) +{ + struct snp_guest_crypto *crypto = snp_dev->crypto; + struct snp_guest_msg_hdr *hdr = &msg->hdr; + + /* Build IV with response buffer sequence number */ + memset(crypto->iv, 0, crypto->iv_len); + memcpy(crypto->iv, &hdr->msg_seqno, sizeof(hdr->msg_seqno)); + + return enc_dec_message(crypto, msg, msg->payload, plaintext, len, false); +} + +static int verify_and_dec_payload(struct snp_guest_dev *snp_dev, void *payload, u32 sz) +{ + struct snp_guest_crypto *crypto = snp_dev->crypto; + struct snp_guest_msg *resp = &snp_dev->secret_response; + struct snp_guest_msg *req = &snp_dev->secret_request; + struct snp_guest_msg_hdr *req_hdr = &req->hdr; + struct snp_guest_msg_hdr *resp_hdr = &resp->hdr; + + dev_dbg(snp_dev->dev, "response [seqno %lld type %d version %d sz %d]\n", + resp_hdr->msg_seqno, resp_hdr->msg_type, resp_hdr->msg_version, resp_hdr->msg_sz); + + /* Copy response from shared memory to encrypted memory. */ + memcpy(resp, snp_dev->response, sizeof(*resp)); + + /* Verify that the sequence counter is incremented by 1 */ + if (unlikely(resp_hdr->msg_seqno != (req_hdr->msg_seqno + 1))) + return -EBADMSG; + + /* Verify response message type and version number. */ + if (resp_hdr->msg_type != (req_hdr->msg_type + 1) || + resp_hdr->msg_version != req_hdr->msg_version) + return -EBADMSG; + + /* + * If the message size is greater than our buffer length then return + * an error. + */ + if (unlikely((resp_hdr->msg_sz + crypto->a_len) > sz)) + return -EBADMSG; + + /* Decrypt the payload */ + return dec_payload(snp_dev, resp, payload, resp_hdr->msg_sz + crypto->a_len); +} + +static int enc_payload(struct snp_guest_dev *snp_dev, u64 seqno, int version, u8 type, + void *payload, size_t sz) +{ + struct snp_guest_msg *req = &snp_dev->secret_request; + struct snp_guest_msg_hdr *hdr = &req->hdr; + + memset(req, 0, sizeof(*req)); + + hdr->algo = SNP_AEAD_AES_256_GCM; + hdr->hdr_version = MSG_HDR_VER; + hdr->hdr_sz = sizeof(*hdr); + hdr->msg_type = type; + hdr->msg_version = version; + hdr->msg_seqno = seqno; + hdr->msg_vmpck = vmpck_id; + hdr->msg_sz = sz; + + /* Verify the sequence number is non-zero */ + if (!hdr->msg_seqno) + return -ENOSR; + + dev_dbg(snp_dev->dev, "request [seqno %lld type %d version %d sz %d]\n", + hdr->msg_seqno, hdr->msg_type, hdr->msg_version, hdr->msg_sz); + + return __enc_payload(snp_dev, req, payload, sz); +} + +static int __handle_guest_request(struct snp_guest_dev *snp_dev, u64 exit_code, + struct snp_guest_request_ioctl *rio) +{ + unsigned long req_start = jiffies; + unsigned int override_npages = 0; + u64 override_err = 0; + int rc; + +retry_request: + /* + * Call firmware to process the request. In this function the encrypted + * message enters shared memory with the host. So after this call the + * sequence number must be incremented or the VMPCK must be deleted to + * prevent reuse of the IV. + */ + rc = snp_issue_guest_request(exit_code, &snp_dev->input, rio); + switch (rc) { + case -ENOSPC: + /* + * If the extended guest request fails due to having too + * small of a certificate data buffer, retry the same + * guest request without the extended data request in + * order to increment the sequence number and thus avoid + * IV reuse. + */ + override_npages = snp_dev->input.data_npages; + exit_code = SVM_VMGEXIT_GUEST_REQUEST; + + /* + * Override the error to inform callers the given extended + * request buffer size was too small and give the caller the + * required buffer size. + */ + override_err = SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN); + + /* + * If this call to the firmware succeeds, the sequence number can + * be incremented allowing for continued use of the VMPCK. If + * there is an error reflected in the return value, this value + * is checked further down and the result will be the deletion + * of the VMPCK and the error code being propagated back to the + * user as an ioctl() return code. + */ + goto retry_request; + + /* + * The host may return SNP_GUEST_VMM_ERR_BUSY if the request has been + * throttled. Retry in the driver to avoid returning and reusing the + * message sequence number on a different message. + */ + case -EAGAIN: + if (jiffies - req_start > SNP_REQ_MAX_RETRY_DURATION) { + rc = -ETIMEDOUT; + break; + } + schedule_timeout_killable(SNP_REQ_RETRY_DELAY); + goto retry_request; + } + + /* + * Increment the message sequence number. There is no harm in doing + * this now because decryption uses the value stored in the response + * structure and any failure will wipe the VMPCK, preventing further + * use anyway. + */ + snp_inc_msg_seqno(snp_dev); + + if (override_err) { + rio->exitinfo2 = override_err; + + /* + * If an extended guest request was issued and the supplied certificate + * buffer was not large enough, a standard guest request was issued to + * prevent IV reuse. If the standard request was successful, return -EIO + * back to the caller as would have originally been returned. + */ + if (!rc && override_err == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN)) + rc = -EIO; + } + + if (override_npages) + snp_dev->input.data_npages = override_npages; + + return rc; +} + +static int handle_guest_request(struct snp_guest_dev *snp_dev, u64 exit_code, + struct snp_guest_request_ioctl *rio, u8 type, + void *req_buf, size_t req_sz, void *resp_buf, + u32 resp_sz) +{ + u64 seqno; + int rc; + + /* Get message sequence and verify that its a non-zero */ + seqno = snp_get_msg_seqno(snp_dev); + if (!seqno) + return -EIO; + + /* Clear shared memory's response for the host to populate. */ + memset(snp_dev->response, 0, sizeof(struct snp_guest_msg)); + + /* Encrypt the userspace provided payload in snp_dev->secret_request. */ + rc = enc_payload(snp_dev, seqno, rio->msg_version, type, req_buf, req_sz); + if (rc) + return rc; + + /* + * Write the fully encrypted request to the shared unencrypted + * request page. + */ + memcpy(snp_dev->request, &snp_dev->secret_request, + sizeof(snp_dev->secret_request)); + + rc = __handle_guest_request(snp_dev, exit_code, rio); + if (rc) { + if (rc == -EIO && + rio->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN)) + return rc; + + dev_alert(snp_dev->dev, + "Detected error from ASP request. rc: %d, exitinfo2: 0x%llx\n", + rc, rio->exitinfo2); + + snp_disable_vmpck(snp_dev); + return rc; + } + + rc = verify_and_dec_payload(snp_dev, resp_buf, resp_sz); + if (rc) { + dev_alert(snp_dev->dev, "Detected unexpected decode failure from ASP. rc: %d\n", rc); + snp_disable_vmpck(snp_dev); + return rc; + } + + return 0; +} + +static int get_report(struct snp_guest_dev *snp_dev, struct snp_guest_request_ioctl *arg) +{ + struct snp_guest_crypto *crypto = snp_dev->crypto; + struct snp_report_req *req = &snp_dev->req.report; + struct snp_report_resp *resp; + int rc, resp_len; + + lockdep_assert_held(&snp_cmd_mutex); + + if (!arg->req_data || !arg->resp_data) + return -EINVAL; + + if (copy_from_user(req, (void __user *)arg->req_data, sizeof(*req))) + return -EFAULT; + + /* + * The intermediate response buffer is used while decrypting the + * response payload. Make sure that it has enough space to cover the + * authtag. + */ + resp_len = sizeof(resp->data) + crypto->a_len; + resp = kzalloc(resp_len, GFP_KERNEL_ACCOUNT); + if (!resp) + return -ENOMEM; + + rc = handle_guest_request(snp_dev, SVM_VMGEXIT_GUEST_REQUEST, arg, + SNP_MSG_REPORT_REQ, req, sizeof(*req), resp->data, + resp_len); + if (rc) + goto e_free; + + if (copy_to_user((void __user *)arg->resp_data, resp, sizeof(*resp))) + rc = -EFAULT; + +e_free: + kfree(resp); + return rc; +} + +static int get_derived_key(struct snp_guest_dev *snp_dev, struct snp_guest_request_ioctl *arg) +{ + struct snp_derived_key_req *req = &snp_dev->req.derived_key; + struct snp_guest_crypto *crypto = snp_dev->crypto; + struct snp_derived_key_resp resp = {0}; + int rc, resp_len; + /* Response data is 64 bytes and max authsize for GCM is 16 bytes. */ + u8 buf[64 + 16]; + + lockdep_assert_held(&snp_cmd_mutex); + + if (!arg->req_data || !arg->resp_data) + return -EINVAL; + + /* + * The intermediate response buffer is used while decrypting the + * response payload. Make sure that it has enough space to cover the + * authtag. + */ + resp_len = sizeof(resp.data) + crypto->a_len; + if (sizeof(buf) < resp_len) + return -ENOMEM; + + if (copy_from_user(req, (void __user *)arg->req_data, sizeof(*req))) + return -EFAULT; + + rc = handle_guest_request(snp_dev, SVM_VMGEXIT_GUEST_REQUEST, arg, + SNP_MSG_KEY_REQ, req, sizeof(*req), buf, resp_len); + if (rc) + return rc; + + memcpy(resp.data, buf, sizeof(resp.data)); + if (copy_to_user((void __user *)arg->resp_data, &resp, sizeof(resp))) + rc = -EFAULT; + + /* The response buffer contains the sensitive data, explicitly clear it. */ + memzero_explicit(buf, sizeof(buf)); + memzero_explicit(&resp, sizeof(resp)); + return rc; +} + +static int get_ext_report(struct snp_guest_dev *snp_dev, struct snp_guest_request_ioctl *arg) +{ + struct snp_ext_report_req *req = &snp_dev->req.ext_report; + struct snp_guest_crypto *crypto = snp_dev->crypto; + struct snp_report_resp *resp; + int ret, npages = 0, resp_len; + + lockdep_assert_held(&snp_cmd_mutex); + + if (!arg->req_data || !arg->resp_data) + return -EINVAL; + + if (copy_from_user(req, (void __user *)arg->req_data, sizeof(*req))) + return -EFAULT; + + /* userspace does not want certificate data */ + if (!req->certs_len || !req->certs_address) + goto cmd; + + if (req->certs_len > SEV_FW_BLOB_MAX_SIZE || + !IS_ALIGNED(req->certs_len, PAGE_SIZE)) + return -EINVAL; + + if (!access_ok((const void __user *)req->certs_address, req->certs_len)) + return -EFAULT; + + /* + * Initialize the intermediate buffer with all zeros. This buffer + * is used in the guest request message to get the certs blob from + * the host. If host does not supply any certs in it, then copy + * zeros to indicate that certificate data was not provided. + */ + memset(snp_dev->certs_data, 0, req->certs_len); + npages = req->certs_len >> PAGE_SHIFT; +cmd: + /* + * The intermediate response buffer is used while decrypting the + * response payload. Make sure that it has enough space to cover the + * authtag. + */ + resp_len = sizeof(resp->data) + crypto->a_len; + resp = kzalloc(resp_len, GFP_KERNEL_ACCOUNT); + if (!resp) + return -ENOMEM; + + snp_dev->input.data_npages = npages; + ret = handle_guest_request(snp_dev, SVM_VMGEXIT_EXT_GUEST_REQUEST, arg, + SNP_MSG_REPORT_REQ, &req->data, + sizeof(req->data), resp->data, resp_len); + + /* If certs length is invalid then copy the returned length */ + if (arg->vmm_error == SNP_GUEST_VMM_ERR_INVALID_LEN) { + req->certs_len = snp_dev->input.data_npages << PAGE_SHIFT; + + if (copy_to_user((void __user *)arg->req_data, req, sizeof(*req))) + ret = -EFAULT; + } + + if (ret) + goto e_free; + + if (npages && + copy_to_user((void __user *)req->certs_address, snp_dev->certs_data, + req->certs_len)) { + ret = -EFAULT; + goto e_free; + } + + if (copy_to_user((void __user *)arg->resp_data, resp, sizeof(*resp))) + ret = -EFAULT; + +e_free: + kfree(resp); + return ret; +} + +static long snp_guest_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) +{ + struct snp_guest_dev *snp_dev = to_snp_dev(file); + void __user *argp = (void __user *)arg; + struct snp_guest_request_ioctl input; + int ret = -ENOTTY; + + if (copy_from_user(&input, argp, sizeof(input))) + return -EFAULT; + + input.exitinfo2 = 0xff; + + /* Message version must be non-zero */ + if (!input.msg_version) + return -EINVAL; + + mutex_lock(&snp_cmd_mutex); + + /* Check if the VMPCK is not empty */ + if (is_vmpck_empty(snp_dev)) { + dev_err_ratelimited(snp_dev->dev, "VMPCK is disabled\n"); + mutex_unlock(&snp_cmd_mutex); + return -ENOTTY; + } + + switch (ioctl) { + case SNP_GET_REPORT: + ret = get_report(snp_dev, &input); + break; + case SNP_GET_DERIVED_KEY: + ret = get_derived_key(snp_dev, &input); + break; + case SNP_GET_EXT_REPORT: + ret = get_ext_report(snp_dev, &input); + break; + default: + break; + } + + mutex_unlock(&snp_cmd_mutex); + + if (input.exitinfo2 && copy_to_user(argp, &input, sizeof(input))) + return -EFAULT; + + return ret; +} + +static void free_shared_pages(void *buf, size_t sz) +{ + unsigned int npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; + int ret; + + if (!buf) + return; + + ret = set_memory_encrypted((unsigned long)buf, npages); + if (ret) { + WARN_ONCE(ret, "failed to restore encryption mask (leak it)\n"); + return; + } + + __free_pages(virt_to_page(buf), get_order(sz)); +} + +static void *alloc_shared_pages(struct device *dev, size_t sz) +{ + unsigned int npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; + struct page *page; + int ret; + + page = alloc_pages(GFP_KERNEL_ACCOUNT, get_order(sz)); + if (!page) + return NULL; + + ret = set_memory_decrypted((unsigned long)page_address(page), npages); + if (ret) { + dev_err(dev, "failed to mark page shared, ret=%d\n", ret); + __free_pages(page, get_order(sz)); + return NULL; + } + + return page_address(page); +} + +static const struct file_operations snp_guest_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = snp_guest_ioctl, +}; + +static u8 *get_vmpck(int id, struct snp_secrets_page_layout *layout, u32 **seqno) +{ + u8 *key = NULL; + + switch (id) { + case 0: + *seqno = &layout->os_area.msg_seqno_0; + key = layout->vmpck0; + break; + case 1: + *seqno = &layout->os_area.msg_seqno_1; + key = layout->vmpck1; + break; + case 2: + *seqno = &layout->os_area.msg_seqno_2; + key = layout->vmpck2; + break; + case 3: + *seqno = &layout->os_area.msg_seqno_3; + key = layout->vmpck3; + break; + default: + break; + } + + return key; +} + +static int __init sev_guest_probe(struct platform_device *pdev) +{ + struct snp_secrets_page_layout *layout; + struct sev_guest_platform_data *data; + struct device *dev = &pdev->dev; + struct snp_guest_dev *snp_dev; + struct miscdevice *misc; + void __iomem *mapping; + int ret; + + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return -ENODEV; + + if (!dev->platform_data) + return -ENODEV; + + data = (struct sev_guest_platform_data *)dev->platform_data; + mapping = ioremap_encrypted(data->secrets_gpa, PAGE_SIZE); + if (!mapping) + return -ENODEV; + + layout = (__force void *)mapping; + + ret = -ENOMEM; + snp_dev = devm_kzalloc(&pdev->dev, sizeof(struct snp_guest_dev), GFP_KERNEL); + if (!snp_dev) + goto e_unmap; + + ret = -EINVAL; + snp_dev->vmpck = get_vmpck(vmpck_id, layout, &snp_dev->os_area_msg_seqno); + if (!snp_dev->vmpck) { + dev_err(dev, "invalid vmpck id %d\n", vmpck_id); + goto e_unmap; + } + + /* Verify that VMPCK is not zero. */ + if (is_vmpck_empty(snp_dev)) { + dev_err(dev, "vmpck id %d is null\n", vmpck_id); + goto e_unmap; + } + + platform_set_drvdata(pdev, snp_dev); + snp_dev->dev = dev; + snp_dev->layout = layout; + + /* Allocate the shared page used for the request and response message. */ + snp_dev->request = alloc_shared_pages(dev, sizeof(struct snp_guest_msg)); + if (!snp_dev->request) + goto e_unmap; + + snp_dev->response = alloc_shared_pages(dev, sizeof(struct snp_guest_msg)); + if (!snp_dev->response) + goto e_free_request; + + snp_dev->certs_data = alloc_shared_pages(dev, SEV_FW_BLOB_MAX_SIZE); + if (!snp_dev->certs_data) + goto e_free_response; + + ret = -EIO; + snp_dev->crypto = init_crypto(snp_dev, snp_dev->vmpck, VMPCK_KEY_LEN); + if (!snp_dev->crypto) + goto e_free_cert_data; + + misc = &snp_dev->misc; + misc->minor = MISC_DYNAMIC_MINOR; + misc->name = DEVICE_NAME; + misc->fops = &snp_guest_fops; + + /* initial the input address for guest request */ + snp_dev->input.req_gpa = __pa(snp_dev->request); + snp_dev->input.resp_gpa = __pa(snp_dev->response); + snp_dev->input.data_gpa = __pa(snp_dev->certs_data); + + ret = misc_register(misc); + if (ret) + goto e_free_cert_data; + + dev_info(dev, "Initialized SEV guest driver (using vmpck_id %d)\n", vmpck_id); + return 0; + +e_free_cert_data: + free_shared_pages(snp_dev->certs_data, SEV_FW_BLOB_MAX_SIZE); +e_free_response: + free_shared_pages(snp_dev->response, sizeof(struct snp_guest_msg)); +e_free_request: + free_shared_pages(snp_dev->request, sizeof(struct snp_guest_msg)); +e_unmap: + iounmap(mapping); + return ret; +} + +static int __exit sev_guest_remove(struct platform_device *pdev) +{ + struct snp_guest_dev *snp_dev = platform_get_drvdata(pdev); + + free_shared_pages(snp_dev->certs_data, SEV_FW_BLOB_MAX_SIZE); + free_shared_pages(snp_dev->response, sizeof(struct snp_guest_msg)); + free_shared_pages(snp_dev->request, sizeof(struct snp_guest_msg)); + deinit_crypto(snp_dev->crypto); + misc_deregister(&snp_dev->misc); + + return 0; +} + +/* + * This driver is meant to be a common SEV guest interface driver and to + * support any SEV guest API. As such, even though it has been introduced + * with the SEV-SNP support, it is named "sev-guest". + */ +static struct platform_driver sev_guest_driver = { + .remove = __exit_p(sev_guest_remove), + .driver = { + .name = "sev-guest", + }, +}; + +module_platform_driver_probe(sev_guest_driver, sev_guest_probe); + +MODULE_AUTHOR("Brijesh Singh <brijesh.singh@amd.com>"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0.0"); +MODULE_DESCRIPTION("AMD SEV Guest Driver"); +MODULE_ALIAS("platform:sev-guest"); diff --git a/drivers/virt/coco/sev-guest/sev-guest.h b/drivers/virt/coco/sev-guest/sev-guest.h new file mode 100644 index 000000000..21bda26fd --- /dev/null +++ b/drivers/virt/coco/sev-guest/sev-guest.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Advanced Micro Devices, Inc. + * + * Author: Brijesh Singh <brijesh.singh@amd.com> + * + * SEV-SNP API spec is available at https://developer.amd.com/sev + */ + +#ifndef __VIRT_SEVGUEST_H__ +#define __VIRT_SEVGUEST_H__ + +#include <linux/types.h> + +#define MAX_AUTHTAG_LEN 32 + +/* See SNP spec SNP_GUEST_REQUEST section for the structure */ +enum msg_type { + SNP_MSG_TYPE_INVALID = 0, + SNP_MSG_CPUID_REQ, + SNP_MSG_CPUID_RSP, + SNP_MSG_KEY_REQ, + SNP_MSG_KEY_RSP, + SNP_MSG_REPORT_REQ, + SNP_MSG_REPORT_RSP, + SNP_MSG_EXPORT_REQ, + SNP_MSG_EXPORT_RSP, + SNP_MSG_IMPORT_REQ, + SNP_MSG_IMPORT_RSP, + SNP_MSG_ABSORB_REQ, + SNP_MSG_ABSORB_RSP, + SNP_MSG_VMRK_REQ, + SNP_MSG_VMRK_RSP, + + SNP_MSG_TYPE_MAX +}; + +enum aead_algo { + SNP_AEAD_INVALID, + SNP_AEAD_AES_256_GCM, +}; + +struct snp_guest_msg_hdr { + u8 authtag[MAX_AUTHTAG_LEN]; + u64 msg_seqno; + u8 rsvd1[8]; + u8 algo; + u8 hdr_version; + u16 hdr_sz; + u8 msg_type; + u8 msg_version; + u16 msg_sz; + u32 rsvd2; + u8 msg_vmpck; + u8 rsvd3[35]; +} __packed; + +struct snp_guest_msg { + struct snp_guest_msg_hdr hdr; + u8 payload[4000]; +} __packed; + +#endif /* __VIRT_SEVGUEST_H__ */ diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c new file mode 100644 index 000000000..07035249a --- /dev/null +++ b/drivers/virt/fsl_hypervisor.c @@ -0,0 +1,932 @@ +/* + * Freescale Hypervisor Management Driver + + * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. + * Author: Timur Tabi <timur@freescale.com> + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + * + * The Freescale hypervisor management driver provides several services to + * drivers and applications related to the Freescale hypervisor: + * + * 1. An ioctl interface for querying and managing partitions. + * + * 2. A file interface to reading incoming doorbells. + * + * 3. An interrupt handler for shutting down the partition upon receiving the + * shutdown doorbell from a manager partition. + * + * 4. A kernel interface for receiving callbacks when a managed partition + * shuts down. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/err.h> +#include <linux/fs.h> +#include <linux/miscdevice.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/poll.h> +#include <linux/of.h> +#include <linux/of_irq.h> +#include <linux/reboot.h> +#include <linux/uaccess.h> +#include <linux/notifier.h> +#include <linux/interrupt.h> + +#include <linux/io.h> +#include <asm/fsl_hcalls.h> + +#include <linux/fsl_hypervisor.h> + +static BLOCKING_NOTIFIER_HEAD(failover_subscribers); + +/* + * Ioctl interface for FSL_HV_IOCTL_PARTITION_RESTART + * + * Restart a running partition + */ +static long ioctl_restart(struct fsl_hv_ioctl_restart __user *p) +{ + struct fsl_hv_ioctl_restart param; + + /* Get the parameters from the user */ + if (copy_from_user(¶m, p, sizeof(struct fsl_hv_ioctl_restart))) + return -EFAULT; + + param.ret = fh_partition_restart(param.partition); + + if (copy_to_user(&p->ret, ¶m.ret, sizeof(__u32))) + return -EFAULT; + + return 0; +} + +/* + * Ioctl interface for FSL_HV_IOCTL_PARTITION_STATUS + * + * Query the status of a partition + */ +static long ioctl_status(struct fsl_hv_ioctl_status __user *p) +{ + struct fsl_hv_ioctl_status param; + u32 status; + + /* Get the parameters from the user */ + if (copy_from_user(¶m, p, sizeof(struct fsl_hv_ioctl_status))) + return -EFAULT; + + param.ret = fh_partition_get_status(param.partition, &status); + if (!param.ret) + param.status = status; + + if (copy_to_user(p, ¶m, sizeof(struct fsl_hv_ioctl_status))) + return -EFAULT; + + return 0; +} + +/* + * Ioctl interface for FSL_HV_IOCTL_PARTITION_START + * + * Start a stopped partition. + */ +static long ioctl_start(struct fsl_hv_ioctl_start __user *p) +{ + struct fsl_hv_ioctl_start param; + + /* Get the parameters from the user */ + if (copy_from_user(¶m, p, sizeof(struct fsl_hv_ioctl_start))) + return -EFAULT; + + param.ret = fh_partition_start(param.partition, param.entry_point, + param.load); + + if (copy_to_user(&p->ret, ¶m.ret, sizeof(__u32))) + return -EFAULT; + + return 0; +} + +/* + * Ioctl interface for FSL_HV_IOCTL_PARTITION_STOP + * + * Stop a running partition + */ +static long ioctl_stop(struct fsl_hv_ioctl_stop __user *p) +{ + struct fsl_hv_ioctl_stop param; + + /* Get the parameters from the user */ + if (copy_from_user(¶m, p, sizeof(struct fsl_hv_ioctl_stop))) + return -EFAULT; + + param.ret = fh_partition_stop(param.partition); + + if (copy_to_user(&p->ret, ¶m.ret, sizeof(__u32))) + return -EFAULT; + + return 0; +} + +/* + * Ioctl interface for FSL_HV_IOCTL_MEMCPY + * + * The FH_MEMCPY hypercall takes an array of address/address/size structures + * to represent the data being copied. As a convenience to the user, this + * ioctl takes a user-create buffer and a pointer to a guest physically + * contiguous buffer in the remote partition, and creates the + * address/address/size array for the hypercall. + */ +static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p) +{ + struct fsl_hv_ioctl_memcpy param; + + struct page **pages = NULL; + void *sg_list_unaligned = NULL; + struct fh_sg_list *sg_list = NULL; + + unsigned int num_pages; + unsigned long lb_offset; /* Offset within a page of the local buffer */ + + unsigned int i; + long ret = 0; + int num_pinned = 0; /* return value from get_user_pages_fast() */ + phys_addr_t remote_paddr; /* The next address in the remote buffer */ + uint32_t count; /* The number of bytes left to copy */ + + /* Get the parameters from the user */ + if (copy_from_user(¶m, p, sizeof(struct fsl_hv_ioctl_memcpy))) + return -EFAULT; + + /* + * One partition must be local, the other must be remote. In other + * words, if source and target are both -1, or are both not -1, then + * return an error. + */ + if ((param.source == -1) == (param.target == -1)) + return -EINVAL; + + /* + * The array of pages returned by get_user_pages_fast() covers only + * page-aligned memory. Since the user buffer is probably not + * page-aligned, we need to handle the discrepancy. + * + * We calculate the offset within a page of the S/G list, and make + * adjustments accordingly. This will result in a page list that looks + * like this: + * + * ---- <-- first page starts before the buffer + * | | + * |////|-> ---- + * |////| | | + * ---- | | + * | | + * ---- | | + * |////| | | + * |////| | | + * |////| | | + * ---- | | + * | | + * ---- | | + * |////| | | + * |////| | | + * |////| | | + * ---- | | + * | | + * ---- | | + * |////| | | + * |////|-> ---- + * | | <-- last page ends after the buffer + * ---- + * + * The distance between the start of the first page and the start of the + * buffer is lb_offset. The hashed (///) areas are the parts of the + * page list that contain the actual buffer. + * + * The advantage of this approach is that the number of pages is + * equal to the number of entries in the S/G list that we give to the + * hypervisor. + */ + lb_offset = param.local_vaddr & (PAGE_SIZE - 1); + if (param.count == 0 || + param.count > U64_MAX - lb_offset - PAGE_SIZE + 1) + return -EINVAL; + num_pages = (param.count + lb_offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + + /* Allocate the buffers we need */ + + /* + * 'pages' is an array of struct page pointers that's initialized by + * get_user_pages_fast(). + */ + pages = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) { + pr_debug("fsl-hv: could not allocate page list\n"); + return -ENOMEM; + } + + /* + * sg_list is the list of fh_sg_list objects that we pass to the + * hypervisor. + */ + sg_list_unaligned = kmalloc(num_pages * sizeof(struct fh_sg_list) + + sizeof(struct fh_sg_list) - 1, GFP_KERNEL); + if (!sg_list_unaligned) { + pr_debug("fsl-hv: could not allocate S/G list\n"); + ret = -ENOMEM; + goto free_pages; + } + sg_list = PTR_ALIGN(sg_list_unaligned, sizeof(struct fh_sg_list)); + + /* Get the physical addresses of the source buffer */ + num_pinned = get_user_pages_fast(param.local_vaddr - lb_offset, + num_pages, param.source != -1 ? FOLL_WRITE : 0, pages); + + if (num_pinned != num_pages) { + pr_debug("fsl-hv: could not lock source buffer\n"); + ret = (num_pinned < 0) ? num_pinned : -EFAULT; + goto exit; + } + + /* + * Build the fh_sg_list[] array. The first page is special + * because it's misaligned. + */ + if (param.source == -1) { + sg_list[0].source = page_to_phys(pages[0]) + lb_offset; + sg_list[0].target = param.remote_paddr; + } else { + sg_list[0].source = param.remote_paddr; + sg_list[0].target = page_to_phys(pages[0]) + lb_offset; + } + sg_list[0].size = min_t(uint64_t, param.count, PAGE_SIZE - lb_offset); + + remote_paddr = param.remote_paddr + sg_list[0].size; + count = param.count - sg_list[0].size; + + for (i = 1; i < num_pages; i++) { + if (param.source == -1) { + /* local to remote */ + sg_list[i].source = page_to_phys(pages[i]); + sg_list[i].target = remote_paddr; + } else { + /* remote to local */ + sg_list[i].source = remote_paddr; + sg_list[i].target = page_to_phys(pages[i]); + } + sg_list[i].size = min_t(uint64_t, count, PAGE_SIZE); + + remote_paddr += sg_list[i].size; + count -= sg_list[i].size; + } + + param.ret = fh_partition_memcpy(param.source, param.target, + virt_to_phys(sg_list), num_pages); + +exit: + if (pages && (num_pinned > 0)) { + for (i = 0; i < num_pinned; i++) + put_page(pages[i]); + } + + kfree(sg_list_unaligned); +free_pages: + kfree(pages); + + if (!ret) + if (copy_to_user(&p->ret, ¶m.ret, sizeof(__u32))) + return -EFAULT; + + return ret; +} + +/* + * Ioctl interface for FSL_HV_IOCTL_DOORBELL + * + * Ring a doorbell + */ +static long ioctl_doorbell(struct fsl_hv_ioctl_doorbell __user *p) +{ + struct fsl_hv_ioctl_doorbell param; + + /* Get the parameters from the user. */ + if (copy_from_user(¶m, p, sizeof(struct fsl_hv_ioctl_doorbell))) + return -EFAULT; + + param.ret = ev_doorbell_send(param.doorbell); + + if (copy_to_user(&p->ret, ¶m.ret, sizeof(__u32))) + return -EFAULT; + + return 0; +} + +static long ioctl_dtprop(struct fsl_hv_ioctl_prop __user *p, int set) +{ + struct fsl_hv_ioctl_prop param; + char __user *upath, *upropname; + void __user *upropval; + char *path, *propname; + void *propval; + int ret = 0; + + /* Get the parameters from the user. */ + if (copy_from_user(¶m, p, sizeof(struct fsl_hv_ioctl_prop))) + return -EFAULT; + + upath = (char __user *)(uintptr_t)param.path; + upropname = (char __user *)(uintptr_t)param.propname; + upropval = (void __user *)(uintptr_t)param.propval; + + path = strndup_user(upath, FH_DTPROP_MAX_PATHLEN); + if (IS_ERR(path)) + return PTR_ERR(path); + + propname = strndup_user(upropname, FH_DTPROP_MAX_PATHLEN); + if (IS_ERR(propname)) { + ret = PTR_ERR(propname); + goto err_free_path; + } + + if (param.proplen > FH_DTPROP_MAX_PROPLEN) { + ret = -EINVAL; + goto err_free_propname; + } + + propval = kmalloc(param.proplen, GFP_KERNEL); + if (!propval) { + ret = -ENOMEM; + goto err_free_propname; + } + + if (set) { + if (copy_from_user(propval, upropval, param.proplen)) { + ret = -EFAULT; + goto err_free_propval; + } + + param.ret = fh_partition_set_dtprop(param.handle, + virt_to_phys(path), + virt_to_phys(propname), + virt_to_phys(propval), + param.proplen); + } else { + param.ret = fh_partition_get_dtprop(param.handle, + virt_to_phys(path), + virt_to_phys(propname), + virt_to_phys(propval), + ¶m.proplen); + + if (param.ret == 0) { + if (copy_to_user(upropval, propval, param.proplen) || + put_user(param.proplen, &p->proplen)) { + ret = -EFAULT; + goto err_free_propval; + } + } + } + + if (put_user(param.ret, &p->ret)) + ret = -EFAULT; + +err_free_propval: + kfree(propval); +err_free_propname: + kfree(propname); +err_free_path: + kfree(path); + + return ret; +} + +/* + * Ioctl main entry point + */ +static long fsl_hv_ioctl(struct file *file, unsigned int cmd, + unsigned long argaddr) +{ + void __user *arg = (void __user *)argaddr; + long ret; + + switch (cmd) { + case FSL_HV_IOCTL_PARTITION_RESTART: + ret = ioctl_restart(arg); + break; + case FSL_HV_IOCTL_PARTITION_GET_STATUS: + ret = ioctl_status(arg); + break; + case FSL_HV_IOCTL_PARTITION_START: + ret = ioctl_start(arg); + break; + case FSL_HV_IOCTL_PARTITION_STOP: + ret = ioctl_stop(arg); + break; + case FSL_HV_IOCTL_MEMCPY: + ret = ioctl_memcpy(arg); + break; + case FSL_HV_IOCTL_DOORBELL: + ret = ioctl_doorbell(arg); + break; + case FSL_HV_IOCTL_GETPROP: + ret = ioctl_dtprop(arg, 0); + break; + case FSL_HV_IOCTL_SETPROP: + ret = ioctl_dtprop(arg, 1); + break; + default: + pr_debug("fsl-hv: bad ioctl dir=%u type=%u cmd=%u size=%u\n", + _IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), + _IOC_SIZE(cmd)); + return -ENOTTY; + } + + return ret; +} + +/* Linked list of processes that have us open */ +static struct list_head db_list; + +/* spinlock for db_list */ +static DEFINE_SPINLOCK(db_list_lock); + +/* The size of the doorbell event queue. This must be a power of two. */ +#define QSIZE 16 + +/* Returns the next head/tail pointer, wrapping around the queue if necessary */ +#define nextp(x) (((x) + 1) & (QSIZE - 1)) + +/* Per-open data structure */ +struct doorbell_queue { + struct list_head list; + spinlock_t lock; + wait_queue_head_t wait; + unsigned int head; + unsigned int tail; + uint32_t q[QSIZE]; +}; + +/* Linked list of ISRs that we registered */ +struct list_head isr_list; + +/* Per-ISR data structure */ +struct doorbell_isr { + struct list_head list; + unsigned int irq; + uint32_t doorbell; /* The doorbell handle */ + uint32_t partition; /* The partition handle, if used */ +}; + +/* + * Add a doorbell to all of the doorbell queues + */ +static void fsl_hv_queue_doorbell(uint32_t doorbell) +{ + struct doorbell_queue *dbq; + unsigned long flags; + + /* Prevent another core from modifying db_list */ + spin_lock_irqsave(&db_list_lock, flags); + + list_for_each_entry(dbq, &db_list, list) { + if (dbq->head != nextp(dbq->tail)) { + dbq->q[dbq->tail] = doorbell; + /* + * This memory barrier eliminates the need to grab + * the spinlock for dbq. + */ + smp_wmb(); + dbq->tail = nextp(dbq->tail); + wake_up_interruptible(&dbq->wait); + } + } + + spin_unlock_irqrestore(&db_list_lock, flags); +} + +/* + * Interrupt handler for all doorbells + * + * We use the same interrupt handler for all doorbells. Whenever a doorbell + * is rung, and we receive an interrupt, we just put the handle for that + * doorbell (passed to us as *data) into all of the queues. + */ +static irqreturn_t fsl_hv_isr(int irq, void *data) +{ + fsl_hv_queue_doorbell((uintptr_t) data); + + return IRQ_HANDLED; +} + +/* + * State change thread function + * + * The state change notification arrives in an interrupt, but we can't call + * blocking_notifier_call_chain() in an interrupt handler. We could call + * atomic_notifier_call_chain(), but that would require the clients' call-back + * function to run in interrupt context. Since we don't want to impose that + * restriction on the clients, we use a threaded IRQ to process the + * notification in kernel context. + */ +static irqreturn_t fsl_hv_state_change_thread(int irq, void *data) +{ + struct doorbell_isr *dbisr = data; + + blocking_notifier_call_chain(&failover_subscribers, dbisr->partition, + NULL); + + return IRQ_HANDLED; +} + +/* + * Interrupt handler for state-change doorbells + */ +static irqreturn_t fsl_hv_state_change_isr(int irq, void *data) +{ + unsigned int status; + struct doorbell_isr *dbisr = data; + int ret; + + /* It's still a doorbell, so add it to all the queues. */ + fsl_hv_queue_doorbell(dbisr->doorbell); + + /* Determine the new state, and if it's stopped, notify the clients. */ + ret = fh_partition_get_status(dbisr->partition, &status); + if (!ret && (status == FH_PARTITION_STOPPED)) + return IRQ_WAKE_THREAD; + + return IRQ_HANDLED; +} + +/* + * Returns a bitmask indicating whether a read will block + */ +static __poll_t fsl_hv_poll(struct file *filp, struct poll_table_struct *p) +{ + struct doorbell_queue *dbq = filp->private_data; + unsigned long flags; + __poll_t mask; + + spin_lock_irqsave(&dbq->lock, flags); + + poll_wait(filp, &dbq->wait, p); + mask = (dbq->head == dbq->tail) ? 0 : (EPOLLIN | EPOLLRDNORM); + + spin_unlock_irqrestore(&dbq->lock, flags); + + return mask; +} + +/* + * Return the handles for any incoming doorbells + * + * If there are doorbell handles in the queue for this open instance, then + * return them to the caller as an array of 32-bit integers. Otherwise, + * block until there is at least one handle to return. + */ +static ssize_t fsl_hv_read(struct file *filp, char __user *buf, size_t len, + loff_t *off) +{ + struct doorbell_queue *dbq = filp->private_data; + uint32_t __user *p = (uint32_t __user *) buf; /* for put_user() */ + unsigned long flags; + ssize_t count = 0; + + /* Make sure we stop when the user buffer is full. */ + while (len >= sizeof(uint32_t)) { + uint32_t dbell; /* Local copy of doorbell queue data */ + + spin_lock_irqsave(&dbq->lock, flags); + + /* + * If the queue is empty, then either we're done or we need + * to block. If the application specified O_NONBLOCK, then + * we return the appropriate error code. + */ + if (dbq->head == dbq->tail) { + spin_unlock_irqrestore(&dbq->lock, flags); + if (count) + break; + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + if (wait_event_interruptible(dbq->wait, + dbq->head != dbq->tail)) + return -ERESTARTSYS; + continue; + } + + /* + * Even though we have an smp_wmb() in the ISR, the core + * might speculatively execute the "dbell = ..." below while + * it's evaluating the if-statement above. In that case, the + * value put into dbell could be stale if the core accepts the + * speculation. To prevent that, we need a read memory barrier + * here as well. + */ + smp_rmb(); + + /* Copy the data to a temporary local buffer, because + * we can't call copy_to_user() from inside a spinlock + */ + dbell = dbq->q[dbq->head]; + dbq->head = nextp(dbq->head); + + spin_unlock_irqrestore(&dbq->lock, flags); + + if (put_user(dbell, p)) + return -EFAULT; + p++; + count += sizeof(uint32_t); + len -= sizeof(uint32_t); + } + + return count; +} + +/* + * Open the driver and prepare for reading doorbells. + * + * Every time an application opens the driver, we create a doorbell queue + * for that file handle. This queue is used for any incoming doorbells. + */ +static int fsl_hv_open(struct inode *inode, struct file *filp) +{ + struct doorbell_queue *dbq; + unsigned long flags; + + dbq = kzalloc(sizeof(struct doorbell_queue), GFP_KERNEL); + if (!dbq) { + pr_err("fsl-hv: out of memory\n"); + return -ENOMEM; + } + + spin_lock_init(&dbq->lock); + init_waitqueue_head(&dbq->wait); + + spin_lock_irqsave(&db_list_lock, flags); + list_add(&dbq->list, &db_list); + spin_unlock_irqrestore(&db_list_lock, flags); + + filp->private_data = dbq; + + return 0; +} + +/* + * Close the driver + */ +static int fsl_hv_close(struct inode *inode, struct file *filp) +{ + struct doorbell_queue *dbq = filp->private_data; + unsigned long flags; + + spin_lock_irqsave(&db_list_lock, flags); + list_del(&dbq->list); + spin_unlock_irqrestore(&db_list_lock, flags); + + kfree(dbq); + + return 0; +} + +static const struct file_operations fsl_hv_fops = { + .owner = THIS_MODULE, + .open = fsl_hv_open, + .release = fsl_hv_close, + .poll = fsl_hv_poll, + .read = fsl_hv_read, + .unlocked_ioctl = fsl_hv_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static struct miscdevice fsl_hv_misc_dev = { + MISC_DYNAMIC_MINOR, + "fsl-hv", + &fsl_hv_fops +}; + +static irqreturn_t fsl_hv_shutdown_isr(int irq, void *data) +{ + orderly_poweroff(false); + + return IRQ_HANDLED; +} + +/* + * Returns the handle of the parent of the given node + * + * The handle is the value of the 'hv-handle' property + */ +static int get_parent_handle(struct device_node *np) +{ + struct device_node *parent; + const uint32_t *prop; + uint32_t handle; + int len; + + parent = of_get_parent(np); + if (!parent) + /* It's not really possible for this to fail */ + return -ENODEV; + + /* + * The proper name for the handle property is "hv-handle", but some + * older versions of the hypervisor used "reg". + */ + prop = of_get_property(parent, "hv-handle", &len); + if (!prop) + prop = of_get_property(parent, "reg", &len); + + if (!prop || (len != sizeof(uint32_t))) { + /* This can happen only if the node is malformed */ + of_node_put(parent); + return -ENODEV; + } + + handle = be32_to_cpup(prop); + of_node_put(parent); + + return handle; +} + +/* + * Register a callback for failover events + * + * This function is called by device drivers to register their callback + * functions for fail-over events. + */ +int fsl_hv_failover_register(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&failover_subscribers, nb); +} +EXPORT_SYMBOL(fsl_hv_failover_register); + +/* + * Unregister a callback for failover events + */ +int fsl_hv_failover_unregister(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&failover_subscribers, nb); +} +EXPORT_SYMBOL(fsl_hv_failover_unregister); + +/* + * Return TRUE if we're running under FSL hypervisor + * + * This function checks to see if we're running under the Freescale + * hypervisor, and returns zero if we're not, or non-zero if we are. + * + * First, it checks if MSR[GS]==1, which means we're running under some + * hypervisor. Then it checks if there is a hypervisor node in the device + * tree. Currently, that means there needs to be a node in the root called + * "hypervisor" and which has a property named "fsl,hv-version". + */ +static int has_fsl_hypervisor(void) +{ + struct device_node *node; + int ret; + + node = of_find_node_by_path("/hypervisor"); + if (!node) + return 0; + + ret = of_find_property(node, "fsl,hv-version", NULL) != NULL; + + of_node_put(node); + + return ret; +} + +/* + * Freescale hypervisor management driver init + * + * This function is called when this module is loaded. + * + * Register ourselves as a miscellaneous driver. This will register the + * fops structure and create the right sysfs entries for udev. + */ +static int __init fsl_hypervisor_init(void) +{ + struct device_node *np; + struct doorbell_isr *dbisr, *n; + int ret; + + pr_info("Freescale hypervisor management driver\n"); + + if (!has_fsl_hypervisor()) { + pr_info("fsl-hv: no hypervisor found\n"); + return -ENODEV; + } + + ret = misc_register(&fsl_hv_misc_dev); + if (ret) { + pr_err("fsl-hv: cannot register device\n"); + return ret; + } + + INIT_LIST_HEAD(&db_list); + INIT_LIST_HEAD(&isr_list); + + for_each_compatible_node(np, NULL, "epapr,hv-receive-doorbell") { + unsigned int irq; + const uint32_t *handle; + + handle = of_get_property(np, "interrupts", NULL); + irq = irq_of_parse_and_map(np, 0); + if (!handle || (irq == NO_IRQ)) { + pr_err("fsl-hv: no 'interrupts' property in %pOF node\n", + np); + continue; + } + + dbisr = kzalloc(sizeof(*dbisr), GFP_KERNEL); + if (!dbisr) + goto out_of_memory; + + dbisr->irq = irq; + dbisr->doorbell = be32_to_cpup(handle); + + if (of_device_is_compatible(np, "fsl,hv-shutdown-doorbell")) { + /* The shutdown doorbell gets its own ISR */ + ret = request_irq(irq, fsl_hv_shutdown_isr, 0, + np->name, NULL); + } else if (of_device_is_compatible(np, + "fsl,hv-state-change-doorbell")) { + /* + * The state change doorbell triggers a notification if + * the state of the managed partition changes to + * "stopped". We need a separate interrupt handler for + * that, and we also need to know the handle of the + * target partition, not just the handle of the + * doorbell. + */ + dbisr->partition = ret = get_parent_handle(np); + if (ret < 0) { + pr_err("fsl-hv: node %pOF has missing or " + "malformed parent\n", np); + kfree(dbisr); + continue; + } + ret = request_threaded_irq(irq, fsl_hv_state_change_isr, + fsl_hv_state_change_thread, + 0, np->name, dbisr); + } else + ret = request_irq(irq, fsl_hv_isr, 0, np->name, dbisr); + + if (ret < 0) { + pr_err("fsl-hv: could not request irq %u for node %pOF\n", + irq, np); + kfree(dbisr); + continue; + } + + list_add(&dbisr->list, &isr_list); + + pr_info("fsl-hv: registered handler for doorbell %u\n", + dbisr->doorbell); + } + + return 0; + +out_of_memory: + list_for_each_entry_safe(dbisr, n, &isr_list, list) { + free_irq(dbisr->irq, dbisr); + list_del(&dbisr->list); + kfree(dbisr); + } + + misc_deregister(&fsl_hv_misc_dev); + + return -ENOMEM; +} + +/* + * Freescale hypervisor management driver termination + * + * This function is called when this driver is unloaded. + */ +static void __exit fsl_hypervisor_exit(void) +{ + struct doorbell_isr *dbisr, *n; + + list_for_each_entry_safe(dbisr, n, &isr_list, list) { + free_irq(dbisr->irq, dbisr); + list_del(&dbisr->list); + kfree(dbisr); + } + + misc_deregister(&fsl_hv_misc_dev); +} + +module_init(fsl_hypervisor_init); +module_exit(fsl_hypervisor_exit); + +MODULE_AUTHOR("Timur Tabi <timur@freescale.com>"); +MODULE_DESCRIPTION("Freescale hypervisor management driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/virt/nitro_enclaves/Kconfig b/drivers/virt/nitro_enclaves/Kconfig new file mode 100644 index 000000000..dc4d25c26 --- /dev/null +++ b/drivers/virt/nitro_enclaves/Kconfig @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +# Amazon Nitro Enclaves (NE) support. +# Nitro is a hypervisor that has been developed by Amazon. + +config NITRO_ENCLAVES + tristate "Nitro Enclaves Support" + depends on (ARM64 || X86) && HOTPLUG_CPU && PCI && SMP + help + This driver consists of support for enclave lifetime management + for Nitro Enclaves (NE). + + To compile this driver as a module, choose M here. + The module will be called nitro_enclaves. + +config NITRO_ENCLAVES_MISC_DEV_TEST + bool "Tests for the misc device functionality of the Nitro Enclaves" if !KUNIT_ALL_TESTS + depends on NITRO_ENCLAVES && KUNIT=y + default KUNIT_ALL_TESTS + help + Enable KUnit tests for the misc device functionality of the Nitro + Enclaves. Select this option only if you will boot the kernel for + the purpose of running unit tests (e.g. under UML or qemu). If + unsure, say N. diff --git a/drivers/virt/nitro_enclaves/Makefile b/drivers/virt/nitro_enclaves/Makefile new file mode 100644 index 000000000..da61260f2 --- /dev/null +++ b/drivers/virt/nitro_enclaves/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +# Enclave lifetime management support for Nitro Enclaves (NE). + +obj-$(CONFIG_NITRO_ENCLAVES) += nitro_enclaves.o + +nitro_enclaves-y := ne_pci_dev.o ne_misc_dev.o diff --git a/drivers/virt/nitro_enclaves/ne_misc_dev.c b/drivers/virt/nitro_enclaves/ne_misc_dev.c new file mode 100644 index 000000000..241b94f62 --- /dev/null +++ b/drivers/virt/nitro_enclaves/ne_misc_dev.c @@ -0,0 +1,1783 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. + */ + +/** + * DOC: Enclave lifetime management driver for Nitro Enclaves (NE). + * Nitro is a hypervisor that has been developed by Amazon. + */ + +#include <linux/anon_inodes.h> +#include <linux/capability.h> +#include <linux/cpu.h> +#include <linux/device.h> +#include <linux/file.h> +#include <linux/hugetlb.h> +#include <linux/limits.h> +#include <linux/list.h> +#include <linux/miscdevice.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/nitro_enclaves.h> +#include <linux/pci.h> +#include <linux/poll.h> +#include <linux/range.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <uapi/linux/vm_sockets.h> + +#include "ne_misc_dev.h" +#include "ne_pci_dev.h" + +/** + * NE_CPUS_SIZE - Size for max 128 CPUs, for now, in a cpu-list string, comma + * separated. The NE CPU pool includes CPUs from a single NUMA + * node. + */ +#define NE_CPUS_SIZE (512) + +/** + * NE_EIF_LOAD_OFFSET - The offset where to copy the Enclave Image Format (EIF) + * image in enclave memory. + */ +#define NE_EIF_LOAD_OFFSET (8 * 1024UL * 1024UL) + +/** + * NE_MIN_ENCLAVE_MEM_SIZE - The minimum memory size an enclave can be launched + * with. + */ +#define NE_MIN_ENCLAVE_MEM_SIZE (64 * 1024UL * 1024UL) + +/** + * NE_MIN_MEM_REGION_SIZE - The minimum size of an enclave memory region. + */ +#define NE_MIN_MEM_REGION_SIZE (2 * 1024UL * 1024UL) + +/** + * NE_PARENT_VM_CID - The CID for the vsock device of the primary / parent VM. + */ +#define NE_PARENT_VM_CID (3) + +static long ne_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + +static const struct file_operations ne_fops = { + .owner = THIS_MODULE, + .llseek = noop_llseek, + .unlocked_ioctl = ne_ioctl, +}; + +static struct miscdevice ne_misc_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "nitro_enclaves", + .fops = &ne_fops, + .mode = 0660, +}; + +struct ne_devs ne_devs = { + .ne_misc_dev = &ne_misc_dev, +}; + +/* + * TODO: Update logic to create new sysfs entries instead of using + * a kernel parameter e.g. if multiple sysfs files needed. + */ +static int ne_set_kernel_param(const char *val, const struct kernel_param *kp); + +static const struct kernel_param_ops ne_cpu_pool_ops = { + .get = param_get_string, + .set = ne_set_kernel_param, +}; + +static char ne_cpus[NE_CPUS_SIZE]; +static struct kparam_string ne_cpus_arg = { + .maxlen = sizeof(ne_cpus), + .string = ne_cpus, +}; + +module_param_cb(ne_cpus, &ne_cpu_pool_ops, &ne_cpus_arg, 0644); +/* https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html#cpu-lists */ +MODULE_PARM_DESC(ne_cpus, "<cpu-list> - CPU pool used for Nitro Enclaves"); + +/** + * struct ne_cpu_pool - CPU pool used for Nitro Enclaves. + * @avail_threads_per_core: Available full CPU cores to be dedicated to + * enclave(s). The cpumasks from the array, indexed + * by core id, contain all the threads from the + * available cores, that are not set for created + * enclave(s). The full CPU cores are part of the + * NE CPU pool. + * @mutex: Mutex for the access to the NE CPU pool. + * @nr_parent_vm_cores : The size of the available threads per core array. + * The total number of CPU cores available on the + * primary / parent VM. + * @nr_threads_per_core: The number of threads that a full CPU core has. + * @numa_node: NUMA node of the CPUs in the pool. + */ +struct ne_cpu_pool { + cpumask_var_t *avail_threads_per_core; + struct mutex mutex; + unsigned int nr_parent_vm_cores; + unsigned int nr_threads_per_core; + int numa_node; +}; + +static struct ne_cpu_pool ne_cpu_pool; + +/** + * struct ne_phys_contig_mem_regions - Contiguous physical memory regions. + * @num: The number of regions that currently has. + * @regions: The array of physical memory regions. + */ +struct ne_phys_contig_mem_regions { + unsigned long num; + struct range *regions; +}; + +/** + * ne_check_enclaves_created() - Verify if at least one enclave has been created. + * @void: No parameters provided. + * + * Context: Process context. + * Return: + * * True if at least one enclave is created. + * * False otherwise. + */ +static bool ne_check_enclaves_created(void) +{ + struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev; + bool ret = false; + + if (!ne_pci_dev) + return ret; + + mutex_lock(&ne_pci_dev->enclaves_list_mutex); + + if (!list_empty(&ne_pci_dev->enclaves_list)) + ret = true; + + mutex_unlock(&ne_pci_dev->enclaves_list_mutex); + + return ret; +} + +/** + * ne_setup_cpu_pool() - Set the NE CPU pool after handling sanity checks such + * as not sharing CPU cores with the primary / parent VM + * or not using CPU 0, which should remain available for + * the primary / parent VM. Offline the CPUs from the + * pool after the checks passed. + * @ne_cpu_list: The CPU list used for setting NE CPU pool. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_setup_cpu_pool(const char *ne_cpu_list) +{ + int core_id = -1; + unsigned int cpu = 0; + cpumask_var_t cpu_pool; + unsigned int cpu_sibling = 0; + unsigned int i = 0; + int numa_node = -1; + int rc = -EINVAL; + + if (!zalloc_cpumask_var(&cpu_pool, GFP_KERNEL)) + return -ENOMEM; + + mutex_lock(&ne_cpu_pool.mutex); + + rc = cpulist_parse(ne_cpu_list, cpu_pool); + if (rc < 0) { + pr_err("%s: Error in cpulist parse [rc=%d]\n", ne_misc_dev.name, rc); + + goto free_pool_cpumask; + } + + cpu = cpumask_any(cpu_pool); + if (cpu >= nr_cpu_ids) { + pr_err("%s: No CPUs available in CPU pool\n", ne_misc_dev.name); + + rc = -EINVAL; + + goto free_pool_cpumask; + } + + /* + * Check if the CPUs are online, to further get info about them + * e.g. numa node, core id, siblings. + */ + for_each_cpu(cpu, cpu_pool) + if (cpu_is_offline(cpu)) { + pr_err("%s: CPU %d is offline, has to be online to get its metadata\n", + ne_misc_dev.name, cpu); + + rc = -EINVAL; + + goto free_pool_cpumask; + } + + /* + * Check if the CPUs from the NE CPU pool are from the same NUMA node. + */ + for_each_cpu(cpu, cpu_pool) + if (numa_node < 0) { + numa_node = cpu_to_node(cpu); + if (numa_node < 0) { + pr_err("%s: Invalid NUMA node %d\n", + ne_misc_dev.name, numa_node); + + rc = -EINVAL; + + goto free_pool_cpumask; + } + } else { + if (numa_node != cpu_to_node(cpu)) { + pr_err("%s: CPUs with different NUMA nodes\n", + ne_misc_dev.name); + + rc = -EINVAL; + + goto free_pool_cpumask; + } + } + + /* + * Check if CPU 0 and its siblings are included in the provided CPU pool + * They should remain available for the primary / parent VM. + */ + if (cpumask_test_cpu(0, cpu_pool)) { + pr_err("%s: CPU 0 has to remain available\n", ne_misc_dev.name); + + rc = -EINVAL; + + goto free_pool_cpumask; + } + + for_each_cpu(cpu_sibling, topology_sibling_cpumask(0)) { + if (cpumask_test_cpu(cpu_sibling, cpu_pool)) { + pr_err("%s: CPU sibling %d for CPU 0 is in CPU pool\n", + ne_misc_dev.name, cpu_sibling); + + rc = -EINVAL; + + goto free_pool_cpumask; + } + } + + /* + * Check if CPU siblings are included in the provided CPU pool. The + * expectation is that full CPU cores are made available in the CPU pool + * for enclaves. + */ + for_each_cpu(cpu, cpu_pool) { + for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu)) { + if (!cpumask_test_cpu(cpu_sibling, cpu_pool)) { + pr_err("%s: CPU %d is not in CPU pool\n", + ne_misc_dev.name, cpu_sibling); + + rc = -EINVAL; + + goto free_pool_cpumask; + } + } + } + + /* Calculate the number of threads from a full CPU core. */ + cpu = cpumask_any(cpu_pool); + for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu)) + ne_cpu_pool.nr_threads_per_core++; + + ne_cpu_pool.nr_parent_vm_cores = nr_cpu_ids / ne_cpu_pool.nr_threads_per_core; + + ne_cpu_pool.avail_threads_per_core = kcalloc(ne_cpu_pool.nr_parent_vm_cores, + sizeof(*ne_cpu_pool.avail_threads_per_core), + GFP_KERNEL); + if (!ne_cpu_pool.avail_threads_per_core) { + rc = -ENOMEM; + + goto free_pool_cpumask; + } + + for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++) + if (!zalloc_cpumask_var(&ne_cpu_pool.avail_threads_per_core[i], GFP_KERNEL)) { + rc = -ENOMEM; + + goto free_cores_cpumask; + } + + /* + * Split the NE CPU pool in threads per core to keep the CPU topology + * after offlining the CPUs. + */ + for_each_cpu(cpu, cpu_pool) { + core_id = topology_core_id(cpu); + if (core_id < 0 || core_id >= ne_cpu_pool.nr_parent_vm_cores) { + pr_err("%s: Invalid core id %d for CPU %d\n", + ne_misc_dev.name, core_id, cpu); + + rc = -EINVAL; + + goto clear_cpumask; + } + + cpumask_set_cpu(cpu, ne_cpu_pool.avail_threads_per_core[core_id]); + } + + /* + * CPUs that are given to enclave(s) should not be considered online + * by Linux anymore, as the hypervisor will degrade them to floating. + * The physical CPUs (full cores) are carved out of the primary / parent + * VM and given to the enclave VM. The same number of vCPUs would run + * on less pCPUs for the primary / parent VM. + * + * We offline them here, to not degrade performance and expose correct + * topology to Linux and user space. + */ + for_each_cpu(cpu, cpu_pool) { + rc = remove_cpu(cpu); + if (rc != 0) { + pr_err("%s: CPU %d is not offlined [rc=%d]\n", + ne_misc_dev.name, cpu, rc); + + goto online_cpus; + } + } + + free_cpumask_var(cpu_pool); + + ne_cpu_pool.numa_node = numa_node; + + mutex_unlock(&ne_cpu_pool.mutex); + + return 0; + +online_cpus: + for_each_cpu(cpu, cpu_pool) + add_cpu(cpu); +clear_cpumask: + for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++) + cpumask_clear(ne_cpu_pool.avail_threads_per_core[i]); +free_cores_cpumask: + for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++) + free_cpumask_var(ne_cpu_pool.avail_threads_per_core[i]); + kfree(ne_cpu_pool.avail_threads_per_core); +free_pool_cpumask: + free_cpumask_var(cpu_pool); + ne_cpu_pool.nr_parent_vm_cores = 0; + ne_cpu_pool.nr_threads_per_core = 0; + ne_cpu_pool.numa_node = -1; + mutex_unlock(&ne_cpu_pool.mutex); + + return rc; +} + +/** + * ne_teardown_cpu_pool() - Online the CPUs from the NE CPU pool and cleanup the + * CPU pool. + * @void: No parameters provided. + * + * Context: Process context. + */ +static void ne_teardown_cpu_pool(void) +{ + unsigned int cpu = 0; + unsigned int i = 0; + int rc = -EINVAL; + + mutex_lock(&ne_cpu_pool.mutex); + + if (!ne_cpu_pool.nr_parent_vm_cores) { + mutex_unlock(&ne_cpu_pool.mutex); + + return; + } + + for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++) { + for_each_cpu(cpu, ne_cpu_pool.avail_threads_per_core[i]) { + rc = add_cpu(cpu); + if (rc != 0) + pr_err("%s: CPU %d is not onlined [rc=%d]\n", + ne_misc_dev.name, cpu, rc); + } + + cpumask_clear(ne_cpu_pool.avail_threads_per_core[i]); + + free_cpumask_var(ne_cpu_pool.avail_threads_per_core[i]); + } + + kfree(ne_cpu_pool.avail_threads_per_core); + ne_cpu_pool.nr_parent_vm_cores = 0; + ne_cpu_pool.nr_threads_per_core = 0; + ne_cpu_pool.numa_node = -1; + + mutex_unlock(&ne_cpu_pool.mutex); +} + +/** + * ne_set_kernel_param() - Set the NE CPU pool value via the NE kernel parameter. + * @val: NE CPU pool string value. + * @kp : NE kernel parameter associated with the NE CPU pool. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_set_kernel_param(const char *val, const struct kernel_param *kp) +{ + char error_val[] = ""; + int rc = -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (ne_check_enclaves_created()) { + pr_err("%s: The CPU pool is used by enclave(s)\n", ne_misc_dev.name); + + return -EPERM; + } + + ne_teardown_cpu_pool(); + + rc = ne_setup_cpu_pool(val); + if (rc < 0) { + pr_err("%s: Error in setup CPU pool [rc=%d]\n", ne_misc_dev.name, rc); + + param_set_copystring(error_val, kp); + + return rc; + } + + rc = param_set_copystring(val, kp); + if (rc < 0) { + pr_err("%s: Error in param set copystring [rc=%d]\n", ne_misc_dev.name, rc); + + ne_teardown_cpu_pool(); + + param_set_copystring(error_val, kp); + + return rc; + } + + return 0; +} + +/** + * ne_donated_cpu() - Check if the provided CPU is already used by the enclave. + * @ne_enclave : Private data associated with the current enclave. + * @cpu: CPU to check if already used. + * + * Context: Process context. This function is called with the ne_enclave mutex held. + * Return: + * * True if the provided CPU is already used by the enclave. + * * False otherwise. + */ +static bool ne_donated_cpu(struct ne_enclave *ne_enclave, unsigned int cpu) +{ + if (cpumask_test_cpu(cpu, ne_enclave->vcpu_ids)) + return true; + + return false; +} + +/** + * ne_get_unused_core_from_cpu_pool() - Get the id of a full core from the + * NE CPU pool. + * @void: No parameters provided. + * + * Context: Process context. This function is called with the ne_enclave and + * ne_cpu_pool mutexes held. + * Return: + * * Core id. + * * -1 if no CPU core available in the pool. + */ +static int ne_get_unused_core_from_cpu_pool(void) +{ + int core_id = -1; + unsigned int i = 0; + + for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++) + if (!cpumask_empty(ne_cpu_pool.avail_threads_per_core[i])) { + core_id = i; + + break; + } + + return core_id; +} + +/** + * ne_set_enclave_threads_per_core() - Set the threads of the provided core in + * the enclave data structure. + * @ne_enclave : Private data associated with the current enclave. + * @core_id: Core id to get its threads from the NE CPU pool. + * @vcpu_id: vCPU id part of the provided core. + * + * Context: Process context. This function is called with the ne_enclave and + * ne_cpu_pool mutexes held. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_set_enclave_threads_per_core(struct ne_enclave *ne_enclave, + int core_id, u32 vcpu_id) +{ + unsigned int cpu = 0; + + if (core_id < 0 && vcpu_id == 0) { + dev_err_ratelimited(ne_misc_dev.this_device, + "No CPUs available in NE CPU pool\n"); + + return -NE_ERR_NO_CPUS_AVAIL_IN_POOL; + } + + if (core_id < 0) { + dev_err_ratelimited(ne_misc_dev.this_device, + "CPU %d is not in NE CPU pool\n", vcpu_id); + + return -NE_ERR_VCPU_NOT_IN_CPU_POOL; + } + + if (core_id >= ne_enclave->nr_parent_vm_cores) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Invalid core id %d - ne_enclave\n", core_id); + + return -NE_ERR_VCPU_INVALID_CPU_CORE; + } + + for_each_cpu(cpu, ne_cpu_pool.avail_threads_per_core[core_id]) + cpumask_set_cpu(cpu, ne_enclave->threads_per_core[core_id]); + + cpumask_clear(ne_cpu_pool.avail_threads_per_core[core_id]); + + return 0; +} + +/** + * ne_get_cpu_from_cpu_pool() - Get a CPU from the NE CPU pool, either from the + * remaining sibling(s) of a CPU core or the first + * sibling of a new CPU core. + * @ne_enclave : Private data associated with the current enclave. + * @vcpu_id: vCPU to get from the NE CPU pool. + * + * Context: Process context. This function is called with the ne_enclave mutex held. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_get_cpu_from_cpu_pool(struct ne_enclave *ne_enclave, u32 *vcpu_id) +{ + int core_id = -1; + unsigned int cpu = 0; + unsigned int i = 0; + int rc = -EINVAL; + + /* + * If previously allocated a thread of a core to this enclave, first + * check remaining sibling(s) for new CPU allocations, so that full + * CPU cores are used for the enclave. + */ + for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++) + for_each_cpu(cpu, ne_enclave->threads_per_core[i]) + if (!ne_donated_cpu(ne_enclave, cpu)) { + *vcpu_id = cpu; + + return 0; + } + + mutex_lock(&ne_cpu_pool.mutex); + + /* + * If no remaining siblings, get a core from the NE CPU pool and keep + * track of all the threads in the enclave threads per core data structure. + */ + core_id = ne_get_unused_core_from_cpu_pool(); + + rc = ne_set_enclave_threads_per_core(ne_enclave, core_id, *vcpu_id); + if (rc < 0) + goto unlock_mutex; + + *vcpu_id = cpumask_any(ne_enclave->threads_per_core[core_id]); + + rc = 0; + +unlock_mutex: + mutex_unlock(&ne_cpu_pool.mutex); + + return rc; +} + +/** + * ne_get_vcpu_core_from_cpu_pool() - Get from the NE CPU pool the id of the + * core associated with the provided vCPU. + * @vcpu_id: Provided vCPU id to get its associated core id. + * + * Context: Process context. This function is called with the ne_enclave and + * ne_cpu_pool mutexes held. + * Return: + * * Core id. + * * -1 if the provided vCPU is not in the pool. + */ +static int ne_get_vcpu_core_from_cpu_pool(u32 vcpu_id) +{ + int core_id = -1; + unsigned int i = 0; + + for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++) + if (cpumask_test_cpu(vcpu_id, ne_cpu_pool.avail_threads_per_core[i])) { + core_id = i; + + break; + } + + return core_id; +} + +/** + * ne_check_cpu_in_cpu_pool() - Check if the given vCPU is in the available CPUs + * from the pool. + * @ne_enclave : Private data associated with the current enclave. + * @vcpu_id: ID of the vCPU to check if available in the NE CPU pool. + * + * Context: Process context. This function is called with the ne_enclave mutex held. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_check_cpu_in_cpu_pool(struct ne_enclave *ne_enclave, u32 vcpu_id) +{ + int core_id = -1; + unsigned int i = 0; + int rc = -EINVAL; + + if (ne_donated_cpu(ne_enclave, vcpu_id)) { + dev_err_ratelimited(ne_misc_dev.this_device, + "CPU %d already used\n", vcpu_id); + + return -NE_ERR_VCPU_ALREADY_USED; + } + + /* + * If previously allocated a thread of a core to this enclave, but not + * the full core, first check remaining sibling(s). + */ + for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++) + if (cpumask_test_cpu(vcpu_id, ne_enclave->threads_per_core[i])) + return 0; + + mutex_lock(&ne_cpu_pool.mutex); + + /* + * If no remaining siblings, get from the NE CPU pool the core + * associated with the vCPU and keep track of all the threads in the + * enclave threads per core data structure. + */ + core_id = ne_get_vcpu_core_from_cpu_pool(vcpu_id); + + rc = ne_set_enclave_threads_per_core(ne_enclave, core_id, vcpu_id); + if (rc < 0) + goto unlock_mutex; + + rc = 0; + +unlock_mutex: + mutex_unlock(&ne_cpu_pool.mutex); + + return rc; +} + +/** + * ne_add_vcpu_ioctl() - Add a vCPU to the slot associated with the current + * enclave. + * @ne_enclave : Private data associated with the current enclave. + * @vcpu_id: ID of the CPU to be associated with the given slot, + * apic id on x86. + * + * Context: Process context. This function is called with the ne_enclave mutex held. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_add_vcpu_ioctl(struct ne_enclave *ne_enclave, u32 vcpu_id) +{ + struct ne_pci_dev_cmd_reply cmd_reply = {}; + struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev; + int rc = -EINVAL; + struct slot_add_vcpu_req slot_add_vcpu_req = {}; + + if (ne_enclave->mm != current->mm) + return -EIO; + + slot_add_vcpu_req.slot_uid = ne_enclave->slot_uid; + slot_add_vcpu_req.vcpu_id = vcpu_id; + + rc = ne_do_request(pdev, SLOT_ADD_VCPU, + &slot_add_vcpu_req, sizeof(slot_add_vcpu_req), + &cmd_reply, sizeof(cmd_reply)); + if (rc < 0) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Error in slot add vCPU [rc=%d]\n", rc); + + return rc; + } + + cpumask_set_cpu(vcpu_id, ne_enclave->vcpu_ids); + + ne_enclave->nr_vcpus++; + + return 0; +} + +/** + * ne_sanity_check_user_mem_region() - Sanity check the user space memory + * region received during the set user + * memory region ioctl call. + * @ne_enclave : Private data associated with the current enclave. + * @mem_region : User space memory region to be sanity checked. + * + * Context: Process context. This function is called with the ne_enclave mutex held. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_sanity_check_user_mem_region(struct ne_enclave *ne_enclave, + struct ne_user_memory_region mem_region) +{ + struct ne_mem_region *ne_mem_region = NULL; + + if (ne_enclave->mm != current->mm) + return -EIO; + + if (mem_region.memory_size & (NE_MIN_MEM_REGION_SIZE - 1)) { + dev_err_ratelimited(ne_misc_dev.this_device, + "User space memory size is not multiple of 2 MiB\n"); + + return -NE_ERR_INVALID_MEM_REGION_SIZE; + } + + if (!IS_ALIGNED(mem_region.userspace_addr, NE_MIN_MEM_REGION_SIZE)) { + dev_err_ratelimited(ne_misc_dev.this_device, + "User space address is not 2 MiB aligned\n"); + + return -NE_ERR_UNALIGNED_MEM_REGION_ADDR; + } + + if ((mem_region.userspace_addr & (NE_MIN_MEM_REGION_SIZE - 1)) || + !access_ok((void __user *)(unsigned long)mem_region.userspace_addr, + mem_region.memory_size)) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Invalid user space address range\n"); + + return -NE_ERR_INVALID_MEM_REGION_ADDR; + } + + list_for_each_entry(ne_mem_region, &ne_enclave->mem_regions_list, + mem_region_list_entry) { + u64 memory_size = ne_mem_region->memory_size; + u64 userspace_addr = ne_mem_region->userspace_addr; + + if ((userspace_addr <= mem_region.userspace_addr && + mem_region.userspace_addr < (userspace_addr + memory_size)) || + (mem_region.userspace_addr <= userspace_addr && + (mem_region.userspace_addr + mem_region.memory_size) > userspace_addr)) { + dev_err_ratelimited(ne_misc_dev.this_device, + "User space memory region already used\n"); + + return -NE_ERR_MEM_REGION_ALREADY_USED; + } + } + + return 0; +} + +/** + * ne_sanity_check_user_mem_region_page() - Sanity check a page from the user space + * memory region received during the set + * user memory region ioctl call. + * @ne_enclave : Private data associated with the current enclave. + * @mem_region_page: Page from the user space memory region to be sanity checked. + * + * Context: Process context. This function is called with the ne_enclave mutex held. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_sanity_check_user_mem_region_page(struct ne_enclave *ne_enclave, + struct page *mem_region_page) +{ + if (!PageHuge(mem_region_page)) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Not a hugetlbfs page\n"); + + return -NE_ERR_MEM_NOT_HUGE_PAGE; + } + + if (page_size(mem_region_page) & (NE_MIN_MEM_REGION_SIZE - 1)) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Page size not multiple of 2 MiB\n"); + + return -NE_ERR_INVALID_PAGE_SIZE; + } + + if (ne_enclave->numa_node != page_to_nid(mem_region_page)) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Page is not from NUMA node %d\n", + ne_enclave->numa_node); + + return -NE_ERR_MEM_DIFFERENT_NUMA_NODE; + } + + return 0; +} + +/** + * ne_sanity_check_phys_mem_region() - Sanity check the start address and the size + * of a physical memory region. + * @phys_mem_region_paddr : Physical start address of the region to be sanity checked. + * @phys_mem_region_size : Length of the region to be sanity checked. + * + * Context: Process context. This function is called with the ne_enclave mutex held. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_sanity_check_phys_mem_region(u64 phys_mem_region_paddr, + u64 phys_mem_region_size) +{ + if (phys_mem_region_size & (NE_MIN_MEM_REGION_SIZE - 1)) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Physical mem region size is not multiple of 2 MiB\n"); + + return -EINVAL; + } + + if (!IS_ALIGNED(phys_mem_region_paddr, NE_MIN_MEM_REGION_SIZE)) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Physical mem region address is not 2 MiB aligned\n"); + + return -EINVAL; + } + + return 0; +} + +/** + * ne_merge_phys_contig_memory_regions() - Add a memory region and merge the adjacent + * regions if they are physically contiguous. + * @phys_contig_regions : Private data associated with the contiguous physical memory regions. + * @page_paddr : Physical start address of the region to be added. + * @page_size : Length of the region to be added. + * + * Context: Process context. This function is called with the ne_enclave mutex held. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int +ne_merge_phys_contig_memory_regions(struct ne_phys_contig_mem_regions *phys_contig_regions, + u64 page_paddr, u64 page_size) +{ + unsigned long num = phys_contig_regions->num; + int rc = 0; + + rc = ne_sanity_check_phys_mem_region(page_paddr, page_size); + if (rc < 0) + return rc; + + /* Physically contiguous, just merge */ + if (num && (phys_contig_regions->regions[num - 1].end + 1) == page_paddr) { + phys_contig_regions->regions[num - 1].end += page_size; + } else { + phys_contig_regions->regions[num].start = page_paddr; + phys_contig_regions->regions[num].end = page_paddr + page_size - 1; + phys_contig_regions->num++; + } + + return 0; +} + +/** + * ne_set_user_memory_region_ioctl() - Add user space memory region to the slot + * associated with the current enclave. + * @ne_enclave : Private data associated with the current enclave. + * @mem_region : User space memory region to be associated with the given slot. + * + * Context: Process context. This function is called with the ne_enclave mutex held. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_set_user_memory_region_ioctl(struct ne_enclave *ne_enclave, + struct ne_user_memory_region mem_region) +{ + long gup_rc = 0; + unsigned long i = 0; + unsigned long max_nr_pages = 0; + unsigned long memory_size = 0; + struct ne_mem_region *ne_mem_region = NULL; + struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev; + struct ne_phys_contig_mem_regions phys_contig_mem_regions = {}; + int rc = -EINVAL; + + rc = ne_sanity_check_user_mem_region(ne_enclave, mem_region); + if (rc < 0) + return rc; + + ne_mem_region = kzalloc(sizeof(*ne_mem_region), GFP_KERNEL); + if (!ne_mem_region) + return -ENOMEM; + + max_nr_pages = mem_region.memory_size / NE_MIN_MEM_REGION_SIZE; + + ne_mem_region->pages = kcalloc(max_nr_pages, sizeof(*ne_mem_region->pages), + GFP_KERNEL); + if (!ne_mem_region->pages) { + rc = -ENOMEM; + + goto free_mem_region; + } + + phys_contig_mem_regions.regions = kcalloc(max_nr_pages, + sizeof(*phys_contig_mem_regions.regions), + GFP_KERNEL); + if (!phys_contig_mem_regions.regions) { + rc = -ENOMEM; + + goto free_mem_region; + } + + do { + i = ne_mem_region->nr_pages; + + if (i == max_nr_pages) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Reached max nr of pages in the pages data struct\n"); + + rc = -ENOMEM; + + goto put_pages; + } + + gup_rc = get_user_pages_unlocked(mem_region.userspace_addr + memory_size, 1, + ne_mem_region->pages + i, FOLL_GET); + + if (gup_rc < 0) { + rc = gup_rc; + + dev_err_ratelimited(ne_misc_dev.this_device, + "Error in get user pages [rc=%d]\n", rc); + + goto put_pages; + } + + rc = ne_sanity_check_user_mem_region_page(ne_enclave, ne_mem_region->pages[i]); + if (rc < 0) + goto put_pages; + + rc = ne_merge_phys_contig_memory_regions(&phys_contig_mem_regions, + page_to_phys(ne_mem_region->pages[i]), + page_size(ne_mem_region->pages[i])); + if (rc < 0) + goto put_pages; + + memory_size += page_size(ne_mem_region->pages[i]); + + ne_mem_region->nr_pages++; + } while (memory_size < mem_region.memory_size); + + if ((ne_enclave->nr_mem_regions + phys_contig_mem_regions.num) > + ne_enclave->max_mem_regions) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Reached max memory regions %lld\n", + ne_enclave->max_mem_regions); + + rc = -NE_ERR_MEM_MAX_REGIONS; + + goto put_pages; + } + + for (i = 0; i < phys_contig_mem_regions.num; i++) { + u64 phys_region_addr = phys_contig_mem_regions.regions[i].start; + u64 phys_region_size = range_len(&phys_contig_mem_regions.regions[i]); + + rc = ne_sanity_check_phys_mem_region(phys_region_addr, phys_region_size); + if (rc < 0) + goto put_pages; + } + + ne_mem_region->memory_size = mem_region.memory_size; + ne_mem_region->userspace_addr = mem_region.userspace_addr; + + list_add(&ne_mem_region->mem_region_list_entry, &ne_enclave->mem_regions_list); + + for (i = 0; i < phys_contig_mem_regions.num; i++) { + struct ne_pci_dev_cmd_reply cmd_reply = {}; + struct slot_add_mem_req slot_add_mem_req = {}; + + slot_add_mem_req.slot_uid = ne_enclave->slot_uid; + slot_add_mem_req.paddr = phys_contig_mem_regions.regions[i].start; + slot_add_mem_req.size = range_len(&phys_contig_mem_regions.regions[i]); + + rc = ne_do_request(pdev, SLOT_ADD_MEM, + &slot_add_mem_req, sizeof(slot_add_mem_req), + &cmd_reply, sizeof(cmd_reply)); + if (rc < 0) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Error in slot add mem [rc=%d]\n", rc); + + kfree(phys_contig_mem_regions.regions); + + /* + * Exit here without put pages as memory regions may + * already been added. + */ + return rc; + } + + ne_enclave->mem_size += slot_add_mem_req.size; + ne_enclave->nr_mem_regions++; + } + + kfree(phys_contig_mem_regions.regions); + + return 0; + +put_pages: + for (i = 0; i < ne_mem_region->nr_pages; i++) + put_page(ne_mem_region->pages[i]); +free_mem_region: + kfree(phys_contig_mem_regions.regions); + kfree(ne_mem_region->pages); + kfree(ne_mem_region); + + return rc; +} + +/** + * ne_start_enclave_ioctl() - Trigger enclave start after the enclave resources, + * such as memory and CPU, have been set. + * @ne_enclave : Private data associated with the current enclave. + * @enclave_start_info : Enclave info that includes enclave cid and flags. + * + * Context: Process context. This function is called with the ne_enclave mutex held. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_start_enclave_ioctl(struct ne_enclave *ne_enclave, + struct ne_enclave_start_info *enclave_start_info) +{ + struct ne_pci_dev_cmd_reply cmd_reply = {}; + unsigned int cpu = 0; + struct enclave_start_req enclave_start_req = {}; + unsigned int i = 0; + struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev; + int rc = -EINVAL; + + if (!ne_enclave->nr_mem_regions) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Enclave has no mem regions\n"); + + return -NE_ERR_NO_MEM_REGIONS_ADDED; + } + + if (ne_enclave->mem_size < NE_MIN_ENCLAVE_MEM_SIZE) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Enclave memory is less than %ld\n", + NE_MIN_ENCLAVE_MEM_SIZE); + + return -NE_ERR_ENCLAVE_MEM_MIN_SIZE; + } + + if (!ne_enclave->nr_vcpus) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Enclave has no vCPUs\n"); + + return -NE_ERR_NO_VCPUS_ADDED; + } + + for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++) + for_each_cpu(cpu, ne_enclave->threads_per_core[i]) + if (!cpumask_test_cpu(cpu, ne_enclave->vcpu_ids)) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Full CPU cores not used\n"); + + return -NE_ERR_FULL_CORES_NOT_USED; + } + + enclave_start_req.enclave_cid = enclave_start_info->enclave_cid; + enclave_start_req.flags = enclave_start_info->flags; + enclave_start_req.slot_uid = ne_enclave->slot_uid; + + rc = ne_do_request(pdev, ENCLAVE_START, + &enclave_start_req, sizeof(enclave_start_req), + &cmd_reply, sizeof(cmd_reply)); + if (rc < 0) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Error in enclave start [rc=%d]\n", rc); + + return rc; + } + + ne_enclave->state = NE_STATE_RUNNING; + + enclave_start_info->enclave_cid = cmd_reply.enclave_cid; + + return 0; +} + +/** + * ne_enclave_ioctl() - Ioctl function provided by the enclave file. + * @file: File associated with this ioctl function. + * @cmd: The command that is set for the ioctl call. + * @arg: The argument that is provided for the ioctl call. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static long ne_enclave_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct ne_enclave *ne_enclave = file->private_data; + + switch (cmd) { + case NE_ADD_VCPU: { + int rc = -EINVAL; + u32 vcpu_id = 0; + + if (copy_from_user(&vcpu_id, (void __user *)arg, sizeof(vcpu_id))) + return -EFAULT; + + mutex_lock(&ne_enclave->enclave_info_mutex); + + if (ne_enclave->state != NE_STATE_INIT) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Enclave is not in init state\n"); + + mutex_unlock(&ne_enclave->enclave_info_mutex); + + return -NE_ERR_NOT_IN_INIT_STATE; + } + + if (vcpu_id >= (ne_enclave->nr_parent_vm_cores * + ne_enclave->nr_threads_per_core)) { + dev_err_ratelimited(ne_misc_dev.this_device, + "vCPU id higher than max CPU id\n"); + + mutex_unlock(&ne_enclave->enclave_info_mutex); + + return -NE_ERR_INVALID_VCPU; + } + + if (!vcpu_id) { + /* Use the CPU pool for choosing a CPU for the enclave. */ + rc = ne_get_cpu_from_cpu_pool(ne_enclave, &vcpu_id); + if (rc < 0) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Error in get CPU from pool [rc=%d]\n", + rc); + + mutex_unlock(&ne_enclave->enclave_info_mutex); + + return rc; + } + } else { + /* Check if the provided vCPU is available in the NE CPU pool. */ + rc = ne_check_cpu_in_cpu_pool(ne_enclave, vcpu_id); + if (rc < 0) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Error in check CPU %d in pool [rc=%d]\n", + vcpu_id, rc); + + mutex_unlock(&ne_enclave->enclave_info_mutex); + + return rc; + } + } + + rc = ne_add_vcpu_ioctl(ne_enclave, vcpu_id); + if (rc < 0) { + mutex_unlock(&ne_enclave->enclave_info_mutex); + + return rc; + } + + mutex_unlock(&ne_enclave->enclave_info_mutex); + + if (copy_to_user((void __user *)arg, &vcpu_id, sizeof(vcpu_id))) + return -EFAULT; + + return 0; + } + + case NE_GET_IMAGE_LOAD_INFO: { + struct ne_image_load_info image_load_info = {}; + + if (copy_from_user(&image_load_info, (void __user *)arg, sizeof(image_load_info))) + return -EFAULT; + + mutex_lock(&ne_enclave->enclave_info_mutex); + + if (ne_enclave->state != NE_STATE_INIT) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Enclave is not in init state\n"); + + mutex_unlock(&ne_enclave->enclave_info_mutex); + + return -NE_ERR_NOT_IN_INIT_STATE; + } + + mutex_unlock(&ne_enclave->enclave_info_mutex); + + if (!image_load_info.flags || + image_load_info.flags >= NE_IMAGE_LOAD_MAX_FLAG_VAL) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Incorrect flag in enclave image load info\n"); + + return -NE_ERR_INVALID_FLAG_VALUE; + } + + if (image_load_info.flags == NE_EIF_IMAGE) + image_load_info.memory_offset = NE_EIF_LOAD_OFFSET; + + if (copy_to_user((void __user *)arg, &image_load_info, sizeof(image_load_info))) + return -EFAULT; + + return 0; + } + + case NE_SET_USER_MEMORY_REGION: { + struct ne_user_memory_region mem_region = {}; + int rc = -EINVAL; + + if (copy_from_user(&mem_region, (void __user *)arg, sizeof(mem_region))) + return -EFAULT; + + if (mem_region.flags >= NE_MEMORY_REGION_MAX_FLAG_VAL) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Incorrect flag for user memory region\n"); + + return -NE_ERR_INVALID_FLAG_VALUE; + } + + mutex_lock(&ne_enclave->enclave_info_mutex); + + if (ne_enclave->state != NE_STATE_INIT) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Enclave is not in init state\n"); + + mutex_unlock(&ne_enclave->enclave_info_mutex); + + return -NE_ERR_NOT_IN_INIT_STATE; + } + + rc = ne_set_user_memory_region_ioctl(ne_enclave, mem_region); + if (rc < 0) { + mutex_unlock(&ne_enclave->enclave_info_mutex); + + return rc; + } + + mutex_unlock(&ne_enclave->enclave_info_mutex); + + return 0; + } + + case NE_START_ENCLAVE: { + struct ne_enclave_start_info enclave_start_info = {}; + int rc = -EINVAL; + + if (copy_from_user(&enclave_start_info, (void __user *)arg, + sizeof(enclave_start_info))) + return -EFAULT; + + if (enclave_start_info.flags >= NE_ENCLAVE_START_MAX_FLAG_VAL) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Incorrect flag in enclave start info\n"); + + return -NE_ERR_INVALID_FLAG_VALUE; + } + + /* + * Do not use well-known CIDs - 0, 1, 2 - for enclaves. + * VMADDR_CID_ANY = -1U + * VMADDR_CID_HYPERVISOR = 0 + * VMADDR_CID_LOCAL = 1 + * VMADDR_CID_HOST = 2 + * Note: 0 is used as a placeholder to auto-generate an enclave CID. + * http://man7.org/linux/man-pages/man7/vsock.7.html + */ + if (enclave_start_info.enclave_cid > 0 && + enclave_start_info.enclave_cid <= VMADDR_CID_HOST) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Well-known CID value, not to be used for enclaves\n"); + + return -NE_ERR_INVALID_ENCLAVE_CID; + } + + if (enclave_start_info.enclave_cid == U32_MAX) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Well-known CID value, not to be used for enclaves\n"); + + return -NE_ERR_INVALID_ENCLAVE_CID; + } + + /* + * Do not use the CID of the primary / parent VM for enclaves. + */ + if (enclave_start_info.enclave_cid == NE_PARENT_VM_CID) { + dev_err_ratelimited(ne_misc_dev.this_device, + "CID of the parent VM, not to be used for enclaves\n"); + + return -NE_ERR_INVALID_ENCLAVE_CID; + } + + /* 64-bit CIDs are not yet supported for the vsock device. */ + if (enclave_start_info.enclave_cid > U32_MAX) { + dev_err_ratelimited(ne_misc_dev.this_device, + "64-bit CIDs not yet supported for the vsock device\n"); + + return -NE_ERR_INVALID_ENCLAVE_CID; + } + + mutex_lock(&ne_enclave->enclave_info_mutex); + + if (ne_enclave->state != NE_STATE_INIT) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Enclave is not in init state\n"); + + mutex_unlock(&ne_enclave->enclave_info_mutex); + + return -NE_ERR_NOT_IN_INIT_STATE; + } + + rc = ne_start_enclave_ioctl(ne_enclave, &enclave_start_info); + if (rc < 0) { + mutex_unlock(&ne_enclave->enclave_info_mutex); + + return rc; + } + + mutex_unlock(&ne_enclave->enclave_info_mutex); + + if (copy_to_user((void __user *)arg, &enclave_start_info, + sizeof(enclave_start_info))) + return -EFAULT; + + return 0; + } + + default: + return -ENOTTY; + } + + return 0; +} + +/** + * ne_enclave_remove_all_mem_region_entries() - Remove all memory region entries + * from the enclave data structure. + * @ne_enclave : Private data associated with the current enclave. + * + * Context: Process context. This function is called with the ne_enclave mutex held. + */ +static void ne_enclave_remove_all_mem_region_entries(struct ne_enclave *ne_enclave) +{ + unsigned long i = 0; + struct ne_mem_region *ne_mem_region = NULL; + struct ne_mem_region *ne_mem_region_tmp = NULL; + + list_for_each_entry_safe(ne_mem_region, ne_mem_region_tmp, + &ne_enclave->mem_regions_list, + mem_region_list_entry) { + list_del(&ne_mem_region->mem_region_list_entry); + + for (i = 0; i < ne_mem_region->nr_pages; i++) + put_page(ne_mem_region->pages[i]); + + kfree(ne_mem_region->pages); + + kfree(ne_mem_region); + } +} + +/** + * ne_enclave_remove_all_vcpu_id_entries() - Remove all vCPU id entries from + * the enclave data structure. + * @ne_enclave : Private data associated with the current enclave. + * + * Context: Process context. This function is called with the ne_enclave mutex held. + */ +static void ne_enclave_remove_all_vcpu_id_entries(struct ne_enclave *ne_enclave) +{ + unsigned int cpu = 0; + unsigned int i = 0; + + mutex_lock(&ne_cpu_pool.mutex); + + for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++) { + for_each_cpu(cpu, ne_enclave->threads_per_core[i]) + /* Update the available NE CPU pool. */ + cpumask_set_cpu(cpu, ne_cpu_pool.avail_threads_per_core[i]); + + free_cpumask_var(ne_enclave->threads_per_core[i]); + } + + mutex_unlock(&ne_cpu_pool.mutex); + + kfree(ne_enclave->threads_per_core); + + free_cpumask_var(ne_enclave->vcpu_ids); +} + +/** + * ne_pci_dev_remove_enclave_entry() - Remove the enclave entry from the data + * structure that is part of the NE PCI + * device private data. + * @ne_enclave : Private data associated with the current enclave. + * @ne_pci_dev : Private data associated with the PCI device. + * + * Context: Process context. This function is called with the ne_pci_dev enclave + * mutex held. + */ +static void ne_pci_dev_remove_enclave_entry(struct ne_enclave *ne_enclave, + struct ne_pci_dev *ne_pci_dev) +{ + struct ne_enclave *ne_enclave_entry = NULL; + struct ne_enclave *ne_enclave_entry_tmp = NULL; + + list_for_each_entry_safe(ne_enclave_entry, ne_enclave_entry_tmp, + &ne_pci_dev->enclaves_list, enclave_list_entry) { + if (ne_enclave_entry->slot_uid == ne_enclave->slot_uid) { + list_del(&ne_enclave_entry->enclave_list_entry); + + break; + } + } +} + +/** + * ne_enclave_release() - Release function provided by the enclave file. + * @inode: Inode associated with this file release function. + * @file: File associated with this release function. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_enclave_release(struct inode *inode, struct file *file) +{ + struct ne_pci_dev_cmd_reply cmd_reply = {}; + struct enclave_stop_req enclave_stop_request = {}; + struct ne_enclave *ne_enclave = file->private_data; + struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev; + struct pci_dev *pdev = ne_pci_dev->pdev; + int rc = -EINVAL; + struct slot_free_req slot_free_req = {}; + + if (!ne_enclave) + return 0; + + /* + * Early exit in case there is an error in the enclave creation logic + * and fput() is called on the cleanup path. + */ + if (!ne_enclave->slot_uid) + return 0; + + /* + * Acquire the enclave list mutex before the enclave mutex + * in order to avoid deadlocks with @ref ne_event_work_handler. + */ + mutex_lock(&ne_pci_dev->enclaves_list_mutex); + mutex_lock(&ne_enclave->enclave_info_mutex); + + if (ne_enclave->state != NE_STATE_INIT && ne_enclave->state != NE_STATE_STOPPED) { + enclave_stop_request.slot_uid = ne_enclave->slot_uid; + + rc = ne_do_request(pdev, ENCLAVE_STOP, + &enclave_stop_request, sizeof(enclave_stop_request), + &cmd_reply, sizeof(cmd_reply)); + if (rc < 0) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Error in enclave stop [rc=%d]\n", rc); + + goto unlock_mutex; + } + + memset(&cmd_reply, 0, sizeof(cmd_reply)); + } + + slot_free_req.slot_uid = ne_enclave->slot_uid; + + rc = ne_do_request(pdev, SLOT_FREE, + &slot_free_req, sizeof(slot_free_req), + &cmd_reply, sizeof(cmd_reply)); + if (rc < 0) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Error in slot free [rc=%d]\n", rc); + + goto unlock_mutex; + } + + ne_pci_dev_remove_enclave_entry(ne_enclave, ne_pci_dev); + ne_enclave_remove_all_mem_region_entries(ne_enclave); + ne_enclave_remove_all_vcpu_id_entries(ne_enclave); + + mutex_unlock(&ne_enclave->enclave_info_mutex); + mutex_unlock(&ne_pci_dev->enclaves_list_mutex); + + kfree(ne_enclave); + + return 0; + +unlock_mutex: + mutex_unlock(&ne_enclave->enclave_info_mutex); + mutex_unlock(&ne_pci_dev->enclaves_list_mutex); + + return rc; +} + +/** + * ne_enclave_poll() - Poll functionality used for enclave out-of-band events. + * @file: File associated with this poll function. + * @wait: Poll table data structure. + * + * Context: Process context. + * Return: + * * Poll mask. + */ +static __poll_t ne_enclave_poll(struct file *file, poll_table *wait) +{ + __poll_t mask = 0; + struct ne_enclave *ne_enclave = file->private_data; + + poll_wait(file, &ne_enclave->eventq, wait); + + if (ne_enclave->has_event) + mask |= EPOLLHUP; + + return mask; +} + +static const struct file_operations ne_enclave_fops = { + .owner = THIS_MODULE, + .llseek = noop_llseek, + .poll = ne_enclave_poll, + .unlocked_ioctl = ne_enclave_ioctl, + .release = ne_enclave_release, +}; + +/** + * ne_create_vm_ioctl() - Alloc slot to be associated with an enclave. Create + * enclave file descriptor to be further used for enclave + * resources handling e.g. memory regions and CPUs. + * @ne_pci_dev : Private data associated with the PCI device. + * @slot_uid: User pointer to store the generated unique slot id + * associated with an enclave to. + * + * Context: Process context. This function is called with the ne_pci_dev enclave + * mutex held. + * Return: + * * Enclave fd on success. + * * Negative return value on failure. + */ +static int ne_create_vm_ioctl(struct ne_pci_dev *ne_pci_dev, u64 __user *slot_uid) +{ + struct ne_pci_dev_cmd_reply cmd_reply = {}; + int enclave_fd = -1; + struct file *enclave_file = NULL; + unsigned int i = 0; + struct ne_enclave *ne_enclave = NULL; + struct pci_dev *pdev = ne_pci_dev->pdev; + int rc = -EINVAL; + struct slot_alloc_req slot_alloc_req = {}; + + mutex_lock(&ne_cpu_pool.mutex); + + for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++) + if (!cpumask_empty(ne_cpu_pool.avail_threads_per_core[i])) + break; + + if (i == ne_cpu_pool.nr_parent_vm_cores) { + dev_err_ratelimited(ne_misc_dev.this_device, + "No CPUs available in CPU pool\n"); + + mutex_unlock(&ne_cpu_pool.mutex); + + return -NE_ERR_NO_CPUS_AVAIL_IN_POOL; + } + + mutex_unlock(&ne_cpu_pool.mutex); + + ne_enclave = kzalloc(sizeof(*ne_enclave), GFP_KERNEL); + if (!ne_enclave) + return -ENOMEM; + + mutex_lock(&ne_cpu_pool.mutex); + + ne_enclave->nr_parent_vm_cores = ne_cpu_pool.nr_parent_vm_cores; + ne_enclave->nr_threads_per_core = ne_cpu_pool.nr_threads_per_core; + ne_enclave->numa_node = ne_cpu_pool.numa_node; + + mutex_unlock(&ne_cpu_pool.mutex); + + ne_enclave->threads_per_core = kcalloc(ne_enclave->nr_parent_vm_cores, + sizeof(*ne_enclave->threads_per_core), + GFP_KERNEL); + if (!ne_enclave->threads_per_core) { + rc = -ENOMEM; + + goto free_ne_enclave; + } + + for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++) + if (!zalloc_cpumask_var(&ne_enclave->threads_per_core[i], GFP_KERNEL)) { + rc = -ENOMEM; + + goto free_cpumask; + } + + if (!zalloc_cpumask_var(&ne_enclave->vcpu_ids, GFP_KERNEL)) { + rc = -ENOMEM; + + goto free_cpumask; + } + + enclave_fd = get_unused_fd_flags(O_CLOEXEC); + if (enclave_fd < 0) { + rc = enclave_fd; + + dev_err_ratelimited(ne_misc_dev.this_device, + "Error in getting unused fd [rc=%d]\n", rc); + + goto free_cpumask; + } + + enclave_file = anon_inode_getfile("ne-vm", &ne_enclave_fops, ne_enclave, O_RDWR); + if (IS_ERR(enclave_file)) { + rc = PTR_ERR(enclave_file); + + dev_err_ratelimited(ne_misc_dev.this_device, + "Error in anon inode get file [rc=%d]\n", rc); + + goto put_fd; + } + + rc = ne_do_request(pdev, SLOT_ALLOC, + &slot_alloc_req, sizeof(slot_alloc_req), + &cmd_reply, sizeof(cmd_reply)); + if (rc < 0) { + dev_err_ratelimited(ne_misc_dev.this_device, + "Error in slot alloc [rc=%d]\n", rc); + + goto put_file; + } + + init_waitqueue_head(&ne_enclave->eventq); + ne_enclave->has_event = false; + mutex_init(&ne_enclave->enclave_info_mutex); + ne_enclave->max_mem_regions = cmd_reply.mem_regions; + INIT_LIST_HEAD(&ne_enclave->mem_regions_list); + ne_enclave->mm = current->mm; + ne_enclave->slot_uid = cmd_reply.slot_uid; + ne_enclave->state = NE_STATE_INIT; + + list_add(&ne_enclave->enclave_list_entry, &ne_pci_dev->enclaves_list); + + if (copy_to_user(slot_uid, &ne_enclave->slot_uid, sizeof(ne_enclave->slot_uid))) { + /* + * As we're holding the only reference to 'enclave_file', fput() + * will call ne_enclave_release() which will do a proper cleanup + * of all so far allocated resources, leaving only the unused fd + * for us to free. + */ + fput(enclave_file); + put_unused_fd(enclave_fd); + + return -EFAULT; + } + + fd_install(enclave_fd, enclave_file); + + return enclave_fd; + +put_file: + fput(enclave_file); +put_fd: + put_unused_fd(enclave_fd); +free_cpumask: + free_cpumask_var(ne_enclave->vcpu_ids); + for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++) + free_cpumask_var(ne_enclave->threads_per_core[i]); + kfree(ne_enclave->threads_per_core); +free_ne_enclave: + kfree(ne_enclave); + + return rc; +} + +/** + * ne_ioctl() - Ioctl function provided by the NE misc device. + * @file: File associated with this ioctl function. + * @cmd: The command that is set for the ioctl call. + * @arg: The argument that is provided for the ioctl call. + * + * Context: Process context. + * Return: + * * Ioctl result (e.g. enclave file descriptor) on success. + * * Negative return value on failure. + */ +static long ne_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case NE_CREATE_VM: { + int enclave_fd = -1; + struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev; + u64 __user *slot_uid = (void __user *)arg; + + mutex_lock(&ne_pci_dev->enclaves_list_mutex); + enclave_fd = ne_create_vm_ioctl(ne_pci_dev, slot_uid); + mutex_unlock(&ne_pci_dev->enclaves_list_mutex); + + return enclave_fd; + } + + default: + return -ENOTTY; + } + + return 0; +} + +#if defined(CONFIG_NITRO_ENCLAVES_MISC_DEV_TEST) +#include "ne_misc_dev_test.c" +#endif + +static int __init ne_init(void) +{ + mutex_init(&ne_cpu_pool.mutex); + + return pci_register_driver(&ne_pci_driver); +} + +static void __exit ne_exit(void) +{ + pci_unregister_driver(&ne_pci_driver); + + ne_teardown_cpu_pool(); +} + +module_init(ne_init); +module_exit(ne_exit); + +MODULE_AUTHOR("Amazon.com, Inc. or its affiliates"); +MODULE_DESCRIPTION("Nitro Enclaves Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/virt/nitro_enclaves/ne_misc_dev.h b/drivers/virt/nitro_enclaves/ne_misc_dev.h new file mode 100644 index 000000000..2a4d2224b --- /dev/null +++ b/drivers/virt/nitro_enclaves/ne_misc_dev.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + */ + +#ifndef _NE_MISC_DEV_H_ +#define _NE_MISC_DEV_H_ + +#include <linux/cpumask.h> +#include <linux/list.h> +#include <linux/miscdevice.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/pci.h> +#include <linux/wait.h> + +#include "ne_pci_dev.h" + +/** + * struct ne_mem_region - Entry in the enclave user space memory regions list. + * @mem_region_list_entry: Entry in the list of enclave memory regions. + * @memory_size: Size of the user space memory region. + * @nr_pages: Number of pages that make up the memory region. + * @pages: Pages that make up the user space memory region. + * @userspace_addr: User space address of the memory region. + */ +struct ne_mem_region { + struct list_head mem_region_list_entry; + u64 memory_size; + unsigned long nr_pages; + struct page **pages; + u64 userspace_addr; +}; + +/** + * struct ne_enclave - Per-enclave data used for enclave lifetime management. + * @enclave_info_mutex : Mutex for accessing this internal state. + * @enclave_list_entry : Entry in the list of created enclaves. + * @eventq: Wait queue used for out-of-band event notifications + * triggered from the PCI device event handler to + * the enclave process via the poll function. + * @has_event: Variable used to determine if the out-of-band event + * was triggered. + * @max_mem_regions: The maximum number of memory regions that can be + * handled by the hypervisor. + * @mem_regions_list: Enclave user space memory regions list. + * @mem_size: Enclave memory size. + * @mm : Enclave process abstraction mm data struct. + * @nr_mem_regions: Number of memory regions associated with the enclave. + * @nr_parent_vm_cores : The size of the threads per core array. The + * total number of CPU cores available on the + * parent / primary VM. + * @nr_threads_per_core: The number of threads that a full CPU core has. + * @nr_vcpus: Number of vcpus associated with the enclave. + * @numa_node: NUMA node of the enclave memory and CPUs. + * @slot_uid: Slot unique id mapped to the enclave. + * @state: Enclave state, updated during enclave lifetime. + * @threads_per_core: Enclave full CPU cores array, indexed by core id, + * consisting of cpumasks with all their threads. + * Full CPU cores are taken from the NE CPU pool + * and are available to the enclave. + * @vcpu_ids: Cpumask of the vCPUs that are set for the enclave. + */ +struct ne_enclave { + struct mutex enclave_info_mutex; + struct list_head enclave_list_entry; + wait_queue_head_t eventq; + bool has_event; + u64 max_mem_regions; + struct list_head mem_regions_list; + u64 mem_size; + struct mm_struct *mm; + unsigned int nr_mem_regions; + unsigned int nr_parent_vm_cores; + unsigned int nr_threads_per_core; + unsigned int nr_vcpus; + int numa_node; + u64 slot_uid; + u16 state; + cpumask_var_t *threads_per_core; + cpumask_var_t vcpu_ids; +}; + +/** + * enum ne_state - States available for an enclave. + * @NE_STATE_INIT: The enclave has not been started yet. + * @NE_STATE_RUNNING: The enclave was started and is running as expected. + * @NE_STATE_STOPPED: The enclave exited without userspace interaction. + */ +enum ne_state { + NE_STATE_INIT = 0, + NE_STATE_RUNNING = 2, + NE_STATE_STOPPED = U16_MAX, +}; + +/** + * struct ne_devs - Data structure to keep refs to the NE misc and PCI devices. + * @ne_misc_dev: Nitro Enclaves misc device. + * @ne_pci_dev : Nitro Enclaves PCI device. + */ +struct ne_devs { + struct miscdevice *ne_misc_dev; + struct ne_pci_dev *ne_pci_dev; +}; + +/* Nitro Enclaves (NE) data structure for keeping refs to the NE misc and PCI devices. */ +extern struct ne_devs ne_devs; + +#endif /* _NE_MISC_DEV_H_ */ diff --git a/drivers/virt/nitro_enclaves/ne_misc_dev_test.c b/drivers/virt/nitro_enclaves/ne_misc_dev_test.c new file mode 100644 index 000000000..74df43b92 --- /dev/null +++ b/drivers/virt/nitro_enclaves/ne_misc_dev_test.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <kunit/test.h> + +#define MAX_PHYS_REGIONS 16 +#define INVALID_VALUE (~0ull) + +struct ne_phys_regions_test { + u64 paddr; + u64 size; + int expect_rc; + unsigned long expect_num; + u64 expect_last_paddr; + u64 expect_last_size; +} phys_regions_test_cases[] = { + /* + * Add the region from 0x1000 to (0x1000 + 0x200000 - 1): + * Expected result: + * Failed, start address is not 2M-aligned + * + * Now the instance of struct ne_phys_contig_mem_regions is: + * num = 0 + * regions = {} + */ + {0x1000, 0x200000, -EINVAL, 0, INVALID_VALUE, INVALID_VALUE}, + + /* + * Add the region from 0x200000 to (0x200000 + 0x1000 - 1): + * Expected result: + * Failed, size is not 2M-aligned + * + * Now the instance of struct ne_phys_contig_mem_regions is: + * num = 0 + * regions = {} + */ + {0x200000, 0x1000, -EINVAL, 0, INVALID_VALUE, INVALID_VALUE}, + + /* + * Add the region from 0x200000 to (0x200000 + 0x200000 - 1): + * Expected result: + * Successful + * + * Now the instance of struct ne_phys_contig_mem_regions is: + * num = 1 + * regions = { + * {start=0x200000, end=0x3fffff}, // len=0x200000 + * } + */ + {0x200000, 0x200000, 0, 1, 0x200000, 0x200000}, + + /* + * Add the region from 0x0 to (0x0 + 0x200000 - 1): + * Expected result: + * Successful + * + * Now the instance of struct ne_phys_contig_mem_regions is: + * num = 2 + * regions = { + * {start=0x200000, end=0x3fffff}, // len=0x200000 + * {start=0x0, end=0x1fffff}, // len=0x200000 + * } + */ + {0x0, 0x200000, 0, 2, 0x0, 0x200000}, + + /* + * Add the region from 0x600000 to (0x600000 + 0x400000 - 1): + * Expected result: + * Successful + * + * Now the instance of struct ne_phys_contig_mem_regions is: + * num = 3 + * regions = { + * {start=0x200000, end=0x3fffff}, // len=0x200000 + * {start=0x0, end=0x1fffff}, // len=0x200000 + * {start=0x600000, end=0x9fffff}, // len=0x400000 + * } + */ + {0x600000, 0x400000, 0, 3, 0x600000, 0x400000}, + + /* + * Add the region from 0xa00000 to (0xa00000 + 0x400000 - 1): + * Expected result: + * Successful, merging case! + * + * Now the instance of struct ne_phys_contig_mem_regions is: + * num = 3 + * regions = { + * {start=0x200000, end=0x3fffff}, // len=0x200000 + * {start=0x0, end=0x1fffff}, // len=0x200000 + * {start=0x600000, end=0xdfffff}, // len=0x800000 + * } + */ + {0xa00000, 0x400000, 0, 3, 0x600000, 0x800000}, + + /* + * Add the region from 0x1000 to (0x1000 + 0x200000 - 1): + * Expected result: + * Failed, start address is not 2M-aligned + * + * Now the instance of struct ne_phys_contig_mem_regions is: + * num = 3 + * regions = { + * {start=0x200000, end=0x3fffff}, // len=0x200000 + * {start=0x0, end=0x1fffff}, // len=0x200000 + * {start=0x600000, end=0xdfffff}, // len=0x800000 + * } + */ + {0x1000, 0x200000, -EINVAL, 3, 0x600000, 0x800000}, +}; + +static void ne_misc_dev_test_merge_phys_contig_memory_regions(struct kunit *test) +{ + struct ne_phys_contig_mem_regions phys_contig_mem_regions = {}; + int rc = 0; + int i = 0; + + phys_contig_mem_regions.regions = kunit_kcalloc(test, MAX_PHYS_REGIONS, + sizeof(*phys_contig_mem_regions.regions), + GFP_KERNEL); + KUNIT_ASSERT_TRUE(test, phys_contig_mem_regions.regions); + + for (i = 0; i < ARRAY_SIZE(phys_regions_test_cases); i++) { + struct ne_phys_regions_test *test_case = &phys_regions_test_cases[i]; + unsigned long num = 0; + + rc = ne_merge_phys_contig_memory_regions(&phys_contig_mem_regions, + test_case->paddr, test_case->size); + KUNIT_EXPECT_EQ(test, rc, test_case->expect_rc); + KUNIT_EXPECT_EQ(test, phys_contig_mem_regions.num, test_case->expect_num); + + if (test_case->expect_last_paddr == INVALID_VALUE) + continue; + + num = phys_contig_mem_regions.num; + KUNIT_EXPECT_EQ(test, phys_contig_mem_regions.regions[num - 1].start, + test_case->expect_last_paddr); + KUNIT_EXPECT_EQ(test, range_len(&phys_contig_mem_regions.regions[num - 1]), + test_case->expect_last_size); + } + + kunit_kfree(test, phys_contig_mem_regions.regions); +} + +static struct kunit_case ne_misc_dev_test_cases[] = { + KUNIT_CASE(ne_misc_dev_test_merge_phys_contig_memory_regions), + {} +}; + +static struct kunit_suite ne_misc_dev_test_suite = { + .name = "ne_misc_dev_test", + .test_cases = ne_misc_dev_test_cases, +}; + +kunit_test_suite(ne_misc_dev_test_suite); diff --git a/drivers/virt/nitro_enclaves/ne_pci_dev.c b/drivers/virt/nitro_enclaves/ne_pci_dev.c new file mode 100644 index 000000000..6b81e8f3a --- /dev/null +++ b/drivers/virt/nitro_enclaves/ne_pci_dev.c @@ -0,0 +1,626 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. + */ + +/** + * DOC: Nitro Enclaves (NE) PCI device driver. + */ + +#include <linux/delay.h> +#include <linux/device.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/nitro_enclaves.h> +#include <linux/pci.h> +#include <linux/types.h> +#include <linux/wait.h> + +#include "ne_misc_dev.h" +#include "ne_pci_dev.h" + +/** + * NE_DEFAULT_TIMEOUT_MSECS - Default timeout to wait for a reply from + * the NE PCI device. + */ +#define NE_DEFAULT_TIMEOUT_MSECS (120000) /* 120 sec */ + +static const struct pci_device_id ne_pci_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, PCI_DEVICE_ID_NE) }, + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, ne_pci_ids); + +/** + * ne_submit_request() - Submit command request to the PCI device based on the + * command type. + * @pdev: PCI device to send the command to. + * @cmd_type: Command type of the request sent to the PCI device. + * @cmd_request: Command request payload. + * @cmd_request_size: Size of the command request payload. + * + * Context: Process context. This function is called with the ne_pci_dev mutex held. + */ +static void ne_submit_request(struct pci_dev *pdev, enum ne_pci_dev_cmd_type cmd_type, + void *cmd_request, size_t cmd_request_size) +{ + struct ne_pci_dev *ne_pci_dev = pci_get_drvdata(pdev); + + memcpy_toio(ne_pci_dev->iomem_base + NE_SEND_DATA, cmd_request, cmd_request_size); + + iowrite32(cmd_type, ne_pci_dev->iomem_base + NE_COMMAND); +} + +/** + * ne_retrieve_reply() - Retrieve reply from the PCI device. + * @pdev: PCI device to receive the reply from. + * @cmd_reply: Command reply payload. + * @cmd_reply_size: Size of the command reply payload. + * + * Context: Process context. This function is called with the ne_pci_dev mutex held. + */ +static void ne_retrieve_reply(struct pci_dev *pdev, struct ne_pci_dev_cmd_reply *cmd_reply, + size_t cmd_reply_size) +{ + struct ne_pci_dev *ne_pci_dev = pci_get_drvdata(pdev); + + memcpy_fromio(cmd_reply, ne_pci_dev->iomem_base + NE_RECV_DATA, cmd_reply_size); +} + +/** + * ne_wait_for_reply() - Wait for a reply of a PCI device command. + * @pdev: PCI device for which a reply is waited. + * + * Context: Process context. This function is called with the ne_pci_dev mutex held. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_wait_for_reply(struct pci_dev *pdev) +{ + struct ne_pci_dev *ne_pci_dev = pci_get_drvdata(pdev); + int rc = -EINVAL; + + /* + * TODO: Update to _interruptible and handle interrupted wait event + * e.g. -ERESTARTSYS, incoming signals + update timeout, if needed. + */ + rc = wait_event_timeout(ne_pci_dev->cmd_reply_wait_q, + atomic_read(&ne_pci_dev->cmd_reply_avail) != 0, + msecs_to_jiffies(NE_DEFAULT_TIMEOUT_MSECS)); + if (!rc) + return -ETIMEDOUT; + + return 0; +} + +int ne_do_request(struct pci_dev *pdev, enum ne_pci_dev_cmd_type cmd_type, + void *cmd_request, size_t cmd_request_size, + struct ne_pci_dev_cmd_reply *cmd_reply, size_t cmd_reply_size) +{ + struct ne_pci_dev *ne_pci_dev = pci_get_drvdata(pdev); + int rc = -EINVAL; + + if (cmd_type <= INVALID_CMD || cmd_type >= MAX_CMD) { + dev_err_ratelimited(&pdev->dev, "Invalid cmd type=%u\n", cmd_type); + + return -EINVAL; + } + + if (!cmd_request) { + dev_err_ratelimited(&pdev->dev, "Null cmd request for cmd type=%u\n", + cmd_type); + + return -EINVAL; + } + + if (cmd_request_size > NE_SEND_DATA_SIZE) { + dev_err_ratelimited(&pdev->dev, "Invalid req size=%zu for cmd type=%u\n", + cmd_request_size, cmd_type); + + return -EINVAL; + } + + if (!cmd_reply) { + dev_err_ratelimited(&pdev->dev, "Null cmd reply for cmd type=%u\n", + cmd_type); + + return -EINVAL; + } + + if (cmd_reply_size > NE_RECV_DATA_SIZE) { + dev_err_ratelimited(&pdev->dev, "Invalid reply size=%zu for cmd type=%u\n", + cmd_reply_size, cmd_type); + + return -EINVAL; + } + + /* + * Use this mutex so that the PCI device handles one command request at + * a time. + */ + mutex_lock(&ne_pci_dev->pci_dev_mutex); + + atomic_set(&ne_pci_dev->cmd_reply_avail, 0); + + ne_submit_request(pdev, cmd_type, cmd_request, cmd_request_size); + + rc = ne_wait_for_reply(pdev); + if (rc < 0) { + dev_err_ratelimited(&pdev->dev, "Error in wait for reply for cmd type=%u [rc=%d]\n", + cmd_type, rc); + + goto unlock_mutex; + } + + ne_retrieve_reply(pdev, cmd_reply, cmd_reply_size); + + atomic_set(&ne_pci_dev->cmd_reply_avail, 0); + + if (cmd_reply->rc < 0) { + rc = cmd_reply->rc; + + dev_err_ratelimited(&pdev->dev, "Error in cmd process logic, cmd type=%u [rc=%d]\n", + cmd_type, rc); + + goto unlock_mutex; + } + + rc = 0; + +unlock_mutex: + mutex_unlock(&ne_pci_dev->pci_dev_mutex); + + return rc; +} + +/** + * ne_reply_handler() - Interrupt handler for retrieving a reply matching a + * request sent to the PCI device for enclave lifetime + * management. + * @irq: Received interrupt for a reply sent by the PCI device. + * @args: PCI device private data structure. + * + * Context: Interrupt context. + * Return: + * * IRQ_HANDLED on handled interrupt. + */ +static irqreturn_t ne_reply_handler(int irq, void *args) +{ + struct ne_pci_dev *ne_pci_dev = (struct ne_pci_dev *)args; + + atomic_set(&ne_pci_dev->cmd_reply_avail, 1); + + /* TODO: Update to _interruptible. */ + wake_up(&ne_pci_dev->cmd_reply_wait_q); + + return IRQ_HANDLED; +} + +/** + * ne_event_work_handler() - Work queue handler for notifying enclaves on a + * state change received by the event interrupt + * handler. + * @work: Item containing the NE PCI device for which an out-of-band event + * was issued. + * + * An out-of-band event is being issued by the Nitro Hypervisor when at least + * one enclave is changing state without client interaction. + * + * Context: Work queue context. + */ +static void ne_event_work_handler(struct work_struct *work) +{ + struct ne_pci_dev_cmd_reply cmd_reply = {}; + struct ne_enclave *ne_enclave = NULL; + struct ne_pci_dev *ne_pci_dev = + container_of(work, struct ne_pci_dev, notify_work); + struct pci_dev *pdev = ne_pci_dev->pdev; + int rc = -EINVAL; + struct slot_info_req slot_info_req = {}; + + mutex_lock(&ne_pci_dev->enclaves_list_mutex); + + /* + * Iterate over all enclaves registered for the Nitro Enclaves + * PCI device and determine for which enclave(s) the out-of-band event + * is corresponding to. + */ + list_for_each_entry(ne_enclave, &ne_pci_dev->enclaves_list, enclave_list_entry) { + mutex_lock(&ne_enclave->enclave_info_mutex); + + /* + * Enclaves that were never started cannot receive out-of-band + * events. + */ + if (ne_enclave->state != NE_STATE_RUNNING) + goto unlock; + + slot_info_req.slot_uid = ne_enclave->slot_uid; + + rc = ne_do_request(pdev, SLOT_INFO, + &slot_info_req, sizeof(slot_info_req), + &cmd_reply, sizeof(cmd_reply)); + if (rc < 0) + dev_err(&pdev->dev, "Error in slot info [rc=%d]\n", rc); + + /* Notify enclave process that the enclave state changed. */ + if (ne_enclave->state != cmd_reply.state) { + ne_enclave->state = cmd_reply.state; + + ne_enclave->has_event = true; + + wake_up_interruptible(&ne_enclave->eventq); + } + +unlock: + mutex_unlock(&ne_enclave->enclave_info_mutex); + } + + mutex_unlock(&ne_pci_dev->enclaves_list_mutex); +} + +/** + * ne_event_handler() - Interrupt handler for PCI device out-of-band events. + * This interrupt does not supply any data in the MMIO + * region. It notifies a change in the state of any of + * the launched enclaves. + * @irq: Received interrupt for an out-of-band event. + * @args: PCI device private data structure. + * + * Context: Interrupt context. + * Return: + * * IRQ_HANDLED on handled interrupt. + */ +static irqreturn_t ne_event_handler(int irq, void *args) +{ + struct ne_pci_dev *ne_pci_dev = (struct ne_pci_dev *)args; + + queue_work(ne_pci_dev->event_wq, &ne_pci_dev->notify_work); + + return IRQ_HANDLED; +} + +/** + * ne_setup_msix() - Setup MSI-X vectors for the PCI device. + * @pdev: PCI device to setup the MSI-X for. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_setup_msix(struct pci_dev *pdev) +{ + struct ne_pci_dev *ne_pci_dev = pci_get_drvdata(pdev); + int nr_vecs = 0; + int rc = -EINVAL; + + nr_vecs = pci_msix_vec_count(pdev); + if (nr_vecs < 0) { + rc = nr_vecs; + + dev_err(&pdev->dev, "Error in getting vec count [rc=%d]\n", rc); + + return rc; + } + + rc = pci_alloc_irq_vectors(pdev, nr_vecs, nr_vecs, PCI_IRQ_MSIX); + if (rc < 0) { + dev_err(&pdev->dev, "Error in alloc MSI-X vecs [rc=%d]\n", rc); + + return rc; + } + + /* + * This IRQ gets triggered every time the PCI device responds to a + * command request. The reply is then retrieved, reading from the MMIO + * space of the PCI device. + */ + rc = request_irq(pci_irq_vector(pdev, NE_VEC_REPLY), ne_reply_handler, + 0, "enclave_cmd", ne_pci_dev); + if (rc < 0) { + dev_err(&pdev->dev, "Error in request irq reply [rc=%d]\n", rc); + + goto free_irq_vectors; + } + + ne_pci_dev->event_wq = create_singlethread_workqueue("ne_pci_dev_wq"); + if (!ne_pci_dev->event_wq) { + rc = -ENOMEM; + + dev_err(&pdev->dev, "Cannot get wq for dev events [rc=%d]\n", rc); + + goto free_reply_irq_vec; + } + + INIT_WORK(&ne_pci_dev->notify_work, ne_event_work_handler); + + /* + * This IRQ gets triggered every time any enclave's state changes. Its + * handler then scans for the changes and propagates them to the user + * space. + */ + rc = request_irq(pci_irq_vector(pdev, NE_VEC_EVENT), ne_event_handler, + 0, "enclave_evt", ne_pci_dev); + if (rc < 0) { + dev_err(&pdev->dev, "Error in request irq event [rc=%d]\n", rc); + + goto destroy_wq; + } + + return 0; + +destroy_wq: + destroy_workqueue(ne_pci_dev->event_wq); +free_reply_irq_vec: + free_irq(pci_irq_vector(pdev, NE_VEC_REPLY), ne_pci_dev); +free_irq_vectors: + pci_free_irq_vectors(pdev); + + return rc; +} + +/** + * ne_teardown_msix() - Teardown MSI-X vectors for the PCI device. + * @pdev: PCI device to teardown the MSI-X for. + * + * Context: Process context. + */ +static void ne_teardown_msix(struct pci_dev *pdev) +{ + struct ne_pci_dev *ne_pci_dev = pci_get_drvdata(pdev); + + free_irq(pci_irq_vector(pdev, NE_VEC_EVENT), ne_pci_dev); + + flush_work(&ne_pci_dev->notify_work); + destroy_workqueue(ne_pci_dev->event_wq); + + free_irq(pci_irq_vector(pdev, NE_VEC_REPLY), ne_pci_dev); + + pci_free_irq_vectors(pdev); +} + +/** + * ne_pci_dev_enable() - Select the PCI device version and enable it. + * @pdev: PCI device to select version for and then enable. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_pci_dev_enable(struct pci_dev *pdev) +{ + u8 dev_enable_reply = 0; + u16 dev_version_reply = 0; + struct ne_pci_dev *ne_pci_dev = pci_get_drvdata(pdev); + + iowrite16(NE_VERSION_MAX, ne_pci_dev->iomem_base + NE_VERSION); + + dev_version_reply = ioread16(ne_pci_dev->iomem_base + NE_VERSION); + if (dev_version_reply != NE_VERSION_MAX) { + dev_err(&pdev->dev, "Error in pci dev version cmd\n"); + + return -EIO; + } + + iowrite8(NE_ENABLE_ON, ne_pci_dev->iomem_base + NE_ENABLE); + + dev_enable_reply = ioread8(ne_pci_dev->iomem_base + NE_ENABLE); + if (dev_enable_reply != NE_ENABLE_ON) { + dev_err(&pdev->dev, "Error in pci dev enable cmd\n"); + + return -EIO; + } + + return 0; +} + +/** + * ne_pci_dev_disable() - Disable the PCI device. + * @pdev: PCI device to disable. + * + * Context: Process context. + */ +static void ne_pci_dev_disable(struct pci_dev *pdev) +{ + u8 dev_disable_reply = 0; + struct ne_pci_dev *ne_pci_dev = pci_get_drvdata(pdev); + const unsigned int sleep_time = 10; /* 10 ms */ + unsigned int sleep_time_count = 0; + + iowrite8(NE_ENABLE_OFF, ne_pci_dev->iomem_base + NE_ENABLE); + + /* + * Check for NE_ENABLE_OFF in a loop, to handle cases when the device + * state is not immediately set to disabled and going through a + * transitory state of disabling. + */ + while (sleep_time_count < NE_DEFAULT_TIMEOUT_MSECS) { + dev_disable_reply = ioread8(ne_pci_dev->iomem_base + NE_ENABLE); + if (dev_disable_reply == NE_ENABLE_OFF) + return; + + msleep_interruptible(sleep_time); + sleep_time_count += sleep_time; + } + + dev_disable_reply = ioread8(ne_pci_dev->iomem_base + NE_ENABLE); + if (dev_disable_reply != NE_ENABLE_OFF) + dev_err(&pdev->dev, "Error in pci dev disable cmd\n"); +} + +/** + * ne_pci_probe() - Probe function for the NE PCI device. + * @pdev: PCI device to match with the NE PCI driver. + * @id : PCI device id table associated with the NE PCI driver. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct ne_pci_dev *ne_pci_dev = NULL; + int rc = -EINVAL; + + ne_pci_dev = kzalloc(sizeof(*ne_pci_dev), GFP_KERNEL); + if (!ne_pci_dev) + return -ENOMEM; + + rc = pci_enable_device(pdev); + if (rc < 0) { + dev_err(&pdev->dev, "Error in pci dev enable [rc=%d]\n", rc); + + goto free_ne_pci_dev; + } + + pci_set_master(pdev); + + rc = pci_request_regions_exclusive(pdev, "nitro_enclaves"); + if (rc < 0) { + dev_err(&pdev->dev, "Error in pci request regions [rc=%d]\n", rc); + + goto disable_pci_dev; + } + + ne_pci_dev->iomem_base = pci_iomap(pdev, PCI_BAR_NE, 0); + if (!ne_pci_dev->iomem_base) { + rc = -ENOMEM; + + dev_err(&pdev->dev, "Error in pci iomap [rc=%d]\n", rc); + + goto release_pci_regions; + } + + pci_set_drvdata(pdev, ne_pci_dev); + + rc = ne_setup_msix(pdev); + if (rc < 0) { + dev_err(&pdev->dev, "Error in pci dev msix setup [rc=%d]\n", rc); + + goto iounmap_pci_bar; + } + + ne_pci_dev_disable(pdev); + + rc = ne_pci_dev_enable(pdev); + if (rc < 0) { + dev_err(&pdev->dev, "Error in ne_pci_dev enable [rc=%d]\n", rc); + + goto teardown_msix; + } + + atomic_set(&ne_pci_dev->cmd_reply_avail, 0); + init_waitqueue_head(&ne_pci_dev->cmd_reply_wait_q); + INIT_LIST_HEAD(&ne_pci_dev->enclaves_list); + mutex_init(&ne_pci_dev->enclaves_list_mutex); + mutex_init(&ne_pci_dev->pci_dev_mutex); + ne_pci_dev->pdev = pdev; + + ne_devs.ne_pci_dev = ne_pci_dev; + + rc = misc_register(ne_devs.ne_misc_dev); + if (rc < 0) { + dev_err(&pdev->dev, "Error in misc dev register [rc=%d]\n", rc); + + goto disable_ne_pci_dev; + } + + return 0; + +disable_ne_pci_dev: + ne_devs.ne_pci_dev = NULL; + ne_pci_dev_disable(pdev); +teardown_msix: + ne_teardown_msix(pdev); +iounmap_pci_bar: + pci_set_drvdata(pdev, NULL); + pci_iounmap(pdev, ne_pci_dev->iomem_base); +release_pci_regions: + pci_release_regions(pdev); +disable_pci_dev: + pci_disable_device(pdev); +free_ne_pci_dev: + kfree(ne_pci_dev); + + return rc; +} + +/** + * ne_pci_remove() - Remove function for the NE PCI device. + * @pdev: PCI device associated with the NE PCI driver. + * + * Context: Process context. + */ +static void ne_pci_remove(struct pci_dev *pdev) +{ + struct ne_pci_dev *ne_pci_dev = pci_get_drvdata(pdev); + + misc_deregister(ne_devs.ne_misc_dev); + + ne_devs.ne_pci_dev = NULL; + + ne_pci_dev_disable(pdev); + + ne_teardown_msix(pdev); + + pci_set_drvdata(pdev, NULL); + + pci_iounmap(pdev, ne_pci_dev->iomem_base); + + pci_release_regions(pdev); + + pci_disable_device(pdev); + + kfree(ne_pci_dev); +} + +/** + * ne_pci_shutdown() - Shutdown function for the NE PCI device. + * @pdev: PCI device associated with the NE PCI driver. + * + * Context: Process context. + */ +static void ne_pci_shutdown(struct pci_dev *pdev) +{ + struct ne_pci_dev *ne_pci_dev = pci_get_drvdata(pdev); + + if (!ne_pci_dev) + return; + + misc_deregister(ne_devs.ne_misc_dev); + + ne_devs.ne_pci_dev = NULL; + + ne_pci_dev_disable(pdev); + + ne_teardown_msix(pdev); + + pci_set_drvdata(pdev, NULL); + + pci_iounmap(pdev, ne_pci_dev->iomem_base); + + pci_release_regions(pdev); + + pci_disable_device(pdev); + + kfree(ne_pci_dev); +} + +/* + * TODO: Add suspend / resume functions for power management w/ CONFIG_PM, if + * needed. + */ +/* NE PCI device driver. */ +struct pci_driver ne_pci_driver = { + .name = "nitro_enclaves", + .id_table = ne_pci_ids, + .probe = ne_pci_probe, + .remove = ne_pci_remove, + .shutdown = ne_pci_shutdown, +}; diff --git a/drivers/virt/nitro_enclaves/ne_pci_dev.h b/drivers/virt/nitro_enclaves/ne_pci_dev.h new file mode 100644 index 000000000..6e9f28971 --- /dev/null +++ b/drivers/virt/nitro_enclaves/ne_pci_dev.h @@ -0,0 +1,331 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. + */ + +#ifndef _NE_PCI_DEV_H_ +#define _NE_PCI_DEV_H_ + +#include <linux/atomic.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/pci.h> +#include <linux/pci_ids.h> +#include <linux/wait.h> + +/** + * DOC: Nitro Enclaves (NE) PCI device + */ + +/** + * PCI_DEVICE_ID_NE - Nitro Enclaves PCI device id. + */ +#define PCI_DEVICE_ID_NE (0xe4c1) +/** + * PCI_BAR_NE - Nitro Enclaves PCI device MMIO BAR. + */ +#define PCI_BAR_NE (0x03) + +/** + * DOC: Device registers in the NE PCI device MMIO BAR + */ + +/** + * NE_ENABLE - (1 byte) Register to notify the device that the driver is using + * it (Read/Write). + */ +#define NE_ENABLE (0x0000) +#define NE_ENABLE_OFF (0x00) +#define NE_ENABLE_ON (0x01) + +/** + * NE_VERSION - (2 bytes) Register to select the device run-time version + * (Read/Write). + */ +#define NE_VERSION (0x0002) +#define NE_VERSION_MAX (0x0001) + +/** + * NE_COMMAND - (4 bytes) Register to notify the device what command was + * requested (Write-Only). + */ +#define NE_COMMAND (0x0004) + +/** + * NE_EVTCNT - (4 bytes) Register to notify the driver that a reply or a device + * event is available (Read-Only): + * - Lower half - command reply counter + * - Higher half - out-of-band device event counter + */ +#define NE_EVTCNT (0x000c) +#define NE_EVTCNT_REPLY_SHIFT (0) +#define NE_EVTCNT_REPLY_MASK (0x0000ffff) +#define NE_EVTCNT_REPLY(cnt) (((cnt) & NE_EVTCNT_REPLY_MASK) >> \ + NE_EVTCNT_REPLY_SHIFT) +#define NE_EVTCNT_EVENT_SHIFT (16) +#define NE_EVTCNT_EVENT_MASK (0xffff0000) +#define NE_EVTCNT_EVENT(cnt) (((cnt) & NE_EVTCNT_EVENT_MASK) >> \ + NE_EVTCNT_EVENT_SHIFT) + +/** + * NE_SEND_DATA - (240 bytes) Buffer for sending the command request payload + * (Read/Write). + */ +#define NE_SEND_DATA (0x0010) + +/** + * NE_RECV_DATA - (240 bytes) Buffer for receiving the command reply payload + * (Read-Only). + */ +#define NE_RECV_DATA (0x0100) + +/** + * DOC: Device MMIO buffer sizes + */ + +/** + * NE_SEND_DATA_SIZE - Size of the send buffer, in bytes. + */ +#define NE_SEND_DATA_SIZE (240) + +/** + * NE_RECV_DATA_SIZE - Size of the receive buffer, in bytes. + */ +#define NE_RECV_DATA_SIZE (240) + +/** + * DOC: MSI-X interrupt vectors + */ + +/** + * NE_VEC_REPLY - MSI-X vector used for command reply notification. + */ +#define NE_VEC_REPLY (0) + +/** + * NE_VEC_EVENT - MSI-X vector used for out-of-band events e.g. enclave crash. + */ +#define NE_VEC_EVENT (1) + +/** + * enum ne_pci_dev_cmd_type - Device command types. + * @INVALID_CMD: Invalid command. + * @ENCLAVE_START: Start an enclave, after setting its resources. + * @ENCLAVE_GET_SLOT: Get the slot uid of an enclave. + * @ENCLAVE_STOP: Terminate an enclave. + * @SLOT_ALLOC : Allocate a slot for an enclave. + * @SLOT_FREE: Free the slot allocated for an enclave + * @SLOT_ADD_MEM: Add a memory region to an enclave slot. + * @SLOT_ADD_VCPU: Add a vCPU to an enclave slot. + * @SLOT_COUNT : Get the number of allocated slots. + * @NEXT_SLOT: Get the next slot in the list of allocated slots. + * @SLOT_INFO: Get the info for a slot e.g. slot uid, vCPUs count. + * @SLOT_ADD_BULK_VCPUS: Add a number of vCPUs, not providing CPU ids. + * @MAX_CMD: A gatekeeper for max possible command type. + */ +enum ne_pci_dev_cmd_type { + INVALID_CMD = 0, + ENCLAVE_START = 1, + ENCLAVE_GET_SLOT = 2, + ENCLAVE_STOP = 3, + SLOT_ALLOC = 4, + SLOT_FREE = 5, + SLOT_ADD_MEM = 6, + SLOT_ADD_VCPU = 7, + SLOT_COUNT = 8, + NEXT_SLOT = 9, + SLOT_INFO = 10, + SLOT_ADD_BULK_VCPUS = 11, + MAX_CMD, +}; + +/** + * DOC: Device commands - payload structure for requests and replies. + */ + +/** + * struct enclave_start_req - ENCLAVE_START request. + * @slot_uid: Slot unique id mapped to the enclave to start. + * @enclave_cid: Context ID (CID) for the enclave vsock device. + * If 0, CID is autogenerated. + * @flags: Flags for the enclave to start with (e.g. debug mode). + */ +struct enclave_start_req { + u64 slot_uid; + u64 enclave_cid; + u64 flags; +}; + +/** + * struct enclave_get_slot_req - ENCLAVE_GET_SLOT request. + * @enclave_cid: Context ID (CID) for the enclave vsock device. + */ +struct enclave_get_slot_req { + u64 enclave_cid; +}; + +/** + * struct enclave_stop_req - ENCLAVE_STOP request. + * @slot_uid: Slot unique id mapped to the enclave to stop. + */ +struct enclave_stop_req { + u64 slot_uid; +}; + +/** + * struct slot_alloc_req - SLOT_ALLOC request. + * @unused: In order to avoid weird sizeof edge cases. + */ +struct slot_alloc_req { + u8 unused; +}; + +/** + * struct slot_free_req - SLOT_FREE request. + * @slot_uid: Slot unique id mapped to the slot to free. + */ +struct slot_free_req { + u64 slot_uid; +}; + +/* TODO: Add flags field to the request to add memory region. */ +/** + * struct slot_add_mem_req - SLOT_ADD_MEM request. + * @slot_uid: Slot unique id mapped to the slot to add the memory region to. + * @paddr: Physical address of the memory region to add to the slot. + * @size: Memory size, in bytes, of the memory region to add to the slot. + */ +struct slot_add_mem_req { + u64 slot_uid; + u64 paddr; + u64 size; +}; + +/** + * struct slot_add_vcpu_req - SLOT_ADD_VCPU request. + * @slot_uid: Slot unique id mapped to the slot to add the vCPU to. + * @vcpu_id: vCPU ID of the CPU to add to the enclave. + * @padding: Padding for the overall data structure. + */ +struct slot_add_vcpu_req { + u64 slot_uid; + u32 vcpu_id; + u8 padding[4]; +}; + +/** + * struct slot_count_req - SLOT_COUNT request. + * @unused: In order to avoid weird sizeof edge cases. + */ +struct slot_count_req { + u8 unused; +}; + +/** + * struct next_slot_req - NEXT_SLOT request. + * @slot_uid: Slot unique id of the next slot in the iteration. + */ +struct next_slot_req { + u64 slot_uid; +}; + +/** + * struct slot_info_req - SLOT_INFO request. + * @slot_uid: Slot unique id mapped to the slot to get information about. + */ +struct slot_info_req { + u64 slot_uid; +}; + +/** + * struct slot_add_bulk_vcpus_req - SLOT_ADD_BULK_VCPUS request. + * @slot_uid: Slot unique id mapped to the slot to add vCPUs to. + * @nr_vcpus: Number of vCPUs to add to the slot. + */ +struct slot_add_bulk_vcpus_req { + u64 slot_uid; + u64 nr_vcpus; +}; + +/** + * struct ne_pci_dev_cmd_reply - NE PCI device command reply. + * @rc : Return code of the logic that processed the request. + * @padding0: Padding for the overall data structure. + * @slot_uid: Valid for all commands except SLOT_COUNT. + * @enclave_cid: Valid for ENCLAVE_START command. + * @slot_count : Valid for SLOT_COUNT command. + * @mem_regions: Valid for SLOT_ALLOC and SLOT_INFO commands. + * @mem_size: Valid for SLOT_INFO command. + * @nr_vcpus: Valid for SLOT_INFO command. + * @flags: Valid for SLOT_INFO command. + * @state: Valid for SLOT_INFO command. + * @padding1: Padding for the overall data structure. + */ +struct ne_pci_dev_cmd_reply { + s32 rc; + u8 padding0[4]; + u64 slot_uid; + u64 enclave_cid; + u64 slot_count; + u64 mem_regions; + u64 mem_size; + u64 nr_vcpus; + u64 flags; + u16 state; + u8 padding1[6]; +}; + +/** + * struct ne_pci_dev - Nitro Enclaves (NE) PCI device. + * @cmd_reply_avail: Variable set if a reply has been sent by the + * PCI device. + * @cmd_reply_wait_q: Wait queue for handling command reply from the + * PCI device. + * @enclaves_list: List of the enclaves managed by the PCI device. + * @enclaves_list_mutex: Mutex for accessing the list of enclaves. + * @event_wq: Work queue for handling out-of-band events + * triggered by the Nitro Hypervisor which require + * enclave state scanning and propagation to the + * enclave process. + * @iomem_base : MMIO region of the PCI device. + * @notify_work: Work item for every received out-of-band event. + * @pci_dev_mutex: Mutex for accessing the PCI device MMIO space. + * @pdev: PCI device data structure. + */ +struct ne_pci_dev { + atomic_t cmd_reply_avail; + wait_queue_head_t cmd_reply_wait_q; + struct list_head enclaves_list; + struct mutex enclaves_list_mutex; + struct workqueue_struct *event_wq; + void __iomem *iomem_base; + struct work_struct notify_work; + struct mutex pci_dev_mutex; + struct pci_dev *pdev; +}; + +/** + * ne_do_request() - Submit command request to the PCI device based on the command + * type and retrieve the associated reply. + * @pdev: PCI device to send the command to and receive the reply from. + * @cmd_type: Command type of the request sent to the PCI device. + * @cmd_request: Command request payload. + * @cmd_request_size: Size of the command request payload. + * @cmd_reply: Command reply payload. + * @cmd_reply_size: Size of the command reply payload. + * + * Context: Process context. This function uses the ne_pci_dev mutex to handle + * one command at a time. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +int ne_do_request(struct pci_dev *pdev, enum ne_pci_dev_cmd_type cmd_type, + void *cmd_request, size_t cmd_request_size, + struct ne_pci_dev_cmd_reply *cmd_reply, + size_t cmd_reply_size); + +/* Nitro Enclaves (NE) PCI device driver */ +extern struct pci_driver ne_pci_driver; + +#endif /* _NE_PCI_DEV_H_ */ diff --git a/drivers/virt/vboxguest/Kconfig b/drivers/virt/vboxguest/Kconfig new file mode 100644 index 000000000..cc329887b --- /dev/null +++ b/drivers/virt/vboxguest/Kconfig @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0-only +config VBOXGUEST + tristate "Virtual Box Guest integration support" + depends on X86 && PCI && INPUT + help + This is a driver for the Virtual Box Guest PCI device used in + Virtual Box virtual machines. Enabling this driver will add + support for Virtual Box Guest integration features such as + copy-and-paste, seamless mode and OpenGL pass-through. + + This driver also offers vboxguest IPC functionality which is needed + for the vboxfs driver which offers folder sharing support. + + If you enable this driver you should also enable the VBOXVIDEO option. + + Although it is possible to build this module in, it is advised + to build this driver as a module, so that it can be updated + independently of the kernel. Select M to build this driver as a + module. diff --git a/drivers/virt/vboxguest/Makefile b/drivers/virt/vboxguest/Makefile new file mode 100644 index 000000000..804279216 --- /dev/null +++ b/drivers/virt/vboxguest/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +vboxguest-y := vboxguest_linux.o vboxguest_core.o vboxguest_utils.o + +obj-$(CONFIG_VBOXGUEST) += vboxguest.o diff --git a/drivers/virt/vboxguest/vboxguest_core.c b/drivers/virt/vboxguest/vboxguest_core.c new file mode 100644 index 000000000..dfd69bd77 --- /dev/null +++ b/drivers/virt/vboxguest/vboxguest_core.c @@ -0,0 +1,1826 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR CDDL-1.0) */ +/* + * vboxguest core guest-device handling code, VBoxGuest.cpp in upstream svn. + * + * Copyright (C) 2007-2016 Oracle Corporation + */ + +#include <linux/device.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/vbox_err.h> +#include <linux/vbox_utils.h> +#include <linux/vmalloc.h> +#include "vboxguest_core.h" +#include "vboxguest_version.h" + +/* Get the pointer to the first HGCM parameter. */ +#define VBG_IOCTL_HGCM_CALL_PARMS(a) \ + ((struct vmmdev_hgcm_function_parameter *)( \ + (u8 *)(a) + sizeof(struct vbg_ioctl_hgcm_call))) +/* Get the pointer to the first HGCM parameter in a 32-bit request. */ +#define VBG_IOCTL_HGCM_CALL_PARMS32(a) \ + ((struct vmmdev_hgcm_function_parameter32 *)( \ + (u8 *)(a) + sizeof(struct vbg_ioctl_hgcm_call))) + +#define GUEST_MAPPINGS_TRIES 5 + +#define VBG_KERNEL_REQUEST \ + (VMMDEV_REQUESTOR_KERNEL | VMMDEV_REQUESTOR_USR_DRV | \ + VMMDEV_REQUESTOR_CON_DONT_KNOW | VMMDEV_REQUESTOR_TRUST_NOT_GIVEN) + +/** + * Reserves memory in which the VMM can relocate any guest mappings + * that are floating around. + * + * This operation is a little bit tricky since the VMM might not accept + * just any address because of address clashes between the three contexts + * it operates in, so we try several times. + * + * Failure to reserve the guest mappings is ignored. + * + * @gdev: The Guest extension device. + */ +static void vbg_guest_mappings_init(struct vbg_dev *gdev) +{ + struct vmmdev_hypervisorinfo *req; + void *guest_mappings[GUEST_MAPPINGS_TRIES]; + struct page **pages = NULL; + u32 size, hypervisor_size; + int i, rc; + + /* Query the required space. */ + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_GET_HYPERVISOR_INFO, + VBG_KERNEL_REQUEST); + if (!req) + return; + + req->hypervisor_start = 0; + req->hypervisor_size = 0; + rc = vbg_req_perform(gdev, req); + if (rc < 0) + goto out; + + /* + * The VMM will report back if there is nothing it wants to map, like + * for instance in VT-x and AMD-V mode. + */ + if (req->hypervisor_size == 0) + goto out; + + hypervisor_size = req->hypervisor_size; + /* Add 4M so that we can align the vmap to 4MiB as the host requires. */ + size = PAGE_ALIGN(req->hypervisor_size) + SZ_4M; + + pages = kmalloc_array(size >> PAGE_SHIFT, sizeof(*pages), GFP_KERNEL); + if (!pages) + goto out; + + gdev->guest_mappings_dummy_page = alloc_page(GFP_HIGHUSER); + if (!gdev->guest_mappings_dummy_page) + goto out; + + for (i = 0; i < (size >> PAGE_SHIFT); i++) + pages[i] = gdev->guest_mappings_dummy_page; + + /* + * Try several times, the VMM might not accept some addresses because + * of address clashes between the three contexts. + */ + for (i = 0; i < GUEST_MAPPINGS_TRIES; i++) { + guest_mappings[i] = vmap(pages, (size >> PAGE_SHIFT), + VM_MAP, PAGE_KERNEL_RO); + if (!guest_mappings[i]) + break; + + req->header.request_type = VMMDEVREQ_SET_HYPERVISOR_INFO; + req->header.rc = VERR_INTERNAL_ERROR; + req->hypervisor_size = hypervisor_size; + req->hypervisor_start = + (unsigned long)PTR_ALIGN(guest_mappings[i], SZ_4M); + + rc = vbg_req_perform(gdev, req); + if (rc >= 0) { + gdev->guest_mappings = guest_mappings[i]; + break; + } + } + + /* Free vmap's from failed attempts. */ + while (--i >= 0) + vunmap(guest_mappings[i]); + + /* On failure free the dummy-page backing the vmap */ + if (!gdev->guest_mappings) { + __free_page(gdev->guest_mappings_dummy_page); + gdev->guest_mappings_dummy_page = NULL; + } + +out: + vbg_req_free(req, sizeof(*req)); + kfree(pages); +} + +/** + * Undo what vbg_guest_mappings_init did. + * + * @gdev: The Guest extension device. + */ +static void vbg_guest_mappings_exit(struct vbg_dev *gdev) +{ + struct vmmdev_hypervisorinfo *req; + int rc; + + if (!gdev->guest_mappings) + return; + + /* + * Tell the host that we're going to free the memory we reserved for + * it, the free it up. (Leak the memory if anything goes wrong here.) + */ + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_HYPERVISOR_INFO, + VBG_KERNEL_REQUEST); + if (!req) + return; + + req->hypervisor_start = 0; + req->hypervisor_size = 0; + + rc = vbg_req_perform(gdev, req); + + vbg_req_free(req, sizeof(*req)); + + if (rc < 0) { + vbg_err("%s error: %d\n", __func__, rc); + return; + } + + vunmap(gdev->guest_mappings); + gdev->guest_mappings = NULL; + + __free_page(gdev->guest_mappings_dummy_page); + gdev->guest_mappings_dummy_page = NULL; +} + +/** + * Report the guest information to the host. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + */ +static int vbg_report_guest_info(struct vbg_dev *gdev) +{ + /* + * Allocate and fill in the two guest info reports. + */ + struct vmmdev_guest_info *req1 = NULL; + struct vmmdev_guest_info2 *req2 = NULL; + int rc, ret = -ENOMEM; + + req1 = vbg_req_alloc(sizeof(*req1), VMMDEVREQ_REPORT_GUEST_INFO, + VBG_KERNEL_REQUEST); + req2 = vbg_req_alloc(sizeof(*req2), VMMDEVREQ_REPORT_GUEST_INFO2, + VBG_KERNEL_REQUEST); + if (!req1 || !req2) + goto out_free; + + req1->interface_version = VMMDEV_VERSION; + req1->os_type = VMMDEV_OSTYPE_LINUX26; +#if __BITS_PER_LONG == 64 + req1->os_type |= VMMDEV_OSTYPE_X64; +#endif + + req2->additions_major = VBG_VERSION_MAJOR; + req2->additions_minor = VBG_VERSION_MINOR; + req2->additions_build = VBG_VERSION_BUILD; + req2->additions_revision = VBG_SVN_REV; + req2->additions_features = + VMMDEV_GUEST_INFO2_ADDITIONS_FEATURES_REQUESTOR_INFO; + strscpy(req2->name, VBG_VERSION_STRING, + sizeof(req2->name)); + + /* + * There are two protocols here: + * 1. INFO2 + INFO1. Supported by >=3.2.51. + * 2. INFO1 and optionally INFO2. The old protocol. + * + * We try protocol 2 first. It will fail with VERR_NOT_SUPPORTED + * if not supported by the VMMDev (message ordering requirement). + */ + rc = vbg_req_perform(gdev, req2); + if (rc >= 0) { + rc = vbg_req_perform(gdev, req1); + } else if (rc == VERR_NOT_SUPPORTED || rc == VERR_NOT_IMPLEMENTED) { + rc = vbg_req_perform(gdev, req1); + if (rc >= 0) { + rc = vbg_req_perform(gdev, req2); + if (rc == VERR_NOT_IMPLEMENTED) + rc = VINF_SUCCESS; + } + } + ret = vbg_status_code_to_errno(rc); + +out_free: + vbg_req_free(req2, sizeof(*req2)); + vbg_req_free(req1, sizeof(*req1)); + return ret; +} + +/** + * Report the guest driver status to the host. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @active: Flag whether the driver is now active or not. + */ +static int vbg_report_driver_status(struct vbg_dev *gdev, bool active) +{ + struct vmmdev_guest_status *req; + int rc; + + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_REPORT_GUEST_STATUS, + VBG_KERNEL_REQUEST); + if (!req) + return -ENOMEM; + + req->facility = VBOXGUEST_FACILITY_TYPE_VBOXGUEST_DRIVER; + if (active) + req->status = VBOXGUEST_FACILITY_STATUS_ACTIVE; + else + req->status = VBOXGUEST_FACILITY_STATUS_INACTIVE; + req->flags = 0; + + rc = vbg_req_perform(gdev, req); + if (rc == VERR_NOT_IMPLEMENTED) /* Compatibility with older hosts. */ + rc = VINF_SUCCESS; + + vbg_req_free(req, sizeof(*req)); + + return vbg_status_code_to_errno(rc); +} + +/** + * Inflate the balloon by one chunk. The caller owns the balloon mutex. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @chunk_idx: Index of the chunk. + */ +static int vbg_balloon_inflate(struct vbg_dev *gdev, u32 chunk_idx) +{ + struct vmmdev_memballoon_change *req = gdev->mem_balloon.change_req; + struct page **pages; + int i, rc, ret; + + pages = kmalloc_array(VMMDEV_MEMORY_BALLOON_CHUNK_PAGES, + sizeof(*pages), + GFP_KERNEL | __GFP_NOWARN); + if (!pages) + return -ENOMEM; + + req->header.size = sizeof(*req); + req->inflate = true; + req->pages = VMMDEV_MEMORY_BALLOON_CHUNK_PAGES; + + for (i = 0; i < VMMDEV_MEMORY_BALLOON_CHUNK_PAGES; i++) { + pages[i] = alloc_page(GFP_KERNEL | __GFP_NOWARN); + if (!pages[i]) { + ret = -ENOMEM; + goto out_error; + } + + req->phys_page[i] = page_to_phys(pages[i]); + } + + rc = vbg_req_perform(gdev, req); + if (rc < 0) { + vbg_err("%s error, rc: %d\n", __func__, rc); + ret = vbg_status_code_to_errno(rc); + goto out_error; + } + + gdev->mem_balloon.pages[chunk_idx] = pages; + + return 0; + +out_error: + while (--i >= 0) + __free_page(pages[i]); + kfree(pages); + + return ret; +} + +/** + * Deflate the balloon by one chunk. The caller owns the balloon mutex. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @chunk_idx: Index of the chunk. + */ +static int vbg_balloon_deflate(struct vbg_dev *gdev, u32 chunk_idx) +{ + struct vmmdev_memballoon_change *req = gdev->mem_balloon.change_req; + struct page **pages = gdev->mem_balloon.pages[chunk_idx]; + int i, rc; + + req->header.size = sizeof(*req); + req->inflate = false; + req->pages = VMMDEV_MEMORY_BALLOON_CHUNK_PAGES; + + for (i = 0; i < VMMDEV_MEMORY_BALLOON_CHUNK_PAGES; i++) + req->phys_page[i] = page_to_phys(pages[i]); + + rc = vbg_req_perform(gdev, req); + if (rc < 0) { + vbg_err("%s error, rc: %d\n", __func__, rc); + return vbg_status_code_to_errno(rc); + } + + for (i = 0; i < VMMDEV_MEMORY_BALLOON_CHUNK_PAGES; i++) + __free_page(pages[i]); + kfree(pages); + gdev->mem_balloon.pages[chunk_idx] = NULL; + + return 0; +} + +/** + * Respond to VMMDEV_EVENT_BALLOON_CHANGE_REQUEST events, query the size + * the host wants the balloon to be and adjust accordingly. + */ +static void vbg_balloon_work(struct work_struct *work) +{ + struct vbg_dev *gdev = + container_of(work, struct vbg_dev, mem_balloon.work); + struct vmmdev_memballoon_info *req = gdev->mem_balloon.get_req; + u32 i, chunks; + int rc, ret; + + /* + * Setting this bit means that we request the value from the host and + * change the guest memory balloon according to the returned value. + */ + req->event_ack = VMMDEV_EVENT_BALLOON_CHANGE_REQUEST; + rc = vbg_req_perform(gdev, req); + if (rc < 0) { + vbg_err("%s error, rc: %d)\n", __func__, rc); + return; + } + + /* + * The host always returns the same maximum amount of chunks, so + * we do this once. + */ + if (!gdev->mem_balloon.max_chunks) { + gdev->mem_balloon.pages = + devm_kcalloc(gdev->dev, req->phys_mem_chunks, + sizeof(struct page **), GFP_KERNEL); + if (!gdev->mem_balloon.pages) + return; + + gdev->mem_balloon.max_chunks = req->phys_mem_chunks; + } + + chunks = req->balloon_chunks; + if (chunks > gdev->mem_balloon.max_chunks) { + vbg_err("%s: illegal balloon size %u (max=%u)\n", + __func__, chunks, gdev->mem_balloon.max_chunks); + return; + } + + if (chunks > gdev->mem_balloon.chunks) { + /* inflate */ + for (i = gdev->mem_balloon.chunks; i < chunks; i++) { + ret = vbg_balloon_inflate(gdev, i); + if (ret < 0) + return; + + gdev->mem_balloon.chunks++; + } + } else { + /* deflate */ + for (i = gdev->mem_balloon.chunks; i-- > chunks;) { + ret = vbg_balloon_deflate(gdev, i); + if (ret < 0) + return; + + gdev->mem_balloon.chunks--; + } + } +} + +/** + * Callback for heartbeat timer. + */ +static void vbg_heartbeat_timer(struct timer_list *t) +{ + struct vbg_dev *gdev = from_timer(gdev, t, heartbeat_timer); + + vbg_req_perform(gdev, gdev->guest_heartbeat_req); + mod_timer(&gdev->heartbeat_timer, + msecs_to_jiffies(gdev->heartbeat_interval_ms)); +} + +/** + * Configure the host to check guest's heartbeat + * and get heartbeat interval from the host. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @enabled: Set true to enable guest heartbeat checks on host. + */ +static int vbg_heartbeat_host_config(struct vbg_dev *gdev, bool enabled) +{ + struct vmmdev_heartbeat *req; + int rc; + + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_HEARTBEAT_CONFIGURE, + VBG_KERNEL_REQUEST); + if (!req) + return -ENOMEM; + + req->enabled = enabled; + req->interval_ns = 0; + rc = vbg_req_perform(gdev, req); + do_div(req->interval_ns, 1000000); /* ns -> ms */ + gdev->heartbeat_interval_ms = req->interval_ns; + vbg_req_free(req, sizeof(*req)); + + return vbg_status_code_to_errno(rc); +} + +/** + * Initializes the heartbeat timer. This feature may be disabled by the host. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + */ +static int vbg_heartbeat_init(struct vbg_dev *gdev) +{ + int ret; + + /* Make sure that heartbeat checking is disabled if we fail. */ + ret = vbg_heartbeat_host_config(gdev, false); + if (ret < 0) + return ret; + + ret = vbg_heartbeat_host_config(gdev, true); + if (ret < 0) + return ret; + + gdev->guest_heartbeat_req = vbg_req_alloc( + sizeof(*gdev->guest_heartbeat_req), + VMMDEVREQ_GUEST_HEARTBEAT, + VBG_KERNEL_REQUEST); + if (!gdev->guest_heartbeat_req) + return -ENOMEM; + + vbg_info("%s: Setting up heartbeat to trigger every %d milliseconds\n", + __func__, gdev->heartbeat_interval_ms); + mod_timer(&gdev->heartbeat_timer, 0); + + return 0; +} + +/** + * Cleanup hearbeat code, stop HB timer and disable host heartbeat checking. + * @gdev: The Guest extension device. + */ +static void vbg_heartbeat_exit(struct vbg_dev *gdev) +{ + del_timer_sync(&gdev->heartbeat_timer); + vbg_heartbeat_host_config(gdev, false); + vbg_req_free(gdev->guest_heartbeat_req, + sizeof(*gdev->guest_heartbeat_req)); +} + +/** + * Applies a change to the bit usage tracker. + * Return: true if the mask changed, false if not. + * @tracker: The bit usage tracker. + * @changed: The bits to change. + * @previous: The previous value of the bits. + */ +static bool vbg_track_bit_usage(struct vbg_bit_usage_tracker *tracker, + u32 changed, u32 previous) +{ + bool global_change = false; + + while (changed) { + u32 bit = ffs(changed) - 1; + u32 bitmask = BIT(bit); + + if (bitmask & previous) { + tracker->per_bit_usage[bit] -= 1; + if (tracker->per_bit_usage[bit] == 0) { + global_change = true; + tracker->mask &= ~bitmask; + } + } else { + tracker->per_bit_usage[bit] += 1; + if (tracker->per_bit_usage[bit] == 1) { + global_change = true; + tracker->mask |= bitmask; + } + } + + changed &= ~bitmask; + } + + return global_change; +} + +/** + * Init and termination worker for resetting the (host) event filter on the host + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @fixed_events: Fixed events (init time). + */ +static int vbg_reset_host_event_filter(struct vbg_dev *gdev, + u32 fixed_events) +{ + struct vmmdev_mask *req; + int rc; + + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_CTL_GUEST_FILTER_MASK, + VBG_KERNEL_REQUEST); + if (!req) + return -ENOMEM; + + req->not_mask = U32_MAX & ~fixed_events; + req->or_mask = fixed_events; + rc = vbg_req_perform(gdev, req); + if (rc < 0) + vbg_err("%s error, rc: %d\n", __func__, rc); + + vbg_req_free(req, sizeof(*req)); + return vbg_status_code_to_errno(rc); +} + +/** + * Changes the event filter mask for the given session. + * + * This is called in response to VBG_IOCTL_CHANGE_FILTER_MASK as well as to + * do session cleanup. Takes the session mutex. + * + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @session: The session. + * @or_mask: The events to add. + * @not_mask: The events to remove. + * @session_termination: Set if we're called by the session cleanup code. + * This tweaks the error handling so we perform + * proper session cleanup even if the host + * misbehaves. + */ +static int vbg_set_session_event_filter(struct vbg_dev *gdev, + struct vbg_session *session, + u32 or_mask, u32 not_mask, + bool session_termination) +{ + struct vmmdev_mask *req; + u32 changed, previous; + int rc, ret = 0; + + /* + * Allocate a request buffer before taking the spinlock, when + * the session is being terminated the requestor is the kernel, + * as we're cleaning up. + */ + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_CTL_GUEST_FILTER_MASK, + session_termination ? VBG_KERNEL_REQUEST : + session->requestor); + if (!req) { + if (!session_termination) + return -ENOMEM; + /* Ignore allocation failure, we must do session cleanup. */ + } + + mutex_lock(&gdev->session_mutex); + + /* Apply the changes to the session mask. */ + previous = session->event_filter; + session->event_filter |= or_mask; + session->event_filter &= ~not_mask; + + /* If anything actually changed, update the global usage counters. */ + changed = previous ^ session->event_filter; + if (!changed) + goto out; + + vbg_track_bit_usage(&gdev->event_filter_tracker, changed, previous); + or_mask = gdev->fixed_events | gdev->event_filter_tracker.mask; + + if (gdev->event_filter_host == or_mask || !req) + goto out; + + gdev->event_filter_host = or_mask; + req->or_mask = or_mask; + req->not_mask = ~or_mask; + rc = vbg_req_perform(gdev, req); + if (rc < 0) { + ret = vbg_status_code_to_errno(rc); + + /* Failed, roll back (unless it's session termination time). */ + gdev->event_filter_host = U32_MAX; + if (session_termination) + goto out; + + vbg_track_bit_usage(&gdev->event_filter_tracker, changed, + session->event_filter); + session->event_filter = previous; + } + +out: + mutex_unlock(&gdev->session_mutex); + vbg_req_free(req, sizeof(*req)); + + return ret; +} + +/** + * Init and termination worker for set guest capabilities to zero on the host. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + */ +static int vbg_reset_host_capabilities(struct vbg_dev *gdev) +{ + struct vmmdev_mask *req; + int rc; + + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_GUEST_CAPABILITIES, + VBG_KERNEL_REQUEST); + if (!req) + return -ENOMEM; + + req->not_mask = U32_MAX; + req->or_mask = 0; + rc = vbg_req_perform(gdev, req); + if (rc < 0) + vbg_err("%s error, rc: %d\n", __func__, rc); + + vbg_req_free(req, sizeof(*req)); + return vbg_status_code_to_errno(rc); +} + +/** + * Set guest capabilities on the host. + * Must be called with gdev->session_mutex hold. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @session: The session. + * @session_termination: Set if we're called by the session cleanup code. + */ +static int vbg_set_host_capabilities(struct vbg_dev *gdev, + struct vbg_session *session, + bool session_termination) +{ + struct vmmdev_mask *req; + u32 caps; + int rc; + + WARN_ON(!mutex_is_locked(&gdev->session_mutex)); + + caps = gdev->acquired_guest_caps | gdev->set_guest_caps_tracker.mask; + + if (gdev->guest_caps_host == caps) + return 0; + + /* On termination the requestor is the kernel, as we're cleaning up. */ + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_GUEST_CAPABILITIES, + session_termination ? VBG_KERNEL_REQUEST : + session->requestor); + if (!req) { + gdev->guest_caps_host = U32_MAX; + return -ENOMEM; + } + + req->or_mask = caps; + req->not_mask = ~caps; + rc = vbg_req_perform(gdev, req); + vbg_req_free(req, sizeof(*req)); + + gdev->guest_caps_host = (rc >= 0) ? caps : U32_MAX; + + return vbg_status_code_to_errno(rc); +} + +/** + * Acquire (get exclusive access) guest capabilities for a session. + * Takes the session mutex. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @session: The session. + * @flags: Flags (VBGL_IOC_AGC_FLAGS_XXX). + * @or_mask: The capabilities to add. + * @not_mask: The capabilities to remove. + * @session_termination: Set if we're called by the session cleanup code. + * This tweaks the error handling so we perform + * proper session cleanup even if the host + * misbehaves. + */ +static int vbg_acquire_session_capabilities(struct vbg_dev *gdev, + struct vbg_session *session, + u32 or_mask, u32 not_mask, + u32 flags, bool session_termination) +{ + unsigned long irqflags; + bool wakeup = false; + int ret = 0; + + mutex_lock(&gdev->session_mutex); + + if (gdev->set_guest_caps_tracker.mask & or_mask) { + vbg_err("%s error: cannot acquire caps which are currently set\n", + __func__); + ret = -EINVAL; + goto out; + } + + /* + * Mark any caps in the or_mask as now being in acquire-mode. Note + * once caps are in acquire_mode they always stay in this mode. + * This impacts event handling, so we take the event-lock. + */ + spin_lock_irqsave(&gdev->event_spinlock, irqflags); + gdev->acquire_mode_guest_caps |= or_mask; + spin_unlock_irqrestore(&gdev->event_spinlock, irqflags); + + /* If we only have to switch the caps to acquire mode, we're done. */ + if (flags & VBGL_IOC_AGC_FLAGS_CONFIG_ACQUIRE_MODE) + goto out; + + not_mask &= ~or_mask; /* or_mask takes priority over not_mask */ + not_mask &= session->acquired_guest_caps; + or_mask &= ~session->acquired_guest_caps; + + if (or_mask == 0 && not_mask == 0) + goto out; + + if (gdev->acquired_guest_caps & or_mask) { + ret = -EBUSY; + goto out; + } + + gdev->acquired_guest_caps |= or_mask; + gdev->acquired_guest_caps &= ~not_mask; + /* session->acquired_guest_caps impacts event handling, take the lock */ + spin_lock_irqsave(&gdev->event_spinlock, irqflags); + session->acquired_guest_caps |= or_mask; + session->acquired_guest_caps &= ~not_mask; + spin_unlock_irqrestore(&gdev->event_spinlock, irqflags); + + ret = vbg_set_host_capabilities(gdev, session, session_termination); + /* Roll back on failure, unless it's session termination time. */ + if (ret < 0 && !session_termination) { + gdev->acquired_guest_caps &= ~or_mask; + gdev->acquired_guest_caps |= not_mask; + spin_lock_irqsave(&gdev->event_spinlock, irqflags); + session->acquired_guest_caps &= ~or_mask; + session->acquired_guest_caps |= not_mask; + spin_unlock_irqrestore(&gdev->event_spinlock, irqflags); + } + + /* + * If we added a capability, check if that means some other thread in + * our session should be unblocked because there are events pending + * (the result of vbg_get_allowed_event_mask_for_session() may change). + * + * HACK ALERT! When the seamless support capability is added we generate + * a seamless change event so that the ring-3 client can sync with + * the seamless state. + */ + if (ret == 0 && or_mask != 0) { + spin_lock_irqsave(&gdev->event_spinlock, irqflags); + + if (or_mask & VMMDEV_GUEST_SUPPORTS_SEAMLESS) + gdev->pending_events |= + VMMDEV_EVENT_SEAMLESS_MODE_CHANGE_REQUEST; + + if (gdev->pending_events) + wakeup = true; + + spin_unlock_irqrestore(&gdev->event_spinlock, irqflags); + + if (wakeup) + wake_up(&gdev->event_wq); + } + +out: + mutex_unlock(&gdev->session_mutex); + + return ret; +} + +/** + * Sets the guest capabilities for a session. Takes the session mutex. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @session: The session. + * @or_mask: The capabilities to add. + * @not_mask: The capabilities to remove. + * @session_termination: Set if we're called by the session cleanup code. + * This tweaks the error handling so we perform + * proper session cleanup even if the host + * misbehaves. + */ +static int vbg_set_session_capabilities(struct vbg_dev *gdev, + struct vbg_session *session, + u32 or_mask, u32 not_mask, + bool session_termination) +{ + u32 changed, previous; + int ret = 0; + + mutex_lock(&gdev->session_mutex); + + if (gdev->acquire_mode_guest_caps & or_mask) { + vbg_err("%s error: cannot set caps which are in acquire_mode\n", + __func__); + ret = -EBUSY; + goto out; + } + + /* Apply the changes to the session mask. */ + previous = session->set_guest_caps; + session->set_guest_caps |= or_mask; + session->set_guest_caps &= ~not_mask; + + /* If anything actually changed, update the global usage counters. */ + changed = previous ^ session->set_guest_caps; + if (!changed) + goto out; + + vbg_track_bit_usage(&gdev->set_guest_caps_tracker, changed, previous); + + ret = vbg_set_host_capabilities(gdev, session, session_termination); + /* Roll back on failure, unless it's session termination time. */ + if (ret < 0 && !session_termination) { + vbg_track_bit_usage(&gdev->set_guest_caps_tracker, changed, + session->set_guest_caps); + session->set_guest_caps = previous; + } + +out: + mutex_unlock(&gdev->session_mutex); + + return ret; +} + +/** + * vbg_query_host_version get the host feature mask and version information. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + */ +static int vbg_query_host_version(struct vbg_dev *gdev) +{ + struct vmmdev_host_version *req; + int rc, ret; + + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_GET_HOST_VERSION, + VBG_KERNEL_REQUEST); + if (!req) + return -ENOMEM; + + rc = vbg_req_perform(gdev, req); + ret = vbg_status_code_to_errno(rc); + if (ret) { + vbg_err("%s error: %d\n", __func__, rc); + goto out; + } + + snprintf(gdev->host_version, sizeof(gdev->host_version), "%u.%u.%ur%u", + req->major, req->minor, req->build, req->revision); + gdev->host_features = req->features; + + vbg_info("vboxguest: host-version: %s %#x\n", gdev->host_version, + gdev->host_features); + + if (!(req->features & VMMDEV_HVF_HGCM_PHYS_PAGE_LIST)) { + vbg_err("vboxguest: Error host too old (does not support page-lists)\n"); + ret = -ENODEV; + } + +out: + vbg_req_free(req, sizeof(*req)); + return ret; +} + +/** + * Initializes the VBoxGuest device extension when the + * device driver is loaded. + * + * The native code locates the VMMDev on the PCI bus and retrieve + * the MMIO and I/O port ranges, this function will take care of + * mapping the MMIO memory (if present). Upon successful return + * the native code should set up the interrupt handler. + * + * Return: 0 or negative errno value. + * + * @gdev: The Guest extension device. + * @fixed_events: Events that will be enabled upon init and no client + * will ever be allowed to mask. + */ +int vbg_core_init(struct vbg_dev *gdev, u32 fixed_events) +{ + int ret = -ENOMEM; + + gdev->fixed_events = fixed_events | VMMDEV_EVENT_HGCM; + gdev->event_filter_host = U32_MAX; /* forces a report */ + gdev->guest_caps_host = U32_MAX; /* forces a report */ + + init_waitqueue_head(&gdev->event_wq); + init_waitqueue_head(&gdev->hgcm_wq); + spin_lock_init(&gdev->event_spinlock); + mutex_init(&gdev->session_mutex); + mutex_init(&gdev->cancel_req_mutex); + timer_setup(&gdev->heartbeat_timer, vbg_heartbeat_timer, 0); + INIT_WORK(&gdev->mem_balloon.work, vbg_balloon_work); + + gdev->mem_balloon.get_req = + vbg_req_alloc(sizeof(*gdev->mem_balloon.get_req), + VMMDEVREQ_GET_MEMBALLOON_CHANGE_REQ, + VBG_KERNEL_REQUEST); + gdev->mem_balloon.change_req = + vbg_req_alloc(sizeof(*gdev->mem_balloon.change_req), + VMMDEVREQ_CHANGE_MEMBALLOON, + VBG_KERNEL_REQUEST); + gdev->cancel_req = + vbg_req_alloc(sizeof(*(gdev->cancel_req)), + VMMDEVREQ_HGCM_CANCEL2, + VBG_KERNEL_REQUEST); + gdev->ack_events_req = + vbg_req_alloc(sizeof(*gdev->ack_events_req), + VMMDEVREQ_ACKNOWLEDGE_EVENTS, + VBG_KERNEL_REQUEST); + gdev->mouse_status_req = + vbg_req_alloc(sizeof(*gdev->mouse_status_req), + VMMDEVREQ_GET_MOUSE_STATUS, + VBG_KERNEL_REQUEST); + + if (!gdev->mem_balloon.get_req || !gdev->mem_balloon.change_req || + !gdev->cancel_req || !gdev->ack_events_req || + !gdev->mouse_status_req) + goto err_free_reqs; + + ret = vbg_query_host_version(gdev); + if (ret) + goto err_free_reqs; + + ret = vbg_report_guest_info(gdev); + if (ret) { + vbg_err("vboxguest: vbg_report_guest_info error: %d\n", ret); + goto err_free_reqs; + } + + ret = vbg_reset_host_event_filter(gdev, gdev->fixed_events); + if (ret) { + vbg_err("vboxguest: Error setting fixed event filter: %d\n", + ret); + goto err_free_reqs; + } + + ret = vbg_reset_host_capabilities(gdev); + if (ret) { + vbg_err("vboxguest: Error clearing guest capabilities: %d\n", + ret); + goto err_free_reqs; + } + + ret = vbg_core_set_mouse_status(gdev, 0); + if (ret) { + vbg_err("vboxguest: Error clearing mouse status: %d\n", ret); + goto err_free_reqs; + } + + /* These may fail without requiring the driver init to fail. */ + vbg_guest_mappings_init(gdev); + vbg_heartbeat_init(gdev); + + /* All Done! */ + ret = vbg_report_driver_status(gdev, true); + if (ret < 0) + vbg_err("vboxguest: Error reporting driver status: %d\n", ret); + + return 0; + +err_free_reqs: + vbg_req_free(gdev->mouse_status_req, + sizeof(*gdev->mouse_status_req)); + vbg_req_free(gdev->ack_events_req, + sizeof(*gdev->ack_events_req)); + vbg_req_free(gdev->cancel_req, + sizeof(*gdev->cancel_req)); + vbg_req_free(gdev->mem_balloon.change_req, + sizeof(*gdev->mem_balloon.change_req)); + vbg_req_free(gdev->mem_balloon.get_req, + sizeof(*gdev->mem_balloon.get_req)); + return ret; +} + +/** + * Call this on exit to clean-up vboxguest-core managed resources. + * + * The native code should call this before the driver is loaded, + * but don't call this on shutdown. + * @gdev: The Guest extension device. + */ +void vbg_core_exit(struct vbg_dev *gdev) +{ + vbg_heartbeat_exit(gdev); + vbg_guest_mappings_exit(gdev); + + /* Clear the host flags (mouse status etc). */ + vbg_reset_host_event_filter(gdev, 0); + vbg_reset_host_capabilities(gdev); + vbg_core_set_mouse_status(gdev, 0); + + vbg_req_free(gdev->mouse_status_req, + sizeof(*gdev->mouse_status_req)); + vbg_req_free(gdev->ack_events_req, + sizeof(*gdev->ack_events_req)); + vbg_req_free(gdev->cancel_req, + sizeof(*gdev->cancel_req)); + vbg_req_free(gdev->mem_balloon.change_req, + sizeof(*gdev->mem_balloon.change_req)); + vbg_req_free(gdev->mem_balloon.get_req, + sizeof(*gdev->mem_balloon.get_req)); +} + +/** + * Creates a VBoxGuest user session. + * + * vboxguest_linux.c calls this when userspace opens the char-device. + * Return: A pointer to the new session or an ERR_PTR on error. + * @gdev: The Guest extension device. + * @requestor: VMMDEV_REQUESTOR_* flags + */ +struct vbg_session *vbg_core_open_session(struct vbg_dev *gdev, u32 requestor) +{ + struct vbg_session *session; + + session = kzalloc(sizeof(*session), GFP_KERNEL); + if (!session) + return ERR_PTR(-ENOMEM); + + session->gdev = gdev; + session->requestor = requestor; + + return session; +} + +/** + * Closes a VBoxGuest session. + * @session: The session to close (and free). + */ +void vbg_core_close_session(struct vbg_session *session) +{ + struct vbg_dev *gdev = session->gdev; + int i, rc; + + vbg_acquire_session_capabilities(gdev, session, 0, U32_MAX, 0, true); + vbg_set_session_capabilities(gdev, session, 0, U32_MAX, true); + vbg_set_session_event_filter(gdev, session, 0, U32_MAX, true); + + for (i = 0; i < ARRAY_SIZE(session->hgcm_client_ids); i++) { + if (!session->hgcm_client_ids[i]) + continue; + + /* requestor is kernel here, as we're cleaning up. */ + vbg_hgcm_disconnect(gdev, VBG_KERNEL_REQUEST, + session->hgcm_client_ids[i], &rc); + } + + kfree(session); +} + +static int vbg_ioctl_chk(struct vbg_ioctl_hdr *hdr, size_t in_size, + size_t out_size) +{ + if (hdr->size_in != (sizeof(*hdr) + in_size) || + hdr->size_out != (sizeof(*hdr) + out_size)) + return -EINVAL; + + return 0; +} + +static int vbg_ioctl_driver_version_info( + struct vbg_ioctl_driver_version_info *info) +{ + const u16 vbg_maj_version = VBG_IOC_VERSION >> 16; + u16 min_maj_version, req_maj_version; + + if (vbg_ioctl_chk(&info->hdr, sizeof(info->u.in), sizeof(info->u.out))) + return -EINVAL; + + req_maj_version = info->u.in.req_version >> 16; + min_maj_version = info->u.in.min_version >> 16; + + if (info->u.in.min_version > info->u.in.req_version || + min_maj_version != req_maj_version) + return -EINVAL; + + if (info->u.in.min_version <= VBG_IOC_VERSION && + min_maj_version == vbg_maj_version) { + info->u.out.session_version = VBG_IOC_VERSION; + } else { + info->u.out.session_version = U32_MAX; + info->hdr.rc = VERR_VERSION_MISMATCH; + } + + info->u.out.driver_version = VBG_IOC_VERSION; + info->u.out.driver_revision = 0; + info->u.out.reserved1 = 0; + info->u.out.reserved2 = 0; + + return 0; +} + +/* Must be called with the event_lock held */ +static u32 vbg_get_allowed_event_mask_for_session(struct vbg_dev *gdev, + struct vbg_session *session) +{ + u32 acquire_mode_caps = gdev->acquire_mode_guest_caps; + u32 session_acquired_caps = session->acquired_guest_caps; + u32 allowed_events = VMMDEV_EVENT_VALID_EVENT_MASK; + + if ((acquire_mode_caps & VMMDEV_GUEST_SUPPORTS_GRAPHICS) && + !(session_acquired_caps & VMMDEV_GUEST_SUPPORTS_GRAPHICS)) + allowed_events &= ~VMMDEV_EVENT_DISPLAY_CHANGE_REQUEST; + + if ((acquire_mode_caps & VMMDEV_GUEST_SUPPORTS_SEAMLESS) && + !(session_acquired_caps & VMMDEV_GUEST_SUPPORTS_SEAMLESS)) + allowed_events &= ~VMMDEV_EVENT_SEAMLESS_MODE_CHANGE_REQUEST; + + return allowed_events; +} + +static bool vbg_wait_event_cond(struct vbg_dev *gdev, + struct vbg_session *session, + u32 event_mask) +{ + unsigned long flags; + bool wakeup; + u32 events; + + spin_lock_irqsave(&gdev->event_spinlock, flags); + + events = gdev->pending_events & event_mask; + events &= vbg_get_allowed_event_mask_for_session(gdev, session); + wakeup = events || session->cancel_waiters; + + spin_unlock_irqrestore(&gdev->event_spinlock, flags); + + return wakeup; +} + +/* Must be called with the event_lock held */ +static u32 vbg_consume_events_locked(struct vbg_dev *gdev, + struct vbg_session *session, + u32 event_mask) +{ + u32 events = gdev->pending_events & event_mask; + + events &= vbg_get_allowed_event_mask_for_session(gdev, session); + gdev->pending_events &= ~events; + return events; +} + +static int vbg_ioctl_wait_for_events(struct vbg_dev *gdev, + struct vbg_session *session, + struct vbg_ioctl_wait_for_events *wait) +{ + u32 timeout_ms = wait->u.in.timeout_ms; + u32 event_mask = wait->u.in.events; + unsigned long flags; + long timeout; + int ret = 0; + + if (vbg_ioctl_chk(&wait->hdr, sizeof(wait->u.in), sizeof(wait->u.out))) + return -EINVAL; + + if (timeout_ms == U32_MAX) + timeout = MAX_SCHEDULE_TIMEOUT; + else + timeout = msecs_to_jiffies(timeout_ms); + + wait->u.out.events = 0; + do { + timeout = wait_event_interruptible_timeout( + gdev->event_wq, + vbg_wait_event_cond(gdev, session, event_mask), + timeout); + + spin_lock_irqsave(&gdev->event_spinlock, flags); + + if (timeout < 0 || session->cancel_waiters) { + ret = -EINTR; + } else if (timeout == 0) { + ret = -ETIMEDOUT; + } else { + wait->u.out.events = + vbg_consume_events_locked(gdev, session, event_mask); + } + + spin_unlock_irqrestore(&gdev->event_spinlock, flags); + + /* + * Someone else may have consumed the event(s) first, in + * which case we go back to waiting. + */ + } while (ret == 0 && wait->u.out.events == 0); + + return ret; +} + +static int vbg_ioctl_interrupt_all_wait_events(struct vbg_dev *gdev, + struct vbg_session *session, + struct vbg_ioctl_hdr *hdr) +{ + unsigned long flags; + + if (hdr->size_in != sizeof(*hdr) || hdr->size_out != sizeof(*hdr)) + return -EINVAL; + + spin_lock_irqsave(&gdev->event_spinlock, flags); + session->cancel_waiters = true; + spin_unlock_irqrestore(&gdev->event_spinlock, flags); + + wake_up(&gdev->event_wq); + + return 0; +} + +/** + * Checks if the VMM request is allowed in the context of the given session. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @session: The calling session. + * @req: The request. + */ +static int vbg_req_allowed(struct vbg_dev *gdev, struct vbg_session *session, + const struct vmmdev_request_header *req) +{ + const struct vmmdev_guest_status *guest_status; + bool trusted_apps_only; + + switch (req->request_type) { + /* Trusted users apps only. */ + case VMMDEVREQ_QUERY_CREDENTIALS: + case VMMDEVREQ_REPORT_CREDENTIALS_JUDGEMENT: + case VMMDEVREQ_REGISTER_SHARED_MODULE: + case VMMDEVREQ_UNREGISTER_SHARED_MODULE: + case VMMDEVREQ_WRITE_COREDUMP: + case VMMDEVREQ_GET_CPU_HOTPLUG_REQ: + case VMMDEVREQ_SET_CPU_HOTPLUG_STATUS: + case VMMDEVREQ_CHECK_SHARED_MODULES: + case VMMDEVREQ_GET_PAGE_SHARING_STATUS: + case VMMDEVREQ_DEBUG_IS_PAGE_SHARED: + case VMMDEVREQ_REPORT_GUEST_STATS: + case VMMDEVREQ_REPORT_GUEST_USER_STATE: + case VMMDEVREQ_GET_STATISTICS_CHANGE_REQ: + trusted_apps_only = true; + break; + + /* Anyone. */ + case VMMDEVREQ_GET_MOUSE_STATUS: + case VMMDEVREQ_SET_MOUSE_STATUS: + case VMMDEVREQ_SET_POINTER_SHAPE: + case VMMDEVREQ_GET_HOST_VERSION: + case VMMDEVREQ_IDLE: + case VMMDEVREQ_GET_HOST_TIME: + case VMMDEVREQ_SET_POWER_STATUS: + case VMMDEVREQ_ACKNOWLEDGE_EVENTS: + case VMMDEVREQ_CTL_GUEST_FILTER_MASK: + case VMMDEVREQ_REPORT_GUEST_STATUS: + case VMMDEVREQ_GET_DISPLAY_CHANGE_REQ: + case VMMDEVREQ_VIDEMODE_SUPPORTED: + case VMMDEVREQ_GET_HEIGHT_REDUCTION: + case VMMDEVREQ_GET_DISPLAY_CHANGE_REQ2: + case VMMDEVREQ_VIDEMODE_SUPPORTED2: + case VMMDEVREQ_VIDEO_ACCEL_ENABLE: + case VMMDEVREQ_VIDEO_ACCEL_FLUSH: + case VMMDEVREQ_VIDEO_SET_VISIBLE_REGION: + case VMMDEVREQ_VIDEO_UPDATE_MONITOR_POSITIONS: + case VMMDEVREQ_GET_DISPLAY_CHANGE_REQEX: + case VMMDEVREQ_GET_DISPLAY_CHANGE_REQ_MULTI: + case VMMDEVREQ_GET_SEAMLESS_CHANGE_REQ: + case VMMDEVREQ_GET_VRDPCHANGE_REQ: + case VMMDEVREQ_LOG_STRING: + case VMMDEVREQ_GET_SESSION_ID: + trusted_apps_only = false; + break; + + /* Depends on the request parameters... */ + case VMMDEVREQ_REPORT_GUEST_CAPABILITIES: + guest_status = (const struct vmmdev_guest_status *)req; + switch (guest_status->facility) { + case VBOXGUEST_FACILITY_TYPE_ALL: + case VBOXGUEST_FACILITY_TYPE_VBOXGUEST_DRIVER: + vbg_err("Denying userspace vmm report guest cap. call facility %#08x\n", + guest_status->facility); + return -EPERM; + case VBOXGUEST_FACILITY_TYPE_VBOX_SERVICE: + trusted_apps_only = true; + break; + case VBOXGUEST_FACILITY_TYPE_VBOX_TRAY_CLIENT: + case VBOXGUEST_FACILITY_TYPE_SEAMLESS: + case VBOXGUEST_FACILITY_TYPE_GRAPHICS: + default: + trusted_apps_only = false; + break; + } + break; + + /* Anything else is not allowed. */ + default: + vbg_err("Denying userspace vmm call type %#08x\n", + req->request_type); + return -EPERM; + } + + if (trusted_apps_only && + (session->requestor & VMMDEV_REQUESTOR_USER_DEVICE)) { + vbg_err("Denying userspace vmm call type %#08x through vboxuser device node\n", + req->request_type); + return -EPERM; + } + + return 0; +} + +static int vbg_ioctl_vmmrequest(struct vbg_dev *gdev, + struct vbg_session *session, void *data) +{ + struct vbg_ioctl_hdr *hdr = data; + int ret; + + if (hdr->size_in != hdr->size_out) + return -EINVAL; + + if (hdr->size_in > VMMDEV_MAX_VMMDEVREQ_SIZE) + return -E2BIG; + + if (hdr->type == VBG_IOCTL_HDR_TYPE_DEFAULT) + return -EINVAL; + + ret = vbg_req_allowed(gdev, session, data); + if (ret < 0) + return ret; + + vbg_req_perform(gdev, data); + WARN_ON(hdr->rc == VINF_HGCM_ASYNC_EXECUTE); + + return 0; +} + +static int vbg_ioctl_hgcm_connect(struct vbg_dev *gdev, + struct vbg_session *session, + struct vbg_ioctl_hgcm_connect *conn) +{ + u32 client_id; + int i, ret; + + if (vbg_ioctl_chk(&conn->hdr, sizeof(conn->u.in), sizeof(conn->u.out))) + return -EINVAL; + + /* Find a free place in the sessions clients array and claim it */ + mutex_lock(&gdev->session_mutex); + for (i = 0; i < ARRAY_SIZE(session->hgcm_client_ids); i++) { + if (!session->hgcm_client_ids[i]) { + session->hgcm_client_ids[i] = U32_MAX; + break; + } + } + mutex_unlock(&gdev->session_mutex); + + if (i >= ARRAY_SIZE(session->hgcm_client_ids)) + return -EMFILE; + + ret = vbg_hgcm_connect(gdev, session->requestor, &conn->u.in.loc, + &client_id, &conn->hdr.rc); + + mutex_lock(&gdev->session_mutex); + if (ret == 0 && conn->hdr.rc >= 0) { + conn->u.out.client_id = client_id; + session->hgcm_client_ids[i] = client_id; + } else { + conn->u.out.client_id = 0; + session->hgcm_client_ids[i] = 0; + } + mutex_unlock(&gdev->session_mutex); + + return ret; +} + +static int vbg_ioctl_hgcm_disconnect(struct vbg_dev *gdev, + struct vbg_session *session, + struct vbg_ioctl_hgcm_disconnect *disconn) +{ + u32 client_id; + int i, ret; + + if (vbg_ioctl_chk(&disconn->hdr, sizeof(disconn->u.in), 0)) + return -EINVAL; + + client_id = disconn->u.in.client_id; + if (client_id == 0 || client_id == U32_MAX) + return -EINVAL; + + mutex_lock(&gdev->session_mutex); + for (i = 0; i < ARRAY_SIZE(session->hgcm_client_ids); i++) { + if (session->hgcm_client_ids[i] == client_id) { + session->hgcm_client_ids[i] = U32_MAX; + break; + } + } + mutex_unlock(&gdev->session_mutex); + + if (i >= ARRAY_SIZE(session->hgcm_client_ids)) + return -EINVAL; + + ret = vbg_hgcm_disconnect(gdev, session->requestor, client_id, + &disconn->hdr.rc); + + mutex_lock(&gdev->session_mutex); + if (ret == 0 && disconn->hdr.rc >= 0) + session->hgcm_client_ids[i] = 0; + else + session->hgcm_client_ids[i] = client_id; + mutex_unlock(&gdev->session_mutex); + + return ret; +} + +static bool vbg_param_valid(enum vmmdev_hgcm_function_parameter_type type) +{ + switch (type) { + case VMMDEV_HGCM_PARM_TYPE_32BIT: + case VMMDEV_HGCM_PARM_TYPE_64BIT: + case VMMDEV_HGCM_PARM_TYPE_LINADDR: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_IN: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_OUT: + return true; + default: + return false; + } +} + +static int vbg_ioctl_hgcm_call(struct vbg_dev *gdev, + struct vbg_session *session, bool f32bit, + struct vbg_ioctl_hgcm_call *call) +{ + size_t actual_size; + u32 client_id; + int i, ret; + + if (call->hdr.size_in < sizeof(*call)) + return -EINVAL; + + if (call->hdr.size_in != call->hdr.size_out) + return -EINVAL; + + if (call->parm_count > VMMDEV_HGCM_MAX_PARMS) + return -E2BIG; + + client_id = call->client_id; + if (client_id == 0 || client_id == U32_MAX) + return -EINVAL; + + actual_size = sizeof(*call); + if (f32bit) + actual_size += call->parm_count * + sizeof(struct vmmdev_hgcm_function_parameter32); + else + actual_size += call->parm_count * + sizeof(struct vmmdev_hgcm_function_parameter); + if (call->hdr.size_in < actual_size) { + vbg_debug("VBG_IOCTL_HGCM_CALL: hdr.size_in %d required size is %zd\n", + call->hdr.size_in, actual_size); + return -EINVAL; + } + call->hdr.size_out = actual_size; + + /* Validate parameter types */ + if (f32bit) { + struct vmmdev_hgcm_function_parameter32 *parm = + VBG_IOCTL_HGCM_CALL_PARMS32(call); + + for (i = 0; i < call->parm_count; i++) + if (!vbg_param_valid(parm[i].type)) + return -EINVAL; + } else { + struct vmmdev_hgcm_function_parameter *parm = + VBG_IOCTL_HGCM_CALL_PARMS(call); + + for (i = 0; i < call->parm_count; i++) + if (!vbg_param_valid(parm[i].type)) + return -EINVAL; + } + + /* + * Validate the client id. + */ + mutex_lock(&gdev->session_mutex); + for (i = 0; i < ARRAY_SIZE(session->hgcm_client_ids); i++) + if (session->hgcm_client_ids[i] == client_id) + break; + mutex_unlock(&gdev->session_mutex); + if (i >= ARRAY_SIZE(session->hgcm_client_ids)) { + vbg_debug("VBG_IOCTL_HGCM_CALL: INVALID handle. u32Client=%#08x\n", + client_id); + return -EINVAL; + } + + if (IS_ENABLED(CONFIG_COMPAT) && f32bit) + ret = vbg_hgcm_call32(gdev, session->requestor, client_id, + call->function, call->timeout_ms, + VBG_IOCTL_HGCM_CALL_PARMS32(call), + call->parm_count, &call->hdr.rc); + else + ret = vbg_hgcm_call(gdev, session->requestor, client_id, + call->function, call->timeout_ms, + VBG_IOCTL_HGCM_CALL_PARMS(call), + call->parm_count, &call->hdr.rc); + + if (ret == -E2BIG) { + /* E2BIG needs to be reported through the hdr.rc field. */ + call->hdr.rc = VERR_OUT_OF_RANGE; + ret = 0; + } + + if (ret && ret != -EINTR && ret != -ETIMEDOUT) + vbg_err("VBG_IOCTL_HGCM_CALL error: %d\n", ret); + + return ret; +} + +static int vbg_ioctl_log(struct vbg_ioctl_log *log) +{ + if (log->hdr.size_out != sizeof(log->hdr)) + return -EINVAL; + + vbg_info("%.*s", (int)(log->hdr.size_in - sizeof(log->hdr)), + log->u.in.msg); + + return 0; +} + +static int vbg_ioctl_change_filter_mask(struct vbg_dev *gdev, + struct vbg_session *session, + struct vbg_ioctl_change_filter *filter) +{ + u32 or_mask, not_mask; + + if (vbg_ioctl_chk(&filter->hdr, sizeof(filter->u.in), 0)) + return -EINVAL; + + or_mask = filter->u.in.or_mask; + not_mask = filter->u.in.not_mask; + + if ((or_mask | not_mask) & ~VMMDEV_EVENT_VALID_EVENT_MASK) + return -EINVAL; + + return vbg_set_session_event_filter(gdev, session, or_mask, not_mask, + false); +} + +static int vbg_ioctl_acquire_guest_capabilities(struct vbg_dev *gdev, + struct vbg_session *session, + struct vbg_ioctl_acquire_guest_caps *caps) +{ + u32 flags, or_mask, not_mask; + + if (vbg_ioctl_chk(&caps->hdr, sizeof(caps->u.in), 0)) + return -EINVAL; + + flags = caps->u.in.flags; + or_mask = caps->u.in.or_mask; + not_mask = caps->u.in.not_mask; + + if (flags & ~VBGL_IOC_AGC_FLAGS_VALID_MASK) + return -EINVAL; + + if ((or_mask | not_mask) & ~VMMDEV_GUEST_CAPABILITIES_MASK) + return -EINVAL; + + return vbg_acquire_session_capabilities(gdev, session, or_mask, + not_mask, flags, false); +} + +static int vbg_ioctl_change_guest_capabilities(struct vbg_dev *gdev, + struct vbg_session *session, struct vbg_ioctl_set_guest_caps *caps) +{ + u32 or_mask, not_mask; + int ret; + + if (vbg_ioctl_chk(&caps->hdr, sizeof(caps->u.in), sizeof(caps->u.out))) + return -EINVAL; + + or_mask = caps->u.in.or_mask; + not_mask = caps->u.in.not_mask; + + if ((or_mask | not_mask) & ~VMMDEV_GUEST_CAPABILITIES_MASK) + return -EINVAL; + + ret = vbg_set_session_capabilities(gdev, session, or_mask, not_mask, + false); + if (ret) + return ret; + + caps->u.out.session_caps = session->set_guest_caps; + caps->u.out.global_caps = gdev->guest_caps_host; + + return 0; +} + +static int vbg_ioctl_check_balloon(struct vbg_dev *gdev, + struct vbg_ioctl_check_balloon *balloon_info) +{ + if (vbg_ioctl_chk(&balloon_info->hdr, 0, sizeof(balloon_info->u.out))) + return -EINVAL; + + balloon_info->u.out.balloon_chunks = gdev->mem_balloon.chunks; + /* + * Under Linux we handle VMMDEV_EVENT_BALLOON_CHANGE_REQUEST + * events entirely in the kernel, see vbg_core_isr(). + */ + balloon_info->u.out.handle_in_r3 = false; + + return 0; +} + +static int vbg_ioctl_write_core_dump(struct vbg_dev *gdev, + struct vbg_session *session, + struct vbg_ioctl_write_coredump *dump) +{ + struct vmmdev_write_core_dump *req; + + if (vbg_ioctl_chk(&dump->hdr, sizeof(dump->u.in), 0)) + return -EINVAL; + + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_WRITE_COREDUMP, + session->requestor); + if (!req) + return -ENOMEM; + + req->flags = dump->u.in.flags; + dump->hdr.rc = vbg_req_perform(gdev, req); + + vbg_req_free(req, sizeof(*req)); + return 0; +} + +/** + * Common IOCtl for user to kernel communication. + * Return: 0 or negative errno value. + * @session: The client session. + * @req: The requested function. + * @data: The i/o data buffer, minimum size sizeof(struct vbg_ioctl_hdr). + */ +int vbg_core_ioctl(struct vbg_session *session, unsigned int req, void *data) +{ + unsigned int req_no_size = req & ~IOCSIZE_MASK; + struct vbg_dev *gdev = session->gdev; + struct vbg_ioctl_hdr *hdr = data; + bool f32bit = false; + + hdr->rc = VINF_SUCCESS; + if (!hdr->size_out) + hdr->size_out = hdr->size_in; + + /* + * hdr->version and hdr->size_in / hdr->size_out minimum size are + * already checked by vbg_misc_device_ioctl(). + */ + + /* For VMMDEV_REQUEST hdr->type != VBG_IOCTL_HDR_TYPE_DEFAULT */ + if (req_no_size == VBG_IOCTL_VMMDEV_REQUEST(0) || + req == VBG_IOCTL_VMMDEV_REQUEST_BIG || + req == VBG_IOCTL_VMMDEV_REQUEST_BIG_ALT) + return vbg_ioctl_vmmrequest(gdev, session, data); + + if (hdr->type != VBG_IOCTL_HDR_TYPE_DEFAULT) + return -EINVAL; + + /* Fixed size requests. */ + switch (req) { + case VBG_IOCTL_DRIVER_VERSION_INFO: + return vbg_ioctl_driver_version_info(data); + case VBG_IOCTL_HGCM_CONNECT: + return vbg_ioctl_hgcm_connect(gdev, session, data); + case VBG_IOCTL_HGCM_DISCONNECT: + return vbg_ioctl_hgcm_disconnect(gdev, session, data); + case VBG_IOCTL_WAIT_FOR_EVENTS: + return vbg_ioctl_wait_for_events(gdev, session, data); + case VBG_IOCTL_INTERRUPT_ALL_WAIT_FOR_EVENTS: + return vbg_ioctl_interrupt_all_wait_events(gdev, session, data); + case VBG_IOCTL_CHANGE_FILTER_MASK: + return vbg_ioctl_change_filter_mask(gdev, session, data); + case VBG_IOCTL_ACQUIRE_GUEST_CAPABILITIES: + return vbg_ioctl_acquire_guest_capabilities(gdev, session, data); + case VBG_IOCTL_CHANGE_GUEST_CAPABILITIES: + return vbg_ioctl_change_guest_capabilities(gdev, session, data); + case VBG_IOCTL_CHECK_BALLOON: + return vbg_ioctl_check_balloon(gdev, data); + case VBG_IOCTL_WRITE_CORE_DUMP: + return vbg_ioctl_write_core_dump(gdev, session, data); + } + + /* Variable sized requests. */ + switch (req_no_size) { +#ifdef CONFIG_COMPAT + case VBG_IOCTL_HGCM_CALL_32(0): + f32bit = true; + fallthrough; +#endif + case VBG_IOCTL_HGCM_CALL(0): + return vbg_ioctl_hgcm_call(gdev, session, f32bit, data); + case VBG_IOCTL_LOG(0): + case VBG_IOCTL_LOG_ALT(0): + return vbg_ioctl_log(data); + } + + vbg_err_ratelimited("Userspace made an unknown ioctl req %#08x\n", req); + return -ENOTTY; +} + +/** + * Report guest supported mouse-features to the host. + * + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @features: The set of features to report to the host. + */ +int vbg_core_set_mouse_status(struct vbg_dev *gdev, u32 features) +{ + struct vmmdev_mouse_status *req; + int rc; + + req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_MOUSE_STATUS, + VBG_KERNEL_REQUEST); + if (!req) + return -ENOMEM; + + req->mouse_features = features; + req->pointer_pos_x = 0; + req->pointer_pos_y = 0; + + rc = vbg_req_perform(gdev, req); + if (rc < 0) + vbg_err("%s error, rc: %d\n", __func__, rc); + + vbg_req_free(req, sizeof(*req)); + return vbg_status_code_to_errno(rc); +} + +/** Core interrupt service routine. */ +irqreturn_t vbg_core_isr(int irq, void *dev_id) +{ + struct vbg_dev *gdev = dev_id; + struct vmmdev_events *req = gdev->ack_events_req; + bool mouse_position_changed = false; + unsigned long flags; + u32 events = 0; + int rc; + + if (!gdev->mmio->V.V1_04.have_events) + return IRQ_NONE; + + /* Get and acknowlegde events. */ + req->header.rc = VERR_INTERNAL_ERROR; + req->events = 0; + rc = vbg_req_perform(gdev, req); + if (rc < 0) { + vbg_err("Error performing events req, rc: %d\n", rc); + return IRQ_NONE; + } + + events = req->events; + + if (events & VMMDEV_EVENT_MOUSE_POSITION_CHANGED) { + mouse_position_changed = true; + events &= ~VMMDEV_EVENT_MOUSE_POSITION_CHANGED; + } + + if (events & VMMDEV_EVENT_HGCM) { + wake_up(&gdev->hgcm_wq); + events &= ~VMMDEV_EVENT_HGCM; + } + + if (events & VMMDEV_EVENT_BALLOON_CHANGE_REQUEST) { + schedule_work(&gdev->mem_balloon.work); + events &= ~VMMDEV_EVENT_BALLOON_CHANGE_REQUEST; + } + + if (events) { + spin_lock_irqsave(&gdev->event_spinlock, flags); + gdev->pending_events |= events; + spin_unlock_irqrestore(&gdev->event_spinlock, flags); + + wake_up(&gdev->event_wq); + } + + if (mouse_position_changed) + vbg_linux_mouse_event(gdev); + + return IRQ_HANDLED; +} diff --git a/drivers/virt/vboxguest/vboxguest_core.h b/drivers/virt/vboxguest/vboxguest_core.h new file mode 100644 index 000000000..ab4bf64e2 --- /dev/null +++ b/drivers/virt/vboxguest/vboxguest_core.h @@ -0,0 +1,214 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR CDDL-1.0) */ +/* Copyright (C) 2010-2016 Oracle Corporation */ + +#ifndef __VBOXGUEST_CORE_H__ +#define __VBOXGUEST_CORE_H__ + +#include <linux/input.h> +#include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/miscdevice.h> +#include <linux/spinlock.h> +#include <linux/wait.h> +#include <linux/workqueue.h> +#include <linux/vboxguest.h> +#include "vmmdev.h" + +/* + * The mainline kernel version (this version) of the vboxguest module + * contained a bug where it defined VBGL_IOCTL_VMMDEV_REQUEST_BIG and + * VBGL_IOCTL_LOG using _IOC(_IOC_READ | _IOC_WRITE, 'V', ...) instead + * of _IO(V, ...) as the out of tree VirtualBox upstream version does. + * + * These _ALT definitions keep compatibility with the wrong defines the + * mainline kernel version used for a while. + * Note the VirtualBox userspace bits have always been built against + * VirtualBox upstream's headers, so this is likely not necessary. But + * we must never break our ABI so we keep these around to be 100% sure. + */ +#define VBG_IOCTL_VMMDEV_REQUEST_BIG_ALT _IOC(_IOC_READ | _IOC_WRITE, 'V', 3, 0) +#define VBG_IOCTL_LOG_ALT(s) _IOC(_IOC_READ | _IOC_WRITE, 'V', 9, s) + +struct vbg_session; + +/** VBox guest memory balloon. */ +struct vbg_mem_balloon { + /** Work handling VMMDEV_EVENT_BALLOON_CHANGE_REQUEST events */ + struct work_struct work; + /** Pre-allocated vmmdev_memballoon_info req for query */ + struct vmmdev_memballoon_info *get_req; + /** Pre-allocated vmmdev_memballoon_change req for inflate / deflate */ + struct vmmdev_memballoon_change *change_req; + /** The current number of chunks in the balloon. */ + u32 chunks; + /** The maximum number of chunks in the balloon. */ + u32 max_chunks; + /** + * Array of pointers to page arrays. A page * array is allocated for + * each chunk when inflating, and freed when the deflating. + */ + struct page ***pages; +}; + +/** + * Per bit usage tracker for a u32 mask. + * + * Used for optimal handling of guest properties and event filter. + */ +struct vbg_bit_usage_tracker { + /** Per bit usage counters. */ + u32 per_bit_usage[32]; + /** The current mask according to per_bit_usage. */ + u32 mask; +}; + +/** VBox guest device (data) extension. */ +struct vbg_dev { + struct device *dev; + /** The base of the adapter I/O ports. */ + u16 io_port; + /** Pointer to the mapping of the VMMDev adapter memory. */ + struct vmmdev_memory *mmio; + /** Host version */ + char host_version[64]; + /** Host features */ + unsigned int host_features; + /** + * Dummy page and vmap address for reserved kernel virtual-address + * space for the guest mappings, only used on hosts lacking vtx. + */ + struct page *guest_mappings_dummy_page; + void *guest_mappings; + /** Spinlock protecting pending_events. */ + spinlock_t event_spinlock; + /** Preallocated struct vmmdev_events for the IRQ handler. */ + struct vmmdev_events *ack_events_req; + /** Wait-for-event list for threads waiting for multiple events. */ + wait_queue_head_t event_wq; + /** Mask of pending events. */ + u32 pending_events; + /** Wait-for-event list for threads waiting on HGCM async completion. */ + wait_queue_head_t hgcm_wq; + /** Pre-allocated hgcm cancel2 req. for cancellation on timeout */ + struct vmmdev_hgcm_cancel2 *cancel_req; + /** Mutex protecting cancel_req accesses */ + struct mutex cancel_req_mutex; + /** Pre-allocated mouse-status request for the input-device handling. */ + struct vmmdev_mouse_status *mouse_status_req; + /** Input device for reporting abs mouse coordinates to the guest. */ + struct input_dev *input; + + /** Memory balloon information. */ + struct vbg_mem_balloon mem_balloon; + + /** Lock for session related items in vbg_dev and vbg_session */ + struct mutex session_mutex; + /** Events we won't permit anyone to filter out. */ + u32 fixed_events; + /** + * Usage counters for the host events (excludes fixed events), + * Protected by session_mutex. + */ + struct vbg_bit_usage_tracker event_filter_tracker; + /** + * The event filter last reported to the host (or UINT32_MAX). + * Protected by session_mutex. + */ + u32 event_filter_host; + + /** + * Guest capabilities which have been switched to acquire_mode. + */ + u32 acquire_mode_guest_caps; + /** + * Guest capabilities acquired by vbg_acquire_session_capabilities(). + * Only one session can acquire a capability at a time. + */ + u32 acquired_guest_caps; + /** + * Usage counters for guest capabilities requested through + * vbg_set_session_capabilities(). Indexed by capability bit + * number, one count per session using a capability. + * Protected by session_mutex. + */ + struct vbg_bit_usage_tracker set_guest_caps_tracker; + /** + * The guest capabilities last reported to the host (or UINT32_MAX). + * Protected by session_mutex. + */ + u32 guest_caps_host; + + /** + * Heartbeat timer which fires with interval + * cNsHearbeatInterval and its handler sends + * VMMDEVREQ_GUEST_HEARTBEAT to VMMDev. + */ + struct timer_list heartbeat_timer; + /** Heartbeat timer interval in ms. */ + int heartbeat_interval_ms; + /** Preallocated VMMDEVREQ_GUEST_HEARTBEAT request. */ + struct vmmdev_request_header *guest_heartbeat_req; + + /** "vboxguest" char-device */ + struct miscdevice misc_device; + /** "vboxuser" char-device */ + struct miscdevice misc_device_user; +}; + +/** The VBoxGuest per session data. */ +struct vbg_session { + /** Pointer to the device extension. */ + struct vbg_dev *gdev; + + /** + * Array containing HGCM client IDs associated with this session. + * These will be automatically disconnected when the session is closed. + * Protected by vbg_gdev.session_mutex. + */ + u32 hgcm_client_ids[64]; + /** + * Host events requested by the session. + * An event type requested in any guest session will be added to the + * host filter. Protected by vbg_gdev.session_mutex. + */ + u32 event_filter; + /** + * Guest capabilities acquired by vbg_acquire_session_capabilities(). + * Only one session can acquire a capability at a time. + */ + u32 acquired_guest_caps; + /** + * Guest capabilities set through vbg_set_session_capabilities(). + * A capability claimed by any guest session will be reported to the + * host. Protected by vbg_gdev.session_mutex. + */ + u32 set_guest_caps; + /** VMMDEV_REQUESTOR_* flags */ + u32 requestor; + /** Set on CANCEL_ALL_WAITEVENTS, protected by vbg_devevent_spinlock. */ + bool cancel_waiters; +}; + +int vbg_core_init(struct vbg_dev *gdev, u32 fixed_events); +void vbg_core_exit(struct vbg_dev *gdev); +struct vbg_session *vbg_core_open_session(struct vbg_dev *gdev, u32 requestor); +void vbg_core_close_session(struct vbg_session *session); +int vbg_core_ioctl(struct vbg_session *session, unsigned int req, void *data); +int vbg_core_set_mouse_status(struct vbg_dev *gdev, u32 features); + +irqreturn_t vbg_core_isr(int irq, void *dev_id); + +void vbg_linux_mouse_event(struct vbg_dev *gdev); + +/* Private (non exported) functions form vboxguest_utils.c */ +void *vbg_req_alloc(size_t len, enum vmmdev_request_type req_type, + u32 requestor); +void vbg_req_free(void *req, size_t len); +int vbg_req_perform(struct vbg_dev *gdev, void *req); +int vbg_hgcm_call32( + struct vbg_dev *gdev, u32 requestor, u32 client_id, u32 function, + u32 timeout_ms, struct vmmdev_hgcm_function_parameter32 *parm32, + u32 parm_count, int *vbox_status); + +#endif diff --git a/drivers/virt/vboxguest/vboxguest_linux.c b/drivers/virt/vboxguest/vboxguest_linux.c new file mode 100644 index 000000000..c47e62dc5 --- /dev/null +++ b/drivers/virt/vboxguest/vboxguest_linux.c @@ -0,0 +1,500 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * vboxguest linux pci driver, char-dev and input-device code, + * + * Copyright (C) 2006-2016 Oracle Corporation + */ + +#include <linux/cred.h> +#include <linux/input.h> +#include <linux/kernel.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/poll.h> +#include <linux/vbox_utils.h> +#include "vboxguest_core.h" + +/** The device name. */ +#define DEVICE_NAME "vboxguest" +/** The device name for the device node open to everyone. */ +#define DEVICE_NAME_USER "vboxuser" +/** VirtualBox PCI vendor ID. */ +#define VBOX_VENDORID 0x80ee +/** VMMDev PCI card product ID. */ +#define VMMDEV_DEVICEID 0xcafe + +/** Mutex protecting the global vbg_gdev pointer used by vbg_get/put_gdev. */ +static DEFINE_MUTEX(vbg_gdev_mutex); +/** Global vbg_gdev pointer used by vbg_get/put_gdev. */ +static struct vbg_dev *vbg_gdev; + +static u32 vbg_misc_device_requestor(struct inode *inode) +{ + u32 requestor = VMMDEV_REQUESTOR_USERMODE | + VMMDEV_REQUESTOR_CON_DONT_KNOW | + VMMDEV_REQUESTOR_TRUST_NOT_GIVEN; + + if (from_kuid(current_user_ns(), current_uid()) == 0) + requestor |= VMMDEV_REQUESTOR_USR_ROOT; + else + requestor |= VMMDEV_REQUESTOR_USR_USER; + + if (in_egroup_p(inode->i_gid)) + requestor |= VMMDEV_REQUESTOR_GRP_VBOX; + + return requestor; +} + +static int vbg_misc_device_open(struct inode *inode, struct file *filp) +{ + struct vbg_session *session; + struct vbg_dev *gdev; + + /* misc_open sets filp->private_data to our misc device */ + gdev = container_of(filp->private_data, struct vbg_dev, misc_device); + + session = vbg_core_open_session(gdev, vbg_misc_device_requestor(inode)); + if (IS_ERR(session)) + return PTR_ERR(session); + + filp->private_data = session; + return 0; +} + +static int vbg_misc_device_user_open(struct inode *inode, struct file *filp) +{ + struct vbg_session *session; + struct vbg_dev *gdev; + + /* misc_open sets filp->private_data to our misc device */ + gdev = container_of(filp->private_data, struct vbg_dev, + misc_device_user); + + session = vbg_core_open_session(gdev, vbg_misc_device_requestor(inode) | + VMMDEV_REQUESTOR_USER_DEVICE); + if (IS_ERR(session)) + return PTR_ERR(session); + + filp->private_data = session; + return 0; +} + +/** + * Close device. + * Return: 0 on success, negated errno on failure. + * @inode: Pointer to inode info structure. + * @filp: Associated file pointer. + */ +static int vbg_misc_device_close(struct inode *inode, struct file *filp) +{ + vbg_core_close_session(filp->private_data); + filp->private_data = NULL; + return 0; +} + +/** + * Device I/O Control entry point. + * Return: 0 on success, negated errno on failure. + * @filp: Associated file pointer. + * @req: The request specified to ioctl(). + * @arg: The argument specified to ioctl(). + */ +static long vbg_misc_device_ioctl(struct file *filp, unsigned int req, + unsigned long arg) +{ + struct vbg_session *session = filp->private_data; + size_t returned_size, size; + struct vbg_ioctl_hdr hdr; + bool is_vmmdev_req; + int ret = 0; + void *buf; + + if (copy_from_user(&hdr, (void *)arg, sizeof(hdr))) + return -EFAULT; + + if (hdr.version != VBG_IOCTL_HDR_VERSION) + return -EINVAL; + + if (hdr.size_in < sizeof(hdr) || + (hdr.size_out && hdr.size_out < sizeof(hdr))) + return -EINVAL; + + size = max(hdr.size_in, hdr.size_out); + if (_IOC_SIZE(req) && _IOC_SIZE(req) != size) + return -EINVAL; + if (size > SZ_16M) + return -E2BIG; + + /* + * IOCTL_VMMDEV_REQUEST needs the buffer to be below 4G to avoid + * the need for a bounce-buffer and another copy later on. + */ + is_vmmdev_req = (req & ~IOCSIZE_MASK) == VBG_IOCTL_VMMDEV_REQUEST(0) || + req == VBG_IOCTL_VMMDEV_REQUEST_BIG || + req == VBG_IOCTL_VMMDEV_REQUEST_BIG_ALT; + + if (is_vmmdev_req) + buf = vbg_req_alloc(size, VBG_IOCTL_HDR_TYPE_DEFAULT, + session->requestor); + else + buf = kmalloc(size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + *((struct vbg_ioctl_hdr *)buf) = hdr; + if (copy_from_user(buf + sizeof(hdr), (void *)arg + sizeof(hdr), + hdr.size_in - sizeof(hdr))) { + ret = -EFAULT; + goto out; + } + if (hdr.size_in < size) + memset(buf + hdr.size_in, 0, size - hdr.size_in); + + ret = vbg_core_ioctl(session, req, buf); + if (ret) + goto out; + + returned_size = ((struct vbg_ioctl_hdr *)buf)->size_out; + if (returned_size > size) { + vbg_debug("%s: too much output data %zu > %zu\n", + __func__, returned_size, size); + returned_size = size; + } + if (copy_to_user((void *)arg, buf, returned_size) != 0) + ret = -EFAULT; + +out: + if (is_vmmdev_req) + vbg_req_free(buf, size); + else + kfree(buf); + + return ret; +} + +/** The file_operations structures. */ +static const struct file_operations vbg_misc_device_fops = { + .owner = THIS_MODULE, + .open = vbg_misc_device_open, + .release = vbg_misc_device_close, + .unlocked_ioctl = vbg_misc_device_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = vbg_misc_device_ioctl, +#endif +}; +static const struct file_operations vbg_misc_device_user_fops = { + .owner = THIS_MODULE, + .open = vbg_misc_device_user_open, + .release = vbg_misc_device_close, + .unlocked_ioctl = vbg_misc_device_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = vbg_misc_device_ioctl, +#endif +}; + +/** + * Called when the input device is first opened. + * + * Sets up absolute mouse reporting. + */ +static int vbg_input_open(struct input_dev *input) +{ + struct vbg_dev *gdev = input_get_drvdata(input); + u32 feat = VMMDEV_MOUSE_GUEST_CAN_ABSOLUTE | VMMDEV_MOUSE_NEW_PROTOCOL; + + return vbg_core_set_mouse_status(gdev, feat); +} + +/** + * Called if all open handles to the input device are closed. + * + * Disables absolute reporting. + */ +static void vbg_input_close(struct input_dev *input) +{ + struct vbg_dev *gdev = input_get_drvdata(input); + + vbg_core_set_mouse_status(gdev, 0); +} + +/** + * Creates the kernel input device. + * + * Return: 0 on success, negated errno on failure. + */ +static int vbg_create_input_device(struct vbg_dev *gdev) +{ + struct input_dev *input; + + input = devm_input_allocate_device(gdev->dev); + if (!input) + return -ENOMEM; + + input->id.bustype = BUS_PCI; + input->id.vendor = VBOX_VENDORID; + input->id.product = VMMDEV_DEVICEID; + input->open = vbg_input_open; + input->close = vbg_input_close; + input->dev.parent = gdev->dev; + input->name = "VirtualBox mouse integration"; + + input_set_abs_params(input, ABS_X, VMMDEV_MOUSE_RANGE_MIN, + VMMDEV_MOUSE_RANGE_MAX, 0, 0); + input_set_abs_params(input, ABS_Y, VMMDEV_MOUSE_RANGE_MIN, + VMMDEV_MOUSE_RANGE_MAX, 0, 0); + input_set_capability(input, EV_KEY, BTN_MOUSE); + input_set_drvdata(input, gdev); + + gdev->input = input; + + return input_register_device(gdev->input); +} + +static ssize_t host_version_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct vbg_dev *gdev = dev_get_drvdata(dev); + + return sprintf(buf, "%s\n", gdev->host_version); +} + +static ssize_t host_features_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct vbg_dev *gdev = dev_get_drvdata(dev); + + return sprintf(buf, "%#x\n", gdev->host_features); +} + +static DEVICE_ATTR_RO(host_version); +static DEVICE_ATTR_RO(host_features); + +static struct attribute *vbg_pci_attrs[] = { + &dev_attr_host_version.attr, + &dev_attr_host_features.attr, + NULL, +}; +ATTRIBUTE_GROUPS(vbg_pci); + +/** + * Does the PCI detection and init of the device. + * + * Return: 0 on success, negated errno on failure. + */ +static int vbg_pci_probe(struct pci_dev *pci, const struct pci_device_id *id) +{ + struct device *dev = &pci->dev; + resource_size_t io, io_len, mmio, mmio_len; + struct vmmdev_memory *vmmdev; + struct vbg_dev *gdev; + int ret; + + gdev = devm_kzalloc(dev, sizeof(*gdev), GFP_KERNEL); + if (!gdev) + return -ENOMEM; + + ret = pci_enable_device(pci); + if (ret != 0) { + vbg_err("vboxguest: Error enabling device: %d\n", ret); + return ret; + } + + ret = -ENODEV; + + io = pci_resource_start(pci, 0); + io_len = pci_resource_len(pci, 0); + if (!io || !io_len) { + vbg_err("vboxguest: Error IO-port resource (0) is missing\n"); + goto err_disable_pcidev; + } + if (devm_request_region(dev, io, io_len, DEVICE_NAME) == NULL) { + vbg_err("vboxguest: Error could not claim IO resource\n"); + ret = -EBUSY; + goto err_disable_pcidev; + } + + mmio = pci_resource_start(pci, 1); + mmio_len = pci_resource_len(pci, 1); + if (!mmio || !mmio_len) { + vbg_err("vboxguest: Error MMIO resource (1) is missing\n"); + goto err_disable_pcidev; + } + + if (devm_request_mem_region(dev, mmio, mmio_len, DEVICE_NAME) == NULL) { + vbg_err("vboxguest: Error could not claim MMIO resource\n"); + ret = -EBUSY; + goto err_disable_pcidev; + } + + vmmdev = devm_ioremap(dev, mmio, mmio_len); + if (!vmmdev) { + vbg_err("vboxguest: Error ioremap failed; MMIO addr=%pap size=%pap\n", + &mmio, &mmio_len); + goto err_disable_pcidev; + } + + /* Validate MMIO region version and size. */ + if (vmmdev->version != VMMDEV_MEMORY_VERSION || + vmmdev->size < 32 || vmmdev->size > mmio_len) { + vbg_err("vboxguest: Bogus VMMDev memory; version=%08x (expected %08x) size=%d (expected <= %d)\n", + vmmdev->version, VMMDEV_MEMORY_VERSION, + vmmdev->size, (int)mmio_len); + goto err_disable_pcidev; + } + + gdev->io_port = io; + gdev->mmio = vmmdev; + gdev->dev = dev; + gdev->misc_device.minor = MISC_DYNAMIC_MINOR; + gdev->misc_device.name = DEVICE_NAME; + gdev->misc_device.fops = &vbg_misc_device_fops; + gdev->misc_device_user.minor = MISC_DYNAMIC_MINOR; + gdev->misc_device_user.name = DEVICE_NAME_USER; + gdev->misc_device_user.fops = &vbg_misc_device_user_fops; + + ret = vbg_core_init(gdev, VMMDEV_EVENT_MOUSE_POSITION_CHANGED); + if (ret) + goto err_disable_pcidev; + + ret = vbg_create_input_device(gdev); + if (ret) { + vbg_err("vboxguest: Error creating input device: %d\n", ret); + goto err_vbg_core_exit; + } + + ret = request_irq(pci->irq, vbg_core_isr, IRQF_SHARED, DEVICE_NAME, + gdev); + if (ret) { + vbg_err("vboxguest: Error requesting irq: %d\n", ret); + goto err_vbg_core_exit; + } + + ret = misc_register(&gdev->misc_device); + if (ret) { + vbg_err("vboxguest: Error misc_register %s failed: %d\n", + DEVICE_NAME, ret); + goto err_free_irq; + } + + ret = misc_register(&gdev->misc_device_user); + if (ret) { + vbg_err("vboxguest: Error misc_register %s failed: %d\n", + DEVICE_NAME_USER, ret); + goto err_unregister_misc_device; + } + + mutex_lock(&vbg_gdev_mutex); + if (!vbg_gdev) + vbg_gdev = gdev; + else + ret = -EBUSY; + mutex_unlock(&vbg_gdev_mutex); + + if (ret) { + vbg_err("vboxguest: Error more then 1 vbox guest pci device\n"); + goto err_unregister_misc_device_user; + } + + pci_set_drvdata(pci, gdev); + + return 0; + +err_unregister_misc_device_user: + misc_deregister(&gdev->misc_device_user); +err_unregister_misc_device: + misc_deregister(&gdev->misc_device); +err_free_irq: + free_irq(pci->irq, gdev); +err_vbg_core_exit: + vbg_core_exit(gdev); +err_disable_pcidev: + pci_disable_device(pci); + + return ret; +} + +static void vbg_pci_remove(struct pci_dev *pci) +{ + struct vbg_dev *gdev = pci_get_drvdata(pci); + + mutex_lock(&vbg_gdev_mutex); + vbg_gdev = NULL; + mutex_unlock(&vbg_gdev_mutex); + + free_irq(pci->irq, gdev); + misc_deregister(&gdev->misc_device_user); + misc_deregister(&gdev->misc_device); + vbg_core_exit(gdev); + pci_disable_device(pci); +} + +struct vbg_dev *vbg_get_gdev(void) +{ + mutex_lock(&vbg_gdev_mutex); + + /* + * Note on success we keep the mutex locked until vbg_put_gdev(), + * this stops vbg_pci_remove from removing the device from underneath + * vboxsf. vboxsf will only hold a reference for a short while. + */ + if (vbg_gdev) + return vbg_gdev; + + mutex_unlock(&vbg_gdev_mutex); + return ERR_PTR(-ENODEV); +} +EXPORT_SYMBOL(vbg_get_gdev); + +void vbg_put_gdev(struct vbg_dev *gdev) +{ + WARN_ON(gdev != vbg_gdev); + mutex_unlock(&vbg_gdev_mutex); +} +EXPORT_SYMBOL(vbg_put_gdev); + +/** + * Callback for mouse events. + * + * This is called at the end of the ISR, after leaving the event spinlock, if + * VMMDEV_EVENT_MOUSE_POSITION_CHANGED was raised by the host. + * + * @gdev: The device extension. + */ +void vbg_linux_mouse_event(struct vbg_dev *gdev) +{ + int rc; + + /* Report events to the kernel input device */ + gdev->mouse_status_req->mouse_features = 0; + gdev->mouse_status_req->pointer_pos_x = 0; + gdev->mouse_status_req->pointer_pos_y = 0; + rc = vbg_req_perform(gdev, gdev->mouse_status_req); + if (rc >= 0) { + input_report_abs(gdev->input, ABS_X, + gdev->mouse_status_req->pointer_pos_x); + input_report_abs(gdev->input, ABS_Y, + gdev->mouse_status_req->pointer_pos_y); + input_sync(gdev->input); + } +} + +static const struct pci_device_id vbg_pci_ids[] = { + { .vendor = VBOX_VENDORID, .device = VMMDEV_DEVICEID }, + {} +}; +MODULE_DEVICE_TABLE(pci, vbg_pci_ids); + +static struct pci_driver vbg_pci_driver = { + .name = DEVICE_NAME, + .dev_groups = vbg_pci_groups, + .id_table = vbg_pci_ids, + .probe = vbg_pci_probe, + .remove = vbg_pci_remove, +}; + +module_pci_driver(vbg_pci_driver); + +MODULE_AUTHOR("Oracle Corporation"); +MODULE_DESCRIPTION("Oracle VM VirtualBox Guest Additions for Linux Module"); +MODULE_LICENSE("GPL"); diff --git a/drivers/virt/vboxguest/vboxguest_utils.c b/drivers/virt/vboxguest/vboxguest_utils.c new file mode 100644 index 000000000..8d195e3f8 --- /dev/null +++ b/drivers/virt/vboxguest/vboxguest_utils.c @@ -0,0 +1,825 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR CDDL-1.0) */ +/* + * vboxguest vmm-req and hgcm-call code, VBoxGuestR0LibHGCMInternal.cpp, + * VBoxGuestR0LibGenericRequest.cpp and RTErrConvertToErrno.cpp in vbox svn. + * + * Copyright (C) 2006-2016 Oracle Corporation + */ + +#include <linux/errno.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/vbox_err.h> +#include <linux/vbox_utils.h> +#include "vboxguest_core.h" + +/* Get the pointer to the first parameter of a HGCM call request. */ +#define VMMDEV_HGCM_CALL_PARMS(a) \ + ((struct vmmdev_hgcm_function_parameter *)( \ + (u8 *)(a) + sizeof(struct vmmdev_hgcm_call))) + +/* The max parameter buffer size for a user request. */ +#define VBG_MAX_HGCM_USER_PARM (24 * SZ_1M) +/* The max parameter buffer size for a kernel request. */ +#define VBG_MAX_HGCM_KERNEL_PARM (16 * SZ_1M) + +#define VBG_DEBUG_PORT 0x504 + +/* This protects vbg_log_buf and serializes VBG_DEBUG_PORT accesses */ +static DEFINE_SPINLOCK(vbg_log_lock); +static char vbg_log_buf[128]; + +#define VBG_LOG(name, pr_func) \ +void name(const char *fmt, ...) \ +{ \ + unsigned long flags; \ + va_list args; \ + int i, count; \ + \ + va_start(args, fmt); \ + spin_lock_irqsave(&vbg_log_lock, flags); \ + \ + count = vscnprintf(vbg_log_buf, sizeof(vbg_log_buf), fmt, args);\ + for (i = 0; i < count; i++) \ + outb(vbg_log_buf[i], VBG_DEBUG_PORT); \ + \ + pr_func("%s", vbg_log_buf); \ + \ + spin_unlock_irqrestore(&vbg_log_lock, flags); \ + va_end(args); \ +} \ +EXPORT_SYMBOL(name) + +VBG_LOG(vbg_info, pr_info); +VBG_LOG(vbg_warn, pr_warn); +VBG_LOG(vbg_err, pr_err); +VBG_LOG(vbg_err_ratelimited, pr_err_ratelimited); +#if defined(DEBUG) && !defined(CONFIG_DYNAMIC_DEBUG) +VBG_LOG(vbg_debug, pr_debug); +#endif + +void *vbg_req_alloc(size_t len, enum vmmdev_request_type req_type, + u32 requestor) +{ + struct vmmdev_request_header *req; + int order = get_order(PAGE_ALIGN(len)); + + req = (void *)__get_free_pages(GFP_KERNEL | GFP_DMA32, order); + if (!req) + return NULL; + + memset(req, 0xaa, len); + + req->size = len; + req->version = VMMDEV_REQUEST_HEADER_VERSION; + req->request_type = req_type; + req->rc = VERR_GENERAL_FAILURE; + req->reserved1 = 0; + req->requestor = requestor; + + return req; +} + +void vbg_req_free(void *req, size_t len) +{ + if (!req) + return; + + free_pages((unsigned long)req, get_order(PAGE_ALIGN(len))); +} + +/* Note this function returns a VBox status code, not a negative errno!! */ +int vbg_req_perform(struct vbg_dev *gdev, void *req) +{ + unsigned long phys_req = virt_to_phys(req); + + outl(phys_req, gdev->io_port + VMMDEV_PORT_OFF_REQUEST); + /* + * The host changes the request as a result of the outl, make sure + * the outl and any reads of the req happen in the correct order. + */ + mb(); + + return ((struct vmmdev_request_header *)req)->rc; +} + +static bool hgcm_req_done(struct vbg_dev *gdev, + struct vmmdev_hgcmreq_header *header) +{ + unsigned long flags; + bool done; + + spin_lock_irqsave(&gdev->event_spinlock, flags); + done = header->flags & VMMDEV_HGCM_REQ_DONE; + spin_unlock_irqrestore(&gdev->event_spinlock, flags); + + return done; +} + +int vbg_hgcm_connect(struct vbg_dev *gdev, u32 requestor, + struct vmmdev_hgcm_service_location *loc, + u32 *client_id, int *vbox_status) +{ + struct vmmdev_hgcm_connect *hgcm_connect = NULL; + int rc; + + hgcm_connect = vbg_req_alloc(sizeof(*hgcm_connect), + VMMDEVREQ_HGCM_CONNECT, requestor); + if (!hgcm_connect) + return -ENOMEM; + + hgcm_connect->header.flags = 0; + memcpy(&hgcm_connect->loc, loc, sizeof(*loc)); + hgcm_connect->client_id = 0; + + rc = vbg_req_perform(gdev, hgcm_connect); + + if (rc == VINF_HGCM_ASYNC_EXECUTE) + wait_event(gdev->hgcm_wq, + hgcm_req_done(gdev, &hgcm_connect->header)); + + if (rc >= 0) { + *client_id = hgcm_connect->client_id; + rc = hgcm_connect->header.result; + } + + vbg_req_free(hgcm_connect, sizeof(*hgcm_connect)); + + *vbox_status = rc; + return 0; +} +EXPORT_SYMBOL(vbg_hgcm_connect); + +int vbg_hgcm_disconnect(struct vbg_dev *gdev, u32 requestor, + u32 client_id, int *vbox_status) +{ + struct vmmdev_hgcm_disconnect *hgcm_disconnect = NULL; + int rc; + + hgcm_disconnect = vbg_req_alloc(sizeof(*hgcm_disconnect), + VMMDEVREQ_HGCM_DISCONNECT, + requestor); + if (!hgcm_disconnect) + return -ENOMEM; + + hgcm_disconnect->header.flags = 0; + hgcm_disconnect->client_id = client_id; + + rc = vbg_req_perform(gdev, hgcm_disconnect); + + if (rc == VINF_HGCM_ASYNC_EXECUTE) + wait_event(gdev->hgcm_wq, + hgcm_req_done(gdev, &hgcm_disconnect->header)); + + if (rc >= 0) + rc = hgcm_disconnect->header.result; + + vbg_req_free(hgcm_disconnect, sizeof(*hgcm_disconnect)); + + *vbox_status = rc; + return 0; +} +EXPORT_SYMBOL(vbg_hgcm_disconnect); + +static u32 hgcm_call_buf_size_in_pages(void *buf, u32 len) +{ + u32 size = PAGE_ALIGN(len + ((unsigned long)buf & ~PAGE_MASK)); + + return size >> PAGE_SHIFT; +} + +static void hgcm_call_add_pagelist_size(void *buf, u32 len, size_t *extra) +{ + u32 page_count; + + page_count = hgcm_call_buf_size_in_pages(buf, len); + *extra += offsetof(struct vmmdev_hgcm_pagelist, pages[page_count]); +} + +static int hgcm_call_preprocess_linaddr( + const struct vmmdev_hgcm_function_parameter *src_parm, + void **bounce_buf_ret, size_t *extra) +{ + void *buf, *bounce_buf; + bool copy_in; + u32 len; + int ret; + + buf = (void *)src_parm->u.pointer.u.linear_addr; + len = src_parm->u.pointer.size; + copy_in = src_parm->type != VMMDEV_HGCM_PARM_TYPE_LINADDR_OUT; + + if (len > VBG_MAX_HGCM_USER_PARM) + return -E2BIG; + + bounce_buf = kvmalloc(len, GFP_KERNEL); + if (!bounce_buf) + return -ENOMEM; + + *bounce_buf_ret = bounce_buf; + + if (copy_in) { + ret = copy_from_user(bounce_buf, (void __user *)buf, len); + if (ret) + return -EFAULT; + } else { + memset(bounce_buf, 0, len); + } + + hgcm_call_add_pagelist_size(bounce_buf, len, extra); + return 0; +} + +/** + * Preprocesses the HGCM call, validate parameters, alloc bounce buffers and + * figure out how much extra storage we need for page lists. + * Return: 0 or negative errno value. + * @src_parm: Pointer to source function call parameters + * @parm_count: Number of function call parameters. + * @bounce_bufs_ret: Where to return the allocated bouncebuffer array + * @extra: Where to return the extra request space needed for + * physical page lists. + */ +static int hgcm_call_preprocess( + const struct vmmdev_hgcm_function_parameter *src_parm, + u32 parm_count, void ***bounce_bufs_ret, size_t *extra) +{ + void *buf, **bounce_bufs = NULL; + u32 i, len; + int ret; + + for (i = 0; i < parm_count; i++, src_parm++) { + switch (src_parm->type) { + case VMMDEV_HGCM_PARM_TYPE_32BIT: + case VMMDEV_HGCM_PARM_TYPE_64BIT: + break; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_IN: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_OUT: + if (!bounce_bufs) { + bounce_bufs = kcalloc(parm_count, + sizeof(void *), + GFP_KERNEL); + if (!bounce_bufs) + return -ENOMEM; + + *bounce_bufs_ret = bounce_bufs; + } + + ret = hgcm_call_preprocess_linaddr(src_parm, + &bounce_bufs[i], + extra); + if (ret) + return ret; + + break; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_OUT: + buf = (void *)src_parm->u.pointer.u.linear_addr; + len = src_parm->u.pointer.size; + if (WARN_ON(len > VBG_MAX_HGCM_KERNEL_PARM)) + return -E2BIG; + + hgcm_call_add_pagelist_size(buf, len, extra); + break; + + default: + return -EINVAL; + } + } + + return 0; +} + +/** + * Translates linear address types to page list direction flags. + * + * Return: page list flags. + * @type: The type. + */ +static u32 hgcm_call_linear_addr_type_to_pagelist_flags( + enum vmmdev_hgcm_function_parameter_type type) +{ + switch (type) { + default: + WARN_ON(1); + fallthrough; + case VMMDEV_HGCM_PARM_TYPE_LINADDR: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL: + return VMMDEV_HGCM_F_PARM_DIRECTION_BOTH; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR_IN: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN: + return VMMDEV_HGCM_F_PARM_DIRECTION_TO_HOST; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR_OUT: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_OUT: + return VMMDEV_HGCM_F_PARM_DIRECTION_FROM_HOST; + } +} + +static void hgcm_call_init_linaddr(struct vmmdev_hgcm_call *call, + struct vmmdev_hgcm_function_parameter *dst_parm, void *buf, u32 len, + enum vmmdev_hgcm_function_parameter_type type, u32 *off_extra) +{ + struct vmmdev_hgcm_pagelist *dst_pg_lst; + struct page *page; + bool is_vmalloc; + u32 i, page_count; + + dst_parm->type = type; + + if (len == 0) { + dst_parm->u.pointer.size = 0; + dst_parm->u.pointer.u.linear_addr = 0; + return; + } + + dst_pg_lst = (void *)call + *off_extra; + page_count = hgcm_call_buf_size_in_pages(buf, len); + is_vmalloc = is_vmalloc_addr(buf); + + dst_parm->type = VMMDEV_HGCM_PARM_TYPE_PAGELIST; + dst_parm->u.page_list.size = len; + dst_parm->u.page_list.offset = *off_extra; + dst_pg_lst->flags = hgcm_call_linear_addr_type_to_pagelist_flags(type); + dst_pg_lst->offset_first_page = (unsigned long)buf & ~PAGE_MASK; + dst_pg_lst->page_count = page_count; + + for (i = 0; i < page_count; i++) { + if (is_vmalloc) + page = vmalloc_to_page(buf); + else + page = virt_to_page(buf); + + dst_pg_lst->pages[i] = page_to_phys(page); + buf += PAGE_SIZE; + } + + *off_extra += offsetof(struct vmmdev_hgcm_pagelist, pages[page_count]); +} + +/** + * Initializes the call request that we're sending to the host. + * @call: The call to initialize. + * @client_id: The client ID of the caller. + * @function: The function number of the function to call. + * @src_parm: Pointer to source function call parameters. + * @parm_count: Number of function call parameters. + * @bounce_bufs: The bouncebuffer array. + */ +static void hgcm_call_init_call( + struct vmmdev_hgcm_call *call, u32 client_id, u32 function, + const struct vmmdev_hgcm_function_parameter *src_parm, + u32 parm_count, void **bounce_bufs) +{ + struct vmmdev_hgcm_function_parameter *dst_parm = + VMMDEV_HGCM_CALL_PARMS(call); + u32 i, off_extra = (uintptr_t)(dst_parm + parm_count) - (uintptr_t)call; + void *buf; + + call->header.flags = 0; + call->header.result = VINF_SUCCESS; + call->client_id = client_id; + call->function = function; + call->parm_count = parm_count; + + for (i = 0; i < parm_count; i++, src_parm++, dst_parm++) { + switch (src_parm->type) { + case VMMDEV_HGCM_PARM_TYPE_32BIT: + case VMMDEV_HGCM_PARM_TYPE_64BIT: + *dst_parm = *src_parm; + break; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_IN: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_OUT: + hgcm_call_init_linaddr(call, dst_parm, bounce_bufs[i], + src_parm->u.pointer.size, + src_parm->type, &off_extra); + break; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_OUT: + buf = (void *)src_parm->u.pointer.u.linear_addr; + hgcm_call_init_linaddr(call, dst_parm, buf, + src_parm->u.pointer.size, + src_parm->type, &off_extra); + break; + + default: + WARN_ON(1); + dst_parm->type = VMMDEV_HGCM_PARM_TYPE_INVALID; + } + } +} + +/** + * Tries to cancel a pending HGCM call. + * + * Return: VBox status code + */ +static int hgcm_cancel_call(struct vbg_dev *gdev, struct vmmdev_hgcm_call *call) +{ + int rc; + + /* + * We use a pre-allocated request for cancellations, which is + * protected by cancel_req_mutex. This means that all cancellations + * get serialized, this should be fine since they should be rare. + */ + mutex_lock(&gdev->cancel_req_mutex); + gdev->cancel_req->phys_req_to_cancel = virt_to_phys(call); + rc = vbg_req_perform(gdev, gdev->cancel_req); + mutex_unlock(&gdev->cancel_req_mutex); + + if (rc == VERR_NOT_IMPLEMENTED) { + call->header.flags |= VMMDEV_HGCM_REQ_CANCELLED; + call->header.header.request_type = VMMDEVREQ_HGCM_CANCEL; + + rc = vbg_req_perform(gdev, call); + if (rc == VERR_INVALID_PARAMETER) + rc = VERR_NOT_FOUND; + } + + if (rc >= 0) + call->header.flags |= VMMDEV_HGCM_REQ_CANCELLED; + + return rc; +} + +/** + * Performs the call and completion wait. + * Return: 0 or negative errno value. + * @gdev: The VBoxGuest device extension. + * @call: The call to execute. + * @timeout_ms: Timeout in ms. + * @leak_it: Where to return the leak it / free it, indicator. + * Cancellation fun. + */ +static int vbg_hgcm_do_call(struct vbg_dev *gdev, struct vmmdev_hgcm_call *call, + u32 timeout_ms, bool interruptible, bool *leak_it) +{ + int rc, cancel_rc, ret; + long timeout; + + *leak_it = false; + + rc = vbg_req_perform(gdev, call); + + /* + * If the call failed, then pretend success. Upper layers will + * interpret the result code in the packet. + */ + if (rc < 0) { + call->header.result = rc; + return 0; + } + + if (rc != VINF_HGCM_ASYNC_EXECUTE) + return 0; + + /* Host decided to process the request asynchronously, wait for it */ + if (timeout_ms == U32_MAX) + timeout = MAX_SCHEDULE_TIMEOUT; + else + timeout = msecs_to_jiffies(timeout_ms); + + if (interruptible) { + timeout = wait_event_interruptible_timeout(gdev->hgcm_wq, + hgcm_req_done(gdev, &call->header), + timeout); + } else { + timeout = wait_event_timeout(gdev->hgcm_wq, + hgcm_req_done(gdev, &call->header), + timeout); + } + + /* timeout > 0 means hgcm_req_done has returned true, so success */ + if (timeout > 0) + return 0; + + if (timeout == 0) + ret = -ETIMEDOUT; + else + ret = -EINTR; + + /* Cancel the request */ + cancel_rc = hgcm_cancel_call(gdev, call); + if (cancel_rc >= 0) + return ret; + + /* + * Failed to cancel, this should mean that the cancel has lost the + * race with normal completion, wait while the host completes it. + */ + if (cancel_rc == VERR_NOT_FOUND || cancel_rc == VERR_SEM_DESTROYED) + timeout = msecs_to_jiffies(500); + else + timeout = msecs_to_jiffies(2000); + + timeout = wait_event_timeout(gdev->hgcm_wq, + hgcm_req_done(gdev, &call->header), + timeout); + + if (WARN_ON(timeout == 0)) { + /* We really should never get here */ + vbg_err("%s: Call timedout and cancellation failed, leaking the request\n", + __func__); + *leak_it = true; + return ret; + } + + /* The call has completed normally after all */ + return 0; +} + +/** + * Copies the result of the call back to the caller info structure and user + * buffers. + * Return: 0 or negative errno value. + * @call: HGCM call request. + * @dst_parm: Pointer to function call parameters destination. + * @parm_count: Number of function call parameters. + * @bounce_bufs: The bouncebuffer array. + */ +static int hgcm_call_copy_back_result( + const struct vmmdev_hgcm_call *call, + struct vmmdev_hgcm_function_parameter *dst_parm, + u32 parm_count, void **bounce_bufs) +{ + const struct vmmdev_hgcm_function_parameter *src_parm = + VMMDEV_HGCM_CALL_PARMS(call); + void __user *p; + int ret; + u32 i; + + /* Copy back parameters. */ + for (i = 0; i < parm_count; i++, src_parm++, dst_parm++) { + switch (dst_parm->type) { + case VMMDEV_HGCM_PARM_TYPE_32BIT: + case VMMDEV_HGCM_PARM_TYPE_64BIT: + *dst_parm = *src_parm; + break; + + case VMMDEV_HGCM_PARM_TYPE_PAGELIST: + dst_parm->u.page_list.size = src_parm->u.page_list.size; + break; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR_IN: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_OUT: + dst_parm->u.pointer.size = src_parm->u.pointer.size; + break; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_OUT: + dst_parm->u.pointer.size = src_parm->u.pointer.size; + + p = (void __user *)dst_parm->u.pointer.u.linear_addr; + ret = copy_to_user(p, bounce_bufs[i], + min(src_parm->u.pointer.size, + dst_parm->u.pointer.size)); + if (ret) + return -EFAULT; + break; + + default: + WARN_ON(1); + return -EINVAL; + } + } + + return 0; +} + +int vbg_hgcm_call(struct vbg_dev *gdev, u32 requestor, u32 client_id, + u32 function, u32 timeout_ms, + struct vmmdev_hgcm_function_parameter *parms, u32 parm_count, + int *vbox_status) +{ + struct vmmdev_hgcm_call *call; + void **bounce_bufs = NULL; + bool leak_it; + size_t size; + int i, ret; + + size = sizeof(struct vmmdev_hgcm_call) + + parm_count * sizeof(struct vmmdev_hgcm_function_parameter); + /* + * Validate and buffer the parameters for the call. This also increases + * call_size with the amount of extra space needed for page lists. + */ + ret = hgcm_call_preprocess(parms, parm_count, &bounce_bufs, &size); + if (ret) { + /* Even on error bounce bufs may still have been allocated */ + goto free_bounce_bufs; + } + + call = vbg_req_alloc(size, VMMDEVREQ_HGCM_CALL, requestor); + if (!call) { + ret = -ENOMEM; + goto free_bounce_bufs; + } + + hgcm_call_init_call(call, client_id, function, parms, parm_count, + bounce_bufs); + + ret = vbg_hgcm_do_call(gdev, call, timeout_ms, + requestor & VMMDEV_REQUESTOR_USERMODE, &leak_it); + if (ret == 0) { + *vbox_status = call->header.result; + ret = hgcm_call_copy_back_result(call, parms, parm_count, + bounce_bufs); + } + + if (!leak_it) + vbg_req_free(call, size); + +free_bounce_bufs: + if (bounce_bufs) { + for (i = 0; i < parm_count; i++) + kvfree(bounce_bufs[i]); + kfree(bounce_bufs); + } + + return ret; +} +EXPORT_SYMBOL(vbg_hgcm_call); + +#ifdef CONFIG_COMPAT +int vbg_hgcm_call32( + struct vbg_dev *gdev, u32 requestor, u32 client_id, u32 function, + u32 timeout_ms, struct vmmdev_hgcm_function_parameter32 *parm32, + u32 parm_count, int *vbox_status) +{ + struct vmmdev_hgcm_function_parameter *parm64 = NULL; + u32 i, size; + int ret = 0; + + /* KISS allocate a temporary request and convert the parameters. */ + size = parm_count * sizeof(struct vmmdev_hgcm_function_parameter); + parm64 = kzalloc(size, GFP_KERNEL); + if (!parm64) + return -ENOMEM; + + for (i = 0; i < parm_count; i++) { + switch (parm32[i].type) { + case VMMDEV_HGCM_PARM_TYPE_32BIT: + parm64[i].type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parm64[i].u.value32 = parm32[i].u.value32; + break; + + case VMMDEV_HGCM_PARM_TYPE_64BIT: + parm64[i].type = VMMDEV_HGCM_PARM_TYPE_64BIT; + parm64[i].u.value64 = parm32[i].u.value64; + break; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR_OUT: + case VMMDEV_HGCM_PARM_TYPE_LINADDR: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_IN: + parm64[i].type = parm32[i].type; + parm64[i].u.pointer.size = parm32[i].u.pointer.size; + parm64[i].u.pointer.u.linear_addr = + parm32[i].u.pointer.u.linear_addr; + break; + + default: + ret = -EINVAL; + } + if (ret < 0) + goto out_free; + } + + ret = vbg_hgcm_call(gdev, requestor, client_id, function, timeout_ms, + parm64, parm_count, vbox_status); + if (ret < 0) + goto out_free; + + /* Copy back. */ + for (i = 0; i < parm_count; i++, parm32++, parm64++) { + switch (parm64[i].type) { + case VMMDEV_HGCM_PARM_TYPE_32BIT: + parm32[i].u.value32 = parm64[i].u.value32; + break; + + case VMMDEV_HGCM_PARM_TYPE_64BIT: + parm32[i].u.value64 = parm64[i].u.value64; + break; + + case VMMDEV_HGCM_PARM_TYPE_LINADDR_OUT: + case VMMDEV_HGCM_PARM_TYPE_LINADDR: + case VMMDEV_HGCM_PARM_TYPE_LINADDR_IN: + parm32[i].u.pointer.size = parm64[i].u.pointer.size; + break; + + default: + WARN_ON(1); + ret = -EINVAL; + } + } + +out_free: + kfree(parm64); + return ret; +} +#endif + +static const int vbg_status_code_to_errno_table[] = { + [-VERR_ACCESS_DENIED] = -EPERM, + [-VERR_FILE_NOT_FOUND] = -ENOENT, + [-VERR_PROCESS_NOT_FOUND] = -ESRCH, + [-VERR_INTERRUPTED] = -EINTR, + [-VERR_DEV_IO_ERROR] = -EIO, + [-VERR_TOO_MUCH_DATA] = -E2BIG, + [-VERR_BAD_EXE_FORMAT] = -ENOEXEC, + [-VERR_INVALID_HANDLE] = -EBADF, + [-VERR_TRY_AGAIN] = -EAGAIN, + [-VERR_NO_MEMORY] = -ENOMEM, + [-VERR_INVALID_POINTER] = -EFAULT, + [-VERR_RESOURCE_BUSY] = -EBUSY, + [-VERR_ALREADY_EXISTS] = -EEXIST, + [-VERR_NOT_SAME_DEVICE] = -EXDEV, + [-VERR_NOT_A_DIRECTORY] = -ENOTDIR, + [-VERR_PATH_NOT_FOUND] = -ENOTDIR, + [-VERR_INVALID_NAME] = -ENOENT, + [-VERR_IS_A_DIRECTORY] = -EISDIR, + [-VERR_INVALID_PARAMETER] = -EINVAL, + [-VERR_TOO_MANY_OPEN_FILES] = -ENFILE, + [-VERR_INVALID_FUNCTION] = -ENOTTY, + [-VERR_SHARING_VIOLATION] = -ETXTBSY, + [-VERR_FILE_TOO_BIG] = -EFBIG, + [-VERR_DISK_FULL] = -ENOSPC, + [-VERR_SEEK_ON_DEVICE] = -ESPIPE, + [-VERR_WRITE_PROTECT] = -EROFS, + [-VERR_BROKEN_PIPE] = -EPIPE, + [-VERR_DEADLOCK] = -EDEADLK, + [-VERR_FILENAME_TOO_LONG] = -ENAMETOOLONG, + [-VERR_FILE_LOCK_FAILED] = -ENOLCK, + [-VERR_NOT_IMPLEMENTED] = -ENOSYS, + [-VERR_NOT_SUPPORTED] = -ENOSYS, + [-VERR_DIR_NOT_EMPTY] = -ENOTEMPTY, + [-VERR_TOO_MANY_SYMLINKS] = -ELOOP, + [-VERR_NO_MORE_FILES] = -ENODATA, + [-VERR_NO_DATA] = -ENODATA, + [-VERR_NET_NO_NETWORK] = -ENONET, + [-VERR_NET_NOT_UNIQUE_NAME] = -ENOTUNIQ, + [-VERR_NO_TRANSLATION] = -EILSEQ, + [-VERR_NET_NOT_SOCKET] = -ENOTSOCK, + [-VERR_NET_DEST_ADDRESS_REQUIRED] = -EDESTADDRREQ, + [-VERR_NET_MSG_SIZE] = -EMSGSIZE, + [-VERR_NET_PROTOCOL_TYPE] = -EPROTOTYPE, + [-VERR_NET_PROTOCOL_NOT_AVAILABLE] = -ENOPROTOOPT, + [-VERR_NET_PROTOCOL_NOT_SUPPORTED] = -EPROTONOSUPPORT, + [-VERR_NET_SOCKET_TYPE_NOT_SUPPORTED] = -ESOCKTNOSUPPORT, + [-VERR_NET_OPERATION_NOT_SUPPORTED] = -EOPNOTSUPP, + [-VERR_NET_PROTOCOL_FAMILY_NOT_SUPPORTED] = -EPFNOSUPPORT, + [-VERR_NET_ADDRESS_FAMILY_NOT_SUPPORTED] = -EAFNOSUPPORT, + [-VERR_NET_ADDRESS_IN_USE] = -EADDRINUSE, + [-VERR_NET_ADDRESS_NOT_AVAILABLE] = -EADDRNOTAVAIL, + [-VERR_NET_DOWN] = -ENETDOWN, + [-VERR_NET_UNREACHABLE] = -ENETUNREACH, + [-VERR_NET_CONNECTION_RESET] = -ENETRESET, + [-VERR_NET_CONNECTION_ABORTED] = -ECONNABORTED, + [-VERR_NET_CONNECTION_RESET_BY_PEER] = -ECONNRESET, + [-VERR_NET_NO_BUFFER_SPACE] = -ENOBUFS, + [-VERR_NET_ALREADY_CONNECTED] = -EISCONN, + [-VERR_NET_NOT_CONNECTED] = -ENOTCONN, + [-VERR_NET_SHUTDOWN] = -ESHUTDOWN, + [-VERR_NET_TOO_MANY_REFERENCES] = -ETOOMANYREFS, + [-VERR_TIMEOUT] = -ETIMEDOUT, + [-VERR_NET_CONNECTION_REFUSED] = -ECONNREFUSED, + [-VERR_NET_HOST_DOWN] = -EHOSTDOWN, + [-VERR_NET_HOST_UNREACHABLE] = -EHOSTUNREACH, + [-VERR_NET_ALREADY_IN_PROGRESS] = -EALREADY, + [-VERR_NET_IN_PROGRESS] = -EINPROGRESS, + [-VERR_MEDIA_NOT_PRESENT] = -ENOMEDIUM, + [-VERR_MEDIA_NOT_RECOGNIZED] = -EMEDIUMTYPE, +}; + +int vbg_status_code_to_errno(int rc) +{ + if (rc >= 0) + return 0; + + rc = -rc; + if (rc >= ARRAY_SIZE(vbg_status_code_to_errno_table) || + vbg_status_code_to_errno_table[rc] == 0) { + vbg_warn("%s: Unhandled err %d\n", __func__, -rc); + return -EPROTO; + } + + return vbg_status_code_to_errno_table[rc]; +} +EXPORT_SYMBOL(vbg_status_code_to_errno); diff --git a/drivers/virt/vboxguest/vboxguest_version.h b/drivers/virt/vboxguest/vboxguest_version.h new file mode 100644 index 000000000..84834dad3 --- /dev/null +++ b/drivers/virt/vboxguest/vboxguest_version.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR CDDL-1.0) */ +/* + * VBox Guest additions version info, this is used by the host to determine + * supported guest-addition features in some cases. So this will need to be + * synced with vbox upstreams versioning scheme when we implement / port + * new features from the upstream out-of-tree vboxguest driver. + */ + +#ifndef __VBOX_VERSION_H__ +#define __VBOX_VERSION_H__ + +#define VBG_VERSION_MAJOR 6 +#define VBG_VERSION_MINOR 0 +#define VBG_VERSION_BUILD 0 +#define VBG_SVN_REV 127566 +#define VBG_VERSION_STRING "6.0.0" + +#endif diff --git a/drivers/virt/vboxguest/vmmdev.h b/drivers/virt/vboxguest/vmmdev.h new file mode 100644 index 000000000..21f408120 --- /dev/null +++ b/drivers/virt/vboxguest/vmmdev.h @@ -0,0 +1,453 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR CDDL-1.0) */ +/* + * Virtual Device for Guest <-> VMM/Host communication interface + * + * Copyright (C) 2006-2016 Oracle Corporation + */ + +#ifndef __VBOX_VMMDEV_H__ +#define __VBOX_VMMDEV_H__ + +#include <asm/bitsperlong.h> +#include <linux/sizes.h> +#include <linux/types.h> +#include <linux/vbox_vmmdev_types.h> + +/* Port for generic request interface (relative offset). */ +#define VMMDEV_PORT_OFF_REQUEST 0 + +/** Layout of VMMDEV RAM region that contains information for guest. */ +struct vmmdev_memory { + /** The size of this structure. */ + u32 size; + /** The structure version. (VMMDEV_MEMORY_VERSION) */ + u32 version; + + union { + struct { + /** Flag telling that VMMDev has events pending. */ + u8 have_events; + /** Explicit padding, MBZ. */ + u8 padding[3]; + } V1_04; + + struct { + /** Pending events flags, set by host. */ + u32 host_events; + /** Mask of events the guest wants, set by guest. */ + u32 guest_event_mask; + } V1_03; + } V; + + /* struct vbva_memory, not used */ +}; +VMMDEV_ASSERT_SIZE(vmmdev_memory, 8 + 8); + +/** Version of vmmdev_memory structure (vmmdev_memory::version). */ +#define VMMDEV_MEMORY_VERSION (1) + +/* Host mouse capabilities has been changed. */ +#define VMMDEV_EVENT_MOUSE_CAPABILITIES_CHANGED BIT(0) +/* HGCM event. */ +#define VMMDEV_EVENT_HGCM BIT(1) +/* A display change request has been issued. */ +#define VMMDEV_EVENT_DISPLAY_CHANGE_REQUEST BIT(2) +/* Credentials are available for judgement. */ +#define VMMDEV_EVENT_JUDGE_CREDENTIALS BIT(3) +/* The guest has been restored. */ +#define VMMDEV_EVENT_RESTORED BIT(4) +/* Seamless mode state changed. */ +#define VMMDEV_EVENT_SEAMLESS_MODE_CHANGE_REQUEST BIT(5) +/* Memory balloon size changed. */ +#define VMMDEV_EVENT_BALLOON_CHANGE_REQUEST BIT(6) +/* Statistics interval changed. */ +#define VMMDEV_EVENT_STATISTICS_INTERVAL_CHANGE_REQUEST BIT(7) +/* VRDP status changed. */ +#define VMMDEV_EVENT_VRDP BIT(8) +/* New mouse position data available. */ +#define VMMDEV_EVENT_MOUSE_POSITION_CHANGED BIT(9) +/* CPU hotplug event occurred. */ +#define VMMDEV_EVENT_CPU_HOTPLUG BIT(10) +/* The mask of valid events, for sanity checking. */ +#define VMMDEV_EVENT_VALID_EVENT_MASK 0x000007ffU + +/* + * Additions are allowed to work only if additions_major == vmmdev_current && + * additions_minor <= vmmdev_current. Additions version is reported to host + * (VMMDev) by VMMDEVREQ_REPORT_GUEST_INFO. + */ +#define VMMDEV_VERSION 0x00010004 +#define VMMDEV_VERSION_MAJOR (VMMDEV_VERSION >> 16) +#define VMMDEV_VERSION_MINOR (VMMDEV_VERSION & 0xffff) + +/* Maximum request packet size. */ +#define VMMDEV_MAX_VMMDEVREQ_SIZE 1048576 + +/* Version of vmmdev_request_header structure. */ +#define VMMDEV_REQUEST_HEADER_VERSION 0x10001 + +/** struct vmmdev_request_header - Generic VMMDev request header. */ +struct vmmdev_request_header { + /** IN: Size of the structure in bytes (including body). */ + u32 size; + /** IN: Version of the structure. */ + u32 version; + /** IN: Type of the request. */ + enum vmmdev_request_type request_type; + /** OUT: Return code. */ + s32 rc; + /** Reserved field no.1. MBZ. */ + u32 reserved1; + /** IN: Requestor information (VMMDEV_REQUESTOR_*) */ + u32 requestor; +}; +VMMDEV_ASSERT_SIZE(vmmdev_request_header, 24); + +/** + * struct vmmdev_mouse_status - Mouse status request structure. + * + * Used by VMMDEVREQ_GET_MOUSE_STATUS and VMMDEVREQ_SET_MOUSE_STATUS. + */ +struct vmmdev_mouse_status { + /** header */ + struct vmmdev_request_header header; + /** Mouse feature mask. See VMMDEV_MOUSE_*. */ + u32 mouse_features; + /** Mouse x position. */ + s32 pointer_pos_x; + /** Mouse y position. */ + s32 pointer_pos_y; +}; +VMMDEV_ASSERT_SIZE(vmmdev_mouse_status, 24 + 12); + +/* The guest can (== wants to) handle absolute coordinates. */ +#define VMMDEV_MOUSE_GUEST_CAN_ABSOLUTE BIT(0) +/* + * The host can (== wants to) send absolute coordinates. + * (Input not captured.) + */ +#define VMMDEV_MOUSE_HOST_WANTS_ABSOLUTE BIT(1) +/* + * The guest can *NOT* switch to software cursor and therefore depends on the + * host cursor. + * + * When guest additions are installed and the host has promised to display the + * cursor itself, the guest installs a hardware mouse driver. Don't ask the + * guest to switch to a software cursor then. + */ +#define VMMDEV_MOUSE_GUEST_NEEDS_HOST_CURSOR BIT(2) +/* The host does NOT provide support for drawing the cursor itself. */ +#define VMMDEV_MOUSE_HOST_CANNOT_HWPOINTER BIT(3) +/* The guest can read VMMDev events to find out about pointer movement */ +#define VMMDEV_MOUSE_NEW_PROTOCOL BIT(4) +/* + * If the guest changes the status of the VMMDEV_MOUSE_GUEST_NEEDS_HOST_CURSOR + * bit, the host will honour this. + */ +#define VMMDEV_MOUSE_HOST_RECHECKS_NEEDS_HOST_CURSOR BIT(5) +/* + * The host supplies an absolute pointing device. The Guest Additions may + * wish to use this to decide whether to install their own driver. + */ +#define VMMDEV_MOUSE_HOST_HAS_ABS_DEV BIT(6) + +/* The minimum value our pointing device can return. */ +#define VMMDEV_MOUSE_RANGE_MIN 0 +/* The maximum value our pointing device can return. */ +#define VMMDEV_MOUSE_RANGE_MAX 0xFFFF + +/** + * struct vmmdev_host_version - VirtualBox host version request structure. + * + * VBG uses this to detect the precense of new features in the interface. + */ +struct vmmdev_host_version { + /** Header. */ + struct vmmdev_request_header header; + /** Major version. */ + u16 major; + /** Minor version. */ + u16 minor; + /** Build number. */ + u32 build; + /** SVN revision. */ + u32 revision; + /** Feature mask. */ + u32 features; +}; +VMMDEV_ASSERT_SIZE(vmmdev_host_version, 24 + 16); + +/* Physical page lists are supported by HGCM. */ +#define VMMDEV_HVF_HGCM_PHYS_PAGE_LIST BIT(0) + +/** + * struct vmmdev_mask - Structure to set / clear bits in a mask used for + * VMMDEVREQ_SET_GUEST_CAPABILITIES and VMMDEVREQ_CTL_GUEST_FILTER_MASK. + */ +struct vmmdev_mask { + /** Header. */ + struct vmmdev_request_header header; + /** Mask of bits to be set. */ + u32 or_mask; + /** Mask of bits to be cleared. */ + u32 not_mask; +}; +VMMDEV_ASSERT_SIZE(vmmdev_mask, 24 + 8); + +/* The guest supports seamless display rendering. */ +#define VMMDEV_GUEST_SUPPORTS_SEAMLESS BIT(0) +/* The guest supports mapping guest to host windows. */ +#define VMMDEV_GUEST_SUPPORTS_GUEST_HOST_WINDOW_MAPPING BIT(1) +/* + * The guest graphical additions are active. + * Used for fast activation and deactivation of certain graphical operations + * (e.g. resizing & seamless). The legacy VMMDEVREQ_REPORT_GUEST_CAPABILITIES + * request sets this automatically, but VMMDEVREQ_SET_GUEST_CAPABILITIES does + * not. + */ +#define VMMDEV_GUEST_SUPPORTS_GRAPHICS BIT(2) +/* The mask of valid capabilities, for sanity checking. */ +#define VMMDEV_GUEST_CAPABILITIES_MASK 0x00000007U + +/** struct vmmdev_hypervisorinfo - Hypervisor info structure. */ +struct vmmdev_hypervisorinfo { + /** Header. */ + struct vmmdev_request_header header; + /** + * Guest virtual address of proposed hypervisor start. + * Not used by VMMDEVREQ_GET_HYPERVISOR_INFO. + */ + u32 hypervisor_start; + /** Hypervisor size in bytes. */ + u32 hypervisor_size; +}; +VMMDEV_ASSERT_SIZE(vmmdev_hypervisorinfo, 24 + 8); + +/** struct vmmdev_events - Pending events structure. */ +struct vmmdev_events { + /** Header. */ + struct vmmdev_request_header header; + /** OUT: Pending event mask. */ + u32 events; +}; +VMMDEV_ASSERT_SIZE(vmmdev_events, 24 + 4); + +#define VMMDEV_OSTYPE_LINUX26 0x53000 +#define VMMDEV_OSTYPE_X64 BIT(8) + +/** struct vmmdev_guestinfo - Guest information report. */ +struct vmmdev_guest_info { + /** Header. */ + struct vmmdev_request_header header; + /** + * The VMMDev interface version expected by additions. + * *Deprecated*, do not use anymore! Will be removed. + */ + u32 interface_version; + /** Guest OS type. */ + u32 os_type; +}; +VMMDEV_ASSERT_SIZE(vmmdev_guest_info, 24 + 8); + +#define VMMDEV_GUEST_INFO2_ADDITIONS_FEATURES_REQUESTOR_INFO BIT(0) + +/** struct vmmdev_guestinfo2 - Guest information report, version 2. */ +struct vmmdev_guest_info2 { + /** Header. */ + struct vmmdev_request_header header; + /** Major version. */ + u16 additions_major; + /** Minor version. */ + u16 additions_minor; + /** Build number. */ + u32 additions_build; + /** SVN revision. */ + u32 additions_revision; + /** Feature mask. */ + u32 additions_features; + /** + * The intentional meaning of this field was: + * Some additional information, for example 'Beta 1' or something like + * that. + * + * The way it was implemented was implemented: VBG_VERSION_STRING. + * + * This means the first three members are duplicated in this field (if + * the guest build config is sane). So, the user must check this and + * chop it off before usage. There is, because of the Main code's blind + * trust in the field's content, no way back. + */ + char name[128]; +}; +VMMDEV_ASSERT_SIZE(vmmdev_guest_info2, 24 + 144); + +enum vmmdev_guest_facility_type { + VBOXGUEST_FACILITY_TYPE_UNKNOWN = 0, + VBOXGUEST_FACILITY_TYPE_VBOXGUEST_DRIVER = 20, + /* VBoxGINA / VBoxCredProv / pam_vbox. */ + VBOXGUEST_FACILITY_TYPE_AUTO_LOGON = 90, + VBOXGUEST_FACILITY_TYPE_VBOX_SERVICE = 100, + /* VBoxTray (Windows), VBoxClient (Linux, Unix). */ + VBOXGUEST_FACILITY_TYPE_VBOX_TRAY_CLIENT = 101, + VBOXGUEST_FACILITY_TYPE_SEAMLESS = 1000, + VBOXGUEST_FACILITY_TYPE_GRAPHICS = 1100, + VBOXGUEST_FACILITY_TYPE_ALL = 0x7ffffffe, + /* Ensure the enum is a 32 bit data-type */ + VBOXGUEST_FACILITY_TYPE_SIZEHACK = 0x7fffffff +}; + +enum vmmdev_guest_facility_status { + VBOXGUEST_FACILITY_STATUS_INACTIVE = 0, + VBOXGUEST_FACILITY_STATUS_PAUSED = 1, + VBOXGUEST_FACILITY_STATUS_PRE_INIT = 20, + VBOXGUEST_FACILITY_STATUS_INIT = 30, + VBOXGUEST_FACILITY_STATUS_ACTIVE = 50, + VBOXGUEST_FACILITY_STATUS_TERMINATING = 100, + VBOXGUEST_FACILITY_STATUS_TERMINATED = 101, + VBOXGUEST_FACILITY_STATUS_FAILED = 800, + VBOXGUEST_FACILITY_STATUS_UNKNOWN = 999, + /* Ensure the enum is a 32 bit data-type */ + VBOXGUEST_FACILITY_STATUS_SIZEHACK = 0x7fffffff +}; + +/** struct vmmdev_guest_status - Guest Additions status structure. */ +struct vmmdev_guest_status { + /** Header. */ + struct vmmdev_request_header header; + /** Facility the status is indicated for. */ + enum vmmdev_guest_facility_type facility; + /** Current guest status. */ + enum vmmdev_guest_facility_status status; + /** Flags, not used at the moment. */ + u32 flags; +}; +VMMDEV_ASSERT_SIZE(vmmdev_guest_status, 24 + 12); + +#define VMMDEV_MEMORY_BALLOON_CHUNK_SIZE (1048576) +#define VMMDEV_MEMORY_BALLOON_CHUNK_PAGES (1048576 / 4096) + +/** struct vmmdev_memballoon_info - Memory-balloon info structure. */ +struct vmmdev_memballoon_info { + /** Header. */ + struct vmmdev_request_header header; + /** Balloon size in megabytes. */ + u32 balloon_chunks; + /** Guest ram size in megabytes. */ + u32 phys_mem_chunks; + /** + * Setting this to VMMDEV_EVENT_BALLOON_CHANGE_REQUEST indicates that + * the request is a response to that event. + * (Don't confuse this with VMMDEVREQ_ACKNOWLEDGE_EVENTS.) + */ + u32 event_ack; +}; +VMMDEV_ASSERT_SIZE(vmmdev_memballoon_info, 24 + 12); + +/** struct vmmdev_memballoon_change - Change the size of the balloon. */ +struct vmmdev_memballoon_change { + /** Header. */ + struct vmmdev_request_header header; + /** The number of pages in the array. */ + u32 pages; + /** true = inflate, false = deflate. */ + u32 inflate; + /** Physical address (u64) of each page. */ + u64 phys_page[VMMDEV_MEMORY_BALLOON_CHUNK_PAGES]; +}; + +/** struct vmmdev_write_core_dump - Write Core Dump request data. */ +struct vmmdev_write_core_dump { + /** Header. */ + struct vmmdev_request_header header; + /** Flags (reserved, MBZ). */ + u32 flags; +}; +VMMDEV_ASSERT_SIZE(vmmdev_write_core_dump, 24 + 4); + +/** struct vmmdev_heartbeat - Heart beat check state structure. */ +struct vmmdev_heartbeat { + /** Header. */ + struct vmmdev_request_header header; + /** OUT: Guest heartbeat interval in nanosec. */ + u64 interval_ns; + /** Heartbeat check flag. */ + u8 enabled; + /** Explicit padding, MBZ. */ + u8 padding[3]; +} __packed; +VMMDEV_ASSERT_SIZE(vmmdev_heartbeat, 24 + 12); + +#define VMMDEV_HGCM_REQ_DONE BIT(0) +#define VMMDEV_HGCM_REQ_CANCELLED BIT(1) + +/** struct vmmdev_hgcmreq_header - vmmdev HGCM requests header. */ +struct vmmdev_hgcmreq_header { + /** Request header. */ + struct vmmdev_request_header header; + + /** HGCM flags. */ + u32 flags; + + /** Result code. */ + s32 result; +}; +VMMDEV_ASSERT_SIZE(vmmdev_hgcmreq_header, 24 + 8); + +/** struct vmmdev_hgcm_connect - HGCM connect request structure. */ +struct vmmdev_hgcm_connect { + /** HGCM request header. */ + struct vmmdev_hgcmreq_header header; + + /** IN: Description of service to connect to. */ + struct vmmdev_hgcm_service_location loc; + + /** OUT: Client identifier assigned by local instance of HGCM. */ + u32 client_id; +}; +VMMDEV_ASSERT_SIZE(vmmdev_hgcm_connect, 32 + 132 + 4); + +/** struct vmmdev_hgcm_disconnect - HGCM disconnect request structure. */ +struct vmmdev_hgcm_disconnect { + /** HGCM request header. */ + struct vmmdev_hgcmreq_header header; + + /** IN: Client identifier. */ + u32 client_id; +}; +VMMDEV_ASSERT_SIZE(vmmdev_hgcm_disconnect, 32 + 4); + +#define VMMDEV_HGCM_MAX_PARMS 32 + +/** struct vmmdev_hgcm_call - HGCM call request structure. */ +struct vmmdev_hgcm_call { + /* request header */ + struct vmmdev_hgcmreq_header header; + + /** IN: Client identifier. */ + u32 client_id; + /** IN: Service function number. */ + u32 function; + /** IN: Number of parameters. */ + u32 parm_count; + /** Parameters follow in form: HGCMFunctionParameter32|64 parms[X]; */ +}; +VMMDEV_ASSERT_SIZE(vmmdev_hgcm_call, 32 + 12); + +/** + * struct vmmdev_hgcm_cancel2 - HGCM cancel request structure, version 2. + * + * After the request header.rc will be: + * + * VINF_SUCCESS when cancelled. + * VERR_NOT_FOUND if the specified request cannot be found. + * VERR_INVALID_PARAMETER if the address is invalid valid. + */ +struct vmmdev_hgcm_cancel2 { + /** Header. */ + struct vmmdev_request_header header; + /** The physical address of the request to cancel. */ + u32 phys_req_to_cancel; +}; +VMMDEV_ASSERT_SIZE(vmmdev_hgcm_cancel2, 24 + 4); + +#endif diff --git a/drivers/virt/vmgenid.c b/drivers/virt/vmgenid.c new file mode 100644 index 000000000..a1c467a0e --- /dev/null +++ b/drivers/virt/vmgenid.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * + * The "Virtual Machine Generation ID" is exposed via ACPI and changes when a + * virtual machine forks or is cloned. This driver exists for shepherding that + * information to random.c. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/acpi.h> +#include <linux/random.h> + +ACPI_MODULE_NAME("vmgenid"); + +enum { VMGENID_SIZE = 16 }; + +struct vmgenid_state { + u8 *next_id; + u8 this_id[VMGENID_SIZE]; +}; + +static int vmgenid_add(struct acpi_device *device) +{ + struct acpi_buffer parsed = { ACPI_ALLOCATE_BUFFER }; + struct vmgenid_state *state; + union acpi_object *obj; + phys_addr_t phys_addr; + acpi_status status; + int ret = 0; + + state = devm_kmalloc(&device->dev, sizeof(*state), GFP_KERNEL); + if (!state) + return -ENOMEM; + + status = acpi_evaluate_object(device->handle, "ADDR", NULL, &parsed); + if (ACPI_FAILURE(status)) { + ACPI_EXCEPTION((AE_INFO, status, "Evaluating ADDR")); + return -ENODEV; + } + obj = parsed.pointer; + if (!obj || obj->type != ACPI_TYPE_PACKAGE || obj->package.count != 2 || + obj->package.elements[0].type != ACPI_TYPE_INTEGER || + obj->package.elements[1].type != ACPI_TYPE_INTEGER) { + ret = -EINVAL; + goto out; + } + + phys_addr = (obj->package.elements[0].integer.value << 0) | + (obj->package.elements[1].integer.value << 32); + state->next_id = devm_memremap(&device->dev, phys_addr, VMGENID_SIZE, MEMREMAP_WB); + if (IS_ERR(state->next_id)) { + ret = PTR_ERR(state->next_id); + goto out; + } + + memcpy(state->this_id, state->next_id, sizeof(state->this_id)); + add_device_randomness(state->this_id, sizeof(state->this_id)); + + device->driver_data = state; + +out: + ACPI_FREE(parsed.pointer); + return ret; +} + +static void vmgenid_notify(struct acpi_device *device, u32 event) +{ + struct vmgenid_state *state = acpi_driver_data(device); + u8 old_id[VMGENID_SIZE]; + + memcpy(old_id, state->this_id, sizeof(old_id)); + memcpy(state->this_id, state->next_id, sizeof(state->this_id)); + if (!memcmp(old_id, state->this_id, sizeof(old_id))) + return; + add_vmfork_randomness(state->this_id, sizeof(state->this_id)); +} + +static const struct acpi_device_id vmgenid_ids[] = { + { "VMGENCTR", 0 }, + { "VM_GEN_COUNTER", 0 }, + { } +}; + +static struct acpi_driver vmgenid_driver = { + .name = "vmgenid", + .ids = vmgenid_ids, + .owner = THIS_MODULE, + .ops = { + .add = vmgenid_add, + .notify = vmgenid_notify + } +}; + +module_acpi_driver(vmgenid_driver); + +MODULE_DEVICE_TABLE(acpi, vmgenid_ids); +MODULE_DESCRIPTION("Virtual Machine Generation ID"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); |