Adding upstream version 6.6.15.upstream/6.6.15

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-11 08:27:49 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-11 08:27:49 +0000
commit: ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch)
tree: b2d64bc10158fdd5497876388cd68142ca374ed3 /drivers/vfio
parent: Initial commit. (diff)
download: linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz
linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip
71 files changed, 27308 insertions, 0 deletions
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
new file mode 100644
index 0000000000..6bda6dbb48
--- /dev/null
+++ b/drivers/vfio/Kconfig
@@ -0,0 +1,90 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menuconfig VFIO
+	tristate "VFIO Non-Privileged userspace driver framework"
+	select IOMMU_API
+	depends on IOMMUFD || !IOMMUFD
+	select INTERVAL_TREE
+	select VFIO_GROUP if SPAPR_TCE_IOMMU || IOMMUFD=n
+	select VFIO_DEVICE_CDEV if !VFIO_GROUP
+	select VFIO_CONTAINER if IOMMUFD=n
+	help
+	  VFIO provides a framework for secure userspace device drivers.
+	  See Documentation/driver-api/vfio.rst for more details.
+
+	  If you don't know what to do here, say N.
+
+if VFIO
+config VFIO_DEVICE_CDEV
+	bool "Support for the VFIO cdev /dev/vfio/devices/vfioX"
+	depends on IOMMUFD && !SPAPR_TCE_IOMMU
+	default !VFIO_GROUP
+	help
+	  The VFIO device cdev is another way for userspace to get device
+	  access. Userspace gets device fd by opening device cdev under
+	  /dev/vfio/devices/vfioX, and then bind the device fd with an iommufd
+	  to set up secure DMA context for device access.  This interface does
+	  not support noiommu.
+
+	  If you don't know what to do here, say N.
+
+config VFIO_GROUP
+	bool "Support for the VFIO group /dev/vfio/$group_id"
+	default y
+	help
+	   VFIO group support provides the traditional model for accessing
+	   devices through VFIO and is used by the majority of userspace
+	   applications and drivers making use of VFIO.
+
+	   If you don't know what to do here, say Y.
+
+config VFIO_CONTAINER
+	bool "Support for the VFIO container /dev/vfio/vfio"
+	select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64)
+	depends on VFIO_GROUP
+	default y
+	help
+	  The VFIO container is the classic interface to VFIO for establishing
+	  IOMMU mappings. If N is selected here then IOMMUFD must be used to
+	  manage the mappings.
+
+	  Unless testing IOMMUFD say Y here.
+
+if VFIO_CONTAINER
+config VFIO_IOMMU_TYPE1
+	tristate
+	default n
+
+config VFIO_IOMMU_SPAPR_TCE
+	tristate
+	depends on SPAPR_TCE_IOMMU
+	default VFIO
+endif
+
+config VFIO_NOIOMMU
+	bool "VFIO No-IOMMU support"
+	depends on VFIO_GROUP
+	help
+	  VFIO is built on the ability to isolate devices using the IOMMU.
+	  Only with an IOMMU can userspace access to DMA capable devices be
+	  considered secure.  VFIO No-IOMMU mode enables IOMMU groups for
+	  devices without IOMMU backing for the purpose of re-using the VFIO
+	  infrastructure in a non-secure mode.  Use of this mode will result
+	  in an unsupportable kernel and will therefore taint the kernel.
+	  Device assignment to virtual machines is also not possible with
+	  this mode since there is no IOMMU to provide DMA translation.
+
+	  If you don't know what to do here, say N.
+
+config VFIO_VIRQFD
+	bool
+	select EVENTFD
+	default n
+
+source "drivers/vfio/pci/Kconfig"
+source "drivers/vfio/platform/Kconfig"
+source "drivers/vfio/mdev/Kconfig"
+source "drivers/vfio/fsl-mc/Kconfig"
+source "drivers/vfio/cdx/Kconfig"
+endif
+
+source "virt/lib/Kconfig"
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
new file mode 100644
index 0000000000..c82ea032d3
--- /dev/null
+++ b/drivers/vfio/Makefile
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_VFIO) += vfio.o
+
+vfio-y += vfio_main.o \
+	  iova_bitmap.o
+vfio-$(CONFIG_VFIO_DEVICE_CDEV) += device_cdev.o
+vfio-$(CONFIG_VFIO_GROUP) += group.o
+vfio-$(CONFIG_IOMMUFD) += iommufd.o
+vfio-$(CONFIG_VFIO_CONTAINER) += container.o
+vfio-$(CONFIG_VFIO_VIRQFD) += virqfd.o
+
+obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
+obj-$(CONFIG_VFIO_PCI_CORE) += pci/
+obj-$(CONFIG_VFIO_PLATFORM_BASE) += platform/
+obj-$(CONFIG_VFIO_MDEV) += mdev/
+obj-$(CONFIG_VFIO_FSL_MC) += fsl-mc/
+obj-$(CONFIG_VFIO_CDX) += cdx/
diff --git a/drivers/vfio/cdx/Kconfig b/drivers/vfio/cdx/Kconfig
new file mode 100644
index 0000000000..e6de0a0caa
--- /dev/null
+++ b/drivers/vfio/cdx/Kconfig
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# VFIO CDX configuration
+#
+# Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
+#
+
+config VFIO_CDX
+	tristate "VFIO support for CDX bus devices"
+	depends on CDX_BUS
+	select EVENTFD
+	help
+	  Driver to enable VFIO support for the devices on CDX bus.
+	  This is required to make use of CDX devices present in
+	  the system using the VFIO framework.
+
+	  If you don't know what to do here, say N.
diff --git a/drivers/vfio/cdx/Makefile b/drivers/vfio/cdx/Makefile
new file mode 100644
index 0000000000..cd4a2e6fe6
--- /dev/null
+++ b/drivers/vfio/cdx/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
+#
+
+obj-$(CONFIG_VFIO_CDX) += vfio-cdx.o
+
+vfio-cdx-objs := main.o
diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c
new file mode 100644
index 0000000000..de56686581
--- /dev/null
+++ b/drivers/vfio/cdx/main.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
+ */
+
+#include <linux/vfio.h>
+#include <linux/cdx/cdx_bus.h>
+
+#include "private.h"
+
+static int vfio_cdx_open_device(struct vfio_device *core_vdev)
+{
+	struct vfio_cdx_device *vdev =
+		container_of(core_vdev, struct vfio_cdx_device, vdev);
+	struct cdx_device *cdx_dev = to_cdx_device(core_vdev->dev);
+	int count = cdx_dev->res_count;
+	int i;
+
+	vdev->regions = kcalloc(count, sizeof(struct vfio_cdx_region),
+				GFP_KERNEL_ACCOUNT);
+	if (!vdev->regions)
+		return -ENOMEM;
+
+	for (i = 0; i < count; i++) {
+		struct resource *res = &cdx_dev->res[i];
+
+		vdev->regions[i].addr = res->start;
+		vdev->regions[i].size = resource_size(res);
+		vdev->regions[i].type = res->flags;
+		/*
+		 * Only regions addressed with PAGE granularity may be
+		 * MMAP'ed securely.
+		 */
+		if (!(vdev->regions[i].addr & ~PAGE_MASK) &&
+		    !(vdev->regions[i].size & ~PAGE_MASK))
+			vdev->regions[i].flags |=
+					VFIO_REGION_INFO_FLAG_MMAP;
+		vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_READ;
+		if (!(cdx_dev->res[i].flags & IORESOURCE_READONLY))
+			vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_WRITE;
+	}
+
+	return 0;
+}
+
+static void vfio_cdx_close_device(struct vfio_device *core_vdev)
+{
+	struct vfio_cdx_device *vdev =
+		container_of(core_vdev, struct vfio_cdx_device, vdev);
+
+	kfree(vdev->regions);
+	cdx_dev_reset(core_vdev->dev);
+}
+
+static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev,
+				   struct vfio_device_info __user *arg)
+{
+	unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs);
+	struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev);
+	struct vfio_device_info info;
+
+	if (copy_from_user(&info, arg, minsz))
+		return -EFAULT;
+
+	if (info.argsz < minsz)
+		return -EINVAL;
+
+	info.flags = VFIO_DEVICE_FLAGS_CDX;
+	info.flags |= VFIO_DEVICE_FLAGS_RESET;
+
+	info.num_regions = cdx_dev->res_count;
+	info.num_irqs = 0;
+
+	return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
+}
+
+static int vfio_cdx_ioctl_get_region_info(struct vfio_cdx_device *vdev,
+					  struct vfio_region_info __user *arg)
+{
+	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
+	struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev);
+	struct vfio_region_info info;
+
+	if (copy_from_user(&info, arg, minsz))
+		return -EFAULT;
+
+	if (info.argsz < minsz)
+		return -EINVAL;
+
+	if (info.index >= cdx_dev->res_count)
+		return -EINVAL;
+
+	/* map offset to the physical address */
+	info.offset = vfio_cdx_index_to_offset(info.index);
+	info.size = vdev->regions[info.index].size;
+	info.flags = vdev->regions[info.index].flags;
+
+	return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
+}
+
+static long vfio_cdx_ioctl(struct vfio_device *core_vdev,
+			   unsigned int cmd, unsigned long arg)
+{
+	struct vfio_cdx_device *vdev =
+		container_of(core_vdev, struct vfio_cdx_device, vdev);
+	void __user *uarg = (void __user *)arg;
+
+	switch (cmd) {
+	case VFIO_DEVICE_GET_INFO:
+		return vfio_cdx_ioctl_get_info(vdev, uarg);
+	case VFIO_DEVICE_GET_REGION_INFO:
+		return vfio_cdx_ioctl_get_region_info(vdev, uarg);
+	case VFIO_DEVICE_RESET:
+		return cdx_dev_reset(core_vdev->dev);
+	default:
+		return -ENOTTY;
+	}
+}
+
+static int vfio_cdx_mmap_mmio(struct vfio_cdx_region region,
+			      struct vm_area_struct *vma)
+{
+	u64 size = vma->vm_end - vma->vm_start;
+	u64 pgoff, base;
+
+	pgoff = vma->vm_pgoff &
+		((1U << (VFIO_CDX_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+	base = pgoff << PAGE_SHIFT;
+
+	if (base + size > region.size)
+		return -EINVAL;
+
+	vma->vm_pgoff = (region.addr >> PAGE_SHIFT) + pgoff;
+	vma->vm_page_prot = pgprot_device(vma->vm_page_prot);
+
+	return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+				  size, vma->vm_page_prot);
+}
+
+static int vfio_cdx_mmap(struct vfio_device *core_vdev,
+			 struct vm_area_struct *vma)
+{
+	struct vfio_cdx_device *vdev =
+		container_of(core_vdev, struct vfio_cdx_device, vdev);
+	struct cdx_device *cdx_dev = to_cdx_device(core_vdev->dev);
+	unsigned int index;
+
+	index = vma->vm_pgoff >> (VFIO_CDX_OFFSET_SHIFT - PAGE_SHIFT);
+
+	if (index >= cdx_dev->res_count)
+		return -EINVAL;
+
+	if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_MMAP))
+		return -EINVAL;
+
+	if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_READ) &&
+	    (vma->vm_flags & VM_READ))
+		return -EPERM;
+
+	if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_WRITE) &&
+	    (vma->vm_flags & VM_WRITE))
+		return -EPERM;
+
+	return vfio_cdx_mmap_mmio(vdev->regions[index], vma);
+}
+
+static const struct vfio_device_ops vfio_cdx_ops = {
+	.name		= "vfio-cdx",
+	.open_device	= vfio_cdx_open_device,
+	.close_device	= vfio_cdx_close_device,
+	.ioctl		= vfio_cdx_ioctl,
+	.mmap		= vfio_cdx_mmap,
+	.bind_iommufd	= vfio_iommufd_physical_bind,
+	.unbind_iommufd	= vfio_iommufd_physical_unbind,
+	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
+};
+
+static int vfio_cdx_probe(struct cdx_device *cdx_dev)
+{
+	struct vfio_cdx_device *vdev;
+	struct device *dev = &cdx_dev->dev;
+	int ret;
+
+	vdev = vfio_alloc_device(vfio_cdx_device, vdev, dev,
+				 &vfio_cdx_ops);
+	if (IS_ERR(vdev))
+		return PTR_ERR(vdev);
+
+	ret = vfio_register_group_dev(&vdev->vdev);
+	if (ret)
+		goto out_uninit;
+
+	dev_set_drvdata(dev, vdev);
+	return 0;
+
+out_uninit:
+	vfio_put_device(&vdev->vdev);
+	return ret;
+}
+
+static int vfio_cdx_remove(struct cdx_device *cdx_dev)
+{
+	struct device *dev = &cdx_dev->dev;
+	struct vfio_cdx_device *vdev = dev_get_drvdata(dev);
+
+	vfio_unregister_group_dev(&vdev->vdev);
+	vfio_put_device(&vdev->vdev);
+
+	return 0;
+}
+
+static const struct cdx_device_id vfio_cdx_table[] = {
+	{ CDX_DEVICE_DRIVER_OVERRIDE(CDX_ANY_ID, CDX_ANY_ID,
+				     CDX_ID_F_VFIO_DRIVER_OVERRIDE) }, /* match all by default */
+	{}
+};
+
+MODULE_DEVICE_TABLE(cdx, vfio_cdx_table);
+
+static struct cdx_driver vfio_cdx_driver = {
+	.probe		= vfio_cdx_probe,
+	.remove		= vfio_cdx_remove,
+	.match_id_table	= vfio_cdx_table,
+	.driver	= {
+		.name	= "vfio-cdx",
+	},
+	.driver_managed_dma = true,
+};
+
+module_driver(vfio_cdx_driver, cdx_driver_register, cdx_driver_unregister);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("VFIO for CDX devices - User Level meta-driver");
diff --git a/drivers/vfio/cdx/private.h b/drivers/vfio/cdx/private.h
new file mode 100644
index 0000000000..8bdc117ea8
--- /dev/null
+++ b/drivers/vfio/cdx/private.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
+ */
+
+#ifndef VFIO_CDX_PRIVATE_H
+#define VFIO_CDX_PRIVATE_H
+
+#define VFIO_CDX_OFFSET_SHIFT    40
+
+static inline u64 vfio_cdx_index_to_offset(u32 index)
+{
+	return ((u64)(index) << VFIO_CDX_OFFSET_SHIFT);
+}
+
+struct vfio_cdx_region {
+	u32			flags;
+	u32			type;
+	u64			addr;
+	resource_size_t		size;
+};
+
+struct vfio_cdx_device {
+	struct vfio_device	vdev;
+	struct vfio_cdx_region	*regions;
+};
+
+#endif /* VFIO_CDX_PRIVATE_H */
diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c
new file mode 100644
index 0000000000..d53d08f169
--- /dev/null
+++ b/drivers/vfio/container.c
@@ -0,0 +1,607 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *
+ * VFIO container (/dev/vfio/vfio)
+ */
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/capability.h>
+#include <linux/iommu.h>
+#include <linux/miscdevice.h>
+#include <linux/vfio.h>
+#include <uapi/linux/vfio.h>
+
+#include "vfio.h"
+
+struct vfio_container {
+	struct kref			kref;
+	struct list_head		group_list;
+	struct rw_semaphore		group_lock;
+	struct vfio_iommu_driver	*iommu_driver;
+	void				*iommu_data;
+	bool				noiommu;
+};
+
+static struct vfio {
+	struct list_head		iommu_drivers_list;
+	struct mutex			iommu_drivers_lock;
+} vfio;
+
+static void *vfio_noiommu_open(unsigned long arg)
+{
+	if (arg != VFIO_NOIOMMU_IOMMU)
+		return ERR_PTR(-EINVAL);
+	if (!capable(CAP_SYS_RAWIO))
+		return ERR_PTR(-EPERM);
+
+	return NULL;
+}
+
+static void vfio_noiommu_release(void *iommu_data)
+{
+}
+
+static long vfio_noiommu_ioctl(void *iommu_data,
+			       unsigned int cmd, unsigned long arg)
+{
+	if (cmd == VFIO_CHECK_EXTENSION)
+		return vfio_noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
+
+	return -ENOTTY;
+}
+
+static int vfio_noiommu_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group, enum vfio_group_type type)
+{
+	return 0;
+}
+
+static void vfio_noiommu_detach_group(void *iommu_data,
+				      struct iommu_group *iommu_group)
+{
+}
+
+static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
+	.name = "vfio-noiommu",
+	.owner = THIS_MODULE,
+	.open = vfio_noiommu_open,
+	.release = vfio_noiommu_release,
+	.ioctl = vfio_noiommu_ioctl,
+	.attach_group = vfio_noiommu_attach_group,
+	.detach_group = vfio_noiommu_detach_group,
+};
+
+/*
+ * Only noiommu containers can use vfio-noiommu and noiommu containers can only
+ * use vfio-noiommu.
+ */
+static bool vfio_iommu_driver_allowed(struct vfio_container *container,
+				      const struct vfio_iommu_driver *driver)
+{
+	if (!IS_ENABLED(CONFIG_VFIO_NOIOMMU))
+		return true;
+	return container->noiommu == (driver->ops == &vfio_noiommu_ops);
+}
+
+/*
+ * IOMMU driver registration
+ */
+int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
+{
+	struct vfio_iommu_driver *driver, *tmp;
+
+	if (WARN_ON(!ops->register_device != !ops->unregister_device))
+		return -EINVAL;
+
+	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
+	if (!driver)
+		return -ENOMEM;
+
+	driver->ops = ops;
+
+	mutex_lock(&vfio.iommu_drivers_lock);
+
+	/* Check for duplicates */
+	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
+		if (tmp->ops == ops) {
+			mutex_unlock(&vfio.iommu_drivers_lock);
+			kfree(driver);
+			return -EINVAL;
+		}
+	}
+
+	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
+
+	mutex_unlock(&vfio.iommu_drivers_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
+
+void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
+{
+	struct vfio_iommu_driver *driver;
+
+	mutex_lock(&vfio.iommu_drivers_lock);
+	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
+		if (driver->ops == ops) {
+			list_del(&driver->vfio_next);
+			mutex_unlock(&vfio.iommu_drivers_lock);
+			kfree(driver);
+			return;
+		}
+	}
+	mutex_unlock(&vfio.iommu_drivers_lock);
+}
+EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
+
+/*
+ * Container objects - containers are created when /dev/vfio/vfio is
+ * opened, but their lifecycle extends until the last user is done, so
+ * it's freed via kref.  Must support container/group/device being
+ * closed in any order.
+ */
+static void vfio_container_release(struct kref *kref)
+{
+	struct vfio_container *container;
+	container = container_of(kref, struct vfio_container, kref);
+
+	kfree(container);
+}
+
+static void vfio_container_get(struct vfio_container *container)
+{
+	kref_get(&container->kref);
+}
+
+static void vfio_container_put(struct vfio_container *container)
+{
+	kref_put(&container->kref, vfio_container_release);
+}
+
+void vfio_device_container_register(struct vfio_device *device)
+{
+	struct vfio_iommu_driver *iommu_driver =
+		device->group->container->iommu_driver;
+
+	if (iommu_driver && iommu_driver->ops->register_device)
+		iommu_driver->ops->register_device(
+			device->group->container->iommu_data, device);
+}
+
+void vfio_device_container_unregister(struct vfio_device *device)
+{
+	struct vfio_iommu_driver *iommu_driver =
+		device->group->container->iommu_driver;
+
+	if (iommu_driver && iommu_driver->ops->unregister_device)
+		iommu_driver->ops->unregister_device(
+			device->group->container->iommu_data, device);
+}
+
+static long
+vfio_container_ioctl_check_extension(struct vfio_container *container,
+				     unsigned long arg)
+{
+	struct vfio_iommu_driver *driver;
+	long ret = 0;
+
+	down_read(&container->group_lock);
+
+	driver = container->iommu_driver;
+
+	switch (arg) {
+		/* No base extensions yet */
+	default:
+		/*
+		 * If no driver is set, poll all registered drivers for
+		 * extensions and return the first positive result.  If
+		 * a driver is already set, further queries will be passed
+		 * only to that driver.
+		 */
+		if (!driver) {
+			mutex_lock(&vfio.iommu_drivers_lock);
+			list_for_each_entry(driver, &vfio.iommu_drivers_list,
+					    vfio_next) {
+
+				if (!list_empty(&container->group_list) &&
+				    !vfio_iommu_driver_allowed(container,
+							       driver))
+					continue;
+				if (!try_module_get(driver->ops->owner))
+					continue;
+
+				ret = driver->ops->ioctl(NULL,
+							 VFIO_CHECK_EXTENSION,
+							 arg);
+				module_put(driver->ops->owner);
+				if (ret > 0)
+					break;
+			}
+			mutex_unlock(&vfio.iommu_drivers_lock);
+		} else
+			ret = driver->ops->ioctl(container->iommu_data,
+						 VFIO_CHECK_EXTENSION, arg);
+	}
+
+	up_read(&container->group_lock);
+
+	return ret;
+}
+
+/* hold write lock on container->group_lock */
+static int __vfio_container_attach_groups(struct vfio_container *container,
+					  struct vfio_iommu_driver *driver,
+					  void *data)
+{
+	struct vfio_group *group;
+	int ret = -ENODEV;
+
+	list_for_each_entry(group, &container->group_list, container_next) {
+		ret = driver->ops->attach_group(data, group->iommu_group,
+						group->type);
+		if (ret)
+			goto unwind;
+	}
+
+	return ret;
+
+unwind:
+	list_for_each_entry_continue_reverse(group, &container->group_list,
+					     container_next) {
+		driver->ops->detach_group(data, group->iommu_group);
+	}
+
+	return ret;
+}
+
+static long vfio_ioctl_set_iommu(struct vfio_container *container,
+				 unsigned long arg)
+{
+	struct vfio_iommu_driver *driver;
+	long ret = -ENODEV;
+
+	down_write(&container->group_lock);
+
+	/*
+	 * The container is designed to be an unprivileged interface while
+	 * the group can be assigned to specific users.  Therefore, only by
+	 * adding a group to a container does the user get the privilege of
+	 * enabling the iommu, which may allocate finite resources.  There
+	 * is no unset_iommu, but by removing all the groups from a container,
+	 * the container is deprivileged and returns to an unset state.
+	 */
+	if (list_empty(&container->group_list) || container->iommu_driver) {
+		up_write(&container->group_lock);
+		return -EINVAL;
+	}
+
+	mutex_lock(&vfio.iommu_drivers_lock);
+	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
+		void *data;
+
+		if (!vfio_iommu_driver_allowed(container, driver))
+			continue;
+		if (!try_module_get(driver->ops->owner))
+			continue;
+
+		/*
+		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
+		 * so test which iommu driver reported support for this
+		 * extension and call open on them.  We also pass them the
+		 * magic, allowing a single driver to support multiple
+		 * interfaces if they'd like.
+		 */
+		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
+			module_put(driver->ops->owner);
+			continue;
+		}
+
+		data = driver->ops->open(arg);
+		if (IS_ERR(data)) {
+			ret = PTR_ERR(data);
+			module_put(driver->ops->owner);
+			continue;
+		}
+
+		ret = __vfio_container_attach_groups(container, driver, data);
+		if (ret) {
+			driver->ops->release(data);
+			module_put(driver->ops->owner);
+			continue;
+		}
+
+		container->iommu_driver = driver;
+		container->iommu_data = data;
+		break;
+	}
+
+	mutex_unlock(&vfio.iommu_drivers_lock);
+	up_write(&container->group_lock);
+
+	return ret;
+}
+
+static long vfio_fops_unl_ioctl(struct file *filep,
+				unsigned int cmd, unsigned long arg)
+{
+	struct vfio_container *container = filep->private_data;
+	struct vfio_iommu_driver *driver;
+	void *data;
+	long ret = -EINVAL;
+
+	if (!container)
+		return ret;
+
+	switch (cmd) {
+	case VFIO_GET_API_VERSION:
+		ret = VFIO_API_VERSION;
+		break;
+	case VFIO_CHECK_EXTENSION:
+		ret = vfio_container_ioctl_check_extension(container, arg);
+		break;
+	case VFIO_SET_IOMMU:
+		ret = vfio_ioctl_set_iommu(container, arg);
+		break;
+	default:
+		driver = container->iommu_driver;
+		data = container->iommu_data;
+
+		if (driver) /* passthrough all unrecognized ioctls */
+			ret = driver->ops->ioctl(data, cmd, arg);
+	}
+
+	return ret;
+}
+
+static int vfio_fops_open(struct inode *inode, struct file *filep)
+{
+	struct vfio_container *container;
+
+	container = kzalloc(sizeof(*container), GFP_KERNEL_ACCOUNT);
+	if (!container)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&container->group_list);
+	init_rwsem(&container->group_lock);
+	kref_init(&container->kref);
+
+	filep->private_data = container;
+
+	return 0;
+}
+
+static int vfio_fops_release(struct inode *inode, struct file *filep)
+{
+	struct vfio_container *container = filep->private_data;
+
+	filep->private_data = NULL;
+
+	vfio_container_put(container);
+
+	return 0;
+}
+
+static const struct file_operations vfio_fops = {
+	.owner		= THIS_MODULE,
+	.open		= vfio_fops_open,
+	.release	= vfio_fops_release,
+	.unlocked_ioctl	= vfio_fops_unl_ioctl,
+	.compat_ioctl	= compat_ptr_ioctl,
+};
+
+struct vfio_container *vfio_container_from_file(struct file *file)
+{
+	struct vfio_container *container;
+
+	/* Sanity check, is this really our fd? */
+	if (file->f_op != &vfio_fops)
+		return NULL;
+
+	container = file->private_data;
+	WARN_ON(!container); /* fget ensures we don't race vfio_release */
+	return container;
+}
+
+static struct miscdevice vfio_dev = {
+	.minor = VFIO_MINOR,
+	.name = "vfio",
+	.fops = &vfio_fops,
+	.nodename = "vfio/vfio",
+	.mode = S_IRUGO | S_IWUGO,
+};
+
+int vfio_container_attach_group(struct vfio_container *container,
+				struct vfio_group *group)
+{
+	struct vfio_iommu_driver *driver;
+	int ret = 0;
+
+	lockdep_assert_held(&group->group_lock);
+
+	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
+		return -EPERM;
+
+	down_write(&container->group_lock);
+
+	/* Real groups and fake groups cannot mix */
+	if (!list_empty(&container->group_list) &&
+	    container->noiommu != (group->type == VFIO_NO_IOMMU)) {
+		ret = -EPERM;
+		goto out_unlock_container;
+	}
+
+	if (group->type == VFIO_IOMMU) {
+		ret = iommu_group_claim_dma_owner(group->iommu_group, group);
+		if (ret)
+			goto out_unlock_container;
+	}
+
+	driver = container->iommu_driver;
+	if (driver) {
+		ret = driver->ops->attach_group(container->iommu_data,
+						group->iommu_group,
+						group->type);
+		if (ret) {
+			if (group->type == VFIO_IOMMU)
+				iommu_group_release_dma_owner(
+					group->iommu_group);
+			goto out_unlock_container;
+		}
+	}
+
+	group->container = container;
+	group->container_users = 1;
+	container->noiommu = (group->type == VFIO_NO_IOMMU);
+	list_add(&group->container_next, &container->group_list);
+
+	/* Get a reference on the container and mark a user within the group */
+	vfio_container_get(container);
+
+out_unlock_container:
+	up_write(&container->group_lock);
+	return ret;
+}
+
+void vfio_group_detach_container(struct vfio_group *group)
+{
+	struct vfio_container *container = group->container;
+	struct vfio_iommu_driver *driver;
+
+	lockdep_assert_held(&group->group_lock);
+	WARN_ON(group->container_users != 1);
+
+	down_write(&container->group_lock);
+
+	driver = container->iommu_driver;
+	if (driver)
+		driver->ops->detach_group(container->iommu_data,
+					  group->iommu_group);
+
+	if (group->type == VFIO_IOMMU)
+		iommu_group_release_dma_owner(group->iommu_group);
+
+	group->container = NULL;
+	group->container_users = 0;
+	list_del(&group->container_next);
+
+	/* Detaching the last group deprivileges a container, remove iommu */
+	if (driver && list_empty(&container->group_list)) {
+		driver->ops->release(container->iommu_data);
+		module_put(driver->ops->owner);
+		container->iommu_driver = NULL;
+		container->iommu_data = NULL;
+	}
+
+	up_write(&container->group_lock);
+
+	vfio_container_put(container);
+}
+
+int vfio_group_use_container(struct vfio_group *group)
+{
+	lockdep_assert_held(&group->group_lock);
+
+	/*
+	 * The container fd has been assigned with VFIO_GROUP_SET_CONTAINER but
+	 * VFIO_SET_IOMMU hasn't been done yet.
+	 */
+	if (!group->container->iommu_driver)
+		return -EINVAL;
+
+	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
+		return -EPERM;
+
+	get_file(group->opened_file);
+	group->container_users++;
+	return 0;
+}
+
+void vfio_group_unuse_container(struct vfio_group *group)
+{
+	lockdep_assert_held(&group->group_lock);
+
+	WARN_ON(group->container_users <= 1);
+	group->container_users--;
+	fput(group->opened_file);
+}
+
+int vfio_device_container_pin_pages(struct vfio_device *device,
+				    dma_addr_t iova, int npage,
+				    int prot, struct page **pages)
+{
+	struct vfio_container *container = device->group->container;
+	struct iommu_group *iommu_group = device->group->iommu_group;
+	struct vfio_iommu_driver *driver = container->iommu_driver;
+
+	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
+		return -E2BIG;
+
+	if (unlikely(!driver || !driver->ops->pin_pages))
+		return -ENOTTY;
+	return driver->ops->pin_pages(container->iommu_data, iommu_group, iova,
+				      npage, prot, pages);
+}
+
+void vfio_device_container_unpin_pages(struct vfio_device *device,
+				       dma_addr_t iova, int npage)
+{
+	struct vfio_container *container = device->group->container;
+
+	if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES))
+		return;
+
+	container->iommu_driver->ops->unpin_pages(container->iommu_data, iova,
+						  npage);
+}
+
+int vfio_device_container_dma_rw(struct vfio_device *device,
+				 dma_addr_t iova, void *data,
+				 size_t len, bool write)
+{
+	struct vfio_container *container = device->group->container;
+	struct vfio_iommu_driver *driver = container->iommu_driver;
+
+	if (unlikely(!driver || !driver->ops->dma_rw))
+		return -ENOTTY;
+	return driver->ops->dma_rw(container->iommu_data, iova, data, len,
+				   write);
+}
+
+int __init vfio_container_init(void)
+{
+	int ret;
+
+	mutex_init(&vfio.iommu_drivers_lock);
+	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
+
+	ret = misc_register(&vfio_dev);
+	if (ret) {
+		pr_err("vfio: misc device register failed\n");
+		return ret;
+	}
+
+	if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) {
+		ret = vfio_register_iommu_driver(&vfio_noiommu_ops);
+		if (ret)
+			goto err_misc;
+	}
+	return 0;
+
+err_misc:
+	misc_deregister(&vfio_dev);
+	return ret;
+}
+
+void vfio_container_cleanup(void)
+{
+	if (IS_ENABLED(CONFIG_VFIO_NOIOMMU))
+		vfio_unregister_iommu_driver(&vfio_noiommu_ops);
+	misc_deregister(&vfio_dev);
+	mutex_destroy(&vfio.iommu_drivers_lock);
+}
+
+MODULE_ALIAS_MISCDEV(VFIO_MINOR);
+MODULE_ALIAS("devname:vfio/vfio");
diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
new file mode 100644
index 0000000000..e75da0a70d
--- /dev/null
+++ b/drivers/vfio/device_cdev.c
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2023 Intel Corporation.
+ */
+#include <linux/vfio.h>
+#include <linux/iommufd.h>
+
+#include "vfio.h"
+
+static dev_t device_devt;
+
+void vfio_init_device_cdev(struct vfio_device *device)
+{
+	device->device.devt = MKDEV(MAJOR(device_devt), device->index);
+	cdev_init(&device->cdev, &vfio_device_fops);
+	device->cdev.owner = THIS_MODULE;
+}
+
+/*
+ * device access via the fd opened by this function is blocked until
+ * .open_device() is called successfully during BIND_IOMMUFD.
+ */
+int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
+{
+	struct vfio_device *device = container_of(inode->i_cdev,
+						  struct vfio_device, cdev);
+	struct vfio_device_file *df;
+	int ret;
+
+	/* Paired with the put in vfio_device_fops_release() */
+	if (!vfio_device_try_get_registration(device))
+		return -ENODEV;
+
+	df = vfio_allocate_device_file(device);
+	if (IS_ERR(df)) {
+		ret = PTR_ERR(df);
+		goto err_put_registration;
+	}
+
+	filep->private_data = df;
+
+	return 0;
+
+err_put_registration:
+	vfio_device_put_registration(device);
+	return ret;
+}
+
+static void vfio_df_get_kvm_safe(struct vfio_device_file *df)
+{
+	spin_lock(&df->kvm_ref_lock);
+	vfio_device_get_kvm_safe(df->device, df->kvm);
+	spin_unlock(&df->kvm_ref_lock);
+}
+
+long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df,
+				struct vfio_device_bind_iommufd __user *arg)
+{
+	struct vfio_device *device = df->device;
+	struct vfio_device_bind_iommufd bind;
+	unsigned long minsz;
+	int ret;
+
+	static_assert(__same_type(arg->out_devid, df->devid));
+
+	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
+
+	if (copy_from_user(&bind, arg, minsz))
+		return -EFAULT;
+
+	if (bind.argsz < minsz || bind.flags || bind.iommufd < 0)
+		return -EINVAL;
+
+	/* BIND_IOMMUFD only allowed for cdev fds */
+	if (df->group)
+		return -EINVAL;
+
+	ret = vfio_device_block_group(device);
+	if (ret)
+		return ret;
+
+	mutex_lock(&device->dev_set->lock);
+	/* one device cannot be bound twice */
+	if (df->access_granted) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	df->iommufd = iommufd_ctx_from_fd(bind.iommufd);
+	if (IS_ERR(df->iommufd)) {
+		ret = PTR_ERR(df->iommufd);
+		df->iommufd = NULL;
+		goto out_unlock;
+	}
+
+	/*
+	 * Before the device open, get the KVM pointer currently
+	 * associated with the device file (if there is) and obtain
+	 * a reference.  This reference is held until device closed.
+	 * Save the pointer in the device for use by drivers.
+	 */
+	vfio_df_get_kvm_safe(df);
+
+	ret = vfio_df_open(df);
+	if (ret)
+		goto out_put_kvm;
+
+	ret = copy_to_user(&arg->out_devid, &df->devid,
+			   sizeof(df->devid)) ? -EFAULT : 0;
+	if (ret)
+		goto out_close_device;
+
+	device->cdev_opened = true;
+	/*
+	 * Paired with smp_load_acquire() in vfio_device_fops::ioctl/
+	 * read/write/mmap
+	 */
+	smp_store_release(&df->access_granted, true);
+	mutex_unlock(&device->dev_set->lock);
+	return 0;
+
+out_close_device:
+	vfio_df_close(df);
+out_put_kvm:
+	vfio_device_put_kvm(device);
+	iommufd_ctx_put(df->iommufd);
+	df->iommufd = NULL;
+out_unlock:
+	mutex_unlock(&device->dev_set->lock);
+	vfio_device_unblock_group(device);
+	return ret;
+}
+
+void vfio_df_unbind_iommufd(struct vfio_device_file *df)
+{
+	struct vfio_device *device = df->device;
+
+	/*
+	 * In the time of close, there is no contention with another one
+	 * changing this flag.  So read df->access_granted without lock
+	 * and no smp_load_acquire() is ok.
+	 */
+	if (!df->access_granted)
+		return;
+
+	mutex_lock(&device->dev_set->lock);
+	vfio_df_close(df);
+	vfio_device_put_kvm(device);
+	iommufd_ctx_put(df->iommufd);
+	device->cdev_opened = false;
+	mutex_unlock(&device->dev_set->lock);
+	vfio_device_unblock_group(device);
+}
+
+int vfio_df_ioctl_attach_pt(struct vfio_device_file *df,
+			    struct vfio_device_attach_iommufd_pt __user *arg)
+{
+	struct vfio_device *device = df->device;
+	struct vfio_device_attach_iommufd_pt attach;
+	unsigned long minsz;
+	int ret;
+
+	minsz = offsetofend(struct vfio_device_attach_iommufd_pt, pt_id);
+
+	if (copy_from_user(&attach, arg, minsz))
+		return -EFAULT;
+
+	if (attach.argsz < minsz || attach.flags)
+		return -EINVAL;
+
+	mutex_lock(&device->dev_set->lock);
+	ret = device->ops->attach_ioas(device, &attach.pt_id);
+	if (ret)
+		goto out_unlock;
+
+	if (copy_to_user(&arg->pt_id, &attach.pt_id, sizeof(attach.pt_id))) {
+		ret = -EFAULT;
+		goto out_detach;
+	}
+	mutex_unlock(&device->dev_set->lock);
+
+	return 0;
+
+out_detach:
+	device->ops->detach_ioas(device);
+out_unlock:
+	mutex_unlock(&device->dev_set->lock);
+	return ret;
+}
+
+int vfio_df_ioctl_detach_pt(struct vfio_device_file *df,
+			    struct vfio_device_detach_iommufd_pt __user *arg)
+{
+	struct vfio_device *device = df->device;
+	struct vfio_device_detach_iommufd_pt detach;
+	unsigned long minsz;
+
+	minsz = offsetofend(struct vfio_device_detach_iommufd_pt, flags);
+
+	if (copy_from_user(&detach, arg, minsz))
+		return -EFAULT;
+
+	if (detach.argsz < minsz || detach.flags)
+		return -EINVAL;
+
+	mutex_lock(&device->dev_set->lock);
+	device->ops->detach_ioas(device);
+	mutex_unlock(&device->dev_set->lock);
+
+	return 0;
+}
+
+static char *vfio_device_devnode(const struct device *dev, umode_t *mode)
+{
+	return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
+}
+
+int vfio_cdev_init(struct class *device_class)
+{
+	device_class->devnode = vfio_device_devnode;
+	return alloc_chrdev_region(&device_devt, 0,
+				   MINORMASK + 1, "vfio-dev");
+}
+
+void vfio_cdev_cleanup(void)
+{
+	unregister_chrdev_region(device_devt, MINORMASK + 1);
+}
diff --git a/drivers/vfio/fsl-mc/Kconfig b/drivers/vfio/fsl-mc/Kconfig
new file mode 100644
index 0000000000..7d1d690348
--- /dev/null
+++ b/drivers/vfio/fsl-mc/Kconfig
@@ -0,0 +1,14 @@
+menu "VFIO support for FSL_MC bus devices"
+	depends on FSL_MC_BUS
+
+config VFIO_FSL_MC
+	tristate "VFIO support for QorIQ DPAA2 fsl-mc bus devices"
+	select EVENTFD
+	help
+	  Driver to enable support for the VFIO QorIQ DPAA2 fsl-mc
+	  (Management Complex) devices. This is required to passthrough
+	  fsl-mc bus devices using the VFIO framework.
+
+	  If you don't know what to do here, say N.
+
+endmenu
diff --git a/drivers/vfio/fsl-mc/Makefile b/drivers/vfio/fsl-mc/Makefile
new file mode 100644
index 0000000000..cad6dbf0b7
--- /dev/null
+++ b/drivers/vfio/fsl-mc/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause)
+
+vfio-fsl-mc-y := vfio_fsl_mc.o vfio_fsl_mc_intr.o
+obj-$(CONFIG_VFIO_FSL_MC) += vfio-fsl-mc.o
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c
new file mode 100644
index 0000000000..f65d91c01f
--- /dev/null
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c
@@ -0,0 +1,611 @@
+// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause)
+/*
+ * Copyright 2013-2016 Freescale Semiconductor Inc.
+ * Copyright 2016-2017,2019-2020 NXP
+ */
+
+#include <linux/device.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/vfio.h>
+#include <linux/fsl/mc.h>
+#include <linux/delay.h>
+#include <linux/io-64-nonatomic-hi-lo.h>
+
+#include "vfio_fsl_mc_private.h"
+
+static struct fsl_mc_driver vfio_fsl_mc_driver;
+
+static int vfio_fsl_mc_open_device(struct vfio_device *core_vdev)
+{
+	struct vfio_fsl_mc_device *vdev =
+		container_of(core_vdev, struct vfio_fsl_mc_device, vdev);
+	struct fsl_mc_device *mc_dev = vdev->mc_dev;
+	int count = mc_dev->obj_desc.region_count;
+	int i;
+
+	vdev->regions = kcalloc(count, sizeof(struct vfio_fsl_mc_region),
+				GFP_KERNEL_ACCOUNT);
+	if (!vdev->regions)
+		return -ENOMEM;
+
+	for (i = 0; i < count; i++) {
+		struct resource *res = &mc_dev->regions[i];
+		int no_mmap = is_fsl_mc_bus_dprc(mc_dev);
+
+		vdev->regions[i].addr = res->start;
+		vdev->regions[i].size = resource_size(res);
+		vdev->regions[i].type = mc_dev->regions[i].flags & IORESOURCE_BITS;
+		/*
+		 * Only regions addressed with PAGE granularity may be
+		 * MMAPed securely.
+		 */
+		if (!no_mmap && !(vdev->regions[i].addr & ~PAGE_MASK) &&
+				!(vdev->regions[i].size & ~PAGE_MASK))
+			vdev->regions[i].flags |=
+					VFIO_REGION_INFO_FLAG_MMAP;
+		vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_READ;
+		if (!(mc_dev->regions[i].flags & IORESOURCE_READONLY))
+			vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_WRITE;
+	}
+
+	return 0;
+}
+
+static void vfio_fsl_mc_regions_cleanup(struct vfio_fsl_mc_device *vdev)
+{
+	struct fsl_mc_device *mc_dev = vdev->mc_dev;
+	int i;
+
+	for (i = 0; i < mc_dev->obj_desc.region_count; i++)
+		iounmap(vdev->regions[i].ioaddr);
+	kfree(vdev->regions);
+}
+
+static int vfio_fsl_mc_reset_device(struct vfio_fsl_mc_device *vdev)
+{
+	struct fsl_mc_device *mc_dev = vdev->mc_dev;
+	int ret = 0;
+
+	if (is_fsl_mc_bus_dprc(vdev->mc_dev)) {
+		return dprc_reset_container(mc_dev->mc_io, 0,
+					mc_dev->mc_handle,
+					mc_dev->obj_desc.id,
+					DPRC_RESET_OPTION_NON_RECURSIVE);
+	} else {
+		u16 token;
+
+		ret = fsl_mc_obj_open(mc_dev->mc_io, 0, mc_dev->obj_desc.id,
+				      mc_dev->obj_desc.type,
+				      &token);
+		if (ret)
+			goto out;
+		ret = fsl_mc_obj_reset(mc_dev->mc_io, 0, token);
+		if (ret) {
+			fsl_mc_obj_close(mc_dev->mc_io, 0, token);
+			goto out;
+		}
+		ret = fsl_mc_obj_close(mc_dev->mc_io, 0, token);
+	}
+out:
+	return ret;
+}
+
+static void vfio_fsl_mc_close_device(struct vfio_device *core_vdev)
+{
+	struct vfio_fsl_mc_device *vdev =
+		container_of(core_vdev, struct vfio_fsl_mc_device, vdev);
+	struct fsl_mc_device *mc_dev = vdev->mc_dev;
+	struct device *cont_dev = fsl_mc_cont_dev(&mc_dev->dev);
+	struct fsl_mc_device *mc_cont = to_fsl_mc_device(cont_dev);
+	int ret;
+
+	vfio_fsl_mc_regions_cleanup(vdev);
+
+	/* reset the device before cleaning up the interrupts */
+	ret = vfio_fsl_mc_reset_device(vdev);
+
+	if (ret)
+		dev_warn(&mc_cont->dev,
+			 "VFIO_FSL_MC: reset device has failed (%d)\n", ret);
+
+	vfio_fsl_mc_irqs_cleanup(vdev);
+
+	fsl_mc_cleanup_irq_pool(mc_cont);
+}
+
+static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev,
+			      unsigned int cmd, unsigned long arg)
+{
+	unsigned long minsz;
+	struct vfio_fsl_mc_device *vdev =
+		container_of(core_vdev, struct vfio_fsl_mc_device, vdev);
+	struct fsl_mc_device *mc_dev = vdev->mc_dev;
+
+	switch (cmd) {
+	case VFIO_DEVICE_GET_INFO:
+	{
+		struct vfio_device_info info;
+
+		minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		info.flags = VFIO_DEVICE_FLAGS_FSL_MC;
+
+		if (is_fsl_mc_bus_dprc(mc_dev))
+			info.flags |= VFIO_DEVICE_FLAGS_RESET;
+
+		info.num_regions = mc_dev->obj_desc.region_count;
+		info.num_irqs = mc_dev->obj_desc.irq_count;
+
+		return copy_to_user((void __user *)arg, &info, minsz) ?
+			-EFAULT : 0;
+	}
+	case VFIO_DEVICE_GET_REGION_INFO:
+	{
+		struct vfio_region_info info;
+
+		minsz = offsetofend(struct vfio_region_info, offset);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		if (info.index >= mc_dev->obj_desc.region_count)
+			return -EINVAL;
+
+		/* map offset to the physical address  */
+		info.offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info.index);
+		info.size = vdev->regions[info.index].size;
+		info.flags = vdev->regions[info.index].flags;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+		return 0;
+	}
+	case VFIO_DEVICE_GET_IRQ_INFO:
+	{
+		struct vfio_irq_info info;
+
+		minsz = offsetofend(struct vfio_irq_info, count);
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		if (info.index >= mc_dev->obj_desc.irq_count)
+			return -EINVAL;
+
+		info.flags = VFIO_IRQ_INFO_EVENTFD;
+		info.count = 1;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+		return 0;
+	}
+	case VFIO_DEVICE_SET_IRQS:
+	{
+		struct vfio_irq_set hdr;
+		u8 *data = NULL;
+		int ret = 0;
+		size_t data_size = 0;
+
+		minsz = offsetofend(struct vfio_irq_set, count);
+
+		if (copy_from_user(&hdr, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		ret = vfio_set_irqs_validate_and_prepare(&hdr, mc_dev->obj_desc.irq_count,
+					mc_dev->obj_desc.irq_count, &data_size);
+		if (ret)
+			return ret;
+
+		if (data_size) {
+			data = memdup_user((void __user *)(arg + minsz),
+				   data_size);
+			if (IS_ERR(data))
+				return PTR_ERR(data);
+		}
+
+		mutex_lock(&vdev->igate);
+		ret = vfio_fsl_mc_set_irqs_ioctl(vdev, hdr.flags,
+						 hdr.index, hdr.start,
+						 hdr.count, data);
+		mutex_unlock(&vdev->igate);
+		kfree(data);
+
+		return ret;
+	}
+	case VFIO_DEVICE_RESET:
+	{
+		return vfio_fsl_mc_reset_device(vdev);
+
+	}
+	default:
+		return -ENOTTY;
+	}
+}
+
+static ssize_t vfio_fsl_mc_read(struct vfio_device *core_vdev, char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	struct vfio_fsl_mc_device *vdev =
+		container_of(core_vdev, struct vfio_fsl_mc_device, vdev);
+	unsigned int index = VFIO_FSL_MC_OFFSET_TO_INDEX(*ppos);
+	loff_t off = *ppos & VFIO_FSL_MC_OFFSET_MASK;
+	struct fsl_mc_device *mc_dev = vdev->mc_dev;
+	struct vfio_fsl_mc_region *region;
+	u64 data[8];
+	int i;
+
+	if (index >= mc_dev->obj_desc.region_count)
+		return -EINVAL;
+
+	region = &vdev->regions[index];
+
+	if (!(region->flags & VFIO_REGION_INFO_FLAG_READ))
+		return -EINVAL;
+
+	if (!region->ioaddr) {
+		region->ioaddr = ioremap(region->addr, region->size);
+		if (!region->ioaddr)
+			return -ENOMEM;
+	}
+
+	if (count != 64 || off != 0)
+		return -EINVAL;
+
+	for (i = 7; i >= 0; i--)
+		data[i] = readq(region->ioaddr + i * sizeof(uint64_t));
+
+	if (copy_to_user(buf, data, 64))
+		return -EFAULT;
+
+	return count;
+}
+
+#define MC_CMD_COMPLETION_TIMEOUT_MS    5000
+#define MC_CMD_COMPLETION_POLLING_MAX_SLEEP_USECS    500
+
+static int vfio_fsl_mc_send_command(void __iomem *ioaddr, uint64_t *cmd_data)
+{
+	int i;
+	enum mc_cmd_status status;
+	unsigned long timeout_usecs = MC_CMD_COMPLETION_TIMEOUT_MS * 1000;
+
+	/* Write at command parameter into portal */
+	for (i = 7; i >= 1; i--)
+		writeq_relaxed(cmd_data[i], ioaddr + i * sizeof(uint64_t));
+
+	/* Write command header in the end */
+	writeq(cmd_data[0], ioaddr);
+
+	/* Wait for response before returning to user-space
+	 * This can be optimized in future to even prepare response
+	 * before returning to user-space and avoid read ioctl.
+	 */
+	for (;;) {
+		u64 header;
+		struct mc_cmd_header *resp_hdr;
+
+		header = cpu_to_le64(readq_relaxed(ioaddr));
+
+		resp_hdr = (struct mc_cmd_header *)&header;
+		status = (enum mc_cmd_status)resp_hdr->status;
+		if (status != MC_CMD_STATUS_READY)
+			break;
+
+		udelay(MC_CMD_COMPLETION_POLLING_MAX_SLEEP_USECS);
+		timeout_usecs -= MC_CMD_COMPLETION_POLLING_MAX_SLEEP_USECS;
+		if (timeout_usecs == 0)
+			return -ETIMEDOUT;
+	}
+
+	return 0;
+}
+
+static ssize_t vfio_fsl_mc_write(struct vfio_device *core_vdev,
+				 const char __user *buf, size_t count,
+				 loff_t *ppos)
+{
+	struct vfio_fsl_mc_device *vdev =
+		container_of(core_vdev, struct vfio_fsl_mc_device, vdev);
+	unsigned int index = VFIO_FSL_MC_OFFSET_TO_INDEX(*ppos);
+	loff_t off = *ppos & VFIO_FSL_MC_OFFSET_MASK;
+	struct fsl_mc_device *mc_dev = vdev->mc_dev;
+	struct vfio_fsl_mc_region *region;
+	u64 data[8];
+	int ret;
+
+	if (index >= mc_dev->obj_desc.region_count)
+		return -EINVAL;
+
+	region = &vdev->regions[index];
+
+	if (!(region->flags & VFIO_REGION_INFO_FLAG_WRITE))
+		return -EINVAL;
+
+	if (!region->ioaddr) {
+		region->ioaddr = ioremap(region->addr, region->size);
+		if (!region->ioaddr)
+			return -ENOMEM;
+	}
+
+	if (count != 64 || off != 0)
+		return -EINVAL;
+
+	if (copy_from_user(&data, buf, 64))
+		return -EFAULT;
+
+	ret = vfio_fsl_mc_send_command(region->ioaddr, data);
+	if (ret)
+		return ret;
+
+	return count;
+
+}
+
+static int vfio_fsl_mc_mmap_mmio(struct vfio_fsl_mc_region region,
+				 struct vm_area_struct *vma)
+{
+	u64 size = vma->vm_end - vma->vm_start;
+	u64 pgoff, base;
+	u8 region_cacheable;
+
+	pgoff = vma->vm_pgoff &
+		((1U << (VFIO_FSL_MC_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+	base = pgoff << PAGE_SHIFT;
+
+	if (region.size < PAGE_SIZE || base + size > region.size)
+		return -EINVAL;
+
+	region_cacheable = (region.type & FSL_MC_REGION_CACHEABLE) &&
+			   (region.type & FSL_MC_REGION_SHAREABLE);
+	if (!region_cacheable)
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	vma->vm_pgoff = (region.addr >> PAGE_SHIFT) + pgoff;
+
+	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+			       size, vma->vm_page_prot);
+}
+
+static int vfio_fsl_mc_mmap(struct vfio_device *core_vdev,
+			    struct vm_area_struct *vma)
+{
+	struct vfio_fsl_mc_device *vdev =
+		container_of(core_vdev, struct vfio_fsl_mc_device, vdev);
+	struct fsl_mc_device *mc_dev = vdev->mc_dev;
+	unsigned int index;
+
+	index = vma->vm_pgoff >> (VFIO_FSL_MC_OFFSET_SHIFT - PAGE_SHIFT);
+
+	if (vma->vm_end < vma->vm_start)
+		return -EINVAL;
+	if (vma->vm_start & ~PAGE_MASK)
+		return -EINVAL;
+	if (vma->vm_end & ~PAGE_MASK)
+		return -EINVAL;
+	if (!(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+	if (index >= mc_dev->obj_desc.region_count)
+		return -EINVAL;
+
+	if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_MMAP))
+		return -EINVAL;
+
+	if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_READ)
+			&& (vma->vm_flags & VM_READ))
+		return -EINVAL;
+
+	if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_WRITE)
+			&& (vma->vm_flags & VM_WRITE))
+		return -EINVAL;
+
+	vma->vm_private_data = mc_dev;
+
+	return vfio_fsl_mc_mmap_mmio(vdev->regions[index], vma);
+}
+
+static const struct vfio_device_ops vfio_fsl_mc_ops;
+static int vfio_fsl_mc_bus_notifier(struct notifier_block *nb,
+				    unsigned long action, void *data)
+{
+	struct vfio_fsl_mc_device *vdev = container_of(nb,
+					struct vfio_fsl_mc_device, nb);
+	struct device *dev = data;
+	struct fsl_mc_device *mc_dev = to_fsl_mc_device(dev);
+	struct fsl_mc_device *mc_cont = to_fsl_mc_device(mc_dev->dev.parent);
+
+	if (action == BUS_NOTIFY_ADD_DEVICE &&
+	    vdev->mc_dev == mc_cont) {
+		mc_dev->driver_override = kasprintf(GFP_KERNEL, "%s",
+						    vfio_fsl_mc_ops.name);
+		if (!mc_dev->driver_override)
+			dev_warn(dev, "VFIO_FSL_MC: Setting driver override for device in dprc %s failed\n",
+				 dev_name(&mc_cont->dev));
+		else
+			dev_info(dev, "VFIO_FSL_MC: Setting driver override for device in dprc %s\n",
+				 dev_name(&mc_cont->dev));
+	} else if (action == BUS_NOTIFY_BOUND_DRIVER &&
+		vdev->mc_dev == mc_cont) {
+		struct fsl_mc_driver *mc_drv = to_fsl_mc_driver(dev->driver);
+
+		if (mc_drv && mc_drv != &vfio_fsl_mc_driver)
+			dev_warn(dev, "VFIO_FSL_MC: Object %s bound to driver %s while DPRC bound to vfio-fsl-mc\n",
+				 dev_name(dev), mc_drv->driver.name);
+	}
+
+	return 0;
+}
+
+static int vfio_fsl_mc_init_device(struct vfio_fsl_mc_device *vdev)
+{
+	struct fsl_mc_device *mc_dev = vdev->mc_dev;
+	int ret;
+
+	/* Non-dprc devices share mc_io from parent */
+	if (!is_fsl_mc_bus_dprc(mc_dev)) {
+		struct fsl_mc_device *mc_cont = to_fsl_mc_device(mc_dev->dev.parent);
+
+		mc_dev->mc_io = mc_cont->mc_io;
+		return 0;
+	}
+
+	vdev->nb.notifier_call = vfio_fsl_mc_bus_notifier;
+	ret = bus_register_notifier(&fsl_mc_bus_type, &vdev->nb);
+	if (ret)
+		return ret;
+
+	/* open DPRC, allocate a MC portal */
+	ret = dprc_setup(mc_dev);
+	if (ret) {
+		dev_err(&mc_dev->dev, "VFIO_FSL_MC: Failed to setup DPRC (%d)\n", ret);
+		goto out_nc_unreg;
+	}
+	return 0;
+
+out_nc_unreg:
+	bus_unregister_notifier(&fsl_mc_bus_type, &vdev->nb);
+	return ret;
+}
+
+static int vfio_fsl_mc_scan_container(struct fsl_mc_device *mc_dev)
+{
+	int ret;
+
+	/* non dprc devices do not scan for other devices */
+	if (!is_fsl_mc_bus_dprc(mc_dev))
+		return 0;
+	ret = dprc_scan_container(mc_dev, false);
+	if (ret) {
+		dev_err(&mc_dev->dev,
+			"VFIO_FSL_MC: Container scanning failed (%d)\n", ret);
+		dprc_remove_devices(mc_dev, NULL, 0);
+		return ret;
+	}
+	return 0;
+}
+
+static void vfio_fsl_uninit_device(struct vfio_fsl_mc_device *vdev)
+{
+	struct fsl_mc_device *mc_dev = vdev->mc_dev;
+
+	if (!is_fsl_mc_bus_dprc(mc_dev))
+		return;
+
+	dprc_cleanup(mc_dev);
+	bus_unregister_notifier(&fsl_mc_bus_type, &vdev->nb);
+}
+
+static int vfio_fsl_mc_init_dev(struct vfio_device *core_vdev)
+{
+	struct vfio_fsl_mc_device *vdev =
+		container_of(core_vdev, struct vfio_fsl_mc_device, vdev);
+	struct fsl_mc_device *mc_dev = to_fsl_mc_device(core_vdev->dev);
+	int ret;
+
+	vdev->mc_dev = mc_dev;
+	mutex_init(&vdev->igate);
+
+	if (is_fsl_mc_bus_dprc(mc_dev))
+		ret = vfio_assign_device_set(core_vdev, &mc_dev->dev);
+	else
+		ret = vfio_assign_device_set(core_vdev, mc_dev->dev.parent);
+
+	if (ret)
+		return ret;
+
+	/* device_set is released by vfio core if @init fails */
+	return vfio_fsl_mc_init_device(vdev);
+}
+
+static int vfio_fsl_mc_probe(struct fsl_mc_device *mc_dev)
+{
+	struct vfio_fsl_mc_device *vdev;
+	struct device *dev = &mc_dev->dev;
+	int ret;
+
+	vdev = vfio_alloc_device(vfio_fsl_mc_device, vdev, dev,
+				 &vfio_fsl_mc_ops);
+	if (IS_ERR(vdev))
+		return PTR_ERR(vdev);
+
+	ret = vfio_register_group_dev(&vdev->vdev);
+	if (ret) {
+		dev_err(dev, "VFIO_FSL_MC: Failed to add to vfio group\n");
+		goto out_put_vdev;
+	}
+
+	ret = vfio_fsl_mc_scan_container(mc_dev);
+	if (ret)
+		goto out_group_dev;
+	dev_set_drvdata(dev, vdev);
+	return 0;
+
+out_group_dev:
+	vfio_unregister_group_dev(&vdev->vdev);
+out_put_vdev:
+	vfio_put_device(&vdev->vdev);
+	return ret;
+}
+
+static void vfio_fsl_mc_release_dev(struct vfio_device *core_vdev)
+{
+	struct vfio_fsl_mc_device *vdev =
+		container_of(core_vdev, struct vfio_fsl_mc_device, vdev);
+
+	vfio_fsl_uninit_device(vdev);
+	mutex_destroy(&vdev->igate);
+}
+
+static void vfio_fsl_mc_remove(struct fsl_mc_device *mc_dev)
+{
+	struct device *dev = &mc_dev->dev;
+	struct vfio_fsl_mc_device *vdev = dev_get_drvdata(dev);
+
+	vfio_unregister_group_dev(&vdev->vdev);
+	dprc_remove_devices(mc_dev, NULL, 0);
+	vfio_put_device(&vdev->vdev);
+}
+
+static const struct vfio_device_ops vfio_fsl_mc_ops = {
+	.name		= "vfio-fsl-mc",
+	.init		= vfio_fsl_mc_init_dev,
+	.release	= vfio_fsl_mc_release_dev,
+	.open_device	= vfio_fsl_mc_open_device,
+	.close_device	= vfio_fsl_mc_close_device,
+	.ioctl		= vfio_fsl_mc_ioctl,
+	.read		= vfio_fsl_mc_read,
+	.write		= vfio_fsl_mc_write,
+	.mmap		= vfio_fsl_mc_mmap,
+	.bind_iommufd	= vfio_iommufd_physical_bind,
+	.unbind_iommufd	= vfio_iommufd_physical_unbind,
+	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
+	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
+};
+
+static struct fsl_mc_driver vfio_fsl_mc_driver = {
+	.probe		= vfio_fsl_mc_probe,
+	.remove		= vfio_fsl_mc_remove,
+	.driver	= {
+		.name	= "vfio-fsl-mc",
+	},
+	.driver_managed_dma = true,
+};
+
+module_fsl_mc_driver(vfio_fsl_mc_driver);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("VFIO for FSL-MC devices - User Level meta-driver");
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
new file mode 100644
index 0000000000..c51229fccb
--- /dev/null
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause)
+/*
+ * Copyright 2013-2016 Freescale Semiconductor Inc.
+ * Copyright 2019 NXP
+ */
+
+#include <linux/vfio.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/eventfd.h>
+
+#include "linux/fsl/mc.h"
+#include "vfio_fsl_mc_private.h"
+
+static int vfio_fsl_mc_irqs_allocate(struct vfio_fsl_mc_device *vdev)
+{
+	struct fsl_mc_device *mc_dev = vdev->mc_dev;
+	struct vfio_fsl_mc_irq *mc_irq;
+	int irq_count;
+	int ret, i;
+
+	/* Device does not support any interrupt */
+	if (mc_dev->obj_desc.irq_count == 0)
+		return 0;
+
+	/* interrupts were already allocated for this device */
+	if (vdev->mc_irqs)
+		return 0;
+
+	irq_count = mc_dev->obj_desc.irq_count;
+
+	mc_irq = kcalloc(irq_count, sizeof(*mc_irq), GFP_KERNEL_ACCOUNT);
+	if (!mc_irq)
+		return -ENOMEM;
+
+	/* Allocate IRQs */
+	ret = fsl_mc_allocate_irqs(mc_dev);
+	if (ret) {
+		kfree(mc_irq);
+		return ret;
+	}
+
+	for (i = 0; i < irq_count; i++) {
+		mc_irq[i].count = 1;
+		mc_irq[i].flags = VFIO_IRQ_INFO_EVENTFD;
+	}
+
+	vdev->mc_irqs = mc_irq;
+
+	return 0;
+}
+
+static irqreturn_t vfio_fsl_mc_irq_handler(int irq_num, void *arg)
+{
+	struct vfio_fsl_mc_irq *mc_irq = (struct vfio_fsl_mc_irq *)arg;
+
+	eventfd_signal(mc_irq->trigger, 1);
+	return IRQ_HANDLED;
+}
+
+static int vfio_set_trigger(struct vfio_fsl_mc_device *vdev,
+						   int index, int fd)
+{
+	struct vfio_fsl_mc_irq *irq = &vdev->mc_irqs[index];
+	struct eventfd_ctx *trigger;
+	int hwirq;
+	int ret;
+
+	hwirq = vdev->mc_dev->irqs[index]->virq;
+	if (irq->trigger) {
+		free_irq(hwirq, irq);
+		kfree(irq->name);
+		eventfd_ctx_put(irq->trigger);
+		irq->trigger = NULL;
+	}
+
+	if (fd < 0) /* Disable only */
+		return 0;
+
+	irq->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-irq[%d](%s)",
+			    hwirq, dev_name(&vdev->mc_dev->dev));
+	if (!irq->name)
+		return -ENOMEM;
+
+	trigger = eventfd_ctx_fdget(fd);
+	if (IS_ERR(trigger)) {
+		kfree(irq->name);
+		return PTR_ERR(trigger);
+	}
+
+	irq->trigger = trigger;
+
+	ret = request_irq(hwirq, vfio_fsl_mc_irq_handler, 0,
+		  irq->name, irq);
+	if (ret) {
+		kfree(irq->name);
+		eventfd_ctx_put(trigger);
+		irq->trigger = NULL;
+		return ret;
+	}
+
+	return 0;
+}
+
+static int vfio_fsl_mc_set_irq_trigger(struct vfio_fsl_mc_device *vdev,
+				       unsigned int index, unsigned int start,
+				       unsigned int count, u32 flags,
+				       void *data)
+{
+	struct fsl_mc_device *mc_dev = vdev->mc_dev;
+	int ret, hwirq;
+	struct vfio_fsl_mc_irq *irq;
+	struct device *cont_dev = fsl_mc_cont_dev(&mc_dev->dev);
+	struct fsl_mc_device *mc_cont = to_fsl_mc_device(cont_dev);
+
+	if (!count && (flags & VFIO_IRQ_SET_DATA_NONE))
+		return vfio_set_trigger(vdev, index, -1);
+
+	if (start != 0 || count != 1)
+		return -EINVAL;
+
+	mutex_lock(&vdev->vdev.dev_set->lock);
+	ret = fsl_mc_populate_irq_pool(mc_cont,
+			FSL_MC_IRQ_POOL_MAX_TOTAL_IRQS);
+	if (ret)
+		goto unlock;
+
+	ret = vfio_fsl_mc_irqs_allocate(vdev);
+	if (ret)
+		goto unlock;
+	mutex_unlock(&vdev->vdev.dev_set->lock);
+
+	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+		s32 fd = *(s32 *)data;
+
+		return vfio_set_trigger(vdev, index, fd);
+	}
+
+	hwirq = vdev->mc_dev->irqs[index]->virq;
+
+	irq = &vdev->mc_irqs[index];
+
+	if (flags & VFIO_IRQ_SET_DATA_NONE) {
+		vfio_fsl_mc_irq_handler(hwirq, irq);
+
+	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+		u8 trigger = *(u8 *)data;
+
+		if (trigger)
+			vfio_fsl_mc_irq_handler(hwirq, irq);
+	}
+
+	return 0;
+
+unlock:
+	mutex_unlock(&vdev->vdev.dev_set->lock);
+	return ret;
+
+}
+
+int vfio_fsl_mc_set_irqs_ioctl(struct vfio_fsl_mc_device *vdev,
+			       u32 flags, unsigned int index,
+			       unsigned int start, unsigned int count,
+			       void *data)
+{
+	if (flags & VFIO_IRQ_SET_ACTION_TRIGGER)
+		return  vfio_fsl_mc_set_irq_trigger(vdev, index, start,
+			  count, flags, data);
+	else
+		return -EINVAL;
+}
+
+/* Free All IRQs for the given MC object */
+void vfio_fsl_mc_irqs_cleanup(struct vfio_fsl_mc_device *vdev)
+{
+	struct fsl_mc_device *mc_dev = vdev->mc_dev;
+	int irq_count = mc_dev->obj_desc.irq_count;
+	int i;
+
+	/*
+	 * Device does not support any interrupt or the interrupts
+	 * were not configured
+	 */
+	if (!vdev->mc_irqs)
+		return;
+
+	for (i = 0; i < irq_count; i++)
+		vfio_set_trigger(vdev, i, -1);
+
+	fsl_mc_free_irqs(mc_dev);
+	kfree(vdev->mc_irqs);
+	vdev->mc_irqs = NULL;
+}
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h b/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h
new file mode 100644
index 0000000000..7a29f572f9
--- /dev/null
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
+/*
+ * Copyright 2013-2016 Freescale Semiconductor Inc.
+ * Copyright 2016,2019-2020 NXP
+ */
+
+#ifndef VFIO_FSL_MC_PRIVATE_H
+#define VFIO_FSL_MC_PRIVATE_H
+
+#define VFIO_FSL_MC_OFFSET_SHIFT    40
+#define VFIO_FSL_MC_OFFSET_MASK (((u64)(1) << VFIO_FSL_MC_OFFSET_SHIFT) - 1)
+
+#define VFIO_FSL_MC_OFFSET_TO_INDEX(off) ((off) >> VFIO_FSL_MC_OFFSET_SHIFT)
+
+#define VFIO_FSL_MC_INDEX_TO_OFFSET(index)	\
+	((u64)(index) << VFIO_FSL_MC_OFFSET_SHIFT)
+
+struct vfio_fsl_mc_irq {
+	u32         flags;
+	u32         count;
+	struct eventfd_ctx  *trigger;
+	char            *name;
+};
+
+struct vfio_fsl_mc_region {
+	u32			flags;
+	u32			type;
+	u64			addr;
+	resource_size_t		size;
+	void __iomem		*ioaddr;
+};
+
+struct vfio_fsl_mc_device {
+	struct vfio_device		vdev;
+	struct fsl_mc_device		*mc_dev;
+	struct notifier_block        nb;
+	struct vfio_fsl_mc_region	*regions;
+	struct mutex         igate;
+	struct vfio_fsl_mc_irq      *mc_irqs;
+};
+
+int vfio_fsl_mc_set_irqs_ioctl(struct vfio_fsl_mc_device *vdev,
+			       u32 flags, unsigned int index,
+			       unsigned int start, unsigned int count,
+			       void *data);
+
+void vfio_fsl_mc_irqs_cleanup(struct vfio_fsl_mc_device *vdev);
+
+#endif /* VFIO_FSL_MC_PRIVATE_H */
diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
new file mode 100644
index 0000000000..610a429c61
--- /dev/null
+++ b/drivers/vfio/group.c
@@ -0,0 +1,959 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VFIO core
+ *
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ */
+
+#include <linux/vfio.h>
+#include <linux/iommufd.h>
+#include <linux/anon_inodes.h>
+#include "vfio.h"
+
+static struct vfio {
+	struct class			*class;
+	struct list_head		group_list;
+	struct mutex			group_lock; /* locks group_list */
+	struct ida			group_ida;
+	dev_t				group_devt;
+} vfio;
+
+static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
+						     char *buf)
+{
+	struct vfio_device *it, *device = ERR_PTR(-ENODEV);
+
+	mutex_lock(&group->device_lock);
+	list_for_each_entry(it, &group->device_list, group_next) {
+		int ret;
+
+		if (it->ops->match) {
+			ret = it->ops->match(it, buf);
+			if (ret < 0) {
+				device = ERR_PTR(ret);
+				break;
+			}
+		} else {
+			ret = !strcmp(dev_name(it->dev), buf);
+		}
+
+		if (ret && vfio_device_try_get_registration(it)) {
+			device = it;
+			break;
+		}
+	}
+	mutex_unlock(&group->device_lock);
+
+	return device;
+}
+
+/*
+ * VFIO Group fd, /dev/vfio/$GROUP
+ */
+static bool vfio_group_has_iommu(struct vfio_group *group)
+{
+	lockdep_assert_held(&group->group_lock);
+	/*
+	 * There can only be users if there is a container, and if there is a
+	 * container there must be users.
+	 */
+	WARN_ON(!group->container != !group->container_users);
+
+	return group->container || group->iommufd;
+}
+
+/*
+ * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
+ * if there was no container to unset.  Since the ioctl is called on
+ * the group, we know that still exists, therefore the only valid
+ * transition here is 1->0.
+ */
+static int vfio_group_ioctl_unset_container(struct vfio_group *group)
+{
+	int ret = 0;
+
+	mutex_lock(&group->group_lock);
+	if (!vfio_group_has_iommu(group)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+	if (group->container) {
+		if (group->container_users != 1) {
+			ret = -EBUSY;
+			goto out_unlock;
+		}
+		vfio_group_detach_container(group);
+	}
+	if (group->iommufd) {
+		iommufd_ctx_put(group->iommufd);
+		group->iommufd = NULL;
+	}
+
+out_unlock:
+	mutex_unlock(&group->group_lock);
+	return ret;
+}
+
+static int vfio_group_ioctl_set_container(struct vfio_group *group,
+					  int __user *arg)
+{
+	struct vfio_container *container;
+	struct iommufd_ctx *iommufd;
+	struct fd f;
+	int ret;
+	int fd;
+
+	if (get_user(fd, arg))
+		return -EFAULT;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+
+	mutex_lock(&group->group_lock);
+	if (vfio_group_has_iommu(group)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+	if (!group->iommu_group) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	container = vfio_container_from_file(f.file);
+	if (container) {
+		ret = vfio_container_attach_group(container, group);
+		goto out_unlock;
+	}
+
+	iommufd = iommufd_ctx_from_file(f.file);
+	if (!IS_ERR(iommufd)) {
+		if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) &&
+		    group->type == VFIO_NO_IOMMU)
+			ret = iommufd_vfio_compat_set_no_iommu(iommufd);
+		else
+			ret = iommufd_vfio_compat_ioas_create(iommufd);
+
+		if (ret) {
+			iommufd_ctx_put(iommufd);
+			goto out_unlock;
+		}
+
+		group->iommufd = iommufd;
+		goto out_unlock;
+	}
+
+	/* The FD passed is not recognized. */
+	ret = -EBADFD;
+
+out_unlock:
+	mutex_unlock(&group->group_lock);
+	fdput(f);
+	return ret;
+}
+
+static void vfio_device_group_get_kvm_safe(struct vfio_device *device)
+{
+	spin_lock(&device->group->kvm_ref_lock);
+	vfio_device_get_kvm_safe(device, device->group->kvm);
+	spin_unlock(&device->group->kvm_ref_lock);
+}
+
+static int vfio_df_group_open(struct vfio_device_file *df)
+{
+	struct vfio_device *device = df->device;
+	int ret;
+
+	mutex_lock(&device->group->group_lock);
+	if (!vfio_group_has_iommu(device->group)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	mutex_lock(&device->dev_set->lock);
+
+	/*
+	 * Before the first device open, get the KVM pointer currently
+	 * associated with the group (if there is one) and obtain a reference
+	 * now that will be held until the open_count reaches 0 again.  Save
+	 * the pointer in the device for use by drivers.
+	 */
+	if (device->open_count == 0)
+		vfio_device_group_get_kvm_safe(device);
+
+	df->iommufd = device->group->iommufd;
+	if (df->iommufd && vfio_device_is_noiommu(device) && device->open_count == 0) {
+		/*
+		 * Require no compat ioas to be assigned to proceed.  The basic
+		 * statement is that the user cannot have done something that
+		 * implies they expected translation to exist
+		 */
+		if (!capable(CAP_SYS_RAWIO) ||
+		    vfio_iommufd_device_has_compat_ioas(device, df->iommufd))
+			ret = -EPERM;
+		else
+			ret = 0;
+		goto out_put_kvm;
+	}
+
+	ret = vfio_df_open(df);
+	if (ret)
+		goto out_put_kvm;
+
+	if (df->iommufd && device->open_count == 1) {
+		ret = vfio_iommufd_compat_attach_ioas(device, df->iommufd);
+		if (ret)
+			goto out_close_device;
+	}
+
+	/*
+	 * Paired with smp_load_acquire() in vfio_device_fops::ioctl/
+	 * read/write/mmap and vfio_file_has_device_access()
+	 */
+	smp_store_release(&df->access_granted, true);
+
+	mutex_unlock(&device->dev_set->lock);
+	mutex_unlock(&device->group->group_lock);
+	return 0;
+
+out_close_device:
+	vfio_df_close(df);
+out_put_kvm:
+	df->iommufd = NULL;
+	if (device->open_count == 0)
+		vfio_device_put_kvm(device);
+	mutex_unlock(&device->dev_set->lock);
+out_unlock:
+	mutex_unlock(&device->group->group_lock);
+	return ret;
+}
+
+void vfio_df_group_close(struct vfio_device_file *df)
+{
+	struct vfio_device *device = df->device;
+
+	mutex_lock(&device->group->group_lock);
+	mutex_lock(&device->dev_set->lock);
+
+	vfio_df_close(df);
+	df->iommufd = NULL;
+
+	if (device->open_count == 0)
+		vfio_device_put_kvm(device);
+
+	mutex_unlock(&device->dev_set->lock);
+	mutex_unlock(&device->group->group_lock);
+}
+
+static struct file *vfio_device_open_file(struct vfio_device *device)
+{
+	struct vfio_device_file *df;
+	struct file *filep;
+	int ret;
+
+	df = vfio_allocate_device_file(device);
+	if (IS_ERR(df)) {
+		ret = PTR_ERR(df);
+		goto err_out;
+	}
+
+	df->group = device->group;
+
+	ret = vfio_df_group_open(df);
+	if (ret)
+		goto err_free;
+
+	/*
+	 * We can't use anon_inode_getfd() because we need to modify
+	 * the f_mode flags directly to allow more than just ioctls
+	 */
+	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
+				   df, O_RDWR);
+	if (IS_ERR(filep)) {
+		ret = PTR_ERR(filep);
+		goto err_close_device;
+	}
+
+	/*
+	 * TODO: add an anon_inode interface to do this.
+	 * Appears to be missing by lack of need rather than
+	 * explicitly prevented.  Now there's need.
+	 */
+	filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE);
+
+	if (device->group->type == VFIO_NO_IOMMU)
+		dev_warn(device->dev, "vfio-noiommu device opened by user "
+			 "(%s:%d)\n", current->comm, task_pid_nr(current));
+	/*
+	 * On success the ref of device is moved to the file and
+	 * put in vfio_device_fops_release()
+	 */
+	return filep;
+
+err_close_device:
+	vfio_df_group_close(df);
+err_free:
+	kfree(df);
+err_out:
+	return ERR_PTR(ret);
+}
+
+static int vfio_group_ioctl_get_device_fd(struct vfio_group *group,
+					  char __user *arg)
+{
+	struct vfio_device *device;
+	struct file *filep;
+	char *buf;
+	int fdno;
+	int ret;
+
+	buf = strndup_user(arg, PAGE_SIZE);
+	if (IS_ERR(buf))
+		return PTR_ERR(buf);
+
+	device = vfio_device_get_from_name(group, buf);
+	kfree(buf);
+	if (IS_ERR(device))
+		return PTR_ERR(device);
+
+	fdno = get_unused_fd_flags(O_CLOEXEC);
+	if (fdno < 0) {
+		ret = fdno;
+		goto err_put_device;
+	}
+
+	filep = vfio_device_open_file(device);
+	if (IS_ERR(filep)) {
+		ret = PTR_ERR(filep);
+		goto err_put_fdno;
+	}
+
+	fd_install(fdno, filep);
+	return fdno;
+
+err_put_fdno:
+	put_unused_fd(fdno);
+err_put_device:
+	vfio_device_put_registration(device);
+	return ret;
+}
+
+static int vfio_group_ioctl_get_status(struct vfio_group *group,
+				       struct vfio_group_status __user *arg)
+{
+	unsigned long minsz = offsetofend(struct vfio_group_status, flags);
+	struct vfio_group_status status;
+
+	if (copy_from_user(&status, arg, minsz))
+		return -EFAULT;
+
+	if (status.argsz < minsz)
+		return -EINVAL;
+
+	status.flags = 0;
+
+	mutex_lock(&group->group_lock);
+	if (!group->iommu_group) {
+		mutex_unlock(&group->group_lock);
+		return -ENODEV;
+	}
+
+	/*
+	 * With the container FD the iommu_group_claim_dma_owner() is done
+	 * during SET_CONTAINER but for IOMMFD this is done during
+	 * VFIO_GROUP_GET_DEVICE_FD. Meaning that with iommufd
+	 * VFIO_GROUP_FLAGS_VIABLE could be set but GET_DEVICE_FD will fail due
+	 * to viability.
+	 */
+	if (vfio_group_has_iommu(group))
+		status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET |
+				VFIO_GROUP_FLAGS_VIABLE;
+	else if (!iommu_group_dma_owner_claimed(group->iommu_group))
+		status.flags |= VFIO_GROUP_FLAGS_VIABLE;
+	mutex_unlock(&group->group_lock);
+
+	if (copy_to_user(arg, &status, minsz))
+		return -EFAULT;
+	return 0;
+}
+
+static long vfio_group_fops_unl_ioctl(struct file *filep,
+				      unsigned int cmd, unsigned long arg)
+{
+	struct vfio_group *group = filep->private_data;
+	void __user *uarg = (void __user *)arg;
+
+	switch (cmd) {
+	case VFIO_GROUP_GET_DEVICE_FD:
+		return vfio_group_ioctl_get_device_fd(group, uarg);
+	case VFIO_GROUP_GET_STATUS:
+		return vfio_group_ioctl_get_status(group, uarg);
+	case VFIO_GROUP_SET_CONTAINER:
+		return vfio_group_ioctl_set_container(group, uarg);
+	case VFIO_GROUP_UNSET_CONTAINER:
+		return vfio_group_ioctl_unset_container(group);
+	default:
+		return -ENOTTY;
+	}
+}
+
+int vfio_device_block_group(struct vfio_device *device)
+{
+	struct vfio_group *group = device->group;
+	int ret = 0;
+
+	mutex_lock(&group->group_lock);
+	if (group->opened_file) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
+	group->cdev_device_open_cnt++;
+
+out_unlock:
+	mutex_unlock(&group->group_lock);
+	return ret;
+}
+
+void vfio_device_unblock_group(struct vfio_device *device)
+{
+	struct vfio_group *group = device->group;
+
+	mutex_lock(&group->group_lock);
+	group->cdev_device_open_cnt--;
+	mutex_unlock(&group->group_lock);
+}
+
+static int vfio_group_fops_open(struct inode *inode, struct file *filep)
+{
+	struct vfio_group *group =
+		container_of(inode->i_cdev, struct vfio_group, cdev);
+	int ret;
+
+	mutex_lock(&group->group_lock);
+
+	/*
+	 * drivers can be zero if this races with vfio_device_remove_group(), it
+	 * will be stable at 0 under the group rwsem
+	 */
+	if (refcount_read(&group->drivers) == 0) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
+		ret = -EPERM;
+		goto out_unlock;
+	}
+
+	if (group->cdev_device_open_cnt) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
+	/*
+	 * Do we need multiple instances of the group open?  Seems not.
+	 */
+	if (group->opened_file) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+	group->opened_file = filep;
+	filep->private_data = group;
+	ret = 0;
+out_unlock:
+	mutex_unlock(&group->group_lock);
+	return ret;
+}
+
+static int vfio_group_fops_release(struct inode *inode, struct file *filep)
+{
+	struct vfio_group *group = filep->private_data;
+
+	filep->private_data = NULL;
+
+	mutex_lock(&group->group_lock);
+	/*
+	 * Device FDs hold a group file reference, therefore the group release
+	 * is only called when there are no open devices.
+	 */
+	WARN_ON(group->notifier.head);
+	if (group->container)
+		vfio_group_detach_container(group);
+	if (group->iommufd) {
+		iommufd_ctx_put(group->iommufd);
+		group->iommufd = NULL;
+	}
+	group->opened_file = NULL;
+	mutex_unlock(&group->group_lock);
+	return 0;
+}
+
+static const struct file_operations vfio_group_fops = {
+	.owner		= THIS_MODULE,
+	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
+	.compat_ioctl	= compat_ptr_ioctl,
+	.open		= vfio_group_fops_open,
+	.release	= vfio_group_fops_release,
+};
+
+/*
+ * Group objects - create, release, get, put, search
+ */
+static struct vfio_group *
+vfio_group_find_from_iommu(struct iommu_group *iommu_group)
+{
+	struct vfio_group *group;
+
+	lockdep_assert_held(&vfio.group_lock);
+
+	/*
+	 * group->iommu_group from the vfio.group_list cannot be NULL
+	 * under the vfio.group_lock.
+	 */
+	list_for_each_entry(group, &vfio.group_list, vfio_next) {
+		if (group->iommu_group == iommu_group)
+			return group;
+	}
+	return NULL;
+}
+
+static void vfio_group_release(struct device *dev)
+{
+	struct vfio_group *group = container_of(dev, struct vfio_group, dev);
+
+	mutex_destroy(&group->device_lock);
+	mutex_destroy(&group->group_lock);
+	WARN_ON(group->iommu_group);
+	WARN_ON(group->cdev_device_open_cnt);
+	ida_free(&vfio.group_ida, MINOR(group->dev.devt));
+	kfree(group);
+}
+
+static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
+					   enum vfio_group_type type)
+{
+	struct vfio_group *group;
+	int minor;
+
+	group = kzalloc(sizeof(*group), GFP_KERNEL);
+	if (!group)
+		return ERR_PTR(-ENOMEM);
+
+	minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL);
+	if (minor < 0) {
+		kfree(group);
+		return ERR_PTR(minor);
+	}
+
+	device_initialize(&group->dev);
+	group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor);
+	group->dev.class = vfio.class;
+	group->dev.release = vfio_group_release;
+	cdev_init(&group->cdev, &vfio_group_fops);
+	group->cdev.owner = THIS_MODULE;
+
+	refcount_set(&group->drivers, 1);
+	mutex_init(&group->group_lock);
+	spin_lock_init(&group->kvm_ref_lock);
+	INIT_LIST_HEAD(&group->device_list);
+	mutex_init(&group->device_lock);
+	group->iommu_group = iommu_group;
+	/* put in vfio_group_release() */
+	iommu_group_ref_get(iommu_group);
+	group->type = type;
+	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
+
+	return group;
+}
+
+static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
+		enum vfio_group_type type)
+{
+	struct vfio_group *group;
+	struct vfio_group *ret;
+	int err;
+
+	lockdep_assert_held(&vfio.group_lock);
+
+	group = vfio_group_alloc(iommu_group, type);
+	if (IS_ERR(group))
+		return group;
+
+	err = dev_set_name(&group->dev, "%s%d",
+			   group->type == VFIO_NO_IOMMU ? "noiommu-" : "",
+			   iommu_group_id(iommu_group));
+	if (err) {
+		ret = ERR_PTR(err);
+		goto err_put;
+	}
+
+	err = cdev_device_add(&group->cdev, &group->dev);
+	if (err) {
+		ret = ERR_PTR(err);
+		goto err_put;
+	}
+
+	list_add(&group->vfio_next, &vfio.group_list);
+
+	return group;
+
+err_put:
+	put_device(&group->dev);
+	return ret;
+}
+
+static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev,
+		enum vfio_group_type type)
+{
+	struct iommu_group *iommu_group;
+	struct vfio_group *group;
+	int ret;
+
+	iommu_group = iommu_group_alloc();
+	if (IS_ERR(iommu_group))
+		return ERR_CAST(iommu_group);
+
+	ret = iommu_group_set_name(iommu_group, "vfio-noiommu");
+	if (ret)
+		goto out_put_group;
+	ret = iommu_group_add_device(iommu_group, dev);
+	if (ret)
+		goto out_put_group;
+
+	mutex_lock(&vfio.group_lock);
+	group = vfio_create_group(iommu_group, type);
+	mutex_unlock(&vfio.group_lock);
+	if (IS_ERR(group)) {
+		ret = PTR_ERR(group);
+		goto out_remove_device;
+	}
+	iommu_group_put(iommu_group);
+	return group;
+
+out_remove_device:
+	iommu_group_remove_device(dev);
+out_put_group:
+	iommu_group_put(iommu_group);
+	return ERR_PTR(ret);
+}
+
+static bool vfio_group_has_device(struct vfio_group *group, struct device *dev)
+{
+	struct vfio_device *device;
+
+	mutex_lock(&group->device_lock);
+	list_for_each_entry(device, &group->device_list, group_next) {
+		if (device->dev == dev) {
+			mutex_unlock(&group->device_lock);
+			return true;
+		}
+	}
+	mutex_unlock(&group->device_lock);
+	return false;
+}
+
+static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
+{
+	struct iommu_group *iommu_group;
+	struct vfio_group *group;
+
+	iommu_group = iommu_group_get(dev);
+	if (!iommu_group && vfio_noiommu) {
+		/*
+		 * With noiommu enabled, create an IOMMU group for devices that
+		 * don't already have one, implying no IOMMU hardware/driver
+		 * exists.  Taint the kernel because we're about to give a DMA
+		 * capable device to a user without IOMMU protection.
+		 */
+		group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);
+		if (!IS_ERR(group)) {
+			add_taint(TAINT_USER, LOCKDEP_STILL_OK);
+			dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
+		}
+		return group;
+	}
+
+	if (!iommu_group)
+		return ERR_PTR(-EINVAL);
+
+	mutex_lock(&vfio.group_lock);
+	group = vfio_group_find_from_iommu(iommu_group);
+	if (group) {
+		if (WARN_ON(vfio_group_has_device(group, dev)))
+			group = ERR_PTR(-EINVAL);
+		else
+			refcount_inc(&group->drivers);
+	} else {
+		group = vfio_create_group(iommu_group, VFIO_IOMMU);
+	}
+	mutex_unlock(&vfio.group_lock);
+
+	/* The vfio_group holds a reference to the iommu_group */
+	iommu_group_put(iommu_group);
+	return group;
+}
+
+int vfio_device_set_group(struct vfio_device *device,
+			  enum vfio_group_type type)
+{
+	struct vfio_group *group;
+
+	if (type == VFIO_IOMMU)
+		group = vfio_group_find_or_alloc(device->dev);
+	else
+		group = vfio_noiommu_group_alloc(device->dev, type);
+
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	/* Our reference on group is moved to the device */
+	device->group = group;
+	return 0;
+}
+
+void vfio_device_remove_group(struct vfio_device *device)
+{
+	struct vfio_group *group = device->group;
+	struct iommu_group *iommu_group;
+
+	if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
+		iommu_group_remove_device(device->dev);
+
+	/* Pairs with vfio_create_group() / vfio_group_get_from_iommu() */
+	if (!refcount_dec_and_mutex_lock(&group->drivers, &vfio.group_lock))
+		return;
+	list_del(&group->vfio_next);
+
+	/*
+	 * We could concurrently probe another driver in the group that might
+	 * race vfio_device_remove_group() with vfio_get_group(), so we have to
+	 * ensure that the sysfs is all cleaned up under lock otherwise the
+	 * cdev_device_add() will fail due to the name aready existing.
+	 */
+	cdev_device_del(&group->cdev, &group->dev);
+
+	mutex_lock(&group->group_lock);
+	/*
+	 * These data structures all have paired operations that can only be
+	 * undone when the caller holds a live reference on the device. Since
+	 * all pairs must be undone these WARN_ON's indicate some caller did not
+	 * properly hold the group reference.
+	 */
+	WARN_ON(!list_empty(&group->device_list));
+	WARN_ON(group->notifier.head);
+
+	/*
+	 * Revoke all users of group->iommu_group. At this point we know there
+	 * are no devices active because we are unplugging the last one. Setting
+	 * iommu_group to NULL blocks all new users.
+	 */
+	if (group->container)
+		vfio_group_detach_container(group);
+	iommu_group = group->iommu_group;
+	group->iommu_group = NULL;
+	mutex_unlock(&group->group_lock);
+	mutex_unlock(&vfio.group_lock);
+
+	iommu_group_put(iommu_group);
+	put_device(&group->dev);
+}
+
+void vfio_device_group_register(struct vfio_device *device)
+{
+	mutex_lock(&device->group->device_lock);
+	list_add(&device->group_next, &device->group->device_list);
+	mutex_unlock(&device->group->device_lock);
+}
+
+void vfio_device_group_unregister(struct vfio_device *device)
+{
+	mutex_lock(&device->group->device_lock);
+	list_del(&device->group_next);
+	mutex_unlock(&device->group->device_lock);
+}
+
+int vfio_device_group_use_iommu(struct vfio_device *device)
+{
+	struct vfio_group *group = device->group;
+	int ret = 0;
+
+	lockdep_assert_held(&group->group_lock);
+
+	if (WARN_ON(!group->container))
+		return -EINVAL;
+
+	ret = vfio_group_use_container(group);
+	if (ret)
+		return ret;
+	vfio_device_container_register(device);
+	return 0;
+}
+
+void vfio_device_group_unuse_iommu(struct vfio_device *device)
+{
+	struct vfio_group *group = device->group;
+
+	lockdep_assert_held(&group->group_lock);
+
+	if (WARN_ON(!group->container))
+		return;
+
+	vfio_device_container_unregister(device);
+	vfio_group_unuse_container(group);
+}
+
+bool vfio_device_has_container(struct vfio_device *device)
+{
+	return device->group->container;
+}
+
+struct vfio_group *vfio_group_from_file(struct file *file)
+{
+	struct vfio_group *group = file->private_data;
+
+	if (file->f_op != &vfio_group_fops)
+		return NULL;
+	return group;
+}
+
+/**
+ * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file
+ * @file: VFIO group file
+ *
+ * The returned iommu_group is valid as long as a ref is held on the file. This
+ * returns a reference on the group. This function is deprecated, only the SPAPR
+ * path in kvm should call it.
+ */
+struct iommu_group *vfio_file_iommu_group(struct file *file)
+{
+	struct vfio_group *group = vfio_group_from_file(file);
+	struct iommu_group *iommu_group = NULL;
+
+	if (!IS_ENABLED(CONFIG_SPAPR_TCE_IOMMU))
+		return NULL;
+
+	if (!group)
+		return NULL;
+
+	mutex_lock(&group->group_lock);
+	if (group->iommu_group) {
+		iommu_group = group->iommu_group;
+		iommu_group_ref_get(iommu_group);
+	}
+	mutex_unlock(&group->group_lock);
+	return iommu_group;
+}
+EXPORT_SYMBOL_GPL(vfio_file_iommu_group);
+
+/**
+ * vfio_file_is_group - True if the file is a vfio group file
+ * @file: VFIO group file
+ */
+bool vfio_file_is_group(struct file *file)
+{
+	return vfio_group_from_file(file);
+}
+EXPORT_SYMBOL_GPL(vfio_file_is_group);
+
+bool vfio_group_enforced_coherent(struct vfio_group *group)
+{
+	struct vfio_device *device;
+	bool ret = true;
+
+	/*
+	 * If the device does not have IOMMU_CAP_ENFORCE_CACHE_COHERENCY then
+	 * any domain later attached to it will also not support it. If the cap
+	 * is set then the iommu_domain eventually attached to the device/group
+	 * must use a domain with enforce_cache_coherency().
+	 */
+	mutex_lock(&group->device_lock);
+	list_for_each_entry(device, &group->device_list, group_next) {
+		if (!device_iommu_capable(device->dev,
+					  IOMMU_CAP_ENFORCE_CACHE_COHERENCY)) {
+			ret = false;
+			break;
+		}
+	}
+	mutex_unlock(&group->device_lock);
+	return ret;
+}
+
+void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
+{
+	spin_lock(&group->kvm_ref_lock);
+	group->kvm = kvm;
+	spin_unlock(&group->kvm_ref_lock);
+}
+
+/**
+ * vfio_file_has_dev - True if the VFIO file is a handle for device
+ * @file: VFIO file to check
+ * @device: Device that must be part of the file
+ *
+ * Returns true if given file has permission to manipulate the given device.
+ */
+bool vfio_file_has_dev(struct file *file, struct vfio_device *device)
+{
+	struct vfio_group *group = vfio_group_from_file(file);
+
+	if (!group)
+		return false;
+
+	return group == device->group;
+}
+EXPORT_SYMBOL_GPL(vfio_file_has_dev);
+
+static char *vfio_devnode(const struct device *dev, umode_t *mode)
+{
+	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
+}
+
+int __init vfio_group_init(void)
+{
+	int ret;
+
+	ida_init(&vfio.group_ida);
+	mutex_init(&vfio.group_lock);
+	INIT_LIST_HEAD(&vfio.group_list);
+
+	ret = vfio_container_init();
+	if (ret)
+		return ret;
+
+	/* /dev/vfio/$GROUP */
+	vfio.class = class_create("vfio");
+	if (IS_ERR(vfio.class)) {
+		ret = PTR_ERR(vfio.class);
+		goto err_group_class;
+	}
+
+	vfio.class->devnode = vfio_devnode;
+
+	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
+	if (ret)
+		goto err_alloc_chrdev;
+	return 0;
+
+err_alloc_chrdev:
+	class_destroy(vfio.class);
+	vfio.class = NULL;
+err_group_class:
+	vfio_container_cleanup();
+	return ret;
+}
+
+void vfio_group_cleanup(void)
+{
+	WARN_ON(!list_empty(&vfio.group_list));
+	ida_destroy(&vfio.group_ida);
+	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
+	class_destroy(vfio.class);
+	vfio.class = NULL;
+	vfio_container_cleanup();
+}
diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
new file mode 100644
index 0000000000..82eba6966f
--- /dev/null
+++ b/drivers/vfio/iommufd.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ */
+#include <linux/vfio.h>
+#include <linux/iommufd.h>
+
+#include "vfio.h"
+
+MODULE_IMPORT_NS(IOMMUFD);
+MODULE_IMPORT_NS(IOMMUFD_VFIO);
+
+bool vfio_iommufd_device_has_compat_ioas(struct vfio_device *vdev,
+					 struct iommufd_ctx *ictx)
+{
+	u32 ioas_id;
+
+	return !iommufd_vfio_compat_ioas_get_id(ictx, &ioas_id);
+}
+
+int vfio_df_iommufd_bind(struct vfio_device_file *df)
+{
+	struct vfio_device *vdev = df->device;
+	struct iommufd_ctx *ictx = df->iommufd;
+
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	return vdev->ops->bind_iommufd(vdev, ictx, &df->devid);
+}
+
+int vfio_iommufd_compat_attach_ioas(struct vfio_device *vdev,
+				    struct iommufd_ctx *ictx)
+{
+	u32 ioas_id;
+	int ret;
+
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	/* compat noiommu does not need to do ioas attach */
+	if (vfio_device_is_noiommu(vdev))
+		return 0;
+
+	ret = iommufd_vfio_compat_ioas_get_id(ictx, &ioas_id);
+	if (ret)
+		return ret;
+
+	/* The legacy path has no way to return the selected pt_id */
+	return vdev->ops->attach_ioas(vdev, &ioas_id);
+}
+
+void vfio_df_iommufd_unbind(struct vfio_device_file *df)
+{
+	struct vfio_device *vdev = df->device;
+
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	if (vfio_device_is_noiommu(vdev))
+		return;
+
+	if (vdev->ops->unbind_iommufd)
+		vdev->ops->unbind_iommufd(vdev);
+}
+
+struct iommufd_ctx *vfio_iommufd_device_ictx(struct vfio_device *vdev)
+{
+	if (vdev->iommufd_device)
+		return iommufd_device_to_ictx(vdev->iommufd_device);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_device_ictx);
+
+static int vfio_iommufd_device_id(struct vfio_device *vdev)
+{
+	if (vdev->iommufd_device)
+		return iommufd_device_to_id(vdev->iommufd_device);
+	return -EINVAL;
+}
+
+/*
+ * Return devid for a device.
+ *  valid ID for the device that is owned by the ictx
+ *  -ENOENT = device is owned but there is no ID
+ *  -ENODEV or other error = device is not owned
+ */
+int vfio_iommufd_get_dev_id(struct vfio_device *vdev, struct iommufd_ctx *ictx)
+{
+	struct iommu_group *group;
+	int devid;
+
+	if (vfio_iommufd_device_ictx(vdev) == ictx)
+		return vfio_iommufd_device_id(vdev);
+
+	group = iommu_group_get(vdev->dev);
+	if (!group)
+		return -ENODEV;
+
+	if (iommufd_ctx_has_group(ictx, group))
+		devid = -ENOENT;
+	else
+		devid = -ENODEV;
+
+	iommu_group_put(group);
+
+	return devid;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_get_dev_id);
+
+/*
+ * The physical standard ops mean that the iommufd_device is bound to the
+ * physical device vdev->dev that was provided to vfio_init_group_dev(). Drivers
+ * using this ops set should call vfio_register_group_dev()
+ */
+int vfio_iommufd_physical_bind(struct vfio_device *vdev,
+			       struct iommufd_ctx *ictx, u32 *out_device_id)
+{
+	struct iommufd_device *idev;
+
+	idev = iommufd_device_bind(ictx, vdev->dev, out_device_id);
+	if (IS_ERR(idev))
+		return PTR_ERR(idev);
+	vdev->iommufd_device = idev;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_physical_bind);
+
+void vfio_iommufd_physical_unbind(struct vfio_device *vdev)
+{
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	if (vdev->iommufd_attached) {
+		iommufd_device_detach(vdev->iommufd_device);
+		vdev->iommufd_attached = false;
+	}
+	iommufd_device_unbind(vdev->iommufd_device);
+	vdev->iommufd_device = NULL;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_physical_unbind);
+
+int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id)
+{
+	int rc;
+
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	if (WARN_ON(!vdev->iommufd_device))
+		return -EINVAL;
+
+	if (vdev->iommufd_attached)
+		rc = iommufd_device_replace(vdev->iommufd_device, pt_id);
+	else
+		rc = iommufd_device_attach(vdev->iommufd_device, pt_id);
+	if (rc)
+		return rc;
+	vdev->iommufd_attached = true;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_physical_attach_ioas);
+
+void vfio_iommufd_physical_detach_ioas(struct vfio_device *vdev)
+{
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	if (WARN_ON(!vdev->iommufd_device) || !vdev->iommufd_attached)
+		return;
+
+	iommufd_device_detach(vdev->iommufd_device);
+	vdev->iommufd_attached = false;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_physical_detach_ioas);
+
+/*
+ * The emulated standard ops mean that vfio_device is going to use the
+ * "mdev path" and will call vfio_pin_pages()/vfio_dma_rw(). Drivers using this
+ * ops set should call vfio_register_emulated_iommu_dev(). Drivers that do
+ * not call vfio_pin_pages()/vfio_dma_rw() have no need to provide dma_unmap.
+ */
+
+static void vfio_emulated_unmap(void *data, unsigned long iova,
+				unsigned long length)
+{
+	struct vfio_device *vdev = data;
+
+	if (vdev->ops->dma_unmap)
+		vdev->ops->dma_unmap(vdev, iova, length);
+}
+
+static const struct iommufd_access_ops vfio_user_ops = {
+	.needs_pin_pages = 1,
+	.unmap = vfio_emulated_unmap,
+};
+
+int vfio_iommufd_emulated_bind(struct vfio_device *vdev,
+			       struct iommufd_ctx *ictx, u32 *out_device_id)
+{
+	struct iommufd_access *user;
+
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	user = iommufd_access_create(ictx, &vfio_user_ops, vdev, out_device_id);
+	if (IS_ERR(user))
+		return PTR_ERR(user);
+	vdev->iommufd_access = user;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_bind);
+
+void vfio_iommufd_emulated_unbind(struct vfio_device *vdev)
+{
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	if (vdev->iommufd_access) {
+		iommufd_access_destroy(vdev->iommufd_access);
+		vdev->iommufd_attached = false;
+		vdev->iommufd_access = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_unbind);
+
+int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id)
+{
+	int rc;
+
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	if (vdev->iommufd_attached)
+		rc = iommufd_access_replace(vdev->iommufd_access, *pt_id);
+	else
+		rc = iommufd_access_attach(vdev->iommufd_access, *pt_id);
+	if (rc)
+		return rc;
+	vdev->iommufd_attached = true;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_attach_ioas);
+
+void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev)
+{
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	if (WARN_ON(!vdev->iommufd_access) ||
+	    !vdev->iommufd_attached)
+		return;
+
+	iommufd_access_detach(vdev->iommufd_access);
+	vdev->iommufd_attached = false;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_detach_ioas);
diff --git a/drivers/vfio/iova_bitmap.c b/drivers/vfio/iova_bitmap.c
new file mode 100644
index 0000000000..0848f920ef
--- /dev/null
+++ b/drivers/vfio/iova_bitmap.c
@@ -0,0 +1,423 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022, Oracle and/or its affiliates.
+ * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+#include <linux/iova_bitmap.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE)
+
+/*
+ * struct iova_bitmap_map - A bitmap representing an IOVA range
+ *
+ * Main data structure for tracking mapped user pages of bitmap data.
+ *
+ * For example, for something recording dirty IOVAs, it will be provided a
+ * struct iova_bitmap structure, as a general structure for iterating the
+ * total IOVA range. The struct iova_bitmap_map, though, represents the
+ * subset of said IOVA space that is pinned by its parent structure (struct
+ * iova_bitmap).
+ *
+ * The user does not need to exact location of the bits in the bitmap.
+ * From user perspective the only API available is iova_bitmap_set() which
+ * records the IOVA *range* in the bitmap by setting the corresponding
+ * bits.
+ *
+ * The bitmap is an array of u64 whereas each bit represents an IOVA of
+ * range of (1 << pgshift). Thus formula for the bitmap data to be set is:
+ *
+ *   data[(iova / page_size) / 64] & (1ULL << (iova % 64))
+ */
+struct iova_bitmap_map {
+	/* base IOVA representing bit 0 of the first page */
+	unsigned long iova;
+
+	/* page size order that each bit granules to */
+	unsigned long pgshift;
+
+	/* page offset of the first user page pinned */
+	unsigned long pgoff;
+
+	/* number of pages pinned */
+	unsigned long npages;
+
+	/* pinned pages representing the bitmap data */
+	struct page **pages;
+};
+
+/*
+ * struct iova_bitmap - The IOVA bitmap object
+ *
+ * Main data structure for iterating over the bitmap data.
+ *
+ * Abstracts the pinning work and iterates in IOVA ranges.
+ * It uses a windowing scheme and pins the bitmap in relatively
+ * big ranges e.g.
+ *
+ * The bitmap object uses one base page to store all the pinned pages
+ * pointers related to the bitmap. For sizeof(struct page*) == 8 it stores
+ * 512 struct page pointers which, if the base page size is 4K, it means
+ * 2M of bitmap data is pinned at a time. If the iova_bitmap page size is
+ * also 4K then the range window to iterate is 64G.
+ *
+ * For example iterating on a total IOVA range of 4G..128G, it will walk
+ * through this set of ranges:
+ *
+ *    4G  -  68G-1 (64G)
+ *    68G - 128G-1 (64G)
+ *
+ * An example of the APIs on how to use/iterate over the IOVA bitmap:
+ *
+ *   bitmap = iova_bitmap_alloc(iova, length, page_size, data);
+ *   if (IS_ERR(bitmap))
+ *       return PTR_ERR(bitmap);
+ *
+ *   ret = iova_bitmap_for_each(bitmap, arg, dirty_reporter_fn);
+ *
+ *   iova_bitmap_free(bitmap);
+ *
+ * Each iteration of the @dirty_reporter_fn is called with a unique @iova
+ * and @length argument, indicating the current range available through the
+ * iova_bitmap. The @dirty_reporter_fn uses iova_bitmap_set() to mark dirty
+ * areas (@iova_length) within that provided range, as following:
+ *
+ *   iova_bitmap_set(bitmap, iova, iova_length);
+ *
+ * The internals of the object uses an index @mapped_base_index that indexes
+ * which u64 word of the bitmap is mapped, up to @mapped_total_index.
+ * Those keep being incremented until @mapped_total_index is reached while
+ * mapping up to PAGE_SIZE / sizeof(struct page*) maximum of pages.
+ *
+ * The IOVA bitmap is usually located on what tracks DMA mapped ranges or
+ * some form of IOVA range tracking that co-relates to the user passed
+ * bitmap.
+ */
+struct iova_bitmap {
+	/* IOVA range representing the currently mapped bitmap data */
+	struct iova_bitmap_map mapped;
+
+	/* userspace address of the bitmap */
+	u64 __user *bitmap;
+
+	/* u64 index that @mapped points to */
+	unsigned long mapped_base_index;
+
+	/* how many u64 can we walk in total */
+	unsigned long mapped_total_index;
+
+	/* base IOVA of the whole bitmap */
+	unsigned long iova;
+
+	/* length of the IOVA range for the whole bitmap */
+	size_t length;
+};
+
+/*
+ * Converts a relative IOVA to a bitmap index.
+ * This function provides the index into the u64 array (bitmap::bitmap)
+ * for a given IOVA offset.
+ * Relative IOVA means relative to the bitmap::mapped base IOVA
+ * (stored in mapped::iova). All computations in this file are done using
+ * relative IOVAs and thus avoid an extra subtraction against mapped::iova.
+ * The user API iova_bitmap_set() always uses a regular absolute IOVAs.
+ */
+static unsigned long iova_bitmap_offset_to_index(struct iova_bitmap *bitmap,
+						 unsigned long iova)
+{
+	unsigned long pgsize = 1 << bitmap->mapped.pgshift;
+
+	return iova / (BITS_PER_TYPE(*bitmap->bitmap) * pgsize);
+}
+
+/*
+ * Converts a bitmap index to a *relative* IOVA.
+ */
+static unsigned long iova_bitmap_index_to_offset(struct iova_bitmap *bitmap,
+						 unsigned long index)
+{
+	unsigned long pgshift = bitmap->mapped.pgshift;
+
+	return (index * BITS_PER_TYPE(*bitmap->bitmap)) << pgshift;
+}
+
+/*
+ * Returns the base IOVA of the mapped range.
+ */
+static unsigned long iova_bitmap_mapped_iova(struct iova_bitmap *bitmap)
+{
+	unsigned long skip = bitmap->mapped_base_index;
+
+	return bitmap->iova + iova_bitmap_index_to_offset(bitmap, skip);
+}
+
+/*
+ * Pins the bitmap user pages for the current range window.
+ * This is internal to IOVA bitmap and called when advancing the
+ * index (@mapped_base_index) or allocating the bitmap.
+ */
+static int iova_bitmap_get(struct iova_bitmap *bitmap)
+{
+	struct iova_bitmap_map *mapped = &bitmap->mapped;
+	unsigned long npages;
+	u64 __user *addr;
+	long ret;
+
+	/*
+	 * @mapped_base_index is the index of the currently mapped u64 words
+	 * that we have access. Anything before @mapped_base_index is not
+	 * mapped. The range @mapped_base_index .. @mapped_total_index-1 is
+	 * mapped but capped at a maximum number of pages.
+	 */
+	npages = DIV_ROUND_UP((bitmap->mapped_total_index -
+			       bitmap->mapped_base_index) *
+			       sizeof(*bitmap->bitmap), PAGE_SIZE);
+
+	/*
+	 * We always cap at max number of 'struct page' a base page can fit.
+	 * This is, for example, on x86 means 2M of bitmap data max.
+	 */
+	npages = min(npages,  PAGE_SIZE / sizeof(struct page *));
+
+	/*
+	 * Bitmap address to be pinned is calculated via pointer arithmetic
+	 * with bitmap u64 word index.
+	 */
+	addr = bitmap->bitmap + bitmap->mapped_base_index;
+
+	ret = pin_user_pages_fast((unsigned long)addr, npages,
+				  FOLL_WRITE, mapped->pages);
+	if (ret <= 0)
+		return -EFAULT;
+
+	mapped->npages = (unsigned long)ret;
+	/* Base IOVA where @pages point to i.e. bit 0 of the first page */
+	mapped->iova = iova_bitmap_mapped_iova(bitmap);
+
+	/*
+	 * offset of the page where pinned pages bit 0 is located.
+	 * This handles the case where the bitmap is not PAGE_SIZE
+	 * aligned.
+	 */
+	mapped->pgoff = offset_in_page(addr);
+	return 0;
+}
+
+/*
+ * Unpins the bitmap user pages and clears @npages
+ * (un)pinning is abstracted from API user and it's done when advancing
+ * the index or freeing the bitmap.
+ */
+static void iova_bitmap_put(struct iova_bitmap *bitmap)
+{
+	struct iova_bitmap_map *mapped = &bitmap->mapped;
+
+	if (mapped->npages) {
+		unpin_user_pages(mapped->pages, mapped->npages);
+		mapped->npages = 0;
+	}
+}
+
+/**
+ * iova_bitmap_alloc() - Allocates an IOVA bitmap object
+ * @iova: Start address of the IOVA range
+ * @length: Length of the IOVA range
+ * @page_size: Page size of the IOVA bitmap. It defines what each bit
+ *             granularity represents
+ * @data: Userspace address of the bitmap
+ *
+ * Allocates an IOVA object and initializes all its fields including the
+ * first user pages of @data.
+ *
+ * Return: A pointer to a newly allocated struct iova_bitmap
+ * or ERR_PTR() on error.
+ */
+struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length,
+				      unsigned long page_size, u64 __user *data)
+{
+	struct iova_bitmap_map *mapped;
+	struct iova_bitmap *bitmap;
+	int rc;
+
+	bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
+	if (!bitmap)
+		return ERR_PTR(-ENOMEM);
+
+	mapped = &bitmap->mapped;
+	mapped->pgshift = __ffs(page_size);
+	bitmap->bitmap = data;
+	bitmap->mapped_total_index =
+		iova_bitmap_offset_to_index(bitmap, length - 1) + 1;
+	bitmap->iova = iova;
+	bitmap->length = length;
+	mapped->iova = iova;
+	mapped->pages = (struct page **)__get_free_page(GFP_KERNEL);
+	if (!mapped->pages) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	rc = iova_bitmap_get(bitmap);
+	if (rc)
+		goto err;
+	return bitmap;
+
+err:
+	iova_bitmap_free(bitmap);
+	return ERR_PTR(rc);
+}
+
+/**
+ * iova_bitmap_free() - Frees an IOVA bitmap object
+ * @bitmap: IOVA bitmap to free
+ *
+ * It unpins and releases pages array memory and clears any leftover
+ * state.
+ */
+void iova_bitmap_free(struct iova_bitmap *bitmap)
+{
+	struct iova_bitmap_map *mapped = &bitmap->mapped;
+
+	iova_bitmap_put(bitmap);
+
+	if (mapped->pages) {
+		free_page((unsigned long)mapped->pages);
+		mapped->pages = NULL;
+	}
+
+	kfree(bitmap);
+}
+
+/*
+ * Returns the remaining bitmap indexes from mapped_total_index to process for
+ * the currently pinned bitmap pages.
+ */
+static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap)
+{
+	unsigned long remaining, bytes;
+
+	bytes = (bitmap->mapped.npages << PAGE_SHIFT) - bitmap->mapped.pgoff;
+
+	remaining = bitmap->mapped_total_index - bitmap->mapped_base_index;
+	remaining = min_t(unsigned long, remaining,
+			  bytes / sizeof(*bitmap->bitmap));
+
+	return remaining;
+}
+
+/*
+ * Returns the length of the mapped IOVA range.
+ */
+static unsigned long iova_bitmap_mapped_length(struct iova_bitmap *bitmap)
+{
+	unsigned long max_iova = bitmap->iova + bitmap->length - 1;
+	unsigned long iova = iova_bitmap_mapped_iova(bitmap);
+	unsigned long remaining;
+
+	/*
+	 * iova_bitmap_mapped_remaining() returns a number of indexes which
+	 * when converted to IOVA gives us a max length that the bitmap
+	 * pinned data can cover. Afterwards, that is capped to
+	 * only cover the IOVA range in @bitmap::iova .. @bitmap::length.
+	 */
+	remaining = iova_bitmap_index_to_offset(bitmap,
+			iova_bitmap_mapped_remaining(bitmap));
+
+	if (iova + remaining - 1 > max_iova)
+		remaining -= ((iova + remaining - 1) - max_iova);
+
+	return remaining;
+}
+
+/*
+ * Returns true if there's not more data to iterate.
+ */
+static bool iova_bitmap_done(struct iova_bitmap *bitmap)
+{
+	return bitmap->mapped_base_index >= bitmap->mapped_total_index;
+}
+
+/*
+ * Advances to the next range, releases the current pinned
+ * pages and pins the next set of bitmap pages.
+ * Returns 0 on success or otherwise errno.
+ */
+static int iova_bitmap_advance(struct iova_bitmap *bitmap)
+{
+	unsigned long iova = iova_bitmap_mapped_length(bitmap) - 1;
+	unsigned long count = iova_bitmap_offset_to_index(bitmap, iova) + 1;
+
+	bitmap->mapped_base_index += count;
+
+	iova_bitmap_put(bitmap);
+	if (iova_bitmap_done(bitmap))
+		return 0;
+
+	/* When advancing the index we pin the next set of bitmap pages */
+	return iova_bitmap_get(bitmap);
+}
+
+/**
+ * iova_bitmap_for_each() - Iterates over the bitmap
+ * @bitmap: IOVA bitmap to iterate
+ * @opaque: Additional argument to pass to the callback
+ * @fn: Function that gets called for each IOVA range
+ *
+ * Helper function to iterate over bitmap data representing a portion of IOVA
+ * space. It hides the complexity of iterating bitmaps and translating the
+ * mapped bitmap user pages into IOVA ranges to process.
+ *
+ * Return: 0 on success, and an error on failure either upon
+ * iteration or when the callback returns an error.
+ */
+int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
+			 iova_bitmap_fn_t fn)
+{
+	int ret = 0;
+
+	for (; !iova_bitmap_done(bitmap) && !ret;
+	     ret = iova_bitmap_advance(bitmap)) {
+		ret = fn(bitmap, iova_bitmap_mapped_iova(bitmap),
+			 iova_bitmap_mapped_length(bitmap), opaque);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+/**
+ * iova_bitmap_set() - Records an IOVA range in bitmap
+ * @bitmap: IOVA bitmap
+ * @iova: IOVA to start
+ * @length: IOVA range length
+ *
+ * Set the bits corresponding to the range [iova .. iova+length-1] in
+ * the user bitmap.
+ *
+ */
+void iova_bitmap_set(struct iova_bitmap *bitmap,
+		     unsigned long iova, size_t length)
+{
+	struct iova_bitmap_map *mapped = &bitmap->mapped;
+	unsigned long cur_bit = ((iova - mapped->iova) >>
+			mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE;
+	unsigned long last_bit = (((iova + length - 1) - mapped->iova) >>
+			mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE;
+
+	do {
+		unsigned int page_idx = cur_bit / BITS_PER_PAGE;
+		unsigned int offset = cur_bit % BITS_PER_PAGE;
+		unsigned int nbits = min(BITS_PER_PAGE - offset,
+					 last_bit - cur_bit + 1);
+		void *kaddr;
+
+		kaddr = kmap_local_page(mapped->pages[page_idx]);
+		bitmap_set(kaddr, offset, nbits);
+		kunmap_local(kaddr);
+		cur_bit += nbits;
+	} while (cur_bit <= last_bit);
+}
+EXPORT_SYMBOL_GPL(iova_bitmap_set);
diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig
new file mode 100644
index 0000000000..e5fb84e079
--- /dev/null
+++ b/drivers/vfio/mdev/Kconfig
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config VFIO_MDEV
+	tristate
diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile
new file mode 100644
index 0000000000..7c236ba1b9
--- /dev/null
+++ b/drivers/vfio/mdev/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o
+
+obj-$(CONFIG_VFIO_MDEV) += mdev.o
diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
new file mode 100644
index 0000000000..ed4737de45
--- /dev/null
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -0,0 +1,275 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Mediated device Core Driver
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *             Kirti Wankhede <kwankhede@nvidia.com>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/mdev.h>
+
+#include "mdev_private.h"
+
+#define DRIVER_VERSION		"0.1"
+#define DRIVER_AUTHOR		"NVIDIA Corporation"
+#define DRIVER_DESC		"Mediated device Core Driver"
+
+static struct class_compat *mdev_bus_compat_class;
+
+static LIST_HEAD(mdev_list);
+static DEFINE_MUTEX(mdev_list_lock);
+
+/* Caller must hold parent unreg_sem read or write lock */
+static void mdev_device_remove_common(struct mdev_device *mdev)
+{
+	struct mdev_parent *parent = mdev->type->parent;
+
+	mdev_remove_sysfs_files(mdev);
+	device_del(&mdev->dev);
+	lockdep_assert_held(&parent->unreg_sem);
+	/* Balances with device_initialize() */
+	put_device(&mdev->dev);
+}
+
+static int mdev_device_remove_cb(struct device *dev, void *data)
+{
+	if (dev->bus == &mdev_bus_type)
+		mdev_device_remove_common(to_mdev_device(dev));
+	return 0;
+}
+
+/*
+ * mdev_register_parent: Register a device as parent for mdevs
+ * @parent: parent structure registered
+ * @dev: device structure representing parent device.
+ * @mdev_driver: Device driver to bind to the newly created mdev
+ * @types: Array of supported mdev types
+ * @nr_types: Number of entries in @types
+ *
+ * Registers the @parent stucture as a parent for mdev types and thus mdev
+ * devices.  The caller needs to hold a reference on @dev that must not be
+ * released until after the call to mdev_unregister_parent().
+ *
+ * Returns a negative value on error, otherwise 0.
+ */
+int mdev_register_parent(struct mdev_parent *parent, struct device *dev,
+		struct mdev_driver *mdev_driver, struct mdev_type **types,
+		unsigned int nr_types)
+{
+	char *env_string = "MDEV_STATE=registered";
+	char *envp[] = { env_string, NULL };
+	int ret;
+
+	memset(parent, 0, sizeof(*parent));
+	init_rwsem(&parent->unreg_sem);
+	parent->dev = dev;
+	parent->mdev_driver = mdev_driver;
+	parent->types = types;
+	parent->nr_types = nr_types;
+	atomic_set(&parent->available_instances, mdev_driver->max_instances);
+
+	ret = parent_create_sysfs_files(parent);
+	if (ret)
+		return ret;
+
+	ret = class_compat_create_link(mdev_bus_compat_class, dev, NULL);
+	if (ret)
+		dev_warn(dev, "Failed to create compatibility class link\n");
+
+	dev_info(dev, "MDEV: Registered\n");
+	kobject_uevent_env(&dev->kobj, KOBJ_CHANGE, envp);
+	return 0;
+}
+EXPORT_SYMBOL(mdev_register_parent);
+
+/*
+ * mdev_unregister_parent : Unregister a parent device
+ * @parent: parent structure to unregister
+ */
+void mdev_unregister_parent(struct mdev_parent *parent)
+{
+	char *env_string = "MDEV_STATE=unregistered";
+	char *envp[] = { env_string, NULL };
+
+	dev_info(parent->dev, "MDEV: Unregistering\n");
+
+	down_write(&parent->unreg_sem);
+	class_compat_remove_link(mdev_bus_compat_class, parent->dev, NULL);
+	device_for_each_child(parent->dev, NULL, mdev_device_remove_cb);
+	parent_remove_sysfs_files(parent);
+	up_write(&parent->unreg_sem);
+
+	kobject_uevent_env(&parent->dev->kobj, KOBJ_CHANGE, envp);
+}
+EXPORT_SYMBOL(mdev_unregister_parent);
+
+static void mdev_device_release(struct device *dev)
+{
+	struct mdev_device *mdev = to_mdev_device(dev);
+	struct mdev_parent *parent = mdev->type->parent;
+
+	mutex_lock(&mdev_list_lock);
+	list_del(&mdev->next);
+	if (!parent->mdev_driver->get_available)
+		atomic_inc(&parent->available_instances);
+	mutex_unlock(&mdev_list_lock);
+
+	/* Pairs with the get in mdev_device_create() */
+	kobject_put(&mdev->type->kobj);
+
+	dev_dbg(&mdev->dev, "MDEV: destroying\n");
+	kfree(mdev);
+}
+
+int mdev_device_create(struct mdev_type *type, const guid_t *uuid)
+{
+	int ret;
+	struct mdev_device *mdev, *tmp;
+	struct mdev_parent *parent = type->parent;
+	struct mdev_driver *drv = parent->mdev_driver;
+
+	mutex_lock(&mdev_list_lock);
+
+	/* Check for duplicate */
+	list_for_each_entry(tmp, &mdev_list, next) {
+		if (guid_equal(&tmp->uuid, uuid)) {
+			mutex_unlock(&mdev_list_lock);
+			return -EEXIST;
+		}
+	}
+
+	if (!drv->get_available) {
+		/*
+		 * Note: that non-atomic read and dec is fine here because
+		 * all modifications are under mdev_list_lock.
+		 */
+		if (!atomic_read(&parent->available_instances)) {
+			mutex_unlock(&mdev_list_lock);
+			return -EUSERS;
+		}
+		atomic_dec(&parent->available_instances);
+	}
+
+	mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
+	if (!mdev) {
+		mutex_unlock(&mdev_list_lock);
+		return -ENOMEM;
+	}
+
+	device_initialize(&mdev->dev);
+	mdev->dev.parent  = parent->dev;
+	mdev->dev.bus = &mdev_bus_type;
+	mdev->dev.release = mdev_device_release;
+	mdev->dev.groups = mdev_device_groups;
+	mdev->type = type;
+	/* Pairs with the put in mdev_device_release() */
+	kobject_get(&type->kobj);
+
+	guid_copy(&mdev->uuid, uuid);
+	list_add(&mdev->next, &mdev_list);
+	mutex_unlock(&mdev_list_lock);
+
+	ret = dev_set_name(&mdev->dev, "%pUl", uuid);
+	if (ret)
+		goto out_put_device;
+
+	/* Check if parent unregistration has started */
+	if (!down_read_trylock(&parent->unreg_sem)) {
+		ret = -ENODEV;
+		goto out_put_device;
+	}
+
+	ret = device_add(&mdev->dev);
+	if (ret)
+		goto out_unlock;
+
+	ret = device_driver_attach(&drv->driver, &mdev->dev);
+	if (ret)
+		goto out_del;
+
+	ret = mdev_create_sysfs_files(mdev);
+	if (ret)
+		goto out_del;
+
+	mdev->active = true;
+	dev_dbg(&mdev->dev, "MDEV: created\n");
+	up_read(&parent->unreg_sem);
+
+	return 0;
+
+out_del:
+	device_del(&mdev->dev);
+out_unlock:
+	up_read(&parent->unreg_sem);
+out_put_device:
+	put_device(&mdev->dev);
+	return ret;
+}
+
+int mdev_device_remove(struct mdev_device *mdev)
+{
+	struct mdev_device *tmp;
+	struct mdev_parent *parent = mdev->type->parent;
+
+	mutex_lock(&mdev_list_lock);
+	list_for_each_entry(tmp, &mdev_list, next) {
+		if (tmp == mdev)
+			break;
+	}
+
+	if (tmp != mdev) {
+		mutex_unlock(&mdev_list_lock);
+		return -ENODEV;
+	}
+
+	if (!mdev->active) {
+		mutex_unlock(&mdev_list_lock);
+		return -EAGAIN;
+	}
+
+	mdev->active = false;
+	mutex_unlock(&mdev_list_lock);
+
+	/* Check if parent unregistration has started */
+	if (!down_read_trylock(&parent->unreg_sem))
+		return -ENODEV;
+
+	mdev_device_remove_common(mdev);
+	up_read(&parent->unreg_sem);
+	return 0;
+}
+
+static int __init mdev_init(void)
+{
+	int ret;
+
+	ret = bus_register(&mdev_bus_type);
+	if (ret)
+		return ret;
+
+	mdev_bus_compat_class = class_compat_register("mdev_bus");
+	if (!mdev_bus_compat_class) {
+		bus_unregister(&mdev_bus_type);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void __exit mdev_exit(void)
+{
+	class_compat_unregister(mdev_bus_compat_class);
+	bus_unregister(&mdev_bus_type);
+}
+
+subsys_initcall(mdev_init)
+module_exit(mdev_exit)
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c
new file mode 100644
index 0000000000..7825d83a55
--- /dev/null
+++ b/drivers/vfio/mdev/mdev_driver.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * MDEV driver
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *             Kirti Wankhede <kwankhede@nvidia.com>
+ */
+
+#include <linux/iommu.h>
+#include <linux/mdev.h>
+
+#include "mdev_private.h"
+
+static int mdev_probe(struct device *dev)
+{
+	struct mdev_driver *drv =
+		container_of(dev->driver, struct mdev_driver, driver);
+
+	if (!drv->probe)
+		return 0;
+	return drv->probe(to_mdev_device(dev));
+}
+
+static void mdev_remove(struct device *dev)
+{
+	struct mdev_driver *drv =
+		container_of(dev->driver, struct mdev_driver, driver);
+
+	if (drv->remove)
+		drv->remove(to_mdev_device(dev));
+}
+
+static int mdev_match(struct device *dev, struct device_driver *drv)
+{
+	/*
+	 * No drivers automatically match. Drivers are only bound by explicit
+	 * device_driver_attach()
+	 */
+	return 0;
+}
+
+struct bus_type mdev_bus_type = {
+	.name		= "mdev",
+	.probe		= mdev_probe,
+	.remove		= mdev_remove,
+	.match		= mdev_match,
+};
+
+/**
+ * mdev_register_driver - register a new MDEV driver
+ * @drv: the driver to register
+ *
+ * Returns a negative value on error, otherwise 0.
+ **/
+int mdev_register_driver(struct mdev_driver *drv)
+{
+	if (!drv->device_api)
+		return -EINVAL;
+
+	/* initialize common driver fields */
+	drv->driver.bus = &mdev_bus_type;
+	return driver_register(&drv->driver);
+}
+EXPORT_SYMBOL(mdev_register_driver);
+
+/*
+ * mdev_unregister_driver - unregister MDEV driver
+ * @drv: the driver to unregister
+ */
+void mdev_unregister_driver(struct mdev_driver *drv)
+{
+	driver_unregister(&drv->driver);
+}
+EXPORT_SYMBOL(mdev_unregister_driver);
diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
new file mode 100644
index 0000000000..af457b27f6
--- /dev/null
+++ b/drivers/vfio/mdev/mdev_private.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Mediated device internal definitions
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *             Kirti Wankhede <kwankhede@nvidia.com>
+ */
+
+#ifndef MDEV_PRIVATE_H
+#define MDEV_PRIVATE_H
+
+int  mdev_bus_register(void);
+void mdev_bus_unregister(void);
+
+extern struct bus_type mdev_bus_type;
+extern const struct attribute_group *mdev_device_groups[];
+
+#define to_mdev_type_attr(_attr)	\
+	container_of(_attr, struct mdev_type_attribute, attr)
+#define to_mdev_type(_kobj)		\
+	container_of(_kobj, struct mdev_type, kobj)
+
+int  parent_create_sysfs_files(struct mdev_parent *parent);
+void parent_remove_sysfs_files(struct mdev_parent *parent);
+
+int  mdev_create_sysfs_files(struct mdev_device *mdev);
+void mdev_remove_sysfs_files(struct mdev_device *mdev);
+
+int mdev_device_create(struct mdev_type *kobj, const guid_t *uuid);
+int  mdev_device_remove(struct mdev_device *dev);
+
+#endif /* MDEV_PRIVATE_H */
diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c
new file mode 100644
index 0000000000..9d2738e10c
--- /dev/null
+++ b/drivers/vfio/mdev/mdev_sysfs.c
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * File attributes for Mediated devices
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *             Kirti Wankhede <kwankhede@nvidia.com>
+ */
+
+#include <linux/sysfs.h>
+#include <linux/ctype.h>
+#include <linux/slab.h>
+#include <linux/mdev.h>
+
+#include "mdev_private.h"
+
+struct mdev_type_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct mdev_type *mtype,
+			struct mdev_type_attribute *attr, char *buf);
+	ssize_t (*store)(struct mdev_type *mtype,
+			 struct mdev_type_attribute *attr, const char *buf,
+			 size_t count);
+};
+
+#define MDEV_TYPE_ATTR_RO(_name) \
+	struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RO(_name)
+#define MDEV_TYPE_ATTR_WO(_name) \
+	struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_WO(_name)
+
+static ssize_t mdev_type_attr_show(struct kobject *kobj,
+				     struct attribute *__attr, char *buf)
+{
+	struct mdev_type_attribute *attr = to_mdev_type_attr(__attr);
+	struct mdev_type *type = to_mdev_type(kobj);
+	ssize_t ret = -EIO;
+
+	if (attr->show)
+		ret = attr->show(type, attr, buf);
+	return ret;
+}
+
+static ssize_t mdev_type_attr_store(struct kobject *kobj,
+				      struct attribute *__attr,
+				      const char *buf, size_t count)
+{
+	struct mdev_type_attribute *attr = to_mdev_type_attr(__attr);
+	struct mdev_type *type = to_mdev_type(kobj);
+	ssize_t ret = -EIO;
+
+	if (attr->store)
+		ret = attr->store(type, attr, buf, count);
+	return ret;
+}
+
+static const struct sysfs_ops mdev_type_sysfs_ops = {
+	.show = mdev_type_attr_show,
+	.store = mdev_type_attr_store,
+};
+
+static ssize_t create_store(struct mdev_type *mtype,
+			    struct mdev_type_attribute *attr, const char *buf,
+			    size_t count)
+{
+	char *str;
+	guid_t uuid;
+	int ret;
+
+	if ((count < UUID_STRING_LEN) || (count > UUID_STRING_LEN + 1))
+		return -EINVAL;
+
+	str = kstrndup(buf, count, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+
+	ret = guid_parse(str, &uuid);
+	kfree(str);
+	if (ret)
+		return ret;
+
+	ret = mdev_device_create(mtype, &uuid);
+	if (ret)
+		return ret;
+
+	return count;
+}
+static MDEV_TYPE_ATTR_WO(create);
+
+static ssize_t device_api_show(struct mdev_type *mtype,
+			       struct mdev_type_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%s\n", mtype->parent->mdev_driver->device_api);
+}
+static MDEV_TYPE_ATTR_RO(device_api);
+
+static ssize_t name_show(struct mdev_type *mtype,
+			 struct mdev_type_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%s\n",
+		mtype->pretty_name ? mtype->pretty_name : mtype->sysfs_name);
+}
+
+static MDEV_TYPE_ATTR_RO(name);
+
+static ssize_t available_instances_show(struct mdev_type *mtype,
+					struct mdev_type_attribute *attr,
+					char *buf)
+{
+	struct mdev_driver *drv = mtype->parent->mdev_driver;
+
+	if (drv->get_available)
+		return sysfs_emit(buf, "%u\n", drv->get_available(mtype));
+	return sysfs_emit(buf, "%u\n",
+			  atomic_read(&mtype->parent->available_instances));
+}
+static MDEV_TYPE_ATTR_RO(available_instances);
+
+static ssize_t description_show(struct mdev_type *mtype,
+				struct mdev_type_attribute *attr,
+				char *buf)
+{
+	return mtype->parent->mdev_driver->show_description(mtype, buf);
+}
+static MDEV_TYPE_ATTR_RO(description);
+
+static struct attribute *mdev_types_core_attrs[] = {
+	&mdev_type_attr_create.attr,
+	&mdev_type_attr_device_api.attr,
+	&mdev_type_attr_name.attr,
+	&mdev_type_attr_available_instances.attr,
+	&mdev_type_attr_description.attr,
+	NULL,
+};
+
+static umode_t mdev_types_core_is_visible(struct kobject *kobj,
+					  struct attribute *attr, int n)
+{
+	if (attr == &mdev_type_attr_description.attr &&
+	    !to_mdev_type(kobj)->parent->mdev_driver->show_description)
+		return 0;
+	return attr->mode;
+}
+
+static struct attribute_group mdev_type_core_group = {
+	.attrs = mdev_types_core_attrs,
+	.is_visible = mdev_types_core_is_visible,
+};
+
+static const struct attribute_group *mdev_type_groups[] = {
+	&mdev_type_core_group,
+	NULL,
+};
+
+static void mdev_type_release(struct kobject *kobj)
+{
+	struct mdev_type *type = to_mdev_type(kobj);
+
+	pr_debug("Releasing group %s\n", kobj->name);
+	/* Pairs with the get in add_mdev_supported_type() */
+	put_device(type->parent->dev);
+}
+
+static struct kobj_type mdev_type_ktype = {
+	.sysfs_ops	= &mdev_type_sysfs_ops,
+	.release	= mdev_type_release,
+	.default_groups	= mdev_type_groups,
+};
+
+static int mdev_type_add(struct mdev_parent *parent, struct mdev_type *type)
+{
+	int ret;
+
+	type->kobj.kset = parent->mdev_types_kset;
+	type->parent = parent;
+	/* Pairs with the put in mdev_type_release() */
+	get_device(parent->dev);
+
+	ret = kobject_init_and_add(&type->kobj, &mdev_type_ktype, NULL,
+				   "%s-%s", dev_driver_string(parent->dev),
+				   type->sysfs_name);
+	if (ret) {
+		kobject_put(&type->kobj);
+		return ret;
+	}
+
+	type->devices_kobj = kobject_create_and_add("devices", &type->kobj);
+	if (!type->devices_kobj) {
+		ret = -ENOMEM;
+		goto attr_devices_failed;
+	}
+
+	return 0;
+
+attr_devices_failed:
+	kobject_del(&type->kobj);
+	kobject_put(&type->kobj);
+	return ret;
+}
+
+static void mdev_type_remove(struct mdev_type *type)
+{
+	kobject_put(type->devices_kobj);
+	kobject_del(&type->kobj);
+	kobject_put(&type->kobj);
+}
+
+/* mdev sysfs functions */
+void parent_remove_sysfs_files(struct mdev_parent *parent)
+{
+	int i;
+
+	for (i = 0; i < parent->nr_types; i++)
+		mdev_type_remove(parent->types[i]);
+	kset_unregister(parent->mdev_types_kset);
+}
+
+int parent_create_sysfs_files(struct mdev_parent *parent)
+{
+	int ret, i;
+
+	parent->mdev_types_kset = kset_create_and_add("mdev_supported_types",
+					       NULL, &parent->dev->kobj);
+	if (!parent->mdev_types_kset)
+		return -ENOMEM;
+
+	for (i = 0; i < parent->nr_types; i++) {
+		ret = mdev_type_add(parent, parent->types[i]);
+		if (ret)
+			goto out_err;
+	}
+	return 0;
+
+out_err:
+	while (--i >= 0)
+		mdev_type_remove(parent->types[i]);
+	kset_unregister(parent->mdev_types_kset);
+	return ret;
+}
+
+static ssize_t remove_store(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	struct mdev_device *mdev = to_mdev_device(dev);
+	unsigned long val;
+
+	if (kstrtoul(buf, 0, &val) < 0)
+		return -EINVAL;
+
+	if (val && device_remove_file_self(dev, attr)) {
+		int ret;
+
+		ret = mdev_device_remove(mdev);
+		if (ret)
+			return ret;
+	}
+
+	return count;
+}
+
+static DEVICE_ATTR_WO(remove);
+
+static struct attribute *mdev_device_attrs[] = {
+	&dev_attr_remove.attr,
+	NULL,
+};
+
+static const struct attribute_group mdev_device_group = {
+	.attrs = mdev_device_attrs,
+};
+
+const struct attribute_group *mdev_device_groups[] = {
+	&mdev_device_group,
+	NULL
+};
+
+int mdev_create_sysfs_files(struct mdev_device *mdev)
+{
+	struct mdev_type *type = mdev->type;
+	struct kobject *kobj = &mdev->dev.kobj;
+	int ret;
+
+	ret = sysfs_create_link(type->devices_kobj, kobj, dev_name(&mdev->dev));
+	if (ret)
+		return ret;
+
+	ret = sysfs_create_link(kobj, &type->kobj, "mdev_type");
+	if (ret)
+		goto type_link_failed;
+	return ret;
+
+type_link_failed:
+	sysfs_remove_link(mdev->type->devices_kobj, dev_name(&mdev->dev));
+	return ret;
+}
+
+void mdev_remove_sysfs_files(struct mdev_device *mdev)
+{
+	struct kobject *kobj = &mdev->dev.kobj;
+
+	sysfs_remove_link(kobj, "mdev_type");
+	sysfs_remove_link(mdev->type->devices_kobj, dev_name(&mdev->dev));
+}
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
new file mode 100644
index 0000000000..8125e5f378
--- /dev/null
+++ b/drivers/vfio/pci/Kconfig
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menu "VFIO support for PCI devices"
+	depends on PCI && MMU
+
+config VFIO_PCI_CORE
+	tristate
+	select VFIO_VIRQFD
+	select IRQ_BYPASS_MANAGER
+
+config VFIO_PCI_MMAP
+	def_bool y if !S390
+	depends on VFIO_PCI_CORE
+
+config VFIO_PCI_INTX
+	def_bool y if !S390
+	depends on VFIO_PCI_CORE
+
+config VFIO_PCI
+	tristate "Generic VFIO support for any PCI device"
+	select VFIO_PCI_CORE
+	help
+	  Support for the generic PCI VFIO bus driver which can connect any
+	  PCI device to the VFIO framework.
+
+	  If you don't know what to do here, say N.
+
+if VFIO_PCI
+config VFIO_PCI_VGA
+	bool "Generic VFIO PCI support for VGA devices"
+	depends on X86 && VGA_ARB
+	help
+	  Support for VGA extension to VFIO PCI.  This exposes an additional
+	  region on VGA devices for accessing legacy VGA addresses used by
+	  BIOS and generic video drivers.
+
+	  If you don't know what to do here, say N.
+
+config VFIO_PCI_IGD
+	bool "Generic VFIO PCI extensions for Intel graphics (GVT-d)"
+	depends on X86
+	default y
+	help
+	  Support for Intel IGD specific extensions to enable direct
+	  assignment to virtual machines.  This includes exposing an IGD
+	  specific firmware table and read-only copies of the host bridge
+	  and LPC bridge config space.
+
+	  To enable Intel IGD assignment through vfio-pci, say Y.
+endif
+
+config VFIO_PCI_ZDEV_KVM
+	bool "VFIO PCI extensions for s390x KVM passthrough"
+	depends on S390 && KVM
+	default y
+	help
+	  Support s390x-specific extensions to enable support for enhancements
+	  to KVM passthrough capabilities, such as interpretive execution of
+	  zPCI instructions.
+
+	  To enable s390x KVM vfio-pci extensions, say Y.
+
+source "drivers/vfio/pci/mlx5/Kconfig"
+
+source "drivers/vfio/pci/hisilicon/Kconfig"
+
+source "drivers/vfio/pci/pds/Kconfig"
+
+endmenu
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
new file mode 100644
index 0000000000..45167be462
--- /dev/null
+++ b/drivers/vfio/pci/Makefile
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
+vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o
+obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
+
+vfio-pci-y := vfio_pci.o
+vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
+obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
+
+obj-$(CONFIG_MLX5_VFIO_PCI)           += mlx5/
+
+obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
+
+obj-$(CONFIG_PDS_VFIO_PCI) += pds/
diff --git a/drivers/vfio/pci/hisilicon/Kconfig b/drivers/vfio/pci/hisilicon/Kconfig
new file mode 100644
index 0000000000..cbf1c32f6e
--- /dev/null
+++ b/drivers/vfio/pci/hisilicon/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config HISI_ACC_VFIO_PCI
+	tristate "VFIO support for HiSilicon ACC PCI devices"
+	depends on ARM64 || (COMPILE_TEST && 64BIT)
+	depends on PCI_MSI
+	depends on CRYPTO_DEV_HISI_QM
+	depends on CRYPTO_DEV_HISI_HPRE
+	depends on CRYPTO_DEV_HISI_SEC2
+	depends on CRYPTO_DEV_HISI_ZIP
+	select VFIO_PCI_CORE
+	help
+	  This provides generic PCI support for HiSilicon ACC devices
+	  using the VFIO framework.
+
+	  If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/hisilicon/Makefile b/drivers/vfio/pci/hisilicon/Makefile
new file mode 100644
index 0000000000..c66b3783f2
--- /dev/null
+++ b/drivers/vfio/pci/hisilicon/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisi-acc-vfio-pci.o
+hisi-acc-vfio-pci-y := hisi_acc_vfio_pci.o
+
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
new file mode 100644
index 0000000000..4d27465c8f
--- /dev/null
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -0,0 +1,1470 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021, HiSilicon Ltd.
+ */
+
+#include <linux/device.h>
+#include <linux/eventfd.h>
+#include <linux/file.h>
+#include <linux/hisi_acc_qm.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/vfio.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/anon_inodes.h>
+
+#include "hisi_acc_vfio_pci.h"
+
+/* Return 0 on VM acc device ready, -ETIMEDOUT hardware timeout */
+static int qm_wait_dev_not_ready(struct hisi_qm *qm)
+{
+	u32 val;
+
+	return readl_relaxed_poll_timeout(qm->io_base + QM_VF_STATE,
+				val, !(val & 0x1), MB_POLL_PERIOD_US,
+				MB_POLL_TIMEOUT_US);
+}
+
+/*
+ * Each state Reg is checked 100 times,
+ * with a delay of 100 microseconds after each check
+ */
+static u32 qm_check_reg_state(struct hisi_qm *qm, u32 regs)
+{
+	int check_times = 0;
+	u32 state;
+
+	state = readl(qm->io_base + regs);
+	while (state && check_times < ERROR_CHECK_TIMEOUT) {
+		udelay(CHECK_DELAY_TIME);
+		state = readl(qm->io_base + regs);
+		check_times++;
+	}
+
+	return state;
+}
+
+static int qm_read_regs(struct hisi_qm *qm, u32 reg_addr,
+			u32 *data, u8 nums)
+{
+	int i;
+
+	if (nums < 1 || nums > QM_REGS_MAX_LEN)
+		return -EINVAL;
+
+	for (i = 0; i < nums; i++) {
+		data[i] = readl(qm->io_base + reg_addr);
+		reg_addr += QM_REG_ADDR_OFFSET;
+	}
+
+	return 0;
+}
+
+static int qm_write_regs(struct hisi_qm *qm, u32 reg,
+			 u32 *data, u8 nums)
+{
+	int i;
+
+	if (nums < 1 || nums > QM_REGS_MAX_LEN)
+		return -EINVAL;
+
+	for (i = 0; i < nums; i++)
+		writel(data[i], qm->io_base + reg + i * QM_REG_ADDR_OFFSET);
+
+	return 0;
+}
+
+static int qm_get_vft(struct hisi_qm *qm, u32 *base)
+{
+	u64 sqc_vft;
+	u32 qp_num;
+	int ret;
+
+	ret = hisi_qm_mb(qm, QM_MB_CMD_SQC_VFT_V2, 0, 0, 1);
+	if (ret)
+		return ret;
+
+	sqc_vft = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) |
+		  ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) <<
+		  QM_XQC_ADDR_OFFSET);
+	*base = QM_SQC_VFT_BASE_MASK_V2 & (sqc_vft >> QM_SQC_VFT_BASE_SHIFT_V2);
+	qp_num = (QM_SQC_VFT_NUM_MASK_V2 &
+		  (sqc_vft >> QM_SQC_VFT_NUM_SHIFT_V2)) + 1;
+
+	return qp_num;
+}
+
+static int qm_get_sqc(struct hisi_qm *qm, u64 *addr)
+{
+	int ret;
+
+	ret = hisi_qm_mb(qm, QM_MB_CMD_SQC_BT, 0, 0, 1);
+	if (ret)
+		return ret;
+
+	*addr = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) |
+		  ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) <<
+		  QM_XQC_ADDR_OFFSET);
+
+	return 0;
+}
+
+static int qm_get_cqc(struct hisi_qm *qm, u64 *addr)
+{
+	int ret;
+
+	ret = hisi_qm_mb(qm, QM_MB_CMD_CQC_BT, 0, 0, 1);
+	if (ret)
+		return ret;
+
+	*addr = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) |
+		  ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) <<
+		  QM_XQC_ADDR_OFFSET);
+
+	return 0;
+}
+
+static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
+{
+	struct device *dev = &qm->pdev->dev;
+	int ret;
+
+	ret = qm_read_regs(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1);
+	if (ret) {
+		dev_err(dev, "failed to read QM_VF_AEQ_INT_MASK\n");
+		return ret;
+	}
+
+	ret = qm_read_regs(qm, QM_VF_EQ_INT_MASK, &vf_data->eq_int_mask, 1);
+	if (ret) {
+		dev_err(dev, "failed to read QM_VF_EQ_INT_MASK\n");
+		return ret;
+	}
+
+	ret = qm_read_regs(qm, QM_IFC_INT_SOURCE_V,
+			   &vf_data->ifc_int_source, 1);
+	if (ret) {
+		dev_err(dev, "failed to read QM_IFC_INT_SOURCE_V\n");
+		return ret;
+	}
+
+	ret = qm_read_regs(qm, QM_IFC_INT_MASK, &vf_data->ifc_int_mask, 1);
+	if (ret) {
+		dev_err(dev, "failed to read QM_IFC_INT_MASK\n");
+		return ret;
+	}
+
+	ret = qm_read_regs(qm, QM_IFC_INT_SET_V, &vf_data->ifc_int_set, 1);
+	if (ret) {
+		dev_err(dev, "failed to read QM_IFC_INT_SET_V\n");
+		return ret;
+	}
+
+	ret = qm_read_regs(qm, QM_PAGE_SIZE, &vf_data->page_size, 1);
+	if (ret) {
+		dev_err(dev, "failed to read QM_PAGE_SIZE\n");
+		return ret;
+	}
+
+	/* QM_EQC_DW has 7 regs */
+	ret = qm_read_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7);
+	if (ret) {
+		dev_err(dev, "failed to read QM_EQC_DW\n");
+		return ret;
+	}
+
+	/* QM_AEQC_DW has 7 regs */
+	ret = qm_read_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7);
+	if (ret) {
+		dev_err(dev, "failed to read QM_AEQC_DW\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
+{
+	struct device *dev = &qm->pdev->dev;
+	int ret;
+
+	/* Check VF state */
+	if (unlikely(hisi_qm_wait_mb_ready(qm))) {
+		dev_err(&qm->pdev->dev, "QM device is not ready to write\n");
+		return -EBUSY;
+	}
+
+	ret = qm_write_regs(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1);
+	if (ret) {
+		dev_err(dev, "failed to write QM_VF_AEQ_INT_MASK\n");
+		return ret;
+	}
+
+	ret = qm_write_regs(qm, QM_VF_EQ_INT_MASK, &vf_data->eq_int_mask, 1);
+	if (ret) {
+		dev_err(dev, "failed to write QM_VF_EQ_INT_MASK\n");
+		return ret;
+	}
+
+	ret = qm_write_regs(qm, QM_IFC_INT_SOURCE_V,
+			    &vf_data->ifc_int_source, 1);
+	if (ret) {
+		dev_err(dev, "failed to write QM_IFC_INT_SOURCE_V\n");
+		return ret;
+	}
+
+	ret = qm_write_regs(qm, QM_IFC_INT_MASK, &vf_data->ifc_int_mask, 1);
+	if (ret) {
+		dev_err(dev, "failed to write QM_IFC_INT_MASK\n");
+		return ret;
+	}
+
+	ret = qm_write_regs(qm, QM_IFC_INT_SET_V, &vf_data->ifc_int_set, 1);
+	if (ret) {
+		dev_err(dev, "failed to write QM_IFC_INT_SET_V\n");
+		return ret;
+	}
+
+	ret = qm_write_regs(qm, QM_QUE_ISO_CFG_V, &vf_data->que_iso_cfg, 1);
+	if (ret) {
+		dev_err(dev, "failed to write QM_QUE_ISO_CFG_V\n");
+		return ret;
+	}
+
+	ret = qm_write_regs(qm, QM_PAGE_SIZE, &vf_data->page_size, 1);
+	if (ret) {
+		dev_err(dev, "failed to write QM_PAGE_SIZE\n");
+		return ret;
+	}
+
+	/* QM_EQC_DW has 7 regs */
+	ret = qm_write_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7);
+	if (ret) {
+		dev_err(dev, "failed to write QM_EQC_DW\n");
+		return ret;
+	}
+
+	/* QM_AEQC_DW has 7 regs */
+	ret = qm_write_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7);
+	if (ret) {
+		dev_err(dev, "failed to write QM_AEQC_DW\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static void qm_db(struct hisi_qm *qm, u16 qn, u8 cmd,
+		  u16 index, u8 priority)
+{
+	u64 doorbell;
+	u64 dbase;
+	u16 randata = 0;
+
+	if (cmd == QM_DOORBELL_CMD_SQ || cmd == QM_DOORBELL_CMD_CQ)
+		dbase = QM_DOORBELL_SQ_CQ_BASE_V2;
+	else
+		dbase = QM_DOORBELL_EQ_AEQ_BASE_V2;
+
+	doorbell = qn | ((u64)cmd << QM_DB_CMD_SHIFT_V2) |
+		   ((u64)randata << QM_DB_RAND_SHIFT_V2) |
+		   ((u64)index << QM_DB_INDEX_SHIFT_V2)	 |
+		   ((u64)priority << QM_DB_PRIORITY_SHIFT_V2);
+
+	writeq(doorbell, qm->io_base + dbase);
+}
+
+static int pf_qm_get_qp_num(struct hisi_qm *qm, int vf_id, u32 *rbase)
+{
+	unsigned int val;
+	u64 sqc_vft;
+	u32 qp_num;
+	int ret;
+
+	ret = readl_relaxed_poll_timeout(qm->io_base + QM_VFT_CFG_RDY, val,
+					 val & BIT(0), MB_POLL_PERIOD_US,
+					 MB_POLL_TIMEOUT_US);
+	if (ret)
+		return ret;
+
+	writel(0x1, qm->io_base + QM_VFT_CFG_OP_WR);
+	/* 0 mean SQC VFT */
+	writel(0x0, qm->io_base + QM_VFT_CFG_TYPE);
+	writel(vf_id, qm->io_base + QM_VFT_CFG);
+
+	writel(0x0, qm->io_base + QM_VFT_CFG_RDY);
+	writel(0x1, qm->io_base + QM_VFT_CFG_OP_ENABLE);
+
+	ret = readl_relaxed_poll_timeout(qm->io_base + QM_VFT_CFG_RDY, val,
+					 val & BIT(0), MB_POLL_PERIOD_US,
+					 MB_POLL_TIMEOUT_US);
+	if (ret)
+		return ret;
+
+	sqc_vft = readl(qm->io_base + QM_VFT_CFG_DATA_L) |
+		  ((u64)readl(qm->io_base + QM_VFT_CFG_DATA_H) <<
+		  QM_XQC_ADDR_OFFSET);
+	*rbase = QM_SQC_VFT_BASE_MASK_V2 &
+		  (sqc_vft >> QM_SQC_VFT_BASE_SHIFT_V2);
+	qp_num = (QM_SQC_VFT_NUM_MASK_V2 &
+		  (sqc_vft >> QM_SQC_VFT_NUM_SHIFT_V2)) + 1;
+
+	return qp_num;
+}
+
+static void qm_dev_cmd_init(struct hisi_qm *qm)
+{
+	/* Clear VF communication status registers. */
+	writel(0x1, qm->io_base + QM_IFC_INT_SOURCE_V);
+
+	/* Enable pf and vf communication. */
+	writel(0x0, qm->io_base + QM_IFC_INT_MASK);
+}
+
+static int vf_qm_cache_wb(struct hisi_qm *qm)
+{
+	unsigned int val;
+
+	writel(0x1, qm->io_base + QM_CACHE_WB_START);
+	if (readl_relaxed_poll_timeout(qm->io_base + QM_CACHE_WB_DONE,
+				       val, val & BIT(0), MB_POLL_PERIOD_US,
+				       MB_POLL_TIMEOUT_US)) {
+		dev_err(&qm->pdev->dev, "vf QM writeback sqc cache fail\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void vf_qm_fun_reset(struct hisi_qm *qm)
+{
+	int i;
+
+	for (i = 0; i < qm->qp_num; i++)
+		qm_db(qm, i, QM_DOORBELL_CMD_SQ, 0, 1);
+}
+
+static int vf_qm_func_stop(struct hisi_qm *qm)
+{
+	return hisi_qm_mb(qm, QM_MB_CMD_PAUSE_QM, 0, 0, 0);
+}
+
+static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+			     struct hisi_acc_vf_migration_file *migf)
+{
+	struct acc_vf_data *vf_data = &migf->vf_data;
+	struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+	struct hisi_qm *pf_qm = hisi_acc_vdev->pf_qm;
+	struct device *dev = &vf_qm->pdev->dev;
+	u32 que_iso_state;
+	int ret;
+
+	if (migf->total_length < QM_MATCH_SIZE || hisi_acc_vdev->match_done)
+		return 0;
+
+	if (vf_data->acc_magic != ACC_DEV_MAGIC) {
+		dev_err(dev, "failed to match ACC_DEV_MAGIC\n");
+		return -EINVAL;
+	}
+
+	if (vf_data->dev_id != hisi_acc_vdev->vf_dev->device) {
+		dev_err(dev, "failed to match VF devices\n");
+		return -EINVAL;
+	}
+
+	/* VF qp num check */
+	ret = qm_get_vft(vf_qm, &vf_qm->qp_base);
+	if (ret <= 0) {
+		dev_err(dev, "failed to get vft qp nums\n");
+		return -EINVAL;
+	}
+
+	if (ret != vf_data->qp_num) {
+		dev_err(dev, "failed to match VF qp num\n");
+		return -EINVAL;
+	}
+
+	vf_qm->qp_num = ret;
+
+	/* VF isolation state check */
+	ret = qm_read_regs(pf_qm, QM_QUE_ISO_CFG_V, &que_iso_state, 1);
+	if (ret) {
+		dev_err(dev, "failed to read QM_QUE_ISO_CFG_V\n");
+		return ret;
+	}
+
+	if (vf_data->que_iso_cfg != que_iso_state) {
+		dev_err(dev, "failed to match isolation state\n");
+		return -EINVAL;
+	}
+
+	ret = qm_write_regs(vf_qm, QM_VF_STATE, &vf_data->vf_qm_state, 1);
+	if (ret) {
+		dev_err(dev, "failed to write QM_VF_STATE\n");
+		return ret;
+	}
+
+	hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
+	hisi_acc_vdev->match_done = true;
+	return 0;
+}
+
+static int vf_qm_get_match_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+				struct acc_vf_data *vf_data)
+{
+	struct hisi_qm *pf_qm = hisi_acc_vdev->pf_qm;
+	struct device *dev = &pf_qm->pdev->dev;
+	int vf_id = hisi_acc_vdev->vf_id;
+	int ret;
+
+	vf_data->acc_magic = ACC_DEV_MAGIC;
+	/* Save device id */
+	vf_data->dev_id = hisi_acc_vdev->vf_dev->device;
+
+	/* VF qp num save from PF */
+	ret = pf_qm_get_qp_num(pf_qm, vf_id, &vf_data->qp_base);
+	if (ret <= 0) {
+		dev_err(dev, "failed to get vft qp nums!\n");
+		return -EINVAL;
+	}
+
+	vf_data->qp_num = ret;
+
+	/* VF isolation state save from PF */
+	ret = qm_read_regs(pf_qm, QM_QUE_ISO_CFG_V, &vf_data->que_iso_cfg, 1);
+	if (ret) {
+		dev_err(dev, "failed to read QM_QUE_ISO_CFG_V!\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+			   struct hisi_acc_vf_migration_file *migf)
+{
+	struct hisi_qm *qm = &hisi_acc_vdev->vf_qm;
+	struct device *dev = &qm->pdev->dev;
+	struct acc_vf_data *vf_data = &migf->vf_data;
+	int ret;
+
+	/* Return if only match data was transferred */
+	if (migf->total_length == QM_MATCH_SIZE)
+		return 0;
+
+	if (migf->total_length < sizeof(struct acc_vf_data))
+		return -EINVAL;
+
+	qm->eqe_dma = vf_data->eqe_dma;
+	qm->aeqe_dma = vf_data->aeqe_dma;
+	qm->sqc_dma = vf_data->sqc_dma;
+	qm->cqc_dma = vf_data->cqc_dma;
+
+	qm->qp_base = vf_data->qp_base;
+	qm->qp_num = vf_data->qp_num;
+
+	ret = qm_set_regs(qm, vf_data);
+	if (ret) {
+		dev_err(dev, "set VF regs failed\n");
+		return ret;
+	}
+
+	ret = hisi_qm_mb(qm, QM_MB_CMD_SQC_BT, qm->sqc_dma, 0, 0);
+	if (ret) {
+		dev_err(dev, "set sqc failed\n");
+		return ret;
+	}
+
+	ret = hisi_qm_mb(qm, QM_MB_CMD_CQC_BT, qm->cqc_dma, 0, 0);
+	if (ret) {
+		dev_err(dev, "set cqc failed\n");
+		return ret;
+	}
+
+	qm_dev_cmd_init(qm);
+	return 0;
+}
+
+static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+			    struct hisi_acc_vf_migration_file *migf)
+{
+	struct acc_vf_data *vf_data = &migf->vf_data;
+	struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+	struct device *dev = &vf_qm->pdev->dev;
+	int ret;
+
+	if (unlikely(qm_wait_dev_not_ready(vf_qm))) {
+		/* Update state and return with match data */
+		vf_data->vf_qm_state = QM_NOT_READY;
+		hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
+		migf->total_length = QM_MATCH_SIZE;
+		return 0;
+	}
+
+	vf_data->vf_qm_state = QM_READY;
+	hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
+
+	ret = vf_qm_cache_wb(vf_qm);
+	if (ret) {
+		dev_err(dev, "failed to writeback QM Cache!\n");
+		return ret;
+	}
+
+	ret = qm_get_regs(vf_qm, vf_data);
+	if (ret)
+		return -EINVAL;
+
+	/* Every reg is 32 bit, the dma address is 64 bit. */
+	vf_data->eqe_dma = vf_data->qm_eqc_dw[1];
+	vf_data->eqe_dma <<= QM_XQC_ADDR_OFFSET;
+	vf_data->eqe_dma |= vf_data->qm_eqc_dw[0];
+	vf_data->aeqe_dma = vf_data->qm_aeqc_dw[1];
+	vf_data->aeqe_dma <<= QM_XQC_ADDR_OFFSET;
+	vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[0];
+
+	/* Through SQC_BT/CQC_BT to get sqc and cqc address */
+	ret = qm_get_sqc(vf_qm, &vf_data->sqc_dma);
+	if (ret) {
+		dev_err(dev, "failed to read SQC addr!\n");
+		return -EINVAL;
+	}
+
+	ret = qm_get_cqc(vf_qm, &vf_data->cqc_dma);
+	if (ret) {
+		dev_err(dev, "failed to read CQC addr!\n");
+		return -EINVAL;
+	}
+
+	migf->total_length = sizeof(struct acc_vf_data);
+	return 0;
+}
+
+static struct hisi_acc_vf_core_device *hisi_acc_drvdata(struct pci_dev *pdev)
+{
+	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
+
+	return container_of(core_device, struct hisi_acc_vf_core_device,
+			    core_device);
+}
+
+/* Check the PF's RAS state and Function INT state */
+static int
+hisi_acc_check_int_state(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+	struct hisi_qm *vfqm = &hisi_acc_vdev->vf_qm;
+	struct hisi_qm *qm = hisi_acc_vdev->pf_qm;
+	struct pci_dev *vf_pdev = hisi_acc_vdev->vf_dev;
+	struct device *dev = &qm->pdev->dev;
+	u32 state;
+
+	/* Check RAS state */
+	state = qm_check_reg_state(qm, QM_ABNORMAL_INT_STATUS);
+	if (state) {
+		dev_err(dev, "failed to check QM RAS state!\n");
+		return -EBUSY;
+	}
+
+	/* Check Function Communication state between PF and VF */
+	state = qm_check_reg_state(vfqm, QM_IFC_INT_STATUS);
+	if (state) {
+		dev_err(dev, "failed to check QM IFC INT state!\n");
+		return -EBUSY;
+	}
+	state = qm_check_reg_state(vfqm, QM_IFC_INT_SET_V);
+	if (state) {
+		dev_err(dev, "failed to check QM IFC INT SET state!\n");
+		return -EBUSY;
+	}
+
+	/* Check submodule task state */
+	switch (vf_pdev->device) {
+	case PCI_DEVICE_ID_HUAWEI_SEC_VF:
+		state = qm_check_reg_state(qm, SEC_CORE_INT_STATUS);
+		if (state) {
+			dev_err(dev, "failed to check QM SEC Core INT state!\n");
+			return -EBUSY;
+		}
+		return 0;
+	case PCI_DEVICE_ID_HUAWEI_HPRE_VF:
+		state = qm_check_reg_state(qm, HPRE_HAC_INT_STATUS);
+		if (state) {
+			dev_err(dev, "failed to check QM HPRE HAC INT state!\n");
+			return -EBUSY;
+		}
+		return 0;
+	case PCI_DEVICE_ID_HUAWEI_ZIP_VF:
+		state = qm_check_reg_state(qm, HZIP_CORE_INT_STATUS);
+		if (state) {
+			dev_err(dev, "failed to check QM ZIP Core INT state!\n");
+			return -EBUSY;
+		}
+		return 0;
+	default:
+		dev_err(dev, "failed to detect acc module type!\n");
+		return -EINVAL;
+	}
+}
+
+static void hisi_acc_vf_disable_fd(struct hisi_acc_vf_migration_file *migf)
+{
+	mutex_lock(&migf->lock);
+	migf->disabled = true;
+	migf->total_length = 0;
+	migf->filp->f_pos = 0;
+	mutex_unlock(&migf->lock);
+}
+
+static void hisi_acc_vf_disable_fds(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+	if (hisi_acc_vdev->resuming_migf) {
+		hisi_acc_vf_disable_fd(hisi_acc_vdev->resuming_migf);
+		fput(hisi_acc_vdev->resuming_migf->filp);
+		hisi_acc_vdev->resuming_migf = NULL;
+	}
+
+	if (hisi_acc_vdev->saving_migf) {
+		hisi_acc_vf_disable_fd(hisi_acc_vdev->saving_migf);
+		fput(hisi_acc_vdev->saving_migf->filp);
+		hisi_acc_vdev->saving_migf = NULL;
+	}
+}
+
+/*
+ * This function is called in all state_mutex unlock cases to
+ * handle a 'deferred_reset' if exists.
+ */
+static void
+hisi_acc_vf_state_mutex_unlock(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+again:
+	spin_lock(&hisi_acc_vdev->reset_lock);
+	if (hisi_acc_vdev->deferred_reset) {
+		hisi_acc_vdev->deferred_reset = false;
+		spin_unlock(&hisi_acc_vdev->reset_lock);
+		hisi_acc_vdev->vf_qm_state = QM_NOT_READY;
+		hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+		hisi_acc_vf_disable_fds(hisi_acc_vdev);
+		goto again;
+	}
+	mutex_unlock(&hisi_acc_vdev->state_mutex);
+	spin_unlock(&hisi_acc_vdev->reset_lock);
+}
+
+static void hisi_acc_vf_start_device(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+	struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+
+	if (hisi_acc_vdev->vf_qm_state != QM_READY)
+		return;
+
+	/* Make sure the device is enabled */
+	qm_dev_cmd_init(vf_qm);
+
+	vf_qm_fun_reset(vf_qm);
+}
+
+static int hisi_acc_vf_load_state(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+	struct device *dev = &hisi_acc_vdev->vf_dev->dev;
+	struct hisi_acc_vf_migration_file *migf = hisi_acc_vdev->resuming_migf;
+	int ret;
+
+	/* Recover data to VF */
+	ret = vf_qm_load_data(hisi_acc_vdev, migf);
+	if (ret) {
+		dev_err(dev, "failed to recover the VF!\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static int hisi_acc_vf_release_file(struct inode *inode, struct file *filp)
+{
+	struct hisi_acc_vf_migration_file *migf = filp->private_data;
+
+	hisi_acc_vf_disable_fd(migf);
+	mutex_destroy(&migf->lock);
+	kfree(migf);
+	return 0;
+}
+
+static ssize_t hisi_acc_vf_resume_write(struct file *filp, const char __user *buf,
+					size_t len, loff_t *pos)
+{
+	struct hisi_acc_vf_migration_file *migf = filp->private_data;
+	u8 *vf_data = (u8 *)&migf->vf_data;
+	loff_t requested_length;
+	ssize_t done = 0;
+	int ret;
+
+	if (pos)
+		return -ESPIPE;
+	pos = &filp->f_pos;
+
+	if (*pos < 0 ||
+	    check_add_overflow((loff_t)len, *pos, &requested_length))
+		return -EINVAL;
+
+	if (requested_length > sizeof(struct acc_vf_data))
+		return -ENOMEM;
+
+	mutex_lock(&migf->lock);
+	if (migf->disabled) {
+		done = -ENODEV;
+		goto out_unlock;
+	}
+
+	ret = copy_from_user(vf_data + *pos, buf, len);
+	if (ret) {
+		done = -EFAULT;
+		goto out_unlock;
+	}
+	*pos += len;
+	done = len;
+	migf->total_length += len;
+
+	ret = vf_qm_check_match(migf->hisi_acc_vdev, migf);
+	if (ret)
+		done = -EFAULT;
+out_unlock:
+	mutex_unlock(&migf->lock);
+	return done;
+}
+
+static const struct file_operations hisi_acc_vf_resume_fops = {
+	.owner = THIS_MODULE,
+	.write = hisi_acc_vf_resume_write,
+	.release = hisi_acc_vf_release_file,
+	.llseek = no_llseek,
+};
+
+static struct hisi_acc_vf_migration_file *
+hisi_acc_vf_pci_resume(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+	struct hisi_acc_vf_migration_file *migf;
+
+	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
+	if (!migf)
+		return ERR_PTR(-ENOMEM);
+
+	migf->filp = anon_inode_getfile("hisi_acc_vf_mig", &hisi_acc_vf_resume_fops, migf,
+					O_WRONLY);
+	if (IS_ERR(migf->filp)) {
+		int err = PTR_ERR(migf->filp);
+
+		kfree(migf);
+		return ERR_PTR(err);
+	}
+
+	stream_open(migf->filp->f_inode, migf->filp);
+	mutex_init(&migf->lock);
+	migf->hisi_acc_vdev = hisi_acc_vdev;
+	return migf;
+}
+
+static long hisi_acc_vf_precopy_ioctl(struct file *filp,
+				      unsigned int cmd, unsigned long arg)
+{
+	struct hisi_acc_vf_migration_file *migf = filp->private_data;
+	struct hisi_acc_vf_core_device *hisi_acc_vdev = migf->hisi_acc_vdev;
+	loff_t *pos = &filp->f_pos;
+	struct vfio_precopy_info info;
+	unsigned long minsz;
+	int ret;
+
+	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+		return -ENOTTY;
+
+	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+
+	if (copy_from_user(&info, (void __user *)arg, minsz))
+		return -EFAULT;
+	if (info.argsz < minsz)
+		return -EINVAL;
+
+	mutex_lock(&hisi_acc_vdev->state_mutex);
+	if (hisi_acc_vdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY) {
+		mutex_unlock(&hisi_acc_vdev->state_mutex);
+		return -EINVAL;
+	}
+
+	mutex_lock(&migf->lock);
+
+	if (migf->disabled) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	if (*pos > migf->total_length) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	info.dirty_bytes = 0;
+	info.initial_bytes = migf->total_length - *pos;
+
+	ret = copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
+out:
+	mutex_unlock(&migf->lock);
+	mutex_unlock(&hisi_acc_vdev->state_mutex);
+	return ret;
+}
+
+static ssize_t hisi_acc_vf_save_read(struct file *filp, char __user *buf, size_t len,
+				     loff_t *pos)
+{
+	struct hisi_acc_vf_migration_file *migf = filp->private_data;
+	ssize_t done = 0;
+	int ret;
+
+	if (pos)
+		return -ESPIPE;
+	pos = &filp->f_pos;
+
+	mutex_lock(&migf->lock);
+	if (*pos > migf->total_length) {
+		done = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (migf->disabled) {
+		done = -ENODEV;
+		goto out_unlock;
+	}
+
+	len = min_t(size_t, migf->total_length - *pos, len);
+	if (len) {
+		u8 *vf_data = (u8 *)&migf->vf_data;
+
+		ret = copy_to_user(buf, vf_data + *pos, len);
+		if (ret) {
+			done = -EFAULT;
+			goto out_unlock;
+		}
+		*pos += len;
+		done = len;
+	}
+out_unlock:
+	mutex_unlock(&migf->lock);
+	return done;
+}
+
+static const struct file_operations hisi_acc_vf_save_fops = {
+	.owner = THIS_MODULE,
+	.read = hisi_acc_vf_save_read,
+	.unlocked_ioctl = hisi_acc_vf_precopy_ioctl,
+	.compat_ioctl = compat_ptr_ioctl,
+	.release = hisi_acc_vf_release_file,
+	.llseek = no_llseek,
+};
+
+static struct hisi_acc_vf_migration_file *
+hisi_acc_open_saving_migf(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+	struct hisi_acc_vf_migration_file *migf;
+	int ret;
+
+	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
+	if (!migf)
+		return ERR_PTR(-ENOMEM);
+
+	migf->filp = anon_inode_getfile("hisi_acc_vf_mig", &hisi_acc_vf_save_fops, migf,
+					O_RDONLY);
+	if (IS_ERR(migf->filp)) {
+		int err = PTR_ERR(migf->filp);
+
+		kfree(migf);
+		return ERR_PTR(err);
+	}
+
+	stream_open(migf->filp->f_inode, migf->filp);
+	mutex_init(&migf->lock);
+	migf->hisi_acc_vdev = hisi_acc_vdev;
+
+	ret = vf_qm_get_match_data(hisi_acc_vdev, &migf->vf_data);
+	if (ret) {
+		fput(migf->filp);
+		return ERR_PTR(ret);
+	}
+
+	return migf;
+}
+
+static struct hisi_acc_vf_migration_file *
+hisi_acc_vf_pre_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+	struct hisi_acc_vf_migration_file *migf;
+
+	migf = hisi_acc_open_saving_migf(hisi_acc_vdev);
+	if (IS_ERR(migf))
+		return migf;
+
+	migf->total_length = QM_MATCH_SIZE;
+	return migf;
+}
+
+static struct hisi_acc_vf_migration_file *
+hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev, bool open)
+{
+	int ret;
+	struct hisi_acc_vf_migration_file *migf = NULL;
+
+	if (open) {
+		/*
+		 * Userspace didn't use PRECOPY support. Hence saving_migf
+		 * is not opened yet.
+		 */
+		migf = hisi_acc_open_saving_migf(hisi_acc_vdev);
+		if (IS_ERR(migf))
+			return migf;
+	} else {
+		migf = hisi_acc_vdev->saving_migf;
+	}
+
+	ret = vf_qm_state_save(hisi_acc_vdev, migf);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return open ? migf : NULL;
+}
+
+static int hisi_acc_vf_stop_device(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+	struct device *dev = &hisi_acc_vdev->vf_dev->dev;
+	struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+	int ret;
+
+	ret = vf_qm_func_stop(vf_qm);
+	if (ret) {
+		dev_err(dev, "failed to stop QM VF function!\n");
+		return ret;
+	}
+
+	ret = hisi_acc_check_int_state(hisi_acc_vdev);
+	if (ret) {
+		dev_err(dev, "failed to check QM INT state!\n");
+		return ret;
+	}
+	return 0;
+}
+
+static struct file *
+hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+			     u32 new)
+{
+	u32 cur = hisi_acc_vdev->mig_state;
+	int ret;
+
+	if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) {
+		struct hisi_acc_vf_migration_file *migf;
+
+		migf = hisi_acc_vf_pre_copy(hisi_acc_vdev);
+		if (IS_ERR(migf))
+			return ERR_CAST(migf);
+		get_file(migf->filp);
+		hisi_acc_vdev->saving_migf = migf;
+		return migf->filp;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_STOP_COPY) {
+		struct hisi_acc_vf_migration_file *migf;
+
+		ret = hisi_acc_vf_stop_device(hisi_acc_vdev);
+		if (ret)
+			return ERR_PTR(ret);
+
+		migf = hisi_acc_vf_stop_copy(hisi_acc_vdev, false);
+		if (IS_ERR(migf))
+			return ERR_CAST(migf);
+
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_STOP) {
+		ret = hisi_acc_vf_stop_device(hisi_acc_vdev);
+		if (ret)
+			return ERR_PTR(ret);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
+		struct hisi_acc_vf_migration_file *migf;
+
+		migf = hisi_acc_vf_stop_copy(hisi_acc_vdev, true);
+		if (IS_ERR(migf))
+			return ERR_CAST(migf);
+		get_file(migf->filp);
+		hisi_acc_vdev->saving_migf = migf;
+		return migf->filp;
+	}
+
+	if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP)) {
+		hisi_acc_vf_disable_fds(hisi_acc_vdev);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
+		struct hisi_acc_vf_migration_file *migf;
+
+		migf = hisi_acc_vf_pci_resume(hisi_acc_vdev);
+		if (IS_ERR(migf))
+			return ERR_CAST(migf);
+		get_file(migf->filp);
+		hisi_acc_vdev->resuming_migf = migf;
+		return migf->filp;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
+		ret = hisi_acc_vf_load_state(hisi_acc_vdev);
+		if (ret)
+			return ERR_PTR(ret);
+		hisi_acc_vf_disable_fds(hisi_acc_vdev);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) {
+		hisi_acc_vf_disable_fds(hisi_acc_vdev);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING) {
+		hisi_acc_vf_start_device(hisi_acc_vdev);
+		return NULL;
+	}
+
+	/*
+	 * vfio_mig_get_next_state() does not use arcs other than the above
+	 */
+	WARN_ON(true);
+	return ERR_PTR(-EINVAL);
+}
+
+static struct file *
+hisi_acc_vfio_pci_set_device_state(struct vfio_device *vdev,
+				   enum vfio_device_mig_state new_state)
+{
+	struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(vdev,
+			struct hisi_acc_vf_core_device, core_device.vdev);
+	enum vfio_device_mig_state next_state;
+	struct file *res = NULL;
+	int ret;
+
+	mutex_lock(&hisi_acc_vdev->state_mutex);
+	while (new_state != hisi_acc_vdev->mig_state) {
+		ret = vfio_mig_get_next_state(vdev,
+					      hisi_acc_vdev->mig_state,
+					      new_state, &next_state);
+		if (ret) {
+			res = ERR_PTR(-EINVAL);
+			break;
+		}
+
+		res = hisi_acc_vf_set_device_state(hisi_acc_vdev, next_state);
+		if (IS_ERR(res))
+			break;
+		hisi_acc_vdev->mig_state = next_state;
+		if (WARN_ON(res && new_state != hisi_acc_vdev->mig_state)) {
+			fput(res);
+			res = ERR_PTR(-EINVAL);
+			break;
+		}
+	}
+	hisi_acc_vf_state_mutex_unlock(hisi_acc_vdev);
+	return res;
+}
+
+static int
+hisi_acc_vfio_pci_get_data_size(struct vfio_device *vdev,
+				unsigned long *stop_copy_length)
+{
+	*stop_copy_length = sizeof(struct acc_vf_data);
+	return 0;
+}
+
+static int
+hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev,
+				   enum vfio_device_mig_state *curr_state)
+{
+	struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(vdev,
+			struct hisi_acc_vf_core_device, core_device.vdev);
+
+	mutex_lock(&hisi_acc_vdev->state_mutex);
+	*curr_state = hisi_acc_vdev->mig_state;
+	hisi_acc_vf_state_mutex_unlock(hisi_acc_vdev);
+	return 0;
+}
+
+static void hisi_acc_vf_pci_aer_reset_done(struct pci_dev *pdev)
+{
+	struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev);
+
+	if (hisi_acc_vdev->core_device.vdev.migration_flags !=
+				VFIO_MIGRATION_STOP_COPY)
+		return;
+
+	/*
+	 * As the higher VFIO layers are holding locks across reset and using
+	 * those same locks with the mm_lock we need to prevent ABBA deadlock
+	 * with the state_mutex and mm_lock.
+	 * In case the state_mutex was taken already we defer the cleanup work
+	 * to the unlock flow of the other running context.
+	 */
+	spin_lock(&hisi_acc_vdev->reset_lock);
+	hisi_acc_vdev->deferred_reset = true;
+	if (!mutex_trylock(&hisi_acc_vdev->state_mutex)) {
+		spin_unlock(&hisi_acc_vdev->reset_lock);
+		return;
+	}
+	spin_unlock(&hisi_acc_vdev->reset_lock);
+	hisi_acc_vf_state_mutex_unlock(hisi_acc_vdev);
+}
+
+static int hisi_acc_vf_qm_init(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+	struct vfio_pci_core_device *vdev = &hisi_acc_vdev->core_device;
+	struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+	struct pci_dev *vf_dev = vdev->pdev;
+
+	/*
+	 * ACC VF dev BAR2 region consists of both functional register space
+	 * and migration control register space. For migration to work, we
+	 * need access to both. Hence, we map the entire BAR2 region here.
+	 * But unnecessarily exposing the migration BAR region to the Guest
+	 * has the potential to prevent/corrupt the Guest migration. Hence,
+	 * we restrict access to the migration control space from
+	 * Guest(Please see mmap/ioctl/read/write override functions).
+	 *
+	 * Please note that it is OK to expose the entire VF BAR if migration
+	 * is not supported or required as this cannot affect the ACC PF
+	 * configurations.
+	 *
+	 * Also the HiSilicon ACC VF devices supported by this driver on
+	 * HiSilicon hardware platforms are integrated end point devices
+	 * and the platform lacks the capability to perform any PCIe P2P
+	 * between these devices.
+	 */
+
+	vf_qm->io_base =
+		ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX),
+			pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX));
+	if (!vf_qm->io_base)
+		return -EIO;
+
+	vf_qm->fun_type = QM_HW_VF;
+	vf_qm->pdev = vf_dev;
+	mutex_init(&vf_qm->mailbox_lock);
+
+	return 0;
+}
+
+static struct hisi_qm *hisi_acc_get_pf_qm(struct pci_dev *pdev)
+{
+	struct hisi_qm	*pf_qm;
+	struct pci_driver *pf_driver;
+
+	if (!pdev->is_virtfn)
+		return NULL;
+
+	switch (pdev->device) {
+	case PCI_DEVICE_ID_HUAWEI_SEC_VF:
+		pf_driver = hisi_sec_get_pf_driver();
+		break;
+	case PCI_DEVICE_ID_HUAWEI_HPRE_VF:
+		pf_driver = hisi_hpre_get_pf_driver();
+		break;
+	case PCI_DEVICE_ID_HUAWEI_ZIP_VF:
+		pf_driver = hisi_zip_get_pf_driver();
+		break;
+	default:
+		return NULL;
+	}
+
+	if (!pf_driver)
+		return NULL;
+
+	pf_qm = pci_iov_get_pf_drvdata(pdev, pf_driver);
+
+	return !IS_ERR(pf_qm) ? pf_qm : NULL;
+}
+
+static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev,
+					size_t count, loff_t *ppos,
+					size_t *new_count)
+{
+	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+	struct vfio_pci_core_device *vdev =
+		container_of(core_vdev, struct vfio_pci_core_device, vdev);
+
+	if (index == VFIO_PCI_BAR2_REGION_INDEX) {
+		loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+		resource_size_t end = pci_resource_len(vdev->pdev, index) / 2;
+
+		/* Check if access is for migration control region */
+		if (pos >= end)
+			return -EINVAL;
+
+		*new_count = min(count, (size_t)(end - pos));
+	}
+
+	return 0;
+}
+
+static int hisi_acc_vfio_pci_mmap(struct vfio_device *core_vdev,
+				  struct vm_area_struct *vma)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(core_vdev, struct vfio_pci_core_device, vdev);
+	unsigned int index;
+
+	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+	if (index == VFIO_PCI_BAR2_REGION_INDEX) {
+		u64 req_len, pgoff, req_start;
+		resource_size_t end = pci_resource_len(vdev->pdev, index) / 2;
+
+		req_len = vma->vm_end - vma->vm_start;
+		pgoff = vma->vm_pgoff &
+			((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+		req_start = pgoff << PAGE_SHIFT;
+
+		if (req_start + req_len > end)
+			return -EINVAL;
+	}
+
+	return vfio_pci_core_mmap(core_vdev, vma);
+}
+
+static ssize_t hisi_acc_vfio_pci_write(struct vfio_device *core_vdev,
+				       const char __user *buf, size_t count,
+				       loff_t *ppos)
+{
+	size_t new_count = count;
+	int ret;
+
+	ret = hisi_acc_pci_rw_access_check(core_vdev, count, ppos, &new_count);
+	if (ret)
+		return ret;
+
+	return vfio_pci_core_write(core_vdev, buf, new_count, ppos);
+}
+
+static ssize_t hisi_acc_vfio_pci_read(struct vfio_device *core_vdev,
+				      char __user *buf, size_t count,
+				      loff_t *ppos)
+{
+	size_t new_count = count;
+	int ret;
+
+	ret = hisi_acc_pci_rw_access_check(core_vdev, count, ppos, &new_count);
+	if (ret)
+		return ret;
+
+	return vfio_pci_core_read(core_vdev, buf, new_count, ppos);
+}
+
+static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
+				    unsigned long arg)
+{
+	if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
+		struct vfio_pci_core_device *vdev =
+			container_of(core_vdev, struct vfio_pci_core_device, vdev);
+		struct pci_dev *pdev = vdev->pdev;
+		struct vfio_region_info info;
+		unsigned long minsz;
+
+		minsz = offsetofend(struct vfio_region_info, offset);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		if (info.index == VFIO_PCI_BAR2_REGION_INDEX) {
+			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+
+			/*
+			 * ACC VF dev BAR2 region consists of both functional
+			 * register space and migration control register space.
+			 * Report only the functional region to Guest.
+			 */
+			info.size = pci_resource_len(pdev, info.index) / 2;
+
+			info.flags = VFIO_REGION_INFO_FLAG_READ |
+					VFIO_REGION_INFO_FLAG_WRITE |
+					VFIO_REGION_INFO_FLAG_MMAP;
+
+			return copy_to_user((void __user *)arg, &info, minsz) ?
+					    -EFAULT : 0;
+		}
+	}
+	return vfio_pci_core_ioctl(core_vdev, cmd, arg);
+}
+
+static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev)
+{
+	struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev,
+			struct hisi_acc_vf_core_device, core_device.vdev);
+	struct vfio_pci_core_device *vdev = &hisi_acc_vdev->core_device;
+	int ret;
+
+	ret = vfio_pci_core_enable(vdev);
+	if (ret)
+		return ret;
+
+	if (core_vdev->mig_ops) {
+		ret = hisi_acc_vf_qm_init(hisi_acc_vdev);
+		if (ret) {
+			vfio_pci_core_disable(vdev);
+			return ret;
+		}
+		hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+	}
+
+	vfio_pci_core_finish_enable(vdev);
+	return 0;
+}
+
+static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev)
+{
+	struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev,
+			struct hisi_acc_vf_core_device, core_device.vdev);
+	struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+
+	iounmap(vf_qm->io_base);
+	vfio_pci_core_close_device(core_vdev);
+}
+
+static const struct vfio_migration_ops hisi_acc_vfio_pci_migrn_state_ops = {
+	.migration_set_state = hisi_acc_vfio_pci_set_device_state,
+	.migration_get_state = hisi_acc_vfio_pci_get_device_state,
+	.migration_get_data_size = hisi_acc_vfio_pci_get_data_size,
+};
+
+static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev)
+{
+	struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev,
+			struct hisi_acc_vf_core_device, core_device.vdev);
+	struct pci_dev *pdev = to_pci_dev(core_vdev->dev);
+	struct hisi_qm *pf_qm = hisi_acc_get_pf_qm(pdev);
+
+	hisi_acc_vdev->vf_id = pci_iov_vf_id(pdev) + 1;
+	hisi_acc_vdev->pf_qm = pf_qm;
+	hisi_acc_vdev->vf_dev = pdev;
+	mutex_init(&hisi_acc_vdev->state_mutex);
+
+	core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY;
+	core_vdev->mig_ops = &hisi_acc_vfio_pci_migrn_state_ops;
+
+	return vfio_pci_core_init_dev(core_vdev);
+}
+
+static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = {
+	.name = "hisi-acc-vfio-pci-migration",
+	.init = hisi_acc_vfio_pci_migrn_init_dev,
+	.release = vfio_pci_core_release_dev,
+	.open_device = hisi_acc_vfio_pci_open_device,
+	.close_device = hisi_acc_vfio_pci_close_device,
+	.ioctl = hisi_acc_vfio_pci_ioctl,
+	.device_feature = vfio_pci_core_ioctl_feature,
+	.read = hisi_acc_vfio_pci_read,
+	.write = hisi_acc_vfio_pci_write,
+	.mmap = hisi_acc_vfio_pci_mmap,
+	.request = vfio_pci_core_request,
+	.match = vfio_pci_core_match,
+	.bind_iommufd = vfio_iommufd_physical_bind,
+	.unbind_iommufd = vfio_iommufd_physical_unbind,
+	.attach_ioas = vfio_iommufd_physical_attach_ioas,
+	.detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
+	.name = "hisi-acc-vfio-pci",
+	.init = vfio_pci_core_init_dev,
+	.release = vfio_pci_core_release_dev,
+	.open_device = hisi_acc_vfio_pci_open_device,
+	.close_device = vfio_pci_core_close_device,
+	.ioctl = vfio_pci_core_ioctl,
+	.device_feature = vfio_pci_core_ioctl_feature,
+	.read = vfio_pci_core_read,
+	.write = vfio_pci_core_write,
+	.mmap = vfio_pci_core_mmap,
+	.request = vfio_pci_core_request,
+	.match = vfio_pci_core_match,
+	.bind_iommufd = vfio_iommufd_physical_bind,
+	.unbind_iommufd = vfio_iommufd_physical_unbind,
+	.attach_ioas = vfio_iommufd_physical_attach_ioas,
+	.detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct hisi_acc_vf_core_device *hisi_acc_vdev;
+	const struct vfio_device_ops *ops = &hisi_acc_vfio_pci_ops;
+	struct hisi_qm *pf_qm;
+	int vf_id;
+	int ret;
+
+	pf_qm = hisi_acc_get_pf_qm(pdev);
+	if (pf_qm && pf_qm->ver >= QM_HW_V3) {
+		vf_id = pci_iov_vf_id(pdev);
+		if (vf_id >= 0)
+			ops = &hisi_acc_vfio_pci_migrn_ops;
+		else
+			pci_warn(pdev, "migration support failed, continue with generic interface\n");
+	}
+
+	hisi_acc_vdev = vfio_alloc_device(hisi_acc_vf_core_device,
+					  core_device.vdev, &pdev->dev, ops);
+	if (IS_ERR(hisi_acc_vdev))
+		return PTR_ERR(hisi_acc_vdev);
+
+	dev_set_drvdata(&pdev->dev, &hisi_acc_vdev->core_device);
+	ret = vfio_pci_core_register_device(&hisi_acc_vdev->core_device);
+	if (ret)
+		goto out_put_vdev;
+	return 0;
+
+out_put_vdev:
+	vfio_put_device(&hisi_acc_vdev->core_device.vdev);
+	return ret;
+}
+
+static void hisi_acc_vfio_pci_remove(struct pci_dev *pdev)
+{
+	struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev);
+
+	vfio_pci_core_unregister_device(&hisi_acc_vdev->core_device);
+	vfio_put_device(&hisi_acc_vdev->core_device.vdev);
+}
+
+static const struct pci_device_id hisi_acc_vfio_pci_table[] = {
+	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_HUAWEI_SEC_VF) },
+	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_HUAWEI_HPRE_VF) },
+	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_HUAWEI_ZIP_VF) },
+	{ }
+};
+
+MODULE_DEVICE_TABLE(pci, hisi_acc_vfio_pci_table);
+
+static const struct pci_error_handlers hisi_acc_vf_err_handlers = {
+	.reset_done = hisi_acc_vf_pci_aer_reset_done,
+	.error_detected = vfio_pci_core_aer_err_detected,
+};
+
+static struct pci_driver hisi_acc_vfio_pci_driver = {
+	.name = KBUILD_MODNAME,
+	.id_table = hisi_acc_vfio_pci_table,
+	.probe = hisi_acc_vfio_pci_probe,
+	.remove = hisi_acc_vfio_pci_remove,
+	.err_handler = &hisi_acc_vf_err_handlers,
+	.driver_managed_dma = true,
+};
+
+module_pci_driver(hisi_acc_vfio_pci_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Liu Longfang <liulongfang@huawei.com>");
+MODULE_AUTHOR("Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>");
+MODULE_DESCRIPTION("HiSilicon VFIO PCI - VFIO PCI driver with live migration support for HiSilicon ACC device family");
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
new file mode 100644
index 0000000000..dcabfeec6c
--- /dev/null
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2021 HiSilicon Ltd. */
+
+#ifndef HISI_ACC_VFIO_PCI_H
+#define HISI_ACC_VFIO_PCI_H
+
+#include <linux/hisi_acc_qm.h>
+
+#define MB_POLL_PERIOD_US		10
+#define MB_POLL_TIMEOUT_US		1000
+#define QM_CACHE_WB_START		0x204
+#define QM_CACHE_WB_DONE		0x208
+#define QM_MB_CMD_PAUSE_QM		0xe
+#define QM_ABNORMAL_INT_STATUS		0x100008
+#define QM_IFC_INT_STATUS		0x0028
+#define SEC_CORE_INT_STATUS		0x301008
+#define HPRE_HAC_INT_STATUS		0x301800
+#define HZIP_CORE_INT_STATUS		0x3010AC
+
+#define QM_VFT_CFG_RDY			0x10006c
+#define QM_VFT_CFG_OP_WR		0x100058
+#define QM_VFT_CFG_TYPE			0x10005c
+#define QM_VFT_CFG			0x100060
+#define QM_VFT_CFG_OP_ENABLE		0x100054
+#define QM_VFT_CFG_DATA_L		0x100064
+#define QM_VFT_CFG_DATA_H		0x100068
+
+#define ERROR_CHECK_TIMEOUT		100
+#define CHECK_DELAY_TIME		100
+
+#define QM_SQC_VFT_BASE_SHIFT_V2	28
+#define QM_SQC_VFT_BASE_MASK_V2		GENMASK(15, 0)
+#define QM_SQC_VFT_NUM_SHIFT_V2		45
+#define QM_SQC_VFT_NUM_MASK_V2		GENMASK(9, 0)
+
+/* RW regs */
+#define QM_REGS_MAX_LEN		7
+#define QM_REG_ADDR_OFFSET	0x0004
+
+#define QM_XQC_ADDR_OFFSET	32U
+#define QM_VF_AEQ_INT_MASK	0x0004
+#define QM_VF_EQ_INT_MASK	0x000c
+#define QM_IFC_INT_SOURCE_V	0x0020
+#define QM_IFC_INT_MASK		0x0024
+#define QM_IFC_INT_SET_V	0x002c
+#define QM_QUE_ISO_CFG_V	0x0030
+#define QM_PAGE_SIZE		0x0034
+
+#define QM_EQC_DW0		0X8000
+#define QM_AEQC_DW0		0X8020
+
+struct acc_vf_data {
+#define QM_MATCH_SIZE offsetofend(struct acc_vf_data, qm_rsv_state)
+	/* QM match information */
+#define ACC_DEV_MAGIC	0XCDCDCDCDFEEDAACC
+	u64 acc_magic;
+	u32 qp_num;
+	u32 dev_id;
+	u32 que_iso_cfg;
+	u32 qp_base;
+	u32 vf_qm_state;
+	/* QM reserved match information */
+	u32 qm_rsv_state[3];
+
+	/* QM RW regs */
+	u32 aeq_int_mask;
+	u32 eq_int_mask;
+	u32 ifc_int_source;
+	u32 ifc_int_mask;
+	u32 ifc_int_set;
+	u32 page_size;
+
+	/* QM_EQC_DW has 7 regs */
+	u32 qm_eqc_dw[7];
+
+	/* QM_AEQC_DW has 7 regs */
+	u32 qm_aeqc_dw[7];
+
+	/* QM reserved 5 regs */
+	u32 qm_rsv_regs[5];
+	u32 padding;
+	/* QM memory init information */
+	u64 eqe_dma;
+	u64 aeqe_dma;
+	u64 sqc_dma;
+	u64 cqc_dma;
+};
+
+struct hisi_acc_vf_migration_file {
+	struct file *filp;
+	struct mutex lock;
+	bool disabled;
+
+	struct hisi_acc_vf_core_device *hisi_acc_vdev;
+	struct acc_vf_data vf_data;
+	size_t total_length;
+};
+
+struct hisi_acc_vf_core_device {
+	struct vfio_pci_core_device core_device;
+	u8 match_done:1;
+	u8 deferred_reset:1;
+	/* For migration state */
+	struct mutex state_mutex;
+	enum vfio_device_mig_state mig_state;
+	struct pci_dev *pf_dev;
+	struct pci_dev *vf_dev;
+	struct hisi_qm *pf_qm;
+	struct hisi_qm vf_qm;
+	u32 vf_qm_state;
+	int vf_id;
+	/* For reset handler */
+	spinlock_t reset_lock;
+	struct hisi_acc_vf_migration_file *resuming_migf;
+	struct hisi_acc_vf_migration_file *saving_migf;
+};
+#endif /* HISI_ACC_VFIO_PCI_H */
diff --git a/drivers/vfio/pci/mlx5/Kconfig b/drivers/vfio/pci/mlx5/Kconfig
new file mode 100644
index 0000000000..7088edc4fb
--- /dev/null
+++ b/drivers/vfio/pci/mlx5/Kconfig
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config MLX5_VFIO_PCI
+	tristate "VFIO support for MLX5 PCI devices"
+	depends on MLX5_CORE
+	select VFIO_PCI_CORE
+	help
+	  This provides migration support for MLX5 devices using the VFIO
+	  framework.
+
+	  If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/mlx5/Makefile b/drivers/vfio/pci/mlx5/Makefile
new file mode 100644
index 0000000000..689627da7f
--- /dev/null
+++ b/drivers/vfio/pci/mlx5/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5-vfio-pci.o
+mlx5-vfio-pci-y := main.o cmd.o
+
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
new file mode 100644
index 0000000000..33574b0447
--- /dev/null
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -0,0 +1,1624 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include "cmd.h"
+
+enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };
+
+static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
+{
+	int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
+	void *query_cap = NULL, *cap;
+	int ret;
+
+	query_cap = kzalloc(query_sz, GFP_KERNEL);
+	if (!query_cap)
+		return -ENOMEM;
+
+	ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap,
+					    MLX5_CAP_GENERAL_2);
+	if (ret)
+		goto out;
+
+	cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability);
+	if (!MLX5_GET(cmd_hca_cap_2, cap, migratable))
+		ret = -EOPNOTSUPP;
+out:
+	kfree(query_cap);
+	return ret;
+}
+
+static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
+				  u16 *vhca_id);
+static void
+_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
+
+int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
+{
+	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
+	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
+	int err;
+
+	lockdep_assert_held(&mvdev->state_mutex);
+	if (mvdev->mdev_detach)
+		return -ENOTCONN;
+
+	/*
+	 * In case PRE_COPY is used, saving_migf is exposed while the device is
+	 * running. Make sure to run only once there is no active save command.
+	 * Running both in parallel, might end-up with a failure in the save
+	 * command once it will try to turn on 'tracking' on a suspended device.
+	 */
+	if (migf) {
+		err = wait_for_completion_interruptible(&migf->save_comp);
+		if (err)
+			return err;
+	}
+
+	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
+	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
+	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);
+
+	err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
+	if (migf)
+		complete(&migf->save_comp);
+
+	return err;
+}
+
+int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
+{
+	u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};
+
+	lockdep_assert_held(&mvdev->state_mutex);
+	if (mvdev->mdev_detach)
+		return -ENOTCONN;
+
+	MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
+	MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
+	MLX5_SET(resume_vhca_in, in, op_mod, op_mod);
+
+	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
+}
+
+int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
+					  size_t *state_size, u8 query_flags)
+{
+	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
+	bool inc = query_flags & MLX5VF_QUERY_INC;
+	int ret;
+
+	lockdep_assert_held(&mvdev->state_mutex);
+	if (mvdev->mdev_detach)
+		return -ENOTCONN;
+
+	/*
+	 * In case PRE_COPY is used, saving_migf is exposed while device is
+	 * running. Make sure to run only once there is no active save command.
+	 * Running both in parallel, might end-up with a failure in the
+	 * incremental query command on un-tracked vhca.
+	 */
+	if (inc) {
+		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
+		if (ret)
+			return ret;
+		if (mvdev->saving_migf->state ==
+		    MLX5_MIGF_STATE_PRE_COPY_ERROR) {
+			/*
+			 * In case we had a PRE_COPY error, only query full
+			 * image for final image
+			 */
+			if (!(query_flags & MLX5VF_QUERY_FINAL)) {
+				*state_size = 0;
+				complete(&mvdev->saving_migf->save_comp);
+				return 0;
+			}
+			query_flags &= ~MLX5VF_QUERY_INC;
+		}
+	}
+
+	MLX5_SET(query_vhca_migration_state_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
+	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
+	MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
+	MLX5_SET(query_vhca_migration_state_in, in, incremental,
+		 query_flags & MLX5VF_QUERY_INC);
+
+	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
+				  out);
+	if (inc)
+		complete(&mvdev->saving_migf->save_comp);
+
+	if (ret)
+		return ret;
+
+	*state_size = MLX5_GET(query_vhca_migration_state_out, out,
+			       required_umem_size);
+	return 0;
+}
+
+static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
+{
+	/* Mark the tracker under an error and wake it up if it's running */
+	mvdev->tracker.is_err = true;
+	complete(&mvdev->tracker_comp);
+}
+
+static int mlx5fv_vf_event(struct notifier_block *nb,
+			   unsigned long event, void *data)
+{
+	struct mlx5vf_pci_core_device *mvdev =
+		container_of(nb, struct mlx5vf_pci_core_device, nb);
+
+	switch (event) {
+	case MLX5_PF_NOTIFY_ENABLE_VF:
+		mutex_lock(&mvdev->state_mutex);
+		mvdev->mdev_detach = false;
+		mlx5vf_state_mutex_unlock(mvdev);
+		break;
+	case MLX5_PF_NOTIFY_DISABLE_VF:
+		mlx5vf_cmd_close_migratable(mvdev);
+		mutex_lock(&mvdev->state_mutex);
+		mvdev->mdev_detach = true;
+		mlx5vf_state_mutex_unlock(mvdev);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
+{
+	if (!mvdev->migrate_cap)
+		return;
+
+	/* Must be done outside the lock to let it progress */
+	set_tracker_error(mvdev);
+	mutex_lock(&mvdev->state_mutex);
+	mlx5vf_disable_fds(mvdev);
+	_mlx5vf_free_page_tracker_resources(mvdev);
+	mlx5vf_state_mutex_unlock(mvdev);
+}
+
+void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
+{
+	if (!mvdev->migrate_cap)
+		return;
+
+	mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
+						&mvdev->nb);
+	destroy_workqueue(mvdev->cb_wq);
+}
+
+void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
+			       const struct vfio_migration_ops *mig_ops,
+			       const struct vfio_log_ops *log_ops)
+{
+	struct pci_dev *pdev = mvdev->core_device.pdev;
+	int ret;
+
+	if (!pdev->is_virtfn)
+		return;
+
+	mvdev->mdev = mlx5_vf_get_core_dev(pdev);
+	if (!mvdev->mdev)
+		return;
+
+	if (!MLX5_CAP_GEN(mvdev->mdev, migration))
+		goto end;
+
+	mvdev->vf_id = pci_iov_vf_id(pdev);
+	if (mvdev->vf_id < 0)
+		goto end;
+
+	ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1);
+	if (ret)
+		goto end;
+
+	if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
+				   &mvdev->vhca_id))
+		goto end;
+
+	mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
+	if (!mvdev->cb_wq)
+		goto end;
+
+	mutex_init(&mvdev->state_mutex);
+	spin_lock_init(&mvdev->reset_lock);
+	mvdev->nb.notifier_call = mlx5fv_vf_event;
+	ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
+						    &mvdev->nb);
+	if (ret) {
+		destroy_workqueue(mvdev->cb_wq);
+		goto end;
+	}
+
+	mvdev->migrate_cap = 1;
+	mvdev->core_device.vdev.migration_flags =
+		VFIO_MIGRATION_STOP_COPY |
+		VFIO_MIGRATION_P2P;
+	mvdev->core_device.vdev.mig_ops = mig_ops;
+	init_completion(&mvdev->tracker_comp);
+	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
+		mvdev->core_device.vdev.log_ops = log_ops;
+
+	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
+	    MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
+		mvdev->core_device.vdev.migration_flags |=
+			VFIO_MIGRATION_PRE_COPY;
+
+end:
+	mlx5_vf_put_core_dev(mvdev->mdev);
+}
+
+static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
+				  u16 *vhca_id)
+{
+	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
+	int out_size;
+	void *out;
+	int ret;
+
+	out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
+	out = kzalloc(out_size, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
+	MLX5_SET(query_hca_cap_in, in, other_function, 1);
+	MLX5_SET(query_hca_cap_in, in, function_id, function_id);
+	MLX5_SET(query_hca_cap_in, in, op_mod,
+		 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
+		 HCA_CAP_OPMOD_GET_CUR);
+
+	ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
+	if (ret)
+		goto err_exec;
+
+	*vhca_id = MLX5_GET(query_hca_cap_out, out,
+			    capability.cmd_hca_cap.vhca_id);
+
+err_exec:
+	kfree(out);
+	return ret;
+}
+
+static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
+			struct mlx5_vhca_data_buffer *buf,
+			struct mlx5_vhca_recv_buf *recv_buf,
+			u32 *mkey)
+{
+	size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
+				recv_buf->npages;
+	int err = 0, inlen;
+	__be64 *mtt;
+	void *mkc;
+	u32 *in;
+
+	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
+		sizeof(*mtt) * round_up(npages, 2);
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
+		 DIV_ROUND_UP(npages, 2));
+	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
+
+	if (buf) {
+		struct sg_dma_page_iter dma_iter;
+
+		for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
+			*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
+	} else {
+		int i;
+
+		for (i = 0; i < npages; i++)
+			*mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
+	}
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
+	MLX5_SET(mkc, mkc, lr, 1);
+	MLX5_SET(mkc, mkc, lw, 1);
+	MLX5_SET(mkc, mkc, rr, 1);
+	MLX5_SET(mkc, mkc, rw, 1);
+	MLX5_SET(mkc, mkc, pd, pdn);
+	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
+	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
+	MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
+	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
+	kvfree(in);
+	return err;
+}
+
+static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
+{
+	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
+	struct mlx5_core_dev *mdev = mvdev->mdev;
+	int ret;
+
+	lockdep_assert_held(&mvdev->state_mutex);
+	if (mvdev->mdev_detach)
+		return -ENOTCONN;
+
+	if (buf->dmaed || !buf->allocated_length)
+		return -EINVAL;
+
+	ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
+	if (ret)
+		return ret;
+
+	ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
+	if (ret)
+		goto err;
+
+	buf->dmaed = true;
+
+	return 0;
+err:
+	dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
+	return ret;
+}
+
+void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
+{
+	struct mlx5_vf_migration_file *migf = buf->migf;
+	struct sg_page_iter sg_iter;
+
+	lockdep_assert_held(&migf->mvdev->state_mutex);
+	WARN_ON(migf->mvdev->mdev_detach);
+
+	if (buf->dmaed) {
+		mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
+		dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
+				  buf->dma_dir, 0);
+	}
+
+	/* Undo alloc_pages_bulk_array() */
+	for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
+		__free_page(sg_page_iter_page(&sg_iter));
+	sg_free_append_table(&buf->table);
+	kfree(buf);
+}
+
+struct mlx5_vhca_data_buffer *
+mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
+			 size_t length,
+			 enum dma_data_direction dma_dir)
+{
+	struct mlx5_vhca_data_buffer *buf;
+	int ret;
+
+	buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
+	if (!buf)
+		return ERR_PTR(-ENOMEM);
+
+	buf->dma_dir = dma_dir;
+	buf->migf = migf;
+	if (length) {
+		ret = mlx5vf_add_migration_pages(buf,
+				DIV_ROUND_UP_ULL(length, PAGE_SIZE));
+		if (ret)
+			goto end;
+
+		if (dma_dir != DMA_NONE) {
+			ret = mlx5vf_dma_data_buffer(buf);
+			if (ret)
+				goto end;
+		}
+	}
+
+	return buf;
+end:
+	mlx5vf_free_data_buffer(buf);
+	return ERR_PTR(ret);
+}
+
+void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
+{
+	spin_lock_irq(&buf->migf->list_lock);
+	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
+	spin_unlock_irq(&buf->migf->list_lock);
+}
+
+struct mlx5_vhca_data_buffer *
+mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
+		       size_t length, enum dma_data_direction dma_dir)
+{
+	struct mlx5_vhca_data_buffer *buf, *temp_buf;
+	struct list_head free_list;
+
+	lockdep_assert_held(&migf->mvdev->state_mutex);
+	if (migf->mvdev->mdev_detach)
+		return ERR_PTR(-ENOTCONN);
+
+	INIT_LIST_HEAD(&free_list);
+
+	spin_lock_irq(&migf->list_lock);
+	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
+		if (buf->dma_dir == dma_dir) {
+			list_del_init(&buf->buf_elm);
+			if (buf->allocated_length >= length) {
+				spin_unlock_irq(&migf->list_lock);
+				goto found;
+			}
+			/*
+			 * Prevent holding redundant buffers. Put in a free
+			 * list and call at the end not under the spin lock
+			 * (&migf->list_lock) to mlx5vf_free_data_buffer which
+			 * might sleep.
+			 */
+			list_add(&buf->buf_elm, &free_list);
+		}
+	}
+	spin_unlock_irq(&migf->list_lock);
+	buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);
+
+found:
+	while ((temp_buf = list_first_entry_or_null(&free_list,
+				struct mlx5_vhca_data_buffer, buf_elm))) {
+		list_del(&temp_buf->buf_elm);
+		mlx5vf_free_data_buffer(temp_buf);
+	}
+
+	return buf;
+}
+
+void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
+{
+	struct mlx5vf_async_data *async_data = container_of(_work,
+		struct mlx5vf_async_data, work);
+	struct mlx5_vf_migration_file *migf = container_of(async_data,
+		struct mlx5_vf_migration_file, async_data);
+
+	mutex_lock(&migf->lock);
+	if (async_data->status) {
+		mlx5vf_put_data_buffer(async_data->buf);
+		if (async_data->header_buf)
+			mlx5vf_put_data_buffer(async_data->header_buf);
+		if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
+			migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
+		else
+			migf->state = MLX5_MIGF_STATE_ERROR;
+		wake_up_interruptible(&migf->poll_wait);
+	}
+	mutex_unlock(&migf->lock);
+	kvfree(async_data->out);
+	complete(&migf->save_comp);
+	fput(migf->filp);
+}
+
+static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
+			  size_t image_size, bool initial_pre_copy)
+{
+	struct mlx5_vf_migration_file *migf = header_buf->migf;
+	struct mlx5_vf_migration_header header = {};
+	unsigned long flags;
+	struct page *page;
+	u8 *to_buff;
+
+	header.record_size = cpu_to_le64(image_size);
+	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY);
+	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA);
+	page = mlx5vf_get_migration_page(header_buf, 0);
+	if (!page)
+		return -EINVAL;
+	to_buff = kmap_local_page(page);
+	memcpy(to_buff, &header, sizeof(header));
+	kunmap_local(to_buff);
+	header_buf->length = sizeof(header);
+	header_buf->start_pos = header_buf->migf->max_pos;
+	migf->max_pos += header_buf->length;
+	spin_lock_irqsave(&migf->list_lock, flags);
+	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
+	spin_unlock_irqrestore(&migf->list_lock, flags);
+	if (initial_pre_copy)
+		migf->pre_copy_initial_bytes += sizeof(header);
+	return 0;
+}
+
+static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
+{
+	struct mlx5vf_async_data *async_data = container_of(context,
+			struct mlx5vf_async_data, cb_work);
+	struct mlx5_vf_migration_file *migf = container_of(async_data,
+			struct mlx5_vf_migration_file, async_data);
+
+	if (!status) {
+		size_t image_size;
+		unsigned long flags;
+		bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
+				!async_data->last_chunk;
+
+		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
+				      actual_image_size);
+		if (async_data->header_buf) {
+			status = add_buf_header(async_data->header_buf, image_size,
+						initial_pre_copy);
+			if (status)
+				goto err;
+		}
+		async_data->buf->length = image_size;
+		async_data->buf->start_pos = migf->max_pos;
+		migf->max_pos += async_data->buf->length;
+		spin_lock_irqsave(&migf->list_lock, flags);
+		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
+		spin_unlock_irqrestore(&migf->list_lock, flags);
+		if (initial_pre_copy)
+			migf->pre_copy_initial_bytes += image_size;
+		migf->state = async_data->last_chunk ?
+			MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY;
+		wake_up_interruptible(&migf->poll_wait);
+	}
+
+err:
+	/*
+	 * The error and the cleanup flows can't run from an
+	 * interrupt context
+	 */
+	if (status == -EREMOTEIO)
+		status = MLX5_GET(save_vhca_state_out, async_data->out, status);
+	async_data->status = status;
+	queue_work(migf->mvdev->cb_wq, &async_data->work);
+}
+
+int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
+			       struct mlx5_vf_migration_file *migf,
+			       struct mlx5_vhca_data_buffer *buf, bool inc,
+			       bool track)
+{
+	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
+	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
+	struct mlx5_vhca_data_buffer *header_buf = NULL;
+	struct mlx5vf_async_data *async_data;
+	int err;
+
+	lockdep_assert_held(&mvdev->state_mutex);
+	if (mvdev->mdev_detach)
+		return -ENOTCONN;
+
+	err = wait_for_completion_interruptible(&migf->save_comp);
+	if (err)
+		return err;
+
+	if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
+		/*
+		 * In case we had a PRE_COPY error, SAVE is triggered only for
+		 * the final image, read device full image.
+		 */
+		inc = false;
+
+	MLX5_SET(save_vhca_state_in, in, opcode,
+		 MLX5_CMD_OP_SAVE_VHCA_STATE);
+	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
+	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
+	MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
+	MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
+	MLX5_SET(save_vhca_state_in, in, incremental, inc);
+	MLX5_SET(save_vhca_state_in, in, set_track, track);
+
+	async_data = &migf->async_data;
+	async_data->buf = buf;
+	async_data->last_chunk = !track;
+	async_data->out = kvzalloc(out_size, GFP_KERNEL);
+	if (!async_data->out) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
+		if (async_data->last_chunk && migf->buf_header) {
+			header_buf = migf->buf_header;
+			migf->buf_header = NULL;
+		} else {
+			header_buf = mlx5vf_get_data_buffer(migf,
+				sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+			if (IS_ERR(header_buf)) {
+				err = PTR_ERR(header_buf);
+				goto err_free;
+			}
+		}
+	}
+
+	if (async_data->last_chunk)
+		migf->state = MLX5_MIGF_STATE_SAVE_LAST;
+
+	async_data->header_buf = header_buf;
+	get_file(migf->filp);
+	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
+			       async_data->out,
+			       out_size, mlx5vf_save_callback,
+			       &async_data->cb_work);
+	if (err)
+		goto err_exec;
+
+	return 0;
+
+err_exec:
+	if (header_buf)
+		mlx5vf_put_data_buffer(header_buf);
+	fput(migf->filp);
+err_free:
+	kvfree(async_data->out);
+err_out:
+	complete(&migf->save_comp);
+	return err;
+}
+
+int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
+			       struct mlx5_vf_migration_file *migf,
+			       struct mlx5_vhca_data_buffer *buf)
+{
+	u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
+	int err;
+
+	lockdep_assert_held(&mvdev->state_mutex);
+	if (mvdev->mdev_detach)
+		return -ENOTCONN;
+
+	if (!buf->dmaed) {
+		err = mlx5vf_dma_data_buffer(buf);
+		if (err)
+			return err;
+	}
+
+	MLX5_SET(load_vhca_state_in, in, opcode,
+		 MLX5_CMD_OP_LOAD_VHCA_STATE);
+	MLX5_SET(load_vhca_state_in, in, op_mod, 0);
+	MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
+	MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
+	MLX5_SET(load_vhca_state_in, in, size, buf->length);
+	return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
+}
+
+int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
+{
+	int err;
+
+	lockdep_assert_held(&migf->mvdev->state_mutex);
+	if (migf->mvdev->mdev_detach)
+		return -ENOTCONN;
+
+	err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
+	return err;
+}
+
+void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
+{
+	lockdep_assert_held(&migf->mvdev->state_mutex);
+	if (migf->mvdev->mdev_detach)
+		return;
+
+	mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
+}
+
+void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
+{
+	struct mlx5_vhca_data_buffer *entry;
+
+	lockdep_assert_held(&migf->mvdev->state_mutex);
+	WARN_ON(migf->mvdev->mdev_detach);
+
+	if (migf->buf) {
+		mlx5vf_free_data_buffer(migf->buf);
+		migf->buf = NULL;
+	}
+
+	if (migf->buf_header) {
+		mlx5vf_free_data_buffer(migf->buf_header);
+		migf->buf_header = NULL;
+	}
+
+	list_splice(&migf->avail_list, &migf->buf_list);
+
+	while ((entry = list_first_entry_or_null(&migf->buf_list,
+				struct mlx5_vhca_data_buffer, buf_elm))) {
+		list_del(&entry->buf_elm);
+		mlx5vf_free_data_buffer(entry);
+	}
+
+	mlx5vf_cmd_dealloc_pd(migf);
+}
+
+static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
+				 struct mlx5vf_pci_core_device *mvdev,
+				 struct rb_root_cached *ranges, u32 nnodes)
+{
+	int max_num_range =
+		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
+	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
+	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
+	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
+	struct interval_tree_node *node = NULL;
+	u64 total_ranges_len = 0;
+	u32 num_ranges = nnodes;
+	u8 log_addr_space_size;
+	void *range_list_ptr;
+	void *obj_context;
+	void *cmd_hdr;
+	int inlen;
+	void *in;
+	int err;
+	int i;
+
+	if (num_ranges > max_num_range) {
+		vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
+		num_ranges = max_num_range;
+	}
+
+	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
+				 record_size * num_ranges;
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
+			       general_obj_in_cmd_hdr);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
+		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
+		 MLX5_OBJ_TYPE_PAGE_TRACK);
+	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
+	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
+	MLX5_SET(page_track, obj_context, track_type, 1);
+	MLX5_SET(page_track, obj_context, log_page_size,
+		 ilog2(tracker->host_qp->tracked_page_size));
+	MLX5_SET(page_track, obj_context, log_msg_size,
+		 ilog2(tracker->host_qp->max_msg_size));
+	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
+	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);
+
+	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
+	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
+	for (i = 0; i < num_ranges; i++) {
+		void *addr_range_i_base = range_list_ptr + record_size * i;
+		unsigned long length = node->last - node->start + 1;
+
+		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
+			   node->start);
+		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
+		total_ranges_len += length;
+		node = interval_tree_iter_next(node, 0, ULONG_MAX);
+	}
+
+	WARN_ON(node);
+	log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
+	if (log_addr_space_size <
+	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
+	    log_addr_space_size >
+	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	MLX5_SET(page_track, obj_context, log_addr_space_size,
+		 log_addr_space_size);
+	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
+	if (err)
+		goto out;
+
+	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
+out:
+	kfree(in);
+	return err;
+}
+
+static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
+				      u32 tracker_id)
+{
+	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
+	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
+
+	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
+	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
+	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);
+
+	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+}
+
+static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
+				     u32 tracker_id, unsigned long iova,
+				     unsigned long length, u32 tracker_state)
+{
+	u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
+	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
+	void *obj_context;
+	void *cmd_hdr;
+
+	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id);
+
+	obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);
+	MLX5_SET64(page_track, obj_context, modify_field_select, 0x3);
+	MLX5_SET64(page_track, obj_context, range_start_address, iova);
+	MLX5_SET64(page_track, obj_context, length, length);
+	MLX5_SET(page_track, obj_context, state, tracker_state);
+
+	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+}
+
+static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
+			     struct mlx5_vhca_cq_buf *buf, int nent,
+			     int cqe_size)
+{
+	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
+	u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
+	u8 log_wq_sz = ilog2(cqe_size);
+	int err;
+
+	err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
+				       mdev->priv.numa_node);
+	if (err)
+		return err;
+
+	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
+	buf->cqe_size = cqe_size;
+	buf->nent = nent;
+	return 0;
+}
+
+static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
+{
+	struct mlx5_cqe64 *cqe64;
+	void *cqe;
+	int i;
+
+	for (i = 0; i < buf->nent; i++) {
+		cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
+		cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
+		cqe64->op_own = MLX5_CQE_INVALID << 4;
+	}
+}
+
+static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
+			      struct mlx5_vhca_cq *cq)
+{
+	mlx5_core_destroy_cq(mdev, &cq->mcq);
+	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
+	mlx5_db_free(mdev, &cq->db);
+}
+
+static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
+{
+	if (type != MLX5_EVENT_TYPE_CQ_ERROR)
+		return;
+
+	set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
+				       tracker.cq.mcq));
+}
+
+static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
+				 void *data)
+{
+	struct mlx5_vhca_page_tracker *tracker =
+		mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
+	struct mlx5vf_pci_core_device *mvdev = container_of(
+		tracker, struct mlx5vf_pci_core_device, tracker);
+	struct mlx5_eqe *eqe = data;
+	u8 event_type = (u8)type;
+	u8 queue_type;
+	int qp_num;
+
+	switch (event_type) {
+	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+		queue_type = eqe->data.qp_srq.type;
+		if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
+			break;
+		qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
+		if (qp_num != tracker->host_qp->qpn &&
+		    qp_num != tracker->fw_qp->qpn)
+			break;
+		set_tracker_error(mvdev);
+		break;
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
+			       struct mlx5_eqe *eqe)
+{
+	struct mlx5vf_pci_core_device *mvdev =
+		container_of(mcq, struct mlx5vf_pci_core_device,
+			     tracker.cq.mcq);
+
+	complete(&mvdev->tracker_comp);
+}
+
+static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
+			    struct mlx5_vhca_page_tracker *tracker,
+			    size_t ncqe)
+{
+	int cqe_size = cache_line_size() == 128 ? 128 : 64;
+	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
+	struct mlx5_vhca_cq *cq;
+	int inlen, err, eqn;
+	void *cqc, *in;
+	__be64 *pas;
+	int vector;
+
+	cq = &tracker->cq;
+	ncqe = roundup_pow_of_two(ncqe);
+	err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
+	if (err)
+		return err;
+
+	cq->ncqe = ncqe;
+	cq->mcq.set_ci_db = cq->db.db;
+	cq->mcq.arm_db = cq->db.db + 1;
+	cq->mcq.cqe_sz = cqe_size;
+	err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
+	if (err)
+		goto err_db_free;
+
+	init_cq_frag_buf(&cq->buf);
+	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
+		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
+		cq->buf.frag_buf.npages;
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_buff;
+	}
+
+	vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev);
+	err = mlx5_comp_eqn_get(mdev, vector, &eqn);
+	if (err)
+		goto err_vec;
+
+	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
+	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
+	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
+	MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
+	MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
+		 MLX5_ADAPTER_PAGE_SHIFT);
+	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
+	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
+	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
+	cq->mcq.comp = mlx5vf_cq_complete;
+	cq->mcq.event = mlx5vf_cq_event;
+	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
+	if (err)
+		goto err_vec;
+
+	mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
+		    cq->mcq.cons_index);
+	kvfree(in);
+	return 0;
+
+err_vec:
+	kvfree(in);
+err_buff:
+	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
+err_db_free:
+	mlx5_db_free(mdev, &cq->db);
+	return err;
+}
+
+static struct mlx5_vhca_qp *
+mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
+		    struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
+{
+	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
+	struct mlx5_vhca_qp *qp;
+	u8 log_rq_stride;
+	u8 log_rq_sz;
+	void *qpc;
+	int inlen;
+	void *in;
+	int err;
+
+	qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT);
+	if (!qp)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
+	if (err)
+		goto err_free;
+
+	if (max_recv_wr) {
+		qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
+		log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
+		log_rq_sz = ilog2(qp->rq.wqe_cnt);
+		err = mlx5_frag_buf_alloc_node(mdev,
+			wq_get_byte_sz(log_rq_sz, log_rq_stride),
+			&qp->buf, mdev->priv.numa_node);
+		if (err)
+			goto err_db_free;
+		mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
+	}
+
+	qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
+	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
+		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
+		qp->buf.npages;
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_in;
+	}
+
+	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
+	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
+	MLX5_SET(qpc, qpc, pd, tracker->pdn);
+	MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
+	MLX5_SET(qpc, qpc, log_page_size,
+		 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
+	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
+	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
+		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
+	MLX5_SET(qpc, qpc, no_sq, 1);
+	if (max_recv_wr) {
+		MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
+		MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
+		MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
+		MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
+		MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
+		mlx5_fill_page_frag_array(&qp->buf,
+					  (__be64 *)MLX5_ADDR_OF(create_qp_in,
+								 in, pas));
+	} else {
+		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
+	}
+
+	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
+	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
+	kvfree(in);
+	if (err)
+		goto err_in;
+
+	qp->qpn = MLX5_GET(create_qp_out, out, qpn);
+	return qp;
+
+err_in:
+	if (max_recv_wr)
+		mlx5_frag_buf_free(mdev, &qp->buf);
+err_db_free:
+	mlx5_db_free(mdev, &qp->db);
+err_free:
+	kfree(qp);
+	return ERR_PTR(err);
+}
+
+static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
+{
+	struct mlx5_wqe_data_seg *data;
+	unsigned int ix;
+
+	WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
+	ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
+	data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
+	data->byte_count = cpu_to_be32(qp->max_msg_size);
+	data->lkey = cpu_to_be32(qp->recv_buf.mkey);
+	data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
+	qp->rq.pc++;
+	/* Make sure that descriptors are written before doorbell record. */
+	dma_wmb();
+	*qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
+}
+
+static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
+			      struct mlx5_vhca_qp *qp, u32 remote_qpn,
+			      bool host_qp)
+{
+	u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
+	u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
+	u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
+	void *qpc;
+	int ret;
+
+	/* Init */
+	qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
+	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
+	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
+	MLX5_SET(qpc, qpc, rre, 1);
+	MLX5_SET(qpc, qpc, rwe, 1);
+	MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
+	MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
+	ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
+	if (ret)
+		return ret;
+
+	if (host_qp) {
+		struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
+		int i;
+
+		for (i = 0; i < qp->rq.wqe_cnt; i++) {
+			mlx5vf_post_recv(qp);
+			recv_buf->next_rq_offset += qp->max_msg_size;
+		}
+	}
+
+	/* RTR */
+	qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
+	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
+	MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
+	MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
+	MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
+	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
+	MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
+	MLX5_SET(qpc, qpc, min_rnr_nak, 1);
+	MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
+	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
+	ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
+	if (ret || host_qp)
+		return ret;
+
+	/* RTS */
+	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
+	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
+	MLX5_SET(qpc, qpc, retry_count, 7);
+	MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
+	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
+	MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
+	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
+
+	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
+}
+
+static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
+			      struct mlx5_vhca_qp *qp)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
+
+	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
+	MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
+	mlx5_cmd_exec_in(mdev, destroy_qp, in);
+
+	mlx5_frag_buf_free(mdev, &qp->buf);
+	mlx5_db_free(mdev, &qp->db);
+	kfree(qp);
+}
+
+static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
+{
+	int i;
+
+	/* Undo alloc_pages_bulk_array() */
+	for (i = 0; i < recv_buf->npages; i++)
+		__free_page(recv_buf->page_list[i]);
+
+	kvfree(recv_buf->page_list);
+}
+
+static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
+			    unsigned int npages)
+{
+	unsigned int filled = 0, done = 0;
+	int i;
+
+	recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
+				       GFP_KERNEL_ACCOUNT);
+	if (!recv_buf->page_list)
+		return -ENOMEM;
+
+	for (;;) {
+		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
+						npages - done,
+						recv_buf->page_list + done);
+		if (!filled)
+			goto err;
+
+		done += filled;
+		if (done == npages)
+			break;
+	}
+
+	recv_buf->npages = npages;
+	return 0;
+
+err:
+	for (i = 0; i < npages; i++) {
+		if (recv_buf->page_list[i])
+			__free_page(recv_buf->page_list[i]);
+	}
+
+	kvfree(recv_buf->page_list);
+	return -ENOMEM;
+}
+
+static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
+				   struct mlx5_vhca_recv_buf *recv_buf)
+{
+	int i, j;
+
+	recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
+				       sizeof(*recv_buf->dma_addrs),
+				       GFP_KERNEL_ACCOUNT);
+	if (!recv_buf->dma_addrs)
+		return -ENOMEM;
+
+	for (i = 0; i < recv_buf->npages; i++) {
+		recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
+						      recv_buf->page_list[i],
+						      0, PAGE_SIZE,
+						      DMA_FROM_DEVICE);
+		if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
+			goto error;
+	}
+	return 0;
+
+error:
+	for (j = 0; j < i; j++)
+		dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
+				 PAGE_SIZE, DMA_FROM_DEVICE);
+
+	kvfree(recv_buf->dma_addrs);
+	return -ENOMEM;
+}
+
+static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
+				      struct mlx5_vhca_recv_buf *recv_buf)
+{
+	int i;
+
+	for (i = 0; i < recv_buf->npages; i++)
+		dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
+				 PAGE_SIZE, DMA_FROM_DEVICE);
+
+	kvfree(recv_buf->dma_addrs);
+}
+
+static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
+					  struct mlx5_vhca_qp *qp)
+{
+	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
+
+	mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
+	unregister_dma_recv_pages(mdev, recv_buf);
+	free_recv_pages(&qp->recv_buf);
+}
+
+static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
+					  struct mlx5_vhca_qp *qp, u32 pdn,
+					  u64 rq_size)
+{
+	unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
+	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
+	int err;
+
+	err = alloc_recv_pages(recv_buf, npages);
+	if (err < 0)
+		return err;
+
+	err = register_dma_recv_pages(mdev, recv_buf);
+	if (err)
+		goto end;
+
+	err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
+	if (err)
+		goto err_create_mkey;
+
+	return 0;
+
+err_create_mkey:
+	unregister_dma_recv_pages(mdev, recv_buf);
+end:
+	free_recv_pages(recv_buf);
+	return err;
+}
+
+static void
+_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
+{
+	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
+	struct mlx5_core_dev *mdev = mvdev->mdev;
+
+	lockdep_assert_held(&mvdev->state_mutex);
+
+	if (!mvdev->log_active)
+		return;
+
+	WARN_ON(mvdev->mdev_detach);
+
+	mlx5_eq_notifier_unregister(mdev, &tracker->nb);
+	mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
+	mlx5vf_destroy_qp(mdev, tracker->fw_qp);
+	mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
+	mlx5vf_destroy_qp(mdev, tracker->host_qp);
+	mlx5vf_destroy_cq(mdev, &tracker->cq);
+	mlx5_core_dealloc_pd(mdev, tracker->pdn);
+	mlx5_put_uars_page(mdev, tracker->uar);
+	mvdev->log_active = false;
+}
+
+int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
+{
+	struct mlx5vf_pci_core_device *mvdev = container_of(
+		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+
+	mutex_lock(&mvdev->state_mutex);
+	if (!mvdev->log_active)
+		goto end;
+
+	_mlx5vf_free_page_tracker_resources(mvdev);
+	mvdev->log_active = false;
+end:
+	mlx5vf_state_mutex_unlock(mvdev);
+	return 0;
+}
+
+int mlx5vf_start_page_tracker(struct vfio_device *vdev,
+			      struct rb_root_cached *ranges, u32 nnodes,
+			      u64 *page_size)
+{
+	struct mlx5vf_pci_core_device *mvdev = container_of(
+		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
+	u8 log_tracked_page = ilog2(*page_size);
+	struct mlx5_vhca_qp *host_qp;
+	struct mlx5_vhca_qp *fw_qp;
+	struct mlx5_core_dev *mdev;
+	u32 max_msg_size = PAGE_SIZE;
+	u64 rq_size = SZ_2M;
+	u32 max_recv_wr;
+	int err;
+
+	mutex_lock(&mvdev->state_mutex);
+	if (mvdev->mdev_detach) {
+		err = -ENOTCONN;
+		goto end;
+	}
+
+	if (mvdev->log_active) {
+		err = -EINVAL;
+		goto end;
+	}
+
+	mdev = mvdev->mdev;
+	memset(tracker, 0, sizeof(*tracker));
+	tracker->uar = mlx5_get_uars_page(mdev);
+	if (IS_ERR(tracker->uar)) {
+		err = PTR_ERR(tracker->uar);
+		goto end;
+	}
+
+	err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
+	if (err)
+		goto err_uar;
+
+	max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
+	err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
+	if (err)
+		goto err_dealloc_pd;
+
+	host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
+	if (IS_ERR(host_qp)) {
+		err = PTR_ERR(host_qp);
+		goto err_cq;
+	}
+
+	host_qp->max_msg_size = max_msg_size;
+	if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
+				pg_track_log_min_page_size)) {
+		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
+				pg_track_log_min_page_size);
+	} else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
+				pg_track_log_max_page_size)) {
+		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
+				pg_track_log_max_page_size);
+	}
+
+	host_qp->tracked_page_size = (1ULL << log_tracked_page);
+	err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
+					     rq_size);
+	if (err)
+		goto err_host_qp;
+
+	fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
+	if (IS_ERR(fw_qp)) {
+		err = PTR_ERR(fw_qp);
+		goto err_recv_resources;
+	}
+
+	err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
+	if (err)
+		goto err_activate;
+
+	err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
+	if (err)
+		goto err_activate;
+
+	tracker->host_qp = host_qp;
+	tracker->fw_qp = fw_qp;
+	err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
+	if (err)
+		goto err_activate;
+
+	MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
+	mlx5_eq_notifier_register(mdev, &tracker->nb);
+	*page_size = host_qp->tracked_page_size;
+	mvdev->log_active = true;
+	mlx5vf_state_mutex_unlock(mvdev);
+	return 0;
+
+err_activate:
+	mlx5vf_destroy_qp(mdev, fw_qp);
+err_recv_resources:
+	mlx5vf_free_qp_recv_resources(mdev, host_qp);
+err_host_qp:
+	mlx5vf_destroy_qp(mdev, host_qp);
+err_cq:
+	mlx5vf_destroy_cq(mdev, &tracker->cq);
+err_dealloc_pd:
+	mlx5_core_dealloc_pd(mdev, tracker->pdn);
+err_uar:
+	mlx5_put_uars_page(mdev, tracker->uar);
+end:
+	mlx5vf_state_mutex_unlock(mvdev);
+	return err;
+}
+
+static void
+set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
+		  struct iova_bitmap *dirty)
+{
+	u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
+	u32 nent = size / entry_size;
+	struct page *page;
+	u64 addr;
+	u64 *buf;
+	int i;
+
+	if (WARN_ON(index >= qp->recv_buf.npages ||
+		    (nent > qp->max_msg_size / entry_size)))
+		return;
+
+	page = qp->recv_buf.page_list[index];
+	buf = kmap_local_page(page);
+	for (i = 0; i < nent; i++) {
+		addr = MLX5_GET(page_track_report_entry, buf + i,
+				dirty_address_low);
+		addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
+				      dirty_address_high) << 32;
+		iova_bitmap_set(dirty, addr, qp->tracked_page_size);
+	}
+	kunmap_local(buf);
+}
+
+static void
+mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
+	      struct iova_bitmap *dirty, int *tracker_status)
+{
+	u32 size;
+	int ix;
+
+	qp->rq.cc++;
+	*tracker_status = be32_to_cpu(cqe->immediate) >> 28;
+	size = be32_to_cpu(cqe->byte_cnt);
+	ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);
+
+	/* zero length CQE, no data */
+	WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
+	if (size)
+		set_report_output(size, ix, qp, dirty);
+
+	qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
+	mlx5vf_post_recv(qp);
+}
+
+static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
+{
+	return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
+}
+
+static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
+{
+	void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
+	struct mlx5_cqe64 *cqe64;
+
+	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
+
+	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
+	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
+		return cqe64;
+	} else {
+		return NULL;
+	}
+}
+
+static int
+mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
+		   struct iova_bitmap *dirty, int *tracker_status)
+{
+	struct mlx5_cqe64 *cqe;
+	u8 opcode;
+
+	cqe = get_sw_cqe(cq, cq->mcq.cons_index);
+	if (!cqe)
+		return CQ_EMPTY;
+
+	++cq->mcq.cons_index;
+	/*
+	 * Make sure we read CQ entry contents after we've checked the
+	 * ownership bit.
+	 */
+	rmb();
+	opcode = get_cqe_opcode(cqe);
+	switch (opcode) {
+	case MLX5_CQE_RESP_SEND_IMM:
+		mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
+		return CQ_OK;
+	default:
+		return CQ_POLL_ERR;
+	}
+}
+
+int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
+				  unsigned long length,
+				  struct iova_bitmap *dirty)
+{
+	struct mlx5vf_pci_core_device *mvdev = container_of(
+		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
+	struct mlx5_vhca_cq *cq = &tracker->cq;
+	struct mlx5_core_dev *mdev;
+	int poll_err, err;
+
+	mutex_lock(&mvdev->state_mutex);
+	if (!mvdev->log_active) {
+		err = -EINVAL;
+		goto end;
+	}
+
+	if (mvdev->mdev_detach) {
+		err = -ENOTCONN;
+		goto end;
+	}
+
+	mdev = mvdev->mdev;
+	err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
+					MLX5_PAGE_TRACK_STATE_REPORTING);
+	if (err)
+		goto end;
+
+	tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
+	while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
+	       !tracker->is_err) {
+		poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
+					      &tracker->status);
+		if (poll_err == CQ_EMPTY) {
+			mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
+				    cq->mcq.cons_index);
+			poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
+						      dirty, &tracker->status);
+			if (poll_err == CQ_EMPTY) {
+				wait_for_completion(&mvdev->tracker_comp);
+				continue;
+			}
+		}
+		if (poll_err == CQ_POLL_ERR) {
+			err = -EIO;
+			goto end;
+		}
+		mlx5_cq_set_ci(&cq->mcq);
+	}
+
+	if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
+		tracker->is_err = true;
+
+	if (tracker->is_err)
+		err = -EIO;
+end:
+	mlx5vf_state_mutex_unlock(mvdev);
+	return err;
+}
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
new file mode 100644
index 0000000000..aec4c69dd6
--- /dev/null
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -0,0 +1,225 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+
+#ifndef MLX5_VFIO_CMD_H
+#define MLX5_VFIO_CMD_H
+
+#include <linux/kernel.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/vport.h>
+#include <linux/mlx5/cq.h>
+#include <linux/mlx5/qp.h>
+
+#define MLX5VF_PRE_COPY_SUPP(mvdev) \
+	((mvdev)->core_device.vdev.migration_flags & VFIO_MIGRATION_PRE_COPY)
+
+enum mlx5_vf_migf_state {
+	MLX5_MIGF_STATE_ERROR = 1,
+	MLX5_MIGF_STATE_PRE_COPY_ERROR,
+	MLX5_MIGF_STATE_PRE_COPY,
+	MLX5_MIGF_STATE_SAVE_LAST,
+	MLX5_MIGF_STATE_COMPLETE,
+};
+
+enum mlx5_vf_load_state {
+	MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER,
+	MLX5_VF_LOAD_STATE_READ_HEADER,
+	MLX5_VF_LOAD_STATE_PREP_HEADER_DATA,
+	MLX5_VF_LOAD_STATE_READ_HEADER_DATA,
+	MLX5_VF_LOAD_STATE_PREP_IMAGE,
+	MLX5_VF_LOAD_STATE_READ_IMAGE,
+	MLX5_VF_LOAD_STATE_LOAD_IMAGE,
+};
+
+struct mlx5_vf_migration_tag_stop_copy_data {
+	__le64 stop_copy_size;
+};
+
+enum mlx5_vf_migf_header_flags {
+	MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY = 0,
+	MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL = 1 << 0,
+};
+
+enum mlx5_vf_migf_header_tag {
+	MLX5_MIGF_HEADER_TAG_FW_DATA = 0,
+	MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE = 1 << 0,
+};
+
+struct mlx5_vf_migration_header {
+	__le64 record_size;
+	/* For future use in case we may need to change the kernel protocol */
+	__le32 flags; /* Use mlx5_vf_migf_header_flags */
+	__le32 tag; /* Use mlx5_vf_migf_header_tag */
+	__u8 data[]; /* Its size is given in the record_size */
+};
+
+struct mlx5_vhca_data_buffer {
+	struct sg_append_table table;
+	loff_t start_pos;
+	u64 length;
+	u64 allocated_length;
+	u32 mkey;
+	enum dma_data_direction dma_dir;
+	u8 dmaed:1;
+	struct list_head buf_elm;
+	struct mlx5_vf_migration_file *migf;
+	/* Optimize mlx5vf_get_migration_page() for sequential access */
+	struct scatterlist *last_offset_sg;
+	unsigned int sg_last_entry;
+	unsigned long last_offset;
+};
+
+struct mlx5vf_async_data {
+	struct mlx5_async_work cb_work;
+	struct work_struct work;
+	struct mlx5_vhca_data_buffer *buf;
+	struct mlx5_vhca_data_buffer *header_buf;
+	int status;
+	u8 last_chunk:1;
+	void *out;
+};
+
+struct mlx5_vf_migration_file {
+	struct file *filp;
+	struct mutex lock;
+	enum mlx5_vf_migf_state state;
+
+	enum mlx5_vf_load_state load_state;
+	u32 pdn;
+	loff_t max_pos;
+	u64 record_size;
+	u32 record_tag;
+	u64 stop_copy_prep_size;
+	u64 pre_copy_initial_bytes;
+	struct mlx5_vhca_data_buffer *buf;
+	struct mlx5_vhca_data_buffer *buf_header;
+	spinlock_t list_lock;
+	struct list_head buf_list;
+	struct list_head avail_list;
+	struct mlx5vf_pci_core_device *mvdev;
+	wait_queue_head_t poll_wait;
+	struct completion save_comp;
+	struct mlx5_async_ctx async_ctx;
+	struct mlx5vf_async_data async_data;
+};
+
+struct mlx5_vhca_cq_buf {
+	struct mlx5_frag_buf_ctrl fbc;
+	struct mlx5_frag_buf frag_buf;
+	int cqe_size;
+	int nent;
+};
+
+struct mlx5_vhca_cq {
+	struct mlx5_vhca_cq_buf buf;
+	struct mlx5_db db;
+	struct mlx5_core_cq mcq;
+	size_t ncqe;
+};
+
+struct mlx5_vhca_recv_buf {
+	u32 npages;
+	struct page **page_list;
+	dma_addr_t *dma_addrs;
+	u32 next_rq_offset;
+	u32 mkey;
+};
+
+struct mlx5_vhca_qp {
+	struct mlx5_frag_buf buf;
+	struct mlx5_db db;
+	struct mlx5_vhca_recv_buf recv_buf;
+	u32 tracked_page_size;
+	u32 max_msg_size;
+	u32 qpn;
+	struct {
+		unsigned int pc;
+		unsigned int cc;
+		unsigned int wqe_cnt;
+		__be32 *db;
+		struct mlx5_frag_buf_ctrl fbc;
+	} rq;
+};
+
+struct mlx5_vhca_page_tracker {
+	u32 id;
+	u32 pdn;
+	u8 is_err:1;
+	struct mlx5_uars_page *uar;
+	struct mlx5_vhca_cq cq;
+	struct mlx5_vhca_qp *host_qp;
+	struct mlx5_vhca_qp *fw_qp;
+	struct mlx5_nb nb;
+	int status;
+};
+
+struct mlx5vf_pci_core_device {
+	struct vfio_pci_core_device core_device;
+	int vf_id;
+	u16 vhca_id;
+	u8 migrate_cap:1;
+	u8 deferred_reset:1;
+	u8 mdev_detach:1;
+	u8 log_active:1;
+	struct completion tracker_comp;
+	/* protect migration state */
+	struct mutex state_mutex;
+	enum vfio_device_mig_state mig_state;
+	/* protect the reset_done flow */
+	spinlock_t reset_lock;
+	struct mlx5_vf_migration_file *resuming_migf;
+	struct mlx5_vf_migration_file *saving_migf;
+	struct mlx5_vhca_page_tracker tracker;
+	struct workqueue_struct *cb_wq;
+	struct notifier_block nb;
+	struct mlx5_core_dev *mdev;
+};
+
+enum {
+	MLX5VF_QUERY_INC = (1UL << 0),
+	MLX5VF_QUERY_FINAL = (1UL << 1),
+};
+
+int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
+int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
+int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
+					  size_t *state_size, u8 query_flags);
+void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
+			       const struct vfio_migration_ops *mig_ops,
+			       const struct vfio_log_ops *log_ops);
+void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev);
+void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev);
+int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
+			       struct mlx5_vf_migration_file *migf,
+			       struct mlx5_vhca_data_buffer *buf, bool inc,
+			       bool track);
+int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
+			       struct mlx5_vf_migration_file *migf,
+			       struct mlx5_vhca_data_buffer *buf);
+int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf);
+void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf);
+void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf);
+struct mlx5_vhca_data_buffer *
+mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
+			 size_t length, enum dma_data_direction dma_dir);
+void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf);
+struct mlx5_vhca_data_buffer *
+mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
+		       size_t length, enum dma_data_direction dma_dir);
+void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf);
+int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
+			       unsigned int npages);
+struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
+				       unsigned long offset);
+void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev);
+void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev);
+void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work);
+int mlx5vf_start_page_tracker(struct vfio_device *vdev,
+		struct rb_root_cached *ranges, u32 nnodes, u64 *page_size);
+int mlx5vf_stop_page_tracker(struct vfio_device *vdev);
+int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
+			unsigned long length, struct iova_bitmap *dirty);
+#endif /* MLX5_VFIO_CMD_H */
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
new file mode 100644
index 0000000000..42ec574a86
--- /dev/null
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -0,0 +1,1383 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include <linux/device.h>
+#include <linux/eventfd.h>
+#include <linux/file.h>
+#include <linux/interrupt.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/pci.h>
+#include <linux/pm_runtime.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/sched/mm.h>
+#include <linux/anon_inodes.h>
+
+#include "cmd.h"
+
+/* Device specification max LOAD size */
+#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)
+
+static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
+{
+	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
+
+	return container_of(core_device, struct mlx5vf_pci_core_device,
+			    core_device);
+}
+
+struct page *
+mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
+			  unsigned long offset)
+{
+	unsigned long cur_offset = 0;
+	struct scatterlist *sg;
+	unsigned int i;
+
+	/* All accesses are sequential */
+	if (offset < buf->last_offset || !buf->last_offset_sg) {
+		buf->last_offset = 0;
+		buf->last_offset_sg = buf->table.sgt.sgl;
+		buf->sg_last_entry = 0;
+	}
+
+	cur_offset = buf->last_offset;
+
+	for_each_sg(buf->last_offset_sg, sg,
+			buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
+		if (offset < sg->length + cur_offset) {
+			buf->last_offset_sg = sg;
+			buf->sg_last_entry += i;
+			buf->last_offset = cur_offset;
+			return nth_page(sg_page(sg),
+					(offset - cur_offset) / PAGE_SIZE);
+		}
+		cur_offset += sg->length;
+	}
+	return NULL;
+}
+
+int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
+			       unsigned int npages)
+{
+	unsigned int to_alloc = npages;
+	struct page **page_list;
+	unsigned long filled;
+	unsigned int to_fill;
+	int ret;
+
+	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
+	page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
+	if (!page_list)
+		return -ENOMEM;
+
+	do {
+		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
+						page_list);
+		if (!filled) {
+			ret = -ENOMEM;
+			goto err;
+		}
+		to_alloc -= filled;
+		ret = sg_alloc_append_table_from_pages(
+			&buf->table, page_list, filled, 0,
+			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
+			GFP_KERNEL_ACCOUNT);
+
+		if (ret)
+			goto err;
+		buf->allocated_length += filled * PAGE_SIZE;
+		/* clean input for another bulk allocation */
+		memset(page_list, 0, filled * sizeof(*page_list));
+		to_fill = min_t(unsigned int, to_alloc,
+				PAGE_SIZE / sizeof(*page_list));
+	} while (to_alloc > 0);
+
+	kvfree(page_list);
+	return 0;
+
+err:
+	kvfree(page_list);
+	return ret;
+}
+
+static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
+{
+	mutex_lock(&migf->lock);
+	migf->state = MLX5_MIGF_STATE_ERROR;
+	migf->filp->f_pos = 0;
+	mutex_unlock(&migf->lock);
+}
+
+static int mlx5vf_release_file(struct inode *inode, struct file *filp)
+{
+	struct mlx5_vf_migration_file *migf = filp->private_data;
+
+	mlx5vf_disable_fd(migf);
+	mutex_destroy(&migf->lock);
+	kfree(migf);
+	return 0;
+}
+
+static struct mlx5_vhca_data_buffer *
+mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
+			      bool *end_of_data)
+{
+	struct mlx5_vhca_data_buffer *buf;
+	bool found = false;
+
+	*end_of_data = false;
+	spin_lock_irq(&migf->list_lock);
+	if (list_empty(&migf->buf_list)) {
+		*end_of_data = true;
+		goto end;
+	}
+
+	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
+			       buf_elm);
+	if (pos >= buf->start_pos &&
+	    pos < buf->start_pos + buf->length) {
+		found = true;
+		goto end;
+	}
+
+	/*
+	 * As we use a stream based FD we may expect having the data always
+	 * on first chunk
+	 */
+	migf->state = MLX5_MIGF_STATE_ERROR;
+
+end:
+	spin_unlock_irq(&migf->list_lock);
+	return found ? buf : NULL;
+}
+
+static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
+			       char __user **buf, size_t *len, loff_t *pos)
+{
+	unsigned long offset;
+	ssize_t done = 0;
+	size_t copy_len;
+
+	copy_len = min_t(size_t,
+			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
+	while (copy_len) {
+		size_t page_offset;
+		struct page *page;
+		size_t page_len;
+		u8 *from_buff;
+		int ret;
+
+		offset = *pos - vhca_buf->start_pos;
+		page_offset = offset % PAGE_SIZE;
+		offset -= page_offset;
+		page = mlx5vf_get_migration_page(vhca_buf, offset);
+		if (!page)
+			return -EINVAL;
+		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
+		from_buff = kmap_local_page(page);
+		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
+		kunmap_local(from_buff);
+		if (ret)
+			return -EFAULT;
+		*pos += page_len;
+		*len -= page_len;
+		*buf += page_len;
+		done += page_len;
+		copy_len -= page_len;
+	}
+
+	if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
+		spin_lock_irq(&vhca_buf->migf->list_lock);
+		list_del_init(&vhca_buf->buf_elm);
+		list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
+		spin_unlock_irq(&vhca_buf->migf->list_lock);
+	}
+
+	return done;
+}
+
+static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
+			       loff_t *pos)
+{
+	struct mlx5_vf_migration_file *migf = filp->private_data;
+	struct mlx5_vhca_data_buffer *vhca_buf;
+	bool first_loop_call = true;
+	bool end_of_data;
+	ssize_t done = 0;
+
+	if (pos)
+		return -ESPIPE;
+	pos = &filp->f_pos;
+
+	if (!(filp->f_flags & O_NONBLOCK)) {
+		if (wait_event_interruptible(migf->poll_wait,
+				!list_empty(&migf->buf_list) ||
+				migf->state == MLX5_MIGF_STATE_ERROR ||
+				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
+				migf->state == MLX5_MIGF_STATE_PRE_COPY ||
+				migf->state == MLX5_MIGF_STATE_COMPLETE))
+			return -ERESTARTSYS;
+	}
+
+	mutex_lock(&migf->lock);
+	if (migf->state == MLX5_MIGF_STATE_ERROR) {
+		done = -ENODEV;
+		goto out_unlock;
+	}
+
+	while (len) {
+		ssize_t count;
+
+		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
+							 &end_of_data);
+		if (first_loop_call) {
+			first_loop_call = false;
+			/* Temporary end of file as part of PRE_COPY */
+			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
+				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
+				done = -ENOMSG;
+				goto out_unlock;
+			}
+
+			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
+				if (filp->f_flags & O_NONBLOCK) {
+					done = -EAGAIN;
+					goto out_unlock;
+				}
+			}
+		}
+
+		if (end_of_data)
+			goto out_unlock;
+
+		if (!vhca_buf) {
+			done = -EINVAL;
+			goto out_unlock;
+		}
+
+		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
+		if (count < 0) {
+			done = count;
+			goto out_unlock;
+		}
+		done += count;
+	}
+
+out_unlock:
+	mutex_unlock(&migf->lock);
+	return done;
+}
+
+static __poll_t mlx5vf_save_poll(struct file *filp,
+				 struct poll_table_struct *wait)
+{
+	struct mlx5_vf_migration_file *migf = filp->private_data;
+	__poll_t pollflags = 0;
+
+	poll_wait(filp, &migf->poll_wait, wait);
+
+	mutex_lock(&migf->lock);
+	if (migf->state == MLX5_MIGF_STATE_ERROR)
+		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+	else if (!list_empty(&migf->buf_list) ||
+		 migf->state == MLX5_MIGF_STATE_COMPLETE)
+		pollflags = EPOLLIN | EPOLLRDNORM;
+	mutex_unlock(&migf->lock);
+
+	return pollflags;
+}
+
+/*
+ * FD is exposed and user can use it after receiving an error.
+ * Mark migf in error, and wake the user.
+ */
+static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
+{
+	migf->state = MLX5_MIGF_STATE_ERROR;
+	wake_up_interruptible(&migf->poll_wait);
+}
+
+static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
+{
+	size_t size = sizeof(struct mlx5_vf_migration_header) +
+		sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
+	struct mlx5_vf_migration_tag_stop_copy_data data = {};
+	struct mlx5_vhca_data_buffer *header_buf = NULL;
+	struct mlx5_vf_migration_header header = {};
+	unsigned long flags;
+	struct page *page;
+	u8 *to_buff;
+	int ret;
+
+	header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
+	if (IS_ERR(header_buf))
+		return PTR_ERR(header_buf);
+
+	header.record_size = cpu_to_le64(sizeof(data));
+	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
+	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
+	page = mlx5vf_get_migration_page(header_buf, 0);
+	if (!page) {
+		ret = -EINVAL;
+		goto err;
+	}
+	to_buff = kmap_local_page(page);
+	memcpy(to_buff, &header, sizeof(header));
+	header_buf->length = sizeof(header);
+	data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length);
+	memcpy(to_buff + sizeof(header), &data, sizeof(data));
+	header_buf->length += sizeof(data);
+	kunmap_local(to_buff);
+	header_buf->start_pos = header_buf->migf->max_pos;
+	migf->max_pos += header_buf->length;
+	spin_lock_irqsave(&migf->list_lock, flags);
+	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
+	spin_unlock_irqrestore(&migf->list_lock, flags);
+	migf->pre_copy_initial_bytes = size;
+	return 0;
+err:
+	mlx5vf_put_data_buffer(header_buf);
+	return ret;
+}
+
+static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf,
+				 size_t state_size)
+{
+	struct mlx5_vhca_data_buffer *buf;
+	size_t inc_state_size;
+	int ret;
+
+	/* let's be ready for stop_copy size that might grow by 10 percents */
+	if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
+		inc_state_size = state_size;
+
+	buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
+	if (IS_ERR(buf))
+		return PTR_ERR(buf);
+
+	migf->buf = buf;
+	buf = mlx5vf_get_data_buffer(migf,
+			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+	if (IS_ERR(buf)) {
+		ret = PTR_ERR(buf);
+		goto err;
+	}
+
+	migf->buf_header = buf;
+	ret = mlx5vf_add_stop_copy_header(migf);
+	if (ret)
+		goto err_header;
+	return 0;
+
+err_header:
+	mlx5vf_put_data_buffer(migf->buf_header);
+	migf->buf_header = NULL;
+err:
+	mlx5vf_put_data_buffer(migf->buf);
+	migf->buf = NULL;
+	return ret;
+}
+
+static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
+				 unsigned long arg)
+{
+	struct mlx5_vf_migration_file *migf = filp->private_data;
+	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
+	struct mlx5_vhca_data_buffer *buf;
+	struct vfio_precopy_info info = {};
+	loff_t *pos = &filp->f_pos;
+	unsigned long minsz;
+	size_t inc_length = 0;
+	bool end_of_data = false;
+	int ret;
+
+	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+		return -ENOTTY;
+
+	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+
+	if (copy_from_user(&info, (void __user *)arg, minsz))
+		return -EFAULT;
+
+	if (info.argsz < minsz)
+		return -EINVAL;
+
+	mutex_lock(&mvdev->state_mutex);
+	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
+	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
+		ret = -EINVAL;
+		goto err_state_unlock;
+	}
+
+	/*
+	 * We can't issue a SAVE command when the device is suspended, so as
+	 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra
+	 * bytes that can't be read.
+	 */
+	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
+		/*
+		 * Once the query returns it's guaranteed that there is no
+		 * active SAVE command.
+		 * As so, the other code below is safe with the proper locks.
+		 */
+		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
+							    MLX5VF_QUERY_INC);
+		if (ret)
+			goto err_state_unlock;
+	}
+
+	mutex_lock(&migf->lock);
+	if (migf->state == MLX5_MIGF_STATE_ERROR) {
+		ret = -ENODEV;
+		goto err_migf_unlock;
+	}
+
+	if (migf->pre_copy_initial_bytes > *pos) {
+		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
+	} else {
+		info.dirty_bytes = migf->max_pos - *pos;
+		if (!info.dirty_bytes)
+			end_of_data = true;
+		info.dirty_bytes += inc_length;
+	}
+
+	if (!end_of_data || !inc_length) {
+		mutex_unlock(&migf->lock);
+		goto done;
+	}
+
+	mutex_unlock(&migf->lock);
+	/*
+	 * We finished transferring the current state and the device has a
+	 * dirty state, save a new state to be ready for.
+	 */
+	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
+	if (IS_ERR(buf)) {
+		ret = PTR_ERR(buf);
+		mlx5vf_mark_err(migf);
+		goto err_state_unlock;
+	}
+
+	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
+	if (ret) {
+		mlx5vf_mark_err(migf);
+		mlx5vf_put_data_buffer(buf);
+		goto err_state_unlock;
+	}
+
+done:
+	mlx5vf_state_mutex_unlock(mvdev);
+	if (copy_to_user((void __user *)arg, &info, minsz))
+		return -EFAULT;
+	return 0;
+
+err_migf_unlock:
+	mutex_unlock(&migf->lock);
+err_state_unlock:
+	mlx5vf_state_mutex_unlock(mvdev);
+	return ret;
+}
+
+static const struct file_operations mlx5vf_save_fops = {
+	.owner = THIS_MODULE,
+	.read = mlx5vf_save_read,
+	.poll = mlx5vf_save_poll,
+	.unlocked_ioctl = mlx5vf_precopy_ioctl,
+	.compat_ioctl = compat_ptr_ioctl,
+	.release = mlx5vf_release_file,
+	.llseek = no_llseek,
+};
+
+static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
+{
+	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
+	struct mlx5_vhca_data_buffer *buf;
+	size_t length;
+	int ret;
+
+	if (migf->state == MLX5_MIGF_STATE_ERROR)
+		return -ENODEV;
+
+	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
+				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
+	if (ret)
+		goto err;
+
+	/* Checking whether we have a matching pre-allocated buffer that can fit */
+	if (migf->buf && migf->buf->allocated_length >= length) {
+		buf = migf->buf;
+		migf->buf = NULL;
+	} else {
+		buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
+		if (IS_ERR(buf)) {
+			ret = PTR_ERR(buf);
+			goto err;
+		}
+	}
+
+	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
+	if (ret)
+		goto err_save;
+
+	return 0;
+
+err_save:
+	mlx5vf_put_data_buffer(buf);
+err:
+	mlx5vf_mark_err(migf);
+	return ret;
+}
+
+static struct mlx5_vf_migration_file *
+mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
+{
+	struct mlx5_vf_migration_file *migf;
+	struct mlx5_vhca_data_buffer *buf;
+	size_t length;
+	int ret;
+
+	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
+	if (!migf)
+		return ERR_PTR(-ENOMEM);
+
+	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
+					O_RDONLY);
+	if (IS_ERR(migf->filp)) {
+		ret = PTR_ERR(migf->filp);
+		goto end;
+	}
+
+	migf->mvdev = mvdev;
+	ret = mlx5vf_cmd_alloc_pd(migf);
+	if (ret)
+		goto out_free;
+
+	stream_open(migf->filp->f_inode, migf->filp);
+	mutex_init(&migf->lock);
+	init_waitqueue_head(&migf->poll_wait);
+	init_completion(&migf->save_comp);
+	/*
+	 * save_comp is being used as a binary semaphore built from
+	 * a completion. A normal mutex cannot be used because the lock is
+	 * passed between kernel threads and lockdep can't model this.
+	 */
+	complete(&migf->save_comp);
+	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
+	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
+	INIT_LIST_HEAD(&migf->buf_list);
+	INIT_LIST_HEAD(&migf->avail_list);
+	spin_lock_init(&migf->list_lock);
+	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
+	if (ret)
+		goto out_pd;
+
+	if (track) {
+		ret = mlx5vf_prep_stop_copy(migf, length);
+		if (ret)
+			goto out_pd;
+	}
+
+	buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
+	if (IS_ERR(buf)) {
+		ret = PTR_ERR(buf);
+		goto out_pd;
+	}
+
+	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
+	if (ret)
+		goto out_save;
+	return migf;
+out_save:
+	mlx5vf_free_data_buffer(buf);
+out_pd:
+	mlx5fv_cmd_clean_migf_resources(migf);
+out_free:
+	fput(migf->filp);
+end:
+	kfree(migf);
+	return ERR_PTR(ret);
+}
+
+static int
+mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
+			      const char __user **buf, size_t *len,
+			      loff_t *pos, ssize_t *done)
+{
+	unsigned long offset;
+	size_t page_offset;
+	struct page *page;
+	size_t page_len;
+	u8 *to_buff;
+	int ret;
+
+	offset = *pos - vhca_buf->start_pos;
+	page_offset = offset % PAGE_SIZE;
+
+	page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
+	if (!page)
+		return -EINVAL;
+	page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
+	to_buff = kmap_local_page(page);
+	ret = copy_from_user(to_buff + page_offset, *buf, page_len);
+	kunmap_local(to_buff);
+	if (ret)
+		return -EFAULT;
+
+	*pos += page_len;
+	*done += page_len;
+	*buf += page_len;
+	*len -= page_len;
+	vhca_buf->length += page_len;
+	return 0;
+}
+
+static int
+mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
+				   loff_t requested_length,
+				   const char __user **buf, size_t *len,
+				   loff_t *pos, ssize_t *done)
+{
+	int ret;
+
+	if (requested_length > MAX_LOAD_SIZE)
+		return -ENOMEM;
+
+	if (vhca_buf->allocated_length < requested_length) {
+		ret = mlx5vf_add_migration_pages(
+			vhca_buf,
+			DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
+				     PAGE_SIZE));
+		if (ret)
+			return ret;
+	}
+
+	while (*len) {
+		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
+						    done);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static ssize_t
+mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
+			 struct mlx5_vhca_data_buffer *vhca_buf,
+			 size_t image_size, const char __user **buf,
+			 size_t *len, loff_t *pos, ssize_t *done,
+			 bool *has_work)
+{
+	size_t copy_len, to_copy;
+	int ret;
+
+	to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
+	copy_len = to_copy;
+	while (to_copy) {
+		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
+						    done);
+		if (ret)
+			return ret;
+	}
+
+	*len -= copy_len;
+	if (vhca_buf->length == image_size) {
+		migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
+		migf->max_pos += image_size;
+		*has_work = true;
+	}
+
+	return 0;
+}
+
+static int
+mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
+			       struct mlx5_vhca_data_buffer *vhca_buf,
+			       const char __user **buf, size_t *len,
+			       loff_t *pos, ssize_t *done)
+{
+	size_t copy_len, to_copy;
+	size_t required_data;
+	u8 *to_buff;
+	int ret;
+
+	required_data = migf->record_size - vhca_buf->length;
+	to_copy = min_t(size_t, *len, required_data);
+	copy_len = to_copy;
+	while (to_copy) {
+		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
+						    done);
+		if (ret)
+			return ret;
+	}
+
+	*len -= copy_len;
+	if (vhca_buf->length == migf->record_size) {
+		switch (migf->record_tag) {
+		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
+		{
+			struct page *page;
+
+			page = mlx5vf_get_migration_page(vhca_buf, 0);
+			if (!page)
+				return -EINVAL;
+			to_buff = kmap_local_page(page);
+			migf->stop_copy_prep_size = min_t(u64,
+				le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
+			kunmap_local(to_buff);
+			break;
+		}
+		default:
+			/* Optional tag */
+			break;
+		}
+
+		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
+		migf->max_pos += migf->record_size;
+		vhca_buf->length = 0;
+	}
+
+	return 0;
+}
+
+static int
+mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
+			  struct mlx5_vhca_data_buffer *vhca_buf,
+			  const char __user **buf,
+			  size_t *len, loff_t *pos,
+			  ssize_t *done, bool *has_work)
+{
+	struct page *page;
+	size_t copy_len;
+	u8 *to_buff;
+	int ret;
+
+	copy_len = min_t(size_t, *len,
+		sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
+	page = mlx5vf_get_migration_page(vhca_buf, 0);
+	if (!page)
+		return -EINVAL;
+	to_buff = kmap_local_page(page);
+	ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
+	if (ret) {
+		ret = -EFAULT;
+		goto end;
+	}
+
+	*buf += copy_len;
+	*pos += copy_len;
+	*done += copy_len;
+	*len -= copy_len;
+	vhca_buf->length += copy_len;
+	if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
+		u64 record_size;
+		u32 flags;
+
+		record_size = le64_to_cpup((__le64 *)to_buff);
+		if (record_size > MAX_LOAD_SIZE) {
+			ret = -ENOMEM;
+			goto end;
+		}
+
+		migf->record_size = record_size;
+		flags = le32_to_cpup((__le32 *)(to_buff +
+			    offsetof(struct mlx5_vf_migration_header, flags)));
+		migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
+			    offsetof(struct mlx5_vf_migration_header, tag)));
+		switch (migf->record_tag) {
+		case MLX5_MIGF_HEADER_TAG_FW_DATA:
+			migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
+			break;
+		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
+			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
+			break;
+		default:
+			if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
+				ret = -EOPNOTSUPP;
+				goto end;
+			}
+			/* We may read and skip this optional record data */
+			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
+		}
+
+		migf->max_pos += vhca_buf->length;
+		vhca_buf->length = 0;
+		*has_work = true;
+	}
+end:
+	kunmap_local(to_buff);
+	return ret;
+}
+
+static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
+				   size_t len, loff_t *pos)
+{
+	struct mlx5_vf_migration_file *migf = filp->private_data;
+	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
+	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
+	loff_t requested_length;
+	bool has_work = false;
+	ssize_t done = 0;
+	int ret = 0;
+
+	if (pos)
+		return -ESPIPE;
+	pos = &filp->f_pos;
+
+	if (*pos < 0 ||
+	    check_add_overflow((loff_t)len, *pos, &requested_length))
+		return -EINVAL;
+
+	mutex_lock(&migf->mvdev->state_mutex);
+	mutex_lock(&migf->lock);
+	if (migf->state == MLX5_MIGF_STATE_ERROR) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	while (len || has_work) {
+		has_work = false;
+		switch (migf->load_state) {
+		case MLX5_VF_LOAD_STATE_READ_HEADER:
+			ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
+							&buf, &len, pos,
+							&done, &has_work);
+			if (ret)
+				goto out_unlock;
+			break;
+		case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
+			if (vhca_buf_header->allocated_length < migf->record_size) {
+				mlx5vf_free_data_buffer(vhca_buf_header);
+
+				migf->buf_header = mlx5vf_alloc_data_buffer(migf,
+						migf->record_size, DMA_NONE);
+				if (IS_ERR(migf->buf_header)) {
+					ret = PTR_ERR(migf->buf_header);
+					migf->buf_header = NULL;
+					goto out_unlock;
+				}
+
+				vhca_buf_header = migf->buf_header;
+			}
+
+			vhca_buf_header->start_pos = migf->max_pos;
+			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
+			break;
+		case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
+			ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
+							&buf, &len, pos, &done);
+			if (ret)
+				goto out_unlock;
+			break;
+		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
+		{
+			u64 size = max(migf->record_size,
+				       migf->stop_copy_prep_size);
+
+			if (vhca_buf->allocated_length < size) {
+				mlx5vf_free_data_buffer(vhca_buf);
+
+				migf->buf = mlx5vf_alloc_data_buffer(migf,
+							size, DMA_TO_DEVICE);
+				if (IS_ERR(migf->buf)) {
+					ret = PTR_ERR(migf->buf);
+					migf->buf = NULL;
+					goto out_unlock;
+				}
+
+				vhca_buf = migf->buf;
+			}
+
+			vhca_buf->start_pos = migf->max_pos;
+			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
+			break;
+		}
+		case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
+			ret = mlx5vf_resume_read_image_no_header(vhca_buf,
+						requested_length,
+						&buf, &len, pos, &done);
+			if (ret)
+				goto out_unlock;
+			break;
+		case MLX5_VF_LOAD_STATE_READ_IMAGE:
+			ret = mlx5vf_resume_read_image(migf, vhca_buf,
+						migf->record_size,
+						&buf, &len, pos, &done, &has_work);
+			if (ret)
+				goto out_unlock;
+			break;
+		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
+			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
+			if (ret)
+				goto out_unlock;
+			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
+
+			/* prep header buf for next image */
+			vhca_buf_header->length = 0;
+			/* prep data buf for next image */
+			vhca_buf->length = 0;
+
+			break;
+		default:
+			break;
+		}
+	}
+
+out_unlock:
+	if (ret)
+		migf->state = MLX5_MIGF_STATE_ERROR;
+	mutex_unlock(&migf->lock);
+	mlx5vf_state_mutex_unlock(migf->mvdev);
+	return ret ? ret : done;
+}
+
+static const struct file_operations mlx5vf_resume_fops = {
+	.owner = THIS_MODULE,
+	.write = mlx5vf_resume_write,
+	.release = mlx5vf_release_file,
+	.llseek = no_llseek,
+};
+
+static struct mlx5_vf_migration_file *
+mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
+{
+	struct mlx5_vf_migration_file *migf;
+	struct mlx5_vhca_data_buffer *buf;
+	int ret;
+
+	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
+	if (!migf)
+		return ERR_PTR(-ENOMEM);
+
+	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
+					O_WRONLY);
+	if (IS_ERR(migf->filp)) {
+		ret = PTR_ERR(migf->filp);
+		goto end;
+	}
+
+	migf->mvdev = mvdev;
+	ret = mlx5vf_cmd_alloc_pd(migf);
+	if (ret)
+		goto out_free;
+
+	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
+	if (IS_ERR(buf)) {
+		ret = PTR_ERR(buf);
+		goto out_pd;
+	}
+
+	migf->buf = buf;
+	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
+		buf = mlx5vf_alloc_data_buffer(migf,
+			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+		if (IS_ERR(buf)) {
+			ret = PTR_ERR(buf);
+			goto out_buf;
+		}
+
+		migf->buf_header = buf;
+		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
+	} else {
+		/* Initial state will be to read the image */
+		migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
+	}
+
+	stream_open(migf->filp->f_inode, migf->filp);
+	mutex_init(&migf->lock);
+	INIT_LIST_HEAD(&migf->buf_list);
+	INIT_LIST_HEAD(&migf->avail_list);
+	spin_lock_init(&migf->list_lock);
+	return migf;
+out_buf:
+	mlx5vf_free_data_buffer(migf->buf);
+out_pd:
+	mlx5vf_cmd_dealloc_pd(migf);
+out_free:
+	fput(migf->filp);
+end:
+	kfree(migf);
+	return ERR_PTR(ret);
+}
+
+void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
+{
+	if (mvdev->resuming_migf) {
+		mlx5vf_disable_fd(mvdev->resuming_migf);
+		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
+		fput(mvdev->resuming_migf->filp);
+		mvdev->resuming_migf = NULL;
+	}
+	if (mvdev->saving_migf) {
+		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
+		cancel_work_sync(&mvdev->saving_migf->async_data.work);
+		mlx5vf_disable_fd(mvdev->saving_migf);
+		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
+		fput(mvdev->saving_migf->filp);
+		mvdev->saving_migf = NULL;
+	}
+}
+
+static struct file *
+mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
+				    u32 new)
+{
+	u32 cur = mvdev->mig_state;
+	int ret;
+
+	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
+		ret = mlx5vf_cmd_suspend_vhca(mvdev,
+			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
+		if (ret)
+			return ERR_PTR(ret);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
+		ret = mlx5vf_cmd_resume_vhca(mvdev,
+			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
+		if (ret)
+			return ERR_PTR(ret);
+		return NULL;
+	}
+
+	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
+	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+		ret = mlx5vf_cmd_suspend_vhca(mvdev,
+			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
+		if (ret)
+			return ERR_PTR(ret);
+		return NULL;
+	}
+
+	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
+	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
+		ret = mlx5vf_cmd_resume_vhca(mvdev,
+			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
+		if (ret)
+			return ERR_PTR(ret);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
+		struct mlx5_vf_migration_file *migf;
+
+		migf = mlx5vf_pci_save_device_data(mvdev, false);
+		if (IS_ERR(migf))
+			return ERR_CAST(migf);
+		get_file(migf->filp);
+		mvdev->saving_migf = migf;
+		return migf->filp;
+	}
+
+	if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
+	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
+	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
+	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
+		mlx5vf_disable_fds(mvdev);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
+		struct mlx5_vf_migration_file *migf;
+
+		migf = mlx5vf_pci_resume_device_data(mvdev);
+		if (IS_ERR(migf))
+			return ERR_CAST(migf);
+		get_file(migf->filp);
+		mvdev->resuming_migf = migf;
+		return migf->filp;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
+		if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
+			ret = mlx5vf_cmd_load_vhca_state(mvdev,
+							 mvdev->resuming_migf,
+							 mvdev->resuming_migf->buf);
+			if (ret)
+				return ERR_PTR(ret);
+		}
+		mlx5vf_disable_fds(mvdev);
+		return NULL;
+	}
+
+	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
+	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
+	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+		struct mlx5_vf_migration_file *migf;
+
+		migf = mlx5vf_pci_save_device_data(mvdev, true);
+		if (IS_ERR(migf))
+			return ERR_CAST(migf);
+		get_file(migf->filp);
+		mvdev->saving_migf = migf;
+		return migf->filp;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
+		ret = mlx5vf_cmd_suspend_vhca(mvdev,
+			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
+		if (ret)
+			return ERR_PTR(ret);
+		ret = mlx5vf_pci_save_device_inc_data(mvdev);
+		return ret ? ERR_PTR(ret) : NULL;
+	}
+
+	/*
+	 * vfio_mig_get_next_state() does not use arcs other than the above
+	 */
+	WARN_ON(true);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * This function is called in all state_mutex unlock cases to
+ * handle a 'deferred_reset' if exists.
+ */
+void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
+{
+again:
+	spin_lock(&mvdev->reset_lock);
+	if (mvdev->deferred_reset) {
+		mvdev->deferred_reset = false;
+		spin_unlock(&mvdev->reset_lock);
+		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+		mlx5vf_disable_fds(mvdev);
+		goto again;
+	}
+	mutex_unlock(&mvdev->state_mutex);
+	spin_unlock(&mvdev->reset_lock);
+}
+
+static struct file *
+mlx5vf_pci_set_device_state(struct vfio_device *vdev,
+			    enum vfio_device_mig_state new_state)
+{
+	struct mlx5vf_pci_core_device *mvdev = container_of(
+		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+	enum vfio_device_mig_state next_state;
+	struct file *res = NULL;
+	int ret;
+
+	mutex_lock(&mvdev->state_mutex);
+	while (new_state != mvdev->mig_state) {
+		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
+					      new_state, &next_state);
+		if (ret) {
+			res = ERR_PTR(ret);
+			break;
+		}
+		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
+		if (IS_ERR(res))
+			break;
+		mvdev->mig_state = next_state;
+		if (WARN_ON(res && new_state != mvdev->mig_state)) {
+			fput(res);
+			res = ERR_PTR(-EINVAL);
+			break;
+		}
+	}
+	mlx5vf_state_mutex_unlock(mvdev);
+	return res;
+}
+
+static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
+				    unsigned long *stop_copy_length)
+{
+	struct mlx5vf_pci_core_device *mvdev = container_of(
+		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+	size_t state_size;
+	int ret;
+
+	mutex_lock(&mvdev->state_mutex);
+	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
+						    &state_size, 0);
+	if (!ret)
+		*stop_copy_length = state_size;
+	mlx5vf_state_mutex_unlock(mvdev);
+	return ret;
+}
+
+static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
+				       enum vfio_device_mig_state *curr_state)
+{
+	struct mlx5vf_pci_core_device *mvdev = container_of(
+		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+
+	mutex_lock(&mvdev->state_mutex);
+	*curr_state = mvdev->mig_state;
+	mlx5vf_state_mutex_unlock(mvdev);
+	return 0;
+}
+
+static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
+{
+	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
+
+	if (!mvdev->migrate_cap)
+		return;
+
+	/*
+	 * As the higher VFIO layers are holding locks across reset and using
+	 * those same locks with the mm_lock we need to prevent ABBA deadlock
+	 * with the state_mutex and mm_lock.
+	 * In case the state_mutex was taken already we defer the cleanup work
+	 * to the unlock flow of the other running context.
+	 */
+	spin_lock(&mvdev->reset_lock);
+	mvdev->deferred_reset = true;
+	if (!mutex_trylock(&mvdev->state_mutex)) {
+		spin_unlock(&mvdev->reset_lock);
+		return;
+	}
+	spin_unlock(&mvdev->reset_lock);
+	mlx5vf_state_mutex_unlock(mvdev);
+}
+
+static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
+{
+	struct mlx5vf_pci_core_device *mvdev = container_of(
+		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+	struct vfio_pci_core_device *vdev = &mvdev->core_device;
+	int ret;
+
+	ret = vfio_pci_core_enable(vdev);
+	if (ret)
+		return ret;
+
+	if (mvdev->migrate_cap)
+		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+	vfio_pci_core_finish_enable(vdev);
+	return 0;
+}
+
+static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
+{
+	struct mlx5vf_pci_core_device *mvdev = container_of(
+		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+
+	mlx5vf_cmd_close_migratable(mvdev);
+	vfio_pci_core_close_device(core_vdev);
+}
+
+static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
+	.migration_set_state = mlx5vf_pci_set_device_state,
+	.migration_get_state = mlx5vf_pci_get_device_state,
+	.migration_get_data_size = mlx5vf_pci_get_data_size,
+};
+
+static const struct vfio_log_ops mlx5vf_pci_log_ops = {
+	.log_start = mlx5vf_start_page_tracker,
+	.log_stop = mlx5vf_stop_page_tracker,
+	.log_read_and_clear = mlx5vf_tracker_read_and_clear,
+};
+
+static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
+{
+	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
+			struct mlx5vf_pci_core_device, core_device.vdev);
+	int ret;
+
+	ret = vfio_pci_core_init_dev(core_vdev);
+	if (ret)
+		return ret;
+
+	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
+				  &mlx5vf_pci_log_ops);
+
+	return 0;
+}
+
+static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
+{
+	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
+			struct mlx5vf_pci_core_device, core_device.vdev);
+
+	mlx5vf_cmd_remove_migratable(mvdev);
+	vfio_pci_core_release_dev(core_vdev);
+}
+
+static const struct vfio_device_ops mlx5vf_pci_ops = {
+	.name = "mlx5-vfio-pci",
+	.init = mlx5vf_pci_init_dev,
+	.release = mlx5vf_pci_release_dev,
+	.open_device = mlx5vf_pci_open_device,
+	.close_device = mlx5vf_pci_close_device,
+	.ioctl = vfio_pci_core_ioctl,
+	.device_feature = vfio_pci_core_ioctl_feature,
+	.read = vfio_pci_core_read,
+	.write = vfio_pci_core_write,
+	.mmap = vfio_pci_core_mmap,
+	.request = vfio_pci_core_request,
+	.match = vfio_pci_core_match,
+	.bind_iommufd = vfio_iommufd_physical_bind,
+	.unbind_iommufd = vfio_iommufd_physical_unbind,
+	.attach_ioas = vfio_iommufd_physical_attach_ioas,
+	.detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+static int mlx5vf_pci_probe(struct pci_dev *pdev,
+			    const struct pci_device_id *id)
+{
+	struct mlx5vf_pci_core_device *mvdev;
+	int ret;
+
+	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
+				  &pdev->dev, &mlx5vf_pci_ops);
+	if (IS_ERR(mvdev))
+		return PTR_ERR(mvdev);
+
+	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
+	ret = vfio_pci_core_register_device(&mvdev->core_device);
+	if (ret)
+		goto out_put_vdev;
+	return 0;
+
+out_put_vdev:
+	vfio_put_device(&mvdev->core_device.vdev);
+	return ret;
+}
+
+static void mlx5vf_pci_remove(struct pci_dev *pdev)
+{
+	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
+
+	vfio_pci_core_unregister_device(&mvdev->core_device);
+	vfio_put_device(&mvdev->core_device.vdev);
+}
+
+static const struct pci_device_id mlx5vf_pci_table[] = {
+	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
+	{}
+};
+
+MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
+
+static const struct pci_error_handlers mlx5vf_err_handlers = {
+	.reset_done = mlx5vf_pci_aer_reset_done,
+	.error_detected = vfio_pci_core_aer_err_detected,
+};
+
+static struct pci_driver mlx5vf_pci_driver = {
+	.name = KBUILD_MODNAME,
+	.id_table = mlx5vf_pci_table,
+	.probe = mlx5vf_pci_probe,
+	.remove = mlx5vf_pci_remove,
+	.err_handler = &mlx5vf_err_handlers,
+	.driver_managed_dma = true,
+};
+
+module_pci_driver(mlx5vf_pci_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
+MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
+MODULE_DESCRIPTION(
+	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");
diff --git a/drivers/vfio/pci/pds/Kconfig b/drivers/vfio/pci/pds/Kconfig
new file mode 100644
index 0000000000..6eceef7b02
--- /dev/null
+++ b/drivers/vfio/pci/pds/Kconfig
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2023 Advanced Micro Devices, Inc.
+
+config PDS_VFIO_PCI
+	tristate "VFIO support for PDS PCI devices"
+	depends on PDS_CORE && PCI_IOV
+	select VFIO_PCI_CORE
+	help
+	  This provides generic PCI support for PDS devices using the VFIO
+	  framework.
+
+	  More specific information on this driver can be
+	  found in
+	  <file:Documentation/networking/device_drivers/ethernet/amd/pds_vfio_pci.rst>.
+
+	  To compile this driver as a module, choose M here. The module
+	  will be called pds-vfio-pci.
+
+	  If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/pds/Makefile b/drivers/vfio/pci/pds/Makefile
new file mode 100644
index 0000000000..d5a06d8163
--- /dev/null
+++ b/drivers/vfio/pci/pds/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2023 Advanced Micro Devices, Inc.
+
+obj-$(CONFIG_PDS_VFIO_PCI) += pds-vfio-pci.o
+
+pds-vfio-pci-y := \
+	cmds.o		\
+	dirty.o		\
+	lm.o		\
+	pci_drv.o	\
+	vfio_dev.o
diff --git a/drivers/vfio/pci/pds/cmds.c b/drivers/vfio/pci/pds/cmds.c
new file mode 100644
index 0000000000..36463ccc3d
--- /dev/null
+++ b/drivers/vfio/pci/pds/cmds.c
@@ -0,0 +1,510 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#include <linux/io.h>
+#include <linux/types.h>
+#include <linux/delay.h>
+
+#include <linux/pds/pds_common.h>
+#include <linux/pds/pds_core_if.h>
+#include <linux/pds/pds_adminq.h>
+
+#include "vfio_dev.h"
+#include "cmds.h"
+
+#define SUSPEND_TIMEOUT_S		5
+#define SUSPEND_CHECK_INTERVAL_MS	1
+
+static int pds_vfio_client_adminq_cmd(struct pds_vfio_pci_device *pds_vfio,
+				      union pds_core_adminq_cmd *req,
+				      union pds_core_adminq_comp *resp,
+				      bool fast_poll)
+{
+	struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+	union pds_core_adminq_cmd cmd = {};
+	struct pdsc *pdsc;
+	int err;
+
+	/* Wrap the client request */
+	cmd.client_request.opcode = PDS_AQ_CMD_CLIENT_CMD;
+	cmd.client_request.client_id = cpu_to_le16(pds_vfio->client_id);
+	memcpy(cmd.client_request.client_cmd, req,
+	       sizeof(cmd.client_request.client_cmd));
+
+	pdsc = pdsc_get_pf_struct(pdev);
+	if (IS_ERR(pdsc))
+		return PTR_ERR(pdsc);
+
+	err = pdsc_adminq_post(pdsc, &cmd, resp, fast_poll);
+	if (err && err != -EAGAIN)
+		dev_err(pds_vfio_to_dev(pds_vfio),
+			"client admin cmd failed: %pe\n", ERR_PTR(err));
+
+	return err;
+}
+
+int pds_vfio_register_client_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+	struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+	char devname[PDS_DEVNAME_LEN];
+	struct pdsc *pdsc;
+	int ci;
+
+	snprintf(devname, sizeof(devname), "%s.%d-%u", PDS_VFIO_LM_DEV_NAME,
+		 pci_domain_nr(pdev->bus),
+		 PCI_DEVID(pdev->bus->number, pdev->devfn));
+
+	pdsc = pdsc_get_pf_struct(pdev);
+	if (IS_ERR(pdsc))
+		return PTR_ERR(pdsc);
+
+	ci = pds_client_register(pdsc, devname);
+	if (ci < 0)
+		return ci;
+
+	pds_vfio->client_id = ci;
+
+	return 0;
+}
+
+void pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+	struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+	struct pdsc *pdsc;
+	int err;
+
+	pdsc = pdsc_get_pf_struct(pdev);
+	if (IS_ERR(pdsc))
+		return;
+
+	err = pds_client_unregister(pdsc, pds_vfio->client_id);
+	if (err)
+		dev_err(&pdev->dev, "unregister from DSC failed: %pe\n",
+			ERR_PTR(err));
+
+	pds_vfio->client_id = 0;
+}
+
+static int
+pds_vfio_suspend_wait_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type)
+{
+	union pds_core_adminq_cmd cmd = {
+		.lm_suspend_status = {
+			.opcode = PDS_LM_CMD_SUSPEND_STATUS,
+			.vf_id = cpu_to_le16(pds_vfio->vf_id),
+			.type = type,
+		},
+	};
+	struct device *dev = pds_vfio_to_dev(pds_vfio);
+	union pds_core_adminq_comp comp = {};
+	unsigned long time_limit;
+	unsigned long time_start;
+	unsigned long time_done;
+	int err;
+
+	time_start = jiffies;
+	time_limit = time_start + HZ * SUSPEND_TIMEOUT_S;
+	do {
+		err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, true);
+		if (err != -EAGAIN)
+			break;
+
+		msleep(SUSPEND_CHECK_INTERVAL_MS);
+	} while (time_before(jiffies, time_limit));
+
+	time_done = jiffies;
+	dev_dbg(dev, "%s: vf%u: Suspend comp received in %d msecs\n", __func__,
+		pds_vfio->vf_id, jiffies_to_msecs(time_done - time_start));
+
+	/* Check the results */
+	if (time_after_eq(time_done, time_limit)) {
+		dev_err(dev, "%s: vf%u: Suspend comp timeout\n", __func__,
+			pds_vfio->vf_id);
+		err = -ETIMEDOUT;
+	}
+
+	return err;
+}
+
+int pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type)
+{
+	union pds_core_adminq_cmd cmd = {
+		.lm_suspend = {
+			.opcode = PDS_LM_CMD_SUSPEND,
+			.vf_id = cpu_to_le16(pds_vfio->vf_id),
+			.type = type,
+		},
+	};
+	struct device *dev = pds_vfio_to_dev(pds_vfio);
+	union pds_core_adminq_comp comp = {};
+	int err;
+
+	dev_dbg(dev, "vf%u: Suspend device\n", pds_vfio->vf_id);
+
+	/*
+	 * The initial suspend request to the firmware starts the device suspend
+	 * operation and the firmware returns success if it's started
+	 * successfully.
+	 */
+	err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, true);
+	if (err) {
+		dev_err(dev, "vf%u: Suspend failed: %pe\n", pds_vfio->vf_id,
+			ERR_PTR(err));
+		return err;
+	}
+
+	/*
+	 * The subsequent suspend status request(s) check if the firmware has
+	 * completed the device suspend process.
+	 */
+	return pds_vfio_suspend_wait_device_cmd(pds_vfio, type);
+}
+
+int pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type)
+{
+	union pds_core_adminq_cmd cmd = {
+		.lm_resume = {
+			.opcode = PDS_LM_CMD_RESUME,
+			.vf_id = cpu_to_le16(pds_vfio->vf_id),
+			.type = type,
+		},
+	};
+	struct device *dev = pds_vfio_to_dev(pds_vfio);
+	union pds_core_adminq_comp comp = {};
+
+	dev_dbg(dev, "vf%u: Resume device\n", pds_vfio->vf_id);
+
+	return pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, true);
+}
+
+int pds_vfio_get_lm_state_size_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size)
+{
+	union pds_core_adminq_cmd cmd = {
+		.lm_state_size = {
+			.opcode = PDS_LM_CMD_STATE_SIZE,
+			.vf_id = cpu_to_le16(pds_vfio->vf_id),
+		},
+	};
+	struct device *dev = pds_vfio_to_dev(pds_vfio);
+	union pds_core_adminq_comp comp = {};
+	int err;
+
+	dev_dbg(dev, "vf%u: Get migration status\n", pds_vfio->vf_id);
+
+	err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+	if (err)
+		return err;
+
+	*size = le64_to_cpu(comp.lm_state_size.size);
+	return 0;
+}
+
+static int pds_vfio_dma_map_lm_file(struct device *dev,
+				    enum dma_data_direction dir,
+				    struct pds_vfio_lm_file *lm_file)
+{
+	struct pds_lm_sg_elem *sgl, *sge;
+	struct scatterlist *sg;
+	dma_addr_t sgl_addr;
+	size_t sgl_size;
+	int err;
+	int i;
+
+	if (!lm_file)
+		return -EINVAL;
+
+	/* dma map file pages */
+	err = dma_map_sgtable(dev, &lm_file->sg_table, dir, 0);
+	if (err)
+		return err;
+
+	lm_file->num_sge = lm_file->sg_table.nents;
+
+	/* alloc sgl */
+	sgl_size = lm_file->num_sge * sizeof(struct pds_lm_sg_elem);
+	sgl = kzalloc(sgl_size, GFP_KERNEL);
+	if (!sgl) {
+		err = -ENOMEM;
+		goto out_unmap_sgtable;
+	}
+
+	/* fill sgl */
+	sge = sgl;
+	for_each_sgtable_dma_sg(&lm_file->sg_table, sg, i) {
+		sge->addr = cpu_to_le64(sg_dma_address(sg));
+		sge->len = cpu_to_le32(sg_dma_len(sg));
+		dev_dbg(dev, "addr = %llx, len = %u\n", sge->addr, sge->len);
+		sge++;
+	}
+
+	sgl_addr = dma_map_single(dev, sgl, sgl_size, DMA_TO_DEVICE);
+	if (dma_mapping_error(dev, sgl_addr)) {
+		err = -EIO;
+		goto out_free_sgl;
+	}
+
+	lm_file->sgl = sgl;
+	lm_file->sgl_addr = sgl_addr;
+
+	return 0;
+
+out_free_sgl:
+	kfree(sgl);
+out_unmap_sgtable:
+	lm_file->num_sge = 0;
+	dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0);
+	return err;
+}
+
+static void pds_vfio_dma_unmap_lm_file(struct device *dev,
+				       enum dma_data_direction dir,
+				       struct pds_vfio_lm_file *lm_file)
+{
+	if (!lm_file)
+		return;
+
+	/* free sgl */
+	if (lm_file->sgl) {
+		dma_unmap_single(dev, lm_file->sgl_addr,
+				 lm_file->num_sge * sizeof(*lm_file->sgl),
+				 DMA_TO_DEVICE);
+		kfree(lm_file->sgl);
+		lm_file->sgl = NULL;
+		lm_file->sgl_addr = DMA_MAPPING_ERROR;
+		lm_file->num_sge = 0;
+	}
+
+	/* dma unmap file pages */
+	dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0);
+}
+
+int pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+	union pds_core_adminq_cmd cmd = {
+		.lm_save = {
+			.opcode = PDS_LM_CMD_SAVE,
+			.vf_id = cpu_to_le16(pds_vfio->vf_id),
+		},
+	};
+	struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+	union pds_core_adminq_comp comp = {};
+	struct pds_vfio_lm_file *lm_file;
+	int err;
+
+	dev_dbg(&pdev->dev, "vf%u: Get migration state\n", pds_vfio->vf_id);
+
+	lm_file = pds_vfio->save_file;
+
+	err = pds_vfio_dma_map_lm_file(pdsc_dev, DMA_FROM_DEVICE, lm_file);
+	if (err) {
+		dev_err(&pdev->dev, "failed to map save migration file: %pe\n",
+			ERR_PTR(err));
+		return err;
+	}
+
+	cmd.lm_save.sgl_addr = cpu_to_le64(lm_file->sgl_addr);
+	cmd.lm_save.num_sge = cpu_to_le32(lm_file->num_sge);
+
+	err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+	if (err)
+		dev_err(&pdev->dev, "failed to get migration state: %pe\n",
+			ERR_PTR(err));
+
+	pds_vfio_dma_unmap_lm_file(pdsc_dev, DMA_FROM_DEVICE, lm_file);
+
+	return err;
+}
+
+int pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+	union pds_core_adminq_cmd cmd = {
+		.lm_restore = {
+			.opcode = PDS_LM_CMD_RESTORE,
+			.vf_id = cpu_to_le16(pds_vfio->vf_id),
+		},
+	};
+	struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+	union pds_core_adminq_comp comp = {};
+	struct pds_vfio_lm_file *lm_file;
+	int err;
+
+	dev_dbg(&pdev->dev, "vf%u: Set migration state\n", pds_vfio->vf_id);
+
+	lm_file = pds_vfio->restore_file;
+
+	err = pds_vfio_dma_map_lm_file(pdsc_dev, DMA_TO_DEVICE, lm_file);
+	if (err) {
+		dev_err(&pdev->dev,
+			"failed to map restore migration file: %pe\n",
+			ERR_PTR(err));
+		return err;
+	}
+
+	cmd.lm_restore.sgl_addr = cpu_to_le64(lm_file->sgl_addr);
+	cmd.lm_restore.num_sge = cpu_to_le32(lm_file->num_sge);
+
+	err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+	if (err)
+		dev_err(&pdev->dev, "failed to set migration state: %pe\n",
+			ERR_PTR(err));
+
+	pds_vfio_dma_unmap_lm_file(pdsc_dev, DMA_TO_DEVICE, lm_file);
+
+	return err;
+}
+
+void pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio,
+					 enum pds_lm_host_vf_status vf_status)
+{
+	union pds_core_adminq_cmd cmd = {
+		.lm_host_vf_status = {
+			.opcode = PDS_LM_CMD_HOST_VF_STATUS,
+			.vf_id = cpu_to_le16(pds_vfio->vf_id),
+			.status = vf_status,
+		},
+	};
+	struct device *dev = pds_vfio_to_dev(pds_vfio);
+	union pds_core_adminq_comp comp = {};
+	int err;
+
+	dev_dbg(dev, "vf%u: Set host VF LM status: %u", pds_vfio->vf_id,
+		vf_status);
+	if (vf_status != PDS_LM_STA_IN_PROGRESS &&
+	    vf_status != PDS_LM_STA_NONE) {
+		dev_warn(dev, "Invalid host VF migration status, %d\n",
+			 vf_status);
+		return;
+	}
+
+	err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+	if (err)
+		dev_warn(dev, "failed to send host VF migration status: %pe\n",
+			 ERR_PTR(err));
+}
+
+int pds_vfio_dirty_status_cmd(struct pds_vfio_pci_device *pds_vfio,
+			      u64 regions_dma, u8 *max_regions, u8 *num_regions)
+{
+	union pds_core_adminq_cmd cmd = {
+		.lm_dirty_status = {
+			.opcode = PDS_LM_CMD_DIRTY_STATUS,
+			.vf_id = cpu_to_le16(pds_vfio->vf_id),
+		},
+	};
+	struct device *dev = pds_vfio_to_dev(pds_vfio);
+	union pds_core_adminq_comp comp = {};
+	int err;
+
+	dev_dbg(dev, "vf%u: Dirty status\n", pds_vfio->vf_id);
+
+	cmd.lm_dirty_status.regions_dma = cpu_to_le64(regions_dma);
+	cmd.lm_dirty_status.max_regions = *max_regions;
+
+	err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+	if (err) {
+		dev_err(dev, "failed to get dirty status: %pe\n", ERR_PTR(err));
+		return err;
+	}
+
+	/* only support seq_ack approach for now */
+	if (!(le32_to_cpu(comp.lm_dirty_status.bmp_type_mask) &
+	      BIT(PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK))) {
+		dev_err(dev, "Dirty bitmap tracking SEQ_ACK not supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	*num_regions = comp.lm_dirty_status.num_regions;
+	*max_regions = comp.lm_dirty_status.max_regions;
+
+	dev_dbg(dev,
+		"Page Tracking Status command successful, max_regions: %d, num_regions: %d, bmp_type: %s\n",
+		*max_regions, *num_regions, "PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK");
+
+	return 0;
+}
+
+int pds_vfio_dirty_enable_cmd(struct pds_vfio_pci_device *pds_vfio,
+			      u64 regions_dma, u8 num_regions)
+{
+	union pds_core_adminq_cmd cmd = {
+		.lm_dirty_enable = {
+			.opcode = PDS_LM_CMD_DIRTY_ENABLE,
+			.vf_id = cpu_to_le16(pds_vfio->vf_id),
+			.regions_dma = cpu_to_le64(regions_dma),
+			.bmp_type = PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK,
+			.num_regions = num_regions,
+		},
+	};
+	struct device *dev = pds_vfio_to_dev(pds_vfio);
+	union pds_core_adminq_comp comp = {};
+	int err;
+
+	err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+	if (err) {
+		dev_err(dev, "failed dirty tracking enable: %pe\n",
+			ERR_PTR(err));
+		return err;
+	}
+
+	return 0;
+}
+
+int pds_vfio_dirty_disable_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+	union pds_core_adminq_cmd cmd = {
+		.lm_dirty_disable = {
+			.opcode = PDS_LM_CMD_DIRTY_DISABLE,
+			.vf_id = cpu_to_le16(pds_vfio->vf_id),
+		},
+	};
+	struct device *dev = pds_vfio_to_dev(pds_vfio);
+	union pds_core_adminq_comp comp = {};
+	int err;
+
+	err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+	if (err || comp.lm_dirty_status.num_regions != 0) {
+		/* in case num_regions is still non-zero after disable */
+		err = err ? err : -EIO;
+		dev_err(dev,
+			"failed dirty tracking disable: %pe, num_regions %d\n",
+			ERR_PTR(err), comp.lm_dirty_status.num_regions);
+		return err;
+	}
+
+	return 0;
+}
+
+int pds_vfio_dirty_seq_ack_cmd(struct pds_vfio_pci_device *pds_vfio,
+			       u64 sgl_dma, u16 num_sge, u32 offset,
+			       u32 total_len, bool read_seq)
+{
+	const char *cmd_type_str = read_seq ? "read_seq" : "write_ack";
+	union pds_core_adminq_cmd cmd = {
+		.lm_dirty_seq_ack = {
+			.vf_id = cpu_to_le16(pds_vfio->vf_id),
+			.len_bytes = cpu_to_le32(total_len),
+			.off_bytes = cpu_to_le32(offset),
+			.sgl_addr = cpu_to_le64(sgl_dma),
+			.num_sge = cpu_to_le16(num_sge),
+		},
+	};
+	struct device *dev = pds_vfio_to_dev(pds_vfio);
+	union pds_core_adminq_comp comp = {};
+	int err;
+
+	if (read_seq)
+		cmd.lm_dirty_seq_ack.opcode = PDS_LM_CMD_DIRTY_READ_SEQ;
+	else
+		cmd.lm_dirty_seq_ack.opcode = PDS_LM_CMD_DIRTY_WRITE_ACK;
+
+	err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+	if (err) {
+		dev_err(dev, "failed cmd Page Tracking %s: %pe\n", cmd_type_str,
+			ERR_PTR(err));
+		return err;
+	}
+
+	return 0;
+}
diff --git a/drivers/vfio/pci/pds/cmds.h b/drivers/vfio/pci/pds/cmds.h
new file mode 100644
index 0000000000..95221100b9
--- /dev/null
+++ b/drivers/vfio/pci/pds/cmds.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _CMDS_H_
+#define _CMDS_H_
+
+int pds_vfio_register_client_cmd(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio);
+int pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type);
+int pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type);
+int pds_vfio_get_lm_state_size_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size);
+int pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio);
+int pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio,
+					 enum pds_lm_host_vf_status vf_status);
+int pds_vfio_dirty_status_cmd(struct pds_vfio_pci_device *pds_vfio,
+			      u64 regions_dma, u8 *max_regions,
+			      u8 *num_regions);
+int pds_vfio_dirty_enable_cmd(struct pds_vfio_pci_device *pds_vfio,
+			      u64 regions_dma, u8 num_regions);
+int pds_vfio_dirty_disable_cmd(struct pds_vfio_pci_device *pds_vfio);
+int pds_vfio_dirty_seq_ack_cmd(struct pds_vfio_pci_device *pds_vfio,
+			       u64 sgl_dma, u16 num_sge, u32 offset,
+			       u32 total_len, bool read_seq);
+#endif /* _CMDS_H_ */
diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c
new file mode 100644
index 0000000000..27607d7b90
--- /dev/null
+++ b/drivers/vfio/pci/pds/dirty.c
@@ -0,0 +1,564 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#include <linux/interval_tree.h>
+#include <linux/vfio.h>
+
+#include <linux/pds/pds_common.h>
+#include <linux/pds/pds_core_if.h>
+#include <linux/pds/pds_adminq.h>
+
+#include "vfio_dev.h"
+#include "cmds.h"
+#include "dirty.h"
+
+#define READ_SEQ true
+#define WRITE_ACK false
+
+bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio)
+{
+	return pds_vfio->dirty.is_enabled;
+}
+
+void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio)
+{
+	pds_vfio->dirty.is_enabled = true;
+}
+
+void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio)
+{
+	pds_vfio->dirty.is_enabled = false;
+}
+
+static void
+pds_vfio_print_guest_region_info(struct pds_vfio_pci_device *pds_vfio,
+				 u8 max_regions)
+{
+	int len = max_regions * sizeof(struct pds_lm_dirty_region_info);
+	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
+	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+	struct pds_lm_dirty_region_info *region_info;
+	dma_addr_t regions_dma;
+	u8 num_regions;
+	int err;
+
+	region_info = kcalloc(max_regions,
+			      sizeof(struct pds_lm_dirty_region_info),
+			      GFP_KERNEL);
+	if (!region_info)
+		return;
+
+	regions_dma =
+		dma_map_single(pdsc_dev, region_info, len, DMA_FROM_DEVICE);
+	if (dma_mapping_error(pdsc_dev, regions_dma))
+		goto out_free_region_info;
+
+	err = pds_vfio_dirty_status_cmd(pds_vfio, regions_dma, &max_regions,
+					&num_regions);
+	dma_unmap_single(pdsc_dev, regions_dma, len, DMA_FROM_DEVICE);
+	if (err)
+		goto out_free_region_info;
+
+	for (unsigned int i = 0; i < num_regions; i++)
+		dev_dbg(&pdev->dev,
+			"region_info[%d]: dma_base 0x%llx page_count %u page_size_log2 %u\n",
+			i, le64_to_cpu(region_info[i].dma_base),
+			le32_to_cpu(region_info[i].page_count),
+			region_info[i].page_size_log2);
+
+out_free_region_info:
+	kfree(region_info);
+}
+
+static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_dirty *dirty,
+					unsigned long bytes)
+{
+	unsigned long *host_seq_bmp, *host_ack_bmp;
+
+	host_seq_bmp = vzalloc(bytes);
+	if (!host_seq_bmp)
+		return -ENOMEM;
+
+	host_ack_bmp = vzalloc(bytes);
+	if (!host_ack_bmp) {
+		bitmap_free(host_seq_bmp);
+		return -ENOMEM;
+	}
+
+	dirty->host_seq.bmp = host_seq_bmp;
+	dirty->host_ack.bmp = host_ack_bmp;
+
+	return 0;
+}
+
+static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty)
+{
+	vfree(dirty->host_seq.bmp);
+	vfree(dirty->host_ack.bmp);
+	dirty->host_seq.bmp = NULL;
+	dirty->host_ack.bmp = NULL;
+}
+
+static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio,
+				      struct pds_vfio_bmp_info *bmp_info)
+{
+	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
+	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+
+	dma_unmap_single(pdsc_dev, bmp_info->sgl_addr,
+			 bmp_info->num_sge * sizeof(struct pds_lm_sg_elem),
+			 DMA_BIDIRECTIONAL);
+	kfree(bmp_info->sgl);
+
+	bmp_info->num_sge = 0;
+	bmp_info->sgl = NULL;
+	bmp_info->sgl_addr = 0;
+}
+
+static void pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio)
+{
+	if (pds_vfio->dirty.host_seq.sgl)
+		__pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_seq);
+	if (pds_vfio->dirty.host_ack.sgl)
+		__pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_ack);
+}
+
+static int __pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
+				      struct pds_vfio_bmp_info *bmp_info,
+				      u32 page_count)
+{
+	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
+	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+	struct pds_lm_sg_elem *sgl;
+	dma_addr_t sgl_addr;
+	size_t sgl_size;
+	u32 max_sge;
+
+	max_sge = DIV_ROUND_UP(page_count, PAGE_SIZE * 8);
+	sgl_size = max_sge * sizeof(struct pds_lm_sg_elem);
+
+	sgl = kzalloc(sgl_size, GFP_KERNEL);
+	if (!sgl)
+		return -ENOMEM;
+
+	sgl_addr = dma_map_single(pdsc_dev, sgl, sgl_size, DMA_BIDIRECTIONAL);
+	if (dma_mapping_error(pdsc_dev, sgl_addr)) {
+		kfree(sgl);
+		return -EIO;
+	}
+
+	bmp_info->sgl = sgl;
+	bmp_info->num_sge = max_sge;
+	bmp_info->sgl_addr = sgl_addr;
+
+	return 0;
+}
+
+static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
+				    u32 page_count)
+{
+	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
+	int err;
+
+	err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_seq,
+					 page_count);
+	if (err)
+		return err;
+
+	err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_ack,
+					 page_count);
+	if (err) {
+		__pds_vfio_dirty_free_sgl(pds_vfio, &dirty->host_seq);
+		return err;
+	}
+
+	return 0;
+}
+
+static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
+				 struct rb_root_cached *ranges, u32 nnodes,
+				 u64 *page_size)
+{
+	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
+	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
+	u64 region_start, region_size, region_page_size;
+	struct pds_lm_dirty_region_info *region_info;
+	struct interval_tree_node *node = NULL;
+	u8 max_regions = 0, num_regions;
+	dma_addr_t regions_dma = 0;
+	u32 num_ranges = nnodes;
+	u32 page_count;
+	u16 len;
+	int err;
+
+	dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n",
+		pds_vfio->vf_id);
+
+	if (pds_vfio_dirty_is_enabled(pds_vfio))
+		return -EINVAL;
+
+	/* find if dirty tracking is disabled, i.e. num_regions == 0 */
+	err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions,
+					&num_regions);
+	if (err < 0) {
+		dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n",
+			ERR_PTR(err));
+		return err;
+	} else if (num_regions) {
+		dev_err(&pdev->dev,
+			"Dirty tracking already enabled for %d regions\n",
+			num_regions);
+		return -EEXIST;
+	} else if (!max_regions) {
+		dev_err(&pdev->dev,
+			"Device doesn't support dirty tracking, max_regions %d\n",
+			max_regions);
+		return -EOPNOTSUPP;
+	}
+
+	/*
+	 * Only support 1 region for now. If there are any large gaps in the
+	 * VM's address regions, then this would be a waste of memory as we are
+	 * generating 2 bitmaps (ack/seq) from the min address to the max
+	 * address of the VM's address regions. In the future, if we support
+	 * more than one region in the device/driver we can split the bitmaps
+	 * on the largest address region gaps. We can do this split up to the
+	 * max_regions times returned from the dirty_status command.
+	 */
+	max_regions = 1;
+	if (num_ranges > max_regions) {
+		vfio_combine_iova_ranges(ranges, nnodes, max_regions);
+		num_ranges = max_regions;
+	}
+
+	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
+	if (!node)
+		return -EINVAL;
+
+	region_size = node->last - node->start + 1;
+	region_start = node->start;
+	region_page_size = *page_size;
+
+	len = sizeof(*region_info);
+	region_info = kzalloc(len, GFP_KERNEL);
+	if (!region_info)
+		return -ENOMEM;
+
+	page_count = DIV_ROUND_UP(region_size, region_page_size);
+
+	region_info->dma_base = cpu_to_le64(region_start);
+	region_info->page_count = cpu_to_le32(page_count);
+	region_info->page_size_log2 = ilog2(region_page_size);
+
+	regions_dma = dma_map_single(pdsc_dev, (void *)region_info, len,
+				     DMA_BIDIRECTIONAL);
+	if (dma_mapping_error(pdsc_dev, regions_dma)) {
+		err = -ENOMEM;
+		goto out_free_region_info;
+	}
+
+	err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, max_regions);
+	dma_unmap_single(pdsc_dev, regions_dma, len, DMA_BIDIRECTIONAL);
+	if (err)
+		goto out_free_region_info;
+
+	/*
+	 * page_count might be adjusted by the device,
+	 * update it before freeing region_info DMA
+	 */
+	page_count = le32_to_cpu(region_info->page_count);
+
+	dev_dbg(&pdev->dev,
+		"region_info: regions_dma 0x%llx dma_base 0x%llx page_count %u page_size_log2 %u\n",
+		regions_dma, region_start, page_count,
+		(u8)ilog2(region_page_size));
+
+	err = pds_vfio_dirty_alloc_bitmaps(dirty, page_count / BITS_PER_BYTE);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n",
+			ERR_PTR(err));
+		goto out_free_region_info;
+	}
+
+	err = pds_vfio_dirty_alloc_sgl(pds_vfio, page_count);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n",
+			ERR_PTR(err));
+		goto out_free_bitmaps;
+	}
+
+	dirty->region_start = region_start;
+	dirty->region_size = region_size;
+	dirty->region_page_size = region_page_size;
+	pds_vfio_dirty_set_enabled(pds_vfio);
+
+	pds_vfio_print_guest_region_info(pds_vfio, max_regions);
+
+	kfree(region_info);
+
+	return 0;
+
+out_free_bitmaps:
+	pds_vfio_dirty_free_bitmaps(dirty);
+out_free_region_info:
+	kfree(region_info);
+	return err;
+}
+
+void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd)
+{
+	if (pds_vfio_dirty_is_enabled(pds_vfio)) {
+		pds_vfio_dirty_set_disabled(pds_vfio);
+		if (send_cmd)
+			pds_vfio_dirty_disable_cmd(pds_vfio);
+		pds_vfio_dirty_free_sgl(pds_vfio);
+		pds_vfio_dirty_free_bitmaps(&pds_vfio->dirty);
+	}
+
+	if (send_cmd)
+		pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE);
+}
+
+static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio,
+				  struct pds_vfio_bmp_info *bmp_info,
+				  u32 offset, u32 bmp_bytes, bool read_seq)
+{
+	const char *bmp_type_str = read_seq ? "read_seq" : "write_ack";
+	u8 dma_dir = read_seq ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
+	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+	unsigned long long npages;
+	struct sg_table sg_table;
+	struct scatterlist *sg;
+	struct page **pages;
+	u32 page_offset;
+	const void *bmp;
+	size_t size;
+	u16 num_sge;
+	int err;
+	int i;
+
+	bmp = (void *)((u64)bmp_info->bmp + offset);
+	page_offset = offset_in_page(bmp);
+	bmp -= page_offset;
+
+	/*
+	 * Start and end of bitmap section to seq/ack might not be page
+	 * aligned, so use the page_offset to account for that so there
+	 * will be enough pages to represent the bmp_bytes
+	 */
+	npages = DIV_ROUND_UP_ULL(bmp_bytes + page_offset, PAGE_SIZE);
+	pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	for (unsigned long long i = 0; i < npages; i++) {
+		struct page *page = vmalloc_to_page(bmp);
+
+		if (!page) {
+			err = -EFAULT;
+			goto out_free_pages;
+		}
+
+		pages[i] = page;
+		bmp += PAGE_SIZE;
+	}
+
+	err = sg_alloc_table_from_pages(&sg_table, pages, npages, page_offset,
+					bmp_bytes, GFP_KERNEL);
+	if (err)
+		goto out_free_pages;
+
+	err = dma_map_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
+	if (err)
+		goto out_free_sg_table;
+
+	for_each_sgtable_dma_sg(&sg_table, sg, i) {
+		struct pds_lm_sg_elem *sg_elem = &bmp_info->sgl[i];
+
+		sg_elem->addr = cpu_to_le64(sg_dma_address(sg));
+		sg_elem->len = cpu_to_le32(sg_dma_len(sg));
+	}
+
+	num_sge = sg_table.nents;
+	size = num_sge * sizeof(struct pds_lm_sg_elem);
+	dma_sync_single_for_device(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);
+	err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, bmp_info->sgl_addr, num_sge,
+					 offset, bmp_bytes, read_seq);
+	if (err)
+		dev_err(&pdev->dev,
+			"Dirty bitmap %s failed offset %u bmp_bytes %u num_sge %u DMA 0x%llx: %pe\n",
+			bmp_type_str, offset, bmp_bytes,
+			num_sge, bmp_info->sgl_addr, ERR_PTR(err));
+	dma_sync_single_for_cpu(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);
+
+	dma_unmap_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
+out_free_sg_table:
+	sg_free_table(&sg_table);
+out_free_pages:
+	kfree(pages);
+
+	return err;
+}
+
+static int pds_vfio_dirty_write_ack(struct pds_vfio_pci_device *pds_vfio,
+				    u32 offset, u32 len)
+{
+	return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_ack,
+				      offset, len, WRITE_ACK);
+}
+
+static int pds_vfio_dirty_read_seq(struct pds_vfio_pci_device *pds_vfio,
+				   u32 offset, u32 len)
+{
+	return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_seq,
+				      offset, len, READ_SEQ);
+}
+
+static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio,
+					  struct iova_bitmap *dirty_bitmap,
+					  u32 bmp_offset, u32 len_bytes)
+{
+	u64 page_size = pds_vfio->dirty.region_page_size;
+	u64 region_start = pds_vfio->dirty.region_start;
+	u32 bmp_offset_bit;
+	__le64 *seq, *ack;
+	int dword_count;
+
+	dword_count = len_bytes / sizeof(u64);
+	seq = (__le64 *)((u64)pds_vfio->dirty.host_seq.bmp + bmp_offset);
+	ack = (__le64 *)((u64)pds_vfio->dirty.host_ack.bmp + bmp_offset);
+	bmp_offset_bit = bmp_offset * 8;
+
+	for (int i = 0; i < dword_count; i++) {
+		u64 xor = le64_to_cpu(seq[i]) ^ le64_to_cpu(ack[i]);
+
+		/* prepare for next write_ack call */
+		ack[i] = seq[i];
+
+		for (u8 bit_i = 0; bit_i < BITS_PER_TYPE(u64); ++bit_i) {
+			if (xor & BIT(bit_i)) {
+				u64 abs_bit_i = bmp_offset_bit +
+						i * BITS_PER_TYPE(u64) + bit_i;
+				u64 addr = abs_bit_i * page_size + region_start;
+
+				iova_bitmap_set(dirty_bitmap, addr, page_size);
+			}
+		}
+	}
+
+	return 0;
+}
+
+static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio,
+			       struct iova_bitmap *dirty_bitmap,
+			       unsigned long iova, unsigned long length)
+{
+	struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
+	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
+	u64 bmp_offset, bmp_bytes;
+	u64 bitmap_size, pages;
+	int err;
+
+	dev_dbg(dev, "vf%u: Get dirty page bitmap\n", pds_vfio->vf_id);
+
+	if (!pds_vfio_dirty_is_enabled(pds_vfio)) {
+		dev_err(dev, "vf%u: Sync failed, dirty tracking is disabled\n",
+			pds_vfio->vf_id);
+		return -EINVAL;
+	}
+
+	pages = DIV_ROUND_UP(length, pds_vfio->dirty.region_page_size);
+	bitmap_size =
+		round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE;
+
+	dev_dbg(dev,
+		"vf%u: iova 0x%lx length %lu page_size %llu pages %llu bitmap_size %llu\n",
+		pds_vfio->vf_id, iova, length, pds_vfio->dirty.region_page_size,
+		pages, bitmap_size);
+
+	if (!length || ((iova - dirty->region_start + length) > dirty->region_size)) {
+		dev_err(dev, "Invalid iova 0x%lx and/or length 0x%lx to sync\n",
+			iova, length);
+		return -EINVAL;
+	}
+
+	/* bitmap is modified in 64 bit chunks */
+	bmp_bytes = ALIGN(DIV_ROUND_UP(length / dirty->region_page_size,
+				       sizeof(u64)),
+			  sizeof(u64));
+	if (bmp_bytes != bitmap_size) {
+		dev_err(dev,
+			"Calculated bitmap bytes %llu not equal to bitmap size %llu\n",
+			bmp_bytes, bitmap_size);
+		return -EINVAL;
+	}
+
+	bmp_offset = DIV_ROUND_UP((iova - dirty->region_start) /
+				  dirty->region_page_size, sizeof(u64));
+
+	dev_dbg(dev,
+		"Syncing dirty bitmap, iova 0x%lx length 0x%lx, bmp_offset %llu bmp_bytes %llu\n",
+		iova, length, bmp_offset, bmp_bytes);
+
+	err = pds_vfio_dirty_read_seq(pds_vfio, bmp_offset, bmp_bytes);
+	if (err)
+		return err;
+
+	err = pds_vfio_dirty_process_bitmaps(pds_vfio, dirty_bitmap, bmp_offset,
+					     bmp_bytes);
+	if (err)
+		return err;
+
+	err = pds_vfio_dirty_write_ack(pds_vfio, bmp_offset, bmp_bytes);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
+				unsigned long length, struct iova_bitmap *dirty)
+{
+	struct pds_vfio_pci_device *pds_vfio =
+		container_of(vdev, struct pds_vfio_pci_device,
+			     vfio_coredev.vdev);
+	int err;
+
+	mutex_lock(&pds_vfio->state_mutex);
+	err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length);
+	pds_vfio_state_mutex_unlock(pds_vfio);
+
+	return err;
+}
+
+int pds_vfio_dma_logging_start(struct vfio_device *vdev,
+			       struct rb_root_cached *ranges, u32 nnodes,
+			       u64 *page_size)
+{
+	struct pds_vfio_pci_device *pds_vfio =
+		container_of(vdev, struct pds_vfio_pci_device,
+			     vfio_coredev.vdev);
+	int err;
+
+	mutex_lock(&pds_vfio->state_mutex);
+	pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS);
+	err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size);
+	pds_vfio_state_mutex_unlock(pds_vfio);
+
+	return err;
+}
+
+int pds_vfio_dma_logging_stop(struct vfio_device *vdev)
+{
+	struct pds_vfio_pci_device *pds_vfio =
+		container_of(vdev, struct pds_vfio_pci_device,
+			     vfio_coredev.vdev);
+
+	mutex_lock(&pds_vfio->state_mutex);
+	pds_vfio_dirty_disable(pds_vfio, true);
+	pds_vfio_state_mutex_unlock(pds_vfio);
+
+	return 0;
+}
diff --git a/drivers/vfio/pci/pds/dirty.h b/drivers/vfio/pci/pds/dirty.h
new file mode 100644
index 0000000000..f78da25d75
--- /dev/null
+++ b/drivers/vfio/pci/pds/dirty.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _DIRTY_H_
+#define _DIRTY_H_
+
+struct pds_vfio_bmp_info {
+	unsigned long *bmp;
+	u32 bmp_bytes;
+	struct pds_lm_sg_elem *sgl;
+	dma_addr_t sgl_addr;
+	u16 num_sge;
+};
+
+struct pds_vfio_dirty {
+	struct pds_vfio_bmp_info host_seq;
+	struct pds_vfio_bmp_info host_ack;
+	u64 region_size;
+	u64 region_start;
+	u64 region_page_size;
+	bool is_enabled;
+};
+
+struct pds_vfio_pci_device;
+
+bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio,
+			    bool send_cmd);
+
+int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
+				unsigned long length,
+				struct iova_bitmap *dirty);
+int pds_vfio_dma_logging_start(struct vfio_device *vdev,
+			       struct rb_root_cached *ranges, u32 nnodes,
+			       u64 *page_size);
+int pds_vfio_dma_logging_stop(struct vfio_device *vdev);
+#endif /* _DIRTY_H_ */
diff --git a/drivers/vfio/pci/pds/lm.c b/drivers/vfio/pci/pds/lm.c
new file mode 100644
index 0000000000..79fe2e66bb
--- /dev/null
+++ b/drivers/vfio/pci/pds/lm.c
@@ -0,0 +1,434 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/vfio.h>
+#include <linux/vfio_pci_core.h>
+
+#include "vfio_dev.h"
+#include "cmds.h"
+
+static struct pds_vfio_lm_file *
+pds_vfio_get_lm_file(const struct file_operations *fops, int flags, u64 size)
+{
+	struct pds_vfio_lm_file *lm_file = NULL;
+	unsigned long long npages;
+	struct page **pages;
+	void *page_mem;
+	const void *p;
+
+	if (!size)
+		return NULL;
+
+	/* Alloc file structure */
+	lm_file = kzalloc(sizeof(*lm_file), GFP_KERNEL);
+	if (!lm_file)
+		return NULL;
+
+	/* Create file */
+	lm_file->filep =
+		anon_inode_getfile("pds_vfio_lm", fops, lm_file, flags);
+	if (IS_ERR(lm_file->filep))
+		goto out_free_file;
+
+	stream_open(lm_file->filep->f_inode, lm_file->filep);
+	mutex_init(&lm_file->lock);
+
+	/* prevent file from being released before we are done with it */
+	get_file(lm_file->filep);
+
+	/* Allocate memory for file pages */
+	npages = DIV_ROUND_UP_ULL(size, PAGE_SIZE);
+	pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
+	if (!pages)
+		goto out_put_file;
+
+	page_mem = kvzalloc(ALIGN(size, PAGE_SIZE), GFP_KERNEL);
+	if (!page_mem)
+		goto out_free_pages_array;
+
+	p = page_mem - offset_in_page(page_mem);
+	for (unsigned long long i = 0; i < npages; i++) {
+		if (is_vmalloc_addr(p))
+			pages[i] = vmalloc_to_page(p);
+		else
+			pages[i] = kmap_to_page((void *)p);
+		if (!pages[i])
+			goto out_free_page_mem;
+
+		p += PAGE_SIZE;
+	}
+
+	/* Create scatterlist of file pages to use for DMA mapping later */
+	if (sg_alloc_table_from_pages(&lm_file->sg_table, pages, npages, 0,
+				      size, GFP_KERNEL))
+		goto out_free_page_mem;
+
+	lm_file->size = size;
+	lm_file->pages = pages;
+	lm_file->npages = npages;
+	lm_file->page_mem = page_mem;
+	lm_file->alloc_size = npages * PAGE_SIZE;
+
+	return lm_file;
+
+out_free_page_mem:
+	kvfree(page_mem);
+out_free_pages_array:
+	kfree(pages);
+out_put_file:
+	fput(lm_file->filep);
+	mutex_destroy(&lm_file->lock);
+out_free_file:
+	kfree(lm_file);
+
+	return NULL;
+}
+
+static void pds_vfio_put_lm_file(struct pds_vfio_lm_file *lm_file)
+{
+	mutex_lock(&lm_file->lock);
+
+	lm_file->size = 0;
+	lm_file->alloc_size = 0;
+
+	/* Free scatter list of file pages */
+	sg_free_table(&lm_file->sg_table);
+
+	kvfree(lm_file->page_mem);
+	lm_file->page_mem = NULL;
+	kfree(lm_file->pages);
+	lm_file->pages = NULL;
+
+	mutex_unlock(&lm_file->lock);
+
+	/* allow file to be released since we are done with it */
+	fput(lm_file->filep);
+}
+
+void pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio)
+{
+	if (!pds_vfio->save_file)
+		return;
+
+	pds_vfio_put_lm_file(pds_vfio->save_file);
+	pds_vfio->save_file = NULL;
+}
+
+void pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio)
+{
+	if (!pds_vfio->restore_file)
+		return;
+
+	pds_vfio_put_lm_file(pds_vfio->restore_file);
+	pds_vfio->restore_file = NULL;
+}
+
+static struct page *pds_vfio_get_file_page(struct pds_vfio_lm_file *lm_file,
+					   unsigned long offset)
+{
+	unsigned long cur_offset = 0;
+	struct scatterlist *sg;
+	unsigned int i;
+
+	/* All accesses are sequential */
+	if (offset < lm_file->last_offset || !lm_file->last_offset_sg) {
+		lm_file->last_offset = 0;
+		lm_file->last_offset_sg = lm_file->sg_table.sgl;
+		lm_file->sg_last_entry = 0;
+	}
+
+	cur_offset = lm_file->last_offset;
+
+	for_each_sg(lm_file->last_offset_sg, sg,
+		    lm_file->sg_table.orig_nents - lm_file->sg_last_entry, i) {
+		if (offset < sg->length + cur_offset) {
+			lm_file->last_offset_sg = sg;
+			lm_file->sg_last_entry += i;
+			lm_file->last_offset = cur_offset;
+			return nth_page(sg_page(sg),
+					(offset - cur_offset) / PAGE_SIZE);
+		}
+		cur_offset += sg->length;
+	}
+
+	return NULL;
+}
+
+static int pds_vfio_release_file(struct inode *inode, struct file *filp)
+{
+	struct pds_vfio_lm_file *lm_file = filp->private_data;
+
+	mutex_lock(&lm_file->lock);
+	lm_file->filep->f_pos = 0;
+	lm_file->size = 0;
+	mutex_unlock(&lm_file->lock);
+	mutex_destroy(&lm_file->lock);
+	kfree(lm_file);
+
+	return 0;
+}
+
+static ssize_t pds_vfio_save_read(struct file *filp, char __user *buf,
+				  size_t len, loff_t *pos)
+{
+	struct pds_vfio_lm_file *lm_file = filp->private_data;
+	ssize_t done = 0;
+
+	if (pos)
+		return -ESPIPE;
+	pos = &filp->f_pos;
+
+	mutex_lock(&lm_file->lock);
+	if (*pos > lm_file->size) {
+		done = -EINVAL;
+		goto out_unlock;
+	}
+
+	len = min_t(size_t, lm_file->size - *pos, len);
+	while (len) {
+		size_t page_offset;
+		struct page *page;
+		size_t page_len;
+		u8 *from_buff;
+		int err;
+
+		page_offset = (*pos) % PAGE_SIZE;
+		page = pds_vfio_get_file_page(lm_file, *pos - page_offset);
+		if (!page) {
+			if (done == 0)
+				done = -EINVAL;
+			goto out_unlock;
+		}
+
+		page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
+		from_buff = kmap_local_page(page);
+		err = copy_to_user(buf, from_buff + page_offset, page_len);
+		kunmap_local(from_buff);
+		if (err) {
+			done = -EFAULT;
+			goto out_unlock;
+		}
+		*pos += page_len;
+		len -= page_len;
+		done += page_len;
+		buf += page_len;
+	}
+
+out_unlock:
+	mutex_unlock(&lm_file->lock);
+	return done;
+}
+
+static const struct file_operations pds_vfio_save_fops = {
+	.owner = THIS_MODULE,
+	.read = pds_vfio_save_read,
+	.release = pds_vfio_release_file,
+	.llseek = no_llseek,
+};
+
+static int pds_vfio_get_save_file(struct pds_vfio_pci_device *pds_vfio)
+{
+	struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
+	struct pds_vfio_lm_file *lm_file;
+	u64 size;
+	int err;
+
+	/* Get live migration state size in this state */
+	err = pds_vfio_get_lm_state_size_cmd(pds_vfio, &size);
+	if (err) {
+		dev_err(dev, "failed to get save status: %pe\n", ERR_PTR(err));
+		return err;
+	}
+
+	dev_dbg(dev, "save status, size = %lld\n", size);
+
+	if (!size) {
+		dev_err(dev, "invalid state size\n");
+		return -EIO;
+	}
+
+	lm_file = pds_vfio_get_lm_file(&pds_vfio_save_fops, O_RDONLY, size);
+	if (!lm_file) {
+		dev_err(dev, "failed to create save file\n");
+		return -ENOENT;
+	}
+
+	dev_dbg(dev, "size = %lld, alloc_size = %lld, npages = %lld\n",
+		lm_file->size, lm_file->alloc_size, lm_file->npages);
+
+	pds_vfio->save_file = lm_file;
+
+	return 0;
+}
+
+static ssize_t pds_vfio_restore_write(struct file *filp, const char __user *buf,
+				      size_t len, loff_t *pos)
+{
+	struct pds_vfio_lm_file *lm_file = filp->private_data;
+	loff_t requested_length;
+	ssize_t done = 0;
+
+	if (pos)
+		return -ESPIPE;
+
+	pos = &filp->f_pos;
+
+	if (*pos < 0 ||
+	    check_add_overflow((loff_t)len, *pos, &requested_length))
+		return -EINVAL;
+
+	mutex_lock(&lm_file->lock);
+
+	while (len) {
+		size_t page_offset;
+		struct page *page;
+		size_t page_len;
+		u8 *to_buff;
+		int err;
+
+		page_offset = (*pos) % PAGE_SIZE;
+		page = pds_vfio_get_file_page(lm_file, *pos - page_offset);
+		if (!page) {
+			if (done == 0)
+				done = -EINVAL;
+			goto out_unlock;
+		}
+
+		page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
+		to_buff = kmap_local_page(page);
+		err = copy_from_user(to_buff + page_offset, buf, page_len);
+		kunmap_local(to_buff);
+		if (err) {
+			done = -EFAULT;
+			goto out_unlock;
+		}
+		*pos += page_len;
+		len -= page_len;
+		done += page_len;
+		buf += page_len;
+		lm_file->size += page_len;
+	}
+out_unlock:
+	mutex_unlock(&lm_file->lock);
+	return done;
+}
+
+static const struct file_operations pds_vfio_restore_fops = {
+	.owner = THIS_MODULE,
+	.write = pds_vfio_restore_write,
+	.release = pds_vfio_release_file,
+	.llseek = no_llseek,
+};
+
+static int pds_vfio_get_restore_file(struct pds_vfio_pci_device *pds_vfio)
+{
+	struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
+	struct pds_vfio_lm_file *lm_file;
+	u64 size;
+
+	size = sizeof(union pds_lm_dev_state);
+	dev_dbg(dev, "restore status, size = %lld\n", size);
+
+	if (!size) {
+		dev_err(dev, "invalid state size");
+		return -EIO;
+	}
+
+	lm_file = pds_vfio_get_lm_file(&pds_vfio_restore_fops, O_WRONLY, size);
+	if (!lm_file) {
+		dev_err(dev, "failed to create restore file");
+		return -ENOENT;
+	}
+	pds_vfio->restore_file = lm_file;
+
+	return 0;
+}
+
+struct file *
+pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio,
+				  enum vfio_device_mig_state next)
+{
+	enum vfio_device_mig_state cur = pds_vfio->state;
+	int err;
+
+	if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_STOP_COPY) {
+		err = pds_vfio_get_save_file(pds_vfio);
+		if (err)
+			return ERR_PTR(err);
+
+		err = pds_vfio_get_lm_state_cmd(pds_vfio);
+		if (err) {
+			pds_vfio_put_save_file(pds_vfio);
+			return ERR_PTR(err);
+		}
+
+		return pds_vfio->save_file->filep;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP_COPY && next == VFIO_DEVICE_STATE_STOP) {
+		pds_vfio_put_save_file(pds_vfio);
+		pds_vfio_dirty_disable(pds_vfio, true);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RESUMING) {
+		err = pds_vfio_get_restore_file(pds_vfio);
+		if (err)
+			return ERR_PTR(err);
+
+		return pds_vfio->restore_file->filep;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_RESUMING && next == VFIO_DEVICE_STATE_STOP) {
+		err = pds_vfio_set_lm_state_cmd(pds_vfio);
+		if (err)
+			return ERR_PTR(err);
+
+		pds_vfio_put_restore_file(pds_vfio);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_RUNNING && next == VFIO_DEVICE_STATE_RUNNING_P2P) {
+		pds_vfio_send_host_vf_lm_status_cmd(pds_vfio,
+						    PDS_LM_STA_IN_PROGRESS);
+		err = pds_vfio_suspend_device_cmd(pds_vfio,
+						  PDS_LM_SUSPEND_RESUME_TYPE_P2P);
+		if (err)
+			return ERR_PTR(err);
+
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_RUNNING) {
+		err = pds_vfio_resume_device_cmd(pds_vfio,
+						 PDS_LM_SUSPEND_RESUME_TYPE_FULL);
+		if (err)
+			return ERR_PTR(err);
+
+		pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RUNNING_P2P) {
+		err = pds_vfio_resume_device_cmd(pds_vfio,
+						 PDS_LM_SUSPEND_RESUME_TYPE_P2P);
+		if (err)
+			return ERR_PTR(err);
+
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_STOP) {
+		err = pds_vfio_suspend_device_cmd(pds_vfio,
+						  PDS_LM_SUSPEND_RESUME_TYPE_FULL);
+		if (err)
+			return ERR_PTR(err);
+		return NULL;
+	}
+
+	return ERR_PTR(-EINVAL);
+}
diff --git a/drivers/vfio/pci/pds/lm.h b/drivers/vfio/pci/pds/lm.h
new file mode 100644
index 0000000000..13be893198
--- /dev/null
+++ b/drivers/vfio/pci/pds/lm.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _LM_H_
+#define _LM_H_
+
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/scatterlist.h>
+#include <linux/types.h>
+
+#include <linux/pds/pds_common.h>
+#include <linux/pds/pds_adminq.h>
+
+struct pds_vfio_lm_file {
+	struct file *filep;
+	struct mutex lock;	/* protect live migration data file */
+	u64 size;		/* Size with valid data */
+	u64 alloc_size;		/* Total allocated size. Always >= len */
+	void *page_mem;		/* memory allocated for pages */
+	struct page **pages;	/* Backing pages for file */
+	unsigned long long npages;
+	struct sg_table sg_table;	/* SG table for backing pages */
+	struct pds_lm_sg_elem *sgl;	/* DMA mapping */
+	dma_addr_t sgl_addr;
+	u16 num_sge;
+	struct scatterlist *last_offset_sg;	/* Iterator */
+	unsigned int sg_last_entry;
+	unsigned long last_offset;
+};
+
+struct pds_vfio_pci_device;
+
+struct file *
+pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio,
+				  enum vfio_device_mig_state next);
+
+void pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio);
+
+#endif /* _LM_H_ */
diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c
new file mode 100644
index 0000000000..caffa1a2cf
--- /dev/null
+++ b/drivers/vfio/pci/pds/pci_drv.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/types.h>
+#include <linux/vfio.h>
+
+#include <linux/pds/pds_common.h>
+#include <linux/pds/pds_core_if.h>
+#include <linux/pds/pds_adminq.h>
+
+#include "vfio_dev.h"
+#include "pci_drv.h"
+#include "cmds.h"
+
+#define PDS_VFIO_DRV_DESCRIPTION	"AMD/Pensando VFIO Device Driver"
+#define PCI_VENDOR_ID_PENSANDO		0x1dd8
+
+static void pds_vfio_recovery(struct pds_vfio_pci_device *pds_vfio)
+{
+	bool deferred_reset_needed = false;
+
+	/*
+	 * Documentation states that the kernel migration driver must not
+	 * generate asynchronous device state transitions outside of
+	 * manipulation by the user or the VFIO_DEVICE_RESET ioctl.
+	 *
+	 * Since recovery is an asynchronous event received from the device,
+	 * initiate a deferred reset. Issue a deferred reset in the following
+	 * situations:
+	 *   1. Migration is in progress, which will cause the next step of
+	 *	the migration to fail.
+	 *   2. If the device is in a state that will be set to
+	 *	VFIO_DEVICE_STATE_RUNNING on the next action (i.e. VM is
+	 *	shutdown and device is in VFIO_DEVICE_STATE_STOP).
+	 */
+	mutex_lock(&pds_vfio->state_mutex);
+	if ((pds_vfio->state != VFIO_DEVICE_STATE_RUNNING &&
+	     pds_vfio->state != VFIO_DEVICE_STATE_ERROR) ||
+	    (pds_vfio->state == VFIO_DEVICE_STATE_RUNNING &&
+	     pds_vfio_dirty_is_enabled(pds_vfio)))
+		deferred_reset_needed = true;
+	mutex_unlock(&pds_vfio->state_mutex);
+
+	/*
+	 * On the next user initiated state transition, the device will
+	 * transition to the VFIO_DEVICE_STATE_ERROR. At this point it's the user's
+	 * responsibility to reset the device.
+	 *
+	 * If a VFIO_DEVICE_RESET is requested post recovery and before the next
+	 * state transition, then the deferred reset state will be set to
+	 * VFIO_DEVICE_STATE_RUNNING.
+	 */
+	if (deferred_reset_needed) {
+		mutex_lock(&pds_vfio->reset_mutex);
+		pds_vfio->deferred_reset = true;
+		pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_ERROR;
+		mutex_unlock(&pds_vfio->reset_mutex);
+	}
+}
+
+static int pds_vfio_pci_notify_handler(struct notifier_block *nb,
+				       unsigned long ecode, void *data)
+{
+	struct pds_vfio_pci_device *pds_vfio =
+		container_of(nb, struct pds_vfio_pci_device, nb);
+	struct device *dev = pds_vfio_to_dev(pds_vfio);
+	union pds_core_notifyq_comp *event = data;
+
+	dev_dbg(dev, "%s: event code %lu\n", __func__, ecode);
+
+	/*
+	 * We don't need to do anything for RESET state==0 as there is no notify
+	 * or feedback mechanism available, and it is possible that we won't
+	 * even see a state==0 event since the pds_core recovery is pending.
+	 *
+	 * Any requests from VFIO while state==0 will fail, which will return
+	 * error and may cause migration to fail.
+	 */
+	if (ecode == PDS_EVENT_RESET) {
+		dev_info(dev, "%s: PDS_EVENT_RESET event received, state==%d\n",
+			 __func__, event->reset.state);
+		/*
+		 * pds_core device finished recovery and sent us the
+		 * notification (state == 1) to allow us to recover
+		 */
+		if (event->reset.state == 1)
+			pds_vfio_recovery(pds_vfio);
+	}
+
+	return 0;
+}
+
+static int
+pds_vfio_pci_register_event_handler(struct pds_vfio_pci_device *pds_vfio)
+{
+	struct device *dev = pds_vfio_to_dev(pds_vfio);
+	struct notifier_block *nb = &pds_vfio->nb;
+	int err;
+
+	if (!nb->notifier_call) {
+		nb->notifier_call = pds_vfio_pci_notify_handler;
+		err = pdsc_register_notify(nb);
+		if (err) {
+			nb->notifier_call = NULL;
+			dev_err(dev,
+				"failed to register pds event handler: %pe\n",
+				ERR_PTR(err));
+			return -EINVAL;
+		}
+		dev_dbg(dev, "pds event handler registered\n");
+	}
+
+	return 0;
+}
+
+static void
+pds_vfio_pci_unregister_event_handler(struct pds_vfio_pci_device *pds_vfio)
+{
+	if (pds_vfio->nb.notifier_call) {
+		pdsc_unregister_notify(&pds_vfio->nb);
+		pds_vfio->nb.notifier_call = NULL;
+	}
+}
+
+static int pds_vfio_pci_probe(struct pci_dev *pdev,
+			      const struct pci_device_id *id)
+{
+	struct pds_vfio_pci_device *pds_vfio;
+	int err;
+
+	pds_vfio = vfio_alloc_device(pds_vfio_pci_device, vfio_coredev.vdev,
+				     &pdev->dev, pds_vfio_ops_info());
+	if (IS_ERR(pds_vfio))
+		return PTR_ERR(pds_vfio);
+
+	dev_set_drvdata(&pdev->dev, &pds_vfio->vfio_coredev);
+
+	err = vfio_pci_core_register_device(&pds_vfio->vfio_coredev);
+	if (err)
+		goto out_put_vdev;
+
+	err = pds_vfio_register_client_cmd(pds_vfio);
+	if (err) {
+		dev_err(&pdev->dev, "failed to register as client: %pe\n",
+			ERR_PTR(err));
+		goto out_unregister_coredev;
+	}
+
+	err = pds_vfio_pci_register_event_handler(pds_vfio);
+	if (err)
+		goto out_unregister_client;
+
+	return 0;
+
+out_unregister_client:
+	pds_vfio_unregister_client_cmd(pds_vfio);
+out_unregister_coredev:
+	vfio_pci_core_unregister_device(&pds_vfio->vfio_coredev);
+out_put_vdev:
+	vfio_put_device(&pds_vfio->vfio_coredev.vdev);
+	return err;
+}
+
+static void pds_vfio_pci_remove(struct pci_dev *pdev)
+{
+	struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev);
+
+	pds_vfio_pci_unregister_event_handler(pds_vfio);
+	pds_vfio_unregister_client_cmd(pds_vfio);
+	vfio_pci_core_unregister_device(&pds_vfio->vfio_coredev);
+	vfio_put_device(&pds_vfio->vfio_coredev.vdev);
+}
+
+static const struct pci_device_id pds_vfio_pci_table[] = {
+	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_PENSANDO, 0x1003) }, /* Ethernet VF */
+	{ 0, }
+};
+MODULE_DEVICE_TABLE(pci, pds_vfio_pci_table);
+
+static void pds_vfio_pci_aer_reset_done(struct pci_dev *pdev)
+{
+	struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev);
+
+	pds_vfio_reset(pds_vfio);
+}
+
+static const struct pci_error_handlers pds_vfio_pci_err_handlers = {
+	.reset_done = pds_vfio_pci_aer_reset_done,
+	.error_detected = vfio_pci_core_aer_err_detected,
+};
+
+static struct pci_driver pds_vfio_pci_driver = {
+	.name = KBUILD_MODNAME,
+	.id_table = pds_vfio_pci_table,
+	.probe = pds_vfio_pci_probe,
+	.remove = pds_vfio_pci_remove,
+	.err_handler = &pds_vfio_pci_err_handlers,
+	.driver_managed_dma = true,
+};
+
+module_pci_driver(pds_vfio_pci_driver);
+
+MODULE_DESCRIPTION(PDS_VFIO_DRV_DESCRIPTION);
+MODULE_AUTHOR("Brett Creeley <brett.creeley@amd.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/vfio/pci/pds/pci_drv.h b/drivers/vfio/pci/pds/pci_drv.h
new file mode 100644
index 0000000000..e79bed12ed
--- /dev/null
+++ b/drivers/vfio/pci/pds/pci_drv.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _PCI_DRV_H
+#define _PCI_DRV_H
+
+#include <linux/pci.h>
+
+#endif /* _PCI_DRV_H */
diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c
new file mode 100644
index 0000000000..4c351c59d0
--- /dev/null
+++ b/drivers/vfio/pci/pds/vfio_dev.c
@@ -0,0 +1,239 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#include <linux/vfio.h>
+#include <linux/vfio_pci_core.h>
+
+#include "lm.h"
+#include "dirty.h"
+#include "vfio_dev.h"
+
+struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio)
+{
+	return pds_vfio->vfio_coredev.pdev;
+}
+
+struct device *pds_vfio_to_dev(struct pds_vfio_pci_device *pds_vfio)
+{
+	return &pds_vfio_to_pci_dev(pds_vfio)->dev;
+}
+
+struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev)
+{
+	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
+
+	return container_of(core_device, struct pds_vfio_pci_device,
+			    vfio_coredev);
+}
+
+void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
+{
+again:
+	mutex_lock(&pds_vfio->reset_mutex);
+	if (pds_vfio->deferred_reset) {
+		pds_vfio->deferred_reset = false;
+		if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) {
+			pds_vfio_put_restore_file(pds_vfio);
+			pds_vfio_put_save_file(pds_vfio);
+			pds_vfio_dirty_disable(pds_vfio, false);
+		}
+		pds_vfio->state = pds_vfio->deferred_reset_state;
+		pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
+		mutex_unlock(&pds_vfio->reset_mutex);
+		goto again;
+	}
+	mutex_unlock(&pds_vfio->state_mutex);
+	mutex_unlock(&pds_vfio->reset_mutex);
+}
+
+void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio)
+{
+	mutex_lock(&pds_vfio->reset_mutex);
+	pds_vfio->deferred_reset = true;
+	pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
+	if (!mutex_trylock(&pds_vfio->state_mutex)) {
+		mutex_unlock(&pds_vfio->reset_mutex);
+		return;
+	}
+	mutex_unlock(&pds_vfio->reset_mutex);
+	pds_vfio_state_mutex_unlock(pds_vfio);
+}
+
+static struct file *
+pds_vfio_set_device_state(struct vfio_device *vdev,
+			  enum vfio_device_mig_state new_state)
+{
+	struct pds_vfio_pci_device *pds_vfio =
+		container_of(vdev, struct pds_vfio_pci_device,
+			     vfio_coredev.vdev);
+	struct file *res = NULL;
+
+	mutex_lock(&pds_vfio->state_mutex);
+	/*
+	 * only way to transition out of VFIO_DEVICE_STATE_ERROR is via
+	 * VFIO_DEVICE_RESET, so prevent the state machine from running since
+	 * vfio_mig_get_next_state() will throw a WARN_ON() when transitioning
+	 * from VFIO_DEVICE_STATE_ERROR to any other state
+	 */
+	while (pds_vfio->state != VFIO_DEVICE_STATE_ERROR &&
+	       new_state != pds_vfio->state) {
+		enum vfio_device_mig_state next_state;
+
+		int err = vfio_mig_get_next_state(vdev, pds_vfio->state,
+						  new_state, &next_state);
+		if (err) {
+			res = ERR_PTR(err);
+			break;
+		}
+
+		res = pds_vfio_step_device_state_locked(pds_vfio, next_state);
+		if (IS_ERR(res))
+			break;
+
+		pds_vfio->state = next_state;
+
+		if (WARN_ON(res && new_state != pds_vfio->state)) {
+			res = ERR_PTR(-EINVAL);
+			break;
+		}
+	}
+	pds_vfio_state_mutex_unlock(pds_vfio);
+	/* still waiting on a deferred_reset */
+	if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR)
+		res = ERR_PTR(-EIO);
+
+	return res;
+}
+
+static int pds_vfio_get_device_state(struct vfio_device *vdev,
+				     enum vfio_device_mig_state *current_state)
+{
+	struct pds_vfio_pci_device *pds_vfio =
+		container_of(vdev, struct pds_vfio_pci_device,
+			     vfio_coredev.vdev);
+
+	mutex_lock(&pds_vfio->state_mutex);
+	*current_state = pds_vfio->state;
+	pds_vfio_state_mutex_unlock(pds_vfio);
+	return 0;
+}
+
+static int pds_vfio_get_device_state_size(struct vfio_device *vdev,
+					  unsigned long *stop_copy_length)
+{
+	*stop_copy_length = PDS_LM_DEVICE_STATE_LENGTH;
+	return 0;
+}
+
+static const struct vfio_migration_ops pds_vfio_lm_ops = {
+	.migration_set_state = pds_vfio_set_device_state,
+	.migration_get_state = pds_vfio_get_device_state,
+	.migration_get_data_size = pds_vfio_get_device_state_size
+};
+
+static const struct vfio_log_ops pds_vfio_log_ops = {
+	.log_start = pds_vfio_dma_logging_start,
+	.log_stop = pds_vfio_dma_logging_stop,
+	.log_read_and_clear = pds_vfio_dma_logging_report,
+};
+
+static int pds_vfio_init_device(struct vfio_device *vdev)
+{
+	struct pds_vfio_pci_device *pds_vfio =
+		container_of(vdev, struct pds_vfio_pci_device,
+			     vfio_coredev.vdev);
+	struct pci_dev *pdev = to_pci_dev(vdev->dev);
+	int err, vf_id, pci_id;
+
+	vf_id = pci_iov_vf_id(pdev);
+	if (vf_id < 0)
+		return vf_id;
+
+	err = vfio_pci_core_init_dev(vdev);
+	if (err)
+		return err;
+
+	pds_vfio->vf_id = vf_id;
+
+	mutex_init(&pds_vfio->state_mutex);
+	mutex_init(&pds_vfio->reset_mutex);
+
+	vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
+	vdev->mig_ops = &pds_vfio_lm_ops;
+	vdev->log_ops = &pds_vfio_log_ops;
+
+	pci_id = PCI_DEVID(pdev->bus->number, pdev->devfn);
+	dev_dbg(&pdev->dev,
+		"%s: PF %#04x VF %#04x vf_id %d domain %d pds_vfio %p\n",
+		__func__, pci_dev_id(pci_physfn(pdev)), pci_id, vf_id,
+		pci_domain_nr(pdev->bus), pds_vfio);
+
+	return 0;
+}
+
+static void pds_vfio_release_device(struct vfio_device *vdev)
+{
+	struct pds_vfio_pci_device *pds_vfio =
+		container_of(vdev, struct pds_vfio_pci_device,
+			     vfio_coredev.vdev);
+
+	mutex_destroy(&pds_vfio->state_mutex);
+	mutex_destroy(&pds_vfio->reset_mutex);
+	vfio_pci_core_release_dev(vdev);
+}
+
+static int pds_vfio_open_device(struct vfio_device *vdev)
+{
+	struct pds_vfio_pci_device *pds_vfio =
+		container_of(vdev, struct pds_vfio_pci_device,
+			     vfio_coredev.vdev);
+	int err;
+
+	err = vfio_pci_core_enable(&pds_vfio->vfio_coredev);
+	if (err)
+		return err;
+
+	pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
+	pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
+
+	vfio_pci_core_finish_enable(&pds_vfio->vfio_coredev);
+
+	return 0;
+}
+
+static void pds_vfio_close_device(struct vfio_device *vdev)
+{
+	struct pds_vfio_pci_device *pds_vfio =
+		container_of(vdev, struct pds_vfio_pci_device,
+			     vfio_coredev.vdev);
+
+	mutex_lock(&pds_vfio->state_mutex);
+	pds_vfio_put_restore_file(pds_vfio);
+	pds_vfio_put_save_file(pds_vfio);
+	pds_vfio_dirty_disable(pds_vfio, true);
+	mutex_unlock(&pds_vfio->state_mutex);
+	vfio_pci_core_close_device(vdev);
+}
+
+static const struct vfio_device_ops pds_vfio_ops = {
+	.name = "pds-vfio",
+	.init = pds_vfio_init_device,
+	.release = pds_vfio_release_device,
+	.open_device = pds_vfio_open_device,
+	.close_device = pds_vfio_close_device,
+	.ioctl = vfio_pci_core_ioctl,
+	.device_feature = vfio_pci_core_ioctl_feature,
+	.read = vfio_pci_core_read,
+	.write = vfio_pci_core_write,
+	.mmap = vfio_pci_core_mmap,
+	.request = vfio_pci_core_request,
+	.match = vfio_pci_core_match,
+	.bind_iommufd = vfio_iommufd_physical_bind,
+	.unbind_iommufd = vfio_iommufd_physical_unbind,
+	.attach_ioas = vfio_iommufd_physical_attach_ioas,
+};
+
+const struct vfio_device_ops *pds_vfio_ops_info(void)
+{
+	return &pds_vfio_ops;
+}
diff --git a/drivers/vfio/pci/pds/vfio_dev.h b/drivers/vfio/pci/pds/vfio_dev.h
new file mode 100644
index 0000000000..e7b01080a1
--- /dev/null
+++ b/drivers/vfio/pci/pds/vfio_dev.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _VFIO_DEV_H_
+#define _VFIO_DEV_H_
+
+#include <linux/pci.h>
+#include <linux/vfio_pci_core.h>
+
+#include "dirty.h"
+#include "lm.h"
+
+struct pds_vfio_pci_device {
+	struct vfio_pci_core_device vfio_coredev;
+
+	struct pds_vfio_lm_file *save_file;
+	struct pds_vfio_lm_file *restore_file;
+	struct pds_vfio_dirty dirty;
+	struct mutex state_mutex; /* protect migration state */
+	enum vfio_device_mig_state state;
+	struct mutex reset_mutex; /* protect reset_done flow */
+	u8 deferred_reset;
+	enum vfio_device_mig_state deferred_reset_state;
+	struct notifier_block nb;
+
+	int vf_id;
+	u16 client_id;
+};
+
+void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio);
+
+const struct vfio_device_ops *pds_vfio_ops_info(void);
+struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev);
+void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio);
+
+struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio);
+struct device *pds_vfio_to_dev(struct pds_vfio_pci_device *pds_vfio);
+
+#endif /* _VFIO_DEV_H_ */
diff --git a/drivers/vfio/pci/trace.h b/drivers/vfio/pci/trace.h
new file mode 100644
index 0000000000..b2aa986ab9
--- /dev/null
+++ b/drivers/vfio/pci/trace.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * VFIO PCI mmap/mmap_fault tracepoints
+ *
+ * Copyright (C) 2018 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM vfio_pci
+
+#if !defined(_TRACE_VFIO_PCI_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_VFIO_PCI_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(vfio_pci_nvgpu_mmap_fault,
+	TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua,
+			vm_fault_t ret),
+	TP_ARGS(pdev, hpa, ua, ret),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(unsigned long, hpa)
+		__field(unsigned long, ua)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->name = dev_name(&pdev->dev),
+		__entry->hpa = hpa;
+		__entry->ua = ua;
+		__entry->ret = ret;
+	),
+
+	TP_printk("%s: %lx -> %lx ret=%d", __entry->name, __entry->hpa,
+			__entry->ua, __entry->ret)
+);
+
+TRACE_EVENT(vfio_pci_nvgpu_mmap,
+	TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua,
+			unsigned long size, int ret),
+	TP_ARGS(pdev, hpa, ua, size, ret),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(unsigned long, hpa)
+		__field(unsigned long, ua)
+		__field(unsigned long, size)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->name = dev_name(&pdev->dev),
+		__entry->hpa = hpa;
+		__entry->ua = ua;
+		__entry->size = size;
+		__entry->ret = ret;
+	),
+
+	TP_printk("%s: %lx -> %lx size=%lx ret=%d", __entry->name, __entry->hpa,
+			__entry->ua, __entry->size, __entry->ret)
+);
+
+TRACE_EVENT(vfio_pci_npu2_mmap,
+	TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua,
+			unsigned long size, int ret),
+	TP_ARGS(pdev, hpa, ua, size, ret),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(unsigned long, hpa)
+		__field(unsigned long, ua)
+		__field(unsigned long, size)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->name = dev_name(&pdev->dev),
+		__entry->hpa = hpa;
+		__entry->ua = ua;
+		__entry->size = size;
+		__entry->ret = ret;
+	),
+
+	TP_printk("%s: %lx -> %lx size=%lx ret=%d", __entry->name, __entry->hpa,
+			__entry->ua, __entry->size, __entry->ret)
+);
+
+#endif /* _TRACE_VFIO_PCI_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../drivers/vfio/pci
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
new file mode 100644
index 0000000000..cb5b7f865d
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ *
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/device.h>
+#include <linux/eventfd.h>
+#include <linux/file.h>
+#include <linux/interrupt.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+
+#include "vfio_pci_priv.h"
+
+#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
+#define DRIVER_DESC     "VFIO PCI - User Level meta-driver"
+
+static char ids[1024] __initdata;
+module_param_string(ids, ids, sizeof(ids), 0);
+MODULE_PARM_DESC(ids, "Initial PCI IDs to add to the vfio driver, format is \"vendor:device[:subvendor[:subdevice[:class[:class_mask]]]]\" and multiple comma separated entries can be specified");
+
+static bool nointxmask;
+module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(nointxmask,
+		  "Disable support for PCI 2.3 style INTx masking.  If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
+
+#ifdef CONFIG_VFIO_PCI_VGA
+static bool disable_vga;
+module_param(disable_vga, bool, S_IRUGO);
+MODULE_PARM_DESC(disable_vga, "Disable VGA resource access through vfio-pci");
+#endif
+
+static bool disable_idle_d3;
+module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(disable_idle_d3,
+		 "Disable using the PCI D3 low power state for idle, unused devices");
+
+static bool enable_sriov;
+#ifdef CONFIG_PCI_IOV
+module_param(enable_sriov, bool, 0644);
+MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV configuration.  Enabling SR-IOV on a PF typically requires support of the userspace PF driver, enabling VFs without such support may result in non-functional VFs or PF.");
+#endif
+
+static bool disable_denylist;
+module_param(disable_denylist, bool, 0444);
+MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users.");
+
+static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev)
+{
+	switch (pdev->vendor) {
+	case PCI_VENDOR_ID_INTEL:
+		switch (pdev->device) {
+		case PCI_DEVICE_ID_INTEL_QAT_C3XXX:
+		case PCI_DEVICE_ID_INTEL_QAT_C3XXX_VF:
+		case PCI_DEVICE_ID_INTEL_QAT_C62X:
+		case PCI_DEVICE_ID_INTEL_QAT_C62X_VF:
+		case PCI_DEVICE_ID_INTEL_QAT_DH895XCC:
+		case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF:
+			return true;
+		default:
+			return false;
+		}
+	}
+
+	return false;
+}
+
+static bool vfio_pci_is_denylisted(struct pci_dev *pdev)
+{
+	if (!vfio_pci_dev_in_denylist(pdev))
+		return false;
+
+	if (disable_denylist) {
+		pci_warn(pdev,
+			 "device denylist disabled - allowing device %04x:%04x.\n",
+			 pdev->vendor, pdev->device);
+		return false;
+	}
+
+	pci_warn(pdev, "%04x:%04x exists in vfio-pci device denylist, driver probing disallowed.\n",
+		 pdev->vendor, pdev->device);
+
+	return true;
+}
+
+static int vfio_pci_open_device(struct vfio_device *core_vdev)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(core_vdev, struct vfio_pci_core_device, vdev);
+	struct pci_dev *pdev = vdev->pdev;
+	int ret;
+
+	ret = vfio_pci_core_enable(vdev);
+	if (ret)
+		return ret;
+
+	if (vfio_pci_is_vga(pdev) &&
+	    pdev->vendor == PCI_VENDOR_ID_INTEL &&
+	    IS_ENABLED(CONFIG_VFIO_PCI_IGD)) {
+		ret = vfio_pci_igd_init(vdev);
+		if (ret && ret != -ENODEV) {
+			pci_warn(pdev, "Failed to setup Intel IGD regions\n");
+			vfio_pci_core_disable(vdev);
+			return ret;
+		}
+	}
+
+	vfio_pci_core_finish_enable(vdev);
+
+	return 0;
+}
+
+static const struct vfio_device_ops vfio_pci_ops = {
+	.name		= "vfio-pci",
+	.init		= vfio_pci_core_init_dev,
+	.release	= vfio_pci_core_release_dev,
+	.open_device	= vfio_pci_open_device,
+	.close_device	= vfio_pci_core_close_device,
+	.ioctl		= vfio_pci_core_ioctl,
+	.device_feature = vfio_pci_core_ioctl_feature,
+	.read		= vfio_pci_core_read,
+	.write		= vfio_pci_core_write,
+	.mmap		= vfio_pci_core_mmap,
+	.request	= vfio_pci_core_request,
+	.match		= vfio_pci_core_match,
+	.bind_iommufd	= vfio_iommufd_physical_bind,
+	.unbind_iommufd	= vfio_iommufd_physical_unbind,
+	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
+	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
+};
+
+static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct vfio_pci_core_device *vdev;
+	int ret;
+
+	if (vfio_pci_is_denylisted(pdev))
+		return -EINVAL;
+
+	vdev = vfio_alloc_device(vfio_pci_core_device, vdev, &pdev->dev,
+				 &vfio_pci_ops);
+	if (IS_ERR(vdev))
+		return PTR_ERR(vdev);
+
+	dev_set_drvdata(&pdev->dev, vdev);
+	ret = vfio_pci_core_register_device(vdev);
+	if (ret)
+		goto out_put_vdev;
+	return 0;
+
+out_put_vdev:
+	vfio_put_device(&vdev->vdev);
+	return ret;
+}
+
+static void vfio_pci_remove(struct pci_dev *pdev)
+{
+	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
+
+	vfio_pci_core_unregister_device(vdev);
+	vfio_put_device(&vdev->vdev);
+}
+
+static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
+{
+	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
+
+	if (!enable_sriov)
+		return -ENOENT;
+
+	return vfio_pci_core_sriov_configure(vdev, nr_virtfn);
+}
+
+static const struct pci_device_id vfio_pci_table[] = {
+	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_ANY_ID, PCI_ANY_ID) }, /* match all by default */
+	{}
+};
+
+MODULE_DEVICE_TABLE(pci, vfio_pci_table);
+
+static struct pci_driver vfio_pci_driver = {
+	.name			= "vfio-pci",
+	.id_table		= vfio_pci_table,
+	.probe			= vfio_pci_probe,
+	.remove			= vfio_pci_remove,
+	.sriov_configure	= vfio_pci_sriov_configure,
+	.err_handler		= &vfio_pci_core_err_handlers,
+	.driver_managed_dma	= true,
+};
+
+static void __init vfio_pci_fill_ids(void)
+{
+	char *p, *id;
+	int rc;
+
+	/* no ids passed actually */
+	if (ids[0] == '\0')
+		return;
+
+	/* add ids specified in the module parameter */
+	p = ids;
+	while ((id = strsep(&p, ","))) {
+		unsigned int vendor, device, subvendor = PCI_ANY_ID,
+			subdevice = PCI_ANY_ID, class = 0, class_mask = 0;
+		int fields;
+
+		if (!strlen(id))
+			continue;
+
+		fields = sscanf(id, "%x:%x:%x:%x:%x:%x",
+				&vendor, &device, &subvendor, &subdevice,
+				&class, &class_mask);
+
+		if (fields < 2) {
+			pr_warn("invalid id string \"%s\"\n", id);
+			continue;
+		}
+
+		rc = pci_add_dynid(&vfio_pci_driver, vendor, device,
+				   subvendor, subdevice, class, class_mask, 0);
+		if (rc)
+			pr_warn("failed to add dynamic id [%04x:%04x[%04x:%04x]] class %#08x/%08x (%d)\n",
+				vendor, device, subvendor, subdevice,
+				class, class_mask, rc);
+		else
+			pr_info("add [%04x:%04x[%04x:%04x]] class %#08x/%08x\n",
+				vendor, device, subvendor, subdevice,
+				class, class_mask);
+	}
+}
+
+static int __init vfio_pci_init(void)
+{
+	int ret;
+	bool is_disable_vga = true;
+
+#ifdef CONFIG_VFIO_PCI_VGA
+	is_disable_vga = disable_vga;
+#endif
+
+	vfio_pci_core_set_params(nointxmask, is_disable_vga, disable_idle_d3);
+
+	/* Register and scan for devices */
+	ret = pci_register_driver(&vfio_pci_driver);
+	if (ret)
+		return ret;
+
+	vfio_pci_fill_ids();
+
+	if (disable_denylist)
+		pr_warn("device denylist disabled.\n");
+
+	return 0;
+}
+module_init(vfio_pci_init);
+
+static void __exit vfio_pci_cleanup(void)
+{
+	pci_unregister_driver(&vfio_pci_driver);
+}
+module_exit(vfio_pci_cleanup);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
new file mode 100644
index 0000000000..7e2e62ab08
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -0,0 +1,1968 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VFIO PCI config space virtualization
+ *
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ */
+
+/*
+ * This code handles reading and writing of PCI configuration registers.
+ * This is hairy because we want to allow a lot of flexibility to the
+ * user driver, but cannot trust it with all of the config fields.
+ * Tables determine which fields can be read and written, as well as
+ * which fields are 'virtualized' - special actions and translations to
+ * make it appear to the user that he has control, when in fact things
+ * must be negotiated with the underlying OS.
+ */
+
+#include <linux/fs.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/slab.h>
+
+#include "vfio_pci_priv.h"
+
+/* Fake capability ID for standard config space */
+#define PCI_CAP_ID_BASIC	0
+
+#define is_bar(offset)	\
+	((offset >= PCI_BASE_ADDRESS_0 && offset < PCI_BASE_ADDRESS_5 + 4) || \
+	 (offset >= PCI_ROM_ADDRESS && offset < PCI_ROM_ADDRESS + 4))
+
+/*
+ * Lengths of PCI Config Capabilities
+ *   0: Removed from the user visible capability list
+ *   FF: Variable length
+ */
+static const u8 pci_cap_length[PCI_CAP_ID_MAX + 1] = {
+	[PCI_CAP_ID_BASIC]	= PCI_STD_HEADER_SIZEOF, /* pci config header */
+	[PCI_CAP_ID_PM]		= PCI_PM_SIZEOF,
+	[PCI_CAP_ID_AGP]	= PCI_AGP_SIZEOF,
+	[PCI_CAP_ID_VPD]	= PCI_CAP_VPD_SIZEOF,
+	[PCI_CAP_ID_SLOTID]	= 0,		/* bridge - don't care */
+	[PCI_CAP_ID_MSI]	= 0xFF,		/* 10, 14, 20, or 24 */
+	[PCI_CAP_ID_CHSWP]	= 0,		/* cpci - not yet */
+	[PCI_CAP_ID_PCIX]	= 0xFF,		/* 8 or 24 */
+	[PCI_CAP_ID_HT]		= 0xFF,		/* hypertransport */
+	[PCI_CAP_ID_VNDR]	= 0xFF,		/* variable */
+	[PCI_CAP_ID_DBG]	= 0,		/* debug - don't care */
+	[PCI_CAP_ID_CCRC]	= 0,		/* cpci - not yet */
+	[PCI_CAP_ID_SHPC]	= 0,		/* hotswap - not yet */
+	[PCI_CAP_ID_SSVID]	= 0,		/* bridge - don't care */
+	[PCI_CAP_ID_AGP3]	= 0,		/* AGP8x - not yet */
+	[PCI_CAP_ID_SECDEV]	= 0,		/* secure device not yet */
+	[PCI_CAP_ID_EXP]	= 0xFF,		/* 20 or 44 */
+	[PCI_CAP_ID_MSIX]	= PCI_CAP_MSIX_SIZEOF,
+	[PCI_CAP_ID_SATA]	= 0xFF,
+	[PCI_CAP_ID_AF]		= PCI_CAP_AF_SIZEOF,
+};
+
+/*
+ * Lengths of PCIe/PCI-X Extended Config Capabilities
+ *   0: Removed or masked from the user visible capability list
+ *   FF: Variable length
+ */
+static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = {
+	[PCI_EXT_CAP_ID_ERR]	=	PCI_ERR_ROOT_COMMAND,
+	[PCI_EXT_CAP_ID_VC]	=	0xFF,
+	[PCI_EXT_CAP_ID_DSN]	=	PCI_EXT_CAP_DSN_SIZEOF,
+	[PCI_EXT_CAP_ID_PWR]	=	PCI_EXT_CAP_PWR_SIZEOF,
+	[PCI_EXT_CAP_ID_RCLD]	=	0,	/* root only - don't care */
+	[PCI_EXT_CAP_ID_RCILC]	=	0,	/* root only - don't care */
+	[PCI_EXT_CAP_ID_RCEC]	=	0,	/* root only - don't care */
+	[PCI_EXT_CAP_ID_MFVC]	=	0xFF,
+	[PCI_EXT_CAP_ID_VC9]	=	0xFF,	/* same as CAP_ID_VC */
+	[PCI_EXT_CAP_ID_RCRB]	=	0,	/* root only - don't care */
+	[PCI_EXT_CAP_ID_VNDR]	=	0xFF,
+	[PCI_EXT_CAP_ID_CAC]	=	0,	/* obsolete */
+	[PCI_EXT_CAP_ID_ACS]	=	0xFF,
+	[PCI_EXT_CAP_ID_ARI]	=	PCI_EXT_CAP_ARI_SIZEOF,
+	[PCI_EXT_CAP_ID_ATS]	=	PCI_EXT_CAP_ATS_SIZEOF,
+	[PCI_EXT_CAP_ID_SRIOV]	=	PCI_EXT_CAP_SRIOV_SIZEOF,
+	[PCI_EXT_CAP_ID_MRIOV]	=	0,	/* not yet */
+	[PCI_EXT_CAP_ID_MCAST]	=	PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF,
+	[PCI_EXT_CAP_ID_PRI]	=	PCI_EXT_CAP_PRI_SIZEOF,
+	[PCI_EXT_CAP_ID_AMD_XXX] =	0,	/* not yet */
+	[PCI_EXT_CAP_ID_REBAR]	=	0xFF,
+	[PCI_EXT_CAP_ID_DPA]	=	0xFF,
+	[PCI_EXT_CAP_ID_TPH]	=	0xFF,
+	[PCI_EXT_CAP_ID_LTR]	=	PCI_EXT_CAP_LTR_SIZEOF,
+	[PCI_EXT_CAP_ID_SECPCI]	=	0,	/* not yet */
+	[PCI_EXT_CAP_ID_PMUX]	=	0,	/* not yet */
+	[PCI_EXT_CAP_ID_PASID]	=	0,	/* not yet */
+	[PCI_EXT_CAP_ID_DVSEC]	=	0xFF,
+};
+
+/*
+ * Read/Write Permission Bits - one bit for each bit in capability
+ * Any field can be read if it exists, but what is read depends on
+ * whether the field is 'virtualized', or just pass through to the
+ * hardware.  Any virtualized field is also virtualized for writes.
+ * Writes are only permitted if they have a 1 bit here.
+ */
+struct perm_bits {
+	u8	*virt;		/* read/write virtual data, not hw */
+	u8	*write;		/* writeable bits */
+	int	(*readfn)(struct vfio_pci_core_device *vdev, int pos, int count,
+			  struct perm_bits *perm, int offset, __le32 *val);
+	int	(*writefn)(struct vfio_pci_core_device *vdev, int pos, int count,
+			   struct perm_bits *perm, int offset, __le32 val);
+};
+
+#define	NO_VIRT		0
+#define	ALL_VIRT	0xFFFFFFFFU
+#define	NO_WRITE	0
+#define	ALL_WRITE	0xFFFFFFFFU
+
+static int vfio_user_config_read(struct pci_dev *pdev, int offset,
+				 __le32 *val, int count)
+{
+	int ret = -EINVAL;
+	u32 tmp_val = 0;
+
+	switch (count) {
+	case 1:
+	{
+		u8 tmp;
+		ret = pci_user_read_config_byte(pdev, offset, &tmp);
+		tmp_val = tmp;
+		break;
+	}
+	case 2:
+	{
+		u16 tmp;
+		ret = pci_user_read_config_word(pdev, offset, &tmp);
+		tmp_val = tmp;
+		break;
+	}
+	case 4:
+		ret = pci_user_read_config_dword(pdev, offset, &tmp_val);
+		break;
+	}
+
+	*val = cpu_to_le32(tmp_val);
+
+	return ret;
+}
+
+static int vfio_user_config_write(struct pci_dev *pdev, int offset,
+				  __le32 val, int count)
+{
+	int ret = -EINVAL;
+	u32 tmp_val = le32_to_cpu(val);
+
+	switch (count) {
+	case 1:
+		ret = pci_user_write_config_byte(pdev, offset, tmp_val);
+		break;
+	case 2:
+		ret = pci_user_write_config_word(pdev, offset, tmp_val);
+		break;
+	case 4:
+		ret = pci_user_write_config_dword(pdev, offset, tmp_val);
+		break;
+	}
+
+	return ret;
+}
+
+static int vfio_default_config_read(struct vfio_pci_core_device *vdev, int pos,
+				    int count, struct perm_bits *perm,
+				    int offset, __le32 *val)
+{
+	__le32 virt = 0;
+
+	memcpy(val, vdev->vconfig + pos, count);
+
+	memcpy(&virt, perm->virt + offset, count);
+
+	/* Any non-virtualized bits? */
+	if (cpu_to_le32(~0U >> (32 - (count * 8))) != virt) {
+		struct pci_dev *pdev = vdev->pdev;
+		__le32 phys_val = 0;
+		int ret;
+
+		ret = vfio_user_config_read(pdev, pos, &phys_val, count);
+		if (ret)
+			return ret;
+
+		*val = (phys_val & ~virt) | (*val & virt);
+	}
+
+	return count;
+}
+
+static int vfio_default_config_write(struct vfio_pci_core_device *vdev, int pos,
+				     int count, struct perm_bits *perm,
+				     int offset, __le32 val)
+{
+	__le32 virt = 0, write = 0;
+
+	memcpy(&write, perm->write + offset, count);
+
+	if (!write)
+		return count; /* drop, no writable bits */
+
+	memcpy(&virt, perm->virt + offset, count);
+
+	/* Virtualized and writable bits go to vconfig */
+	if (write & virt) {
+		__le32 virt_val = 0;
+
+		memcpy(&virt_val, vdev->vconfig + pos, count);
+
+		virt_val &= ~(write & virt);
+		virt_val |= (val & (write & virt));
+
+		memcpy(vdev->vconfig + pos, &virt_val, count);
+	}
+
+	/* Non-virtualized and writable bits go to hardware */
+	if (write & ~virt) {
+		struct pci_dev *pdev = vdev->pdev;
+		__le32 phys_val = 0;
+		int ret;
+
+		ret = vfio_user_config_read(pdev, pos, &phys_val, count);
+		if (ret)
+			return ret;
+
+		phys_val &= ~(write & ~virt);
+		phys_val |= (val & (write & ~virt));
+
+		ret = vfio_user_config_write(pdev, pos, phys_val, count);
+		if (ret)
+			return ret;
+	}
+
+	return count;
+}
+
+/* Allow direct read from hardware, except for capability next pointer */
+static int vfio_direct_config_read(struct vfio_pci_core_device *vdev, int pos,
+				   int count, struct perm_bits *perm,
+				   int offset, __le32 *val)
+{
+	int ret;
+
+	ret = vfio_user_config_read(vdev->pdev, pos, val, count);
+	if (ret)
+		return ret;
+
+	if (pos >= PCI_CFG_SPACE_SIZE) { /* Extended cap header mangling */
+		if (offset < 4)
+			memcpy(val, vdev->vconfig + pos, count);
+	} else if (pos >= PCI_STD_HEADER_SIZEOF) { /* Std cap mangling */
+		if (offset == PCI_CAP_LIST_ID && count > 1)
+			memcpy(val, vdev->vconfig + pos,
+			       min(PCI_CAP_FLAGS, count));
+		else if (offset == PCI_CAP_LIST_NEXT)
+			memcpy(val, vdev->vconfig + pos, 1);
+	}
+
+	return count;
+}
+
+/* Raw access skips any kind of virtualization */
+static int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos,
+				 int count, struct perm_bits *perm,
+				 int offset, __le32 val)
+{
+	int ret;
+
+	ret = vfio_user_config_write(vdev->pdev, pos, val, count);
+	if (ret)
+		return ret;
+
+	return count;
+}
+
+static int vfio_raw_config_read(struct vfio_pci_core_device *vdev, int pos,
+				int count, struct perm_bits *perm,
+				int offset, __le32 *val)
+{
+	int ret;
+
+	ret = vfio_user_config_read(vdev->pdev, pos, val, count);
+	if (ret)
+		return ret;
+
+	return count;
+}
+
+/* Virt access uses only virtualization */
+static int vfio_virt_config_write(struct vfio_pci_core_device *vdev, int pos,
+				  int count, struct perm_bits *perm,
+				  int offset, __le32 val)
+{
+	memcpy(vdev->vconfig + pos, &val, count);
+	return count;
+}
+
+static int vfio_virt_config_read(struct vfio_pci_core_device *vdev, int pos,
+				 int count, struct perm_bits *perm,
+				 int offset, __le32 *val)
+{
+	memcpy(val, vdev->vconfig + pos, count);
+	return count;
+}
+
+/* Default capability regions to read-only, no-virtualization */
+static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = {
+	[0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
+};
+static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = {
+	[0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
+};
+/*
+ * Default unassigned regions to raw read-write access.  Some devices
+ * require this to function as they hide registers between the gaps in
+ * config space (be2net).  Like MMIO and I/O port registers, we have
+ * to trust the hardware isolation.
+ */
+static struct perm_bits unassigned_perms = {
+	.readfn = vfio_raw_config_read,
+	.writefn = vfio_raw_config_write
+};
+
+static struct perm_bits virt_perms = {
+	.readfn = vfio_virt_config_read,
+	.writefn = vfio_virt_config_write
+};
+
+static void free_perm_bits(struct perm_bits *perm)
+{
+	kfree(perm->virt);
+	kfree(perm->write);
+	perm->virt = NULL;
+	perm->write = NULL;
+}
+
+static int alloc_perm_bits(struct perm_bits *perm, int size)
+{
+	/*
+	 * Round up all permission bits to the next dword, this lets us
+	 * ignore whether a read/write exceeds the defined capability
+	 * structure.  We can do this because:
+	 *  - Standard config space is already dword aligned
+	 *  - Capabilities are all dword aligned (bits 0:1 of next reserved)
+	 *  - Express capabilities defined as dword aligned
+	 */
+	size = round_up(size, 4);
+
+	/*
+	 * Zero state is
+	 * - All Readable, None Writeable, None Virtualized
+	 */
+	perm->virt = kzalloc(size, GFP_KERNEL);
+	perm->write = kzalloc(size, GFP_KERNEL);
+	if (!perm->virt || !perm->write) {
+		free_perm_bits(perm);
+		return -ENOMEM;
+	}
+
+	perm->readfn = vfio_default_config_read;
+	perm->writefn = vfio_default_config_write;
+
+	return 0;
+}
+
+/*
+ * Helper functions for filling in permission tables
+ */
+static inline void p_setb(struct perm_bits *p, int off, u8 virt, u8 write)
+{
+	p->virt[off] = virt;
+	p->write[off] = write;
+}
+
+/* Handle endian-ness - pci and tables are little-endian */
+static inline void p_setw(struct perm_bits *p, int off, u16 virt, u16 write)
+{
+	*(__le16 *)(&p->virt[off]) = cpu_to_le16(virt);
+	*(__le16 *)(&p->write[off]) = cpu_to_le16(write);
+}
+
+/* Handle endian-ness - pci and tables are little-endian */
+static inline void p_setd(struct perm_bits *p, int off, u32 virt, u32 write)
+{
+	*(__le32 *)(&p->virt[off]) = cpu_to_le32(virt);
+	*(__le32 *)(&p->write[off]) = cpu_to_le32(write);
+}
+
+/* Caller should hold memory_lock semaphore */
+bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]);
+
+	/*
+	 * Memory region cannot be accessed if device power state is D3.
+	 *
+	 * SR-IOV VF memory enable is handled by the MSE bit in the
+	 * PF SR-IOV capability, there's therefore no need to trigger
+	 * faults based on the virtual value.
+	 */
+	return pdev->current_state < PCI_D3hot &&
+	       (pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY));
+}
+
+/*
+ * Restore the *real* BARs after we detect a FLR or backdoor reset.
+ * (backdoor = some device specific technique that we didn't catch)
+ */
+static void vfio_bar_restore(struct vfio_pci_core_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	u32 *rbar = vdev->rbar;
+	u16 cmd;
+	int i;
+
+	if (pdev->is_virtfn)
+		return;
+
+	pci_info(pdev, "%s: reset recovery - restoring BARs\n", __func__);
+
+	for (i = PCI_BASE_ADDRESS_0; i <= PCI_BASE_ADDRESS_5; i += 4, rbar++)
+		pci_user_write_config_dword(pdev, i, *rbar);
+
+	pci_user_write_config_dword(pdev, PCI_ROM_ADDRESS, *rbar);
+
+	if (vdev->nointx) {
+		pci_user_read_config_word(pdev, PCI_COMMAND, &cmd);
+		cmd |= PCI_COMMAND_INTX_DISABLE;
+		pci_user_write_config_word(pdev, PCI_COMMAND, cmd);
+	}
+}
+
+static __le32 vfio_generate_bar_flags(struct pci_dev *pdev, int bar)
+{
+	unsigned long flags = pci_resource_flags(pdev, bar);
+	u32 val;
+
+	if (flags & IORESOURCE_IO)
+		return cpu_to_le32(PCI_BASE_ADDRESS_SPACE_IO);
+
+	val = PCI_BASE_ADDRESS_SPACE_MEMORY;
+
+	if (flags & IORESOURCE_PREFETCH)
+		val |= PCI_BASE_ADDRESS_MEM_PREFETCH;
+
+	if (flags & IORESOURCE_MEM_64)
+		val |= PCI_BASE_ADDRESS_MEM_TYPE_64;
+
+	return cpu_to_le32(val);
+}
+
+/*
+ * Pretend we're hardware and tweak the values of the *virtual* PCI BARs
+ * to reflect the hardware capabilities.  This implements BAR sizing.
+ */
+static void vfio_bar_fixup(struct vfio_pci_core_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	int i;
+	__le32 *vbar;
+	u64 mask;
+
+	if (!vdev->bardirty)
+		return;
+
+	vbar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0];
+
+	for (i = 0; i < PCI_STD_NUM_BARS; i++, vbar++) {
+		int bar = i + PCI_STD_RESOURCES;
+
+		if (!pci_resource_start(pdev, bar)) {
+			*vbar = 0; /* Unmapped by host = unimplemented to user */
+			continue;
+		}
+
+		mask = ~(pci_resource_len(pdev, bar) - 1);
+
+		*vbar &= cpu_to_le32((u32)mask);
+		*vbar |= vfio_generate_bar_flags(pdev, bar);
+
+		if (*vbar & cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64)) {
+			vbar++;
+			*vbar &= cpu_to_le32((u32)(mask >> 32));
+			i++;
+		}
+	}
+
+	vbar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS];
+
+	/*
+	 * NB. REGION_INFO will have reported zero size if we weren't able
+	 * to read the ROM, but we still return the actual BAR size here if
+	 * it exists (or the shadow ROM space).
+	 */
+	if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) {
+		mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1);
+		mask |= PCI_ROM_ADDRESS_ENABLE;
+		*vbar &= cpu_to_le32((u32)mask);
+	} else if (pdev->resource[PCI_ROM_RESOURCE].flags &
+					IORESOURCE_ROM_SHADOW) {
+		mask = ~(0x20000 - 1);
+		mask |= PCI_ROM_ADDRESS_ENABLE;
+		*vbar &= cpu_to_le32((u32)mask);
+	} else
+		*vbar = 0;
+
+	vdev->bardirty = false;
+}
+
+static int vfio_basic_config_read(struct vfio_pci_core_device *vdev, int pos,
+				  int count, struct perm_bits *perm,
+				  int offset, __le32 *val)
+{
+	if (is_bar(offset)) /* pos == offset for basic config */
+		vfio_bar_fixup(vdev);
+
+	count = vfio_default_config_read(vdev, pos, count, perm, offset, val);
+
+	/* Mask in virtual memory enable */
+	if (offset == PCI_COMMAND && vdev->pdev->no_command_memory) {
+		u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]);
+		u32 tmp_val = le32_to_cpu(*val);
+
+		tmp_val |= cmd & PCI_COMMAND_MEMORY;
+		*val = cpu_to_le32(tmp_val);
+	}
+
+	return count;
+}
+
+/* Test whether BARs match the value we think they should contain */
+static bool vfio_need_bar_restore(struct vfio_pci_core_device *vdev)
+{
+	int i = 0, pos = PCI_BASE_ADDRESS_0, ret;
+	u32 bar;
+
+	for (; pos <= PCI_BASE_ADDRESS_5; i++, pos += 4) {
+		if (vdev->rbar[i]) {
+			ret = pci_user_read_config_dword(vdev->pdev, pos, &bar);
+			if (ret || vdev->rbar[i] != bar)
+				return true;
+		}
+	}
+
+	return false;
+}
+
+static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
+				   int count, struct perm_bits *perm,
+				   int offset, __le32 val)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	__le16 *virt_cmd;
+	u16 new_cmd = 0;
+	int ret;
+
+	virt_cmd = (__le16 *)&vdev->vconfig[PCI_COMMAND];
+
+	if (offset == PCI_COMMAND) {
+		bool phys_mem, virt_mem, new_mem, phys_io, virt_io, new_io;
+		u16 phys_cmd;
+
+		ret = pci_user_read_config_word(pdev, PCI_COMMAND, &phys_cmd);
+		if (ret)
+			return ret;
+
+		new_cmd = le32_to_cpu(val);
+
+		phys_io = !!(phys_cmd & PCI_COMMAND_IO);
+		virt_io = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_IO);
+		new_io = !!(new_cmd & PCI_COMMAND_IO);
+
+		phys_mem = !!(phys_cmd & PCI_COMMAND_MEMORY);
+		virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY);
+		new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);
+
+		if (!new_mem)
+			vfio_pci_zap_and_down_write_memory_lock(vdev);
+		else
+			down_write(&vdev->memory_lock);
+
+		/*
+		 * If the user is writing mem/io enable (new_mem/io) and we
+		 * think it's already enabled (virt_mem/io), but the hardware
+		 * shows it disabled (phys_mem/io, then the device has
+		 * undergone some kind of backdoor reset and needs to be
+		 * restored before we allow it to enable the bars.
+		 * SR-IOV devices will trigger this - for mem enable let's
+		 * catch this now and for io enable it will be caught later
+		 */
+		if ((new_mem && virt_mem && !phys_mem &&
+		     !pdev->no_command_memory) ||
+		    (new_io && virt_io && !phys_io) ||
+		    vfio_need_bar_restore(vdev))
+			vfio_bar_restore(vdev);
+	}
+
+	count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
+	if (count < 0) {
+		if (offset == PCI_COMMAND)
+			up_write(&vdev->memory_lock);
+		return count;
+	}
+
+	/*
+	 * Save current memory/io enable bits in vconfig to allow for
+	 * the test above next time.
+	 */
+	if (offset == PCI_COMMAND) {
+		u16 mask = PCI_COMMAND_MEMORY | PCI_COMMAND_IO;
+
+		*virt_cmd &= cpu_to_le16(~mask);
+		*virt_cmd |= cpu_to_le16(new_cmd & mask);
+
+		up_write(&vdev->memory_lock);
+	}
+
+	/* Emulate INTx disable */
+	if (offset >= PCI_COMMAND && offset <= PCI_COMMAND + 1) {
+		bool virt_intx_disable;
+
+		virt_intx_disable = !!(le16_to_cpu(*virt_cmd) &
+				       PCI_COMMAND_INTX_DISABLE);
+
+		if (virt_intx_disable && !vdev->virq_disabled) {
+			vdev->virq_disabled = true;
+			vfio_pci_intx_mask(vdev);
+		} else if (!virt_intx_disable && vdev->virq_disabled) {
+			vdev->virq_disabled = false;
+			vfio_pci_intx_unmask(vdev);
+		}
+	}
+
+	if (is_bar(offset))
+		vdev->bardirty = true;
+
+	return count;
+}
+
+/* Permissions for the Basic PCI Header */
+static int __init init_pci_cap_basic_perm(struct perm_bits *perm)
+{
+	if (alloc_perm_bits(perm, PCI_STD_HEADER_SIZEOF))
+		return -ENOMEM;
+
+	perm->readfn = vfio_basic_config_read;
+	perm->writefn = vfio_basic_config_write;
+
+	/* Virtualized for SR-IOV functions, which just have FFFF */
+	p_setw(perm, PCI_VENDOR_ID, (u16)ALL_VIRT, NO_WRITE);
+	p_setw(perm, PCI_DEVICE_ID, (u16)ALL_VIRT, NO_WRITE);
+
+	/*
+	 * Virtualize INTx disable, we use it internally for interrupt
+	 * control and can emulate it for non-PCI 2.3 devices.
+	 */
+	p_setw(perm, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE, (u16)ALL_WRITE);
+
+	/* Virtualize capability list, we might want to skip/disable */
+	p_setw(perm, PCI_STATUS, PCI_STATUS_CAP_LIST, NO_WRITE);
+
+	/* No harm to write */
+	p_setb(perm, PCI_CACHE_LINE_SIZE, NO_VIRT, (u8)ALL_WRITE);
+	p_setb(perm, PCI_LATENCY_TIMER, NO_VIRT, (u8)ALL_WRITE);
+	p_setb(perm, PCI_BIST, NO_VIRT, (u8)ALL_WRITE);
+
+	/* Virtualize all bars, can't touch the real ones */
+	p_setd(perm, PCI_BASE_ADDRESS_0, ALL_VIRT, ALL_WRITE);
+	p_setd(perm, PCI_BASE_ADDRESS_1, ALL_VIRT, ALL_WRITE);
+	p_setd(perm, PCI_BASE_ADDRESS_2, ALL_VIRT, ALL_WRITE);
+	p_setd(perm, PCI_BASE_ADDRESS_3, ALL_VIRT, ALL_WRITE);
+	p_setd(perm, PCI_BASE_ADDRESS_4, ALL_VIRT, ALL_WRITE);
+	p_setd(perm, PCI_BASE_ADDRESS_5, ALL_VIRT, ALL_WRITE);
+	p_setd(perm, PCI_ROM_ADDRESS, ALL_VIRT, ALL_WRITE);
+
+	/* Allow us to adjust capability chain */
+	p_setb(perm, PCI_CAPABILITY_LIST, (u8)ALL_VIRT, NO_WRITE);
+
+	/* Sometimes used by sw, just virtualize */
+	p_setb(perm, PCI_INTERRUPT_LINE, (u8)ALL_VIRT, (u8)ALL_WRITE);
+
+	/* Virtualize interrupt pin to allow hiding INTx */
+	p_setb(perm, PCI_INTERRUPT_PIN, (u8)ALL_VIRT, (u8)NO_WRITE);
+
+	return 0;
+}
+
+/*
+ * It takes all the required locks to protect the access of power related
+ * variables and then invokes vfio_pci_set_power_state().
+ */
+static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev,
+					  pci_power_t state)
+{
+	if (state >= PCI_D3hot)
+		vfio_pci_zap_and_down_write_memory_lock(vdev);
+	else
+		down_write(&vdev->memory_lock);
+
+	vfio_pci_set_power_state(vdev, state);
+	up_write(&vdev->memory_lock);
+}
+
+static int vfio_pm_config_write(struct vfio_pci_core_device *vdev, int pos,
+				int count, struct perm_bits *perm,
+				int offset, __le32 val)
+{
+	count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
+	if (count < 0)
+		return count;
+
+	if (offset == PCI_PM_CTRL) {
+		pci_power_t state;
+
+		switch (le32_to_cpu(val) & PCI_PM_CTRL_STATE_MASK) {
+		case 0:
+			state = PCI_D0;
+			break;
+		case 1:
+			state = PCI_D1;
+			break;
+		case 2:
+			state = PCI_D2;
+			break;
+		case 3:
+			state = PCI_D3hot;
+			break;
+		}
+
+		vfio_lock_and_set_power_state(vdev, state);
+	}
+
+	return count;
+}
+
+/* Permissions for the Power Management capability */
+static int __init init_pci_cap_pm_perm(struct perm_bits *perm)
+{
+	if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_PM]))
+		return -ENOMEM;
+
+	perm->writefn = vfio_pm_config_write;
+
+	/*
+	 * We always virtualize the next field so we can remove
+	 * capabilities from the chain if we want to.
+	 */
+	p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
+
+	/*
+	 * The guests can't process PME events. If any PME event will be
+	 * generated, then it will be mostly handled in the host and the
+	 * host will clear the PME_STATUS. So virtualize PME_Support bits.
+	 * The vconfig bits will be cleared during device capability
+	 * initialization.
+	 */
+	p_setw(perm, PCI_PM_PMC, PCI_PM_CAP_PME_MASK, NO_WRITE);
+
+	/*
+	 * Power management is defined *per function*, so we can let
+	 * the user change power state, but we trap and initiate the
+	 * change ourselves, so the state bits are read-only.
+	 *
+	 * The guest can't process PME from D3cold so virtualize PME_Status
+	 * and PME_En bits. The vconfig bits will be cleared during device
+	 * capability initialization.
+	 */
+	p_setd(perm, PCI_PM_CTRL,
+	       PCI_PM_CTRL_PME_ENABLE | PCI_PM_CTRL_PME_STATUS,
+	       ~(PCI_PM_CTRL_PME_ENABLE | PCI_PM_CTRL_PME_STATUS |
+		 PCI_PM_CTRL_STATE_MASK));
+
+	return 0;
+}
+
+static int vfio_vpd_config_write(struct vfio_pci_core_device *vdev, int pos,
+				 int count, struct perm_bits *perm,
+				 int offset, __le32 val)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	__le16 *paddr = (__le16 *)(vdev->vconfig + pos - offset + PCI_VPD_ADDR);
+	__le32 *pdata = (__le32 *)(vdev->vconfig + pos - offset + PCI_VPD_DATA);
+	u16 addr;
+	u32 data;
+
+	/*
+	 * Write through to emulation.  If the write includes the upper byte
+	 * of PCI_VPD_ADDR, then the PCI_VPD_ADDR_F bit is written and we
+	 * have work to do.
+	 */
+	count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
+	if (count < 0 || offset > PCI_VPD_ADDR + 1 ||
+	    offset + count <= PCI_VPD_ADDR + 1)
+		return count;
+
+	addr = le16_to_cpu(*paddr);
+
+	if (addr & PCI_VPD_ADDR_F) {
+		data = le32_to_cpu(*pdata);
+		if (pci_write_vpd(pdev, addr & ~PCI_VPD_ADDR_F, 4, &data) != 4)
+			return count;
+	} else {
+		data = 0;
+		if (pci_read_vpd(pdev, addr, 4, &data) < 0)
+			return count;
+		*pdata = cpu_to_le32(data);
+	}
+
+	/*
+	 * Toggle PCI_VPD_ADDR_F in the emulated PCI_VPD_ADDR register to
+	 * signal completion.  If an error occurs above, we assume that not
+	 * toggling this bit will induce a driver timeout.
+	 */
+	addr ^= PCI_VPD_ADDR_F;
+	*paddr = cpu_to_le16(addr);
+
+	return count;
+}
+
+/* Permissions for Vital Product Data capability */
+static int __init init_pci_cap_vpd_perm(struct perm_bits *perm)
+{
+	if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_VPD]))
+		return -ENOMEM;
+
+	perm->writefn = vfio_vpd_config_write;
+
+	/*
+	 * We always virtualize the next field so we can remove
+	 * capabilities from the chain if we want to.
+	 */
+	p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
+
+	/*
+	 * Both the address and data registers are virtualized to
+	 * enable access through the pci_vpd_read/write functions
+	 */
+	p_setw(perm, PCI_VPD_ADDR, (u16)ALL_VIRT, (u16)ALL_WRITE);
+	p_setd(perm, PCI_VPD_DATA, ALL_VIRT, ALL_WRITE);
+
+	return 0;
+}
+
+/* Permissions for PCI-X capability */
+static int __init init_pci_cap_pcix_perm(struct perm_bits *perm)
+{
+	/* Alloc 24, but only 8 are used in v0 */
+	if (alloc_perm_bits(perm, PCI_CAP_PCIX_SIZEOF_V2))
+		return -ENOMEM;
+
+	p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
+
+	p_setw(perm, PCI_X_CMD, NO_VIRT, (u16)ALL_WRITE);
+	p_setd(perm, PCI_X_ECC_CSR, NO_VIRT, ALL_WRITE);
+	return 0;
+}
+
+static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos,
+				 int count, struct perm_bits *perm,
+				 int offset, __le32 val)
+{
+	__le16 *ctrl = (__le16 *)(vdev->vconfig + pos -
+				  offset + PCI_EXP_DEVCTL);
+	int readrq = le16_to_cpu(*ctrl) & PCI_EXP_DEVCTL_READRQ;
+
+	count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
+	if (count < 0)
+		return count;
+
+	/*
+	 * The FLR bit is virtualized, if set and the device supports PCIe
+	 * FLR, issue a reset_function.  Regardless, clear the bit, the spec
+	 * requires it to be always read as zero.  NB, reset_function might
+	 * not use a PCIe FLR, we don't have that level of granularity.
+	 */
+	if (*ctrl & cpu_to_le16(PCI_EXP_DEVCTL_BCR_FLR)) {
+		u32 cap;
+		int ret;
+
+		*ctrl &= ~cpu_to_le16(PCI_EXP_DEVCTL_BCR_FLR);
+
+		ret = pci_user_read_config_dword(vdev->pdev,
+						 pos - offset + PCI_EXP_DEVCAP,
+						 &cap);
+
+		if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) {
+			vfio_pci_zap_and_down_write_memory_lock(vdev);
+			pci_try_reset_function(vdev->pdev);
+			up_write(&vdev->memory_lock);
+		}
+	}
+
+	/*
+	 * MPS is virtualized to the user, writes do not change the physical
+	 * register since determining a proper MPS value requires a system wide
+	 * device view.  The MRRS is largely independent of MPS, but since the
+	 * user does not have that system-wide view, they might set a safe, but
+	 * inefficiently low value.  Here we allow writes through to hardware,
+	 * but we set the floor to the physical device MPS setting, so that
+	 * we can at least use full TLPs, as defined by the MPS value.
+	 *
+	 * NB, if any devices actually depend on an artificially low MRRS
+	 * setting, this will need to be revisited, perhaps with a quirk
+	 * though pcie_set_readrq().
+	 */
+	if (readrq != (le16_to_cpu(*ctrl) & PCI_EXP_DEVCTL_READRQ)) {
+		readrq = 128 <<
+			((le16_to_cpu(*ctrl) & PCI_EXP_DEVCTL_READRQ) >> 12);
+		readrq = max(readrq, pcie_get_mps(vdev->pdev));
+
+		pcie_set_readrq(vdev->pdev, readrq);
+	}
+
+	return count;
+}
+
+/* Permissions for PCI Express capability */
+static int __init init_pci_cap_exp_perm(struct perm_bits *perm)
+{
+	/* Alloc largest of possible sizes */
+	if (alloc_perm_bits(perm, PCI_CAP_EXP_ENDPOINT_SIZEOF_V2))
+		return -ENOMEM;
+
+	perm->writefn = vfio_exp_config_write;
+
+	p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
+
+	/*
+	 * Allow writes to device control fields, except devctl_phantom,
+	 * which could confuse IOMMU, MPS, which can break communication
+	 * with other physical devices, and the ARI bit in devctl2, which
+	 * is set at probe time.  FLR and MRRS get virtualized via our
+	 * writefn.
+	 */
+	p_setw(perm, PCI_EXP_DEVCTL,
+	       PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_PAYLOAD |
+	       PCI_EXP_DEVCTL_READRQ, ~PCI_EXP_DEVCTL_PHANTOM);
+	p_setw(perm, PCI_EXP_DEVCTL2, NO_VIRT, ~PCI_EXP_DEVCTL2_ARI);
+	return 0;
+}
+
+static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos,
+				int count, struct perm_bits *perm,
+				int offset, __le32 val)
+{
+	u8 *ctrl = vdev->vconfig + pos - offset + PCI_AF_CTRL;
+
+	count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
+	if (count < 0)
+		return count;
+
+	/*
+	 * The FLR bit is virtualized, if set and the device supports AF
+	 * FLR, issue a reset_function.  Regardless, clear the bit, the spec
+	 * requires it to be always read as zero.  NB, reset_function might
+	 * not use an AF FLR, we don't have that level of granularity.
+	 */
+	if (*ctrl & PCI_AF_CTRL_FLR) {
+		u8 cap;
+		int ret;
+
+		*ctrl &= ~PCI_AF_CTRL_FLR;
+
+		ret = pci_user_read_config_byte(vdev->pdev,
+						pos - offset + PCI_AF_CAP,
+						&cap);
+
+		if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) {
+			vfio_pci_zap_and_down_write_memory_lock(vdev);
+			pci_try_reset_function(vdev->pdev);
+			up_write(&vdev->memory_lock);
+		}
+	}
+
+	return count;
+}
+
+/* Permissions for Advanced Function capability */
+static int __init init_pci_cap_af_perm(struct perm_bits *perm)
+{
+	if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_AF]))
+		return -ENOMEM;
+
+	perm->writefn = vfio_af_config_write;
+
+	p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
+	p_setb(perm, PCI_AF_CTRL, PCI_AF_CTRL_FLR, PCI_AF_CTRL_FLR);
+	return 0;
+}
+
+/* Permissions for Advanced Error Reporting extended capability */
+static int __init init_pci_ext_cap_err_perm(struct perm_bits *perm)
+{
+	u32 mask;
+
+	if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_ERR]))
+		return -ENOMEM;
+
+	/*
+	 * Virtualize the first dword of all express capabilities
+	 * because it includes the next pointer.  This lets us later
+	 * remove capabilities from the chain if we need to.
+	 */
+	p_setd(perm, 0, ALL_VIRT, NO_WRITE);
+
+	/* Writable bits mask */
+	mask =	PCI_ERR_UNC_UND |		/* Undefined */
+		PCI_ERR_UNC_DLP |		/* Data Link Protocol */
+		PCI_ERR_UNC_SURPDN |		/* Surprise Down */
+		PCI_ERR_UNC_POISON_TLP |	/* Poisoned TLP */
+		PCI_ERR_UNC_FCP |		/* Flow Control Protocol */
+		PCI_ERR_UNC_COMP_TIME |		/* Completion Timeout */
+		PCI_ERR_UNC_COMP_ABORT |	/* Completer Abort */
+		PCI_ERR_UNC_UNX_COMP |		/* Unexpected Completion */
+		PCI_ERR_UNC_RX_OVER |		/* Receiver Overflow */
+		PCI_ERR_UNC_MALF_TLP |		/* Malformed TLP */
+		PCI_ERR_UNC_ECRC |		/* ECRC Error Status */
+		PCI_ERR_UNC_UNSUP |		/* Unsupported Request */
+		PCI_ERR_UNC_ACSV |		/* ACS Violation */
+		PCI_ERR_UNC_INTN |		/* internal error */
+		PCI_ERR_UNC_MCBTLP |		/* MC blocked TLP */
+		PCI_ERR_UNC_ATOMEG |		/* Atomic egress blocked */
+		PCI_ERR_UNC_TLPPRE;		/* TLP prefix blocked */
+	p_setd(perm, PCI_ERR_UNCOR_STATUS, NO_VIRT, mask);
+	p_setd(perm, PCI_ERR_UNCOR_MASK, NO_VIRT, mask);
+	p_setd(perm, PCI_ERR_UNCOR_SEVER, NO_VIRT, mask);
+
+	mask =	PCI_ERR_COR_RCVR |		/* Receiver Error Status */
+		PCI_ERR_COR_BAD_TLP |		/* Bad TLP Status */
+		PCI_ERR_COR_BAD_DLLP |		/* Bad DLLP Status */
+		PCI_ERR_COR_REP_ROLL |		/* REPLAY_NUM Rollover */
+		PCI_ERR_COR_REP_TIMER |		/* Replay Timer Timeout */
+		PCI_ERR_COR_ADV_NFAT |		/* Advisory Non-Fatal */
+		PCI_ERR_COR_INTERNAL |		/* Corrected Internal */
+		PCI_ERR_COR_LOG_OVER;		/* Header Log Overflow */
+	p_setd(perm, PCI_ERR_COR_STATUS, NO_VIRT, mask);
+	p_setd(perm, PCI_ERR_COR_MASK, NO_VIRT, mask);
+
+	mask =	PCI_ERR_CAP_ECRC_GENE |		/* ECRC Generation Enable */
+		PCI_ERR_CAP_ECRC_CHKE;		/* ECRC Check Enable */
+	p_setd(perm, PCI_ERR_CAP, NO_VIRT, mask);
+	return 0;
+}
+
+/* Permissions for Power Budgeting extended capability */
+static int __init init_pci_ext_cap_pwr_perm(struct perm_bits *perm)
+{
+	if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_PWR]))
+		return -ENOMEM;
+
+	p_setd(perm, 0, ALL_VIRT, NO_WRITE);
+
+	/* Writing the data selector is OK, the info is still read-only */
+	p_setb(perm, PCI_PWR_DATA, NO_VIRT, (u8)ALL_WRITE);
+	return 0;
+}
+
+/*
+ * Initialize the shared permission tables
+ */
+void vfio_pci_uninit_perm_bits(void)
+{
+	free_perm_bits(&cap_perms[PCI_CAP_ID_BASIC]);
+
+	free_perm_bits(&cap_perms[PCI_CAP_ID_PM]);
+	free_perm_bits(&cap_perms[PCI_CAP_ID_VPD]);
+	free_perm_bits(&cap_perms[PCI_CAP_ID_PCIX]);
+	free_perm_bits(&cap_perms[PCI_CAP_ID_EXP]);
+	free_perm_bits(&cap_perms[PCI_CAP_ID_AF]);
+
+	free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_ERR]);
+	free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_PWR]);
+}
+
+int __init vfio_pci_init_perm_bits(void)
+{
+	int ret;
+
+	/* Basic config space */
+	ret = init_pci_cap_basic_perm(&cap_perms[PCI_CAP_ID_BASIC]);
+
+	/* Capabilities */
+	ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]);
+	ret |= init_pci_cap_vpd_perm(&cap_perms[PCI_CAP_ID_VPD]);
+	ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]);
+	cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_raw_config_write;
+	ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]);
+	ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]);
+
+	/* Extended capabilities */
+	ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]);
+	ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]);
+	ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_raw_config_write;
+	ecap_perms[PCI_EXT_CAP_ID_DVSEC].writefn = vfio_raw_config_write;
+
+	if (ret)
+		vfio_pci_uninit_perm_bits();
+
+	return ret;
+}
+
+static int vfio_find_cap_start(struct vfio_pci_core_device *vdev, int pos)
+{
+	u8 cap;
+	int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE :
+						 PCI_STD_HEADER_SIZEOF;
+	cap = vdev->pci_config_map[pos];
+
+	if (cap == PCI_CAP_ID_BASIC)
+		return 0;
+
+	/* XXX Can we have to abutting capabilities of the same type? */
+	while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap)
+		pos--;
+
+	return pos;
+}
+
+static int vfio_msi_config_read(struct vfio_pci_core_device *vdev, int pos,
+				int count, struct perm_bits *perm,
+				int offset, __le32 *val)
+{
+	/* Update max available queue size from msi_qmax */
+	if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) {
+		__le16 *flags;
+		int start;
+
+		start = vfio_find_cap_start(vdev, pos);
+
+		flags = (__le16 *)&vdev->vconfig[start];
+
+		*flags &= cpu_to_le16(~PCI_MSI_FLAGS_QMASK);
+		*flags |= cpu_to_le16(vdev->msi_qmax << 1);
+	}
+
+	return vfio_default_config_read(vdev, pos, count, perm, offset, val);
+}
+
+static int vfio_msi_config_write(struct vfio_pci_core_device *vdev, int pos,
+				 int count, struct perm_bits *perm,
+				 int offset, __le32 val)
+{
+	count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
+	if (count < 0)
+		return count;
+
+	/* Fixup and write configured queue size and enable to hardware */
+	if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) {
+		__le16 *pflags;
+		u16 flags;
+		int start, ret;
+
+		start = vfio_find_cap_start(vdev, pos);
+
+		pflags = (__le16 *)&vdev->vconfig[start + PCI_MSI_FLAGS];
+
+		flags = le16_to_cpu(*pflags);
+
+		/* MSI is enabled via ioctl */
+		if  (vdev->irq_type != VFIO_PCI_MSI_IRQ_INDEX)
+			flags &= ~PCI_MSI_FLAGS_ENABLE;
+
+		/* Check queue size */
+		if ((flags & PCI_MSI_FLAGS_QSIZE) >> 4 > vdev->msi_qmax) {
+			flags &= ~PCI_MSI_FLAGS_QSIZE;
+			flags |= vdev->msi_qmax << 4;
+		}
+
+		/* Write back to virt and to hardware */
+		*pflags = cpu_to_le16(flags);
+		ret = pci_user_write_config_word(vdev->pdev,
+						 start + PCI_MSI_FLAGS,
+						 flags);
+		if (ret)
+			return ret;
+	}
+
+	return count;
+}
+
+/*
+ * MSI determination is per-device, so this routine gets used beyond
+ * initialization time. Don't add __init
+ */
+static int init_pci_cap_msi_perm(struct perm_bits *perm, int len, u16 flags)
+{
+	if (alloc_perm_bits(perm, len))
+		return -ENOMEM;
+
+	perm->readfn = vfio_msi_config_read;
+	perm->writefn = vfio_msi_config_write;
+
+	p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
+
+	/*
+	 * The upper byte of the control register is reserved,
+	 * just setup the lower byte.
+	 */
+	p_setb(perm, PCI_MSI_FLAGS, (u8)ALL_VIRT, (u8)ALL_WRITE);
+	p_setd(perm, PCI_MSI_ADDRESS_LO, ALL_VIRT, ALL_WRITE);
+	if (flags & PCI_MSI_FLAGS_64BIT) {
+		p_setd(perm, PCI_MSI_ADDRESS_HI, ALL_VIRT, ALL_WRITE);
+		p_setw(perm, PCI_MSI_DATA_64, (u16)ALL_VIRT, (u16)ALL_WRITE);
+		if (flags & PCI_MSI_FLAGS_MASKBIT) {
+			p_setd(perm, PCI_MSI_MASK_64, NO_VIRT, ALL_WRITE);
+			p_setd(perm, PCI_MSI_PENDING_64, NO_VIRT, ALL_WRITE);
+		}
+	} else {
+		p_setw(perm, PCI_MSI_DATA_32, (u16)ALL_VIRT, (u16)ALL_WRITE);
+		if (flags & PCI_MSI_FLAGS_MASKBIT) {
+			p_setd(perm, PCI_MSI_MASK_32, NO_VIRT, ALL_WRITE);
+			p_setd(perm, PCI_MSI_PENDING_32, NO_VIRT, ALL_WRITE);
+		}
+	}
+	return 0;
+}
+
+/* Determine MSI CAP field length; initialize msi_perms on 1st call per vdev */
+static int vfio_msi_cap_len(struct vfio_pci_core_device *vdev, u8 pos)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	int len, ret;
+	u16 flags;
+
+	ret = pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &flags);
+	if (ret)
+		return pcibios_err_to_errno(ret);
+
+	len = 10; /* Minimum size */
+	if (flags & PCI_MSI_FLAGS_64BIT)
+		len += 4;
+	if (flags & PCI_MSI_FLAGS_MASKBIT)
+		len += 10;
+
+	if (vdev->msi_perm)
+		return len;
+
+	vdev->msi_perm = kmalloc(sizeof(struct perm_bits), GFP_KERNEL_ACCOUNT);
+	if (!vdev->msi_perm)
+		return -ENOMEM;
+
+	ret = init_pci_cap_msi_perm(vdev->msi_perm, len, flags);
+	if (ret) {
+		kfree(vdev->msi_perm);
+		return ret;
+	}
+
+	return len;
+}
+
+/* Determine extended capability length for VC (2 & 9) and MFVC */
+static int vfio_vc_cap_len(struct vfio_pci_core_device *vdev, u16 pos)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	u32 tmp;
+	int ret, evcc, phases, vc_arb;
+	int len = PCI_CAP_VC_BASE_SIZEOF;
+
+	ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_CAP1, &tmp);
+	if (ret)
+		return pcibios_err_to_errno(ret);
+
+	evcc = tmp & PCI_VC_CAP1_EVCC; /* extended vc count */
+	ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_CAP2, &tmp);
+	if (ret)
+		return pcibios_err_to_errno(ret);
+
+	if (tmp & PCI_VC_CAP2_128_PHASE)
+		phases = 128;
+	else if (tmp & PCI_VC_CAP2_64_PHASE)
+		phases = 64;
+	else if (tmp & PCI_VC_CAP2_32_PHASE)
+		phases = 32;
+	else
+		phases = 0;
+
+	vc_arb = phases * 4;
+
+	/*
+	 * Port arbitration tables are root & switch only;
+	 * function arbitration tables are function 0 only.
+	 * In either case, we'll never let user write them so
+	 * we don't care how big they are
+	 */
+	len += (1 + evcc) * PCI_CAP_VC_PER_VC_SIZEOF;
+	if (vc_arb) {
+		len = round_up(len, 16);
+		len += vc_arb / 8;
+	}
+	return len;
+}
+
+static int vfio_cap_len(struct vfio_pci_core_device *vdev, u8 cap, u8 pos)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	u32 dword;
+	u16 word;
+	u8 byte;
+	int ret;
+
+	switch (cap) {
+	case PCI_CAP_ID_MSI:
+		return vfio_msi_cap_len(vdev, pos);
+	case PCI_CAP_ID_PCIX:
+		ret = pci_read_config_word(pdev, pos + PCI_X_CMD, &word);
+		if (ret)
+			return pcibios_err_to_errno(ret);
+
+		if (PCI_X_CMD_VERSION(word)) {
+			if (pdev->cfg_size > PCI_CFG_SPACE_SIZE) {
+				/* Test for extended capabilities */
+				pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE,
+						      &dword);
+				vdev->extended_caps = (dword != 0);
+			}
+			return PCI_CAP_PCIX_SIZEOF_V2;
+		} else
+			return PCI_CAP_PCIX_SIZEOF_V0;
+	case PCI_CAP_ID_VNDR:
+		/* length follows next field */
+		ret = pci_read_config_byte(pdev, pos + PCI_CAP_FLAGS, &byte);
+		if (ret)
+			return pcibios_err_to_errno(ret);
+
+		return byte;
+	case PCI_CAP_ID_EXP:
+		if (pdev->cfg_size > PCI_CFG_SPACE_SIZE) {
+			/* Test for extended capabilities */
+			pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword);
+			vdev->extended_caps = (dword != 0);
+		}
+
+		/* length based on version and type */
+		if ((pcie_caps_reg(pdev) & PCI_EXP_FLAGS_VERS) == 1) {
+			if (pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END)
+				return 0xc; /* "All Devices" only, no link */
+			return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1;
+		} else {
+			if (pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END)
+				return 0x2c; /* No link */
+			return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2;
+		}
+	case PCI_CAP_ID_HT:
+		ret = pci_read_config_byte(pdev, pos + 3, &byte);
+		if (ret)
+			return pcibios_err_to_errno(ret);
+
+		return (byte & HT_3BIT_CAP_MASK) ?
+			HT_CAP_SIZEOF_SHORT : HT_CAP_SIZEOF_LONG;
+	case PCI_CAP_ID_SATA:
+		ret = pci_read_config_byte(pdev, pos + PCI_SATA_REGS, &byte);
+		if (ret)
+			return pcibios_err_to_errno(ret);
+
+		byte &= PCI_SATA_REGS_MASK;
+		if (byte == PCI_SATA_REGS_INLINE)
+			return PCI_SATA_SIZEOF_LONG;
+		else
+			return PCI_SATA_SIZEOF_SHORT;
+	default:
+		pci_warn(pdev, "%s: unknown length for PCI cap %#x@%#x\n",
+			 __func__, cap, pos);
+	}
+
+	return 0;
+}
+
+static int vfio_ext_cap_len(struct vfio_pci_core_device *vdev, u16 ecap, u16 epos)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	u8 byte;
+	u32 dword;
+	int ret;
+
+	switch (ecap) {
+	case PCI_EXT_CAP_ID_VNDR:
+		ret = pci_read_config_dword(pdev, epos + PCI_VSEC_HDR, &dword);
+		if (ret)
+			return pcibios_err_to_errno(ret);
+
+		return dword >> PCI_VSEC_HDR_LEN_SHIFT;
+	case PCI_EXT_CAP_ID_VC:
+	case PCI_EXT_CAP_ID_VC9:
+	case PCI_EXT_CAP_ID_MFVC:
+		return vfio_vc_cap_len(vdev, epos);
+	case PCI_EXT_CAP_ID_ACS:
+		ret = pci_read_config_byte(pdev, epos + PCI_ACS_CAP, &byte);
+		if (ret)
+			return pcibios_err_to_errno(ret);
+
+		if (byte & PCI_ACS_EC) {
+			int bits;
+
+			ret = pci_read_config_byte(pdev,
+						   epos + PCI_ACS_EGRESS_BITS,
+						   &byte);
+			if (ret)
+				return pcibios_err_to_errno(ret);
+
+			bits = byte ? round_up(byte, 32) : 256;
+			return 8 + (bits / 8);
+		}
+		return 8;
+
+	case PCI_EXT_CAP_ID_REBAR:
+		ret = pci_read_config_byte(pdev, epos + PCI_REBAR_CTRL, &byte);
+		if (ret)
+			return pcibios_err_to_errno(ret);
+
+		byte &= PCI_REBAR_CTRL_NBAR_MASK;
+		byte >>= PCI_REBAR_CTRL_NBAR_SHIFT;
+
+		return 4 + (byte * 8);
+	case PCI_EXT_CAP_ID_DPA:
+		ret = pci_read_config_byte(pdev, epos + PCI_DPA_CAP, &byte);
+		if (ret)
+			return pcibios_err_to_errno(ret);
+
+		byte &= PCI_DPA_CAP_SUBSTATE_MASK;
+		return PCI_DPA_BASE_SIZEOF + byte + 1;
+	case PCI_EXT_CAP_ID_TPH:
+		ret = pci_read_config_dword(pdev, epos + PCI_TPH_CAP, &dword);
+		if (ret)
+			return pcibios_err_to_errno(ret);
+
+		if ((dword & PCI_TPH_CAP_LOC_MASK) == PCI_TPH_LOC_CAP) {
+			int sts;
+
+			sts = dword & PCI_TPH_CAP_ST_MASK;
+			sts >>= PCI_TPH_CAP_ST_SHIFT;
+			return PCI_TPH_BASE_SIZEOF + (sts * 2) + 2;
+		}
+		return PCI_TPH_BASE_SIZEOF;
+	case PCI_EXT_CAP_ID_DVSEC:
+		ret = pci_read_config_dword(pdev, epos + PCI_DVSEC_HEADER1, &dword);
+		if (ret)
+			return pcibios_err_to_errno(ret);
+		return PCI_DVSEC_HEADER1_LEN(dword);
+	default:
+		pci_warn(pdev, "%s: unknown length for PCI ecap %#x@%#x\n",
+			 __func__, ecap, epos);
+	}
+
+	return 0;
+}
+
+static void vfio_update_pm_vconfig_bytes(struct vfio_pci_core_device *vdev,
+					 int offset)
+{
+	__le16 *pmc = (__le16 *)&vdev->vconfig[offset + PCI_PM_PMC];
+	__le16 *ctrl = (__le16 *)&vdev->vconfig[offset + PCI_PM_CTRL];
+
+	/* Clear vconfig PME_Support, PME_Status, and PME_En bits */
+	*pmc &= ~cpu_to_le16(PCI_PM_CAP_PME_MASK);
+	*ctrl &= ~cpu_to_le16(PCI_PM_CTRL_PME_ENABLE | PCI_PM_CTRL_PME_STATUS);
+}
+
+static int vfio_fill_vconfig_bytes(struct vfio_pci_core_device *vdev,
+				   int offset, int size)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	int ret = 0;
+
+	/*
+	 * We try to read physical config space in the largest chunks
+	 * we can, assuming that all of the fields support dword access.
+	 * pci_save_state() makes this same assumption and seems to do ok.
+	 */
+	while (size) {
+		int filled;
+
+		if (size >= 4 && !(offset % 4)) {
+			__le32 *dwordp = (__le32 *)&vdev->vconfig[offset];
+			u32 dword;
+
+			ret = pci_read_config_dword(pdev, offset, &dword);
+			if (ret)
+				return ret;
+			*dwordp = cpu_to_le32(dword);
+			filled = 4;
+		} else if (size >= 2 && !(offset % 2)) {
+			__le16 *wordp = (__le16 *)&vdev->vconfig[offset];
+			u16 word;
+
+			ret = pci_read_config_word(pdev, offset, &word);
+			if (ret)
+				return ret;
+			*wordp = cpu_to_le16(word);
+			filled = 2;
+		} else {
+			u8 *byte = &vdev->vconfig[offset];
+			ret = pci_read_config_byte(pdev, offset, byte);
+			if (ret)
+				return ret;
+			filled = 1;
+		}
+
+		offset += filled;
+		size -= filled;
+	}
+
+	return ret;
+}
+
+static int vfio_cap_init(struct vfio_pci_core_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	u8 *map = vdev->pci_config_map;
+	u16 status;
+	u8 pos, *prev, cap;
+	int loops, ret, caps = 0;
+
+	/* Any capabilities? */
+	ret = pci_read_config_word(pdev, PCI_STATUS, &status);
+	if (ret)
+		return ret;
+
+	if (!(status & PCI_STATUS_CAP_LIST))
+		return 0; /* Done */
+
+	ret = pci_read_config_byte(pdev, PCI_CAPABILITY_LIST, &pos);
+	if (ret)
+		return ret;
+
+	/* Mark the previous position in case we want to skip a capability */
+	prev = &vdev->vconfig[PCI_CAPABILITY_LIST];
+
+	/* We can bound our loop, capabilities are dword aligned */
+	loops = (PCI_CFG_SPACE_SIZE - PCI_STD_HEADER_SIZEOF) / PCI_CAP_SIZEOF;
+	while (pos && loops--) {
+		u8 next;
+		int i, len = 0;
+
+		ret = pci_read_config_byte(pdev, pos, &cap);
+		if (ret)
+			return ret;
+
+		ret = pci_read_config_byte(pdev,
+					   pos + PCI_CAP_LIST_NEXT, &next);
+		if (ret)
+			return ret;
+
+		/*
+		 * ID 0 is a NULL capability, conflicting with our fake
+		 * PCI_CAP_ID_BASIC.  As it has no content, consider it
+		 * hidden for now.
+		 */
+		if (cap && cap <= PCI_CAP_ID_MAX) {
+			len = pci_cap_length[cap];
+			if (len == 0xFF) { /* Variable length */
+				len = vfio_cap_len(vdev, cap, pos);
+				if (len < 0)
+					return len;
+			}
+		}
+
+		if (!len) {
+			pci_dbg(pdev, "%s: hiding cap %#x@%#x\n", __func__,
+				cap, pos);
+			*prev = next;
+			pos = next;
+			continue;
+		}
+
+		/* Sanity check, do we overlap other capabilities? */
+		for (i = 0; i < len; i++) {
+			if (likely(map[pos + i] == PCI_CAP_ID_INVALID))
+				continue;
+
+			pci_warn(pdev, "%s: PCI config conflict @%#x, was cap %#x now cap %#x\n",
+				 __func__, pos + i, map[pos + i], cap);
+		}
+
+		BUILD_BUG_ON(PCI_CAP_ID_MAX >= PCI_CAP_ID_INVALID_VIRT);
+
+		memset(map + pos, cap, len);
+		ret = vfio_fill_vconfig_bytes(vdev, pos, len);
+		if (ret)
+			return ret;
+
+		if (cap == PCI_CAP_ID_PM)
+			vfio_update_pm_vconfig_bytes(vdev, pos);
+
+		prev = &vdev->vconfig[pos + PCI_CAP_LIST_NEXT];
+		pos = next;
+		caps++;
+	}
+
+	/* If we didn't fill any capabilities, clear the status flag */
+	if (!caps) {
+		__le16 *vstatus = (__le16 *)&vdev->vconfig[PCI_STATUS];
+		*vstatus &= ~cpu_to_le16(PCI_STATUS_CAP_LIST);
+	}
+
+	return 0;
+}
+
+static int vfio_ecap_init(struct vfio_pci_core_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	u8 *map = vdev->pci_config_map;
+	u16 epos;
+	__le32 *prev = NULL;
+	int loops, ret, ecaps = 0;
+
+	if (!vdev->extended_caps)
+		return 0;
+
+	epos = PCI_CFG_SPACE_SIZE;
+
+	loops = (pdev->cfg_size - PCI_CFG_SPACE_SIZE) / PCI_CAP_SIZEOF;
+
+	while (loops-- && epos >= PCI_CFG_SPACE_SIZE) {
+		u32 header;
+		u16 ecap;
+		int i, len = 0;
+		bool hidden = false;
+
+		ret = pci_read_config_dword(pdev, epos, &header);
+		if (ret)
+			return ret;
+
+		ecap = PCI_EXT_CAP_ID(header);
+
+		if (ecap <= PCI_EXT_CAP_ID_MAX) {
+			len = pci_ext_cap_length[ecap];
+			if (len == 0xFF) {
+				len = vfio_ext_cap_len(vdev, ecap, epos);
+				if (len < 0)
+					return len;
+			}
+		}
+
+		if (!len) {
+			pci_dbg(pdev, "%s: hiding ecap %#x@%#x\n",
+				__func__, ecap, epos);
+
+			/* If not the first in the chain, we can skip over it */
+			if (prev) {
+				u32 val = epos = PCI_EXT_CAP_NEXT(header);
+				*prev &= cpu_to_le32(~(0xffcU << 20));
+				*prev |= cpu_to_le32(val << 20);
+				continue;
+			}
+
+			/*
+			 * Otherwise, fill in a placeholder, the direct
+			 * readfn will virtualize this automatically
+			 */
+			len = PCI_CAP_SIZEOF;
+			hidden = true;
+		}
+
+		for (i = 0; i < len; i++) {
+			if (likely(map[epos + i] == PCI_CAP_ID_INVALID))
+				continue;
+
+			pci_warn(pdev, "%s: PCI config conflict @%#x, was ecap %#x now ecap %#x\n",
+				 __func__, epos + i, map[epos + i], ecap);
+		}
+
+		/*
+		 * Even though ecap is 2 bytes, we're currently a long way
+		 * from exceeding 1 byte capabilities.  If we ever make it
+		 * up to 0xFE we'll need to up this to a two-byte, byte map.
+		 */
+		BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID_VIRT);
+
+		memset(map + epos, ecap, len);
+		ret = vfio_fill_vconfig_bytes(vdev, epos, len);
+		if (ret)
+			return ret;
+
+		/*
+		 * If we're just using this capability to anchor the list,
+		 * hide the real ID.  Only count real ecaps.  XXX PCI spec
+		 * indicates to use cap id = 0, version = 0, next = 0 if
+		 * ecaps are absent, hope users check all the way to next.
+		 */
+		if (hidden)
+			*(__le32 *)&vdev->vconfig[epos] &=
+				cpu_to_le32((0xffcU << 20));
+		else
+			ecaps++;
+
+		prev = (__le32 *)&vdev->vconfig[epos];
+		epos = PCI_EXT_CAP_NEXT(header);
+	}
+
+	if (!ecaps)
+		*(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0;
+
+	return 0;
+}
+
+/*
+ * Nag about hardware bugs, hopefully to have vendors fix them, but at least
+ * to collect a list of dependencies for the VF INTx pin quirk below.
+ */
+static const struct pci_device_id known_bogus_vf_intx_pin[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x270c) },
+	{}
+};
+
+/*
+ * For each device we allocate a pci_config_map that indicates the
+ * capability occupying each dword and thus the struct perm_bits we
+ * use for read and write.  We also allocate a virtualized config
+ * space which tracks reads and writes to bits that we emulate for
+ * the user.  Initial values filled from device.
+ *
+ * Using shared struct perm_bits between all vfio-pci devices saves
+ * us from allocating cfg_size buffers for virt and write for every
+ * device.  We could remove vconfig and allocate individual buffers
+ * for each area requiring emulated bits, but the array of pointers
+ * would be comparable in size (at least for standard config space).
+ */
+int vfio_config_init(struct vfio_pci_core_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	u8 *map, *vconfig;
+	int ret;
+
+	/*
+	 * Config space, caps and ecaps are all dword aligned, so we could
+	 * use one byte per dword to record the type.  However, there are
+	 * no requirements on the length of a capability, so the gap between
+	 * capabilities needs byte granularity.
+	 */
+	map = kmalloc(pdev->cfg_size, GFP_KERNEL_ACCOUNT);
+	if (!map)
+		return -ENOMEM;
+
+	vconfig = kmalloc(pdev->cfg_size, GFP_KERNEL_ACCOUNT);
+	if (!vconfig) {
+		kfree(map);
+		return -ENOMEM;
+	}
+
+	vdev->pci_config_map = map;
+	vdev->vconfig = vconfig;
+
+	memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF);
+	memset(map + PCI_STD_HEADER_SIZEOF, PCI_CAP_ID_INVALID,
+	       pdev->cfg_size - PCI_STD_HEADER_SIZEOF);
+
+	ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF);
+	if (ret)
+		goto out;
+
+	vdev->bardirty = true;
+
+	/*
+	 * XXX can we just pci_load_saved_state/pci_restore_state?
+	 * may need to rebuild vconfig after that
+	 */
+
+	/* For restore after reset */
+	vdev->rbar[0] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_0]);
+	vdev->rbar[1] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_1]);
+	vdev->rbar[2] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_2]);
+	vdev->rbar[3] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_3]);
+	vdev->rbar[4] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_4]);
+	vdev->rbar[5] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_5]);
+	vdev->rbar[6] = le32_to_cpu(*(__le32 *)&vconfig[PCI_ROM_ADDRESS]);
+
+	if (pdev->is_virtfn) {
+		*(__le16 *)&vconfig[PCI_VENDOR_ID] = cpu_to_le16(pdev->vendor);
+		*(__le16 *)&vconfig[PCI_DEVICE_ID] = cpu_to_le16(pdev->device);
+
+		/*
+		 * Per SR-IOV spec rev 1.1, 3.4.1.18 the interrupt pin register
+		 * does not apply to VFs and VFs must implement this register
+		 * as read-only with value zero.  Userspace is not readily able
+		 * to identify whether a device is a VF and thus that the pin
+		 * definition on the device is bogus should it violate this
+		 * requirement.  We already virtualize the pin register for
+		 * other purposes, so we simply need to replace the bogus value
+		 * and consider VFs when we determine INTx IRQ count.
+		 */
+		if (vconfig[PCI_INTERRUPT_PIN] &&
+		    !pci_match_id(known_bogus_vf_intx_pin, pdev))
+			pci_warn(pdev,
+				 "Hardware bug: VF reports bogus INTx pin %d\n",
+				 vconfig[PCI_INTERRUPT_PIN]);
+
+		vconfig[PCI_INTERRUPT_PIN] = 0; /* Gratuitous for good VFs */
+	}
+	if (pdev->no_command_memory) {
+		/*
+		 * VFs and devices that set pdev->no_command_memory do not
+		 * implement the memory enable bit of the COMMAND register
+		 * therefore we'll not have it set in our initial copy of
+		 * config space after pci_enable_device().  For consistency
+		 * with PFs, set the virtual enable bit here.
+		 */
+		*(__le16 *)&vconfig[PCI_COMMAND] |=
+					cpu_to_le16(PCI_COMMAND_MEMORY);
+	}
+
+	if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || vdev->nointx)
+		vconfig[PCI_INTERRUPT_PIN] = 0;
+
+	ret = vfio_cap_init(vdev);
+	if (ret)
+		goto out;
+
+	ret = vfio_ecap_init(vdev);
+	if (ret)
+		goto out;
+
+	return 0;
+
+out:
+	kfree(map);
+	vdev->pci_config_map = NULL;
+	kfree(vconfig);
+	vdev->vconfig = NULL;
+	return pcibios_err_to_errno(ret);
+}
+
+void vfio_config_free(struct vfio_pci_core_device *vdev)
+{
+	kfree(vdev->vconfig);
+	vdev->vconfig = NULL;
+	kfree(vdev->pci_config_map);
+	vdev->pci_config_map = NULL;
+	if (vdev->msi_perm) {
+		free_perm_bits(vdev->msi_perm);
+		kfree(vdev->msi_perm);
+		vdev->msi_perm = NULL;
+	}
+}
+
+/*
+ * Find the remaining number of bytes in a dword that match the given
+ * position.  Stop at either the end of the capability or the dword boundary.
+ */
+static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_core_device *vdev,
+					   loff_t pos)
+{
+	u8 cap = vdev->pci_config_map[pos];
+	size_t i;
+
+	for (i = 1; (pos + i) % 4 && vdev->pci_config_map[pos + i] == cap; i++)
+		/* nop */;
+
+	return i;
+}
+
+static ssize_t vfio_config_do_rw(struct vfio_pci_core_device *vdev, char __user *buf,
+				 size_t count, loff_t *ppos, bool iswrite)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	struct perm_bits *perm;
+	__le32 val = 0;
+	int cap_start = 0, offset;
+	u8 cap_id;
+	ssize_t ret;
+
+	if (*ppos < 0 || *ppos >= pdev->cfg_size ||
+	    *ppos + count > pdev->cfg_size)
+		return -EFAULT;
+
+	/*
+	 * Chop accesses into aligned chunks containing no more than a
+	 * single capability.  Caller increments to the next chunk.
+	 */
+	count = min(count, vfio_pci_cap_remaining_dword(vdev, *ppos));
+	if (count >= 4 && !(*ppos % 4))
+		count = 4;
+	else if (count >= 2 && !(*ppos % 2))
+		count = 2;
+	else
+		count = 1;
+
+	ret = count;
+
+	cap_id = vdev->pci_config_map[*ppos];
+
+	if (cap_id == PCI_CAP_ID_INVALID) {
+		perm = &unassigned_perms;
+		cap_start = *ppos;
+	} else if (cap_id == PCI_CAP_ID_INVALID_VIRT) {
+		perm = &virt_perms;
+		cap_start = *ppos;
+	} else {
+		if (*ppos >= PCI_CFG_SPACE_SIZE) {
+			WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX);
+
+			perm = &ecap_perms[cap_id];
+			cap_start = vfio_find_cap_start(vdev, *ppos);
+		} else {
+			WARN_ON(cap_id > PCI_CAP_ID_MAX);
+
+			perm = &cap_perms[cap_id];
+
+			if (cap_id == PCI_CAP_ID_MSI)
+				perm = vdev->msi_perm;
+
+			if (cap_id > PCI_CAP_ID_BASIC)
+				cap_start = vfio_find_cap_start(vdev, *ppos);
+		}
+	}
+
+	WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC);
+	WARN_ON(cap_start > *ppos);
+
+	offset = *ppos - cap_start;
+
+	if (iswrite) {
+		if (!perm->writefn)
+			return ret;
+
+		if (copy_from_user(&val, buf, count))
+			return -EFAULT;
+
+		ret = perm->writefn(vdev, *ppos, count, perm, offset, val);
+	} else {
+		if (perm->readfn) {
+			ret = perm->readfn(vdev, *ppos, count,
+					   perm, offset, &val);
+			if (ret < 0)
+				return ret;
+		}
+
+		if (copy_to_user(buf, &val, count))
+			return -EFAULT;
+	}
+
+	return ret;
+}
+
+ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, char __user *buf,
+			   size_t count, loff_t *ppos, bool iswrite)
+{
+	size_t done = 0;
+	int ret = 0;
+	loff_t pos = *ppos;
+
+	pos &= VFIO_PCI_OFFSET_MASK;
+
+	while (count) {
+		ret = vfio_config_do_rw(vdev, buf, count, &pos, iswrite);
+		if (ret < 0)
+			return ret;
+
+		count -= ret;
+		done += ret;
+		buf += ret;
+		pos += ret;
+	}
+
+	*ppos += done;
+
+	return done;
+}
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
new file mode 100644
index 0000000000..1929103ee5
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -0,0 +1,2664 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/aperture.h>
+#include <linux/device.h>
+#include <linux/eventfd.h>
+#include <linux/file.h>
+#include <linux/interrupt.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/pci.h>
+#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vgaarb.h>
+#include <linux/nospec.h>
+#include <linux/sched/mm.h>
+#include <linux/iommufd.h>
+#if IS_ENABLED(CONFIG_EEH)
+#include <asm/eeh.h>
+#endif
+
+#include "vfio_pci_priv.h"
+
+#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
+#define DRIVER_DESC "core driver for VFIO based PCI devices"
+
+static bool nointxmask;
+static bool disable_vga;
+static bool disable_idle_d3;
+
+/* List of PF's that vfio_pci_core_sriov_configure() has been called on */
+static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex);
+static LIST_HEAD(vfio_pci_sriov_pfs);
+
+struct vfio_pci_dummy_resource {
+	struct resource		resource;
+	int			index;
+	struct list_head	res_next;
+};
+
+struct vfio_pci_vf_token {
+	struct mutex		lock;
+	uuid_t			uuid;
+	int			users;
+};
+
+struct vfio_pci_mmap_vma {
+	struct vm_area_struct	*vma;
+	struct list_head	vma_next;
+};
+
+static inline bool vfio_vga_disabled(void)
+{
+#ifdef CONFIG_VFIO_PCI_VGA
+	return disable_vga;
+#else
+	return true;
+#endif
+}
+
+/*
+ * Our VGA arbiter participation is limited since we don't know anything
+ * about the device itself.  However, if the device is the only VGA device
+ * downstream of a bridge and VFIO VGA support is disabled, then we can
+ * safely return legacy VGA IO and memory as not decoded since the user
+ * has no way to get to it and routing can be disabled externally at the
+ * bridge.
+ */
+static unsigned int vfio_pci_set_decode(struct pci_dev *pdev, bool single_vga)
+{
+	struct pci_dev *tmp = NULL;
+	unsigned char max_busnr;
+	unsigned int decodes;
+
+	if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus))
+		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
+		       VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
+
+	max_busnr = pci_bus_max_busnr(pdev->bus);
+	decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
+
+	while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) {
+		if (tmp == pdev ||
+		    pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) ||
+		    pci_is_root_bus(tmp->bus))
+			continue;
+
+		if (tmp->bus->number >= pdev->bus->number &&
+		    tmp->bus->number <= max_busnr) {
+			pci_dev_put(tmp);
+			decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
+			break;
+		}
+	}
+
+	return decodes;
+}
+
+static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
+{
+	struct resource *res;
+	int i;
+	struct vfio_pci_dummy_resource *dummy_res;
+
+	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
+		int bar = i + PCI_STD_RESOURCES;
+
+		res = &vdev->pdev->resource[bar];
+
+		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
+			goto no_mmap;
+
+		if (!(res->flags & IORESOURCE_MEM))
+			goto no_mmap;
+
+		/*
+		 * The PCI core shouldn't set up a resource with a
+		 * type but zero size. But there may be bugs that
+		 * cause us to do that.
+		 */
+		if (!resource_size(res))
+			goto no_mmap;
+
+		if (resource_size(res) >= PAGE_SIZE) {
+			vdev->bar_mmap_supported[bar] = true;
+			continue;
+		}
+
+		if (!(res->start & ~PAGE_MASK)) {
+			/*
+			 * Add a dummy resource to reserve the remainder
+			 * of the exclusive page in case that hot-add
+			 * device's bar is assigned into it.
+			 */
+			dummy_res =
+				kzalloc(sizeof(*dummy_res), GFP_KERNEL_ACCOUNT);
+			if (dummy_res == NULL)
+				goto no_mmap;
+
+			dummy_res->resource.name = "vfio sub-page reserved";
+			dummy_res->resource.start = res->end + 1;
+			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
+			dummy_res->resource.flags = res->flags;
+			if (request_resource(res->parent,
+						&dummy_res->resource)) {
+				kfree(dummy_res);
+				goto no_mmap;
+			}
+			dummy_res->index = bar;
+			list_add(&dummy_res->res_next,
+					&vdev->dummy_resources_list);
+			vdev->bar_mmap_supported[bar] = true;
+			continue;
+		}
+		/*
+		 * Here we don't handle the case when the BAR is not page
+		 * aligned because we can't expect the BAR will be
+		 * assigned into the same location in a page in guest
+		 * when we passthrough the BAR. And it's hard to access
+		 * this BAR in userspace because we have no way to get
+		 * the BAR's location in a page.
+		 */
+no_mmap:
+		vdev->bar_mmap_supported[bar] = false;
+	}
+}
+
+struct vfio_pci_group_info;
+static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
+static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
+				      struct vfio_pci_group_info *groups,
+				      struct iommufd_ctx *iommufd_ctx);
+
+/*
+ * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
+ * _and_ the ability detect when the device is asserting INTx via PCI_STATUS.
+ * If a device implements the former but not the latter we would typically
+ * expect broken_intx_masking be set and require an exclusive interrupt.
+ * However since we do have control of the device's ability to assert INTx,
+ * we can instead pretend that the device does not implement INTx, virtualizing
+ * the pin register to report zero and maintaining DisINTx set on the host.
+ */
+static bool vfio_pci_nointx(struct pci_dev *pdev)
+{
+	switch (pdev->vendor) {
+	case PCI_VENDOR_ID_INTEL:
+		switch (pdev->device) {
+		/* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */
+		case 0x1572:
+		case 0x1574:
+		case 0x1580 ... 0x1581:
+		case 0x1583 ... 0x158b:
+		case 0x37d0 ... 0x37d2:
+		/* X550 */
+		case 0x1563:
+			return true;
+		default:
+			return false;
+		}
+	}
+
+	return false;
+}
+
+static void vfio_pci_probe_power_state(struct vfio_pci_core_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	u16 pmcsr;
+
+	if (!pdev->pm_cap)
+		return;
+
+	pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);
+
+	vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);
+}
+
+/*
+ * pci_set_power_state() wrapper handling devices which perform a soft reset on
+ * D3->D0 transition.  Save state prior to D0/1/2->D3, stash it on the vdev,
+ * restore when returned to D0.  Saved separately from pci_saved_state for use
+ * by PM capability emulation and separately from pci_dev internal saved state
+ * to avoid it being overwritten and consumed around other resets.
+ */
+int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	bool needs_restore = false, needs_save = false;
+	int ret;
+
+	/* Prevent changing power state for PFs with VFs enabled */
+	if (pci_num_vf(pdev) && state > PCI_D0)
+		return -EBUSY;
+
+	if (vdev->needs_pm_restore) {
+		if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
+			pci_save_state(pdev);
+			needs_save = true;
+		}
+
+		if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
+			needs_restore = true;
+	}
+
+	ret = pci_set_power_state(pdev, state);
+
+	if (!ret) {
+		/* D3 might be unsupported via quirk, skip unless in D3 */
+		if (needs_save && pdev->current_state >= PCI_D3hot) {
+			/*
+			 * The current PCI state will be saved locally in
+			 * 'pm_save' during the D3hot transition. When the
+			 * device state is changed to D0 again with the current
+			 * function, then pci_store_saved_state() will restore
+			 * the state and will free the memory pointed by
+			 * 'pm_save'. There are few cases where the PCI power
+			 * state can be changed to D0 without the involvement
+			 * of the driver. For these cases, free the earlier
+			 * allocated memory first before overwriting 'pm_save'
+			 * to prevent the memory leak.
+			 */
+			kfree(vdev->pm_save);
+			vdev->pm_save = pci_store_saved_state(pdev);
+		} else if (needs_restore) {
+			pci_load_and_free_saved_state(pdev, &vdev->pm_save);
+			pci_restore_state(pdev);
+		}
+	}
+
+	return ret;
+}
+
+static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev,
+				     struct eventfd_ctx *efdctx)
+{
+	/*
+	 * The vdev power related flags are protected with 'memory_lock'
+	 * semaphore.
+	 */
+	vfio_pci_zap_and_down_write_memory_lock(vdev);
+	if (vdev->pm_runtime_engaged) {
+		up_write(&vdev->memory_lock);
+		return -EINVAL;
+	}
+
+	vdev->pm_runtime_engaged = true;
+	vdev->pm_wake_eventfd_ctx = efdctx;
+	pm_runtime_put_noidle(&vdev->pdev->dev);
+	up_write(&vdev->memory_lock);
+
+	return 0;
+}
+
+static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags,
+				  void __user *arg, size_t argsz)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(device, struct vfio_pci_core_device, vdev);
+	int ret;
+
+	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
+	if (ret != 1)
+		return ret;
+
+	/*
+	 * Inside vfio_pci_runtime_pm_entry(), only the runtime PM usage count
+	 * will be decremented. The pm_runtime_put() will be invoked again
+	 * while returning from the ioctl and then the device can go into
+	 * runtime suspended state.
+	 */
+	return vfio_pci_runtime_pm_entry(vdev, NULL);
+}
+
+static int vfio_pci_core_pm_entry_with_wakeup(
+	struct vfio_device *device, u32 flags,
+	struct vfio_device_low_power_entry_with_wakeup __user *arg,
+	size_t argsz)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(device, struct vfio_pci_core_device, vdev);
+	struct vfio_device_low_power_entry_with_wakeup entry;
+	struct eventfd_ctx *efdctx;
+	int ret;
+
+	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
+				 sizeof(entry));
+	if (ret != 1)
+		return ret;
+
+	if (copy_from_user(&entry, arg, sizeof(entry)))
+		return -EFAULT;
+
+	if (entry.wakeup_eventfd < 0)
+		return -EINVAL;
+
+	efdctx = eventfd_ctx_fdget(entry.wakeup_eventfd);
+	if (IS_ERR(efdctx))
+		return PTR_ERR(efdctx);
+
+	ret = vfio_pci_runtime_pm_entry(vdev, efdctx);
+	if (ret)
+		eventfd_ctx_put(efdctx);
+
+	return ret;
+}
+
+static void __vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
+{
+	if (vdev->pm_runtime_engaged) {
+		vdev->pm_runtime_engaged = false;
+		pm_runtime_get_noresume(&vdev->pdev->dev);
+
+		if (vdev->pm_wake_eventfd_ctx) {
+			eventfd_ctx_put(vdev->pm_wake_eventfd_ctx);
+			vdev->pm_wake_eventfd_ctx = NULL;
+		}
+	}
+}
+
+static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
+{
+	/*
+	 * The vdev power related flags are protected with 'memory_lock'
+	 * semaphore.
+	 */
+	down_write(&vdev->memory_lock);
+	__vfio_pci_runtime_pm_exit(vdev);
+	up_write(&vdev->memory_lock);
+}
+
+static int vfio_pci_core_pm_exit(struct vfio_device *device, u32 flags,
+				 void __user *arg, size_t argsz)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(device, struct vfio_pci_core_device, vdev);
+	int ret;
+
+	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
+	if (ret != 1)
+		return ret;
+
+	/*
+	 * The device is always in the active state here due to pm wrappers
+	 * around ioctls. If the device had entered a low power state and
+	 * pm_wake_eventfd_ctx is valid, vfio_pci_core_runtime_resume() has
+	 * already signaled the eventfd and exited low power mode itself.
+	 * pm_runtime_engaged protects the redundant call here.
+	 */
+	vfio_pci_runtime_pm_exit(vdev);
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int vfio_pci_core_runtime_suspend(struct device *dev)
+{
+	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);
+
+	down_write(&vdev->memory_lock);
+	/*
+	 * The user can move the device into D3hot state before invoking
+	 * power management IOCTL. Move the device into D0 state here and then
+	 * the pci-driver core runtime PM suspend function will move the device
+	 * into the low power state. Also, for the devices which have
+	 * NoSoftRst-, it will help in restoring the original state
+	 * (saved locally in 'vdev->pm_save').
+	 */
+	vfio_pci_set_power_state(vdev, PCI_D0);
+	up_write(&vdev->memory_lock);
+
+	/*
+	 * If INTx is enabled, then mask INTx before going into the runtime
+	 * suspended state and unmask the same in the runtime resume.
+	 * If INTx has already been masked by the user, then
+	 * vfio_pci_intx_mask() will return false and in that case, INTx
+	 * should not be unmasked in the runtime resume.
+	 */
+	vdev->pm_intx_masked = ((vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) &&
+				vfio_pci_intx_mask(vdev));
+
+	return 0;
+}
+
+static int vfio_pci_core_runtime_resume(struct device *dev)
+{
+	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);
+
+	/*
+	 * Resume with a pm_wake_eventfd_ctx signals the eventfd and exit
+	 * low power mode.
+	 */
+	down_write(&vdev->memory_lock);
+	if (vdev->pm_wake_eventfd_ctx) {
+		eventfd_signal(vdev->pm_wake_eventfd_ctx, 1);
+		__vfio_pci_runtime_pm_exit(vdev);
+	}
+	up_write(&vdev->memory_lock);
+
+	if (vdev->pm_intx_masked)
+		vfio_pci_intx_unmask(vdev);
+
+	return 0;
+}
+#endif /* CONFIG_PM */
+
+/*
+ * The pci-driver core runtime PM routines always save the device state
+ * before going into suspended state. If the device is going into low power
+ * state with only with runtime PM ops, then no explicit handling is needed
+ * for the devices which have NoSoftRst-.
+ */
+static const struct dev_pm_ops vfio_pci_core_pm_ops = {
+	SET_RUNTIME_PM_OPS(vfio_pci_core_runtime_suspend,
+			   vfio_pci_core_runtime_resume,
+			   NULL)
+};
+
+int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	int ret;
+	u16 cmd;
+	u8 msix_pos;
+
+	if (!disable_idle_d3) {
+		ret = pm_runtime_resume_and_get(&pdev->dev);
+		if (ret < 0)
+			return ret;
+	}
+
+	/* Don't allow our initial saved state to include busmaster */
+	pci_clear_master(pdev);
+
+	ret = pci_enable_device(pdev);
+	if (ret)
+		goto out_power;
+
+	/* If reset fails because of the device lock, fail this path entirely */
+	ret = pci_try_reset_function(pdev);
+	if (ret == -EAGAIN)
+		goto out_disable_device;
+
+	vdev->reset_works = !ret;
+	pci_save_state(pdev);
+	vdev->pci_saved_state = pci_store_saved_state(pdev);
+	if (!vdev->pci_saved_state)
+		pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__);
+
+	if (likely(!nointxmask)) {
+		if (vfio_pci_nointx(pdev)) {
+			pci_info(pdev, "Masking broken INTx support\n");
+			vdev->nointx = true;
+			pci_intx(pdev, 0);
+		} else
+			vdev->pci_2_3 = pci_intx_mask_supported(pdev);
+	}
+
+	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
+	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
+		cmd &= ~PCI_COMMAND_INTX_DISABLE;
+		pci_write_config_word(pdev, PCI_COMMAND, cmd);
+	}
+
+	ret = vfio_pci_zdev_open_device(vdev);
+	if (ret)
+		goto out_free_state;
+
+	ret = vfio_config_init(vdev);
+	if (ret)
+		goto out_free_zdev;
+
+	msix_pos = pdev->msix_cap;
+	if (msix_pos) {
+		u16 flags;
+		u32 table;
+
+		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
+		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);
+
+		vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
+		vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
+		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
+		vdev->has_dyn_msix = pci_msix_can_alloc_dyn(pdev);
+	} else {
+		vdev->msix_bar = 0xFF;
+		vdev->has_dyn_msix = false;
+	}
+
+	if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev))
+		vdev->has_vga = true;
+
+
+	return 0;
+
+out_free_zdev:
+	vfio_pci_zdev_close_device(vdev);
+out_free_state:
+	kfree(vdev->pci_saved_state);
+	vdev->pci_saved_state = NULL;
+out_disable_device:
+	pci_disable_device(pdev);
+out_power:
+	if (!disable_idle_d3)
+		pm_runtime_put(&pdev->dev);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_enable);
+
+void vfio_pci_core_disable(struct vfio_pci_core_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	struct vfio_pci_dummy_resource *dummy_res, *tmp;
+	struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
+	int i, bar;
+
+	/* For needs_reset */
+	lockdep_assert_held(&vdev->vdev.dev_set->lock);
+
+	/*
+	 * This function can be invoked while the power state is non-D0.
+	 * This non-D0 power state can be with or without runtime PM.
+	 * vfio_pci_runtime_pm_exit() will internally increment the usage
+	 * count corresponding to pm_runtime_put() called during low power
+	 * feature entry and then pm_runtime_resume() will wake up the device,
+	 * if the device has already gone into the suspended state. Otherwise,
+	 * the vfio_pci_set_power_state() will change the device power state
+	 * to D0.
+	 */
+	vfio_pci_runtime_pm_exit(vdev);
+	pm_runtime_resume(&pdev->dev);
+
+	/*
+	 * This function calls __pci_reset_function_locked() which internally
+	 * can use pci_pm_reset() for the function reset. pci_pm_reset() will
+	 * fail if the power state is non-D0. Also, for the devices which
+	 * have NoSoftRst-, the reset function can cause the PCI config space
+	 * reset without restoring the original state (saved locally in
+	 * 'vdev->pm_save').
+	 */
+	vfio_pci_set_power_state(vdev, PCI_D0);
+
+	/* Stop the device from further DMA */
+	pci_clear_master(pdev);
+
+	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
+				VFIO_IRQ_SET_ACTION_TRIGGER,
+				vdev->irq_type, 0, 0, NULL);
+
+	/* Device closed, don't need mutex here */
+	list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
+				 &vdev->ioeventfds_list, next) {
+		vfio_virqfd_disable(&ioeventfd->virqfd);
+		list_del(&ioeventfd->next);
+		kfree(ioeventfd);
+	}
+	vdev->ioeventfds_nr = 0;
+
+	vdev->virq_disabled = false;
+
+	for (i = 0; i < vdev->num_regions; i++)
+		vdev->region[i].ops->release(vdev, &vdev->region[i]);
+
+	vdev->num_regions = 0;
+	kfree(vdev->region);
+	vdev->region = NULL; /* don't krealloc a freed pointer */
+
+	vfio_config_free(vdev);
+
+	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
+		bar = i + PCI_STD_RESOURCES;
+		if (!vdev->barmap[bar])
+			continue;
+		pci_iounmap(pdev, vdev->barmap[bar]);
+		pci_release_selected_regions(pdev, 1 << bar);
+		vdev->barmap[bar] = NULL;
+	}
+
+	list_for_each_entry_safe(dummy_res, tmp,
+				 &vdev->dummy_resources_list, res_next) {
+		list_del(&dummy_res->res_next);
+		release_resource(&dummy_res->resource);
+		kfree(dummy_res);
+	}
+
+	vdev->needs_reset = true;
+
+	vfio_pci_zdev_close_device(vdev);
+
+	/*
+	 * If we have saved state, restore it.  If we can reset the device,
+	 * even better.  Resetting with current state seems better than
+	 * nothing, but saving and restoring current state without reset
+	 * is just busy work.
+	 */
+	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
+		pci_info(pdev, "%s: Couldn't reload saved state\n", __func__);
+
+		if (!vdev->reset_works)
+			goto out;
+
+		pci_save_state(pdev);
+	}
+
+	/*
+	 * Disable INTx and MSI, presumably to avoid spurious interrupts
+	 * during reset.  Stolen from pci_reset_function()
+	 */
+	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
+
+	/*
+	 * Try to get the locks ourselves to prevent a deadlock. The
+	 * success of this is dependent on being able to lock the device,
+	 * which is not always possible.
+	 * We can not use the "try" reset interface here, which will
+	 * overwrite the previously restored configuration information.
+	 */
+	if (vdev->reset_works && pci_dev_trylock(pdev)) {
+		if (!__pci_reset_function_locked(pdev))
+			vdev->needs_reset = false;
+		pci_dev_unlock(pdev);
+	}
+
+	pci_restore_state(pdev);
+out:
+	pci_disable_device(pdev);
+
+	vfio_pci_dev_set_try_reset(vdev->vdev.dev_set);
+
+	/* Put the pm-runtime usage counter acquired during enable */
+	if (!disable_idle_d3)
+		pm_runtime_put(&pdev->dev);
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_disable);
+
+void vfio_pci_core_close_device(struct vfio_device *core_vdev)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(core_vdev, struct vfio_pci_core_device, vdev);
+
+	if (vdev->sriov_pf_core_dev) {
+		mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock);
+		WARN_ON(!vdev->sriov_pf_core_dev->vf_token->users);
+		vdev->sriov_pf_core_dev->vf_token->users--;
+		mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock);
+	}
+#if IS_ENABLED(CONFIG_EEH)
+	eeh_dev_release(vdev->pdev);
+#endif
+	vfio_pci_core_disable(vdev);
+
+	mutex_lock(&vdev->igate);
+	if (vdev->err_trigger) {
+		eventfd_ctx_put(vdev->err_trigger);
+		vdev->err_trigger = NULL;
+	}
+	if (vdev->req_trigger) {
+		eventfd_ctx_put(vdev->req_trigger);
+		vdev->req_trigger = NULL;
+	}
+	mutex_unlock(&vdev->igate);
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_close_device);
+
+void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev)
+{
+	vfio_pci_probe_mmaps(vdev);
+#if IS_ENABLED(CONFIG_EEH)
+	eeh_dev_open(vdev->pdev);
+#endif
+
+	if (vdev->sriov_pf_core_dev) {
+		mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock);
+		vdev->sriov_pf_core_dev->vf_token->users++;
+		mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock);
+	}
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable);
+
+static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type)
+{
+	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
+		u8 pin;
+
+		if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) ||
+		    vdev->nointx || vdev->pdev->is_virtfn)
+			return 0;
+
+		pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
+
+		return pin ? 1 : 0;
+	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
+		u8 pos;
+		u16 flags;
+
+		pos = vdev->pdev->msi_cap;
+		if (pos) {
+			pci_read_config_word(vdev->pdev,
+					     pos + PCI_MSI_FLAGS, &flags);
+			return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
+		}
+	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
+		u8 pos;
+		u16 flags;
+
+		pos = vdev->pdev->msix_cap;
+		if (pos) {
+			pci_read_config_word(vdev->pdev,
+					     pos + PCI_MSIX_FLAGS, &flags);
+
+			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
+		}
+	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
+		if (pci_is_pcie(vdev->pdev))
+			return 1;
+	} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
+		return 1;
+	}
+
+	return 0;
+}
+
+static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
+{
+	(*(int *)data)++;
+	return 0;
+}
+
+struct vfio_pci_fill_info {
+	struct vfio_pci_dependent_device __user *devices;
+	struct vfio_pci_dependent_device __user *devices_end;
+	struct vfio_device *vdev;
+	u32 count;
+	u32 flags;
+};
+
+static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
+{
+	struct vfio_pci_dependent_device info = {
+		.segment = pci_domain_nr(pdev->bus),
+		.bus = pdev->bus->number,
+		.devfn = pdev->devfn,
+	};
+	struct vfio_pci_fill_info *fill = data;
+
+	fill->count++;
+	if (fill->devices >= fill->devices_end)
+		return 0;
+
+	if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) {
+		struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev);
+		struct vfio_device_set *dev_set = fill->vdev->dev_set;
+		struct vfio_device *vdev;
+
+		/*
+		 * hot-reset requires all affected devices be represented in
+		 * the dev_set.
+		 */
+		vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
+		if (!vdev) {
+			info.devid = VFIO_PCI_DEVID_NOT_OWNED;
+		} else {
+			int id = vfio_iommufd_get_dev_id(vdev, iommufd);
+
+			if (id > 0)
+				info.devid = id;
+			else if (id == -ENOENT)
+				info.devid = VFIO_PCI_DEVID_OWNED;
+			else
+				info.devid = VFIO_PCI_DEVID_NOT_OWNED;
+		}
+		/* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */
+		if (info.devid == VFIO_PCI_DEVID_NOT_OWNED)
+			fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
+	} else {
+		struct iommu_group *iommu_group;
+
+		iommu_group = iommu_group_get(&pdev->dev);
+		if (!iommu_group)
+			return -EPERM; /* Cannot reset non-isolated devices */
+
+		info.group_id = iommu_group_id(iommu_group);
+		iommu_group_put(iommu_group);
+	}
+
+	if (copy_to_user(fill->devices, &info, sizeof(info)))
+		return -EFAULT;
+	fill->devices++;
+	return 0;
+}
+
+struct vfio_pci_group_info {
+	int count;
+	struct file **files;
+};
+
+static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
+{
+	for (; pdev; pdev = pdev->bus->self)
+		if (pdev->bus == slot->bus)
+			return (pdev->slot == slot);
+	return false;
+}
+
+struct vfio_pci_walk_info {
+	int (*fn)(struct pci_dev *pdev, void *data);
+	void *data;
+	struct pci_dev *pdev;
+	bool slot;
+	int ret;
+};
+
+static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
+{
+	struct vfio_pci_walk_info *walk = data;
+
+	if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
+		walk->ret = walk->fn(pdev, walk->data);
+
+	return walk->ret;
+}
+
+static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
+					 int (*fn)(struct pci_dev *,
+						   void *data), void *data,
+					 bool slot)
+{
+	struct vfio_pci_walk_info walk = {
+		.fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
+	};
+
+	pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);
+
+	return walk.ret;
+}
+
+static int msix_mmappable_cap(struct vfio_pci_core_device *vdev,
+			      struct vfio_info_cap *caps)
+{
+	struct vfio_info_cap_header header = {
+		.id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
+		.version = 1
+	};
+
+	return vfio_info_add_capability(caps, &header, sizeof(header));
+}
+
+int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev,
+				      unsigned int type, unsigned int subtype,
+				      const struct vfio_pci_regops *ops,
+				      size_t size, u32 flags, void *data)
+{
+	struct vfio_pci_region *region;
+
+	region = krealloc(vdev->region,
+			  (vdev->num_regions + 1) * sizeof(*region),
+			  GFP_KERNEL_ACCOUNT);
+	if (!region)
+		return -ENOMEM;
+
+	vdev->region = region;
+	vdev->region[vdev->num_regions].type = type;
+	vdev->region[vdev->num_regions].subtype = subtype;
+	vdev->region[vdev->num_regions].ops = ops;
+	vdev->region[vdev->num_regions].size = size;
+	vdev->region[vdev->num_regions].flags = flags;
+	vdev->region[vdev->num_regions].data = data;
+
+	vdev->num_regions++;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_register_dev_region);
+
+static int vfio_pci_info_atomic_cap(struct vfio_pci_core_device *vdev,
+				    struct vfio_info_cap *caps)
+{
+	struct vfio_device_info_cap_pci_atomic_comp cap = {
+		.header.id = VFIO_DEVICE_INFO_CAP_PCI_ATOMIC_COMP,
+		.header.version = 1
+	};
+	struct pci_dev *pdev = pci_physfn(vdev->pdev);
+	u32 devcap2;
+
+	pcie_capability_read_dword(pdev, PCI_EXP_DEVCAP2, &devcap2);
+
+	if ((devcap2 & PCI_EXP_DEVCAP2_ATOMIC_COMP32) &&
+	    !pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP32))
+		cap.flags |= VFIO_PCI_ATOMIC_COMP32;
+
+	if ((devcap2 & PCI_EXP_DEVCAP2_ATOMIC_COMP64) &&
+	    !pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP64))
+		cap.flags |= VFIO_PCI_ATOMIC_COMP64;
+
+	if ((devcap2 & PCI_EXP_DEVCAP2_ATOMIC_COMP128) &&
+	    !pci_enable_atomic_ops_to_root(pdev,
+					   PCI_EXP_DEVCAP2_ATOMIC_COMP128))
+		cap.flags |= VFIO_PCI_ATOMIC_COMP128;
+
+	if (!cap.flags)
+		return -ENODEV;
+
+	return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
+}
+
+static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev,
+				   struct vfio_device_info __user *arg)
+{
+	unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs);
+	struct vfio_device_info info = {};
+	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+	int ret;
+
+	if (copy_from_user(&info, arg, minsz))
+		return -EFAULT;
+
+	if (info.argsz < minsz)
+		return -EINVAL;
+
+	minsz = min_t(size_t, info.argsz, sizeof(info));
+
+	info.flags = VFIO_DEVICE_FLAGS_PCI;
+
+	if (vdev->reset_works)
+		info.flags |= VFIO_DEVICE_FLAGS_RESET;
+
+	info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
+	info.num_irqs = VFIO_PCI_NUM_IRQS;
+
+	ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
+	if (ret && ret != -ENODEV) {
+		pci_warn(vdev->pdev,
+			 "Failed to setup zPCI info capabilities\n");
+		return ret;
+	}
+
+	ret = vfio_pci_info_atomic_cap(vdev, &caps);
+	if (ret && ret != -ENODEV) {
+		pci_warn(vdev->pdev,
+			 "Failed to setup AtomicOps info capability\n");
+		return ret;
+	}
+
+	if (caps.size) {
+		info.flags |= VFIO_DEVICE_FLAGS_CAPS;
+		if (info.argsz < sizeof(info) + caps.size) {
+			info.argsz = sizeof(info) + caps.size;
+		} else {
+			vfio_info_cap_shift(&caps, sizeof(info));
+			if (copy_to_user(arg + 1, caps.buf, caps.size)) {
+				kfree(caps.buf);
+				return -EFAULT;
+			}
+			info.cap_offset = sizeof(*arg);
+		}
+
+		kfree(caps.buf);
+	}
+
+	return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
+}
+
+static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
+					  struct vfio_region_info __user *arg)
+{
+	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
+	struct pci_dev *pdev = vdev->pdev;
+	struct vfio_region_info info;
+	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+	int i, ret;
+
+	if (copy_from_user(&info, arg, minsz))
+		return -EFAULT;
+
+	if (info.argsz < minsz)
+		return -EINVAL;
+
+	switch (info.index) {
+	case VFIO_PCI_CONFIG_REGION_INDEX:
+		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+		info.size = pdev->cfg_size;
+		info.flags = VFIO_REGION_INFO_FLAG_READ |
+			     VFIO_REGION_INFO_FLAG_WRITE;
+		break;
+	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
+		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+		info.size = pci_resource_len(pdev, info.index);
+		if (!info.size) {
+			info.flags = 0;
+			break;
+		}
+
+		info.flags = VFIO_REGION_INFO_FLAG_READ |
+			     VFIO_REGION_INFO_FLAG_WRITE;
+		if (vdev->bar_mmap_supported[info.index]) {
+			info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
+			if (info.index == vdev->msix_bar) {
+				ret = msix_mmappable_cap(vdev, &caps);
+				if (ret)
+					return ret;
+			}
+		}
+
+		break;
+	case VFIO_PCI_ROM_REGION_INDEX: {
+		void __iomem *io;
+		size_t size;
+		u16 cmd;
+
+		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+		info.flags = 0;
+
+		/* Report the BAR size, not the ROM size */
+		info.size = pci_resource_len(pdev, info.index);
+		if (!info.size) {
+			/* Shadow ROMs appear as PCI option ROMs */
+			if (pdev->resource[PCI_ROM_RESOURCE].flags &
+			    IORESOURCE_ROM_SHADOW)
+				info.size = 0x20000;
+			else
+				break;
+		}
+
+		/*
+		 * Is it really there?  Enable memory decode for implicit access
+		 * in pci_map_rom().
+		 */
+		cmd = vfio_pci_memory_lock_and_enable(vdev);
+		io = pci_map_rom(pdev, &size);
+		if (io) {
+			info.flags = VFIO_REGION_INFO_FLAG_READ;
+			pci_unmap_rom(pdev, io);
+		} else {
+			info.size = 0;
+		}
+		vfio_pci_memory_unlock_and_restore(vdev, cmd);
+
+		break;
+	}
+	case VFIO_PCI_VGA_REGION_INDEX:
+		if (!vdev->has_vga)
+			return -EINVAL;
+
+		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+		info.size = 0xc0000;
+		info.flags = VFIO_REGION_INFO_FLAG_READ |
+			     VFIO_REGION_INFO_FLAG_WRITE;
+
+		break;
+	default: {
+		struct vfio_region_info_cap_type cap_type = {
+			.header.id = VFIO_REGION_INFO_CAP_TYPE,
+			.header.version = 1
+		};
+
+		if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
+			return -EINVAL;
+		info.index = array_index_nospec(
+			info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions);
+
+		i = info.index - VFIO_PCI_NUM_REGIONS;
+
+		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+		info.size = vdev->region[i].size;
+		info.flags = vdev->region[i].flags;
+
+		cap_type.type = vdev->region[i].type;
+		cap_type.subtype = vdev->region[i].subtype;
+
+		ret = vfio_info_add_capability(&caps, &cap_type.header,
+					       sizeof(cap_type));
+		if (ret)
+			return ret;
+
+		if (vdev->region[i].ops->add_capability) {
+			ret = vdev->region[i].ops->add_capability(
+				vdev, &vdev->region[i], &caps);
+			if (ret)
+				return ret;
+		}
+	}
+	}
+
+	if (caps.size) {
+		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
+		if (info.argsz < sizeof(info) + caps.size) {
+			info.argsz = sizeof(info) + caps.size;
+			info.cap_offset = 0;
+		} else {
+			vfio_info_cap_shift(&caps, sizeof(info));
+			if (copy_to_user(arg + 1, caps.buf, caps.size)) {
+				kfree(caps.buf);
+				return -EFAULT;
+			}
+			info.cap_offset = sizeof(*arg);
+		}
+
+		kfree(caps.buf);
+	}
+
+	return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
+}
+
+static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev,
+				       struct vfio_irq_info __user *arg)
+{
+	unsigned long minsz = offsetofend(struct vfio_irq_info, count);
+	struct vfio_irq_info info;
+
+	if (copy_from_user(&info, arg, minsz))
+		return -EFAULT;
+
+	if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
+		return -EINVAL;
+
+	switch (info.index) {
+	case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
+	case VFIO_PCI_REQ_IRQ_INDEX:
+		break;
+	case VFIO_PCI_ERR_IRQ_INDEX:
+		if (pci_is_pcie(vdev->pdev))
+			break;
+		fallthrough;
+	default:
+		return -EINVAL;
+	}
+
+	info.flags = VFIO_IRQ_INFO_EVENTFD;
+
+	info.count = vfio_pci_get_irq_count(vdev, info.index);
+
+	if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
+		info.flags |=
+			(VFIO_IRQ_INFO_MASKABLE | VFIO_IRQ_INFO_AUTOMASKED);
+	else if (info.index != VFIO_PCI_MSIX_IRQ_INDEX || !vdev->has_dyn_msix)
+		info.flags |= VFIO_IRQ_INFO_NORESIZE;
+
+	return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
+}
+
+static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
+				   struct vfio_irq_set __user *arg)
+{
+	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
+	struct vfio_irq_set hdr;
+	u8 *data = NULL;
+	int max, ret = 0;
+	size_t data_size = 0;
+
+	if (copy_from_user(&hdr, arg, minsz))
+		return -EFAULT;
+
+	max = vfio_pci_get_irq_count(vdev, hdr.index);
+
+	ret = vfio_set_irqs_validate_and_prepare(&hdr, max, VFIO_PCI_NUM_IRQS,
+						 &data_size);
+	if (ret)
+		return ret;
+
+	if (data_size) {
+		data = memdup_user(&arg->data, data_size);
+		if (IS_ERR(data))
+			return PTR_ERR(data);
+	}
+
+	mutex_lock(&vdev->igate);
+
+	ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, hdr.start,
+				      hdr.count, data);
+
+	mutex_unlock(&vdev->igate);
+	kfree(data);
+
+	return ret;
+}
+
+static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
+				void __user *arg)
+{
+	int ret;
+
+	if (!vdev->reset_works)
+		return -EINVAL;
+
+	vfio_pci_zap_and_down_write_memory_lock(vdev);
+
+	/*
+	 * This function can be invoked while the power state is non-D0. If
+	 * pci_try_reset_function() has been called while the power state is
+	 * non-D0, then pci_try_reset_function() will internally set the power
+	 * state to D0 without vfio driver involvement. For the devices which
+	 * have NoSoftRst-, the reset function can cause the PCI config space
+	 * reset without restoring the original state (saved locally in
+	 * 'vdev->pm_save').
+	 */
+	vfio_pci_set_power_state(vdev, PCI_D0);
+
+	ret = pci_try_reset_function(vdev->pdev);
+	up_write(&vdev->memory_lock);
+
+	return ret;
+}
+
+static int vfio_pci_ioctl_get_pci_hot_reset_info(
+	struct vfio_pci_core_device *vdev,
+	struct vfio_pci_hot_reset_info __user *arg)
+{
+	unsigned long minsz =
+		offsetofend(struct vfio_pci_hot_reset_info, count);
+	struct vfio_pci_hot_reset_info hdr;
+	struct vfio_pci_fill_info fill = {};
+	bool slot = false;
+	int ret = 0;
+
+	if (copy_from_user(&hdr, arg, minsz))
+		return -EFAULT;
+
+	if (hdr.argsz < minsz)
+		return -EINVAL;
+
+	hdr.flags = 0;
+
+	/* Can we do a slot or bus reset or neither? */
+	if (!pci_probe_reset_slot(vdev->pdev->slot))
+		slot = true;
+	else if (pci_probe_reset_bus(vdev->pdev->bus))
+		return -ENODEV;
+
+	fill.devices = arg->devices;
+	fill.devices_end = arg->devices +
+			   (hdr.argsz - sizeof(hdr)) / sizeof(arg->devices[0]);
+	fill.vdev = &vdev->vdev;
+
+	if (vfio_device_cdev_opened(&vdev->vdev))
+		fill.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID |
+			     VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
+
+	mutex_lock(&vdev->vdev.dev_set->lock);
+	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs,
+					    &fill, slot);
+	mutex_unlock(&vdev->vdev.dev_set->lock);
+	if (ret)
+		return ret;
+
+	hdr.count = fill.count;
+	hdr.flags = fill.flags;
+	if (copy_to_user(arg, &hdr, minsz))
+		return -EFAULT;
+
+	if (fill.count > fill.devices - arg->devices)
+		return -ENOSPC;
+	return 0;
+}
+
+static int
+vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
+				    int array_count, bool slot,
+				    struct vfio_pci_hot_reset __user *arg)
+{
+	int32_t *group_fds;
+	struct file **files;
+	struct vfio_pci_group_info info;
+	int file_idx, count = 0, ret = 0;
+
+	/*
+	 * We can't let userspace give us an arbitrarily large buffer to copy,
+	 * so verify how many we think there could be.  Note groups can have
+	 * multiple devices so one group per device is the max.
+	 */
+	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
+					    &count, slot);
+	if (ret)
+		return ret;
+
+	if (array_count > count)
+		return -EINVAL;
+
+	group_fds = kcalloc(array_count, sizeof(*group_fds), GFP_KERNEL);
+	files = kcalloc(array_count, sizeof(*files), GFP_KERNEL);
+	if (!group_fds || !files) {
+		kfree(group_fds);
+		kfree(files);
+		return -ENOMEM;
+	}
+
+	if (copy_from_user(group_fds, arg->group_fds,
+			   array_count * sizeof(*group_fds))) {
+		kfree(group_fds);
+		kfree(files);
+		return -EFAULT;
+	}
+
+	/*
+	 * Get the group file for each fd to ensure the group is held across
+	 * the reset
+	 */
+	for (file_idx = 0; file_idx < array_count; file_idx++) {
+		struct file *file = fget(group_fds[file_idx]);
+
+		if (!file) {
+			ret = -EBADF;
+			break;
+		}
+
+		/* Ensure the FD is a vfio group FD.*/
+		if (!vfio_file_is_group(file)) {
+			fput(file);
+			ret = -EINVAL;
+			break;
+		}
+
+		files[file_idx] = file;
+	}
+
+	kfree(group_fds);
+
+	/* release reference to groups on error */
+	if (ret)
+		goto hot_reset_release;
+
+	info.count = array_count;
+	info.files = files;
+
+	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL);
+
+hot_reset_release:
+	for (file_idx--; file_idx >= 0; file_idx--)
+		fput(files[file_idx]);
+
+	kfree(files);
+	return ret;
+}
+
+static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
+					struct vfio_pci_hot_reset __user *arg)
+{
+	unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count);
+	struct vfio_pci_hot_reset hdr;
+	bool slot = false;
+
+	if (copy_from_user(&hdr, arg, minsz))
+		return -EFAULT;
+
+	if (hdr.argsz < minsz || hdr.flags)
+		return -EINVAL;
+
+	/* zero-length array is only for cdev opened devices */
+	if (!!hdr.count == vfio_device_cdev_opened(&vdev->vdev))
+		return -EINVAL;
+
+	/* Can we do a slot or bus reset or neither? */
+	if (!pci_probe_reset_slot(vdev->pdev->slot))
+		slot = true;
+	else if (pci_probe_reset_bus(vdev->pdev->bus))
+		return -ENODEV;
+
+	if (hdr.count)
+		return vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg);
+
+	return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL,
+					  vfio_iommufd_device_ictx(&vdev->vdev));
+}
+
+static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
+				    struct vfio_device_ioeventfd __user *arg)
+{
+	unsigned long minsz = offsetofend(struct vfio_device_ioeventfd, fd);
+	struct vfio_device_ioeventfd ioeventfd;
+	int count;
+
+	if (copy_from_user(&ioeventfd, arg, minsz))
+		return -EFAULT;
+
+	if (ioeventfd.argsz < minsz)
+		return -EINVAL;
+
+	if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK)
+		return -EINVAL;
+
+	count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK;
+
+	if (hweight8(count) != 1 || ioeventfd.fd < -1)
+		return -EINVAL;
+
+	return vfio_pci_ioeventfd(vdev, ioeventfd.offset, ioeventfd.data, count,
+				  ioeventfd.fd);
+}
+
+long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
+			 unsigned long arg)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(core_vdev, struct vfio_pci_core_device, vdev);
+	void __user *uarg = (void __user *)arg;
+
+	switch (cmd) {
+	case VFIO_DEVICE_GET_INFO:
+		return vfio_pci_ioctl_get_info(vdev, uarg);
+	case VFIO_DEVICE_GET_IRQ_INFO:
+		return vfio_pci_ioctl_get_irq_info(vdev, uarg);
+	case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO:
+		return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg);
+	case VFIO_DEVICE_GET_REGION_INFO:
+		return vfio_pci_ioctl_get_region_info(vdev, uarg);
+	case VFIO_DEVICE_IOEVENTFD:
+		return vfio_pci_ioctl_ioeventfd(vdev, uarg);
+	case VFIO_DEVICE_PCI_HOT_RESET:
+		return vfio_pci_ioctl_pci_hot_reset(vdev, uarg);
+	case VFIO_DEVICE_RESET:
+		return vfio_pci_ioctl_reset(vdev, uarg);
+	case VFIO_DEVICE_SET_IRQS:
+		return vfio_pci_ioctl_set_irqs(vdev, uarg);
+	default:
+		return -ENOTTY;
+	}
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl);
+
+static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
+				       uuid_t __user *arg, size_t argsz)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(device, struct vfio_pci_core_device, vdev);
+	uuid_t uuid;
+	int ret;
+
+	if (!vdev->vf_token)
+		return -ENOTTY;
+	/*
+	 * We do not support GET of the VF Token UUID as this could
+	 * expose the token of the previous device user.
+	 */
+	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
+				 sizeof(uuid));
+	if (ret != 1)
+		return ret;
+
+	if (copy_from_user(&uuid, arg, sizeof(uuid)))
+		return -EFAULT;
+
+	mutex_lock(&vdev->vf_token->lock);
+	uuid_copy(&vdev->vf_token->uuid, &uuid);
+	mutex_unlock(&vdev->vf_token->lock);
+	return 0;
+}
+
+int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
+				void __user *arg, size_t argsz)
+{
+	switch (flags & VFIO_DEVICE_FEATURE_MASK) {
+	case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY:
+		return vfio_pci_core_pm_entry(device, flags, arg, argsz);
+	case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP:
+		return vfio_pci_core_pm_entry_with_wakeup(device, flags,
+							  arg, argsz);
+	case VFIO_DEVICE_FEATURE_LOW_POWER_EXIT:
+		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
+	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
+		return vfio_pci_core_feature_token(device, flags, arg, argsz);
+	default:
+		return -ENOTTY;
+	}
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl_feature);
+
+static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf,
+			   size_t count, loff_t *ppos, bool iswrite)
+{
+	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+	int ret;
+
+	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
+		return -EINVAL;
+
+	ret = pm_runtime_resume_and_get(&vdev->pdev->dev);
+	if (ret) {
+		pci_info_ratelimited(vdev->pdev, "runtime resume failed %d\n",
+				     ret);
+		return -EIO;
+	}
+
+	switch (index) {
+	case VFIO_PCI_CONFIG_REGION_INDEX:
+		ret = vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
+		break;
+
+	case VFIO_PCI_ROM_REGION_INDEX:
+		if (iswrite)
+			ret = -EINVAL;
+		else
+			ret = vfio_pci_bar_rw(vdev, buf, count, ppos, false);
+		break;
+
+	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
+		ret = vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);
+		break;
+
+	case VFIO_PCI_VGA_REGION_INDEX:
+		ret = vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
+		break;
+
+	default:
+		index -= VFIO_PCI_NUM_REGIONS;
+		ret = vdev->region[index].ops->rw(vdev, buf,
+						   count, ppos, iswrite);
+		break;
+	}
+
+	pm_runtime_put(&vdev->pdev->dev);
+	return ret;
+}
+
+ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
+		size_t count, loff_t *ppos)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(core_vdev, struct vfio_pci_core_device, vdev);
+
+	if (!count)
+		return 0;
+
+	return vfio_pci_rw(vdev, buf, count, ppos, false);
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_read);
+
+ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
+		size_t count, loff_t *ppos)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(core_vdev, struct vfio_pci_core_device, vdev);
+
+	if (!count)
+		return 0;
+
+	return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true);
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_write);
+
+/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */
+static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try)
+{
+	struct vfio_pci_mmap_vma *mmap_vma, *tmp;
+
+	/*
+	 * Lock ordering:
+	 * vma_lock is nested under mmap_lock for vm_ops callback paths.
+	 * The memory_lock semaphore is used by both code paths calling
+	 * into this function to zap vmas and the vm_ops.fault callback
+	 * to protect the memory enable state of the device.
+	 *
+	 * When zapping vmas we need to maintain the mmap_lock => vma_lock
+	 * ordering, which requires using vma_lock to walk vma_list to
+	 * acquire an mm, then dropping vma_lock to get the mmap_lock and
+	 * reacquiring vma_lock.  This logic is derived from similar
+	 * requirements in uverbs_user_mmap_disassociate().
+	 *
+	 * mmap_lock must always be the top-level lock when it is taken.
+	 * Therefore we can only hold the memory_lock write lock when
+	 * vma_list is empty, as we'd need to take mmap_lock to clear
+	 * entries.  vma_list can only be guaranteed empty when holding
+	 * vma_lock, thus memory_lock is nested under vma_lock.
+	 *
+	 * This enables the vm_ops.fault callback to acquire vma_lock,
+	 * followed by memory_lock read lock, while already holding
+	 * mmap_lock without risk of deadlock.
+	 */
+	while (1) {
+		struct mm_struct *mm = NULL;
+
+		if (try) {
+			if (!mutex_trylock(&vdev->vma_lock))
+				return 0;
+		} else {
+			mutex_lock(&vdev->vma_lock);
+		}
+		while (!list_empty(&vdev->vma_list)) {
+			mmap_vma = list_first_entry(&vdev->vma_list,
+						    struct vfio_pci_mmap_vma,
+						    vma_next);
+			mm = mmap_vma->vma->vm_mm;
+			if (mmget_not_zero(mm))
+				break;
+
+			list_del(&mmap_vma->vma_next);
+			kfree(mmap_vma);
+			mm = NULL;
+		}
+		if (!mm)
+			return 1;
+		mutex_unlock(&vdev->vma_lock);
+
+		if (try) {
+			if (!mmap_read_trylock(mm)) {
+				mmput(mm);
+				return 0;
+			}
+		} else {
+			mmap_read_lock(mm);
+		}
+		if (try) {
+			if (!mutex_trylock(&vdev->vma_lock)) {
+				mmap_read_unlock(mm);
+				mmput(mm);
+				return 0;
+			}
+		} else {
+			mutex_lock(&vdev->vma_lock);
+		}
+		list_for_each_entry_safe(mmap_vma, tmp,
+					 &vdev->vma_list, vma_next) {
+			struct vm_area_struct *vma = mmap_vma->vma;
+
+			if (vma->vm_mm != mm)
+				continue;
+
+			list_del(&mmap_vma->vma_next);
+			kfree(mmap_vma);
+
+			zap_vma_ptes(vma, vma->vm_start,
+				     vma->vm_end - vma->vm_start);
+		}
+		mutex_unlock(&vdev->vma_lock);
+		mmap_read_unlock(mm);
+		mmput(mm);
+	}
+}
+
+void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev)
+{
+	vfio_pci_zap_and_vma_lock(vdev, false);
+	down_write(&vdev->memory_lock);
+	mutex_unlock(&vdev->vma_lock);
+}
+
+u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev)
+{
+	u16 cmd;
+
+	down_write(&vdev->memory_lock);
+	pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd);
+	if (!(cmd & PCI_COMMAND_MEMORY))
+		pci_write_config_word(vdev->pdev, PCI_COMMAND,
+				      cmd | PCI_COMMAND_MEMORY);
+
+	return cmd;
+}
+
+void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd)
+{
+	pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd);
+	up_write(&vdev->memory_lock);
+}
+
+/* Caller holds vma_lock */
+static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev,
+			      struct vm_area_struct *vma)
+{
+	struct vfio_pci_mmap_vma *mmap_vma;
+
+	mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL_ACCOUNT);
+	if (!mmap_vma)
+		return -ENOMEM;
+
+	mmap_vma->vma = vma;
+	list_add(&mmap_vma->vma_next, &vdev->vma_list);
+
+	return 0;
+}
+
+/*
+ * Zap mmaps on open so that we can fault them in on access and therefore
+ * our vma_list only tracks mappings accessed since last zap.
+ */
+static void vfio_pci_mmap_open(struct vm_area_struct *vma)
+{
+	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+}
+
+static void vfio_pci_mmap_close(struct vm_area_struct *vma)
+{
+	struct vfio_pci_core_device *vdev = vma->vm_private_data;
+	struct vfio_pci_mmap_vma *mmap_vma;
+
+	mutex_lock(&vdev->vma_lock);
+	list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
+		if (mmap_vma->vma == vma) {
+			list_del(&mmap_vma->vma_next);
+			kfree(mmap_vma);
+			break;
+		}
+	}
+	mutex_unlock(&vdev->vma_lock);
+}
+
+static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct vfio_pci_core_device *vdev = vma->vm_private_data;
+	struct vfio_pci_mmap_vma *mmap_vma;
+	vm_fault_t ret = VM_FAULT_NOPAGE;
+
+	mutex_lock(&vdev->vma_lock);
+	down_read(&vdev->memory_lock);
+
+	/*
+	 * Memory region cannot be accessed if the low power feature is engaged
+	 * or memory access is disabled.
+	 */
+	if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) {
+		ret = VM_FAULT_SIGBUS;
+		goto up_out;
+	}
+
+	/*
+	 * We populate the whole vma on fault, so we need to test whether
+	 * the vma has already been mapped, such as for concurrent faults
+	 * to the same vma.  io_remap_pfn_range() will trigger a BUG_ON if
+	 * we ask it to fill the same range again.
+	 */
+	list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
+		if (mmap_vma->vma == vma)
+			goto up_out;
+	}
+
+	if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+			       vma->vm_end - vma->vm_start,
+			       vma->vm_page_prot)) {
+		ret = VM_FAULT_SIGBUS;
+		zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+		goto up_out;
+	}
+
+	if (__vfio_pci_add_vma(vdev, vma)) {
+		ret = VM_FAULT_OOM;
+		zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+	}
+
+up_out:
+	up_read(&vdev->memory_lock);
+	mutex_unlock(&vdev->vma_lock);
+	return ret;
+}
+
+static const struct vm_operations_struct vfio_pci_mmap_ops = {
+	.open = vfio_pci_mmap_open,
+	.close = vfio_pci_mmap_close,
+	.fault = vfio_pci_mmap_fault,
+};
+
+int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(core_vdev, struct vfio_pci_core_device, vdev);
+	struct pci_dev *pdev = vdev->pdev;
+	unsigned int index;
+	u64 phys_len, req_len, pgoff, req_start;
+	int ret;
+
+	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+
+	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
+		return -EINVAL;
+	if (vma->vm_end < vma->vm_start)
+		return -EINVAL;
+	if ((vma->vm_flags & VM_SHARED) == 0)
+		return -EINVAL;
+	if (index >= VFIO_PCI_NUM_REGIONS) {
+		int regnum = index - VFIO_PCI_NUM_REGIONS;
+		struct vfio_pci_region *region = vdev->region + regnum;
+
+		if (region->ops && region->ops->mmap &&
+		    (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
+			return region->ops->mmap(vdev, region, vma);
+		return -EINVAL;
+	}
+	if (index >= VFIO_PCI_ROM_REGION_INDEX)
+		return -EINVAL;
+	if (!vdev->bar_mmap_supported[index])
+		return -EINVAL;
+
+	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
+	req_len = vma->vm_end - vma->vm_start;
+	pgoff = vma->vm_pgoff &
+		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+	req_start = pgoff << PAGE_SHIFT;
+
+	if (req_start + req_len > phys_len)
+		return -EINVAL;
+
+	/*
+	 * Even though we don't make use of the barmap for the mmap,
+	 * we need to request the region and the barmap tracks that.
+	 */
+	if (!vdev->barmap[index]) {
+		ret = pci_request_selected_regions(pdev,
+						   1 << index, "vfio-pci");
+		if (ret)
+			return ret;
+
+		vdev->barmap[index] = pci_iomap(pdev, index, 0);
+		if (!vdev->barmap[index]) {
+			pci_release_selected_regions(pdev, 1 << index);
+			return -ENOMEM;
+		}
+	}
+
+	vma->vm_private_data = vdev;
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
+
+	/*
+	 * See remap_pfn_range(), called from vfio_pci_fault() but we can't
+	 * change vm_flags within the fault handler.  Set them now.
+	 */
+	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
+	vma->vm_ops = &vfio_pci_mmap_ops;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_mmap);
+
+void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(core_vdev, struct vfio_pci_core_device, vdev);
+	struct pci_dev *pdev = vdev->pdev;
+
+	mutex_lock(&vdev->igate);
+
+	if (vdev->req_trigger) {
+		if (!(count % 10))
+			pci_notice_ratelimited(pdev,
+				"Relaying device request to user (#%u)\n",
+				count);
+		eventfd_signal(vdev->req_trigger, 1);
+	} else if (count == 0) {
+		pci_warn(pdev,
+			"No device request channel registered, blocked until released by user\n");
+	}
+
+	mutex_unlock(&vdev->igate);
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_request);
+
+static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
+				      bool vf_token, uuid_t *uuid)
+{
+	/*
+	 * There's always some degree of trust or collaboration between SR-IOV
+	 * PF and VFs, even if just that the PF hosts the SR-IOV capability and
+	 * can disrupt VFs with a reset, but often the PF has more explicit
+	 * access to deny service to the VF or access data passed through the
+	 * VF.  We therefore require an opt-in via a shared VF token (UUID) to
+	 * represent this trust.  This both prevents that a VF driver might
+	 * assume the PF driver is a trusted, in-kernel driver, and also that
+	 * a PF driver might be replaced with a rogue driver, unknown to in-use
+	 * VF drivers.
+	 *
+	 * Therefore when presented with a VF, if the PF is a vfio device and
+	 * it is bound to the vfio-pci driver, the user needs to provide a VF
+	 * token to access the device, in the form of appending a vf_token to
+	 * the device name, for example:
+	 *
+	 * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"
+	 *
+	 * When presented with a PF which has VFs in use, the user must also
+	 * provide the current VF token to prove collaboration with existing
+	 * VF users.  If VFs are not in use, the VF token provided for the PF
+	 * device will act to set the VF token.
+	 *
+	 * If the VF token is provided but unused, an error is generated.
+	 */
+	if (vdev->pdev->is_virtfn) {
+		struct vfio_pci_core_device *pf_vdev = vdev->sriov_pf_core_dev;
+		bool match;
+
+		if (!pf_vdev) {
+			if (!vf_token)
+				return 0; /* PF is not vfio-pci, no VF token */
+
+			pci_info_ratelimited(vdev->pdev,
+				"VF token incorrectly provided, PF not bound to vfio-pci\n");
+			return -EINVAL;
+		}
+
+		if (!vf_token) {
+			pci_info_ratelimited(vdev->pdev,
+				"VF token required to access device\n");
+			return -EACCES;
+		}
+
+		mutex_lock(&pf_vdev->vf_token->lock);
+		match = uuid_equal(uuid, &pf_vdev->vf_token->uuid);
+		mutex_unlock(&pf_vdev->vf_token->lock);
+
+		if (!match) {
+			pci_info_ratelimited(vdev->pdev,
+				"Incorrect VF token provided for device\n");
+			return -EACCES;
+		}
+	} else if (vdev->vf_token) {
+		mutex_lock(&vdev->vf_token->lock);
+		if (vdev->vf_token->users) {
+			if (!vf_token) {
+				mutex_unlock(&vdev->vf_token->lock);
+				pci_info_ratelimited(vdev->pdev,
+					"VF token required to access device\n");
+				return -EACCES;
+			}
+
+			if (!uuid_equal(uuid, &vdev->vf_token->uuid)) {
+				mutex_unlock(&vdev->vf_token->lock);
+				pci_info_ratelimited(vdev->pdev,
+					"Incorrect VF token provided for device\n");
+				return -EACCES;
+			}
+		} else if (vf_token) {
+			uuid_copy(&vdev->vf_token->uuid, uuid);
+		}
+
+		mutex_unlock(&vdev->vf_token->lock);
+	} else if (vf_token) {
+		pci_info_ratelimited(vdev->pdev,
+			"VF token incorrectly provided, not a PF or VF\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+#define VF_TOKEN_ARG "vf_token="
+
+int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(core_vdev, struct vfio_pci_core_device, vdev);
+	bool vf_token = false;
+	uuid_t uuid;
+	int ret;
+
+	if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev))))
+		return 0; /* No match */
+
+	if (strlen(buf) > strlen(pci_name(vdev->pdev))) {
+		buf += strlen(pci_name(vdev->pdev));
+
+		if (*buf != ' ')
+			return 0; /* No match: non-whitespace after name */
+
+		while (*buf) {
+			if (*buf == ' ') {
+				buf++;
+				continue;
+			}
+
+			if (!vf_token && !strncmp(buf, VF_TOKEN_ARG,
+						  strlen(VF_TOKEN_ARG))) {
+				buf += strlen(VF_TOKEN_ARG);
+
+				if (strlen(buf) < UUID_STRING_LEN)
+					return -EINVAL;
+
+				ret = uuid_parse(buf, &uuid);
+				if (ret)
+					return ret;
+
+				vf_token = true;
+				buf += UUID_STRING_LEN;
+			} else {
+				/* Unknown/duplicate option */
+				return -EINVAL;
+			}
+		}
+	}
+
+	ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid);
+	if (ret)
+		return ret;
+
+	return 1; /* Match */
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_match);
+
+static int vfio_pci_bus_notifier(struct notifier_block *nb,
+				 unsigned long action, void *data)
+{
+	struct vfio_pci_core_device *vdev = container_of(nb,
+						    struct vfio_pci_core_device, nb);
+	struct device *dev = data;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct pci_dev *physfn = pci_physfn(pdev);
+
+	if (action == BUS_NOTIFY_ADD_DEVICE &&
+	    pdev->is_virtfn && physfn == vdev->pdev) {
+		pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n",
+			 pci_name(pdev));
+		pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
+						  vdev->vdev.ops->name);
+	} else if (action == BUS_NOTIFY_BOUND_DRIVER &&
+		   pdev->is_virtfn && physfn == vdev->pdev) {
+		struct pci_driver *drv = pci_dev_driver(pdev);
+
+		if (drv && drv != pci_dev_driver(vdev->pdev))
+			pci_warn(vdev->pdev,
+				 "VF %s bound to driver %s while PF bound to driver %s\n",
+				 pci_name(pdev), drv->name,
+				 pci_dev_driver(vdev->pdev)->name);
+	}
+
+	return 0;
+}
+
+static int vfio_pci_vf_init(struct vfio_pci_core_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	struct vfio_pci_core_device *cur;
+	struct pci_dev *physfn;
+	int ret;
+
+	if (pdev->is_virtfn) {
+		/*
+		 * If this VF was created by our vfio_pci_core_sriov_configure()
+		 * then we can find the PF vfio_pci_core_device now, and due to
+		 * the locking in pci_disable_sriov() it cannot change until
+		 * this VF device driver is removed.
+		 */
+		physfn = pci_physfn(vdev->pdev);
+		mutex_lock(&vfio_pci_sriov_pfs_mutex);
+		list_for_each_entry(cur, &vfio_pci_sriov_pfs, sriov_pfs_item) {
+			if (cur->pdev == physfn) {
+				vdev->sriov_pf_core_dev = cur;
+				break;
+			}
+		}
+		mutex_unlock(&vfio_pci_sriov_pfs_mutex);
+		return 0;
+	}
+
+	/* Not a SRIOV PF */
+	if (!pdev->is_physfn)
+		return 0;
+
+	vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL);
+	if (!vdev->vf_token)
+		return -ENOMEM;
+
+	mutex_init(&vdev->vf_token->lock);
+	uuid_gen(&vdev->vf_token->uuid);
+
+	vdev->nb.notifier_call = vfio_pci_bus_notifier;
+	ret = bus_register_notifier(&pci_bus_type, &vdev->nb);
+	if (ret) {
+		kfree(vdev->vf_token);
+		return ret;
+	}
+	return 0;
+}
+
+static void vfio_pci_vf_uninit(struct vfio_pci_core_device *vdev)
+{
+	if (!vdev->vf_token)
+		return;
+
+	bus_unregister_notifier(&pci_bus_type, &vdev->nb);
+	WARN_ON(vdev->vf_token->users);
+	mutex_destroy(&vdev->vf_token->lock);
+	kfree(vdev->vf_token);
+}
+
+static int vfio_pci_vga_init(struct vfio_pci_core_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	int ret;
+
+	if (!vfio_pci_is_vga(pdev))
+		return 0;
+
+	ret = aperture_remove_conflicting_pci_devices(pdev, vdev->vdev.ops->name);
+	if (ret)
+		return ret;
+
+	ret = vga_client_register(pdev, vfio_pci_set_decode);
+	if (ret)
+		return ret;
+	vga_set_legacy_decoding(pdev, vfio_pci_set_decode(pdev, false));
+	return 0;
+}
+
+static void vfio_pci_vga_uninit(struct vfio_pci_core_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+
+	if (!vfio_pci_is_vga(pdev))
+		return;
+	vga_client_unregister(pdev);
+	vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
+					      VGA_RSRC_LEGACY_IO |
+					      VGA_RSRC_LEGACY_MEM);
+}
+
+int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(core_vdev, struct vfio_pci_core_device, vdev);
+
+	vdev->pdev = to_pci_dev(core_vdev->dev);
+	vdev->irq_type = VFIO_PCI_NUM_IRQS;
+	mutex_init(&vdev->igate);
+	spin_lock_init(&vdev->irqlock);
+	mutex_init(&vdev->ioeventfds_lock);
+	INIT_LIST_HEAD(&vdev->dummy_resources_list);
+	INIT_LIST_HEAD(&vdev->ioeventfds_list);
+	mutex_init(&vdev->vma_lock);
+	INIT_LIST_HEAD(&vdev->vma_list);
+	INIT_LIST_HEAD(&vdev->sriov_pfs_item);
+	init_rwsem(&vdev->memory_lock);
+	xa_init(&vdev->ctx);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_init_dev);
+
+void vfio_pci_core_release_dev(struct vfio_device *core_vdev)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(core_vdev, struct vfio_pci_core_device, vdev);
+
+	mutex_destroy(&vdev->igate);
+	mutex_destroy(&vdev->ioeventfds_lock);
+	mutex_destroy(&vdev->vma_lock);
+	kfree(vdev->region);
+	kfree(vdev->pm_save);
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_release_dev);
+
+int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	struct device *dev = &pdev->dev;
+	int ret;
+
+	/* Drivers must set the vfio_pci_core_device to their drvdata */
+	if (WARN_ON(vdev != dev_get_drvdata(dev)))
+		return -EINVAL;
+
+	if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
+		return -EINVAL;
+
+	if (vdev->vdev.mig_ops) {
+		if (!(vdev->vdev.mig_ops->migration_get_state &&
+		      vdev->vdev.mig_ops->migration_set_state &&
+		      vdev->vdev.mig_ops->migration_get_data_size) ||
+		    !(vdev->vdev.migration_flags & VFIO_MIGRATION_STOP_COPY))
+			return -EINVAL;
+	}
+
+	if (vdev->vdev.log_ops && !(vdev->vdev.log_ops->log_start &&
+	    vdev->vdev.log_ops->log_stop &&
+	    vdev->vdev.log_ops->log_read_and_clear))
+		return -EINVAL;
+
+	/*
+	 * Prevent binding to PFs with VFs enabled, the VFs might be in use
+	 * by the host or other users.  We cannot capture the VFs if they
+	 * already exist, nor can we track VF users.  Disabling SR-IOV here
+	 * would initiate removing the VFs, which would unbind the driver,
+	 * which is prone to blocking if that VF is also in use by vfio-pci.
+	 * Just reject these PFs and let the user sort it out.
+	 */
+	if (pci_num_vf(pdev)) {
+		pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
+		return -EBUSY;
+	}
+
+	if (pci_is_root_bus(pdev->bus)) {
+		ret = vfio_assign_device_set(&vdev->vdev, vdev);
+	} else if (!pci_probe_reset_slot(pdev->slot)) {
+		ret = vfio_assign_device_set(&vdev->vdev, pdev->slot);
+	} else {
+		/*
+		 * If there is no slot reset support for this device, the whole
+		 * bus needs to be grouped together to support bus-wide resets.
+		 */
+		ret = vfio_assign_device_set(&vdev->vdev, pdev->bus);
+	}
+
+	if (ret)
+		return ret;
+	ret = vfio_pci_vf_init(vdev);
+	if (ret)
+		return ret;
+	ret = vfio_pci_vga_init(vdev);
+	if (ret)
+		goto out_vf;
+
+	vfio_pci_probe_power_state(vdev);
+
+	/*
+	 * pci-core sets the device power state to an unknown value at
+	 * bootup and after being removed from a driver.  The only
+	 * transition it allows from this unknown state is to D0, which
+	 * typically happens when a driver calls pci_enable_device().
+	 * We're not ready to enable the device yet, but we do want to
+	 * be able to get to D3.  Therefore first do a D0 transition
+	 * before enabling runtime PM.
+	 */
+	vfio_pci_set_power_state(vdev, PCI_D0);
+
+	dev->driver->pm = &vfio_pci_core_pm_ops;
+	pm_runtime_allow(dev);
+	if (!disable_idle_d3)
+		pm_runtime_put(dev);
+
+	ret = vfio_register_group_dev(&vdev->vdev);
+	if (ret)
+		goto out_power;
+	return 0;
+
+out_power:
+	if (!disable_idle_d3)
+		pm_runtime_get_noresume(dev);
+
+	pm_runtime_forbid(dev);
+out_vf:
+	vfio_pci_vf_uninit(vdev);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_register_device);
+
+void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev)
+{
+	vfio_pci_core_sriov_configure(vdev, 0);
+
+	vfio_unregister_group_dev(&vdev->vdev);
+
+	vfio_pci_vf_uninit(vdev);
+	vfio_pci_vga_uninit(vdev);
+
+	if (!disable_idle_d3)
+		pm_runtime_get_noresume(&vdev->pdev->dev);
+
+	pm_runtime_forbid(&vdev->pdev->dev);
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_unregister_device);
+
+pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
+						pci_channel_state_t state)
+{
+	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
+
+	mutex_lock(&vdev->igate);
+
+	if (vdev->err_trigger)
+		eventfd_signal(vdev->err_trigger, 1);
+
+	mutex_unlock(&vdev->igate);
+
+	return PCI_ERS_RESULT_CAN_RECOVER;
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_aer_err_detected);
+
+int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev,
+				  int nr_virtfn)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	int ret = 0;
+
+	device_lock_assert(&pdev->dev);
+
+	if (nr_virtfn) {
+		mutex_lock(&vfio_pci_sriov_pfs_mutex);
+		/*
+		 * The thread that adds the vdev to the list is the only thread
+		 * that gets to call pci_enable_sriov() and we will only allow
+		 * it to be called once without going through
+		 * pci_disable_sriov()
+		 */
+		if (!list_empty(&vdev->sriov_pfs_item)) {
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+		list_add_tail(&vdev->sriov_pfs_item, &vfio_pci_sriov_pfs);
+		mutex_unlock(&vfio_pci_sriov_pfs_mutex);
+
+		/*
+		 * The PF power state should always be higher than the VF power
+		 * state. The PF can be in low power state either with runtime
+		 * power management (when there is no user) or PCI_PM_CTRL
+		 * register write by the user. If PF is in the low power state,
+		 * then change the power state to D0 first before enabling
+		 * SR-IOV. Also, this function can be called at any time, and
+		 * userspace PCI_PM_CTRL write can race against this code path,
+		 * so protect the same with 'memory_lock'.
+		 */
+		ret = pm_runtime_resume_and_get(&pdev->dev);
+		if (ret)
+			goto out_del;
+
+		down_write(&vdev->memory_lock);
+		vfio_pci_set_power_state(vdev, PCI_D0);
+		ret = pci_enable_sriov(pdev, nr_virtfn);
+		up_write(&vdev->memory_lock);
+		if (ret) {
+			pm_runtime_put(&pdev->dev);
+			goto out_del;
+		}
+		return nr_virtfn;
+	}
+
+	if (pci_num_vf(pdev)) {
+		pci_disable_sriov(pdev);
+		pm_runtime_put(&pdev->dev);
+	}
+
+out_del:
+	mutex_lock(&vfio_pci_sriov_pfs_mutex);
+	list_del_init(&vdev->sriov_pfs_item);
+out_unlock:
+	mutex_unlock(&vfio_pci_sriov_pfs_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure);
+
+const struct pci_error_handlers vfio_pci_core_err_handlers = {
+	.error_detected = vfio_pci_core_aer_err_detected,
+};
+EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers);
+
+static bool vfio_dev_in_groups(struct vfio_device *vdev,
+			       struct vfio_pci_group_info *groups)
+{
+	unsigned int i;
+
+	if (!groups)
+		return false;
+
+	for (i = 0; i < groups->count; i++)
+		if (vfio_file_has_dev(groups->files[i], vdev))
+			return true;
+	return false;
+}
+
+static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data)
+{
+	struct vfio_device_set *dev_set = data;
+
+	return vfio_find_device_in_devset(dev_set, &pdev->dev) ? 0 : -ENODEV;
+}
+
+/*
+ * vfio-core considers a group to be viable and will create a vfio_device even
+ * if some devices are bound to drivers like pci-stub or pcieport. Here we
+ * require all PCI devices to be inside our dev_set since that ensures they stay
+ * put and that every driver controlling the device can co-ordinate with the
+ * device reset.
+ *
+ * Returns the pci_dev to pass to pci_reset_bus() if every PCI device to be
+ * reset is inside the dev_set, and pci_reset_bus() can succeed. NULL otherwise.
+ */
+static struct pci_dev *
+vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set)
+{
+	struct pci_dev *pdev;
+
+	lockdep_assert_held(&dev_set->lock);
+
+	/*
+	 * By definition all PCI devices in the dev_set share the same PCI
+	 * reset, so any pci_dev will have the same outcomes for
+	 * pci_probe_reset_*() and pci_reset_bus().
+	 */
+	pdev = list_first_entry(&dev_set->device_list,
+				struct vfio_pci_core_device,
+				vdev.dev_set_list)->pdev;
+
+	/* pci_reset_bus() is supported */
+	if (pci_probe_reset_slot(pdev->slot) && pci_probe_reset_bus(pdev->bus))
+		return NULL;
+
+	if (vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_is_device_in_set,
+					  dev_set,
+					  !pci_probe_reset_slot(pdev->slot)))
+		return NULL;
+	return pdev;
+}
+
+static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
+{
+	struct vfio_pci_core_device *cur;
+	int ret;
+
+	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
+		ret = pm_runtime_resume_and_get(&cur->pdev->dev);
+		if (ret)
+			goto unwind;
+	}
+
+	return 0;
+
+unwind:
+	list_for_each_entry_continue_reverse(cur, &dev_set->device_list,
+					     vdev.dev_set_list)
+		pm_runtime_put(&cur->pdev->dev);
+
+	return ret;
+}
+
+/*
+ * We need to get memory_lock for each device, but devices can share mmap_lock,
+ * therefore we need to zap and hold the vma_lock for each device, and only then
+ * get each memory_lock.
+ */
+static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
+				      struct vfio_pci_group_info *groups,
+				      struct iommufd_ctx *iommufd_ctx)
+{
+	struct vfio_pci_core_device *cur_mem;
+	struct vfio_pci_core_device *cur_vma;
+	struct vfio_pci_core_device *cur;
+	struct pci_dev *pdev;
+	bool is_mem = true;
+	int ret;
+
+	mutex_lock(&dev_set->lock);
+	cur_mem = list_first_entry(&dev_set->device_list,
+				   struct vfio_pci_core_device,
+				   vdev.dev_set_list);
+
+	pdev = vfio_pci_dev_set_resettable(dev_set);
+	if (!pdev) {
+		ret = -EINVAL;
+		goto err_unlock;
+	}
+
+	/*
+	 * Some of the devices in the dev_set can be in the runtime suspended
+	 * state. Increment the usage count for all the devices in the dev_set
+	 * before reset and decrement the same after reset.
+	 */
+	ret = vfio_pci_dev_set_pm_runtime_get(dev_set);
+	if (ret)
+		goto err_unlock;
+
+	list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
+		bool owned;
+
+		/*
+		 * Test whether all the affected devices can be reset by the
+		 * user.
+		 *
+		 * If called from a group opened device and the user provides
+		 * a set of groups, all the devices in the dev_set should be
+		 * contained by the set of groups provided by the user.
+		 *
+		 * If called from a cdev opened device and the user provides
+		 * a zero-length array, all the devices in the dev_set must
+		 * be bound to the same iommufd_ctx as the input iommufd_ctx.
+		 * If there is any device that has not been bound to any
+		 * iommufd_ctx yet, check if its iommu_group has any device
+		 * bound to the input iommufd_ctx.  Such devices can be
+		 * considered owned by the input iommufd_ctx as the device
+		 * cannot be owned by another iommufd_ctx when its iommu_group
+		 * is owned.
+		 *
+		 * Otherwise, reset is not allowed.
+		 */
+		if (iommufd_ctx) {
+			int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev,
+							    iommufd_ctx);
+
+			owned = (devid > 0 || devid == -ENOENT);
+		} else {
+			owned = vfio_dev_in_groups(&cur_vma->vdev, groups);
+		}
+
+		if (!owned) {
+			ret = -EINVAL;
+			goto err_undo;
+		}
+
+		/*
+		 * Locking multiple devices is prone to deadlock, runaway and
+		 * unwind if we hit contention.
+		 */
+		if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) {
+			ret = -EBUSY;
+			goto err_undo;
+		}
+	}
+	cur_vma = NULL;
+
+	list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) {
+		if (!down_write_trylock(&cur_mem->memory_lock)) {
+			ret = -EBUSY;
+			goto err_undo;
+		}
+		mutex_unlock(&cur_mem->vma_lock);
+	}
+	cur_mem = NULL;
+
+	/*
+	 * The pci_reset_bus() will reset all the devices in the bus.
+	 * The power state can be non-D0 for some of the devices in the bus.
+	 * For these devices, the pci_reset_bus() will internally set
+	 * the power state to D0 without vfio driver involvement.
+	 * For the devices which have NoSoftRst-, the reset function can
+	 * cause the PCI config space reset without restoring the original
+	 * state (saved locally in 'vdev->pm_save').
+	 */
+	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
+		vfio_pci_set_power_state(cur, PCI_D0);
+
+	ret = pci_reset_bus(pdev);
+
+err_undo:
+	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
+		if (cur == cur_mem)
+			is_mem = false;
+		if (cur == cur_vma)
+			break;
+		if (is_mem)
+			up_write(&cur->memory_lock);
+		else
+			mutex_unlock(&cur->vma_lock);
+	}
+
+	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
+		pm_runtime_put(&cur->pdev->dev);
+err_unlock:
+	mutex_unlock(&dev_set->lock);
+	return ret;
+}
+
+static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set)
+{
+	struct vfio_pci_core_device *cur;
+	bool needs_reset = false;
+
+	/* No other VFIO device in the set can be open. */
+	if (vfio_device_set_open_count(dev_set) > 1)
+		return false;
+
+	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
+		needs_reset |= cur->needs_reset;
+	return needs_reset;
+}
+
+/*
+ * If a bus or slot reset is available for the provided dev_set and:
+ *  - All of the devices affected by that bus or slot reset are unused
+ *  - At least one of the affected devices is marked dirty via
+ *    needs_reset (such as by lack of FLR support)
+ * Then attempt to perform that bus or slot reset.
+ */
+static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set)
+{
+	struct vfio_pci_core_device *cur;
+	struct pci_dev *pdev;
+	bool reset_done = false;
+
+	if (!vfio_pci_dev_set_needs_reset(dev_set))
+		return;
+
+	pdev = vfio_pci_dev_set_resettable(dev_set);
+	if (!pdev)
+		return;
+
+	/*
+	 * Some of the devices in the bus can be in the runtime suspended
+	 * state. Increment the usage count for all the devices in the dev_set
+	 * before reset and decrement the same after reset.
+	 */
+	if (!disable_idle_d3 && vfio_pci_dev_set_pm_runtime_get(dev_set))
+		return;
+
+	if (!pci_reset_bus(pdev))
+		reset_done = true;
+
+	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
+		if (reset_done)
+			cur->needs_reset = false;
+
+		if (!disable_idle_d3)
+			pm_runtime_put(&cur->pdev->dev);
+	}
+}
+
+void vfio_pci_core_set_params(bool is_nointxmask, bool is_disable_vga,
+			      bool is_disable_idle_d3)
+{
+	nointxmask = is_nointxmask;
+	disable_vga = is_disable_vga;
+	disable_idle_d3 = is_disable_idle_d3;
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_set_params);
+
+static void vfio_pci_core_cleanup(void)
+{
+	vfio_pci_uninit_perm_bits();
+}
+
+static int __init vfio_pci_core_init(void)
+{
+	/* Allocate shared config space permission data used by all devices */
+	return vfio_pci_init_perm_bits();
+}
+
+module_init(vfio_pci_core_init);
+module_exit(vfio_pci_core_cleanup);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c
new file mode 100644
index 0000000000..dd70e2431b
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_igd.c
@@ -0,0 +1,451 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VFIO PCI Intel Graphics support
+ *
+ * Copyright (C) 2016 Red Hat, Inc.  All rights reserved.
+ *	Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * Register a device specific region through which to provide read-only
+ * access to the Intel IGD opregion.  The register defining the opregion
+ * address is also virtualized to prevent user modification.
+ */
+
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+
+#include "vfio_pci_priv.h"
+
+#define OPREGION_SIGNATURE	"IntelGraphicsMem"
+#define OPREGION_SIZE		(8 * 1024)
+#define OPREGION_PCI_ADDR	0xfc
+
+#define OPREGION_RVDA		0x3ba
+#define OPREGION_RVDS		0x3c2
+#define OPREGION_VERSION	0x16
+
+struct igd_opregion_vbt {
+	void *opregion;
+	void *vbt_ex;
+};
+
+/**
+ * igd_opregion_shift_copy() - Copy OpRegion to user buffer and shift position.
+ * @dst: User buffer ptr to copy to.
+ * @off: Offset to user buffer ptr. Increased by bytes on return.
+ * @src: Source buffer to copy from.
+ * @pos: Increased by bytes on return.
+ * @remaining: Decreased by bytes on return.
+ * @bytes: Bytes to copy and adjust off, pos and remaining.
+ *
+ * Copy OpRegion to offset from specific source ptr and shift the offset.
+ *
+ * Return: 0 on success, -EFAULT otherwise.
+ *
+ */
+static inline unsigned long igd_opregion_shift_copy(char __user *dst,
+						    loff_t *off,
+						    void *src,
+						    loff_t *pos,
+						    size_t *remaining,
+						    size_t bytes)
+{
+	if (copy_to_user(dst + (*off), src, bytes))
+		return -EFAULT;
+
+	*off += bytes;
+	*pos += bytes;
+	*remaining -= bytes;
+
+	return 0;
+}
+
+static ssize_t vfio_pci_igd_rw(struct vfio_pci_core_device *vdev,
+			       char __user *buf, size_t count, loff_t *ppos,
+			       bool iswrite)
+{
+	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+	struct igd_opregion_vbt *opregionvbt = vdev->region[i].data;
+	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK, off = 0;
+	size_t remaining;
+
+	if (pos >= vdev->region[i].size || iswrite)
+		return -EINVAL;
+
+	count = min_t(size_t, count, vdev->region[i].size - pos);
+	remaining = count;
+
+	/* Copy until OpRegion version */
+	if (remaining && pos < OPREGION_VERSION) {
+		size_t bytes = min_t(size_t, remaining, OPREGION_VERSION - pos);
+
+		if (igd_opregion_shift_copy(buf, &off,
+					    opregionvbt->opregion + pos, &pos,
+					    &remaining, bytes))
+			return -EFAULT;
+	}
+
+	/* Copy patched (if necessary) OpRegion version */
+	if (remaining && pos < OPREGION_VERSION + sizeof(__le16)) {
+		size_t bytes = min_t(size_t, remaining,
+				     OPREGION_VERSION + sizeof(__le16) - pos);
+		__le16 version = *(__le16 *)(opregionvbt->opregion +
+					     OPREGION_VERSION);
+
+		/* Patch to 2.1 if OpRegion 2.0 has extended VBT */
+		if (le16_to_cpu(version) == 0x0200 && opregionvbt->vbt_ex)
+			version = cpu_to_le16(0x0201);
+
+		if (igd_opregion_shift_copy(buf, &off,
+					    (u8 *)&version +
+					    (pos - OPREGION_VERSION),
+					    &pos, &remaining, bytes))
+			return -EFAULT;
+	}
+
+	/* Copy until RVDA */
+	if (remaining && pos < OPREGION_RVDA) {
+		size_t bytes = min_t(size_t, remaining, OPREGION_RVDA - pos);
+
+		if (igd_opregion_shift_copy(buf, &off,
+					    opregionvbt->opregion + pos, &pos,
+					    &remaining, bytes))
+			return -EFAULT;
+	}
+
+	/* Copy modified (if necessary) RVDA */
+	if (remaining && pos < OPREGION_RVDA + sizeof(__le64)) {
+		size_t bytes = min_t(size_t, remaining,
+				     OPREGION_RVDA + sizeof(__le64) - pos);
+		__le64 rvda = cpu_to_le64(opregionvbt->vbt_ex ?
+					  OPREGION_SIZE : 0);
+
+		if (igd_opregion_shift_copy(buf, &off,
+					    (u8 *)&rvda + (pos - OPREGION_RVDA),
+					    &pos, &remaining, bytes))
+			return -EFAULT;
+	}
+
+	/* Copy the rest of OpRegion */
+	if (remaining && pos < OPREGION_SIZE) {
+		size_t bytes = min_t(size_t, remaining, OPREGION_SIZE - pos);
+
+		if (igd_opregion_shift_copy(buf, &off,
+					    opregionvbt->opregion + pos, &pos,
+					    &remaining, bytes))
+			return -EFAULT;
+	}
+
+	/* Copy extended VBT if exists */
+	if (remaining &&
+	    copy_to_user(buf + off, opregionvbt->vbt_ex + (pos - OPREGION_SIZE),
+			 remaining))
+		return -EFAULT;
+
+	*ppos += count;
+
+	return count;
+}
+
+static void vfio_pci_igd_release(struct vfio_pci_core_device *vdev,
+				 struct vfio_pci_region *region)
+{
+	struct igd_opregion_vbt *opregionvbt = region->data;
+
+	if (opregionvbt->vbt_ex)
+		memunmap(opregionvbt->vbt_ex);
+
+	memunmap(opregionvbt->opregion);
+	kfree(opregionvbt);
+}
+
+static const struct vfio_pci_regops vfio_pci_igd_regops = {
+	.rw		= vfio_pci_igd_rw,
+	.release	= vfio_pci_igd_release,
+};
+
+static int vfio_pci_igd_opregion_init(struct vfio_pci_core_device *vdev)
+{
+	__le32 *dwordp = (__le32 *)(vdev->vconfig + OPREGION_PCI_ADDR);
+	u32 addr, size;
+	struct igd_opregion_vbt *opregionvbt;
+	int ret;
+	u16 version;
+
+	ret = pci_read_config_dword(vdev->pdev, OPREGION_PCI_ADDR, &addr);
+	if (ret)
+		return ret;
+
+	if (!addr || !(~addr))
+		return -ENODEV;
+
+	opregionvbt = kzalloc(sizeof(*opregionvbt), GFP_KERNEL_ACCOUNT);
+	if (!opregionvbt)
+		return -ENOMEM;
+
+	opregionvbt->opregion = memremap(addr, OPREGION_SIZE, MEMREMAP_WB);
+	if (!opregionvbt->opregion) {
+		kfree(opregionvbt);
+		return -ENOMEM;
+	}
+
+	if (memcmp(opregionvbt->opregion, OPREGION_SIGNATURE, 16)) {
+		memunmap(opregionvbt->opregion);
+		kfree(opregionvbt);
+		return -EINVAL;
+	}
+
+	size = le32_to_cpu(*(__le32 *)(opregionvbt->opregion + 16));
+	if (!size) {
+		memunmap(opregionvbt->opregion);
+		kfree(opregionvbt);
+		return -EINVAL;
+	}
+
+	size *= 1024; /* In KB */
+
+	/*
+	 * OpRegion and VBT:
+	 * When VBT data doesn't exceed 6KB, it's stored in Mailbox #4.
+	 * When VBT data exceeds 6KB size, Mailbox #4 is no longer large enough
+	 * to hold the VBT data, the Extended VBT region is introduced since
+	 * OpRegion 2.0 to hold the VBT data. Since OpRegion 2.0, RVDA/RVDS are
+	 * introduced to define the extended VBT data location and size.
+	 * OpRegion 2.0: RVDA defines the absolute physical address of the
+	 *   extended VBT data, RVDS defines the VBT data size.
+	 * OpRegion 2.1 and above: RVDA defines the relative address of the
+	 *   extended VBT data to OpRegion base, RVDS defines the VBT data size.
+	 *
+	 * Due to the RVDA definition diff in OpRegion VBT (also the only diff
+	 * between 2.0 and 2.1), exposing OpRegion and VBT as a contiguous range
+	 * for OpRegion 2.0 and above makes it possible to support the
+	 * non-contiguous VBT through a single vfio region. From r/w ops view,
+	 * only contiguous VBT after OpRegion with version 2.1+ is exposed,
+	 * regardless the host OpRegion is 2.0 or non-contiguous 2.1+. The r/w
+	 * ops will on-the-fly shift the actural offset into VBT so that data at
+	 * correct position can be returned to the requester.
+	 */
+	version = le16_to_cpu(*(__le16 *)(opregionvbt->opregion +
+					  OPREGION_VERSION));
+	if (version >= 0x0200) {
+		u64 rvda = le64_to_cpu(*(__le64 *)(opregionvbt->opregion +
+						   OPREGION_RVDA));
+		u32 rvds = le32_to_cpu(*(__le32 *)(opregionvbt->opregion +
+						   OPREGION_RVDS));
+
+		/* The extended VBT is valid only when RVDA/RVDS are non-zero */
+		if (rvda && rvds) {
+			size += rvds;
+
+			/*
+			 * Extended VBT location by RVDA:
+			 * Absolute physical addr for 2.0.
+			 * Relative addr to OpRegion header for 2.1+.
+			 */
+			if (version == 0x0200)
+				addr = rvda;
+			else
+				addr += rvda;
+
+			opregionvbt->vbt_ex = memremap(addr, rvds, MEMREMAP_WB);
+			if (!opregionvbt->vbt_ex) {
+				memunmap(opregionvbt->opregion);
+				kfree(opregionvbt);
+				return -ENOMEM;
+			}
+		}
+	}
+
+	ret = vfio_pci_core_register_dev_region(vdev,
+		PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+		VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &vfio_pci_igd_regops,
+		size, VFIO_REGION_INFO_FLAG_READ, opregionvbt);
+	if (ret) {
+		if (opregionvbt->vbt_ex)
+			memunmap(opregionvbt->vbt_ex);
+
+		memunmap(opregionvbt->opregion);
+		kfree(opregionvbt);
+		return ret;
+	}
+
+	/* Fill vconfig with the hw value and virtualize register */
+	*dwordp = cpu_to_le32(addr);
+	memset(vdev->pci_config_map + OPREGION_PCI_ADDR,
+	       PCI_CAP_ID_INVALID_VIRT, 4);
+
+	return ret;
+}
+
+static ssize_t vfio_pci_igd_cfg_rw(struct vfio_pci_core_device *vdev,
+				   char __user *buf, size_t count, loff_t *ppos,
+				   bool iswrite)
+{
+	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+	struct pci_dev *pdev = vdev->region[i].data;
+	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+	size_t size;
+	int ret;
+
+	if (pos >= vdev->region[i].size || iswrite)
+		return -EINVAL;
+
+	size = count = min(count, (size_t)(vdev->region[i].size - pos));
+
+	if ((pos & 1) && size) {
+		u8 val;
+
+		ret = pci_user_read_config_byte(pdev, pos, &val);
+		if (ret)
+			return ret;
+
+		if (copy_to_user(buf + count - size, &val, 1))
+			return -EFAULT;
+
+		pos++;
+		size--;
+	}
+
+	if ((pos & 3) && size > 2) {
+		u16 val;
+		__le16 lval;
+
+		ret = pci_user_read_config_word(pdev, pos, &val);
+		if (ret)
+			return ret;
+
+		lval = cpu_to_le16(val);
+		if (copy_to_user(buf + count - size, &lval, 2))
+			return -EFAULT;
+
+		pos += 2;
+		size -= 2;
+	}
+
+	while (size > 3) {
+		u32 val;
+		__le32 lval;
+
+		ret = pci_user_read_config_dword(pdev, pos, &val);
+		if (ret)
+			return ret;
+
+		lval = cpu_to_le32(val);
+		if (copy_to_user(buf + count - size, &lval, 4))
+			return -EFAULT;
+
+		pos += 4;
+		size -= 4;
+	}
+
+	while (size >= 2) {
+		u16 val;
+		__le16 lval;
+
+		ret = pci_user_read_config_word(pdev, pos, &val);
+		if (ret)
+			return ret;
+
+		lval = cpu_to_le16(val);
+		if (copy_to_user(buf + count - size, &lval, 2))
+			return -EFAULT;
+
+		pos += 2;
+		size -= 2;
+	}
+
+	while (size) {
+		u8 val;
+
+		ret = pci_user_read_config_byte(pdev, pos, &val);
+		if (ret)
+			return ret;
+
+		if (copy_to_user(buf + count - size, &val, 1))
+			return -EFAULT;
+
+		pos++;
+		size--;
+	}
+
+	*ppos += count;
+
+	return count;
+}
+
+static void vfio_pci_igd_cfg_release(struct vfio_pci_core_device *vdev,
+				     struct vfio_pci_region *region)
+{
+	struct pci_dev *pdev = region->data;
+
+	pci_dev_put(pdev);
+}
+
+static const struct vfio_pci_regops vfio_pci_igd_cfg_regops = {
+	.rw		= vfio_pci_igd_cfg_rw,
+	.release	= vfio_pci_igd_cfg_release,
+};
+
+static int vfio_pci_igd_cfg_init(struct vfio_pci_core_device *vdev)
+{
+	struct pci_dev *host_bridge, *lpc_bridge;
+	int ret;
+
+	host_bridge = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(0, 0));
+	if (!host_bridge)
+		return -ENODEV;
+
+	if (host_bridge->vendor != PCI_VENDOR_ID_INTEL ||
+	    host_bridge->class != (PCI_CLASS_BRIDGE_HOST << 8)) {
+		pci_dev_put(host_bridge);
+		return -EINVAL;
+	}
+
+	ret = vfio_pci_core_register_dev_region(vdev,
+		PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+		VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG,
+		&vfio_pci_igd_cfg_regops, host_bridge->cfg_size,
+		VFIO_REGION_INFO_FLAG_READ, host_bridge);
+	if (ret) {
+		pci_dev_put(host_bridge);
+		return ret;
+	}
+
+	lpc_bridge = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(0x1f, 0));
+	if (!lpc_bridge)
+		return -ENODEV;
+
+	if (lpc_bridge->vendor != PCI_VENDOR_ID_INTEL ||
+	    lpc_bridge->class != (PCI_CLASS_BRIDGE_ISA << 8)) {
+		pci_dev_put(lpc_bridge);
+		return -EINVAL;
+	}
+
+	ret = vfio_pci_core_register_dev_region(vdev,
+		PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+		VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG,
+		&vfio_pci_igd_cfg_regops, lpc_bridge->cfg_size,
+		VFIO_REGION_INFO_FLAG_READ, lpc_bridge);
+	if (ret) {
+		pci_dev_put(lpc_bridge);
+		return ret;
+	}
+
+	return 0;
+}
+
+int vfio_pci_igd_init(struct vfio_pci_core_device *vdev)
+{
+	int ret;
+
+	ret = vfio_pci_igd_opregion_init(vdev);
+	if (ret)
+		return ret;
+
+	ret = vfio_pci_igd_cfg_init(vdev);
+	if (ret)
+		return ret;
+
+	return 0;
+}
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
new file mode 100644
index 0000000000..cbb4bcbfbf
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -0,0 +1,834 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VFIO PCI interrupt handling
+ *
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ */
+
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/eventfd.h>
+#include <linux/msi.h>
+#include <linux/pci.h>
+#include <linux/file.h>
+#include <linux/vfio.h>
+#include <linux/wait.h>
+#include <linux/slab.h>
+
+#include "vfio_pci_priv.h"
+
+struct vfio_pci_irq_ctx {
+	struct eventfd_ctx	*trigger;
+	struct virqfd		*unmask;
+	struct virqfd		*mask;
+	char			*name;
+	bool			masked;
+	struct irq_bypass_producer	producer;
+};
+
+static bool irq_is(struct vfio_pci_core_device *vdev, int type)
+{
+	return vdev->irq_type == type;
+}
+
+static bool is_intx(struct vfio_pci_core_device *vdev)
+{
+	return vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX;
+}
+
+static bool is_irq_none(struct vfio_pci_core_device *vdev)
+{
+	return !(vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX ||
+		 vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX ||
+		 vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX);
+}
+
+static
+struct vfio_pci_irq_ctx *vfio_irq_ctx_get(struct vfio_pci_core_device *vdev,
+					  unsigned long index)
+{
+	return xa_load(&vdev->ctx, index);
+}
+
+static void vfio_irq_ctx_free(struct vfio_pci_core_device *vdev,
+			      struct vfio_pci_irq_ctx *ctx, unsigned long index)
+{
+	xa_erase(&vdev->ctx, index);
+	kfree(ctx);
+}
+
+static struct vfio_pci_irq_ctx *
+vfio_irq_ctx_alloc(struct vfio_pci_core_device *vdev, unsigned long index)
+{
+	struct vfio_pci_irq_ctx *ctx;
+	int ret;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT);
+	if (!ctx)
+		return NULL;
+
+	ret = xa_insert(&vdev->ctx, index, ctx, GFP_KERNEL_ACCOUNT);
+	if (ret) {
+		kfree(ctx);
+		return NULL;
+	}
+
+	return ctx;
+}
+
+/*
+ * INTx
+ */
+static void vfio_send_intx_eventfd(void *opaque, void *unused)
+{
+	struct vfio_pci_core_device *vdev = opaque;
+
+	if (likely(is_intx(vdev) && !vdev->virq_disabled)) {
+		struct vfio_pci_irq_ctx *ctx;
+
+		ctx = vfio_irq_ctx_get(vdev, 0);
+		if (WARN_ON_ONCE(!ctx))
+			return;
+		eventfd_signal(ctx->trigger, 1);
+	}
+}
+
+/* Returns true if the INTx vfio_pci_irq_ctx.masked value is changed. */
+bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	struct vfio_pci_irq_ctx *ctx;
+	unsigned long flags;
+	bool masked_changed = false;
+
+	spin_lock_irqsave(&vdev->irqlock, flags);
+
+	/*
+	 * Masking can come from interrupt, ioctl, or config space
+	 * via INTx disable.  The latter means this can get called
+	 * even when not using intx delivery.  In this case, just
+	 * try to have the physical bit follow the virtual bit.
+	 */
+	if (unlikely(!is_intx(vdev))) {
+		if (vdev->pci_2_3)
+			pci_intx(pdev, 0);
+		goto out_unlock;
+	}
+
+	ctx = vfio_irq_ctx_get(vdev, 0);
+	if (WARN_ON_ONCE(!ctx))
+		goto out_unlock;
+
+	if (!ctx->masked) {
+		/*
+		 * Can't use check_and_mask here because we always want to
+		 * mask, not just when something is pending.
+		 */
+		if (vdev->pci_2_3)
+			pci_intx(pdev, 0);
+		else
+			disable_irq_nosync(pdev->irq);
+
+		ctx->masked = true;
+		masked_changed = true;
+	}
+
+out_unlock:
+	spin_unlock_irqrestore(&vdev->irqlock, flags);
+	return masked_changed;
+}
+
+/*
+ * If this is triggered by an eventfd, we can't call eventfd_signal
+ * or else we'll deadlock on the eventfd wait queue.  Return >0 when
+ * a signal is necessary, which can then be handled via a work queue
+ * or directly depending on the caller.
+ */
+static int vfio_pci_intx_unmask_handler(void *opaque, void *unused)
+{
+	struct vfio_pci_core_device *vdev = opaque;
+	struct pci_dev *pdev = vdev->pdev;
+	struct vfio_pci_irq_ctx *ctx;
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&vdev->irqlock, flags);
+
+	/*
+	 * Unmasking comes from ioctl or config, so again, have the
+	 * physical bit follow the virtual even when not using INTx.
+	 */
+	if (unlikely(!is_intx(vdev))) {
+		if (vdev->pci_2_3)
+			pci_intx(pdev, 1);
+		goto out_unlock;
+	}
+
+	ctx = vfio_irq_ctx_get(vdev, 0);
+	if (WARN_ON_ONCE(!ctx))
+		goto out_unlock;
+
+	if (ctx->masked && !vdev->virq_disabled) {
+		/*
+		 * A pending interrupt here would immediately trigger,
+		 * but we can avoid that overhead by just re-sending
+		 * the interrupt to the user.
+		 */
+		if (vdev->pci_2_3) {
+			if (!pci_check_and_unmask_intx(pdev))
+				ret = 1;
+		} else
+			enable_irq(pdev->irq);
+
+		ctx->masked = (ret > 0);
+	}
+
+out_unlock:
+	spin_unlock_irqrestore(&vdev->irqlock, flags);
+
+	return ret;
+}
+
+void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev)
+{
+	if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0)
+		vfio_send_intx_eventfd(vdev, NULL);
+}
+
+static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
+{
+	struct vfio_pci_core_device *vdev = dev_id;
+	struct vfio_pci_irq_ctx *ctx;
+	unsigned long flags;
+	int ret = IRQ_NONE;
+
+	ctx = vfio_irq_ctx_get(vdev, 0);
+	if (WARN_ON_ONCE(!ctx))
+		return ret;
+
+	spin_lock_irqsave(&vdev->irqlock, flags);
+
+	if (!vdev->pci_2_3) {
+		disable_irq_nosync(vdev->pdev->irq);
+		ctx->masked = true;
+		ret = IRQ_HANDLED;
+	} else if (!ctx->masked &&  /* may be shared */
+		   pci_check_and_mask_intx(vdev->pdev)) {
+		ctx->masked = true;
+		ret = IRQ_HANDLED;
+	}
+
+	spin_unlock_irqrestore(&vdev->irqlock, flags);
+
+	if (ret == IRQ_HANDLED)
+		vfio_send_intx_eventfd(vdev, NULL);
+
+	return ret;
+}
+
+static int vfio_intx_enable(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_pci_irq_ctx *ctx;
+
+	if (!is_irq_none(vdev))
+		return -EINVAL;
+
+	if (!vdev->pdev->irq)
+		return -ENODEV;
+
+	ctx = vfio_irq_ctx_alloc(vdev, 0);
+	if (!ctx)
+		return -ENOMEM;
+
+	/*
+	 * If the virtual interrupt is masked, restore it.  Devices
+	 * supporting DisINTx can be masked at the hardware level
+	 * here, non-PCI-2.3 devices will have to wait until the
+	 * interrupt is enabled.
+	 */
+	ctx->masked = vdev->virq_disabled;
+	if (vdev->pci_2_3)
+		pci_intx(vdev->pdev, !ctx->masked);
+
+	vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX;
+
+	return 0;
+}
+
+static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	unsigned long irqflags = IRQF_SHARED;
+	struct vfio_pci_irq_ctx *ctx;
+	struct eventfd_ctx *trigger;
+	unsigned long flags;
+	int ret;
+
+	ctx = vfio_irq_ctx_get(vdev, 0);
+	if (WARN_ON_ONCE(!ctx))
+		return -EINVAL;
+
+	if (ctx->trigger) {
+		free_irq(pdev->irq, vdev);
+		kfree(ctx->name);
+		eventfd_ctx_put(ctx->trigger);
+		ctx->trigger = NULL;
+	}
+
+	if (fd < 0) /* Disable only */
+		return 0;
+
+	ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)",
+			      pci_name(pdev));
+	if (!ctx->name)
+		return -ENOMEM;
+
+	trigger = eventfd_ctx_fdget(fd);
+	if (IS_ERR(trigger)) {
+		kfree(ctx->name);
+		return PTR_ERR(trigger);
+	}
+
+	ctx->trigger = trigger;
+
+	if (!vdev->pci_2_3)
+		irqflags = 0;
+
+	ret = request_irq(pdev->irq, vfio_intx_handler,
+			  irqflags, ctx->name, vdev);
+	if (ret) {
+		ctx->trigger = NULL;
+		kfree(ctx->name);
+		eventfd_ctx_put(trigger);
+		return ret;
+	}
+
+	/*
+	 * INTx disable will stick across the new irq setup,
+	 * disable_irq won't.
+	 */
+	spin_lock_irqsave(&vdev->irqlock, flags);
+	if (!vdev->pci_2_3 && ctx->masked)
+		disable_irq_nosync(pdev->irq);
+	spin_unlock_irqrestore(&vdev->irqlock, flags);
+
+	return 0;
+}
+
+static void vfio_intx_disable(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_pci_irq_ctx *ctx;
+
+	ctx = vfio_irq_ctx_get(vdev, 0);
+	WARN_ON_ONCE(!ctx);
+	if (ctx) {
+		vfio_virqfd_disable(&ctx->unmask);
+		vfio_virqfd_disable(&ctx->mask);
+	}
+	vfio_intx_set_signal(vdev, -1);
+	vdev->irq_type = VFIO_PCI_NUM_IRQS;
+	vfio_irq_ctx_free(vdev, ctx, 0);
+}
+
+/*
+ * MSI/MSI-X
+ */
+static irqreturn_t vfio_msihandler(int irq, void *arg)
+{
+	struct eventfd_ctx *trigger = arg;
+
+	eventfd_signal(trigger, 1);
+	return IRQ_HANDLED;
+}
+
+static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec, bool msix)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	unsigned int flag = msix ? PCI_IRQ_MSIX : PCI_IRQ_MSI;
+	int ret;
+	u16 cmd;
+
+	if (!is_irq_none(vdev))
+		return -EINVAL;
+
+	/* return the number of supported vectors if we can't get all: */
+	cmd = vfio_pci_memory_lock_and_enable(vdev);
+	ret = pci_alloc_irq_vectors(pdev, 1, nvec, flag);
+	if (ret < nvec) {
+		if (ret > 0)
+			pci_free_irq_vectors(pdev);
+		vfio_pci_memory_unlock_and_restore(vdev, cmd);
+		return ret;
+	}
+	vfio_pci_memory_unlock_and_restore(vdev, cmd);
+
+	vdev->irq_type = msix ? VFIO_PCI_MSIX_IRQ_INDEX :
+				VFIO_PCI_MSI_IRQ_INDEX;
+
+	if (!msix) {
+		/*
+		 * Compute the virtual hardware field for max msi vectors -
+		 * it is the log base 2 of the number of vectors.
+		 */
+		vdev->msi_qmax = fls(nvec * 2 - 1) - 1;
+	}
+
+	return 0;
+}
+
+/*
+ * vfio_msi_alloc_irq() returns the Linux IRQ number of an MSI or MSI-X device
+ * interrupt vector. If a Linux IRQ number is not available then a new
+ * interrupt is allocated if dynamic MSI-X is supported.
+ *
+ * Where is vfio_msi_free_irq()? Allocated interrupts are maintained,
+ * essentially forming a cache that subsequent allocations can draw from.
+ * Interrupts are freed using pci_free_irq_vectors() when MSI/MSI-X is
+ * disabled.
+ */
+static int vfio_msi_alloc_irq(struct vfio_pci_core_device *vdev,
+			      unsigned int vector, bool msix)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	struct msi_map map;
+	int irq;
+	u16 cmd;
+
+	irq = pci_irq_vector(pdev, vector);
+	if (WARN_ON_ONCE(irq == 0))
+		return -EINVAL;
+	if (irq > 0 || !msix || !vdev->has_dyn_msix)
+		return irq;
+
+	cmd = vfio_pci_memory_lock_and_enable(vdev);
+	map = pci_msix_alloc_irq_at(pdev, vector, NULL);
+	vfio_pci_memory_unlock_and_restore(vdev, cmd);
+
+	return map.index < 0 ? map.index : map.virq;
+}
+
+static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev,
+				      unsigned int vector, int fd, bool msix)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	struct vfio_pci_irq_ctx *ctx;
+	struct eventfd_ctx *trigger;
+	int irq = -EINVAL, ret;
+	u16 cmd;
+
+	ctx = vfio_irq_ctx_get(vdev, vector);
+
+	if (ctx) {
+		irq_bypass_unregister_producer(&ctx->producer);
+		irq = pci_irq_vector(pdev, vector);
+		cmd = vfio_pci_memory_lock_and_enable(vdev);
+		free_irq(irq, ctx->trigger);
+		vfio_pci_memory_unlock_and_restore(vdev, cmd);
+		/* Interrupt stays allocated, will be freed at MSI-X disable. */
+		kfree(ctx->name);
+		eventfd_ctx_put(ctx->trigger);
+		vfio_irq_ctx_free(vdev, ctx, vector);
+	}
+
+	if (fd < 0)
+		return 0;
+
+	if (irq == -EINVAL) {
+		/* Interrupt stays allocated, will be freed at MSI-X disable. */
+		irq = vfio_msi_alloc_irq(vdev, vector, msix);
+		if (irq < 0)
+			return irq;
+	}
+
+	ctx = vfio_irq_ctx_alloc(vdev, vector);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-msi%s[%d](%s)",
+			      msix ? "x" : "", vector, pci_name(pdev));
+	if (!ctx->name) {
+		ret = -ENOMEM;
+		goto out_free_ctx;
+	}
+
+	trigger = eventfd_ctx_fdget(fd);
+	if (IS_ERR(trigger)) {
+		ret = PTR_ERR(trigger);
+		goto out_free_name;
+	}
+
+	/*
+	 * If the vector was previously allocated, refresh the on-device
+	 * message data before enabling in case it had been cleared or
+	 * corrupted (e.g. due to backdoor resets) since writing.
+	 */
+	cmd = vfio_pci_memory_lock_and_enable(vdev);
+	if (msix) {
+		struct msi_msg msg;
+
+		get_cached_msi_msg(irq, &msg);
+		pci_write_msi_msg(irq, &msg);
+	}
+
+	ret = request_irq(irq, vfio_msihandler, 0, ctx->name, trigger);
+	vfio_pci_memory_unlock_and_restore(vdev, cmd);
+	if (ret)
+		goto out_put_eventfd_ctx;
+
+	ctx->producer.token = trigger;
+	ctx->producer.irq = irq;
+	ret = irq_bypass_register_producer(&ctx->producer);
+	if (unlikely(ret)) {
+		dev_info(&pdev->dev,
+		"irq bypass producer (token %p) registration fails: %d\n",
+		ctx->producer.token, ret);
+
+		ctx->producer.token = NULL;
+	}
+	ctx->trigger = trigger;
+
+	return 0;
+
+out_put_eventfd_ctx:
+	eventfd_ctx_put(trigger);
+out_free_name:
+	kfree(ctx->name);
+out_free_ctx:
+	vfio_irq_ctx_free(vdev, ctx, vector);
+	return ret;
+}
+
+static int vfio_msi_set_block(struct vfio_pci_core_device *vdev, unsigned start,
+			      unsigned count, int32_t *fds, bool msix)
+{
+	unsigned int i, j;
+	int ret = 0;
+
+	for (i = 0, j = start; i < count && !ret; i++, j++) {
+		int fd = fds ? fds[i] : -1;
+		ret = vfio_msi_set_vector_signal(vdev, j, fd, msix);
+	}
+
+	if (ret) {
+		for (i = start; i < j; i++)
+			vfio_msi_set_vector_signal(vdev, i, -1, msix);
+	}
+
+	return ret;
+}
+
+static void vfio_msi_disable(struct vfio_pci_core_device *vdev, bool msix)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	struct vfio_pci_irq_ctx *ctx;
+	unsigned long i;
+	u16 cmd;
+
+	xa_for_each(&vdev->ctx, i, ctx) {
+		vfio_virqfd_disable(&ctx->unmask);
+		vfio_virqfd_disable(&ctx->mask);
+		vfio_msi_set_vector_signal(vdev, i, -1, msix);
+	}
+
+	cmd = vfio_pci_memory_lock_and_enable(vdev);
+	pci_free_irq_vectors(pdev);
+	vfio_pci_memory_unlock_and_restore(vdev, cmd);
+
+	/*
+	 * Both disable paths above use pci_intx_for_msi() to clear DisINTx
+	 * via their shutdown paths.  Restore for NoINTx devices.
+	 */
+	if (vdev->nointx)
+		pci_intx(pdev, 0);
+
+	vdev->irq_type = VFIO_PCI_NUM_IRQS;
+}
+
+/*
+ * IOCTL support
+ */
+static int vfio_pci_set_intx_unmask(struct vfio_pci_core_device *vdev,
+				    unsigned index, unsigned start,
+				    unsigned count, uint32_t flags, void *data)
+{
+	if (!is_intx(vdev) || start != 0 || count != 1)
+		return -EINVAL;
+
+	if (flags & VFIO_IRQ_SET_DATA_NONE) {
+		vfio_pci_intx_unmask(vdev);
+	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+		uint8_t unmask = *(uint8_t *)data;
+		if (unmask)
+			vfio_pci_intx_unmask(vdev);
+	} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+		struct vfio_pci_irq_ctx *ctx = vfio_irq_ctx_get(vdev, 0);
+		int32_t fd = *(int32_t *)data;
+
+		if (WARN_ON_ONCE(!ctx))
+			return -EINVAL;
+		if (fd >= 0)
+			return vfio_virqfd_enable((void *) vdev,
+						  vfio_pci_intx_unmask_handler,
+						  vfio_send_intx_eventfd, NULL,
+						  &ctx->unmask, fd);
+
+		vfio_virqfd_disable(&ctx->unmask);
+	}
+
+	return 0;
+}
+
+static int vfio_pci_set_intx_mask(struct vfio_pci_core_device *vdev,
+				  unsigned index, unsigned start,
+				  unsigned count, uint32_t flags, void *data)
+{
+	if (!is_intx(vdev) || start != 0 || count != 1)
+		return -EINVAL;
+
+	if (flags & VFIO_IRQ_SET_DATA_NONE) {
+		vfio_pci_intx_mask(vdev);
+	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+		uint8_t mask = *(uint8_t *)data;
+		if (mask)
+			vfio_pci_intx_mask(vdev);
+	} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+		return -ENOTTY; /* XXX implement me */
+	}
+
+	return 0;
+}
+
+static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
+				     unsigned index, unsigned start,
+				     unsigned count, uint32_t flags, void *data)
+{
+	if (is_intx(vdev) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
+		vfio_intx_disable(vdev);
+		return 0;
+	}
+
+	if (!(is_intx(vdev) || is_irq_none(vdev)) || start != 0 || count != 1)
+		return -EINVAL;
+
+	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+		int32_t fd = *(int32_t *)data;
+		int ret;
+
+		if (is_intx(vdev))
+			return vfio_intx_set_signal(vdev, fd);
+
+		ret = vfio_intx_enable(vdev);
+		if (ret)
+			return ret;
+
+		ret = vfio_intx_set_signal(vdev, fd);
+		if (ret)
+			vfio_intx_disable(vdev);
+
+		return ret;
+	}
+
+	if (!is_intx(vdev))
+		return -EINVAL;
+
+	if (flags & VFIO_IRQ_SET_DATA_NONE) {
+		vfio_send_intx_eventfd(vdev, NULL);
+	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+		uint8_t trigger = *(uint8_t *)data;
+		if (trigger)
+			vfio_send_intx_eventfd(vdev, NULL);
+	}
+	return 0;
+}
+
+static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
+				    unsigned index, unsigned start,
+				    unsigned count, uint32_t flags, void *data)
+{
+	struct vfio_pci_irq_ctx *ctx;
+	unsigned int i;
+	bool msix = (index == VFIO_PCI_MSIX_IRQ_INDEX) ? true : false;
+
+	if (irq_is(vdev, index) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
+		vfio_msi_disable(vdev, msix);
+		return 0;
+	}
+
+	if (!(irq_is(vdev, index) || is_irq_none(vdev)))
+		return -EINVAL;
+
+	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+		int32_t *fds = data;
+		int ret;
+
+		if (vdev->irq_type == index)
+			return vfio_msi_set_block(vdev, start, count,
+						  fds, msix);
+
+		ret = vfio_msi_enable(vdev, start + count, msix);
+		if (ret)
+			return ret;
+
+		ret = vfio_msi_set_block(vdev, start, count, fds, msix);
+		if (ret)
+			vfio_msi_disable(vdev, msix);
+
+		return ret;
+	}
+
+	if (!irq_is(vdev, index))
+		return -EINVAL;
+
+	for (i = start; i < start + count; i++) {
+		ctx = vfio_irq_ctx_get(vdev, i);
+		if (!ctx)
+			continue;
+		if (flags & VFIO_IRQ_SET_DATA_NONE) {
+			eventfd_signal(ctx->trigger, 1);
+		} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+			uint8_t *bools = data;
+			if (bools[i - start])
+				eventfd_signal(ctx->trigger, 1);
+		}
+	}
+	return 0;
+}
+
+static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
+					   unsigned int count, uint32_t flags,
+					   void *data)
+{
+	/* DATA_NONE/DATA_BOOL enables loopback testing */
+	if (flags & VFIO_IRQ_SET_DATA_NONE) {
+		if (*ctx) {
+			if (count) {
+				eventfd_signal(*ctx, 1);
+			} else {
+				eventfd_ctx_put(*ctx);
+				*ctx = NULL;
+			}
+			return 0;
+		}
+	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+		uint8_t trigger;
+
+		if (!count)
+			return -EINVAL;
+
+		trigger = *(uint8_t *)data;
+		if (trigger && *ctx)
+			eventfd_signal(*ctx, 1);
+
+		return 0;
+	} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+		int32_t fd;
+
+		if (!count)
+			return -EINVAL;
+
+		fd = *(int32_t *)data;
+		if (fd == -1) {
+			if (*ctx)
+				eventfd_ctx_put(*ctx);
+			*ctx = NULL;
+		} else if (fd >= 0) {
+			struct eventfd_ctx *efdctx;
+
+			efdctx = eventfd_ctx_fdget(fd);
+			if (IS_ERR(efdctx))
+				return PTR_ERR(efdctx);
+
+			if (*ctx)
+				eventfd_ctx_put(*ctx);
+
+			*ctx = efdctx;
+		}
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static int vfio_pci_set_err_trigger(struct vfio_pci_core_device *vdev,
+				    unsigned index, unsigned start,
+				    unsigned count, uint32_t flags, void *data)
+{
+	if (index != VFIO_PCI_ERR_IRQ_INDEX || start != 0 || count > 1)
+		return -EINVAL;
+
+	return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger,
+					       count, flags, data);
+}
+
+static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev,
+				    unsigned index, unsigned start,
+				    unsigned count, uint32_t flags, void *data)
+{
+	if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count > 1)
+		return -EINVAL;
+
+	return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger,
+					       count, flags, data);
+}
+
+int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags,
+			    unsigned index, unsigned start, unsigned count,
+			    void *data)
+{
+	int (*func)(struct vfio_pci_core_device *vdev, unsigned index,
+		    unsigned start, unsigned count, uint32_t flags,
+		    void *data) = NULL;
+
+	switch (index) {
+	case VFIO_PCI_INTX_IRQ_INDEX:
+		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+		case VFIO_IRQ_SET_ACTION_MASK:
+			func = vfio_pci_set_intx_mask;
+			break;
+		case VFIO_IRQ_SET_ACTION_UNMASK:
+			func = vfio_pci_set_intx_unmask;
+			break;
+		case VFIO_IRQ_SET_ACTION_TRIGGER:
+			func = vfio_pci_set_intx_trigger;
+			break;
+		}
+		break;
+	case VFIO_PCI_MSI_IRQ_INDEX:
+	case VFIO_PCI_MSIX_IRQ_INDEX:
+		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+		case VFIO_IRQ_SET_ACTION_MASK:
+		case VFIO_IRQ_SET_ACTION_UNMASK:
+			/* XXX Need masking support exported */
+			break;
+		case VFIO_IRQ_SET_ACTION_TRIGGER:
+			func = vfio_pci_set_msi_trigger;
+			break;
+		}
+		break;
+	case VFIO_PCI_ERR_IRQ_INDEX:
+		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+		case VFIO_IRQ_SET_ACTION_TRIGGER:
+			if (pci_is_pcie(vdev->pdev))
+				func = vfio_pci_set_err_trigger;
+			break;
+		}
+		break;
+	case VFIO_PCI_REQ_IRQ_INDEX:
+		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+		case VFIO_IRQ_SET_ACTION_TRIGGER:
+			func = vfio_pci_set_req_trigger;
+			break;
+		}
+		break;
+	}
+
+	if (!func)
+		return -ENOTTY;
+
+	return func(vdev, index, start, count, flags, data);
+}
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
new file mode 100644
index 0000000000..5e4fa69aee
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef VFIO_PCI_PRIV_H
+#define VFIO_PCI_PRIV_H
+
+#include <linux/vfio_pci_core.h>
+
+/* Special capability IDs predefined access */
+#define PCI_CAP_ID_INVALID		0xFF	/* default raw access */
+#define PCI_CAP_ID_INVALID_VIRT		0xFE	/* default virt access */
+
+/* Cap maximum number of ioeventfds per device (arbitrary) */
+#define VFIO_PCI_IOEVENTFD_MAX		1000
+
+struct vfio_pci_ioeventfd {
+	struct list_head	next;
+	struct vfio_pci_core_device	*vdev;
+	struct virqfd		*virqfd;
+	void __iomem		*addr;
+	uint64_t		data;
+	loff_t			pos;
+	int			bar;
+	int			count;
+	bool			test_mem;
+};
+
+bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev);
+void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev);
+
+int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags,
+			    unsigned index, unsigned start, unsigned count,
+			    void *data);
+
+ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, char __user *buf,
+			   size_t count, loff_t *ppos, bool iswrite);
+
+ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
+			size_t count, loff_t *ppos, bool iswrite);
+
+#ifdef CONFIG_VFIO_PCI_VGA
+ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf,
+			size_t count, loff_t *ppos, bool iswrite);
+#else
+static inline ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev,
+				      char __user *buf, size_t count,
+				      loff_t *ppos, bool iswrite)
+{
+	return -EINVAL;
+}
+#endif
+
+int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset,
+		       uint64_t data, int count, int fd);
+
+int vfio_pci_init_perm_bits(void);
+void vfio_pci_uninit_perm_bits(void);
+
+int vfio_config_init(struct vfio_pci_core_device *vdev);
+void vfio_config_free(struct vfio_pci_core_device *vdev);
+
+int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev,
+			     pci_power_t state);
+
+bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev);
+void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev);
+u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev);
+void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev,
+					u16 cmd);
+
+#ifdef CONFIG_VFIO_PCI_IGD
+int vfio_pci_igd_init(struct vfio_pci_core_device *vdev);
+#else
+static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev)
+{
+	return -ENODEV;
+}
+#endif
+
+#ifdef CONFIG_VFIO_PCI_ZDEV_KVM
+int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
+				struct vfio_info_cap *caps);
+int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev);
+void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev);
+#else
+static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
+					      struct vfio_info_cap *caps)
+{
+	return -ENODEV;
+}
+
+static inline int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev)
+{
+	return 0;
+}
+
+static inline void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev)
+{}
+#endif
+
+static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
+{
+	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
+}
+
+#endif
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
new file mode 100644
index 0000000000..e27de61ac9
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -0,0 +1,502 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VFIO PCI I/O Port & MMIO access
+ *
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ */
+
+#include <linux/fs.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/vfio.h>
+#include <linux/vgaarb.h>
+
+#include "vfio_pci_priv.h"
+
+#ifdef __LITTLE_ENDIAN
+#define vfio_ioread64	ioread64
+#define vfio_iowrite64	iowrite64
+#define vfio_ioread32	ioread32
+#define vfio_iowrite32	iowrite32
+#define vfio_ioread16	ioread16
+#define vfio_iowrite16	iowrite16
+#else
+#define vfio_ioread64	ioread64be
+#define vfio_iowrite64	iowrite64be
+#define vfio_ioread32	ioread32be
+#define vfio_iowrite32	iowrite32be
+#define vfio_ioread16	ioread16be
+#define vfio_iowrite16	iowrite16be
+#endif
+#define vfio_ioread8	ioread8
+#define vfio_iowrite8	iowrite8
+
+#define VFIO_IOWRITE(size) \
+static int vfio_pci_iowrite##size(struct vfio_pci_core_device *vdev,		\
+			bool test_mem, u##size val, void __iomem *io)	\
+{									\
+	if (test_mem) {							\
+		down_read(&vdev->memory_lock);				\
+		if (!__vfio_pci_memory_enabled(vdev)) {			\
+			up_read(&vdev->memory_lock);			\
+			return -EIO;					\
+		}							\
+	}								\
+									\
+	vfio_iowrite##size(val, io);					\
+									\
+	if (test_mem)							\
+		up_read(&vdev->memory_lock);				\
+									\
+	return 0;							\
+}
+
+VFIO_IOWRITE(8)
+VFIO_IOWRITE(16)
+VFIO_IOWRITE(32)
+#ifdef iowrite64
+VFIO_IOWRITE(64)
+#endif
+
+#define VFIO_IOREAD(size) \
+static int vfio_pci_ioread##size(struct vfio_pci_core_device *vdev,		\
+			bool test_mem, u##size *val, void __iomem *io)	\
+{									\
+	if (test_mem) {							\
+		down_read(&vdev->memory_lock);				\
+		if (!__vfio_pci_memory_enabled(vdev)) {			\
+			up_read(&vdev->memory_lock);			\
+			return -EIO;					\
+		}							\
+	}								\
+									\
+	*val = vfio_ioread##size(io);					\
+									\
+	if (test_mem)							\
+		up_read(&vdev->memory_lock);				\
+									\
+	return 0;							\
+}
+
+VFIO_IOREAD(8)
+VFIO_IOREAD(16)
+VFIO_IOREAD(32)
+
+/*
+ * Read or write from an __iomem region (MMIO or I/O port) with an excluded
+ * range which is inaccessible.  The excluded range drops writes and fills
+ * reads with -1.  This is intended for handling MSI-X vector tables and
+ * leftover space for ROM BARs.
+ */
+static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
+			void __iomem *io, char __user *buf,
+			loff_t off, size_t count, size_t x_start,
+			size_t x_end, bool iswrite)
+{
+	ssize_t done = 0;
+	int ret;
+
+	while (count) {
+		size_t fillable, filled;
+
+		if (off < x_start)
+			fillable = min(count, (size_t)(x_start - off));
+		else if (off >= x_end)
+			fillable = count;
+		else
+			fillable = 0;
+
+		if (fillable >= 4 && !(off % 4)) {
+			u32 val;
+
+			if (iswrite) {
+				if (copy_from_user(&val, buf, 4))
+					return -EFAULT;
+
+				ret = vfio_pci_iowrite32(vdev, test_mem,
+							 val, io + off);
+				if (ret)
+					return ret;
+			} else {
+				ret = vfio_pci_ioread32(vdev, test_mem,
+							&val, io + off);
+				if (ret)
+					return ret;
+
+				if (copy_to_user(buf, &val, 4))
+					return -EFAULT;
+			}
+
+			filled = 4;
+		} else if (fillable >= 2 && !(off % 2)) {
+			u16 val;
+
+			if (iswrite) {
+				if (copy_from_user(&val, buf, 2))
+					return -EFAULT;
+
+				ret = vfio_pci_iowrite16(vdev, test_mem,
+							 val, io + off);
+				if (ret)
+					return ret;
+			} else {
+				ret = vfio_pci_ioread16(vdev, test_mem,
+							&val, io + off);
+				if (ret)
+					return ret;
+
+				if (copy_to_user(buf, &val, 2))
+					return -EFAULT;
+			}
+
+			filled = 2;
+		} else if (fillable) {
+			u8 val;
+
+			if (iswrite) {
+				if (copy_from_user(&val, buf, 1))
+					return -EFAULT;
+
+				ret = vfio_pci_iowrite8(vdev, test_mem,
+							val, io + off);
+				if (ret)
+					return ret;
+			} else {
+				ret = vfio_pci_ioread8(vdev, test_mem,
+						       &val, io + off);
+				if (ret)
+					return ret;
+
+				if (copy_to_user(buf, &val, 1))
+					return -EFAULT;
+			}
+
+			filled = 1;
+		} else {
+			/* Fill reads with -1, drop writes */
+			filled = min(count, (size_t)(x_end - off));
+			if (!iswrite) {
+				u8 val = 0xFF;
+				size_t i;
+
+				for (i = 0; i < filled; i++)
+					if (copy_to_user(buf + i, &val, 1))
+						return -EFAULT;
+			}
+		}
+
+		count -= filled;
+		done += filled;
+		off += filled;
+		buf += filled;
+	}
+
+	return done;
+}
+
+static int vfio_pci_setup_barmap(struct vfio_pci_core_device *vdev, int bar)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	int ret;
+	void __iomem *io;
+
+	if (vdev->barmap[bar])
+		return 0;
+
+	ret = pci_request_selected_regions(pdev, 1 << bar, "vfio");
+	if (ret)
+		return ret;
+
+	io = pci_iomap(pdev, bar, 0);
+	if (!io) {
+		pci_release_selected_regions(pdev, 1 << bar);
+		return -ENOMEM;
+	}
+
+	vdev->barmap[bar] = io;
+
+	return 0;
+}
+
+ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
+			size_t count, loff_t *ppos, bool iswrite)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+	int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+	size_t x_start = 0, x_end = 0;
+	resource_size_t end;
+	void __iomem *io;
+	struct resource *res = &vdev->pdev->resource[bar];
+	ssize_t done;
+
+	if (pci_resource_start(pdev, bar))
+		end = pci_resource_len(pdev, bar);
+	else if (bar == PCI_ROM_RESOURCE &&
+		 pdev->resource[bar].flags & IORESOURCE_ROM_SHADOW)
+		end = 0x20000;
+	else
+		return -EINVAL;
+
+	if (pos >= end)
+		return -EINVAL;
+
+	count = min(count, (size_t)(end - pos));
+
+	if (bar == PCI_ROM_RESOURCE) {
+		/*
+		 * The ROM can fill less space than the BAR, so we start the
+		 * excluded range at the end of the actual ROM.  This makes
+		 * filling large ROM BARs much faster.
+		 */
+		io = pci_map_rom(pdev, &x_start);
+		if (!io) {
+			done = -ENOMEM;
+			goto out;
+		}
+		x_end = end;
+	} else {
+		int ret = vfio_pci_setup_barmap(vdev, bar);
+		if (ret) {
+			done = ret;
+			goto out;
+		}
+
+		io = vdev->barmap[bar];
+	}
+
+	if (bar == vdev->msix_bar) {
+		x_start = vdev->msix_offset;
+		x_end = vdev->msix_offset + vdev->msix_size;
+	}
+
+	done = do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos,
+			count, x_start, x_end, iswrite);
+
+	if (done >= 0)
+		*ppos += done;
+
+	if (bar == PCI_ROM_RESOURCE)
+		pci_unmap_rom(pdev, io);
+out:
+	return done;
+}
+
+#ifdef CONFIG_VFIO_PCI_VGA
+ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf,
+			       size_t count, loff_t *ppos, bool iswrite)
+{
+	int ret;
+	loff_t off, pos = *ppos & VFIO_PCI_OFFSET_MASK;
+	void __iomem *iomem = NULL;
+	unsigned int rsrc;
+	bool is_ioport;
+	ssize_t done;
+
+	if (!vdev->has_vga)
+		return -EINVAL;
+
+	if (pos > 0xbfffful)
+		return -EINVAL;
+
+	switch ((u32)pos) {
+	case 0xa0000 ... 0xbffff:
+		count = min(count, (size_t)(0xc0000 - pos));
+		iomem = ioremap(0xa0000, 0xbffff - 0xa0000 + 1);
+		off = pos - 0xa0000;
+		rsrc = VGA_RSRC_LEGACY_MEM;
+		is_ioport = false;
+		break;
+	case 0x3b0 ... 0x3bb:
+		count = min(count, (size_t)(0x3bc - pos));
+		iomem = ioport_map(0x3b0, 0x3bb - 0x3b0 + 1);
+		off = pos - 0x3b0;
+		rsrc = VGA_RSRC_LEGACY_IO;
+		is_ioport = true;
+		break;
+	case 0x3c0 ... 0x3df:
+		count = min(count, (size_t)(0x3e0 - pos));
+		iomem = ioport_map(0x3c0, 0x3df - 0x3c0 + 1);
+		off = pos - 0x3c0;
+		rsrc = VGA_RSRC_LEGACY_IO;
+		is_ioport = true;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (!iomem)
+		return -ENOMEM;
+
+	ret = vga_get_interruptible(vdev->pdev, rsrc);
+	if (ret) {
+		is_ioport ? ioport_unmap(iomem) : iounmap(iomem);
+		return ret;
+	}
+
+	/*
+	 * VGA MMIO is a legacy, non-BAR resource that hopefully allows
+	 * probing, so we don't currently worry about access in relation
+	 * to the memory enable bit in the command register.
+	 */
+	done = do_io_rw(vdev, false, iomem, buf, off, count, 0, 0, iswrite);
+
+	vga_put(vdev->pdev, rsrc);
+
+	is_ioport ? ioport_unmap(iomem) : iounmap(iomem);
+
+	if (done >= 0)
+		*ppos += done;
+
+	return done;
+}
+#endif
+
+static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd,
+					bool test_mem)
+{
+	switch (ioeventfd->count) {
+	case 1:
+		vfio_pci_iowrite8(ioeventfd->vdev, test_mem,
+				  ioeventfd->data, ioeventfd->addr);
+		break;
+	case 2:
+		vfio_pci_iowrite16(ioeventfd->vdev, test_mem,
+				   ioeventfd->data, ioeventfd->addr);
+		break;
+	case 4:
+		vfio_pci_iowrite32(ioeventfd->vdev, test_mem,
+				   ioeventfd->data, ioeventfd->addr);
+		break;
+#ifdef iowrite64
+	case 8:
+		vfio_pci_iowrite64(ioeventfd->vdev, test_mem,
+				   ioeventfd->data, ioeventfd->addr);
+		break;
+#endif
+	}
+}
+
+static int vfio_pci_ioeventfd_handler(void *opaque, void *unused)
+{
+	struct vfio_pci_ioeventfd *ioeventfd = opaque;
+	struct vfio_pci_core_device *vdev = ioeventfd->vdev;
+
+	if (ioeventfd->test_mem) {
+		if (!down_read_trylock(&vdev->memory_lock))
+			return 1; /* Lock contended, use thread */
+		if (!__vfio_pci_memory_enabled(vdev)) {
+			up_read(&vdev->memory_lock);
+			return 0;
+		}
+	}
+
+	vfio_pci_ioeventfd_do_write(ioeventfd, false);
+
+	if (ioeventfd->test_mem)
+		up_read(&vdev->memory_lock);
+
+	return 0;
+}
+
+static void vfio_pci_ioeventfd_thread(void *opaque, void *unused)
+{
+	struct vfio_pci_ioeventfd *ioeventfd = opaque;
+
+	vfio_pci_ioeventfd_do_write(ioeventfd, ioeventfd->test_mem);
+}
+
+int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset,
+		       uint64_t data, int count, int fd)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	loff_t pos = offset & VFIO_PCI_OFFSET_MASK;
+	int ret, bar = VFIO_PCI_OFFSET_TO_INDEX(offset);
+	struct vfio_pci_ioeventfd *ioeventfd;
+
+	/* Only support ioeventfds into BARs */
+	if (bar > VFIO_PCI_BAR5_REGION_INDEX)
+		return -EINVAL;
+
+	if (pos + count > pci_resource_len(pdev, bar))
+		return -EINVAL;
+
+	/* Disallow ioeventfds working around MSI-X table writes */
+	if (bar == vdev->msix_bar &&
+	    !(pos + count <= vdev->msix_offset ||
+	      pos >= vdev->msix_offset + vdev->msix_size))
+		return -EINVAL;
+
+#ifndef iowrite64
+	if (count == 8)
+		return -EINVAL;
+#endif
+
+	ret = vfio_pci_setup_barmap(vdev, bar);
+	if (ret)
+		return ret;
+
+	mutex_lock(&vdev->ioeventfds_lock);
+
+	list_for_each_entry(ioeventfd, &vdev->ioeventfds_list, next) {
+		if (ioeventfd->pos == pos && ioeventfd->bar == bar &&
+		    ioeventfd->data == data && ioeventfd->count == count) {
+			if (fd == -1) {
+				vfio_virqfd_disable(&ioeventfd->virqfd);
+				list_del(&ioeventfd->next);
+				vdev->ioeventfds_nr--;
+				kfree(ioeventfd);
+				ret = 0;
+			} else
+				ret = -EEXIST;
+
+			goto out_unlock;
+		}
+	}
+
+	if (fd < 0) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	if (vdev->ioeventfds_nr >= VFIO_PCI_IOEVENTFD_MAX) {
+		ret = -ENOSPC;
+		goto out_unlock;
+	}
+
+	ioeventfd = kzalloc(sizeof(*ioeventfd), GFP_KERNEL_ACCOUNT);
+	if (!ioeventfd) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	ioeventfd->vdev = vdev;
+	ioeventfd->addr = vdev->barmap[bar] + pos;
+	ioeventfd->data = data;
+	ioeventfd->pos = pos;
+	ioeventfd->bar = bar;
+	ioeventfd->count = count;
+	ioeventfd->test_mem = vdev->pdev->resource[bar].flags & IORESOURCE_MEM;
+
+	ret = vfio_virqfd_enable(ioeventfd, vfio_pci_ioeventfd_handler,
+				 vfio_pci_ioeventfd_thread, NULL,
+				 &ioeventfd->virqfd, fd);
+	if (ret) {
+		kfree(ioeventfd);
+		goto out_unlock;
+	}
+
+	list_add(&ioeventfd->next, &vdev->ioeventfds_list);
+	vdev->ioeventfds_nr++;
+
+out_unlock:
+	mutex_unlock(&vdev->ioeventfds_lock);
+
+	return ret;
+}
diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c
new file mode 100644
index 0000000000..0990fdb146
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_zdev.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VFIO ZPCI devices support
+ *
+ * Copyright (C) IBM Corp. 2020.  All rights reserved.
+ *	Author(s): Pierre Morel <pmorel@linux.ibm.com>
+ *                 Matthew Rosato <mjrosato@linux.ibm.com>
+ */
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/vfio_zdev.h>
+#include <linux/kvm_host.h>
+#include <asm/pci_clp.h>
+#include <asm/pci_io.h>
+
+#include "vfio_pci_priv.h"
+
+/*
+ * Add the Base PCI Function information to the device info region.
+ */
+static int zpci_base_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps)
+{
+	struct vfio_device_info_cap_zpci_base cap = {
+		.header.id = VFIO_DEVICE_INFO_CAP_ZPCI_BASE,
+		.header.version = 2,
+		.start_dma = zdev->start_dma,
+		.end_dma = zdev->end_dma,
+		.pchid = zdev->pchid,
+		.vfn = zdev->vfn,
+		.fmb_length = zdev->fmb_length,
+		.pft = zdev->pft,
+		.gid = zdev->pfgid,
+		.fh = zdev->fh
+	};
+
+	return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
+}
+
+/*
+ * Add the Base PCI Function Group information to the device info region.
+ */
+static int zpci_group_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps)
+{
+	struct vfio_device_info_cap_zpci_group cap = {
+		.header.id = VFIO_DEVICE_INFO_CAP_ZPCI_GROUP,
+		.header.version = 2,
+		.dasm = zdev->dma_mask,
+		.msi_addr = zdev->msi_addr,
+		.flags = VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH,
+		.mui = zdev->fmb_update,
+		.noi = zdev->max_msi,
+		.maxstbl = ZPCI_MAX_WRITE_SIZE,
+		.version = zdev->version,
+		.reserved = 0,
+		.imaxstbl = zdev->maxstbl
+	};
+
+	return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
+}
+
+/*
+ * Add the device utility string to the device info region.
+ */
+static int zpci_util_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps)
+{
+	struct vfio_device_info_cap_zpci_util *cap;
+	int cap_size = sizeof(*cap) + CLP_UTIL_STR_LEN;
+	int ret;
+
+	cap = kmalloc(cap_size, GFP_KERNEL);
+	if (!cap)
+		return -ENOMEM;
+
+	cap->header.id = VFIO_DEVICE_INFO_CAP_ZPCI_UTIL;
+	cap->header.version = 1;
+	cap->size = CLP_UTIL_STR_LEN;
+	memcpy(cap->util_str, zdev->util_str, cap->size);
+
+	ret = vfio_info_add_capability(caps, &cap->header, cap_size);
+
+	kfree(cap);
+
+	return ret;
+}
+
+/*
+ * Add the function path string to the device info region.
+ */
+static int zpci_pfip_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps)
+{
+	struct vfio_device_info_cap_zpci_pfip *cap;
+	int cap_size = sizeof(*cap) + CLP_PFIP_NR_SEGMENTS;
+	int ret;
+
+	cap = kmalloc(cap_size, GFP_KERNEL);
+	if (!cap)
+		return -ENOMEM;
+
+	cap->header.id = VFIO_DEVICE_INFO_CAP_ZPCI_PFIP;
+	cap->header.version = 1;
+	cap->size = CLP_PFIP_NR_SEGMENTS;
+	memcpy(cap->pfip, zdev->pfip, cap->size);
+
+	ret = vfio_info_add_capability(caps, &cap->header, cap_size);
+
+	kfree(cap);
+
+	return ret;
+}
+
+/*
+ * Add all supported capabilities to the VFIO_DEVICE_GET_INFO capability chain.
+ */
+int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
+				struct vfio_info_cap *caps)
+{
+	struct zpci_dev *zdev = to_zpci(vdev->pdev);
+	int ret;
+
+	if (!zdev)
+		return -ENODEV;
+
+	ret = zpci_base_cap(zdev, caps);
+	if (ret)
+		return ret;
+
+	ret = zpci_group_cap(zdev, caps);
+	if (ret)
+		return ret;
+
+	if (zdev->util_str_avail) {
+		ret = zpci_util_cap(zdev, caps);
+		if (ret)
+			return ret;
+	}
+
+	ret = zpci_pfip_cap(zdev, caps);
+
+	return ret;
+}
+
+int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev)
+{
+	struct zpci_dev *zdev = to_zpci(vdev->pdev);
+
+	if (!zdev)
+		return -ENODEV;
+
+	if (!vdev->vdev.kvm)
+		return 0;
+
+	if (zpci_kvm_hook.kvm_register)
+		return zpci_kvm_hook.kvm_register(zdev, vdev->vdev.kvm);
+
+	return -ENOENT;
+}
+
+void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev)
+{
+	struct zpci_dev *zdev = to_zpci(vdev->pdev);
+
+	if (!zdev || !vdev->vdev.kvm)
+		return;
+
+	if (zpci_kvm_hook.kvm_unregister)
+		zpci_kvm_hook.kvm_unregister(zdev);
+}
diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig
new file mode 100644
index 0000000000..88fcde51f0
--- /dev/null
+++ b/drivers/vfio/platform/Kconfig
@@ -0,0 +1,35 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menu "VFIO support for platform devices"
+	depends on ARM || ARM64 || COMPILE_TEST
+
+config VFIO_PLATFORM_BASE
+	tristate
+	select VFIO_VIRQFD
+
+config VFIO_PLATFORM
+	tristate "Generic VFIO support for any platform device"
+	select VFIO_PLATFORM_BASE
+	help
+	  Support for platform devices with VFIO. This is required to make
+	  use of platform devices present on the system using the VFIO
+	  framework.
+
+	  If you don't know what to do here, say N.
+
+config VFIO_AMBA
+	tristate "VFIO support for AMBA devices"
+	depends on ARM_AMBA || COMPILE_TEST
+	select VFIO_PLATFORM_BASE
+	help
+	  Support for ARM AMBA devices with VFIO. This is required to make
+	  use of ARM AMBA devices present on the system using the VFIO
+	  framework.
+
+	  If you don't know what to do here, say N.
+
+menu "VFIO platform reset drivers"
+	depends on VFIO_PLATFORM_BASE
+
+source "drivers/vfio/platform/reset/Kconfig"
+endmenu
+endmenu
diff --git a/drivers/vfio/platform/Makefile b/drivers/vfio/platform/Makefile
new file mode 100644
index 0000000000..ee4fb6a82c
--- /dev/null
+++ b/drivers/vfio/platform/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+vfio-platform-base-y := vfio_platform_common.o vfio_platform_irq.o
+obj-$(CONFIG_VFIO_PLATFORM_BASE) += vfio-platform-base.o
+obj-$(CONFIG_VFIO_PLATFORM_BASE) += reset/
+
+vfio-platform-y := vfio_platform.o
+obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform.o
+
+vfio-amba-y := vfio_amba.o
+obj-$(CONFIG_VFIO_AMBA) += vfio-amba.o
diff --git a/drivers/vfio/platform/reset/Kconfig b/drivers/vfio/platform/reset/Kconfig
new file mode 100644
index 0000000000..dcc08dc145
--- /dev/null
+++ b/drivers/vfio/platform/reset/Kconfig
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: GPL-2.0-only
+if VFIO_PLATFORM
+config VFIO_PLATFORM_CALXEDAXGMAC_RESET
+	tristate "VFIO support for calxeda xgmac reset"
+	help
+	  Enables the VFIO platform driver to handle reset for Calxeda xgmac
+
+	  If you don't know what to do here, say N.
+
+config VFIO_PLATFORM_AMDXGBE_RESET
+	tristate "VFIO support for AMD XGBE reset"
+	help
+	  Enables the VFIO platform driver to handle reset for AMD XGBE
+
+	  If you don't know what to do here, say N.
+
+config VFIO_PLATFORM_BCMFLEXRM_RESET
+	tristate "VFIO support for Broadcom FlexRM reset"
+	depends on ARCH_BCM_IPROC || COMPILE_TEST
+	default ARCH_BCM_IPROC
+	help
+	  Enables the VFIO platform driver to handle reset for Broadcom FlexRM
+
+	  If you don't know what to do here, say N.
+endif
diff --git a/drivers/vfio/platform/reset/Makefile b/drivers/vfio/platform/reset/Makefile
new file mode 100644
index 0000000000..7294c5ea12
--- /dev/null
+++ b/drivers/vfio/platform/reset/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+vfio-platform-calxedaxgmac-y := vfio_platform_calxedaxgmac.o
+vfio-platform-amdxgbe-y := vfio_platform_amdxgbe.o
+
+obj-$(CONFIG_VFIO_PLATFORM_CALXEDAXGMAC_RESET) += vfio-platform-calxedaxgmac.o
+obj-$(CONFIG_VFIO_PLATFORM_AMDXGBE_RESET) += vfio-platform-amdxgbe.o
+obj-$(CONFIG_VFIO_PLATFORM_BCMFLEXRM_RESET) += vfio_platform_bcmflexrm.o
diff --git a/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c b/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c
new file mode 100644
index 0000000000..abdca90080
--- /dev/null
+++ b/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VFIO platform driver specialized for AMD xgbe reset
+ * reset code is inherited from AMD xgbe native driver
+ *
+ * Copyright (c) 2015 Linaro Ltd.
+ *              www.linaro.org
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <uapi/linux/mdio.h>
+#include <linux/delay.h>
+
+#include "../vfio_platform_private.h"
+
+#define DMA_MR			0x3000
+#define MAC_VR			0x0110
+#define DMA_ISR			0x3008
+#define MAC_ISR			0x00b0
+#define PCS_MMD_SELECT		0xff
+#define MDIO_AN_INT		0x8002
+#define MDIO_AN_INTMASK		0x8001
+
+static unsigned int xmdio_read(void __iomem *ioaddr, unsigned int mmd,
+			       unsigned int reg)
+{
+	unsigned int mmd_address, value;
+
+	mmd_address = (mmd << 16) | ((reg) & 0xffff);
+	iowrite32(mmd_address >> 8, ioaddr + (PCS_MMD_SELECT << 2));
+	value = ioread32(ioaddr + ((mmd_address & 0xff) << 2));
+	return value;
+}
+
+static void xmdio_write(void __iomem *ioaddr, unsigned int mmd,
+			unsigned int reg, unsigned int value)
+{
+	unsigned int mmd_address;
+
+	mmd_address = (mmd << 16) | ((reg) & 0xffff);
+	iowrite32(mmd_address >> 8, ioaddr + (PCS_MMD_SELECT << 2));
+	iowrite32(value, ioaddr + ((mmd_address & 0xff) << 2));
+}
+
+static int vfio_platform_amdxgbe_reset(struct vfio_platform_device *vdev)
+{
+	struct vfio_platform_region *xgmac_regs = &vdev->regions[0];
+	struct vfio_platform_region *xpcs_regs = &vdev->regions[1];
+	u32 dma_mr_value, pcs_value, value;
+	unsigned int count;
+
+	if (!xgmac_regs->ioaddr) {
+		xgmac_regs->ioaddr =
+			ioremap(xgmac_regs->addr, xgmac_regs->size);
+		if (!xgmac_regs->ioaddr)
+			return -ENOMEM;
+	}
+	if (!xpcs_regs->ioaddr) {
+		xpcs_regs->ioaddr =
+			ioremap(xpcs_regs->addr, xpcs_regs->size);
+		if (!xpcs_regs->ioaddr)
+			return -ENOMEM;
+	}
+
+	/* reset the PHY through MDIO*/
+	pcs_value = xmdio_read(xpcs_regs->ioaddr, MDIO_MMD_PCS, MDIO_CTRL1);
+	pcs_value |= MDIO_CTRL1_RESET;
+	xmdio_write(xpcs_regs->ioaddr, MDIO_MMD_PCS, MDIO_CTRL1, pcs_value);
+
+	count = 50;
+	do {
+		msleep(20);
+		pcs_value = xmdio_read(xpcs_regs->ioaddr, MDIO_MMD_PCS,
+					MDIO_CTRL1);
+	} while ((pcs_value & MDIO_CTRL1_RESET) && --count);
+
+	if (pcs_value & MDIO_CTRL1_RESET)
+		dev_warn(vdev->device, "%s: XGBE PHY reset timeout\n",
+			 __func__);
+
+	/* disable auto-negotiation */
+	value = xmdio_read(xpcs_regs->ioaddr, MDIO_MMD_AN, MDIO_CTRL1);
+	value &= ~MDIO_AN_CTRL1_ENABLE;
+	xmdio_write(xpcs_regs->ioaddr, MDIO_MMD_AN, MDIO_CTRL1, value);
+
+	/* disable AN IRQ */
+	xmdio_write(xpcs_regs->ioaddr, MDIO_MMD_AN, MDIO_AN_INTMASK, 0);
+
+	/* clear AN IRQ */
+	xmdio_write(xpcs_regs->ioaddr, MDIO_MMD_AN, MDIO_AN_INT, 0);
+
+	/* MAC software reset */
+	dma_mr_value = ioread32(xgmac_regs->ioaddr + DMA_MR);
+	dma_mr_value |= 0x1;
+	iowrite32(dma_mr_value, xgmac_regs->ioaddr + DMA_MR);
+
+	usleep_range(10, 15);
+
+	count = 2000;
+	while (--count && (ioread32(xgmac_regs->ioaddr + DMA_MR) & 1))
+		usleep_range(500, 600);
+
+	if (!count)
+		dev_warn(vdev->device, "%s: MAC SW reset failed\n", __func__);
+
+	return 0;
+}
+
+module_vfio_reset_handler("amd,xgbe-seattle-v1a", vfio_platform_amdxgbe_reset);
+
+MODULE_VERSION("0.1");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Auger <eric.auger@linaro.org>");
+MODULE_DESCRIPTION("Reset support for AMD xgbe vfio platform device");
diff --git a/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c b/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c
new file mode 100644
index 0000000000..1131ebe483
--- /dev/null
+++ b/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2017 Broadcom
+ */
+
+/*
+ * This driver provides reset support for Broadcom FlexRM ring manager
+ * to VFIO platform.
+ */
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include "../vfio_platform_private.h"
+
+/* FlexRM configuration */
+#define RING_REGS_SIZE					0x10000
+#define RING_VER_MAGIC					0x76303031
+
+/* Per-Ring register offsets */
+#define RING_VER					0x000
+#define RING_CONTROL					0x034
+#define RING_FLUSH_DONE					0x038
+
+/* Register RING_CONTROL fields */
+#define CONTROL_FLUSH_SHIFT				5
+
+/* Register RING_FLUSH_DONE fields */
+#define FLUSH_DONE_MASK					0x1
+
+static int vfio_platform_bcmflexrm_shutdown(void __iomem *ring)
+{
+	unsigned int timeout;
+
+	/* Disable/inactivate ring */
+	writel_relaxed(0x0, ring + RING_CONTROL);
+
+	/* Set ring flush state */
+	timeout = 1000; /* timeout of 1s */
+	writel_relaxed(BIT(CONTROL_FLUSH_SHIFT), ring + RING_CONTROL);
+	do {
+		if (readl_relaxed(ring + RING_FLUSH_DONE) &
+		    FLUSH_DONE_MASK)
+			break;
+		mdelay(1);
+	} while (--timeout);
+	if (!timeout)
+		return -ETIMEDOUT;
+
+	/* Clear ring flush state */
+	timeout = 1000; /* timeout of 1s */
+	writel_relaxed(0x0, ring + RING_CONTROL);
+	do {
+		if (!(readl_relaxed(ring + RING_FLUSH_DONE) &
+		      FLUSH_DONE_MASK))
+			break;
+		mdelay(1);
+	} while (--timeout);
+	if (!timeout)
+		return -ETIMEDOUT;
+
+	return 0;
+}
+
+static int vfio_platform_bcmflexrm_reset(struct vfio_platform_device *vdev)
+{
+	void __iomem *ring;
+	int rc = 0, ret = 0, ring_num = 0;
+	struct vfio_platform_region *reg = &vdev->regions[0];
+
+	/* Map FlexRM ring registers if not mapped */
+	if (!reg->ioaddr) {
+		reg->ioaddr = ioremap(reg->addr, reg->size);
+		if (!reg->ioaddr)
+			return -ENOMEM;
+	}
+
+	/* Discover and shutdown each FlexRM ring */
+	for (ring = reg->ioaddr;
+	     ring < (reg->ioaddr + reg->size); ring += RING_REGS_SIZE) {
+		if (readl_relaxed(ring + RING_VER) == RING_VER_MAGIC) {
+			rc = vfio_platform_bcmflexrm_shutdown(ring);
+			if (rc) {
+				dev_warn(vdev->device,
+					 "FlexRM ring%d shutdown error %d\n",
+					 ring_num, rc);
+				ret |= rc;
+			}
+			ring_num++;
+		}
+	}
+
+	return ret;
+}
+
+module_vfio_reset_handler("brcm,iproc-flexrm-mbox",
+			  vfio_platform_bcmflexrm_reset);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Anup Patel <anup.patel@broadcom.com>");
+MODULE_DESCRIPTION("Reset support for Broadcom FlexRM VFIO platform device");
diff --git a/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c b/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c
new file mode 100644
index 0000000000..63cc7f0b2e
--- /dev/null
+++ b/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VFIO platform driver specialized for Calxeda xgmac reset
+ * reset code is inherited from calxeda xgmac native driver
+ *
+ * Copyright 2010-2011 Calxeda, Inc.
+ * Copyright (c) 2015 Linaro Ltd.
+ *              www.linaro.org
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+
+#include "../vfio_platform_private.h"
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "Eric Auger <eric.auger@linaro.org>"
+#define DRIVER_DESC     "Reset support for Calxeda xgmac vfio platform device"
+
+/* XGMAC Register definitions */
+#define XGMAC_CONTROL           0x00000000      /* MAC Configuration */
+
+/* DMA Control and Status Registers */
+#define XGMAC_DMA_CONTROL       0x00000f18      /* Ctrl (Operational Mode) */
+#define XGMAC_DMA_INTR_ENA      0x00000f1c      /* Interrupt Enable */
+
+/* DMA Control register defines */
+#define DMA_CONTROL_ST          0x00002000      /* Start/Stop Transmission */
+#define DMA_CONTROL_SR          0x00000002      /* Start/Stop Receive */
+
+/* Common MAC defines */
+#define MAC_ENABLE_TX           0x00000008      /* Transmitter Enable */
+#define MAC_ENABLE_RX           0x00000004      /* Receiver Enable */
+
+static inline void xgmac_mac_disable(void __iomem *ioaddr)
+{
+	u32 value = readl(ioaddr + XGMAC_DMA_CONTROL);
+
+	value &= ~(DMA_CONTROL_ST | DMA_CONTROL_SR);
+	writel(value, ioaddr + XGMAC_DMA_CONTROL);
+
+	value = readl(ioaddr + XGMAC_CONTROL);
+	value &= ~(MAC_ENABLE_TX | MAC_ENABLE_RX);
+	writel(value, ioaddr + XGMAC_CONTROL);
+}
+
+static int vfio_platform_calxedaxgmac_reset(struct vfio_platform_device *vdev)
+{
+	struct vfio_platform_region *reg = &vdev->regions[0];
+
+	if (!reg->ioaddr) {
+		reg->ioaddr =
+			ioremap(reg->addr, reg->size);
+		if (!reg->ioaddr)
+			return -ENOMEM;
+	}
+
+	/* disable IRQ */
+	writel(0, reg->ioaddr + XGMAC_DMA_INTR_ENA);
+
+	/* Disable the MAC core */
+	xgmac_mac_disable(reg->ioaddr);
+
+	return 0;
+}
+
+module_vfio_reset_handler("calxeda,hb-xgmac", vfio_platform_calxedaxgmac_reset);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c
new file mode 100644
index 0000000000..6464b3939e
--- /dev/null
+++ b/drivers/vfio/platform/vfio_amba.c
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2013 - Virtual Open Systems
+ * Author: Antonios Motakis <a.motakis@virtualopensystems.com>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vfio.h>
+#include <linux/pm_runtime.h>
+#include <linux/amba/bus.h>
+
+#include "vfio_platform_private.h"
+
+#define DRIVER_VERSION  "0.10"
+#define DRIVER_AUTHOR   "Antonios Motakis <a.motakis@virtualopensystems.com>"
+#define DRIVER_DESC     "VFIO for AMBA devices - User Level meta-driver"
+
+/* probing devices from the AMBA bus */
+
+static struct resource *get_amba_resource(struct vfio_platform_device *vdev,
+					  int i)
+{
+	struct amba_device *adev = (struct amba_device *) vdev->opaque;
+
+	if (i == 0)
+		return &adev->res;
+
+	return NULL;
+}
+
+static int get_amba_irq(struct vfio_platform_device *vdev, int i)
+{
+	struct amba_device *adev = (struct amba_device *) vdev->opaque;
+	int ret = 0;
+
+	if (i < AMBA_NR_IRQS)
+		ret = adev->irq[i];
+
+	/* zero is an unset IRQ for AMBA devices */
+	return ret ? ret : -ENXIO;
+}
+
+static int vfio_amba_init_dev(struct vfio_device *core_vdev)
+{
+	struct vfio_platform_device *vdev =
+		container_of(core_vdev, struct vfio_platform_device, vdev);
+	struct amba_device *adev = to_amba_device(core_vdev->dev);
+	int ret;
+
+	vdev->name = kasprintf(GFP_KERNEL, "vfio-amba-%08x", adev->periphid);
+	if (!vdev->name)
+		return -ENOMEM;
+
+	vdev->opaque = (void *) adev;
+	vdev->flags = VFIO_DEVICE_FLAGS_AMBA;
+	vdev->get_resource = get_amba_resource;
+	vdev->get_irq = get_amba_irq;
+	vdev->reset_required = false;
+
+	ret = vfio_platform_init_common(vdev);
+	if (ret)
+		kfree(vdev->name);
+	return ret;
+}
+
+static const struct vfio_device_ops vfio_amba_ops;
+static int vfio_amba_probe(struct amba_device *adev, const struct amba_id *id)
+{
+	struct vfio_platform_device *vdev;
+	int ret;
+
+	vdev = vfio_alloc_device(vfio_platform_device, vdev, &adev->dev,
+				 &vfio_amba_ops);
+	if (IS_ERR(vdev))
+		return PTR_ERR(vdev);
+
+	ret = vfio_register_group_dev(&vdev->vdev);
+	if (ret)
+		goto out_put_vdev;
+
+	pm_runtime_enable(&adev->dev);
+	dev_set_drvdata(&adev->dev, vdev);
+	return 0;
+
+out_put_vdev:
+	vfio_put_device(&vdev->vdev);
+	return ret;
+}
+
+static void vfio_amba_release_dev(struct vfio_device *core_vdev)
+{
+	struct vfio_platform_device *vdev =
+		container_of(core_vdev, struct vfio_platform_device, vdev);
+
+	vfio_platform_release_common(vdev);
+	kfree(vdev->name);
+}
+
+static void vfio_amba_remove(struct amba_device *adev)
+{
+	struct vfio_platform_device *vdev = dev_get_drvdata(&adev->dev);
+
+	vfio_unregister_group_dev(&vdev->vdev);
+	pm_runtime_disable(vdev->device);
+	vfio_put_device(&vdev->vdev);
+}
+
+static const struct vfio_device_ops vfio_amba_ops = {
+	.name		= "vfio-amba",
+	.init		= vfio_amba_init_dev,
+	.release	= vfio_amba_release_dev,
+	.open_device	= vfio_platform_open_device,
+	.close_device	= vfio_platform_close_device,
+	.ioctl		= vfio_platform_ioctl,
+	.read		= vfio_platform_read,
+	.write		= vfio_platform_write,
+	.mmap		= vfio_platform_mmap,
+	.bind_iommufd	= vfio_iommufd_physical_bind,
+	.unbind_iommufd	= vfio_iommufd_physical_unbind,
+	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
+	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
+};
+
+static const struct amba_id pl330_ids[] = {
+	{ 0, 0 },
+};
+
+MODULE_DEVICE_TABLE(amba, pl330_ids);
+
+static struct amba_driver vfio_amba_driver = {
+	.probe = vfio_amba_probe,
+	.remove = vfio_amba_remove,
+	.id_table = pl330_ids,
+	.drv = {
+		.name = "vfio-amba",
+		.owner = THIS_MODULE,
+	},
+	.driver_managed_dma = true,
+};
+
+module_amba_driver(vfio_amba_driver);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c
new file mode 100644
index 0000000000..8cf22fa65b
--- /dev/null
+++ b/drivers/vfio/platform/vfio_platform.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2013 - Virtual Open Systems
+ * Author: Antonios Motakis <a.motakis@virtualopensystems.com>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vfio.h>
+#include <linux/pm_runtime.h>
+#include <linux/platform_device.h>
+
+#include "vfio_platform_private.h"
+
+#define DRIVER_VERSION  "0.10"
+#define DRIVER_AUTHOR   "Antonios Motakis <a.motakis@virtualopensystems.com>"
+#define DRIVER_DESC     "VFIO for platform devices - User Level meta-driver"
+
+static bool reset_required = true;
+module_param(reset_required, bool, 0444);
+MODULE_PARM_DESC(reset_required, "override reset requirement (default: 1)");
+
+/* probing devices from the linux platform bus */
+
+static struct resource *get_platform_resource(struct vfio_platform_device *vdev,
+					      int num)
+{
+	struct platform_device *dev = (struct platform_device *) vdev->opaque;
+
+	return platform_get_mem_or_io(dev, num);
+}
+
+static int get_platform_irq(struct vfio_platform_device *vdev, int i)
+{
+	struct platform_device *pdev = (struct platform_device *) vdev->opaque;
+
+	return platform_get_irq_optional(pdev, i);
+}
+
+static int vfio_platform_init_dev(struct vfio_device *core_vdev)
+{
+	struct vfio_platform_device *vdev =
+		container_of(core_vdev, struct vfio_platform_device, vdev);
+	struct platform_device *pdev = to_platform_device(core_vdev->dev);
+
+	vdev->opaque = (void *) pdev;
+	vdev->name = pdev->name;
+	vdev->flags = VFIO_DEVICE_FLAGS_PLATFORM;
+	vdev->get_resource = get_platform_resource;
+	vdev->get_irq = get_platform_irq;
+	vdev->reset_required = reset_required;
+
+	return vfio_platform_init_common(vdev);
+}
+
+static const struct vfio_device_ops vfio_platform_ops;
+static int vfio_platform_probe(struct platform_device *pdev)
+{
+	struct vfio_platform_device *vdev;
+	int ret;
+
+	vdev = vfio_alloc_device(vfio_platform_device, vdev, &pdev->dev,
+				 &vfio_platform_ops);
+	if (IS_ERR(vdev))
+		return PTR_ERR(vdev);
+
+	ret = vfio_register_group_dev(&vdev->vdev);
+	if (ret)
+		goto out_put_vdev;
+
+	pm_runtime_enable(&pdev->dev);
+	dev_set_drvdata(&pdev->dev, vdev);
+	return 0;
+
+out_put_vdev:
+	vfio_put_device(&vdev->vdev);
+	return ret;
+}
+
+static void vfio_platform_release_dev(struct vfio_device *core_vdev)
+{
+	struct vfio_platform_device *vdev =
+		container_of(core_vdev, struct vfio_platform_device, vdev);
+
+	vfio_platform_release_common(vdev);
+}
+
+static int vfio_platform_remove(struct platform_device *pdev)
+{
+	struct vfio_platform_device *vdev = dev_get_drvdata(&pdev->dev);
+
+	vfio_unregister_group_dev(&vdev->vdev);
+	pm_runtime_disable(vdev->device);
+	vfio_put_device(&vdev->vdev);
+	return 0;
+}
+
+static const struct vfio_device_ops vfio_platform_ops = {
+	.name		= "vfio-platform",
+	.init		= vfio_platform_init_dev,
+	.release	= vfio_platform_release_dev,
+	.open_device	= vfio_platform_open_device,
+	.close_device	= vfio_platform_close_device,
+	.ioctl		= vfio_platform_ioctl,
+	.read		= vfio_platform_read,
+	.write		= vfio_platform_write,
+	.mmap		= vfio_platform_mmap,
+	.bind_iommufd	= vfio_iommufd_physical_bind,
+	.unbind_iommufd	= vfio_iommufd_physical_unbind,
+	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
+	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
+};
+
+static struct platform_driver vfio_platform_driver = {
+	.probe		= vfio_platform_probe,
+	.remove		= vfio_platform_remove,
+	.driver	= {
+		.name	= "vfio-platform",
+	},
+	.driver_managed_dma = true,
+};
+
+module_platform_driver(vfio_platform_driver);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c
new file mode 100644
index 0000000000..e53757d1d0
--- /dev/null
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -0,0 +1,695 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2013 - Virtual Open Systems
+ * Author: Antonios Motakis <a.motakis@virtualopensystems.com>
+ */
+
+#define dev_fmt(fmt)	"VFIO: " fmt
+
+#include <linux/device.h>
+#include <linux/acpi.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+
+#include "vfio_platform_private.h"
+
+#define DRIVER_VERSION  "0.10"
+#define DRIVER_AUTHOR   "Antonios Motakis <a.motakis@virtualopensystems.com>"
+#define DRIVER_DESC     "VFIO platform base module"
+
+#define VFIO_PLATFORM_IS_ACPI(vdev) ((vdev)->acpihid != NULL)
+
+static LIST_HEAD(reset_list);
+static DEFINE_MUTEX(driver_lock);
+
+static vfio_platform_reset_fn_t vfio_platform_lookup_reset(const char *compat,
+					struct module **module)
+{
+	struct vfio_platform_reset_node *iter;
+	vfio_platform_reset_fn_t reset_fn = NULL;
+
+	mutex_lock(&driver_lock);
+	list_for_each_entry(iter, &reset_list, link) {
+		if (!strcmp(iter->compat, compat) &&
+			try_module_get(iter->owner)) {
+			*module = iter->owner;
+			reset_fn = iter->of_reset;
+			break;
+		}
+	}
+	mutex_unlock(&driver_lock);
+	return reset_fn;
+}
+
+static int vfio_platform_acpi_probe(struct vfio_platform_device *vdev,
+				    struct device *dev)
+{
+	struct acpi_device *adev;
+
+	if (acpi_disabled)
+		return -ENOENT;
+
+	adev = ACPI_COMPANION(dev);
+	if (!adev) {
+		dev_err(dev, "ACPI companion device not found for %s\n",
+			vdev->name);
+		return -ENODEV;
+	}
+
+#ifdef CONFIG_ACPI
+	vdev->acpihid = acpi_device_hid(adev);
+#endif
+	return WARN_ON(!vdev->acpihid) ? -EINVAL : 0;
+}
+
+static int vfio_platform_acpi_call_reset(struct vfio_platform_device *vdev,
+				  const char **extra_dbg)
+{
+#ifdef CONFIG_ACPI
+	struct device *dev = vdev->device;
+	acpi_handle handle = ACPI_HANDLE(dev);
+	acpi_status acpi_ret;
+
+	acpi_ret = acpi_evaluate_object(handle, "_RST", NULL, NULL);
+	if (ACPI_FAILURE(acpi_ret)) {
+		if (extra_dbg)
+			*extra_dbg = acpi_format_exception(acpi_ret);
+		return -EINVAL;
+	}
+
+	return 0;
+#else
+	return -ENOENT;
+#endif
+}
+
+static bool vfio_platform_acpi_has_reset(struct vfio_platform_device *vdev)
+{
+#ifdef CONFIG_ACPI
+	struct device *dev = vdev->device;
+	acpi_handle handle = ACPI_HANDLE(dev);
+
+	return acpi_has_method(handle, "_RST");
+#else
+	return false;
+#endif
+}
+
+static bool vfio_platform_has_reset(struct vfio_platform_device *vdev)
+{
+	if (VFIO_PLATFORM_IS_ACPI(vdev))
+		return vfio_platform_acpi_has_reset(vdev);
+
+	return vdev->of_reset ? true : false;
+}
+
+static int vfio_platform_get_reset(struct vfio_platform_device *vdev)
+{
+	if (VFIO_PLATFORM_IS_ACPI(vdev))
+		return vfio_platform_acpi_has_reset(vdev) ? 0 : -ENOENT;
+
+	vdev->of_reset = vfio_platform_lookup_reset(vdev->compat,
+						    &vdev->reset_module);
+	if (!vdev->of_reset) {
+		request_module("vfio-reset:%s", vdev->compat);
+		vdev->of_reset = vfio_platform_lookup_reset(vdev->compat,
+							&vdev->reset_module);
+	}
+
+	return vdev->of_reset ? 0 : -ENOENT;
+}
+
+static void vfio_platform_put_reset(struct vfio_platform_device *vdev)
+{
+	if (VFIO_PLATFORM_IS_ACPI(vdev))
+		return;
+
+	if (vdev->of_reset)
+		module_put(vdev->reset_module);
+}
+
+static int vfio_platform_regions_init(struct vfio_platform_device *vdev)
+{
+	int cnt = 0, i;
+
+	while (vdev->get_resource(vdev, cnt))
+		cnt++;
+
+	vdev->regions = kcalloc(cnt, sizeof(struct vfio_platform_region),
+				GFP_KERNEL_ACCOUNT);
+	if (!vdev->regions)
+		return -ENOMEM;
+
+	for (i = 0; i < cnt;  i++) {
+		struct resource *res =
+			vdev->get_resource(vdev, i);
+
+		vdev->regions[i].addr = res->start;
+		vdev->regions[i].size = resource_size(res);
+		vdev->regions[i].flags = 0;
+
+		switch (resource_type(res)) {
+		case IORESOURCE_MEM:
+			vdev->regions[i].type = VFIO_PLATFORM_REGION_TYPE_MMIO;
+			vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_READ;
+			if (!(res->flags & IORESOURCE_READONLY))
+				vdev->regions[i].flags |=
+					VFIO_REGION_INFO_FLAG_WRITE;
+
+			/*
+			 * Only regions addressed with PAGE granularity may be
+			 * MMAPed securely.
+			 */
+			if (!(vdev->regions[i].addr & ~PAGE_MASK) &&
+					!(vdev->regions[i].size & ~PAGE_MASK))
+				vdev->regions[i].flags |=
+					VFIO_REGION_INFO_FLAG_MMAP;
+
+			break;
+		case IORESOURCE_IO:
+			vdev->regions[i].type = VFIO_PLATFORM_REGION_TYPE_PIO;
+			break;
+		default:
+			goto err;
+		}
+	}
+
+	vdev->num_regions = cnt;
+
+	return 0;
+err:
+	kfree(vdev->regions);
+	return -EINVAL;
+}
+
+static void vfio_platform_regions_cleanup(struct vfio_platform_device *vdev)
+{
+	int i;
+
+	for (i = 0; i < vdev->num_regions; i++)
+		iounmap(vdev->regions[i].ioaddr);
+
+	vdev->num_regions = 0;
+	kfree(vdev->regions);
+}
+
+static int vfio_platform_call_reset(struct vfio_platform_device *vdev,
+				    const char **extra_dbg)
+{
+	if (VFIO_PLATFORM_IS_ACPI(vdev)) {
+		dev_info(vdev->device, "reset\n");
+		return vfio_platform_acpi_call_reset(vdev, extra_dbg);
+	} else if (vdev->of_reset) {
+		dev_info(vdev->device, "reset\n");
+		return vdev->of_reset(vdev);
+	}
+
+	dev_warn(vdev->device, "no reset function found!\n");
+	return -EINVAL;
+}
+
+void vfio_platform_close_device(struct vfio_device *core_vdev)
+{
+	struct vfio_platform_device *vdev =
+		container_of(core_vdev, struct vfio_platform_device, vdev);
+	const char *extra_dbg = NULL;
+	int ret;
+
+	ret = vfio_platform_call_reset(vdev, &extra_dbg);
+	if (WARN_ON(ret && vdev->reset_required)) {
+		dev_warn(
+			vdev->device,
+			"reset driver is required and reset call failed in release (%d) %s\n",
+			ret, extra_dbg ? extra_dbg : "");
+	}
+	pm_runtime_put(vdev->device);
+	vfio_platform_regions_cleanup(vdev);
+	vfio_platform_irq_cleanup(vdev);
+}
+EXPORT_SYMBOL_GPL(vfio_platform_close_device);
+
+int vfio_platform_open_device(struct vfio_device *core_vdev)
+{
+	struct vfio_platform_device *vdev =
+		container_of(core_vdev, struct vfio_platform_device, vdev);
+	const char *extra_dbg = NULL;
+	int ret;
+
+	ret = vfio_platform_regions_init(vdev);
+	if (ret)
+		return ret;
+
+	ret = vfio_platform_irq_init(vdev);
+	if (ret)
+		goto err_irq;
+
+	ret = pm_runtime_get_sync(vdev->device);
+	if (ret < 0)
+		goto err_rst;
+
+	ret = vfio_platform_call_reset(vdev, &extra_dbg);
+	if (ret && vdev->reset_required) {
+		dev_warn(
+			vdev->device,
+			"reset driver is required and reset call failed in open (%d) %s\n",
+			ret, extra_dbg ? extra_dbg : "");
+		goto err_rst;
+	}
+	return 0;
+
+err_rst:
+	pm_runtime_put(vdev->device);
+	vfio_platform_irq_cleanup(vdev);
+err_irq:
+	vfio_platform_regions_cleanup(vdev);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_platform_open_device);
+
+long vfio_platform_ioctl(struct vfio_device *core_vdev,
+			 unsigned int cmd, unsigned long arg)
+{
+	struct vfio_platform_device *vdev =
+		container_of(core_vdev, struct vfio_platform_device, vdev);
+
+	unsigned long minsz;
+
+	if (cmd == VFIO_DEVICE_GET_INFO) {
+		struct vfio_device_info info;
+
+		minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		if (vfio_platform_has_reset(vdev))
+			vdev->flags |= VFIO_DEVICE_FLAGS_RESET;
+		info.flags = vdev->flags;
+		info.num_regions = vdev->num_regions;
+		info.num_irqs = vdev->num_irqs;
+
+		return copy_to_user((void __user *)arg, &info, minsz) ?
+			-EFAULT : 0;
+
+	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
+		struct vfio_region_info info;
+
+		minsz = offsetofend(struct vfio_region_info, offset);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		if (info.index >= vdev->num_regions)
+			return -EINVAL;
+
+		/* map offset to the physical address  */
+		info.offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info.index);
+		info.size = vdev->regions[info.index].size;
+		info.flags = vdev->regions[info.index].flags;
+
+		return copy_to_user((void __user *)arg, &info, minsz) ?
+			-EFAULT : 0;
+
+	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
+		struct vfio_irq_info info;
+
+		minsz = offsetofend(struct vfio_irq_info, count);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		if (info.index >= vdev->num_irqs)
+			return -EINVAL;
+
+		info.flags = vdev->irqs[info.index].flags;
+		info.count = vdev->irqs[info.index].count;
+
+		return copy_to_user((void __user *)arg, &info, minsz) ?
+			-EFAULT : 0;
+
+	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
+		struct vfio_irq_set hdr;
+		u8 *data = NULL;
+		int ret = 0;
+		size_t data_size = 0;
+
+		minsz = offsetofend(struct vfio_irq_set, count);
+
+		if (copy_from_user(&hdr, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		ret = vfio_set_irqs_validate_and_prepare(&hdr, vdev->num_irqs,
+						 vdev->num_irqs, &data_size);
+		if (ret)
+			return ret;
+
+		if (data_size) {
+			data = memdup_user((void __user *)(arg + minsz),
+					    data_size);
+			if (IS_ERR(data))
+				return PTR_ERR(data);
+		}
+
+		mutex_lock(&vdev->igate);
+
+		ret = vfio_platform_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
+						   hdr.start, hdr.count, data);
+		mutex_unlock(&vdev->igate);
+		kfree(data);
+
+		return ret;
+
+	} else if (cmd == VFIO_DEVICE_RESET) {
+		return vfio_platform_call_reset(vdev, NULL);
+	}
+
+	return -ENOTTY;
+}
+EXPORT_SYMBOL_GPL(vfio_platform_ioctl);
+
+static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg,
+				       char __user *buf, size_t count,
+				       loff_t off)
+{
+	unsigned int done = 0;
+
+	if (!reg->ioaddr) {
+		reg->ioaddr =
+			ioremap(reg->addr, reg->size);
+
+		if (!reg->ioaddr)
+			return -ENOMEM;
+	}
+
+	while (count) {
+		size_t filled;
+
+		if (count >= 4 && !(off % 4)) {
+			u32 val;
+
+			val = ioread32(reg->ioaddr + off);
+			if (copy_to_user(buf, &val, 4))
+				goto err;
+
+			filled = 4;
+		} else if (count >= 2 && !(off % 2)) {
+			u16 val;
+
+			val = ioread16(reg->ioaddr + off);
+			if (copy_to_user(buf, &val, 2))
+				goto err;
+
+			filled = 2;
+		} else {
+			u8 val;
+
+			val = ioread8(reg->ioaddr + off);
+			if (copy_to_user(buf, &val, 1))
+				goto err;
+
+			filled = 1;
+		}
+
+
+		count -= filled;
+		done += filled;
+		off += filled;
+		buf += filled;
+	}
+
+	return done;
+err:
+	return -EFAULT;
+}
+
+ssize_t vfio_platform_read(struct vfio_device *core_vdev,
+			   char __user *buf, size_t count, loff_t *ppos)
+{
+	struct vfio_platform_device *vdev =
+		container_of(core_vdev, struct vfio_platform_device, vdev);
+	unsigned int index = VFIO_PLATFORM_OFFSET_TO_INDEX(*ppos);
+	loff_t off = *ppos & VFIO_PLATFORM_OFFSET_MASK;
+
+	if (index >= vdev->num_regions)
+		return -EINVAL;
+
+	if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_READ))
+		return -EINVAL;
+
+	if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_MMIO)
+		return vfio_platform_read_mmio(&vdev->regions[index],
+							buf, count, off);
+	else if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_PIO)
+		return -EINVAL; /* not implemented */
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(vfio_platform_read);
+
+static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg,
+					const char __user *buf, size_t count,
+					loff_t off)
+{
+	unsigned int done = 0;
+
+	if (!reg->ioaddr) {
+		reg->ioaddr =
+			ioremap(reg->addr, reg->size);
+
+		if (!reg->ioaddr)
+			return -ENOMEM;
+	}
+
+	while (count) {
+		size_t filled;
+
+		if (count >= 4 && !(off % 4)) {
+			u32 val;
+
+			if (copy_from_user(&val, buf, 4))
+				goto err;
+			iowrite32(val, reg->ioaddr + off);
+
+			filled = 4;
+		} else if (count >= 2 && !(off % 2)) {
+			u16 val;
+
+			if (copy_from_user(&val, buf, 2))
+				goto err;
+			iowrite16(val, reg->ioaddr + off);
+
+			filled = 2;
+		} else {
+			u8 val;
+
+			if (copy_from_user(&val, buf, 1))
+				goto err;
+			iowrite8(val, reg->ioaddr + off);
+
+			filled = 1;
+		}
+
+		count -= filled;
+		done += filled;
+		off += filled;
+		buf += filled;
+	}
+
+	return done;
+err:
+	return -EFAULT;
+}
+
+ssize_t vfio_platform_write(struct vfio_device *core_vdev, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	struct vfio_platform_device *vdev =
+		container_of(core_vdev, struct vfio_platform_device, vdev);
+	unsigned int index = VFIO_PLATFORM_OFFSET_TO_INDEX(*ppos);
+	loff_t off = *ppos & VFIO_PLATFORM_OFFSET_MASK;
+
+	if (index >= vdev->num_regions)
+		return -EINVAL;
+
+	if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_WRITE))
+		return -EINVAL;
+
+	if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_MMIO)
+		return vfio_platform_write_mmio(&vdev->regions[index],
+							buf, count, off);
+	else if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_PIO)
+		return -EINVAL; /* not implemented */
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(vfio_platform_write);
+
+static int vfio_platform_mmap_mmio(struct vfio_platform_region region,
+				   struct vm_area_struct *vma)
+{
+	u64 req_len, pgoff, req_start;
+
+	req_len = vma->vm_end - vma->vm_start;
+	pgoff = vma->vm_pgoff &
+		((1U << (VFIO_PLATFORM_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+	req_start = pgoff << PAGE_SHIFT;
+
+	if (region.size < PAGE_SIZE || req_start + req_len > region.size)
+		return -EINVAL;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vma->vm_pgoff = (region.addr >> PAGE_SHIFT) + pgoff;
+
+	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+			       req_len, vma->vm_page_prot);
+}
+
+int vfio_platform_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma)
+{
+	struct vfio_platform_device *vdev =
+		container_of(core_vdev, struct vfio_platform_device, vdev);
+	unsigned int index;
+
+	index = vma->vm_pgoff >> (VFIO_PLATFORM_OFFSET_SHIFT - PAGE_SHIFT);
+
+	if (vma->vm_end < vma->vm_start)
+		return -EINVAL;
+	if (!(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+	if (index >= vdev->num_regions)
+		return -EINVAL;
+	if (vma->vm_start & ~PAGE_MASK)
+		return -EINVAL;
+	if (vma->vm_end & ~PAGE_MASK)
+		return -EINVAL;
+
+	if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_MMAP))
+		return -EINVAL;
+
+	if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_READ)
+			&& (vma->vm_flags & VM_READ))
+		return -EINVAL;
+
+	if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_WRITE)
+			&& (vma->vm_flags & VM_WRITE))
+		return -EINVAL;
+
+	vma->vm_private_data = vdev;
+
+	if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_MMIO)
+		return vfio_platform_mmap_mmio(vdev->regions[index], vma);
+
+	else if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_PIO)
+		return -EINVAL; /* not implemented */
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(vfio_platform_mmap);
+
+static int vfio_platform_of_probe(struct vfio_platform_device *vdev,
+			   struct device *dev)
+{
+	int ret;
+
+	ret = device_property_read_string(dev, "compatible",
+					  &vdev->compat);
+	if (ret)
+		dev_err(dev, "Cannot retrieve compat for %s\n", vdev->name);
+
+	return ret;
+}
+
+/*
+ * There can be two kernel build combinations. One build where
+ * ACPI is not selected in Kconfig and another one with the ACPI Kconfig.
+ *
+ * In the first case, vfio_platform_acpi_probe will return since
+ * acpi_disabled is 1. DT user will not see any kind of messages from
+ * ACPI.
+ *
+ * In the second case, both DT and ACPI is compiled in but the system is
+ * booting with any of these combinations.
+ *
+ * If the firmware is DT type, then acpi_disabled is 1. The ACPI probe routine
+ * terminates immediately without any messages.
+ *
+ * If the firmware is ACPI type, then acpi_disabled is 0. All other checks are
+ * valid checks. We cannot claim that this system is DT.
+ */
+int vfio_platform_init_common(struct vfio_platform_device *vdev)
+{
+	int ret;
+	struct device *dev = vdev->vdev.dev;
+
+	ret = vfio_platform_acpi_probe(vdev, dev);
+	if (ret)
+		ret = vfio_platform_of_probe(vdev, dev);
+
+	if (ret)
+		return ret;
+
+	vdev->device = dev;
+	mutex_init(&vdev->igate);
+
+	ret = vfio_platform_get_reset(vdev);
+	if (ret && vdev->reset_required) {
+		dev_err(dev, "No reset function found for device %s\n",
+			vdev->name);
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_platform_init_common);
+
+void vfio_platform_release_common(struct vfio_platform_device *vdev)
+{
+	vfio_platform_put_reset(vdev);
+}
+EXPORT_SYMBOL_GPL(vfio_platform_release_common);
+
+void __vfio_platform_register_reset(struct vfio_platform_reset_node *node)
+{
+	mutex_lock(&driver_lock);
+	list_add(&node->link, &reset_list);
+	mutex_unlock(&driver_lock);
+}
+EXPORT_SYMBOL_GPL(__vfio_platform_register_reset);
+
+void vfio_platform_unregister_reset(const char *compat,
+				    vfio_platform_reset_fn_t fn)
+{
+	struct vfio_platform_reset_node *iter, *temp;
+
+	mutex_lock(&driver_lock);
+	list_for_each_entry_safe(iter, temp, &reset_list, link) {
+		if (!strcmp(iter->compat, compat) && (iter->of_reset == fn)) {
+			list_del(&iter->link);
+			break;
+		}
+	}
+
+	mutex_unlock(&driver_lock);
+
+}
+EXPORT_SYMBOL_GPL(vfio_platform_unregister_reset);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/platform/vfio_platform_irq.c b/drivers/vfio/platform/vfio_platform_irq.c
new file mode 100644
index 0000000000..665197caed
--- /dev/null
+++ b/drivers/vfio/platform/vfio_platform_irq.c
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VFIO platform devices interrupt handling
+ *
+ * Copyright (C) 2013 - Virtual Open Systems
+ * Author: Antonios Motakis <a.motakis@virtualopensystems.com>
+ */
+
+#include <linux/eventfd.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/vfio.h>
+#include <linux/irq.h>
+
+#include "vfio_platform_private.h"
+
+static void vfio_platform_mask(struct vfio_platform_irq *irq_ctx)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&irq_ctx->lock, flags);
+
+	if (!irq_ctx->masked) {
+		disable_irq_nosync(irq_ctx->hwirq);
+		irq_ctx->masked = true;
+	}
+
+	spin_unlock_irqrestore(&irq_ctx->lock, flags);
+}
+
+static int vfio_platform_mask_handler(void *opaque, void *unused)
+{
+	struct vfio_platform_irq *irq_ctx = opaque;
+
+	vfio_platform_mask(irq_ctx);
+
+	return 0;
+}
+
+static int vfio_platform_set_irq_mask(struct vfio_platform_device *vdev,
+				      unsigned index, unsigned start,
+				      unsigned count, uint32_t flags,
+				      void *data)
+{
+	if (start != 0 || count != 1)
+		return -EINVAL;
+
+	if (!(vdev->irqs[index].flags & VFIO_IRQ_INFO_MASKABLE))
+		return -EINVAL;
+
+	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+		int32_t fd = *(int32_t *)data;
+
+		if (fd >= 0)
+			return vfio_virqfd_enable((void *) &vdev->irqs[index],
+						  vfio_platform_mask_handler,
+						  NULL, NULL,
+						  &vdev->irqs[index].mask, fd);
+
+		vfio_virqfd_disable(&vdev->irqs[index].mask);
+		return 0;
+	}
+
+	if (flags & VFIO_IRQ_SET_DATA_NONE) {
+		vfio_platform_mask(&vdev->irqs[index]);
+
+	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+		uint8_t mask = *(uint8_t *)data;
+
+		if (mask)
+			vfio_platform_mask(&vdev->irqs[index]);
+	}
+
+	return 0;
+}
+
+static void vfio_platform_unmask(struct vfio_platform_irq *irq_ctx)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&irq_ctx->lock, flags);
+
+	if (irq_ctx->masked) {
+		enable_irq(irq_ctx->hwirq);
+		irq_ctx->masked = false;
+	}
+
+	spin_unlock_irqrestore(&irq_ctx->lock, flags);
+}
+
+static int vfio_platform_unmask_handler(void *opaque, void *unused)
+{
+	struct vfio_platform_irq *irq_ctx = opaque;
+
+	vfio_platform_unmask(irq_ctx);
+
+	return 0;
+}
+
+static int vfio_platform_set_irq_unmask(struct vfio_platform_device *vdev,
+					unsigned index, unsigned start,
+					unsigned count, uint32_t flags,
+					void *data)
+{
+	if (start != 0 || count != 1)
+		return -EINVAL;
+
+	if (!(vdev->irqs[index].flags & VFIO_IRQ_INFO_MASKABLE))
+		return -EINVAL;
+
+	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+		int32_t fd = *(int32_t *)data;
+
+		if (fd >= 0)
+			return vfio_virqfd_enable((void *) &vdev->irqs[index],
+						  vfio_platform_unmask_handler,
+						  NULL, NULL,
+						  &vdev->irqs[index].unmask,
+						  fd);
+
+		vfio_virqfd_disable(&vdev->irqs[index].unmask);
+		return 0;
+	}
+
+	if (flags & VFIO_IRQ_SET_DATA_NONE) {
+		vfio_platform_unmask(&vdev->irqs[index]);
+
+	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+		uint8_t unmask = *(uint8_t *)data;
+
+		if (unmask)
+			vfio_platform_unmask(&vdev->irqs[index]);
+	}
+
+	return 0;
+}
+
+static irqreturn_t vfio_automasked_irq_handler(int irq, void *dev_id)
+{
+	struct vfio_platform_irq *irq_ctx = dev_id;
+	unsigned long flags;
+	int ret = IRQ_NONE;
+
+	spin_lock_irqsave(&irq_ctx->lock, flags);
+
+	if (!irq_ctx->masked) {
+		ret = IRQ_HANDLED;
+
+		/* automask maskable interrupts */
+		disable_irq_nosync(irq_ctx->hwirq);
+		irq_ctx->masked = true;
+	}
+
+	spin_unlock_irqrestore(&irq_ctx->lock, flags);
+
+	if (ret == IRQ_HANDLED)
+		eventfd_signal(irq_ctx->trigger, 1);
+
+	return ret;
+}
+
+static irqreturn_t vfio_irq_handler(int irq, void *dev_id)
+{
+	struct vfio_platform_irq *irq_ctx = dev_id;
+
+	eventfd_signal(irq_ctx->trigger, 1);
+
+	return IRQ_HANDLED;
+}
+
+static int vfio_set_trigger(struct vfio_platform_device *vdev, int index,
+			    int fd, irq_handler_t handler)
+{
+	struct vfio_platform_irq *irq = &vdev->irqs[index];
+	struct eventfd_ctx *trigger;
+	int ret;
+
+	if (irq->trigger) {
+		irq_clear_status_flags(irq->hwirq, IRQ_NOAUTOEN);
+		free_irq(irq->hwirq, irq);
+		kfree(irq->name);
+		eventfd_ctx_put(irq->trigger);
+		irq->trigger = NULL;
+	}
+
+	if (fd < 0) /* Disable only */
+		return 0;
+	irq->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-irq[%d](%s)",
+			      irq->hwirq, vdev->name);
+	if (!irq->name)
+		return -ENOMEM;
+
+	trigger = eventfd_ctx_fdget(fd);
+	if (IS_ERR(trigger)) {
+		kfree(irq->name);
+		return PTR_ERR(trigger);
+	}
+
+	irq->trigger = trigger;
+
+	irq_set_status_flags(irq->hwirq, IRQ_NOAUTOEN);
+	ret = request_irq(irq->hwirq, handler, 0, irq->name, irq);
+	if (ret) {
+		kfree(irq->name);
+		eventfd_ctx_put(trigger);
+		irq->trigger = NULL;
+		return ret;
+	}
+
+	if (!irq->masked)
+		enable_irq(irq->hwirq);
+
+	return 0;
+}
+
+static int vfio_platform_set_irq_trigger(struct vfio_platform_device *vdev,
+					 unsigned index, unsigned start,
+					 unsigned count, uint32_t flags,
+					 void *data)
+{
+	struct vfio_platform_irq *irq = &vdev->irqs[index];
+	irq_handler_t handler;
+
+	if (vdev->irqs[index].flags & VFIO_IRQ_INFO_AUTOMASKED)
+		handler = vfio_automasked_irq_handler;
+	else
+		handler = vfio_irq_handler;
+
+	if (!count && (flags & VFIO_IRQ_SET_DATA_NONE))
+		return vfio_set_trigger(vdev, index, -1, handler);
+
+	if (start != 0 || count != 1)
+		return -EINVAL;
+
+	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+		int32_t fd = *(int32_t *)data;
+
+		return vfio_set_trigger(vdev, index, fd, handler);
+	}
+
+	if (flags & VFIO_IRQ_SET_DATA_NONE) {
+		handler(irq->hwirq, irq);
+
+	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+		uint8_t trigger = *(uint8_t *)data;
+
+		if (trigger)
+			handler(irq->hwirq, irq);
+	}
+
+	return 0;
+}
+
+int vfio_platform_set_irqs_ioctl(struct vfio_platform_device *vdev,
+				 uint32_t flags, unsigned index, unsigned start,
+				 unsigned count, void *data)
+{
+	int (*func)(struct vfio_platform_device *vdev, unsigned index,
+		    unsigned start, unsigned count, uint32_t flags,
+		    void *data) = NULL;
+
+	switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+	case VFIO_IRQ_SET_ACTION_MASK:
+		func = vfio_platform_set_irq_mask;
+		break;
+	case VFIO_IRQ_SET_ACTION_UNMASK:
+		func = vfio_platform_set_irq_unmask;
+		break;
+	case VFIO_IRQ_SET_ACTION_TRIGGER:
+		func = vfio_platform_set_irq_trigger;
+		break;
+	}
+
+	if (!func)
+		return -ENOTTY;
+
+	return func(vdev, index, start, count, flags, data);
+}
+
+int vfio_platform_irq_init(struct vfio_platform_device *vdev)
+{
+	int cnt = 0, i;
+
+	while (vdev->get_irq(vdev, cnt) >= 0)
+		cnt++;
+
+	vdev->irqs = kcalloc(cnt, sizeof(struct vfio_platform_irq),
+			     GFP_KERNEL_ACCOUNT);
+	if (!vdev->irqs)
+		return -ENOMEM;
+
+	for (i = 0; i < cnt; i++) {
+		int hwirq = vdev->get_irq(vdev, i);
+
+		if (hwirq < 0)
+			goto err;
+
+		spin_lock_init(&vdev->irqs[i].lock);
+
+		vdev->irqs[i].flags = VFIO_IRQ_INFO_EVENTFD;
+
+		if (irq_get_trigger_type(hwirq) & IRQ_TYPE_LEVEL_MASK)
+			vdev->irqs[i].flags |= VFIO_IRQ_INFO_MASKABLE
+						| VFIO_IRQ_INFO_AUTOMASKED;
+
+		vdev->irqs[i].count = 1;
+		vdev->irqs[i].hwirq = hwirq;
+		vdev->irqs[i].masked = false;
+	}
+
+	vdev->num_irqs = cnt;
+
+	return 0;
+err:
+	kfree(vdev->irqs);
+	return -EINVAL;
+}
+
+void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev)
+{
+	int i;
+
+	for (i = 0; i < vdev->num_irqs; i++)
+		vfio_set_trigger(vdev, i, -1, NULL);
+
+	vdev->num_irqs = 0;
+	kfree(vdev->irqs);
+}
diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h
new file mode 100644
index 0000000000..8d8fab5168
--- /dev/null
+++ b/drivers/vfio/platform/vfio_platform_private.h
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2013 - Virtual Open Systems
+ * Author: Antonios Motakis <a.motakis@virtualopensystems.com>
+ */
+
+#ifndef VFIO_PLATFORM_PRIVATE_H
+#define VFIO_PLATFORM_PRIVATE_H
+
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/vfio.h>
+
+#define VFIO_PLATFORM_OFFSET_SHIFT   40
+#define VFIO_PLATFORM_OFFSET_MASK (((u64)(1) << VFIO_PLATFORM_OFFSET_SHIFT) - 1)
+
+#define VFIO_PLATFORM_OFFSET_TO_INDEX(off)	\
+	(off >> VFIO_PLATFORM_OFFSET_SHIFT)
+
+#define VFIO_PLATFORM_INDEX_TO_OFFSET(index)	\
+	((u64)(index) << VFIO_PLATFORM_OFFSET_SHIFT)
+
+struct vfio_platform_irq {
+	u32			flags;
+	u32			count;
+	int			hwirq;
+	char			*name;
+	struct eventfd_ctx	*trigger;
+	bool			masked;
+	spinlock_t		lock;
+	struct virqfd		*unmask;
+	struct virqfd		*mask;
+};
+
+struct vfio_platform_region {
+	u64			addr;
+	resource_size_t		size;
+	u32			flags;
+	u32			type;
+#define VFIO_PLATFORM_REGION_TYPE_MMIO	1
+#define VFIO_PLATFORM_REGION_TYPE_PIO	2
+	void __iomem		*ioaddr;
+};
+
+struct vfio_platform_device {
+	struct vfio_device		vdev;
+	struct vfio_platform_region	*regions;
+	u32				num_regions;
+	struct vfio_platform_irq	*irqs;
+	u32				num_irqs;
+	struct mutex			igate;
+	const char			*compat;
+	const char			*acpihid;
+	struct module			*reset_module;
+	struct device			*device;
+
+	/*
+	 * These fields should be filled by the bus specific binder
+	 */
+	void		*opaque;
+	const char	*name;
+	uint32_t	flags;
+	/* callbacks to discover device resources */
+	struct resource*
+		(*get_resource)(struct vfio_platform_device *vdev, int i);
+	int	(*get_irq)(struct vfio_platform_device *vdev, int i);
+	int	(*of_reset)(struct vfio_platform_device *vdev);
+
+	bool				reset_required;
+};
+
+typedef int (*vfio_platform_reset_fn_t)(struct vfio_platform_device *vdev);
+
+struct vfio_platform_reset_node {
+	struct list_head link;
+	char *compat;
+	struct module *owner;
+	vfio_platform_reset_fn_t of_reset;
+};
+
+int vfio_platform_init_common(struct vfio_platform_device *vdev);
+void vfio_platform_release_common(struct vfio_platform_device *vdev);
+
+int vfio_platform_open_device(struct vfio_device *core_vdev);
+void vfio_platform_close_device(struct vfio_device *core_vdev);
+long vfio_platform_ioctl(struct vfio_device *core_vdev,
+			 unsigned int cmd, unsigned long arg);
+ssize_t vfio_platform_read(struct vfio_device *core_vdev,
+			   char __user *buf, size_t count,
+			   loff_t *ppos);
+ssize_t vfio_platform_write(struct vfio_device *core_vdev,
+			    const char __user *buf,
+			    size_t count, loff_t *ppos);
+int vfio_platform_mmap(struct vfio_device *core_vdev,
+		       struct vm_area_struct *vma);
+
+int vfio_platform_irq_init(struct vfio_platform_device *vdev);
+void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev);
+
+int vfio_platform_set_irqs_ioctl(struct vfio_platform_device *vdev,
+				 uint32_t flags, unsigned index,
+				 unsigned start, unsigned count, void *data);
+
+void __vfio_platform_register_reset(struct vfio_platform_reset_node *n);
+void vfio_platform_unregister_reset(const char *compat,
+				    vfio_platform_reset_fn_t fn);
+#define vfio_platform_register_reset(__compat, __reset)		\
+static struct vfio_platform_reset_node __reset ## _node = {	\
+	.owner = THIS_MODULE,					\
+	.compat = __compat,					\
+	.of_reset = __reset,					\
+};								\
+__vfio_platform_register_reset(&__reset ## _node)
+
+#define module_vfio_reset_handler(compat, reset)		\
+MODULE_ALIAS("vfio-reset:" compat);				\
+static int __init reset ## _module_init(void)			\
+{								\
+	vfio_platform_register_reset(compat, reset);		\
+	return 0;						\
+};								\
+static void __exit reset ## _module_exit(void)			\
+{								\
+	vfio_platform_unregister_reset(compat, reset);		\
+};								\
+module_init(reset ## _module_init);				\
+module_exit(reset ## _module_exit)
+
+#endif /* VFIO_PLATFORM_PRIVATE_H */
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
new file mode 100644
index 0000000000..307e3f29b5
--- /dev/null
+++ b/drivers/vfio/vfio.h
@@ -0,0 +1,451 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+#ifndef __VFIO_VFIO_H__
+#define __VFIO_VFIO_H__
+
+#include <linux/file.h>
+#include <linux/device.h>
+#include <linux/cdev.h>
+#include <linux/module.h>
+#include <linux/vfio.h>
+
+struct iommufd_ctx;
+struct iommu_group;
+struct vfio_container;
+
+struct vfio_device_file {
+	struct vfio_device *device;
+	struct vfio_group *group;
+
+	u8 access_granted;
+	u32 devid; /* only valid when iommufd is valid */
+	spinlock_t kvm_ref_lock; /* protect kvm field */
+	struct kvm *kvm;
+	struct iommufd_ctx *iommufd; /* protected by struct vfio_device_set::lock */
+};
+
+void vfio_device_put_registration(struct vfio_device *device);
+bool vfio_device_try_get_registration(struct vfio_device *device);
+int vfio_df_open(struct vfio_device_file *df);
+void vfio_df_close(struct vfio_device_file *df);
+struct vfio_device_file *
+vfio_allocate_device_file(struct vfio_device *device);
+
+extern const struct file_operations vfio_device_fops;
+
+#ifdef CONFIG_VFIO_NOIOMMU
+extern bool vfio_noiommu __read_mostly;
+#else
+enum { vfio_noiommu = false };
+#endif
+
+enum vfio_group_type {
+	/*
+	 * Physical device with IOMMU backing.
+	 */
+	VFIO_IOMMU,
+
+	/*
+	 * Virtual device without IOMMU backing. The VFIO core fakes up an
+	 * iommu_group as the iommu_group sysfs interface is part of the
+	 * userspace ABI.  The user of these devices must not be able to
+	 * directly trigger unmediated DMA.
+	 */
+	VFIO_EMULATED_IOMMU,
+
+	/*
+	 * Physical device without IOMMU backing. The VFIO core fakes up an
+	 * iommu_group as the iommu_group sysfs interface is part of the
+	 * userspace ABI.  Users can trigger unmediated DMA by the device,
+	 * usage is highly dangerous, requires an explicit opt-in and will
+	 * taint the kernel.
+	 */
+	VFIO_NO_IOMMU,
+};
+
+#if IS_ENABLED(CONFIG_VFIO_GROUP)
+struct vfio_group {
+	struct device 			dev;
+	struct cdev			cdev;
+	/*
+	 * When drivers is non-zero a driver is attached to the struct device
+	 * that provided the iommu_group and thus the iommu_group is a valid
+	 * pointer. When drivers is 0 the driver is being detached. Once users
+	 * reaches 0 then the iommu_group is invalid.
+	 */
+	refcount_t			drivers;
+	unsigned int			container_users;
+	struct iommu_group		*iommu_group;
+	struct vfio_container		*container;
+	struct list_head		device_list;
+	struct mutex			device_lock;
+	struct list_head		vfio_next;
+#if IS_ENABLED(CONFIG_VFIO_CONTAINER)
+	struct list_head		container_next;
+#endif
+	enum vfio_group_type		type;
+	struct mutex			group_lock;
+	struct kvm			*kvm;
+	struct file			*opened_file;
+	struct blocking_notifier_head	notifier;
+	struct iommufd_ctx		*iommufd;
+	spinlock_t			kvm_ref_lock;
+	unsigned int			cdev_device_open_cnt;
+};
+
+int vfio_device_block_group(struct vfio_device *device);
+void vfio_device_unblock_group(struct vfio_device *device);
+int vfio_device_set_group(struct vfio_device *device,
+			  enum vfio_group_type type);
+void vfio_device_remove_group(struct vfio_device *device);
+void vfio_device_group_register(struct vfio_device *device);
+void vfio_device_group_unregister(struct vfio_device *device);
+int vfio_device_group_use_iommu(struct vfio_device *device);
+void vfio_device_group_unuse_iommu(struct vfio_device *device);
+void vfio_df_group_close(struct vfio_device_file *df);
+struct vfio_group *vfio_group_from_file(struct file *file);
+bool vfio_group_enforced_coherent(struct vfio_group *group);
+void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm);
+bool vfio_device_has_container(struct vfio_device *device);
+int __init vfio_group_init(void);
+void vfio_group_cleanup(void);
+
+static inline bool vfio_device_is_noiommu(struct vfio_device *vdev)
+{
+	return IS_ENABLED(CONFIG_VFIO_NOIOMMU) &&
+	       vdev->group->type == VFIO_NO_IOMMU;
+}
+#else
+struct vfio_group;
+
+static inline int vfio_device_block_group(struct vfio_device *device)
+{
+	return 0;
+}
+
+static inline void vfio_device_unblock_group(struct vfio_device *device)
+{
+}
+
+static inline int vfio_device_set_group(struct vfio_device *device,
+					enum vfio_group_type type)
+{
+	return 0;
+}
+
+static inline void vfio_device_remove_group(struct vfio_device *device)
+{
+}
+
+static inline void vfio_device_group_register(struct vfio_device *device)
+{
+}
+
+static inline void vfio_device_group_unregister(struct vfio_device *device)
+{
+}
+
+static inline int vfio_device_group_use_iommu(struct vfio_device *device)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void vfio_device_group_unuse_iommu(struct vfio_device *device)
+{
+}
+
+static inline void vfio_df_group_close(struct vfio_device_file *df)
+{
+}
+
+static inline struct vfio_group *vfio_group_from_file(struct file *file)
+{
+	return NULL;
+}
+
+static inline bool vfio_group_enforced_coherent(struct vfio_group *group)
+{
+	return true;
+}
+
+static inline void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
+{
+}
+
+static inline bool vfio_device_has_container(struct vfio_device *device)
+{
+	return false;
+}
+
+static inline int __init vfio_group_init(void)
+{
+	return 0;
+}
+
+static inline void vfio_group_cleanup(void)
+{
+}
+
+static inline bool vfio_device_is_noiommu(struct vfio_device *vdev)
+{
+	return false;
+}
+#endif /* CONFIG_VFIO_GROUP */
+
+#if IS_ENABLED(CONFIG_VFIO_CONTAINER)
+/**
+ * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
+ */
+struct vfio_iommu_driver_ops {
+	char		*name;
+	struct module	*owner;
+	void		*(*open)(unsigned long arg);
+	void		(*release)(void *iommu_data);
+	long		(*ioctl)(void *iommu_data, unsigned int cmd,
+				 unsigned long arg);
+	int		(*attach_group)(void *iommu_data,
+					struct iommu_group *group,
+					enum vfio_group_type);
+	void		(*detach_group)(void *iommu_data,
+					struct iommu_group *group);
+	int		(*pin_pages)(void *iommu_data,
+				     struct iommu_group *group,
+				     dma_addr_t user_iova,
+				     int npage, int prot,
+				     struct page **pages);
+	void		(*unpin_pages)(void *iommu_data,
+				       dma_addr_t user_iova, int npage);
+	void		(*register_device)(void *iommu_data,
+					   struct vfio_device *vdev);
+	void		(*unregister_device)(void *iommu_data,
+					     struct vfio_device *vdev);
+	int		(*dma_rw)(void *iommu_data, dma_addr_t user_iova,
+				  void *data, size_t count, bool write);
+	struct iommu_domain *(*group_iommu_domain)(void *iommu_data,
+						   struct iommu_group *group);
+};
+
+struct vfio_iommu_driver {
+	const struct vfio_iommu_driver_ops	*ops;
+	struct list_head			vfio_next;
+};
+
+int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
+void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops);
+
+struct vfio_container *vfio_container_from_file(struct file *filep);
+int vfio_group_use_container(struct vfio_group *group);
+void vfio_group_unuse_container(struct vfio_group *group);
+int vfio_container_attach_group(struct vfio_container *container,
+				struct vfio_group *group);
+void vfio_group_detach_container(struct vfio_group *group);
+void vfio_device_container_register(struct vfio_device *device);
+void vfio_device_container_unregister(struct vfio_device *device);
+int vfio_device_container_pin_pages(struct vfio_device *device,
+				    dma_addr_t iova, int npage,
+				    int prot, struct page **pages);
+void vfio_device_container_unpin_pages(struct vfio_device *device,
+				       dma_addr_t iova, int npage);
+int vfio_device_container_dma_rw(struct vfio_device *device,
+				 dma_addr_t iova, void *data,
+				 size_t len, bool write);
+
+int __init vfio_container_init(void);
+void vfio_container_cleanup(void);
+#else
+static inline struct vfio_container *
+vfio_container_from_file(struct file *filep)
+{
+	return NULL;
+}
+
+static inline int vfio_group_use_container(struct vfio_group *group)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void vfio_group_unuse_container(struct vfio_group *group)
+{
+}
+
+static inline int vfio_container_attach_group(struct vfio_container *container,
+					      struct vfio_group *group)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void vfio_group_detach_container(struct vfio_group *group)
+{
+}
+
+static inline void vfio_device_container_register(struct vfio_device *device)
+{
+}
+
+static inline void vfio_device_container_unregister(struct vfio_device *device)
+{
+}
+
+static inline int vfio_device_container_pin_pages(struct vfio_device *device,
+						  dma_addr_t iova, int npage,
+						  int prot, struct page **pages)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void vfio_device_container_unpin_pages(struct vfio_device *device,
+						     dma_addr_t iova, int npage)
+{
+}
+
+static inline int vfio_device_container_dma_rw(struct vfio_device *device,
+					       dma_addr_t iova, void *data,
+					       size_t len, bool write)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int vfio_container_init(void)
+{
+	return 0;
+}
+static inline void vfio_container_cleanup(void)
+{
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IOMMUFD)
+bool vfio_iommufd_device_has_compat_ioas(struct vfio_device *vdev,
+					 struct iommufd_ctx *ictx);
+int vfio_df_iommufd_bind(struct vfio_device_file *df);
+void vfio_df_iommufd_unbind(struct vfio_device_file *df);
+int vfio_iommufd_compat_attach_ioas(struct vfio_device *device,
+				    struct iommufd_ctx *ictx);
+#else
+static inline bool
+vfio_iommufd_device_has_compat_ioas(struct vfio_device *vdev,
+				    struct iommufd_ctx *ictx)
+{
+	return false;
+}
+
+static inline int vfio_df_iommufd_bind(struct vfio_device_file *fd)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void vfio_df_iommufd_unbind(struct vfio_device_file *df)
+{
+}
+
+static inline int
+vfio_iommufd_compat_attach_ioas(struct vfio_device *device,
+				struct iommufd_ctx *ictx)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
+int vfio_df_ioctl_attach_pt(struct vfio_device_file *df,
+			    struct vfio_device_attach_iommufd_pt __user *arg);
+int vfio_df_ioctl_detach_pt(struct vfio_device_file *df,
+			    struct vfio_device_detach_iommufd_pt __user *arg);
+
+#if IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV)
+void vfio_init_device_cdev(struct vfio_device *device);
+
+static inline int vfio_device_add(struct vfio_device *device)
+{
+	/* cdev does not support noiommu device */
+	if (vfio_device_is_noiommu(device))
+		return device_add(&device->device);
+	vfio_init_device_cdev(device);
+	return cdev_device_add(&device->cdev, &device->device);
+}
+
+static inline void vfio_device_del(struct vfio_device *device)
+{
+	if (vfio_device_is_noiommu(device))
+		device_del(&device->device);
+	else
+		cdev_device_del(&device->cdev, &device->device);
+}
+
+int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep);
+long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df,
+				struct vfio_device_bind_iommufd __user *arg);
+void vfio_df_unbind_iommufd(struct vfio_device_file *df);
+int vfio_cdev_init(struct class *device_class);
+void vfio_cdev_cleanup(void);
+#else
+static inline void vfio_init_device_cdev(struct vfio_device *device)
+{
+}
+
+static inline int vfio_device_add(struct vfio_device *device)
+{
+	return device_add(&device->device);
+}
+
+static inline void vfio_device_del(struct vfio_device *device)
+{
+	device_del(&device->device);
+}
+
+static inline int vfio_device_fops_cdev_open(struct inode *inode,
+					     struct file *filep)
+{
+	return 0;
+}
+
+static inline long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df,
+					      struct vfio_device_bind_iommufd __user *arg)
+{
+	return -ENOTTY;
+}
+
+static inline void vfio_df_unbind_iommufd(struct vfio_device_file *df)
+{
+}
+
+static inline int vfio_cdev_init(struct class *device_class)
+{
+	return 0;
+}
+
+static inline void vfio_cdev_cleanup(void)
+{
+}
+#endif /* CONFIG_VFIO_DEVICE_CDEV */
+
+#if IS_ENABLED(CONFIG_VFIO_VIRQFD)
+int __init vfio_virqfd_init(void);
+void vfio_virqfd_exit(void);
+#else
+static inline int __init vfio_virqfd_init(void)
+{
+	return 0;
+}
+static inline void vfio_virqfd_exit(void)
+{
+}
+#endif
+
+#ifdef CONFIG_HAVE_KVM
+void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm);
+void vfio_device_put_kvm(struct vfio_device *device);
+#else
+static inline void vfio_device_get_kvm_safe(struct vfio_device *device,
+					    struct kvm *kvm)
+{
+}
+
+static inline void vfio_device_put_kvm(struct vfio_device *device)
+{
+}
+#endif
+
+#endif
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 0000000000..a94ec6225d
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,1374 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2013 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ * Copyright Gavin Shan, IBM Corporation 2014.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <linux/vmalloc.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/mm.h>
+#include "vfio.h"
+
+#include <asm/iommu.h>
+#include <asm/tce.h>
+#include <asm/mmu_context.h>
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "aik@ozlabs.ru"
+#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ *
+ * This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU
+ */
+
+struct tce_iommu_group {
+	struct list_head next;
+	struct iommu_group *grp;
+};
+
+/*
+ * A container needs to remember which preregistered region  it has
+ * referenced to do proper cleanup at the userspace process exit.
+ */
+struct tce_iommu_prereg {
+	struct list_head next;
+	struct mm_iommu_table_group_mem_t *mem;
+};
+
+/*
+ * The container descriptor supports only a single group per container.
+ * Required by the API as the container is not supplied with the IOMMU group
+ * at the moment of initialization.
+ */
+struct tce_container {
+	struct mutex lock;
+	bool enabled;
+	bool v2;
+	bool def_window_pending;
+	unsigned long locked_pages;
+	struct mm_struct *mm;
+	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+	struct list_head group_list;
+	struct list_head prereg_list;
+};
+
+static long tce_iommu_mm_set(struct tce_container *container)
+{
+	if (container->mm) {
+		if (container->mm == current->mm)
+			return 0;
+		return -EPERM;
+	}
+	BUG_ON(!current->mm);
+	container->mm = current->mm;
+	mmgrab(container->mm);
+
+	return 0;
+}
+
+static long tce_iommu_prereg_free(struct tce_container *container,
+		struct tce_iommu_prereg *tcemem)
+{
+	long ret;
+
+	ret = mm_iommu_put(container->mm, tcemem->mem);
+	if (ret)
+		return ret;
+
+	list_del(&tcemem->next);
+	kfree(tcemem);
+
+	return 0;
+}
+
+static long tce_iommu_unregister_pages(struct tce_container *container,
+		__u64 vaddr, __u64 size)
+{
+	struct mm_iommu_table_group_mem_t *mem;
+	struct tce_iommu_prereg *tcemem;
+	bool found = false;
+	long ret;
+
+	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
+		return -EINVAL;
+
+	mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
+	if (!mem)
+		return -ENOENT;
+
+	list_for_each_entry(tcemem, &container->prereg_list, next) {
+		if (tcemem->mem == mem) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found)
+		ret = -ENOENT;
+	else
+		ret = tce_iommu_prereg_free(container, tcemem);
+
+	mm_iommu_put(container->mm, mem);
+
+	return ret;
+}
+
+static long tce_iommu_register_pages(struct tce_container *container,
+		__u64 vaddr, __u64 size)
+{
+	long ret = 0;
+	struct mm_iommu_table_group_mem_t *mem = NULL;
+	struct tce_iommu_prereg *tcemem;
+	unsigned long entries = size >> PAGE_SHIFT;
+
+	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
+			((vaddr + size) < vaddr))
+		return -EINVAL;
+
+	mem = mm_iommu_get(container->mm, vaddr, entries);
+	if (mem) {
+		list_for_each_entry(tcemem, &container->prereg_list, next) {
+			if (tcemem->mem == mem) {
+				ret = -EBUSY;
+				goto put_exit;
+			}
+		}
+	} else {
+		ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
+		if (ret)
+			return ret;
+	}
+
+	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
+	if (!tcemem) {
+		ret = -ENOMEM;
+		goto put_exit;
+	}
+
+	tcemem->mem = mem;
+	list_add(&tcemem->next, &container->prereg_list);
+
+	container->enabled = true;
+
+	return 0;
+
+put_exit:
+	mm_iommu_put(container->mm, mem);
+	return ret;
+}
+
+static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
+		unsigned int it_page_shift)
+{
+	struct page *page;
+	unsigned long size = 0;
+
+	if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
+		return size == (1UL << it_page_shift);
+
+	page = pfn_to_page(hpa >> PAGE_SHIFT);
+	/*
+	 * Check that the TCE table granularity is not bigger than the size of
+	 * a page we just found. Otherwise the hardware can get access to
+	 * a bigger memory chunk that it should.
+	 */
+	return page_shift(compound_head(page)) >= it_page_shift;
+}
+
+static inline bool tce_groups_attached(struct tce_container *container)
+{
+	return !list_empty(&container->group_list);
+}
+
+static long tce_iommu_find_table(struct tce_container *container,
+		phys_addr_t ioba, struct iommu_table **ptbl)
+{
+	long i;
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbl = container->tables[i];
+
+		if (tbl) {
+			unsigned long entry = ioba >> tbl->it_page_shift;
+			unsigned long start = tbl->it_offset;
+			unsigned long end = start + tbl->it_size;
+
+			if ((start <= entry) && (entry < end)) {
+				*ptbl = tbl;
+				return i;
+			}
+		}
+	}
+
+	return -1;
+}
+
+static int tce_iommu_find_free_table(struct tce_container *container)
+{
+	int i;
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		if (!container->tables[i])
+			return i;
+	}
+
+	return -ENOSPC;
+}
+
+static int tce_iommu_enable(struct tce_container *container)
+{
+	int ret = 0;
+	unsigned long locked;
+	struct iommu_table_group *table_group;
+	struct tce_iommu_group *tcegrp;
+
+	if (container->enabled)
+		return -EBUSY;
+
+	/*
+	 * When userspace pages are mapped into the IOMMU, they are effectively
+	 * locked memory, so, theoretically, we need to update the accounting
+	 * of locked pages on each map and unmap.  For powerpc, the map unmap
+	 * paths can be very hot, though, and the accounting would kill
+	 * performance, especially since it would be difficult to impossible
+	 * to handle the accounting in real mode only.
+	 *
+	 * To address that, rather than precisely accounting every page, we
+	 * instead account for a worst case on locked memory when the iommu is
+	 * enabled and disabled.  The worst case upper bound on locked memory
+	 * is the size of the whole iommu window, which is usually relatively
+	 * small (compared to total memory sizes) on POWER hardware.
+	 *
+	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
+	 * that would effectively kill the guest at random points, much better
+	 * enforcing the limit based on the max that the guest can map.
+	 *
+	 * Unfortunately at the moment it counts whole tables, no matter how
+	 * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
+	 * each with 2GB DMA window, 8GB will be counted here. The reason for
+	 * this is that we cannot tell here the amount of RAM used by the guest
+	 * as this information is only available from KVM and VFIO is
+	 * KVM agnostic.
+	 *
+	 * So we do not allow enabling a container without a group attached
+	 * as there is no way to know how much we should increment
+	 * the locked_vm counter.
+	 */
+	if (!tce_groups_attached(container))
+		return -ENODEV;
+
+	tcegrp = list_first_entry(&container->group_list,
+			struct tce_iommu_group, next);
+	table_group = iommu_group_get_iommudata(tcegrp->grp);
+	if (!table_group)
+		return -ENODEV;
+
+	if (!table_group->tce32_size)
+		return -EPERM;
+
+	ret = tce_iommu_mm_set(container);
+	if (ret)
+		return ret;
+
+	locked = table_group->tce32_size >> PAGE_SHIFT;
+	ret = account_locked_vm(container->mm, locked, true);
+	if (ret)
+		return ret;
+
+	container->locked_pages = locked;
+
+	container->enabled = true;
+
+	return ret;
+}
+
+static void tce_iommu_disable(struct tce_container *container)
+{
+	if (!container->enabled)
+		return;
+
+	container->enabled = false;
+
+	BUG_ON(!container->mm);
+	account_locked_vm(container->mm, container->locked_pages, false);
+}
+
+static void *tce_iommu_open(unsigned long arg)
+{
+	struct tce_container *container;
+
+	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
+		pr_err("tce_vfio: Wrong IOMMU type\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	container = kzalloc(sizeof(*container), GFP_KERNEL);
+	if (!container)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&container->lock);
+	INIT_LIST_HEAD_RCU(&container->group_list);
+	INIT_LIST_HEAD_RCU(&container->prereg_list);
+
+	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
+
+	return container;
+}
+
+static int tce_iommu_clear(struct tce_container *container,
+		struct iommu_table *tbl,
+		unsigned long entry, unsigned long pages);
+static void tce_iommu_free_table(struct tce_container *container,
+		struct iommu_table *tbl);
+
+static void tce_iommu_release(void *iommu_data)
+{
+	struct tce_container *container = iommu_data;
+	struct tce_iommu_group *tcegrp;
+	struct tce_iommu_prereg *tcemem, *tmtmp;
+	long i;
+
+	while (tce_groups_attached(container)) {
+		tcegrp = list_first_entry(&container->group_list,
+				struct tce_iommu_group, next);
+		tce_iommu_detach_group(iommu_data, tcegrp->grp);
+	}
+
+	/*
+	 * If VFIO created a table, it was not disposed
+	 * by tce_iommu_detach_group() so do it now.
+	 */
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbl = container->tables[i];
+
+		if (!tbl)
+			continue;
+
+		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+		tce_iommu_free_table(container, tbl);
+	}
+
+	list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
+		WARN_ON(tce_iommu_prereg_free(container, tcemem));
+
+	tce_iommu_disable(container);
+	if (container->mm)
+		mmdrop(container->mm);
+	mutex_destroy(&container->lock);
+
+	kfree(container);
+}
+
+static void tce_iommu_unuse_page(unsigned long hpa)
+{
+	struct page *page;
+
+	page = pfn_to_page(hpa >> PAGE_SHIFT);
+	unpin_user_page(page);
+}
+
+static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
+		unsigned long tce, unsigned long shift,
+		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
+{
+	long ret = 0;
+	struct mm_iommu_table_group_mem_t *mem;
+
+	mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
+	if (!mem)
+		return -EINVAL;
+
+	ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
+	if (ret)
+		return -EINVAL;
+
+	*pmem = mem;
+
+	return 0;
+}
+
+static void tce_iommu_unuse_page_v2(struct tce_container *container,
+		struct iommu_table *tbl, unsigned long entry)
+{
+	struct mm_iommu_table_group_mem_t *mem = NULL;
+	int ret;
+	unsigned long hpa = 0;
+	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
+
+	if (!pua)
+		return;
+
+	ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
+			tbl->it_page_shift, &hpa, &mem);
+	if (ret)
+		pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
+				__func__, be64_to_cpu(*pua), entry, ret);
+	if (mem)
+		mm_iommu_mapped_dec(mem);
+
+	*pua = cpu_to_be64(0);
+}
+
+static int tce_iommu_clear(struct tce_container *container,
+		struct iommu_table *tbl,
+		unsigned long entry, unsigned long pages)
+{
+	unsigned long oldhpa;
+	long ret;
+	enum dma_data_direction direction;
+	unsigned long lastentry = entry + pages, firstentry = entry;
+
+	for ( ; entry < lastentry; ++entry) {
+		if (tbl->it_indirect_levels && tbl->it_userspace) {
+			/*
+			 * For multilevel tables, we can take a shortcut here
+			 * and skip some TCEs as we know that the userspace
+			 * addresses cache is a mirror of the real TCE table
+			 * and if it is missing some indirect levels, then
+			 * the hardware table does not have them allocated
+			 * either and therefore does not require updating.
+			 */
+			__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
+					entry);
+			if (!pua) {
+				/* align to level_size which is power of two */
+				entry |= tbl->it_level_size - 1;
+				continue;
+			}
+		}
+
+		cond_resched();
+
+		direction = DMA_NONE;
+		oldhpa = 0;
+		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
+				&direction);
+		if (ret)
+			continue;
+
+		if (direction == DMA_NONE)
+			continue;
+
+		if (container->v2) {
+			tce_iommu_unuse_page_v2(container, tbl, entry);
+			continue;
+		}
+
+		tce_iommu_unuse_page(oldhpa);
+	}
+
+	iommu_tce_kill(tbl, firstentry, pages);
+
+	return 0;
+}
+
+static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
+{
+	struct page *page = NULL;
+	enum dma_data_direction direction = iommu_tce_direction(tce);
+
+	if (pin_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
+			&page) != 1)
+		return -EFAULT;
+
+	*hpa = __pa((unsigned long) page_address(page));
+
+	return 0;
+}
+
+static long tce_iommu_build(struct tce_container *container,
+		struct iommu_table *tbl,
+		unsigned long entry, unsigned long tce, unsigned long pages,
+		enum dma_data_direction direction)
+{
+	long i, ret = 0;
+	unsigned long hpa;
+	enum dma_data_direction dirtmp;
+
+	for (i = 0; i < pages; ++i) {
+		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
+
+		ret = tce_iommu_use_page(tce, &hpa);
+		if (ret)
+			break;
+
+		if (!tce_page_is_contained(container->mm, hpa,
+				tbl->it_page_shift)) {
+			ret = -EPERM;
+			break;
+		}
+
+		hpa |= offset;
+		dirtmp = direction;
+		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
+				&hpa, &dirtmp);
+		if (ret) {
+			tce_iommu_unuse_page(hpa);
+			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
+					__func__, entry << tbl->it_page_shift,
+					tce, ret);
+			break;
+		}
+
+		if (dirtmp != DMA_NONE)
+			tce_iommu_unuse_page(hpa);
+
+		tce += IOMMU_PAGE_SIZE(tbl);
+	}
+
+	if (ret)
+		tce_iommu_clear(container, tbl, entry, i);
+	else
+		iommu_tce_kill(tbl, entry, pages);
+
+	return ret;
+}
+
+static long tce_iommu_build_v2(struct tce_container *container,
+		struct iommu_table *tbl,
+		unsigned long entry, unsigned long tce, unsigned long pages,
+		enum dma_data_direction direction)
+{
+	long i, ret = 0;
+	unsigned long hpa;
+	enum dma_data_direction dirtmp;
+
+	for (i = 0; i < pages; ++i) {
+		struct mm_iommu_table_group_mem_t *mem = NULL;
+		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);
+
+		ret = tce_iommu_prereg_ua_to_hpa(container,
+				tce, tbl->it_page_shift, &hpa, &mem);
+		if (ret)
+			break;
+
+		if (!tce_page_is_contained(container->mm, hpa,
+				tbl->it_page_shift)) {
+			ret = -EPERM;
+			break;
+		}
+
+		/* Preserve offset within IOMMU page */
+		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
+		dirtmp = direction;
+
+		/* The registered region is being unregistered */
+		if (mm_iommu_mapped_inc(mem))
+			break;
+
+		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
+				&hpa, &dirtmp);
+		if (ret) {
+			/* dirtmp cannot be DMA_NONE here */
+			tce_iommu_unuse_page_v2(container, tbl, entry + i);
+			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
+					__func__, entry << tbl->it_page_shift,
+					tce, ret);
+			break;
+		}
+
+		if (dirtmp != DMA_NONE)
+			tce_iommu_unuse_page_v2(container, tbl, entry + i);
+
+		*pua = cpu_to_be64(tce);
+
+		tce += IOMMU_PAGE_SIZE(tbl);
+	}
+
+	if (ret)
+		tce_iommu_clear(container, tbl, entry, i);
+	else
+		iommu_tce_kill(tbl, entry, pages);
+
+	return ret;
+}
+
+static long tce_iommu_create_table(struct tce_container *container,
+			struct iommu_table_group *table_group,
+			int num,
+			__u32 page_shift,
+			__u64 window_size,
+			__u32 levels,
+			struct iommu_table **ptbl)
+{
+	long ret, table_size;
+
+	table_size = table_group->ops->get_table_size(page_shift, window_size,
+			levels);
+	if (!table_size)
+		return -EINVAL;
+
+	ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true);
+	if (ret)
+		return ret;
+
+	ret = table_group->ops->create_table(table_group, num,
+			page_shift, window_size, levels, ptbl);
+
+	WARN_ON(!ret && !(*ptbl)->it_ops->free);
+	WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));
+
+	return ret;
+}
+
+static void tce_iommu_free_table(struct tce_container *container,
+		struct iommu_table *tbl)
+{
+	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
+
+	iommu_tce_table_put(tbl);
+	account_locked_vm(container->mm, pages, false);
+}
+
+static long tce_iommu_create_window(struct tce_container *container,
+		__u32 page_shift, __u64 window_size, __u32 levels,
+		__u64 *start_addr)
+{
+	struct tce_iommu_group *tcegrp;
+	struct iommu_table_group *table_group;
+	struct iommu_table *tbl = NULL;
+	long ret, num;
+
+	num = tce_iommu_find_free_table(container);
+	if (num < 0)
+		return num;
+
+	/* Get the first group for ops::create_table */
+	tcegrp = list_first_entry(&container->group_list,
+			struct tce_iommu_group, next);
+	table_group = iommu_group_get_iommudata(tcegrp->grp);
+	if (!table_group)
+		return -EFAULT;
+
+	if (!(table_group->pgsizes & (1ULL << page_shift)))
+		return -EINVAL;
+
+	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
+			!table_group->ops->get_table_size ||
+			!table_group->ops->create_table)
+		return -EPERM;
+
+	/* Create TCE table */
+	ret = tce_iommu_create_table(container, table_group, num,
+			page_shift, window_size, levels, &tbl);
+	if (ret)
+		return ret;
+
+	BUG_ON(!tbl->it_ops->free);
+
+	/*
+	 * Program the table to every group.
+	 * Groups have been tested for compatibility at the attach time.
+	 */
+	list_for_each_entry(tcegrp, &container->group_list, next) {
+		table_group = iommu_group_get_iommudata(tcegrp->grp);
+
+		ret = table_group->ops->set_window(table_group, num, tbl);
+		if (ret)
+			goto unset_exit;
+	}
+
+	container->tables[num] = tbl;
+
+	/* Return start address assigned by platform in create_table() */
+	*start_addr = tbl->it_offset << tbl->it_page_shift;
+
+	return 0;
+
+unset_exit:
+	list_for_each_entry(tcegrp, &container->group_list, next) {
+		table_group = iommu_group_get_iommudata(tcegrp->grp);
+		table_group->ops->unset_window(table_group, num);
+	}
+	tce_iommu_free_table(container, tbl);
+
+	return ret;
+}
+
+static long tce_iommu_remove_window(struct tce_container *container,
+		__u64 start_addr)
+{
+	struct iommu_table_group *table_group = NULL;
+	struct iommu_table *tbl;
+	struct tce_iommu_group *tcegrp;
+	int num;
+
+	num = tce_iommu_find_table(container, start_addr, &tbl);
+	if (num < 0)
+		return -EINVAL;
+
+	BUG_ON(!tbl->it_size);
+
+	/* Detach groups from IOMMUs */
+	list_for_each_entry(tcegrp, &container->group_list, next) {
+		table_group = iommu_group_get_iommudata(tcegrp->grp);
+
+		/*
+		 * SPAPR TCE IOMMU exposes the default DMA window to
+		 * the guest via dma32_window_start/size of
+		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
+		 * the userspace to remove this window, some do not so
+		 * here we check for the platform capability.
+		 */
+		if (!table_group->ops || !table_group->ops->unset_window)
+			return -EPERM;
+
+		table_group->ops->unset_window(table_group, num);
+	}
+
+	/* Free table */
+	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+	tce_iommu_free_table(container, tbl);
+	container->tables[num] = NULL;
+
+	return 0;
+}
+
+static long tce_iommu_create_default_window(struct tce_container *container)
+{
+	long ret;
+	__u64 start_addr = 0;
+	struct tce_iommu_group *tcegrp;
+	struct iommu_table_group *table_group;
+
+	if (!container->def_window_pending)
+		return 0;
+
+	if (!tce_groups_attached(container))
+		return -ENODEV;
+
+	tcegrp = list_first_entry(&container->group_list,
+			struct tce_iommu_group, next);
+	table_group = iommu_group_get_iommudata(tcegrp->grp);
+	if (!table_group)
+		return -ENODEV;
+
+	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
+			table_group->tce32_size, 1, &start_addr);
+	WARN_ON_ONCE(!ret && start_addr);
+
+	if (!ret)
+		container->def_window_pending = false;
+
+	return ret;
+}
+
+static long vfio_spapr_ioctl_eeh_pe_op(struct iommu_group *group,
+				       unsigned long arg)
+{
+	struct eeh_pe *pe;
+	struct vfio_eeh_pe_op op;
+	unsigned long minsz;
+
+	pe = eeh_iommu_group_to_pe(group);
+	if (!pe)
+		return -ENODEV;
+
+	minsz = offsetofend(struct vfio_eeh_pe_op, op);
+	if (copy_from_user(&op, (void __user *)arg, minsz))
+		return -EFAULT;
+	if (op.argsz < minsz || op.flags)
+		return -EINVAL;
+
+	switch (op.op) {
+	case VFIO_EEH_PE_DISABLE:
+		return eeh_pe_set_option(pe, EEH_OPT_DISABLE);
+	case VFIO_EEH_PE_ENABLE:
+		return eeh_pe_set_option(pe, EEH_OPT_ENABLE);
+	case VFIO_EEH_PE_UNFREEZE_IO:
+		return eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO);
+	case VFIO_EEH_PE_UNFREEZE_DMA:
+		return eeh_pe_set_option(pe, EEH_OPT_THAW_DMA);
+	case VFIO_EEH_PE_GET_STATE:
+		return eeh_pe_get_state(pe);
+		break;
+	case VFIO_EEH_PE_RESET_DEACTIVATE:
+		return eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true);
+	case VFIO_EEH_PE_RESET_HOT:
+		return eeh_pe_reset(pe, EEH_RESET_HOT, true);
+	case VFIO_EEH_PE_RESET_FUNDAMENTAL:
+		return eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true);
+	case VFIO_EEH_PE_CONFIGURE:
+		return eeh_pe_configure(pe);
+	case VFIO_EEH_PE_INJECT_ERR:
+		minsz = offsetofend(struct vfio_eeh_pe_op, err.mask);
+		if (op.argsz < minsz)
+			return -EINVAL;
+		if (copy_from_user(&op, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		return eeh_pe_inject_err(pe, op.err.type, op.err.func,
+					 op.err.addr, op.err.mask);
+	default:
+		return -EINVAL;
+	}
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct tce_container *container = iommu_data;
+	unsigned long minsz, ddwsz;
+	long ret;
+
+	switch (cmd) {
+	case VFIO_CHECK_EXTENSION:
+		switch (arg) {
+		case VFIO_SPAPR_TCE_IOMMU:
+		case VFIO_SPAPR_TCE_v2_IOMMU:
+			return 1;
+		case VFIO_EEH:
+			return eeh_enabled();
+		default:
+			return 0;
+		}
+	}
+
+	/*
+	 * Sanity check to prevent one userspace from manipulating
+	 * another userspace mm.
+	 */
+	BUG_ON(!container);
+	if (container->mm && container->mm != current->mm)
+		return -EPERM;
+
+	switch (cmd) {
+	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+		struct vfio_iommu_spapr_tce_info info;
+		struct tce_iommu_group *tcegrp;
+		struct iommu_table_group *table_group;
+
+		if (!tce_groups_attached(container))
+			return -ENXIO;
+
+		tcegrp = list_first_entry(&container->group_list,
+				struct tce_iommu_group, next);
+		table_group = iommu_group_get_iommudata(tcegrp->grp);
+
+		if (!table_group)
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+				dma32_window_size);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		info.dma32_window_start = table_group->tce32_start;
+		info.dma32_window_size = table_group->tce32_size;
+		info.flags = 0;
+		memset(&info.ddw, 0, sizeof(info.ddw));
+
+		if (table_group->max_dynamic_windows_supported &&
+				container->v2) {
+			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
+			info.ddw.pgsizes = table_group->pgsizes;
+			info.ddw.max_dynamic_windows_supported =
+				table_group->max_dynamic_windows_supported;
+			info.ddw.levels = table_group->max_levels;
+		}
+
+		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);
+
+		if (info.argsz >= ddwsz)
+			minsz = ddwsz;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+	case VFIO_IOMMU_MAP_DMA: {
+		struct vfio_iommu_type1_dma_map param;
+		struct iommu_table *tbl = NULL;
+		long num;
+		enum dma_data_direction direction;
+
+		if (!container->enabled)
+			return -EPERM;
+
+		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
+				VFIO_DMA_MAP_FLAG_WRITE))
+			return -EINVAL;
+
+		ret = tce_iommu_create_default_window(container);
+		if (ret)
+			return ret;
+
+		num = tce_iommu_find_table(container, param.iova, &tbl);
+		if (num < 0)
+			return -ENXIO;
+
+		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
+				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
+			return -EINVAL;
+
+		/* iova is checked by the IOMMU API */
+		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
+			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+				direction = DMA_BIDIRECTIONAL;
+			else
+				direction = DMA_TO_DEVICE;
+		} else {
+			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+				direction = DMA_FROM_DEVICE;
+			else
+				return -EINVAL;
+		}
+
+		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
+		if (ret)
+			return ret;
+
+		if (container->v2)
+			ret = tce_iommu_build_v2(container, tbl,
+					param.iova >> tbl->it_page_shift,
+					param.vaddr,
+					param.size >> tbl->it_page_shift,
+					direction);
+		else
+			ret = tce_iommu_build(container, tbl,
+					param.iova >> tbl->it_page_shift,
+					param.vaddr,
+					param.size >> tbl->it_page_shift,
+					direction);
+
+		iommu_flush_tce(tbl);
+
+		return ret;
+	}
+	case VFIO_IOMMU_UNMAP_DMA: {
+		struct vfio_iommu_type1_dma_unmap param;
+		struct iommu_table *tbl = NULL;
+		long num;
+
+		if (!container->enabled)
+			return -EPERM;
+
+		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
+				size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		/* No flag is supported now */
+		if (param.flags)
+			return -EINVAL;
+
+		ret = tce_iommu_create_default_window(container);
+		if (ret)
+			return ret;
+
+		num = tce_iommu_find_table(container, param.iova, &tbl);
+		if (num < 0)
+			return -ENXIO;
+
+		if (param.size & ~IOMMU_PAGE_MASK(tbl))
+			return -EINVAL;
+
+		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
+				param.size >> tbl->it_page_shift);
+		if (ret)
+			return ret;
+
+		ret = tce_iommu_clear(container, tbl,
+				param.iova >> tbl->it_page_shift,
+				param.size >> tbl->it_page_shift);
+		iommu_flush_tce(tbl);
+
+		return ret;
+	}
+	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
+		struct vfio_iommu_spapr_register_memory param;
+
+		if (!container->v2)
+			break;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
+				size);
+
+		ret = tce_iommu_mm_set(container);
+		if (ret)
+			return ret;
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		/* No flag is supported now */
+		if (param.flags)
+			return -EINVAL;
+
+		mutex_lock(&container->lock);
+		ret = tce_iommu_register_pages(container, param.vaddr,
+				param.size);
+		mutex_unlock(&container->lock);
+
+		return ret;
+	}
+	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
+		struct vfio_iommu_spapr_register_memory param;
+
+		if (!container->v2)
+			break;
+
+		if (!container->mm)
+			return -EPERM;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
+				size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		/* No flag is supported now */
+		if (param.flags)
+			return -EINVAL;
+
+		mutex_lock(&container->lock);
+		ret = tce_iommu_unregister_pages(container, param.vaddr,
+				param.size);
+		mutex_unlock(&container->lock);
+
+		return ret;
+	}
+	case VFIO_IOMMU_ENABLE:
+		if (container->v2)
+			break;
+
+		mutex_lock(&container->lock);
+		ret = tce_iommu_enable(container);
+		mutex_unlock(&container->lock);
+		return ret;
+
+
+	case VFIO_IOMMU_DISABLE:
+		if (container->v2)
+			break;
+
+		mutex_lock(&container->lock);
+		tce_iommu_disable(container);
+		mutex_unlock(&container->lock);
+		return 0;
+
+	case VFIO_EEH_PE_OP: {
+		struct tce_iommu_group *tcegrp;
+
+		ret = 0;
+		list_for_each_entry(tcegrp, &container->group_list, next) {
+			ret = vfio_spapr_ioctl_eeh_pe_op(tcegrp->grp, arg);
+			if (ret)
+				return ret;
+		}
+		return ret;
+	}
+
+	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
+		struct vfio_iommu_spapr_tce_create create;
+
+		if (!container->v2)
+			break;
+
+		ret = tce_iommu_mm_set(container);
+		if (ret)
+			return ret;
+
+		if (!tce_groups_attached(container))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
+				start_addr);
+
+		if (copy_from_user(&create, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (create.argsz < minsz)
+			return -EINVAL;
+
+		if (create.flags)
+			return -EINVAL;
+
+		mutex_lock(&container->lock);
+
+		ret = tce_iommu_create_default_window(container);
+		if (!ret)
+			ret = tce_iommu_create_window(container,
+					create.page_shift,
+					create.window_size, create.levels,
+					&create.start_addr);
+
+		mutex_unlock(&container->lock);
+
+		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
+			ret = -EFAULT;
+
+		return ret;
+	}
+	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
+		struct vfio_iommu_spapr_tce_remove remove;
+
+		if (!container->v2)
+			break;
+
+		ret = tce_iommu_mm_set(container);
+		if (ret)
+			return ret;
+
+		if (!tce_groups_attached(container))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
+				start_addr);
+
+		if (copy_from_user(&remove, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (remove.argsz < minsz)
+			return -EINVAL;
+
+		if (remove.flags)
+			return -EINVAL;
+
+		if (container->def_window_pending && !remove.start_addr) {
+			container->def_window_pending = false;
+			return 0;
+		}
+
+		mutex_lock(&container->lock);
+
+		ret = tce_iommu_remove_window(container, remove.start_addr);
+
+		mutex_unlock(&container->lock);
+
+		return ret;
+	}
+	}
+
+	return -ENOTTY;
+}
+
+static void tce_iommu_release_ownership(struct tce_container *container,
+		struct iommu_table_group *table_group)
+{
+	long i;
+
+	if (!table_group->ops->unset_window) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
+		if (container->tables[i])
+			table_group->ops->unset_window(table_group, i);
+}
+
+static long tce_iommu_take_ownership(struct tce_container *container,
+		struct iommu_table_group *table_group)
+{
+	long i, ret = 0;
+
+	/* Set all windows to the new group */
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbl = container->tables[i];
+
+		if (!tbl)
+			continue;
+
+		ret = table_group->ops->set_window(table_group, i, tbl);
+		if (ret)
+			goto release_exit;
+	}
+
+	return 0;
+
+release_exit:
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
+		table_group->ops->unset_window(table_group, i);
+
+	return ret;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group, enum vfio_group_type type)
+{
+	int ret = 0;
+	struct tce_container *container = iommu_data;
+	struct iommu_table_group *table_group;
+	struct tce_iommu_group *tcegrp = NULL;
+
+	if (type == VFIO_EMULATED_IOMMU)
+		return -EINVAL;
+
+	mutex_lock(&container->lock);
+
+	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group); */
+	table_group = iommu_group_get_iommudata(iommu_group);
+	if (!table_group) {
+		ret = -ENODEV;
+		goto unlock_exit;
+	}
+
+	/* v2 requires full support of dynamic DMA windows */
+	if (container->v2 && table_group->max_dynamic_windows_supported == 0) {
+		ret = -EINVAL;
+		goto unlock_exit;
+	}
+
+	/* v1 reuses TCE tables and does not share them among PEs */
+	if (!container->v2 && tce_groups_attached(container)) {
+		ret = -EBUSY;
+		goto unlock_exit;
+	}
+
+	/*
+	 * Check if new group has the same iommu_table_group_ops
+	 * (i.e. compatible)
+	 */
+	list_for_each_entry(tcegrp, &container->group_list, next) {
+		struct iommu_table_group *table_group_tmp;
+
+		if (tcegrp->grp == iommu_group) {
+			pr_warn("tce_vfio: Group %d is already attached\n",
+					iommu_group_id(iommu_group));
+			ret = -EBUSY;
+			goto unlock_exit;
+		}
+		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
+		if (table_group_tmp->ops->create_table !=
+				table_group->ops->create_table) {
+			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
+					iommu_group_id(iommu_group),
+					iommu_group_id(tcegrp->grp));
+			ret = -EPERM;
+			goto unlock_exit;
+		}
+	}
+
+	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
+	if (!tcegrp) {
+		ret = -ENOMEM;
+		goto unlock_exit;
+	}
+
+	ret = tce_iommu_take_ownership(container, table_group);
+	if (!tce_groups_attached(container) && !container->tables[0])
+		container->def_window_pending = true;
+
+	if (!ret) {
+		tcegrp->grp = iommu_group;
+		list_add(&tcegrp->next, &container->group_list);
+	}
+
+	if (ret && tcegrp)
+		kfree(tcegrp);
+
+unlock_exit:
+	mutex_unlock(&container->lock);
+
+	return ret;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table_group *table_group;
+	bool found = false;
+	struct tce_iommu_group *tcegrp;
+
+	mutex_lock(&container->lock);
+
+	list_for_each_entry(tcegrp, &container->group_list, next) {
+		if (tcegrp->grp == iommu_group) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		pr_warn("tce_vfio: detaching unattached group #%u\n",
+				iommu_group_id(iommu_group));
+		goto unlock_exit;
+	}
+
+	list_del(&tcegrp->next);
+	kfree(tcegrp);
+
+	table_group = iommu_group_get_iommudata(iommu_group);
+	BUG_ON(!table_group);
+
+	tce_iommu_release_ownership(container, table_group);
+
+unlock_exit:
+	mutex_unlock(&container->lock);
+}
+
+static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+	.name		= "iommu-vfio-powerpc",
+	.owner		= THIS_MODULE,
+	.open		= tce_iommu_open,
+	.release	= tce_iommu_release,
+	.ioctl		= tce_iommu_ioctl,
+	.attach_group	= tce_iommu_attach_group,
+	.detach_group	= tce_iommu_detach_group,
+};
+
+static int __init tce_iommu_init(void)
+{
+	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
new file mode 100644
index 0000000000..eacd6ec04d
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -0,0 +1,3200 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VFIO: IOMMU DMA mapping support for Type1 IOMMU
+ *
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ *
+ * We arbitrarily define a Type1 IOMMU as one matching the below code.
+ * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
+ * VT-d, but that makes it harder to re-use as theoretically anyone
+ * implementing a similar IOMMU could make use of this.  We expect the
+ * IOMMU to support the IOMMU API and have few to no restrictions around
+ * the IOVA range that can be mapped.  The Type1 IOMMU is currently
+ * optimized for relatively static mappings of a userspace process with
+ * userspace pages pinned into memory.  We also assume devices and IOMMU
+ * domains are PCI based as the IOMMU API is still centered around a
+ * device/bus interface rather than a group interface.
+ */
+
+#include <linux/compat.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/kthread.h>
+#include <linux/rbtree.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/mm.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/workqueue.h>
+#include <linux/notifier.h>
+#include "vfio.h"
+
+#define DRIVER_VERSION  "0.2"
+#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
+#define DRIVER_DESC     "Type1 IOMMU driver for VFIO"
+
+static bool allow_unsafe_interrupts;
+module_param_named(allow_unsafe_interrupts,
+		   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(allow_unsafe_interrupts,
+		 "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
+
+static bool disable_hugepages;
+module_param_named(disable_hugepages,
+		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(disable_hugepages,
+		 "Disable VFIO IOMMU support for IOMMU hugepages.");
+
+static unsigned int dma_entry_limit __read_mostly = U16_MAX;
+module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
+MODULE_PARM_DESC(dma_entry_limit,
+		 "Maximum number of user DMA mappings per container (65535).");
+
+struct vfio_iommu {
+	struct list_head	domain_list;
+	struct list_head	iova_list;
+	struct mutex		lock;
+	struct rb_root		dma_list;
+	struct list_head	device_list;
+	struct mutex		device_list_lock;
+	unsigned int		dma_avail;
+	unsigned int		vaddr_invalid_count;
+	uint64_t		pgsize_bitmap;
+	uint64_t		num_non_pinned_groups;
+	bool			v2;
+	bool			nesting;
+	bool			dirty_page_tracking;
+	struct list_head	emulated_iommu_groups;
+};
+
+struct vfio_domain {
+	struct iommu_domain	*domain;
+	struct list_head	next;
+	struct list_head	group_list;
+	bool			fgsp : 1;	/* Fine-grained super pages */
+	bool			enforce_cache_coherency : 1;
+};
+
+struct vfio_dma {
+	struct rb_node		node;
+	dma_addr_t		iova;		/* Device address */
+	unsigned long		vaddr;		/* Process virtual addr */
+	size_t			size;		/* Map size (bytes) */
+	int			prot;		/* IOMMU_READ/WRITE */
+	bool			iommu_mapped;
+	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
+	bool			vaddr_invalid;
+	struct task_struct	*task;
+	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
+	unsigned long		*bitmap;
+	struct mm_struct	*mm;
+	size_t			locked_vm;
+};
+
+struct vfio_batch {
+	struct page		**pages;	/* for pin_user_pages_remote */
+	struct page		*fallback_page; /* if pages alloc fails */
+	int			capacity;	/* length of pages array */
+	int			size;		/* of batch currently */
+	int			offset;		/* of next entry in pages */
+};
+
+struct vfio_iommu_group {
+	struct iommu_group	*iommu_group;
+	struct list_head	next;
+	bool			pinned_page_dirty_scope;
+};
+
+struct vfio_iova {
+	struct list_head	list;
+	dma_addr_t		start;
+	dma_addr_t		end;
+};
+
+/*
+ * Guest RAM pinning working set or DMA target
+ */
+struct vfio_pfn {
+	struct rb_node		node;
+	dma_addr_t		iova;		/* Device address */
+	unsigned long		pfn;		/* Host pfn */
+	unsigned int		ref_count;
+};
+
+struct vfio_regions {
+	struct list_head list;
+	dma_addr_t iova;
+	phys_addr_t phys;
+	size_t len;
+};
+
+#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
+
+/*
+ * Input argument of number of bits to bitmap_set() is unsigned integer, which
+ * further casts to signed integer for unaligned multi-bit operation,
+ * __bitmap_set().
+ * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
+ * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
+ * system.
+ */
+#define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
+#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+
+static int put_pfn(unsigned long pfn, int prot);
+
+static struct vfio_iommu_group*
+vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
+			    struct iommu_group *iommu_group);
+
+/*
+ * This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU
+ */
+
+static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
+				      dma_addr_t start, size_t size)
+{
+	struct rb_node *node = iommu->dma_list.rb_node;
+
+	while (node) {
+		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
+
+		if (start + size <= dma->iova)
+			node = node->rb_left;
+		else if (start >= dma->iova + dma->size)
+			node = node->rb_right;
+		else
+			return dma;
+	}
+
+	return NULL;
+}
+
+static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
+						dma_addr_t start, u64 size)
+{
+	struct rb_node *res = NULL;
+	struct rb_node *node = iommu->dma_list.rb_node;
+	struct vfio_dma *dma_res = NULL;
+
+	while (node) {
+		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
+
+		if (start < dma->iova + dma->size) {
+			res = node;
+			dma_res = dma;
+			if (start >= dma->iova)
+				break;
+			node = node->rb_left;
+		} else {
+			node = node->rb_right;
+		}
+	}
+	if (res && size && dma_res->iova >= start + size)
+		res = NULL;
+	return res;
+}
+
+static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
+{
+	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
+	struct vfio_dma *dma;
+
+	while (*link) {
+		parent = *link;
+		dma = rb_entry(parent, struct vfio_dma, node);
+
+		if (new->iova + new->size <= dma->iova)
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, &iommu->dma_list);
+}
+
+static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
+{
+	rb_erase(&old->node, &iommu->dma_list);
+}
+
+
+static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
+{
+	uint64_t npages = dma->size / pgsize;
+
+	if (npages > DIRTY_BITMAP_PAGES_MAX)
+		return -EINVAL;
+
+	/*
+	 * Allocate extra 64 bits that are used to calculate shift required for
+	 * bitmap_shift_left() to manipulate and club unaligned number of pages
+	 * in adjacent vfio_dma ranges.
+	 */
+	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
+			       GFP_KERNEL);
+	if (!dma->bitmap)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void vfio_dma_bitmap_free(struct vfio_dma *dma)
+{
+	kvfree(dma->bitmap);
+	dma->bitmap = NULL;
+}
+
+static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
+{
+	struct rb_node *p;
+	unsigned long pgshift = __ffs(pgsize);
+
+	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
+		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
+
+		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1);
+	}
+}
+
+static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
+{
+	struct rb_node *n;
+	unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
+
+	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+		bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
+	}
+}
+
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
+{
+	struct rb_node *n;
+
+	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+		int ret;
+
+		ret = vfio_dma_bitmap_alloc(dma, pgsize);
+		if (ret) {
+			struct rb_node *p;
+
+			for (p = rb_prev(n); p; p = rb_prev(p)) {
+				struct vfio_dma *dma = rb_entry(n,
+							struct vfio_dma, node);
+
+				vfio_dma_bitmap_free(dma);
+			}
+			return ret;
+		}
+		vfio_dma_populate_bitmap(dma, pgsize);
+	}
+	return 0;
+}
+
+static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
+{
+	struct rb_node *n;
+
+	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+		vfio_dma_bitmap_free(dma);
+	}
+}
+
+/*
+ * Helper Functions for host iova-pfn list
+ */
+static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
+{
+	struct vfio_pfn *vpfn;
+	struct rb_node *node = dma->pfn_list.rb_node;
+
+	while (node) {
+		vpfn = rb_entry(node, struct vfio_pfn, node);
+
+		if (iova < vpfn->iova)
+			node = node->rb_left;
+		else if (iova > vpfn->iova)
+			node = node->rb_right;
+		else
+			return vpfn;
+	}
+	return NULL;
+}
+
+static void vfio_link_pfn(struct vfio_dma *dma,
+			  struct vfio_pfn *new)
+{
+	struct rb_node **link, *parent = NULL;
+	struct vfio_pfn *vpfn;
+
+	link = &dma->pfn_list.rb_node;
+	while (*link) {
+		parent = *link;
+		vpfn = rb_entry(parent, struct vfio_pfn, node);
+
+		if (new->iova < vpfn->iova)
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, &dma->pfn_list);
+}
+
+static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
+{
+	rb_erase(&old->node, &dma->pfn_list);
+}
+
+static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
+				unsigned long pfn)
+{
+	struct vfio_pfn *vpfn;
+
+	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
+	if (!vpfn)
+		return -ENOMEM;
+
+	vpfn->iova = iova;
+	vpfn->pfn = pfn;
+	vpfn->ref_count = 1;
+	vfio_link_pfn(dma, vpfn);
+	return 0;
+}
+
+static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
+				      struct vfio_pfn *vpfn)
+{
+	vfio_unlink_pfn(dma, vpfn);
+	kfree(vpfn);
+}
+
+static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
+					       unsigned long iova)
+{
+	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
+
+	if (vpfn)
+		vpfn->ref_count++;
+	return vpfn;
+}
+
+static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
+{
+	int ret = 0;
+
+	vpfn->ref_count--;
+	if (!vpfn->ref_count) {
+		ret = put_pfn(vpfn->pfn, dma->prot);
+		vfio_remove_from_pfn_list(dma, vpfn);
+	}
+	return ret;
+}
+
+static int mm_lock_acct(struct task_struct *task, struct mm_struct *mm,
+			bool lock_cap, long npage)
+{
+	int ret = mmap_write_lock_killable(mm);
+
+	if (ret)
+		return ret;
+
+	ret = __account_locked_vm(mm, abs(npage), npage > 0, task, lock_cap);
+	mmap_write_unlock(mm);
+	return ret;
+}
+
+static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
+{
+	struct mm_struct *mm;
+	int ret;
+
+	if (!npage)
+		return 0;
+
+	mm = dma->mm;
+	if (async && !mmget_not_zero(mm))
+		return -ESRCH; /* process exited */
+
+	ret = mm_lock_acct(dma->task, mm, dma->lock_cap, npage);
+	if (!ret)
+		dma->locked_vm += npage;
+
+	if (async)
+		mmput(mm);
+
+	return ret;
+}
+
+/*
+ * Some mappings aren't backed by a struct page, for example an mmap'd
+ * MMIO range for our own or another device.  These use a different
+ * pfn conversion and shouldn't be tracked as locked pages.
+ * For compound pages, any driver that sets the reserved bit in head
+ * page needs to set the reserved bit in all subpages to be safe.
+ */
+static bool is_invalid_reserved_pfn(unsigned long pfn)
+{
+	if (pfn_valid(pfn))
+		return PageReserved(pfn_to_page(pfn));
+
+	return true;
+}
+
+static int put_pfn(unsigned long pfn, int prot)
+{
+	if (!is_invalid_reserved_pfn(pfn)) {
+		struct page *page = pfn_to_page(pfn);
+
+		unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
+		return 1;
+	}
+	return 0;
+}
+
+#define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
+
+static void vfio_batch_init(struct vfio_batch *batch)
+{
+	batch->size = 0;
+	batch->offset = 0;
+
+	if (unlikely(disable_hugepages))
+		goto fallback;
+
+	batch->pages = (struct page **) __get_free_page(GFP_KERNEL);
+	if (!batch->pages)
+		goto fallback;
+
+	batch->capacity = VFIO_BATCH_MAX_CAPACITY;
+	return;
+
+fallback:
+	batch->pages = &batch->fallback_page;
+	batch->capacity = 1;
+}
+
+static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
+{
+	while (batch->size) {
+		unsigned long pfn = page_to_pfn(batch->pages[batch->offset]);
+
+		put_pfn(pfn, dma->prot);
+		batch->offset++;
+		batch->size--;
+	}
+}
+
+static void vfio_batch_fini(struct vfio_batch *batch)
+{
+	if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
+		free_page((unsigned long)batch->pages);
+}
+
+static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
+			    unsigned long vaddr, unsigned long *pfn,
+			    bool write_fault)
+{
+	pte_t *ptep;
+	pte_t pte;
+	spinlock_t *ptl;
+	int ret;
+
+	ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
+	if (ret) {
+		bool unlocked = false;
+
+		ret = fixup_user_fault(mm, vaddr,
+				       FAULT_FLAG_REMOTE |
+				       (write_fault ?  FAULT_FLAG_WRITE : 0),
+				       &unlocked);
+		if (unlocked)
+			return -EAGAIN;
+
+		if (ret)
+			return ret;
+
+		ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
+		if (ret)
+			return ret;
+	}
+
+	pte = ptep_get(ptep);
+
+	if (write_fault && !pte_write(pte))
+		ret = -EFAULT;
+	else
+		*pfn = pte_pfn(pte);
+
+	pte_unmap_unlock(ptep, ptl);
+	return ret;
+}
+
+/*
+ * Returns the positive number of pfns successfully obtained or a negative
+ * error code.
+ */
+static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
+			  long npages, int prot, unsigned long *pfn,
+			  struct page **pages)
+{
+	struct vm_area_struct *vma;
+	unsigned int flags = 0;
+	int ret;
+
+	if (prot & IOMMU_WRITE)
+		flags |= FOLL_WRITE;
+
+	mmap_read_lock(mm);
+	ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM,
+				    pages, NULL);
+	if (ret > 0) {
+		int i;
+
+		/*
+		 * The zero page is always resident, we don't need to pin it
+		 * and it falls into our invalid/reserved test so we don't
+		 * unpin in put_pfn().  Unpin all zero pages in the batch here.
+		 */
+		for (i = 0 ; i < ret; i++) {
+			if (unlikely(is_zero_pfn(page_to_pfn(pages[i]))))
+				unpin_user_page(pages[i]);
+		}
+
+		*pfn = page_to_pfn(pages[0]);
+		goto done;
+	}
+
+	vaddr = untagged_addr_remote(mm, vaddr);
+
+retry:
+	vma = vma_lookup(mm, vaddr);
+
+	if (vma && vma->vm_flags & VM_PFNMAP) {
+		ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE);
+		if (ret == -EAGAIN)
+			goto retry;
+
+		if (!ret) {
+			if (is_invalid_reserved_pfn(*pfn))
+				ret = 1;
+			else
+				ret = -EFAULT;
+		}
+	}
+done:
+	mmap_read_unlock(mm);
+	return ret;
+}
+
+/*
+ * Attempt to pin pages.  We really don't want to track all the pfns and
+ * the iommu can only map chunks of consecutive pfns anyway, so get the
+ * first page and all consecutive pages with the same locking.
+ */
+static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
+				  long npage, unsigned long *pfn_base,
+				  unsigned long limit, struct vfio_batch *batch)
+{
+	unsigned long pfn;
+	struct mm_struct *mm = current->mm;
+	long ret, pinned = 0, lock_acct = 0;
+	bool rsvd;
+	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
+
+	/* This code path is only user initiated */
+	if (!mm)
+		return -ENODEV;
+
+	if (batch->size) {
+		/* Leftover pages in batch from an earlier call. */
+		*pfn_base = page_to_pfn(batch->pages[batch->offset]);
+		pfn = *pfn_base;
+		rsvd = is_invalid_reserved_pfn(*pfn_base);
+	} else {
+		*pfn_base = 0;
+	}
+
+	while (npage) {
+		if (!batch->size) {
+			/* Empty batch, so refill it. */
+			long req_pages = min_t(long, npage, batch->capacity);
+
+			ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
+					     &pfn, batch->pages);
+			if (ret < 0)
+				goto unpin_out;
+
+			batch->size = ret;
+			batch->offset = 0;
+
+			if (!*pfn_base) {
+				*pfn_base = pfn;
+				rsvd = is_invalid_reserved_pfn(*pfn_base);
+			}
+		}
+
+		/*
+		 * pfn is preset for the first iteration of this inner loop and
+		 * updated at the end to handle a VM_PFNMAP pfn.  In that case,
+		 * batch->pages isn't valid (there's no struct page), so allow
+		 * batch->pages to be touched only when there's more than one
+		 * pfn to check, which guarantees the pfns are from a
+		 * !VM_PFNMAP vma.
+		 */
+		while (true) {
+			if (pfn != *pfn_base + pinned ||
+			    rsvd != is_invalid_reserved_pfn(pfn))
+				goto out;
+
+			/*
+			 * Reserved pages aren't counted against the user,
+			 * externally pinned pages are already counted against
+			 * the user.
+			 */
+			if (!rsvd && !vfio_find_vpfn(dma, iova)) {
+				if (!dma->lock_cap &&
+				    mm->locked_vm + lock_acct + 1 > limit) {
+					pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
+						__func__, limit << PAGE_SHIFT);
+					ret = -ENOMEM;
+					goto unpin_out;
+				}
+				lock_acct++;
+			}
+
+			pinned++;
+			npage--;
+			vaddr += PAGE_SIZE;
+			iova += PAGE_SIZE;
+			batch->offset++;
+			batch->size--;
+
+			if (!batch->size)
+				break;
+
+			pfn = page_to_pfn(batch->pages[batch->offset]);
+		}
+
+		if (unlikely(disable_hugepages))
+			break;
+	}
+
+out:
+	ret = vfio_lock_acct(dma, lock_acct, false);
+
+unpin_out:
+	if (batch->size == 1 && !batch->offset) {
+		/* May be a VM_PFNMAP pfn, which the batch can't remember. */
+		put_pfn(pfn, dma->prot);
+		batch->size = 0;
+	}
+
+	if (ret < 0) {
+		if (pinned && !rsvd) {
+			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
+				put_pfn(pfn, dma->prot);
+		}
+		vfio_batch_unpin(batch, dma);
+
+		return ret;
+	}
+
+	return pinned;
+}
+
+static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
+				    unsigned long pfn, long npage,
+				    bool do_accounting)
+{
+	long unlocked = 0, locked = 0;
+	long i;
+
+	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
+		if (put_pfn(pfn++, dma->prot)) {
+			unlocked++;
+			if (vfio_find_vpfn(dma, iova))
+				locked++;
+		}
+	}
+
+	if (do_accounting)
+		vfio_lock_acct(dma, locked - unlocked, true);
+
+	return unlocked;
+}
+
+static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
+				  unsigned long *pfn_base, bool do_accounting)
+{
+	struct page *pages[1];
+	struct mm_struct *mm;
+	int ret;
+
+	mm = dma->mm;
+	if (!mmget_not_zero(mm))
+		return -ENODEV;
+
+	ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
+	if (ret != 1)
+		goto out;
+
+	ret = 0;
+
+	if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
+		ret = vfio_lock_acct(dma, 1, false);
+		if (ret) {
+			put_pfn(*pfn_base, dma->prot);
+			if (ret == -ENOMEM)
+				pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
+					"(%ld) exceeded\n", __func__,
+					dma->task->comm, task_pid_nr(dma->task),
+					task_rlimit(dma->task, RLIMIT_MEMLOCK));
+		}
+	}
+
+out:
+	mmput(mm);
+	return ret;
+}
+
+static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
+				    bool do_accounting)
+{
+	int unlocked;
+	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
+
+	if (!vpfn)
+		return 0;
+
+	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
+
+	if (do_accounting)
+		vfio_lock_acct(dma, -unlocked, true);
+
+	return unlocked;
+}
+
+static int vfio_iommu_type1_pin_pages(void *iommu_data,
+				      struct iommu_group *iommu_group,
+				      dma_addr_t user_iova,
+				      int npage, int prot,
+				      struct page **pages)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_iommu_group *group;
+	int i, j, ret;
+	unsigned long remote_vaddr;
+	struct vfio_dma *dma;
+	bool do_accounting;
+
+	if (!iommu || !pages)
+		return -EINVAL;
+
+	/* Supported for v2 version only */
+	if (!iommu->v2)
+		return -EACCES;
+
+	mutex_lock(&iommu->lock);
+
+	if (WARN_ONCE(iommu->vaddr_invalid_count,
+		      "vfio_pin_pages not allowed with VFIO_UPDATE_VADDR\n")) {
+		ret = -EBUSY;
+		goto pin_done;
+	}
+
+	/* Fail if no dma_umap notifier is registered */
+	if (list_empty(&iommu->device_list)) {
+		ret = -EINVAL;
+		goto pin_done;
+	}
+
+	/*
+	 * If iommu capable domain exist in the container then all pages are
+	 * already pinned and accounted. Accounting should be done if there is no
+	 * iommu capable domain in the container.
+	 */
+	do_accounting = list_empty(&iommu->domain_list);
+
+	for (i = 0; i < npage; i++) {
+		unsigned long phys_pfn;
+		dma_addr_t iova;
+		struct vfio_pfn *vpfn;
+
+		iova = user_iova + PAGE_SIZE * i;
+		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
+		if (!dma) {
+			ret = -EINVAL;
+			goto pin_unwind;
+		}
+
+		if ((dma->prot & prot) != prot) {
+			ret = -EPERM;
+			goto pin_unwind;
+		}
+
+		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
+		if (vpfn) {
+			pages[i] = pfn_to_page(vpfn->pfn);
+			continue;
+		}
+
+		remote_vaddr = dma->vaddr + (iova - dma->iova);
+		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn,
+					     do_accounting);
+		if (ret)
+			goto pin_unwind;
+
+		if (!pfn_valid(phys_pfn)) {
+			ret = -EINVAL;
+			goto pin_unwind;
+		}
+
+		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn);
+		if (ret) {
+			if (put_pfn(phys_pfn, dma->prot) && do_accounting)
+				vfio_lock_acct(dma, -1, true);
+			goto pin_unwind;
+		}
+
+		pages[i] = pfn_to_page(phys_pfn);
+
+		if (iommu->dirty_page_tracking) {
+			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
+
+			/*
+			 * Bitmap populated with the smallest supported page
+			 * size
+			 */
+			bitmap_set(dma->bitmap,
+				   (iova - dma->iova) >> pgshift, 1);
+		}
+	}
+	ret = i;
+
+	group = vfio_iommu_find_iommu_group(iommu, iommu_group);
+	if (!group->pinned_page_dirty_scope) {
+		group->pinned_page_dirty_scope = true;
+		iommu->num_non_pinned_groups--;
+	}
+
+	goto pin_done;
+
+pin_unwind:
+	pages[i] = NULL;
+	for (j = 0; j < i; j++) {
+		dma_addr_t iova;
+
+		iova = user_iova + PAGE_SIZE * j;
+		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
+		vfio_unpin_page_external(dma, iova, do_accounting);
+		pages[j] = NULL;
+	}
+pin_done:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+static void vfio_iommu_type1_unpin_pages(void *iommu_data,
+					 dma_addr_t user_iova, int npage)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	bool do_accounting;
+	int i;
+
+	/* Supported for v2 version only */
+	if (WARN_ON(!iommu->v2))
+		return;
+
+	mutex_lock(&iommu->lock);
+
+	do_accounting = list_empty(&iommu->domain_list);
+	for (i = 0; i < npage; i++) {
+		dma_addr_t iova = user_iova + PAGE_SIZE * i;
+		struct vfio_dma *dma;
+
+		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
+		if (!dma)
+			break;
+
+		vfio_unpin_page_external(dma, iova, do_accounting);
+	}
+
+	mutex_unlock(&iommu->lock);
+
+	WARN_ON(i != npage);
+}
+
+static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
+			    struct list_head *regions,
+			    struct iommu_iotlb_gather *iotlb_gather)
+{
+	long unlocked = 0;
+	struct vfio_regions *entry, *next;
+
+	iommu_iotlb_sync(domain->domain, iotlb_gather);
+
+	list_for_each_entry_safe(entry, next, regions, list) {
+		unlocked += vfio_unpin_pages_remote(dma,
+						    entry->iova,
+						    entry->phys >> PAGE_SHIFT,
+						    entry->len >> PAGE_SHIFT,
+						    false);
+		list_del(&entry->list);
+		kfree(entry);
+	}
+
+	cond_resched();
+
+	return unlocked;
+}
+
+/*
+ * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
+ * Therefore, when using IOTLB flush sync interface, VFIO need to keep track
+ * of these regions (currently using a list).
+ *
+ * This value specifies maximum number of regions for each IOTLB flush sync.
+ */
+#define VFIO_IOMMU_TLB_SYNC_MAX		512
+
+static size_t unmap_unpin_fast(struct vfio_domain *domain,
+			       struct vfio_dma *dma, dma_addr_t *iova,
+			       size_t len, phys_addr_t phys, long *unlocked,
+			       struct list_head *unmapped_list,
+			       int *unmapped_cnt,
+			       struct iommu_iotlb_gather *iotlb_gather)
+{
+	size_t unmapped = 0;
+	struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+
+	if (entry) {
+		unmapped = iommu_unmap_fast(domain->domain, *iova, len,
+					    iotlb_gather);
+
+		if (!unmapped) {
+			kfree(entry);
+		} else {
+			entry->iova = *iova;
+			entry->phys = phys;
+			entry->len  = unmapped;
+			list_add_tail(&entry->list, unmapped_list);
+
+			*iova += unmapped;
+			(*unmapped_cnt)++;
+		}
+	}
+
+	/*
+	 * Sync if the number of fast-unmap regions hits the limit
+	 * or in case of errors.
+	 */
+	if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
+		*unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
+					     iotlb_gather);
+		*unmapped_cnt = 0;
+	}
+
+	return unmapped;
+}
+
+static size_t unmap_unpin_slow(struct vfio_domain *domain,
+			       struct vfio_dma *dma, dma_addr_t *iova,
+			       size_t len, phys_addr_t phys,
+			       long *unlocked)
+{
+	size_t unmapped = iommu_unmap(domain->domain, *iova, len);
+
+	if (unmapped) {
+		*unlocked += vfio_unpin_pages_remote(dma, *iova,
+						     phys >> PAGE_SHIFT,
+						     unmapped >> PAGE_SHIFT,
+						     false);
+		*iova += unmapped;
+		cond_resched();
+	}
+	return unmapped;
+}
+
+static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
+			     bool do_accounting)
+{
+	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
+	struct vfio_domain *domain, *d;
+	LIST_HEAD(unmapped_region_list);
+	struct iommu_iotlb_gather iotlb_gather;
+	int unmapped_region_cnt = 0;
+	long unlocked = 0;
+
+	if (!dma->size)
+		return 0;
+
+	if (list_empty(&iommu->domain_list))
+		return 0;
+
+	/*
+	 * We use the IOMMU to track the physical addresses, otherwise we'd
+	 * need a much more complicated tracking system.  Unfortunately that
+	 * means we need to use one of the iommu domains to figure out the
+	 * pfns to unpin.  The rest need to be unmapped in advance so we have
+	 * no iommu translations remaining when the pages are unpinned.
+	 */
+	domain = d = list_first_entry(&iommu->domain_list,
+				      struct vfio_domain, next);
+
+	list_for_each_entry_continue(d, &iommu->domain_list, next) {
+		iommu_unmap(d->domain, dma->iova, dma->size);
+		cond_resched();
+	}
+
+	iommu_iotlb_gather_init(&iotlb_gather);
+	while (iova < end) {
+		size_t unmapped, len;
+		phys_addr_t phys, next;
+
+		phys = iommu_iova_to_phys(domain->domain, iova);
+		if (WARN_ON(!phys)) {
+			iova += PAGE_SIZE;
+			continue;
+		}
+
+		/*
+		 * To optimize for fewer iommu_unmap() calls, each of which
+		 * may require hardware cache flushing, try to find the
+		 * largest contiguous physical memory chunk to unmap.
+		 */
+		for (len = PAGE_SIZE;
+		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
+			next = iommu_iova_to_phys(domain->domain, iova + len);
+			if (next != phys + len)
+				break;
+		}
+
+		/*
+		 * First, try to use fast unmap/unpin. In case of failure,
+		 * switch to slow unmap/unpin path.
+		 */
+		unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
+					    &unlocked, &unmapped_region_list,
+					    &unmapped_region_cnt,
+					    &iotlb_gather);
+		if (!unmapped) {
+			unmapped = unmap_unpin_slow(domain, dma, &iova, len,
+						    phys, &unlocked);
+			if (WARN_ON(!unmapped))
+				break;
+		}
+	}
+
+	dma->iommu_mapped = false;
+
+	if (unmapped_region_cnt) {
+		unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
+					    &iotlb_gather);
+	}
+
+	if (do_accounting) {
+		vfio_lock_acct(dma, -unlocked, true);
+		return 0;
+	}
+	return unlocked;
+}
+
+static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
+{
+	WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
+	vfio_unmap_unpin(iommu, dma, true);
+	vfio_unlink_dma(iommu, dma);
+	put_task_struct(dma->task);
+	mmdrop(dma->mm);
+	vfio_dma_bitmap_free(dma);
+	if (dma->vaddr_invalid)
+		iommu->vaddr_invalid_count--;
+	kfree(dma);
+	iommu->dma_avail++;
+}
+
+static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
+{
+	struct vfio_domain *domain;
+
+	iommu->pgsize_bitmap = ULONG_MAX;
+
+	list_for_each_entry(domain, &iommu->domain_list, next)
+		iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
+
+	/*
+	 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
+	 * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
+	 * That way the user will be able to map/unmap buffers whose size/
+	 * start address is aligned with PAGE_SIZE. Pinning code uses that
+	 * granularity while iommu driver can use the sub-PAGE_SIZE size
+	 * to map the buffer.
+	 */
+	if (iommu->pgsize_bitmap & ~PAGE_MASK) {
+		iommu->pgsize_bitmap &= PAGE_MASK;
+		iommu->pgsize_bitmap |= PAGE_SIZE;
+	}
+}
+
+static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
+			      struct vfio_dma *dma, dma_addr_t base_iova,
+			      size_t pgsize)
+{
+	unsigned long pgshift = __ffs(pgsize);
+	unsigned long nbits = dma->size >> pgshift;
+	unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;
+	unsigned long copy_offset = bit_offset / BITS_PER_LONG;
+	unsigned long shift = bit_offset % BITS_PER_LONG;
+	unsigned long leftover;
+
+	/*
+	 * mark all pages dirty if any IOMMU capable device is not able
+	 * to report dirty pages and all pages are pinned and mapped.
+	 */
+	if (iommu->num_non_pinned_groups && dma->iommu_mapped)
+		bitmap_set(dma->bitmap, 0, nbits);
+
+	if (shift) {
+		bitmap_shift_left(dma->bitmap, dma->bitmap, shift,
+				  nbits + shift);
+
+		if (copy_from_user(&leftover,
+				   (void __user *)(bitmap + copy_offset),
+				   sizeof(leftover)))
+			return -EFAULT;
+
+		bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift);
+	}
+
+	if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap,
+			 DIRTY_BITMAP_BYTES(nbits + shift)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
+				  dma_addr_t iova, size_t size, size_t pgsize)
+{
+	struct vfio_dma *dma;
+	struct rb_node *n;
+	unsigned long pgshift = __ffs(pgsize);
+	int ret;
+
+	/*
+	 * GET_BITMAP request must fully cover vfio_dma mappings.  Multiple
+	 * vfio_dma mappings may be clubbed by specifying large ranges, but
+	 * there must not be any previous mappings bisected by the range.
+	 * An error will be returned if these conditions are not met.
+	 */
+	dma = vfio_find_dma(iommu, iova, 1);
+	if (dma && dma->iova != iova)
+		return -EINVAL;
+
+	dma = vfio_find_dma(iommu, iova + size - 1, 0);
+	if (dma && dma->iova + dma->size != iova + size)
+		return -EINVAL;
+
+	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+		if (dma->iova < iova)
+			continue;
+
+		if (dma->iova > iova + size - 1)
+			break;
+
+		ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
+		if (ret)
+			return ret;
+
+		/*
+		 * Re-populate bitmap to include all pinned pages which are
+		 * considered as dirty but exclude pages which are unpinned and
+		 * pages which are marked dirty by vfio_dma_rw()
+		 */
+		bitmap_clear(dma->bitmap, 0, dma->size >> pgshift);
+		vfio_dma_populate_bitmap(dma, pgsize);
+	}
+	return 0;
+}
+
+static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
+{
+	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) ||
+	    (bitmap_size < DIRTY_BITMAP_BYTES(npages)))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Notify VFIO drivers using vfio_register_emulated_iommu_dev() to invalidate
+ * and unmap iovas within the range we're about to unmap. Drivers MUST unpin
+ * pages in response to an invalidation.
+ */
+static void vfio_notify_dma_unmap(struct vfio_iommu *iommu,
+				  struct vfio_dma *dma)
+{
+	struct vfio_device *device;
+
+	if (list_empty(&iommu->device_list))
+		return;
+
+	/*
+	 * The device is expected to call vfio_unpin_pages() for any IOVA it has
+	 * pinned within the range. Since vfio_unpin_pages() will eventually
+	 * call back down to this code and try to obtain the iommu->lock we must
+	 * drop it.
+	 */
+	mutex_lock(&iommu->device_list_lock);
+	mutex_unlock(&iommu->lock);
+
+	list_for_each_entry(device, &iommu->device_list, iommu_entry)
+		device->ops->dma_unmap(device, dma->iova, dma->size);
+
+	mutex_unlock(&iommu->device_list_lock);
+	mutex_lock(&iommu->lock);
+}
+
+static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
+			     struct vfio_iommu_type1_dma_unmap *unmap,
+			     struct vfio_bitmap *bitmap)
+{
+	struct vfio_dma *dma, *dma_last = NULL;
+	size_t unmapped = 0, pgsize;
+	int ret = -EINVAL, retries = 0;
+	unsigned long pgshift;
+	dma_addr_t iova = unmap->iova;
+	u64 size = unmap->size;
+	bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL;
+	bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR;
+	struct rb_node *n, *first_n;
+
+	mutex_lock(&iommu->lock);
+
+	/* Cannot update vaddr if mdev is present. */
+	if (invalidate_vaddr && !list_empty(&iommu->emulated_iommu_groups)) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	pgshift = __ffs(iommu->pgsize_bitmap);
+	pgsize = (size_t)1 << pgshift;
+
+	if (iova & (pgsize - 1))
+		goto unlock;
+
+	if (unmap_all) {
+		if (iova || size)
+			goto unlock;
+		size = U64_MAX;
+	} else if (!size || size & (pgsize - 1) ||
+		   iova + size - 1 < iova || size > SIZE_MAX) {
+		goto unlock;
+	}
+
+	/* When dirty tracking is enabled, allow only min supported pgsize */
+	if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+	    (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
+		goto unlock;
+	}
+
+	WARN_ON((pgsize - 1) & PAGE_MASK);
+again:
+	/*
+	 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
+	 * avoid tracking individual mappings.  This means that the granularity
+	 * of the original mapping was lost and the user was allowed to attempt
+	 * to unmap any range.  Depending on the contiguousness of physical
+	 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
+	 * or may not have worked.  We only guaranteed unmap granularity
+	 * matching the original mapping; even though it was untracked here,
+	 * the original mappings are reflected in IOMMU mappings.  This
+	 * resulted in a couple unusual behaviors.  First, if a range is not
+	 * able to be unmapped, ex. a set of 4k pages that was mapped as a
+	 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
+	 * a zero sized unmap.  Also, if an unmap request overlaps the first
+	 * address of a hugepage, the IOMMU will unmap the entire hugepage.
+	 * This also returns success and the returned unmap size reflects the
+	 * actual size unmapped.
+	 *
+	 * We attempt to maintain compatibility with this "v1" interface, but
+	 * we take control out of the hands of the IOMMU.  Therefore, an unmap
+	 * request offset from the beginning of the original mapping will
+	 * return success with zero sized unmap.  And an unmap request covering
+	 * the first iova of mapping will unmap the entire range.
+	 *
+	 * The v2 version of this interface intends to be more deterministic.
+	 * Unmap requests must fully cover previous mappings.  Multiple
+	 * mappings may still be unmaped by specifying large ranges, but there
+	 * must not be any previous mappings bisected by the range.  An error
+	 * will be returned if these conditions are not met.  The v2 interface
+	 * will only return success and a size of zero if there were no
+	 * mappings within the range.
+	 */
+	if (iommu->v2 && !unmap_all) {
+		dma = vfio_find_dma(iommu, iova, 1);
+		if (dma && dma->iova != iova)
+			goto unlock;
+
+		dma = vfio_find_dma(iommu, iova + size - 1, 0);
+		if (dma && dma->iova + dma->size != iova + size)
+			goto unlock;
+	}
+
+	ret = 0;
+	n = first_n = vfio_find_dma_first_node(iommu, iova, size);
+
+	while (n) {
+		dma = rb_entry(n, struct vfio_dma, node);
+		if (dma->iova >= iova + size)
+			break;
+
+		if (!iommu->v2 && iova > dma->iova)
+			break;
+
+		if (invalidate_vaddr) {
+			if (dma->vaddr_invalid) {
+				struct rb_node *last_n = n;
+
+				for (n = first_n; n != last_n; n = rb_next(n)) {
+					dma = rb_entry(n,
+						       struct vfio_dma, node);
+					dma->vaddr_invalid = false;
+					iommu->vaddr_invalid_count--;
+				}
+				ret = -EINVAL;
+				unmapped = 0;
+				break;
+			}
+			dma->vaddr_invalid = true;
+			iommu->vaddr_invalid_count++;
+			unmapped += dma->size;
+			n = rb_next(n);
+			continue;
+		}
+
+		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
+			if (dma_last == dma) {
+				BUG_ON(++retries > 10);
+			} else {
+				dma_last = dma;
+				retries = 0;
+			}
+
+			vfio_notify_dma_unmap(iommu, dma);
+			goto again;
+		}
+
+		if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
+			ret = update_user_bitmap(bitmap->data, iommu, dma,
+						 iova, pgsize);
+			if (ret)
+				break;
+		}
+
+		unmapped += dma->size;
+		n = rb_next(n);
+		vfio_remove_dma(iommu, dma);
+	}
+
+unlock:
+	mutex_unlock(&iommu->lock);
+
+	/* Report how much was unmapped */
+	unmap->size = unmapped;
+
+	return ret;
+}
+
+static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
+			  unsigned long pfn, long npage, int prot)
+{
+	struct vfio_domain *d;
+	int ret;
+
+	list_for_each_entry(d, &iommu->domain_list, next) {
+		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
+				npage << PAGE_SHIFT, prot | IOMMU_CACHE,
+				GFP_KERNEL);
+		if (ret)
+			goto unwind;
+
+		cond_resched();
+	}
+
+	return 0;
+
+unwind:
+	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
+		iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
+		cond_resched();
+	}
+
+	return ret;
+}
+
+static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
+			    size_t map_size)
+{
+	dma_addr_t iova = dma->iova;
+	unsigned long vaddr = dma->vaddr;
+	struct vfio_batch batch;
+	size_t size = map_size;
+	long npage;
+	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	int ret = 0;
+
+	vfio_batch_init(&batch);
+
+	while (size) {
+		/* Pin a contiguous chunk of memory */
+		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
+					      size >> PAGE_SHIFT, &pfn, limit,
+					      &batch);
+		if (npage <= 0) {
+			WARN_ON(!npage);
+			ret = (int)npage;
+			break;
+		}
+
+		/* Map it! */
+		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
+				     dma->prot);
+		if (ret) {
+			vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
+						npage, true);
+			vfio_batch_unpin(&batch, dma);
+			break;
+		}
+
+		size -= npage << PAGE_SHIFT;
+		dma->size += npage << PAGE_SHIFT;
+	}
+
+	vfio_batch_fini(&batch);
+	dma->iommu_mapped = true;
+
+	if (ret)
+		vfio_remove_dma(iommu, dma);
+
+	return ret;
+}
+
+/*
+ * Check dma map request is within a valid iova range
+ */
+static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
+				      dma_addr_t start, dma_addr_t end)
+{
+	struct list_head *iova = &iommu->iova_list;
+	struct vfio_iova *node;
+
+	list_for_each_entry(node, iova, list) {
+		if (start >= node->start && end <= node->end)
+			return true;
+	}
+
+	/*
+	 * Check for list_empty() as well since a container with
+	 * a single mdev device will have an empty list.
+	 */
+	return list_empty(iova);
+}
+
+static int vfio_change_dma_owner(struct vfio_dma *dma)
+{
+	struct task_struct *task = current->group_leader;
+	struct mm_struct *mm = current->mm;
+	long npage = dma->locked_vm;
+	bool lock_cap;
+	int ret;
+
+	if (mm == dma->mm)
+		return 0;
+
+	lock_cap = capable(CAP_IPC_LOCK);
+	ret = mm_lock_acct(task, mm, lock_cap, npage);
+	if (ret)
+		return ret;
+
+	if (mmget_not_zero(dma->mm)) {
+		mm_lock_acct(dma->task, dma->mm, dma->lock_cap, -npage);
+		mmput(dma->mm);
+	}
+
+	if (dma->task != task) {
+		put_task_struct(dma->task);
+		dma->task = get_task_struct(task);
+	}
+	mmdrop(dma->mm);
+	dma->mm = mm;
+	mmgrab(dma->mm);
+	dma->lock_cap = lock_cap;
+	return 0;
+}
+
+static int vfio_dma_do_map(struct vfio_iommu *iommu,
+			   struct vfio_iommu_type1_dma_map *map)
+{
+	bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR;
+	dma_addr_t iova = map->iova;
+	unsigned long vaddr = map->vaddr;
+	size_t size = map->size;
+	int ret = 0, prot = 0;
+	size_t pgsize;
+	struct vfio_dma *dma;
+
+	/* Verify that none of our __u64 fields overflow */
+	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
+		return -EINVAL;
+
+	/* READ/WRITE from device perspective */
+	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
+		prot |= IOMMU_WRITE;
+	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
+		prot |= IOMMU_READ;
+
+	if ((prot && set_vaddr) || (!prot && !set_vaddr))
+		return -EINVAL;
+
+	mutex_lock(&iommu->lock);
+
+	pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
+
+	WARN_ON((pgsize - 1) & PAGE_MASK);
+
+	if (!size || (size | iova | vaddr) & (pgsize - 1)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* Don't allow IOVA or virtual address wrap */
+	if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	dma = vfio_find_dma(iommu, iova, size);
+	if (set_vaddr) {
+		if (!dma) {
+			ret = -ENOENT;
+		} else if (!dma->vaddr_invalid || dma->iova != iova ||
+			   dma->size != size) {
+			ret = -EINVAL;
+		} else {
+			ret = vfio_change_dma_owner(dma);
+			if (ret)
+				goto out_unlock;
+			dma->vaddr = vaddr;
+			dma->vaddr_invalid = false;
+			iommu->vaddr_invalid_count--;
+		}
+		goto out_unlock;
+	} else if (dma) {
+		ret = -EEXIST;
+		goto out_unlock;
+	}
+
+	if (!iommu->dma_avail) {
+		ret = -ENOSPC;
+		goto out_unlock;
+	}
+
+	if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
+	if (!dma) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	iommu->dma_avail--;
+	dma->iova = iova;
+	dma->vaddr = vaddr;
+	dma->prot = prot;
+
+	/*
+	 * We need to be able to both add to a task's locked memory and test
+	 * against the locked memory limit and we need to be able to do both
+	 * outside of this call path as pinning can be asynchronous via the
+	 * external interfaces for mdev devices.  RLIMIT_MEMLOCK requires a
+	 * task_struct. Save the group_leader so that all DMA tracking uses
+	 * the same task, to make debugging easier.  VM locked pages requires
+	 * an mm_struct, so grab the mm in case the task dies.
+	 */
+	get_task_struct(current->group_leader);
+	dma->task = current->group_leader;
+	dma->lock_cap = capable(CAP_IPC_LOCK);
+	dma->mm = current->mm;
+	mmgrab(dma->mm);
+
+	dma->pfn_list = RB_ROOT;
+
+	/* Insert zero-sized and grow as we map chunks of it */
+	vfio_link_dma(iommu, dma);
+
+	/* Don't pin and map if container doesn't contain IOMMU capable domain*/
+	if (list_empty(&iommu->domain_list))
+		dma->size = size;
+	else
+		ret = vfio_pin_map_dma(iommu, dma, size);
+
+	if (!ret && iommu->dirty_page_tracking) {
+		ret = vfio_dma_bitmap_alloc(dma, pgsize);
+		if (ret)
+			vfio_remove_dma(iommu, dma);
+	}
+
+out_unlock:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+static int vfio_iommu_replay(struct vfio_iommu *iommu,
+			     struct vfio_domain *domain)
+{
+	struct vfio_batch batch;
+	struct vfio_domain *d = NULL;
+	struct rb_node *n;
+	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	int ret;
+
+	/* Arbitrarily pick the first domain in the list for lookups */
+	if (!list_empty(&iommu->domain_list))
+		d = list_first_entry(&iommu->domain_list,
+				     struct vfio_domain, next);
+
+	vfio_batch_init(&batch);
+
+	n = rb_first(&iommu->dma_list);
+
+	for (; n; n = rb_next(n)) {
+		struct vfio_dma *dma;
+		dma_addr_t iova;
+
+		dma = rb_entry(n, struct vfio_dma, node);
+		iova = dma->iova;
+
+		while (iova < dma->iova + dma->size) {
+			phys_addr_t phys;
+			size_t size;
+
+			if (dma->iommu_mapped) {
+				phys_addr_t p;
+				dma_addr_t i;
+
+				if (WARN_ON(!d)) { /* mapped w/o a domain?! */
+					ret = -EINVAL;
+					goto unwind;
+				}
+
+				phys = iommu_iova_to_phys(d->domain, iova);
+
+				if (WARN_ON(!phys)) {
+					iova += PAGE_SIZE;
+					continue;
+				}
+
+				size = PAGE_SIZE;
+				p = phys + size;
+				i = iova + size;
+				while (i < dma->iova + dma->size &&
+				       p == iommu_iova_to_phys(d->domain, i)) {
+					size += PAGE_SIZE;
+					p += PAGE_SIZE;
+					i += PAGE_SIZE;
+				}
+			} else {
+				unsigned long pfn;
+				unsigned long vaddr = dma->vaddr +
+						     (iova - dma->iova);
+				size_t n = dma->iova + dma->size - iova;
+				long npage;
+
+				npage = vfio_pin_pages_remote(dma, vaddr,
+							      n >> PAGE_SHIFT,
+							      &pfn, limit,
+							      &batch);
+				if (npage <= 0) {
+					WARN_ON(!npage);
+					ret = (int)npage;
+					goto unwind;
+				}
+
+				phys = pfn << PAGE_SHIFT;
+				size = npage << PAGE_SHIFT;
+			}
+
+			ret = iommu_map(domain->domain, iova, phys, size,
+					dma->prot | IOMMU_CACHE, GFP_KERNEL);
+			if (ret) {
+				if (!dma->iommu_mapped) {
+					vfio_unpin_pages_remote(dma, iova,
+							phys >> PAGE_SHIFT,
+							size >> PAGE_SHIFT,
+							true);
+					vfio_batch_unpin(&batch, dma);
+				}
+				goto unwind;
+			}
+
+			iova += size;
+		}
+	}
+
+	/* All dmas are now mapped, defer to second tree walk for unwind */
+	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+		dma->iommu_mapped = true;
+	}
+
+	vfio_batch_fini(&batch);
+	return 0;
+
+unwind:
+	for (; n; n = rb_prev(n)) {
+		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+		dma_addr_t iova;
+
+		if (dma->iommu_mapped) {
+			iommu_unmap(domain->domain, dma->iova, dma->size);
+			continue;
+		}
+
+		iova = dma->iova;
+		while (iova < dma->iova + dma->size) {
+			phys_addr_t phys, p;
+			size_t size;
+			dma_addr_t i;
+
+			phys = iommu_iova_to_phys(domain->domain, iova);
+			if (!phys) {
+				iova += PAGE_SIZE;
+				continue;
+			}
+
+			size = PAGE_SIZE;
+			p = phys + size;
+			i = iova + size;
+			while (i < dma->iova + dma->size &&
+			       p == iommu_iova_to_phys(domain->domain, i)) {
+				size += PAGE_SIZE;
+				p += PAGE_SIZE;
+				i += PAGE_SIZE;
+			}
+
+			iommu_unmap(domain->domain, iova, size);
+			vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT,
+						size >> PAGE_SHIFT, true);
+		}
+	}
+
+	vfio_batch_fini(&batch);
+	return ret;
+}
+
+/*
+ * We change our unmap behavior slightly depending on whether the IOMMU
+ * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
+ * for practically any contiguous power-of-two mapping we give it.  This means
+ * we don't need to look for contiguous chunks ourselves to make unmapping
+ * more efficient.  On IOMMUs with coarse-grained super pages, like Intel VT-d
+ * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
+ * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
+ * hugetlbfs is in use.
+ */
+static void vfio_test_domain_fgsp(struct vfio_domain *domain, struct list_head *regions)
+{
+	int ret, order = get_order(PAGE_SIZE * 2);
+	struct vfio_iova *region;
+	struct page *pages;
+	dma_addr_t start;
+
+	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
+	if (!pages)
+		return;
+
+	list_for_each_entry(region, regions, list) {
+		start = ALIGN(region->start, PAGE_SIZE * 2);
+		if (start >= region->end || (region->end - start < PAGE_SIZE * 2))
+			continue;
+
+		ret = iommu_map(domain->domain, start, page_to_phys(pages), PAGE_SIZE * 2,
+				IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE, GFP_KERNEL);
+		if (!ret) {
+			size_t unmapped = iommu_unmap(domain->domain, start, PAGE_SIZE);
+
+			if (unmapped == PAGE_SIZE)
+				iommu_unmap(domain->domain, start + PAGE_SIZE, PAGE_SIZE);
+			else
+				domain->fgsp = true;
+		}
+		break;
+	}
+
+	__free_pages(pages, order);
+}
+
+static struct vfio_iommu_group *find_iommu_group(struct vfio_domain *domain,
+						 struct iommu_group *iommu_group)
+{
+	struct vfio_iommu_group *g;
+
+	list_for_each_entry(g, &domain->group_list, next) {
+		if (g->iommu_group == iommu_group)
+			return g;
+	}
+
+	return NULL;
+}
+
+static struct vfio_iommu_group*
+vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
+			    struct iommu_group *iommu_group)
+{
+	struct vfio_iommu_group *group;
+	struct vfio_domain *domain;
+
+	list_for_each_entry(domain, &iommu->domain_list, next) {
+		group = find_iommu_group(domain, iommu_group);
+		if (group)
+			return group;
+	}
+
+	list_for_each_entry(group, &iommu->emulated_iommu_groups, next)
+		if (group->iommu_group == iommu_group)
+			return group;
+	return NULL;
+}
+
+static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
+				  phys_addr_t *base)
+{
+	struct iommu_resv_region *region;
+	bool ret = false;
+
+	list_for_each_entry(region, group_resv_regions, list) {
+		/*
+		 * The presence of any 'real' MSI regions should take
+		 * precedence over the software-managed one if the
+		 * IOMMU driver happens to advertise both types.
+		 */
+		if (region->type == IOMMU_RESV_MSI) {
+			ret = false;
+			break;
+		}
+
+		if (region->type == IOMMU_RESV_SW_MSI) {
+			*base = region->start;
+			ret = true;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * This is a helper function to insert an address range to iova list.
+ * The list is initially created with a single entry corresponding to
+ * the IOMMU domain geometry to which the device group is attached.
+ * The list aperture gets modified when a new domain is added to the
+ * container if the new aperture doesn't conflict with the current one
+ * or with any existing dma mappings. The list is also modified to
+ * exclude any reserved regions associated with the device group.
+ */
+static int vfio_iommu_iova_insert(struct list_head *head,
+				  dma_addr_t start, dma_addr_t end)
+{
+	struct vfio_iova *region;
+
+	region = kmalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&region->list);
+	region->start = start;
+	region->end = end;
+
+	list_add_tail(&region->list, head);
+	return 0;
+}
+
+/*
+ * Check the new iommu aperture conflicts with existing aper or with any
+ * existing dma mappings.
+ */
+static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
+				     dma_addr_t start, dma_addr_t end)
+{
+	struct vfio_iova *first, *last;
+	struct list_head *iova = &iommu->iova_list;
+
+	if (list_empty(iova))
+		return false;
+
+	/* Disjoint sets, return conflict */
+	first = list_first_entry(iova, struct vfio_iova, list);
+	last = list_last_entry(iova, struct vfio_iova, list);
+	if (start > last->end || end < first->start)
+		return true;
+
+	/* Check for any existing dma mappings below the new start */
+	if (start > first->start) {
+		if (vfio_find_dma(iommu, first->start, start - first->start))
+			return true;
+	}
+
+	/* Check for any existing dma mappings beyond the new end */
+	if (end < last->end) {
+		if (vfio_find_dma(iommu, end + 1, last->end - end))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Resize iommu iova aperture window. This is called only if the new
+ * aperture has no conflict with existing aperture and dma mappings.
+ */
+static int vfio_iommu_aper_resize(struct list_head *iova,
+				  dma_addr_t start, dma_addr_t end)
+{
+	struct vfio_iova *node, *next;
+
+	if (list_empty(iova))
+		return vfio_iommu_iova_insert(iova, start, end);
+
+	/* Adjust iova list start */
+	list_for_each_entry_safe(node, next, iova, list) {
+		if (start < node->start)
+			break;
+		if (start >= node->start && start < node->end) {
+			node->start = start;
+			break;
+		}
+		/* Delete nodes before new start */
+		list_del(&node->list);
+		kfree(node);
+	}
+
+	/* Adjust iova list end */
+	list_for_each_entry_safe(node, next, iova, list) {
+		if (end > node->end)
+			continue;
+		if (end > node->start && end <= node->end) {
+			node->end = end;
+			continue;
+		}
+		/* Delete nodes after new end */
+		list_del(&node->list);
+		kfree(node);
+	}
+
+	return 0;
+}
+
+/*
+ * Check reserved region conflicts with existing dma mappings
+ */
+static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
+				     struct list_head *resv_regions)
+{
+	struct iommu_resv_region *region;
+
+	/* Check for conflict with existing dma mappings */
+	list_for_each_entry(region, resv_regions, list) {
+		if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
+			continue;
+
+		if (vfio_find_dma(iommu, region->start, region->length))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Check iova region overlap with  reserved regions and
+ * exclude them from the iommu iova range
+ */
+static int vfio_iommu_resv_exclude(struct list_head *iova,
+				   struct list_head *resv_regions)
+{
+	struct iommu_resv_region *resv;
+	struct vfio_iova *n, *next;
+
+	list_for_each_entry(resv, resv_regions, list) {
+		phys_addr_t start, end;
+
+		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
+			continue;
+
+		start = resv->start;
+		end = resv->start + resv->length - 1;
+
+		list_for_each_entry_safe(n, next, iova, list) {
+			int ret = 0;
+
+			/* No overlap */
+			if (start > n->end || end < n->start)
+				continue;
+			/*
+			 * Insert a new node if current node overlaps with the
+			 * reserve region to exclude that from valid iova range.
+			 * Note that, new node is inserted before the current
+			 * node and finally the current node is deleted keeping
+			 * the list updated and sorted.
+			 */
+			if (start > n->start)
+				ret = vfio_iommu_iova_insert(&n->list, n->start,
+							     start - 1);
+			if (!ret && end < n->end)
+				ret = vfio_iommu_iova_insert(&n->list, end + 1,
+							     n->end);
+			if (ret)
+				return ret;
+
+			list_del(&n->list);
+			kfree(n);
+		}
+	}
+
+	if (list_empty(iova))
+		return -EINVAL;
+
+	return 0;
+}
+
+static void vfio_iommu_resv_free(struct list_head *resv_regions)
+{
+	struct iommu_resv_region *n, *next;
+
+	list_for_each_entry_safe(n, next, resv_regions, list) {
+		list_del(&n->list);
+		kfree(n);
+	}
+}
+
+static void vfio_iommu_iova_free(struct list_head *iova)
+{
+	struct vfio_iova *n, *next;
+
+	list_for_each_entry_safe(n, next, iova, list) {
+		list_del(&n->list);
+		kfree(n);
+	}
+}
+
+static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
+				    struct list_head *iova_copy)
+{
+	struct list_head *iova = &iommu->iova_list;
+	struct vfio_iova *n;
+	int ret;
+
+	list_for_each_entry(n, iova, list) {
+		ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
+		if (ret)
+			goto out_free;
+	}
+
+	return 0;
+
+out_free:
+	vfio_iommu_iova_free(iova_copy);
+	return ret;
+}
+
+static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
+					struct list_head *iova_copy)
+{
+	struct list_head *iova = &iommu->iova_list;
+
+	vfio_iommu_iova_free(iova);
+
+	list_splice_tail(iova_copy, iova);
+}
+
+static int vfio_iommu_domain_alloc(struct device *dev, void *data)
+{
+	struct iommu_domain **domain = data;
+
+	*domain = iommu_domain_alloc(dev->bus);
+	return 1; /* Don't iterate */
+}
+
+static int vfio_iommu_type1_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group, enum vfio_group_type type)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_iommu_group *group;
+	struct vfio_domain *domain, *d;
+	bool resv_msi;
+	phys_addr_t resv_msi_base = 0;
+	struct iommu_domain_geometry *geo;
+	LIST_HEAD(iova_copy);
+	LIST_HEAD(group_resv_regions);
+	int ret = -EBUSY;
+
+	mutex_lock(&iommu->lock);
+
+	/* Attach could require pinning, so disallow while vaddr is invalid. */
+	if (iommu->vaddr_invalid_count)
+		goto out_unlock;
+
+	/* Check for duplicates */
+	ret = -EINVAL;
+	if (vfio_iommu_find_iommu_group(iommu, iommu_group))
+		goto out_unlock;
+
+	ret = -ENOMEM;
+	group = kzalloc(sizeof(*group), GFP_KERNEL);
+	if (!group)
+		goto out_unlock;
+	group->iommu_group = iommu_group;
+
+	if (type == VFIO_EMULATED_IOMMU) {
+		list_add(&group->next, &iommu->emulated_iommu_groups);
+		/*
+		 * An emulated IOMMU group cannot dirty memory directly, it can
+		 * only use interfaces that provide dirty tracking.
+		 * The iommu scope can only be promoted with the addition of a
+		 * dirty tracking group.
+		 */
+		group->pinned_page_dirty_scope = true;
+		ret = 0;
+		goto out_unlock;
+	}
+
+	ret = -ENOMEM;
+	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+	if (!domain)
+		goto out_free_group;
+
+	/*
+	 * Going via the iommu_group iterator avoids races, and trivially gives
+	 * us a representative device for the IOMMU API call. We don't actually
+	 * want to iterate beyond the first device (if any).
+	 */
+	ret = -EIO;
+	iommu_group_for_each_dev(iommu_group, &domain->domain,
+				 vfio_iommu_domain_alloc);
+	if (!domain->domain)
+		goto out_free_domain;
+
+	if (iommu->nesting) {
+		ret = iommu_enable_nesting(domain->domain);
+		if (ret)
+			goto out_domain;
+	}
+
+	ret = iommu_attach_group(domain->domain, group->iommu_group);
+	if (ret)
+		goto out_domain;
+
+	/* Get aperture info */
+	geo = &domain->domain->geometry;
+	if (vfio_iommu_aper_conflict(iommu, geo->aperture_start,
+				     geo->aperture_end)) {
+		ret = -EINVAL;
+		goto out_detach;
+	}
+
+	ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
+	if (ret)
+		goto out_detach;
+
+	if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) {
+		ret = -EINVAL;
+		goto out_detach;
+	}
+
+	/*
+	 * We don't want to work on the original iova list as the list
+	 * gets modified and in case of failure we have to retain the
+	 * original list. Get a copy here.
+	 */
+	ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
+	if (ret)
+		goto out_detach;
+
+	ret = vfio_iommu_aper_resize(&iova_copy, geo->aperture_start,
+				     geo->aperture_end);
+	if (ret)
+		goto out_detach;
+
+	ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions);
+	if (ret)
+		goto out_detach;
+
+	resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);
+
+	INIT_LIST_HEAD(&domain->group_list);
+	list_add(&group->next, &domain->group_list);
+
+	if (!allow_unsafe_interrupts &&
+	    !iommu_group_has_isolated_msi(iommu_group)) {
+		pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
+		       __func__);
+		ret = -EPERM;
+		goto out_detach;
+	}
+
+	/*
+	 * If the IOMMU can block non-coherent operations (ie PCIe TLPs with
+	 * no-snoop set) then VFIO always turns this feature on because on Intel
+	 * platforms it optimizes KVM to disable wbinvd emulation.
+	 */
+	if (domain->domain->ops->enforce_cache_coherency)
+		domain->enforce_cache_coherency =
+			domain->domain->ops->enforce_cache_coherency(
+				domain->domain);
+
+	/*
+	 * Try to match an existing compatible domain.  We don't want to
+	 * preclude an IOMMU driver supporting multiple bus_types and being
+	 * able to include different bus_types in the same IOMMU domain, so
+	 * we test whether the domains use the same iommu_ops rather than
+	 * testing if they're on the same bus_type.
+	 */
+	list_for_each_entry(d, &iommu->domain_list, next) {
+		if (d->domain->ops == domain->domain->ops &&
+		    d->enforce_cache_coherency ==
+			    domain->enforce_cache_coherency) {
+			iommu_detach_group(domain->domain, group->iommu_group);
+			if (!iommu_attach_group(d->domain,
+						group->iommu_group)) {
+				list_add(&group->next, &d->group_list);
+				iommu_domain_free(domain->domain);
+				kfree(domain);
+				goto done;
+			}
+
+			ret = iommu_attach_group(domain->domain,
+						 group->iommu_group);
+			if (ret)
+				goto out_domain;
+		}
+	}
+
+	vfio_test_domain_fgsp(domain, &iova_copy);
+
+	/* replay mappings on new domains */
+	ret = vfio_iommu_replay(iommu, domain);
+	if (ret)
+		goto out_detach;
+
+	if (resv_msi) {
+		ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
+		if (ret && ret != -ENODEV)
+			goto out_detach;
+	}
+
+	list_add(&domain->next, &iommu->domain_list);
+	vfio_update_pgsize_bitmap(iommu);
+done:
+	/* Delete the old one and insert new iova list */
+	vfio_iommu_iova_insert_copy(iommu, &iova_copy);
+
+	/*
+	 * An iommu backed group can dirty memory directly and therefore
+	 * demotes the iommu scope until it declares itself dirty tracking
+	 * capable via the page pinning interface.
+	 */
+	iommu->num_non_pinned_groups++;
+	mutex_unlock(&iommu->lock);
+	vfio_iommu_resv_free(&group_resv_regions);
+
+	return 0;
+
+out_detach:
+	iommu_detach_group(domain->domain, group->iommu_group);
+out_domain:
+	iommu_domain_free(domain->domain);
+	vfio_iommu_iova_free(&iova_copy);
+	vfio_iommu_resv_free(&group_resv_regions);
+out_free_domain:
+	kfree(domain);
+out_free_group:
+	kfree(group);
+out_unlock:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
+{
+	struct rb_node *node;
+
+	while ((node = rb_first(&iommu->dma_list)))
+		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
+}
+
+static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
+{
+	struct rb_node *n, *p;
+
+	n = rb_first(&iommu->dma_list);
+	for (; n; n = rb_next(n)) {
+		struct vfio_dma *dma;
+		long locked = 0, unlocked = 0;
+
+		dma = rb_entry(n, struct vfio_dma, node);
+		unlocked += vfio_unmap_unpin(iommu, dma, false);
+		p = rb_first(&dma->pfn_list);
+		for (; p; p = rb_next(p)) {
+			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
+							 node);
+
+			if (!is_invalid_reserved_pfn(vpfn->pfn))
+				locked++;
+		}
+		vfio_lock_acct(dma, locked - unlocked, true);
+	}
+}
+
+/*
+ * Called when a domain is removed in detach. It is possible that
+ * the removed domain decided the iova aperture window. Modify the
+ * iova aperture with the smallest window among existing domains.
+ */
+static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
+				   struct list_head *iova_copy)
+{
+	struct vfio_domain *domain;
+	struct vfio_iova *node;
+	dma_addr_t start = 0;
+	dma_addr_t end = (dma_addr_t)~0;
+
+	if (list_empty(iova_copy))
+		return;
+
+	list_for_each_entry(domain, &iommu->domain_list, next) {
+		struct iommu_domain_geometry *geo = &domain->domain->geometry;
+
+		if (geo->aperture_start > start)
+			start = geo->aperture_start;
+		if (geo->aperture_end < end)
+			end = geo->aperture_end;
+	}
+
+	/* Modify aperture limits. The new aper is either same or bigger */
+	node = list_first_entry(iova_copy, struct vfio_iova, list);
+	node->start = start;
+	node = list_last_entry(iova_copy, struct vfio_iova, list);
+	node->end = end;
+}
+
+/*
+ * Called when a group is detached. The reserved regions for that
+ * group can be part of valid iova now. But since reserved regions
+ * may be duplicated among groups, populate the iova valid regions
+ * list again.
+ */
+static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
+				   struct list_head *iova_copy)
+{
+	struct vfio_domain *d;
+	struct vfio_iommu_group *g;
+	struct vfio_iova *node;
+	dma_addr_t start, end;
+	LIST_HEAD(resv_regions);
+	int ret;
+
+	if (list_empty(iova_copy))
+		return -EINVAL;
+
+	list_for_each_entry(d, &iommu->domain_list, next) {
+		list_for_each_entry(g, &d->group_list, next) {
+			ret = iommu_get_group_resv_regions(g->iommu_group,
+							   &resv_regions);
+			if (ret)
+				goto done;
+		}
+	}
+
+	node = list_first_entry(iova_copy, struct vfio_iova, list);
+	start = node->start;
+	node = list_last_entry(iova_copy, struct vfio_iova, list);
+	end = node->end;
+
+	/* purge the iova list and create new one */
+	vfio_iommu_iova_free(iova_copy);
+
+	ret = vfio_iommu_aper_resize(iova_copy, start, end);
+	if (ret)
+		goto done;
+
+	/* Exclude current reserved regions from iova ranges */
+	ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
+done:
+	vfio_iommu_resv_free(&resv_regions);
+	return ret;
+}
+
+static void vfio_iommu_type1_detach_group(void *iommu_data,
+					  struct iommu_group *iommu_group)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_domain *domain;
+	struct vfio_iommu_group *group;
+	bool update_dirty_scope = false;
+	LIST_HEAD(iova_copy);
+
+	mutex_lock(&iommu->lock);
+	list_for_each_entry(group, &iommu->emulated_iommu_groups, next) {
+		if (group->iommu_group != iommu_group)
+			continue;
+		update_dirty_scope = !group->pinned_page_dirty_scope;
+		list_del(&group->next);
+		kfree(group);
+
+		if (list_empty(&iommu->emulated_iommu_groups) &&
+		    list_empty(&iommu->domain_list)) {
+			WARN_ON(!list_empty(&iommu->device_list));
+			vfio_iommu_unmap_unpin_all(iommu);
+		}
+		goto detach_group_done;
+	}
+
+	/*
+	 * Get a copy of iova list. This will be used to update
+	 * and to replace the current one later. Please note that
+	 * we will leave the original list as it is if update fails.
+	 */
+	vfio_iommu_iova_get_copy(iommu, &iova_copy);
+
+	list_for_each_entry(domain, &iommu->domain_list, next) {
+		group = find_iommu_group(domain, iommu_group);
+		if (!group)
+			continue;
+
+		iommu_detach_group(domain->domain, group->iommu_group);
+		update_dirty_scope = !group->pinned_page_dirty_scope;
+		list_del(&group->next);
+		kfree(group);
+		/*
+		 * Group ownership provides privilege, if the group list is
+		 * empty, the domain goes away. If it's the last domain with
+		 * iommu and external domain doesn't exist, then all the
+		 * mappings go away too. If it's the last domain with iommu and
+		 * external domain exist, update accounting
+		 */
+		if (list_empty(&domain->group_list)) {
+			if (list_is_singular(&iommu->domain_list)) {
+				if (list_empty(&iommu->emulated_iommu_groups)) {
+					WARN_ON(!list_empty(
+						&iommu->device_list));
+					vfio_iommu_unmap_unpin_all(iommu);
+				} else {
+					vfio_iommu_unmap_unpin_reaccount(iommu);
+				}
+			}
+			iommu_domain_free(domain->domain);
+			list_del(&domain->next);
+			kfree(domain);
+			vfio_iommu_aper_expand(iommu, &iova_copy);
+			vfio_update_pgsize_bitmap(iommu);
+		}
+		break;
+	}
+
+	if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
+		vfio_iommu_iova_insert_copy(iommu, &iova_copy);
+	else
+		vfio_iommu_iova_free(&iova_copy);
+
+detach_group_done:
+	/*
+	 * Removal of a group without dirty tracking may allow the iommu scope
+	 * to be promoted.
+	 */
+	if (update_dirty_scope) {
+		iommu->num_non_pinned_groups--;
+		if (iommu->dirty_page_tracking)
+			vfio_iommu_populate_bitmap_full(iommu);
+	}
+	mutex_unlock(&iommu->lock);
+}
+
+static void *vfio_iommu_type1_open(unsigned long arg)
+{
+	struct vfio_iommu *iommu;
+
+	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
+	if (!iommu)
+		return ERR_PTR(-ENOMEM);
+
+	switch (arg) {
+	case VFIO_TYPE1_IOMMU:
+		break;
+	case VFIO_TYPE1_NESTING_IOMMU:
+		iommu->nesting = true;
+		fallthrough;
+	case VFIO_TYPE1v2_IOMMU:
+		iommu->v2 = true;
+		break;
+	default:
+		kfree(iommu);
+		return ERR_PTR(-EINVAL);
+	}
+
+	INIT_LIST_HEAD(&iommu->domain_list);
+	INIT_LIST_HEAD(&iommu->iova_list);
+	iommu->dma_list = RB_ROOT;
+	iommu->dma_avail = dma_entry_limit;
+	mutex_init(&iommu->lock);
+	mutex_init(&iommu->device_list_lock);
+	INIT_LIST_HEAD(&iommu->device_list);
+	iommu->pgsize_bitmap = PAGE_MASK;
+	INIT_LIST_HEAD(&iommu->emulated_iommu_groups);
+
+	return iommu;
+}
+
+static void vfio_release_domain(struct vfio_domain *domain)
+{
+	struct vfio_iommu_group *group, *group_tmp;
+
+	list_for_each_entry_safe(group, group_tmp,
+				 &domain->group_list, next) {
+		iommu_detach_group(domain->domain, group->iommu_group);
+		list_del(&group->next);
+		kfree(group);
+	}
+
+	iommu_domain_free(domain->domain);
+}
+
+static void vfio_iommu_type1_release(void *iommu_data)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_domain *domain, *domain_tmp;
+	struct vfio_iommu_group *group, *next_group;
+
+	list_for_each_entry_safe(group, next_group,
+			&iommu->emulated_iommu_groups, next) {
+		list_del(&group->next);
+		kfree(group);
+	}
+
+	vfio_iommu_unmap_unpin_all(iommu);
+
+	list_for_each_entry_safe(domain, domain_tmp,
+				 &iommu->domain_list, next) {
+		vfio_release_domain(domain);
+		list_del(&domain->next);
+		kfree(domain);
+	}
+
+	vfio_iommu_iova_free(&iommu->iova_list);
+
+	kfree(iommu);
+}
+
+static int vfio_domains_have_enforce_cache_coherency(struct vfio_iommu *iommu)
+{
+	struct vfio_domain *domain;
+	int ret = 1;
+
+	mutex_lock(&iommu->lock);
+	list_for_each_entry(domain, &iommu->domain_list, next) {
+		if (!(domain->enforce_cache_coherency)) {
+			ret = 0;
+			break;
+		}
+	}
+	mutex_unlock(&iommu->lock);
+
+	return ret;
+}
+
+static bool vfio_iommu_has_emulated(struct vfio_iommu *iommu)
+{
+	bool ret;
+
+	mutex_lock(&iommu->lock);
+	ret = !list_empty(&iommu->emulated_iommu_groups);
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
+					    unsigned long arg)
+{
+	switch (arg) {
+	case VFIO_TYPE1_IOMMU:
+	case VFIO_TYPE1v2_IOMMU:
+	case VFIO_TYPE1_NESTING_IOMMU:
+	case VFIO_UNMAP_ALL:
+		return 1;
+	case VFIO_UPDATE_VADDR:
+		/*
+		 * Disable this feature if mdevs are present.  They cannot
+		 * safely pin/unpin/rw while vaddrs are being updated.
+		 */
+		return iommu && !vfio_iommu_has_emulated(iommu);
+	case VFIO_DMA_CC_IOMMU:
+		if (!iommu)
+			return 0;
+		return vfio_domains_have_enforce_cache_coherency(iommu);
+	default:
+		return 0;
+	}
+}
+
+static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
+		 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
+		 size_t size)
+{
+	struct vfio_info_cap_header *header;
+	struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
+
+	header = vfio_info_cap_add(caps, size,
+				   VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
+	if (IS_ERR(header))
+		return PTR_ERR(header);
+
+	iova_cap = container_of(header,
+				struct vfio_iommu_type1_info_cap_iova_range,
+				header);
+	iova_cap->nr_iovas = cap_iovas->nr_iovas;
+	memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
+	       cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
+	return 0;
+}
+
+static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
+				      struct vfio_info_cap *caps)
+{
+	struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
+	struct vfio_iova *iova;
+	size_t size;
+	int iovas = 0, i = 0, ret;
+
+	list_for_each_entry(iova, &iommu->iova_list, list)
+		iovas++;
+
+	if (!iovas) {
+		/*
+		 * Return 0 as a container with a single mdev device
+		 * will have an empty list
+		 */
+		return 0;
+	}
+
+	size = struct_size(cap_iovas, iova_ranges, iovas);
+
+	cap_iovas = kzalloc(size, GFP_KERNEL);
+	if (!cap_iovas)
+		return -ENOMEM;
+
+	cap_iovas->nr_iovas = iovas;
+
+	list_for_each_entry(iova, &iommu->iova_list, list) {
+		cap_iovas->iova_ranges[i].start = iova->start;
+		cap_iovas->iova_ranges[i].end = iova->end;
+		i++;
+	}
+
+	ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
+
+	kfree(cap_iovas);
+	return ret;
+}
+
+static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
+					   struct vfio_info_cap *caps)
+{
+	struct vfio_iommu_type1_info_cap_migration cap_mig = {};
+
+	cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
+	cap_mig.header.version = 1;
+
+	cap_mig.flags = 0;
+	/* support minimum pgsize */
+	cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
+	cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
+
+	return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
+}
+
+static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu,
+					   struct vfio_info_cap *caps)
+{
+	struct vfio_iommu_type1_info_dma_avail cap_dma_avail;
+
+	cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL;
+	cap_dma_avail.header.version = 1;
+
+	cap_dma_avail.avail = iommu->dma_avail;
+
+	return vfio_info_add_capability(caps, &cap_dma_avail.header,
+					sizeof(cap_dma_avail));
+}
+
+static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
+				     unsigned long arg)
+{
+	struct vfio_iommu_type1_info info = {};
+	unsigned long minsz;
+	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+	int ret;
+
+	minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
+
+	if (copy_from_user(&info, (void __user *)arg, minsz))
+		return -EFAULT;
+
+	if (info.argsz < minsz)
+		return -EINVAL;
+
+	minsz = min_t(size_t, info.argsz, sizeof(info));
+
+	mutex_lock(&iommu->lock);
+	info.flags = VFIO_IOMMU_INFO_PGSIZES;
+
+	info.iova_pgsizes = iommu->pgsize_bitmap;
+
+	ret = vfio_iommu_migration_build_caps(iommu, &caps);
+
+	if (!ret)
+		ret = vfio_iommu_dma_avail_build_caps(iommu, &caps);
+
+	if (!ret)
+		ret = vfio_iommu_iova_build_caps(iommu, &caps);
+
+	mutex_unlock(&iommu->lock);
+
+	if (ret)
+		return ret;
+
+	if (caps.size) {
+		info.flags |= VFIO_IOMMU_INFO_CAPS;
+
+		if (info.argsz < sizeof(info) + caps.size) {
+			info.argsz = sizeof(info) + caps.size;
+		} else {
+			vfio_info_cap_shift(&caps, sizeof(info));
+			if (copy_to_user((void __user *)arg +
+					sizeof(info), caps.buf,
+					caps.size)) {
+				kfree(caps.buf);
+				return -EFAULT;
+			}
+			info.cap_offset = sizeof(info);
+		}
+
+		kfree(caps.buf);
+	}
+
+	return copy_to_user((void __user *)arg, &info, minsz) ?
+			-EFAULT : 0;
+}
+
+static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
+				    unsigned long arg)
+{
+	struct vfio_iommu_type1_dma_map map;
+	unsigned long minsz;
+	uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE |
+			VFIO_DMA_MAP_FLAG_VADDR;
+
+	minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
+
+	if (copy_from_user(&map, (void __user *)arg, minsz))
+		return -EFAULT;
+
+	if (map.argsz < minsz || map.flags & ~mask)
+		return -EINVAL;
+
+	return vfio_dma_do_map(iommu, &map);
+}
+
+static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
+				      unsigned long arg)
+{
+	struct vfio_iommu_type1_dma_unmap unmap;
+	struct vfio_bitmap bitmap = { 0 };
+	uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP |
+			VFIO_DMA_UNMAP_FLAG_VADDR |
+			VFIO_DMA_UNMAP_FLAG_ALL;
+	unsigned long minsz;
+	int ret;
+
+	minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
+
+	if (copy_from_user(&unmap, (void __user *)arg, minsz))
+		return -EFAULT;
+
+	if (unmap.argsz < minsz || unmap.flags & ~mask)
+		return -EINVAL;
+
+	if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+	    (unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL |
+			    VFIO_DMA_UNMAP_FLAG_VADDR)))
+		return -EINVAL;
+
+	if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
+		unsigned long pgshift;
+
+		if (unmap.argsz < (minsz + sizeof(bitmap)))
+			return -EINVAL;
+
+		if (copy_from_user(&bitmap,
+				   (void __user *)(arg + minsz),
+				   sizeof(bitmap)))
+			return -EFAULT;
+
+		if (!access_ok((void __user *)bitmap.data, bitmap.size))
+			return -EINVAL;
+
+		pgshift = __ffs(bitmap.pgsize);
+		ret = verify_bitmap_size(unmap.size >> pgshift,
+					 bitmap.size);
+		if (ret)
+			return ret;
+	}
+
+	ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
+	if (ret)
+		return ret;
+
+	return copy_to_user((void __user *)arg, &unmap, minsz) ?
+			-EFAULT : 0;
+}
+
+static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
+					unsigned long arg)
+{
+	struct vfio_iommu_type1_dirty_bitmap dirty;
+	uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
+			VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
+			VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
+	unsigned long minsz;
+	int ret = 0;
+
+	if (!iommu->v2)
+		return -EACCES;
+
+	minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags);
+
+	if (copy_from_user(&dirty, (void __user *)arg, minsz))
+		return -EFAULT;
+
+	if (dirty.argsz < minsz || dirty.flags & ~mask)
+		return -EINVAL;
+
+	/* only one flag should be set at a time */
+	if (__ffs(dirty.flags) != __fls(dirty.flags))
+		return -EINVAL;
+
+	if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
+		size_t pgsize;
+
+		mutex_lock(&iommu->lock);
+		pgsize = 1 << __ffs(iommu->pgsize_bitmap);
+		if (!iommu->dirty_page_tracking) {
+			ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
+			if (!ret)
+				iommu->dirty_page_tracking = true;
+		}
+		mutex_unlock(&iommu->lock);
+		return ret;
+	} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
+		mutex_lock(&iommu->lock);
+		if (iommu->dirty_page_tracking) {
+			iommu->dirty_page_tracking = false;
+			vfio_dma_bitmap_free_all(iommu);
+		}
+		mutex_unlock(&iommu->lock);
+		return 0;
+	} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
+		struct vfio_iommu_type1_dirty_bitmap_get range;
+		unsigned long pgshift;
+		size_t data_size = dirty.argsz - minsz;
+		size_t iommu_pgsize;
+
+		if (!data_size || data_size < sizeof(range))
+			return -EINVAL;
+
+		if (copy_from_user(&range, (void __user *)(arg + minsz),
+				   sizeof(range)))
+			return -EFAULT;
+
+		if (range.iova + range.size < range.iova)
+			return -EINVAL;
+		if (!access_ok((void __user *)range.bitmap.data,
+			       range.bitmap.size))
+			return -EINVAL;
+
+		pgshift = __ffs(range.bitmap.pgsize);
+		ret = verify_bitmap_size(range.size >> pgshift,
+					 range.bitmap.size);
+		if (ret)
+			return ret;
+
+		mutex_lock(&iommu->lock);
+
+		iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
+
+		/* allow only smallest supported pgsize */
+		if (range.bitmap.pgsize != iommu_pgsize) {
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+		if (range.iova & (iommu_pgsize - 1)) {
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+		if (!range.size || range.size & (iommu_pgsize - 1)) {
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+
+		if (iommu->dirty_page_tracking)
+			ret = vfio_iova_dirty_bitmap(range.bitmap.data,
+						     iommu, range.iova,
+						     range.size,
+						     range.bitmap.pgsize);
+		else
+			ret = -EINVAL;
+out_unlock:
+		mutex_unlock(&iommu->lock);
+
+		return ret;
+	}
+
+	return -EINVAL;
+}
+
+static long vfio_iommu_type1_ioctl(void *iommu_data,
+				   unsigned int cmd, unsigned long arg)
+{
+	struct vfio_iommu *iommu = iommu_data;
+
+	switch (cmd) {
+	case VFIO_CHECK_EXTENSION:
+		return vfio_iommu_type1_check_extension(iommu, arg);
+	case VFIO_IOMMU_GET_INFO:
+		return vfio_iommu_type1_get_info(iommu, arg);
+	case VFIO_IOMMU_MAP_DMA:
+		return vfio_iommu_type1_map_dma(iommu, arg);
+	case VFIO_IOMMU_UNMAP_DMA:
+		return vfio_iommu_type1_unmap_dma(iommu, arg);
+	case VFIO_IOMMU_DIRTY_PAGES:
+		return vfio_iommu_type1_dirty_pages(iommu, arg);
+	default:
+		return -ENOTTY;
+	}
+}
+
+static void vfio_iommu_type1_register_device(void *iommu_data,
+					     struct vfio_device *vdev)
+{
+	struct vfio_iommu *iommu = iommu_data;
+
+	if (!vdev->ops->dma_unmap)
+		return;
+
+	/*
+	 * list_empty(&iommu->device_list) is tested under the iommu->lock while
+	 * iteration for dma_unmap must be done under the device_list_lock.
+	 * Holding both locks here allows avoiding the device_list_lock in
+	 * several fast paths. See vfio_notify_dma_unmap()
+	 */
+	mutex_lock(&iommu->lock);
+	mutex_lock(&iommu->device_list_lock);
+	list_add(&vdev->iommu_entry, &iommu->device_list);
+	mutex_unlock(&iommu->device_list_lock);
+	mutex_unlock(&iommu->lock);
+}
+
+static void vfio_iommu_type1_unregister_device(void *iommu_data,
+					       struct vfio_device *vdev)
+{
+	struct vfio_iommu *iommu = iommu_data;
+
+	if (!vdev->ops->dma_unmap)
+		return;
+
+	mutex_lock(&iommu->lock);
+	mutex_lock(&iommu->device_list_lock);
+	list_del(&vdev->iommu_entry);
+	mutex_unlock(&iommu->device_list_lock);
+	mutex_unlock(&iommu->lock);
+}
+
+static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
+					 dma_addr_t user_iova, void *data,
+					 size_t count, bool write,
+					 size_t *copied)
+{
+	struct mm_struct *mm;
+	unsigned long vaddr;
+	struct vfio_dma *dma;
+	bool kthread = current->mm == NULL;
+	size_t offset;
+
+	*copied = 0;
+
+	dma = vfio_find_dma(iommu, user_iova, 1);
+	if (!dma)
+		return -EINVAL;
+
+	if ((write && !(dma->prot & IOMMU_WRITE)) ||
+			!(dma->prot & IOMMU_READ))
+		return -EPERM;
+
+	mm = dma->mm;
+	if (!mmget_not_zero(mm))
+		return -EPERM;
+
+	if (kthread)
+		kthread_use_mm(mm);
+	else if (current->mm != mm)
+		goto out;
+
+	offset = user_iova - dma->iova;
+
+	if (count > dma->size - offset)
+		count = dma->size - offset;
+
+	vaddr = dma->vaddr + offset;
+
+	if (write) {
+		*copied = copy_to_user((void __user *)vaddr, data,
+					 count) ? 0 : count;
+		if (*copied && iommu->dirty_page_tracking) {
+			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
+			/*
+			 * Bitmap populated with the smallest supported page
+			 * size
+			 */
+			bitmap_set(dma->bitmap, offset >> pgshift,
+				   ((offset + *copied - 1) >> pgshift) -
+				   (offset >> pgshift) + 1);
+		}
+	} else
+		*copied = copy_from_user(data, (void __user *)vaddr,
+					   count) ? 0 : count;
+	if (kthread)
+		kthread_unuse_mm(mm);
+out:
+	mmput(mm);
+	return *copied ? 0 : -EFAULT;
+}
+
+static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
+				   void *data, size_t count, bool write)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	int ret = 0;
+	size_t done;
+
+	mutex_lock(&iommu->lock);
+
+	if (WARN_ONCE(iommu->vaddr_invalid_count,
+		      "vfio_dma_rw not allowed with VFIO_UPDATE_VADDR\n")) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	while (count > 0) {
+		ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data,
+						    count, write, &done);
+		if (ret)
+			break;
+
+		count -= done;
+		data += done;
+		user_iova += done;
+	}
+
+out:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+static struct iommu_domain *
+vfio_iommu_type1_group_iommu_domain(void *iommu_data,
+				    struct iommu_group *iommu_group)
+{
+	struct iommu_domain *domain = ERR_PTR(-ENODEV);
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_domain *d;
+
+	if (!iommu || !iommu_group)
+		return ERR_PTR(-EINVAL);
+
+	mutex_lock(&iommu->lock);
+	list_for_each_entry(d, &iommu->domain_list, next) {
+		if (find_iommu_group(d, iommu_group)) {
+			domain = d->domain;
+			break;
+		}
+	}
+	mutex_unlock(&iommu->lock);
+
+	return domain;
+}
+
+static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
+	.name			= "vfio-iommu-type1",
+	.owner			= THIS_MODULE,
+	.open			= vfio_iommu_type1_open,
+	.release		= vfio_iommu_type1_release,
+	.ioctl			= vfio_iommu_type1_ioctl,
+	.attach_group		= vfio_iommu_type1_attach_group,
+	.detach_group		= vfio_iommu_type1_detach_group,
+	.pin_pages		= vfio_iommu_type1_pin_pages,
+	.unpin_pages		= vfio_iommu_type1_unpin_pages,
+	.register_device	= vfio_iommu_type1_register_device,
+	.unregister_device	= vfio_iommu_type1_unregister_device,
+	.dma_rw			= vfio_iommu_type1_dma_rw,
+	.group_iommu_domain	= vfio_iommu_type1_group_iommu_domain,
+};
+
+static int __init vfio_iommu_type1_init(void)
+{
+	return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
+}
+
+static void __exit vfio_iommu_type1_cleanup(void)
+{
+	vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
+}
+
+module_init(vfio_iommu_type1_init);
+module_exit(vfio_iommu_type1_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
new file mode 100644
index 0000000000..40732e8ed4
--- /dev/null
+++ b/drivers/vfio/vfio_main.c
@@ -0,0 +1,1700 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VFIO core
+ *
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ */
+
+#include <linux/cdev.h>
+#include <linux/compat.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/idr.h>
+#include <linux/iommu.h>
+#ifdef CONFIG_HAVE_KVM
+#include <linux/kvm_host.h>
+#endif
+#include <linux/list.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/wait.h>
+#include <linux/sched/signal.h>
+#include <linux/pm_runtime.h>
+#include <linux/interval_tree.h>
+#include <linux/iova_bitmap.h>
+#include <linux/iommufd.h>
+#include "vfio.h"
+
+#define DRIVER_VERSION	"0.3"
+#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
+#define DRIVER_DESC	"VFIO - User Level meta-driver"
+
+static struct vfio {
+	struct class			*device_class;
+	struct ida			device_ida;
+} vfio;
+
+#ifdef CONFIG_VFIO_NOIOMMU
+bool vfio_noiommu __read_mostly;
+module_param_named(enable_unsafe_noiommu_mode,
+		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
+#endif
+
+static DEFINE_XARRAY(vfio_device_set_xa);
+
+int vfio_assign_device_set(struct vfio_device *device, void *set_id)
+{
+	unsigned long idx = (unsigned long)set_id;
+	struct vfio_device_set *new_dev_set;
+	struct vfio_device_set *dev_set;
+
+	if (WARN_ON(!set_id))
+		return -EINVAL;
+
+	/*
+	 * Atomically acquire a singleton object in the xarray for this set_id
+	 */
+	xa_lock(&vfio_device_set_xa);
+	dev_set = xa_load(&vfio_device_set_xa, idx);
+	if (dev_set)
+		goto found_get_ref;
+	xa_unlock(&vfio_device_set_xa);
+
+	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
+	if (!new_dev_set)
+		return -ENOMEM;
+	mutex_init(&new_dev_set->lock);
+	INIT_LIST_HEAD(&new_dev_set->device_list);
+	new_dev_set->set_id = set_id;
+
+	xa_lock(&vfio_device_set_xa);
+	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
+			       GFP_KERNEL);
+	if (!dev_set) {
+		dev_set = new_dev_set;
+		goto found_get_ref;
+	}
+
+	kfree(new_dev_set);
+	if (xa_is_err(dev_set)) {
+		xa_unlock(&vfio_device_set_xa);
+		return xa_err(dev_set);
+	}
+
+found_get_ref:
+	dev_set->device_count++;
+	xa_unlock(&vfio_device_set_xa);
+	mutex_lock(&dev_set->lock);
+	device->dev_set = dev_set;
+	list_add_tail(&device->dev_set_list, &dev_set->device_list);
+	mutex_unlock(&dev_set->lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_assign_device_set);
+
+static void vfio_release_device_set(struct vfio_device *device)
+{
+	struct vfio_device_set *dev_set = device->dev_set;
+
+	if (!dev_set)
+		return;
+
+	mutex_lock(&dev_set->lock);
+	list_del(&device->dev_set_list);
+	mutex_unlock(&dev_set->lock);
+
+	xa_lock(&vfio_device_set_xa);
+	if (!--dev_set->device_count) {
+		__xa_erase(&vfio_device_set_xa,
+			   (unsigned long)dev_set->set_id);
+		mutex_destroy(&dev_set->lock);
+		kfree(dev_set);
+	}
+	xa_unlock(&vfio_device_set_xa);
+}
+
+unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
+{
+	struct vfio_device *cur;
+	unsigned int open_count = 0;
+
+	lockdep_assert_held(&dev_set->lock);
+
+	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
+		open_count += cur->open_count;
+	return open_count;
+}
+EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
+
+struct vfio_device *
+vfio_find_device_in_devset(struct vfio_device_set *dev_set,
+			   struct device *dev)
+{
+	struct vfio_device *cur;
+
+	lockdep_assert_held(&dev_set->lock);
+
+	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
+		if (cur->dev == dev)
+			return cur;
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);
+
+/*
+ * Device objects - create, release, get, put, search
+ */
+/* Device reference always implies a group reference */
+void vfio_device_put_registration(struct vfio_device *device)
+{
+	if (refcount_dec_and_test(&device->refcount))
+		complete(&device->comp);
+}
+
+bool vfio_device_try_get_registration(struct vfio_device *device)
+{
+	return refcount_inc_not_zero(&device->refcount);
+}
+
+/*
+ * VFIO driver API
+ */
+/* Release helper called by vfio_put_device() */
+static void vfio_device_release(struct device *dev)
+{
+	struct vfio_device *device =
+			container_of(dev, struct vfio_device, device);
+
+	vfio_release_device_set(device);
+	ida_free(&vfio.device_ida, device->index);
+
+	if (device->ops->release)
+		device->ops->release(device);
+
+	kvfree(device);
+}
+
+static int vfio_init_device(struct vfio_device *device, struct device *dev,
+			    const struct vfio_device_ops *ops);
+
+/*
+ * Allocate and initialize vfio_device so it can be registered to vfio
+ * core.
+ *
+ * Drivers should use the wrapper vfio_alloc_device() for allocation.
+ * @size is the size of the structure to be allocated, including any
+ * private data used by the driver.
+ *
+ * Driver may provide an @init callback to cover device private data.
+ *
+ * Use vfio_put_device() to release the structure after success return.
+ */
+struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
+				       const struct vfio_device_ops *ops)
+{
+	struct vfio_device *device;
+	int ret;
+
+	if (WARN_ON(size < sizeof(struct vfio_device)))
+		return ERR_PTR(-EINVAL);
+
+	device = kvzalloc(size, GFP_KERNEL);
+	if (!device)
+		return ERR_PTR(-ENOMEM);
+
+	ret = vfio_init_device(device, dev, ops);
+	if (ret)
+		goto out_free;
+	return device;
+
+out_free:
+	kvfree(device);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(_vfio_alloc_device);
+
+/*
+ * Initialize a vfio_device so it can be registered to vfio core.
+ */
+static int vfio_init_device(struct vfio_device *device, struct device *dev,
+			    const struct vfio_device_ops *ops)
+{
+	int ret;
+
+	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
+	if (ret < 0) {
+		dev_dbg(dev, "Error to alloc index\n");
+		return ret;
+	}
+
+	device->index = ret;
+	init_completion(&device->comp);
+	device->dev = dev;
+	device->ops = ops;
+
+	if (ops->init) {
+		ret = ops->init(device);
+		if (ret)
+			goto out_uninit;
+	}
+
+	device_initialize(&device->device);
+	device->device.release = vfio_device_release;
+	device->device.class = vfio.device_class;
+	device->device.parent = device->dev;
+	return 0;
+
+out_uninit:
+	vfio_release_device_set(device);
+	ida_free(&vfio.device_ida, device->index);
+	return ret;
+}
+
+static int __vfio_register_dev(struct vfio_device *device,
+			       enum vfio_group_type type)
+{
+	int ret;
+
+	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
+		    (!device->ops->bind_iommufd ||
+		     !device->ops->unbind_iommufd ||
+		     !device->ops->attach_ioas ||
+		     !device->ops->detach_ioas)))
+		return -EINVAL;
+
+	/*
+	 * If the driver doesn't specify a set then the device is added to a
+	 * singleton set just for itself.
+	 */
+	if (!device->dev_set)
+		vfio_assign_device_set(device, device);
+
+	ret = dev_set_name(&device->device, "vfio%d", device->index);
+	if (ret)
+		return ret;
+
+	ret = vfio_device_set_group(device, type);
+	if (ret)
+		return ret;
+
+	/*
+	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
+	 * restore cache coherency. It has to be checked here because it is only
+	 * valid for cases where we are using iommu groups.
+	 */
+	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
+	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
+		ret = -EINVAL;
+		goto err_out;
+	}
+
+	ret = vfio_device_add(device);
+	if (ret)
+		goto err_out;
+
+	/* Refcounting can't start until the driver calls register */
+	refcount_set(&device->refcount, 1);
+
+	vfio_device_group_register(device);
+
+	return 0;
+err_out:
+	vfio_device_remove_group(device);
+	return ret;
+}
+
+int vfio_register_group_dev(struct vfio_device *device)
+{
+	return __vfio_register_dev(device, VFIO_IOMMU);
+}
+EXPORT_SYMBOL_GPL(vfio_register_group_dev);
+
+/*
+ * Register a virtual device without IOMMU backing.  The user of this
+ * device must not be able to directly trigger unmediated DMA.
+ */
+int vfio_register_emulated_iommu_dev(struct vfio_device *device)
+{
+	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
+}
+EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
+
+/*
+ * Decrement the device reference count and wait for the device to be
+ * removed.  Open file descriptors for the device... */
+void vfio_unregister_group_dev(struct vfio_device *device)
+{
+	unsigned int i = 0;
+	bool interrupted = false;
+	long rc;
+
+	/*
+	 * Prevent new device opened by userspace via the
+	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
+	 */
+	vfio_device_group_unregister(device);
+
+	/*
+	 * Balances vfio_device_add() in register path, also prevents
+	 * new device opened by userspace in the cdev path.
+	 */
+	vfio_device_del(device);
+
+	vfio_device_put_registration(device);
+	rc = try_wait_for_completion(&device->comp);
+	while (rc <= 0) {
+		if (device->ops->request)
+			device->ops->request(device, i++);
+
+		if (interrupted) {
+			rc = wait_for_completion_timeout(&device->comp,
+							 HZ * 10);
+		} else {
+			rc = wait_for_completion_interruptible_timeout(
+				&device->comp, HZ * 10);
+			if (rc < 0) {
+				interrupted = true;
+				dev_warn(device->dev,
+					 "Device is currently in use, task"
+					 " \"%s\" (%d) "
+					 "blocked until device is released",
+					 current->comm, task_pid_nr(current));
+			}
+		}
+	}
+
+	/* Balances vfio_device_set_group in register path */
+	vfio_device_remove_group(device);
+}
+EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
+
+#ifdef CONFIG_HAVE_KVM
+void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
+{
+	void (*pfn)(struct kvm *kvm);
+	bool (*fn)(struct kvm *kvm);
+	bool ret;
+
+	lockdep_assert_held(&device->dev_set->lock);
+
+	if (!kvm)
+		return;
+
+	pfn = symbol_get(kvm_put_kvm);
+	if (WARN_ON(!pfn))
+		return;
+
+	fn = symbol_get(kvm_get_kvm_safe);
+	if (WARN_ON(!fn)) {
+		symbol_put(kvm_put_kvm);
+		return;
+	}
+
+	ret = fn(kvm);
+	symbol_put(kvm_get_kvm_safe);
+	if (!ret) {
+		symbol_put(kvm_put_kvm);
+		return;
+	}
+
+	device->put_kvm = pfn;
+	device->kvm = kvm;
+}
+
+void vfio_device_put_kvm(struct vfio_device *device)
+{
+	lockdep_assert_held(&device->dev_set->lock);
+
+	if (!device->kvm)
+		return;
+
+	if (WARN_ON(!device->put_kvm))
+		goto clear;
+
+	device->put_kvm(device->kvm);
+	device->put_kvm = NULL;
+	symbol_put(kvm_put_kvm);
+
+clear:
+	device->kvm = NULL;
+}
+#endif
+
+/* true if the vfio_device has open_device() called but not close_device() */
+static bool vfio_assert_device_open(struct vfio_device *device)
+{
+	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
+}
+
+struct vfio_device_file *
+vfio_allocate_device_file(struct vfio_device *device)
+{
+	struct vfio_device_file *df;
+
+	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
+	if (!df)
+		return ERR_PTR(-ENOMEM);
+
+	df->device = device;
+	spin_lock_init(&df->kvm_ref_lock);
+
+	return df;
+}
+
+static int vfio_df_device_first_open(struct vfio_device_file *df)
+{
+	struct vfio_device *device = df->device;
+	struct iommufd_ctx *iommufd = df->iommufd;
+	int ret;
+
+	lockdep_assert_held(&device->dev_set->lock);
+
+	if (!try_module_get(device->dev->driver->owner))
+		return -ENODEV;
+
+	if (iommufd)
+		ret = vfio_df_iommufd_bind(df);
+	else
+		ret = vfio_device_group_use_iommu(device);
+	if (ret)
+		goto err_module_put;
+
+	if (device->ops->open_device) {
+		ret = device->ops->open_device(device);
+		if (ret)
+			goto err_unuse_iommu;
+	}
+	return 0;
+
+err_unuse_iommu:
+	if (iommufd)
+		vfio_df_iommufd_unbind(df);
+	else
+		vfio_device_group_unuse_iommu(device);
+err_module_put:
+	module_put(device->dev->driver->owner);
+	return ret;
+}
+
+static void vfio_df_device_last_close(struct vfio_device_file *df)
+{
+	struct vfio_device *device = df->device;
+	struct iommufd_ctx *iommufd = df->iommufd;
+
+	lockdep_assert_held(&device->dev_set->lock);
+
+	if (device->ops->close_device)
+		device->ops->close_device(device);
+	if (iommufd)
+		vfio_df_iommufd_unbind(df);
+	else
+		vfio_device_group_unuse_iommu(device);
+	module_put(device->dev->driver->owner);
+}
+
+int vfio_df_open(struct vfio_device_file *df)
+{
+	struct vfio_device *device = df->device;
+	int ret = 0;
+
+	lockdep_assert_held(&device->dev_set->lock);
+
+	/*
+	 * Only the group path allows the device to be opened multiple
+	 * times.  The device cdev path doesn't have a secure way for it.
+	 */
+	if (device->open_count != 0 && !df->group)
+		return -EINVAL;
+
+	device->open_count++;
+	if (device->open_count == 1) {
+		ret = vfio_df_device_first_open(df);
+		if (ret)
+			device->open_count--;
+	}
+
+	return ret;
+}
+
+void vfio_df_close(struct vfio_device_file *df)
+{
+	struct vfio_device *device = df->device;
+
+	lockdep_assert_held(&device->dev_set->lock);
+
+	vfio_assert_device_open(device);
+	if (device->open_count == 1)
+		vfio_df_device_last_close(df);
+	device->open_count--;
+}
+
+/*
+ * Wrapper around pm_runtime_resume_and_get().
+ * Return error code on failure or 0 on success.
+ */
+static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
+{
+	struct device *dev = device->dev;
+
+	if (dev->driver && dev->driver->pm) {
+		int ret;
+
+		ret = pm_runtime_resume_and_get(dev);
+		if (ret) {
+			dev_info_ratelimited(dev,
+				"vfio: runtime resume failed %d\n", ret);
+			return -EIO;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Wrapper around pm_runtime_put().
+ */
+static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
+{
+	struct device *dev = device->dev;
+
+	if (dev->driver && dev->driver->pm)
+		pm_runtime_put(dev);
+}
+
+/*
+ * VFIO Device fd
+ */
+static int vfio_device_fops_release(struct inode *inode, struct file *filep)
+{
+	struct vfio_device_file *df = filep->private_data;
+	struct vfio_device *device = df->device;
+
+	if (df->group)
+		vfio_df_group_close(df);
+	else
+		vfio_df_unbind_iommufd(df);
+
+	vfio_device_put_registration(device);
+
+	kfree(df);
+
+	return 0;
+}
+
+/*
+ * vfio_mig_get_next_state - Compute the next step in the FSM
+ * @cur_fsm - The current state the device is in
+ * @new_fsm - The target state to reach
+ * @next_fsm - Pointer to the next step to get to new_fsm
+ *
+ * Return 0 upon success, otherwise -errno
+ * Upon success the next step in the state progression between cur_fsm and
+ * new_fsm will be set in next_fsm.
+ *
+ * This breaks down requests for combination transitions into smaller steps and
+ * returns the next step to get to new_fsm. The function may need to be called
+ * multiple times before reaching new_fsm.
+ *
+ */
+int vfio_mig_get_next_state(struct vfio_device *device,
+			    enum vfio_device_mig_state cur_fsm,
+			    enum vfio_device_mig_state new_fsm,
+			    enum vfio_device_mig_state *next_fsm)
+{
+	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
+	/*
+	 * The coding in this table requires the driver to implement the
+	 * following FSM arcs:
+	 *         RESUMING -> STOP
+	 *         STOP -> RESUMING
+	 *         STOP -> STOP_COPY
+	 *         STOP_COPY -> STOP
+	 *
+	 * If P2P is supported then the driver must also implement these FSM
+	 * arcs:
+	 *         RUNNING -> RUNNING_P2P
+	 *         RUNNING_P2P -> RUNNING
+	 *         RUNNING_P2P -> STOP
+	 *         STOP -> RUNNING_P2P
+	 *
+	 * If precopy is supported then the driver must support these additional
+	 * FSM arcs:
+	 *         RUNNING -> PRE_COPY
+	 *         PRE_COPY -> RUNNING
+	 *         PRE_COPY -> STOP_COPY
+	 * However, if precopy and P2P are supported together then the driver
+	 * must support these additional arcs beyond the P2P arcs above:
+	 *         PRE_COPY -> RUNNING
+	 *         PRE_COPY -> PRE_COPY_P2P
+	 *         PRE_COPY_P2P -> PRE_COPY
+	 *         PRE_COPY_P2P -> RUNNING_P2P
+	 *         PRE_COPY_P2P -> STOP_COPY
+	 *         RUNNING -> PRE_COPY
+	 *         RUNNING_P2P -> PRE_COPY_P2P
+	 *
+	 * Without P2P and precopy the driver must implement:
+	 *         RUNNING -> STOP
+	 *         STOP -> RUNNING
+	 *
+	 * The coding will step through multiple states for some combination
+	 * transitions; if all optional features are supported, this means the
+	 * following ones:
+	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
+	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
+	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
+	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
+	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
+	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
+	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
+	 *         RESUMING -> STOP -> RUNNING_P2P
+	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
+	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
+	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
+	 *         RESUMING -> STOP -> STOP_COPY
+	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
+	 *         RUNNING -> RUNNING_P2P -> STOP
+	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
+	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
+	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
+	 *         RUNNING_P2P -> STOP -> RESUMING
+	 *         RUNNING_P2P -> STOP -> STOP_COPY
+	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
+	 *         STOP -> RUNNING_P2P -> RUNNING
+	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
+	 *         STOP_COPY -> STOP -> RESUMING
+	 *         STOP_COPY -> STOP -> RUNNING_P2P
+	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
+	 *
+	 *  The following transitions are blocked:
+	 *         STOP_COPY -> PRE_COPY
+	 *         STOP_COPY -> PRE_COPY_P2P
+	 */
+	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
+		[VFIO_DEVICE_STATE_STOP] = {
+			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
+			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
+			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
+			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
+			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
+			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
+			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
+			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
+		},
+		[VFIO_DEVICE_STATE_RUNNING] = {
+			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
+			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
+			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
+			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
+			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
+			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
+			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
+			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
+		},
+		[VFIO_DEVICE_STATE_PRE_COPY] = {
+			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
+			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
+			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
+			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
+			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
+			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
+			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
+			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
+		},
+		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
+			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
+			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
+			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
+			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
+			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
+			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
+			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
+			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
+		},
+		[VFIO_DEVICE_STATE_STOP_COPY] = {
+			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
+			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
+			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
+			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
+			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
+			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
+			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
+			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
+		},
+		[VFIO_DEVICE_STATE_RESUMING] = {
+			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
+			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
+			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
+			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
+			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
+			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
+			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
+			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
+		},
+		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
+			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
+			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
+			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
+			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
+			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
+			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
+			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
+			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
+		},
+		[VFIO_DEVICE_STATE_ERROR] = {
+			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
+			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
+			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
+			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
+			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
+			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
+			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
+			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
+		},
+	};
+
+	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
+		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
+		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
+		[VFIO_DEVICE_STATE_PRE_COPY] =
+			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
+		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
+						   VFIO_MIGRATION_P2P |
+						   VFIO_MIGRATION_PRE_COPY,
+		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
+		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
+		[VFIO_DEVICE_STATE_RUNNING_P2P] =
+			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
+		[VFIO_DEVICE_STATE_ERROR] = ~0U,
+	};
+
+	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
+		    (state_flags_table[cur_fsm] & device->migration_flags) !=
+			state_flags_table[cur_fsm]))
+		return -EINVAL;
+
+	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
+	   (state_flags_table[new_fsm] & device->migration_flags) !=
+			state_flags_table[new_fsm])
+		return -EINVAL;
+
+	/*
+	 * Arcs touching optional and unsupported states are skipped over. The
+	 * driver will instead see an arc from the original state to the next
+	 * logical state, as per the above comment.
+	 */
+	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
+	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
+			state_flags_table[*next_fsm])
+		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
+
+	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
+}
+EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
+
+/*
+ * Convert the drivers's struct file into a FD number and return it to userspace
+ */
+static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
+				   struct vfio_device_feature_mig_state *mig)
+{
+	int ret;
+	int fd;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0) {
+		ret = fd;
+		goto out_fput;
+	}
+
+	mig->data_fd = fd;
+	if (copy_to_user(arg, mig, sizeof(*mig))) {
+		ret = -EFAULT;
+		goto out_put_unused;
+	}
+	fd_install(fd, filp);
+	return 0;
+
+out_put_unused:
+	put_unused_fd(fd);
+out_fput:
+	fput(filp);
+	return ret;
+}
+
+static int
+vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
+					   u32 flags, void __user *arg,
+					   size_t argsz)
+{
+	size_t minsz =
+		offsetofend(struct vfio_device_feature_mig_state, data_fd);
+	struct vfio_device_feature_mig_state mig;
+	struct file *filp = NULL;
+	int ret;
+
+	if (!device->mig_ops)
+		return -ENOTTY;
+
+	ret = vfio_check_feature(flags, argsz,
+				 VFIO_DEVICE_FEATURE_SET |
+				 VFIO_DEVICE_FEATURE_GET,
+				 sizeof(mig));
+	if (ret != 1)
+		return ret;
+
+	if (copy_from_user(&mig, arg, minsz))
+		return -EFAULT;
+
+	if (flags & VFIO_DEVICE_FEATURE_GET) {
+		enum vfio_device_mig_state curr_state;
+
+		ret = device->mig_ops->migration_get_state(device,
+							   &curr_state);
+		if (ret)
+			return ret;
+		mig.device_state = curr_state;
+		goto out_copy;
+	}
+
+	/* Handle the VFIO_DEVICE_FEATURE_SET */
+	filp = device->mig_ops->migration_set_state(device, mig.device_state);
+	if (IS_ERR(filp) || !filp)
+		goto out_copy;
+
+	return vfio_ioct_mig_return_fd(filp, arg, &mig);
+out_copy:
+	mig.data_fd = -1;
+	if (copy_to_user(arg, &mig, sizeof(mig)))
+		return -EFAULT;
+	if (IS_ERR(filp))
+		return PTR_ERR(filp);
+	return 0;
+}
+
+static int
+vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
+					      u32 flags, void __user *arg,
+					      size_t argsz)
+{
+	struct vfio_device_feature_mig_data_size data_size = {};
+	unsigned long stop_copy_length;
+	int ret;
+
+	if (!device->mig_ops)
+		return -ENOTTY;
+
+	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
+				 sizeof(data_size));
+	if (ret != 1)
+		return ret;
+
+	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
+	if (ret)
+		return ret;
+
+	data_size.stop_copy_length = stop_copy_length;
+	if (copy_to_user(arg, &data_size, sizeof(data_size)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
+					       u32 flags, void __user *arg,
+					       size_t argsz)
+{
+	struct vfio_device_feature_migration mig = {
+		.flags = device->migration_flags,
+	};
+	int ret;
+
+	if (!device->mig_ops)
+		return -ENOTTY;
+
+	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
+				 sizeof(mig));
+	if (ret != 1)
+		return ret;
+	if (copy_to_user(arg, &mig, sizeof(mig)))
+		return -EFAULT;
+	return 0;
+}
+
+void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
+			      u32 req_nodes)
+{
+	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
+	unsigned long min_gap, curr_gap;
+
+	/* Special shortcut when a single range is required */
+	if (req_nodes == 1) {
+		unsigned long last;
+
+		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
+		curr = comb_start;
+		while (curr) {
+			last = curr->last;
+			prev = curr;
+			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
+			if (prev != comb_start)
+				interval_tree_remove(prev, root);
+		}
+		comb_start->last = last;
+		return;
+	}
+
+	/* Combine ranges which have the smallest gap */
+	while (cur_nodes > req_nodes) {
+		prev = NULL;
+		min_gap = ULONG_MAX;
+		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
+		while (curr) {
+			if (prev) {
+				curr_gap = curr->start - prev->last;
+				if (curr_gap < min_gap) {
+					min_gap = curr_gap;
+					comb_start = prev;
+					comb_end = curr;
+				}
+			}
+			prev = curr;
+			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
+		}
+		comb_start->last = comb_end->last;
+		interval_tree_remove(comb_end, root);
+		cur_nodes--;
+	}
+}
+EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
+
+/* Ranges should fit into a single kernel page */
+#define LOG_MAX_RANGES \
+	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
+
+static int
+vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
+					u32 flags, void __user *arg,
+					size_t argsz)
+{
+	size_t minsz =
+		offsetofend(struct vfio_device_feature_dma_logging_control,
+			    ranges);
+	struct vfio_device_feature_dma_logging_range __user *ranges;
+	struct vfio_device_feature_dma_logging_control control;
+	struct vfio_device_feature_dma_logging_range range;
+	struct rb_root_cached root = RB_ROOT_CACHED;
+	struct interval_tree_node *nodes;
+	u64 iova_end;
+	u32 nnodes;
+	int i, ret;
+
+	if (!device->log_ops)
+		return -ENOTTY;
+
+	ret = vfio_check_feature(flags, argsz,
+				 VFIO_DEVICE_FEATURE_SET,
+				 sizeof(control));
+	if (ret != 1)
+		return ret;
+
+	if (copy_from_user(&control, arg, minsz))
+		return -EFAULT;
+
+	nnodes = control.num_ranges;
+	if (!nnodes)
+		return -EINVAL;
+
+	if (nnodes > LOG_MAX_RANGES)
+		return -E2BIG;
+
+	ranges = u64_to_user_ptr(control.ranges);
+	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
+			      GFP_KERNEL);
+	if (!nodes)
+		return -ENOMEM;
+
+	for (i = 0; i < nnodes; i++) {
+		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
+			ret = -EFAULT;
+			goto end;
+		}
+		if (!IS_ALIGNED(range.iova, control.page_size) ||
+		    !IS_ALIGNED(range.length, control.page_size)) {
+			ret = -EINVAL;
+			goto end;
+		}
+
+		if (check_add_overflow(range.iova, range.length, &iova_end) ||
+		    iova_end > ULONG_MAX) {
+			ret = -EOVERFLOW;
+			goto end;
+		}
+
+		nodes[i].start = range.iova;
+		nodes[i].last = range.iova + range.length - 1;
+		if (interval_tree_iter_first(&root, nodes[i].start,
+					     nodes[i].last)) {
+			/* Range overlapping */
+			ret = -EINVAL;
+			goto end;
+		}
+		interval_tree_insert(nodes + i, &root);
+	}
+
+	ret = device->log_ops->log_start(device, &root, nnodes,
+					 &control.page_size);
+	if (ret)
+		goto end;
+
+	if (copy_to_user(arg, &control, sizeof(control))) {
+		ret = -EFAULT;
+		device->log_ops->log_stop(device);
+	}
+
+end:
+	kfree(nodes);
+	return ret;
+}
+
+static int
+vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
+				       u32 flags, void __user *arg,
+				       size_t argsz)
+{
+	int ret;
+
+	if (!device->log_ops)
+		return -ENOTTY;
+
+	ret = vfio_check_feature(flags, argsz,
+				 VFIO_DEVICE_FEATURE_SET, 0);
+	if (ret != 1)
+		return ret;
+
+	return device->log_ops->log_stop(device);
+}
+
+static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
+					  unsigned long iova, size_t length,
+					  void *opaque)
+{
+	struct vfio_device *device = opaque;
+
+	return device->log_ops->log_read_and_clear(device, iova, length, iter);
+}
+
+static int
+vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
+					 u32 flags, void __user *arg,
+					 size_t argsz)
+{
+	size_t minsz =
+		offsetofend(struct vfio_device_feature_dma_logging_report,
+			    bitmap);
+	struct vfio_device_feature_dma_logging_report report;
+	struct iova_bitmap *iter;
+	u64 iova_end;
+	int ret;
+
+	if (!device->log_ops)
+		return -ENOTTY;
+
+	ret = vfio_check_feature(flags, argsz,
+				 VFIO_DEVICE_FEATURE_GET,
+				 sizeof(report));
+	if (ret != 1)
+		return ret;
+
+	if (copy_from_user(&report, arg, minsz))
+		return -EFAULT;
+
+	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
+		return -EINVAL;
+
+	if (check_add_overflow(report.iova, report.length, &iova_end) ||
+	    iova_end > ULONG_MAX)
+		return -EOVERFLOW;
+
+	iter = iova_bitmap_alloc(report.iova, report.length,
+				 report.page_size,
+				 u64_to_user_ptr(report.bitmap));
+	if (IS_ERR(iter))
+		return PTR_ERR(iter);
+
+	ret = iova_bitmap_for_each(iter, device,
+				   vfio_device_log_read_and_clear);
+
+	iova_bitmap_free(iter);
+	return ret;
+}
+
+static int vfio_ioctl_device_feature(struct vfio_device *device,
+				     struct vfio_device_feature __user *arg)
+{
+	size_t minsz = offsetofend(struct vfio_device_feature, flags);
+	struct vfio_device_feature feature;
+
+	if (copy_from_user(&feature, arg, minsz))
+		return -EFAULT;
+
+	if (feature.argsz < minsz)
+		return -EINVAL;
+
+	/* Check unknown flags */
+	if (feature.flags &
+	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
+	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
+		return -EINVAL;
+
+	/* GET & SET are mutually exclusive except with PROBE */
+	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
+	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
+	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
+		return -EINVAL;
+
+	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
+	case VFIO_DEVICE_FEATURE_MIGRATION:
+		return vfio_ioctl_device_feature_migration(
+			device, feature.flags, arg->data,
+			feature.argsz - minsz);
+	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
+		return vfio_ioctl_device_feature_mig_device_state(
+			device, feature.flags, arg->data,
+			feature.argsz - minsz);
+	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
+		return vfio_ioctl_device_feature_logging_start(
+			device, feature.flags, arg->data,
+			feature.argsz - minsz);
+	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
+		return vfio_ioctl_device_feature_logging_stop(
+			device, feature.flags, arg->data,
+			feature.argsz - minsz);
+	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
+		return vfio_ioctl_device_feature_logging_report(
+			device, feature.flags, arg->data,
+			feature.argsz - minsz);
+	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
+		return vfio_ioctl_device_feature_migration_data_size(
+			device, feature.flags, arg->data,
+			feature.argsz - minsz);
+	default:
+		if (unlikely(!device->ops->device_feature))
+			return -EINVAL;
+		return device->ops->device_feature(device, feature.flags,
+						   arg->data,
+						   feature.argsz - minsz);
+	}
+}
+
+static long vfio_device_fops_unl_ioctl(struct file *filep,
+				       unsigned int cmd, unsigned long arg)
+{
+	struct vfio_device_file *df = filep->private_data;
+	struct vfio_device *device = df->device;
+	void __user *uptr = (void __user *)arg;
+	int ret;
+
+	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
+		return vfio_df_ioctl_bind_iommufd(df, uptr);
+
+	/* Paired with smp_store_release() following vfio_df_open() */
+	if (!smp_load_acquire(&df->access_granted))
+		return -EINVAL;
+
+	ret = vfio_device_pm_runtime_get(device);
+	if (ret)
+		return ret;
+
+	/* cdev only ioctls */
+	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
+		switch (cmd) {
+		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
+			ret = vfio_df_ioctl_attach_pt(df, uptr);
+			goto out;
+
+		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
+			ret = vfio_df_ioctl_detach_pt(df, uptr);
+			goto out;
+		}
+	}
+
+	switch (cmd) {
+	case VFIO_DEVICE_FEATURE:
+		ret = vfio_ioctl_device_feature(device, uptr);
+		break;
+
+	default:
+		if (unlikely(!device->ops->ioctl))
+			ret = -EINVAL;
+		else
+			ret = device->ops->ioctl(device, cmd, arg);
+		break;
+	}
+out:
+	vfio_device_pm_runtime_put(device);
+	return ret;
+}
+
+static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
+				     size_t count, loff_t *ppos)
+{
+	struct vfio_device_file *df = filep->private_data;
+	struct vfio_device *device = df->device;
+
+	/* Paired with smp_store_release() following vfio_df_open() */
+	if (!smp_load_acquire(&df->access_granted))
+		return -EINVAL;
+
+	if (unlikely(!device->ops->read))
+		return -EINVAL;
+
+	return device->ops->read(device, buf, count, ppos);
+}
+
+static ssize_t vfio_device_fops_write(struct file *filep,
+				      const char __user *buf,
+				      size_t count, loff_t *ppos)
+{
+	struct vfio_device_file *df = filep->private_data;
+	struct vfio_device *device = df->device;
+
+	/* Paired with smp_store_release() following vfio_df_open() */
+	if (!smp_load_acquire(&df->access_granted))
+		return -EINVAL;
+
+	if (unlikely(!device->ops->write))
+		return -EINVAL;
+
+	return device->ops->write(device, buf, count, ppos);
+}
+
+static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
+{
+	struct vfio_device_file *df = filep->private_data;
+	struct vfio_device *device = df->device;
+
+	/* Paired with smp_store_release() following vfio_df_open() */
+	if (!smp_load_acquire(&df->access_granted))
+		return -EINVAL;
+
+	if (unlikely(!device->ops->mmap))
+		return -EINVAL;
+
+	return device->ops->mmap(device, vma);
+}
+
+const struct file_operations vfio_device_fops = {
+	.owner		= THIS_MODULE,
+	.open		= vfio_device_fops_cdev_open,
+	.release	= vfio_device_fops_release,
+	.read		= vfio_device_fops_read,
+	.write		= vfio_device_fops_write,
+	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
+	.compat_ioctl	= compat_ptr_ioctl,
+	.mmap		= vfio_device_fops_mmap,
+};
+
+static struct vfio_device *vfio_device_from_file(struct file *file)
+{
+	struct vfio_device_file *df = file->private_data;
+
+	if (file->f_op != &vfio_device_fops)
+		return NULL;
+	return df->device;
+}
+
+/**
+ * vfio_file_is_valid - True if the file is valid vfio file
+ * @file: VFIO group file or VFIO device file
+ */
+bool vfio_file_is_valid(struct file *file)
+{
+	return vfio_group_from_file(file) ||
+	       vfio_device_from_file(file);
+}
+EXPORT_SYMBOL_GPL(vfio_file_is_valid);
+
+/**
+ * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
+ *        is always CPU cache coherent
+ * @file: VFIO group file or VFIO device file
+ *
+ * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
+ * bit in DMA transactions. A return of false indicates that the user has
+ * rights to access additional instructions such as wbinvd on x86.
+ */
+bool vfio_file_enforced_coherent(struct file *file)
+{
+	struct vfio_device *device;
+	struct vfio_group *group;
+
+	group = vfio_group_from_file(file);
+	if (group)
+		return vfio_group_enforced_coherent(group);
+
+	device = vfio_device_from_file(file);
+	if (device)
+		return device_iommu_capable(device->dev,
+					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
+
+static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
+{
+	struct vfio_device_file *df = file->private_data;
+
+	/*
+	 * The kvm is first recorded in the vfio_device_file, and will
+	 * be propagated to vfio_device::kvm when the file is bound to
+	 * iommufd successfully in the vfio device cdev path.
+	 */
+	spin_lock(&df->kvm_ref_lock);
+	df->kvm = kvm;
+	spin_unlock(&df->kvm_ref_lock);
+}
+
+/**
+ * vfio_file_set_kvm - Link a kvm with VFIO drivers
+ * @file: VFIO group file or VFIO device file
+ * @kvm: KVM to link
+ *
+ * When a VFIO device is first opened the KVM will be available in
+ * device->kvm if one was associated with the file.
+ */
+void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
+{
+	struct vfio_group *group;
+
+	group = vfio_group_from_file(file);
+	if (group)
+		vfio_group_set_kvm(group, kvm);
+
+	if (vfio_device_from_file(file))
+		vfio_device_file_set_kvm(file, kvm);
+}
+EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
+
+/*
+ * Sub-module support
+ */
+/*
+ * Helper for managing a buffer of info chain capabilities, allocate or
+ * reallocate a buffer with additional @size, filling in @id and @version
+ * of the capability.  A pointer to the new capability is returned.
+ *
+ * NB. The chain is based at the head of the buffer, so new entries are
+ * added to the tail, vfio_info_cap_shift() should be called to fixup the
+ * next offsets prior to copying to the user buffer.
+ */
+struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
+					       size_t size, u16 id, u16 version)
+{
+	void *buf;
+	struct vfio_info_cap_header *header, *tmp;
+
+	/* Ensure that the next capability struct will be aligned */
+	size = ALIGN(size, sizeof(u64));
+
+	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
+	if (!buf) {
+		kfree(caps->buf);
+		caps->buf = NULL;
+		caps->size = 0;
+		return ERR_PTR(-ENOMEM);
+	}
+
+	caps->buf = buf;
+	header = buf + caps->size;
+
+	/* Eventually copied to user buffer, zero */
+	memset(header, 0, size);
+
+	header->id = id;
+	header->version = version;
+
+	/* Add to the end of the capability chain */
+	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
+		; /* nothing */
+
+	tmp->next = caps->size;
+	caps->size += size;
+
+	return header;
+}
+EXPORT_SYMBOL_GPL(vfio_info_cap_add);
+
+void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
+{
+	struct vfio_info_cap_header *tmp;
+	void *buf = (void *)caps->buf;
+
+	/* Capability structs should start with proper alignment */
+	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
+
+	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
+		tmp->next += offset;
+}
+EXPORT_SYMBOL(vfio_info_cap_shift);
+
+int vfio_info_add_capability(struct vfio_info_cap *caps,
+			     struct vfio_info_cap_header *cap, size_t size)
+{
+	struct vfio_info_cap_header *header;
+
+	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
+	if (IS_ERR(header))
+		return PTR_ERR(header);
+
+	memcpy(header + 1, cap + 1, size - sizeof(*header));
+
+	return 0;
+}
+EXPORT_SYMBOL(vfio_info_add_capability);
+
+int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
+				       int max_irq_type, size_t *data_size)
+{
+	unsigned long minsz;
+	size_t size;
+
+	minsz = offsetofend(struct vfio_irq_set, count);
+
+	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
+	    (hdr->count >= (U32_MAX - hdr->start)) ||
+	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
+				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
+		return -EINVAL;
+
+	if (data_size)
+		*data_size = 0;
+
+	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
+		return -EINVAL;
+
+	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
+	case VFIO_IRQ_SET_DATA_NONE:
+		size = 0;
+		break;
+	case VFIO_IRQ_SET_DATA_BOOL:
+		size = sizeof(uint8_t);
+		break;
+	case VFIO_IRQ_SET_DATA_EVENTFD:
+		size = sizeof(int32_t);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (size) {
+		if (hdr->argsz - minsz < hdr->count * size)
+			return -EINVAL;
+
+		if (!data_size)
+			return -EINVAL;
+
+		*data_size = hdr->count * size;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
+
+/*
+ * Pin contiguous user pages and return their associated host pages for local
+ * domain only.
+ * @device [in]  : device
+ * @iova [in]    : starting IOVA of user pages to be pinned.
+ * @npage [in]   : count of pages to be pinned.  This count should not
+ *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
+ * @prot [in]    : protection flags
+ * @pages[out]   : array of host pages
+ * Return error or number of pages pinned.
+ *
+ * A driver may only call this function if the vfio_device was created
+ * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
+ */
+int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
+		   int npage, int prot, struct page **pages)
+{
+	/* group->container cannot change while a vfio device is open */
+	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
+		return -EINVAL;
+	if (!device->ops->dma_unmap)
+		return -EINVAL;
+	if (vfio_device_has_container(device))
+		return vfio_device_container_pin_pages(device, iova,
+						       npage, prot, pages);
+	if (device->iommufd_access) {
+		int ret;
+
+		if (iova > ULONG_MAX)
+			return -EINVAL;
+		/*
+		 * VFIO ignores the sub page offset, npages is from the start of
+		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
+		 * the sub page offset by doing:
+		 *     pages[0] + (iova % PAGE_SIZE)
+		 */
+		ret = iommufd_access_pin_pages(
+			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
+			npage * PAGE_SIZE, pages,
+			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
+		if (ret)
+			return ret;
+		return npage;
+	}
+	return -EINVAL;
+}
+EXPORT_SYMBOL(vfio_pin_pages);
+
+/*
+ * Unpin contiguous host pages for local domain only.
+ * @device [in]  : device
+ * @iova [in]    : starting address of user pages to be unpinned.
+ * @npage [in]   : count of pages to be unpinned.  This count should not
+ *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
+ */
+void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
+{
+	if (WARN_ON(!vfio_assert_device_open(device)))
+		return;
+	if (WARN_ON(!device->ops->dma_unmap))
+		return;
+
+	if (vfio_device_has_container(device)) {
+		vfio_device_container_unpin_pages(device, iova, npage);
+		return;
+	}
+	if (device->iommufd_access) {
+		if (WARN_ON(iova > ULONG_MAX))
+			return;
+		iommufd_access_unpin_pages(device->iommufd_access,
+					   ALIGN_DOWN(iova, PAGE_SIZE),
+					   npage * PAGE_SIZE);
+		return;
+	}
+}
+EXPORT_SYMBOL(vfio_unpin_pages);
+
+/*
+ * This interface allows the CPUs to perform some sort of virtual DMA on
+ * behalf of the device.
+ *
+ * CPUs read/write from/into a range of IOVAs pointing to user space memory
+ * into/from a kernel buffer.
+ *
+ * As the read/write of user space memory is conducted via the CPUs and is
+ * not a real device DMA, it is not necessary to pin the user space memory.
+ *
+ * @device [in]		: VFIO device
+ * @iova [in]		: base IOVA of a user space buffer
+ * @data [in]		: pointer to kernel buffer
+ * @len [in]		: kernel buffer length
+ * @write		: indicate read or write
+ * Return error code on failure or 0 on success.
+ */
+int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
+		size_t len, bool write)
+{
+	if (!data || len <= 0 || !vfio_assert_device_open(device))
+		return -EINVAL;
+
+	if (vfio_device_has_container(device))
+		return vfio_device_container_dma_rw(device, iova,
+						    data, len, write);
+
+	if (device->iommufd_access) {
+		unsigned int flags = 0;
+
+		if (iova > ULONG_MAX)
+			return -EINVAL;
+
+		/* VFIO historically tries to auto-detect a kthread */
+		if (!current->mm)
+			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
+		if (write)
+			flags |= IOMMUFD_ACCESS_RW_WRITE;
+		return iommufd_access_rw(device->iommufd_access, iova, data,
+					 len, flags);
+	}
+	return -EINVAL;
+}
+EXPORT_SYMBOL(vfio_dma_rw);
+
+/*
+ * Module/class support
+ */
+static int __init vfio_init(void)
+{
+	int ret;
+
+	ida_init(&vfio.device_ida);
+
+	ret = vfio_group_init();
+	if (ret)
+		return ret;
+
+	ret = vfio_virqfd_init();
+	if (ret)
+		goto err_virqfd;
+
+	/* /sys/class/vfio-dev/vfioX */
+	vfio.device_class = class_create("vfio-dev");
+	if (IS_ERR(vfio.device_class)) {
+		ret = PTR_ERR(vfio.device_class);
+		goto err_dev_class;
+	}
+
+	ret = vfio_cdev_init(vfio.device_class);
+	if (ret)
+		goto err_alloc_dev_chrdev;
+
+	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
+	return 0;
+
+err_alloc_dev_chrdev:
+	class_destroy(vfio.device_class);
+	vfio.device_class = NULL;
+err_dev_class:
+	vfio_virqfd_exit();
+err_virqfd:
+	vfio_group_cleanup();
+	return ret;
+}
+
+static void __exit vfio_cleanup(void)
+{
+	ida_destroy(&vfio.device_ida);
+	vfio_cdev_cleanup();
+	class_destroy(vfio.device_class);
+	vfio.device_class = NULL;
+	vfio_virqfd_exit();
+	vfio_group_cleanup();
+	xa_destroy(&vfio_device_set_xa);
+}
+
+module_init(vfio_init);
+module_exit(vfio_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c
new file mode 100644
index 0000000000..29c564b7a6
--- /dev/null
+++ b/drivers/vfio/virqfd.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VFIO generic eventfd code for IRQFD support.
+ * Derived from drivers/vfio/pci/vfio_pci_intrs.c
+ *
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/vfio.h>
+#include <linux/eventfd.h>
+#include <linux/file.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "vfio.h"
+
+static struct workqueue_struct *vfio_irqfd_cleanup_wq;
+static DEFINE_SPINLOCK(virqfd_lock);
+
+int __init vfio_virqfd_init(void)
+{
+	vfio_irqfd_cleanup_wq =
+		create_singlethread_workqueue("vfio-irqfd-cleanup");
+	if (!vfio_irqfd_cleanup_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void vfio_virqfd_exit(void)
+{
+	destroy_workqueue(vfio_irqfd_cleanup_wq);
+}
+
+static void virqfd_deactivate(struct virqfd *virqfd)
+{
+	queue_work(vfio_irqfd_cleanup_wq, &virqfd->shutdown);
+}
+
+static int virqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
+{
+	struct virqfd *virqfd = container_of(wait, struct virqfd, wait);
+	__poll_t flags = key_to_poll(key);
+
+	if (flags & EPOLLIN) {
+		u64 cnt;
+		eventfd_ctx_do_read(virqfd->eventfd, &cnt);
+
+		/* An event has been signaled, call function */
+		if ((!virqfd->handler ||
+		     virqfd->handler(virqfd->opaque, virqfd->data)) &&
+		    virqfd->thread)
+			schedule_work(&virqfd->inject);
+	}
+
+	if (flags & EPOLLHUP) {
+		unsigned long flags;
+		spin_lock_irqsave(&virqfd_lock, flags);
+
+		/*
+		 * The eventfd is closing, if the virqfd has not yet been
+		 * queued for release, as determined by testing whether the
+		 * virqfd pointer to it is still valid, queue it now.  As
+		 * with kvm irqfds, we know we won't race against the virqfd
+		 * going away because we hold the lock to get here.
+		 */
+		if (*(virqfd->pvirqfd) == virqfd) {
+			*(virqfd->pvirqfd) = NULL;
+			virqfd_deactivate(virqfd);
+		}
+
+		spin_unlock_irqrestore(&virqfd_lock, flags);
+	}
+
+	return 0;
+}
+
+static void virqfd_ptable_queue_proc(struct file *file,
+				     wait_queue_head_t *wqh, poll_table *pt)
+{
+	struct virqfd *virqfd = container_of(pt, struct virqfd, pt);
+	add_wait_queue(wqh, &virqfd->wait);
+}
+
+static void virqfd_shutdown(struct work_struct *work)
+{
+	struct virqfd *virqfd = container_of(work, struct virqfd, shutdown);
+	u64 cnt;
+
+	eventfd_ctx_remove_wait_queue(virqfd->eventfd, &virqfd->wait, &cnt);
+	flush_work(&virqfd->inject);
+	eventfd_ctx_put(virqfd->eventfd);
+
+	kfree(virqfd);
+}
+
+static void virqfd_inject(struct work_struct *work)
+{
+	struct virqfd *virqfd = container_of(work, struct virqfd, inject);
+	if (virqfd->thread)
+		virqfd->thread(virqfd->opaque, virqfd->data);
+}
+
+int vfio_virqfd_enable(void *opaque,
+		       int (*handler)(void *, void *),
+		       void (*thread)(void *, void *),
+		       void *data, struct virqfd **pvirqfd, int fd)
+{
+	struct fd irqfd;
+	struct eventfd_ctx *ctx;
+	struct virqfd *virqfd;
+	int ret = 0;
+	__poll_t events;
+
+	virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL_ACCOUNT);
+	if (!virqfd)
+		return -ENOMEM;
+
+	virqfd->pvirqfd = pvirqfd;
+	virqfd->opaque = opaque;
+	virqfd->handler = handler;
+	virqfd->thread = thread;
+	virqfd->data = data;
+
+	INIT_WORK(&virqfd->shutdown, virqfd_shutdown);
+	INIT_WORK(&virqfd->inject, virqfd_inject);
+
+	irqfd = fdget(fd);
+	if (!irqfd.file) {
+		ret = -EBADF;
+		goto err_fd;
+	}
+
+	ctx = eventfd_ctx_fileget(irqfd.file);
+	if (IS_ERR(ctx)) {
+		ret = PTR_ERR(ctx);
+		goto err_ctx;
+	}
+
+	virqfd->eventfd = ctx;
+
+	/*
+	 * virqfds can be released by closing the eventfd or directly
+	 * through ioctl.  These are both done through a workqueue, so
+	 * we update the pointer to the virqfd under lock to avoid
+	 * pushing multiple jobs to release the same virqfd.
+	 */
+	spin_lock_irq(&virqfd_lock);
+
+	if (*pvirqfd) {
+		spin_unlock_irq(&virqfd_lock);
+		ret = -EBUSY;
+		goto err_busy;
+	}
+	*pvirqfd = virqfd;
+
+	spin_unlock_irq(&virqfd_lock);
+
+	/*
+	 * Install our own custom wake-up handling so we are notified via
+	 * a callback whenever someone signals the underlying eventfd.
+	 */
+	init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);
+	init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc);
+
+	events = vfs_poll(irqfd.file, &virqfd->pt);
+
+	/*
+	 * Check if there was an event already pending on the eventfd
+	 * before we registered and trigger it as if we didn't miss it.
+	 */
+	if (events & EPOLLIN) {
+		if ((!handler || handler(opaque, data)) && thread)
+			schedule_work(&virqfd->inject);
+	}
+
+	/*
+	 * Do not drop the file until the irqfd is fully initialized,
+	 * otherwise we might race against the EPOLLHUP.
+	 */
+	fdput(irqfd);
+
+	return 0;
+err_busy:
+	eventfd_ctx_put(ctx);
+err_ctx:
+	fdput(irqfd);
+err_fd:
+	kfree(virqfd);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_virqfd_enable);
+
+void vfio_virqfd_disable(struct virqfd **pvirqfd)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&virqfd_lock, flags);
+
+	if (*pvirqfd) {
+		virqfd_deactivate(*pvirqfd);
+		*pvirqfd = NULL;
+	}
+
+	spin_unlock_irqrestore(&virqfd_lock, flags);
+
+	/*
+	 * Block until we know all outstanding shutdown jobs have completed.
+	 * Even if we don't queue the job, flush the wq to be sure it's
+	 * been released.
+	 */
+	flush_workqueue(vfio_irqfd_cleanup_wq);
+}
+EXPORT_SYMBOL_GPL(vfio_virqfd_disable);
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-11 08:27:49 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-11 08:27:49 +0000
commit	ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch)
tree	b2d64bc10158fdd5497876388cd68142ca374ed3 /drivers/vfio
parent	Initial commit. (diff)
download	linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip