summaryrefslogtreecommitdiffstats
path: root/drivers/iommu/iommufd
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-11 08:27:49 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-11 08:27:49 +0000
commitace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch)
treeb2d64bc10158fdd5497876388cd68142ca374ed3 /drivers/iommu/iommufd
parentInitial commit. (diff)
downloadlinux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz
linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip
Adding upstream version 6.6.15.upstream/6.6.15
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/iommu/iommufd')
-rw-r--r--drivers/iommu/iommufd/Kconfig44
-rw-r--r--drivers/iommu/iommufd/Makefile13
-rw-r--r--drivers/iommu/iommufd/device.c1194
-rw-r--r--drivers/iommu/iommufd/double_span.h53
-rw-r--r--drivers/iommu/iommufd/hw_pagetable.c179
-rw-r--r--drivers/iommu/iommufd/io_pagetable.c1244
-rw-r--r--drivers/iommu/iommufd/io_pagetable.h241
-rw-r--r--drivers/iommu/iommufd/ioas.c398
-rw-r--r--drivers/iommu/iommufd/iommufd_private.h351
-rw-r--r--drivers/iommu/iommufd/iommufd_test.h112
-rw-r--r--drivers/iommu/iommufd/main.c556
-rw-r--r--drivers/iommu/iommufd/pages.c1993
-rw-r--r--drivers/iommu/iommufd/selftest.c1107
-rw-r--r--drivers/iommu/iommufd/vfio_compat.c541
14 files changed, 8026 insertions, 0 deletions
diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig
new file mode 100644
index 0000000000..99d4b075df
--- /dev/null
+++ b/drivers/iommu/iommufd/Kconfig
@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config IOMMUFD
+ tristate "IOMMU Userspace API"
+ select INTERVAL_TREE
+ select INTERVAL_TREE_SPAN_ITER
+ select IOMMU_API
+ default n
+ help
+ Provides /dev/iommu, the user API to control the IOMMU subsystem as
+ it relates to managing IO page tables that point at user space memory.
+
+ If you don't know what to do here, say N.
+
+if IOMMUFD
+config IOMMUFD_VFIO_CONTAINER
+ bool "IOMMUFD provides the VFIO container /dev/vfio/vfio"
+ depends on VFIO_GROUP && !VFIO_CONTAINER
+ default VFIO_GROUP && !VFIO_CONTAINER
+ help
+ IOMMUFD will provide /dev/vfio/vfio instead of VFIO. This relies on
+ IOMMUFD providing compatibility emulation to give the same ioctls.
+ It provides an option to build a kernel with legacy VFIO components
+ removed.
+
+ IOMMUFD VFIO container emulation is known to lack certain features
+ of the native VFIO container, such as peer-to-peer
+ DMA mapping, PPC IOMMU support, as well as other potentially
+ undiscovered gaps. This option is currently intended for the
+ purpose of testing IOMMUFD with unmodified userspace supporting VFIO
+ and making use of the Type1 VFIO IOMMU backend. General purpose
+ enabling of this option is currently discouraged.
+
+ Unless testing IOMMUFD, say N here.
+
+config IOMMUFD_TEST
+ bool "IOMMU Userspace API Test support"
+ depends on DEBUG_KERNEL
+ depends on FAULT_INJECTION
+ depends on RUNTIME_TESTING_MENU
+ default n
+ help
+ This is dangerous, do not enable unless running
+ tools/testing/selftests/iommu
+endif
diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
new file mode 100644
index 0000000000..8aeba81800
--- /dev/null
+++ b/drivers/iommu/iommufd/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
+iommufd-y := \
+ device.o \
+ hw_pagetable.o \
+ io_pagetable.o \
+ ioas.o \
+ main.o \
+ pages.o \
+ vfio_compat.o
+
+iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
+
+obj-$(CONFIG_IOMMUFD) += iommufd.o
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
new file mode 100644
index 0000000000..ce78c36715
--- /dev/null
+++ b/drivers/iommu/iommufd/device.c
@@ -0,0 +1,1194 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ */
+#include <linux/iommufd.h>
+#include <linux/slab.h>
+#include <linux/iommu.h>
+#include <uapi/linux/iommufd.h>
+#include "../iommu-priv.h"
+
+#include "io_pagetable.h"
+#include "iommufd_private.h"
+
+static bool allow_unsafe_interrupts;
+module_param(allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(
+ allow_unsafe_interrupts,
+ "Allow IOMMUFD to bind to devices even if the platform cannot isolate "
+ "the MSI interrupt window. Enabling this is a security weakness.");
+
+static void iommufd_group_release(struct kref *kref)
+{
+ struct iommufd_group *igroup =
+ container_of(kref, struct iommufd_group, ref);
+
+ WARN_ON(igroup->hwpt || !list_empty(&igroup->device_list));
+
+ xa_cmpxchg(&igroup->ictx->groups, iommu_group_id(igroup->group), igroup,
+ NULL, GFP_KERNEL);
+ iommu_group_put(igroup->group);
+ mutex_destroy(&igroup->lock);
+ kfree(igroup);
+}
+
+static void iommufd_put_group(struct iommufd_group *group)
+{
+ kref_put(&group->ref, iommufd_group_release);
+}
+
+static bool iommufd_group_try_get(struct iommufd_group *igroup,
+ struct iommu_group *group)
+{
+ if (!igroup)
+ return false;
+ /*
+ * group ID's cannot be re-used until the group is put back which does
+ * not happen if we could get an igroup pointer under the xa_lock.
+ */
+ if (WARN_ON(igroup->group != group))
+ return false;
+ return kref_get_unless_zero(&igroup->ref);
+}
+
+/*
+ * iommufd needs to store some more data for each iommu_group, we keep a
+ * parallel xarray indexed by iommu_group id to hold this instead of putting it
+ * in the core structure. To keep things simple the iommufd_group memory is
+ * unique within the iommufd_ctx. This makes it easy to check there are no
+ * memory leaks.
+ */
+static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx,
+ struct device *dev)
+{
+ struct iommufd_group *new_igroup;
+ struct iommufd_group *cur_igroup;
+ struct iommufd_group *igroup;
+ struct iommu_group *group;
+ unsigned int id;
+
+ group = iommu_group_get(dev);
+ if (!group)
+ return ERR_PTR(-ENODEV);
+
+ id = iommu_group_id(group);
+
+ xa_lock(&ictx->groups);
+ igroup = xa_load(&ictx->groups, id);
+ if (iommufd_group_try_get(igroup, group)) {
+ xa_unlock(&ictx->groups);
+ iommu_group_put(group);
+ return igroup;
+ }
+ xa_unlock(&ictx->groups);
+
+ new_igroup = kzalloc(sizeof(*new_igroup), GFP_KERNEL);
+ if (!new_igroup) {
+ iommu_group_put(group);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ kref_init(&new_igroup->ref);
+ mutex_init(&new_igroup->lock);
+ INIT_LIST_HEAD(&new_igroup->device_list);
+ new_igroup->sw_msi_start = PHYS_ADDR_MAX;
+ /* group reference moves into new_igroup */
+ new_igroup->group = group;
+
+ /*
+ * The ictx is not additionally refcounted here becase all objects using
+ * an igroup must put it before their destroy completes.
+ */
+ new_igroup->ictx = ictx;
+
+ /*
+ * We dropped the lock so igroup is invalid. NULL is a safe and likely
+ * value to assume for the xa_cmpxchg algorithm.
+ */
+ cur_igroup = NULL;
+ xa_lock(&ictx->groups);
+ while (true) {
+ igroup = __xa_cmpxchg(&ictx->groups, id, cur_igroup, new_igroup,
+ GFP_KERNEL);
+ if (xa_is_err(igroup)) {
+ xa_unlock(&ictx->groups);
+ iommufd_put_group(new_igroup);
+ return ERR_PTR(xa_err(igroup));
+ }
+
+ /* new_group was successfully installed */
+ if (cur_igroup == igroup) {
+ xa_unlock(&ictx->groups);
+ return new_igroup;
+ }
+
+ /* Check again if the current group is any good */
+ if (iommufd_group_try_get(igroup, group)) {
+ xa_unlock(&ictx->groups);
+ iommufd_put_group(new_igroup);
+ return igroup;
+ }
+ cur_igroup = igroup;
+ }
+}
+
+void iommufd_device_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_device *idev =
+ container_of(obj, struct iommufd_device, obj);
+
+ iommu_device_release_dma_owner(idev->dev);
+ iommufd_put_group(idev->igroup);
+ if (!iommufd_selftest_is_mock_dev(idev->dev))
+ iommufd_ctx_put(idev->ictx);
+}
+
+/**
+ * iommufd_device_bind - Bind a physical device to an iommu fd
+ * @ictx: iommufd file descriptor
+ * @dev: Pointer to a physical device struct
+ * @id: Output ID number to return to userspace for this device
+ *
+ * A successful bind establishes an ownership over the device and returns
+ * struct iommufd_device pointer, otherwise returns error pointer.
+ *
+ * A driver using this API must set driver_managed_dma and must not touch
+ * the device until this routine succeeds and establishes ownership.
+ *
+ * Binding a PCI device places the entire RID under iommufd control.
+ *
+ * The caller must undo this with iommufd_device_unbind()
+ */
+struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
+ struct device *dev, u32 *id)
+{
+ struct iommufd_device *idev;
+ struct iommufd_group *igroup;
+ int rc;
+
+ /*
+ * iommufd always sets IOMMU_CACHE because we offer no way for userspace
+ * to restore cache coherency.
+ */
+ if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY))
+ return ERR_PTR(-EINVAL);
+
+ igroup = iommufd_get_group(ictx, dev);
+ if (IS_ERR(igroup))
+ return ERR_CAST(igroup);
+
+ /*
+ * For historical compat with VFIO the insecure interrupt path is
+ * allowed if the module parameter is set. Secure/Isolated means that a
+ * MemWr operation from the device (eg a simple DMA) cannot trigger an
+ * interrupt outside this iommufd context.
+ */
+ if (!iommufd_selftest_is_mock_dev(dev) &&
+ !iommu_group_has_isolated_msi(igroup->group)) {
+ if (!allow_unsafe_interrupts) {
+ rc = -EPERM;
+ goto out_group_put;
+ }
+
+ dev_warn(
+ dev,
+ "MSI interrupts are not secure, they cannot be isolated by the platform. "
+ "Check that platform features like interrupt remapping are enabled. "
+ "Use the \"allow_unsafe_interrupts\" module parameter to override\n");
+ }
+
+ rc = iommu_device_claim_dma_owner(dev, ictx);
+ if (rc)
+ goto out_group_put;
+
+ idev = iommufd_object_alloc(ictx, idev, IOMMUFD_OBJ_DEVICE);
+ if (IS_ERR(idev)) {
+ rc = PTR_ERR(idev);
+ goto out_release_owner;
+ }
+ idev->ictx = ictx;
+ if (!iommufd_selftest_is_mock_dev(dev))
+ iommufd_ctx_get(ictx);
+ idev->dev = dev;
+ idev->enforce_cache_coherency =
+ device_iommu_capable(dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
+ /* The calling driver is a user until iommufd_device_unbind() */
+ refcount_inc(&idev->obj.users);
+ /* igroup refcount moves into iommufd_device */
+ idev->igroup = igroup;
+
+ /*
+ * If the caller fails after this success it must call
+ * iommufd_unbind_device() which is safe since we hold this refcount.
+ * This also means the device is a leaf in the graph and no other object
+ * can take a reference on it.
+ */
+ iommufd_object_finalize(ictx, &idev->obj);
+ *id = idev->obj.id;
+ return idev;
+
+out_release_owner:
+ iommu_device_release_dma_owner(dev);
+out_group_put:
+ iommufd_put_group(igroup);
+ return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD);
+
+/**
+ * iommufd_ctx_has_group - True if any device within the group is bound
+ * to the ictx
+ * @ictx: iommufd file descriptor
+ * @group: Pointer to a physical iommu_group struct
+ *
+ * True if any device within the group has been bound to this ictx, ex. via
+ * iommufd_device_bind(), therefore implying ictx ownership of the group.
+ */
+bool iommufd_ctx_has_group(struct iommufd_ctx *ictx, struct iommu_group *group)
+{
+ struct iommufd_object *obj;
+ unsigned long index;
+
+ if (!ictx || !group)
+ return false;
+
+ xa_lock(&ictx->objects);
+ xa_for_each(&ictx->objects, index, obj) {
+ if (obj->type == IOMMUFD_OBJ_DEVICE &&
+ container_of(obj, struct iommufd_device, obj)
+ ->igroup->group == group) {
+ xa_unlock(&ictx->objects);
+ return true;
+ }
+ }
+ xa_unlock(&ictx->objects);
+ return false;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, IOMMUFD);
+
+/**
+ * iommufd_device_unbind - Undo iommufd_device_bind()
+ * @idev: Device returned by iommufd_device_bind()
+ *
+ * Release the device from iommufd control. The DMA ownership will return back
+ * to unowned with DMA controlled by the DMA API. This invalidates the
+ * iommufd_device pointer, other APIs that consume it must not be called
+ * concurrently.
+ */
+void iommufd_device_unbind(struct iommufd_device *idev)
+{
+ iommufd_object_destroy_user(idev->ictx, &idev->obj);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, IOMMUFD);
+
+struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev)
+{
+ return idev->ictx;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_to_ictx, IOMMUFD);
+
+u32 iommufd_device_to_id(struct iommufd_device *idev)
+{
+ return idev->obj.id;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD);
+
+static int iommufd_group_setup_msi(struct iommufd_group *igroup,
+ struct iommufd_hw_pagetable *hwpt)
+{
+ phys_addr_t sw_msi_start = igroup->sw_msi_start;
+ int rc;
+
+ /*
+ * If the IOMMU driver gives a IOMMU_RESV_SW_MSI then it is asking us to
+ * call iommu_get_msi_cookie() on its behalf. This is necessary to setup
+ * the MSI window so iommu_dma_prepare_msi() can install pages into our
+ * domain after request_irq(). If it is not done interrupts will not
+ * work on this domain.
+ *
+ * FIXME: This is conceptually broken for iommufd since we want to allow
+ * userspace to change the domains, eg switch from an identity IOAS to a
+ * DMA IOAS. There is currently no way to create a MSI window that
+ * matches what the IRQ layer actually expects in a newly created
+ * domain.
+ */
+ if (sw_msi_start != PHYS_ADDR_MAX && !hwpt->msi_cookie) {
+ rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start);
+ if (rc)
+ return rc;
+
+ /*
+ * iommu_get_msi_cookie() can only be called once per domain,
+ * it returns -EBUSY on later calls.
+ */
+ hwpt->msi_cookie = true;
+ }
+ return 0;
+}
+
+int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_device *idev)
+{
+ int rc;
+
+ mutex_lock(&idev->igroup->lock);
+
+ if (idev->igroup->hwpt != NULL && idev->igroup->hwpt != hwpt) {
+ rc = -EINVAL;
+ goto err_unlock;
+ }
+
+ /* Try to upgrade the domain we have */
+ if (idev->enforce_cache_coherency) {
+ rc = iommufd_hw_pagetable_enforce_cc(hwpt);
+ if (rc)
+ goto err_unlock;
+ }
+
+ rc = iopt_table_enforce_dev_resv_regions(&hwpt->ioas->iopt, idev->dev,
+ &idev->igroup->sw_msi_start);
+ if (rc)
+ goto err_unlock;
+
+ /*
+ * Only attach to the group once for the first device that is in the
+ * group. All the other devices will follow this attachment. The user
+ * should attach every device individually to the hwpt as the per-device
+ * reserved regions are only updated during individual device
+ * attachment.
+ */
+ if (list_empty(&idev->igroup->device_list)) {
+ rc = iommufd_group_setup_msi(idev->igroup, hwpt);
+ if (rc)
+ goto err_unresv;
+
+ rc = iommu_attach_group(hwpt->domain, idev->igroup->group);
+ if (rc)
+ goto err_unresv;
+ idev->igroup->hwpt = hwpt;
+ }
+ refcount_inc(&hwpt->obj.users);
+ list_add_tail(&idev->group_item, &idev->igroup->device_list);
+ mutex_unlock(&idev->igroup->lock);
+ return 0;
+err_unresv:
+ iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
+err_unlock:
+ mutex_unlock(&idev->igroup->lock);
+ return rc;
+}
+
+struct iommufd_hw_pagetable *
+iommufd_hw_pagetable_detach(struct iommufd_device *idev)
+{
+ struct iommufd_hw_pagetable *hwpt = idev->igroup->hwpt;
+
+ mutex_lock(&idev->igroup->lock);
+ list_del(&idev->group_item);
+ if (list_empty(&idev->igroup->device_list)) {
+ iommu_detach_group(hwpt->domain, idev->igroup->group);
+ idev->igroup->hwpt = NULL;
+ }
+ iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
+ mutex_unlock(&idev->igroup->lock);
+
+ /* Caller must destroy hwpt */
+ return hwpt;
+}
+
+static struct iommufd_hw_pagetable *
+iommufd_device_do_attach(struct iommufd_device *idev,
+ struct iommufd_hw_pagetable *hwpt)
+{
+ int rc;
+
+ rc = iommufd_hw_pagetable_attach(hwpt, idev);
+ if (rc)
+ return ERR_PTR(rc);
+ return NULL;
+}
+
+static struct iommufd_hw_pagetable *
+iommufd_device_do_replace(struct iommufd_device *idev,
+ struct iommufd_hw_pagetable *hwpt)
+{
+ struct iommufd_group *igroup = idev->igroup;
+ struct iommufd_hw_pagetable *old_hwpt;
+ unsigned int num_devices = 0;
+ struct iommufd_device *cur;
+ int rc;
+
+ mutex_lock(&idev->igroup->lock);
+
+ if (igroup->hwpt == NULL) {
+ rc = -EINVAL;
+ goto err_unlock;
+ }
+
+ if (hwpt == igroup->hwpt) {
+ mutex_unlock(&idev->igroup->lock);
+ return NULL;
+ }
+
+ /* Try to upgrade the domain we have */
+ list_for_each_entry(cur, &igroup->device_list, group_item) {
+ num_devices++;
+ if (cur->enforce_cache_coherency) {
+ rc = iommufd_hw_pagetable_enforce_cc(hwpt);
+ if (rc)
+ goto err_unlock;
+ }
+ }
+
+ old_hwpt = igroup->hwpt;
+ if (hwpt->ioas != old_hwpt->ioas) {
+ list_for_each_entry(cur, &igroup->device_list, group_item) {
+ rc = iopt_table_enforce_dev_resv_regions(
+ &hwpt->ioas->iopt, cur->dev, NULL);
+ if (rc)
+ goto err_unresv;
+ }
+ }
+
+ rc = iommufd_group_setup_msi(idev->igroup, hwpt);
+ if (rc)
+ goto err_unresv;
+
+ rc = iommu_group_replace_domain(igroup->group, hwpt->domain);
+ if (rc)
+ goto err_unresv;
+
+ if (hwpt->ioas != old_hwpt->ioas) {
+ list_for_each_entry(cur, &igroup->device_list, group_item)
+ iopt_remove_reserved_iova(&old_hwpt->ioas->iopt,
+ cur->dev);
+ }
+
+ igroup->hwpt = hwpt;
+
+ /*
+ * Move the refcounts held by the device_list to the new hwpt. Retain a
+ * refcount for this thread as the caller will free it.
+ */
+ refcount_add(num_devices, &hwpt->obj.users);
+ if (num_devices > 1)
+ WARN_ON(refcount_sub_and_test(num_devices - 1,
+ &old_hwpt->obj.users));
+ mutex_unlock(&idev->igroup->lock);
+
+ /* Caller must destroy old_hwpt */
+ return old_hwpt;
+err_unresv:
+ list_for_each_entry(cur, &igroup->device_list, group_item)
+ iopt_remove_reserved_iova(&hwpt->ioas->iopt, cur->dev);
+err_unlock:
+ mutex_unlock(&idev->igroup->lock);
+ return ERR_PTR(rc);
+}
+
+typedef struct iommufd_hw_pagetable *(*attach_fn)(
+ struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt);
+
+/*
+ * When automatically managing the domains we search for a compatible domain in
+ * the iopt and if one is found use it, otherwise create a new domain.
+ * Automatic domain selection will never pick a manually created domain.
+ */
+static struct iommufd_hw_pagetable *
+iommufd_device_auto_get_domain(struct iommufd_device *idev,
+ struct iommufd_ioas *ioas, u32 *pt_id,
+ attach_fn do_attach)
+{
+ /*
+ * iommufd_hw_pagetable_attach() is called by
+ * iommufd_hw_pagetable_alloc() in immediate attachment mode, same as
+ * iommufd_device_do_attach(). So if we are in this mode then we prefer
+ * to use the immediate_attach path as it supports drivers that can't
+ * directly allocate a domain.
+ */
+ bool immediate_attach = do_attach == iommufd_device_do_attach;
+ struct iommufd_hw_pagetable *destroy_hwpt;
+ struct iommufd_hw_pagetable *hwpt;
+
+ /*
+ * There is no differentiation when domains are allocated, so any domain
+ * that is willing to attach to the device is interchangeable with any
+ * other.
+ */
+ mutex_lock(&ioas->mutex);
+ list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
+ if (!hwpt->auto_domain)
+ continue;
+
+ if (!iommufd_lock_obj(&hwpt->obj))
+ continue;
+ destroy_hwpt = (*do_attach)(idev, hwpt);
+ if (IS_ERR(destroy_hwpt)) {
+ iommufd_put_object(&hwpt->obj);
+ /*
+ * -EINVAL means the domain is incompatible with the
+ * device. Other error codes should propagate to
+ * userspace as failure. Success means the domain is
+ * attached.
+ */
+ if (PTR_ERR(destroy_hwpt) == -EINVAL)
+ continue;
+ goto out_unlock;
+ }
+ *pt_id = hwpt->obj.id;
+ iommufd_put_object(&hwpt->obj);
+ goto out_unlock;
+ }
+
+ hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev,
+ immediate_attach);
+ if (IS_ERR(hwpt)) {
+ destroy_hwpt = ERR_CAST(hwpt);
+ goto out_unlock;
+ }
+
+ if (!immediate_attach) {
+ destroy_hwpt = (*do_attach)(idev, hwpt);
+ if (IS_ERR(destroy_hwpt))
+ goto out_abort;
+ } else {
+ destroy_hwpt = NULL;
+ }
+
+ hwpt->auto_domain = true;
+ *pt_id = hwpt->obj.id;
+
+ iommufd_object_finalize(idev->ictx, &hwpt->obj);
+ mutex_unlock(&ioas->mutex);
+ return destroy_hwpt;
+
+out_abort:
+ iommufd_object_abort_and_destroy(idev->ictx, &hwpt->obj);
+out_unlock:
+ mutex_unlock(&ioas->mutex);
+ return destroy_hwpt;
+}
+
+static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id,
+ attach_fn do_attach)
+{
+ struct iommufd_hw_pagetable *destroy_hwpt;
+ struct iommufd_object *pt_obj;
+
+ pt_obj = iommufd_get_object(idev->ictx, *pt_id, IOMMUFD_OBJ_ANY);
+ if (IS_ERR(pt_obj))
+ return PTR_ERR(pt_obj);
+
+ switch (pt_obj->type) {
+ case IOMMUFD_OBJ_HW_PAGETABLE: {
+ struct iommufd_hw_pagetable *hwpt =
+ container_of(pt_obj, struct iommufd_hw_pagetable, obj);
+
+ destroy_hwpt = (*do_attach)(idev, hwpt);
+ if (IS_ERR(destroy_hwpt))
+ goto out_put_pt_obj;
+ break;
+ }
+ case IOMMUFD_OBJ_IOAS: {
+ struct iommufd_ioas *ioas =
+ container_of(pt_obj, struct iommufd_ioas, obj);
+
+ destroy_hwpt = iommufd_device_auto_get_domain(idev, ioas, pt_id,
+ do_attach);
+ if (IS_ERR(destroy_hwpt))
+ goto out_put_pt_obj;
+ break;
+ }
+ default:
+ destroy_hwpt = ERR_PTR(-EINVAL);
+ goto out_put_pt_obj;
+ }
+ iommufd_put_object(pt_obj);
+
+ /* This destruction has to be after we unlock everything */
+ if (destroy_hwpt)
+ iommufd_hw_pagetable_put(idev->ictx, destroy_hwpt);
+ return 0;
+
+out_put_pt_obj:
+ iommufd_put_object(pt_obj);
+ return PTR_ERR(destroy_hwpt);
+}
+
+/**
+ * iommufd_device_attach - Connect a device to an iommu_domain
+ * @idev: device to attach
+ * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE
+ * Output the IOMMUFD_OBJ_HW_PAGETABLE ID
+ *
+ * This connects the device to an iommu_domain, either automatically or manually
+ * selected. Once this completes the device could do DMA.
+ *
+ * The caller should return the resulting pt_id back to userspace.
+ * This function is undone by calling iommufd_device_detach().
+ */
+int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id)
+{
+ int rc;
+
+ rc = iommufd_device_change_pt(idev, pt_id, &iommufd_device_do_attach);
+ if (rc)
+ return rc;
+
+ /*
+ * Pairs with iommufd_device_detach() - catches caller bugs attempting
+ * to destroy a device with an attachment.
+ */
+ refcount_inc(&idev->obj.users);
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD);
+
+/**
+ * iommufd_device_replace - Change the device's iommu_domain
+ * @idev: device to change
+ * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE
+ * Output the IOMMUFD_OBJ_HW_PAGETABLE ID
+ *
+ * This is the same as::
+ *
+ * iommufd_device_detach();
+ * iommufd_device_attach();
+ *
+ * If it fails then no change is made to the attachment. The iommu driver may
+ * implement this so there is no disruption in translation. This can only be
+ * called if iommufd_device_attach() has already succeeded.
+ */
+int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id)
+{
+ return iommufd_device_change_pt(idev, pt_id,
+ &iommufd_device_do_replace);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_replace, IOMMUFD);
+
+/**
+ * iommufd_device_detach - Disconnect a device to an iommu_domain
+ * @idev: device to detach
+ *
+ * Undo iommufd_device_attach(). This disconnects the idev from the previously
+ * attached pt_id. The device returns back to a blocked DMA translation.
+ */
+void iommufd_device_detach(struct iommufd_device *idev)
+{
+ struct iommufd_hw_pagetable *hwpt;
+
+ hwpt = iommufd_hw_pagetable_detach(idev);
+ iommufd_hw_pagetable_put(idev->ictx, hwpt);
+ refcount_dec(&idev->obj.users);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD);
+
+/*
+ * On success, it will refcount_inc() at a valid new_ioas and refcount_dec() at
+ * a valid cur_ioas (access->ioas). A caller passing in a valid new_ioas should
+ * call iommufd_put_object() if it does an iommufd_get_object() for a new_ioas.
+ */
+static int iommufd_access_change_ioas(struct iommufd_access *access,
+ struct iommufd_ioas *new_ioas)
+{
+ u32 iopt_access_list_id = access->iopt_access_list_id;
+ struct iommufd_ioas *cur_ioas = access->ioas;
+ int rc;
+
+ lockdep_assert_held(&access->ioas_lock);
+
+ /* We are racing with a concurrent detach, bail */
+ if (cur_ioas != access->ioas_unpin)
+ return -EBUSY;
+
+ if (cur_ioas == new_ioas)
+ return 0;
+
+ /*
+ * Set ioas to NULL to block any further iommufd_access_pin_pages().
+ * iommufd_access_unpin_pages() can continue using access->ioas_unpin.
+ */
+ access->ioas = NULL;
+
+ if (new_ioas) {
+ rc = iopt_add_access(&new_ioas->iopt, access);
+ if (rc) {
+ access->ioas = cur_ioas;
+ return rc;
+ }
+ refcount_inc(&new_ioas->obj.users);
+ }
+
+ if (cur_ioas) {
+ if (access->ops->unmap) {
+ mutex_unlock(&access->ioas_lock);
+ access->ops->unmap(access->data, 0, ULONG_MAX);
+ mutex_lock(&access->ioas_lock);
+ }
+ iopt_remove_access(&cur_ioas->iopt, access, iopt_access_list_id);
+ refcount_dec(&cur_ioas->obj.users);
+ }
+
+ access->ioas = new_ioas;
+ access->ioas_unpin = new_ioas;
+
+ return 0;
+}
+
+static int iommufd_access_change_ioas_id(struct iommufd_access *access, u32 id)
+{
+ struct iommufd_ioas *ioas = iommufd_get_ioas(access->ictx, id);
+ int rc;
+
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+ rc = iommufd_access_change_ioas(access, ioas);
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+void iommufd_access_destroy_object(struct iommufd_object *obj)
+{
+ struct iommufd_access *access =
+ container_of(obj, struct iommufd_access, obj);
+
+ mutex_lock(&access->ioas_lock);
+ if (access->ioas)
+ WARN_ON(iommufd_access_change_ioas(access, NULL));
+ mutex_unlock(&access->ioas_lock);
+ iommufd_ctx_put(access->ictx);
+}
+
+/**
+ * iommufd_access_create - Create an iommufd_access
+ * @ictx: iommufd file descriptor
+ * @ops: Driver's ops to associate with the access
+ * @data: Opaque data to pass into ops functions
+ * @id: Output ID number to return to userspace for this access
+ *
+ * An iommufd_access allows a driver to read/write to the IOAS without using
+ * DMA. The underlying CPU memory can be accessed using the
+ * iommufd_access_pin_pages() or iommufd_access_rw() functions.
+ *
+ * The provided ops are required to use iommufd_access_pin_pages().
+ */
+struct iommufd_access *
+iommufd_access_create(struct iommufd_ctx *ictx,
+ const struct iommufd_access_ops *ops, void *data, u32 *id)
+{
+ struct iommufd_access *access;
+
+ /*
+ * There is no uAPI for the access object, but to keep things symmetric
+ * use the object infrastructure anyhow.
+ */
+ access = iommufd_object_alloc(ictx, access, IOMMUFD_OBJ_ACCESS);
+ if (IS_ERR(access))
+ return access;
+
+ access->data = data;
+ access->ops = ops;
+
+ if (ops->needs_pin_pages)
+ access->iova_alignment = PAGE_SIZE;
+ else
+ access->iova_alignment = 1;
+
+ /* The calling driver is a user until iommufd_access_destroy() */
+ refcount_inc(&access->obj.users);
+ access->ictx = ictx;
+ iommufd_ctx_get(ictx);
+ iommufd_object_finalize(ictx, &access->obj);
+ *id = access->obj.id;
+ mutex_init(&access->ioas_lock);
+ return access;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_access_create, IOMMUFD);
+
+/**
+ * iommufd_access_destroy - Destroy an iommufd_access
+ * @access: The access to destroy
+ *
+ * The caller must stop using the access before destroying it.
+ */
+void iommufd_access_destroy(struct iommufd_access *access)
+{
+ iommufd_object_destroy_user(access->ictx, &access->obj);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, IOMMUFD);
+
+void iommufd_access_detach(struct iommufd_access *access)
+{
+ mutex_lock(&access->ioas_lock);
+ if (WARN_ON(!access->ioas)) {
+ mutex_unlock(&access->ioas_lock);
+ return;
+ }
+ WARN_ON(iommufd_access_change_ioas(access, NULL));
+ mutex_unlock(&access->ioas_lock);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_access_detach, IOMMUFD);
+
+int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id)
+{
+ int rc;
+
+ mutex_lock(&access->ioas_lock);
+ if (WARN_ON(access->ioas)) {
+ mutex_unlock(&access->ioas_lock);
+ return -EINVAL;
+ }
+
+ rc = iommufd_access_change_ioas_id(access, ioas_id);
+ mutex_unlock(&access->ioas_lock);
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, IOMMUFD);
+
+int iommufd_access_replace(struct iommufd_access *access, u32 ioas_id)
+{
+ int rc;
+
+ mutex_lock(&access->ioas_lock);
+ if (!access->ioas) {
+ mutex_unlock(&access->ioas_lock);
+ return -ENOENT;
+ }
+ rc = iommufd_access_change_ioas_id(access, ioas_id);
+ mutex_unlock(&access->ioas_lock);
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_access_replace, IOMMUFD);
+
+/**
+ * iommufd_access_notify_unmap - Notify users of an iopt to stop using it
+ * @iopt: iopt to work on
+ * @iova: Starting iova in the iopt
+ * @length: Number of bytes
+ *
+ * After this function returns there should be no users attached to the pages
+ * linked to this iopt that intersect with iova,length. Anyone that has attached
+ * a user through iopt_access_pages() needs to detach it through
+ * iommufd_access_unpin_pages() before this function returns.
+ *
+ * iommufd_access_destroy() will wait for any outstanding unmap callback to
+ * complete. Once iommufd_access_destroy() no unmap ops are running or will
+ * run in the future. Due to this a driver must not create locking that prevents
+ * unmap to complete while iommufd_access_destroy() is running.
+ */
+void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
+ unsigned long length)
+{
+ struct iommufd_ioas *ioas =
+ container_of(iopt, struct iommufd_ioas, iopt);
+ struct iommufd_access *access;
+ unsigned long index;
+
+ xa_lock(&ioas->iopt.access_list);
+ xa_for_each(&ioas->iopt.access_list, index, access) {
+ if (!iommufd_lock_obj(&access->obj))
+ continue;
+ xa_unlock(&ioas->iopt.access_list);
+
+ access->ops->unmap(access->data, iova, length);
+
+ iommufd_put_object(&access->obj);
+ xa_lock(&ioas->iopt.access_list);
+ }
+ xa_unlock(&ioas->iopt.access_list);
+}
+
+/**
+ * iommufd_access_unpin_pages() - Undo iommufd_access_pin_pages
+ * @access: IOAS access to act on
+ * @iova: Starting IOVA
+ * @length: Number of bytes to access
+ *
+ * Return the struct page's. The caller must stop accessing them before calling
+ * this. The iova/length must exactly match the one provided to access_pages.
+ */
+void iommufd_access_unpin_pages(struct iommufd_access *access,
+ unsigned long iova, unsigned long length)
+{
+ struct iopt_area_contig_iter iter;
+ struct io_pagetable *iopt;
+ unsigned long last_iova;
+ struct iopt_area *area;
+
+ if (WARN_ON(!length) ||
+ WARN_ON(check_add_overflow(iova, length - 1, &last_iova)))
+ return;
+
+ mutex_lock(&access->ioas_lock);
+ /*
+ * The driver must be doing something wrong if it calls this before an
+ * iommufd_access_attach() or after an iommufd_access_detach().
+ */
+ if (WARN_ON(!access->ioas_unpin)) {
+ mutex_unlock(&access->ioas_lock);
+ return;
+ }
+ iopt = &access->ioas_unpin->iopt;
+
+ down_read(&iopt->iova_rwsem);
+ iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova)
+ iopt_area_remove_access(
+ area, iopt_area_iova_to_index(area, iter.cur_iova),
+ iopt_area_iova_to_index(
+ area,
+ min(last_iova, iopt_area_last_iova(area))));
+ WARN_ON(!iopt_area_contig_done(&iter));
+ up_read(&iopt->iova_rwsem);
+ mutex_unlock(&access->ioas_lock);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, IOMMUFD);
+
+static bool iopt_area_contig_is_aligned(struct iopt_area_contig_iter *iter)
+{
+ if (iopt_area_start_byte(iter->area, iter->cur_iova) % PAGE_SIZE)
+ return false;
+
+ if (!iopt_area_contig_done(iter) &&
+ (iopt_area_start_byte(iter->area, iopt_area_last_iova(iter->area)) %
+ PAGE_SIZE) != (PAGE_SIZE - 1))
+ return false;
+ return true;
+}
+
+static bool check_area_prot(struct iopt_area *area, unsigned int flags)
+{
+ if (flags & IOMMUFD_ACCESS_RW_WRITE)
+ return area->iommu_prot & IOMMU_WRITE;
+ return area->iommu_prot & IOMMU_READ;
+}
+
+/**
+ * iommufd_access_pin_pages() - Return a list of pages under the iova
+ * @access: IOAS access to act on
+ * @iova: Starting IOVA
+ * @length: Number of bytes to access
+ * @out_pages: Output page list
+ * @flags: IOPMMUFD_ACCESS_RW_* flags
+ *
+ * Reads @length bytes starting at iova and returns the struct page * pointers.
+ * These can be kmap'd by the caller for CPU access.
+ *
+ * The caller must perform iommufd_access_unpin_pages() when done to balance
+ * this.
+ *
+ * This API always requires a page aligned iova. This happens naturally if the
+ * ioas alignment is >= PAGE_SIZE and the iova is PAGE_SIZE aligned. However
+ * smaller alignments have corner cases where this API can fail on otherwise
+ * aligned iova.
+ */
+int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova,
+ unsigned long length, struct page **out_pages,
+ unsigned int flags)
+{
+ struct iopt_area_contig_iter iter;
+ struct io_pagetable *iopt;
+ unsigned long last_iova;
+ struct iopt_area *area;
+ int rc;
+
+ /* Driver's ops don't support pin_pages */
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap))
+ return -EINVAL;
+
+ if (!length)
+ return -EINVAL;
+ if (check_add_overflow(iova, length - 1, &last_iova))
+ return -EOVERFLOW;
+
+ mutex_lock(&access->ioas_lock);
+ if (!access->ioas) {
+ mutex_unlock(&access->ioas_lock);
+ return -ENOENT;
+ }
+ iopt = &access->ioas->iopt;
+
+ down_read(&iopt->iova_rwsem);
+ iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
+ unsigned long last = min(last_iova, iopt_area_last_iova(area));
+ unsigned long last_index = iopt_area_iova_to_index(area, last);
+ unsigned long index =
+ iopt_area_iova_to_index(area, iter.cur_iova);
+
+ if (area->prevent_access ||
+ !iopt_area_contig_is_aligned(&iter)) {
+ rc = -EINVAL;
+ goto err_remove;
+ }
+
+ if (!check_area_prot(area, flags)) {
+ rc = -EPERM;
+ goto err_remove;
+ }
+
+ rc = iopt_area_add_access(area, index, last_index, out_pages,
+ flags);
+ if (rc)
+ goto err_remove;
+ out_pages += last_index - index + 1;
+ }
+ if (!iopt_area_contig_done(&iter)) {
+ rc = -ENOENT;
+ goto err_remove;
+ }
+
+ up_read(&iopt->iova_rwsem);
+ mutex_unlock(&access->ioas_lock);
+ return 0;
+
+err_remove:
+ if (iova < iter.cur_iova) {
+ last_iova = iter.cur_iova - 1;
+ iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova)
+ iopt_area_remove_access(
+ area,
+ iopt_area_iova_to_index(area, iter.cur_iova),
+ iopt_area_iova_to_index(
+ area, min(last_iova,
+ iopt_area_last_iova(area))));
+ }
+ up_read(&iopt->iova_rwsem);
+ mutex_unlock(&access->ioas_lock);
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, IOMMUFD);
+
+/**
+ * iommufd_access_rw - Read or write data under the iova
+ * @access: IOAS access to act on
+ * @iova: Starting IOVA
+ * @data: Kernel buffer to copy to/from
+ * @length: Number of bytes to access
+ * @flags: IOMMUFD_ACCESS_RW_* flags
+ *
+ * Copy kernel to/from data into the range given by IOVA/length. If flags
+ * indicates IOMMUFD_ACCESS_RW_KTHREAD then a large copy can be optimized
+ * by changing it into copy_to/from_user().
+ */
+int iommufd_access_rw(struct iommufd_access *access, unsigned long iova,
+ void *data, size_t length, unsigned int flags)
+{
+ struct iopt_area_contig_iter iter;
+ struct io_pagetable *iopt;
+ struct iopt_area *area;
+ unsigned long last_iova;
+ int rc;
+
+ if (!length)
+ return -EINVAL;
+ if (check_add_overflow(iova, length - 1, &last_iova))
+ return -EOVERFLOW;
+
+ mutex_lock(&access->ioas_lock);
+ if (!access->ioas) {
+ mutex_unlock(&access->ioas_lock);
+ return -ENOENT;
+ }
+ iopt = &access->ioas->iopt;
+
+ down_read(&iopt->iova_rwsem);
+ iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
+ unsigned long last = min(last_iova, iopt_area_last_iova(area));
+ unsigned long bytes = (last - iter.cur_iova) + 1;
+
+ if (area->prevent_access) {
+ rc = -EINVAL;
+ goto err_out;
+ }
+
+ if (!check_area_prot(area, flags)) {
+ rc = -EPERM;
+ goto err_out;
+ }
+
+ rc = iopt_pages_rw_access(
+ area->pages, iopt_area_start_byte(area, iter.cur_iova),
+ data, bytes, flags);
+ if (rc)
+ goto err_out;
+ data += bytes;
+ }
+ if (!iopt_area_contig_done(&iter))
+ rc = -ENOENT;
+err_out:
+ up_read(&iopt->iova_rwsem);
+ mutex_unlock(&access->ioas_lock);
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, IOMMUFD);
+
+int iommufd_get_hw_info(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_hw_info *cmd = ucmd->cmd;
+ void __user *user_ptr = u64_to_user_ptr(cmd->data_uptr);
+ const struct iommu_ops *ops;
+ struct iommufd_device *idev;
+ unsigned int data_len;
+ unsigned int copy_len;
+ void *data;
+ int rc;
+
+ if (cmd->flags || cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ idev = iommufd_get_device(ucmd, cmd->dev_id);
+ if (IS_ERR(idev))
+ return PTR_ERR(idev);
+
+ ops = dev_iommu_ops(idev->dev);
+ if (ops->hw_info) {
+ data = ops->hw_info(idev->dev, &data_len, &cmd->out_data_type);
+ if (IS_ERR(data)) {
+ rc = PTR_ERR(data);
+ goto out_put;
+ }
+
+ /*
+ * drivers that have hw_info callback should have a unique
+ * iommu_hw_info_type.
+ */
+ if (WARN_ON_ONCE(cmd->out_data_type ==
+ IOMMU_HW_INFO_TYPE_NONE)) {
+ rc = -ENODEV;
+ goto out_free;
+ }
+ } else {
+ cmd->out_data_type = IOMMU_HW_INFO_TYPE_NONE;
+ data_len = 0;
+ data = NULL;
+ }
+
+ copy_len = min(cmd->data_len, data_len);
+ if (copy_to_user(user_ptr, data, copy_len)) {
+ rc = -EFAULT;
+ goto out_free;
+ }
+
+ /*
+ * Zero the trailing bytes if the user buffer is bigger than the
+ * data size kernel actually has.
+ */
+ if (copy_len < cmd->data_len) {
+ if (clear_user(user_ptr + copy_len, cmd->data_len - copy_len)) {
+ rc = -EFAULT;
+ goto out_free;
+ }
+ }
+
+ /*
+ * We return the length the kernel supports so userspace may know what
+ * the kernel capability is. It could be larger than the input buffer.
+ */
+ cmd->data_len = data_len;
+
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+out_free:
+ kfree(data);
+out_put:
+ iommufd_put_object(&idev->obj);
+ return rc;
+}
diff --git a/drivers/iommu/iommufd/double_span.h b/drivers/iommu/iommufd/double_span.h
new file mode 100644
index 0000000000..b37aab7488
--- /dev/null
+++ b/drivers/iommu/iommufd/double_span.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES.
+ */
+#ifndef __IOMMUFD_DOUBLE_SPAN_H
+#define __IOMMUFD_DOUBLE_SPAN_H
+
+#include <linux/interval_tree.h>
+
+/*
+ * This is a variation of the general interval_tree_span_iter that computes the
+ * spans over the union of two different interval trees. Used ranges are broken
+ * up and reported based on the tree that provides the interval. The first span
+ * always takes priority. Like interval_tree_span_iter it is greedy and the same
+ * value of is_used will not repeat on two iteration cycles.
+ */
+struct interval_tree_double_span_iter {
+ struct rb_root_cached *itrees[2];
+ struct interval_tree_span_iter spans[2];
+ union {
+ unsigned long start_hole;
+ unsigned long start_used;
+ };
+ union {
+ unsigned long last_hole;
+ unsigned long last_used;
+ };
+ /* 0 = hole, 1 = used span[0], 2 = used span[1], -1 done iteration */
+ int is_used;
+};
+
+void interval_tree_double_span_iter_update(
+ struct interval_tree_double_span_iter *iter);
+void interval_tree_double_span_iter_first(
+ struct interval_tree_double_span_iter *iter,
+ struct rb_root_cached *itree1, struct rb_root_cached *itree2,
+ unsigned long first_index, unsigned long last_index);
+void interval_tree_double_span_iter_next(
+ struct interval_tree_double_span_iter *iter);
+
+static inline bool
+interval_tree_double_span_iter_done(struct interval_tree_double_span_iter *state)
+{
+ return state->is_used == -1;
+}
+
+#define interval_tree_for_each_double_span(span, itree1, itree2, first_index, \
+ last_index) \
+ for (interval_tree_double_span_iter_first(span, itree1, itree2, \
+ first_index, last_index); \
+ !interval_tree_double_span_iter_done(span); \
+ interval_tree_double_span_iter_next(span))
+
+#endif
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
new file mode 100644
index 0000000000..cf2c1504e2
--- /dev/null
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ */
+#include <linux/iommu.h>
+#include <uapi/linux/iommufd.h>
+
+#include "iommufd_private.h"
+
+void iommufd_hw_pagetable_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_hw_pagetable *hwpt =
+ container_of(obj, struct iommufd_hw_pagetable, obj);
+
+ if (!list_empty(&hwpt->hwpt_item)) {
+ mutex_lock(&hwpt->ioas->mutex);
+ list_del(&hwpt->hwpt_item);
+ mutex_unlock(&hwpt->ioas->mutex);
+
+ iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain);
+ }
+
+ if (hwpt->domain)
+ iommu_domain_free(hwpt->domain);
+
+ refcount_dec(&hwpt->ioas->obj.users);
+}
+
+void iommufd_hw_pagetable_abort(struct iommufd_object *obj)
+{
+ struct iommufd_hw_pagetable *hwpt =
+ container_of(obj, struct iommufd_hw_pagetable, obj);
+
+ /* The ioas->mutex must be held until finalize is called. */
+ lockdep_assert_held(&hwpt->ioas->mutex);
+
+ if (!list_empty(&hwpt->hwpt_item)) {
+ list_del_init(&hwpt->hwpt_item);
+ iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain);
+ }
+ iommufd_hw_pagetable_destroy(obj);
+}
+
+int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt)
+{
+ if (hwpt->enforce_cache_coherency)
+ return 0;
+
+ if (hwpt->domain->ops->enforce_cache_coherency)
+ hwpt->enforce_cache_coherency =
+ hwpt->domain->ops->enforce_cache_coherency(
+ hwpt->domain);
+ if (!hwpt->enforce_cache_coherency)
+ return -EINVAL;
+ return 0;
+}
+
+/**
+ * iommufd_hw_pagetable_alloc() - Get an iommu_domain for a device
+ * @ictx: iommufd context
+ * @ioas: IOAS to associate the domain with
+ * @idev: Device to get an iommu_domain for
+ * @immediate_attach: True if idev should be attached to the hwpt
+ *
+ * Allocate a new iommu_domain and return it as a hw_pagetable. The HWPT
+ * will be linked to the given ioas and upon return the underlying iommu_domain
+ * is fully popoulated.
+ *
+ * The caller must hold the ioas->mutex until after
+ * iommufd_object_abort_and_destroy() or iommufd_object_finalize() is called on
+ * the returned hwpt.
+ */
+struct iommufd_hw_pagetable *
+iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
+ struct iommufd_device *idev, bool immediate_attach)
+{
+ struct iommufd_hw_pagetable *hwpt;
+ int rc;
+
+ lockdep_assert_held(&ioas->mutex);
+
+ hwpt = iommufd_object_alloc(ictx, hwpt, IOMMUFD_OBJ_HW_PAGETABLE);
+ if (IS_ERR(hwpt))
+ return hwpt;
+
+ INIT_LIST_HEAD(&hwpt->hwpt_item);
+ /* Pairs with iommufd_hw_pagetable_destroy() */
+ refcount_inc(&ioas->obj.users);
+ hwpt->ioas = ioas;
+
+ hwpt->domain = iommu_domain_alloc(idev->dev->bus);
+ if (!hwpt->domain) {
+ rc = -ENOMEM;
+ goto out_abort;
+ }
+
+ /*
+ * Set the coherency mode before we do iopt_table_add_domain() as some
+ * iommus have a per-PTE bit that controls it and need to decide before
+ * doing any maps. It is an iommu driver bug to report
+ * IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail enforce_cache_coherency on
+ * a new domain.
+ */
+ if (idev->enforce_cache_coherency) {
+ rc = iommufd_hw_pagetable_enforce_cc(hwpt);
+ if (WARN_ON(rc))
+ goto out_abort;
+ }
+
+ /*
+ * immediate_attach exists only to accommodate iommu drivers that cannot
+ * directly allocate a domain. These drivers do not finish creating the
+ * domain until attach is completed. Thus we must have this call
+ * sequence. Once those drivers are fixed this should be removed.
+ */
+ if (immediate_attach) {
+ rc = iommufd_hw_pagetable_attach(hwpt, idev);
+ if (rc)
+ goto out_abort;
+ }
+
+ rc = iopt_table_add_domain(&hwpt->ioas->iopt, hwpt->domain);
+ if (rc)
+ goto out_detach;
+ list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list);
+ return hwpt;
+
+out_detach:
+ if (immediate_attach)
+ iommufd_hw_pagetable_detach(idev);
+out_abort:
+ iommufd_object_abort_and_destroy(ictx, &hwpt->obj);
+ return ERR_PTR(rc);
+}
+
+int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_hwpt_alloc *cmd = ucmd->cmd;
+ struct iommufd_hw_pagetable *hwpt;
+ struct iommufd_device *idev;
+ struct iommufd_ioas *ioas;
+ int rc;
+
+ if (cmd->flags || cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ idev = iommufd_get_device(ucmd, cmd->dev_id);
+ if (IS_ERR(idev))
+ return PTR_ERR(idev);
+
+ ioas = iommufd_get_ioas(ucmd->ictx, cmd->pt_id);
+ if (IS_ERR(ioas)) {
+ rc = PTR_ERR(ioas);
+ goto out_put_idev;
+ }
+
+ mutex_lock(&ioas->mutex);
+ hwpt = iommufd_hw_pagetable_alloc(ucmd->ictx, ioas, idev, false);
+ if (IS_ERR(hwpt)) {
+ rc = PTR_ERR(hwpt);
+ goto out_unlock;
+ }
+
+ cmd->out_hwpt_id = hwpt->obj.id;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_hwpt;
+ iommufd_object_finalize(ucmd->ictx, &hwpt->obj);
+ goto out_unlock;
+
+out_hwpt:
+ iommufd_object_abort_and_destroy(ucmd->ictx, &hwpt->obj);
+out_unlock:
+ mutex_unlock(&ioas->mutex);
+ iommufd_put_object(&ioas->obj);
+out_put_idev:
+ iommufd_put_object(&idev->obj);
+ return rc;
+}
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
new file mode 100644
index 0000000000..117a39ae2e
--- /dev/null
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -0,0 +1,1244 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
+ *
+ * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The
+ * PFNs can be placed into an iommu_domain, or returned to the caller as a page
+ * list for access by an in-kernel user.
+ *
+ * The datastructure uses the iopt_pages to optimize the storage of the PFNs
+ * between the domains and xarray.
+ */
+#include <linux/iommufd.h>
+#include <linux/lockdep.h>
+#include <linux/iommu.h>
+#include <linux/sched/mm.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+
+#include "io_pagetable.h"
+#include "double_span.h"
+
+struct iopt_pages_list {
+ struct iopt_pages *pages;
+ struct iopt_area *area;
+ struct list_head next;
+ unsigned long start_byte;
+ unsigned long length;
+};
+
+struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
+ struct io_pagetable *iopt,
+ unsigned long iova,
+ unsigned long last_iova)
+{
+ lockdep_assert_held(&iopt->iova_rwsem);
+
+ iter->cur_iova = iova;
+ iter->last_iova = last_iova;
+ iter->area = iopt_area_iter_first(iopt, iova, iova);
+ if (!iter->area)
+ return NULL;
+ if (!iter->area->pages) {
+ iter->area = NULL;
+ return NULL;
+ }
+ return iter->area;
+}
+
+struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
+{
+ unsigned long last_iova;
+
+ if (!iter->area)
+ return NULL;
+ last_iova = iopt_area_last_iova(iter->area);
+ if (iter->last_iova <= last_iova)
+ return NULL;
+
+ iter->cur_iova = last_iova + 1;
+ iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
+ iter->last_iova);
+ if (!iter->area)
+ return NULL;
+ if (iter->cur_iova != iopt_area_iova(iter->area) ||
+ !iter->area->pages) {
+ iter->area = NULL;
+ return NULL;
+ }
+ return iter->area;
+}
+
+static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
+ unsigned long length,
+ unsigned long iova_alignment,
+ unsigned long page_offset)
+{
+ if (span->is_used || span->last_hole - span->start_hole < length - 1)
+ return false;
+
+ span->start_hole = ALIGN(span->start_hole, iova_alignment) |
+ page_offset;
+ if (span->start_hole > span->last_hole ||
+ span->last_hole - span->start_hole < length - 1)
+ return false;
+ return true;
+}
+
+static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
+ unsigned long length,
+ unsigned long iova_alignment,
+ unsigned long page_offset)
+{
+ if (span->is_hole || span->last_used - span->start_used < length - 1)
+ return false;
+
+ span->start_used = ALIGN(span->start_used, iova_alignment) |
+ page_offset;
+ if (span->start_used > span->last_used ||
+ span->last_used - span->start_used < length - 1)
+ return false;
+ return true;
+}
+
+/*
+ * Automatically find a block of IOVA that is not being used and not reserved.
+ * Does not return a 0 IOVA even if it is valid.
+ */
+static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
+ unsigned long uptr, unsigned long length)
+{
+ unsigned long page_offset = uptr % PAGE_SIZE;
+ struct interval_tree_double_span_iter used_span;
+ struct interval_tree_span_iter allowed_span;
+ unsigned long iova_alignment;
+
+ lockdep_assert_held(&iopt->iova_rwsem);
+
+ /* Protect roundup_pow-of_two() from overflow */
+ if (length == 0 || length >= ULONG_MAX / 2)
+ return -EOVERFLOW;
+
+ /*
+ * Keep alignment present in the uptr when building the IOVA, this
+ * increases the chance we can map a THP.
+ */
+ if (!uptr)
+ iova_alignment = roundup_pow_of_two(length);
+ else
+ iova_alignment = min_t(unsigned long,
+ roundup_pow_of_two(length),
+ 1UL << __ffs64(uptr));
+
+ if (iova_alignment < iopt->iova_alignment)
+ return -EINVAL;
+
+ interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
+ PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
+ if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
+ allowed_span.start_used = PAGE_SIZE;
+ allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
+ allowed_span.is_hole = false;
+ }
+
+ if (!__alloc_iova_check_used(&allowed_span, length,
+ iova_alignment, page_offset))
+ continue;
+
+ interval_tree_for_each_double_span(
+ &used_span, &iopt->reserved_itree, &iopt->area_itree,
+ allowed_span.start_used, allowed_span.last_used) {
+ if (!__alloc_iova_check_hole(&used_span, length,
+ iova_alignment,
+ page_offset))
+ continue;
+
+ *iova = used_span.start_hole;
+ return 0;
+ }
+ }
+ return -ENOSPC;
+}
+
+static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
+ unsigned long length)
+{
+ unsigned long last;
+
+ lockdep_assert_held(&iopt->iova_rwsem);
+
+ if ((iova & (iopt->iova_alignment - 1)))
+ return -EINVAL;
+
+ if (check_add_overflow(iova, length - 1, &last))
+ return -EOVERFLOW;
+
+ /* No reserved IOVA intersects the range */
+ if (iopt_reserved_iter_first(iopt, iova, last))
+ return -EINVAL;
+
+ /* Check that there is not already a mapping in the range */
+ if (iopt_area_iter_first(iopt, iova, last))
+ return -EEXIST;
+ return 0;
+}
+
+/*
+ * The area takes a slice of the pages from start_bytes to start_byte + length
+ */
+static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
+ struct iopt_pages *pages, unsigned long iova,
+ unsigned long start_byte, unsigned long length,
+ int iommu_prot)
+{
+ lockdep_assert_held_write(&iopt->iova_rwsem);
+
+ if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
+ return -EPERM;
+
+ area->iommu_prot = iommu_prot;
+ area->page_offset = start_byte % PAGE_SIZE;
+ if (area->page_offset & (iopt->iova_alignment - 1))
+ return -EINVAL;
+
+ area->node.start = iova;
+ if (check_add_overflow(iova, length - 1, &area->node.last))
+ return -EOVERFLOW;
+
+ area->pages_node.start = start_byte / PAGE_SIZE;
+ if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
+ return -EOVERFLOW;
+ area->pages_node.last = area->pages_node.last / PAGE_SIZE;
+ if (WARN_ON(area->pages_node.last >= pages->npages))
+ return -EOVERFLOW;
+
+ /*
+ * The area is inserted with a NULL pages indicating it is not fully
+ * initialized yet.
+ */
+ area->iopt = iopt;
+ interval_tree_insert(&area->node, &iopt->area_itree);
+ return 0;
+}
+
+static struct iopt_area *iopt_area_alloc(void)
+{
+ struct iopt_area *area;
+
+ area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
+ if (!area)
+ return NULL;
+ RB_CLEAR_NODE(&area->node.rb);
+ RB_CLEAR_NODE(&area->pages_node.rb);
+ return area;
+}
+
+static int iopt_alloc_area_pages(struct io_pagetable *iopt,
+ struct list_head *pages_list,
+ unsigned long length, unsigned long *dst_iova,
+ int iommu_prot, unsigned int flags)
+{
+ struct iopt_pages_list *elm;
+ unsigned long iova;
+ int rc = 0;
+
+ list_for_each_entry(elm, pages_list, next) {
+ elm->area = iopt_area_alloc();
+ if (!elm->area)
+ return -ENOMEM;
+ }
+
+ down_write(&iopt->iova_rwsem);
+ if ((length & (iopt->iova_alignment - 1)) || !length) {
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (flags & IOPT_ALLOC_IOVA) {
+ /* Use the first entry to guess the ideal IOVA alignment */
+ elm = list_first_entry(pages_list, struct iopt_pages_list,
+ next);
+ rc = iopt_alloc_iova(
+ iopt, dst_iova,
+ (uintptr_t)elm->pages->uptr + elm->start_byte, length);
+ if (rc)
+ goto out_unlock;
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+ } else {
+ rc = iopt_check_iova(iopt, *dst_iova, length);
+ if (rc)
+ goto out_unlock;
+ }
+
+ /*
+ * Areas are created with a NULL pages so that the IOVA space is
+ * reserved and we can unlock the iova_rwsem.
+ */
+ iova = *dst_iova;
+ list_for_each_entry(elm, pages_list, next) {
+ rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
+ elm->start_byte, elm->length, iommu_prot);
+ if (rc)
+ goto out_unlock;
+ iova += elm->length;
+ }
+
+out_unlock:
+ up_write(&iopt->iova_rwsem);
+ return rc;
+}
+
+static void iopt_abort_area(struct iopt_area *area)
+{
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(area->pages);
+ if (area->iopt) {
+ down_write(&area->iopt->iova_rwsem);
+ interval_tree_remove(&area->node, &area->iopt->area_itree);
+ up_write(&area->iopt->iova_rwsem);
+ }
+ kfree(area);
+}
+
+void iopt_free_pages_list(struct list_head *pages_list)
+{
+ struct iopt_pages_list *elm;
+
+ while ((elm = list_first_entry_or_null(pages_list,
+ struct iopt_pages_list, next))) {
+ if (elm->area)
+ iopt_abort_area(elm->area);
+ if (elm->pages)
+ iopt_put_pages(elm->pages);
+ list_del(&elm->next);
+ kfree(elm);
+ }
+}
+
+static int iopt_fill_domains_pages(struct list_head *pages_list)
+{
+ struct iopt_pages_list *undo_elm;
+ struct iopt_pages_list *elm;
+ int rc;
+
+ list_for_each_entry(elm, pages_list, next) {
+ rc = iopt_area_fill_domains(elm->area, elm->pages);
+ if (rc)
+ goto err_undo;
+ }
+ return 0;
+
+err_undo:
+ list_for_each_entry(undo_elm, pages_list, next) {
+ if (undo_elm == elm)
+ break;
+ iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
+ }
+ return rc;
+}
+
+int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
+ unsigned long length, unsigned long *dst_iova,
+ int iommu_prot, unsigned int flags)
+{
+ struct iopt_pages_list *elm;
+ int rc;
+
+ rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
+ iommu_prot, flags);
+ if (rc)
+ return rc;
+
+ down_read(&iopt->domains_rwsem);
+ rc = iopt_fill_domains_pages(pages_list);
+ if (rc)
+ goto out_unlock_domains;
+
+ down_write(&iopt->iova_rwsem);
+ list_for_each_entry(elm, pages_list, next) {
+ /*
+ * area->pages must be set inside the domains_rwsem to ensure
+ * any newly added domains will get filled. Moves the reference
+ * in from the list.
+ */
+ elm->area->pages = elm->pages;
+ elm->pages = NULL;
+ elm->area = NULL;
+ }
+ up_write(&iopt->iova_rwsem);
+out_unlock_domains:
+ up_read(&iopt->domains_rwsem);
+ return rc;
+}
+
+/**
+ * iopt_map_user_pages() - Map a user VA to an iova in the io page table
+ * @ictx: iommufd_ctx the iopt is part of
+ * @iopt: io_pagetable to act on
+ * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
+ * the chosen iova on output. Otherwise is the iova to map to on input
+ * @uptr: User VA to map
+ * @length: Number of bytes to map
+ * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
+ * @flags: IOPT_ALLOC_IOVA or zero
+ *
+ * iova, uptr, and length must be aligned to iova_alignment. For domain backed
+ * page tables this will pin the pages and load them into the domain at iova.
+ * For non-domain page tables this will only setup a lazy reference and the
+ * caller must use iopt_access_pages() to touch them.
+ *
+ * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
+ * destroyed.
+ */
+int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
+ unsigned long *iova, void __user *uptr,
+ unsigned long length, int iommu_prot,
+ unsigned int flags)
+{
+ struct iopt_pages_list elm = {};
+ LIST_HEAD(pages_list);
+ int rc;
+
+ elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
+ if (IS_ERR(elm.pages))
+ return PTR_ERR(elm.pages);
+ if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
+ elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
+ elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
+ elm.start_byte = uptr - elm.pages->uptr;
+ elm.length = length;
+ list_add(&elm.next, &pages_list);
+
+ rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
+ if (rc) {
+ if (elm.area)
+ iopt_abort_area(elm.area);
+ if (elm.pages)
+ iopt_put_pages(elm.pages);
+ return rc;
+ }
+ return 0;
+}
+
+int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
+ unsigned long length, struct list_head *pages_list)
+{
+ struct iopt_area_contig_iter iter;
+ unsigned long last_iova;
+ struct iopt_area *area;
+ int rc;
+
+ if (!length)
+ return -EINVAL;
+ if (check_add_overflow(iova, length - 1, &last_iova))
+ return -EOVERFLOW;
+
+ down_read(&iopt->iova_rwsem);
+ iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
+ struct iopt_pages_list *elm;
+ unsigned long last = min(last_iova, iopt_area_last_iova(area));
+
+ elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
+ if (!elm) {
+ rc = -ENOMEM;
+ goto err_free;
+ }
+ elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
+ elm->pages = area->pages;
+ elm->length = (last - iter.cur_iova) + 1;
+ kref_get(&elm->pages->kref);
+ list_add_tail(&elm->next, pages_list);
+ }
+ if (!iopt_area_contig_done(&iter)) {
+ rc = -ENOENT;
+ goto err_free;
+ }
+ up_read(&iopt->iova_rwsem);
+ return 0;
+err_free:
+ up_read(&iopt->iova_rwsem);
+ iopt_free_pages_list(pages_list);
+ return rc;
+}
+
+static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
+ unsigned long last, unsigned long *unmapped)
+{
+ struct iopt_area *area;
+ unsigned long unmapped_bytes = 0;
+ unsigned int tries = 0;
+ int rc = -ENOENT;
+
+ /*
+ * The domains_rwsem must be held in read mode any time any area->pages
+ * is NULL. This prevents domain attach/detatch from running
+ * concurrently with cleaning up the area.
+ */
+again:
+ down_read(&iopt->domains_rwsem);
+ down_write(&iopt->iova_rwsem);
+ while ((area = iopt_area_iter_first(iopt, start, last))) {
+ unsigned long area_last = iopt_area_last_iova(area);
+ unsigned long area_first = iopt_area_iova(area);
+ struct iopt_pages *pages;
+
+ /* Userspace should not race map/unmap's of the same area */
+ if (!area->pages) {
+ rc = -EBUSY;
+ goto out_unlock_iova;
+ }
+
+ if (area_first < start || area_last > last) {
+ rc = -ENOENT;
+ goto out_unlock_iova;
+ }
+
+ if (area_first != start)
+ tries = 0;
+
+ /*
+ * num_accesses writers must hold the iova_rwsem too, so we can
+ * safely read it under the write side of the iovam_rwsem
+ * without the pages->mutex.
+ */
+ if (area->num_accesses) {
+ size_t length = iopt_area_length(area);
+
+ start = area_first;
+ area->prevent_access = true;
+ up_write(&iopt->iova_rwsem);
+ up_read(&iopt->domains_rwsem);
+
+ iommufd_access_notify_unmap(iopt, area_first, length);
+ /* Something is not responding to unmap requests. */
+ tries++;
+ if (WARN_ON(tries > 100))
+ return -EDEADLOCK;
+ goto again;
+ }
+
+ pages = area->pages;
+ area->pages = NULL;
+ up_write(&iopt->iova_rwsem);
+
+ iopt_area_unfill_domains(area, pages);
+ iopt_abort_area(area);
+ iopt_put_pages(pages);
+
+ unmapped_bytes += area_last - area_first + 1;
+
+ down_write(&iopt->iova_rwsem);
+ }
+ if (unmapped_bytes)
+ rc = 0;
+
+out_unlock_iova:
+ up_write(&iopt->iova_rwsem);
+ up_read(&iopt->domains_rwsem);
+ if (unmapped)
+ *unmapped = unmapped_bytes;
+ return rc;
+}
+
+/**
+ * iopt_unmap_iova() - Remove a range of iova
+ * @iopt: io_pagetable to act on
+ * @iova: Starting iova to unmap
+ * @length: Number of bytes to unmap
+ * @unmapped: Return number of bytes unmapped
+ *
+ * The requested range must be a superset of existing ranges.
+ * Splitting/truncating IOVA mappings is not allowed.
+ */
+int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
+ unsigned long length, unsigned long *unmapped)
+{
+ unsigned long iova_last;
+
+ if (!length)
+ return -EINVAL;
+
+ if (check_add_overflow(iova, length - 1, &iova_last))
+ return -EOVERFLOW;
+
+ return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
+}
+
+int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
+{
+ int rc;
+
+ rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
+ /* If the IOVAs are empty then unmap all succeeds */
+ if (rc == -ENOENT)
+ return 0;
+ return rc;
+}
+
+/* The caller must always free all the nodes in the allowed_iova rb_root. */
+int iopt_set_allow_iova(struct io_pagetable *iopt,
+ struct rb_root_cached *allowed_iova)
+{
+ struct iopt_allowed *allowed;
+
+ down_write(&iopt->iova_rwsem);
+ swap(*allowed_iova, iopt->allowed_itree);
+
+ for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
+ allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
+ if (iopt_reserved_iter_first(iopt, allowed->node.start,
+ allowed->node.last)) {
+ swap(*allowed_iova, iopt->allowed_itree);
+ up_write(&iopt->iova_rwsem);
+ return -EADDRINUSE;
+ }
+ }
+ up_write(&iopt->iova_rwsem);
+ return 0;
+}
+
+int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
+ unsigned long last, void *owner)
+{
+ struct iopt_reserved *reserved;
+
+ lockdep_assert_held_write(&iopt->iova_rwsem);
+
+ if (iopt_area_iter_first(iopt, start, last) ||
+ iopt_allowed_iter_first(iopt, start, last))
+ return -EADDRINUSE;
+
+ reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
+ if (!reserved)
+ return -ENOMEM;
+ reserved->node.start = start;
+ reserved->node.last = last;
+ reserved->owner = owner;
+ interval_tree_insert(&reserved->node, &iopt->reserved_itree);
+ return 0;
+}
+
+static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
+{
+ struct iopt_reserved *reserved, *next;
+
+ lockdep_assert_held_write(&iopt->iova_rwsem);
+
+ for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
+ reserved = next) {
+ next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
+
+ if (reserved->owner == owner) {
+ interval_tree_remove(&reserved->node,
+ &iopt->reserved_itree);
+ kfree(reserved);
+ }
+ }
+}
+
+void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
+{
+ down_write(&iopt->iova_rwsem);
+ __iopt_remove_reserved_iova(iopt, owner);
+ up_write(&iopt->iova_rwsem);
+}
+
+void iopt_init_table(struct io_pagetable *iopt)
+{
+ init_rwsem(&iopt->iova_rwsem);
+ init_rwsem(&iopt->domains_rwsem);
+ iopt->area_itree = RB_ROOT_CACHED;
+ iopt->allowed_itree = RB_ROOT_CACHED;
+ iopt->reserved_itree = RB_ROOT_CACHED;
+ xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
+ xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);
+
+ /*
+ * iopt's start as SW tables that can use the entire size_t IOVA space
+ * due to the use of size_t in the APIs. They have no alignment
+ * restriction.
+ */
+ iopt->iova_alignment = 1;
+}
+
+void iopt_destroy_table(struct io_pagetable *iopt)
+{
+ struct interval_tree_node *node;
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ iopt_remove_reserved_iova(iopt, NULL);
+
+ while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
+ ULONG_MAX))) {
+ interval_tree_remove(node, &iopt->allowed_itree);
+ kfree(container_of(node, struct iopt_allowed, node));
+ }
+
+ WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
+ WARN_ON(!xa_empty(&iopt->domains));
+ WARN_ON(!xa_empty(&iopt->access_list));
+ WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
+}
+
+/**
+ * iopt_unfill_domain() - Unfill a domain with PFNs
+ * @iopt: io_pagetable to act on
+ * @domain: domain to unfill
+ *
+ * This is used when removing a domain from the iopt. Every area in the iopt
+ * will be unmapped from the domain. The domain must already be removed from the
+ * domains xarray.
+ */
+static void iopt_unfill_domain(struct io_pagetable *iopt,
+ struct iommu_domain *domain)
+{
+ struct iopt_area *area;
+
+ lockdep_assert_held(&iopt->iova_rwsem);
+ lockdep_assert_held_write(&iopt->domains_rwsem);
+
+ /*
+ * Some other domain is holding all the pfns still, rapidly unmap this
+ * domain.
+ */
+ if (iopt->next_domain_id != 0) {
+ /* Pick an arbitrary remaining domain to act as storage */
+ struct iommu_domain *storage_domain =
+ xa_load(&iopt->domains, 0);
+
+ for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+ area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
+ struct iopt_pages *pages = area->pages;
+
+ if (!pages)
+ continue;
+
+ mutex_lock(&pages->mutex);
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(!area->storage_domain);
+ if (area->storage_domain == domain)
+ area->storage_domain = storage_domain;
+ mutex_unlock(&pages->mutex);
+
+ iopt_area_unmap_domain(area, domain);
+ }
+ return;
+ }
+
+ for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+ area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
+ struct iopt_pages *pages = area->pages;
+
+ if (!pages)
+ continue;
+
+ mutex_lock(&pages->mutex);
+ interval_tree_remove(&area->pages_node, &pages->domains_itree);
+ WARN_ON(area->storage_domain != domain);
+ area->storage_domain = NULL;
+ iopt_area_unfill_domain(area, pages, domain);
+ mutex_unlock(&pages->mutex);
+ }
+}
+
+/**
+ * iopt_fill_domain() - Fill a domain with PFNs
+ * @iopt: io_pagetable to act on
+ * @domain: domain to fill
+ *
+ * Fill the domain with PFNs from every area in the iopt. On failure the domain
+ * is left unchanged.
+ */
+static int iopt_fill_domain(struct io_pagetable *iopt,
+ struct iommu_domain *domain)
+{
+ struct iopt_area *end_area;
+ struct iopt_area *area;
+ int rc;
+
+ lockdep_assert_held(&iopt->iova_rwsem);
+ lockdep_assert_held_write(&iopt->domains_rwsem);
+
+ for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+ area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
+ struct iopt_pages *pages = area->pages;
+
+ if (!pages)
+ continue;
+
+ mutex_lock(&pages->mutex);
+ rc = iopt_area_fill_domain(area, domain);
+ if (rc) {
+ mutex_unlock(&pages->mutex);
+ goto out_unfill;
+ }
+ if (!area->storage_domain) {
+ WARN_ON(iopt->next_domain_id != 0);
+ area->storage_domain = domain;
+ interval_tree_insert(&area->pages_node,
+ &pages->domains_itree);
+ }
+ mutex_unlock(&pages->mutex);
+ }
+ return 0;
+
+out_unfill:
+ end_area = area;
+ for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+ area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
+ struct iopt_pages *pages = area->pages;
+
+ if (area == end_area)
+ break;
+ if (!pages)
+ continue;
+ mutex_lock(&pages->mutex);
+ if (iopt->next_domain_id == 0) {
+ interval_tree_remove(&area->pages_node,
+ &pages->domains_itree);
+ area->storage_domain = NULL;
+ }
+ iopt_area_unfill_domain(area, pages, domain);
+ mutex_unlock(&pages->mutex);
+ }
+ return rc;
+}
+
+/* All existing area's conform to an increased page size */
+static int iopt_check_iova_alignment(struct io_pagetable *iopt,
+ unsigned long new_iova_alignment)
+{
+ unsigned long align_mask = new_iova_alignment - 1;
+ struct iopt_area *area;
+
+ lockdep_assert_held(&iopt->iova_rwsem);
+ lockdep_assert_held(&iopt->domains_rwsem);
+
+ for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+ area = iopt_area_iter_next(area, 0, ULONG_MAX))
+ if ((iopt_area_iova(area) & align_mask) ||
+ (iopt_area_length(area) & align_mask) ||
+ (area->page_offset & align_mask))
+ return -EADDRINUSE;
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
+ struct iommufd_access *access;
+ unsigned long index;
+
+ xa_for_each(&iopt->access_list, index, access)
+ if (WARN_ON(access->iova_alignment >
+ new_iova_alignment))
+ return -EADDRINUSE;
+ }
+ return 0;
+}
+
+int iopt_table_add_domain(struct io_pagetable *iopt,
+ struct iommu_domain *domain)
+{
+ const struct iommu_domain_geometry *geometry = &domain->geometry;
+ struct iommu_domain *iter_domain;
+ unsigned int new_iova_alignment;
+ unsigned long index;
+ int rc;
+
+ down_write(&iopt->domains_rwsem);
+ down_write(&iopt->iova_rwsem);
+
+ xa_for_each(&iopt->domains, index, iter_domain) {
+ if (WARN_ON(iter_domain == domain)) {
+ rc = -EEXIST;
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * The io page size drives the iova_alignment. Internally the iopt_pages
+ * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
+ * objects into the iommu_domain.
+ *
+ * A iommu_domain must always be able to accept PAGE_SIZE to be
+ * compatible as we can't guarantee higher contiguity.
+ */
+ new_iova_alignment = max_t(unsigned long,
+ 1UL << __ffs(domain->pgsize_bitmap),
+ iopt->iova_alignment);
+ if (new_iova_alignment > PAGE_SIZE) {
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+ if (new_iova_alignment != iopt->iova_alignment) {
+ rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
+ if (rc)
+ goto out_unlock;
+ }
+
+ /* No area exists that is outside the allowed domain aperture */
+ if (geometry->aperture_start != 0) {
+ rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
+ domain);
+ if (rc)
+ goto out_reserved;
+ }
+ if (geometry->aperture_end != ULONG_MAX) {
+ rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
+ ULONG_MAX, domain);
+ if (rc)
+ goto out_reserved;
+ }
+
+ rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
+ if (rc)
+ goto out_reserved;
+
+ rc = iopt_fill_domain(iopt, domain);
+ if (rc)
+ goto out_release;
+
+ iopt->iova_alignment = new_iova_alignment;
+ xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
+ iopt->next_domain_id++;
+ up_write(&iopt->iova_rwsem);
+ up_write(&iopt->domains_rwsem);
+ return 0;
+out_release:
+ xa_release(&iopt->domains, iopt->next_domain_id);
+out_reserved:
+ __iopt_remove_reserved_iova(iopt, domain);
+out_unlock:
+ up_write(&iopt->iova_rwsem);
+ up_write(&iopt->domains_rwsem);
+ return rc;
+}
+
+static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
+{
+ unsigned long new_iova_alignment;
+ struct iommufd_access *access;
+ struct iommu_domain *domain;
+ unsigned long index;
+
+ lockdep_assert_held_write(&iopt->iova_rwsem);
+ lockdep_assert_held(&iopt->domains_rwsem);
+
+ /* See batch_iommu_map_small() */
+ if (iopt->disable_large_pages)
+ new_iova_alignment = PAGE_SIZE;
+ else
+ new_iova_alignment = 1;
+
+ xa_for_each(&iopt->domains, index, domain)
+ new_iova_alignment = max_t(unsigned long,
+ 1UL << __ffs(domain->pgsize_bitmap),
+ new_iova_alignment);
+ xa_for_each(&iopt->access_list, index, access)
+ new_iova_alignment = max_t(unsigned long,
+ access->iova_alignment,
+ new_iova_alignment);
+
+ if (new_iova_alignment > iopt->iova_alignment) {
+ int rc;
+
+ rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
+ if (rc)
+ return rc;
+ }
+ iopt->iova_alignment = new_iova_alignment;
+ return 0;
+}
+
+void iopt_table_remove_domain(struct io_pagetable *iopt,
+ struct iommu_domain *domain)
+{
+ struct iommu_domain *iter_domain = NULL;
+ unsigned long index;
+
+ down_write(&iopt->domains_rwsem);
+ down_write(&iopt->iova_rwsem);
+
+ xa_for_each(&iopt->domains, index, iter_domain)
+ if (iter_domain == domain)
+ break;
+ if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
+ goto out_unlock;
+
+ /*
+ * Compress the xarray to keep it linear by swapping the entry to erase
+ * with the tail entry and shrinking the tail.
+ */
+ iopt->next_domain_id--;
+ iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
+ if (index != iopt->next_domain_id)
+ xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
+
+ iopt_unfill_domain(iopt, domain);
+ __iopt_remove_reserved_iova(iopt, domain);
+
+ WARN_ON(iopt_calculate_iova_alignment(iopt));
+out_unlock:
+ up_write(&iopt->iova_rwsem);
+ up_write(&iopt->domains_rwsem);
+}
+
+/**
+ * iopt_area_split - Split an area into two parts at iova
+ * @area: The area to split
+ * @iova: Becomes the last of a new area
+ *
+ * This splits an area into two. It is part of the VFIO compatibility to allow
+ * poking a hole in the mapping. The two areas continue to point at the same
+ * iopt_pages, just with different starting bytes.
+ */
+static int iopt_area_split(struct iopt_area *area, unsigned long iova)
+{
+ unsigned long alignment = area->iopt->iova_alignment;
+ unsigned long last_iova = iopt_area_last_iova(area);
+ unsigned long start_iova = iopt_area_iova(area);
+ unsigned long new_start = iova + 1;
+ struct io_pagetable *iopt = area->iopt;
+ struct iopt_pages *pages = area->pages;
+ struct iopt_area *lhs;
+ struct iopt_area *rhs;
+ int rc;
+
+ lockdep_assert_held_write(&iopt->iova_rwsem);
+
+ if (iova == start_iova || iova == last_iova)
+ return 0;
+
+ if (!pages || area->prevent_access)
+ return -EBUSY;
+
+ if (new_start & (alignment - 1) ||
+ iopt_area_start_byte(area, new_start) & (alignment - 1))
+ return -EINVAL;
+
+ lhs = iopt_area_alloc();
+ if (!lhs)
+ return -ENOMEM;
+
+ rhs = iopt_area_alloc();
+ if (!rhs) {
+ rc = -ENOMEM;
+ goto err_free_lhs;
+ }
+
+ mutex_lock(&pages->mutex);
+ /*
+ * Splitting is not permitted if an access exists, we don't track enough
+ * information to split existing accesses.
+ */
+ if (area->num_accesses) {
+ rc = -EINVAL;
+ goto err_unlock;
+ }
+
+ /*
+ * Splitting is not permitted if a domain could have been mapped with
+ * huge pages.
+ */
+ if (area->storage_domain && !iopt->disable_large_pages) {
+ rc = -EINVAL;
+ goto err_unlock;
+ }
+
+ interval_tree_remove(&area->node, &iopt->area_itree);
+ rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
+ iopt_area_start_byte(area, start_iova),
+ (new_start - 1) - start_iova + 1,
+ area->iommu_prot);
+ if (WARN_ON(rc))
+ goto err_insert;
+
+ rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
+ iopt_area_start_byte(area, new_start),
+ last_iova - new_start + 1, area->iommu_prot);
+ if (WARN_ON(rc))
+ goto err_remove_lhs;
+
+ /*
+ * If the original area has filled a domain, domains_itree has to be
+ * updated.
+ */
+ if (area->storage_domain) {
+ interval_tree_remove(&area->pages_node, &pages->domains_itree);
+ interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
+ interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
+ }
+
+ lhs->storage_domain = area->storage_domain;
+ lhs->pages = area->pages;
+ rhs->storage_domain = area->storage_domain;
+ rhs->pages = area->pages;
+ kref_get(&rhs->pages->kref);
+ kfree(area);
+ mutex_unlock(&pages->mutex);
+
+ /*
+ * No change to domains or accesses because the pages hasn't been
+ * changed
+ */
+ return 0;
+
+err_remove_lhs:
+ interval_tree_remove(&lhs->node, &iopt->area_itree);
+err_insert:
+ interval_tree_insert(&area->node, &iopt->area_itree);
+err_unlock:
+ mutex_unlock(&pages->mutex);
+ kfree(rhs);
+err_free_lhs:
+ kfree(lhs);
+ return rc;
+}
+
+int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
+ size_t num_iovas)
+{
+ int rc = 0;
+ int i;
+
+ down_write(&iopt->iova_rwsem);
+ for (i = 0; i < num_iovas; i++) {
+ struct iopt_area *area;
+
+ area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
+ if (!area)
+ continue;
+ rc = iopt_area_split(area, iovas[i]);
+ if (rc)
+ break;
+ }
+ up_write(&iopt->iova_rwsem);
+ return rc;
+}
+
+void iopt_enable_large_pages(struct io_pagetable *iopt)
+{
+ int rc;
+
+ down_write(&iopt->domains_rwsem);
+ down_write(&iopt->iova_rwsem);
+ WRITE_ONCE(iopt->disable_large_pages, false);
+ rc = iopt_calculate_iova_alignment(iopt);
+ WARN_ON(rc);
+ up_write(&iopt->iova_rwsem);
+ up_write(&iopt->domains_rwsem);
+}
+
+int iopt_disable_large_pages(struct io_pagetable *iopt)
+{
+ int rc = 0;
+
+ down_write(&iopt->domains_rwsem);
+ down_write(&iopt->iova_rwsem);
+ if (iopt->disable_large_pages)
+ goto out_unlock;
+
+ /* Won't do it if domains already have pages mapped in them */
+ if (!xa_empty(&iopt->domains) &&
+ !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+
+ WRITE_ONCE(iopt->disable_large_pages, true);
+ rc = iopt_calculate_iova_alignment(iopt);
+ if (rc)
+ WRITE_ONCE(iopt->disable_large_pages, false);
+out_unlock:
+ up_write(&iopt->iova_rwsem);
+ up_write(&iopt->domains_rwsem);
+ return rc;
+}
+
+int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
+{
+ int rc;
+
+ down_write(&iopt->domains_rwsem);
+ down_write(&iopt->iova_rwsem);
+ rc = xa_alloc(&iopt->access_list, &access->iopt_access_list_id, access,
+ xa_limit_16b, GFP_KERNEL_ACCOUNT);
+ if (rc)
+ goto out_unlock;
+
+ rc = iopt_calculate_iova_alignment(iopt);
+ if (rc) {
+ xa_erase(&iopt->access_list, access->iopt_access_list_id);
+ goto out_unlock;
+ }
+
+out_unlock:
+ up_write(&iopt->iova_rwsem);
+ up_write(&iopt->domains_rwsem);
+ return rc;
+}
+
+void iopt_remove_access(struct io_pagetable *iopt,
+ struct iommufd_access *access,
+ u32 iopt_access_list_id)
+{
+ down_write(&iopt->domains_rwsem);
+ down_write(&iopt->iova_rwsem);
+ WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
+ WARN_ON(iopt_calculate_iova_alignment(iopt));
+ up_write(&iopt->iova_rwsem);
+ up_write(&iopt->domains_rwsem);
+}
+
+/* Narrow the valid_iova_itree to include reserved ranges from a device. */
+int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
+ struct device *dev,
+ phys_addr_t *sw_msi_start)
+{
+ struct iommu_resv_region *resv;
+ LIST_HEAD(resv_regions);
+ unsigned int num_hw_msi = 0;
+ unsigned int num_sw_msi = 0;
+ int rc;
+
+ if (iommufd_should_fail())
+ return -EINVAL;
+
+ down_write(&iopt->iova_rwsem);
+ /* FIXME: drivers allocate memory but there is no failure propogated */
+ iommu_get_resv_regions(dev, &resv_regions);
+
+ list_for_each_entry(resv, &resv_regions, list) {
+ if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
+ continue;
+
+ if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
+ num_hw_msi++;
+ if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
+ *sw_msi_start = resv->start;
+ num_sw_msi++;
+ }
+
+ rc = iopt_reserve_iova(iopt, resv->start,
+ resv->length - 1 + resv->start, dev);
+ if (rc)
+ goto out_reserved;
+ }
+
+ /* Drivers must offer sane combinations of regions */
+ if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
+ rc = -EINVAL;
+ goto out_reserved;
+ }
+
+ rc = 0;
+ goto out_free_resv;
+
+out_reserved:
+ __iopt_remove_reserved_iova(iopt, dev);
+out_free_resv:
+ iommu_put_resv_regions(dev, &resv_regions);
+ up_write(&iopt->iova_rwsem);
+ return rc;
+}
diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
new file mode 100644
index 0000000000..0ec3509b7e
--- /dev/null
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -0,0 +1,241 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
+ *
+ */
+#ifndef __IO_PAGETABLE_H
+#define __IO_PAGETABLE_H
+
+#include <linux/interval_tree.h>
+#include <linux/mutex.h>
+#include <linux/kref.h>
+#include <linux/xarray.h>
+
+#include "iommufd_private.h"
+
+struct iommu_domain;
+
+/*
+ * Each io_pagetable is composed of intervals of areas which cover regions of
+ * the iova that are backed by something. iova not covered by areas is not
+ * populated in the page table. Each area is fully populated with pages.
+ *
+ * iovas are in byte units, but must be iopt->iova_alignment aligned.
+ *
+ * pages can be NULL, this means some other thread is still working on setting
+ * up or tearing down the area. When observed under the write side of the
+ * domain_rwsem a NULL pages must mean the area is still being setup and no
+ * domains are filled.
+ *
+ * storage_domain points at an arbitrary iommu_domain that is holding the PFNs
+ * for this area. It is locked by the pages->mutex. This simplifies the locking
+ * as the pages code can rely on the storage_domain without having to get the
+ * iopt->domains_rwsem.
+ *
+ * The io_pagetable::iova_rwsem protects node
+ * The iopt_pages::mutex protects pages_node
+ * iopt and iommu_prot are immutable
+ * The pages::mutex protects num_accesses
+ */
+struct iopt_area {
+ struct interval_tree_node node;
+ struct interval_tree_node pages_node;
+ struct io_pagetable *iopt;
+ struct iopt_pages *pages;
+ struct iommu_domain *storage_domain;
+ /* How many bytes into the first page the area starts */
+ unsigned int page_offset;
+ /* IOMMU_READ, IOMMU_WRITE, etc */
+ int iommu_prot;
+ bool prevent_access : 1;
+ unsigned int num_accesses;
+};
+
+struct iopt_allowed {
+ struct interval_tree_node node;
+};
+
+struct iopt_reserved {
+ struct interval_tree_node node;
+ void *owner;
+};
+
+int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages);
+void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages);
+
+int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain);
+void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages,
+ struct iommu_domain *domain);
+void iopt_area_unmap_domain(struct iopt_area *area,
+ struct iommu_domain *domain);
+
+static inline unsigned long iopt_area_index(struct iopt_area *area)
+{
+ return area->pages_node.start;
+}
+
+static inline unsigned long iopt_area_last_index(struct iopt_area *area)
+{
+ return area->pages_node.last;
+}
+
+static inline unsigned long iopt_area_iova(struct iopt_area *area)
+{
+ return area->node.start;
+}
+
+static inline unsigned long iopt_area_last_iova(struct iopt_area *area)
+{
+ return area->node.last;
+}
+
+static inline size_t iopt_area_length(struct iopt_area *area)
+{
+ return (area->node.last - area->node.start) + 1;
+}
+
+/*
+ * Number of bytes from the start of the iopt_pages that the iova begins.
+ * iopt_area_start_byte() / PAGE_SIZE encodes the starting page index
+ * iopt_area_start_byte() % PAGE_SIZE encodes the offset within that page
+ */
+static inline unsigned long iopt_area_start_byte(struct iopt_area *area,
+ unsigned long iova)
+{
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(iova < iopt_area_iova(area) ||
+ iova > iopt_area_last_iova(area));
+ return (iova - iopt_area_iova(area)) + area->page_offset +
+ iopt_area_index(area) * PAGE_SIZE;
+}
+
+static inline unsigned long iopt_area_iova_to_index(struct iopt_area *area,
+ unsigned long iova)
+{
+ return iopt_area_start_byte(area, iova) / PAGE_SIZE;
+}
+
+#define __make_iopt_iter(name) \
+ static inline struct iopt_##name *iopt_##name##_iter_first( \
+ struct io_pagetable *iopt, unsigned long start, \
+ unsigned long last) \
+ { \
+ struct interval_tree_node *node; \
+ \
+ lockdep_assert_held(&iopt->iova_rwsem); \
+ node = interval_tree_iter_first(&iopt->name##_itree, start, \
+ last); \
+ if (!node) \
+ return NULL; \
+ return container_of(node, struct iopt_##name, node); \
+ } \
+ static inline struct iopt_##name *iopt_##name##_iter_next( \
+ struct iopt_##name *last_node, unsigned long start, \
+ unsigned long last) \
+ { \
+ struct interval_tree_node *node; \
+ \
+ node = interval_tree_iter_next(&last_node->node, start, last); \
+ if (!node) \
+ return NULL; \
+ return container_of(node, struct iopt_##name, node); \
+ }
+
+__make_iopt_iter(area)
+__make_iopt_iter(allowed)
+__make_iopt_iter(reserved)
+
+struct iopt_area_contig_iter {
+ unsigned long cur_iova;
+ unsigned long last_iova;
+ struct iopt_area *area;
+};
+struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
+ struct io_pagetable *iopt,
+ unsigned long iova,
+ unsigned long last_iova);
+struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter);
+
+static inline bool iopt_area_contig_done(struct iopt_area_contig_iter *iter)
+{
+ return iter->area && iter->last_iova <= iopt_area_last_iova(iter->area);
+}
+
+/*
+ * Iterate over a contiguous list of areas that span the iova,last_iova range.
+ * The caller must check iopt_area_contig_done() after the loop to see if
+ * contiguous areas existed.
+ */
+#define iopt_for_each_contig_area(iter, area, iopt, iova, last_iova) \
+ for (area = iopt_area_contig_init(iter, iopt, iova, last_iova); area; \
+ area = iopt_area_contig_next(iter))
+
+enum {
+ IOPT_PAGES_ACCOUNT_NONE = 0,
+ IOPT_PAGES_ACCOUNT_USER = 1,
+ IOPT_PAGES_ACCOUNT_MM = 2,
+};
+
+/*
+ * This holds a pinned page list for multiple areas of IO address space. The
+ * pages always originate from a linear chunk of userspace VA. Multiple
+ * io_pagetable's, through their iopt_area's, can share a single iopt_pages
+ * which avoids multi-pinning and double accounting of page consumption.
+ *
+ * indexes in this structure are measured in PAGE_SIZE units, are 0 based from
+ * the start of the uptr and extend to npages. pages are pinned dynamically
+ * according to the intervals in the access_itree and domains_itree, npinned
+ * records the current number of pages pinned.
+ */
+struct iopt_pages {
+ struct kref kref;
+ struct mutex mutex;
+ size_t npages;
+ size_t npinned;
+ size_t last_npinned;
+ struct task_struct *source_task;
+ struct mm_struct *source_mm;
+ struct user_struct *source_user;
+ void __user *uptr;
+ bool writable:1;
+ u8 account_mode;
+
+ struct xarray pinned_pfns;
+ /* Of iopt_pages_access::node */
+ struct rb_root_cached access_itree;
+ /* Of iopt_area::pages_node */
+ struct rb_root_cached domains_itree;
+};
+
+struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
+ bool writable);
+void iopt_release_pages(struct kref *kref);
+static inline void iopt_put_pages(struct iopt_pages *pages)
+{
+ kref_put(&pages->kref, iopt_release_pages);
+}
+
+void iopt_pages_fill_from_xarray(struct iopt_pages *pages, unsigned long start,
+ unsigned long last, struct page **out_pages);
+int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start,
+ unsigned long last, struct page **out_pages);
+void iopt_pages_unfill_xarray(struct iopt_pages *pages, unsigned long start,
+ unsigned long last);
+
+int iopt_area_add_access(struct iopt_area *area, unsigned long start,
+ unsigned long last, struct page **out_pages,
+ unsigned int flags);
+void iopt_area_remove_access(struct iopt_area *area, unsigned long start,
+ unsigned long last);
+int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte,
+ void *data, unsigned long length, unsigned int flags);
+
+/*
+ * Each interval represents an active iopt_access_pages(), it acts as an
+ * interval lock that keeps the PFNs pinned and stored in the xarray.
+ */
+struct iopt_pages_access {
+ struct interval_tree_node node;
+ unsigned int users;
+};
+
+#endif
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
new file mode 100644
index 0000000000..d5624577f7
--- /dev/null
+++ b/drivers/iommu/iommufd/ioas.c
@@ -0,0 +1,398 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ */
+#include <linux/interval_tree.h>
+#include <linux/iommufd.h>
+#include <linux/iommu.h>
+#include <uapi/linux/iommufd.h>
+
+#include "io_pagetable.h"
+
+void iommufd_ioas_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_ioas *ioas = container_of(obj, struct iommufd_ioas, obj);
+ int rc;
+
+ rc = iopt_unmap_all(&ioas->iopt, NULL);
+ WARN_ON(rc && rc != -ENOENT);
+ iopt_destroy_table(&ioas->iopt);
+ mutex_destroy(&ioas->mutex);
+}
+
+struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx)
+{
+ struct iommufd_ioas *ioas;
+
+ ioas = iommufd_object_alloc(ictx, ioas, IOMMUFD_OBJ_IOAS);
+ if (IS_ERR(ioas))
+ return ioas;
+
+ iopt_init_table(&ioas->iopt);
+ INIT_LIST_HEAD(&ioas->hwpt_list);
+ mutex_init(&ioas->mutex);
+ return ioas;
+}
+
+int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_alloc *cmd = ucmd->cmd;
+ struct iommufd_ioas *ioas;
+ int rc;
+
+ if (cmd->flags)
+ return -EOPNOTSUPP;
+
+ ioas = iommufd_ioas_alloc(ucmd->ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ cmd->out_ioas_id = ioas->obj.id;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_table;
+ iommufd_object_finalize(ucmd->ictx, &ioas->obj);
+ return 0;
+
+out_table:
+ iommufd_object_abort_and_destroy(ucmd->ictx, &ioas->obj);
+ return rc;
+}
+
+int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_iova_range __user *ranges;
+ struct iommu_ioas_iova_ranges *cmd = ucmd->cmd;
+ struct iommufd_ioas *ioas;
+ struct interval_tree_span_iter span;
+ u32 max_iovas;
+ int rc;
+
+ if (cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ down_read(&ioas->iopt.iova_rwsem);
+ max_iovas = cmd->num_iovas;
+ ranges = u64_to_user_ptr(cmd->allowed_iovas);
+ cmd->num_iovas = 0;
+ cmd->out_iova_alignment = ioas->iopt.iova_alignment;
+ interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
+ ULONG_MAX) {
+ if (!span.is_hole)
+ continue;
+ if (cmd->num_iovas < max_iovas) {
+ struct iommu_iova_range elm = {
+ .start = span.start_hole,
+ .last = span.last_hole,
+ };
+
+ if (copy_to_user(&ranges[cmd->num_iovas], &elm,
+ sizeof(elm))) {
+ rc = -EFAULT;
+ goto out_put;
+ }
+ }
+ cmd->num_iovas++;
+ }
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_put;
+ if (cmd->num_iovas > max_iovas)
+ rc = -EMSGSIZE;
+out_put:
+ up_read(&ioas->iopt.iova_rwsem);
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+static int iommufd_ioas_load_iovas(struct rb_root_cached *itree,
+ struct iommu_iova_range __user *ranges,
+ u32 num)
+{
+ u32 i;
+
+ for (i = 0; i != num; i++) {
+ struct iommu_iova_range range;
+ struct iopt_allowed *allowed;
+
+ if (copy_from_user(&range, ranges + i, sizeof(range)))
+ return -EFAULT;
+
+ if (range.start >= range.last)
+ return -EINVAL;
+
+ if (interval_tree_iter_first(itree, range.start, range.last))
+ return -EINVAL;
+
+ allowed = kzalloc(sizeof(*allowed), GFP_KERNEL_ACCOUNT);
+ if (!allowed)
+ return -ENOMEM;
+ allowed->node.start = range.start;
+ allowed->node.last = range.last;
+
+ interval_tree_insert(&allowed->node, itree);
+ }
+ return 0;
+}
+
+int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_allow_iovas *cmd = ucmd->cmd;
+ struct rb_root_cached allowed_iova = RB_ROOT_CACHED;
+ struct interval_tree_node *node;
+ struct iommufd_ioas *ioas;
+ struct io_pagetable *iopt;
+ int rc = 0;
+
+ if (cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+ iopt = &ioas->iopt;
+
+ rc = iommufd_ioas_load_iovas(&allowed_iova,
+ u64_to_user_ptr(cmd->allowed_iovas),
+ cmd->num_iovas);
+ if (rc)
+ goto out_free;
+
+ /*
+ * We want the allowed tree update to be atomic, so we have to keep the
+ * original nodes around, and keep track of the new nodes as we allocate
+ * memory for them. The simplest solution is to have a new/old tree and
+ * then swap new for old. On success we free the old tree, on failure we
+ * free the new tree.
+ */
+ rc = iopt_set_allow_iova(iopt, &allowed_iova);
+out_free:
+ while ((node = interval_tree_iter_first(&allowed_iova, 0, ULONG_MAX))) {
+ interval_tree_remove(node, &allowed_iova);
+ kfree(container_of(node, struct iopt_allowed, node));
+ }
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+static int conv_iommu_prot(u32 map_flags)
+{
+ /*
+ * We provide no manual cache coherency ioctls to userspace and most
+ * architectures make the CPU ops for cache flushing privileged.
+ * Therefore we require the underlying IOMMU to support CPU coherent
+ * operation. Support for IOMMU_CACHE is enforced by the
+ * IOMMU_CAP_CACHE_COHERENCY test during bind.
+ */
+ int iommu_prot = IOMMU_CACHE;
+
+ if (map_flags & IOMMU_IOAS_MAP_WRITEABLE)
+ iommu_prot |= IOMMU_WRITE;
+ if (map_flags & IOMMU_IOAS_MAP_READABLE)
+ iommu_prot |= IOMMU_READ;
+ return iommu_prot;
+}
+
+int iommufd_ioas_map(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_map *cmd = ucmd->cmd;
+ unsigned long iova = cmd->iova;
+ struct iommufd_ioas *ioas;
+ unsigned int flags = 0;
+ int rc;
+
+ if ((cmd->flags &
+ ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
+ IOMMU_IOAS_MAP_READABLE)) ||
+ cmd->__reserved)
+ return -EOPNOTSUPP;
+ if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX)
+ return -EOVERFLOW;
+
+ ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
+ flags = IOPT_ALLOC_IOVA;
+ rc = iopt_map_user_pages(ucmd->ictx, &ioas->iopt, &iova,
+ u64_to_user_ptr(cmd->user_va), cmd->length,
+ conv_iommu_prot(cmd->flags), flags);
+ if (rc)
+ goto out_put;
+
+ cmd->iova = iova;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+out_put:
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+int iommufd_ioas_copy(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_copy *cmd = ucmd->cmd;
+ struct iommufd_ioas *src_ioas;
+ struct iommufd_ioas *dst_ioas;
+ unsigned int flags = 0;
+ LIST_HEAD(pages_list);
+ unsigned long iova;
+ int rc;
+
+ iommufd_test_syz_conv_iova_id(ucmd, cmd->src_ioas_id, &cmd->src_iova,
+ &cmd->flags);
+
+ if ((cmd->flags &
+ ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
+ IOMMU_IOAS_MAP_READABLE)))
+ return -EOPNOTSUPP;
+ if (cmd->length >= ULONG_MAX || cmd->src_iova >= ULONG_MAX ||
+ cmd->dst_iova >= ULONG_MAX)
+ return -EOVERFLOW;
+
+ src_ioas = iommufd_get_ioas(ucmd->ictx, cmd->src_ioas_id);
+ if (IS_ERR(src_ioas))
+ return PTR_ERR(src_ioas);
+ rc = iopt_get_pages(&src_ioas->iopt, cmd->src_iova, cmd->length,
+ &pages_list);
+ iommufd_put_object(&src_ioas->obj);
+ if (rc)
+ return rc;
+
+ dst_ioas = iommufd_get_ioas(ucmd->ictx, cmd->dst_ioas_id);
+ if (IS_ERR(dst_ioas)) {
+ rc = PTR_ERR(dst_ioas);
+ goto out_pages;
+ }
+
+ if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
+ flags = IOPT_ALLOC_IOVA;
+ iova = cmd->dst_iova;
+ rc = iopt_map_pages(&dst_ioas->iopt, &pages_list, cmd->length, &iova,
+ conv_iommu_prot(cmd->flags), flags);
+ if (rc)
+ goto out_put_dst;
+
+ cmd->dst_iova = iova;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+out_put_dst:
+ iommufd_put_object(&dst_ioas->obj);
+out_pages:
+ iopt_free_pages_list(&pages_list);
+ return rc;
+}
+
+int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_unmap *cmd = ucmd->cmd;
+ struct iommufd_ioas *ioas;
+ unsigned long unmapped = 0;
+ int rc;
+
+ ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ if (cmd->iova == 0 && cmd->length == U64_MAX) {
+ rc = iopt_unmap_all(&ioas->iopt, &unmapped);
+ if (rc)
+ goto out_put;
+ } else {
+ if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) {
+ rc = -EOVERFLOW;
+ goto out_put;
+ }
+ rc = iopt_unmap_iova(&ioas->iopt, cmd->iova, cmd->length,
+ &unmapped);
+ if (rc)
+ goto out_put;
+ }
+
+ cmd->length = unmapped;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+
+out_put:
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+int iommufd_option_rlimit_mode(struct iommu_option *cmd,
+ struct iommufd_ctx *ictx)
+{
+ if (cmd->object_id)
+ return -EOPNOTSUPP;
+
+ if (cmd->op == IOMMU_OPTION_OP_GET) {
+ cmd->val64 = ictx->account_mode == IOPT_PAGES_ACCOUNT_MM;
+ return 0;
+ }
+ if (cmd->op == IOMMU_OPTION_OP_SET) {
+ int rc = 0;
+
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ xa_lock(&ictx->objects);
+ if (!xa_empty(&ictx->objects)) {
+ rc = -EBUSY;
+ } else {
+ if (cmd->val64 == 0)
+ ictx->account_mode = IOPT_PAGES_ACCOUNT_USER;
+ else if (cmd->val64 == 1)
+ ictx->account_mode = IOPT_PAGES_ACCOUNT_MM;
+ else
+ rc = -EINVAL;
+ }
+ xa_unlock(&ictx->objects);
+
+ return rc;
+ }
+ return -EOPNOTSUPP;
+}
+
+static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd,
+ struct iommufd_ioas *ioas)
+{
+ if (cmd->op == IOMMU_OPTION_OP_GET) {
+ cmd->val64 = !ioas->iopt.disable_large_pages;
+ return 0;
+ }
+ if (cmd->op == IOMMU_OPTION_OP_SET) {
+ if (cmd->val64 == 0)
+ return iopt_disable_large_pages(&ioas->iopt);
+ if (cmd->val64 == 1) {
+ iopt_enable_large_pages(&ioas->iopt);
+ return 0;
+ }
+ return -EINVAL;
+ }
+ return -EOPNOTSUPP;
+}
+
+int iommufd_ioas_option(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_option *cmd = ucmd->cmd;
+ struct iommufd_ioas *ioas;
+ int rc = 0;
+
+ if (cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ ioas = iommufd_get_ioas(ucmd->ictx, cmd->object_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ switch (cmd->option_id) {
+ case IOMMU_OPTION_HUGE_PAGES:
+ rc = iommufd_ioas_option_huge_pages(cmd, ioas);
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ }
+
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
new file mode 100644
index 0000000000..2c58670011
--- /dev/null
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -0,0 +1,351 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ */
+#ifndef __IOMMUFD_PRIVATE_H
+#define __IOMMUFD_PRIVATE_H
+
+#include <linux/rwsem.h>
+#include <linux/xarray.h>
+#include <linux/refcount.h>
+#include <linux/uaccess.h>
+
+struct iommu_domain;
+struct iommu_group;
+struct iommu_option;
+struct iommufd_device;
+
+struct iommufd_ctx {
+ struct file *file;
+ struct xarray objects;
+ struct xarray groups;
+
+ u8 account_mode;
+ /* Compatibility with VFIO no iommu */
+ u8 no_iommu_mode;
+ struct iommufd_ioas *vfio_ioas;
+};
+
+/*
+ * The IOVA to PFN map. The map automatically copies the PFNs into multiple
+ * domains and permits sharing of PFNs between io_pagetable instances. This
+ * supports both a design where IOAS's are 1:1 with a domain (eg because the
+ * domain is HW customized), or where the IOAS is 1:N with multiple generic
+ * domains. The io_pagetable holds an interval tree of iopt_areas which point
+ * to shared iopt_pages which hold the pfns mapped to the page table.
+ *
+ * The locking order is domains_rwsem -> iova_rwsem -> pages::mutex
+ */
+struct io_pagetable {
+ struct rw_semaphore domains_rwsem;
+ struct xarray domains;
+ struct xarray access_list;
+ unsigned int next_domain_id;
+
+ struct rw_semaphore iova_rwsem;
+ struct rb_root_cached area_itree;
+ /* IOVA that cannot become reserved, struct iopt_allowed */
+ struct rb_root_cached allowed_itree;
+ /* IOVA that cannot be allocated, struct iopt_reserved */
+ struct rb_root_cached reserved_itree;
+ u8 disable_large_pages;
+ unsigned long iova_alignment;
+};
+
+void iopt_init_table(struct io_pagetable *iopt);
+void iopt_destroy_table(struct io_pagetable *iopt);
+int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
+ unsigned long length, struct list_head *pages_list);
+void iopt_free_pages_list(struct list_head *pages_list);
+enum {
+ IOPT_ALLOC_IOVA = 1 << 0,
+};
+int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
+ unsigned long *iova, void __user *uptr,
+ unsigned long length, int iommu_prot,
+ unsigned int flags);
+int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
+ unsigned long length, unsigned long *dst_iova,
+ int iommu_prot, unsigned int flags);
+int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
+ unsigned long length, unsigned long *unmapped);
+int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped);
+
+void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
+ unsigned long length);
+int iopt_table_add_domain(struct io_pagetable *iopt,
+ struct iommu_domain *domain);
+void iopt_table_remove_domain(struct io_pagetable *iopt,
+ struct iommu_domain *domain);
+int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
+ struct device *dev,
+ phys_addr_t *sw_msi_start);
+int iopt_set_allow_iova(struct io_pagetable *iopt,
+ struct rb_root_cached *allowed_iova);
+int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
+ unsigned long last, void *owner);
+void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner);
+int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
+ size_t num_iovas);
+void iopt_enable_large_pages(struct io_pagetable *iopt);
+int iopt_disable_large_pages(struct io_pagetable *iopt);
+
+struct iommufd_ucmd {
+ struct iommufd_ctx *ictx;
+ void __user *ubuffer;
+ u32 user_size;
+ void *cmd;
+};
+
+int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
+ unsigned long arg);
+
+/* Copy the response in ucmd->cmd back to userspace. */
+static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd,
+ size_t cmd_len)
+{
+ if (copy_to_user(ucmd->ubuffer, ucmd->cmd,
+ min_t(size_t, ucmd->user_size, cmd_len)))
+ return -EFAULT;
+ return 0;
+}
+
+enum iommufd_object_type {
+ IOMMUFD_OBJ_NONE,
+ IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
+ IOMMUFD_OBJ_DEVICE,
+ IOMMUFD_OBJ_HW_PAGETABLE,
+ IOMMUFD_OBJ_IOAS,
+ IOMMUFD_OBJ_ACCESS,
+#ifdef CONFIG_IOMMUFD_TEST
+ IOMMUFD_OBJ_SELFTEST,
+#endif
+ IOMMUFD_OBJ_MAX,
+};
+
+/* Base struct for all objects with a userspace ID handle. */
+struct iommufd_object {
+ struct rw_semaphore destroy_rwsem;
+ refcount_t users;
+ enum iommufd_object_type type;
+ unsigned int id;
+};
+
+static inline bool iommufd_lock_obj(struct iommufd_object *obj)
+{
+ if (!down_read_trylock(&obj->destroy_rwsem))
+ return false;
+ if (!refcount_inc_not_zero(&obj->users)) {
+ up_read(&obj->destroy_rwsem);
+ return false;
+ }
+ return true;
+}
+
+struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id,
+ enum iommufd_object_type type);
+static inline void iommufd_put_object(struct iommufd_object *obj)
+{
+ refcount_dec(&obj->users);
+ up_read(&obj->destroy_rwsem);
+}
+
+void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj);
+void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx,
+ struct iommufd_object *obj);
+void iommufd_object_finalize(struct iommufd_ctx *ictx,
+ struct iommufd_object *obj);
+void __iommufd_object_destroy_user(struct iommufd_ctx *ictx,
+ struct iommufd_object *obj, bool allow_fail);
+static inline void iommufd_object_destroy_user(struct iommufd_ctx *ictx,
+ struct iommufd_object *obj)
+{
+ __iommufd_object_destroy_user(ictx, obj, false);
+}
+static inline void iommufd_object_deref_user(struct iommufd_ctx *ictx,
+ struct iommufd_object *obj)
+{
+ __iommufd_object_destroy_user(ictx, obj, true);
+}
+
+struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
+ size_t size,
+ enum iommufd_object_type type);
+
+#define iommufd_object_alloc(ictx, ptr, type) \
+ container_of(_iommufd_object_alloc( \
+ ictx, \
+ sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \
+ offsetof(typeof(*(ptr)), \
+ obj) != 0), \
+ type), \
+ typeof(*(ptr)), obj)
+
+/*
+ * The IO Address Space (IOAS) pagetable is a virtual page table backed by the
+ * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The
+ * mapping is copied into all of the associated domains and made available to
+ * in-kernel users.
+ *
+ * Every iommu_domain that is created is wrapped in a iommufd_hw_pagetable
+ * object. When we go to attach a device to an IOAS we need to get an
+ * iommu_domain and wrapping iommufd_hw_pagetable for it.
+ *
+ * An iommu_domain & iommfd_hw_pagetable will be automatically selected
+ * for a device based on the hwpt_list. If no suitable iommu_domain
+ * is found a new iommu_domain will be created.
+ */
+struct iommufd_ioas {
+ struct iommufd_object obj;
+ struct io_pagetable iopt;
+ struct mutex mutex;
+ struct list_head hwpt_list;
+};
+
+static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ctx *ictx,
+ u32 id)
+{
+ return container_of(iommufd_get_object(ictx, id,
+ IOMMUFD_OBJ_IOAS),
+ struct iommufd_ioas, obj);
+}
+
+struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx);
+int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_ioas_destroy(struct iommufd_object *obj);
+int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_map(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_copy(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
+int iommufd_option_rlimit_mode(struct iommu_option *cmd,
+ struct iommufd_ctx *ictx);
+
+int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
+
+/*
+ * A HW pagetable is called an iommu_domain inside the kernel. This user object
+ * allows directly creating and inspecting the domains. Domains that have kernel
+ * owned page tables will be associated with an iommufd_ioas that provides the
+ * IOVA to PFN map.
+ */
+struct iommufd_hw_pagetable {
+ struct iommufd_object obj;
+ struct iommufd_ioas *ioas;
+ struct iommu_domain *domain;
+ bool auto_domain : 1;
+ bool enforce_cache_coherency : 1;
+ bool msi_cookie : 1;
+ /* Head at iommufd_ioas::hwpt_list */
+ struct list_head hwpt_item;
+};
+
+struct iommufd_hw_pagetable *
+iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
+ struct iommufd_device *idev, bool immediate_attach);
+int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt);
+int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_device *idev);
+struct iommufd_hw_pagetable *
+iommufd_hw_pagetable_detach(struct iommufd_device *idev);
+void iommufd_hw_pagetable_destroy(struct iommufd_object *obj);
+void iommufd_hw_pagetable_abort(struct iommufd_object *obj);
+int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd);
+
+static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx,
+ struct iommufd_hw_pagetable *hwpt)
+{
+ lockdep_assert_not_held(&hwpt->ioas->mutex);
+ if (hwpt->auto_domain)
+ iommufd_object_deref_user(ictx, &hwpt->obj);
+ else
+ refcount_dec(&hwpt->obj.users);
+}
+
+struct iommufd_group {
+ struct kref ref;
+ struct mutex lock;
+ struct iommufd_ctx *ictx;
+ struct iommu_group *group;
+ struct iommufd_hw_pagetable *hwpt;
+ struct list_head device_list;
+ phys_addr_t sw_msi_start;
+};
+
+/*
+ * A iommufd_device object represents the binding relationship between a
+ * consuming driver and the iommufd. These objects are created/destroyed by
+ * external drivers, not by userspace.
+ */
+struct iommufd_device {
+ struct iommufd_object obj;
+ struct iommufd_ctx *ictx;
+ struct iommufd_group *igroup;
+ struct list_head group_item;
+ /* always the physical device */
+ struct device *dev;
+ bool enforce_cache_coherency;
+};
+
+static inline struct iommufd_device *
+iommufd_get_device(struct iommufd_ucmd *ucmd, u32 id)
+{
+ return container_of(iommufd_get_object(ucmd->ictx, id,
+ IOMMUFD_OBJ_DEVICE),
+ struct iommufd_device, obj);
+}
+
+void iommufd_device_destroy(struct iommufd_object *obj);
+int iommufd_get_hw_info(struct iommufd_ucmd *ucmd);
+
+struct iommufd_access {
+ struct iommufd_object obj;
+ struct iommufd_ctx *ictx;
+ struct iommufd_ioas *ioas;
+ struct iommufd_ioas *ioas_unpin;
+ struct mutex ioas_lock;
+ const struct iommufd_access_ops *ops;
+ void *data;
+ unsigned long iova_alignment;
+ u32 iopt_access_list_id;
+};
+
+int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access);
+void iopt_remove_access(struct io_pagetable *iopt,
+ struct iommufd_access *access,
+ u32 iopt_access_list_id);
+void iommufd_access_destroy_object(struct iommufd_object *obj);
+
+#ifdef CONFIG_IOMMUFD_TEST
+int iommufd_test(struct iommufd_ucmd *ucmd);
+void iommufd_selftest_destroy(struct iommufd_object *obj);
+extern size_t iommufd_test_memory_limit;
+void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
+ unsigned int ioas_id, u64 *iova, u32 *flags);
+bool iommufd_should_fail(void);
+int __init iommufd_test_init(void);
+void iommufd_test_exit(void);
+bool iommufd_selftest_is_mock_dev(struct device *dev);
+#else
+static inline void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
+ unsigned int ioas_id,
+ u64 *iova, u32 *flags)
+{
+}
+static inline bool iommufd_should_fail(void)
+{
+ return false;
+}
+static inline int __init iommufd_test_init(void)
+{
+ return 0;
+}
+static inline void iommufd_test_exit(void)
+{
+}
+static inline bool iommufd_selftest_is_mock_dev(struct device *dev)
+{
+ return false;
+}
+#endif
+#endif
diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h
new file mode 100644
index 0000000000..3f3644375b
--- /dev/null
+++ b/drivers/iommu/iommufd/iommufd_test.h
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
+ */
+#ifndef _UAPI_IOMMUFD_TEST_H
+#define _UAPI_IOMMUFD_TEST_H
+
+#include <linux/types.h>
+#include <linux/iommufd.h>
+
+enum {
+ IOMMU_TEST_OP_ADD_RESERVED = 1,
+ IOMMU_TEST_OP_MOCK_DOMAIN,
+ IOMMU_TEST_OP_MD_CHECK_MAP,
+ IOMMU_TEST_OP_MD_CHECK_REFS,
+ IOMMU_TEST_OP_CREATE_ACCESS,
+ IOMMU_TEST_OP_DESTROY_ACCESS_PAGES,
+ IOMMU_TEST_OP_ACCESS_PAGES,
+ IOMMU_TEST_OP_ACCESS_RW,
+ IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT,
+ IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE,
+ IOMMU_TEST_OP_ACCESS_REPLACE_IOAS,
+};
+
+enum {
+ MOCK_APERTURE_START = 1UL << 24,
+ MOCK_APERTURE_LAST = (1UL << 31) - 1,
+};
+
+enum {
+ MOCK_FLAGS_ACCESS_WRITE = 1 << 0,
+ MOCK_FLAGS_ACCESS_SYZ = 1 << 16,
+};
+
+enum {
+ MOCK_ACCESS_RW_WRITE = 1 << 0,
+ MOCK_ACCESS_RW_SLOW_PATH = 1 << 2,
+};
+
+enum {
+ MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES = 1 << 0,
+};
+
+struct iommu_test_cmd {
+ __u32 size;
+ __u32 op;
+ __u32 id;
+ __u32 __reserved;
+ union {
+ struct {
+ __aligned_u64 start;
+ __aligned_u64 length;
+ } add_reserved;
+ struct {
+ __u32 out_stdev_id;
+ __u32 out_hwpt_id;
+ /* out_idev_id is the standard iommufd_bind object */
+ __u32 out_idev_id;
+ } mock_domain;
+ struct {
+ __u32 pt_id;
+ } mock_domain_replace;
+ struct {
+ __aligned_u64 iova;
+ __aligned_u64 length;
+ __aligned_u64 uptr;
+ } check_map;
+ struct {
+ __aligned_u64 length;
+ __aligned_u64 uptr;
+ __u32 refs;
+ } check_refs;
+ struct {
+ __u32 out_access_fd;
+ __u32 flags;
+ } create_access;
+ struct {
+ __u32 access_pages_id;
+ } destroy_access_pages;
+ struct {
+ __u32 flags;
+ __u32 out_access_pages_id;
+ __aligned_u64 iova;
+ __aligned_u64 length;
+ __aligned_u64 uptr;
+ } access_pages;
+ struct {
+ __aligned_u64 iova;
+ __aligned_u64 length;
+ __aligned_u64 uptr;
+ __u32 flags;
+ } access_rw;
+ struct {
+ __u32 limit;
+ } memory_limit;
+ struct {
+ __u32 ioas_id;
+ } access_replace_ioas;
+ };
+ __u32 last;
+};
+#define IOMMU_TEST_CMD _IO(IOMMUFD_TYPE, IOMMUFD_CMD_BASE + 32)
+
+/* Mock structs for IOMMU_DEVICE_GET_HW_INFO ioctl */
+#define IOMMU_HW_INFO_TYPE_SELFTEST 0xfeedbeef
+#define IOMMU_HW_INFO_SELFTEST_REGVAL 0xdeadbeef
+
+struct iommu_test_hw_info {
+ __u32 flags;
+ __u32 test_reg;
+};
+
+#endif
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
new file mode 100644
index 0000000000..e71523cbd0
--- /dev/null
+++ b/drivers/iommu/iommufd/main.c
@@ -0,0 +1,556 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2021 Intel Corporation
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ *
+ * iommufd provides control over the IOMMU HW objects created by IOMMU kernel
+ * drivers. IOMMU HW objects revolve around IO page tables that map incoming DMA
+ * addresses (IOVA) to CPU addresses.
+ */
+#define pr_fmt(fmt) "iommufd: " fmt
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/bug.h>
+#include <uapi/linux/iommufd.h>
+#include <linux/iommufd.h>
+
+#include "io_pagetable.h"
+#include "iommufd_private.h"
+#include "iommufd_test.h"
+
+struct iommufd_object_ops {
+ void (*destroy)(struct iommufd_object *obj);
+ void (*abort)(struct iommufd_object *obj);
+};
+static const struct iommufd_object_ops iommufd_object_ops[];
+static struct miscdevice vfio_misc_dev;
+
+struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
+ size_t size,
+ enum iommufd_object_type type)
+{
+ static struct lock_class_key obj_keys[IOMMUFD_OBJ_MAX];
+ struct iommufd_object *obj;
+ int rc;
+
+ obj = kzalloc(size, GFP_KERNEL_ACCOUNT);
+ if (!obj)
+ return ERR_PTR(-ENOMEM);
+ obj->type = type;
+ /*
+ * In most cases the destroy_rwsem is obtained with try so it doesn't
+ * interact with lockdep, however on destroy we have to sleep. This
+ * means if we have to destroy an object while holding a get on another
+ * object it triggers lockdep. Using one locking class per object type
+ * is a simple and reasonable way to avoid this.
+ */
+ __init_rwsem(&obj->destroy_rwsem, "iommufd_object::destroy_rwsem",
+ &obj_keys[type]);
+ refcount_set(&obj->users, 1);
+
+ /*
+ * Reserve an ID in the xarray but do not publish the pointer yet since
+ * the caller hasn't initialized it yet. Once the pointer is published
+ * in the xarray and visible to other threads we can't reliably destroy
+ * it anymore, so the caller must complete all errorable operations
+ * before calling iommufd_object_finalize().
+ */
+ rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY,
+ xa_limit_31b, GFP_KERNEL_ACCOUNT);
+ if (rc)
+ goto out_free;
+ return obj;
+out_free:
+ kfree(obj);
+ return ERR_PTR(rc);
+}
+
+/*
+ * Allow concurrent access to the object.
+ *
+ * Once another thread can see the object pointer it can prevent object
+ * destruction. Expect for special kernel-only objects there is no in-kernel way
+ * to reliably destroy a single object. Thus all APIs that are creating objects
+ * must use iommufd_object_abort() to handle their errors and only call
+ * iommufd_object_finalize() once object creation cannot fail.
+ */
+void iommufd_object_finalize(struct iommufd_ctx *ictx,
+ struct iommufd_object *obj)
+{
+ void *old;
+
+ old = xa_store(&ictx->objects, obj->id, obj, GFP_KERNEL);
+ /* obj->id was returned from xa_alloc() so the xa_store() cannot fail */
+ WARN_ON(old);
+}
+
+/* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */
+void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj)
+{
+ void *old;
+
+ old = xa_erase(&ictx->objects, obj->id);
+ WARN_ON(old);
+ kfree(obj);
+}
+
+/*
+ * Abort an object that has been fully initialized and needs destroy, but has
+ * not been finalized.
+ */
+void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx,
+ struct iommufd_object *obj)
+{
+ if (iommufd_object_ops[obj->type].abort)
+ iommufd_object_ops[obj->type].abort(obj);
+ else
+ iommufd_object_ops[obj->type].destroy(obj);
+ iommufd_object_abort(ictx, obj);
+}
+
+struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id,
+ enum iommufd_object_type type)
+{
+ struct iommufd_object *obj;
+
+ if (iommufd_should_fail())
+ return ERR_PTR(-ENOENT);
+
+ xa_lock(&ictx->objects);
+ obj = xa_load(&ictx->objects, id);
+ if (!obj || (type != IOMMUFD_OBJ_ANY && obj->type != type) ||
+ !iommufd_lock_obj(obj))
+ obj = ERR_PTR(-ENOENT);
+ xa_unlock(&ictx->objects);
+ return obj;
+}
+
+/*
+ * Remove the given object id from the xarray if the only reference to the
+ * object is held by the xarray. The caller must call ops destroy().
+ */
+static struct iommufd_object *iommufd_object_remove(struct iommufd_ctx *ictx,
+ u32 id, bool extra_put)
+{
+ struct iommufd_object *obj;
+ XA_STATE(xas, &ictx->objects, id);
+
+ xa_lock(&ictx->objects);
+ obj = xas_load(&xas);
+ if (xa_is_zero(obj) || !obj) {
+ obj = ERR_PTR(-ENOENT);
+ goto out_xa;
+ }
+
+ /*
+ * If the caller is holding a ref on obj we put it here under the
+ * spinlock.
+ */
+ if (extra_put)
+ refcount_dec(&obj->users);
+
+ if (!refcount_dec_if_one(&obj->users)) {
+ obj = ERR_PTR(-EBUSY);
+ goto out_xa;
+ }
+
+ xas_store(&xas, NULL);
+ if (ictx->vfio_ioas == container_of(obj, struct iommufd_ioas, obj))
+ ictx->vfio_ioas = NULL;
+
+out_xa:
+ xa_unlock(&ictx->objects);
+
+ /* The returned object reference count is zero */
+ return obj;
+}
+
+/*
+ * The caller holds a users refcount and wants to destroy the object. Returns
+ * true if the object was destroyed. In all cases the caller no longer has a
+ * reference on obj.
+ */
+void __iommufd_object_destroy_user(struct iommufd_ctx *ictx,
+ struct iommufd_object *obj, bool allow_fail)
+{
+ struct iommufd_object *ret;
+
+ /*
+ * The purpose of the destroy_rwsem is to ensure deterministic
+ * destruction of objects used by external drivers and destroyed by this
+ * function. Any temporary increment of the refcount must hold the read
+ * side of this, such as during ioctl execution.
+ */
+ down_write(&obj->destroy_rwsem);
+ ret = iommufd_object_remove(ictx, obj->id, true);
+ up_write(&obj->destroy_rwsem);
+
+ if (allow_fail && IS_ERR(ret))
+ return;
+
+ /*
+ * If there is a bug and we couldn't destroy the object then we did put
+ * back the caller's refcount and will eventually try to free it again
+ * during close.
+ */
+ if (WARN_ON(IS_ERR(ret)))
+ return;
+
+ iommufd_object_ops[obj->type].destroy(obj);
+ kfree(obj);
+}
+
+static int iommufd_destroy(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_destroy *cmd = ucmd->cmd;
+ struct iommufd_object *obj;
+
+ obj = iommufd_object_remove(ucmd->ictx, cmd->id, false);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+ iommufd_object_ops[obj->type].destroy(obj);
+ kfree(obj);
+ return 0;
+}
+
+static int iommufd_fops_open(struct inode *inode, struct file *filp)
+{
+ struct iommufd_ctx *ictx;
+
+ ictx = kzalloc(sizeof(*ictx), GFP_KERNEL_ACCOUNT);
+ if (!ictx)
+ return -ENOMEM;
+
+ /*
+ * For compatibility with VFIO when /dev/vfio/vfio is opened we default
+ * to the same rlimit accounting as vfio uses.
+ */
+ if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) &&
+ filp->private_data == &vfio_misc_dev) {
+ ictx->account_mode = IOPT_PAGES_ACCOUNT_MM;
+ pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n");
+ }
+
+ xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT);
+ xa_init(&ictx->groups);
+ ictx->file = filp;
+ filp->private_data = ictx;
+ return 0;
+}
+
+static int iommufd_fops_release(struct inode *inode, struct file *filp)
+{
+ struct iommufd_ctx *ictx = filp->private_data;
+ struct iommufd_object *obj;
+
+ /*
+ * The objects in the xarray form a graph of "users" counts, and we have
+ * to destroy them in a depth first manner. Leaf objects will reduce the
+ * users count of interior objects when they are destroyed.
+ *
+ * Repeatedly destroying all the "1 users" leaf objects will progress
+ * until the entire list is destroyed. If this can't progress then there
+ * is some bug related to object refcounting.
+ */
+ while (!xa_empty(&ictx->objects)) {
+ unsigned int destroyed = 0;
+ unsigned long index;
+
+ xa_for_each(&ictx->objects, index, obj) {
+ if (!refcount_dec_if_one(&obj->users))
+ continue;
+ destroyed++;
+ xa_erase(&ictx->objects, index);
+ iommufd_object_ops[obj->type].destroy(obj);
+ kfree(obj);
+ }
+ /* Bug related to users refcount */
+ if (WARN_ON(!destroyed))
+ break;
+ }
+ WARN_ON(!xa_empty(&ictx->groups));
+ kfree(ictx);
+ return 0;
+}
+
+static int iommufd_option(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_option *cmd = ucmd->cmd;
+ int rc;
+
+ if (cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ switch (cmd->option_id) {
+ case IOMMU_OPTION_RLIMIT_MODE:
+ rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);
+ break;
+ case IOMMU_OPTION_HUGE_PAGES:
+ rc = iommufd_ioas_option(ucmd);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ if (rc)
+ return rc;
+ if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64,
+ &cmd->val64, sizeof(cmd->val64)))
+ return -EFAULT;
+ return 0;
+}
+
+union ucmd_buffer {
+ struct iommu_destroy destroy;
+ struct iommu_hw_info info;
+ struct iommu_hwpt_alloc hwpt;
+ struct iommu_ioas_alloc alloc;
+ struct iommu_ioas_allow_iovas allow_iovas;
+ struct iommu_ioas_copy ioas_copy;
+ struct iommu_ioas_iova_ranges iova_ranges;
+ struct iommu_ioas_map map;
+ struct iommu_ioas_unmap unmap;
+ struct iommu_option option;
+ struct iommu_vfio_ioas vfio_ioas;
+#ifdef CONFIG_IOMMUFD_TEST
+ struct iommu_test_cmd test;
+#endif
+};
+
+struct iommufd_ioctl_op {
+ unsigned int size;
+ unsigned int min_size;
+ unsigned int ioctl_num;
+ int (*execute)(struct iommufd_ucmd *ucmd);
+};
+
+#define IOCTL_OP(_ioctl, _fn, _struct, _last) \
+ [_IOC_NR(_ioctl) - IOMMUFD_CMD_BASE] = { \
+ .size = sizeof(_struct) + \
+ BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \
+ sizeof(_struct)), \
+ .min_size = offsetofend(_struct, _last), \
+ .ioctl_num = _ioctl, \
+ .execute = _fn, \
+ }
+static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
+ IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id),
+ IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info,
+ __reserved),
+ IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc,
+ __reserved),
+ IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
+ struct iommu_ioas_alloc, out_ioas_id),
+ IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
+ struct iommu_ioas_allow_iovas, allowed_iovas),
+ IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy,
+ src_iova),
+ IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
+ struct iommu_ioas_iova_ranges, out_iova_alignment),
+ IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map,
+ iova),
+ IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap,
+ length),
+ IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option,
+ val64),
+ IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
+ __reserved),
+#ifdef CONFIG_IOMMUFD_TEST
+ IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
+#endif
+};
+
+static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct iommufd_ctx *ictx = filp->private_data;
+ const struct iommufd_ioctl_op *op;
+ struct iommufd_ucmd ucmd = {};
+ union ucmd_buffer buf;
+ unsigned int nr;
+ int ret;
+
+ nr = _IOC_NR(cmd);
+ if (nr < IOMMUFD_CMD_BASE ||
+ (nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops))
+ return iommufd_vfio_ioctl(ictx, cmd, arg);
+
+ ucmd.ictx = ictx;
+ ucmd.ubuffer = (void __user *)arg;
+ ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
+ if (ret)
+ return ret;
+
+ op = &iommufd_ioctl_ops[nr - IOMMUFD_CMD_BASE];
+ if (op->ioctl_num != cmd)
+ return -ENOIOCTLCMD;
+ if (ucmd.user_size < op->min_size)
+ return -EINVAL;
+
+ ucmd.cmd = &buf;
+ ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
+ ucmd.user_size);
+ if (ret)
+ return ret;
+ ret = op->execute(&ucmd);
+ return ret;
+}
+
+static const struct file_operations iommufd_fops = {
+ .owner = THIS_MODULE,
+ .open = iommufd_fops_open,
+ .release = iommufd_fops_release,
+ .unlocked_ioctl = iommufd_fops_ioctl,
+};
+
+/**
+ * iommufd_ctx_get - Get a context reference
+ * @ictx: Context to get
+ *
+ * The caller must already hold a valid reference to ictx.
+ */
+void iommufd_ctx_get(struct iommufd_ctx *ictx)
+{
+ get_file(ictx->file);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, IOMMUFD);
+
+/**
+ * iommufd_ctx_from_file - Acquires a reference to the iommufd context
+ * @file: File to obtain the reference from
+ *
+ * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. The struct file
+ * remains owned by the caller and the caller must still do fput. On success
+ * the caller is responsible to call iommufd_ctx_put().
+ */
+struct iommufd_ctx *iommufd_ctx_from_file(struct file *file)
+{
+ struct iommufd_ctx *ictx;
+
+ if (file->f_op != &iommufd_fops)
+ return ERR_PTR(-EBADFD);
+ ictx = file->private_data;
+ iommufd_ctx_get(ictx);
+ return ictx;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, IOMMUFD);
+
+/**
+ * iommufd_ctx_from_fd - Acquires a reference to the iommufd context
+ * @fd: File descriptor to obtain the reference from
+ *
+ * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. On success
+ * the caller is responsible to call iommufd_ctx_put().
+ */
+struct iommufd_ctx *iommufd_ctx_from_fd(int fd)
+{
+ struct file *file;
+
+ file = fget(fd);
+ if (!file)
+ return ERR_PTR(-EBADF);
+
+ if (file->f_op != &iommufd_fops) {
+ fput(file);
+ return ERR_PTR(-EBADFD);
+ }
+ /* fget is the same as iommufd_ctx_get() */
+ return file->private_data;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_fd, IOMMUFD);
+
+/**
+ * iommufd_ctx_put - Put back a reference
+ * @ictx: Context to put back
+ */
+void iommufd_ctx_put(struct iommufd_ctx *ictx)
+{
+ fput(ictx->file);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD);
+
+static const struct iommufd_object_ops iommufd_object_ops[] = {
+ [IOMMUFD_OBJ_ACCESS] = {
+ .destroy = iommufd_access_destroy_object,
+ },
+ [IOMMUFD_OBJ_DEVICE] = {
+ .destroy = iommufd_device_destroy,
+ },
+ [IOMMUFD_OBJ_IOAS] = {
+ .destroy = iommufd_ioas_destroy,
+ },
+ [IOMMUFD_OBJ_HW_PAGETABLE] = {
+ .destroy = iommufd_hw_pagetable_destroy,
+ .abort = iommufd_hw_pagetable_abort,
+ },
+#ifdef CONFIG_IOMMUFD_TEST
+ [IOMMUFD_OBJ_SELFTEST] = {
+ .destroy = iommufd_selftest_destroy,
+ },
+#endif
+};
+
+static struct miscdevice iommu_misc_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "iommu",
+ .fops = &iommufd_fops,
+ .nodename = "iommu",
+ .mode = 0660,
+};
+
+
+static struct miscdevice vfio_misc_dev = {
+ .minor = VFIO_MINOR,
+ .name = "vfio",
+ .fops = &iommufd_fops,
+ .nodename = "vfio/vfio",
+ .mode = 0666,
+};
+
+static int __init iommufd_init(void)
+{
+ int ret;
+
+ ret = misc_register(&iommu_misc_dev);
+ if (ret)
+ return ret;
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) {
+ ret = misc_register(&vfio_misc_dev);
+ if (ret)
+ goto err_misc;
+ }
+ ret = iommufd_test_init();
+ if (ret)
+ goto err_vfio_misc;
+ return 0;
+
+err_vfio_misc:
+ if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER))
+ misc_deregister(&vfio_misc_dev);
+err_misc:
+ misc_deregister(&iommu_misc_dev);
+ return ret;
+}
+
+static void __exit iommufd_exit(void)
+{
+ iommufd_test_exit();
+ if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER))
+ misc_deregister(&vfio_misc_dev);
+ misc_deregister(&iommu_misc_dev);
+}
+
+module_init(iommufd_init);
+module_exit(iommufd_exit);
+
+#if IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)
+MODULE_ALIAS_MISCDEV(VFIO_MINOR);
+MODULE_ALIAS("devname:vfio/vfio");
+#endif
+MODULE_IMPORT_NS(IOMMUFD_INTERNAL);
+MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices");
+MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
new file mode 100644
index 0000000000..528f356238
--- /dev/null
+++ b/drivers/iommu/iommufd/pages.c
@@ -0,0 +1,1993 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
+ *
+ * The iopt_pages is the center of the storage and motion of PFNs. Each
+ * iopt_pages represents a logical linear array of full PFNs. The array is 0
+ * based and has npages in it. Accessors use 'index' to refer to the entry in
+ * this logical array, regardless of its storage location.
+ *
+ * PFNs are stored in a tiered scheme:
+ * 1) iopt_pages::pinned_pfns xarray
+ * 2) An iommu_domain
+ * 3) The origin of the PFNs, i.e. the userspace pointer
+ *
+ * PFN have to be copied between all combinations of tiers, depending on the
+ * configuration.
+ *
+ * When a PFN is taken out of the userspace pointer it is pinned exactly once.
+ * The storage locations of the PFN's index are tracked in the two interval
+ * trees. If no interval includes the index then it is not pinned.
+ *
+ * If access_itree includes the PFN's index then an in-kernel access has
+ * requested the page. The PFN is stored in the xarray so other requestors can
+ * continue to find it.
+ *
+ * If the domains_itree includes the PFN's index then an iommu_domain is storing
+ * the PFN and it can be read back using iommu_iova_to_phys(). To avoid
+ * duplicating storage the xarray is not used if only iommu_domains are using
+ * the PFN's index.
+ *
+ * As a general principle this is designed so that destroy never fails. This
+ * means removing an iommu_domain or releasing a in-kernel access will not fail
+ * due to insufficient memory. In practice this means some cases have to hold
+ * PFNs in the xarray even though they are also being stored in an iommu_domain.
+ *
+ * While the iopt_pages can use an iommu_domain as storage, it does not have an
+ * IOVA itself. Instead the iopt_area represents a range of IOVA and uses the
+ * iopt_pages as the PFN provider. Multiple iopt_areas can share the iopt_pages
+ * and reference their own slice of the PFN array, with sub page granularity.
+ *
+ * In this file the term 'last' indicates an inclusive and closed interval, eg
+ * [0,0] refers to a single PFN. 'end' means an open range, eg [0,0) refers to
+ * no PFNs.
+ *
+ * Be cautious of overflow. An IOVA can go all the way up to U64_MAX, so
+ * last_iova + 1 can overflow. An iopt_pages index will always be much less than
+ * ULONG_MAX so last_index + 1 cannot overflow.
+ */
+#include <linux/overflow.h>
+#include <linux/slab.h>
+#include <linux/iommu.h>
+#include <linux/sched/mm.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+#include <linux/iommufd.h>
+
+#include "io_pagetable.h"
+#include "double_span.h"
+
+#ifndef CONFIG_IOMMUFD_TEST
+#define TEMP_MEMORY_LIMIT 65536
+#else
+#define TEMP_MEMORY_LIMIT iommufd_test_memory_limit
+#endif
+#define BATCH_BACKUP_SIZE 32
+
+/*
+ * More memory makes pin_user_pages() and the batching more efficient, but as
+ * this is only a performance optimization don't try too hard to get it. A 64k
+ * allocation can hold about 26M of 4k pages and 13G of 2M pages in an
+ * pfn_batch. Various destroy paths cannot fail and provide a small amount of
+ * stack memory as a backup contingency. If backup_len is given this cannot
+ * fail.
+ */
+static void *temp_kmalloc(size_t *size, void *backup, size_t backup_len)
+{
+ void *res;
+
+ if (WARN_ON(*size == 0))
+ return NULL;
+
+ if (*size < backup_len)
+ return backup;
+
+ if (!backup && iommufd_should_fail())
+ return NULL;
+
+ *size = min_t(size_t, *size, TEMP_MEMORY_LIMIT);
+ res = kmalloc(*size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
+ if (res)
+ return res;
+ *size = PAGE_SIZE;
+ if (backup_len) {
+ res = kmalloc(*size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
+ if (res)
+ return res;
+ *size = backup_len;
+ return backup;
+ }
+ return kmalloc(*size, GFP_KERNEL);
+}
+
+void interval_tree_double_span_iter_update(
+ struct interval_tree_double_span_iter *iter)
+{
+ unsigned long last_hole = ULONG_MAX;
+ unsigned int i;
+
+ for (i = 0; i != ARRAY_SIZE(iter->spans); i++) {
+ if (interval_tree_span_iter_done(&iter->spans[i])) {
+ iter->is_used = -1;
+ return;
+ }
+
+ if (iter->spans[i].is_hole) {
+ last_hole = min(last_hole, iter->spans[i].last_hole);
+ continue;
+ }
+
+ iter->is_used = i + 1;
+ iter->start_used = iter->spans[i].start_used;
+ iter->last_used = min(iter->spans[i].last_used, last_hole);
+ return;
+ }
+
+ iter->is_used = 0;
+ iter->start_hole = iter->spans[0].start_hole;
+ iter->last_hole =
+ min(iter->spans[0].last_hole, iter->spans[1].last_hole);
+}
+
+void interval_tree_double_span_iter_first(
+ struct interval_tree_double_span_iter *iter,
+ struct rb_root_cached *itree1, struct rb_root_cached *itree2,
+ unsigned long first_index, unsigned long last_index)
+{
+ unsigned int i;
+
+ iter->itrees[0] = itree1;
+ iter->itrees[1] = itree2;
+ for (i = 0; i != ARRAY_SIZE(iter->spans); i++)
+ interval_tree_span_iter_first(&iter->spans[i], iter->itrees[i],
+ first_index, last_index);
+ interval_tree_double_span_iter_update(iter);
+}
+
+void interval_tree_double_span_iter_next(
+ struct interval_tree_double_span_iter *iter)
+{
+ unsigned int i;
+
+ if (iter->is_used == -1 ||
+ iter->last_hole == iter->spans[0].last_index) {
+ iter->is_used = -1;
+ return;
+ }
+
+ for (i = 0; i != ARRAY_SIZE(iter->spans); i++)
+ interval_tree_span_iter_advance(
+ &iter->spans[i], iter->itrees[i], iter->last_hole + 1);
+ interval_tree_double_span_iter_update(iter);
+}
+
+static void iopt_pages_add_npinned(struct iopt_pages *pages, size_t npages)
+{
+ int rc;
+
+ rc = check_add_overflow(pages->npinned, npages, &pages->npinned);
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(rc || pages->npinned > pages->npages);
+}
+
+static void iopt_pages_sub_npinned(struct iopt_pages *pages, size_t npages)
+{
+ int rc;
+
+ rc = check_sub_overflow(pages->npinned, npages, &pages->npinned);
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(rc || pages->npinned > pages->npages);
+}
+
+static void iopt_pages_err_unpin(struct iopt_pages *pages,
+ unsigned long start_index,
+ unsigned long last_index,
+ struct page **page_list)
+{
+ unsigned long npages = last_index - start_index + 1;
+
+ unpin_user_pages(page_list, npages);
+ iopt_pages_sub_npinned(pages, npages);
+}
+
+/*
+ * index is the number of PAGE_SIZE units from the start of the area's
+ * iopt_pages. If the iova is sub page-size then the area has an iova that
+ * covers a portion of the first and last pages in the range.
+ */
+static unsigned long iopt_area_index_to_iova(struct iopt_area *area,
+ unsigned long index)
+{
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(index < iopt_area_index(area) ||
+ index > iopt_area_last_index(area));
+ index -= iopt_area_index(area);
+ if (index == 0)
+ return iopt_area_iova(area);
+ return iopt_area_iova(area) - area->page_offset + index * PAGE_SIZE;
+}
+
+static unsigned long iopt_area_index_to_iova_last(struct iopt_area *area,
+ unsigned long index)
+{
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(index < iopt_area_index(area) ||
+ index > iopt_area_last_index(area));
+ if (index == iopt_area_last_index(area))
+ return iopt_area_last_iova(area);
+ return iopt_area_iova(area) - area->page_offset +
+ (index - iopt_area_index(area) + 1) * PAGE_SIZE - 1;
+}
+
+static void iommu_unmap_nofail(struct iommu_domain *domain, unsigned long iova,
+ size_t size)
+{
+ size_t ret;
+
+ ret = iommu_unmap(domain, iova, size);
+ /*
+ * It is a logic error in this code or a driver bug if the IOMMU unmaps
+ * something other than exactly as requested. This implies that the
+ * iommu driver may not fail unmap for reasons beyond bad agruments.
+ * Particularly, the iommu driver may not do a memory allocation on the
+ * unmap path.
+ */
+ WARN_ON(ret != size);
+}
+
+static void iopt_area_unmap_domain_range(struct iopt_area *area,
+ struct iommu_domain *domain,
+ unsigned long start_index,
+ unsigned long last_index)
+{
+ unsigned long start_iova = iopt_area_index_to_iova(area, start_index);
+
+ iommu_unmap_nofail(domain, start_iova,
+ iopt_area_index_to_iova_last(area, last_index) -
+ start_iova + 1);
+}
+
+static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages,
+ unsigned long index)
+{
+ struct interval_tree_node *node;
+
+ node = interval_tree_iter_first(&pages->domains_itree, index, index);
+ if (!node)
+ return NULL;
+ return container_of(node, struct iopt_area, pages_node);
+}
+
+/*
+ * A simple datastructure to hold a vector of PFNs, optimized for contiguous
+ * PFNs. This is used as a temporary holding memory for shuttling pfns from one
+ * place to another. Generally everything is made more efficient if operations
+ * work on the largest possible grouping of pfns. eg fewer lock/unlock cycles,
+ * better cache locality, etc
+ */
+struct pfn_batch {
+ unsigned long *pfns;
+ u32 *npfns;
+ unsigned int array_size;
+ unsigned int end;
+ unsigned int total_pfns;
+};
+
+static void batch_clear(struct pfn_batch *batch)
+{
+ batch->total_pfns = 0;
+ batch->end = 0;
+ batch->pfns[0] = 0;
+ batch->npfns[0] = 0;
+}
+
+/*
+ * Carry means we carry a portion of the final hugepage over to the front of the
+ * batch
+ */
+static void batch_clear_carry(struct pfn_batch *batch, unsigned int keep_pfns)
+{
+ if (!keep_pfns)
+ return batch_clear(batch);
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(!batch->end ||
+ batch->npfns[batch->end - 1] < keep_pfns);
+
+ batch->total_pfns = keep_pfns;
+ batch->pfns[0] = batch->pfns[batch->end - 1] +
+ (batch->npfns[batch->end - 1] - keep_pfns);
+ batch->npfns[0] = keep_pfns;
+ batch->end = 1;
+}
+
+static void batch_skip_carry(struct pfn_batch *batch, unsigned int skip_pfns)
+{
+ if (!batch->total_pfns)
+ return;
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(batch->total_pfns != batch->npfns[0]);
+ skip_pfns = min(batch->total_pfns, skip_pfns);
+ batch->pfns[0] += skip_pfns;
+ batch->npfns[0] -= skip_pfns;
+ batch->total_pfns -= skip_pfns;
+}
+
+static int __batch_init(struct pfn_batch *batch, size_t max_pages, void *backup,
+ size_t backup_len)
+{
+ const size_t elmsz = sizeof(*batch->pfns) + sizeof(*batch->npfns);
+ size_t size = max_pages * elmsz;
+
+ batch->pfns = temp_kmalloc(&size, backup, backup_len);
+ if (!batch->pfns)
+ return -ENOMEM;
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && WARN_ON(size < elmsz))
+ return -EINVAL;
+ batch->array_size = size / elmsz;
+ batch->npfns = (u32 *)(batch->pfns + batch->array_size);
+ batch_clear(batch);
+ return 0;
+}
+
+static int batch_init(struct pfn_batch *batch, size_t max_pages)
+{
+ return __batch_init(batch, max_pages, NULL, 0);
+}
+
+static void batch_init_backup(struct pfn_batch *batch, size_t max_pages,
+ void *backup, size_t backup_len)
+{
+ __batch_init(batch, max_pages, backup, backup_len);
+}
+
+static void batch_destroy(struct pfn_batch *batch, void *backup)
+{
+ if (batch->pfns != backup)
+ kfree(batch->pfns);
+}
+
+/* true if the pfn was added, false otherwise */
+static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn)
+{
+ const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns));
+
+ if (batch->end &&
+ pfn == batch->pfns[batch->end - 1] + batch->npfns[batch->end - 1] &&
+ batch->npfns[batch->end - 1] != MAX_NPFNS) {
+ batch->npfns[batch->end - 1]++;
+ batch->total_pfns++;
+ return true;
+ }
+ if (batch->end == batch->array_size)
+ return false;
+ batch->total_pfns++;
+ batch->pfns[batch->end] = pfn;
+ batch->npfns[batch->end] = 1;
+ batch->end++;
+ return true;
+}
+
+/*
+ * Fill the batch with pfns from the domain. When the batch is full, or it
+ * reaches last_index, the function will return. The caller should use
+ * batch->total_pfns to determine the starting point for the next iteration.
+ */
+static void batch_from_domain(struct pfn_batch *batch,
+ struct iommu_domain *domain,
+ struct iopt_area *area, unsigned long start_index,
+ unsigned long last_index)
+{
+ unsigned int page_offset = 0;
+ unsigned long iova;
+ phys_addr_t phys;
+
+ iova = iopt_area_index_to_iova(area, start_index);
+ if (start_index == iopt_area_index(area))
+ page_offset = area->page_offset;
+ while (start_index <= last_index) {
+ /*
+ * This is pretty slow, it would be nice to get the page size
+ * back from the driver, or have the driver directly fill the
+ * batch.
+ */
+ phys = iommu_iova_to_phys(domain, iova) - page_offset;
+ if (!batch_add_pfn(batch, PHYS_PFN(phys)))
+ return;
+ iova += PAGE_SIZE - page_offset;
+ page_offset = 0;
+ start_index++;
+ }
+}
+
+static struct page **raw_pages_from_domain(struct iommu_domain *domain,
+ struct iopt_area *area,
+ unsigned long start_index,
+ unsigned long last_index,
+ struct page **out_pages)
+{
+ unsigned int page_offset = 0;
+ unsigned long iova;
+ phys_addr_t phys;
+
+ iova = iopt_area_index_to_iova(area, start_index);
+ if (start_index == iopt_area_index(area))
+ page_offset = area->page_offset;
+ while (start_index <= last_index) {
+ phys = iommu_iova_to_phys(domain, iova) - page_offset;
+ *(out_pages++) = pfn_to_page(PHYS_PFN(phys));
+ iova += PAGE_SIZE - page_offset;
+ page_offset = 0;
+ start_index++;
+ }
+ return out_pages;
+}
+
+/* Continues reading a domain until we reach a discontinuity in the pfns. */
+static void batch_from_domain_continue(struct pfn_batch *batch,
+ struct iommu_domain *domain,
+ struct iopt_area *area,
+ unsigned long start_index,
+ unsigned long last_index)
+{
+ unsigned int array_size = batch->array_size;
+
+ batch->array_size = batch->end;
+ batch_from_domain(batch, domain, area, start_index, last_index);
+ batch->array_size = array_size;
+}
+
+/*
+ * This is part of the VFIO compatibility support for VFIO_TYPE1_IOMMU. That
+ * mode permits splitting a mapped area up, and then one of the splits is
+ * unmapped. Doing this normally would cause us to violate our invariant of
+ * pairing map/unmap. Thus, to support old VFIO compatibility disable support
+ * for batching consecutive PFNs. All PFNs mapped into the iommu are done in
+ * PAGE_SIZE units, not larger or smaller.
+ */
+static int batch_iommu_map_small(struct iommu_domain *domain,
+ unsigned long iova, phys_addr_t paddr,
+ size_t size, int prot)
+{
+ unsigned long start_iova = iova;
+ int rc;
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(paddr % PAGE_SIZE || iova % PAGE_SIZE ||
+ size % PAGE_SIZE);
+
+ while (size) {
+ rc = iommu_map(domain, iova, paddr, PAGE_SIZE, prot,
+ GFP_KERNEL_ACCOUNT);
+ if (rc)
+ goto err_unmap;
+ iova += PAGE_SIZE;
+ paddr += PAGE_SIZE;
+ size -= PAGE_SIZE;
+ }
+ return 0;
+
+err_unmap:
+ if (start_iova != iova)
+ iommu_unmap_nofail(domain, start_iova, iova - start_iova);
+ return rc;
+}
+
+static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain,
+ struct iopt_area *area, unsigned long start_index)
+{
+ bool disable_large_pages = area->iopt->disable_large_pages;
+ unsigned long last_iova = iopt_area_last_iova(area);
+ unsigned int page_offset = 0;
+ unsigned long start_iova;
+ unsigned long next_iova;
+ unsigned int cur = 0;
+ unsigned long iova;
+ int rc;
+
+ /* The first index might be a partial page */
+ if (start_index == iopt_area_index(area))
+ page_offset = area->page_offset;
+ next_iova = iova = start_iova =
+ iopt_area_index_to_iova(area, start_index);
+ while (cur < batch->end) {
+ next_iova = min(last_iova + 1,
+ next_iova + batch->npfns[cur] * PAGE_SIZE -
+ page_offset);
+ if (disable_large_pages)
+ rc = batch_iommu_map_small(
+ domain, iova,
+ PFN_PHYS(batch->pfns[cur]) + page_offset,
+ next_iova - iova, area->iommu_prot);
+ else
+ rc = iommu_map(domain, iova,
+ PFN_PHYS(batch->pfns[cur]) + page_offset,
+ next_iova - iova, area->iommu_prot,
+ GFP_KERNEL_ACCOUNT);
+ if (rc)
+ goto err_unmap;
+ iova = next_iova;
+ page_offset = 0;
+ cur++;
+ }
+ return 0;
+err_unmap:
+ if (start_iova != iova)
+ iommu_unmap_nofail(domain, start_iova, iova - start_iova);
+ return rc;
+}
+
+static void batch_from_xarray(struct pfn_batch *batch, struct xarray *xa,
+ unsigned long start_index,
+ unsigned long last_index)
+{
+ XA_STATE(xas, xa, start_index);
+ void *entry;
+
+ rcu_read_lock();
+ while (true) {
+ entry = xas_next(&xas);
+ if (xas_retry(&xas, entry))
+ continue;
+ WARN_ON(!xa_is_value(entry));
+ if (!batch_add_pfn(batch, xa_to_value(entry)) ||
+ start_index == last_index)
+ break;
+ start_index++;
+ }
+ rcu_read_unlock();
+}
+
+static void batch_from_xarray_clear(struct pfn_batch *batch, struct xarray *xa,
+ unsigned long start_index,
+ unsigned long last_index)
+{
+ XA_STATE(xas, xa, start_index);
+ void *entry;
+
+ xas_lock(&xas);
+ while (true) {
+ entry = xas_next(&xas);
+ if (xas_retry(&xas, entry))
+ continue;
+ WARN_ON(!xa_is_value(entry));
+ if (!batch_add_pfn(batch, xa_to_value(entry)))
+ break;
+ xas_store(&xas, NULL);
+ if (start_index == last_index)
+ break;
+ start_index++;
+ }
+ xas_unlock(&xas);
+}
+
+static void clear_xarray(struct xarray *xa, unsigned long start_index,
+ unsigned long last_index)
+{
+ XA_STATE(xas, xa, start_index);
+ void *entry;
+
+ xas_lock(&xas);
+ xas_for_each(&xas, entry, last_index)
+ xas_store(&xas, NULL);
+ xas_unlock(&xas);
+}
+
+static int pages_to_xarray(struct xarray *xa, unsigned long start_index,
+ unsigned long last_index, struct page **pages)
+{
+ struct page **end_pages = pages + (last_index - start_index) + 1;
+ struct page **half_pages = pages + (end_pages - pages) / 2;
+ XA_STATE(xas, xa, start_index);
+
+ do {
+ void *old;
+
+ xas_lock(&xas);
+ while (pages != end_pages) {
+ /* xarray does not participate in fault injection */
+ if (pages == half_pages && iommufd_should_fail()) {
+ xas_set_err(&xas, -EINVAL);
+ xas_unlock(&xas);
+ /* aka xas_destroy() */
+ xas_nomem(&xas, GFP_KERNEL);
+ goto err_clear;
+ }
+
+ old = xas_store(&xas, xa_mk_value(page_to_pfn(*pages)));
+ if (xas_error(&xas))
+ break;
+ WARN_ON(old);
+ pages++;
+ xas_next(&xas);
+ }
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, GFP_KERNEL));
+
+err_clear:
+ if (xas_error(&xas)) {
+ if (xas.xa_index != start_index)
+ clear_xarray(xa, start_index, xas.xa_index - 1);
+ return xas_error(&xas);
+ }
+ return 0;
+}
+
+static void batch_from_pages(struct pfn_batch *batch, struct page **pages,
+ size_t npages)
+{
+ struct page **end = pages + npages;
+
+ for (; pages != end; pages++)
+ if (!batch_add_pfn(batch, page_to_pfn(*pages)))
+ break;
+}
+
+static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages,
+ unsigned int first_page_off, size_t npages)
+{
+ unsigned int cur = 0;
+
+ while (first_page_off) {
+ if (batch->npfns[cur] > first_page_off)
+ break;
+ first_page_off -= batch->npfns[cur];
+ cur++;
+ }
+
+ while (npages) {
+ size_t to_unpin = min_t(size_t, npages,
+ batch->npfns[cur] - first_page_off);
+
+ unpin_user_page_range_dirty_lock(
+ pfn_to_page(batch->pfns[cur] + first_page_off),
+ to_unpin, pages->writable);
+ iopt_pages_sub_npinned(pages, to_unpin);
+ cur++;
+ first_page_off = 0;
+ npages -= to_unpin;
+ }
+}
+
+static void copy_data_page(struct page *page, void *data, unsigned long offset,
+ size_t length, unsigned int flags)
+{
+ void *mem;
+
+ mem = kmap_local_page(page);
+ if (flags & IOMMUFD_ACCESS_RW_WRITE) {
+ memcpy(mem + offset, data, length);
+ set_page_dirty_lock(page);
+ } else {
+ memcpy(data, mem + offset, length);
+ }
+ kunmap_local(mem);
+}
+
+static unsigned long batch_rw(struct pfn_batch *batch, void *data,
+ unsigned long offset, unsigned long length,
+ unsigned int flags)
+{
+ unsigned long copied = 0;
+ unsigned int npage = 0;
+ unsigned int cur = 0;
+
+ while (cur < batch->end) {
+ unsigned long bytes = min(length, PAGE_SIZE - offset);
+
+ copy_data_page(pfn_to_page(batch->pfns[cur] + npage), data,
+ offset, bytes, flags);
+ offset = 0;
+ length -= bytes;
+ data += bytes;
+ copied += bytes;
+ npage++;
+ if (npage == batch->npfns[cur]) {
+ npage = 0;
+ cur++;
+ }
+ if (!length)
+ break;
+ }
+ return copied;
+}
+
+/* pfn_reader_user is just the pin_user_pages() path */
+struct pfn_reader_user {
+ struct page **upages;
+ size_t upages_len;
+ unsigned long upages_start;
+ unsigned long upages_end;
+ unsigned int gup_flags;
+ /*
+ * 1 means mmget() and mmap_read_lock(), 0 means only mmget(), -1 is
+ * neither
+ */
+ int locked;
+};
+
+static void pfn_reader_user_init(struct pfn_reader_user *user,
+ struct iopt_pages *pages)
+{
+ user->upages = NULL;
+ user->upages_start = 0;
+ user->upages_end = 0;
+ user->locked = -1;
+
+ user->gup_flags = FOLL_LONGTERM;
+ if (pages->writable)
+ user->gup_flags |= FOLL_WRITE;
+}
+
+static void pfn_reader_user_destroy(struct pfn_reader_user *user,
+ struct iopt_pages *pages)
+{
+ if (user->locked != -1) {
+ if (user->locked)
+ mmap_read_unlock(pages->source_mm);
+ if (pages->source_mm != current->mm)
+ mmput(pages->source_mm);
+ user->locked = -1;
+ }
+
+ kfree(user->upages);
+ user->upages = NULL;
+}
+
+static int pfn_reader_user_pin(struct pfn_reader_user *user,
+ struct iopt_pages *pages,
+ unsigned long start_index,
+ unsigned long last_index)
+{
+ bool remote_mm = pages->source_mm != current->mm;
+ unsigned long npages;
+ uintptr_t uptr;
+ long rc;
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ WARN_ON(last_index < start_index))
+ return -EINVAL;
+
+ if (!user->upages) {
+ /* All undone in pfn_reader_destroy() */
+ user->upages_len =
+ (last_index - start_index + 1) * sizeof(*user->upages);
+ user->upages = temp_kmalloc(&user->upages_len, NULL, 0);
+ if (!user->upages)
+ return -ENOMEM;
+ }
+
+ if (user->locked == -1) {
+ /*
+ * The majority of usages will run the map task within the mm
+ * providing the pages, so we can optimize into
+ * get_user_pages_fast()
+ */
+ if (remote_mm) {
+ if (!mmget_not_zero(pages->source_mm))
+ return -EFAULT;
+ }
+ user->locked = 0;
+ }
+
+ npages = min_t(unsigned long, last_index - start_index + 1,
+ user->upages_len / sizeof(*user->upages));
+
+
+ if (iommufd_should_fail())
+ return -EFAULT;
+
+ uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE);
+ if (!remote_mm)
+ rc = pin_user_pages_fast(uptr, npages, user->gup_flags,
+ user->upages);
+ else {
+ if (!user->locked) {
+ mmap_read_lock(pages->source_mm);
+ user->locked = 1;
+ }
+ rc = pin_user_pages_remote(pages->source_mm, uptr, npages,
+ user->gup_flags, user->upages,
+ &user->locked);
+ }
+ if (rc <= 0) {
+ if (WARN_ON(!rc))
+ return -EFAULT;
+ return rc;
+ }
+ iopt_pages_add_npinned(pages, rc);
+ user->upages_start = start_index;
+ user->upages_end = start_index + rc;
+ return 0;
+}
+
+/* This is the "modern" and faster accounting method used by io_uring */
+static int incr_user_locked_vm(struct iopt_pages *pages, unsigned long npages)
+{
+ unsigned long lock_limit;
+ unsigned long cur_pages;
+ unsigned long new_pages;
+
+ lock_limit = task_rlimit(pages->source_task, RLIMIT_MEMLOCK) >>
+ PAGE_SHIFT;
+ do {
+ cur_pages = atomic_long_read(&pages->source_user->locked_vm);
+ new_pages = cur_pages + npages;
+ if (new_pages > lock_limit)
+ return -ENOMEM;
+ } while (atomic_long_cmpxchg(&pages->source_user->locked_vm, cur_pages,
+ new_pages) != cur_pages);
+ return 0;
+}
+
+static void decr_user_locked_vm(struct iopt_pages *pages, unsigned long npages)
+{
+ if (WARN_ON(atomic_long_read(&pages->source_user->locked_vm) < npages))
+ return;
+ atomic_long_sub(npages, &pages->source_user->locked_vm);
+}
+
+/* This is the accounting method used for compatibility with VFIO */
+static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages,
+ bool inc, struct pfn_reader_user *user)
+{
+ bool do_put = false;
+ int rc;
+
+ if (user && user->locked) {
+ mmap_read_unlock(pages->source_mm);
+ user->locked = 0;
+ /* If we had the lock then we also have a get */
+ } else if ((!user || !user->upages) &&
+ pages->source_mm != current->mm) {
+ if (!mmget_not_zero(pages->source_mm))
+ return -EINVAL;
+ do_put = true;
+ }
+
+ mmap_write_lock(pages->source_mm);
+ rc = __account_locked_vm(pages->source_mm, npages, inc,
+ pages->source_task, false);
+ mmap_write_unlock(pages->source_mm);
+
+ if (do_put)
+ mmput(pages->source_mm);
+ return rc;
+}
+
+static int do_update_pinned(struct iopt_pages *pages, unsigned long npages,
+ bool inc, struct pfn_reader_user *user)
+{
+ int rc = 0;
+
+ switch (pages->account_mode) {
+ case IOPT_PAGES_ACCOUNT_NONE:
+ break;
+ case IOPT_PAGES_ACCOUNT_USER:
+ if (inc)
+ rc = incr_user_locked_vm(pages, npages);
+ else
+ decr_user_locked_vm(pages, npages);
+ break;
+ case IOPT_PAGES_ACCOUNT_MM:
+ rc = update_mm_locked_vm(pages, npages, inc, user);
+ break;
+ }
+ if (rc)
+ return rc;
+
+ pages->last_npinned = pages->npinned;
+ if (inc)
+ atomic64_add(npages, &pages->source_mm->pinned_vm);
+ else
+ atomic64_sub(npages, &pages->source_mm->pinned_vm);
+ return 0;
+}
+
+static void update_unpinned(struct iopt_pages *pages)
+{
+ if (WARN_ON(pages->npinned > pages->last_npinned))
+ return;
+ if (pages->npinned == pages->last_npinned)
+ return;
+ do_update_pinned(pages, pages->last_npinned - pages->npinned, false,
+ NULL);
+}
+
+/*
+ * Changes in the number of pages pinned is done after the pages have been read
+ * and processed. If the user lacked the limit then the error unwind will unpin
+ * everything that was just pinned. This is because it is expensive to calculate
+ * how many pages we have already pinned within a range to generate an accurate
+ * prediction in advance of doing the work to actually pin them.
+ */
+static int pfn_reader_user_update_pinned(struct pfn_reader_user *user,
+ struct iopt_pages *pages)
+{
+ unsigned long npages;
+ bool inc;
+
+ lockdep_assert_held(&pages->mutex);
+
+ if (pages->npinned == pages->last_npinned)
+ return 0;
+
+ if (pages->npinned < pages->last_npinned) {
+ npages = pages->last_npinned - pages->npinned;
+ inc = false;
+ } else {
+ if (iommufd_should_fail())
+ return -ENOMEM;
+ npages = pages->npinned - pages->last_npinned;
+ inc = true;
+ }
+ return do_update_pinned(pages, npages, inc, user);
+}
+
+/*
+ * PFNs are stored in three places, in order of preference:
+ * - The iopt_pages xarray. This is only populated if there is a
+ * iopt_pages_access
+ * - The iommu_domain under an area
+ * - The original PFN source, ie pages->source_mm
+ *
+ * This iterator reads the pfns optimizing to load according to the
+ * above order.
+ */
+struct pfn_reader {
+ struct iopt_pages *pages;
+ struct interval_tree_double_span_iter span;
+ struct pfn_batch batch;
+ unsigned long batch_start_index;
+ unsigned long batch_end_index;
+ unsigned long last_index;
+
+ struct pfn_reader_user user;
+};
+
+static int pfn_reader_update_pinned(struct pfn_reader *pfns)
+{
+ return pfn_reader_user_update_pinned(&pfns->user, pfns->pages);
+}
+
+/*
+ * The batch can contain a mixture of pages that are still in use and pages that
+ * need to be unpinned. Unpin only pages that are not held anywhere else.
+ */
+static void pfn_reader_unpin(struct pfn_reader *pfns)
+{
+ unsigned long last = pfns->batch_end_index - 1;
+ unsigned long start = pfns->batch_start_index;
+ struct interval_tree_double_span_iter span;
+ struct iopt_pages *pages = pfns->pages;
+
+ lockdep_assert_held(&pages->mutex);
+
+ interval_tree_for_each_double_span(&span, &pages->access_itree,
+ &pages->domains_itree, start, last) {
+ if (span.is_used)
+ continue;
+
+ batch_unpin(&pfns->batch, pages, span.start_hole - start,
+ span.last_hole - span.start_hole + 1);
+ }
+}
+
+/* Process a single span to load it from the proper storage */
+static int pfn_reader_fill_span(struct pfn_reader *pfns)
+{
+ struct interval_tree_double_span_iter *span = &pfns->span;
+ unsigned long start_index = pfns->batch_end_index;
+ struct iopt_area *area;
+ int rc;
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ WARN_ON(span->last_used < start_index))
+ return -EINVAL;
+
+ if (span->is_used == 1) {
+ batch_from_xarray(&pfns->batch, &pfns->pages->pinned_pfns,
+ start_index, span->last_used);
+ return 0;
+ }
+
+ if (span->is_used == 2) {
+ /*
+ * Pull as many pages from the first domain we find in the
+ * target span. If it is too small then we will be called again
+ * and we'll find another area.
+ */
+ area = iopt_pages_find_domain_area(pfns->pages, start_index);
+ if (WARN_ON(!area))
+ return -EINVAL;
+
+ /* The storage_domain cannot change without the pages mutex */
+ batch_from_domain(
+ &pfns->batch, area->storage_domain, area, start_index,
+ min(iopt_area_last_index(area), span->last_used));
+ return 0;
+ }
+
+ if (start_index >= pfns->user.upages_end) {
+ rc = pfn_reader_user_pin(&pfns->user, pfns->pages, start_index,
+ span->last_hole);
+ if (rc)
+ return rc;
+ }
+
+ batch_from_pages(&pfns->batch,
+ pfns->user.upages +
+ (start_index - pfns->user.upages_start),
+ pfns->user.upages_end - start_index);
+ return 0;
+}
+
+static bool pfn_reader_done(struct pfn_reader *pfns)
+{
+ return pfns->batch_start_index == pfns->last_index + 1;
+}
+
+static int pfn_reader_next(struct pfn_reader *pfns)
+{
+ int rc;
+
+ batch_clear(&pfns->batch);
+ pfns->batch_start_index = pfns->batch_end_index;
+
+ while (pfns->batch_end_index != pfns->last_index + 1) {
+ unsigned int npfns = pfns->batch.total_pfns;
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ WARN_ON(interval_tree_double_span_iter_done(&pfns->span)))
+ return -EINVAL;
+
+ rc = pfn_reader_fill_span(pfns);
+ if (rc)
+ return rc;
+
+ if (WARN_ON(!pfns->batch.total_pfns))
+ return -EINVAL;
+
+ pfns->batch_end_index =
+ pfns->batch_start_index + pfns->batch.total_pfns;
+ if (pfns->batch_end_index == pfns->span.last_used + 1)
+ interval_tree_double_span_iter_next(&pfns->span);
+
+ /* Batch is full */
+ if (npfns == pfns->batch.total_pfns)
+ return 0;
+ }
+ return 0;
+}
+
+static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages,
+ unsigned long start_index, unsigned long last_index)
+{
+ int rc;
+
+ lockdep_assert_held(&pages->mutex);
+
+ pfns->pages = pages;
+ pfns->batch_start_index = start_index;
+ pfns->batch_end_index = start_index;
+ pfns->last_index = last_index;
+ pfn_reader_user_init(&pfns->user, pages);
+ rc = batch_init(&pfns->batch, last_index - start_index + 1);
+ if (rc)
+ return rc;
+ interval_tree_double_span_iter_first(&pfns->span, &pages->access_itree,
+ &pages->domains_itree, start_index,
+ last_index);
+ return 0;
+}
+
+/*
+ * There are many assertions regarding the state of pages->npinned vs
+ * pages->last_pinned, for instance something like unmapping a domain must only
+ * decrement the npinned, and pfn_reader_destroy() must be called only after all
+ * the pins are updated. This is fine for success flows, but error flows
+ * sometimes need to release the pins held inside the pfn_reader before going on
+ * to complete unmapping and releasing pins held in domains.
+ */
+static void pfn_reader_release_pins(struct pfn_reader *pfns)
+{
+ struct iopt_pages *pages = pfns->pages;
+
+ if (pfns->user.upages_end > pfns->batch_end_index) {
+ size_t npages = pfns->user.upages_end - pfns->batch_end_index;
+
+ /* Any pages not transferred to the batch are just unpinned */
+ unpin_user_pages(pfns->user.upages + (pfns->batch_end_index -
+ pfns->user.upages_start),
+ npages);
+ iopt_pages_sub_npinned(pages, npages);
+ pfns->user.upages_end = pfns->batch_end_index;
+ }
+ if (pfns->batch_start_index != pfns->batch_end_index) {
+ pfn_reader_unpin(pfns);
+ pfns->batch_start_index = pfns->batch_end_index;
+ }
+}
+
+static void pfn_reader_destroy(struct pfn_reader *pfns)
+{
+ struct iopt_pages *pages = pfns->pages;
+
+ pfn_reader_release_pins(pfns);
+ pfn_reader_user_destroy(&pfns->user, pfns->pages);
+ batch_destroy(&pfns->batch, NULL);
+ WARN_ON(pages->last_npinned != pages->npinned);
+}
+
+static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages,
+ unsigned long start_index, unsigned long last_index)
+{
+ int rc;
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ WARN_ON(last_index < start_index))
+ return -EINVAL;
+
+ rc = pfn_reader_init(pfns, pages, start_index, last_index);
+ if (rc)
+ return rc;
+ rc = pfn_reader_next(pfns);
+ if (rc) {
+ pfn_reader_destroy(pfns);
+ return rc;
+ }
+ return 0;
+}
+
+struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
+ bool writable)
+{
+ struct iopt_pages *pages;
+ unsigned long end;
+
+ /*
+ * The iommu API uses size_t as the length, and protect the DIV_ROUND_UP
+ * below from overflow
+ */
+ if (length > SIZE_MAX - PAGE_SIZE || length == 0)
+ return ERR_PTR(-EINVAL);
+
+ if (check_add_overflow((unsigned long)uptr, length, &end))
+ return ERR_PTR(-EOVERFLOW);
+
+ pages = kzalloc(sizeof(*pages), GFP_KERNEL_ACCOUNT);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ kref_init(&pages->kref);
+ xa_init_flags(&pages->pinned_pfns, XA_FLAGS_ACCOUNT);
+ mutex_init(&pages->mutex);
+ pages->source_mm = current->mm;
+ mmgrab(pages->source_mm);
+ pages->uptr = (void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE);
+ pages->npages = DIV_ROUND_UP(length + (uptr - pages->uptr), PAGE_SIZE);
+ pages->access_itree = RB_ROOT_CACHED;
+ pages->domains_itree = RB_ROOT_CACHED;
+ pages->writable = writable;
+ if (capable(CAP_IPC_LOCK))
+ pages->account_mode = IOPT_PAGES_ACCOUNT_NONE;
+ else
+ pages->account_mode = IOPT_PAGES_ACCOUNT_USER;
+ pages->source_task = current->group_leader;
+ get_task_struct(current->group_leader);
+ pages->source_user = get_uid(current_user());
+ return pages;
+}
+
+void iopt_release_pages(struct kref *kref)
+{
+ struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref);
+
+ WARN_ON(!RB_EMPTY_ROOT(&pages->access_itree.rb_root));
+ WARN_ON(!RB_EMPTY_ROOT(&pages->domains_itree.rb_root));
+ WARN_ON(pages->npinned);
+ WARN_ON(!xa_empty(&pages->pinned_pfns));
+ mmdrop(pages->source_mm);
+ mutex_destroy(&pages->mutex);
+ put_task_struct(pages->source_task);
+ free_uid(pages->source_user);
+ kfree(pages);
+}
+
+static void
+iopt_area_unpin_domain(struct pfn_batch *batch, struct iopt_area *area,
+ struct iopt_pages *pages, struct iommu_domain *domain,
+ unsigned long start_index, unsigned long last_index,
+ unsigned long *unmapped_end_index,
+ unsigned long real_last_index)
+{
+ while (start_index <= last_index) {
+ unsigned long batch_last_index;
+
+ if (*unmapped_end_index <= last_index) {
+ unsigned long start =
+ max(start_index, *unmapped_end_index);
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ batch->total_pfns)
+ WARN_ON(*unmapped_end_index -
+ batch->total_pfns !=
+ start_index);
+ batch_from_domain(batch, domain, area, start,
+ last_index);
+ batch_last_index = start_index + batch->total_pfns - 1;
+ } else {
+ batch_last_index = last_index;
+ }
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(batch_last_index > real_last_index);
+
+ /*
+ * unmaps must always 'cut' at a place where the pfns are not
+ * contiguous to pair with the maps that always install
+ * contiguous pages. Thus, if we have to stop unpinning in the
+ * middle of the domains we need to keep reading pfns until we
+ * find a cut point to do the unmap. The pfns we read are
+ * carried over and either skipped or integrated into the next
+ * batch.
+ */
+ if (batch_last_index == last_index &&
+ last_index != real_last_index)
+ batch_from_domain_continue(batch, domain, area,
+ last_index + 1,
+ real_last_index);
+
+ if (*unmapped_end_index <= batch_last_index) {
+ iopt_area_unmap_domain_range(
+ area, domain, *unmapped_end_index,
+ start_index + batch->total_pfns - 1);
+ *unmapped_end_index = start_index + batch->total_pfns;
+ }
+
+ /* unpin must follow unmap */
+ batch_unpin(batch, pages, 0,
+ batch_last_index - start_index + 1);
+ start_index = batch_last_index + 1;
+
+ batch_clear_carry(batch,
+ *unmapped_end_index - batch_last_index - 1);
+ }
+}
+
+static void __iopt_area_unfill_domain(struct iopt_area *area,
+ struct iopt_pages *pages,
+ struct iommu_domain *domain,
+ unsigned long last_index)
+{
+ struct interval_tree_double_span_iter span;
+ unsigned long start_index = iopt_area_index(area);
+ unsigned long unmapped_end_index = start_index;
+ u64 backup[BATCH_BACKUP_SIZE];
+ struct pfn_batch batch;
+
+ lockdep_assert_held(&pages->mutex);
+
+ /*
+ * For security we must not unpin something that is still DMA mapped,
+ * so this must unmap any IOVA before we go ahead and unpin the pages.
+ * This creates a complexity where we need to skip over unpinning pages
+ * held in the xarray, but continue to unmap from the domain.
+ *
+ * The domain unmap cannot stop in the middle of a contiguous range of
+ * PFNs. To solve this problem the unpinning step will read ahead to the
+ * end of any contiguous span, unmap that whole span, and then only
+ * unpin the leading part that does not have any accesses. The residual
+ * PFNs that were unmapped but not unpinned are called a "carry" in the
+ * batch as they are moved to the front of the PFN list and continue on
+ * to the next iteration(s).
+ */
+ batch_init_backup(&batch, last_index + 1, backup, sizeof(backup));
+ interval_tree_for_each_double_span(&span, &pages->domains_itree,
+ &pages->access_itree, start_index,
+ last_index) {
+ if (span.is_used) {
+ batch_skip_carry(&batch,
+ span.last_used - span.start_used + 1);
+ continue;
+ }
+ iopt_area_unpin_domain(&batch, area, pages, domain,
+ span.start_hole, span.last_hole,
+ &unmapped_end_index, last_index);
+ }
+ /*
+ * If the range ends in a access then we do the residual unmap without
+ * any unpins.
+ */
+ if (unmapped_end_index != last_index + 1)
+ iopt_area_unmap_domain_range(area, domain, unmapped_end_index,
+ last_index);
+ WARN_ON(batch.total_pfns);
+ batch_destroy(&batch, backup);
+ update_unpinned(pages);
+}
+
+static void iopt_area_unfill_partial_domain(struct iopt_area *area,
+ struct iopt_pages *pages,
+ struct iommu_domain *domain,
+ unsigned long end_index)
+{
+ if (end_index != iopt_area_index(area))
+ __iopt_area_unfill_domain(area, pages, domain, end_index - 1);
+}
+
+/**
+ * iopt_area_unmap_domain() - Unmap without unpinning PFNs in a domain
+ * @area: The IOVA range to unmap
+ * @domain: The domain to unmap
+ *
+ * The caller must know that unpinning is not required, usually because there
+ * are other domains in the iopt.
+ */
+void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain)
+{
+ iommu_unmap_nofail(domain, iopt_area_iova(area),
+ iopt_area_length(area));
+}
+
+/**
+ * iopt_area_unfill_domain() - Unmap and unpin PFNs in a domain
+ * @area: IOVA area to use
+ * @pages: page supplier for the area (area->pages is NULL)
+ * @domain: Domain to unmap from
+ *
+ * The domain should be removed from the domains_itree before calling. The
+ * domain will always be unmapped, but the PFNs may not be unpinned if there are
+ * still accesses.
+ */
+void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages,
+ struct iommu_domain *domain)
+{
+ __iopt_area_unfill_domain(area, pages, domain,
+ iopt_area_last_index(area));
+}
+
+/**
+ * iopt_area_fill_domain() - Map PFNs from the area into a domain
+ * @area: IOVA area to use
+ * @domain: Domain to load PFNs into
+ *
+ * Read the pfns from the area's underlying iopt_pages and map them into the
+ * given domain. Called when attaching a new domain to an io_pagetable.
+ */
+int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
+{
+ unsigned long done_end_index;
+ struct pfn_reader pfns;
+ int rc;
+
+ lockdep_assert_held(&area->pages->mutex);
+
+ rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area),
+ iopt_area_last_index(area));
+ if (rc)
+ return rc;
+
+ while (!pfn_reader_done(&pfns)) {
+ done_end_index = pfns.batch_start_index;
+ rc = batch_to_domain(&pfns.batch, domain, area,
+ pfns.batch_start_index);
+ if (rc)
+ goto out_unmap;
+ done_end_index = pfns.batch_end_index;
+
+ rc = pfn_reader_next(&pfns);
+ if (rc)
+ goto out_unmap;
+ }
+
+ rc = pfn_reader_update_pinned(&pfns);
+ if (rc)
+ goto out_unmap;
+ goto out_destroy;
+
+out_unmap:
+ pfn_reader_release_pins(&pfns);
+ iopt_area_unfill_partial_domain(area, area->pages, domain,
+ done_end_index);
+out_destroy:
+ pfn_reader_destroy(&pfns);
+ return rc;
+}
+
+/**
+ * iopt_area_fill_domains() - Install PFNs into the area's domains
+ * @area: The area to act on
+ * @pages: The pages associated with the area (area->pages is NULL)
+ *
+ * Called during area creation. The area is freshly created and not inserted in
+ * the domains_itree yet. PFNs are read and loaded into every domain held in the
+ * area's io_pagetable and the area is installed in the domains_itree.
+ *
+ * On failure all domains are left unchanged.
+ */
+int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages)
+{
+ unsigned long done_first_end_index;
+ unsigned long done_all_end_index;
+ struct iommu_domain *domain;
+ unsigned long unmap_index;
+ struct pfn_reader pfns;
+ unsigned long index;
+ int rc;
+
+ lockdep_assert_held(&area->iopt->domains_rwsem);
+
+ if (xa_empty(&area->iopt->domains))
+ return 0;
+
+ mutex_lock(&pages->mutex);
+ rc = pfn_reader_first(&pfns, pages, iopt_area_index(area),
+ iopt_area_last_index(area));
+ if (rc)
+ goto out_unlock;
+
+ while (!pfn_reader_done(&pfns)) {
+ done_first_end_index = pfns.batch_end_index;
+ done_all_end_index = pfns.batch_start_index;
+ xa_for_each(&area->iopt->domains, index, domain) {
+ rc = batch_to_domain(&pfns.batch, domain, area,
+ pfns.batch_start_index);
+ if (rc)
+ goto out_unmap;
+ }
+ done_all_end_index = done_first_end_index;
+
+ rc = pfn_reader_next(&pfns);
+ if (rc)
+ goto out_unmap;
+ }
+ rc = pfn_reader_update_pinned(&pfns);
+ if (rc)
+ goto out_unmap;
+
+ area->storage_domain = xa_load(&area->iopt->domains, 0);
+ interval_tree_insert(&area->pages_node, &pages->domains_itree);
+ goto out_destroy;
+
+out_unmap:
+ pfn_reader_release_pins(&pfns);
+ xa_for_each(&area->iopt->domains, unmap_index, domain) {
+ unsigned long end_index;
+
+ if (unmap_index < index)
+ end_index = done_first_end_index;
+ else
+ end_index = done_all_end_index;
+
+ /*
+ * The area is not yet part of the domains_itree so we have to
+ * manage the unpinning specially. The last domain does the
+ * unpin, every other domain is just unmapped.
+ */
+ if (unmap_index != area->iopt->next_domain_id - 1) {
+ if (end_index != iopt_area_index(area))
+ iopt_area_unmap_domain_range(
+ area, domain, iopt_area_index(area),
+ end_index - 1);
+ } else {
+ iopt_area_unfill_partial_domain(area, pages, domain,
+ end_index);
+ }
+ }
+out_destroy:
+ pfn_reader_destroy(&pfns);
+out_unlock:
+ mutex_unlock(&pages->mutex);
+ return rc;
+}
+
+/**
+ * iopt_area_unfill_domains() - unmap PFNs from the area's domains
+ * @area: The area to act on
+ * @pages: The pages associated with the area (area->pages is NULL)
+ *
+ * Called during area destruction. This unmaps the iova's covered by all the
+ * area's domains and releases the PFNs.
+ */
+void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages)
+{
+ struct io_pagetable *iopt = area->iopt;
+ struct iommu_domain *domain;
+ unsigned long index;
+
+ lockdep_assert_held(&iopt->domains_rwsem);
+
+ mutex_lock(&pages->mutex);
+ if (!area->storage_domain)
+ goto out_unlock;
+
+ xa_for_each(&iopt->domains, index, domain)
+ if (domain != area->storage_domain)
+ iopt_area_unmap_domain_range(
+ area, domain, iopt_area_index(area),
+ iopt_area_last_index(area));
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb));
+ interval_tree_remove(&area->pages_node, &pages->domains_itree);
+ iopt_area_unfill_domain(area, pages, area->storage_domain);
+ area->storage_domain = NULL;
+out_unlock:
+ mutex_unlock(&pages->mutex);
+}
+
+static void iopt_pages_unpin_xarray(struct pfn_batch *batch,
+ struct iopt_pages *pages,
+ unsigned long start_index,
+ unsigned long end_index)
+{
+ while (start_index <= end_index) {
+ batch_from_xarray_clear(batch, &pages->pinned_pfns, start_index,
+ end_index);
+ batch_unpin(batch, pages, 0, batch->total_pfns);
+ start_index += batch->total_pfns;
+ batch_clear(batch);
+ }
+}
+
+/**
+ * iopt_pages_unfill_xarray() - Update the xarry after removing an access
+ * @pages: The pages to act on
+ * @start_index: Starting PFN index
+ * @last_index: Last PFN index
+ *
+ * Called when an iopt_pages_access is removed, removes pages from the itree.
+ * The access should already be removed from the access_itree.
+ */
+void iopt_pages_unfill_xarray(struct iopt_pages *pages,
+ unsigned long start_index,
+ unsigned long last_index)
+{
+ struct interval_tree_double_span_iter span;
+ u64 backup[BATCH_BACKUP_SIZE];
+ struct pfn_batch batch;
+ bool batch_inited = false;
+
+ lockdep_assert_held(&pages->mutex);
+
+ interval_tree_for_each_double_span(&span, &pages->access_itree,
+ &pages->domains_itree, start_index,
+ last_index) {
+ if (!span.is_used) {
+ if (!batch_inited) {
+ batch_init_backup(&batch,
+ last_index - start_index + 1,
+ backup, sizeof(backup));
+ batch_inited = true;
+ }
+ iopt_pages_unpin_xarray(&batch, pages, span.start_hole,
+ span.last_hole);
+ } else if (span.is_used == 2) {
+ /* Covered by a domain */
+ clear_xarray(&pages->pinned_pfns, span.start_used,
+ span.last_used);
+ }
+ /* Otherwise covered by an existing access */
+ }
+ if (batch_inited)
+ batch_destroy(&batch, backup);
+ update_unpinned(pages);
+}
+
+/**
+ * iopt_pages_fill_from_xarray() - Fast path for reading PFNs
+ * @pages: The pages to act on
+ * @start_index: The first page index in the range
+ * @last_index: The last page index in the range
+ * @out_pages: The output array to return the pages
+ *
+ * This can be called if the caller is holding a refcount on an
+ * iopt_pages_access that is known to have already been filled. It quickly reads
+ * the pages directly from the xarray.
+ *
+ * This is part of the SW iommu interface to read pages for in-kernel use.
+ */
+void iopt_pages_fill_from_xarray(struct iopt_pages *pages,
+ unsigned long start_index,
+ unsigned long last_index,
+ struct page **out_pages)
+{
+ XA_STATE(xas, &pages->pinned_pfns, start_index);
+ void *entry;
+
+ rcu_read_lock();
+ while (start_index <= last_index) {
+ entry = xas_next(&xas);
+ if (xas_retry(&xas, entry))
+ continue;
+ WARN_ON(!xa_is_value(entry));
+ *(out_pages++) = pfn_to_page(xa_to_value(entry));
+ start_index++;
+ }
+ rcu_read_unlock();
+}
+
+static int iopt_pages_fill_from_domain(struct iopt_pages *pages,
+ unsigned long start_index,
+ unsigned long last_index,
+ struct page **out_pages)
+{
+ while (start_index != last_index + 1) {
+ unsigned long domain_last;
+ struct iopt_area *area;
+
+ area = iopt_pages_find_domain_area(pages, start_index);
+ if (WARN_ON(!area))
+ return -EINVAL;
+
+ domain_last = min(iopt_area_last_index(area), last_index);
+ out_pages = raw_pages_from_domain(area->storage_domain, area,
+ start_index, domain_last,
+ out_pages);
+ start_index = domain_last + 1;
+ }
+ return 0;
+}
+
+static int iopt_pages_fill_from_mm(struct iopt_pages *pages,
+ struct pfn_reader_user *user,
+ unsigned long start_index,
+ unsigned long last_index,
+ struct page **out_pages)
+{
+ unsigned long cur_index = start_index;
+ int rc;
+
+ while (cur_index != last_index + 1) {
+ user->upages = out_pages + (cur_index - start_index);
+ rc = pfn_reader_user_pin(user, pages, cur_index, last_index);
+ if (rc)
+ goto out_unpin;
+ cur_index = user->upages_end;
+ }
+ return 0;
+
+out_unpin:
+ if (start_index != cur_index)
+ iopt_pages_err_unpin(pages, start_index, cur_index - 1,
+ out_pages);
+ return rc;
+}
+
+/**
+ * iopt_pages_fill_xarray() - Read PFNs
+ * @pages: The pages to act on
+ * @start_index: The first page index in the range
+ * @last_index: The last page index in the range
+ * @out_pages: The output array to return the pages, may be NULL
+ *
+ * This populates the xarray and returns the pages in out_pages. As the slow
+ * path this is able to copy pages from other storage tiers into the xarray.
+ *
+ * On failure the xarray is left unchanged.
+ *
+ * This is part of the SW iommu interface to read pages for in-kernel use.
+ */
+int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start_index,
+ unsigned long last_index, struct page **out_pages)
+{
+ struct interval_tree_double_span_iter span;
+ unsigned long xa_end = start_index;
+ struct pfn_reader_user user;
+ int rc;
+
+ lockdep_assert_held(&pages->mutex);
+
+ pfn_reader_user_init(&user, pages);
+ user.upages_len = (last_index - start_index + 1) * sizeof(*out_pages);
+ interval_tree_for_each_double_span(&span, &pages->access_itree,
+ &pages->domains_itree, start_index,
+ last_index) {
+ struct page **cur_pages;
+
+ if (span.is_used == 1) {
+ cur_pages = out_pages + (span.start_used - start_index);
+ iopt_pages_fill_from_xarray(pages, span.start_used,
+ span.last_used, cur_pages);
+ continue;
+ }
+
+ if (span.is_used == 2) {
+ cur_pages = out_pages + (span.start_used - start_index);
+ iopt_pages_fill_from_domain(pages, span.start_used,
+ span.last_used, cur_pages);
+ rc = pages_to_xarray(&pages->pinned_pfns,
+ span.start_used, span.last_used,
+ cur_pages);
+ if (rc)
+ goto out_clean_xa;
+ xa_end = span.last_used + 1;
+ continue;
+ }
+
+ /* hole */
+ cur_pages = out_pages + (span.start_hole - start_index);
+ rc = iopt_pages_fill_from_mm(pages, &user, span.start_hole,
+ span.last_hole, cur_pages);
+ if (rc)
+ goto out_clean_xa;
+ rc = pages_to_xarray(&pages->pinned_pfns, span.start_hole,
+ span.last_hole, cur_pages);
+ if (rc) {
+ iopt_pages_err_unpin(pages, span.start_hole,
+ span.last_hole, cur_pages);
+ goto out_clean_xa;
+ }
+ xa_end = span.last_hole + 1;
+ }
+ rc = pfn_reader_user_update_pinned(&user, pages);
+ if (rc)
+ goto out_clean_xa;
+ user.upages = NULL;
+ pfn_reader_user_destroy(&user, pages);
+ return 0;
+
+out_clean_xa:
+ if (start_index != xa_end)
+ iopt_pages_unfill_xarray(pages, start_index, xa_end - 1);
+ user.upages = NULL;
+ pfn_reader_user_destroy(&user, pages);
+ return rc;
+}
+
+/*
+ * This uses the pfn_reader instead of taking a shortcut by using the mm. It can
+ * do every scenario and is fully consistent with what an iommu_domain would
+ * see.
+ */
+static int iopt_pages_rw_slow(struct iopt_pages *pages,
+ unsigned long start_index,
+ unsigned long last_index, unsigned long offset,
+ void *data, unsigned long length,
+ unsigned int flags)
+{
+ struct pfn_reader pfns;
+ int rc;
+
+ mutex_lock(&pages->mutex);
+
+ rc = pfn_reader_first(&pfns, pages, start_index, last_index);
+ if (rc)
+ goto out_unlock;
+
+ while (!pfn_reader_done(&pfns)) {
+ unsigned long done;
+
+ done = batch_rw(&pfns.batch, data, offset, length, flags);
+ data += done;
+ length -= done;
+ offset = 0;
+ pfn_reader_unpin(&pfns);
+
+ rc = pfn_reader_next(&pfns);
+ if (rc)
+ goto out_destroy;
+ }
+ if (WARN_ON(length != 0))
+ rc = -EINVAL;
+out_destroy:
+ pfn_reader_destroy(&pfns);
+out_unlock:
+ mutex_unlock(&pages->mutex);
+ return rc;
+}
+
+/*
+ * A medium speed path that still allows DMA inconsistencies, but doesn't do any
+ * memory allocations or interval tree searches.
+ */
+static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index,
+ unsigned long offset, void *data,
+ unsigned long length, unsigned int flags)
+{
+ struct page *page = NULL;
+ int rc;
+
+ if (!mmget_not_zero(pages->source_mm))
+ return iopt_pages_rw_slow(pages, index, index, offset, data,
+ length, flags);
+
+ if (iommufd_should_fail()) {
+ rc = -EINVAL;
+ goto out_mmput;
+ }
+
+ mmap_read_lock(pages->source_mm);
+ rc = pin_user_pages_remote(
+ pages->source_mm, (uintptr_t)(pages->uptr + index * PAGE_SIZE),
+ 1, (flags & IOMMUFD_ACCESS_RW_WRITE) ? FOLL_WRITE : 0, &page,
+ NULL);
+ mmap_read_unlock(pages->source_mm);
+ if (rc != 1) {
+ if (WARN_ON(rc >= 0))
+ rc = -EINVAL;
+ goto out_mmput;
+ }
+ copy_data_page(page, data, offset, length, flags);
+ unpin_user_page(page);
+ rc = 0;
+
+out_mmput:
+ mmput(pages->source_mm);
+ return rc;
+}
+
+/**
+ * iopt_pages_rw_access - Copy to/from a linear slice of the pages
+ * @pages: pages to act on
+ * @start_byte: First byte of pages to copy to/from
+ * @data: Kernel buffer to get/put the data
+ * @length: Number of bytes to copy
+ * @flags: IOMMUFD_ACCESS_RW_* flags
+ *
+ * This will find each page in the range, kmap it and then memcpy to/from
+ * the given kernel buffer.
+ */
+int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte,
+ void *data, unsigned long length, unsigned int flags)
+{
+ unsigned long start_index = start_byte / PAGE_SIZE;
+ unsigned long last_index = (start_byte + length - 1) / PAGE_SIZE;
+ bool change_mm = current->mm != pages->source_mm;
+ int rc = 0;
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ (flags & __IOMMUFD_ACCESS_RW_SLOW_PATH))
+ change_mm = true;
+
+ if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable)
+ return -EPERM;
+
+ if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) {
+ if (start_index == last_index)
+ return iopt_pages_rw_page(pages, start_index,
+ start_byte % PAGE_SIZE, data,
+ length, flags);
+ return iopt_pages_rw_slow(pages, start_index, last_index,
+ start_byte % PAGE_SIZE, data, length,
+ flags);
+ }
+
+ /*
+ * Try to copy using copy_to_user(). We do this as a fast path and
+ * ignore any pinning inconsistencies, unlike a real DMA path.
+ */
+ if (change_mm) {
+ if (!mmget_not_zero(pages->source_mm))
+ return iopt_pages_rw_slow(pages, start_index,
+ last_index,
+ start_byte % PAGE_SIZE, data,
+ length, flags);
+ kthread_use_mm(pages->source_mm);
+ }
+
+ if (flags & IOMMUFD_ACCESS_RW_WRITE) {
+ if (copy_to_user(pages->uptr + start_byte, data, length))
+ rc = -EFAULT;
+ } else {
+ if (copy_from_user(data, pages->uptr + start_byte, length))
+ rc = -EFAULT;
+ }
+
+ if (change_mm) {
+ kthread_unuse_mm(pages->source_mm);
+ mmput(pages->source_mm);
+ }
+
+ return rc;
+}
+
+static struct iopt_pages_access *
+iopt_pages_get_exact_access(struct iopt_pages *pages, unsigned long index,
+ unsigned long last)
+{
+ struct interval_tree_node *node;
+
+ lockdep_assert_held(&pages->mutex);
+
+ /* There can be overlapping ranges in this interval tree */
+ for (node = interval_tree_iter_first(&pages->access_itree, index, last);
+ node; node = interval_tree_iter_next(node, index, last))
+ if (node->start == index && node->last == last)
+ return container_of(node, struct iopt_pages_access,
+ node);
+ return NULL;
+}
+
+/**
+ * iopt_area_add_access() - Record an in-knerel access for PFNs
+ * @area: The source of PFNs
+ * @start_index: First page index
+ * @last_index: Inclusive last page index
+ * @out_pages: Output list of struct page's representing the PFNs
+ * @flags: IOMMUFD_ACCESS_RW_* flags
+ *
+ * Record that an in-kernel access will be accessing the pages, ensure they are
+ * pinned, and return the PFNs as a simple list of 'struct page *'.
+ *
+ * This should be undone through a matching call to iopt_area_remove_access()
+ */
+int iopt_area_add_access(struct iopt_area *area, unsigned long start_index,
+ unsigned long last_index, struct page **out_pages,
+ unsigned int flags)
+{
+ struct iopt_pages *pages = area->pages;
+ struct iopt_pages_access *access;
+ int rc;
+
+ if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable)
+ return -EPERM;
+
+ mutex_lock(&pages->mutex);
+ access = iopt_pages_get_exact_access(pages, start_index, last_index);
+ if (access) {
+ area->num_accesses++;
+ access->users++;
+ iopt_pages_fill_from_xarray(pages, start_index, last_index,
+ out_pages);
+ mutex_unlock(&pages->mutex);
+ return 0;
+ }
+
+ access = kzalloc(sizeof(*access), GFP_KERNEL_ACCOUNT);
+ if (!access) {
+ rc = -ENOMEM;
+ goto err_unlock;
+ }
+
+ rc = iopt_pages_fill_xarray(pages, start_index, last_index, out_pages);
+ if (rc)
+ goto err_free;
+
+ access->node.start = start_index;
+ access->node.last = last_index;
+ access->users = 1;
+ area->num_accesses++;
+ interval_tree_insert(&access->node, &pages->access_itree);
+ mutex_unlock(&pages->mutex);
+ return 0;
+
+err_free:
+ kfree(access);
+err_unlock:
+ mutex_unlock(&pages->mutex);
+ return rc;
+}
+
+/**
+ * iopt_area_remove_access() - Release an in-kernel access for PFNs
+ * @area: The source of PFNs
+ * @start_index: First page index
+ * @last_index: Inclusive last page index
+ *
+ * Undo iopt_area_add_access() and unpin the pages if necessary. The caller
+ * must stop using the PFNs before calling this.
+ */
+void iopt_area_remove_access(struct iopt_area *area, unsigned long start_index,
+ unsigned long last_index)
+{
+ struct iopt_pages *pages = area->pages;
+ struct iopt_pages_access *access;
+
+ mutex_lock(&pages->mutex);
+ access = iopt_pages_get_exact_access(pages, start_index, last_index);
+ if (WARN_ON(!access))
+ goto out_unlock;
+
+ WARN_ON(area->num_accesses == 0 || access->users == 0);
+ area->num_accesses--;
+ access->users--;
+ if (access->users)
+ goto out_unlock;
+
+ interval_tree_remove(&access->node, &pages->access_itree);
+ iopt_pages_unfill_xarray(pages, start_index, last_index);
+ kfree(access);
+out_unlock:
+ mutex_unlock(&pages->mutex);
+}
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
new file mode 100644
index 0000000000..56506d5753
--- /dev/null
+++ b/drivers/iommu/iommufd/selftest.c
@@ -0,0 +1,1107 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
+ *
+ * Kernel side components to support tools/testing/selftests/iommu
+ */
+#include <linux/slab.h>
+#include <linux/iommu.h>
+#include <linux/xarray.h>
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+#include <linux/fault-inject.h>
+#include <linux/platform_device.h>
+#include <uapi/linux/iommufd.h>
+
+#include "../iommu-priv.h"
+#include "io_pagetable.h"
+#include "iommufd_private.h"
+#include "iommufd_test.h"
+
+static DECLARE_FAULT_ATTR(fail_iommufd);
+static struct dentry *dbgfs_root;
+static struct platform_device *selftest_iommu_dev;
+
+size_t iommufd_test_memory_limit = 65536;
+
+enum {
+ MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2,
+
+ /*
+ * Like a real page table alignment requires the low bits of the address
+ * to be zero. xarray also requires the high bit to be zero, so we store
+ * the pfns shifted. The upper bits are used for metadata.
+ */
+ MOCK_PFN_MASK = ULONG_MAX / MOCK_IO_PAGE_SIZE,
+
+ _MOCK_PFN_START = MOCK_PFN_MASK + 1,
+ MOCK_PFN_START_IOVA = _MOCK_PFN_START,
+ MOCK_PFN_LAST_IOVA = _MOCK_PFN_START,
+};
+
+/*
+ * Syzkaller has trouble randomizing the correct iova to use since it is linked
+ * to the map ioctl's output, and it has no ide about that. So, simplify things.
+ * In syzkaller mode the 64 bit IOVA is converted into an nth area and offset
+ * value. This has a much smaller randomization space and syzkaller can hit it.
+ */
+static unsigned long iommufd_test_syz_conv_iova(struct io_pagetable *iopt,
+ u64 *iova)
+{
+ struct syz_layout {
+ __u32 nth_area;
+ __u32 offset;
+ };
+ struct syz_layout *syz = (void *)iova;
+ unsigned int nth = syz->nth_area;
+ struct iopt_area *area;
+
+ down_read(&iopt->iova_rwsem);
+ for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+ area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
+ if (nth == 0) {
+ up_read(&iopt->iova_rwsem);
+ return iopt_area_iova(area) + syz->offset;
+ }
+ nth--;
+ }
+ up_read(&iopt->iova_rwsem);
+
+ return 0;
+}
+
+void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
+ unsigned int ioas_id, u64 *iova, u32 *flags)
+{
+ struct iommufd_ioas *ioas;
+
+ if (!(*flags & MOCK_FLAGS_ACCESS_SYZ))
+ return;
+ *flags &= ~(u32)MOCK_FLAGS_ACCESS_SYZ;
+
+ ioas = iommufd_get_ioas(ucmd->ictx, ioas_id);
+ if (IS_ERR(ioas))
+ return;
+ *iova = iommufd_test_syz_conv_iova(&ioas->iopt, iova);
+ iommufd_put_object(&ioas->obj);
+}
+
+struct mock_iommu_domain {
+ struct iommu_domain domain;
+ struct xarray pfns;
+};
+
+enum selftest_obj_type {
+ TYPE_IDEV,
+};
+
+struct mock_dev {
+ struct device dev;
+};
+
+struct selftest_obj {
+ struct iommufd_object obj;
+ enum selftest_obj_type type;
+
+ union {
+ struct {
+ struct iommufd_device *idev;
+ struct iommufd_ctx *ictx;
+ struct mock_dev *mock_dev;
+ } idev;
+ };
+};
+
+static void mock_domain_blocking_free(struct iommu_domain *domain)
+{
+}
+
+static int mock_domain_nop_attach(struct iommu_domain *domain,
+ struct device *dev)
+{
+ return 0;
+}
+
+static const struct iommu_domain_ops mock_blocking_ops = {
+ .free = mock_domain_blocking_free,
+ .attach_dev = mock_domain_nop_attach,
+};
+
+static struct iommu_domain mock_blocking_domain = {
+ .type = IOMMU_DOMAIN_BLOCKED,
+ .ops = &mock_blocking_ops,
+};
+
+static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type)
+{
+ struct iommu_test_hw_info *info;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return ERR_PTR(-ENOMEM);
+
+ info->test_reg = IOMMU_HW_INFO_SELFTEST_REGVAL;
+ *length = sizeof(*info);
+ *type = IOMMU_HW_INFO_TYPE_SELFTEST;
+
+ return info;
+}
+
+static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type)
+{
+ struct mock_iommu_domain *mock;
+
+ if (iommu_domain_type == IOMMU_DOMAIN_BLOCKED)
+ return &mock_blocking_domain;
+
+ if (iommu_domain_type != IOMMU_DOMAIN_UNMANAGED)
+ return NULL;
+
+ mock = kzalloc(sizeof(*mock), GFP_KERNEL);
+ if (!mock)
+ return NULL;
+ mock->domain.geometry.aperture_start = MOCK_APERTURE_START;
+ mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST;
+ mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE;
+ xa_init(&mock->pfns);
+ return &mock->domain;
+}
+
+static void mock_domain_free(struct iommu_domain *domain)
+{
+ struct mock_iommu_domain *mock =
+ container_of(domain, struct mock_iommu_domain, domain);
+
+ WARN_ON(!xa_empty(&mock->pfns));
+ kfree(mock);
+}
+
+static int mock_domain_map_pages(struct iommu_domain *domain,
+ unsigned long iova, phys_addr_t paddr,
+ size_t pgsize, size_t pgcount, int prot,
+ gfp_t gfp, size_t *mapped)
+{
+ struct mock_iommu_domain *mock =
+ container_of(domain, struct mock_iommu_domain, domain);
+ unsigned long flags = MOCK_PFN_START_IOVA;
+ unsigned long start_iova = iova;
+
+ /*
+ * xarray does not reliably work with fault injection because it does a
+ * retry allocation, so put our own failure point.
+ */
+ if (iommufd_should_fail())
+ return -ENOENT;
+
+ WARN_ON(iova % MOCK_IO_PAGE_SIZE);
+ WARN_ON(pgsize % MOCK_IO_PAGE_SIZE);
+ for (; pgcount; pgcount--) {
+ size_t cur;
+
+ for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
+ void *old;
+
+ if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize)
+ flags = MOCK_PFN_LAST_IOVA;
+ old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE,
+ xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) |
+ flags),
+ gfp);
+ if (xa_is_err(old)) {
+ for (; start_iova != iova;
+ start_iova += MOCK_IO_PAGE_SIZE)
+ xa_erase(&mock->pfns,
+ start_iova /
+ MOCK_IO_PAGE_SIZE);
+ return xa_err(old);
+ }
+ WARN_ON(old);
+ iova += MOCK_IO_PAGE_SIZE;
+ paddr += MOCK_IO_PAGE_SIZE;
+ *mapped += MOCK_IO_PAGE_SIZE;
+ flags = 0;
+ }
+ }
+ return 0;
+}
+
+static size_t mock_domain_unmap_pages(struct iommu_domain *domain,
+ unsigned long iova, size_t pgsize,
+ size_t pgcount,
+ struct iommu_iotlb_gather *iotlb_gather)
+{
+ struct mock_iommu_domain *mock =
+ container_of(domain, struct mock_iommu_domain, domain);
+ bool first = true;
+ size_t ret = 0;
+ void *ent;
+
+ WARN_ON(iova % MOCK_IO_PAGE_SIZE);
+ WARN_ON(pgsize % MOCK_IO_PAGE_SIZE);
+
+ for (; pgcount; pgcount--) {
+ size_t cur;
+
+ for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
+ ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
+ WARN_ON(!ent);
+ /*
+ * iommufd generates unmaps that must be a strict
+ * superset of the map's performend So every starting
+ * IOVA should have been an iova passed to map, and the
+ *
+ * First IOVA must be present and have been a first IOVA
+ * passed to map_pages
+ */
+ if (first) {
+ WARN_ON(!(xa_to_value(ent) &
+ MOCK_PFN_START_IOVA));
+ first = false;
+ }
+ if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize)
+ WARN_ON(!(xa_to_value(ent) &
+ MOCK_PFN_LAST_IOVA));
+
+ iova += MOCK_IO_PAGE_SIZE;
+ ret += MOCK_IO_PAGE_SIZE;
+ }
+ }
+ return ret;
+}
+
+static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct mock_iommu_domain *mock =
+ container_of(domain, struct mock_iommu_domain, domain);
+ void *ent;
+
+ WARN_ON(iova % MOCK_IO_PAGE_SIZE);
+ ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
+ WARN_ON(!ent);
+ return (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE;
+}
+
+static bool mock_domain_capable(struct device *dev, enum iommu_cap cap)
+{
+ return cap == IOMMU_CAP_CACHE_COHERENCY;
+}
+
+static void mock_domain_set_plaform_dma_ops(struct device *dev)
+{
+ /*
+ * mock doesn't setup default domains because we can't hook into the
+ * normal probe path
+ */
+}
+
+static struct iommu_device mock_iommu_device = {
+};
+
+static struct iommu_device *mock_probe_device(struct device *dev)
+{
+ return &mock_iommu_device;
+}
+
+static const struct iommu_ops mock_ops = {
+ .owner = THIS_MODULE,
+ .pgsize_bitmap = MOCK_IO_PAGE_SIZE,
+ .hw_info = mock_domain_hw_info,
+ .domain_alloc = mock_domain_alloc,
+ .capable = mock_domain_capable,
+ .set_platform_dma_ops = mock_domain_set_plaform_dma_ops,
+ .device_group = generic_device_group,
+ .probe_device = mock_probe_device,
+ .default_domain_ops =
+ &(struct iommu_domain_ops){
+ .free = mock_domain_free,
+ .attach_dev = mock_domain_nop_attach,
+ .map_pages = mock_domain_map_pages,
+ .unmap_pages = mock_domain_unmap_pages,
+ .iova_to_phys = mock_domain_iova_to_phys,
+ },
+};
+
+static inline struct iommufd_hw_pagetable *
+get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
+ struct mock_iommu_domain **mock)
+{
+ struct iommufd_hw_pagetable *hwpt;
+ struct iommufd_object *obj;
+
+ obj = iommufd_get_object(ucmd->ictx, mockpt_id,
+ IOMMUFD_OBJ_HW_PAGETABLE);
+ if (IS_ERR(obj))
+ return ERR_CAST(obj);
+ hwpt = container_of(obj, struct iommufd_hw_pagetable, obj);
+ if (hwpt->domain->ops != mock_ops.default_domain_ops) {
+ iommufd_put_object(&hwpt->obj);
+ return ERR_PTR(-EINVAL);
+ }
+ *mock = container_of(hwpt->domain, struct mock_iommu_domain, domain);
+ return hwpt;
+}
+
+struct mock_bus_type {
+ struct bus_type bus;
+ struct notifier_block nb;
+};
+
+static struct mock_bus_type iommufd_mock_bus_type = {
+ .bus = {
+ .name = "iommufd_mock",
+ },
+};
+
+static atomic_t mock_dev_num;
+
+static void mock_dev_release(struct device *dev)
+{
+ struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+
+ atomic_dec(&mock_dev_num);
+ kfree(mdev);
+}
+
+static struct mock_dev *mock_dev_create(void)
+{
+ struct mock_dev *mdev;
+ int rc;
+
+ mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
+ if (!mdev)
+ return ERR_PTR(-ENOMEM);
+
+ device_initialize(&mdev->dev);
+ mdev->dev.release = mock_dev_release;
+ mdev->dev.bus = &iommufd_mock_bus_type.bus;
+
+ rc = dev_set_name(&mdev->dev, "iommufd_mock%u",
+ atomic_inc_return(&mock_dev_num));
+ if (rc)
+ goto err_put;
+
+ rc = device_add(&mdev->dev);
+ if (rc)
+ goto err_put;
+ return mdev;
+
+err_put:
+ put_device(&mdev->dev);
+ return ERR_PTR(rc);
+}
+
+static void mock_dev_destroy(struct mock_dev *mdev)
+{
+ device_unregister(&mdev->dev);
+}
+
+bool iommufd_selftest_is_mock_dev(struct device *dev)
+{
+ return dev->release == mock_dev_release;
+}
+
+/* Create an hw_pagetable with the mock domain so we can test the domain ops */
+static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd,
+ struct iommu_test_cmd *cmd)
+{
+ struct iommufd_device *idev;
+ struct selftest_obj *sobj;
+ u32 pt_id = cmd->id;
+ u32 idev_id;
+ int rc;
+
+ sobj = iommufd_object_alloc(ucmd->ictx, sobj, IOMMUFD_OBJ_SELFTEST);
+ if (IS_ERR(sobj))
+ return PTR_ERR(sobj);
+
+ sobj->idev.ictx = ucmd->ictx;
+ sobj->type = TYPE_IDEV;
+
+ sobj->idev.mock_dev = mock_dev_create();
+ if (IS_ERR(sobj->idev.mock_dev)) {
+ rc = PTR_ERR(sobj->idev.mock_dev);
+ goto out_sobj;
+ }
+
+ idev = iommufd_device_bind(ucmd->ictx, &sobj->idev.mock_dev->dev,
+ &idev_id);
+ if (IS_ERR(idev)) {
+ rc = PTR_ERR(idev);
+ goto out_mdev;
+ }
+ sobj->idev.idev = idev;
+
+ rc = iommufd_device_attach(idev, &pt_id);
+ if (rc)
+ goto out_unbind;
+
+ /* Userspace must destroy the device_id to destroy the object */
+ cmd->mock_domain.out_hwpt_id = pt_id;
+ cmd->mock_domain.out_stdev_id = sobj->obj.id;
+ cmd->mock_domain.out_idev_id = idev_id;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_detach;
+ iommufd_object_finalize(ucmd->ictx, &sobj->obj);
+ return 0;
+
+out_detach:
+ iommufd_device_detach(idev);
+out_unbind:
+ iommufd_device_unbind(idev);
+out_mdev:
+ mock_dev_destroy(sobj->idev.mock_dev);
+out_sobj:
+ iommufd_object_abort(ucmd->ictx, &sobj->obj);
+ return rc;
+}
+
+/* Replace the mock domain with a manually allocated hw_pagetable */
+static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd,
+ unsigned int device_id, u32 pt_id,
+ struct iommu_test_cmd *cmd)
+{
+ struct iommufd_object *dev_obj;
+ struct selftest_obj *sobj;
+ int rc;
+
+ /*
+ * Prefer to use the OBJ_SELFTEST because the destroy_rwsem will ensure
+ * it doesn't race with detach, which is not allowed.
+ */
+ dev_obj =
+ iommufd_get_object(ucmd->ictx, device_id, IOMMUFD_OBJ_SELFTEST);
+ if (IS_ERR(dev_obj))
+ return PTR_ERR(dev_obj);
+
+ sobj = container_of(dev_obj, struct selftest_obj, obj);
+ if (sobj->type != TYPE_IDEV) {
+ rc = -EINVAL;
+ goto out_dev_obj;
+ }
+
+ rc = iommufd_device_replace(sobj->idev.idev, &pt_id);
+ if (rc)
+ goto out_dev_obj;
+
+ cmd->mock_domain_replace.pt_id = pt_id;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+
+out_dev_obj:
+ iommufd_put_object(dev_obj);
+ return rc;
+}
+
+/* Add an additional reserved IOVA to the IOAS */
+static int iommufd_test_add_reserved(struct iommufd_ucmd *ucmd,
+ unsigned int mockpt_id,
+ unsigned long start, size_t length)
+{
+ struct iommufd_ioas *ioas;
+ int rc;
+
+ ioas = iommufd_get_ioas(ucmd->ictx, mockpt_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+ down_write(&ioas->iopt.iova_rwsem);
+ rc = iopt_reserve_iova(&ioas->iopt, start, start + length - 1, NULL);
+ up_write(&ioas->iopt.iova_rwsem);
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+/* Check that every pfn under each iova matches the pfn under a user VA */
+static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd,
+ unsigned int mockpt_id, unsigned long iova,
+ size_t length, void __user *uptr)
+{
+ struct iommufd_hw_pagetable *hwpt;
+ struct mock_iommu_domain *mock;
+ uintptr_t end;
+ int rc;
+
+ if (iova % MOCK_IO_PAGE_SIZE || length % MOCK_IO_PAGE_SIZE ||
+ (uintptr_t)uptr % MOCK_IO_PAGE_SIZE ||
+ check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end))
+ return -EINVAL;
+
+ hwpt = get_md_pagetable(ucmd, mockpt_id, &mock);
+ if (IS_ERR(hwpt))
+ return PTR_ERR(hwpt);
+
+ for (; length; length -= MOCK_IO_PAGE_SIZE) {
+ struct page *pages[1];
+ unsigned long pfn;
+ long npages;
+ void *ent;
+
+ npages = get_user_pages_fast((uintptr_t)uptr & PAGE_MASK, 1, 0,
+ pages);
+ if (npages < 0) {
+ rc = npages;
+ goto out_put;
+ }
+ if (WARN_ON(npages != 1)) {
+ rc = -EFAULT;
+ goto out_put;
+ }
+ pfn = page_to_pfn(pages[0]);
+ put_page(pages[0]);
+
+ ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
+ if (!ent ||
+ (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE !=
+ pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) {
+ rc = -EINVAL;
+ goto out_put;
+ }
+ iova += MOCK_IO_PAGE_SIZE;
+ uptr += MOCK_IO_PAGE_SIZE;
+ }
+ rc = 0;
+
+out_put:
+ iommufd_put_object(&hwpt->obj);
+ return rc;
+}
+
+/* Check that the page ref count matches, to look for missing pin/unpins */
+static int iommufd_test_md_check_refs(struct iommufd_ucmd *ucmd,
+ void __user *uptr, size_t length,
+ unsigned int refs)
+{
+ uintptr_t end;
+
+ if (length % PAGE_SIZE || (uintptr_t)uptr % PAGE_SIZE ||
+ check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end))
+ return -EINVAL;
+
+ for (; length; length -= PAGE_SIZE) {
+ struct page *pages[1];
+ long npages;
+
+ npages = get_user_pages_fast((uintptr_t)uptr, 1, 0, pages);
+ if (npages < 0)
+ return npages;
+ if (WARN_ON(npages != 1))
+ return -EFAULT;
+ if (!PageCompound(pages[0])) {
+ unsigned int count;
+
+ count = page_ref_count(pages[0]);
+ if (count / GUP_PIN_COUNTING_BIAS != refs) {
+ put_page(pages[0]);
+ return -EIO;
+ }
+ }
+ put_page(pages[0]);
+ uptr += PAGE_SIZE;
+ }
+ return 0;
+}
+
+struct selftest_access {
+ struct iommufd_access *access;
+ struct file *file;
+ struct mutex lock;
+ struct list_head items;
+ unsigned int next_id;
+ bool destroying;
+};
+
+struct selftest_access_item {
+ struct list_head items_elm;
+ unsigned long iova;
+ size_t length;
+ unsigned int id;
+};
+
+static const struct file_operations iommfd_test_staccess_fops;
+
+static struct selftest_access *iommufd_access_get(int fd)
+{
+ struct file *file;
+
+ file = fget(fd);
+ if (!file)
+ return ERR_PTR(-EBADFD);
+
+ if (file->f_op != &iommfd_test_staccess_fops) {
+ fput(file);
+ return ERR_PTR(-EBADFD);
+ }
+ return file->private_data;
+}
+
+static void iommufd_test_access_unmap(void *data, unsigned long iova,
+ unsigned long length)
+{
+ unsigned long iova_last = iova + length - 1;
+ struct selftest_access *staccess = data;
+ struct selftest_access_item *item;
+ struct selftest_access_item *tmp;
+
+ mutex_lock(&staccess->lock);
+ list_for_each_entry_safe(item, tmp, &staccess->items, items_elm) {
+ if (iova > item->iova + item->length - 1 ||
+ iova_last < item->iova)
+ continue;
+ list_del(&item->items_elm);
+ iommufd_access_unpin_pages(staccess->access, item->iova,
+ item->length);
+ kfree(item);
+ }
+ mutex_unlock(&staccess->lock);
+}
+
+static int iommufd_test_access_item_destroy(struct iommufd_ucmd *ucmd,
+ unsigned int access_id,
+ unsigned int item_id)
+{
+ struct selftest_access_item *item;
+ struct selftest_access *staccess;
+
+ staccess = iommufd_access_get(access_id);
+ if (IS_ERR(staccess))
+ return PTR_ERR(staccess);
+
+ mutex_lock(&staccess->lock);
+ list_for_each_entry(item, &staccess->items, items_elm) {
+ if (item->id == item_id) {
+ list_del(&item->items_elm);
+ iommufd_access_unpin_pages(staccess->access, item->iova,
+ item->length);
+ mutex_unlock(&staccess->lock);
+ kfree(item);
+ fput(staccess->file);
+ return 0;
+ }
+ }
+ mutex_unlock(&staccess->lock);
+ fput(staccess->file);
+ return -ENOENT;
+}
+
+static int iommufd_test_staccess_release(struct inode *inode,
+ struct file *filep)
+{
+ struct selftest_access *staccess = filep->private_data;
+
+ if (staccess->access) {
+ iommufd_test_access_unmap(staccess, 0, ULONG_MAX);
+ iommufd_access_destroy(staccess->access);
+ }
+ mutex_destroy(&staccess->lock);
+ kfree(staccess);
+ return 0;
+}
+
+static const struct iommufd_access_ops selftest_access_ops_pin = {
+ .needs_pin_pages = 1,
+ .unmap = iommufd_test_access_unmap,
+};
+
+static const struct iommufd_access_ops selftest_access_ops = {
+ .unmap = iommufd_test_access_unmap,
+};
+
+static const struct file_operations iommfd_test_staccess_fops = {
+ .release = iommufd_test_staccess_release,
+};
+
+static struct selftest_access *iommufd_test_alloc_access(void)
+{
+ struct selftest_access *staccess;
+ struct file *filep;
+
+ staccess = kzalloc(sizeof(*staccess), GFP_KERNEL_ACCOUNT);
+ if (!staccess)
+ return ERR_PTR(-ENOMEM);
+ INIT_LIST_HEAD(&staccess->items);
+ mutex_init(&staccess->lock);
+
+ filep = anon_inode_getfile("[iommufd_test_staccess]",
+ &iommfd_test_staccess_fops, staccess,
+ O_RDWR);
+ if (IS_ERR(filep)) {
+ kfree(staccess);
+ return ERR_CAST(filep);
+ }
+ staccess->file = filep;
+ return staccess;
+}
+
+static int iommufd_test_create_access(struct iommufd_ucmd *ucmd,
+ unsigned int ioas_id, unsigned int flags)
+{
+ struct iommu_test_cmd *cmd = ucmd->cmd;
+ struct selftest_access *staccess;
+ struct iommufd_access *access;
+ u32 id;
+ int fdno;
+ int rc;
+
+ if (flags & ~MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES)
+ return -EOPNOTSUPP;
+
+ staccess = iommufd_test_alloc_access();
+ if (IS_ERR(staccess))
+ return PTR_ERR(staccess);
+
+ fdno = get_unused_fd_flags(O_CLOEXEC);
+ if (fdno < 0) {
+ rc = -ENOMEM;
+ goto out_free_staccess;
+ }
+
+ access = iommufd_access_create(
+ ucmd->ictx,
+ (flags & MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES) ?
+ &selftest_access_ops_pin :
+ &selftest_access_ops,
+ staccess, &id);
+ if (IS_ERR(access)) {
+ rc = PTR_ERR(access);
+ goto out_put_fdno;
+ }
+ rc = iommufd_access_attach(access, ioas_id);
+ if (rc)
+ goto out_destroy;
+ cmd->create_access.out_access_fd = fdno;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_destroy;
+
+ staccess->access = access;
+ fd_install(fdno, staccess->file);
+ return 0;
+
+out_destroy:
+ iommufd_access_destroy(access);
+out_put_fdno:
+ put_unused_fd(fdno);
+out_free_staccess:
+ fput(staccess->file);
+ return rc;
+}
+
+static int iommufd_test_access_replace_ioas(struct iommufd_ucmd *ucmd,
+ unsigned int access_id,
+ unsigned int ioas_id)
+{
+ struct selftest_access *staccess;
+ int rc;
+
+ staccess = iommufd_access_get(access_id);
+ if (IS_ERR(staccess))
+ return PTR_ERR(staccess);
+
+ rc = iommufd_access_replace(staccess->access, ioas_id);
+ fput(staccess->file);
+ return rc;
+}
+
+/* Check that the pages in a page array match the pages in the user VA */
+static int iommufd_test_check_pages(void __user *uptr, struct page **pages,
+ size_t npages)
+{
+ for (; npages; npages--) {
+ struct page *tmp_pages[1];
+ long rc;
+
+ rc = get_user_pages_fast((uintptr_t)uptr, 1, 0, tmp_pages);
+ if (rc < 0)
+ return rc;
+ if (WARN_ON(rc != 1))
+ return -EFAULT;
+ put_page(tmp_pages[0]);
+ if (tmp_pages[0] != *pages)
+ return -EBADE;
+ pages++;
+ uptr += PAGE_SIZE;
+ }
+ return 0;
+}
+
+static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd,
+ unsigned int access_id, unsigned long iova,
+ size_t length, void __user *uptr,
+ u32 flags)
+{
+ struct iommu_test_cmd *cmd = ucmd->cmd;
+ struct selftest_access_item *item;
+ struct selftest_access *staccess;
+ struct page **pages;
+ size_t npages;
+ int rc;
+
+ /* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */
+ if (length > 16*1024*1024)
+ return -ENOMEM;
+
+ if (flags & ~(MOCK_FLAGS_ACCESS_WRITE | MOCK_FLAGS_ACCESS_SYZ))
+ return -EOPNOTSUPP;
+
+ staccess = iommufd_access_get(access_id);
+ if (IS_ERR(staccess))
+ return PTR_ERR(staccess);
+
+ if (staccess->access->ops != &selftest_access_ops_pin) {
+ rc = -EOPNOTSUPP;
+ goto out_put;
+ }
+
+ if (flags & MOCK_FLAGS_ACCESS_SYZ)
+ iova = iommufd_test_syz_conv_iova(&staccess->access->ioas->iopt,
+ &cmd->access_pages.iova);
+
+ npages = (ALIGN(iova + length, PAGE_SIZE) -
+ ALIGN_DOWN(iova, PAGE_SIZE)) /
+ PAGE_SIZE;
+ pages = kvcalloc(npages, sizeof(*pages), GFP_KERNEL_ACCOUNT);
+ if (!pages) {
+ rc = -ENOMEM;
+ goto out_put;
+ }
+
+ /*
+ * Drivers will need to think very carefully about this locking. The
+ * core code can do multiple unmaps instantaneously after
+ * iommufd_access_pin_pages() and *all* the unmaps must not return until
+ * the range is unpinned. This simple implementation puts a global lock
+ * around the pin, which may not suit drivers that want this to be a
+ * performance path. drivers that get this wrong will trigger WARN_ON
+ * races and cause EDEADLOCK failures to userspace.
+ */
+ mutex_lock(&staccess->lock);
+ rc = iommufd_access_pin_pages(staccess->access, iova, length, pages,
+ flags & MOCK_FLAGS_ACCESS_WRITE);
+ if (rc)
+ goto out_unlock;
+
+ /* For syzkaller allow uptr to be NULL to skip this check */
+ if (uptr) {
+ rc = iommufd_test_check_pages(
+ uptr - (iova - ALIGN_DOWN(iova, PAGE_SIZE)), pages,
+ npages);
+ if (rc)
+ goto out_unaccess;
+ }
+
+ item = kzalloc(sizeof(*item), GFP_KERNEL_ACCOUNT);
+ if (!item) {
+ rc = -ENOMEM;
+ goto out_unaccess;
+ }
+
+ item->iova = iova;
+ item->length = length;
+ item->id = staccess->next_id++;
+ list_add_tail(&item->items_elm, &staccess->items);
+
+ cmd->access_pages.out_access_pages_id = item->id;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_free_item;
+ goto out_unlock;
+
+out_free_item:
+ list_del(&item->items_elm);
+ kfree(item);
+out_unaccess:
+ iommufd_access_unpin_pages(staccess->access, iova, length);
+out_unlock:
+ mutex_unlock(&staccess->lock);
+ kvfree(pages);
+out_put:
+ fput(staccess->file);
+ return rc;
+}
+
+static int iommufd_test_access_rw(struct iommufd_ucmd *ucmd,
+ unsigned int access_id, unsigned long iova,
+ size_t length, void __user *ubuf,
+ unsigned int flags)
+{
+ struct iommu_test_cmd *cmd = ucmd->cmd;
+ struct selftest_access *staccess;
+ void *tmp;
+ int rc;
+
+ /* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */
+ if (length > 16*1024*1024)
+ return -ENOMEM;
+
+ if (flags & ~(MOCK_ACCESS_RW_WRITE | MOCK_ACCESS_RW_SLOW_PATH |
+ MOCK_FLAGS_ACCESS_SYZ))
+ return -EOPNOTSUPP;
+
+ staccess = iommufd_access_get(access_id);
+ if (IS_ERR(staccess))
+ return PTR_ERR(staccess);
+
+ tmp = kvzalloc(length, GFP_KERNEL_ACCOUNT);
+ if (!tmp) {
+ rc = -ENOMEM;
+ goto out_put;
+ }
+
+ if (flags & MOCK_ACCESS_RW_WRITE) {
+ if (copy_from_user(tmp, ubuf, length)) {
+ rc = -EFAULT;
+ goto out_free;
+ }
+ }
+
+ if (flags & MOCK_FLAGS_ACCESS_SYZ)
+ iova = iommufd_test_syz_conv_iova(&staccess->access->ioas->iopt,
+ &cmd->access_rw.iova);
+
+ rc = iommufd_access_rw(staccess->access, iova, tmp, length, flags);
+ if (rc)
+ goto out_free;
+ if (!(flags & MOCK_ACCESS_RW_WRITE)) {
+ if (copy_to_user(ubuf, tmp, length)) {
+ rc = -EFAULT;
+ goto out_free;
+ }
+ }
+
+out_free:
+ kvfree(tmp);
+out_put:
+ fput(staccess->file);
+ return rc;
+}
+static_assert((unsigned int)MOCK_ACCESS_RW_WRITE == IOMMUFD_ACCESS_RW_WRITE);
+static_assert((unsigned int)MOCK_ACCESS_RW_SLOW_PATH ==
+ __IOMMUFD_ACCESS_RW_SLOW_PATH);
+
+void iommufd_selftest_destroy(struct iommufd_object *obj)
+{
+ struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj);
+
+ switch (sobj->type) {
+ case TYPE_IDEV:
+ iommufd_device_detach(sobj->idev.idev);
+ iommufd_device_unbind(sobj->idev.idev);
+ mock_dev_destroy(sobj->idev.mock_dev);
+ break;
+ }
+}
+
+int iommufd_test(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_test_cmd *cmd = ucmd->cmd;
+
+ switch (cmd->op) {
+ case IOMMU_TEST_OP_ADD_RESERVED:
+ return iommufd_test_add_reserved(ucmd, cmd->id,
+ cmd->add_reserved.start,
+ cmd->add_reserved.length);
+ case IOMMU_TEST_OP_MOCK_DOMAIN:
+ return iommufd_test_mock_domain(ucmd, cmd);
+ case IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE:
+ return iommufd_test_mock_domain_replace(
+ ucmd, cmd->id, cmd->mock_domain_replace.pt_id, cmd);
+ case IOMMU_TEST_OP_MD_CHECK_MAP:
+ return iommufd_test_md_check_pa(
+ ucmd, cmd->id, cmd->check_map.iova,
+ cmd->check_map.length,
+ u64_to_user_ptr(cmd->check_map.uptr));
+ case IOMMU_TEST_OP_MD_CHECK_REFS:
+ return iommufd_test_md_check_refs(
+ ucmd, u64_to_user_ptr(cmd->check_refs.uptr),
+ cmd->check_refs.length, cmd->check_refs.refs);
+ case IOMMU_TEST_OP_CREATE_ACCESS:
+ return iommufd_test_create_access(ucmd, cmd->id,
+ cmd->create_access.flags);
+ case IOMMU_TEST_OP_ACCESS_REPLACE_IOAS:
+ return iommufd_test_access_replace_ioas(
+ ucmd, cmd->id, cmd->access_replace_ioas.ioas_id);
+ case IOMMU_TEST_OP_ACCESS_PAGES:
+ return iommufd_test_access_pages(
+ ucmd, cmd->id, cmd->access_pages.iova,
+ cmd->access_pages.length,
+ u64_to_user_ptr(cmd->access_pages.uptr),
+ cmd->access_pages.flags);
+ case IOMMU_TEST_OP_ACCESS_RW:
+ return iommufd_test_access_rw(
+ ucmd, cmd->id, cmd->access_rw.iova,
+ cmd->access_rw.length,
+ u64_to_user_ptr(cmd->access_rw.uptr),
+ cmd->access_rw.flags);
+ case IOMMU_TEST_OP_DESTROY_ACCESS_PAGES:
+ return iommufd_test_access_item_destroy(
+ ucmd, cmd->id, cmd->destroy_access_pages.access_pages_id);
+ case IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT:
+ /* Protect _batch_init(), can not be less than elmsz */
+ if (cmd->memory_limit.limit <
+ sizeof(unsigned long) + sizeof(u32))
+ return -EINVAL;
+ iommufd_test_memory_limit = cmd->memory_limit.limit;
+ return 0;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+bool iommufd_should_fail(void)
+{
+ return should_fail(&fail_iommufd, 1);
+}
+
+int __init iommufd_test_init(void)
+{
+ struct platform_device_info pdevinfo = {
+ .name = "iommufd_selftest_iommu",
+ };
+ int rc;
+
+ dbgfs_root =
+ fault_create_debugfs_attr("fail_iommufd", NULL, &fail_iommufd);
+
+ selftest_iommu_dev = platform_device_register_full(&pdevinfo);
+ if (IS_ERR(selftest_iommu_dev)) {
+ rc = PTR_ERR(selftest_iommu_dev);
+ goto err_dbgfs;
+ }
+
+ rc = bus_register(&iommufd_mock_bus_type.bus);
+ if (rc)
+ goto err_platform;
+
+ rc = iommu_device_sysfs_add(&mock_iommu_device,
+ &selftest_iommu_dev->dev, NULL, "%s",
+ dev_name(&selftest_iommu_dev->dev));
+ if (rc)
+ goto err_bus;
+
+ rc = iommu_device_register_bus(&mock_iommu_device, &mock_ops,
+ &iommufd_mock_bus_type.bus,
+ &iommufd_mock_bus_type.nb);
+ if (rc)
+ goto err_sysfs;
+ return 0;
+
+err_sysfs:
+ iommu_device_sysfs_remove(&mock_iommu_device);
+err_bus:
+ bus_unregister(&iommufd_mock_bus_type.bus);
+err_platform:
+ platform_device_unregister(selftest_iommu_dev);
+err_dbgfs:
+ debugfs_remove_recursive(dbgfs_root);
+ return rc;
+}
+
+void iommufd_test_exit(void)
+{
+ iommu_device_sysfs_remove(&mock_iommu_device);
+ iommu_device_unregister_bus(&mock_iommu_device,
+ &iommufd_mock_bus_type.bus,
+ &iommufd_mock_bus_type.nb);
+ bus_unregister(&iommufd_mock_bus_type.bus);
+ platform_device_unregister(selftest_iommu_dev);
+ debugfs_remove_recursive(dbgfs_root);
+}
diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c
new file mode 100644
index 0000000000..6c810bf80f
--- /dev/null
+++ b/drivers/iommu/iommufd/vfio_compat.c
@@ -0,0 +1,541 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ */
+#include <linux/file.h>
+#include <linux/interval_tree.h>
+#include <linux/iommu.h>
+#include <linux/iommufd.h>
+#include <linux/slab.h>
+#include <linux/vfio.h>
+#include <uapi/linux/vfio.h>
+#include <uapi/linux/iommufd.h>
+
+#include "iommufd_private.h"
+
+static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx)
+{
+ struct iommufd_ioas *ioas = ERR_PTR(-ENODEV);
+
+ xa_lock(&ictx->objects);
+ if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj))
+ goto out_unlock;
+ ioas = ictx->vfio_ioas;
+out_unlock:
+ xa_unlock(&ictx->objects);
+ return ioas;
+}
+
+/**
+ * iommufd_vfio_compat_ioas_get_id - Ensure a compat IOAS exists
+ * @ictx: Context to operate on
+ * @out_ioas_id: The IOAS ID of the compatibility IOAS
+ *
+ * Return the ID of the current compatibility IOAS. The ID can be passed into
+ * other functions that take an ioas_id.
+ */
+int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
+{
+ struct iommufd_ioas *ioas;
+
+ ioas = get_compat_ioas(ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+ *out_ioas_id = ioas->obj.id;
+ iommufd_put_object(&ioas->obj);
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, IOMMUFD_VFIO);
+
+/**
+ * iommufd_vfio_compat_set_no_iommu - Called when a no-iommu device is attached
+ * @ictx: Context to operate on
+ *
+ * This allows selecting the VFIO_NOIOMMU_IOMMU and blocks normal types.
+ */
+int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx)
+{
+ int ret;
+
+ xa_lock(&ictx->objects);
+ if (!ictx->vfio_ioas) {
+ ictx->no_iommu_mode = 1;
+ ret = 0;
+ } else {
+ ret = -EINVAL;
+ }
+ xa_unlock(&ictx->objects);
+ return ret;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu, IOMMUFD_VFIO);
+
+/**
+ * iommufd_vfio_compat_ioas_create - Ensure the compat IOAS is created
+ * @ictx: Context to operate on
+ *
+ * The compatibility IOAS is the IOAS that the vfio compatibility ioctls operate
+ * on since they do not have an IOAS ID input in their ABI. Only attaching a
+ * group should cause a default creation of the internal ioas, this does nothing
+ * if an existing ioas has already been assigned somehow.
+ */
+int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx)
+{
+ struct iommufd_ioas *ioas = NULL;
+ int ret;
+
+ ioas = iommufd_ioas_alloc(ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ xa_lock(&ictx->objects);
+ /*
+ * VFIO won't allow attaching a container to both iommu and no iommu
+ * operation
+ */
+ if (ictx->no_iommu_mode) {
+ ret = -EINVAL;
+ goto out_abort;
+ }
+
+ if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) {
+ ret = 0;
+ iommufd_put_object(&ictx->vfio_ioas->obj);
+ goto out_abort;
+ }
+ ictx->vfio_ioas = ioas;
+ xa_unlock(&ictx->objects);
+
+ /*
+ * An automatically created compat IOAS is treated as a userspace
+ * created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET,
+ * and if not manually destroyed it will be destroyed automatically
+ * at iommufd release.
+ */
+ iommufd_object_finalize(ictx, &ioas->obj);
+ return 0;
+
+out_abort:
+ xa_unlock(&ictx->objects);
+ iommufd_object_abort(ictx, &ioas->obj);
+ return ret;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create, IOMMUFD_VFIO);
+
+int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_vfio_ioas *cmd = ucmd->cmd;
+ struct iommufd_ioas *ioas;
+
+ if (cmd->__reserved)
+ return -EOPNOTSUPP;
+ switch (cmd->op) {
+ case IOMMU_VFIO_IOAS_GET:
+ ioas = get_compat_ioas(ucmd->ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+ cmd->ioas_id = ioas->obj.id;
+ iommufd_put_object(&ioas->obj);
+ return iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+
+ case IOMMU_VFIO_IOAS_SET:
+ ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+ xa_lock(&ucmd->ictx->objects);
+ ucmd->ictx->vfio_ioas = ioas;
+ xa_unlock(&ucmd->ictx->objects);
+ iommufd_put_object(&ioas->obj);
+ return 0;
+
+ case IOMMU_VFIO_IOAS_CLEAR:
+ xa_lock(&ucmd->ictx->objects);
+ ucmd->ictx->vfio_ioas = NULL;
+ xa_unlock(&ucmd->ictx->objects);
+ return 0;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd,
+ void __user *arg)
+{
+ u32 supported_flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+ size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
+ struct vfio_iommu_type1_dma_map map;
+ int iommu_prot = IOMMU_CACHE;
+ struct iommufd_ioas *ioas;
+ unsigned long iova;
+ int rc;
+
+ if (copy_from_user(&map, arg, minsz))
+ return -EFAULT;
+
+ if (map.argsz < minsz || map.flags & ~supported_flags)
+ return -EINVAL;
+
+ if (map.flags & VFIO_DMA_MAP_FLAG_READ)
+ iommu_prot |= IOMMU_READ;
+ if (map.flags & VFIO_DMA_MAP_FLAG_WRITE)
+ iommu_prot |= IOMMU_WRITE;
+
+ ioas = get_compat_ioas(ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ /*
+ * Maps created through the legacy interface always use VFIO compatible
+ * rlimit accounting. If the user wishes to use the faster user based
+ * rlimit accounting then they must use the new interface.
+ */
+ iova = map.iova;
+ rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, u64_to_user_ptr(map.vaddr),
+ map.size, iommu_prot, 0);
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd,
+ void __user *arg)
+{
+ size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
+ /*
+ * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new
+ * dirty tracking direction:
+ * https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/
+ * https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/
+ */
+ u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL;
+ struct vfio_iommu_type1_dma_unmap unmap;
+ unsigned long unmapped = 0;
+ struct iommufd_ioas *ioas;
+ int rc;
+
+ if (copy_from_user(&unmap, arg, minsz))
+ return -EFAULT;
+
+ if (unmap.argsz < minsz || unmap.flags & ~supported_flags)
+ return -EINVAL;
+
+ ioas = get_compat_ioas(ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) {
+ if (unmap.iova != 0 || unmap.size != 0) {
+ rc = -EINVAL;
+ goto err_put;
+ }
+ rc = iopt_unmap_all(&ioas->iopt, &unmapped);
+ } else {
+ if (READ_ONCE(ioas->iopt.disable_large_pages)) {
+ /*
+ * Create cuts at the start and last of the requested
+ * range. If the start IOVA is 0 then it doesn't need to
+ * be cut.
+ */
+ unsigned long iovas[] = { unmap.iova + unmap.size - 1,
+ unmap.iova - 1 };
+
+ rc = iopt_cut_iova(&ioas->iopt, iovas,
+ unmap.iova ? 2 : 1);
+ if (rc)
+ goto err_put;
+ }
+ rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size,
+ &unmapped);
+ }
+ unmap.size = unmapped;
+ if (copy_to_user(arg, &unmap, minsz))
+ rc = -EFAULT;
+
+err_put:
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
+{
+ struct iommufd_hw_pagetable *hwpt;
+ struct iommufd_ioas *ioas;
+ int rc = 1;
+
+ ioas = get_compat_ioas(ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ mutex_lock(&ioas->mutex);
+ list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
+ if (!hwpt->enforce_cache_coherency) {
+ rc = 0;
+ break;
+ }
+ }
+ mutex_unlock(&ioas->mutex);
+
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
+ unsigned long type)
+{
+ switch (type) {
+ case VFIO_TYPE1_IOMMU:
+ case VFIO_TYPE1v2_IOMMU:
+ case VFIO_UNMAP_ALL:
+ return 1;
+
+ case VFIO_NOIOMMU_IOMMU:
+ return IS_ENABLED(CONFIG_VFIO_NOIOMMU);
+
+ case VFIO_DMA_CC_IOMMU:
+ return iommufd_vfio_cc_iommu(ictx);
+
+ /*
+ * This is obsolete, and to be removed from VFIO. It was an incomplete
+ * idea that got merged.
+ * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/
+ */
+ case VFIO_TYPE1_NESTING_IOMMU:
+ return 0;
+
+ /*
+ * VFIO_DMA_MAP_FLAG_VADDR
+ * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/
+ * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/
+ *
+ * It is hard to see how this could be implemented safely.
+ */
+ case VFIO_UPDATE_VADDR:
+ default:
+ return 0;
+ }
+}
+
+static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type)
+{
+ bool no_iommu_mode = READ_ONCE(ictx->no_iommu_mode);
+ struct iommufd_ioas *ioas = NULL;
+ int rc = 0;
+
+ /*
+ * Emulation for NOIOMMU is imperfect in that VFIO blocks almost all
+ * other ioctls. We let them keep working but they mostly fail since no
+ * IOAS should exist.
+ */
+ if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) && type == VFIO_NOIOMMU_IOMMU &&
+ no_iommu_mode) {
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+ return 0;
+ }
+
+ if ((type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU) ||
+ no_iommu_mode)
+ return -EINVAL;
+
+ /* VFIO fails the set_iommu if there is no group */
+ ioas = get_compat_ioas(ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ /*
+ * The difference between TYPE1 and TYPE1v2 is the ability to unmap in
+ * the middle of mapped ranges. This is complicated by huge page support
+ * which creates single large IOPTEs that cannot be split by the iommu
+ * driver. TYPE1 is very old at this point and likely nothing uses it,
+ * however it is simple enough to emulate by simply disabling the
+ * problematic large IOPTEs. Then we can safely unmap within any range.
+ */
+ if (type == VFIO_TYPE1_IOMMU)
+ rc = iopt_disable_large_pages(&ioas->iopt);
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas)
+{
+ struct io_pagetable *iopt = &ioas->iopt;
+ unsigned long pgsize_bitmap = ULONG_MAX;
+ struct iommu_domain *domain;
+ unsigned long index;
+
+ down_read(&iopt->domains_rwsem);
+ xa_for_each(&iopt->domains, index, domain)
+ pgsize_bitmap &= domain->pgsize_bitmap;
+
+ /* See vfio_update_pgsize_bitmap() */
+ if (pgsize_bitmap & ~PAGE_MASK) {
+ pgsize_bitmap &= PAGE_MASK;
+ pgsize_bitmap |= PAGE_SIZE;
+ }
+ pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment);
+ up_read(&iopt->domains_rwsem);
+ return pgsize_bitmap;
+}
+
+static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas,
+ struct vfio_info_cap_header __user *cur,
+ size_t avail)
+{
+ struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas =
+ container_of(cur,
+ struct vfio_iommu_type1_info_cap_iova_range __user,
+ header);
+ struct vfio_iommu_type1_info_cap_iova_range cap_iovas = {
+ .header = {
+ .id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE,
+ .version = 1,
+ },
+ };
+ struct interval_tree_span_iter span;
+
+ interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
+ ULONG_MAX) {
+ struct vfio_iova_range range;
+
+ if (!span.is_hole)
+ continue;
+ range.start = span.start_hole;
+ range.end = span.last_hole;
+ if (avail >= struct_size(&cap_iovas, iova_ranges,
+ cap_iovas.nr_iovas + 1) &&
+ copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas],
+ &range, sizeof(range)))
+ return -EFAULT;
+ cap_iovas.nr_iovas++;
+ }
+ if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) &&
+ copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas)))
+ return -EFAULT;
+ return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas);
+}
+
+static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas,
+ struct vfio_info_cap_header __user *cur,
+ size_t avail)
+{
+ struct vfio_iommu_type1_info_dma_avail cap_dma = {
+ .header = {
+ .id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL,
+ .version = 1,
+ },
+ /*
+ * iommufd's limit is based on the cgroup's memory limit.
+ * Normally vfio would return U16_MAX here, and provide a module
+ * parameter to adjust it. Since S390 qemu userspace actually
+ * pays attention and needs a value bigger than U16_MAX return
+ * U32_MAX.
+ */
+ .avail = U32_MAX,
+ };
+
+ if (avail >= sizeof(cap_dma) &&
+ copy_to_user(cur, &cap_dma, sizeof(cap_dma)))
+ return -EFAULT;
+ return sizeof(cap_dma);
+}
+
+static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx,
+ void __user *arg)
+{
+ typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas,
+ struct vfio_info_cap_header __user *cur,
+ size_t avail);
+ static const fill_cap_fn fill_fns[] = {
+ iommufd_fill_cap_dma_avail,
+ iommufd_fill_cap_iova,
+ };
+ size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
+ struct vfio_info_cap_header __user *last_cap = NULL;
+ struct vfio_iommu_type1_info info = {};
+ struct iommufd_ioas *ioas;
+ size_t total_cap_size;
+ int rc;
+ int i;
+
+ if (copy_from_user(&info, arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+ minsz = min_t(size_t, info.argsz, sizeof(info));
+
+ ioas = get_compat_ioas(ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ info.flags = VFIO_IOMMU_INFO_PGSIZES;
+ info.iova_pgsizes = iommufd_get_pagesizes(ioas);
+ info.cap_offset = 0;
+
+ down_read(&ioas->iopt.iova_rwsem);
+ total_cap_size = sizeof(info);
+ for (i = 0; i != ARRAY_SIZE(fill_fns); i++) {
+ int cap_size;
+
+ if (info.argsz > total_cap_size)
+ cap_size = fill_fns[i](ioas, arg + total_cap_size,
+ info.argsz - total_cap_size);
+ else
+ cap_size = fill_fns[i](ioas, NULL, 0);
+ if (cap_size < 0) {
+ rc = cap_size;
+ goto out_put;
+ }
+ cap_size = ALIGN(cap_size, sizeof(u64));
+
+ if (last_cap && info.argsz >= total_cap_size &&
+ put_user(total_cap_size, &last_cap->next)) {
+ rc = -EFAULT;
+ goto out_put;
+ }
+ last_cap = arg + total_cap_size;
+ total_cap_size += cap_size;
+ }
+
+ /*
+ * If the user did not provide enough space then only some caps are
+ * returned and the argsz will be updated to the correct amount to get
+ * all caps.
+ */
+ if (info.argsz >= total_cap_size)
+ info.cap_offset = sizeof(info);
+ info.argsz = total_cap_size;
+ info.flags |= VFIO_IOMMU_INFO_CAPS;
+ if (copy_to_user(arg, &info, minsz)) {
+ rc = -EFAULT;
+ goto out_put;
+ }
+ rc = 0;
+
+out_put:
+ up_read(&ioas->iopt.iova_rwsem);
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
+ unsigned long arg)
+{
+ void __user *uarg = (void __user *)arg;
+
+ switch (cmd) {
+ case VFIO_GET_API_VERSION:
+ return VFIO_API_VERSION;
+ case VFIO_SET_IOMMU:
+ return iommufd_vfio_set_iommu(ictx, arg);
+ case VFIO_CHECK_EXTENSION:
+ return iommufd_vfio_check_extension(ictx, arg);
+ case VFIO_IOMMU_GET_INFO:
+ return iommufd_vfio_iommu_get_info(ictx, uarg);
+ case VFIO_IOMMU_MAP_DMA:
+ return iommufd_vfio_map_dma(ictx, cmd, uarg);
+ case VFIO_IOMMU_UNMAP_DMA:
+ return iommufd_vfio_unmap_dma(ictx, cmd, uarg);
+ case VFIO_IOMMU_DIRTY_PAGES:
+ default:
+ return -ENOIOCTLCMD;
+ }
+ return -ENOIOCTLCMD;
+}