| field | value |
|---|---|
| author | Daniel Baumann <daniel.baumann@progress-linux.org> (2024-04-11 08:27:49 +0000) |
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> (2024-04-11 08:27:49 +0000) |
| commit | ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch) |
| tree | b2d64bc10158fdd5497876388cd68142ca374ed3 /drivers/iommu/iommufd |
| parent | Initial commit. (diff) |
| download | linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz, linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip |
Adding upstream version 6.6.15. (upstream/6.6.15)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/iommu/iommufd')
| mode | path | lines added |
|---|---|---|
| -rw-r--r-- | drivers/iommu/iommufd/Kconfig | 44 |
| -rw-r--r-- | drivers/iommu/iommufd/Makefile | 13 |
| -rw-r--r-- | drivers/iommu/iommufd/device.c | 1194 |
| -rw-r--r-- | drivers/iommu/iommufd/double_span.h | 53 |
| -rw-r--r-- | drivers/iommu/iommufd/hw_pagetable.c | 179 |
| -rw-r--r-- | drivers/iommu/iommufd/io_pagetable.c | 1244 |
| -rw-r--r-- | drivers/iommu/iommufd/io_pagetable.h | 241 |
| -rw-r--r-- | drivers/iommu/iommufd/ioas.c | 398 |
| -rw-r--r-- | drivers/iommu/iommufd/iommufd_private.h | 351 |
| -rw-r--r-- | drivers/iommu/iommufd/iommufd_test.h | 112 |
| -rw-r--r-- | drivers/iommu/iommufd/main.c | 556 |
| -rw-r--r-- | drivers/iommu/iommufd/pages.c | 1993 |
| -rw-r--r-- | drivers/iommu/iommufd/selftest.c | 1107 |
| -rw-r--r-- | drivers/iommu/iommufd/vfio_compat.c | 541 |
14 files changed, 8026 insertions, 0 deletions
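For orientation before the diff body: device.c below exports a small in-kernel API (iommufd_device_bind(), iommufd_device_attach(), iommufd_device_detach(), iommufd_device_unbind()) for drivers such as VFIO to place a device under iommufd control. The following is a minimal sketch of how a consumer might call it; the wrapper functions and their names are hypothetical, only the iommufd_device_* calls and their semantics come from the kerneldoc in this commit (the consumer is also expected to set driver_managed_dma, per iommufd_device_bind()'s documentation).

```c
/*
 * Hedged sketch, not part of the patch: a hypothetical consumer of the
 * device API added by drivers/iommu/iommufd/device.c in this commit.
 */
#include <linux/err.h>
#include <linux/iommufd.h>

static struct iommufd_device *my_bind_and_attach(struct iommufd_ctx *ictx,
						 struct device *dev,
						 u32 ioas_id, u32 *out_hwpt_id)
{
	struct iommufd_device *idev;
	u32 dev_id, pt_id = ioas_id;
	int rc;

	/* Take DMA ownership of the device for this iommufd context */
	idev = iommufd_device_bind(ictx, dev, &dev_id);
	if (IS_ERR(idev))
		return idev;

	/* Attach to the IOAS; pt_id is updated to the hw_pagetable used */
	rc = iommufd_device_attach(idev, &pt_id);
	if (rc) {
		iommufd_device_unbind(idev);
		return ERR_PTR(rc);
	}
	*out_hwpt_id = pt_id;
	return idev;
}

static void my_detach_and_unbind(struct iommufd_device *idev)
{
	/* Undo in reverse order: detach the domain, then release ownership */
	iommufd_device_detach(idev);
	iommufd_device_unbind(idev);
}
```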
diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig new file mode 100644 index 0000000000..99d4b075df --- /dev/null +++ b/drivers/iommu/iommufd/Kconfig @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: GPL-2.0-only +config IOMMUFD + tristate "IOMMU Userspace API" + select INTERVAL_TREE + select INTERVAL_TREE_SPAN_ITER + select IOMMU_API + default n + help + Provides /dev/iommu, the user API to control the IOMMU subsystem as + it relates to managing IO page tables that point at user space memory. + + If you don't know what to do here, say N. + +if IOMMUFD +config IOMMUFD_VFIO_CONTAINER + bool "IOMMUFD provides the VFIO container /dev/vfio/vfio" + depends on VFIO_GROUP && !VFIO_CONTAINER + default VFIO_GROUP && !VFIO_CONTAINER + help + IOMMUFD will provide /dev/vfio/vfio instead of VFIO. This relies on + IOMMUFD providing compatibility emulation to give the same ioctls. + It provides an option to build a kernel with legacy VFIO components + removed. + + IOMMUFD VFIO container emulation is known to lack certain features + of the native VFIO container, such as peer-to-peer + DMA mapping, PPC IOMMU support, as well as other potentially + undiscovered gaps. This option is currently intended for the + purpose of testing IOMMUFD with unmodified userspace supporting VFIO + and making use of the Type1 VFIO IOMMU backend. General purpose + enabling of this option is currently discouraged. + + Unless testing IOMMUFD, say N here. + +config IOMMUFD_TEST + bool "IOMMU Userspace API Test support" + depends on DEBUG_KERNEL + depends on FAULT_INJECTION + depends on RUNTIME_TESTING_MENU + default n + help + This is dangerous, do not enable unless running + tools/testing/selftests/iommu +endif diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile new file mode 100644 index 0000000000..8aeba81800 --- /dev/null +++ b/drivers/iommu/iommufd/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-only +iommufd-y := \ + device.o \ + hw_pagetable.o \ + io_pagetable.o \ + ioas.o \ + main.o \ + pages.o \ + vfio_compat.o + +iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o + +obj-$(CONFIG_IOMMUFD) += iommufd.o diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c new file mode 100644 index 0000000000..ce78c36715 --- /dev/null +++ b/drivers/iommu/iommufd/device.c @@ -0,0 +1,1194 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include <linux/iommufd.h> +#include <linux/slab.h> +#include <linux/iommu.h> +#include <uapi/linux/iommufd.h> +#include "../iommu-priv.h" + +#include "io_pagetable.h" +#include "iommufd_private.h" + +static bool allow_unsafe_interrupts; +module_param(allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC( + allow_unsafe_interrupts, + "Allow IOMMUFD to bind to devices even if the platform cannot isolate " + "the MSI interrupt window. 
Enabling this is a security weakness."); + +static void iommufd_group_release(struct kref *kref) +{ + struct iommufd_group *igroup = + container_of(kref, struct iommufd_group, ref); + + WARN_ON(igroup->hwpt || !list_empty(&igroup->device_list)); + + xa_cmpxchg(&igroup->ictx->groups, iommu_group_id(igroup->group), igroup, + NULL, GFP_KERNEL); + iommu_group_put(igroup->group); + mutex_destroy(&igroup->lock); + kfree(igroup); +} + +static void iommufd_put_group(struct iommufd_group *group) +{ + kref_put(&group->ref, iommufd_group_release); +} + +static bool iommufd_group_try_get(struct iommufd_group *igroup, + struct iommu_group *group) +{ + if (!igroup) + return false; + /* + * group ID's cannot be re-used until the group is put back which does + * not happen if we could get an igroup pointer under the xa_lock. + */ + if (WARN_ON(igroup->group != group)) + return false; + return kref_get_unless_zero(&igroup->ref); +} + +/* + * iommufd needs to store some more data for each iommu_group, we keep a + * parallel xarray indexed by iommu_group id to hold this instead of putting it + * in the core structure. To keep things simple the iommufd_group memory is + * unique within the iommufd_ctx. This makes it easy to check there are no + * memory leaks. + */ +static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx, + struct device *dev) +{ + struct iommufd_group *new_igroup; + struct iommufd_group *cur_igroup; + struct iommufd_group *igroup; + struct iommu_group *group; + unsigned int id; + + group = iommu_group_get(dev); + if (!group) + return ERR_PTR(-ENODEV); + + id = iommu_group_id(group); + + xa_lock(&ictx->groups); + igroup = xa_load(&ictx->groups, id); + if (iommufd_group_try_get(igroup, group)) { + xa_unlock(&ictx->groups); + iommu_group_put(group); + return igroup; + } + xa_unlock(&ictx->groups); + + new_igroup = kzalloc(sizeof(*new_igroup), GFP_KERNEL); + if (!new_igroup) { + iommu_group_put(group); + return ERR_PTR(-ENOMEM); + } + + kref_init(&new_igroup->ref); + mutex_init(&new_igroup->lock); + INIT_LIST_HEAD(&new_igroup->device_list); + new_igroup->sw_msi_start = PHYS_ADDR_MAX; + /* group reference moves into new_igroup */ + new_igroup->group = group; + + /* + * The ictx is not additionally refcounted here becase all objects using + * an igroup must put it before their destroy completes. + */ + new_igroup->ictx = ictx; + + /* + * We dropped the lock so igroup is invalid. NULL is a safe and likely + * value to assume for the xa_cmpxchg algorithm. 
+ */ + cur_igroup = NULL; + xa_lock(&ictx->groups); + while (true) { + igroup = __xa_cmpxchg(&ictx->groups, id, cur_igroup, new_igroup, + GFP_KERNEL); + if (xa_is_err(igroup)) { + xa_unlock(&ictx->groups); + iommufd_put_group(new_igroup); + return ERR_PTR(xa_err(igroup)); + } + + /* new_group was successfully installed */ + if (cur_igroup == igroup) { + xa_unlock(&ictx->groups); + return new_igroup; + } + + /* Check again if the current group is any good */ + if (iommufd_group_try_get(igroup, group)) { + xa_unlock(&ictx->groups); + iommufd_put_group(new_igroup); + return igroup; + } + cur_igroup = igroup; + } +} + +void iommufd_device_destroy(struct iommufd_object *obj) +{ + struct iommufd_device *idev = + container_of(obj, struct iommufd_device, obj); + + iommu_device_release_dma_owner(idev->dev); + iommufd_put_group(idev->igroup); + if (!iommufd_selftest_is_mock_dev(idev->dev)) + iommufd_ctx_put(idev->ictx); +} + +/** + * iommufd_device_bind - Bind a physical device to an iommu fd + * @ictx: iommufd file descriptor + * @dev: Pointer to a physical device struct + * @id: Output ID number to return to userspace for this device + * + * A successful bind establishes an ownership over the device and returns + * struct iommufd_device pointer, otherwise returns error pointer. + * + * A driver using this API must set driver_managed_dma and must not touch + * the device until this routine succeeds and establishes ownership. + * + * Binding a PCI device places the entire RID under iommufd control. + * + * The caller must undo this with iommufd_device_unbind() + */ +struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, + struct device *dev, u32 *id) +{ + struct iommufd_device *idev; + struct iommufd_group *igroup; + int rc; + + /* + * iommufd always sets IOMMU_CACHE because we offer no way for userspace + * to restore cache coherency. + */ + if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) + return ERR_PTR(-EINVAL); + + igroup = iommufd_get_group(ictx, dev); + if (IS_ERR(igroup)) + return ERR_CAST(igroup); + + /* + * For historical compat with VFIO the insecure interrupt path is + * allowed if the module parameter is set. Secure/Isolated means that a + * MemWr operation from the device (eg a simple DMA) cannot trigger an + * interrupt outside this iommufd context. + */ + if (!iommufd_selftest_is_mock_dev(dev) && + !iommu_group_has_isolated_msi(igroup->group)) { + if (!allow_unsafe_interrupts) { + rc = -EPERM; + goto out_group_put; + } + + dev_warn( + dev, + "MSI interrupts are not secure, they cannot be isolated by the platform. " + "Check that platform features like interrupt remapping are enabled. " + "Use the \"allow_unsafe_interrupts\" module parameter to override\n"); + } + + rc = iommu_device_claim_dma_owner(dev, ictx); + if (rc) + goto out_group_put; + + idev = iommufd_object_alloc(ictx, idev, IOMMUFD_OBJ_DEVICE); + if (IS_ERR(idev)) { + rc = PTR_ERR(idev); + goto out_release_owner; + } + idev->ictx = ictx; + if (!iommufd_selftest_is_mock_dev(dev)) + iommufd_ctx_get(ictx); + idev->dev = dev; + idev->enforce_cache_coherency = + device_iommu_capable(dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY); + /* The calling driver is a user until iommufd_device_unbind() */ + refcount_inc(&idev->obj.users); + /* igroup refcount moves into iommufd_device */ + idev->igroup = igroup; + + /* + * If the caller fails after this success it must call + * iommufd_unbind_device() which is safe since we hold this refcount. 
+ * This also means the device is a leaf in the graph and no other object + * can take a reference on it. + */ + iommufd_object_finalize(ictx, &idev->obj); + *id = idev->obj.id; + return idev; + +out_release_owner: + iommu_device_release_dma_owner(dev); +out_group_put: + iommufd_put_group(igroup); + return ERR_PTR(rc); +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD); + +/** + * iommufd_ctx_has_group - True if any device within the group is bound + * to the ictx + * @ictx: iommufd file descriptor + * @group: Pointer to a physical iommu_group struct + * + * True if any device within the group has been bound to this ictx, ex. via + * iommufd_device_bind(), therefore implying ictx ownership of the group. + */ +bool iommufd_ctx_has_group(struct iommufd_ctx *ictx, struct iommu_group *group) +{ + struct iommufd_object *obj; + unsigned long index; + + if (!ictx || !group) + return false; + + xa_lock(&ictx->objects); + xa_for_each(&ictx->objects, index, obj) { + if (obj->type == IOMMUFD_OBJ_DEVICE && + container_of(obj, struct iommufd_device, obj) + ->igroup->group == group) { + xa_unlock(&ictx->objects); + return true; + } + } + xa_unlock(&ictx->objects); + return false; +} +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, IOMMUFD); + +/** + * iommufd_device_unbind - Undo iommufd_device_bind() + * @idev: Device returned by iommufd_device_bind() + * + * Release the device from iommufd control. The DMA ownership will return back + * to unowned with DMA controlled by the DMA API. This invalidates the + * iommufd_device pointer, other APIs that consume it must not be called + * concurrently. + */ +void iommufd_device_unbind(struct iommufd_device *idev) +{ + iommufd_object_destroy_user(idev->ictx, &idev->obj); +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, IOMMUFD); + +struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev) +{ + return idev->ictx; +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_to_ictx, IOMMUFD); + +u32 iommufd_device_to_id(struct iommufd_device *idev) +{ + return idev->obj.id; +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD); + +static int iommufd_group_setup_msi(struct iommufd_group *igroup, + struct iommufd_hw_pagetable *hwpt) +{ + phys_addr_t sw_msi_start = igroup->sw_msi_start; + int rc; + + /* + * If the IOMMU driver gives a IOMMU_RESV_SW_MSI then it is asking us to + * call iommu_get_msi_cookie() on its behalf. This is necessary to setup + * the MSI window so iommu_dma_prepare_msi() can install pages into our + * domain after request_irq(). If it is not done interrupts will not + * work on this domain. + * + * FIXME: This is conceptually broken for iommufd since we want to allow + * userspace to change the domains, eg switch from an identity IOAS to a + * DMA IOAS. There is currently no way to create a MSI window that + * matches what the IRQ layer actually expects in a newly created + * domain. + */ + if (sw_msi_start != PHYS_ADDR_MAX && !hwpt->msi_cookie) { + rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start); + if (rc) + return rc; + + /* + * iommu_get_msi_cookie() can only be called once per domain, + * it returns -EBUSY on later calls. 
+ */ + hwpt->msi_cookie = true; + } + return 0; +} + +int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) +{ + int rc; + + mutex_lock(&idev->igroup->lock); + + if (idev->igroup->hwpt != NULL && idev->igroup->hwpt != hwpt) { + rc = -EINVAL; + goto err_unlock; + } + + /* Try to upgrade the domain we have */ + if (idev->enforce_cache_coherency) { + rc = iommufd_hw_pagetable_enforce_cc(hwpt); + if (rc) + goto err_unlock; + } + + rc = iopt_table_enforce_dev_resv_regions(&hwpt->ioas->iopt, idev->dev, + &idev->igroup->sw_msi_start); + if (rc) + goto err_unlock; + + /* + * Only attach to the group once for the first device that is in the + * group. All the other devices will follow this attachment. The user + * should attach every device individually to the hwpt as the per-device + * reserved regions are only updated during individual device + * attachment. + */ + if (list_empty(&idev->igroup->device_list)) { + rc = iommufd_group_setup_msi(idev->igroup, hwpt); + if (rc) + goto err_unresv; + + rc = iommu_attach_group(hwpt->domain, idev->igroup->group); + if (rc) + goto err_unresv; + idev->igroup->hwpt = hwpt; + } + refcount_inc(&hwpt->obj.users); + list_add_tail(&idev->group_item, &idev->igroup->device_list); + mutex_unlock(&idev->igroup->lock); + return 0; +err_unresv: + iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); +err_unlock: + mutex_unlock(&idev->igroup->lock); + return rc; +} + +struct iommufd_hw_pagetable * +iommufd_hw_pagetable_detach(struct iommufd_device *idev) +{ + struct iommufd_hw_pagetable *hwpt = idev->igroup->hwpt; + + mutex_lock(&idev->igroup->lock); + list_del(&idev->group_item); + if (list_empty(&idev->igroup->device_list)) { + iommu_detach_group(hwpt->domain, idev->igroup->group); + idev->igroup->hwpt = NULL; + } + iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); + mutex_unlock(&idev->igroup->lock); + + /* Caller must destroy hwpt */ + return hwpt; +} + +static struct iommufd_hw_pagetable * +iommufd_device_do_attach(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt) +{ + int rc; + + rc = iommufd_hw_pagetable_attach(hwpt, idev); + if (rc) + return ERR_PTR(rc); + return NULL; +} + +static struct iommufd_hw_pagetable * +iommufd_device_do_replace(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt) +{ + struct iommufd_group *igroup = idev->igroup; + struct iommufd_hw_pagetable *old_hwpt; + unsigned int num_devices = 0; + struct iommufd_device *cur; + int rc; + + mutex_lock(&idev->igroup->lock); + + if (igroup->hwpt == NULL) { + rc = -EINVAL; + goto err_unlock; + } + + if (hwpt == igroup->hwpt) { + mutex_unlock(&idev->igroup->lock); + return NULL; + } + + /* Try to upgrade the domain we have */ + list_for_each_entry(cur, &igroup->device_list, group_item) { + num_devices++; + if (cur->enforce_cache_coherency) { + rc = iommufd_hw_pagetable_enforce_cc(hwpt); + if (rc) + goto err_unlock; + } + } + + old_hwpt = igroup->hwpt; + if (hwpt->ioas != old_hwpt->ioas) { + list_for_each_entry(cur, &igroup->device_list, group_item) { + rc = iopt_table_enforce_dev_resv_regions( + &hwpt->ioas->iopt, cur->dev, NULL); + if (rc) + goto err_unresv; + } + } + + rc = iommufd_group_setup_msi(idev->igroup, hwpt); + if (rc) + goto err_unresv; + + rc = iommu_group_replace_domain(igroup->group, hwpt->domain); + if (rc) + goto err_unresv; + + if (hwpt->ioas != old_hwpt->ioas) { + list_for_each_entry(cur, &igroup->device_list, group_item) + iopt_remove_reserved_iova(&old_hwpt->ioas->iopt, + cur->dev); + } + + igroup->hwpt 
= hwpt; + + /* + * Move the refcounts held by the device_list to the new hwpt. Retain a + * refcount for this thread as the caller will free it. + */ + refcount_add(num_devices, &hwpt->obj.users); + if (num_devices > 1) + WARN_ON(refcount_sub_and_test(num_devices - 1, + &old_hwpt->obj.users)); + mutex_unlock(&idev->igroup->lock); + + /* Caller must destroy old_hwpt */ + return old_hwpt; +err_unresv: + list_for_each_entry(cur, &igroup->device_list, group_item) + iopt_remove_reserved_iova(&hwpt->ioas->iopt, cur->dev); +err_unlock: + mutex_unlock(&idev->igroup->lock); + return ERR_PTR(rc); +} + +typedef struct iommufd_hw_pagetable *(*attach_fn)( + struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt); + +/* + * When automatically managing the domains we search for a compatible domain in + * the iopt and if one is found use it, otherwise create a new domain. + * Automatic domain selection will never pick a manually created domain. + */ +static struct iommufd_hw_pagetable * +iommufd_device_auto_get_domain(struct iommufd_device *idev, + struct iommufd_ioas *ioas, u32 *pt_id, + attach_fn do_attach) +{ + /* + * iommufd_hw_pagetable_attach() is called by + * iommufd_hw_pagetable_alloc() in immediate attachment mode, same as + * iommufd_device_do_attach(). So if we are in this mode then we prefer + * to use the immediate_attach path as it supports drivers that can't + * directly allocate a domain. + */ + bool immediate_attach = do_attach == iommufd_device_do_attach; + struct iommufd_hw_pagetable *destroy_hwpt; + struct iommufd_hw_pagetable *hwpt; + + /* + * There is no differentiation when domains are allocated, so any domain + * that is willing to attach to the device is interchangeable with any + * other. + */ + mutex_lock(&ioas->mutex); + list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) { + if (!hwpt->auto_domain) + continue; + + if (!iommufd_lock_obj(&hwpt->obj)) + continue; + destroy_hwpt = (*do_attach)(idev, hwpt); + if (IS_ERR(destroy_hwpt)) { + iommufd_put_object(&hwpt->obj); + /* + * -EINVAL means the domain is incompatible with the + * device. Other error codes should propagate to + * userspace as failure. Success means the domain is + * attached. 
+ */ + if (PTR_ERR(destroy_hwpt) == -EINVAL) + continue; + goto out_unlock; + } + *pt_id = hwpt->obj.id; + iommufd_put_object(&hwpt->obj); + goto out_unlock; + } + + hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev, + immediate_attach); + if (IS_ERR(hwpt)) { + destroy_hwpt = ERR_CAST(hwpt); + goto out_unlock; + } + + if (!immediate_attach) { + destroy_hwpt = (*do_attach)(idev, hwpt); + if (IS_ERR(destroy_hwpt)) + goto out_abort; + } else { + destroy_hwpt = NULL; + } + + hwpt->auto_domain = true; + *pt_id = hwpt->obj.id; + + iommufd_object_finalize(idev->ictx, &hwpt->obj); + mutex_unlock(&ioas->mutex); + return destroy_hwpt; + +out_abort: + iommufd_object_abort_and_destroy(idev->ictx, &hwpt->obj); +out_unlock: + mutex_unlock(&ioas->mutex); + return destroy_hwpt; +} + +static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, + attach_fn do_attach) +{ + struct iommufd_hw_pagetable *destroy_hwpt; + struct iommufd_object *pt_obj; + + pt_obj = iommufd_get_object(idev->ictx, *pt_id, IOMMUFD_OBJ_ANY); + if (IS_ERR(pt_obj)) + return PTR_ERR(pt_obj); + + switch (pt_obj->type) { + case IOMMUFD_OBJ_HW_PAGETABLE: { + struct iommufd_hw_pagetable *hwpt = + container_of(pt_obj, struct iommufd_hw_pagetable, obj); + + destroy_hwpt = (*do_attach)(idev, hwpt); + if (IS_ERR(destroy_hwpt)) + goto out_put_pt_obj; + break; + } + case IOMMUFD_OBJ_IOAS: { + struct iommufd_ioas *ioas = + container_of(pt_obj, struct iommufd_ioas, obj); + + destroy_hwpt = iommufd_device_auto_get_domain(idev, ioas, pt_id, + do_attach); + if (IS_ERR(destroy_hwpt)) + goto out_put_pt_obj; + break; + } + default: + destroy_hwpt = ERR_PTR(-EINVAL); + goto out_put_pt_obj; + } + iommufd_put_object(pt_obj); + + /* This destruction has to be after we unlock everything */ + if (destroy_hwpt) + iommufd_hw_pagetable_put(idev->ictx, destroy_hwpt); + return 0; + +out_put_pt_obj: + iommufd_put_object(pt_obj); + return PTR_ERR(destroy_hwpt); +} + +/** + * iommufd_device_attach - Connect a device to an iommu_domain + * @idev: device to attach + * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE + * Output the IOMMUFD_OBJ_HW_PAGETABLE ID + * + * This connects the device to an iommu_domain, either automatically or manually + * selected. Once this completes the device could do DMA. + * + * The caller should return the resulting pt_id back to userspace. + * This function is undone by calling iommufd_device_detach(). + */ +int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) +{ + int rc; + + rc = iommufd_device_change_pt(idev, pt_id, &iommufd_device_do_attach); + if (rc) + return rc; + + /* + * Pairs with iommufd_device_detach() - catches caller bugs attempting + * to destroy a device with an attachment. + */ + refcount_inc(&idev->obj.users); + return 0; +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD); + +/** + * iommufd_device_replace - Change the device's iommu_domain + * @idev: device to change + * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE + * Output the IOMMUFD_OBJ_HW_PAGETABLE ID + * + * This is the same as:: + * + * iommufd_device_detach(); + * iommufd_device_attach(); + * + * If it fails then no change is made to the attachment. The iommu driver may + * implement this so there is no disruption in translation. This can only be + * called if iommufd_device_attach() has already succeeded. 
+ */ +int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id) +{ + return iommufd_device_change_pt(idev, pt_id, + &iommufd_device_do_replace); +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_replace, IOMMUFD); + +/** + * iommufd_device_detach - Disconnect a device to an iommu_domain + * @idev: device to detach + * + * Undo iommufd_device_attach(). This disconnects the idev from the previously + * attached pt_id. The device returns back to a blocked DMA translation. + */ +void iommufd_device_detach(struct iommufd_device *idev) +{ + struct iommufd_hw_pagetable *hwpt; + + hwpt = iommufd_hw_pagetable_detach(idev); + iommufd_hw_pagetable_put(idev->ictx, hwpt); + refcount_dec(&idev->obj.users); +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD); + +/* + * On success, it will refcount_inc() at a valid new_ioas and refcount_dec() at + * a valid cur_ioas (access->ioas). A caller passing in a valid new_ioas should + * call iommufd_put_object() if it does an iommufd_get_object() for a new_ioas. + */ +static int iommufd_access_change_ioas(struct iommufd_access *access, + struct iommufd_ioas *new_ioas) +{ + u32 iopt_access_list_id = access->iopt_access_list_id; + struct iommufd_ioas *cur_ioas = access->ioas; + int rc; + + lockdep_assert_held(&access->ioas_lock); + + /* We are racing with a concurrent detach, bail */ + if (cur_ioas != access->ioas_unpin) + return -EBUSY; + + if (cur_ioas == new_ioas) + return 0; + + /* + * Set ioas to NULL to block any further iommufd_access_pin_pages(). + * iommufd_access_unpin_pages() can continue using access->ioas_unpin. + */ + access->ioas = NULL; + + if (new_ioas) { + rc = iopt_add_access(&new_ioas->iopt, access); + if (rc) { + access->ioas = cur_ioas; + return rc; + } + refcount_inc(&new_ioas->obj.users); + } + + if (cur_ioas) { + if (access->ops->unmap) { + mutex_unlock(&access->ioas_lock); + access->ops->unmap(access->data, 0, ULONG_MAX); + mutex_lock(&access->ioas_lock); + } + iopt_remove_access(&cur_ioas->iopt, access, iopt_access_list_id); + refcount_dec(&cur_ioas->obj.users); + } + + access->ioas = new_ioas; + access->ioas_unpin = new_ioas; + + return 0; +} + +static int iommufd_access_change_ioas_id(struct iommufd_access *access, u32 id) +{ + struct iommufd_ioas *ioas = iommufd_get_ioas(access->ictx, id); + int rc; + + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + rc = iommufd_access_change_ioas(access, ioas); + iommufd_put_object(&ioas->obj); + return rc; +} + +void iommufd_access_destroy_object(struct iommufd_object *obj) +{ + struct iommufd_access *access = + container_of(obj, struct iommufd_access, obj); + + mutex_lock(&access->ioas_lock); + if (access->ioas) + WARN_ON(iommufd_access_change_ioas(access, NULL)); + mutex_unlock(&access->ioas_lock); + iommufd_ctx_put(access->ictx); +} + +/** + * iommufd_access_create - Create an iommufd_access + * @ictx: iommufd file descriptor + * @ops: Driver's ops to associate with the access + * @data: Opaque data to pass into ops functions + * @id: Output ID number to return to userspace for this access + * + * An iommufd_access allows a driver to read/write to the IOAS without using + * DMA. The underlying CPU memory can be accessed using the + * iommufd_access_pin_pages() or iommufd_access_rw() functions. + * + * The provided ops are required to use iommufd_access_pin_pages(). 
+ */ +struct iommufd_access * +iommufd_access_create(struct iommufd_ctx *ictx, + const struct iommufd_access_ops *ops, void *data, u32 *id) +{ + struct iommufd_access *access; + + /* + * There is no uAPI for the access object, but to keep things symmetric + * use the object infrastructure anyhow. + */ + access = iommufd_object_alloc(ictx, access, IOMMUFD_OBJ_ACCESS); + if (IS_ERR(access)) + return access; + + access->data = data; + access->ops = ops; + + if (ops->needs_pin_pages) + access->iova_alignment = PAGE_SIZE; + else + access->iova_alignment = 1; + + /* The calling driver is a user until iommufd_access_destroy() */ + refcount_inc(&access->obj.users); + access->ictx = ictx; + iommufd_ctx_get(ictx); + iommufd_object_finalize(ictx, &access->obj); + *id = access->obj.id; + mutex_init(&access->ioas_lock); + return access; +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_create, IOMMUFD); + +/** + * iommufd_access_destroy - Destroy an iommufd_access + * @access: The access to destroy + * + * The caller must stop using the access before destroying it. + */ +void iommufd_access_destroy(struct iommufd_access *access) +{ + iommufd_object_destroy_user(access->ictx, &access->obj); +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, IOMMUFD); + +void iommufd_access_detach(struct iommufd_access *access) +{ + mutex_lock(&access->ioas_lock); + if (WARN_ON(!access->ioas)) { + mutex_unlock(&access->ioas_lock); + return; + } + WARN_ON(iommufd_access_change_ioas(access, NULL)); + mutex_unlock(&access->ioas_lock); +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_detach, IOMMUFD); + +int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id) +{ + int rc; + + mutex_lock(&access->ioas_lock); + if (WARN_ON(access->ioas)) { + mutex_unlock(&access->ioas_lock); + return -EINVAL; + } + + rc = iommufd_access_change_ioas_id(access, ioas_id); + mutex_unlock(&access->ioas_lock); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, IOMMUFD); + +int iommufd_access_replace(struct iommufd_access *access, u32 ioas_id) +{ + int rc; + + mutex_lock(&access->ioas_lock); + if (!access->ioas) { + mutex_unlock(&access->ioas_lock); + return -ENOENT; + } + rc = iommufd_access_change_ioas_id(access, ioas_id); + mutex_unlock(&access->ioas_lock); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_replace, IOMMUFD); + +/** + * iommufd_access_notify_unmap - Notify users of an iopt to stop using it + * @iopt: iopt to work on + * @iova: Starting iova in the iopt + * @length: Number of bytes + * + * After this function returns there should be no users attached to the pages + * linked to this iopt that intersect with iova,length. Anyone that has attached + * a user through iopt_access_pages() needs to detach it through + * iommufd_access_unpin_pages() before this function returns. + * + * iommufd_access_destroy() will wait for any outstanding unmap callback to + * complete. Once iommufd_access_destroy() no unmap ops are running or will + * run in the future. Due to this a driver must not create locking that prevents + * unmap to complete while iommufd_access_destroy() is running. 
+ */ +void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, + unsigned long length) +{ + struct iommufd_ioas *ioas = + container_of(iopt, struct iommufd_ioas, iopt); + struct iommufd_access *access; + unsigned long index; + + xa_lock(&ioas->iopt.access_list); + xa_for_each(&ioas->iopt.access_list, index, access) { + if (!iommufd_lock_obj(&access->obj)) + continue; + xa_unlock(&ioas->iopt.access_list); + + access->ops->unmap(access->data, iova, length); + + iommufd_put_object(&access->obj); + xa_lock(&ioas->iopt.access_list); + } + xa_unlock(&ioas->iopt.access_list); +} + +/** + * iommufd_access_unpin_pages() - Undo iommufd_access_pin_pages + * @access: IOAS access to act on + * @iova: Starting IOVA + * @length: Number of bytes to access + * + * Return the struct page's. The caller must stop accessing them before calling + * this. The iova/length must exactly match the one provided to access_pages. + */ +void iommufd_access_unpin_pages(struct iommufd_access *access, + unsigned long iova, unsigned long length) +{ + struct iopt_area_contig_iter iter; + struct io_pagetable *iopt; + unsigned long last_iova; + struct iopt_area *area; + + if (WARN_ON(!length) || + WARN_ON(check_add_overflow(iova, length - 1, &last_iova))) + return; + + mutex_lock(&access->ioas_lock); + /* + * The driver must be doing something wrong if it calls this before an + * iommufd_access_attach() or after an iommufd_access_detach(). + */ + if (WARN_ON(!access->ioas_unpin)) { + mutex_unlock(&access->ioas_lock); + return; + } + iopt = &access->ioas_unpin->iopt; + + down_read(&iopt->iova_rwsem); + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) + iopt_area_remove_access( + area, iopt_area_iova_to_index(area, iter.cur_iova), + iopt_area_iova_to_index( + area, + min(last_iova, iopt_area_last_iova(area)))); + WARN_ON(!iopt_area_contig_done(&iter)); + up_read(&iopt->iova_rwsem); + mutex_unlock(&access->ioas_lock); +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, IOMMUFD); + +static bool iopt_area_contig_is_aligned(struct iopt_area_contig_iter *iter) +{ + if (iopt_area_start_byte(iter->area, iter->cur_iova) % PAGE_SIZE) + return false; + + if (!iopt_area_contig_done(iter) && + (iopt_area_start_byte(iter->area, iopt_area_last_iova(iter->area)) % + PAGE_SIZE) != (PAGE_SIZE - 1)) + return false; + return true; +} + +static bool check_area_prot(struct iopt_area *area, unsigned int flags) +{ + if (flags & IOMMUFD_ACCESS_RW_WRITE) + return area->iommu_prot & IOMMU_WRITE; + return area->iommu_prot & IOMMU_READ; +} + +/** + * iommufd_access_pin_pages() - Return a list of pages under the iova + * @access: IOAS access to act on + * @iova: Starting IOVA + * @length: Number of bytes to access + * @out_pages: Output page list + * @flags: IOPMMUFD_ACCESS_RW_* flags + * + * Reads @length bytes starting at iova and returns the struct page * pointers. + * These can be kmap'd by the caller for CPU access. + * + * The caller must perform iommufd_access_unpin_pages() when done to balance + * this. + * + * This API always requires a page aligned iova. This happens naturally if the + * ioas alignment is >= PAGE_SIZE and the iova is PAGE_SIZE aligned. However + * smaller alignments have corner cases where this API can fail on otherwise + * aligned iova. 
+ */ +int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, + unsigned long length, struct page **out_pages, + unsigned int flags) +{ + struct iopt_area_contig_iter iter; + struct io_pagetable *iopt; + unsigned long last_iova; + struct iopt_area *area; + int rc; + + /* Driver's ops don't support pin_pages */ + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap)) + return -EINVAL; + + if (!length) + return -EINVAL; + if (check_add_overflow(iova, length - 1, &last_iova)) + return -EOVERFLOW; + + mutex_lock(&access->ioas_lock); + if (!access->ioas) { + mutex_unlock(&access->ioas_lock); + return -ENOENT; + } + iopt = &access->ioas->iopt; + + down_read(&iopt->iova_rwsem); + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { + unsigned long last = min(last_iova, iopt_area_last_iova(area)); + unsigned long last_index = iopt_area_iova_to_index(area, last); + unsigned long index = + iopt_area_iova_to_index(area, iter.cur_iova); + + if (area->prevent_access || + !iopt_area_contig_is_aligned(&iter)) { + rc = -EINVAL; + goto err_remove; + } + + if (!check_area_prot(area, flags)) { + rc = -EPERM; + goto err_remove; + } + + rc = iopt_area_add_access(area, index, last_index, out_pages, + flags); + if (rc) + goto err_remove; + out_pages += last_index - index + 1; + } + if (!iopt_area_contig_done(&iter)) { + rc = -ENOENT; + goto err_remove; + } + + up_read(&iopt->iova_rwsem); + mutex_unlock(&access->ioas_lock); + return 0; + +err_remove: + if (iova < iter.cur_iova) { + last_iova = iter.cur_iova - 1; + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) + iopt_area_remove_access( + area, + iopt_area_iova_to_index(area, iter.cur_iova), + iopt_area_iova_to_index( + area, min(last_iova, + iopt_area_last_iova(area)))); + } + up_read(&iopt->iova_rwsem); + mutex_unlock(&access->ioas_lock); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, IOMMUFD); + +/** + * iommufd_access_rw - Read or write data under the iova + * @access: IOAS access to act on + * @iova: Starting IOVA + * @data: Kernel buffer to copy to/from + * @length: Number of bytes to access + * @flags: IOMMUFD_ACCESS_RW_* flags + * + * Copy kernel to/from data into the range given by IOVA/length. If flags + * indicates IOMMUFD_ACCESS_RW_KTHREAD then a large copy can be optimized + * by changing it into copy_to/from_user(). 
+ */ +int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, + void *data, size_t length, unsigned int flags) +{ + struct iopt_area_contig_iter iter; + struct io_pagetable *iopt; + struct iopt_area *area; + unsigned long last_iova; + int rc; + + if (!length) + return -EINVAL; + if (check_add_overflow(iova, length - 1, &last_iova)) + return -EOVERFLOW; + + mutex_lock(&access->ioas_lock); + if (!access->ioas) { + mutex_unlock(&access->ioas_lock); + return -ENOENT; + } + iopt = &access->ioas->iopt; + + down_read(&iopt->iova_rwsem); + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { + unsigned long last = min(last_iova, iopt_area_last_iova(area)); + unsigned long bytes = (last - iter.cur_iova) + 1; + + if (area->prevent_access) { + rc = -EINVAL; + goto err_out; + } + + if (!check_area_prot(area, flags)) { + rc = -EPERM; + goto err_out; + } + + rc = iopt_pages_rw_access( + area->pages, iopt_area_start_byte(area, iter.cur_iova), + data, bytes, flags); + if (rc) + goto err_out; + data += bytes; + } + if (!iopt_area_contig_done(&iter)) + rc = -ENOENT; +err_out: + up_read(&iopt->iova_rwsem); + mutex_unlock(&access->ioas_lock); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, IOMMUFD); + +int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) +{ + struct iommu_hw_info *cmd = ucmd->cmd; + void __user *user_ptr = u64_to_user_ptr(cmd->data_uptr); + const struct iommu_ops *ops; + struct iommufd_device *idev; + unsigned int data_len; + unsigned int copy_len; + void *data; + int rc; + + if (cmd->flags || cmd->__reserved) + return -EOPNOTSUPP; + + idev = iommufd_get_device(ucmd, cmd->dev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + + ops = dev_iommu_ops(idev->dev); + if (ops->hw_info) { + data = ops->hw_info(idev->dev, &data_len, &cmd->out_data_type); + if (IS_ERR(data)) { + rc = PTR_ERR(data); + goto out_put; + } + + /* + * drivers that have hw_info callback should have a unique + * iommu_hw_info_type. + */ + if (WARN_ON_ONCE(cmd->out_data_type == + IOMMU_HW_INFO_TYPE_NONE)) { + rc = -ENODEV; + goto out_free; + } + } else { + cmd->out_data_type = IOMMU_HW_INFO_TYPE_NONE; + data_len = 0; + data = NULL; + } + + copy_len = min(cmd->data_len, data_len); + if (copy_to_user(user_ptr, data, copy_len)) { + rc = -EFAULT; + goto out_free; + } + + /* + * Zero the trailing bytes if the user buffer is bigger than the + * data size kernel actually has. + */ + if (copy_len < cmd->data_len) { + if (clear_user(user_ptr + copy_len, cmd->data_len - copy_len)) { + rc = -EFAULT; + goto out_free; + } + } + + /* + * We return the length the kernel supports so userspace may know what + * the kernel capability is. It could be larger than the input buffer. + */ + cmd->data_len = data_len; + + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); +out_free: + kfree(data); +out_put: + iommufd_put_object(&idev->obj); + return rc; +} diff --git a/drivers/iommu/iommufd/double_span.h b/drivers/iommu/iommufd/double_span.h new file mode 100644 index 0000000000..b37aab7488 --- /dev/null +++ b/drivers/iommu/iommufd/double_span.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. + */ +#ifndef __IOMMUFD_DOUBLE_SPAN_H +#define __IOMMUFD_DOUBLE_SPAN_H + +#include <linux/interval_tree.h> + +/* + * This is a variation of the general interval_tree_span_iter that computes the + * spans over the union of two different interval trees. Used ranges are broken + * up and reported based on the tree that provides the interval. 
The first span + * always takes priority. Like interval_tree_span_iter it is greedy and the same + * value of is_used will not repeat on two iteration cycles. + */ +struct interval_tree_double_span_iter { + struct rb_root_cached *itrees[2]; + struct interval_tree_span_iter spans[2]; + union { + unsigned long start_hole; + unsigned long start_used; + }; + union { + unsigned long last_hole; + unsigned long last_used; + }; + /* 0 = hole, 1 = used span[0], 2 = used span[1], -1 done iteration */ + int is_used; +}; + +void interval_tree_double_span_iter_update( + struct interval_tree_double_span_iter *iter); +void interval_tree_double_span_iter_first( + struct interval_tree_double_span_iter *iter, + struct rb_root_cached *itree1, struct rb_root_cached *itree2, + unsigned long first_index, unsigned long last_index); +void interval_tree_double_span_iter_next( + struct interval_tree_double_span_iter *iter); + +static inline bool +interval_tree_double_span_iter_done(struct interval_tree_double_span_iter *state) +{ + return state->is_used == -1; +} + +#define interval_tree_for_each_double_span(span, itree1, itree2, first_index, \ + last_index) \ + for (interval_tree_double_span_iter_first(span, itree1, itree2, \ + first_index, last_index); \ + !interval_tree_double_span_iter_done(span); \ + interval_tree_double_span_iter_next(span)) + +#endif diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c new file mode 100644 index 0000000000..cf2c1504e2 --- /dev/null +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include <linux/iommu.h> +#include <uapi/linux/iommufd.h> + +#include "iommufd_private.h" + +void iommufd_hw_pagetable_destroy(struct iommufd_object *obj) +{ + struct iommufd_hw_pagetable *hwpt = + container_of(obj, struct iommufd_hw_pagetable, obj); + + if (!list_empty(&hwpt->hwpt_item)) { + mutex_lock(&hwpt->ioas->mutex); + list_del(&hwpt->hwpt_item); + mutex_unlock(&hwpt->ioas->mutex); + + iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain); + } + + if (hwpt->domain) + iommu_domain_free(hwpt->domain); + + refcount_dec(&hwpt->ioas->obj.users); +} + +void iommufd_hw_pagetable_abort(struct iommufd_object *obj) +{ + struct iommufd_hw_pagetable *hwpt = + container_of(obj, struct iommufd_hw_pagetable, obj); + + /* The ioas->mutex must be held until finalize is called. */ + lockdep_assert_held(&hwpt->ioas->mutex); + + if (!list_empty(&hwpt->hwpt_item)) { + list_del_init(&hwpt->hwpt_item); + iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain); + } + iommufd_hw_pagetable_destroy(obj); +} + +int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt) +{ + if (hwpt->enforce_cache_coherency) + return 0; + + if (hwpt->domain->ops->enforce_cache_coherency) + hwpt->enforce_cache_coherency = + hwpt->domain->ops->enforce_cache_coherency( + hwpt->domain); + if (!hwpt->enforce_cache_coherency) + return -EINVAL; + return 0; +} + +/** + * iommufd_hw_pagetable_alloc() - Get an iommu_domain for a device + * @ictx: iommufd context + * @ioas: IOAS to associate the domain with + * @idev: Device to get an iommu_domain for + * @immediate_attach: True if idev should be attached to the hwpt + * + * Allocate a new iommu_domain and return it as a hw_pagetable. The HWPT + * will be linked to the given ioas and upon return the underlying iommu_domain + * is fully popoulated. 
+ * + * The caller must hold the ioas->mutex until after + * iommufd_object_abort_and_destroy() or iommufd_object_finalize() is called on + * the returned hwpt. + */ +struct iommufd_hw_pagetable * +iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, + struct iommufd_device *idev, bool immediate_attach) +{ + struct iommufd_hw_pagetable *hwpt; + int rc; + + lockdep_assert_held(&ioas->mutex); + + hwpt = iommufd_object_alloc(ictx, hwpt, IOMMUFD_OBJ_HW_PAGETABLE); + if (IS_ERR(hwpt)) + return hwpt; + + INIT_LIST_HEAD(&hwpt->hwpt_item); + /* Pairs with iommufd_hw_pagetable_destroy() */ + refcount_inc(&ioas->obj.users); + hwpt->ioas = ioas; + + hwpt->domain = iommu_domain_alloc(idev->dev->bus); + if (!hwpt->domain) { + rc = -ENOMEM; + goto out_abort; + } + + /* + * Set the coherency mode before we do iopt_table_add_domain() as some + * iommus have a per-PTE bit that controls it and need to decide before + * doing any maps. It is an iommu driver bug to report + * IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail enforce_cache_coherency on + * a new domain. + */ + if (idev->enforce_cache_coherency) { + rc = iommufd_hw_pagetable_enforce_cc(hwpt); + if (WARN_ON(rc)) + goto out_abort; + } + + /* + * immediate_attach exists only to accommodate iommu drivers that cannot + * directly allocate a domain. These drivers do not finish creating the + * domain until attach is completed. Thus we must have this call + * sequence. Once those drivers are fixed this should be removed. + */ + if (immediate_attach) { + rc = iommufd_hw_pagetable_attach(hwpt, idev); + if (rc) + goto out_abort; + } + + rc = iopt_table_add_domain(&hwpt->ioas->iopt, hwpt->domain); + if (rc) + goto out_detach; + list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list); + return hwpt; + +out_detach: + if (immediate_attach) + iommufd_hw_pagetable_detach(idev); +out_abort: + iommufd_object_abort_and_destroy(ictx, &hwpt->obj); + return ERR_PTR(rc); +} + +int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) +{ + struct iommu_hwpt_alloc *cmd = ucmd->cmd; + struct iommufd_hw_pagetable *hwpt; + struct iommufd_device *idev; + struct iommufd_ioas *ioas; + int rc; + + if (cmd->flags || cmd->__reserved) + return -EOPNOTSUPP; + + idev = iommufd_get_device(ucmd, cmd->dev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + + ioas = iommufd_get_ioas(ucmd->ictx, cmd->pt_id); + if (IS_ERR(ioas)) { + rc = PTR_ERR(ioas); + goto out_put_idev; + } + + mutex_lock(&ioas->mutex); + hwpt = iommufd_hw_pagetable_alloc(ucmd->ictx, ioas, idev, false); + if (IS_ERR(hwpt)) { + rc = PTR_ERR(hwpt); + goto out_unlock; + } + + cmd->out_hwpt_id = hwpt->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_hwpt; + iommufd_object_finalize(ucmd->ictx, &hwpt->obj); + goto out_unlock; + +out_hwpt: + iommufd_object_abort_and_destroy(ucmd->ictx, &hwpt->obj); +out_unlock: + mutex_unlock(&ioas->mutex); + iommufd_put_object(&ioas->obj); +out_put_idev: + iommufd_put_object(&idev->obj); + return rc; +} diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c new file mode 100644 index 0000000000..117a39ae2e --- /dev/null +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -0,0 +1,1244 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + * + * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The + * PFNs can be placed into an iommu_domain, or returned to the caller as a page + * list for access by an in-kernel user. 
+ * + * The datastructure uses the iopt_pages to optimize the storage of the PFNs + * between the domains and xarray. + */ +#include <linux/iommufd.h> +#include <linux/lockdep.h> +#include <linux/iommu.h> +#include <linux/sched/mm.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/errno.h> + +#include "io_pagetable.h" +#include "double_span.h" + +struct iopt_pages_list { + struct iopt_pages *pages; + struct iopt_area *area; + struct list_head next; + unsigned long start_byte; + unsigned long length; +}; + +struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter, + struct io_pagetable *iopt, + unsigned long iova, + unsigned long last_iova) +{ + lockdep_assert_held(&iopt->iova_rwsem); + + iter->cur_iova = iova; + iter->last_iova = last_iova; + iter->area = iopt_area_iter_first(iopt, iova, iova); + if (!iter->area) + return NULL; + if (!iter->area->pages) { + iter->area = NULL; + return NULL; + } + return iter->area; +} + +struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter) +{ + unsigned long last_iova; + + if (!iter->area) + return NULL; + last_iova = iopt_area_last_iova(iter->area); + if (iter->last_iova <= last_iova) + return NULL; + + iter->cur_iova = last_iova + 1; + iter->area = iopt_area_iter_next(iter->area, iter->cur_iova, + iter->last_iova); + if (!iter->area) + return NULL; + if (iter->cur_iova != iopt_area_iova(iter->area) || + !iter->area->pages) { + iter->area = NULL; + return NULL; + } + return iter->area; +} + +static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span, + unsigned long length, + unsigned long iova_alignment, + unsigned long page_offset) +{ + if (span->is_used || span->last_hole - span->start_hole < length - 1) + return false; + + span->start_hole = ALIGN(span->start_hole, iova_alignment) | + page_offset; + if (span->start_hole > span->last_hole || + span->last_hole - span->start_hole < length - 1) + return false; + return true; +} + +static bool __alloc_iova_check_used(struct interval_tree_span_iter *span, + unsigned long length, + unsigned long iova_alignment, + unsigned long page_offset) +{ + if (span->is_hole || span->last_used - span->start_used < length - 1) + return false; + + span->start_used = ALIGN(span->start_used, iova_alignment) | + page_offset; + if (span->start_used > span->last_used || + span->last_used - span->start_used < length - 1) + return false; + return true; +} + +/* + * Automatically find a block of IOVA that is not being used and not reserved. + * Does not return a 0 IOVA even if it is valid. + */ +static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova, + unsigned long uptr, unsigned long length) +{ + unsigned long page_offset = uptr % PAGE_SIZE; + struct interval_tree_double_span_iter used_span; + struct interval_tree_span_iter allowed_span; + unsigned long iova_alignment; + + lockdep_assert_held(&iopt->iova_rwsem); + + /* Protect roundup_pow-of_two() from overflow */ + if (length == 0 || length >= ULONG_MAX / 2) + return -EOVERFLOW; + + /* + * Keep alignment present in the uptr when building the IOVA, this + * increases the chance we can map a THP. 
+ */ + if (!uptr) + iova_alignment = roundup_pow_of_two(length); + else + iova_alignment = min_t(unsigned long, + roundup_pow_of_two(length), + 1UL << __ffs64(uptr)); + + if (iova_alignment < iopt->iova_alignment) + return -EINVAL; + + interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree, + PAGE_SIZE, ULONG_MAX - PAGE_SIZE) { + if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) { + allowed_span.start_used = PAGE_SIZE; + allowed_span.last_used = ULONG_MAX - PAGE_SIZE; + allowed_span.is_hole = false; + } + + if (!__alloc_iova_check_used(&allowed_span, length, + iova_alignment, page_offset)) + continue; + + interval_tree_for_each_double_span( + &used_span, &iopt->reserved_itree, &iopt->area_itree, + allowed_span.start_used, allowed_span.last_used) { + if (!__alloc_iova_check_hole(&used_span, length, + iova_alignment, + page_offset)) + continue; + + *iova = used_span.start_hole; + return 0; + } + } + return -ENOSPC; +} + +static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova, + unsigned long length) +{ + unsigned long last; + + lockdep_assert_held(&iopt->iova_rwsem); + + if ((iova & (iopt->iova_alignment - 1))) + return -EINVAL; + + if (check_add_overflow(iova, length - 1, &last)) + return -EOVERFLOW; + + /* No reserved IOVA intersects the range */ + if (iopt_reserved_iter_first(iopt, iova, last)) + return -EINVAL; + + /* Check that there is not already a mapping in the range */ + if (iopt_area_iter_first(iopt, iova, last)) + return -EEXIST; + return 0; +} + +/* + * The area takes a slice of the pages from start_bytes to start_byte + length + */ +static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area, + struct iopt_pages *pages, unsigned long iova, + unsigned long start_byte, unsigned long length, + int iommu_prot) +{ + lockdep_assert_held_write(&iopt->iova_rwsem); + + if ((iommu_prot & IOMMU_WRITE) && !pages->writable) + return -EPERM; + + area->iommu_prot = iommu_prot; + area->page_offset = start_byte % PAGE_SIZE; + if (area->page_offset & (iopt->iova_alignment - 1)) + return -EINVAL; + + area->node.start = iova; + if (check_add_overflow(iova, length - 1, &area->node.last)) + return -EOVERFLOW; + + area->pages_node.start = start_byte / PAGE_SIZE; + if (check_add_overflow(start_byte, length - 1, &area->pages_node.last)) + return -EOVERFLOW; + area->pages_node.last = area->pages_node.last / PAGE_SIZE; + if (WARN_ON(area->pages_node.last >= pages->npages)) + return -EOVERFLOW; + + /* + * The area is inserted with a NULL pages indicating it is not fully + * initialized yet. 
+ */ + area->iopt = iopt; + interval_tree_insert(&area->node, &iopt->area_itree); + return 0; +} + +static struct iopt_area *iopt_area_alloc(void) +{ + struct iopt_area *area; + + area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT); + if (!area) + return NULL; + RB_CLEAR_NODE(&area->node.rb); + RB_CLEAR_NODE(&area->pages_node.rb); + return area; +} + +static int iopt_alloc_area_pages(struct io_pagetable *iopt, + struct list_head *pages_list, + unsigned long length, unsigned long *dst_iova, + int iommu_prot, unsigned int flags) +{ + struct iopt_pages_list *elm; + unsigned long iova; + int rc = 0; + + list_for_each_entry(elm, pages_list, next) { + elm->area = iopt_area_alloc(); + if (!elm->area) + return -ENOMEM; + } + + down_write(&iopt->iova_rwsem); + if ((length & (iopt->iova_alignment - 1)) || !length) { + rc = -EINVAL; + goto out_unlock; + } + + if (flags & IOPT_ALLOC_IOVA) { + /* Use the first entry to guess the ideal IOVA alignment */ + elm = list_first_entry(pages_list, struct iopt_pages_list, + next); + rc = iopt_alloc_iova( + iopt, dst_iova, + (uintptr_t)elm->pages->uptr + elm->start_byte, length); + if (rc) + goto out_unlock; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) { + rc = -EINVAL; + goto out_unlock; + } + } else { + rc = iopt_check_iova(iopt, *dst_iova, length); + if (rc) + goto out_unlock; + } + + /* + * Areas are created with a NULL pages so that the IOVA space is + * reserved and we can unlock the iova_rwsem. + */ + iova = *dst_iova; + list_for_each_entry(elm, pages_list, next) { + rc = iopt_insert_area(iopt, elm->area, elm->pages, iova, + elm->start_byte, elm->length, iommu_prot); + if (rc) + goto out_unlock; + iova += elm->length; + } + +out_unlock: + up_write(&iopt->iova_rwsem); + return rc; +} + +static void iopt_abort_area(struct iopt_area *area) +{ + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(area->pages); + if (area->iopt) { + down_write(&area->iopt->iova_rwsem); + interval_tree_remove(&area->node, &area->iopt->area_itree); + up_write(&area->iopt->iova_rwsem); + } + kfree(area); +} + +void iopt_free_pages_list(struct list_head *pages_list) +{ + struct iopt_pages_list *elm; + + while ((elm = list_first_entry_or_null(pages_list, + struct iopt_pages_list, next))) { + if (elm->area) + iopt_abort_area(elm->area); + if (elm->pages) + iopt_put_pages(elm->pages); + list_del(&elm->next); + kfree(elm); + } +} + +static int iopt_fill_domains_pages(struct list_head *pages_list) +{ + struct iopt_pages_list *undo_elm; + struct iopt_pages_list *elm; + int rc; + + list_for_each_entry(elm, pages_list, next) { + rc = iopt_area_fill_domains(elm->area, elm->pages); + if (rc) + goto err_undo; + } + return 0; + +err_undo: + list_for_each_entry(undo_elm, pages_list, next) { + if (undo_elm == elm) + break; + iopt_area_unfill_domains(undo_elm->area, undo_elm->pages); + } + return rc; +} + +int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, + unsigned long length, unsigned long *dst_iova, + int iommu_prot, unsigned int flags) +{ + struct iopt_pages_list *elm; + int rc; + + rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova, + iommu_prot, flags); + if (rc) + return rc; + + down_read(&iopt->domains_rwsem); + rc = iopt_fill_domains_pages(pages_list); + if (rc) + goto out_unlock_domains; + + down_write(&iopt->iova_rwsem); + list_for_each_entry(elm, pages_list, next) { + /* + * area->pages must be set inside the domains_rwsem to ensure + * any newly added domains will get filled. 
Moves the reference + * in from the list. + */ + elm->area->pages = elm->pages; + elm->pages = NULL; + elm->area = NULL; + } + up_write(&iopt->iova_rwsem); +out_unlock_domains: + up_read(&iopt->domains_rwsem); + return rc; +} + +/** + * iopt_map_user_pages() - Map a user VA to an iova in the io page table + * @ictx: iommufd_ctx the iopt is part of + * @iopt: io_pagetable to act on + * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains + * the chosen iova on output. Otherwise is the iova to map to on input + * @uptr: User VA to map + * @length: Number of bytes to map + * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping + * @flags: IOPT_ALLOC_IOVA or zero + * + * iova, uptr, and length must be aligned to iova_alignment. For domain backed + * page tables this will pin the pages and load them into the domain at iova. + * For non-domain page tables this will only setup a lazy reference and the + * caller must use iopt_access_pages() to touch them. + * + * iopt_unmap_iova() must be called to undo this before the io_pagetable can be + * destroyed. + */ +int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + unsigned long *iova, void __user *uptr, + unsigned long length, int iommu_prot, + unsigned int flags) +{ + struct iopt_pages_list elm = {}; + LIST_HEAD(pages_list); + int rc; + + elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE); + if (IS_ERR(elm.pages)) + return PTR_ERR(elm.pages); + if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM && + elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER) + elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM; + elm.start_byte = uptr - elm.pages->uptr; + elm.length = length; + list_add(&elm.next, &pages_list); + + rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags); + if (rc) { + if (elm.area) + iopt_abort_area(elm.area); + if (elm.pages) + iopt_put_pages(elm.pages); + return rc; + } + return 0; +} + +int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova, + unsigned long length, struct list_head *pages_list) +{ + struct iopt_area_contig_iter iter; + unsigned long last_iova; + struct iopt_area *area; + int rc; + + if (!length) + return -EINVAL; + if (check_add_overflow(iova, length - 1, &last_iova)) + return -EOVERFLOW; + + down_read(&iopt->iova_rwsem); + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { + struct iopt_pages_list *elm; + unsigned long last = min(last_iova, iopt_area_last_iova(area)); + + elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT); + if (!elm) { + rc = -ENOMEM; + goto err_free; + } + elm->start_byte = iopt_area_start_byte(area, iter.cur_iova); + elm->pages = area->pages; + elm->length = (last - iter.cur_iova) + 1; + kref_get(&elm->pages->kref); + list_add_tail(&elm->next, pages_list); + } + if (!iopt_area_contig_done(&iter)) { + rc = -ENOENT; + goto err_free; + } + up_read(&iopt->iova_rwsem); + return 0; +err_free: + up_read(&iopt->iova_rwsem); + iopt_free_pages_list(pages_list); + return rc; +} + +static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, + unsigned long last, unsigned long *unmapped) +{ + struct iopt_area *area; + unsigned long unmapped_bytes = 0; + unsigned int tries = 0; + int rc = -ENOENT; + + /* + * The domains_rwsem must be held in read mode any time any area->pages + * is NULL. This prevents domain attach/detatch from running + * concurrently with cleaning up the area. 
+ */ +again: + down_read(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + while ((area = iopt_area_iter_first(iopt, start, last))) { + unsigned long area_last = iopt_area_last_iova(area); + unsigned long area_first = iopt_area_iova(area); + struct iopt_pages *pages; + + /* Userspace should not race map/unmap's of the same area */ + if (!area->pages) { + rc = -EBUSY; + goto out_unlock_iova; + } + + if (area_first < start || area_last > last) { + rc = -ENOENT; + goto out_unlock_iova; + } + + if (area_first != start) + tries = 0; + + /* + * num_accesses writers must hold the iova_rwsem too, so we can + * safely read it under the write side of the iovam_rwsem + * without the pages->mutex. + */ + if (area->num_accesses) { + size_t length = iopt_area_length(area); + + start = area_first; + area->prevent_access = true; + up_write(&iopt->iova_rwsem); + up_read(&iopt->domains_rwsem); + + iommufd_access_notify_unmap(iopt, area_first, length); + /* Something is not responding to unmap requests. */ + tries++; + if (WARN_ON(tries > 100)) + return -EDEADLOCK; + goto again; + } + + pages = area->pages; + area->pages = NULL; + up_write(&iopt->iova_rwsem); + + iopt_area_unfill_domains(area, pages); + iopt_abort_area(area); + iopt_put_pages(pages); + + unmapped_bytes += area_last - area_first + 1; + + down_write(&iopt->iova_rwsem); + } + if (unmapped_bytes) + rc = 0; + +out_unlock_iova: + up_write(&iopt->iova_rwsem); + up_read(&iopt->domains_rwsem); + if (unmapped) + *unmapped = unmapped_bytes; + return rc; +} + +/** + * iopt_unmap_iova() - Remove a range of iova + * @iopt: io_pagetable to act on + * @iova: Starting iova to unmap + * @length: Number of bytes to unmap + * @unmapped: Return number of bytes unmapped + * + * The requested range must be a superset of existing ranges. + * Splitting/truncating IOVA mappings is not allowed. + */ +int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, + unsigned long length, unsigned long *unmapped) +{ + unsigned long iova_last; + + if (!length) + return -EINVAL; + + if (check_add_overflow(iova, length - 1, &iova_last)) + return -EOVERFLOW; + + return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped); +} + +int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped) +{ + int rc; + + rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped); + /* If the IOVAs are empty then unmap all succeeds */ + if (rc == -ENOENT) + return 0; + return rc; +} + +/* The caller must always free all the nodes in the allowed_iova rb_root. 
*/ +int iopt_set_allow_iova(struct io_pagetable *iopt, + struct rb_root_cached *allowed_iova) +{ + struct iopt_allowed *allowed; + + down_write(&iopt->iova_rwsem); + swap(*allowed_iova, iopt->allowed_itree); + + for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed; + allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) { + if (iopt_reserved_iter_first(iopt, allowed->node.start, + allowed->node.last)) { + swap(*allowed_iova, iopt->allowed_itree); + up_write(&iopt->iova_rwsem); + return -EADDRINUSE; + } + } + up_write(&iopt->iova_rwsem); + return 0; +} + +int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start, + unsigned long last, void *owner) +{ + struct iopt_reserved *reserved; + + lockdep_assert_held_write(&iopt->iova_rwsem); + + if (iopt_area_iter_first(iopt, start, last) || + iopt_allowed_iter_first(iopt, start, last)) + return -EADDRINUSE; + + reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT); + if (!reserved) + return -ENOMEM; + reserved->node.start = start; + reserved->node.last = last; + reserved->owner = owner; + interval_tree_insert(&reserved->node, &iopt->reserved_itree); + return 0; +} + +static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner) +{ + struct iopt_reserved *reserved, *next; + + lockdep_assert_held_write(&iopt->iova_rwsem); + + for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved; + reserved = next) { + next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX); + + if (reserved->owner == owner) { + interval_tree_remove(&reserved->node, + &iopt->reserved_itree); + kfree(reserved); + } + } +} + +void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner) +{ + down_write(&iopt->iova_rwsem); + __iopt_remove_reserved_iova(iopt, owner); + up_write(&iopt->iova_rwsem); +} + +void iopt_init_table(struct io_pagetable *iopt) +{ + init_rwsem(&iopt->iova_rwsem); + init_rwsem(&iopt->domains_rwsem); + iopt->area_itree = RB_ROOT_CACHED; + iopt->allowed_itree = RB_ROOT_CACHED; + iopt->reserved_itree = RB_ROOT_CACHED; + xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT); + xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC); + + /* + * iopt's start as SW tables that can use the entire size_t IOVA space + * due to the use of size_t in the APIs. They have no alignment + * restriction. + */ + iopt->iova_alignment = 1; +} + +void iopt_destroy_table(struct io_pagetable *iopt) +{ + struct interval_tree_node *node; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + iopt_remove_reserved_iova(iopt, NULL); + + while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0, + ULONG_MAX))) { + interval_tree_remove(node, &iopt->allowed_itree); + kfree(container_of(node, struct iopt_allowed, node)); + } + + WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root)); + WARN_ON(!xa_empty(&iopt->domains)); + WARN_ON(!xa_empty(&iopt->access_list)); + WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root)); +} + +/** + * iopt_unfill_domain() - Unfill a domain with PFNs + * @iopt: io_pagetable to act on + * @domain: domain to unfill + * + * This is used when removing a domain from the iopt. Every area in the iopt + * will be unmapped from the domain. The domain must already be removed from the + * domains xarray. + */ +static void iopt_unfill_domain(struct io_pagetable *iopt, + struct iommu_domain *domain) +{ + struct iopt_area *area; + + lockdep_assert_held(&iopt->iova_rwsem); + lockdep_assert_held_write(&iopt->domains_rwsem); + + /* + * Some other domain is holding all the pfns still, rapidly unmap this + * domain. 
+ */ + if (iopt->next_domain_id != 0) { + /* Pick an arbitrary remaining domain to act as storage */ + struct iommu_domain *storage_domain = + xa_load(&iopt->domains, 0); + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + struct iopt_pages *pages = area->pages; + + if (!pages) + continue; + + mutex_lock(&pages->mutex); + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(!area->storage_domain); + if (area->storage_domain == domain) + area->storage_domain = storage_domain; + mutex_unlock(&pages->mutex); + + iopt_area_unmap_domain(area, domain); + } + return; + } + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + struct iopt_pages *pages = area->pages; + + if (!pages) + continue; + + mutex_lock(&pages->mutex); + interval_tree_remove(&area->pages_node, &pages->domains_itree); + WARN_ON(area->storage_domain != domain); + area->storage_domain = NULL; + iopt_area_unfill_domain(area, pages, domain); + mutex_unlock(&pages->mutex); + } +} + +/** + * iopt_fill_domain() - Fill a domain with PFNs + * @iopt: io_pagetable to act on + * @domain: domain to fill + * + * Fill the domain with PFNs from every area in the iopt. On failure the domain + * is left unchanged. + */ +static int iopt_fill_domain(struct io_pagetable *iopt, + struct iommu_domain *domain) +{ + struct iopt_area *end_area; + struct iopt_area *area; + int rc; + + lockdep_assert_held(&iopt->iova_rwsem); + lockdep_assert_held_write(&iopt->domains_rwsem); + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + struct iopt_pages *pages = area->pages; + + if (!pages) + continue; + + mutex_lock(&pages->mutex); + rc = iopt_area_fill_domain(area, domain); + if (rc) { + mutex_unlock(&pages->mutex); + goto out_unfill; + } + if (!area->storage_domain) { + WARN_ON(iopt->next_domain_id != 0); + area->storage_domain = domain; + interval_tree_insert(&area->pages_node, + &pages->domains_itree); + } + mutex_unlock(&pages->mutex); + } + return 0; + +out_unfill: + end_area = area; + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + struct iopt_pages *pages = area->pages; + + if (area == end_area) + break; + if (!pages) + continue; + mutex_lock(&pages->mutex); + if (iopt->next_domain_id == 0) { + interval_tree_remove(&area->pages_node, + &pages->domains_itree); + area->storage_domain = NULL; + } + iopt_area_unfill_domain(area, pages, domain); + mutex_unlock(&pages->mutex); + } + return rc; +} + +/* All existing area's conform to an increased page size */ +static int iopt_check_iova_alignment(struct io_pagetable *iopt, + unsigned long new_iova_alignment) +{ + unsigned long align_mask = new_iova_alignment - 1; + struct iopt_area *area; + + lockdep_assert_held(&iopt->iova_rwsem); + lockdep_assert_held(&iopt->domains_rwsem); + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) + if ((iopt_area_iova(area) & align_mask) || + (iopt_area_length(area) & align_mask) || + (area->page_offset & align_mask)) + return -EADDRINUSE; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) { + struct iommufd_access *access; + unsigned long index; + + xa_for_each(&iopt->access_list, index, access) + if (WARN_ON(access->iova_alignment > + new_iova_alignment)) + return -EADDRINUSE; + } + return 0; +} + +int iopt_table_add_domain(struct io_pagetable *iopt, + struct iommu_domain 
*domain) +{ + const struct iommu_domain_geometry *geometry = &domain->geometry; + struct iommu_domain *iter_domain; + unsigned int new_iova_alignment; + unsigned long index; + int rc; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + + xa_for_each(&iopt->domains, index, iter_domain) { + if (WARN_ON(iter_domain == domain)) { + rc = -EEXIST; + goto out_unlock; + } + } + + /* + * The io page size drives the iova_alignment. Internally the iopt_pages + * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE + * objects into the iommu_domain. + * + * A iommu_domain must always be able to accept PAGE_SIZE to be + * compatible as we can't guarantee higher contiguity. + */ + new_iova_alignment = max_t(unsigned long, + 1UL << __ffs(domain->pgsize_bitmap), + iopt->iova_alignment); + if (new_iova_alignment > PAGE_SIZE) { + rc = -EINVAL; + goto out_unlock; + } + if (new_iova_alignment != iopt->iova_alignment) { + rc = iopt_check_iova_alignment(iopt, new_iova_alignment); + if (rc) + goto out_unlock; + } + + /* No area exists that is outside the allowed domain aperture */ + if (geometry->aperture_start != 0) { + rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1, + domain); + if (rc) + goto out_reserved; + } + if (geometry->aperture_end != ULONG_MAX) { + rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1, + ULONG_MAX, domain); + if (rc) + goto out_reserved; + } + + rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL); + if (rc) + goto out_reserved; + + rc = iopt_fill_domain(iopt, domain); + if (rc) + goto out_release; + + iopt->iova_alignment = new_iova_alignment; + xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL); + iopt->next_domain_id++; + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); + return 0; +out_release: + xa_release(&iopt->domains, iopt->next_domain_id); +out_reserved: + __iopt_remove_reserved_iova(iopt, domain); +out_unlock: + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); + return rc; +} + +static int iopt_calculate_iova_alignment(struct io_pagetable *iopt) +{ + unsigned long new_iova_alignment; + struct iommufd_access *access; + struct iommu_domain *domain; + unsigned long index; + + lockdep_assert_held_write(&iopt->iova_rwsem); + lockdep_assert_held(&iopt->domains_rwsem); + + /* See batch_iommu_map_small() */ + if (iopt->disable_large_pages) + new_iova_alignment = PAGE_SIZE; + else + new_iova_alignment = 1; + + xa_for_each(&iopt->domains, index, domain) + new_iova_alignment = max_t(unsigned long, + 1UL << __ffs(domain->pgsize_bitmap), + new_iova_alignment); + xa_for_each(&iopt->access_list, index, access) + new_iova_alignment = max_t(unsigned long, + access->iova_alignment, + new_iova_alignment); + + if (new_iova_alignment > iopt->iova_alignment) { + int rc; + + rc = iopt_check_iova_alignment(iopt, new_iova_alignment); + if (rc) + return rc; + } + iopt->iova_alignment = new_iova_alignment; + return 0; +} + +void iopt_table_remove_domain(struct io_pagetable *iopt, + struct iommu_domain *domain) +{ + struct iommu_domain *iter_domain = NULL; + unsigned long index; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + + xa_for_each(&iopt->domains, index, iter_domain) + if (iter_domain == domain) + break; + if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id) + goto out_unlock; + + /* + * Compress the xarray to keep it linear by swapping the entry to erase + * with the tail entry and shrinking the tail. 
+ */ + iopt->next_domain_id--; + iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id); + if (index != iopt->next_domain_id) + xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL); + + iopt_unfill_domain(iopt, domain); + __iopt_remove_reserved_iova(iopt, domain); + + WARN_ON(iopt_calculate_iova_alignment(iopt)); +out_unlock: + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); +} + +/** + * iopt_area_split - Split an area into two parts at iova + * @area: The area to split + * @iova: Becomes the last of a new area + * + * This splits an area into two. It is part of the VFIO compatibility to allow + * poking a hole in the mapping. The two areas continue to point at the same + * iopt_pages, just with different starting bytes. + */ +static int iopt_area_split(struct iopt_area *area, unsigned long iova) +{ + unsigned long alignment = area->iopt->iova_alignment; + unsigned long last_iova = iopt_area_last_iova(area); + unsigned long start_iova = iopt_area_iova(area); + unsigned long new_start = iova + 1; + struct io_pagetable *iopt = area->iopt; + struct iopt_pages *pages = area->pages; + struct iopt_area *lhs; + struct iopt_area *rhs; + int rc; + + lockdep_assert_held_write(&iopt->iova_rwsem); + + if (iova == start_iova || iova == last_iova) + return 0; + + if (!pages || area->prevent_access) + return -EBUSY; + + if (new_start & (alignment - 1) || + iopt_area_start_byte(area, new_start) & (alignment - 1)) + return -EINVAL; + + lhs = iopt_area_alloc(); + if (!lhs) + return -ENOMEM; + + rhs = iopt_area_alloc(); + if (!rhs) { + rc = -ENOMEM; + goto err_free_lhs; + } + + mutex_lock(&pages->mutex); + /* + * Splitting is not permitted if an access exists, we don't track enough + * information to split existing accesses. + */ + if (area->num_accesses) { + rc = -EINVAL; + goto err_unlock; + } + + /* + * Splitting is not permitted if a domain could have been mapped with + * huge pages. + */ + if (area->storage_domain && !iopt->disable_large_pages) { + rc = -EINVAL; + goto err_unlock; + } + + interval_tree_remove(&area->node, &iopt->area_itree); + rc = iopt_insert_area(iopt, lhs, area->pages, start_iova, + iopt_area_start_byte(area, start_iova), + (new_start - 1) - start_iova + 1, + area->iommu_prot); + if (WARN_ON(rc)) + goto err_insert; + + rc = iopt_insert_area(iopt, rhs, area->pages, new_start, + iopt_area_start_byte(area, new_start), + last_iova - new_start + 1, area->iommu_prot); + if (WARN_ON(rc)) + goto err_remove_lhs; + + /* + * If the original area has filled a domain, domains_itree has to be + * updated. 
+ */ + if (area->storage_domain) { + interval_tree_remove(&area->pages_node, &pages->domains_itree); + interval_tree_insert(&lhs->pages_node, &pages->domains_itree); + interval_tree_insert(&rhs->pages_node, &pages->domains_itree); + } + + lhs->storage_domain = area->storage_domain; + lhs->pages = area->pages; + rhs->storage_domain = area->storage_domain; + rhs->pages = area->pages; + kref_get(&rhs->pages->kref); + kfree(area); + mutex_unlock(&pages->mutex); + + /* + * No change to domains or accesses because the pages hasn't been + * changed + */ + return 0; + +err_remove_lhs: + interval_tree_remove(&lhs->node, &iopt->area_itree); +err_insert: + interval_tree_insert(&area->node, &iopt->area_itree); +err_unlock: + mutex_unlock(&pages->mutex); + kfree(rhs); +err_free_lhs: + kfree(lhs); + return rc; +} + +int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas, + size_t num_iovas) +{ + int rc = 0; + int i; + + down_write(&iopt->iova_rwsem); + for (i = 0; i < num_iovas; i++) { + struct iopt_area *area; + + area = iopt_area_iter_first(iopt, iovas[i], iovas[i]); + if (!area) + continue; + rc = iopt_area_split(area, iovas[i]); + if (rc) + break; + } + up_write(&iopt->iova_rwsem); + return rc; +} + +void iopt_enable_large_pages(struct io_pagetable *iopt) +{ + int rc; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + WRITE_ONCE(iopt->disable_large_pages, false); + rc = iopt_calculate_iova_alignment(iopt); + WARN_ON(rc); + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); +} + +int iopt_disable_large_pages(struct io_pagetable *iopt) +{ + int rc = 0; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + if (iopt->disable_large_pages) + goto out_unlock; + + /* Won't do it if domains already have pages mapped in them */ + if (!xa_empty(&iopt->domains) && + !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) { + rc = -EINVAL; + goto out_unlock; + } + + WRITE_ONCE(iopt->disable_large_pages, true); + rc = iopt_calculate_iova_alignment(iopt); + if (rc) + WRITE_ONCE(iopt->disable_large_pages, false); +out_unlock: + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); + return rc; +} + +int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access) +{ + int rc; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + rc = xa_alloc(&iopt->access_list, &access->iopt_access_list_id, access, + xa_limit_16b, GFP_KERNEL_ACCOUNT); + if (rc) + goto out_unlock; + + rc = iopt_calculate_iova_alignment(iopt); + if (rc) { + xa_erase(&iopt->access_list, access->iopt_access_list_id); + goto out_unlock; + } + +out_unlock: + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); + return rc; +} + +void iopt_remove_access(struct io_pagetable *iopt, + struct iommufd_access *access, + u32 iopt_access_list_id) +{ + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access); + WARN_ON(iopt_calculate_iova_alignment(iopt)); + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); +} + +/* Narrow the valid_iova_itree to include reserved ranges from a device. 
*/ +int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt, + struct device *dev, + phys_addr_t *sw_msi_start) +{ + struct iommu_resv_region *resv; + LIST_HEAD(resv_regions); + unsigned int num_hw_msi = 0; + unsigned int num_sw_msi = 0; + int rc; + + if (iommufd_should_fail()) + return -EINVAL; + + down_write(&iopt->iova_rwsem); + /* FIXME: drivers allocate memory but there is no failure propogated */ + iommu_get_resv_regions(dev, &resv_regions); + + list_for_each_entry(resv, &resv_regions, list) { + if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE) + continue; + + if (sw_msi_start && resv->type == IOMMU_RESV_MSI) + num_hw_msi++; + if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) { + *sw_msi_start = resv->start; + num_sw_msi++; + } + + rc = iopt_reserve_iova(iopt, resv->start, + resv->length - 1 + resv->start, dev); + if (rc) + goto out_reserved; + } + + /* Drivers must offer sane combinations of regions */ + if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) { + rc = -EINVAL; + goto out_reserved; + } + + rc = 0; + goto out_free_resv; + +out_reserved: + __iopt_remove_reserved_iova(iopt, dev); +out_free_resv: + iommu_put_resv_regions(dev, &resv_regions); + up_write(&iopt->iova_rwsem); + return rc; +} diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h new file mode 100644 index 0000000000..0ec3509b7e --- /dev/null +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -0,0 +1,241 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + * + */ +#ifndef __IO_PAGETABLE_H +#define __IO_PAGETABLE_H + +#include <linux/interval_tree.h> +#include <linux/mutex.h> +#include <linux/kref.h> +#include <linux/xarray.h> + +#include "iommufd_private.h" + +struct iommu_domain; + +/* + * Each io_pagetable is composed of intervals of areas which cover regions of + * the iova that are backed by something. iova not covered by areas is not + * populated in the page table. Each area is fully populated with pages. + * + * iovas are in byte units, but must be iopt->iova_alignment aligned. + * + * pages can be NULL, this means some other thread is still working on setting + * up or tearing down the area. When observed under the write side of the + * domain_rwsem a NULL pages must mean the area is still being setup and no + * domains are filled. + * + * storage_domain points at an arbitrary iommu_domain that is holding the PFNs + * for this area. It is locked by the pages->mutex. This simplifies the locking + * as the pages code can rely on the storage_domain without having to get the + * iopt->domains_rwsem. 
+ * + * The io_pagetable::iova_rwsem protects node + * The iopt_pages::mutex protects pages_node + * iopt and iommu_prot are immutable + * The pages::mutex protects num_accesses + */ +struct iopt_area { + struct interval_tree_node node; + struct interval_tree_node pages_node; + struct io_pagetable *iopt; + struct iopt_pages *pages; + struct iommu_domain *storage_domain; + /* How many bytes into the first page the area starts */ + unsigned int page_offset; + /* IOMMU_READ, IOMMU_WRITE, etc */ + int iommu_prot; + bool prevent_access : 1; + unsigned int num_accesses; +}; + +struct iopt_allowed { + struct interval_tree_node node; +}; + +struct iopt_reserved { + struct interval_tree_node node; + void *owner; +}; + +int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages); +void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages); + +int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain); +void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, + struct iommu_domain *domain); +void iopt_area_unmap_domain(struct iopt_area *area, + struct iommu_domain *domain); + +static inline unsigned long iopt_area_index(struct iopt_area *area) +{ + return area->pages_node.start; +} + +static inline unsigned long iopt_area_last_index(struct iopt_area *area) +{ + return area->pages_node.last; +} + +static inline unsigned long iopt_area_iova(struct iopt_area *area) +{ + return area->node.start; +} + +static inline unsigned long iopt_area_last_iova(struct iopt_area *area) +{ + return area->node.last; +} + +static inline size_t iopt_area_length(struct iopt_area *area) +{ + return (area->node.last - area->node.start) + 1; +} + +/* + * Number of bytes from the start of the iopt_pages that the iova begins. 
+ * iopt_area_start_byte() / PAGE_SIZE encodes the starting page index + * iopt_area_start_byte() % PAGE_SIZE encodes the offset within that page + */ +static inline unsigned long iopt_area_start_byte(struct iopt_area *area, + unsigned long iova) +{ + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(iova < iopt_area_iova(area) || + iova > iopt_area_last_iova(area)); + return (iova - iopt_area_iova(area)) + area->page_offset + + iopt_area_index(area) * PAGE_SIZE; +} + +static inline unsigned long iopt_area_iova_to_index(struct iopt_area *area, + unsigned long iova) +{ + return iopt_area_start_byte(area, iova) / PAGE_SIZE; +} + +#define __make_iopt_iter(name) \ + static inline struct iopt_##name *iopt_##name##_iter_first( \ + struct io_pagetable *iopt, unsigned long start, \ + unsigned long last) \ + { \ + struct interval_tree_node *node; \ + \ + lockdep_assert_held(&iopt->iova_rwsem); \ + node = interval_tree_iter_first(&iopt->name##_itree, start, \ + last); \ + if (!node) \ + return NULL; \ + return container_of(node, struct iopt_##name, node); \ + } \ + static inline struct iopt_##name *iopt_##name##_iter_next( \ + struct iopt_##name *last_node, unsigned long start, \ + unsigned long last) \ + { \ + struct interval_tree_node *node; \ + \ + node = interval_tree_iter_next(&last_node->node, start, last); \ + if (!node) \ + return NULL; \ + return container_of(node, struct iopt_##name, node); \ + } + +__make_iopt_iter(area) +__make_iopt_iter(allowed) +__make_iopt_iter(reserved) + +struct iopt_area_contig_iter { + unsigned long cur_iova; + unsigned long last_iova; + struct iopt_area *area; +}; +struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter, + struct io_pagetable *iopt, + unsigned long iova, + unsigned long last_iova); +struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter); + +static inline bool iopt_area_contig_done(struct iopt_area_contig_iter *iter) +{ + return iter->area && iter->last_iova <= iopt_area_last_iova(iter->area); +} + +/* + * Iterate over a contiguous list of areas that span the iova,last_iova range. + * The caller must check iopt_area_contig_done() after the loop to see if + * contiguous areas existed. + */ +#define iopt_for_each_contig_area(iter, area, iopt, iova, last_iova) \ + for (area = iopt_area_contig_init(iter, iopt, iova, last_iova); area; \ + area = iopt_area_contig_next(iter)) + +enum { + IOPT_PAGES_ACCOUNT_NONE = 0, + IOPT_PAGES_ACCOUNT_USER = 1, + IOPT_PAGES_ACCOUNT_MM = 2, +}; + +/* + * This holds a pinned page list for multiple areas of IO address space. The + * pages always originate from a linear chunk of userspace VA. Multiple + * io_pagetable's, through their iopt_area's, can share a single iopt_pages + * which avoids multi-pinning and double accounting of page consumption. + * + * indexes in this structure are measured in PAGE_SIZE units, are 0 based from + * the start of the uptr and extend to npages. pages are pinned dynamically + * according to the intervals in the access_itree and domains_itree, npinned + * records the current number of pages pinned. 
+ */ +struct iopt_pages { + struct kref kref; + struct mutex mutex; + size_t npages; + size_t npinned; + size_t last_npinned; + struct task_struct *source_task; + struct mm_struct *source_mm; + struct user_struct *source_user; + void __user *uptr; + bool writable:1; + u8 account_mode; + + struct xarray pinned_pfns; + /* Of iopt_pages_access::node */ + struct rb_root_cached access_itree; + /* Of iopt_area::pages_node */ + struct rb_root_cached domains_itree; +}; + +struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, + bool writable); +void iopt_release_pages(struct kref *kref); +static inline void iopt_put_pages(struct iopt_pages *pages) +{ + kref_put(&pages->kref, iopt_release_pages); +} + +void iopt_pages_fill_from_xarray(struct iopt_pages *pages, unsigned long start, + unsigned long last, struct page **out_pages); +int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start, + unsigned long last, struct page **out_pages); +void iopt_pages_unfill_xarray(struct iopt_pages *pages, unsigned long start, + unsigned long last); + +int iopt_area_add_access(struct iopt_area *area, unsigned long start, + unsigned long last, struct page **out_pages, + unsigned int flags); +void iopt_area_remove_access(struct iopt_area *area, unsigned long start, + unsigned long last); +int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, + void *data, unsigned long length, unsigned int flags); + +/* + * Each interval represents an active iopt_access_pages(), it acts as an + * interval lock that keeps the PFNs pinned and stored in the xarray. + */ +struct iopt_pages_access { + struct interval_tree_node node; + unsigned int users; +}; + +#endif diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c new file mode 100644 index 0000000000..d5624577f7 --- /dev/null +++ b/drivers/iommu/iommufd/ioas.c @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include <linux/interval_tree.h> +#include <linux/iommufd.h> +#include <linux/iommu.h> +#include <uapi/linux/iommufd.h> + +#include "io_pagetable.h" + +void iommufd_ioas_destroy(struct iommufd_object *obj) +{ + struct iommufd_ioas *ioas = container_of(obj, struct iommufd_ioas, obj); + int rc; + + rc = iopt_unmap_all(&ioas->iopt, NULL); + WARN_ON(rc && rc != -ENOENT); + iopt_destroy_table(&ioas->iopt); + mutex_destroy(&ioas->mutex); +} + +struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx) +{ + struct iommufd_ioas *ioas; + + ioas = iommufd_object_alloc(ictx, ioas, IOMMUFD_OBJ_IOAS); + if (IS_ERR(ioas)) + return ioas; + + iopt_init_table(&ioas->iopt); + INIT_LIST_HEAD(&ioas->hwpt_list); + mutex_init(&ioas->mutex); + return ioas; +} + +int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_alloc *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + int rc; + + if (cmd->flags) + return -EOPNOTSUPP; + + ioas = iommufd_ioas_alloc(ucmd->ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + cmd->out_ioas_id = ioas->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_table; + iommufd_object_finalize(ucmd->ictx, &ioas->obj); + return 0; + +out_table: + iommufd_object_abort_and_destroy(ucmd->ictx, &ioas->obj); + return rc; +} + +int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd) +{ + struct iommu_iova_range __user *ranges; + struct iommu_ioas_iova_ranges *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + struct interval_tree_span_iter span; + u32 max_iovas; + int rc; 
+ + if (cmd->__reserved) + return -EOPNOTSUPP; + + ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + down_read(&ioas->iopt.iova_rwsem); + max_iovas = cmd->num_iovas; + ranges = u64_to_user_ptr(cmd->allowed_iovas); + cmd->num_iovas = 0; + cmd->out_iova_alignment = ioas->iopt.iova_alignment; + interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0, + ULONG_MAX) { + if (!span.is_hole) + continue; + if (cmd->num_iovas < max_iovas) { + struct iommu_iova_range elm = { + .start = span.start_hole, + .last = span.last_hole, + }; + + if (copy_to_user(&ranges[cmd->num_iovas], &elm, + sizeof(elm))) { + rc = -EFAULT; + goto out_put; + } + } + cmd->num_iovas++; + } + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_put; + if (cmd->num_iovas > max_iovas) + rc = -EMSGSIZE; +out_put: + up_read(&ioas->iopt.iova_rwsem); + iommufd_put_object(&ioas->obj); + return rc; +} + +static int iommufd_ioas_load_iovas(struct rb_root_cached *itree, + struct iommu_iova_range __user *ranges, + u32 num) +{ + u32 i; + + for (i = 0; i != num; i++) { + struct iommu_iova_range range; + struct iopt_allowed *allowed; + + if (copy_from_user(&range, ranges + i, sizeof(range))) + return -EFAULT; + + if (range.start >= range.last) + return -EINVAL; + + if (interval_tree_iter_first(itree, range.start, range.last)) + return -EINVAL; + + allowed = kzalloc(sizeof(*allowed), GFP_KERNEL_ACCOUNT); + if (!allowed) + return -ENOMEM; + allowed->node.start = range.start; + allowed->node.last = range.last; + + interval_tree_insert(&allowed->node, itree); + } + return 0; +} + +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_allow_iovas *cmd = ucmd->cmd; + struct rb_root_cached allowed_iova = RB_ROOT_CACHED; + struct interval_tree_node *node; + struct iommufd_ioas *ioas; + struct io_pagetable *iopt; + int rc = 0; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + iopt = &ioas->iopt; + + rc = iommufd_ioas_load_iovas(&allowed_iova, + u64_to_user_ptr(cmd->allowed_iovas), + cmd->num_iovas); + if (rc) + goto out_free; + + /* + * We want the allowed tree update to be atomic, so we have to keep the + * original nodes around, and keep track of the new nodes as we allocate + * memory for them. The simplest solution is to have a new/old tree and + * then swap new for old. On success we free the old tree, on failure we + * free the new tree. + */ + rc = iopt_set_allow_iova(iopt, &allowed_iova); +out_free: + while ((node = interval_tree_iter_first(&allowed_iova, 0, ULONG_MAX))) { + interval_tree_remove(node, &allowed_iova); + kfree(container_of(node, struct iopt_allowed, node)); + } + iommufd_put_object(&ioas->obj); + return rc; +} + +static int conv_iommu_prot(u32 map_flags) +{ + /* + * We provide no manual cache coherency ioctls to userspace and most + * architectures make the CPU ops for cache flushing privileged. + * Therefore we require the underlying IOMMU to support CPU coherent + * operation. Support for IOMMU_CACHE is enforced by the + * IOMMU_CAP_CACHE_COHERENCY test during bind. 
+ */ + int iommu_prot = IOMMU_CACHE; + + if (map_flags & IOMMU_IOAS_MAP_WRITEABLE) + iommu_prot |= IOMMU_WRITE; + if (map_flags & IOMMU_IOAS_MAP_READABLE) + iommu_prot |= IOMMU_READ; + return iommu_prot; +} + +int iommufd_ioas_map(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_map *cmd = ucmd->cmd; + unsigned long iova = cmd->iova; + struct iommufd_ioas *ioas; + unsigned int flags = 0; + int rc; + + if ((cmd->flags & + ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) || + cmd->__reserved) + return -EOPNOTSUPP; + if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) + return -EOVERFLOW; + + ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) + flags = IOPT_ALLOC_IOVA; + rc = iopt_map_user_pages(ucmd->ictx, &ioas->iopt, &iova, + u64_to_user_ptr(cmd->user_va), cmd->length, + conv_iommu_prot(cmd->flags), flags); + if (rc) + goto out_put; + + cmd->iova = iova; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); +out_put: + iommufd_put_object(&ioas->obj); + return rc; +} + +int iommufd_ioas_copy(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_copy *cmd = ucmd->cmd; + struct iommufd_ioas *src_ioas; + struct iommufd_ioas *dst_ioas; + unsigned int flags = 0; + LIST_HEAD(pages_list); + unsigned long iova; + int rc; + + iommufd_test_syz_conv_iova_id(ucmd, cmd->src_ioas_id, &cmd->src_iova, + &cmd->flags); + + if ((cmd->flags & + ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE))) + return -EOPNOTSUPP; + if (cmd->length >= ULONG_MAX || cmd->src_iova >= ULONG_MAX || + cmd->dst_iova >= ULONG_MAX) + return -EOVERFLOW; + + src_ioas = iommufd_get_ioas(ucmd->ictx, cmd->src_ioas_id); + if (IS_ERR(src_ioas)) + return PTR_ERR(src_ioas); + rc = iopt_get_pages(&src_ioas->iopt, cmd->src_iova, cmd->length, + &pages_list); + iommufd_put_object(&src_ioas->obj); + if (rc) + return rc; + + dst_ioas = iommufd_get_ioas(ucmd->ictx, cmd->dst_ioas_id); + if (IS_ERR(dst_ioas)) { + rc = PTR_ERR(dst_ioas); + goto out_pages; + } + + if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) + flags = IOPT_ALLOC_IOVA; + iova = cmd->dst_iova; + rc = iopt_map_pages(&dst_ioas->iopt, &pages_list, cmd->length, &iova, + conv_iommu_prot(cmd->flags), flags); + if (rc) + goto out_put_dst; + + cmd->dst_iova = iova; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); +out_put_dst: + iommufd_put_object(&dst_ioas->obj); +out_pages: + iopt_free_pages_list(&pages_list); + return rc; +} + +int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_unmap *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + unsigned long unmapped = 0; + int rc; + + ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + if (cmd->iova == 0 && cmd->length == U64_MAX) { + rc = iopt_unmap_all(&ioas->iopt, &unmapped); + if (rc) + goto out_put; + } else { + if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) { + rc = -EOVERFLOW; + goto out_put; + } + rc = iopt_unmap_iova(&ioas->iopt, cmd->iova, cmd->length, + &unmapped); + if (rc) + goto out_put; + } + + cmd->length = unmapped; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + +out_put: + iommufd_put_object(&ioas->obj); + return rc; +} + +int iommufd_option_rlimit_mode(struct iommu_option *cmd, + struct iommufd_ctx *ictx) +{ + if (cmd->object_id) + return -EOPNOTSUPP; + + if (cmd->op == IOMMU_OPTION_OP_GET) { + cmd->val64 = ictx->account_mode == IOPT_PAGES_ACCOUNT_MM; + return 0; + } + if (cmd->op == 
IOMMU_OPTION_OP_SET) { + int rc = 0; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + xa_lock(&ictx->objects); + if (!xa_empty(&ictx->objects)) { + rc = -EBUSY; + } else { + if (cmd->val64 == 0) + ictx->account_mode = IOPT_PAGES_ACCOUNT_USER; + else if (cmd->val64 == 1) + ictx->account_mode = IOPT_PAGES_ACCOUNT_MM; + else + rc = -EINVAL; + } + xa_unlock(&ictx->objects); + + return rc; + } + return -EOPNOTSUPP; +} + +static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd, + struct iommufd_ioas *ioas) +{ + if (cmd->op == IOMMU_OPTION_OP_GET) { + cmd->val64 = !ioas->iopt.disable_large_pages; + return 0; + } + if (cmd->op == IOMMU_OPTION_OP_SET) { + if (cmd->val64 == 0) + return iopt_disable_large_pages(&ioas->iopt); + if (cmd->val64 == 1) { + iopt_enable_large_pages(&ioas->iopt); + return 0; + } + return -EINVAL; + } + return -EOPNOTSUPP; +} + +int iommufd_ioas_option(struct iommufd_ucmd *ucmd) +{ + struct iommu_option *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + int rc = 0; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + ioas = iommufd_get_ioas(ucmd->ictx, cmd->object_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + switch (cmd->option_id) { + case IOMMU_OPTION_HUGE_PAGES: + rc = iommufd_ioas_option_huge_pages(cmd, ioas); + break; + default: + rc = -EOPNOTSUPP; + } + + iommufd_put_object(&ioas->obj); + return rc; +} diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h new file mode 100644 index 0000000000..2c58670011 --- /dev/null +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -0,0 +1,351 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#ifndef __IOMMUFD_PRIVATE_H +#define __IOMMUFD_PRIVATE_H + +#include <linux/rwsem.h> +#include <linux/xarray.h> +#include <linux/refcount.h> +#include <linux/uaccess.h> + +struct iommu_domain; +struct iommu_group; +struct iommu_option; +struct iommufd_device; + +struct iommufd_ctx { + struct file *file; + struct xarray objects; + struct xarray groups; + + u8 account_mode; + /* Compatibility with VFIO no iommu */ + u8 no_iommu_mode; + struct iommufd_ioas *vfio_ioas; +}; + +/* + * The IOVA to PFN map. The map automatically copies the PFNs into multiple + * domains and permits sharing of PFNs between io_pagetable instances. This + * supports both a design where IOAS's are 1:1 with a domain (eg because the + * domain is HW customized), or where the IOAS is 1:N with multiple generic + * domains. The io_pagetable holds an interval tree of iopt_areas which point + * to shared iopt_pages which hold the pfns mapped to the page table. 
+ * + * The locking order is domains_rwsem -> iova_rwsem -> pages::mutex + */ +struct io_pagetable { + struct rw_semaphore domains_rwsem; + struct xarray domains; + struct xarray access_list; + unsigned int next_domain_id; + + struct rw_semaphore iova_rwsem; + struct rb_root_cached area_itree; + /* IOVA that cannot become reserved, struct iopt_allowed */ + struct rb_root_cached allowed_itree; + /* IOVA that cannot be allocated, struct iopt_reserved */ + struct rb_root_cached reserved_itree; + u8 disable_large_pages; + unsigned long iova_alignment; +}; + +void iopt_init_table(struct io_pagetable *iopt); +void iopt_destroy_table(struct io_pagetable *iopt); +int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova, + unsigned long length, struct list_head *pages_list); +void iopt_free_pages_list(struct list_head *pages_list); +enum { + IOPT_ALLOC_IOVA = 1 << 0, +}; +int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + unsigned long *iova, void __user *uptr, + unsigned long length, int iommu_prot, + unsigned int flags); +int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, + unsigned long length, unsigned long *dst_iova, + int iommu_prot, unsigned int flags); +int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, + unsigned long length, unsigned long *unmapped); +int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped); + +void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, + unsigned long length); +int iopt_table_add_domain(struct io_pagetable *iopt, + struct iommu_domain *domain); +void iopt_table_remove_domain(struct io_pagetable *iopt, + struct iommu_domain *domain); +int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt, + struct device *dev, + phys_addr_t *sw_msi_start); +int iopt_set_allow_iova(struct io_pagetable *iopt, + struct rb_root_cached *allowed_iova); +int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start, + unsigned long last, void *owner); +void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner); +int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas, + size_t num_iovas); +void iopt_enable_large_pages(struct io_pagetable *iopt); +int iopt_disable_large_pages(struct io_pagetable *iopt); + +struct iommufd_ucmd { + struct iommufd_ctx *ictx; + void __user *ubuffer; + u32 user_size; + void *cmd; +}; + +int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd, + unsigned long arg); + +/* Copy the response in ucmd->cmd back to userspace. */ +static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd, + size_t cmd_len) +{ + if (copy_to_user(ucmd->ubuffer, ucmd->cmd, + min_t(size_t, ucmd->user_size, cmd_len))) + return -EFAULT; + return 0; +} + +enum iommufd_object_type { + IOMMUFD_OBJ_NONE, + IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, + IOMMUFD_OBJ_DEVICE, + IOMMUFD_OBJ_HW_PAGETABLE, + IOMMUFD_OBJ_IOAS, + IOMMUFD_OBJ_ACCESS, +#ifdef CONFIG_IOMMUFD_TEST + IOMMUFD_OBJ_SELFTEST, +#endif + IOMMUFD_OBJ_MAX, +}; + +/* Base struct for all objects with a userspace ID handle. 
*/ +struct iommufd_object { + struct rw_semaphore destroy_rwsem; + refcount_t users; + enum iommufd_object_type type; + unsigned int id; +}; + +static inline bool iommufd_lock_obj(struct iommufd_object *obj) +{ + if (!down_read_trylock(&obj->destroy_rwsem)) + return false; + if (!refcount_inc_not_zero(&obj->users)) { + up_read(&obj->destroy_rwsem); + return false; + } + return true; +} + +struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, + enum iommufd_object_type type); +static inline void iommufd_put_object(struct iommufd_object *obj) +{ + refcount_dec(&obj->users); + up_read(&obj->destroy_rwsem); +} + +void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj); +void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, + struct iommufd_object *obj); +void iommufd_object_finalize(struct iommufd_ctx *ictx, + struct iommufd_object *obj); +void __iommufd_object_destroy_user(struct iommufd_ctx *ictx, + struct iommufd_object *obj, bool allow_fail); +static inline void iommufd_object_destroy_user(struct iommufd_ctx *ictx, + struct iommufd_object *obj) +{ + __iommufd_object_destroy_user(ictx, obj, false); +} +static inline void iommufd_object_deref_user(struct iommufd_ctx *ictx, + struct iommufd_object *obj) +{ + __iommufd_object_destroy_user(ictx, obj, true); +} + +struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, + size_t size, + enum iommufd_object_type type); + +#define iommufd_object_alloc(ictx, ptr, type) \ + container_of(_iommufd_object_alloc( \ + ictx, \ + sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \ + offsetof(typeof(*(ptr)), \ + obj) != 0), \ + type), \ + typeof(*(ptr)), obj) + +/* + * The IO Address Space (IOAS) pagetable is a virtual page table backed by the + * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The + * mapping is copied into all of the associated domains and made available to + * in-kernel users. + * + * Every iommu_domain that is created is wrapped in a iommufd_hw_pagetable + * object. When we go to attach a device to an IOAS we need to get an + * iommu_domain and wrapping iommufd_hw_pagetable for it. + * + * An iommu_domain & iommfd_hw_pagetable will be automatically selected + * for a device based on the hwpt_list. If no suitable iommu_domain + * is found a new iommu_domain will be created. + */ +struct iommufd_ioas { + struct iommufd_object obj; + struct io_pagetable iopt; + struct mutex mutex; + struct list_head hwpt_list; +}; + +static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ctx *ictx, + u32 id) +{ + return container_of(iommufd_get_object(ictx, id, + IOMMUFD_OBJ_IOAS), + struct iommufd_ioas, obj); +} + +struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx); +int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd); +void iommufd_ioas_destroy(struct iommufd_object *obj); +int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd); +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd); +int iommufd_ioas_map(struct iommufd_ucmd *ucmd); +int iommufd_ioas_copy(struct iommufd_ucmd *ucmd); +int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd); +int iommufd_ioas_option(struct iommufd_ucmd *ucmd); +int iommufd_option_rlimit_mode(struct iommu_option *cmd, + struct iommufd_ctx *ictx); + +int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd); + +/* + * A HW pagetable is called an iommu_domain inside the kernel. This user object + * allows directly creating and inspecting the domains. 
Domains that have kernel + * owned page tables will be associated with an iommufd_ioas that provides the + * IOVA to PFN map. + */ +struct iommufd_hw_pagetable { + struct iommufd_object obj; + struct iommufd_ioas *ioas; + struct iommu_domain *domain; + bool auto_domain : 1; + bool enforce_cache_coherency : 1; + bool msi_cookie : 1; + /* Head at iommufd_ioas::hwpt_list */ + struct list_head hwpt_item; +}; + +struct iommufd_hw_pagetable * +iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, + struct iommufd_device *idev, bool immediate_attach); +int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt); +int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev); +struct iommufd_hw_pagetable * +iommufd_hw_pagetable_detach(struct iommufd_device *idev); +void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); +void iommufd_hw_pagetable_abort(struct iommufd_object *obj); +int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd); + +static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, + struct iommufd_hw_pagetable *hwpt) +{ + lockdep_assert_not_held(&hwpt->ioas->mutex); + if (hwpt->auto_domain) + iommufd_object_deref_user(ictx, &hwpt->obj); + else + refcount_dec(&hwpt->obj.users); +} + +struct iommufd_group { + struct kref ref; + struct mutex lock; + struct iommufd_ctx *ictx; + struct iommu_group *group; + struct iommufd_hw_pagetable *hwpt; + struct list_head device_list; + phys_addr_t sw_msi_start; +}; + +/* + * A iommufd_device object represents the binding relationship between a + * consuming driver and the iommufd. These objects are created/destroyed by + * external drivers, not by userspace. + */ +struct iommufd_device { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + struct iommufd_group *igroup; + struct list_head group_item; + /* always the physical device */ + struct device *dev; + bool enforce_cache_coherency; +}; + +static inline struct iommufd_device * +iommufd_get_device(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_DEVICE), + struct iommufd_device, obj); +} + +void iommufd_device_destroy(struct iommufd_object *obj); +int iommufd_get_hw_info(struct iommufd_ucmd *ucmd); + +struct iommufd_access { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + struct iommufd_ioas *ioas; + struct iommufd_ioas *ioas_unpin; + struct mutex ioas_lock; + const struct iommufd_access_ops *ops; + void *data; + unsigned long iova_alignment; + u32 iopt_access_list_id; +}; + +int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access); +void iopt_remove_access(struct io_pagetable *iopt, + struct iommufd_access *access, + u32 iopt_access_list_id); +void iommufd_access_destroy_object(struct iommufd_object *obj); + +#ifdef CONFIG_IOMMUFD_TEST +int iommufd_test(struct iommufd_ucmd *ucmd); +void iommufd_selftest_destroy(struct iommufd_object *obj); +extern size_t iommufd_test_memory_limit; +void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, + unsigned int ioas_id, u64 *iova, u32 *flags); +bool iommufd_should_fail(void); +int __init iommufd_test_init(void); +void iommufd_test_exit(void); +bool iommufd_selftest_is_mock_dev(struct device *dev); +#else +static inline void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, + unsigned int ioas_id, + u64 *iova, u32 *flags) +{ +} +static inline bool iommufd_should_fail(void) +{ + return false; +} +static inline int __init iommufd_test_init(void) +{ + return 0; 
+} +static inline void iommufd_test_exit(void) +{ +} +static inline bool iommufd_selftest_is_mock_dev(struct device *dev) +{ + return false; +} +#endif +#endif diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h new file mode 100644 index 0000000000..3f3644375b --- /dev/null +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + */ +#ifndef _UAPI_IOMMUFD_TEST_H +#define _UAPI_IOMMUFD_TEST_H + +#include <linux/types.h> +#include <linux/iommufd.h> + +enum { + IOMMU_TEST_OP_ADD_RESERVED = 1, + IOMMU_TEST_OP_MOCK_DOMAIN, + IOMMU_TEST_OP_MD_CHECK_MAP, + IOMMU_TEST_OP_MD_CHECK_REFS, + IOMMU_TEST_OP_CREATE_ACCESS, + IOMMU_TEST_OP_DESTROY_ACCESS_PAGES, + IOMMU_TEST_OP_ACCESS_PAGES, + IOMMU_TEST_OP_ACCESS_RW, + IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT, + IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE, + IOMMU_TEST_OP_ACCESS_REPLACE_IOAS, +}; + +enum { + MOCK_APERTURE_START = 1UL << 24, + MOCK_APERTURE_LAST = (1UL << 31) - 1, +}; + +enum { + MOCK_FLAGS_ACCESS_WRITE = 1 << 0, + MOCK_FLAGS_ACCESS_SYZ = 1 << 16, +}; + +enum { + MOCK_ACCESS_RW_WRITE = 1 << 0, + MOCK_ACCESS_RW_SLOW_PATH = 1 << 2, +}; + +enum { + MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES = 1 << 0, +}; + +struct iommu_test_cmd { + __u32 size; + __u32 op; + __u32 id; + __u32 __reserved; + union { + struct { + __aligned_u64 start; + __aligned_u64 length; + } add_reserved; + struct { + __u32 out_stdev_id; + __u32 out_hwpt_id; + /* out_idev_id is the standard iommufd_bind object */ + __u32 out_idev_id; + } mock_domain; + struct { + __u32 pt_id; + } mock_domain_replace; + struct { + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 uptr; + } check_map; + struct { + __aligned_u64 length; + __aligned_u64 uptr; + __u32 refs; + } check_refs; + struct { + __u32 out_access_fd; + __u32 flags; + } create_access; + struct { + __u32 access_pages_id; + } destroy_access_pages; + struct { + __u32 flags; + __u32 out_access_pages_id; + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 uptr; + } access_pages; + struct { + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 uptr; + __u32 flags; + } access_rw; + struct { + __u32 limit; + } memory_limit; + struct { + __u32 ioas_id; + } access_replace_ioas; + }; + __u32 last; +}; +#define IOMMU_TEST_CMD _IO(IOMMUFD_TYPE, IOMMUFD_CMD_BASE + 32) + +/* Mock structs for IOMMU_DEVICE_GET_HW_INFO ioctl */ +#define IOMMU_HW_INFO_TYPE_SELFTEST 0xfeedbeef +#define IOMMU_HW_INFO_SELFTEST_REGVAL 0xdeadbeef + +struct iommu_test_hw_info { + __u32 flags; + __u32 test_reg; +}; + +#endif diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c new file mode 100644 index 0000000000..e71523cbd0 --- /dev/null +++ b/drivers/iommu/iommufd/main.c @@ -0,0 +1,556 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2021 Intel Corporation + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + * + * iommufd provides control over the IOMMU HW objects created by IOMMU kernel + * drivers. IOMMU HW objects revolve around IO page tables that map incoming DMA + * addresses (IOVA) to CPU addresses. 
+ */ +#define pr_fmt(fmt) "iommufd: " fmt + +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/miscdevice.h> +#include <linux/mutex.h> +#include <linux/bug.h> +#include <uapi/linux/iommufd.h> +#include <linux/iommufd.h> + +#include "io_pagetable.h" +#include "iommufd_private.h" +#include "iommufd_test.h" + +struct iommufd_object_ops { + void (*destroy)(struct iommufd_object *obj); + void (*abort)(struct iommufd_object *obj); +}; +static const struct iommufd_object_ops iommufd_object_ops[]; +static struct miscdevice vfio_misc_dev; + +struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, + size_t size, + enum iommufd_object_type type) +{ + static struct lock_class_key obj_keys[IOMMUFD_OBJ_MAX]; + struct iommufd_object *obj; + int rc; + + obj = kzalloc(size, GFP_KERNEL_ACCOUNT); + if (!obj) + return ERR_PTR(-ENOMEM); + obj->type = type; + /* + * In most cases the destroy_rwsem is obtained with try so it doesn't + * interact with lockdep, however on destroy we have to sleep. This + * means if we have to destroy an object while holding a get on another + * object it triggers lockdep. Using one locking class per object type + * is a simple and reasonable way to avoid this. + */ + __init_rwsem(&obj->destroy_rwsem, "iommufd_object::destroy_rwsem", + &obj_keys[type]); + refcount_set(&obj->users, 1); + + /* + * Reserve an ID in the xarray but do not publish the pointer yet since + * the caller hasn't initialized it yet. Once the pointer is published + * in the xarray and visible to other threads we can't reliably destroy + * it anymore, so the caller must complete all errorable operations + * before calling iommufd_object_finalize(). + */ + rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, + xa_limit_31b, GFP_KERNEL_ACCOUNT); + if (rc) + goto out_free; + return obj; +out_free: + kfree(obj); + return ERR_PTR(rc); +} + +/* + * Allow concurrent access to the object. + * + * Once another thread can see the object pointer it can prevent object + * destruction. Expect for special kernel-only objects there is no in-kernel way + * to reliably destroy a single object. Thus all APIs that are creating objects + * must use iommufd_object_abort() to handle their errors and only call + * iommufd_object_finalize() once object creation cannot fail. + */ +void iommufd_object_finalize(struct iommufd_ctx *ictx, + struct iommufd_object *obj) +{ + void *old; + + old = xa_store(&ictx->objects, obj->id, obj, GFP_KERNEL); + /* obj->id was returned from xa_alloc() so the xa_store() cannot fail */ + WARN_ON(old); +} + +/* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */ +void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj) +{ + void *old; + + old = xa_erase(&ictx->objects, obj->id); + WARN_ON(old); + kfree(obj); +} + +/* + * Abort an object that has been fully initialized and needs destroy, but has + * not been finalized. 
+ */ +void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, + struct iommufd_object *obj) +{ + if (iommufd_object_ops[obj->type].abort) + iommufd_object_ops[obj->type].abort(obj); + else + iommufd_object_ops[obj->type].destroy(obj); + iommufd_object_abort(ictx, obj); +} + +struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, + enum iommufd_object_type type) +{ + struct iommufd_object *obj; + + if (iommufd_should_fail()) + return ERR_PTR(-ENOENT); + + xa_lock(&ictx->objects); + obj = xa_load(&ictx->objects, id); + if (!obj || (type != IOMMUFD_OBJ_ANY && obj->type != type) || + !iommufd_lock_obj(obj)) + obj = ERR_PTR(-ENOENT); + xa_unlock(&ictx->objects); + return obj; +} + +/* + * Remove the given object id from the xarray if the only reference to the + * object is held by the xarray. The caller must call ops destroy(). + */ +static struct iommufd_object *iommufd_object_remove(struct iommufd_ctx *ictx, + u32 id, bool extra_put) +{ + struct iommufd_object *obj; + XA_STATE(xas, &ictx->objects, id); + + xa_lock(&ictx->objects); + obj = xas_load(&xas); + if (xa_is_zero(obj) || !obj) { + obj = ERR_PTR(-ENOENT); + goto out_xa; + } + + /* + * If the caller is holding a ref on obj we put it here under the + * spinlock. + */ + if (extra_put) + refcount_dec(&obj->users); + + if (!refcount_dec_if_one(&obj->users)) { + obj = ERR_PTR(-EBUSY); + goto out_xa; + } + + xas_store(&xas, NULL); + if (ictx->vfio_ioas == container_of(obj, struct iommufd_ioas, obj)) + ictx->vfio_ioas = NULL; + +out_xa: + xa_unlock(&ictx->objects); + + /* The returned object reference count is zero */ + return obj; +} + +/* + * The caller holds a users refcount and wants to destroy the object. Returns + * true if the object was destroyed. In all cases the caller no longer has a + * reference on obj. + */ +void __iommufd_object_destroy_user(struct iommufd_ctx *ictx, + struct iommufd_object *obj, bool allow_fail) +{ + struct iommufd_object *ret; + + /* + * The purpose of the destroy_rwsem is to ensure deterministic + * destruction of objects used by external drivers and destroyed by this + * function. Any temporary increment of the refcount must hold the read + * side of this, such as during ioctl execution. + */ + down_write(&obj->destroy_rwsem); + ret = iommufd_object_remove(ictx, obj->id, true); + up_write(&obj->destroy_rwsem); + + if (allow_fail && IS_ERR(ret)) + return; + + /* + * If there is a bug and we couldn't destroy the object then we did put + * back the caller's refcount and will eventually try to free it again + * during close. + */ + if (WARN_ON(IS_ERR(ret))) + return; + + iommufd_object_ops[obj->type].destroy(obj); + kfree(obj); +} + +static int iommufd_destroy(struct iommufd_ucmd *ucmd) +{ + struct iommu_destroy *cmd = ucmd->cmd; + struct iommufd_object *obj; + + obj = iommufd_object_remove(ucmd->ictx, cmd->id, false); + if (IS_ERR(obj)) + return PTR_ERR(obj); + iommufd_object_ops[obj->type].destroy(obj); + kfree(obj); + return 0; +} + +static int iommufd_fops_open(struct inode *inode, struct file *filp) +{ + struct iommufd_ctx *ictx; + + ictx = kzalloc(sizeof(*ictx), GFP_KERNEL_ACCOUNT); + if (!ictx) + return -ENOMEM; + + /* + * For compatibility with VFIO when /dev/vfio/vfio is opened we default + * to the same rlimit accounting as vfio uses. 
+ */ + if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) && + filp->private_data == &vfio_misc_dev) { + ictx->account_mode = IOPT_PAGES_ACCOUNT_MM; + pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n"); + } + + xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); + xa_init(&ictx->groups); + ictx->file = filp; + filp->private_data = ictx; + return 0; +} + +static int iommufd_fops_release(struct inode *inode, struct file *filp) +{ + struct iommufd_ctx *ictx = filp->private_data; + struct iommufd_object *obj; + + /* + * The objects in the xarray form a graph of "users" counts, and we have + * to destroy them in a depth first manner. Leaf objects will reduce the + * users count of interior objects when they are destroyed. + * + * Repeatedly destroying all the "1 users" leaf objects will progress + * until the entire list is destroyed. If this can't progress then there + * is some bug related to object refcounting. + */ + while (!xa_empty(&ictx->objects)) { + unsigned int destroyed = 0; + unsigned long index; + + xa_for_each(&ictx->objects, index, obj) { + if (!refcount_dec_if_one(&obj->users)) + continue; + destroyed++; + xa_erase(&ictx->objects, index); + iommufd_object_ops[obj->type].destroy(obj); + kfree(obj); + } + /* Bug related to users refcount */ + if (WARN_ON(!destroyed)) + break; + } + WARN_ON(!xa_empty(&ictx->groups)); + kfree(ictx); + return 0; +} + +static int iommufd_option(struct iommufd_ucmd *ucmd) +{ + struct iommu_option *cmd = ucmd->cmd; + int rc; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + switch (cmd->option_id) { + case IOMMU_OPTION_RLIMIT_MODE: + rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx); + break; + case IOMMU_OPTION_HUGE_PAGES: + rc = iommufd_ioas_option(ucmd); + break; + default: + return -EOPNOTSUPP; + } + if (rc) + return rc; + if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64, + &cmd->val64, sizeof(cmd->val64))) + return -EFAULT; + return 0; +} + +union ucmd_buffer { + struct iommu_destroy destroy; + struct iommu_hw_info info; + struct iommu_hwpt_alloc hwpt; + struct iommu_ioas_alloc alloc; + struct iommu_ioas_allow_iovas allow_iovas; + struct iommu_ioas_copy ioas_copy; + struct iommu_ioas_iova_ranges iova_ranges; + struct iommu_ioas_map map; + struct iommu_ioas_unmap unmap; + struct iommu_option option; + struct iommu_vfio_ioas vfio_ioas; +#ifdef CONFIG_IOMMUFD_TEST + struct iommu_test_cmd test; +#endif +}; + +struct iommufd_ioctl_op { + unsigned int size; + unsigned int min_size; + unsigned int ioctl_num; + int (*execute)(struct iommufd_ucmd *ucmd); +}; + +#define IOCTL_OP(_ioctl, _fn, _struct, _last) \ + [_IOC_NR(_ioctl) - IOMMUFD_CMD_BASE] = { \ + .size = sizeof(_struct) + \ + BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \ + sizeof(_struct)), \ + .min_size = offsetofend(_struct, _last), \ + .ioctl_num = _ioctl, \ + .execute = _fn, \ + } +static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { + IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id), + IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info, + __reserved), + IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, + __reserved), + IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, + struct iommu_ioas_alloc, out_ioas_id), + IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas, + struct iommu_ioas_allow_iovas, allowed_iovas), + IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy, + src_iova), + IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges, + struct 
iommu_ioas_iova_ranges, out_iova_alignment), + IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map, + iova), + IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap, + length), + IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, + val64), + IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas, + __reserved), +#ifdef CONFIG_IOMMUFD_TEST + IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last), +#endif +}; + +static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct iommufd_ctx *ictx = filp->private_data; + const struct iommufd_ioctl_op *op; + struct iommufd_ucmd ucmd = {}; + union ucmd_buffer buf; + unsigned int nr; + int ret; + + nr = _IOC_NR(cmd); + if (nr < IOMMUFD_CMD_BASE || + (nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops)) + return iommufd_vfio_ioctl(ictx, cmd, arg); + + ucmd.ictx = ictx; + ucmd.ubuffer = (void __user *)arg; + ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer); + if (ret) + return ret; + + op = &iommufd_ioctl_ops[nr - IOMMUFD_CMD_BASE]; + if (op->ioctl_num != cmd) + return -ENOIOCTLCMD; + if (ucmd.user_size < op->min_size) + return -EINVAL; + + ucmd.cmd = &buf; + ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer, + ucmd.user_size); + if (ret) + return ret; + ret = op->execute(&ucmd); + return ret; +} + +static const struct file_operations iommufd_fops = { + .owner = THIS_MODULE, + .open = iommufd_fops_open, + .release = iommufd_fops_release, + .unlocked_ioctl = iommufd_fops_ioctl, +}; + +/** + * iommufd_ctx_get - Get a context reference + * @ictx: Context to get + * + * The caller must already hold a valid reference to ictx. + */ +void iommufd_ctx_get(struct iommufd_ctx *ictx) +{ + get_file(ictx->file); +} +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, IOMMUFD); + +/** + * iommufd_ctx_from_file - Acquires a reference to the iommufd context + * @file: File to obtain the reference from + * + * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. The struct file + * remains owned by the caller and the caller must still do fput. On success + * the caller is responsible to call iommufd_ctx_put(). + */ +struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) +{ + struct iommufd_ctx *ictx; + + if (file->f_op != &iommufd_fops) + return ERR_PTR(-EBADFD); + ictx = file->private_data; + iommufd_ctx_get(ictx); + return ictx; +} +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, IOMMUFD); + +/** + * iommufd_ctx_from_fd - Acquires a reference to the iommufd context + * @fd: File descriptor to obtain the reference from + * + * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. On success + * the caller is responsible to call iommufd_ctx_put(). 
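+ *
+ * A sketch of a typical in-kernel caller (hypothetical, for illustration):
+ *
+ *	ictx = iommufd_ctx_from_fd(fd);
+ *	if (IS_ERR(ictx))
+ *		return PTR_ERR(ictx);
+ *	... use ictx ...
+ *	iommufd_ctx_put(ictx);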
+ */ +struct iommufd_ctx *iommufd_ctx_from_fd(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &iommufd_fops) { + fput(file); + return ERR_PTR(-EBADFD); + } + /* fget is the same as iommufd_ctx_get() */ + return file->private_data; +} +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_fd, IOMMUFD); + +/** + * iommufd_ctx_put - Put back a reference + * @ictx: Context to put back + */ +void iommufd_ctx_put(struct iommufd_ctx *ictx) +{ + fput(ictx->file); +} +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD); + +static const struct iommufd_object_ops iommufd_object_ops[] = { + [IOMMUFD_OBJ_ACCESS] = { + .destroy = iommufd_access_destroy_object, + }, + [IOMMUFD_OBJ_DEVICE] = { + .destroy = iommufd_device_destroy, + }, + [IOMMUFD_OBJ_IOAS] = { + .destroy = iommufd_ioas_destroy, + }, + [IOMMUFD_OBJ_HW_PAGETABLE] = { + .destroy = iommufd_hw_pagetable_destroy, + .abort = iommufd_hw_pagetable_abort, + }, +#ifdef CONFIG_IOMMUFD_TEST + [IOMMUFD_OBJ_SELFTEST] = { + .destroy = iommufd_selftest_destroy, + }, +#endif +}; + +static struct miscdevice iommu_misc_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "iommu", + .fops = &iommufd_fops, + .nodename = "iommu", + .mode = 0660, +}; + + +static struct miscdevice vfio_misc_dev = { + .minor = VFIO_MINOR, + .name = "vfio", + .fops = &iommufd_fops, + .nodename = "vfio/vfio", + .mode = 0666, +}; + +static int __init iommufd_init(void) +{ + int ret; + + ret = misc_register(&iommu_misc_dev); + if (ret) + return ret; + + if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) { + ret = misc_register(&vfio_misc_dev); + if (ret) + goto err_misc; + } + ret = iommufd_test_init(); + if (ret) + goto err_vfio_misc; + return 0; + +err_vfio_misc: + if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) + misc_deregister(&vfio_misc_dev); +err_misc: + misc_deregister(&iommu_misc_dev); + return ret; +} + +static void __exit iommufd_exit(void) +{ + iommufd_test_exit(); + if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) + misc_deregister(&vfio_misc_dev); + misc_deregister(&iommu_misc_dev); +} + +module_init(iommufd_init); +module_exit(iommufd_exit); + +#if IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) +MODULE_ALIAS_MISCDEV(VFIO_MINOR); +MODULE_ALIAS("devname:vfio/vfio"); +#endif +MODULE_IMPORT_NS(IOMMUFD_INTERNAL); +MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); +MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c new file mode 100644 index 0000000000..528f356238 --- /dev/null +++ b/drivers/iommu/iommufd/pages.c @@ -0,0 +1,1993 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + * + * The iopt_pages is the center of the storage and motion of PFNs. Each + * iopt_pages represents a logical linear array of full PFNs. The array is 0 + * based and has npages in it. Accessors use 'index' to refer to the entry in + * this logical array, regardless of its storage location. + * + * PFNs are stored in a tiered scheme: + * 1) iopt_pages::pinned_pfns xarray + * 2) An iommu_domain + * 3) The origin of the PFNs, i.e. the userspace pointer + * + * PFN have to be copied between all combinations of tiers, depending on the + * configuration. + * + * When a PFN is taken out of the userspace pointer it is pinned exactly once. + * The storage locations of the PFN's index are tracked in the two interval + * trees. If no interval includes the index then it is not pinned. 
+ * + * If access_itree includes the PFN's index then an in-kernel access has + * requested the page. The PFN is stored in the xarray so other requestors can + * continue to find it. + * + * If the domains_itree includes the PFN's index then an iommu_domain is storing + * the PFN and it can be read back using iommu_iova_to_phys(). To avoid + * duplicating storage the xarray is not used if only iommu_domains are using + * the PFN's index. + * + * As a general principle this is designed so that destroy never fails. This + * means removing an iommu_domain or releasing a in-kernel access will not fail + * due to insufficient memory. In practice this means some cases have to hold + * PFNs in the xarray even though they are also being stored in an iommu_domain. + * + * While the iopt_pages can use an iommu_domain as storage, it does not have an + * IOVA itself. Instead the iopt_area represents a range of IOVA and uses the + * iopt_pages as the PFN provider. Multiple iopt_areas can share the iopt_pages + * and reference their own slice of the PFN array, with sub page granularity. + * + * In this file the term 'last' indicates an inclusive and closed interval, eg + * [0,0] refers to a single PFN. 'end' means an open range, eg [0,0) refers to + * no PFNs. + * + * Be cautious of overflow. An IOVA can go all the way up to U64_MAX, so + * last_iova + 1 can overflow. An iopt_pages index will always be much less than + * ULONG_MAX so last_index + 1 cannot overflow. + */ +#include <linux/overflow.h> +#include <linux/slab.h> +#include <linux/iommu.h> +#include <linux/sched/mm.h> +#include <linux/highmem.h> +#include <linux/kthread.h> +#include <linux/iommufd.h> + +#include "io_pagetable.h" +#include "double_span.h" + +#ifndef CONFIG_IOMMUFD_TEST +#define TEMP_MEMORY_LIMIT 65536 +#else +#define TEMP_MEMORY_LIMIT iommufd_test_memory_limit +#endif +#define BATCH_BACKUP_SIZE 32 + +/* + * More memory makes pin_user_pages() and the batching more efficient, but as + * this is only a performance optimization don't try too hard to get it. A 64k + * allocation can hold about 26M of 4k pages and 13G of 2M pages in an + * pfn_batch. Various destroy paths cannot fail and provide a small amount of + * stack memory as a backup contingency. If backup_len is given this cannot + * fail. 
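+ *
+ * The destroy/unfill paths below do this by passing a small on-stack buffer,
+ * e.g.:
+ *
+ *	u64 backup[BATCH_BACKUP_SIZE];
+ *
+ *	batch_init_backup(&batch, last_index - start_index + 1, backup,
+ *			  sizeof(backup));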
+ */ +static void *temp_kmalloc(size_t *size, void *backup, size_t backup_len) +{ + void *res; + + if (WARN_ON(*size == 0)) + return NULL; + + if (*size < backup_len) + return backup; + + if (!backup && iommufd_should_fail()) + return NULL; + + *size = min_t(size_t, *size, TEMP_MEMORY_LIMIT); + res = kmalloc(*size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (res) + return res; + *size = PAGE_SIZE; + if (backup_len) { + res = kmalloc(*size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (res) + return res; + *size = backup_len; + return backup; + } + return kmalloc(*size, GFP_KERNEL); +} + +void interval_tree_double_span_iter_update( + struct interval_tree_double_span_iter *iter) +{ + unsigned long last_hole = ULONG_MAX; + unsigned int i; + + for (i = 0; i != ARRAY_SIZE(iter->spans); i++) { + if (interval_tree_span_iter_done(&iter->spans[i])) { + iter->is_used = -1; + return; + } + + if (iter->spans[i].is_hole) { + last_hole = min(last_hole, iter->spans[i].last_hole); + continue; + } + + iter->is_used = i + 1; + iter->start_used = iter->spans[i].start_used; + iter->last_used = min(iter->spans[i].last_used, last_hole); + return; + } + + iter->is_used = 0; + iter->start_hole = iter->spans[0].start_hole; + iter->last_hole = + min(iter->spans[0].last_hole, iter->spans[1].last_hole); +} + +void interval_tree_double_span_iter_first( + struct interval_tree_double_span_iter *iter, + struct rb_root_cached *itree1, struct rb_root_cached *itree2, + unsigned long first_index, unsigned long last_index) +{ + unsigned int i; + + iter->itrees[0] = itree1; + iter->itrees[1] = itree2; + for (i = 0; i != ARRAY_SIZE(iter->spans); i++) + interval_tree_span_iter_first(&iter->spans[i], iter->itrees[i], + first_index, last_index); + interval_tree_double_span_iter_update(iter); +} + +void interval_tree_double_span_iter_next( + struct interval_tree_double_span_iter *iter) +{ + unsigned int i; + + if (iter->is_used == -1 || + iter->last_hole == iter->spans[0].last_index) { + iter->is_used = -1; + return; + } + + for (i = 0; i != ARRAY_SIZE(iter->spans); i++) + interval_tree_span_iter_advance( + &iter->spans[i], iter->itrees[i], iter->last_hole + 1); + interval_tree_double_span_iter_update(iter); +} + +static void iopt_pages_add_npinned(struct iopt_pages *pages, size_t npages) +{ + int rc; + + rc = check_add_overflow(pages->npinned, npages, &pages->npinned); + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(rc || pages->npinned > pages->npages); +} + +static void iopt_pages_sub_npinned(struct iopt_pages *pages, size_t npages) +{ + int rc; + + rc = check_sub_overflow(pages->npinned, npages, &pages->npinned); + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(rc || pages->npinned > pages->npages); +} + +static void iopt_pages_err_unpin(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index, + struct page **page_list) +{ + unsigned long npages = last_index - start_index + 1; + + unpin_user_pages(page_list, npages); + iopt_pages_sub_npinned(pages, npages); +} + +/* + * index is the number of PAGE_SIZE units from the start of the area's + * iopt_pages. If the iova is sub page-size then the area has an iova that + * covers a portion of the first and last pages in the range. 
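+ *
+ * For example (illustrative numbers): with 4K pages, an area iova of 0x10200
+ * and a page_offset of 0x200, relative index 0 maps to iova 0x10200 while
+ * relative index 1 maps to 0x10200 - 0x200 + 0x1000 = 0x11000, the start of
+ * the next full page.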
+ */ +static unsigned long iopt_area_index_to_iova(struct iopt_area *area, + unsigned long index) +{ + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(index < iopt_area_index(area) || + index > iopt_area_last_index(area)); + index -= iopt_area_index(area); + if (index == 0) + return iopt_area_iova(area); + return iopt_area_iova(area) - area->page_offset + index * PAGE_SIZE; +} + +static unsigned long iopt_area_index_to_iova_last(struct iopt_area *area, + unsigned long index) +{ + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(index < iopt_area_index(area) || + index > iopt_area_last_index(area)); + if (index == iopt_area_last_index(area)) + return iopt_area_last_iova(area); + return iopt_area_iova(area) - area->page_offset + + (index - iopt_area_index(area) + 1) * PAGE_SIZE - 1; +} + +static void iommu_unmap_nofail(struct iommu_domain *domain, unsigned long iova, + size_t size) +{ + size_t ret; + + ret = iommu_unmap(domain, iova, size); + /* + * It is a logic error in this code or a driver bug if the IOMMU unmaps + * something other than exactly as requested. This implies that the + * iommu driver may not fail unmap for reasons beyond bad agruments. + * Particularly, the iommu driver may not do a memory allocation on the + * unmap path. + */ + WARN_ON(ret != size); +} + +static void iopt_area_unmap_domain_range(struct iopt_area *area, + struct iommu_domain *domain, + unsigned long start_index, + unsigned long last_index) +{ + unsigned long start_iova = iopt_area_index_to_iova(area, start_index); + + iommu_unmap_nofail(domain, start_iova, + iopt_area_index_to_iova_last(area, last_index) - + start_iova + 1); +} + +static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages, + unsigned long index) +{ + struct interval_tree_node *node; + + node = interval_tree_iter_first(&pages->domains_itree, index, index); + if (!node) + return NULL; + return container_of(node, struct iopt_area, pages_node); +} + +/* + * A simple datastructure to hold a vector of PFNs, optimized for contiguous + * PFNs. This is used as a temporary holding memory for shuttling pfns from one + * place to another. Generally everything is made more efficient if operations + * work on the largest possible grouping of pfns. 
eg fewer lock/unlock cycles, + * better cache locality, etc + */ +struct pfn_batch { + unsigned long *pfns; + u32 *npfns; + unsigned int array_size; + unsigned int end; + unsigned int total_pfns; +}; + +static void batch_clear(struct pfn_batch *batch) +{ + batch->total_pfns = 0; + batch->end = 0; + batch->pfns[0] = 0; + batch->npfns[0] = 0; +} + +/* + * Carry means we carry a portion of the final hugepage over to the front of the + * batch + */ +static void batch_clear_carry(struct pfn_batch *batch, unsigned int keep_pfns) +{ + if (!keep_pfns) + return batch_clear(batch); + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(!batch->end || + batch->npfns[batch->end - 1] < keep_pfns); + + batch->total_pfns = keep_pfns; + batch->pfns[0] = batch->pfns[batch->end - 1] + + (batch->npfns[batch->end - 1] - keep_pfns); + batch->npfns[0] = keep_pfns; + batch->end = 1; +} + +static void batch_skip_carry(struct pfn_batch *batch, unsigned int skip_pfns) +{ + if (!batch->total_pfns) + return; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(batch->total_pfns != batch->npfns[0]); + skip_pfns = min(batch->total_pfns, skip_pfns); + batch->pfns[0] += skip_pfns; + batch->npfns[0] -= skip_pfns; + batch->total_pfns -= skip_pfns; +} + +static int __batch_init(struct pfn_batch *batch, size_t max_pages, void *backup, + size_t backup_len) +{ + const size_t elmsz = sizeof(*batch->pfns) + sizeof(*batch->npfns); + size_t size = max_pages * elmsz; + + batch->pfns = temp_kmalloc(&size, backup, backup_len); + if (!batch->pfns) + return -ENOMEM; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && WARN_ON(size < elmsz)) + return -EINVAL; + batch->array_size = size / elmsz; + batch->npfns = (u32 *)(batch->pfns + batch->array_size); + batch_clear(batch); + return 0; +} + +static int batch_init(struct pfn_batch *batch, size_t max_pages) +{ + return __batch_init(batch, max_pages, NULL, 0); +} + +static void batch_init_backup(struct pfn_batch *batch, size_t max_pages, + void *backup, size_t backup_len) +{ + __batch_init(batch, max_pages, backup, backup_len); +} + +static void batch_destroy(struct pfn_batch *batch, void *backup) +{ + if (batch->pfns != backup) + kfree(batch->pfns); +} + +/* true if the pfn was added, false otherwise */ +static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn) +{ + const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns)); + + if (batch->end && + pfn == batch->pfns[batch->end - 1] + batch->npfns[batch->end - 1] && + batch->npfns[batch->end - 1] != MAX_NPFNS) { + batch->npfns[batch->end - 1]++; + batch->total_pfns++; + return true; + } + if (batch->end == batch->array_size) + return false; + batch->total_pfns++; + batch->pfns[batch->end] = pfn; + batch->npfns[batch->end] = 1; + batch->end++; + return true; +} + +/* + * Fill the batch with pfns from the domain. When the batch is full, or it + * reaches last_index, the function will return. The caller should use + * batch->total_pfns to determine the starting point for the next iteration. + */ +static void batch_from_domain(struct pfn_batch *batch, + struct iommu_domain *domain, + struct iopt_area *area, unsigned long start_index, + unsigned long last_index) +{ + unsigned int page_offset = 0; + unsigned long iova; + phys_addr_t phys; + + iova = iopt_area_index_to_iova(area, start_index); + if (start_index == iopt_area_index(area)) + page_offset = area->page_offset; + while (start_index <= last_index) { + /* + * This is pretty slow, it would be nice to get the page size + * back from the driver, or have the driver directly fill the + * batch. 
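+		 * Each pass of this loop advances one page index at a time and
+		 * does one iommu_iova_to_phys() lookup per page.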
+ */ + phys = iommu_iova_to_phys(domain, iova) - page_offset; + if (!batch_add_pfn(batch, PHYS_PFN(phys))) + return; + iova += PAGE_SIZE - page_offset; + page_offset = 0; + start_index++; + } +} + +static struct page **raw_pages_from_domain(struct iommu_domain *domain, + struct iopt_area *area, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) +{ + unsigned int page_offset = 0; + unsigned long iova; + phys_addr_t phys; + + iova = iopt_area_index_to_iova(area, start_index); + if (start_index == iopt_area_index(area)) + page_offset = area->page_offset; + while (start_index <= last_index) { + phys = iommu_iova_to_phys(domain, iova) - page_offset; + *(out_pages++) = pfn_to_page(PHYS_PFN(phys)); + iova += PAGE_SIZE - page_offset; + page_offset = 0; + start_index++; + } + return out_pages; +} + +/* Continues reading a domain until we reach a discontinuity in the pfns. */ +static void batch_from_domain_continue(struct pfn_batch *batch, + struct iommu_domain *domain, + struct iopt_area *area, + unsigned long start_index, + unsigned long last_index) +{ + unsigned int array_size = batch->array_size; + + batch->array_size = batch->end; + batch_from_domain(batch, domain, area, start_index, last_index); + batch->array_size = array_size; +} + +/* + * This is part of the VFIO compatibility support for VFIO_TYPE1_IOMMU. That + * mode permits splitting a mapped area up, and then one of the splits is + * unmapped. Doing this normally would cause us to violate our invariant of + * pairing map/unmap. Thus, to support old VFIO compatibility disable support + * for batching consecutive PFNs. All PFNs mapped into the iommu are done in + * PAGE_SIZE units, not larger or smaller. + */ +static int batch_iommu_map_small(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + size_t size, int prot) +{ + unsigned long start_iova = iova; + int rc; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(paddr % PAGE_SIZE || iova % PAGE_SIZE || + size % PAGE_SIZE); + + while (size) { + rc = iommu_map(domain, iova, paddr, PAGE_SIZE, prot, + GFP_KERNEL_ACCOUNT); + if (rc) + goto err_unmap; + iova += PAGE_SIZE; + paddr += PAGE_SIZE; + size -= PAGE_SIZE; + } + return 0; + +err_unmap: + if (start_iova != iova) + iommu_unmap_nofail(domain, start_iova, iova - start_iova); + return rc; +} + +static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, + struct iopt_area *area, unsigned long start_index) +{ + bool disable_large_pages = area->iopt->disable_large_pages; + unsigned long last_iova = iopt_area_last_iova(area); + unsigned int page_offset = 0; + unsigned long start_iova; + unsigned long next_iova; + unsigned int cur = 0; + unsigned long iova; + int rc; + + /* The first index might be a partial page */ + if (start_index == iopt_area_index(area)) + page_offset = area->page_offset; + next_iova = iova = start_iova = + iopt_area_index_to_iova(area, start_index); + while (cur < batch->end) { + next_iova = min(last_iova + 1, + next_iova + batch->npfns[cur] * PAGE_SIZE - + page_offset); + if (disable_large_pages) + rc = batch_iommu_map_small( + domain, iova, + PFN_PHYS(batch->pfns[cur]) + page_offset, + next_iova - iova, area->iommu_prot); + else + rc = iommu_map(domain, iova, + PFN_PHYS(batch->pfns[cur]) + page_offset, + next_iova - iova, area->iommu_prot, + GFP_KERNEL_ACCOUNT); + if (rc) + goto err_unmap; + iova = next_iova; + page_offset = 0; + cur++; + } + return 0; +err_unmap: + if (start_iova != iova) + iommu_unmap_nofail(domain, start_iova, iova - 
start_iova); + return rc; +} + +static void batch_from_xarray(struct pfn_batch *batch, struct xarray *xa, + unsigned long start_index, + unsigned long last_index) +{ + XA_STATE(xas, xa, start_index); + void *entry; + + rcu_read_lock(); + while (true) { + entry = xas_next(&xas); + if (xas_retry(&xas, entry)) + continue; + WARN_ON(!xa_is_value(entry)); + if (!batch_add_pfn(batch, xa_to_value(entry)) || + start_index == last_index) + break; + start_index++; + } + rcu_read_unlock(); +} + +static void batch_from_xarray_clear(struct pfn_batch *batch, struct xarray *xa, + unsigned long start_index, + unsigned long last_index) +{ + XA_STATE(xas, xa, start_index); + void *entry; + + xas_lock(&xas); + while (true) { + entry = xas_next(&xas); + if (xas_retry(&xas, entry)) + continue; + WARN_ON(!xa_is_value(entry)); + if (!batch_add_pfn(batch, xa_to_value(entry))) + break; + xas_store(&xas, NULL); + if (start_index == last_index) + break; + start_index++; + } + xas_unlock(&xas); +} + +static void clear_xarray(struct xarray *xa, unsigned long start_index, + unsigned long last_index) +{ + XA_STATE(xas, xa, start_index); + void *entry; + + xas_lock(&xas); + xas_for_each(&xas, entry, last_index) + xas_store(&xas, NULL); + xas_unlock(&xas); +} + +static int pages_to_xarray(struct xarray *xa, unsigned long start_index, + unsigned long last_index, struct page **pages) +{ + struct page **end_pages = pages + (last_index - start_index) + 1; + struct page **half_pages = pages + (end_pages - pages) / 2; + XA_STATE(xas, xa, start_index); + + do { + void *old; + + xas_lock(&xas); + while (pages != end_pages) { + /* xarray does not participate in fault injection */ + if (pages == half_pages && iommufd_should_fail()) { + xas_set_err(&xas, -EINVAL); + xas_unlock(&xas); + /* aka xas_destroy() */ + xas_nomem(&xas, GFP_KERNEL); + goto err_clear; + } + + old = xas_store(&xas, xa_mk_value(page_to_pfn(*pages))); + if (xas_error(&xas)) + break; + WARN_ON(old); + pages++; + xas_next(&xas); + } + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + +err_clear: + if (xas_error(&xas)) { + if (xas.xa_index != start_index) + clear_xarray(xa, start_index, xas.xa_index - 1); + return xas_error(&xas); + } + return 0; +} + +static void batch_from_pages(struct pfn_batch *batch, struct page **pages, + size_t npages) +{ + struct page **end = pages + npages; + + for (; pages != end; pages++) + if (!batch_add_pfn(batch, page_to_pfn(*pages))) + break; +} + +static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages, + unsigned int first_page_off, size_t npages) +{ + unsigned int cur = 0; + + while (first_page_off) { + if (batch->npfns[cur] > first_page_off) + break; + first_page_off -= batch->npfns[cur]; + cur++; + } + + while (npages) { + size_t to_unpin = min_t(size_t, npages, + batch->npfns[cur] - first_page_off); + + unpin_user_page_range_dirty_lock( + pfn_to_page(batch->pfns[cur] + first_page_off), + to_unpin, pages->writable); + iopt_pages_sub_npinned(pages, to_unpin); + cur++; + first_page_off = 0; + npages -= to_unpin; + } +} + +static void copy_data_page(struct page *page, void *data, unsigned long offset, + size_t length, unsigned int flags) +{ + void *mem; + + mem = kmap_local_page(page); + if (flags & IOMMUFD_ACCESS_RW_WRITE) { + memcpy(mem + offset, data, length); + set_page_dirty_lock(page); + } else { + memcpy(data, mem + offset, length); + } + kunmap_local(mem); +} + +static unsigned long batch_rw(struct pfn_batch *batch, void *data, + unsigned long offset, unsigned long length, + unsigned int flags) 
+{ + unsigned long copied = 0; + unsigned int npage = 0; + unsigned int cur = 0; + + while (cur < batch->end) { + unsigned long bytes = min(length, PAGE_SIZE - offset); + + copy_data_page(pfn_to_page(batch->pfns[cur] + npage), data, + offset, bytes, flags); + offset = 0; + length -= bytes; + data += bytes; + copied += bytes; + npage++; + if (npage == batch->npfns[cur]) { + npage = 0; + cur++; + } + if (!length) + break; + } + return copied; +} + +/* pfn_reader_user is just the pin_user_pages() path */ +struct pfn_reader_user { + struct page **upages; + size_t upages_len; + unsigned long upages_start; + unsigned long upages_end; + unsigned int gup_flags; + /* + * 1 means mmget() and mmap_read_lock(), 0 means only mmget(), -1 is + * neither + */ + int locked; +}; + +static void pfn_reader_user_init(struct pfn_reader_user *user, + struct iopt_pages *pages) +{ + user->upages = NULL; + user->upages_start = 0; + user->upages_end = 0; + user->locked = -1; + + user->gup_flags = FOLL_LONGTERM; + if (pages->writable) + user->gup_flags |= FOLL_WRITE; +} + +static void pfn_reader_user_destroy(struct pfn_reader_user *user, + struct iopt_pages *pages) +{ + if (user->locked != -1) { + if (user->locked) + mmap_read_unlock(pages->source_mm); + if (pages->source_mm != current->mm) + mmput(pages->source_mm); + user->locked = -1; + } + + kfree(user->upages); + user->upages = NULL; +} + +static int pfn_reader_user_pin(struct pfn_reader_user *user, + struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index) +{ + bool remote_mm = pages->source_mm != current->mm; + unsigned long npages; + uintptr_t uptr; + long rc; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(last_index < start_index)) + return -EINVAL; + + if (!user->upages) { + /* All undone in pfn_reader_destroy() */ + user->upages_len = + (last_index - start_index + 1) * sizeof(*user->upages); + user->upages = temp_kmalloc(&user->upages_len, NULL, 0); + if (!user->upages) + return -ENOMEM; + } + + if (user->locked == -1) { + /* + * The majority of usages will run the map task within the mm + * providing the pages, so we can optimize into + * get_user_pages_fast() + */ + if (remote_mm) { + if (!mmget_not_zero(pages->source_mm)) + return -EFAULT; + } + user->locked = 0; + } + + npages = min_t(unsigned long, last_index - start_index + 1, + user->upages_len / sizeof(*user->upages)); + + + if (iommufd_should_fail()) + return -EFAULT; + + uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); + if (!remote_mm) + rc = pin_user_pages_fast(uptr, npages, user->gup_flags, + user->upages); + else { + if (!user->locked) { + mmap_read_lock(pages->source_mm); + user->locked = 1; + } + rc = pin_user_pages_remote(pages->source_mm, uptr, npages, + user->gup_flags, user->upages, + &user->locked); + } + if (rc <= 0) { + if (WARN_ON(!rc)) + return -EFAULT; + return rc; + } + iopt_pages_add_npinned(pages, rc); + user->upages_start = start_index; + user->upages_end = start_index + rc; + return 0; +} + +/* This is the "modern" and faster accounting method used by io_uring */ +static int incr_user_locked_vm(struct iopt_pages *pages, unsigned long npages) +{ + unsigned long lock_limit; + unsigned long cur_pages; + unsigned long new_pages; + + lock_limit = task_rlimit(pages->source_task, RLIMIT_MEMLOCK) >> + PAGE_SHIFT; + do { + cur_pages = atomic_long_read(&pages->source_user->locked_vm); + new_pages = cur_pages + npages; + if (new_pages > lock_limit) + return -ENOMEM; + } while (atomic_long_cmpxchg(&pages->source_user->locked_vm, cur_pages, + 
new_pages) != cur_pages); + return 0; +} + +static void decr_user_locked_vm(struct iopt_pages *pages, unsigned long npages) +{ + if (WARN_ON(atomic_long_read(&pages->source_user->locked_vm) < npages)) + return; + atomic_long_sub(npages, &pages->source_user->locked_vm); +} + +/* This is the accounting method used for compatibility with VFIO */ +static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages, + bool inc, struct pfn_reader_user *user) +{ + bool do_put = false; + int rc; + + if (user && user->locked) { + mmap_read_unlock(pages->source_mm); + user->locked = 0; + /* If we had the lock then we also have a get */ + } else if ((!user || !user->upages) && + pages->source_mm != current->mm) { + if (!mmget_not_zero(pages->source_mm)) + return -EINVAL; + do_put = true; + } + + mmap_write_lock(pages->source_mm); + rc = __account_locked_vm(pages->source_mm, npages, inc, + pages->source_task, false); + mmap_write_unlock(pages->source_mm); + + if (do_put) + mmput(pages->source_mm); + return rc; +} + +static int do_update_pinned(struct iopt_pages *pages, unsigned long npages, + bool inc, struct pfn_reader_user *user) +{ + int rc = 0; + + switch (pages->account_mode) { + case IOPT_PAGES_ACCOUNT_NONE: + break; + case IOPT_PAGES_ACCOUNT_USER: + if (inc) + rc = incr_user_locked_vm(pages, npages); + else + decr_user_locked_vm(pages, npages); + break; + case IOPT_PAGES_ACCOUNT_MM: + rc = update_mm_locked_vm(pages, npages, inc, user); + break; + } + if (rc) + return rc; + + pages->last_npinned = pages->npinned; + if (inc) + atomic64_add(npages, &pages->source_mm->pinned_vm); + else + atomic64_sub(npages, &pages->source_mm->pinned_vm); + return 0; +} + +static void update_unpinned(struct iopt_pages *pages) +{ + if (WARN_ON(pages->npinned > pages->last_npinned)) + return; + if (pages->npinned == pages->last_npinned) + return; + do_update_pinned(pages, pages->last_npinned - pages->npinned, false, + NULL); +} + +/* + * Changes in the number of pages pinned is done after the pages have been read + * and processed. If the user lacked the limit then the error unwind will unpin + * everything that was just pinned. This is because it is expensive to calculate + * how many pages we have already pinned within a range to generate an accurate + * prediction in advance of doing the work to actually pin them. + */ +static int pfn_reader_user_update_pinned(struct pfn_reader_user *user, + struct iopt_pages *pages) +{ + unsigned long npages; + bool inc; + + lockdep_assert_held(&pages->mutex); + + if (pages->npinned == pages->last_npinned) + return 0; + + if (pages->npinned < pages->last_npinned) { + npages = pages->last_npinned - pages->npinned; + inc = false; + } else { + if (iommufd_should_fail()) + return -ENOMEM; + npages = pages->npinned - pages->last_npinned; + inc = true; + } + return do_update_pinned(pages, npages, inc, user); +} + +/* + * PFNs are stored in three places, in order of preference: + * - The iopt_pages xarray. This is only populated if there is a + * iopt_pages_access + * - The iommu_domain under an area + * - The original PFN source, ie pages->source_mm + * + * This iterator reads the pfns optimizing to load according to the + * above order. 
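+ *
+ * The fill and rw paths below drive it in a loop of roughly this shape
+ * (sketch; error handling elided):
+ *
+ *	rc = pfn_reader_first(&pfns, pages, start_index, last_index);
+ *	while (!pfn_reader_done(&pfns)) {
+ *		... consume pfns.batch ...
+ *		rc = pfn_reader_next(&pfns);
+ *	}
+ *	pfn_reader_destroy(&pfns);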
+ */ +struct pfn_reader { + struct iopt_pages *pages; + struct interval_tree_double_span_iter span; + struct pfn_batch batch; + unsigned long batch_start_index; + unsigned long batch_end_index; + unsigned long last_index; + + struct pfn_reader_user user; +}; + +static int pfn_reader_update_pinned(struct pfn_reader *pfns) +{ + return pfn_reader_user_update_pinned(&pfns->user, pfns->pages); +} + +/* + * The batch can contain a mixture of pages that are still in use and pages that + * need to be unpinned. Unpin only pages that are not held anywhere else. + */ +static void pfn_reader_unpin(struct pfn_reader *pfns) +{ + unsigned long last = pfns->batch_end_index - 1; + unsigned long start = pfns->batch_start_index; + struct interval_tree_double_span_iter span; + struct iopt_pages *pages = pfns->pages; + + lockdep_assert_held(&pages->mutex); + + interval_tree_for_each_double_span(&span, &pages->access_itree, + &pages->domains_itree, start, last) { + if (span.is_used) + continue; + + batch_unpin(&pfns->batch, pages, span.start_hole - start, + span.last_hole - span.start_hole + 1); + } +} + +/* Process a single span to load it from the proper storage */ +static int pfn_reader_fill_span(struct pfn_reader *pfns) +{ + struct interval_tree_double_span_iter *span = &pfns->span; + unsigned long start_index = pfns->batch_end_index; + struct iopt_area *area; + int rc; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(span->last_used < start_index)) + return -EINVAL; + + if (span->is_used == 1) { + batch_from_xarray(&pfns->batch, &pfns->pages->pinned_pfns, + start_index, span->last_used); + return 0; + } + + if (span->is_used == 2) { + /* + * Pull as many pages from the first domain we find in the + * target span. If it is too small then we will be called again + * and we'll find another area. 
+ */ + area = iopt_pages_find_domain_area(pfns->pages, start_index); + if (WARN_ON(!area)) + return -EINVAL; + + /* The storage_domain cannot change without the pages mutex */ + batch_from_domain( + &pfns->batch, area->storage_domain, area, start_index, + min(iopt_area_last_index(area), span->last_used)); + return 0; + } + + if (start_index >= pfns->user.upages_end) { + rc = pfn_reader_user_pin(&pfns->user, pfns->pages, start_index, + span->last_hole); + if (rc) + return rc; + } + + batch_from_pages(&pfns->batch, + pfns->user.upages + + (start_index - pfns->user.upages_start), + pfns->user.upages_end - start_index); + return 0; +} + +static bool pfn_reader_done(struct pfn_reader *pfns) +{ + return pfns->batch_start_index == pfns->last_index + 1; +} + +static int pfn_reader_next(struct pfn_reader *pfns) +{ + int rc; + + batch_clear(&pfns->batch); + pfns->batch_start_index = pfns->batch_end_index; + + while (pfns->batch_end_index != pfns->last_index + 1) { + unsigned int npfns = pfns->batch.total_pfns; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(interval_tree_double_span_iter_done(&pfns->span))) + return -EINVAL; + + rc = pfn_reader_fill_span(pfns); + if (rc) + return rc; + + if (WARN_ON(!pfns->batch.total_pfns)) + return -EINVAL; + + pfns->batch_end_index = + pfns->batch_start_index + pfns->batch.total_pfns; + if (pfns->batch_end_index == pfns->span.last_used + 1) + interval_tree_double_span_iter_next(&pfns->span); + + /* Batch is full */ + if (npfns == pfns->batch.total_pfns) + return 0; + } + return 0; +} + +static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages, + unsigned long start_index, unsigned long last_index) +{ + int rc; + + lockdep_assert_held(&pages->mutex); + + pfns->pages = pages; + pfns->batch_start_index = start_index; + pfns->batch_end_index = start_index; + pfns->last_index = last_index; + pfn_reader_user_init(&pfns->user, pages); + rc = batch_init(&pfns->batch, last_index - start_index + 1); + if (rc) + return rc; + interval_tree_double_span_iter_first(&pfns->span, &pages->access_itree, + &pages->domains_itree, start_index, + last_index); + return 0; +} + +/* + * There are many assertions regarding the state of pages->npinned vs + * pages->last_pinned, for instance something like unmapping a domain must only + * decrement the npinned, and pfn_reader_destroy() must be called only after all + * the pins are updated. This is fine for success flows, but error flows + * sometimes need to release the pins held inside the pfn_reader before going on + * to complete unmapping and releasing pins held in domains. 
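+ *
+ * The fill paths below therefore unwind in the order: pfn_reader_release_pins(),
+ * then unfill whatever was mapped into the domain(s), and only then
+ * pfn_reader_destroy().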
+ */ +static void pfn_reader_release_pins(struct pfn_reader *pfns) +{ + struct iopt_pages *pages = pfns->pages; + + if (pfns->user.upages_end > pfns->batch_end_index) { + size_t npages = pfns->user.upages_end - pfns->batch_end_index; + + /* Any pages not transferred to the batch are just unpinned */ + unpin_user_pages(pfns->user.upages + (pfns->batch_end_index - + pfns->user.upages_start), + npages); + iopt_pages_sub_npinned(pages, npages); + pfns->user.upages_end = pfns->batch_end_index; + } + if (pfns->batch_start_index != pfns->batch_end_index) { + pfn_reader_unpin(pfns); + pfns->batch_start_index = pfns->batch_end_index; + } +} + +static void pfn_reader_destroy(struct pfn_reader *pfns) +{ + struct iopt_pages *pages = pfns->pages; + + pfn_reader_release_pins(pfns); + pfn_reader_user_destroy(&pfns->user, pfns->pages); + batch_destroy(&pfns->batch, NULL); + WARN_ON(pages->last_npinned != pages->npinned); +} + +static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages, + unsigned long start_index, unsigned long last_index) +{ + int rc; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(last_index < start_index)) + return -EINVAL; + + rc = pfn_reader_init(pfns, pages, start_index, last_index); + if (rc) + return rc; + rc = pfn_reader_next(pfns); + if (rc) { + pfn_reader_destroy(pfns); + return rc; + } + return 0; +} + +struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, + bool writable) +{ + struct iopt_pages *pages; + unsigned long end; + + /* + * The iommu API uses size_t as the length, and protect the DIV_ROUND_UP + * below from overflow + */ + if (length > SIZE_MAX - PAGE_SIZE || length == 0) + return ERR_PTR(-EINVAL); + + if (check_add_overflow((unsigned long)uptr, length, &end)) + return ERR_PTR(-EOVERFLOW); + + pages = kzalloc(sizeof(*pages), GFP_KERNEL_ACCOUNT); + if (!pages) + return ERR_PTR(-ENOMEM); + + kref_init(&pages->kref); + xa_init_flags(&pages->pinned_pfns, XA_FLAGS_ACCOUNT); + mutex_init(&pages->mutex); + pages->source_mm = current->mm; + mmgrab(pages->source_mm); + pages->uptr = (void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE); + pages->npages = DIV_ROUND_UP(length + (uptr - pages->uptr), PAGE_SIZE); + pages->access_itree = RB_ROOT_CACHED; + pages->domains_itree = RB_ROOT_CACHED; + pages->writable = writable; + if (capable(CAP_IPC_LOCK)) + pages->account_mode = IOPT_PAGES_ACCOUNT_NONE; + else + pages->account_mode = IOPT_PAGES_ACCOUNT_USER; + pages->source_task = current->group_leader; + get_task_struct(current->group_leader); + pages->source_user = get_uid(current_user()); + return pages; +} + +void iopt_release_pages(struct kref *kref) +{ + struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref); + + WARN_ON(!RB_EMPTY_ROOT(&pages->access_itree.rb_root)); + WARN_ON(!RB_EMPTY_ROOT(&pages->domains_itree.rb_root)); + WARN_ON(pages->npinned); + WARN_ON(!xa_empty(&pages->pinned_pfns)); + mmdrop(pages->source_mm); + mutex_destroy(&pages->mutex); + put_task_struct(pages->source_task); + free_uid(pages->source_user); + kfree(pages); +} + +static void +iopt_area_unpin_domain(struct pfn_batch *batch, struct iopt_area *area, + struct iopt_pages *pages, struct iommu_domain *domain, + unsigned long start_index, unsigned long last_index, + unsigned long *unmapped_end_index, + unsigned long real_last_index) +{ + while (start_index <= last_index) { + unsigned long batch_last_index; + + if (*unmapped_end_index <= last_index) { + unsigned long start = + max(start_index, *unmapped_end_index); + + if 
(IS_ENABLED(CONFIG_IOMMUFD_TEST) && + batch->total_pfns) + WARN_ON(*unmapped_end_index - + batch->total_pfns != + start_index); + batch_from_domain(batch, domain, area, start, + last_index); + batch_last_index = start_index + batch->total_pfns - 1; + } else { + batch_last_index = last_index; + } + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(batch_last_index > real_last_index); + + /* + * unmaps must always 'cut' at a place where the pfns are not + * contiguous to pair with the maps that always install + * contiguous pages. Thus, if we have to stop unpinning in the + * middle of the domains we need to keep reading pfns until we + * find a cut point to do the unmap. The pfns we read are + * carried over and either skipped or integrated into the next + * batch. + */ + if (batch_last_index == last_index && + last_index != real_last_index) + batch_from_domain_continue(batch, domain, area, + last_index + 1, + real_last_index); + + if (*unmapped_end_index <= batch_last_index) { + iopt_area_unmap_domain_range( + area, domain, *unmapped_end_index, + start_index + batch->total_pfns - 1); + *unmapped_end_index = start_index + batch->total_pfns; + } + + /* unpin must follow unmap */ + batch_unpin(batch, pages, 0, + batch_last_index - start_index + 1); + start_index = batch_last_index + 1; + + batch_clear_carry(batch, + *unmapped_end_index - batch_last_index - 1); + } +} + +static void __iopt_area_unfill_domain(struct iopt_area *area, + struct iopt_pages *pages, + struct iommu_domain *domain, + unsigned long last_index) +{ + struct interval_tree_double_span_iter span; + unsigned long start_index = iopt_area_index(area); + unsigned long unmapped_end_index = start_index; + u64 backup[BATCH_BACKUP_SIZE]; + struct pfn_batch batch; + + lockdep_assert_held(&pages->mutex); + + /* + * For security we must not unpin something that is still DMA mapped, + * so this must unmap any IOVA before we go ahead and unpin the pages. + * This creates a complexity where we need to skip over unpinning pages + * held in the xarray, but continue to unmap from the domain. + * + * The domain unmap cannot stop in the middle of a contiguous range of + * PFNs. To solve this problem the unpinning step will read ahead to the + * end of any contiguous span, unmap that whole span, and then only + * unpin the leading part that does not have any accesses. The residual + * PFNs that were unmapped but not unpinned are called a "carry" in the + * batch as they are moved to the front of the PFN list and continue on + * to the next iteration(s). + */ + batch_init_backup(&batch, last_index + 1, backup, sizeof(backup)); + interval_tree_for_each_double_span(&span, &pages->domains_itree, + &pages->access_itree, start_index, + last_index) { + if (span.is_used) { + batch_skip_carry(&batch, + span.last_used - span.start_used + 1); + continue; + } + iopt_area_unpin_domain(&batch, area, pages, domain, + span.start_hole, span.last_hole, + &unmapped_end_index, last_index); + } + /* + * If the range ends in a access then we do the residual unmap without + * any unpins. 
+ */ + if (unmapped_end_index != last_index + 1) + iopt_area_unmap_domain_range(area, domain, unmapped_end_index, + last_index); + WARN_ON(batch.total_pfns); + batch_destroy(&batch, backup); + update_unpinned(pages); +} + +static void iopt_area_unfill_partial_domain(struct iopt_area *area, + struct iopt_pages *pages, + struct iommu_domain *domain, + unsigned long end_index) +{ + if (end_index != iopt_area_index(area)) + __iopt_area_unfill_domain(area, pages, domain, end_index - 1); +} + +/** + * iopt_area_unmap_domain() - Unmap without unpinning PFNs in a domain + * @area: The IOVA range to unmap + * @domain: The domain to unmap + * + * The caller must know that unpinning is not required, usually because there + * are other domains in the iopt. + */ +void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain) +{ + iommu_unmap_nofail(domain, iopt_area_iova(area), + iopt_area_length(area)); +} + +/** + * iopt_area_unfill_domain() - Unmap and unpin PFNs in a domain + * @area: IOVA area to use + * @pages: page supplier for the area (area->pages is NULL) + * @domain: Domain to unmap from + * + * The domain should be removed from the domains_itree before calling. The + * domain will always be unmapped, but the PFNs may not be unpinned if there are + * still accesses. + */ +void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, + struct iommu_domain *domain) +{ + __iopt_area_unfill_domain(area, pages, domain, + iopt_area_last_index(area)); +} + +/** + * iopt_area_fill_domain() - Map PFNs from the area into a domain + * @area: IOVA area to use + * @domain: Domain to load PFNs into + * + * Read the pfns from the area's underlying iopt_pages and map them into the + * given domain. Called when attaching a new domain to an io_pagetable. + */ +int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain) +{ + unsigned long done_end_index; + struct pfn_reader pfns; + int rc; + + lockdep_assert_held(&area->pages->mutex); + + rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area), + iopt_area_last_index(area)); + if (rc) + return rc; + + while (!pfn_reader_done(&pfns)) { + done_end_index = pfns.batch_start_index; + rc = batch_to_domain(&pfns.batch, domain, area, + pfns.batch_start_index); + if (rc) + goto out_unmap; + done_end_index = pfns.batch_end_index; + + rc = pfn_reader_next(&pfns); + if (rc) + goto out_unmap; + } + + rc = pfn_reader_update_pinned(&pfns); + if (rc) + goto out_unmap; + goto out_destroy; + +out_unmap: + pfn_reader_release_pins(&pfns); + iopt_area_unfill_partial_domain(area, area->pages, domain, + done_end_index); +out_destroy: + pfn_reader_destroy(&pfns); + return rc; +} + +/** + * iopt_area_fill_domains() - Install PFNs into the area's domains + * @area: The area to act on + * @pages: The pages associated with the area (area->pages is NULL) + * + * Called during area creation. The area is freshly created and not inserted in + * the domains_itree yet. PFNs are read and loaded into every domain held in the + * area's io_pagetable and the area is installed in the domains_itree. + * + * On failure all domains are left unchanged. 
+ */ +int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages) +{ + unsigned long done_first_end_index; + unsigned long done_all_end_index; + struct iommu_domain *domain; + unsigned long unmap_index; + struct pfn_reader pfns; + unsigned long index; + int rc; + + lockdep_assert_held(&area->iopt->domains_rwsem); + + if (xa_empty(&area->iopt->domains)) + return 0; + + mutex_lock(&pages->mutex); + rc = pfn_reader_first(&pfns, pages, iopt_area_index(area), + iopt_area_last_index(area)); + if (rc) + goto out_unlock; + + while (!pfn_reader_done(&pfns)) { + done_first_end_index = pfns.batch_end_index; + done_all_end_index = pfns.batch_start_index; + xa_for_each(&area->iopt->domains, index, domain) { + rc = batch_to_domain(&pfns.batch, domain, area, + pfns.batch_start_index); + if (rc) + goto out_unmap; + } + done_all_end_index = done_first_end_index; + + rc = pfn_reader_next(&pfns); + if (rc) + goto out_unmap; + } + rc = pfn_reader_update_pinned(&pfns); + if (rc) + goto out_unmap; + + area->storage_domain = xa_load(&area->iopt->domains, 0); + interval_tree_insert(&area->pages_node, &pages->domains_itree); + goto out_destroy; + +out_unmap: + pfn_reader_release_pins(&pfns); + xa_for_each(&area->iopt->domains, unmap_index, domain) { + unsigned long end_index; + + if (unmap_index < index) + end_index = done_first_end_index; + else + end_index = done_all_end_index; + + /* + * The area is not yet part of the domains_itree so we have to + * manage the unpinning specially. The last domain does the + * unpin, every other domain is just unmapped. + */ + if (unmap_index != area->iopt->next_domain_id - 1) { + if (end_index != iopt_area_index(area)) + iopt_area_unmap_domain_range( + area, domain, iopt_area_index(area), + end_index - 1); + } else { + iopt_area_unfill_partial_domain(area, pages, domain, + end_index); + } + } +out_destroy: + pfn_reader_destroy(&pfns); +out_unlock: + mutex_unlock(&pages->mutex); + return rc; +} + +/** + * iopt_area_unfill_domains() - unmap PFNs from the area's domains + * @area: The area to act on + * @pages: The pages associated with the area (area->pages is NULL) + * + * Called during area destruction. This unmaps the iova's covered by all the + * area's domains and releases the PFNs. 
+ */ +void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages) +{ + struct io_pagetable *iopt = area->iopt; + struct iommu_domain *domain; + unsigned long index; + + lockdep_assert_held(&iopt->domains_rwsem); + + mutex_lock(&pages->mutex); + if (!area->storage_domain) + goto out_unlock; + + xa_for_each(&iopt->domains, index, domain) + if (domain != area->storage_domain) + iopt_area_unmap_domain_range( + area, domain, iopt_area_index(area), + iopt_area_last_index(area)); + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb)); + interval_tree_remove(&area->pages_node, &pages->domains_itree); + iopt_area_unfill_domain(area, pages, area->storage_domain); + area->storage_domain = NULL; +out_unlock: + mutex_unlock(&pages->mutex); +} + +static void iopt_pages_unpin_xarray(struct pfn_batch *batch, + struct iopt_pages *pages, + unsigned long start_index, + unsigned long end_index) +{ + while (start_index <= end_index) { + batch_from_xarray_clear(batch, &pages->pinned_pfns, start_index, + end_index); + batch_unpin(batch, pages, 0, batch->total_pfns); + start_index += batch->total_pfns; + batch_clear(batch); + } +} + +/** + * iopt_pages_unfill_xarray() - Update the xarry after removing an access + * @pages: The pages to act on + * @start_index: Starting PFN index + * @last_index: Last PFN index + * + * Called when an iopt_pages_access is removed, removes pages from the itree. + * The access should already be removed from the access_itree. + */ +void iopt_pages_unfill_xarray(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index) +{ + struct interval_tree_double_span_iter span; + u64 backup[BATCH_BACKUP_SIZE]; + struct pfn_batch batch; + bool batch_inited = false; + + lockdep_assert_held(&pages->mutex); + + interval_tree_for_each_double_span(&span, &pages->access_itree, + &pages->domains_itree, start_index, + last_index) { + if (!span.is_used) { + if (!batch_inited) { + batch_init_backup(&batch, + last_index - start_index + 1, + backup, sizeof(backup)); + batch_inited = true; + } + iopt_pages_unpin_xarray(&batch, pages, span.start_hole, + span.last_hole); + } else if (span.is_used == 2) { + /* Covered by a domain */ + clear_xarray(&pages->pinned_pfns, span.start_used, + span.last_used); + } + /* Otherwise covered by an existing access */ + } + if (batch_inited) + batch_destroy(&batch, backup); + update_unpinned(pages); +} + +/** + * iopt_pages_fill_from_xarray() - Fast path for reading PFNs + * @pages: The pages to act on + * @start_index: The first page index in the range + * @last_index: The last page index in the range + * @out_pages: The output array to return the pages + * + * This can be called if the caller is holding a refcount on an + * iopt_pages_access that is known to have already been filled. It quickly reads + * the pages directly from the xarray. + * + * This is part of the SW iommu interface to read pages for in-kernel use. 
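+ *
+ * iopt_area_add_access() takes this fast path when an access covering exactly
+ * the same index range already exists.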
+ */ +void iopt_pages_fill_from_xarray(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) +{ + XA_STATE(xas, &pages->pinned_pfns, start_index); + void *entry; + + rcu_read_lock(); + while (start_index <= last_index) { + entry = xas_next(&xas); + if (xas_retry(&xas, entry)) + continue; + WARN_ON(!xa_is_value(entry)); + *(out_pages++) = pfn_to_page(xa_to_value(entry)); + start_index++; + } + rcu_read_unlock(); +} + +static int iopt_pages_fill_from_domain(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) +{ + while (start_index != last_index + 1) { + unsigned long domain_last; + struct iopt_area *area; + + area = iopt_pages_find_domain_area(pages, start_index); + if (WARN_ON(!area)) + return -EINVAL; + + domain_last = min(iopt_area_last_index(area), last_index); + out_pages = raw_pages_from_domain(area->storage_domain, area, + start_index, domain_last, + out_pages); + start_index = domain_last + 1; + } + return 0; +} + +static int iopt_pages_fill_from_mm(struct iopt_pages *pages, + struct pfn_reader_user *user, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) +{ + unsigned long cur_index = start_index; + int rc; + + while (cur_index != last_index + 1) { + user->upages = out_pages + (cur_index - start_index); + rc = pfn_reader_user_pin(user, pages, cur_index, last_index); + if (rc) + goto out_unpin; + cur_index = user->upages_end; + } + return 0; + +out_unpin: + if (start_index != cur_index) + iopt_pages_err_unpin(pages, start_index, cur_index - 1, + out_pages); + return rc; +} + +/** + * iopt_pages_fill_xarray() - Read PFNs + * @pages: The pages to act on + * @start_index: The first page index in the range + * @last_index: The last page index in the range + * @out_pages: The output array to return the pages, may be NULL + * + * This populates the xarray and returns the pages in out_pages. As the slow + * path this is able to copy pages from other storage tiers into the xarray. + * + * On failure the xarray is left unchanged. + * + * This is part of the SW iommu interface to read pages for in-kernel use. 
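+ *
+ * iopt_area_add_access() uses this slower path when no access with a matching
+ * index range exists yet.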
+ */ +int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start_index, + unsigned long last_index, struct page **out_pages) +{ + struct interval_tree_double_span_iter span; + unsigned long xa_end = start_index; + struct pfn_reader_user user; + int rc; + + lockdep_assert_held(&pages->mutex); + + pfn_reader_user_init(&user, pages); + user.upages_len = (last_index - start_index + 1) * sizeof(*out_pages); + interval_tree_for_each_double_span(&span, &pages->access_itree, + &pages->domains_itree, start_index, + last_index) { + struct page **cur_pages; + + if (span.is_used == 1) { + cur_pages = out_pages + (span.start_used - start_index); + iopt_pages_fill_from_xarray(pages, span.start_used, + span.last_used, cur_pages); + continue; + } + + if (span.is_used == 2) { + cur_pages = out_pages + (span.start_used - start_index); + iopt_pages_fill_from_domain(pages, span.start_used, + span.last_used, cur_pages); + rc = pages_to_xarray(&pages->pinned_pfns, + span.start_used, span.last_used, + cur_pages); + if (rc) + goto out_clean_xa; + xa_end = span.last_used + 1; + continue; + } + + /* hole */ + cur_pages = out_pages + (span.start_hole - start_index); + rc = iopt_pages_fill_from_mm(pages, &user, span.start_hole, + span.last_hole, cur_pages); + if (rc) + goto out_clean_xa; + rc = pages_to_xarray(&pages->pinned_pfns, span.start_hole, + span.last_hole, cur_pages); + if (rc) { + iopt_pages_err_unpin(pages, span.start_hole, + span.last_hole, cur_pages); + goto out_clean_xa; + } + xa_end = span.last_hole + 1; + } + rc = pfn_reader_user_update_pinned(&user, pages); + if (rc) + goto out_clean_xa; + user.upages = NULL; + pfn_reader_user_destroy(&user, pages); + return 0; + +out_clean_xa: + if (start_index != xa_end) + iopt_pages_unfill_xarray(pages, start_index, xa_end - 1); + user.upages = NULL; + pfn_reader_user_destroy(&user, pages); + return rc; +} + +/* + * This uses the pfn_reader instead of taking a shortcut by using the mm. It can + * do every scenario and is fully consistent with what an iommu_domain would + * see. + */ +static int iopt_pages_rw_slow(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index, unsigned long offset, + void *data, unsigned long length, + unsigned int flags) +{ + struct pfn_reader pfns; + int rc; + + mutex_lock(&pages->mutex); + + rc = pfn_reader_first(&pfns, pages, start_index, last_index); + if (rc) + goto out_unlock; + + while (!pfn_reader_done(&pfns)) { + unsigned long done; + + done = batch_rw(&pfns.batch, data, offset, length, flags); + data += done; + length -= done; + offset = 0; + pfn_reader_unpin(&pfns); + + rc = pfn_reader_next(&pfns); + if (rc) + goto out_destroy; + } + if (WARN_ON(length != 0)) + rc = -EINVAL; +out_destroy: + pfn_reader_destroy(&pfns); +out_unlock: + mutex_unlock(&pages->mutex); + return rc; +} + +/* + * A medium speed path that still allows DMA inconsistencies, but doesn't do any + * memory allocations or interval tree searches. + */ +static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index, + unsigned long offset, void *data, + unsigned long length, unsigned int flags) +{ + struct page *page = NULL; + int rc; + + if (!mmget_not_zero(pages->source_mm)) + return iopt_pages_rw_slow(pages, index, index, offset, data, + length, flags); + + if (iommufd_should_fail()) { + rc = -EINVAL; + goto out_mmput; + } + + mmap_read_lock(pages->source_mm); + rc = pin_user_pages_remote( + pages->source_mm, (uintptr_t)(pages->uptr + index * PAGE_SIZE), + 1, (flags & IOMMUFD_ACCESS_RW_WRITE) ? 
FOLL_WRITE : 0, &page, + NULL); + mmap_read_unlock(pages->source_mm); + if (rc != 1) { + if (WARN_ON(rc >= 0)) + rc = -EINVAL; + goto out_mmput; + } + copy_data_page(page, data, offset, length, flags); + unpin_user_page(page); + rc = 0; + +out_mmput: + mmput(pages->source_mm); + return rc; +} + +/** + * iopt_pages_rw_access - Copy to/from a linear slice of the pages + * @pages: pages to act on + * @start_byte: First byte of pages to copy to/from + * @data: Kernel buffer to get/put the data + * @length: Number of bytes to copy + * @flags: IOMMUFD_ACCESS_RW_* flags + * + * This will find each page in the range, kmap it and then memcpy to/from + * the given kernel buffer. + */ +int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, + void *data, unsigned long length, unsigned int flags) +{ + unsigned long start_index = start_byte / PAGE_SIZE; + unsigned long last_index = (start_byte + length - 1) / PAGE_SIZE; + bool change_mm = current->mm != pages->source_mm; + int rc = 0; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + (flags & __IOMMUFD_ACCESS_RW_SLOW_PATH)) + change_mm = true; + + if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable) + return -EPERM; + + if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) { + if (start_index == last_index) + return iopt_pages_rw_page(pages, start_index, + start_byte % PAGE_SIZE, data, + length, flags); + return iopt_pages_rw_slow(pages, start_index, last_index, + start_byte % PAGE_SIZE, data, length, + flags); + } + + /* + * Try to copy using copy_to_user(). We do this as a fast path and + * ignore any pinning inconsistencies, unlike a real DMA path. + */ + if (change_mm) { + if (!mmget_not_zero(pages->source_mm)) + return iopt_pages_rw_slow(pages, start_index, + last_index, + start_byte % PAGE_SIZE, data, + length, flags); + kthread_use_mm(pages->source_mm); + } + + if (flags & IOMMUFD_ACCESS_RW_WRITE) { + if (copy_to_user(pages->uptr + start_byte, data, length)) + rc = -EFAULT; + } else { + if (copy_from_user(data, pages->uptr + start_byte, length)) + rc = -EFAULT; + } + + if (change_mm) { + kthread_unuse_mm(pages->source_mm); + mmput(pages->source_mm); + } + + return rc; +} + +static struct iopt_pages_access * +iopt_pages_get_exact_access(struct iopt_pages *pages, unsigned long index, + unsigned long last) +{ + struct interval_tree_node *node; + + lockdep_assert_held(&pages->mutex); + + /* There can be overlapping ranges in this interval tree */ + for (node = interval_tree_iter_first(&pages->access_itree, index, last); + node; node = interval_tree_iter_next(node, index, last)) + if (node->start == index && node->last == last) + return container_of(node, struct iopt_pages_access, + node); + return NULL; +} + +/** + * iopt_area_add_access() - Record an in-knerel access for PFNs + * @area: The source of PFNs + * @start_index: First page index + * @last_index: Inclusive last page index + * @out_pages: Output list of struct page's representing the PFNs + * @flags: IOMMUFD_ACCESS_RW_* flags + * + * Record that an in-kernel access will be accessing the pages, ensure they are + * pinned, and return the PFNs as a simple list of 'struct page *'. 
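+ *
+ * A minimal caller sketch (illustrative only), with out_pages sized to hold
+ * last_index - start_index + 1 entries:
+ *
+ *   rc = iopt_area_add_access(area, start_index, last_index, out_pages, 0);
+ *   if (!rc)
+ *           ... out_pages[] remains pinned and usable until removed ...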
+ * + * This should be undone through a matching call to iopt_area_remove_access() + */ +int iopt_area_add_access(struct iopt_area *area, unsigned long start_index, + unsigned long last_index, struct page **out_pages, + unsigned int flags) +{ + struct iopt_pages *pages = area->pages; + struct iopt_pages_access *access; + int rc; + + if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable) + return -EPERM; + + mutex_lock(&pages->mutex); + access = iopt_pages_get_exact_access(pages, start_index, last_index); + if (access) { + area->num_accesses++; + access->users++; + iopt_pages_fill_from_xarray(pages, start_index, last_index, + out_pages); + mutex_unlock(&pages->mutex); + return 0; + } + + access = kzalloc(sizeof(*access), GFP_KERNEL_ACCOUNT); + if (!access) { + rc = -ENOMEM; + goto err_unlock; + } + + rc = iopt_pages_fill_xarray(pages, start_index, last_index, out_pages); + if (rc) + goto err_free; + + access->node.start = start_index; + access->node.last = last_index; + access->users = 1; + area->num_accesses++; + interval_tree_insert(&access->node, &pages->access_itree); + mutex_unlock(&pages->mutex); + return 0; + +err_free: + kfree(access); +err_unlock: + mutex_unlock(&pages->mutex); + return rc; +} + +/** + * iopt_area_remove_access() - Release an in-kernel access for PFNs + * @area: The source of PFNs + * @start_index: First page index + * @last_index: Inclusive last page index + * + * Undo iopt_area_add_access() and unpin the pages if necessary. The caller + * must stop using the PFNs before calling this. + */ +void iopt_area_remove_access(struct iopt_area *area, unsigned long start_index, + unsigned long last_index) +{ + struct iopt_pages *pages = area->pages; + struct iopt_pages_access *access; + + mutex_lock(&pages->mutex); + access = iopt_pages_get_exact_access(pages, start_index, last_index); + if (WARN_ON(!access)) + goto out_unlock; + + WARN_ON(area->num_accesses == 0 || access->users == 0); + area->num_accesses--; + access->users--; + if (access->users) + goto out_unlock; + + interval_tree_remove(&access->node, &pages->access_itree); + iopt_pages_unfill_xarray(pages, start_index, last_index); + kfree(access); +out_unlock: + mutex_unlock(&pages->mutex); +} diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c new file mode 100644 index 0000000000..56506d5753 --- /dev/null +++ b/drivers/iommu/iommufd/selftest.c @@ -0,0 +1,1107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + * + * Kernel side components to support tools/testing/selftests/iommu + */ +#include <linux/slab.h> +#include <linux/iommu.h> +#include <linux/xarray.h> +#include <linux/file.h> +#include <linux/anon_inodes.h> +#include <linux/fault-inject.h> +#include <linux/platform_device.h> +#include <uapi/linux/iommufd.h> + +#include "../iommu-priv.h" +#include "io_pagetable.h" +#include "iommufd_private.h" +#include "iommufd_test.h" + +static DECLARE_FAULT_ATTR(fail_iommufd); +static struct dentry *dbgfs_root; +static struct platform_device *selftest_iommu_dev; + +size_t iommufd_test_memory_limit = 65536; + +enum { + MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2, + + /* + * Like a real page table alignment requires the low bits of the address + * to be zero. xarray also requires the high bit to be zero, so we store + * the pfns shifted. The upper bits are used for metadata. 
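+ *
+ * For example, assuming a 4K PAGE_SIZE (so MOCK_IO_PAGE_SIZE is 2K): a
+ * mapping of paddr 0x5000 is stored as xa_mk_value(0x5000 / 0x800) == 0xa,
+ * and the entries for the first and last IO page of a map call additionally
+ * OR in the MOCK_PFN_START_IOVA / MOCK_PFN_LAST_IOVA bits above
+ * MOCK_PFN_MASK.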
+ */ + MOCK_PFN_MASK = ULONG_MAX / MOCK_IO_PAGE_SIZE, + + _MOCK_PFN_START = MOCK_PFN_MASK + 1, + MOCK_PFN_START_IOVA = _MOCK_PFN_START, + MOCK_PFN_LAST_IOVA = _MOCK_PFN_START, +}; + +/* + * Syzkaller has trouble randomizing the correct iova to use since it is linked + * to the map ioctl's output, and it has no ide about that. So, simplify things. + * In syzkaller mode the 64 bit IOVA is converted into an nth area and offset + * value. This has a much smaller randomization space and syzkaller can hit it. + */ +static unsigned long iommufd_test_syz_conv_iova(struct io_pagetable *iopt, + u64 *iova) +{ + struct syz_layout { + __u32 nth_area; + __u32 offset; + }; + struct syz_layout *syz = (void *)iova; + unsigned int nth = syz->nth_area; + struct iopt_area *area; + + down_read(&iopt->iova_rwsem); + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + if (nth == 0) { + up_read(&iopt->iova_rwsem); + return iopt_area_iova(area) + syz->offset; + } + nth--; + } + up_read(&iopt->iova_rwsem); + + return 0; +} + +void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, + unsigned int ioas_id, u64 *iova, u32 *flags) +{ + struct iommufd_ioas *ioas; + + if (!(*flags & MOCK_FLAGS_ACCESS_SYZ)) + return; + *flags &= ~(u32)MOCK_FLAGS_ACCESS_SYZ; + + ioas = iommufd_get_ioas(ucmd->ictx, ioas_id); + if (IS_ERR(ioas)) + return; + *iova = iommufd_test_syz_conv_iova(&ioas->iopt, iova); + iommufd_put_object(&ioas->obj); +} + +struct mock_iommu_domain { + struct iommu_domain domain; + struct xarray pfns; +}; + +enum selftest_obj_type { + TYPE_IDEV, +}; + +struct mock_dev { + struct device dev; +}; + +struct selftest_obj { + struct iommufd_object obj; + enum selftest_obj_type type; + + union { + struct { + struct iommufd_device *idev; + struct iommufd_ctx *ictx; + struct mock_dev *mock_dev; + } idev; + }; +}; + +static void mock_domain_blocking_free(struct iommu_domain *domain) +{ +} + +static int mock_domain_nop_attach(struct iommu_domain *domain, + struct device *dev) +{ + return 0; +} + +static const struct iommu_domain_ops mock_blocking_ops = { + .free = mock_domain_blocking_free, + .attach_dev = mock_domain_nop_attach, +}; + +static struct iommu_domain mock_blocking_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &mock_blocking_ops, +}; + +static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type) +{ + struct iommu_test_hw_info *info; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + info->test_reg = IOMMU_HW_INFO_SELFTEST_REGVAL; + *length = sizeof(*info); + *type = IOMMU_HW_INFO_TYPE_SELFTEST; + + return info; +} + +static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type) +{ + struct mock_iommu_domain *mock; + + if (iommu_domain_type == IOMMU_DOMAIN_BLOCKED) + return &mock_blocking_domain; + + if (iommu_domain_type != IOMMU_DOMAIN_UNMANAGED) + return NULL; + + mock = kzalloc(sizeof(*mock), GFP_KERNEL); + if (!mock) + return NULL; + mock->domain.geometry.aperture_start = MOCK_APERTURE_START; + mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; + mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; + xa_init(&mock->pfns); + return &mock->domain; +} + +static void mock_domain_free(struct iommu_domain *domain) +{ + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + + WARN_ON(!xa_empty(&mock->pfns)); + kfree(mock); +} + +static int mock_domain_map_pages(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + 
size_t pgsize, size_t pgcount, int prot, + gfp_t gfp, size_t *mapped) +{ + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + unsigned long flags = MOCK_PFN_START_IOVA; + unsigned long start_iova = iova; + + /* + * xarray does not reliably work with fault injection because it does a + * retry allocation, so put our own failure point. + */ + if (iommufd_should_fail()) + return -ENOENT; + + WARN_ON(iova % MOCK_IO_PAGE_SIZE); + WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); + for (; pgcount; pgcount--) { + size_t cur; + + for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { + void *old; + + if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) + flags = MOCK_PFN_LAST_IOVA; + old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE, + xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) | + flags), + gfp); + if (xa_is_err(old)) { + for (; start_iova != iova; + start_iova += MOCK_IO_PAGE_SIZE) + xa_erase(&mock->pfns, + start_iova / + MOCK_IO_PAGE_SIZE); + return xa_err(old); + } + WARN_ON(old); + iova += MOCK_IO_PAGE_SIZE; + paddr += MOCK_IO_PAGE_SIZE; + *mapped += MOCK_IO_PAGE_SIZE; + flags = 0; + } + } + return 0; +} + +static size_t mock_domain_unmap_pages(struct iommu_domain *domain, + unsigned long iova, size_t pgsize, + size_t pgcount, + struct iommu_iotlb_gather *iotlb_gather) +{ + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + bool first = true; + size_t ret = 0; + void *ent; + + WARN_ON(iova % MOCK_IO_PAGE_SIZE); + WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); + + for (; pgcount; pgcount--) { + size_t cur; + + for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { + ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + WARN_ON(!ent); + /* + * iommufd generates unmaps that must be a strict + * superset of the map's performend So every starting + * IOVA should have been an iova passed to map, and the + * + * First IOVA must be present and have been a first IOVA + * passed to map_pages + */ + if (first) { + WARN_ON(!(xa_to_value(ent) & + MOCK_PFN_START_IOVA)); + first = false; + } + if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) + WARN_ON(!(xa_to_value(ent) & + MOCK_PFN_LAST_IOVA)); + + iova += MOCK_IO_PAGE_SIZE; + ret += MOCK_IO_PAGE_SIZE; + } + } + return ret; +} + +static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain, + dma_addr_t iova) +{ + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + void *ent; + + WARN_ON(iova % MOCK_IO_PAGE_SIZE); + ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + WARN_ON(!ent); + return (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE; +} + +static bool mock_domain_capable(struct device *dev, enum iommu_cap cap) +{ + return cap == IOMMU_CAP_CACHE_COHERENCY; +} + +static void mock_domain_set_plaform_dma_ops(struct device *dev) +{ + /* + * mock doesn't setup default domains because we can't hook into the + * normal probe path + */ +} + +static struct iommu_device mock_iommu_device = { +}; + +static struct iommu_device *mock_probe_device(struct device *dev) +{ + return &mock_iommu_device; +} + +static const struct iommu_ops mock_ops = { + .owner = THIS_MODULE, + .pgsize_bitmap = MOCK_IO_PAGE_SIZE, + .hw_info = mock_domain_hw_info, + .domain_alloc = mock_domain_alloc, + .capable = mock_domain_capable, + .set_platform_dma_ops = mock_domain_set_plaform_dma_ops, + .device_group = generic_device_group, + .probe_device = mock_probe_device, + .default_domain_ops = + &(struct iommu_domain_ops){ + .free = mock_domain_free, + 
.attach_dev = mock_domain_nop_attach, + .map_pages = mock_domain_map_pages, + .unmap_pages = mock_domain_unmap_pages, + .iova_to_phys = mock_domain_iova_to_phys, + }, +}; + +static inline struct iommufd_hw_pagetable * +get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, + struct mock_iommu_domain **mock) +{ + struct iommufd_hw_pagetable *hwpt; + struct iommufd_object *obj; + + obj = iommufd_get_object(ucmd->ictx, mockpt_id, + IOMMUFD_OBJ_HW_PAGETABLE); + if (IS_ERR(obj)) + return ERR_CAST(obj); + hwpt = container_of(obj, struct iommufd_hw_pagetable, obj); + if (hwpt->domain->ops != mock_ops.default_domain_ops) { + iommufd_put_object(&hwpt->obj); + return ERR_PTR(-EINVAL); + } + *mock = container_of(hwpt->domain, struct mock_iommu_domain, domain); + return hwpt; +} + +struct mock_bus_type { + struct bus_type bus; + struct notifier_block nb; +}; + +static struct mock_bus_type iommufd_mock_bus_type = { + .bus = { + .name = "iommufd_mock", + }, +}; + +static atomic_t mock_dev_num; + +static void mock_dev_release(struct device *dev) +{ + struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + + atomic_dec(&mock_dev_num); + kfree(mdev); +} + +static struct mock_dev *mock_dev_create(void) +{ + struct mock_dev *mdev; + int rc; + + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); + if (!mdev) + return ERR_PTR(-ENOMEM); + + device_initialize(&mdev->dev); + mdev->dev.release = mock_dev_release; + mdev->dev.bus = &iommufd_mock_bus_type.bus; + + rc = dev_set_name(&mdev->dev, "iommufd_mock%u", + atomic_inc_return(&mock_dev_num)); + if (rc) + goto err_put; + + rc = device_add(&mdev->dev); + if (rc) + goto err_put; + return mdev; + +err_put: + put_device(&mdev->dev); + return ERR_PTR(rc); +} + +static void mock_dev_destroy(struct mock_dev *mdev) +{ + device_unregister(&mdev->dev); +} + +bool iommufd_selftest_is_mock_dev(struct device *dev) +{ + return dev->release == mock_dev_release; +} + +/* Create an hw_pagetable with the mock domain so we can test the domain ops */ +static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + struct iommufd_device *idev; + struct selftest_obj *sobj; + u32 pt_id = cmd->id; + u32 idev_id; + int rc; + + sobj = iommufd_object_alloc(ucmd->ictx, sobj, IOMMUFD_OBJ_SELFTEST); + if (IS_ERR(sobj)) + return PTR_ERR(sobj); + + sobj->idev.ictx = ucmd->ictx; + sobj->type = TYPE_IDEV; + + sobj->idev.mock_dev = mock_dev_create(); + if (IS_ERR(sobj->idev.mock_dev)) { + rc = PTR_ERR(sobj->idev.mock_dev); + goto out_sobj; + } + + idev = iommufd_device_bind(ucmd->ictx, &sobj->idev.mock_dev->dev, + &idev_id); + if (IS_ERR(idev)) { + rc = PTR_ERR(idev); + goto out_mdev; + } + sobj->idev.idev = idev; + + rc = iommufd_device_attach(idev, &pt_id); + if (rc) + goto out_unbind; + + /* Userspace must destroy the device_id to destroy the object */ + cmd->mock_domain.out_hwpt_id = pt_id; + cmd->mock_domain.out_stdev_id = sobj->obj.id; + cmd->mock_domain.out_idev_id = idev_id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_detach; + iommufd_object_finalize(ucmd->ictx, &sobj->obj); + return 0; + +out_detach: + iommufd_device_detach(idev); +out_unbind: + iommufd_device_unbind(idev); +out_mdev: + mock_dev_destroy(sobj->idev.mock_dev); +out_sobj: + iommufd_object_abort(ucmd->ictx, &sobj->obj); + return rc; +} + +/* Replace the mock domain with a manually allocated hw_pagetable */ +static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd, + unsigned int device_id, u32 pt_id, + struct iommu_test_cmd *cmd) +{ + struct 
iommufd_object *dev_obj; + struct selftest_obj *sobj; + int rc; + + /* + * Prefer to use the OBJ_SELFTEST because the destroy_rwsem will ensure + * it doesn't race with detach, which is not allowed. + */ + dev_obj = + iommufd_get_object(ucmd->ictx, device_id, IOMMUFD_OBJ_SELFTEST); + if (IS_ERR(dev_obj)) + return PTR_ERR(dev_obj); + + sobj = container_of(dev_obj, struct selftest_obj, obj); + if (sobj->type != TYPE_IDEV) { + rc = -EINVAL; + goto out_dev_obj; + } + + rc = iommufd_device_replace(sobj->idev.idev, &pt_id); + if (rc) + goto out_dev_obj; + + cmd->mock_domain_replace.pt_id = pt_id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + +out_dev_obj: + iommufd_put_object(dev_obj); + return rc; +} + +/* Add an additional reserved IOVA to the IOAS */ +static int iommufd_test_add_reserved(struct iommufd_ucmd *ucmd, + unsigned int mockpt_id, + unsigned long start, size_t length) +{ + struct iommufd_ioas *ioas; + int rc; + + ioas = iommufd_get_ioas(ucmd->ictx, mockpt_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + down_write(&ioas->iopt.iova_rwsem); + rc = iopt_reserve_iova(&ioas->iopt, start, start + length - 1, NULL); + up_write(&ioas->iopt.iova_rwsem); + iommufd_put_object(&ioas->obj); + return rc; +} + +/* Check that every pfn under each iova matches the pfn under a user VA */ +static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd, + unsigned int mockpt_id, unsigned long iova, + size_t length, void __user *uptr) +{ + struct iommufd_hw_pagetable *hwpt; + struct mock_iommu_domain *mock; + uintptr_t end; + int rc; + + if (iova % MOCK_IO_PAGE_SIZE || length % MOCK_IO_PAGE_SIZE || + (uintptr_t)uptr % MOCK_IO_PAGE_SIZE || + check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end)) + return -EINVAL; + + hwpt = get_md_pagetable(ucmd, mockpt_id, &mock); + if (IS_ERR(hwpt)) + return PTR_ERR(hwpt); + + for (; length; length -= MOCK_IO_PAGE_SIZE) { + struct page *pages[1]; + unsigned long pfn; + long npages; + void *ent; + + npages = get_user_pages_fast((uintptr_t)uptr & PAGE_MASK, 1, 0, + pages); + if (npages < 0) { + rc = npages; + goto out_put; + } + if (WARN_ON(npages != 1)) { + rc = -EFAULT; + goto out_put; + } + pfn = page_to_pfn(pages[0]); + put_page(pages[0]); + + ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + if (!ent || + (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE != + pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) { + rc = -EINVAL; + goto out_put; + } + iova += MOCK_IO_PAGE_SIZE; + uptr += MOCK_IO_PAGE_SIZE; + } + rc = 0; + +out_put: + iommufd_put_object(&hwpt->obj); + return rc; +} + +/* Check that the page ref count matches, to look for missing pin/unpins */ +static int iommufd_test_md_check_refs(struct iommufd_ucmd *ucmd, + void __user *uptr, size_t length, + unsigned int refs) +{ + uintptr_t end; + + if (length % PAGE_SIZE || (uintptr_t)uptr % PAGE_SIZE || + check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end)) + return -EINVAL; + + for (; length; length -= PAGE_SIZE) { + struct page *pages[1]; + long npages; + + npages = get_user_pages_fast((uintptr_t)uptr, 1, 0, pages); + if (npages < 0) + return npages; + if (WARN_ON(npages != 1)) + return -EFAULT; + if (!PageCompound(pages[0])) { + unsigned int count; + + count = page_ref_count(pages[0]); + if (count / GUP_PIN_COUNTING_BIAS != refs) { + put_page(pages[0]); + return -EIO; + } + } + put_page(pages[0]); + uptr += PAGE_SIZE; + } + return 0; +} + +struct selftest_access { + struct iommufd_access *access; + struct file *file; + struct mutex lock; + struct list_head items; + unsigned int 
next_id; + bool destroying; +}; + +struct selftest_access_item { + struct list_head items_elm; + unsigned long iova; + size_t length; + unsigned int id; +}; + +static const struct file_operations iommfd_test_staccess_fops; + +static struct selftest_access *iommufd_access_get(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADFD); + + if (file->f_op != &iommfd_test_staccess_fops) { + fput(file); + return ERR_PTR(-EBADFD); + } + return file->private_data; +} + +static void iommufd_test_access_unmap(void *data, unsigned long iova, + unsigned long length) +{ + unsigned long iova_last = iova + length - 1; + struct selftest_access *staccess = data; + struct selftest_access_item *item; + struct selftest_access_item *tmp; + + mutex_lock(&staccess->lock); + list_for_each_entry_safe(item, tmp, &staccess->items, items_elm) { + if (iova > item->iova + item->length - 1 || + iova_last < item->iova) + continue; + list_del(&item->items_elm); + iommufd_access_unpin_pages(staccess->access, item->iova, + item->length); + kfree(item); + } + mutex_unlock(&staccess->lock); +} + +static int iommufd_test_access_item_destroy(struct iommufd_ucmd *ucmd, + unsigned int access_id, + unsigned int item_id) +{ + struct selftest_access_item *item; + struct selftest_access *staccess; + + staccess = iommufd_access_get(access_id); + if (IS_ERR(staccess)) + return PTR_ERR(staccess); + + mutex_lock(&staccess->lock); + list_for_each_entry(item, &staccess->items, items_elm) { + if (item->id == item_id) { + list_del(&item->items_elm); + iommufd_access_unpin_pages(staccess->access, item->iova, + item->length); + mutex_unlock(&staccess->lock); + kfree(item); + fput(staccess->file); + return 0; + } + } + mutex_unlock(&staccess->lock); + fput(staccess->file); + return -ENOENT; +} + +static int iommufd_test_staccess_release(struct inode *inode, + struct file *filep) +{ + struct selftest_access *staccess = filep->private_data; + + if (staccess->access) { + iommufd_test_access_unmap(staccess, 0, ULONG_MAX); + iommufd_access_destroy(staccess->access); + } + mutex_destroy(&staccess->lock); + kfree(staccess); + return 0; +} + +static const struct iommufd_access_ops selftest_access_ops_pin = { + .needs_pin_pages = 1, + .unmap = iommufd_test_access_unmap, +}; + +static const struct iommufd_access_ops selftest_access_ops = { + .unmap = iommufd_test_access_unmap, +}; + +static const struct file_operations iommfd_test_staccess_fops = { + .release = iommufd_test_staccess_release, +}; + +static struct selftest_access *iommufd_test_alloc_access(void) +{ + struct selftest_access *staccess; + struct file *filep; + + staccess = kzalloc(sizeof(*staccess), GFP_KERNEL_ACCOUNT); + if (!staccess) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&staccess->items); + mutex_init(&staccess->lock); + + filep = anon_inode_getfile("[iommufd_test_staccess]", + &iommfd_test_staccess_fops, staccess, + O_RDWR); + if (IS_ERR(filep)) { + kfree(staccess); + return ERR_CAST(filep); + } + staccess->file = filep; + return staccess; +} + +static int iommufd_test_create_access(struct iommufd_ucmd *ucmd, + unsigned int ioas_id, unsigned int flags) +{ + struct iommu_test_cmd *cmd = ucmd->cmd; + struct selftest_access *staccess; + struct iommufd_access *access; + u32 id; + int fdno; + int rc; + + if (flags & ~MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES) + return -EOPNOTSUPP; + + staccess = iommufd_test_alloc_access(); + if (IS_ERR(staccess)) + return PTR_ERR(staccess); + + fdno = get_unused_fd_flags(O_CLOEXEC); + if (fdno < 0) { + rc = -ENOMEM; + 
goto out_free_staccess; + } + + access = iommufd_access_create( + ucmd->ictx, + (flags & MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES) ? + &selftest_access_ops_pin : + &selftest_access_ops, + staccess, &id); + if (IS_ERR(access)) { + rc = PTR_ERR(access); + goto out_put_fdno; + } + rc = iommufd_access_attach(access, ioas_id); + if (rc) + goto out_destroy; + cmd->create_access.out_access_fd = fdno; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_destroy; + + staccess->access = access; + fd_install(fdno, staccess->file); + return 0; + +out_destroy: + iommufd_access_destroy(access); +out_put_fdno: + put_unused_fd(fdno); +out_free_staccess: + fput(staccess->file); + return rc; +} + +static int iommufd_test_access_replace_ioas(struct iommufd_ucmd *ucmd, + unsigned int access_id, + unsigned int ioas_id) +{ + struct selftest_access *staccess; + int rc; + + staccess = iommufd_access_get(access_id); + if (IS_ERR(staccess)) + return PTR_ERR(staccess); + + rc = iommufd_access_replace(staccess->access, ioas_id); + fput(staccess->file); + return rc; +} + +/* Check that the pages in a page array match the pages in the user VA */ +static int iommufd_test_check_pages(void __user *uptr, struct page **pages, + size_t npages) +{ + for (; npages; npages--) { + struct page *tmp_pages[1]; + long rc; + + rc = get_user_pages_fast((uintptr_t)uptr, 1, 0, tmp_pages); + if (rc < 0) + return rc; + if (WARN_ON(rc != 1)) + return -EFAULT; + put_page(tmp_pages[0]); + if (tmp_pages[0] != *pages) + return -EBADE; + pages++; + uptr += PAGE_SIZE; + } + return 0; +} + +static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd, + unsigned int access_id, unsigned long iova, + size_t length, void __user *uptr, + u32 flags) +{ + struct iommu_test_cmd *cmd = ucmd->cmd; + struct selftest_access_item *item; + struct selftest_access *staccess; + struct page **pages; + size_t npages; + int rc; + + /* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */ + if (length > 16*1024*1024) + return -ENOMEM; + + if (flags & ~(MOCK_FLAGS_ACCESS_WRITE | MOCK_FLAGS_ACCESS_SYZ)) + return -EOPNOTSUPP; + + staccess = iommufd_access_get(access_id); + if (IS_ERR(staccess)) + return PTR_ERR(staccess); + + if (staccess->access->ops != &selftest_access_ops_pin) { + rc = -EOPNOTSUPP; + goto out_put; + } + + if (flags & MOCK_FLAGS_ACCESS_SYZ) + iova = iommufd_test_syz_conv_iova(&staccess->access->ioas->iopt, + &cmd->access_pages.iova); + + npages = (ALIGN(iova + length, PAGE_SIZE) - + ALIGN_DOWN(iova, PAGE_SIZE)) / + PAGE_SIZE; + pages = kvcalloc(npages, sizeof(*pages), GFP_KERNEL_ACCOUNT); + if (!pages) { + rc = -ENOMEM; + goto out_put; + } + + /* + * Drivers will need to think very carefully about this locking. The + * core code can do multiple unmaps instantaneously after + * iommufd_access_pin_pages() and *all* the unmaps must not return until + * the range is unpinned. This simple implementation puts a global lock + * around the pin, which may not suit drivers that want this to be a + * performance path. drivers that get this wrong will trigger WARN_ON + * races and cause EDEADLOCK failures to userspace. 
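+ *
+ * (In this sketch the same staccess->lock is taken by the ->unmap callback,
+ * iommufd_test_access_unmap(), before it calls iommufd_access_unpin_pages(),
+ * so an unmap racing with this pin simply serializes on the one mutex.)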
+ */ + mutex_lock(&staccess->lock); + rc = iommufd_access_pin_pages(staccess->access, iova, length, pages, + flags & MOCK_FLAGS_ACCESS_WRITE); + if (rc) + goto out_unlock; + + /* For syzkaller allow uptr to be NULL to skip this check */ + if (uptr) { + rc = iommufd_test_check_pages( + uptr - (iova - ALIGN_DOWN(iova, PAGE_SIZE)), pages, + npages); + if (rc) + goto out_unaccess; + } + + item = kzalloc(sizeof(*item), GFP_KERNEL_ACCOUNT); + if (!item) { + rc = -ENOMEM; + goto out_unaccess; + } + + item->iova = iova; + item->length = length; + item->id = staccess->next_id++; + list_add_tail(&item->items_elm, &staccess->items); + + cmd->access_pages.out_access_pages_id = item->id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_free_item; + goto out_unlock; + +out_free_item: + list_del(&item->items_elm); + kfree(item); +out_unaccess: + iommufd_access_unpin_pages(staccess->access, iova, length); +out_unlock: + mutex_unlock(&staccess->lock); + kvfree(pages); +out_put: + fput(staccess->file); + return rc; +} + +static int iommufd_test_access_rw(struct iommufd_ucmd *ucmd, + unsigned int access_id, unsigned long iova, + size_t length, void __user *ubuf, + unsigned int flags) +{ + struct iommu_test_cmd *cmd = ucmd->cmd; + struct selftest_access *staccess; + void *tmp; + int rc; + + /* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */ + if (length > 16*1024*1024) + return -ENOMEM; + + if (flags & ~(MOCK_ACCESS_RW_WRITE | MOCK_ACCESS_RW_SLOW_PATH | + MOCK_FLAGS_ACCESS_SYZ)) + return -EOPNOTSUPP; + + staccess = iommufd_access_get(access_id); + if (IS_ERR(staccess)) + return PTR_ERR(staccess); + + tmp = kvzalloc(length, GFP_KERNEL_ACCOUNT); + if (!tmp) { + rc = -ENOMEM; + goto out_put; + } + + if (flags & MOCK_ACCESS_RW_WRITE) { + if (copy_from_user(tmp, ubuf, length)) { + rc = -EFAULT; + goto out_free; + } + } + + if (flags & MOCK_FLAGS_ACCESS_SYZ) + iova = iommufd_test_syz_conv_iova(&staccess->access->ioas->iopt, + &cmd->access_rw.iova); + + rc = iommufd_access_rw(staccess->access, iova, tmp, length, flags); + if (rc) + goto out_free; + if (!(flags & MOCK_ACCESS_RW_WRITE)) { + if (copy_to_user(ubuf, tmp, length)) { + rc = -EFAULT; + goto out_free; + } + } + +out_free: + kvfree(tmp); +out_put: + fput(staccess->file); + return rc; +} +static_assert((unsigned int)MOCK_ACCESS_RW_WRITE == IOMMUFD_ACCESS_RW_WRITE); +static_assert((unsigned int)MOCK_ACCESS_RW_SLOW_PATH == + __IOMMUFD_ACCESS_RW_SLOW_PATH); + +void iommufd_selftest_destroy(struct iommufd_object *obj) +{ + struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj); + + switch (sobj->type) { + case TYPE_IDEV: + iommufd_device_detach(sobj->idev.idev); + iommufd_device_unbind(sobj->idev.idev); + mock_dev_destroy(sobj->idev.mock_dev); + break; + } +} + +int iommufd_test(struct iommufd_ucmd *ucmd) +{ + struct iommu_test_cmd *cmd = ucmd->cmd; + + switch (cmd->op) { + case IOMMU_TEST_OP_ADD_RESERVED: + return iommufd_test_add_reserved(ucmd, cmd->id, + cmd->add_reserved.start, + cmd->add_reserved.length); + case IOMMU_TEST_OP_MOCK_DOMAIN: + return iommufd_test_mock_domain(ucmd, cmd); + case IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE: + return iommufd_test_mock_domain_replace( + ucmd, cmd->id, cmd->mock_domain_replace.pt_id, cmd); + case IOMMU_TEST_OP_MD_CHECK_MAP: + return iommufd_test_md_check_pa( + ucmd, cmd->id, cmd->check_map.iova, + cmd->check_map.length, + u64_to_user_ptr(cmd->check_map.uptr)); + case IOMMU_TEST_OP_MD_CHECK_REFS: + return iommufd_test_md_check_refs( + ucmd, u64_to_user_ptr(cmd->check_refs.uptr), 
+ cmd->check_refs.length, cmd->check_refs.refs); + case IOMMU_TEST_OP_CREATE_ACCESS: + return iommufd_test_create_access(ucmd, cmd->id, + cmd->create_access.flags); + case IOMMU_TEST_OP_ACCESS_REPLACE_IOAS: + return iommufd_test_access_replace_ioas( + ucmd, cmd->id, cmd->access_replace_ioas.ioas_id); + case IOMMU_TEST_OP_ACCESS_PAGES: + return iommufd_test_access_pages( + ucmd, cmd->id, cmd->access_pages.iova, + cmd->access_pages.length, + u64_to_user_ptr(cmd->access_pages.uptr), + cmd->access_pages.flags); + case IOMMU_TEST_OP_ACCESS_RW: + return iommufd_test_access_rw( + ucmd, cmd->id, cmd->access_rw.iova, + cmd->access_rw.length, + u64_to_user_ptr(cmd->access_rw.uptr), + cmd->access_rw.flags); + case IOMMU_TEST_OP_DESTROY_ACCESS_PAGES: + return iommufd_test_access_item_destroy( + ucmd, cmd->id, cmd->destroy_access_pages.access_pages_id); + case IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT: + /* Protect _batch_init(), can not be less than elmsz */ + if (cmd->memory_limit.limit < + sizeof(unsigned long) + sizeof(u32)) + return -EINVAL; + iommufd_test_memory_limit = cmd->memory_limit.limit; + return 0; + default: + return -EOPNOTSUPP; + } +} + +bool iommufd_should_fail(void) +{ + return should_fail(&fail_iommufd, 1); +} + +int __init iommufd_test_init(void) +{ + struct platform_device_info pdevinfo = { + .name = "iommufd_selftest_iommu", + }; + int rc; + + dbgfs_root = + fault_create_debugfs_attr("fail_iommufd", NULL, &fail_iommufd); + + selftest_iommu_dev = platform_device_register_full(&pdevinfo); + if (IS_ERR(selftest_iommu_dev)) { + rc = PTR_ERR(selftest_iommu_dev); + goto err_dbgfs; + } + + rc = bus_register(&iommufd_mock_bus_type.bus); + if (rc) + goto err_platform; + + rc = iommu_device_sysfs_add(&mock_iommu_device, + &selftest_iommu_dev->dev, NULL, "%s", + dev_name(&selftest_iommu_dev->dev)); + if (rc) + goto err_bus; + + rc = iommu_device_register_bus(&mock_iommu_device, &mock_ops, + &iommufd_mock_bus_type.bus, + &iommufd_mock_bus_type.nb); + if (rc) + goto err_sysfs; + return 0; + +err_sysfs: + iommu_device_sysfs_remove(&mock_iommu_device); +err_bus: + bus_unregister(&iommufd_mock_bus_type.bus); +err_platform: + platform_device_unregister(selftest_iommu_dev); +err_dbgfs: + debugfs_remove_recursive(dbgfs_root); + return rc; +} + +void iommufd_test_exit(void) +{ + iommu_device_sysfs_remove(&mock_iommu_device); + iommu_device_unregister_bus(&mock_iommu_device, + &iommufd_mock_bus_type.bus, + &iommufd_mock_bus_type.nb); + bus_unregister(&iommufd_mock_bus_type.bus); + platform_device_unregister(selftest_iommu_dev); + debugfs_remove_recursive(dbgfs_root); +} diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c new file mode 100644 index 0000000000..6c810bf80f --- /dev/null +++ b/drivers/iommu/iommufd/vfio_compat.c @@ -0,0 +1,541 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include <linux/file.h> +#include <linux/interval_tree.h> +#include <linux/iommu.h> +#include <linux/iommufd.h> +#include <linux/slab.h> +#include <linux/vfio.h> +#include <uapi/linux/vfio.h> +#include <uapi/linux/iommufd.h> + +#include "iommufd_private.h" + +static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx) +{ + struct iommufd_ioas *ioas = ERR_PTR(-ENODEV); + + xa_lock(&ictx->objects); + if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj)) + goto out_unlock; + ioas = ictx->vfio_ioas; +out_unlock: + xa_unlock(&ictx->objects); + return ioas; +} + +/** + * iommufd_vfio_compat_ioas_get_id - Ensure 
a compat IOAS exists + * @ictx: Context to operate on + * @out_ioas_id: The IOAS ID of the compatibility IOAS + * + * Return the ID of the current compatibility IOAS. The ID can be passed into + * other functions that take an ioas_id. + */ +int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id) +{ + struct iommufd_ioas *ioas; + + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + *out_ioas_id = ioas->obj.id; + iommufd_put_object(&ioas->obj); + return 0; +} +EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, IOMMUFD_VFIO); + +/** + * iommufd_vfio_compat_set_no_iommu - Called when a no-iommu device is attached + * @ictx: Context to operate on + * + * This allows selecting the VFIO_NOIOMMU_IOMMU and blocks normal types. + */ +int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx) +{ + int ret; + + xa_lock(&ictx->objects); + if (!ictx->vfio_ioas) { + ictx->no_iommu_mode = 1; + ret = 0; + } else { + ret = -EINVAL; + } + xa_unlock(&ictx->objects); + return ret; +} +EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu, IOMMUFD_VFIO); + +/** + * iommufd_vfio_compat_ioas_create - Ensure the compat IOAS is created + * @ictx: Context to operate on + * + * The compatibility IOAS is the IOAS that the vfio compatibility ioctls operate + * on since they do not have an IOAS ID input in their ABI. Only attaching a + * group should cause a default creation of the internal ioas, this does nothing + * if an existing ioas has already been assigned somehow. + */ +int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx) +{ + struct iommufd_ioas *ioas = NULL; + int ret; + + ioas = iommufd_ioas_alloc(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + xa_lock(&ictx->objects); + /* + * VFIO won't allow attaching a container to both iommu and no iommu + * operation + */ + if (ictx->no_iommu_mode) { + ret = -EINVAL; + goto out_abort; + } + + if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) { + ret = 0; + iommufd_put_object(&ictx->vfio_ioas->obj); + goto out_abort; + } + ictx->vfio_ioas = ioas; + xa_unlock(&ictx->objects); + + /* + * An automatically created compat IOAS is treated as a userspace + * created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET, + * and if not manually destroyed it will be destroyed automatically + * at iommufd release. 
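+ *
+ * (Illustrative userspace flow: issuing ioctl(iommufd, IOMMU_VFIO_IOAS) with
+ * op == IOMMU_VFIO_IOAS_GET returns this IOAS's ID in ioas_id, which can
+ * then be passed to the native iommufd IOAS ioctls.)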
+ */ + iommufd_object_finalize(ictx, &ioas->obj); + return 0; + +out_abort: + xa_unlock(&ictx->objects); + iommufd_object_abort(ictx, &ioas->obj); + return ret; +} +EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create, IOMMUFD_VFIO); + +int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd) +{ + struct iommu_vfio_ioas *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + + if (cmd->__reserved) + return -EOPNOTSUPP; + switch (cmd->op) { + case IOMMU_VFIO_IOAS_GET: + ioas = get_compat_ioas(ucmd->ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + cmd->ioas_id = ioas->obj.id; + iommufd_put_object(&ioas->obj); + return iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + + case IOMMU_VFIO_IOAS_SET: + ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + xa_lock(&ucmd->ictx->objects); + ucmd->ictx->vfio_ioas = ioas; + xa_unlock(&ucmd->ictx->objects); + iommufd_put_object(&ioas->obj); + return 0; + + case IOMMU_VFIO_IOAS_CLEAR: + xa_lock(&ucmd->ictx->objects); + ucmd->ictx->vfio_ioas = NULL; + xa_unlock(&ucmd->ictx->objects); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd, + void __user *arg) +{ + u32 supported_flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); + struct vfio_iommu_type1_dma_map map; + int iommu_prot = IOMMU_CACHE; + struct iommufd_ioas *ioas; + unsigned long iova; + int rc; + + if (copy_from_user(&map, arg, minsz)) + return -EFAULT; + + if (map.argsz < minsz || map.flags & ~supported_flags) + return -EINVAL; + + if (map.flags & VFIO_DMA_MAP_FLAG_READ) + iommu_prot |= IOMMU_READ; + if (map.flags & VFIO_DMA_MAP_FLAG_WRITE) + iommu_prot |= IOMMU_WRITE; + + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + /* + * Maps created through the legacy interface always use VFIO compatible + * rlimit accounting. If the user wishes to use the faster user based + * rlimit accounting then they must use the new interface. + */ + iova = map.iova; + rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, u64_to_user_ptr(map.vaddr), + map.size, iommu_prot, 0); + iommufd_put_object(&ioas->obj); + return rc; +} + +static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd, + void __user *arg) +{ + size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); + /* + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new + * dirty tracking direction: + * https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/ + * https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/ + */ + u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL; + struct vfio_iommu_type1_dma_unmap unmap; + unsigned long unmapped = 0; + struct iommufd_ioas *ioas; + int rc; + + if (copy_from_user(&unmap, arg, minsz)) + return -EFAULT; + + if (unmap.argsz < minsz || unmap.flags & ~supported_flags) + return -EINVAL; + + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) { + if (unmap.iova != 0 || unmap.size != 0) { + rc = -EINVAL; + goto err_put; + } + rc = iopt_unmap_all(&ioas->iopt, &unmapped); + } else { + if (READ_ONCE(ioas->iopt.disable_large_pages)) { + /* + * Create cuts at the start and last of the requested + * range. If the start IOVA is 0 then it doesn't need to + * be cut. 
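+ *
+ * For example, unmapping iova 0x3000 with size 0x2000 requests cuts at
+ * 0x2fff and 0x4fff, splitting any larger IOPTE that straddles either
+ * boundary; when iova is 0 only the single end cut is requested.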
+ */ + unsigned long iovas[] = { unmap.iova + unmap.size - 1, + unmap.iova - 1 }; + + rc = iopt_cut_iova(&ioas->iopt, iovas, + unmap.iova ? 2 : 1); + if (rc) + goto err_put; + } + rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size, + &unmapped); + } + unmap.size = unmapped; + if (copy_to_user(arg, &unmap, minsz)) + rc = -EFAULT; + +err_put: + iommufd_put_object(&ioas->obj); + return rc; +} + +static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx) +{ + struct iommufd_hw_pagetable *hwpt; + struct iommufd_ioas *ioas; + int rc = 1; + + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + mutex_lock(&ioas->mutex); + list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) { + if (!hwpt->enforce_cache_coherency) { + rc = 0; + break; + } + } + mutex_unlock(&ioas->mutex); + + iommufd_put_object(&ioas->obj); + return rc; +} + +static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx, + unsigned long type) +{ + switch (type) { + case VFIO_TYPE1_IOMMU: + case VFIO_TYPE1v2_IOMMU: + case VFIO_UNMAP_ALL: + return 1; + + case VFIO_NOIOMMU_IOMMU: + return IS_ENABLED(CONFIG_VFIO_NOIOMMU); + + case VFIO_DMA_CC_IOMMU: + return iommufd_vfio_cc_iommu(ictx); + + /* + * This is obsolete, and to be removed from VFIO. It was an incomplete + * idea that got merged. + * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/ + */ + case VFIO_TYPE1_NESTING_IOMMU: + return 0; + + /* + * VFIO_DMA_MAP_FLAG_VADDR + * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/ + * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/ + * + * It is hard to see how this could be implemented safely. + */ + case VFIO_UPDATE_VADDR: + default: + return 0; + } +} + +static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type) +{ + bool no_iommu_mode = READ_ONCE(ictx->no_iommu_mode); + struct iommufd_ioas *ioas = NULL; + int rc = 0; + + /* + * Emulation for NOIOMMU is imperfect in that VFIO blocks almost all + * other ioctls. We let them keep working but they mostly fail since no + * IOAS should exist. + */ + if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) && type == VFIO_NOIOMMU_IOMMU && + no_iommu_mode) { + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + return 0; + } + + if ((type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU) || + no_iommu_mode) + return -EINVAL; + + /* VFIO fails the set_iommu if there is no group */ + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + /* + * The difference between TYPE1 and TYPE1v2 is the ability to unmap in + * the middle of mapped ranges. This is complicated by huge page support + * which creates single large IOPTEs that cannot be split by the iommu + * driver. TYPE1 is very old at this point and likely nothing uses it, + * however it is simple enough to emulate by simply disabling the + * problematic large IOPTEs. Then we can safely unmap within any range. 
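+ *
+ * (E.g. selecting VFIO_TYPE1_IOMMU calls iopt_disable_large_pages() below,
+ * trading huge IOPTEs for the ability to unmap arbitrary sub-ranges, while
+ * VFIO_TYPE1v2_IOMMU leaves large pages enabled.)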
+ */ + if (type == VFIO_TYPE1_IOMMU) + rc = iopt_disable_large_pages(&ioas->iopt); + iommufd_put_object(&ioas->obj); + return rc; +} + +static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas) +{ + struct io_pagetable *iopt = &ioas->iopt; + unsigned long pgsize_bitmap = ULONG_MAX; + struct iommu_domain *domain; + unsigned long index; + + down_read(&iopt->domains_rwsem); + xa_for_each(&iopt->domains, index, domain) + pgsize_bitmap &= domain->pgsize_bitmap; + + /* See vfio_update_pgsize_bitmap() */ + if (pgsize_bitmap & ~PAGE_MASK) { + pgsize_bitmap &= PAGE_MASK; + pgsize_bitmap |= PAGE_SIZE; + } + pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment); + up_read(&iopt->domains_rwsem); + return pgsize_bitmap; +} + +static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas, + struct vfio_info_cap_header __user *cur, + size_t avail) +{ + struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas = + container_of(cur, + struct vfio_iommu_type1_info_cap_iova_range __user, + header); + struct vfio_iommu_type1_info_cap_iova_range cap_iovas = { + .header = { + .id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, + .version = 1, + }, + }; + struct interval_tree_span_iter span; + + interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0, + ULONG_MAX) { + struct vfio_iova_range range; + + if (!span.is_hole) + continue; + range.start = span.start_hole; + range.end = span.last_hole; + if (avail >= struct_size(&cap_iovas, iova_ranges, + cap_iovas.nr_iovas + 1) && + copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas], + &range, sizeof(range))) + return -EFAULT; + cap_iovas.nr_iovas++; + } + if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) && + copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas))) + return -EFAULT; + return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas); +} + +static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas, + struct vfio_info_cap_header __user *cur, + size_t avail) +{ + struct vfio_iommu_type1_info_dma_avail cap_dma = { + .header = { + .id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL, + .version = 1, + }, + /* + * iommufd's limit is based on the cgroup's memory limit. + * Normally vfio would return U16_MAX here, and provide a module + * parameter to adjust it. Since S390 qemu userspace actually + * pays attention and needs a value bigger than U16_MAX return + * U32_MAX. 
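+ *
+ * (Userspace reaches this through VFIO_IOMMU_GET_INFO: when argsz is large
+ * enough, info.cap_offset points at the first vfio_info_cap_header and each
+ * header.next chains to the following capability, as assembled by
+ * iommufd_vfio_iommu_get_info() below.)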
+ */ + .avail = U32_MAX, + }; + + if (avail >= sizeof(cap_dma) && + copy_to_user(cur, &cap_dma, sizeof(cap_dma))) + return -EFAULT; + return sizeof(cap_dma); +} + +static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx, + void __user *arg) +{ + typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas, + struct vfio_info_cap_header __user *cur, + size_t avail); + static const fill_cap_fn fill_fns[] = { + iommufd_fill_cap_dma_avail, + iommufd_fill_cap_iova, + }; + size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); + struct vfio_info_cap_header __user *last_cap = NULL; + struct vfio_iommu_type1_info info = {}; + struct iommufd_ioas *ioas; + size_t total_cap_size; + int rc; + int i; + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + minsz = min_t(size_t, info.argsz, sizeof(info)); + + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + info.flags = VFIO_IOMMU_INFO_PGSIZES; + info.iova_pgsizes = iommufd_get_pagesizes(ioas); + info.cap_offset = 0; + + down_read(&ioas->iopt.iova_rwsem); + total_cap_size = sizeof(info); + for (i = 0; i != ARRAY_SIZE(fill_fns); i++) { + int cap_size; + + if (info.argsz > total_cap_size) + cap_size = fill_fns[i](ioas, arg + total_cap_size, + info.argsz - total_cap_size); + else + cap_size = fill_fns[i](ioas, NULL, 0); + if (cap_size < 0) { + rc = cap_size; + goto out_put; + } + cap_size = ALIGN(cap_size, sizeof(u64)); + + if (last_cap && info.argsz >= total_cap_size && + put_user(total_cap_size, &last_cap->next)) { + rc = -EFAULT; + goto out_put; + } + last_cap = arg + total_cap_size; + total_cap_size += cap_size; + } + + /* + * If the user did not provide enough space then only some caps are + * returned and the argsz will be updated to the correct amount to get + * all caps. + */ + if (info.argsz >= total_cap_size) + info.cap_offset = sizeof(info); + info.argsz = total_cap_size; + info.flags |= VFIO_IOMMU_INFO_CAPS; + if (copy_to_user(arg, &info, minsz)) { + rc = -EFAULT; + goto out_put; + } + rc = 0; + +out_put: + up_read(&ioas->iopt.iova_rwsem); + iommufd_put_object(&ioas->obj); + return rc; +} + +int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd, + unsigned long arg) +{ + void __user *uarg = (void __user *)arg; + + switch (cmd) { + case VFIO_GET_API_VERSION: + return VFIO_API_VERSION; + case VFIO_SET_IOMMU: + return iommufd_vfio_set_iommu(ictx, arg); + case VFIO_CHECK_EXTENSION: + return iommufd_vfio_check_extension(ictx, arg); + case VFIO_IOMMU_GET_INFO: + return iommufd_vfio_iommu_get_info(ictx, uarg); + case VFIO_IOMMU_MAP_DMA: + return iommufd_vfio_map_dma(ictx, cmd, uarg); + case VFIO_IOMMU_UNMAP_DMA: + return iommufd_vfio_unmap_dma(ictx, cmd, uarg); + case VFIO_IOMMU_DIRTY_PAGES: + default: + return -ENOIOCTLCMD; + } + return -ENOIOCTLCMD; +} |
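For context, here is a minimal userspace sketch (editorial illustration, not part of the commit) of how an unmodified VFIO Type1 application exercises the compatibility ioctls implemented above. The helper name vfio_compat_map is hypothetical, the container fd is assumed to come from opening /dev/vfio/vfio (which IOMMUFD_VFIO_CONTAINER now provides), a VFIO group is assumed to already be attached, and error handling is omitted.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Hypothetical helper: map one user buffer through the compat container. */
static int vfio_compat_map(int container, void *vaddr, uint64_t iova,
			   uint64_t size)
{
	struct vfio_iommu_type1_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr = (uintptr_t)vaddr,
		.iova = iova,
		.size = size,
	};

	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
		return -1;
	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU))
		return -1;
	/* Requires a group to already be attached to the container. */
	if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU))
		return -1;
	return ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
}

Each of these commands is dispatched by iommufd_vfio_ioctl() above: VFIO_SET_IOMMU lands in iommufd_vfio_set_iommu() and VFIO_IOMMU_MAP_DMA in iommufd_vfio_map_dma(), operating on the compat IOAS.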