diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000 |
commit | ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch) | |
tree | b2d64bc10158fdd5497876388cd68142ca374ed3 /drivers/dma/idxd | |
parent | Initial commit. (diff) | |
download | linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip |
Adding upstream version 6.6.15.upstream/6.6.15
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/dma/idxd')
-rw-r--r-- | drivers/dma/idxd/Makefile | 12 | ||||
-rw-r--r-- | drivers/dma/idxd/bus.c | 91 | ||||
-rw-r--r-- | drivers/dma/idxd/cdev.c | 692 | ||||
-rw-r--r-- | drivers/dma/idxd/compat.c | 107 | ||||
-rw-r--r-- | drivers/dma/idxd/debugfs.c | 138 | ||||
-rw-r--r-- | drivers/dma/idxd/device.c | 1607 | ||||
-rw-r--r-- | drivers/dma/idxd/dma.c | 359 | ||||
-rw-r--r-- | drivers/dma/idxd/idxd.h | 752 | ||||
-rw-r--r-- | drivers/dma/idxd/init.c | 913 | ||||
-rw-r--r-- | drivers/dma/idxd/irq.c | 655 | ||||
-rw-r--r-- | drivers/dma/idxd/perfmon.c | 661 | ||||
-rw-r--r-- | drivers/dma/idxd/perfmon.h | 119 | ||||
-rw-r--r-- | drivers/dma/idxd/registers.h | 632 | ||||
-rw-r--r-- | drivers/dma/idxd/submit.c | 217 | ||||
-rw-r--r-- | drivers/dma/idxd/sysfs.c | 1934 |
15 files changed, 8889 insertions, 0 deletions
diff --git a/drivers/dma/idxd/Makefile b/drivers/dma/idxd/Makefile new file mode 100644 index 0000000000..c5e679070e --- /dev/null +++ b/drivers/dma/idxd/Makefile @@ -0,0 +1,12 @@ +ccflags-y += -DDEFAULT_SYMBOL_NAMESPACE=IDXD + +obj-$(CONFIG_INTEL_IDXD_BUS) += idxd_bus.o +idxd_bus-y := bus.o + +obj-$(CONFIG_INTEL_IDXD) += idxd.o +idxd-y := init.o irq.o device.o sysfs.o submit.o dma.o cdev.o debugfs.o + +idxd-$(CONFIG_INTEL_IDXD_PERFMON) += perfmon.o + +obj-$(CONFIG_INTEL_IDXD_COMPAT) += idxd_compat.o +idxd_compat-y := compat.o diff --git a/drivers/dma/idxd/bus.c b/drivers/dma/idxd/bus.c new file mode 100644 index 0000000000..6f84621053 --- /dev/null +++ b/drivers/dma/idxd/bus.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2021 Intel Corporation. All rights rsvd. */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/device.h> +#include "idxd.h" + + +int __idxd_driver_register(struct idxd_device_driver *idxd_drv, struct module *owner, + const char *mod_name) +{ + struct device_driver *drv = &idxd_drv->drv; + + if (!idxd_drv->type) { + pr_debug("driver type not set (%ps)\n", __builtin_return_address(0)); + return -EINVAL; + } + + drv->name = idxd_drv->name; + drv->bus = &dsa_bus_type; + drv->owner = owner; + drv->mod_name = mod_name; + + return driver_register(drv); +} +EXPORT_SYMBOL_GPL(__idxd_driver_register); + +void idxd_driver_unregister(struct idxd_device_driver *idxd_drv) +{ + driver_unregister(&idxd_drv->drv); +} +EXPORT_SYMBOL_GPL(idxd_driver_unregister); + +static int idxd_config_bus_match(struct device *dev, + struct device_driver *drv) +{ + struct idxd_device_driver *idxd_drv = + container_of(drv, struct idxd_device_driver, drv); + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + int i = 0; + + while (idxd_drv->type[i] != IDXD_DEV_NONE) { + if (idxd_dev->type == idxd_drv->type[i]) + return 1; + i++; + } + + return 0; +} + +static int idxd_config_bus_probe(struct device *dev) +{ + struct idxd_device_driver *idxd_drv = + container_of(dev->driver, struct idxd_device_driver, drv); + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return idxd_drv->probe(idxd_dev); +} + +static void idxd_config_bus_remove(struct device *dev) +{ + struct idxd_device_driver *idxd_drv = + container_of(dev->driver, struct idxd_device_driver, drv); + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + idxd_drv->remove(idxd_dev); +} + +struct bus_type dsa_bus_type = { + .name = "dsa", + .match = idxd_config_bus_match, + .probe = idxd_config_bus_probe, + .remove = idxd_config_bus_remove, +}; +EXPORT_SYMBOL_GPL(dsa_bus_type); + +static int __init dsa_bus_init(void) +{ + return bus_register(&dsa_bus_type); +} +module_init(dsa_bus_init); + +static void __exit dsa_bus_exit(void) +{ + bus_unregister(&dsa_bus_type); +} +module_exit(dsa_bus_exit); + +MODULE_DESCRIPTION("IDXD driver dsa_bus_type driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c new file mode 100644 index 0000000000..d32deb9b4e --- /dev/null +++ b/drivers/dma/idxd/cdev.c @@ -0,0 +1,692 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/device.h> +#include <linux/sched/task.h> +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/cdev.h> +#include <linux/fs.h> +#include <linux/poll.h> +#include <linux/iommu.h> +#include <linux/highmem.h> +#include <uapi/linux/idxd.h> +#include <linux/xarray.h> +#include "registers.h" +#include "idxd.h" + +struct idxd_cdev_context { + const char *name; + dev_t devt; + struct ida minor_ida; +}; + +/* + * Since user file names are global in DSA devices, define their ida's as + * global to avoid conflict file names. + */ +static DEFINE_IDA(file_ida); +static DEFINE_MUTEX(ida_lock); + +/* + * ictx is an array based off of accelerator types. enum idxd_type + * is used as index + */ +static struct idxd_cdev_context ictx[IDXD_TYPE_MAX] = { + { .name = "dsa" }, + { .name = "iax" } +}; + +struct idxd_user_context { + struct idxd_wq *wq; + struct task_struct *task; + unsigned int pasid; + struct mm_struct *mm; + unsigned int flags; + struct iommu_sva *sva; + struct idxd_dev idxd_dev; + u64 counters[COUNTER_MAX]; + int id; + pid_t pid; +}; + +static void idxd_cdev_evl_drain_pasid(struct idxd_wq *wq, u32 pasid); +static void idxd_xa_pasid_remove(struct idxd_user_context *ctx); + +static inline struct idxd_user_context *dev_to_uctx(struct device *dev) +{ + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return container_of(idxd_dev, struct idxd_user_context, idxd_dev); +} + +static ssize_t cr_faults_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct idxd_user_context *ctx = dev_to_uctx(dev); + + return sysfs_emit(buf, "%llu\n", ctx->counters[COUNTER_FAULTS]); +} +static DEVICE_ATTR_RO(cr_faults); + +static ssize_t cr_fault_failures_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_user_context *ctx = dev_to_uctx(dev); + + return sysfs_emit(buf, "%llu\n", ctx->counters[COUNTER_FAULT_FAILS]); +} +static DEVICE_ATTR_RO(cr_fault_failures); + +static ssize_t pid_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct idxd_user_context *ctx = dev_to_uctx(dev); + + return sysfs_emit(buf, "%u\n", ctx->pid); +} +static DEVICE_ATTR_RO(pid); + +static struct attribute *cdev_file_attributes[] = { + &dev_attr_cr_faults.attr, + &dev_attr_cr_fault_failures.attr, + &dev_attr_pid.attr, + NULL +}; + +static umode_t cdev_file_attr_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + struct idxd_user_context *ctx = dev_to_uctx(dev); + struct idxd_wq *wq = ctx->wq; + + if (!wq_pasid_enabled(wq)) + return 0; + + return a->mode; +} + +static const struct attribute_group cdev_file_attribute_group = { + .attrs = cdev_file_attributes, + .is_visible = cdev_file_attr_visible, +}; + +static const struct attribute_group *cdev_file_attribute_groups[] = { + &cdev_file_attribute_group, + NULL +}; + +static void idxd_file_dev_release(struct device *dev) +{ + struct idxd_user_context *ctx = dev_to_uctx(dev); + struct idxd_wq *wq = ctx->wq; + struct idxd_device *idxd = wq->idxd; + int rc; + + mutex_lock(&ida_lock); + ida_free(&file_ida, ctx->id); + mutex_unlock(&ida_lock); + + /* Wait for in-flight operations to complete. */ + if (wq_shared(wq)) { + idxd_device_drain_pasid(idxd, ctx->pasid); + } else { + if (device_user_pasid_enabled(idxd)) { + /* The wq disable in the disable pasid function will drain the wq */ + rc = idxd_wq_disable_pasid(wq); + if (rc < 0) + dev_err(dev, "wq disable pasid failed.\n"); + } else { + idxd_wq_drain(wq); + } + } + + if (ctx->sva) { + idxd_cdev_evl_drain_pasid(wq, ctx->pasid); + iommu_sva_unbind_device(ctx->sva); + idxd_xa_pasid_remove(ctx); + } + kfree(ctx); + mutex_lock(&wq->wq_lock); + idxd_wq_put(wq); + mutex_unlock(&wq->wq_lock); +} + +static struct device_type idxd_cdev_file_type = { + .name = "idxd_file", + .release = idxd_file_dev_release, + .groups = cdev_file_attribute_groups, +}; + +static void idxd_cdev_dev_release(struct device *dev) +{ + struct idxd_cdev *idxd_cdev = dev_to_cdev(dev); + struct idxd_cdev_context *cdev_ctx; + struct idxd_wq *wq = idxd_cdev->wq; + + cdev_ctx = &ictx[wq->idxd->data->type]; + ida_simple_remove(&cdev_ctx->minor_ida, idxd_cdev->minor); + kfree(idxd_cdev); +} + +static struct device_type idxd_cdev_device_type = { + .name = "idxd_cdev", + .release = idxd_cdev_dev_release, +}; + +static inline struct idxd_cdev *inode_idxd_cdev(struct inode *inode) +{ + struct cdev *cdev = inode->i_cdev; + + return container_of(cdev, struct idxd_cdev, cdev); +} + +static inline struct idxd_wq *inode_wq(struct inode *inode) +{ + struct idxd_cdev *idxd_cdev = inode_idxd_cdev(inode); + + return idxd_cdev->wq; +} + +static void idxd_xa_pasid_remove(struct idxd_user_context *ctx) +{ + struct idxd_wq *wq = ctx->wq; + void *ptr; + + mutex_lock(&wq->uc_lock); + ptr = xa_cmpxchg(&wq->upasid_xa, ctx->pasid, ctx, NULL, GFP_KERNEL); + if (ptr != (void *)ctx) + dev_warn(&wq->idxd->pdev->dev, "xarray cmpxchg failed for pasid %u\n", + ctx->pasid); + mutex_unlock(&wq->uc_lock); +} + +void idxd_user_counter_increment(struct idxd_wq *wq, u32 pasid, int index) +{ + struct idxd_user_context *ctx; + + if (index >= COUNTER_MAX) + return; + + mutex_lock(&wq->uc_lock); + ctx = xa_load(&wq->upasid_xa, pasid); + if (!ctx) { + mutex_unlock(&wq->uc_lock); + return; + } + ctx->counters[index]++; + mutex_unlock(&wq->uc_lock); +} + +static int idxd_cdev_open(struct inode *inode, struct file *filp) +{ + struct idxd_user_context *ctx; + struct idxd_device *idxd; + struct idxd_wq *wq; + struct device *dev, *fdev; + int rc = 0; + struct iommu_sva *sva; + unsigned int pasid; + struct idxd_cdev *idxd_cdev; + + wq = inode_wq(inode); + idxd = wq->idxd; + dev = &idxd->pdev->dev; + + dev_dbg(dev, "%s called: %d\n", __func__, idxd_wq_refcount(wq)); + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + mutex_lock(&wq->wq_lock); + + if (idxd_wq_refcount(wq) > 0 && wq_dedicated(wq)) { + rc = -EBUSY; + goto failed; + } + + ctx->wq = wq; + filp->private_data = ctx; + ctx->pid = current->pid; + + if (device_user_pasid_enabled(idxd)) { + sva = iommu_sva_bind_device(dev, current->mm); + if (IS_ERR(sva)) { + rc = PTR_ERR(sva); + dev_err(dev, "pasid allocation failed: %d\n", rc); + goto failed; + } + + pasid = iommu_sva_get_pasid(sva); + if (pasid == IOMMU_PASID_INVALID) { + rc = -EINVAL; + goto failed_get_pasid; + } + + ctx->sva = sva; + ctx->pasid = pasid; + ctx->mm = current->mm; + + mutex_lock(&wq->uc_lock); + rc = xa_insert(&wq->upasid_xa, pasid, ctx, GFP_KERNEL); + mutex_unlock(&wq->uc_lock); + if (rc < 0) + dev_warn(dev, "PASID entry already exist in xarray.\n"); + + if (wq_dedicated(wq)) { + rc = idxd_wq_set_pasid(wq, pasid); + if (rc < 0) { + dev_err(dev, "wq set pasid failed: %d\n", rc); + goto failed_set_pasid; + } + } + } + + idxd_cdev = wq->idxd_cdev; + mutex_lock(&ida_lock); + ctx->id = ida_alloc(&file_ida, GFP_KERNEL); + mutex_unlock(&ida_lock); + if (ctx->id < 0) { + dev_warn(dev, "ida alloc failure\n"); + goto failed_ida; + } + ctx->idxd_dev.type = IDXD_DEV_CDEV_FILE; + fdev = user_ctx_dev(ctx); + device_initialize(fdev); + fdev->parent = cdev_dev(idxd_cdev); + fdev->bus = &dsa_bus_type; + fdev->type = &idxd_cdev_file_type; + + rc = dev_set_name(fdev, "file%d", ctx->id); + if (rc < 0) { + dev_warn(dev, "set name failure\n"); + goto failed_dev_name; + } + + rc = device_add(fdev); + if (rc < 0) { + dev_warn(dev, "file device add failure\n"); + goto failed_dev_add; + } + + idxd_wq_get(wq); + mutex_unlock(&wq->wq_lock); + return 0; + +failed_dev_add: +failed_dev_name: + put_device(fdev); +failed_ida: +failed_set_pasid: + if (device_user_pasid_enabled(idxd)) + idxd_xa_pasid_remove(ctx); +failed_get_pasid: + if (device_user_pasid_enabled(idxd)) + iommu_sva_unbind_device(sva); +failed: + mutex_unlock(&wq->wq_lock); + kfree(ctx); + return rc; +} + +static void idxd_cdev_evl_drain_pasid(struct idxd_wq *wq, u32 pasid) +{ + struct idxd_device *idxd = wq->idxd; + struct idxd_evl *evl = idxd->evl; + union evl_status_reg status; + u16 h, t, size; + int ent_size = evl_ent_size(idxd); + struct __evl_entry *entry_head; + + if (!evl) + return; + + spin_lock(&evl->lock); + status.bits = ioread64(idxd->reg_base + IDXD_EVLSTATUS_OFFSET); + t = status.tail; + h = evl->head; + size = evl->size; + + while (h != t) { + entry_head = (struct __evl_entry *)(evl->log + (h * ent_size)); + if (entry_head->pasid == pasid && entry_head->wq_idx == wq->id) + set_bit(h, evl->bmap); + h = (h + 1) % size; + } + spin_unlock(&evl->lock); + + drain_workqueue(wq->wq); +} + +static int idxd_cdev_release(struct inode *node, struct file *filep) +{ + struct idxd_user_context *ctx = filep->private_data; + struct idxd_wq *wq = ctx->wq; + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + + dev_dbg(dev, "%s called\n", __func__); + filep->private_data = NULL; + + device_unregister(user_ctx_dev(ctx)); + + return 0; +} + +static int check_vma(struct idxd_wq *wq, struct vm_area_struct *vma, + const char *func) +{ + struct device *dev = &wq->idxd->pdev->dev; + + if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) { + dev_info_ratelimited(dev, + "%s: %s: mapping too large: %lu\n", + current->comm, func, + vma->vm_end - vma->vm_start); + return -EINVAL; + } + + return 0; +} + +static int idxd_cdev_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct idxd_user_context *ctx = filp->private_data; + struct idxd_wq *wq = ctx->wq; + struct idxd_device *idxd = wq->idxd; + struct pci_dev *pdev = idxd->pdev; + phys_addr_t base = pci_resource_start(pdev, IDXD_WQ_BAR); + unsigned long pfn; + int rc; + + dev_dbg(&pdev->dev, "%s called\n", __func__); + rc = check_vma(wq, vma, __func__); + if (rc < 0) + return rc; + + vm_flags_set(vma, VM_DONTCOPY); + pfn = (base + idxd_get_wq_portal_full_offset(wq->id, + IDXD_PORTAL_LIMITED)) >> PAGE_SHIFT; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_private_data = ctx; + + return io_remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, + vma->vm_page_prot); +} + +static __poll_t idxd_cdev_poll(struct file *filp, + struct poll_table_struct *wait) +{ + struct idxd_user_context *ctx = filp->private_data; + struct idxd_wq *wq = ctx->wq; + struct idxd_device *idxd = wq->idxd; + __poll_t out = 0; + + poll_wait(filp, &wq->err_queue, wait); + spin_lock(&idxd->dev_lock); + if (idxd->sw_err.valid) + out = EPOLLIN | EPOLLRDNORM; + spin_unlock(&idxd->dev_lock); + + return out; +} + +static const struct file_operations idxd_cdev_fops = { + .owner = THIS_MODULE, + .open = idxd_cdev_open, + .release = idxd_cdev_release, + .mmap = idxd_cdev_mmap, + .poll = idxd_cdev_poll, +}; + +int idxd_cdev_get_major(struct idxd_device *idxd) +{ + return MAJOR(ictx[idxd->data->type].devt); +} + +int idxd_wq_add_cdev(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct idxd_cdev *idxd_cdev; + struct cdev *cdev; + struct device *dev; + struct idxd_cdev_context *cdev_ctx; + int rc, minor; + + idxd_cdev = kzalloc(sizeof(*idxd_cdev), GFP_KERNEL); + if (!idxd_cdev) + return -ENOMEM; + + idxd_cdev->idxd_dev.type = IDXD_DEV_CDEV; + idxd_cdev->wq = wq; + cdev = &idxd_cdev->cdev; + dev = cdev_dev(idxd_cdev); + cdev_ctx = &ictx[wq->idxd->data->type]; + minor = ida_simple_get(&cdev_ctx->minor_ida, 0, MINORMASK, GFP_KERNEL); + if (minor < 0) { + kfree(idxd_cdev); + return minor; + } + idxd_cdev->minor = minor; + + device_initialize(dev); + dev->parent = wq_confdev(wq); + dev->bus = &dsa_bus_type; + dev->type = &idxd_cdev_device_type; + dev->devt = MKDEV(MAJOR(cdev_ctx->devt), minor); + + rc = dev_set_name(dev, "%s/wq%u.%u", idxd->data->name_prefix, idxd->id, wq->id); + if (rc < 0) + goto err; + + wq->idxd_cdev = idxd_cdev; + cdev_init(cdev, &idxd_cdev_fops); + rc = cdev_device_add(cdev, dev); + if (rc) { + dev_dbg(&wq->idxd->pdev->dev, "cdev_add failed: %d\n", rc); + goto err; + } + + return 0; + + err: + put_device(dev); + wq->idxd_cdev = NULL; + return rc; +} + +void idxd_wq_del_cdev(struct idxd_wq *wq) +{ + struct idxd_cdev *idxd_cdev; + + idxd_cdev = wq->idxd_cdev; + ida_destroy(&file_ida); + wq->idxd_cdev = NULL; + cdev_device_del(&idxd_cdev->cdev, cdev_dev(idxd_cdev)); + put_device(cdev_dev(idxd_cdev)); +} + +static int idxd_user_drv_probe(struct idxd_dev *idxd_dev) +{ + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + struct idxd_device *idxd = wq->idxd; + int rc; + + if (idxd->state != IDXD_DEV_ENABLED) + return -ENXIO; + + /* + * User type WQ is enabled only when SVA is enabled for two reasons: + * - If no IOMMU or IOMMU Passthrough without SVA, userspace + * can directly access physical address through the WQ. + * - The IDXD cdev driver does not provide any ways to pin + * user pages and translate the address from user VA to IOVA or + * PA without IOMMU SVA. Therefore the application has no way + * to instruct the device to perform DMA function. This makes + * the cdev not usable for normal application usage. + */ + if (!device_user_pasid_enabled(idxd)) { + idxd->cmd_status = IDXD_SCMD_WQ_USER_NO_IOMMU; + dev_dbg(&idxd->pdev->dev, + "User type WQ cannot be enabled without SVA.\n"); + + return -EOPNOTSUPP; + } + + mutex_lock(&wq->wq_lock); + + wq->wq = create_workqueue(dev_name(wq_confdev(wq))); + if (!wq->wq) { + rc = -ENOMEM; + goto wq_err; + } + + wq->type = IDXD_WQT_USER; + rc = drv_enable_wq(wq); + if (rc < 0) + goto err; + + rc = idxd_wq_add_cdev(wq); + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_CDEV_ERR; + goto err_cdev; + } + + idxd->cmd_status = 0; + mutex_unlock(&wq->wq_lock); + return 0; + +err_cdev: + drv_disable_wq(wq); +err: + destroy_workqueue(wq->wq); + wq->type = IDXD_WQT_NONE; +wq_err: + mutex_unlock(&wq->wq_lock); + return rc; +} + +static void idxd_user_drv_remove(struct idxd_dev *idxd_dev) +{ + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + + mutex_lock(&wq->wq_lock); + idxd_wq_del_cdev(wq); + drv_disable_wq(wq); + wq->type = IDXD_WQT_NONE; + destroy_workqueue(wq->wq); + wq->wq = NULL; + mutex_unlock(&wq->wq_lock); +} + +static enum idxd_dev_type dev_types[] = { + IDXD_DEV_WQ, + IDXD_DEV_NONE, +}; + +struct idxd_device_driver idxd_user_drv = { + .probe = idxd_user_drv_probe, + .remove = idxd_user_drv_remove, + .name = "user", + .type = dev_types, +}; +EXPORT_SYMBOL_GPL(idxd_user_drv); + +int idxd_cdev_register(void) +{ + int rc, i; + + for (i = 0; i < IDXD_TYPE_MAX; i++) { + ida_init(&ictx[i].minor_ida); + rc = alloc_chrdev_region(&ictx[i].devt, 0, MINORMASK, + ictx[i].name); + if (rc) + goto err_free_chrdev_region; + } + + return 0; + +err_free_chrdev_region: + for (i--; i >= 0; i--) + unregister_chrdev_region(ictx[i].devt, MINORMASK); + + return rc; +} + +void idxd_cdev_remove(void) +{ + int i; + + for (i = 0; i < IDXD_TYPE_MAX; i++) { + unregister_chrdev_region(ictx[i].devt, MINORMASK); + ida_destroy(&ictx[i].minor_ida); + } +} + +/** + * idxd_copy_cr - copy completion record to user address space found by wq and + * PASID + * @wq: work queue + * @pasid: PASID + * @addr: user fault address to write + * @cr: completion record + * @len: number of bytes to copy + * + * This is called by a work that handles completion record fault. + * + * Return: number of bytes copied. + */ +int idxd_copy_cr(struct idxd_wq *wq, ioasid_t pasid, unsigned long addr, + void *cr, int len) +{ + struct device *dev = &wq->idxd->pdev->dev; + int left = len, status_size = 1; + struct idxd_user_context *ctx; + struct mm_struct *mm; + + mutex_lock(&wq->uc_lock); + + ctx = xa_load(&wq->upasid_xa, pasid); + if (!ctx) { + dev_warn(dev, "No user context\n"); + goto out; + } + + mm = ctx->mm; + /* + * The completion record fault handling work is running in kernel + * thread context. It temporarily switches to the mm to copy cr + * to addr in the mm. + */ + kthread_use_mm(mm); + left = copy_to_user((void __user *)addr + status_size, cr + status_size, + len - status_size); + /* + * Copy status only after the rest of completion record is copied + * successfully so that the user gets the complete completion record + * when a non-zero status is polled. + */ + if (!left) { + u8 status; + + /* + * Ensure that the completion record's status field is written + * after the rest of the completion record has been written. + * This ensures that the user receives the correct completion + * record information once polling for a non-zero status. + */ + wmb(); + status = *(u8 *)cr; + if (put_user(status, (u8 __user *)addr)) + left += status_size; + } else { + left += status_size; + } + kthread_unuse_mm(mm); + +out: + mutex_unlock(&wq->uc_lock); + + return len - left; +} diff --git a/drivers/dma/idxd/compat.c b/drivers/dma/idxd/compat.c new file mode 100644 index 0000000000..5fd38d1b9d --- /dev/null +++ b/drivers/dma/idxd/compat.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2021 Intel Corporation. All rights rsvd. */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/device/bus.h> +#include "idxd.h" + +extern int device_driver_attach(struct device_driver *drv, struct device *dev); +extern void device_driver_detach(struct device *dev); + +#define DRIVER_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \ + struct driver_attribute driver_attr_##_name = \ + __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) + +static ssize_t unbind_store(struct device_driver *drv, const char *buf, size_t count) +{ + const struct bus_type *bus = drv->bus; + struct device *dev; + int rc = -ENODEV; + + dev = bus_find_device_by_name(bus, NULL, buf); + if (dev && dev->driver) { + device_driver_detach(dev); + rc = count; + } + + return rc; +} +static DRIVER_ATTR_IGNORE_LOCKDEP(unbind, 0200, NULL, unbind_store); + +static ssize_t bind_store(struct device_driver *drv, const char *buf, size_t count) +{ + const struct bus_type *bus = drv->bus; + struct device *dev; + struct device_driver *alt_drv = NULL; + int rc = -ENODEV; + struct idxd_dev *idxd_dev; + + dev = bus_find_device_by_name(bus, NULL, buf); + if (!dev || dev->driver || drv != &dsa_drv.drv) + return -ENODEV; + + idxd_dev = confdev_to_idxd_dev(dev); + if (is_idxd_dev(idxd_dev)) { + alt_drv = driver_find("idxd", bus); + } else if (is_idxd_wq_dev(idxd_dev)) { + struct idxd_wq *wq = confdev_to_wq(dev); + + if (is_idxd_wq_kernel(wq)) + alt_drv = driver_find("dmaengine", bus); + else if (is_idxd_wq_user(wq)) + alt_drv = driver_find("user", bus); + } + if (!alt_drv) + return -ENODEV; + + rc = device_driver_attach(alt_drv, dev); + if (rc < 0) + return rc; + + return count; +} +static DRIVER_ATTR_IGNORE_LOCKDEP(bind, 0200, NULL, bind_store); + +static struct attribute *dsa_drv_compat_attrs[] = { + &driver_attr_bind.attr, + &driver_attr_unbind.attr, + NULL, +}; + +static const struct attribute_group dsa_drv_compat_attr_group = { + .attrs = dsa_drv_compat_attrs, +}; + +static const struct attribute_group *dsa_drv_compat_groups[] = { + &dsa_drv_compat_attr_group, + NULL, +}; + +static int idxd_dsa_drv_probe(struct idxd_dev *idxd_dev) +{ + return -ENODEV; +} + +static void idxd_dsa_drv_remove(struct idxd_dev *idxd_dev) +{ +} + +static enum idxd_dev_type dev_types[] = { + IDXD_DEV_NONE, +}; + +struct idxd_device_driver dsa_drv = { + .name = "dsa", + .probe = idxd_dsa_drv_probe, + .remove = idxd_dsa_drv_remove, + .type = dev_types, + .drv = { + .suppress_bind_attrs = true, + .groups = dsa_drv_compat_groups, + }, +}; + +module_idxd_driver(dsa_drv); +MODULE_IMPORT_NS(IDXD); diff --git a/drivers/dma/idxd/debugfs.c b/drivers/dma/idxd/debugfs.c new file mode 100644 index 0000000000..9cfbd9b14c --- /dev/null +++ b/drivers/dma/idxd/debugfs.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2021 Intel Corporation. All rights rsvd. */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/debugfs.h> +#include <linux/io-64-nonatomic-lo-hi.h> +#include <uapi/linux/idxd.h> +#include "idxd.h" +#include "registers.h" + +static struct dentry *idxd_debugfs_dir; + +static void dump_event_entry(struct idxd_device *idxd, struct seq_file *s, + u16 index, int *count, bool processed) +{ + struct idxd_evl *evl = idxd->evl; + struct dsa_evl_entry *entry; + struct dsa_completion_record *cr; + u64 *raw; + int i; + int evl_strides = evl_ent_size(idxd) / sizeof(u64); + + entry = (struct dsa_evl_entry *)evl->log + index; + + if (!entry->e.desc_valid) + return; + + seq_printf(s, "Event Log entry %d (real index %u) processed: %u\n", + *count, index, processed); + + seq_printf(s, "desc valid %u wq idx valid %u\n" + "batch %u fault rw %u priv %u error 0x%x\n" + "wq idx %u op %#x pasid %u batch idx %u\n" + "fault addr %#llx\n", + entry->e.desc_valid, entry->e.wq_idx_valid, + entry->e.batch, entry->e.fault_rw, entry->e.priv, + entry->e.error, entry->e.wq_idx, entry->e.operation, + entry->e.pasid, entry->e.batch_idx, entry->e.fault_addr); + + cr = &entry->cr; + seq_printf(s, "status %#x result %#x fault_info %#x bytes_completed %u\n" + "fault addr %#llx inv flags %#x\n\n", + cr->status, cr->result, cr->fault_info, cr->bytes_completed, + cr->fault_addr, cr->invalid_flags); + + raw = (u64 *)entry; + + for (i = 0; i < evl_strides; i++) + seq_printf(s, "entry[%d] = %#llx\n", i, raw[i]); + + seq_puts(s, "\n"); + *count += 1; +} + +static int debugfs_evl_show(struct seq_file *s, void *d) +{ + struct idxd_device *idxd = s->private; + struct idxd_evl *evl = idxd->evl; + union evl_status_reg evl_status; + u16 h, t, evl_size, i; + int count = 0; + bool processed = true; + + if (!evl || !evl->log) + return 0; + + spin_lock(&evl->lock); + + h = evl->head; + evl_status.bits = ioread64(idxd->reg_base + IDXD_EVLSTATUS_OFFSET); + t = evl_status.tail; + evl_size = evl->size; + + seq_printf(s, "Event Log head %u tail %u interrupt pending %u\n\n", + evl_status.head, evl_status.tail, evl_status.int_pending); + + i = t; + while (1) { + i = (i + 1) % evl_size; + if (i == t) + break; + + if (processed && i == h) + processed = false; + dump_event_entry(idxd, s, i, &count, processed); + } + + spin_unlock(&evl->lock); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(debugfs_evl); + +int idxd_device_init_debugfs(struct idxd_device *idxd) +{ + if (IS_ERR_OR_NULL(idxd_debugfs_dir)) + return 0; + + idxd->dbgfs_dir = debugfs_create_dir(dev_name(idxd_confdev(idxd)), idxd_debugfs_dir); + if (IS_ERR(idxd->dbgfs_dir)) + return PTR_ERR(idxd->dbgfs_dir); + + if (idxd->evl) { + idxd->dbgfs_evl_file = debugfs_create_file("event_log", 0400, + idxd->dbgfs_dir, idxd, + &debugfs_evl_fops); + if (IS_ERR(idxd->dbgfs_evl_file)) { + debugfs_remove_recursive(idxd->dbgfs_dir); + idxd->dbgfs_dir = NULL; + return PTR_ERR(idxd->dbgfs_evl_file); + } + } + + return 0; +} + +void idxd_device_remove_debugfs(struct idxd_device *idxd) +{ + debugfs_remove_recursive(idxd->dbgfs_dir); +} + +int idxd_init_debugfs(void) +{ + if (!debugfs_initialized()) + return 0; + + idxd_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL); + if (IS_ERR(idxd_debugfs_dir)) + return PTR_ERR(idxd_debugfs_dir); + return 0; +} + +void idxd_remove_debugfs(void) +{ + debugfs_remove_recursive(idxd_debugfs_dir); +} diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c new file mode 100644 index 0000000000..fa0f880bea --- /dev/null +++ b/drivers/dma/idxd/device.c @@ -0,0 +1,1607 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/dmaengine.h> +#include <linux/irq.h> +#include <uapi/linux/idxd.h> +#include "../dmaengine.h" +#include "idxd.h" +#include "registers.h" + +static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, + u32 *status); +static void idxd_device_wqs_clear_state(struct idxd_device *idxd); +static void idxd_wq_disable_cleanup(struct idxd_wq *wq); + +/* Interrupt control bits */ +void idxd_unmask_error_interrupts(struct idxd_device *idxd) +{ + union genctrl_reg genctrl; + + genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET); + genctrl.softerr_int_en = 1; + genctrl.halt_int_en = 1; + iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET); +} + +void idxd_mask_error_interrupts(struct idxd_device *idxd) +{ + union genctrl_reg genctrl; + + genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET); + genctrl.softerr_int_en = 0; + genctrl.halt_int_en = 0; + iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET); +} + +static void free_hw_descs(struct idxd_wq *wq) +{ + int i; + + for (i = 0; i < wq->num_descs; i++) + kfree(wq->hw_descs[i]); + + kfree(wq->hw_descs); +} + +static int alloc_hw_descs(struct idxd_wq *wq, int num) +{ + struct device *dev = &wq->idxd->pdev->dev; + int i; + int node = dev_to_node(dev); + + wq->hw_descs = kcalloc_node(num, sizeof(struct dsa_hw_desc *), + GFP_KERNEL, node); + if (!wq->hw_descs) + return -ENOMEM; + + for (i = 0; i < num; i++) { + wq->hw_descs[i] = kzalloc_node(sizeof(*wq->hw_descs[i]), + GFP_KERNEL, node); + if (!wq->hw_descs[i]) { + free_hw_descs(wq); + return -ENOMEM; + } + } + + return 0; +} + +static void free_descs(struct idxd_wq *wq) +{ + int i; + + for (i = 0; i < wq->num_descs; i++) + kfree(wq->descs[i]); + + kfree(wq->descs); +} + +static int alloc_descs(struct idxd_wq *wq, int num) +{ + struct device *dev = &wq->idxd->pdev->dev; + int i; + int node = dev_to_node(dev); + + wq->descs = kcalloc_node(num, sizeof(struct idxd_desc *), + GFP_KERNEL, node); + if (!wq->descs) + return -ENOMEM; + + for (i = 0; i < num; i++) { + wq->descs[i] = kzalloc_node(sizeof(*wq->descs[i]), + GFP_KERNEL, node); + if (!wq->descs[i]) { + free_descs(wq); + return -ENOMEM; + } + } + + return 0; +} + +/* WQ control bits */ +int idxd_wq_alloc_resources(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + int rc, num_descs, i; + + if (wq->type != IDXD_WQT_KERNEL) + return 0; + + num_descs = wq_dedicated(wq) ? wq->size : wq->threshold; + wq->num_descs = num_descs; + + rc = alloc_hw_descs(wq, num_descs); + if (rc < 0) + return rc; + + wq->compls_size = num_descs * idxd->data->compl_size; + wq->compls = dma_alloc_coherent(dev, wq->compls_size, &wq->compls_addr, GFP_KERNEL); + if (!wq->compls) { + rc = -ENOMEM; + goto fail_alloc_compls; + } + + rc = alloc_descs(wq, num_descs); + if (rc < 0) + goto fail_alloc_descs; + + rc = sbitmap_queue_init_node(&wq->sbq, num_descs, -1, false, GFP_KERNEL, + dev_to_node(dev)); + if (rc < 0) + goto fail_sbitmap_init; + + for (i = 0; i < num_descs; i++) { + struct idxd_desc *desc = wq->descs[i]; + + desc->hw = wq->hw_descs[i]; + if (idxd->data->type == IDXD_TYPE_DSA) + desc->completion = &wq->compls[i]; + else if (idxd->data->type == IDXD_TYPE_IAX) + desc->iax_completion = &wq->iax_compls[i]; + desc->compl_dma = wq->compls_addr + idxd->data->compl_size * i; + desc->id = i; + desc->wq = wq; + desc->cpu = -1; + } + + return 0; + + fail_sbitmap_init: + free_descs(wq); + fail_alloc_descs: + dma_free_coherent(dev, wq->compls_size, wq->compls, wq->compls_addr); + fail_alloc_compls: + free_hw_descs(wq); + return rc; +} + +void idxd_wq_free_resources(struct idxd_wq *wq) +{ + struct device *dev = &wq->idxd->pdev->dev; + + if (wq->type != IDXD_WQT_KERNEL) + return; + + free_hw_descs(wq); + free_descs(wq); + dma_free_coherent(dev, wq->compls_size, wq->compls, wq->compls_addr); + sbitmap_queue_free(&wq->sbq); +} + +int idxd_wq_enable(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + u32 status; + + if (wq->state == IDXD_WQ_ENABLED) { + dev_dbg(dev, "WQ %d already enabled\n", wq->id); + return 0; + } + + idxd_cmd_exec(idxd, IDXD_CMD_ENABLE_WQ, wq->id, &status); + + if (status != IDXD_CMDSTS_SUCCESS && + status != IDXD_CMDSTS_ERR_WQ_ENABLED) { + dev_dbg(dev, "WQ enable failed: %#x\n", status); + return -ENXIO; + } + + wq->state = IDXD_WQ_ENABLED; + set_bit(wq->id, idxd->wq_enable_map); + dev_dbg(dev, "WQ %d enabled\n", wq->id); + return 0; +} + +int idxd_wq_disable(struct idxd_wq *wq, bool reset_config) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + u32 status, operand; + + dev_dbg(dev, "Disabling WQ %d\n", wq->id); + + if (wq->state != IDXD_WQ_ENABLED) { + dev_dbg(dev, "WQ %d in wrong state: %d\n", wq->id, wq->state); + return 0; + } + + operand = BIT(wq->id % 16) | ((wq->id / 16) << 16); + idxd_cmd_exec(idxd, IDXD_CMD_DISABLE_WQ, operand, &status); + + if (status != IDXD_CMDSTS_SUCCESS) { + dev_dbg(dev, "WQ disable failed: %#x\n", status); + return -ENXIO; + } + + if (reset_config) + idxd_wq_disable_cleanup(wq); + clear_bit(wq->id, idxd->wq_enable_map); + wq->state = IDXD_WQ_DISABLED; + dev_dbg(dev, "WQ %d disabled\n", wq->id); + return 0; +} + +void idxd_wq_drain(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + u32 operand; + + if (wq->state != IDXD_WQ_ENABLED) { + dev_dbg(dev, "WQ %d in wrong state: %d\n", wq->id, wq->state); + return; + } + + dev_dbg(dev, "Draining WQ %d\n", wq->id); + operand = BIT(wq->id % 16) | ((wq->id / 16) << 16); + idxd_cmd_exec(idxd, IDXD_CMD_DRAIN_WQ, operand, NULL); +} + +void idxd_wq_reset(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + u32 operand; + + if (wq->state != IDXD_WQ_ENABLED) { + dev_dbg(dev, "WQ %d in wrong state: %d\n", wq->id, wq->state); + return; + } + + operand = BIT(wq->id % 16) | ((wq->id / 16) << 16); + idxd_cmd_exec(idxd, IDXD_CMD_RESET_WQ, operand, NULL); + idxd_wq_disable_cleanup(wq); +} + +int idxd_wq_map_portal(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct pci_dev *pdev = idxd->pdev; + struct device *dev = &pdev->dev; + resource_size_t start; + + start = pci_resource_start(pdev, IDXD_WQ_BAR); + start += idxd_get_wq_portal_full_offset(wq->id, IDXD_PORTAL_LIMITED); + + wq->portal = devm_ioremap(dev, start, IDXD_PORTAL_SIZE); + if (!wq->portal) + return -ENOMEM; + + return 0; +} + +void idxd_wq_unmap_portal(struct idxd_wq *wq) +{ + struct device *dev = &wq->idxd->pdev->dev; + + devm_iounmap(dev, wq->portal); + wq->portal = NULL; + wq->portal_offset = 0; +} + +void idxd_wqs_unmap_portal(struct idxd_device *idxd) +{ + int i; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + if (wq->portal) + idxd_wq_unmap_portal(wq); + } +} + +static void __idxd_wq_set_pasid_locked(struct idxd_wq *wq, int pasid) +{ + struct idxd_device *idxd = wq->idxd; + union wqcfg wqcfg; + unsigned int offset; + + offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PASID_IDX); + spin_lock(&idxd->dev_lock); + wqcfg.bits[WQCFG_PASID_IDX] = ioread32(idxd->reg_base + offset); + wqcfg.pasid_en = 1; + wqcfg.pasid = pasid; + wq->wqcfg->bits[WQCFG_PASID_IDX] = wqcfg.bits[WQCFG_PASID_IDX]; + iowrite32(wqcfg.bits[WQCFG_PASID_IDX], idxd->reg_base + offset); + spin_unlock(&idxd->dev_lock); +} + +int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid) +{ + int rc; + + rc = idxd_wq_disable(wq, false); + if (rc < 0) + return rc; + + __idxd_wq_set_pasid_locked(wq, pasid); + + rc = idxd_wq_enable(wq); + if (rc < 0) + return rc; + + return 0; +} + +int idxd_wq_disable_pasid(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + int rc; + union wqcfg wqcfg; + unsigned int offset; + + rc = idxd_wq_disable(wq, false); + if (rc < 0) + return rc; + + offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PASID_IDX); + spin_lock(&idxd->dev_lock); + wqcfg.bits[WQCFG_PASID_IDX] = ioread32(idxd->reg_base + offset); + wqcfg.pasid_en = 0; + wqcfg.pasid = 0; + iowrite32(wqcfg.bits[WQCFG_PASID_IDX], idxd->reg_base + offset); + spin_unlock(&idxd->dev_lock); + + rc = idxd_wq_enable(wq); + if (rc < 0) + return rc; + + return 0; +} + +static void idxd_wq_disable_cleanup(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + + lockdep_assert_held(&wq->wq_lock); + wq->state = IDXD_WQ_DISABLED; + memset(wq->wqcfg, 0, idxd->wqcfg_size); + wq->type = IDXD_WQT_NONE; + wq->threshold = 0; + wq->priority = 0; + wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES; + wq->flags = 0; + memset(wq->name, 0, WQ_NAME_SIZE); + wq->max_xfer_bytes = WQ_DEFAULT_MAX_XFER; + idxd_wq_set_max_batch_size(idxd->data->type, wq, WQ_DEFAULT_MAX_BATCH); + if (wq->opcap_bmap) + bitmap_copy(wq->opcap_bmap, idxd->opcap_bmap, IDXD_MAX_OPCAP_BITS); +} + +static void idxd_wq_device_reset_cleanup(struct idxd_wq *wq) +{ + lockdep_assert_held(&wq->wq_lock); + + wq->size = 0; + wq->group = NULL; +} + +static void idxd_wq_ref_release(struct percpu_ref *ref) +{ + struct idxd_wq *wq = container_of(ref, struct idxd_wq, wq_active); + + complete(&wq->wq_dead); +} + +int idxd_wq_init_percpu_ref(struct idxd_wq *wq) +{ + int rc; + + memset(&wq->wq_active, 0, sizeof(wq->wq_active)); + rc = percpu_ref_init(&wq->wq_active, idxd_wq_ref_release, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); + if (rc < 0) + return rc; + reinit_completion(&wq->wq_dead); + reinit_completion(&wq->wq_resurrect); + return 0; +} + +void __idxd_wq_quiesce(struct idxd_wq *wq) +{ + lockdep_assert_held(&wq->wq_lock); + reinit_completion(&wq->wq_resurrect); + percpu_ref_kill(&wq->wq_active); + complete_all(&wq->wq_resurrect); + wait_for_completion(&wq->wq_dead); +} + +void idxd_wq_quiesce(struct idxd_wq *wq) +{ + mutex_lock(&wq->wq_lock); + __idxd_wq_quiesce(wq); + mutex_unlock(&wq->wq_lock); +} + +/* Device control bits */ +static inline bool idxd_is_enabled(struct idxd_device *idxd) +{ + union gensts_reg gensts; + + gensts.bits = ioread32(idxd->reg_base + IDXD_GENSTATS_OFFSET); + + if (gensts.state == IDXD_DEVICE_STATE_ENABLED) + return true; + return false; +} + +static inline bool idxd_device_is_halted(struct idxd_device *idxd) +{ + union gensts_reg gensts; + + gensts.bits = ioread32(idxd->reg_base + IDXD_GENSTATS_OFFSET); + + return (gensts.state == IDXD_DEVICE_STATE_HALT); +} + +/* + * This is function is only used for reset during probe and will + * poll for completion. Once the device is setup with interrupts, + * all commands will be done via interrupt completion. + */ +int idxd_device_init_reset(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + union idxd_command_reg cmd; + + if (idxd_device_is_halted(idxd)) { + dev_warn(&idxd->pdev->dev, "Device is HALTED!\n"); + return -ENXIO; + } + + memset(&cmd, 0, sizeof(cmd)); + cmd.cmd = IDXD_CMD_RESET_DEVICE; + dev_dbg(dev, "%s: sending reset for init.\n", __func__); + spin_lock(&idxd->cmd_lock); + iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET); + + while (ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET) & + IDXD_CMDSTS_ACTIVE) + cpu_relax(); + spin_unlock(&idxd->cmd_lock); + return 0; +} + +static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, + u32 *status) +{ + union idxd_command_reg cmd; + DECLARE_COMPLETION_ONSTACK(done); + u32 stat; + unsigned long flags; + + if (idxd_device_is_halted(idxd)) { + dev_warn(&idxd->pdev->dev, "Device is HALTED!\n"); + if (status) + *status = IDXD_CMDSTS_HW_ERR; + return; + } + + memset(&cmd, 0, sizeof(cmd)); + cmd.cmd = cmd_code; + cmd.operand = operand; + cmd.int_req = 1; + + spin_lock_irqsave(&idxd->cmd_lock, flags); + wait_event_lock_irq(idxd->cmd_waitq, + !test_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags), + idxd->cmd_lock); + + dev_dbg(&idxd->pdev->dev, "%s: sending cmd: %#x op: %#x\n", + __func__, cmd_code, operand); + + idxd->cmd_status = 0; + __set_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags); + idxd->cmd_done = &done; + iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET); + + /* + * After command submitted, release lock and go to sleep until + * the command completes via interrupt. + */ + spin_unlock_irqrestore(&idxd->cmd_lock, flags); + wait_for_completion(&done); + stat = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); + spin_lock(&idxd->cmd_lock); + if (status) + *status = stat; + idxd->cmd_status = stat & GENMASK(7, 0); + + __clear_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags); + /* Wake up other pending commands */ + wake_up(&idxd->cmd_waitq); + spin_unlock(&idxd->cmd_lock); +} + +int idxd_device_enable(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + u32 status; + + if (idxd_is_enabled(idxd)) { + dev_dbg(dev, "Device already enabled\n"); + return -ENXIO; + } + + idxd_cmd_exec(idxd, IDXD_CMD_ENABLE_DEVICE, 0, &status); + + /* If the command is successful or if the device was enabled */ + if (status != IDXD_CMDSTS_SUCCESS && + status != IDXD_CMDSTS_ERR_DEV_ENABLED) { + dev_dbg(dev, "%s: err_code: %#x\n", __func__, status); + return -ENXIO; + } + + idxd->state = IDXD_DEV_ENABLED; + return 0; +} + +int idxd_device_disable(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + u32 status; + + if (!idxd_is_enabled(idxd)) { + dev_dbg(dev, "Device is not enabled\n"); + return 0; + } + + idxd_cmd_exec(idxd, IDXD_CMD_DISABLE_DEVICE, 0, &status); + + /* If the command is successful or if the device was disabled */ + if (status != IDXD_CMDSTS_SUCCESS && + !(status & IDXD_CMDSTS_ERR_DIS_DEV_EN)) { + dev_dbg(dev, "%s: err_code: %#x\n", __func__, status); + return -ENXIO; + } + + idxd_device_clear_state(idxd); + return 0; +} + +void idxd_device_reset(struct idxd_device *idxd) +{ + idxd_cmd_exec(idxd, IDXD_CMD_RESET_DEVICE, 0, NULL); + idxd_device_clear_state(idxd); + spin_lock(&idxd->dev_lock); + idxd_unmask_error_interrupts(idxd); + spin_unlock(&idxd->dev_lock); +} + +void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid) +{ + struct device *dev = &idxd->pdev->dev; + u32 operand; + + operand = pasid; + dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_DRAIN_PASID, operand); + idxd_cmd_exec(idxd, IDXD_CMD_DRAIN_PASID, operand, NULL); + dev_dbg(dev, "pasid %d drained\n", pasid); +} + +int idxd_device_request_int_handle(struct idxd_device *idxd, int idx, int *handle, + enum idxd_interrupt_type irq_type) +{ + struct device *dev = &idxd->pdev->dev; + u32 operand, status; + + if (!(idxd->hw.cmd_cap & BIT(IDXD_CMD_REQUEST_INT_HANDLE))) + return -EOPNOTSUPP; + + dev_dbg(dev, "get int handle, idx %d\n", idx); + + operand = idx & GENMASK(15, 0); + if (irq_type == IDXD_IRQ_IMS) + operand |= CMD_INT_HANDLE_IMS; + + dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_REQUEST_INT_HANDLE, operand); + + idxd_cmd_exec(idxd, IDXD_CMD_REQUEST_INT_HANDLE, operand, &status); + + if ((status & IDXD_CMDSTS_ERR_MASK) != IDXD_CMDSTS_SUCCESS) { + dev_dbg(dev, "request int handle failed: %#x\n", status); + return -ENXIO; + } + + *handle = (status >> IDXD_CMDSTS_RES_SHIFT) & GENMASK(15, 0); + + dev_dbg(dev, "int handle acquired: %u\n", *handle); + return 0; +} + +int idxd_device_release_int_handle(struct idxd_device *idxd, int handle, + enum idxd_interrupt_type irq_type) +{ + struct device *dev = &idxd->pdev->dev; + u32 operand, status; + union idxd_command_reg cmd; + + if (!(idxd->hw.cmd_cap & BIT(IDXD_CMD_RELEASE_INT_HANDLE))) + return -EOPNOTSUPP; + + dev_dbg(dev, "release int handle, handle %d\n", handle); + + memset(&cmd, 0, sizeof(cmd)); + operand = handle & GENMASK(15, 0); + + if (irq_type == IDXD_IRQ_IMS) + operand |= CMD_INT_HANDLE_IMS; + + cmd.cmd = IDXD_CMD_RELEASE_INT_HANDLE; + cmd.operand = operand; + + dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_RELEASE_INT_HANDLE, operand); + + spin_lock(&idxd->cmd_lock); + iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET); + + while (ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET) & IDXD_CMDSTS_ACTIVE) + cpu_relax(); + status = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); + spin_unlock(&idxd->cmd_lock); + + if ((status & IDXD_CMDSTS_ERR_MASK) != IDXD_CMDSTS_SUCCESS) { + dev_dbg(dev, "release int handle failed: %#x\n", status); + return -ENXIO; + } + + dev_dbg(dev, "int handle released.\n"); + return 0; +} + +/* Device configuration bits */ +static void idxd_engines_clear_state(struct idxd_device *idxd) +{ + struct idxd_engine *engine; + int i; + + lockdep_assert_held(&idxd->dev_lock); + for (i = 0; i < idxd->max_engines; i++) { + engine = idxd->engines[i]; + engine->group = NULL; + } +} + +static void idxd_groups_clear_state(struct idxd_device *idxd) +{ + struct idxd_group *group; + int i; + + lockdep_assert_held(&idxd->dev_lock); + for (i = 0; i < idxd->max_groups; i++) { + group = idxd->groups[i]; + memset(&group->grpcfg, 0, sizeof(group->grpcfg)); + group->num_engines = 0; + group->num_wqs = 0; + group->use_rdbuf_limit = false; + /* + * The default value is the same as the value of + * total read buffers in GRPCAP. + */ + group->rdbufs_allowed = idxd->max_rdbufs; + group->rdbufs_reserved = 0; + if (idxd->hw.version <= DEVICE_VERSION_2 && !tc_override) { + group->tc_a = 1; + group->tc_b = 1; + } else { + group->tc_a = -1; + group->tc_b = -1; + } + group->desc_progress_limit = 0; + group->batch_progress_limit = 0; + } +} + +static void idxd_device_wqs_clear_state(struct idxd_device *idxd) +{ + int i; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + mutex_lock(&wq->wq_lock); + idxd_wq_disable_cleanup(wq); + idxd_wq_device_reset_cleanup(wq); + mutex_unlock(&wq->wq_lock); + } +} + +void idxd_device_clear_state(struct idxd_device *idxd) +{ + /* IDXD is always disabled. Other states are cleared only when IDXD is configurable. */ + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) { + /* + * Clearing wq state is protected by wq lock. + * So no need to be protected by device lock. + */ + idxd_device_wqs_clear_state(idxd); + + spin_lock(&idxd->dev_lock); + idxd_groups_clear_state(idxd); + idxd_engines_clear_state(idxd); + } else { + spin_lock(&idxd->dev_lock); + } + + idxd->state = IDXD_DEV_DISABLED; + spin_unlock(&idxd->dev_lock); +} + +static int idxd_device_evl_setup(struct idxd_device *idxd) +{ + union gencfg_reg gencfg; + union evlcfg_reg evlcfg; + union genctrl_reg genctrl; + struct device *dev = &idxd->pdev->dev; + void *addr; + dma_addr_t dma_addr; + int size; + struct idxd_evl *evl = idxd->evl; + unsigned long *bmap; + int rc; + + if (!evl) + return 0; + + size = evl_size(idxd); + + bmap = bitmap_zalloc(size, GFP_KERNEL); + if (!bmap) { + rc = -ENOMEM; + goto err_bmap; + } + + /* + * Address needs to be page aligned. However, dma_alloc_coherent() provides + * at minimal page size aligned address. No manual alignment required. + */ + addr = dma_alloc_coherent(dev, size, &dma_addr, GFP_KERNEL); + if (!addr) { + rc = -ENOMEM; + goto err_alloc; + } + + spin_lock(&evl->lock); + evl->log = addr; + evl->dma = dma_addr; + evl->log_size = size; + evl->bmap = bmap; + + memset(&evlcfg, 0, sizeof(evlcfg)); + evlcfg.bits[0] = dma_addr & GENMASK(63, 12); + evlcfg.size = evl->size; + + iowrite64(evlcfg.bits[0], idxd->reg_base + IDXD_EVLCFG_OFFSET); + iowrite64(evlcfg.bits[1], idxd->reg_base + IDXD_EVLCFG_OFFSET + 8); + + genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET); + genctrl.evl_int_en = 1; + iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET); + + gencfg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET); + gencfg.evl_en = 1; + iowrite32(gencfg.bits, idxd->reg_base + IDXD_GENCFG_OFFSET); + + spin_unlock(&evl->lock); + return 0; + +err_alloc: + bitmap_free(bmap); +err_bmap: + return rc; +} + +static void idxd_device_evl_free(struct idxd_device *idxd) +{ + void *evl_log; + unsigned int evl_log_size; + dma_addr_t evl_dma; + union gencfg_reg gencfg; + union genctrl_reg genctrl; + struct device *dev = &idxd->pdev->dev; + struct idxd_evl *evl = idxd->evl; + + gencfg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET); + if (!gencfg.evl_en) + return; + + spin_lock(&evl->lock); + gencfg.evl_en = 0; + iowrite32(gencfg.bits, idxd->reg_base + IDXD_GENCFG_OFFSET); + + genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET); + genctrl.evl_int_en = 0; + iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET); + + iowrite64(0, idxd->reg_base + IDXD_EVLCFG_OFFSET); + iowrite64(0, idxd->reg_base + IDXD_EVLCFG_OFFSET + 8); + + bitmap_free(evl->bmap); + evl_log = evl->log; + evl_log_size = evl->log_size; + evl_dma = evl->dma; + evl->log = NULL; + evl->size = IDXD_EVL_SIZE_MIN; + spin_unlock(&evl->lock); + + dma_free_coherent(dev, evl_log_size, evl_log, evl_dma); +} + +static void idxd_group_config_write(struct idxd_group *group) +{ + struct idxd_device *idxd = group->idxd; + struct device *dev = &idxd->pdev->dev; + int i; + u32 grpcfg_offset; + + dev_dbg(dev, "Writing group %d cfg registers\n", group->id); + + /* setup GRPWQCFG */ + for (i = 0; i < GRPWQCFG_STRIDES; i++) { + grpcfg_offset = GRPWQCFG_OFFSET(idxd, group->id, i); + iowrite64(group->grpcfg.wqs[i], idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPCFG wq[%d:%d: %#x]: %#llx\n", + group->id, i, grpcfg_offset, + ioread64(idxd->reg_base + grpcfg_offset)); + } + + /* setup GRPENGCFG */ + grpcfg_offset = GRPENGCFG_OFFSET(idxd, group->id); + iowrite64(group->grpcfg.engines, idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPCFG engs[%d: %#x]: %#llx\n", group->id, + grpcfg_offset, ioread64(idxd->reg_base + grpcfg_offset)); + + /* setup GRPFLAGS */ + grpcfg_offset = GRPFLGCFG_OFFSET(idxd, group->id); + iowrite64(group->grpcfg.flags.bits, idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPFLAGS flags[%d: %#x]: %#llx\n", + group->id, grpcfg_offset, + ioread64(idxd->reg_base + grpcfg_offset)); +} + +static int idxd_groups_config_write(struct idxd_device *idxd) + +{ + union gencfg_reg reg; + int i; + struct device *dev = &idxd->pdev->dev; + + /* Setup bandwidth rdbuf limit */ + if (idxd->hw.gen_cap.config_en && idxd->rdbuf_limit) { + reg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET); + reg.rdbuf_limit = idxd->rdbuf_limit; + iowrite32(reg.bits, idxd->reg_base + IDXD_GENCFG_OFFSET); + } + + dev_dbg(dev, "GENCFG(%#x): %#x\n", IDXD_GENCFG_OFFSET, + ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET)); + + for (i = 0; i < idxd->max_groups; i++) { + struct idxd_group *group = idxd->groups[i]; + + idxd_group_config_write(group); + } + + return 0; +} + +static bool idxd_device_pasid_priv_enabled(struct idxd_device *idxd) +{ + struct pci_dev *pdev = idxd->pdev; + + if (pdev->pasid_enabled && (pdev->pasid_features & PCI_PASID_CAP_PRIV)) + return true; + return false; +} + +static int idxd_wq_config_write(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + u32 wq_offset; + int i, n; + + if (!wq->group) + return 0; + + /* + * Instead of memset the entire shadow copy of WQCFG, copy from the hardware after + * wq reset. This will copy back the sticky values that are present on some devices. + */ + for (i = 0; i < WQCFG_STRIDES(idxd); i++) { + wq_offset = WQCFG_OFFSET(idxd, wq->id, i); + wq->wqcfg->bits[i] |= ioread32(idxd->reg_base + wq_offset); + } + + if (wq->size == 0 && wq->type != IDXD_WQT_NONE) + wq->size = WQ_DEFAULT_QUEUE_DEPTH; + + /* byte 0-3 */ + wq->wqcfg->wq_size = wq->size; + + /* bytes 4-7 */ + wq->wqcfg->wq_thresh = wq->threshold; + + /* byte 8-11 */ + if (wq_dedicated(wq)) + wq->wqcfg->mode = 1; + + /* + * The WQ priv bit is set depending on the WQ type. priv = 1 if the + * WQ type is kernel to indicate privileged access. This setting only + * matters for dedicated WQ. According to the DSA spec: + * If the WQ is in dedicated mode, WQ PASID Enable is 1, and the + * Privileged Mode Enable field of the PCI Express PASID capability + * is 0, this field must be 0. + * + * In the case of a dedicated kernel WQ that is not able to support + * the PASID cap, then the configuration will be rejected. + */ + if (wq_dedicated(wq) && wq->wqcfg->pasid_en && + !idxd_device_pasid_priv_enabled(idxd) && + wq->type == IDXD_WQT_KERNEL) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_PRIV; + return -EOPNOTSUPP; + } + + wq->wqcfg->priority = wq->priority; + + if (idxd->hw.gen_cap.block_on_fault && + test_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags) && + !test_bit(WQ_FLAG_PRS_DISABLE, &wq->flags)) + wq->wqcfg->bof = 1; + + if (idxd->hw.wq_cap.wq_ats_support) + wq->wqcfg->wq_ats_disable = test_bit(WQ_FLAG_ATS_DISABLE, &wq->flags); + + if (idxd->hw.wq_cap.wq_prs_support) + wq->wqcfg->wq_prs_disable = test_bit(WQ_FLAG_PRS_DISABLE, &wq->flags); + + /* bytes 12-15 */ + wq->wqcfg->max_xfer_shift = ilog2(wq->max_xfer_bytes); + idxd_wqcfg_set_max_batch_shift(idxd->data->type, wq->wqcfg, ilog2(wq->max_batch_size)); + + /* bytes 32-63 */ + if (idxd->hw.wq_cap.op_config && wq->opcap_bmap) { + memset(wq->wqcfg->op_config, 0, IDXD_MAX_OPCAP_BITS / 8); + for_each_set_bit(n, wq->opcap_bmap, IDXD_MAX_OPCAP_BITS) { + int pos = n % BITS_PER_LONG_LONG; + int idx = n / BITS_PER_LONG_LONG; + + wq->wqcfg->op_config[idx] |= BIT(pos); + } + } + + dev_dbg(dev, "WQ %d CFGs\n", wq->id); + for (i = 0; i < WQCFG_STRIDES(idxd); i++) { + wq_offset = WQCFG_OFFSET(idxd, wq->id, i); + iowrite32(wq->wqcfg->bits[i], idxd->reg_base + wq_offset); + dev_dbg(dev, "WQ[%d][%d][%#x]: %#x\n", + wq->id, i, wq_offset, + ioread32(idxd->reg_base + wq_offset)); + } + + return 0; +} + +static int idxd_wqs_config_write(struct idxd_device *idxd) +{ + int i, rc; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + rc = idxd_wq_config_write(wq); + if (rc < 0) + return rc; + } + + return 0; +} + +static void idxd_group_flags_setup(struct idxd_device *idxd) +{ + int i; + + /* TC-A 0 and TC-B 1 should be defaults */ + for (i = 0; i < idxd->max_groups; i++) { + struct idxd_group *group = idxd->groups[i]; + + if (group->tc_a == -1) + group->tc_a = group->grpcfg.flags.tc_a = 0; + else + group->grpcfg.flags.tc_a = group->tc_a; + if (group->tc_b == -1) + group->tc_b = group->grpcfg.flags.tc_b = 1; + else + group->grpcfg.flags.tc_b = group->tc_b; + group->grpcfg.flags.use_rdbuf_limit = group->use_rdbuf_limit; + group->grpcfg.flags.rdbufs_reserved = group->rdbufs_reserved; + group->grpcfg.flags.rdbufs_allowed = group->rdbufs_allowed; + group->grpcfg.flags.desc_progress_limit = group->desc_progress_limit; + group->grpcfg.flags.batch_progress_limit = group->batch_progress_limit; + } +} + +static int idxd_engines_setup(struct idxd_device *idxd) +{ + int i, engines = 0; + struct idxd_engine *eng; + struct idxd_group *group; + + for (i = 0; i < idxd->max_groups; i++) { + group = idxd->groups[i]; + group->grpcfg.engines = 0; + } + + for (i = 0; i < idxd->max_engines; i++) { + eng = idxd->engines[i]; + group = eng->group; + + if (!group) + continue; + + group->grpcfg.engines |= BIT(eng->id); + engines++; + } + + if (!engines) + return -EINVAL; + + return 0; +} + +static int idxd_wqs_setup(struct idxd_device *idxd) +{ + struct idxd_wq *wq; + struct idxd_group *group; + int i, j, configured = 0; + struct device *dev = &idxd->pdev->dev; + + for (i = 0; i < idxd->max_groups; i++) { + group = idxd->groups[i]; + for (j = 0; j < 4; j++) + group->grpcfg.wqs[j] = 0; + } + + for (i = 0; i < idxd->max_wqs; i++) { + wq = idxd->wqs[i]; + group = wq->group; + + if (!wq->group) + continue; + + if (wq_shared(wq) && !wq_shared_supported(wq)) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_SWQ_SUPPORT; + dev_warn(dev, "No shared wq support but configured.\n"); + return -EINVAL; + } + + group->grpcfg.wqs[wq->id / 64] |= BIT(wq->id % 64); + configured++; + } + + if (configured == 0) { + idxd->cmd_status = IDXD_SCMD_WQ_NONE_CONFIGURED; + return -EINVAL; + } + + return 0; +} + +int idxd_device_config(struct idxd_device *idxd) +{ + int rc; + + lockdep_assert_held(&idxd->dev_lock); + rc = idxd_wqs_setup(idxd); + if (rc < 0) + return rc; + + rc = idxd_engines_setup(idxd); + if (rc < 0) + return rc; + + idxd_group_flags_setup(idxd); + + rc = idxd_wqs_config_write(idxd); + if (rc < 0) + return rc; + + rc = idxd_groups_config_write(idxd); + if (rc < 0) + return rc; + + return 0; +} + +static int idxd_wq_load_config(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + int wqcfg_offset; + int i; + + wqcfg_offset = WQCFG_OFFSET(idxd, wq->id, 0); + memcpy_fromio(wq->wqcfg, idxd->reg_base + wqcfg_offset, idxd->wqcfg_size); + + wq->size = wq->wqcfg->wq_size; + wq->threshold = wq->wqcfg->wq_thresh; + + /* The driver does not support shared WQ mode in read-only config yet */ + if (wq->wqcfg->mode == 0 || wq->wqcfg->pasid_en) + return -EOPNOTSUPP; + + set_bit(WQ_FLAG_DEDICATED, &wq->flags); + + wq->priority = wq->wqcfg->priority; + + wq->max_xfer_bytes = 1ULL << wq->wqcfg->max_xfer_shift; + idxd_wq_set_max_batch_size(idxd->data->type, wq, 1U << wq->wqcfg->max_batch_shift); + + for (i = 0; i < WQCFG_STRIDES(idxd); i++) { + wqcfg_offset = WQCFG_OFFSET(idxd, wq->id, i); + dev_dbg(dev, "WQ[%d][%d][%#x]: %#x\n", wq->id, i, wqcfg_offset, wq->wqcfg->bits[i]); + } + + return 0; +} + +static void idxd_group_load_config(struct idxd_group *group) +{ + struct idxd_device *idxd = group->idxd; + struct device *dev = &idxd->pdev->dev; + int i, j, grpcfg_offset; + + /* + * Load WQS bit fields + * Iterate through all 256 bits 64 bits at a time + */ + for (i = 0; i < GRPWQCFG_STRIDES; i++) { + struct idxd_wq *wq; + + grpcfg_offset = GRPWQCFG_OFFSET(idxd, group->id, i); + group->grpcfg.wqs[i] = ioread64(idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPCFG wq[%d:%d: %#x]: %#llx\n", + group->id, i, grpcfg_offset, group->grpcfg.wqs[i]); + + if (i * 64 >= idxd->max_wqs) + break; + + /* Iterate through all 64 bits and check for wq set */ + for (j = 0; j < 64; j++) { + int id = i * 64 + j; + + /* No need to check beyond max wqs */ + if (id >= idxd->max_wqs) + break; + + /* Set group assignment for wq if wq bit is set */ + if (group->grpcfg.wqs[i] & BIT(j)) { + wq = idxd->wqs[id]; + wq->group = group; + } + } + } + + grpcfg_offset = GRPENGCFG_OFFSET(idxd, group->id); + group->grpcfg.engines = ioread64(idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPCFG engs[%d: %#x]: %#llx\n", group->id, + grpcfg_offset, group->grpcfg.engines); + + /* Iterate through all 64 bits to check engines set */ + for (i = 0; i < 64; i++) { + if (i >= idxd->max_engines) + break; + + if (group->grpcfg.engines & BIT(i)) { + struct idxd_engine *engine = idxd->engines[i]; + + engine->group = group; + } + } + + grpcfg_offset = GRPFLGCFG_OFFSET(idxd, group->id); + group->grpcfg.flags.bits = ioread64(idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPFLAGS flags[%d: %#x]: %#llx\n", + group->id, grpcfg_offset, group->grpcfg.flags.bits); +} + +int idxd_device_load_config(struct idxd_device *idxd) +{ + union gencfg_reg reg; + int i, rc; + + reg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET); + idxd->rdbuf_limit = reg.rdbuf_limit; + + for (i = 0; i < idxd->max_groups; i++) { + struct idxd_group *group = idxd->groups[i]; + + idxd_group_load_config(group); + } + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + rc = idxd_wq_load_config(wq); + if (rc < 0) + return rc; + } + + return 0; +} + +static void idxd_flush_pending_descs(struct idxd_irq_entry *ie) +{ + struct idxd_desc *desc, *itr; + struct llist_node *head; + LIST_HEAD(flist); + enum idxd_complete_type ctype; + + spin_lock(&ie->list_lock); + head = llist_del_all(&ie->pending_llist); + if (head) { + llist_for_each_entry_safe(desc, itr, head, llnode) + list_add_tail(&desc->list, &ie->work_list); + } + + list_for_each_entry_safe(desc, itr, &ie->work_list, list) + list_move_tail(&desc->list, &flist); + spin_unlock(&ie->list_lock); + + list_for_each_entry_safe(desc, itr, &flist, list) { + struct dma_async_tx_descriptor *tx; + + list_del(&desc->list); + ctype = desc->completion->status ? IDXD_COMPLETE_NORMAL : IDXD_COMPLETE_ABORT; + /* + * wq is being disabled. Any remaining descriptors are + * likely to be stuck and can be dropped. callback could + * point to code that is no longer accessible, for example + * if dmatest module has been unloaded. + */ + tx = &desc->txd; + tx->callback = NULL; + tx->callback_result = NULL; + idxd_dma_complete_txd(desc, ctype, true); + } +} + +static void idxd_device_set_perm_entry(struct idxd_device *idxd, + struct idxd_irq_entry *ie) +{ + union msix_perm mperm; + + if (ie->pasid == IOMMU_PASID_INVALID) + return; + + mperm.bits = 0; + mperm.pasid = ie->pasid; + mperm.pasid_en = 1; + iowrite32(mperm.bits, idxd->reg_base + idxd->msix_perm_offset + ie->id * 8); +} + +static void idxd_device_clear_perm_entry(struct idxd_device *idxd, + struct idxd_irq_entry *ie) +{ + iowrite32(0, idxd->reg_base + idxd->msix_perm_offset + ie->id * 8); +} + +void idxd_wq_free_irq(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct idxd_irq_entry *ie = &wq->ie; + + if (wq->type != IDXD_WQT_KERNEL) + return; + + free_irq(ie->vector, ie); + idxd_flush_pending_descs(ie); + if (idxd->request_int_handles) + idxd_device_release_int_handle(idxd, ie->int_handle, IDXD_IRQ_MSIX); + idxd_device_clear_perm_entry(idxd, ie); + ie->vector = -1; + ie->int_handle = INVALID_INT_HANDLE; + ie->pasid = IOMMU_PASID_INVALID; +} + +int idxd_wq_request_irq(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct pci_dev *pdev = idxd->pdev; + struct device *dev = &pdev->dev; + struct idxd_irq_entry *ie; + int rc; + + if (wq->type != IDXD_WQT_KERNEL) + return 0; + + ie = &wq->ie; + ie->vector = pci_irq_vector(pdev, ie->id); + ie->pasid = device_pasid_enabled(idxd) ? idxd->pasid : IOMMU_PASID_INVALID; + idxd_device_set_perm_entry(idxd, ie); + + rc = request_threaded_irq(ie->vector, NULL, idxd_wq_thread, 0, "idxd-portal", ie); + if (rc < 0) { + dev_err(dev, "Failed to request irq %d.\n", ie->vector); + goto err_irq; + } + + if (idxd->request_int_handles) { + rc = idxd_device_request_int_handle(idxd, ie->id, &ie->int_handle, + IDXD_IRQ_MSIX); + if (rc < 0) + goto err_int_handle; + } else { + ie->int_handle = ie->id; + } + + return 0; + +err_int_handle: + ie->int_handle = INVALID_INT_HANDLE; + free_irq(ie->vector, ie); +err_irq: + idxd_device_clear_perm_entry(idxd, ie); + ie->pasid = IOMMU_PASID_INVALID; + return rc; +} + +int drv_enable_wq(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + int rc = -ENXIO; + + lockdep_assert_held(&wq->wq_lock); + + if (idxd->state != IDXD_DEV_ENABLED) { + idxd->cmd_status = IDXD_SCMD_DEV_NOT_ENABLED; + goto err; + } + + if (wq->state != IDXD_WQ_DISABLED) { + dev_dbg(dev, "wq %d already enabled.\n", wq->id); + idxd->cmd_status = IDXD_SCMD_WQ_ENABLED; + rc = -EBUSY; + goto err; + } + + if (!wq->group) { + dev_dbg(dev, "wq %d not attached to group.\n", wq->id); + idxd->cmd_status = IDXD_SCMD_WQ_NO_GRP; + goto err; + } + + if (strlen(wq->name) == 0) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_NAME; + dev_dbg(dev, "wq %d name not set.\n", wq->id); + goto err; + } + + /* Shared WQ checks */ + if (wq_shared(wq)) { + if (!wq_shared_supported(wq)) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_SVM; + dev_dbg(dev, "PASID not enabled and shared wq.\n"); + goto err; + } + /* + * Shared wq with the threshold set to 0 means the user + * did not set the threshold or transitioned from a + * dedicated wq but did not set threshold. A value + * of 0 would effectively disable the shared wq. The + * driver does not allow a value of 0 to be set for + * threshold via sysfs. + */ + if (wq->threshold == 0) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_THRESH; + dev_dbg(dev, "Shared wq and threshold 0.\n"); + goto err; + } + } + + /* + * In the event that the WQ is configurable for pasid, the driver + * should setup the pasid, pasid_en bit. This is true for both kernel + * and user shared workqueues. There is no need to setup priv bit in + * that in-kernel DMA will also do user privileged requests. + * A dedicated wq that is not 'kernel' type will configure pasid and + * pasid_en later on so there is no need to setup. + */ + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) { + if (wq_pasid_enabled(wq)) { + if (is_idxd_wq_kernel(wq) || wq_shared(wq)) { + u32 pasid = wq_dedicated(wq) ? idxd->pasid : 0; + + __idxd_wq_set_pasid_locked(wq, pasid); + } + } + } + + rc = 0; + spin_lock(&idxd->dev_lock); + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + rc = idxd_device_config(idxd); + spin_unlock(&idxd->dev_lock); + if (rc < 0) { + dev_dbg(dev, "Writing wq %d config failed: %d\n", wq->id, rc); + goto err; + } + + rc = idxd_wq_enable(wq); + if (rc < 0) { + dev_dbg(dev, "wq %d enabling failed: %d\n", wq->id, rc); + goto err; + } + + rc = idxd_wq_map_portal(wq); + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_WQ_PORTAL_ERR; + dev_dbg(dev, "wq %d portal mapping failed: %d\n", wq->id, rc); + goto err_map_portal; + } + + wq->client_count = 0; + + rc = idxd_wq_request_irq(wq); + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_WQ_IRQ_ERR; + dev_dbg(dev, "WQ %d irq setup failed: %d\n", wq->id, rc); + goto err_irq; + } + + rc = idxd_wq_alloc_resources(wq); + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_WQ_RES_ALLOC_ERR; + dev_dbg(dev, "WQ resource alloc failed\n"); + goto err_res_alloc; + } + + rc = idxd_wq_init_percpu_ref(wq); + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_PERCPU_ERR; + dev_dbg(dev, "percpu_ref setup failed\n"); + goto err_ref; + } + + return 0; + +err_ref: + idxd_wq_free_resources(wq); +err_res_alloc: + idxd_wq_free_irq(wq); +err_irq: + idxd_wq_unmap_portal(wq); +err_map_portal: + if (idxd_wq_disable(wq, false)) + dev_dbg(dev, "wq %s disable failed\n", dev_name(wq_confdev(wq))); +err: + return rc; +} + +void drv_disable_wq(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + + lockdep_assert_held(&wq->wq_lock); + + if (idxd_wq_refcount(wq)) + dev_warn(dev, "Clients has claim on wq %d: %d\n", + wq->id, idxd_wq_refcount(wq)); + + idxd_wq_unmap_portal(wq); + idxd_wq_drain(wq); + idxd_wq_free_irq(wq); + idxd_wq_reset(wq); + idxd_wq_free_resources(wq); + percpu_ref_exit(&wq->wq_active); + wq->type = IDXD_WQT_NONE; + wq->client_count = 0; +} + +int idxd_device_drv_probe(struct idxd_dev *idxd_dev) +{ + struct idxd_device *idxd = idxd_dev_to_idxd(idxd_dev); + int rc = 0; + + /* + * Device should be in disabled state for the idxd_drv to load. If it's in + * enabled state, then the device was altered outside of driver's control. + * If the state is in halted state, then we don't want to proceed. + */ + if (idxd->state != IDXD_DEV_DISABLED) { + idxd->cmd_status = IDXD_SCMD_DEV_ENABLED; + return -ENXIO; + } + + /* Device configuration */ + spin_lock(&idxd->dev_lock); + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + rc = idxd_device_config(idxd); + spin_unlock(&idxd->dev_lock); + if (rc < 0) + return -ENXIO; + + /* + * System PASID is preserved across device disable/enable cycle, but + * genconfig register content gets cleared during device reset. We + * need to re-enable user interrupts for kernel work queue completion + * IRQ to function. + */ + if (idxd->pasid != IOMMU_PASID_INVALID) + idxd_set_user_intr(idxd, 1); + + rc = idxd_device_evl_setup(idxd); + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_DEV_EVL_ERR; + return rc; + } + + /* Start device */ + rc = idxd_device_enable(idxd); + if (rc < 0) { + idxd_device_evl_free(idxd); + return rc; + } + + /* Setup DMA device without channels */ + rc = idxd_register_dma_device(idxd); + if (rc < 0) { + idxd_device_disable(idxd); + idxd_device_evl_free(idxd); + idxd->cmd_status = IDXD_SCMD_DEV_DMA_ERR; + return rc; + } + + idxd->cmd_status = 0; + return 0; +} + +void idxd_device_drv_remove(struct idxd_dev *idxd_dev) +{ + struct device *dev = &idxd_dev->conf_dev; + struct idxd_device *idxd = idxd_dev_to_idxd(idxd_dev); + int i; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + struct device *wq_dev = wq_confdev(wq); + + if (wq->state == IDXD_WQ_DISABLED) + continue; + dev_warn(dev, "Active wq %d on disable %s.\n", i, dev_name(wq_dev)); + device_release_driver(wq_dev); + } + + idxd_unregister_dma_device(idxd); + idxd_device_disable(idxd); + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + idxd_device_reset(idxd); + idxd_device_evl_free(idxd); +} + +static enum idxd_dev_type dev_types[] = { + IDXD_DEV_DSA, + IDXD_DEV_IAX, + IDXD_DEV_NONE, +}; + +struct idxd_device_driver idxd_drv = { + .type = dev_types, + .probe = idxd_device_drv_probe, + .remove = idxd_device_drv_remove, + .name = "idxd", +}; +EXPORT_SYMBOL_GPL(idxd_drv); diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c new file mode 100644 index 0000000000..07623fb0f5 --- /dev/null +++ b/drivers/dma/idxd/dma.c @@ -0,0 +1,359 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/device.h> +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/dmaengine.h> +#include <uapi/linux/idxd.h> +#include "../dmaengine.h" +#include "registers.h" +#include "idxd.h" + +static inline struct idxd_wq *to_idxd_wq(struct dma_chan *c) +{ + struct idxd_dma_chan *idxd_chan; + + idxd_chan = container_of(c, struct idxd_dma_chan, chan); + return idxd_chan->wq; +} + +void idxd_dma_complete_txd(struct idxd_desc *desc, + enum idxd_complete_type comp_type, + bool free_desc) +{ + struct idxd_device *idxd = desc->wq->idxd; + struct dma_async_tx_descriptor *tx; + struct dmaengine_result res; + int complete = 1; + + if (desc->completion->status == DSA_COMP_SUCCESS) { + res.result = DMA_TRANS_NOERROR; + } else if (desc->completion->status) { + if (idxd->request_int_handles && comp_type != IDXD_COMPLETE_ABORT && + desc->completion->status == DSA_COMP_INT_HANDLE_INVAL && + idxd_queue_int_handle_resubmit(desc)) + return; + res.result = DMA_TRANS_WRITE_FAILED; + } else if (comp_type == IDXD_COMPLETE_ABORT) { + res.result = DMA_TRANS_ABORTED; + } else { + complete = 0; + } + + tx = &desc->txd; + if (complete && tx->cookie) { + dma_cookie_complete(tx); + dma_descriptor_unmap(tx); + dmaengine_desc_get_callback_invoke(tx, &res); + tx->callback = NULL; + tx->callback_result = NULL; + } + + if (free_desc) + idxd_free_desc(desc->wq, desc); +} + +static void op_flag_setup(unsigned long flags, u32 *desc_flags) +{ + *desc_flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR; + if (flags & DMA_PREP_INTERRUPT) + *desc_flags |= IDXD_OP_FLAG_RCI; +} + +static inline void idxd_prep_desc_common(struct idxd_wq *wq, + struct dsa_hw_desc *hw, char opcode, + u64 addr_f1, u64 addr_f2, u64 len, + u64 compl, u32 flags) +{ + hw->flags = flags; + hw->opcode = opcode; + hw->src_addr = addr_f1; + hw->dst_addr = addr_f2; + hw->xfer_size = len; + /* + * For dedicated WQ, this field is ignored and HW will use the WQCFG.priv + * field instead. This field should be set to 0 for kernel descriptors + * since kernel DMA on VT-d supports "user" privilege only. + */ + hw->priv = 0; + hw->completion_addr = compl; +} + +static struct dma_async_tx_descriptor * +idxd_dma_prep_interrupt(struct dma_chan *c, unsigned long flags) +{ + struct idxd_wq *wq = to_idxd_wq(c); + u32 desc_flags; + struct idxd_desc *desc; + + if (wq->state != IDXD_WQ_ENABLED) + return NULL; + + op_flag_setup(flags, &desc_flags); + desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK); + if (IS_ERR(desc)) + return NULL; + + idxd_prep_desc_common(wq, desc->hw, DSA_OPCODE_NOOP, + 0, 0, 0, desc->compl_dma, desc_flags); + desc->txd.flags = flags; + return &desc->txd; +} + +static struct dma_async_tx_descriptor * +idxd_dma_submit_memcpy(struct dma_chan *c, dma_addr_t dma_dest, + dma_addr_t dma_src, size_t len, unsigned long flags) +{ + struct idxd_wq *wq = to_idxd_wq(c); + u32 desc_flags; + struct idxd_device *idxd = wq->idxd; + struct idxd_desc *desc; + + if (wq->state != IDXD_WQ_ENABLED) + return NULL; + + if (len > idxd->max_xfer_bytes) + return NULL; + + op_flag_setup(flags, &desc_flags); + desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK); + if (IS_ERR(desc)) + return NULL; + + idxd_prep_desc_common(wq, desc->hw, DSA_OPCODE_MEMMOVE, + dma_src, dma_dest, len, desc->compl_dma, + desc_flags); + + desc->txd.flags = flags; + + return &desc->txd; +} + +static int idxd_dma_alloc_chan_resources(struct dma_chan *chan) +{ + struct idxd_wq *wq = to_idxd_wq(chan); + struct device *dev = &wq->idxd->pdev->dev; + + idxd_wq_get(wq); + dev_dbg(dev, "%s: client_count: %d\n", __func__, + idxd_wq_refcount(wq)); + return 0; +} + +static void idxd_dma_free_chan_resources(struct dma_chan *chan) +{ + struct idxd_wq *wq = to_idxd_wq(chan); + struct device *dev = &wq->idxd->pdev->dev; + + idxd_wq_put(wq); + dev_dbg(dev, "%s: client_count: %d\n", __func__, + idxd_wq_refcount(wq)); +} + +static enum dma_status idxd_dma_tx_status(struct dma_chan *dma_chan, + dma_cookie_t cookie, + struct dma_tx_state *txstate) +{ + return DMA_OUT_OF_ORDER; +} + +/* + * issue_pending() does not need to do anything since tx_submit() does the job + * already. + */ +static void idxd_dma_issue_pending(struct dma_chan *dma_chan) +{ +} + +static dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx) +{ + struct dma_chan *c = tx->chan; + struct idxd_wq *wq = to_idxd_wq(c); + dma_cookie_t cookie; + int rc; + struct idxd_desc *desc = container_of(tx, struct idxd_desc, txd); + + cookie = dma_cookie_assign(tx); + + rc = idxd_submit_desc(wq, desc); + if (rc < 0) { + idxd_free_desc(wq, desc); + return rc; + } + + return cookie; +} + +static void idxd_dma_release(struct dma_device *device) +{ + struct idxd_dma_dev *idxd_dma = container_of(device, struct idxd_dma_dev, dma); + + kfree(idxd_dma); +} + +int idxd_register_dma_device(struct idxd_device *idxd) +{ + struct idxd_dma_dev *idxd_dma; + struct dma_device *dma; + struct device *dev = &idxd->pdev->dev; + int rc; + + idxd_dma = kzalloc_node(sizeof(*idxd_dma), GFP_KERNEL, dev_to_node(dev)); + if (!idxd_dma) + return -ENOMEM; + + dma = &idxd_dma->dma; + INIT_LIST_HEAD(&dma->channels); + dma->dev = dev; + + dma_cap_set(DMA_INTERRUPT, dma->cap_mask); + dma_cap_set(DMA_PRIVATE, dma->cap_mask); + dma_cap_set(DMA_COMPLETION_NO_ORDER, dma->cap_mask); + dma->device_release = idxd_dma_release; + + dma->device_prep_dma_interrupt = idxd_dma_prep_interrupt; + if (idxd->hw.opcap.bits[0] & IDXD_OPCAP_MEMMOVE) { + dma_cap_set(DMA_MEMCPY, dma->cap_mask); + dma->device_prep_dma_memcpy = idxd_dma_submit_memcpy; + } + + dma->device_tx_status = idxd_dma_tx_status; + dma->device_issue_pending = idxd_dma_issue_pending; + dma->device_alloc_chan_resources = idxd_dma_alloc_chan_resources; + dma->device_free_chan_resources = idxd_dma_free_chan_resources; + + rc = dma_async_device_register(dma); + if (rc < 0) { + kfree(idxd_dma); + return rc; + } + + idxd_dma->idxd = idxd; + /* + * This pointer is protected by the refs taken by the dma_chan. It will remain valid + * as long as there are outstanding channels. + */ + idxd->idxd_dma = idxd_dma; + return 0; +} + +void idxd_unregister_dma_device(struct idxd_device *idxd) +{ + dma_async_device_unregister(&idxd->idxd_dma->dma); +} + +static int idxd_register_dma_channel(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct dma_device *dma = &idxd->idxd_dma->dma; + struct device *dev = &idxd->pdev->dev; + struct idxd_dma_chan *idxd_chan; + struct dma_chan *chan; + int rc, i; + + idxd_chan = kzalloc_node(sizeof(*idxd_chan), GFP_KERNEL, dev_to_node(dev)); + if (!idxd_chan) + return -ENOMEM; + + chan = &idxd_chan->chan; + chan->device = dma; + list_add_tail(&chan->device_node, &dma->channels); + + for (i = 0; i < wq->num_descs; i++) { + struct idxd_desc *desc = wq->descs[i]; + + dma_async_tx_descriptor_init(&desc->txd, chan); + desc->txd.tx_submit = idxd_dma_tx_submit; + } + + rc = dma_async_device_channel_register(dma, chan); + if (rc < 0) { + kfree(idxd_chan); + return rc; + } + + wq->idxd_chan = idxd_chan; + idxd_chan->wq = wq; + get_device(wq_confdev(wq)); + + return 0; +} + +static void idxd_unregister_dma_channel(struct idxd_wq *wq) +{ + struct idxd_dma_chan *idxd_chan = wq->idxd_chan; + struct dma_chan *chan = &idxd_chan->chan; + struct idxd_dma_dev *idxd_dma = wq->idxd->idxd_dma; + + dma_async_device_channel_unregister(&idxd_dma->dma, chan); + list_del(&chan->device_node); + kfree(wq->idxd_chan); + wq->idxd_chan = NULL; + put_device(wq_confdev(wq)); +} + +static int idxd_dmaengine_drv_probe(struct idxd_dev *idxd_dev) +{ + struct device *dev = &idxd_dev->conf_dev; + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + struct idxd_device *idxd = wq->idxd; + int rc; + + if (idxd->state != IDXD_DEV_ENABLED) + return -ENXIO; + + mutex_lock(&wq->wq_lock); + wq->type = IDXD_WQT_KERNEL; + + rc = drv_enable_wq(wq); + if (rc < 0) { + dev_dbg(dev, "Enable wq %d failed: %d\n", wq->id, rc); + rc = -ENXIO; + goto err; + } + + rc = idxd_register_dma_channel(wq); + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_DMA_CHAN_ERR; + dev_dbg(dev, "Failed to register dma channel\n"); + goto err_dma; + } + + idxd->cmd_status = 0; + mutex_unlock(&wq->wq_lock); + return 0; + +err_dma: + drv_disable_wq(wq); +err: + wq->type = IDXD_WQT_NONE; + mutex_unlock(&wq->wq_lock); + return rc; +} + +static void idxd_dmaengine_drv_remove(struct idxd_dev *idxd_dev) +{ + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + + mutex_lock(&wq->wq_lock); + __idxd_wq_quiesce(wq); + idxd_unregister_dma_channel(wq); + drv_disable_wq(wq); + mutex_unlock(&wq->wq_lock); +} + +static enum idxd_dev_type dev_types[] = { + IDXD_DEV_WQ, + IDXD_DEV_NONE, +}; + +struct idxd_device_driver idxd_dmaengine_drv = { + .probe = idxd_dmaengine_drv_probe, + .remove = idxd_dmaengine_drv_remove, + .name = "dmaengine", + .type = dev_types, +}; +EXPORT_SYMBOL_GPL(idxd_dmaengine_drv); diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h new file mode 100644 index 0000000000..e269ca1f48 --- /dev/null +++ b/drivers/dma/idxd/idxd.h @@ -0,0 +1,752 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */ +#ifndef _IDXD_H_ +#define _IDXD_H_ + +#include <linux/sbitmap.h> +#include <linux/dmaengine.h> +#include <linux/percpu-rwsem.h> +#include <linux/wait.h> +#include <linux/cdev.h> +#include <linux/idr.h> +#include <linux/pci.h> +#include <linux/bitmap.h> +#include <linux/perf_event.h> +#include <linux/iommu.h> +#include <uapi/linux/idxd.h> +#include "registers.h" + +#define IDXD_DRIVER_VERSION "1.00" + +extern struct kmem_cache *idxd_desc_pool; +extern bool tc_override; + +struct idxd_wq; +struct idxd_dev; + +enum idxd_dev_type { + IDXD_DEV_NONE = -1, + IDXD_DEV_DSA = 0, + IDXD_DEV_IAX, + IDXD_DEV_WQ, + IDXD_DEV_GROUP, + IDXD_DEV_ENGINE, + IDXD_DEV_CDEV, + IDXD_DEV_CDEV_FILE, + IDXD_DEV_MAX_TYPE, +}; + +struct idxd_dev { + struct device conf_dev; + enum idxd_dev_type type; +}; + +#define IDXD_REG_TIMEOUT 50 +#define IDXD_DRAIN_TIMEOUT 5000 + +enum idxd_type { + IDXD_TYPE_UNKNOWN = -1, + IDXD_TYPE_DSA = 0, + IDXD_TYPE_IAX, + IDXD_TYPE_MAX, +}; + +#define IDXD_NAME_SIZE 128 +#define IDXD_PMU_EVENT_MAX 64 + +#define IDXD_ENQCMDS_RETRIES 32 +#define IDXD_ENQCMDS_MAX_RETRIES 64 + +struct idxd_device_driver { + const char *name; + enum idxd_dev_type *type; + int (*probe)(struct idxd_dev *idxd_dev); + void (*remove)(struct idxd_dev *idxd_dev); + struct device_driver drv; +}; + +extern struct idxd_device_driver dsa_drv; +extern struct idxd_device_driver idxd_drv; +extern struct idxd_device_driver idxd_dmaengine_drv; +extern struct idxd_device_driver idxd_user_drv; + +#define INVALID_INT_HANDLE -1 +struct idxd_irq_entry { + int id; + int vector; + struct llist_head pending_llist; + struct list_head work_list; + /* + * Lock to protect access between irq thread process descriptor + * and irq thread processing error descriptor. + */ + spinlock_t list_lock; + int int_handle; + ioasid_t pasid; +}; + +struct idxd_group { + struct idxd_dev idxd_dev; + struct idxd_device *idxd; + struct grpcfg grpcfg; + int id; + int num_engines; + int num_wqs; + bool use_rdbuf_limit; + u8 rdbufs_allowed; + u8 rdbufs_reserved; + int tc_a; + int tc_b; + int desc_progress_limit; + int batch_progress_limit; +}; + +struct idxd_pmu { + struct idxd_device *idxd; + + struct perf_event *event_list[IDXD_PMU_EVENT_MAX]; + int n_events; + + DECLARE_BITMAP(used_mask, IDXD_PMU_EVENT_MAX); + + struct pmu pmu; + char name[IDXD_NAME_SIZE]; + int cpu; + + int n_counters; + int counter_width; + int n_event_categories; + + bool per_counter_caps_supported; + unsigned long supported_event_categories; + + unsigned long supported_filters; + int n_filters; + + struct hlist_node cpuhp_node; +}; + +#define IDXD_MAX_PRIORITY 0xf + +enum { + COUNTER_FAULTS = 0, + COUNTER_FAULT_FAILS, + COUNTER_MAX +}; + +enum idxd_wq_state { + IDXD_WQ_DISABLED = 0, + IDXD_WQ_ENABLED, +}; + +enum idxd_wq_flag { + WQ_FLAG_DEDICATED = 0, + WQ_FLAG_BLOCK_ON_FAULT, + WQ_FLAG_ATS_DISABLE, + WQ_FLAG_PRS_DISABLE, +}; + +enum idxd_wq_type { + IDXD_WQT_NONE = 0, + IDXD_WQT_KERNEL, + IDXD_WQT_USER, +}; + +struct idxd_cdev { + struct idxd_wq *wq; + struct cdev cdev; + struct idxd_dev idxd_dev; + int minor; +}; + +#define IDXD_ALLOCATED_BATCH_SIZE 128U +#define WQ_NAME_SIZE 1024 +#define WQ_TYPE_SIZE 10 + +#define WQ_DEFAULT_QUEUE_DEPTH 16 +#define WQ_DEFAULT_MAX_XFER SZ_2M +#define WQ_DEFAULT_MAX_BATCH 32 + +enum idxd_op_type { + IDXD_OP_BLOCK = 0, + IDXD_OP_NONBLOCK = 1, +}; + +enum idxd_complete_type { + IDXD_COMPLETE_NORMAL = 0, + IDXD_COMPLETE_ABORT, + IDXD_COMPLETE_DEV_FAIL, +}; + +struct idxd_dma_chan { + struct dma_chan chan; + struct idxd_wq *wq; +}; + +struct idxd_wq { + void __iomem *portal; + u32 portal_offset; + unsigned int enqcmds_retries; + struct percpu_ref wq_active; + struct completion wq_dead; + struct completion wq_resurrect; + struct idxd_dev idxd_dev; + struct idxd_cdev *idxd_cdev; + struct wait_queue_head err_queue; + struct workqueue_struct *wq; + struct idxd_device *idxd; + int id; + struct idxd_irq_entry ie; + enum idxd_wq_type type; + struct idxd_group *group; + int client_count; + struct mutex wq_lock; /* mutex for workqueue */ + u32 size; + u32 threshold; + u32 priority; + enum idxd_wq_state state; + unsigned long flags; + union wqcfg *wqcfg; + unsigned long *opcap_bmap; + + struct dsa_hw_desc **hw_descs; + int num_descs; + union { + struct dsa_completion_record *compls; + struct iax_completion_record *iax_compls; + }; + dma_addr_t compls_addr; + int compls_size; + struct idxd_desc **descs; + struct sbitmap_queue sbq; + struct idxd_dma_chan *idxd_chan; + char name[WQ_NAME_SIZE + 1]; + u64 max_xfer_bytes; + u32 max_batch_size; + + /* Lock to protect upasid_xa access. */ + struct mutex uc_lock; + struct xarray upasid_xa; +}; + +struct idxd_engine { + struct idxd_dev idxd_dev; + int id; + struct idxd_group *group; + struct idxd_device *idxd; +}; + +/* shadow registers */ +struct idxd_hw { + u32 version; + union gen_cap_reg gen_cap; + union wq_cap_reg wq_cap; + union group_cap_reg group_cap; + union engine_cap_reg engine_cap; + struct opcap opcap; + u32 cmd_cap; + union iaa_cap_reg iaa_cap; +}; + +enum idxd_device_state { + IDXD_DEV_HALTED = -1, + IDXD_DEV_DISABLED = 0, + IDXD_DEV_ENABLED, +}; + +enum idxd_device_flag { + IDXD_FLAG_CONFIGURABLE = 0, + IDXD_FLAG_CMD_RUNNING, + IDXD_FLAG_PASID_ENABLED, + IDXD_FLAG_USER_PASID_ENABLED, +}; + +struct idxd_dma_dev { + struct idxd_device *idxd; + struct dma_device dma; +}; + +struct idxd_driver_data { + const char *name_prefix; + enum idxd_type type; + struct device_type *dev_type; + int compl_size; + int align; + int evl_cr_off; + int cr_status_off; + int cr_result_off; +}; + +struct idxd_evl { + /* Lock to protect event log access. */ + spinlock_t lock; + void *log; + dma_addr_t dma; + /* Total size of event log = number of entries * entry size. */ + unsigned int log_size; + /* The number of entries in the event log. */ + u16 size; + u16 head; + unsigned long *bmap; + bool batch_fail[IDXD_MAX_BATCH_IDENT]; +}; + +struct idxd_evl_fault { + struct work_struct work; + struct idxd_wq *wq; + u8 status; + + /* make this last member always */ + struct __evl_entry entry[]; +}; + +struct idxd_device { + struct idxd_dev idxd_dev; + struct idxd_driver_data *data; + struct list_head list; + struct idxd_hw hw; + enum idxd_device_state state; + unsigned long flags; + int id; + int major; + u32 cmd_status; + struct idxd_irq_entry ie; /* misc irq, msix 0 */ + + struct pci_dev *pdev; + void __iomem *reg_base; + + spinlock_t dev_lock; /* spinlock for device */ + spinlock_t cmd_lock; /* spinlock for device commands */ + struct completion *cmd_done; + struct idxd_group **groups; + struct idxd_wq **wqs; + struct idxd_engine **engines; + + struct iommu_sva *sva; + unsigned int pasid; + + int num_groups; + int irq_cnt; + bool request_int_handles; + + u32 msix_perm_offset; + u32 wqcfg_offset; + u32 grpcfg_offset; + u32 perfmon_offset; + + u64 max_xfer_bytes; + u32 max_batch_size; + int max_groups; + int max_engines; + int max_rdbufs; + int max_wqs; + int max_wq_size; + int rdbuf_limit; + int nr_rdbufs; /* non-reserved read buffers */ + unsigned int wqcfg_size; + unsigned long *wq_enable_map; + + union sw_err_reg sw_err; + wait_queue_head_t cmd_waitq; + + struct idxd_dma_dev *idxd_dma; + struct workqueue_struct *wq; + struct work_struct work; + + struct idxd_pmu *idxd_pmu; + + unsigned long *opcap_bmap; + struct idxd_evl *evl; + struct kmem_cache *evl_cache; + + struct dentry *dbgfs_dir; + struct dentry *dbgfs_evl_file; +}; + +static inline unsigned int evl_ent_size(struct idxd_device *idxd) +{ + return idxd->hw.gen_cap.evl_support ? + (32 * (1 << idxd->hw.gen_cap.evl_support)) : 0; +} + +static inline unsigned int evl_size(struct idxd_device *idxd) +{ + return idxd->evl->size * evl_ent_size(idxd); +} + +/* IDXD software descriptor */ +struct idxd_desc { + union { + struct dsa_hw_desc *hw; + struct iax_hw_desc *iax_hw; + }; + dma_addr_t desc_dma; + union { + struct dsa_completion_record *completion; + struct iax_completion_record *iax_completion; + }; + dma_addr_t compl_dma; + struct dma_async_tx_descriptor txd; + struct llist_node llnode; + struct list_head list; + int id; + int cpu; + struct idxd_wq *wq; +}; + +/* + * This is software defined error for the completion status. We overload the error code + * that will never appear in completion status and only SWERR register. + */ +enum idxd_completion_status { + IDXD_COMP_DESC_ABORT = 0xff, +}; + +#define idxd_confdev(idxd) &idxd->idxd_dev.conf_dev +#define wq_confdev(wq) &wq->idxd_dev.conf_dev +#define engine_confdev(engine) &engine->idxd_dev.conf_dev +#define group_confdev(group) &group->idxd_dev.conf_dev +#define cdev_dev(cdev) &cdev->idxd_dev.conf_dev +#define user_ctx_dev(ctx) (&(ctx)->idxd_dev.conf_dev) + +#define confdev_to_idxd_dev(dev) container_of(dev, struct idxd_dev, conf_dev) +#define idxd_dev_to_idxd(idxd_dev) container_of(idxd_dev, struct idxd_device, idxd_dev) +#define idxd_dev_to_wq(idxd_dev) container_of(idxd_dev, struct idxd_wq, idxd_dev) + +static inline struct idxd_device *confdev_to_idxd(struct device *dev) +{ + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return idxd_dev_to_idxd(idxd_dev); +} + +static inline struct idxd_wq *confdev_to_wq(struct device *dev) +{ + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return idxd_dev_to_wq(idxd_dev); +} + +static inline struct idxd_engine *confdev_to_engine(struct device *dev) +{ + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return container_of(idxd_dev, struct idxd_engine, idxd_dev); +} + +static inline struct idxd_group *confdev_to_group(struct device *dev) +{ + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return container_of(idxd_dev, struct idxd_group, idxd_dev); +} + +static inline struct idxd_cdev *dev_to_cdev(struct device *dev) +{ + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return container_of(idxd_dev, struct idxd_cdev, idxd_dev); +} + +static inline void idxd_dev_set_type(struct idxd_dev *idev, int type) +{ + if (type >= IDXD_DEV_MAX_TYPE) { + idev->type = IDXD_DEV_NONE; + return; + } + + idev->type = type; +} + +static inline struct idxd_irq_entry *idxd_get_ie(struct idxd_device *idxd, int idx) +{ + return (idx == 0) ? &idxd->ie : &idxd->wqs[idx - 1]->ie; +} + +static inline struct idxd_wq *ie_to_wq(struct idxd_irq_entry *ie) +{ + return container_of(ie, struct idxd_wq, ie); +} + +static inline struct idxd_device *ie_to_idxd(struct idxd_irq_entry *ie) +{ + return container_of(ie, struct idxd_device, ie); +} + +static inline void idxd_set_user_intr(struct idxd_device *idxd, bool enable) +{ + union gencfg_reg reg; + + reg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET); + reg.user_int_en = enable; + iowrite32(reg.bits, idxd->reg_base + IDXD_GENCFG_OFFSET); +} + +extern struct bus_type dsa_bus_type; + +extern bool support_enqcmd; +extern struct ida idxd_ida; +extern struct device_type dsa_device_type; +extern struct device_type iax_device_type; +extern struct device_type idxd_wq_device_type; +extern struct device_type idxd_engine_device_type; +extern struct device_type idxd_group_device_type; + +static inline bool is_dsa_dev(struct idxd_dev *idxd_dev) +{ + return idxd_dev->type == IDXD_DEV_DSA; +} + +static inline bool is_iax_dev(struct idxd_dev *idxd_dev) +{ + return idxd_dev->type == IDXD_DEV_IAX; +} + +static inline bool is_idxd_dev(struct idxd_dev *idxd_dev) +{ + return is_dsa_dev(idxd_dev) || is_iax_dev(idxd_dev); +} + +static inline bool is_idxd_wq_dev(struct idxd_dev *idxd_dev) +{ + return idxd_dev->type == IDXD_DEV_WQ; +} + +static inline bool is_idxd_wq_dmaengine(struct idxd_wq *wq) +{ + if (wq->type == IDXD_WQT_KERNEL && strcmp(wq->name, "dmaengine") == 0) + return true; + return false; +} + +static inline bool is_idxd_wq_user(struct idxd_wq *wq) +{ + return wq->type == IDXD_WQT_USER; +} + +static inline bool is_idxd_wq_kernel(struct idxd_wq *wq) +{ + return wq->type == IDXD_WQT_KERNEL; +} + +static inline bool wq_dedicated(struct idxd_wq *wq) +{ + return test_bit(WQ_FLAG_DEDICATED, &wq->flags); +} + +static inline bool wq_shared(struct idxd_wq *wq) +{ + return !test_bit(WQ_FLAG_DEDICATED, &wq->flags); +} + +static inline bool device_pasid_enabled(struct idxd_device *idxd) +{ + return test_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags); +} + +static inline bool device_user_pasid_enabled(struct idxd_device *idxd) +{ + return test_bit(IDXD_FLAG_USER_PASID_ENABLED, &idxd->flags); +} + +static inline bool wq_pasid_enabled(struct idxd_wq *wq) +{ + return (is_idxd_wq_kernel(wq) && device_pasid_enabled(wq->idxd)) || + (is_idxd_wq_user(wq) && device_user_pasid_enabled(wq->idxd)); +} + +static inline bool wq_shared_supported(struct idxd_wq *wq) +{ + return (support_enqcmd && wq_pasid_enabled(wq)); +} + +enum idxd_portal_prot { + IDXD_PORTAL_UNLIMITED = 0, + IDXD_PORTAL_LIMITED, +}; + +enum idxd_interrupt_type { + IDXD_IRQ_MSIX = 0, + IDXD_IRQ_IMS, +}; + +static inline int idxd_get_wq_portal_offset(enum idxd_portal_prot prot) +{ + return prot * 0x1000; +} + +static inline int idxd_get_wq_portal_full_offset(int wq_id, + enum idxd_portal_prot prot) +{ + return ((wq_id * 4) << PAGE_SHIFT) + idxd_get_wq_portal_offset(prot); +} + +#define IDXD_PORTAL_MASK (PAGE_SIZE - 1) + +/* + * Even though this function can be accessed by multiple threads, it is safe to use. + * At worst the address gets used more than once before it gets incremented. We don't + * hit a threshold until iops becomes many million times a second. So the occasional + * reuse of the same address is tolerable compare to using an atomic variable. This is + * safe on a system that has atomic load/store for 32bit integers. Given that this is an + * Intel iEP device, that should not be a problem. + */ +static inline void __iomem *idxd_wq_portal_addr(struct idxd_wq *wq) +{ + int ofs = wq->portal_offset; + + wq->portal_offset = (ofs + sizeof(struct dsa_raw_desc)) & IDXD_PORTAL_MASK; + return wq->portal + ofs; +} + +static inline void idxd_wq_get(struct idxd_wq *wq) +{ + wq->client_count++; +} + +static inline void idxd_wq_put(struct idxd_wq *wq) +{ + wq->client_count--; +} + +static inline int idxd_wq_refcount(struct idxd_wq *wq) +{ + return wq->client_count; +}; + +/* + * Intel IAA does not support batch processing. + * The max batch size of device, max batch size of wq and + * max batch shift of wqcfg should be always 0 on IAA. + */ +static inline void idxd_set_max_batch_size(int idxd_type, struct idxd_device *idxd, + u32 max_batch_size) +{ + if (idxd_type == IDXD_TYPE_IAX) + idxd->max_batch_size = 0; + else + idxd->max_batch_size = max_batch_size; +} + +static inline void idxd_wq_set_max_batch_size(int idxd_type, struct idxd_wq *wq, + u32 max_batch_size) +{ + if (idxd_type == IDXD_TYPE_IAX) + wq->max_batch_size = 0; + else + wq->max_batch_size = max_batch_size; +} + +static inline void idxd_wqcfg_set_max_batch_shift(int idxd_type, union wqcfg *wqcfg, + u32 max_batch_shift) +{ + if (idxd_type == IDXD_TYPE_IAX) + wqcfg->max_batch_shift = 0; + else + wqcfg->max_batch_shift = max_batch_shift; +} + +int __must_check __idxd_driver_register(struct idxd_device_driver *idxd_drv, + struct module *module, const char *mod_name); +#define idxd_driver_register(driver) \ + __idxd_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) + +void idxd_driver_unregister(struct idxd_device_driver *idxd_drv); + +#define module_idxd_driver(__idxd_driver) \ + module_driver(__idxd_driver, idxd_driver_register, idxd_driver_unregister) + +int idxd_register_bus_type(void); +void idxd_unregister_bus_type(void); +int idxd_register_devices(struct idxd_device *idxd); +void idxd_unregister_devices(struct idxd_device *idxd); +void idxd_wqs_quiesce(struct idxd_device *idxd); +bool idxd_queue_int_handle_resubmit(struct idxd_desc *desc); +void multi_u64_to_bmap(unsigned long *bmap, u64 *val, int count); + +/* device interrupt control */ +irqreturn_t idxd_misc_thread(int vec, void *data); +irqreturn_t idxd_wq_thread(int irq, void *data); +void idxd_mask_error_interrupts(struct idxd_device *idxd); +void idxd_unmask_error_interrupts(struct idxd_device *idxd); + +/* device control */ +int idxd_device_drv_probe(struct idxd_dev *idxd_dev); +void idxd_device_drv_remove(struct idxd_dev *idxd_dev); +int drv_enable_wq(struct idxd_wq *wq); +void drv_disable_wq(struct idxd_wq *wq); +int idxd_device_init_reset(struct idxd_device *idxd); +int idxd_device_enable(struct idxd_device *idxd); +int idxd_device_disable(struct idxd_device *idxd); +void idxd_device_reset(struct idxd_device *idxd); +void idxd_device_clear_state(struct idxd_device *idxd); +int idxd_device_config(struct idxd_device *idxd); +void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid); +int idxd_device_load_config(struct idxd_device *idxd); +int idxd_device_request_int_handle(struct idxd_device *idxd, int idx, int *handle, + enum idxd_interrupt_type irq_type); +int idxd_device_release_int_handle(struct idxd_device *idxd, int handle, + enum idxd_interrupt_type irq_type); + +/* work queue control */ +void idxd_wqs_unmap_portal(struct idxd_device *idxd); +int idxd_wq_alloc_resources(struct idxd_wq *wq); +void idxd_wq_free_resources(struct idxd_wq *wq); +int idxd_wq_enable(struct idxd_wq *wq); +int idxd_wq_disable(struct idxd_wq *wq, bool reset_config); +void idxd_wq_drain(struct idxd_wq *wq); +void idxd_wq_reset(struct idxd_wq *wq); +int idxd_wq_map_portal(struct idxd_wq *wq); +void idxd_wq_unmap_portal(struct idxd_wq *wq); +int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid); +int idxd_wq_disable_pasid(struct idxd_wq *wq); +void __idxd_wq_quiesce(struct idxd_wq *wq); +void idxd_wq_quiesce(struct idxd_wq *wq); +int idxd_wq_init_percpu_ref(struct idxd_wq *wq); +void idxd_wq_free_irq(struct idxd_wq *wq); +int idxd_wq_request_irq(struct idxd_wq *wq); + +/* submission */ +int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc); +struct idxd_desc *idxd_alloc_desc(struct idxd_wq *wq, enum idxd_op_type optype); +void idxd_free_desc(struct idxd_wq *wq, struct idxd_desc *desc); +int idxd_enqcmds(struct idxd_wq *wq, void __iomem *portal, const void *desc); + +/* dmaengine */ +int idxd_register_dma_device(struct idxd_device *idxd); +void idxd_unregister_dma_device(struct idxd_device *idxd); +void idxd_dma_complete_txd(struct idxd_desc *desc, + enum idxd_complete_type comp_type, bool free_desc); + +/* cdev */ +int idxd_cdev_register(void); +void idxd_cdev_remove(void); +int idxd_cdev_get_major(struct idxd_device *idxd); +int idxd_wq_add_cdev(struct idxd_wq *wq); +void idxd_wq_del_cdev(struct idxd_wq *wq); +int idxd_copy_cr(struct idxd_wq *wq, ioasid_t pasid, unsigned long addr, + void *buf, int len); +void idxd_user_counter_increment(struct idxd_wq *wq, u32 pasid, int index); + +/* perfmon */ +#if IS_ENABLED(CONFIG_INTEL_IDXD_PERFMON) +int perfmon_pmu_init(struct idxd_device *idxd); +void perfmon_pmu_remove(struct idxd_device *idxd); +void perfmon_counter_overflow(struct idxd_device *idxd); +void perfmon_init(void); +void perfmon_exit(void); +#else +static inline int perfmon_pmu_init(struct idxd_device *idxd) { return 0; } +static inline void perfmon_pmu_remove(struct idxd_device *idxd) {} +static inline void perfmon_counter_overflow(struct idxd_device *idxd) {} +static inline void perfmon_init(void) {} +static inline void perfmon_exit(void) {} +#endif + +/* debugfs */ +int idxd_device_init_debugfs(struct idxd_device *idxd); +void idxd_device_remove_debugfs(struct idxd_device *idxd); +int idxd_init_debugfs(void); +void idxd_remove_debugfs(void); + +#endif diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c new file mode 100644 index 0000000000..0eb1c827a2 --- /dev/null +++ b/drivers/dma/idxd/init.c @@ -0,0 +1,913 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/dma-mapping.h> +#include <linux/workqueue.h> +#include <linux/fs.h> +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/device.h> +#include <linux/idr.h> +#include <linux/iommu.h> +#include <uapi/linux/idxd.h> +#include <linux/dmaengine.h> +#include "../dmaengine.h" +#include "registers.h" +#include "idxd.h" +#include "perfmon.h" + +MODULE_VERSION(IDXD_DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); +MODULE_IMPORT_NS(IDXD); + +static bool sva = true; +module_param(sva, bool, 0644); +MODULE_PARM_DESC(sva, "Toggle SVA support on/off"); + +bool tc_override; +module_param(tc_override, bool, 0644); +MODULE_PARM_DESC(tc_override, "Override traffic class defaults"); + +#define DRV_NAME "idxd" + +bool support_enqcmd; +DEFINE_IDA(idxd_ida); + +static struct idxd_driver_data idxd_driver_data[] = { + [IDXD_TYPE_DSA] = { + .name_prefix = "dsa", + .type = IDXD_TYPE_DSA, + .compl_size = sizeof(struct dsa_completion_record), + .align = 32, + .dev_type = &dsa_device_type, + .evl_cr_off = offsetof(struct dsa_evl_entry, cr), + .cr_status_off = offsetof(struct dsa_completion_record, status), + .cr_result_off = offsetof(struct dsa_completion_record, result), + }, + [IDXD_TYPE_IAX] = { + .name_prefix = "iax", + .type = IDXD_TYPE_IAX, + .compl_size = sizeof(struct iax_completion_record), + .align = 64, + .dev_type = &iax_device_type, + .evl_cr_off = offsetof(struct iax_evl_entry, cr), + .cr_status_off = offsetof(struct iax_completion_record, status), + .cr_result_off = offsetof(struct iax_completion_record, error_code), + }, +}; + +static struct pci_device_id idxd_pci_tbl[] = { + /* DSA ver 1.0 platforms */ + { PCI_DEVICE_DATA(INTEL, DSA_SPR0, &idxd_driver_data[IDXD_TYPE_DSA]) }, + + /* IAX ver 1.0 platforms */ + { PCI_DEVICE_DATA(INTEL, IAX_SPR0, &idxd_driver_data[IDXD_TYPE_IAX]) }, + { 0, } +}; +MODULE_DEVICE_TABLE(pci, idxd_pci_tbl); + +static int idxd_setup_interrupts(struct idxd_device *idxd) +{ + struct pci_dev *pdev = idxd->pdev; + struct device *dev = &pdev->dev; + struct idxd_irq_entry *ie; + int i, msixcnt; + int rc = 0; + + msixcnt = pci_msix_vec_count(pdev); + if (msixcnt < 0) { + dev_err(dev, "Not MSI-X interrupt capable.\n"); + return -ENOSPC; + } + idxd->irq_cnt = msixcnt; + + rc = pci_alloc_irq_vectors(pdev, msixcnt, msixcnt, PCI_IRQ_MSIX); + if (rc != msixcnt) { + dev_err(dev, "Failed enabling %d MSIX entries: %d\n", msixcnt, rc); + return -ENOSPC; + } + dev_dbg(dev, "Enabled %d msix vectors\n", msixcnt); + + + ie = idxd_get_ie(idxd, 0); + ie->vector = pci_irq_vector(pdev, 0); + rc = request_threaded_irq(ie->vector, NULL, idxd_misc_thread, 0, "idxd-misc", ie); + if (rc < 0) { + dev_err(dev, "Failed to allocate misc interrupt.\n"); + goto err_misc_irq; + } + dev_dbg(dev, "Requested idxd-misc handler on msix vector %d\n", ie->vector); + + for (i = 0; i < idxd->max_wqs; i++) { + int msix_idx = i + 1; + + ie = idxd_get_ie(idxd, msix_idx); + ie->id = msix_idx; + ie->int_handle = INVALID_INT_HANDLE; + ie->pasid = IOMMU_PASID_INVALID; + + spin_lock_init(&ie->list_lock); + init_llist_head(&ie->pending_llist); + INIT_LIST_HEAD(&ie->work_list); + } + + idxd_unmask_error_interrupts(idxd); + return 0; + + err_misc_irq: + idxd_mask_error_interrupts(idxd); + pci_free_irq_vectors(pdev); + dev_err(dev, "No usable interrupts\n"); + return rc; +} + +static void idxd_cleanup_interrupts(struct idxd_device *idxd) +{ + struct pci_dev *pdev = idxd->pdev; + struct idxd_irq_entry *ie; + int msixcnt; + + msixcnt = pci_msix_vec_count(pdev); + if (msixcnt <= 0) + return; + + ie = idxd_get_ie(idxd, 0); + idxd_mask_error_interrupts(idxd); + free_irq(ie->vector, ie); + pci_free_irq_vectors(pdev); +} + +static int idxd_setup_wqs(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + struct idxd_wq *wq; + struct device *conf_dev; + int i, rc; + + idxd->wqs = kcalloc_node(idxd->max_wqs, sizeof(struct idxd_wq *), + GFP_KERNEL, dev_to_node(dev)); + if (!idxd->wqs) + return -ENOMEM; + + idxd->wq_enable_map = bitmap_zalloc_node(idxd->max_wqs, GFP_KERNEL, dev_to_node(dev)); + if (!idxd->wq_enable_map) { + kfree(idxd->wqs); + return -ENOMEM; + } + + for (i = 0; i < idxd->max_wqs; i++) { + wq = kzalloc_node(sizeof(*wq), GFP_KERNEL, dev_to_node(dev)); + if (!wq) { + rc = -ENOMEM; + goto err; + } + + idxd_dev_set_type(&wq->idxd_dev, IDXD_DEV_WQ); + conf_dev = wq_confdev(wq); + wq->id = i; + wq->idxd = idxd; + device_initialize(wq_confdev(wq)); + conf_dev->parent = idxd_confdev(idxd); + conf_dev->bus = &dsa_bus_type; + conf_dev->type = &idxd_wq_device_type; + rc = dev_set_name(conf_dev, "wq%d.%d", idxd->id, wq->id); + if (rc < 0) { + put_device(conf_dev); + goto err; + } + + mutex_init(&wq->wq_lock); + init_waitqueue_head(&wq->err_queue); + init_completion(&wq->wq_dead); + init_completion(&wq->wq_resurrect); + wq->max_xfer_bytes = WQ_DEFAULT_MAX_XFER; + idxd_wq_set_max_batch_size(idxd->data->type, wq, WQ_DEFAULT_MAX_BATCH); + wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES; + wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev)); + if (!wq->wqcfg) { + put_device(conf_dev); + rc = -ENOMEM; + goto err; + } + + if (idxd->hw.wq_cap.op_config) { + wq->opcap_bmap = bitmap_zalloc(IDXD_MAX_OPCAP_BITS, GFP_KERNEL); + if (!wq->opcap_bmap) { + put_device(conf_dev); + rc = -ENOMEM; + goto err; + } + bitmap_copy(wq->opcap_bmap, idxd->opcap_bmap, IDXD_MAX_OPCAP_BITS); + } + mutex_init(&wq->uc_lock); + xa_init(&wq->upasid_xa); + idxd->wqs[i] = wq; + } + + return 0; + + err: + while (--i >= 0) { + wq = idxd->wqs[i]; + conf_dev = wq_confdev(wq); + put_device(conf_dev); + } + return rc; +} + +static int idxd_setup_engines(struct idxd_device *idxd) +{ + struct idxd_engine *engine; + struct device *dev = &idxd->pdev->dev; + struct device *conf_dev; + int i, rc; + + idxd->engines = kcalloc_node(idxd->max_engines, sizeof(struct idxd_engine *), + GFP_KERNEL, dev_to_node(dev)); + if (!idxd->engines) + return -ENOMEM; + + for (i = 0; i < idxd->max_engines; i++) { + engine = kzalloc_node(sizeof(*engine), GFP_KERNEL, dev_to_node(dev)); + if (!engine) { + rc = -ENOMEM; + goto err; + } + + idxd_dev_set_type(&engine->idxd_dev, IDXD_DEV_ENGINE); + conf_dev = engine_confdev(engine); + engine->id = i; + engine->idxd = idxd; + device_initialize(conf_dev); + conf_dev->parent = idxd_confdev(idxd); + conf_dev->bus = &dsa_bus_type; + conf_dev->type = &idxd_engine_device_type; + rc = dev_set_name(conf_dev, "engine%d.%d", idxd->id, engine->id); + if (rc < 0) { + put_device(conf_dev); + goto err; + } + + idxd->engines[i] = engine; + } + + return 0; + + err: + while (--i >= 0) { + engine = idxd->engines[i]; + conf_dev = engine_confdev(engine); + put_device(conf_dev); + } + return rc; +} + +static int idxd_setup_groups(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + struct device *conf_dev; + struct idxd_group *group; + int i, rc; + + idxd->groups = kcalloc_node(idxd->max_groups, sizeof(struct idxd_group *), + GFP_KERNEL, dev_to_node(dev)); + if (!idxd->groups) + return -ENOMEM; + + for (i = 0; i < idxd->max_groups; i++) { + group = kzalloc_node(sizeof(*group), GFP_KERNEL, dev_to_node(dev)); + if (!group) { + rc = -ENOMEM; + goto err; + } + + idxd_dev_set_type(&group->idxd_dev, IDXD_DEV_GROUP); + conf_dev = group_confdev(group); + group->id = i; + group->idxd = idxd; + device_initialize(conf_dev); + conf_dev->parent = idxd_confdev(idxd); + conf_dev->bus = &dsa_bus_type; + conf_dev->type = &idxd_group_device_type; + rc = dev_set_name(conf_dev, "group%d.%d", idxd->id, group->id); + if (rc < 0) { + put_device(conf_dev); + goto err; + } + + idxd->groups[i] = group; + if (idxd->hw.version <= DEVICE_VERSION_2 && !tc_override) { + group->tc_a = 1; + group->tc_b = 1; + } else { + group->tc_a = -1; + group->tc_b = -1; + } + /* + * The default value is the same as the value of + * total read buffers in GRPCAP. + */ + group->rdbufs_allowed = idxd->max_rdbufs; + } + + return 0; + + err: + while (--i >= 0) { + group = idxd->groups[i]; + put_device(group_confdev(group)); + } + return rc; +} + +static void idxd_cleanup_internals(struct idxd_device *idxd) +{ + int i; + + for (i = 0; i < idxd->max_groups; i++) + put_device(group_confdev(idxd->groups[i])); + for (i = 0; i < idxd->max_engines; i++) + put_device(engine_confdev(idxd->engines[i])); + for (i = 0; i < idxd->max_wqs; i++) + put_device(wq_confdev(idxd->wqs[i])); + destroy_workqueue(idxd->wq); +} + +static int idxd_init_evl(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + struct idxd_evl *evl; + + if (idxd->hw.gen_cap.evl_support == 0) + return 0; + + evl = kzalloc_node(sizeof(*evl), GFP_KERNEL, dev_to_node(dev)); + if (!evl) + return -ENOMEM; + + spin_lock_init(&evl->lock); + evl->size = IDXD_EVL_SIZE_MIN; + + idxd->evl_cache = kmem_cache_create(dev_name(idxd_confdev(idxd)), + sizeof(struct idxd_evl_fault) + evl_ent_size(idxd), + 0, 0, NULL); + if (!idxd->evl_cache) { + kfree(evl); + return -ENOMEM; + } + + idxd->evl = evl; + return 0; +} + +static int idxd_setup_internals(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + int rc, i; + + init_waitqueue_head(&idxd->cmd_waitq); + + rc = idxd_setup_wqs(idxd); + if (rc < 0) + goto err_wqs; + + rc = idxd_setup_engines(idxd); + if (rc < 0) + goto err_engine; + + rc = idxd_setup_groups(idxd); + if (rc < 0) + goto err_group; + + idxd->wq = create_workqueue(dev_name(dev)); + if (!idxd->wq) { + rc = -ENOMEM; + goto err_wkq_create; + } + + rc = idxd_init_evl(idxd); + if (rc < 0) + goto err_evl; + + return 0; + + err_evl: + destroy_workqueue(idxd->wq); + err_wkq_create: + for (i = 0; i < idxd->max_groups; i++) + put_device(group_confdev(idxd->groups[i])); + err_group: + for (i = 0; i < idxd->max_engines; i++) + put_device(engine_confdev(idxd->engines[i])); + err_engine: + for (i = 0; i < idxd->max_wqs; i++) + put_device(wq_confdev(idxd->wqs[i])); + err_wqs: + return rc; +} + +static void idxd_read_table_offsets(struct idxd_device *idxd) +{ + union offsets_reg offsets; + struct device *dev = &idxd->pdev->dev; + + offsets.bits[0] = ioread64(idxd->reg_base + IDXD_TABLE_OFFSET); + offsets.bits[1] = ioread64(idxd->reg_base + IDXD_TABLE_OFFSET + sizeof(u64)); + idxd->grpcfg_offset = offsets.grpcfg * IDXD_TABLE_MULT; + dev_dbg(dev, "IDXD Group Config Offset: %#x\n", idxd->grpcfg_offset); + idxd->wqcfg_offset = offsets.wqcfg * IDXD_TABLE_MULT; + dev_dbg(dev, "IDXD Work Queue Config Offset: %#x\n", idxd->wqcfg_offset); + idxd->msix_perm_offset = offsets.msix_perm * IDXD_TABLE_MULT; + dev_dbg(dev, "IDXD MSIX Permission Offset: %#x\n", idxd->msix_perm_offset); + idxd->perfmon_offset = offsets.perfmon * IDXD_TABLE_MULT; + dev_dbg(dev, "IDXD Perfmon Offset: %#x\n", idxd->perfmon_offset); +} + +void multi_u64_to_bmap(unsigned long *bmap, u64 *val, int count) +{ + int i, j, nr; + + for (i = 0, nr = 0; i < count; i++) { + for (j = 0; j < BITS_PER_LONG_LONG; j++) { + if (val[i] & BIT(j)) + set_bit(nr, bmap); + nr++; + } + } +} + +static void idxd_read_caps(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + int i; + + /* reading generic capabilities */ + idxd->hw.gen_cap.bits = ioread64(idxd->reg_base + IDXD_GENCAP_OFFSET); + dev_dbg(dev, "gen_cap: %#llx\n", idxd->hw.gen_cap.bits); + + if (idxd->hw.gen_cap.cmd_cap) { + idxd->hw.cmd_cap = ioread32(idxd->reg_base + IDXD_CMDCAP_OFFSET); + dev_dbg(dev, "cmd_cap: %#x\n", idxd->hw.cmd_cap); + } + + /* reading command capabilities */ + if (idxd->hw.cmd_cap & BIT(IDXD_CMD_REQUEST_INT_HANDLE)) + idxd->request_int_handles = true; + + idxd->max_xfer_bytes = 1ULL << idxd->hw.gen_cap.max_xfer_shift; + dev_dbg(dev, "max xfer size: %llu bytes\n", idxd->max_xfer_bytes); + idxd_set_max_batch_size(idxd->data->type, idxd, 1U << idxd->hw.gen_cap.max_batch_shift); + dev_dbg(dev, "max batch size: %u\n", idxd->max_batch_size); + if (idxd->hw.gen_cap.config_en) + set_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags); + + /* reading group capabilities */ + idxd->hw.group_cap.bits = + ioread64(idxd->reg_base + IDXD_GRPCAP_OFFSET); + dev_dbg(dev, "group_cap: %#llx\n", idxd->hw.group_cap.bits); + idxd->max_groups = idxd->hw.group_cap.num_groups; + dev_dbg(dev, "max groups: %u\n", idxd->max_groups); + idxd->max_rdbufs = idxd->hw.group_cap.total_rdbufs; + dev_dbg(dev, "max read buffers: %u\n", idxd->max_rdbufs); + idxd->nr_rdbufs = idxd->max_rdbufs; + + /* read engine capabilities */ + idxd->hw.engine_cap.bits = + ioread64(idxd->reg_base + IDXD_ENGCAP_OFFSET); + dev_dbg(dev, "engine_cap: %#llx\n", idxd->hw.engine_cap.bits); + idxd->max_engines = idxd->hw.engine_cap.num_engines; + dev_dbg(dev, "max engines: %u\n", idxd->max_engines); + + /* read workqueue capabilities */ + idxd->hw.wq_cap.bits = ioread64(idxd->reg_base + IDXD_WQCAP_OFFSET); + dev_dbg(dev, "wq_cap: %#llx\n", idxd->hw.wq_cap.bits); + idxd->max_wq_size = idxd->hw.wq_cap.total_wq_size; + dev_dbg(dev, "total workqueue size: %u\n", idxd->max_wq_size); + idxd->max_wqs = idxd->hw.wq_cap.num_wqs; + dev_dbg(dev, "max workqueues: %u\n", idxd->max_wqs); + idxd->wqcfg_size = 1 << (idxd->hw.wq_cap.wqcfg_size + IDXD_WQCFG_MIN); + dev_dbg(dev, "wqcfg size: %u\n", idxd->wqcfg_size); + + /* reading operation capabilities */ + for (i = 0; i < 4; i++) { + idxd->hw.opcap.bits[i] = ioread64(idxd->reg_base + + IDXD_OPCAP_OFFSET + i * sizeof(u64)); + dev_dbg(dev, "opcap[%d]: %#llx\n", i, idxd->hw.opcap.bits[i]); + } + multi_u64_to_bmap(idxd->opcap_bmap, &idxd->hw.opcap.bits[0], 4); + + /* read iaa cap */ + if (idxd->data->type == IDXD_TYPE_IAX && idxd->hw.version >= DEVICE_VERSION_2) + idxd->hw.iaa_cap.bits = ioread64(idxd->reg_base + IDXD_IAACAP_OFFSET); +} + +static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_data *data) +{ + struct device *dev = &pdev->dev; + struct device *conf_dev; + struct idxd_device *idxd; + int rc; + + idxd = kzalloc_node(sizeof(*idxd), GFP_KERNEL, dev_to_node(dev)); + if (!idxd) + return NULL; + + conf_dev = idxd_confdev(idxd); + idxd->pdev = pdev; + idxd->data = data; + idxd_dev_set_type(&idxd->idxd_dev, idxd->data->type); + idxd->id = ida_alloc(&idxd_ida, GFP_KERNEL); + if (idxd->id < 0) + return NULL; + + idxd->opcap_bmap = bitmap_zalloc_node(IDXD_MAX_OPCAP_BITS, GFP_KERNEL, dev_to_node(dev)); + if (!idxd->opcap_bmap) { + ida_free(&idxd_ida, idxd->id); + return NULL; + } + + device_initialize(conf_dev); + conf_dev->parent = dev; + conf_dev->bus = &dsa_bus_type; + conf_dev->type = idxd->data->dev_type; + rc = dev_set_name(conf_dev, "%s%d", idxd->data->name_prefix, idxd->id); + if (rc < 0) { + put_device(conf_dev); + return NULL; + } + + spin_lock_init(&idxd->dev_lock); + spin_lock_init(&idxd->cmd_lock); + + return idxd; +} + +static int idxd_enable_system_pasid(struct idxd_device *idxd) +{ + struct pci_dev *pdev = idxd->pdev; + struct device *dev = &pdev->dev; + struct iommu_domain *domain; + ioasid_t pasid; + int ret; + + /* + * Attach a global PASID to the DMA domain so that we can use ENQCMDS + * to submit work on buffers mapped by DMA API. + */ + domain = iommu_get_domain_for_dev(dev); + if (!domain) + return -EPERM; + + pasid = iommu_alloc_global_pasid(dev); + if (pasid == IOMMU_PASID_INVALID) + return -ENOSPC; + + /* + * DMA domain is owned by the driver, it should support all valid + * types such as DMA-FQ, identity, etc. + */ + ret = iommu_attach_device_pasid(domain, dev, pasid); + if (ret) { + dev_err(dev, "failed to attach device pasid %d, domain type %d", + pasid, domain->type); + iommu_free_global_pasid(pasid); + return ret; + } + + /* Since we set user privilege for kernel DMA, enable completion IRQ */ + idxd_set_user_intr(idxd, 1); + idxd->pasid = pasid; + + return ret; +} + +static void idxd_disable_system_pasid(struct idxd_device *idxd) +{ + struct pci_dev *pdev = idxd->pdev; + struct device *dev = &pdev->dev; + struct iommu_domain *domain; + + domain = iommu_get_domain_for_dev(dev); + if (!domain) + return; + + iommu_detach_device_pasid(domain, dev, idxd->pasid); + iommu_free_global_pasid(idxd->pasid); + + idxd_set_user_intr(idxd, 0); + idxd->sva = NULL; + idxd->pasid = IOMMU_PASID_INVALID; +} + +static int idxd_enable_sva(struct pci_dev *pdev) +{ + int ret; + + ret = iommu_dev_enable_feature(&pdev->dev, IOMMU_DEV_FEAT_IOPF); + if (ret) + return ret; + + ret = iommu_dev_enable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA); + if (ret) + iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_IOPF); + + return ret; +} + +static void idxd_disable_sva(struct pci_dev *pdev) +{ + iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA); + iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_IOPF); +} + +static int idxd_probe(struct idxd_device *idxd) +{ + struct pci_dev *pdev = idxd->pdev; + struct device *dev = &pdev->dev; + int rc; + + dev_dbg(dev, "%s entered and resetting device\n", __func__); + rc = idxd_device_init_reset(idxd); + if (rc < 0) + return rc; + + dev_dbg(dev, "IDXD reset complete\n"); + + if (IS_ENABLED(CONFIG_INTEL_IDXD_SVM) && sva) { + if (idxd_enable_sva(pdev)) { + dev_warn(dev, "Unable to turn on user SVA feature.\n"); + } else { + set_bit(IDXD_FLAG_USER_PASID_ENABLED, &idxd->flags); + + rc = idxd_enable_system_pasid(idxd); + if (rc) + dev_warn(dev, "No in-kernel DMA with PASID. %d\n", rc); + else + set_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags); + } + } else if (!sva) { + dev_warn(dev, "User forced SVA off via module param.\n"); + } + + idxd_read_caps(idxd); + idxd_read_table_offsets(idxd); + + rc = idxd_setup_internals(idxd); + if (rc) + goto err; + + /* If the configs are readonly, then load them from device */ + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) { + dev_dbg(dev, "Loading RO device config\n"); + rc = idxd_device_load_config(idxd); + if (rc < 0) + goto err_config; + } + + rc = idxd_setup_interrupts(idxd); + if (rc) + goto err_config; + + idxd->major = idxd_cdev_get_major(idxd); + + rc = perfmon_pmu_init(idxd); + if (rc < 0) + dev_warn(dev, "Failed to initialize perfmon. No PMU support: %d\n", rc); + + dev_dbg(dev, "IDXD device %d probed successfully\n", idxd->id); + return 0; + + err_config: + idxd_cleanup_internals(idxd); + err: + if (device_pasid_enabled(idxd)) + idxd_disable_system_pasid(idxd); + if (device_user_pasid_enabled(idxd)) + idxd_disable_sva(pdev); + return rc; +} + +static void idxd_cleanup(struct idxd_device *idxd) +{ + perfmon_pmu_remove(idxd); + idxd_cleanup_interrupts(idxd); + idxd_cleanup_internals(idxd); + if (device_pasid_enabled(idxd)) + idxd_disable_system_pasid(idxd); + if (device_user_pasid_enabled(idxd)) + idxd_disable_sva(idxd->pdev); +} + +static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct device *dev = &pdev->dev; + struct idxd_device *idxd; + struct idxd_driver_data *data = (struct idxd_driver_data *)id->driver_data; + int rc; + + rc = pci_enable_device(pdev); + if (rc) + return rc; + + dev_dbg(dev, "Alloc IDXD context\n"); + idxd = idxd_alloc(pdev, data); + if (!idxd) { + rc = -ENOMEM; + goto err_idxd_alloc; + } + + dev_dbg(dev, "Mapping BARs\n"); + idxd->reg_base = pci_iomap(pdev, IDXD_MMIO_BAR, 0); + if (!idxd->reg_base) { + rc = -ENOMEM; + goto err_iomap; + } + + dev_dbg(dev, "Set DMA masks\n"); + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (rc) + goto err; + + dev_dbg(dev, "Set PCI master\n"); + pci_set_master(pdev); + pci_set_drvdata(pdev, idxd); + + idxd->hw.version = ioread32(idxd->reg_base + IDXD_VER_OFFSET); + rc = idxd_probe(idxd); + if (rc) { + dev_err(dev, "Intel(R) IDXD DMA Engine init failed\n"); + goto err; + } + + rc = idxd_register_devices(idxd); + if (rc) { + dev_err(dev, "IDXD sysfs setup failed\n"); + goto err_dev_register; + } + + rc = idxd_device_init_debugfs(idxd); + if (rc) + dev_warn(dev, "IDXD debugfs failed to setup\n"); + + dev_info(&pdev->dev, "Intel(R) Accelerator Device (v%x)\n", + idxd->hw.version); + + return 0; + + err_dev_register: + idxd_cleanup(idxd); + err: + pci_iounmap(pdev, idxd->reg_base); + err_iomap: + put_device(idxd_confdev(idxd)); + err_idxd_alloc: + pci_disable_device(pdev); + return rc; +} + +void idxd_wqs_quiesce(struct idxd_device *idxd) +{ + struct idxd_wq *wq; + int i; + + for (i = 0; i < idxd->max_wqs; i++) { + wq = idxd->wqs[i]; + if (wq->state == IDXD_WQ_ENABLED && wq->type == IDXD_WQT_KERNEL) + idxd_wq_quiesce(wq); + } +} + +static void idxd_shutdown(struct pci_dev *pdev) +{ + struct idxd_device *idxd = pci_get_drvdata(pdev); + struct idxd_irq_entry *irq_entry; + int rc; + + rc = idxd_device_disable(idxd); + if (rc) + dev_err(&pdev->dev, "Disabling device failed\n"); + + irq_entry = &idxd->ie; + synchronize_irq(irq_entry->vector); + idxd_mask_error_interrupts(idxd); + flush_workqueue(idxd->wq); +} + +static void idxd_remove(struct pci_dev *pdev) +{ + struct idxd_device *idxd = pci_get_drvdata(pdev); + struct idxd_irq_entry *irq_entry; + + idxd_unregister_devices(idxd); + /* + * When ->release() is called for the idxd->conf_dev, it frees all the memory related + * to the idxd context. The driver still needs those bits in order to do the rest of + * the cleanup. However, we do need to unbound the idxd sub-driver. So take a ref + * on the device here to hold off the freeing while allowing the idxd sub-driver + * to unbind. + */ + get_device(idxd_confdev(idxd)); + device_unregister(idxd_confdev(idxd)); + idxd_shutdown(pdev); + if (device_pasid_enabled(idxd)) + idxd_disable_system_pasid(idxd); + idxd_device_remove_debugfs(idxd); + + irq_entry = idxd_get_ie(idxd, 0); + free_irq(irq_entry->vector, irq_entry); + pci_free_irq_vectors(pdev); + pci_iounmap(pdev, idxd->reg_base); + if (device_user_pasid_enabled(idxd)) + idxd_disable_sva(pdev); + pci_disable_device(pdev); + destroy_workqueue(idxd->wq); + perfmon_pmu_remove(idxd); + put_device(idxd_confdev(idxd)); +} + +static struct pci_driver idxd_pci_driver = { + .name = DRV_NAME, + .id_table = idxd_pci_tbl, + .probe = idxd_pci_probe, + .remove = idxd_remove, + .shutdown = idxd_shutdown, +}; + +static int __init idxd_init_module(void) +{ + int err; + + /* + * If the CPU does not support MOVDIR64B or ENQCMDS, there's no point in + * enumerating the device. We can not utilize it. + */ + if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) { + pr_warn("idxd driver failed to load without MOVDIR64B.\n"); + return -ENODEV; + } + + if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) + pr_warn("Platform does not have ENQCMD(S) support.\n"); + else + support_enqcmd = true; + + perfmon_init(); + + err = idxd_driver_register(&idxd_drv); + if (err < 0) + goto err_idxd_driver_register; + + err = idxd_driver_register(&idxd_dmaengine_drv); + if (err < 0) + goto err_idxd_dmaengine_driver_register; + + err = idxd_driver_register(&idxd_user_drv); + if (err < 0) + goto err_idxd_user_driver_register; + + err = idxd_cdev_register(); + if (err) + goto err_cdev_register; + + err = idxd_init_debugfs(); + if (err) + goto err_debugfs; + + err = pci_register_driver(&idxd_pci_driver); + if (err) + goto err_pci_register; + + return 0; + +err_pci_register: + idxd_remove_debugfs(); +err_debugfs: + idxd_cdev_remove(); +err_cdev_register: + idxd_driver_unregister(&idxd_user_drv); +err_idxd_user_driver_register: + idxd_driver_unregister(&idxd_dmaengine_drv); +err_idxd_dmaengine_driver_register: + idxd_driver_unregister(&idxd_drv); +err_idxd_driver_register: + return err; +} +module_init(idxd_init_module); + +static void __exit idxd_exit_module(void) +{ + idxd_driver_unregister(&idxd_user_drv); + idxd_driver_unregister(&idxd_dmaengine_drv); + idxd_driver_unregister(&idxd_drv); + pci_unregister_driver(&idxd_pci_driver); + idxd_cdev_remove(); + perfmon_exit(); + idxd_remove_debugfs(); +} +module_exit(idxd_exit_module); diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c new file mode 100644 index 0000000000..b501320a9c --- /dev/null +++ b/drivers/dma/idxd/irq.c @@ -0,0 +1,655 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/dmaengine.h> +#include <linux/delay.h> +#include <linux/iommu.h> +#include <linux/sched/mm.h> +#include <uapi/linux/idxd.h> +#include "../dmaengine.h" +#include "idxd.h" +#include "registers.h" + +enum irq_work_type { + IRQ_WORK_NORMAL = 0, + IRQ_WORK_PROCESS_FAULT, +}; + +struct idxd_resubmit { + struct work_struct work; + struct idxd_desc *desc; +}; + +struct idxd_int_handle_revoke { + struct work_struct work; + struct idxd_device *idxd; +}; + +static void idxd_device_reinit(struct work_struct *work) +{ + struct idxd_device *idxd = container_of(work, struct idxd_device, work); + struct device *dev = &idxd->pdev->dev; + int rc, i; + + idxd_device_reset(idxd); + rc = idxd_device_config(idxd); + if (rc < 0) + goto out; + + rc = idxd_device_enable(idxd); + if (rc < 0) + goto out; + + for (i = 0; i < idxd->max_wqs; i++) { + if (test_bit(i, idxd->wq_enable_map)) { + struct idxd_wq *wq = idxd->wqs[i]; + + rc = idxd_wq_enable(wq); + if (rc < 0) { + clear_bit(i, idxd->wq_enable_map); + dev_warn(dev, "Unable to re-enable wq %s\n", + dev_name(wq_confdev(wq))); + } + } + } + + return; + + out: + idxd_device_clear_state(idxd); +} + +/* + * The function sends a drain descriptor for the interrupt handle. The drain ensures + * all descriptors with this interrupt handle is flushed and the interrupt + * will allow the cleanup of the outstanding descriptors. + */ +static void idxd_int_handle_revoke_drain(struct idxd_irq_entry *ie) +{ + struct idxd_wq *wq = ie_to_wq(ie); + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + struct dsa_hw_desc desc = {}; + void __iomem *portal; + int rc; + + /* Issue a simple drain operation with interrupt but no completion record */ + desc.flags = IDXD_OP_FLAG_RCI; + desc.opcode = DSA_OPCODE_DRAIN; + desc.priv = 1; + + if (ie->pasid != IOMMU_PASID_INVALID) + desc.pasid = ie->pasid; + desc.int_handle = ie->int_handle; + portal = idxd_wq_portal_addr(wq); + + /* + * The wmb() makes sure that the descriptor is all there before we + * issue. + */ + wmb(); + if (wq_dedicated(wq)) { + iosubmit_cmds512(portal, &desc, 1); + } else { + rc = idxd_enqcmds(wq, portal, &desc); + /* This should not fail unless hardware failed. */ + if (rc < 0) + dev_warn(dev, "Failed to submit drain desc on wq %d\n", wq->id); + } +} + +static void idxd_abort_invalid_int_handle_descs(struct idxd_irq_entry *ie) +{ + LIST_HEAD(flist); + struct idxd_desc *d, *t; + struct llist_node *head; + + spin_lock(&ie->list_lock); + head = llist_del_all(&ie->pending_llist); + if (head) { + llist_for_each_entry_safe(d, t, head, llnode) + list_add_tail(&d->list, &ie->work_list); + } + + list_for_each_entry_safe(d, t, &ie->work_list, list) { + if (d->completion->status == DSA_COMP_INT_HANDLE_INVAL) + list_move_tail(&d->list, &flist); + } + spin_unlock(&ie->list_lock); + + list_for_each_entry_safe(d, t, &flist, list) { + list_del(&d->list); + idxd_dma_complete_txd(d, IDXD_COMPLETE_ABORT, true); + } +} + +static void idxd_int_handle_revoke(struct work_struct *work) +{ + struct idxd_int_handle_revoke *revoke = + container_of(work, struct idxd_int_handle_revoke, work); + struct idxd_device *idxd = revoke->idxd; + struct pci_dev *pdev = idxd->pdev; + struct device *dev = &pdev->dev; + int i, new_handle, rc; + + if (!idxd->request_int_handles) { + kfree(revoke); + dev_warn(dev, "Unexpected int handle refresh interrupt.\n"); + return; + } + + /* + * The loop attempts to acquire new interrupt handle for all interrupt + * vectors that supports a handle. If a new interrupt handle is acquired and the + * wq is kernel type, the driver will kill the percpu_ref to pause all + * ongoing descriptor submissions. The interrupt handle is then changed. + * After change, the percpu_ref is revived and all the pending submissions + * are woken to try again. A drain is sent to for the interrupt handle + * at the end to make sure all invalid int handle descriptors are processed. + */ + for (i = 1; i < idxd->irq_cnt; i++) { + struct idxd_irq_entry *ie = idxd_get_ie(idxd, i); + struct idxd_wq *wq = ie_to_wq(ie); + + if (ie->int_handle == INVALID_INT_HANDLE) + continue; + + rc = idxd_device_request_int_handle(idxd, i, &new_handle, IDXD_IRQ_MSIX); + if (rc < 0) { + dev_warn(dev, "get int handle %d failed: %d\n", i, rc); + /* + * Failed to acquire new interrupt handle. Kill the WQ + * and release all the pending submitters. The submitters will + * get error return code and handle appropriately. + */ + ie->int_handle = INVALID_INT_HANDLE; + idxd_wq_quiesce(wq); + idxd_abort_invalid_int_handle_descs(ie); + continue; + } + + /* No change in interrupt handle, nothing needs to be done */ + if (ie->int_handle == new_handle) + continue; + + if (wq->state != IDXD_WQ_ENABLED || wq->type != IDXD_WQT_KERNEL) { + /* + * All the MSIX interrupts are allocated at once during probe. + * Therefore we need to update all interrupts even if the WQ + * isn't supporting interrupt operations. + */ + ie->int_handle = new_handle; + continue; + } + + mutex_lock(&wq->wq_lock); + reinit_completion(&wq->wq_resurrect); + + /* Kill percpu_ref to pause additional descriptor submissions */ + percpu_ref_kill(&wq->wq_active); + + /* Wait for all submitters quiesce before we change interrupt handle */ + wait_for_completion(&wq->wq_dead); + + ie->int_handle = new_handle; + + /* Revive percpu ref and wake up all the waiting submitters */ + percpu_ref_reinit(&wq->wq_active); + complete_all(&wq->wq_resurrect); + mutex_unlock(&wq->wq_lock); + + /* + * The delay here is to wait for all possible MOVDIR64B that + * are issued before percpu_ref_kill() has happened to have + * reached the PCIe domain before the drain is issued. The driver + * needs to ensure that the drain descriptor issued does not pass + * all the other issued descriptors that contain the invalid + * interrupt handle in order to ensure that the drain descriptor + * interrupt will allow the cleanup of all the descriptors with + * invalid interrupt handle. + */ + if (wq_dedicated(wq)) + udelay(100); + idxd_int_handle_revoke_drain(ie); + } + kfree(revoke); +} + +static void idxd_evl_fault_work(struct work_struct *work) +{ + struct idxd_evl_fault *fault = container_of(work, struct idxd_evl_fault, work); + struct idxd_wq *wq = fault->wq; + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + struct idxd_evl *evl = idxd->evl; + struct __evl_entry *entry_head = fault->entry; + void *cr = (void *)entry_head + idxd->data->evl_cr_off; + int cr_size = idxd->data->compl_size; + u8 *status = (u8 *)cr + idxd->data->cr_status_off; + u8 *result = (u8 *)cr + idxd->data->cr_result_off; + int copied, copy_size; + bool *bf; + + switch (fault->status) { + case DSA_COMP_CRA_XLAT: + if (entry_head->batch && entry_head->first_err_in_batch) + evl->batch_fail[entry_head->batch_id] = false; + + copy_size = cr_size; + idxd_user_counter_increment(wq, entry_head->pasid, COUNTER_FAULTS); + break; + case DSA_COMP_BATCH_EVL_ERR: + bf = &evl->batch_fail[entry_head->batch_id]; + + copy_size = entry_head->rcr || *bf ? cr_size : 0; + if (*bf) { + if (*status == DSA_COMP_SUCCESS) + *status = DSA_COMP_BATCH_FAIL; + *result = 1; + *bf = false; + } + idxd_user_counter_increment(wq, entry_head->pasid, COUNTER_FAULTS); + break; + case DSA_COMP_DRAIN_EVL: + copy_size = cr_size; + break; + default: + copy_size = 0; + dev_dbg_ratelimited(dev, "Unrecognized error code: %#x\n", fault->status); + break; + } + + if (copy_size == 0) + return; + + /* + * Copy completion record to fault_addr in user address space + * that is found by wq and PASID. + */ + copied = idxd_copy_cr(wq, entry_head->pasid, entry_head->fault_addr, + cr, copy_size); + /* + * The task that triggered the page fault is unknown currently + * because multiple threads may share the user address + * space or the task exits already before this fault. + * So if the copy fails, SIGSEGV can not be sent to the task. + * Just print an error for the failure. The user application + * waiting for the completion record will time out on this + * failure. + */ + switch (fault->status) { + case DSA_COMP_CRA_XLAT: + if (copied != copy_size) { + idxd_user_counter_increment(wq, entry_head->pasid, COUNTER_FAULT_FAILS); + dev_dbg_ratelimited(dev, "Failed to write to completion record: (%d:%d)\n", + copy_size, copied); + if (entry_head->batch) + evl->batch_fail[entry_head->batch_id] = true; + } + break; + case DSA_COMP_BATCH_EVL_ERR: + if (copied != copy_size) { + idxd_user_counter_increment(wq, entry_head->pasid, COUNTER_FAULT_FAILS); + dev_dbg_ratelimited(dev, "Failed to write to batch completion record: (%d:%d)\n", + copy_size, copied); + } + break; + case DSA_COMP_DRAIN_EVL: + if (copied != copy_size) + dev_dbg_ratelimited(dev, "Failed to write to drain completion record: (%d:%d)\n", + copy_size, copied); + break; + } + + kmem_cache_free(idxd->evl_cache, fault); +} + +static void process_evl_entry(struct idxd_device *idxd, + struct __evl_entry *entry_head, unsigned int index) +{ + struct device *dev = &idxd->pdev->dev; + struct idxd_evl *evl = idxd->evl; + u8 status; + + if (test_bit(index, evl->bmap)) { + clear_bit(index, evl->bmap); + } else { + status = DSA_COMP_STATUS(entry_head->error); + + if (status == DSA_COMP_CRA_XLAT || status == DSA_COMP_DRAIN_EVL || + status == DSA_COMP_BATCH_EVL_ERR) { + struct idxd_evl_fault *fault; + int ent_size = evl_ent_size(idxd); + + if (entry_head->rci) + dev_dbg(dev, "Completion Int Req set, ignoring!\n"); + + if (!entry_head->rcr && status == DSA_COMP_DRAIN_EVL) + return; + + fault = kmem_cache_alloc(idxd->evl_cache, GFP_ATOMIC); + if (fault) { + struct idxd_wq *wq = idxd->wqs[entry_head->wq_idx]; + + fault->wq = wq; + fault->status = status; + memcpy(&fault->entry, entry_head, ent_size); + INIT_WORK(&fault->work, idxd_evl_fault_work); + queue_work(wq->wq, &fault->work); + } else { + dev_warn(dev, "Failed to service fault work.\n"); + } + } else { + dev_warn_ratelimited(dev, "Device error %#x operation: %#x fault addr: %#llx\n", + status, entry_head->operation, + entry_head->fault_addr); + } + } +} + +static void process_evl_entries(struct idxd_device *idxd) +{ + union evl_status_reg evl_status; + unsigned int h, t; + struct idxd_evl *evl = idxd->evl; + struct __evl_entry *entry_head; + unsigned int ent_size = evl_ent_size(idxd); + u32 size; + + evl_status.bits = 0; + evl_status.int_pending = 1; + + spin_lock(&evl->lock); + /* Clear interrupt pending bit */ + iowrite32(evl_status.bits_upper32, + idxd->reg_base + IDXD_EVLSTATUS_OFFSET + sizeof(u32)); + h = evl->head; + evl_status.bits = ioread64(idxd->reg_base + IDXD_EVLSTATUS_OFFSET); + t = evl_status.tail; + size = idxd->evl->size; + + while (h != t) { + entry_head = (struct __evl_entry *)(evl->log + (h * ent_size)); + process_evl_entry(idxd, entry_head, h); + h = (h + 1) % size; + } + + evl->head = h; + evl_status.head = h; + iowrite32(evl_status.bits_lower32, idxd->reg_base + IDXD_EVLSTATUS_OFFSET); + spin_unlock(&evl->lock); +} + +irqreturn_t idxd_misc_thread(int vec, void *data) +{ + struct idxd_irq_entry *irq_entry = data; + struct idxd_device *idxd = ie_to_idxd(irq_entry); + struct device *dev = &idxd->pdev->dev; + union gensts_reg gensts; + u32 val = 0; + int i; + bool err = false; + u32 cause; + + cause = ioread32(idxd->reg_base + IDXD_INTCAUSE_OFFSET); + if (!cause) + return IRQ_NONE; + + iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET); + + if (cause & IDXD_INTC_HALT_STATE) + goto halt; + + if (cause & IDXD_INTC_ERR) { + spin_lock(&idxd->dev_lock); + for (i = 0; i < 4; i++) + idxd->sw_err.bits[i] = ioread64(idxd->reg_base + + IDXD_SWERR_OFFSET + i * sizeof(u64)); + + iowrite64(idxd->sw_err.bits[0] & IDXD_SWERR_ACK, + idxd->reg_base + IDXD_SWERR_OFFSET); + + if (idxd->sw_err.valid && idxd->sw_err.wq_idx_valid) { + int id = idxd->sw_err.wq_idx; + struct idxd_wq *wq = idxd->wqs[id]; + + if (wq->type == IDXD_WQT_USER) + wake_up_interruptible(&wq->err_queue); + } else { + int i; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + if (wq->type == IDXD_WQT_USER) + wake_up_interruptible(&wq->err_queue); + } + } + + spin_unlock(&idxd->dev_lock); + val |= IDXD_INTC_ERR; + + for (i = 0; i < 4; i++) + dev_warn(dev, "err[%d]: %#16.16llx\n", + i, idxd->sw_err.bits[i]); + err = true; + } + + if (cause & IDXD_INTC_INT_HANDLE_REVOKED) { + struct idxd_int_handle_revoke *revoke; + + val |= IDXD_INTC_INT_HANDLE_REVOKED; + + revoke = kzalloc(sizeof(*revoke), GFP_ATOMIC); + if (revoke) { + revoke->idxd = idxd; + INIT_WORK(&revoke->work, idxd_int_handle_revoke); + queue_work(idxd->wq, &revoke->work); + + } else { + dev_err(dev, "Failed to allocate work for int handle revoke\n"); + idxd_wqs_quiesce(idxd); + } + } + + if (cause & IDXD_INTC_CMD) { + val |= IDXD_INTC_CMD; + complete(idxd->cmd_done); + } + + if (cause & IDXD_INTC_OCCUPY) { + /* Driver does not utilize occupancy interrupt */ + val |= IDXD_INTC_OCCUPY; + } + + if (cause & IDXD_INTC_PERFMON_OVFL) { + val |= IDXD_INTC_PERFMON_OVFL; + perfmon_counter_overflow(idxd); + } + + if (cause & IDXD_INTC_EVL) { + val |= IDXD_INTC_EVL; + process_evl_entries(idxd); + } + + val ^= cause; + if (val) + dev_warn_once(dev, "Unexpected interrupt cause bits set: %#x\n", + val); + + if (!err) + goto out; + +halt: + gensts.bits = ioread32(idxd->reg_base + IDXD_GENSTATS_OFFSET); + if (gensts.state == IDXD_DEVICE_STATE_HALT) { + idxd->state = IDXD_DEV_HALTED; + if (gensts.reset_type == IDXD_DEVICE_RESET_SOFTWARE) { + /* + * If we need a software reset, we will throw the work + * on a system workqueue in order to allow interrupts + * for the device command completions. + */ + INIT_WORK(&idxd->work, idxd_device_reinit); + queue_work(idxd->wq, &idxd->work); + } else { + idxd->state = IDXD_DEV_HALTED; + idxd_wqs_quiesce(idxd); + idxd_wqs_unmap_portal(idxd); + idxd_device_clear_state(idxd); + dev_err(&idxd->pdev->dev, + "idxd halted, need %s.\n", + gensts.reset_type == IDXD_DEVICE_RESET_FLR ? + "FLR" : "system reset"); + } + } + +out: + return IRQ_HANDLED; +} + +static void idxd_int_handle_resubmit_work(struct work_struct *work) +{ + struct idxd_resubmit *irw = container_of(work, struct idxd_resubmit, work); + struct idxd_desc *desc = irw->desc; + struct idxd_wq *wq = desc->wq; + int rc; + + desc->completion->status = 0; + rc = idxd_submit_desc(wq, desc); + if (rc < 0) { + dev_dbg(&wq->idxd->pdev->dev, "Failed to resubmit desc %d to wq %d.\n", + desc->id, wq->id); + /* + * If the error is not -EAGAIN, it means the submission failed due to wq + * has been killed instead of ENQCMDS failure. Here the driver needs to + * notify the submitter of the failure by reporting abort status. + * + * -EAGAIN comes from ENQCMDS failure. idxd_submit_desc() will handle the + * abort. + */ + if (rc != -EAGAIN) { + desc->completion->status = IDXD_COMP_DESC_ABORT; + idxd_dma_complete_txd(desc, IDXD_COMPLETE_ABORT, false); + } + idxd_free_desc(wq, desc); + } + kfree(irw); +} + +bool idxd_queue_int_handle_resubmit(struct idxd_desc *desc) +{ + struct idxd_wq *wq = desc->wq; + struct idxd_device *idxd = wq->idxd; + struct idxd_resubmit *irw; + + irw = kzalloc(sizeof(*irw), GFP_KERNEL); + if (!irw) + return false; + + irw->desc = desc; + INIT_WORK(&irw->work, idxd_int_handle_resubmit_work); + queue_work(idxd->wq, &irw->work); + return true; +} + +static void irq_process_pending_llist(struct idxd_irq_entry *irq_entry) +{ + struct idxd_desc *desc, *t; + struct llist_node *head; + + head = llist_del_all(&irq_entry->pending_llist); + if (!head) + return; + + llist_for_each_entry_safe(desc, t, head, llnode) { + u8 status = desc->completion->status & DSA_COMP_STATUS_MASK; + + if (status) { + /* + * Check against the original status as ABORT is software defined + * and 0xff, which DSA_COMP_STATUS_MASK can mask out. + */ + if (unlikely(desc->completion->status == IDXD_COMP_DESC_ABORT)) { + idxd_dma_complete_txd(desc, IDXD_COMPLETE_ABORT, true); + continue; + } + + idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL, true); + } else { + spin_lock(&irq_entry->list_lock); + list_add_tail(&desc->list, + &irq_entry->work_list); + spin_unlock(&irq_entry->list_lock); + } + } +} + +static void irq_process_work_list(struct idxd_irq_entry *irq_entry) +{ + LIST_HEAD(flist); + struct idxd_desc *desc, *n; + + /* + * This lock protects list corruption from access of list outside of the irq handler + * thread. + */ + spin_lock(&irq_entry->list_lock); + if (list_empty(&irq_entry->work_list)) { + spin_unlock(&irq_entry->list_lock); + return; + } + + list_for_each_entry_safe(desc, n, &irq_entry->work_list, list) { + if (desc->completion->status) { + list_move_tail(&desc->list, &flist); + } + } + + spin_unlock(&irq_entry->list_lock); + + list_for_each_entry(desc, &flist, list) { + /* + * Check against the original status as ABORT is software defined + * and 0xff, which DSA_COMP_STATUS_MASK can mask out. + */ + if (unlikely(desc->completion->status == IDXD_COMP_DESC_ABORT)) { + idxd_dma_complete_txd(desc, IDXD_COMPLETE_ABORT, true); + continue; + } + + idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL, true); + } +} + +irqreturn_t idxd_wq_thread(int irq, void *data) +{ + struct idxd_irq_entry *irq_entry = data; + + /* + * There are two lists we are processing. The pending_llist is where + * submmiter adds all the submitted descriptor after sending it to + * the workqueue. It's a lockless singly linked list. The work_list + * is the common linux double linked list. We are in a scenario of + * multiple producers and a single consumer. The producers are all + * the kernel submitters of descriptors, and the consumer is the + * kernel irq handler thread for the msix vector when using threaded + * irq. To work with the restrictions of llist to remain lockless, + * we are doing the following steps: + * 1. Iterate through the work_list and process any completed + * descriptor. Delete the completed entries during iteration. + * 2. llist_del_all() from the pending list. + * 3. Iterate through the llist that was deleted from the pending list + * and process the completed entries. + * 4. If the entry is still waiting on hardware, list_add_tail() to + * the work_list. + */ + irq_process_work_list(irq_entry); + irq_process_pending_llist(irq_entry); + + return IRQ_HANDLED; +} diff --git a/drivers/dma/idxd/perfmon.c b/drivers/dma/idxd/perfmon.c new file mode 100644 index 0000000000..fdda6d6042 --- /dev/null +++ b/drivers/dma/idxd/perfmon.c @@ -0,0 +1,661 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2020 Intel Corporation. All rights rsvd. */ + +#include <linux/sched/task.h> +#include <linux/io-64-nonatomic-lo-hi.h> +#include "idxd.h" +#include "perfmon.h" + +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, + char *buf); + +static cpumask_t perfmon_dsa_cpu_mask; +static bool cpuhp_set_up; +static enum cpuhp_state cpuhp_slot; + +/* + * perf userspace reads this attribute to determine which cpus to open + * counters on. It's connected to perfmon_dsa_cpu_mask, which is + * maintained by the cpu hotplug handlers. + */ +static DEVICE_ATTR_RO(cpumask); + +static struct attribute *perfmon_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group cpumask_attr_group = { + .attrs = perfmon_cpumask_attrs, +}; + +/* + * These attributes specify the bits in the config word that the perf + * syscall uses to pass the event ids and categories to perfmon. + */ +DEFINE_PERFMON_FORMAT_ATTR(event_category, "config:0-3"); +DEFINE_PERFMON_FORMAT_ATTR(event, "config:4-31"); + +/* + * These attributes specify the bits in the config1 word that the perf + * syscall uses to pass filter data to perfmon. + */ +DEFINE_PERFMON_FORMAT_ATTR(filter_wq, "config1:0-31"); +DEFINE_PERFMON_FORMAT_ATTR(filter_tc, "config1:32-39"); +DEFINE_PERFMON_FORMAT_ATTR(filter_pgsz, "config1:40-43"); +DEFINE_PERFMON_FORMAT_ATTR(filter_sz, "config1:44-51"); +DEFINE_PERFMON_FORMAT_ATTR(filter_eng, "config1:52-59"); + +#define PERFMON_FILTERS_START 2 +#define PERFMON_FILTERS_MAX 5 + +static struct attribute *perfmon_format_attrs[] = { + &format_attr_idxd_event_category.attr, + &format_attr_idxd_event.attr, + &format_attr_idxd_filter_wq.attr, + &format_attr_idxd_filter_tc.attr, + &format_attr_idxd_filter_pgsz.attr, + &format_attr_idxd_filter_sz.attr, + &format_attr_idxd_filter_eng.attr, + NULL, +}; + +static struct attribute_group perfmon_format_attr_group = { + .name = "format", + .attrs = perfmon_format_attrs, +}; + +static const struct attribute_group *perfmon_attr_groups[] = { + &perfmon_format_attr_group, + &cpumask_attr_group, + NULL, +}; + +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return cpumap_print_to_pagebuf(true, buf, &perfmon_dsa_cpu_mask); +} + +static bool is_idxd_event(struct idxd_pmu *idxd_pmu, struct perf_event *event) +{ + return &idxd_pmu->pmu == event->pmu; +} + +static int perfmon_collect_events(struct idxd_pmu *idxd_pmu, + struct perf_event *leader, + bool do_grp) +{ + struct perf_event *event; + int n, max_count; + + max_count = idxd_pmu->n_counters; + n = idxd_pmu->n_events; + + if (n >= max_count) + return -EINVAL; + + if (is_idxd_event(idxd_pmu, leader)) { + idxd_pmu->event_list[n] = leader; + idxd_pmu->event_list[n]->hw.idx = n; + n++; + } + + if (!do_grp) + return n; + + for_each_sibling_event(event, leader) { + if (!is_idxd_event(idxd_pmu, event) || + event->state <= PERF_EVENT_STATE_OFF) + continue; + + if (n >= max_count) + return -EINVAL; + + idxd_pmu->event_list[n] = event; + idxd_pmu->event_list[n]->hw.idx = n; + n++; + } + + return n; +} + +static void perfmon_assign_hw_event(struct idxd_pmu *idxd_pmu, + struct perf_event *event, int idx) +{ + struct idxd_device *idxd = idxd_pmu->idxd; + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = idx; + hwc->config_base = ioread64(CNTRCFG_REG(idxd, idx)); + hwc->event_base = ioread64(CNTRCFG_REG(idxd, idx)); +} + +static int perfmon_assign_event(struct idxd_pmu *idxd_pmu, + struct perf_event *event) +{ + int i; + + for (i = 0; i < IDXD_PMU_EVENT_MAX; i++) + if (!test_and_set_bit(i, idxd_pmu->used_mask)) + return i; + + return -EINVAL; +} + +/* + * Check whether there are enough counters to satisfy that all the + * events in the group can actually be scheduled at the same time. + * + * To do this, create a fake idxd_pmu object so the event collection + * and assignment functions can be used without affecting the internal + * state of the real idxd_pmu object. + */ +static int perfmon_validate_group(struct idxd_pmu *pmu, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct idxd_pmu *fake_pmu; + int i, ret = 0, n, idx; + + fake_pmu = kzalloc(sizeof(*fake_pmu), GFP_KERNEL); + if (!fake_pmu) + return -ENOMEM; + + fake_pmu->pmu.name = pmu->pmu.name; + fake_pmu->n_counters = pmu->n_counters; + + n = perfmon_collect_events(fake_pmu, leader, true); + if (n < 0) { + ret = n; + goto out; + } + + fake_pmu->n_events = n; + n = perfmon_collect_events(fake_pmu, event, false); + if (n < 0) { + ret = n; + goto out; + } + + fake_pmu->n_events = n; + + for (i = 0; i < n; i++) { + event = fake_pmu->event_list[i]; + + idx = perfmon_assign_event(fake_pmu, event); + if (idx < 0) { + ret = idx; + goto out; + } + } +out: + kfree(fake_pmu); + + return ret; +} + +static int perfmon_pmu_event_init(struct perf_event *event) +{ + struct idxd_device *idxd; + int ret = 0; + + idxd = event_to_idxd(event); + event->hw.idx = -1; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* sampling not supported */ + if (event->attr.sample_period) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + if (event->pmu != &idxd->idxd_pmu->pmu) + return -EINVAL; + + event->hw.event_base = ioread64(PERFMON_TABLE_OFFSET(idxd)); + event->cpu = idxd->idxd_pmu->cpu; + event->hw.config = event->attr.config; + + if (event->group_leader != event) + /* non-group events have themselves as leader */ + ret = perfmon_validate_group(idxd->idxd_pmu, event); + + return ret; +} + +static inline u64 perfmon_pmu_read_counter(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct idxd_device *idxd; + int cntr = hwc->idx; + + idxd = event_to_idxd(event); + + return ioread64(CNTRDATA_REG(idxd, cntr)); +} + +static void perfmon_pmu_event_update(struct perf_event *event) +{ + struct idxd_device *idxd = event_to_idxd(event); + u64 prev_raw_count, new_raw_count, delta, p, n; + int shift = 64 - idxd->idxd_pmu->counter_width; + struct hw_perf_event *hwc = &event->hw; + + prev_raw_count = local64_read(&hwc->prev_count); + do { + new_raw_count = perfmon_pmu_read_counter(event); + } while (!local64_try_cmpxchg(&hwc->prev_count, + &prev_raw_count, new_raw_count)); + n = (new_raw_count << shift); + p = (prev_raw_count << shift); + + delta = ((n - p) >> shift); + + local64_add(delta, &event->count); +} + +void perfmon_counter_overflow(struct idxd_device *idxd) +{ + int i, n_counters, max_loop = OVERFLOW_SIZE; + struct perf_event *event; + unsigned long ovfstatus; + + n_counters = min(idxd->idxd_pmu->n_counters, OVERFLOW_SIZE); + + ovfstatus = ioread32(OVFSTATUS_REG(idxd)); + + /* + * While updating overflowed counters, other counters behind + * them could overflow and be missed in a given pass. + * Normally this could happen at most n_counters times, but in + * theory a tiny counter width could result in continual + * overflows and endless looping. max_loop provides a + * failsafe in that highly unlikely case. + */ + while (ovfstatus && max_loop--) { + /* Figure out which counter(s) overflowed */ + for_each_set_bit(i, &ovfstatus, n_counters) { + unsigned long ovfstatus_clear = 0; + + /* Update event->count for overflowed counter */ + event = idxd->idxd_pmu->event_list[i]; + perfmon_pmu_event_update(event); + /* Writing 1 to OVFSTATUS bit clears it */ + set_bit(i, &ovfstatus_clear); + iowrite32(ovfstatus_clear, OVFSTATUS_REG(idxd)); + } + + ovfstatus = ioread32(OVFSTATUS_REG(idxd)); + } + + /* + * Should never happen. If so, it means a counter(s) looped + * around twice while this handler was running. + */ + WARN_ON_ONCE(ovfstatus); +} + +static inline void perfmon_reset_config(struct idxd_device *idxd) +{ + iowrite32(CONFIG_RESET, PERFRST_REG(idxd)); + iowrite32(0, OVFSTATUS_REG(idxd)); + iowrite32(0, PERFFRZ_REG(idxd)); +} + +static inline void perfmon_reset_counters(struct idxd_device *idxd) +{ + iowrite32(CNTR_RESET, PERFRST_REG(idxd)); +} + +static inline void perfmon_reset(struct idxd_device *idxd) +{ + perfmon_reset_config(idxd); + perfmon_reset_counters(idxd); +} + +static void perfmon_pmu_event_start(struct perf_event *event, int mode) +{ + u32 flt_wq, flt_tc, flt_pg_sz, flt_xfer_sz, flt_eng = 0; + u64 cntr_cfg, cntrdata, event_enc, event_cat = 0; + struct hw_perf_event *hwc = &event->hw; + union filter_cfg flt_cfg; + union event_cfg event_cfg; + struct idxd_device *idxd; + int cntr; + + idxd = event_to_idxd(event); + + event->hw.idx = hwc->idx; + cntr = hwc->idx; + + /* Obtain event category and event value from user space */ + event_cfg.val = event->attr.config; + flt_cfg.val = event->attr.config1; + event_cat = event_cfg.event_cat; + event_enc = event_cfg.event_enc; + + /* Obtain filter configuration from user space */ + flt_wq = flt_cfg.wq; + flt_tc = flt_cfg.tc; + flt_pg_sz = flt_cfg.pg_sz; + flt_xfer_sz = flt_cfg.xfer_sz; + flt_eng = flt_cfg.eng; + + if (flt_wq && test_bit(FLT_WQ, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_wq, FLTCFG_REG(idxd, cntr, FLT_WQ)); + if (flt_tc && test_bit(FLT_TC, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_tc, FLTCFG_REG(idxd, cntr, FLT_TC)); + if (flt_pg_sz && test_bit(FLT_PG_SZ, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_pg_sz, FLTCFG_REG(idxd, cntr, FLT_PG_SZ)); + if (flt_xfer_sz && test_bit(FLT_XFER_SZ, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_xfer_sz, FLTCFG_REG(idxd, cntr, FLT_XFER_SZ)); + if (flt_eng && test_bit(FLT_ENG, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_eng, FLTCFG_REG(idxd, cntr, FLT_ENG)); + + /* Read the start value */ + cntrdata = ioread64(CNTRDATA_REG(idxd, cntr)); + local64_set(&event->hw.prev_count, cntrdata); + + /* Set counter to event/category */ + cntr_cfg = event_cat << CNTRCFG_CATEGORY_SHIFT; + cntr_cfg |= event_enc << CNTRCFG_EVENT_SHIFT; + /* Set interrupt on overflow and counter enable bits */ + cntr_cfg |= (CNTRCFG_IRQ_OVERFLOW | CNTRCFG_ENABLE); + + iowrite64(cntr_cfg, CNTRCFG_REG(idxd, cntr)); +} + +static void perfmon_pmu_event_stop(struct perf_event *event, int mode) +{ + struct hw_perf_event *hwc = &event->hw; + struct idxd_device *idxd; + int i, cntr = hwc->idx; + u64 cntr_cfg; + + idxd = event_to_idxd(event); + + /* remove this event from event list */ + for (i = 0; i < idxd->idxd_pmu->n_events; i++) { + if (event != idxd->idxd_pmu->event_list[i]) + continue; + + for (++i; i < idxd->idxd_pmu->n_events; i++) + idxd->idxd_pmu->event_list[i - 1] = idxd->idxd_pmu->event_list[i]; + --idxd->idxd_pmu->n_events; + break; + } + + cntr_cfg = ioread64(CNTRCFG_REG(idxd, cntr)); + cntr_cfg &= ~CNTRCFG_ENABLE; + iowrite64(cntr_cfg, CNTRCFG_REG(idxd, cntr)); + + if (mode == PERF_EF_UPDATE) + perfmon_pmu_event_update(event); + + event->hw.idx = -1; + clear_bit(cntr, idxd->idxd_pmu->used_mask); +} + +static void perfmon_pmu_event_del(struct perf_event *event, int mode) +{ + perfmon_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int perfmon_pmu_event_add(struct perf_event *event, int flags) +{ + struct idxd_device *idxd = event_to_idxd(event); + struct idxd_pmu *idxd_pmu = idxd->idxd_pmu; + struct hw_perf_event *hwc = &event->hw; + int idx, n; + + n = perfmon_collect_events(idxd_pmu, event, false); + if (n < 0) + return n; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + idx = perfmon_assign_event(idxd_pmu, event); + if (idx < 0) + return idx; + + perfmon_assign_hw_event(idxd_pmu, event, idx); + + if (flags & PERF_EF_START) + perfmon_pmu_event_start(event, 0); + + idxd_pmu->n_events = n; + + return 0; +} + +static void enable_perfmon_pmu(struct idxd_device *idxd) +{ + iowrite32(COUNTER_UNFREEZE, PERFFRZ_REG(idxd)); +} + +static void disable_perfmon_pmu(struct idxd_device *idxd) +{ + iowrite32(COUNTER_FREEZE, PERFFRZ_REG(idxd)); +} + +static void perfmon_pmu_enable(struct pmu *pmu) +{ + struct idxd_device *idxd = pmu_to_idxd(pmu); + + enable_perfmon_pmu(idxd); +} + +static void perfmon_pmu_disable(struct pmu *pmu) +{ + struct idxd_device *idxd = pmu_to_idxd(pmu); + + disable_perfmon_pmu(idxd); +} + +static void skip_filter(int i) +{ + int j; + + for (j = i; j < PERFMON_FILTERS_MAX; j++) + perfmon_format_attrs[PERFMON_FILTERS_START + j] = + perfmon_format_attrs[PERFMON_FILTERS_START + j + 1]; +} + +static void idxd_pmu_init(struct idxd_pmu *idxd_pmu) +{ + int i; + + for (i = 0 ; i < PERFMON_FILTERS_MAX; i++) { + if (!test_bit(i, &idxd_pmu->supported_filters)) + skip_filter(i); + } + + idxd_pmu->pmu.name = idxd_pmu->name; + idxd_pmu->pmu.attr_groups = perfmon_attr_groups; + idxd_pmu->pmu.task_ctx_nr = perf_invalid_context; + idxd_pmu->pmu.event_init = perfmon_pmu_event_init; + idxd_pmu->pmu.pmu_enable = perfmon_pmu_enable, + idxd_pmu->pmu.pmu_disable = perfmon_pmu_disable, + idxd_pmu->pmu.add = perfmon_pmu_event_add; + idxd_pmu->pmu.del = perfmon_pmu_event_del; + idxd_pmu->pmu.start = perfmon_pmu_event_start; + idxd_pmu->pmu.stop = perfmon_pmu_event_stop; + idxd_pmu->pmu.read = perfmon_pmu_event_update; + idxd_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; + idxd_pmu->pmu.module = THIS_MODULE; +} + +void perfmon_pmu_remove(struct idxd_device *idxd) +{ + if (!idxd->idxd_pmu) + return; + + cpuhp_state_remove_instance(cpuhp_slot, &idxd->idxd_pmu->cpuhp_node); + perf_pmu_unregister(&idxd->idxd_pmu->pmu); + kfree(idxd->idxd_pmu); + idxd->idxd_pmu = NULL; +} + +static int perf_event_cpu_online(unsigned int cpu, struct hlist_node *node) +{ + struct idxd_pmu *idxd_pmu; + + idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node); + + /* select the first online CPU as the designated reader */ + if (cpumask_empty(&perfmon_dsa_cpu_mask)) { + cpumask_set_cpu(cpu, &perfmon_dsa_cpu_mask); + idxd_pmu->cpu = cpu; + } + + return 0; +} + +static int perf_event_cpu_offline(unsigned int cpu, struct hlist_node *node) +{ + struct idxd_pmu *idxd_pmu; + unsigned int target; + + idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node); + + if (!cpumask_test_and_clear_cpu(cpu, &perfmon_dsa_cpu_mask)) + return 0; + + target = cpumask_any_but(cpu_online_mask, cpu); + + /* migrate events if there is a valid target */ + if (target < nr_cpu_ids) + cpumask_set_cpu(target, &perfmon_dsa_cpu_mask); + else + target = -1; + + perf_pmu_migrate_context(&idxd_pmu->pmu, cpu, target); + + return 0; +} + +int perfmon_pmu_init(struct idxd_device *idxd) +{ + union idxd_perfcap perfcap; + struct idxd_pmu *idxd_pmu; + int rc = -ENODEV; + + /* + * perfmon module initialization failed, nothing to do + */ + if (!cpuhp_set_up) + return -ENODEV; + + /* + * If perfmon_offset or num_counters is 0, it means perfmon is + * not supported on this hardware. + */ + if (idxd->perfmon_offset == 0) + return -ENODEV; + + idxd_pmu = kzalloc(sizeof(*idxd_pmu), GFP_KERNEL); + if (!idxd_pmu) + return -ENOMEM; + + idxd_pmu->idxd = idxd; + idxd->idxd_pmu = idxd_pmu; + + if (idxd->data->type == IDXD_TYPE_DSA) { + rc = sprintf(idxd_pmu->name, "dsa%d", idxd->id); + if (rc < 0) + goto free; + } else if (idxd->data->type == IDXD_TYPE_IAX) { + rc = sprintf(idxd_pmu->name, "iax%d", idxd->id); + if (rc < 0) + goto free; + } else { + goto free; + } + + perfmon_reset(idxd); + + perfcap.bits = ioread64(PERFCAP_REG(idxd)); + + /* + * If total perf counter is 0, stop further registration. + * This is necessary in order to support driver running on + * guest which does not have pmon support. + */ + if (perfcap.num_perf_counter == 0) + goto free; + + /* A counter width of 0 means it can't count */ + if (perfcap.counter_width == 0) + goto free; + + /* Overflow interrupt and counter freeze support must be available */ + if (!perfcap.overflow_interrupt || !perfcap.counter_freeze) + goto free; + + /* Number of event categories cannot be 0 */ + if (perfcap.num_event_category == 0) + goto free; + + /* + * We don't support per-counter capabilities for now. + */ + if (perfcap.cap_per_counter) + goto free; + + idxd_pmu->n_event_categories = perfcap.num_event_category; + idxd_pmu->supported_event_categories = perfcap.global_event_category; + idxd_pmu->per_counter_caps_supported = perfcap.cap_per_counter; + + /* check filter capability. If 0, then filters are not supported */ + idxd_pmu->supported_filters = perfcap.filter; + if (perfcap.filter) + idxd_pmu->n_filters = hweight8(perfcap.filter); + + /* Store the total number of counters categories, and counter width */ + idxd_pmu->n_counters = perfcap.num_perf_counter; + idxd_pmu->counter_width = perfcap.counter_width; + + idxd_pmu_init(idxd_pmu); + + rc = perf_pmu_register(&idxd_pmu->pmu, idxd_pmu->name, -1); + if (rc) + goto free; + + rc = cpuhp_state_add_instance(cpuhp_slot, &idxd_pmu->cpuhp_node); + if (rc) { + perf_pmu_unregister(&idxd->idxd_pmu->pmu); + goto free; + } +out: + return rc; +free: + kfree(idxd_pmu); + idxd->idxd_pmu = NULL; + + goto out; +} + +void __init perfmon_init(void) +{ + int rc = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "driver/dma/idxd/perf:online", + perf_event_cpu_online, + perf_event_cpu_offline); + if (WARN_ON(rc < 0)) + return; + + cpuhp_slot = rc; + cpuhp_set_up = true; +} + +void __exit perfmon_exit(void) +{ + if (cpuhp_set_up) + cpuhp_remove_multi_state(cpuhp_slot); +} diff --git a/drivers/dma/idxd/perfmon.h b/drivers/dma/idxd/perfmon.h new file mode 100644 index 0000000000..9a081a1bc6 --- /dev/null +++ b/drivers/dma/idxd/perfmon.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2020 Intel Corporation. All rights rsvd. */ + +#ifndef _PERFMON_H_ +#define _PERFMON_H_ + +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/sbitmap.h> +#include <linux/dmaengine.h> +#include <linux/percpu-rwsem.h> +#include <linux/wait.h> +#include <linux/cdev.h> +#include <linux/uuid.h> +#include <linux/idxd.h> +#include <linux/perf_event.h> +#include "registers.h" + +static inline struct idxd_pmu *event_to_pmu(struct perf_event *event) +{ + struct idxd_pmu *idxd_pmu; + struct pmu *pmu; + + pmu = event->pmu; + idxd_pmu = container_of(pmu, struct idxd_pmu, pmu); + + return idxd_pmu; +} + +static inline struct idxd_device *event_to_idxd(struct perf_event *event) +{ + struct idxd_pmu *idxd_pmu; + struct pmu *pmu; + + pmu = event->pmu; + idxd_pmu = container_of(pmu, struct idxd_pmu, pmu); + + return idxd_pmu->idxd; +} + +static inline struct idxd_device *pmu_to_idxd(struct pmu *pmu) +{ + struct idxd_pmu *idxd_pmu; + + idxd_pmu = container_of(pmu, struct idxd_pmu, pmu); + + return idxd_pmu->idxd; +} + +enum dsa_perf_events { + DSA_PERF_EVENT_WQ = 0, + DSA_PERF_EVENT_ENGINE, + DSA_PERF_EVENT_ADDR_TRANS, + DSA_PERF_EVENT_OP, + DSA_PERF_EVENT_COMPL, + DSA_PERF_EVENT_MAX, +}; + +enum filter_enc { + FLT_WQ = 0, + FLT_TC, + FLT_PG_SZ, + FLT_XFER_SZ, + FLT_ENG, + FLT_MAX, +}; + +#define CONFIG_RESET 0x0000000000000001 +#define CNTR_RESET 0x0000000000000002 +#define CNTR_ENABLE 0x0000000000000001 +#define INTR_OVFL 0x0000000000000002 + +#define COUNTER_FREEZE 0x00000000FFFFFFFF +#define COUNTER_UNFREEZE 0x0000000000000000 +#define OVERFLOW_SIZE 32 + +#define CNTRCFG_ENABLE BIT(0) +#define CNTRCFG_IRQ_OVERFLOW BIT(1) +#define CNTRCFG_CATEGORY_SHIFT 8 +#define CNTRCFG_EVENT_SHIFT 32 + +#define PERFMON_TABLE_OFFSET(_idxd) \ +({ \ + typeof(_idxd) __idxd = (_idxd); \ + ((__idxd)->reg_base + (__idxd)->perfmon_offset); \ +}) +#define PERFMON_REG_OFFSET(idxd, offset) \ + (PERFMON_TABLE_OFFSET(idxd) + (offset)) + +#define PERFCAP_REG(idxd) (PERFMON_REG_OFFSET(idxd, IDXD_PERFCAP_OFFSET)) +#define PERFRST_REG(idxd) (PERFMON_REG_OFFSET(idxd, IDXD_PERFRST_OFFSET)) +#define OVFSTATUS_REG(idxd) (PERFMON_REG_OFFSET(idxd, IDXD_OVFSTATUS_OFFSET)) +#define PERFFRZ_REG(idxd) (PERFMON_REG_OFFSET(idxd, IDXD_PERFFRZ_OFFSET)) + +#define FLTCFG_REG(idxd, cntr, flt) \ + (PERFMON_REG_OFFSET(idxd, IDXD_FLTCFG_OFFSET) + ((cntr) * 32) + ((flt) * 4)) + +#define CNTRCFG_REG(idxd, cntr) \ + (PERFMON_REG_OFFSET(idxd, IDXD_CNTRCFG_OFFSET) + ((cntr) * 8)) +#define CNTRDATA_REG(idxd, cntr) \ + (PERFMON_REG_OFFSET(idxd, IDXD_CNTRDATA_OFFSET) + ((cntr) * 8)) +#define CNTRCAP_REG(idxd, cntr) \ + (PERFMON_REG_OFFSET(idxd, IDXD_CNTRCAP_OFFSET) + ((cntr) * 8)) + +#define EVNTCAP_REG(idxd, category) \ + (PERFMON_REG_OFFSET(idxd, IDXD_EVNTCAP_OFFSET) + ((category) * 8)) + +#define DEFINE_PERFMON_FORMAT_ATTR(_name, _format) \ +static ssize_t __perfmon_idxd_##_name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_idxd_##_name = \ + __ATTR(_name, 0444, __perfmon_idxd_##_name##_show, NULL) + +#endif diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h new file mode 100644 index 0000000000..7b54a3939e --- /dev/null +++ b/drivers/dma/idxd/registers.h @@ -0,0 +1,632 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */ +#ifndef _IDXD_REGISTERS_H_ +#define _IDXD_REGISTERS_H_ + +#include <uapi/linux/idxd.h> + +/* PCI Config */ +#define PCI_DEVICE_ID_INTEL_DSA_SPR0 0x0b25 +#define PCI_DEVICE_ID_INTEL_IAX_SPR0 0x0cfe + +#define DEVICE_VERSION_1 0x100 +#define DEVICE_VERSION_2 0x200 + +#define IDXD_MMIO_BAR 0 +#define IDXD_WQ_BAR 2 +#define IDXD_PORTAL_SIZE PAGE_SIZE + +/* MMIO Device BAR0 Registers */ +#define IDXD_VER_OFFSET 0x00 +#define IDXD_VER_MAJOR_MASK 0xf0 +#define IDXD_VER_MINOR_MASK 0x0f +#define GET_IDXD_VER_MAJOR(x) (((x) & IDXD_VER_MAJOR_MASK) >> 4) +#define GET_IDXD_VER_MINOR(x) ((x) & IDXD_VER_MINOR_MASK) + +union gen_cap_reg { + struct { + u64 block_on_fault:1; + u64 overlap_copy:1; + u64 cache_control_mem:1; + u64 cache_control_cache:1; + u64 cmd_cap:1; + u64 rsvd:3; + u64 dest_readback:1; + u64 drain_readback:1; + u64 rsvd2:3; + u64 evl_support:2; + u64 batch_continuation:1; + u64 max_xfer_shift:5; + u64 max_batch_shift:4; + u64 max_ims_mult:6; + u64 config_en:1; + u64 rsvd3:32; + }; + u64 bits; +} __packed; +#define IDXD_GENCAP_OFFSET 0x10 + +union wq_cap_reg { + struct { + u64 total_wq_size:16; + u64 num_wqs:8; + u64 wqcfg_size:4; + u64 rsvd:20; + u64 shared_mode:1; + u64 dedicated_mode:1; + u64 wq_ats_support:1; + u64 priority:1; + u64 occupancy:1; + u64 occupancy_int:1; + u64 op_config:1; + u64 wq_prs_support:1; + u64 rsvd4:8; + }; + u64 bits; +} __packed; +#define IDXD_WQCAP_OFFSET 0x20 +#define IDXD_WQCFG_MIN 5 + +union group_cap_reg { + struct { + u64 num_groups:8; + u64 total_rdbufs:8; /* formerly total_tokens */ + u64 rdbuf_ctrl:1; /* formerly token_en */ + u64 rdbuf_limit:1; /* formerly token_limit */ + u64 progress_limit:1; /* descriptor and batch descriptor */ + u64 rsvd:45; + }; + u64 bits; +} __packed; +#define IDXD_GRPCAP_OFFSET 0x30 + +union engine_cap_reg { + struct { + u64 num_engines:8; + u64 rsvd:56; + }; + u64 bits; +} __packed; + +#define IDXD_ENGCAP_OFFSET 0x38 + +#define IDXD_OPCAP_NOOP 0x0001 +#define IDXD_OPCAP_BATCH 0x0002 +#define IDXD_OPCAP_MEMMOVE 0x0008 +struct opcap { + u64 bits[4]; +}; + +#define IDXD_MAX_OPCAP_BITS 256U + +#define IDXD_OPCAP_OFFSET 0x40 + +#define IDXD_TABLE_OFFSET 0x60 +union offsets_reg { + struct { + u64 grpcfg:16; + u64 wqcfg:16; + u64 msix_perm:16; + u64 ims:16; + u64 perfmon:16; + u64 rsvd:48; + }; + u64 bits[2]; +} __packed; + +#define IDXD_TABLE_MULT 0x100 + +#define IDXD_GENCFG_OFFSET 0x80 +union gencfg_reg { + struct { + u32 rdbuf_limit:8; + u32 rsvd:4; + u32 user_int_en:1; + u32 evl_en:1; + u32 rsvd2:18; + }; + u32 bits; +} __packed; + +#define IDXD_GENCTRL_OFFSET 0x88 +union genctrl_reg { + struct { + u32 softerr_int_en:1; + u32 halt_int_en:1; + u32 evl_int_en:1; + u32 rsvd:29; + }; + u32 bits; +} __packed; + +#define IDXD_GENSTATS_OFFSET 0x90 +union gensts_reg { + struct { + u32 state:2; + u32 reset_type:2; + u32 rsvd:28; + }; + u32 bits; +} __packed; + +enum idxd_device_status_state { + IDXD_DEVICE_STATE_DISABLED = 0, + IDXD_DEVICE_STATE_ENABLED, + IDXD_DEVICE_STATE_DRAIN, + IDXD_DEVICE_STATE_HALT, +}; + +enum idxd_device_reset_type { + IDXD_DEVICE_RESET_SOFTWARE = 0, + IDXD_DEVICE_RESET_FLR, + IDXD_DEVICE_RESET_WARM, + IDXD_DEVICE_RESET_COLD, +}; + +#define IDXD_INTCAUSE_OFFSET 0x98 +#define IDXD_INTC_ERR 0x01 +#define IDXD_INTC_CMD 0x02 +#define IDXD_INTC_OCCUPY 0x04 +#define IDXD_INTC_PERFMON_OVFL 0x08 +#define IDXD_INTC_HALT_STATE 0x10 +#define IDXD_INTC_EVL 0x20 +#define IDXD_INTC_INT_HANDLE_REVOKED 0x80000000 + +#define IDXD_CMD_OFFSET 0xa0 +union idxd_command_reg { + struct { + u32 operand:20; + u32 cmd:5; + u32 rsvd:6; + u32 int_req:1; + }; + u32 bits; +} __packed; + +enum idxd_cmd { + IDXD_CMD_ENABLE_DEVICE = 1, + IDXD_CMD_DISABLE_DEVICE, + IDXD_CMD_DRAIN_ALL, + IDXD_CMD_ABORT_ALL, + IDXD_CMD_RESET_DEVICE, + IDXD_CMD_ENABLE_WQ, + IDXD_CMD_DISABLE_WQ, + IDXD_CMD_DRAIN_WQ, + IDXD_CMD_ABORT_WQ, + IDXD_CMD_RESET_WQ, + IDXD_CMD_DRAIN_PASID, + IDXD_CMD_ABORT_PASID, + IDXD_CMD_REQUEST_INT_HANDLE, + IDXD_CMD_RELEASE_INT_HANDLE, +}; + +#define CMD_INT_HANDLE_IMS 0x10000 + +#define IDXD_CMDSTS_OFFSET 0xa8 +union cmdsts_reg { + struct { + u8 err; + u16 result; + u8 rsvd:7; + u8 active:1; + }; + u32 bits; +} __packed; +#define IDXD_CMDSTS_ACTIVE 0x80000000 +#define IDXD_CMDSTS_ERR_MASK 0xff +#define IDXD_CMDSTS_RES_SHIFT 8 + +enum idxd_cmdsts_err { + IDXD_CMDSTS_SUCCESS = 0, + IDXD_CMDSTS_INVAL_CMD, + IDXD_CMDSTS_INVAL_WQIDX, + IDXD_CMDSTS_HW_ERR, + /* enable device errors */ + IDXD_CMDSTS_ERR_DEV_ENABLED = 0x10, + IDXD_CMDSTS_ERR_CONFIG, + IDXD_CMDSTS_ERR_BUSMASTER_EN, + IDXD_CMDSTS_ERR_PASID_INVAL, + IDXD_CMDSTS_ERR_WQ_SIZE_ERANGE, + IDXD_CMDSTS_ERR_GRP_CONFIG, + IDXD_CMDSTS_ERR_GRP_CONFIG2, + IDXD_CMDSTS_ERR_GRP_CONFIG3, + IDXD_CMDSTS_ERR_GRP_CONFIG4, + /* enable wq errors */ + IDXD_CMDSTS_ERR_DEV_NOTEN = 0x20, + IDXD_CMDSTS_ERR_WQ_ENABLED, + IDXD_CMDSTS_ERR_WQ_SIZE, + IDXD_CMDSTS_ERR_WQ_PRIOR, + IDXD_CMDSTS_ERR_WQ_MODE, + IDXD_CMDSTS_ERR_BOF_EN, + IDXD_CMDSTS_ERR_PASID_EN, + IDXD_CMDSTS_ERR_MAX_BATCH_SIZE, + IDXD_CMDSTS_ERR_MAX_XFER_SIZE, + /* disable device errors */ + IDXD_CMDSTS_ERR_DIS_DEV_EN = 0x31, + /* disable WQ, drain WQ, abort WQ, reset WQ */ + IDXD_CMDSTS_ERR_DEV_NOT_EN, + /* request interrupt handle */ + IDXD_CMDSTS_ERR_INVAL_INT_IDX = 0x41, + IDXD_CMDSTS_ERR_NO_HANDLE, +}; + +#define IDXD_CMDCAP_OFFSET 0xb0 + +#define IDXD_SWERR_OFFSET 0xc0 +#define IDXD_SWERR_VALID 0x00000001 +#define IDXD_SWERR_OVERFLOW 0x00000002 +#define IDXD_SWERR_ACK (IDXD_SWERR_VALID | IDXD_SWERR_OVERFLOW) +union sw_err_reg { + struct { + u64 valid:1; + u64 overflow:1; + u64 desc_valid:1; + u64 wq_idx_valid:1; + u64 batch:1; + u64 fault_rw:1; + u64 priv:1; + u64 rsvd:1; + u64 error:8; + u64 wq_idx:8; + u64 rsvd2:8; + u64 operation:8; + u64 pasid:20; + u64 rsvd3:4; + + u64 batch_idx:16; + u64 rsvd4:16; + u64 invalid_flags:32; + + u64 fault_addr; + + u64 rsvd5; + }; + u64 bits[4]; +} __packed; + +union iaa_cap_reg { + struct { + u64 dec_aecs_format_ver:1; + u64 drop_init_bits:1; + u64 chaining:1; + u64 force_array_output_mod:1; + u64 load_part_aecs:1; + u64 comp_early_abort:1; + u64 nested_comp:1; + u64 diction_comp:1; + u64 header_gen:1; + u64 crypto_gcm:1; + u64 crypto_cfb:1; + u64 crypto_xts:1; + u64 rsvd:52; + }; + u64 bits; +} __packed; + +#define IDXD_IAACAP_OFFSET 0x180 + +#define IDXD_EVLCFG_OFFSET 0xe0 +union evlcfg_reg { + struct { + u64 pasid_en:1; + u64 priv:1; + u64 rsvd:10; + u64 base_addr:52; + + u64 size:16; + u64 pasid:20; + u64 rsvd2:28; + }; + u64 bits[2]; +} __packed; + +#define IDXD_EVL_SIZE_MIN 0x0040 +#define IDXD_EVL_SIZE_MAX 0xffff + +union msix_perm { + struct { + u32 rsvd:2; + u32 ignore:1; + u32 pasid_en:1; + u32 rsvd2:8; + u32 pasid:20; + }; + u32 bits; +} __packed; + +union group_flags { + struct { + u64 tc_a:3; + u64 tc_b:3; + u64 rsvd:1; + u64 use_rdbuf_limit:1; + u64 rdbufs_reserved:8; + u64 rsvd2:4; + u64 rdbufs_allowed:8; + u64 rsvd3:4; + u64 desc_progress_limit:2; + u64 rsvd4:2; + u64 batch_progress_limit:2; + u64 rsvd5:26; + }; + u64 bits; +} __packed; + +struct grpcfg { + u64 wqs[4]; + u64 engines; + union group_flags flags; +} __packed; + +union wqcfg { + struct { + /* bytes 0-3 */ + u16 wq_size; + u16 rsvd; + + /* bytes 4-7 */ + u16 wq_thresh; + u16 rsvd1; + + /* bytes 8-11 */ + u32 mode:1; /* shared or dedicated */ + u32 bof:1; /* block on fault */ + u32 wq_ats_disable:1; + u32 wq_prs_disable:1; + u32 priority:4; + u32 pasid:20; + u32 pasid_en:1; + u32 priv:1; + u32 rsvd3:2; + + /* bytes 12-15 */ + u32 max_xfer_shift:5; + u32 max_batch_shift:4; + u32 rsvd4:23; + + /* bytes 16-19 */ + u16 occupancy_inth; + u16 occupancy_table_sel:1; + u16 rsvd5:15; + + /* bytes 20-23 */ + u16 occupancy_limit; + u16 occupancy_int_en:1; + u16 rsvd6:15; + + /* bytes 24-27 */ + u16 occupancy; + u16 occupancy_int:1; + u16 rsvd7:12; + u16 mode_support:1; + u16 wq_state:2; + + /* bytes 28-31 */ + u32 rsvd8; + + /* bytes 32-63 */ + u64 op_config[4]; + }; + u32 bits[16]; +} __packed; + +#define WQCFG_PASID_IDX 2 +#define WQCFG_PRIVL_IDX 2 +#define WQCFG_OCCUP_IDX 6 + +#define WQCFG_OCCUP_MASK 0xffff + +/* + * This macro calculates the offset into the WQCFG register + * idxd - struct idxd * + * n - wq id + * ofs - the index of the 32b dword for the config register + * + * The WQCFG register block is divided into groups per each wq. The n index + * allows us to move to the register group that's for that particular wq. + * Each register is 32bits. The ofs gives us the number of register to access. + */ +#define WQCFG_OFFSET(_idxd_dev, n, ofs) \ +({\ + typeof(_idxd_dev) __idxd_dev = (_idxd_dev); \ + (__idxd_dev)->wqcfg_offset + (n) * (__idxd_dev)->wqcfg_size + sizeof(u32) * (ofs); \ +}) + +#define WQCFG_STRIDES(_idxd_dev) ((_idxd_dev)->wqcfg_size / sizeof(u32)) + +#define GRPCFG_SIZE 64 +#define GRPWQCFG_STRIDES 4 + +/* + * This macro calculates the offset into the GRPCFG register + * idxd - struct idxd * + * n - wq id + * ofs - the index of the 32b dword for the config register + * + * The WQCFG register block is divided into groups per each wq. The n index + * allows us to move to the register group that's for that particular wq. + * Each register is 32bits. The ofs gives us the number of register to access. + */ +#define GRPWQCFG_OFFSET(idxd_dev, n, ofs) ((idxd_dev)->grpcfg_offset +\ + (n) * GRPCFG_SIZE + sizeof(u64) * (ofs)) +#define GRPENGCFG_OFFSET(idxd_dev, n) ((idxd_dev)->grpcfg_offset + (n) * GRPCFG_SIZE + 32) +#define GRPFLGCFG_OFFSET(idxd_dev, n) ((idxd_dev)->grpcfg_offset + (n) * GRPCFG_SIZE + 40) + +/* Following is performance monitor registers */ +#define IDXD_PERFCAP_OFFSET 0x0 +union idxd_perfcap { + struct { + u64 num_perf_counter:6; + u64 rsvd1:2; + u64 counter_width:8; + u64 num_event_category:4; + u64 global_event_category:16; + u64 filter:8; + u64 rsvd2:8; + u64 cap_per_counter:1; + u64 writeable_counter:1; + u64 counter_freeze:1; + u64 overflow_interrupt:1; + u64 rsvd3:8; + }; + u64 bits; +} __packed; + +#define IDXD_EVNTCAP_OFFSET 0x80 +union idxd_evntcap { + struct { + u64 events:28; + u64 rsvd:36; + }; + u64 bits; +} __packed; + +struct idxd_event { + union { + struct { + u32 event_category:4; + u32 events:28; + }; + u32 val; + }; +} __packed; + +#define IDXD_CNTRCAP_OFFSET 0x800 +struct idxd_cntrcap { + union { + struct { + u32 counter_width:8; + u32 rsvd:20; + u32 num_events:4; + }; + u32 val; + }; + struct idxd_event events[]; +} __packed; + +#define IDXD_PERFRST_OFFSET 0x10 +union idxd_perfrst { + struct { + u32 perfrst_config:1; + u32 perfrst_counter:1; + u32 rsvd:30; + }; + u32 val; +} __packed; + +#define IDXD_OVFSTATUS_OFFSET 0x30 +#define IDXD_PERFFRZ_OFFSET 0x20 +#define IDXD_CNTRCFG_OFFSET 0x100 +union idxd_cntrcfg { + struct { + u64 enable:1; + u64 interrupt_ovf:1; + u64 global_freeze_ovf:1; + u64 rsvd1:5; + u64 event_category:4; + u64 rsvd2:20; + u64 events:28; + u64 rsvd3:4; + }; + u64 val; +} __packed; + +#define IDXD_FLTCFG_OFFSET 0x300 + +#define IDXD_CNTRDATA_OFFSET 0x200 +union idxd_cntrdata { + struct { + u64 event_count_value; + }; + u64 val; +} __packed; + +union event_cfg { + struct { + u64 event_cat:4; + u64 event_enc:28; + }; + u64 val; +} __packed; + +union filter_cfg { + struct { + u64 wq:32; + u64 tc:8; + u64 pg_sz:4; + u64 xfer_sz:8; + u64 eng:8; + }; + u64 val; +} __packed; + +#define IDXD_EVLSTATUS_OFFSET 0xf0 + +union evl_status_reg { + struct { + u32 head:16; + u32 rsvd:16; + u32 tail:16; + u32 rsvd2:14; + u32 int_pending:1; + u32 rsvd3:1; + }; + struct { + u32 bits_lower32; + u32 bits_upper32; + }; + u64 bits; +} __packed; + +#define IDXD_MAX_BATCH_IDENT 256 + +struct __evl_entry { + u64 rsvd:2; + u64 desc_valid:1; + u64 wq_idx_valid:1; + u64 batch:1; + u64 fault_rw:1; + u64 priv:1; + u64 err_info_valid:1; + u64 error:8; + u64 wq_idx:8; + u64 batch_id:8; + u64 operation:8; + u64 pasid:20; + u64 rsvd2:4; + + u16 batch_idx; + u16 rsvd3; + union { + /* Invalid Flags 0x11 */ + u32 invalid_flags; + /* Invalid Int Handle 0x19 */ + /* Page fault 0x1a */ + /* Page fault 0x06, 0x1f, only operand_id */ + /* Page fault before drain or in batch, 0x26, 0x27 */ + struct { + u16 int_handle; + u16 rci:1; + u16 ims:1; + u16 rcr:1; + u16 first_err_in_batch:1; + u16 rsvd4_2:9; + u16 operand_id:3; + }; + }; + u64 fault_addr; + u64 rsvd5; +} __packed; + +struct dsa_evl_entry { + struct __evl_entry e; + struct dsa_completion_record cr; +} __packed; + +struct iax_evl_entry { + struct __evl_entry e; + u64 rsvd[4]; + struct iax_completion_record cr; +} __packed; + +#endif diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c new file mode 100644 index 0000000000..3f922518e3 --- /dev/null +++ b/drivers/dma/idxd/submit.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <uapi/linux/idxd.h> +#include "idxd.h" +#include "registers.h" + +static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu) +{ + struct idxd_desc *desc; + struct idxd_device *idxd = wq->idxd; + + desc = wq->descs[idx]; + memset(desc->hw, 0, sizeof(struct dsa_hw_desc)); + memset(desc->completion, 0, idxd->data->compl_size); + desc->cpu = cpu; + + if (device_pasid_enabled(idxd)) + desc->hw->pasid = idxd->pasid; + + return desc; +} + +struct idxd_desc *idxd_alloc_desc(struct idxd_wq *wq, enum idxd_op_type optype) +{ + int cpu, idx; + struct idxd_device *idxd = wq->idxd; + DEFINE_SBQ_WAIT(wait); + struct sbq_wait_state *ws; + struct sbitmap_queue *sbq; + + if (idxd->state != IDXD_DEV_ENABLED) + return ERR_PTR(-EIO); + + sbq = &wq->sbq; + idx = sbitmap_queue_get(sbq, &cpu); + if (idx < 0) { + if (optype == IDXD_OP_NONBLOCK) + return ERR_PTR(-EAGAIN); + } else { + return __get_desc(wq, idx, cpu); + } + + ws = &sbq->ws[0]; + for (;;) { + sbitmap_prepare_to_wait(sbq, ws, &wait, TASK_INTERRUPTIBLE); + if (signal_pending_state(TASK_INTERRUPTIBLE, current)) + break; + idx = sbitmap_queue_get(sbq, &cpu); + if (idx >= 0) + break; + schedule(); + } + + sbitmap_finish_wait(sbq, ws, &wait); + if (idx < 0) + return ERR_PTR(-EAGAIN); + + return __get_desc(wq, idx, cpu); +} + +void idxd_free_desc(struct idxd_wq *wq, struct idxd_desc *desc) +{ + int cpu = desc->cpu; + + desc->cpu = -1; + sbitmap_queue_clear(&wq->sbq, desc->id, cpu); +} + +static struct idxd_desc *list_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie, + struct idxd_desc *desc) +{ + struct idxd_desc *d, *n; + + lockdep_assert_held(&ie->list_lock); + list_for_each_entry_safe(d, n, &ie->work_list, list) { + if (d == desc) { + list_del(&d->list); + return d; + } + } + + /* + * At this point, the desc needs to be aborted is held by the completion + * handler where it has taken it off the pending list but has not added to the + * work list. It will be cleaned up by the interrupt handler when it sees the + * IDXD_COMP_DESC_ABORT for completion status. + */ + return NULL; +} + +static void llist_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie, + struct idxd_desc *desc) +{ + struct idxd_desc *d, *t, *found = NULL; + struct llist_node *head; + LIST_HEAD(flist); + + desc->completion->status = IDXD_COMP_DESC_ABORT; + /* + * Grab the list lock so it will block the irq thread handler. This allows the + * abort code to locate the descriptor need to be aborted. + */ + spin_lock(&ie->list_lock); + head = llist_del_all(&ie->pending_llist); + if (head) { + llist_for_each_entry_safe(d, t, head, llnode) { + if (d == desc) { + found = desc; + continue; + } + + if (d->completion->status) + list_add_tail(&d->list, &flist); + else + list_add_tail(&d->list, &ie->work_list); + } + } + + if (!found) + found = list_abort_desc(wq, ie, desc); + spin_unlock(&ie->list_lock); + + if (found) + idxd_dma_complete_txd(found, IDXD_COMPLETE_ABORT, false); + + /* + * completing the descriptor will return desc to allocator and + * the desc can be acquired by a different process and the + * desc->list can be modified. Delete desc from list so the + * list trasversing does not get corrupted by the other process. + */ + list_for_each_entry_safe(d, t, &flist, list) { + list_del_init(&d->list); + idxd_dma_complete_txd(found, IDXD_COMPLETE_ABORT, true); + } +} + +/* + * ENQCMDS typically fail when the WQ is inactive or busy. On host submission, the driver + * has better control of number of descriptors being submitted to a shared wq by limiting + * the number of driver allocated descriptors to the wq size. However, when the swq is + * exported to a guest kernel, it may be shared with multiple guest kernels. This means + * the likelihood of getting busy returned on the swq when submitting goes significantly up. + * Having a tunable retry mechanism allows the driver to keep trying for a bit before giving + * up. The sysfs knob can be tuned by the system administrator. + */ +int idxd_enqcmds(struct idxd_wq *wq, void __iomem *portal, const void *desc) +{ + unsigned int retries = wq->enqcmds_retries; + int rc; + + do { + rc = enqcmds(portal, desc); + if (rc == 0) + break; + cpu_relax(); + } while (retries--); + + return rc; +} + +int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) +{ + struct idxd_device *idxd = wq->idxd; + struct idxd_irq_entry *ie = NULL; + u32 desc_flags = desc->hw->flags; + void __iomem *portal; + int rc; + + if (idxd->state != IDXD_DEV_ENABLED) + return -EIO; + + if (!percpu_ref_tryget_live(&wq->wq_active)) { + wait_for_completion(&wq->wq_resurrect); + if (!percpu_ref_tryget_live(&wq->wq_active)) + return -ENXIO; + } + + portal = idxd_wq_portal_addr(wq); + + /* + * Pending the descriptor to the lockless list for the irq_entry + * that we designated the descriptor to. + */ + if (desc_flags & IDXD_OP_FLAG_RCI) { + ie = &wq->ie; + desc->hw->int_handle = ie->int_handle; + llist_add(&desc->llnode, &ie->pending_llist); + } + + /* + * The wmb() flushes writes to coherent DMA data before + * possibly triggering a DMA read. The wmb() is necessary + * even on UP because the recipient is a device. + */ + wmb(); + + if (wq_dedicated(wq)) { + iosubmit_cmds512(portal, desc->hw, 1); + } else { + rc = idxd_enqcmds(wq, portal, desc->hw); + if (rc < 0) { + percpu_ref_put(&wq->wq_active); + /* abort operation frees the descriptor */ + if (ie) + llist_abort_desc(wq, ie, desc); + return rc; + } + } + + percpu_ref_put(&wq->wq_active); + return 0; +} diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c new file mode 100644 index 0000000000..7caba90d85 --- /dev/null +++ b/drivers/dma/idxd/sysfs.c @@ -0,0 +1,1934 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/device.h> +#include <linux/io-64-nonatomic-lo-hi.h> +#include <uapi/linux/idxd.h> +#include "registers.h" +#include "idxd.h" + +static char *idxd_wq_type_names[] = { + [IDXD_WQT_NONE] = "none", + [IDXD_WQT_KERNEL] = "kernel", + [IDXD_WQT_USER] = "user", +}; + +/* IDXD engine attributes */ +static ssize_t engine_group_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_engine *engine = confdev_to_engine(dev); + + if (engine->group) + return sysfs_emit(buf, "%d\n", engine->group->id); + else + return sysfs_emit(buf, "%d\n", -1); +} + +static ssize_t engine_group_id_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_engine *engine = confdev_to_engine(dev); + struct idxd_device *idxd = engine->idxd; + long id; + int rc; + struct idxd_group *prevg; + + rc = kstrtol(buf, 10, &id); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (id > idxd->max_groups - 1 || id < -1) + return -EINVAL; + + if (id == -1) { + if (engine->group) { + engine->group->num_engines--; + engine->group = NULL; + } + return count; + } + + prevg = engine->group; + + if (prevg) + prevg->num_engines--; + engine->group = idxd->groups[id]; + engine->group->num_engines++; + + return count; +} + +static struct device_attribute dev_attr_engine_group = + __ATTR(group_id, 0644, engine_group_id_show, + engine_group_id_store); + +static struct attribute *idxd_engine_attributes[] = { + &dev_attr_engine_group.attr, + NULL, +}; + +static const struct attribute_group idxd_engine_attribute_group = { + .attrs = idxd_engine_attributes, +}; + +static const struct attribute_group *idxd_engine_attribute_groups[] = { + &idxd_engine_attribute_group, + NULL, +}; + +static void idxd_conf_engine_release(struct device *dev) +{ + struct idxd_engine *engine = confdev_to_engine(dev); + + kfree(engine); +} + +struct device_type idxd_engine_device_type = { + .name = "engine", + .release = idxd_conf_engine_release, + .groups = idxd_engine_attribute_groups, +}; + +/* Group attributes */ + +static void idxd_set_free_rdbufs(struct idxd_device *idxd) +{ + int i, rdbufs; + + for (i = 0, rdbufs = 0; i < idxd->max_groups; i++) { + struct idxd_group *g = idxd->groups[i]; + + rdbufs += g->rdbufs_reserved; + } + + idxd->nr_rdbufs = idxd->max_rdbufs - rdbufs; +} + +static ssize_t group_read_buffers_reserved_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_group *group = confdev_to_group(dev); + + return sysfs_emit(buf, "%u\n", group->rdbufs_reserved); +} + +static ssize_t group_tokens_reserved_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + dev_warn_once(dev, "attribute deprecated, see read_buffers_reserved.\n"); + return group_read_buffers_reserved_show(dev, attr, buf); +} + +static ssize_t group_read_buffers_reserved_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_group *group = confdev_to_group(dev); + struct idxd_device *idxd = group->idxd; + unsigned long val; + int rc; + + rc = kstrtoul(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (idxd->data->type == IDXD_TYPE_IAX) + return -EOPNOTSUPP; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (val > idxd->max_rdbufs) + return -EINVAL; + + if (val > idxd->nr_rdbufs + group->rdbufs_reserved) + return -EINVAL; + + group->rdbufs_reserved = val; + idxd_set_free_rdbufs(idxd); + return count; +} + +static ssize_t group_tokens_reserved_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + dev_warn_once(dev, "attribute deprecated, see read_buffers_reserved.\n"); + return group_read_buffers_reserved_store(dev, attr, buf, count); +} + +static struct device_attribute dev_attr_group_tokens_reserved = + __ATTR(tokens_reserved, 0644, group_tokens_reserved_show, + group_tokens_reserved_store); + +static struct device_attribute dev_attr_group_read_buffers_reserved = + __ATTR(read_buffers_reserved, 0644, group_read_buffers_reserved_show, + group_read_buffers_reserved_store); + +static ssize_t group_read_buffers_allowed_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_group *group = confdev_to_group(dev); + + return sysfs_emit(buf, "%u\n", group->rdbufs_allowed); +} + +static ssize_t group_tokens_allowed_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + dev_warn_once(dev, "attribute deprecated, see read_buffers_allowed.\n"); + return group_read_buffers_allowed_show(dev, attr, buf); +} + +static ssize_t group_read_buffers_allowed_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_group *group = confdev_to_group(dev); + struct idxd_device *idxd = group->idxd; + unsigned long val; + int rc; + + rc = kstrtoul(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (idxd->data->type == IDXD_TYPE_IAX) + return -EOPNOTSUPP; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (val < 4 * group->num_engines || + val > group->rdbufs_reserved + idxd->nr_rdbufs) + return -EINVAL; + + group->rdbufs_allowed = val; + return count; +} + +static ssize_t group_tokens_allowed_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + dev_warn_once(dev, "attribute deprecated, see read_buffers_allowed.\n"); + return group_read_buffers_allowed_store(dev, attr, buf, count); +} + +static struct device_attribute dev_attr_group_tokens_allowed = + __ATTR(tokens_allowed, 0644, group_tokens_allowed_show, + group_tokens_allowed_store); + +static struct device_attribute dev_attr_group_read_buffers_allowed = + __ATTR(read_buffers_allowed, 0644, group_read_buffers_allowed_show, + group_read_buffers_allowed_store); + +static ssize_t group_use_read_buffer_limit_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_group *group = confdev_to_group(dev); + + return sysfs_emit(buf, "%u\n", group->use_rdbuf_limit); +} + +static ssize_t group_use_token_limit_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + dev_warn_once(dev, "attribute deprecated, see use_read_buffer_limit.\n"); + return group_use_read_buffer_limit_show(dev, attr, buf); +} + +static ssize_t group_use_read_buffer_limit_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_group *group = confdev_to_group(dev); + struct idxd_device *idxd = group->idxd; + unsigned long val; + int rc; + + rc = kstrtoul(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (idxd->data->type == IDXD_TYPE_IAX) + return -EOPNOTSUPP; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (idxd->rdbuf_limit == 0) + return -EPERM; + + group->use_rdbuf_limit = !!val; + return count; +} + +static ssize_t group_use_token_limit_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + dev_warn_once(dev, "attribute deprecated, see use_read_buffer_limit.\n"); + return group_use_read_buffer_limit_store(dev, attr, buf, count); +} + +static struct device_attribute dev_attr_group_use_token_limit = + __ATTR(use_token_limit, 0644, group_use_token_limit_show, + group_use_token_limit_store); + +static struct device_attribute dev_attr_group_use_read_buffer_limit = + __ATTR(use_read_buffer_limit, 0644, group_use_read_buffer_limit_show, + group_use_read_buffer_limit_store); + +static ssize_t group_engines_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_group *group = confdev_to_group(dev); + int i, rc = 0; + struct idxd_device *idxd = group->idxd; + + for (i = 0; i < idxd->max_engines; i++) { + struct idxd_engine *engine = idxd->engines[i]; + + if (!engine->group) + continue; + + if (engine->group->id == group->id) + rc += sysfs_emit_at(buf, rc, "engine%d.%d ", idxd->id, engine->id); + } + + if (!rc) + return 0; + rc--; + rc += sysfs_emit_at(buf, rc, "\n"); + + return rc; +} + +static struct device_attribute dev_attr_group_engines = + __ATTR(engines, 0444, group_engines_show, NULL); + +static ssize_t group_work_queues_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_group *group = confdev_to_group(dev); + int i, rc = 0; + struct idxd_device *idxd = group->idxd; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + if (!wq->group) + continue; + + if (wq->group->id == group->id) + rc += sysfs_emit_at(buf, rc, "wq%d.%d ", idxd->id, wq->id); + } + + if (!rc) + return 0; + rc--; + rc += sysfs_emit_at(buf, rc, "\n"); + + return rc; +} + +static struct device_attribute dev_attr_group_work_queues = + __ATTR(work_queues, 0444, group_work_queues_show, NULL); + +static ssize_t group_traffic_class_a_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_group *group = confdev_to_group(dev); + + return sysfs_emit(buf, "%d\n", group->tc_a); +} + +static ssize_t group_traffic_class_a_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_group *group = confdev_to_group(dev); + struct idxd_device *idxd = group->idxd; + long val; + int rc; + + rc = kstrtol(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (idxd->hw.version <= DEVICE_VERSION_2 && !tc_override) + return -EPERM; + + if (val < 0 || val > 7) + return -EINVAL; + + group->tc_a = val; + return count; +} + +static struct device_attribute dev_attr_group_traffic_class_a = + __ATTR(traffic_class_a, 0644, group_traffic_class_a_show, + group_traffic_class_a_store); + +static ssize_t group_traffic_class_b_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_group *group = confdev_to_group(dev); + + return sysfs_emit(buf, "%d\n", group->tc_b); +} + +static ssize_t group_traffic_class_b_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_group *group = confdev_to_group(dev); + struct idxd_device *idxd = group->idxd; + long val; + int rc; + + rc = kstrtol(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (idxd->hw.version <= DEVICE_VERSION_2 && !tc_override) + return -EPERM; + + if (val < 0 || val > 7) + return -EINVAL; + + group->tc_b = val; + return count; +} + +static struct device_attribute dev_attr_group_traffic_class_b = + __ATTR(traffic_class_b, 0644, group_traffic_class_b_show, + group_traffic_class_b_store); + +static ssize_t group_desc_progress_limit_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_group *group = confdev_to_group(dev); + + return sysfs_emit(buf, "%d\n", group->desc_progress_limit); +} + +static ssize_t group_desc_progress_limit_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_group *group = confdev_to_group(dev); + int val, rc; + + rc = kstrtoint(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (val & ~GENMASK(1, 0)) + return -EINVAL; + + group->desc_progress_limit = val; + return count; +} + +static struct device_attribute dev_attr_group_desc_progress_limit = + __ATTR(desc_progress_limit, 0644, group_desc_progress_limit_show, + group_desc_progress_limit_store); + +static ssize_t group_batch_progress_limit_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_group *group = confdev_to_group(dev); + + return sysfs_emit(buf, "%d\n", group->batch_progress_limit); +} + +static ssize_t group_batch_progress_limit_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_group *group = confdev_to_group(dev); + int val, rc; + + rc = kstrtoint(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (val & ~GENMASK(1, 0)) + return -EINVAL; + + group->batch_progress_limit = val; + return count; +} + +static struct device_attribute dev_attr_group_batch_progress_limit = + __ATTR(batch_progress_limit, 0644, group_batch_progress_limit_show, + group_batch_progress_limit_store); +static struct attribute *idxd_group_attributes[] = { + &dev_attr_group_work_queues.attr, + &dev_attr_group_engines.attr, + &dev_attr_group_use_token_limit.attr, + &dev_attr_group_use_read_buffer_limit.attr, + &dev_attr_group_tokens_allowed.attr, + &dev_attr_group_read_buffers_allowed.attr, + &dev_attr_group_tokens_reserved.attr, + &dev_attr_group_read_buffers_reserved.attr, + &dev_attr_group_traffic_class_a.attr, + &dev_attr_group_traffic_class_b.attr, + &dev_attr_group_desc_progress_limit.attr, + &dev_attr_group_batch_progress_limit.attr, + NULL, +}; + +static bool idxd_group_attr_progress_limit_invisible(struct attribute *attr, + struct idxd_device *idxd) +{ + return (attr == &dev_attr_group_desc_progress_limit.attr || + attr == &dev_attr_group_batch_progress_limit.attr) && + !idxd->hw.group_cap.progress_limit; +} + +static bool idxd_group_attr_read_buffers_invisible(struct attribute *attr, + struct idxd_device *idxd) +{ + /* + * Intel IAA does not support Read Buffer allocation control, + * make these attributes invisible. + */ + return (attr == &dev_attr_group_use_token_limit.attr || + attr == &dev_attr_group_use_read_buffer_limit.attr || + attr == &dev_attr_group_tokens_allowed.attr || + attr == &dev_attr_group_read_buffers_allowed.attr || + attr == &dev_attr_group_tokens_reserved.attr || + attr == &dev_attr_group_read_buffers_reserved.attr) && + idxd->data->type == IDXD_TYPE_IAX; +} + +static umode_t idxd_group_attr_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct idxd_group *group = confdev_to_group(dev); + struct idxd_device *idxd = group->idxd; + + if (idxd_group_attr_progress_limit_invisible(attr, idxd)) + return 0; + + if (idxd_group_attr_read_buffers_invisible(attr, idxd)) + return 0; + + return attr->mode; +} + +static const struct attribute_group idxd_group_attribute_group = { + .attrs = idxd_group_attributes, + .is_visible = idxd_group_attr_visible, +}; + +static const struct attribute_group *idxd_group_attribute_groups[] = { + &idxd_group_attribute_group, + NULL, +}; + +static void idxd_conf_group_release(struct device *dev) +{ + struct idxd_group *group = confdev_to_group(dev); + + kfree(group); +} + +struct device_type idxd_group_device_type = { + .name = "group", + .release = idxd_conf_group_release, + .groups = idxd_group_attribute_groups, +}; + +/* IDXD work queue attribs */ +static ssize_t wq_clients_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%d\n", wq->client_count); +} + +static struct device_attribute dev_attr_wq_clients = + __ATTR(clients, 0444, wq_clients_show, NULL); + +static ssize_t wq_state_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + switch (wq->state) { + case IDXD_WQ_DISABLED: + return sysfs_emit(buf, "disabled\n"); + case IDXD_WQ_ENABLED: + return sysfs_emit(buf, "enabled\n"); + } + + return sysfs_emit(buf, "unknown\n"); +} + +static struct device_attribute dev_attr_wq_state = + __ATTR(state, 0444, wq_state_show, NULL); + +static ssize_t wq_group_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + if (wq->group) + return sysfs_emit(buf, "%u\n", wq->group->id); + else + return sysfs_emit(buf, "-1\n"); +} + +static ssize_t wq_group_id_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + long id; + int rc; + struct idxd_group *prevg, *group; + + rc = kstrtol(buf, 10, &id); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (id > idxd->max_groups - 1 || id < -1) + return -EINVAL; + + if (id == -1) { + if (wq->group) { + wq->group->num_wqs--; + wq->group = NULL; + } + return count; + } + + group = idxd->groups[id]; + prevg = wq->group; + + if (prevg) + prevg->num_wqs--; + wq->group = group; + group->num_wqs++; + return count; +} + +static struct device_attribute dev_attr_wq_group_id = + __ATTR(group_id, 0644, wq_group_id_show, wq_group_id_store); + +static ssize_t wq_mode_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%s\n", wq_dedicated(wq) ? "dedicated" : "shared"); +} + +static ssize_t wq_mode_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (sysfs_streq(buf, "dedicated")) { + set_bit(WQ_FLAG_DEDICATED, &wq->flags); + wq->threshold = 0; + } else if (sysfs_streq(buf, "shared")) { + clear_bit(WQ_FLAG_DEDICATED, &wq->flags); + } else { + return -EINVAL; + } + + return count; +} + +static struct device_attribute dev_attr_wq_mode = + __ATTR(mode, 0644, wq_mode_show, wq_mode_store); + +static ssize_t wq_size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%u\n", wq->size); +} + +static int total_claimed_wq_size(struct idxd_device *idxd) +{ + int i; + int wq_size = 0; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + wq_size += wq->size; + } + + return wq_size; +} + +static ssize_t wq_size_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + unsigned long size; + struct idxd_device *idxd = wq->idxd; + int rc; + + rc = kstrtoul(buf, 10, &size); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (size + total_claimed_wq_size(idxd) - wq->size > idxd->max_wq_size) + return -EINVAL; + + wq->size = size; + return count; +} + +static struct device_attribute dev_attr_wq_size = + __ATTR(size, 0644, wq_size_show, wq_size_store); + +static ssize_t wq_priority_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%u\n", wq->priority); +} + +static ssize_t wq_priority_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + unsigned long prio; + struct idxd_device *idxd = wq->idxd; + int rc; + + rc = kstrtoul(buf, 10, &prio); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (prio > IDXD_MAX_PRIORITY) + return -EINVAL; + + wq->priority = prio; + return count; +} + +static struct device_attribute dev_attr_wq_priority = + __ATTR(priority, 0644, wq_priority_show, wq_priority_store); + +static ssize_t wq_block_on_fault_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%u\n", test_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags)); +} + +static ssize_t wq_block_on_fault_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + bool bof; + int rc; + + if (!idxd->hw.gen_cap.block_on_fault) + return -EOPNOTSUPP; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -ENXIO; + + rc = kstrtobool(buf, &bof); + if (rc < 0) + return rc; + + if (bof) { + if (test_bit(WQ_FLAG_PRS_DISABLE, &wq->flags)) + return -EOPNOTSUPP; + + set_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags); + } else { + clear_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags); + } + + return count; +} + +static struct device_attribute dev_attr_wq_block_on_fault = + __ATTR(block_on_fault, 0644, wq_block_on_fault_show, + wq_block_on_fault_store); + +static ssize_t wq_threshold_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%u\n", wq->threshold); +} + +static ssize_t wq_threshold_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + unsigned int val; + int rc; + + rc = kstrtouint(buf, 0, &val); + if (rc < 0) + return -EINVAL; + + if (val > wq->size || val <= 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -ENXIO; + + if (test_bit(WQ_FLAG_DEDICATED, &wq->flags)) + return -EINVAL; + + wq->threshold = val; + + return count; +} + +static struct device_attribute dev_attr_wq_threshold = + __ATTR(threshold, 0644, wq_threshold_show, wq_threshold_store); + +static ssize_t wq_type_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + switch (wq->type) { + case IDXD_WQT_KERNEL: + return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_KERNEL]); + case IDXD_WQT_USER: + return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_USER]); + case IDXD_WQT_NONE: + default: + return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_NONE]); + } + + return -EINVAL; +} + +static ssize_t wq_type_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + enum idxd_wq_type old_type; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + old_type = wq->type; + if (sysfs_streq(buf, idxd_wq_type_names[IDXD_WQT_NONE])) + wq->type = IDXD_WQT_NONE; + else if (sysfs_streq(buf, idxd_wq_type_names[IDXD_WQT_KERNEL])) + wq->type = IDXD_WQT_KERNEL; + else if (sysfs_streq(buf, idxd_wq_type_names[IDXD_WQT_USER])) + wq->type = IDXD_WQT_USER; + else + return -EINVAL; + + /* If we are changing queue type, clear the name */ + if (wq->type != old_type) + memset(wq->name, 0, WQ_NAME_SIZE + 1); + + return count; +} + +static struct device_attribute dev_attr_wq_type = + __ATTR(type, 0644, wq_type_show, wq_type_store); + +static ssize_t wq_name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%s\n", wq->name); +} + +static ssize_t wq_name_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + char *input, *pos; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (strlen(buf) > WQ_NAME_SIZE || strlen(buf) == 0) + return -EINVAL; + + input = kstrndup(buf, count, GFP_KERNEL); + if (!input) + return -ENOMEM; + + pos = strim(input); + memset(wq->name, 0, WQ_NAME_SIZE + 1); + sprintf(wq->name, "%s", pos); + kfree(input); + return count; +} + +static struct device_attribute dev_attr_wq_name = + __ATTR(name, 0644, wq_name_show, wq_name_store); + +static ssize_t wq_cdev_minor_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + int minor = -1; + + mutex_lock(&wq->wq_lock); + if (wq->idxd_cdev) + minor = wq->idxd_cdev->minor; + mutex_unlock(&wq->wq_lock); + + if (minor == -1) + return -ENXIO; + return sysfs_emit(buf, "%d\n", minor); +} + +static struct device_attribute dev_attr_wq_cdev_minor = + __ATTR(cdev_minor, 0444, wq_cdev_minor_show, NULL); + +static int __get_sysfs_u64(const char *buf, u64 *val) +{ + int rc; + + rc = kstrtou64(buf, 0, val); + if (rc < 0) + return -EINVAL; + + if (*val == 0) + return -EINVAL; + + *val = roundup_pow_of_two(*val); + return 0; +} + +static ssize_t wq_max_transfer_size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%llu\n", wq->max_xfer_bytes); +} + +static ssize_t wq_max_transfer_size_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + u64 xfer_size; + int rc; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + rc = __get_sysfs_u64(buf, &xfer_size); + if (rc < 0) + return rc; + + if (xfer_size > idxd->max_xfer_bytes) + return -EINVAL; + + wq->max_xfer_bytes = xfer_size; + + return count; +} + +static struct device_attribute dev_attr_wq_max_transfer_size = + __ATTR(max_transfer_size, 0644, + wq_max_transfer_size_show, wq_max_transfer_size_store); + +static ssize_t wq_max_batch_size_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%u\n", wq->max_batch_size); +} + +static ssize_t wq_max_batch_size_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + u64 batch_size; + int rc; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + rc = __get_sysfs_u64(buf, &batch_size); + if (rc < 0) + return rc; + + if (batch_size > idxd->max_batch_size) + return -EINVAL; + + idxd_wq_set_max_batch_size(idxd->data->type, wq, (u32)batch_size); + + return count; +} + +static struct device_attribute dev_attr_wq_max_batch_size = + __ATTR(max_batch_size, 0644, wq_max_batch_size_show, wq_max_batch_size_store); + +static ssize_t wq_ats_disable_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%u\n", test_bit(WQ_FLAG_ATS_DISABLE, &wq->flags)); +} + +static ssize_t wq_ats_disable_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + bool ats_dis; + int rc; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + rc = kstrtobool(buf, &ats_dis); + if (rc < 0) + return rc; + + if (ats_dis) + set_bit(WQ_FLAG_ATS_DISABLE, &wq->flags); + else + clear_bit(WQ_FLAG_ATS_DISABLE, &wq->flags); + + return count; +} + +static struct device_attribute dev_attr_wq_ats_disable = + __ATTR(ats_disable, 0644, wq_ats_disable_show, wq_ats_disable_store); + +static ssize_t wq_prs_disable_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%u\n", test_bit(WQ_FLAG_PRS_DISABLE, &wq->flags)); +} + +static ssize_t wq_prs_disable_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + bool prs_dis; + int rc; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + rc = kstrtobool(buf, &prs_dis); + if (rc < 0) + return rc; + + if (prs_dis) { + set_bit(WQ_FLAG_PRS_DISABLE, &wq->flags); + /* when PRS is disabled, BOF needs to be off as well */ + clear_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags); + } else { + clear_bit(WQ_FLAG_PRS_DISABLE, &wq->flags); + } + return count; +} + +static struct device_attribute dev_attr_wq_prs_disable = + __ATTR(prs_disable, 0644, wq_prs_disable_show, wq_prs_disable_store); + +static ssize_t wq_occupancy_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + u32 occup, offset; + + if (!idxd->hw.wq_cap.occupancy) + return -EOPNOTSUPP; + + offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_OCCUP_IDX); + occup = ioread32(idxd->reg_base + offset) & WQCFG_OCCUP_MASK; + + return sysfs_emit(buf, "%u\n", occup); +} + +static struct device_attribute dev_attr_wq_occupancy = + __ATTR(occupancy, 0444, wq_occupancy_show, NULL); + +static ssize_t wq_enqcmds_retries_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + if (wq_dedicated(wq)) + return -EOPNOTSUPP; + + return sysfs_emit(buf, "%u\n", wq->enqcmds_retries); +} + +static ssize_t wq_enqcmds_retries_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + int rc; + unsigned int retries; + + if (wq_dedicated(wq)) + return -EOPNOTSUPP; + + rc = kstrtouint(buf, 10, &retries); + if (rc < 0) + return rc; + + if (retries > IDXD_ENQCMDS_MAX_RETRIES) + retries = IDXD_ENQCMDS_MAX_RETRIES; + + wq->enqcmds_retries = retries; + return count; +} + +static struct device_attribute dev_attr_wq_enqcmds_retries = + __ATTR(enqcmds_retries, 0644, wq_enqcmds_retries_show, wq_enqcmds_retries_store); + +static ssize_t wq_op_config_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%*pb\n", IDXD_MAX_OPCAP_BITS, wq->opcap_bmap); +} + +static int idxd_verify_supported_opcap(struct idxd_device *idxd, unsigned long *opmask) +{ + int bit; + + /* + * The OPCAP is defined as 256 bits that represents each operation the device + * supports per bit. Iterate through all the bits and check if the input mask + * is set for bits that are not set in the OPCAP for the device. If no OPCAP + * bit is set and input mask has the bit set, then return error. + */ + for_each_set_bit(bit, opmask, IDXD_MAX_OPCAP_BITS) { + if (!test_bit(bit, idxd->opcap_bmap)) + return -EINVAL; + } + + return 0; +} + +static ssize_t wq_op_config_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + unsigned long *opmask; + int rc; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + opmask = bitmap_zalloc(IDXD_MAX_OPCAP_BITS, GFP_KERNEL); + if (!opmask) + return -ENOMEM; + + rc = bitmap_parse(buf, count, opmask, IDXD_MAX_OPCAP_BITS); + if (rc < 0) + goto err; + + rc = idxd_verify_supported_opcap(idxd, opmask); + if (rc < 0) + goto err; + + bitmap_copy(wq->opcap_bmap, opmask, IDXD_MAX_OPCAP_BITS); + + bitmap_free(opmask); + return count; + +err: + bitmap_free(opmask); + return rc; +} + +static struct device_attribute dev_attr_wq_op_config = + __ATTR(op_config, 0644, wq_op_config_show, wq_op_config_store); + +static struct attribute *idxd_wq_attributes[] = { + &dev_attr_wq_clients.attr, + &dev_attr_wq_state.attr, + &dev_attr_wq_group_id.attr, + &dev_attr_wq_mode.attr, + &dev_attr_wq_size.attr, + &dev_attr_wq_priority.attr, + &dev_attr_wq_block_on_fault.attr, + &dev_attr_wq_threshold.attr, + &dev_attr_wq_type.attr, + &dev_attr_wq_name.attr, + &dev_attr_wq_cdev_minor.attr, + &dev_attr_wq_max_transfer_size.attr, + &dev_attr_wq_max_batch_size.attr, + &dev_attr_wq_ats_disable.attr, + &dev_attr_wq_prs_disable.attr, + &dev_attr_wq_occupancy.attr, + &dev_attr_wq_enqcmds_retries.attr, + &dev_attr_wq_op_config.attr, + NULL, +}; + +/* A WQ attr is invisible if the feature is not supported in WQCAP. */ +#define idxd_wq_attr_invisible(name, cap_field, a, idxd) \ + ((a) == &dev_attr_wq_##name.attr && !(idxd)->hw.wq_cap.cap_field) + +static bool idxd_wq_attr_max_batch_size_invisible(struct attribute *attr, + struct idxd_device *idxd) +{ + /* Intel IAA does not support batch processing, make it invisible */ + return attr == &dev_attr_wq_max_batch_size.attr && + idxd->data->type == IDXD_TYPE_IAX; +} + +static umode_t idxd_wq_attr_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + + if (idxd_wq_attr_invisible(op_config, op_config, attr, idxd)) + return 0; + + if (idxd_wq_attr_max_batch_size_invisible(attr, idxd)) + return 0; + + if (idxd_wq_attr_invisible(prs_disable, wq_prs_support, attr, idxd)) + return 0; + + if (idxd_wq_attr_invisible(ats_disable, wq_ats_support, attr, idxd)) + return 0; + + return attr->mode; +} + +static const struct attribute_group idxd_wq_attribute_group = { + .attrs = idxd_wq_attributes, + .is_visible = idxd_wq_attr_visible, +}; + +static const struct attribute_group *idxd_wq_attribute_groups[] = { + &idxd_wq_attribute_group, + NULL, +}; + +static void idxd_conf_wq_release(struct device *dev) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + bitmap_free(wq->opcap_bmap); + kfree(wq->wqcfg); + xa_destroy(&wq->upasid_xa); + kfree(wq); +} + +struct device_type idxd_wq_device_type = { + .name = "wq", + .release = idxd_conf_wq_release, + .groups = idxd_wq_attribute_groups, +}; + +/* IDXD device attribs */ +static ssize_t version_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%#x\n", idxd->hw.version); +} +static DEVICE_ATTR_RO(version); + +static ssize_t max_work_queues_size_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", idxd->max_wq_size); +} +static DEVICE_ATTR_RO(max_work_queues_size); + +static ssize_t max_groups_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", idxd->max_groups); +} +static DEVICE_ATTR_RO(max_groups); + +static ssize_t max_work_queues_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", idxd->max_wqs); +} +static DEVICE_ATTR_RO(max_work_queues); + +static ssize_t max_engines_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", idxd->max_engines); +} +static DEVICE_ATTR_RO(max_engines); + +static ssize_t numa_node_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%d\n", dev_to_node(&idxd->pdev->dev)); +} +static DEVICE_ATTR_RO(numa_node); + +static ssize_t max_batch_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", idxd->max_batch_size); +} +static DEVICE_ATTR_RO(max_batch_size); + +static ssize_t max_transfer_size_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%llu\n", idxd->max_xfer_bytes); +} +static DEVICE_ATTR_RO(max_transfer_size); + +static ssize_t op_cap_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%*pb\n", IDXD_MAX_OPCAP_BITS, idxd->opcap_bmap); +} +static DEVICE_ATTR_RO(op_cap); + +static ssize_t gen_cap_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%#llx\n", idxd->hw.gen_cap.bits); +} +static DEVICE_ATTR_RO(gen_cap); + +static ssize_t configurable_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)); +} +static DEVICE_ATTR_RO(configurable); + +static ssize_t clients_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + int count = 0, i; + + spin_lock(&idxd->dev_lock); + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + count += wq->client_count; + } + spin_unlock(&idxd->dev_lock); + + return sysfs_emit(buf, "%d\n", count); +} +static DEVICE_ATTR_RO(clients); + +static ssize_t pasid_enabled_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", device_user_pasid_enabled(idxd)); +} +static DEVICE_ATTR_RO(pasid_enabled); + +static ssize_t state_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + switch (idxd->state) { + case IDXD_DEV_DISABLED: + return sysfs_emit(buf, "disabled\n"); + case IDXD_DEV_ENABLED: + return sysfs_emit(buf, "enabled\n"); + case IDXD_DEV_HALTED: + return sysfs_emit(buf, "halted\n"); + } + + return sysfs_emit(buf, "unknown\n"); +} +static DEVICE_ATTR_RO(state); + +static ssize_t errors_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + DECLARE_BITMAP(swerr_bmap, 256); + + bitmap_zero(swerr_bmap, 256); + spin_lock(&idxd->dev_lock); + multi_u64_to_bmap(swerr_bmap, &idxd->sw_err.bits[0], 4); + spin_unlock(&idxd->dev_lock); + return sysfs_emit(buf, "%*pb\n", 256, swerr_bmap); +} +static DEVICE_ATTR_RO(errors); + +static ssize_t max_read_buffers_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", idxd->max_rdbufs); +} + +static ssize_t max_tokens_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + dev_warn_once(dev, "attribute deprecated, see max_read_buffers.\n"); + return max_read_buffers_show(dev, attr, buf); +} + +static DEVICE_ATTR_RO(max_tokens); /* deprecated */ +static DEVICE_ATTR_RO(max_read_buffers); + +static ssize_t read_buffer_limit_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", idxd->rdbuf_limit); +} + +static ssize_t token_limit_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + dev_warn_once(dev, "attribute deprecated, see read_buffer_limit.\n"); + return read_buffer_limit_show(dev, attr, buf); +} + +static ssize_t read_buffer_limit_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + unsigned long val; + int rc; + + rc = kstrtoul(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (!idxd->hw.group_cap.rdbuf_limit) + return -EPERM; + + if (val > idxd->hw.group_cap.total_rdbufs) + return -EINVAL; + + idxd->rdbuf_limit = val; + return count; +} + +static ssize_t token_limit_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + dev_warn_once(dev, "attribute deprecated, see read_buffer_limit\n"); + return read_buffer_limit_store(dev, attr, buf, count); +} + +static DEVICE_ATTR_RW(token_limit); /* deprecated */ +static DEVICE_ATTR_RW(read_buffer_limit); + +static ssize_t cdev_major_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", idxd->major); +} +static DEVICE_ATTR_RO(cdev_major); + +static ssize_t cmd_status_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%#x\n", idxd->cmd_status); +} + +static ssize_t cmd_status_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + idxd->cmd_status = 0; + return count; +} +static DEVICE_ATTR_RW(cmd_status); + +static ssize_t iaa_cap_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + if (idxd->hw.version < DEVICE_VERSION_2) + return -EOPNOTSUPP; + + return sysfs_emit(buf, "%#llx\n", idxd->hw.iaa_cap.bits); +} +static DEVICE_ATTR_RO(iaa_cap); + +static ssize_t event_log_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + if (!idxd->evl) + return -EOPNOTSUPP; + + return sysfs_emit(buf, "%u\n", idxd->evl->size); +} + +static ssize_t event_log_size_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + unsigned long val; + int rc; + + if (!idxd->evl) + return -EOPNOTSUPP; + + rc = kstrtoul(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (val < IDXD_EVL_SIZE_MIN || val > IDXD_EVL_SIZE_MAX || + (val * evl_ent_size(idxd) > ULONG_MAX - idxd->evl->dma)) + return -EINVAL; + + idxd->evl->size = val; + return count; +} +static DEVICE_ATTR_RW(event_log_size); + +static bool idxd_device_attr_max_batch_size_invisible(struct attribute *attr, + struct idxd_device *idxd) +{ + /* Intel IAA does not support batch processing, make it invisible */ + return attr == &dev_attr_max_batch_size.attr && + idxd->data->type == IDXD_TYPE_IAX; +} + +static bool idxd_device_attr_read_buffers_invisible(struct attribute *attr, + struct idxd_device *idxd) +{ + /* + * Intel IAA does not support Read Buffer allocation control, + * make these attributes invisible. + */ + return (attr == &dev_attr_max_tokens.attr || + attr == &dev_attr_max_read_buffers.attr || + attr == &dev_attr_token_limit.attr || + attr == &dev_attr_read_buffer_limit.attr) && + idxd->data->type == IDXD_TYPE_IAX; +} + +static bool idxd_device_attr_iaa_cap_invisible(struct attribute *attr, + struct idxd_device *idxd) +{ + return attr == &dev_attr_iaa_cap.attr && + (idxd->data->type != IDXD_TYPE_IAX || + idxd->hw.version < DEVICE_VERSION_2); +} + +static bool idxd_device_attr_event_log_size_invisible(struct attribute *attr, + struct idxd_device *idxd) +{ + return (attr == &dev_attr_event_log_size.attr && + !idxd->hw.gen_cap.evl_support); +} + +static umode_t idxd_device_attr_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct idxd_device *idxd = confdev_to_idxd(dev); + + if (idxd_device_attr_max_batch_size_invisible(attr, idxd)) + return 0; + + if (idxd_device_attr_read_buffers_invisible(attr, idxd)) + return 0; + + if (idxd_device_attr_iaa_cap_invisible(attr, idxd)) + return 0; + + if (idxd_device_attr_event_log_size_invisible(attr, idxd)) + return 0; + + return attr->mode; +} + +static struct attribute *idxd_device_attributes[] = { + &dev_attr_version.attr, + &dev_attr_max_groups.attr, + &dev_attr_max_work_queues.attr, + &dev_attr_max_work_queues_size.attr, + &dev_attr_max_engines.attr, + &dev_attr_numa_node.attr, + &dev_attr_max_batch_size.attr, + &dev_attr_max_transfer_size.attr, + &dev_attr_op_cap.attr, + &dev_attr_gen_cap.attr, + &dev_attr_configurable.attr, + &dev_attr_clients.attr, + &dev_attr_pasid_enabled.attr, + &dev_attr_state.attr, + &dev_attr_errors.attr, + &dev_attr_max_tokens.attr, + &dev_attr_max_read_buffers.attr, + &dev_attr_token_limit.attr, + &dev_attr_read_buffer_limit.attr, + &dev_attr_cdev_major.attr, + &dev_attr_cmd_status.attr, + &dev_attr_iaa_cap.attr, + &dev_attr_event_log_size.attr, + NULL, +}; + +static const struct attribute_group idxd_device_attribute_group = { + .attrs = idxd_device_attributes, + .is_visible = idxd_device_attr_visible, +}; + +static const struct attribute_group *idxd_attribute_groups[] = { + &idxd_device_attribute_group, + NULL, +}; + +static void idxd_conf_device_release(struct device *dev) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + kfree(idxd->groups); + bitmap_free(idxd->wq_enable_map); + kfree(idxd->wqs); + kfree(idxd->engines); + kfree(idxd->evl); + kmem_cache_destroy(idxd->evl_cache); + ida_free(&idxd_ida, idxd->id); + bitmap_free(idxd->opcap_bmap); + kfree(idxd); +} + +struct device_type dsa_device_type = { + .name = "dsa", + .release = idxd_conf_device_release, + .groups = idxd_attribute_groups, +}; + +struct device_type iax_device_type = { + .name = "iax", + .release = idxd_conf_device_release, + .groups = idxd_attribute_groups, +}; + +static int idxd_register_engine_devices(struct idxd_device *idxd) +{ + struct idxd_engine *engine; + int i, j, rc; + + for (i = 0; i < idxd->max_engines; i++) { + engine = idxd->engines[i]; + rc = device_add(engine_confdev(engine)); + if (rc < 0) + goto cleanup; + } + + return 0; + +cleanup: + j = i - 1; + for (; i < idxd->max_engines; i++) { + engine = idxd->engines[i]; + put_device(engine_confdev(engine)); + } + + while (j--) { + engine = idxd->engines[j]; + device_unregister(engine_confdev(engine)); + } + return rc; +} + +static int idxd_register_group_devices(struct idxd_device *idxd) +{ + struct idxd_group *group; + int i, j, rc; + + for (i = 0; i < idxd->max_groups; i++) { + group = idxd->groups[i]; + rc = device_add(group_confdev(group)); + if (rc < 0) + goto cleanup; + } + + return 0; + +cleanup: + j = i - 1; + for (; i < idxd->max_groups; i++) { + group = idxd->groups[i]; + put_device(group_confdev(group)); + } + + while (j--) { + group = idxd->groups[j]; + device_unregister(group_confdev(group)); + } + return rc; +} + +static int idxd_register_wq_devices(struct idxd_device *idxd) +{ + struct idxd_wq *wq; + int i, rc, j; + + for (i = 0; i < idxd->max_wqs; i++) { + wq = idxd->wqs[i]; + rc = device_add(wq_confdev(wq)); + if (rc < 0) + goto cleanup; + } + + return 0; + +cleanup: + j = i - 1; + for (; i < idxd->max_wqs; i++) { + wq = idxd->wqs[i]; + put_device(wq_confdev(wq)); + } + + while (j--) { + wq = idxd->wqs[j]; + device_unregister(wq_confdev(wq)); + } + return rc; +} + +int idxd_register_devices(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + int rc, i; + + rc = device_add(idxd_confdev(idxd)); + if (rc < 0) + return rc; + + rc = idxd_register_wq_devices(idxd); + if (rc < 0) { + dev_dbg(dev, "WQ devices registering failed: %d\n", rc); + goto err_wq; + } + + rc = idxd_register_engine_devices(idxd); + if (rc < 0) { + dev_dbg(dev, "Engine devices registering failed: %d\n", rc); + goto err_engine; + } + + rc = idxd_register_group_devices(idxd); + if (rc < 0) { + dev_dbg(dev, "Group device registering failed: %d\n", rc); + goto err_group; + } + + return 0; + + err_group: + for (i = 0; i < idxd->max_engines; i++) + device_unregister(engine_confdev(idxd->engines[i])); + err_engine: + for (i = 0; i < idxd->max_wqs; i++) + device_unregister(wq_confdev(idxd->wqs[i])); + err_wq: + device_del(idxd_confdev(idxd)); + return rc; +} + +void idxd_unregister_devices(struct idxd_device *idxd) +{ + int i; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + device_unregister(wq_confdev(wq)); + } + + for (i = 0; i < idxd->max_engines; i++) { + struct idxd_engine *engine = idxd->engines[i]; + + device_unregister(engine_confdev(engine)); + } + + for (i = 0; i < idxd->max_groups; i++) { + struct idxd_group *group = idxd->groups[i]; + + device_unregister(group_confdev(group)); + } +} + +int idxd_register_bus_type(void) +{ + return bus_register(&dsa_bus_type); +} + +void idxd_unregister_bus_type(void) +{ + bus_unregister(&dsa_bus_type); +} |