diff options
Diffstat (limited to 'drivers/iommu/intel')
-rw-r--r-- | drivers/iommu/intel/Makefile | 2 | ||||
-rw-r--r-- | drivers/iommu/intel/cache.c | 420 | ||||
-rw-r--r-- | drivers/iommu/intel/debugfs.c | 7 | ||||
-rw-r--r-- | drivers/iommu/intel/dmar.c | 26 | ||||
-rw-r--r-- | drivers/iommu/intel/iommu.c | 374 | ||||
-rw-r--r-- | drivers/iommu/intel/iommu.h | 88 | ||||
-rw-r--r-- | drivers/iommu/intel/irq_remapping.c | 148 | ||||
-rw-r--r-- | drivers/iommu/intel/nested.c | 69 | ||||
-rw-r--r-- | drivers/iommu/intel/pasid.c | 18 | ||||
-rw-r--r-- | drivers/iommu/intel/perf.h | 1 | ||||
-rw-r--r-- | drivers/iommu/intel/svm.c | 383 | ||||
-rw-r--r-- | drivers/iommu/intel/trace.h | 103 |
12 files changed, 890 insertions, 749 deletions
diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile index 5402b699a1..c8beb02815 100644 --- a/drivers/iommu/intel/Makefile +++ b/drivers/iommu/intel/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DMAR_TABLE) += dmar.o -obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o +obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o cache.o obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o obj-$(CONFIG_DMAR_PERF) += perf.o obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c new file mode 100644 index 0000000000..44e92638c0 --- /dev/null +++ b/drivers/iommu/intel/cache.c @@ -0,0 +1,420 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * cache.c - Intel VT-d cache invalidation + * + * Copyright (C) 2024 Intel Corporation + * + * Author: Lu Baolu <baolu.lu@linux.intel.com> + */ + +#define pr_fmt(fmt) "DMAR: " fmt + +#include <linux/dmar.h> +#include <linux/iommu.h> +#include <linux/memory.h> +#include <linux/pci.h> +#include <linux/spinlock.h> + +#include "iommu.h" +#include "pasid.h" +#include "trace.h" + +/* Check if an existing cache tag can be reused for a new association. */ +static bool cache_tage_match(struct cache_tag *tag, u16 domain_id, + struct intel_iommu *iommu, struct device *dev, + ioasid_t pasid, enum cache_tag_type type) +{ + if (tag->type != type) + return false; + + if (tag->domain_id != domain_id || tag->pasid != pasid) + return false; + + if (type == CACHE_TAG_IOTLB || type == CACHE_TAG_NESTING_IOTLB) + return tag->iommu == iommu; + + if (type == CACHE_TAG_DEVTLB || type == CACHE_TAG_NESTING_DEVTLB) + return tag->dev == dev; + + return false; +} + +/* Assign a cache tag with specified type to domain. */ +static int cache_tag_assign(struct dmar_domain *domain, u16 did, + struct device *dev, ioasid_t pasid, + enum cache_tag_type type) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct cache_tag *tag, *temp; + unsigned long flags; + + tag = kzalloc(sizeof(*tag), GFP_KERNEL); + if (!tag) + return -ENOMEM; + + tag->type = type; + tag->iommu = iommu; + tag->domain_id = did; + tag->pasid = pasid; + tag->users = 1; + + if (type == CACHE_TAG_DEVTLB || type == CACHE_TAG_NESTING_DEVTLB) + tag->dev = dev; + else + tag->dev = iommu->iommu.dev; + + spin_lock_irqsave(&domain->cache_lock, flags); + list_for_each_entry(temp, &domain->cache_tags, node) { + if (cache_tage_match(temp, did, iommu, dev, pasid, type)) { + temp->users++; + spin_unlock_irqrestore(&domain->cache_lock, flags); + kfree(tag); + trace_cache_tag_assign(temp); + return 0; + } + } + list_add_tail(&tag->node, &domain->cache_tags); + spin_unlock_irqrestore(&domain->cache_lock, flags); + trace_cache_tag_assign(tag); + + return 0; +} + +/* Unassign a cache tag with specified type from domain. */ +static void cache_tag_unassign(struct dmar_domain *domain, u16 did, + struct device *dev, ioasid_t pasid, + enum cache_tag_type type) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct cache_tag *tag; + unsigned long flags; + + spin_lock_irqsave(&domain->cache_lock, flags); + list_for_each_entry(tag, &domain->cache_tags, node) { + if (cache_tage_match(tag, did, iommu, dev, pasid, type)) { + trace_cache_tag_unassign(tag); + if (--tag->users == 0) { + list_del(&tag->node); + kfree(tag); + } + break; + } + } + spin_unlock_irqrestore(&domain->cache_lock, flags); +} + +static int __cache_tag_assign_domain(struct dmar_domain *domain, u16 did, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + int ret; + + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_IOTLB); + if (ret || !info->ats_enabled) + return ret; + + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_DEVTLB); + if (ret) + cache_tag_unassign(domain, did, dev, pasid, CACHE_TAG_IOTLB); + + return ret; +} + +static void __cache_tag_unassign_domain(struct dmar_domain *domain, u16 did, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + cache_tag_unassign(domain, did, dev, pasid, CACHE_TAG_IOTLB); + + if (info->ats_enabled) + cache_tag_unassign(domain, did, dev, pasid, CACHE_TAG_DEVTLB); +} + +static int __cache_tag_assign_parent_domain(struct dmar_domain *domain, u16 did, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + int ret; + + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_NESTING_IOTLB); + if (ret || !info->ats_enabled) + return ret; + + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_NESTING_DEVTLB); + if (ret) + cache_tag_unassign(domain, did, dev, pasid, CACHE_TAG_NESTING_IOTLB); + + return ret; +} + +static void __cache_tag_unassign_parent_domain(struct dmar_domain *domain, u16 did, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + cache_tag_unassign(domain, did, dev, pasid, CACHE_TAG_NESTING_IOTLB); + + if (info->ats_enabled) + cache_tag_unassign(domain, did, dev, pasid, CACHE_TAG_NESTING_DEVTLB); +} + +static u16 domain_get_id_for_dev(struct dmar_domain *domain, struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + + /* + * The driver assigns different domain IDs for all domains except + * the SVA type. + */ + if (domain->domain.type == IOMMU_DOMAIN_SVA) + return FLPT_DEFAULT_DID; + + return domain_id_iommu(domain, iommu); +} + +/* + * Assign cache tags to a domain when it's associated with a device's + * PASID using a specific domain ID. + * + * On success (return value of 0), cache tags are created and added to the + * domain's cache tag list. On failure (negative return value), an error + * code is returned indicating the reason for the failure. + */ +int cache_tag_assign_domain(struct dmar_domain *domain, + struct device *dev, ioasid_t pasid) +{ + u16 did = domain_get_id_for_dev(domain, dev); + int ret; + + ret = __cache_tag_assign_domain(domain, did, dev, pasid); + if (ret || domain->domain.type != IOMMU_DOMAIN_NESTED) + return ret; + + ret = __cache_tag_assign_parent_domain(domain->s2_domain, did, dev, pasid); + if (ret) + __cache_tag_unassign_domain(domain, did, dev, pasid); + + return ret; +} + +/* + * Remove the cache tags associated with a device's PASID when the domain is + * detached from the device. + * + * The cache tags must be previously assigned to the domain by calling the + * assign interface. + */ +void cache_tag_unassign_domain(struct dmar_domain *domain, + struct device *dev, ioasid_t pasid) +{ + u16 did = domain_get_id_for_dev(domain, dev); + + __cache_tag_unassign_domain(domain, did, dev, pasid); + if (domain->domain.type == IOMMU_DOMAIN_NESTED) + __cache_tag_unassign_parent_domain(domain->s2_domain, did, dev, pasid); +} + +static unsigned long calculate_psi_aligned_address(unsigned long start, + unsigned long end, + unsigned long *_pages, + unsigned long *_mask) +{ + unsigned long pages = aligned_nrpages(start, end - start + 1); + unsigned long aligned_pages = __roundup_pow_of_two(pages); + unsigned long bitmask = aligned_pages - 1; + unsigned long mask = ilog2(aligned_pages); + unsigned long pfn = IOVA_PFN(start); + + /* + * PSI masks the low order bits of the base address. If the + * address isn't aligned to the mask, then compute a mask value + * needed to ensure the target range is flushed. + */ + if (unlikely(bitmask & pfn)) { + unsigned long end_pfn = pfn + pages - 1, shared_bits; + + /* + * Since end_pfn <= pfn + bitmask, the only way bits + * higher than bitmask can differ in pfn and end_pfn is + * by carrying. This means after masking out bitmask, + * high bits starting with the first set bit in + * shared_bits are all equal in both pfn and end_pfn. + */ + shared_bits = ~(pfn ^ end_pfn) & ~bitmask; + mask = shared_bits ? __ffs(shared_bits) : MAX_AGAW_PFN_WIDTH; + aligned_pages = 1UL << mask; + } + + *_pages = aligned_pages; + *_mask = mask; + + return ALIGN_DOWN(start, VTD_PAGE_SIZE << mask); +} + +/* + * Invalidates a range of IOVA from @start (inclusive) to @end (inclusive) + * when the memory mappings in the target domain have been modified. + */ +void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, + unsigned long end, int ih) +{ + unsigned long pages, mask, addr; + struct cache_tag *tag; + unsigned long flags; + + addr = calculate_psi_aligned_address(start, end, &pages, &mask); + + spin_lock_irqsave(&domain->cache_lock, flags); + list_for_each_entry(tag, &domain->cache_tags, node) { + struct intel_iommu *iommu = tag->iommu; + struct device_domain_info *info; + u16 sid; + + switch (tag->type) { + case CACHE_TAG_IOTLB: + case CACHE_TAG_NESTING_IOTLB: + if (domain->use_first_level) { + qi_flush_piotlb(iommu, tag->domain_id, + tag->pasid, addr, pages, ih); + } else { + /* + * Fallback to domain selective flush if no + * PSI support or the size is too big. + */ + if (!cap_pgsel_inv(iommu->cap) || + mask > cap_max_amask_val(iommu->cap)) + iommu->flush.flush_iotlb(iommu, tag->domain_id, + 0, 0, DMA_TLB_DSI_FLUSH); + else + iommu->flush.flush_iotlb(iommu, tag->domain_id, + addr | ih, mask, + DMA_TLB_PSI_FLUSH); + } + break; + case CACHE_TAG_NESTING_DEVTLB: + /* + * Address translation cache in device side caches the + * result of nested translation. There is no easy way + * to identify the exact set of nested translations + * affected by a change in S2. So just flush the entire + * device cache. + */ + addr = 0; + mask = MAX_AGAW_PFN_WIDTH; + fallthrough; + case CACHE_TAG_DEVTLB: + info = dev_iommu_priv_get(tag->dev); + sid = PCI_DEVID(info->bus, info->devfn); + + if (tag->pasid == IOMMU_NO_PASID) + qi_flush_dev_iotlb(iommu, sid, info->pfsid, + info->ats_qdep, addr, mask); + else + qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid, + tag->pasid, info->ats_qdep, + addr, mask); + + quirk_extra_dev_tlb_flush(info, addr, mask, tag->pasid, info->ats_qdep); + break; + } + + trace_cache_tag_flush_range(tag, start, end, addr, pages, mask); + } + spin_unlock_irqrestore(&domain->cache_lock, flags); +} + +/* + * Invalidates all ranges of IOVA when the memory mappings in the target + * domain have been modified. + */ +void cache_tag_flush_all(struct dmar_domain *domain) +{ + struct cache_tag *tag; + unsigned long flags; + + spin_lock_irqsave(&domain->cache_lock, flags); + list_for_each_entry(tag, &domain->cache_tags, node) { + struct intel_iommu *iommu = tag->iommu; + struct device_domain_info *info; + u16 sid; + + switch (tag->type) { + case CACHE_TAG_IOTLB: + case CACHE_TAG_NESTING_IOTLB: + if (domain->use_first_level) + qi_flush_piotlb(iommu, tag->domain_id, + tag->pasid, 0, -1, 0); + else + iommu->flush.flush_iotlb(iommu, tag->domain_id, + 0, 0, DMA_TLB_DSI_FLUSH); + break; + case CACHE_TAG_DEVTLB: + case CACHE_TAG_NESTING_DEVTLB: + info = dev_iommu_priv_get(tag->dev); + sid = PCI_DEVID(info->bus, info->devfn); + + qi_flush_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, + 0, MAX_AGAW_PFN_WIDTH); + quirk_extra_dev_tlb_flush(info, 0, MAX_AGAW_PFN_WIDTH, + IOMMU_NO_PASID, info->ats_qdep); + break; + } + + trace_cache_tag_flush_all(tag); + } + spin_unlock_irqrestore(&domain->cache_lock, flags); +} + +/* + * Invalidate a range of IOVA when new mappings are created in the target + * domain. + * + * - VT-d spec, Section 6.1 Caching Mode: When the CM field is reported as + * Set, any software updates to remapping structures other than first- + * stage mapping requires explicit invalidation of the caches. + * - VT-d spec, Section 6.8 Write Buffer Flushing: For hardware that requires + * write buffer flushing, software must explicitly perform write-buffer + * flushing, if cache invalidation is not required. + */ +void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, + unsigned long end) +{ + unsigned long pages, mask, addr; + struct cache_tag *tag; + unsigned long flags; + + addr = calculate_psi_aligned_address(start, end, &pages, &mask); + + spin_lock_irqsave(&domain->cache_lock, flags); + list_for_each_entry(tag, &domain->cache_tags, node) { + struct intel_iommu *iommu = tag->iommu; + + if (!cap_caching_mode(iommu->cap) || domain->use_first_level) { + iommu_flush_write_buffer(iommu); + continue; + } + + if (tag->type == CACHE_TAG_IOTLB || + tag->type == CACHE_TAG_NESTING_IOTLB) { + /* + * Fallback to domain selective flush if no + * PSI support or the size is too big. + */ + if (!cap_pgsel_inv(iommu->cap) || + mask > cap_max_amask_val(iommu->cap)) + iommu->flush.flush_iotlb(iommu, tag->domain_id, + 0, 0, DMA_TLB_DSI_FLUSH); + else + iommu->flush.flush_iotlb(iommu, tag->domain_id, + addr, mask, + DMA_TLB_PSI_FLUSH); + } + + trace_cache_tag_flush_range_np(tag, start, end, addr, pages, mask); + } + spin_unlock_irqrestore(&domain->cache_lock, flags); +} diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index 86b506af7d..affbf4a155 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -706,7 +706,6 @@ static ssize_t dmar_perf_latency_write(struct file *filp, dmar_latency_disable(iommu, DMAR_LATENCY_INV_IOTLB); dmar_latency_disable(iommu, DMAR_LATENCY_INV_DEVTLB); dmar_latency_disable(iommu, DMAR_LATENCY_INV_IEC); - dmar_latency_disable(iommu, DMAR_LATENCY_PRQ); } rcu_read_unlock(); break; @@ -728,12 +727,6 @@ static ssize_t dmar_perf_latency_write(struct file *filp, dmar_latency_enable(iommu, DMAR_LATENCY_INV_IEC); rcu_read_unlock(); break; - case 4: - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) - dmar_latency_enable(iommu, DMAR_LATENCY_PRQ); - rcu_read_unlock(); - break; default: return -EINVAL; } diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 36d7427b12..304e84949c 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -32,6 +32,7 @@ #include "iommu.h" #include "../irq_remapping.h" +#include "../iommu-pages.h" #include "perf.h" #include "trace.h" #include "perfmon.h" @@ -1067,7 +1068,6 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) goto error_free_seq_id; } - err = -EINVAL; if (!cap_sagaw(iommu->cap) && (!ecap_smts(iommu->ecap) || ecap_slts(iommu->ecap))) { pr_info("%s: No supported address widths. Not attempting DMA translation.\n", @@ -1187,7 +1187,7 @@ static void free_iommu(struct intel_iommu *iommu) } if (iommu->qi) { - free_page((unsigned long)iommu->qi->desc); + iommu_free_page(iommu->qi->desc); kfree(iommu->qi->desc_status); kfree(iommu->qi); } @@ -1755,7 +1755,8 @@ static void __dmar_enable_qi(struct intel_iommu *iommu) int dmar_enable_qi(struct intel_iommu *iommu) { struct q_inval *qi; - struct page *desc_page; + void *desc; + int order; if (!ecap_qis(iommu->ecap)) return -ENOENT; @@ -1776,19 +1777,19 @@ int dmar_enable_qi(struct intel_iommu *iommu) * Need two pages to accommodate 256 descriptors of 256 bits each * if the remapping hardware supports scalable mode translation. */ - desc_page = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO, - !!ecap_smts(iommu->ecap)); - if (!desc_page) { + order = ecap_smts(iommu->ecap) ? 1 : 0; + desc = iommu_alloc_pages_node(iommu->node, GFP_ATOMIC, order); + if (!desc) { kfree(qi); iommu->qi = NULL; return -ENOMEM; } - qi->desc = page_address(desc_page); + qi->desc = desc; qi->desc_status = kcalloc(QI_LENGTH, sizeof(int), GFP_ATOMIC); if (!qi->desc_status) { - free_page((unsigned long) qi->desc); + iommu_free_page(qi->desc); kfree(qi); iommu->qi = NULL; return -ENOMEM; @@ -2122,7 +2123,7 @@ int dmar_set_interrupt(struct intel_iommu *iommu) return ret; } -int __init enable_drhd_fault_handling(void) +int enable_drhd_fault_handling(unsigned int cpu) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; @@ -2132,7 +2133,12 @@ int __init enable_drhd_fault_handling(void) */ for_each_iommu(iommu, drhd) { u32 fault_status; - int ret = dmar_set_interrupt(iommu); + int ret; + + if (iommu->irq || iommu->node != cpu_to_node(cpu)) + continue; + + ret = dmar_set_interrupt(iommu); if (ret) { pr_err("DRHD %Lx: failed to enable fault, interrupt, ret %d\n", diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index e4a03588a8..f55ec1fd79 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -27,6 +27,7 @@ #include "iommu.h" #include "../dma-iommu.h" #include "../irq_remapping.h" +#include "../iommu-pages.h" #include "pasid.h" #include "cap_audit.h" #include "perfmon.h" @@ -54,11 +55,6 @@ __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) -/* IO virtual address start page frame number */ -#define IOVA_START_PFN (1) - -#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) - static void __init check_tylersburg_isoch(void); static int rwbf_quirk; @@ -297,22 +293,6 @@ static int __init intel_iommu_setup(char *str) } __setup("intel_iommu=", intel_iommu_setup); -void *alloc_pgtable_page(int node, gfp_t gfp) -{ - struct page *page; - void *vaddr = NULL; - - page = alloc_pages_node(node, gfp | __GFP_ZERO, 0); - if (page) - vaddr = page_address(page); - return vaddr; -} - -void free_pgtable_page(void *vaddr) -{ - free_page((unsigned long)vaddr); -} - static int domain_type_is_si(struct dmar_domain *domain) { return domain->domain.type == IOMMU_DOMAIN_IDENTITY; @@ -544,7 +524,7 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, if (!alloc) return NULL; - context = alloc_pgtable_page(iommu->node, GFP_ATOMIC); + context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC); if (!context) return NULL; @@ -718,17 +698,17 @@ static void free_context_table(struct intel_iommu *iommu) for (i = 0; i < ROOT_ENTRY_NR; i++) { context = iommu_context_addr(iommu, i, 0, 0); if (context) - free_pgtable_page(context); + iommu_free_page(context); if (!sm_supported(iommu)) continue; context = iommu_context_addr(iommu, i, 0x80, 0); if (context) - free_pgtable_page(context); + iommu_free_page(context); } - free_pgtable_page(iommu->root_entry); + iommu_free_page(iommu->root_entry); iommu->root_entry = NULL; } @@ -864,9 +844,9 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, break; if (!dma_pte_present(pte)) { - uint64_t pteval; + uint64_t pteval, tmp; - tmp_page = alloc_pgtable_page(domain->nid, gfp); + tmp_page = iommu_alloc_page_node(domain->nid, gfp); if (!tmp_page) return NULL; @@ -876,9 +856,10 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, if (domain->use_first_level) pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; - if (cmpxchg64(&pte->val, 0ULL, pteval)) + tmp = 0ULL; + if (!try_cmpxchg64(&pte->val, &tmp, pteval)) /* Someone else set it while we were thinking; use theirs. */ - free_pgtable_page(tmp_page); + iommu_free_page(tmp_page); else domain_flush_cache(domain, pte, sizeof(*pte)); } @@ -991,7 +972,7 @@ static void dma_pte_free_level(struct dmar_domain *domain, int level, last_pfn < level_pfn + level_size(level) - 1)) { dma_clear_pte(pte); domain_flush_cache(domain, pte, sizeof(*pte)); - free_pgtable_page(level_pte); + iommu_free_page(level_pte); } next: pfn += level_size(level); @@ -1015,7 +996,7 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain, /* free pgd */ if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { - free_pgtable_page(domain->pgd); + iommu_free_page(domain->pgd); domain->pgd = NULL; } } @@ -1117,7 +1098,7 @@ static int iommu_alloc_root_entry(struct intel_iommu *iommu) { struct root_entry *root; - root = alloc_pgtable_page(iommu->node, GFP_ATOMIC); + root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC); if (!root) { pr_err("Allocating root entry for %s failed\n", iommu->name); @@ -1393,197 +1374,9 @@ static void __iommu_flush_dev_iotlb(struct device_domain_info *info, quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep); } -static void iommu_flush_dev_iotlb(struct dmar_domain *domain, - u64 addr, unsigned mask) -{ - struct dev_pasid_info *dev_pasid; - struct device_domain_info *info; - unsigned long flags; - - if (!domain->has_iotlb_device) - return; - - spin_lock_irqsave(&domain->lock, flags); - list_for_each_entry(info, &domain->devices, link) - __iommu_flush_dev_iotlb(info, addr, mask); - - list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { - info = dev_iommu_priv_get(dev_pasid->dev); - - if (!info->ats_enabled) - continue; - - qi_flush_dev_iotlb_pasid(info->iommu, - PCI_DEVID(info->bus, info->devfn), - info->pfsid, dev_pasid->pasid, - info->ats_qdep, addr, - mask); - } - spin_unlock_irqrestore(&domain->lock, flags); -} - -static void domain_flush_pasid_iotlb(struct intel_iommu *iommu, - struct dmar_domain *domain, u64 addr, - unsigned long npages, bool ih) -{ - u16 did = domain_id_iommu(domain, iommu); - struct dev_pasid_info *dev_pasid; - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) - qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih); - - if (!list_empty(&domain->devices)) - qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih); - spin_unlock_irqrestore(&domain->lock, flags); -} - -static void __iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, - unsigned long pfn, unsigned int pages, - int ih) -{ - unsigned int aligned_pages = __roundup_pow_of_two(pages); - unsigned long bitmask = aligned_pages - 1; - unsigned int mask = ilog2(aligned_pages); - u64 addr = (u64)pfn << VTD_PAGE_SHIFT; - - /* - * PSI masks the low order bits of the base address. If the - * address isn't aligned to the mask, then compute a mask value - * needed to ensure the target range is flushed. - */ - if (unlikely(bitmask & pfn)) { - unsigned long end_pfn = pfn + pages - 1, shared_bits; - - /* - * Since end_pfn <= pfn + bitmask, the only way bits - * higher than bitmask can differ in pfn and end_pfn is - * by carrying. This means after masking out bitmask, - * high bits starting with the first set bit in - * shared_bits are all equal in both pfn and end_pfn. - */ - shared_bits = ~(pfn ^ end_pfn) & ~bitmask; - mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG; - } - - /* - * Fallback to domain selective flush if no PSI support or - * the size is too big. - */ - if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap)) - iommu->flush.flush_iotlb(iommu, did, 0, 0, - DMA_TLB_DSI_FLUSH); - else - iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, - DMA_TLB_PSI_FLUSH); -} - -static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, - struct dmar_domain *domain, - unsigned long pfn, unsigned int pages, - int ih, int map) -{ - unsigned int aligned_pages = __roundup_pow_of_two(pages); - unsigned int mask = ilog2(aligned_pages); - uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; - u16 did = domain_id_iommu(domain, iommu); - - if (WARN_ON(!pages)) - return; - - if (ih) - ih = 1 << 6; - - if (domain->use_first_level) - domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih); - else - __iommu_flush_iotlb_psi(iommu, did, pfn, pages, ih); - - /* - * In caching mode, changes of pages from non-present to present require - * flush. However, device IOTLB doesn't need to be flushed in this case. - */ - if (!cap_caching_mode(iommu->cap) || !map) - iommu_flush_dev_iotlb(domain, addr, mask); -} - -/* Notification for newly created mappings */ -static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain, - unsigned long pfn, unsigned int pages) -{ - /* - * It's a non-present to present mapping. Only flush if caching mode - * and second level. - */ - if (cap_caching_mode(iommu->cap) && !domain->use_first_level) - iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); - else - iommu_flush_write_buffer(iommu); -} - -/* - * Flush the relevant caches in nested translation if the domain - * also serves as a parent - */ -static void parent_domain_flush(struct dmar_domain *domain, - unsigned long pfn, - unsigned long pages, int ih) -{ - struct dmar_domain *s1_domain; - - spin_lock(&domain->s1_lock); - list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { - struct device_domain_info *device_info; - struct iommu_domain_info *info; - unsigned long flags; - unsigned long i; - - xa_for_each(&s1_domain->iommu_array, i, info) - __iommu_flush_iotlb_psi(info->iommu, info->did, - pfn, pages, ih); - - if (!s1_domain->has_iotlb_device) - continue; - - spin_lock_irqsave(&s1_domain->lock, flags); - list_for_each_entry(device_info, &s1_domain->devices, link) - /* - * Address translation cache in device side caches the - * result of nested translation. There is no easy way - * to identify the exact set of nested translations - * affected by a change in S2. So just flush the entire - * device cache. - */ - __iommu_flush_dev_iotlb(device_info, 0, - MAX_AGAW_PFN_WIDTH); - spin_unlock_irqrestore(&s1_domain->lock, flags); - } - spin_unlock(&domain->s1_lock); -} - static void intel_flush_iotlb_all(struct iommu_domain *domain) { - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct iommu_domain_info *info; - unsigned long idx; - - xa_for_each(&dmar_domain->iommu_array, idx, info) { - struct intel_iommu *iommu = info->iommu; - u16 did = domain_id_iommu(dmar_domain, iommu); - - if (dmar_domain->use_first_level) - domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0); - else - iommu->flush.flush_iotlb(iommu, did, 0, 0, - DMA_TLB_DSI_FLUSH); - - if (!cap_caching_mode(iommu->cap)) - iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH); - } - - if (dmar_domain->nested_parent) - parent_domain_flush(dmar_domain, 0, -1, 0); + cache_tag_flush_all(to_dmar_domain(domain)); } static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) @@ -1749,7 +1542,9 @@ static struct dmar_domain *alloc_domain(unsigned int type) domain->has_iotlb_device = false; INIT_LIST_HEAD(&domain->devices); INIT_LIST_HEAD(&domain->dev_pasids); + INIT_LIST_HEAD(&domain->cache_tags); spin_lock_init(&domain->lock); + spin_lock_init(&domain->cache_lock); xa_init(&domain->iommu_array); return domain; @@ -1761,6 +1556,9 @@ int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) unsigned long ndomains; int num, ret = -ENOSPC; + if (domain->domain.type == IOMMU_DOMAIN_SVA) + return 0; + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; @@ -1808,6 +1606,9 @@ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) { struct iommu_domain_info *info; + if (domain->domain.type == IOMMU_DOMAIN_SVA) + return; + spin_lock(&iommu->lock); info = xa_load(&domain->iommu_array, iommu->seq_id); if (--info->refcnt == 0) { @@ -1840,7 +1641,7 @@ static void domain_exit(struct dmar_domain *domain) LIST_HEAD(freelist); domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); - put_pages_list(&freelist); + iommu_put_pages_list(&freelist); } if (WARN_ON(!list_empty(&domain->devices))) @@ -1987,13 +1788,6 @@ domain_context_mapping(struct dmar_domain *domain, struct device *dev) domain_context_mapping_cb, domain); } -/* Returns a number of VTD pages, but aligned to MM page size */ -static unsigned long aligned_nrpages(unsigned long host_addr, size_t size) -{ - host_addr &= ~PAGE_MASK; - return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; -} - /* Return largest possible superpage level for a given mapping */ static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn, unsigned long phy_pfn, unsigned long pages) @@ -2030,9 +1824,7 @@ static void switch_to_super_page(struct dmar_domain *domain, unsigned long end_pfn, int level) { unsigned long lvl_pages = lvl_to_nr_pages(level); - struct iommu_domain_info *info; struct dma_pte *pte = NULL; - unsigned long i; while (start_pfn <= end_pfn) { if (!pte) @@ -2044,13 +1836,8 @@ static void switch_to_super_page(struct dmar_domain *domain, start_pfn + lvl_pages - 1, level + 1); - xa_for_each(&domain->iommu_array, i, info) - iommu_flush_iotlb_psi(info->iommu, domain, - start_pfn, lvl_pages, - 0, 0); - if (domain->nested_parent) - parent_domain_flush(domain, start_pfn, - lvl_pages, 0); + cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT, + end_pfn << VTD_PAGE_SHIFT, 0); } pte++; @@ -2127,8 +1914,8 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, /* We don't need lock here, nobody else * touches the iova range */ - tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); - if (tmp) { + tmp = 0ULL; + if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) { static int dumps = 5; pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", iov_pfn, tmp, (unsigned long long)pteval); @@ -2284,7 +2071,7 @@ static int __init si_domain_init(int hw) for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { ret = iommu_domain_identity_map(si_domain, mm_to_dma_pfn_start(start_pfn), - mm_to_dma_pfn_end(end_pfn)); + mm_to_dma_pfn_end(end_pfn-1)); if (ret) return ret; } @@ -2326,6 +2113,7 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, ret = domain_attach_iommu(domain, iommu); if (ret) return ret; + info->domain = domain; spin_lock_irqsave(&domain->lock, flags); list_add(&info->link, &domain->devices); @@ -2343,15 +2131,21 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, else ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID); - if (ret) { - device_block_translation(dev); - return ret; - } + if (ret) + goto out_block_translation; if (sm_supported(info->iommu) || !domain_type_is_si(info->domain)) iommu_enable_pci_caps(info); + ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID); + if (ret) + goto out_block_translation; + return 0; + +out_block_translation: + device_block_translation(dev); + return ret; } /** @@ -2493,7 +2287,7 @@ static int copy_context_table(struct intel_iommu *iommu, if (!old_ce) goto out; - new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL); + new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL); if (!new_ce) goto out_unmap; @@ -3407,19 +3201,10 @@ static int intel_iommu_memory_notifier(struct notifier_block *nb, case MEM_OFFLINE: case MEM_CANCEL_ONLINE: { - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; LIST_HEAD(freelist); domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); - - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) - iommu_flush_iotlb_psi(iommu, si_domain, - start_vpfn, mhp->nr_pages, - list_empty(&freelist), 0); - rcu_read_unlock(); - put_pages_list(&freelist); + iommu_put_pages_list(&freelist); } break; } @@ -3808,6 +3593,7 @@ void device_block_translation(struct device *dev) list_del(&info->link); spin_unlock_irqrestore(&info->domain->lock, flags); + cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); domain_detach_iommu(info->domain, iommu); info->domain = NULL; } @@ -3826,7 +3612,7 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) domain->max_addr = 0; /* always allocate the top pgd */ - domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC); + domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC); if (!domain->pgd) return -ENOMEM; domain_flush_cache(domain, domain->pgd, PAGE_SIZE); @@ -3875,8 +3661,6 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) return domain; case IOMMU_DOMAIN_IDENTITY: return &si_domain->domain; - case IOMMU_DOMAIN_SVA: - return intel_svm_domain_alloc(); default: return NULL; } @@ -3980,7 +3764,7 @@ int prepare_domain_attach_device(struct iommu_domain *domain, pte = dmar_domain->pgd; if (dma_pte_present(pte)) { dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); - free_pgtable_page(pte); + iommu_free_page(pte); } dmar_domain->agaw--; } @@ -4115,26 +3899,9 @@ static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, static void intel_iommu_tlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - unsigned long iova_pfn = IOVA_PFN(gather->start); - size_t size = gather->end - gather->start; - struct iommu_domain_info *info; - unsigned long start_pfn; - unsigned long nrpages; - unsigned long i; - - nrpages = aligned_nrpages(gather->start, size); - start_pfn = mm_to_dma_pfn_start(iova_pfn); - - xa_for_each(&dmar_domain->iommu_array, i, info) - iommu_flush_iotlb_psi(info->iommu, dmar_domain, - start_pfn, nrpages, - list_empty(&gather->freelist), 0); - - if (dmar_domain->nested_parent) - parent_domain_flush(dmar_domain, start_pfn, nrpages, - list_empty(&gather->freelist)); - put_pages_list(&gather->freelist); + cache_tag_flush_range(to_dmar_domain(domain), gather->start, + gather->end, list_empty(&gather->freelist)); + iommu_put_pages_list(&gather->freelist); } static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, @@ -4345,12 +4112,6 @@ static void intel_iommu_release_device(struct device *dev) set_dma_ops(dev, NULL); } -static void intel_iommu_probe_finalize(struct device *dev) -{ - set_dma_ops(dev, NULL); - iommu_setup_dma_ops(dev, 0, U64_MAX); -} - static void intel_iommu_get_resv_regions(struct device *device, struct list_head *head) { @@ -4572,41 +4333,20 @@ static bool risky_device(struct pci_dev *pdev) static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) { - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - unsigned long pages = aligned_nrpages(iova, size); - unsigned long pfn = iova >> VTD_PAGE_SHIFT; - struct iommu_domain_info *info; - unsigned long i; + cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1); - xa_for_each(&dmar_domain->iommu_array, i, info) - __mapping_notify_one(info->iommu, dmar_domain, pfn, pages); return 0; } -static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) +static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, + struct iommu_domain *domain) { struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct dev_pasid_info *curr, *dev_pasid = NULL; struct intel_iommu *iommu = info->iommu; - struct dmar_domain *dmar_domain; - struct iommu_domain *domain; unsigned long flags; - domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0); - if (WARN_ON_ONCE(!domain)) - goto out_tear_down; - - /* - * The SVA implementation needs to handle its own stuffs like the mm - * notification. Before consolidating that code into iommu core, let - * the intel sva code handle it. - */ - if (domain->type == IOMMU_DOMAIN_SVA) { - intel_svm_remove_dev_pasid(dev, pasid); - goto out_tear_down; - } - - dmar_domain = to_dmar_domain(domain); spin_lock_irqsave(&dmar_domain->lock, flags); list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { if (curr->dev == dev && curr->pasid == pasid) { @@ -4618,10 +4358,10 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) WARN_ON_ONCE(!dev_pasid); spin_unlock_irqrestore(&dmar_domain->lock, flags); + cache_tag_unassign_domain(dmar_domain, dev, pasid); domain_detach_iommu(dmar_domain, iommu); intel_iommu_debugfs_remove_dev_pasid(dev_pasid); kfree(dev_pasid); -out_tear_down: intel_pasid_tear_down_entry(iommu, dev, pasid, false); intel_drain_pasid_prq(dev, pasid); } @@ -4657,6 +4397,10 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, if (ret) goto out_free; + ret = cache_tag_assign_domain(dmar_domain, dev, pasid); + if (ret) + goto out_detach_iommu; + if (domain_type_is_si(dmar_domain)) ret = intel_pasid_setup_pass_through(iommu, dev, pasid); else if (dmar_domain->use_first_level) @@ -4666,7 +4410,7 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, ret = intel_pasid_setup_second_level(iommu, dmar_domain, dev, pasid); if (ret) - goto out_detach_iommu; + goto out_unassign_tag; dev_pasid->dev = dev; dev_pasid->pasid = pasid; @@ -4678,6 +4422,8 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, intel_iommu_debugfs_create_dev_pasid(dev_pasid); return 0; +out_unassign_tag: + cache_tag_unassign_domain(dmar_domain, dev, pasid); out_detach_iommu: domain_detach_iommu(dmar_domain, iommu); out_free: @@ -4834,8 +4580,8 @@ const struct iommu_ops intel_iommu_ops = { .hw_info = intel_iommu_hw_info, .domain_alloc = intel_iommu_domain_alloc, .domain_alloc_user = intel_iommu_domain_alloc_user, + .domain_alloc_sva = intel_svm_domain_alloc, .probe_device = intel_iommu_probe_device, - .probe_finalize = intel_iommu_probe_finalize, .release_device = intel_iommu_release_device, .get_resv_regions = intel_iommu_get_resv_regions, .device_group = intel_iommu_device_group, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 404d2476a8..eaf015b435 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -35,6 +35,8 @@ #define VTD_PAGE_MASK (((u64)-1) << VTD_PAGE_SHIFT) #define VTD_PAGE_ALIGN(addr) (((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK) +#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) + #define VTD_STRIDE_SHIFT (9) #define VTD_STRIDE_MASK (((u64)-1) << VTD_STRIDE_SHIFT) @@ -455,7 +457,6 @@ enum { /* Page group response descriptor QW0 */ #define QI_PGRP_PASID_P(p) (((u64)(p)) << 4) -#define QI_PGRP_PDP(p) (((u64)(p)) << 5) #define QI_PGRP_RESP_CODE(res) (((u64)(res)) << 12) #define QI_PGRP_DID(rid) (((u64)(rid)) << 16) #define QI_PGRP_PASID(pasid) (((u64)(pasid)) << 32) @@ -607,6 +608,9 @@ struct dmar_domain { struct list_head devices; /* all devices' list */ struct list_head dev_pasids; /* all attached pasids */ + spinlock_t cache_lock; /* Protect the cache tag list */ + struct list_head cache_tags; /* Cache tag list */ + int iommu_superpage;/* Level of superpages supported: 0 == 4KiB (no superpages), 1 == 2MiB, 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ @@ -644,6 +648,11 @@ struct dmar_domain { /* link to parent domain siblings */ struct list_head s2_link; }; + + /* SVA domain */ + struct { + struct mmu_notifier notifier; + }; }; struct iommu_domain domain; /* generic domain data structure for @@ -1038,6 +1047,19 @@ static inline void context_set_sm_pre(struct context_entry *context) context->lo |= BIT_ULL(4); } +/* Returns a number of VTD pages, but aligned to MM page size */ +static inline unsigned long aligned_nrpages(unsigned long host_addr, size_t size) +{ + host_addr &= ~PAGE_MASK; + return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; +} + +/* Return a size from number of VTD pages. */ +static inline unsigned long nrpages_to_size(unsigned long npages) +{ + return npages << VTD_PAGE_SHIFT; +} + /* Convert value to context PASID directory size field coding. */ #define context_pdts(pds) (((pds) & 0x7) << 9) @@ -1085,48 +1107,60 @@ void domain_update_iommu_cap(struct dmar_domain *domain); int dmar_ir_support(void); -void *alloc_pgtable_page(int node, gfp_t gfp); -void free_pgtable_page(void *vaddr); void iommu_flush_write_buffer(struct intel_iommu *iommu); struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, const struct iommu_user_data *user_data); struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid); +enum cache_tag_type { + CACHE_TAG_IOTLB, + CACHE_TAG_DEVTLB, + CACHE_TAG_NESTING_IOTLB, + CACHE_TAG_NESTING_DEVTLB, +}; + +struct cache_tag { + struct list_head node; + enum cache_tag_type type; + struct intel_iommu *iommu; + /* + * The @dev field represents the location of the cache. For IOTLB, it + * resides on the IOMMU hardware. @dev stores the device pointer to + * the IOMMU hardware. For DevTLB, it locates in the PCIe endpoint. + * @dev stores the device pointer to that endpoint. + */ + struct device *dev; + u16 domain_id; + ioasid_t pasid; + unsigned int users; +}; + +int cache_tag_assign_domain(struct dmar_domain *domain, + struct device *dev, ioasid_t pasid); +void cache_tag_unassign_domain(struct dmar_domain *domain, + struct device *dev, ioasid_t pasid); +void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, + unsigned long end, int ih); +void cache_tag_flush_all(struct dmar_domain *domain); +void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, + unsigned long end); + #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); int intel_svm_enable_prq(struct intel_iommu *iommu); int intel_svm_finish_prq(struct intel_iommu *iommu); void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, struct iommu_page_response *msg); -struct iommu_domain *intel_svm_domain_alloc(void); -void intel_svm_remove_dev_pasid(struct device *dev, ioasid_t pasid); +struct iommu_domain *intel_svm_domain_alloc(struct device *dev, + struct mm_struct *mm); void intel_drain_pasid_prq(struct device *dev, u32 pasid); - -struct intel_svm_dev { - struct list_head list; - struct rcu_head rcu; - struct device *dev; - struct intel_iommu *iommu; - u16 did; - u16 sid, qdep; -}; - -struct intel_svm { - struct mmu_notifier notifier; - struct mm_struct *mm; - u32 pasid; - struct list_head devs; -}; #else static inline void intel_svm_check(struct intel_iommu *iommu) {} static inline void intel_drain_pasid_prq(struct device *dev, u32 pasid) {} -static inline struct iommu_domain *intel_svm_domain_alloc(void) -{ - return NULL; -} - -static inline void intel_svm_remove_dev_pasid(struct device *dev, ioasid_t pasid) +static inline struct iommu_domain *intel_svm_domain_alloc(struct device *dev, + struct mm_struct *mm) { + return ERR_PTR(-ENODEV); } #endif diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 566297bc87..e4a7088667 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -19,9 +19,11 @@ #include <asm/cpu.h> #include <asm/irq_remapping.h> #include <asm/pci-direct.h> +#include <asm/posted_intr.h> #include "iommu.h" #include "../irq_remapping.h" +#include "../iommu-pages.h" #include "cap_audit.h" enum irq_mode { @@ -49,6 +51,7 @@ struct irq_2_iommu { u16 sub_handle; u8 irte_mask; enum irq_mode mode; + bool posted_msi; }; struct intel_ir_data { @@ -82,7 +85,7 @@ static const struct irq_domain_ops intel_ir_domain_ops; static void iommu_disable_irq_remapping(struct intel_iommu *iommu); static int __init parse_ioapics_under_ir(void); -static const struct msi_parent_ops dmar_msi_parent_ops, virt_dmar_msi_parent_ops; +static const struct msi_parent_ops dmar_msi_parent_ops; static bool ir_pre_enabled(struct intel_iommu *iommu) { @@ -527,7 +530,7 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) struct ir_table *ir_table; struct fwnode_handle *fn; unsigned long *bitmap; - struct page *pages; + void *ir_table_base; if (iommu->ir_table) return 0; @@ -536,9 +539,9 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) if (!ir_table) return -ENOMEM; - pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO, - INTR_REMAP_PAGE_ORDER); - if (!pages) { + ir_table_base = iommu_alloc_pages_node(iommu->node, GFP_KERNEL, + INTR_REMAP_PAGE_ORDER); + if (!ir_table_base) { pr_err("IR%d: failed to allocate pages of order %d\n", iommu->seq_id, INTR_REMAP_PAGE_ORDER); goto out_free_table; @@ -567,13 +570,9 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) irq_domain_update_bus_token(iommu->ir_domain, DOMAIN_BUS_DMAR); iommu->ir_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT | IRQ_DOMAIN_FLAG_ISOLATED_MSI; + iommu->ir_domain->msi_parent_ops = &dmar_msi_parent_ops; - if (cap_caching_mode(iommu->cap)) - iommu->ir_domain->msi_parent_ops = &virt_dmar_msi_parent_ops; - else - iommu->ir_domain->msi_parent_ops = &dmar_msi_parent_ops; - - ir_table->base = page_address(pages); + ir_table->base = ir_table_base; ir_table->bitmap = bitmap; iommu->ir_table = ir_table; @@ -622,7 +621,7 @@ out_free_fwnode: out_free_bitmap: bitmap_free(bitmap); out_free_pages: - __free_pages(pages, INTR_REMAP_PAGE_ORDER); + iommu_free_pages(ir_table_base, INTR_REMAP_PAGE_ORDER); out_free_table: kfree(ir_table); @@ -643,8 +642,7 @@ static void intel_teardown_irq_remapping(struct intel_iommu *iommu) irq_domain_free_fwnode(fn); iommu->ir_domain = NULL; } - free_pages((unsigned long)iommu->ir_table->base, - INTR_REMAP_PAGE_ORDER); + iommu_free_pages(iommu->ir_table->base, INTR_REMAP_PAGE_ORDER); bitmap_free(iommu->ir_table->bitmap); kfree(iommu->ir_table); iommu->ir_table = NULL; @@ -1118,6 +1116,14 @@ static void prepare_irte(struct irte *irte, int vector, unsigned int dest) irte->redir_hint = 1; } +static void prepare_irte_posted(struct irte *irte) +{ + memset(irte, 0, sizeof(*irte)); + + irte->present = 1; + irte->p_pst = 1; +} + struct irq_remap_ops intel_irq_remap_ops = { .prepare = intel_prepare_irq_remapping, .enable = intel_enable_irq_remapping, @@ -1126,6 +1132,47 @@ struct irq_remap_ops intel_irq_remap_ops = { .enable_faulting = enable_drhd_fault_handling, }; +#ifdef CONFIG_X86_POSTED_MSI + +static phys_addr_t get_pi_desc_addr(struct irq_data *irqd) +{ + int cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd)); + + if (WARN_ON(cpu >= nr_cpu_ids)) + return 0; + + return __pa(per_cpu_ptr(&posted_msi_pi_desc, cpu)); +} + +static void intel_ir_reconfigure_irte_posted(struct irq_data *irqd) +{ + struct intel_ir_data *ir_data = irqd->chip_data; + struct irte *irte = &ir_data->irte_entry; + struct irte irte_pi; + u64 pid_addr; + + pid_addr = get_pi_desc_addr(irqd); + + if (!pid_addr) { + pr_warn("Failed to setup IRQ %d for posted mode", irqd->irq); + return; + } + + memset(&irte_pi, 0, sizeof(irte_pi)); + + /* The shared IRTE already be set up as posted during alloc_irte */ + dmar_copy_shared_irte(&irte_pi, irte); + + irte_pi.pda_l = (pid_addr >> (32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT); + irte_pi.pda_h = (pid_addr >> 32) & ~(-1UL << PDA_HIGH_BIT); + + modify_irte(&ir_data->irq_2_iommu, &irte_pi); +} + +#else +static inline void intel_ir_reconfigure_irte_posted(struct irq_data *irqd) {} +#endif + static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force) { struct intel_ir_data *ir_data = irqd->chip_data; @@ -1139,8 +1186,9 @@ static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force) irte->vector = cfg->vector; irte->dest_id = IRTE_DEST(cfg->dest_apicid); - /* Update the hardware only if the interrupt is in remapped mode. */ - if (force || ir_data->irq_2_iommu.mode == IRQ_REMAPPING) + if (ir_data->irq_2_iommu.posted_msi) + intel_ir_reconfigure_irte_posted(irqd); + else if (force || ir_data->irq_2_iommu.mode == IRQ_REMAPPING) modify_irte(&ir_data->irq_2_iommu, irte); } @@ -1194,7 +1242,7 @@ static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info) struct intel_ir_data *ir_data = data->chip_data; struct vcpu_data *vcpu_pi_info = info; - /* stop posting interrupts, back to remapping mode */ + /* stop posting interrupts, back to the default mode */ if (!vcpu_pi_info) { modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry); } else { @@ -1233,6 +1281,49 @@ static struct irq_chip intel_ir_chip = { .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity, }; +/* + * With posted MSIs, all vectors are multiplexed into a single notification + * vector. Devices MSIs are then dispatched in a demux loop where + * EOIs can be coalesced as well. + * + * "INTEL-IR-POST" IRQ chip does not do EOI on ACK, thus the dummy irq_ack() + * function. Instead EOI is performed by the posted interrupt notification + * handler. + * + * For the example below, 3 MSIs are coalesced into one CPU notification. Only + * one apic_eoi() is needed. + * + * __sysvec_posted_msi_notification() + * irq_enter(); + * handle_edge_irq() + * irq_chip_ack_parent() + * dummy(); // No EOI + * handle_irq_event() + * driver_handler() + * handle_edge_irq() + * irq_chip_ack_parent() + * dummy(); // No EOI + * handle_irq_event() + * driver_handler() + * handle_edge_irq() + * irq_chip_ack_parent() + * dummy(); // No EOI + * handle_irq_event() + * driver_handler() + * apic_eoi() + * irq_exit() + */ + +static void dummy_ack(struct irq_data *d) { } + +static struct irq_chip intel_ir_chip_post_msi = { + .name = "INTEL-IR-POST", + .irq_ack = dummy_ack, + .irq_set_affinity = intel_ir_set_affinity, + .irq_compose_msi_msg = intel_ir_compose_msi_msg, + .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity, +}; + static void fill_msi_msg(struct msi_msg *msg, u32 index, u32 subhandle) { memset(msg, 0, sizeof(*msg)); @@ -1274,6 +1365,11 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, break; case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSIX: + if (posted_msi_supported()) { + prepare_irte_posted(irte); + data->irq_2_iommu.posted_msi = 1; + } + set_msi_sid(irte, pci_real_dma_dev(msi_desc_to_pci_dev(info->desc))); break; @@ -1361,7 +1457,12 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain, irq_data->hwirq = (index << 16) + i; irq_data->chip_data = ird; - irq_data->chip = &intel_ir_chip; + if (posted_msi_supported() && + ((info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) || + (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX))) + irq_data->chip = &intel_ir_chip_post_msi; + else + irq_data->chip = &intel_ir_chip; intel_irq_remapping_prepare_irte(ird, irq_cfg, info, index, i); irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT); } @@ -1421,20 +1522,11 @@ static const struct irq_domain_ops intel_ir_domain_ops = { }; static const struct msi_parent_ops dmar_msi_parent_ops = { - .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | - MSI_FLAG_MULTI_PCI_MSI | - MSI_FLAG_PCI_IMS, + .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | MSI_FLAG_MULTI_PCI_MSI, .prefix = "IR-", .init_dev_msi_info = msi_parent_init_dev_msi_info, }; -static const struct msi_parent_ops virt_dmar_msi_parent_ops = { - .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | - MSI_FLAG_MULTI_PCI_MSI, - .prefix = "vIR-", - .init_dev_msi_info = msi_parent_init_dev_msi_info, -}; - /* * Support of Interrupt Remapping Unit Hotplug */ diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index a7d68f3d51..16a2bcf5cf 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -52,13 +52,14 @@ static int intel_nested_attach_dev(struct iommu_domain *domain, return ret; } + ret = cache_tag_assign_domain(dmar_domain, dev, IOMMU_NO_PASID); + if (ret) + goto detach_iommu; + ret = intel_pasid_setup_nested(iommu, dev, IOMMU_NO_PASID, dmar_domain); - if (ret) { - domain_detach_iommu(dmar_domain, iommu); - dev_err_ratelimited(dev, "Failed to setup pasid entry\n"); - return ret; - } + if (ret) + goto unassign_tag; info->domain = dmar_domain; spin_lock_irqsave(&dmar_domain->lock, flags); @@ -68,6 +69,12 @@ static int intel_nested_attach_dev(struct iommu_domain *domain, domain_update_iotlb(dmar_domain); return 0; +unassign_tag: + cache_tag_unassign_domain(dmar_domain, dev, IOMMU_NO_PASID); +detach_iommu: + domain_detach_iommu(dmar_domain, iommu); + + return ret; } static void intel_nested_domain_free(struct iommu_domain *domain) @@ -81,50 +88,6 @@ static void intel_nested_domain_free(struct iommu_domain *domain) kfree(dmar_domain); } -static void nested_flush_dev_iotlb(struct dmar_domain *domain, u64 addr, - unsigned int mask) -{ - struct device_domain_info *info; - unsigned long flags; - u16 sid, qdep; - - spin_lock_irqsave(&domain->lock, flags); - list_for_each_entry(info, &domain->devices, link) { - if (!info->ats_enabled) - continue; - sid = info->bus << 8 | info->devfn; - qdep = info->ats_qdep; - qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, - qdep, addr, mask); - quirk_extra_dev_tlb_flush(info, addr, mask, - IOMMU_NO_PASID, qdep); - } - spin_unlock_irqrestore(&domain->lock, flags); -} - -static void intel_nested_flush_cache(struct dmar_domain *domain, u64 addr, - u64 npages, bool ih) -{ - struct iommu_domain_info *info; - unsigned int mask; - unsigned long i; - - xa_for_each(&domain->iommu_array, i, info) - qi_flush_piotlb(info->iommu, - domain_id_iommu(domain, info->iommu), - IOMMU_NO_PASID, addr, npages, ih); - - if (!domain->has_iotlb_device) - return; - - if (npages == U64_MAX) - mask = 64 - VTD_PAGE_SHIFT; - else - mask = ilog2(__roundup_pow_of_two(npages)); - - nested_flush_dev_iotlb(domain, addr, mask); -} - static int intel_nested_cache_invalidate_user(struct iommu_domain *domain, struct iommu_user_data_array *array) { @@ -157,9 +120,9 @@ static int intel_nested_cache_invalidate_user(struct iommu_domain *domain, break; } - intel_nested_flush_cache(dmar_domain, inv_entry.addr, - inv_entry.npages, - inv_entry.flags & IOMMU_VTD_INV_FLAGS_LEAF); + cache_tag_flush_range(dmar_domain, inv_entry.addr, + inv_entry.addr + nrpages_to_size(inv_entry.npages) - 1, + inv_entry.flags & IOMMU_VTD_INV_FLAGS_LEAF); processed++; } @@ -206,7 +169,9 @@ struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, domain->domain.type = IOMMU_DOMAIN_NESTED; INIT_LIST_HEAD(&domain->devices); INIT_LIST_HEAD(&domain->dev_pasids); + INIT_LIST_HEAD(&domain->cache_tags); spin_lock_init(&domain->lock); + spin_lock_init(&domain->cache_lock); xa_init(&domain->iommu_array); spin_lock(&s2_domain->s1_lock); diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 11f0b856d7..abce19e2ad 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -20,6 +20,7 @@ #include "iommu.h" #include "pasid.h" +#include "../iommu-pages.h" /* * Intel IOMMU system wide PASID name space: @@ -38,7 +39,7 @@ int intel_pasid_alloc_table(struct device *dev) { struct device_domain_info *info; struct pasid_table *pasid_table; - struct page *pages; + struct pasid_dir_entry *dir; u32 max_pasid = 0; int order, size; @@ -59,14 +60,13 @@ int intel_pasid_alloc_table(struct device *dev) size = max_pasid >> (PASID_PDE_SHIFT - 3); order = size ? get_order(size) : 0; - pages = alloc_pages_node(info->iommu->node, - GFP_KERNEL | __GFP_ZERO, order); - if (!pages) { + dir = iommu_alloc_pages_node(info->iommu->node, GFP_KERNEL, order); + if (!dir) { kfree(pasid_table); return -ENOMEM; } - pasid_table->table = page_address(pages); + pasid_table->table = dir; pasid_table->order = order; pasid_table->max_pasid = 1 << (order + PAGE_SHIFT + 3); info->pasid_table = pasid_table; @@ -97,10 +97,10 @@ void intel_pasid_free_table(struct device *dev) max_pde = pasid_table->max_pasid >> PASID_PDE_SHIFT; for (i = 0; i < max_pde; i++) { table = get_pasid_table_from_pde(&dir[i]); - free_pgtable_page(table); + iommu_free_page(table); } - free_pages((unsigned long)pasid_table->table, pasid_table->order); + iommu_free_pages(pasid_table->table, pasid_table->order); kfree(pasid_table); } @@ -146,7 +146,7 @@ static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid) retry: entries = get_pasid_table_from_pde(&dir[dir_index]); if (!entries) { - entries = alloc_pgtable_page(info->iommu->node, GFP_ATOMIC); + entries = iommu_alloc_page_node(info->iommu->node, GFP_ATOMIC); if (!entries) return NULL; @@ -158,7 +158,7 @@ retry: */ if (cmpxchg64(&dir[dir_index].val, 0ULL, (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) { - free_pgtable_page(entries); + iommu_free_page(entries); goto retry; } if (!ecap_coherent(info->iommu->ecap)) { diff --git a/drivers/iommu/intel/perf.h b/drivers/iommu/intel/perf.h index fd6db8049d..df9a36942d 100644 --- a/drivers/iommu/intel/perf.h +++ b/drivers/iommu/intel/perf.h @@ -11,7 +11,6 @@ enum latency_type { DMAR_LATENCY_INV_IOTLB = 0, DMAR_LATENCY_INV_DEVTLB, DMAR_LATENCY_INV_IEC, - DMAR_LATENCY_PRQ, DMAR_LATENCY_NUM }; diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index ee3b469e2d..0e3a9b38be 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -22,57 +22,22 @@ #include "iommu.h" #include "pasid.h" #include "perf.h" +#include "../iommu-pages.h" #include "trace.h" static irqreturn_t prq_event_thread(int irq, void *d); -static DEFINE_XARRAY_ALLOC(pasid_private_array); -static int pasid_private_add(ioasid_t pasid, void *priv) -{ - return xa_alloc(&pasid_private_array, &pasid, priv, - XA_LIMIT(pasid, pasid), GFP_ATOMIC); -} - -static void pasid_private_remove(ioasid_t pasid) -{ - xa_erase(&pasid_private_array, pasid); -} - -static void *pasid_private_find(ioasid_t pasid) -{ - return xa_load(&pasid_private_array, pasid); -} - -static struct intel_svm_dev * -svm_lookup_device_by_dev(struct intel_svm *svm, struct device *dev) -{ - struct intel_svm_dev *sdev = NULL, *t; - - rcu_read_lock(); - list_for_each_entry_rcu(t, &svm->devs, list) { - if (t->dev == dev) { - sdev = t; - break; - } - } - rcu_read_unlock(); - - return sdev; -} - int intel_svm_enable_prq(struct intel_iommu *iommu) { struct iopf_queue *iopfq; - struct page *pages; int irq, ret; - pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO, PRQ_ORDER); - if (!pages) { + iommu->prq = iommu_alloc_pages_node(iommu->node, GFP_KERNEL, PRQ_ORDER); + if (!iommu->prq) { pr_warn("IOMMU: %s: Failed to allocate page request queue\n", iommu->name); return -ENOMEM; } - iommu->prq = page_address(pages); irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu); if (irq <= 0) { @@ -117,7 +82,7 @@ free_hwirq: dmar_free_hwirq(irq); iommu->pr_irq = 0; free_prq: - free_pages((unsigned long)iommu->prq, PRQ_ORDER); + iommu_free_pages(iommu->prq, PRQ_ORDER); iommu->prq = NULL; return ret; @@ -140,7 +105,7 @@ int intel_svm_finish_prq(struct intel_iommu *iommu) iommu->iopf_queue = NULL; } - free_pages((unsigned long)iommu->prq, PRQ_ORDER); + iommu_free_pages(iommu->prq, PRQ_ORDER); iommu->prq = NULL; return 0; @@ -168,94 +133,32 @@ void intel_svm_check(struct intel_iommu *iommu) iommu->flags |= VTD_FLAG_SVM_CAPABLE; } -static void __flush_svm_range_dev(struct intel_svm *svm, - struct intel_svm_dev *sdev, - unsigned long address, - unsigned long pages, int ih) -{ - struct device_domain_info *info = dev_iommu_priv_get(sdev->dev); - - if (WARN_ON(!pages)) - return; - - qi_flush_piotlb(sdev->iommu, sdev->did, svm->pasid, address, pages, ih); - if (info->ats_enabled) { - qi_flush_dev_iotlb_pasid(sdev->iommu, sdev->sid, info->pfsid, - svm->pasid, sdev->qdep, address, - order_base_2(pages)); - quirk_extra_dev_tlb_flush(info, address, order_base_2(pages), - svm->pasid, sdev->qdep); - } -} - -static void intel_flush_svm_range_dev(struct intel_svm *svm, - struct intel_svm_dev *sdev, - unsigned long address, - unsigned long pages, int ih) -{ - unsigned long shift = ilog2(__roundup_pow_of_two(pages)); - unsigned long align = (1ULL << (VTD_PAGE_SHIFT + shift)); - unsigned long start = ALIGN_DOWN(address, align); - unsigned long end = ALIGN(address + (pages << VTD_PAGE_SHIFT), align); - - while (start < end) { - __flush_svm_range_dev(svm, sdev, start, align >> VTD_PAGE_SHIFT, ih); - start += align; - } -} - -static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address, - unsigned long pages, int ih) -{ - struct intel_svm_dev *sdev; - - rcu_read_lock(); - list_for_each_entry_rcu(sdev, &svm->devs, list) - intel_flush_svm_range_dev(svm, sdev, address, pages, ih); - rcu_read_unlock(); -} - -static void intel_flush_svm_all(struct intel_svm *svm) -{ - struct device_domain_info *info; - struct intel_svm_dev *sdev; - - rcu_read_lock(); - list_for_each_entry_rcu(sdev, &svm->devs, list) { - info = dev_iommu_priv_get(sdev->dev); - - qi_flush_piotlb(sdev->iommu, sdev->did, svm->pasid, 0, -1UL, 0); - if (info->ats_enabled) { - qi_flush_dev_iotlb_pasid(sdev->iommu, sdev->sid, info->pfsid, - svm->pasid, sdev->qdep, - 0, 64 - VTD_PAGE_SHIFT); - quirk_extra_dev_tlb_flush(info, 0, 64 - VTD_PAGE_SHIFT, - svm->pasid, sdev->qdep); - } - } - rcu_read_unlock(); -} - /* Pages have been freed at this point */ static void intel_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) { - struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); + struct dmar_domain *domain = container_of(mn, struct dmar_domain, notifier); - if (start == 0 && end == -1UL) { - intel_flush_svm_all(svm); + if (start == 0 && end == ULONG_MAX) { + cache_tag_flush_all(domain); return; } - intel_flush_svm_range(svm, start, - (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0); + /* + * The mm_types defines vm_end as the first byte after the end address, + * different from IOMMU subsystem using the last address of an address + * range. + */ + cache_tag_flush_range(domain, start, end - 1, 0); } static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) { - struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); - struct intel_svm_dev *sdev; + struct dmar_domain *domain = container_of(mn, struct dmar_domain, notifier); + struct dev_pasid_info *dev_pasid; + struct device_domain_info *info; + unsigned long flags; /* This might end up being called from exit_mmap(), *before* the page * tables are cleared. And __mmu_notifier_release() will delete us from @@ -269,157 +172,78 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) * page) so that we end up taking a fault that the hardware really * *has* to handle gracefully without affecting other processes. */ - rcu_read_lock(); - list_for_each_entry_rcu(sdev, &svm->devs, list) - intel_pasid_tear_down_entry(sdev->iommu, sdev->dev, - svm->pasid, true); - rcu_read_unlock(); + spin_lock_irqsave(&domain->lock, flags); + list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { + info = dev_iommu_priv_get(dev_pasid->dev); + intel_pasid_tear_down_entry(info->iommu, dev_pasid->dev, + dev_pasid->pasid, true); + } + spin_unlock_irqrestore(&domain->lock, flags); + +} +static void intel_mm_free_notifier(struct mmu_notifier *mn) +{ + kfree(container_of(mn, struct dmar_domain, notifier)); } static const struct mmu_notifier_ops intel_mmuops = { .release = intel_mm_release, .arch_invalidate_secondary_tlbs = intel_arch_invalidate_secondary_tlbs, + .free_notifier = intel_mm_free_notifier, }; -static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid, - struct intel_svm **rsvm, - struct intel_svm_dev **rsdev) -{ - struct intel_svm_dev *sdev = NULL; - struct intel_svm *svm; - - if (pasid == IOMMU_PASID_INVALID || pasid >= PASID_MAX) - return -EINVAL; - - svm = pasid_private_find(pasid); - if (IS_ERR(svm)) - return PTR_ERR(svm); - - if (!svm) - goto out; - - /* - * If we found svm for the PASID, there must be at least one device - * bond. - */ - if (WARN_ON(list_empty(&svm->devs))) - return -EINVAL; - sdev = svm_lookup_device_by_dev(svm, dev); - -out: - *rsvm = svm; - *rsdev = sdev; - - return 0; -} - static int intel_svm_set_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) { struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; struct mm_struct *mm = domain->mm; - struct intel_svm_dev *sdev; - struct intel_svm *svm; + struct dev_pasid_info *dev_pasid; unsigned long sflags; + unsigned long flags; int ret = 0; - svm = pasid_private_find(pasid); - if (!svm) { - svm = kzalloc(sizeof(*svm), GFP_KERNEL); - if (!svm) - return -ENOMEM; - - svm->pasid = pasid; - svm->mm = mm; - INIT_LIST_HEAD_RCU(&svm->devs); - - svm->notifier.ops = &intel_mmuops; - ret = mmu_notifier_register(&svm->notifier, mm); - if (ret) { - kfree(svm); - return ret; - } - - ret = pasid_private_add(svm->pasid, svm); - if (ret) { - mmu_notifier_unregister(&svm->notifier, mm); - kfree(svm); - return ret; - } - } + dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); + if (!dev_pasid) + return -ENOMEM; - sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); - if (!sdev) { - ret = -ENOMEM; - goto free_svm; - } + dev_pasid->dev = dev; + dev_pasid->pasid = pasid; - sdev->dev = dev; - sdev->iommu = iommu; - sdev->did = FLPT_DEFAULT_DID; - sdev->sid = PCI_DEVID(info->bus, info->devfn); - if (info->ats_enabled) { - sdev->qdep = info->ats_qdep; - if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS) - sdev->qdep = 0; - } + ret = cache_tag_assign_domain(to_dmar_domain(domain), dev, pasid); + if (ret) + goto free_dev_pasid; /* Setup the pasid table: */ sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, pasid, FLPT_DEFAULT_DID, sflags); if (ret) - goto free_sdev; + goto unassign_tag; - list_add_rcu(&sdev->list, &svm->devs); + spin_lock_irqsave(&dmar_domain->lock, flags); + list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); + spin_unlock_irqrestore(&dmar_domain->lock, flags); return 0; -free_sdev: - kfree(sdev); -free_svm: - if (list_empty(&svm->devs)) { - mmu_notifier_unregister(&svm->notifier, mm); - pasid_private_remove(pasid); - kfree(svm); - } +unassign_tag: + cache_tag_unassign_domain(to_dmar_domain(domain), dev, pasid); +free_dev_pasid: + kfree(dev_pasid); return ret; } -void intel_svm_remove_dev_pasid(struct device *dev, u32 pasid) -{ - struct intel_svm_dev *sdev; - struct intel_svm *svm; - struct mm_struct *mm; - - if (pasid_to_svm_sdev(dev, pasid, &svm, &sdev)) - return; - mm = svm->mm; - - if (sdev) { - list_del_rcu(&sdev->list); - kfree_rcu(sdev, rcu); - - if (list_empty(&svm->devs)) { - if (svm->notifier.ops) - mmu_notifier_unregister(&svm->notifier, mm); - pasid_private_remove(svm->pasid); - kfree(svm); - } - } -} - /* Page request queue descriptor */ struct page_req_dsc { union { struct { u64 type:8; u64 pasid_present:1; - u64 priv_data_present:1; - u64 rsvd:6; + u64 rsvd:7; u64 rid:16; u64 pasid:20; u64 exe_req:1; @@ -438,7 +262,8 @@ struct page_req_dsc { }; u64 qw_1; }; - u64 priv_data[2]; + u64 qw_2; + u64 qw_3; }; static bool is_canonical_address(u64 addr) @@ -572,24 +397,6 @@ static void intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; } - if (desc->priv_data_present) { - /* - * Set last page in group bit if private data is present, - * page response is required as it does for LPIG. - * iommu_report_device_fault() doesn't understand this vendor - * specific requirement thus we set last_page as a workaround. - */ - event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; - event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA; - event.fault.prm.private_data[0] = desc->priv_data[0]; - event.fault.prm.private_data[1] = desc->priv_data[1]; - } else if (dmar_latency_enabled(iommu, DMAR_LATENCY_PRQ)) { - /* - * If the private data fields are not used by hardware, use it - * to monitor the prq handle latency. - */ - event.fault.prm.private_data[0] = ktime_to_ns(ktime_get()); - } iommu_report_device_fault(dev, &event); } @@ -597,39 +404,23 @@ static void intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, static void handle_bad_prq_event(struct intel_iommu *iommu, struct page_req_dsc *req, int result) { - struct qi_desc desc; + struct qi_desc desc = { }; pr_err("%s: Invalid page request: %08llx %08llx\n", iommu->name, ((unsigned long long *)req)[0], ((unsigned long long *)req)[1]); - /* - * Per VT-d spec. v3.0 ch7.7, system software must - * respond with page group response if private data - * is present (PDP) or last page in group (LPIG) bit - * is set. This is an additional VT-d feature beyond - * PCI ATS spec. - */ - if (!req->lpig && !req->priv_data_present) + if (!req->lpig) return; desc.qw0 = QI_PGRP_PASID(req->pasid) | QI_PGRP_DID(req->rid) | QI_PGRP_PASID_P(req->pasid_present) | - QI_PGRP_PDP(req->priv_data_present) | QI_PGRP_RESP_CODE(result) | QI_PGRP_RESP_TYPE; desc.qw1 = QI_PGRP_IDX(req->prg_index) | QI_PGRP_LPIG(req->lpig); - if (req->priv_data_present) { - desc.qw2 = req->priv_data[0]; - desc.qw3 = req->priv_data[1]; - } else { - desc.qw2 = 0; - desc.qw3 = 0; - } - qi_submit_sync(iommu, &desc, 1, 0); } @@ -697,7 +488,7 @@ bad_req: intel_svm_prq_report(iommu, dev, req); trace_prq_report(iommu, dev, req->qw_0, req->qw_1, - req->priv_data[0], req->priv_data[1], + req->qw_2, req->qw_3, iommu->prq_seq_number++); mutex_unlock(&iommu->iopf_lock); prq_advance: @@ -736,7 +527,7 @@ void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, struct intel_iommu *iommu = info->iommu; u8 bus = info->bus, devfn = info->devfn; struct iommu_fault_page_request *prm; - bool private_present; + struct qi_desc desc; bool pasid_present; bool last_page; u16 sid; @@ -744,42 +535,25 @@ void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, prm = &evt->fault.prm; sid = PCI_DEVID(bus, devfn); pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA; last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; - /* - * Per VT-d spec. v3.0 ch7.7, system software must respond - * with page group response if private data is present (PDP) - * or last page in group (LPIG) bit is set. This is an - * additional VT-d requirement beyond PCI ATS spec. - */ - if (last_page || private_present) { - struct qi_desc desc; - - desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | - QI_PGRP_PASID_P(pasid_present) | - QI_PGRP_PDP(private_present) | - QI_PGRP_RESP_CODE(msg->code) | - QI_PGRP_RESP_TYPE; - desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page); - desc.qw2 = 0; - desc.qw3 = 0; - - if (private_present) { - desc.qw2 = prm->private_data[0]; - desc.qw3 = prm->private_data[1]; - } else if (prm->private_data[0]) { - dmar_latency_update(iommu, DMAR_LATENCY_PRQ, - ktime_to_ns(ktime_get()) - prm->private_data[0]); - } + desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | + QI_PGRP_PASID_P(pasid_present) | + QI_PGRP_RESP_CODE(msg->code) | + QI_PGRP_RESP_TYPE; + desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page); + desc.qw2 = 0; + desc.qw3 = 0; - qi_submit_sync(iommu, &desc, 1, 0); - } + qi_submit_sync(iommu, &desc, 1, 0); } static void intel_svm_domain_free(struct iommu_domain *domain) { - kfree(to_dmar_domain(domain)); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + + /* dmar_domain free is deferred to the mmu free_notifier callback. */ + mmu_notifier_put(&dmar_domain->notifier); } static const struct iommu_domain_ops intel_svm_domain_ops = { @@ -787,14 +561,29 @@ static const struct iommu_domain_ops intel_svm_domain_ops = { .free = intel_svm_domain_free }; -struct iommu_domain *intel_svm_domain_alloc(void) +struct iommu_domain *intel_svm_domain_alloc(struct device *dev, + struct mm_struct *mm) { struct dmar_domain *domain; + int ret; domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) - return NULL; + return ERR_PTR(-ENOMEM); + domain->domain.ops = &intel_svm_domain_ops; + domain->use_first_level = true; + INIT_LIST_HEAD(&domain->dev_pasids); + INIT_LIST_HEAD(&domain->cache_tags); + spin_lock_init(&domain->cache_lock); + spin_lock_init(&domain->lock); + + domain->notifier.ops = &intel_mmuops; + ret = mmu_notifier_register(&domain->notifier, mm); + if (ret) { + kfree(domain); + return ERR_PTR(ret); + } return &domain->domain; } diff --git a/drivers/iommu/intel/trace.h b/drivers/iommu/intel/trace.h index 93d96f93a8..9defdae6eb 100644 --- a/drivers/iommu/intel/trace.h +++ b/drivers/iommu/intel/trace.h @@ -32,7 +32,7 @@ TRACE_EVENT(qi_submit, ), TP_fast_assign( - __assign_str(iommu, iommu->name); + __assign_str(iommu); __entry->qw0 = qw0; __entry->qw1 = qw1; __entry->qw2 = qw2; @@ -79,8 +79,8 @@ TRACE_EVENT(prq_report, __entry->dw2 = dw2; __entry->dw3 = dw3; __entry->seq = seq; - __assign_str(iommu, iommu->name); - __assign_str(dev, dev_name(dev)); + __assign_str(iommu); + __assign_str(dev); ), TP_printk("%s/%s seq# %ld: %s", @@ -89,6 +89,103 @@ TRACE_EVENT(prq_report, __entry->dw1, __entry->dw2, __entry->dw3) ) ); + +DECLARE_EVENT_CLASS(cache_tag_log, + TP_PROTO(struct cache_tag *tag), + TP_ARGS(tag), + TP_STRUCT__entry( + __string(iommu, tag->iommu->name) + __string(dev, dev_name(tag->dev)) + __field(u16, type) + __field(u16, domain_id) + __field(u32, pasid) + __field(u32, users) + ), + TP_fast_assign( + __assign_str(iommu); + __assign_str(dev); + __entry->type = tag->type; + __entry->domain_id = tag->domain_id; + __entry->pasid = tag->pasid; + __entry->users = tag->users; + ), + TP_printk("%s/%s type %s did %d pasid %d ref %d", + __get_str(iommu), __get_str(dev), + __print_symbolic(__entry->type, + { CACHE_TAG_IOTLB, "iotlb" }, + { CACHE_TAG_DEVTLB, "devtlb" }, + { CACHE_TAG_NESTING_IOTLB, "nesting_iotlb" }, + { CACHE_TAG_NESTING_DEVTLB, "nesting_devtlb" }), + __entry->domain_id, __entry->pasid, __entry->users + ) +); + +DEFINE_EVENT(cache_tag_log, cache_tag_assign, + TP_PROTO(struct cache_tag *tag), + TP_ARGS(tag) +); + +DEFINE_EVENT(cache_tag_log, cache_tag_unassign, + TP_PROTO(struct cache_tag *tag), + TP_ARGS(tag) +); + +DEFINE_EVENT(cache_tag_log, cache_tag_flush_all, + TP_PROTO(struct cache_tag *tag), + TP_ARGS(tag) +); + +DECLARE_EVENT_CLASS(cache_tag_flush, + TP_PROTO(struct cache_tag *tag, unsigned long start, unsigned long end, + unsigned long addr, unsigned long pages, unsigned long mask), + TP_ARGS(tag, start, end, addr, pages, mask), + TP_STRUCT__entry( + __string(iommu, tag->iommu->name) + __string(dev, dev_name(tag->dev)) + __field(u16, type) + __field(u16, domain_id) + __field(u32, pasid) + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned long, addr) + __field(unsigned long, pages) + __field(unsigned long, mask) + ), + TP_fast_assign( + __assign_str(iommu); + __assign_str(dev); + __entry->type = tag->type; + __entry->domain_id = tag->domain_id; + __entry->pasid = tag->pasid; + __entry->start = start; + __entry->end = end; + __entry->addr = addr; + __entry->pages = pages; + __entry->mask = mask; + ), + TP_printk("%s %s[%d] type %s did %d [0x%lx-0x%lx] addr 0x%lx pages 0x%lx mask 0x%lx", + __get_str(iommu), __get_str(dev), __entry->pasid, + __print_symbolic(__entry->type, + { CACHE_TAG_IOTLB, "iotlb" }, + { CACHE_TAG_DEVTLB, "devtlb" }, + { CACHE_TAG_NESTING_IOTLB, "nesting_iotlb" }, + { CACHE_TAG_NESTING_DEVTLB, "nesting_devtlb" }), + __entry->domain_id, __entry->start, __entry->end, + __entry->addr, __entry->pages, __entry->mask + ) +); + +DEFINE_EVENT(cache_tag_flush, cache_tag_flush_range, + TP_PROTO(struct cache_tag *tag, unsigned long start, unsigned long end, + unsigned long addr, unsigned long pages, unsigned long mask), + TP_ARGS(tag, start, end, addr, pages, mask) +); + +DEFINE_EVENT(cache_tag_flush, cache_tag_flush_range_np, + TP_PROTO(struct cache_tag *tag, unsigned long start, unsigned long end, + unsigned long addr, unsigned long pages, unsigned long mask), + TP_ARGS(tag, start, end, addr, pages, mask) +); #endif /* _TRACE_INTEL_IOMMU_H */ /* This part must be outside protection */ |