Diffstat (limited to 'drivers/iommu/amd')
-rw-r--r--  drivers/iommu/amd/Kconfig            |  10
-rw-r--r--  drivers/iommu/amd/Makefile           |   1
-rw-r--r--  drivers/iommu/amd/amd_iommu.h        |  35
-rw-r--r--  drivers/iommu/amd/amd_iommu_types.h  |  64
-rw-r--r--  drivers/iommu/amd/init.c             | 117
-rw-r--r--  drivers/iommu/amd/io_pgtable.c       |  68
-rw-r--r--  drivers/iommu/amd/io_pgtable_v2.c    |   8
-rw-r--r--  drivers/iommu/amd/iommu.c            | 724
-rw-r--r--  drivers/iommu/amd/iommu_v2.c         | 996
9 files changed, 557 insertions(+), 1466 deletions(-)
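A recurring pattern in the hunks below is the replacement of the per-IOMMU iommu_feature(iommu, mask) checks with global helpers (check_feature()/check_feature2()) that test the system-wide amd_iommu_efr/amd_iommu_efr2 values. The stand-alone C sketch that follows only illustrates that bitmask pattern outside the kernel: the EFR value set in main() is invented, and only the bit positions visible in the amd_iommu_types.h hunk (FEATURE_GIOSUP, FEATURE_HDSUP, FEATURE_SNP) are taken from the diff.

/*
 * Illustration only: a user-space approximation of the global EFR
 * feature-check helpers introduced in amd_iommu.h below.  The EFR
 * value is made up; the bit positions are from the amd_iommu_types.h
 * hunk in this series.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BIT_ULL(n)        (1ULL << (n))
#define FEATURE_GIOSUP    BIT_ULL(48)
#define FEATURE_HDSUP     BIT_ULL(52)
#define FEATURE_SNP       BIT_ULL(63)

/* Stand-ins for the global EFR/EFR2 values exported by init.c */
static uint64_t amd_iommu_efr;
static uint64_t amd_iommu_efr2;

static bool check_feature(uint64_t mask)
{
	return (amd_iommu_efr & mask);
}

static bool check_feature2(uint64_t mask)
{
	return (amd_iommu_efr2 & mask);
}

int main(void)
{
	/* Pretend firmware reported GIOSUP and HDSUP, but not SNP */
	amd_iommu_efr  = FEATURE_GIOSUP | FEATURE_HDSUP;
	amd_iommu_efr2 = 0;

	printf("GIOSUP: %d\n", check_feature(FEATURE_GIOSUP));  /* 1 */
	printf("HDSUP : %d\n", check_feature(FEATURE_HDSUP));   /* 1 */
	printf("SNP   : %d\n", check_feature(FEATURE_SNP));     /* 0 */
	printf("EFR2  : %d\n", check_feature2(BIT_ULL(0)));     /* 0 */
	return 0;
}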
diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig index 9b5fc3356b..443b2c13c3 100644 --- a/drivers/iommu/amd/Kconfig +++ b/drivers/iommu/amd/Kconfig @@ -10,6 +10,7 @@ config AMD_IOMMU select IOMMU_API select IOMMU_IOVA select IOMMU_IO_PGTABLE + select IOMMUFD_DRIVER if IOMMUFD depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE help With this option you can enable support for AMD IOMMU hardware in @@ -22,15 +23,6 @@ config AMD_IOMMU your BIOS for an option to enable it or if you have an IVRS ACPI table. -config AMD_IOMMU_V2 - tristate "AMD IOMMU Version 2 driver" - depends on AMD_IOMMU - select MMU_NOTIFIER - help - This option enables support for the AMD IOMMUv2 features of the IOMMU - hardware. Select this option if you want to use devices that support - the PCI PRI and PASID interface. - config AMD_IOMMU_DEBUGFS bool "Enable AMD IOMMU internals in DebugFS" depends on AMD_IOMMU && IOMMU_DEBUGFS diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile index 773d8aa002..f454fbb156 100644 --- a/drivers/iommu/amd/Makefile +++ b/drivers/iommu/amd/Makefile @@ -1,4 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_AMD_IOMMU) += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o -obj-$(CONFIG_AMD_IOMMU_V2) += iommu_v2.o diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index e2857109e9..86be1edd50 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -38,9 +38,6 @@ extern int amd_iommu_guest_ir; extern enum io_pgtable_fmt amd_iommu_pgtable; extern int amd_iommu_gpt_level; -/* IOMMUv2 specific functions */ -struct iommu_domain; - bool amd_iommu_v2_supported(void); struct amd_iommu *get_amd_iommu(unsigned int idx); u8 amd_iommu_pc_get_max_banks(unsigned int idx); @@ -51,10 +48,10 @@ int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64 *value); -int amd_iommu_register_ppr_notifier(struct notifier_block *nb); -int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb); -void amd_iommu_domain_direct_map(struct iommu_domain *dom); -int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids); +/* Device capabilities */ +int amd_iommu_pdev_enable_cap_pri(struct pci_dev *pdev); +void amd_iommu_pdev_disable_cap_pri(struct pci_dev *pdev); + int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, u64 address); void amd_iommu_update_and_flush_device_table(struct protection_domain *domain); void amd_iommu_domain_update(struct protection_domain *domain); @@ -87,9 +84,25 @@ static inline bool is_rd890_iommu(struct pci_dev *pdev) (pdev->device == PCI_DEVICE_ID_RD890_IOMMU); } -static inline bool iommu_feature(struct amd_iommu *iommu, u64 mask) +static inline bool check_feature(u64 mask) +{ + return (amd_iommu_efr & mask); +} + +static inline bool check_feature2(u64 mask) +{ + return (amd_iommu_efr2 & mask); +} + +static inline int check_feature_gpt_level(void) +{ + return ((amd_iommu_efr >> FEATURE_GATS_SHIFT) & FEATURE_GATS_MASK); +} + +static inline bool amd_iommu_gt_ppr_supported(void) { - return !!(iommu->features & mask); + return (check_feature(FEATURE_GT) && + check_feature(FEATURE_PPR)); } static inline u64 iommu_virt_to_phys(void *vaddr) @@ -105,7 +118,6 @@ static inline void *iommu_phys_to_virt(unsigned long paddr) static inline void amd_iommu_domain_set_pt_root(struct protection_domain *domain, u64 root) { - atomic64_set(&domain->iop.pt_root, 
root); domain->iop.root = (u64 *)(root & PAGE_MASK); domain->iop.mode = root & 7; /* lowest 3 bits encode pgtable mode */ } @@ -146,8 +158,5 @@ void amd_iommu_domain_set_pgtable(struct protection_domain *domain, u64 *root, int mode); struct dev_table_entry *get_dev_table(struct amd_iommu *iommu); -extern u64 amd_iommu_efr; -extern u64 amd_iommu_efr2; - extern bool amd_iommu_snp_en; #endif diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 7dc30c2b56..90b7d7950a 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -97,7 +97,9 @@ #define FEATURE_GATS_MASK (3ULL) #define FEATURE_GAM_VAPIC BIT_ULL(21) #define FEATURE_GIOSUP BIT_ULL(48) +#define FEATURE_HASUP BIT_ULL(49) #define FEATURE_EPHSUP BIT_ULL(50) +#define FEATURE_HDSUP BIT_ULL(52) #define FEATURE_SNP BIT_ULL(63) #define FEATURE_PASID_SHIFT 32 @@ -212,6 +214,7 @@ /* macros and definitions for device table entries */ #define DEV_ENTRY_VALID 0x00 #define DEV_ENTRY_TRANSLATION 0x01 +#define DEV_ENTRY_HAD 0x07 #define DEV_ENTRY_PPR 0x34 #define DEV_ENTRY_IR 0x3d #define DEV_ENTRY_IW 0x3e @@ -371,9 +374,15 @@ (1ULL << (12 + (9 * (level)))) /* + * The IOPTE dirty bit + */ +#define IOMMU_PTE_HD_BIT (6) + +/* * Bit value definition for I/O PTE fields */ #define IOMMU_PTE_PR BIT_ULL(0) +#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT) #define IOMMU_PTE_U BIT_ULL(59) #define IOMMU_PTE_FC BIT_ULL(60) #define IOMMU_PTE_IR BIT_ULL(61) @@ -384,6 +393,7 @@ */ #define DTE_FLAG_V BIT_ULL(0) #define DTE_FLAG_TV BIT_ULL(1) +#define DTE_FLAG_HAD (3ULL << 7) #define DTE_FLAG_GIOV BIT_ULL(54) #define DTE_FLAG_GV BIT_ULL(55) #define DTE_GLX_SHIFT (56) @@ -413,6 +423,7 @@ #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR) +#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD) #define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK)) #define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) @@ -451,6 +462,10 @@ #define PD_IOMMUV2_MASK BIT(3) /* domain has gcr3 table */ #define PD_GIOV_MASK BIT(4) /* domain enable GIOV support */ +/* Timeout stuff */ +#define LOOP_TIMEOUT 100000 +#define MMIO_STATUS_TIMEOUT 2000000 + extern bool amd_iommu_dump; #define DUMP_printk(format, arg...) \ do { \ @@ -505,19 +520,6 @@ extern struct kmem_cache *amd_iommu_irq_cache; #define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT) #define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL) -/* - * This struct is used to pass information about - * incoming PPR faults around. 
- */ -struct amd_iommu_fault { - u64 address; /* IO virtual address of the fault*/ - u32 pasid; /* Address space identifier */ - u32 sbdf; /* Originating PCI device id */ - u16 tag; /* PPR tag */ - u16 flags; /* Fault flags */ - -}; - struct amd_iommu; struct iommu_domain; @@ -544,7 +546,6 @@ struct amd_io_pgtable { struct io_pgtable iop; int mode; u64 *root; - atomic64_t pt_root; /* pgtable root and pgtable mode */ u64 *pgd; /* v2 pgtable pgd pointer */ }; @@ -563,6 +564,7 @@ struct protection_domain { int nid; /* Node ID */ u64 *gcr3_tbl; /* Guest CR3 table */ unsigned long flags; /* flags to find out type of domain */ + bool dirty_tracking; /* dirty tracking is enabled in the domain */ unsigned dev_cnt; /* devices assigned to this domain */ unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ }; @@ -676,9 +678,6 @@ struct amd_iommu { /* Extended features 2 */ u64 features2; - /* IOMMUv2 */ - bool is_iommu_v2; - /* PCI device id of the IOMMU device */ u16 devid; @@ -799,6 +798,14 @@ struct devid_map { bool cmd_line; }; +#define AMD_IOMMU_DEVICE_FLAG_ATS_SUP 0x1 /* ATS feature supported */ +#define AMD_IOMMU_DEVICE_FLAG_PRI_SUP 0x2 /* PRI feature supported */ +#define AMD_IOMMU_DEVICE_FLAG_PASID_SUP 0x4 /* PASID context supported */ +/* Device may request execution on memory pages */ +#define AMD_IOMMU_DEVICE_FLAG_EXEC_SUP 0x8 +/* Device may request super-user privileges */ +#define AMD_IOMMU_DEVICE_FLAG_PRIV_SUP 0x10 + /* * This struct contains device specific data for the IOMMU */ @@ -811,13 +818,15 @@ struct iommu_dev_data { struct protection_domain *domain; /* Domain the device is bound to */ struct device *dev; u16 devid; /* PCI Device ID */ - bool iommu_v2; /* Device can make use of IOMMUv2 */ - struct { - bool enabled; - int qdep; - } ats; /* ATS state */ - bool pri_tlp; /* PASID TLB required for + + u32 flags; /* Holds AMD_IOMMU_DEVICE_FLAG_<*> */ + int ats_qdep; + u8 ats_enabled :1; /* ATS state */ + u8 pri_enabled :1; /* PRI state */ + u8 pasid_enabled:1; /* PASID state */ + u8 pri_tlp :1; /* PASID TLB required for PPR completions */ + u8 ppr :1; /* Enable device PPR support */ bool use_vapic; /* Enable device to use vapic mode */ bool defer_attach; @@ -884,16 +893,15 @@ extern unsigned amd_iommu_aperture_order; /* allocation bitmap for domain ids */ extern unsigned long *amd_iommu_pd_alloc_bitmap; -/* Smallest max PASID supported by any IOMMU in the system */ -extern u32 amd_iommu_max_pasid; - -extern bool amd_iommu_v2_present; - extern bool amd_iommu_force_isolation; /* Max levels of glxval supported */ extern int amd_iommu_max_glx_val; +/* Global EFR and EFR2 registers */ +extern u64 amd_iommu_efr; +extern u64 amd_iommu_efr2; + /* * This function flushes all internal caches of * the IOMMU used by this driver. 
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 45efb7e5d7..64bcf3df37 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -83,8 +83,6 @@ #define ACPI_DEVFLAG_LINT1 0x80 #define ACPI_DEVFLAG_ATSDIS 0x10000000 -#define LOOP_TIMEOUT 2000000 - #define IVRS_GET_SBDF_ID(seg, bus, dev, fn) (((seg & 0xffff) << 16) | ((bus & 0xff) << 8) \ | ((dev & 0x1f) << 3) | (fn & 0x7)) @@ -187,9 +185,6 @@ static int amd_iommus_present; bool amd_iommu_np_cache __read_mostly; bool amd_iommu_iotlb_sup __read_mostly = true; -u32 amd_iommu_max_pasid __read_mostly = ~0; - -bool amd_iommu_v2_present __read_mostly; static bool amd_iommu_pc_present __read_mostly; bool amdr_ivrs_remap_support __read_mostly; @@ -272,7 +267,7 @@ int amd_iommu_get_num_iommus(void) * Iterate through all the IOMMUs to get common EFR * masks among all IOMMUs and warn if found inconsistency. */ -static void get_global_efr(void) +static __init void get_global_efr(void) { struct amd_iommu *iommu; @@ -304,16 +299,6 @@ static void get_global_efr(void) pr_info("Using global IVHD EFR:%#llx, EFR2:%#llx\n", amd_iommu_efr, amd_iommu_efr2); } -static bool check_feature_on_all_iommus(u64 mask) -{ - return !!(amd_iommu_efr & mask); -} - -static inline int check_feature_gpt_level(void) -{ - return ((amd_iommu_efr >> FEATURE_GATS_SHIFT) & FEATURE_GATS_MASK); -} - /* * For IVHD type 0x11/0x40, EFR is also available via IVHD. * Default to IVHD EFR since it is available sooner @@ -399,7 +384,7 @@ static void iommu_set_cwwb_range(struct amd_iommu *iommu) u64 start = iommu_virt_to_phys((void *)iommu->cmd_sem); u64 entry = start & PM_ADDR_MASK; - if (!check_feature_on_all_iommus(FEATURE_SNP)) + if (!check_feature(FEATURE_SNP)) return; /* Note: @@ -869,7 +854,7 @@ static void *__init iommu_alloc_4k_pages(struct amd_iommu *iommu, void *buf = (void *)__get_free_pages(gfp, order); if (buf && - check_feature_on_all_iommus(FEATURE_SNP) && + check_feature(FEATURE_SNP) && set_memory_4k((unsigned long)buf, (1 << order))) { free_pages((unsigned long)buf, order); buf = NULL; @@ -985,14 +970,14 @@ static int iommu_ga_log_enable(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_GAINT_EN); iommu_feature_enable(iommu, CONTROL_GALOG_EN); - for (i = 0; i < LOOP_TIMEOUT; ++i) { + for (i = 0; i < MMIO_STATUS_TIMEOUT; ++i) { status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); if (status & (MMIO_STATUS_GALOG_RUN_MASK)) break; udelay(10); } - if (WARN_ON(i >= LOOP_TIMEOUT)) + if (WARN_ON(i >= MMIO_STATUS_TIMEOUT)) return -EINVAL; return 0; @@ -1048,7 +1033,7 @@ static void iommu_enable_xt(struct amd_iommu *iommu) static void iommu_enable_gt(struct amd_iommu *iommu) { - if (!iommu_feature(iommu, FEATURE_GT)) + if (!check_feature(FEATURE_GT)) return; iommu_feature_enable(iommu, CONTROL_GT_EN); @@ -1987,7 +1972,7 @@ static void init_iommu_perf_ctr(struct amd_iommu *iommu) u64 val; struct pci_dev *pdev = iommu->dev; - if (!iommu_feature(iommu, FEATURE_PC)) + if (!check_feature(FEATURE_PC)) return; amd_iommu_pc_present = true; @@ -2014,8 +1999,7 @@ static ssize_t amd_iommu_show_features(struct device *dev, struct device_attribute *attr, char *buf) { - struct amd_iommu *iommu = dev_to_amd_iommu(dev); - return sysfs_emit(buf, "%llx:%llx\n", iommu->features2, iommu->features); + return sysfs_emit(buf, "%llx:%llx\n", amd_iommu_efr, amd_iommu_efr2); } static DEVICE_ATTR(features, S_IRUGO, amd_iommu_show_features, NULL); @@ -2051,9 +2035,9 @@ static void __init late_iommu_features_init(struct amd_iommu *iommu) features = 
readq(iommu->mmio_base + MMIO_EXT_FEATURES); features2 = readq(iommu->mmio_base + MMIO_EXT_FEATURES2); - if (!iommu->features) { - iommu->features = features; - iommu->features2 = features2; + if (!amd_iommu_efr) { + amd_iommu_efr = features; + amd_iommu_efr2 = features2; return; } @@ -2061,12 +2045,12 @@ static void __init late_iommu_features_init(struct amd_iommu *iommu) * Sanity check and warn if EFR values from * IVHD and MMIO conflict. */ - if (features != iommu->features || - features2 != iommu->features2) { + if (features != amd_iommu_efr || + features2 != amd_iommu_efr2) { pr_warn(FW_WARN "EFR mismatch. Use IVHD EFR (%#llx : %#llx), EFR2 (%#llx : %#llx).\n", - features, iommu->features, - features2, iommu->features2); + features, amd_iommu_efr, + features2, amd_iommu_efr2); } } @@ -2092,20 +2076,17 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) late_iommu_features_init(iommu); - if (iommu_feature(iommu, FEATURE_GT)) { + if (check_feature(FEATURE_GT)) { int glxval; - u32 max_pasid; u64 pasmax; - pasmax = iommu->features & FEATURE_PASID_MASK; + pasmax = amd_iommu_efr & FEATURE_PASID_MASK; pasmax >>= FEATURE_PASID_SHIFT; - max_pasid = (1 << (pasmax + 1)) - 1; + iommu->iommu.max_pasids = (1 << (pasmax + 1)) - 1; - amd_iommu_max_pasid = min(amd_iommu_max_pasid, max_pasid); + BUG_ON(iommu->iommu.max_pasids & ~PASID_MASK); - BUG_ON(amd_iommu_max_pasid & ~PASID_MASK); - - glxval = iommu->features & FEATURE_GLXVAL_MASK; + glxval = amd_iommu_efr & FEATURE_GLXVAL_MASK; glxval >>= FEATURE_GLXVAL_SHIFT; if (amd_iommu_max_glx_val == -1) @@ -2114,13 +2095,7 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) amd_iommu_max_glx_val = min(amd_iommu_max_glx_val, glxval); } - if (iommu_feature(iommu, FEATURE_GT) && - iommu_feature(iommu, FEATURE_PPR)) { - iommu->is_iommu_v2 = true; - amd_iommu_v2_present = true; - } - - if (iommu_feature(iommu, FEATURE_PPR) && alloc_ppr_log(iommu)) + if (check_feature(FEATURE_PPR) && alloc_ppr_log(iommu)) return -ENOMEM; if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) { @@ -2132,13 +2107,10 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) init_iommu_perf_ctr(iommu); if (amd_iommu_pgtable == AMD_IOMMU_V2) { - if (!iommu_feature(iommu, FEATURE_GIOSUP) || - !iommu_feature(iommu, FEATURE_GT)) { + if (!check_feature(FEATURE_GIOSUP) || + !check_feature(FEATURE_GT)) { pr_warn("Cannot enable v2 page table for DMA-API. Fallback to v1.\n"); amd_iommu_pgtable = AMD_IOMMU_V1; - } else if (iommu_default_passthrough()) { - pr_warn("V2 page table doesn't support passthrough mode. 
Fallback to v1.\n"); - amd_iommu_pgtable = AMD_IOMMU_V1; } } @@ -2186,35 +2158,29 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) static void print_iommu_info(void) { + int i; static const char * const feat_str[] = { "PreF", "PPR", "X2APIC", "NX", "GT", "[5]", "IA", "GA", "HE", "PC" }; - struct amd_iommu *iommu; - - for_each_iommu(iommu) { - struct pci_dev *pdev = iommu->dev; - int i; - pci_info(pdev, "Found IOMMU cap 0x%x\n", iommu->cap_ptr); + if (amd_iommu_efr) { + pr_info("Extended features (%#llx, %#llx):", amd_iommu_efr, amd_iommu_efr2); - if (iommu->cap & (1 << IOMMU_CAP_EFR)) { - pr_info("Extended features (%#llx, %#llx):", iommu->features, iommu->features2); - - for (i = 0; i < ARRAY_SIZE(feat_str); ++i) { - if (iommu_feature(iommu, (1ULL << i))) - pr_cont(" %s", feat_str[i]); - } + for (i = 0; i < ARRAY_SIZE(feat_str); ++i) { + if (check_feature(1ULL << i)) + pr_cont(" %s", feat_str[i]); + } - if (iommu->features & FEATURE_GAM_VAPIC) - pr_cont(" GA_vAPIC"); + if (check_feature(FEATURE_GAM_VAPIC)) + pr_cont(" GA_vAPIC"); - if (iommu->features & FEATURE_SNP) - pr_cont(" SNP"); + if (check_feature(FEATURE_SNP)) + pr_cont(" SNP"); - pr_cont("\n"); - } + pr_cont("\n"); } + if (irq_remapping_enabled) { pr_info("Interrupt remapping enabled\n"); if (amd_iommu_xt_mode == IRQ_REMAP_X2APIC_MODE) @@ -2900,19 +2866,19 @@ static void enable_iommus_vapic(void) * Need to set and poll check the GALOGRun bit to zero before * we can set/ modify GA Log registers safely. */ - for (i = 0; i < LOOP_TIMEOUT; ++i) { + for (i = 0; i < MMIO_STATUS_TIMEOUT; ++i) { status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); if (!(status & MMIO_STATUS_GALOG_RUN_MASK)) break; udelay(10); } - if (WARN_ON(i >= LOOP_TIMEOUT)) + if (WARN_ON(i >= MMIO_STATUS_TIMEOUT)) return; } if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) && - !check_feature_on_all_iommus(FEATURE_GAM_VAPIC)) { + !check_feature(FEATURE_GAM_VAPIC)) { amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY_GA; return; } @@ -3698,9 +3664,8 @@ bool amd_iommu_v2_supported(void) * (i.e. EFR[SNPSup]=1), IOMMUv2 page table cannot be used without * setting up IOMMUv1 page table. */ - return amd_iommu_v2_present && !amd_iommu_snp_en; + return amd_iommu_gt_ppr_supported() && !amd_iommu_snp_en; } -EXPORT_SYMBOL(amd_iommu_v2_supported); struct amd_iommu *get_amd_iommu(unsigned int idx) { @@ -3824,7 +3789,7 @@ int amd_iommu_snp_enable(void) return -EINVAL; } - amd_iommu_snp_en = check_feature_on_all_iommus(FEATURE_SNP); + amd_iommu_snp_en = check_feature(FEATURE_SNP); if (!amd_iommu_snp_en) return -EINVAL; diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index 2892aa1b4d..6c0621f6f5 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -486,6 +486,73 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo return (__pte & ~offset_mask) | (iova & offset_mask); } +static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size, + unsigned long flags) +{ + bool test_only = flags & IOMMU_DIRTY_NO_CLEAR; + bool dirty = false; + int i, count; + + /* + * 2.2.3.2 Host Dirty Support + * When a non-default page size is used , software must OR the + * Dirty bits in all of the replicated host PTEs used to map + * the page. The IOMMU does not guarantee the Dirty bits are + * set in all of the replicated PTEs. Any portion of the page + * may have been written even if the Dirty bit is set in only + * one of the replicated PTEs. 
+ */ + count = PAGE_SIZE_PTE_COUNT(size); + for (i = 0; i < count && test_only; i++) { + if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) { + dirty = true; + break; + } + } + + for (i = 0; i < count && !test_only; i++) { + if (test_and_clear_bit(IOMMU_PTE_HD_BIT, + (unsigned long *)&ptep[i])) { + dirty = true; + } + } + + return dirty; +} + +static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); + unsigned long end = iova + size - 1; + + do { + unsigned long pgsize = 0; + u64 *ptep, pte; + + ptep = fetch_pte(pgtable, iova, &pgsize); + if (ptep) + pte = READ_ONCE(*ptep); + if (!ptep || !IOMMU_PTE_PRESENT(pte)) { + pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0); + iova += pgsize; + continue; + } + + /* + * Mark the whole IOVA range as dirty even if only one of + * the replicated PTEs were marked dirty. + */ + if (pte_test_and_clear_dirty(ptep, pgsize, flags)) + iommu_dirty_bitmap_record(dirty, iova, pgsize); + iova += pgsize; + } while (iova < end); + + return 0; +} + /* * ---------------------------------------------------- */ @@ -527,6 +594,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo pgtable->iop.ops.map_pages = iommu_v1_map_pages; pgtable->iop.ops.unmap_pages = iommu_v1_unmap_pages; pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys; + pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty; return &pgtable->iop; } diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c index e9ef2e0a62..f818a7e254 100644 --- a/drivers/iommu/amd/io_pgtable_v2.c +++ b/drivers/iommu/amd/io_pgtable_v2.c @@ -363,10 +363,10 @@ static void v2_free_pgtable(struct io_pgtable *iop) if (!(pdom->flags & PD_IOMMUV2_MASK)) return; - /* - * Make changes visible to IOMMUs. No need to clear gcr3 entry - * as gcr3 table is already freed. 
- */ + /* Clear gcr3 entry */ + amd_iommu_domain_clear_gcr3(&pdom->domain, 0); + + /* Make changes visible to IOMMUs */ amd_iommu_domain_update(pdom); /* Free page table */ diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 95bd7c25ba..fcc987f5d4 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -37,6 +37,7 @@ #include <asm/iommu.h> #include <asm/gart.h> #include <asm/dma.h> +#include <uapi/linux/iommufd.h> #include "amd_iommu.h" #include "../dma-iommu.h" @@ -44,8 +45,6 @@ #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) -#define LOOP_TIMEOUT 100000 - /* IO virtual address start page frame number */ #define IOVA_START_PFN (1) #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) @@ -65,8 +64,8 @@ LIST_HEAD(hpet_map); LIST_HEAD(acpihid_map); const struct iommu_ops amd_iommu_ops; +const struct iommu_dirty_ops amd_dirty_ops; -static ATOMIC_NOTIFIER_HEAD(ppr_notifier); int amd_iommu_max_glx_val = -1; /* @@ -79,7 +78,6 @@ struct iommu_cmd { struct kmem_cache *amd_iommu_irq_cache; static void detach_device(struct device *dev); -static int domain_enable_v2(struct protection_domain *domain, int pasids); /**************************************************************************** * @@ -322,24 +320,141 @@ static struct iommu_group *acpihid_device_group(struct device *dev) return entry->group; } -static bool pci_iommuv2_capable(struct pci_dev *pdev) +static inline bool pdev_pasid_supported(struct iommu_dev_data *dev_data) { - static const int caps[] = { - PCI_EXT_CAP_ID_PRI, - PCI_EXT_CAP_ID_PASID, - }; - int i, pos; + return (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP); +} - if (!pci_ats_supported(pdev)) - return false; +static u32 pdev_get_caps(struct pci_dev *pdev) +{ + int features; + u32 flags = 0; + + if (pci_ats_supported(pdev)) + flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; - for (i = 0; i < 2; ++i) { - pos = pci_find_ext_capability(pdev, caps[i]); - if (pos == 0) - return false; + if (pci_pri_supported(pdev)) + flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP; + + features = pci_pasid_features(pdev); + if (features >= 0) { + flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP; + + if (features & PCI_PASID_CAP_EXEC) + flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP; + + if (features & PCI_PASID_CAP_PRIV) + flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP; } - return true; + return flags; +} + +static inline int pdev_enable_cap_ats(struct pci_dev *pdev) +{ + struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); + int ret = -EINVAL; + + if (dev_data->ats_enabled) + return 0; + + if (amd_iommu_iotlb_sup && + (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP)) { + ret = pci_enable_ats(pdev, PAGE_SHIFT); + if (!ret) { + dev_data->ats_enabled = 1; + dev_data->ats_qdep = pci_ats_queue_depth(pdev); + } + } + + return ret; +} + +static inline void pdev_disable_cap_ats(struct pci_dev *pdev) +{ + struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); + + if (dev_data->ats_enabled) { + pci_disable_ats(pdev); + dev_data->ats_enabled = 0; + } +} + +int amd_iommu_pdev_enable_cap_pri(struct pci_dev *pdev) +{ + struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); + int ret = -EINVAL; + + if (dev_data->pri_enabled) + return 0; + + if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) { + /* + * First reset the PRI state of the device. 
+ * FIXME: Hardcode number of outstanding requests for now + */ + if (!pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) { + dev_data->pri_enabled = 1; + dev_data->pri_tlp = pci_prg_resp_pasid_required(pdev); + + ret = 0; + } + } + + return ret; +} + +void amd_iommu_pdev_disable_cap_pri(struct pci_dev *pdev) +{ + struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); + + if (dev_data->pri_enabled) { + pci_disable_pri(pdev); + dev_data->pri_enabled = 0; + } +} + +static inline int pdev_enable_cap_pasid(struct pci_dev *pdev) +{ + struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); + int ret = -EINVAL; + + if (dev_data->pasid_enabled) + return 0; + + if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) { + /* Only allow access to user-accessible pages */ + ret = pci_enable_pasid(pdev, 0); + if (!ret) + dev_data->pasid_enabled = 1; + } + + return ret; +} + +static inline void pdev_disable_cap_pasid(struct pci_dev *pdev) +{ + struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); + + if (dev_data->pasid_enabled) { + pci_disable_pasid(pdev); + dev_data->pasid_enabled = 0; + } +} + +static void pdev_enable_caps(struct pci_dev *pdev) +{ + pdev_enable_cap_ats(pdev); + pdev_enable_cap_pasid(pdev); + amd_iommu_pdev_enable_cap_pri(pdev); + +} + +static void pdev_disable_caps(struct pci_dev *pdev) +{ + pdev_disable_cap_ats(pdev); + pdev_disable_cap_pasid(pdev); + amd_iommu_pdev_disable_cap_pri(pdev); } /* @@ -399,8 +514,8 @@ static int iommu_init_device(struct amd_iommu *iommu, struct device *dev) * it'll be forced to go into translation mode. */ if ((iommu_default_passthrough() || !amd_iommu_force_isolation) && - dev_is_pci(dev) && pci_iommuv2_capable(to_pci_dev(dev))) { - dev_data->iommu_v2 = iommu->is_iommu_v2; + dev_is_pci(dev) && amd_iommu_gt_ppr_supported()) { + dev_data->flags = pdev_get_caps(to_pci_dev(dev)); } dev_iommu_priv_set(dev, dev_data); @@ -701,24 +816,6 @@ static void iommu_poll_events(struct amd_iommu *iommu) writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); } -static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw) -{ - struct amd_iommu_fault fault; - - if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) { - pr_err_ratelimited("Unknown PPR request received\n"); - return; - } - - fault.address = raw[1]; - fault.pasid = PPR_PASID(raw[0]); - fault.sbdf = PCI_SEG_DEVID_TO_SBDF(iommu->pci_seg->id, PPR_DEVID(raw[0])); - fault.tag = PPR_TAG(raw[0]); - fault.flags = PPR_FLAGS(raw[0]); - - atomic_notifier_call_chain(&ppr_notifier, 0, &fault); -} - static void iommu_poll_ppr_log(struct amd_iommu *iommu) { u32 head, tail; @@ -764,8 +861,7 @@ static void iommu_poll_ppr_log(struct amd_iommu *iommu) head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE; writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); - /* Handle PPR entry */ - iommu_handle_ppr_entry(iommu, entry); + /* TODO: PPR Handler will be added when we add IOPF support */ /* Refresh ring-buffer information */ head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); @@ -1094,7 +1190,7 @@ static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, u32 pasid, } static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid, - int status, int tag, bool gn) + int status, int tag, u8 gn) { memset(cmd, 0, sizeof(*cmd)); @@ -1298,7 +1394,7 @@ static void amd_iommu_flush_irt_all(struct amd_iommu *iommu) void iommu_flush_all_caches(struct amd_iommu *iommu) { - if (iommu_feature(iommu, FEATURE_IA)) { + if (check_feature(FEATURE_IA)) { amd_iommu_flush_all(iommu); } else { 
amd_iommu_flush_dte_all(iommu); @@ -1317,7 +1413,7 @@ static int device_flush_iotlb(struct iommu_dev_data *dev_data, struct iommu_cmd cmd; int qdep; - qdep = dev_data->ats.qdep; + qdep = dev_data->ats_qdep; iommu = rlookup_amd_iommu(dev_data->dev); if (!iommu) return -EINVAL; @@ -1368,7 +1464,7 @@ static int device_flush_dte(struct iommu_dev_data *dev_data) return ret; } - if (dev_data->ats.enabled) + if (dev_data->ats_enabled) ret = device_flush_iotlb(dev_data, 0, ~0UL); return ret; @@ -1401,7 +1497,7 @@ static void __domain_flush_pages(struct protection_domain *domain, list_for_each_entry(dev_data, &domain->dev_list, list) { - if (!dev_data->ats.enabled) + if (!dev_data->ats_enabled) continue; ret |= device_flush_iotlb(dev_data, address, size); @@ -1577,6 +1673,42 @@ static void free_gcr3_table(struct protection_domain *domain) free_page((unsigned long)domain->gcr3_tbl); } +/* + * Number of GCR3 table levels required. Level must be 4-Kbyte + * page and can contain up to 512 entries. + */ +static int get_gcr3_levels(int pasids) +{ + int levels; + + if (pasids == -1) + return amd_iommu_max_glx_val; + + levels = get_count_order(pasids); + + return levels ? (DIV_ROUND_UP(levels, 9) - 1) : levels; +} + +/* Note: This function expects iommu_domain->lock to be held prior calling the function. */ +static int setup_gcr3_table(struct protection_domain *domain, int pasids) +{ + int levels = get_gcr3_levels(pasids); + + if (levels > amd_iommu_max_glx_val) + return -EINVAL; + + domain->gcr3_tbl = alloc_pgtable_page(domain->nid, GFP_ATOMIC); + if (domain->gcr3_tbl == NULL) + return -ENOMEM; + + domain->glx = levels; + domain->flags |= PD_IOMMUV2_MASK; + + amd_iommu_domain_update(domain); + + return 0; +} + static void set_dte_entry(struct amd_iommu *iommu, u16 devid, struct protection_domain *domain, bool ats, bool ppr) { @@ -1605,10 +1737,11 @@ static void set_dte_entry(struct amd_iommu *iommu, u16 devid, if (ats) flags |= DTE_FLAG_IOTLB; - if (ppr) { - if (iommu_feature(iommu, FEATURE_EPHSUP)) - pte_root |= 1ULL << DEV_ENTRY_PPR; - } + if (ppr) + pte_root |= 1ULL << DEV_ENTRY_PPR; + + if (domain->dirty_tracking) + pte_root |= DTE_FLAG_HAD; if (domain->flags & PD_IOMMUV2_MASK) { u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl); @@ -1685,7 +1818,7 @@ static void do_attach(struct iommu_dev_data *dev_data, iommu = rlookup_amd_iommu(dev_data->dev); if (!iommu) return; - ats = dev_data->ats.enabled; + ats = dev_data->ats_enabled; /* Update data structures */ dev_data->domain = domain; @@ -1701,7 +1834,7 @@ static void do_attach(struct iommu_dev_data *dev_data, /* Update device table */ set_dte_entry(iommu, dev_data->devid, domain, - ats, dev_data->iommu_v2); + ats, dev_data->ppr); clone_aliases(iommu, dev_data->dev); device_flush_dte(dev_data); @@ -1736,48 +1869,6 @@ static void do_detach(struct iommu_dev_data *dev_data) domain->dev_cnt -= 1; } -static void pdev_iommuv2_disable(struct pci_dev *pdev) -{ - pci_disable_ats(pdev); - pci_disable_pri(pdev); - pci_disable_pasid(pdev); -} - -static int pdev_pri_ats_enable(struct pci_dev *pdev) -{ - int ret; - - /* Only allow access to user-accessible pages */ - ret = pci_enable_pasid(pdev, 0); - if (ret) - return ret; - - /* First reset the PRI state of the device */ - ret = pci_reset_pri(pdev); - if (ret) - goto out_err_pasid; - - /* Enable PRI */ - /* FIXME: Hardcode number of outstanding requests for now */ - ret = pci_enable_pri(pdev, 32); - if (ret) - goto out_err_pasid; - - ret = pci_enable_ats(pdev, PAGE_SHIFT); - if (ret) - goto out_err_pri; - - return 0; 
- -out_err_pri: - pci_disable_pri(pdev); - -out_err_pasid: - pci_disable_pasid(pdev); - - return ret; -} - /* * If a device is not yet associated with a domain, this function makes the * device visible in the domain @@ -1786,9 +1877,8 @@ static int attach_device(struct device *dev, struct protection_domain *domain) { struct iommu_dev_data *dev_data; - struct pci_dev *pdev; unsigned long flags; - int ret; + int ret = 0; spin_lock_irqsave(&domain->lock, flags); @@ -1796,45 +1886,13 @@ static int attach_device(struct device *dev, spin_lock(&dev_data->lock); - ret = -EBUSY; - if (dev_data->domain != NULL) + if (dev_data->domain != NULL) { + ret = -EBUSY; goto out; - - if (!dev_is_pci(dev)) - goto skip_ats_check; - - pdev = to_pci_dev(dev); - if (domain->flags & PD_IOMMUV2_MASK) { - struct iommu_domain *def_domain = iommu_get_dma_domain(dev); - - ret = -EINVAL; - - /* - * In case of using AMD_IOMMU_V1 page table mode and the device - * is enabling for PPR/ATS support (using v2 table), - * we need to make sure that the domain type is identity map. - */ - if ((amd_iommu_pgtable == AMD_IOMMU_V1) && - def_domain->type != IOMMU_DOMAIN_IDENTITY) { - goto out; - } - - if (dev_data->iommu_v2) { - if (pdev_pri_ats_enable(pdev) != 0) - goto out; - - dev_data->ats.enabled = true; - dev_data->ats.qdep = pci_ats_queue_depth(pdev); - dev_data->pri_tlp = pci_prg_resp_pasid_required(pdev); - } - } else if (amd_iommu_iotlb_sup && - pci_enable_ats(pdev, PAGE_SHIFT) == 0) { - dev_data->ats.enabled = true; - dev_data->ats.qdep = pci_ats_queue_depth(pdev); } -skip_ats_check: - ret = 0; + if (dev_is_pci(dev)) + pdev_enable_caps(to_pci_dev(dev)); do_attach(dev_data, domain); @@ -1882,15 +1940,8 @@ static void detach_device(struct device *dev) do_detach(dev_data); - if (!dev_is_pci(dev)) - goto out; - - if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2) - pdev_iommuv2_disable(to_pci_dev(dev)); - else if (dev_data->ats.enabled) - pci_disable_ats(to_pci_dev(dev)); - - dev_data->ats.enabled = false; + if (dev_is_pci(dev)) + pdev_disable_caps(to_pci_dev(dev)); out: spin_unlock(&dev_data->lock); @@ -1980,7 +2031,7 @@ static void update_device_table(struct protection_domain *domain) if (!iommu) continue; set_dte_entry(iommu, dev_data->devid, domain, - dev_data->ats.enabled, dev_data->iommu_v2); + dev_data->ats_enabled, dev_data->ppr); clone_aliases(iommu, dev_data->dev); } } @@ -2014,9 +2065,11 @@ void amd_iommu_domain_update(struct protection_domain *domain) static void cleanup_domain(struct protection_domain *domain) { struct iommu_dev_data *entry; - unsigned long flags; - spin_lock_irqsave(&domain->lock, flags); + lockdep_assert_held(&domain->lock); + + if (!domain->dev_cnt) + return; while (!list_empty(&domain->dev_list)) { entry = list_first_entry(&domain->dev_list, @@ -2024,8 +2077,7 @@ static void cleanup_domain(struct protection_domain *domain) BUG_ON(!entry->domain); do_detach(entry); } - - spin_unlock_irqrestore(&domain->lock, flags); + WARN_ON(domain->dev_cnt != 0); } static void protection_domain_free(struct protection_domain *domain) @@ -2036,6 +2088,12 @@ static void protection_domain_free(struct protection_domain *domain) if (domain->iop.pgtbl_cfg.tlb) free_io_pgtable_ops(&domain->iop.iop.ops); + if (domain->flags & PD_IOMMUV2_MASK) + free_gcr3_table(domain); + + if (domain->iop.root) + free_page((unsigned long)domain->iop.root); + if (domain->id) domain_id_free(domain->id); @@ -2048,18 +2106,10 @@ static int protection_domain_init_v1(struct protection_domain *domain, int mode) BUG_ON(mode < 
PAGE_MODE_NONE || mode > PAGE_MODE_6_LEVEL); - spin_lock_init(&domain->lock); - domain->id = domain_id_alloc(); - if (!domain->id) - return -ENOMEM; - INIT_LIST_HEAD(&domain->dev_list); - if (mode != PAGE_MODE_NONE) { pt_root = (void *)get_zeroed_page(GFP_KERNEL); - if (!pt_root) { - domain_id_free(domain->id); + if (!pt_root) return -ENOMEM; - } } amd_iommu_domain_set_pgtable(domain, pt_root, mode); @@ -2069,20 +2119,12 @@ static int protection_domain_init_v1(struct protection_domain *domain, int mode) static int protection_domain_init_v2(struct protection_domain *domain) { - spin_lock_init(&domain->lock); - domain->id = domain_id_alloc(); - if (!domain->id) - return -ENOMEM; - INIT_LIST_HEAD(&domain->dev_list); - domain->flags |= PD_GIOV_MASK; domain->domain.pgsize_bitmap = AMD_IOMMU_PGSIZES_V2; - if (domain_enable_v2(domain, 1)) { - domain_id_free(domain->id); + if (setup_gcr3_table(domain, 1)) return -ENOMEM; - } return 0; } @@ -2092,57 +2134,60 @@ static struct protection_domain *protection_domain_alloc(unsigned int type) struct io_pgtable_ops *pgtbl_ops; struct protection_domain *domain; int pgtable; - int mode = DEFAULT_PGTABLE_LEVEL; int ret; + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return NULL; + + domain->id = domain_id_alloc(); + if (!domain->id) + goto out_err; + + spin_lock_init(&domain->lock); + INIT_LIST_HEAD(&domain->dev_list); + domain->nid = NUMA_NO_NODE; + + switch (type) { + /* No need to allocate io pgtable ops in passthrough mode */ + case IOMMU_DOMAIN_IDENTITY: + return domain; + case IOMMU_DOMAIN_DMA: + pgtable = amd_iommu_pgtable; + break; /* - * Force IOMMU v1 page table when iommu=pt and - * when allocating domain for pass-through devices. + * Force IOMMU v1 page table when allocating + * domain for pass-through devices. 
*/ - if (type == IOMMU_DOMAIN_IDENTITY) { - pgtable = AMD_IOMMU_V1; - mode = PAGE_MODE_NONE; - } else if (type == IOMMU_DOMAIN_UNMANAGED) { + case IOMMU_DOMAIN_UNMANAGED: pgtable = AMD_IOMMU_V1; - } else if (type == IOMMU_DOMAIN_DMA || type == IOMMU_DOMAIN_DMA_FQ) { - pgtable = amd_iommu_pgtable; - } else { - return NULL; + break; + default: + goto out_err; } - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) - return NULL; - switch (pgtable) { case AMD_IOMMU_V1: - ret = protection_domain_init_v1(domain, mode); + ret = protection_domain_init_v1(domain, DEFAULT_PGTABLE_LEVEL); break; case AMD_IOMMU_V2: ret = protection_domain_init_v2(domain); break; default: ret = -EINVAL; + break; } if (ret) goto out_err; - /* No need to allocate io pgtable ops in passthrough mode */ - if (type == IOMMU_DOMAIN_IDENTITY) - return domain; - - domain->nid = NUMA_NO_NODE; - pgtbl_ops = alloc_io_pgtable_ops(pgtable, &domain->iop.pgtbl_cfg, domain); - if (!pgtbl_ops) { - domain_id_free(domain->id); + if (!pgtbl_ops) goto out_err; - } return domain; out_err: - kfree(domain); + protection_domain_free(domain); return NULL; } @@ -2155,44 +2200,94 @@ static inline u64 dma_max_address(void) return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1); } -static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) +static bool amd_iommu_hd_support(struct amd_iommu *iommu) +{ + return iommu && (iommu->features & FEATURE_HDSUP); +} + +static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, + struct device *dev, u32 flags) { + bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; struct protection_domain *domain; + struct amd_iommu *iommu = NULL; + + if (dev) { + iommu = rlookup_amd_iommu(dev); + if (!iommu) + return ERR_PTR(-ENODEV); + } /* * Since DTE[Mode]=0 is prohibited on SNP-enabled system, * default to use IOMMU_DOMAIN_DMA[_FQ]. 
*/ if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY)) - return NULL; + return ERR_PTR(-EINVAL); + + if (dirty_tracking && !amd_iommu_hd_support(iommu)) + return ERR_PTR(-EOPNOTSUPP); domain = protection_domain_alloc(type); if (!domain) - return NULL; + return ERR_PTR(-ENOMEM); domain->domain.geometry.aperture_start = 0; domain->domain.geometry.aperture_end = dma_max_address(); domain->domain.geometry.force_aperture = true; + if (iommu) { + domain->domain.type = type; + domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap; + domain->domain.ops = iommu->iommu.ops->default_domain_ops; + + if (dirty_tracking) + domain->domain.dirty_ops = &amd_dirty_ops; + } + return &domain->domain; } -static void amd_iommu_domain_free(struct iommu_domain *dom) +static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type) { - struct protection_domain *domain; + struct iommu_domain *domain; - domain = to_pdomain(dom); + domain = do_iommu_domain_alloc(type, NULL, 0); + if (IS_ERR(domain)) + return NULL; + + return domain; +} + +static struct iommu_domain * +amd_iommu_domain_alloc_user(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) + +{ + unsigned int type = IOMMU_DOMAIN_UNMANAGED; - if (domain->dev_cnt > 0) - cleanup_domain(domain); + if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data) + return ERR_PTR(-EOPNOTSUPP); - BUG_ON(domain->dev_cnt != 0); + return do_iommu_domain_alloc(type, dev, flags); +} + +static void amd_iommu_domain_free(struct iommu_domain *dom) +{ + struct protection_domain *domain; + unsigned long flags; if (!dom) return; - if (domain->flags & PD_IOMMUV2_MASK) - free_gcr3_table(domain); + domain = to_pdomain(dom); + + spin_lock_irqsave(&domain->lock, flags); + + cleanup_domain(domain); + + spin_unlock_irqrestore(&domain->lock, flags); protection_domain_free(domain); } @@ -2214,6 +2309,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, dev_data->defer_attach = false; + /* + * Restrict to devices with compatible IOMMU hardware support + * when enforcement of dirty tracking is enabled. 
+ */ + if (dom->dirty_ops && !amd_iommu_hd_support(iommu)) + return -EINVAL; + if (dev_data->domain) detach_device(dev); @@ -2233,14 +2335,15 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, return ret; } -static void amd_iommu_iotlb_sync_map(struct iommu_domain *dom, - unsigned long iova, size_t size) +static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, + unsigned long iova, size_t size) { struct protection_domain *domain = to_pdomain(dom); struct io_pgtable_ops *ops = &domain->iop.iop.ops; if (ops->map_pages) domain_flush_np_cache(domain, iova, size); + return 0; } static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova, @@ -2332,6 +2435,11 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) return true; case IOMMU_CAP_DEFERRED_FLUSH: return true; + case IOMMU_CAP_DIRTY_TRACKING: { + struct amd_iommu *iommu = rlookup_amd_iommu(dev); + + return amd_iommu_hd_support(iommu); + } default: break; } @@ -2339,6 +2447,73 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) return false; } +static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable) +{ + struct protection_domain *pdomain = to_pdomain(domain); + struct dev_table_entry *dev_table; + struct iommu_dev_data *dev_data; + bool domain_flush = false; + struct amd_iommu *iommu; + unsigned long flags; + u64 pte_root; + + spin_lock_irqsave(&pdomain->lock, flags); + if (!(pdomain->dirty_tracking ^ enable)) { + spin_unlock_irqrestore(&pdomain->lock, flags); + return 0; + } + + list_for_each_entry(dev_data, &pdomain->dev_list, list) { + iommu = rlookup_amd_iommu(dev_data->dev); + if (!iommu) + continue; + + dev_table = get_dev_table(iommu); + pte_root = dev_table[dev_data->devid].data[0]; + + pte_root = (enable ? pte_root | DTE_FLAG_HAD : + pte_root & ~DTE_FLAG_HAD); + + /* Flush device DTE */ + dev_table[dev_data->devid].data[0] = pte_root; + device_flush_dte(dev_data); + domain_flush = true; + } + + /* Flush IOTLB to mark IOPTE dirty on the next translation(s) */ + if (domain_flush) { + amd_iommu_domain_flush_tlb_pde(pdomain); + amd_iommu_domain_flush_complete(pdomain); + } + pdomain->dirty_tracking = enable; + spin_unlock_irqrestore(&pdomain->lock, flags); + + return 0; +} + +static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct protection_domain *pdomain = to_pdomain(domain); + struct io_pgtable_ops *ops = &pdomain->iop.iop.ops; + unsigned long lflags; + + if (!ops || !ops->read_and_clear_dirty) + return -EOPNOTSUPP; + + spin_lock_irqsave(&pdomain->lock, lflags); + if (!pdomain->dirty_tracking && dirty->bitmap) { + spin_unlock_irqrestore(&pdomain->lock, lflags); + return -EINVAL; + } + spin_unlock_irqrestore(&pdomain->lock, lflags); + + return ops->read_and_clear_dirty(ops, iova, size, flags, dirty); +} + static void amd_iommu_get_resv_regions(struct device *dev, struct list_head *head) { @@ -2406,7 +2581,6 @@ bool amd_iommu_is_attach_deferred(struct device *dev) return dev_data->defer_attach; } -EXPORT_SYMBOL_GPL(amd_iommu_is_attach_deferred); static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) { @@ -2446,7 +2620,7 @@ static int amd_iommu_def_domain_type(struct device *dev) * and require remapping. * - SNP is enabled, because it prohibits DTE[Mode]=0. 
*/ - if (dev_data->iommu_v2 && + if (pdev_pasid_supported(dev_data) && !cc_platform_has(CC_ATTR_MEM_ENCRYPT) && !amd_iommu_snp_en) { return IOMMU_DOMAIN_IDENTITY; @@ -2461,9 +2635,15 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) return true; } +const struct iommu_dirty_ops amd_dirty_ops = { + .set_dirty_tracking = amd_iommu_set_dirty_tracking, + .read_and_clear_dirty = amd_iommu_read_and_clear_dirty, +}; + const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .domain_alloc = amd_iommu_domain_alloc, + .domain_alloc_user = amd_iommu_domain_alloc_user, .probe_device = amd_iommu_probe_device, .release_device = amd_iommu_release_device, .probe_finalize = amd_iommu_probe_finalize, @@ -2485,93 +2665,6 @@ const struct iommu_ops amd_iommu_ops = { } }; -/***************************************************************************** - * - * The next functions do a basic initialization of IOMMU for pass through - * mode - * - * In passthrough mode the IOMMU is initialized and enabled but not used for - * DMA-API translation. - * - *****************************************************************************/ - -/* IOMMUv2 specific functions */ -int amd_iommu_register_ppr_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_register(&ppr_notifier, nb); -} -EXPORT_SYMBOL(amd_iommu_register_ppr_notifier); - -int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(&ppr_notifier, nb); -} -EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier); - -void amd_iommu_domain_direct_map(struct iommu_domain *dom) -{ - struct protection_domain *domain = to_pdomain(dom); - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - - if (domain->iop.pgtbl_cfg.tlb) - free_io_pgtable_ops(&domain->iop.iop.ops); - - spin_unlock_irqrestore(&domain->lock, flags); -} -EXPORT_SYMBOL(amd_iommu_domain_direct_map); - -/* Note: This function expects iommu_domain->lock to be held prior calling the function. */ -static int domain_enable_v2(struct protection_domain *domain, int pasids) -{ - int levels; - - /* Number of GCR3 table levels required */ - for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9) - levels += 1; - - if (levels > amd_iommu_max_glx_val) - return -EINVAL; - - domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC); - if (domain->gcr3_tbl == NULL) - return -ENOMEM; - - domain->glx = levels; - domain->flags |= PD_IOMMUV2_MASK; - - amd_iommu_domain_update(domain); - - return 0; -} - -int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids) -{ - struct protection_domain *pdom = to_pdomain(dom); - unsigned long flags; - int ret; - - spin_lock_irqsave(&pdom->lock, flags); - - /* - * Save us all sanity checks whether devices already in the - * domain support IOMMUv2. Just force that the domain has no - * devices attached when it is switched into IOMMUv2 mode. - */ - ret = -EBUSY; - if (pdom->dev_cnt > 0 || pdom->flags & PD_IOMMUV2_MASK) - goto out; - - if (!pdom->gcr3_tbl) - ret = domain_enable_v2(pdom, pasids); - -out: - spin_unlock_irqrestore(&pdom->lock, flags); - return ret; -} -EXPORT_SYMBOL(amd_iommu_domain_enable_v2); - static int __flush_pasid(struct protection_domain *domain, u32 pasid, u64 address, bool size) { @@ -2609,10 +2702,10 @@ static int __flush_pasid(struct protection_domain *domain, u32 pasid, There might be non-IOMMUv2 capable devices in an IOMMUv2 * domain. 
*/ - if (!dev_data->ats.enabled) + if (!dev_data->ats_enabled) continue; - qdep = dev_data->ats.qdep; + qdep = dev_data->ats_qdep; iommu = rlookup_amd_iommu(dev_data->dev); if (!iommu) continue; @@ -2653,7 +2746,6 @@ int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, return ret; } -EXPORT_SYMBOL(amd_iommu_flush_page); static int __amd_iommu_flush_tlb(struct protection_domain *domain, u32 pasid) { @@ -2673,7 +2765,6 @@ int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid) return ret; } -EXPORT_SYMBOL(amd_iommu_flush_tlb); static u64 *__get_gcr3_pte(u64 *root, int level, u32 pasid, bool alloc) { @@ -2753,7 +2844,6 @@ int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid, return ret; } -EXPORT_SYMBOL(amd_iommu_domain_set_gcr3); int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, u32 pasid) { @@ -2767,7 +2857,6 @@ int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, u32 pasid) return ret; } -EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3); int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid, int status, int tag) @@ -2786,49 +2875,6 @@ int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid, return iommu_queue_command(iommu, &cmd); } -EXPORT_SYMBOL(amd_iommu_complete_ppr); - -int amd_iommu_device_info(struct pci_dev *pdev, - struct amd_iommu_device_info *info) -{ - int max_pasids; - int pos; - - if (pdev == NULL || info == NULL) - return -EINVAL; - - if (!amd_iommu_v2_supported()) - return -EINVAL; - - memset(info, 0, sizeof(*info)); - - if (pci_ats_supported(pdev)) - info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; - - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); - if (pos) - info->flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP; - - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); - if (pos) { - int features; - - max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1)); - max_pasids = min(max_pasids, (1 << 20)); - - info->flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP; - info->max_pasids = min(pci_max_pasids(pdev), max_pasids); - - features = pci_pasid_features(pdev); - if (features & PCI_PASID_CAP_EXEC) - info->flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP; - if (features & PCI_PASID_CAP_PRIV) - info->flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP; - } - - return 0; -} -EXPORT_SYMBOL(amd_iommu_device_info); #ifdef CONFIG_IRQ_REMAP diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c deleted file mode 100644 index 57c2fb1146..0000000000 --- a/drivers/iommu/amd/iommu_v2.c +++ /dev/null @@ -1,996 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2010-2012 Advanced Micro Devices, Inc. 
- * Author: Joerg Roedel <jroedel@suse.de> - */ - -#define pr_fmt(fmt) "AMD-Vi: " fmt - -#include <linux/refcount.h> -#include <linux/mmu_notifier.h> -#include <linux/amd-iommu.h> -#include <linux/mm_types.h> -#include <linux/profile.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/sched/mm.h> -#include <linux/wait.h> -#include <linux/pci.h> -#include <linux/gfp.h> -#include <linux/cc_platform.h> - -#include "amd_iommu.h" - -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Joerg Roedel <jroedel@suse.de>"); - -#define PRI_QUEUE_SIZE 512 - -struct pri_queue { - atomic_t inflight; - bool finish; - int status; -}; - -struct pasid_state { - struct list_head list; /* For global state-list */ - refcount_t count; /* Reference count */ - unsigned mmu_notifier_count; /* Counting nested mmu_notifier - calls */ - struct mm_struct *mm; /* mm_struct for the faults */ - struct mmu_notifier mn; /* mmu_notifier handle */ - struct pri_queue pri[PRI_QUEUE_SIZE]; /* PRI tag states */ - struct device_state *device_state; /* Link to our device_state */ - u32 pasid; /* PASID index */ - bool invalid; /* Used during setup and - teardown of the pasid */ - spinlock_t lock; /* Protect pri_queues and - mmu_notifer_count */ - wait_queue_head_t wq; /* To wait for count == 0 */ -}; - -struct device_state { - struct list_head list; - u32 sbdf; - atomic_t count; - struct pci_dev *pdev; - struct pasid_state **states; - struct iommu_domain *domain; - int pasid_levels; - int max_pasids; - amd_iommu_invalid_ppr_cb inv_ppr_cb; - amd_iommu_invalidate_ctx inv_ctx_cb; - spinlock_t lock; - wait_queue_head_t wq; -}; - -struct fault { - struct work_struct work; - struct device_state *dev_state; - struct pasid_state *state; - struct mm_struct *mm; - u64 address; - u32 pasid; - u16 tag; - u16 finish; - u16 flags; -}; - -static LIST_HEAD(state_list); -static DEFINE_SPINLOCK(state_lock); - -static struct workqueue_struct *iommu_wq; - -static void free_pasid_states(struct device_state *dev_state); - -static struct device_state *__get_device_state(u32 sbdf) -{ - struct device_state *dev_state; - - list_for_each_entry(dev_state, &state_list, list) { - if (dev_state->sbdf == sbdf) - return dev_state; - } - - return NULL; -} - -static struct device_state *get_device_state(u32 sbdf) -{ - struct device_state *dev_state; - unsigned long flags; - - spin_lock_irqsave(&state_lock, flags); - dev_state = __get_device_state(sbdf); - if (dev_state != NULL) - atomic_inc(&dev_state->count); - spin_unlock_irqrestore(&state_lock, flags); - - return dev_state; -} - -static void free_device_state(struct device_state *dev_state) -{ - struct iommu_group *group; - - /* Get rid of any remaining pasid states */ - free_pasid_states(dev_state); - - /* - * Wait until the last reference is dropped before freeing - * the device state. - */ - wait_event(dev_state->wq, !atomic_read(&dev_state->count)); - - /* - * First detach device from domain - No more PRI requests will arrive - * from that device after it is unbound from the IOMMUv2 domain. 
- */ - group = iommu_group_get(&dev_state->pdev->dev); - if (WARN_ON(!group)) - return; - - iommu_detach_group(dev_state->domain, group); - - iommu_group_put(group); - - /* Everything is down now, free the IOMMUv2 domain */ - iommu_domain_free(dev_state->domain); - - /* Finally get rid of the device-state */ - kfree(dev_state); -} - -static void put_device_state(struct device_state *dev_state) -{ - if (atomic_dec_and_test(&dev_state->count)) - wake_up(&dev_state->wq); -} - -/* Must be called under dev_state->lock */ -static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state, - u32 pasid, bool alloc) -{ - struct pasid_state **root, **ptr; - int level, index; - - level = dev_state->pasid_levels; - root = dev_state->states; - - while (true) { - - index = (pasid >> (9 * level)) & 0x1ff; - ptr = &root[index]; - - if (level == 0) - break; - - if (*ptr == NULL) { - if (!alloc) - return NULL; - - *ptr = (void *)get_zeroed_page(GFP_ATOMIC); - if (*ptr == NULL) - return NULL; - } - - root = (struct pasid_state **)*ptr; - level -= 1; - } - - return ptr; -} - -static int set_pasid_state(struct device_state *dev_state, - struct pasid_state *pasid_state, - u32 pasid) -{ - struct pasid_state **ptr; - unsigned long flags; - int ret; - - spin_lock_irqsave(&dev_state->lock, flags); - ptr = __get_pasid_state_ptr(dev_state, pasid, true); - - ret = -ENOMEM; - if (ptr == NULL) - goto out_unlock; - - ret = -ENOMEM; - if (*ptr != NULL) - goto out_unlock; - - *ptr = pasid_state; - - ret = 0; - -out_unlock: - spin_unlock_irqrestore(&dev_state->lock, flags); - - return ret; -} - -static void clear_pasid_state(struct device_state *dev_state, u32 pasid) -{ - struct pasid_state **ptr; - unsigned long flags; - - spin_lock_irqsave(&dev_state->lock, flags); - ptr = __get_pasid_state_ptr(dev_state, pasid, true); - - if (ptr == NULL) - goto out_unlock; - - *ptr = NULL; - -out_unlock: - spin_unlock_irqrestore(&dev_state->lock, flags); -} - -static struct pasid_state *get_pasid_state(struct device_state *dev_state, - u32 pasid) -{ - struct pasid_state **ptr, *ret = NULL; - unsigned long flags; - - spin_lock_irqsave(&dev_state->lock, flags); - ptr = __get_pasid_state_ptr(dev_state, pasid, false); - - if (ptr == NULL) - goto out_unlock; - - ret = *ptr; - if (ret) - refcount_inc(&ret->count); - -out_unlock: - spin_unlock_irqrestore(&dev_state->lock, flags); - - return ret; -} - -static void free_pasid_state(struct pasid_state *pasid_state) -{ - kfree(pasid_state); -} - -static void put_pasid_state(struct pasid_state *pasid_state) -{ - if (refcount_dec_and_test(&pasid_state->count)) - wake_up(&pasid_state->wq); -} - -static void put_pasid_state_wait(struct pasid_state *pasid_state) -{ - if (!refcount_dec_and_test(&pasid_state->count)) - wait_event(pasid_state->wq, !refcount_read(&pasid_state->count)); - free_pasid_state(pasid_state); -} - -static void unbind_pasid(struct pasid_state *pasid_state) -{ - struct iommu_domain *domain; - - domain = pasid_state->device_state->domain; - - /* - * Mark pasid_state as invalid, no more faults will we added to the - * work queue after this is visible everywhere. 
- */ - pasid_state->invalid = true; - - /* Make sure this is visible */ - smp_wmb(); - - /* After this the device/pasid can't access the mm anymore */ - amd_iommu_domain_clear_gcr3(domain, pasid_state->pasid); - - /* Make sure no more pending faults are in the queue */ - flush_workqueue(iommu_wq); -} - -static void free_pasid_states_level1(struct pasid_state **tbl) -{ - int i; - - for (i = 0; i < 512; ++i) { - if (tbl[i] == NULL) - continue; - - free_page((unsigned long)tbl[i]); - } -} - -static void free_pasid_states_level2(struct pasid_state **tbl) -{ - struct pasid_state **ptr; - int i; - - for (i = 0; i < 512; ++i) { - if (tbl[i] == NULL) - continue; - - ptr = (struct pasid_state **)tbl[i]; - free_pasid_states_level1(ptr); - } -} - -static void free_pasid_states(struct device_state *dev_state) -{ - struct pasid_state *pasid_state; - int i; - - for (i = 0; i < dev_state->max_pasids; ++i) { - pasid_state = get_pasid_state(dev_state, i); - if (pasid_state == NULL) - continue; - - put_pasid_state(pasid_state); - - /* Clear the pasid state so that the pasid can be re-used */ - clear_pasid_state(dev_state, pasid_state->pasid); - - /* - * This will call the mn_release function and - * unbind the PASID - */ - mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm); - - put_pasid_state_wait(pasid_state); /* Reference taken in - amd_iommu_bind_pasid */ - - /* Drop reference taken in amd_iommu_bind_pasid */ - put_device_state(dev_state); - } - - if (dev_state->pasid_levels == 2) - free_pasid_states_level2(dev_state->states); - else if (dev_state->pasid_levels == 1) - free_pasid_states_level1(dev_state->states); - else - BUG_ON(dev_state->pasid_levels != 0); - - free_page((unsigned long)dev_state->states); -} - -static struct pasid_state *mn_to_state(struct mmu_notifier *mn) -{ - return container_of(mn, struct pasid_state, mn); -} - -static void mn_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - struct pasid_state *pasid_state; - struct device_state *dev_state; - - pasid_state = mn_to_state(mn); - dev_state = pasid_state->device_state; - - if ((start ^ (end - 1)) < PAGE_SIZE) - amd_iommu_flush_page(dev_state->domain, pasid_state->pasid, - start); - else - amd_iommu_flush_tlb(dev_state->domain, pasid_state->pasid); -} - -static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) -{ - struct pasid_state *pasid_state; - struct device_state *dev_state; - bool run_inv_ctx_cb; - - might_sleep(); - - pasid_state = mn_to_state(mn); - dev_state = pasid_state->device_state; - run_inv_ctx_cb = !pasid_state->invalid; - - if (run_inv_ctx_cb && dev_state->inv_ctx_cb) - dev_state->inv_ctx_cb(dev_state->pdev, pasid_state->pasid); - - unbind_pasid(pasid_state); -} - -static const struct mmu_notifier_ops iommu_mn = { - .release = mn_release, - .arch_invalidate_secondary_tlbs = mn_arch_invalidate_secondary_tlbs, -}; - -static void set_pri_tag_status(struct pasid_state *pasid_state, - u16 tag, int status) -{ - unsigned long flags; - - spin_lock_irqsave(&pasid_state->lock, flags); - pasid_state->pri[tag].status = status; - spin_unlock_irqrestore(&pasid_state->lock, flags); -} - -static void finish_pri_tag(struct device_state *dev_state, - struct pasid_state *pasid_state, - u16 tag) -{ - unsigned long flags; - - spin_lock_irqsave(&pasid_state->lock, flags); - if (atomic_dec_and_test(&pasid_state->pri[tag].inflight) && - pasid_state->pri[tag].finish) { - amd_iommu_complete_ppr(dev_state->pdev, pasid_state->pasid, - 
pasid_state->pri[tag].status, tag); - pasid_state->pri[tag].finish = false; - pasid_state->pri[tag].status = PPR_SUCCESS; - } - spin_unlock_irqrestore(&pasid_state->lock, flags); -} - -static void handle_fault_error(struct fault *fault) -{ - int status; - - if (!fault->dev_state->inv_ppr_cb) { - set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); - return; - } - - status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev, - fault->pasid, - fault->address, - fault->flags); - switch (status) { - case AMD_IOMMU_INV_PRI_RSP_SUCCESS: - set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS); - break; - case AMD_IOMMU_INV_PRI_RSP_INVALID: - set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); - break; - case AMD_IOMMU_INV_PRI_RSP_FAIL: - set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE); - break; - default: - BUG(); - } -} - -static bool access_error(struct vm_area_struct *vma, struct fault *fault) -{ - unsigned long requested = 0; - - if (fault->flags & PPR_FAULT_EXEC) - requested |= VM_EXEC; - - if (fault->flags & PPR_FAULT_READ) - requested |= VM_READ; - - if (fault->flags & PPR_FAULT_WRITE) - requested |= VM_WRITE; - - return (requested & ~vma->vm_flags) != 0; -} - -static void do_fault(struct work_struct *work) -{ - struct fault *fault = container_of(work, struct fault, work); - struct vm_area_struct *vma; - vm_fault_t ret = VM_FAULT_ERROR; - unsigned int flags = 0; - struct mm_struct *mm; - u64 address; - - mm = fault->state->mm; - address = fault->address; - - if (fault->flags & PPR_FAULT_USER) - flags |= FAULT_FLAG_USER; - if (fault->flags & PPR_FAULT_WRITE) - flags |= FAULT_FLAG_WRITE; - flags |= FAULT_FLAG_REMOTE; - - mmap_read_lock(mm); - vma = vma_lookup(mm, address); - if (!vma) - /* failed to get a vma in the right range */ - goto out; - - /* Check if we have the right permissions on the vma */ - if (access_error(vma, fault)) - goto out; - - ret = handle_mm_fault(vma, address, flags, NULL); -out: - mmap_read_unlock(mm); - - if (ret & VM_FAULT_ERROR) - /* failed to service fault */ - handle_fault_error(fault); - - finish_pri_tag(fault->dev_state, fault->state, fault->tag); - - put_pasid_state(fault->state); - - kfree(fault); -} - -static int ppr_notifier(struct notifier_block *nb, unsigned long e, void *data) -{ - struct amd_iommu_fault *iommu_fault; - struct pasid_state *pasid_state; - struct device_state *dev_state; - struct pci_dev *pdev = NULL; - unsigned long flags; - struct fault *fault; - bool finish; - u16 tag, devid, seg_id; - int ret; - - iommu_fault = data; - tag = iommu_fault->tag & 0x1ff; - finish = (iommu_fault->tag >> 9) & 1; - - seg_id = PCI_SBDF_TO_SEGID(iommu_fault->sbdf); - devid = PCI_SBDF_TO_DEVID(iommu_fault->sbdf); - pdev = pci_get_domain_bus_and_slot(seg_id, PCI_BUS_NUM(devid), - devid & 0xff); - if (!pdev) - return -ENODEV; - - ret = NOTIFY_DONE; - - /* In kdump kernel pci dev is not initialized yet -> send INVALID */ - if (amd_iommu_is_attach_deferred(&pdev->dev)) { - amd_iommu_complete_ppr(pdev, iommu_fault->pasid, - PPR_INVALID, tag); - goto out; - } - - dev_state = get_device_state(iommu_fault->sbdf); - if (dev_state == NULL) - goto out; - - pasid_state = get_pasid_state(dev_state, iommu_fault->pasid); - if (pasid_state == NULL || pasid_state->invalid) { - /* We know the device but not the PASID -> send INVALID */ - amd_iommu_complete_ppr(dev_state->pdev, iommu_fault->pasid, - PPR_INVALID, tag); - goto out_drop_state; - } - - spin_lock_irqsave(&pasid_state->lock, flags); - atomic_inc(&pasid_state->pri[tag].inflight); - if 
(finish) - pasid_state->pri[tag].finish = true; - spin_unlock_irqrestore(&pasid_state->lock, flags); - - fault = kzalloc(sizeof(*fault), GFP_ATOMIC); - if (fault == NULL) { - /* We are OOM - send success and let the device re-fault */ - finish_pri_tag(dev_state, pasid_state, tag); - goto out_drop_state; - } - - fault->dev_state = dev_state; - fault->address = iommu_fault->address; - fault->state = pasid_state; - fault->tag = tag; - fault->finish = finish; - fault->pasid = iommu_fault->pasid; - fault->flags = iommu_fault->flags; - INIT_WORK(&fault->work, do_fault); - - queue_work(iommu_wq, &fault->work); - - ret = NOTIFY_OK; - -out_drop_state: - - if (ret != NOTIFY_OK && pasid_state) - put_pasid_state(pasid_state); - - put_device_state(dev_state); - -out: - pci_dev_put(pdev); - return ret; -} - -static struct notifier_block ppr_nb = { - .notifier_call = ppr_notifier, -}; - -int amd_iommu_bind_pasid(struct pci_dev *pdev, u32 pasid, - struct task_struct *task) -{ - struct pasid_state *pasid_state; - struct device_state *dev_state; - struct mm_struct *mm; - u32 sbdf; - int ret; - - might_sleep(); - - if (!amd_iommu_v2_supported()) - return -ENODEV; - - sbdf = get_pci_sbdf_id(pdev); - dev_state = get_device_state(sbdf); - - if (dev_state == NULL) - return -EINVAL; - - ret = -EINVAL; - if (pasid >= dev_state->max_pasids) - goto out; - - ret = -ENOMEM; - pasid_state = kzalloc(sizeof(*pasid_state), GFP_KERNEL); - if (pasid_state == NULL) - goto out; - - - refcount_set(&pasid_state->count, 1); - init_waitqueue_head(&pasid_state->wq); - spin_lock_init(&pasid_state->lock); - - mm = get_task_mm(task); - pasid_state->mm = mm; - pasid_state->device_state = dev_state; - pasid_state->pasid = pasid; - pasid_state->invalid = true; /* Mark as valid only if we are - done with setting up the pasid */ - pasid_state->mn.ops = &iommu_mn; - - if (pasid_state->mm == NULL) - goto out_free; - - ret = mmu_notifier_register(&pasid_state->mn, mm); - if (ret) - goto out_free; - - ret = set_pasid_state(dev_state, pasid_state, pasid); - if (ret) - goto out_unregister; - - ret = amd_iommu_domain_set_gcr3(dev_state->domain, pasid, - __pa(pasid_state->mm->pgd)); - if (ret) - goto out_clear_state; - - /* Now we are ready to handle faults */ - pasid_state->invalid = false; - - /* - * Drop the reference to the mm_struct here. We rely on the - * mmu_notifier release call-back to inform us when the mm - * is going away. - */ - mmput(mm); - - return 0; - -out_clear_state: - clear_pasid_state(dev_state, pasid); - -out_unregister: - mmu_notifier_unregister(&pasid_state->mn, mm); - mmput(mm); - -out_free: - free_pasid_state(pasid_state); - -out: - put_device_state(dev_state); - - return ret; -} -EXPORT_SYMBOL(amd_iommu_bind_pasid); - -void amd_iommu_unbind_pasid(struct pci_dev *pdev, u32 pasid) -{ - struct pasid_state *pasid_state; - struct device_state *dev_state; - u32 sbdf; - - might_sleep(); - - if (!amd_iommu_v2_supported()) - return; - - sbdf = get_pci_sbdf_id(pdev); - dev_state = get_device_state(sbdf); - if (dev_state == NULL) - return; - - if (pasid >= dev_state->max_pasids) - goto out; - - pasid_state = get_pasid_state(dev_state, pasid); - if (pasid_state == NULL) - goto out; - /* - * Drop reference taken here. We are safe because we still hold - * the reference taken in the amd_iommu_bind_pasid function. 
- */ - put_pasid_state(pasid_state); - - /* Clear the pasid state so that the pasid can be re-used */ - clear_pasid_state(dev_state, pasid_state->pasid); - - /* - * Call mmu_notifier_unregister to drop our reference - * to pasid_state->mm - */ - mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm); - - put_pasid_state_wait(pasid_state); /* Reference taken in - amd_iommu_bind_pasid */ -out: - /* Drop reference taken in this function */ - put_device_state(dev_state); - - /* Drop reference taken in amd_iommu_bind_pasid */ - put_device_state(dev_state); -} -EXPORT_SYMBOL(amd_iommu_unbind_pasid); - -int amd_iommu_init_device(struct pci_dev *pdev, int pasids) -{ - struct device_state *dev_state; - struct iommu_group *group; - unsigned long flags; - int ret, tmp; - u32 sbdf; - - might_sleep(); - - /* - * When memory encryption is active the device is likely not in a - * direct-mapped domain. Forbid using IOMMUv2 functionality for now. - */ - if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) - return -ENODEV; - - if (!amd_iommu_v2_supported()) - return -ENODEV; - - if (pasids <= 0 || pasids > (PASID_MASK + 1)) - return -EINVAL; - - sbdf = get_pci_sbdf_id(pdev); - - dev_state = kzalloc(sizeof(*dev_state), GFP_KERNEL); - if (dev_state == NULL) - return -ENOMEM; - - spin_lock_init(&dev_state->lock); - init_waitqueue_head(&dev_state->wq); - dev_state->pdev = pdev; - dev_state->sbdf = sbdf; - - tmp = pasids; - for (dev_state->pasid_levels = 0; (tmp - 1) & ~0x1ff; tmp >>= 9) - dev_state->pasid_levels += 1; - - atomic_set(&dev_state->count, 1); - dev_state->max_pasids = pasids; - - ret = -ENOMEM; - dev_state->states = (void *)get_zeroed_page(GFP_KERNEL); - if (dev_state->states == NULL) - goto out_free_dev_state; - - dev_state->domain = iommu_domain_alloc(&pci_bus_type); - if (dev_state->domain == NULL) - goto out_free_states; - - /* See iommu_is_default_domain() */ - dev_state->domain->type = IOMMU_DOMAIN_IDENTITY; - amd_iommu_domain_direct_map(dev_state->domain); - - ret = amd_iommu_domain_enable_v2(dev_state->domain, pasids); - if (ret) - goto out_free_domain; - - group = iommu_group_get(&pdev->dev); - if (!group) { - ret = -EINVAL; - goto out_free_domain; - } - - ret = iommu_attach_group(dev_state->domain, group); - if (ret != 0) - goto out_drop_group; - - iommu_group_put(group); - - spin_lock_irqsave(&state_lock, flags); - - if (__get_device_state(sbdf) != NULL) { - spin_unlock_irqrestore(&state_lock, flags); - ret = -EBUSY; - goto out_free_domain; - } - - list_add_tail(&dev_state->list, &state_list); - - spin_unlock_irqrestore(&state_lock, flags); - - return 0; - -out_drop_group: - iommu_group_put(group); - -out_free_domain: - iommu_domain_free(dev_state->domain); - -out_free_states: - free_page((unsigned long)dev_state->states); - -out_free_dev_state: - kfree(dev_state); - - return ret; -} -EXPORT_SYMBOL(amd_iommu_init_device); - -void amd_iommu_free_device(struct pci_dev *pdev) -{ - struct device_state *dev_state; - unsigned long flags; - u32 sbdf; - - if (!amd_iommu_v2_supported()) - return; - - sbdf = get_pci_sbdf_id(pdev); - - spin_lock_irqsave(&state_lock, flags); - - dev_state = __get_device_state(sbdf); - if (dev_state == NULL) { - spin_unlock_irqrestore(&state_lock, flags); - return; - } - - list_del(&dev_state->list); - - spin_unlock_irqrestore(&state_lock, flags); - - put_device_state(dev_state); - free_device_state(dev_state); -} -EXPORT_SYMBOL(amd_iommu_free_device); - -int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev, - amd_iommu_invalid_ppr_cb cb) -{ - struct device_state 
*dev_state;
- unsigned long flags;
- u32 sbdf;
- int ret;
-
- if (!amd_iommu_v2_supported())
- return -ENODEV;
-
- sbdf = get_pci_sbdf_id(pdev);
-
- spin_lock_irqsave(&state_lock, flags);
-
- ret = -EINVAL;
- dev_state = __get_device_state(sbdf);
- if (dev_state == NULL)
- goto out_unlock;
-
- dev_state->inv_ppr_cb = cb;
-
- ret = 0;
-
-out_unlock:
- spin_unlock_irqrestore(&state_lock, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(amd_iommu_set_invalid_ppr_cb);
-
-int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev,
- amd_iommu_invalidate_ctx cb)
-{
- struct device_state *dev_state;
- unsigned long flags;
- u32 sbdf;
- int ret;
-
- if (!amd_iommu_v2_supported())
- return -ENODEV;
-
- sbdf = get_pci_sbdf_id(pdev);
-
- spin_lock_irqsave(&state_lock, flags);
-
- ret = -EINVAL;
- dev_state = __get_device_state(sbdf);
- if (dev_state == NULL)
- goto out_unlock;
-
- dev_state->inv_ctx_cb = cb;
-
- ret = 0;
-
-out_unlock:
- spin_unlock_irqrestore(&state_lock, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(amd_iommu_set_invalidate_ctx_cb);
-
-static int __init amd_iommu_v2_init(void)
-{
- int ret;
-
- if (!amd_iommu_v2_supported()) {
- pr_info("AMD IOMMUv2 functionality not available on this system - This is not a bug.\n");
- /*
- * Load anyway to provide the symbols to other modules
- * which may use AMD IOMMUv2 optionally.
- */
- return 0;
- }
-
- ret = -ENOMEM;
- iommu_wq = alloc_workqueue("amd_iommu_v2", WQ_MEM_RECLAIM, 0);
- if (iommu_wq == NULL)
- goto out;
-
- amd_iommu_register_ppr_notifier(&ppr_nb);
-
- pr_info("AMD IOMMUv2 loaded and initialized\n");
-
- return 0;
-
-out:
- return ret;
-}
-
-static void __exit amd_iommu_v2_exit(void)
-{
- struct device_state *dev_state, *next;
- unsigned long flags;
- LIST_HEAD(freelist);
-
- if (!amd_iommu_v2_supported())
- return;
-
- amd_iommu_unregister_ppr_notifier(&ppr_nb);
-
- flush_workqueue(iommu_wq);
-
- /*
- * The loop below might call flush_workqueue(), so call
- * destroy_workqueue() after it
- */
- spin_lock_irqsave(&state_lock, flags);
-
- list_for_each_entry_safe(dev_state, next, &state_list, list) {
- WARN_ON_ONCE(1);
-
- put_device_state(dev_state);
- list_del(&dev_state->list);
- list_add_tail(&dev_state->list, &freelist);
- }
-
- spin_unlock_irqrestore(&state_lock, flags);
-
- /*
- * Since free_device_state waits on the count to be zero,
- * we need to free dev_state outside the spinlock.
- */
- list_for_each_entry_safe(dev_state, next, &freelist, list) {
- list_del(&dev_state->list);
- free_device_state(dev_state);
- }
-
- destroy_workqueue(iommu_wq);
-}
-
-module_init(amd_iommu_v2_init);
-module_exit(amd_iommu_v2_exit);
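
Note on the deleted PASID bookkeeping: __get_pasid_state_ptr() above walks a small per-device radix table, consuming 9 bits of the PASID per level, with each node being one page of 512 pointers. The following is a minimal plain-C sketch of that walk (names such as pasid_node and pasid_slot are illustrative, and calloc stands in for get_zeroed_page()):

#include <stdlib.h>

#define LEVEL_BITS 9
#define LEVEL_SIZE (1 << LEVEL_BITS)   /* 512 slots per node; one page of pointers in the kernel */

struct pasid_node {
	void *slots[LEVEL_SIZE];
};

/* Return the leaf slot for @pasid, allocating intermediate nodes when @alloc is set. */
static void **pasid_slot(struct pasid_node *root, int levels,
			 unsigned int pasid, int alloc)
{
	struct pasid_node *node = root;

	for (int level = levels; ; level--) {
		unsigned int index = (pasid >> (LEVEL_BITS * level)) & (LEVEL_SIZE - 1);
		void **slot = &node->slots[index];

		if (level == 0)
			return slot;   /* leaf: holds the pasid_state pointer itself */

		if (!*slot) {
			if (!alloc)
				return NULL;
			*slot = calloc(1, sizeof(struct pasid_node));
			if (!*slot)
				return NULL;
		}
		node = *slot;
	}
}

For example, with 65536 supported PASIDs the removed level computation in amd_iommu_init_device() ("(tmp - 1) & ~0x1ff; tmp >>= 9") yields pasid_levels = 1, so every lookup touches exactly two nodes: the root and one leaf page.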
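
The teardown paths above (put_pasid_state()/put_pasid_state_wait(), and the device_state handling around free_device_state()) rely on a refcount-plus-waitqueue pattern: the last put wakes a waiter, which is then free to release the object. A condensed sketch of that pattern, with purely illustrative names (example_state and friends are not kernel APIs):

#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/wait.h>

struct example_state {
	refcount_t count;
	wait_queue_head_t wq;
};

static struct example_state *example_alloc(void)
{
	struct example_state *s = kzalloc(sizeof(*s), GFP_KERNEL);

	if (!s)
		return NULL;
	refcount_set(&s->count, 1);     /* initial reference held by the creator */
	init_waitqueue_head(&s->wq);
	return s;
}

static void example_put(struct example_state *s)
{
	/* The last put wakes whoever is waiting to tear the object down. */
	if (refcount_dec_and_test(&s->count))
		wake_up(&s->wq);
}

static void example_put_and_wait(struct example_state *s)
{
	/* Drop our reference; if others remain, sleep until they are gone, then free. */
	if (!refcount_dec_and_test(&s->count))
		wait_event(s->wq, !refcount_read(&s->count));
	kfree(s);
}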
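
With this file removed, the exported bind/unbind entry points (amd_iommu_bind_pasid(), amd_iommu_unbind_pasid(), amd_iommu_init_device(), ...) are gone; drivers that need to share a process address space with a device are expected to go through the generic IOMMU SVA API instead. The sketch below is a hedged illustration of that usage, not part of this patch: the exact signatures (iommu_sva_bind_device() taking only a device and an mm, IOMMU_DEV_FEAT_SVA, iommu_sva_get_pasid()) reflect kernels around the time of this change and may differ on other versions.

#include <linux/err.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/sched.h>

/* Bind the calling process' address space to @pdev via the core SVA API. */
static int example_enable_sva(struct pci_dev *pdev, struct iommu_sva **handle,
			      u32 *pasid)
{
	struct iommu_sva *sva;
	int ret;

	/* Enable PASID/PRI based SVA for this device, if the IOMMU supports it. */
	ret = iommu_dev_enable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA);
	if (ret)
		return ret;

	/* The core allocates the PASID and sets up the mmu_notifier internally. */
	sva = iommu_sva_bind_device(&pdev->dev, current->mm);
	if (IS_ERR(sva)) {
		iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA);
		return PTR_ERR(sva);
	}

	*handle = sva;
	*pasid = iommu_sva_get_pasid(sva);   /* program this PASID into the device */
	return 0;
}

static void example_disable_sva(struct pci_dev *pdev, struct iommu_sva *handle)
{
	iommu_sva_unbind_device(handle);
	iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA);
}

Unlike the removed code, PASID allocation, GCR3 table setup and the mmu_notifier lifetime are handled inside the IOMMU core and the AMD driver, so the caller only keeps the returned handle.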