diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-18 17:40:19 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-18 17:40:19 +0000 |
commit | 9f0fc191371843c4fc000a226b0a26b6c059aacd (patch) | |
tree | 35f8be3ef04506ac891ad001e8c41e535ae8d01d /drivers/iommu | |
parent | Releasing progress-linux version 6.6.15-2~progress7.99u1. (diff) | |
download | linux-9f0fc191371843c4fc000a226b0a26b6c059aacd.tar.xz linux-9f0fc191371843c4fc000a226b0a26b6c059aacd.zip |
Merging upstream version 6.7.7.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/iommu')
55 files changed, 4514 insertions, 3036 deletions
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 2b12b583ef..7673bb8294 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -7,6 +7,10 @@ config IOMMU_IOVA config IOMMU_API bool +config IOMMUFD_DRIVER + bool + default n + menuconfig IOMMU_SUPPORT bool "IOMMU Hardware Support" depends on MMU @@ -91,7 +95,7 @@ config IOMMU_DEBUGFS choice prompt "IOMMU default domain type" depends on IOMMU_API - default IOMMU_DEFAULT_DMA_LAZY if X86 || IA64 + default IOMMU_DEFAULT_DMA_LAZY if X86 || S390 default IOMMU_DEFAULT_DMA_STRICT help Choose the type of IOMMU domain used to manage DMA API usage by @@ -146,7 +150,7 @@ config OF_IOMMU # IOMMU-agnostic DMA-mapping layer config IOMMU_DMA - def_bool ARM64 || IA64 || X86 + def_bool ARM64 || X86 || S390 select DMA_OPS select IOMMU_API select IOMMU_IOVA @@ -236,17 +240,6 @@ config SUN50I_IOMMU help Support for the IOMMU introduced in the Allwinner H6 SoCs. -config TEGRA_IOMMU_GART - bool "Tegra GART IOMMU Support" - depends on ARCH_TEGRA_2x_SOC - depends on TEGRA_MC - select IOMMU_API - help - Enables support for remapping discontiguous physical memory - shared with the operating system into contiguous I/O virtual - space through the GART (Graphics Address Relocation Table) - hardware included on Tegra SoCs. - config TEGRA_IOMMU_SMMU bool "NVIDIA Tegra SMMU Support" depends on ARCH_TEGRA diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 769e43d780..95ad9dbfbd 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -20,7 +20,6 @@ obj-$(CONFIG_OMAP_IOMMU) += omap-iommu.o obj-$(CONFIG_OMAP_IOMMU_DEBUG) += omap-iommu-debug.o obj-$(CONFIG_ROCKCHIP_IOMMU) += rockchip-iommu.o obj-$(CONFIG_SUN50I_IOMMU) += sun50i-iommu.o -obj-$(CONFIG_TEGRA_IOMMU_GART) += tegra-gart.o obj-$(CONFIG_TEGRA_IOMMU_SMMU) += tegra-smmu.o obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig index 9b5fc3356b..443b2c13c3 100644 --- a/drivers/iommu/amd/Kconfig +++ b/drivers/iommu/amd/Kconfig @@ -10,6 +10,7 @@ config AMD_IOMMU select IOMMU_API select IOMMU_IOVA select IOMMU_IO_PGTABLE + select IOMMUFD_DRIVER if IOMMUFD depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE help With this option you can enable support for AMD IOMMU hardware in @@ -22,15 +23,6 @@ config AMD_IOMMU your BIOS for an option to enable it or if you have an IVRS ACPI table. -config AMD_IOMMU_V2 - tristate "AMD IOMMU Version 2 driver" - depends on AMD_IOMMU - select MMU_NOTIFIER - help - This option enables support for the AMD IOMMUv2 features of the IOMMU - hardware. Select this option if you want to use devices that support - the PCI PRI and PASID interface. - config AMD_IOMMU_DEBUGFS bool "Enable AMD IOMMU internals in DebugFS" depends on AMD_IOMMU && IOMMU_DEBUGFS diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile index 773d8aa002..f454fbb156 100644 --- a/drivers/iommu/amd/Makefile +++ b/drivers/iommu/amd/Makefile @@ -1,4 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_AMD_IOMMU) += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o -obj-$(CONFIG_AMD_IOMMU_V2) += iommu_v2.o diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index e2857109e9..86be1edd50 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -38,9 +38,6 @@ extern int amd_iommu_guest_ir; extern enum io_pgtable_fmt amd_iommu_pgtable; extern int amd_iommu_gpt_level; -/* IOMMUv2 specific functions */ -struct iommu_domain; - bool amd_iommu_v2_supported(void); struct amd_iommu *get_amd_iommu(unsigned int idx); u8 amd_iommu_pc_get_max_banks(unsigned int idx); @@ -51,10 +48,10 @@ int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64 *value); -int amd_iommu_register_ppr_notifier(struct notifier_block *nb); -int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb); -void amd_iommu_domain_direct_map(struct iommu_domain *dom); -int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids); +/* Device capabilities */ +int amd_iommu_pdev_enable_cap_pri(struct pci_dev *pdev); +void amd_iommu_pdev_disable_cap_pri(struct pci_dev *pdev); + int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, u64 address); void amd_iommu_update_and_flush_device_table(struct protection_domain *domain); void amd_iommu_domain_update(struct protection_domain *domain); @@ -87,9 +84,25 @@ static inline bool is_rd890_iommu(struct pci_dev *pdev) (pdev->device == PCI_DEVICE_ID_RD890_IOMMU); } -static inline bool iommu_feature(struct amd_iommu *iommu, u64 mask) +static inline bool check_feature(u64 mask) +{ + return (amd_iommu_efr & mask); +} + +static inline bool check_feature2(u64 mask) +{ + return (amd_iommu_efr2 & mask); +} + +static inline int check_feature_gpt_level(void) +{ + return ((amd_iommu_efr >> FEATURE_GATS_SHIFT) & FEATURE_GATS_MASK); +} + +static inline bool amd_iommu_gt_ppr_supported(void) { - return !!(iommu->features & mask); + return (check_feature(FEATURE_GT) && + check_feature(FEATURE_PPR)); } static inline u64 iommu_virt_to_phys(void *vaddr) @@ -105,7 +118,6 @@ static inline void *iommu_phys_to_virt(unsigned long paddr) static inline void amd_iommu_domain_set_pt_root(struct protection_domain *domain, u64 root) { - atomic64_set(&domain->iop.pt_root, root); domain->iop.root = (u64 *)(root & PAGE_MASK); domain->iop.mode = root & 7; /* lowest 3 bits encode pgtable mode */ } @@ -146,8 +158,5 @@ void amd_iommu_domain_set_pgtable(struct protection_domain *domain, u64 *root, int mode); struct dev_table_entry *get_dev_table(struct amd_iommu *iommu); -extern u64 amd_iommu_efr; -extern u64 amd_iommu_efr2; - extern bool amd_iommu_snp_en; #endif diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 7dc30c2b56..90b7d7950a 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -97,7 +97,9 @@ #define FEATURE_GATS_MASK (3ULL) #define FEATURE_GAM_VAPIC BIT_ULL(21) #define FEATURE_GIOSUP BIT_ULL(48) +#define FEATURE_HASUP BIT_ULL(49) #define FEATURE_EPHSUP BIT_ULL(50) +#define FEATURE_HDSUP BIT_ULL(52) #define FEATURE_SNP BIT_ULL(63) #define FEATURE_PASID_SHIFT 32 @@ -212,6 +214,7 @@ /* macros and definitions for device table entries */ #define DEV_ENTRY_VALID 0x00 #define DEV_ENTRY_TRANSLATION 0x01 +#define DEV_ENTRY_HAD 0x07 #define DEV_ENTRY_PPR 0x34 #define DEV_ENTRY_IR 0x3d #define DEV_ENTRY_IW 0x3e @@ -371,9 +374,15 @@ (1ULL << (12 + (9 * (level)))) /* + * The IOPTE dirty bit + */ +#define IOMMU_PTE_HD_BIT (6) + +/* * Bit value definition for I/O PTE fields */ #define IOMMU_PTE_PR BIT_ULL(0) +#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT) #define IOMMU_PTE_U BIT_ULL(59) #define IOMMU_PTE_FC BIT_ULL(60) #define IOMMU_PTE_IR BIT_ULL(61) @@ -384,6 +393,7 @@ */ #define DTE_FLAG_V BIT_ULL(0) #define DTE_FLAG_TV BIT_ULL(1) +#define DTE_FLAG_HAD (3ULL << 7) #define DTE_FLAG_GIOV BIT_ULL(54) #define DTE_FLAG_GV BIT_ULL(55) #define DTE_GLX_SHIFT (56) @@ -413,6 +423,7 @@ #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR) +#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD) #define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK)) #define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) @@ -451,6 +462,10 @@ #define PD_IOMMUV2_MASK BIT(3) /* domain has gcr3 table */ #define PD_GIOV_MASK BIT(4) /* domain enable GIOV support */ +/* Timeout stuff */ +#define LOOP_TIMEOUT 100000 +#define MMIO_STATUS_TIMEOUT 2000000 + extern bool amd_iommu_dump; #define DUMP_printk(format, arg...) \ do { \ @@ -505,19 +520,6 @@ extern struct kmem_cache *amd_iommu_irq_cache; #define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT) #define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL) -/* - * This struct is used to pass information about - * incoming PPR faults around. - */ -struct amd_iommu_fault { - u64 address; /* IO virtual address of the fault*/ - u32 pasid; /* Address space identifier */ - u32 sbdf; /* Originating PCI device id */ - u16 tag; /* PPR tag */ - u16 flags; /* Fault flags */ - -}; - struct amd_iommu; struct iommu_domain; @@ -544,7 +546,6 @@ struct amd_io_pgtable { struct io_pgtable iop; int mode; u64 *root; - atomic64_t pt_root; /* pgtable root and pgtable mode */ u64 *pgd; /* v2 pgtable pgd pointer */ }; @@ -563,6 +564,7 @@ struct protection_domain { int nid; /* Node ID */ u64 *gcr3_tbl; /* Guest CR3 table */ unsigned long flags; /* flags to find out type of domain */ + bool dirty_tracking; /* dirty tracking is enabled in the domain */ unsigned dev_cnt; /* devices assigned to this domain */ unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ }; @@ -676,9 +678,6 @@ struct amd_iommu { /* Extended features 2 */ u64 features2; - /* IOMMUv2 */ - bool is_iommu_v2; - /* PCI device id of the IOMMU device */ u16 devid; @@ -799,6 +798,14 @@ struct devid_map { bool cmd_line; }; +#define AMD_IOMMU_DEVICE_FLAG_ATS_SUP 0x1 /* ATS feature supported */ +#define AMD_IOMMU_DEVICE_FLAG_PRI_SUP 0x2 /* PRI feature supported */ +#define AMD_IOMMU_DEVICE_FLAG_PASID_SUP 0x4 /* PASID context supported */ +/* Device may request execution on memory pages */ +#define AMD_IOMMU_DEVICE_FLAG_EXEC_SUP 0x8 +/* Device may request super-user privileges */ +#define AMD_IOMMU_DEVICE_FLAG_PRIV_SUP 0x10 + /* * This struct contains device specific data for the IOMMU */ @@ -811,13 +818,15 @@ struct iommu_dev_data { struct protection_domain *domain; /* Domain the device is bound to */ struct device *dev; u16 devid; /* PCI Device ID */ - bool iommu_v2; /* Device can make use of IOMMUv2 */ - struct { - bool enabled; - int qdep; - } ats; /* ATS state */ - bool pri_tlp; /* PASID TLB required for + + u32 flags; /* Holds AMD_IOMMU_DEVICE_FLAG_<*> */ + int ats_qdep; + u8 ats_enabled :1; /* ATS state */ + u8 pri_enabled :1; /* PRI state */ + u8 pasid_enabled:1; /* PASID state */ + u8 pri_tlp :1; /* PASID TLB required for PPR completions */ + u8 ppr :1; /* Enable device PPR support */ bool use_vapic; /* Enable device to use vapic mode */ bool defer_attach; @@ -884,16 +893,15 @@ extern unsigned amd_iommu_aperture_order; /* allocation bitmap for domain ids */ extern unsigned long *amd_iommu_pd_alloc_bitmap; -/* Smallest max PASID supported by any IOMMU in the system */ -extern u32 amd_iommu_max_pasid; - -extern bool amd_iommu_v2_present; - extern bool amd_iommu_force_isolation; /* Max levels of glxval supported */ extern int amd_iommu_max_glx_val; +/* Global EFR and EFR2 registers */ +extern u64 amd_iommu_efr; +extern u64 amd_iommu_efr2; + /* * This function flushes all internal caches of * the IOMMU used by this driver. diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 45efb7e5d7..64bcf3df37 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -83,8 +83,6 @@ #define ACPI_DEVFLAG_LINT1 0x80 #define ACPI_DEVFLAG_ATSDIS 0x10000000 -#define LOOP_TIMEOUT 2000000 - #define IVRS_GET_SBDF_ID(seg, bus, dev, fn) (((seg & 0xffff) << 16) | ((bus & 0xff) << 8) \ | ((dev & 0x1f) << 3) | (fn & 0x7)) @@ -187,9 +185,6 @@ static int amd_iommus_present; bool amd_iommu_np_cache __read_mostly; bool amd_iommu_iotlb_sup __read_mostly = true; -u32 amd_iommu_max_pasid __read_mostly = ~0; - -bool amd_iommu_v2_present __read_mostly; static bool amd_iommu_pc_present __read_mostly; bool amdr_ivrs_remap_support __read_mostly; @@ -272,7 +267,7 @@ int amd_iommu_get_num_iommus(void) * Iterate through all the IOMMUs to get common EFR * masks among all IOMMUs and warn if found inconsistency. */ -static void get_global_efr(void) +static __init void get_global_efr(void) { struct amd_iommu *iommu; @@ -304,16 +299,6 @@ static void get_global_efr(void) pr_info("Using global IVHD EFR:%#llx, EFR2:%#llx\n", amd_iommu_efr, amd_iommu_efr2); } -static bool check_feature_on_all_iommus(u64 mask) -{ - return !!(amd_iommu_efr & mask); -} - -static inline int check_feature_gpt_level(void) -{ - return ((amd_iommu_efr >> FEATURE_GATS_SHIFT) & FEATURE_GATS_MASK); -} - /* * For IVHD type 0x11/0x40, EFR is also available via IVHD. * Default to IVHD EFR since it is available sooner @@ -399,7 +384,7 @@ static void iommu_set_cwwb_range(struct amd_iommu *iommu) u64 start = iommu_virt_to_phys((void *)iommu->cmd_sem); u64 entry = start & PM_ADDR_MASK; - if (!check_feature_on_all_iommus(FEATURE_SNP)) + if (!check_feature(FEATURE_SNP)) return; /* Note: @@ -869,7 +854,7 @@ static void *__init iommu_alloc_4k_pages(struct amd_iommu *iommu, void *buf = (void *)__get_free_pages(gfp, order); if (buf && - check_feature_on_all_iommus(FEATURE_SNP) && + check_feature(FEATURE_SNP) && set_memory_4k((unsigned long)buf, (1 << order))) { free_pages((unsigned long)buf, order); buf = NULL; @@ -985,14 +970,14 @@ static int iommu_ga_log_enable(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_GAINT_EN); iommu_feature_enable(iommu, CONTROL_GALOG_EN); - for (i = 0; i < LOOP_TIMEOUT; ++i) { + for (i = 0; i < MMIO_STATUS_TIMEOUT; ++i) { status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); if (status & (MMIO_STATUS_GALOG_RUN_MASK)) break; udelay(10); } - if (WARN_ON(i >= LOOP_TIMEOUT)) + if (WARN_ON(i >= MMIO_STATUS_TIMEOUT)) return -EINVAL; return 0; @@ -1048,7 +1033,7 @@ static void iommu_enable_xt(struct amd_iommu *iommu) static void iommu_enable_gt(struct amd_iommu *iommu) { - if (!iommu_feature(iommu, FEATURE_GT)) + if (!check_feature(FEATURE_GT)) return; iommu_feature_enable(iommu, CONTROL_GT_EN); @@ -1987,7 +1972,7 @@ static void init_iommu_perf_ctr(struct amd_iommu *iommu) u64 val; struct pci_dev *pdev = iommu->dev; - if (!iommu_feature(iommu, FEATURE_PC)) + if (!check_feature(FEATURE_PC)) return; amd_iommu_pc_present = true; @@ -2014,8 +1999,7 @@ static ssize_t amd_iommu_show_features(struct device *dev, struct device_attribute *attr, char *buf) { - struct amd_iommu *iommu = dev_to_amd_iommu(dev); - return sysfs_emit(buf, "%llx:%llx\n", iommu->features2, iommu->features); + return sysfs_emit(buf, "%llx:%llx\n", amd_iommu_efr, amd_iommu_efr2); } static DEVICE_ATTR(features, S_IRUGO, amd_iommu_show_features, NULL); @@ -2051,9 +2035,9 @@ static void __init late_iommu_features_init(struct amd_iommu *iommu) features = readq(iommu->mmio_base + MMIO_EXT_FEATURES); features2 = readq(iommu->mmio_base + MMIO_EXT_FEATURES2); - if (!iommu->features) { - iommu->features = features; - iommu->features2 = features2; + if (!amd_iommu_efr) { + amd_iommu_efr = features; + amd_iommu_efr2 = features2; return; } @@ -2061,12 +2045,12 @@ static void __init late_iommu_features_init(struct amd_iommu *iommu) * Sanity check and warn if EFR values from * IVHD and MMIO conflict. */ - if (features != iommu->features || - features2 != iommu->features2) { + if (features != amd_iommu_efr || + features2 != amd_iommu_efr2) { pr_warn(FW_WARN "EFR mismatch. Use IVHD EFR (%#llx : %#llx), EFR2 (%#llx : %#llx).\n", - features, iommu->features, - features2, iommu->features2); + features, amd_iommu_efr, + features2, amd_iommu_efr2); } } @@ -2092,20 +2076,17 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) late_iommu_features_init(iommu); - if (iommu_feature(iommu, FEATURE_GT)) { + if (check_feature(FEATURE_GT)) { int glxval; - u32 max_pasid; u64 pasmax; - pasmax = iommu->features & FEATURE_PASID_MASK; + pasmax = amd_iommu_efr & FEATURE_PASID_MASK; pasmax >>= FEATURE_PASID_SHIFT; - max_pasid = (1 << (pasmax + 1)) - 1; + iommu->iommu.max_pasids = (1 << (pasmax + 1)) - 1; - amd_iommu_max_pasid = min(amd_iommu_max_pasid, max_pasid); + BUG_ON(iommu->iommu.max_pasids & ~PASID_MASK); - BUG_ON(amd_iommu_max_pasid & ~PASID_MASK); - - glxval = iommu->features & FEATURE_GLXVAL_MASK; + glxval = amd_iommu_efr & FEATURE_GLXVAL_MASK; glxval >>= FEATURE_GLXVAL_SHIFT; if (amd_iommu_max_glx_val == -1) @@ -2114,13 +2095,7 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) amd_iommu_max_glx_val = min(amd_iommu_max_glx_val, glxval); } - if (iommu_feature(iommu, FEATURE_GT) && - iommu_feature(iommu, FEATURE_PPR)) { - iommu->is_iommu_v2 = true; - amd_iommu_v2_present = true; - } - - if (iommu_feature(iommu, FEATURE_PPR) && alloc_ppr_log(iommu)) + if (check_feature(FEATURE_PPR) && alloc_ppr_log(iommu)) return -ENOMEM; if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) { @@ -2132,13 +2107,10 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) init_iommu_perf_ctr(iommu); if (amd_iommu_pgtable == AMD_IOMMU_V2) { - if (!iommu_feature(iommu, FEATURE_GIOSUP) || - !iommu_feature(iommu, FEATURE_GT)) { + if (!check_feature(FEATURE_GIOSUP) || + !check_feature(FEATURE_GT)) { pr_warn("Cannot enable v2 page table for DMA-API. Fallback to v1.\n"); amd_iommu_pgtable = AMD_IOMMU_V1; - } else if (iommu_default_passthrough()) { - pr_warn("V2 page table doesn't support passthrough mode. Fallback to v1.\n"); - amd_iommu_pgtable = AMD_IOMMU_V1; } } @@ -2186,35 +2158,29 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) static void print_iommu_info(void) { + int i; static const char * const feat_str[] = { "PreF", "PPR", "X2APIC", "NX", "GT", "[5]", "IA", "GA", "HE", "PC" }; - struct amd_iommu *iommu; - - for_each_iommu(iommu) { - struct pci_dev *pdev = iommu->dev; - int i; - pci_info(pdev, "Found IOMMU cap 0x%x\n", iommu->cap_ptr); + if (amd_iommu_efr) { + pr_info("Extended features (%#llx, %#llx):", amd_iommu_efr, amd_iommu_efr2); - if (iommu->cap & (1 << IOMMU_CAP_EFR)) { - pr_info("Extended features (%#llx, %#llx):", iommu->features, iommu->features2); - - for (i = 0; i < ARRAY_SIZE(feat_str); ++i) { - if (iommu_feature(iommu, (1ULL << i))) - pr_cont(" %s", feat_str[i]); - } + for (i = 0; i < ARRAY_SIZE(feat_str); ++i) { + if (check_feature(1ULL << i)) + pr_cont(" %s", feat_str[i]); + } - if (iommu->features & FEATURE_GAM_VAPIC) - pr_cont(" GA_vAPIC"); + if (check_feature(FEATURE_GAM_VAPIC)) + pr_cont(" GA_vAPIC"); - if (iommu->features & FEATURE_SNP) - pr_cont(" SNP"); + if (check_feature(FEATURE_SNP)) + pr_cont(" SNP"); - pr_cont("\n"); - } + pr_cont("\n"); } + if (irq_remapping_enabled) { pr_info("Interrupt remapping enabled\n"); if (amd_iommu_xt_mode == IRQ_REMAP_X2APIC_MODE) @@ -2900,19 +2866,19 @@ static void enable_iommus_vapic(void) * Need to set and poll check the GALOGRun bit to zero before * we can set/ modify GA Log registers safely. */ - for (i = 0; i < LOOP_TIMEOUT; ++i) { + for (i = 0; i < MMIO_STATUS_TIMEOUT; ++i) { status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); if (!(status & MMIO_STATUS_GALOG_RUN_MASK)) break; udelay(10); } - if (WARN_ON(i >= LOOP_TIMEOUT)) + if (WARN_ON(i >= MMIO_STATUS_TIMEOUT)) return; } if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) && - !check_feature_on_all_iommus(FEATURE_GAM_VAPIC)) { + !check_feature(FEATURE_GAM_VAPIC)) { amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY_GA; return; } @@ -3698,9 +3664,8 @@ bool amd_iommu_v2_supported(void) * (i.e. EFR[SNPSup]=1), IOMMUv2 page table cannot be used without * setting up IOMMUv1 page table. */ - return amd_iommu_v2_present && !amd_iommu_snp_en; + return amd_iommu_gt_ppr_supported() && !amd_iommu_snp_en; } -EXPORT_SYMBOL(amd_iommu_v2_supported); struct amd_iommu *get_amd_iommu(unsigned int idx) { @@ -3824,7 +3789,7 @@ int amd_iommu_snp_enable(void) return -EINVAL; } - amd_iommu_snp_en = check_feature_on_all_iommus(FEATURE_SNP); + amd_iommu_snp_en = check_feature(FEATURE_SNP); if (!amd_iommu_snp_en) return -EINVAL; diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index 2892aa1b4d..6c0621f6f5 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -486,6 +486,73 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo return (__pte & ~offset_mask) | (iova & offset_mask); } +static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size, + unsigned long flags) +{ + bool test_only = flags & IOMMU_DIRTY_NO_CLEAR; + bool dirty = false; + int i, count; + + /* + * 2.2.3.2 Host Dirty Support + * When a non-default page size is used , software must OR the + * Dirty bits in all of the replicated host PTEs used to map + * the page. The IOMMU does not guarantee the Dirty bits are + * set in all of the replicated PTEs. Any portion of the page + * may have been written even if the Dirty bit is set in only + * one of the replicated PTEs. + */ + count = PAGE_SIZE_PTE_COUNT(size); + for (i = 0; i < count && test_only; i++) { + if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) { + dirty = true; + break; + } + } + + for (i = 0; i < count && !test_only; i++) { + if (test_and_clear_bit(IOMMU_PTE_HD_BIT, + (unsigned long *)&ptep[i])) { + dirty = true; + } + } + + return dirty; +} + +static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); + unsigned long end = iova + size - 1; + + do { + unsigned long pgsize = 0; + u64 *ptep, pte; + + ptep = fetch_pte(pgtable, iova, &pgsize); + if (ptep) + pte = READ_ONCE(*ptep); + if (!ptep || !IOMMU_PTE_PRESENT(pte)) { + pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0); + iova += pgsize; + continue; + } + + /* + * Mark the whole IOVA range as dirty even if only one of + * the replicated PTEs were marked dirty. + */ + if (pte_test_and_clear_dirty(ptep, pgsize, flags)) + iommu_dirty_bitmap_record(dirty, iova, pgsize); + iova += pgsize; + } while (iova < end); + + return 0; +} + /* * ---------------------------------------------------- */ @@ -527,6 +594,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo pgtable->iop.ops.map_pages = iommu_v1_map_pages; pgtable->iop.ops.unmap_pages = iommu_v1_unmap_pages; pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys; + pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty; return &pgtable->iop; } diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c index e9ef2e0a62..f818a7e254 100644 --- a/drivers/iommu/amd/io_pgtable_v2.c +++ b/drivers/iommu/amd/io_pgtable_v2.c @@ -363,10 +363,10 @@ static void v2_free_pgtable(struct io_pgtable *iop) if (!(pdom->flags & PD_IOMMUV2_MASK)) return; - /* - * Make changes visible to IOMMUs. No need to clear gcr3 entry - * as gcr3 table is already freed. - */ + /* Clear gcr3 entry */ + amd_iommu_domain_clear_gcr3(&pdom->domain, 0); + + /* Make changes visible to IOMMUs */ amd_iommu_domain_update(pdom); /* Free page table */ diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 95bd7c25ba..fcc987f5d4 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -37,6 +37,7 @@ #include <asm/iommu.h> #include <asm/gart.h> #include <asm/dma.h> +#include <uapi/linux/iommufd.h> #include "amd_iommu.h" #include "../dma-iommu.h" @@ -44,8 +45,6 @@ #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) -#define LOOP_TIMEOUT 100000 - /* IO virtual address start page frame number */ #define IOVA_START_PFN (1) #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) @@ -65,8 +64,8 @@ LIST_HEAD(hpet_map); LIST_HEAD(acpihid_map); const struct iommu_ops amd_iommu_ops; +const struct iommu_dirty_ops amd_dirty_ops; -static ATOMIC_NOTIFIER_HEAD(ppr_notifier); int amd_iommu_max_glx_val = -1; /* @@ -79,7 +78,6 @@ struct iommu_cmd { struct kmem_cache *amd_iommu_irq_cache; static void detach_device(struct device *dev); -static int domain_enable_v2(struct protection_domain *domain, int pasids); /**************************************************************************** * @@ -322,24 +320,141 @@ static struct iommu_group *acpihid_device_group(struct device *dev) return entry->group; } -static bool pci_iommuv2_capable(struct pci_dev *pdev) +static inline bool pdev_pasid_supported(struct iommu_dev_data *dev_data) { - static const int caps[] = { - PCI_EXT_CAP_ID_PRI, - PCI_EXT_CAP_ID_PASID, - }; - int i, pos; + return (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP); +} - if (!pci_ats_supported(pdev)) - return false; +static u32 pdev_get_caps(struct pci_dev *pdev) +{ + int features; + u32 flags = 0; + + if (pci_ats_supported(pdev)) + flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; - for (i = 0; i < 2; ++i) { - pos = pci_find_ext_capability(pdev, caps[i]); - if (pos == 0) - return false; + if (pci_pri_supported(pdev)) + flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP; + + features = pci_pasid_features(pdev); + if (features >= 0) { + flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP; + + if (features & PCI_PASID_CAP_EXEC) + flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP; + + if (features & PCI_PASID_CAP_PRIV) + flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP; } - return true; + return flags; +} + +static inline int pdev_enable_cap_ats(struct pci_dev *pdev) +{ + struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); + int ret = -EINVAL; + + if (dev_data->ats_enabled) + return 0; + + if (amd_iommu_iotlb_sup && + (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP)) { + ret = pci_enable_ats(pdev, PAGE_SHIFT); + if (!ret) { + dev_data->ats_enabled = 1; + dev_data->ats_qdep = pci_ats_queue_depth(pdev); + } + } + + return ret; +} + +static inline void pdev_disable_cap_ats(struct pci_dev *pdev) +{ + struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); + + if (dev_data->ats_enabled) { + pci_disable_ats(pdev); + dev_data->ats_enabled = 0; + } +} + +int amd_iommu_pdev_enable_cap_pri(struct pci_dev *pdev) +{ + struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); + int ret = -EINVAL; + + if (dev_data->pri_enabled) + return 0; + + if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) { + /* + * First reset the PRI state of the device. + * FIXME: Hardcode number of outstanding requests for now + */ + if (!pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) { + dev_data->pri_enabled = 1; + dev_data->pri_tlp = pci_prg_resp_pasid_required(pdev); + + ret = 0; + } + } + + return ret; +} + +void amd_iommu_pdev_disable_cap_pri(struct pci_dev *pdev) +{ + struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); + + if (dev_data->pri_enabled) { + pci_disable_pri(pdev); + dev_data->pri_enabled = 0; + } +} + +static inline int pdev_enable_cap_pasid(struct pci_dev *pdev) +{ + struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); + int ret = -EINVAL; + + if (dev_data->pasid_enabled) + return 0; + + if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) { + /* Only allow access to user-accessible pages */ + ret = pci_enable_pasid(pdev, 0); + if (!ret) + dev_data->pasid_enabled = 1; + } + + return ret; +} + +static inline void pdev_disable_cap_pasid(struct pci_dev *pdev) +{ + struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); + + if (dev_data->pasid_enabled) { + pci_disable_pasid(pdev); + dev_data->pasid_enabled = 0; + } +} + +static void pdev_enable_caps(struct pci_dev *pdev) +{ + pdev_enable_cap_ats(pdev); + pdev_enable_cap_pasid(pdev); + amd_iommu_pdev_enable_cap_pri(pdev); + +} + +static void pdev_disable_caps(struct pci_dev *pdev) +{ + pdev_disable_cap_ats(pdev); + pdev_disable_cap_pasid(pdev); + amd_iommu_pdev_disable_cap_pri(pdev); } /* @@ -399,8 +514,8 @@ static int iommu_init_device(struct amd_iommu *iommu, struct device *dev) * it'll be forced to go into translation mode. */ if ((iommu_default_passthrough() || !amd_iommu_force_isolation) && - dev_is_pci(dev) && pci_iommuv2_capable(to_pci_dev(dev))) { - dev_data->iommu_v2 = iommu->is_iommu_v2; + dev_is_pci(dev) && amd_iommu_gt_ppr_supported()) { + dev_data->flags = pdev_get_caps(to_pci_dev(dev)); } dev_iommu_priv_set(dev, dev_data); @@ -701,24 +816,6 @@ static void iommu_poll_events(struct amd_iommu *iommu) writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); } -static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw) -{ - struct amd_iommu_fault fault; - - if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) { - pr_err_ratelimited("Unknown PPR request received\n"); - return; - } - - fault.address = raw[1]; - fault.pasid = PPR_PASID(raw[0]); - fault.sbdf = PCI_SEG_DEVID_TO_SBDF(iommu->pci_seg->id, PPR_DEVID(raw[0])); - fault.tag = PPR_TAG(raw[0]); - fault.flags = PPR_FLAGS(raw[0]); - - atomic_notifier_call_chain(&ppr_notifier, 0, &fault); -} - static void iommu_poll_ppr_log(struct amd_iommu *iommu) { u32 head, tail; @@ -764,8 +861,7 @@ static void iommu_poll_ppr_log(struct amd_iommu *iommu) head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE; writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); - /* Handle PPR entry */ - iommu_handle_ppr_entry(iommu, entry); + /* TODO: PPR Handler will be added when we add IOPF support */ /* Refresh ring-buffer information */ head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); @@ -1094,7 +1190,7 @@ static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, u32 pasid, } static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid, - int status, int tag, bool gn) + int status, int tag, u8 gn) { memset(cmd, 0, sizeof(*cmd)); @@ -1298,7 +1394,7 @@ static void amd_iommu_flush_irt_all(struct amd_iommu *iommu) void iommu_flush_all_caches(struct amd_iommu *iommu) { - if (iommu_feature(iommu, FEATURE_IA)) { + if (check_feature(FEATURE_IA)) { amd_iommu_flush_all(iommu); } else { amd_iommu_flush_dte_all(iommu); @@ -1317,7 +1413,7 @@ static int device_flush_iotlb(struct iommu_dev_data *dev_data, struct iommu_cmd cmd; int qdep; - qdep = dev_data->ats.qdep; + qdep = dev_data->ats_qdep; iommu = rlookup_amd_iommu(dev_data->dev); if (!iommu) return -EINVAL; @@ -1368,7 +1464,7 @@ static int device_flush_dte(struct iommu_dev_data *dev_data) return ret; } - if (dev_data->ats.enabled) + if (dev_data->ats_enabled) ret = device_flush_iotlb(dev_data, 0, ~0UL); return ret; @@ -1401,7 +1497,7 @@ static void __domain_flush_pages(struct protection_domain *domain, list_for_each_entry(dev_data, &domain->dev_list, list) { - if (!dev_data->ats.enabled) + if (!dev_data->ats_enabled) continue; ret |= device_flush_iotlb(dev_data, address, size); @@ -1577,6 +1673,42 @@ static void free_gcr3_table(struct protection_domain *domain) free_page((unsigned long)domain->gcr3_tbl); } +/* + * Number of GCR3 table levels required. Level must be 4-Kbyte + * page and can contain up to 512 entries. + */ +static int get_gcr3_levels(int pasids) +{ + int levels; + + if (pasids == -1) + return amd_iommu_max_glx_val; + + levels = get_count_order(pasids); + + return levels ? (DIV_ROUND_UP(levels, 9) - 1) : levels; +} + +/* Note: This function expects iommu_domain->lock to be held prior calling the function. */ +static int setup_gcr3_table(struct protection_domain *domain, int pasids) +{ + int levels = get_gcr3_levels(pasids); + + if (levels > amd_iommu_max_glx_val) + return -EINVAL; + + domain->gcr3_tbl = alloc_pgtable_page(domain->nid, GFP_ATOMIC); + if (domain->gcr3_tbl == NULL) + return -ENOMEM; + + domain->glx = levels; + domain->flags |= PD_IOMMUV2_MASK; + + amd_iommu_domain_update(domain); + + return 0; +} + static void set_dte_entry(struct amd_iommu *iommu, u16 devid, struct protection_domain *domain, bool ats, bool ppr) { @@ -1605,10 +1737,11 @@ static void set_dte_entry(struct amd_iommu *iommu, u16 devid, if (ats) flags |= DTE_FLAG_IOTLB; - if (ppr) { - if (iommu_feature(iommu, FEATURE_EPHSUP)) - pte_root |= 1ULL << DEV_ENTRY_PPR; - } + if (ppr) + pte_root |= 1ULL << DEV_ENTRY_PPR; + + if (domain->dirty_tracking) + pte_root |= DTE_FLAG_HAD; if (domain->flags & PD_IOMMUV2_MASK) { u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl); @@ -1685,7 +1818,7 @@ static void do_attach(struct iommu_dev_data *dev_data, iommu = rlookup_amd_iommu(dev_data->dev); if (!iommu) return; - ats = dev_data->ats.enabled; + ats = dev_data->ats_enabled; /* Update data structures */ dev_data->domain = domain; @@ -1701,7 +1834,7 @@ static void do_attach(struct iommu_dev_data *dev_data, /* Update device table */ set_dte_entry(iommu, dev_data->devid, domain, - ats, dev_data->iommu_v2); + ats, dev_data->ppr); clone_aliases(iommu, dev_data->dev); device_flush_dte(dev_data); @@ -1736,48 +1869,6 @@ static void do_detach(struct iommu_dev_data *dev_data) domain->dev_cnt -= 1; } -static void pdev_iommuv2_disable(struct pci_dev *pdev) -{ - pci_disable_ats(pdev); - pci_disable_pri(pdev); - pci_disable_pasid(pdev); -} - -static int pdev_pri_ats_enable(struct pci_dev *pdev) -{ - int ret; - - /* Only allow access to user-accessible pages */ - ret = pci_enable_pasid(pdev, 0); - if (ret) - return ret; - - /* First reset the PRI state of the device */ - ret = pci_reset_pri(pdev); - if (ret) - goto out_err_pasid; - - /* Enable PRI */ - /* FIXME: Hardcode number of outstanding requests for now */ - ret = pci_enable_pri(pdev, 32); - if (ret) - goto out_err_pasid; - - ret = pci_enable_ats(pdev, PAGE_SHIFT); - if (ret) - goto out_err_pri; - - return 0; - -out_err_pri: - pci_disable_pri(pdev); - -out_err_pasid: - pci_disable_pasid(pdev); - - return ret; -} - /* * If a device is not yet associated with a domain, this function makes the * device visible in the domain @@ -1786,9 +1877,8 @@ static int attach_device(struct device *dev, struct protection_domain *domain) { struct iommu_dev_data *dev_data; - struct pci_dev *pdev; unsigned long flags; - int ret; + int ret = 0; spin_lock_irqsave(&domain->lock, flags); @@ -1796,45 +1886,13 @@ static int attach_device(struct device *dev, spin_lock(&dev_data->lock); - ret = -EBUSY; - if (dev_data->domain != NULL) + if (dev_data->domain != NULL) { + ret = -EBUSY; goto out; - - if (!dev_is_pci(dev)) - goto skip_ats_check; - - pdev = to_pci_dev(dev); - if (domain->flags & PD_IOMMUV2_MASK) { - struct iommu_domain *def_domain = iommu_get_dma_domain(dev); - - ret = -EINVAL; - - /* - * In case of using AMD_IOMMU_V1 page table mode and the device - * is enabling for PPR/ATS support (using v2 table), - * we need to make sure that the domain type is identity map. - */ - if ((amd_iommu_pgtable == AMD_IOMMU_V1) && - def_domain->type != IOMMU_DOMAIN_IDENTITY) { - goto out; - } - - if (dev_data->iommu_v2) { - if (pdev_pri_ats_enable(pdev) != 0) - goto out; - - dev_data->ats.enabled = true; - dev_data->ats.qdep = pci_ats_queue_depth(pdev); - dev_data->pri_tlp = pci_prg_resp_pasid_required(pdev); - } - } else if (amd_iommu_iotlb_sup && - pci_enable_ats(pdev, PAGE_SHIFT) == 0) { - dev_data->ats.enabled = true; - dev_data->ats.qdep = pci_ats_queue_depth(pdev); } -skip_ats_check: - ret = 0; + if (dev_is_pci(dev)) + pdev_enable_caps(to_pci_dev(dev)); do_attach(dev_data, domain); @@ -1882,15 +1940,8 @@ static void detach_device(struct device *dev) do_detach(dev_data); - if (!dev_is_pci(dev)) - goto out; - - if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2) - pdev_iommuv2_disable(to_pci_dev(dev)); - else if (dev_data->ats.enabled) - pci_disable_ats(to_pci_dev(dev)); - - dev_data->ats.enabled = false; + if (dev_is_pci(dev)) + pdev_disable_caps(to_pci_dev(dev)); out: spin_unlock(&dev_data->lock); @@ -1980,7 +2031,7 @@ static void update_device_table(struct protection_domain *domain) if (!iommu) continue; set_dte_entry(iommu, dev_data->devid, domain, - dev_data->ats.enabled, dev_data->iommu_v2); + dev_data->ats_enabled, dev_data->ppr); clone_aliases(iommu, dev_data->dev); } } @@ -2014,9 +2065,11 @@ void amd_iommu_domain_update(struct protection_domain *domain) static void cleanup_domain(struct protection_domain *domain) { struct iommu_dev_data *entry; - unsigned long flags; - spin_lock_irqsave(&domain->lock, flags); + lockdep_assert_held(&domain->lock); + + if (!domain->dev_cnt) + return; while (!list_empty(&domain->dev_list)) { entry = list_first_entry(&domain->dev_list, @@ -2024,8 +2077,7 @@ static void cleanup_domain(struct protection_domain *domain) BUG_ON(!entry->domain); do_detach(entry); } - - spin_unlock_irqrestore(&domain->lock, flags); + WARN_ON(domain->dev_cnt != 0); } static void protection_domain_free(struct protection_domain *domain) @@ -2036,6 +2088,12 @@ static void protection_domain_free(struct protection_domain *domain) if (domain->iop.pgtbl_cfg.tlb) free_io_pgtable_ops(&domain->iop.iop.ops); + if (domain->flags & PD_IOMMUV2_MASK) + free_gcr3_table(domain); + + if (domain->iop.root) + free_page((unsigned long)domain->iop.root); + if (domain->id) domain_id_free(domain->id); @@ -2048,18 +2106,10 @@ static int protection_domain_init_v1(struct protection_domain *domain, int mode) BUG_ON(mode < PAGE_MODE_NONE || mode > PAGE_MODE_6_LEVEL); - spin_lock_init(&domain->lock); - domain->id = domain_id_alloc(); - if (!domain->id) - return -ENOMEM; - INIT_LIST_HEAD(&domain->dev_list); - if (mode != PAGE_MODE_NONE) { pt_root = (void *)get_zeroed_page(GFP_KERNEL); - if (!pt_root) { - domain_id_free(domain->id); + if (!pt_root) return -ENOMEM; - } } amd_iommu_domain_set_pgtable(domain, pt_root, mode); @@ -2069,20 +2119,12 @@ static int protection_domain_init_v1(struct protection_domain *domain, int mode) static int protection_domain_init_v2(struct protection_domain *domain) { - spin_lock_init(&domain->lock); - domain->id = domain_id_alloc(); - if (!domain->id) - return -ENOMEM; - INIT_LIST_HEAD(&domain->dev_list); - domain->flags |= PD_GIOV_MASK; domain->domain.pgsize_bitmap = AMD_IOMMU_PGSIZES_V2; - if (domain_enable_v2(domain, 1)) { - domain_id_free(domain->id); + if (setup_gcr3_table(domain, 1)) return -ENOMEM; - } return 0; } @@ -2092,57 +2134,60 @@ static struct protection_domain *protection_domain_alloc(unsigned int type) struct io_pgtable_ops *pgtbl_ops; struct protection_domain *domain; int pgtable; - int mode = DEFAULT_PGTABLE_LEVEL; int ret; + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return NULL; + + domain->id = domain_id_alloc(); + if (!domain->id) + goto out_err; + + spin_lock_init(&domain->lock); + INIT_LIST_HEAD(&domain->dev_list); + domain->nid = NUMA_NO_NODE; + + switch (type) { + /* No need to allocate io pgtable ops in passthrough mode */ + case IOMMU_DOMAIN_IDENTITY: + return domain; + case IOMMU_DOMAIN_DMA: + pgtable = amd_iommu_pgtable; + break; /* - * Force IOMMU v1 page table when iommu=pt and - * when allocating domain for pass-through devices. + * Force IOMMU v1 page table when allocating + * domain for pass-through devices. */ - if (type == IOMMU_DOMAIN_IDENTITY) { - pgtable = AMD_IOMMU_V1; - mode = PAGE_MODE_NONE; - } else if (type == IOMMU_DOMAIN_UNMANAGED) { + case IOMMU_DOMAIN_UNMANAGED: pgtable = AMD_IOMMU_V1; - } else if (type == IOMMU_DOMAIN_DMA || type == IOMMU_DOMAIN_DMA_FQ) { - pgtable = amd_iommu_pgtable; - } else { - return NULL; + break; + default: + goto out_err; } - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) - return NULL; - switch (pgtable) { case AMD_IOMMU_V1: - ret = protection_domain_init_v1(domain, mode); + ret = protection_domain_init_v1(domain, DEFAULT_PGTABLE_LEVEL); break; case AMD_IOMMU_V2: ret = protection_domain_init_v2(domain); break; default: ret = -EINVAL; + break; } if (ret) goto out_err; - /* No need to allocate io pgtable ops in passthrough mode */ - if (type == IOMMU_DOMAIN_IDENTITY) - return domain; - - domain->nid = NUMA_NO_NODE; - pgtbl_ops = alloc_io_pgtable_ops(pgtable, &domain->iop.pgtbl_cfg, domain); - if (!pgtbl_ops) { - domain_id_free(domain->id); + if (!pgtbl_ops) goto out_err; - } return domain; out_err: - kfree(domain); + protection_domain_free(domain); return NULL; } @@ -2155,44 +2200,94 @@ static inline u64 dma_max_address(void) return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1); } -static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) +static bool amd_iommu_hd_support(struct amd_iommu *iommu) +{ + return iommu && (iommu->features & FEATURE_HDSUP); +} + +static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, + struct device *dev, u32 flags) { + bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; struct protection_domain *domain; + struct amd_iommu *iommu = NULL; + + if (dev) { + iommu = rlookup_amd_iommu(dev); + if (!iommu) + return ERR_PTR(-ENODEV); + } /* * Since DTE[Mode]=0 is prohibited on SNP-enabled system, * default to use IOMMU_DOMAIN_DMA[_FQ]. */ if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY)) - return NULL; + return ERR_PTR(-EINVAL); + + if (dirty_tracking && !amd_iommu_hd_support(iommu)) + return ERR_PTR(-EOPNOTSUPP); domain = protection_domain_alloc(type); if (!domain) - return NULL; + return ERR_PTR(-ENOMEM); domain->domain.geometry.aperture_start = 0; domain->domain.geometry.aperture_end = dma_max_address(); domain->domain.geometry.force_aperture = true; + if (iommu) { + domain->domain.type = type; + domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap; + domain->domain.ops = iommu->iommu.ops->default_domain_ops; + + if (dirty_tracking) + domain->domain.dirty_ops = &amd_dirty_ops; + } + return &domain->domain; } -static void amd_iommu_domain_free(struct iommu_domain *dom) +static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type) { - struct protection_domain *domain; + struct iommu_domain *domain; - domain = to_pdomain(dom); + domain = do_iommu_domain_alloc(type, NULL, 0); + if (IS_ERR(domain)) + return NULL; + + return domain; +} + +static struct iommu_domain * +amd_iommu_domain_alloc_user(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) + +{ + unsigned int type = IOMMU_DOMAIN_UNMANAGED; - if (domain->dev_cnt > 0) - cleanup_domain(domain); + if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data) + return ERR_PTR(-EOPNOTSUPP); - BUG_ON(domain->dev_cnt != 0); + return do_iommu_domain_alloc(type, dev, flags); +} + +static void amd_iommu_domain_free(struct iommu_domain *dom) +{ + struct protection_domain *domain; + unsigned long flags; if (!dom) return; - if (domain->flags & PD_IOMMUV2_MASK) - free_gcr3_table(domain); + domain = to_pdomain(dom); + + spin_lock_irqsave(&domain->lock, flags); + + cleanup_domain(domain); + + spin_unlock_irqrestore(&domain->lock, flags); protection_domain_free(domain); } @@ -2214,6 +2309,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, dev_data->defer_attach = false; + /* + * Restrict to devices with compatible IOMMU hardware support + * when enforcement of dirty tracking is enabled. + */ + if (dom->dirty_ops && !amd_iommu_hd_support(iommu)) + return -EINVAL; + if (dev_data->domain) detach_device(dev); @@ -2233,14 +2335,15 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, return ret; } -static void amd_iommu_iotlb_sync_map(struct iommu_domain *dom, - unsigned long iova, size_t size) +static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, + unsigned long iova, size_t size) { struct protection_domain *domain = to_pdomain(dom); struct io_pgtable_ops *ops = &domain->iop.iop.ops; if (ops->map_pages) domain_flush_np_cache(domain, iova, size); + return 0; } static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova, @@ -2332,6 +2435,11 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) return true; case IOMMU_CAP_DEFERRED_FLUSH: return true; + case IOMMU_CAP_DIRTY_TRACKING: { + struct amd_iommu *iommu = rlookup_amd_iommu(dev); + + return amd_iommu_hd_support(iommu); + } default: break; } @@ -2339,6 +2447,73 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) return false; } +static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable) +{ + struct protection_domain *pdomain = to_pdomain(domain); + struct dev_table_entry *dev_table; + struct iommu_dev_data *dev_data; + bool domain_flush = false; + struct amd_iommu *iommu; + unsigned long flags; + u64 pte_root; + + spin_lock_irqsave(&pdomain->lock, flags); + if (!(pdomain->dirty_tracking ^ enable)) { + spin_unlock_irqrestore(&pdomain->lock, flags); + return 0; + } + + list_for_each_entry(dev_data, &pdomain->dev_list, list) { + iommu = rlookup_amd_iommu(dev_data->dev); + if (!iommu) + continue; + + dev_table = get_dev_table(iommu); + pte_root = dev_table[dev_data->devid].data[0]; + + pte_root = (enable ? pte_root | DTE_FLAG_HAD : + pte_root & ~DTE_FLAG_HAD); + + /* Flush device DTE */ + dev_table[dev_data->devid].data[0] = pte_root; + device_flush_dte(dev_data); + domain_flush = true; + } + + /* Flush IOTLB to mark IOPTE dirty on the next translation(s) */ + if (domain_flush) { + amd_iommu_domain_flush_tlb_pde(pdomain); + amd_iommu_domain_flush_complete(pdomain); + } + pdomain->dirty_tracking = enable; + spin_unlock_irqrestore(&pdomain->lock, flags); + + return 0; +} + +static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct protection_domain *pdomain = to_pdomain(domain); + struct io_pgtable_ops *ops = &pdomain->iop.iop.ops; + unsigned long lflags; + + if (!ops || !ops->read_and_clear_dirty) + return -EOPNOTSUPP; + + spin_lock_irqsave(&pdomain->lock, lflags); + if (!pdomain->dirty_tracking && dirty->bitmap) { + spin_unlock_irqrestore(&pdomain->lock, lflags); + return -EINVAL; + } + spin_unlock_irqrestore(&pdomain->lock, lflags); + + return ops->read_and_clear_dirty(ops, iova, size, flags, dirty); +} + static void amd_iommu_get_resv_regions(struct device *dev, struct list_head *head) { @@ -2406,7 +2581,6 @@ bool amd_iommu_is_attach_deferred(struct device *dev) return dev_data->defer_attach; } -EXPORT_SYMBOL_GPL(amd_iommu_is_attach_deferred); static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) { @@ -2446,7 +2620,7 @@ static int amd_iommu_def_domain_type(struct device *dev) * and require remapping. * - SNP is enabled, because it prohibits DTE[Mode]=0. */ - if (dev_data->iommu_v2 && + if (pdev_pasid_supported(dev_data) && !cc_platform_has(CC_ATTR_MEM_ENCRYPT) && !amd_iommu_snp_en) { return IOMMU_DOMAIN_IDENTITY; @@ -2461,9 +2635,15 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) return true; } +const struct iommu_dirty_ops amd_dirty_ops = { + .set_dirty_tracking = amd_iommu_set_dirty_tracking, + .read_and_clear_dirty = amd_iommu_read_and_clear_dirty, +}; + const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .domain_alloc = amd_iommu_domain_alloc, + .domain_alloc_user = amd_iommu_domain_alloc_user, .probe_device = amd_iommu_probe_device, .release_device = amd_iommu_release_device, .probe_finalize = amd_iommu_probe_finalize, @@ -2485,93 +2665,6 @@ const struct iommu_ops amd_iommu_ops = { } }; -/***************************************************************************** - * - * The next functions do a basic initialization of IOMMU for pass through - * mode - * - * In passthrough mode the IOMMU is initialized and enabled but not used for - * DMA-API translation. - * - *****************************************************************************/ - -/* IOMMUv2 specific functions */ -int amd_iommu_register_ppr_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_register(&ppr_notifier, nb); -} -EXPORT_SYMBOL(amd_iommu_register_ppr_notifier); - -int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(&ppr_notifier, nb); -} -EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier); - -void amd_iommu_domain_direct_map(struct iommu_domain *dom) -{ - struct protection_domain *domain = to_pdomain(dom); - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - - if (domain->iop.pgtbl_cfg.tlb) - free_io_pgtable_ops(&domain->iop.iop.ops); - - spin_unlock_irqrestore(&domain->lock, flags); -} -EXPORT_SYMBOL(amd_iommu_domain_direct_map); - -/* Note: This function expects iommu_domain->lock to be held prior calling the function. */ -static int domain_enable_v2(struct protection_domain *domain, int pasids) -{ - int levels; - - /* Number of GCR3 table levels required */ - for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9) - levels += 1; - - if (levels > amd_iommu_max_glx_val) - return -EINVAL; - - domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC); - if (domain->gcr3_tbl == NULL) - return -ENOMEM; - - domain->glx = levels; - domain->flags |= PD_IOMMUV2_MASK; - - amd_iommu_domain_update(domain); - - return 0; -} - -int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids) -{ - struct protection_domain *pdom = to_pdomain(dom); - unsigned long flags; - int ret; - - spin_lock_irqsave(&pdom->lock, flags); - - /* - * Save us all sanity checks whether devices already in the - * domain support IOMMUv2. Just force that the domain has no - * devices attached when it is switched into IOMMUv2 mode. - */ - ret = -EBUSY; - if (pdom->dev_cnt > 0 || pdom->flags & PD_IOMMUV2_MASK) - goto out; - - if (!pdom->gcr3_tbl) - ret = domain_enable_v2(pdom, pasids); - -out: - spin_unlock_irqrestore(&pdom->lock, flags); - return ret; -} -EXPORT_SYMBOL(amd_iommu_domain_enable_v2); - static int __flush_pasid(struct protection_domain *domain, u32 pasid, u64 address, bool size) { @@ -2609,10 +2702,10 @@ static int __flush_pasid(struct protection_domain *domain, u32 pasid, There might be non-IOMMUv2 capable devices in an IOMMUv2 * domain. */ - if (!dev_data->ats.enabled) + if (!dev_data->ats_enabled) continue; - qdep = dev_data->ats.qdep; + qdep = dev_data->ats_qdep; iommu = rlookup_amd_iommu(dev_data->dev); if (!iommu) continue; @@ -2653,7 +2746,6 @@ int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, return ret; } -EXPORT_SYMBOL(amd_iommu_flush_page); static int __amd_iommu_flush_tlb(struct protection_domain *domain, u32 pasid) { @@ -2673,7 +2765,6 @@ int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid) return ret; } -EXPORT_SYMBOL(amd_iommu_flush_tlb); static u64 *__get_gcr3_pte(u64 *root, int level, u32 pasid, bool alloc) { @@ -2753,7 +2844,6 @@ int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid, return ret; } -EXPORT_SYMBOL(amd_iommu_domain_set_gcr3); int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, u32 pasid) { @@ -2767,7 +2857,6 @@ int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, u32 pasid) return ret; } -EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3); int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid, int status, int tag) @@ -2786,49 +2875,6 @@ int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid, return iommu_queue_command(iommu, &cmd); } -EXPORT_SYMBOL(amd_iommu_complete_ppr); - -int amd_iommu_device_info(struct pci_dev *pdev, - struct amd_iommu_device_info *info) -{ - int max_pasids; - int pos; - - if (pdev == NULL || info == NULL) - return -EINVAL; - - if (!amd_iommu_v2_supported()) - return -EINVAL; - - memset(info, 0, sizeof(*info)); - - if (pci_ats_supported(pdev)) - info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; - - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); - if (pos) - info->flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP; - - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); - if (pos) { - int features; - - max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1)); - max_pasids = min(max_pasids, (1 << 20)); - - info->flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP; - info->max_pasids = min(pci_max_pasids(pdev), max_pasids); - - features = pci_pasid_features(pdev); - if (features & PCI_PASID_CAP_EXEC) - info->flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP; - if (features & PCI_PASID_CAP_PRIV) - info->flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP; - } - - return 0; -} -EXPORT_SYMBOL(amd_iommu_device_info); #ifdef CONFIG_IRQ_REMAP diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c deleted file mode 100644 index 57c2fb1146..0000000000 --- a/drivers/iommu/amd/iommu_v2.c +++ /dev/null @@ -1,996 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2010-2012 Advanced Micro Devices, Inc. - * Author: Joerg Roedel <jroedel@suse.de> - */ - -#define pr_fmt(fmt) "AMD-Vi: " fmt - -#include <linux/refcount.h> -#include <linux/mmu_notifier.h> -#include <linux/amd-iommu.h> -#include <linux/mm_types.h> -#include <linux/profile.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/sched/mm.h> -#include <linux/wait.h> -#include <linux/pci.h> -#include <linux/gfp.h> -#include <linux/cc_platform.h> - -#include "amd_iommu.h" - -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Joerg Roedel <jroedel@suse.de>"); - -#define PRI_QUEUE_SIZE 512 - -struct pri_queue { - atomic_t inflight; - bool finish; - int status; -}; - -struct pasid_state { - struct list_head list; /* For global state-list */ - refcount_t count; /* Reference count */ - unsigned mmu_notifier_count; /* Counting nested mmu_notifier - calls */ - struct mm_struct *mm; /* mm_struct for the faults */ - struct mmu_notifier mn; /* mmu_notifier handle */ - struct pri_queue pri[PRI_QUEUE_SIZE]; /* PRI tag states */ - struct device_state *device_state; /* Link to our device_state */ - u32 pasid; /* PASID index */ - bool invalid; /* Used during setup and - teardown of the pasid */ - spinlock_t lock; /* Protect pri_queues and - mmu_notifer_count */ - wait_queue_head_t wq; /* To wait for count == 0 */ -}; - -struct device_state { - struct list_head list; - u32 sbdf; - atomic_t count; - struct pci_dev *pdev; - struct pasid_state **states; - struct iommu_domain *domain; - int pasid_levels; - int max_pasids; - amd_iommu_invalid_ppr_cb inv_ppr_cb; - amd_iommu_invalidate_ctx inv_ctx_cb; - spinlock_t lock; - wait_queue_head_t wq; -}; - -struct fault { - struct work_struct work; - struct device_state *dev_state; - struct pasid_state *state; - struct mm_struct *mm; - u64 address; - u32 pasid; - u16 tag; - u16 finish; - u16 flags; -}; - -static LIST_HEAD(state_list); -static DEFINE_SPINLOCK(state_lock); - -static struct workqueue_struct *iommu_wq; - -static void free_pasid_states(struct device_state *dev_state); - -static struct device_state *__get_device_state(u32 sbdf) -{ - struct device_state *dev_state; - - list_for_each_entry(dev_state, &state_list, list) { - if (dev_state->sbdf == sbdf) - return dev_state; - } - - return NULL; -} - -static struct device_state *get_device_state(u32 sbdf) -{ - struct device_state *dev_state; - unsigned long flags; - - spin_lock_irqsave(&state_lock, flags); - dev_state = __get_device_state(sbdf); - if (dev_state != NULL) - atomic_inc(&dev_state->count); - spin_unlock_irqrestore(&state_lock, flags); - - return dev_state; -} - -static void free_device_state(struct device_state *dev_state) -{ - struct iommu_group *group; - - /* Get rid of any remaining pasid states */ - free_pasid_states(dev_state); - - /* - * Wait until the last reference is dropped before freeing - * the device state. - */ - wait_event(dev_state->wq, !atomic_read(&dev_state->count)); - - /* - * First detach device from domain - No more PRI requests will arrive - * from that device after it is unbound from the IOMMUv2 domain. - */ - group = iommu_group_get(&dev_state->pdev->dev); - if (WARN_ON(!group)) - return; - - iommu_detach_group(dev_state->domain, group); - - iommu_group_put(group); - - /* Everything is down now, free the IOMMUv2 domain */ - iommu_domain_free(dev_state->domain); - - /* Finally get rid of the device-state */ - kfree(dev_state); -} - -static void put_device_state(struct device_state *dev_state) -{ - if (atomic_dec_and_test(&dev_state->count)) - wake_up(&dev_state->wq); -} - -/* Must be called under dev_state->lock */ -static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state, - u32 pasid, bool alloc) -{ - struct pasid_state **root, **ptr; - int level, index; - - level = dev_state->pasid_levels; - root = dev_state->states; - - while (true) { - - index = (pasid >> (9 * level)) & 0x1ff; - ptr = &root[index]; - - if (level == 0) - break; - - if (*ptr == NULL) { - if (!alloc) - return NULL; - - *ptr = (void *)get_zeroed_page(GFP_ATOMIC); - if (*ptr == NULL) - return NULL; - } - - root = (struct pasid_state **)*ptr; - level -= 1; - } - - return ptr; -} - -static int set_pasid_state(struct device_state *dev_state, - struct pasid_state *pasid_state, - u32 pasid) -{ - struct pasid_state **ptr; - unsigned long flags; - int ret; - - spin_lock_irqsave(&dev_state->lock, flags); - ptr = __get_pasid_state_ptr(dev_state, pasid, true); - - ret = -ENOMEM; - if (ptr == NULL) - goto out_unlock; - - ret = -ENOMEM; - if (*ptr != NULL) - goto out_unlock; - - *ptr = pasid_state; - - ret = 0; - -out_unlock: - spin_unlock_irqrestore(&dev_state->lock, flags); - - return ret; -} - -static void clear_pasid_state(struct device_state *dev_state, u32 pasid) -{ - struct pasid_state **ptr; - unsigned long flags; - - spin_lock_irqsave(&dev_state->lock, flags); - ptr = __get_pasid_state_ptr(dev_state, pasid, true); - - if (ptr == NULL) - goto out_unlock; - - *ptr = NULL; - -out_unlock: - spin_unlock_irqrestore(&dev_state->lock, flags); -} - -static struct pasid_state *get_pasid_state(struct device_state *dev_state, - u32 pasid) -{ - struct pasid_state **ptr, *ret = NULL; - unsigned long flags; - - spin_lock_irqsave(&dev_state->lock, flags); - ptr = __get_pasid_state_ptr(dev_state, pasid, false); - - if (ptr == NULL) - goto out_unlock; - - ret = *ptr; - if (ret) - refcount_inc(&ret->count); - -out_unlock: - spin_unlock_irqrestore(&dev_state->lock, flags); - - return ret; -} - -static void free_pasid_state(struct pasid_state *pasid_state) -{ - kfree(pasid_state); -} - -static void put_pasid_state(struct pasid_state *pasid_state) -{ - if (refcount_dec_and_test(&pasid_state->count)) - wake_up(&pasid_state->wq); -} - -static void put_pasid_state_wait(struct pasid_state *pasid_state) -{ - if (!refcount_dec_and_test(&pasid_state->count)) - wait_event(pasid_state->wq, !refcount_read(&pasid_state->count)); - free_pasid_state(pasid_state); -} - -static void unbind_pasid(struct pasid_state *pasid_state) -{ - struct iommu_domain *domain; - - domain = pasid_state->device_state->domain; - - /* - * Mark pasid_state as invalid, no more faults will we added to the - * work queue after this is visible everywhere. - */ - pasid_state->invalid = true; - - /* Make sure this is visible */ - smp_wmb(); - - /* After this the device/pasid can't access the mm anymore */ - amd_iommu_domain_clear_gcr3(domain, pasid_state->pasid); - - /* Make sure no more pending faults are in the queue */ - flush_workqueue(iommu_wq); -} - -static void free_pasid_states_level1(struct pasid_state **tbl) -{ - int i; - - for (i = 0; i < 512; ++i) { - if (tbl[i] == NULL) - continue; - - free_page((unsigned long)tbl[i]); - } -} - -static void free_pasid_states_level2(struct pasid_state **tbl) -{ - struct pasid_state **ptr; - int i; - - for (i = 0; i < 512; ++i) { - if (tbl[i] == NULL) - continue; - - ptr = (struct pasid_state **)tbl[i]; - free_pasid_states_level1(ptr); - } -} - -static void free_pasid_states(struct device_state *dev_state) -{ - struct pasid_state *pasid_state; - int i; - - for (i = 0; i < dev_state->max_pasids; ++i) { - pasid_state = get_pasid_state(dev_state, i); - if (pasid_state == NULL) - continue; - - put_pasid_state(pasid_state); - - /* Clear the pasid state so that the pasid can be re-used */ - clear_pasid_state(dev_state, pasid_state->pasid); - - /* - * This will call the mn_release function and - * unbind the PASID - */ - mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm); - - put_pasid_state_wait(pasid_state); /* Reference taken in - amd_iommu_bind_pasid */ - - /* Drop reference taken in amd_iommu_bind_pasid */ - put_device_state(dev_state); - } - - if (dev_state->pasid_levels == 2) - free_pasid_states_level2(dev_state->states); - else if (dev_state->pasid_levels == 1) - free_pasid_states_level1(dev_state->states); - else - BUG_ON(dev_state->pasid_levels != 0); - - free_page((unsigned long)dev_state->states); -} - -static struct pasid_state *mn_to_state(struct mmu_notifier *mn) -{ - return container_of(mn, struct pasid_state, mn); -} - -static void mn_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - struct pasid_state *pasid_state; - struct device_state *dev_state; - - pasid_state = mn_to_state(mn); - dev_state = pasid_state->device_state; - - if ((start ^ (end - 1)) < PAGE_SIZE) - amd_iommu_flush_page(dev_state->domain, pasid_state->pasid, - start); - else - amd_iommu_flush_tlb(dev_state->domain, pasid_state->pasid); -} - -static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) -{ - struct pasid_state *pasid_state; - struct device_state *dev_state; - bool run_inv_ctx_cb; - - might_sleep(); - - pasid_state = mn_to_state(mn); - dev_state = pasid_state->device_state; - run_inv_ctx_cb = !pasid_state->invalid; - - if (run_inv_ctx_cb && dev_state->inv_ctx_cb) - dev_state->inv_ctx_cb(dev_state->pdev, pasid_state->pasid); - - unbind_pasid(pasid_state); -} - -static const struct mmu_notifier_ops iommu_mn = { - .release = mn_release, - .arch_invalidate_secondary_tlbs = mn_arch_invalidate_secondary_tlbs, -}; - -static void set_pri_tag_status(struct pasid_state *pasid_state, - u16 tag, int status) -{ - unsigned long flags; - - spin_lock_irqsave(&pasid_state->lock, flags); - pasid_state->pri[tag].status = status; - spin_unlock_irqrestore(&pasid_state->lock, flags); -} - -static void finish_pri_tag(struct device_state *dev_state, - struct pasid_state *pasid_state, - u16 tag) -{ - unsigned long flags; - - spin_lock_irqsave(&pasid_state->lock, flags); - if (atomic_dec_and_test(&pasid_state->pri[tag].inflight) && - pasid_state->pri[tag].finish) { - amd_iommu_complete_ppr(dev_state->pdev, pasid_state->pasid, - pasid_state->pri[tag].status, tag); - pasid_state->pri[tag].finish = false; - pasid_state->pri[tag].status = PPR_SUCCESS; - } - spin_unlock_irqrestore(&pasid_state->lock, flags); -} - -static void handle_fault_error(struct fault *fault) -{ - int status; - - if (!fault->dev_state->inv_ppr_cb) { - set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); - return; - } - - status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev, - fault->pasid, - fault->address, - fault->flags); - switch (status) { - case AMD_IOMMU_INV_PRI_RSP_SUCCESS: - set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS); - break; - case AMD_IOMMU_INV_PRI_RSP_INVALID: - set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); - break; - case AMD_IOMMU_INV_PRI_RSP_FAIL: - set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE); - break; - default: - BUG(); - } -} - -static bool access_error(struct vm_area_struct *vma, struct fault *fault) -{ - unsigned long requested = 0; - - if (fault->flags & PPR_FAULT_EXEC) - requested |= VM_EXEC; - - if (fault->flags & PPR_FAULT_READ) - requested |= VM_READ; - - if (fault->flags & PPR_FAULT_WRITE) - requested |= VM_WRITE; - - return (requested & ~vma->vm_flags) != 0; -} - -static void do_fault(struct work_struct *work) -{ - struct fault *fault = container_of(work, struct fault, work); - struct vm_area_struct *vma; - vm_fault_t ret = VM_FAULT_ERROR; - unsigned int flags = 0; - struct mm_struct *mm; - u64 address; - - mm = fault->state->mm; - address = fault->address; - - if (fault->flags & PPR_FAULT_USER) - flags |= FAULT_FLAG_USER; - if (fault->flags & PPR_FAULT_WRITE) - flags |= FAULT_FLAG_WRITE; - flags |= FAULT_FLAG_REMOTE; - - mmap_read_lock(mm); - vma = vma_lookup(mm, address); - if (!vma) - /* failed to get a vma in the right range */ - goto out; - - /* Check if we have the right permissions on the vma */ - if (access_error(vma, fault)) - goto out; - - ret = handle_mm_fault(vma, address, flags, NULL); -out: - mmap_read_unlock(mm); - - if (ret & VM_FAULT_ERROR) - /* failed to service fault */ - handle_fault_error(fault); - - finish_pri_tag(fault->dev_state, fault->state, fault->tag); - - put_pasid_state(fault->state); - - kfree(fault); -} - -static int ppr_notifier(struct notifier_block *nb, unsigned long e, void *data) -{ - struct amd_iommu_fault *iommu_fault; - struct pasid_state *pasid_state; - struct device_state *dev_state; - struct pci_dev *pdev = NULL; - unsigned long flags; - struct fault *fault; - bool finish; - u16 tag, devid, seg_id; - int ret; - - iommu_fault = data; - tag = iommu_fault->tag & 0x1ff; - finish = (iommu_fault->tag >> 9) & 1; - - seg_id = PCI_SBDF_TO_SEGID(iommu_fault->sbdf); - devid = PCI_SBDF_TO_DEVID(iommu_fault->sbdf); - pdev = pci_get_domain_bus_and_slot(seg_id, PCI_BUS_NUM(devid), - devid & 0xff); - if (!pdev) - return -ENODEV; - - ret = NOTIFY_DONE; - - /* In kdump kernel pci dev is not initialized yet -> send INVALID */ - if (amd_iommu_is_attach_deferred(&pdev->dev)) { - amd_iommu_complete_ppr(pdev, iommu_fault->pasid, - PPR_INVALID, tag); - goto out; - } - - dev_state = get_device_state(iommu_fault->sbdf); - if (dev_state == NULL) - goto out; - - pasid_state = get_pasid_state(dev_state, iommu_fault->pasid); - if (pasid_state == NULL || pasid_state->invalid) { - /* We know the device but not the PASID -> send INVALID */ - amd_iommu_complete_ppr(dev_state->pdev, iommu_fault->pasid, - PPR_INVALID, tag); - goto out_drop_state; - } - - spin_lock_irqsave(&pasid_state->lock, flags); - atomic_inc(&pasid_state->pri[tag].inflight); - if (finish) - pasid_state->pri[tag].finish = true; - spin_unlock_irqrestore(&pasid_state->lock, flags); - - fault = kzalloc(sizeof(*fault), GFP_ATOMIC); - if (fault == NULL) { - /* We are OOM - send success and let the device re-fault */ - finish_pri_tag(dev_state, pasid_state, tag); - goto out_drop_state; - } - - fault->dev_state = dev_state; - fault->address = iommu_fault->address; - fault->state = pasid_state; - fault->tag = tag; - fault->finish = finish; - fault->pasid = iommu_fault->pasid; - fault->flags = iommu_fault->flags; - INIT_WORK(&fault->work, do_fault); - - queue_work(iommu_wq, &fault->work); - - ret = NOTIFY_OK; - -out_drop_state: - - if (ret != NOTIFY_OK && pasid_state) - put_pasid_state(pasid_state); - - put_device_state(dev_state); - -out: - pci_dev_put(pdev); - return ret; -} - -static struct notifier_block ppr_nb = { - .notifier_call = ppr_notifier, -}; - -int amd_iommu_bind_pasid(struct pci_dev *pdev, u32 pasid, - struct task_struct *task) -{ - struct pasid_state *pasid_state; - struct device_state *dev_state; - struct mm_struct *mm; - u32 sbdf; - int ret; - - might_sleep(); - - if (!amd_iommu_v2_supported()) - return -ENODEV; - - sbdf = get_pci_sbdf_id(pdev); - dev_state = get_device_state(sbdf); - - if (dev_state == NULL) - return -EINVAL; - - ret = -EINVAL; - if (pasid >= dev_state->max_pasids) - goto out; - - ret = -ENOMEM; - pasid_state = kzalloc(sizeof(*pasid_state), GFP_KERNEL); - if (pasid_state == NULL) - goto out; - - - refcount_set(&pasid_state->count, 1); - init_waitqueue_head(&pasid_state->wq); - spin_lock_init(&pasid_state->lock); - - mm = get_task_mm(task); - pasid_state->mm = mm; - pasid_state->device_state = dev_state; - pasid_state->pasid = pasid; - pasid_state->invalid = true; /* Mark as valid only if we are - done with setting up the pasid */ - pasid_state->mn.ops = &iommu_mn; - - if (pasid_state->mm == NULL) - goto out_free; - - ret = mmu_notifier_register(&pasid_state->mn, mm); - if (ret) - goto out_free; - - ret = set_pasid_state(dev_state, pasid_state, pasid); - if (ret) - goto out_unregister; - - ret = amd_iommu_domain_set_gcr3(dev_state->domain, pasid, - __pa(pasid_state->mm->pgd)); - if (ret) - goto out_clear_state; - - /* Now we are ready to handle faults */ - pasid_state->invalid = false; - - /* - * Drop the reference to the mm_struct here. We rely on the - * mmu_notifier release call-back to inform us when the mm - * is going away. - */ - mmput(mm); - - return 0; - -out_clear_state: - clear_pasid_state(dev_state, pasid); - -out_unregister: - mmu_notifier_unregister(&pasid_state->mn, mm); - mmput(mm); - -out_free: - free_pasid_state(pasid_state); - -out: - put_device_state(dev_state); - - return ret; -} -EXPORT_SYMBOL(amd_iommu_bind_pasid); - -void amd_iommu_unbind_pasid(struct pci_dev *pdev, u32 pasid) -{ - struct pasid_state *pasid_state; - struct device_state *dev_state; - u32 sbdf; - - might_sleep(); - - if (!amd_iommu_v2_supported()) - return; - - sbdf = get_pci_sbdf_id(pdev); - dev_state = get_device_state(sbdf); - if (dev_state == NULL) - return; - - if (pasid >= dev_state->max_pasids) - goto out; - - pasid_state = get_pasid_state(dev_state, pasid); - if (pasid_state == NULL) - goto out; - /* - * Drop reference taken here. We are safe because we still hold - * the reference taken in the amd_iommu_bind_pasid function. - */ - put_pasid_state(pasid_state); - - /* Clear the pasid state so that the pasid can be re-used */ - clear_pasid_state(dev_state, pasid_state->pasid); - - /* - * Call mmu_notifier_unregister to drop our reference - * to pasid_state->mm - */ - mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm); - - put_pasid_state_wait(pasid_state); /* Reference taken in - amd_iommu_bind_pasid */ -out: - /* Drop reference taken in this function */ - put_device_state(dev_state); - - /* Drop reference taken in amd_iommu_bind_pasid */ - put_device_state(dev_state); -} -EXPORT_SYMBOL(amd_iommu_unbind_pasid); - -int amd_iommu_init_device(struct pci_dev *pdev, int pasids) -{ - struct device_state *dev_state; - struct iommu_group *group; - unsigned long flags; - int ret, tmp; - u32 sbdf; - - might_sleep(); - - /* - * When memory encryption is active the device is likely not in a - * direct-mapped domain. Forbid using IOMMUv2 functionality for now. - */ - if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) - return -ENODEV; - - if (!amd_iommu_v2_supported()) - return -ENODEV; - - if (pasids <= 0 || pasids > (PASID_MASK + 1)) - return -EINVAL; - - sbdf = get_pci_sbdf_id(pdev); - - dev_state = kzalloc(sizeof(*dev_state), GFP_KERNEL); - if (dev_state == NULL) - return -ENOMEM; - - spin_lock_init(&dev_state->lock); - init_waitqueue_head(&dev_state->wq); - dev_state->pdev = pdev; - dev_state->sbdf = sbdf; - - tmp = pasids; - for (dev_state->pasid_levels = 0; (tmp - 1) & ~0x1ff; tmp >>= 9) - dev_state->pasid_levels += 1; - - atomic_set(&dev_state->count, 1); - dev_state->max_pasids = pasids; - - ret = -ENOMEM; - dev_state->states = (void *)get_zeroed_page(GFP_KERNEL); - if (dev_state->states == NULL) - goto out_free_dev_state; - - dev_state->domain = iommu_domain_alloc(&pci_bus_type); - if (dev_state->domain == NULL) - goto out_free_states; - - /* See iommu_is_default_domain() */ - dev_state->domain->type = IOMMU_DOMAIN_IDENTITY; - amd_iommu_domain_direct_map(dev_state->domain); - - ret = amd_iommu_domain_enable_v2(dev_state->domain, pasids); - if (ret) - goto out_free_domain; - - group = iommu_group_get(&pdev->dev); - if (!group) { - ret = -EINVAL; - goto out_free_domain; - } - - ret = iommu_attach_group(dev_state->domain, group); - if (ret != 0) - goto out_drop_group; - - iommu_group_put(group); - - spin_lock_irqsave(&state_lock, flags); - - if (__get_device_state(sbdf) != NULL) { - spin_unlock_irqrestore(&state_lock, flags); - ret = -EBUSY; - goto out_free_domain; - } - - list_add_tail(&dev_state->list, &state_list); - - spin_unlock_irqrestore(&state_lock, flags); - - return 0; - -out_drop_group: - iommu_group_put(group); - -out_free_domain: - iommu_domain_free(dev_state->domain); - -out_free_states: - free_page((unsigned long)dev_state->states); - -out_free_dev_state: - kfree(dev_state); - - return ret; -} -EXPORT_SYMBOL(amd_iommu_init_device); - -void amd_iommu_free_device(struct pci_dev *pdev) -{ - struct device_state *dev_state; - unsigned long flags; - u32 sbdf; - - if (!amd_iommu_v2_supported()) - return; - - sbdf = get_pci_sbdf_id(pdev); - - spin_lock_irqsave(&state_lock, flags); - - dev_state = __get_device_state(sbdf); - if (dev_state == NULL) { - spin_unlock_irqrestore(&state_lock, flags); - return; - } - - list_del(&dev_state->list); - - spin_unlock_irqrestore(&state_lock, flags); - - put_device_state(dev_state); - free_device_state(dev_state); -} -EXPORT_SYMBOL(amd_iommu_free_device); - -int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev, - amd_iommu_invalid_ppr_cb cb) -{ - struct device_state *dev_state; - unsigned long flags; - u32 sbdf; - int ret; - - if (!amd_iommu_v2_supported()) - return -ENODEV; - - sbdf = get_pci_sbdf_id(pdev); - - spin_lock_irqsave(&state_lock, flags); - - ret = -EINVAL; - dev_state = __get_device_state(sbdf); - if (dev_state == NULL) - goto out_unlock; - - dev_state->inv_ppr_cb = cb; - - ret = 0; - -out_unlock: - spin_unlock_irqrestore(&state_lock, flags); - - return ret; -} -EXPORT_SYMBOL(amd_iommu_set_invalid_ppr_cb); - -int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev, - amd_iommu_invalidate_ctx cb) -{ - struct device_state *dev_state; - unsigned long flags; - u32 sbdf; - int ret; - - if (!amd_iommu_v2_supported()) - return -ENODEV; - - sbdf = get_pci_sbdf_id(pdev); - - spin_lock_irqsave(&state_lock, flags); - - ret = -EINVAL; - dev_state = __get_device_state(sbdf); - if (dev_state == NULL) - goto out_unlock; - - dev_state->inv_ctx_cb = cb; - - ret = 0; - -out_unlock: - spin_unlock_irqrestore(&state_lock, flags); - - return ret; -} -EXPORT_SYMBOL(amd_iommu_set_invalidate_ctx_cb); - -static int __init amd_iommu_v2_init(void) -{ - int ret; - - if (!amd_iommu_v2_supported()) { - pr_info("AMD IOMMUv2 functionality not available on this system - This is not a bug.\n"); - /* - * Load anyway to provide the symbols to other modules - * which may use AMD IOMMUv2 optionally. - */ - return 0; - } - - ret = -ENOMEM; - iommu_wq = alloc_workqueue("amd_iommu_v2", WQ_MEM_RECLAIM, 0); - if (iommu_wq == NULL) - goto out; - - amd_iommu_register_ppr_notifier(&ppr_nb); - - pr_info("AMD IOMMUv2 loaded and initialized\n"); - - return 0; - -out: - return ret; -} - -static void __exit amd_iommu_v2_exit(void) -{ - struct device_state *dev_state, *next; - unsigned long flags; - LIST_HEAD(freelist); - - if (!amd_iommu_v2_supported()) - return; - - amd_iommu_unregister_ppr_notifier(&ppr_nb); - - flush_workqueue(iommu_wq); - - /* - * The loop below might call flush_workqueue(), so call - * destroy_workqueue() after it - */ - spin_lock_irqsave(&state_lock, flags); - - list_for_each_entry_safe(dev_state, next, &state_list, list) { - WARN_ON_ONCE(1); - - put_device_state(dev_state); - list_del(&dev_state->list); - list_add_tail(&dev_state->list, &freelist); - } - - spin_unlock_irqrestore(&state_lock, flags); - - /* - * Since free_device_state waits on the count to be zero, - * we need to free dev_state outside the spinlock. - */ - list_for_each_entry_safe(dev_state, next, &freelist, list) { - list_del(&dev_state->list); - free_device_state(dev_state); - } - - destroy_workqueue(iommu_wq); -} - -module_init(amd_iommu_v2_init); -module_exit(amd_iommu_v2_exit); diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index 0b89275084..ee05f4824b 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -196,7 +196,6 @@ struct apple_dart_hw { * @lock: lock for hardware operations involving this dart * @pgsize: pagesize supported by this DART * @supports_bypass: indicates if this DART supports bypass mode - * @force_bypass: force bypass mode due to pagesize mismatch? * @sid2group: maps stream ids to iommu_groups * @iommu: iommu core device */ @@ -217,7 +216,6 @@ struct apple_dart { u32 pgsize; u32 num_streams; u32 supports_bypass : 1; - u32 force_bypass : 1; struct iommu_group *sid2group[DART_MAX_STREAMS]; struct iommu_device iommu; @@ -506,10 +504,11 @@ static void apple_dart_iotlb_sync(struct iommu_domain *domain, apple_dart_domain_flush_tlb(to_dart_domain(domain)); } -static void apple_dart_iotlb_sync_map(struct iommu_domain *domain, - unsigned long iova, size_t size) +static int apple_dart_iotlb_sync_map(struct iommu_domain *domain, + unsigned long iova, size_t size) { apple_dart_domain_flush_tlb(to_dart_domain(domain)); + return 0; } static phys_addr_t apple_dart_iova_to_phys(struct iommu_domain *domain, @@ -568,15 +567,17 @@ apple_dart_setup_translation(struct apple_dart_domain *domain, stream_map->dart->hw->invalidate_tlb(stream_map); } -static int apple_dart_finalize_domain(struct iommu_domain *domain, +static int apple_dart_finalize_domain(struct apple_dart_domain *dart_domain, struct apple_dart_master_cfg *cfg) { - struct apple_dart_domain *dart_domain = to_dart_domain(domain); struct apple_dart *dart = cfg->stream_maps[0].dart; struct io_pgtable_cfg pgtbl_cfg; int ret = 0; int i, j; + if (dart->pgsize > PAGE_SIZE) + return -EINVAL; + mutex_lock(&dart_domain->init_lock); if (dart_domain->finalized) @@ -597,17 +598,18 @@ static int apple_dart_finalize_domain(struct iommu_domain *domain, .iommu_dev = dart->dev, }; - dart_domain->pgtbl_ops = - alloc_io_pgtable_ops(dart->hw->fmt, &pgtbl_cfg, domain); + dart_domain->pgtbl_ops = alloc_io_pgtable_ops(dart->hw->fmt, &pgtbl_cfg, + &dart_domain->domain); if (!dart_domain->pgtbl_ops) { ret = -ENOMEM; goto done; } - domain->pgsize_bitmap = pgtbl_cfg.pgsize_bitmap; - domain->geometry.aperture_start = 0; - domain->geometry.aperture_end = (dma_addr_t)DMA_BIT_MASK(dart->ias); - domain->geometry.force_aperture = true; + dart_domain->domain.pgsize_bitmap = pgtbl_cfg.pgsize_bitmap; + dart_domain->domain.geometry.aperture_start = 0; + dart_domain->domain.geometry.aperture_end = + (dma_addr_t)DMA_BIT_MASK(dart->ias); + dart_domain->domain.geometry.force_aperture = true; dart_domain->finalized = true; @@ -651,47 +653,72 @@ static int apple_dart_domain_add_streams(struct apple_dart_domain *domain, true); } -static int apple_dart_attach_dev(struct iommu_domain *domain, - struct device *dev) +static int apple_dart_attach_dev_paging(struct iommu_domain *domain, + struct device *dev) { int ret, i; struct apple_dart_stream_map *stream_map; struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); struct apple_dart_domain *dart_domain = to_dart_domain(domain); - if (cfg->stream_maps[0].dart->force_bypass && - domain->type != IOMMU_DOMAIN_IDENTITY) - return -EINVAL; - if (!cfg->stream_maps[0].dart->supports_bypass && - domain->type == IOMMU_DOMAIN_IDENTITY) - return -EINVAL; + ret = apple_dart_finalize_domain(dart_domain, cfg); + if (ret) + return ret; - ret = apple_dart_finalize_domain(domain, cfg); + ret = apple_dart_domain_add_streams(dart_domain, cfg); if (ret) return ret; - switch (domain->type) { - default: - ret = apple_dart_domain_add_streams(dart_domain, cfg); - if (ret) - return ret; + for_each_stream_map(i, cfg, stream_map) + apple_dart_setup_translation(dart_domain, stream_map); + return 0; +} - for_each_stream_map(i, cfg, stream_map) - apple_dart_setup_translation(dart_domain, stream_map); - break; - case IOMMU_DOMAIN_BLOCKED: - for_each_stream_map(i, cfg, stream_map) - apple_dart_hw_disable_dma(stream_map); - break; - case IOMMU_DOMAIN_IDENTITY: - for_each_stream_map(i, cfg, stream_map) - apple_dart_hw_enable_bypass(stream_map); - break; - } +static int apple_dart_attach_dev_identity(struct iommu_domain *domain, + struct device *dev) +{ + struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); + struct apple_dart_stream_map *stream_map; + int i; - return ret; + if (!cfg->stream_maps[0].dart->supports_bypass) + return -EINVAL; + + for_each_stream_map(i, cfg, stream_map) + apple_dart_hw_enable_bypass(stream_map); + return 0; } +static const struct iommu_domain_ops apple_dart_identity_ops = { + .attach_dev = apple_dart_attach_dev_identity, +}; + +static struct iommu_domain apple_dart_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &apple_dart_identity_ops, +}; + +static int apple_dart_attach_dev_blocked(struct iommu_domain *domain, + struct device *dev) +{ + struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); + struct apple_dart_stream_map *stream_map; + int i; + + for_each_stream_map(i, cfg, stream_map) + apple_dart_hw_disable_dma(stream_map); + return 0; +} + +static const struct iommu_domain_ops apple_dart_blocked_ops = { + .attach_dev = apple_dart_attach_dev_blocked, +}; + +static struct iommu_domain apple_dart_blocked_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &apple_dart_blocked_ops, +}; + static struct iommu_device *apple_dart_probe_device(struct device *dev) { struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); @@ -717,24 +744,26 @@ static void apple_dart_release_device(struct device *dev) kfree(cfg); } -static struct iommu_domain *apple_dart_domain_alloc(unsigned int type) +static struct iommu_domain *apple_dart_domain_alloc_paging(struct device *dev) { struct apple_dart_domain *dart_domain; - if (type != IOMMU_DOMAIN_DMA && type != IOMMU_DOMAIN_UNMANAGED && - type != IOMMU_DOMAIN_IDENTITY && type != IOMMU_DOMAIN_BLOCKED) - return NULL; - dart_domain = kzalloc(sizeof(*dart_domain), GFP_KERNEL); if (!dart_domain) return NULL; mutex_init(&dart_domain->init_lock); - /* no need to allocate pgtbl_ops or do any other finalization steps */ - if (type == IOMMU_DOMAIN_IDENTITY || type == IOMMU_DOMAIN_BLOCKED) - dart_domain->finalized = true; + if (dev) { + struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); + int ret; + ret = apple_dart_finalize_domain(dart_domain, cfg); + if (ret) { + kfree(dart_domain); + return ERR_PTR(ret); + } + } return &dart_domain->domain; } @@ -770,8 +799,6 @@ static int apple_dart_of_xlate(struct device *dev, struct of_phandle_args *args) if (cfg_dart) { if (cfg_dart->supports_bypass != dart->supports_bypass) return -EINVAL; - if (cfg_dart->force_bypass != dart->force_bypass) - return -EINVAL; if (cfg_dart->pgsize != dart->pgsize) return -EINVAL; } @@ -913,7 +940,7 @@ static int apple_dart_def_domain_type(struct device *dev) { struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); - if (cfg->stream_maps[0].dart->force_bypass) + if (cfg->stream_maps[0].dart->pgsize > PAGE_SIZE) return IOMMU_DOMAIN_IDENTITY; if (!cfg->stream_maps[0].dart->supports_bypass) return IOMMU_DOMAIN_DMA; @@ -947,7 +974,9 @@ static void apple_dart_get_resv_regions(struct device *dev, } static const struct iommu_ops apple_dart_iommu_ops = { - .domain_alloc = apple_dart_domain_alloc, + .identity_domain = &apple_dart_identity_domain, + .blocked_domain = &apple_dart_blocked_domain, + .domain_alloc_paging = apple_dart_domain_alloc_paging, .probe_device = apple_dart_probe_device, .release_device = apple_dart_release_device, .device_group = apple_dart_device_group, @@ -957,7 +986,7 @@ static const struct iommu_ops apple_dart_iommu_ops = { .pgsize_bitmap = -1UL, /* Restricted during dart probe */ .owner = THIS_MODULE, .default_domain_ops = &(const struct iommu_domain_ops) { - .attach_dev = apple_dart_attach_dev, + .attach_dev = apple_dart_attach_dev_paging, .map_pages = apple_dart_map_pages, .unmap_pages = apple_dart_unmap_pages, .flush_iotlb_all = apple_dart_flush_iotlb_all, @@ -1111,8 +1140,6 @@ static int apple_dart_probe(struct platform_device *pdev) goto err_clk_disable; } - dart->force_bypass = dart->pgsize > PAGE_SIZE; - ret = apple_dart_hw_reset(dart); if (ret) goto err_clk_disable; @@ -1136,7 +1163,8 @@ static int apple_dart_probe(struct platform_device *pdev) dev_info( &pdev->dev, "DART [pagesize %x, %d streams, bypass support: %d, bypass forced: %d] initialized\n", - dart->pgsize, dart->num_streams, dart->supports_bypass, dart->force_bypass); + dart->pgsize, dart->num_streams, dart->supports_bypass, + dart->pgsize > PAGE_SIZE); return 0; err_sysfs_remove: diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 8a16cd3ef4..4a27fbdb2d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -25,11 +25,9 @@ struct arm_smmu_mmu_notifier { #define mn_to_smmu(mn) container_of(mn, struct arm_smmu_mmu_notifier, mn) struct arm_smmu_bond { - struct iommu_sva sva; struct mm_struct *mm; struct arm_smmu_mmu_notifier *smmu_mn; struct list_head list; - refcount_t refs; }; #define sva_to_bond(handle) \ @@ -38,6 +36,25 @@ struct arm_smmu_bond { static DEFINE_MUTEX(sva_lock); /* + * Write the CD to the CD tables for all masters that this domain is attached + * to. Note that this is only used to update existing CD entries in the target + * CD table, for which it's assumed that arm_smmu_write_ctx_desc can't fail. + */ +static void arm_smmu_update_ctx_desc_devices(struct arm_smmu_domain *smmu_domain, + int ssid, + struct arm_smmu_ctx_desc *cd) +{ + struct arm_smmu_master *master; + unsigned long flags; + + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_for_each_entry(master, &smmu_domain->devices, domain_head) { + arm_smmu_write_ctx_desc(master, ssid, cd); + } + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); +} + +/* * Check if the CPU ASID is available on the SMMU side. If a private context * descriptor is using it, try to replace it. */ @@ -62,7 +79,7 @@ arm_smmu_share_asid(struct mm_struct *mm, u16 asid) return cd; } - smmu_domain = container_of(cd, struct arm_smmu_domain, s1_cfg.cd); + smmu_domain = container_of(cd, struct arm_smmu_domain, cd); smmu = smmu_domain->smmu; ret = xa_alloc(&arm_smmu_asid_xa, &new_asid, cd, @@ -80,7 +97,7 @@ arm_smmu_share_asid(struct mm_struct *mm, u16 asid) * be some overlap between use of both ASIDs, until we invalidate the * TLB. */ - arm_smmu_write_ctx_desc(smmu_domain, IOMMU_NO_PASID, cd); + arm_smmu_update_ctx_desc_devices(smmu_domain, IOMMU_NO_PASID, cd); /* Invalidate TLB entries previously associated with that context */ arm_smmu_tlb_inv_asid(smmu, asid); @@ -229,7 +246,8 @@ static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, smmu_domain); } - arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, start, size); + arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), start, + size); } static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) @@ -247,10 +265,11 @@ static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) * DMA may still be running. Keep the cd valid to avoid C_BAD_CD events, * but disable translation. */ - arm_smmu_write_ctx_desc(smmu_domain, mm->pasid, &quiet_cd); + arm_smmu_update_ctx_desc_devices(smmu_domain, mm_get_enqcmd_pasid(mm), + &quiet_cd); arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_mn->cd->asid); - arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0); + arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), 0, 0); smmu_mn->cleared = true; mutex_unlock(&sva_lock); @@ -304,16 +323,9 @@ arm_smmu_mmu_notifier_get(struct arm_smmu_domain *smmu_domain, goto err_free_cd; } - ret = arm_smmu_write_ctx_desc(smmu_domain, mm->pasid, cd); - if (ret) - goto err_put_notifier; - list_add(&smmu_mn->list, &smmu_domain->mmu_notifiers); return smmu_mn; -err_put_notifier: - /* Frees smmu_mn */ - mmu_notifier_put(&smmu_mn->mn); err_free_cd: arm_smmu_free_shared_cd(cd); return ERR_PTR(ret); @@ -329,7 +341,6 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) return; list_del(&smmu_mn->list); - arm_smmu_write_ctx_desc(smmu_domain, mm->pasid, NULL); /* * If we went through clear(), we've already invalidated, and no @@ -337,7 +348,8 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) */ if (!smmu_mn->cleared) { arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid); - arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0); + arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), 0, + 0); } /* Frees smmu_mn */ @@ -345,8 +357,8 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) arm_smmu_free_shared_cd(cd); } -static struct iommu_sva * -__arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm) +static int __arm_smmu_sva_bind(struct device *dev, ioasid_t pasid, + struct mm_struct *mm) { int ret; struct arm_smmu_bond *bond; @@ -355,23 +367,13 @@ __arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm) struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); if (!master || !master->sva_enabled) - return ERR_PTR(-ENODEV); - - /* If bind() was already called for this {dev, mm} pair, reuse it. */ - list_for_each_entry(bond, &master->bonds, list) { - if (bond->mm == mm) { - refcount_inc(&bond->refs); - return &bond->sva; - } - } + return -ENODEV; bond = kzalloc(sizeof(*bond), GFP_KERNEL); if (!bond) - return ERR_PTR(-ENOMEM); + return -ENOMEM; bond->mm = mm; - bond->sva.dev = dev; - refcount_set(&bond->refs, 1); bond->smmu_mn = arm_smmu_mmu_notifier_get(smmu_domain, mm); if (IS_ERR(bond->smmu_mn)) { @@ -379,12 +381,18 @@ __arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm) goto err_free_bond; } + ret = arm_smmu_write_ctx_desc(master, pasid, bond->smmu_mn->cd); + if (ret) + goto err_put_notifier; + list_add(&bond->list, &master->bonds); - return &bond->sva; + return 0; +err_put_notifier: + arm_smmu_mmu_notifier_put(bond->smmu_mn); err_free_bond: kfree(bond); - return ERR_PTR(ret); + return ret; } bool arm_smmu_sva_supported(struct arm_smmu_device *smmu) @@ -543,6 +551,9 @@ void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, struct arm_smmu_master *master = dev_iommu_priv_get(dev); mutex_lock(&sva_lock); + + arm_smmu_write_ctx_desc(master, id, NULL); + list_for_each_entry(t, &master->bonds, list) { if (t->mm == mm) { bond = t; @@ -550,7 +561,7 @@ void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, } } - if (!WARN_ON(!bond) && refcount_dec_and_test(&bond->refs)) { + if (!WARN_ON(!bond)) { list_del(&bond->list); arm_smmu_mmu_notifier_put(bond->smmu_mn); kfree(bond); @@ -562,13 +573,10 @@ static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t id) { int ret = 0; - struct iommu_sva *handle; struct mm_struct *mm = domain->mm; mutex_lock(&sva_lock); - handle = __arm_smmu_sva_bind(dev, mm); - if (IS_ERR(handle)) - ret = PTR_ERR(handle); + ret = __arm_smmu_sva_bind(dev, id, mm); mutex_unlock(&sva_lock); return ret; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index bd0a596f98..7445454c2a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -971,14 +971,12 @@ void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid) arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); } -static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain, +static void arm_smmu_sync_cd(struct arm_smmu_master *master, int ssid, bool leaf) { size_t i; - unsigned long flags; - struct arm_smmu_master *master; struct arm_smmu_cmdq_batch cmds; - struct arm_smmu_device *smmu = smmu_domain->smmu; + struct arm_smmu_device *smmu = master->smmu; struct arm_smmu_cmdq_ent cmd = { .opcode = CMDQ_OP_CFGI_CD, .cfgi = { @@ -988,15 +986,10 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain, }; cmds.num = 0; - - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master, &smmu_domain->devices, domain_head) { - for (i = 0; i < master->num_streams; i++) { - cmd.cfgi.sid = master->streams[i].id; - arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd); - } + for (i = 0; i < master->num_streams; i++) { + cmd.cfgi.sid = master->streams[i].id; + arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd); } - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); arm_smmu_cmdq_batch_submit(smmu, &cmds); } @@ -1026,34 +1019,33 @@ static void arm_smmu_write_cd_l1_desc(__le64 *dst, WRITE_ONCE(*dst, cpu_to_le64(val)); } -static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_domain *smmu_domain, - u32 ssid) +static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid) { __le64 *l1ptr; unsigned int idx; struct arm_smmu_l1_ctx_desc *l1_desc; - struct arm_smmu_device *smmu = smmu_domain->smmu; - struct arm_smmu_ctx_desc_cfg *cdcfg = &smmu_domain->s1_cfg.cdcfg; + struct arm_smmu_device *smmu = master->smmu; + struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; - if (smmu_domain->s1_cfg.s1fmt == STRTAB_STE_0_S1FMT_LINEAR) - return cdcfg->cdtab + ssid * CTXDESC_CD_DWORDS; + if (cd_table->s1fmt == STRTAB_STE_0_S1FMT_LINEAR) + return cd_table->cdtab + ssid * CTXDESC_CD_DWORDS; idx = ssid >> CTXDESC_SPLIT; - l1_desc = &cdcfg->l1_desc[idx]; + l1_desc = &cd_table->l1_desc[idx]; if (!l1_desc->l2ptr) { if (arm_smmu_alloc_cd_leaf_table(smmu, l1_desc)) return NULL; - l1ptr = cdcfg->cdtab + idx * CTXDESC_L1_DESC_DWORDS; + l1ptr = cd_table->cdtab + idx * CTXDESC_L1_DESC_DWORDS; arm_smmu_write_cd_l1_desc(l1ptr, l1_desc); /* An invalid L1CD can be cached */ - arm_smmu_sync_cd(smmu_domain, ssid, false); + arm_smmu_sync_cd(master, ssid, false); } idx = ssid & (CTXDESC_L2_ENTRIES - 1); return l1_desc->l2ptr + idx * CTXDESC_CD_DWORDS; } -int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, int ssid, +int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, struct arm_smmu_ctx_desc *cd) { /* @@ -1070,11 +1062,12 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, int ssid, u64 val; bool cd_live; __le64 *cdptr; + struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; - if (WARN_ON(ssid >= (1 << smmu_domain->s1_cfg.s1cdmax))) + if (WARN_ON(ssid >= (1 << cd_table->s1cdmax))) return -E2BIG; - cdptr = arm_smmu_get_cd_ptr(smmu_domain, ssid); + cdptr = arm_smmu_get_cd_ptr(master, ssid); if (!cdptr) return -ENOMEM; @@ -1098,11 +1091,11 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, int ssid, cdptr[3] = cpu_to_le64(cd->mair); /* - * STE is live, and the SMMU might read dwords of this CD in any + * STE may be live, and the SMMU might read dwords of this CD in any * order. Ensure that it observes valid values before reading * V=1. */ - arm_smmu_sync_cd(smmu_domain, ssid, true); + arm_smmu_sync_cd(master, ssid, true); val = cd->tcr | #ifdef __BIG_ENDIAN @@ -1114,7 +1107,7 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, int ssid, FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid) | CTXDESC_CD_0_V; - if (smmu_domain->stall_enabled) + if (cd_table->stall_enabled) val |= CTXDESC_CD_0_S; } @@ -1128,44 +1121,45 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, int ssid, * without first making the structure invalid. */ WRITE_ONCE(cdptr[0], cpu_to_le64(val)); - arm_smmu_sync_cd(smmu_domain, ssid, true); + arm_smmu_sync_cd(master, ssid, true); return 0; } -static int arm_smmu_alloc_cd_tables(struct arm_smmu_domain *smmu_domain) +static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) { int ret; size_t l1size; size_t max_contexts; - struct arm_smmu_device *smmu = smmu_domain->smmu; - struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg; - struct arm_smmu_ctx_desc_cfg *cdcfg = &cfg->cdcfg; + struct arm_smmu_device *smmu = master->smmu; + struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; - max_contexts = 1 << cfg->s1cdmax; + cd_table->stall_enabled = master->stall_enabled; + cd_table->s1cdmax = master->ssid_bits; + max_contexts = 1 << cd_table->s1cdmax; if (!(smmu->features & ARM_SMMU_FEAT_2_LVL_CDTAB) || max_contexts <= CTXDESC_L2_ENTRIES) { - cfg->s1fmt = STRTAB_STE_0_S1FMT_LINEAR; - cdcfg->num_l1_ents = max_contexts; + cd_table->s1fmt = STRTAB_STE_0_S1FMT_LINEAR; + cd_table->num_l1_ents = max_contexts; l1size = max_contexts * (CTXDESC_CD_DWORDS << 3); } else { - cfg->s1fmt = STRTAB_STE_0_S1FMT_64K_L2; - cdcfg->num_l1_ents = DIV_ROUND_UP(max_contexts, + cd_table->s1fmt = STRTAB_STE_0_S1FMT_64K_L2; + cd_table->num_l1_ents = DIV_ROUND_UP(max_contexts, CTXDESC_L2_ENTRIES); - cdcfg->l1_desc = devm_kcalloc(smmu->dev, cdcfg->num_l1_ents, - sizeof(*cdcfg->l1_desc), + cd_table->l1_desc = devm_kcalloc(smmu->dev, cd_table->num_l1_ents, + sizeof(*cd_table->l1_desc), GFP_KERNEL); - if (!cdcfg->l1_desc) + if (!cd_table->l1_desc) return -ENOMEM; - l1size = cdcfg->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3); + l1size = cd_table->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3); } - cdcfg->cdtab = dmam_alloc_coherent(smmu->dev, l1size, &cdcfg->cdtab_dma, + cd_table->cdtab = dmam_alloc_coherent(smmu->dev, l1size, &cd_table->cdtab_dma, GFP_KERNEL); - if (!cdcfg->cdtab) { + if (!cd_table->cdtab) { dev_warn(smmu->dev, "failed to allocate context descriptor\n"); ret = -ENOMEM; goto err_free_l1; @@ -1174,42 +1168,42 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_domain *smmu_domain) return 0; err_free_l1: - if (cdcfg->l1_desc) { - devm_kfree(smmu->dev, cdcfg->l1_desc); - cdcfg->l1_desc = NULL; + if (cd_table->l1_desc) { + devm_kfree(smmu->dev, cd_table->l1_desc); + cd_table->l1_desc = NULL; } return ret; } -static void arm_smmu_free_cd_tables(struct arm_smmu_domain *smmu_domain) +static void arm_smmu_free_cd_tables(struct arm_smmu_master *master) { int i; size_t size, l1size; - struct arm_smmu_device *smmu = smmu_domain->smmu; - struct arm_smmu_ctx_desc_cfg *cdcfg = &smmu_domain->s1_cfg.cdcfg; + struct arm_smmu_device *smmu = master->smmu; + struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; - if (cdcfg->l1_desc) { + if (cd_table->l1_desc) { size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3); - for (i = 0; i < cdcfg->num_l1_ents; i++) { - if (!cdcfg->l1_desc[i].l2ptr) + for (i = 0; i < cd_table->num_l1_ents; i++) { + if (!cd_table->l1_desc[i].l2ptr) continue; dmam_free_coherent(smmu->dev, size, - cdcfg->l1_desc[i].l2ptr, - cdcfg->l1_desc[i].l2ptr_dma); + cd_table->l1_desc[i].l2ptr, + cd_table->l1_desc[i].l2ptr_dma); } - devm_kfree(smmu->dev, cdcfg->l1_desc); - cdcfg->l1_desc = NULL; + devm_kfree(smmu->dev, cd_table->l1_desc); + cd_table->l1_desc = NULL; - l1size = cdcfg->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3); + l1size = cd_table->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3); } else { - l1size = cdcfg->num_l1_ents * (CTXDESC_CD_DWORDS << 3); + l1size = cd_table->num_l1_ents * (CTXDESC_CD_DWORDS << 3); } - dmam_free_coherent(smmu->dev, l1size, cdcfg->cdtab, cdcfg->cdtab_dma); - cdcfg->cdtab_dma = 0; - cdcfg->cdtab = NULL; + dmam_free_coherent(smmu->dev, l1size, cd_table->cdtab, cd_table->cdtab_dma); + cd_table->cdtab_dma = 0; + cd_table->cdtab = NULL; } bool arm_smmu_free_asid(struct arm_smmu_ctx_desc *cd) @@ -1276,7 +1270,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, u64 val = le64_to_cpu(dst[0]); bool ste_live = false; struct arm_smmu_device *smmu = NULL; - struct arm_smmu_s1_cfg *s1_cfg = NULL; + struct arm_smmu_ctx_desc_cfg *cd_table = NULL; struct arm_smmu_s2_cfg *s2_cfg = NULL; struct arm_smmu_domain *smmu_domain = NULL; struct arm_smmu_cmdq_ent prefetch_cmd = { @@ -1294,7 +1288,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, if (smmu_domain) { switch (smmu_domain->stage) { case ARM_SMMU_DOMAIN_S1: - s1_cfg = &smmu_domain->s1_cfg; + cd_table = &master->cd_table; break; case ARM_SMMU_DOMAIN_S2: case ARM_SMMU_DOMAIN_NESTED: @@ -1325,7 +1319,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, val = STRTAB_STE_0_V; /* Bypass/fault */ - if (!smmu_domain || !(s1_cfg || s2_cfg)) { + if (!smmu_domain || !(cd_table || s2_cfg)) { if (!smmu_domain && disable_bypass) val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT); else @@ -1344,7 +1338,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, return; } - if (s1_cfg) { + if (cd_table) { u64 strw = smmu->features & ARM_SMMU_FEAT_E2H ? STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1; @@ -1360,10 +1354,10 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, !master->stall_enabled) dst[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD); - val |= (s1_cfg->cdcfg.cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) | + val |= (cd_table->cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) | FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS) | - FIELD_PREP(STRTAB_STE_0_S1CDMAX, s1_cfg->s1cdmax) | - FIELD_PREP(STRTAB_STE_0_S1FMT, s1_cfg->s1fmt); + FIELD_PREP(STRTAB_STE_0_S1CDMAX, cd_table->s1cdmax) | + FIELD_PREP(STRTAB_STE_0_S1FMT, cd_table->s1fmt); } if (s2_cfg) { @@ -1869,7 +1863,7 @@ static void arm_smmu_tlb_inv_context(void *cookie) * careful, 007. */ if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { - arm_smmu_tlb_inv_asid(smmu, smmu_domain->s1_cfg.cd.asid); + arm_smmu_tlb_inv_asid(smmu, smmu_domain->cd.asid); } else { cmd.opcode = CMDQ_OP_TLBI_S12_VMALL; cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid; @@ -1962,7 +1956,7 @@ static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size, if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { cmd.opcode = smmu_domain->smmu->features & ARM_SMMU_FEAT_E2H ? CMDQ_OP_TLBI_EL2_VA : CMDQ_OP_TLBI_NH_VA; - cmd.tlbi.asid = smmu_domain->s1_cfg.cd.asid; + cmd.tlbi.asid = smmu_domain->cd.asid; } else { cmd.opcode = CMDQ_OP_TLBI_S2_IPA; cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid; @@ -2067,15 +2061,11 @@ static void arm_smmu_domain_free(struct iommu_domain *domain) free_io_pgtable_ops(smmu_domain->pgtbl_ops); - /* Free the CD and ASID, if we allocated them */ + /* Free the ASID or VMID */ if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { - struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg; - /* Prevent SVA from touching the CD while we're freeing it */ mutex_lock(&arm_smmu_asid_lock); - if (cfg->cdcfg.cdtab) - arm_smmu_free_cd_tables(smmu_domain); - arm_smmu_free_asid(&cfg->cd); + arm_smmu_free_asid(&smmu_domain->cd); mutex_unlock(&arm_smmu_asid_lock); } else { struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg; @@ -2087,66 +2077,43 @@ static void arm_smmu_domain_free(struct iommu_domain *domain) } static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain, - struct arm_smmu_master *master, struct io_pgtable_cfg *pgtbl_cfg) { int ret; u32 asid; struct arm_smmu_device *smmu = smmu_domain->smmu; - struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg; + struct arm_smmu_ctx_desc *cd = &smmu_domain->cd; typeof(&pgtbl_cfg->arm_lpae_s1_cfg.tcr) tcr = &pgtbl_cfg->arm_lpae_s1_cfg.tcr; - refcount_set(&cfg->cd.refs, 1); + refcount_set(&cd->refs, 1); /* Prevent SVA from modifying the ASID until it is written to the CD */ mutex_lock(&arm_smmu_asid_lock); - ret = xa_alloc(&arm_smmu_asid_xa, &asid, &cfg->cd, + ret = xa_alloc(&arm_smmu_asid_xa, &asid, cd, XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL); if (ret) goto out_unlock; - cfg->s1cdmax = master->ssid_bits; - - smmu_domain->stall_enabled = master->stall_enabled; - - ret = arm_smmu_alloc_cd_tables(smmu_domain); - if (ret) - goto out_free_asid; - - cfg->cd.asid = (u16)asid; - cfg->cd.ttbr = pgtbl_cfg->arm_lpae_s1_cfg.ttbr; - cfg->cd.tcr = FIELD_PREP(CTXDESC_CD_0_TCR_T0SZ, tcr->tsz) | + cd->asid = (u16)asid; + cd->ttbr = pgtbl_cfg->arm_lpae_s1_cfg.ttbr; + cd->tcr = FIELD_PREP(CTXDESC_CD_0_TCR_T0SZ, tcr->tsz) | FIELD_PREP(CTXDESC_CD_0_TCR_TG0, tcr->tg) | FIELD_PREP(CTXDESC_CD_0_TCR_IRGN0, tcr->irgn) | FIELD_PREP(CTXDESC_CD_0_TCR_ORGN0, tcr->orgn) | FIELD_PREP(CTXDESC_CD_0_TCR_SH0, tcr->sh) | FIELD_PREP(CTXDESC_CD_0_TCR_IPS, tcr->ips) | CTXDESC_CD_0_TCR_EPD1 | CTXDESC_CD_0_AA64; - cfg->cd.mair = pgtbl_cfg->arm_lpae_s1_cfg.mair; - - /* - * Note that this will end up calling arm_smmu_sync_cd() before - * the master has been added to the devices list for this domain. - * This isn't an issue because the STE hasn't been installed yet. - */ - ret = arm_smmu_write_ctx_desc(smmu_domain, IOMMU_NO_PASID, &cfg->cd); - if (ret) - goto out_free_cd_tables; + cd->mair = pgtbl_cfg->arm_lpae_s1_cfg.mair; mutex_unlock(&arm_smmu_asid_lock); return 0; -out_free_cd_tables: - arm_smmu_free_cd_tables(smmu_domain); -out_free_asid: - arm_smmu_free_asid(&cfg->cd); out_unlock: mutex_unlock(&arm_smmu_asid_lock); return ret; } static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain, - struct arm_smmu_master *master, struct io_pgtable_cfg *pgtbl_cfg) { int vmid; @@ -2173,8 +2140,7 @@ static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain, return 0; } -static int arm_smmu_domain_finalise(struct iommu_domain *domain, - struct arm_smmu_master *master) +static int arm_smmu_domain_finalise(struct iommu_domain *domain) { int ret; unsigned long ias, oas; @@ -2182,7 +2148,6 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain, struct io_pgtable_cfg pgtbl_cfg; struct io_pgtable_ops *pgtbl_ops; int (*finalise_stage_fn)(struct arm_smmu_domain *, - struct arm_smmu_master *, struct io_pgtable_cfg *); struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; @@ -2234,7 +2199,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain, domain->geometry.aperture_end = (1UL << pgtbl_cfg.ias) - 1; domain->geometry.force_aperture = true; - ret = finalise_stage_fn(smmu_domain, master, &pgtbl_cfg); + ret = finalise_stage_fn(smmu_domain, &pgtbl_cfg); if (ret < 0) { free_io_pgtable_ops(pgtbl_ops); return ret; @@ -2403,6 +2368,14 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master) master->domain = NULL; master->ats_enabled = false; arm_smmu_install_ste_for_dev(master); + /* + * Clearing the CD entry isn't strictly required to detach the domain + * since the table is uninstalled anyway, but it helps avoid confusion + * in the call to arm_smmu_write_ctx_desc on the next attach (which + * expects the entry to be empty). + */ + if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 && master->cd_table.cdtab) + arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, NULL); } static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) @@ -2436,23 +2409,15 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) if (!smmu_domain->smmu) { smmu_domain->smmu = smmu; - ret = arm_smmu_domain_finalise(domain, master); - if (ret) { + ret = arm_smmu_domain_finalise(domain); + if (ret) smmu_domain->smmu = NULL; - goto out_unlock; - } - } else if (smmu_domain->smmu != smmu) { - ret = -EINVAL; - goto out_unlock; - } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 && - master->ssid_bits != smmu_domain->s1_cfg.s1cdmax) { + } else if (smmu_domain->smmu != smmu) ret = -EINVAL; - goto out_unlock; - } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 && - smmu_domain->stall_enabled != master->stall_enabled) { - ret = -EINVAL; - goto out_unlock; - } + + mutex_unlock(&smmu_domain->init_mutex); + if (ret) + return ret; master->domain = smmu_domain; @@ -2466,16 +2431,42 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) if (smmu_domain->stage != ARM_SMMU_DOMAIN_BYPASS) master->ats_enabled = arm_smmu_ats_supported(master); - arm_smmu_install_ste_for_dev(master); - spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_add(&master->domain_head, &smmu_domain->devices); spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { + if (!master->cd_table.cdtab) { + ret = arm_smmu_alloc_cd_tables(master); + if (ret) { + master->domain = NULL; + goto out_list_del; + } + } + + /* + * Prevent SVA from concurrently modifying the CD or writing to + * the CD entry + */ + mutex_lock(&arm_smmu_asid_lock); + ret = arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, &smmu_domain->cd); + mutex_unlock(&arm_smmu_asid_lock); + if (ret) { + master->domain = NULL; + goto out_list_del; + } + } + + arm_smmu_install_ste_for_dev(master); + arm_smmu_enable_ats(master); + return 0; + +out_list_del: + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_del(&master->domain_head); + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); -out_unlock: - mutex_unlock(&smmu_domain->init_mutex); return ret; } @@ -2720,6 +2711,8 @@ static void arm_smmu_release_device(struct device *dev) arm_smmu_detach_dev(master); arm_smmu_disable_pasid(master); arm_smmu_remove_master(master); + if (master->cd_table.cdtab) + arm_smmu_free_cd_tables(master); kfree(master); } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 9915850dd4..961205ba86 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -595,13 +595,11 @@ struct arm_smmu_ctx_desc_cfg { dma_addr_t cdtab_dma; struct arm_smmu_l1_ctx_desc *l1_desc; unsigned int num_l1_ents; -}; - -struct arm_smmu_s1_cfg { - struct arm_smmu_ctx_desc_cfg cdcfg; - struct arm_smmu_ctx_desc cd; u8 s1fmt; + /* log2 of the maximum number of CDs supported by this table */ u8 s1cdmax; + /* Whether CD entries in this table have the stall bit set. */ + u8 stall_enabled:1; }; struct arm_smmu_s2_cfg { @@ -697,6 +695,8 @@ struct arm_smmu_master { struct arm_smmu_domain *domain; struct list_head domain_head; struct arm_smmu_stream *streams; + /* Locked by the iommu core using the group mutex */ + struct arm_smmu_ctx_desc_cfg cd_table; unsigned int num_streams; bool ats_enabled; bool stall_enabled; @@ -719,13 +719,12 @@ struct arm_smmu_domain { struct mutex init_mutex; /* Protects smmu pointer */ struct io_pgtable_ops *pgtbl_ops; - bool stall_enabled; atomic_t nr_ats_masters; enum arm_smmu_domain_stage stage; union { - struct arm_smmu_s1_cfg s1_cfg; - struct arm_smmu_s2_cfg s2_cfg; + struct arm_smmu_ctx_desc cd; + struct arm_smmu_s2_cfg s2_cfg; }; struct iommu_domain domain; @@ -745,7 +744,7 @@ extern struct xarray arm_smmu_asid_xa; extern struct mutex arm_smmu_asid_lock; extern struct arm_smmu_ctx_desc quiet_cd; -int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, int ssid, +int arm_smmu_write_ctx_desc(struct arm_smmu_master *smmu_master, int ssid, struct arm_smmu_ctx_desc *cd); void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid); void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid, diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 40503376d8..d326fa230b 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -252,6 +252,7 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { .compatible = "qcom,sc7280-mss-pil" }, { .compatible = "qcom,sc8180x-mdss" }, { .compatible = "qcom,sc8280xp-mdss" }, + { .compatible = "qcom,sdm670-mdss" }, { .compatible = "qcom,sdm845-mdss" }, { .compatible = "qcom,sdm845-mss-pil" }, { .compatible = "qcom,sm6350-mdss" }, @@ -533,6 +534,7 @@ static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = { { .compatible = "qcom,sm6350-smmu-500", .data = &qcom_smmu_500_impl0_data }, { .compatible = "qcom,sm6375-smmu-v2", .data = &qcom_smmu_v2_data }, { .compatible = "qcom,sm6375-smmu-500", .data = &qcom_smmu_500_impl0_data }, + { .compatible = "qcom,sm7150-smmu-v2", .data = &qcom_smmu_v2_data }, { .compatible = "qcom,sm8150-smmu-500", .data = &qcom_smmu_500_impl0_data }, { .compatible = "qcom,sm8250-smmu-500", .data = &qcom_smmu_500_impl0_data }, { .compatible = "qcom,sm8350-smmu-500", .data = &qcom_smmu_500_impl0_data }, diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index 775a3cbaff..97b2122032 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -332,12 +332,10 @@ out_unlock: return ret; } -static struct iommu_domain *qcom_iommu_domain_alloc(unsigned type) +static struct iommu_domain *qcom_iommu_domain_alloc_paging(struct device *dev) { struct qcom_iommu_domain *qcom_domain; - if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA) - return NULL; /* * Allocate the domain and initialise some of its data structures. * We can't really do anything meaningful until we've added a @@ -400,6 +398,44 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev return 0; } +static int qcom_iommu_identity_attach(struct iommu_domain *identity_domain, + struct device *dev) +{ + struct iommu_domain *domain = iommu_get_domain_for_dev(dev); + struct qcom_iommu_domain *qcom_domain; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct qcom_iommu_dev *qcom_iommu = to_iommu(dev); + unsigned int i; + + if (domain == identity_domain || !domain) + return 0; + + qcom_domain = to_qcom_iommu_domain(domain); + if (WARN_ON(!qcom_domain->iommu)) + return -EINVAL; + + pm_runtime_get_sync(qcom_iommu->dev); + for (i = 0; i < fwspec->num_ids; i++) { + struct qcom_iommu_ctx *ctx = to_ctx(qcom_domain, fwspec->ids[i]); + + /* Disable the context bank: */ + iommu_writel(ctx, ARM_SMMU_CB_SCTLR, 0); + + ctx->domain = NULL; + } + pm_runtime_put_sync(qcom_iommu->dev); + return 0; +} + +static struct iommu_domain_ops qcom_iommu_identity_ops = { + .attach_dev = qcom_iommu_identity_attach, +}; + +static struct iommu_domain qcom_iommu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &qcom_iommu_identity_ops, +}; + static int qcom_iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -565,8 +601,9 @@ static int qcom_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) } static const struct iommu_ops qcom_iommu_ops = { + .identity_domain = &qcom_iommu_identity_domain, .capable = qcom_iommu_capable, - .domain_alloc = qcom_iommu_domain_alloc, + .domain_alloc_paging = qcom_iommu_domain_alloc_paging, .probe_device = qcom_iommu_probe_device, .device_group = generic_device_group, .of_xlate = qcom_iommu_of_xlate, diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index e5d087bd6d..037fcf8264 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -44,14 +44,28 @@ enum iommu_dma_cookie_type { IOMMU_DMA_MSI_COOKIE, }; +enum iommu_dma_queue_type { + IOMMU_DMA_OPTS_PER_CPU_QUEUE, + IOMMU_DMA_OPTS_SINGLE_QUEUE, +}; + +struct iommu_dma_options { + enum iommu_dma_queue_type qt; + size_t fq_size; + unsigned int fq_timeout; +}; + struct iommu_dma_cookie { enum iommu_dma_cookie_type type; union { /* Full allocator for IOMMU_DMA_IOVA_COOKIE */ struct { struct iova_domain iovad; - - struct iova_fq __percpu *fq; /* Flush queue */ + /* Flush queue */ + union { + struct iova_fq *single_fq; + struct iova_fq __percpu *percpu_fq; + }; /* Number of TLB flushes that have been started */ atomic64_t fq_flush_start_cnt; /* Number of TLB flushes that have been finished */ @@ -68,6 +82,8 @@ struct iommu_dma_cookie { /* Domain for flush queue callback; NULL if flush queue not in use */ struct iommu_domain *fq_domain; + /* Options for dma-iommu use */ + struct iommu_dma_options options; struct mutex mutex; }; @@ -85,10 +101,12 @@ static int __init iommu_dma_forcedac_setup(char *str) early_param("iommu.forcedac", iommu_dma_forcedac_setup); /* Number of entries per flush queue */ -#define IOVA_FQ_SIZE 256 +#define IOVA_DEFAULT_FQ_SIZE 256 +#define IOVA_SINGLE_FQ_SIZE 32768 /* Timeout (in ms) after which entries are flushed from the queue */ -#define IOVA_FQ_TIMEOUT 10 +#define IOVA_DEFAULT_FQ_TIMEOUT 10 +#define IOVA_SINGLE_FQ_TIMEOUT 1000 /* Flush queue entry for deferred flushing */ struct iova_fq_entry { @@ -100,18 +118,19 @@ struct iova_fq_entry { /* Per-CPU flush queue structure */ struct iova_fq { - struct iova_fq_entry entries[IOVA_FQ_SIZE]; - unsigned int head, tail; spinlock_t lock; + unsigned int head, tail; + unsigned int mod_mask; + struct iova_fq_entry entries[]; }; #define fq_ring_for_each(i, fq) \ - for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) % IOVA_FQ_SIZE) + for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) & (fq)->mod_mask) static inline bool fq_full(struct iova_fq *fq) { assert_spin_locked(&fq->lock); - return (((fq->tail + 1) % IOVA_FQ_SIZE) == fq->head); + return (((fq->tail + 1) & fq->mod_mask) == fq->head); } static inline unsigned int fq_ring_add(struct iova_fq *fq) @@ -120,12 +139,12 @@ static inline unsigned int fq_ring_add(struct iova_fq *fq) assert_spin_locked(&fq->lock); - fq->tail = (idx + 1) % IOVA_FQ_SIZE; + fq->tail = (idx + 1) & fq->mod_mask; return idx; } -static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq) +static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq *fq) { u64 counter = atomic64_read(&cookie->fq_flush_finish_cnt); unsigned int idx; @@ -142,10 +161,19 @@ static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq) fq->entries[idx].iova_pfn, fq->entries[idx].pages); - fq->head = (fq->head + 1) % IOVA_FQ_SIZE; + fq->head = (fq->head + 1) & fq->mod_mask; } } +static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq) +{ + unsigned long flags; + + spin_lock_irqsave(&fq->lock, flags); + fq_ring_free_locked(cookie, fq); + spin_unlock_irqrestore(&fq->lock, flags); +} + static void fq_flush_iotlb(struct iommu_dma_cookie *cookie) { atomic64_inc(&cookie->fq_flush_start_cnt); @@ -161,14 +189,11 @@ static void fq_flush_timeout(struct timer_list *t) atomic_set(&cookie->fq_timer_on, 0); fq_flush_iotlb(cookie); - for_each_possible_cpu(cpu) { - unsigned long flags; - struct iova_fq *fq; - - fq = per_cpu_ptr(cookie->fq, cpu); - spin_lock_irqsave(&fq->lock, flags); - fq_ring_free(cookie, fq); - spin_unlock_irqrestore(&fq->lock, flags); + if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE) { + fq_ring_free(cookie, cookie->single_fq); + } else { + for_each_possible_cpu(cpu) + fq_ring_free(cookie, per_cpu_ptr(cookie->percpu_fq, cpu)); } } @@ -189,7 +214,11 @@ static void queue_iova(struct iommu_dma_cookie *cookie, */ smp_mb(); - fq = raw_cpu_ptr(cookie->fq); + if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE) + fq = cookie->single_fq; + else + fq = raw_cpu_ptr(cookie->percpu_fq); + spin_lock_irqsave(&fq->lock, flags); /* @@ -197,11 +226,11 @@ static void queue_iova(struct iommu_dma_cookie *cookie, * flushed out on another CPU. This makes the fq_full() check below less * likely to be true. */ - fq_ring_free(cookie, fq); + fq_ring_free_locked(cookie, fq); if (fq_full(fq)) { fq_flush_iotlb(cookie); - fq_ring_free(cookie, fq); + fq_ring_free_locked(cookie, fq); } idx = fq_ring_add(fq); @@ -217,34 +246,95 @@ static void queue_iova(struct iommu_dma_cookie *cookie, if (!atomic_read(&cookie->fq_timer_on) && !atomic_xchg(&cookie->fq_timer_on, 1)) mod_timer(&cookie->fq_timer, - jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT)); + jiffies + msecs_to_jiffies(cookie->options.fq_timeout)); } -static void iommu_dma_free_fq(struct iommu_dma_cookie *cookie) +static void iommu_dma_free_fq_single(struct iova_fq *fq) { - int cpu, idx; + int idx; - if (!cookie->fq) - return; + fq_ring_for_each(idx, fq) + put_pages_list(&fq->entries[idx].freelist); + vfree(fq); +} + +static void iommu_dma_free_fq_percpu(struct iova_fq __percpu *percpu_fq) +{ + int cpu, idx; - del_timer_sync(&cookie->fq_timer); /* The IOVAs will be torn down separately, so just free our queued pages */ for_each_possible_cpu(cpu) { - struct iova_fq *fq = per_cpu_ptr(cookie->fq, cpu); + struct iova_fq *fq = per_cpu_ptr(percpu_fq, cpu); fq_ring_for_each(idx, fq) put_pages_list(&fq->entries[idx].freelist); } - free_percpu(cookie->fq); + free_percpu(percpu_fq); +} + +static void iommu_dma_free_fq(struct iommu_dma_cookie *cookie) +{ + if (!cookie->fq_domain) + return; + + del_timer_sync(&cookie->fq_timer); + if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE) + iommu_dma_free_fq_single(cookie->single_fq); + else + iommu_dma_free_fq_percpu(cookie->percpu_fq); +} + +static void iommu_dma_init_one_fq(struct iova_fq *fq, size_t fq_size) +{ + int i; + + fq->head = 0; + fq->tail = 0; + fq->mod_mask = fq_size - 1; + + spin_lock_init(&fq->lock); + + for (i = 0; i < fq_size; i++) + INIT_LIST_HEAD(&fq->entries[i].freelist); +} + +static int iommu_dma_init_fq_single(struct iommu_dma_cookie *cookie) +{ + size_t fq_size = cookie->options.fq_size; + struct iova_fq *queue; + + queue = vmalloc(struct_size(queue, entries, fq_size)); + if (!queue) + return -ENOMEM; + iommu_dma_init_one_fq(queue, fq_size); + cookie->single_fq = queue; + + return 0; +} + +static int iommu_dma_init_fq_percpu(struct iommu_dma_cookie *cookie) +{ + size_t fq_size = cookie->options.fq_size; + struct iova_fq __percpu *queue; + int cpu; + + queue = __alloc_percpu(struct_size(queue, entries, fq_size), + __alignof__(*queue)); + if (!queue) + return -ENOMEM; + + for_each_possible_cpu(cpu) + iommu_dma_init_one_fq(per_cpu_ptr(queue, cpu), fq_size); + cookie->percpu_fq = queue; + return 0; } /* sysfs updates are serialised by the mutex of the group owning @domain */ int iommu_dma_init_fq(struct iommu_domain *domain) { struct iommu_dma_cookie *cookie = domain->iova_cookie; - struct iova_fq __percpu *queue; - int i, cpu; + int rc; if (cookie->fq_domain) return 0; @@ -252,26 +342,16 @@ int iommu_dma_init_fq(struct iommu_domain *domain) atomic64_set(&cookie->fq_flush_start_cnt, 0); atomic64_set(&cookie->fq_flush_finish_cnt, 0); - queue = alloc_percpu(struct iova_fq); - if (!queue) { + if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE) + rc = iommu_dma_init_fq_single(cookie); + else + rc = iommu_dma_init_fq_percpu(cookie); + + if (rc) { pr_warn("iova flush queue initialization failed\n"); return -ENOMEM; } - for_each_possible_cpu(cpu) { - struct iova_fq *fq = per_cpu_ptr(queue, cpu); - - fq->head = 0; - fq->tail = 0; - - spin_lock_init(&fq->lock); - - for (i = 0; i < IOVA_FQ_SIZE; i++) - INIT_LIST_HEAD(&fq->entries[i].freelist); - } - - cookie->fq = queue; - timer_setup(&cookie->fq_timer, fq_flush_timeout, 0); atomic_set(&cookie->fq_timer_on, 0); /* @@ -556,6 +636,28 @@ static bool dev_use_sg_swiotlb(struct device *dev, struct scatterlist *sg, } /** + * iommu_dma_init_options - Initialize dma-iommu options + * @options: The options to be initialized + * @dev: Device the options are set for + * + * This allows tuning dma-iommu specific to device properties + */ +static void iommu_dma_init_options(struct iommu_dma_options *options, + struct device *dev) +{ + /* Shadowing IOTLB flushes do better with a single large queue */ + if (dev->iommu->shadow_on_flush) { + options->qt = IOMMU_DMA_OPTS_SINGLE_QUEUE; + options->fq_timeout = IOVA_SINGLE_FQ_TIMEOUT; + options->fq_size = IOVA_SINGLE_FQ_SIZE; + } else { + options->qt = IOMMU_DMA_OPTS_PER_CPU_QUEUE; + options->fq_size = IOVA_DEFAULT_FQ_SIZE; + options->fq_timeout = IOVA_DEFAULT_FQ_TIMEOUT; + } +} + +/** * iommu_dma_init_domain - Initialise a DMA mapping domain * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() * @base: IOVA at which the mappable address space starts @@ -615,6 +717,8 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, if (ret) goto done_unlock; + iommu_dma_init_options(&cookie->options, dev); + /* If the FQ fails we can simply fall back to strict mode */ if (domain->type == IOMMU_DOMAIN_DMA_FQ && (!device_iommu_capable(dev, IOMMU_CAP_DEFERRED_FLUSH) || iommu_dma_init_fq(domain))) diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index c275fe71c4..2c6e9094f1 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -24,6 +24,7 @@ typedef u32 sysmmu_iova_t; typedef u32 sysmmu_pte_t; +static struct iommu_domain exynos_identity_domain; /* We do not consider super section mapping (16MB) */ #define SECT_ORDER 20 @@ -829,7 +830,7 @@ static int __maybe_unused exynos_sysmmu_suspend(struct device *dev) struct exynos_iommu_owner *owner = dev_iommu_priv_get(master); mutex_lock(&owner->rpm_lock); - if (data->domain) { + if (&data->domain->domain != &exynos_identity_domain) { dev_dbg(data->sysmmu, "saving state\n"); __sysmmu_disable(data); } @@ -847,7 +848,7 @@ static int __maybe_unused exynos_sysmmu_resume(struct device *dev) struct exynos_iommu_owner *owner = dev_iommu_priv_get(master); mutex_lock(&owner->rpm_lock); - if (data->domain) { + if (&data->domain->domain != &exynos_identity_domain) { dev_dbg(data->sysmmu, "restoring state\n"); __sysmmu_enable(data); } @@ -886,7 +887,7 @@ static inline void exynos_iommu_set_pte(sysmmu_pte_t *ent, sysmmu_pte_t val) DMA_TO_DEVICE); } -static struct iommu_domain *exynos_iommu_domain_alloc(unsigned type) +static struct iommu_domain *exynos_iommu_domain_alloc_paging(struct device *dev) { struct exynos_iommu_domain *domain; dma_addr_t handle; @@ -895,9 +896,6 @@ static struct iommu_domain *exynos_iommu_domain_alloc(unsigned type) /* Check if correct PTE offsets are initialized */ BUG_ON(PG_ENT_SHIFT < 0 || !dma_dev); - if (type != IOMMU_DOMAIN_DMA && type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return NULL; @@ -980,17 +978,20 @@ static void exynos_iommu_domain_free(struct iommu_domain *iommu_domain) kfree(domain); } -static void exynos_iommu_detach_device(struct iommu_domain *iommu_domain, - struct device *dev) +static int exynos_iommu_identity_attach(struct iommu_domain *identity_domain, + struct device *dev) { - struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain); struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); - phys_addr_t pagetable = virt_to_phys(domain->pgtable); + struct exynos_iommu_domain *domain; + phys_addr_t pagetable; struct sysmmu_drvdata *data, *next; unsigned long flags; - if (!has_sysmmu(dev) || owner->domain != iommu_domain) - return; + if (owner->domain == identity_domain) + return 0; + + domain = to_exynos_domain(owner->domain); + pagetable = virt_to_phys(domain->pgtable); mutex_lock(&owner->rpm_lock); @@ -1009,15 +1010,25 @@ static void exynos_iommu_detach_device(struct iommu_domain *iommu_domain, list_del_init(&data->domain_node); spin_unlock(&data->lock); } - owner->domain = NULL; + owner->domain = identity_domain; spin_unlock_irqrestore(&domain->lock, flags); mutex_unlock(&owner->rpm_lock); - dev_dbg(dev, "%s: Detached IOMMU with pgtable %pa\n", __func__, - &pagetable); + dev_dbg(dev, "%s: Restored IOMMU to IDENTITY from pgtable %pa\n", + __func__, &pagetable); + return 0; } +static struct iommu_domain_ops exynos_identity_ops = { + .attach_dev = exynos_iommu_identity_attach, +}; + +static struct iommu_domain exynos_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &exynos_identity_ops, +}; + static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain, struct device *dev) { @@ -1026,12 +1037,11 @@ static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain, struct sysmmu_drvdata *data; phys_addr_t pagetable = virt_to_phys(domain->pgtable); unsigned long flags; + int err; - if (!has_sysmmu(dev)) - return -ENODEV; - - if (owner->domain) - exynos_iommu_detach_device(owner->domain, dev); + err = exynos_iommu_identity_attach(&exynos_identity_domain, dev); + if (err) + return err; mutex_lock(&owner->rpm_lock); @@ -1219,7 +1229,7 @@ static int lv2set_page(sysmmu_pte_t *pent, phys_addr_t paddr, size_t size, */ static int exynos_iommu_map(struct iommu_domain *iommu_domain, unsigned long l_iova, phys_addr_t paddr, size_t size, - int prot, gfp_t gfp) + size_t count, int prot, gfp_t gfp, size_t *mapped) { struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain); sysmmu_pte_t *entry; @@ -1253,6 +1263,8 @@ static int exynos_iommu_map(struct iommu_domain *iommu_domain, if (ret) pr_err("%s: Failed(%d) to map %#zx bytes @ %#x\n", __func__, ret, size, iova); + else + *mapped = size; spin_unlock_irqrestore(&domain->pgtablelock, flags); @@ -1274,7 +1286,7 @@ static void exynos_iommu_tlb_invalidate_entry(struct exynos_iommu_domain *domain } static size_t exynos_iommu_unmap(struct iommu_domain *iommu_domain, - unsigned long l_iova, size_t size, + unsigned long l_iova, size_t size, size_t count, struct iommu_iotlb_gather *gather) { struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain); @@ -1407,26 +1419,12 @@ static struct iommu_device *exynos_iommu_probe_device(struct device *dev) return &data->iommu; } -static void exynos_iommu_set_platform_dma(struct device *dev) -{ - struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); - - if (owner->domain) { - struct iommu_group *group = iommu_group_get(dev); - - if (group) { - exynos_iommu_detach_device(owner->domain, dev); - iommu_group_put(group); - } - } -} - static void exynos_iommu_release_device(struct device *dev) { struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); struct sysmmu_drvdata *data; - exynos_iommu_set_platform_dma(dev); + WARN_ON(exynos_iommu_identity_attach(&exynos_identity_domain, dev)); list_for_each_entry(data, &owner->controllers, owner_node) device_link_del(data->link); @@ -1457,6 +1455,7 @@ static int exynos_iommu_of_xlate(struct device *dev, INIT_LIST_HEAD(&owner->controllers); mutex_init(&owner->rpm_lock); + owner->domain = &exynos_identity_domain; dev_iommu_priv_set(dev, owner); } @@ -1471,19 +1470,17 @@ static int exynos_iommu_of_xlate(struct device *dev, } static const struct iommu_ops exynos_iommu_ops = { - .domain_alloc = exynos_iommu_domain_alloc, + .identity_domain = &exynos_identity_domain, + .domain_alloc_paging = exynos_iommu_domain_alloc_paging, .device_group = generic_device_group, -#ifdef CONFIG_ARM - .set_platform_dma_ops = exynos_iommu_set_platform_dma, -#endif .probe_device = exynos_iommu_probe_device, .release_device = exynos_iommu_release_device, .pgsize_bitmap = SECT_SIZE | LPAGE_SIZE | SPAGE_SIZE, .of_xlate = exynos_iommu_of_xlate, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = exynos_iommu_attach_device, - .map = exynos_iommu_map, - .unmap = exynos_iommu_unmap, + .map_pages = exynos_iommu_map, + .unmap_pages = exynos_iommu_unmap, .iova_to_phys = exynos_iommu_iova_to_phys, .free = exynos_iommu_domain_free, } diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c index 4ac0e247ec..e9d2bff465 100644 --- a/drivers/iommu/fsl_pamu_domain.c +++ b/drivers/iommu/fsl_pamu_domain.c @@ -196,6 +196,13 @@ static struct iommu_domain *fsl_pamu_domain_alloc(unsigned type) { struct fsl_dma_domain *dma_domain; + /* + * FIXME: This isn't creating an unmanaged domain since the + * default_domain_ops do not have any map/unmap function it doesn't meet + * the requirements for __IOMMU_DOMAIN_PAGING. The only purpose seems to + * allow drivers/soc/fsl/qbman/qman_portal.c to do + * fsl_pamu_configure_l1_stash() + */ if (type != IOMMU_DOMAIN_UNMANAGED) return NULL; @@ -283,16 +290,34 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain, return ret; } -static void fsl_pamu_set_platform_dma(struct device *dev) +/* + * FIXME: fsl/pamu is completely broken in terms of how it works with the iommu + * API. Immediately after probe the HW is left in an IDENTITY translation and + * the driver provides a non-working UNMANAGED domain that it can switch over + * to. However it cannot switch back to an IDENTITY translation, instead it + * switches to what looks like BLOCKING. + */ +static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain, + struct device *dev) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain); + struct fsl_dma_domain *dma_domain; const u32 *prop; int len; struct pci_dev *pdev = NULL; struct pci_controller *pci_ctl; /* + * Hack to keep things working as they always have, only leaving an + * UNMANAGED domain makes it BLOCKING. + */ + if (domain == platform_domain || !domain || + domain->type != IOMMU_DOMAIN_UNMANAGED) + return 0; + + dma_domain = to_fsl_dma_domain(domain); + + /* * Use LIODN of the PCI controller while detaching a * PCI device. */ @@ -312,8 +337,18 @@ static void fsl_pamu_set_platform_dma(struct device *dev) detach_device(dev, dma_domain); else pr_debug("missing fsl,liodn property at %pOF\n", dev->of_node); + return 0; } +static struct iommu_domain_ops fsl_pamu_platform_ops = { + .attach_dev = fsl_pamu_platform_attach, +}; + +static struct iommu_domain fsl_pamu_platform_domain = { + .type = IOMMU_DOMAIN_PLATFORM, + .ops = &fsl_pamu_platform_ops, +}; + /* Set the domain stash attribute */ int fsl_pamu_configure_l1_stash(struct iommu_domain *domain, u32 cpu) { @@ -395,11 +430,11 @@ static struct iommu_device *fsl_pamu_probe_device(struct device *dev) } static const struct iommu_ops fsl_pamu_ops = { + .default_domain = &fsl_pamu_platform_domain, .capable = fsl_pamu_capable, .domain_alloc = fsl_pamu_domain_alloc, .probe_device = fsl_pamu_probe_device, .device_group = fsl_pamu_device_group, - .set_platform_dma_ops = fsl_pamu_set_platform_dma, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = fsl_pamu_attach_device, .iova_to_phys = fsl_pamu_iova_to_phys, diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 2e56bd79f5..012cd2541a 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -11,10 +11,11 @@ config DMAR_DEBUG config INTEL_IOMMU bool "Support for Intel IOMMU using DMA Remapping Devices" - depends on PCI_MSI && ACPI && (X86 || IA64) + depends on PCI_MSI && ACPI && X86 select DMA_OPS select IOMMU_API select IOMMU_IOVA + select IOMMUFD_DRIVER if IOMMUFD select NEED_DMA_MAP_STATE select DMAR_TABLE select SWIOTLB diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile index 7af3b8a4f2..5dabf081a7 100644 --- a/drivers/iommu/intel/Makefile +++ b/drivers/iommu/intel/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DMAR_TABLE) += dmar.o -obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o +obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o obj-$(CONFIG_DMAR_PERF) += perf.o obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index 1f92528510..dee61e513b 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -111,6 +111,8 @@ static const struct iommu_regset iommu_regs_64[] = { IOMMU_REGSET_ENTRY(VCRSP), }; +static struct dentry *intel_iommu_debug; + static int iommu_regset_show(struct seq_file *m, void *unused) { struct dmar_drhd_unit *drhd; @@ -311,9 +313,14 @@ static inline unsigned long level_to_directory_size(int level) static inline void dump_page_info(struct seq_file *m, unsigned long iova, u64 *path) { - seq_printf(m, "0x%013lx |\t0x%016llx\t0x%016llx\t0x%016llx\t0x%016llx\t0x%016llx\n", - iova >> VTD_PAGE_SHIFT, path[5], path[4], - path[3], path[2], path[1]); + seq_printf(m, "0x%013lx |\t0x%016llx\t0x%016llx\t0x%016llx", + iova >> VTD_PAGE_SHIFT, path[5], path[4], path[3]); + if (path[2]) { + seq_printf(m, "\t0x%016llx", path[2]); + if (path[1]) + seq_printf(m, "\t0x%016llx", path[1]); + } + seq_putc(m, '\n'); } static void pgtable_walk_level(struct seq_file *m, struct dma_pte *pde, @@ -340,58 +347,140 @@ static void pgtable_walk_level(struct seq_file *m, struct dma_pte *pde, } } -static int __show_device_domain_translation(struct device *dev, void *data) +static int domain_translation_struct_show(struct seq_file *m, + struct device_domain_info *info, + ioasid_t pasid) { - struct dmar_domain *domain; - struct seq_file *m = data; - u64 path[6] = { 0 }; - - domain = to_dmar_domain(iommu_get_domain_for_dev(dev)); - if (!domain) - return 0; + bool scalable, found = false; + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + u16 devfn, bus, seg; - seq_printf(m, "Device %s @0x%llx\n", dev_name(dev), - (u64)virt_to_phys(domain->pgd)); - seq_puts(m, "IOVA_PFN\t\tPML5E\t\t\tPML4E\t\t\tPDPE\t\t\tPDE\t\t\tPTE\n"); + bus = info->bus; + devfn = info->devfn; + seg = info->segment; - pgtable_walk_level(m, domain->pgd, domain->agaw + 2, 0, path); - seq_putc(m, '\n'); + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + struct context_entry *context; + u64 pgd, path[6] = { 0 }; + u32 sts, agaw; - /* Don't iterate */ - return 1; -} + if (seg != iommu->segment) + continue; -static int show_device_domain_translation(struct device *dev, void *data) -{ - struct iommu_group *group; + sts = dmar_readl(iommu->reg + DMAR_GSTS_REG); + if (!(sts & DMA_GSTS_TES)) { + seq_printf(m, "DMA Remapping is not enabled on %s\n", + iommu->name); + continue; + } + if (dmar_readq(iommu->reg + DMAR_RTADDR_REG) & DMA_RTADDR_SMT) + scalable = true; + else + scalable = false; - group = iommu_group_get(dev); - if (group) { /* - * The group->mutex is held across the callback, which will - * block calls to iommu_attach/detach_group/device. Hence, + * The iommu->lock is held across the callback, which will + * block calls to domain_attach/domain_detach. Hence, * the domain of the device will not change during traversal. * - * All devices in an iommu group share a single domain, hence - * we only dump the domain of the first device. Even though, - * this code still possibly races with the iommu_unmap() + * Traversing page table possibly races with the iommu_unmap() * interface. This could be solved by RCU-freeing the page * table pages in the iommu_unmap() path. */ - iommu_group_for_each_dev(group, data, - __show_device_domain_translation); - iommu_group_put(group); + spin_lock(&iommu->lock); + + context = iommu_context_addr(iommu, bus, devfn, 0); + if (!context || !context_present(context)) + goto iommu_unlock; + + if (scalable) { /* scalable mode */ + struct pasid_entry *pasid_tbl, *pasid_tbl_entry; + struct pasid_dir_entry *dir_tbl, *dir_entry; + u16 dir_idx, tbl_idx, pgtt; + u64 pasid_dir_ptr; + + pasid_dir_ptr = context->lo & VTD_PAGE_MASK; + + /* Dump specified device domain mappings with PASID. */ + dir_idx = pasid >> PASID_PDE_SHIFT; + tbl_idx = pasid & PASID_PTE_MASK; + + dir_tbl = phys_to_virt(pasid_dir_ptr); + dir_entry = &dir_tbl[dir_idx]; + + pasid_tbl = get_pasid_table_from_pde(dir_entry); + if (!pasid_tbl) + goto iommu_unlock; + + pasid_tbl_entry = &pasid_tbl[tbl_idx]; + if (!pasid_pte_is_present(pasid_tbl_entry)) + goto iommu_unlock; + + /* + * According to PASID Granular Translation Type(PGTT), + * get the page table pointer. + */ + pgtt = (u16)(pasid_tbl_entry->val[0] & GENMASK_ULL(8, 6)) >> 6; + agaw = (u8)(pasid_tbl_entry->val[0] & GENMASK_ULL(4, 2)) >> 2; + + switch (pgtt) { + case PASID_ENTRY_PGTT_FL_ONLY: + pgd = pasid_tbl_entry->val[2]; + break; + case PASID_ENTRY_PGTT_SL_ONLY: + case PASID_ENTRY_PGTT_NESTED: + pgd = pasid_tbl_entry->val[0]; + break; + default: + goto iommu_unlock; + } + pgd &= VTD_PAGE_MASK; + } else { /* legacy mode */ + pgd = context->lo & VTD_PAGE_MASK; + agaw = context->hi & 7; + } + + seq_printf(m, "Device %04x:%02x:%02x.%x ", + iommu->segment, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + + if (scalable) + seq_printf(m, "with pasid %x @0x%llx\n", pasid, pgd); + else + seq_printf(m, "@0x%llx\n", pgd); + + seq_printf(m, "%-17s\t%-18s\t%-18s\t%-18s\t%-18s\t%-s\n", + "IOVA_PFN", "PML5E", "PML4E", "PDPE", "PDE", "PTE"); + pgtable_walk_level(m, phys_to_virt(pgd), agaw + 2, 0, path); + + found = true; +iommu_unlock: + spin_unlock(&iommu->lock); + if (found) + break; } + rcu_read_unlock(); return 0; } -static int domain_translation_struct_show(struct seq_file *m, void *unused) +static int dev_domain_translation_struct_show(struct seq_file *m, void *unused) +{ + struct device_domain_info *info = (struct device_domain_info *)m->private; + + return domain_translation_struct_show(m, info, IOMMU_NO_PASID); +} +DEFINE_SHOW_ATTRIBUTE(dev_domain_translation_struct); + +static int pasid_domain_translation_struct_show(struct seq_file *m, void *unused) { - return bus_for_each_dev(&pci_bus_type, NULL, m, - show_device_domain_translation); + struct dev_pasid_info *dev_pasid = (struct dev_pasid_info *)m->private; + struct device_domain_info *info = dev_iommu_priv_get(dev_pasid->dev); + + return domain_translation_struct_show(m, info, dev_pasid->pasid); } -DEFINE_SHOW_ATTRIBUTE(domain_translation_struct); +DEFINE_SHOW_ATTRIBUTE(pasid_domain_translation_struct); static void invalidation_queue_entry_show(struct seq_file *m, struct intel_iommu *iommu) @@ -666,16 +755,12 @@ static const struct file_operations dmar_perf_latency_fops = { void __init intel_iommu_debugfs_init(void) { - struct dentry *intel_iommu_debug = debugfs_create_dir("intel", - iommu_debugfs_dir); + intel_iommu_debug = debugfs_create_dir("intel", iommu_debugfs_dir); debugfs_create_file("iommu_regset", 0444, intel_iommu_debug, NULL, &iommu_regset_fops); debugfs_create_file("dmar_translation_struct", 0444, intel_iommu_debug, NULL, &dmar_translation_struct_fops); - debugfs_create_file("domain_translation_struct", 0444, - intel_iommu_debug, NULL, - &domain_translation_struct_fops); debugfs_create_file("invalidation_queue", 0444, intel_iommu_debug, NULL, &invalidation_queue_fops); #ifdef CONFIG_IRQ_REMAP @@ -685,3 +770,51 @@ void __init intel_iommu_debugfs_init(void) debugfs_create_file("dmar_perf_latency", 0644, intel_iommu_debug, NULL, &dmar_perf_latency_fops); } + +/* + * Create a debugfs directory for each device, and then create a + * debugfs file in this directory for users to dump the page table + * of the default domain. e.g. + * /sys/kernel/debug/iommu/intel/0000:00:01.0/domain_translation_struct + */ +void intel_iommu_debugfs_create_dev(struct device_domain_info *info) +{ + info->debugfs_dentry = debugfs_create_dir(dev_name(info->dev), intel_iommu_debug); + + debugfs_create_file("domain_translation_struct", 0444, info->debugfs_dentry, + info, &dev_domain_translation_struct_fops); +} + +/* Remove the device debugfs directory. */ +void intel_iommu_debugfs_remove_dev(struct device_domain_info *info) +{ + debugfs_remove_recursive(info->debugfs_dentry); +} + +/* + * Create a debugfs directory per pair of {device, pasid}, then create the + * corresponding debugfs file in this directory for users to dump its page + * table. e.g. + * /sys/kernel/debug/iommu/intel/0000:00:01.0/1/domain_translation_struct + * + * The debugfs only dumps the page tables whose mappings are created and + * destroyed by the iommu_map/unmap() interfaces. Check the mapping type + * of the domain before creating debugfs directory. + */ +void intel_iommu_debugfs_create_dev_pasid(struct dev_pasid_info *dev_pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev_pasid->dev); + char dir_name[10]; + + sprintf(dir_name, "%x", dev_pasid->pasid); + dev_pasid->debugfs_dentry = debugfs_create_dir(dir_name, info->debugfs_dentry); + + debugfs_create_file("domain_translation_struct", 0444, dev_pasid->debugfs_dentry, + dev_pasid, &pasid_domain_translation_struct_fops); +} + +/* Remove the device pasid debugfs directory. */ +void intel_iommu_debugfs_remove_dev_pasid(struct dev_pasid_info *dev_pasid) +{ + debugfs_remove_recursive(dev_pasid->debugfs_dentry); +} diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 744e4e6b8d..a8366b1f4f 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -282,7 +282,6 @@ static LIST_HEAD(dmar_satc_units); #define for_each_rmrr_units(rmrr) \ list_for_each_entry(rmrr, &dmar_rmrr_units, list) -static void device_block_translation(struct device *dev); static void intel_iommu_domain_free(struct iommu_domain *domain); int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); @@ -300,6 +299,7 @@ static int iommu_skip_te_disable; #define IDENTMAP_AZALIA 4 const struct iommu_ops intel_iommu_ops; +static const struct iommu_dirty_ops intel_dirty_ops; static bool translation_pre_enabled(struct intel_iommu *iommu) { @@ -540,8 +540,6 @@ static int domain_update_device_node(struct dmar_domain *domain) return nid; } -static void domain_update_iotlb(struct dmar_domain *domain); - /* Return the super pagesize bitmap if supported. */ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) { @@ -560,7 +558,7 @@ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) } /* Some capabilities may be different across iommus */ -static void domain_update_iommu_cap(struct dmar_domain *domain) +void domain_update_iommu_cap(struct dmar_domain *domain) { domain_update_iommu_coherency(domain); domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); @@ -1362,7 +1360,7 @@ domain_lookup_dev_info(struct dmar_domain *domain, return NULL; } -static void domain_update_iotlb(struct dmar_domain *domain) +void domain_update_iotlb(struct dmar_domain *domain) { struct dev_pasid_info *dev_pasid; struct device_domain_info *info; @@ -1778,8 +1776,7 @@ static struct dmar_domain *alloc_domain(unsigned int type) return domain; } -static int domain_attach_iommu(struct dmar_domain *domain, - struct intel_iommu *iommu) +int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) { struct iommu_domain_info *info, *curr; unsigned long ndomains; @@ -1828,8 +1825,7 @@ err_unlock: return ret; } -static void domain_detach_iommu(struct dmar_domain *domain, - struct intel_iommu *iommu) +void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) { struct iommu_domain_info *info; @@ -2196,6 +2192,11 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) return -EINVAL; + if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) { + pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); + return -EINVAL; + } + attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); attr |= DMA_FL_PTE_PRESENT; if (domain->use_first_level) { @@ -3961,7 +3962,7 @@ static void dmar_remove_one_dev_info(struct device *dev) * all DMA requests without PASID from the device are blocked. If the page * table has been set, clean up the data structures. */ -static void device_block_translation(struct device *dev) +void device_block_translation(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; @@ -4016,9 +4017,9 @@ static int blocking_domain_attach_dev(struct iommu_domain *domain, } static struct iommu_domain blocking_domain = { + .type = IOMMU_DOMAIN_BLOCKED, .ops = &(const struct iommu_domain_ops) { .attach_dev = blocking_domain_attach_dev, - .free = intel_iommu_domain_free } }; @@ -4028,8 +4029,6 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) struct iommu_domain *domain; switch (type) { - case IOMMU_DOMAIN_BLOCKED: - return &blocking_domain; case IOMMU_DOMAIN_DMA: case IOMMU_DOMAIN_UNMANAGED: dmar_domain = alloc_domain(type); @@ -4061,14 +4060,72 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) return NULL; } +static struct iommu_domain * +intel_iommu_domain_alloc_user(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; + struct intel_iommu *iommu = info->iommu; + struct dmar_domain *dmar_domain; + struct iommu_domain *domain; + + /* Must be NESTING domain */ + if (parent) { + if (!nested_supported(iommu) || flags) + return ERR_PTR(-EOPNOTSUPP); + return intel_nested_domain_alloc(parent, user_data); + } + + if (flags & + (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) + return ERR_PTR(-EOPNOTSUPP); + if (nested_parent && !nested_supported(iommu)) + return ERR_PTR(-EOPNOTSUPP); + if (user_data || (dirty_tracking && !ssads_supported(iommu))) + return ERR_PTR(-EOPNOTSUPP); + + /* + * domain_alloc_user op needs to fully initialize a domain before + * return, so uses iommu_domain_alloc() here for simple. + */ + domain = iommu_domain_alloc(dev->bus); + if (!domain) + return ERR_PTR(-ENOMEM); + + dmar_domain = to_dmar_domain(domain); + + if (nested_parent) { + dmar_domain->nested_parent = true; + INIT_LIST_HEAD(&dmar_domain->s1_domains); + spin_lock_init(&dmar_domain->s1_lock); + } + + if (dirty_tracking) { + if (dmar_domain->use_first_level) { + iommu_domain_free(domain); + return ERR_PTR(-EOPNOTSUPP); + } + domain->dirty_ops = &intel_dirty_ops; + } + + return domain; +} + static void intel_iommu_domain_free(struct iommu_domain *domain) { - if (domain != &si_domain->domain && domain != &blocking_domain) - domain_exit(to_dmar_domain(domain)); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + + WARN_ON(dmar_domain->nested_parent && + !list_empty(&dmar_domain->s1_domains)); + if (domain != &si_domain->domain) + domain_exit(dmar_domain); } -static int prepare_domain_attach_device(struct iommu_domain *domain, - struct device *dev) +int prepare_domain_attach_device(struct iommu_domain *domain, + struct device *dev) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu; @@ -4081,6 +4138,9 @@ static int prepare_domain_attach_device(struct iommu_domain *domain, if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) return -EINVAL; + if (domain->dirty_ops && !ssads_supported(iommu)) + return -EINVAL; + /* check if this iommu agaw is sufficient for max mapped address */ addr_width = agaw_to_width(iommu->agaw); if (addr_width > cap_mgaw(iommu->cap)) @@ -4336,6 +4396,8 @@ static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) return dmar_platform_optin(); case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: return ecap_sc_support(info->iommu->ecap); + case IOMMU_CAP_DIRTY_TRACKING: + return ssads_supported(info->iommu); default: return false; } @@ -4413,6 +4475,8 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) } } + intel_iommu_debugfs_create_dev(info); + return &iommu->iommu; } @@ -4422,6 +4486,7 @@ static void intel_iommu_release_device(struct device *dev) dmar_remove_one_dev_info(dev); intel_pasid_free_table(dev); + intel_iommu_debugfs_remove_dev(info); dev_iommu_priv_set(dev, NULL); kfree(info); set_dma_ops(dev, NULL); @@ -4666,8 +4731,8 @@ static bool risky_device(struct pci_dev *pdev) return false; } -static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, - unsigned long iova, size_t size) +static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, + unsigned long iova, size_t size) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); unsigned long pages = aligned_nrpages(iova, size); @@ -4677,6 +4742,7 @@ static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, xa_for_each(&dmar_domain->iommu_array, i, info) __mapping_notify_one(info->iommu, dmar_domain, pfn, pages); + return 0; } static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) @@ -4714,6 +4780,7 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) spin_unlock_irqrestore(&dmar_domain->lock, flags); domain_detach_iommu(dmar_domain, iommu); + intel_iommu_debugfs_remove_dev_pasid(dev_pasid); kfree(dev_pasid); out_tear_down: intel_pasid_tear_down_entry(iommu, dev, pasid, false); @@ -4733,6 +4800,9 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) return -EOPNOTSUPP; + if (domain->dirty_ops) + return -EINVAL; + if (context_copied(iommu, info->bus, info->devfn)) return -EBUSY; @@ -4766,6 +4836,9 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); spin_unlock_irqrestore(&dmar_domain->lock, flags); + if (domain->type & __IOMMU_DOMAIN_PAGING) + intel_iommu_debugfs_create_dev_pasid(dev_pasid); + return 0; out_detach_iommu: domain_detach_iommu(dmar_domain, iommu); @@ -4784,6 +4857,7 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) if (!vtd) return ERR_PTR(-ENOMEM); + vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17; vtd->cap_reg = iommu->cap; vtd->ecap_reg = iommu->ecap; *length = sizeof(*vtd); @@ -4791,10 +4865,136 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) return vtd; } +/* + * Set dirty tracking for the device list of a domain. The caller must + * hold the domain->lock when calling it. + */ +static int device_set_dirty_tracking(struct list_head *devices, bool enable) +{ + struct device_domain_info *info; + int ret = 0; + + list_for_each_entry(info, devices, link) { + ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, + IOMMU_NO_PASID, enable); + if (ret) + break; + } + + return ret; +} + +static int parent_domain_set_dirty_tracking(struct dmar_domain *domain, + bool enable) +{ + struct dmar_domain *s1_domain; + unsigned long flags; + int ret; + + spin_lock(&domain->s1_lock); + list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { + spin_lock_irqsave(&s1_domain->lock, flags); + ret = device_set_dirty_tracking(&s1_domain->devices, enable); + spin_unlock_irqrestore(&s1_domain->lock, flags); + if (ret) + goto err_unwind; + } + spin_unlock(&domain->s1_lock); + return 0; + +err_unwind: + list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { + spin_lock_irqsave(&s1_domain->lock, flags); + device_set_dirty_tracking(&s1_domain->devices, + domain->dirty_tracking); + spin_unlock_irqrestore(&s1_domain->lock, flags); + } + spin_unlock(&domain->s1_lock); + return ret; +} + +static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + int ret; + + spin_lock(&dmar_domain->lock); + if (dmar_domain->dirty_tracking == enable) + goto out_unlock; + + ret = device_set_dirty_tracking(&dmar_domain->devices, enable); + if (ret) + goto err_unwind; + + if (dmar_domain->nested_parent) { + ret = parent_domain_set_dirty_tracking(dmar_domain, enable); + if (ret) + goto err_unwind; + } + + dmar_domain->dirty_tracking = enable; +out_unlock: + spin_unlock(&dmar_domain->lock); + + return 0; + +err_unwind: + device_set_dirty_tracking(&dmar_domain->devices, + dmar_domain->dirty_tracking); + spin_unlock(&dmar_domain->lock); + return ret; +} + +static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + unsigned long end = iova + size - 1; + unsigned long pgsize; + + /* + * IOMMUFD core calls into a dirty tracking disabled domain without an + * IOVA bitmap set in order to clean dirty bits in all PTEs that might + * have occurred when we stopped dirty tracking. This ensures that we + * never inherit dirtied bits from a previous cycle. + */ + if (!dmar_domain->dirty_tracking && dirty->bitmap) + return -EINVAL; + + do { + struct dma_pte *pte; + int lvl = 0; + + pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl, + GFP_ATOMIC); + pgsize = level_size(lvl) << VTD_PAGE_SHIFT; + if (!pte || !dma_pte_present(pte)) { + iova += pgsize; + continue; + } + + if (dma_sl_pte_test_and_clear_dirty(pte, flags)) + iommu_dirty_bitmap_record(dirty, iova, pgsize); + iova += pgsize; + } while (iova < end); + + return 0; +} + +static const struct iommu_dirty_ops intel_dirty_ops = { + .set_dirty_tracking = intel_iommu_set_dirty_tracking, + .read_and_clear_dirty = intel_iommu_read_and_clear_dirty, +}; + const struct iommu_ops intel_iommu_ops = { + .blocked_domain = &blocking_domain, .capable = intel_iommu_capable, .hw_info = intel_iommu_hw_info, .domain_alloc = intel_iommu_domain_alloc, + .domain_alloc_user = intel_iommu_domain_alloc_user, .probe_device = intel_iommu_probe_device, .probe_finalize = intel_iommu_probe_finalize, .release_device = intel_iommu_release_device, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index e6a3e70656..efc00d2b45 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -25,6 +25,7 @@ #include <asm/cacheflush.h> #include <asm/iommu.h> +#include <uapi/linux/iommufd.h> /* * VT-d hardware uses 4KiB page size regardless of host page size. @@ -48,6 +49,9 @@ #define DMA_FL_PTE_DIRTY BIT_ULL(6) #define DMA_FL_PTE_XD BIT_ULL(63) +#define DMA_SL_PTE_DIRTY_BIT 9 +#define DMA_SL_PTE_DIRTY BIT_ULL(DMA_SL_PTE_DIRTY_BIT) + #define ADDR_WIDTH_5LEVEL (57) #define ADDR_WIDTH_4LEVEL (48) @@ -539,6 +543,10 @@ enum { #define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap)) #define pasid_supported(iommu) (sm_supported(iommu) && \ ecap_pasid((iommu)->ecap)) +#define ssads_supported(iommu) (sm_supported(iommu) && \ + ecap_slads((iommu)->ecap)) +#define nested_supported(iommu) (sm_supported(iommu) && \ + ecap_nest((iommu)->ecap)) struct pasid_entry; struct pasid_state_entry; @@ -592,6 +600,8 @@ struct dmar_domain { * otherwise, goes through the second * level. */ + u8 dirty_tracking:1; /* Dirty tracking is enabled */ + u8 nested_parent:1; /* Has other domains nested on it */ u8 has_mappings:1; /* Has mappings configured through * iommu_map() interface. */ @@ -600,15 +610,44 @@ struct dmar_domain { struct list_head devices; /* all devices' list */ struct list_head dev_pasids; /* all attached pasids */ - struct dma_pte *pgd; /* virtual address */ - int gaw; /* max guest address width */ - - /* adjusted guest address width, 0 is level 2 30-bit */ - int agaw; int iommu_superpage;/* Level of superpages supported: 0 == 4KiB (no superpages), 1 == 2MiB, 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ - u64 max_addr; /* maximum mapped address */ + union { + /* DMA remapping domain */ + struct { + /* virtual address */ + struct dma_pte *pgd; + /* max guest address width */ + int gaw; + /* + * adjusted guest address width: + * 0: level 2 30-bit + * 1: level 3 39-bit + * 2: level 4 48-bit + * 3: level 5 57-bit + */ + int agaw; + /* maximum mapped address */ + u64 max_addr; + /* Protect the s1_domains list */ + spinlock_t s1_lock; + /* Track s1_domains nested on this domain */ + struct list_head s1_domains; + }; + + /* Nested user domain */ + struct { + /* parent page table which the user domain is nested on */ + struct dmar_domain *s2_domain; + /* user page table pointer (in GPA) */ + unsigned long s1_pgtbl; + /* page table attributes */ + struct iommu_hwpt_vtd_s1 s1_cfg; + /* link to parent domain siblings */ + struct list_head s2_link; + }; + }; struct iommu_domain domain; /* generic domain data structure for iommu core */ @@ -719,12 +758,18 @@ struct device_domain_info { struct intel_iommu *iommu; /* IOMMU used by this device */ struct dmar_domain *domain; /* pointer to domain */ struct pasid_table *pasid_table; /* pasid table */ +#ifdef CONFIG_INTEL_IOMMU_DEBUGFS + struct dentry *debugfs_dentry; /* pointer to device directory dentry */ +#endif }; struct dev_pasid_info { struct list_head link_domain; /* link to domain siblings */ struct device *dev; ioasid_t pasid; +#ifdef CONFIG_INTEL_IOMMU_DEBUGFS + struct dentry *debugfs_dentry; /* pointer to pasid directory dentry */ +#endif }; static inline void __iommu_flush_cache( @@ -784,6 +829,16 @@ static inline bool dma_pte_present(struct dma_pte *pte) return (pte->val & 3) != 0; } +static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte, + unsigned long flags) +{ + if (flags & IOMMU_DIRTY_NO_CLEAR) + return (pte->val & DMA_SL_PTE_DIRTY) != 0; + + return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT, + (unsigned long *)&pte->val); +} + static inline bool dma_pte_superpage(struct dma_pte *pte) { return (pte->val & DMA_PTE_LARGE_PAGE); @@ -839,12 +894,22 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, */ #define QI_OPT_WAIT_DRAIN BIT(0) +void domain_update_iotlb(struct dmar_domain *domain); +int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); +void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); +void device_block_translation(struct device *dev); +int prepare_domain_attach_device(struct iommu_domain *domain, + struct device *dev); +void domain_update_iommu_cap(struct dmar_domain *domain); + int dmar_ir_support(void); void *alloc_pgtable_page(int node, gfp_t gfp); void free_pgtable_page(void *vaddr); void iommu_flush_write_buffer(struct intel_iommu *iommu); struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn); +struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, + const struct iommu_user_data *user_data); #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); @@ -886,8 +951,16 @@ static inline void intel_svm_remove_dev_pasid(struct device *dev, ioasid_t pasid #ifdef CONFIG_INTEL_IOMMU_DEBUGFS void intel_iommu_debugfs_init(void); +void intel_iommu_debugfs_create_dev(struct device_domain_info *info); +void intel_iommu_debugfs_remove_dev(struct device_domain_info *info); +void intel_iommu_debugfs_create_dev_pasid(struct dev_pasid_info *dev_pasid); +void intel_iommu_debugfs_remove_dev_pasid(struct dev_pasid_info *dev_pasid); #else static inline void intel_iommu_debugfs_init(void) {} +static inline void intel_iommu_debugfs_create_dev(struct device_domain_info *info) {} +static inline void intel_iommu_debugfs_remove_dev(struct device_domain_info *info) {} +static inline void intel_iommu_debugfs_create_dev_pasid(struct dev_pasid_info *dev_pasid) {} +static inline void intel_iommu_debugfs_remove_dev_pasid(struct dev_pasid_info *dev_pasid) {} #endif /* CONFIG_INTEL_IOMMU_DEBUGFS */ extern const struct attribute_group *intel_iommu_groups[]; diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c new file mode 100644 index 0000000000..92e82b33ea --- /dev/null +++ b/drivers/iommu/intel/nested.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * nested.c - nested mode translation support + * + * Copyright (C) 2023 Intel Corporation + * + * Author: Lu Baolu <baolu.lu@linux.intel.com> + * Jacob Pan <jacob.jun.pan@linux.intel.com> + * Yi Liu <yi.l.liu@intel.com> + */ + +#define pr_fmt(fmt) "DMAR: " fmt + +#include <linux/iommu.h> +#include <linux/pci.h> +#include <linux/pci-ats.h> + +#include "iommu.h" +#include "pasid.h" + +static int intel_nested_attach_dev(struct iommu_domain *domain, + struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + unsigned long flags; + int ret = 0; + + if (info->domain) + device_block_translation(dev); + + if (iommu->agaw < dmar_domain->s2_domain->agaw) { + dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n"); + return -ENODEV; + } + + /* + * Stage-1 domain cannot work alone, it is nested on a s2_domain. + * The s2_domain will be used in nested translation, hence needs + * to ensure the s2_domain is compatible with this IOMMU. + */ + ret = prepare_domain_attach_device(&dmar_domain->s2_domain->domain, dev); + if (ret) { + dev_err_ratelimited(dev, "s2 domain is not compatible\n"); + return ret; + } + + ret = domain_attach_iommu(dmar_domain, iommu); + if (ret) { + dev_err_ratelimited(dev, "Failed to attach domain to iommu\n"); + return ret; + } + + ret = intel_pasid_setup_nested(iommu, dev, + IOMMU_NO_PASID, dmar_domain); + if (ret) { + domain_detach_iommu(dmar_domain, iommu); + dev_err_ratelimited(dev, "Failed to setup pasid entry\n"); + return ret; + } + + info->domain = dmar_domain; + spin_lock_irqsave(&dmar_domain->lock, flags); + list_add(&info->link, &dmar_domain->devices); + spin_unlock_irqrestore(&dmar_domain->lock, flags); + + domain_update_iotlb(dmar_domain); + + return 0; +} + +static void intel_nested_domain_free(struct iommu_domain *domain) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct dmar_domain *s2_domain = dmar_domain->s2_domain; + + spin_lock(&s2_domain->s1_lock); + list_del(&dmar_domain->s2_link); + spin_unlock(&s2_domain->s1_lock); + kfree(dmar_domain); +} + +static const struct iommu_domain_ops intel_nested_domain_ops = { + .attach_dev = intel_nested_attach_dev, + .free = intel_nested_domain_free, +}; + +struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, + const struct iommu_user_data *user_data) +{ + struct dmar_domain *s2_domain = to_dmar_domain(parent); + struct iommu_hwpt_vtd_s1 vtd; + struct dmar_domain *domain; + int ret; + + /* Must be nested domain */ + if (user_data->type != IOMMU_HWPT_DATA_VTD_S1) + return ERR_PTR(-EOPNOTSUPP); + if (parent->ops != intel_iommu_ops.default_domain_ops || + !s2_domain->nested_parent) + return ERR_PTR(-EINVAL); + + ret = iommu_copy_struct_from_user(&vtd, user_data, + IOMMU_HWPT_DATA_VTD_S1, __reserved); + if (ret) + return ERR_PTR(ret); + + domain = kzalloc(sizeof(*domain), GFP_KERNEL_ACCOUNT); + if (!domain) + return ERR_PTR(-ENOMEM); + + domain->use_first_level = true; + domain->s2_domain = s2_domain; + domain->s1_pgtbl = vtd.pgtbl_addr; + domain->s1_cfg = vtd; + domain->domain.ops = &intel_nested_domain_ops; + domain->domain.type = IOMMU_DOMAIN_NESTED; + INIT_LIST_HEAD(&domain->devices); + INIT_LIST_HEAD(&domain->dev_pasids); + spin_lock_init(&domain->lock); + xa_init(&domain->iommu_array); + + spin_lock(&s2_domain->s1_lock); + list_add(&domain->s2_link, &s2_domain->s1_domains); + spin_unlock(&s2_domain->s1_lock); + + return &domain->domain; +} diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 8f92b92f3d..6e102cbbde 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -277,6 +277,11 @@ static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits) WRITE_ONCE(*ptr, (old & ~mask) | bits); } +static inline u64 pasid_get_bits(u64 *ptr) +{ + return READ_ONCE(*ptr); +} + /* * Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode * PASID entry. @@ -336,6 +341,45 @@ static inline void pasid_set_fault_enable(struct pasid_entry *pe) } /* + * Enable second level A/D bits by setting the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry. + */ +static inline void pasid_set_ssade(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 9, 1 << 9); +} + +/* + * Disable second level A/D bits by clearing the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry. + */ +static inline void pasid_clear_ssade(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 9, 0); +} + +/* + * Checks if second level A/D bits specifically the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry is set. + */ +static inline bool pasid_get_ssade(struct pasid_entry *pe) +{ + return pasid_get_bits(&pe->val[0]) & (1 << 9); +} + +/* + * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a + * scalable mode PASID entry. + */ +static inline void pasid_set_sre(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 0, 1); +} + +/* * Setup the WPE(Write Protect Enable) field (Bit 132) of a * scalable mode PASID entry. */ @@ -402,6 +446,15 @@ pasid_set_flpm(struct pasid_entry *pe, u64 value) pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2); } +/* + * Setup the Extended Access Flag Enable (EAFE) field (Bit 135) + * of a scalable mode PASID entry. + */ +static inline void pasid_set_eafe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7); +} + static void pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid) @@ -627,6 +680,8 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + if (domain->dirty_tracking) + pasid_set_ssade(pte); pasid_set_present(pte); spin_unlock(&iommu->lock); @@ -637,6 +692,77 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, } /* + * Set up dirty tracking on a second only or nested translation type. + */ +int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, + struct device *dev, u32 pasid, + bool enabled) +{ + struct pasid_entry *pte; + u16 did, pgtt; + + spin_lock(&iommu->lock); + + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + dev_err_ratelimited( + dev, "Failed to get pasid entry of PASID %d\n", pasid); + return -ENODEV; + } + + did = pasid_get_domain_id(pte); + pgtt = pasid_pte_get_pgtt(pte); + if (pgtt != PASID_ENTRY_PGTT_SL_ONLY && + pgtt != PASID_ENTRY_PGTT_NESTED) { + spin_unlock(&iommu->lock); + dev_err_ratelimited( + dev, + "Dirty tracking not supported on translation type %d\n", + pgtt); + return -EOPNOTSUPP; + } + + if (pasid_get_ssade(pte) == enabled) { + spin_unlock(&iommu->lock); + return 0; + } + + if (enabled) + pasid_set_ssade(pte); + else + pasid_clear_ssade(pte); + spin_unlock(&iommu->lock); + + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + /* + * From VT-d spec table 25 "Guidance to Software for Invalidations": + * + * - PASID-selective-within-Domain PASID-cache invalidation + * If (PGTT=SS or Nested) + * - Domain-selective IOTLB invalidation + * Else + * - PASID-selective PASID-based IOTLB invalidation + * - If (pasid is RID_PASID) + * - Global Device-TLB invalidation to affected functions + * Else + * - PASID-based Device-TLB invalidation (with S=1 and + * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions + */ + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + + iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); + + /* Device IOTLB doesn't need to be flushed in caching mode. */ + if (!cap_caching_mode(iommu->cap)) + devtlb_invalidation_with_pasid(iommu, dev, pasid); + + return 0; +} + +/* * Set up the scalable mode pasid entry for passthrough translation type. */ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, @@ -713,3 +839,99 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, if (!cap_caching_mode(iommu->cap)) devtlb_invalidation_with_pasid(iommu, dev, pasid); } + +/** + * intel_pasid_setup_nested() - Set up PASID entry for nested translation. + * @iommu: IOMMU which the device belong to + * @dev: Device to be set up for translation + * @pasid: PASID to be programmed in the device PASID table + * @domain: User stage-1 domain nested on a stage-2 domain + * + * This is used for nested translation. The input domain should be + * nested type and nested on a parent with 'is_nested_parent' flag + * set. + */ +int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, + u32 pasid, struct dmar_domain *domain) +{ + struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; + pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl; + struct dmar_domain *s2_domain = domain->s2_domain; + u16 did = domain_id_iommu(domain, iommu); + struct dma_pte *pgd = s2_domain->pgd; + struct pasid_entry *pte; + + /* Address width should match the address width supported by hardware */ + switch (s1_cfg->addr_width) { + case ADDR_WIDTH_4LEVEL: + break; + case ADDR_WIDTH_5LEVEL: + if (!cap_fl5lp_support(iommu->cap)) { + dev_err_ratelimited(dev, + "5-level paging not supported\n"); + return -EINVAL; + } + break; + default: + dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n", + s1_cfg->addr_width); + return -EINVAL; + } + + if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) { + pr_err_ratelimited("No supervisor request support on %s\n", + iommu->name); + return -EINVAL; + } + + if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) { + pr_err_ratelimited("No extended access flag support on %s\n", + iommu->name); + return -EINVAL; + } + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + if (pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EBUSY; + } + + pasid_clear_entry(pte); + + if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) + pasid_set_flpm(pte, 1); + + pasid_set_flptr(pte, (uintptr_t)s1_gpgd); + + if (s1_cfg->flags & IOMMU_VTD_S1_SRE) { + pasid_set_sre(pte); + if (s1_cfg->flags & IOMMU_VTD_S1_WPE) + pasid_set_wpe(pte); + } + + if (s1_cfg->flags & IOMMU_VTD_S1_EAFE) + pasid_set_eafe(pte); + + if (s2_domain->force_snooping) + pasid_set_pgsnp(pte); + + pasid_set_slptr(pte, virt_to_phys(pgd)); + pasid_set_fault_enable(pte); + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, s2_domain->agaw); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + if (s2_domain->dirty_tracking) + pasid_set_ssade(pte); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); + pasid_set_present(pte); + spin_unlock(&iommu->lock); + + pasid_flush_caches(iommu, pte, pasid, did); + + return 0; +} diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 4e9e68c3c3..3568adca1f 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -106,9 +106,14 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u32 pasid); +int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, + struct device *dev, u32 pasid, + bool enabled); int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u32 pasid); +int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, + u32 pasid, struct dmar_domain *domain); void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool fault_ignore); diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index b78671a8a9..4a2f569974 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -141,7 +141,7 @@ u32 iommu_sva_get_pasid(struct iommu_sva *handle) { struct iommu_domain *domain = handle->domain; - return domain->mm->pasid; + return mm_get_enqcmd_pasid(domain->mm); } EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 3a67e63628..33e2a9b5d3 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -37,7 +37,6 @@ #include "iommu-priv.h" #include "iommu-sva.h" -#include "iommu-priv.h" static struct kset *iommu_group_kset; static DEFINE_IDA(iommu_group_ida); @@ -96,8 +95,8 @@ static const char * const iommu_group_resv_type_string[] = { static int iommu_bus_notifier(struct notifier_block *nb, unsigned long action, void *data); static void iommu_release_device(struct device *dev); -static struct iommu_domain *__iommu_domain_alloc(const struct bus_type *bus, - unsigned type); +static struct iommu_domain * +__iommu_group_domain_alloc(struct iommu_group *group, unsigned int type); static int __iommu_attach_device(struct iommu_domain *domain, struct device *dev); static int __iommu_attach_group(struct iommu_domain *domain, @@ -184,6 +183,8 @@ static const char *iommu_domain_type_str(unsigned int t) case IOMMU_DOMAIN_DMA: case IOMMU_DOMAIN_DMA_FQ: return "Translated"; + case IOMMU_DOMAIN_PLATFORM: + return "Platform"; default: return "Unknown"; } @@ -290,6 +291,10 @@ void iommu_device_unregister(struct iommu_device *iommu) spin_lock(&iommu_device_lock); list_del(&iommu->list); spin_unlock(&iommu_device_lock); + + /* Pairs with the alloc in generic_single_device_group() */ + iommu_group_put(iommu->singleton_group); + iommu->singleton_group = NULL; } EXPORT_SYMBOL_GPL(iommu_device_unregister); @@ -404,6 +409,7 @@ static int iommu_init_device(struct device *dev, const struct iommu_ops *ops) ret = PTR_ERR(iommu_dev); goto err_module_put; } + dev->iommu->iommu_dev = iommu_dev; ret = iommu_device_link(iommu_dev, dev); if (ret) @@ -418,7 +424,6 @@ static int iommu_init_device(struct device *dev, const struct iommu_ops *ops) } dev->iommu_group = group; - dev->iommu->iommu_dev = iommu_dev; dev->iommu->max_pasids = dev_iommu_get_max_pasids(dev); if (ops->is_attach_deferred) dev->iommu->attach_deferred = ops->is_attach_deferred(dev); @@ -432,6 +437,7 @@ err_release: err_module_put: module_put(ops->owner); err_free: + dev->iommu->iommu_dev = NULL; dev_iommu_free(dev); return ret; } @@ -1635,6 +1641,27 @@ struct iommu_group *generic_device_group(struct device *dev) EXPORT_SYMBOL_GPL(generic_device_group); /* + * Generic device_group call-back function. It just allocates one + * iommu-group per iommu driver instance shared by every device + * probed by that iommu driver. + */ +struct iommu_group *generic_single_device_group(struct device *dev) +{ + struct iommu_device *iommu = dev->iommu->iommu_dev; + + if (!iommu->singleton_group) { + struct iommu_group *group; + + group = iommu_group_alloc(); + if (IS_ERR(group)) + return group; + iommu->singleton_group = group; + } + return iommu_group_ref_get(iommu->singleton_group); +} +EXPORT_SYMBOL_GPL(generic_single_device_group); + +/* * Use standard PCI bus topology, isolation features, and DMA alias quirks * to find or create an IOMMU group for a device. */ @@ -1715,26 +1742,29 @@ struct iommu_group *fsl_mc_device_group(struct device *dev) } EXPORT_SYMBOL_GPL(fsl_mc_device_group); -static int iommu_get_def_domain_type(struct device *dev) -{ - const struct iommu_ops *ops = dev_iommu_ops(dev); - - if (dev_is_pci(dev) && to_pci_dev(dev)->untrusted) - return IOMMU_DOMAIN_DMA; - - if (ops->def_domain_type) - return ops->def_domain_type(dev); - - return 0; -} - static struct iommu_domain * -__iommu_group_alloc_default_domain(const struct bus_type *bus, - struct iommu_group *group, int req_type) +__iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) { if (group->default_domain && group->default_domain->type == req_type) return group->default_domain; - return __iommu_domain_alloc(bus, req_type); + return __iommu_group_domain_alloc(group, req_type); +} + +/* + * Returns the iommu_ops for the devices in an iommu group. + * + * It is assumed that all devices in an iommu group are managed by a single + * IOMMU unit. Therefore, this returns the dev_iommu_ops of the first device + * in the group. + */ +static const struct iommu_ops *group_iommu_ops(struct iommu_group *group) +{ + struct group_device *device = + list_first_entry(&group->devices, struct group_device, list); + + lockdep_assert_held(&group->mutex); + + return dev_iommu_ops(device->dev); } /* @@ -1744,27 +1774,36 @@ __iommu_group_alloc_default_domain(const struct bus_type *bus, static struct iommu_domain * iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) { - const struct bus_type *bus = - list_first_entry(&group->devices, struct group_device, list) - ->dev->bus; + const struct iommu_ops *ops = group_iommu_ops(group); struct iommu_domain *dom; lockdep_assert_held(&group->mutex); + /* + * Allow legacy drivers to specify the domain that will be the default + * domain. This should always be either an IDENTITY/BLOCKED/PLATFORM + * domain. Do not use in new drivers. + */ + if (ops->default_domain) { + if (req_type) + return ERR_PTR(-EINVAL); + return ops->default_domain; + } + if (req_type) - return __iommu_group_alloc_default_domain(bus, group, req_type); + return __iommu_group_alloc_default_domain(group, req_type); /* The driver gave no guidance on what type to use, try the default */ - dom = __iommu_group_alloc_default_domain(bus, group, iommu_def_domain_type); - if (dom) + dom = __iommu_group_alloc_default_domain(group, iommu_def_domain_type); + if (!IS_ERR(dom)) return dom; /* Otherwise IDENTITY and DMA_FQ defaults will try DMA */ if (iommu_def_domain_type == IOMMU_DOMAIN_DMA) - return NULL; - dom = __iommu_group_alloc_default_domain(bus, group, IOMMU_DOMAIN_DMA); - if (!dom) - return NULL; + return ERR_PTR(-EINVAL); + dom = __iommu_group_alloc_default_domain(group, IOMMU_DOMAIN_DMA); + if (IS_ERR(dom)) + return dom; pr_warn("Failed to allocate default IOMMU domain of type %u for group %s - Falling back to IOMMU_DOMAIN_DMA", iommu_def_domain_type, group->name); @@ -1808,40 +1847,109 @@ static int iommu_bus_notifier(struct notifier_block *nb, return 0; } -/* A target_type of 0 will select the best domain type and cannot fail */ +/* + * Combine the driver's chosen def_domain_type across all the devices in a + * group. Drivers must give a consistent result. + */ +static int iommu_get_def_domain_type(struct iommu_group *group, + struct device *dev, int cur_type) +{ + const struct iommu_ops *ops = group_iommu_ops(group); + int type; + + if (!ops->def_domain_type) + return cur_type; + + type = ops->def_domain_type(dev); + if (!type || cur_type == type) + return cur_type; + if (!cur_type) + return type; + + dev_err_ratelimited( + dev, + "IOMMU driver error, requesting conflicting def_domain_type, %s and %s, for devices in group %u.\n", + iommu_domain_type_str(cur_type), iommu_domain_type_str(type), + group->id); + + /* + * Try to recover, drivers are allowed to force IDENITY or DMA, IDENTITY + * takes precedence. + */ + if (type == IOMMU_DOMAIN_IDENTITY) + return type; + return cur_type; +} + +/* + * A target_type of 0 will select the best domain type. 0 can be returned in + * this case meaning the global default should be used. + */ static int iommu_get_default_domain_type(struct iommu_group *group, int target_type) { - int best_type = target_type; + struct device *untrusted = NULL; struct group_device *gdev; - struct device *last_dev; + int driver_type = 0; lockdep_assert_held(&group->mutex); + /* + * ARM32 drivers supporting CONFIG_ARM_DMA_USE_IOMMU can declare an + * identity_domain and it will automatically become their default + * domain. Later on ARM_DMA_USE_IOMMU will install its UNMANAGED domain. + * Override the selection to IDENTITY. + */ + if (IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU)) { + static_assert(!(IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU) && + IS_ENABLED(CONFIG_IOMMU_DMA))); + driver_type = IOMMU_DOMAIN_IDENTITY; + } + for_each_group_device(group, gdev) { - unsigned int type = iommu_get_def_domain_type(gdev->dev); - - if (best_type && type && best_type != type) { - if (target_type) { - dev_err_ratelimited( - gdev->dev, - "Device cannot be in %s domain\n", - iommu_domain_type_str(target_type)); + driver_type = iommu_get_def_domain_type(group, gdev->dev, + driver_type); + + if (dev_is_pci(gdev->dev) && to_pci_dev(gdev->dev)->untrusted) { + /* + * No ARM32 using systems will set untrusted, it cannot + * work. + */ + if (WARN_ON(IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU))) return -1; - } + untrusted = gdev->dev; + } + } - dev_warn( - gdev->dev, - "Device needs domain type %s, but device %s in the same iommu group requires type %s - using default\n", - iommu_domain_type_str(type), dev_name(last_dev), - iommu_domain_type_str(best_type)); - return 0; + /* + * If the common dma ops are not selected in kconfig then we cannot use + * IOMMU_DOMAIN_DMA at all. Force IDENTITY if nothing else has been + * selected. + */ + if (!IS_ENABLED(CONFIG_IOMMU_DMA)) { + if (WARN_ON(driver_type == IOMMU_DOMAIN_DMA)) + return -1; + if (!driver_type) + driver_type = IOMMU_DOMAIN_IDENTITY; + } + + if (untrusted) { + if (driver_type && driver_type != IOMMU_DOMAIN_DMA) { + dev_err_ratelimited( + untrusted, + "Device is not trusted, but driver is overriding group %u to %s, refusing to probe.\n", + group->id, iommu_domain_type_str(driver_type)); + return -1; } - if (!best_type) - best_type = type; - last_dev = gdev->dev; + driver_type = IOMMU_DOMAIN_DMA; } - return best_type; + + if (target_type) { + if (driver_type && target_type != driver_type) + return -1; + return target_type; + } + return driver_type; } static void iommu_group_do_probe_finalize(struct device *dev) @@ -1970,18 +2078,33 @@ void iommu_set_fault_handler(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_set_fault_handler); -static struct iommu_domain *__iommu_domain_alloc(const struct bus_type *bus, - unsigned type) +static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops, + struct device *dev, + unsigned int type) { struct iommu_domain *domain; unsigned int alloc_type = type & IOMMU_DOMAIN_ALLOC_FLAGS; - if (bus == NULL || bus->iommu_ops == NULL) - return NULL; + if (alloc_type == IOMMU_DOMAIN_IDENTITY && ops->identity_domain) + return ops->identity_domain; + else if (alloc_type == IOMMU_DOMAIN_BLOCKED && ops->blocked_domain) + return ops->blocked_domain; + else if (type & __IOMMU_DOMAIN_PAGING && ops->domain_alloc_paging) + domain = ops->domain_alloc_paging(dev); + else if (ops->domain_alloc) + domain = ops->domain_alloc(alloc_type); + else + return ERR_PTR(-EOPNOTSUPP); - domain = bus->iommu_ops->domain_alloc(alloc_type); + /* + * Many domain_alloc ops now return ERR_PTR, make things easier for the + * driver by accepting ERR_PTR from all domain_alloc ops instead of + * having two rules. + */ + if (IS_ERR(domain)) + return domain; if (!domain) - return NULL; + return ERR_PTR(-ENOMEM); domain->type = type; /* @@ -1989,21 +2112,44 @@ static struct iommu_domain *__iommu_domain_alloc(const struct bus_type *bus, * may override this later */ if (!domain->pgsize_bitmap) - domain->pgsize_bitmap = bus->iommu_ops->pgsize_bitmap; + domain->pgsize_bitmap = ops->pgsize_bitmap; if (!domain->ops) - domain->ops = bus->iommu_ops->default_domain_ops; + domain->ops = ops->default_domain_ops; - if (iommu_is_dma_domain(domain) && iommu_get_dma_cookie(domain)) { - iommu_domain_free(domain); - domain = NULL; + if (iommu_is_dma_domain(domain)) { + int rc; + + rc = iommu_get_dma_cookie(domain); + if (rc) { + iommu_domain_free(domain); + return ERR_PTR(rc); + } } return domain; } +static struct iommu_domain * +__iommu_group_domain_alloc(struct iommu_group *group, unsigned int type) +{ + struct device *dev = + list_first_entry(&group->devices, struct group_device, list) + ->dev; + + return __iommu_domain_alloc(group_iommu_ops(group), dev, type); +} + struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus) { - return __iommu_domain_alloc(bus, IOMMU_DOMAIN_UNMANAGED); + struct iommu_domain *domain; + + if (bus == NULL || bus->iommu_ops == NULL) + return NULL; + domain = __iommu_domain_alloc(bus->iommu_ops, NULL, + IOMMU_DOMAIN_UNMANAGED); + if (IS_ERR(domain)) + return NULL; + return domain; } EXPORT_SYMBOL_GPL(iommu_domain_alloc); @@ -2012,7 +2158,8 @@ void iommu_domain_free(struct iommu_domain *domain) if (domain->type == IOMMU_DOMAIN_SVA) mmdrop(domain->mm); iommu_put_dma_cookie(domain); - domain->ops->free(domain); + if (domain->ops->free) + domain->ops->free(domain); } EXPORT_SYMBOL_GPL(iommu_domain_free); @@ -2062,10 +2209,10 @@ static int __iommu_attach_device(struct iommu_domain *domain, */ int iommu_attach_device(struct iommu_domain *domain, struct device *dev) { - struct iommu_group *group; + /* Caller must be a probed driver on dev */ + struct iommu_group *group = dev->iommu_group; int ret; - group = iommu_group_get(dev); if (!group) return -ENODEV; @@ -2082,8 +2229,6 @@ int iommu_attach_device(struct iommu_domain *domain, struct device *dev) out_unlock: mutex_unlock(&group->mutex); - iommu_group_put(group); - return ret; } EXPORT_SYMBOL_GPL(iommu_attach_device); @@ -2098,9 +2243,9 @@ int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) void iommu_detach_device(struct iommu_domain *domain, struct device *dev) { - struct iommu_group *group; + /* Caller must be a probed driver on dev */ + struct iommu_group *group = dev->iommu_group; - group = iommu_group_get(dev); if (!group) return; @@ -2112,24 +2257,18 @@ void iommu_detach_device(struct iommu_domain *domain, struct device *dev) out_unlock: mutex_unlock(&group->mutex); - iommu_group_put(group); } EXPORT_SYMBOL_GPL(iommu_detach_device); struct iommu_domain *iommu_get_domain_for_dev(struct device *dev) { - struct iommu_domain *domain; - struct iommu_group *group; + /* Caller must be a probed driver on dev */ + struct iommu_group *group = dev->iommu_group; - group = iommu_group_get(dev); if (!group) return NULL; - domain = group->domain; - - iommu_group_put(group); - - return domain; + return group->domain; } EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev); @@ -2275,21 +2414,8 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group, if (group->domain == new_domain) return 0; - /* - * New drivers should support default domains, so set_platform_dma() - * op will never be called. Otherwise the NULL domain represents some - * platform specific behavior. - */ - if (!new_domain) { - for_each_group_device(group, gdev) { - const struct iommu_ops *ops = dev_iommu_ops(gdev->dev); - - if (!WARN_ON(!ops->set_platform_dma_ops)) - ops->set_platform_dma_ops(gdev->dev); - } - group->domain = NULL; - return 0; - } + if (WARN_ON(!new_domain)) + return -EINVAL; /* * Changing the domain is done by calling attach_dev() on the new @@ -2325,19 +2451,15 @@ err_revert: */ last_gdev = gdev; for_each_group_device(group, gdev) { - const struct iommu_ops *ops = dev_iommu_ops(gdev->dev); - /* - * If set_platform_dma_ops is not present a NULL domain can - * happen only for first probe, in which case we leave - * group->domain as NULL and let release clean everything up. + * A NULL domain can happen only for first probe, in which case + * we leave group->domain as NULL and let release clean + * everything up. */ if (group->domain) WARN_ON(__iommu_device_set_domain( group, gdev->dev, group->domain, IOMMU_SET_DOMAIN_MUST_SUCCEED)); - else if (ops->set_platform_dma_ops) - ops->set_platform_dma_ops(gdev->dev); if (gdev == last_gdev) break; } @@ -2418,30 +2540,6 @@ out_set_count: return pgsize; } -static int __iommu_map_pages(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, - gfp_t gfp, size_t *mapped) -{ - const struct iommu_domain_ops *ops = domain->ops; - size_t pgsize, count; - int ret; - - pgsize = iommu_pgsize(domain, iova, paddr, size, &count); - - pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx count %zu\n", - iova, &paddr, pgsize, count); - - if (ops->map_pages) { - ret = ops->map_pages(domain, iova, paddr, pgsize, count, prot, - gfp, mapped); - } else { - ret = ops->map(domain, iova, paddr, pgsize, prot, gfp); - *mapped = ret ? 0 : pgsize; - } - - return ret; -} - static int __iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { @@ -2452,13 +2550,12 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t orig_paddr = paddr; int ret = 0; - if (unlikely(!(ops->map || ops->map_pages) || - domain->pgsize_bitmap == 0UL)) - return -ENODEV; - if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) return -EINVAL; + if (WARN_ON(!ops->map_pages || domain->pgsize_bitmap == 0UL)) + return -ENODEV; + /* find out the minimum page size supported */ min_pagesz = 1 << __ffs(domain->pgsize_bitmap); @@ -2476,10 +2573,14 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size); while (size) { - size_t mapped = 0; + size_t pgsize, count, mapped = 0; - ret = __iommu_map_pages(domain, iova, paddr, size, prot, gfp, - &mapped); + pgsize = iommu_pgsize(domain, iova, paddr, size, &count); + + pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx count %zu\n", + iova, &paddr, pgsize, count); + ret = ops->map_pages(domain, iova, paddr, pgsize, count, prot, + gfp, &mapped); /* * Some pages may have been mapped, even if an error occurred, * so we should account for those so they can be unmapped. @@ -2516,25 +2617,21 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova, return -EINVAL; ret = __iommu_map(domain, iova, paddr, size, prot, gfp); - if (ret == 0 && ops->iotlb_sync_map) - ops->iotlb_sync_map(domain, iova, size); + if (ret == 0 && ops->iotlb_sync_map) { + ret = ops->iotlb_sync_map(domain, iova, size); + if (ret) + goto out_err; + } return ret; -} -EXPORT_SYMBOL_GPL(iommu_map); -static size_t __iommu_unmap_pages(struct iommu_domain *domain, - unsigned long iova, size_t size, - struct iommu_iotlb_gather *iotlb_gather) -{ - const struct iommu_domain_ops *ops = domain->ops; - size_t pgsize, count; +out_err: + /* undo mappings already done */ + iommu_unmap(domain, iova, size); - pgsize = iommu_pgsize(domain, iova, iova, size, &count); - return ops->unmap_pages ? - ops->unmap_pages(domain, iova, pgsize, count, iotlb_gather) : - ops->unmap(domain, iova, pgsize, iotlb_gather); + return ret; } +EXPORT_SYMBOL_GPL(iommu_map); static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size, @@ -2545,11 +2642,10 @@ static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long orig_iova = iova; unsigned int min_pagesz; - if (unlikely(!(ops->unmap || ops->unmap_pages) || - domain->pgsize_bitmap == 0UL)) + if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) return 0; - if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) + if (WARN_ON(!ops->unmap_pages || domain->pgsize_bitmap == 0UL)) return 0; /* find out the minimum page size supported */ @@ -2573,9 +2669,10 @@ static size_t __iommu_unmap(struct iommu_domain *domain, * or we hit an area that isn't mapped. */ while (unmapped < size) { - unmapped_page = __iommu_unmap_pages(domain, iova, - size - unmapped, - iotlb_gather); + size_t pgsize, count; + + pgsize = iommu_pgsize(domain, iova, iova, size - unmapped, &count); + unmapped_page = ops->unmap_pages(domain, iova, pgsize, count, iotlb_gather); if (!unmapped_page) break; @@ -2658,8 +2755,11 @@ next: sg = sg_next(sg); } - if (ops->iotlb_sync_map) - ops->iotlb_sync_map(domain, iova, mapped); + if (ops->iotlb_sync_map) { + ret = ops->iotlb_sync_map(domain, iova, mapped); + if (ret) + goto out_err; + } return mapped; out_err: @@ -2957,21 +3057,9 @@ static int iommu_setup_default_domain(struct iommu_group *group, if (req_type < 0) return -EINVAL; - /* - * There are still some drivers which don't support default domains, so - * we ignore the failure and leave group->default_domain NULL. - * - * We assume that the iommu driver starts up the device in - * 'set_platform_dma_ops' mode if it does not support default domains. - */ dom = iommu_group_alloc_default_domain(group, req_type); - if (!dom) { - /* Once in default_domain mode we never leave */ - if (group->default_domain) - return -ENODEV; - group->default_domain = NULL; - return 0; - } + if (IS_ERR(dom)) + return PTR_ERR(dom); if (group->default_domain == dom) return 0; @@ -3114,24 +3202,6 @@ out_unlock: return ret ?: count; } -static bool iommu_is_default_domain(struct iommu_group *group) -{ - if (group->domain == group->default_domain) - return true; - - /* - * If the default domain was set to identity and it is still an identity - * domain then we consider this a pass. This happens because of - * amd_iommu_init_device() replacing the default idenytity domain with an - * identity domain that has a different configuration for AMDGPU. - */ - if (group->default_domain && - group->default_domain->type == IOMMU_DOMAIN_IDENTITY && - group->domain && group->domain->type == IOMMU_DOMAIN_IDENTITY) - return true; - return false; -} - /** * iommu_device_use_default_domain() - Device driver wants to handle device * DMA through the kernel DMA API. @@ -3142,7 +3212,8 @@ static bool iommu_is_default_domain(struct iommu_group *group) */ int iommu_device_use_default_domain(struct device *dev) { - struct iommu_group *group = iommu_group_get(dev); + /* Caller is the driver core during the pre-probe path */ + struct iommu_group *group = dev->iommu_group; int ret = 0; if (!group) @@ -3150,7 +3221,7 @@ int iommu_device_use_default_domain(struct device *dev) mutex_lock(&group->mutex); if (group->owner_cnt) { - if (group->owner || !iommu_is_default_domain(group) || + if (group->domain != group->default_domain || group->owner || !xa_empty(&group->pasid_array)) { ret = -EBUSY; goto unlock_out; @@ -3161,8 +3232,6 @@ int iommu_device_use_default_domain(struct device *dev) unlock_out: mutex_unlock(&group->mutex); - iommu_group_put(group); - return ret; } @@ -3176,7 +3245,8 @@ unlock_out: */ void iommu_device_unuse_default_domain(struct device *dev) { - struct iommu_group *group = iommu_group_get(dev); + /* Caller is the driver core during the post-probe path */ + struct iommu_group *group = dev->iommu_group; if (!group) return; @@ -3186,29 +3256,27 @@ void iommu_device_unuse_default_domain(struct device *dev) group->owner_cnt--; mutex_unlock(&group->mutex); - iommu_group_put(group); } static int __iommu_group_alloc_blocking_domain(struct iommu_group *group) { - struct group_device *dev = - list_first_entry(&group->devices, struct group_device, list); + struct iommu_domain *domain; if (group->blocking_domain) return 0; - group->blocking_domain = - __iommu_domain_alloc(dev->dev->bus, IOMMU_DOMAIN_BLOCKED); - if (!group->blocking_domain) { + domain = __iommu_group_domain_alloc(group, IOMMU_DOMAIN_BLOCKED); + if (IS_ERR(domain)) { /* * For drivers that do not yet understand IOMMU_DOMAIN_BLOCKED * create an empty domain instead. */ - group->blocking_domain = __iommu_domain_alloc( - dev->dev->bus, IOMMU_DOMAIN_UNMANAGED); - if (!group->blocking_domain) - return -EINVAL; + domain = __iommu_group_domain_alloc(group, + IOMMU_DOMAIN_UNMANAGED); + if (IS_ERR(domain)) + return PTR_ERR(domain); } + group->blocking_domain = domain; return 0; } @@ -3273,13 +3341,13 @@ EXPORT_SYMBOL_GPL(iommu_group_claim_dma_owner); */ int iommu_device_claim_dma_owner(struct device *dev, void *owner) { - struct iommu_group *group; + /* Caller must be a probed driver on dev */ + struct iommu_group *group = dev->iommu_group; int ret = 0; if (WARN_ON(!owner)) return -EINVAL; - group = iommu_group_get(dev); if (!group) return -ENODEV; @@ -3296,8 +3364,6 @@ int iommu_device_claim_dma_owner(struct device *dev, void *owner) ret = __iommu_take_dma_ownership(group, owner); unlock_out: mutex_unlock(&group->mutex); - iommu_group_put(group); - return ret; } EXPORT_SYMBOL_GPL(iommu_device_claim_dma_owner); @@ -3335,7 +3401,8 @@ EXPORT_SYMBOL_GPL(iommu_group_release_dma_owner); */ void iommu_device_release_dma_owner(struct device *dev) { - struct iommu_group *group = iommu_group_get(dev); + /* Caller must be a probed driver on dev */ + struct iommu_group *group = dev->iommu_group; mutex_lock(&group->mutex); if (group->owner_cnt > 1) @@ -3343,7 +3410,6 @@ void iommu_device_release_dma_owner(struct device *dev) else __iommu_release_dma_ownership(group); mutex_unlock(&group->mutex); - iommu_group_put(group); } EXPORT_SYMBOL_GPL(iommu_device_release_dma_owner); @@ -3404,14 +3470,14 @@ static void __iommu_remove_group_pasid(struct iommu_group *group, int iommu_attach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) { - struct iommu_group *group; + /* Caller must be a probed driver on dev */ + struct iommu_group *group = dev->iommu_group; void *curr; int ret; if (!domain->ops->set_dev_pasid) return -EOPNOTSUPP; - group = iommu_group_get(dev); if (!group) return -ENODEV; @@ -3429,8 +3495,6 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, } out_unlock: mutex_unlock(&group->mutex); - iommu_group_put(group); - return ret; } EXPORT_SYMBOL_GPL(iommu_attach_device_pasid); @@ -3447,14 +3511,13 @@ EXPORT_SYMBOL_GPL(iommu_attach_device_pasid); void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) { - struct iommu_group *group = iommu_group_get(dev); + /* Caller must be a probed driver on dev */ + struct iommu_group *group = dev->iommu_group; mutex_lock(&group->mutex); __iommu_remove_group_pasid(group, pasid); WARN_ON(xa_erase(&group->pasid_array, pasid) != domain); mutex_unlock(&group->mutex); - - iommu_group_put(group); } EXPORT_SYMBOL_GPL(iommu_detach_device_pasid); @@ -3476,10 +3539,10 @@ struct iommu_domain *iommu_get_domain_for_dev_pasid(struct device *dev, ioasid_t pasid, unsigned int type) { + /* Caller must be a probed driver on dev */ + struct iommu_group *group = dev->iommu_group; struct iommu_domain *domain; - struct iommu_group *group; - group = iommu_group_get(dev); if (!group) return NULL; @@ -3488,7 +3551,6 @@ struct iommu_domain *iommu_get_domain_for_dev_pasid(struct device *dev, if (type && domain && domain->type != type) domain = ERR_PTR(-EBUSY); xa_unlock(&group->pasid_array); - iommu_group_put(group); return domain; } diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index 8aeba81800..34b4461469 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -11,3 +11,4 @@ iommufd-y := \ iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o obj-$(CONFIG_IOMMUFD) += iommufd.o +obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index ce78c36715..873630c111 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -293,7 +293,7 @@ u32 iommufd_device_to_id(struct iommufd_device *idev) EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD); static int iommufd_group_setup_msi(struct iommufd_group *igroup, - struct iommufd_hw_pagetable *hwpt) + struct iommufd_hwpt_paging *hwpt_paging) { phys_addr_t sw_msi_start = igroup->sw_msi_start; int rc; @@ -311,8 +311,9 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup, * matches what the IRQ layer actually expects in a newly created * domain. */ - if (sw_msi_start != PHYS_ADDR_MAX && !hwpt->msi_cookie) { - rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start); + if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) { + rc = iommu_get_msi_cookie(hwpt_paging->common.domain, + sw_msi_start); if (rc) return rc; @@ -320,7 +321,31 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup, * iommu_get_msi_cookie() can only be called once per domain, * it returns -EBUSY on later calls. */ - hwpt->msi_cookie = true; + hwpt_paging->msi_cookie = true; + } + return 0; +} + +static int iommufd_hwpt_paging_attach(struct iommufd_hwpt_paging *hwpt_paging, + struct iommufd_device *idev) +{ + int rc; + + lockdep_assert_held(&idev->igroup->lock); + + rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt, + idev->dev, + &idev->igroup->sw_msi_start); + if (rc) + return rc; + + if (list_empty(&idev->igroup->device_list)) { + rc = iommufd_group_setup_msi(idev->igroup, hwpt_paging); + if (rc) { + iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, + idev->dev); + return rc; + } } return 0; } @@ -337,18 +362,12 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, goto err_unlock; } - /* Try to upgrade the domain we have */ - if (idev->enforce_cache_coherency) { - rc = iommufd_hw_pagetable_enforce_cc(hwpt); + if (hwpt_is_paging(hwpt)) { + rc = iommufd_hwpt_paging_attach(to_hwpt_paging(hwpt), idev); if (rc) goto err_unlock; } - rc = iopt_table_enforce_dev_resv_regions(&hwpt->ioas->iopt, idev->dev, - &idev->igroup->sw_msi_start); - if (rc) - goto err_unlock; - /* * Only attach to the group once for the first device that is in the * group. All the other devices will follow this attachment. The user @@ -357,10 +376,6 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, * attachment. */ if (list_empty(&idev->igroup->device_list)) { - rc = iommufd_group_setup_msi(idev->igroup, hwpt); - if (rc) - goto err_unresv; - rc = iommu_attach_group(hwpt->domain, idev->igroup->group); if (rc) goto err_unresv; @@ -371,7 +386,9 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, mutex_unlock(&idev->igroup->lock); return 0; err_unresv: - iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); + if (hwpt_is_paging(hwpt)) + iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt, + idev->dev); err_unlock: mutex_unlock(&idev->igroup->lock); return rc; @@ -388,7 +405,9 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev) iommu_detach_group(hwpt->domain, idev->igroup->group); idev->igroup->hwpt = NULL; } - iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); + if (hwpt_is_paging(hwpt)) + iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt, + idev->dev); mutex_unlock(&idev->igroup->lock); /* Caller must destroy hwpt */ @@ -407,14 +426,55 @@ iommufd_device_do_attach(struct iommufd_device *idev, return NULL; } +static void +iommufd_group_remove_reserved_iova(struct iommufd_group *igroup, + struct iommufd_hwpt_paging *hwpt_paging) +{ + struct iommufd_device *cur; + + lockdep_assert_held(&igroup->lock); + + list_for_each_entry(cur, &igroup->device_list, group_item) + iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, cur->dev); +} + +static int +iommufd_group_do_replace_paging(struct iommufd_group *igroup, + struct iommufd_hwpt_paging *hwpt_paging) +{ + struct iommufd_hw_pagetable *old_hwpt = igroup->hwpt; + struct iommufd_device *cur; + int rc; + + lockdep_assert_held(&igroup->lock); + + if (!hwpt_is_paging(old_hwpt) || + hwpt_paging->ioas != to_hwpt_paging(old_hwpt)->ioas) { + list_for_each_entry(cur, &igroup->device_list, group_item) { + rc = iopt_table_enforce_dev_resv_regions( + &hwpt_paging->ioas->iopt, cur->dev, NULL); + if (rc) + goto err_unresv; + } + } + + rc = iommufd_group_setup_msi(igroup, hwpt_paging); + if (rc) + goto err_unresv; + return 0; + +err_unresv: + iommufd_group_remove_reserved_iova(igroup, hwpt_paging); + return rc; +} + static struct iommufd_hw_pagetable * iommufd_device_do_replace(struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt) { struct iommufd_group *igroup = idev->igroup; struct iommufd_hw_pagetable *old_hwpt; - unsigned int num_devices = 0; - struct iommufd_device *cur; + unsigned int num_devices; int rc; mutex_lock(&idev->igroup->lock); @@ -429,42 +489,27 @@ iommufd_device_do_replace(struct iommufd_device *idev, return NULL; } - /* Try to upgrade the domain we have */ - list_for_each_entry(cur, &igroup->device_list, group_item) { - num_devices++; - if (cur->enforce_cache_coherency) { - rc = iommufd_hw_pagetable_enforce_cc(hwpt); - if (rc) - goto err_unlock; - } - } - old_hwpt = igroup->hwpt; - if (hwpt->ioas != old_hwpt->ioas) { - list_for_each_entry(cur, &igroup->device_list, group_item) { - rc = iopt_table_enforce_dev_resv_regions( - &hwpt->ioas->iopt, cur->dev, NULL); - if (rc) - goto err_unresv; - } + if (hwpt_is_paging(hwpt)) { + rc = iommufd_group_do_replace_paging(igroup, + to_hwpt_paging(hwpt)); + if (rc) + goto err_unlock; } - rc = iommufd_group_setup_msi(idev->igroup, hwpt); - if (rc) - goto err_unresv; - rc = iommu_group_replace_domain(igroup->group, hwpt->domain); if (rc) goto err_unresv; - if (hwpt->ioas != old_hwpt->ioas) { - list_for_each_entry(cur, &igroup->device_list, group_item) - iopt_remove_reserved_iova(&old_hwpt->ioas->iopt, - cur->dev); - } + if (hwpt_is_paging(old_hwpt) && + (!hwpt_is_paging(hwpt) || + to_hwpt_paging(hwpt)->ioas != to_hwpt_paging(old_hwpt)->ioas)) + iommufd_group_remove_reserved_iova(igroup, + to_hwpt_paging(old_hwpt)); igroup->hwpt = hwpt; + num_devices = list_count_nodes(&igroup->device_list); /* * Move the refcounts held by the device_list to the new hwpt. Retain a * refcount for this thread as the caller will free it. @@ -478,8 +523,9 @@ iommufd_device_do_replace(struct iommufd_device *idev, /* Caller must destroy old_hwpt */ return old_hwpt; err_unresv: - list_for_each_entry(cur, &igroup->device_list, group_item) - iopt_remove_reserved_iova(&hwpt->ioas->iopt, cur->dev); + if (hwpt_is_paging(hwpt)) + iommufd_group_remove_reserved_iova(igroup, + to_hwpt_paging(old_hwpt)); err_unlock: mutex_unlock(&idev->igroup->lock); return ERR_PTR(rc); @@ -507,6 +553,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, */ bool immediate_attach = do_attach == iommufd_device_do_attach; struct iommufd_hw_pagetable *destroy_hwpt; + struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_hw_pagetable *hwpt; /* @@ -515,15 +562,16 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, * other. */ mutex_lock(&ioas->mutex); - list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) { - if (!hwpt->auto_domain) + list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) { + if (!hwpt_paging->auto_domain) continue; + hwpt = &hwpt_paging->common; if (!iommufd_lock_obj(&hwpt->obj)) continue; destroy_hwpt = (*do_attach)(idev, hwpt); if (IS_ERR(destroy_hwpt)) { - iommufd_put_object(&hwpt->obj); + iommufd_put_object(idev->ictx, &hwpt->obj); /* * -EINVAL means the domain is incompatible with the * device. Other error codes should propagate to @@ -535,16 +583,17 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, goto out_unlock; } *pt_id = hwpt->obj.id; - iommufd_put_object(&hwpt->obj); + iommufd_put_object(idev->ictx, &hwpt->obj); goto out_unlock; } - hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev, - immediate_attach); - if (IS_ERR(hwpt)) { - destroy_hwpt = ERR_CAST(hwpt); + hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, 0, + immediate_attach, NULL); + if (IS_ERR(hwpt_paging)) { + destroy_hwpt = ERR_CAST(hwpt_paging); goto out_unlock; } + hwpt = &hwpt_paging->common; if (!immediate_attach) { destroy_hwpt = (*do_attach)(idev, hwpt); @@ -554,7 +603,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, destroy_hwpt = NULL; } - hwpt->auto_domain = true; + hwpt_paging->auto_domain = true; *pt_id = hwpt->obj.id; iommufd_object_finalize(idev->ictx, &hwpt->obj); @@ -579,7 +628,8 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, return PTR_ERR(pt_obj); switch (pt_obj->type) { - case IOMMUFD_OBJ_HW_PAGETABLE: { + case IOMMUFD_OBJ_HWPT_NESTED: + case IOMMUFD_OBJ_HWPT_PAGING: { struct iommufd_hw_pagetable *hwpt = container_of(pt_obj, struct iommufd_hw_pagetable, obj); @@ -602,7 +652,7 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, destroy_hwpt = ERR_PTR(-EINVAL); goto out_put_pt_obj; } - iommufd_put_object(pt_obj); + iommufd_put_object(idev->ictx, pt_obj); /* This destruction has to be after we unlock everything */ if (destroy_hwpt) @@ -610,15 +660,15 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, return 0; out_put_pt_obj: - iommufd_put_object(pt_obj); + iommufd_put_object(idev->ictx, pt_obj); return PTR_ERR(destroy_hwpt); } /** * iommufd_device_attach - Connect a device to an iommu_domain * @idev: device to attach - * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE - * Output the IOMMUFD_OBJ_HW_PAGETABLE ID + * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING + * Output the IOMMUFD_OBJ_HWPT_PAGING ID * * This connects the device to an iommu_domain, either automatically or manually * selected. Once this completes the device could do DMA. @@ -646,8 +696,8 @@ EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD); /** * iommufd_device_replace - Change the device's iommu_domain * @idev: device to change - * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE - * Output the IOMMUFD_OBJ_HW_PAGETABLE ID + * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING + * Output the IOMMUFD_OBJ_HWPT_PAGING ID * * This is the same as:: * @@ -742,7 +792,7 @@ static int iommufd_access_change_ioas_id(struct iommufd_access *access, u32 id) if (IS_ERR(ioas)) return PTR_ERR(ioas); rc = iommufd_access_change_ioas(access, ioas); - iommufd_put_object(&ioas->obj); + iommufd_put_object(access->ictx, &ioas->obj); return rc; } @@ -891,7 +941,7 @@ void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, access->ops->unmap(access->data, iova, length); - iommufd_put_object(&access->obj); + iommufd_put_object(access->ictx, &access->obj); xa_lock(&ioas->iopt.access_list); } xa_unlock(&ioas->iopt.access_list); @@ -1185,10 +1235,14 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) */ cmd->data_len = data_len; + cmd->out_capabilities = 0; + if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING)) + cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_free: kfree(data); out_put: - iommufd_put_object(&idev->obj); + iommufd_put_object(ucmd->ictx, &idev->obj); return rc; } diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index cf2c1504e2..6f680959b2 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -5,62 +5,87 @@ #include <linux/iommu.h> #include <uapi/linux/iommufd.h> +#include "../iommu-priv.h" #include "iommufd_private.h" -void iommufd_hw_pagetable_destroy(struct iommufd_object *obj) +void iommufd_hwpt_paging_destroy(struct iommufd_object *obj) { - struct iommufd_hw_pagetable *hwpt = - container_of(obj, struct iommufd_hw_pagetable, obj); + struct iommufd_hwpt_paging *hwpt_paging = + container_of(obj, struct iommufd_hwpt_paging, common.obj); - if (!list_empty(&hwpt->hwpt_item)) { - mutex_lock(&hwpt->ioas->mutex); - list_del(&hwpt->hwpt_item); - mutex_unlock(&hwpt->ioas->mutex); + if (!list_empty(&hwpt_paging->hwpt_item)) { + mutex_lock(&hwpt_paging->ioas->mutex); + list_del(&hwpt_paging->hwpt_item); + mutex_unlock(&hwpt_paging->ioas->mutex); - iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain); + iopt_table_remove_domain(&hwpt_paging->ioas->iopt, + hwpt_paging->common.domain); } - if (hwpt->domain) - iommu_domain_free(hwpt->domain); + if (hwpt_paging->common.domain) + iommu_domain_free(hwpt_paging->common.domain); - refcount_dec(&hwpt->ioas->obj.users); + refcount_dec(&hwpt_paging->ioas->obj.users); } -void iommufd_hw_pagetable_abort(struct iommufd_object *obj) +void iommufd_hwpt_paging_abort(struct iommufd_object *obj) { - struct iommufd_hw_pagetable *hwpt = - container_of(obj, struct iommufd_hw_pagetable, obj); + struct iommufd_hwpt_paging *hwpt_paging = + container_of(obj, struct iommufd_hwpt_paging, common.obj); /* The ioas->mutex must be held until finalize is called. */ - lockdep_assert_held(&hwpt->ioas->mutex); + lockdep_assert_held(&hwpt_paging->ioas->mutex); - if (!list_empty(&hwpt->hwpt_item)) { - list_del_init(&hwpt->hwpt_item); - iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain); + if (!list_empty(&hwpt_paging->hwpt_item)) { + list_del_init(&hwpt_paging->hwpt_item); + iopt_table_remove_domain(&hwpt_paging->ioas->iopt, + hwpt_paging->common.domain); } - iommufd_hw_pagetable_destroy(obj); + iommufd_hwpt_paging_destroy(obj); } -int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt) +void iommufd_hwpt_nested_destroy(struct iommufd_object *obj) { - if (hwpt->enforce_cache_coherency) + struct iommufd_hwpt_nested *hwpt_nested = + container_of(obj, struct iommufd_hwpt_nested, common.obj); + + if (hwpt_nested->common.domain) + iommu_domain_free(hwpt_nested->common.domain); + + refcount_dec(&hwpt_nested->parent->common.obj.users); +} + +void iommufd_hwpt_nested_abort(struct iommufd_object *obj) +{ + iommufd_hwpt_nested_destroy(obj); +} + +static int +iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging) +{ + struct iommu_domain *paging_domain = hwpt_paging->common.domain; + + if (hwpt_paging->enforce_cache_coherency) return 0; - if (hwpt->domain->ops->enforce_cache_coherency) - hwpt->enforce_cache_coherency = - hwpt->domain->ops->enforce_cache_coherency( - hwpt->domain); - if (!hwpt->enforce_cache_coherency) + if (paging_domain->ops->enforce_cache_coherency) + hwpt_paging->enforce_cache_coherency = + paging_domain->ops->enforce_cache_coherency( + paging_domain); + if (!hwpt_paging->enforce_cache_coherency) return -EINVAL; return 0; } /** - * iommufd_hw_pagetable_alloc() - Get an iommu_domain for a device + * iommufd_hwpt_paging_alloc() - Get a PAGING iommu_domain for a device * @ictx: iommufd context * @ioas: IOAS to associate the domain with * @idev: Device to get an iommu_domain for + * @flags: Flags from userspace * @immediate_attach: True if idev should be attached to the hwpt + * @user_data: The user provided driver specific data describing the domain to + * create * * Allocate a new iommu_domain and return it as a hw_pagetable. The HWPT * will be linked to the given ioas and upon return the underlying iommu_domain @@ -70,28 +95,52 @@ int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt) * iommufd_object_abort_and_destroy() or iommufd_object_finalize() is called on * the returned hwpt. */ -struct iommufd_hw_pagetable * -iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, - struct iommufd_device *idev, bool immediate_attach) +struct iommufd_hwpt_paging * +iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, + struct iommufd_device *idev, u32 flags, + bool immediate_attach, + const struct iommu_user_data *user_data) { + const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + const struct iommu_ops *ops = dev_iommu_ops(idev->dev); + struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_hw_pagetable *hwpt; int rc; lockdep_assert_held(&ioas->mutex); - hwpt = iommufd_object_alloc(ictx, hwpt, IOMMUFD_OBJ_HW_PAGETABLE); - if (IS_ERR(hwpt)) - return hwpt; + if ((flags || user_data) && !ops->domain_alloc_user) + return ERR_PTR(-EOPNOTSUPP); + if (flags & ~valid_flags) + return ERR_PTR(-EOPNOTSUPP); + + hwpt_paging = __iommufd_object_alloc( + ictx, hwpt_paging, IOMMUFD_OBJ_HWPT_PAGING, common.obj); + if (IS_ERR(hwpt_paging)) + return ERR_CAST(hwpt_paging); + hwpt = &hwpt_paging->common; - INIT_LIST_HEAD(&hwpt->hwpt_item); + INIT_LIST_HEAD(&hwpt_paging->hwpt_item); /* Pairs with iommufd_hw_pagetable_destroy() */ refcount_inc(&ioas->obj.users); - hwpt->ioas = ioas; + hwpt_paging->ioas = ioas; + hwpt_paging->nest_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; - hwpt->domain = iommu_domain_alloc(idev->dev->bus); - if (!hwpt->domain) { - rc = -ENOMEM; - goto out_abort; + if (ops->domain_alloc_user) { + hwpt->domain = ops->domain_alloc_user(idev->dev, flags, NULL, + user_data); + if (IS_ERR(hwpt->domain)) { + rc = PTR_ERR(hwpt->domain); + hwpt->domain = NULL; + goto out_abort; + } + } else { + hwpt->domain = iommu_domain_alloc(idev->dev->bus); + if (!hwpt->domain) { + rc = -ENOMEM; + goto out_abort; + } } /* @@ -100,9 +149,16 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, * doing any maps. It is an iommu driver bug to report * IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail enforce_cache_coherency on * a new domain. + * + * The cache coherency mode must be configured here and unchanged later. + * Note that a HWPT (non-CC) created for a device (non-CC) can be later + * reused by another device (either non-CC or CC). However, A HWPT (CC) + * created for a device (CC) cannot be reused by another device (non-CC) + * but only devices (CC). Instead user space in this case would need to + * allocate a separate HWPT (non-CC). */ if (idev->enforce_cache_coherency) { - rc = iommufd_hw_pagetable_enforce_cc(hwpt); + rc = iommufd_hwpt_paging_enforce_cc(hwpt_paging); if (WARN_ON(rc)) goto out_abort; } @@ -119,11 +175,11 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, goto out_abort; } - rc = iopt_table_add_domain(&hwpt->ioas->iopt, hwpt->domain); + rc = iopt_table_add_domain(&ioas->iopt, hwpt->domain); if (rc) goto out_detach; - list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list); - return hwpt; + list_add_tail(&hwpt_paging->hwpt_item, &ioas->hwpt_list); + return hwpt_paging; out_detach: if (immediate_attach) @@ -133,32 +189,121 @@ out_abort: return ERR_PTR(rc); } +/** + * iommufd_hwpt_nested_alloc() - Get a NESTED iommu_domain for a device + * @ictx: iommufd context + * @parent: Parent PAGING-type hwpt to associate the domain with + * @idev: Device to get an iommu_domain for + * @flags: Flags from userspace + * @user_data: user_data pointer. Must be valid + * + * Allocate a new iommu_domain (must be IOMMU_DOMAIN_NESTED) and return it as + * a NESTED hw_pagetable. The given parent PAGING-type hwpt must be capable of + * being a parent. + */ +static struct iommufd_hwpt_nested * +iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, + struct iommufd_hwpt_paging *parent, + struct iommufd_device *idev, u32 flags, + const struct iommu_user_data *user_data) +{ + const struct iommu_ops *ops = dev_iommu_ops(idev->dev); + struct iommufd_hwpt_nested *hwpt_nested; + struct iommufd_hw_pagetable *hwpt; + int rc; + + if (flags || !user_data->len || !ops->domain_alloc_user) + return ERR_PTR(-EOPNOTSUPP); + if (parent->auto_domain || !parent->nest_parent) + return ERR_PTR(-EINVAL); + + hwpt_nested = __iommufd_object_alloc( + ictx, hwpt_nested, IOMMUFD_OBJ_HWPT_NESTED, common.obj); + if (IS_ERR(hwpt_nested)) + return ERR_CAST(hwpt_nested); + hwpt = &hwpt_nested->common; + + refcount_inc(&parent->common.obj.users); + hwpt_nested->parent = parent; + + hwpt->domain = ops->domain_alloc_user(idev->dev, flags, + parent->common.domain, user_data); + if (IS_ERR(hwpt->domain)) { + rc = PTR_ERR(hwpt->domain); + hwpt->domain = NULL; + goto out_abort; + } + + if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { + rc = -EINVAL; + goto out_abort; + } + return hwpt_nested; + +out_abort: + iommufd_object_abort_and_destroy(ictx, &hwpt->obj); + return ERR_PTR(rc); +} + int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_alloc *cmd = ucmd->cmd; + const struct iommu_user_data user_data = { + .type = cmd->data_type, + .uptr = u64_to_user_ptr(cmd->data_uptr), + .len = cmd->data_len, + }; struct iommufd_hw_pagetable *hwpt; + struct iommufd_ioas *ioas = NULL; + struct iommufd_object *pt_obj; struct iommufd_device *idev; - struct iommufd_ioas *ioas; int rc; - if (cmd->flags || cmd->__reserved) + if (cmd->__reserved) return -EOPNOTSUPP; + if ((cmd->data_type == IOMMU_HWPT_DATA_NONE && cmd->data_len) || + (cmd->data_type != IOMMU_HWPT_DATA_NONE && !cmd->data_len)) + return -EINVAL; idev = iommufd_get_device(ucmd, cmd->dev_id); if (IS_ERR(idev)) return PTR_ERR(idev); - ioas = iommufd_get_ioas(ucmd->ictx, cmd->pt_id); - if (IS_ERR(ioas)) { - rc = PTR_ERR(ioas); + pt_obj = iommufd_get_object(ucmd->ictx, cmd->pt_id, IOMMUFD_OBJ_ANY); + if (IS_ERR(pt_obj)) { + rc = -EINVAL; goto out_put_idev; } - mutex_lock(&ioas->mutex); - hwpt = iommufd_hw_pagetable_alloc(ucmd->ictx, ioas, idev, false); - if (IS_ERR(hwpt)) { - rc = PTR_ERR(hwpt); - goto out_unlock; + if (pt_obj->type == IOMMUFD_OBJ_IOAS) { + struct iommufd_hwpt_paging *hwpt_paging; + + ioas = container_of(pt_obj, struct iommufd_ioas, obj); + mutex_lock(&ioas->mutex); + hwpt_paging = iommufd_hwpt_paging_alloc( + ucmd->ictx, ioas, idev, cmd->flags, false, + user_data.len ? &user_data : NULL); + if (IS_ERR(hwpt_paging)) { + rc = PTR_ERR(hwpt_paging); + goto out_unlock; + } + hwpt = &hwpt_paging->common; + } else if (pt_obj->type == IOMMUFD_OBJ_HWPT_PAGING) { + struct iommufd_hwpt_nested *hwpt_nested; + + hwpt_nested = iommufd_hwpt_nested_alloc( + ucmd->ictx, + container_of(pt_obj, struct iommufd_hwpt_paging, + common.obj), + idev, cmd->flags, &user_data); + if (IS_ERR(hwpt_nested)) { + rc = PTR_ERR(hwpt_nested); + goto out_unlock; + } + hwpt = &hwpt_nested->common; + } else { + rc = -EINVAL; + goto out_put_pt; } cmd->out_hwpt_id = hwpt->obj.id; @@ -171,9 +316,59 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) out_hwpt: iommufd_object_abort_and_destroy(ucmd->ictx, &hwpt->obj); out_unlock: - mutex_unlock(&ioas->mutex); - iommufd_put_object(&ioas->obj); + if (ioas) + mutex_unlock(&ioas->mutex); +out_put_pt: + iommufd_put_object(ucmd->ictx, pt_obj); out_put_idev: - iommufd_put_object(&idev->obj); + iommufd_put_object(ucmd->ictx, &idev->obj); + return rc; +} + +int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd) +{ + struct iommu_hwpt_set_dirty_tracking *cmd = ucmd->cmd; + struct iommufd_hwpt_paging *hwpt_paging; + struct iommufd_ioas *ioas; + int rc = -EOPNOTSUPP; + bool enable; + + if (cmd->flags & ~IOMMU_HWPT_DIRTY_TRACKING_ENABLE) + return rc; + + hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id); + if (IS_ERR(hwpt_paging)) + return PTR_ERR(hwpt_paging); + + ioas = hwpt_paging->ioas; + enable = cmd->flags & IOMMU_HWPT_DIRTY_TRACKING_ENABLE; + + rc = iopt_set_dirty_tracking(&ioas->iopt, hwpt_paging->common.domain, + enable); + + iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj); + return rc; +} + +int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd) +{ + struct iommu_hwpt_get_dirty_bitmap *cmd = ucmd->cmd; + struct iommufd_hwpt_paging *hwpt_paging; + struct iommufd_ioas *ioas; + int rc = -EOPNOTSUPP; + + if ((cmd->flags & ~(IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR)) || + cmd->__reserved) + return -EOPNOTSUPP; + + hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id); + if (IS_ERR(hwpt_paging)) + return PTR_ERR(hwpt_paging); + + ioas = hwpt_paging->ioas; + rc = iopt_read_and_clear_dirty_data( + &ioas->iopt, hwpt_paging->common.domain, cmd->flags, cmd); + + iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj); return rc; } diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 117a39ae2e..504ac1b01b 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -15,6 +15,7 @@ #include <linux/err.h> #include <linux/slab.h> #include <linux/errno.h> +#include <uapi/linux/iommufd.h> #include "io_pagetable.h" #include "double_span.h" @@ -424,6 +425,177 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, return 0; } +struct iova_bitmap_fn_arg { + unsigned long flags; + struct io_pagetable *iopt; + struct iommu_domain *domain; + struct iommu_dirty_bitmap *dirty; +}; + +static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap, + unsigned long iova, size_t length, + void *opaque) +{ + struct iopt_area *area; + struct iopt_area_contig_iter iter; + struct iova_bitmap_fn_arg *arg = opaque; + struct iommu_domain *domain = arg->domain; + struct iommu_dirty_bitmap *dirty = arg->dirty; + const struct iommu_dirty_ops *ops = domain->dirty_ops; + unsigned long last_iova = iova + length - 1; + unsigned long flags = arg->flags; + int ret; + + iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) { + unsigned long last = min(last_iova, iopt_area_last_iova(area)); + + ret = ops->read_and_clear_dirty(domain, iter.cur_iova, + last - iter.cur_iova + 1, flags, + dirty); + if (ret) + return ret; + } + + if (!iopt_area_contig_done(&iter)) + return -EINVAL; + return 0; +} + +static int +iommu_read_and_clear_dirty(struct iommu_domain *domain, + struct io_pagetable *iopt, unsigned long flags, + struct iommu_hwpt_get_dirty_bitmap *bitmap) +{ + const struct iommu_dirty_ops *ops = domain->dirty_ops; + struct iommu_iotlb_gather gather; + struct iommu_dirty_bitmap dirty; + struct iova_bitmap_fn_arg arg; + struct iova_bitmap *iter; + int ret = 0; + + if (!ops || !ops->read_and_clear_dirty) + return -EOPNOTSUPP; + + iter = iova_bitmap_alloc(bitmap->iova, bitmap->length, + bitmap->page_size, + u64_to_user_ptr(bitmap->data)); + if (IS_ERR(iter)) + return -ENOMEM; + + iommu_dirty_bitmap_init(&dirty, iter, &gather); + + arg.flags = flags; + arg.iopt = iopt; + arg.domain = domain; + arg.dirty = &dirty; + iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty); + + if (!(flags & IOMMU_DIRTY_NO_CLEAR)) + iommu_iotlb_sync(domain, &gather); + + iova_bitmap_free(iter); + + return ret; +} + +int iommufd_check_iova_range(struct io_pagetable *iopt, + struct iommu_hwpt_get_dirty_bitmap *bitmap) +{ + size_t iommu_pgsize = iopt->iova_alignment; + u64 last_iova; + + if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova)) + return -EOVERFLOW; + + if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX) + return -EOVERFLOW; + + if ((bitmap->iova & (iommu_pgsize - 1)) || + ((last_iova + 1) & (iommu_pgsize - 1))) + return -EINVAL; + + if (!bitmap->page_size) + return -EINVAL; + + if ((bitmap->iova & (bitmap->page_size - 1)) || + ((last_iova + 1) & (bitmap->page_size - 1))) + return -EINVAL; + + return 0; +} + +int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt, + struct iommu_domain *domain, + unsigned long flags, + struct iommu_hwpt_get_dirty_bitmap *bitmap) +{ + int ret; + + ret = iommufd_check_iova_range(iopt, bitmap); + if (ret) + return ret; + + down_read(&iopt->iova_rwsem); + ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap); + up_read(&iopt->iova_rwsem); + + return ret; +} + +static int iopt_clear_dirty_data(struct io_pagetable *iopt, + struct iommu_domain *domain) +{ + const struct iommu_dirty_ops *ops = domain->dirty_ops; + struct iommu_iotlb_gather gather; + struct iommu_dirty_bitmap dirty; + struct iopt_area *area; + int ret = 0; + + lockdep_assert_held_read(&iopt->iova_rwsem); + + iommu_dirty_bitmap_init(&dirty, NULL, &gather); + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + if (!area->pages) + continue; + + ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area), + iopt_area_length(area), 0, + &dirty); + if (ret) + break; + } + + iommu_iotlb_sync(domain, &gather); + return ret; +} + +int iopt_set_dirty_tracking(struct io_pagetable *iopt, + struct iommu_domain *domain, bool enable) +{ + const struct iommu_dirty_ops *ops = domain->dirty_ops; + int ret = 0; + + if (!ops) + return -EOPNOTSUPP; + + down_read(&iopt->iova_rwsem); + + /* Clear dirty bits from PTEs to ensure a clean snapshot */ + if (enable) { + ret = iopt_clear_dirty_data(iopt, domain); + if (ret) + goto out_unlock; + } + + ret = ops->set_dirty_tracking(domain, enable); + +out_unlock: + up_read(&iopt->iova_rwsem); + return ret; +} + int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova, unsigned long length, struct list_head *pages_list) { diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index d5624577f7..7422482765 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -105,7 +105,7 @@ int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd) rc = -EMSGSIZE; out_put: up_read(&ioas->iopt.iova_rwsem); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return rc; } @@ -175,7 +175,7 @@ out_free: interval_tree_remove(node, &allowed_iova); kfree(container_of(node, struct iopt_allowed, node)); } - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return rc; } @@ -228,7 +228,7 @@ int iommufd_ioas_map(struct iommufd_ucmd *ucmd) cmd->iova = iova; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_put: - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return rc; } @@ -258,7 +258,7 @@ int iommufd_ioas_copy(struct iommufd_ucmd *ucmd) return PTR_ERR(src_ioas); rc = iopt_get_pages(&src_ioas->iopt, cmd->src_iova, cmd->length, &pages_list); - iommufd_put_object(&src_ioas->obj); + iommufd_put_object(ucmd->ictx, &src_ioas->obj); if (rc) return rc; @@ -279,7 +279,7 @@ int iommufd_ioas_copy(struct iommufd_ucmd *ucmd) cmd->dst_iova = iova; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_put_dst: - iommufd_put_object(&dst_ioas->obj); + iommufd_put_object(ucmd->ictx, &dst_ioas->obj); out_pages: iopt_free_pages_list(&pages_list); return rc; @@ -315,7 +315,7 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd) rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_put: - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return rc; } @@ -393,6 +393,6 @@ int iommufd_ioas_option(struct iommufd_ucmd *ucmd) rc = -EOPNOTSUPP; } - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return rc; } diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 2c58670011..abae041e25 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -8,6 +8,9 @@ #include <linux/xarray.h> #include <linux/refcount.h> #include <linux/uaccess.h> +#include <linux/iommu.h> +#include <linux/iova_bitmap.h> +#include <uapi/linux/iommufd.h> struct iommu_domain; struct iommu_group; @@ -18,6 +21,7 @@ struct iommufd_ctx { struct file *file; struct xarray objects; struct xarray groups; + wait_queue_head_t destroy_wait; u8 account_mode; /* Compatibility with VFIO no iommu */ @@ -70,6 +74,13 @@ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, unsigned long length, unsigned long *unmapped); int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped); +int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt, + struct iommu_domain *domain, + unsigned long flags, + struct iommu_hwpt_get_dirty_bitmap *bitmap); +int iopt_set_dirty_tracking(struct io_pagetable *iopt, + struct iommu_domain *domain, bool enable); + void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, unsigned long length); int iopt_table_add_domain(struct io_pagetable *iopt, @@ -113,7 +124,8 @@ enum iommufd_object_type { IOMMUFD_OBJ_NONE, IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, IOMMUFD_OBJ_DEVICE, - IOMMUFD_OBJ_HW_PAGETABLE, + IOMMUFD_OBJ_HWPT_PAGING, + IOMMUFD_OBJ_HWPT_NESTED, IOMMUFD_OBJ_IOAS, IOMMUFD_OBJ_ACCESS, #ifdef CONFIG_IOMMUFD_TEST @@ -124,7 +136,7 @@ enum iommufd_object_type { /* Base struct for all objects with a userspace ID handle. */ struct iommufd_object { - struct rw_semaphore destroy_rwsem; + refcount_t shortterm_users; refcount_t users; enum iommufd_object_type type; unsigned int id; @@ -132,10 +144,15 @@ struct iommufd_object { static inline bool iommufd_lock_obj(struct iommufd_object *obj) { - if (!down_read_trylock(&obj->destroy_rwsem)) + if (!refcount_inc_not_zero(&obj->users)) return false; - if (!refcount_inc_not_zero(&obj->users)) { - up_read(&obj->destroy_rwsem); + if (!refcount_inc_not_zero(&obj->shortterm_users)) { + /* + * If the caller doesn't already have a ref on obj this must be + * called under the xa_lock. Otherwise the caller is holding a + * ref on users. Thus it cannot be one before this decrement. + */ + refcount_dec(&obj->users); return false; } return true; @@ -143,10 +160,16 @@ static inline bool iommufd_lock_obj(struct iommufd_object *obj) struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, enum iommufd_object_type type); -static inline void iommufd_put_object(struct iommufd_object *obj) +static inline void iommufd_put_object(struct iommufd_ctx *ictx, + struct iommufd_object *obj) { + /* + * Users first, then shortterm so that REMOVE_WAIT_SHORTTERM never sees + * a spurious !0 users with a 0 shortterm_users. + */ refcount_dec(&obj->users); - up_read(&obj->destroy_rwsem); + if (refcount_dec_and_test(&obj->shortterm_users)) + wake_up_interruptible_all(&ictx->destroy_wait); } void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj); @@ -154,24 +177,56 @@ void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, struct iommufd_object *obj); void iommufd_object_finalize(struct iommufd_ctx *ictx, struct iommufd_object *obj); -void __iommufd_object_destroy_user(struct iommufd_ctx *ictx, - struct iommufd_object *obj, bool allow_fail); + +enum { + REMOVE_WAIT_SHORTTERM = 1, +}; +int iommufd_object_remove(struct iommufd_ctx *ictx, + struct iommufd_object *to_destroy, u32 id, + unsigned int flags); + +/* + * The caller holds a users refcount and wants to destroy the object. At this + * point the caller has no shortterm_users reference and at least the xarray + * will be holding one. + */ static inline void iommufd_object_destroy_user(struct iommufd_ctx *ictx, struct iommufd_object *obj) { - __iommufd_object_destroy_user(ictx, obj, false); + int ret; + + ret = iommufd_object_remove(ictx, obj, obj->id, REMOVE_WAIT_SHORTTERM); + + /* + * If there is a bug and we couldn't destroy the object then we did put + * back the caller's users refcount and will eventually try to free it + * again during close. + */ + WARN_ON(ret); } -static inline void iommufd_object_deref_user(struct iommufd_ctx *ictx, - struct iommufd_object *obj) + +/* + * The HWPT allocated by autodomains is used in possibly many devices and + * is automatically destroyed when its refcount reaches zero. + * + * If userspace uses the HWPT manually, even for a short term, then it will + * disrupt this refcounting and the auto-free in the kernel will not work. + * Userspace that tries to use the automatically allocated HWPT must be careful + * to ensure that it is consistently destroyed, eg by not racing accesses + * and by not attaching an automatic HWPT to a device manually. + */ +static inline void +iommufd_object_put_and_try_destroy(struct iommufd_ctx *ictx, + struct iommufd_object *obj) { - __iommufd_object_destroy_user(ictx, obj, true); + iommufd_object_remove(ictx, obj, obj->id, 0); } struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, enum iommufd_object_type type); -#define iommufd_object_alloc(ictx, ptr, type) \ +#define __iommufd_object_alloc(ictx, ptr, type, obj) \ container_of(_iommufd_object_alloc( \ ictx, \ sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \ @@ -180,6 +235,9 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, type), \ typeof(*(ptr)), obj) +#define iommufd_object_alloc(ictx, ptr, type) \ + __iommufd_object_alloc(ictx, ptr, type, obj) + /* * The IO Address Space (IOAS) pagetable is a virtual page table backed by the * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The @@ -222,6 +280,8 @@ int iommufd_option_rlimit_mode(struct iommu_option *cmd, struct iommufd_ctx *ictx); int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd); +int iommufd_check_iova_range(struct io_pagetable *iopt, + struct iommu_hwpt_get_dirty_bitmap *bitmap); /* * A HW pagetable is called an iommu_domain inside the kernel. This user object @@ -231,35 +291,75 @@ int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd); */ struct iommufd_hw_pagetable { struct iommufd_object obj; - struct iommufd_ioas *ioas; struct iommu_domain *domain; +}; + +struct iommufd_hwpt_paging { + struct iommufd_hw_pagetable common; + struct iommufd_ioas *ioas; bool auto_domain : 1; bool enforce_cache_coherency : 1; bool msi_cookie : 1; + bool nest_parent : 1; /* Head at iommufd_ioas::hwpt_list */ struct list_head hwpt_item; }; -struct iommufd_hw_pagetable * -iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, - struct iommufd_device *idev, bool immediate_attach); -int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt); +struct iommufd_hwpt_nested { + struct iommufd_hw_pagetable common; + struct iommufd_hwpt_paging *parent; +}; + +static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt) +{ + return hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING; +} + +static inline struct iommufd_hwpt_paging * +to_hwpt_paging(struct iommufd_hw_pagetable *hwpt) +{ + return container_of(hwpt, struct iommufd_hwpt_paging, common); +} + +static inline struct iommufd_hwpt_paging * +iommufd_get_hwpt_paging(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_HWPT_PAGING), + struct iommufd_hwpt_paging, common.obj); +} +int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd); +int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd); + +struct iommufd_hwpt_paging * +iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, + struct iommufd_device *idev, u32 flags, + bool immediate_attach, + const struct iommu_user_data *user_data); int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev); struct iommufd_hw_pagetable * iommufd_hw_pagetable_detach(struct iommufd_device *idev); -void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); -void iommufd_hw_pagetable_abort(struct iommufd_object *obj); +void iommufd_hwpt_paging_destroy(struct iommufd_object *obj); +void iommufd_hwpt_paging_abort(struct iommufd_object *obj); +void iommufd_hwpt_nested_destroy(struct iommufd_object *obj); +void iommufd_hwpt_nested_abort(struct iommufd_object *obj); int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd); static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt) { - lockdep_assert_not_held(&hwpt->ioas->mutex); - if (hwpt->auto_domain) - iommufd_object_deref_user(ictx, &hwpt->obj); - else - refcount_dec(&hwpt->obj.users); + if (hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING) { + struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt); + + lockdep_assert_not_held(&hwpt_paging->ioas->mutex); + + if (hwpt_paging->auto_domain) { + iommufd_object_put_and_try_destroy(ictx, &hwpt->obj); + return; + } + } + refcount_dec(&hwpt->obj.users); } struct iommufd_group { diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 3f3644375b..7910fbe196 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -19,6 +19,8 @@ enum { IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT, IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE, IOMMU_TEST_OP_ACCESS_REPLACE_IOAS, + IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS, + IOMMU_TEST_OP_DIRTY, }; enum { @@ -40,6 +42,15 @@ enum { MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES = 1 << 0, }; +enum { + MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0, +}; + +enum { + MOCK_NESTED_DOMAIN_IOTLB_ID_MAX = 3, + MOCK_NESTED_DOMAIN_IOTLB_NUM = 4, +}; + struct iommu_test_cmd { __u32 size; __u32 op; @@ -57,6 +68,13 @@ struct iommu_test_cmd { __u32 out_idev_id; } mock_domain; struct { + __u32 out_stdev_id; + __u32 out_hwpt_id; + __u32 out_idev_id; + /* Expand mock_domain to set mock device flags */ + __u32 dev_flags; + } mock_domain_flags; + struct { __u32 pt_id; } mock_domain_replace; struct { @@ -95,6 +113,14 @@ struct iommu_test_cmd { struct { __u32 ioas_id; } access_replace_ioas; + struct { + __u32 flags; + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 page_size; + __aligned_u64 uptr; + __aligned_u64 out_nr_dirty; + } dirty; }; __u32 last; }; @@ -109,4 +135,17 @@ struct iommu_test_hw_info { __u32 test_reg; }; +/* Should not be equal to any defined value in enum iommu_hwpt_data_type */ +#define IOMMU_HWPT_DATA_SELFTEST 0xdead +#define IOMMU_TEST_IOTLB_DEFAULT 0xbadbeef + +/** + * struct iommu_hwpt_selftest + * + * @iotlb: default mock iotlb value, IOMMU_TEST_IOTLB_DEFAULT + */ +struct iommu_hwpt_selftest { + __u32 iotlb; +}; + #endif diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c new file mode 100644 index 0000000000..db8c46bee1 --- /dev/null +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -0,0 +1,474 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022, Oracle and/or its affiliates. + * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ +#include <linux/iova_bitmap.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/highmem.h> + +#define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE) + +/* + * struct iova_bitmap_map - A bitmap representing an IOVA range + * + * Main data structure for tracking mapped user pages of bitmap data. + * + * For example, for something recording dirty IOVAs, it will be provided a + * struct iova_bitmap structure, as a general structure for iterating the + * total IOVA range. The struct iova_bitmap_map, though, represents the + * subset of said IOVA space that is pinned by its parent structure (struct + * iova_bitmap). + * + * The user does not need to exact location of the bits in the bitmap. + * From user perspective the only API available is iova_bitmap_set() which + * records the IOVA *range* in the bitmap by setting the corresponding + * bits. + * + * The bitmap is an array of u64 whereas each bit represents an IOVA of + * range of (1 << pgshift). Thus formula for the bitmap data to be set is: + * + * data[(iova / page_size) / 64] & (1ULL << (iova % 64)) + */ +struct iova_bitmap_map { + /* base IOVA representing bit 0 of the first page */ + unsigned long iova; + + /* page size order that each bit granules to */ + unsigned long pgshift; + + /* page offset of the first user page pinned */ + unsigned long pgoff; + + /* number of pages pinned */ + unsigned long npages; + + /* pinned pages representing the bitmap data */ + struct page **pages; +}; + +/* + * struct iova_bitmap - The IOVA bitmap object + * + * Main data structure for iterating over the bitmap data. + * + * Abstracts the pinning work and iterates in IOVA ranges. + * It uses a windowing scheme and pins the bitmap in relatively + * big ranges e.g. + * + * The bitmap object uses one base page to store all the pinned pages + * pointers related to the bitmap. For sizeof(struct page*) == 8 it stores + * 512 struct page pointers which, if the base page size is 4K, it means + * 2M of bitmap data is pinned at a time. If the iova_bitmap page size is + * also 4K then the range window to iterate is 64G. + * + * For example iterating on a total IOVA range of 4G..128G, it will walk + * through this set of ranges: + * + * 4G - 68G-1 (64G) + * 68G - 128G-1 (64G) + * + * An example of the APIs on how to use/iterate over the IOVA bitmap: + * + * bitmap = iova_bitmap_alloc(iova, length, page_size, data); + * if (IS_ERR(bitmap)) + * return PTR_ERR(bitmap); + * + * ret = iova_bitmap_for_each(bitmap, arg, dirty_reporter_fn); + * + * iova_bitmap_free(bitmap); + * + * Each iteration of the @dirty_reporter_fn is called with a unique @iova + * and @length argument, indicating the current range available through the + * iova_bitmap. The @dirty_reporter_fn uses iova_bitmap_set() to mark dirty + * areas (@iova_length) within that provided range, as following: + * + * iova_bitmap_set(bitmap, iova, iova_length); + * + * The internals of the object uses an index @mapped_base_index that indexes + * which u64 word of the bitmap is mapped, up to @mapped_total_index. + * Those keep being incremented until @mapped_total_index is reached while + * mapping up to PAGE_SIZE / sizeof(struct page*) maximum of pages. + * + * The IOVA bitmap is usually located on what tracks DMA mapped ranges or + * some form of IOVA range tracking that co-relates to the user passed + * bitmap. + */ +struct iova_bitmap { + /* IOVA range representing the currently mapped bitmap data */ + struct iova_bitmap_map mapped; + + /* userspace address of the bitmap */ + u8 __user *bitmap; + + /* u64 index that @mapped points to */ + unsigned long mapped_base_index; + + /* how many u64 can we walk in total */ + unsigned long mapped_total_index; + + /* base IOVA of the whole bitmap */ + unsigned long iova; + + /* length of the IOVA range for the whole bitmap */ + size_t length; + + /* length of the IOVA range set ahead the pinned pages */ + unsigned long set_ahead_length; +}; + +/* + * Converts a relative IOVA to a bitmap index. + * This function provides the index into the u64 array (bitmap::bitmap) + * for a given IOVA offset. + * Relative IOVA means relative to the bitmap::mapped base IOVA + * (stored in mapped::iova). All computations in this file are done using + * relative IOVAs and thus avoid an extra subtraction against mapped::iova. + * The user API iova_bitmap_set() always uses a regular absolute IOVAs. + */ +static unsigned long iova_bitmap_offset_to_index(struct iova_bitmap *bitmap, + unsigned long iova) +{ + unsigned long pgsize = 1 << bitmap->mapped.pgshift; + + return iova / (BITS_PER_TYPE(*bitmap->bitmap) * pgsize); +} + +/* + * Converts a bitmap index to a *relative* IOVA. + */ +static unsigned long iova_bitmap_index_to_offset(struct iova_bitmap *bitmap, + unsigned long index) +{ + unsigned long pgshift = bitmap->mapped.pgshift; + + return (index * BITS_PER_TYPE(*bitmap->bitmap)) << pgshift; +} + +/* + * Returns the base IOVA of the mapped range. + */ +static unsigned long iova_bitmap_mapped_iova(struct iova_bitmap *bitmap) +{ + unsigned long skip = bitmap->mapped_base_index; + + return bitmap->iova + iova_bitmap_index_to_offset(bitmap, skip); +} + +/* + * Pins the bitmap user pages for the current range window. + * This is internal to IOVA bitmap and called when advancing the + * index (@mapped_base_index) or allocating the bitmap. + */ +static int iova_bitmap_get(struct iova_bitmap *bitmap) +{ + struct iova_bitmap_map *mapped = &bitmap->mapped; + unsigned long npages; + u8 __user *addr; + long ret; + + /* + * @mapped_base_index is the index of the currently mapped u64 words + * that we have access. Anything before @mapped_base_index is not + * mapped. The range @mapped_base_index .. @mapped_total_index-1 is + * mapped but capped at a maximum number of pages. + */ + npages = DIV_ROUND_UP((bitmap->mapped_total_index - + bitmap->mapped_base_index) * + sizeof(*bitmap->bitmap), PAGE_SIZE); + + /* + * Bitmap address to be pinned is calculated via pointer arithmetic + * with bitmap u64 word index. + */ + addr = bitmap->bitmap + bitmap->mapped_base_index; + + /* + * We always cap at max number of 'struct page' a base page can fit. + * This is, for example, on x86 means 2M of bitmap data max. + */ + npages = min(npages + !!offset_in_page(addr), + PAGE_SIZE / sizeof(struct page *)); + + ret = pin_user_pages_fast((unsigned long)addr, npages, + FOLL_WRITE, mapped->pages); + if (ret <= 0) + return -EFAULT; + + mapped->npages = (unsigned long)ret; + /* Base IOVA where @pages point to i.e. bit 0 of the first page */ + mapped->iova = iova_bitmap_mapped_iova(bitmap); + + /* + * offset of the page where pinned pages bit 0 is located. + * This handles the case where the bitmap is not PAGE_SIZE + * aligned. + */ + mapped->pgoff = offset_in_page(addr); + return 0; +} + +/* + * Unpins the bitmap user pages and clears @npages + * (un)pinning is abstracted from API user and it's done when advancing + * the index or freeing the bitmap. + */ +static void iova_bitmap_put(struct iova_bitmap *bitmap) +{ + struct iova_bitmap_map *mapped = &bitmap->mapped; + + if (mapped->npages) { + unpin_user_pages(mapped->pages, mapped->npages); + mapped->npages = 0; + } +} + +/** + * iova_bitmap_alloc() - Allocates an IOVA bitmap object + * @iova: Start address of the IOVA range + * @length: Length of the IOVA range + * @page_size: Page size of the IOVA bitmap. It defines what each bit + * granularity represents + * @data: Userspace address of the bitmap + * + * Allocates an IOVA object and initializes all its fields including the + * first user pages of @data. + * + * Return: A pointer to a newly allocated struct iova_bitmap + * or ERR_PTR() on error. + */ +struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length, + unsigned long page_size, u64 __user *data) +{ + struct iova_bitmap_map *mapped; + struct iova_bitmap *bitmap; + int rc; + + bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); + if (!bitmap) + return ERR_PTR(-ENOMEM); + + mapped = &bitmap->mapped; + mapped->pgshift = __ffs(page_size); + bitmap->bitmap = (u8 __user *)data; + bitmap->mapped_total_index = + iova_bitmap_offset_to_index(bitmap, length - 1) + 1; + bitmap->iova = iova; + bitmap->length = length; + mapped->iova = iova; + mapped->pages = (struct page **)__get_free_page(GFP_KERNEL); + if (!mapped->pages) { + rc = -ENOMEM; + goto err; + } + + rc = iova_bitmap_get(bitmap); + if (rc) + goto err; + return bitmap; + +err: + iova_bitmap_free(bitmap); + return ERR_PTR(rc); +} +EXPORT_SYMBOL_NS_GPL(iova_bitmap_alloc, IOMMUFD); + +/** + * iova_bitmap_free() - Frees an IOVA bitmap object + * @bitmap: IOVA bitmap to free + * + * It unpins and releases pages array memory and clears any leftover + * state. + */ +void iova_bitmap_free(struct iova_bitmap *bitmap) +{ + struct iova_bitmap_map *mapped = &bitmap->mapped; + + iova_bitmap_put(bitmap); + + if (mapped->pages) { + free_page((unsigned long)mapped->pages); + mapped->pages = NULL; + } + + kfree(bitmap); +} +EXPORT_SYMBOL_NS_GPL(iova_bitmap_free, IOMMUFD); + +/* + * Returns the remaining bitmap indexes from mapped_total_index to process for + * the currently pinned bitmap pages. + */ +static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap) +{ + unsigned long remaining, bytes; + + bytes = (bitmap->mapped.npages << PAGE_SHIFT) - bitmap->mapped.pgoff; + + remaining = bitmap->mapped_total_index - bitmap->mapped_base_index; + remaining = min_t(unsigned long, remaining, + DIV_ROUND_UP(bytes, sizeof(*bitmap->bitmap))); + + return remaining; +} + +/* + * Returns the length of the mapped IOVA range. + */ +static unsigned long iova_bitmap_mapped_length(struct iova_bitmap *bitmap) +{ + unsigned long max_iova = bitmap->iova + bitmap->length - 1; + unsigned long iova = iova_bitmap_mapped_iova(bitmap); + unsigned long remaining; + + /* + * iova_bitmap_mapped_remaining() returns a number of indexes which + * when converted to IOVA gives us a max length that the bitmap + * pinned data can cover. Afterwards, that is capped to + * only cover the IOVA range in @bitmap::iova .. @bitmap::length. + */ + remaining = iova_bitmap_index_to_offset(bitmap, + iova_bitmap_mapped_remaining(bitmap)); + + if (iova + remaining - 1 > max_iova) + remaining -= ((iova + remaining - 1) - max_iova); + + return remaining; +} + +/* + * Returns true if there's not more data to iterate. + */ +static bool iova_bitmap_done(struct iova_bitmap *bitmap) +{ + return bitmap->mapped_base_index >= bitmap->mapped_total_index; +} + +static int iova_bitmap_set_ahead(struct iova_bitmap *bitmap, + size_t set_ahead_length) +{ + int ret = 0; + + while (set_ahead_length > 0 && !iova_bitmap_done(bitmap)) { + unsigned long length = iova_bitmap_mapped_length(bitmap); + unsigned long iova = iova_bitmap_mapped_iova(bitmap); + + ret = iova_bitmap_get(bitmap); + if (ret) + break; + + length = min(length, set_ahead_length); + iova_bitmap_set(bitmap, iova, length); + + set_ahead_length -= length; + bitmap->mapped_base_index += + iova_bitmap_offset_to_index(bitmap, length - 1) + 1; + iova_bitmap_put(bitmap); + } + + bitmap->set_ahead_length = 0; + return ret; +} + +/* + * Advances to the next range, releases the current pinned + * pages and pins the next set of bitmap pages. + * Returns 0 on success or otherwise errno. + */ +static int iova_bitmap_advance(struct iova_bitmap *bitmap) +{ + unsigned long iova = iova_bitmap_mapped_length(bitmap) - 1; + unsigned long count = iova_bitmap_offset_to_index(bitmap, iova) + 1; + + bitmap->mapped_base_index += count; + + iova_bitmap_put(bitmap); + if (iova_bitmap_done(bitmap)) + return 0; + + /* Iterate, set and skip any bits requested for next iteration */ + if (bitmap->set_ahead_length) { + int ret; + + ret = iova_bitmap_set_ahead(bitmap, bitmap->set_ahead_length); + if (ret) + return ret; + } + + /* When advancing the index we pin the next set of bitmap pages */ + return iova_bitmap_get(bitmap); +} + +/** + * iova_bitmap_for_each() - Iterates over the bitmap + * @bitmap: IOVA bitmap to iterate + * @opaque: Additional argument to pass to the callback + * @fn: Function that gets called for each IOVA range + * + * Helper function to iterate over bitmap data representing a portion of IOVA + * space. It hides the complexity of iterating bitmaps and translating the + * mapped bitmap user pages into IOVA ranges to process. + * + * Return: 0 on success, and an error on failure either upon + * iteration or when the callback returns an error. + */ +int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, + iova_bitmap_fn_t fn) +{ + int ret = 0; + + for (; !iova_bitmap_done(bitmap) && !ret; + ret = iova_bitmap_advance(bitmap)) { + ret = fn(bitmap, iova_bitmap_mapped_iova(bitmap), + iova_bitmap_mapped_length(bitmap), opaque); + if (ret) + break; + } + + return ret; +} +EXPORT_SYMBOL_NS_GPL(iova_bitmap_for_each, IOMMUFD); + +/** + * iova_bitmap_set() - Records an IOVA range in bitmap + * @bitmap: IOVA bitmap + * @iova: IOVA to start + * @length: IOVA range length + * + * Set the bits corresponding to the range [iova .. iova+length-1] in + * the user bitmap. + * + */ +void iova_bitmap_set(struct iova_bitmap *bitmap, + unsigned long iova, size_t length) +{ + struct iova_bitmap_map *mapped = &bitmap->mapped; + unsigned long cur_bit = ((iova - mapped->iova) >> + mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; + unsigned long last_bit = (((iova + length - 1) - mapped->iova) >> + mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; + unsigned long last_page_idx = mapped->npages - 1; + + do { + unsigned int page_idx = cur_bit / BITS_PER_PAGE; + unsigned int offset = cur_bit % BITS_PER_PAGE; + unsigned int nbits = min(BITS_PER_PAGE - offset, + last_bit - cur_bit + 1); + void *kaddr; + + if (unlikely(page_idx > last_page_idx)) + break; + + kaddr = kmap_local_page(mapped->pages[page_idx]); + bitmap_set(kaddr, offset, nbits); + kunmap_local(kaddr); + cur_bit += nbits; + } while (cur_bit <= last_bit); + + if (unlikely(cur_bit <= last_bit)) { + bitmap->set_ahead_length = + ((last_bit - cur_bit + 1) << bitmap->mapped.pgshift); + } +} +EXPORT_SYMBOL_NS_GPL(iova_bitmap_set, IOMMUFD); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index e71523cbd0..c9091e46d2 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -33,7 +33,6 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, enum iommufd_object_type type) { - static struct lock_class_key obj_keys[IOMMUFD_OBJ_MAX]; struct iommufd_object *obj; int rc; @@ -41,15 +40,8 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, if (!obj) return ERR_PTR(-ENOMEM); obj->type = type; - /* - * In most cases the destroy_rwsem is obtained with try so it doesn't - * interact with lockdep, however on destroy we have to sleep. This - * means if we have to destroy an object while holding a get on another - * object it triggers lockdep. Using one locking class per object type - * is a simple and reasonable way to avoid this. - */ - __init_rwsem(&obj->destroy_rwsem, "iommufd_object::destroy_rwsem", - &obj_keys[type]); + /* Starts out bias'd by 1 until it is removed from the xarray */ + refcount_set(&obj->shortterm_users, 1); refcount_set(&obj->users, 1); /* @@ -129,92 +121,113 @@ struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, return obj; } +static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx, + struct iommufd_object *to_destroy) +{ + if (refcount_dec_and_test(&to_destroy->shortterm_users)) + return 0; + + if (wait_event_timeout(ictx->destroy_wait, + refcount_read(&to_destroy->shortterm_users) == + 0, + msecs_to_jiffies(10000))) + return 0; + + pr_crit("Time out waiting for iommufd object to become free\n"); + refcount_inc(&to_destroy->shortterm_users); + return -EBUSY; +} + /* * Remove the given object id from the xarray if the only reference to the - * object is held by the xarray. The caller must call ops destroy(). + * object is held by the xarray. */ -static struct iommufd_object *iommufd_object_remove(struct iommufd_ctx *ictx, - u32 id, bool extra_put) +int iommufd_object_remove(struct iommufd_ctx *ictx, + struct iommufd_object *to_destroy, u32 id, + unsigned int flags) { struct iommufd_object *obj; XA_STATE(xas, &ictx->objects, id); - - xa_lock(&ictx->objects); - obj = xas_load(&xas); - if (xa_is_zero(obj) || !obj) { - obj = ERR_PTR(-ENOENT); - goto out_xa; - } + bool zerod_shortterm = false; + int ret; /* - * If the caller is holding a ref on obj we put it here under the - * spinlock. + * The purpose of the shortterm_users is to ensure deterministic + * destruction of objects used by external drivers and destroyed by this + * function. Any temporary increment of the refcount must increment + * shortterm_users, such as during ioctl execution. */ - if (extra_put) + if (flags & REMOVE_WAIT_SHORTTERM) { + ret = iommufd_object_dec_wait_shortterm(ictx, to_destroy); + if (ret) { + /* + * We have a bug. Put back the callers reference and + * defer cleaning this object until close. + */ + refcount_dec(&to_destroy->users); + return ret; + } + zerod_shortterm = true; + } + + xa_lock(&ictx->objects); + obj = xas_load(&xas); + if (to_destroy) { + /* + * If the caller is holding a ref on obj we put it here under + * the spinlock. + */ refcount_dec(&obj->users); + if (WARN_ON(obj != to_destroy)) { + ret = -ENOENT; + goto err_xa; + } + } else if (xa_is_zero(obj) || !obj) { + ret = -ENOENT; + goto err_xa; + } + if (!refcount_dec_if_one(&obj->users)) { - obj = ERR_PTR(-EBUSY); - goto out_xa; + ret = -EBUSY; + goto err_xa; } xas_store(&xas, NULL); if (ictx->vfio_ioas == container_of(obj, struct iommufd_ioas, obj)) ictx->vfio_ioas = NULL; - -out_xa: xa_unlock(&ictx->objects); - /* The returned object reference count is zero */ - return obj; -} - -/* - * The caller holds a users refcount and wants to destroy the object. Returns - * true if the object was destroyed. In all cases the caller no longer has a - * reference on obj. - */ -void __iommufd_object_destroy_user(struct iommufd_ctx *ictx, - struct iommufd_object *obj, bool allow_fail) -{ - struct iommufd_object *ret; - - /* - * The purpose of the destroy_rwsem is to ensure deterministic - * destruction of objects used by external drivers and destroyed by this - * function. Any temporary increment of the refcount must hold the read - * side of this, such as during ioctl execution. - */ - down_write(&obj->destroy_rwsem); - ret = iommufd_object_remove(ictx, obj->id, true); - up_write(&obj->destroy_rwsem); - - if (allow_fail && IS_ERR(ret)) - return; - /* - * If there is a bug and we couldn't destroy the object then we did put - * back the caller's refcount and will eventually try to free it again - * during close. + * Since users is zero any positive users_shortterm must be racing + * iommufd_put_object(), or we have a bug. */ - if (WARN_ON(IS_ERR(ret))) - return; + if (!zerod_shortterm) { + ret = iommufd_object_dec_wait_shortterm(ictx, obj); + if (WARN_ON(ret)) + return ret; + } iommufd_object_ops[obj->type].destroy(obj); kfree(obj); + return 0; + +err_xa: + if (zerod_shortterm) { + /* Restore the xarray owned reference */ + refcount_set(&obj->shortterm_users, 1); + } + xa_unlock(&ictx->objects); + + /* The returned object reference count is zero */ + return ret; } static int iommufd_destroy(struct iommufd_ucmd *ucmd) { struct iommu_destroy *cmd = ucmd->cmd; - struct iommufd_object *obj; - obj = iommufd_object_remove(ucmd->ictx, cmd->id, false); - if (IS_ERR(obj)) - return PTR_ERR(obj); - iommufd_object_ops[obj->type].destroy(obj); - kfree(obj); - return 0; + return iommufd_object_remove(ucmd->ictx, NULL, cmd->id, 0); } static int iommufd_fops_open(struct inode *inode, struct file *filp) @@ -238,6 +251,7 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); xa_init(&ictx->groups); ictx->file = filp; + init_waitqueue_head(&ictx->destroy_wait); filp->private_data = ictx; return 0; } @@ -307,6 +321,8 @@ union ucmd_buffer { struct iommu_destroy destroy; struct iommu_hw_info info; struct iommu_hwpt_alloc hwpt; + struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap; + struct iommu_hwpt_set_dirty_tracking set_dirty_tracking; struct iommu_ioas_alloc alloc; struct iommu_ioas_allow_iovas allow_iovas; struct iommu_ioas_copy ioas_copy; @@ -342,6 +358,10 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { __reserved), IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, __reserved), + IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap, + struct iommu_hwpt_get_dirty_bitmap, data), + IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking, + struct iommu_hwpt_set_dirty_tracking, __reserved), IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, struct iommu_ioas_alloc, out_ioas_id), IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas, @@ -482,9 +502,13 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_IOAS] = { .destroy = iommufd_ioas_destroy, }, - [IOMMUFD_OBJ_HW_PAGETABLE] = { - .destroy = iommufd_hw_pagetable_destroy, - .abort = iommufd_hw_pagetable_abort, + [IOMMUFD_OBJ_HWPT_PAGING] = { + .destroy = iommufd_hwpt_paging_destroy, + .abort = iommufd_hwpt_paging_abort, + }, + [IOMMUFD_OBJ_HWPT_NESTED] = { + .destroy = iommufd_hwpt_nested_destroy, + .abort = iommufd_hwpt_nested_abort, }, #ifdef CONFIG_IOMMUFD_TEST [IOMMUFD_OBJ_SELFTEST] = { @@ -552,5 +576,6 @@ MODULE_ALIAS_MISCDEV(VFIO_MINOR); MODULE_ALIAS("devname:vfio/vfio"); #endif MODULE_IMPORT_NS(IOMMUFD_INTERNAL); +MODULE_IMPORT_NS(IOMMUFD); MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 56506d5753..022ef8f550 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -20,10 +20,13 @@ static DECLARE_FAULT_ATTR(fail_iommufd); static struct dentry *dbgfs_root; static struct platform_device *selftest_iommu_dev; +static const struct iommu_ops mock_ops; +static struct iommu_domain_ops domain_nested_ops; size_t iommufd_test_memory_limit = 65536; enum { + MOCK_DIRTY_TRACK = 1, MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2, /* @@ -36,6 +39,7 @@ enum { _MOCK_PFN_START = MOCK_PFN_MASK + 1, MOCK_PFN_START_IOVA = _MOCK_PFN_START, MOCK_PFN_LAST_IOVA = _MOCK_PFN_START, + MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1, }; /* @@ -82,20 +86,28 @@ void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, if (IS_ERR(ioas)) return; *iova = iommufd_test_syz_conv_iova(&ioas->iopt, iova); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); } struct mock_iommu_domain { + unsigned long flags; struct iommu_domain domain; struct xarray pfns; }; +struct mock_iommu_domain_nested { + struct iommu_domain domain; + struct mock_iommu_domain *parent; + u32 iotlb[MOCK_NESTED_DOMAIN_IOTLB_NUM]; +}; + enum selftest_obj_type { TYPE_IDEV, }; struct mock_dev { struct device dev; + unsigned long flags; }; struct selftest_obj { @@ -111,18 +123,18 @@ struct selftest_obj { }; }; -static void mock_domain_blocking_free(struct iommu_domain *domain) -{ -} - static int mock_domain_nop_attach(struct iommu_domain *domain, struct device *dev) { + struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + + if (domain->dirty_ops && (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY)) + return -EINVAL; + return 0; } static const struct iommu_domain_ops mock_blocking_ops = { - .free = mock_domain_blocking_free, .attach_dev = mock_domain_nop_attach, }; @@ -146,15 +158,70 @@ static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type) return info; } -static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type) +static int mock_domain_set_dirty_tracking(struct iommu_domain *domain, + bool enable) { - struct mock_iommu_domain *mock; + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + unsigned long flags = mock->flags; - if (iommu_domain_type == IOMMU_DOMAIN_BLOCKED) - return &mock_blocking_domain; + if (enable && !domain->dirty_ops) + return -EINVAL; - if (iommu_domain_type != IOMMU_DOMAIN_UNMANAGED) - return NULL; + /* No change? */ + if (!(enable ^ !!(flags & MOCK_DIRTY_TRACK))) + return 0; + + flags = (enable ? flags | MOCK_DIRTY_TRACK : flags & ~MOCK_DIRTY_TRACK); + + mock->flags = flags; + return 0; +} + +static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + unsigned long i, max = size / MOCK_IO_PAGE_SIZE; + void *ent, *old; + + if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap) + return -EINVAL; + + for (i = 0; i < max; i++) { + unsigned long cur = iova + i * MOCK_IO_PAGE_SIZE; + + ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE); + if (ent && (xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA)) { + /* Clear dirty */ + if (!(flags & IOMMU_DIRTY_NO_CLEAR)) { + unsigned long val; + + val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA; + old = xa_store(&mock->pfns, + cur / MOCK_IO_PAGE_SIZE, + xa_mk_value(val), GFP_KERNEL); + WARN_ON_ONCE(ent != old); + } + iommu_dirty_bitmap_record(dirty, cur, + MOCK_IO_PAGE_SIZE); + } + } + + return 0; +} + +const struct iommu_dirty_ops dirty_ops = { + .set_dirty_tracking = mock_domain_set_dirty_tracking, + .read_and_clear_dirty = mock_domain_read_and_clear_dirty, +}; + +static struct iommu_domain *mock_domain_alloc_paging(struct device *dev) +{ + struct mock_iommu_domain *mock; mock = kzalloc(sizeof(*mock), GFP_KERNEL); if (!mock) @@ -162,10 +229,78 @@ static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type) mock->domain.geometry.aperture_start = MOCK_APERTURE_START; mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; + mock->domain.ops = mock_ops.default_domain_ops; + mock->domain.type = IOMMU_DOMAIN_UNMANAGED; xa_init(&mock->pfns); return &mock->domain; } +static struct iommu_domain * +__mock_domain_alloc_nested(struct mock_iommu_domain *mock_parent, + const struct iommu_hwpt_selftest *user_cfg) +{ + struct mock_iommu_domain_nested *mock_nested; + int i; + + mock_nested = kzalloc(sizeof(*mock_nested), GFP_KERNEL); + if (!mock_nested) + return ERR_PTR(-ENOMEM); + mock_nested->parent = mock_parent; + mock_nested->domain.ops = &domain_nested_ops; + mock_nested->domain.type = IOMMU_DOMAIN_NESTED; + for (i = 0; i < MOCK_NESTED_DOMAIN_IOTLB_NUM; i++) + mock_nested->iotlb[i] = user_cfg->iotlb; + return &mock_nested->domain; +} + +static struct iommu_domain * +mock_domain_alloc_user(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) +{ + struct mock_iommu_domain *mock_parent; + struct iommu_hwpt_selftest user_cfg; + int rc; + + /* must be mock_domain */ + if (!parent) { + struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; + struct iommu_domain *domain; + + if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) + return ERR_PTR(-EOPNOTSUPP); + if (user_data || (has_dirty_flag && no_dirty_ops)) + return ERR_PTR(-EOPNOTSUPP); + domain = mock_domain_alloc_paging(NULL); + if (!domain) + return ERR_PTR(-ENOMEM); + if (has_dirty_flag) + container_of(domain, struct mock_iommu_domain, domain) + ->domain.dirty_ops = &dirty_ops; + return domain; + } + + /* must be mock_domain_nested */ + if (user_data->type != IOMMU_HWPT_DATA_SELFTEST || flags) + return ERR_PTR(-EOPNOTSUPP); + if (!parent || parent->ops != mock_ops.default_domain_ops) + return ERR_PTR(-EINVAL); + + mock_parent = container_of(parent, struct mock_iommu_domain, domain); + if (!mock_parent) + return ERR_PTR(-EINVAL); + + rc = iommu_copy_struct_from_user(&user_cfg, user_data, + IOMMU_HWPT_DATA_SELFTEST, iotlb); + if (rc) + return ERR_PTR(rc); + + return __mock_domain_alloc_nested(mock_parent, &user_cfg); +} + static void mock_domain_free(struct iommu_domain *domain) { struct mock_iommu_domain *mock = @@ -243,7 +378,7 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain, for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); - WARN_ON(!ent); + /* * iommufd generates unmaps that must be a strict * superset of the map's performend So every starting @@ -253,13 +388,13 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain, * passed to map_pages */ if (first) { - WARN_ON(!(xa_to_value(ent) & - MOCK_PFN_START_IOVA)); + WARN_ON(ent && !(xa_to_value(ent) & + MOCK_PFN_START_IOVA)); first = false; } if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) - WARN_ON(!(xa_to_value(ent) & - MOCK_PFN_LAST_IOVA)); + WARN_ON(ent && !(xa_to_value(ent) & + MOCK_PFN_LAST_IOVA)); iova += MOCK_IO_PAGE_SIZE; ret += MOCK_IO_PAGE_SIZE; @@ -283,15 +418,18 @@ static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain, static bool mock_domain_capable(struct device *dev, enum iommu_cap cap) { - return cap == IOMMU_CAP_CACHE_COHERENCY; -} + struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); -static void mock_domain_set_plaform_dma_ops(struct device *dev) -{ - /* - * mock doesn't setup default domains because we can't hook into the - * normal probe path - */ + switch (cap) { + case IOMMU_CAP_CACHE_COHERENCY: + return true; + case IOMMU_CAP_DIRTY_TRACKING: + return !(mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY); + default: + break; + } + + return false; } static struct iommu_device mock_iommu_device = { @@ -303,12 +441,18 @@ static struct iommu_device *mock_probe_device(struct device *dev) } static const struct iommu_ops mock_ops = { + /* + * IOMMU_DOMAIN_BLOCKED cannot be returned from def_domain_type() + * because it is zero. + */ + .default_domain = &mock_blocking_domain, + .blocked_domain = &mock_blocking_domain, .owner = THIS_MODULE, .pgsize_bitmap = MOCK_IO_PAGE_SIZE, .hw_info = mock_domain_hw_info, - .domain_alloc = mock_domain_alloc, + .domain_alloc_paging = mock_domain_alloc_paging, + .domain_alloc_user = mock_domain_alloc_user, .capable = mock_domain_capable, - .set_platform_dma_ops = mock_domain_set_plaform_dma_ops, .device_group = generic_device_group, .probe_device = mock_probe_device, .default_domain_ops = @@ -321,26 +465,67 @@ static const struct iommu_ops mock_ops = { }, }; +static void mock_domain_free_nested(struct iommu_domain *domain) +{ + struct mock_iommu_domain_nested *mock_nested = + container_of(domain, struct mock_iommu_domain_nested, domain); + + kfree(mock_nested); +} + +static struct iommu_domain_ops domain_nested_ops = { + .free = mock_domain_free_nested, + .attach_dev = mock_domain_nop_attach, +}; + static inline struct iommufd_hw_pagetable * -get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, - struct mock_iommu_domain **mock) +__get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, u32 hwpt_type) { - struct iommufd_hw_pagetable *hwpt; struct iommufd_object *obj; - obj = iommufd_get_object(ucmd->ictx, mockpt_id, - IOMMUFD_OBJ_HW_PAGETABLE); + obj = iommufd_get_object(ucmd->ictx, mockpt_id, hwpt_type); if (IS_ERR(obj)) return ERR_CAST(obj); - hwpt = container_of(obj, struct iommufd_hw_pagetable, obj); - if (hwpt->domain->ops != mock_ops.default_domain_ops) { - iommufd_put_object(&hwpt->obj); + return container_of(obj, struct iommufd_hw_pagetable, obj); +} + +static inline struct iommufd_hw_pagetable * +get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, + struct mock_iommu_domain **mock) +{ + struct iommufd_hw_pagetable *hwpt; + + hwpt = __get_md_pagetable(ucmd, mockpt_id, IOMMUFD_OBJ_HWPT_PAGING); + if (IS_ERR(hwpt)) + return hwpt; + if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED || + hwpt->domain->ops != mock_ops.default_domain_ops) { + iommufd_put_object(ucmd->ictx, &hwpt->obj); return ERR_PTR(-EINVAL); } *mock = container_of(hwpt->domain, struct mock_iommu_domain, domain); return hwpt; } +static inline struct iommufd_hw_pagetable * +get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id, + struct mock_iommu_domain_nested **mock_nested) +{ + struct iommufd_hw_pagetable *hwpt; + + hwpt = __get_md_pagetable(ucmd, mockpt_id, IOMMUFD_OBJ_HWPT_NESTED); + if (IS_ERR(hwpt)) + return hwpt; + if (hwpt->domain->type != IOMMU_DOMAIN_NESTED || + hwpt->domain->ops != &domain_nested_ops) { + iommufd_put_object(ucmd->ictx, &hwpt->obj); + return ERR_PTR(-EINVAL); + } + *mock_nested = container_of(hwpt->domain, + struct mock_iommu_domain_nested, domain); + return hwpt; +} + struct mock_bus_type { struct bus_type bus; struct notifier_block nb; @@ -362,16 +547,20 @@ static void mock_dev_release(struct device *dev) kfree(mdev); } -static struct mock_dev *mock_dev_create(void) +static struct mock_dev *mock_dev_create(unsigned long dev_flags) { struct mock_dev *mdev; int rc; + if (dev_flags & ~(MOCK_FLAGS_DEVICE_NO_DIRTY)) + return ERR_PTR(-EINVAL); + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) return ERR_PTR(-ENOMEM); device_initialize(&mdev->dev); + mdev->flags = dev_flags; mdev->dev.release = mock_dev_release; mdev->dev.bus = &iommufd_mock_bus_type.bus; @@ -407,6 +596,7 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, struct iommufd_device *idev; struct selftest_obj *sobj; u32 pt_id = cmd->id; + u32 dev_flags = 0; u32 idev_id; int rc; @@ -417,7 +607,10 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, sobj->idev.ictx = ucmd->ictx; sobj->type = TYPE_IDEV; - sobj->idev.mock_dev = mock_dev_create(); + if (cmd->op == IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS) + dev_flags = cmd->mock_domain_flags.dev_flags; + + sobj->idev.mock_dev = mock_dev_create(dev_flags); if (IS_ERR(sobj->idev.mock_dev)) { rc = PTR_ERR(sobj->idev.mock_dev); goto out_sobj; @@ -488,7 +681,7 @@ static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd, rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_dev_obj: - iommufd_put_object(dev_obj); + iommufd_put_object(ucmd->ictx, dev_obj); return rc; } @@ -506,7 +699,7 @@ static int iommufd_test_add_reserved(struct iommufd_ucmd *ucmd, down_write(&ioas->iopt.iova_rwsem); rc = iopt_reserve_iova(&ioas->iopt, start, start + length - 1, NULL); up_write(&ioas->iopt.iova_rwsem); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return rc; } @@ -561,7 +754,7 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd, rc = 0; out_put: - iommufd_put_object(&hwpt->obj); + iommufd_put_object(ucmd->ictx, &hwpt->obj); return rc; } @@ -977,6 +1170,73 @@ static_assert((unsigned int)MOCK_ACCESS_RW_WRITE == IOMMUFD_ACCESS_RW_WRITE); static_assert((unsigned int)MOCK_ACCESS_RW_SLOW_PATH == __IOMMUFD_ACCESS_RW_SLOW_PATH); +static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, + unsigned long iova, size_t length, + unsigned long page_size, void __user *uptr, + u32 flags) +{ + unsigned long bitmap_size, i, max; + struct iommu_test_cmd *cmd = ucmd->cmd; + struct iommufd_hw_pagetable *hwpt; + struct mock_iommu_domain *mock; + int rc, count = 0; + void *tmp; + + if (!page_size || !length || iova % page_size || length % page_size || + !uptr) + return -EINVAL; + + hwpt = get_md_pagetable(ucmd, mockpt_id, &mock); + if (IS_ERR(hwpt)) + return PTR_ERR(hwpt); + + if (!(mock->flags & MOCK_DIRTY_TRACK)) { + rc = -EINVAL; + goto out_put; + } + + max = length / page_size; + bitmap_size = max / BITS_PER_BYTE; + + tmp = kvzalloc(bitmap_size, GFP_KERNEL_ACCOUNT); + if (!tmp) { + rc = -ENOMEM; + goto out_put; + } + + if (copy_from_user(tmp, uptr, bitmap_size)) { + rc = -EFAULT; + goto out_free; + } + + for (i = 0; i < max; i++) { + unsigned long cur = iova + i * page_size; + void *ent, *old; + + if (!test_bit(i, (unsigned long *)tmp)) + continue; + + ent = xa_load(&mock->pfns, cur / page_size); + if (ent) { + unsigned long val; + + val = xa_to_value(ent) | MOCK_PFN_DIRTY_IOVA; + old = xa_store(&mock->pfns, cur / page_size, + xa_mk_value(val), GFP_KERNEL); + WARN_ON_ONCE(ent != old); + count++; + } + } + + cmd->dirty.out_nr_dirty = count; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); +out_free: + kvfree(tmp); +out_put: + iommufd_put_object(ucmd->ictx, &hwpt->obj); + return rc; +} + void iommufd_selftest_destroy(struct iommufd_object *obj) { struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj); @@ -1000,6 +1260,7 @@ int iommufd_test(struct iommufd_ucmd *ucmd) cmd->add_reserved.start, cmd->add_reserved.length); case IOMMU_TEST_OP_MOCK_DOMAIN: + case IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS: return iommufd_test_mock_domain(ucmd, cmd); case IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE: return iommufd_test_mock_domain_replace( @@ -1041,6 +1302,12 @@ int iommufd_test(struct iommufd_ucmd *ucmd) return -EINVAL; iommufd_test_memory_limit = cmd->memory_limit.limit; return 0; + case IOMMU_TEST_OP_DIRTY: + return iommufd_test_dirty(ucmd, cmd->id, cmd->dirty.iova, + cmd->dirty.length, + cmd->dirty.page_size, + u64_to_user_ptr(cmd->dirty.uptr), + cmd->dirty.flags); default: return -EOPNOTSUPP; } diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c index 6c810bf80f..a3ad5f0b6c 100644 --- a/drivers/iommu/iommufd/vfio_compat.c +++ b/drivers/iommu/iommufd/vfio_compat.c @@ -41,7 +41,7 @@ int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id) if (IS_ERR(ioas)) return PTR_ERR(ioas); *out_ioas_id = ioas->obj.id; - iommufd_put_object(&ioas->obj); + iommufd_put_object(ictx, &ioas->obj); return 0; } EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, IOMMUFD_VFIO); @@ -98,7 +98,7 @@ int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx) if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) { ret = 0; - iommufd_put_object(&ictx->vfio_ioas->obj); + iommufd_put_object(ictx, &ictx->vfio_ioas->obj); goto out_abort; } ictx->vfio_ioas = ioas; @@ -133,7 +133,7 @@ int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd) if (IS_ERR(ioas)) return PTR_ERR(ioas); cmd->ioas_id = ioas->obj.id; - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return iommufd_ucmd_respond(ucmd, sizeof(*cmd)); case IOMMU_VFIO_IOAS_SET: @@ -143,7 +143,7 @@ int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd) xa_lock(&ucmd->ictx->objects); ucmd->ictx->vfio_ioas = ioas; xa_unlock(&ucmd->ictx->objects); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return 0; case IOMMU_VFIO_IOAS_CLEAR: @@ -190,7 +190,7 @@ static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd, iova = map.iova; rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, u64_to_user_ptr(map.vaddr), map.size, iommu_prot, 0); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ictx, &ioas->obj); return rc; } @@ -249,13 +249,13 @@ static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd, rc = -EFAULT; err_put: - iommufd_put_object(&ioas->obj); + iommufd_put_object(ictx, &ioas->obj); return rc; } static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx) { - struct iommufd_hw_pagetable *hwpt; + struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_ioas *ioas; int rc = 1; @@ -264,15 +264,15 @@ static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx) return PTR_ERR(ioas); mutex_lock(&ioas->mutex); - list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) { - if (!hwpt->enforce_cache_coherency) { + list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) { + if (!hwpt_paging->enforce_cache_coherency) { rc = 0; break; } } mutex_unlock(&ioas->mutex); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ictx, &ioas->obj); return rc; } @@ -349,7 +349,7 @@ static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type) */ if (type == VFIO_TYPE1_IOMMU) rc = iopt_disable_large_pages(&ioas->iopt); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ictx, &ioas->obj); return rc; } @@ -511,7 +511,7 @@ static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx, out_put: up_read(&ioas->iopt.iova_rwsem); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ictx, &ioas->obj); return rc; } diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 10b9646009..d30e453d0f 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -11,6 +11,7 @@ #include <linux/smp.h> #include <linux/bitops.h> #include <linux/cpu.h> +#include <linux/workqueue.h> /* The anchor node sits above the top of the usable address space */ #define IOVA_ANCHOR ~0UL @@ -622,15 +623,21 @@ EXPORT_SYMBOL_GPL(reserve_iova); /* * As kmalloc's buffer size is fixed to power of 2, 127 is chosen to * assure size of 'iova_magazine' to be 1024 bytes, so that no memory - * will be wasted. + * will be wasted. Since only full magazines are inserted into the depot, + * we don't need to waste PFN capacity on a separate list head either. */ #define IOVA_MAG_SIZE 127 -#define MAX_GLOBAL_MAGS 32 /* magazines per bin */ + +#define IOVA_DEPOT_DELAY msecs_to_jiffies(100) struct iova_magazine { - unsigned long size; + union { + unsigned long size; + struct iova_magazine *next; + }; unsigned long pfns[IOVA_MAG_SIZE]; }; +static_assert(!(sizeof(struct iova_magazine) & (sizeof(struct iova_magazine) - 1))); struct iova_cpu_rcache { spinlock_t lock; @@ -640,9 +647,11 @@ struct iova_cpu_rcache { struct iova_rcache { spinlock_t lock; - unsigned long depot_size; - struct iova_magazine *depot[MAX_GLOBAL_MAGS]; + unsigned int depot_size; + struct iova_magazine *depot; struct iova_cpu_rcache __percpu *cpu_rcaches; + struct iova_domain *iovad; + struct delayed_work work; }; static struct iova_magazine *iova_magazine_alloc(gfp_t flags) @@ -717,6 +726,41 @@ static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn) mag->pfns[mag->size++] = pfn; } +static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache) +{ + struct iova_magazine *mag = rcache->depot; + + rcache->depot = mag->next; + mag->size = IOVA_MAG_SIZE; + rcache->depot_size--; + return mag; +} + +static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *mag) +{ + mag->next = rcache->depot; + rcache->depot = mag; + rcache->depot_size++; +} + +static void iova_depot_work_func(struct work_struct *work) +{ + struct iova_rcache *rcache = container_of(work, typeof(*rcache), work.work); + struct iova_magazine *mag = NULL; + unsigned long flags; + + spin_lock_irqsave(&rcache->lock, flags); + if (rcache->depot_size > num_online_cpus()) + mag = iova_depot_pop(rcache); + spin_unlock_irqrestore(&rcache->lock, flags); + + if (mag) { + iova_magazine_free_pfns(mag, rcache->iovad); + iova_magazine_free(mag); + schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY); + } +} + int iova_domain_init_rcaches(struct iova_domain *iovad) { unsigned int cpu; @@ -734,7 +778,8 @@ int iova_domain_init_rcaches(struct iova_domain *iovad) rcache = &iovad->rcaches[i]; spin_lock_init(&rcache->lock); - rcache->depot_size = 0; + rcache->iovad = iovad; + INIT_DELAYED_WORK(&rcache->work, iova_depot_work_func); rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache), cache_line_size()); if (!rcache->cpu_rcaches) { @@ -776,7 +821,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad, struct iova_rcache *rcache, unsigned long iova_pfn) { - struct iova_magazine *mag_to_free = NULL; struct iova_cpu_rcache *cpu_rcache; bool can_insert = false; unsigned long flags; @@ -794,13 +838,9 @@ static bool __iova_rcache_insert(struct iova_domain *iovad, if (new_mag) { spin_lock(&rcache->lock); - if (rcache->depot_size < MAX_GLOBAL_MAGS) { - rcache->depot[rcache->depot_size++] = - cpu_rcache->loaded; - } else { - mag_to_free = cpu_rcache->loaded; - } + iova_depot_push(rcache, cpu_rcache->loaded); spin_unlock(&rcache->lock); + schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY); cpu_rcache->loaded = new_mag; can_insert = true; @@ -812,11 +852,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad, spin_unlock_irqrestore(&cpu_rcache->lock, flags); - if (mag_to_free) { - iova_magazine_free_pfns(mag_to_free, iovad); - iova_magazine_free(mag_to_free); - } - return can_insert; } @@ -854,9 +889,9 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache, has_pfn = true; } else { spin_lock(&rcache->lock); - if (rcache->depot_size > 0) { + if (rcache->depot) { iova_magazine_free(cpu_rcache->loaded); - cpu_rcache->loaded = rcache->depot[--rcache->depot_size]; + cpu_rcache->loaded = iova_depot_pop(rcache); has_pfn = true; } spin_unlock(&rcache->lock); @@ -895,9 +930,8 @@ static void free_iova_rcaches(struct iova_domain *iovad) struct iova_rcache *rcache; struct iova_cpu_rcache *cpu_rcache; unsigned int cpu; - int i, j; - for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) { + for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) { rcache = &iovad->rcaches[i]; if (!rcache->cpu_rcaches) break; @@ -907,8 +941,9 @@ static void free_iova_rcaches(struct iova_domain *iovad) iova_magazine_free(cpu_rcache->prev); } free_percpu(rcache->cpu_rcaches); - for (j = 0; j < rcache->depot_size; ++j) - iova_magazine_free(rcache->depot[j]); + cancel_delayed_work_sync(&rcache->work); + while (rcache->depot) + iova_magazine_free(iova_depot_pop(rcache)); } kfree(iovad->rcaches); @@ -942,16 +977,16 @@ static void free_global_cached_iovas(struct iova_domain *iovad) { struct iova_rcache *rcache; unsigned long flags; - int i, j; - for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) { + for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) { rcache = &iovad->rcaches[i]; spin_lock_irqsave(&rcache->lock, flags); - for (j = 0; j < rcache->depot_size; ++j) { - iova_magazine_free_pfns(rcache->depot[j], iovad); - iova_magazine_free(rcache->depot[j]); + while (rcache->depot) { + struct iova_magazine *mag = iova_depot_pop(rcache); + + iova_magazine_free_pfns(mag, iovad); + iova_magazine_free(mag); } - rcache->depot_size = 0; spin_unlock_irqrestore(&rcache->lock, flags); } } diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index 65ff69477c..ace1fc4bd3 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -64,7 +64,6 @@ struct ipmmu_vmsa_device { struct ipmmu_vmsa_domain *domains[IPMMU_CTX_MAX]; s8 utlb_ctx[IPMMU_UTLB_MAX]; - struct iommu_group *group; struct dma_iommu_mapping *mapping; }; @@ -295,6 +294,18 @@ static void ipmmu_utlb_enable(struct ipmmu_vmsa_domain *domain, mmu->utlb_ctx[utlb] = domain->context_id; } +/* + * Disable MMU translation for the microTLB. + */ +static void ipmmu_utlb_disable(struct ipmmu_vmsa_domain *domain, + unsigned int utlb) +{ + struct ipmmu_vmsa_device *mmu = domain->mmu; + + ipmmu_imuctr_write(mmu, utlb, 0); + mmu->utlb_ctx[utlb] = IPMMU_CTX_INVALID; +} + static void ipmmu_tlb_flush_all(void *cookie) { struct ipmmu_vmsa_domain *domain = cookie; @@ -551,13 +562,10 @@ static irqreturn_t ipmmu_irq(int irq, void *dev) * IOMMU Operations */ -static struct iommu_domain *ipmmu_domain_alloc(unsigned type) +static struct iommu_domain *ipmmu_domain_alloc_paging(struct device *dev) { struct ipmmu_vmsa_domain *domain; - if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA) - return NULL; - domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return NULL; @@ -627,6 +635,36 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain, return 0; } +static int ipmmu_iommu_identity_attach(struct iommu_domain *identity_domain, + struct device *dev) +{ + struct iommu_domain *io_domain = iommu_get_domain_for_dev(dev); + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct ipmmu_vmsa_domain *domain; + unsigned int i; + + if (io_domain == identity_domain || !io_domain) + return 0; + + domain = to_vmsa_domain(io_domain); + for (i = 0; i < fwspec->num_ids; ++i) + ipmmu_utlb_disable(domain, fwspec->ids[i]); + + /* + * TODO: Optimize by disabling the context when no device is attached. + */ + return 0; +} + +static struct iommu_domain_ops ipmmu_iommu_identity_ops = { + .attach_dev = ipmmu_iommu_identity_attach, +}; + +static struct iommu_domain ipmmu_iommu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &ipmmu_iommu_identity_ops, +}; + static int ipmmu_map(struct iommu_domain *io_domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -833,28 +871,18 @@ static void ipmmu_release_device(struct device *dev) arm_iommu_release_mapping(mmu->mapping); } -static struct iommu_group *ipmmu_find_group(struct device *dev) -{ - struct ipmmu_vmsa_device *mmu = to_ipmmu(dev); - struct iommu_group *group; - - if (mmu->group) - return iommu_group_ref_get(mmu->group); - - group = iommu_group_alloc(); - if (!IS_ERR(group)) - mmu->group = group; - - return group; -} - static const struct iommu_ops ipmmu_ops = { - .domain_alloc = ipmmu_domain_alloc, + .identity_domain = &ipmmu_iommu_identity_domain, + .domain_alloc_paging = ipmmu_domain_alloc_paging, .probe_device = ipmmu_probe_device, .release_device = ipmmu_release_device, .probe_finalize = ipmmu_probe_finalize, + /* + * FIXME: The device grouping is a fixed property of the hardware's + * ability to isolate and control DMA, it should not depend on kconfig. + */ .device_group = IS_ENABLED(CONFIG_ARM) && !IS_ENABLED(CONFIG_IOMMU_DMA) - ? generic_device_group : ipmmu_find_group, + ? generic_device_group : generic_single_device_group, .pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K, .of_xlate = ipmmu_of_xlate, .default_domain_ops = &(const struct iommu_domain_ops) { diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 79d89bad51..f86af9815d 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -302,13 +302,10 @@ static void __program_context(void __iomem *base, int ctx, SET_M(base, ctx, 1); } -static struct iommu_domain *msm_iommu_domain_alloc(unsigned type) +static struct iommu_domain *msm_iommu_domain_alloc_paging(struct device *dev) { struct msm_priv *priv; - if (type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) goto fail_nomem; @@ -443,15 +440,20 @@ fail: return ret; } -static void msm_iommu_set_platform_dma(struct device *dev) +static int msm_iommu_identity_attach(struct iommu_domain *identity_domain, + struct device *dev) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct msm_priv *priv = to_msm_priv(domain); + struct msm_priv *priv; unsigned long flags; struct msm_iommu_dev *iommu; struct msm_iommu_ctx_dev *master; - int ret; + int ret = 0; + + if (domain == identity_domain || !domain) + return 0; + priv = to_msm_priv(domain); free_io_pgtable_ops(priv->iop); spin_lock_irqsave(&msm_iommu_lock, flags); @@ -468,8 +470,18 @@ static void msm_iommu_set_platform_dma(struct device *dev) } fail: spin_unlock_irqrestore(&msm_iommu_lock, flags); + return ret; } +static struct iommu_domain_ops msm_iommu_identity_ops = { + .attach_dev = msm_iommu_identity_attach, +}; + +static struct iommu_domain msm_iommu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &msm_iommu_identity_ops, +}; + static int msm_iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t pa, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -486,12 +498,13 @@ static int msm_iommu_map(struct iommu_domain *domain, unsigned long iova, return ret; } -static void msm_iommu_sync_map(struct iommu_domain *domain, unsigned long iova, - size_t size) +static int msm_iommu_sync_map(struct iommu_domain *domain, unsigned long iova, + size_t size) { struct msm_priv *priv = to_msm_priv(domain); __flush_iotlb_range(iova, size, SZ_4K, false, priv); + return 0; } static size_t msm_iommu_unmap(struct iommu_domain *domain, unsigned long iova, @@ -675,10 +688,10 @@ fail: } static struct iommu_ops msm_iommu_ops = { - .domain_alloc = msm_iommu_domain_alloc, + .identity_domain = &msm_iommu_identity_domain, + .domain_alloc_paging = msm_iommu_domain_alloc_paging, .probe_device = msm_iommu_probe_device, .device_group = generic_device_group, - .set_platform_dma_ops = msm_iommu_set_platform_dma, .pgsize_bitmap = MSM_IOMMU_PGSIZES, .of_xlate = qcom_iommu_of_xlate, .default_domain_ops = &(const struct iommu_domain_ops) { diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index fab6c347ce..75279500a4 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -688,13 +688,10 @@ update_iova_region: return 0; } -static struct iommu_domain *mtk_iommu_domain_alloc(unsigned type) +static struct iommu_domain *mtk_iommu_domain_alloc_paging(struct device *dev) { struct mtk_iommu_domain *dom; - if (type != IOMMU_DOMAIN_DMA && type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - dom = kzalloc(sizeof(*dom), GFP_KERNEL); if (!dom) return NULL; @@ -776,6 +773,28 @@ err_unlock: return ret; } +static int mtk_iommu_identity_attach(struct iommu_domain *identity_domain, + struct device *dev) +{ + struct iommu_domain *domain = iommu_get_domain_for_dev(dev); + struct mtk_iommu_data *data = dev_iommu_priv_get(dev); + + if (domain == identity_domain || !domain) + return 0; + + mtk_iommu_config(data, dev, false, 0); + return 0; +} + +static struct iommu_domain_ops mtk_iommu_identity_ops = { + .attach_dev = mtk_iommu_identity_attach, +}; + +static struct iommu_domain mtk_iommu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &mtk_iommu_identity_ops, +}; + static int mtk_iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -817,12 +836,13 @@ static void mtk_iommu_iotlb_sync(struct iommu_domain *domain, mtk_iommu_tlb_flush_range_sync(gather->start, length, dom->bank); } -static void mtk_iommu_sync_map(struct iommu_domain *domain, unsigned long iova, - size_t size) +static int mtk_iommu_sync_map(struct iommu_domain *domain, unsigned long iova, + size_t size) { struct mtk_iommu_domain *dom = to_mtk_domain(domain); mtk_iommu_tlb_flush_range_sync(iova, size, dom->bank); + return 0; } static phys_addr_t mtk_iommu_iova_to_phys(struct iommu_domain *domain, @@ -995,7 +1015,8 @@ static void mtk_iommu_get_resv_regions(struct device *dev, } static const struct iommu_ops mtk_iommu_ops = { - .domain_alloc = mtk_iommu_domain_alloc, + .identity_domain = &mtk_iommu_identity_domain, + .domain_alloc_paging = mtk_iommu_domain_alloc_paging, .probe_device = mtk_iommu_probe_device, .release_device = mtk_iommu_release_device, .device_group = mtk_iommu_device_group, diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 8a0a5e5d04..67e044c1a7 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -270,13 +270,10 @@ static int mtk_iommu_v1_domain_finalise(struct mtk_iommu_v1_data *data) return 0; } -static struct iommu_domain *mtk_iommu_v1_domain_alloc(unsigned type) +static struct iommu_domain *mtk_iommu_v1_domain_alloc_paging(struct device *dev) { struct mtk_iommu_v1_domain *dom; - if (type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - dom = kzalloc(sizeof(*dom), GFP_KERNEL); if (!dom) return NULL; @@ -319,13 +316,24 @@ static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device return 0; } -static void mtk_iommu_v1_set_platform_dma(struct device *dev) +static int mtk_iommu_v1_identity_attach(struct iommu_domain *identity_domain, + struct device *dev) { struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev); mtk_iommu_v1_config(data, dev, false); + return 0; } +static struct iommu_domain_ops mtk_iommu_v1_identity_ops = { + .attach_dev = mtk_iommu_v1_identity_attach, +}; + +static struct iommu_domain mtk_iommu_v1_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &mtk_iommu_v1_identity_ops, +}; + static int mtk_iommu_v1_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -441,11 +449,6 @@ static int mtk_iommu_v1_create_mapping(struct device *dev, struct of_phandle_arg return 0; } -static int mtk_iommu_v1_def_domain_type(struct device *dev) -{ - return IOMMU_DOMAIN_UNMANAGED; -} - static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); @@ -578,14 +581,13 @@ static int mtk_iommu_v1_hw_init(const struct mtk_iommu_v1_data *data) } static const struct iommu_ops mtk_iommu_v1_ops = { - .domain_alloc = mtk_iommu_v1_domain_alloc, + .identity_domain = &mtk_iommu_v1_identity_domain, + .domain_alloc_paging = mtk_iommu_v1_domain_alloc_paging, .probe_device = mtk_iommu_v1_probe_device, .probe_finalize = mtk_iommu_v1_probe_finalize, .release_device = mtk_iommu_v1_release_device, - .def_domain_type = mtk_iommu_v1_def_domain_type, .device_group = generic_device_group, .pgsize_bitmap = MT2701_IOMMU_PAGE_SIZE, - .set_platform_dma_ops = mtk_iommu_v1_set_platform_dma, .owner = THIS_MODULE, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = mtk_iommu_v1_attach_device, diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 537e402f9b..c66b070841 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1225,18 +1225,15 @@ static int omap_iommu_probe(struct platform_device *pdev) platform_set_drvdata(pdev, obj); if (omap_iommu_can_register(pdev)) { - obj->group = iommu_group_alloc(); - if (IS_ERR(obj->group)) - return PTR_ERR(obj->group); - err = iommu_device_sysfs_add(&obj->iommu, obj->dev, NULL, obj->name); if (err) - goto out_group; + return err; err = iommu_device_register(&obj->iommu, &omap_iommu_ops, &pdev->dev); if (err) goto out_sysfs; + obj->has_iommu_driver = true; } pm_runtime_enable(obj->dev); @@ -1252,8 +1249,6 @@ static int omap_iommu_probe(struct platform_device *pdev) out_sysfs: iommu_device_sysfs_remove(&obj->iommu); -out_group: - iommu_group_put(obj->group); return err; } @@ -1261,10 +1256,7 @@ static void omap_iommu_remove(struct platform_device *pdev) { struct omap_iommu *obj = platform_get_drvdata(pdev); - if (obj->group) { - iommu_group_put(obj->group); - obj->group = NULL; - + if (obj->has_iommu_driver) { iommu_device_sysfs_remove(&obj->iommu); iommu_device_unregister(&obj->iommu); } @@ -1318,7 +1310,8 @@ static u32 iotlb_init_entry(struct iotlb_entry *e, u32 da, u32 pa, int pgsz) } static int omap_iommu_map(struct iommu_domain *domain, unsigned long da, - phys_addr_t pa, size_t bytes, int prot, gfp_t gfp) + phys_addr_t pa, size_t bytes, size_t count, + int prot, gfp_t gfp, size_t *mapped) { struct omap_iommu_domain *omap_domain = to_omap_domain(domain); struct device *dev = omap_domain->dev; @@ -1356,13 +1349,15 @@ static int omap_iommu_map(struct iommu_domain *domain, unsigned long da, oiommu = iommu->iommu_dev; iopgtable_clear_entry(oiommu, da); } + } else { + *mapped = bytes; } return ret; } static size_t omap_iommu_unmap(struct iommu_domain *domain, unsigned long da, - size_t size, struct iommu_iotlb_gather *gather) + size_t size, size_t count, struct iommu_iotlb_gather *gather) { struct omap_iommu_domain *omap_domain = to_omap_domain(domain); struct device *dev = omap_domain->dev; @@ -1555,23 +1550,35 @@ static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain, omap_domain->dev = NULL; } -static void omap_iommu_set_platform_dma(struct device *dev) +static int omap_iommu_identity_attach(struct iommu_domain *identity_domain, + struct device *dev) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct omap_iommu_domain *omap_domain = to_omap_domain(domain); + struct omap_iommu_domain *omap_domain; + + if (domain == identity_domain || !domain) + return 0; + omap_domain = to_omap_domain(domain); spin_lock(&omap_domain->lock); _omap_iommu_detach_dev(omap_domain, dev); spin_unlock(&omap_domain->lock); + return 0; } -static struct iommu_domain *omap_iommu_domain_alloc(unsigned type) +static struct iommu_domain_ops omap_iommu_identity_ops = { + .attach_dev = omap_iommu_identity_attach, +}; + +static struct iommu_domain omap_iommu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &omap_iommu_identity_ops, +}; + +static struct iommu_domain *omap_iommu_domain_alloc_paging(struct device *dev) { struct omap_iommu_domain *omap_domain; - if (type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - omap_domain = kzalloc(sizeof(*omap_domain), GFP_KERNEL); if (!omap_domain) return NULL; @@ -1717,31 +1724,17 @@ static void omap_iommu_release_device(struct device *dev) } -static struct iommu_group *omap_iommu_device_group(struct device *dev) -{ - struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev); - struct iommu_group *group = ERR_PTR(-EINVAL); - - if (!arch_data) - return ERR_PTR(-ENODEV); - - if (arch_data->iommu_dev) - group = iommu_group_ref_get(arch_data->iommu_dev->group); - - return group; -} - static const struct iommu_ops omap_iommu_ops = { - .domain_alloc = omap_iommu_domain_alloc, + .identity_domain = &omap_iommu_identity_domain, + .domain_alloc_paging = omap_iommu_domain_alloc_paging, .probe_device = omap_iommu_probe_device, .release_device = omap_iommu_release_device, - .device_group = omap_iommu_device_group, - .set_platform_dma_ops = omap_iommu_set_platform_dma, + .device_group = generic_single_device_group, .pgsize_bitmap = OMAP_IOMMU_PGSIZES, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = omap_iommu_attach_dev, - .map = omap_iommu_map, - .unmap = omap_iommu_unmap, + .map_pages = omap_iommu_map, + .unmap_pages = omap_iommu_unmap, .iova_to_phys = omap_iommu_iova_to_phys, .free = omap_iommu_domain_free, } diff --git a/drivers/iommu/omap-iommu.h b/drivers/iommu/omap-iommu.h index 18ee713ede..27697109ec 100644 --- a/drivers/iommu/omap-iommu.h +++ b/drivers/iommu/omap-iommu.h @@ -80,7 +80,7 @@ struct omap_iommu { u32 id; struct iommu_device iommu; - struct iommu_group *group; + bool has_iommu_driver; u8 pwrst; }; diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index 8ff69fbf9f..2685861c0a 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -113,7 +113,6 @@ struct rk_iommu { struct iommu_device iommu; struct list_head node; /* entry in rk_iommu_domain.iommus */ struct iommu_domain *domain; /* domain to which iommu is attached */ - struct iommu_group *group; }; struct rk_iommudata { @@ -817,7 +816,8 @@ unwind: } static int rk_iommu_map(struct iommu_domain *domain, unsigned long _iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) + phys_addr_t paddr, size_t size, size_t count, + int prot, gfp_t gfp, size_t *mapped) { struct rk_iommu_domain *rk_domain = to_rk_domain(domain); unsigned long flags; @@ -850,12 +850,14 @@ static int rk_iommu_map(struct iommu_domain *domain, unsigned long _iova, paddr, size, prot); spin_unlock_irqrestore(&rk_domain->dt_lock, flags); + if (!ret) + *mapped = size; return ret; } static size_t rk_iommu_unmap(struct iommu_domain *domain, unsigned long _iova, - size_t size, struct iommu_iotlb_gather *gather) + size_t size, size_t count, struct iommu_iotlb_gather *gather) { struct rk_iommu_domain *rk_domain = to_rk_domain(domain); unsigned long flags; @@ -989,13 +991,8 @@ static int rk_iommu_identity_attach(struct iommu_domain *identity_domain, return 0; } -static void rk_iommu_identity_free(struct iommu_domain *domain) -{ -} - static struct iommu_domain_ops rk_identity_ops = { .attach_dev = rk_iommu_identity_attach, - .free = rk_iommu_identity_free, }; static struct iommu_domain rk_identity_domain = { @@ -1003,13 +1000,6 @@ static struct iommu_domain rk_identity_domain = { .ops = &rk_identity_ops, }; -#ifdef CONFIG_ARM -static void rk_iommu_set_platform_dma(struct device *dev) -{ - WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev)); -} -#endif - static int rk_iommu_attach_device(struct iommu_domain *domain, struct device *dev) { @@ -1055,16 +1045,10 @@ static int rk_iommu_attach_device(struct iommu_domain *domain, return ret; } -static struct iommu_domain *rk_iommu_domain_alloc(unsigned type) +static struct iommu_domain *rk_iommu_domain_alloc_paging(struct device *dev) { struct rk_iommu_domain *rk_domain; - if (type == IOMMU_DOMAIN_IDENTITY) - return &rk_identity_domain; - - if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA) - return NULL; - if (!dma_dev) return NULL; @@ -1155,15 +1139,6 @@ static void rk_iommu_release_device(struct device *dev) device_link_del(data->link); } -static struct iommu_group *rk_iommu_device_group(struct device *dev) -{ - struct rk_iommu *iommu; - - iommu = rk_iommu_from_dev(dev); - - return iommu_group_ref_get(iommu->group); -} - static int rk_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) { @@ -1186,19 +1161,17 @@ static int rk_iommu_of_xlate(struct device *dev, } static const struct iommu_ops rk_iommu_ops = { - .domain_alloc = rk_iommu_domain_alloc, + .identity_domain = &rk_identity_domain, + .domain_alloc_paging = rk_iommu_domain_alloc_paging, .probe_device = rk_iommu_probe_device, .release_device = rk_iommu_release_device, - .device_group = rk_iommu_device_group, -#ifdef CONFIG_ARM - .set_platform_dma_ops = rk_iommu_set_platform_dma, -#endif + .device_group = generic_single_device_group, .pgsize_bitmap = RK_IOMMU_PGSIZE_BITMAP, .of_xlate = rk_iommu_of_xlate, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = rk_iommu_attach_device, - .map = rk_iommu_map, - .unmap = rk_iommu_unmap, + .map_pages = rk_iommu_map, + .unmap_pages = rk_iommu_unmap, .iova_to_phys = rk_iommu_iova_to_phys, .free = rk_iommu_domain_free, } @@ -1280,15 +1253,9 @@ static int rk_iommu_probe(struct platform_device *pdev) if (err) return err; - iommu->group = iommu_group_alloc(); - if (IS_ERR(iommu->group)) { - err = PTR_ERR(iommu->group); - goto err_unprepare_clocks; - } - err = iommu_device_sysfs_add(&iommu->iommu, dev, NULL, dev_name(dev)); if (err) - goto err_put_group; + goto err_unprepare_clocks; err = iommu_device_register(&iommu->iommu, &rk_iommu_ops, dev); if (err) @@ -1325,8 +1292,6 @@ err_pm_disable: pm_runtime_disable(dev); err_remove_sysfs: iommu_device_sysfs_remove(&iommu->iommu); -err_put_group: - iommu_group_put(iommu->group); err_unprepare_clocks: clk_bulk_unprepare(iommu->num_clocks, iommu->clocks); return err; diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index fbf59a8db2..9a5196f523 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -14,16 +14,300 @@ #include <linux/rcupdate.h> #include <asm/pci_dma.h> +#include "dma-iommu.h" + static const struct iommu_ops s390_iommu_ops; +static struct kmem_cache *dma_region_table_cache; +static struct kmem_cache *dma_page_table_cache; + +static u64 s390_iommu_aperture; +static u32 s390_iommu_aperture_factor = 1; + struct s390_domain { struct iommu_domain domain; struct list_head devices; + struct zpci_iommu_ctrs ctrs; unsigned long *dma_table; spinlock_t list_lock; struct rcu_head rcu; }; +static inline unsigned int calc_rtx(dma_addr_t ptr) +{ + return ((unsigned long)ptr >> ZPCI_RT_SHIFT) & ZPCI_INDEX_MASK; +} + +static inline unsigned int calc_sx(dma_addr_t ptr) +{ + return ((unsigned long)ptr >> ZPCI_ST_SHIFT) & ZPCI_INDEX_MASK; +} + +static inline unsigned int calc_px(dma_addr_t ptr) +{ + return ((unsigned long)ptr >> PAGE_SHIFT) & ZPCI_PT_MASK; +} + +static inline void set_pt_pfaa(unsigned long *entry, phys_addr_t pfaa) +{ + *entry &= ZPCI_PTE_FLAG_MASK; + *entry |= (pfaa & ZPCI_PTE_ADDR_MASK); +} + +static inline void set_rt_sto(unsigned long *entry, phys_addr_t sto) +{ + *entry &= ZPCI_RTE_FLAG_MASK; + *entry |= (sto & ZPCI_RTE_ADDR_MASK); + *entry |= ZPCI_TABLE_TYPE_RTX; +} + +static inline void set_st_pto(unsigned long *entry, phys_addr_t pto) +{ + *entry &= ZPCI_STE_FLAG_MASK; + *entry |= (pto & ZPCI_STE_ADDR_MASK); + *entry |= ZPCI_TABLE_TYPE_SX; +} + +static inline void validate_rt_entry(unsigned long *entry) +{ + *entry &= ~ZPCI_TABLE_VALID_MASK; + *entry &= ~ZPCI_TABLE_OFFSET_MASK; + *entry |= ZPCI_TABLE_VALID; + *entry |= ZPCI_TABLE_LEN_RTX; +} + +static inline void validate_st_entry(unsigned long *entry) +{ + *entry &= ~ZPCI_TABLE_VALID_MASK; + *entry |= ZPCI_TABLE_VALID; +} + +static inline void invalidate_pt_entry(unsigned long *entry) +{ + WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_INVALID); + *entry &= ~ZPCI_PTE_VALID_MASK; + *entry |= ZPCI_PTE_INVALID; +} + +static inline void validate_pt_entry(unsigned long *entry) +{ + WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID); + *entry &= ~ZPCI_PTE_VALID_MASK; + *entry |= ZPCI_PTE_VALID; +} + +static inline void entry_set_protected(unsigned long *entry) +{ + *entry &= ~ZPCI_TABLE_PROT_MASK; + *entry |= ZPCI_TABLE_PROTECTED; +} + +static inline void entry_clr_protected(unsigned long *entry) +{ + *entry &= ~ZPCI_TABLE_PROT_MASK; + *entry |= ZPCI_TABLE_UNPROTECTED; +} + +static inline int reg_entry_isvalid(unsigned long entry) +{ + return (entry & ZPCI_TABLE_VALID_MASK) == ZPCI_TABLE_VALID; +} + +static inline int pt_entry_isvalid(unsigned long entry) +{ + return (entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID; +} + +static inline unsigned long *get_rt_sto(unsigned long entry) +{ + if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RTX) + return phys_to_virt(entry & ZPCI_RTE_ADDR_MASK); + else + return NULL; +} + +static inline unsigned long *get_st_pto(unsigned long entry) +{ + if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_SX) + return phys_to_virt(entry & ZPCI_STE_ADDR_MASK); + else + return NULL; +} + +static int __init dma_alloc_cpu_table_caches(void) +{ + dma_region_table_cache = kmem_cache_create("PCI_DMA_region_tables", + ZPCI_TABLE_SIZE, + ZPCI_TABLE_ALIGN, + 0, NULL); + if (!dma_region_table_cache) + return -ENOMEM; + + dma_page_table_cache = kmem_cache_create("PCI_DMA_page_tables", + ZPCI_PT_SIZE, + ZPCI_PT_ALIGN, + 0, NULL); + if (!dma_page_table_cache) { + kmem_cache_destroy(dma_region_table_cache); + return -ENOMEM; + } + return 0; +} + +static unsigned long *dma_alloc_cpu_table(gfp_t gfp) +{ + unsigned long *table, *entry; + + table = kmem_cache_alloc(dma_region_table_cache, gfp); + if (!table) + return NULL; + + for (entry = table; entry < table + ZPCI_TABLE_ENTRIES; entry++) + *entry = ZPCI_TABLE_INVALID; + return table; +} + +static void dma_free_cpu_table(void *table) +{ + kmem_cache_free(dma_region_table_cache, table); +} + +static void dma_free_page_table(void *table) +{ + kmem_cache_free(dma_page_table_cache, table); +} + +static void dma_free_seg_table(unsigned long entry) +{ + unsigned long *sto = get_rt_sto(entry); + int sx; + + for (sx = 0; sx < ZPCI_TABLE_ENTRIES; sx++) + if (reg_entry_isvalid(sto[sx])) + dma_free_page_table(get_st_pto(sto[sx])); + + dma_free_cpu_table(sto); +} + +static void dma_cleanup_tables(unsigned long *table) +{ + int rtx; + + if (!table) + return; + + for (rtx = 0; rtx < ZPCI_TABLE_ENTRIES; rtx++) + if (reg_entry_isvalid(table[rtx])) + dma_free_seg_table(table[rtx]); + + dma_free_cpu_table(table); +} + +static unsigned long *dma_alloc_page_table(gfp_t gfp) +{ + unsigned long *table, *entry; + + table = kmem_cache_alloc(dma_page_table_cache, gfp); + if (!table) + return NULL; + + for (entry = table; entry < table + ZPCI_PT_ENTRIES; entry++) + *entry = ZPCI_PTE_INVALID; + return table; +} + +static unsigned long *dma_get_seg_table_origin(unsigned long *rtep, gfp_t gfp) +{ + unsigned long old_rte, rte; + unsigned long *sto; + + rte = READ_ONCE(*rtep); + if (reg_entry_isvalid(rte)) { + sto = get_rt_sto(rte); + } else { + sto = dma_alloc_cpu_table(gfp); + if (!sto) + return NULL; + + set_rt_sto(&rte, virt_to_phys(sto)); + validate_rt_entry(&rte); + entry_clr_protected(&rte); + + old_rte = cmpxchg(rtep, ZPCI_TABLE_INVALID, rte); + if (old_rte != ZPCI_TABLE_INVALID) { + /* Somone else was faster, use theirs */ + dma_free_cpu_table(sto); + sto = get_rt_sto(old_rte); + } + } + return sto; +} + +static unsigned long *dma_get_page_table_origin(unsigned long *step, gfp_t gfp) +{ + unsigned long old_ste, ste; + unsigned long *pto; + + ste = READ_ONCE(*step); + if (reg_entry_isvalid(ste)) { + pto = get_st_pto(ste); + } else { + pto = dma_alloc_page_table(gfp); + if (!pto) + return NULL; + set_st_pto(&ste, virt_to_phys(pto)); + validate_st_entry(&ste); + entry_clr_protected(&ste); + + old_ste = cmpxchg(step, ZPCI_TABLE_INVALID, ste); + if (old_ste != ZPCI_TABLE_INVALID) { + /* Somone else was faster, use theirs */ + dma_free_page_table(pto); + pto = get_st_pto(old_ste); + } + } + return pto; +} + +static unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr, gfp_t gfp) +{ + unsigned long *sto, *pto; + unsigned int rtx, sx, px; + + rtx = calc_rtx(dma_addr); + sto = dma_get_seg_table_origin(&rto[rtx], gfp); + if (!sto) + return NULL; + + sx = calc_sx(dma_addr); + pto = dma_get_page_table_origin(&sto[sx], gfp); + if (!pto) + return NULL; + + px = calc_px(dma_addr); + return &pto[px]; +} + +static void dma_update_cpu_trans(unsigned long *ptep, phys_addr_t page_addr, int flags) +{ + unsigned long pte; + + pte = READ_ONCE(*ptep); + if (flags & ZPCI_PTE_INVALID) { + invalidate_pt_entry(&pte); + } else { + set_pt_pfaa(&pte, page_addr); + validate_pt_entry(&pte); + } + + if (flags & ZPCI_TABLE_PROTECTED) + entry_set_protected(&pte); + else + entry_clr_protected(&pte); + + xchg(ptep, pte); +} + static struct s390_domain *to_s390_domain(struct iommu_domain *dom) { return container_of(dom, struct s390_domain, domain); @@ -31,21 +315,22 @@ static struct s390_domain *to_s390_domain(struct iommu_domain *dom) static bool s390_iommu_capable(struct device *dev, enum iommu_cap cap) { + struct zpci_dev *zdev = to_zpci_dev(dev); + switch (cap) { case IOMMU_CAP_CACHE_COHERENCY: return true; + case IOMMU_CAP_DEFERRED_FLUSH: + return zdev->pft != PCI_FUNC_TYPE_ISM; default: return false; } } -static struct iommu_domain *s390_domain_alloc(unsigned domain_type) +static struct iommu_domain *s390_domain_alloc_paging(struct device *dev) { struct s390_domain *s390_domain; - if (domain_type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - s390_domain = kzalloc(sizeof(*s390_domain), GFP_KERNEL); if (!s390_domain) return NULL; @@ -84,14 +369,13 @@ static void s390_domain_free(struct iommu_domain *domain) call_rcu(&s390_domain->rcu, s390_iommu_rcu_free_domain); } -static void __s390_iommu_detach_device(struct zpci_dev *zdev) +static void s390_iommu_detach_device(struct iommu_domain *domain, + struct device *dev) { - struct s390_domain *s390_domain = zdev->s390_domain; + struct s390_domain *s390_domain = to_s390_domain(domain); + struct zpci_dev *zdev = to_zpci_dev(dev); unsigned long flags; - if (!s390_domain) - return; - spin_lock_irqsave(&s390_domain->list_lock, flags); list_del_rcu(&zdev->iommu_list); spin_unlock_irqrestore(&s390_domain->list_lock, flags); @@ -118,9 +402,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, return -EINVAL; if (zdev->s390_domain) - __s390_iommu_detach_device(zdev); - else if (zdev->dma_table) - zpci_dma_exit_device(zdev); + s390_iommu_detach_device(&zdev->s390_domain->domain, dev); cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, virt_to_phys(s390_domain->dma_table), &status); @@ -130,7 +412,6 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, */ if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL) return -EIO; - zdev->dma_table = s390_domain->dma_table; zdev->dma_table = s390_domain->dma_table; zdev->s390_domain = s390_domain; @@ -142,14 +423,6 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, return 0; } -static void s390_iommu_set_platform_dma(struct device *dev) -{ - struct zpci_dev *zdev = to_zpci_dev(dev); - - __s390_iommu_detach_device(zdev); - zpci_dma_init_device(zdev); -} - static void s390_iommu_get_resv_regions(struct device *dev, struct list_head *list) { @@ -190,6 +463,9 @@ static struct iommu_device *s390_iommu_probe_device(struct device *dev) if (zdev->end_dma > ZPCI_TABLE_SIZE_RT - 1) zdev->end_dma = ZPCI_TABLE_SIZE_RT - 1; + if (zdev->tlb_refresh) + dev->iommu->shadow_on_flush = 1; + return &zdev->iommu_dev; } @@ -202,7 +478,13 @@ static void s390_iommu_release_device(struct device *dev) * to the device, but keep it attached to other devices in the group. */ if (zdev) - __s390_iommu_detach_device(zdev); + s390_iommu_detach_device(&zdev->s390_domain->domain, dev); +} + +static int zpci_refresh_all(struct zpci_dev *zdev) +{ + return zpci_refresh_trans((u64)zdev->fh << 32, zdev->start_dma, + zdev->end_dma - zdev->start_dma + 1); } static void s390_iommu_flush_iotlb_all(struct iommu_domain *domain) @@ -212,8 +494,8 @@ static void s390_iommu_flush_iotlb_all(struct iommu_domain *domain) rcu_read_lock(); list_for_each_entry_rcu(zdev, &s390_domain->devices, iommu_list) { - zpci_refresh_trans((u64)zdev->fh << 32, zdev->start_dma, - zdev->end_dma - zdev->start_dma + 1); + atomic64_inc(&s390_domain->ctrs.global_rpcits); + zpci_refresh_all(zdev); } rcu_read_unlock(); } @@ -231,26 +513,40 @@ static void s390_iommu_iotlb_sync(struct iommu_domain *domain, rcu_read_lock(); list_for_each_entry_rcu(zdev, &s390_domain->devices, iommu_list) { + atomic64_inc(&s390_domain->ctrs.sync_rpcits); zpci_refresh_trans((u64)zdev->fh << 32, gather->start, size); } rcu_read_unlock(); } -static void s390_iommu_iotlb_sync_map(struct iommu_domain *domain, - unsigned long iova, size_t size) +static int s390_iommu_iotlb_sync_map(struct iommu_domain *domain, + unsigned long iova, size_t size) { struct s390_domain *s390_domain = to_s390_domain(domain); struct zpci_dev *zdev; + int ret = 0; rcu_read_lock(); list_for_each_entry_rcu(zdev, &s390_domain->devices, iommu_list) { if (!zdev->tlb_refresh) continue; - zpci_refresh_trans((u64)zdev->fh << 32, - iova, size); + atomic64_inc(&s390_domain->ctrs.sync_map_rpcits); + ret = zpci_refresh_trans((u64)zdev->fh << 32, + iova, size); + /* + * let the hypervisor discover invalidated entries + * allowing it to free IOVAs and unpin pages + */ + if (ret == -ENOMEM) { + ret = zpci_refresh_all(zdev); + if (ret) + break; + } } rcu_read_unlock(); + + return ret; } static int s390_iommu_validate_trans(struct s390_domain *s390_domain, @@ -330,16 +626,15 @@ static int s390_iommu_map_pages(struct iommu_domain *domain, if (!IS_ALIGNED(iova | paddr, pgsize)) return -EINVAL; - if (!(prot & IOMMU_READ)) - return -EINVAL; - if (!(prot & IOMMU_WRITE)) flags |= ZPCI_TABLE_PROTECTED; rc = s390_iommu_validate_trans(s390_domain, paddr, iova, - pgcount, flags, gfp); - if (!rc) + pgcount, flags, gfp); + if (!rc) { *mapped = size; + atomic64_add(pgcount, &s390_domain->ctrs.mapped_pages); + } return rc; } @@ -395,12 +690,26 @@ static size_t s390_iommu_unmap_pages(struct iommu_domain *domain, return 0; iommu_iotlb_gather_add_range(gather, iova, size); + atomic64_add(pgcount, &s390_domain->ctrs.unmapped_pages); return size; } +static void s390_iommu_probe_finalize(struct device *dev) +{ + iommu_setup_dma_ops(dev, 0, U64_MAX); +} + +struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev) +{ + if (!zdev || !zdev->s390_domain) + return NULL; + return &zdev->s390_domain->ctrs; +} + int zpci_init_iommu(struct zpci_dev *zdev) { + u64 aperture_size; int rc = 0; rc = iommu_device_sysfs_add(&zdev->iommu_dev, NULL, NULL, @@ -412,6 +721,12 @@ int zpci_init_iommu(struct zpci_dev *zdev) if (rc) goto out_sysfs; + zdev->start_dma = PAGE_ALIGN(zdev->start_dma); + aperture_size = min3(s390_iommu_aperture, + ZPCI_TABLE_SIZE_RT - zdev->start_dma, + zdev->end_dma - zdev->start_dma + 1); + zdev->end_dma = zdev->start_dma + aperture_size - 1; + return 0; out_sysfs: @@ -427,13 +742,52 @@ void zpci_destroy_iommu(struct zpci_dev *zdev) iommu_device_sysfs_remove(&zdev->iommu_dev); } +static int __init s390_iommu_setup(char *str) +{ + if (!strcmp(str, "strict")) { + pr_warn("s390_iommu=strict deprecated; use iommu.strict=1 instead\n"); + iommu_set_dma_strict(); + } + return 1; +} + +__setup("s390_iommu=", s390_iommu_setup); + +static int __init s390_iommu_aperture_setup(char *str) +{ + if (kstrtou32(str, 10, &s390_iommu_aperture_factor)) + s390_iommu_aperture_factor = 1; + return 1; +} + +__setup("s390_iommu_aperture=", s390_iommu_aperture_setup); + +static int __init s390_iommu_init(void) +{ + int rc; + + iommu_dma_forcedac = true; + s390_iommu_aperture = (u64)virt_to_phys(high_memory); + if (!s390_iommu_aperture_factor) + s390_iommu_aperture = ULONG_MAX; + else + s390_iommu_aperture *= s390_iommu_aperture_factor; + + rc = dma_alloc_cpu_table_caches(); + if (rc) + return rc; + + return rc; +} +subsys_initcall(s390_iommu_init); + static const struct iommu_ops s390_iommu_ops = { .capable = s390_iommu_capable, - .domain_alloc = s390_domain_alloc, + .domain_alloc_paging = s390_domain_alloc_paging, .probe_device = s390_iommu_probe_device, + .probe_finalize = s390_iommu_probe_finalize, .release_device = s390_iommu_release_device, .device_group = generic_device_group, - .set_platform_dma_ops = s390_iommu_set_platform_dma, .pgsize_bitmap = SZ_4K, .get_resv_regions = s390_iommu_get_resv_regions, .default_domain_ops = &(const struct iommu_domain_ops) { diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c index 2fa9afebd4..2eb9fb4670 100644 --- a/drivers/iommu/sprd-iommu.c +++ b/drivers/iommu/sprd-iommu.c @@ -70,7 +70,6 @@ struct sprd_iommu_device { void __iomem *base; struct device *dev; struct iommu_device iommu; - struct iommu_group *group; struct clk *eb; }; @@ -134,13 +133,10 @@ sprd_iommu_pgt_size(struct iommu_domain *domain) SPRD_IOMMU_PAGE_SHIFT) * sizeof(u32); } -static struct iommu_domain *sprd_iommu_domain_alloc(unsigned int domain_type) +static struct iommu_domain *sprd_iommu_domain_alloc_paging(struct device *dev) { struct sprd_iommu_domain *dom; - if (domain_type != IOMMU_DOMAIN_DMA && domain_type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - dom = kzalloc(sizeof(*dom), GFP_KERNEL); if (!dom) return NULL; @@ -345,8 +341,8 @@ static size_t sprd_iommu_unmap(struct iommu_domain *domain, unsigned long iova, return size; } -static void sprd_iommu_sync_map(struct iommu_domain *domain, - unsigned long iova, size_t size) +static int sprd_iommu_sync_map(struct iommu_domain *domain, + unsigned long iova, size_t size) { struct sprd_iommu_domain *dom = to_sprd_domain(domain); unsigned int reg; @@ -358,6 +354,7 @@ static void sprd_iommu_sync_map(struct iommu_domain *domain, /* clear IOMMU TLB buffer after page table updated */ sprd_iommu_write(dom->sdev, reg, 0xffffffff); + return 0; } static void sprd_iommu_sync(struct iommu_domain *domain, @@ -399,13 +396,6 @@ static struct iommu_device *sprd_iommu_probe_device(struct device *dev) return &sdev->iommu; } -static struct iommu_group *sprd_iommu_device_group(struct device *dev) -{ - struct sprd_iommu_device *sdev = dev_iommu_priv_get(dev); - - return iommu_group_ref_get(sdev->group); -} - static int sprd_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) { struct platform_device *pdev; @@ -421,9 +411,9 @@ static int sprd_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) static const struct iommu_ops sprd_iommu_ops = { - .domain_alloc = sprd_iommu_domain_alloc, + .domain_alloc_paging = sprd_iommu_domain_alloc_paging, .probe_device = sprd_iommu_probe_device, - .device_group = sprd_iommu_device_group, + .device_group = generic_single_device_group, .of_xlate = sprd_iommu_of_xlate, .pgsize_bitmap = SPRD_IOMMU_PAGE_SIZE, .owner = THIS_MODULE, @@ -496,16 +486,9 @@ static int sprd_iommu_probe(struct platform_device *pdev) platform_set_drvdata(pdev, sdev); sdev->dev = dev; - /* All the client devices are in the same iommu-group */ - sdev->group = iommu_group_alloc(); - if (IS_ERR(sdev->group)) { - ret = PTR_ERR(sdev->group); - goto free_page; - } - ret = iommu_device_sysfs_add(&sdev->iommu, dev, NULL, dev_name(dev)); if (ret) - goto put_group; + goto free_page; ret = iommu_device_register(&sdev->iommu, &sprd_iommu_ops, dev); if (ret) @@ -530,8 +513,6 @@ unregister_iommu: iommu_device_unregister(&sdev->iommu); remove_sysfs: iommu_device_sysfs_remove(&sdev->iommu); -put_group: - iommu_group_put(sdev->group); free_page: dma_free_coherent(sdev->dev, SPRD_IOMMU_PAGE_SIZE, sdev->prot_page_va, sdev->prot_page_pa); return ret; @@ -543,9 +524,6 @@ static void sprd_iommu_remove(struct platform_device *pdev) dma_free_coherent(sdev->dev, SPRD_IOMMU_PAGE_SIZE, sdev->prot_page_va, sdev->prot_page_pa); - iommu_group_put(sdev->group); - sdev->group = NULL; - platform_set_drvdata(pdev, NULL); iommu_device_sysfs_remove(&sdev->iommu); iommu_device_unregister(&sdev->iommu); diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c index 74c5cb93e9..41484a5a39 100644 --- a/drivers/iommu/sun50i-iommu.c +++ b/drivers/iommu/sun50i-iommu.c @@ -107,7 +107,6 @@ struct sun50i_iommu { struct clk *clk; struct iommu_domain *domain; - struct iommu_group *group; struct kmem_cache *pt_pool; }; @@ -402,8 +401,8 @@ static void sun50i_iommu_flush_iotlb_all(struct iommu_domain *domain) spin_unlock_irqrestore(&iommu->iommu_lock, flags); } -static void sun50i_iommu_iotlb_sync_map(struct iommu_domain *domain, - unsigned long iova, size_t size) +static int sun50i_iommu_iotlb_sync_map(struct iommu_domain *domain, + unsigned long iova, size_t size) { struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); struct sun50i_iommu *iommu = sun50i_domain->iommu; @@ -412,6 +411,8 @@ static void sun50i_iommu_iotlb_sync_map(struct iommu_domain *domain, spin_lock_irqsave(&iommu->iommu_lock, flags); sun50i_iommu_zap_range(iommu, iova, size); spin_unlock_irqrestore(&iommu->iommu_lock, flags); + + return 0; } static void sun50i_iommu_iotlb_sync(struct iommu_domain *domain, @@ -589,7 +590,8 @@ static u32 *sun50i_dte_get_page_table(struct sun50i_iommu_domain *sun50i_domain, } static int sun50i_iommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) + phys_addr_t paddr, size_t size, size_t count, + int prot, gfp_t gfp, size_t *mapped) { struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); struct sun50i_iommu *iommu = sun50i_domain->iommu; @@ -616,13 +618,14 @@ static int sun50i_iommu_map(struct iommu_domain *domain, unsigned long iova, *pte_addr = sun50i_mk_pte(paddr, prot); sun50i_table_flush(sun50i_domain, pte_addr, 1); + *mapped = size; out: return ret; } static size_t sun50i_iommu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t size, struct iommu_iotlb_gather *gather) + size_t size, size_t count, struct iommu_iotlb_gather *gather) { struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); phys_addr_t pt_phys; @@ -667,14 +670,11 @@ static phys_addr_t sun50i_iommu_iova_to_phys(struct iommu_domain *domain, sun50i_iova_get_page_offset(iova); } -static struct iommu_domain *sun50i_iommu_domain_alloc(unsigned type) +static struct iommu_domain * +sun50i_iommu_domain_alloc_paging(struct device *dev) { struct sun50i_iommu_domain *sun50i_domain; - if (type != IOMMU_DOMAIN_DMA && - type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - sun50i_domain = kzalloc(sizeof(*sun50i_domain), GFP_KERNEL); if (!sun50i_domain) return NULL; @@ -757,21 +757,32 @@ static void sun50i_iommu_detach_domain(struct sun50i_iommu *iommu, iommu->domain = NULL; } -static void sun50i_iommu_detach_device(struct iommu_domain *domain, - struct device *dev) +static int sun50i_iommu_identity_attach(struct iommu_domain *identity_domain, + struct device *dev) { - struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); struct sun50i_iommu *iommu = dev_iommu_priv_get(dev); + struct sun50i_iommu_domain *sun50i_domain; dev_dbg(dev, "Detaching from IOMMU domain\n"); - if (iommu->domain != domain) - return; + if (iommu->domain == identity_domain) + return 0; + sun50i_domain = to_sun50i_domain(iommu->domain); if (refcount_dec_and_test(&sun50i_domain->refcnt)) sun50i_iommu_detach_domain(iommu, sun50i_domain); + return 0; } +static struct iommu_domain_ops sun50i_iommu_identity_ops = { + .attach_dev = sun50i_iommu_identity_attach, +}; + +static struct iommu_domain sun50i_iommu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &sun50i_iommu_identity_ops, +}; + static int sun50i_iommu_attach_device(struct iommu_domain *domain, struct device *dev) { @@ -789,8 +800,7 @@ static int sun50i_iommu_attach_device(struct iommu_domain *domain, if (iommu->domain == domain) return 0; - if (iommu->domain) - sun50i_iommu_detach_device(iommu->domain, dev); + sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev); sun50i_iommu_attach_domain(iommu, sun50i_domain); @@ -808,13 +818,6 @@ static struct iommu_device *sun50i_iommu_probe_device(struct device *dev) return &iommu->iommu; } -static struct iommu_group *sun50i_iommu_device_group(struct device *dev) -{ - struct sun50i_iommu *iommu = sun50i_iommu_from_dev(dev); - - return iommu_group_ref_get(iommu->group); -} - static int sun50i_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) { @@ -827,9 +830,10 @@ static int sun50i_iommu_of_xlate(struct device *dev, } static const struct iommu_ops sun50i_iommu_ops = { + .identity_domain = &sun50i_iommu_identity_domain, .pgsize_bitmap = SZ_4K, - .device_group = sun50i_iommu_device_group, - .domain_alloc = sun50i_iommu_domain_alloc, + .device_group = generic_single_device_group, + .domain_alloc_paging = sun50i_iommu_domain_alloc_paging, .of_xlate = sun50i_iommu_of_xlate, .probe_device = sun50i_iommu_probe_device, .default_domain_ops = &(const struct iommu_domain_ops) { @@ -838,8 +842,8 @@ static const struct iommu_ops sun50i_iommu_ops = { .iotlb_sync_map = sun50i_iommu_iotlb_sync_map, .iotlb_sync = sun50i_iommu_iotlb_sync, .iova_to_phys = sun50i_iommu_iova_to_phys, - .map = sun50i_iommu_map, - .unmap = sun50i_iommu_unmap, + .map_pages = sun50i_iommu_map, + .unmap_pages = sun50i_iommu_unmap, .free = sun50i_iommu_domain_free, } }; @@ -985,6 +989,7 @@ static int sun50i_iommu_probe(struct platform_device *pdev) if (!iommu) return -ENOMEM; spin_lock_init(&iommu->iommu_lock); + iommu->domain = &sun50i_iommu_identity_domain; platform_set_drvdata(pdev, iommu); iommu->dev = &pdev->dev; @@ -995,42 +1000,36 @@ static int sun50i_iommu_probe(struct platform_device *pdev) if (!iommu->pt_pool) return -ENOMEM; - iommu->group = iommu_group_alloc(); - if (IS_ERR(iommu->group)) { - ret = PTR_ERR(iommu->group); - goto err_free_cache; - } - iommu->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(iommu->base)) { ret = PTR_ERR(iommu->base); - goto err_free_group; + goto err_free_cache; } irq = platform_get_irq(pdev, 0); if (irq < 0) { ret = irq; - goto err_free_group; + goto err_free_cache; } iommu->clk = devm_clk_get(&pdev->dev, NULL); if (IS_ERR(iommu->clk)) { dev_err(&pdev->dev, "Couldn't get our clock.\n"); ret = PTR_ERR(iommu->clk); - goto err_free_group; + goto err_free_cache; } iommu->reset = devm_reset_control_get(&pdev->dev, NULL); if (IS_ERR(iommu->reset)) { dev_err(&pdev->dev, "Couldn't get our reset line.\n"); ret = PTR_ERR(iommu->reset); - goto err_free_group; + goto err_free_cache; } ret = iommu_device_sysfs_add(&iommu->iommu, &pdev->dev, NULL, dev_name(&pdev->dev)); if (ret) - goto err_free_group; + goto err_free_cache; ret = iommu_device_register(&iommu->iommu, &sun50i_iommu_ops, &pdev->dev); if (ret) @@ -1049,9 +1048,6 @@ err_unregister: err_remove_sysfs: iommu_device_sysfs_remove(&iommu->iommu); -err_free_group: - iommu_group_put(iommu->group); - err_free_cache: kmem_cache_destroy(iommu->pt_pool); diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c deleted file mode 100644 index a482ff838b..0000000000 --- a/drivers/iommu/tegra-gart.c +++ /dev/null @@ -1,371 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * IOMMU API for Graphics Address Relocation Table on Tegra20 - * - * Copyright (c) 2010-2012, NVIDIA CORPORATION. All rights reserved. - * - * Author: Hiroshi DOYU <hdoyu@nvidia.com> - */ - -#define dev_fmt(fmt) "gart: " fmt - -#include <linux/io.h> -#include <linux/iommu.h> -#include <linux/moduleparam.h> -#include <linux/platform_device.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/vmalloc.h> - -#include <soc/tegra/mc.h> - -#define GART_REG_BASE 0x24 -#define GART_CONFIG (0x24 - GART_REG_BASE) -#define GART_ENTRY_ADDR (0x28 - GART_REG_BASE) -#define GART_ENTRY_DATA (0x2c - GART_REG_BASE) - -#define GART_ENTRY_PHYS_ADDR_VALID BIT(31) - -#define GART_PAGE_SHIFT 12 -#define GART_PAGE_SIZE (1 << GART_PAGE_SHIFT) -#define GART_PAGE_MASK GENMASK(30, GART_PAGE_SHIFT) - -/* bitmap of the page sizes currently supported */ -#define GART_IOMMU_PGSIZES (GART_PAGE_SIZE) - -struct gart_device { - void __iomem *regs; - u32 *savedata; - unsigned long iovmm_base; /* offset to vmm_area start */ - unsigned long iovmm_end; /* offset to vmm_area end */ - spinlock_t pte_lock; /* for pagetable */ - spinlock_t dom_lock; /* for active domain */ - unsigned int active_devices; /* number of active devices */ - struct iommu_domain *active_domain; /* current active domain */ - struct iommu_device iommu; /* IOMMU Core handle */ - struct device *dev; -}; - -static struct gart_device *gart_handle; /* unique for a system */ - -static bool gart_debug; - -/* - * Any interaction between any block on PPSB and a block on APB or AHB - * must have these read-back to ensure the APB/AHB bus transaction is - * complete before initiating activity on the PPSB block. - */ -#define FLUSH_GART_REGS(gart) readl_relaxed((gart)->regs + GART_CONFIG) - -#define for_each_gart_pte(gart, iova) \ - for (iova = gart->iovmm_base; \ - iova < gart->iovmm_end; \ - iova += GART_PAGE_SIZE) - -static inline void gart_set_pte(struct gart_device *gart, - unsigned long iova, unsigned long pte) -{ - writel_relaxed(iova, gart->regs + GART_ENTRY_ADDR); - writel_relaxed(pte, gart->regs + GART_ENTRY_DATA); -} - -static inline unsigned long gart_read_pte(struct gart_device *gart, - unsigned long iova) -{ - unsigned long pte; - - writel_relaxed(iova, gart->regs + GART_ENTRY_ADDR); - pte = readl_relaxed(gart->regs + GART_ENTRY_DATA); - - return pte; -} - -static void do_gart_setup(struct gart_device *gart, const u32 *data) -{ - unsigned long iova; - - for_each_gart_pte(gart, iova) - gart_set_pte(gart, iova, data ? *(data++) : 0); - - writel_relaxed(1, gart->regs + GART_CONFIG); - FLUSH_GART_REGS(gart); -} - -static inline bool gart_iova_range_invalid(struct gart_device *gart, - unsigned long iova, size_t bytes) -{ - return unlikely(iova < gart->iovmm_base || bytes != GART_PAGE_SIZE || - iova + bytes > gart->iovmm_end); -} - -static inline bool gart_pte_valid(struct gart_device *gart, unsigned long iova) -{ - return !!(gart_read_pte(gart, iova) & GART_ENTRY_PHYS_ADDR_VALID); -} - -static int gart_iommu_attach_dev(struct iommu_domain *domain, - struct device *dev) -{ - struct gart_device *gart = gart_handle; - int ret = 0; - - spin_lock(&gart->dom_lock); - - if (gart->active_domain && gart->active_domain != domain) { - ret = -EINVAL; - } else if (dev_iommu_priv_get(dev) != domain) { - dev_iommu_priv_set(dev, domain); - gart->active_domain = domain; - gart->active_devices++; - } - - spin_unlock(&gart->dom_lock); - - return ret; -} - -static void gart_iommu_set_platform_dma(struct device *dev) -{ - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct gart_device *gart = gart_handle; - - spin_lock(&gart->dom_lock); - - if (dev_iommu_priv_get(dev) == domain) { - dev_iommu_priv_set(dev, NULL); - - if (--gart->active_devices == 0) - gart->active_domain = NULL; - } - - spin_unlock(&gart->dom_lock); -} - -static struct iommu_domain *gart_iommu_domain_alloc(unsigned type) -{ - struct iommu_domain *domain; - - if (type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (domain) { - domain->geometry.aperture_start = gart_handle->iovmm_base; - domain->geometry.aperture_end = gart_handle->iovmm_end - 1; - domain->geometry.force_aperture = true; - } - - return domain; -} - -static void gart_iommu_domain_free(struct iommu_domain *domain) -{ - WARN_ON(gart_handle->active_domain == domain); - kfree(domain); -} - -static inline int __gart_iommu_map(struct gart_device *gart, unsigned long iova, - unsigned long pa) -{ - if (unlikely(gart_debug && gart_pte_valid(gart, iova))) { - dev_err(gart->dev, "Page entry is in-use\n"); - return -EINVAL; - } - - gart_set_pte(gart, iova, GART_ENTRY_PHYS_ADDR_VALID | pa); - - return 0; -} - -static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t pa, size_t bytes, int prot, gfp_t gfp) -{ - struct gart_device *gart = gart_handle; - int ret; - - if (gart_iova_range_invalid(gart, iova, bytes)) - return -EINVAL; - - spin_lock(&gart->pte_lock); - ret = __gart_iommu_map(gart, iova, (unsigned long)pa); - spin_unlock(&gart->pte_lock); - - return ret; -} - -static inline int __gart_iommu_unmap(struct gart_device *gart, - unsigned long iova) -{ - if (unlikely(gart_debug && !gart_pte_valid(gart, iova))) { - dev_err(gart->dev, "Page entry is invalid\n"); - return -EINVAL; - } - - gart_set_pte(gart, iova, 0); - - return 0; -} - -static size_t gart_iommu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t bytes, struct iommu_iotlb_gather *gather) -{ - struct gart_device *gart = gart_handle; - int err; - - if (gart_iova_range_invalid(gart, iova, bytes)) - return 0; - - spin_lock(&gart->pte_lock); - err = __gart_iommu_unmap(gart, iova); - spin_unlock(&gart->pte_lock); - - return err ? 0 : bytes; -} - -static phys_addr_t gart_iommu_iova_to_phys(struct iommu_domain *domain, - dma_addr_t iova) -{ - struct gart_device *gart = gart_handle; - unsigned long pte; - - if (gart_iova_range_invalid(gart, iova, GART_PAGE_SIZE)) - return -EINVAL; - - spin_lock(&gart->pte_lock); - pte = gart_read_pte(gart, iova); - spin_unlock(&gart->pte_lock); - - return pte & GART_PAGE_MASK; -} - -static struct iommu_device *gart_iommu_probe_device(struct device *dev) -{ - if (!dev_iommu_fwspec_get(dev)) - return ERR_PTR(-ENODEV); - - return &gart_handle->iommu; -} - -static int gart_iommu_of_xlate(struct device *dev, - struct of_phandle_args *args) -{ - return 0; -} - -static void gart_iommu_sync_map(struct iommu_domain *domain, unsigned long iova, - size_t size) -{ - FLUSH_GART_REGS(gart_handle); -} - -static void gart_iommu_sync(struct iommu_domain *domain, - struct iommu_iotlb_gather *gather) -{ - size_t length = gather->end - gather->start + 1; - - gart_iommu_sync_map(domain, gather->start, length); -} - -static const struct iommu_ops gart_iommu_ops = { - .domain_alloc = gart_iommu_domain_alloc, - .probe_device = gart_iommu_probe_device, - .device_group = generic_device_group, - .set_platform_dma_ops = gart_iommu_set_platform_dma, - .pgsize_bitmap = GART_IOMMU_PGSIZES, - .of_xlate = gart_iommu_of_xlate, - .default_domain_ops = &(const struct iommu_domain_ops) { - .attach_dev = gart_iommu_attach_dev, - .map = gart_iommu_map, - .unmap = gart_iommu_unmap, - .iova_to_phys = gart_iommu_iova_to_phys, - .iotlb_sync_map = gart_iommu_sync_map, - .iotlb_sync = gart_iommu_sync, - .free = gart_iommu_domain_free, - } -}; - -int tegra_gart_suspend(struct gart_device *gart) -{ - u32 *data = gart->savedata; - unsigned long iova; - - /* - * All GART users shall be suspended at this point. Disable - * address translation to trap all GART accesses as invalid - * memory accesses. - */ - writel_relaxed(0, gart->regs + GART_CONFIG); - FLUSH_GART_REGS(gart); - - for_each_gart_pte(gart, iova) - *(data++) = gart_read_pte(gart, iova); - - return 0; -} - -int tegra_gart_resume(struct gart_device *gart) -{ - do_gart_setup(gart, gart->savedata); - - return 0; -} - -struct gart_device *tegra_gart_probe(struct device *dev, struct tegra_mc *mc) -{ - struct gart_device *gart; - struct resource *res; - int err; - - BUILD_BUG_ON(PAGE_SHIFT != GART_PAGE_SHIFT); - - /* the GART memory aperture is required */ - res = platform_get_resource(to_platform_device(dev), IORESOURCE_MEM, 1); - if (!res) { - dev_err(dev, "Memory aperture resource unavailable\n"); - return ERR_PTR(-ENXIO); - } - - gart = kzalloc(sizeof(*gart), GFP_KERNEL); - if (!gart) - return ERR_PTR(-ENOMEM); - - gart_handle = gart; - - gart->dev = dev; - gart->regs = mc->regs + GART_REG_BASE; - gart->iovmm_base = res->start; - gart->iovmm_end = res->end + 1; - spin_lock_init(&gart->pte_lock); - spin_lock_init(&gart->dom_lock); - - do_gart_setup(gart, NULL); - - err = iommu_device_sysfs_add(&gart->iommu, dev, NULL, "gart"); - if (err) - goto free_gart; - - err = iommu_device_register(&gart->iommu, &gart_iommu_ops, dev); - if (err) - goto remove_sysfs; - - gart->savedata = vmalloc(resource_size(res) / GART_PAGE_SIZE * - sizeof(u32)); - if (!gart->savedata) { - err = -ENOMEM; - goto unregister_iommu; - } - - return gart; - -unregister_iommu: - iommu_device_unregister(&gart->iommu); -remove_sysfs: - iommu_device_sysfs_remove(&gart->iommu); -free_gart: - kfree(gart); - - return ERR_PTR(err); -} - -module_param(gart_debug, bool, 0644); -MODULE_PARM_DESC(gart_debug, "Enable GART debugging"); diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index e445f80d02..310871728a 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -272,13 +272,10 @@ static void tegra_smmu_free_asid(struct tegra_smmu *smmu, unsigned int id) clear_bit(id, smmu->asids); } -static struct iommu_domain *tegra_smmu_domain_alloc(unsigned type) +static struct iommu_domain *tegra_smmu_domain_alloc_paging(struct device *dev) { struct tegra_smmu_as *as; - if (type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - as = kzalloc(sizeof(*as), GFP_KERNEL); if (!as) return NULL; @@ -511,23 +508,39 @@ disable: return err; } -static void tegra_smmu_set_platform_dma(struct device *dev) +static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain, + struct device *dev) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - struct tegra_smmu_as *as = to_smmu_as(domain); - struct tegra_smmu *smmu = as->smmu; + struct tegra_smmu_as *as; + struct tegra_smmu *smmu; unsigned int index; if (!fwspec) - return; + return -ENODEV; + if (domain == identity_domain || !domain) + return 0; + + as = to_smmu_as(domain); + smmu = as->smmu; for (index = 0; index < fwspec->num_ids; index++) { tegra_smmu_disable(smmu, fwspec->ids[index], as->id); tegra_smmu_as_unprepare(smmu, as); } + return 0; } +static struct iommu_domain_ops tegra_smmu_identity_ops = { + .attach_dev = tegra_smmu_identity_attach, +}; + +static struct iommu_domain tegra_smmu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &tegra_smmu_identity_ops, +}; + static void tegra_smmu_set_pde(struct tegra_smmu_as *as, unsigned long iova, u32 value) { @@ -751,7 +764,8 @@ __tegra_smmu_unmap(struct iommu_domain *domain, unsigned long iova, } static int tegra_smmu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) + phys_addr_t paddr, size_t size, size_t count, + int prot, gfp_t gfp, size_t *mapped) { struct tegra_smmu_as *as = to_smmu_as(domain); unsigned long flags; @@ -761,11 +775,14 @@ static int tegra_smmu_map(struct iommu_domain *domain, unsigned long iova, ret = __tegra_smmu_map(domain, iova, paddr, size, prot, gfp, &flags); spin_unlock_irqrestore(&as->lock, flags); + if (!ret) + *mapped = size; + return ret; } static size_t tegra_smmu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t size, struct iommu_iotlb_gather *gather) + size_t size, size_t count, struct iommu_iotlb_gather *gather) { struct tegra_smmu_as *as = to_smmu_as(domain); unsigned long flags; @@ -962,17 +979,28 @@ static int tegra_smmu_of_xlate(struct device *dev, return iommu_fwspec_add_ids(dev, &id, 1); } +static int tegra_smmu_def_domain_type(struct device *dev) +{ + /* + * FIXME: For now we want to run all translation in IDENTITY mode, due + * to some device quirks. Better would be to just quirk the troubled + * devices. + */ + return IOMMU_DOMAIN_IDENTITY; +} + static const struct iommu_ops tegra_smmu_ops = { - .domain_alloc = tegra_smmu_domain_alloc, + .identity_domain = &tegra_smmu_identity_domain, + .def_domain_type = &tegra_smmu_def_domain_type, + .domain_alloc_paging = tegra_smmu_domain_alloc_paging, .probe_device = tegra_smmu_probe_device, .device_group = tegra_smmu_device_group, - .set_platform_dma_ops = tegra_smmu_set_platform_dma, .of_xlate = tegra_smmu_of_xlate, .pgsize_bitmap = SZ_4K, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = tegra_smmu_attach_dev, - .map = tegra_smmu_map, - .unmap = tegra_smmu_unmap, + .map_pages = tegra_smmu_map, + .unmap_pages = tegra_smmu_unmap, .iova_to_phys = tegra_smmu_iova_to_phys, .free = tegra_smmu_domain_free, } @@ -1056,8 +1084,6 @@ DEFINE_SHOW_ATTRIBUTE(tegra_smmu_clients); static void tegra_smmu_debugfs_init(struct tegra_smmu *smmu) { smmu->debugfs = debugfs_create_dir("smmu", NULL); - if (!smmu->debugfs) - return; debugfs_create_file("swgroups", S_IRUGO, smmu->debugfs, smmu, &tegra_smmu_swgroups_fops); diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 17dcd826f5..379ebe03ef 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -85,7 +85,7 @@ struct viommu_request { void *writeback; unsigned int write_offset; unsigned int len; - char buf[]; + char buf[] __counted_by(len); }; #define VIOMMU_FAULT_RESV_MASK 0xffffff00 @@ -230,7 +230,7 @@ static int __viommu_add_req(struct viommu_dev *viommu, void *buf, size_t len, if (write_offset <= 0) return -EINVAL; - req = kzalloc(sizeof(*req) + len, GFP_ATOMIC); + req = kzalloc(struct_size(req, buf, len), GFP_ATOMIC); if (!req) return -ENOMEM; |