summaryrefslogtreecommitdiffstats
path: root/drivers/iommu
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 10:05:51 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 10:05:51 +0000
commit5d1646d90e1f2cceb9f0828f4b28318cd0ec7744 (patch)
treea94efe259b9009378be6d90eb30d2b019d95c194 /drivers/iommu
parentInitial commit. (diff)
downloadlinux-upstream.tar.xz
linux-upstream.zip
Adding upstream version 5.10.209.upstream/5.10.209upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/iommu')
-rw-r--r--drivers/iommu/Kconfig404
-rw-r--r--drivers/iommu/Makefile29
-rw-r--r--drivers/iommu/amd/Kconfig44
-rw-r--r--drivers/iommu/amd/Makefile4
-rw-r--r--drivers/iommu/amd/amd_iommu.h115
-rw-r--r--drivers/iommu/amd/amd_iommu_types.h920
-rw-r--r--drivers/iommu/amd/debugfs.c32
-rw-r--r--drivers/iommu/amd/init.c3318
-rw-r--r--drivers/iommu/amd/iommu.c4144
-rw-r--r--drivers/iommu/amd/iommu_v2.c989
-rw-r--r--drivers/iommu/amd/quirks.c105
-rw-r--r--drivers/iommu/arm/Makefile2
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/Makefile5
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c248
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c3641
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h723
-rw-r--r--drivers/iommu/arm/arm-smmu/Makefile4
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu-impl.c230
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c278
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c195
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu.c2349
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu.h528
-rw-r--r--drivers/iommu/arm/arm-smmu/qcom_iommu.c953
-rw-r--r--drivers/iommu/dma-iommu.c1288
-rw-r--r--drivers/iommu/exynos-iommu.c1392
-rw-r--r--drivers/iommu/fsl_pamu.c1271
-rw-r--r--drivers/iommu/fsl_pamu.h400
-rw-r--r--drivers/iommu/fsl_pamu_domain.c1069
-rw-r--r--drivers/iommu/fsl_pamu_domain.h73
-rw-r--r--drivers/iommu/hyperv-iommu.c199
-rw-r--r--drivers/iommu/intel/Kconfig87
-rw-r--r--drivers/iommu/intel/Makefile7
-rw-r--r--drivers/iommu/intel/debugfs.c559
-rw-r--r--drivers/iommu/intel/dmar.c2324
-rw-r--r--drivers/iommu/intel/iommu.c6406
-rw-r--r--drivers/iommu/intel/irq_remapping.c1531
-rw-r--r--drivers/iommu/intel/pasid.c883
-rw-r--r--drivers/iommu/intel/pasid.h135
-rw-r--r--drivers/iommu/intel/svm.c1223
-rw-r--r--drivers/iommu/intel/trace.c14
-rw-r--r--drivers/iommu/io-pgtable-arm-v7s.c991
-rw-r--r--drivers/iommu/io-pgtable-arm.c1250
-rw-r--r--drivers/iommu/io-pgtable-arm.h30
-rw-r--r--drivers/iommu/io-pgtable.c70
-rw-r--r--drivers/iommu/ioasid.c422
-rw-r--r--drivers/iommu/iommu-debugfs.c51
-rw-r--r--drivers/iommu/iommu-sysfs.c135
-rw-r--r--drivers/iommu/iommu-traces.c28
-rw-r--r--drivers/iommu/iommu.c3060
-rw-r--r--drivers/iommu/iova.c1050
-rw-r--r--drivers/iommu/ipmmu-vmsa.c1204
-rw-r--r--drivers/iommu/irq_remapping.c174
-rw-r--r--drivers/iommu/irq_remapping.h62
-rw-r--r--drivers/iommu/msm_iommu.c852
-rw-r--r--drivers/iommu/msm_iommu.h89
-rw-r--r--drivers/iommu/msm_iommu_hw-8xxx.h1852
-rw-r--r--drivers/iommu/mtk_iommu.c894
-rw-r--r--drivers/iommu/mtk_iommu.h101
-rw-r--r--drivers/iommu/mtk_iommu_v1.c715
-rw-r--r--drivers/iommu/of_iommu.c247
-rw-r--r--drivers/iommu/omap-iommu-debug.c274
-rw-r--r--drivers/iommu/omap-iommu.c1794
-rw-r--r--drivers/iommu/omap-iommu.h274
-rw-r--r--drivers/iommu/omap-iopgtable.h100
-rw-r--r--drivers/iommu/rockchip-iommu.c1305
-rw-r--r--drivers/iommu/s390-iommu.c376
-rw-r--r--drivers/iommu/sun50i-iommu.c1024
-rw-r--r--drivers/iommu/tegra-gart.c380
-rw-r--r--drivers/iommu/tegra-smmu.c1182
-rw-r--r--drivers/iommu/virtio-iommu.c1158
70 files changed, 59265 insertions, 0 deletions
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
new file mode 100644
index 000000000..04878caf6
--- /dev/null
+++ b/drivers/iommu/Kconfig
@@ -0,0 +1,404 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# The IOVA library may also be used by non-IOMMU_API users
+config IOMMU_IOVA
+ tristate
+
+# The IOASID library may also be used by non-IOMMU_API users
+config IOASID
+ tristate
+
+# IOMMU_API always gets selected by whoever wants it.
+config IOMMU_API
+ bool
+
+menuconfig IOMMU_SUPPORT
+ bool "IOMMU Hardware Support"
+ depends on MMU
+ default y
+ help
+ Say Y here if you want to compile device drivers for IO Memory
+ Management Units into the kernel. These devices usually allow to
+ remap DMA requests and/or remap interrupts from other devices on the
+ system.
+
+if IOMMU_SUPPORT
+
+menu "Generic IOMMU Pagetable Support"
+
+# Selected by the actual pagetable implementations
+config IOMMU_IO_PGTABLE
+ bool
+
+config IOMMU_IO_PGTABLE_LPAE
+ bool "ARMv7/v8 Long Descriptor Format"
+ select IOMMU_IO_PGTABLE
+ depends on ARM || ARM64 || (COMPILE_TEST && !GENERIC_ATOMIC64)
+ help
+ Enable support for the ARM long descriptor pagetable format.
+ This allocator supports 4K/2M/1G, 16K/32M and 64K/512M page
+ sizes at both stage-1 and stage-2, as well as address spaces
+ up to 48-bits in size.
+
+config IOMMU_IO_PGTABLE_LPAE_SELFTEST
+ bool "LPAE selftests"
+ depends on IOMMU_IO_PGTABLE_LPAE
+ help
+ Enable self-tests for LPAE page table allocator. This performs
+ a series of page-table consistency checks during boot.
+
+ If unsure, say N here.
+
+config IOMMU_IO_PGTABLE_ARMV7S
+ bool "ARMv7/v8 Short Descriptor Format"
+ select IOMMU_IO_PGTABLE
+ depends on ARM || ARM64 || COMPILE_TEST
+ help
+ Enable support for the ARM Short-descriptor pagetable format.
+ This supports 32-bit virtual and physical addresses mapped using
+ 2-level tables with 4KB pages/1MB sections, and contiguous entries
+ for 64KB pages/16MB supersections if indicated by the IOMMU driver.
+
+config IOMMU_IO_PGTABLE_ARMV7S_SELFTEST
+ bool "ARMv7s selftests"
+ depends on IOMMU_IO_PGTABLE_ARMV7S
+ help
+ Enable self-tests for ARMv7s page table allocator. This performs
+ a series of page-table consistency checks during boot.
+
+ If unsure, say N here.
+
+endmenu
+
+config IOMMU_DEBUGFS
+ bool "Export IOMMU internals in DebugFS"
+ depends on DEBUG_FS
+ help
+ Allows exposure of IOMMU device internals. This option enables
+ the use of debugfs by IOMMU drivers as required. Devices can,
+ at initialization time, cause the IOMMU code to create a top-level
+ debug/iommu directory, and then populate a subdirectory with
+ entries as required.
+
+config IOMMU_DEFAULT_PASSTHROUGH
+ bool "IOMMU passthrough by default"
+ depends on IOMMU_API
+ help
+ Enable passthrough by default, removing the need to pass in
+ iommu.passthrough=on or iommu=pt through command line. If this
+ is enabled, you can still disable with iommu.passthrough=off
+ or iommu=nopt depending on the architecture.
+
+ If unsure, say N here.
+
+config OF_IOMMU
+ def_bool y
+ depends on OF && IOMMU_API
+
+# IOMMU-agnostic DMA-mapping layer
+config IOMMU_DMA
+ bool
+ select DMA_OPS
+ select IOMMU_API
+ select IOMMU_IOVA
+ select IRQ_MSI_IOMMU
+ select NEED_SG_DMA_LENGTH
+
+config FSL_PAMU
+ bool "Freescale IOMMU support"
+ depends on PCI
+ depends on PHYS_64BIT
+ depends on PPC_E500MC || (COMPILE_TEST && PPC)
+ select IOMMU_API
+ select GENERIC_ALLOCATOR
+ help
+ Freescale PAMU support. PAMU is the IOMMU present on Freescale QorIQ platforms.
+ PAMU can authorize memory access, remap the memory address, and remap I/O
+ transaction types.
+
+# MSM IOMMU support
+config MSM_IOMMU
+ bool "MSM IOMMU Support"
+ depends on ARM
+ depends on ARCH_MSM8X60 || ARCH_MSM8960 || COMPILE_TEST
+ select IOMMU_API
+ select IOMMU_IO_PGTABLE_ARMV7S
+ help
+ Support for the IOMMUs found on certain Qualcomm SOCs.
+ These IOMMUs allow virtualization of the address space used by most
+ cores within the multimedia subsystem.
+
+ If unsure, say N here.
+
+source "drivers/iommu/amd/Kconfig"
+source "drivers/iommu/intel/Kconfig"
+
+config IRQ_REMAP
+ bool "Support for Interrupt Remapping"
+ depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI
+ select DMAR_TABLE
+ help
+ Supports Interrupt remapping for IO-APIC and MSI devices.
+ To use x2apic mode in the CPU's which support x2APIC enhancements or
+ to support platforms with CPU's having > 8 bit APIC ID, say Y.
+
+# OMAP IOMMU support
+config OMAP_IOMMU
+ bool "OMAP IOMMU Support"
+ depends on ARCH_OMAP2PLUS || COMPILE_TEST
+ select IOMMU_API
+ help
+ The OMAP3 media platform drivers depend on iommu support,
+ if you need them say Y here.
+
+config OMAP_IOMMU_DEBUG
+ bool "Export OMAP IOMMU internals in DebugFS"
+ depends on OMAP_IOMMU && DEBUG_FS
+ help
+ Select this to see extensive information about
+ the internal state of OMAP IOMMU in debugfs.
+
+ Say N unless you know you need this.
+
+config ROCKCHIP_IOMMU
+ bool "Rockchip IOMMU Support"
+ depends on ARCH_ROCKCHIP || COMPILE_TEST
+ select IOMMU_API
+ select ARM_DMA_USE_IOMMU
+ help
+ Support for IOMMUs found on Rockchip rk32xx SOCs.
+ These IOMMUs allow virtualization of the address space used by most
+ cores within the multimedia subsystem.
+ Say Y here if you are using a Rockchip SoC that includes an IOMMU
+ device.
+
+config SUN50I_IOMMU
+ bool "Allwinner H6 IOMMU Support"
+ depends on HAS_DMA
+ depends on ARCH_SUNXI || COMPILE_TEST
+ select ARM_DMA_USE_IOMMU
+ select IOMMU_API
+ help
+ Support for the IOMMU introduced in the Allwinner H6 SoCs.
+
+config TEGRA_IOMMU_GART
+ bool "Tegra GART IOMMU Support"
+ depends on ARCH_TEGRA_2x_SOC
+ depends on TEGRA_MC
+ select IOMMU_API
+ help
+ Enables support for remapping discontiguous physical memory
+ shared with the operating system into contiguous I/O virtual
+ space through the GART (Graphics Address Relocation Table)
+ hardware included on Tegra SoCs.
+
+config TEGRA_IOMMU_SMMU
+ bool "NVIDIA Tegra SMMU Support"
+ depends on ARCH_TEGRA
+ depends on TEGRA_AHB
+ depends on TEGRA_MC
+ select IOMMU_API
+ help
+ This driver supports the IOMMU hardware (SMMU) found on NVIDIA Tegra
+ SoCs (Tegra30 up to Tegra210).
+
+config EXYNOS_IOMMU
+ bool "Exynos IOMMU Support"
+ depends on ARCH_EXYNOS || COMPILE_TEST
+ depends on !CPU_BIG_ENDIAN # revisit driver if we can enable big-endian ptes
+ select IOMMU_API
+ select ARM_DMA_USE_IOMMU
+ help
+ Support for the IOMMU (System MMU) of Samsung Exynos application
+ processor family. This enables H/W multimedia accelerators to see
+ non-linear physical memory chunks as linear memory in their
+ address space.
+
+ If unsure, say N here.
+
+config EXYNOS_IOMMU_DEBUG
+ bool "Debugging log for Exynos IOMMU"
+ depends on EXYNOS_IOMMU
+ help
+ Select this to see the detailed log message that shows what
+ happens in the IOMMU driver.
+
+ Say N unless you need kernel log message for IOMMU debugging.
+
+config IPMMU_VMSA
+ bool "Renesas VMSA-compatible IPMMU"
+ depends on ARCH_RENESAS || (COMPILE_TEST && !GENERIC_ATOMIC64)
+ select IOMMU_API
+ select IOMMU_IO_PGTABLE_LPAE
+ select ARM_DMA_USE_IOMMU
+ help
+ Support for the Renesas VMSA-compatible IPMMU found in the R-Mobile
+ APE6, R-Car Gen{2,3} and RZ/G{1,2} SoCs.
+
+ If unsure, say N.
+
+config SPAPR_TCE_IOMMU
+ bool "sPAPR TCE IOMMU Support"
+ depends on PPC_POWERNV || PPC_PSERIES
+ select IOMMU_API
+ help
+ Enables bits of IOMMU API required by VFIO. The iommu_ops
+ is not implemented as it is not necessary for VFIO.
+
+# ARM IOMMU support
+config ARM_SMMU
+ tristate "ARM Ltd. System MMU (SMMU) Support"
+ depends on ARM64 || ARM || (COMPILE_TEST && !GENERIC_ATOMIC64)
+ select IOMMU_API
+ select IOMMU_IO_PGTABLE_LPAE
+ select ARM_DMA_USE_IOMMU if ARM
+ help
+ Support for implementations of the ARM System MMU architecture
+ versions 1 and 2.
+
+ Say Y here if your SoC includes an IOMMU device implementing
+ the ARM SMMU architecture.
+
+config ARM_SMMU_LEGACY_DT_BINDINGS
+ bool "Support the legacy \"mmu-masters\" devicetree bindings"
+ depends on ARM_SMMU=y && OF
+ help
+ Support for the badly designed and deprecated "mmu-masters"
+ devicetree bindings. This allows some DMA masters to attach
+ to the SMMU but does not provide any support via the DMA API.
+ If you're lucky, you might be able to get VFIO up and running.
+
+ If you say Y here then you'll make me very sad. Instead, say N
+ and move your firmware to the utopian future that was 2016.
+
+config ARM_SMMU_DISABLE_BYPASS_BY_DEFAULT
+ bool "Default to disabling bypass on ARM SMMU v1 and v2"
+ depends on ARM_SMMU
+ default y
+ help
+ Say Y here to (by default) disable bypass streams such that
+ incoming transactions from devices that are not attached to
+ an iommu domain will report an abort back to the device and
+ will not be allowed to pass through the SMMU.
+
+ Any old kernels that existed before this KConfig was
+ introduced would default to _allowing_ bypass (AKA the
+ equivalent of NO for this config). However the default for
+ this option is YES because the old behavior is insecure.
+
+ There are few reasons to allow unmatched stream bypass, and
+ even fewer good ones. If saying YES here breaks your board
+ you should work on fixing your board. This KConfig option
+ is expected to be removed in the future and we'll simply
+ hardcode the bypass disable in the code.
+
+ NOTE: the kernel command line parameter
+ 'arm-smmu.disable_bypass' will continue to override this
+ config.
+
+config ARM_SMMU_V3
+ tristate "ARM Ltd. System MMU Version 3 (SMMUv3) Support"
+ depends on ARM64
+ select IOMMU_API
+ select IOMMU_IO_PGTABLE_LPAE
+ select GENERIC_MSI_IRQ_DOMAIN
+ help
+ Support for implementations of the ARM System MMU architecture
+ version 3 providing translation support to a PCIe root complex.
+
+ Say Y here if your system includes an IOMMU device implementing
+ the ARM SMMUv3 architecture.
+
+config ARM_SMMU_V3_SVA
+ bool "Shared Virtual Addressing support for the ARM SMMUv3"
+ depends on ARM_SMMU_V3
+ help
+ Support for sharing process address spaces with devices using the
+ SMMUv3.
+
+ Say Y here if your system supports SVA extensions such as PCIe PASID
+ and PRI.
+
+config S390_IOMMU
+ def_bool y if S390 && PCI
+ depends on S390 && PCI
+ select IOMMU_API
+ help
+ Support for the IOMMU API for s390 PCI devices.
+
+config S390_CCW_IOMMU
+ bool "S390 CCW IOMMU Support"
+ depends on S390 && CCW || COMPILE_TEST
+ select IOMMU_API
+ help
+ Enables bits of IOMMU API required by VFIO. The iommu_ops
+ is not implemented as it is not necessary for VFIO.
+
+config S390_AP_IOMMU
+ bool "S390 AP IOMMU Support"
+ depends on S390 && ZCRYPT || COMPILE_TEST
+ select IOMMU_API
+ help
+ Enables bits of IOMMU API required by VFIO. The iommu_ops
+ is not implemented as it is not necessary for VFIO.
+
+config MTK_IOMMU
+ bool "MTK IOMMU Support"
+ depends on ARCH_MEDIATEK || COMPILE_TEST
+ select ARM_DMA_USE_IOMMU
+ select IOMMU_API
+ select IOMMU_IO_PGTABLE_ARMV7S
+ select MEMORY
+ select MTK_SMI
+ help
+ Support for the M4U on certain Mediatek SOCs. M4U is MultiMedia
+ Memory Management Unit. This option enables remapping of DMA memory
+ accesses for the multimedia subsystem.
+
+ If unsure, say N here.
+
+config MTK_IOMMU_V1
+ bool "MTK IOMMU Version 1 (M4U gen1) Support"
+ depends on ARM
+ depends on ARCH_MEDIATEK || COMPILE_TEST
+ select ARM_DMA_USE_IOMMU
+ select IOMMU_API
+ select MEMORY
+ select MTK_SMI
+ help
+ Support for the M4U on certain Mediatek SoCs. M4U generation 1 HW is
+ Multimedia Memory Managememt Unit. This option enables remapping of
+ DMA memory accesses for the multimedia subsystem.
+
+ if unsure, say N here.
+
+config QCOM_IOMMU
+ # Note: iommu drivers cannot (yet?) be built as modules
+ bool "Qualcomm IOMMU Support"
+ depends on ARCH_QCOM || (COMPILE_TEST && !GENERIC_ATOMIC64)
+ select IOMMU_API
+ select IOMMU_IO_PGTABLE_LPAE
+ select ARM_DMA_USE_IOMMU
+ help
+ Support for IOMMU on certain Qualcomm SoCs.
+
+config HYPERV_IOMMU
+ bool "Hyper-V x2APIC IRQ Handling"
+ depends on HYPERV && X86
+ select IOMMU_API
+ default HYPERV
+ help
+ Stub IOMMU driver to handle IRQs as to allow Hyper-V Linux
+ guests to run with x2APIC mode enabled.
+
+config VIRTIO_IOMMU
+ tristate "Virtio IOMMU driver"
+ depends on VIRTIO
+ depends on ARM64
+ select IOMMU_API
+ select INTERVAL_TREE
+ help
+ Para-virtualised IOMMU driver with virtio.
+
+ Say Y here if you intend to run this kernel as a guest.
+
+endif # IOMMU_SUPPORT
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
new file mode 100644
index 000000000..11f177110
--- /dev/null
+++ b/drivers/iommu/Makefile
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y += amd/ intel/ arm/
+obj-$(CONFIG_IOMMU_API) += iommu.o
+obj-$(CONFIG_IOMMU_API) += iommu-traces.o
+obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
+obj-$(CONFIG_IOMMU_DEBUGFS) += iommu-debugfs.o
+obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
+obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
+obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
+obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
+obj-$(CONFIG_IOASID) += ioasid.o
+obj-$(CONFIG_IOMMU_IOVA) += iova.o
+obj-$(CONFIG_OF_IOMMU) += of_iommu.o
+obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o
+obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o
+obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o
+obj-$(CONFIG_MTK_IOMMU) += mtk_iommu.o
+obj-$(CONFIG_MTK_IOMMU_V1) += mtk_iommu_v1.o
+obj-$(CONFIG_OMAP_IOMMU) += omap-iommu.o
+obj-$(CONFIG_OMAP_IOMMU_DEBUG) += omap-iommu-debug.o
+obj-$(CONFIG_ROCKCHIP_IOMMU) += rockchip-iommu.o
+obj-$(CONFIG_SUN50I_IOMMU) += sun50i-iommu.o
+obj-$(CONFIG_TEGRA_IOMMU_GART) += tegra-gart.o
+obj-$(CONFIG_TEGRA_IOMMU_SMMU) += tegra-smmu.o
+obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o
+obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o
+obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
+obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o
+obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig
new file mode 100644
index 000000000..626b97d0d
--- /dev/null
+++ b/drivers/iommu/amd/Kconfig
@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# AMD IOMMU support
+config AMD_IOMMU
+ bool "AMD IOMMU support"
+ select SWIOTLB
+ select PCI_MSI
+ select PCI_ATS
+ select PCI_PRI
+ select PCI_PASID
+ select IOMMU_API
+ select IOMMU_IOVA
+ select IOMMU_DMA
+ depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE
+ help
+ With this option you can enable support for AMD IOMMU hardware in
+ your system. An IOMMU is a hardware component which provides
+ remapping of DMA memory accesses from devices. With an AMD IOMMU you
+ can isolate the DMA memory of different devices and protect the
+ system from misbehaving device drivers or hardware.
+
+ You can find out if your system has an AMD IOMMU if you look into
+ your BIOS for an option to enable it or if you have an IVRS ACPI
+ table.
+
+config AMD_IOMMU_V2
+ tristate "AMD IOMMU Version 2 driver"
+ depends on AMD_IOMMU
+ select MMU_NOTIFIER
+ help
+ This option enables support for the AMD IOMMUv2 features of the IOMMU
+ hardware. Select this option if you want to use devices that support
+ the PCI PRI and PASID interface.
+
+config AMD_IOMMU_DEBUGFS
+ bool "Enable AMD IOMMU internals in DebugFS"
+ depends on AMD_IOMMU && IOMMU_DEBUGFS
+ help
+ !!!WARNING!!! !!!WARNING!!! !!!WARNING!!! !!!WARNING!!!
+
+ DO NOT ENABLE THIS OPTION UNLESS YOU REALLY, -REALLY- KNOW WHAT YOU ARE DOING!!!
+ Exposes AMD IOMMU device internals in DebugFS.
+
+ This option is -NOT- intended for production environments, and should
+ not generally be enabled.
diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile
new file mode 100644
index 000000000..dc5a2fa4f
--- /dev/null
+++ b/drivers/iommu/amd/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_AMD_IOMMU) += iommu.o init.o quirks.o
+obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o
+obj-$(CONFIG_AMD_IOMMU_V2) += iommu_v2.o
diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
new file mode 100644
index 000000000..0c40d2240
--- /dev/null
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+#ifndef AMD_IOMMU_H
+#define AMD_IOMMU_H
+
+#include <linux/iommu.h>
+
+#include "amd_iommu_types.h"
+
+extern int amd_iommu_get_num_iommus(void);
+extern int amd_iommu_init_dma_ops(void);
+extern int amd_iommu_init_passthrough(void);
+extern irqreturn_t amd_iommu_int_thread(int irq, void *data);
+extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
+extern void amd_iommu_apply_erratum_63(u16 devid);
+extern void amd_iommu_restart_event_logging(struct amd_iommu *iommu);
+extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
+extern int amd_iommu_init_devices(void);
+extern void amd_iommu_uninit_devices(void);
+extern void amd_iommu_init_notifier(void);
+extern int amd_iommu_init_api(void);
+
+#ifdef CONFIG_AMD_IOMMU_DEBUGFS
+void amd_iommu_debugfs_setup(struct amd_iommu *iommu);
+#else
+static inline void amd_iommu_debugfs_setup(struct amd_iommu *iommu) {}
+#endif
+
+/* Needed for interrupt remapping */
+extern int amd_iommu_prepare(void);
+extern int amd_iommu_enable(void);
+extern void amd_iommu_disable(void);
+extern int amd_iommu_reenable(int);
+extern int amd_iommu_enable_faulting(void);
+extern int amd_iommu_guest_ir;
+
+/* IOMMUv2 specific functions */
+struct iommu_domain;
+
+extern bool amd_iommu_v2_supported(void);
+extern struct amd_iommu *get_amd_iommu(unsigned int idx);
+extern u8 amd_iommu_pc_get_max_banks(unsigned int idx);
+extern bool amd_iommu_pc_supported(void);
+extern u8 amd_iommu_pc_get_max_counters(unsigned int idx);
+extern int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
+ u8 fxn, u64 *value);
+extern int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
+ u8 fxn, u64 *value);
+
+extern int amd_iommu_register_ppr_notifier(struct notifier_block *nb);
+extern int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb);
+extern void amd_iommu_domain_direct_map(struct iommu_domain *dom);
+extern int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids);
+extern int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid,
+ u64 address);
+extern int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid);
+extern int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid,
+ unsigned long cr3);
+extern int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, u32 pasid);
+extern struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev);
+
+#ifdef CONFIG_IRQ_REMAP
+extern int amd_iommu_create_irq_domain(struct amd_iommu *iommu);
+#else
+static inline int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
+{
+ return 0;
+}
+#endif
+
+#define PPR_SUCCESS 0x0
+#define PPR_INVALID 0x1
+#define PPR_FAILURE 0xf
+
+extern int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid,
+ int status, int tag);
+
+static inline bool is_rd890_iommu(struct pci_dev *pdev)
+{
+ return (pdev->vendor == PCI_VENDOR_ID_ATI) &&
+ (pdev->device == PCI_DEVICE_ID_RD890_IOMMU);
+}
+
+static inline bool iommu_feature(struct amd_iommu *iommu, u64 mask)
+{
+ return !!(iommu->features & mask);
+}
+
+static inline u64 iommu_virt_to_phys(void *vaddr)
+{
+ return (u64)__sme_set(virt_to_phys(vaddr));
+}
+
+static inline void *iommu_phys_to_virt(unsigned long paddr)
+{
+ return phys_to_virt(__sme_clr(paddr));
+}
+
+extern bool translation_pre_enabled(struct amd_iommu *iommu);
+extern bool amd_iommu_is_attach_deferred(struct iommu_domain *domain,
+ struct device *dev);
+extern int __init add_special_device(u8 type, u8 id, u16 *devid,
+ bool cmd_line);
+
+#ifdef CONFIG_DMI
+void amd_iommu_apply_ivrs_quirks(void);
+#else
+static inline void amd_iommu_apply_ivrs_quirks(void) { }
+#endif
+
+#endif
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
new file mode 100644
index 000000000..4a8791e03
--- /dev/null
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -0,0 +1,920 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <jroedel@suse.de>
+ * Leo Duran <leo.duran@amd.com>
+ */
+
+#ifndef _ASM_X86_AMD_IOMMU_TYPES_H
+#define _ASM_X86_AMD_IOMMU_TYPES_H
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/msi.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/irqreturn.h>
+
+/*
+ * Maximum number of IOMMUs supported
+ */
+#define MAX_IOMMUS 32
+
+/*
+ * some size calculation constants
+ */
+#define DEV_TABLE_ENTRY_SIZE 32
+#define ALIAS_TABLE_ENTRY_SIZE 2
+#define RLOOKUP_TABLE_ENTRY_SIZE (sizeof(void *))
+
+/* Capability offsets used by the driver */
+#define MMIO_CAP_HDR_OFFSET 0x00
+#define MMIO_RANGE_OFFSET 0x0c
+#define MMIO_MISC_OFFSET 0x10
+
+/* Masks, shifts and macros to parse the device range capability */
+#define MMIO_RANGE_LD_MASK 0xff000000
+#define MMIO_RANGE_FD_MASK 0x00ff0000
+#define MMIO_RANGE_BUS_MASK 0x0000ff00
+#define MMIO_RANGE_LD_SHIFT 24
+#define MMIO_RANGE_FD_SHIFT 16
+#define MMIO_RANGE_BUS_SHIFT 8
+#define MMIO_GET_LD(x) (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT)
+#define MMIO_GET_FD(x) (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT)
+#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT)
+#define MMIO_MSI_NUM(x) ((x) & 0x1f)
+
+/* Flag masks for the AMD IOMMU exclusion range */
+#define MMIO_EXCL_ENABLE_MASK 0x01ULL
+#define MMIO_EXCL_ALLOW_MASK 0x02ULL
+
+/* Used offsets into the MMIO space */
+#define MMIO_DEV_TABLE_OFFSET 0x0000
+#define MMIO_CMD_BUF_OFFSET 0x0008
+#define MMIO_EVT_BUF_OFFSET 0x0010
+#define MMIO_CONTROL_OFFSET 0x0018
+#define MMIO_EXCL_BASE_OFFSET 0x0020
+#define MMIO_EXCL_LIMIT_OFFSET 0x0028
+#define MMIO_EXT_FEATURES 0x0030
+#define MMIO_PPR_LOG_OFFSET 0x0038
+#define MMIO_GA_LOG_BASE_OFFSET 0x00e0
+#define MMIO_GA_LOG_TAIL_OFFSET 0x00e8
+#define MMIO_MSI_ADDR_LO_OFFSET 0x015C
+#define MMIO_MSI_ADDR_HI_OFFSET 0x0160
+#define MMIO_MSI_DATA_OFFSET 0x0164
+#define MMIO_INTCAPXT_EVT_OFFSET 0x0170
+#define MMIO_INTCAPXT_PPR_OFFSET 0x0178
+#define MMIO_INTCAPXT_GALOG_OFFSET 0x0180
+#define MMIO_CMD_HEAD_OFFSET 0x2000
+#define MMIO_CMD_TAIL_OFFSET 0x2008
+#define MMIO_EVT_HEAD_OFFSET 0x2010
+#define MMIO_EVT_TAIL_OFFSET 0x2018
+#define MMIO_STATUS_OFFSET 0x2020
+#define MMIO_PPR_HEAD_OFFSET 0x2030
+#define MMIO_PPR_TAIL_OFFSET 0x2038
+#define MMIO_GA_HEAD_OFFSET 0x2040
+#define MMIO_GA_TAIL_OFFSET 0x2048
+#define MMIO_CNTR_CONF_OFFSET 0x4000
+#define MMIO_CNTR_REG_OFFSET 0x40000
+#define MMIO_REG_END_OFFSET 0x80000
+
+
+
+/* Extended Feature Bits */
+#define FEATURE_PREFETCH (1ULL<<0)
+#define FEATURE_PPR (1ULL<<1)
+#define FEATURE_X2APIC (1ULL<<2)
+#define FEATURE_NX (1ULL<<3)
+#define FEATURE_GT (1ULL<<4)
+#define FEATURE_IA (1ULL<<6)
+#define FEATURE_GA (1ULL<<7)
+#define FEATURE_HE (1ULL<<8)
+#define FEATURE_PC (1ULL<<9)
+#define FEATURE_GAM_VAPIC (1ULL<<21)
+#define FEATURE_EPHSUP (1ULL<<50)
+#define FEATURE_SNP (1ULL<<63)
+
+#define FEATURE_PASID_SHIFT 32
+#define FEATURE_PASID_MASK (0x1fULL << FEATURE_PASID_SHIFT)
+
+#define FEATURE_GLXVAL_SHIFT 14
+#define FEATURE_GLXVAL_MASK (0x03ULL << FEATURE_GLXVAL_SHIFT)
+
+/* Note:
+ * The current driver only support 16-bit PASID.
+ * Currently, hardware only implement upto 16-bit PASID
+ * even though the spec says it could have upto 20 bits.
+ */
+#define PASID_MASK 0x0000ffff
+
+/* MMIO status bits */
+#define MMIO_STATUS_EVT_OVERFLOW_INT_MASK (1 << 0)
+#define MMIO_STATUS_EVT_INT_MASK (1 << 1)
+#define MMIO_STATUS_COM_WAIT_INT_MASK (1 << 2)
+#define MMIO_STATUS_PPR_INT_MASK (1 << 6)
+#define MMIO_STATUS_GALOG_RUN_MASK (1 << 8)
+#define MMIO_STATUS_GALOG_OVERFLOW_MASK (1 << 9)
+#define MMIO_STATUS_GALOG_INT_MASK (1 << 10)
+
+/* event logging constants */
+#define EVENT_ENTRY_SIZE 0x10
+#define EVENT_TYPE_SHIFT 28
+#define EVENT_TYPE_MASK 0xf
+#define EVENT_TYPE_ILL_DEV 0x1
+#define EVENT_TYPE_IO_FAULT 0x2
+#define EVENT_TYPE_DEV_TAB_ERR 0x3
+#define EVENT_TYPE_PAGE_TAB_ERR 0x4
+#define EVENT_TYPE_ILL_CMD 0x5
+#define EVENT_TYPE_CMD_HARD_ERR 0x6
+#define EVENT_TYPE_IOTLB_INV_TO 0x7
+#define EVENT_TYPE_INV_DEV_REQ 0x8
+#define EVENT_TYPE_INV_PPR_REQ 0x9
+#define EVENT_TYPE_RMP_FAULT 0xd
+#define EVENT_TYPE_RMP_HW_ERR 0xe
+#define EVENT_DEVID_MASK 0xffff
+#define EVENT_DEVID_SHIFT 0
+#define EVENT_DOMID_MASK_LO 0xffff
+#define EVENT_DOMID_MASK_HI 0xf0000
+#define EVENT_FLAGS_MASK 0xfff
+#define EVENT_FLAGS_SHIFT 0x10
+
+/* feature control bits */
+#define CONTROL_IOMMU_EN 0x00ULL
+#define CONTROL_HT_TUN_EN 0x01ULL
+#define CONTROL_EVT_LOG_EN 0x02ULL
+#define CONTROL_EVT_INT_EN 0x03ULL
+#define CONTROL_COMWAIT_EN 0x04ULL
+#define CONTROL_INV_TIMEOUT 0x05ULL
+#define CONTROL_PASSPW_EN 0x08ULL
+#define CONTROL_RESPASSPW_EN 0x09ULL
+#define CONTROL_COHERENT_EN 0x0aULL
+#define CONTROL_ISOC_EN 0x0bULL
+#define CONTROL_CMDBUF_EN 0x0cULL
+#define CONTROL_PPRLOG_EN 0x0dULL
+#define CONTROL_PPRINT_EN 0x0eULL
+#define CONTROL_PPR_EN 0x0fULL
+#define CONTROL_GT_EN 0x10ULL
+#define CONTROL_GA_EN 0x11ULL
+#define CONTROL_GAM_EN 0x19ULL
+#define CONTROL_GALOG_EN 0x1CULL
+#define CONTROL_GAINT_EN 0x1DULL
+#define CONTROL_XT_EN 0x32ULL
+#define CONTROL_INTCAPXT_EN 0x33ULL
+
+#define CTRL_INV_TO_MASK (7 << CONTROL_INV_TIMEOUT)
+#define CTRL_INV_TO_NONE 0
+#define CTRL_INV_TO_1MS 1
+#define CTRL_INV_TO_10MS 2
+#define CTRL_INV_TO_100MS 3
+#define CTRL_INV_TO_1S 4
+#define CTRL_INV_TO_10S 5
+#define CTRL_INV_TO_100S 6
+
+/* command specific defines */
+#define CMD_COMPL_WAIT 0x01
+#define CMD_INV_DEV_ENTRY 0x02
+#define CMD_INV_IOMMU_PAGES 0x03
+#define CMD_INV_IOTLB_PAGES 0x04
+#define CMD_INV_IRT 0x05
+#define CMD_COMPLETE_PPR 0x07
+#define CMD_INV_ALL 0x08
+
+#define CMD_COMPL_WAIT_STORE_MASK 0x01
+#define CMD_COMPL_WAIT_INT_MASK 0x02
+#define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01
+#define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02
+#define CMD_INV_IOMMU_PAGES_GN_MASK 0x04
+
+#define PPR_STATUS_MASK 0xf
+#define PPR_STATUS_SHIFT 12
+
+#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL
+
+/* macros and definitions for device table entries */
+#define DEV_ENTRY_VALID 0x00
+#define DEV_ENTRY_TRANSLATION 0x01
+#define DEV_ENTRY_PPR 0x34
+#define DEV_ENTRY_IR 0x3d
+#define DEV_ENTRY_IW 0x3e
+#define DEV_ENTRY_NO_PAGE_FAULT 0x62
+#define DEV_ENTRY_EX 0x67
+#define DEV_ENTRY_SYSMGT1 0x68
+#define DEV_ENTRY_SYSMGT2 0x69
+#define DEV_ENTRY_IRQ_TBL_EN 0x80
+#define DEV_ENTRY_INIT_PASS 0xb8
+#define DEV_ENTRY_EINT_PASS 0xb9
+#define DEV_ENTRY_NMI_PASS 0xba
+#define DEV_ENTRY_LINT0_PASS 0xbe
+#define DEV_ENTRY_LINT1_PASS 0xbf
+#define DEV_ENTRY_MODE_MASK 0x07
+#define DEV_ENTRY_MODE_SHIFT 0x09
+
+#define MAX_DEV_TABLE_ENTRIES 0xffff
+
+/* constants to configure the command buffer */
+#define CMD_BUFFER_SIZE 8192
+#define CMD_BUFFER_UNINITIALIZED 1
+#define CMD_BUFFER_ENTRIES 512
+#define MMIO_CMD_SIZE_SHIFT 56
+#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
+
+/* constants for event buffer handling */
+#define EVT_BUFFER_SIZE 8192 /* 512 entries */
+#define EVT_LEN_MASK (0x9ULL << 56)
+
+/* Constants for PPR Log handling */
+#define PPR_LOG_ENTRIES 512
+#define PPR_LOG_SIZE_SHIFT 56
+#define PPR_LOG_SIZE_512 (0x9ULL << PPR_LOG_SIZE_SHIFT)
+#define PPR_ENTRY_SIZE 16
+#define PPR_LOG_SIZE (PPR_ENTRY_SIZE * PPR_LOG_ENTRIES)
+
+#define PPR_REQ_TYPE(x) (((x) >> 60) & 0xfULL)
+#define PPR_FLAGS(x) (((x) >> 48) & 0xfffULL)
+#define PPR_DEVID(x) ((x) & 0xffffULL)
+#define PPR_TAG(x) (((x) >> 32) & 0x3ffULL)
+#define PPR_PASID1(x) (((x) >> 16) & 0xffffULL)
+#define PPR_PASID2(x) (((x) >> 42) & 0xfULL)
+#define PPR_PASID(x) ((PPR_PASID2(x) << 16) | PPR_PASID1(x))
+
+#define PPR_REQ_FAULT 0x01
+
+/* Constants for GA Log handling */
+#define GA_LOG_ENTRIES 512
+#define GA_LOG_SIZE_SHIFT 56
+#define GA_LOG_SIZE_512 (0x8ULL << GA_LOG_SIZE_SHIFT)
+#define GA_ENTRY_SIZE 8
+#define GA_LOG_SIZE (GA_ENTRY_SIZE * GA_LOG_ENTRIES)
+
+#define GA_TAG(x) (u32)(x & 0xffffffffULL)
+#define GA_DEVID(x) (u16)(((x) >> 32) & 0xffffULL)
+#define GA_REQ_TYPE(x) (((x) >> 60) & 0xfULL)
+
+#define GA_GUEST_NR 0x1
+
+/* Bit value definition for dte irq remapping fields*/
+#define DTE_IRQ_PHYS_ADDR_MASK (((1ULL << 45)-1) << 6)
+#define DTE_IRQ_REMAP_INTCTL_MASK (0x3ULL << 60)
+#define DTE_IRQ_TABLE_LEN_MASK (0xfULL << 1)
+#define DTE_IRQ_REMAP_INTCTL (2ULL << 60)
+#define DTE_IRQ_TABLE_LEN (9ULL << 1)
+#define DTE_IRQ_REMAP_ENABLE 1ULL
+
+#define PAGE_MODE_NONE 0x00
+#define PAGE_MODE_1_LEVEL 0x01
+#define PAGE_MODE_2_LEVEL 0x02
+#define PAGE_MODE_3_LEVEL 0x03
+#define PAGE_MODE_4_LEVEL 0x04
+#define PAGE_MODE_5_LEVEL 0x05
+#define PAGE_MODE_6_LEVEL 0x06
+#define PAGE_MODE_7_LEVEL 0x07
+
+#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9))
+#define PM_LEVEL_SIZE(x) (((x) < 6) ? \
+ ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \
+ (0xffffffffffffffffULL))
+#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
+#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL)
+#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \
+ IOMMU_PTE_PR | IOMMU_PTE_IR | IOMMU_PTE_IW)
+#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL)
+
+#define PM_MAP_4k 0
+#define PM_ADDR_MASK 0x000ffffffffff000ULL
+#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \
+ (~((1ULL << (12 + ((lvl) * 9))) - 1)))
+#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr))
+
+/*
+ * Returns the page table level to use for a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_LEVEL(pagesize) \
+ ((__ffs(pagesize) - 12) / 9)
+/*
+ * Returns the number of ptes to use for a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_PTE_COUNT(pagesize) \
+ (1ULL << ((__ffs(pagesize) - 12) % 9))
+
+/*
+ * Aligns a given io-virtual address to a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_ALIGN(address, pagesize) \
+ ((address) & ~((pagesize) - 1))
+/*
+ * Creates an IOMMU PTE for an address and a given pagesize
+ * The PTE has no permission bits set
+ * Pagesize is expected to be a power-of-two larger than 4096
+ */
+#define PAGE_SIZE_PTE(address, pagesize) \
+ (((address) | ((pagesize) - 1)) & \
+ (~(pagesize >> 1)) & PM_ADDR_MASK)
+
+/*
+ * Takes a PTE value with mode=0x07 and returns the page size it maps
+ */
+#define PTE_PAGE_SIZE(pte) \
+ (1ULL << (1 + ffz(((pte) | 0xfffULL))))
+
+/*
+ * Takes a page-table level and returns the default page-size for this level
+ */
+#define PTE_LEVEL_PAGE_SIZE(level) \
+ (1ULL << (12 + (9 * (level))))
+
+/*
+ * Bit value definition for I/O PTE fields
+ */
+#define IOMMU_PTE_PR (1ULL << 0)
+#define IOMMU_PTE_U (1ULL << 59)
+#define IOMMU_PTE_FC (1ULL << 60)
+#define IOMMU_PTE_IR (1ULL << 61)
+#define IOMMU_PTE_IW (1ULL << 62)
+
+/*
+ * Bit value definition for DTE fields
+ */
+#define DTE_FLAG_V (1ULL << 0)
+#define DTE_FLAG_TV (1ULL << 1)
+#define DTE_FLAG_IR (1ULL << 61)
+#define DTE_FLAG_IW (1ULL << 62)
+
+#define DTE_FLAG_IOTLB (1ULL << 32)
+#define DTE_FLAG_GV (1ULL << 55)
+#define DTE_FLAG_MASK (0x3ffULL << 32)
+#define DTE_GLX_SHIFT (56)
+#define DTE_GLX_MASK (3)
+#define DEV_DOMID_MASK 0xffffULL
+
+#define DTE_GCR3_VAL_A(x) (((x) >> 12) & 0x00007ULL)
+#define DTE_GCR3_VAL_B(x) (((x) >> 15) & 0x0ffffULL)
+#define DTE_GCR3_VAL_C(x) (((x) >> 31) & 0x1fffffULL)
+
+#define DTE_GCR3_INDEX_A 0
+#define DTE_GCR3_INDEX_B 1
+#define DTE_GCR3_INDEX_C 1
+
+#define DTE_GCR3_SHIFT_A 58
+#define DTE_GCR3_SHIFT_B 16
+#define DTE_GCR3_SHIFT_C 43
+
+#define GCR3_VALID 0x01ULL
+
+#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
+#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR)
+#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
+#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
+
+#define IOMMU_PROT_MASK 0x03
+#define IOMMU_PROT_IR 0x01
+#define IOMMU_PROT_IW 0x02
+
+#define IOMMU_UNITY_MAP_FLAG_EXCL_RANGE (1 << 2)
+
+/* IOMMU capabilities */
+#define IOMMU_CAP_IOTLB 24
+#define IOMMU_CAP_NPCACHE 26
+#define IOMMU_CAP_EFR 27
+
+/* IOMMU IVINFO */
+#define IOMMU_IVINFO_OFFSET 36
+#define IOMMU_IVINFO_EFRSUP BIT(0)
+
+/* IOMMU Feature Reporting Field (for IVHD type 10h */
+#define IOMMU_FEAT_GASUP_SHIFT 6
+
+/* IOMMU Extended Feature Register (EFR) */
+#define IOMMU_EFR_XTSUP_SHIFT 2
+#define IOMMU_EFR_GASUP_SHIFT 7
+#define IOMMU_EFR_MSICAPMMIOSUP_SHIFT 46
+
+#define MAX_DOMAIN_ID 65536
+
+/* Protection domain flags */
+#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */
+#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops
+ domain for an IOMMU */
+#define PD_PASSTHROUGH_MASK (1UL << 2) /* domain has no page
+ translation */
+#define PD_IOMMUV2_MASK (1UL << 3) /* domain has gcr3 table */
+
+extern bool amd_iommu_dump;
+#define DUMP_printk(format, arg...) \
+ do { \
+ if (amd_iommu_dump) \
+ pr_info("AMD-Vi: " format, ## arg); \
+ } while(0);
+
+/* global flag if IOMMUs cache non-present entries */
+extern bool amd_iommu_np_cache;
+/* Only true if all IOMMUs support device IOTLBs */
+extern bool amd_iommu_iotlb_sup;
+
+/*
+ * AMD IOMMU hardware only support 512 IRTEs despite
+ * the architectural limitation of 2048 entries.
+ */
+#define MAX_IRQS_PER_TABLE 512
+#define IRQ_TABLE_ALIGNMENT 128
+
+struct irq_remap_table {
+ raw_spinlock_t lock;
+ unsigned min_index;
+ u32 *table;
+};
+
+extern struct irq_remap_table **irq_lookup_table;
+
+/* Interrupt remapping feature used? */
+extern bool amd_iommu_irq_remap;
+
+/* kmem_cache to get tables with 128 byte alignement */
+extern struct kmem_cache *amd_iommu_irq_cache;
+
+/*
+ * Make iterating over all IOMMUs easier
+ */
+#define for_each_iommu(iommu) \
+ list_for_each_entry((iommu), &amd_iommu_list, list)
+#define for_each_iommu_safe(iommu, next) \
+ list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
+
+#define APERTURE_RANGE_SHIFT 27 /* 128 MB */
+#define APERTURE_RANGE_SIZE (1ULL << APERTURE_RANGE_SHIFT)
+#define APERTURE_RANGE_PAGES (APERTURE_RANGE_SIZE >> PAGE_SHIFT)
+#define APERTURE_MAX_RANGES 32 /* allows 4GB of DMA address space */
+#define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT)
+#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL)
+
+/*
+ * This struct is used to pass information about
+ * incoming PPR faults around.
+ */
+struct amd_iommu_fault {
+ u64 address; /* IO virtual address of the fault*/
+ u32 pasid; /* Address space identifier */
+ u16 device_id; /* Originating PCI device id */
+ u16 tag; /* PPR tag */
+ u16 flags; /* Fault flags */
+
+};
+
+
+struct iommu_domain;
+struct irq_domain;
+struct amd_irte_ops;
+
+#define AMD_IOMMU_FLAG_TRANS_PRE_ENABLED (1 << 0)
+
+/*
+ * This structure contains generic data for IOMMU protection domains
+ * independent of their use.
+ */
+struct protection_domain {
+ struct list_head dev_list; /* List of all devices in this domain */
+ struct iommu_domain domain; /* generic domain handle used by
+ iommu core code */
+ spinlock_t lock; /* mostly used to lock the page table*/
+ u16 id; /* the domain id written to the device table */
+ atomic64_t pt_root; /* pgtable root and pgtable mode */
+ int glx; /* Number of levels for GCR3 table */
+ u64 *gcr3_tbl; /* Guest CR3 table */
+ unsigned long flags; /* flags to find out type of domain */
+ unsigned dev_cnt; /* devices assigned to this domain */
+ unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
+};
+
+/* For decocded pt_root */
+struct domain_pgtable {
+ int mode;
+ u64 *root;
+};
+
+/*
+ * Structure where we save information about one hardware AMD IOMMU in the
+ * system.
+ */
+struct amd_iommu {
+ struct list_head list;
+
+ /* Index within the IOMMU array */
+ int index;
+
+ /* locks the accesses to the hardware */
+ raw_spinlock_t lock;
+
+ /* Pointer to PCI device of this IOMMU */
+ struct pci_dev *dev;
+
+ /* Cache pdev to root device for resume quirks */
+ struct pci_dev *root_pdev;
+
+ /* physical address of MMIO space */
+ u64 mmio_phys;
+
+ /* physical end address of MMIO space */
+ u64 mmio_phys_end;
+
+ /* virtual address of MMIO space */
+ u8 __iomem *mmio_base;
+
+ /* capabilities of that IOMMU read from ACPI */
+ u32 cap;
+
+ /* flags read from acpi table */
+ u8 acpi_flags;
+
+ /* Extended features */
+ u64 features;
+
+ /* IOMMUv2 */
+ bool is_iommu_v2;
+
+ /* PCI device id of the IOMMU device */
+ u16 devid;
+
+ /*
+ * Capability pointer. There could be more than one IOMMU per PCI
+ * device function if there are more than one AMD IOMMU capability
+ * pointers.
+ */
+ u16 cap_ptr;
+
+ /* pci domain of this IOMMU */
+ u16 pci_seg;
+
+ /* start of exclusion range of that IOMMU */
+ u64 exclusion_start;
+ /* length of exclusion range of that IOMMU */
+ u64 exclusion_length;
+
+ /* command buffer virtual address */
+ u8 *cmd_buf;
+ u32 cmd_buf_head;
+ u32 cmd_buf_tail;
+
+ /* event buffer virtual address */
+ u8 *evt_buf;
+
+ /* Base of the PPR log, if present */
+ u8 *ppr_log;
+
+ /* Base of the GA log, if present */
+ u8 *ga_log;
+
+ /* Tail of the GA log, if present */
+ u8 *ga_log_tail;
+
+ /* true if interrupts for this IOMMU are already enabled */
+ bool int_enabled;
+
+ /* if one, we need to send a completion wait command */
+ bool need_sync;
+
+ /* Handle for IOMMU core code */
+ struct iommu_device iommu;
+
+ /*
+ * We can't rely on the BIOS to restore all values on reinit, so we
+ * need to stash them
+ */
+
+ /* The iommu BAR */
+ u32 stored_addr_lo;
+ u32 stored_addr_hi;
+
+ /*
+ * Each iommu has 6 l1s, each of which is documented as having 0x12
+ * registers
+ */
+ u32 stored_l1[6][0x12];
+
+ /* The l2 indirect registers */
+ u32 stored_l2[0x83];
+
+ /* The maximum PC banks and counters/bank (PCSup=1) */
+ u8 max_banks;
+ u8 max_counters;
+#ifdef CONFIG_IRQ_REMAP
+ struct irq_domain *ir_domain;
+ struct irq_domain *msi_domain;
+
+ struct amd_irte_ops *irte_ops;
+#endif
+
+ u32 flags;
+ volatile u64 *cmd_sem;
+ u64 cmd_sem_val;
+
+#ifdef CONFIG_AMD_IOMMU_DEBUGFS
+ /* DebugFS Info */
+ struct dentry *debugfs;
+#endif
+ /* IRQ notifier for IntCapXT interrupt */
+ struct irq_affinity_notify intcapxt_notify;
+};
+
+static inline struct amd_iommu *dev_to_amd_iommu(struct device *dev)
+{
+ struct iommu_device *iommu = dev_to_iommu_device(dev);
+
+ return container_of(iommu, struct amd_iommu, iommu);
+}
+
+#define ACPIHID_UID_LEN 256
+#define ACPIHID_HID_LEN 9
+
+struct acpihid_map_entry {
+ struct list_head list;
+ u8 uid[ACPIHID_UID_LEN];
+ u8 hid[ACPIHID_HID_LEN];
+ u16 devid;
+ u16 root_devid;
+ bool cmd_line;
+ struct iommu_group *group;
+};
+
+struct devid_map {
+ struct list_head list;
+ u8 id;
+ u16 devid;
+ bool cmd_line;
+};
+
+/*
+ * This struct contains device specific data for the IOMMU
+ */
+struct iommu_dev_data {
+ /*Protect against attach/detach races */
+ spinlock_t lock;
+
+ struct list_head list; /* For domain->dev_list */
+ struct llist_node dev_data_list; /* For global dev_data_list */
+ struct protection_domain *domain; /* Domain the device is bound to */
+ struct pci_dev *pdev;
+ u16 devid; /* PCI Device ID */
+ bool iommu_v2; /* Device can make use of IOMMUv2 */
+ struct {
+ bool enabled;
+ int qdep;
+ } ats; /* ATS state */
+ bool pri_tlp; /* PASID TLB required for
+ PPR completions */
+ u32 errata; /* Bitmap for errata to apply */
+ bool use_vapic; /* Enable device to use vapic mode */
+ bool defer_attach;
+
+ struct ratelimit_state rs; /* Ratelimit IOPF messages */
+};
+
+/* Map HPET and IOAPIC ids to the devid used by the IOMMU */
+extern struct list_head ioapic_map;
+extern struct list_head hpet_map;
+extern struct list_head acpihid_map;
+
+/*
+ * List with all IOMMUs in the system. This list is not locked because it is
+ * only written and read at driver initialization or suspend time
+ */
+extern struct list_head amd_iommu_list;
+
+/*
+ * Array with pointers to each IOMMU struct
+ * The indices are referenced in the protection domains
+ */
+extern struct amd_iommu *amd_iommus[MAX_IOMMUS];
+
+/*
+ * Structure defining one entry in the device table
+ */
+struct dev_table_entry {
+ u64 data[4];
+};
+
+/*
+ * One entry for unity mappings parsed out of the ACPI table.
+ */
+struct unity_map_entry {
+ struct list_head list;
+
+ /* starting device id this entry is used for (including) */
+ u16 devid_start;
+ /* end device id this entry is used for (including) */
+ u16 devid_end;
+
+ /* start address to unity map (including) */
+ u64 address_start;
+ /* end address to unity map (including) */
+ u64 address_end;
+
+ /* required protection */
+ int prot;
+};
+
+/*
+ * List of all unity mappings. It is not locked because as runtime it is only
+ * read. It is created at ACPI table parsing time.
+ */
+extern struct list_head amd_iommu_unity_map;
+
+/*
+ * Data structures for device handling
+ */
+
+/*
+ * Device table used by hardware. Read and write accesses by software are
+ * locked with the amd_iommu_pd_table lock.
+ */
+extern struct dev_table_entry *amd_iommu_dev_table;
+
+/*
+ * Alias table to find requestor ids to device ids. Not locked because only
+ * read on runtime.
+ */
+extern u16 *amd_iommu_alias_table;
+
+/*
+ * Reverse lookup table to find the IOMMU which translates a specific device.
+ */
+extern struct amd_iommu **amd_iommu_rlookup_table;
+
+/* size of the dma_ops aperture as power of 2 */
+extern unsigned amd_iommu_aperture_order;
+
+/* largest PCI device id we expect translation requests for */
+extern u16 amd_iommu_last_bdf;
+
+/* allocation bitmap for domain ids */
+extern unsigned long *amd_iommu_pd_alloc_bitmap;
+
+/*
+ * If true, the addresses will be flushed on unmap time, not when
+ * they are reused
+ */
+extern bool amd_iommu_unmap_flush;
+
+/* Smallest max PASID supported by any IOMMU in the system */
+extern u32 amd_iommu_max_pasid;
+
+extern bool amd_iommu_v2_present;
+
+extern bool amd_iommu_force_isolation;
+
+/* Max levels of glxval supported */
+extern int amd_iommu_max_glx_val;
+
+/*
+ * This function flushes all internal caches of
+ * the IOMMU used by this driver.
+ */
+extern void iommu_flush_all_caches(struct amd_iommu *iommu);
+
+static inline int get_ioapic_devid(int id)
+{
+ struct devid_map *entry;
+
+ list_for_each_entry(entry, &ioapic_map, list) {
+ if (entry->id == id)
+ return entry->devid;
+ }
+
+ return -EINVAL;
+}
+
+static inline int get_hpet_devid(int id)
+{
+ struct devid_map *entry;
+
+ list_for_each_entry(entry, &hpet_map, list) {
+ if (entry->id == id)
+ return entry->devid;
+ }
+
+ return -EINVAL;
+}
+
+enum amd_iommu_intr_mode_type {
+ AMD_IOMMU_GUEST_IR_LEGACY,
+
+ /* This mode is not visible to users. It is used when
+ * we cannot fully enable vAPIC and fallback to only support
+ * legacy interrupt remapping via 128-bit IRTE.
+ */
+ AMD_IOMMU_GUEST_IR_LEGACY_GA,
+ AMD_IOMMU_GUEST_IR_VAPIC,
+};
+
+#define AMD_IOMMU_GUEST_IR_GA(x) (x == AMD_IOMMU_GUEST_IR_VAPIC || \
+ x == AMD_IOMMU_GUEST_IR_LEGACY_GA)
+
+#define AMD_IOMMU_GUEST_IR_VAPIC(x) (x == AMD_IOMMU_GUEST_IR_VAPIC)
+
+union irte {
+ u32 val;
+ struct {
+ u32 valid : 1,
+ no_fault : 1,
+ int_type : 3,
+ rq_eoi : 1,
+ dm : 1,
+ rsvd_1 : 1,
+ destination : 8,
+ vector : 8,
+ rsvd_2 : 8;
+ } fields;
+};
+
+#define APICID_TO_IRTE_DEST_LO(x) (x & 0xffffff)
+#define APICID_TO_IRTE_DEST_HI(x) ((x >> 24) & 0xff)
+
+union irte_ga_lo {
+ u64 val;
+
+ /* For int remapping */
+ struct {
+ u64 valid : 1,
+ no_fault : 1,
+ /* ------ */
+ int_type : 3,
+ rq_eoi : 1,
+ dm : 1,
+ /* ------ */
+ guest_mode : 1,
+ destination : 24,
+ ga_tag : 32;
+ } fields_remap;
+
+ /* For guest vAPIC */
+ struct {
+ u64 valid : 1,
+ no_fault : 1,
+ /* ------ */
+ ga_log_intr : 1,
+ rsvd1 : 3,
+ is_run : 1,
+ /* ------ */
+ guest_mode : 1,
+ destination : 24,
+ ga_tag : 32;
+ } fields_vapic;
+};
+
+union irte_ga_hi {
+ u64 val;
+ struct {
+ u64 vector : 8,
+ rsvd_1 : 4,
+ ga_root_ptr : 40,
+ rsvd_2 : 4,
+ destination : 8;
+ } fields;
+};
+
+struct irte_ga {
+ union irte_ga_lo lo;
+ union irte_ga_hi hi;
+};
+
+struct irq_2_irte {
+ u16 devid; /* Device ID for IRTE table */
+ u16 index; /* Index into IRTE table*/
+};
+
+struct amd_ir_data {
+ u32 cached_ga_tag;
+ struct irq_2_irte irq_2_irte;
+ struct msi_msg msi_entry;
+ void *entry; /* Pointer to union irte or struct irte_ga */
+ void *ref; /* Pointer to the actual irte */
+
+ /**
+ * Store information for activate/de-activate
+ * Guest virtual APIC mode during runtime.
+ */
+ struct irq_cfg *cfg;
+ int ga_vector;
+ u64 ga_root_ptr;
+ u32 ga_tag;
+};
+
+struct amd_irte_ops {
+ void (*prepare)(void *, u32, u32, u8, u32, int);
+ void (*activate)(void *, u16, u16);
+ void (*deactivate)(void *, u16, u16);
+ void (*set_affinity)(void *, u16, u16, u8, u32);
+ void *(*get)(struct irq_remap_table *, int);
+ void (*set_allocated)(struct irq_remap_table *, int);
+ bool (*is_allocated)(struct irq_remap_table *, int);
+ void (*clear_allocated)(struct irq_remap_table *, int);
+};
+
+#ifdef CONFIG_IRQ_REMAP
+extern struct amd_irte_ops irte_32_ops;
+extern struct amd_irte_ops irte_128_ops;
+#endif
+
+#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/drivers/iommu/amd/debugfs.c b/drivers/iommu/amd/debugfs.c
new file mode 100644
index 000000000..545372fcc
--- /dev/null
+++ b/drivers/iommu/amd/debugfs.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD IOMMU driver
+ *
+ * Copyright (C) 2018 Advanced Micro Devices, Inc.
+ *
+ * Author: Gary R Hook <gary.hook@amd.com>
+ */
+
+#include <linux/debugfs.h>
+#include <linux/pci.h>
+
+#include "amd_iommu.h"
+
+static struct dentry *amd_iommu_debugfs;
+static DEFINE_MUTEX(amd_iommu_debugfs_lock);
+
+#define MAX_NAME_LEN 20
+
+void amd_iommu_debugfs_setup(struct amd_iommu *iommu)
+{
+ char name[MAX_NAME_LEN + 1];
+
+ mutex_lock(&amd_iommu_debugfs_lock);
+ if (!amd_iommu_debugfs)
+ amd_iommu_debugfs = debugfs_create_dir("amd",
+ iommu_debugfs_dir);
+ mutex_unlock(&amd_iommu_debugfs_lock);
+
+ snprintf(name, MAX_NAME_LEN, "iommu%02d", iommu->index);
+ iommu->debugfs = debugfs_create_dir(name, amd_iommu_debugfs);
+}
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
new file mode 100644
index 000000000..603f625a7
--- /dev/null
+++ b/drivers/iommu/amd/init.c
@@ -0,0 +1,3318 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <jroedel@suse.de>
+ * Leo Duran <leo.duran@amd.com>
+ */
+
+#define pr_fmt(fmt) "AMD-Vi: " fmt
+#define dev_fmt(fmt) pr_fmt(fmt)
+
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/list.h>
+#include <linux/bitmap.h>
+#include <linux/slab.h>
+#include <linux/syscore_ops.h>
+#include <linux/interrupt.h>
+#include <linux/msi.h>
+#include <linux/amd-iommu.h>
+#include <linux/export.h>
+#include <linux/kmemleak.h>
+#include <linux/mem_encrypt.h>
+#include <linux/iopoll.h>
+#include <asm/pci-direct.h>
+#include <asm/iommu.h>
+#include <asm/apic.h>
+#include <asm/msidef.h>
+#include <asm/gart.h>
+#include <asm/x86_init.h>
+#include <asm/iommu_table.h>
+#include <asm/io_apic.h>
+#include <asm/irq_remapping.h>
+#include <asm/set_memory.h>
+
+#include <linux/crash_dump.h>
+
+#include "amd_iommu.h"
+#include "../irq_remapping.h"
+
+/*
+ * definitions for the ACPI scanning code
+ */
+#define IVRS_HEADER_LENGTH 48
+
+#define ACPI_IVHD_TYPE_MAX_SUPPORTED 0x40
+#define ACPI_IVMD_TYPE_ALL 0x20
+#define ACPI_IVMD_TYPE 0x21
+#define ACPI_IVMD_TYPE_RANGE 0x22
+
+#define IVHD_DEV_ALL 0x01
+#define IVHD_DEV_SELECT 0x02
+#define IVHD_DEV_SELECT_RANGE_START 0x03
+#define IVHD_DEV_RANGE_END 0x04
+#define IVHD_DEV_ALIAS 0x42
+#define IVHD_DEV_ALIAS_RANGE 0x43
+#define IVHD_DEV_EXT_SELECT 0x46
+#define IVHD_DEV_EXT_SELECT_RANGE 0x47
+#define IVHD_DEV_SPECIAL 0x48
+#define IVHD_DEV_ACPI_HID 0xf0
+
+#define UID_NOT_PRESENT 0
+#define UID_IS_INTEGER 1
+#define UID_IS_CHARACTER 2
+
+#define IVHD_SPECIAL_IOAPIC 1
+#define IVHD_SPECIAL_HPET 2
+
+#define IVHD_FLAG_HT_TUN_EN_MASK 0x01
+#define IVHD_FLAG_PASSPW_EN_MASK 0x02
+#define IVHD_FLAG_RESPASSPW_EN_MASK 0x04
+#define IVHD_FLAG_ISOC_EN_MASK 0x08
+
+#define IVMD_FLAG_EXCL_RANGE 0x08
+#define IVMD_FLAG_IW 0x04
+#define IVMD_FLAG_IR 0x02
+#define IVMD_FLAG_UNITY_MAP 0x01
+
+#define ACPI_DEVFLAG_INITPASS 0x01
+#define ACPI_DEVFLAG_EXTINT 0x02
+#define ACPI_DEVFLAG_NMI 0x04
+#define ACPI_DEVFLAG_SYSMGT1 0x10
+#define ACPI_DEVFLAG_SYSMGT2 0x20
+#define ACPI_DEVFLAG_LINT0 0x40
+#define ACPI_DEVFLAG_LINT1 0x80
+#define ACPI_DEVFLAG_ATSDIS 0x10000000
+
+#define LOOP_TIMEOUT 2000000
+
+#define IVRS_GET_SBDF_ID(seg, bus, dev, fd) (((seg & 0xffff) << 16) | ((bus & 0xff) << 8) \
+ | ((dev & 0x1f) << 3) | (fn & 0x7))
+
+/*
+ * ACPI table definitions
+ *
+ * These data structures are laid over the table to parse the important values
+ * out of it.
+ */
+
+extern const struct iommu_ops amd_iommu_ops;
+
+/*
+ * structure describing one IOMMU in the ACPI table. Typically followed by one
+ * or more ivhd_entrys.
+ */
+struct ivhd_header {
+ u8 type;
+ u8 flags;
+ u16 length;
+ u16 devid;
+ u16 cap_ptr;
+ u64 mmio_phys;
+ u16 pci_seg;
+ u16 info;
+ u32 efr_attr;
+
+ /* Following only valid on IVHD type 11h and 40h */
+ u64 efr_reg; /* Exact copy of MMIO_EXT_FEATURES */
+ u64 res;
+} __attribute__((packed));
+
+/*
+ * A device entry describing which devices a specific IOMMU translates and
+ * which requestor ids they use.
+ */
+struct ivhd_entry {
+ u8 type;
+ u16 devid;
+ u8 flags;
+ u32 ext;
+ u32 hidh;
+ u64 cid;
+ u8 uidf;
+ u8 uidl;
+ u8 uid;
+} __attribute__((packed));
+
+/*
+ * An AMD IOMMU memory definition structure. It defines things like exclusion
+ * ranges for devices and regions that should be unity mapped.
+ */
+struct ivmd_header {
+ u8 type;
+ u8 flags;
+ u16 length;
+ u16 devid;
+ u16 aux;
+ u64 resv;
+ u64 range_start;
+ u64 range_length;
+} __attribute__((packed));
+
+bool amd_iommu_dump;
+bool amd_iommu_irq_remap __read_mostly;
+
+int amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_VAPIC;
+static int amd_iommu_xt_mode = IRQ_REMAP_XAPIC_MODE;
+
+static bool amd_iommu_detected;
+static bool __initdata amd_iommu_disabled;
+static int amd_iommu_target_ivhd_type;
+
+u16 amd_iommu_last_bdf; /* largest PCI device id we have
+ to handle */
+LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
+ we find in ACPI */
+bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
+
+LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
+ system */
+
+/* Array to assign indices to IOMMUs*/
+struct amd_iommu *amd_iommus[MAX_IOMMUS];
+
+/* Number of IOMMUs present in the system */
+static int amd_iommus_present;
+
+/* IOMMUs have a non-present cache? */
+bool amd_iommu_np_cache __read_mostly;
+bool amd_iommu_iotlb_sup __read_mostly = true;
+
+u32 amd_iommu_max_pasid __read_mostly = ~0;
+
+bool amd_iommu_v2_present __read_mostly;
+static bool amd_iommu_pc_present __read_mostly;
+
+bool amd_iommu_force_isolation __read_mostly;
+
+/*
+ * Pointer to the device table which is shared by all AMD IOMMUs
+ * it is indexed by the PCI device id or the HT unit id and contains
+ * information about the domain the device belongs to as well as the
+ * page table root pointer.
+ */
+struct dev_table_entry *amd_iommu_dev_table;
+/*
+ * Pointer to a device table which the content of old device table
+ * will be copied to. It's only be used in kdump kernel.
+ */
+static struct dev_table_entry *old_dev_tbl_cpy;
+
+/*
+ * The alias table is a driver specific data structure which contains the
+ * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
+ * More than one device can share the same requestor id.
+ */
+u16 *amd_iommu_alias_table;
+
+/*
+ * The rlookup table is used to find the IOMMU which is responsible
+ * for a specific device. It is also indexed by the PCI device id.
+ */
+struct amd_iommu **amd_iommu_rlookup_table;
+EXPORT_SYMBOL(amd_iommu_rlookup_table);
+
+/*
+ * This table is used to find the irq remapping table for a given device id
+ * quickly.
+ */
+struct irq_remap_table **irq_lookup_table;
+
+/*
+ * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap
+ * to know which ones are already in use.
+ */
+unsigned long *amd_iommu_pd_alloc_bitmap;
+
+static u32 dev_table_size; /* size of the device table */
+static u32 alias_table_size; /* size of the alias table */
+static u32 rlookup_table_size; /* size if the rlookup table */
+
+enum iommu_init_state {
+ IOMMU_START_STATE,
+ IOMMU_IVRS_DETECTED,
+ IOMMU_ACPI_FINISHED,
+ IOMMU_ENABLED,
+ IOMMU_PCI_INIT,
+ IOMMU_INTERRUPTS_EN,
+ IOMMU_DMA_OPS,
+ IOMMU_INITIALIZED,
+ IOMMU_NOT_FOUND,
+ IOMMU_INIT_ERROR,
+ IOMMU_CMDLINE_DISABLED,
+};
+
+/* Early ioapic and hpet maps from kernel command line */
+#define EARLY_MAP_SIZE 4
+static struct devid_map __initdata early_ioapic_map[EARLY_MAP_SIZE];
+static struct devid_map __initdata early_hpet_map[EARLY_MAP_SIZE];
+static struct acpihid_map_entry __initdata early_acpihid_map[EARLY_MAP_SIZE];
+
+static int __initdata early_ioapic_map_size;
+static int __initdata early_hpet_map_size;
+static int __initdata early_acpihid_map_size;
+
+static bool __initdata cmdline_maps;
+
+static enum iommu_init_state init_state = IOMMU_START_STATE;
+
+static int amd_iommu_enable_interrupts(void);
+static int __init iommu_go_to_state(enum iommu_init_state state);
+static void init_device_table_dma(void);
+
+static bool amd_iommu_pre_enabled = true;
+
+static u32 amd_iommu_ivinfo __initdata;
+
+bool translation_pre_enabled(struct amd_iommu *iommu)
+{
+ return (iommu->flags & AMD_IOMMU_FLAG_TRANS_PRE_ENABLED);
+}
+EXPORT_SYMBOL(translation_pre_enabled);
+
+static void clear_translation_pre_enabled(struct amd_iommu *iommu)
+{
+ iommu->flags &= ~AMD_IOMMU_FLAG_TRANS_PRE_ENABLED;
+}
+
+static void init_translation_status(struct amd_iommu *iommu)
+{
+ u64 ctrl;
+
+ ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+ if (ctrl & (1<<CONTROL_IOMMU_EN))
+ iommu->flags |= AMD_IOMMU_FLAG_TRANS_PRE_ENABLED;
+}
+
+static inline void update_last_devid(u16 devid)
+{
+ if (devid > amd_iommu_last_bdf)
+ amd_iommu_last_bdf = devid;
+}
+
+static inline unsigned long tbl_size(int entry_size)
+{
+ unsigned shift = PAGE_SHIFT +
+ get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
+
+ return 1UL << shift;
+}
+
+int amd_iommu_get_num_iommus(void)
+{
+ return amd_iommus_present;
+}
+
+#ifdef CONFIG_IRQ_REMAP
+static bool check_feature_on_all_iommus(u64 mask)
+{
+ bool ret = false;
+ struct amd_iommu *iommu;
+
+ for_each_iommu(iommu) {
+ ret = iommu_feature(iommu, mask);
+ if (!ret)
+ return false;
+ }
+
+ return true;
+}
+#endif
+
+/*
+ * For IVHD type 0x11/0x40, EFR is also available via IVHD.
+ * Default to IVHD EFR since it is available sooner
+ * (i.e. before PCI init).
+ */
+static void __init early_iommu_features_init(struct amd_iommu *iommu,
+ struct ivhd_header *h)
+{
+ if (amd_iommu_ivinfo & IOMMU_IVINFO_EFRSUP)
+ iommu->features = h->efr_reg;
+}
+
+/* Access to l1 and l2 indexed register spaces */
+
+static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
+{
+ u32 val;
+
+ pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+ pci_read_config_dword(iommu->dev, 0xfc, &val);
+ return val;
+}
+
+static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
+{
+ pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
+ pci_write_config_dword(iommu->dev, 0xfc, val);
+ pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+}
+
+static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
+{
+ u32 val;
+
+ pci_write_config_dword(iommu->dev, 0xf0, address);
+ pci_read_config_dword(iommu->dev, 0xf4, &val);
+ return val;
+}
+
+static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
+{
+ pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
+ pci_write_config_dword(iommu->dev, 0xf4, val);
+}
+
+/****************************************************************************
+ *
+ * AMD IOMMU MMIO register space handling functions
+ *
+ * These functions are used to program the IOMMU device registers in
+ * MMIO space required for that driver.
+ *
+ ****************************************************************************/
+
+/*
+ * This function set the exclusion range in the IOMMU. DMA accesses to the
+ * exclusion range are passed through untranslated
+ */
+static void iommu_set_exclusion_range(struct amd_iommu *iommu)
+{
+ u64 start = iommu->exclusion_start & PAGE_MASK;
+ u64 limit = (start + iommu->exclusion_length - 1) & PAGE_MASK;
+ u64 entry;
+
+ if (!iommu->exclusion_start)
+ return;
+
+ entry = start | MMIO_EXCL_ENABLE_MASK;
+ memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
+ &entry, sizeof(entry));
+
+ entry = limit;
+ memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
+ &entry, sizeof(entry));
+}
+
+static void iommu_set_cwwb_range(struct amd_iommu *iommu)
+{
+ u64 start = iommu_virt_to_phys((void *)iommu->cmd_sem);
+ u64 entry = start & PM_ADDR_MASK;
+
+ if (!iommu_feature(iommu, FEATURE_SNP))
+ return;
+
+ /* Note:
+ * Re-purpose Exclusion base/limit registers for Completion wait
+ * write-back base/limit.
+ */
+ memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
+ &entry, sizeof(entry));
+
+ /* Note:
+ * Default to 4 Kbytes, which can be specified by setting base
+ * address equal to the limit address.
+ */
+ memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
+ &entry, sizeof(entry));
+}
+
+/* Programs the physical address of the device table into the IOMMU hardware */
+static void iommu_set_device_table(struct amd_iommu *iommu)
+{
+ u64 entry;
+
+ BUG_ON(iommu->mmio_base == NULL);
+
+ entry = iommu_virt_to_phys(amd_iommu_dev_table);
+ entry |= (dev_table_size >> 12) - 1;
+ memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
+ &entry, sizeof(entry));
+}
+
+/* Generic functions to enable/disable certain features of the IOMMU. */
+static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
+{
+ u64 ctrl;
+
+ ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+ ctrl |= (1ULL << bit);
+ writeq(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+}
+
+static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
+{
+ u64 ctrl;
+
+ ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+ ctrl &= ~(1ULL << bit);
+ writeq(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+}
+
+static void iommu_set_inv_tlb_timeout(struct amd_iommu *iommu, int timeout)
+{
+ u64 ctrl;
+
+ ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+ ctrl &= ~CTRL_INV_TO_MASK;
+ ctrl |= (timeout << CONTROL_INV_TIMEOUT) & CTRL_INV_TO_MASK;
+ writeq(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+}
+
+/* Function to enable the hardware */
+static void iommu_enable(struct amd_iommu *iommu)
+{
+ iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
+}
+
+static void iommu_disable(struct amd_iommu *iommu)
+{
+ if (!iommu->mmio_base)
+ return;
+
+ /* Disable command buffer */
+ iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
+
+ /* Disable event logging and event interrupts */
+ iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
+ iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
+
+ /* Disable IOMMU GA_LOG */
+ iommu_feature_disable(iommu, CONTROL_GALOG_EN);
+ iommu_feature_disable(iommu, CONTROL_GAINT_EN);
+
+ /* Disable IOMMU hardware itself */
+ iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
+}
+
+/*
+ * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
+ * the system has one.
+ */
+static u8 __iomem * __init iommu_map_mmio_space(u64 address, u64 end)
+{
+ if (!request_mem_region(address, end, "amd_iommu")) {
+ pr_err("Can not reserve memory region %llx-%llx for mmio\n",
+ address, end);
+ pr_err("This is a BIOS bug. Please contact your hardware vendor\n");
+ return NULL;
+ }
+
+ return (u8 __iomem *)ioremap(address, end);
+}
+
+static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
+{
+ if (iommu->mmio_base)
+ iounmap(iommu->mmio_base);
+ release_mem_region(iommu->mmio_phys, iommu->mmio_phys_end);
+}
+
+static inline u32 get_ivhd_header_size(struct ivhd_header *h)
+{
+ u32 size = 0;
+
+ switch (h->type) {
+ case 0x10:
+ size = 24;
+ break;
+ case 0x11:
+ case 0x40:
+ size = 40;
+ break;
+ }
+ return size;
+}
+
+/****************************************************************************
+ *
+ * The functions below belong to the first pass of AMD IOMMU ACPI table
+ * parsing. In this pass we try to find out the highest device id this
+ * code has to handle. Upon this information the size of the shared data
+ * structures is determined later.
+ *
+ ****************************************************************************/
+
+/*
+ * This function calculates the length of a given IVHD entry
+ */
+static inline int ivhd_entry_length(u8 *ivhd)
+{
+ u32 type = ((struct ivhd_entry *)ivhd)->type;
+
+ if (type < 0x80) {
+ return 0x04 << (*ivhd >> 6);
+ } else if (type == IVHD_DEV_ACPI_HID) {
+ /* For ACPI_HID, offset 21 is uid len */
+ return *((u8 *)ivhd + 21) + 22;
+ }
+ return 0;
+}
+
+/*
+ * After reading the highest device id from the IOMMU PCI capability header
+ * this function looks if there is a higher device id defined in the ACPI table
+ */
+static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
+{
+ u8 *p = (void *)h, *end = (void *)h;
+ struct ivhd_entry *dev;
+
+ u32 ivhd_size = get_ivhd_header_size(h);
+
+ if (!ivhd_size) {
+ pr_err("Unsupported IVHD type %#x\n", h->type);
+ return -EINVAL;
+ }
+
+ p += ivhd_size;
+ end += h->length;
+
+ while (p < end) {
+ dev = (struct ivhd_entry *)p;
+ switch (dev->type) {
+ case IVHD_DEV_ALL:
+ /* Use maximum BDF value for DEV_ALL */
+ update_last_devid(0xffff);
+ break;
+ case IVHD_DEV_SELECT:
+ case IVHD_DEV_RANGE_END:
+ case IVHD_DEV_ALIAS:
+ case IVHD_DEV_EXT_SELECT:
+ /* all the above subfield types refer to device ids */
+ update_last_devid(dev->devid);
+ break;
+ default:
+ break;
+ }
+ p += ivhd_entry_length(p);
+ }
+
+ WARN_ON(p != end);
+
+ return 0;
+}
+
+static int __init check_ivrs_checksum(struct acpi_table_header *table)
+{
+ int i;
+ u8 checksum = 0, *p = (u8 *)table;
+
+ for (i = 0; i < table->length; ++i)
+ checksum += p[i];
+ if (checksum != 0) {
+ /* ACPI table corrupt */
+ pr_err(FW_BUG "IVRS invalid checksum\n");
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+/*
+ * Iterate over all IVHD entries in the ACPI table and find the highest device
+ * id which we need to handle. This is the first of three functions which parse
+ * the ACPI table. So we check the checksum here.
+ */
+static int __init find_last_devid_acpi(struct acpi_table_header *table)
+{
+ u8 *p = (u8 *)table, *end = (u8 *)table;
+ struct ivhd_header *h;
+
+ p += IVRS_HEADER_LENGTH;
+
+ end += table->length;
+ while (p < end) {
+ h = (struct ivhd_header *)p;
+ if (h->type == amd_iommu_target_ivhd_type) {
+ int ret = find_last_devid_from_ivhd(h);
+
+ if (ret)
+ return ret;
+ }
+ p += h->length;
+ }
+ WARN_ON(p != end);
+
+ return 0;
+}
+
+/****************************************************************************
+ *
+ * The following functions belong to the code path which parses the ACPI table
+ * the second time. In this ACPI parsing iteration we allocate IOMMU specific
+ * data structures, initialize the device/alias/rlookup table and also
+ * basically initialize the hardware.
+ *
+ ****************************************************************************/
+
+/*
+ * Allocates the command buffer. This buffer is per AMD IOMMU. We can
+ * write commands to that buffer later and the IOMMU will execute them
+ * asynchronously
+ */
+static int __init alloc_command_buffer(struct amd_iommu *iommu)
+{
+ iommu->cmd_buf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(CMD_BUFFER_SIZE));
+
+ return iommu->cmd_buf ? 0 : -ENOMEM;
+}
+
+/*
+ * This function restarts event logging in case the IOMMU experienced
+ * an event log buffer overflow.
+ */
+void amd_iommu_restart_event_logging(struct amd_iommu *iommu)
+{
+ iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
+ iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
+}
+
+/*
+ * This function resets the command buffer if the IOMMU stopped fetching
+ * commands from it.
+ */
+void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
+{
+ iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
+
+ writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+ writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+ iommu->cmd_buf_head = 0;
+ iommu->cmd_buf_tail = 0;
+
+ iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
+}
+
+/*
+ * This function writes the command buffer address to the hardware and
+ * enables it.
+ */
+static void iommu_enable_command_buffer(struct amd_iommu *iommu)
+{
+ u64 entry;
+
+ BUG_ON(iommu->cmd_buf == NULL);
+
+ entry = iommu_virt_to_phys(iommu->cmd_buf);
+ entry |= MMIO_CMD_SIZE_512;
+
+ memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
+ &entry, sizeof(entry));
+
+ amd_iommu_reset_cmd_buffer(iommu);
+}
+
+/*
+ * This function disables the command buffer
+ */
+static void iommu_disable_command_buffer(struct amd_iommu *iommu)
+{
+ iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
+}
+
+static void __init free_command_buffer(struct amd_iommu *iommu)
+{
+ free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE));
+}
+
+static void *__init iommu_alloc_4k_pages(struct amd_iommu *iommu,
+ gfp_t gfp, size_t size)
+{
+ int order = get_order(size);
+ void *buf = (void *)__get_free_pages(gfp, order);
+
+ if (buf &&
+ iommu_feature(iommu, FEATURE_SNP) &&
+ set_memory_4k((unsigned long)buf, (1 << order))) {
+ free_pages((unsigned long)buf, order);
+ buf = NULL;
+ }
+
+ return buf;
+}
+
+/* allocates the memory where the IOMMU will log its events to */
+static int __init alloc_event_buffer(struct amd_iommu *iommu)
+{
+ iommu->evt_buf = iommu_alloc_4k_pages(iommu, GFP_KERNEL | __GFP_ZERO,
+ EVT_BUFFER_SIZE);
+
+ return iommu->evt_buf ? 0 : -ENOMEM;
+}
+
+static void iommu_enable_event_buffer(struct amd_iommu *iommu)
+{
+ u64 entry;
+
+ BUG_ON(iommu->evt_buf == NULL);
+
+ entry = iommu_virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
+
+ memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
+ &entry, sizeof(entry));
+
+ /* set head and tail to zero manually */
+ writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+ writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
+
+ iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
+}
+
+/*
+ * This function disables the event log buffer
+ */
+static void iommu_disable_event_buffer(struct amd_iommu *iommu)
+{
+ iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
+}
+
+static void __init free_event_buffer(struct amd_iommu *iommu)
+{
+ free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
+}
+
+/* allocates the memory where the IOMMU will log its events to */
+static int __init alloc_ppr_log(struct amd_iommu *iommu)
+{
+ iommu->ppr_log = iommu_alloc_4k_pages(iommu, GFP_KERNEL | __GFP_ZERO,
+ PPR_LOG_SIZE);
+
+ return iommu->ppr_log ? 0 : -ENOMEM;
+}
+
+static void iommu_enable_ppr_log(struct amd_iommu *iommu)
+{
+ u64 entry;
+
+ if (iommu->ppr_log == NULL)
+ return;
+
+ entry = iommu_virt_to_phys(iommu->ppr_log) | PPR_LOG_SIZE_512;
+
+ memcpy_toio(iommu->mmio_base + MMIO_PPR_LOG_OFFSET,
+ &entry, sizeof(entry));
+
+ /* set head and tail to zero manually */
+ writel(0x00, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
+ writel(0x00, iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
+
+ iommu_feature_enable(iommu, CONTROL_PPRLOG_EN);
+ iommu_feature_enable(iommu, CONTROL_PPR_EN);
+}
+
+static void __init free_ppr_log(struct amd_iommu *iommu)
+{
+ free_pages((unsigned long)iommu->ppr_log, get_order(PPR_LOG_SIZE));
+}
+
+static void free_ga_log(struct amd_iommu *iommu)
+{
+#ifdef CONFIG_IRQ_REMAP
+ free_pages((unsigned long)iommu->ga_log, get_order(GA_LOG_SIZE));
+ free_pages((unsigned long)iommu->ga_log_tail, get_order(8));
+#endif
+}
+
+static int iommu_ga_log_enable(struct amd_iommu *iommu)
+{
+#ifdef CONFIG_IRQ_REMAP
+ u32 status, i;
+ u64 entry;
+
+ if (!iommu->ga_log)
+ return -EINVAL;
+
+ /* Check if already running */
+ status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+ if (WARN_ON(status & (MMIO_STATUS_GALOG_RUN_MASK)))
+ return 0;
+
+ entry = iommu_virt_to_phys(iommu->ga_log) | GA_LOG_SIZE_512;
+ memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_BASE_OFFSET,
+ &entry, sizeof(entry));
+ entry = (iommu_virt_to_phys(iommu->ga_log_tail) &
+ (BIT_ULL(52)-1)) & ~7ULL;
+ memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_TAIL_OFFSET,
+ &entry, sizeof(entry));
+ writel(0x00, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
+ writel(0x00, iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
+
+
+ iommu_feature_enable(iommu, CONTROL_GAINT_EN);
+ iommu_feature_enable(iommu, CONTROL_GALOG_EN);
+
+ for (i = 0; i < LOOP_TIMEOUT; ++i) {
+ status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+ if (status & (MMIO_STATUS_GALOG_RUN_MASK))
+ break;
+ udelay(10);
+ }
+
+ if (WARN_ON(i >= LOOP_TIMEOUT))
+ return -EINVAL;
+#endif /* CONFIG_IRQ_REMAP */
+ return 0;
+}
+
+static int iommu_init_ga_log(struct amd_iommu *iommu)
+{
+#ifdef CONFIG_IRQ_REMAP
+ if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+ return 0;
+
+ iommu->ga_log = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(GA_LOG_SIZE));
+ if (!iommu->ga_log)
+ goto err_out;
+
+ iommu->ga_log_tail = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(8));
+ if (!iommu->ga_log_tail)
+ goto err_out;
+
+ return 0;
+err_out:
+ free_ga_log(iommu);
+ return -EINVAL;
+#else
+ return 0;
+#endif /* CONFIG_IRQ_REMAP */
+}
+
+static int __init alloc_cwwb_sem(struct amd_iommu *iommu)
+{
+ iommu->cmd_sem = iommu_alloc_4k_pages(iommu, GFP_KERNEL | __GFP_ZERO, 1);
+
+ return iommu->cmd_sem ? 0 : -ENOMEM;
+}
+
+static void __init free_cwwb_sem(struct amd_iommu *iommu)
+{
+ if (iommu->cmd_sem)
+ free_page((unsigned long)iommu->cmd_sem);
+}
+
+static void iommu_enable_xt(struct amd_iommu *iommu)
+{
+#ifdef CONFIG_IRQ_REMAP
+ /*
+ * XT mode (32-bit APIC destination ID) requires
+ * GA mode (128-bit IRTE support) as a prerequisite.
+ */
+ if (AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir) &&
+ amd_iommu_xt_mode == IRQ_REMAP_X2APIC_MODE)
+ iommu_feature_enable(iommu, CONTROL_XT_EN);
+#endif /* CONFIG_IRQ_REMAP */
+}
+
+static void iommu_enable_gt(struct amd_iommu *iommu)
+{
+ if (!iommu_feature(iommu, FEATURE_GT))
+ return;
+
+ iommu_feature_enable(iommu, CONTROL_GT_EN);
+}
+
+/* sets a specific bit in the device table entry. */
+static void set_dev_entry_bit(u16 devid, u8 bit)
+{
+ int i = (bit >> 6) & 0x03;
+ int _bit = bit & 0x3f;
+
+ amd_iommu_dev_table[devid].data[i] |= (1UL << _bit);
+}
+
+static int get_dev_entry_bit(u16 devid, u8 bit)
+{
+ int i = (bit >> 6) & 0x03;
+ int _bit = bit & 0x3f;
+
+ return (amd_iommu_dev_table[devid].data[i] & (1UL << _bit)) >> _bit;
+}
+
+
+static bool copy_device_table(void)
+{
+ u64 int_ctl, int_tab_len, entry = 0, last_entry = 0;
+ struct dev_table_entry *old_devtb = NULL;
+ u32 lo, hi, devid, old_devtb_size;
+ phys_addr_t old_devtb_phys;
+ struct amd_iommu *iommu;
+ u16 dom_id, dte_v, irq_v;
+ gfp_t gfp_flag;
+ u64 tmp;
+
+ if (!amd_iommu_pre_enabled)
+ return false;
+
+ pr_warn("Translation is already enabled - trying to copy translation structures\n");
+ for_each_iommu(iommu) {
+ /* All IOMMUs should use the same device table with the same size */
+ lo = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET);
+ hi = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET + 4);
+ entry = (((u64) hi) << 32) + lo;
+ if (last_entry && last_entry != entry) {
+ pr_err("IOMMU:%d should use the same dev table as others!\n",
+ iommu->index);
+ return false;
+ }
+ last_entry = entry;
+
+ old_devtb_size = ((entry & ~PAGE_MASK) + 1) << 12;
+ if (old_devtb_size != dev_table_size) {
+ pr_err("The device table size of IOMMU:%d is not expected!\n",
+ iommu->index);
+ return false;
+ }
+ }
+
+ /*
+ * When SME is enabled in the first kernel, the entry includes the
+ * memory encryption mask(sme_me_mask), we must remove the memory
+ * encryption mask to obtain the true physical address in kdump kernel.
+ */
+ old_devtb_phys = __sme_clr(entry) & PAGE_MASK;
+
+ if (old_devtb_phys >= 0x100000000ULL) {
+ pr_err("The address of old device table is above 4G, not trustworthy!\n");
+ return false;
+ }
+ old_devtb = (sme_active() && is_kdump_kernel())
+ ? (__force void *)ioremap_encrypted(old_devtb_phys,
+ dev_table_size)
+ : memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+
+ if (!old_devtb)
+ return false;
+
+ gfp_flag = GFP_KERNEL | __GFP_ZERO | GFP_DMA32;
+ old_dev_tbl_cpy = (void *)__get_free_pages(gfp_flag,
+ get_order(dev_table_size));
+ if (old_dev_tbl_cpy == NULL) {
+ pr_err("Failed to allocate memory for copying old device table!\n");
+ return false;
+ }
+
+ for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
+ old_dev_tbl_cpy[devid] = old_devtb[devid];
+ dom_id = old_devtb[devid].data[1] & DEV_DOMID_MASK;
+ dte_v = old_devtb[devid].data[0] & DTE_FLAG_V;
+
+ if (dte_v && dom_id) {
+ old_dev_tbl_cpy[devid].data[0] = old_devtb[devid].data[0];
+ old_dev_tbl_cpy[devid].data[1] = old_devtb[devid].data[1];
+ __set_bit(dom_id, amd_iommu_pd_alloc_bitmap);
+ /* If gcr3 table existed, mask it out */
+ if (old_devtb[devid].data[0] & DTE_FLAG_GV) {
+ tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
+ tmp |= DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
+ old_dev_tbl_cpy[devid].data[1] &= ~tmp;
+ tmp = DTE_GCR3_VAL_A(~0ULL) << DTE_GCR3_SHIFT_A;
+ tmp |= DTE_FLAG_GV;
+ old_dev_tbl_cpy[devid].data[0] &= ~tmp;
+ }
+ }
+
+ irq_v = old_devtb[devid].data[2] & DTE_IRQ_REMAP_ENABLE;
+ int_ctl = old_devtb[devid].data[2] & DTE_IRQ_REMAP_INTCTL_MASK;
+ int_tab_len = old_devtb[devid].data[2] & DTE_IRQ_TABLE_LEN_MASK;
+ if (irq_v && (int_ctl || int_tab_len)) {
+ if ((int_ctl != DTE_IRQ_REMAP_INTCTL) ||
+ (int_tab_len != DTE_IRQ_TABLE_LEN)) {
+ pr_err("Wrong old irq remapping flag: %#x\n", devid);
+ return false;
+ }
+
+ old_dev_tbl_cpy[devid].data[2] = old_devtb[devid].data[2];
+ }
+ }
+ memunmap(old_devtb);
+
+ return true;
+}
+
+void amd_iommu_apply_erratum_63(u16 devid)
+{
+ int sysmgt;
+
+ sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
+ (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
+
+ if (sysmgt == 0x01)
+ set_dev_entry_bit(devid, DEV_ENTRY_IW);
+}
+
+/* Writes the specific IOMMU for a device into the rlookup table */
+static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
+{
+ amd_iommu_rlookup_table[devid] = iommu;
+}
+
+/*
+ * This function takes the device specific flags read from the ACPI
+ * table and sets up the device table entry with that information
+ */
+static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
+ u16 devid, u32 flags, u32 ext_flags)
+{
+ if (flags & ACPI_DEVFLAG_INITPASS)
+ set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
+ if (flags & ACPI_DEVFLAG_EXTINT)
+ set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
+ if (flags & ACPI_DEVFLAG_NMI)
+ set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
+ if (flags & ACPI_DEVFLAG_SYSMGT1)
+ set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
+ if (flags & ACPI_DEVFLAG_SYSMGT2)
+ set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
+ if (flags & ACPI_DEVFLAG_LINT0)
+ set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
+ if (flags & ACPI_DEVFLAG_LINT1)
+ set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
+
+ amd_iommu_apply_erratum_63(devid);
+
+ set_iommu_for_device(iommu, devid);
+}
+
+int __init add_special_device(u8 type, u8 id, u16 *devid, bool cmd_line)
+{
+ struct devid_map *entry;
+ struct list_head *list;
+
+ if (type == IVHD_SPECIAL_IOAPIC)
+ list = &ioapic_map;
+ else if (type == IVHD_SPECIAL_HPET)
+ list = &hpet_map;
+ else
+ return -EINVAL;
+
+ list_for_each_entry(entry, list, list) {
+ if (!(entry->id == id && entry->cmd_line))
+ continue;
+
+ pr_info("Command-line override present for %s id %d - ignoring\n",
+ type == IVHD_SPECIAL_IOAPIC ? "IOAPIC" : "HPET", id);
+
+ *devid = entry->devid;
+
+ return 0;
+ }
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->id = id;
+ entry->devid = *devid;
+ entry->cmd_line = cmd_line;
+
+ list_add_tail(&entry->list, list);
+
+ return 0;
+}
+
+static int __init add_acpi_hid_device(u8 *hid, u8 *uid, u16 *devid,
+ bool cmd_line)
+{
+ struct acpihid_map_entry *entry;
+ struct list_head *list = &acpihid_map;
+
+ list_for_each_entry(entry, list, list) {
+ if (strcmp(entry->hid, hid) ||
+ (*uid && *entry->uid && strcmp(entry->uid, uid)) ||
+ !entry->cmd_line)
+ continue;
+
+ pr_info("Command-line override for hid:%s uid:%s\n",
+ hid, uid);
+ *devid = entry->devid;
+ return 0;
+ }
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ memcpy(entry->uid, uid, strlen(uid));
+ memcpy(entry->hid, hid, strlen(hid));
+ entry->devid = *devid;
+ entry->cmd_line = cmd_line;
+ entry->root_devid = (entry->devid & (~0x7));
+
+ pr_info("%s, add hid:%s, uid:%s, rdevid:%d\n",
+ entry->cmd_line ? "cmd" : "ivrs",
+ entry->hid, entry->uid, entry->root_devid);
+
+ list_add_tail(&entry->list, list);
+ return 0;
+}
+
+static int __init add_early_maps(void)
+{
+ int i, ret;
+
+ for (i = 0; i < early_ioapic_map_size; ++i) {
+ ret = add_special_device(IVHD_SPECIAL_IOAPIC,
+ early_ioapic_map[i].id,
+ &early_ioapic_map[i].devid,
+ early_ioapic_map[i].cmd_line);
+ if (ret)
+ return ret;
+ }
+
+ for (i = 0; i < early_hpet_map_size; ++i) {
+ ret = add_special_device(IVHD_SPECIAL_HPET,
+ early_hpet_map[i].id,
+ &early_hpet_map[i].devid,
+ early_hpet_map[i].cmd_line);
+ if (ret)
+ return ret;
+ }
+
+ for (i = 0; i < early_acpihid_map_size; ++i) {
+ ret = add_acpi_hid_device(early_acpihid_map[i].hid,
+ early_acpihid_map[i].uid,
+ &early_acpihid_map[i].devid,
+ early_acpihid_map[i].cmd_line);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * Takes a pointer to an AMD IOMMU entry in the ACPI table and
+ * initializes the hardware and our data structures with it.
+ */
+static int __init init_iommu_from_acpi(struct amd_iommu *iommu,
+ struct ivhd_header *h)
+{
+ u8 *p = (u8 *)h;
+ u8 *end = p, flags = 0;
+ u16 devid = 0, devid_start = 0, devid_to = 0;
+ u32 dev_i, ext_flags = 0;
+ bool alias = false;
+ struct ivhd_entry *e;
+ u32 ivhd_size;
+ int ret;
+
+
+ ret = add_early_maps();
+ if (ret)
+ return ret;
+
+ amd_iommu_apply_ivrs_quirks();
+
+ /*
+ * First save the recommended feature enable bits from ACPI
+ */
+ iommu->acpi_flags = h->flags;
+
+ /*
+ * Done. Now parse the device entries
+ */
+ ivhd_size = get_ivhd_header_size(h);
+ if (!ivhd_size) {
+ pr_err("Unsupported IVHD type %#x\n", h->type);
+ return -EINVAL;
+ }
+
+ p += ivhd_size;
+
+ end += h->length;
+
+
+ while (p < end) {
+ e = (struct ivhd_entry *)p;
+ switch (e->type) {
+ case IVHD_DEV_ALL:
+
+ DUMP_printk(" DEV_ALL\t\t\tflags: %02x\n", e->flags);
+
+ for (dev_i = 0; dev_i <= amd_iommu_last_bdf; ++dev_i)
+ set_dev_entry_from_acpi(iommu, dev_i, e->flags, 0);
+ break;
+ case IVHD_DEV_SELECT:
+
+ DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x "
+ "flags: %02x\n",
+ PCI_BUS_NUM(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags);
+
+ devid = e->devid;
+ set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
+ break;
+ case IVHD_DEV_SELECT_RANGE_START:
+
+ DUMP_printk(" DEV_SELECT_RANGE_START\t "
+ "devid: %02x:%02x.%x flags: %02x\n",
+ PCI_BUS_NUM(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags);
+
+ devid_start = e->devid;
+ flags = e->flags;
+ ext_flags = 0;
+ alias = false;
+ break;
+ case IVHD_DEV_ALIAS:
+
+ DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
+ "flags: %02x devid_to: %02x:%02x.%x\n",
+ PCI_BUS_NUM(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags,
+ PCI_BUS_NUM(e->ext >> 8),
+ PCI_SLOT(e->ext >> 8),
+ PCI_FUNC(e->ext >> 8));
+
+ devid = e->devid;
+ devid_to = e->ext >> 8;
+ set_dev_entry_from_acpi(iommu, devid , e->flags, 0);
+ set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
+ amd_iommu_alias_table[devid] = devid_to;
+ break;
+ case IVHD_DEV_ALIAS_RANGE:
+
+ DUMP_printk(" DEV_ALIAS_RANGE\t\t "
+ "devid: %02x:%02x.%x flags: %02x "
+ "devid_to: %02x:%02x.%x\n",
+ PCI_BUS_NUM(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags,
+ PCI_BUS_NUM(e->ext >> 8),
+ PCI_SLOT(e->ext >> 8),
+ PCI_FUNC(e->ext >> 8));
+
+ devid_start = e->devid;
+ flags = e->flags;
+ devid_to = e->ext >> 8;
+ ext_flags = 0;
+ alias = true;
+ break;
+ case IVHD_DEV_EXT_SELECT:
+
+ DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
+ "flags: %02x ext: %08x\n",
+ PCI_BUS_NUM(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags, e->ext);
+
+ devid = e->devid;
+ set_dev_entry_from_acpi(iommu, devid, e->flags,
+ e->ext);
+ break;
+ case IVHD_DEV_EXT_SELECT_RANGE:
+
+ DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
+ "%02x:%02x.%x flags: %02x ext: %08x\n",
+ PCI_BUS_NUM(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags, e->ext);
+
+ devid_start = e->devid;
+ flags = e->flags;
+ ext_flags = e->ext;
+ alias = false;
+ break;
+ case IVHD_DEV_RANGE_END:
+
+ DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
+ PCI_BUS_NUM(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid));
+
+ devid = e->devid;
+ for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
+ if (alias) {
+ amd_iommu_alias_table[dev_i] = devid_to;
+ set_dev_entry_from_acpi(iommu,
+ devid_to, flags, ext_flags);
+ }
+ set_dev_entry_from_acpi(iommu, dev_i,
+ flags, ext_flags);
+ }
+ break;
+ case IVHD_DEV_SPECIAL: {
+ u8 handle, type;
+ const char *var;
+ u16 devid;
+ int ret;
+
+ handle = e->ext & 0xff;
+ devid = (e->ext >> 8) & 0xffff;
+ type = (e->ext >> 24) & 0xff;
+
+ if (type == IVHD_SPECIAL_IOAPIC)
+ var = "IOAPIC";
+ else if (type == IVHD_SPECIAL_HPET)
+ var = "HPET";
+ else
+ var = "UNKNOWN";
+
+ DUMP_printk(" DEV_SPECIAL(%s[%d])\t\tdevid: %02x:%02x.%x\n",
+ var, (int)handle,
+ PCI_BUS_NUM(devid),
+ PCI_SLOT(devid),
+ PCI_FUNC(devid));
+
+ ret = add_special_device(type, handle, &devid, false);
+ if (ret)
+ return ret;
+
+ /*
+ * add_special_device might update the devid in case a
+ * command-line override is present. So call
+ * set_dev_entry_from_acpi after add_special_device.
+ */
+ set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
+
+ break;
+ }
+ case IVHD_DEV_ACPI_HID: {
+ u16 devid;
+ u8 hid[ACPIHID_HID_LEN];
+ u8 uid[ACPIHID_UID_LEN];
+ int ret;
+
+ if (h->type != 0x40) {
+ pr_err(FW_BUG "Invalid IVHD device type %#x\n",
+ e->type);
+ break;
+ }
+
+ memcpy(hid, (u8 *)(&e->ext), ACPIHID_HID_LEN - 1);
+ hid[ACPIHID_HID_LEN - 1] = '\0';
+
+ if (!(*hid)) {
+ pr_err(FW_BUG "Invalid HID.\n");
+ break;
+ }
+
+ uid[0] = '\0';
+ switch (e->uidf) {
+ case UID_NOT_PRESENT:
+
+ if (e->uidl != 0)
+ pr_warn(FW_BUG "Invalid UID length.\n");
+
+ break;
+ case UID_IS_INTEGER:
+
+ sprintf(uid, "%d", e->uid);
+
+ break;
+ case UID_IS_CHARACTER:
+
+ memcpy(uid, &e->uid, e->uidl);
+ uid[e->uidl] = '\0';
+
+ break;
+ default:
+ break;
+ }
+
+ devid = e->devid;
+ DUMP_printk(" DEV_ACPI_HID(%s[%s])\t\tdevid: %02x:%02x.%x\n",
+ hid, uid,
+ PCI_BUS_NUM(devid),
+ PCI_SLOT(devid),
+ PCI_FUNC(devid));
+
+ flags = e->flags;
+
+ ret = add_acpi_hid_device(hid, uid, &devid, false);
+ if (ret)
+ return ret;
+
+ /*
+ * add_special_device might update the devid in case a
+ * command-line override is present. So call
+ * set_dev_entry_from_acpi after add_special_device.
+ */
+ set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
+
+ break;
+ }
+ default:
+ break;
+ }
+
+ p += ivhd_entry_length(p);
+ }
+
+ return 0;
+}
+
+static void __init free_iommu_one(struct amd_iommu *iommu)
+{
+ free_cwwb_sem(iommu);
+ free_command_buffer(iommu);
+ free_event_buffer(iommu);
+ free_ppr_log(iommu);
+ free_ga_log(iommu);
+ iommu_unmap_mmio_space(iommu);
+}
+
+static void __init free_iommu_all(void)
+{
+ struct amd_iommu *iommu, *next;
+
+ for_each_iommu_safe(iommu, next) {
+ list_del(&iommu->list);
+ free_iommu_one(iommu);
+ kfree(iommu);
+ }
+}
+
+/*
+ * Family15h Model 10h-1fh erratum 746 (IOMMU Logging May Stall Translations)
+ * Workaround:
+ * BIOS should disable L2B micellaneous clock gating by setting
+ * L2_L2B_CK_GATE_CONTROL[CKGateL2BMiscDisable](D0F2xF4_x90[2]) = 1b
+ */
+static void amd_iommu_erratum_746_workaround(struct amd_iommu *iommu)
+{
+ u32 value;
+
+ if ((boot_cpu_data.x86 != 0x15) ||
+ (boot_cpu_data.x86_model < 0x10) ||
+ (boot_cpu_data.x86_model > 0x1f))
+ return;
+
+ pci_write_config_dword(iommu->dev, 0xf0, 0x90);
+ pci_read_config_dword(iommu->dev, 0xf4, &value);
+
+ if (value & BIT(2))
+ return;
+
+ /* Select NB indirect register 0x90 and enable writing */
+ pci_write_config_dword(iommu->dev, 0xf0, 0x90 | (1 << 8));
+
+ pci_write_config_dword(iommu->dev, 0xf4, value | 0x4);
+ pci_info(iommu->dev, "Applying erratum 746 workaround\n");
+
+ /* Clear the enable writing bit */
+ pci_write_config_dword(iommu->dev, 0xf0, 0x90);
+}
+
+/*
+ * Family15h Model 30h-3fh (IOMMU Mishandles ATS Write Permission)
+ * Workaround:
+ * BIOS should enable ATS write permission check by setting
+ * L2_DEBUG_3[AtsIgnoreIWDis](D0F2xF4_x47[0]) = 1b
+ */
+static void amd_iommu_ats_write_check_workaround(struct amd_iommu *iommu)
+{
+ u32 value;
+
+ if ((boot_cpu_data.x86 != 0x15) ||
+ (boot_cpu_data.x86_model < 0x30) ||
+ (boot_cpu_data.x86_model > 0x3f))
+ return;
+
+ /* Test L2_DEBUG_3[AtsIgnoreIWDis] == 1 */
+ value = iommu_read_l2(iommu, 0x47);
+
+ if (value & BIT(0))
+ return;
+
+ /* Set L2_DEBUG_3[AtsIgnoreIWDis] = 1 */
+ iommu_write_l2(iommu, 0x47, value | BIT(0));
+
+ pci_info(iommu->dev, "Applying ATS write check workaround\n");
+}
+
+/*
+ * This function clues the initialization function for one IOMMU
+ * together and also allocates the command buffer and programs the
+ * hardware. It does NOT enable the IOMMU. This is done afterwards.
+ */
+static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
+{
+ int ret;
+
+ raw_spin_lock_init(&iommu->lock);
+ iommu->cmd_sem_val = 0;
+
+ /* Add IOMMU to internal data structures */
+ list_add_tail(&iommu->list, &amd_iommu_list);
+ iommu->index = amd_iommus_present++;
+
+ if (unlikely(iommu->index >= MAX_IOMMUS)) {
+ WARN(1, "System has more IOMMUs than supported by this driver\n");
+ return -ENOSYS;
+ }
+
+ /* Index is fine - add IOMMU to the array */
+ amd_iommus[iommu->index] = iommu;
+
+ /*
+ * Copy data from ACPI table entry to the iommu struct
+ */
+ iommu->devid = h->devid;
+ iommu->cap_ptr = h->cap_ptr;
+ iommu->pci_seg = h->pci_seg;
+ iommu->mmio_phys = h->mmio_phys;
+
+ switch (h->type) {
+ case 0x10:
+ /* Check if IVHD EFR contains proper max banks/counters */
+ if ((h->efr_attr != 0) &&
+ ((h->efr_attr & (0xF << 13)) != 0) &&
+ ((h->efr_attr & (0x3F << 17)) != 0))
+ iommu->mmio_phys_end = MMIO_REG_END_OFFSET;
+ else
+ iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET;
+
+ /*
+ * Note: GA (128-bit IRTE) mode requires cmpxchg16b supports.
+ * GAM also requires GA mode. Therefore, we need to
+ * check cmpxchg16b support before enabling it.
+ */
+ if (!boot_cpu_has(X86_FEATURE_CX16) ||
+ ((h->efr_attr & (0x1 << IOMMU_FEAT_GASUP_SHIFT)) == 0))
+ amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY;
+ break;
+ case 0x11:
+ case 0x40:
+ if (h->efr_reg & (1 << 9))
+ iommu->mmio_phys_end = MMIO_REG_END_OFFSET;
+ else
+ iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET;
+
+ /*
+ * Note: GA (128-bit IRTE) mode requires cmpxchg16b supports.
+ * XT, GAM also requires GA mode. Therefore, we need to
+ * check cmpxchg16b support before enabling them.
+ */
+ if (!boot_cpu_has(X86_FEATURE_CX16) ||
+ ((h->efr_reg & (0x1 << IOMMU_EFR_GASUP_SHIFT)) == 0)) {
+ amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY;
+ break;
+ }
+
+ /*
+ * Note: Since iommu_update_intcapxt() leverages
+ * the IOMMU MMIO access to MSI capability block registers
+ * for MSI address lo/hi/data, we need to check both
+ * EFR[XtSup] and EFR[MsiCapMmioSup] for x2APIC support.
+ */
+ if ((h->efr_reg & BIT(IOMMU_EFR_XTSUP_SHIFT)) &&
+ (h->efr_reg & BIT(IOMMU_EFR_MSICAPMMIOSUP_SHIFT)))
+ amd_iommu_xt_mode = IRQ_REMAP_X2APIC_MODE;
+
+ early_iommu_features_init(iommu, h);
+
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ iommu->mmio_base = iommu_map_mmio_space(iommu->mmio_phys,
+ iommu->mmio_phys_end);
+ if (!iommu->mmio_base)
+ return -ENOMEM;
+
+ if (alloc_cwwb_sem(iommu))
+ return -ENOMEM;
+
+ if (alloc_command_buffer(iommu))
+ return -ENOMEM;
+
+ if (alloc_event_buffer(iommu))
+ return -ENOMEM;
+
+ iommu->int_enabled = false;
+
+ init_translation_status(iommu);
+ if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
+ iommu_disable(iommu);
+ clear_translation_pre_enabled(iommu);
+ pr_warn("Translation was enabled for IOMMU:%d but we are not in kdump mode\n",
+ iommu->index);
+ }
+ if (amd_iommu_pre_enabled)
+ amd_iommu_pre_enabled = translation_pre_enabled(iommu);
+
+ ret = init_iommu_from_acpi(iommu, h);
+ if (ret)
+ return ret;
+
+ ret = amd_iommu_create_irq_domain(iommu);
+ if (ret)
+ return ret;
+
+ /*
+ * Make sure IOMMU is not considered to translate itself. The IVRS
+ * table tells us so, but this is a lie!
+ */
+ amd_iommu_rlookup_table[iommu->devid] = NULL;
+
+ return 0;
+}
+
+/**
+ * get_highest_supported_ivhd_type - Look up the appropriate IVHD type
+ * @ivrs: Pointer to the IVRS header
+ *
+ * This function search through all IVDB of the maximum supported IVHD
+ */
+static u8 get_highest_supported_ivhd_type(struct acpi_table_header *ivrs)
+{
+ u8 *base = (u8 *)ivrs;
+ struct ivhd_header *ivhd = (struct ivhd_header *)
+ (base + IVRS_HEADER_LENGTH);
+ u8 last_type = ivhd->type;
+ u16 devid = ivhd->devid;
+
+ while (((u8 *)ivhd - base < ivrs->length) &&
+ (ivhd->type <= ACPI_IVHD_TYPE_MAX_SUPPORTED)) {
+ u8 *p = (u8 *) ivhd;
+
+ if (ivhd->devid == devid)
+ last_type = ivhd->type;
+ ivhd = (struct ivhd_header *)(p + ivhd->length);
+ }
+
+ return last_type;
+}
+
+/*
+ * Iterates over all IOMMU entries in the ACPI table, allocates the
+ * IOMMU structure and initializes it with init_iommu_one()
+ */
+static int __init init_iommu_all(struct acpi_table_header *table)
+{
+ u8 *p = (u8 *)table, *end = (u8 *)table;
+ struct ivhd_header *h;
+ struct amd_iommu *iommu;
+ int ret;
+
+ end += table->length;
+ p += IVRS_HEADER_LENGTH;
+
+ while (p < end) {
+ h = (struct ivhd_header *)p;
+ if (*p == amd_iommu_target_ivhd_type) {
+
+ DUMP_printk("device: %02x:%02x.%01x cap: %04x "
+ "seg: %d flags: %01x info %04x\n",
+ PCI_BUS_NUM(h->devid), PCI_SLOT(h->devid),
+ PCI_FUNC(h->devid), h->cap_ptr,
+ h->pci_seg, h->flags, h->info);
+ DUMP_printk(" mmio-addr: %016llx\n",
+ h->mmio_phys);
+
+ iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
+ if (iommu == NULL)
+ return -ENOMEM;
+
+ ret = init_iommu_one(iommu, h);
+ if (ret)
+ return ret;
+ }
+ p += h->length;
+
+ }
+ WARN_ON(p != end);
+
+ return 0;
+}
+
+static void init_iommu_perf_ctr(struct amd_iommu *iommu)
+{
+ u64 val;
+ struct pci_dev *pdev = iommu->dev;
+
+ if (!iommu_feature(iommu, FEATURE_PC))
+ return;
+
+ amd_iommu_pc_present = true;
+
+ pci_info(pdev, "IOMMU performance counters supported\n");
+
+ val = readl(iommu->mmio_base + MMIO_CNTR_CONF_OFFSET);
+ iommu->max_banks = (u8) ((val >> 12) & 0x3f);
+ iommu->max_counters = (u8) ((val >> 7) & 0xf);
+
+ return;
+}
+
+static ssize_t amd_iommu_show_cap(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct amd_iommu *iommu = dev_to_amd_iommu(dev);
+ return sprintf(buf, "%x\n", iommu->cap);
+}
+static DEVICE_ATTR(cap, S_IRUGO, amd_iommu_show_cap, NULL);
+
+static ssize_t amd_iommu_show_features(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct amd_iommu *iommu = dev_to_amd_iommu(dev);
+ return sprintf(buf, "%llx\n", iommu->features);
+}
+static DEVICE_ATTR(features, S_IRUGO, amd_iommu_show_features, NULL);
+
+static struct attribute *amd_iommu_attrs[] = {
+ &dev_attr_cap.attr,
+ &dev_attr_features.attr,
+ NULL,
+};
+
+static struct attribute_group amd_iommu_group = {
+ .name = "amd-iommu",
+ .attrs = amd_iommu_attrs,
+};
+
+static const struct attribute_group *amd_iommu_groups[] = {
+ &amd_iommu_group,
+ NULL,
+};
+
+/*
+ * Note: IVHD 0x11 and 0x40 also contains exact copy
+ * of the IOMMU Extended Feature Register [MMIO Offset 0030h].
+ * Default to EFR in IVHD since it is available sooner (i.e. before PCI init).
+ */
+static void __init late_iommu_features_init(struct amd_iommu *iommu)
+{
+ u64 features;
+
+ if (!(iommu->cap & (1 << IOMMU_CAP_EFR)))
+ return;
+
+ /* read extended feature bits */
+ features = readq(iommu->mmio_base + MMIO_EXT_FEATURES);
+
+ if (!iommu->features) {
+ iommu->features = features;
+ return;
+ }
+
+ /*
+ * Sanity check and warn if EFR values from
+ * IVHD and MMIO conflict.
+ */
+ if (features != iommu->features)
+ pr_warn(FW_WARN "EFR mismatch. Use IVHD EFR (%#llx : %#llx).\n",
+ features, iommu->features);
+}
+
+static int __init iommu_init_pci(struct amd_iommu *iommu)
+{
+ int cap_ptr = iommu->cap_ptr;
+ int ret;
+
+ iommu->dev = pci_get_domain_bus_and_slot(0, PCI_BUS_NUM(iommu->devid),
+ iommu->devid & 0xff);
+ if (!iommu->dev)
+ return -ENODEV;
+
+ /* Prevent binding other PCI device drivers to IOMMU devices */
+ iommu->dev->match_driver = false;
+
+ pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
+ &iommu->cap);
+
+ if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB)))
+ amd_iommu_iotlb_sup = false;
+
+ late_iommu_features_init(iommu);
+
+ if (iommu_feature(iommu, FEATURE_GT)) {
+ int glxval;
+ u32 max_pasid;
+ u64 pasmax;
+
+ pasmax = iommu->features & FEATURE_PASID_MASK;
+ pasmax >>= FEATURE_PASID_SHIFT;
+ max_pasid = (1 << (pasmax + 1)) - 1;
+
+ amd_iommu_max_pasid = min(amd_iommu_max_pasid, max_pasid);
+
+ BUG_ON(amd_iommu_max_pasid & ~PASID_MASK);
+
+ glxval = iommu->features & FEATURE_GLXVAL_MASK;
+ glxval >>= FEATURE_GLXVAL_SHIFT;
+
+ if (amd_iommu_max_glx_val == -1)
+ amd_iommu_max_glx_val = glxval;
+ else
+ amd_iommu_max_glx_val = min(amd_iommu_max_glx_val, glxval);
+ }
+
+ if (iommu_feature(iommu, FEATURE_GT) &&
+ iommu_feature(iommu, FEATURE_PPR)) {
+ iommu->is_iommu_v2 = true;
+ amd_iommu_v2_present = true;
+ }
+
+ if (iommu_feature(iommu, FEATURE_PPR) && alloc_ppr_log(iommu))
+ return -ENOMEM;
+
+ ret = iommu_init_ga_log(iommu);
+ if (ret)
+ return ret;
+
+ if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
+ amd_iommu_np_cache = true;
+
+ init_iommu_perf_ctr(iommu);
+
+ if (is_rd890_iommu(iommu->dev)) {
+ int i, j;
+
+ iommu->root_pdev =
+ pci_get_domain_bus_and_slot(0, iommu->dev->bus->number,
+ PCI_DEVFN(0, 0));
+
+ /*
+ * Some rd890 systems may not be fully reconfigured by the
+ * BIOS, so it's necessary for us to store this information so
+ * it can be reprogrammed on resume
+ */
+ pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
+ &iommu->stored_addr_lo);
+ pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
+ &iommu->stored_addr_hi);
+
+ /* Low bit locks writes to configuration space */
+ iommu->stored_addr_lo &= ~1;
+
+ for (i = 0; i < 6; i++)
+ for (j = 0; j < 0x12; j++)
+ iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
+
+ for (i = 0; i < 0x83; i++)
+ iommu->stored_l2[i] = iommu_read_l2(iommu, i);
+ }
+
+ amd_iommu_erratum_746_workaround(iommu);
+ amd_iommu_ats_write_check_workaround(iommu);
+
+ iommu_device_sysfs_add(&iommu->iommu, &iommu->dev->dev,
+ amd_iommu_groups, "ivhd%d", iommu->index);
+ iommu_device_set_ops(&iommu->iommu, &amd_iommu_ops);
+ iommu_device_register(&iommu->iommu);
+
+ return pci_enable_device(iommu->dev);
+}
+
+static void print_iommu_info(void)
+{
+ static const char * const feat_str[] = {
+ "PreF", "PPR", "X2APIC", "NX", "GT", "[5]",
+ "IA", "GA", "HE", "PC"
+ };
+ struct amd_iommu *iommu;
+
+ for_each_iommu(iommu) {
+ struct pci_dev *pdev = iommu->dev;
+ int i;
+
+ pci_info(pdev, "Found IOMMU cap 0x%hx\n", iommu->cap_ptr);
+
+ if (iommu->cap & (1 << IOMMU_CAP_EFR)) {
+ pr_info("Extended features (%#llx):", iommu->features);
+
+ for (i = 0; i < ARRAY_SIZE(feat_str); ++i) {
+ if (iommu_feature(iommu, (1ULL << i)))
+ pr_cont(" %s", feat_str[i]);
+ }
+
+ if (iommu->features & FEATURE_GAM_VAPIC)
+ pr_cont(" GA_vAPIC");
+
+ pr_cont("\n");
+ }
+ }
+ if (irq_remapping_enabled) {
+ pr_info("Interrupt remapping enabled\n");
+ if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+ pr_info("Virtual APIC enabled\n");
+ if (amd_iommu_xt_mode == IRQ_REMAP_X2APIC_MODE)
+ pr_info("X2APIC enabled\n");
+ }
+}
+
+static int __init amd_iommu_init_pci(void)
+{
+ struct amd_iommu *iommu;
+ int ret = 0;
+
+ for_each_iommu(iommu) {
+ ret = iommu_init_pci(iommu);
+ if (ret)
+ break;
+
+ /* Need to setup range after PCI init */
+ iommu_set_cwwb_range(iommu);
+ }
+
+ /*
+ * Order is important here to make sure any unity map requirements are
+ * fulfilled. The unity mappings are created and written to the device
+ * table during the amd_iommu_init_api() call.
+ *
+ * After that we call init_device_table_dma() to make sure any
+ * uninitialized DTE will block DMA, and in the end we flush the caches
+ * of all IOMMUs to make sure the changes to the device table are
+ * active.
+ */
+ ret = amd_iommu_init_api();
+
+ init_device_table_dma();
+
+ for_each_iommu(iommu)
+ iommu_flush_all_caches(iommu);
+
+ if (!ret)
+ print_iommu_info();
+
+ return ret;
+}
+
+/****************************************************************************
+ *
+ * The following functions initialize the MSI interrupts for all IOMMUs
+ * in the system. It's a bit challenging because there could be multiple
+ * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
+ * pci_dev.
+ *
+ ****************************************************************************/
+
+static int iommu_setup_msi(struct amd_iommu *iommu)
+{
+ int r;
+
+ r = pci_enable_msi(iommu->dev);
+ if (r)
+ return r;
+
+ r = request_threaded_irq(iommu->dev->irq,
+ amd_iommu_int_handler,
+ amd_iommu_int_thread,
+ 0, "AMD-Vi",
+ iommu);
+
+ if (r) {
+ pci_disable_msi(iommu->dev);
+ return r;
+ }
+
+ iommu->int_enabled = true;
+
+ return 0;
+}
+
+#define XT_INT_DEST_MODE(x) (((x) & 0x1ULL) << 2)
+#define XT_INT_DEST_LO(x) (((x) & 0xFFFFFFULL) << 8)
+#define XT_INT_VEC(x) (((x) & 0xFFULL) << 32)
+#define XT_INT_DEST_HI(x) ((((x) >> 24) & 0xFFULL) << 56)
+
+/*
+ * Setup the IntCapXT registers with interrupt routing information
+ * based on the PCI MSI capability block registers, accessed via
+ * MMIO MSI address low/hi and MSI data registers.
+ */
+static void iommu_update_intcapxt(struct amd_iommu *iommu)
+{
+ u64 val;
+ u32 addr_lo = readl(iommu->mmio_base + MMIO_MSI_ADDR_LO_OFFSET);
+ u32 addr_hi = readl(iommu->mmio_base + MMIO_MSI_ADDR_HI_OFFSET);
+ u32 data = readl(iommu->mmio_base + MMIO_MSI_DATA_OFFSET);
+ bool dm = (addr_lo >> MSI_ADDR_DEST_MODE_SHIFT) & 0x1;
+ u32 dest = ((addr_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xFF);
+
+ if (x2apic_enabled())
+ dest |= MSI_ADDR_EXT_DEST_ID(addr_hi);
+
+ val = XT_INT_VEC(data & 0xFF) |
+ XT_INT_DEST_MODE(dm) |
+ XT_INT_DEST_LO(dest) |
+ XT_INT_DEST_HI(dest);
+
+ /**
+ * Current IOMMU implemtation uses the same IRQ for all
+ * 3 IOMMU interrupts.
+ */
+ writeq(val, iommu->mmio_base + MMIO_INTCAPXT_EVT_OFFSET);
+ writeq(val, iommu->mmio_base + MMIO_INTCAPXT_PPR_OFFSET);
+ writeq(val, iommu->mmio_base + MMIO_INTCAPXT_GALOG_OFFSET);
+}
+
+static void _irq_notifier_notify(struct irq_affinity_notify *notify,
+ const cpumask_t *mask)
+{
+ struct amd_iommu *iommu;
+
+ for_each_iommu(iommu) {
+ if (iommu->dev->irq == notify->irq) {
+ iommu_update_intcapxt(iommu);
+ break;
+ }
+ }
+}
+
+static void _irq_notifier_release(struct kref *ref)
+{
+}
+
+static int iommu_init_intcapxt(struct amd_iommu *iommu)
+{
+ int ret;
+ struct irq_affinity_notify *notify = &iommu->intcapxt_notify;
+
+ /**
+ * IntCapXT requires XTSup=1 and MsiCapMmioSup=1,
+ * which can be inferred from amd_iommu_xt_mode.
+ */
+ if (amd_iommu_xt_mode != IRQ_REMAP_X2APIC_MODE)
+ return 0;
+
+ /**
+ * Also, we need to setup notifier to update the IntCapXT registers
+ * whenever the irq affinity is changed from user-space.
+ */
+ notify->irq = iommu->dev->irq;
+ notify->notify = _irq_notifier_notify,
+ notify->release = _irq_notifier_release,
+ ret = irq_set_affinity_notifier(iommu->dev->irq, notify);
+ if (ret) {
+ pr_err("Failed to register irq affinity notifier (devid=%#x, irq %d)\n",
+ iommu->devid, iommu->dev->irq);
+ return ret;
+ }
+
+ iommu_update_intcapxt(iommu);
+ iommu_feature_enable(iommu, CONTROL_INTCAPXT_EN);
+ return ret;
+}
+
+static int iommu_init_msi(struct amd_iommu *iommu)
+{
+ int ret;
+
+ if (iommu->int_enabled)
+ goto enable_faults;
+
+ if (iommu->dev->msi_cap)
+ ret = iommu_setup_msi(iommu);
+ else
+ ret = -ENODEV;
+
+ if (ret)
+ return ret;
+
+enable_faults:
+ ret = iommu_init_intcapxt(iommu);
+ if (ret)
+ return ret;
+
+ iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+
+ if (iommu->ppr_log != NULL)
+ iommu_feature_enable(iommu, CONTROL_PPRINT_EN);
+
+ iommu_ga_log_enable(iommu);
+
+ return 0;
+}
+
+/****************************************************************************
+ *
+ * The next functions belong to the third pass of parsing the ACPI
+ * table. In this last pass the memory mapping requirements are
+ * gathered (like exclusion and unity mapping ranges).
+ *
+ ****************************************************************************/
+
+static void __init free_unity_maps(void)
+{
+ struct unity_map_entry *entry, *next;
+
+ list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
+ list_del(&entry->list);
+ kfree(entry);
+ }
+}
+
+/* called for unity map ACPI definition */
+static int __init init_unity_map_range(struct ivmd_header *m)
+{
+ struct unity_map_entry *e = NULL;
+ char *s;
+
+ e = kzalloc(sizeof(*e), GFP_KERNEL);
+ if (e == NULL)
+ return -ENOMEM;
+
+ switch (m->type) {
+ default:
+ kfree(e);
+ return 0;
+ case ACPI_IVMD_TYPE:
+ s = "IVMD_TYPEi\t\t\t";
+ e->devid_start = e->devid_end = m->devid;
+ break;
+ case ACPI_IVMD_TYPE_ALL:
+ s = "IVMD_TYPE_ALL\t\t";
+ e->devid_start = 0;
+ e->devid_end = amd_iommu_last_bdf;
+ break;
+ case ACPI_IVMD_TYPE_RANGE:
+ s = "IVMD_TYPE_RANGE\t\t";
+ e->devid_start = m->devid;
+ e->devid_end = m->aux;
+ break;
+ }
+ e->address_start = PAGE_ALIGN(m->range_start);
+ e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
+ e->prot = m->flags >> 1;
+
+ /*
+ * Treat per-device exclusion ranges as r/w unity-mapped regions
+ * since some buggy BIOSes might lead to the overwritten exclusion
+ * range (exclusion_start and exclusion_length members). This
+ * happens when there are multiple exclusion ranges (IVMD entries)
+ * defined in ACPI table.
+ */
+ if (m->flags & IVMD_FLAG_EXCL_RANGE)
+ e->prot = (IVMD_FLAG_IW | IVMD_FLAG_IR) >> 1;
+
+ DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
+ " range_start: %016llx range_end: %016llx flags: %x\n", s,
+ PCI_BUS_NUM(e->devid_start), PCI_SLOT(e->devid_start),
+ PCI_FUNC(e->devid_start), PCI_BUS_NUM(e->devid_end),
+ PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
+ e->address_start, e->address_end, m->flags);
+
+ list_add_tail(&e->list, &amd_iommu_unity_map);
+
+ return 0;
+}
+
+/* iterates over all memory definitions we find in the ACPI table */
+static int __init init_memory_definitions(struct acpi_table_header *table)
+{
+ u8 *p = (u8 *)table, *end = (u8 *)table;
+ struct ivmd_header *m;
+
+ end += table->length;
+ p += IVRS_HEADER_LENGTH;
+
+ while (p < end) {
+ m = (struct ivmd_header *)p;
+ if (m->flags & (IVMD_FLAG_UNITY_MAP | IVMD_FLAG_EXCL_RANGE))
+ init_unity_map_range(m);
+
+ p += m->length;
+ }
+
+ return 0;
+}
+
+/*
+ * Init the device table to not allow DMA access for devices
+ */
+static void init_device_table_dma(void)
+{
+ u32 devid;
+
+ for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
+ set_dev_entry_bit(devid, DEV_ENTRY_VALID);
+ set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
+ }
+}
+
+static void __init uninit_device_table_dma(void)
+{
+ u32 devid;
+
+ for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
+ amd_iommu_dev_table[devid].data[0] = 0ULL;
+ amd_iommu_dev_table[devid].data[1] = 0ULL;
+ }
+}
+
+static void init_device_table(void)
+{
+ u32 devid;
+
+ if (!amd_iommu_irq_remap)
+ return;
+
+ for (devid = 0; devid <= amd_iommu_last_bdf; ++devid)
+ set_dev_entry_bit(devid, DEV_ENTRY_IRQ_TBL_EN);
+}
+
+static void iommu_init_flags(struct amd_iommu *iommu)
+{
+ iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ?
+ iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
+ iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
+
+ iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ?
+ iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
+ iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
+
+ iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
+ iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
+ iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
+
+ iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ?
+ iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
+ iommu_feature_disable(iommu, CONTROL_ISOC_EN);
+
+ /*
+ * make IOMMU memory accesses cache coherent
+ */
+ iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
+
+ /* Set IOTLB invalidation timeout to 1s */
+ iommu_set_inv_tlb_timeout(iommu, CTRL_INV_TO_1S);
+}
+
+static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
+{
+ int i, j;
+ u32 ioc_feature_control;
+ struct pci_dev *pdev = iommu->root_pdev;
+
+ /* RD890 BIOSes may not have completely reconfigured the iommu */
+ if (!is_rd890_iommu(iommu->dev) || !pdev)
+ return;
+
+ /*
+ * First, we need to ensure that the iommu is enabled. This is
+ * controlled by a register in the northbridge
+ */
+
+ /* Select Northbridge indirect register 0x75 and enable writing */
+ pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
+ pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
+
+ /* Enable the iommu */
+ if (!(ioc_feature_control & 0x1))
+ pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
+
+ /* Restore the iommu BAR */
+ pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+ iommu->stored_addr_lo);
+ pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
+ iommu->stored_addr_hi);
+
+ /* Restore the l1 indirect regs for each of the 6 l1s */
+ for (i = 0; i < 6; i++)
+ for (j = 0; j < 0x12; j++)
+ iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
+
+ /* Restore the l2 indirect regs */
+ for (i = 0; i < 0x83; i++)
+ iommu_write_l2(iommu, i, iommu->stored_l2[i]);
+
+ /* Lock PCI setup registers */
+ pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+ iommu->stored_addr_lo | 1);
+}
+
+static void iommu_enable_ga(struct amd_iommu *iommu)
+{
+#ifdef CONFIG_IRQ_REMAP
+ switch (amd_iommu_guest_ir) {
+ case AMD_IOMMU_GUEST_IR_VAPIC:
+ iommu_feature_enable(iommu, CONTROL_GAM_EN);
+ fallthrough;
+ case AMD_IOMMU_GUEST_IR_LEGACY_GA:
+ iommu_feature_enable(iommu, CONTROL_GA_EN);
+ iommu->irte_ops = &irte_128_ops;
+ break;
+ default:
+ iommu->irte_ops = &irte_32_ops;
+ break;
+ }
+#endif
+}
+
+static void early_enable_iommu(struct amd_iommu *iommu)
+{
+ iommu_disable(iommu);
+ iommu_init_flags(iommu);
+ iommu_set_device_table(iommu);
+ iommu_enable_command_buffer(iommu);
+ iommu_enable_event_buffer(iommu);
+ iommu_set_exclusion_range(iommu);
+ iommu_enable_ga(iommu);
+ iommu_enable_xt(iommu);
+ iommu_enable(iommu);
+ iommu_flush_all_caches(iommu);
+}
+
+/*
+ * This function finally enables all IOMMUs found in the system after
+ * they have been initialized.
+ *
+ * Or if in kdump kernel and IOMMUs are all pre-enabled, try to copy
+ * the old content of device table entries. Not this case or copy failed,
+ * just continue as normal kernel does.
+ */
+static void early_enable_iommus(void)
+{
+ struct amd_iommu *iommu;
+
+
+ if (!copy_device_table()) {
+ /*
+ * If come here because of failure in copying device table from old
+ * kernel with all IOMMUs enabled, print error message and try to
+ * free allocated old_dev_tbl_cpy.
+ */
+ if (amd_iommu_pre_enabled)
+ pr_err("Failed to copy DEV table from previous kernel.\n");
+ if (old_dev_tbl_cpy != NULL)
+ free_pages((unsigned long)old_dev_tbl_cpy,
+ get_order(dev_table_size));
+
+ for_each_iommu(iommu) {
+ clear_translation_pre_enabled(iommu);
+ early_enable_iommu(iommu);
+ }
+ } else {
+ pr_info("Copied DEV table from previous kernel.\n");
+ free_pages((unsigned long)amd_iommu_dev_table,
+ get_order(dev_table_size));
+ amd_iommu_dev_table = old_dev_tbl_cpy;
+ for_each_iommu(iommu) {
+ iommu_disable_command_buffer(iommu);
+ iommu_disable_event_buffer(iommu);
+ iommu_enable_command_buffer(iommu);
+ iommu_enable_event_buffer(iommu);
+ iommu_enable_ga(iommu);
+ iommu_enable_xt(iommu);
+ iommu_set_device_table(iommu);
+ iommu_flush_all_caches(iommu);
+ }
+ }
+
+#ifdef CONFIG_IRQ_REMAP
+ /*
+ * Note: We have already checked GASup from IVRS table.
+ * Now, we need to make sure that GAMSup is set.
+ */
+ if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
+ !check_feature_on_all_iommus(FEATURE_GAM_VAPIC))
+ amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY_GA;
+
+ if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+ amd_iommu_irq_ops.capability |= (1 << IRQ_POSTING_CAP);
+#endif
+}
+
+static void enable_iommus_v2(void)
+{
+ struct amd_iommu *iommu;
+
+ for_each_iommu(iommu) {
+ iommu_enable_ppr_log(iommu);
+ iommu_enable_gt(iommu);
+ }
+}
+
+static void enable_iommus(void)
+{
+ early_enable_iommus();
+
+ enable_iommus_v2();
+}
+
+static void disable_iommus(void)
+{
+ struct amd_iommu *iommu;
+
+ for_each_iommu(iommu)
+ iommu_disable(iommu);
+
+#ifdef CONFIG_IRQ_REMAP
+ if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+ amd_iommu_irq_ops.capability &= ~(1 << IRQ_POSTING_CAP);
+#endif
+}
+
+/*
+ * Suspend/Resume support
+ * disable suspend until real resume implemented
+ */
+
+static void amd_iommu_resume(void)
+{
+ struct amd_iommu *iommu;
+
+ for_each_iommu(iommu)
+ iommu_apply_resume_quirks(iommu);
+
+ /* re-load the hardware */
+ enable_iommus();
+
+ amd_iommu_enable_interrupts();
+}
+
+static int amd_iommu_suspend(void)
+{
+ /* disable IOMMUs to go out of the way for BIOS */
+ disable_iommus();
+
+ return 0;
+}
+
+static struct syscore_ops amd_iommu_syscore_ops = {
+ .suspend = amd_iommu_suspend,
+ .resume = amd_iommu_resume,
+};
+
+static void __init free_iommu_resources(void)
+{
+ kmemleak_free(irq_lookup_table);
+ free_pages((unsigned long)irq_lookup_table,
+ get_order(rlookup_table_size));
+ irq_lookup_table = NULL;
+
+ kmem_cache_destroy(amd_iommu_irq_cache);
+ amd_iommu_irq_cache = NULL;
+
+ free_pages((unsigned long)amd_iommu_rlookup_table,
+ get_order(rlookup_table_size));
+ amd_iommu_rlookup_table = NULL;
+
+ free_pages((unsigned long)amd_iommu_alias_table,
+ get_order(alias_table_size));
+ amd_iommu_alias_table = NULL;
+
+ free_pages((unsigned long)amd_iommu_dev_table,
+ get_order(dev_table_size));
+ amd_iommu_dev_table = NULL;
+
+ free_iommu_all();
+}
+
+/* SB IOAPIC is always on this device in AMD systems */
+#define IOAPIC_SB_DEVID ((0x00 << 8) | PCI_DEVFN(0x14, 0))
+
+static bool __init check_ioapic_information(void)
+{
+ const char *fw_bug = FW_BUG;
+ bool ret, has_sb_ioapic;
+ int idx;
+
+ has_sb_ioapic = false;
+ ret = false;
+
+ /*
+ * If we have map overrides on the kernel command line the
+ * messages in this function might not describe firmware bugs
+ * anymore - so be careful
+ */
+ if (cmdline_maps)
+ fw_bug = "";
+
+ for (idx = 0; idx < nr_ioapics; idx++) {
+ int devid, id = mpc_ioapic_id(idx);
+
+ devid = get_ioapic_devid(id);
+ if (devid < 0) {
+ pr_err("%s: IOAPIC[%d] not in IVRS table\n",
+ fw_bug, id);
+ ret = false;
+ } else if (devid == IOAPIC_SB_DEVID) {
+ has_sb_ioapic = true;
+ ret = true;
+ }
+ }
+
+ if (!has_sb_ioapic) {
+ /*
+ * We expect the SB IOAPIC to be listed in the IVRS
+ * table. The system timer is connected to the SB IOAPIC
+ * and if we don't have it in the list the system will
+ * panic at boot time. This situation usually happens
+ * when the BIOS is buggy and provides us the wrong
+ * device id for the IOAPIC in the system.
+ */
+ pr_err("%s: No southbridge IOAPIC found\n", fw_bug);
+ }
+
+ if (!ret)
+ pr_err("Disabling interrupt remapping\n");
+
+ return ret;
+}
+
+static void __init free_dma_resources(void)
+{
+ free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
+ get_order(MAX_DOMAIN_ID/8));
+ amd_iommu_pd_alloc_bitmap = NULL;
+
+ free_unity_maps();
+}
+
+static void __init ivinfo_init(void *ivrs)
+{
+ amd_iommu_ivinfo = *((u32 *)(ivrs + IOMMU_IVINFO_OFFSET));
+}
+
+/*
+ * This is the hardware init function for AMD IOMMU in the system.
+ * This function is called either from amd_iommu_init or from the interrupt
+ * remapping setup code.
+ *
+ * This function basically parses the ACPI table for AMD IOMMU (IVRS)
+ * four times:
+ *
+ * 1 pass) Discover the most comprehensive IVHD type to use.
+ *
+ * 2 pass) Find the highest PCI device id the driver has to handle.
+ * Upon this information the size of the data structures is
+ * determined that needs to be allocated.
+ *
+ * 3 pass) Initialize the data structures just allocated with the
+ * information in the ACPI table about available AMD IOMMUs
+ * in the system. It also maps the PCI devices in the
+ * system to specific IOMMUs
+ *
+ * 4 pass) After the basic data structures are allocated and
+ * initialized we update them with information about memory
+ * remapping requirements parsed out of the ACPI table in
+ * this last pass.
+ *
+ * After everything is set up the IOMMUs are enabled and the necessary
+ * hotplug and suspend notifiers are registered.
+ */
+static int __init early_amd_iommu_init(void)
+{
+ struct acpi_table_header *ivrs_base;
+ acpi_status status;
+ int i, remap_cache_sz, ret = 0;
+ u32 pci_id;
+
+ if (!amd_iommu_detected)
+ return -ENODEV;
+
+ status = acpi_get_table("IVRS", 0, &ivrs_base);
+ if (status == AE_NOT_FOUND)
+ return -ENODEV;
+ else if (ACPI_FAILURE(status)) {
+ const char *err = acpi_format_exception(status);
+ pr_err("IVRS table error: %s\n", err);
+ return -EINVAL;
+ }
+
+ /*
+ * Validate checksum here so we don't need to do it when
+ * we actually parse the table
+ */
+ ret = check_ivrs_checksum(ivrs_base);
+ if (ret)
+ goto out;
+
+ ivinfo_init(ivrs_base);
+
+ amd_iommu_target_ivhd_type = get_highest_supported_ivhd_type(ivrs_base);
+ DUMP_printk("Using IVHD type %#x\n", amd_iommu_target_ivhd_type);
+
+ /*
+ * First parse ACPI tables to find the largest Bus/Dev/Func
+ * we need to handle. Upon this information the shared data
+ * structures for the IOMMUs in the system will be allocated
+ */
+ ret = find_last_devid_acpi(ivrs_base);
+ if (ret)
+ goto out;
+
+ dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
+ alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
+ rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
+
+ /* Device table - directly used by all IOMMUs */
+ ret = -ENOMEM;
+ amd_iommu_dev_table = (void *)__get_free_pages(
+ GFP_KERNEL | __GFP_ZERO | GFP_DMA32,
+ get_order(dev_table_size));
+ if (amd_iommu_dev_table == NULL)
+ goto out;
+
+ /*
+ * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
+ * IOMMU see for that device
+ */
+ amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
+ get_order(alias_table_size));
+ if (amd_iommu_alias_table == NULL)
+ goto out;
+
+ /* IOMMU rlookup table - find the IOMMU for a specific device */
+ amd_iommu_rlookup_table = (void *)__get_free_pages(
+ GFP_KERNEL | __GFP_ZERO,
+ get_order(rlookup_table_size));
+ if (amd_iommu_rlookup_table == NULL)
+ goto out;
+
+ amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
+ GFP_KERNEL | __GFP_ZERO,
+ get_order(MAX_DOMAIN_ID/8));
+ if (amd_iommu_pd_alloc_bitmap == NULL)
+ goto out;
+
+ /*
+ * let all alias entries point to itself
+ */
+ for (i = 0; i <= amd_iommu_last_bdf; ++i)
+ amd_iommu_alias_table[i] = i;
+
+ /*
+ * never allocate domain 0 because its used as the non-allocated and
+ * error value placeholder
+ */
+ __set_bit(0, amd_iommu_pd_alloc_bitmap);
+
+ /*
+ * now the data structures are allocated and basically initialized
+ * start the real acpi table scan
+ */
+ ret = init_iommu_all(ivrs_base);
+ if (ret)
+ goto out;
+
+ /* Disable IOMMU if there's Stoney Ridge graphics */
+ for (i = 0; i < 32; i++) {
+ pci_id = read_pci_config(0, i, 0, 0);
+ if ((pci_id & 0xffff) == 0x1002 && (pci_id >> 16) == 0x98e4) {
+ pr_info("Disable IOMMU on Stoney Ridge\n");
+ amd_iommu_disabled = true;
+ break;
+ }
+ }
+
+ /* Disable any previously enabled IOMMUs */
+ if (!is_kdump_kernel() || amd_iommu_disabled)
+ disable_iommus();
+
+ if (amd_iommu_irq_remap)
+ amd_iommu_irq_remap = check_ioapic_information();
+
+ if (amd_iommu_irq_remap) {
+ /*
+ * Interrupt remapping enabled, create kmem_cache for the
+ * remapping tables.
+ */
+ ret = -ENOMEM;
+ if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
+ remap_cache_sz = MAX_IRQS_PER_TABLE * sizeof(u32);
+ else
+ remap_cache_sz = MAX_IRQS_PER_TABLE * (sizeof(u64) * 2);
+ amd_iommu_irq_cache = kmem_cache_create("irq_remap_cache",
+ remap_cache_sz,
+ IRQ_TABLE_ALIGNMENT,
+ 0, NULL);
+ if (!amd_iommu_irq_cache)
+ goto out;
+
+ irq_lookup_table = (void *)__get_free_pages(
+ GFP_KERNEL | __GFP_ZERO,
+ get_order(rlookup_table_size));
+ kmemleak_alloc(irq_lookup_table, rlookup_table_size,
+ 1, GFP_KERNEL);
+ if (!irq_lookup_table)
+ goto out;
+ }
+
+ ret = init_memory_definitions(ivrs_base);
+ if (ret)
+ goto out;
+
+ /* init the device table */
+ init_device_table();
+
+out:
+ /* Don't leak any ACPI memory */
+ acpi_put_table(ivrs_base);
+ ivrs_base = NULL;
+
+ return ret;
+}
+
+static int amd_iommu_enable_interrupts(void)
+{
+ struct amd_iommu *iommu;
+ int ret = 0;
+
+ for_each_iommu(iommu) {
+ ret = iommu_init_msi(iommu);
+ if (ret)
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+static bool detect_ivrs(void)
+{
+ struct acpi_table_header *ivrs_base;
+ acpi_status status;
+
+ status = acpi_get_table("IVRS", 0, &ivrs_base);
+ if (status == AE_NOT_FOUND)
+ return false;
+ else if (ACPI_FAILURE(status)) {
+ const char *err = acpi_format_exception(status);
+ pr_err("IVRS table error: %s\n", err);
+ return false;
+ }
+
+ acpi_put_table(ivrs_base);
+
+ /* Make sure ACS will be enabled during PCI probe */
+ pci_request_acs();
+
+ return true;
+}
+
+/****************************************************************************
+ *
+ * AMD IOMMU Initialization State Machine
+ *
+ ****************************************************************************/
+
+static int __init state_next(void)
+{
+ int ret = 0;
+
+ switch (init_state) {
+ case IOMMU_START_STATE:
+ if (!detect_ivrs()) {
+ init_state = IOMMU_NOT_FOUND;
+ ret = -ENODEV;
+ } else {
+ init_state = IOMMU_IVRS_DETECTED;
+ }
+ break;
+ case IOMMU_IVRS_DETECTED:
+ ret = early_amd_iommu_init();
+ init_state = ret ? IOMMU_INIT_ERROR : IOMMU_ACPI_FINISHED;
+ if (init_state == IOMMU_ACPI_FINISHED && amd_iommu_disabled) {
+ pr_info("AMD IOMMU disabled\n");
+ init_state = IOMMU_CMDLINE_DISABLED;
+ ret = -EINVAL;
+ }
+ break;
+ case IOMMU_ACPI_FINISHED:
+ early_enable_iommus();
+ x86_platform.iommu_shutdown = disable_iommus;
+ init_state = IOMMU_ENABLED;
+ break;
+ case IOMMU_ENABLED:
+ register_syscore_ops(&amd_iommu_syscore_ops);
+ ret = amd_iommu_init_pci();
+ init_state = ret ? IOMMU_INIT_ERROR : IOMMU_PCI_INIT;
+ enable_iommus_v2();
+ break;
+ case IOMMU_PCI_INIT:
+ ret = amd_iommu_enable_interrupts();
+ init_state = ret ? IOMMU_INIT_ERROR : IOMMU_INTERRUPTS_EN;
+ break;
+ case IOMMU_INTERRUPTS_EN:
+ ret = amd_iommu_init_dma_ops();
+ init_state = ret ? IOMMU_INIT_ERROR : IOMMU_DMA_OPS;
+ break;
+ case IOMMU_DMA_OPS:
+ init_state = IOMMU_INITIALIZED;
+ break;
+ case IOMMU_INITIALIZED:
+ /* Nothing to do */
+ break;
+ case IOMMU_NOT_FOUND:
+ case IOMMU_INIT_ERROR:
+ case IOMMU_CMDLINE_DISABLED:
+ /* Error states => do nothing */
+ ret = -EINVAL;
+ break;
+ default:
+ /* Unknown state */
+ BUG();
+ }
+
+ if (ret) {
+ free_dma_resources();
+ if (!irq_remapping_enabled) {
+ disable_iommus();
+ free_iommu_resources();
+ } else {
+ struct amd_iommu *iommu;
+
+ uninit_device_table_dma();
+ for_each_iommu(iommu)
+ iommu_flush_all_caches(iommu);
+ }
+ }
+ return ret;
+}
+
+static int __init iommu_go_to_state(enum iommu_init_state state)
+{
+ int ret = -EINVAL;
+
+ while (init_state != state) {
+ if (init_state == IOMMU_NOT_FOUND ||
+ init_state == IOMMU_INIT_ERROR ||
+ init_state == IOMMU_CMDLINE_DISABLED)
+ break;
+ ret = state_next();
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_IRQ_REMAP
+int __init amd_iommu_prepare(void)
+{
+ int ret;
+
+ amd_iommu_irq_remap = true;
+
+ ret = iommu_go_to_state(IOMMU_ACPI_FINISHED);
+ if (ret)
+ return ret;
+ return amd_iommu_irq_remap ? 0 : -ENODEV;
+}
+
+int __init amd_iommu_enable(void)
+{
+ int ret;
+
+ ret = iommu_go_to_state(IOMMU_ENABLED);
+ if (ret)
+ return ret;
+
+ irq_remapping_enabled = 1;
+ return amd_iommu_xt_mode;
+}
+
+void amd_iommu_disable(void)
+{
+ amd_iommu_suspend();
+}
+
+int amd_iommu_reenable(int mode)
+{
+ amd_iommu_resume();
+
+ return 0;
+}
+
+int __init amd_iommu_enable_faulting(void)
+{
+ /* We enable MSI later when PCI is initialized */
+ return 0;
+}
+#endif
+
+/*
+ * This is the core init function for AMD IOMMU hardware in the system.
+ * This function is called from the generic x86 DMA layer initialization
+ * code.
+ */
+static int __init amd_iommu_init(void)
+{
+ struct amd_iommu *iommu;
+ int ret;
+
+ ret = iommu_go_to_state(IOMMU_INITIALIZED);
+#ifdef CONFIG_GART_IOMMU
+ if (ret && list_empty(&amd_iommu_list)) {
+ /*
+ * We failed to initialize the AMD IOMMU - try fallback
+ * to GART if possible.
+ */
+ gart_iommu_init();
+ }
+#endif
+
+ for_each_iommu(iommu)
+ amd_iommu_debugfs_setup(iommu);
+
+ return ret;
+}
+
+static bool amd_iommu_sme_check(void)
+{
+ if (!sme_active() || (boot_cpu_data.x86 != 0x17))
+ return true;
+
+ /* For Fam17h, a specific level of support is required */
+ if (boot_cpu_data.microcode >= 0x08001205)
+ return true;
+
+ if ((boot_cpu_data.microcode >= 0x08001126) &&
+ (boot_cpu_data.microcode <= 0x080011ff))
+ return true;
+
+ pr_notice("IOMMU not currently supported when SME is active\n");
+
+ return false;
+}
+
+/****************************************************************************
+ *
+ * Early detect code. This code runs at IOMMU detection time in the DMA
+ * layer. It just looks if there is an IVRS ACPI table to detect AMD
+ * IOMMUs
+ *
+ ****************************************************************************/
+int __init amd_iommu_detect(void)
+{
+ int ret;
+
+ if (no_iommu || (iommu_detected && !gart_iommu_aperture))
+ return -ENODEV;
+
+ if (!amd_iommu_sme_check())
+ return -ENODEV;
+
+ ret = iommu_go_to_state(IOMMU_IVRS_DETECTED);
+ if (ret)
+ return ret;
+
+ amd_iommu_detected = true;
+ iommu_detected = 1;
+ x86_init.iommu.iommu_init = amd_iommu_init;
+
+ return 1;
+}
+
+/****************************************************************************
+ *
+ * Parsing functions for the AMD IOMMU specific kernel command line
+ * options.
+ *
+ ****************************************************************************/
+
+static int __init parse_amd_iommu_dump(char *str)
+{
+ amd_iommu_dump = true;
+
+ return 1;
+}
+
+static int __init parse_amd_iommu_intr(char *str)
+{
+ for (; *str; ++str) {
+ if (strncmp(str, "legacy", 6) == 0) {
+ amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY_GA;
+ break;
+ }
+ if (strncmp(str, "vapic", 5) == 0) {
+ amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_VAPIC;
+ break;
+ }
+ }
+ return 1;
+}
+
+static int __init parse_amd_iommu_options(char *str)
+{
+ for (; *str; ++str) {
+ if (strncmp(str, "fullflush", 9) == 0)
+ amd_iommu_unmap_flush = true;
+ if (strncmp(str, "off", 3) == 0)
+ amd_iommu_disabled = true;
+ if (strncmp(str, "force_isolation", 15) == 0)
+ amd_iommu_force_isolation = true;
+ }
+
+ return 1;
+}
+
+static int __init parse_ivrs_ioapic(char *str)
+{
+ u32 seg = 0, bus, dev, fn;
+ int id, i;
+ u32 devid;
+
+ if (sscanf(str, "=%d@%x:%x.%x", &id, &bus, &dev, &fn) == 4 ||
+ sscanf(str, "=%d@%x:%x:%x.%x", &id, &seg, &bus, &dev, &fn) == 5)
+ goto found;
+
+ if (sscanf(str, "[%d]=%x:%x.%x", &id, &bus, &dev, &fn) == 4 ||
+ sscanf(str, "[%d]=%x:%x:%x.%x", &id, &seg, &bus, &dev, &fn) == 5) {
+ pr_warn("ivrs_ioapic%s option format deprecated; use ivrs_ioapic=%d@%04x:%02x:%02x.%d instead\n",
+ str, id, seg, bus, dev, fn);
+ goto found;
+ }
+
+ pr_err("Invalid command line: ivrs_ioapic%s\n", str);
+ return 1;
+
+found:
+ if (early_ioapic_map_size == EARLY_MAP_SIZE) {
+ pr_err("Early IOAPIC map overflow - ignoring ivrs_ioapic%s\n",
+ str);
+ return 1;
+ }
+
+ devid = IVRS_GET_SBDF_ID(seg, bus, dev, fn);
+
+ cmdline_maps = true;
+ i = early_ioapic_map_size++;
+ early_ioapic_map[i].id = id;
+ early_ioapic_map[i].devid = devid;
+ early_ioapic_map[i].cmd_line = true;
+
+ return 1;
+}
+
+static int __init parse_ivrs_hpet(char *str)
+{
+ u32 seg = 0, bus, dev, fn;
+ int id, i;
+ u32 devid;
+
+ if (sscanf(str, "=%d@%x:%x.%x", &id, &bus, &dev, &fn) == 4 ||
+ sscanf(str, "=%d@%x:%x:%x.%x", &id, &seg, &bus, &dev, &fn) == 5)
+ goto found;
+
+ if (sscanf(str, "[%d]=%x:%x.%x", &id, &bus, &dev, &fn) == 4 ||
+ sscanf(str, "[%d]=%x:%x:%x.%x", &id, &seg, &bus, &dev, &fn) == 5) {
+ pr_warn("ivrs_hpet%s option format deprecated; use ivrs_hpet=%d@%04x:%02x:%02x.%d instead\n",
+ str, id, seg, bus, dev, fn);
+ goto found;
+ }
+
+ pr_err("Invalid command line: ivrs_hpet%s\n", str);
+ return 1;
+
+found:
+ if (early_hpet_map_size == EARLY_MAP_SIZE) {
+ pr_err("Early HPET map overflow - ignoring ivrs_hpet%s\n",
+ str);
+ return 1;
+ }
+
+ devid = IVRS_GET_SBDF_ID(seg, bus, dev, fn);
+
+ cmdline_maps = true;
+ i = early_hpet_map_size++;
+ early_hpet_map[i].id = id;
+ early_hpet_map[i].devid = devid;
+ early_hpet_map[i].cmd_line = true;
+
+ return 1;
+}
+
+#define ACPIID_LEN (ACPIHID_UID_LEN + ACPIHID_HID_LEN)
+
+static int __init parse_ivrs_acpihid(char *str)
+{
+ u32 seg = 0, bus, dev, fn;
+ char *hid, *uid, *p, *addr;
+ char acpiid[ACPIID_LEN] = {0};
+ int i;
+
+ addr = strchr(str, '@');
+ if (!addr) {
+ addr = strchr(str, '=');
+ if (!addr)
+ goto not_found;
+
+ ++addr;
+
+ if (strlen(addr) > ACPIID_LEN)
+ goto not_found;
+
+ if (sscanf(str, "[%x:%x.%x]=%s", &bus, &dev, &fn, acpiid) == 4 ||
+ sscanf(str, "[%x:%x:%x.%x]=%s", &seg, &bus, &dev, &fn, acpiid) == 5) {
+ pr_warn("ivrs_acpihid%s option format deprecated; use ivrs_acpihid=%s@%04x:%02x:%02x.%d instead\n",
+ str, acpiid, seg, bus, dev, fn);
+ goto found;
+ }
+ goto not_found;
+ }
+
+ /* We have the '@', make it the terminator to get just the acpiid */
+ *addr++ = 0;
+
+ if (strlen(str) > ACPIID_LEN + 1)
+ goto not_found;
+
+ if (sscanf(str, "=%s", acpiid) != 1)
+ goto not_found;
+
+ if (sscanf(addr, "%x:%x.%x", &bus, &dev, &fn) == 3 ||
+ sscanf(addr, "%x:%x:%x.%x", &seg, &bus, &dev, &fn) == 4)
+ goto found;
+
+not_found:
+ pr_err("Invalid command line: ivrs_acpihid%s\n", str);
+ return 1;
+
+found:
+ p = acpiid;
+ hid = strsep(&p, ":");
+ uid = p;
+
+ if (!hid || !(*hid) || !uid) {
+ pr_err("Invalid command line: hid or uid\n");
+ return 1;
+ }
+
+ /*
+ * Ignore leading zeroes after ':', so e.g., AMDI0095:00
+ * will match AMDI0095:0 in the second strcmp in acpi_dev_hid_uid_match
+ */
+ while (*uid == '0' && *(uid + 1))
+ uid++;
+
+ i = early_acpihid_map_size++;
+ memcpy(early_acpihid_map[i].hid, hid, strlen(hid));
+ memcpy(early_acpihid_map[i].uid, uid, strlen(uid));
+ early_acpihid_map[i].devid = IVRS_GET_SBDF_ID(seg, bus, dev, fn);
+ early_acpihid_map[i].cmd_line = true;
+
+ return 1;
+}
+
+__setup("amd_iommu_dump", parse_amd_iommu_dump);
+__setup("amd_iommu=", parse_amd_iommu_options);
+__setup("amd_iommu_intr=", parse_amd_iommu_intr);
+__setup("ivrs_ioapic", parse_ivrs_ioapic);
+__setup("ivrs_hpet", parse_ivrs_hpet);
+__setup("ivrs_acpihid", parse_ivrs_acpihid);
+
+IOMMU_INIT_FINISH(amd_iommu_detect,
+ gart_iommu_hole_init,
+ NULL,
+ NULL);
+
+bool amd_iommu_v2_supported(void)
+{
+ return amd_iommu_v2_present;
+}
+EXPORT_SYMBOL(amd_iommu_v2_supported);
+
+struct amd_iommu *get_amd_iommu(unsigned int idx)
+{
+ unsigned int i = 0;
+ struct amd_iommu *iommu;
+
+ for_each_iommu(iommu)
+ if (i++ == idx)
+ return iommu;
+ return NULL;
+}
+EXPORT_SYMBOL(get_amd_iommu);
+
+/****************************************************************************
+ *
+ * IOMMU EFR Performance Counter support functionality. This code allows
+ * access to the IOMMU PC functionality.
+ *
+ ****************************************************************************/
+
+u8 amd_iommu_pc_get_max_banks(unsigned int idx)
+{
+ struct amd_iommu *iommu = get_amd_iommu(idx);
+
+ if (iommu)
+ return iommu->max_banks;
+
+ return 0;
+}
+EXPORT_SYMBOL(amd_iommu_pc_get_max_banks);
+
+bool amd_iommu_pc_supported(void)
+{
+ return amd_iommu_pc_present;
+}
+EXPORT_SYMBOL(amd_iommu_pc_supported);
+
+u8 amd_iommu_pc_get_max_counters(unsigned int idx)
+{
+ struct amd_iommu *iommu = get_amd_iommu(idx);
+
+ if (iommu)
+ return iommu->max_counters;
+
+ return 0;
+}
+EXPORT_SYMBOL(amd_iommu_pc_get_max_counters);
+
+static int iommu_pc_get_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
+ u8 fxn, u64 *value, bool is_write)
+{
+ u32 offset;
+ u32 max_offset_lim;
+
+ /* Make sure the IOMMU PC resource is available */
+ if (!amd_iommu_pc_present)
+ return -ENODEV;
+
+ /* Check for valid iommu and pc register indexing */
+ if (WARN_ON(!iommu || (fxn > 0x28) || (fxn & 7)))
+ return -ENODEV;
+
+ offset = (u32)(((0x40 | bank) << 12) | (cntr << 8) | fxn);
+
+ /* Limit the offset to the hw defined mmio region aperture */
+ max_offset_lim = (u32)(((0x40 | iommu->max_banks) << 12) |
+ (iommu->max_counters << 8) | 0x28);
+ if ((offset < MMIO_CNTR_REG_OFFSET) ||
+ (offset > max_offset_lim))
+ return -EINVAL;
+
+ if (is_write) {
+ u64 val = *value & GENMASK_ULL(47, 0);
+
+ writel((u32)val, iommu->mmio_base + offset);
+ writel((val >> 32), iommu->mmio_base + offset + 4);
+ } else {
+ *value = readl(iommu->mmio_base + offset + 4);
+ *value <<= 32;
+ *value |= readl(iommu->mmio_base + offset);
+ *value &= GENMASK_ULL(47, 0);
+ }
+
+ return 0;
+}
+
+int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64 *value)
+{
+ if (!iommu)
+ return -EINVAL;
+
+ return iommu_pc_get_set_reg(iommu, bank, cntr, fxn, value, false);
+}
+EXPORT_SYMBOL(amd_iommu_pc_get_reg);
+
+int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64 *value)
+{
+ if (!iommu)
+ return -EINVAL;
+
+ return iommu_pc_get_set_reg(iommu, bank, cntr, fxn, value, true);
+}
+EXPORT_SYMBOL(amd_iommu_pc_set_reg);
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
new file mode 100644
index 000000000..0a061a196
--- /dev/null
+++ b/drivers/iommu/amd/iommu.c
@@ -0,0 +1,4144 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <jroedel@suse.de>
+ * Leo Duran <leo.duran@amd.com>
+ */
+
+#define pr_fmt(fmt) "AMD-Vi: " fmt
+#define dev_fmt(fmt) pr_fmt(fmt)
+
+#include <linux/ratelimit.h>
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/amba/bus.h>
+#include <linux/platform_device.h>
+#include <linux/pci-ats.h>
+#include <linux/bitmap.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/scatterlist.h>
+#include <linux/dma-map-ops.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-iommu.h>
+#include <linux/iommu-helper.h>
+#include <linux/delay.h>
+#include <linux/amd-iommu.h>
+#include <linux/notifier.h>
+#include <linux/export.h>
+#include <linux/irq.h>
+#include <linux/msi.h>
+#include <linux/irqdomain.h>
+#include <linux/percpu.h>
+#include <linux/iova.h>
+#include <asm/irq_remapping.h>
+#include <asm/io_apic.h>
+#include <asm/apic.h>
+#include <asm/hw_irq.h>
+#include <asm/msidef.h>
+#include <asm/proto.h>
+#include <asm/iommu.h>
+#include <asm/gart.h>
+#include <asm/dma.h>
+
+#include "amd_iommu.h"
+#include "../irq_remapping.h"
+
+#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
+
+#define LOOP_TIMEOUT 100000
+
+/* IO virtual address start page frame number */
+#define IOVA_START_PFN (1)
+#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
+
+/* Reserved IOVA ranges */
+#define MSI_RANGE_START (0xfee00000)
+#define MSI_RANGE_END (0xfeefffff)
+#define HT_RANGE_START (0xfd00000000ULL)
+#define HT_RANGE_END (0xffffffffffULL)
+
+/*
+ * This bitmap is used to advertise the page sizes our hardware support
+ * to the IOMMU core, which will then use this information to split
+ * physically contiguous memory regions it is mapping into page sizes
+ * that we support.
+ *
+ * 512GB Pages are not supported due to a hardware bug
+ */
+#define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38))
+
+#define DEFAULT_PGTABLE_LEVEL PAGE_MODE_3_LEVEL
+
+static DEFINE_SPINLOCK(pd_bitmap_lock);
+
+/* List of all available dev_data structures */
+static LLIST_HEAD(dev_data_list);
+
+LIST_HEAD(ioapic_map);
+LIST_HEAD(hpet_map);
+LIST_HEAD(acpihid_map);
+
+/*
+ * Domain for untranslated devices - only allocated
+ * if iommu=pt passed on kernel cmd line.
+ */
+const struct iommu_ops amd_iommu_ops;
+
+static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
+int amd_iommu_max_glx_val = -1;
+
+/*
+ * general struct to manage commands send to an IOMMU
+ */
+struct iommu_cmd {
+ u32 data[4];
+};
+
+struct kmem_cache *amd_iommu_irq_cache;
+
+static void update_domain(struct protection_domain *domain);
+static void detach_device(struct device *dev);
+static void update_and_flush_device_table(struct protection_domain *domain,
+ struct domain_pgtable *pgtable);
+
+/****************************************************************************
+ *
+ * Helper functions
+ *
+ ****************************************************************************/
+
+static inline u16 get_pci_device_id(struct device *dev)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ return pci_dev_id(pdev);
+}
+
+static inline int get_acpihid_device_id(struct device *dev,
+ struct acpihid_map_entry **entry)
+{
+ struct acpi_device *adev = ACPI_COMPANION(dev);
+ struct acpihid_map_entry *p;
+
+ if (!adev)
+ return -ENODEV;
+
+ list_for_each_entry(p, &acpihid_map, list) {
+ if (acpi_dev_hid_uid_match(adev, p->hid,
+ p->uid[0] ? p->uid : NULL)) {
+ if (entry)
+ *entry = p;
+ return p->devid;
+ }
+ }
+ return -EINVAL;
+}
+
+static inline int get_device_id(struct device *dev)
+{
+ int devid;
+
+ if (dev_is_pci(dev))
+ devid = get_pci_device_id(dev);
+ else
+ devid = get_acpihid_device_id(dev, NULL);
+
+ return devid;
+}
+
+static struct protection_domain *to_pdomain(struct iommu_domain *dom)
+{
+ return container_of(dom, struct protection_domain, domain);
+}
+
+static void amd_iommu_domain_get_pgtable(struct protection_domain *domain,
+ struct domain_pgtable *pgtable)
+{
+ u64 pt_root = atomic64_read(&domain->pt_root);
+
+ pgtable->root = (u64 *)(pt_root & PAGE_MASK);
+ pgtable->mode = pt_root & 7; /* lowest 3 bits encode pgtable mode */
+}
+
+static void amd_iommu_domain_set_pt_root(struct protection_domain *domain, u64 root)
+{
+ atomic64_set(&domain->pt_root, root);
+}
+
+static void amd_iommu_domain_clr_pt_root(struct protection_domain *domain)
+{
+ amd_iommu_domain_set_pt_root(domain, 0);
+}
+
+static void amd_iommu_domain_set_pgtable(struct protection_domain *domain,
+ u64 *root, int mode)
+{
+ u64 pt_root;
+
+ /* lowest 3 bits encode pgtable mode */
+ pt_root = mode & 7;
+ pt_root |= (u64)root;
+
+ amd_iommu_domain_set_pt_root(domain, pt_root);
+}
+
+static struct iommu_dev_data *alloc_dev_data(u16 devid)
+{
+ struct iommu_dev_data *dev_data;
+
+ dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
+ if (!dev_data)
+ return NULL;
+
+ spin_lock_init(&dev_data->lock);
+ dev_data->devid = devid;
+ ratelimit_default_init(&dev_data->rs);
+
+ llist_add(&dev_data->dev_data_list, &dev_data_list);
+ return dev_data;
+}
+
+static struct iommu_dev_data *search_dev_data(u16 devid)
+{
+ struct iommu_dev_data *dev_data;
+ struct llist_node *node;
+
+ if (llist_empty(&dev_data_list))
+ return NULL;
+
+ node = dev_data_list.first;
+ llist_for_each_entry(dev_data, node, dev_data_list) {
+ if (dev_data->devid == devid)
+ return dev_data;
+ }
+
+ return NULL;
+}
+
+static int clone_alias(struct pci_dev *pdev, u16 alias, void *data)
+{
+ u16 devid = pci_dev_id(pdev);
+
+ if (devid == alias)
+ return 0;
+
+ amd_iommu_rlookup_table[alias] =
+ amd_iommu_rlookup_table[devid];
+ memcpy(amd_iommu_dev_table[alias].data,
+ amd_iommu_dev_table[devid].data,
+ sizeof(amd_iommu_dev_table[alias].data));
+
+ return 0;
+}
+
+static void clone_aliases(struct pci_dev *pdev)
+{
+ if (!pdev)
+ return;
+
+ /*
+ * The IVRS alias stored in the alias table may not be
+ * part of the PCI DMA aliases if it's bus differs
+ * from the original device.
+ */
+ clone_alias(pdev, amd_iommu_alias_table[pci_dev_id(pdev)], NULL);
+
+ pci_for_each_dma_alias(pdev, clone_alias, NULL);
+}
+
+static struct pci_dev *setup_aliases(struct device *dev)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ u16 ivrs_alias;
+
+ /* For ACPI HID devices, there are no aliases */
+ if (!dev_is_pci(dev))
+ return NULL;
+
+ /*
+ * Add the IVRS alias to the pci aliases if it is on the same
+ * bus. The IVRS table may know about a quirk that we don't.
+ */
+ ivrs_alias = amd_iommu_alias_table[pci_dev_id(pdev)];
+ if (ivrs_alias != pci_dev_id(pdev) &&
+ PCI_BUS_NUM(ivrs_alias) == pdev->bus->number)
+ pci_add_dma_alias(pdev, ivrs_alias & 0xff, 1);
+
+ clone_aliases(pdev);
+
+ return pdev;
+}
+
+static struct iommu_dev_data *find_dev_data(u16 devid)
+{
+ struct iommu_dev_data *dev_data;
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+ dev_data = search_dev_data(devid);
+
+ if (dev_data == NULL) {
+ dev_data = alloc_dev_data(devid);
+ if (!dev_data)
+ return NULL;
+
+ if (translation_pre_enabled(iommu))
+ dev_data->defer_attach = true;
+ }
+
+ return dev_data;
+}
+
+/*
+* Find or create an IOMMU group for a acpihid device.
+*/
+static struct iommu_group *acpihid_device_group(struct device *dev)
+{
+ struct acpihid_map_entry *p, *entry = NULL;
+ int devid;
+
+ devid = get_acpihid_device_id(dev, &entry);
+ if (devid < 0)
+ return ERR_PTR(devid);
+
+ list_for_each_entry(p, &acpihid_map, list) {
+ if ((devid == p->devid) && p->group)
+ entry->group = p->group;
+ }
+
+ if (!entry->group)
+ entry->group = generic_device_group(dev);
+ else
+ iommu_group_ref_get(entry->group);
+
+ return entry->group;
+}
+
+static bool pci_iommuv2_capable(struct pci_dev *pdev)
+{
+ static const int caps[] = {
+ PCI_EXT_CAP_ID_PRI,
+ PCI_EXT_CAP_ID_PASID,
+ };
+ int i, pos;
+
+ if (!pci_ats_supported(pdev))
+ return false;
+
+ for (i = 0; i < 2; ++i) {
+ pos = pci_find_ext_capability(pdev, caps[i]);
+ if (pos == 0)
+ return false;
+ }
+
+ return true;
+}
+
+static bool pdev_pri_erratum(struct pci_dev *pdev, u32 erratum)
+{
+ struct iommu_dev_data *dev_data;
+
+ dev_data = dev_iommu_priv_get(&pdev->dev);
+
+ return dev_data->errata & (1 << erratum) ? true : false;
+}
+
+/*
+ * This function checks if the driver got a valid device from the caller to
+ * avoid dereferencing invalid pointers.
+ */
+static bool check_device(struct device *dev)
+{
+ int devid;
+
+ if (!dev)
+ return false;
+
+ devid = get_device_id(dev);
+ if (devid < 0)
+ return false;
+
+ /* Out of our scope? */
+ if (devid > amd_iommu_last_bdf)
+ return false;
+
+ if (amd_iommu_rlookup_table[devid] == NULL)
+ return false;
+
+ return true;
+}
+
+static int iommu_init_device(struct device *dev)
+{
+ struct iommu_dev_data *dev_data;
+ int devid;
+
+ if (dev_iommu_priv_get(dev))
+ return 0;
+
+ devid = get_device_id(dev);
+ if (devid < 0)
+ return devid;
+
+ dev_data = find_dev_data(devid);
+ if (!dev_data)
+ return -ENOMEM;
+
+ dev_data->pdev = setup_aliases(dev);
+
+ /*
+ * By default we use passthrough mode for IOMMUv2 capable device.
+ * But if amd_iommu=force_isolation is set (e.g. to debug DMA to
+ * invalid address), we ignore the capability for the device so
+ * it'll be forced to go into translation mode.
+ */
+ if ((iommu_default_passthrough() || !amd_iommu_force_isolation) &&
+ dev_is_pci(dev) && pci_iommuv2_capable(to_pci_dev(dev))) {
+ struct amd_iommu *iommu;
+
+ iommu = amd_iommu_rlookup_table[dev_data->devid];
+ dev_data->iommu_v2 = iommu->is_iommu_v2;
+ }
+
+ dev_iommu_priv_set(dev, dev_data);
+
+ return 0;
+}
+
+static void iommu_ignore_device(struct device *dev)
+{
+ int devid;
+
+ devid = get_device_id(dev);
+ if (devid < 0)
+ return;
+
+ amd_iommu_rlookup_table[devid] = NULL;
+ memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry));
+
+ setup_aliases(dev);
+}
+
+static void amd_iommu_uninit_device(struct device *dev)
+{
+ struct iommu_dev_data *dev_data;
+
+ dev_data = dev_iommu_priv_get(dev);
+ if (!dev_data)
+ return;
+
+ if (dev_data->domain)
+ detach_device(dev);
+
+ dev_iommu_priv_set(dev, NULL);
+
+ /*
+ * We keep dev_data around for unplugged devices and reuse it when the
+ * device is re-plugged - not doing so would introduce a ton of races.
+ */
+}
+
+/*
+ * Helper function to get the first pte of a large mapping
+ */
+static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
+ unsigned long *count)
+{
+ unsigned long pte_mask, pg_size, cnt;
+ u64 *fpte;
+
+ pg_size = PTE_PAGE_SIZE(*pte);
+ cnt = PAGE_SIZE_PTE_COUNT(pg_size);
+ pte_mask = ~((cnt << 3) - 1);
+ fpte = (u64 *)(((unsigned long)pte) & pte_mask);
+
+ if (page_size)
+ *page_size = pg_size;
+
+ if (count)
+ *count = cnt;
+
+ return fpte;
+}
+
+/****************************************************************************
+ *
+ * Interrupt handling functions
+ *
+ ****************************************************************************/
+
+static void dump_dte_entry(u16 devid)
+{
+ int i;
+
+ for (i = 0; i < 4; ++i)
+ pr_err("DTE[%d]: %016llx\n", i,
+ amd_iommu_dev_table[devid].data[i]);
+}
+
+static void dump_command(unsigned long phys_addr)
+{
+ struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr);
+ int i;
+
+ for (i = 0; i < 4; ++i)
+ pr_err("CMD[%d]: %08x\n", i, cmd->data[i]);
+}
+
+static void amd_iommu_report_rmp_hw_error(volatile u32 *event)
+{
+ struct iommu_dev_data *dev_data = NULL;
+ int devid, vmg_tag, flags;
+ struct pci_dev *pdev;
+ u64 spa;
+
+ devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
+ vmg_tag = (event[1]) & 0xFFFF;
+ flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
+ spa = ((u64)event[3] << 32) | (event[2] & 0xFFFFFFF8);
+
+ pdev = pci_get_domain_bus_and_slot(0, PCI_BUS_NUM(devid),
+ devid & 0xff);
+ if (pdev)
+ dev_data = dev_iommu_priv_get(&pdev->dev);
+
+ if (dev_data && __ratelimit(&dev_data->rs)) {
+ pci_err(pdev, "Event logged [RMP_HW_ERROR vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
+ vmg_tag, spa, flags);
+ } else {
+ pr_err_ratelimited("Event logged [RMP_HW_ERROR device=%02x:%02x.%x, vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
+ PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ vmg_tag, spa, flags);
+ }
+
+ if (pdev)
+ pci_dev_put(pdev);
+}
+
+static void amd_iommu_report_rmp_fault(volatile u32 *event)
+{
+ struct iommu_dev_data *dev_data = NULL;
+ int devid, flags_rmp, vmg_tag, flags;
+ struct pci_dev *pdev;
+ u64 gpa;
+
+ devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
+ flags_rmp = (event[0] >> EVENT_FLAGS_SHIFT) & 0xFF;
+ vmg_tag = (event[1]) & 0xFFFF;
+ flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
+ gpa = ((u64)event[3] << 32) | event[2];
+
+ pdev = pci_get_domain_bus_and_slot(0, PCI_BUS_NUM(devid),
+ devid & 0xff);
+ if (pdev)
+ dev_data = dev_iommu_priv_get(&pdev->dev);
+
+ if (dev_data && __ratelimit(&dev_data->rs)) {
+ pci_err(pdev, "Event logged [RMP_PAGE_FAULT vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
+ vmg_tag, gpa, flags_rmp, flags);
+ } else {
+ pr_err_ratelimited("Event logged [RMP_PAGE_FAULT device=%02x:%02x.%x, vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
+ PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ vmg_tag, gpa, flags_rmp, flags);
+ }
+
+ if (pdev)
+ pci_dev_put(pdev);
+}
+
+static void amd_iommu_report_page_fault(u16 devid, u16 domain_id,
+ u64 address, int flags)
+{
+ struct iommu_dev_data *dev_data = NULL;
+ struct pci_dev *pdev;
+
+ pdev = pci_get_domain_bus_and_slot(0, PCI_BUS_NUM(devid),
+ devid & 0xff);
+ if (pdev)
+ dev_data = dev_iommu_priv_get(&pdev->dev);
+
+ if (dev_data && __ratelimit(&dev_data->rs)) {
+ pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n",
+ domain_id, address, flags);
+ } else if (printk_ratelimit()) {
+ pr_err("Event logged [IO_PAGE_FAULT device=%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n",
+ PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ domain_id, address, flags);
+ }
+
+ if (pdev)
+ pci_dev_put(pdev);
+}
+
+static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
+{
+ struct device *dev = iommu->iommu.dev;
+ int type, devid, flags, tag;
+ volatile u32 *event = __evt;
+ int count = 0;
+ u64 address;
+ u32 pasid;
+
+retry:
+ type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
+ devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
+ pasid = (event[0] & EVENT_DOMID_MASK_HI) |
+ (event[1] & EVENT_DOMID_MASK_LO);
+ flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
+ address = (u64)(((u64)event[3]) << 32) | event[2];
+
+ if (type == 0) {
+ /* Did we hit the erratum? */
+ if (++count == LOOP_TIMEOUT) {
+ pr_err("No event written to event log\n");
+ return;
+ }
+ udelay(1);
+ goto retry;
+ }
+
+ if (type == EVENT_TYPE_IO_FAULT) {
+ amd_iommu_report_page_fault(devid, pasid, address, flags);
+ return;
+ }
+
+ switch (type) {
+ case EVENT_TYPE_ILL_DEV:
+ dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
+ PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ pasid, address, flags);
+ dump_dte_entry(devid);
+ break;
+ case EVENT_TYPE_DEV_TAB_ERR:
+ dev_err(dev, "Event logged [DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
+ "address=0x%llx flags=0x%04x]\n",
+ PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ address, flags);
+ break;
+ case EVENT_TYPE_PAGE_TAB_ERR:
+ dev_err(dev, "Event logged [PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x pasid=0x%04x address=0x%llx flags=0x%04x]\n",
+ PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ pasid, address, flags);
+ break;
+ case EVENT_TYPE_ILL_CMD:
+ dev_err(dev, "Event logged [ILLEGAL_COMMAND_ERROR address=0x%llx]\n", address);
+ dump_command(address);
+ break;
+ case EVENT_TYPE_CMD_HARD_ERR:
+ dev_err(dev, "Event logged [COMMAND_HARDWARE_ERROR address=0x%llx flags=0x%04x]\n",
+ address, flags);
+ break;
+ case EVENT_TYPE_IOTLB_INV_TO:
+ dev_err(dev, "Event logged [IOTLB_INV_TIMEOUT device=%02x:%02x.%x address=0x%llx]\n",
+ PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ address);
+ break;
+ case EVENT_TYPE_INV_DEV_REQ:
+ dev_err(dev, "Event logged [INVALID_DEVICE_REQUEST device=%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
+ PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ pasid, address, flags);
+ break;
+ case EVENT_TYPE_RMP_FAULT:
+ amd_iommu_report_rmp_fault(event);
+ break;
+ case EVENT_TYPE_RMP_HW_ERR:
+ amd_iommu_report_rmp_hw_error(event);
+ break;
+ case EVENT_TYPE_INV_PPR_REQ:
+ pasid = PPR_PASID(*((u64 *)__evt));
+ tag = event[1] & 0x03FF;
+ dev_err(dev, "Event logged [INVALID_PPR_REQUEST device=%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x tag=0x%03x]\n",
+ PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ pasid, address, flags, tag);
+ break;
+ default:
+ dev_err(dev, "Event logged [UNKNOWN event[0]=0x%08x event[1]=0x%08x event[2]=0x%08x event[3]=0x%08x\n",
+ event[0], event[1], event[2], event[3]);
+ }
+
+ memset(__evt, 0, 4 * sizeof(u32));
+}
+
+static void iommu_poll_events(struct amd_iommu *iommu)
+{
+ u32 head, tail;
+
+ head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+ tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
+
+ while (head != tail) {
+ iommu_print_event(iommu, iommu->evt_buf + head);
+ head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE;
+ }
+
+ writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+}
+
+static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw)
+{
+ struct amd_iommu_fault fault;
+
+ if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) {
+ pr_err_ratelimited("Unknown PPR request received\n");
+ return;
+ }
+
+ fault.address = raw[1];
+ fault.pasid = PPR_PASID(raw[0]);
+ fault.device_id = PPR_DEVID(raw[0]);
+ fault.tag = PPR_TAG(raw[0]);
+ fault.flags = PPR_FLAGS(raw[0]);
+
+ atomic_notifier_call_chain(&ppr_notifier, 0, &fault);
+}
+
+static void iommu_poll_ppr_log(struct amd_iommu *iommu)
+{
+ u32 head, tail;
+
+ if (iommu->ppr_log == NULL)
+ return;
+
+ head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
+ tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
+
+ while (head != tail) {
+ volatile u64 *raw;
+ u64 entry[2];
+ int i;
+
+ raw = (u64 *)(iommu->ppr_log + head);
+
+ /*
+ * Hardware bug: Interrupt may arrive before the entry is
+ * written to memory. If this happens we need to wait for the
+ * entry to arrive.
+ */
+ for (i = 0; i < LOOP_TIMEOUT; ++i) {
+ if (PPR_REQ_TYPE(raw[0]) != 0)
+ break;
+ udelay(1);
+ }
+
+ /* Avoid memcpy function-call overhead */
+ entry[0] = raw[0];
+ entry[1] = raw[1];
+
+ /*
+ * To detect the hardware bug we need to clear the entry
+ * back to zero.
+ */
+ raw[0] = raw[1] = 0UL;
+
+ /* Update head pointer of hardware ring-buffer */
+ head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
+ writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
+
+ /* Handle PPR entry */
+ iommu_handle_ppr_entry(iommu, entry);
+
+ /* Refresh ring-buffer information */
+ head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
+ tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
+ }
+}
+
+#ifdef CONFIG_IRQ_REMAP
+static int (*iommu_ga_log_notifier)(u32);
+
+int amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
+{
+ iommu_ga_log_notifier = notifier;
+
+ return 0;
+}
+EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier);
+
+static void iommu_poll_ga_log(struct amd_iommu *iommu)
+{
+ u32 head, tail, cnt = 0;
+
+ if (iommu->ga_log == NULL)
+ return;
+
+ head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
+ tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
+
+ while (head != tail) {
+ volatile u64 *raw;
+ u64 log_entry;
+
+ raw = (u64 *)(iommu->ga_log + head);
+ cnt++;
+
+ /* Avoid memcpy function-call overhead */
+ log_entry = *raw;
+
+ /* Update head pointer of hardware ring-buffer */
+ head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE;
+ writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
+
+ /* Handle GA entry */
+ switch (GA_REQ_TYPE(log_entry)) {
+ case GA_GUEST_NR:
+ if (!iommu_ga_log_notifier)
+ break;
+
+ pr_debug("%s: devid=%#x, ga_tag=%#x\n",
+ __func__, GA_DEVID(log_entry),
+ GA_TAG(log_entry));
+
+ if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0)
+ pr_err("GA log notifier failed.\n");
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+static void
+amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu)
+{
+ if (!irq_remapping_enabled || !dev_is_pci(dev) ||
+ pci_dev_has_special_msi_domain(to_pci_dev(dev)))
+ return;
+
+ dev_set_msi_domain(dev, iommu->msi_domain);
+}
+
+#else /* CONFIG_IRQ_REMAP */
+static inline void
+amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { }
+#endif /* !CONFIG_IRQ_REMAP */
+
+#define AMD_IOMMU_INT_MASK \
+ (MMIO_STATUS_EVT_OVERFLOW_INT_MASK | \
+ MMIO_STATUS_EVT_INT_MASK | \
+ MMIO_STATUS_PPR_INT_MASK | \
+ MMIO_STATUS_GALOG_INT_MASK)
+
+irqreturn_t amd_iommu_int_thread(int irq, void *data)
+{
+ struct amd_iommu *iommu = (struct amd_iommu *) data;
+ u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+
+ while (status & AMD_IOMMU_INT_MASK) {
+ /* Enable interrupt sources again */
+ writel(AMD_IOMMU_INT_MASK,
+ iommu->mmio_base + MMIO_STATUS_OFFSET);
+
+ if (status & MMIO_STATUS_EVT_INT_MASK) {
+ pr_devel("Processing IOMMU Event Log\n");
+ iommu_poll_events(iommu);
+ }
+
+ if (status & MMIO_STATUS_PPR_INT_MASK) {
+ pr_devel("Processing IOMMU PPR Log\n");
+ iommu_poll_ppr_log(iommu);
+ }
+
+#ifdef CONFIG_IRQ_REMAP
+ if (status & MMIO_STATUS_GALOG_INT_MASK) {
+ pr_devel("Processing IOMMU GA Log\n");
+ iommu_poll_ga_log(iommu);
+ }
+#endif
+
+ if (status & MMIO_STATUS_EVT_OVERFLOW_INT_MASK) {
+ pr_info_ratelimited("IOMMU event log overflow\n");
+ amd_iommu_restart_event_logging(iommu);
+ }
+
+ /*
+ * Hardware bug: ERBT1312
+ * When re-enabling interrupt (by writing 1
+ * to clear the bit), the hardware might also try to set
+ * the interrupt bit in the event status register.
+ * In this scenario, the bit will be set, and disable
+ * subsequent interrupts.
+ *
+ * Workaround: The IOMMU driver should read back the
+ * status register and check if the interrupt bits are cleared.
+ * If not, driver will need to go through the interrupt handler
+ * again and re-clear the bits
+ */
+ status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+ }
+ return IRQ_HANDLED;
+}
+
+irqreturn_t amd_iommu_int_handler(int irq, void *data)
+{
+ return IRQ_WAKE_THREAD;
+}
+
+/****************************************************************************
+ *
+ * IOMMU command queuing functions
+ *
+ ****************************************************************************/
+
+static int wait_on_sem(struct amd_iommu *iommu, u64 data)
+{
+ int i = 0;
+
+ while (*iommu->cmd_sem != data && i < LOOP_TIMEOUT) {
+ udelay(1);
+ i += 1;
+ }
+
+ if (i == LOOP_TIMEOUT) {
+ pr_alert("Completion-Wait loop timed out\n");
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static void copy_cmd_to_buffer(struct amd_iommu *iommu,
+ struct iommu_cmd *cmd)
+{
+ u8 *target;
+ u32 tail;
+
+ /* Copy command to buffer */
+ tail = iommu->cmd_buf_tail;
+ target = iommu->cmd_buf + tail;
+ memcpy(target, cmd, sizeof(*cmd));
+
+ tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
+ iommu->cmd_buf_tail = tail;
+
+ /* Tell the IOMMU about it */
+ writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+}
+
+static void build_completion_wait(struct iommu_cmd *cmd,
+ struct amd_iommu *iommu,
+ u64 data)
+{
+ u64 paddr = iommu_virt_to_phys((void *)iommu->cmd_sem);
+
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
+ cmd->data[1] = upper_32_bits(paddr);
+ cmd->data[2] = lower_32_bits(data);
+ cmd->data[3] = upper_32_bits(data);
+ CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
+}
+
+static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
+{
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->data[0] = devid;
+ CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
+}
+
+static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
+ size_t size, u16 domid, int pde)
+{
+ u64 pages;
+ bool s;
+
+ pages = iommu_num_pages(address, size, PAGE_SIZE);
+ s = false;
+
+ if (pages > 1) {
+ /*
+ * If we have to flush more than one page, flush all
+ * TLB entries for this domain
+ */
+ address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+ s = true;
+ }
+
+ address &= PAGE_MASK;
+
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->data[1] |= domid;
+ cmd->data[2] = lower_32_bits(address);
+ cmd->data[3] = upper_32_bits(address);
+ CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
+ if (s) /* size bit - we flush more than one 4kb page */
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+ if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+}
+
+static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
+ u64 address, size_t size)
+{
+ u64 pages;
+ bool s;
+
+ pages = iommu_num_pages(address, size, PAGE_SIZE);
+ s = false;
+
+ if (pages > 1) {
+ /*
+ * If we have to flush more than one page, flush all
+ * TLB entries for this domain
+ */
+ address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+ s = true;
+ }
+
+ address &= PAGE_MASK;
+
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->data[0] = devid;
+ cmd->data[0] |= (qdep & 0xff) << 24;
+ cmd->data[1] = devid;
+ cmd->data[2] = lower_32_bits(address);
+ cmd->data[3] = upper_32_bits(address);
+ CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
+ if (s)
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+}
+
+static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, u32 pasid,
+ u64 address, bool size)
+{
+ memset(cmd, 0, sizeof(*cmd));
+
+ address &= ~(0xfffULL);
+
+ cmd->data[0] = pasid;
+ cmd->data[1] = domid;
+ cmd->data[2] = lower_32_bits(address);
+ cmd->data[3] = upper_32_bits(address);
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
+ if (size)
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+ CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
+}
+
+static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, u32 pasid,
+ int qdep, u64 address, bool size)
+{
+ memset(cmd, 0, sizeof(*cmd));
+
+ address &= ~(0xfffULL);
+
+ cmd->data[0] = devid;
+ cmd->data[0] |= ((pasid >> 8) & 0xff) << 16;
+ cmd->data[0] |= (qdep & 0xff) << 24;
+ cmd->data[1] = devid;
+ cmd->data[1] |= (pasid & 0xff) << 16;
+ cmd->data[2] = lower_32_bits(address);
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
+ cmd->data[3] = upper_32_bits(address);
+ if (size)
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+ CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
+}
+
+static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid,
+ int status, int tag, bool gn)
+{
+ memset(cmd, 0, sizeof(*cmd));
+
+ cmd->data[0] = devid;
+ if (gn) {
+ cmd->data[1] = pasid;
+ cmd->data[2] = CMD_INV_IOMMU_PAGES_GN_MASK;
+ }
+ cmd->data[3] = tag & 0x1ff;
+ cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;
+
+ CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
+}
+
+static void build_inv_all(struct iommu_cmd *cmd)
+{
+ memset(cmd, 0, sizeof(*cmd));
+ CMD_SET_TYPE(cmd, CMD_INV_ALL);
+}
+
+static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
+{
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->data[0] = devid;
+ CMD_SET_TYPE(cmd, CMD_INV_IRT);
+}
+
+/*
+ * Writes the command to the IOMMUs command buffer and informs the
+ * hardware about the new command.
+ */
+static int __iommu_queue_command_sync(struct amd_iommu *iommu,
+ struct iommu_cmd *cmd,
+ bool sync)
+{
+ unsigned int count = 0;
+ u32 left, next_tail;
+
+ next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
+again:
+ left = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE;
+
+ if (left <= 0x20) {
+ /* Skip udelay() the first time around */
+ if (count++) {
+ if (count == LOOP_TIMEOUT) {
+ pr_err("Command buffer timeout\n");
+ return -EIO;
+ }
+
+ udelay(1);
+ }
+
+ /* Update head and recheck remaining space */
+ iommu->cmd_buf_head = readl(iommu->mmio_base +
+ MMIO_CMD_HEAD_OFFSET);
+
+ goto again;
+ }
+
+ copy_cmd_to_buffer(iommu, cmd);
+
+ /* Do we need to make sure all commands are processed? */
+ iommu->need_sync = sync;
+
+ return 0;
+}
+
+static int iommu_queue_command_sync(struct amd_iommu *iommu,
+ struct iommu_cmd *cmd,
+ bool sync)
+{
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&iommu->lock, flags);
+ ret = __iommu_queue_command_sync(iommu, cmd, sync);
+ raw_spin_unlock_irqrestore(&iommu->lock, flags);
+
+ return ret;
+}
+
+static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
+{
+ return iommu_queue_command_sync(iommu, cmd, true);
+}
+
+/*
+ * This function queues a completion wait command into the command
+ * buffer of an IOMMU
+ */
+static int iommu_completion_wait(struct amd_iommu *iommu)
+{
+ struct iommu_cmd cmd;
+ unsigned long flags;
+ int ret;
+ u64 data;
+
+ if (!iommu->need_sync)
+ return 0;
+
+ raw_spin_lock_irqsave(&iommu->lock, flags);
+
+ data = ++iommu->cmd_sem_val;
+ build_completion_wait(&cmd, iommu, data);
+
+ ret = __iommu_queue_command_sync(iommu, &cmd, false);
+ if (ret)
+ goto out_unlock;
+
+ ret = wait_on_sem(iommu, data);
+
+out_unlock:
+ raw_spin_unlock_irqrestore(&iommu->lock, flags);
+
+ return ret;
+}
+
+static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
+{
+ struct iommu_cmd cmd;
+
+ build_inv_dte(&cmd, devid);
+
+ return iommu_queue_command(iommu, &cmd);
+}
+
+static void amd_iommu_flush_dte_all(struct amd_iommu *iommu)
+{
+ u32 devid;
+
+ for (devid = 0; devid <= 0xffff; ++devid)
+ iommu_flush_dte(iommu, devid);
+
+ iommu_completion_wait(iommu);
+}
+
+/*
+ * This function uses heavy locking and may disable irqs for some time. But
+ * this is no issue because it is only called during resume.
+ */
+static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu)
+{
+ u32 dom_id;
+
+ for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
+ struct iommu_cmd cmd;
+ build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
+ dom_id, 1);
+ iommu_queue_command(iommu, &cmd);
+ }
+
+ iommu_completion_wait(iommu);
+}
+
+static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id)
+{
+ struct iommu_cmd cmd;
+
+ build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
+ dom_id, 1);
+ iommu_queue_command(iommu, &cmd);
+
+ iommu_completion_wait(iommu);
+}
+
+static void amd_iommu_flush_all(struct amd_iommu *iommu)
+{
+ struct iommu_cmd cmd;
+
+ build_inv_all(&cmd);
+
+ iommu_queue_command(iommu, &cmd);
+ iommu_completion_wait(iommu);
+}
+
+static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid)
+{
+ struct iommu_cmd cmd;
+
+ build_inv_irt(&cmd, devid);
+
+ iommu_queue_command(iommu, &cmd);
+}
+
+static void amd_iommu_flush_irt_all(struct amd_iommu *iommu)
+{
+ u32 devid;
+
+ for (devid = 0; devid <= MAX_DEV_TABLE_ENTRIES; devid++)
+ iommu_flush_irt(iommu, devid);
+
+ iommu_completion_wait(iommu);
+}
+
+void iommu_flush_all_caches(struct amd_iommu *iommu)
+{
+ if (iommu_feature(iommu, FEATURE_IA)) {
+ amd_iommu_flush_all(iommu);
+ } else {
+ amd_iommu_flush_dte_all(iommu);
+ amd_iommu_flush_irt_all(iommu);
+ amd_iommu_flush_tlb_all(iommu);
+ }
+}
+
+/*
+ * Command send function for flushing on-device TLB
+ */
+static int device_flush_iotlb(struct iommu_dev_data *dev_data,
+ u64 address, size_t size)
+{
+ struct amd_iommu *iommu;
+ struct iommu_cmd cmd;
+ int qdep;
+
+ qdep = dev_data->ats.qdep;
+ iommu = amd_iommu_rlookup_table[dev_data->devid];
+
+ build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, size);
+
+ return iommu_queue_command(iommu, &cmd);
+}
+
+static int device_flush_dte_alias(struct pci_dev *pdev, u16 alias, void *data)
+{
+ struct amd_iommu *iommu = data;
+
+ return iommu_flush_dte(iommu, alias);
+}
+
+/*
+ * Command send function for invalidating a device table entry
+ */
+static int device_flush_dte(struct iommu_dev_data *dev_data)
+{
+ struct amd_iommu *iommu;
+ u16 alias;
+ int ret;
+
+ iommu = amd_iommu_rlookup_table[dev_data->devid];
+
+ if (dev_data->pdev)
+ ret = pci_for_each_dma_alias(dev_data->pdev,
+ device_flush_dte_alias, iommu);
+ else
+ ret = iommu_flush_dte(iommu, dev_data->devid);
+ if (ret)
+ return ret;
+
+ alias = amd_iommu_alias_table[dev_data->devid];
+ if (alias != dev_data->devid) {
+ ret = iommu_flush_dte(iommu, alias);
+ if (ret)
+ return ret;
+ }
+
+ if (dev_data->ats.enabled)
+ ret = device_flush_iotlb(dev_data, 0, ~0UL);
+
+ return ret;
+}
+
+/*
+ * TLB invalidation function which is called from the mapping functions.
+ * It invalidates a single PTE if the range to flush is within a single
+ * page. Otherwise it flushes the whole TLB of the IOMMU.
+ */
+static void __domain_flush_pages(struct protection_domain *domain,
+ u64 address, size_t size, int pde)
+{
+ struct iommu_dev_data *dev_data;
+ struct iommu_cmd cmd;
+ int ret = 0, i;
+
+ build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
+
+ for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
+ if (!domain->dev_iommu[i])
+ continue;
+
+ /*
+ * Devices of this domain are behind this IOMMU
+ * We need a TLB flush
+ */
+ ret |= iommu_queue_command(amd_iommus[i], &cmd);
+ }
+
+ list_for_each_entry(dev_data, &domain->dev_list, list) {
+
+ if (!dev_data->ats.enabled)
+ continue;
+
+ ret |= device_flush_iotlb(dev_data, address, size);
+ }
+
+ WARN_ON(ret);
+}
+
+static void domain_flush_pages(struct protection_domain *domain,
+ u64 address, size_t size)
+{
+ __domain_flush_pages(domain, address, size, 0);
+}
+
+/* Flush the whole IO/TLB for a given protection domain - including PDE */
+static void domain_flush_tlb_pde(struct protection_domain *domain)
+{
+ __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
+}
+
+static void domain_flush_complete(struct protection_domain *domain)
+{
+ int i;
+
+ for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
+ if (domain && !domain->dev_iommu[i])
+ continue;
+
+ /*
+ * Devices of this domain are behind this IOMMU
+ * We need to wait for completion of all commands.
+ */
+ iommu_completion_wait(amd_iommus[i]);
+ }
+}
+
+/* Flush the not present cache if it exists */
+static void domain_flush_np_cache(struct protection_domain *domain,
+ dma_addr_t iova, size_t size)
+{
+ if (unlikely(amd_iommu_np_cache)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&domain->lock, flags);
+ domain_flush_pages(domain, iova, size);
+ domain_flush_complete(domain);
+ spin_unlock_irqrestore(&domain->lock, flags);
+ }
+}
+
+
+/*
+ * This function flushes the DTEs for all devices in domain
+ */
+static void domain_flush_devices(struct protection_domain *domain)
+{
+ struct iommu_dev_data *dev_data;
+
+ list_for_each_entry(dev_data, &domain->dev_list, list)
+ device_flush_dte(dev_data);
+}
+
+/****************************************************************************
+ *
+ * The functions below are used the create the page table mappings for
+ * unity mapped regions.
+ *
+ ****************************************************************************/
+
+static void free_page_list(struct page *freelist)
+{
+ while (freelist != NULL) {
+ unsigned long p = (unsigned long)page_address(freelist);
+ freelist = freelist->freelist;
+ free_page(p);
+ }
+}
+
+static struct page *free_pt_page(unsigned long pt, struct page *freelist)
+{
+ struct page *p = virt_to_page((void *)pt);
+
+ p->freelist = freelist;
+
+ return p;
+}
+
+#define DEFINE_FREE_PT_FN(LVL, FN) \
+static struct page *free_pt_##LVL (unsigned long __pt, struct page *freelist) \
+{ \
+ unsigned long p; \
+ u64 *pt; \
+ int i; \
+ \
+ pt = (u64 *)__pt; \
+ \
+ for (i = 0; i < 512; ++i) { \
+ /* PTE present? */ \
+ if (!IOMMU_PTE_PRESENT(pt[i])) \
+ continue; \
+ \
+ /* Large PTE? */ \
+ if (PM_PTE_LEVEL(pt[i]) == 0 || \
+ PM_PTE_LEVEL(pt[i]) == 7) \
+ continue; \
+ \
+ p = (unsigned long)IOMMU_PTE_PAGE(pt[i]); \
+ freelist = FN(p, freelist); \
+ } \
+ \
+ return free_pt_page((unsigned long)pt, freelist); \
+}
+
+DEFINE_FREE_PT_FN(l2, free_pt_page)
+DEFINE_FREE_PT_FN(l3, free_pt_l2)
+DEFINE_FREE_PT_FN(l4, free_pt_l3)
+DEFINE_FREE_PT_FN(l5, free_pt_l4)
+DEFINE_FREE_PT_FN(l6, free_pt_l5)
+
+static struct page *free_sub_pt(unsigned long root, int mode,
+ struct page *freelist)
+{
+ switch (mode) {
+ case PAGE_MODE_NONE:
+ case PAGE_MODE_7_LEVEL:
+ break;
+ case PAGE_MODE_1_LEVEL:
+ freelist = free_pt_page(root, freelist);
+ break;
+ case PAGE_MODE_2_LEVEL:
+ freelist = free_pt_l2(root, freelist);
+ break;
+ case PAGE_MODE_3_LEVEL:
+ freelist = free_pt_l3(root, freelist);
+ break;
+ case PAGE_MODE_4_LEVEL:
+ freelist = free_pt_l4(root, freelist);
+ break;
+ case PAGE_MODE_5_LEVEL:
+ freelist = free_pt_l5(root, freelist);
+ break;
+ case PAGE_MODE_6_LEVEL:
+ freelist = free_pt_l6(root, freelist);
+ break;
+ default:
+ BUG();
+ }
+
+ return freelist;
+}
+
+static void free_pagetable(struct domain_pgtable *pgtable)
+{
+ struct page *freelist = NULL;
+ unsigned long root;
+
+ if (pgtable->mode == PAGE_MODE_NONE)
+ return;
+
+ BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
+ pgtable->mode > PAGE_MODE_6_LEVEL);
+
+ root = (unsigned long)pgtable->root;
+ freelist = free_sub_pt(root, pgtable->mode, freelist);
+
+ free_page_list(freelist);
+}
+
+/*
+ * This function is used to add another level to an IO page table. Adding
+ * another level increases the size of the address space by 9 bits to a size up
+ * to 64 bits.
+ */
+static bool increase_address_space(struct protection_domain *domain,
+ unsigned long address,
+ gfp_t gfp)
+{
+ struct domain_pgtable pgtable;
+ unsigned long flags;
+ bool ret = true;
+ u64 *pte;
+
+ pte = (void *)get_zeroed_page(gfp);
+ if (!pte)
+ return false;
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ amd_iommu_domain_get_pgtable(domain, &pgtable);
+
+ if (address <= PM_LEVEL_SIZE(pgtable.mode))
+ goto out;
+
+ ret = false;
+ if (WARN_ON_ONCE(pgtable.mode == PAGE_MODE_6_LEVEL))
+ goto out;
+
+ *pte = PM_LEVEL_PDE(pgtable.mode, iommu_virt_to_phys(pgtable.root));
+
+ pgtable.root = pte;
+ pgtable.mode += 1;
+ update_and_flush_device_table(domain, &pgtable);
+ domain_flush_complete(domain);
+
+ /*
+ * Device Table needs to be updated and flushed before the new root can
+ * be published.
+ */
+ amd_iommu_domain_set_pgtable(domain, pte, pgtable.mode);
+
+ pte = NULL;
+ ret = true;
+
+out:
+ spin_unlock_irqrestore(&domain->lock, flags);
+ free_page((unsigned long)pte);
+
+ return ret;
+}
+
+static u64 *alloc_pte(struct protection_domain *domain,
+ unsigned long address,
+ unsigned long page_size,
+ u64 **pte_page,
+ gfp_t gfp,
+ bool *updated)
+{
+ struct domain_pgtable pgtable;
+ int level, end_lvl;
+ u64 *pte, *page;
+
+ BUG_ON(!is_power_of_2(page_size));
+
+ amd_iommu_domain_get_pgtable(domain, &pgtable);
+
+ while (address > PM_LEVEL_SIZE(pgtable.mode)) {
+ /*
+ * Return an error if there is no memory to update the
+ * page-table.
+ */
+ if (!increase_address_space(domain, address, gfp))
+ return NULL;
+
+ /* Read new values to check if update was successful */
+ amd_iommu_domain_get_pgtable(domain, &pgtable);
+ }
+
+
+ level = pgtable.mode - 1;
+ pte = &pgtable.root[PM_LEVEL_INDEX(level, address)];
+ address = PAGE_SIZE_ALIGN(address, page_size);
+ end_lvl = PAGE_SIZE_LEVEL(page_size);
+
+ while (level > end_lvl) {
+ u64 __pte, __npte;
+ int pte_level;
+
+ __pte = *pte;
+ pte_level = PM_PTE_LEVEL(__pte);
+
+ /*
+ * If we replace a series of large PTEs, we need
+ * to tear down all of them.
+ */
+ if (IOMMU_PTE_PRESENT(__pte) &&
+ pte_level == PAGE_MODE_7_LEVEL) {
+ unsigned long count, i;
+ u64 *lpte;
+
+ lpte = first_pte_l7(pte, NULL, &count);
+
+ /*
+ * Unmap the replicated PTEs that still match the
+ * original large mapping
+ */
+ for (i = 0; i < count; ++i)
+ cmpxchg64(&lpte[i], __pte, 0ULL);
+
+ *updated = true;
+ continue;
+ }
+
+ if (!IOMMU_PTE_PRESENT(__pte) ||
+ pte_level == PAGE_MODE_NONE) {
+ page = (u64 *)get_zeroed_page(gfp);
+
+ if (!page)
+ return NULL;
+
+ __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));
+
+ /* pte could have been changed somewhere. */
+ if (cmpxchg64(pte, __pte, __npte) != __pte)
+ free_page((unsigned long)page);
+ else if (IOMMU_PTE_PRESENT(__pte))
+ *updated = true;
+
+ continue;
+ }
+
+ /* No level skipping support yet */
+ if (pte_level != level)
+ return NULL;
+
+ level -= 1;
+
+ pte = IOMMU_PTE_PAGE(__pte);
+
+ if (pte_page && level == end_lvl)
+ *pte_page = pte;
+
+ pte = &pte[PM_LEVEL_INDEX(level, address)];
+ }
+
+ return pte;
+}
+
+/*
+ * This function checks if there is a PTE for a given dma address. If
+ * there is one, it returns the pointer to it.
+ */
+static u64 *fetch_pte(struct protection_domain *domain,
+ unsigned long address,
+ unsigned long *page_size)
+{
+ struct domain_pgtable pgtable;
+ int level;
+ u64 *pte;
+
+ *page_size = 0;
+
+ amd_iommu_domain_get_pgtable(domain, &pgtable);
+
+ if (address > PM_LEVEL_SIZE(pgtable.mode))
+ return NULL;
+
+ level = pgtable.mode - 1;
+ pte = &pgtable.root[PM_LEVEL_INDEX(level, address)];
+ *page_size = PTE_LEVEL_PAGE_SIZE(level);
+
+ while (level > 0) {
+
+ /* Not Present */
+ if (!IOMMU_PTE_PRESENT(*pte))
+ return NULL;
+
+ /* Large PTE */
+ if (PM_PTE_LEVEL(*pte) == 7 ||
+ PM_PTE_LEVEL(*pte) == 0)
+ break;
+
+ /* No level skipping support yet */
+ if (PM_PTE_LEVEL(*pte) != level)
+ return NULL;
+
+ level -= 1;
+
+ /* Walk to the next level */
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[PM_LEVEL_INDEX(level, address)];
+ *page_size = PTE_LEVEL_PAGE_SIZE(level);
+ }
+
+ /*
+ * If we have a series of large PTEs, make
+ * sure to return a pointer to the first one.
+ */
+ if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
+ pte = first_pte_l7(pte, page_size, NULL);
+
+ return pte;
+}
+
+static struct page *free_clear_pte(u64 *pte, u64 pteval, struct page *freelist)
+{
+ unsigned long pt;
+ int mode;
+
+ while (cmpxchg64(pte, pteval, 0) != pteval) {
+ pr_warn("AMD-Vi: IOMMU pte changed since we read it\n");
+ pteval = *pte;
+ }
+
+ if (!IOMMU_PTE_PRESENT(pteval))
+ return freelist;
+
+ pt = (unsigned long)IOMMU_PTE_PAGE(pteval);
+ mode = IOMMU_PTE_MODE(pteval);
+
+ return free_sub_pt(pt, mode, freelist);
+}
+
+/*
+ * Generic mapping functions. It maps a physical address into a DMA
+ * address space. It allocates the page table pages if necessary.
+ * In the future it can be extended to a generic mapping function
+ * supporting all features of AMD IOMMU page tables like level skipping
+ * and full 64 bit address spaces.
+ */
+static int iommu_map_page(struct protection_domain *dom,
+ unsigned long bus_addr,
+ unsigned long phys_addr,
+ unsigned long page_size,
+ int prot,
+ gfp_t gfp)
+{
+ struct page *freelist = NULL;
+ bool updated = false;
+ u64 __pte, *pte;
+ int ret, i, count;
+
+ BUG_ON(!IS_ALIGNED(bus_addr, page_size));
+ BUG_ON(!IS_ALIGNED(phys_addr, page_size));
+
+ ret = -EINVAL;
+ if (!(prot & IOMMU_PROT_MASK))
+ goto out;
+
+ count = PAGE_SIZE_PTE_COUNT(page_size);
+ pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp, &updated);
+
+ ret = -ENOMEM;
+ if (!pte)
+ goto out;
+
+ for (i = 0; i < count; ++i)
+ freelist = free_clear_pte(&pte[i], pte[i], freelist);
+
+ if (freelist != NULL)
+ updated = true;
+
+ if (count > 1) {
+ __pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size);
+ __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
+ } else
+ __pte = __sme_set(phys_addr) | IOMMU_PTE_PR | IOMMU_PTE_FC;
+
+ if (prot & IOMMU_PROT_IR)
+ __pte |= IOMMU_PTE_IR;
+ if (prot & IOMMU_PROT_IW)
+ __pte |= IOMMU_PTE_IW;
+
+ for (i = 0; i < count; ++i)
+ pte[i] = __pte;
+
+ ret = 0;
+
+out:
+ if (updated) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&dom->lock, flags);
+ /*
+ * Flush domain TLB(s) and wait for completion. Any Device-Table
+ * Updates and flushing already happened in
+ * increase_address_space().
+ */
+ domain_flush_tlb_pde(dom);
+ domain_flush_complete(dom);
+ spin_unlock_irqrestore(&dom->lock, flags);
+ }
+
+ /* Everything flushed out, free pages now */
+ free_page_list(freelist);
+
+ return ret;
+}
+
+static unsigned long iommu_unmap_page(struct protection_domain *dom,
+ unsigned long bus_addr,
+ unsigned long page_size)
+{
+ unsigned long long unmapped;
+ unsigned long unmap_size;
+ u64 *pte;
+
+ BUG_ON(!is_power_of_2(page_size));
+
+ unmapped = 0;
+
+ while (unmapped < page_size) {
+
+ pte = fetch_pte(dom, bus_addr, &unmap_size);
+
+ if (pte) {
+ int i, count;
+
+ count = PAGE_SIZE_PTE_COUNT(unmap_size);
+ for (i = 0; i < count; i++)
+ pte[i] = 0ULL;
+ }
+
+ bus_addr = (bus_addr & ~(unmap_size - 1)) + unmap_size;
+ unmapped += unmap_size;
+ }
+
+ BUG_ON(unmapped && !is_power_of_2(unmapped));
+
+ return unmapped;
+}
+
+/****************************************************************************
+ *
+ * The next functions belong to the domain allocation. A domain is
+ * allocated for every IOMMU as the default domain. If device isolation
+ * is enabled, every device get its own domain. The most important thing
+ * about domains is the page table mapping the DMA address space they
+ * contain.
+ *
+ ****************************************************************************/
+
+static u16 domain_id_alloc(void)
+{
+ int id;
+
+ spin_lock(&pd_bitmap_lock);
+ id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
+ BUG_ON(id == 0);
+ if (id > 0 && id < MAX_DOMAIN_ID)
+ __set_bit(id, amd_iommu_pd_alloc_bitmap);
+ else
+ id = 0;
+ spin_unlock(&pd_bitmap_lock);
+
+ return id;
+}
+
+static void domain_id_free(int id)
+{
+ spin_lock(&pd_bitmap_lock);
+ if (id > 0 && id < MAX_DOMAIN_ID)
+ __clear_bit(id, amd_iommu_pd_alloc_bitmap);
+ spin_unlock(&pd_bitmap_lock);
+}
+
+static void free_gcr3_tbl_level1(u64 *tbl)
+{
+ u64 *ptr;
+ int i;
+
+ for (i = 0; i < 512; ++i) {
+ if (!(tbl[i] & GCR3_VALID))
+ continue;
+
+ ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
+
+ free_page((unsigned long)ptr);
+ }
+}
+
+static void free_gcr3_tbl_level2(u64 *tbl)
+{
+ u64 *ptr;
+ int i;
+
+ for (i = 0; i < 512; ++i) {
+ if (!(tbl[i] & GCR3_VALID))
+ continue;
+
+ ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
+
+ free_gcr3_tbl_level1(ptr);
+ }
+}
+
+static void free_gcr3_table(struct protection_domain *domain)
+{
+ if (domain->glx == 2)
+ free_gcr3_tbl_level2(domain->gcr3_tbl);
+ else if (domain->glx == 1)
+ free_gcr3_tbl_level1(domain->gcr3_tbl);
+ else
+ BUG_ON(domain->glx != 0);
+
+ free_page((unsigned long)domain->gcr3_tbl);
+}
+
+static void set_dte_entry(u16 devid, struct protection_domain *domain,
+ struct domain_pgtable *pgtable,
+ bool ats, bool ppr)
+{
+ u64 pte_root = 0;
+ u64 flags = 0;
+ u32 old_domid;
+
+ if (pgtable->mode != PAGE_MODE_NONE)
+ pte_root = iommu_virt_to_phys(pgtable->root);
+
+ pte_root |= (pgtable->mode & DEV_ENTRY_MODE_MASK)
+ << DEV_ENTRY_MODE_SHIFT;
+ pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V | DTE_FLAG_TV;
+
+ flags = amd_iommu_dev_table[devid].data[1];
+
+ if (ats)
+ flags |= DTE_FLAG_IOTLB;
+
+ if (ppr) {
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+ if (iommu_feature(iommu, FEATURE_EPHSUP))
+ pte_root |= 1ULL << DEV_ENTRY_PPR;
+ }
+
+ if (domain->flags & PD_IOMMUV2_MASK) {
+ u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
+ u64 glx = domain->glx;
+ u64 tmp;
+
+ pte_root |= DTE_FLAG_GV;
+ pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
+
+ /* First mask out possible old values for GCR3 table */
+ tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
+ flags &= ~tmp;
+
+ tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
+ flags &= ~tmp;
+
+ /* Encode GCR3 table into DTE */
+ tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
+ pte_root |= tmp;
+
+ tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
+ flags |= tmp;
+
+ tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
+ flags |= tmp;
+ }
+
+ flags &= ~DEV_DOMID_MASK;
+ flags |= domain->id;
+
+ old_domid = amd_iommu_dev_table[devid].data[1] & DEV_DOMID_MASK;
+ amd_iommu_dev_table[devid].data[1] = flags;
+ amd_iommu_dev_table[devid].data[0] = pte_root;
+
+ /*
+ * A kdump kernel might be replacing a domain ID that was copied from
+ * the previous kernel--if so, it needs to flush the translation cache
+ * entries for the old domain ID that is being overwritten
+ */
+ if (old_domid) {
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+ amd_iommu_flush_tlb_domid(iommu, old_domid);
+ }
+}
+
+static void clear_dte_entry(u16 devid)
+{
+ /* remove entry from the device table seen by the hardware */
+ amd_iommu_dev_table[devid].data[0] = DTE_FLAG_V | DTE_FLAG_TV;
+ amd_iommu_dev_table[devid].data[1] &= DTE_FLAG_MASK;
+
+ amd_iommu_apply_erratum_63(devid);
+}
+
+static void do_attach(struct iommu_dev_data *dev_data,
+ struct protection_domain *domain)
+{
+ struct domain_pgtable pgtable;
+ struct amd_iommu *iommu;
+ bool ats;
+
+ iommu = amd_iommu_rlookup_table[dev_data->devid];
+ ats = dev_data->ats.enabled;
+
+ /* Update data structures */
+ dev_data->domain = domain;
+ list_add(&dev_data->list, &domain->dev_list);
+
+ /* Do reference counting */
+ domain->dev_iommu[iommu->index] += 1;
+ domain->dev_cnt += 1;
+
+ /* Update device table */
+ amd_iommu_domain_get_pgtable(domain, &pgtable);
+ set_dte_entry(dev_data->devid, domain, &pgtable,
+ ats, dev_data->iommu_v2);
+ clone_aliases(dev_data->pdev);
+
+ device_flush_dte(dev_data);
+}
+
+static void do_detach(struct iommu_dev_data *dev_data)
+{
+ struct protection_domain *domain = dev_data->domain;
+ struct amd_iommu *iommu;
+
+ iommu = amd_iommu_rlookup_table[dev_data->devid];
+
+ /* Update data structures */
+ dev_data->domain = NULL;
+ list_del(&dev_data->list);
+ clear_dte_entry(dev_data->devid);
+ clone_aliases(dev_data->pdev);
+
+ /* Flush the DTE entry */
+ device_flush_dte(dev_data);
+
+ /* Flush IOTLB */
+ domain_flush_tlb_pde(domain);
+
+ /* Wait for the flushes to finish */
+ domain_flush_complete(domain);
+
+ /* decrease reference counters - needs to happen after the flushes */
+ domain->dev_iommu[iommu->index] -= 1;
+ domain->dev_cnt -= 1;
+}
+
+static void pdev_iommuv2_disable(struct pci_dev *pdev)
+{
+ pci_disable_ats(pdev);
+ pci_disable_pri(pdev);
+ pci_disable_pasid(pdev);
+}
+
+/* FIXME: Change generic reset-function to do the same */
+static int pri_reset_while_enabled(struct pci_dev *pdev)
+{
+ u16 control;
+ int pos;
+
+ pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
+ if (!pos)
+ return -EINVAL;
+
+ pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control);
+ control |= PCI_PRI_CTRL_RESET;
+ pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control);
+
+ return 0;
+}
+
+static int pdev_iommuv2_enable(struct pci_dev *pdev)
+{
+ bool reset_enable;
+ int reqs, ret;
+
+ /* FIXME: Hardcode number of outstanding requests for now */
+ reqs = 32;
+ if (pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE))
+ reqs = 1;
+ reset_enable = pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_ENABLE_RESET);
+
+ /* Only allow access to user-accessible pages */
+ ret = pci_enable_pasid(pdev, 0);
+ if (ret)
+ goto out_err;
+
+ /* First reset the PRI state of the device */
+ ret = pci_reset_pri(pdev);
+ if (ret)
+ goto out_err;
+
+ /* Enable PRI */
+ ret = pci_enable_pri(pdev, reqs);
+ if (ret)
+ goto out_err;
+
+ if (reset_enable) {
+ ret = pri_reset_while_enabled(pdev);
+ if (ret)
+ goto out_err;
+ }
+
+ ret = pci_enable_ats(pdev, PAGE_SHIFT);
+ if (ret)
+ goto out_err;
+
+ return 0;
+
+out_err:
+ pci_disable_pri(pdev);
+ pci_disable_pasid(pdev);
+
+ return ret;
+}
+
+/*
+ * If a device is not yet associated with a domain, this function makes the
+ * device visible in the domain
+ */
+static int attach_device(struct device *dev,
+ struct protection_domain *domain)
+{
+ struct iommu_dev_data *dev_data;
+ struct pci_dev *pdev;
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ dev_data = dev_iommu_priv_get(dev);
+
+ spin_lock(&dev_data->lock);
+
+ ret = -EBUSY;
+ if (dev_data->domain != NULL)
+ goto out;
+
+ if (!dev_is_pci(dev))
+ goto skip_ats_check;
+
+ pdev = to_pci_dev(dev);
+ if (domain->flags & PD_IOMMUV2_MASK) {
+ struct iommu_domain *def_domain = iommu_get_dma_domain(dev);
+
+ ret = -EINVAL;
+ if (def_domain->type != IOMMU_DOMAIN_IDENTITY)
+ goto out;
+
+ if (dev_data->iommu_v2) {
+ if (pdev_iommuv2_enable(pdev) != 0)
+ goto out;
+
+ dev_data->ats.enabled = true;
+ dev_data->ats.qdep = pci_ats_queue_depth(pdev);
+ dev_data->pri_tlp = pci_prg_resp_pasid_required(pdev);
+ }
+ } else if (amd_iommu_iotlb_sup &&
+ pci_enable_ats(pdev, PAGE_SHIFT) == 0) {
+ dev_data->ats.enabled = true;
+ dev_data->ats.qdep = pci_ats_queue_depth(pdev);
+ }
+
+skip_ats_check:
+ ret = 0;
+
+ do_attach(dev_data, domain);
+
+ /*
+ * We might boot into a crash-kernel here. The crashed kernel
+ * left the caches in the IOMMU dirty. So we have to flush
+ * here to evict all dirty stuff.
+ */
+ domain_flush_tlb_pde(domain);
+
+ domain_flush_complete(domain);
+
+out:
+ spin_unlock(&dev_data->lock);
+
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return ret;
+}
+
+/*
+ * Removes a device from a protection domain (with devtable_lock held)
+ */
+static void detach_device(struct device *dev)
+{
+ struct protection_domain *domain;
+ struct iommu_dev_data *dev_data;
+ unsigned long flags;
+
+ dev_data = dev_iommu_priv_get(dev);
+ domain = dev_data->domain;
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ spin_lock(&dev_data->lock);
+
+ /*
+ * First check if the device is still attached. It might already
+ * be detached from its domain because the generic
+ * iommu_detach_group code detached it and we try again here in
+ * our alias handling.
+ */
+ if (WARN_ON(!dev_data->domain))
+ goto out;
+
+ do_detach(dev_data);
+
+ if (!dev_is_pci(dev))
+ goto out;
+
+ if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2)
+ pdev_iommuv2_disable(to_pci_dev(dev));
+ else if (dev_data->ats.enabled)
+ pci_disable_ats(to_pci_dev(dev));
+
+ dev_data->ats.enabled = false;
+
+out:
+ spin_unlock(&dev_data->lock);
+
+ spin_unlock_irqrestore(&domain->lock, flags);
+}
+
+static struct iommu_device *amd_iommu_probe_device(struct device *dev)
+{
+ struct iommu_device *iommu_dev;
+ struct amd_iommu *iommu;
+ int ret, devid;
+
+ if (!check_device(dev))
+ return ERR_PTR(-ENODEV);
+
+ devid = get_device_id(dev);
+ if (devid < 0)
+ return ERR_PTR(devid);
+
+ iommu = amd_iommu_rlookup_table[devid];
+
+ if (dev_iommu_priv_get(dev))
+ return &iommu->iommu;
+
+ ret = iommu_init_device(dev);
+ if (ret) {
+ if (ret != -ENOTSUPP)
+ dev_err(dev, "Failed to initialize - trying to proceed anyway\n");
+ iommu_dev = ERR_PTR(ret);
+ iommu_ignore_device(dev);
+ } else {
+ amd_iommu_set_pci_msi_domain(dev, iommu);
+ iommu_dev = &iommu->iommu;
+ }
+
+ iommu_completion_wait(iommu);
+
+ return iommu_dev;
+}
+
+static void amd_iommu_probe_finalize(struct device *dev)
+{
+ struct iommu_domain *domain;
+
+ /* Domains are initialized for this device - have a look what we ended up with */
+ domain = iommu_get_domain_for_dev(dev);
+ if (domain->type == IOMMU_DOMAIN_DMA)
+ iommu_setup_dma_ops(dev, IOVA_START_PFN << PAGE_SHIFT, 0);
+}
+
+static void amd_iommu_release_device(struct device *dev)
+{
+ int devid = get_device_id(dev);
+ struct amd_iommu *iommu;
+
+ if (!check_device(dev))
+ return;
+
+ iommu = amd_iommu_rlookup_table[devid];
+
+ amd_iommu_uninit_device(dev);
+ iommu_completion_wait(iommu);
+}
+
+static struct iommu_group *amd_iommu_device_group(struct device *dev)
+{
+ if (dev_is_pci(dev))
+ return pci_device_group(dev);
+
+ return acpihid_device_group(dev);
+}
+
+static int amd_iommu_domain_get_attr(struct iommu_domain *domain,
+ enum iommu_attr attr, void *data)
+{
+ switch (domain->type) {
+ case IOMMU_DOMAIN_UNMANAGED:
+ return -ENODEV;
+ case IOMMU_DOMAIN_DMA:
+ switch (attr) {
+ case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
+ *(int *)data = !amd_iommu_unmap_flush;
+ return 0;
+ default:
+ return -ENODEV;
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+}
+
+/*****************************************************************************
+ *
+ * The next functions belong to the dma_ops mapping/unmapping code.
+ *
+ *****************************************************************************/
+
+static void update_device_table(struct protection_domain *domain,
+ struct domain_pgtable *pgtable)
+{
+ struct iommu_dev_data *dev_data;
+
+ list_for_each_entry(dev_data, &domain->dev_list, list) {
+ set_dte_entry(dev_data->devid, domain, pgtable,
+ dev_data->ats.enabled, dev_data->iommu_v2);
+ clone_aliases(dev_data->pdev);
+ }
+}
+
+static void update_and_flush_device_table(struct protection_domain *domain,
+ struct domain_pgtable *pgtable)
+{
+ update_device_table(domain, pgtable);
+ domain_flush_devices(domain);
+}
+
+static void update_domain(struct protection_domain *domain)
+{
+ struct domain_pgtable pgtable;
+
+ /* Update device table */
+ amd_iommu_domain_get_pgtable(domain, &pgtable);
+ update_and_flush_device_table(domain, &pgtable);
+
+ /* Flush domain TLB(s) and wait for completion */
+ domain_flush_tlb_pde(domain);
+ domain_flush_complete(domain);
+}
+
+int __init amd_iommu_init_api(void)
+{
+ int ret, err = 0;
+
+ ret = iova_cache_get();
+ if (ret)
+ return ret;
+
+ err = bus_set_iommu(&pci_bus_type, &amd_iommu_ops);
+ if (err)
+ return err;
+#ifdef CONFIG_ARM_AMBA
+ err = bus_set_iommu(&amba_bustype, &amd_iommu_ops);
+ if (err)
+ return err;
+#endif
+ err = bus_set_iommu(&platform_bus_type, &amd_iommu_ops);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+int __init amd_iommu_init_dma_ops(void)
+{
+ swiotlb = (iommu_default_passthrough() || sme_me_mask) ? 1 : 0;
+
+ if (amd_iommu_unmap_flush)
+ pr_info("IO/TLB flush on unmap enabled\n");
+ else
+ pr_info("Lazy IO/TLB flushing enabled\n");
+
+ return 0;
+
+}
+
+/*****************************************************************************
+ *
+ * The following functions belong to the exported interface of AMD IOMMU
+ *
+ * This interface allows access to lower level functions of the IOMMU
+ * like protection domain handling and assignement of devices to domains
+ * which is not possible with the dma_ops interface.
+ *
+ *****************************************************************************/
+
+static void cleanup_domain(struct protection_domain *domain)
+{
+ struct iommu_dev_data *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ while (!list_empty(&domain->dev_list)) {
+ entry = list_first_entry(&domain->dev_list,
+ struct iommu_dev_data, list);
+ BUG_ON(!entry->domain);
+ do_detach(entry);
+ }
+
+ spin_unlock_irqrestore(&domain->lock, flags);
+}
+
+static void protection_domain_free(struct protection_domain *domain)
+{
+ struct domain_pgtable pgtable;
+
+ if (!domain)
+ return;
+
+ if (domain->id)
+ domain_id_free(domain->id);
+
+ amd_iommu_domain_get_pgtable(domain, &pgtable);
+ amd_iommu_domain_clr_pt_root(domain);
+ free_pagetable(&pgtable);
+
+ kfree(domain);
+}
+
+static int protection_domain_init(struct protection_domain *domain, int mode)
+{
+ u64 *pt_root = NULL;
+
+ BUG_ON(mode < PAGE_MODE_NONE || mode > PAGE_MODE_6_LEVEL);
+
+ spin_lock_init(&domain->lock);
+ domain->id = domain_id_alloc();
+ if (!domain->id)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&domain->dev_list);
+
+ if (mode != PAGE_MODE_NONE) {
+ pt_root = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!pt_root)
+ return -ENOMEM;
+ }
+
+ amd_iommu_domain_set_pgtable(domain, pt_root, mode);
+
+ return 0;
+}
+
+static struct protection_domain *protection_domain_alloc(int mode)
+{
+ struct protection_domain *domain;
+
+ domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+ if (!domain)
+ return NULL;
+
+ if (protection_domain_init(domain, mode))
+ goto out_err;
+
+ return domain;
+
+out_err:
+ kfree(domain);
+
+ return NULL;
+}
+
+static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
+{
+ struct protection_domain *domain;
+ int mode = DEFAULT_PGTABLE_LEVEL;
+
+ if (type == IOMMU_DOMAIN_IDENTITY)
+ mode = PAGE_MODE_NONE;
+
+ domain = protection_domain_alloc(mode);
+ if (!domain)
+ return NULL;
+
+ domain->domain.geometry.aperture_start = 0;
+ domain->domain.geometry.aperture_end = ~0ULL;
+ domain->domain.geometry.force_aperture = true;
+
+ if (type == IOMMU_DOMAIN_DMA &&
+ iommu_get_dma_cookie(&domain->domain) == -ENOMEM)
+ goto free_domain;
+
+ return &domain->domain;
+
+free_domain:
+ protection_domain_free(domain);
+
+ return NULL;
+}
+
+static void amd_iommu_domain_free(struct iommu_domain *dom)
+{
+ struct protection_domain *domain;
+
+ domain = to_pdomain(dom);
+
+ if (domain->dev_cnt > 0)
+ cleanup_domain(domain);
+
+ BUG_ON(domain->dev_cnt != 0);
+
+ if (!dom)
+ return;
+
+ if (dom->type == IOMMU_DOMAIN_DMA)
+ iommu_put_dma_cookie(&domain->domain);
+
+ if (domain->flags & PD_IOMMUV2_MASK)
+ free_gcr3_table(domain);
+
+ protection_domain_free(domain);
+}
+
+static void amd_iommu_detach_device(struct iommu_domain *dom,
+ struct device *dev)
+{
+ struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
+ struct amd_iommu *iommu;
+ int devid;
+
+ if (!check_device(dev))
+ return;
+
+ devid = get_device_id(dev);
+ if (devid < 0)
+ return;
+
+ if (dev_data->domain != NULL)
+ detach_device(dev);
+
+ iommu = amd_iommu_rlookup_table[devid];
+ if (!iommu)
+ return;
+
+#ifdef CONFIG_IRQ_REMAP
+ if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
+ (dom->type == IOMMU_DOMAIN_UNMANAGED))
+ dev_data->use_vapic = 0;
+#endif
+
+ iommu_completion_wait(iommu);
+}
+
+static int amd_iommu_attach_device(struct iommu_domain *dom,
+ struct device *dev)
+{
+ struct protection_domain *domain = to_pdomain(dom);
+ struct iommu_dev_data *dev_data;
+ struct amd_iommu *iommu;
+ int ret;
+
+ if (!check_device(dev))
+ return -EINVAL;
+
+ dev_data = dev_iommu_priv_get(dev);
+ dev_data->defer_attach = false;
+
+ iommu = amd_iommu_rlookup_table[dev_data->devid];
+ if (!iommu)
+ return -EINVAL;
+
+ if (dev_data->domain)
+ detach_device(dev);
+
+ ret = attach_device(dev, domain);
+
+#ifdef CONFIG_IRQ_REMAP
+ if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
+ if (dom->type == IOMMU_DOMAIN_UNMANAGED)
+ dev_data->use_vapic = 1;
+ else
+ dev_data->use_vapic = 0;
+ }
+#endif
+
+ iommu_completion_wait(iommu);
+
+ return ret;
+}
+
+static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
+ phys_addr_t paddr, size_t page_size, int iommu_prot,
+ gfp_t gfp)
+{
+ struct protection_domain *domain = to_pdomain(dom);
+ struct domain_pgtable pgtable;
+ int prot = 0;
+ int ret;
+
+ amd_iommu_domain_get_pgtable(domain, &pgtable);
+ if (pgtable.mode == PAGE_MODE_NONE)
+ return -EINVAL;
+
+ if (iommu_prot & IOMMU_READ)
+ prot |= IOMMU_PROT_IR;
+ if (iommu_prot & IOMMU_WRITE)
+ prot |= IOMMU_PROT_IW;
+
+ ret = iommu_map_page(domain, iova, paddr, page_size, prot, gfp);
+
+ domain_flush_np_cache(domain, iova, page_size);
+
+ return ret;
+}
+
+static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
+ size_t page_size,
+ struct iommu_iotlb_gather *gather)
+{
+ struct protection_domain *domain = to_pdomain(dom);
+ struct domain_pgtable pgtable;
+
+ amd_iommu_domain_get_pgtable(domain, &pgtable);
+ if (pgtable.mode == PAGE_MODE_NONE)
+ return 0;
+
+ return iommu_unmap_page(domain, iova, page_size);
+}
+
+static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
+ dma_addr_t iova)
+{
+ struct protection_domain *domain = to_pdomain(dom);
+ unsigned long offset_mask, pte_pgsize;
+ struct domain_pgtable pgtable;
+ u64 *pte, __pte;
+
+ amd_iommu_domain_get_pgtable(domain, &pgtable);
+ if (pgtable.mode == PAGE_MODE_NONE)
+ return iova;
+
+ pte = fetch_pte(domain, iova, &pte_pgsize);
+
+ if (!pte || !IOMMU_PTE_PRESENT(*pte))
+ return 0;
+
+ offset_mask = pte_pgsize - 1;
+ __pte = __sme_clr(*pte & PM_ADDR_MASK);
+
+ return (__pte & ~offset_mask) | (iova & offset_mask);
+}
+
+static bool amd_iommu_capable(enum iommu_cap cap)
+{
+ switch (cap) {
+ case IOMMU_CAP_CACHE_COHERENCY:
+ return true;
+ case IOMMU_CAP_INTR_REMAP:
+ return (irq_remapping_enabled == 1);
+ case IOMMU_CAP_NOEXEC:
+ return false;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+static void amd_iommu_get_resv_regions(struct device *dev,
+ struct list_head *head)
+{
+ struct iommu_resv_region *region;
+ struct unity_map_entry *entry;
+ int devid;
+
+ devid = get_device_id(dev);
+ if (devid < 0)
+ return;
+
+ list_for_each_entry(entry, &amd_iommu_unity_map, list) {
+ int type, prot = 0;
+ size_t length;
+
+ if (devid < entry->devid_start || devid > entry->devid_end)
+ continue;
+
+ type = IOMMU_RESV_DIRECT;
+ length = entry->address_end - entry->address_start;
+ if (entry->prot & IOMMU_PROT_IR)
+ prot |= IOMMU_READ;
+ if (entry->prot & IOMMU_PROT_IW)
+ prot |= IOMMU_WRITE;
+ if (entry->prot & IOMMU_UNITY_MAP_FLAG_EXCL_RANGE)
+ /* Exclusion range */
+ type = IOMMU_RESV_RESERVED;
+
+ region = iommu_alloc_resv_region(entry->address_start,
+ length, prot, type);
+ if (!region) {
+ dev_err(dev, "Out of memory allocating dm-regions\n");
+ return;
+ }
+ list_add_tail(&region->list, head);
+ }
+
+ region = iommu_alloc_resv_region(MSI_RANGE_START,
+ MSI_RANGE_END - MSI_RANGE_START + 1,
+ 0, IOMMU_RESV_MSI);
+ if (!region)
+ return;
+ list_add_tail(&region->list, head);
+
+ region = iommu_alloc_resv_region(HT_RANGE_START,
+ HT_RANGE_END - HT_RANGE_START + 1,
+ 0, IOMMU_RESV_RESERVED);
+ if (!region)
+ return;
+ list_add_tail(&region->list, head);
+}
+
+bool amd_iommu_is_attach_deferred(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
+
+ return dev_data->defer_attach;
+}
+EXPORT_SYMBOL_GPL(amd_iommu_is_attach_deferred);
+
+static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
+{
+ struct protection_domain *dom = to_pdomain(domain);
+ unsigned long flags;
+
+ spin_lock_irqsave(&dom->lock, flags);
+ domain_flush_tlb_pde(dom);
+ domain_flush_complete(dom);
+ spin_unlock_irqrestore(&dom->lock, flags);
+}
+
+static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
+{
+ amd_iommu_flush_iotlb_all(domain);
+}
+
+static int amd_iommu_def_domain_type(struct device *dev)
+{
+ struct iommu_dev_data *dev_data;
+
+ dev_data = dev_iommu_priv_get(dev);
+ if (!dev_data)
+ return 0;
+
+ /*
+ * Do not identity map IOMMUv2 capable devices when memory encryption is
+ * active, because some of those devices (AMD GPUs) don't have the
+ * encryption bit in their DMA-mask and require remapping.
+ */
+ if (!mem_encrypt_active() && dev_data->iommu_v2)
+ return IOMMU_DOMAIN_IDENTITY;
+
+ return 0;
+}
+
+const struct iommu_ops amd_iommu_ops = {
+ .capable = amd_iommu_capable,
+ .domain_alloc = amd_iommu_domain_alloc,
+ .domain_free = amd_iommu_domain_free,
+ .attach_dev = amd_iommu_attach_device,
+ .detach_dev = amd_iommu_detach_device,
+ .map = amd_iommu_map,
+ .unmap = amd_iommu_unmap,
+ .iova_to_phys = amd_iommu_iova_to_phys,
+ .probe_device = amd_iommu_probe_device,
+ .release_device = amd_iommu_release_device,
+ .probe_finalize = amd_iommu_probe_finalize,
+ .device_group = amd_iommu_device_group,
+ .domain_get_attr = amd_iommu_domain_get_attr,
+ .get_resv_regions = amd_iommu_get_resv_regions,
+ .put_resv_regions = generic_iommu_put_resv_regions,
+ .is_attach_deferred = amd_iommu_is_attach_deferred,
+ .pgsize_bitmap = AMD_IOMMU_PGSIZES,
+ .flush_iotlb_all = amd_iommu_flush_iotlb_all,
+ .iotlb_sync = amd_iommu_iotlb_sync,
+ .def_domain_type = amd_iommu_def_domain_type,
+};
+
+/*****************************************************************************
+ *
+ * The next functions do a basic initialization of IOMMU for pass through
+ * mode
+ *
+ * In passthrough mode the IOMMU is initialized and enabled but not used for
+ * DMA-API translation.
+ *
+ *****************************************************************************/
+
+/* IOMMUv2 specific functions */
+int amd_iommu_register_ppr_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&ppr_notifier, nb);
+}
+EXPORT_SYMBOL(amd_iommu_register_ppr_notifier);
+
+int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&ppr_notifier, nb);
+}
+EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier);
+
+void amd_iommu_domain_direct_map(struct iommu_domain *dom)
+{
+ struct protection_domain *domain = to_pdomain(dom);
+ struct domain_pgtable pgtable;
+ unsigned long flags;
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ /* First save pgtable configuration*/
+ amd_iommu_domain_get_pgtable(domain, &pgtable);
+
+ /* Remove page-table from domain */
+ amd_iommu_domain_clr_pt_root(domain);
+
+ /* Make changes visible to IOMMUs */
+ update_domain(domain);
+
+ /* Page-table is not visible to IOMMU anymore, so free it */
+ free_pagetable(&pgtable);
+
+ spin_unlock_irqrestore(&domain->lock, flags);
+}
+EXPORT_SYMBOL(amd_iommu_domain_direct_map);
+
+int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids)
+{
+ struct protection_domain *domain = to_pdomain(dom);
+ unsigned long flags;
+ int levels, ret;
+
+ if (pasids <= 0 || pasids > (PASID_MASK + 1))
+ return -EINVAL;
+
+ /* Number of GCR3 table levels required */
+ for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9)
+ levels += 1;
+
+ if (levels > amd_iommu_max_glx_val)
+ return -EINVAL;
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ /*
+ * Save us all sanity checks whether devices already in the
+ * domain support IOMMUv2. Just force that the domain has no
+ * devices attached when it is switched into IOMMUv2 mode.
+ */
+ ret = -EBUSY;
+ if (domain->dev_cnt > 0 || domain->flags & PD_IOMMUV2_MASK)
+ goto out;
+
+ ret = -ENOMEM;
+ domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC);
+ if (domain->gcr3_tbl == NULL)
+ goto out;
+
+ domain->glx = levels;
+ domain->flags |= PD_IOMMUV2_MASK;
+
+ update_domain(domain);
+
+ ret = 0;
+
+out:
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(amd_iommu_domain_enable_v2);
+
+static int __flush_pasid(struct protection_domain *domain, u32 pasid,
+ u64 address, bool size)
+{
+ struct iommu_dev_data *dev_data;
+ struct iommu_cmd cmd;
+ int i, ret;
+
+ if (!(domain->flags & PD_IOMMUV2_MASK))
+ return -EINVAL;
+
+ build_inv_iommu_pasid(&cmd, domain->id, pasid, address, size);
+
+ /*
+ * IOMMU TLB needs to be flushed before Device TLB to
+ * prevent device TLB refill from IOMMU TLB
+ */
+ for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
+ if (domain->dev_iommu[i] == 0)
+ continue;
+
+ ret = iommu_queue_command(amd_iommus[i], &cmd);
+ if (ret != 0)
+ goto out;
+ }
+
+ /* Wait until IOMMU TLB flushes are complete */
+ domain_flush_complete(domain);
+
+ /* Now flush device TLBs */
+ list_for_each_entry(dev_data, &domain->dev_list, list) {
+ struct amd_iommu *iommu;
+ int qdep;
+
+ /*
+ There might be non-IOMMUv2 capable devices in an IOMMUv2
+ * domain.
+ */
+ if (!dev_data->ats.enabled)
+ continue;
+
+ qdep = dev_data->ats.qdep;
+ iommu = amd_iommu_rlookup_table[dev_data->devid];
+
+ build_inv_iotlb_pasid(&cmd, dev_data->devid, pasid,
+ qdep, address, size);
+
+ ret = iommu_queue_command(iommu, &cmd);
+ if (ret != 0)
+ goto out;
+ }
+
+ /* Wait until all device TLBs are flushed */
+ domain_flush_complete(domain);
+
+ ret = 0;
+
+out:
+
+ return ret;
+}
+
+static int __amd_iommu_flush_page(struct protection_domain *domain, u32 pasid,
+ u64 address)
+{
+ return __flush_pasid(domain, pasid, address, false);
+}
+
+int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid,
+ u64 address)
+{
+ struct protection_domain *domain = to_pdomain(dom);
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&domain->lock, flags);
+ ret = __amd_iommu_flush_page(domain, pasid, address);
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(amd_iommu_flush_page);
+
+static int __amd_iommu_flush_tlb(struct protection_domain *domain, u32 pasid)
+{
+ return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
+ true);
+}
+
+int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid)
+{
+ struct protection_domain *domain = to_pdomain(dom);
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&domain->lock, flags);
+ ret = __amd_iommu_flush_tlb(domain, pasid);
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(amd_iommu_flush_tlb);
+
+static u64 *__get_gcr3_pte(u64 *root, int level, u32 pasid, bool alloc)
+{
+ int index;
+ u64 *pte;
+
+ while (true) {
+
+ index = (pasid >> (9 * level)) & 0x1ff;
+ pte = &root[index];
+
+ if (level == 0)
+ break;
+
+ if (!(*pte & GCR3_VALID)) {
+ if (!alloc)
+ return NULL;
+
+ root = (void *)get_zeroed_page(GFP_ATOMIC);
+ if (root == NULL)
+ return NULL;
+
+ *pte = iommu_virt_to_phys(root) | GCR3_VALID;
+ }
+
+ root = iommu_phys_to_virt(*pte & PAGE_MASK);
+
+ level -= 1;
+ }
+
+ return pte;
+}
+
+static int __set_gcr3(struct protection_domain *domain, u32 pasid,
+ unsigned long cr3)
+{
+ struct domain_pgtable pgtable;
+ u64 *pte;
+
+ amd_iommu_domain_get_pgtable(domain, &pgtable);
+ if (pgtable.mode != PAGE_MODE_NONE)
+ return -EINVAL;
+
+ pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true);
+ if (pte == NULL)
+ return -ENOMEM;
+
+ *pte = (cr3 & PAGE_MASK) | GCR3_VALID;
+
+ return __amd_iommu_flush_tlb(domain, pasid);
+}
+
+static int __clear_gcr3(struct protection_domain *domain, u32 pasid)
+{
+ struct domain_pgtable pgtable;
+ u64 *pte;
+
+ amd_iommu_domain_get_pgtable(domain, &pgtable);
+ if (pgtable.mode != PAGE_MODE_NONE)
+ return -EINVAL;
+
+ pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false);
+ if (pte == NULL)
+ return 0;
+
+ *pte = 0;
+
+ return __amd_iommu_flush_tlb(domain, pasid);
+}
+
+int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid,
+ unsigned long cr3)
+{
+ struct protection_domain *domain = to_pdomain(dom);
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&domain->lock, flags);
+ ret = __set_gcr3(domain, pasid, cr3);
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(amd_iommu_domain_set_gcr3);
+
+int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, u32 pasid)
+{
+ struct protection_domain *domain = to_pdomain(dom);
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&domain->lock, flags);
+ ret = __clear_gcr3(domain, pasid);
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3);
+
+int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid,
+ int status, int tag)
+{
+ struct iommu_dev_data *dev_data;
+ struct amd_iommu *iommu;
+ struct iommu_cmd cmd;
+
+ dev_data = dev_iommu_priv_get(&pdev->dev);
+ iommu = amd_iommu_rlookup_table[dev_data->devid];
+
+ build_complete_ppr(&cmd, dev_data->devid, pasid, status,
+ tag, dev_data->pri_tlp);
+
+ return iommu_queue_command(iommu, &cmd);
+}
+EXPORT_SYMBOL(amd_iommu_complete_ppr);
+
+struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev)
+{
+ struct protection_domain *pdomain;
+ struct iommu_dev_data *dev_data;
+ struct device *dev = &pdev->dev;
+ struct iommu_domain *io_domain;
+
+ if (!check_device(dev))
+ return NULL;
+
+ dev_data = dev_iommu_priv_get(&pdev->dev);
+ pdomain = dev_data->domain;
+ io_domain = iommu_get_domain_for_dev(dev);
+
+ if (pdomain == NULL && dev_data->defer_attach) {
+ dev_data->defer_attach = false;
+ pdomain = to_pdomain(io_domain);
+ attach_device(dev, pdomain);
+ }
+
+ if (pdomain == NULL)
+ return NULL;
+
+ if (io_domain->type != IOMMU_DOMAIN_DMA)
+ return NULL;
+
+ /* Only return IOMMUv2 domains */
+ if (!(pdomain->flags & PD_IOMMUV2_MASK))
+ return NULL;
+
+ return &pdomain->domain;
+}
+EXPORT_SYMBOL(amd_iommu_get_v2_domain);
+
+void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum)
+{
+ struct iommu_dev_data *dev_data;
+
+ if (!amd_iommu_v2_supported())
+ return;
+
+ dev_data = dev_iommu_priv_get(&pdev->dev);
+ dev_data->errata |= (1 << erratum);
+}
+EXPORT_SYMBOL(amd_iommu_enable_device_erratum);
+
+int amd_iommu_device_info(struct pci_dev *pdev,
+ struct amd_iommu_device_info *info)
+{
+ int max_pasids;
+ int pos;
+
+ if (pdev == NULL || info == NULL)
+ return -EINVAL;
+
+ if (!amd_iommu_v2_supported())
+ return -EINVAL;
+
+ memset(info, 0, sizeof(*info));
+
+ if (pci_ats_supported(pdev))
+ info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP;
+
+ pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
+ if (pos)
+ info->flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP;
+
+ pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID);
+ if (pos) {
+ int features;
+
+ max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1));
+ max_pasids = min(max_pasids, (1 << 20));
+
+ info->flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
+ info->max_pasids = min(pci_max_pasids(pdev), max_pasids);
+
+ features = pci_pasid_features(pdev);
+ if (features & PCI_PASID_CAP_EXEC)
+ info->flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP;
+ if (features & PCI_PASID_CAP_PRIV)
+ info->flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(amd_iommu_device_info);
+
+#ifdef CONFIG_IRQ_REMAP
+
+/*****************************************************************************
+ *
+ * Interrupt Remapping Implementation
+ *
+ *****************************************************************************/
+
+static struct irq_chip amd_ir_chip;
+static DEFINE_SPINLOCK(iommu_table_lock);
+
+static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table)
+{
+ u64 dte;
+
+ dte = amd_iommu_dev_table[devid].data[2];
+ dte &= ~DTE_IRQ_PHYS_ADDR_MASK;
+ dte |= iommu_virt_to_phys(table->table);
+ dte |= DTE_IRQ_REMAP_INTCTL;
+ dte |= DTE_IRQ_TABLE_LEN;
+ dte |= DTE_IRQ_REMAP_ENABLE;
+
+ amd_iommu_dev_table[devid].data[2] = dte;
+}
+
+static struct irq_remap_table *get_irq_table(u16 devid)
+{
+ struct irq_remap_table *table;
+
+ if (WARN_ONCE(!amd_iommu_rlookup_table[devid],
+ "%s: no iommu for devid %x\n", __func__, devid))
+ return NULL;
+
+ table = irq_lookup_table[devid];
+ if (WARN_ONCE(!table, "%s: no table for devid %x\n", __func__, devid))
+ return NULL;
+
+ return table;
+}
+
+static struct irq_remap_table *__alloc_irq_table(void)
+{
+ struct irq_remap_table *table;
+
+ table = kzalloc(sizeof(*table), GFP_KERNEL);
+ if (!table)
+ return NULL;
+
+ table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL);
+ if (!table->table) {
+ kfree(table);
+ return NULL;
+ }
+ raw_spin_lock_init(&table->lock);
+
+ if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
+ memset(table->table, 0,
+ MAX_IRQS_PER_TABLE * sizeof(u32));
+ else
+ memset(table->table, 0,
+ (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
+ return table;
+}
+
+static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid,
+ struct irq_remap_table *table)
+{
+ irq_lookup_table[devid] = table;
+ set_dte_irq_entry(devid, table);
+ iommu_flush_dte(iommu, devid);
+}
+
+static int set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias,
+ void *data)
+{
+ struct irq_remap_table *table = data;
+
+ irq_lookup_table[alias] = table;
+ set_dte_irq_entry(alias, table);
+
+ iommu_flush_dte(amd_iommu_rlookup_table[alias], alias);
+
+ return 0;
+}
+
+static struct irq_remap_table *alloc_irq_table(u16 devid, struct pci_dev *pdev)
+{
+ struct irq_remap_table *table = NULL;
+ struct irq_remap_table *new_table = NULL;
+ struct amd_iommu *iommu;
+ unsigned long flags;
+ u16 alias;
+
+ spin_lock_irqsave(&iommu_table_lock, flags);
+
+ iommu = amd_iommu_rlookup_table[devid];
+ if (!iommu)
+ goto out_unlock;
+
+ table = irq_lookup_table[devid];
+ if (table)
+ goto out_unlock;
+
+ alias = amd_iommu_alias_table[devid];
+ table = irq_lookup_table[alias];
+ if (table) {
+ set_remap_table_entry(iommu, devid, table);
+ goto out_wait;
+ }
+ spin_unlock_irqrestore(&iommu_table_lock, flags);
+
+ /* Nothing there yet, allocate new irq remapping table */
+ new_table = __alloc_irq_table();
+ if (!new_table)
+ return NULL;
+
+ spin_lock_irqsave(&iommu_table_lock, flags);
+
+ table = irq_lookup_table[devid];
+ if (table)
+ goto out_unlock;
+
+ table = irq_lookup_table[alias];
+ if (table) {
+ set_remap_table_entry(iommu, devid, table);
+ goto out_wait;
+ }
+
+ table = new_table;
+ new_table = NULL;
+
+ if (pdev)
+ pci_for_each_dma_alias(pdev, set_remap_table_entry_alias,
+ table);
+ else
+ set_remap_table_entry(iommu, devid, table);
+
+ if (devid != alias)
+ set_remap_table_entry(iommu, alias, table);
+
+out_wait:
+ iommu_completion_wait(iommu);
+
+out_unlock:
+ spin_unlock_irqrestore(&iommu_table_lock, flags);
+
+ if (new_table) {
+ kmem_cache_free(amd_iommu_irq_cache, new_table->table);
+ kfree(new_table);
+ }
+ return table;
+}
+
+static int alloc_irq_index(u16 devid, int count, bool align,
+ struct pci_dev *pdev)
+{
+ struct irq_remap_table *table;
+ int index, c, alignment = 1;
+ unsigned long flags;
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+ if (!iommu)
+ return -ENODEV;
+
+ table = alloc_irq_table(devid, pdev);
+ if (!table)
+ return -ENODEV;
+
+ if (align)
+ alignment = roundup_pow_of_two(count);
+
+ raw_spin_lock_irqsave(&table->lock, flags);
+
+ /* Scan table for free entries */
+ for (index = ALIGN(table->min_index, alignment), c = 0;
+ index < MAX_IRQS_PER_TABLE;) {
+ if (!iommu->irte_ops->is_allocated(table, index)) {
+ c += 1;
+ } else {
+ c = 0;
+ index = ALIGN(index + 1, alignment);
+ continue;
+ }
+
+ if (c == count) {
+ for (; c != 0; --c)
+ iommu->irte_ops->set_allocated(table, index - c + 1);
+
+ index -= count - 1;
+ goto out;
+ }
+
+ index++;
+ }
+
+ index = -ENOSPC;
+
+out:
+ raw_spin_unlock_irqrestore(&table->lock, flags);
+
+ return index;
+}
+
+static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte,
+ struct amd_ir_data *data)
+{
+ bool ret;
+ struct irq_remap_table *table;
+ struct amd_iommu *iommu;
+ unsigned long flags;
+ struct irte_ga *entry;
+
+ iommu = amd_iommu_rlookup_table[devid];
+ if (iommu == NULL)
+ return -EINVAL;
+
+ table = get_irq_table(devid);
+ if (!table)
+ return -ENOMEM;
+
+ raw_spin_lock_irqsave(&table->lock, flags);
+
+ entry = (struct irte_ga *)table->table;
+ entry = &entry[index];
+
+ ret = cmpxchg_double(&entry->lo.val, &entry->hi.val,
+ entry->lo.val, entry->hi.val,
+ irte->lo.val, irte->hi.val);
+ /*
+ * We use cmpxchg16 to atomically update the 128-bit IRTE,
+ * and it cannot be updated by the hardware or other processors
+ * behind us, so the return value of cmpxchg16 should be the
+ * same as the old value.
+ */
+ WARN_ON(!ret);
+
+ if (data)
+ data->ref = entry;
+
+ raw_spin_unlock_irqrestore(&table->lock, flags);
+
+ iommu_flush_irt(iommu, devid);
+ iommu_completion_wait(iommu);
+
+ return 0;
+}
+
+static int modify_irte(u16 devid, int index, union irte *irte)
+{
+ struct irq_remap_table *table;
+ struct amd_iommu *iommu;
+ unsigned long flags;
+
+ iommu = amd_iommu_rlookup_table[devid];
+ if (iommu == NULL)
+ return -EINVAL;
+
+ table = get_irq_table(devid);
+ if (!table)
+ return -ENOMEM;
+
+ raw_spin_lock_irqsave(&table->lock, flags);
+ table->table[index] = irte->val;
+ raw_spin_unlock_irqrestore(&table->lock, flags);
+
+ iommu_flush_irt(iommu, devid);
+ iommu_completion_wait(iommu);
+
+ return 0;
+}
+
+static void free_irte(u16 devid, int index)
+{
+ struct irq_remap_table *table;
+ struct amd_iommu *iommu;
+ unsigned long flags;
+
+ iommu = amd_iommu_rlookup_table[devid];
+ if (iommu == NULL)
+ return;
+
+ table = get_irq_table(devid);
+ if (!table)
+ return;
+
+ raw_spin_lock_irqsave(&table->lock, flags);
+ iommu->irte_ops->clear_allocated(table, index);
+ raw_spin_unlock_irqrestore(&table->lock, flags);
+
+ iommu_flush_irt(iommu, devid);
+ iommu_completion_wait(iommu);
+}
+
+static void irte_prepare(void *entry,
+ u32 delivery_mode, u32 dest_mode,
+ u8 vector, u32 dest_apicid, int devid)
+{
+ union irte *irte = (union irte *) entry;
+
+ irte->val = 0;
+ irte->fields.vector = vector;
+ irte->fields.int_type = delivery_mode;
+ irte->fields.destination = dest_apicid;
+ irte->fields.dm = dest_mode;
+ irte->fields.valid = 1;
+}
+
+static void irte_ga_prepare(void *entry,
+ u32 delivery_mode, u32 dest_mode,
+ u8 vector, u32 dest_apicid, int devid)
+{
+ struct irte_ga *irte = (struct irte_ga *) entry;
+
+ irte->lo.val = 0;
+ irte->hi.val = 0;
+ irte->lo.fields_remap.int_type = delivery_mode;
+ irte->lo.fields_remap.dm = dest_mode;
+ irte->hi.fields.vector = vector;
+ irte->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(dest_apicid);
+ irte->hi.fields.destination = APICID_TO_IRTE_DEST_HI(dest_apicid);
+ irte->lo.fields_remap.valid = 1;
+}
+
+static void irte_activate(void *entry, u16 devid, u16 index)
+{
+ union irte *irte = (union irte *) entry;
+
+ irte->fields.valid = 1;
+ modify_irte(devid, index, irte);
+}
+
+static void irte_ga_activate(void *entry, u16 devid, u16 index)
+{
+ struct irte_ga *irte = (struct irte_ga *) entry;
+
+ irte->lo.fields_remap.valid = 1;
+ modify_irte_ga(devid, index, irte, NULL);
+}
+
+static void irte_deactivate(void *entry, u16 devid, u16 index)
+{
+ union irte *irte = (union irte *) entry;
+
+ irte->fields.valid = 0;
+ modify_irte(devid, index, irte);
+}
+
+static void irte_ga_deactivate(void *entry, u16 devid, u16 index)
+{
+ struct irte_ga *irte = (struct irte_ga *) entry;
+
+ irte->lo.fields_remap.valid = 0;
+ modify_irte_ga(devid, index, irte, NULL);
+}
+
+static void irte_set_affinity(void *entry, u16 devid, u16 index,
+ u8 vector, u32 dest_apicid)
+{
+ union irte *irte = (union irte *) entry;
+
+ irte->fields.vector = vector;
+ irte->fields.destination = dest_apicid;
+ modify_irte(devid, index, irte);
+}
+
+static void irte_ga_set_affinity(void *entry, u16 devid, u16 index,
+ u8 vector, u32 dest_apicid)
+{
+ struct irte_ga *irte = (struct irte_ga *) entry;
+
+ if (!irte->lo.fields_remap.guest_mode) {
+ irte->hi.fields.vector = vector;
+ irte->lo.fields_remap.destination =
+ APICID_TO_IRTE_DEST_LO(dest_apicid);
+ irte->hi.fields.destination =
+ APICID_TO_IRTE_DEST_HI(dest_apicid);
+ modify_irte_ga(devid, index, irte, NULL);
+ }
+}
+
+#define IRTE_ALLOCATED (~1U)
+static void irte_set_allocated(struct irq_remap_table *table, int index)
+{
+ table->table[index] = IRTE_ALLOCATED;
+}
+
+static void irte_ga_set_allocated(struct irq_remap_table *table, int index)
+{
+ struct irte_ga *ptr = (struct irte_ga *)table->table;
+ struct irte_ga *irte = &ptr[index];
+
+ memset(&irte->lo.val, 0, sizeof(u64));
+ memset(&irte->hi.val, 0, sizeof(u64));
+ irte->hi.fields.vector = 0xff;
+}
+
+static bool irte_is_allocated(struct irq_remap_table *table, int index)
+{
+ union irte *ptr = (union irte *)table->table;
+ union irte *irte = &ptr[index];
+
+ return irte->val != 0;
+}
+
+static bool irte_ga_is_allocated(struct irq_remap_table *table, int index)
+{
+ struct irte_ga *ptr = (struct irte_ga *)table->table;
+ struct irte_ga *irte = &ptr[index];
+
+ return irte->hi.fields.vector != 0;
+}
+
+static void irte_clear_allocated(struct irq_remap_table *table, int index)
+{
+ table->table[index] = 0;
+}
+
+static void irte_ga_clear_allocated(struct irq_remap_table *table, int index)
+{
+ struct irte_ga *ptr = (struct irte_ga *)table->table;
+ struct irte_ga *irte = &ptr[index];
+
+ memset(&irte->lo.val, 0, sizeof(u64));
+ memset(&irte->hi.val, 0, sizeof(u64));
+}
+
+static int get_devid(struct irq_alloc_info *info)
+{
+ switch (info->type) {
+ case X86_IRQ_ALLOC_TYPE_IOAPIC:
+ case X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT:
+ return get_ioapic_devid(info->devid);
+ case X86_IRQ_ALLOC_TYPE_HPET:
+ case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT:
+ return get_hpet_devid(info->devid);
+ case X86_IRQ_ALLOC_TYPE_PCI_MSI:
+ case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
+ return get_device_id(msi_desc_to_dev(info->desc));
+ default:
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+}
+
+static struct irq_domain *get_irq_domain_for_devid(struct irq_alloc_info *info,
+ int devid)
+{
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+ if (!iommu)
+ return NULL;
+
+ switch (info->type) {
+ case X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT:
+ case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT:
+ return iommu->ir_domain;
+ default:
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+}
+
+static struct irq_domain *get_irq_domain(struct irq_alloc_info *info)
+{
+ int devid;
+
+ if (!info)
+ return NULL;
+
+ devid = get_devid(info);
+ if (devid < 0)
+ return NULL;
+ return get_irq_domain_for_devid(info, devid);
+}
+
+struct irq_remap_ops amd_iommu_irq_ops = {
+ .prepare = amd_iommu_prepare,
+ .enable = amd_iommu_enable,
+ .disable = amd_iommu_disable,
+ .reenable = amd_iommu_reenable,
+ .enable_faulting = amd_iommu_enable_faulting,
+ .get_irq_domain = get_irq_domain,
+};
+
+static void irq_remapping_prepare_irte(struct amd_ir_data *data,
+ struct irq_cfg *irq_cfg,
+ struct irq_alloc_info *info,
+ int devid, int index, int sub_handle)
+{
+ struct irq_2_irte *irte_info = &data->irq_2_irte;
+ struct msi_msg *msg = &data->msi_entry;
+ struct IO_APIC_route_entry *entry;
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+ if (!iommu)
+ return;
+
+ data->irq_2_irte.devid = devid;
+ data->irq_2_irte.index = index + sub_handle;
+ iommu->irte_ops->prepare(data->entry, apic->irq_delivery_mode,
+ apic->irq_dest_mode, irq_cfg->vector,
+ irq_cfg->dest_apicid, devid);
+
+ switch (info->type) {
+ case X86_IRQ_ALLOC_TYPE_IOAPIC:
+ /* Setup IOAPIC entry */
+ entry = info->ioapic.entry;
+ info->ioapic.entry = NULL;
+ memset(entry, 0, sizeof(*entry));
+ entry->vector = index;
+ entry->mask = 0;
+ entry->trigger = info->ioapic.trigger;
+ entry->polarity = info->ioapic.polarity;
+ /* Mask level triggered irqs. */
+ if (info->ioapic.trigger)
+ entry->mask = 1;
+ break;
+
+ case X86_IRQ_ALLOC_TYPE_HPET:
+ case X86_IRQ_ALLOC_TYPE_PCI_MSI:
+ case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
+ msg->address_hi = MSI_ADDR_BASE_HI;
+ msg->address_lo = MSI_ADDR_BASE_LO;
+ msg->data = irte_info->index;
+ break;
+
+ default:
+ BUG_ON(1);
+ break;
+ }
+}
+
+struct amd_irte_ops irte_32_ops = {
+ .prepare = irte_prepare,
+ .activate = irte_activate,
+ .deactivate = irte_deactivate,
+ .set_affinity = irte_set_affinity,
+ .set_allocated = irte_set_allocated,
+ .is_allocated = irte_is_allocated,
+ .clear_allocated = irte_clear_allocated,
+};
+
+struct amd_irte_ops irte_128_ops = {
+ .prepare = irte_ga_prepare,
+ .activate = irte_ga_activate,
+ .deactivate = irte_ga_deactivate,
+ .set_affinity = irte_ga_set_affinity,
+ .set_allocated = irte_ga_set_allocated,
+ .is_allocated = irte_ga_is_allocated,
+ .clear_allocated = irte_ga_clear_allocated,
+};
+
+static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct irq_alloc_info *info = arg;
+ struct irq_data *irq_data;
+ struct amd_ir_data *data = NULL;
+ struct irq_cfg *cfg;
+ int i, ret, devid;
+ int index;
+
+ if (!info)
+ return -EINVAL;
+ if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI &&
+ info->type != X86_IRQ_ALLOC_TYPE_PCI_MSIX)
+ return -EINVAL;
+
+ /*
+ * With IRQ remapping enabled, don't need contiguous CPU vectors
+ * to support multiple MSI interrupts.
+ */
+ if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI)
+ info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
+
+ devid = get_devid(info);
+ if (devid < 0)
+ return -EINVAL;
+
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+
+ if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
+ struct irq_remap_table *table;
+ struct amd_iommu *iommu;
+
+ table = alloc_irq_table(devid, NULL);
+ if (table) {
+ if (!table->min_index) {
+ /*
+ * Keep the first 32 indexes free for IOAPIC
+ * interrupts.
+ */
+ table->min_index = 32;
+ iommu = amd_iommu_rlookup_table[devid];
+ for (i = 0; i < 32; ++i)
+ iommu->irte_ops->set_allocated(table, i);
+ }
+ WARN_ON(table->min_index != 32);
+ index = info->ioapic.pin;
+ } else {
+ index = -ENOMEM;
+ }
+ } else if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI ||
+ info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX) {
+ bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI);
+
+ index = alloc_irq_index(devid, nr_irqs, align,
+ msi_desc_to_pci_dev(info->desc));
+ } else {
+ index = alloc_irq_index(devid, nr_irqs, false, NULL);
+ }
+
+ if (index < 0) {
+ pr_warn("Failed to allocate IRTE\n");
+ ret = index;
+ goto out_free_parent;
+ }
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_domain_get_irq_data(domain, virq + i);
+ cfg = irq_data ? irqd_cfg(irq_data) : NULL;
+ if (!cfg) {
+ ret = -EINVAL;
+ goto out_free_data;
+ }
+
+ ret = -ENOMEM;
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ goto out_free_data;
+
+ if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
+ data->entry = kzalloc(sizeof(union irte), GFP_KERNEL);
+ else
+ data->entry = kzalloc(sizeof(struct irte_ga),
+ GFP_KERNEL);
+ if (!data->entry) {
+ kfree(data);
+ goto out_free_data;
+ }
+
+ irq_data->hwirq = (devid << 16) + i;
+ irq_data->chip_data = data;
+ irq_data->chip = &amd_ir_chip;
+ irq_remapping_prepare_irte(data, cfg, info, devid, index, i);
+ irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
+ }
+
+ return 0;
+
+out_free_data:
+ for (i--; i >= 0; i--) {
+ irq_data = irq_domain_get_irq_data(domain, virq + i);
+ if (irq_data)
+ kfree(irq_data->chip_data);
+ }
+ for (i = 0; i < nr_irqs; i++)
+ free_irte(devid, index + i);
+out_free_parent:
+ irq_domain_free_irqs_common(domain, virq, nr_irqs);
+ return ret;
+}
+
+static void irq_remapping_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ struct irq_2_irte *irte_info;
+ struct irq_data *irq_data;
+ struct amd_ir_data *data;
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_domain_get_irq_data(domain, virq + i);
+ if (irq_data && irq_data->chip_data) {
+ data = irq_data->chip_data;
+ irte_info = &data->irq_2_irte;
+ free_irte(irte_info->devid, irte_info->index);
+ kfree(data->entry);
+ kfree(data);
+ }
+ }
+ irq_domain_free_irqs_common(domain, virq, nr_irqs);
+}
+
+static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
+ struct amd_ir_data *ir_data,
+ struct irq_2_irte *irte_info,
+ struct irq_cfg *cfg);
+
+static int irq_remapping_activate(struct irq_domain *domain,
+ struct irq_data *irq_data, bool reserve)
+{
+ struct amd_ir_data *data = irq_data->chip_data;
+ struct irq_2_irte *irte_info = &data->irq_2_irte;
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
+ struct irq_cfg *cfg = irqd_cfg(irq_data);
+
+ if (!iommu)
+ return 0;
+
+ iommu->irte_ops->activate(data->entry, irte_info->devid,
+ irte_info->index);
+ amd_ir_update_irte(irq_data, iommu, data, irte_info, cfg);
+ return 0;
+}
+
+static void irq_remapping_deactivate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ struct amd_ir_data *data = irq_data->chip_data;
+ struct irq_2_irte *irte_info = &data->irq_2_irte;
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
+
+ if (iommu)
+ iommu->irte_ops->deactivate(data->entry, irte_info->devid,
+ irte_info->index);
+}
+
+static const struct irq_domain_ops amd_ir_domain_ops = {
+ .alloc = irq_remapping_alloc,
+ .free = irq_remapping_free,
+ .activate = irq_remapping_activate,
+ .deactivate = irq_remapping_deactivate,
+};
+
+int amd_iommu_activate_guest_mode(void *data)
+{
+ struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
+ struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
+ u64 valid;
+
+ if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || !entry)
+ return 0;
+
+ valid = entry->lo.fields_vapic.valid;
+
+ entry->lo.val = 0;
+ entry->hi.val = 0;
+
+ entry->lo.fields_vapic.valid = valid;
+ entry->lo.fields_vapic.guest_mode = 1;
+ entry->lo.fields_vapic.ga_log_intr = 1;
+ entry->hi.fields.ga_root_ptr = ir_data->ga_root_ptr;
+ entry->hi.fields.vector = ir_data->ga_vector;
+ entry->lo.fields_vapic.ga_tag = ir_data->ga_tag;
+
+ return modify_irte_ga(ir_data->irq_2_irte.devid,
+ ir_data->irq_2_irte.index, entry, ir_data);
+}
+EXPORT_SYMBOL(amd_iommu_activate_guest_mode);
+
+int amd_iommu_deactivate_guest_mode(void *data)
+{
+ struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
+ struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
+ struct irq_cfg *cfg = ir_data->cfg;
+ u64 valid;
+
+ if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
+ !entry || !entry->lo.fields_vapic.guest_mode)
+ return 0;
+
+ valid = entry->lo.fields_remap.valid;
+
+ entry->lo.val = 0;
+ entry->hi.val = 0;
+
+ entry->lo.fields_remap.valid = valid;
+ entry->lo.fields_remap.dm = apic->irq_dest_mode;
+ entry->lo.fields_remap.int_type = apic->irq_delivery_mode;
+ entry->hi.fields.vector = cfg->vector;
+ entry->lo.fields_remap.destination =
+ APICID_TO_IRTE_DEST_LO(cfg->dest_apicid);
+ entry->hi.fields.destination =
+ APICID_TO_IRTE_DEST_HI(cfg->dest_apicid);
+
+ return modify_irte_ga(ir_data->irq_2_irte.devid,
+ ir_data->irq_2_irte.index, entry, ir_data);
+}
+EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode);
+
+static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
+{
+ int ret;
+ struct amd_iommu *iommu;
+ struct amd_iommu_pi_data *pi_data = vcpu_info;
+ struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
+ struct amd_ir_data *ir_data = data->chip_data;
+ struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
+ struct iommu_dev_data *dev_data = search_dev_data(irte_info->devid);
+
+ /* Note:
+ * This device has never been set up for guest mode.
+ * we should not modify the IRTE
+ */
+ if (!dev_data || !dev_data->use_vapic)
+ return 0;
+
+ ir_data->cfg = irqd_cfg(data);
+ pi_data->ir_data = ir_data;
+
+ /* Note:
+ * SVM tries to set up for VAPIC mode, but we are in
+ * legacy mode. So, we force legacy mode instead.
+ */
+ if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
+ pr_debug("%s: Fall back to using intr legacy remap\n",
+ __func__);
+ pi_data->is_guest_mode = false;
+ }
+
+ iommu = amd_iommu_rlookup_table[irte_info->devid];
+ if (iommu == NULL)
+ return -EINVAL;
+
+ pi_data->prev_ga_tag = ir_data->cached_ga_tag;
+ if (pi_data->is_guest_mode) {
+ ir_data->ga_root_ptr = (pi_data->base >> 12);
+ ir_data->ga_vector = vcpu_pi_info->vector;
+ ir_data->ga_tag = pi_data->ga_tag;
+ ret = amd_iommu_activate_guest_mode(ir_data);
+ if (!ret)
+ ir_data->cached_ga_tag = pi_data->ga_tag;
+ } else {
+ ret = amd_iommu_deactivate_guest_mode(ir_data);
+
+ /*
+ * This communicates the ga_tag back to the caller
+ * so that it can do all the necessary clean up.
+ */
+ if (!ret)
+ ir_data->cached_ga_tag = 0;
+ }
+
+ return ret;
+}
+
+
+static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
+ struct amd_ir_data *ir_data,
+ struct irq_2_irte *irte_info,
+ struct irq_cfg *cfg)
+{
+
+ /*
+ * Atomically updates the IRTE with the new destination, vector
+ * and flushes the interrupt entry cache.
+ */
+ iommu->irte_ops->set_affinity(ir_data->entry, irte_info->devid,
+ irte_info->index, cfg->vector,
+ cfg->dest_apicid);
+}
+
+static int amd_ir_set_affinity(struct irq_data *data,
+ const struct cpumask *mask, bool force)
+{
+ struct amd_ir_data *ir_data = data->chip_data;
+ struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
+ struct irq_cfg *cfg = irqd_cfg(data);
+ struct irq_data *parent = data->parent_data;
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
+ int ret;
+
+ if (!iommu)
+ return -ENODEV;
+
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+ return ret;
+
+ amd_ir_update_irte(data, iommu, ir_data, irte_info, cfg);
+ /*
+ * After this point, all the interrupts will start arriving
+ * at the new destination. So, time to cleanup the previous
+ * vector allocation.
+ */
+ send_cleanup_vector(cfg);
+
+ return IRQ_SET_MASK_OK_DONE;
+}
+
+static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
+{
+ struct amd_ir_data *ir_data = irq_data->chip_data;
+
+ *msg = ir_data->msi_entry;
+}
+
+static struct irq_chip amd_ir_chip = {
+ .name = "AMD-IR",
+ .irq_ack = apic_ack_irq,
+ .irq_set_affinity = amd_ir_set_affinity,
+ .irq_set_vcpu_affinity = amd_ir_set_vcpu_affinity,
+ .irq_compose_msi_msg = ir_compose_msi_msg,
+};
+
+int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
+{
+ struct fwnode_handle *fn;
+
+ fn = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index);
+ if (!fn)
+ return -ENOMEM;
+ iommu->ir_domain = irq_domain_create_tree(fn, &amd_ir_domain_ops, iommu);
+ if (!iommu->ir_domain) {
+ irq_domain_free_fwnode(fn);
+ return -ENOMEM;
+ }
+
+ iommu->ir_domain->parent = arch_get_ir_parent_domain();
+ iommu->msi_domain = arch_create_remap_msi_irq_domain(iommu->ir_domain,
+ "AMD-IR-MSI",
+ iommu->index);
+ return 0;
+}
+
+int amd_iommu_update_ga(int cpu, bool is_run, void *data)
+{
+ unsigned long flags;
+ struct amd_iommu *iommu;
+ struct irq_remap_table *table;
+ struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
+ int devid = ir_data->irq_2_irte.devid;
+ struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
+ struct irte_ga *ref = (struct irte_ga *) ir_data->ref;
+
+ if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
+ !ref || !entry || !entry->lo.fields_vapic.guest_mode)
+ return 0;
+
+ iommu = amd_iommu_rlookup_table[devid];
+ if (!iommu)
+ return -ENODEV;
+
+ table = get_irq_table(devid);
+ if (!table)
+ return -ENODEV;
+
+ raw_spin_lock_irqsave(&table->lock, flags);
+
+ if (ref->lo.fields_vapic.guest_mode) {
+ if (cpu >= 0) {
+ ref->lo.fields_vapic.destination =
+ APICID_TO_IRTE_DEST_LO(cpu);
+ ref->hi.fields.destination =
+ APICID_TO_IRTE_DEST_HI(cpu);
+ }
+ ref->lo.fields_vapic.is_run = is_run;
+ barrier();
+ }
+
+ raw_spin_unlock_irqrestore(&table->lock, flags);
+
+ iommu_flush_irt(iommu, devid);
+ iommu_completion_wait(iommu);
+ return 0;
+}
+EXPORT_SYMBOL(amd_iommu_update_ga);
+#endif
diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c
new file mode 100644
index 000000000..16776e3c6
--- /dev/null
+++ b/drivers/iommu/amd/iommu_v2.c
@@ -0,0 +1,989 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2010-2012 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+#define pr_fmt(fmt) "AMD-Vi: " fmt
+
+#include <linux/mmu_notifier.h>
+#include <linux/amd-iommu.h>
+#include <linux/mm_types.h>
+#include <linux/profile.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/wait.h>
+#include <linux/pci.h>
+#include <linux/gfp.h>
+
+#include "amd_iommu.h"
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Joerg Roedel <jroedel@suse.de>");
+
+#define MAX_DEVICES 0x10000
+#define PRI_QUEUE_SIZE 512
+
+struct pri_queue {
+ atomic_t inflight;
+ bool finish;
+ int status;
+};
+
+struct pasid_state {
+ struct list_head list; /* For global state-list */
+ atomic_t count; /* Reference count */
+ unsigned mmu_notifier_count; /* Counting nested mmu_notifier
+ calls */
+ struct mm_struct *mm; /* mm_struct for the faults */
+ struct mmu_notifier mn; /* mmu_notifier handle */
+ struct pri_queue pri[PRI_QUEUE_SIZE]; /* PRI tag states */
+ struct device_state *device_state; /* Link to our device_state */
+ u32 pasid; /* PASID index */
+ bool invalid; /* Used during setup and
+ teardown of the pasid */
+ spinlock_t lock; /* Protect pri_queues and
+ mmu_notifer_count */
+ wait_queue_head_t wq; /* To wait for count == 0 */
+};
+
+struct device_state {
+ struct list_head list;
+ u16 devid;
+ atomic_t count;
+ struct pci_dev *pdev;
+ struct pasid_state **states;
+ struct iommu_domain *domain;
+ int pasid_levels;
+ int max_pasids;
+ amd_iommu_invalid_ppr_cb inv_ppr_cb;
+ amd_iommu_invalidate_ctx inv_ctx_cb;
+ spinlock_t lock;
+ wait_queue_head_t wq;
+};
+
+struct fault {
+ struct work_struct work;
+ struct device_state *dev_state;
+ struct pasid_state *state;
+ struct mm_struct *mm;
+ u64 address;
+ u16 devid;
+ u32 pasid;
+ u16 tag;
+ u16 finish;
+ u16 flags;
+};
+
+static LIST_HEAD(state_list);
+static spinlock_t state_lock;
+
+static struct workqueue_struct *iommu_wq;
+
+static void free_pasid_states(struct device_state *dev_state);
+
+static u16 device_id(struct pci_dev *pdev)
+{
+ u16 devid;
+
+ devid = pdev->bus->number;
+ devid = (devid << 8) | pdev->devfn;
+
+ return devid;
+}
+
+static struct device_state *__get_device_state(u16 devid)
+{
+ struct device_state *dev_state;
+
+ list_for_each_entry(dev_state, &state_list, list) {
+ if (dev_state->devid == devid)
+ return dev_state;
+ }
+
+ return NULL;
+}
+
+static struct device_state *get_device_state(u16 devid)
+{
+ struct device_state *dev_state;
+ unsigned long flags;
+
+ spin_lock_irqsave(&state_lock, flags);
+ dev_state = __get_device_state(devid);
+ if (dev_state != NULL)
+ atomic_inc(&dev_state->count);
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return dev_state;
+}
+
+static void free_device_state(struct device_state *dev_state)
+{
+ struct iommu_group *group;
+
+ /*
+ * First detach device from domain - No more PRI requests will arrive
+ * from that device after it is unbound from the IOMMUv2 domain.
+ */
+ group = iommu_group_get(&dev_state->pdev->dev);
+ if (WARN_ON(!group))
+ return;
+
+ iommu_detach_group(dev_state->domain, group);
+
+ iommu_group_put(group);
+
+ /* Everything is down now, free the IOMMUv2 domain */
+ iommu_domain_free(dev_state->domain);
+
+ /* Finally get rid of the device-state */
+ kfree(dev_state);
+}
+
+static void put_device_state(struct device_state *dev_state)
+{
+ if (atomic_dec_and_test(&dev_state->count))
+ wake_up(&dev_state->wq);
+}
+
+/* Must be called under dev_state->lock */
+static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state,
+ u32 pasid, bool alloc)
+{
+ struct pasid_state **root, **ptr;
+ int level, index;
+
+ level = dev_state->pasid_levels;
+ root = dev_state->states;
+
+ while (true) {
+
+ index = (pasid >> (9 * level)) & 0x1ff;
+ ptr = &root[index];
+
+ if (level == 0)
+ break;
+
+ if (*ptr == NULL) {
+ if (!alloc)
+ return NULL;
+
+ *ptr = (void *)get_zeroed_page(GFP_ATOMIC);
+ if (*ptr == NULL)
+ return NULL;
+ }
+
+ root = (struct pasid_state **)*ptr;
+ level -= 1;
+ }
+
+ return ptr;
+}
+
+static int set_pasid_state(struct device_state *dev_state,
+ struct pasid_state *pasid_state,
+ u32 pasid)
+{
+ struct pasid_state **ptr;
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&dev_state->lock, flags);
+ ptr = __get_pasid_state_ptr(dev_state, pasid, true);
+
+ ret = -ENOMEM;
+ if (ptr == NULL)
+ goto out_unlock;
+
+ ret = -ENOMEM;
+ if (*ptr != NULL)
+ goto out_unlock;
+
+ *ptr = pasid_state;
+
+ ret = 0;
+
+out_unlock:
+ spin_unlock_irqrestore(&dev_state->lock, flags);
+
+ return ret;
+}
+
+static void clear_pasid_state(struct device_state *dev_state, u32 pasid)
+{
+ struct pasid_state **ptr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_state->lock, flags);
+ ptr = __get_pasid_state_ptr(dev_state, pasid, true);
+
+ if (ptr == NULL)
+ goto out_unlock;
+
+ *ptr = NULL;
+
+out_unlock:
+ spin_unlock_irqrestore(&dev_state->lock, flags);
+}
+
+static struct pasid_state *get_pasid_state(struct device_state *dev_state,
+ u32 pasid)
+{
+ struct pasid_state **ptr, *ret = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_state->lock, flags);
+ ptr = __get_pasid_state_ptr(dev_state, pasid, false);
+
+ if (ptr == NULL)
+ goto out_unlock;
+
+ ret = *ptr;
+ if (ret)
+ atomic_inc(&ret->count);
+
+out_unlock:
+ spin_unlock_irqrestore(&dev_state->lock, flags);
+
+ return ret;
+}
+
+static void free_pasid_state(struct pasid_state *pasid_state)
+{
+ kfree(pasid_state);
+}
+
+static void put_pasid_state(struct pasid_state *pasid_state)
+{
+ if (atomic_dec_and_test(&pasid_state->count))
+ wake_up(&pasid_state->wq);
+}
+
+static void put_pasid_state_wait(struct pasid_state *pasid_state)
+{
+ atomic_dec(&pasid_state->count);
+ wait_event(pasid_state->wq, !atomic_read(&pasid_state->count));
+ free_pasid_state(pasid_state);
+}
+
+static void unbind_pasid(struct pasid_state *pasid_state)
+{
+ struct iommu_domain *domain;
+
+ domain = pasid_state->device_state->domain;
+
+ /*
+ * Mark pasid_state as invalid, no more faults will we added to the
+ * work queue after this is visible everywhere.
+ */
+ pasid_state->invalid = true;
+
+ /* Make sure this is visible */
+ smp_wmb();
+
+ /* After this the device/pasid can't access the mm anymore */
+ amd_iommu_domain_clear_gcr3(domain, pasid_state->pasid);
+
+ /* Make sure no more pending faults are in the queue */
+ flush_workqueue(iommu_wq);
+}
+
+static void free_pasid_states_level1(struct pasid_state **tbl)
+{
+ int i;
+
+ for (i = 0; i < 512; ++i) {
+ if (tbl[i] == NULL)
+ continue;
+
+ free_page((unsigned long)tbl[i]);
+ }
+}
+
+static void free_pasid_states_level2(struct pasid_state **tbl)
+{
+ struct pasid_state **ptr;
+ int i;
+
+ for (i = 0; i < 512; ++i) {
+ if (tbl[i] == NULL)
+ continue;
+
+ ptr = (struct pasid_state **)tbl[i];
+ free_pasid_states_level1(ptr);
+ }
+}
+
+static void free_pasid_states(struct device_state *dev_state)
+{
+ struct pasid_state *pasid_state;
+ int i;
+
+ for (i = 0; i < dev_state->max_pasids; ++i) {
+ pasid_state = get_pasid_state(dev_state, i);
+ if (pasid_state == NULL)
+ continue;
+
+ put_pasid_state(pasid_state);
+
+ /*
+ * This will call the mn_release function and
+ * unbind the PASID
+ */
+ mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);
+
+ put_pasid_state_wait(pasid_state); /* Reference taken in
+ amd_iommu_bind_pasid */
+
+ /* Drop reference taken in amd_iommu_bind_pasid */
+ put_device_state(dev_state);
+ }
+
+ if (dev_state->pasid_levels == 2)
+ free_pasid_states_level2(dev_state->states);
+ else if (dev_state->pasid_levels == 1)
+ free_pasid_states_level1(dev_state->states);
+ else
+ BUG_ON(dev_state->pasid_levels != 0);
+
+ free_page((unsigned long)dev_state->states);
+}
+
+static struct pasid_state *mn_to_state(struct mmu_notifier *mn)
+{
+ return container_of(mn, struct pasid_state, mn);
+}
+
+static void mn_invalidate_range(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct pasid_state *pasid_state;
+ struct device_state *dev_state;
+
+ pasid_state = mn_to_state(mn);
+ dev_state = pasid_state->device_state;
+
+ if ((start ^ (end - 1)) < PAGE_SIZE)
+ amd_iommu_flush_page(dev_state->domain, pasid_state->pasid,
+ start);
+ else
+ amd_iommu_flush_tlb(dev_state->domain, pasid_state->pasid);
+}
+
+static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ struct pasid_state *pasid_state;
+ struct device_state *dev_state;
+ bool run_inv_ctx_cb;
+
+ might_sleep();
+
+ pasid_state = mn_to_state(mn);
+ dev_state = pasid_state->device_state;
+ run_inv_ctx_cb = !pasid_state->invalid;
+
+ if (run_inv_ctx_cb && dev_state->inv_ctx_cb)
+ dev_state->inv_ctx_cb(dev_state->pdev, pasid_state->pasid);
+
+ unbind_pasid(pasid_state);
+}
+
+static const struct mmu_notifier_ops iommu_mn = {
+ .release = mn_release,
+ .invalidate_range = mn_invalidate_range,
+};
+
+static void set_pri_tag_status(struct pasid_state *pasid_state,
+ u16 tag, int status)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pasid_state->lock, flags);
+ pasid_state->pri[tag].status = status;
+ spin_unlock_irqrestore(&pasid_state->lock, flags);
+}
+
+static void finish_pri_tag(struct device_state *dev_state,
+ struct pasid_state *pasid_state,
+ u16 tag)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pasid_state->lock, flags);
+ if (atomic_dec_and_test(&pasid_state->pri[tag].inflight) &&
+ pasid_state->pri[tag].finish) {
+ amd_iommu_complete_ppr(dev_state->pdev, pasid_state->pasid,
+ pasid_state->pri[tag].status, tag);
+ pasid_state->pri[tag].finish = false;
+ pasid_state->pri[tag].status = PPR_SUCCESS;
+ }
+ spin_unlock_irqrestore(&pasid_state->lock, flags);
+}
+
+static void handle_fault_error(struct fault *fault)
+{
+ int status;
+
+ if (!fault->dev_state->inv_ppr_cb) {
+ set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
+ return;
+ }
+
+ status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev,
+ fault->pasid,
+ fault->address,
+ fault->flags);
+ switch (status) {
+ case AMD_IOMMU_INV_PRI_RSP_SUCCESS:
+ set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS);
+ break;
+ case AMD_IOMMU_INV_PRI_RSP_INVALID:
+ set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
+ break;
+ case AMD_IOMMU_INV_PRI_RSP_FAIL:
+ set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static bool access_error(struct vm_area_struct *vma, struct fault *fault)
+{
+ unsigned long requested = 0;
+
+ if (fault->flags & PPR_FAULT_EXEC)
+ requested |= VM_EXEC;
+
+ if (fault->flags & PPR_FAULT_READ)
+ requested |= VM_READ;
+
+ if (fault->flags & PPR_FAULT_WRITE)
+ requested |= VM_WRITE;
+
+ return (requested & ~vma->vm_flags) != 0;
+}
+
+static void do_fault(struct work_struct *work)
+{
+ struct fault *fault = container_of(work, struct fault, work);
+ struct vm_area_struct *vma;
+ vm_fault_t ret = VM_FAULT_ERROR;
+ unsigned int flags = 0;
+ struct mm_struct *mm;
+ u64 address;
+
+ mm = fault->state->mm;
+ address = fault->address;
+
+ if (fault->flags & PPR_FAULT_USER)
+ flags |= FAULT_FLAG_USER;
+ if (fault->flags & PPR_FAULT_WRITE)
+ flags |= FAULT_FLAG_WRITE;
+ flags |= FAULT_FLAG_REMOTE;
+
+ mmap_read_lock(mm);
+ vma = find_extend_vma(mm, address);
+ if (!vma || address < vma->vm_start)
+ /* failed to get a vma in the right range */
+ goto out;
+
+ /* Check if we have the right permissions on the vma */
+ if (access_error(vma, fault))
+ goto out;
+
+ ret = handle_mm_fault(vma, address, flags, NULL);
+out:
+ mmap_read_unlock(mm);
+
+ if (ret & VM_FAULT_ERROR)
+ /* failed to service fault */
+ handle_fault_error(fault);
+
+ finish_pri_tag(fault->dev_state, fault->state, fault->tag);
+
+ put_pasid_state(fault->state);
+
+ kfree(fault);
+}
+
+static int ppr_notifier(struct notifier_block *nb, unsigned long e, void *data)
+{
+ struct amd_iommu_fault *iommu_fault;
+ struct pasid_state *pasid_state;
+ struct device_state *dev_state;
+ struct pci_dev *pdev = NULL;
+ unsigned long flags;
+ struct fault *fault;
+ bool finish;
+ u16 tag, devid;
+ int ret;
+
+ iommu_fault = data;
+ tag = iommu_fault->tag & 0x1ff;
+ finish = (iommu_fault->tag >> 9) & 1;
+
+ devid = iommu_fault->device_id;
+ pdev = pci_get_domain_bus_and_slot(0, PCI_BUS_NUM(devid),
+ devid & 0xff);
+ if (!pdev)
+ return -ENODEV;
+
+ ret = NOTIFY_DONE;
+
+ /* In kdump kernel pci dev is not initialized yet -> send INVALID */
+ if (amd_iommu_is_attach_deferred(NULL, &pdev->dev)) {
+ amd_iommu_complete_ppr(pdev, iommu_fault->pasid,
+ PPR_INVALID, tag);
+ goto out;
+ }
+
+ dev_state = get_device_state(iommu_fault->device_id);
+ if (dev_state == NULL)
+ goto out;
+
+ pasid_state = get_pasid_state(dev_state, iommu_fault->pasid);
+ if (pasid_state == NULL || pasid_state->invalid) {
+ /* We know the device but not the PASID -> send INVALID */
+ amd_iommu_complete_ppr(dev_state->pdev, iommu_fault->pasid,
+ PPR_INVALID, tag);
+ goto out_drop_state;
+ }
+
+ spin_lock_irqsave(&pasid_state->lock, flags);
+ atomic_inc(&pasid_state->pri[tag].inflight);
+ if (finish)
+ pasid_state->pri[tag].finish = true;
+ spin_unlock_irqrestore(&pasid_state->lock, flags);
+
+ fault = kzalloc(sizeof(*fault), GFP_ATOMIC);
+ if (fault == NULL) {
+ /* We are OOM - send success and let the device re-fault */
+ finish_pri_tag(dev_state, pasid_state, tag);
+ goto out_drop_state;
+ }
+
+ fault->dev_state = dev_state;
+ fault->address = iommu_fault->address;
+ fault->state = pasid_state;
+ fault->tag = tag;
+ fault->finish = finish;
+ fault->pasid = iommu_fault->pasid;
+ fault->flags = iommu_fault->flags;
+ INIT_WORK(&fault->work, do_fault);
+
+ queue_work(iommu_wq, &fault->work);
+
+ ret = NOTIFY_OK;
+
+out_drop_state:
+
+ if (ret != NOTIFY_OK && pasid_state)
+ put_pasid_state(pasid_state);
+
+ put_device_state(dev_state);
+
+out:
+ pci_dev_put(pdev);
+ return ret;
+}
+
+static struct notifier_block ppr_nb = {
+ .notifier_call = ppr_notifier,
+};
+
+int amd_iommu_bind_pasid(struct pci_dev *pdev, u32 pasid,
+ struct task_struct *task)
+{
+ struct pasid_state *pasid_state;
+ struct device_state *dev_state;
+ struct mm_struct *mm;
+ u16 devid;
+ int ret;
+
+ might_sleep();
+
+ if (!amd_iommu_v2_supported())
+ return -ENODEV;
+
+ devid = device_id(pdev);
+ dev_state = get_device_state(devid);
+
+ if (dev_state == NULL)
+ return -EINVAL;
+
+ ret = -EINVAL;
+ if (pasid >= dev_state->max_pasids)
+ goto out;
+
+ ret = -ENOMEM;
+ pasid_state = kzalloc(sizeof(*pasid_state), GFP_KERNEL);
+ if (pasid_state == NULL)
+ goto out;
+
+
+ atomic_set(&pasid_state->count, 1);
+ init_waitqueue_head(&pasid_state->wq);
+ spin_lock_init(&pasid_state->lock);
+
+ mm = get_task_mm(task);
+ pasid_state->mm = mm;
+ pasid_state->device_state = dev_state;
+ pasid_state->pasid = pasid;
+ pasid_state->invalid = true; /* Mark as valid only if we are
+ done with setting up the pasid */
+ pasid_state->mn.ops = &iommu_mn;
+
+ if (pasid_state->mm == NULL)
+ goto out_free;
+
+ mmu_notifier_register(&pasid_state->mn, mm);
+
+ ret = set_pasid_state(dev_state, pasid_state, pasid);
+ if (ret)
+ goto out_unregister;
+
+ ret = amd_iommu_domain_set_gcr3(dev_state->domain, pasid,
+ __pa(pasid_state->mm->pgd));
+ if (ret)
+ goto out_clear_state;
+
+ /* Now we are ready to handle faults */
+ pasid_state->invalid = false;
+
+ /*
+ * Drop the reference to the mm_struct here. We rely on the
+ * mmu_notifier release call-back to inform us when the mm
+ * is going away.
+ */
+ mmput(mm);
+
+ return 0;
+
+out_clear_state:
+ clear_pasid_state(dev_state, pasid);
+
+out_unregister:
+ mmu_notifier_unregister(&pasid_state->mn, mm);
+ mmput(mm);
+
+out_free:
+ free_pasid_state(pasid_state);
+
+out:
+ put_device_state(dev_state);
+
+ return ret;
+}
+EXPORT_SYMBOL(amd_iommu_bind_pasid);
+
+void amd_iommu_unbind_pasid(struct pci_dev *pdev, u32 pasid)
+{
+ struct pasid_state *pasid_state;
+ struct device_state *dev_state;
+ u16 devid;
+
+ might_sleep();
+
+ if (!amd_iommu_v2_supported())
+ return;
+
+ devid = device_id(pdev);
+ dev_state = get_device_state(devid);
+ if (dev_state == NULL)
+ return;
+
+ if (pasid >= dev_state->max_pasids)
+ goto out;
+
+ pasid_state = get_pasid_state(dev_state, pasid);
+ if (pasid_state == NULL)
+ goto out;
+ /*
+ * Drop reference taken here. We are safe because we still hold
+ * the reference taken in the amd_iommu_bind_pasid function.
+ */
+ put_pasid_state(pasid_state);
+
+ /* Clear the pasid state so that the pasid can be re-used */
+ clear_pasid_state(dev_state, pasid_state->pasid);
+
+ /*
+ * Call mmu_notifier_unregister to drop our reference
+ * to pasid_state->mm
+ */
+ mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);
+
+ put_pasid_state_wait(pasid_state); /* Reference taken in
+ amd_iommu_bind_pasid */
+out:
+ /* Drop reference taken in this function */
+ put_device_state(dev_state);
+
+ /* Drop reference taken in amd_iommu_bind_pasid */
+ put_device_state(dev_state);
+}
+EXPORT_SYMBOL(amd_iommu_unbind_pasid);
+
+int amd_iommu_init_device(struct pci_dev *pdev, int pasids)
+{
+ struct device_state *dev_state;
+ struct iommu_group *group;
+ unsigned long flags;
+ int ret, tmp;
+ u16 devid;
+
+ might_sleep();
+
+ /*
+ * When memory encryption is active the device is likely not in a
+ * direct-mapped domain. Forbid using IOMMUv2 functionality for now.
+ */
+ if (mem_encrypt_active())
+ return -ENODEV;
+
+ if (!amd_iommu_v2_supported())
+ return -ENODEV;
+
+ if (pasids <= 0 || pasids > (PASID_MASK + 1))
+ return -EINVAL;
+
+ devid = device_id(pdev);
+
+ dev_state = kzalloc(sizeof(*dev_state), GFP_KERNEL);
+ if (dev_state == NULL)
+ return -ENOMEM;
+
+ spin_lock_init(&dev_state->lock);
+ init_waitqueue_head(&dev_state->wq);
+ dev_state->pdev = pdev;
+ dev_state->devid = devid;
+
+ tmp = pasids;
+ for (dev_state->pasid_levels = 0; (tmp - 1) & ~0x1ff; tmp >>= 9)
+ dev_state->pasid_levels += 1;
+
+ atomic_set(&dev_state->count, 1);
+ dev_state->max_pasids = pasids;
+
+ ret = -ENOMEM;
+ dev_state->states = (void *)get_zeroed_page(GFP_KERNEL);
+ if (dev_state->states == NULL)
+ goto out_free_dev_state;
+
+ dev_state->domain = iommu_domain_alloc(&pci_bus_type);
+ if (dev_state->domain == NULL)
+ goto out_free_states;
+
+ amd_iommu_domain_direct_map(dev_state->domain);
+
+ ret = amd_iommu_domain_enable_v2(dev_state->domain, pasids);
+ if (ret)
+ goto out_free_domain;
+
+ group = iommu_group_get(&pdev->dev);
+ if (!group) {
+ ret = -EINVAL;
+ goto out_free_domain;
+ }
+
+ ret = iommu_attach_group(dev_state->domain, group);
+ if (ret != 0)
+ goto out_drop_group;
+
+ iommu_group_put(group);
+
+ spin_lock_irqsave(&state_lock, flags);
+
+ if (__get_device_state(devid) != NULL) {
+ spin_unlock_irqrestore(&state_lock, flags);
+ ret = -EBUSY;
+ goto out_free_domain;
+ }
+
+ list_add_tail(&dev_state->list, &state_list);
+
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return 0;
+
+out_drop_group:
+ iommu_group_put(group);
+
+out_free_domain:
+ iommu_domain_free(dev_state->domain);
+
+out_free_states:
+ free_page((unsigned long)dev_state->states);
+
+out_free_dev_state:
+ kfree(dev_state);
+
+ return ret;
+}
+EXPORT_SYMBOL(amd_iommu_init_device);
+
+void amd_iommu_free_device(struct pci_dev *pdev)
+{
+ struct device_state *dev_state;
+ unsigned long flags;
+ u16 devid;
+
+ if (!amd_iommu_v2_supported())
+ return;
+
+ devid = device_id(pdev);
+
+ spin_lock_irqsave(&state_lock, flags);
+
+ dev_state = __get_device_state(devid);
+ if (dev_state == NULL) {
+ spin_unlock_irqrestore(&state_lock, flags);
+ return;
+ }
+
+ list_del(&dev_state->list);
+
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ /* Get rid of any remaining pasid states */
+ free_pasid_states(dev_state);
+
+ put_device_state(dev_state);
+ /*
+ * Wait until the last reference is dropped before freeing
+ * the device state.
+ */
+ wait_event(dev_state->wq, !atomic_read(&dev_state->count));
+ free_device_state(dev_state);
+}
+EXPORT_SYMBOL(amd_iommu_free_device);
+
+int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev,
+ amd_iommu_invalid_ppr_cb cb)
+{
+ struct device_state *dev_state;
+ unsigned long flags;
+ u16 devid;
+ int ret;
+
+ if (!amd_iommu_v2_supported())
+ return -ENODEV;
+
+ devid = device_id(pdev);
+
+ spin_lock_irqsave(&state_lock, flags);
+
+ ret = -EINVAL;
+ dev_state = __get_device_state(devid);
+ if (dev_state == NULL)
+ goto out_unlock;
+
+ dev_state->inv_ppr_cb = cb;
+
+ ret = 0;
+
+out_unlock:
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(amd_iommu_set_invalid_ppr_cb);
+
+int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev,
+ amd_iommu_invalidate_ctx cb)
+{
+ struct device_state *dev_state;
+ unsigned long flags;
+ u16 devid;
+ int ret;
+
+ if (!amd_iommu_v2_supported())
+ return -ENODEV;
+
+ devid = device_id(pdev);
+
+ spin_lock_irqsave(&state_lock, flags);
+
+ ret = -EINVAL;
+ dev_state = __get_device_state(devid);
+ if (dev_state == NULL)
+ goto out_unlock;
+
+ dev_state->inv_ctx_cb = cb;
+
+ ret = 0;
+
+out_unlock:
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(amd_iommu_set_invalidate_ctx_cb);
+
+static int __init amd_iommu_v2_init(void)
+{
+ int ret;
+
+ if (!amd_iommu_v2_supported()) {
+ pr_info("AMD IOMMUv2 functionality not available on this system - This is not a bug.\n");
+ /*
+ * Load anyway to provide the symbols to other modules
+ * which may use AMD IOMMUv2 optionally.
+ */
+ return 0;
+ }
+
+ spin_lock_init(&state_lock);
+
+ ret = -ENOMEM;
+ iommu_wq = alloc_workqueue("amd_iommu_v2", WQ_MEM_RECLAIM, 0);
+ if (iommu_wq == NULL)
+ goto out;
+
+ amd_iommu_register_ppr_notifier(&ppr_nb);
+
+ pr_info("AMD IOMMUv2 loaded and initialized\n");
+
+ return 0;
+
+out:
+ return ret;
+}
+
+static void __exit amd_iommu_v2_exit(void)
+{
+ struct device_state *dev_state;
+ int i;
+
+ if (!amd_iommu_v2_supported())
+ return;
+
+ amd_iommu_unregister_ppr_notifier(&ppr_nb);
+
+ flush_workqueue(iommu_wq);
+
+ /*
+ * The loop below might call flush_workqueue(), so call
+ * destroy_workqueue() after it
+ */
+ for (i = 0; i < MAX_DEVICES; ++i) {
+ dev_state = get_device_state(i);
+
+ if (dev_state == NULL)
+ continue;
+
+ WARN_ON_ONCE(1);
+
+ put_device_state(dev_state);
+ amd_iommu_free_device(dev_state->pdev);
+ }
+
+ destroy_workqueue(iommu_wq);
+}
+
+module_init(amd_iommu_v2_init);
+module_exit(amd_iommu_v2_exit);
diff --git a/drivers/iommu/amd/quirks.c b/drivers/iommu/amd/quirks.c
new file mode 100644
index 000000000..5120ce4fd
--- /dev/null
+++ b/drivers/iommu/amd/quirks.c
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+/*
+ * Quirks for AMD IOMMU
+ *
+ * Copyright (C) 2019 Kai-Heng Feng <kai.heng.feng@canonical.com>
+ */
+
+#ifdef CONFIG_DMI
+#include <linux/dmi.h>
+
+#include "amd_iommu.h"
+
+#define IVHD_SPECIAL_IOAPIC 1
+
+struct ivrs_quirk_entry {
+ u8 id;
+ u16 devid;
+};
+
+enum {
+ DELL_INSPIRON_7375 = 0,
+ DELL_LATITUDE_5495,
+ LENOVO_IDEAPAD_330S_15ARR,
+};
+
+static const struct ivrs_quirk_entry ivrs_ioapic_quirks[][3] __initconst = {
+ /* ivrs_ioapic[4]=00:14.0 ivrs_ioapic[5]=00:00.2 */
+ [DELL_INSPIRON_7375] = {
+ { .id = 4, .devid = 0xa0 },
+ { .id = 5, .devid = 0x2 },
+ {}
+ },
+ /* ivrs_ioapic[4]=00:14.0 */
+ [DELL_LATITUDE_5495] = {
+ { .id = 4, .devid = 0xa0 },
+ {}
+ },
+ /* ivrs_ioapic[32]=00:14.0 */
+ [LENOVO_IDEAPAD_330S_15ARR] = {
+ { .id = 32, .devid = 0xa0 },
+ {}
+ },
+ {}
+};
+
+static int __init ivrs_ioapic_quirk_cb(const struct dmi_system_id *d)
+{
+ const struct ivrs_quirk_entry *i;
+
+ for (i = d->driver_data; i->id != 0 && i->devid != 0; i++)
+ add_special_device(IVHD_SPECIAL_IOAPIC, i->id, (u16 *)&i->devid, 0);
+
+ return 0;
+}
+
+static const struct dmi_system_id ivrs_quirks[] __initconst = {
+ {
+ .callback = ivrs_ioapic_quirk_cb,
+ .ident = "Dell Inspiron 7375",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 7375"),
+ },
+ .driver_data = (void *)&ivrs_ioapic_quirks[DELL_INSPIRON_7375],
+ },
+ {
+ .callback = ivrs_ioapic_quirk_cb,
+ .ident = "Dell Latitude 5495",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude 5495"),
+ },
+ .driver_data = (void *)&ivrs_ioapic_quirks[DELL_LATITUDE_5495],
+ },
+ {
+ /*
+ * Acer Aspire A315-41 requires the very same workaround as
+ * Dell Latitude 5495
+ */
+ .callback = ivrs_ioapic_quirk_cb,
+ .ident = "Acer Aspire A315-41",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Aspire A315-41"),
+ },
+ .driver_data = (void *)&ivrs_ioapic_quirks[DELL_LATITUDE_5495],
+ },
+ {
+ .callback = ivrs_ioapic_quirk_cb,
+ .ident = "Lenovo ideapad 330S-15ARR",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "81FB"),
+ },
+ .driver_data = (void *)&ivrs_ioapic_quirks[LENOVO_IDEAPAD_330S_15ARR],
+ },
+ {}
+};
+
+void __init amd_iommu_apply_ivrs_quirks(void)
+{
+ dmi_check_system(ivrs_quirks);
+}
+#endif
diff --git a/drivers/iommu/arm/Makefile b/drivers/iommu/arm/Makefile
new file mode 100644
index 000000000..0f9efeab7
--- /dev/null
+++ b/drivers/iommu/arm/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y += arm-smmu/ arm-smmu-v3/
diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile b/drivers/iommu/arm/arm-smmu-v3/Makefile
new file mode 100644
index 000000000..54feb1ecc
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu-v3/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o
+arm_smmu_v3-objs-y += arm-smmu-v3.o
+arm_smmu_v3-objs-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o
+arm_smmu_v3-objs := $(arm_smmu_v3-objs-y)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
new file mode 100644
index 000000000..9255c9600
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Implementation of the IOMMU SVA API for the ARM SMMUv3
+ */
+
+#include <linux/mm.h>
+#include <linux/mmu_context.h>
+#include <linux/slab.h>
+
+#include "arm-smmu-v3.h"
+#include "../../io-pgtable-arm.h"
+
+static DEFINE_MUTEX(sva_lock);
+
+/*
+ * Check if the CPU ASID is available on the SMMU side. If a private context
+ * descriptor is using it, try to replace it.
+ */
+static struct arm_smmu_ctx_desc *
+arm_smmu_share_asid(struct mm_struct *mm, u16 asid)
+{
+ int ret;
+ u32 new_asid;
+ struct arm_smmu_ctx_desc *cd;
+ struct arm_smmu_device *smmu;
+ struct arm_smmu_domain *smmu_domain;
+
+ cd = xa_load(&arm_smmu_asid_xa, asid);
+ if (!cd)
+ return NULL;
+
+ if (cd->mm) {
+ if (WARN_ON(cd->mm != mm))
+ return ERR_PTR(-EINVAL);
+ /* All devices bound to this mm use the same cd struct. */
+ refcount_inc(&cd->refs);
+ return cd;
+ }
+
+ smmu_domain = container_of(cd, struct arm_smmu_domain, s1_cfg.cd);
+ smmu = smmu_domain->smmu;
+
+ ret = xa_alloc(&arm_smmu_asid_xa, &new_asid, cd,
+ XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
+ if (ret)
+ return ERR_PTR(-ENOSPC);
+ /*
+ * Race with unmap: TLB invalidations will start targeting the new ASID,
+ * which isn't assigned yet. We'll do an invalidate-all on the old ASID
+ * later, so it doesn't matter.
+ */
+ cd->asid = new_asid;
+ /*
+ * Update ASID and invalidate CD in all associated masters. There will
+ * be some overlap between use of both ASIDs, until we invalidate the
+ * TLB.
+ */
+ arm_smmu_write_ctx_desc(smmu_domain, 0, cd);
+
+ /* Invalidate TLB entries previously associated with that context */
+ arm_smmu_tlb_inv_asid(smmu, asid);
+
+ xa_erase(&arm_smmu_asid_xa, asid);
+ return NULL;
+}
+
+__maybe_unused
+static struct arm_smmu_ctx_desc *arm_smmu_alloc_shared_cd(struct mm_struct *mm)
+{
+ u16 asid;
+ int err = 0;
+ u64 tcr, par, reg;
+ struct arm_smmu_ctx_desc *cd;
+ struct arm_smmu_ctx_desc *ret = NULL;
+
+ asid = arm64_mm_context_get(mm);
+ if (!asid)
+ return ERR_PTR(-ESRCH);
+
+ cd = kzalloc(sizeof(*cd), GFP_KERNEL);
+ if (!cd) {
+ err = -ENOMEM;
+ goto out_put_context;
+ }
+
+ refcount_set(&cd->refs, 1);
+
+ mutex_lock(&arm_smmu_asid_lock);
+ ret = arm_smmu_share_asid(mm, asid);
+ if (ret) {
+ mutex_unlock(&arm_smmu_asid_lock);
+ goto out_free_cd;
+ }
+
+ err = xa_insert(&arm_smmu_asid_xa, asid, cd, GFP_KERNEL);
+ mutex_unlock(&arm_smmu_asid_lock);
+
+ if (err)
+ goto out_free_asid;
+
+ tcr = FIELD_PREP(CTXDESC_CD_0_TCR_T0SZ, 64ULL - vabits_actual) |
+ FIELD_PREP(CTXDESC_CD_0_TCR_IRGN0, ARM_LPAE_TCR_RGN_WBWA) |
+ FIELD_PREP(CTXDESC_CD_0_TCR_ORGN0, ARM_LPAE_TCR_RGN_WBWA) |
+ FIELD_PREP(CTXDESC_CD_0_TCR_SH0, ARM_LPAE_TCR_SH_IS) |
+ CTXDESC_CD_0_TCR_EPD1 | CTXDESC_CD_0_AA64;
+
+ switch (PAGE_SIZE) {
+ case SZ_4K:
+ tcr |= FIELD_PREP(CTXDESC_CD_0_TCR_TG0, ARM_LPAE_TCR_TG0_4K);
+ break;
+ case SZ_16K:
+ tcr |= FIELD_PREP(CTXDESC_CD_0_TCR_TG0, ARM_LPAE_TCR_TG0_16K);
+ break;
+ case SZ_64K:
+ tcr |= FIELD_PREP(CTXDESC_CD_0_TCR_TG0, ARM_LPAE_TCR_TG0_64K);
+ break;
+ default:
+ WARN_ON(1);
+ err = -EINVAL;
+ goto out_free_asid;
+ }
+
+ reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
+ par = cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR0_PARANGE_SHIFT);
+ tcr |= FIELD_PREP(CTXDESC_CD_0_TCR_IPS, par);
+
+ cd->ttbr = virt_to_phys(mm->pgd);
+ cd->tcr = tcr;
+ /*
+ * MAIR value is pretty much constant and global, so we can just get it
+ * from the current CPU register
+ */
+ cd->mair = read_sysreg(mair_el1);
+ cd->asid = asid;
+ cd->mm = mm;
+
+ return cd;
+
+out_free_asid:
+ arm_smmu_free_asid(cd);
+out_free_cd:
+ kfree(cd);
+out_put_context:
+ arm64_mm_context_put(mm);
+ return err < 0 ? ERR_PTR(err) : ret;
+}
+
+__maybe_unused
+static void arm_smmu_free_shared_cd(struct arm_smmu_ctx_desc *cd)
+{
+ if (arm_smmu_free_asid(cd)) {
+ /* Unpin ASID */
+ arm64_mm_context_put(cd->mm);
+ kfree(cd);
+ }
+}
+
+bool arm_smmu_sva_supported(struct arm_smmu_device *smmu)
+{
+ unsigned long reg, fld;
+ unsigned long oas;
+ unsigned long asid_bits;
+ u32 feat_mask = ARM_SMMU_FEAT_BTM | ARM_SMMU_FEAT_COHERENCY;
+
+ if (vabits_actual == 52)
+ feat_mask |= ARM_SMMU_FEAT_VAX;
+
+ if ((smmu->features & feat_mask) != feat_mask)
+ return false;
+
+ if (!(smmu->pgsize_bitmap & PAGE_SIZE))
+ return false;
+
+ /*
+ * Get the smallest PA size of all CPUs (sanitized by cpufeature). We're
+ * not even pretending to support AArch32 here. Abort if the MMU outputs
+ * addresses larger than what we support.
+ */
+ reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
+ fld = cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR0_PARANGE_SHIFT);
+ oas = id_aa64mmfr0_parange_to_phys_shift(fld);
+ if (smmu->oas < oas)
+ return false;
+
+ /* We can support bigger ASIDs than the CPU, but not smaller */
+ fld = cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR0_ASID_SHIFT);
+ asid_bits = fld ? 16 : 8;
+ if (smmu->asid_bits < asid_bits)
+ return false;
+
+ /*
+ * See max_pinned_asids in arch/arm64/mm/context.c. The following is
+ * generally the maximum number of bindable processes.
+ */
+ if (arm64_kernel_unmapped_at_el0())
+ asid_bits--;
+ dev_dbg(smmu->dev, "%d shared contexts\n", (1 << asid_bits) -
+ num_possible_cpus() - 2);
+
+ return true;
+}
+
+static bool arm_smmu_iopf_supported(struct arm_smmu_master *master)
+{
+ return false;
+}
+
+bool arm_smmu_master_sva_supported(struct arm_smmu_master *master)
+{
+ if (!(master->smmu->features & ARM_SMMU_FEAT_SVA))
+ return false;
+
+ /* SSID and IOPF support are mandatory for the moment */
+ return master->ssid_bits && arm_smmu_iopf_supported(master);
+}
+
+bool arm_smmu_master_sva_enabled(struct arm_smmu_master *master)
+{
+ bool enabled;
+
+ mutex_lock(&sva_lock);
+ enabled = master->sva_enabled;
+ mutex_unlock(&sva_lock);
+ return enabled;
+}
+
+int arm_smmu_master_enable_sva(struct arm_smmu_master *master)
+{
+ mutex_lock(&sva_lock);
+ master->sva_enabled = true;
+ mutex_unlock(&sva_lock);
+
+ return 0;
+}
+
+int arm_smmu_master_disable_sva(struct arm_smmu_master *master)
+{
+ mutex_lock(&sva_lock);
+ if (!list_empty(&master->bonds)) {
+ dev_err(master->dev, "cannot disable SVA, device is bound\n");
+ mutex_unlock(&sva_lock);
+ return -EBUSY;
+ }
+ master->sva_enabled = false;
+ mutex_unlock(&sva_lock);
+
+ return 0;
+}
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
new file mode 100644
index 000000000..982c42c87
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -0,0 +1,3641 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * IOMMU API for ARM architected SMMUv3 implementations.
+ *
+ * Copyright (C) 2015 ARM Limited
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ *
+ * This driver is powered by bad coffee and bombay mix.
+ */
+
+#include <linux/acpi.h>
+#include <linux/acpi_iort.h>
+#include <linux/bitops.h>
+#include <linux/crash_dump.h>
+#include <linux/delay.h>
+#include <linux/dma-iommu.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/io-pgtable.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/msi.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_iommu.h>
+#include <linux/of_platform.h>
+#include <linux/pci.h>
+#include <linux/pci-ats.h>
+#include <linux/platform_device.h>
+
+#include <linux/amba/bus.h>
+
+#include "arm-smmu-v3.h"
+
+static bool disable_bypass = 1;
+module_param(disable_bypass, bool, 0444);
+MODULE_PARM_DESC(disable_bypass,
+ "Disable bypass streams such that incoming transactions from devices that are not attached to an iommu domain will report an abort back to the device and will not be allowed to pass through the SMMU.");
+
+static bool disable_msipolling;
+module_param(disable_msipolling, bool, 0444);
+MODULE_PARM_DESC(disable_msipolling,
+ "Disable MSI-based polling for CMD_SYNC completion.");
+
+enum arm_smmu_msi_index {
+ EVTQ_MSI_INDEX,
+ GERROR_MSI_INDEX,
+ PRIQ_MSI_INDEX,
+ ARM_SMMU_MAX_MSIS,
+};
+
+static phys_addr_t arm_smmu_msi_cfg[ARM_SMMU_MAX_MSIS][3] = {
+ [EVTQ_MSI_INDEX] = {
+ ARM_SMMU_EVTQ_IRQ_CFG0,
+ ARM_SMMU_EVTQ_IRQ_CFG1,
+ ARM_SMMU_EVTQ_IRQ_CFG2,
+ },
+ [GERROR_MSI_INDEX] = {
+ ARM_SMMU_GERROR_IRQ_CFG0,
+ ARM_SMMU_GERROR_IRQ_CFG1,
+ ARM_SMMU_GERROR_IRQ_CFG2,
+ },
+ [PRIQ_MSI_INDEX] = {
+ ARM_SMMU_PRIQ_IRQ_CFG0,
+ ARM_SMMU_PRIQ_IRQ_CFG1,
+ ARM_SMMU_PRIQ_IRQ_CFG2,
+ },
+};
+
+struct arm_smmu_option_prop {
+ u32 opt;
+ const char *prop;
+};
+
+DEFINE_XARRAY_ALLOC1(arm_smmu_asid_xa);
+DEFINE_MUTEX(arm_smmu_asid_lock);
+
+static struct arm_smmu_option_prop arm_smmu_options[] = {
+ { ARM_SMMU_OPT_SKIP_PREFETCH, "hisilicon,broken-prefetch-cmd" },
+ { ARM_SMMU_OPT_PAGE0_REGS_ONLY, "cavium,cn9900-broken-page1-regspace"},
+ { 0, NULL},
+};
+
+static inline void __iomem *arm_smmu_page1_fixup(unsigned long offset,
+ struct arm_smmu_device *smmu)
+{
+ if (offset > SZ_64K)
+ return smmu->page1 + offset - SZ_64K;
+
+ return smmu->base + offset;
+}
+
+static struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
+{
+ return container_of(dom, struct arm_smmu_domain, domain);
+}
+
+static void parse_driver_options(struct arm_smmu_device *smmu)
+{
+ int i = 0;
+
+ do {
+ if (of_property_read_bool(smmu->dev->of_node,
+ arm_smmu_options[i].prop)) {
+ smmu->options |= arm_smmu_options[i].opt;
+ dev_notice(smmu->dev, "option %s\n",
+ arm_smmu_options[i].prop);
+ }
+ } while (arm_smmu_options[++i].opt);
+}
+
+/* Low-level queue manipulation functions */
+static bool queue_has_space(struct arm_smmu_ll_queue *q, u32 n)
+{
+ u32 space, prod, cons;
+
+ prod = Q_IDX(q, q->prod);
+ cons = Q_IDX(q, q->cons);
+
+ if (Q_WRP(q, q->prod) == Q_WRP(q, q->cons))
+ space = (1 << q->max_n_shift) - (prod - cons);
+ else
+ space = cons - prod;
+
+ return space >= n;
+}
+
+static bool queue_full(struct arm_smmu_ll_queue *q)
+{
+ return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&
+ Q_WRP(q, q->prod) != Q_WRP(q, q->cons);
+}
+
+static bool queue_empty(struct arm_smmu_ll_queue *q)
+{
+ return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&
+ Q_WRP(q, q->prod) == Q_WRP(q, q->cons);
+}
+
+static bool queue_consumed(struct arm_smmu_ll_queue *q, u32 prod)
+{
+ return ((Q_WRP(q, q->cons) == Q_WRP(q, prod)) &&
+ (Q_IDX(q, q->cons) > Q_IDX(q, prod))) ||
+ ((Q_WRP(q, q->cons) != Q_WRP(q, prod)) &&
+ (Q_IDX(q, q->cons) <= Q_IDX(q, prod)));
+}
+
+static void queue_sync_cons_out(struct arm_smmu_queue *q)
+{
+ /*
+ * Ensure that all CPU accesses (reads and writes) to the queue
+ * are complete before we update the cons pointer.
+ */
+ __iomb();
+ writel_relaxed(q->llq.cons, q->cons_reg);
+}
+
+static void queue_inc_cons(struct arm_smmu_ll_queue *q)
+{
+ u32 cons = (Q_WRP(q, q->cons) | Q_IDX(q, q->cons)) + 1;
+ q->cons = Q_OVF(q->cons) | Q_WRP(q, cons) | Q_IDX(q, cons);
+}
+
+static void queue_sync_cons_ovf(struct arm_smmu_queue *q)
+{
+ struct arm_smmu_ll_queue *llq = &q->llq;
+
+ if (likely(Q_OVF(llq->prod) == Q_OVF(llq->cons)))
+ return;
+
+ llq->cons = Q_OVF(llq->prod) | Q_WRP(llq, llq->cons) |
+ Q_IDX(llq, llq->cons);
+ queue_sync_cons_out(q);
+}
+
+static int queue_sync_prod_in(struct arm_smmu_queue *q)
+{
+ u32 prod;
+ int ret = 0;
+
+ /*
+ * We can't use the _relaxed() variant here, as we must prevent
+ * speculative reads of the queue before we have determined that
+ * prod has indeed moved.
+ */
+ prod = readl(q->prod_reg);
+
+ if (Q_OVF(prod) != Q_OVF(q->llq.prod))
+ ret = -EOVERFLOW;
+
+ q->llq.prod = prod;
+ return ret;
+}
+
+static u32 queue_inc_prod_n(struct arm_smmu_ll_queue *q, int n)
+{
+ u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + n;
+ return Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
+}
+
+static void queue_poll_init(struct arm_smmu_device *smmu,
+ struct arm_smmu_queue_poll *qp)
+{
+ qp->delay = 1;
+ qp->spin_cnt = 0;
+ qp->wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
+ qp->timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US);
+}
+
+static int queue_poll(struct arm_smmu_queue_poll *qp)
+{
+ if (ktime_compare(ktime_get(), qp->timeout) > 0)
+ return -ETIMEDOUT;
+
+ if (qp->wfe) {
+ wfe();
+ } else if (++qp->spin_cnt < ARM_SMMU_POLL_SPIN_COUNT) {
+ cpu_relax();
+ } else {
+ udelay(qp->delay);
+ qp->delay *= 2;
+ qp->spin_cnt = 0;
+ }
+
+ return 0;
+}
+
+static void queue_write(__le64 *dst, u64 *src, size_t n_dwords)
+{
+ int i;
+
+ for (i = 0; i < n_dwords; ++i)
+ *dst++ = cpu_to_le64(*src++);
+}
+
+static void queue_read(u64 *dst, __le64 *src, size_t n_dwords)
+{
+ int i;
+
+ for (i = 0; i < n_dwords; ++i)
+ *dst++ = le64_to_cpu(*src++);
+}
+
+static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent)
+{
+ if (queue_empty(&q->llq))
+ return -EAGAIN;
+
+ queue_read(ent, Q_ENT(q, q->llq.cons), q->ent_dwords);
+ queue_inc_cons(&q->llq);
+ queue_sync_cons_out(q);
+ return 0;
+}
+
+/* High-level queue accessors */
+static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
+{
+ memset(cmd, 0, 1 << CMDQ_ENT_SZ_SHIFT);
+ cmd[0] |= FIELD_PREP(CMDQ_0_OP, ent->opcode);
+
+ switch (ent->opcode) {
+ case CMDQ_OP_TLBI_EL2_ALL:
+ case CMDQ_OP_TLBI_NSNH_ALL:
+ break;
+ case CMDQ_OP_PREFETCH_CFG:
+ cmd[0] |= FIELD_PREP(CMDQ_PREFETCH_0_SID, ent->prefetch.sid);
+ cmd[1] |= FIELD_PREP(CMDQ_PREFETCH_1_SIZE, ent->prefetch.size);
+ cmd[1] |= ent->prefetch.addr & CMDQ_PREFETCH_1_ADDR_MASK;
+ break;
+ case CMDQ_OP_CFGI_CD:
+ cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SSID, ent->cfgi.ssid);
+ fallthrough;
+ case CMDQ_OP_CFGI_STE:
+ cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SID, ent->cfgi.sid);
+ cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_LEAF, ent->cfgi.leaf);
+ break;
+ case CMDQ_OP_CFGI_CD_ALL:
+ cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SID, ent->cfgi.sid);
+ break;
+ case CMDQ_OP_CFGI_ALL:
+ /* Cover the entire SID range */
+ cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_RANGE, 31);
+ break;
+ case CMDQ_OP_TLBI_NH_VA:
+ cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
+ cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
+ cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
+ cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
+ cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
+ cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
+ cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
+ cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK;
+ break;
+ case CMDQ_OP_TLBI_S2_IPA:
+ cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
+ cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
+ cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
+ cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
+ cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
+ cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
+ cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_IPA_MASK;
+ break;
+ case CMDQ_OP_TLBI_NH_ASID:
+ cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
+ fallthrough;
+ case CMDQ_OP_TLBI_S12_VMALL:
+ cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
+ break;
+ case CMDQ_OP_ATC_INV:
+ cmd[0] |= FIELD_PREP(CMDQ_0_SSV, ent->substream_valid);
+ cmd[0] |= FIELD_PREP(CMDQ_ATC_0_GLOBAL, ent->atc.global);
+ cmd[0] |= FIELD_PREP(CMDQ_ATC_0_SSID, ent->atc.ssid);
+ cmd[0] |= FIELD_PREP(CMDQ_ATC_0_SID, ent->atc.sid);
+ cmd[1] |= FIELD_PREP(CMDQ_ATC_1_SIZE, ent->atc.size);
+ cmd[1] |= ent->atc.addr & CMDQ_ATC_1_ADDR_MASK;
+ break;
+ case CMDQ_OP_PRI_RESP:
+ cmd[0] |= FIELD_PREP(CMDQ_0_SSV, ent->substream_valid);
+ cmd[0] |= FIELD_PREP(CMDQ_PRI_0_SSID, ent->pri.ssid);
+ cmd[0] |= FIELD_PREP(CMDQ_PRI_0_SID, ent->pri.sid);
+ cmd[1] |= FIELD_PREP(CMDQ_PRI_1_GRPID, ent->pri.grpid);
+ switch (ent->pri.resp) {
+ case PRI_RESP_DENY:
+ case PRI_RESP_FAIL:
+ case PRI_RESP_SUCC:
+ break;
+ default:
+ return -EINVAL;
+ }
+ cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp);
+ break;
+ case CMDQ_OP_CMD_SYNC:
+ if (ent->sync.msiaddr) {
+ cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_IRQ);
+ cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
+ } else {
+ cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV);
+ }
+ cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH);
+ cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB);
+ break;
+ default:
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
+ u32 prod)
+{
+ struct arm_smmu_queue *q = &smmu->cmdq.q;
+ struct arm_smmu_cmdq_ent ent = {
+ .opcode = CMDQ_OP_CMD_SYNC,
+ };
+
+ /*
+ * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI
+ * payload, so the write will zero the entire command on that platform.
+ */
+ if (smmu->options & ARM_SMMU_OPT_MSIPOLL) {
+ ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) *
+ q->ent_dwords * 8;
+ }
+
+ arm_smmu_cmdq_build_cmd(cmd, &ent);
+}
+
+static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
+{
+ static const char *cerror_str[] = {
+ [CMDQ_ERR_CERROR_NONE_IDX] = "No error",
+ [CMDQ_ERR_CERROR_ILL_IDX] = "Illegal command",
+ [CMDQ_ERR_CERROR_ABT_IDX] = "Abort on command fetch",
+ [CMDQ_ERR_CERROR_ATC_INV_IDX] = "ATC invalidate timeout",
+ };
+
+ int i;
+ u64 cmd[CMDQ_ENT_DWORDS];
+ struct arm_smmu_queue *q = &smmu->cmdq.q;
+ u32 cons = readl_relaxed(q->cons_reg);
+ u32 idx = FIELD_GET(CMDQ_CONS_ERR, cons);
+ struct arm_smmu_cmdq_ent cmd_sync = {
+ .opcode = CMDQ_OP_CMD_SYNC,
+ };
+
+ dev_err(smmu->dev, "CMDQ error (cons 0x%08x): %s\n", cons,
+ idx < ARRAY_SIZE(cerror_str) ? cerror_str[idx] : "Unknown");
+
+ switch (idx) {
+ case CMDQ_ERR_CERROR_ABT_IDX:
+ dev_err(smmu->dev, "retrying command fetch\n");
+ case CMDQ_ERR_CERROR_NONE_IDX:
+ return;
+ case CMDQ_ERR_CERROR_ATC_INV_IDX:
+ /*
+ * ATC Invalidation Completion timeout. CONS is still pointing
+ * at the CMD_SYNC. Attempt to complete other pending commands
+ * by repeating the CMD_SYNC, though we might well end up back
+ * here since the ATC invalidation may still be pending.
+ */
+ return;
+ case CMDQ_ERR_CERROR_ILL_IDX:
+ default:
+ break;
+ }
+
+ /*
+ * We may have concurrent producers, so we need to be careful
+ * not to touch any of the shadow cmdq state.
+ */
+ queue_read(cmd, Q_ENT(q, cons), q->ent_dwords);
+ dev_err(smmu->dev, "skipping command in error state:\n");
+ for (i = 0; i < ARRAY_SIZE(cmd); ++i)
+ dev_err(smmu->dev, "\t0x%016llx\n", (unsigned long long)cmd[i]);
+
+ /* Convert the erroneous command into a CMD_SYNC */
+ if (arm_smmu_cmdq_build_cmd(cmd, &cmd_sync)) {
+ dev_err(smmu->dev, "failed to convert to CMD_SYNC\n");
+ return;
+ }
+
+ queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
+}
+
+/*
+ * Command queue locking.
+ * This is a form of bastardised rwlock with the following major changes:
+ *
+ * - The only LOCK routines are exclusive_trylock() and shared_lock().
+ * Neither have barrier semantics, and instead provide only a control
+ * dependency.
+ *
+ * - The UNLOCK routines are supplemented with shared_tryunlock(), which
+ * fails if the caller appears to be the last lock holder (yes, this is
+ * racy). All successful UNLOCK routines have RELEASE semantics.
+ */
+static void arm_smmu_cmdq_shared_lock(struct arm_smmu_cmdq *cmdq)
+{
+ int val;
+
+ /*
+ * We can try to avoid the cmpxchg() loop by simply incrementing the
+ * lock counter. When held in exclusive state, the lock counter is set
+ * to INT_MIN so these increments won't hurt as the value will remain
+ * negative.
+ */
+ if (atomic_fetch_inc_relaxed(&cmdq->lock) >= 0)
+ return;
+
+ do {
+ val = atomic_cond_read_relaxed(&cmdq->lock, VAL >= 0);
+ } while (atomic_cmpxchg_relaxed(&cmdq->lock, val, val + 1) != val);
+}
+
+static void arm_smmu_cmdq_shared_unlock(struct arm_smmu_cmdq *cmdq)
+{
+ (void)atomic_dec_return_release(&cmdq->lock);
+}
+
+static bool arm_smmu_cmdq_shared_tryunlock(struct arm_smmu_cmdq *cmdq)
+{
+ if (atomic_read(&cmdq->lock) == 1)
+ return false;
+
+ arm_smmu_cmdq_shared_unlock(cmdq);
+ return true;
+}
+
+#define arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags) \
+({ \
+ bool __ret; \
+ local_irq_save(flags); \
+ __ret = !atomic_cmpxchg_relaxed(&cmdq->lock, 0, INT_MIN); \
+ if (!__ret) \
+ local_irq_restore(flags); \
+ __ret; \
+})
+
+#define arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags) \
+({ \
+ atomic_set_release(&cmdq->lock, 0); \
+ local_irq_restore(flags); \
+})
+
+
+/*
+ * Command queue insertion.
+ * This is made fiddly by our attempts to achieve some sort of scalability
+ * since there is one queue shared amongst all of the CPUs in the system. If
+ * you like mixed-size concurrency, dependency ordering and relaxed atomics,
+ * then you'll *love* this monstrosity.
+ *
+ * The basic idea is to split the queue up into ranges of commands that are
+ * owned by a given CPU; the owner may not have written all of the commands
+ * itself, but is responsible for advancing the hardware prod pointer when
+ * the time comes. The algorithm is roughly:
+ *
+ * 1. Allocate some space in the queue. At this point we also discover
+ * whether the head of the queue is currently owned by another CPU,
+ * or whether we are the owner.
+ *
+ * 2. Write our commands into our allocated slots in the queue.
+ *
+ * 3. Mark our slots as valid in arm_smmu_cmdq.valid_map.
+ *
+ * 4. If we are an owner:
+ * a. Wait for the previous owner to finish.
+ * b. Mark the queue head as unowned, which tells us the range
+ * that we are responsible for publishing.
+ * c. Wait for all commands in our owned range to become valid.
+ * d. Advance the hardware prod pointer.
+ * e. Tell the next owner we've finished.
+ *
+ * 5. If we are inserting a CMD_SYNC (we may or may not have been an
+ * owner), then we need to stick around until it has completed:
+ * a. If we have MSIs, the SMMU can write back into the CMD_SYNC
+ * to clear the first 4 bytes.
+ * b. Otherwise, we spin waiting for the hardware cons pointer to
+ * advance past our command.
+ *
+ * The devil is in the details, particularly the use of locking for handling
+ * SYNC completion and freeing up space in the queue before we think that it is
+ * full.
+ */
+static void __arm_smmu_cmdq_poll_set_valid_map(struct arm_smmu_cmdq *cmdq,
+ u32 sprod, u32 eprod, bool set)
+{
+ u32 swidx, sbidx, ewidx, ebidx;
+ struct arm_smmu_ll_queue llq = {
+ .max_n_shift = cmdq->q.llq.max_n_shift,
+ .prod = sprod,
+ };
+
+ ewidx = BIT_WORD(Q_IDX(&llq, eprod));
+ ebidx = Q_IDX(&llq, eprod) % BITS_PER_LONG;
+
+ while (llq.prod != eprod) {
+ unsigned long mask;
+ atomic_long_t *ptr;
+ u32 limit = BITS_PER_LONG;
+
+ swidx = BIT_WORD(Q_IDX(&llq, llq.prod));
+ sbidx = Q_IDX(&llq, llq.prod) % BITS_PER_LONG;
+
+ ptr = &cmdq->valid_map[swidx];
+
+ if ((swidx == ewidx) && (sbidx < ebidx))
+ limit = ebidx;
+
+ mask = GENMASK(limit - 1, sbidx);
+
+ /*
+ * The valid bit is the inverse of the wrap bit. This means
+ * that a zero-initialised queue is invalid and, after marking
+ * all entries as valid, they become invalid again when we
+ * wrap.
+ */
+ if (set) {
+ atomic_long_xor(mask, ptr);
+ } else { /* Poll */
+ unsigned long valid;
+
+ valid = (ULONG_MAX + !!Q_WRP(&llq, llq.prod)) & mask;
+ atomic_long_cond_read_relaxed(ptr, (VAL & mask) == valid);
+ }
+
+ llq.prod = queue_inc_prod_n(&llq, limit - sbidx);
+ }
+}
+
+/* Mark all entries in the range [sprod, eprod) as valid */
+static void arm_smmu_cmdq_set_valid_map(struct arm_smmu_cmdq *cmdq,
+ u32 sprod, u32 eprod)
+{
+ __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, true);
+}
+
+/* Wait for all entries in the range [sprod, eprod) to become valid */
+static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq,
+ u32 sprod, u32 eprod)
+{
+ __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, false);
+}
+
+/* Wait for the command queue to become non-full */
+static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
+ struct arm_smmu_ll_queue *llq)
+{
+ unsigned long flags;
+ struct arm_smmu_queue_poll qp;
+ struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+ int ret = 0;
+
+ /*
+ * Try to update our copy of cons by grabbing exclusive cmdq access. If
+ * that fails, spin until somebody else updates it for us.
+ */
+ if (arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)) {
+ WRITE_ONCE(cmdq->q.llq.cons, readl_relaxed(cmdq->q.cons_reg));
+ arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags);
+ llq->val = READ_ONCE(cmdq->q.llq.val);
+ return 0;
+ }
+
+ queue_poll_init(smmu, &qp);
+ do {
+ llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+ if (!queue_full(llq))
+ break;
+
+ ret = queue_poll(&qp);
+ } while (!ret);
+
+ return ret;
+}
+
+/*
+ * Wait until the SMMU signals a CMD_SYNC completion MSI.
+ * Must be called with the cmdq lock held in some capacity.
+ */
+static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
+ struct arm_smmu_ll_queue *llq)
+{
+ int ret = 0;
+ struct arm_smmu_queue_poll qp;
+ struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+ u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod));
+
+ queue_poll_init(smmu, &qp);
+
+ /*
+ * The MSI won't generate an event, since it's being written back
+ * into the command queue.
+ */
+ qp.wfe = false;
+ smp_cond_load_relaxed(cmd, !VAL || (ret = queue_poll(&qp)));
+ llq->cons = ret ? llq->prod : queue_inc_prod_n(llq, 1);
+ return ret;
+}
+
+/*
+ * Wait until the SMMU cons index passes llq->prod.
+ * Must be called with the cmdq lock held in some capacity.
+ */
+static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
+ struct arm_smmu_ll_queue *llq)
+{
+ struct arm_smmu_queue_poll qp;
+ struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+ u32 prod = llq->prod;
+ int ret = 0;
+
+ queue_poll_init(smmu, &qp);
+ llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+ do {
+ if (queue_consumed(llq, prod))
+ break;
+
+ ret = queue_poll(&qp);
+
+ /*
+ * This needs to be a readl() so that our subsequent call
+ * to arm_smmu_cmdq_shared_tryunlock() can fail accurately.
+ *
+ * Specifically, we need to ensure that we observe all
+ * shared_lock()s by other CMD_SYNCs that share our owner,
+ * so that a failing call to tryunlock() means that we're
+ * the last one out and therefore we can safely advance
+ * cmdq->q.llq.cons. Roughly speaking:
+ *
+ * CPU 0 CPU1 CPU2 (us)
+ *
+ * if (sync)
+ * shared_lock();
+ *
+ * dma_wmb();
+ * set_valid_map();
+ *
+ * if (owner) {
+ * poll_valid_map();
+ * <control dependency>
+ * writel(prod_reg);
+ *
+ * readl(cons_reg);
+ * tryunlock();
+ *
+ * Requires us to see CPU 0's shared_lock() acquisition.
+ */
+ llq->cons = readl(cmdq->q.cons_reg);
+ } while (!ret);
+
+ return ret;
+}
+
+static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
+ struct arm_smmu_ll_queue *llq)
+{
+ if (smmu->options & ARM_SMMU_OPT_MSIPOLL)
+ return __arm_smmu_cmdq_poll_until_msi(smmu, llq);
+
+ return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
+}
+
+static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
+ u32 prod, int n)
+{
+ int i;
+ struct arm_smmu_ll_queue llq = {
+ .max_n_shift = cmdq->q.llq.max_n_shift,
+ .prod = prod,
+ };
+
+ for (i = 0; i < n; ++i) {
+ u64 *cmd = &cmds[i * CMDQ_ENT_DWORDS];
+
+ prod = queue_inc_prod_n(&llq, i);
+ queue_write(Q_ENT(&cmdq->q, prod), cmd, CMDQ_ENT_DWORDS);
+ }
+}
+
+/*
+ * This is the actual insertion function, and provides the following
+ * ordering guarantees to callers:
+ *
+ * - There is a dma_wmb() before publishing any commands to the queue.
+ * This can be relied upon to order prior writes to data structures
+ * in memory (such as a CD or an STE) before the command.
+ *
+ * - On completion of a CMD_SYNC, there is a control dependency.
+ * This can be relied upon to order subsequent writes to memory (e.g.
+ * freeing an IOVA) after completion of the CMD_SYNC.
+ *
+ * - Command insertion is totally ordered, so if two CPUs each race to
+ * insert their own list of commands then all of the commands from one
+ * CPU will appear before any of the commands from the other CPU.
+ */
+static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
+ u64 *cmds, int n, bool sync)
+{
+ u64 cmd_sync[CMDQ_ENT_DWORDS];
+ u32 prod;
+ unsigned long flags;
+ bool owner;
+ struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+ struct arm_smmu_ll_queue llq = {
+ .max_n_shift = cmdq->q.llq.max_n_shift,
+ }, head = llq;
+ int ret = 0;
+
+ /* 1. Allocate some space in the queue */
+ local_irq_save(flags);
+ llq.val = READ_ONCE(cmdq->q.llq.val);
+ do {
+ u64 old;
+
+ while (!queue_has_space(&llq, n + sync)) {
+ local_irq_restore(flags);
+ if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
+ dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
+ local_irq_save(flags);
+ }
+
+ head.cons = llq.cons;
+ head.prod = queue_inc_prod_n(&llq, n + sync) |
+ CMDQ_PROD_OWNED_FLAG;
+
+ old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
+ if (old == llq.val)
+ break;
+
+ llq.val = old;
+ } while (1);
+ owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
+ head.prod &= ~CMDQ_PROD_OWNED_FLAG;
+ llq.prod &= ~CMDQ_PROD_OWNED_FLAG;
+
+ /*
+ * 2. Write our commands into the queue
+ * Dependency ordering from the cmpxchg() loop above.
+ */
+ arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n);
+ if (sync) {
+ prod = queue_inc_prod_n(&llq, n);
+ arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod);
+ queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS);
+
+ /*
+ * In order to determine completion of our CMD_SYNC, we must
+ * ensure that the queue can't wrap twice without us noticing.
+ * We achieve that by taking the cmdq lock as shared before
+ * marking our slot as valid.
+ */
+ arm_smmu_cmdq_shared_lock(cmdq);
+ }
+
+ /* 3. Mark our slots as valid, ensuring commands are visible first */
+ dma_wmb();
+ arm_smmu_cmdq_set_valid_map(cmdq, llq.prod, head.prod);
+
+ /* 4. If we are the owner, take control of the SMMU hardware */
+ if (owner) {
+ /* a. Wait for previous owner to finish */
+ atomic_cond_read_relaxed(&cmdq->owner_prod, VAL == llq.prod);
+
+ /* b. Stop gathering work by clearing the owned flag */
+ prod = atomic_fetch_andnot_relaxed(CMDQ_PROD_OWNED_FLAG,
+ &cmdq->q.llq.atomic.prod);
+ prod &= ~CMDQ_PROD_OWNED_FLAG;
+
+ /*
+ * c. Wait for any gathered work to be written to the queue.
+ * Note that we read our own entries so that we have the control
+ * dependency required by (d).
+ */
+ arm_smmu_cmdq_poll_valid_map(cmdq, llq.prod, prod);
+
+ /*
+ * d. Advance the hardware prod pointer
+ * Control dependency ordering from the entries becoming valid.
+ */
+ writel_relaxed(prod, cmdq->q.prod_reg);
+
+ /*
+ * e. Tell the next owner we're done
+ * Make sure we've updated the hardware first, so that we don't
+ * race to update prod and potentially move it backwards.
+ */
+ atomic_set_release(&cmdq->owner_prod, prod);
+ }
+
+ /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */
+ if (sync) {
+ llq.prod = queue_inc_prod_n(&llq, n);
+ ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq);
+ if (ret) {
+ dev_err_ratelimited(smmu->dev,
+ "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n",
+ llq.prod,
+ readl_relaxed(cmdq->q.prod_reg),
+ readl_relaxed(cmdq->q.cons_reg));
+ }
+
+ /*
+ * Try to unlock the cmdq lock. This will fail if we're the last
+ * reader, in which case we can safely update cmdq->q.llq.cons
+ */
+ if (!arm_smmu_cmdq_shared_tryunlock(cmdq)) {
+ WRITE_ONCE(cmdq->q.llq.cons, llq.cons);
+ arm_smmu_cmdq_shared_unlock(cmdq);
+ }
+ }
+
+ local_irq_restore(flags);
+ return ret;
+}
+
+static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq_ent *ent)
+{
+ u64 cmd[CMDQ_ENT_DWORDS];
+
+ if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
+ dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
+ ent->opcode);
+ return -EINVAL;
+ }
+
+ return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
+}
+
+static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
+{
+ return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
+}
+
+static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq_batch *cmds,
+ struct arm_smmu_cmdq_ent *cmd)
+{
+ if (cmds->num == CMDQ_BATCH_ENTRIES) {
+ arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
+ cmds->num = 0;
+ }
+ arm_smmu_cmdq_build_cmd(&cmds->cmds[cmds->num * CMDQ_ENT_DWORDS], cmd);
+ cmds->num++;
+}
+
+static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq_batch *cmds)
+{
+ return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
+}
+
+/* Context descriptor manipulation functions */
+void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid)
+{
+ struct arm_smmu_cmdq_ent cmd = {
+ .opcode = CMDQ_OP_TLBI_NH_ASID,
+ .tlbi.asid = asid,
+ };
+
+ arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+ arm_smmu_cmdq_issue_sync(smmu);
+}
+
+static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain,
+ int ssid, bool leaf)
+{
+ size_t i;
+ unsigned long flags;
+ struct arm_smmu_master *master;
+ struct arm_smmu_cmdq_batch cmds = {};
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ struct arm_smmu_cmdq_ent cmd = {
+ .opcode = CMDQ_OP_CFGI_CD,
+ .cfgi = {
+ .ssid = ssid,
+ .leaf = leaf,
+ },
+ };
+
+ spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+ list_for_each_entry(master, &smmu_domain->devices, domain_head) {
+ for (i = 0; i < master->num_sids; i++) {
+ cmd.cfgi.sid = master->sids[i];
+ arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
+ }
+ }
+ spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+
+ arm_smmu_cmdq_batch_submit(smmu, &cmds);
+}
+
+static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu,
+ struct arm_smmu_l1_ctx_desc *l1_desc)
+{
+ size_t size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3);
+
+ l1_desc->l2ptr = dmam_alloc_coherent(smmu->dev, size,
+ &l1_desc->l2ptr_dma, GFP_KERNEL);
+ if (!l1_desc->l2ptr) {
+ dev_warn(smmu->dev,
+ "failed to allocate context descriptor table\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static void arm_smmu_write_cd_l1_desc(__le64 *dst,
+ struct arm_smmu_l1_ctx_desc *l1_desc)
+{
+ u64 val = (l1_desc->l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) |
+ CTXDESC_L1_DESC_V;
+
+ /* See comment in arm_smmu_write_ctx_desc() */
+ WRITE_ONCE(*dst, cpu_to_le64(val));
+}
+
+static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_domain *smmu_domain,
+ u32 ssid)
+{
+ __le64 *l1ptr;
+ unsigned int idx;
+ struct arm_smmu_l1_ctx_desc *l1_desc;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ struct arm_smmu_ctx_desc_cfg *cdcfg = &smmu_domain->s1_cfg.cdcfg;
+
+ if (smmu_domain->s1_cfg.s1fmt == STRTAB_STE_0_S1FMT_LINEAR)
+ return cdcfg->cdtab + ssid * CTXDESC_CD_DWORDS;
+
+ idx = ssid >> CTXDESC_SPLIT;
+ l1_desc = &cdcfg->l1_desc[idx];
+ if (!l1_desc->l2ptr) {
+ if (arm_smmu_alloc_cd_leaf_table(smmu, l1_desc))
+ return NULL;
+
+ l1ptr = cdcfg->cdtab + idx * CTXDESC_L1_DESC_DWORDS;
+ arm_smmu_write_cd_l1_desc(l1ptr, l1_desc);
+ /* An invalid L1CD can be cached */
+ arm_smmu_sync_cd(smmu_domain, ssid, false);
+ }
+ idx = ssid & (CTXDESC_L2_ENTRIES - 1);
+ return l1_desc->l2ptr + idx * CTXDESC_CD_DWORDS;
+}
+
+int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, int ssid,
+ struct arm_smmu_ctx_desc *cd)
+{
+ /*
+ * This function handles the following cases:
+ *
+ * (1) Install primary CD, for normal DMA traffic (SSID = 0).
+ * (2) Install a secondary CD, for SID+SSID traffic.
+ * (3) Update ASID of a CD. Atomically write the first 64 bits of the
+ * CD, then invalidate the old entry and mappings.
+ * (4) Remove a secondary CD.
+ */
+ u64 val;
+ bool cd_live;
+ __le64 *cdptr;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+
+ if (WARN_ON(ssid >= (1 << smmu_domain->s1_cfg.s1cdmax)))
+ return -E2BIG;
+
+ cdptr = arm_smmu_get_cd_ptr(smmu_domain, ssid);
+ if (!cdptr)
+ return -ENOMEM;
+
+ val = le64_to_cpu(cdptr[0]);
+ cd_live = !!(val & CTXDESC_CD_0_V);
+
+ if (!cd) { /* (4) */
+ val = 0;
+ } else if (cd_live) { /* (3) */
+ val &= ~CTXDESC_CD_0_ASID;
+ val |= FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid);
+ /*
+ * Until CD+TLB invalidation, both ASIDs may be used for tagging
+ * this substream's traffic
+ */
+ } else { /* (1) and (2) */
+ cdptr[1] = cpu_to_le64(cd->ttbr & CTXDESC_CD_1_TTB0_MASK);
+ cdptr[2] = 0;
+ cdptr[3] = cpu_to_le64(cd->mair);
+
+ /*
+ * STE is live, and the SMMU might read dwords of this CD in any
+ * order. Ensure that it observes valid values before reading
+ * V=1.
+ */
+ arm_smmu_sync_cd(smmu_domain, ssid, true);
+
+ val = cd->tcr |
+#ifdef __BIG_ENDIAN
+ CTXDESC_CD_0_ENDI |
+#endif
+ CTXDESC_CD_0_R | CTXDESC_CD_0_A |
+ (cd->mm ? 0 : CTXDESC_CD_0_ASET) |
+ CTXDESC_CD_0_AA64 |
+ FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid) |
+ CTXDESC_CD_0_V;
+
+ /* STALL_MODEL==0b10 && CD.S==0 is ILLEGAL */
+ if (smmu->features & ARM_SMMU_FEAT_STALL_FORCE)
+ val |= CTXDESC_CD_0_S;
+ }
+
+ /*
+ * The SMMU accesses 64-bit values atomically. See IHI0070Ca 3.21.3
+ * "Configuration structures and configuration invalidation completion"
+ *
+ * The size of single-copy atomic reads made by the SMMU is
+ * IMPLEMENTATION DEFINED but must be at least 64 bits. Any single
+ * field within an aligned 64-bit span of a structure can be altered
+ * without first making the structure invalid.
+ */
+ WRITE_ONCE(cdptr[0], cpu_to_le64(val));
+ arm_smmu_sync_cd(smmu_domain, ssid, true);
+ return 0;
+}
+
+static int arm_smmu_alloc_cd_tables(struct arm_smmu_domain *smmu_domain)
+{
+ int ret;
+ size_t l1size;
+ size_t max_contexts;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg;
+ struct arm_smmu_ctx_desc_cfg *cdcfg = &cfg->cdcfg;
+
+ max_contexts = 1 << cfg->s1cdmax;
+
+ if (!(smmu->features & ARM_SMMU_FEAT_2_LVL_CDTAB) ||
+ max_contexts <= CTXDESC_L2_ENTRIES) {
+ cfg->s1fmt = STRTAB_STE_0_S1FMT_LINEAR;
+ cdcfg->num_l1_ents = max_contexts;
+
+ l1size = max_contexts * (CTXDESC_CD_DWORDS << 3);
+ } else {
+ cfg->s1fmt = STRTAB_STE_0_S1FMT_64K_L2;
+ cdcfg->num_l1_ents = DIV_ROUND_UP(max_contexts,
+ CTXDESC_L2_ENTRIES);
+
+ cdcfg->l1_desc = devm_kcalloc(smmu->dev, cdcfg->num_l1_ents,
+ sizeof(*cdcfg->l1_desc),
+ GFP_KERNEL);
+ if (!cdcfg->l1_desc)
+ return -ENOMEM;
+
+ l1size = cdcfg->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3);
+ }
+
+ cdcfg->cdtab = dmam_alloc_coherent(smmu->dev, l1size, &cdcfg->cdtab_dma,
+ GFP_KERNEL);
+ if (!cdcfg->cdtab) {
+ dev_warn(smmu->dev, "failed to allocate context descriptor\n");
+ ret = -ENOMEM;
+ goto err_free_l1;
+ }
+
+ return 0;
+
+err_free_l1:
+ if (cdcfg->l1_desc) {
+ devm_kfree(smmu->dev, cdcfg->l1_desc);
+ cdcfg->l1_desc = NULL;
+ }
+ return ret;
+}
+
+static void arm_smmu_free_cd_tables(struct arm_smmu_domain *smmu_domain)
+{
+ int i;
+ size_t size, l1size;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ struct arm_smmu_ctx_desc_cfg *cdcfg = &smmu_domain->s1_cfg.cdcfg;
+
+ if (cdcfg->l1_desc) {
+ size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3);
+
+ for (i = 0; i < cdcfg->num_l1_ents; i++) {
+ if (!cdcfg->l1_desc[i].l2ptr)
+ continue;
+
+ dmam_free_coherent(smmu->dev, size,
+ cdcfg->l1_desc[i].l2ptr,
+ cdcfg->l1_desc[i].l2ptr_dma);
+ }
+ devm_kfree(smmu->dev, cdcfg->l1_desc);
+ cdcfg->l1_desc = NULL;
+
+ l1size = cdcfg->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3);
+ } else {
+ l1size = cdcfg->num_l1_ents * (CTXDESC_CD_DWORDS << 3);
+ }
+
+ dmam_free_coherent(smmu->dev, l1size, cdcfg->cdtab, cdcfg->cdtab_dma);
+ cdcfg->cdtab_dma = 0;
+ cdcfg->cdtab = NULL;
+}
+
+bool arm_smmu_free_asid(struct arm_smmu_ctx_desc *cd)
+{
+ bool free;
+ struct arm_smmu_ctx_desc *old_cd;
+
+ if (!cd->asid)
+ return false;
+
+ free = refcount_dec_and_test(&cd->refs);
+ if (free) {
+ old_cd = xa_erase(&arm_smmu_asid_xa, cd->asid);
+ WARN_ON(old_cd != cd);
+ }
+ return free;
+}
+
+/* Stream table manipulation functions */
+static void
+arm_smmu_write_strtab_l1_desc(__le64 *dst, struct arm_smmu_strtab_l1_desc *desc)
+{
+ u64 val = 0;
+
+ val |= FIELD_PREP(STRTAB_L1_DESC_SPAN, desc->span);
+ val |= desc->l2ptr_dma & STRTAB_L1_DESC_L2PTR_MASK;
+
+ /* See comment in arm_smmu_write_ctx_desc() */
+ WRITE_ONCE(*dst, cpu_to_le64(val));
+}
+
+static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid)
+{
+ struct arm_smmu_cmdq_ent cmd = {
+ .opcode = CMDQ_OP_CFGI_STE,
+ .cfgi = {
+ .sid = sid,
+ .leaf = true,
+ },
+ };
+
+ arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+ arm_smmu_cmdq_issue_sync(smmu);
+}
+
+static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
+ __le64 *dst)
+{
+ /*
+ * This is hideously complicated, but we only really care about
+ * three cases at the moment:
+ *
+ * 1. Invalid (all zero) -> bypass/fault (init)
+ * 2. Bypass/fault -> translation/bypass (attach)
+ * 3. Translation/bypass -> bypass/fault (detach)
+ *
+ * Given that we can't update the STE atomically and the SMMU
+ * doesn't read the thing in a defined order, that leaves us
+ * with the following maintenance requirements:
+ *
+ * 1. Update Config, return (init time STEs aren't live)
+ * 2. Write everything apart from dword 0, sync, write dword 0, sync
+ * 3. Update Config, sync
+ */
+ u64 val = le64_to_cpu(dst[0]);
+ bool ste_live = false;
+ struct arm_smmu_device *smmu = NULL;
+ struct arm_smmu_s1_cfg *s1_cfg = NULL;
+ struct arm_smmu_s2_cfg *s2_cfg = NULL;
+ struct arm_smmu_domain *smmu_domain = NULL;
+ struct arm_smmu_cmdq_ent prefetch_cmd = {
+ .opcode = CMDQ_OP_PREFETCH_CFG,
+ .prefetch = {
+ .sid = sid,
+ },
+ };
+
+ if (master) {
+ smmu_domain = master->domain;
+ smmu = master->smmu;
+ }
+
+ if (smmu_domain) {
+ switch (smmu_domain->stage) {
+ case ARM_SMMU_DOMAIN_S1:
+ s1_cfg = &smmu_domain->s1_cfg;
+ break;
+ case ARM_SMMU_DOMAIN_S2:
+ case ARM_SMMU_DOMAIN_NESTED:
+ s2_cfg = &smmu_domain->s2_cfg;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (val & STRTAB_STE_0_V) {
+ switch (FIELD_GET(STRTAB_STE_0_CFG, val)) {
+ case STRTAB_STE_0_CFG_BYPASS:
+ break;
+ case STRTAB_STE_0_CFG_S1_TRANS:
+ case STRTAB_STE_0_CFG_S2_TRANS:
+ ste_live = true;
+ break;
+ case STRTAB_STE_0_CFG_ABORT:
+ BUG_ON(!disable_bypass);
+ break;
+ default:
+ BUG(); /* STE corruption */
+ }
+ }
+
+ /* Nuke the existing STE_0 value, as we're going to rewrite it */
+ val = STRTAB_STE_0_V;
+
+ /* Bypass/fault */
+ if (!smmu_domain || !(s1_cfg || s2_cfg)) {
+ if (!smmu_domain && disable_bypass)
+ val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT);
+ else
+ val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS);
+
+ dst[0] = cpu_to_le64(val);
+ dst[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG,
+ STRTAB_STE_1_SHCFG_INCOMING));
+ dst[2] = 0; /* Nuke the VMID */
+ /*
+ * The SMMU can perform negative caching, so we must sync
+ * the STE regardless of whether the old value was live.
+ */
+ if (smmu)
+ arm_smmu_sync_ste_for_sid(smmu, sid);
+ return;
+ }
+
+ if (s1_cfg) {
+ BUG_ON(ste_live);
+ dst[1] = cpu_to_le64(
+ FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) |
+ FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) |
+ FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) |
+ FIELD_PREP(STRTAB_STE_1_S1CSH, ARM_SMMU_SH_ISH) |
+ FIELD_PREP(STRTAB_STE_1_STRW, STRTAB_STE_1_STRW_NSEL1));
+
+ if (smmu->features & ARM_SMMU_FEAT_STALLS &&
+ !(smmu->features & ARM_SMMU_FEAT_STALL_FORCE))
+ dst[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD);
+
+ val |= (s1_cfg->cdcfg.cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) |
+ FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS) |
+ FIELD_PREP(STRTAB_STE_0_S1CDMAX, s1_cfg->s1cdmax) |
+ FIELD_PREP(STRTAB_STE_0_S1FMT, s1_cfg->s1fmt);
+ }
+
+ if (s2_cfg) {
+ BUG_ON(ste_live);
+ dst[2] = cpu_to_le64(
+ FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) |
+ FIELD_PREP(STRTAB_STE_2_VTCR, s2_cfg->vtcr) |
+#ifdef __BIG_ENDIAN
+ STRTAB_STE_2_S2ENDI |
+#endif
+ STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2AA64 |
+ STRTAB_STE_2_S2R);
+
+ dst[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK);
+
+ val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S2_TRANS);
+ }
+
+ if (master->ats_enabled)
+ dst[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS,
+ STRTAB_STE_1_EATS_TRANS));
+
+ arm_smmu_sync_ste_for_sid(smmu, sid);
+ /* See comment in arm_smmu_write_ctx_desc() */
+ WRITE_ONCE(dst[0], cpu_to_le64(val));
+ arm_smmu_sync_ste_for_sid(smmu, sid);
+
+ /* It's likely that we'll want to use the new STE soon */
+ if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH))
+ arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd);
+}
+
+static void arm_smmu_init_bypass_stes(__le64 *strtab, unsigned int nent)
+{
+ unsigned int i;
+
+ for (i = 0; i < nent; ++i) {
+ arm_smmu_write_strtab_ent(NULL, -1, strtab);
+ strtab += STRTAB_STE_DWORDS;
+ }
+}
+
+static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid)
+{
+ size_t size;
+ void *strtab;
+ struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+ struct arm_smmu_strtab_l1_desc *desc = &cfg->l1_desc[sid >> STRTAB_SPLIT];
+
+ if (desc->l2ptr)
+ return 0;
+
+ size = 1 << (STRTAB_SPLIT + ilog2(STRTAB_STE_DWORDS) + 3);
+ strtab = &cfg->strtab[(sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS];
+
+ desc->span = STRTAB_SPLIT + 1;
+ desc->l2ptr = dmam_alloc_coherent(smmu->dev, size, &desc->l2ptr_dma,
+ GFP_KERNEL);
+ if (!desc->l2ptr) {
+ dev_err(smmu->dev,
+ "failed to allocate l2 stream table for SID %u\n",
+ sid);
+ return -ENOMEM;
+ }
+
+ arm_smmu_init_bypass_stes(desc->l2ptr, 1 << STRTAB_SPLIT);
+ arm_smmu_write_strtab_l1_desc(strtab, desc);
+ return 0;
+}
+
+/* IRQ and event handlers */
+static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
+{
+ int i;
+ struct arm_smmu_device *smmu = dev;
+ struct arm_smmu_queue *q = &smmu->evtq.q;
+ struct arm_smmu_ll_queue *llq = &q->llq;
+ u64 evt[EVTQ_ENT_DWORDS];
+
+ do {
+ while (!queue_remove_raw(q, evt)) {
+ u8 id = FIELD_GET(EVTQ_0_ID, evt[0]);
+
+ dev_info(smmu->dev, "event 0x%02x received:\n", id);
+ for (i = 0; i < ARRAY_SIZE(evt); ++i)
+ dev_info(smmu->dev, "\t0x%016llx\n",
+ (unsigned long long)evt[i]);
+
+ cond_resched();
+ }
+
+ /*
+ * Not much we can do on overflow, so scream and pretend we're
+ * trying harder.
+ */
+ if (queue_sync_prod_in(q) == -EOVERFLOW)
+ dev_err(smmu->dev, "EVTQ overflow detected -- events lost\n");
+ } while (!queue_empty(llq));
+
+ /* Sync our overflow flag, as we believe we're up to speed */
+ queue_sync_cons_ovf(q);
+ return IRQ_HANDLED;
+}
+
+static void arm_smmu_handle_ppr(struct arm_smmu_device *smmu, u64 *evt)
+{
+ u32 sid, ssid;
+ u16 grpid;
+ bool ssv, last;
+
+ sid = FIELD_GET(PRIQ_0_SID, evt[0]);
+ ssv = FIELD_GET(PRIQ_0_SSID_V, evt[0]);
+ ssid = ssv ? FIELD_GET(PRIQ_0_SSID, evt[0]) : 0;
+ last = FIELD_GET(PRIQ_0_PRG_LAST, evt[0]);
+ grpid = FIELD_GET(PRIQ_1_PRG_IDX, evt[1]);
+
+ dev_info(smmu->dev, "unexpected PRI request received:\n");
+ dev_info(smmu->dev,
+ "\tsid 0x%08x.0x%05x: [%u%s] %sprivileged %s%s%s access at iova 0x%016llx\n",
+ sid, ssid, grpid, last ? "L" : "",
+ evt[0] & PRIQ_0_PERM_PRIV ? "" : "un",
+ evt[0] & PRIQ_0_PERM_READ ? "R" : "",
+ evt[0] & PRIQ_0_PERM_WRITE ? "W" : "",
+ evt[0] & PRIQ_0_PERM_EXEC ? "X" : "",
+ evt[1] & PRIQ_1_ADDR_MASK);
+
+ if (last) {
+ struct arm_smmu_cmdq_ent cmd = {
+ .opcode = CMDQ_OP_PRI_RESP,
+ .substream_valid = ssv,
+ .pri = {
+ .sid = sid,
+ .ssid = ssid,
+ .grpid = grpid,
+ .resp = PRI_RESP_DENY,
+ },
+ };
+
+ arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+ }
+}
+
+static irqreturn_t arm_smmu_priq_thread(int irq, void *dev)
+{
+ struct arm_smmu_device *smmu = dev;
+ struct arm_smmu_queue *q = &smmu->priq.q;
+ struct arm_smmu_ll_queue *llq = &q->llq;
+ u64 evt[PRIQ_ENT_DWORDS];
+
+ do {
+ while (!queue_remove_raw(q, evt))
+ arm_smmu_handle_ppr(smmu, evt);
+
+ if (queue_sync_prod_in(q) == -EOVERFLOW)
+ dev_err(smmu->dev, "PRIQ overflow detected -- requests lost\n");
+ } while (!queue_empty(llq));
+
+ /* Sync our overflow flag, as we believe we're up to speed */
+ queue_sync_cons_ovf(q);
+ return IRQ_HANDLED;
+}
+
+static int arm_smmu_device_disable(struct arm_smmu_device *smmu);
+
+static irqreturn_t arm_smmu_gerror_handler(int irq, void *dev)
+{
+ u32 gerror, gerrorn, active;
+ struct arm_smmu_device *smmu = dev;
+
+ gerror = readl_relaxed(smmu->base + ARM_SMMU_GERROR);
+ gerrorn = readl_relaxed(smmu->base + ARM_SMMU_GERRORN);
+
+ active = gerror ^ gerrorn;
+ if (!(active & GERROR_ERR_MASK))
+ return IRQ_NONE; /* No errors pending */
+
+ dev_warn(smmu->dev,
+ "unexpected global error reported (0x%08x), this could be serious\n",
+ active);
+
+ if (active & GERROR_SFM_ERR) {
+ dev_err(smmu->dev, "device has entered Service Failure Mode!\n");
+ arm_smmu_device_disable(smmu);
+ }
+
+ if (active & GERROR_MSI_GERROR_ABT_ERR)
+ dev_warn(smmu->dev, "GERROR MSI write aborted\n");
+
+ if (active & GERROR_MSI_PRIQ_ABT_ERR)
+ dev_warn(smmu->dev, "PRIQ MSI write aborted\n");
+
+ if (active & GERROR_MSI_EVTQ_ABT_ERR)
+ dev_warn(smmu->dev, "EVTQ MSI write aborted\n");
+
+ if (active & GERROR_MSI_CMDQ_ABT_ERR)
+ dev_warn(smmu->dev, "CMDQ MSI write aborted\n");
+
+ if (active & GERROR_PRIQ_ABT_ERR)
+ dev_err(smmu->dev, "PRIQ write aborted -- events may have been lost\n");
+
+ if (active & GERROR_EVTQ_ABT_ERR)
+ dev_err(smmu->dev, "EVTQ write aborted -- events may have been lost\n");
+
+ if (active & GERROR_CMDQ_ERR)
+ arm_smmu_cmdq_skip_err(smmu);
+
+ writel(gerror, smmu->base + ARM_SMMU_GERRORN);
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t arm_smmu_combined_irq_thread(int irq, void *dev)
+{
+ struct arm_smmu_device *smmu = dev;
+
+ arm_smmu_evtq_thread(irq, dev);
+ if (smmu->features & ARM_SMMU_FEAT_PRI)
+ arm_smmu_priq_thread(irq, dev);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t arm_smmu_combined_irq_handler(int irq, void *dev)
+{
+ arm_smmu_gerror_handler(irq, dev);
+ return IRQ_WAKE_THREAD;
+}
+
+static void
+arm_smmu_atc_inv_to_cmd(int ssid, unsigned long iova, size_t size,
+ struct arm_smmu_cmdq_ent *cmd)
+{
+ size_t log2_span;
+ size_t span_mask;
+ /* ATC invalidates are always on 4096-bytes pages */
+ size_t inval_grain_shift = 12;
+ unsigned long page_start, page_end;
+
+ *cmd = (struct arm_smmu_cmdq_ent) {
+ .opcode = CMDQ_OP_ATC_INV,
+ .substream_valid = !!ssid,
+ .atc.ssid = ssid,
+ };
+
+ if (!size) {
+ cmd->atc.size = ATC_INV_SIZE_ALL;
+ return;
+ }
+
+ page_start = iova >> inval_grain_shift;
+ page_end = (iova + size - 1) >> inval_grain_shift;
+
+ /*
+ * In an ATS Invalidate Request, the address must be aligned on the
+ * range size, which must be a power of two number of page sizes. We
+ * thus have to choose between grossly over-invalidating the region, or
+ * splitting the invalidation into multiple commands. For simplicity
+ * we'll go with the first solution, but should refine it in the future
+ * if multiple commands are shown to be more efficient.
+ *
+ * Find the smallest power of two that covers the range. The most
+ * significant differing bit between the start and end addresses,
+ * fls(start ^ end), indicates the required span. For example:
+ *
+ * We want to invalidate pages [8; 11]. This is already the ideal range:
+ * x = 0b1000 ^ 0b1011 = 0b11
+ * span = 1 << fls(x) = 4
+ *
+ * To invalidate pages [7; 10], we need to invalidate [0; 15]:
+ * x = 0b0111 ^ 0b1010 = 0b1101
+ * span = 1 << fls(x) = 16
+ */
+ log2_span = fls_long(page_start ^ page_end);
+ span_mask = (1ULL << log2_span) - 1;
+
+ page_start &= ~span_mask;
+
+ cmd->atc.addr = page_start << inval_grain_shift;
+ cmd->atc.size = log2_span;
+}
+
+static int arm_smmu_atc_inv_master(struct arm_smmu_master *master)
+{
+ int i;
+ struct arm_smmu_cmdq_ent cmd;
+
+ arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd);
+
+ for (i = 0; i < master->num_sids; i++) {
+ cmd.atc.sid = master->sids[i];
+ arm_smmu_cmdq_issue_cmd(master->smmu, &cmd);
+ }
+
+ return arm_smmu_cmdq_issue_sync(master->smmu);
+}
+
+static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
+ int ssid, unsigned long iova, size_t size)
+{
+ int i;
+ unsigned long flags;
+ struct arm_smmu_cmdq_ent cmd;
+ struct arm_smmu_master *master;
+ struct arm_smmu_cmdq_batch cmds = {};
+
+ if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS))
+ return 0;
+
+ /*
+ * Ensure that we've completed prior invalidation of the main TLBs
+ * before we read 'nr_ats_masters' in case of a concurrent call to
+ * arm_smmu_enable_ats():
+ *
+ * // unmap() // arm_smmu_enable_ats()
+ * TLBI+SYNC atomic_inc(&nr_ats_masters);
+ * smp_mb(); [...]
+ * atomic_read(&nr_ats_masters); pci_enable_ats() // writel()
+ *
+ * Ensures that we always see the incremented 'nr_ats_masters' count if
+ * ATS was enabled at the PCI device before completion of the TLBI.
+ */
+ smp_mb();
+ if (!atomic_read(&smmu_domain->nr_ats_masters))
+ return 0;
+
+ arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd);
+
+ spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+ list_for_each_entry(master, &smmu_domain->devices, domain_head) {
+ if (!master->ats_enabled)
+ continue;
+
+ for (i = 0; i < master->num_sids; i++) {
+ cmd.atc.sid = master->sids[i];
+ arm_smmu_cmdq_batch_add(smmu_domain->smmu, &cmds, &cmd);
+ }
+ }
+ spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+
+ return arm_smmu_cmdq_batch_submit(smmu_domain->smmu, &cmds);
+}
+
+/* IO_PGTABLE API */
+static void arm_smmu_tlb_inv_context(void *cookie)
+{
+ struct arm_smmu_domain *smmu_domain = cookie;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ struct arm_smmu_cmdq_ent cmd;
+
+ /*
+ * NOTE: when io-pgtable is in non-strict mode, we may get here with
+ * PTEs previously cleared by unmaps on the current CPU not yet visible
+ * to the SMMU. We are relying on the dma_wmb() implicit during cmd
+ * insertion to guarantee those are observed before the TLBI. Do be
+ * careful, 007.
+ */
+ if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
+ arm_smmu_tlb_inv_asid(smmu, smmu_domain->s1_cfg.cd.asid);
+ } else {
+ cmd.opcode = CMDQ_OP_TLBI_S12_VMALL;
+ cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid;
+ arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+ arm_smmu_cmdq_issue_sync(smmu);
+ }
+ arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0);
+}
+
+static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
+ size_t granule, bool leaf,
+ struct arm_smmu_domain *smmu_domain)
+{
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0;
+ size_t inv_range = granule;
+ struct arm_smmu_cmdq_batch cmds = {};
+ struct arm_smmu_cmdq_ent cmd = {
+ .tlbi = {
+ .leaf = leaf,
+ },
+ };
+
+ if (!size)
+ return;
+
+ if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
+ cmd.opcode = CMDQ_OP_TLBI_NH_VA;
+ cmd.tlbi.asid = smmu_domain->s1_cfg.cd.asid;
+ } else {
+ cmd.opcode = CMDQ_OP_TLBI_S2_IPA;
+ cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid;
+ }
+
+ if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
+ /* Get the leaf page size */
+ tg = __ffs(smmu_domain->domain.pgsize_bitmap);
+
+ /* Convert page size of 12,14,16 (log2) to 1,2,3 */
+ cmd.tlbi.tg = (tg - 10) / 2;
+
+ /* Determine what level the granule is at */
+ cmd.tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
+
+ num_pages = size >> tg;
+ }
+
+ while (iova < end) {
+ if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
+ /*
+ * On each iteration of the loop, the range is 5 bits
+ * worth of the aligned size remaining.
+ * The range in pages is:
+ *
+ * range = (num_pages & (0x1f << __ffs(num_pages)))
+ */
+ unsigned long scale, num;
+
+ /* Determine the power of 2 multiple number of pages */
+ scale = __ffs(num_pages);
+ cmd.tlbi.scale = scale;
+
+ /* Determine how many chunks of 2^scale size we have */
+ num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX;
+ cmd.tlbi.num = num - 1;
+
+ /* range is num * 2^scale * pgsize */
+ inv_range = num << (scale + tg);
+
+ /* Clear out the lower order bits for the next iteration */
+ num_pages -= num << scale;
+ }
+
+ cmd.tlbi.addr = iova;
+ arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
+ iova += inv_range;
+ }
+ arm_smmu_cmdq_batch_submit(smmu, &cmds);
+
+ /*
+ * Unfortunately, this can't be leaf-only since we may have
+ * zapped an entire table.
+ */
+ arm_smmu_atc_inv_domain(smmu_domain, 0, start, size);
+}
+
+static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t granule,
+ void *cookie)
+{
+ struct arm_smmu_domain *smmu_domain = cookie;
+ struct iommu_domain *domain = &smmu_domain->domain;
+
+ iommu_iotlb_gather_add_page(domain, gather, iova, granule);
+}
+
+static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ arm_smmu_tlb_inv_range(iova, size, granule, false, cookie);
+}
+
+static void arm_smmu_tlb_inv_leaf(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ arm_smmu_tlb_inv_range(iova, size, granule, true, cookie);
+}
+
+static const struct iommu_flush_ops arm_smmu_flush_ops = {
+ .tlb_flush_all = arm_smmu_tlb_inv_context,
+ .tlb_flush_walk = arm_smmu_tlb_inv_walk,
+ .tlb_flush_leaf = arm_smmu_tlb_inv_leaf,
+ .tlb_add_page = arm_smmu_tlb_inv_page_nosync,
+};
+
+/* IOMMU API */
+static bool arm_smmu_capable(enum iommu_cap cap)
+{
+ switch (cap) {
+ case IOMMU_CAP_CACHE_COHERENCY:
+ return true;
+ case IOMMU_CAP_NOEXEC:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static struct iommu_domain *arm_smmu_domain_alloc(unsigned type)
+{
+ struct arm_smmu_domain *smmu_domain;
+
+ if (type != IOMMU_DOMAIN_UNMANAGED &&
+ type != IOMMU_DOMAIN_DMA &&
+ type != IOMMU_DOMAIN_IDENTITY)
+ return NULL;
+
+ /*
+ * Allocate the domain and initialise some of its data structures.
+ * We can't really do anything meaningful until we've added a
+ * master.
+ */
+ smmu_domain = kzalloc(sizeof(*smmu_domain), GFP_KERNEL);
+ if (!smmu_domain)
+ return NULL;
+
+ if (type == IOMMU_DOMAIN_DMA &&
+ iommu_get_dma_cookie(&smmu_domain->domain)) {
+ kfree(smmu_domain);
+ return NULL;
+ }
+
+ mutex_init(&smmu_domain->init_mutex);
+ INIT_LIST_HEAD(&smmu_domain->devices);
+ spin_lock_init(&smmu_domain->devices_lock);
+
+ return &smmu_domain->domain;
+}
+
+static int arm_smmu_bitmap_alloc(unsigned long *map, int span)
+{
+ int idx, size = 1 << span;
+
+ do {
+ idx = find_first_zero_bit(map, size);
+ if (idx == size)
+ return -ENOSPC;
+ } while (test_and_set_bit(idx, map));
+
+ return idx;
+}
+
+static void arm_smmu_bitmap_free(unsigned long *map, int idx)
+{
+ clear_bit(idx, map);
+}
+
+static void arm_smmu_domain_free(struct iommu_domain *domain)
+{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+
+ iommu_put_dma_cookie(domain);
+ free_io_pgtable_ops(smmu_domain->pgtbl_ops);
+
+ /* Free the CD and ASID, if we allocated them */
+ if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
+ struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg;
+
+ /* Prevent SVA from touching the CD while we're freeing it */
+ mutex_lock(&arm_smmu_asid_lock);
+ if (cfg->cdcfg.cdtab)
+ arm_smmu_free_cd_tables(smmu_domain);
+ arm_smmu_free_asid(&cfg->cd);
+ mutex_unlock(&arm_smmu_asid_lock);
+ } else {
+ struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg;
+ if (cfg->vmid)
+ arm_smmu_bitmap_free(smmu->vmid_map, cfg->vmid);
+ }
+
+ kfree(smmu_domain);
+}
+
+static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain,
+ struct arm_smmu_master *master,
+ struct io_pgtable_cfg *pgtbl_cfg)
+{
+ int ret;
+ u32 asid;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg;
+ typeof(&pgtbl_cfg->arm_lpae_s1_cfg.tcr) tcr = &pgtbl_cfg->arm_lpae_s1_cfg.tcr;
+
+ refcount_set(&cfg->cd.refs, 1);
+
+ /* Prevent SVA from modifying the ASID until it is written to the CD */
+ mutex_lock(&arm_smmu_asid_lock);
+ ret = xa_alloc(&arm_smmu_asid_xa, &asid, &cfg->cd,
+ XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
+ if (ret)
+ goto out_unlock;
+
+ cfg->s1cdmax = master->ssid_bits;
+
+ ret = arm_smmu_alloc_cd_tables(smmu_domain);
+ if (ret)
+ goto out_free_asid;
+
+ cfg->cd.asid = (u16)asid;
+ cfg->cd.ttbr = pgtbl_cfg->arm_lpae_s1_cfg.ttbr;
+ cfg->cd.tcr = FIELD_PREP(CTXDESC_CD_0_TCR_T0SZ, tcr->tsz) |
+ FIELD_PREP(CTXDESC_CD_0_TCR_TG0, tcr->tg) |
+ FIELD_PREP(CTXDESC_CD_0_TCR_IRGN0, tcr->irgn) |
+ FIELD_PREP(CTXDESC_CD_0_TCR_ORGN0, tcr->orgn) |
+ FIELD_PREP(CTXDESC_CD_0_TCR_SH0, tcr->sh) |
+ FIELD_PREP(CTXDESC_CD_0_TCR_IPS, tcr->ips) |
+ CTXDESC_CD_0_TCR_EPD1 | CTXDESC_CD_0_AA64;
+ cfg->cd.mair = pgtbl_cfg->arm_lpae_s1_cfg.mair;
+
+ /*
+ * Note that this will end up calling arm_smmu_sync_cd() before
+ * the master has been added to the devices list for this domain.
+ * This isn't an issue because the STE hasn't been installed yet.
+ */
+ ret = arm_smmu_write_ctx_desc(smmu_domain, 0, &cfg->cd);
+ if (ret)
+ goto out_free_cd_tables;
+
+ mutex_unlock(&arm_smmu_asid_lock);
+ return 0;
+
+out_free_cd_tables:
+ arm_smmu_free_cd_tables(smmu_domain);
+out_free_asid:
+ arm_smmu_free_asid(&cfg->cd);
+out_unlock:
+ mutex_unlock(&arm_smmu_asid_lock);
+ return ret;
+}
+
+static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain,
+ struct arm_smmu_master *master,
+ struct io_pgtable_cfg *pgtbl_cfg)
+{
+ int vmid;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg;
+ typeof(&pgtbl_cfg->arm_lpae_s2_cfg.vtcr) vtcr;
+
+ vmid = arm_smmu_bitmap_alloc(smmu->vmid_map, smmu->vmid_bits);
+ if (vmid < 0)
+ return vmid;
+
+ vtcr = &pgtbl_cfg->arm_lpae_s2_cfg.vtcr;
+ cfg->vmid = (u16)vmid;
+ cfg->vttbr = pgtbl_cfg->arm_lpae_s2_cfg.vttbr;
+ cfg->vtcr = FIELD_PREP(STRTAB_STE_2_VTCR_S2T0SZ, vtcr->tsz) |
+ FIELD_PREP(STRTAB_STE_2_VTCR_S2SL0, vtcr->sl) |
+ FIELD_PREP(STRTAB_STE_2_VTCR_S2IR0, vtcr->irgn) |
+ FIELD_PREP(STRTAB_STE_2_VTCR_S2OR0, vtcr->orgn) |
+ FIELD_PREP(STRTAB_STE_2_VTCR_S2SH0, vtcr->sh) |
+ FIELD_PREP(STRTAB_STE_2_VTCR_S2TG, vtcr->tg) |
+ FIELD_PREP(STRTAB_STE_2_VTCR_S2PS, vtcr->ps);
+ return 0;
+}
+
+static int arm_smmu_domain_finalise(struct iommu_domain *domain,
+ struct arm_smmu_master *master)
+{
+ int ret;
+ unsigned long ias, oas;
+ enum io_pgtable_fmt fmt;
+ struct io_pgtable_cfg pgtbl_cfg;
+ struct io_pgtable_ops *pgtbl_ops;
+ int (*finalise_stage_fn)(struct arm_smmu_domain *,
+ struct arm_smmu_master *,
+ struct io_pgtable_cfg *);
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+
+ if (domain->type == IOMMU_DOMAIN_IDENTITY) {
+ smmu_domain->stage = ARM_SMMU_DOMAIN_BYPASS;
+ return 0;
+ }
+
+ /* Restrict the stage to what we can actually support */
+ if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S1))
+ smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
+ if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S2))
+ smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
+
+ switch (smmu_domain->stage) {
+ case ARM_SMMU_DOMAIN_S1:
+ ias = (smmu->features & ARM_SMMU_FEAT_VAX) ? 52 : 48;
+ ias = min_t(unsigned long, ias, VA_BITS);
+ oas = smmu->ias;
+ fmt = ARM_64_LPAE_S1;
+ finalise_stage_fn = arm_smmu_domain_finalise_s1;
+ break;
+ case ARM_SMMU_DOMAIN_NESTED:
+ case ARM_SMMU_DOMAIN_S2:
+ ias = smmu->ias;
+ oas = smmu->oas;
+ fmt = ARM_64_LPAE_S2;
+ finalise_stage_fn = arm_smmu_domain_finalise_s2;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ pgtbl_cfg = (struct io_pgtable_cfg) {
+ .pgsize_bitmap = smmu->pgsize_bitmap,
+ .ias = ias,
+ .oas = oas,
+ .coherent_walk = smmu->features & ARM_SMMU_FEAT_COHERENCY,
+ .tlb = &arm_smmu_flush_ops,
+ .iommu_dev = smmu->dev,
+ };
+
+ if (smmu_domain->non_strict)
+ pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
+
+ pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
+ if (!pgtbl_ops)
+ return -ENOMEM;
+
+ domain->pgsize_bitmap = pgtbl_cfg.pgsize_bitmap;
+ domain->geometry.aperture_end = (1UL << pgtbl_cfg.ias) - 1;
+ domain->geometry.force_aperture = true;
+
+ ret = finalise_stage_fn(smmu_domain, master, &pgtbl_cfg);
+ if (ret < 0) {
+ free_io_pgtable_ops(pgtbl_ops);
+ return ret;
+ }
+
+ smmu_domain->pgtbl_ops = pgtbl_ops;
+ return 0;
+}
+
+static __le64 *arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid)
+{
+ __le64 *step;
+ struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+
+ if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) {
+ struct arm_smmu_strtab_l1_desc *l1_desc;
+ int idx;
+
+ /* Two-level walk */
+ idx = (sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS;
+ l1_desc = &cfg->l1_desc[idx];
+ idx = (sid & ((1 << STRTAB_SPLIT) - 1)) * STRTAB_STE_DWORDS;
+ step = &l1_desc->l2ptr[idx];
+ } else {
+ /* Simple linear lookup */
+ step = &cfg->strtab[sid * STRTAB_STE_DWORDS];
+ }
+
+ return step;
+}
+
+static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master)
+{
+ int i, j;
+ struct arm_smmu_device *smmu = master->smmu;
+
+ for (i = 0; i < master->num_sids; ++i) {
+ u32 sid = master->sids[i];
+ __le64 *step = arm_smmu_get_step_for_sid(smmu, sid);
+
+ /* Bridged PCI devices may end up with duplicated IDs */
+ for (j = 0; j < i; j++)
+ if (master->sids[j] == sid)
+ break;
+ if (j < i)
+ continue;
+
+ arm_smmu_write_strtab_ent(master, sid, step);
+ }
+}
+
+static bool arm_smmu_ats_supported(struct arm_smmu_master *master)
+{
+ struct device *dev = master->dev;
+ struct arm_smmu_device *smmu = master->smmu;
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+ if (!(smmu->features & ARM_SMMU_FEAT_ATS))
+ return false;
+
+ if (!(fwspec->flags & IOMMU_FWSPEC_PCI_RC_ATS))
+ return false;
+
+ return dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev));
+}
+
+static void arm_smmu_enable_ats(struct arm_smmu_master *master)
+{
+ size_t stu;
+ struct pci_dev *pdev;
+ struct arm_smmu_device *smmu = master->smmu;
+ struct arm_smmu_domain *smmu_domain = master->domain;
+
+ /* Don't enable ATS at the endpoint if it's not enabled in the STE */
+ if (!master->ats_enabled)
+ return;
+
+ /* Smallest Translation Unit: log2 of the smallest supported granule */
+ stu = __ffs(smmu->pgsize_bitmap);
+ pdev = to_pci_dev(master->dev);
+
+ atomic_inc(&smmu_domain->nr_ats_masters);
+ arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0);
+ if (pci_enable_ats(pdev, stu))
+ dev_err(master->dev, "Failed to enable ATS (STU %zu)\n", stu);
+}
+
+static void arm_smmu_disable_ats(struct arm_smmu_master *master)
+{
+ struct arm_smmu_domain *smmu_domain = master->domain;
+
+ if (!master->ats_enabled)
+ return;
+
+ pci_disable_ats(to_pci_dev(master->dev));
+ /*
+ * Ensure ATS is disabled at the endpoint before we issue the
+ * ATC invalidation via the SMMU.
+ */
+ wmb();
+ arm_smmu_atc_inv_master(master);
+ atomic_dec(&smmu_domain->nr_ats_masters);
+}
+
+static int arm_smmu_enable_pasid(struct arm_smmu_master *master)
+{
+ int ret;
+ int features;
+ int num_pasids;
+ struct pci_dev *pdev;
+
+ if (!dev_is_pci(master->dev))
+ return -ENODEV;
+
+ pdev = to_pci_dev(master->dev);
+
+ features = pci_pasid_features(pdev);
+ if (features < 0)
+ return features;
+
+ num_pasids = pci_max_pasids(pdev);
+ if (num_pasids <= 0)
+ return num_pasids;
+
+ ret = pci_enable_pasid(pdev, features);
+ if (ret) {
+ dev_err(&pdev->dev, "Failed to enable PASID\n");
+ return ret;
+ }
+
+ master->ssid_bits = min_t(u8, ilog2(num_pasids),
+ master->smmu->ssid_bits);
+ return 0;
+}
+
+static void arm_smmu_disable_pasid(struct arm_smmu_master *master)
+{
+ struct pci_dev *pdev;
+
+ if (!dev_is_pci(master->dev))
+ return;
+
+ pdev = to_pci_dev(master->dev);
+
+ if (!pdev->pasid_enabled)
+ return;
+
+ master->ssid_bits = 0;
+ pci_disable_pasid(pdev);
+}
+
+static void arm_smmu_detach_dev(struct arm_smmu_master *master)
+{
+ unsigned long flags;
+ struct arm_smmu_domain *smmu_domain = master->domain;
+
+ if (!smmu_domain)
+ return;
+
+ arm_smmu_disable_ats(master);
+
+ spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+ list_del(&master->domain_head);
+ spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+
+ master->domain = NULL;
+ master->ats_enabled = false;
+ arm_smmu_install_ste_for_dev(master);
+}
+
+static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct arm_smmu_device *smmu;
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+ struct arm_smmu_master *master;
+
+ if (!fwspec)
+ return -ENOENT;
+
+ master = dev_iommu_priv_get(dev);
+ smmu = master->smmu;
+
+ /*
+ * Checking that SVA is disabled ensures that this device isn't bound to
+ * any mm, and can be safely detached from its old domain. Bonds cannot
+ * be removed concurrently since we're holding the group mutex.
+ */
+ if (arm_smmu_master_sva_enabled(master)) {
+ dev_err(dev, "cannot attach - SVA enabled\n");
+ return -EBUSY;
+ }
+
+ arm_smmu_detach_dev(master);
+
+ mutex_lock(&smmu_domain->init_mutex);
+
+ if (!smmu_domain->smmu) {
+ smmu_domain->smmu = smmu;
+ ret = arm_smmu_domain_finalise(domain, master);
+ if (ret) {
+ smmu_domain->smmu = NULL;
+ goto out_unlock;
+ }
+ } else if (smmu_domain->smmu != smmu) {
+ dev_err(dev,
+ "cannot attach to SMMU %s (upstream of %s)\n",
+ dev_name(smmu_domain->smmu->dev),
+ dev_name(smmu->dev));
+ ret = -ENXIO;
+ goto out_unlock;
+ } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 &&
+ master->ssid_bits != smmu_domain->s1_cfg.s1cdmax) {
+ dev_err(dev,
+ "cannot attach to incompatible domain (%u SSID bits != %u)\n",
+ smmu_domain->s1_cfg.s1cdmax, master->ssid_bits);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ master->domain = smmu_domain;
+
+ if (smmu_domain->stage != ARM_SMMU_DOMAIN_BYPASS)
+ master->ats_enabled = arm_smmu_ats_supported(master);
+
+ arm_smmu_install_ste_for_dev(master);
+
+ spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+ list_add(&master->domain_head, &smmu_domain->devices);
+ spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+
+ arm_smmu_enable_ats(master);
+
+out_unlock:
+ mutex_unlock(&smmu_domain->init_mutex);
+ return ret;
+}
+
+static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+ struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
+
+ if (!ops)
+ return -ENODEV;
+
+ return ops->map(ops, iova, paddr, size, prot, gfp);
+}
+
+static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
+ size_t size, struct iommu_iotlb_gather *gather)
+{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+ struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
+
+ if (!ops)
+ return 0;
+
+ return ops->unmap(ops, iova, size, gather);
+}
+
+static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain)
+{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+ if (smmu_domain->smmu)
+ arm_smmu_tlb_inv_context(smmu_domain);
+}
+
+static void arm_smmu_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
+{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+ arm_smmu_tlb_inv_range(gather->start, gather->end - gather->start + 1,
+ gather->pgsize, true, smmu_domain);
+}
+
+static phys_addr_t
+arm_smmu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
+{
+ struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
+
+ if (domain->type == IOMMU_DOMAIN_IDENTITY)
+ return iova;
+
+ if (!ops)
+ return 0;
+
+ return ops->iova_to_phys(ops, iova);
+}
+
+static struct platform_driver arm_smmu_driver;
+
+static
+struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode)
+{
+ struct device *dev = driver_find_device_by_fwnode(&arm_smmu_driver.driver,
+ fwnode);
+ put_device(dev);
+ return dev ? dev_get_drvdata(dev) : NULL;
+}
+
+static bool arm_smmu_sid_in_range(struct arm_smmu_device *smmu, u32 sid)
+{
+ unsigned long limit = smmu->strtab_cfg.num_l1_ents;
+
+ if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB)
+ limit *= 1UL << STRTAB_SPLIT;
+
+ return sid < limit;
+}
+
+static struct iommu_ops arm_smmu_ops;
+
+static struct iommu_device *arm_smmu_probe_device(struct device *dev)
+{
+ int i, ret;
+ struct arm_smmu_device *smmu;
+ struct arm_smmu_master *master;
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+ if (!fwspec || fwspec->ops != &arm_smmu_ops)
+ return ERR_PTR(-ENODEV);
+
+ if (WARN_ON_ONCE(dev_iommu_priv_get(dev)))
+ return ERR_PTR(-EBUSY);
+
+ smmu = arm_smmu_get_by_fwnode(fwspec->iommu_fwnode);
+ if (!smmu)
+ return ERR_PTR(-ENODEV);
+
+ master = kzalloc(sizeof(*master), GFP_KERNEL);
+ if (!master)
+ return ERR_PTR(-ENOMEM);
+
+ master->dev = dev;
+ master->smmu = smmu;
+ master->sids = fwspec->ids;
+ master->num_sids = fwspec->num_ids;
+ INIT_LIST_HEAD(&master->bonds);
+ dev_iommu_priv_set(dev, master);
+
+ /* Check the SIDs are in range of the SMMU and our stream table */
+ for (i = 0; i < master->num_sids; i++) {
+ u32 sid = master->sids[i];
+
+ if (!arm_smmu_sid_in_range(smmu, sid)) {
+ ret = -ERANGE;
+ goto err_free_master;
+ }
+
+ /* Ensure l2 strtab is initialised */
+ if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) {
+ ret = arm_smmu_init_l2_strtab(smmu, sid);
+ if (ret)
+ goto err_free_master;
+ }
+ }
+
+ master->ssid_bits = min(smmu->ssid_bits, fwspec->num_pasid_bits);
+
+ /*
+ * Note that PASID must be enabled before, and disabled after ATS:
+ * PCI Express Base 4.0r1.0 - 10.5.1.3 ATS Control Register
+ *
+ * Behavior is undefined if this bit is Set and the value of the PASID
+ * Enable, Execute Requested Enable, or Privileged Mode Requested bits
+ * are changed.
+ */
+ arm_smmu_enable_pasid(master);
+
+ if (!(smmu->features & ARM_SMMU_FEAT_2_LVL_CDTAB))
+ master->ssid_bits = min_t(u8, master->ssid_bits,
+ CTXDESC_LINEAR_CDMAX);
+
+ return &smmu->iommu;
+
+err_free_master:
+ kfree(master);
+ dev_iommu_priv_set(dev, NULL);
+ return ERR_PTR(ret);
+}
+
+static void arm_smmu_release_device(struct device *dev)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct arm_smmu_master *master;
+
+ if (!fwspec || fwspec->ops != &arm_smmu_ops)
+ return;
+
+ master = dev_iommu_priv_get(dev);
+ WARN_ON(arm_smmu_master_sva_enabled(master));
+ arm_smmu_detach_dev(master);
+ arm_smmu_disable_pasid(master);
+ kfree(master);
+ iommu_fwspec_free(dev);
+}
+
+static struct iommu_group *arm_smmu_device_group(struct device *dev)
+{
+ struct iommu_group *group;
+
+ /*
+ * We don't support devices sharing stream IDs other than PCI RID
+ * aliases, since the necessary ID-to-device lookup becomes rather
+ * impractical given a potential sparse 32-bit stream ID space.
+ */
+ if (dev_is_pci(dev))
+ group = pci_device_group(dev);
+ else
+ group = generic_device_group(dev);
+
+ return group;
+}
+
+static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
+ enum iommu_attr attr, void *data)
+{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+ switch (domain->type) {
+ case IOMMU_DOMAIN_UNMANAGED:
+ switch (attr) {
+ case DOMAIN_ATTR_NESTING:
+ *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
+ return 0;
+ default:
+ return -ENODEV;
+ }
+ break;
+ case IOMMU_DOMAIN_DMA:
+ switch (attr) {
+ case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
+ *(int *)data = smmu_domain->non_strict;
+ return 0;
+ default:
+ return -ENODEV;
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+}
+
+static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
+ enum iommu_attr attr, void *data)
+{
+ int ret = 0;
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+ mutex_lock(&smmu_domain->init_mutex);
+
+ switch (domain->type) {
+ case IOMMU_DOMAIN_UNMANAGED:
+ switch (attr) {
+ case DOMAIN_ATTR_NESTING:
+ if (smmu_domain->smmu) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ if (*(int *)data)
+ smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
+ else
+ smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
+ break;
+ default:
+ ret = -ENODEV;
+ }
+ break;
+ case IOMMU_DOMAIN_DMA:
+ switch(attr) {
+ case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
+ smmu_domain->non_strict = *(int *)data;
+ break;
+ default:
+ ret = -ENODEV;
+ }
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+out_unlock:
+ mutex_unlock(&smmu_domain->init_mutex);
+ return ret;
+}
+
+static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
+{
+ return iommu_fwspec_add_ids(dev, args->args, 1);
+}
+
+static void arm_smmu_get_resv_regions(struct device *dev,
+ struct list_head *head)
+{
+ struct iommu_resv_region *region;
+ int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+
+ region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH,
+ prot, IOMMU_RESV_SW_MSI);
+ if (!region)
+ return;
+
+ list_add_tail(&region->list, head);
+
+ iommu_dma_get_resv_regions(dev, head);
+}
+
+static bool arm_smmu_dev_has_feature(struct device *dev,
+ enum iommu_dev_features feat)
+{
+ struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+
+ if (!master)
+ return false;
+
+ switch (feat) {
+ case IOMMU_DEV_FEAT_SVA:
+ return arm_smmu_master_sva_supported(master);
+ default:
+ return false;
+ }
+}
+
+static bool arm_smmu_dev_feature_enabled(struct device *dev,
+ enum iommu_dev_features feat)
+{
+ struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+
+ if (!master)
+ return false;
+
+ switch (feat) {
+ case IOMMU_DEV_FEAT_SVA:
+ return arm_smmu_master_sva_enabled(master);
+ default:
+ return false;
+ }
+}
+
+static int arm_smmu_dev_enable_feature(struct device *dev,
+ enum iommu_dev_features feat)
+{
+ if (!arm_smmu_dev_has_feature(dev, feat))
+ return -ENODEV;
+
+ if (arm_smmu_dev_feature_enabled(dev, feat))
+ return -EBUSY;
+
+ switch (feat) {
+ case IOMMU_DEV_FEAT_SVA:
+ return arm_smmu_master_enable_sva(dev_iommu_priv_get(dev));
+ default:
+ return -EINVAL;
+ }
+}
+
+static int arm_smmu_dev_disable_feature(struct device *dev,
+ enum iommu_dev_features feat)
+{
+ if (!arm_smmu_dev_feature_enabled(dev, feat))
+ return -EINVAL;
+
+ switch (feat) {
+ case IOMMU_DEV_FEAT_SVA:
+ return arm_smmu_master_disable_sva(dev_iommu_priv_get(dev));
+ default:
+ return -EINVAL;
+ }
+}
+
+static struct iommu_ops arm_smmu_ops = {
+ .capable = arm_smmu_capable,
+ .domain_alloc = arm_smmu_domain_alloc,
+ .domain_free = arm_smmu_domain_free,
+ .attach_dev = arm_smmu_attach_dev,
+ .map = arm_smmu_map,
+ .unmap = arm_smmu_unmap,
+ .flush_iotlb_all = arm_smmu_flush_iotlb_all,
+ .iotlb_sync = arm_smmu_iotlb_sync,
+ .iova_to_phys = arm_smmu_iova_to_phys,
+ .probe_device = arm_smmu_probe_device,
+ .release_device = arm_smmu_release_device,
+ .device_group = arm_smmu_device_group,
+ .domain_get_attr = arm_smmu_domain_get_attr,
+ .domain_set_attr = arm_smmu_domain_set_attr,
+ .of_xlate = arm_smmu_of_xlate,
+ .get_resv_regions = arm_smmu_get_resv_regions,
+ .put_resv_regions = generic_iommu_put_resv_regions,
+ .dev_has_feat = arm_smmu_dev_has_feature,
+ .dev_feat_enabled = arm_smmu_dev_feature_enabled,
+ .dev_enable_feat = arm_smmu_dev_enable_feature,
+ .dev_disable_feat = arm_smmu_dev_disable_feature,
+ .pgsize_bitmap = -1UL, /* Restricted during device attach */
+};
+
+/* Probing and initialisation functions */
+static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
+ struct arm_smmu_queue *q,
+ unsigned long prod_off,
+ unsigned long cons_off,
+ size_t dwords, const char *name)
+{
+ size_t qsz;
+
+ do {
+ qsz = ((1 << q->llq.max_n_shift) * dwords) << 3;
+ q->base = dmam_alloc_coherent(smmu->dev, qsz, &q->base_dma,
+ GFP_KERNEL);
+ if (q->base || qsz < PAGE_SIZE)
+ break;
+
+ q->llq.max_n_shift--;
+ } while (1);
+
+ if (!q->base) {
+ dev_err(smmu->dev,
+ "failed to allocate queue (0x%zx bytes) for %s\n",
+ qsz, name);
+ return -ENOMEM;
+ }
+
+ if (!WARN_ON(q->base_dma & (qsz - 1))) {
+ dev_info(smmu->dev, "allocated %u entries for %s\n",
+ 1 << q->llq.max_n_shift, name);
+ }
+
+ q->prod_reg = arm_smmu_page1_fixup(prod_off, smmu);
+ q->cons_reg = arm_smmu_page1_fixup(cons_off, smmu);
+ q->ent_dwords = dwords;
+
+ q->q_base = Q_BASE_RWA;
+ q->q_base |= q->base_dma & Q_BASE_ADDR_MASK;
+ q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->llq.max_n_shift);
+
+ q->llq.prod = q->llq.cons = 0;
+ return 0;
+}
+
+static void arm_smmu_cmdq_free_bitmap(void *data)
+{
+ unsigned long *bitmap = data;
+ bitmap_free(bitmap);
+}
+
+static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
+{
+ int ret = 0;
+ struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+ unsigned int nents = 1 << cmdq->q.llq.max_n_shift;
+ atomic_long_t *bitmap;
+
+ atomic_set(&cmdq->owner_prod, 0);
+ atomic_set(&cmdq->lock, 0);
+
+ bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
+ if (!bitmap) {
+ dev_err(smmu->dev, "failed to allocate cmdq bitmap\n");
+ ret = -ENOMEM;
+ } else {
+ cmdq->valid_map = bitmap;
+ devm_add_action(smmu->dev, arm_smmu_cmdq_free_bitmap, bitmap);
+ }
+
+ return ret;
+}
+
+static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
+{
+ int ret;
+
+ /* cmdq */
+ ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD,
+ ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS,
+ "cmdq");
+ if (ret)
+ return ret;
+
+ ret = arm_smmu_cmdq_init(smmu);
+ if (ret)
+ return ret;
+
+ /* evtq */
+ ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD,
+ ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS,
+ "evtq");
+ if (ret)
+ return ret;
+
+ /* priq */
+ if (!(smmu->features & ARM_SMMU_FEAT_PRI))
+ return 0;
+
+ return arm_smmu_init_one_queue(smmu, &smmu->priq.q, ARM_SMMU_PRIQ_PROD,
+ ARM_SMMU_PRIQ_CONS, PRIQ_ENT_DWORDS,
+ "priq");
+}
+
+static int arm_smmu_init_l1_strtab(struct arm_smmu_device *smmu)
+{
+ unsigned int i;
+ struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+ size_t size = sizeof(*cfg->l1_desc) * cfg->num_l1_ents;
+ void *strtab = smmu->strtab_cfg.strtab;
+
+ cfg->l1_desc = devm_kzalloc(smmu->dev, size, GFP_KERNEL);
+ if (!cfg->l1_desc) {
+ dev_err(smmu->dev, "failed to allocate l1 stream table desc\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < cfg->num_l1_ents; ++i) {
+ arm_smmu_write_strtab_l1_desc(strtab, &cfg->l1_desc[i]);
+ strtab += STRTAB_L1_DESC_DWORDS << 3;
+ }
+
+ return 0;
+}
+
+static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu)
+{
+ void *strtab;
+ u64 reg;
+ u32 size, l1size;
+ struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+
+ /* Calculate the L1 size, capped to the SIDSIZE. */
+ size = STRTAB_L1_SZ_SHIFT - (ilog2(STRTAB_L1_DESC_DWORDS) + 3);
+ size = min(size, smmu->sid_bits - STRTAB_SPLIT);
+ cfg->num_l1_ents = 1 << size;
+
+ size += STRTAB_SPLIT;
+ if (size < smmu->sid_bits)
+ dev_warn(smmu->dev,
+ "2-level strtab only covers %u/%u bits of SID\n",
+ size, smmu->sid_bits);
+
+ l1size = cfg->num_l1_ents * (STRTAB_L1_DESC_DWORDS << 3);
+ strtab = dmam_alloc_coherent(smmu->dev, l1size, &cfg->strtab_dma,
+ GFP_KERNEL);
+ if (!strtab) {
+ dev_err(smmu->dev,
+ "failed to allocate l1 stream table (%u bytes)\n",
+ l1size);
+ return -ENOMEM;
+ }
+ cfg->strtab = strtab;
+
+ /* Configure strtab_base_cfg for 2 levels */
+ reg = FIELD_PREP(STRTAB_BASE_CFG_FMT, STRTAB_BASE_CFG_FMT_2LVL);
+ reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, size);
+ reg |= FIELD_PREP(STRTAB_BASE_CFG_SPLIT, STRTAB_SPLIT);
+ cfg->strtab_base_cfg = reg;
+
+ return arm_smmu_init_l1_strtab(smmu);
+}
+
+static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu)
+{
+ void *strtab;
+ u64 reg;
+ u32 size;
+ struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+
+ size = (1 << smmu->sid_bits) * (STRTAB_STE_DWORDS << 3);
+ strtab = dmam_alloc_coherent(smmu->dev, size, &cfg->strtab_dma,
+ GFP_KERNEL);
+ if (!strtab) {
+ dev_err(smmu->dev,
+ "failed to allocate linear stream table (%u bytes)\n",
+ size);
+ return -ENOMEM;
+ }
+ cfg->strtab = strtab;
+ cfg->num_l1_ents = 1 << smmu->sid_bits;
+
+ /* Configure strtab_base_cfg for a linear table covering all SIDs */
+ reg = FIELD_PREP(STRTAB_BASE_CFG_FMT, STRTAB_BASE_CFG_FMT_LINEAR);
+ reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, smmu->sid_bits);
+ cfg->strtab_base_cfg = reg;
+
+ arm_smmu_init_bypass_stes(strtab, cfg->num_l1_ents);
+ return 0;
+}
+
+static int arm_smmu_init_strtab(struct arm_smmu_device *smmu)
+{
+ u64 reg;
+ int ret;
+
+ if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB)
+ ret = arm_smmu_init_strtab_2lvl(smmu);
+ else
+ ret = arm_smmu_init_strtab_linear(smmu);
+
+ if (ret)
+ return ret;
+
+ /* Set the strtab base address */
+ reg = smmu->strtab_cfg.strtab_dma & STRTAB_BASE_ADDR_MASK;
+ reg |= STRTAB_BASE_RA;
+ smmu->strtab_cfg.strtab_base = reg;
+
+ /* Allocate the first VMID for stage-2 bypass STEs */
+ set_bit(0, smmu->vmid_map);
+ return 0;
+}
+
+static int arm_smmu_init_structures(struct arm_smmu_device *smmu)
+{
+ int ret;
+
+ ret = arm_smmu_init_queues(smmu);
+ if (ret)
+ return ret;
+
+ return arm_smmu_init_strtab(smmu);
+}
+
+static int arm_smmu_write_reg_sync(struct arm_smmu_device *smmu, u32 val,
+ unsigned int reg_off, unsigned int ack_off)
+{
+ u32 reg;
+
+ writel_relaxed(val, smmu->base + reg_off);
+ return readl_relaxed_poll_timeout(smmu->base + ack_off, reg, reg == val,
+ 1, ARM_SMMU_POLL_TIMEOUT_US);
+}
+
+/* GBPA is "special" */
+static int arm_smmu_update_gbpa(struct arm_smmu_device *smmu, u32 set, u32 clr)
+{
+ int ret;
+ u32 reg, __iomem *gbpa = smmu->base + ARM_SMMU_GBPA;
+
+ ret = readl_relaxed_poll_timeout(gbpa, reg, !(reg & GBPA_UPDATE),
+ 1, ARM_SMMU_POLL_TIMEOUT_US);
+ if (ret)
+ return ret;
+
+ reg &= ~clr;
+ reg |= set;
+ writel_relaxed(reg | GBPA_UPDATE, gbpa);
+ ret = readl_relaxed_poll_timeout(gbpa, reg, !(reg & GBPA_UPDATE),
+ 1, ARM_SMMU_POLL_TIMEOUT_US);
+
+ if (ret)
+ dev_err(smmu->dev, "GBPA not responding to update\n");
+ return ret;
+}
+
+static void arm_smmu_free_msis(void *data)
+{
+ struct device *dev = data;
+ platform_msi_domain_free_irqs(dev);
+}
+
+static void arm_smmu_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
+{
+ phys_addr_t doorbell;
+ struct device *dev = msi_desc_to_dev(desc);
+ struct arm_smmu_device *smmu = dev_get_drvdata(dev);
+ phys_addr_t *cfg = arm_smmu_msi_cfg[desc->platform.msi_index];
+
+ doorbell = (((u64)msg->address_hi) << 32) | msg->address_lo;
+ doorbell &= MSI_CFG0_ADDR_MASK;
+
+ writeq_relaxed(doorbell, smmu->base + cfg[0]);
+ writel_relaxed(msg->data, smmu->base + cfg[1]);
+ writel_relaxed(ARM_SMMU_MEMATTR_DEVICE_nGnRE, smmu->base + cfg[2]);
+}
+
+static void arm_smmu_setup_msis(struct arm_smmu_device *smmu)
+{
+ struct msi_desc *desc;
+ int ret, nvec = ARM_SMMU_MAX_MSIS;
+ struct device *dev = smmu->dev;
+
+ /* Clear the MSI address regs */
+ writeq_relaxed(0, smmu->base + ARM_SMMU_GERROR_IRQ_CFG0);
+ writeq_relaxed(0, smmu->base + ARM_SMMU_EVTQ_IRQ_CFG0);
+
+ if (smmu->features & ARM_SMMU_FEAT_PRI)
+ writeq_relaxed(0, smmu->base + ARM_SMMU_PRIQ_IRQ_CFG0);
+ else
+ nvec--;
+
+ if (!(smmu->features & ARM_SMMU_FEAT_MSI))
+ return;
+
+ if (!dev->msi_domain) {
+ dev_info(smmu->dev, "msi_domain absent - falling back to wired irqs\n");
+ return;
+ }
+
+ /* Allocate MSIs for evtq, gerror and priq. Ignore cmdq */
+ ret = platform_msi_domain_alloc_irqs(dev, nvec, arm_smmu_write_msi_msg);
+ if (ret) {
+ dev_warn(dev, "failed to allocate MSIs - falling back to wired irqs\n");
+ return;
+ }
+
+ for_each_msi_entry(desc, dev) {
+ switch (desc->platform.msi_index) {
+ case EVTQ_MSI_INDEX:
+ smmu->evtq.q.irq = desc->irq;
+ break;
+ case GERROR_MSI_INDEX:
+ smmu->gerr_irq = desc->irq;
+ break;
+ case PRIQ_MSI_INDEX:
+ smmu->priq.q.irq = desc->irq;
+ break;
+ default: /* Unknown */
+ continue;
+ }
+ }
+
+ /* Add callback to free MSIs on teardown */
+ devm_add_action(dev, arm_smmu_free_msis, dev);
+}
+
+static void arm_smmu_setup_unique_irqs(struct arm_smmu_device *smmu)
+{
+ int irq, ret;
+
+ arm_smmu_setup_msis(smmu);
+
+ /* Request interrupt lines */
+ irq = smmu->evtq.q.irq;
+ if (irq) {
+ ret = devm_request_threaded_irq(smmu->dev, irq, NULL,
+ arm_smmu_evtq_thread,
+ IRQF_ONESHOT,
+ "arm-smmu-v3-evtq", smmu);
+ if (ret < 0)
+ dev_warn(smmu->dev, "failed to enable evtq irq\n");
+ } else {
+ dev_warn(smmu->dev, "no evtq irq - events will not be reported!\n");
+ }
+
+ irq = smmu->gerr_irq;
+ if (irq) {
+ ret = devm_request_irq(smmu->dev, irq, arm_smmu_gerror_handler,
+ 0, "arm-smmu-v3-gerror", smmu);
+ if (ret < 0)
+ dev_warn(smmu->dev, "failed to enable gerror irq\n");
+ } else {
+ dev_warn(smmu->dev, "no gerr irq - errors will not be reported!\n");
+ }
+
+ if (smmu->features & ARM_SMMU_FEAT_PRI) {
+ irq = smmu->priq.q.irq;
+ if (irq) {
+ ret = devm_request_threaded_irq(smmu->dev, irq, NULL,
+ arm_smmu_priq_thread,
+ IRQF_ONESHOT,
+ "arm-smmu-v3-priq",
+ smmu);
+ if (ret < 0)
+ dev_warn(smmu->dev,
+ "failed to enable priq irq\n");
+ } else {
+ dev_warn(smmu->dev, "no priq irq - PRI will be broken\n");
+ }
+ }
+}
+
+static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu)
+{
+ int ret, irq;
+ u32 irqen_flags = IRQ_CTRL_EVTQ_IRQEN | IRQ_CTRL_GERROR_IRQEN;
+
+ /* Disable IRQs first */
+ ret = arm_smmu_write_reg_sync(smmu, 0, ARM_SMMU_IRQ_CTRL,
+ ARM_SMMU_IRQ_CTRLACK);
+ if (ret) {
+ dev_err(smmu->dev, "failed to disable irqs\n");
+ return ret;
+ }
+
+ irq = smmu->combined_irq;
+ if (irq) {
+ /*
+ * Cavium ThunderX2 implementation doesn't support unique irq
+ * lines. Use a single irq line for all the SMMUv3 interrupts.
+ */
+ ret = devm_request_threaded_irq(smmu->dev, irq,
+ arm_smmu_combined_irq_handler,
+ arm_smmu_combined_irq_thread,
+ IRQF_ONESHOT,
+ "arm-smmu-v3-combined-irq", smmu);
+ if (ret < 0)
+ dev_warn(smmu->dev, "failed to enable combined irq\n");
+ } else
+ arm_smmu_setup_unique_irqs(smmu);
+
+ if (smmu->features & ARM_SMMU_FEAT_PRI)
+ irqen_flags |= IRQ_CTRL_PRIQ_IRQEN;
+
+ /* Enable interrupt generation on the SMMU */
+ ret = arm_smmu_write_reg_sync(smmu, irqen_flags,
+ ARM_SMMU_IRQ_CTRL, ARM_SMMU_IRQ_CTRLACK);
+ if (ret)
+ dev_warn(smmu->dev, "failed to enable irqs\n");
+
+ return 0;
+}
+
+static int arm_smmu_device_disable(struct arm_smmu_device *smmu)
+{
+ int ret;
+
+ ret = arm_smmu_write_reg_sync(smmu, 0, ARM_SMMU_CR0, ARM_SMMU_CR0ACK);
+ if (ret)
+ dev_err(smmu->dev, "failed to clear cr0\n");
+
+ return ret;
+}
+
+static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
+{
+ int ret;
+ u32 reg, enables;
+ struct arm_smmu_cmdq_ent cmd;
+
+ /* Clear CR0 and sync (disables SMMU and queue processing) */
+ reg = readl_relaxed(smmu->base + ARM_SMMU_CR0);
+ if (reg & CR0_SMMUEN) {
+ dev_warn(smmu->dev, "SMMU currently enabled! Resetting...\n");
+ WARN_ON(is_kdump_kernel() && !disable_bypass);
+ arm_smmu_update_gbpa(smmu, GBPA_ABORT, 0);
+ }
+
+ ret = arm_smmu_device_disable(smmu);
+ if (ret)
+ return ret;
+
+ /* CR1 (table and queue memory attributes) */
+ reg = FIELD_PREP(CR1_TABLE_SH, ARM_SMMU_SH_ISH) |
+ FIELD_PREP(CR1_TABLE_OC, CR1_CACHE_WB) |
+ FIELD_PREP(CR1_TABLE_IC, CR1_CACHE_WB) |
+ FIELD_PREP(CR1_QUEUE_SH, ARM_SMMU_SH_ISH) |
+ FIELD_PREP(CR1_QUEUE_OC, CR1_CACHE_WB) |
+ FIELD_PREP(CR1_QUEUE_IC, CR1_CACHE_WB);
+ writel_relaxed(reg, smmu->base + ARM_SMMU_CR1);
+
+ /* CR2 (random crap) */
+ reg = CR2_PTM | CR2_RECINVSID | CR2_E2H;
+ writel_relaxed(reg, smmu->base + ARM_SMMU_CR2);
+
+ /* Stream table */
+ writeq_relaxed(smmu->strtab_cfg.strtab_base,
+ smmu->base + ARM_SMMU_STRTAB_BASE);
+ writel_relaxed(smmu->strtab_cfg.strtab_base_cfg,
+ smmu->base + ARM_SMMU_STRTAB_BASE_CFG);
+
+ /* Command queue */
+ writeq_relaxed(smmu->cmdq.q.q_base, smmu->base + ARM_SMMU_CMDQ_BASE);
+ writel_relaxed(smmu->cmdq.q.llq.prod, smmu->base + ARM_SMMU_CMDQ_PROD);
+ writel_relaxed(smmu->cmdq.q.llq.cons, smmu->base + ARM_SMMU_CMDQ_CONS);
+
+ enables = CR0_CMDQEN;
+ ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0,
+ ARM_SMMU_CR0ACK);
+ if (ret) {
+ dev_err(smmu->dev, "failed to enable command queue\n");
+ return ret;
+ }
+
+ /* Invalidate any cached configuration */
+ cmd.opcode = CMDQ_OP_CFGI_ALL;
+ arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+ arm_smmu_cmdq_issue_sync(smmu);
+
+ /* Invalidate any stale TLB entries */
+ if (smmu->features & ARM_SMMU_FEAT_HYP) {
+ cmd.opcode = CMDQ_OP_TLBI_EL2_ALL;
+ arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+ }
+
+ cmd.opcode = CMDQ_OP_TLBI_NSNH_ALL;
+ arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+ arm_smmu_cmdq_issue_sync(smmu);
+
+ /* Event queue */
+ writeq_relaxed(smmu->evtq.q.q_base, smmu->base + ARM_SMMU_EVTQ_BASE);
+ writel_relaxed(smmu->evtq.q.llq.prod,
+ arm_smmu_page1_fixup(ARM_SMMU_EVTQ_PROD, smmu));
+ writel_relaxed(smmu->evtq.q.llq.cons,
+ arm_smmu_page1_fixup(ARM_SMMU_EVTQ_CONS, smmu));
+
+ enables |= CR0_EVTQEN;
+ ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0,
+ ARM_SMMU_CR0ACK);
+ if (ret) {
+ dev_err(smmu->dev, "failed to enable event queue\n");
+ return ret;
+ }
+
+ /* PRI queue */
+ if (smmu->features & ARM_SMMU_FEAT_PRI) {
+ writeq_relaxed(smmu->priq.q.q_base,
+ smmu->base + ARM_SMMU_PRIQ_BASE);
+ writel_relaxed(smmu->priq.q.llq.prod,
+ arm_smmu_page1_fixup(ARM_SMMU_PRIQ_PROD, smmu));
+ writel_relaxed(smmu->priq.q.llq.cons,
+ arm_smmu_page1_fixup(ARM_SMMU_PRIQ_CONS, smmu));
+
+ enables |= CR0_PRIQEN;
+ ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0,
+ ARM_SMMU_CR0ACK);
+ if (ret) {
+ dev_err(smmu->dev, "failed to enable PRI queue\n");
+ return ret;
+ }
+ }
+
+ if (smmu->features & ARM_SMMU_FEAT_ATS) {
+ enables |= CR0_ATSCHK;
+ ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0,
+ ARM_SMMU_CR0ACK);
+ if (ret) {
+ dev_err(smmu->dev, "failed to enable ATS check\n");
+ return ret;
+ }
+ }
+
+ ret = arm_smmu_setup_irqs(smmu);
+ if (ret) {
+ dev_err(smmu->dev, "failed to setup irqs\n");
+ return ret;
+ }
+
+ if (is_kdump_kernel())
+ enables &= ~(CR0_EVTQEN | CR0_PRIQEN);
+
+ /* Enable the SMMU interface, or ensure bypass */
+ if (!bypass || disable_bypass) {
+ enables |= CR0_SMMUEN;
+ } else {
+ ret = arm_smmu_update_gbpa(smmu, 0, GBPA_ABORT);
+ if (ret)
+ return ret;
+ }
+ ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0,
+ ARM_SMMU_CR0ACK);
+ if (ret) {
+ dev_err(smmu->dev, "failed to enable SMMU interface\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
+{
+ u32 reg;
+ bool coherent = smmu->features & ARM_SMMU_FEAT_COHERENCY;
+
+ /* IDR0 */
+ reg = readl_relaxed(smmu->base + ARM_SMMU_IDR0);
+
+ /* 2-level structures */
+ if (FIELD_GET(IDR0_ST_LVL, reg) == IDR0_ST_LVL_2LVL)
+ smmu->features |= ARM_SMMU_FEAT_2_LVL_STRTAB;
+
+ if (reg & IDR0_CD2L)
+ smmu->features |= ARM_SMMU_FEAT_2_LVL_CDTAB;
+
+ /*
+ * Translation table endianness.
+ * We currently require the same endianness as the CPU, but this
+ * could be changed later by adding a new IO_PGTABLE_QUIRK.
+ */
+ switch (FIELD_GET(IDR0_TTENDIAN, reg)) {
+ case IDR0_TTENDIAN_MIXED:
+ smmu->features |= ARM_SMMU_FEAT_TT_LE | ARM_SMMU_FEAT_TT_BE;
+ break;
+#ifdef __BIG_ENDIAN
+ case IDR0_TTENDIAN_BE:
+ smmu->features |= ARM_SMMU_FEAT_TT_BE;
+ break;
+#else
+ case IDR0_TTENDIAN_LE:
+ smmu->features |= ARM_SMMU_FEAT_TT_LE;
+ break;
+#endif
+ default:
+ dev_err(smmu->dev, "unknown/unsupported TT endianness!\n");
+ return -ENXIO;
+ }
+
+ /* Boolean feature flags */
+ if (IS_ENABLED(CONFIG_PCI_PRI) && reg & IDR0_PRI)
+ smmu->features |= ARM_SMMU_FEAT_PRI;
+
+ if (IS_ENABLED(CONFIG_PCI_ATS) && reg & IDR0_ATS)
+ smmu->features |= ARM_SMMU_FEAT_ATS;
+
+ if (reg & IDR0_SEV)
+ smmu->features |= ARM_SMMU_FEAT_SEV;
+
+ if (reg & IDR0_MSI) {
+ smmu->features |= ARM_SMMU_FEAT_MSI;
+ if (coherent && !disable_msipolling)
+ smmu->options |= ARM_SMMU_OPT_MSIPOLL;
+ }
+
+ if (reg & IDR0_HYP)
+ smmu->features |= ARM_SMMU_FEAT_HYP;
+
+ /*
+ * The coherency feature as set by FW is used in preference to the ID
+ * register, but warn on mismatch.
+ */
+ if (!!(reg & IDR0_COHACC) != coherent)
+ dev_warn(smmu->dev, "IDR0.COHACC overridden by FW configuration (%s)\n",
+ coherent ? "true" : "false");
+
+ switch (FIELD_GET(IDR0_STALL_MODEL, reg)) {
+ case IDR0_STALL_MODEL_FORCE:
+ smmu->features |= ARM_SMMU_FEAT_STALL_FORCE;
+ fallthrough;
+ case IDR0_STALL_MODEL_STALL:
+ smmu->features |= ARM_SMMU_FEAT_STALLS;
+ }
+
+ if (reg & IDR0_S1P)
+ smmu->features |= ARM_SMMU_FEAT_TRANS_S1;
+
+ if (reg & IDR0_S2P)
+ smmu->features |= ARM_SMMU_FEAT_TRANS_S2;
+
+ if (!(reg & (IDR0_S1P | IDR0_S2P))) {
+ dev_err(smmu->dev, "no translation support!\n");
+ return -ENXIO;
+ }
+
+ /* We only support the AArch64 table format at present */
+ switch (FIELD_GET(IDR0_TTF, reg)) {
+ case IDR0_TTF_AARCH32_64:
+ smmu->ias = 40;
+ fallthrough;
+ case IDR0_TTF_AARCH64:
+ break;
+ default:
+ dev_err(smmu->dev, "AArch64 table format not supported!\n");
+ return -ENXIO;
+ }
+
+ /* ASID/VMID sizes */
+ smmu->asid_bits = reg & IDR0_ASID16 ? 16 : 8;
+ smmu->vmid_bits = reg & IDR0_VMID16 ? 16 : 8;
+
+ /* IDR1 */
+ reg = readl_relaxed(smmu->base + ARM_SMMU_IDR1);
+ if (reg & (IDR1_TABLES_PRESET | IDR1_QUEUES_PRESET | IDR1_REL)) {
+ dev_err(smmu->dev, "embedded implementation not supported\n");
+ return -ENXIO;
+ }
+
+ /* Queue sizes, capped to ensure natural alignment */
+ smmu->cmdq.q.llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT,
+ FIELD_GET(IDR1_CMDQS, reg));
+ if (smmu->cmdq.q.llq.max_n_shift <= ilog2(CMDQ_BATCH_ENTRIES)) {
+ /*
+ * We don't support splitting up batches, so one batch of
+ * commands plus an extra sync needs to fit inside the command
+ * queue. There's also no way we can handle the weird alignment
+ * restrictions on the base pointer for a unit-length queue.
+ */
+ dev_err(smmu->dev, "command queue size <= %d entries not supported\n",
+ CMDQ_BATCH_ENTRIES);
+ return -ENXIO;
+ }
+
+ smmu->evtq.q.llq.max_n_shift = min_t(u32, EVTQ_MAX_SZ_SHIFT,
+ FIELD_GET(IDR1_EVTQS, reg));
+ smmu->priq.q.llq.max_n_shift = min_t(u32, PRIQ_MAX_SZ_SHIFT,
+ FIELD_GET(IDR1_PRIQS, reg));
+
+ /* SID/SSID sizes */
+ smmu->ssid_bits = FIELD_GET(IDR1_SSIDSIZE, reg);
+ smmu->sid_bits = FIELD_GET(IDR1_SIDSIZE, reg);
+
+ /*
+ * If the SMMU supports fewer bits than would fill a single L2 stream
+ * table, use a linear table instead.
+ */
+ if (smmu->sid_bits <= STRTAB_SPLIT)
+ smmu->features &= ~ARM_SMMU_FEAT_2_LVL_STRTAB;
+
+ /* IDR3 */
+ reg = readl_relaxed(smmu->base + ARM_SMMU_IDR3);
+ if (FIELD_GET(IDR3_RIL, reg))
+ smmu->features |= ARM_SMMU_FEAT_RANGE_INV;
+
+ /* IDR5 */
+ reg = readl_relaxed(smmu->base + ARM_SMMU_IDR5);
+
+ /* Maximum number of outstanding stalls */
+ smmu->evtq.max_stalls = FIELD_GET(IDR5_STALL_MAX, reg);
+
+ /* Page sizes */
+ if (reg & IDR5_GRAN64K)
+ smmu->pgsize_bitmap |= SZ_64K | SZ_512M;
+ if (reg & IDR5_GRAN16K)
+ smmu->pgsize_bitmap |= SZ_16K | SZ_32M;
+ if (reg & IDR5_GRAN4K)
+ smmu->pgsize_bitmap |= SZ_4K | SZ_2M | SZ_1G;
+
+ /* Input address size */
+ if (FIELD_GET(IDR5_VAX, reg) == IDR5_VAX_52_BIT)
+ smmu->features |= ARM_SMMU_FEAT_VAX;
+
+ /* Output address size */
+ switch (FIELD_GET(IDR5_OAS, reg)) {
+ case IDR5_OAS_32_BIT:
+ smmu->oas = 32;
+ break;
+ case IDR5_OAS_36_BIT:
+ smmu->oas = 36;
+ break;
+ case IDR5_OAS_40_BIT:
+ smmu->oas = 40;
+ break;
+ case IDR5_OAS_42_BIT:
+ smmu->oas = 42;
+ break;
+ case IDR5_OAS_44_BIT:
+ smmu->oas = 44;
+ break;
+ case IDR5_OAS_52_BIT:
+ smmu->oas = 52;
+ smmu->pgsize_bitmap |= 1ULL << 42; /* 4TB */
+ break;
+ default:
+ dev_info(smmu->dev,
+ "unknown output address size. Truncating to 48-bit\n");
+ fallthrough;
+ case IDR5_OAS_48_BIT:
+ smmu->oas = 48;
+ }
+
+ if (arm_smmu_ops.pgsize_bitmap == -1UL)
+ arm_smmu_ops.pgsize_bitmap = smmu->pgsize_bitmap;
+ else
+ arm_smmu_ops.pgsize_bitmap |= smmu->pgsize_bitmap;
+
+ /* Set the DMA mask for our table walker */
+ if (dma_set_mask_and_coherent(smmu->dev, DMA_BIT_MASK(smmu->oas)))
+ dev_warn(smmu->dev,
+ "failed to set DMA mask for table walker\n");
+
+ smmu->ias = max(smmu->ias, smmu->oas);
+
+ if (arm_smmu_sva_supported(smmu))
+ smmu->features |= ARM_SMMU_FEAT_SVA;
+
+ dev_info(smmu->dev, "ias %lu-bit, oas %lu-bit (features 0x%08x)\n",
+ smmu->ias, smmu->oas, smmu->features);
+ return 0;
+}
+
+#ifdef CONFIG_ACPI
+static void acpi_smmu_get_options(u32 model, struct arm_smmu_device *smmu)
+{
+ switch (model) {
+ case ACPI_IORT_SMMU_V3_CAVIUM_CN99XX:
+ smmu->options |= ARM_SMMU_OPT_PAGE0_REGS_ONLY;
+ break;
+ case ACPI_IORT_SMMU_V3_HISILICON_HI161X:
+ smmu->options |= ARM_SMMU_OPT_SKIP_PREFETCH;
+ break;
+ }
+
+ dev_notice(smmu->dev, "option mask 0x%x\n", smmu->options);
+}
+
+static int arm_smmu_device_acpi_probe(struct platform_device *pdev,
+ struct arm_smmu_device *smmu)
+{
+ struct acpi_iort_smmu_v3 *iort_smmu;
+ struct device *dev = smmu->dev;
+ struct acpi_iort_node *node;
+
+ node = *(struct acpi_iort_node **)dev_get_platdata(dev);
+
+ /* Retrieve SMMUv3 specific data */
+ iort_smmu = (struct acpi_iort_smmu_v3 *)node->node_data;
+
+ acpi_smmu_get_options(iort_smmu->model, smmu);
+
+ if (iort_smmu->flags & ACPI_IORT_SMMU_V3_COHACC_OVERRIDE)
+ smmu->features |= ARM_SMMU_FEAT_COHERENCY;
+
+ return 0;
+}
+#else
+static inline int arm_smmu_device_acpi_probe(struct platform_device *pdev,
+ struct arm_smmu_device *smmu)
+{
+ return -ENODEV;
+}
+#endif
+
+static int arm_smmu_device_dt_probe(struct platform_device *pdev,
+ struct arm_smmu_device *smmu)
+{
+ struct device *dev = &pdev->dev;
+ u32 cells;
+ int ret = -EINVAL;
+
+ if (of_property_read_u32(dev->of_node, "#iommu-cells", &cells))
+ dev_err(dev, "missing #iommu-cells property\n");
+ else if (cells != 1)
+ dev_err(dev, "invalid #iommu-cells value (%d)\n", cells);
+ else
+ ret = 0;
+
+ parse_driver_options(smmu);
+
+ if (of_dma_is_coherent(dev->of_node))
+ smmu->features |= ARM_SMMU_FEAT_COHERENCY;
+
+ return ret;
+}
+
+static unsigned long arm_smmu_resource_size(struct arm_smmu_device *smmu)
+{
+ if (smmu->options & ARM_SMMU_OPT_PAGE0_REGS_ONLY)
+ return SZ_64K;
+ else
+ return SZ_128K;
+}
+
+static int arm_smmu_set_bus_ops(struct iommu_ops *ops)
+{
+ int err;
+
+#ifdef CONFIG_PCI
+ if (pci_bus_type.iommu_ops != ops) {
+ err = bus_set_iommu(&pci_bus_type, ops);
+ if (err)
+ return err;
+ }
+#endif
+#ifdef CONFIG_ARM_AMBA
+ if (amba_bustype.iommu_ops != ops) {
+ err = bus_set_iommu(&amba_bustype, ops);
+ if (err)
+ goto err_reset_pci_ops;
+ }
+#endif
+ if (platform_bus_type.iommu_ops != ops) {
+ err = bus_set_iommu(&platform_bus_type, ops);
+ if (err)
+ goto err_reset_amba_ops;
+ }
+
+ return 0;
+
+err_reset_amba_ops:
+#ifdef CONFIG_ARM_AMBA
+ bus_set_iommu(&amba_bustype, NULL);
+#endif
+err_reset_pci_ops: __maybe_unused;
+#ifdef CONFIG_PCI
+ bus_set_iommu(&pci_bus_type, NULL);
+#endif
+ return err;
+}
+
+static void __iomem *arm_smmu_ioremap(struct device *dev, resource_size_t start,
+ resource_size_t size)
+{
+ struct resource res = {
+ .flags = IORESOURCE_MEM,
+ .start = start,
+ .end = start + size - 1,
+ };
+
+ return devm_ioremap_resource(dev, &res);
+}
+
+static int arm_smmu_device_probe(struct platform_device *pdev)
+{
+ int irq, ret;
+ struct resource *res;
+ resource_size_t ioaddr;
+ struct arm_smmu_device *smmu;
+ struct device *dev = &pdev->dev;
+ bool bypass;
+
+ smmu = devm_kzalloc(dev, sizeof(*smmu), GFP_KERNEL);
+ if (!smmu) {
+ dev_err(dev, "failed to allocate arm_smmu_device\n");
+ return -ENOMEM;
+ }
+ smmu->dev = dev;
+
+ if (dev->of_node) {
+ ret = arm_smmu_device_dt_probe(pdev, smmu);
+ } else {
+ ret = arm_smmu_device_acpi_probe(pdev, smmu);
+ if (ret == -ENODEV)
+ return ret;
+ }
+
+ /* Set bypass mode according to firmware probing result */
+ bypass = !!ret;
+
+ /* Base address */
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!res)
+ return -EINVAL;
+ if (resource_size(res) < arm_smmu_resource_size(smmu)) {
+ dev_err(dev, "MMIO region too small (%pr)\n", res);
+ return -EINVAL;
+ }
+ ioaddr = res->start;
+
+ /*
+ * Don't map the IMPLEMENTATION DEFINED regions, since they may contain
+ * the PMCG registers which are reserved by the PMU driver.
+ */
+ smmu->base = arm_smmu_ioremap(dev, ioaddr, ARM_SMMU_REG_SZ);
+ if (IS_ERR(smmu->base))
+ return PTR_ERR(smmu->base);
+
+ if (arm_smmu_resource_size(smmu) > SZ_64K) {
+ smmu->page1 = arm_smmu_ioremap(dev, ioaddr + SZ_64K,
+ ARM_SMMU_REG_SZ);
+ if (IS_ERR(smmu->page1))
+ return PTR_ERR(smmu->page1);
+ } else {
+ smmu->page1 = smmu->base;
+ }
+
+ /* Interrupt lines */
+
+ irq = platform_get_irq_byname_optional(pdev, "combined");
+ if (irq > 0)
+ smmu->combined_irq = irq;
+ else {
+ irq = platform_get_irq_byname_optional(pdev, "eventq");
+ if (irq > 0)
+ smmu->evtq.q.irq = irq;
+
+ irq = platform_get_irq_byname_optional(pdev, "priq");
+ if (irq > 0)
+ smmu->priq.q.irq = irq;
+
+ irq = platform_get_irq_byname_optional(pdev, "gerror");
+ if (irq > 0)
+ smmu->gerr_irq = irq;
+ }
+ /* Probe the h/w */
+ ret = arm_smmu_device_hw_probe(smmu);
+ if (ret)
+ return ret;
+
+ /* Initialise in-memory data structures */
+ ret = arm_smmu_init_structures(smmu);
+ if (ret)
+ return ret;
+
+ /* Record our private device structure */
+ platform_set_drvdata(pdev, smmu);
+
+ /* Reset the device */
+ ret = arm_smmu_device_reset(smmu, bypass);
+ if (ret)
+ return ret;
+
+ /* And we're up. Go go go! */
+ ret = iommu_device_sysfs_add(&smmu->iommu, dev, NULL,
+ "smmu3.%pa", &ioaddr);
+ if (ret)
+ return ret;
+
+ iommu_device_set_ops(&smmu->iommu, &arm_smmu_ops);
+ iommu_device_set_fwnode(&smmu->iommu, dev->fwnode);
+
+ ret = iommu_device_register(&smmu->iommu);
+ if (ret) {
+ dev_err(dev, "Failed to register iommu\n");
+ return ret;
+ }
+
+ return arm_smmu_set_bus_ops(&arm_smmu_ops);
+}
+
+static int arm_smmu_device_remove(struct platform_device *pdev)
+{
+ struct arm_smmu_device *smmu = platform_get_drvdata(pdev);
+
+ arm_smmu_set_bus_ops(NULL);
+ iommu_device_unregister(&smmu->iommu);
+ iommu_device_sysfs_remove(&smmu->iommu);
+ arm_smmu_device_disable(smmu);
+
+ return 0;
+}
+
+static void arm_smmu_device_shutdown(struct platform_device *pdev)
+{
+ arm_smmu_device_remove(pdev);
+}
+
+static const struct of_device_id arm_smmu_of_match[] = {
+ { .compatible = "arm,smmu-v3", },
+ { },
+};
+MODULE_DEVICE_TABLE(of, arm_smmu_of_match);
+
+static struct platform_driver arm_smmu_driver = {
+ .driver = {
+ .name = "arm-smmu-v3",
+ .of_match_table = arm_smmu_of_match,
+ .suppress_bind_attrs = true,
+ },
+ .probe = arm_smmu_device_probe,
+ .remove = arm_smmu_device_remove,
+ .shutdown = arm_smmu_device_shutdown,
+};
+module_platform_driver(arm_smmu_driver);
+
+MODULE_DESCRIPTION("IOMMU API for ARM architected SMMUv3 implementations");
+MODULE_AUTHOR("Will Deacon <will@kernel.org>");
+MODULE_ALIAS("platform:arm-smmu-v3");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
new file mode 100644
index 000000000..57e5d223c
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -0,0 +1,723 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * IOMMU API for ARM architected SMMUv3 implementations.
+ *
+ * Copyright (C) 2015 ARM Limited
+ */
+
+#ifndef _ARM_SMMU_V3_H
+#define _ARM_SMMU_V3_H
+
+#include <linux/bitfield.h>
+#include <linux/iommu.h>
+#include <linux/kernel.h>
+#include <linux/mmzone.h>
+#include <linux/sizes.h>
+
+/* MMIO registers */
+#define ARM_SMMU_IDR0 0x0
+#define IDR0_ST_LVL GENMASK(28, 27)
+#define IDR0_ST_LVL_2LVL 1
+#define IDR0_STALL_MODEL GENMASK(25, 24)
+#define IDR0_STALL_MODEL_STALL 0
+#define IDR0_STALL_MODEL_FORCE 2
+#define IDR0_TTENDIAN GENMASK(22, 21)
+#define IDR0_TTENDIAN_MIXED 0
+#define IDR0_TTENDIAN_LE 2
+#define IDR0_TTENDIAN_BE 3
+#define IDR0_CD2L (1 << 19)
+#define IDR0_VMID16 (1 << 18)
+#define IDR0_PRI (1 << 16)
+#define IDR0_SEV (1 << 14)
+#define IDR0_MSI (1 << 13)
+#define IDR0_ASID16 (1 << 12)
+#define IDR0_ATS (1 << 10)
+#define IDR0_HYP (1 << 9)
+#define IDR0_COHACC (1 << 4)
+#define IDR0_TTF GENMASK(3, 2)
+#define IDR0_TTF_AARCH64 2
+#define IDR0_TTF_AARCH32_64 3
+#define IDR0_S1P (1 << 1)
+#define IDR0_S2P (1 << 0)
+
+#define ARM_SMMU_IDR1 0x4
+#define IDR1_TABLES_PRESET (1 << 30)
+#define IDR1_QUEUES_PRESET (1 << 29)
+#define IDR1_REL (1 << 28)
+#define IDR1_CMDQS GENMASK(25, 21)
+#define IDR1_EVTQS GENMASK(20, 16)
+#define IDR1_PRIQS GENMASK(15, 11)
+#define IDR1_SSIDSIZE GENMASK(10, 6)
+#define IDR1_SIDSIZE GENMASK(5, 0)
+
+#define ARM_SMMU_IDR3 0xc
+#define IDR3_RIL (1 << 10)
+
+#define ARM_SMMU_IDR5 0x14
+#define IDR5_STALL_MAX GENMASK(31, 16)
+#define IDR5_GRAN64K (1 << 6)
+#define IDR5_GRAN16K (1 << 5)
+#define IDR5_GRAN4K (1 << 4)
+#define IDR5_OAS GENMASK(2, 0)
+#define IDR5_OAS_32_BIT 0
+#define IDR5_OAS_36_BIT 1
+#define IDR5_OAS_40_BIT 2
+#define IDR5_OAS_42_BIT 3
+#define IDR5_OAS_44_BIT 4
+#define IDR5_OAS_48_BIT 5
+#define IDR5_OAS_52_BIT 6
+#define IDR5_VAX GENMASK(11, 10)
+#define IDR5_VAX_52_BIT 1
+
+#define ARM_SMMU_CR0 0x20
+#define CR0_ATSCHK (1 << 4)
+#define CR0_CMDQEN (1 << 3)
+#define CR0_EVTQEN (1 << 2)
+#define CR0_PRIQEN (1 << 1)
+#define CR0_SMMUEN (1 << 0)
+
+#define ARM_SMMU_CR0ACK 0x24
+
+#define ARM_SMMU_CR1 0x28
+#define CR1_TABLE_SH GENMASK(11, 10)
+#define CR1_TABLE_OC GENMASK(9, 8)
+#define CR1_TABLE_IC GENMASK(7, 6)
+#define CR1_QUEUE_SH GENMASK(5, 4)
+#define CR1_QUEUE_OC GENMASK(3, 2)
+#define CR1_QUEUE_IC GENMASK(1, 0)
+/* CR1 cacheability fields don't quite follow the usual TCR-style encoding */
+#define CR1_CACHE_NC 0
+#define CR1_CACHE_WB 1
+#define CR1_CACHE_WT 2
+
+#define ARM_SMMU_CR2 0x2c
+#define CR2_PTM (1 << 2)
+#define CR2_RECINVSID (1 << 1)
+#define CR2_E2H (1 << 0)
+
+#define ARM_SMMU_GBPA 0x44
+#define GBPA_UPDATE (1 << 31)
+#define GBPA_ABORT (1 << 20)
+
+#define ARM_SMMU_IRQ_CTRL 0x50
+#define IRQ_CTRL_EVTQ_IRQEN (1 << 2)
+#define IRQ_CTRL_PRIQ_IRQEN (1 << 1)
+#define IRQ_CTRL_GERROR_IRQEN (1 << 0)
+
+#define ARM_SMMU_IRQ_CTRLACK 0x54
+
+#define ARM_SMMU_GERROR 0x60
+#define GERROR_SFM_ERR (1 << 8)
+#define GERROR_MSI_GERROR_ABT_ERR (1 << 7)
+#define GERROR_MSI_PRIQ_ABT_ERR (1 << 6)
+#define GERROR_MSI_EVTQ_ABT_ERR (1 << 5)
+#define GERROR_MSI_CMDQ_ABT_ERR (1 << 4)
+#define GERROR_PRIQ_ABT_ERR (1 << 3)
+#define GERROR_EVTQ_ABT_ERR (1 << 2)
+#define GERROR_CMDQ_ERR (1 << 0)
+#define GERROR_ERR_MASK 0x1fd
+
+#define ARM_SMMU_GERRORN 0x64
+
+#define ARM_SMMU_GERROR_IRQ_CFG0 0x68
+#define ARM_SMMU_GERROR_IRQ_CFG1 0x70
+#define ARM_SMMU_GERROR_IRQ_CFG2 0x74
+
+#define ARM_SMMU_STRTAB_BASE 0x80
+#define STRTAB_BASE_RA (1UL << 62)
+#define STRTAB_BASE_ADDR_MASK GENMASK_ULL(51, 6)
+
+#define ARM_SMMU_STRTAB_BASE_CFG 0x88
+#define STRTAB_BASE_CFG_FMT GENMASK(17, 16)
+#define STRTAB_BASE_CFG_FMT_LINEAR 0
+#define STRTAB_BASE_CFG_FMT_2LVL 1
+#define STRTAB_BASE_CFG_SPLIT GENMASK(10, 6)
+#define STRTAB_BASE_CFG_LOG2SIZE GENMASK(5, 0)
+
+#define ARM_SMMU_CMDQ_BASE 0x90
+#define ARM_SMMU_CMDQ_PROD 0x98
+#define ARM_SMMU_CMDQ_CONS 0x9c
+
+#define ARM_SMMU_EVTQ_BASE 0xa0
+#define ARM_SMMU_EVTQ_PROD 0x100a8
+#define ARM_SMMU_EVTQ_CONS 0x100ac
+#define ARM_SMMU_EVTQ_IRQ_CFG0 0xb0
+#define ARM_SMMU_EVTQ_IRQ_CFG1 0xb8
+#define ARM_SMMU_EVTQ_IRQ_CFG2 0xbc
+
+#define ARM_SMMU_PRIQ_BASE 0xc0
+#define ARM_SMMU_PRIQ_PROD 0x100c8
+#define ARM_SMMU_PRIQ_CONS 0x100cc
+#define ARM_SMMU_PRIQ_IRQ_CFG0 0xd0
+#define ARM_SMMU_PRIQ_IRQ_CFG1 0xd8
+#define ARM_SMMU_PRIQ_IRQ_CFG2 0xdc
+
+#define ARM_SMMU_REG_SZ 0xe00
+
+/* Common MSI config fields */
+#define MSI_CFG0_ADDR_MASK GENMASK_ULL(51, 2)
+#define MSI_CFG2_SH GENMASK(5, 4)
+#define MSI_CFG2_MEMATTR GENMASK(3, 0)
+
+/* Common memory attribute values */
+#define ARM_SMMU_SH_NSH 0
+#define ARM_SMMU_SH_OSH 2
+#define ARM_SMMU_SH_ISH 3
+#define ARM_SMMU_MEMATTR_DEVICE_nGnRE 0x1
+#define ARM_SMMU_MEMATTR_OIWB 0xf
+
+#define Q_IDX(llq, p) ((p) & ((1 << (llq)->max_n_shift) - 1))
+#define Q_WRP(llq, p) ((p) & (1 << (llq)->max_n_shift))
+#define Q_OVERFLOW_FLAG (1U << 31)
+#define Q_OVF(p) ((p) & Q_OVERFLOW_FLAG)
+#define Q_ENT(q, p) ((q)->base + \
+ Q_IDX(&((q)->llq), p) * \
+ (q)->ent_dwords)
+
+#define Q_BASE_RWA (1UL << 62)
+#define Q_BASE_ADDR_MASK GENMASK_ULL(51, 5)
+#define Q_BASE_LOG2SIZE GENMASK(4, 0)
+
+/* Ensure DMA allocations are naturally aligned */
+#ifdef CONFIG_CMA_ALIGNMENT
+#define Q_MAX_SZ_SHIFT (PAGE_SHIFT + CONFIG_CMA_ALIGNMENT)
+#else
+#define Q_MAX_SZ_SHIFT (PAGE_SHIFT + MAX_ORDER - 1)
+#endif
+
+/*
+ * Stream table.
+ *
+ * Linear: Enough to cover 1 << IDR1.SIDSIZE entries
+ * 2lvl: 128k L1 entries,
+ * 256 lazy entries per table (each table covers a PCI bus)
+ */
+#define STRTAB_L1_SZ_SHIFT 20
+#define STRTAB_SPLIT 8
+
+#define STRTAB_L1_DESC_DWORDS 1
+#define STRTAB_L1_DESC_SPAN GENMASK_ULL(4, 0)
+#define STRTAB_L1_DESC_L2PTR_MASK GENMASK_ULL(51, 6)
+
+#define STRTAB_STE_DWORDS 8
+#define STRTAB_STE_0_V (1UL << 0)
+#define STRTAB_STE_0_CFG GENMASK_ULL(3, 1)
+#define STRTAB_STE_0_CFG_ABORT 0
+#define STRTAB_STE_0_CFG_BYPASS 4
+#define STRTAB_STE_0_CFG_S1_TRANS 5
+#define STRTAB_STE_0_CFG_S2_TRANS 6
+
+#define STRTAB_STE_0_S1FMT GENMASK_ULL(5, 4)
+#define STRTAB_STE_0_S1FMT_LINEAR 0
+#define STRTAB_STE_0_S1FMT_64K_L2 2
+#define STRTAB_STE_0_S1CTXPTR_MASK GENMASK_ULL(51, 6)
+#define STRTAB_STE_0_S1CDMAX GENMASK_ULL(63, 59)
+
+#define STRTAB_STE_1_S1DSS GENMASK_ULL(1, 0)
+#define STRTAB_STE_1_S1DSS_TERMINATE 0x0
+#define STRTAB_STE_1_S1DSS_BYPASS 0x1
+#define STRTAB_STE_1_S1DSS_SSID0 0x2
+
+#define STRTAB_STE_1_S1C_CACHE_NC 0UL
+#define STRTAB_STE_1_S1C_CACHE_WBRA 1UL
+#define STRTAB_STE_1_S1C_CACHE_WT 2UL
+#define STRTAB_STE_1_S1C_CACHE_WB 3UL
+#define STRTAB_STE_1_S1CIR GENMASK_ULL(3, 2)
+#define STRTAB_STE_1_S1COR GENMASK_ULL(5, 4)
+#define STRTAB_STE_1_S1CSH GENMASK_ULL(7, 6)
+
+#define STRTAB_STE_1_S1STALLD (1UL << 27)
+
+#define STRTAB_STE_1_EATS GENMASK_ULL(29, 28)
+#define STRTAB_STE_1_EATS_ABT 0UL
+#define STRTAB_STE_1_EATS_TRANS 1UL
+#define STRTAB_STE_1_EATS_S1CHK 2UL
+
+#define STRTAB_STE_1_STRW GENMASK_ULL(31, 30)
+#define STRTAB_STE_1_STRW_NSEL1 0UL
+#define STRTAB_STE_1_STRW_EL2 2UL
+
+#define STRTAB_STE_1_SHCFG GENMASK_ULL(45, 44)
+#define STRTAB_STE_1_SHCFG_INCOMING 1UL
+
+#define STRTAB_STE_2_S2VMID GENMASK_ULL(15, 0)
+#define STRTAB_STE_2_VTCR GENMASK_ULL(50, 32)
+#define STRTAB_STE_2_VTCR_S2T0SZ GENMASK_ULL(5, 0)
+#define STRTAB_STE_2_VTCR_S2SL0 GENMASK_ULL(7, 6)
+#define STRTAB_STE_2_VTCR_S2IR0 GENMASK_ULL(9, 8)
+#define STRTAB_STE_2_VTCR_S2OR0 GENMASK_ULL(11, 10)
+#define STRTAB_STE_2_VTCR_S2SH0 GENMASK_ULL(13, 12)
+#define STRTAB_STE_2_VTCR_S2TG GENMASK_ULL(15, 14)
+#define STRTAB_STE_2_VTCR_S2PS GENMASK_ULL(18, 16)
+#define STRTAB_STE_2_S2AA64 (1UL << 51)
+#define STRTAB_STE_2_S2ENDI (1UL << 52)
+#define STRTAB_STE_2_S2PTW (1UL << 54)
+#define STRTAB_STE_2_S2R (1UL << 58)
+
+#define STRTAB_STE_3_S2TTB_MASK GENMASK_ULL(51, 4)
+
+/*
+ * Context descriptors.
+ *
+ * Linear: when less than 1024 SSIDs are supported
+ * 2lvl: at most 1024 L1 entries,
+ * 1024 lazy entries per table.
+ */
+#define CTXDESC_SPLIT 10
+#define CTXDESC_L2_ENTRIES (1 << CTXDESC_SPLIT)
+
+#define CTXDESC_L1_DESC_DWORDS 1
+#define CTXDESC_L1_DESC_V (1UL << 0)
+#define CTXDESC_L1_DESC_L2PTR_MASK GENMASK_ULL(51, 12)
+
+#define CTXDESC_CD_DWORDS 8
+#define CTXDESC_CD_0_TCR_T0SZ GENMASK_ULL(5, 0)
+#define CTXDESC_CD_0_TCR_TG0 GENMASK_ULL(7, 6)
+#define CTXDESC_CD_0_TCR_IRGN0 GENMASK_ULL(9, 8)
+#define CTXDESC_CD_0_TCR_ORGN0 GENMASK_ULL(11, 10)
+#define CTXDESC_CD_0_TCR_SH0 GENMASK_ULL(13, 12)
+#define CTXDESC_CD_0_TCR_EPD0 (1ULL << 14)
+#define CTXDESC_CD_0_TCR_EPD1 (1ULL << 30)
+
+#define CTXDESC_CD_0_ENDI (1UL << 15)
+#define CTXDESC_CD_0_V (1UL << 31)
+
+#define CTXDESC_CD_0_TCR_IPS GENMASK_ULL(34, 32)
+#define CTXDESC_CD_0_TCR_TBI0 (1ULL << 38)
+
+#define CTXDESC_CD_0_AA64 (1UL << 41)
+#define CTXDESC_CD_0_S (1UL << 44)
+#define CTXDESC_CD_0_R (1UL << 45)
+#define CTXDESC_CD_0_A (1UL << 46)
+#define CTXDESC_CD_0_ASET (1UL << 47)
+#define CTXDESC_CD_0_ASID GENMASK_ULL(63, 48)
+
+#define CTXDESC_CD_1_TTB0_MASK GENMASK_ULL(51, 4)
+
+/*
+ * When the SMMU only supports linear context descriptor tables, pick a
+ * reasonable size limit (64kB).
+ */
+#define CTXDESC_LINEAR_CDMAX ilog2(SZ_64K / (CTXDESC_CD_DWORDS << 3))
+
+/* Command queue */
+#define CMDQ_ENT_SZ_SHIFT 4
+#define CMDQ_ENT_DWORDS ((1 << CMDQ_ENT_SZ_SHIFT) >> 3)
+#define CMDQ_MAX_SZ_SHIFT (Q_MAX_SZ_SHIFT - CMDQ_ENT_SZ_SHIFT)
+
+#define CMDQ_CONS_ERR GENMASK(30, 24)
+#define CMDQ_ERR_CERROR_NONE_IDX 0
+#define CMDQ_ERR_CERROR_ILL_IDX 1
+#define CMDQ_ERR_CERROR_ABT_IDX 2
+#define CMDQ_ERR_CERROR_ATC_INV_IDX 3
+
+#define CMDQ_PROD_OWNED_FLAG Q_OVERFLOW_FLAG
+
+/*
+ * This is used to size the command queue and therefore must be at least
+ * BITS_PER_LONG so that the valid_map works correctly (it relies on the
+ * total number of queue entries being a multiple of BITS_PER_LONG).
+ */
+#define CMDQ_BATCH_ENTRIES BITS_PER_LONG
+
+#define CMDQ_0_OP GENMASK_ULL(7, 0)
+#define CMDQ_0_SSV (1UL << 11)
+
+#define CMDQ_PREFETCH_0_SID GENMASK_ULL(63, 32)
+#define CMDQ_PREFETCH_1_SIZE GENMASK_ULL(4, 0)
+#define CMDQ_PREFETCH_1_ADDR_MASK GENMASK_ULL(63, 12)
+
+#define CMDQ_CFGI_0_SSID GENMASK_ULL(31, 12)
+#define CMDQ_CFGI_0_SID GENMASK_ULL(63, 32)
+#define CMDQ_CFGI_1_LEAF (1UL << 0)
+#define CMDQ_CFGI_1_RANGE GENMASK_ULL(4, 0)
+
+#define CMDQ_TLBI_0_NUM GENMASK_ULL(16, 12)
+#define CMDQ_TLBI_RANGE_NUM_MAX 31
+#define CMDQ_TLBI_0_SCALE GENMASK_ULL(24, 20)
+#define CMDQ_TLBI_0_VMID GENMASK_ULL(47, 32)
+#define CMDQ_TLBI_0_ASID GENMASK_ULL(63, 48)
+#define CMDQ_TLBI_1_LEAF (1UL << 0)
+#define CMDQ_TLBI_1_TTL GENMASK_ULL(9, 8)
+#define CMDQ_TLBI_1_TG GENMASK_ULL(11, 10)
+#define CMDQ_TLBI_1_VA_MASK GENMASK_ULL(63, 12)
+#define CMDQ_TLBI_1_IPA_MASK GENMASK_ULL(51, 12)
+
+#define CMDQ_ATC_0_SSID GENMASK_ULL(31, 12)
+#define CMDQ_ATC_0_SID GENMASK_ULL(63, 32)
+#define CMDQ_ATC_0_GLOBAL (1UL << 9)
+#define CMDQ_ATC_1_SIZE GENMASK_ULL(5, 0)
+#define CMDQ_ATC_1_ADDR_MASK GENMASK_ULL(63, 12)
+
+#define CMDQ_PRI_0_SSID GENMASK_ULL(31, 12)
+#define CMDQ_PRI_0_SID GENMASK_ULL(63, 32)
+#define CMDQ_PRI_1_GRPID GENMASK_ULL(8, 0)
+#define CMDQ_PRI_1_RESP GENMASK_ULL(13, 12)
+
+#define CMDQ_SYNC_0_CS GENMASK_ULL(13, 12)
+#define CMDQ_SYNC_0_CS_NONE 0
+#define CMDQ_SYNC_0_CS_IRQ 1
+#define CMDQ_SYNC_0_CS_SEV 2
+#define CMDQ_SYNC_0_MSH GENMASK_ULL(23, 22)
+#define CMDQ_SYNC_0_MSIATTR GENMASK_ULL(27, 24)
+#define CMDQ_SYNC_0_MSIDATA GENMASK_ULL(63, 32)
+#define CMDQ_SYNC_1_MSIADDR_MASK GENMASK_ULL(51, 2)
+
+/* Event queue */
+#define EVTQ_ENT_SZ_SHIFT 5
+#define EVTQ_ENT_DWORDS ((1 << EVTQ_ENT_SZ_SHIFT) >> 3)
+#define EVTQ_MAX_SZ_SHIFT (Q_MAX_SZ_SHIFT - EVTQ_ENT_SZ_SHIFT)
+
+#define EVTQ_0_ID GENMASK_ULL(7, 0)
+
+/* PRI queue */
+#define PRIQ_ENT_SZ_SHIFT 4
+#define PRIQ_ENT_DWORDS ((1 << PRIQ_ENT_SZ_SHIFT) >> 3)
+#define PRIQ_MAX_SZ_SHIFT (Q_MAX_SZ_SHIFT - PRIQ_ENT_SZ_SHIFT)
+
+#define PRIQ_0_SID GENMASK_ULL(31, 0)
+#define PRIQ_0_SSID GENMASK_ULL(51, 32)
+#define PRIQ_0_PERM_PRIV (1UL << 58)
+#define PRIQ_0_PERM_EXEC (1UL << 59)
+#define PRIQ_0_PERM_READ (1UL << 60)
+#define PRIQ_0_PERM_WRITE (1UL << 61)
+#define PRIQ_0_PRG_LAST (1UL << 62)
+#define PRIQ_0_SSID_V (1UL << 63)
+
+#define PRIQ_1_PRG_IDX GENMASK_ULL(8, 0)
+#define PRIQ_1_ADDR_MASK GENMASK_ULL(63, 12)
+
+/* High-level queue structures */
+#define ARM_SMMU_POLL_TIMEOUT_US 1000000 /* 1s! */
+#define ARM_SMMU_POLL_SPIN_COUNT 10
+
+#define MSI_IOVA_BASE 0x8000000
+#define MSI_IOVA_LENGTH 0x100000
+
+enum pri_resp {
+ PRI_RESP_DENY = 0,
+ PRI_RESP_FAIL = 1,
+ PRI_RESP_SUCC = 2,
+};
+
+struct arm_smmu_cmdq_ent {
+ /* Common fields */
+ u8 opcode;
+ bool substream_valid;
+
+ /* Command-specific fields */
+ union {
+ #define CMDQ_OP_PREFETCH_CFG 0x1
+ struct {
+ u32 sid;
+ u8 size;
+ u64 addr;
+ } prefetch;
+
+ #define CMDQ_OP_CFGI_STE 0x3
+ #define CMDQ_OP_CFGI_ALL 0x4
+ #define CMDQ_OP_CFGI_CD 0x5
+ #define CMDQ_OP_CFGI_CD_ALL 0x6
+ struct {
+ u32 sid;
+ u32 ssid;
+ union {
+ bool leaf;
+ u8 span;
+ };
+ } cfgi;
+
+ #define CMDQ_OP_TLBI_NH_ASID 0x11
+ #define CMDQ_OP_TLBI_NH_VA 0x12
+ #define CMDQ_OP_TLBI_EL2_ALL 0x20
+ #define CMDQ_OP_TLBI_S12_VMALL 0x28
+ #define CMDQ_OP_TLBI_S2_IPA 0x2a
+ #define CMDQ_OP_TLBI_NSNH_ALL 0x30
+ struct {
+ u8 num;
+ u8 scale;
+ u16 asid;
+ u16 vmid;
+ bool leaf;
+ u8 ttl;
+ u8 tg;
+ u64 addr;
+ } tlbi;
+
+ #define CMDQ_OP_ATC_INV 0x40
+ #define ATC_INV_SIZE_ALL 52
+ struct {
+ u32 sid;
+ u32 ssid;
+ u64 addr;
+ u8 size;
+ bool global;
+ } atc;
+
+ #define CMDQ_OP_PRI_RESP 0x41
+ struct {
+ u32 sid;
+ u32 ssid;
+ u16 grpid;
+ enum pri_resp resp;
+ } pri;
+
+ #define CMDQ_OP_CMD_SYNC 0x46
+ struct {
+ u64 msiaddr;
+ } sync;
+ };
+};
+
+struct arm_smmu_ll_queue {
+ union {
+ u64 val;
+ struct {
+ u32 prod;
+ u32 cons;
+ };
+ struct {
+ atomic_t prod;
+ atomic_t cons;
+ } atomic;
+ u8 __pad[SMP_CACHE_BYTES];
+ } ____cacheline_aligned_in_smp;
+ u32 max_n_shift;
+};
+
+struct arm_smmu_queue {
+ struct arm_smmu_ll_queue llq;
+ int irq; /* Wired interrupt */
+
+ __le64 *base;
+ dma_addr_t base_dma;
+ u64 q_base;
+
+ size_t ent_dwords;
+
+ u32 __iomem *prod_reg;
+ u32 __iomem *cons_reg;
+};
+
+struct arm_smmu_queue_poll {
+ ktime_t timeout;
+ unsigned int delay;
+ unsigned int spin_cnt;
+ bool wfe;
+};
+
+struct arm_smmu_cmdq {
+ struct arm_smmu_queue q;
+ atomic_long_t *valid_map;
+ atomic_t owner_prod;
+ atomic_t lock;
+};
+
+struct arm_smmu_cmdq_batch {
+ u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
+ int num;
+};
+
+struct arm_smmu_evtq {
+ struct arm_smmu_queue q;
+ u32 max_stalls;
+};
+
+struct arm_smmu_priq {
+ struct arm_smmu_queue q;
+};
+
+/* High-level stream table and context descriptor structures */
+struct arm_smmu_strtab_l1_desc {
+ u8 span;
+
+ __le64 *l2ptr;
+ dma_addr_t l2ptr_dma;
+};
+
+struct arm_smmu_ctx_desc {
+ u16 asid;
+ u64 ttbr;
+ u64 tcr;
+ u64 mair;
+
+ refcount_t refs;
+ struct mm_struct *mm;
+};
+
+struct arm_smmu_l1_ctx_desc {
+ __le64 *l2ptr;
+ dma_addr_t l2ptr_dma;
+};
+
+struct arm_smmu_ctx_desc_cfg {
+ __le64 *cdtab;
+ dma_addr_t cdtab_dma;
+ struct arm_smmu_l1_ctx_desc *l1_desc;
+ unsigned int num_l1_ents;
+};
+
+struct arm_smmu_s1_cfg {
+ struct arm_smmu_ctx_desc_cfg cdcfg;
+ struct arm_smmu_ctx_desc cd;
+ u8 s1fmt;
+ u8 s1cdmax;
+};
+
+struct arm_smmu_s2_cfg {
+ u16 vmid;
+ u64 vttbr;
+ u64 vtcr;
+};
+
+struct arm_smmu_strtab_cfg {
+ __le64 *strtab;
+ dma_addr_t strtab_dma;
+ struct arm_smmu_strtab_l1_desc *l1_desc;
+ unsigned int num_l1_ents;
+
+ u64 strtab_base;
+ u32 strtab_base_cfg;
+};
+
+/* An SMMUv3 instance */
+struct arm_smmu_device {
+ struct device *dev;
+ void __iomem *base;
+ void __iomem *page1;
+
+#define ARM_SMMU_FEAT_2_LVL_STRTAB (1 << 0)
+#define ARM_SMMU_FEAT_2_LVL_CDTAB (1 << 1)
+#define ARM_SMMU_FEAT_TT_LE (1 << 2)
+#define ARM_SMMU_FEAT_TT_BE (1 << 3)
+#define ARM_SMMU_FEAT_PRI (1 << 4)
+#define ARM_SMMU_FEAT_ATS (1 << 5)
+#define ARM_SMMU_FEAT_SEV (1 << 6)
+#define ARM_SMMU_FEAT_MSI (1 << 7)
+#define ARM_SMMU_FEAT_COHERENCY (1 << 8)
+#define ARM_SMMU_FEAT_TRANS_S1 (1 << 9)
+#define ARM_SMMU_FEAT_TRANS_S2 (1 << 10)
+#define ARM_SMMU_FEAT_STALLS (1 << 11)
+#define ARM_SMMU_FEAT_HYP (1 << 12)
+#define ARM_SMMU_FEAT_STALL_FORCE (1 << 13)
+#define ARM_SMMU_FEAT_VAX (1 << 14)
+#define ARM_SMMU_FEAT_RANGE_INV (1 << 15)
+#define ARM_SMMU_FEAT_BTM (1 << 16)
+#define ARM_SMMU_FEAT_SVA (1 << 17)
+ u32 features;
+
+#define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0)
+#define ARM_SMMU_OPT_PAGE0_REGS_ONLY (1 << 1)
+#define ARM_SMMU_OPT_MSIPOLL (1 << 2)
+ u32 options;
+
+ struct arm_smmu_cmdq cmdq;
+ struct arm_smmu_evtq evtq;
+ struct arm_smmu_priq priq;
+
+ int gerr_irq;
+ int combined_irq;
+
+ unsigned long ias; /* IPA */
+ unsigned long oas; /* PA */
+ unsigned long pgsize_bitmap;
+
+#define ARM_SMMU_MAX_ASIDS (1 << 16)
+ unsigned int asid_bits;
+
+#define ARM_SMMU_MAX_VMIDS (1 << 16)
+ unsigned int vmid_bits;
+ DECLARE_BITMAP(vmid_map, ARM_SMMU_MAX_VMIDS);
+
+ unsigned int ssid_bits;
+ unsigned int sid_bits;
+
+ struct arm_smmu_strtab_cfg strtab_cfg;
+
+ /* IOMMU core code handle */
+ struct iommu_device iommu;
+};
+
+/* SMMU private data for each master */
+struct arm_smmu_master {
+ struct arm_smmu_device *smmu;
+ struct device *dev;
+ struct arm_smmu_domain *domain;
+ struct list_head domain_head;
+ u32 *sids;
+ unsigned int num_sids;
+ bool ats_enabled;
+ bool sva_enabled;
+ struct list_head bonds;
+ unsigned int ssid_bits;
+};
+
+/* SMMU private data for an IOMMU domain */
+enum arm_smmu_domain_stage {
+ ARM_SMMU_DOMAIN_S1 = 0,
+ ARM_SMMU_DOMAIN_S2,
+ ARM_SMMU_DOMAIN_NESTED,
+ ARM_SMMU_DOMAIN_BYPASS,
+};
+
+struct arm_smmu_domain {
+ struct arm_smmu_device *smmu;
+ struct mutex init_mutex; /* Protects smmu pointer */
+
+ struct io_pgtable_ops *pgtbl_ops;
+ bool non_strict;
+ atomic_t nr_ats_masters;
+
+ enum arm_smmu_domain_stage stage;
+ union {
+ struct arm_smmu_s1_cfg s1_cfg;
+ struct arm_smmu_s2_cfg s2_cfg;
+ };
+
+ struct iommu_domain domain;
+
+ struct list_head devices;
+ spinlock_t devices_lock;
+};
+
+extern struct xarray arm_smmu_asid_xa;
+extern struct mutex arm_smmu_asid_lock;
+
+int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, int ssid,
+ struct arm_smmu_ctx_desc *cd);
+void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid);
+bool arm_smmu_free_asid(struct arm_smmu_ctx_desc *cd);
+
+#ifdef CONFIG_ARM_SMMU_V3_SVA
+bool arm_smmu_sva_supported(struct arm_smmu_device *smmu);
+bool arm_smmu_master_sva_supported(struct arm_smmu_master *master);
+bool arm_smmu_master_sva_enabled(struct arm_smmu_master *master);
+int arm_smmu_master_enable_sva(struct arm_smmu_master *master);
+int arm_smmu_master_disable_sva(struct arm_smmu_master *master);
+#else /* CONFIG_ARM_SMMU_V3_SVA */
+static inline bool arm_smmu_sva_supported(struct arm_smmu_device *smmu)
+{
+ return false;
+}
+
+static inline bool arm_smmu_master_sva_supported(struct arm_smmu_master *master)
+{
+ return false;
+}
+
+static inline bool arm_smmu_master_sva_enabled(struct arm_smmu_master *master)
+{
+ return false;
+}
+
+static inline int arm_smmu_master_enable_sva(struct arm_smmu_master *master)
+{
+ return -ENODEV;
+}
+
+static inline int arm_smmu_master_disable_sva(struct arm_smmu_master *master)
+{
+ return -ENODEV;
+}
+#endif /* CONFIG_ARM_SMMU_V3_SVA */
+#endif /* _ARM_SMMU_V3_H */
diff --git a/drivers/iommu/arm/arm-smmu/Makefile b/drivers/iommu/arm/arm-smmu/Makefile
new file mode 100644
index 000000000..e240a7bcf
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o
+obj-$(CONFIG_ARM_SMMU) += arm_smmu.o
+arm_smmu-objs += arm-smmu.o arm-smmu-impl.o arm-smmu-nvidia.o arm-smmu-qcom.o
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
new file mode 100644
index 000000000..88f17cc33
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Miscellaneous Arm SMMU implementation and integration quirks
+// Copyright (C) 2019 Arm Limited
+
+#define pr_fmt(fmt) "arm-smmu: " fmt
+
+#include <linux/bitfield.h>
+#include <linux/of.h>
+
+#include "arm-smmu.h"
+
+
+static int arm_smmu_gr0_ns(int offset)
+{
+ switch(offset) {
+ case ARM_SMMU_GR0_sCR0:
+ case ARM_SMMU_GR0_sACR:
+ case ARM_SMMU_GR0_sGFSR:
+ case ARM_SMMU_GR0_sGFSYNR0:
+ case ARM_SMMU_GR0_sGFSYNR1:
+ case ARM_SMMU_GR0_sGFSYNR2:
+ return offset + 0x400;
+ default:
+ return offset;
+ }
+}
+
+static u32 arm_smmu_read_ns(struct arm_smmu_device *smmu, int page,
+ int offset)
+{
+ if (page == ARM_SMMU_GR0)
+ offset = arm_smmu_gr0_ns(offset);
+ return readl_relaxed(arm_smmu_page(smmu, page) + offset);
+}
+
+static void arm_smmu_write_ns(struct arm_smmu_device *smmu, int page,
+ int offset, u32 val)
+{
+ if (page == ARM_SMMU_GR0)
+ offset = arm_smmu_gr0_ns(offset);
+ writel_relaxed(val, arm_smmu_page(smmu, page) + offset);
+}
+
+/* Since we don't care for sGFAR, we can do without 64-bit accessors */
+static const struct arm_smmu_impl calxeda_impl = {
+ .read_reg = arm_smmu_read_ns,
+ .write_reg = arm_smmu_write_ns,
+};
+
+
+struct cavium_smmu {
+ struct arm_smmu_device smmu;
+ u32 id_base;
+};
+
+static int cavium_cfg_probe(struct arm_smmu_device *smmu)
+{
+ static atomic_t context_count = ATOMIC_INIT(0);
+ struct cavium_smmu *cs = container_of(smmu, struct cavium_smmu, smmu);
+ /*
+ * Cavium CN88xx erratum #27704.
+ * Ensure ASID and VMID allocation is unique across all SMMUs in
+ * the system.
+ */
+ cs->id_base = atomic_fetch_add(smmu->num_context_banks, &context_count);
+ dev_notice(smmu->dev, "\tenabling workaround for Cavium erratum 27704\n");
+
+ return 0;
+}
+
+static int cavium_init_context(struct arm_smmu_domain *smmu_domain,
+ struct io_pgtable_cfg *pgtbl_cfg, struct device *dev)
+{
+ struct cavium_smmu *cs = container_of(smmu_domain->smmu,
+ struct cavium_smmu, smmu);
+
+ if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2)
+ smmu_domain->cfg.vmid += cs->id_base;
+ else
+ smmu_domain->cfg.asid += cs->id_base;
+
+ return 0;
+}
+
+static const struct arm_smmu_impl cavium_impl = {
+ .cfg_probe = cavium_cfg_probe,
+ .init_context = cavium_init_context,
+};
+
+static struct arm_smmu_device *cavium_smmu_impl_init(struct arm_smmu_device *smmu)
+{
+ struct cavium_smmu *cs;
+
+ cs = devm_kzalloc(smmu->dev, sizeof(*cs), GFP_KERNEL);
+ if (!cs)
+ return ERR_PTR(-ENOMEM);
+
+ cs->smmu = *smmu;
+ cs->smmu.impl = &cavium_impl;
+
+ devm_kfree(smmu->dev, smmu);
+
+ return &cs->smmu;
+}
+
+
+#define ARM_MMU500_ACTLR_CPRE (1 << 1)
+
+#define ARM_MMU500_ACR_CACHE_LOCK (1 << 26)
+#define ARM_MMU500_ACR_S2CRB_TLBEN (1 << 10)
+#define ARM_MMU500_ACR_SMTNMB_TLBEN (1 << 8)
+
+int arm_mmu500_reset(struct arm_smmu_device *smmu)
+{
+ u32 reg, major;
+ int i;
+ /*
+ * On MMU-500 r2p0 onwards we need to clear ACR.CACHE_LOCK before
+ * writes to the context bank ACTLRs will stick. And we just hope that
+ * Secure has also cleared SACR.CACHE_LOCK for this to take effect...
+ */
+ reg = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_ID7);
+ major = FIELD_GET(ARM_SMMU_ID7_MAJOR, reg);
+ reg = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sACR);
+ if (major >= 2)
+ reg &= ~ARM_MMU500_ACR_CACHE_LOCK;
+ /*
+ * Allow unmatched Stream IDs to allocate bypass
+ * TLB entries for reduced latency.
+ */
+ reg |= ARM_MMU500_ACR_SMTNMB_TLBEN | ARM_MMU500_ACR_S2CRB_TLBEN;
+ arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_sACR, reg);
+
+ /*
+ * Disable MMU-500's not-particularly-beneficial next-page
+ * prefetcher for the sake of errata #841119 and #826419.
+ */
+ for (i = 0; i < smmu->num_context_banks; ++i) {
+ reg = arm_smmu_cb_read(smmu, i, ARM_SMMU_CB_ACTLR);
+ reg &= ~ARM_MMU500_ACTLR_CPRE;
+ arm_smmu_cb_write(smmu, i, ARM_SMMU_CB_ACTLR, reg);
+ }
+
+ return 0;
+}
+
+static const struct arm_smmu_impl arm_mmu500_impl = {
+ .reset = arm_mmu500_reset,
+};
+
+static u64 mrvl_mmu500_readq(struct arm_smmu_device *smmu, int page, int off)
+{
+ /*
+ * Marvell Armada-AP806 erratum #582743.
+ * Split all the readq to double readl
+ */
+ return hi_lo_readq_relaxed(arm_smmu_page(smmu, page) + off);
+}
+
+static void mrvl_mmu500_writeq(struct arm_smmu_device *smmu, int page, int off,
+ u64 val)
+{
+ /*
+ * Marvell Armada-AP806 erratum #582743.
+ * Split all the writeq to double writel
+ */
+ hi_lo_writeq_relaxed(val, arm_smmu_page(smmu, page) + off);
+}
+
+static int mrvl_mmu500_cfg_probe(struct arm_smmu_device *smmu)
+{
+
+ /*
+ * Armada-AP806 erratum #582743.
+ * Hide the SMMU_IDR2.PTFSv8 fields to sidestep the AArch64
+ * formats altogether and allow using 32 bits access on the
+ * interconnect.
+ */
+ smmu->features &= ~(ARM_SMMU_FEAT_FMT_AARCH64_4K |
+ ARM_SMMU_FEAT_FMT_AARCH64_16K |
+ ARM_SMMU_FEAT_FMT_AARCH64_64K);
+
+ return 0;
+}
+
+static const struct arm_smmu_impl mrvl_mmu500_impl = {
+ .read_reg64 = mrvl_mmu500_readq,
+ .write_reg64 = mrvl_mmu500_writeq,
+ .cfg_probe = mrvl_mmu500_cfg_probe,
+ .reset = arm_mmu500_reset,
+};
+
+
+struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu)
+{
+ const struct device_node *np = smmu->dev->of_node;
+
+ /*
+ * Set the impl for model-specific implementation quirks first,
+ * such that platform integration quirks can pick it up and
+ * inherit from it if necessary.
+ */
+ switch (smmu->model) {
+ case ARM_MMU500:
+ smmu->impl = &arm_mmu500_impl;
+ break;
+ case CAVIUM_SMMUV2:
+ return cavium_smmu_impl_init(smmu);
+ default:
+ break;
+ }
+
+ /* This is implicitly MMU-400 */
+ if (of_property_read_bool(np, "calxeda,smmu-secure-config-access"))
+ smmu->impl = &calxeda_impl;
+
+ if (of_device_is_compatible(np, "nvidia,tegra194-smmu"))
+ return nvidia_smmu_impl_init(smmu);
+
+ if (of_device_is_compatible(np, "qcom,sdm845-smmu-500") ||
+ of_device_is_compatible(np, "qcom,sc7180-smmu-500") ||
+ of_device_is_compatible(np, "qcom,sm8150-smmu-500") ||
+ of_device_is_compatible(np, "qcom,sm8250-smmu-500"))
+ return qcom_smmu_impl_init(smmu);
+
+ if (of_device_is_compatible(np, "marvell,ap806-smmu-500"))
+ smmu->impl = &mrvl_mmu500_impl;
+
+ return smmu;
+}
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c b/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c
new file mode 100644
index 000000000..31368057e
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2019-2020 NVIDIA CORPORATION. All rights reserved.
+
+#include <linux/bitfield.h>
+#include <linux/delay.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+#include "arm-smmu.h"
+
+/*
+ * Tegra194 has three ARM MMU-500 Instances.
+ * Two of them are used together and must be programmed identically for
+ * interleaved IOVA accesses across them and translates accesses from
+ * non-isochronous HW devices.
+ * Third one is used for translating accesses from isochronous HW devices.
+ * This implementation supports programming of the two instances that must
+ * be programmed identically.
+ * The third instance usage is through standard arm-smmu driver itself and
+ * is out of scope of this implementation.
+ */
+#define NUM_SMMU_INSTANCES 2
+
+struct nvidia_smmu {
+ struct arm_smmu_device smmu;
+ void __iomem *bases[NUM_SMMU_INSTANCES];
+};
+
+static inline void __iomem *nvidia_smmu_page(struct arm_smmu_device *smmu,
+ unsigned int inst, int page)
+{
+ struct nvidia_smmu *nvidia_smmu;
+
+ nvidia_smmu = container_of(smmu, struct nvidia_smmu, smmu);
+ return nvidia_smmu->bases[inst] + (page << smmu->pgshift);
+}
+
+static u32 nvidia_smmu_read_reg(struct arm_smmu_device *smmu,
+ int page, int offset)
+{
+ void __iomem *reg = nvidia_smmu_page(smmu, 0, page) + offset;
+
+ return readl_relaxed(reg);
+}
+
+static void nvidia_smmu_write_reg(struct arm_smmu_device *smmu,
+ int page, int offset, u32 val)
+{
+ unsigned int i;
+
+ for (i = 0; i < NUM_SMMU_INSTANCES; i++) {
+ void __iomem *reg = nvidia_smmu_page(smmu, i, page) + offset;
+
+ writel_relaxed(val, reg);
+ }
+}
+
+static u64 nvidia_smmu_read_reg64(struct arm_smmu_device *smmu,
+ int page, int offset)
+{
+ void __iomem *reg = nvidia_smmu_page(smmu, 0, page) + offset;
+
+ return readq_relaxed(reg);
+}
+
+static void nvidia_smmu_write_reg64(struct arm_smmu_device *smmu,
+ int page, int offset, u64 val)
+{
+ unsigned int i;
+
+ for (i = 0; i < NUM_SMMU_INSTANCES; i++) {
+ void __iomem *reg = nvidia_smmu_page(smmu, i, page) + offset;
+
+ writeq_relaxed(val, reg);
+ }
+}
+
+static void nvidia_smmu_tlb_sync(struct arm_smmu_device *smmu, int page,
+ int sync, int status)
+{
+ unsigned int delay;
+
+ arm_smmu_writel(smmu, page, sync, 0);
+
+ for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
+ unsigned int spin_cnt;
+
+ for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
+ u32 val = 0;
+ unsigned int i;
+
+ for (i = 0; i < NUM_SMMU_INSTANCES; i++) {
+ void __iomem *reg;
+
+ reg = nvidia_smmu_page(smmu, i, page) + status;
+ val |= readl_relaxed(reg);
+ }
+
+ if (!(val & ARM_SMMU_sTLBGSTATUS_GSACTIVE))
+ return;
+
+ cpu_relax();
+ }
+
+ udelay(delay);
+ }
+
+ dev_err_ratelimited(smmu->dev,
+ "TLB sync timed out -- SMMU may be deadlocked\n");
+}
+
+static int nvidia_smmu_reset(struct arm_smmu_device *smmu)
+{
+ unsigned int i;
+
+ for (i = 0; i < NUM_SMMU_INSTANCES; i++) {
+ u32 val;
+ void __iomem *reg = nvidia_smmu_page(smmu, i, ARM_SMMU_GR0) +
+ ARM_SMMU_GR0_sGFSR;
+
+ /* clear global FSR */
+ val = readl_relaxed(reg);
+ writel_relaxed(val, reg);
+ }
+
+ return 0;
+}
+
+static irqreturn_t nvidia_smmu_global_fault_inst(int irq,
+ struct arm_smmu_device *smmu,
+ int inst)
+{
+ u32 gfsr, gfsynr0, gfsynr1, gfsynr2;
+ void __iomem *gr0_base = nvidia_smmu_page(smmu, inst, 0);
+
+ gfsr = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSR);
+ if (!gfsr)
+ return IRQ_NONE;
+
+ gfsynr0 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR0);
+ gfsynr1 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR1);
+ gfsynr2 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR2);
+
+ dev_err_ratelimited(smmu->dev,
+ "Unexpected global fault, this could be serious\n");
+ dev_err_ratelimited(smmu->dev,
+ "\tGFSR 0x%08x, GFSYNR0 0x%08x, GFSYNR1 0x%08x, GFSYNR2 0x%08x\n",
+ gfsr, gfsynr0, gfsynr1, gfsynr2);
+
+ writel_relaxed(gfsr, gr0_base + ARM_SMMU_GR0_sGFSR);
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t nvidia_smmu_global_fault(int irq, void *dev)
+{
+ unsigned int inst;
+ irqreturn_t ret = IRQ_NONE;
+ struct arm_smmu_device *smmu = dev;
+
+ for (inst = 0; inst < NUM_SMMU_INSTANCES; inst++) {
+ irqreturn_t irq_ret;
+
+ irq_ret = nvidia_smmu_global_fault_inst(irq, smmu, inst);
+ if (irq_ret == IRQ_HANDLED)
+ ret = IRQ_HANDLED;
+ }
+
+ return ret;
+}
+
+static irqreturn_t nvidia_smmu_context_fault_bank(int irq,
+ struct arm_smmu_device *smmu,
+ int idx, int inst)
+{
+ u32 fsr, fsynr, cbfrsynra;
+ unsigned long iova;
+ void __iomem *gr1_base = nvidia_smmu_page(smmu, inst, 1);
+ void __iomem *cb_base = nvidia_smmu_page(smmu, inst, smmu->numpage + idx);
+
+ fsr = readl_relaxed(cb_base + ARM_SMMU_CB_FSR);
+ if (!(fsr & ARM_SMMU_FSR_FAULT))
+ return IRQ_NONE;
+
+ fsynr = readl_relaxed(cb_base + ARM_SMMU_CB_FSYNR0);
+ iova = readq_relaxed(cb_base + ARM_SMMU_CB_FAR);
+ cbfrsynra = readl_relaxed(gr1_base + ARM_SMMU_GR1_CBFRSYNRA(idx));
+
+ dev_err_ratelimited(smmu->dev,
+ "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n",
+ fsr, iova, fsynr, cbfrsynra, idx);
+
+ writel_relaxed(fsr, cb_base + ARM_SMMU_CB_FSR);
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t nvidia_smmu_context_fault(int irq, void *dev)
+{
+ int idx;
+ unsigned int inst;
+ irqreturn_t ret = IRQ_NONE;
+ struct arm_smmu_device *smmu;
+ struct iommu_domain *domain = dev;
+ struct arm_smmu_domain *smmu_domain;
+
+ smmu_domain = container_of(domain, struct arm_smmu_domain, domain);
+ smmu = smmu_domain->smmu;
+
+ for (inst = 0; inst < NUM_SMMU_INSTANCES; inst++) {
+ irqreturn_t irq_ret;
+
+ /*
+ * Interrupt line is shared between all contexts.
+ * Check for faults across all contexts.
+ */
+ for (idx = 0; idx < smmu->num_context_banks; idx++) {
+ irq_ret = nvidia_smmu_context_fault_bank(irq, smmu,
+ idx, inst);
+ if (irq_ret == IRQ_HANDLED)
+ ret = IRQ_HANDLED;
+ }
+ }
+
+ return ret;
+}
+
+static const struct arm_smmu_impl nvidia_smmu_impl = {
+ .read_reg = nvidia_smmu_read_reg,
+ .write_reg = nvidia_smmu_write_reg,
+ .read_reg64 = nvidia_smmu_read_reg64,
+ .write_reg64 = nvidia_smmu_write_reg64,
+ .reset = nvidia_smmu_reset,
+ .tlb_sync = nvidia_smmu_tlb_sync,
+ .global_fault = nvidia_smmu_global_fault,
+ .context_fault = nvidia_smmu_context_fault,
+};
+
+struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu)
+{
+ struct resource *res;
+ struct device *dev = smmu->dev;
+ struct nvidia_smmu *nvidia_smmu;
+ struct platform_device *pdev = to_platform_device(dev);
+
+ nvidia_smmu = devm_kzalloc(dev, sizeof(*nvidia_smmu), GFP_KERNEL);
+ if (!nvidia_smmu)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Copy the data from struct arm_smmu_device *smmu allocated in
+ * arm-smmu.c. The smmu from struct nvidia_smmu replaces the smmu
+ * pointer used in arm-smmu.c once this function returns.
+ * This is necessary to derive nvidia_smmu from smmu pointer passed
+ * through arm_smmu_impl function calls subsequently.
+ */
+ nvidia_smmu->smmu = *smmu;
+ /* Instance 0 is ioremapped by arm-smmu.c. */
+ nvidia_smmu->bases[0] = smmu->base;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+ if (!res)
+ return ERR_PTR(-ENODEV);
+
+ nvidia_smmu->bases[1] = devm_ioremap_resource(dev, res);
+ if (IS_ERR(nvidia_smmu->bases[1]))
+ return ERR_CAST(nvidia_smmu->bases[1]);
+
+ nvidia_smmu->smmu.impl = &nvidia_smmu_impl;
+
+ /*
+ * Free the struct arm_smmu_device *smmu allocated in arm-smmu.c.
+ * Once this function returns, arm-smmu.c would use arm_smmu_device
+ * allocated as part of struct nvidia_smmu.
+ */
+ devm_kfree(dev, smmu);
+
+ return &nvidia_smmu->smmu;
+}
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
new file mode 100644
index 000000000..a5164d5cb
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2019, The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/of_device.h>
+#include <linux/qcom_scm.h>
+
+#include "arm-smmu.h"
+
+struct qcom_smmu {
+ struct arm_smmu_device smmu;
+ bool bypass_quirk;
+ u8 bypass_cbndx;
+};
+
+static struct qcom_smmu *to_qcom_smmu(struct arm_smmu_device *smmu)
+{
+ return container_of(smmu, struct qcom_smmu, smmu);
+}
+
+static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = {
+ { .compatible = "qcom,adreno" },
+ { .compatible = "qcom,adreno-gmu" },
+ { .compatible = "qcom,mdp4" },
+ { .compatible = "qcom,mdss" },
+ { .compatible = "qcom,sc7180-mdss" },
+ { .compatible = "qcom,sc7180-mss-pil" },
+ { .compatible = "qcom,sdm845-mdss" },
+ { .compatible = "qcom,sdm845-mss-pil" },
+ { }
+};
+
+static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu)
+{
+ struct qcom_smmu *qsmmu = to_qcom_smmu(smmu);
+ unsigned int last_s2cr;
+ u32 reg;
+ u32 smr;
+ int i;
+
+ /*
+ * Some platforms support more than the Arm SMMU architected maximum of
+ * 128 stream matching groups. For unknown reasons, the additional
+ * groups don't exhibit the same behavior as the architected registers,
+ * so limit the groups to 128 until the behavior is fixed for the other
+ * groups.
+ */
+ if (smmu->num_mapping_groups > 128) {
+ dev_notice(smmu->dev, "\tLimiting the stream matching groups to 128\n");
+ smmu->num_mapping_groups = 128;
+ }
+
+ last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1);
+
+ /*
+ * With some firmware versions writes to S2CR of type FAULT are
+ * ignored, and writing BYPASS will end up written as FAULT in the
+ * register. Perform a write to S2CR to detect if this is the case and
+ * if so reserve a context bank to emulate bypass streams.
+ */
+ reg = FIELD_PREP(ARM_SMMU_S2CR_TYPE, S2CR_TYPE_BYPASS) |
+ FIELD_PREP(ARM_SMMU_S2CR_CBNDX, 0xff) |
+ FIELD_PREP(ARM_SMMU_S2CR_PRIVCFG, S2CR_PRIVCFG_DEFAULT);
+ arm_smmu_gr0_write(smmu, last_s2cr, reg);
+ reg = arm_smmu_gr0_read(smmu, last_s2cr);
+ if (FIELD_GET(ARM_SMMU_S2CR_TYPE, reg) != S2CR_TYPE_BYPASS) {
+ qsmmu->bypass_quirk = true;
+ qsmmu->bypass_cbndx = smmu->num_context_banks - 1;
+
+ set_bit(qsmmu->bypass_cbndx, smmu->context_map);
+
+ arm_smmu_cb_write(smmu, qsmmu->bypass_cbndx, ARM_SMMU_CB_SCTLR, 0);
+
+ reg = FIELD_PREP(ARM_SMMU_CBAR_TYPE, CBAR_TYPE_S1_TRANS_S2_BYPASS);
+ arm_smmu_gr1_write(smmu, ARM_SMMU_GR1_CBAR(qsmmu->bypass_cbndx), reg);
+ }
+
+ for (i = 0; i < smmu->num_mapping_groups; i++) {
+ smr = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_SMR(i));
+
+ if (FIELD_GET(ARM_SMMU_SMR_VALID, smr)) {
+ /* Ignore valid bit for SMR mask extraction. */
+ smr &= ~ARM_SMMU_SMR_VALID;
+ smmu->smrs[i].id = FIELD_GET(ARM_SMMU_SMR_ID, smr);
+ smmu->smrs[i].mask = FIELD_GET(ARM_SMMU_SMR_MASK, smr);
+ smmu->smrs[i].valid = true;
+
+ smmu->s2crs[i].type = S2CR_TYPE_BYPASS;
+ smmu->s2crs[i].privcfg = S2CR_PRIVCFG_DEFAULT;
+ smmu->s2crs[i].cbndx = 0xff;
+ }
+ }
+
+ return 0;
+}
+
+static void qcom_smmu_write_s2cr(struct arm_smmu_device *smmu, int idx)
+{
+ struct arm_smmu_s2cr *s2cr = smmu->s2crs + idx;
+ struct qcom_smmu *qsmmu = to_qcom_smmu(smmu);
+ u32 cbndx = s2cr->cbndx;
+ u32 type = s2cr->type;
+ u32 reg;
+
+ if (qsmmu->bypass_quirk) {
+ if (type == S2CR_TYPE_BYPASS) {
+ /*
+ * Firmware with quirky S2CR handling will substitute
+ * BYPASS writes with FAULT, so point the stream to the
+ * reserved context bank and ask for translation on the
+ * stream
+ */
+ type = S2CR_TYPE_TRANS;
+ cbndx = qsmmu->bypass_cbndx;
+ } else if (type == S2CR_TYPE_FAULT) {
+ /*
+ * Firmware with quirky S2CR handling will ignore FAULT
+ * writes, so trick it to write FAULT by asking for a
+ * BYPASS.
+ */
+ type = S2CR_TYPE_BYPASS;
+ cbndx = 0xff;
+ }
+ }
+
+ reg = FIELD_PREP(ARM_SMMU_S2CR_TYPE, type) |
+ FIELD_PREP(ARM_SMMU_S2CR_CBNDX, cbndx) |
+ FIELD_PREP(ARM_SMMU_S2CR_PRIVCFG, s2cr->privcfg);
+ arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_S2CR(idx), reg);
+}
+
+static int qcom_smmu_def_domain_type(struct device *dev)
+{
+ const struct of_device_id *match =
+ of_match_device(qcom_smmu_client_of_match, dev);
+
+ return match ? IOMMU_DOMAIN_IDENTITY : 0;
+}
+
+static int qcom_sdm845_smmu500_reset(struct arm_smmu_device *smmu)
+{
+ int ret;
+
+ /*
+ * To address performance degradation in non-real time clients,
+ * such as USB and UFS, turn off wait-for-safe on sdm845 based boards,
+ * such as MTP and db845, whose firmwares implement secure monitor
+ * call handlers to turn on/off the wait-for-safe logic.
+ */
+ ret = qcom_scm_qsmmu500_wait_safe_toggle(0);
+ if (ret)
+ dev_warn(smmu->dev, "Failed to turn off SAFE logic\n");
+
+ return ret;
+}
+
+static int qcom_smmu500_reset(struct arm_smmu_device *smmu)
+{
+ const struct device_node *np = smmu->dev->of_node;
+
+ arm_mmu500_reset(smmu);
+
+ if (of_device_is_compatible(np, "qcom,sdm845-smmu-500"))
+ return qcom_sdm845_smmu500_reset(smmu);
+
+ return 0;
+}
+
+static const struct arm_smmu_impl qcom_smmu_impl = {
+ .cfg_probe = qcom_smmu_cfg_probe,
+ .def_domain_type = qcom_smmu_def_domain_type,
+ .reset = qcom_smmu500_reset,
+ .write_s2cr = qcom_smmu_write_s2cr,
+};
+
+struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu)
+{
+ struct qcom_smmu *qsmmu;
+
+ /* Check to make sure qcom_scm has finished probing */
+ if (!qcom_scm_is_available())
+ return ERR_PTR(-EPROBE_DEFER);
+
+ qsmmu = devm_kzalloc(smmu->dev, sizeof(*qsmmu), GFP_KERNEL);
+ if (!qsmmu)
+ return ERR_PTR(-ENOMEM);
+
+ qsmmu->smmu = *smmu;
+
+ qsmmu->smmu.impl = &qcom_smmu_impl;
+ devm_kfree(smmu->dev, smmu);
+
+ return &qsmmu->smmu;
+}
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
new file mode 100644
index 000000000..6b41fe229
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -0,0 +1,2349 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IOMMU API for ARM architected SMMU implementations.
+ *
+ * Copyright (C) 2013 ARM Limited
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ *
+ * This driver currently supports:
+ * - SMMUv1 and v2 implementations
+ * - Stream-matching and stream-indexing
+ * - v7/v8 long-descriptor format
+ * - Non-secure access to the SMMU
+ * - Context fault reporting
+ * - Extended Stream ID (16 bit)
+ */
+
+#define pr_fmt(fmt) "arm-smmu: " fmt
+
+#include <linux/acpi.h>
+#include <linux/acpi_iort.h>
+#include <linux/bitfield.h>
+#include <linux/delay.h>
+#include <linux/dma-iommu.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/of_iommu.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/ratelimit.h>
+#include <linux/slab.h>
+
+#include <linux/amba/bus.h>
+#include <linux/fsl/mc.h>
+
+#include "arm-smmu.h"
+
+/*
+ * Apparently, some Qualcomm arm64 platforms which appear to expose their SMMU
+ * global register space are still, in fact, using a hypervisor to mediate it
+ * by trapping and emulating register accesses. Sadly, some deployed versions
+ * of said trapping code have bugs wherein they go horribly wrong for stores
+ * using r31 (i.e. XZR/WZR) as the source register.
+ */
+#define QCOM_DUMMY_VAL -1
+
+#define MSI_IOVA_BASE 0x8000000
+#define MSI_IOVA_LENGTH 0x100000
+
+static int force_stage;
+module_param(force_stage, int, S_IRUGO);
+MODULE_PARM_DESC(force_stage,
+ "Force SMMU mappings to be installed at a particular stage of translation. A value of '1' or '2' forces the corresponding stage. All other values are ignored (i.e. no stage is forced). Note that selecting a specific stage will disable support for nested translation.");
+static bool disable_bypass =
+ IS_ENABLED(CONFIG_ARM_SMMU_DISABLE_BYPASS_BY_DEFAULT);
+module_param(disable_bypass, bool, S_IRUGO);
+MODULE_PARM_DESC(disable_bypass,
+ "Disable bypass streams such that incoming transactions from devices that are not attached to an iommu domain will report an abort back to the device and will not be allowed to pass through the SMMU.");
+
+#define s2cr_init_val (struct arm_smmu_s2cr){ \
+ .type = disable_bypass ? S2CR_TYPE_FAULT : S2CR_TYPE_BYPASS, \
+}
+
+static bool using_legacy_binding, using_generic_binding;
+
+static inline int arm_smmu_rpm_get(struct arm_smmu_device *smmu)
+{
+ if (pm_runtime_enabled(smmu->dev))
+ return pm_runtime_resume_and_get(smmu->dev);
+
+ return 0;
+}
+
+static inline void arm_smmu_rpm_put(struct arm_smmu_device *smmu)
+{
+ if (pm_runtime_enabled(smmu->dev))
+ pm_runtime_put_autosuspend(smmu->dev);
+}
+
+static struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
+{
+ return container_of(dom, struct arm_smmu_domain, domain);
+}
+
+static struct platform_driver arm_smmu_driver;
+static struct iommu_ops arm_smmu_ops;
+
+#ifdef CONFIG_ARM_SMMU_LEGACY_DT_BINDINGS
+static int arm_smmu_bus_init(struct iommu_ops *ops);
+
+static struct device_node *dev_get_dev_node(struct device *dev)
+{
+ if (dev_is_pci(dev)) {
+ struct pci_bus *bus = to_pci_dev(dev)->bus;
+
+ while (!pci_is_root_bus(bus))
+ bus = bus->parent;
+ return of_node_get(bus->bridge->parent->of_node);
+ }
+
+ return of_node_get(dev->of_node);
+}
+
+static int __arm_smmu_get_pci_sid(struct pci_dev *pdev, u16 alias, void *data)
+{
+ *((__be32 *)data) = cpu_to_be32(alias);
+ return 0; /* Continue walking */
+}
+
+static int __find_legacy_master_phandle(struct device *dev, void *data)
+{
+ struct of_phandle_iterator *it = *(void **)data;
+ struct device_node *np = it->node;
+ int err;
+
+ of_for_each_phandle(it, err, dev->of_node, "mmu-masters",
+ "#stream-id-cells", -1)
+ if (it->node == np) {
+ *(void **)data = dev;
+ return 1;
+ }
+ it->node = np;
+ return err == -ENOENT ? 0 : err;
+}
+
+static int arm_smmu_register_legacy_master(struct device *dev,
+ struct arm_smmu_device **smmu)
+{
+ struct device *smmu_dev;
+ struct device_node *np;
+ struct of_phandle_iterator it;
+ void *data = &it;
+ u32 *sids;
+ __be32 pci_sid;
+ int err;
+
+ np = dev_get_dev_node(dev);
+ if (!np || !of_find_property(np, "#stream-id-cells", NULL)) {
+ of_node_put(np);
+ return -ENODEV;
+ }
+
+ it.node = np;
+ err = driver_for_each_device(&arm_smmu_driver.driver, NULL, &data,
+ __find_legacy_master_phandle);
+ smmu_dev = data;
+ of_node_put(np);
+ if (err == 0)
+ return -ENODEV;
+ if (err < 0)
+ return err;
+
+ if (dev_is_pci(dev)) {
+ /* "mmu-masters" assumes Stream ID == Requester ID */
+ pci_for_each_dma_alias(to_pci_dev(dev), __arm_smmu_get_pci_sid,
+ &pci_sid);
+ it.cur = &pci_sid;
+ it.cur_count = 1;
+ }
+
+ err = iommu_fwspec_init(dev, &smmu_dev->of_node->fwnode,
+ &arm_smmu_ops);
+ if (err)
+ return err;
+
+ sids = kcalloc(it.cur_count, sizeof(*sids), GFP_KERNEL);
+ if (!sids)
+ return -ENOMEM;
+
+ *smmu = dev_get_drvdata(smmu_dev);
+ of_phandle_iterator_args(&it, sids, it.cur_count);
+ err = iommu_fwspec_add_ids(dev, sids, it.cur_count);
+ kfree(sids);
+ return err;
+}
+
+/*
+ * With the legacy DT binding in play, we have no guarantees about
+ * probe order, but then we're also not doing default domains, so we can
+ * delay setting bus ops until we're sure every possible SMMU is ready,
+ * and that way ensure that no probe_device() calls get missed.
+ */
+static int arm_smmu_legacy_bus_init(void)
+{
+ if (using_legacy_binding)
+ return arm_smmu_bus_init(&arm_smmu_ops);
+ return 0;
+}
+device_initcall_sync(arm_smmu_legacy_bus_init);
+#else
+static int arm_smmu_register_legacy_master(struct device *dev,
+ struct arm_smmu_device **smmu)
+{
+ return -ENODEV;
+}
+#endif /* CONFIG_ARM_SMMU_LEGACY_DT_BINDINGS */
+
+static void __arm_smmu_free_bitmap(unsigned long *map, int idx)
+{
+ clear_bit(idx, map);
+}
+
+/* Wait for any pending TLB invalidations to complete */
+static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu, int page,
+ int sync, int status)
+{
+ unsigned int spin_cnt, delay;
+ u32 reg;
+
+ if (smmu->impl && unlikely(smmu->impl->tlb_sync))
+ return smmu->impl->tlb_sync(smmu, page, sync, status);
+
+ arm_smmu_writel(smmu, page, sync, QCOM_DUMMY_VAL);
+ for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
+ for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
+ reg = arm_smmu_readl(smmu, page, status);
+ if (!(reg & ARM_SMMU_sTLBGSTATUS_GSACTIVE))
+ return;
+ cpu_relax();
+ }
+ udelay(delay);
+ }
+ dev_err_ratelimited(smmu->dev,
+ "TLB sync timed out -- SMMU may be deadlocked\n");
+}
+
+static void arm_smmu_tlb_sync_global(struct arm_smmu_device *smmu)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&smmu->global_sync_lock, flags);
+ __arm_smmu_tlb_sync(smmu, ARM_SMMU_GR0, ARM_SMMU_GR0_sTLBGSYNC,
+ ARM_SMMU_GR0_sTLBGSTATUS);
+ spin_unlock_irqrestore(&smmu->global_sync_lock, flags);
+}
+
+static void arm_smmu_tlb_sync_context(struct arm_smmu_domain *smmu_domain)
+{
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ unsigned long flags;
+
+ spin_lock_irqsave(&smmu_domain->cb_lock, flags);
+ __arm_smmu_tlb_sync(smmu, ARM_SMMU_CB(smmu, smmu_domain->cfg.cbndx),
+ ARM_SMMU_CB_TLBSYNC, ARM_SMMU_CB_TLBSTATUS);
+ spin_unlock_irqrestore(&smmu_domain->cb_lock, flags);
+}
+
+static void arm_smmu_tlb_inv_context_s1(void *cookie)
+{
+ struct arm_smmu_domain *smmu_domain = cookie;
+ /*
+ * The TLBI write may be relaxed, so ensure that PTEs cleared by the
+ * current CPU are visible beforehand.
+ */
+ wmb();
+ arm_smmu_cb_write(smmu_domain->smmu, smmu_domain->cfg.cbndx,
+ ARM_SMMU_CB_S1_TLBIASID, smmu_domain->cfg.asid);
+ arm_smmu_tlb_sync_context(smmu_domain);
+}
+
+static void arm_smmu_tlb_inv_context_s2(void *cookie)
+{
+ struct arm_smmu_domain *smmu_domain = cookie;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+
+ /* See above */
+ wmb();
+ arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_TLBIVMID, smmu_domain->cfg.vmid);
+ arm_smmu_tlb_sync_global(smmu);
+}
+
+static void arm_smmu_tlb_inv_range_s1(unsigned long iova, size_t size,
+ size_t granule, void *cookie, int reg)
+{
+ struct arm_smmu_domain *smmu_domain = cookie;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+ int idx = cfg->cbndx;
+
+ if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
+ wmb();
+
+ if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
+ iova = (iova >> 12) << 12;
+ iova |= cfg->asid;
+ do {
+ arm_smmu_cb_write(smmu, idx, reg, iova);
+ iova += granule;
+ } while (size -= granule);
+ } else {
+ iova >>= 12;
+ iova |= (u64)cfg->asid << 48;
+ do {
+ arm_smmu_cb_writeq(smmu, idx, reg, iova);
+ iova += granule >> 12;
+ } while (size -= granule);
+ }
+}
+
+static void arm_smmu_tlb_inv_range_s2(unsigned long iova, size_t size,
+ size_t granule, void *cookie, int reg)
+{
+ struct arm_smmu_domain *smmu_domain = cookie;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ int idx = smmu_domain->cfg.cbndx;
+
+ if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
+ wmb();
+
+ iova >>= 12;
+ do {
+ if (smmu_domain->cfg.fmt == ARM_SMMU_CTX_FMT_AARCH64)
+ arm_smmu_cb_writeq(smmu, idx, reg, iova);
+ else
+ arm_smmu_cb_write(smmu, idx, reg, iova);
+ iova += granule >> 12;
+ } while (size -= granule);
+}
+
+static void arm_smmu_tlb_inv_walk_s1(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ arm_smmu_tlb_inv_range_s1(iova, size, granule, cookie,
+ ARM_SMMU_CB_S1_TLBIVA);
+ arm_smmu_tlb_sync_context(cookie);
+}
+
+static void arm_smmu_tlb_inv_leaf_s1(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ arm_smmu_tlb_inv_range_s1(iova, size, granule, cookie,
+ ARM_SMMU_CB_S1_TLBIVAL);
+ arm_smmu_tlb_sync_context(cookie);
+}
+
+static void arm_smmu_tlb_add_page_s1(struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t granule,
+ void *cookie)
+{
+ arm_smmu_tlb_inv_range_s1(iova, granule, granule, cookie,
+ ARM_SMMU_CB_S1_TLBIVAL);
+}
+
+static void arm_smmu_tlb_inv_walk_s2(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ arm_smmu_tlb_inv_range_s2(iova, size, granule, cookie,
+ ARM_SMMU_CB_S2_TLBIIPAS2);
+ arm_smmu_tlb_sync_context(cookie);
+}
+
+static void arm_smmu_tlb_inv_leaf_s2(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ arm_smmu_tlb_inv_range_s2(iova, size, granule, cookie,
+ ARM_SMMU_CB_S2_TLBIIPAS2L);
+ arm_smmu_tlb_sync_context(cookie);
+}
+
+static void arm_smmu_tlb_add_page_s2(struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t granule,
+ void *cookie)
+{
+ arm_smmu_tlb_inv_range_s2(iova, granule, granule, cookie,
+ ARM_SMMU_CB_S2_TLBIIPAS2L);
+}
+
+static void arm_smmu_tlb_inv_any_s2_v1(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ arm_smmu_tlb_inv_context_s2(cookie);
+}
+/*
+ * On MMU-401 at least, the cost of firing off multiple TLBIVMIDs appears
+ * almost negligible, but the benefit of getting the first one in as far ahead
+ * of the sync as possible is significant, hence we don't just make this a
+ * no-op and call arm_smmu_tlb_inv_context_s2() from .iotlb_sync as you might
+ * think.
+ */
+static void arm_smmu_tlb_add_page_s2_v1(struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t granule,
+ void *cookie)
+{
+ struct arm_smmu_domain *smmu_domain = cookie;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+
+ if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
+ wmb();
+
+ arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_TLBIVMID, smmu_domain->cfg.vmid);
+}
+
+static const struct iommu_flush_ops arm_smmu_s1_tlb_ops = {
+ .tlb_flush_all = arm_smmu_tlb_inv_context_s1,
+ .tlb_flush_walk = arm_smmu_tlb_inv_walk_s1,
+ .tlb_flush_leaf = arm_smmu_tlb_inv_leaf_s1,
+ .tlb_add_page = arm_smmu_tlb_add_page_s1,
+};
+
+static const struct iommu_flush_ops arm_smmu_s2_tlb_ops_v2 = {
+ .tlb_flush_all = arm_smmu_tlb_inv_context_s2,
+ .tlb_flush_walk = arm_smmu_tlb_inv_walk_s2,
+ .tlb_flush_leaf = arm_smmu_tlb_inv_leaf_s2,
+ .tlb_add_page = arm_smmu_tlb_add_page_s2,
+};
+
+static const struct iommu_flush_ops arm_smmu_s2_tlb_ops_v1 = {
+ .tlb_flush_all = arm_smmu_tlb_inv_context_s2,
+ .tlb_flush_walk = arm_smmu_tlb_inv_any_s2_v1,
+ .tlb_flush_leaf = arm_smmu_tlb_inv_any_s2_v1,
+ .tlb_add_page = arm_smmu_tlb_add_page_s2_v1,
+};
+
+static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
+{
+ u32 fsr, fsynr, cbfrsynra;
+ unsigned long iova;
+ struct iommu_domain *domain = dev;
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ int idx = smmu_domain->cfg.cbndx;
+
+ fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR);
+ if (!(fsr & ARM_SMMU_FSR_FAULT))
+ return IRQ_NONE;
+
+ fsynr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSYNR0);
+ iova = arm_smmu_cb_readq(smmu, idx, ARM_SMMU_CB_FAR);
+ cbfrsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(idx));
+
+ dev_err_ratelimited(smmu->dev,
+ "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n",
+ fsr, iova, fsynr, cbfrsynra, idx);
+
+ arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t arm_smmu_global_fault(int irq, void *dev)
+{
+ u32 gfsr, gfsynr0, gfsynr1, gfsynr2;
+ struct arm_smmu_device *smmu = dev;
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ gfsr = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sGFSR);
+ gfsynr0 = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sGFSYNR0);
+ gfsynr1 = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sGFSYNR1);
+ gfsynr2 = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sGFSYNR2);
+
+ if (!gfsr)
+ return IRQ_NONE;
+
+ if (__ratelimit(&rs)) {
+ if (IS_ENABLED(CONFIG_ARM_SMMU_DISABLE_BYPASS_BY_DEFAULT) &&
+ (gfsr & ARM_SMMU_sGFSR_USF))
+ dev_err(smmu->dev,
+ "Blocked unknown Stream ID 0x%hx; boot with \"arm-smmu.disable_bypass=0\" to allow, but this may have security implications\n",
+ (u16)gfsynr1);
+ else
+ dev_err(smmu->dev,
+ "Unexpected global fault, this could be serious\n");
+ dev_err(smmu->dev,
+ "\tGFSR 0x%08x, GFSYNR0 0x%08x, GFSYNR1 0x%08x, GFSYNR2 0x%08x\n",
+ gfsr, gfsynr0, gfsynr1, gfsynr2);
+ }
+
+ arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_sGFSR, gfsr);
+ return IRQ_HANDLED;
+}
+
+static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain,
+ struct io_pgtable_cfg *pgtbl_cfg)
+{
+ struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+ struct arm_smmu_cb *cb = &smmu_domain->smmu->cbs[cfg->cbndx];
+ bool stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
+
+ cb->cfg = cfg;
+
+ /* TCR */
+ if (stage1) {
+ if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_S) {
+ cb->tcr[0] = pgtbl_cfg->arm_v7s_cfg.tcr;
+ } else {
+ cb->tcr[0] = arm_smmu_lpae_tcr(pgtbl_cfg);
+ cb->tcr[1] = arm_smmu_lpae_tcr2(pgtbl_cfg);
+ if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64)
+ cb->tcr[1] |= ARM_SMMU_TCR2_AS;
+ else
+ cb->tcr[0] |= ARM_SMMU_TCR_EAE;
+ }
+ } else {
+ cb->tcr[0] = arm_smmu_lpae_vtcr(pgtbl_cfg);
+ }
+
+ /* TTBRs */
+ if (stage1) {
+ if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_S) {
+ cb->ttbr[0] = pgtbl_cfg->arm_v7s_cfg.ttbr;
+ cb->ttbr[1] = 0;
+ } else {
+ cb->ttbr[0] = FIELD_PREP(ARM_SMMU_TTBRn_ASID,
+ cfg->asid);
+ cb->ttbr[1] = FIELD_PREP(ARM_SMMU_TTBRn_ASID,
+ cfg->asid);
+
+ if (pgtbl_cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1)
+ cb->ttbr[1] |= pgtbl_cfg->arm_lpae_s1_cfg.ttbr;
+ else
+ cb->ttbr[0] |= pgtbl_cfg->arm_lpae_s1_cfg.ttbr;
+ }
+ } else {
+ cb->ttbr[0] = pgtbl_cfg->arm_lpae_s2_cfg.vttbr;
+ }
+
+ /* MAIRs (stage-1 only) */
+ if (stage1) {
+ if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_S) {
+ cb->mair[0] = pgtbl_cfg->arm_v7s_cfg.prrr;
+ cb->mair[1] = pgtbl_cfg->arm_v7s_cfg.nmrr;
+ } else {
+ cb->mair[0] = pgtbl_cfg->arm_lpae_s1_cfg.mair;
+ cb->mair[1] = pgtbl_cfg->arm_lpae_s1_cfg.mair >> 32;
+ }
+ }
+}
+
+void arm_smmu_write_context_bank(struct arm_smmu_device *smmu, int idx)
+{
+ u32 reg;
+ bool stage1;
+ struct arm_smmu_cb *cb = &smmu->cbs[idx];
+ struct arm_smmu_cfg *cfg = cb->cfg;
+
+ /* Unassigned context banks only need disabling */
+ if (!cfg) {
+ arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, 0);
+ return;
+ }
+
+ stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
+
+ /* CBA2R */
+ if (smmu->version > ARM_SMMU_V1) {
+ if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64)
+ reg = ARM_SMMU_CBA2R_VA64;
+ else
+ reg = 0;
+ /* 16-bit VMIDs live in CBA2R */
+ if (smmu->features & ARM_SMMU_FEAT_VMID16)
+ reg |= FIELD_PREP(ARM_SMMU_CBA2R_VMID16, cfg->vmid);
+
+ arm_smmu_gr1_write(smmu, ARM_SMMU_GR1_CBA2R(idx), reg);
+ }
+
+ /* CBAR */
+ reg = FIELD_PREP(ARM_SMMU_CBAR_TYPE, cfg->cbar);
+ if (smmu->version < ARM_SMMU_V2)
+ reg |= FIELD_PREP(ARM_SMMU_CBAR_IRPTNDX, cfg->irptndx);
+
+ /*
+ * Use the weakest shareability/memory types, so they are
+ * overridden by the ttbcr/pte.
+ */
+ if (stage1) {
+ reg |= FIELD_PREP(ARM_SMMU_CBAR_S1_BPSHCFG,
+ ARM_SMMU_CBAR_S1_BPSHCFG_NSH) |
+ FIELD_PREP(ARM_SMMU_CBAR_S1_MEMATTR,
+ ARM_SMMU_CBAR_S1_MEMATTR_WB);
+ } else if (!(smmu->features & ARM_SMMU_FEAT_VMID16)) {
+ /* 8-bit VMIDs live in CBAR */
+ reg |= FIELD_PREP(ARM_SMMU_CBAR_VMID, cfg->vmid);
+ }
+ arm_smmu_gr1_write(smmu, ARM_SMMU_GR1_CBAR(idx), reg);
+
+ /*
+ * TCR
+ * We must write this before the TTBRs, since it determines the
+ * access behaviour of some fields (in particular, ASID[15:8]).
+ */
+ if (stage1 && smmu->version > ARM_SMMU_V1)
+ arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_TCR2, cb->tcr[1]);
+ arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_TCR, cb->tcr[0]);
+
+ /* TTBRs */
+ if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_S) {
+ arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_CONTEXTIDR, cfg->asid);
+ arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_TTBR0, cb->ttbr[0]);
+ arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_TTBR1, cb->ttbr[1]);
+ } else {
+ arm_smmu_cb_writeq(smmu, idx, ARM_SMMU_CB_TTBR0, cb->ttbr[0]);
+ if (stage1)
+ arm_smmu_cb_writeq(smmu, idx, ARM_SMMU_CB_TTBR1,
+ cb->ttbr[1]);
+ }
+
+ /* MAIRs (stage-1 only) */
+ if (stage1) {
+ arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_S1_MAIR0, cb->mair[0]);
+ arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_S1_MAIR1, cb->mair[1]);
+ }
+
+ /* SCTLR */
+ reg = ARM_SMMU_SCTLR_CFIE | ARM_SMMU_SCTLR_CFRE | ARM_SMMU_SCTLR_AFE |
+ ARM_SMMU_SCTLR_TRE | ARM_SMMU_SCTLR_M;
+ if (stage1)
+ reg |= ARM_SMMU_SCTLR_S1_ASIDPNE;
+ if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
+ reg |= ARM_SMMU_SCTLR_E;
+
+ arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, reg);
+}
+
+static int arm_smmu_alloc_context_bank(struct arm_smmu_domain *smmu_domain,
+ struct arm_smmu_device *smmu,
+ struct device *dev, unsigned int start)
+{
+ if (smmu->impl && smmu->impl->alloc_context_bank)
+ return smmu->impl->alloc_context_bank(smmu_domain, smmu, dev, start);
+
+ return __arm_smmu_alloc_bitmap(smmu->context_map, start, smmu->num_context_banks);
+}
+
+static int arm_smmu_init_domain_context(struct iommu_domain *domain,
+ struct arm_smmu_device *smmu,
+ struct device *dev)
+{
+ int irq, start, ret = 0;
+ unsigned long ias, oas;
+ struct io_pgtable_ops *pgtbl_ops;
+ struct io_pgtable_cfg pgtbl_cfg;
+ enum io_pgtable_fmt fmt;
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+ struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+ irqreturn_t (*context_fault)(int irq, void *dev);
+
+ mutex_lock(&smmu_domain->init_mutex);
+ if (smmu_domain->smmu)
+ goto out_unlock;
+
+ if (domain->type == IOMMU_DOMAIN_IDENTITY) {
+ smmu_domain->stage = ARM_SMMU_DOMAIN_BYPASS;
+ smmu_domain->smmu = smmu;
+ goto out_unlock;
+ }
+
+ /*
+ * Mapping the requested stage onto what we support is surprisingly
+ * complicated, mainly because the spec allows S1+S2 SMMUs without
+ * support for nested translation. That means we end up with the
+ * following table:
+ *
+ * Requested Supported Actual
+ * S1 N S1
+ * S1 S1+S2 S1
+ * S1 S2 S2
+ * S1 S1 S1
+ * N N N
+ * N S1+S2 S2
+ * N S2 S2
+ * N S1 S1
+ *
+ * Note that you can't actually request stage-2 mappings.
+ */
+ if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S1))
+ smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
+ if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S2))
+ smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
+
+ /*
+ * Choosing a suitable context format is even more fiddly. Until we
+ * grow some way for the caller to express a preference, and/or move
+ * the decision into the io-pgtable code where it arguably belongs,
+ * just aim for the closest thing to the rest of the system, and hope
+ * that the hardware isn't esoteric enough that we can't assume AArch64
+ * support to be a superset of AArch32 support...
+ */
+ if (smmu->features & ARM_SMMU_FEAT_FMT_AARCH32_L)
+ cfg->fmt = ARM_SMMU_CTX_FMT_AARCH32_L;
+ if (IS_ENABLED(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) &&
+ !IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_ARM_LPAE) &&
+ (smmu->features & ARM_SMMU_FEAT_FMT_AARCH32_S) &&
+ (smmu_domain->stage == ARM_SMMU_DOMAIN_S1))
+ cfg->fmt = ARM_SMMU_CTX_FMT_AARCH32_S;
+ if ((IS_ENABLED(CONFIG_64BIT) || cfg->fmt == ARM_SMMU_CTX_FMT_NONE) &&
+ (smmu->features & (ARM_SMMU_FEAT_FMT_AARCH64_64K |
+ ARM_SMMU_FEAT_FMT_AARCH64_16K |
+ ARM_SMMU_FEAT_FMT_AARCH64_4K)))
+ cfg->fmt = ARM_SMMU_CTX_FMT_AARCH64;
+
+ if (cfg->fmt == ARM_SMMU_CTX_FMT_NONE) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ switch (smmu_domain->stage) {
+ case ARM_SMMU_DOMAIN_S1:
+ cfg->cbar = CBAR_TYPE_S1_TRANS_S2_BYPASS;
+ start = smmu->num_s2_context_banks;
+ ias = smmu->va_size;
+ oas = smmu->ipa_size;
+ if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64) {
+ fmt = ARM_64_LPAE_S1;
+ } else if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_L) {
+ fmt = ARM_32_LPAE_S1;
+ ias = min(ias, 32UL);
+ oas = min(oas, 40UL);
+ } else {
+ fmt = ARM_V7S;
+ ias = min(ias, 32UL);
+ oas = min(oas, 32UL);
+ }
+ smmu_domain->flush_ops = &arm_smmu_s1_tlb_ops;
+ break;
+ case ARM_SMMU_DOMAIN_NESTED:
+ /*
+ * We will likely want to change this if/when KVM gets
+ * involved.
+ */
+ case ARM_SMMU_DOMAIN_S2:
+ cfg->cbar = CBAR_TYPE_S2_TRANS;
+ start = 0;
+ ias = smmu->ipa_size;
+ oas = smmu->pa_size;
+ if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64) {
+ fmt = ARM_64_LPAE_S2;
+ } else {
+ fmt = ARM_32_LPAE_S2;
+ ias = min(ias, 40UL);
+ oas = min(oas, 40UL);
+ }
+ if (smmu->version == ARM_SMMU_V2)
+ smmu_domain->flush_ops = &arm_smmu_s2_tlb_ops_v2;
+ else
+ smmu_domain->flush_ops = &arm_smmu_s2_tlb_ops_v1;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ ret = arm_smmu_alloc_context_bank(smmu_domain, smmu, dev, start);
+ if (ret < 0) {
+ goto out_unlock;
+ }
+
+ smmu_domain->smmu = smmu;
+
+ cfg->cbndx = ret;
+ if (smmu->version < ARM_SMMU_V2) {
+ cfg->irptndx = atomic_inc_return(&smmu->irptndx);
+ cfg->irptndx %= smmu->num_context_irqs;
+ } else {
+ cfg->irptndx = cfg->cbndx;
+ }
+
+ if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2)
+ cfg->vmid = cfg->cbndx + 1;
+ else
+ cfg->asid = cfg->cbndx;
+
+ pgtbl_cfg = (struct io_pgtable_cfg) {
+ .pgsize_bitmap = smmu->pgsize_bitmap,
+ .ias = ias,
+ .oas = oas,
+ .coherent_walk = smmu->features & ARM_SMMU_FEAT_COHERENT_WALK,
+ .tlb = smmu_domain->flush_ops,
+ .iommu_dev = smmu->dev,
+ };
+
+ if (smmu->impl && smmu->impl->init_context) {
+ ret = smmu->impl->init_context(smmu_domain, &pgtbl_cfg, dev);
+ if (ret)
+ goto out_clear_smmu;
+ }
+
+ if (smmu_domain->non_strict)
+ pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
+
+ pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
+ if (!pgtbl_ops) {
+ ret = -ENOMEM;
+ goto out_clear_smmu;
+ }
+
+ /* Update the domain's page sizes to reflect the page table format */
+ domain->pgsize_bitmap = pgtbl_cfg.pgsize_bitmap;
+
+ if (pgtbl_cfg.quirks & IO_PGTABLE_QUIRK_ARM_TTBR1) {
+ domain->geometry.aperture_start = ~0UL << ias;
+ domain->geometry.aperture_end = ~0UL;
+ } else {
+ domain->geometry.aperture_end = (1UL << ias) - 1;
+ }
+
+ domain->geometry.force_aperture = true;
+
+ /* Initialise the context bank with our page table cfg */
+ arm_smmu_init_context_bank(smmu_domain, &pgtbl_cfg);
+ arm_smmu_write_context_bank(smmu, cfg->cbndx);
+
+ /*
+ * Request context fault interrupt. Do this last to avoid the
+ * handler seeing a half-initialised domain state.
+ */
+ irq = smmu->irqs[smmu->num_global_irqs + cfg->irptndx];
+
+ if (smmu->impl && smmu->impl->context_fault)
+ context_fault = smmu->impl->context_fault;
+ else
+ context_fault = arm_smmu_context_fault;
+
+ ret = devm_request_irq(smmu->dev, irq, context_fault,
+ IRQF_SHARED, "arm-smmu-context-fault", domain);
+ if (ret < 0) {
+ dev_err(smmu->dev, "failed to request context IRQ %d (%u)\n",
+ cfg->irptndx, irq);
+ cfg->irptndx = ARM_SMMU_INVALID_IRPTNDX;
+ }
+
+ mutex_unlock(&smmu_domain->init_mutex);
+
+ /* Publish page table ops for map/unmap */
+ smmu_domain->pgtbl_ops = pgtbl_ops;
+ return 0;
+
+out_clear_smmu:
+ __arm_smmu_free_bitmap(smmu->context_map, cfg->cbndx);
+ smmu_domain->smmu = NULL;
+out_unlock:
+ mutex_unlock(&smmu_domain->init_mutex);
+ return ret;
+}
+
+static void arm_smmu_destroy_domain_context(struct iommu_domain *domain)
+{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+ int ret, irq;
+
+ if (!smmu || domain->type == IOMMU_DOMAIN_IDENTITY)
+ return;
+
+ ret = arm_smmu_rpm_get(smmu);
+ if (ret < 0)
+ return;
+
+ /*
+ * Disable the context bank and free the page tables before freeing
+ * it.
+ */
+ smmu->cbs[cfg->cbndx].cfg = NULL;
+ arm_smmu_write_context_bank(smmu, cfg->cbndx);
+
+ if (cfg->irptndx != ARM_SMMU_INVALID_IRPTNDX) {
+ irq = smmu->irqs[smmu->num_global_irqs + cfg->irptndx];
+ devm_free_irq(smmu->dev, irq, domain);
+ }
+
+ free_io_pgtable_ops(smmu_domain->pgtbl_ops);
+ __arm_smmu_free_bitmap(smmu->context_map, cfg->cbndx);
+
+ arm_smmu_rpm_put(smmu);
+}
+
+static struct iommu_domain *arm_smmu_domain_alloc(unsigned type)
+{
+ struct arm_smmu_domain *smmu_domain;
+
+ if (type != IOMMU_DOMAIN_UNMANAGED &&
+ type != IOMMU_DOMAIN_DMA &&
+ type != IOMMU_DOMAIN_IDENTITY)
+ return NULL;
+ /*
+ * Allocate the domain and initialise some of its data structures.
+ * We can't really do anything meaningful until we've added a
+ * master.
+ */
+ smmu_domain = kzalloc(sizeof(*smmu_domain), GFP_KERNEL);
+ if (!smmu_domain)
+ return NULL;
+
+ if (type == IOMMU_DOMAIN_DMA && (using_legacy_binding ||
+ iommu_get_dma_cookie(&smmu_domain->domain))) {
+ kfree(smmu_domain);
+ return NULL;
+ }
+
+ mutex_init(&smmu_domain->init_mutex);
+ spin_lock_init(&smmu_domain->cb_lock);
+
+ return &smmu_domain->domain;
+}
+
+static void arm_smmu_domain_free(struct iommu_domain *domain)
+{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+ /*
+ * Free the domain resources. We assume that all devices have
+ * already been detached.
+ */
+ iommu_put_dma_cookie(domain);
+ arm_smmu_destroy_domain_context(domain);
+ kfree(smmu_domain);
+}
+
+static void arm_smmu_write_smr(struct arm_smmu_device *smmu, int idx)
+{
+ struct arm_smmu_smr *smr = smmu->smrs + idx;
+ u32 reg = FIELD_PREP(ARM_SMMU_SMR_ID, smr->id) |
+ FIELD_PREP(ARM_SMMU_SMR_MASK, smr->mask);
+
+ if (!(smmu->features & ARM_SMMU_FEAT_EXIDS) && smr->valid)
+ reg |= ARM_SMMU_SMR_VALID;
+ arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_SMR(idx), reg);
+}
+
+static void arm_smmu_write_s2cr(struct arm_smmu_device *smmu, int idx)
+{
+ struct arm_smmu_s2cr *s2cr = smmu->s2crs + idx;
+ u32 reg;
+
+ if (smmu->impl && smmu->impl->write_s2cr) {
+ smmu->impl->write_s2cr(smmu, idx);
+ return;
+ }
+
+ reg = FIELD_PREP(ARM_SMMU_S2CR_TYPE, s2cr->type) |
+ FIELD_PREP(ARM_SMMU_S2CR_CBNDX, s2cr->cbndx) |
+ FIELD_PREP(ARM_SMMU_S2CR_PRIVCFG, s2cr->privcfg);
+
+ if (smmu->features & ARM_SMMU_FEAT_EXIDS && smmu->smrs &&
+ smmu->smrs[idx].valid)
+ reg |= ARM_SMMU_S2CR_EXIDVALID;
+ arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_S2CR(idx), reg);
+}
+
+static void arm_smmu_write_sme(struct arm_smmu_device *smmu, int idx)
+{
+ arm_smmu_write_s2cr(smmu, idx);
+ if (smmu->smrs)
+ arm_smmu_write_smr(smmu, idx);
+}
+
+/*
+ * The width of SMR's mask field depends on sCR0_EXIDENABLE, so this function
+ * should be called after sCR0 is written.
+ */
+static void arm_smmu_test_smr_masks(struct arm_smmu_device *smmu)
+{
+ u32 smr;
+ int i;
+
+ if (!smmu->smrs)
+ return;
+ /*
+ * If we've had to accommodate firmware memory regions, we may
+ * have live SMRs by now; tread carefully...
+ *
+ * Somewhat perversely, not having a free SMR for this test implies we
+ * can get away without it anyway, as we'll only be able to 'allocate'
+ * these SMRs for the ID/mask values we're already trusting to be OK.
+ */
+ for (i = 0; i < smmu->num_mapping_groups; i++)
+ if (!smmu->smrs[i].valid)
+ goto smr_ok;
+ return;
+smr_ok:
+ /*
+ * SMR.ID bits may not be preserved if the corresponding MASK
+ * bits are set, so check each one separately. We can reject
+ * masters later if they try to claim IDs outside these masks.
+ */
+ smr = FIELD_PREP(ARM_SMMU_SMR_ID, smmu->streamid_mask);
+ arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_SMR(i), smr);
+ smr = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_SMR(i));
+ smmu->streamid_mask = FIELD_GET(ARM_SMMU_SMR_ID, smr);
+
+ smr = FIELD_PREP(ARM_SMMU_SMR_MASK, smmu->streamid_mask);
+ arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_SMR(i), smr);
+ smr = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_SMR(i));
+ smmu->smr_mask_mask = FIELD_GET(ARM_SMMU_SMR_MASK, smr);
+}
+
+static int arm_smmu_find_sme(struct arm_smmu_device *smmu, u16 id, u16 mask)
+{
+ struct arm_smmu_smr *smrs = smmu->smrs;
+ int i, free_idx = -ENOSPC;
+
+ /* Stream indexing is blissfully easy */
+ if (!smrs)
+ return id;
+
+ /* Validating SMRs is... less so */
+ for (i = 0; i < smmu->num_mapping_groups; ++i) {
+ if (!smrs[i].valid) {
+ /*
+ * Note the first free entry we come across, which
+ * we'll claim in the end if nothing else matches.
+ */
+ if (free_idx < 0)
+ free_idx = i;
+ continue;
+ }
+ /*
+ * If the new entry is _entirely_ matched by an existing entry,
+ * then reuse that, with the guarantee that there also cannot
+ * be any subsequent conflicting entries. In normal use we'd
+ * expect simply identical entries for this case, but there's
+ * no harm in accommodating the generalisation.
+ */
+ if ((mask & smrs[i].mask) == mask &&
+ !((id ^ smrs[i].id) & ~smrs[i].mask))
+ return i;
+ /*
+ * If the new entry has any other overlap with an existing one,
+ * though, then there always exists at least one stream ID
+ * which would cause a conflict, and we can't allow that risk.
+ */
+ if (!((id ^ smrs[i].id) & ~(smrs[i].mask | mask)))
+ return -EINVAL;
+ }
+
+ return free_idx;
+}
+
+static bool arm_smmu_free_sme(struct arm_smmu_device *smmu, int idx)
+{
+ if (--smmu->s2crs[idx].count)
+ return false;
+
+ smmu->s2crs[idx] = s2cr_init_val;
+ if (smmu->smrs)
+ smmu->smrs[idx].valid = false;
+
+ return true;
+}
+
+static int arm_smmu_master_alloc_smes(struct device *dev)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev);
+ struct arm_smmu_device *smmu = cfg->smmu;
+ struct arm_smmu_smr *smrs = smmu->smrs;
+ int i, idx, ret;
+
+ mutex_lock(&smmu->stream_map_mutex);
+ /* Figure out a viable stream map entry allocation */
+ for_each_cfg_sme(cfg, fwspec, i, idx) {
+ u16 sid = FIELD_GET(ARM_SMMU_SMR_ID, fwspec->ids[i]);
+ u16 mask = FIELD_GET(ARM_SMMU_SMR_MASK, fwspec->ids[i]);
+
+ if (idx != INVALID_SMENDX) {
+ ret = -EEXIST;
+ goto out_err;
+ }
+
+ ret = arm_smmu_find_sme(smmu, sid, mask);
+ if (ret < 0)
+ goto out_err;
+
+ idx = ret;
+ if (smrs && smmu->s2crs[idx].count == 0) {
+ smrs[idx].id = sid;
+ smrs[idx].mask = mask;
+ smrs[idx].valid = true;
+ }
+ smmu->s2crs[idx].count++;
+ cfg->smendx[i] = (s16)idx;
+ }
+
+ /* It worked! Now, poke the actual hardware */
+ for_each_cfg_sme(cfg, fwspec, i, idx)
+ arm_smmu_write_sme(smmu, idx);
+
+ mutex_unlock(&smmu->stream_map_mutex);
+ return 0;
+
+out_err:
+ while (i--) {
+ arm_smmu_free_sme(smmu, cfg->smendx[i]);
+ cfg->smendx[i] = INVALID_SMENDX;
+ }
+ mutex_unlock(&smmu->stream_map_mutex);
+ return ret;
+}
+
+static void arm_smmu_master_free_smes(struct arm_smmu_master_cfg *cfg,
+ struct iommu_fwspec *fwspec)
+{
+ struct arm_smmu_device *smmu = cfg->smmu;
+ int i, idx;
+
+ mutex_lock(&smmu->stream_map_mutex);
+ for_each_cfg_sme(cfg, fwspec, i, idx) {
+ if (arm_smmu_free_sme(smmu, idx))
+ arm_smmu_write_sme(smmu, idx);
+ cfg->smendx[i] = INVALID_SMENDX;
+ }
+ mutex_unlock(&smmu->stream_map_mutex);
+}
+
+static int arm_smmu_domain_add_master(struct arm_smmu_domain *smmu_domain,
+ struct arm_smmu_master_cfg *cfg,
+ struct iommu_fwspec *fwspec)
+{
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ struct arm_smmu_s2cr *s2cr = smmu->s2crs;
+ u8 cbndx = smmu_domain->cfg.cbndx;
+ enum arm_smmu_s2cr_type type;
+ int i, idx;
+
+ if (smmu_domain->stage == ARM_SMMU_DOMAIN_BYPASS)
+ type = S2CR_TYPE_BYPASS;
+ else
+ type = S2CR_TYPE_TRANS;
+
+ for_each_cfg_sme(cfg, fwspec, i, idx) {
+ if (type == s2cr[idx].type && cbndx == s2cr[idx].cbndx)
+ continue;
+
+ s2cr[idx].type = type;
+ s2cr[idx].privcfg = S2CR_PRIVCFG_DEFAULT;
+ s2cr[idx].cbndx = cbndx;
+ arm_smmu_write_s2cr(smmu, idx);
+ }
+ return 0;
+}
+
+static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
+{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct arm_smmu_master_cfg *cfg;
+ struct arm_smmu_device *smmu;
+ int ret;
+
+ if (!fwspec || fwspec->ops != &arm_smmu_ops) {
+ dev_err(dev, "cannot attach to SMMU, is it on the same bus?\n");
+ return -ENXIO;
+ }
+
+ /*
+ * FIXME: The arch/arm DMA API code tries to attach devices to its own
+ * domains between of_xlate() and probe_device() - we have no way to cope
+ * with that, so until ARM gets converted to rely on groups and default
+ * domains, just say no (but more politely than by dereferencing NULL).
+ * This should be at least a WARN_ON once that's sorted.
+ */
+ cfg = dev_iommu_priv_get(dev);
+ if (!cfg)
+ return -ENODEV;
+
+ smmu = cfg->smmu;
+
+ ret = arm_smmu_rpm_get(smmu);
+ if (ret < 0)
+ return ret;
+
+ /* Ensure that the domain is finalised */
+ ret = arm_smmu_init_domain_context(domain, smmu, dev);
+ if (ret < 0)
+ goto rpm_put;
+
+ /*
+ * Sanity check the domain. We don't support domains across
+ * different SMMUs.
+ */
+ if (smmu_domain->smmu != smmu) {
+ dev_err(dev,
+ "cannot attach to SMMU %s whilst already attached to domain on SMMU %s\n",
+ dev_name(smmu_domain->smmu->dev), dev_name(smmu->dev));
+ ret = -EINVAL;
+ goto rpm_put;
+ }
+
+ /* Looks ok, so add the device to the domain */
+ ret = arm_smmu_domain_add_master(smmu_domain, cfg, fwspec);
+
+ /*
+ * Setup an autosuspend delay to avoid bouncing runpm state.
+ * Otherwise, if a driver for a suspended consumer device
+ * unmaps buffers, it will runpm resume/suspend for each one.
+ *
+ * For example, when used by a GPU device, when an application
+ * or game exits, it can trigger unmapping 100s or 1000s of
+ * buffers. With a runpm cycle for each buffer, that adds up
+ * to 5-10sec worth of reprogramming the context bank, while
+ * the system appears to be locked up to the user.
+ */
+ pm_runtime_set_autosuspend_delay(smmu->dev, 20);
+ pm_runtime_use_autosuspend(smmu->dev);
+
+rpm_put:
+ arm_smmu_rpm_put(smmu);
+ return ret;
+}
+
+static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+ struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
+ struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
+ int ret;
+
+ if (!ops)
+ return -ENODEV;
+
+ arm_smmu_rpm_get(smmu);
+ ret = ops->map(ops, iova, paddr, size, prot, gfp);
+ arm_smmu_rpm_put(smmu);
+
+ return ret;
+}
+
+static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
+ size_t size, struct iommu_iotlb_gather *gather)
+{
+ struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
+ struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
+ size_t ret;
+
+ if (!ops)
+ return 0;
+
+ arm_smmu_rpm_get(smmu);
+ ret = ops->unmap(ops, iova, size, gather);
+ arm_smmu_rpm_put(smmu);
+
+ return ret;
+}
+
+static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain)
+{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+
+ if (smmu_domain->flush_ops) {
+ arm_smmu_rpm_get(smmu);
+ smmu_domain->flush_ops->tlb_flush_all(smmu_domain);
+ arm_smmu_rpm_put(smmu);
+ }
+}
+
+static void arm_smmu_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
+{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+
+ if (!smmu)
+ return;
+
+ arm_smmu_rpm_get(smmu);
+ if (smmu->version == ARM_SMMU_V2 ||
+ smmu_domain->stage == ARM_SMMU_DOMAIN_S1)
+ arm_smmu_tlb_sync_context(smmu_domain);
+ else
+ arm_smmu_tlb_sync_global(smmu);
+ arm_smmu_rpm_put(smmu);
+}
+
+static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+ struct io_pgtable_ops *ops= smmu_domain->pgtbl_ops;
+ struct device *dev = smmu->dev;
+ void __iomem *reg;
+ u32 tmp;
+ u64 phys;
+ unsigned long va, flags;
+ int ret, idx = cfg->cbndx;
+ phys_addr_t addr = 0;
+
+ ret = arm_smmu_rpm_get(smmu);
+ if (ret < 0)
+ return 0;
+
+ spin_lock_irqsave(&smmu_domain->cb_lock, flags);
+ va = iova & ~0xfffUL;
+ if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64)
+ arm_smmu_cb_writeq(smmu, idx, ARM_SMMU_CB_ATS1PR, va);
+ else
+ arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_ATS1PR, va);
+
+ reg = arm_smmu_page(smmu, ARM_SMMU_CB(smmu, idx)) + ARM_SMMU_CB_ATSR;
+ if (readl_poll_timeout_atomic(reg, tmp, !(tmp & ARM_SMMU_ATSR_ACTIVE),
+ 5, 50)) {
+ spin_unlock_irqrestore(&smmu_domain->cb_lock, flags);
+ dev_err(dev,
+ "iova to phys timed out on %pad. Falling back to software table walk.\n",
+ &iova);
+ arm_smmu_rpm_put(smmu);
+ return ops->iova_to_phys(ops, iova);
+ }
+
+ phys = arm_smmu_cb_readq(smmu, idx, ARM_SMMU_CB_PAR);
+ spin_unlock_irqrestore(&smmu_domain->cb_lock, flags);
+ if (phys & ARM_SMMU_CB_PAR_F) {
+ dev_err(dev, "translation fault!\n");
+ dev_err(dev, "PAR = 0x%llx\n", phys);
+ goto out;
+ }
+
+ addr = (phys & GENMASK_ULL(39, 12)) | (iova & 0xfff);
+out:
+ arm_smmu_rpm_put(smmu);
+
+ return addr;
+}
+
+static phys_addr_t arm_smmu_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+ struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
+
+ if (domain->type == IOMMU_DOMAIN_IDENTITY)
+ return iova;
+
+ if (!ops)
+ return 0;
+
+ if (smmu_domain->smmu->features & ARM_SMMU_FEAT_TRANS_OPS &&
+ smmu_domain->stage == ARM_SMMU_DOMAIN_S1)
+ return arm_smmu_iova_to_phys_hard(domain, iova);
+
+ return ops->iova_to_phys(ops, iova);
+}
+
+static bool arm_smmu_capable(enum iommu_cap cap)
+{
+ switch (cap) {
+ case IOMMU_CAP_CACHE_COHERENCY:
+ /*
+ * Return true here as the SMMU can always send out coherent
+ * requests.
+ */
+ return true;
+ case IOMMU_CAP_NOEXEC:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static
+struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode)
+{
+ struct device *dev = driver_find_device_by_fwnode(&arm_smmu_driver.driver,
+ fwnode);
+ put_device(dev);
+ return dev ? dev_get_drvdata(dev) : NULL;
+}
+
+static struct iommu_device *arm_smmu_probe_device(struct device *dev)
+{
+ struct arm_smmu_device *smmu = NULL;
+ struct arm_smmu_master_cfg *cfg;
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ int i, ret;
+
+ if (using_legacy_binding) {
+ ret = arm_smmu_register_legacy_master(dev, &smmu);
+
+ /*
+ * If dev->iommu_fwspec is initally NULL, arm_smmu_register_legacy_master()
+ * will allocate/initialise a new one. Thus we need to update fwspec for
+ * later use.
+ */
+ fwspec = dev_iommu_fwspec_get(dev);
+ if (ret)
+ goto out_free;
+ } else if (fwspec && fwspec->ops == &arm_smmu_ops) {
+ smmu = arm_smmu_get_by_fwnode(fwspec->iommu_fwnode);
+ } else {
+ return ERR_PTR(-ENODEV);
+ }
+
+ ret = -EINVAL;
+ for (i = 0; i < fwspec->num_ids; i++) {
+ u16 sid = FIELD_GET(ARM_SMMU_SMR_ID, fwspec->ids[i]);
+ u16 mask = FIELD_GET(ARM_SMMU_SMR_MASK, fwspec->ids[i]);
+
+ if (sid & ~smmu->streamid_mask) {
+ dev_err(dev, "stream ID 0x%x out of range for SMMU (0x%x)\n",
+ sid, smmu->streamid_mask);
+ goto out_free;
+ }
+ if (mask & ~smmu->smr_mask_mask) {
+ dev_err(dev, "SMR mask 0x%x out of range for SMMU (0x%x)\n",
+ mask, smmu->smr_mask_mask);
+ goto out_free;
+ }
+ }
+
+ ret = -ENOMEM;
+ cfg = kzalloc(offsetof(struct arm_smmu_master_cfg, smendx[i]),
+ GFP_KERNEL);
+ if (!cfg)
+ goto out_free;
+
+ cfg->smmu = smmu;
+ dev_iommu_priv_set(dev, cfg);
+ while (i--)
+ cfg->smendx[i] = INVALID_SMENDX;
+
+ ret = arm_smmu_rpm_get(smmu);
+ if (ret < 0)
+ goto out_cfg_free;
+
+ ret = arm_smmu_master_alloc_smes(dev);
+ arm_smmu_rpm_put(smmu);
+
+ if (ret)
+ goto out_cfg_free;
+
+ device_link_add(dev, smmu->dev,
+ DL_FLAG_PM_RUNTIME | DL_FLAG_AUTOREMOVE_SUPPLIER);
+
+ return &smmu->iommu;
+
+out_cfg_free:
+ kfree(cfg);
+out_free:
+ iommu_fwspec_free(dev);
+ return ERR_PTR(ret);
+}
+
+static void arm_smmu_release_device(struct device *dev)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct arm_smmu_master_cfg *cfg;
+ struct arm_smmu_device *smmu;
+ int ret;
+
+ if (!fwspec || fwspec->ops != &arm_smmu_ops)
+ return;
+
+ cfg = dev_iommu_priv_get(dev);
+ smmu = cfg->smmu;
+
+ ret = arm_smmu_rpm_get(smmu);
+ if (ret < 0)
+ return;
+
+ arm_smmu_master_free_smes(cfg, fwspec);
+
+ arm_smmu_rpm_put(smmu);
+
+ dev_iommu_priv_set(dev, NULL);
+ kfree(cfg);
+ iommu_fwspec_free(dev);
+}
+
+static struct iommu_group *arm_smmu_device_group(struct device *dev)
+{
+ struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev);
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct arm_smmu_device *smmu = cfg->smmu;
+ struct iommu_group *group = NULL;
+ int i, idx;
+
+ for_each_cfg_sme(cfg, fwspec, i, idx) {
+ if (group && smmu->s2crs[idx].group &&
+ group != smmu->s2crs[idx].group)
+ return ERR_PTR(-EINVAL);
+
+ group = smmu->s2crs[idx].group;
+ }
+
+ if (group)
+ return iommu_group_ref_get(group);
+
+ if (dev_is_pci(dev))
+ group = pci_device_group(dev);
+ else if (dev_is_fsl_mc(dev))
+ group = fsl_mc_device_group(dev);
+ else
+ group = generic_device_group(dev);
+
+ /* Remember group for faster lookups */
+ if (!IS_ERR(group))
+ for_each_cfg_sme(cfg, fwspec, i, idx)
+ smmu->s2crs[idx].group = group;
+
+ return group;
+}
+
+static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
+ enum iommu_attr attr, void *data)
+{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+ switch(domain->type) {
+ case IOMMU_DOMAIN_UNMANAGED:
+ switch (attr) {
+ case DOMAIN_ATTR_NESTING:
+ *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
+ return 0;
+ default:
+ return -ENODEV;
+ }
+ break;
+ case IOMMU_DOMAIN_DMA:
+ switch (attr) {
+ case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
+ *(int *)data = smmu_domain->non_strict;
+ return 0;
+ default:
+ return -ENODEV;
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+}
+
+static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
+ enum iommu_attr attr, void *data)
+{
+ int ret = 0;
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+ mutex_lock(&smmu_domain->init_mutex);
+
+ switch(domain->type) {
+ case IOMMU_DOMAIN_UNMANAGED:
+ switch (attr) {
+ case DOMAIN_ATTR_NESTING:
+ if (smmu_domain->smmu) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ if (*(int *)data)
+ smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
+ else
+ smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
+ break;
+ default:
+ ret = -ENODEV;
+ }
+ break;
+ case IOMMU_DOMAIN_DMA:
+ switch (attr) {
+ case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
+ smmu_domain->non_strict = *(int *)data;
+ break;
+ default:
+ ret = -ENODEV;
+ }
+ break;
+ default:
+ ret = -EINVAL;
+ }
+out_unlock:
+ mutex_unlock(&smmu_domain->init_mutex);
+ return ret;
+}
+
+static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
+{
+ u32 mask, fwid = 0;
+
+ if (args->args_count > 0)
+ fwid |= FIELD_PREP(ARM_SMMU_SMR_ID, args->args[0]);
+
+ if (args->args_count > 1)
+ fwid |= FIELD_PREP(ARM_SMMU_SMR_MASK, args->args[1]);
+ else if (!of_property_read_u32(args->np, "stream-match-mask", &mask))
+ fwid |= FIELD_PREP(ARM_SMMU_SMR_MASK, mask);
+
+ return iommu_fwspec_add_ids(dev, &fwid, 1);
+}
+
+static void arm_smmu_get_resv_regions(struct device *dev,
+ struct list_head *head)
+{
+ struct iommu_resv_region *region;
+ int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+
+ region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH,
+ prot, IOMMU_RESV_SW_MSI);
+ if (!region)
+ return;
+
+ list_add_tail(&region->list, head);
+
+ iommu_dma_get_resv_regions(dev, head);
+}
+
+static int arm_smmu_def_domain_type(struct device *dev)
+{
+ struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev);
+ const struct arm_smmu_impl *impl = cfg->smmu->impl;
+
+ if (impl && impl->def_domain_type)
+ return impl->def_domain_type(dev);
+
+ return 0;
+}
+
+static struct iommu_ops arm_smmu_ops = {
+ .capable = arm_smmu_capable,
+ .domain_alloc = arm_smmu_domain_alloc,
+ .domain_free = arm_smmu_domain_free,
+ .attach_dev = arm_smmu_attach_dev,
+ .map = arm_smmu_map,
+ .unmap = arm_smmu_unmap,
+ .flush_iotlb_all = arm_smmu_flush_iotlb_all,
+ .iotlb_sync = arm_smmu_iotlb_sync,
+ .iova_to_phys = arm_smmu_iova_to_phys,
+ .probe_device = arm_smmu_probe_device,
+ .release_device = arm_smmu_release_device,
+ .device_group = arm_smmu_device_group,
+ .domain_get_attr = arm_smmu_domain_get_attr,
+ .domain_set_attr = arm_smmu_domain_set_attr,
+ .of_xlate = arm_smmu_of_xlate,
+ .get_resv_regions = arm_smmu_get_resv_regions,
+ .put_resv_regions = generic_iommu_put_resv_regions,
+ .def_domain_type = arm_smmu_def_domain_type,
+ .pgsize_bitmap = -1UL, /* Restricted during device attach */
+};
+
+static void arm_smmu_device_reset(struct arm_smmu_device *smmu)
+{
+ int i;
+ u32 reg;
+
+ /* clear global FSR */
+ reg = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sGFSR);
+ arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_sGFSR, reg);
+
+ /*
+ * Reset stream mapping groups: Initial values mark all SMRn as
+ * invalid and all S2CRn as bypass unless overridden.
+ */
+ for (i = 0; i < smmu->num_mapping_groups; ++i)
+ arm_smmu_write_sme(smmu, i);
+
+ /* Make sure all context banks are disabled and clear CB_FSR */
+ for (i = 0; i < smmu->num_context_banks; ++i) {
+ arm_smmu_write_context_bank(smmu, i);
+ arm_smmu_cb_write(smmu, i, ARM_SMMU_CB_FSR, ARM_SMMU_FSR_FAULT);
+ }
+
+ /* Invalidate the TLB, just in case */
+ arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_TLBIALLH, QCOM_DUMMY_VAL);
+ arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_TLBIALLNSNH, QCOM_DUMMY_VAL);
+
+ reg = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sCR0);
+
+ /* Enable fault reporting */
+ reg |= (ARM_SMMU_sCR0_GFRE | ARM_SMMU_sCR0_GFIE |
+ ARM_SMMU_sCR0_GCFGFRE | ARM_SMMU_sCR0_GCFGFIE);
+
+ /* Disable TLB broadcasting. */
+ reg |= (ARM_SMMU_sCR0_VMIDPNE | ARM_SMMU_sCR0_PTM);
+
+ /* Enable client access, handling unmatched streams as appropriate */
+ reg &= ~ARM_SMMU_sCR0_CLIENTPD;
+ if (disable_bypass)
+ reg |= ARM_SMMU_sCR0_USFCFG;
+ else
+ reg &= ~ARM_SMMU_sCR0_USFCFG;
+
+ /* Disable forced broadcasting */
+ reg &= ~ARM_SMMU_sCR0_FB;
+
+ /* Don't upgrade barriers */
+ reg &= ~(ARM_SMMU_sCR0_BSU);
+
+ if (smmu->features & ARM_SMMU_FEAT_VMID16)
+ reg |= ARM_SMMU_sCR0_VMID16EN;
+
+ if (smmu->features & ARM_SMMU_FEAT_EXIDS)
+ reg |= ARM_SMMU_sCR0_EXIDENABLE;
+
+ if (smmu->impl && smmu->impl->reset)
+ smmu->impl->reset(smmu);
+
+ /* Push the button */
+ arm_smmu_tlb_sync_global(smmu);
+ arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_sCR0, reg);
+}
+
+static int arm_smmu_id_size_to_bits(int size)
+{
+ switch (size) {
+ case 0:
+ return 32;
+ case 1:
+ return 36;
+ case 2:
+ return 40;
+ case 3:
+ return 42;
+ case 4:
+ return 44;
+ case 5:
+ default:
+ return 48;
+ }
+}
+
+static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
+{
+ unsigned int size;
+ u32 id;
+ bool cttw_reg, cttw_fw = smmu->features & ARM_SMMU_FEAT_COHERENT_WALK;
+ int i, ret;
+
+ dev_notice(smmu->dev, "probing hardware configuration...\n");
+ dev_notice(smmu->dev, "SMMUv%d with:\n",
+ smmu->version == ARM_SMMU_V2 ? 2 : 1);
+
+ /* ID0 */
+ id = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_ID0);
+
+ /* Restrict available stages based on module parameter */
+ if (force_stage == 1)
+ id &= ~(ARM_SMMU_ID0_S2TS | ARM_SMMU_ID0_NTS);
+ else if (force_stage == 2)
+ id &= ~(ARM_SMMU_ID0_S1TS | ARM_SMMU_ID0_NTS);
+
+ if (id & ARM_SMMU_ID0_S1TS) {
+ smmu->features |= ARM_SMMU_FEAT_TRANS_S1;
+ dev_notice(smmu->dev, "\tstage 1 translation\n");
+ }
+
+ if (id & ARM_SMMU_ID0_S2TS) {
+ smmu->features |= ARM_SMMU_FEAT_TRANS_S2;
+ dev_notice(smmu->dev, "\tstage 2 translation\n");
+ }
+
+ if (id & ARM_SMMU_ID0_NTS) {
+ smmu->features |= ARM_SMMU_FEAT_TRANS_NESTED;
+ dev_notice(smmu->dev, "\tnested translation\n");
+ }
+
+ if (!(smmu->features &
+ (ARM_SMMU_FEAT_TRANS_S1 | ARM_SMMU_FEAT_TRANS_S2))) {
+ dev_err(smmu->dev, "\tno translation support!\n");
+ return -ENODEV;
+ }
+
+ if ((id & ARM_SMMU_ID0_S1TS) &&
+ ((smmu->version < ARM_SMMU_V2) || !(id & ARM_SMMU_ID0_ATOSNS))) {
+ smmu->features |= ARM_SMMU_FEAT_TRANS_OPS;
+ dev_notice(smmu->dev, "\taddress translation ops\n");
+ }
+
+ /*
+ * In order for DMA API calls to work properly, we must defer to what
+ * the FW says about coherency, regardless of what the hardware claims.
+ * Fortunately, this also opens up a workaround for systems where the
+ * ID register value has ended up configured incorrectly.
+ */
+ cttw_reg = !!(id & ARM_SMMU_ID0_CTTW);
+ if (cttw_fw || cttw_reg)
+ dev_notice(smmu->dev, "\t%scoherent table walk\n",
+ cttw_fw ? "" : "non-");
+ if (cttw_fw != cttw_reg)
+ dev_notice(smmu->dev,
+ "\t(IDR0.CTTW overridden by FW configuration)\n");
+
+ /* Max. number of entries we have for stream matching/indexing */
+ if (smmu->version == ARM_SMMU_V2 && id & ARM_SMMU_ID0_EXIDS) {
+ smmu->features |= ARM_SMMU_FEAT_EXIDS;
+ size = 1 << 16;
+ } else {
+ size = 1 << FIELD_GET(ARM_SMMU_ID0_NUMSIDB, id);
+ }
+ smmu->streamid_mask = size - 1;
+ if (id & ARM_SMMU_ID0_SMS) {
+ smmu->features |= ARM_SMMU_FEAT_STREAM_MATCH;
+ size = FIELD_GET(ARM_SMMU_ID0_NUMSMRG, id);
+ if (size == 0) {
+ dev_err(smmu->dev,
+ "stream-matching supported, but no SMRs present!\n");
+ return -ENODEV;
+ }
+
+ /* Zero-initialised to mark as invalid */
+ smmu->smrs = devm_kcalloc(smmu->dev, size, sizeof(*smmu->smrs),
+ GFP_KERNEL);
+ if (!smmu->smrs)
+ return -ENOMEM;
+
+ dev_notice(smmu->dev,
+ "\tstream matching with %u register groups", size);
+ }
+ /* s2cr->type == 0 means translation, so initialise explicitly */
+ smmu->s2crs = devm_kmalloc_array(smmu->dev, size, sizeof(*smmu->s2crs),
+ GFP_KERNEL);
+ if (!smmu->s2crs)
+ return -ENOMEM;
+ for (i = 0; i < size; i++)
+ smmu->s2crs[i] = s2cr_init_val;
+
+ smmu->num_mapping_groups = size;
+ mutex_init(&smmu->stream_map_mutex);
+ spin_lock_init(&smmu->global_sync_lock);
+
+ if (smmu->version < ARM_SMMU_V2 ||
+ !(id & ARM_SMMU_ID0_PTFS_NO_AARCH32)) {
+ smmu->features |= ARM_SMMU_FEAT_FMT_AARCH32_L;
+ if (!(id & ARM_SMMU_ID0_PTFS_NO_AARCH32S))
+ smmu->features |= ARM_SMMU_FEAT_FMT_AARCH32_S;
+ }
+
+ /* ID1 */
+ id = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_ID1);
+ smmu->pgshift = (id & ARM_SMMU_ID1_PAGESIZE) ? 16 : 12;
+
+ /* Check for size mismatch of SMMU address space from mapped region */
+ size = 1 << (FIELD_GET(ARM_SMMU_ID1_NUMPAGENDXB, id) + 1);
+ if (smmu->numpage != 2 * size << smmu->pgshift)
+ dev_warn(smmu->dev,
+ "SMMU address space size (0x%x) differs from mapped region size (0x%x)!\n",
+ 2 * size << smmu->pgshift, smmu->numpage);
+ /* Now properly encode NUMPAGE to subsequently derive SMMU_CB_BASE */
+ smmu->numpage = size;
+
+ smmu->num_s2_context_banks = FIELD_GET(ARM_SMMU_ID1_NUMS2CB, id);
+ smmu->num_context_banks = FIELD_GET(ARM_SMMU_ID1_NUMCB, id);
+ if (smmu->num_s2_context_banks > smmu->num_context_banks) {
+ dev_err(smmu->dev, "impossible number of S2 context banks!\n");
+ return -ENODEV;
+ }
+ dev_notice(smmu->dev, "\t%u context banks (%u stage-2 only)\n",
+ smmu->num_context_banks, smmu->num_s2_context_banks);
+ smmu->cbs = devm_kcalloc(smmu->dev, smmu->num_context_banks,
+ sizeof(*smmu->cbs), GFP_KERNEL);
+ if (!smmu->cbs)
+ return -ENOMEM;
+
+ /* ID2 */
+ id = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_ID2);
+ size = arm_smmu_id_size_to_bits(FIELD_GET(ARM_SMMU_ID2_IAS, id));
+ smmu->ipa_size = size;
+
+ /* The output mask is also applied for bypass */
+ size = arm_smmu_id_size_to_bits(FIELD_GET(ARM_SMMU_ID2_OAS, id));
+ smmu->pa_size = size;
+
+ if (id & ARM_SMMU_ID2_VMID16)
+ smmu->features |= ARM_SMMU_FEAT_VMID16;
+
+ /*
+ * What the page table walker can address actually depends on which
+ * descriptor format is in use, but since a) we don't know that yet,
+ * and b) it can vary per context bank, this will have to do...
+ */
+ if (dma_set_mask_and_coherent(smmu->dev, DMA_BIT_MASK(size)))
+ dev_warn(smmu->dev,
+ "failed to set DMA mask for table walker\n");
+
+ if (smmu->version < ARM_SMMU_V2) {
+ smmu->va_size = smmu->ipa_size;
+ if (smmu->version == ARM_SMMU_V1_64K)
+ smmu->features |= ARM_SMMU_FEAT_FMT_AARCH64_64K;
+ } else {
+ size = FIELD_GET(ARM_SMMU_ID2_UBS, id);
+ smmu->va_size = arm_smmu_id_size_to_bits(size);
+ if (id & ARM_SMMU_ID2_PTFS_4K)
+ smmu->features |= ARM_SMMU_FEAT_FMT_AARCH64_4K;
+ if (id & ARM_SMMU_ID2_PTFS_16K)
+ smmu->features |= ARM_SMMU_FEAT_FMT_AARCH64_16K;
+ if (id & ARM_SMMU_ID2_PTFS_64K)
+ smmu->features |= ARM_SMMU_FEAT_FMT_AARCH64_64K;
+ }
+
+ if (smmu->impl && smmu->impl->cfg_probe) {
+ ret = smmu->impl->cfg_probe(smmu);
+ if (ret)
+ return ret;
+ }
+
+ /* Now we've corralled the various formats, what'll it do? */
+ if (smmu->features & ARM_SMMU_FEAT_FMT_AARCH32_S)
+ smmu->pgsize_bitmap |= SZ_4K | SZ_64K | SZ_1M | SZ_16M;
+ if (smmu->features &
+ (ARM_SMMU_FEAT_FMT_AARCH32_L | ARM_SMMU_FEAT_FMT_AARCH64_4K))
+ smmu->pgsize_bitmap |= SZ_4K | SZ_2M | SZ_1G;
+ if (smmu->features & ARM_SMMU_FEAT_FMT_AARCH64_16K)
+ smmu->pgsize_bitmap |= SZ_16K | SZ_32M;
+ if (smmu->features & ARM_SMMU_FEAT_FMT_AARCH64_64K)
+ smmu->pgsize_bitmap |= SZ_64K | SZ_512M;
+
+ if (arm_smmu_ops.pgsize_bitmap == -1UL)
+ arm_smmu_ops.pgsize_bitmap = smmu->pgsize_bitmap;
+ else
+ arm_smmu_ops.pgsize_bitmap |= smmu->pgsize_bitmap;
+ dev_notice(smmu->dev, "\tSupported page sizes: 0x%08lx\n",
+ smmu->pgsize_bitmap);
+
+
+ if (smmu->features & ARM_SMMU_FEAT_TRANS_S1)
+ dev_notice(smmu->dev, "\tStage-1: %lu-bit VA -> %lu-bit IPA\n",
+ smmu->va_size, smmu->ipa_size);
+
+ if (smmu->features & ARM_SMMU_FEAT_TRANS_S2)
+ dev_notice(smmu->dev, "\tStage-2: %lu-bit IPA -> %lu-bit PA\n",
+ smmu->ipa_size, smmu->pa_size);
+
+ return 0;
+}
+
+struct arm_smmu_match_data {
+ enum arm_smmu_arch_version version;
+ enum arm_smmu_implementation model;
+};
+
+#define ARM_SMMU_MATCH_DATA(name, ver, imp) \
+static const struct arm_smmu_match_data name = { .version = ver, .model = imp }
+
+ARM_SMMU_MATCH_DATA(smmu_generic_v1, ARM_SMMU_V1, GENERIC_SMMU);
+ARM_SMMU_MATCH_DATA(smmu_generic_v2, ARM_SMMU_V2, GENERIC_SMMU);
+ARM_SMMU_MATCH_DATA(arm_mmu401, ARM_SMMU_V1_64K, GENERIC_SMMU);
+ARM_SMMU_MATCH_DATA(arm_mmu500, ARM_SMMU_V2, ARM_MMU500);
+ARM_SMMU_MATCH_DATA(cavium_smmuv2, ARM_SMMU_V2, CAVIUM_SMMUV2);
+ARM_SMMU_MATCH_DATA(qcom_smmuv2, ARM_SMMU_V2, QCOM_SMMUV2);
+
+static const struct of_device_id arm_smmu_of_match[] = {
+ { .compatible = "arm,smmu-v1", .data = &smmu_generic_v1 },
+ { .compatible = "arm,smmu-v2", .data = &smmu_generic_v2 },
+ { .compatible = "arm,mmu-400", .data = &smmu_generic_v1 },
+ { .compatible = "arm,mmu-401", .data = &arm_mmu401 },
+ { .compatible = "arm,mmu-500", .data = &arm_mmu500 },
+ { .compatible = "cavium,smmu-v2", .data = &cavium_smmuv2 },
+ { .compatible = "nvidia,smmu-500", .data = &arm_mmu500 },
+ { .compatible = "qcom,smmu-v2", .data = &qcom_smmuv2 },
+ { },
+};
+MODULE_DEVICE_TABLE(of, arm_smmu_of_match);
+
+#ifdef CONFIG_ACPI
+static int acpi_smmu_get_data(u32 model, struct arm_smmu_device *smmu)
+{
+ int ret = 0;
+
+ switch (model) {
+ case ACPI_IORT_SMMU_V1:
+ case ACPI_IORT_SMMU_CORELINK_MMU400:
+ smmu->version = ARM_SMMU_V1;
+ smmu->model = GENERIC_SMMU;
+ break;
+ case ACPI_IORT_SMMU_CORELINK_MMU401:
+ smmu->version = ARM_SMMU_V1_64K;
+ smmu->model = GENERIC_SMMU;
+ break;
+ case ACPI_IORT_SMMU_V2:
+ smmu->version = ARM_SMMU_V2;
+ smmu->model = GENERIC_SMMU;
+ break;
+ case ACPI_IORT_SMMU_CORELINK_MMU500:
+ smmu->version = ARM_SMMU_V2;
+ smmu->model = ARM_MMU500;
+ break;
+ case ACPI_IORT_SMMU_CAVIUM_THUNDERX:
+ smmu->version = ARM_SMMU_V2;
+ smmu->model = CAVIUM_SMMUV2;
+ break;
+ default:
+ ret = -ENODEV;
+ }
+
+ return ret;
+}
+
+static int arm_smmu_device_acpi_probe(struct platform_device *pdev,
+ struct arm_smmu_device *smmu)
+{
+ struct device *dev = smmu->dev;
+ struct acpi_iort_node *node =
+ *(struct acpi_iort_node **)dev_get_platdata(dev);
+ struct acpi_iort_smmu *iort_smmu;
+ int ret;
+
+ /* Retrieve SMMU1/2 specific data */
+ iort_smmu = (struct acpi_iort_smmu *)node->node_data;
+
+ ret = acpi_smmu_get_data(iort_smmu->model, smmu);
+ if (ret < 0)
+ return ret;
+
+ /* Ignore the configuration access interrupt */
+ smmu->num_global_irqs = 1;
+
+ if (iort_smmu->flags & ACPI_IORT_SMMU_COHERENT_WALK)
+ smmu->features |= ARM_SMMU_FEAT_COHERENT_WALK;
+
+ return 0;
+}
+#else
+static inline int arm_smmu_device_acpi_probe(struct platform_device *pdev,
+ struct arm_smmu_device *smmu)
+{
+ return -ENODEV;
+}
+#endif
+
+static int arm_smmu_device_dt_probe(struct platform_device *pdev,
+ struct arm_smmu_device *smmu)
+{
+ const struct arm_smmu_match_data *data;
+ struct device *dev = &pdev->dev;
+ bool legacy_binding;
+
+ if (of_property_read_u32(dev->of_node, "#global-interrupts",
+ &smmu->num_global_irqs)) {
+ dev_err(dev, "missing #global-interrupts property\n");
+ return -ENODEV;
+ }
+
+ data = of_device_get_match_data(dev);
+ smmu->version = data->version;
+ smmu->model = data->model;
+
+ legacy_binding = of_find_property(dev->of_node, "mmu-masters", NULL);
+ if (legacy_binding && !using_generic_binding) {
+ if (!using_legacy_binding) {
+ pr_notice("deprecated \"mmu-masters\" DT property in use; %s support unavailable\n",
+ IS_ENABLED(CONFIG_ARM_SMMU_LEGACY_DT_BINDINGS) ? "DMA API" : "SMMU");
+ }
+ using_legacy_binding = true;
+ } else if (!legacy_binding && !using_legacy_binding) {
+ using_generic_binding = true;
+ } else {
+ dev_err(dev, "not probing due to mismatched DT properties\n");
+ return -ENODEV;
+ }
+
+ if (of_dma_is_coherent(dev->of_node))
+ smmu->features |= ARM_SMMU_FEAT_COHERENT_WALK;
+
+ return 0;
+}
+
+static int arm_smmu_bus_init(struct iommu_ops *ops)
+{
+ int err;
+
+ /* Oh, for a proper bus abstraction */
+ if (!iommu_present(&platform_bus_type)) {
+ err = bus_set_iommu(&platform_bus_type, ops);
+ if (err)
+ return err;
+ }
+#ifdef CONFIG_ARM_AMBA
+ if (!iommu_present(&amba_bustype)) {
+ err = bus_set_iommu(&amba_bustype, ops);
+ if (err)
+ goto err_reset_platform_ops;
+ }
+#endif
+#ifdef CONFIG_PCI
+ if (!iommu_present(&pci_bus_type)) {
+ err = bus_set_iommu(&pci_bus_type, ops);
+ if (err)
+ goto err_reset_amba_ops;
+ }
+#endif
+#ifdef CONFIG_FSL_MC_BUS
+ if (!iommu_present(&fsl_mc_bus_type)) {
+ err = bus_set_iommu(&fsl_mc_bus_type, ops);
+ if (err)
+ goto err_reset_pci_ops;
+ }
+#endif
+ return 0;
+
+err_reset_pci_ops: __maybe_unused;
+#ifdef CONFIG_PCI
+ bus_set_iommu(&pci_bus_type, NULL);
+#endif
+err_reset_amba_ops: __maybe_unused;
+#ifdef CONFIG_ARM_AMBA
+ bus_set_iommu(&amba_bustype, NULL);
+#endif
+err_reset_platform_ops: __maybe_unused;
+ bus_set_iommu(&platform_bus_type, NULL);
+ return err;
+}
+
+static int arm_smmu_device_probe(struct platform_device *pdev)
+{
+ struct resource *res;
+ resource_size_t ioaddr;
+ struct arm_smmu_device *smmu;
+ struct device *dev = &pdev->dev;
+ int num_irqs, i, err;
+ irqreturn_t (*global_fault)(int irq, void *dev);
+
+ smmu = devm_kzalloc(dev, sizeof(*smmu), GFP_KERNEL);
+ if (!smmu) {
+ dev_err(dev, "failed to allocate arm_smmu_device\n");
+ return -ENOMEM;
+ }
+ smmu->dev = dev;
+
+ if (dev->of_node)
+ err = arm_smmu_device_dt_probe(pdev, smmu);
+ else
+ err = arm_smmu_device_acpi_probe(pdev, smmu);
+
+ if (err)
+ return err;
+
+ smmu->base = devm_platform_get_and_ioremap_resource(pdev, 0, &res);
+ if (IS_ERR(smmu->base))
+ return PTR_ERR(smmu->base);
+ ioaddr = res->start;
+ /*
+ * The resource size should effectively match the value of SMMU_TOP;
+ * stash that temporarily until we know PAGESIZE to validate it with.
+ */
+ smmu->numpage = resource_size(res);
+
+ smmu = arm_smmu_impl_init(smmu);
+ if (IS_ERR(smmu))
+ return PTR_ERR(smmu);
+
+ num_irqs = 0;
+ while ((res = platform_get_resource(pdev, IORESOURCE_IRQ, num_irqs))) {
+ num_irqs++;
+ if (num_irqs > smmu->num_global_irqs)
+ smmu->num_context_irqs++;
+ }
+
+ if (!smmu->num_context_irqs) {
+ dev_err(dev, "found %d interrupts but expected at least %d\n",
+ num_irqs, smmu->num_global_irqs + 1);
+ return -ENODEV;
+ }
+
+ smmu->irqs = devm_kcalloc(dev, num_irqs, sizeof(*smmu->irqs),
+ GFP_KERNEL);
+ if (!smmu->irqs) {
+ dev_err(dev, "failed to allocate %d irqs\n", num_irqs);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < num_irqs; ++i) {
+ int irq = platform_get_irq(pdev, i);
+
+ if (irq < 0)
+ return -ENODEV;
+ smmu->irqs[i] = irq;
+ }
+
+ err = devm_clk_bulk_get_all(dev, &smmu->clks);
+ if (err < 0) {
+ dev_err(dev, "failed to get clocks %d\n", err);
+ return err;
+ }
+ smmu->num_clks = err;
+
+ err = clk_bulk_prepare_enable(smmu->num_clks, smmu->clks);
+ if (err)
+ return err;
+
+ err = arm_smmu_device_cfg_probe(smmu);
+ if (err)
+ return err;
+
+ if (smmu->version == ARM_SMMU_V2) {
+ if (smmu->num_context_banks > smmu->num_context_irqs) {
+ dev_err(dev,
+ "found only %d context irq(s) but %d required\n",
+ smmu->num_context_irqs, smmu->num_context_banks);
+ return -ENODEV;
+ }
+
+ /* Ignore superfluous interrupts */
+ smmu->num_context_irqs = smmu->num_context_banks;
+ }
+
+ if (smmu->impl && smmu->impl->global_fault)
+ global_fault = smmu->impl->global_fault;
+ else
+ global_fault = arm_smmu_global_fault;
+
+ for (i = 0; i < smmu->num_global_irqs; ++i) {
+ err = devm_request_irq(smmu->dev, smmu->irqs[i],
+ global_fault,
+ IRQF_SHARED,
+ "arm-smmu global fault",
+ smmu);
+ if (err) {
+ dev_err(dev, "failed to request global IRQ %d (%u)\n",
+ i, smmu->irqs[i]);
+ return err;
+ }
+ }
+
+ err = iommu_device_sysfs_add(&smmu->iommu, smmu->dev, NULL,
+ "smmu.%pa", &ioaddr);
+ if (err) {
+ dev_err(dev, "Failed to register iommu in sysfs\n");
+ return err;
+ }
+
+ iommu_device_set_ops(&smmu->iommu, &arm_smmu_ops);
+ iommu_device_set_fwnode(&smmu->iommu, dev->fwnode);
+
+ err = iommu_device_register(&smmu->iommu);
+ if (err) {
+ dev_err(dev, "Failed to register iommu\n");
+ return err;
+ }
+
+ platform_set_drvdata(pdev, smmu);
+ arm_smmu_device_reset(smmu);
+ arm_smmu_test_smr_masks(smmu);
+
+ /*
+ * We want to avoid touching dev->power.lock in fastpaths unless
+ * it's really going to do something useful - pm_runtime_enabled()
+ * can serve as an ideal proxy for that decision. So, conditionally
+ * enable pm_runtime.
+ */
+ if (dev->pm_domain) {
+ pm_runtime_set_active(dev);
+ pm_runtime_enable(dev);
+ }
+
+ /*
+ * For ACPI and generic DT bindings, an SMMU will be probed before
+ * any device which might need it, so we want the bus ops in place
+ * ready to handle default domain setup as soon as any SMMU exists.
+ */
+ if (!using_legacy_binding)
+ return arm_smmu_bus_init(&arm_smmu_ops);
+
+ return 0;
+}
+
+static int arm_smmu_device_remove(struct platform_device *pdev)
+{
+ struct arm_smmu_device *smmu = platform_get_drvdata(pdev);
+
+ if (!smmu)
+ return -ENODEV;
+
+ if (!bitmap_empty(smmu->context_map, ARM_SMMU_MAX_CBS))
+ dev_notice(&pdev->dev, "disabling translation\n");
+
+ arm_smmu_bus_init(NULL);
+ iommu_device_unregister(&smmu->iommu);
+ iommu_device_sysfs_remove(&smmu->iommu);
+
+ arm_smmu_rpm_get(smmu);
+ /* Turn the thing off */
+ arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_sCR0, ARM_SMMU_sCR0_CLIENTPD);
+ arm_smmu_rpm_put(smmu);
+
+ if (pm_runtime_enabled(smmu->dev))
+ pm_runtime_force_suspend(smmu->dev);
+ else
+ clk_bulk_disable(smmu->num_clks, smmu->clks);
+
+ clk_bulk_unprepare(smmu->num_clks, smmu->clks);
+ return 0;
+}
+
+static void arm_smmu_device_shutdown(struct platform_device *pdev)
+{
+ arm_smmu_device_remove(pdev);
+}
+
+static int __maybe_unused arm_smmu_runtime_resume(struct device *dev)
+{
+ struct arm_smmu_device *smmu = dev_get_drvdata(dev);
+ int ret;
+
+ ret = clk_bulk_enable(smmu->num_clks, smmu->clks);
+ if (ret)
+ return ret;
+
+ arm_smmu_device_reset(smmu);
+
+ return 0;
+}
+
+static int __maybe_unused arm_smmu_runtime_suspend(struct device *dev)
+{
+ struct arm_smmu_device *smmu = dev_get_drvdata(dev);
+
+ clk_bulk_disable(smmu->num_clks, smmu->clks);
+
+ return 0;
+}
+
+static int __maybe_unused arm_smmu_pm_resume(struct device *dev)
+{
+ if (pm_runtime_suspended(dev))
+ return 0;
+
+ return arm_smmu_runtime_resume(dev);
+}
+
+static int __maybe_unused arm_smmu_pm_suspend(struct device *dev)
+{
+ if (pm_runtime_suspended(dev))
+ return 0;
+
+ return arm_smmu_runtime_suspend(dev);
+}
+
+static const struct dev_pm_ops arm_smmu_pm_ops = {
+ SET_SYSTEM_SLEEP_PM_OPS(arm_smmu_pm_suspend, arm_smmu_pm_resume)
+ SET_RUNTIME_PM_OPS(arm_smmu_runtime_suspend,
+ arm_smmu_runtime_resume, NULL)
+};
+
+static struct platform_driver arm_smmu_driver = {
+ .driver = {
+ .name = "arm-smmu",
+ .of_match_table = arm_smmu_of_match,
+ .pm = &arm_smmu_pm_ops,
+ .suppress_bind_attrs = true,
+ },
+ .probe = arm_smmu_device_probe,
+ .remove = arm_smmu_device_remove,
+ .shutdown = arm_smmu_device_shutdown,
+};
+module_platform_driver(arm_smmu_driver);
+
+MODULE_DESCRIPTION("IOMMU API for ARM architected SMMU implementations");
+MODULE_AUTHOR("Will Deacon <will@kernel.org>");
+MODULE_ALIAS("platform:arm-smmu");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h
new file mode 100644
index 000000000..b71647eaa
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -0,0 +1,528 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * IOMMU API for ARM architected SMMU implementations.
+ *
+ * Copyright (C) 2013 ARM Limited
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ */
+
+#ifndef _ARM_SMMU_H
+#define _ARM_SMMU_H
+
+#include <linux/atomic.h>
+#include <linux/bitfield.h>
+#include <linux/bits.h>
+#include <linux/clk.h>
+#include <linux/device.h>
+#include <linux/io-64-nonatomic-hi-lo.h>
+#include <linux/io-pgtable.h>
+#include <linux/iommu.h>
+#include <linux/irqreturn.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+/* Configuration registers */
+#define ARM_SMMU_GR0_sCR0 0x0
+#define ARM_SMMU_sCR0_VMID16EN BIT(31)
+#define ARM_SMMU_sCR0_BSU GENMASK(15, 14)
+#define ARM_SMMU_sCR0_FB BIT(13)
+#define ARM_SMMU_sCR0_PTM BIT(12)
+#define ARM_SMMU_sCR0_VMIDPNE BIT(11)
+#define ARM_SMMU_sCR0_USFCFG BIT(10)
+#define ARM_SMMU_sCR0_GCFGFIE BIT(5)
+#define ARM_SMMU_sCR0_GCFGFRE BIT(4)
+#define ARM_SMMU_sCR0_EXIDENABLE BIT(3)
+#define ARM_SMMU_sCR0_GFIE BIT(2)
+#define ARM_SMMU_sCR0_GFRE BIT(1)
+#define ARM_SMMU_sCR0_CLIENTPD BIT(0)
+
+/* Auxiliary Configuration register */
+#define ARM_SMMU_GR0_sACR 0x10
+
+/* Identification registers */
+#define ARM_SMMU_GR0_ID0 0x20
+#define ARM_SMMU_ID0_S1TS BIT(30)
+#define ARM_SMMU_ID0_S2TS BIT(29)
+#define ARM_SMMU_ID0_NTS BIT(28)
+#define ARM_SMMU_ID0_SMS BIT(27)
+#define ARM_SMMU_ID0_ATOSNS BIT(26)
+#define ARM_SMMU_ID0_PTFS_NO_AARCH32 BIT(25)
+#define ARM_SMMU_ID0_PTFS_NO_AARCH32S BIT(24)
+#define ARM_SMMU_ID0_NUMIRPT GENMASK(23, 16)
+#define ARM_SMMU_ID0_CTTW BIT(14)
+#define ARM_SMMU_ID0_NUMSIDB GENMASK(12, 9)
+#define ARM_SMMU_ID0_EXIDS BIT(8)
+#define ARM_SMMU_ID0_NUMSMRG GENMASK(7, 0)
+
+#define ARM_SMMU_GR0_ID1 0x24
+#define ARM_SMMU_ID1_PAGESIZE BIT(31)
+#define ARM_SMMU_ID1_NUMPAGENDXB GENMASK(30, 28)
+#define ARM_SMMU_ID1_NUMS2CB GENMASK(23, 16)
+#define ARM_SMMU_ID1_NUMCB GENMASK(7, 0)
+
+#define ARM_SMMU_GR0_ID2 0x28
+#define ARM_SMMU_ID2_VMID16 BIT(15)
+#define ARM_SMMU_ID2_PTFS_64K BIT(14)
+#define ARM_SMMU_ID2_PTFS_16K BIT(13)
+#define ARM_SMMU_ID2_PTFS_4K BIT(12)
+#define ARM_SMMU_ID2_UBS GENMASK(11, 8)
+#define ARM_SMMU_ID2_OAS GENMASK(7, 4)
+#define ARM_SMMU_ID2_IAS GENMASK(3, 0)
+
+#define ARM_SMMU_GR0_ID3 0x2c
+#define ARM_SMMU_GR0_ID4 0x30
+#define ARM_SMMU_GR0_ID5 0x34
+#define ARM_SMMU_GR0_ID6 0x38
+
+#define ARM_SMMU_GR0_ID7 0x3c
+#define ARM_SMMU_ID7_MAJOR GENMASK(7, 4)
+#define ARM_SMMU_ID7_MINOR GENMASK(3, 0)
+
+#define ARM_SMMU_GR0_sGFSR 0x48
+#define ARM_SMMU_sGFSR_USF BIT(1)
+
+#define ARM_SMMU_GR0_sGFSYNR0 0x50
+#define ARM_SMMU_GR0_sGFSYNR1 0x54
+#define ARM_SMMU_GR0_sGFSYNR2 0x58
+
+/* Global TLB invalidation */
+#define ARM_SMMU_GR0_TLBIVMID 0x64
+#define ARM_SMMU_GR0_TLBIALLNSNH 0x68
+#define ARM_SMMU_GR0_TLBIALLH 0x6c
+#define ARM_SMMU_GR0_sTLBGSYNC 0x70
+
+#define ARM_SMMU_GR0_sTLBGSTATUS 0x74
+#define ARM_SMMU_sTLBGSTATUS_GSACTIVE BIT(0)
+
+/* Stream mapping registers */
+#define ARM_SMMU_GR0_SMR(n) (0x800 + ((n) << 2))
+#define ARM_SMMU_SMR_VALID BIT(31)
+#define ARM_SMMU_SMR_MASK GENMASK(31, 16)
+#define ARM_SMMU_SMR_ID GENMASK(15, 0)
+
+#define ARM_SMMU_GR0_S2CR(n) (0xc00 + ((n) << 2))
+#define ARM_SMMU_S2CR_PRIVCFG GENMASK(25, 24)
+enum arm_smmu_s2cr_privcfg {
+ S2CR_PRIVCFG_DEFAULT,
+ S2CR_PRIVCFG_DIPAN,
+ S2CR_PRIVCFG_UNPRIV,
+ S2CR_PRIVCFG_PRIV,
+};
+#define ARM_SMMU_S2CR_TYPE GENMASK(17, 16)
+enum arm_smmu_s2cr_type {
+ S2CR_TYPE_TRANS,
+ S2CR_TYPE_BYPASS,
+ S2CR_TYPE_FAULT,
+};
+#define ARM_SMMU_S2CR_EXIDVALID BIT(10)
+#define ARM_SMMU_S2CR_CBNDX GENMASK(7, 0)
+
+/* Context bank attribute registers */
+#define ARM_SMMU_GR1_CBAR(n) (0x0 + ((n) << 2))
+#define ARM_SMMU_CBAR_IRPTNDX GENMASK(31, 24)
+#define ARM_SMMU_CBAR_TYPE GENMASK(17, 16)
+enum arm_smmu_cbar_type {
+ CBAR_TYPE_S2_TRANS,
+ CBAR_TYPE_S1_TRANS_S2_BYPASS,
+ CBAR_TYPE_S1_TRANS_S2_FAULT,
+ CBAR_TYPE_S1_TRANS_S2_TRANS,
+};
+#define ARM_SMMU_CBAR_S1_MEMATTR GENMASK(15, 12)
+#define ARM_SMMU_CBAR_S1_MEMATTR_WB 0xf
+#define ARM_SMMU_CBAR_S1_BPSHCFG GENMASK(9, 8)
+#define ARM_SMMU_CBAR_S1_BPSHCFG_NSH 3
+#define ARM_SMMU_CBAR_VMID GENMASK(7, 0)
+
+#define ARM_SMMU_GR1_CBFRSYNRA(n) (0x400 + ((n) << 2))
+
+#define ARM_SMMU_GR1_CBA2R(n) (0x800 + ((n) << 2))
+#define ARM_SMMU_CBA2R_VMID16 GENMASK(31, 16)
+#define ARM_SMMU_CBA2R_VA64 BIT(0)
+
+#define ARM_SMMU_CB_SCTLR 0x0
+#define ARM_SMMU_SCTLR_S1_ASIDPNE BIT(12)
+#define ARM_SMMU_SCTLR_CFCFG BIT(7)
+#define ARM_SMMU_SCTLR_CFIE BIT(6)
+#define ARM_SMMU_SCTLR_CFRE BIT(5)
+#define ARM_SMMU_SCTLR_E BIT(4)
+#define ARM_SMMU_SCTLR_AFE BIT(2)
+#define ARM_SMMU_SCTLR_TRE BIT(1)
+#define ARM_SMMU_SCTLR_M BIT(0)
+
+#define ARM_SMMU_CB_ACTLR 0x4
+
+#define ARM_SMMU_CB_RESUME 0x8
+#define ARM_SMMU_RESUME_TERMINATE BIT(0)
+
+#define ARM_SMMU_CB_TCR2 0x10
+#define ARM_SMMU_TCR2_SEP GENMASK(17, 15)
+#define ARM_SMMU_TCR2_SEP_UPSTREAM 0x7
+#define ARM_SMMU_TCR2_AS BIT(4)
+#define ARM_SMMU_TCR2_PASIZE GENMASK(3, 0)
+
+#define ARM_SMMU_CB_TTBR0 0x20
+#define ARM_SMMU_CB_TTBR1 0x28
+#define ARM_SMMU_TTBRn_ASID GENMASK_ULL(63, 48)
+
+#define ARM_SMMU_CB_TCR 0x30
+#define ARM_SMMU_TCR_EAE BIT(31)
+#define ARM_SMMU_TCR_EPD1 BIT(23)
+#define ARM_SMMU_TCR_A1 BIT(22)
+#define ARM_SMMU_TCR_TG0 GENMASK(15, 14)
+#define ARM_SMMU_TCR_SH0 GENMASK(13, 12)
+#define ARM_SMMU_TCR_ORGN0 GENMASK(11, 10)
+#define ARM_SMMU_TCR_IRGN0 GENMASK(9, 8)
+#define ARM_SMMU_TCR_EPD0 BIT(7)
+#define ARM_SMMU_TCR_T0SZ GENMASK(5, 0)
+
+#define ARM_SMMU_VTCR_RES1 BIT(31)
+#define ARM_SMMU_VTCR_PS GENMASK(18, 16)
+#define ARM_SMMU_VTCR_TG0 ARM_SMMU_TCR_TG0
+#define ARM_SMMU_VTCR_SH0 ARM_SMMU_TCR_SH0
+#define ARM_SMMU_VTCR_ORGN0 ARM_SMMU_TCR_ORGN0
+#define ARM_SMMU_VTCR_IRGN0 ARM_SMMU_TCR_IRGN0
+#define ARM_SMMU_VTCR_SL0 GENMASK(7, 6)
+#define ARM_SMMU_VTCR_T0SZ ARM_SMMU_TCR_T0SZ
+
+#define ARM_SMMU_CB_CONTEXTIDR 0x34
+#define ARM_SMMU_CB_S1_MAIR0 0x38
+#define ARM_SMMU_CB_S1_MAIR1 0x3c
+
+#define ARM_SMMU_CB_PAR 0x50
+#define ARM_SMMU_CB_PAR_F BIT(0)
+
+#define ARM_SMMU_CB_FSR 0x58
+#define ARM_SMMU_FSR_MULTI BIT(31)
+#define ARM_SMMU_FSR_SS BIT(30)
+#define ARM_SMMU_FSR_UUT BIT(8)
+#define ARM_SMMU_FSR_ASF BIT(7)
+#define ARM_SMMU_FSR_TLBLKF BIT(6)
+#define ARM_SMMU_FSR_TLBMCF BIT(5)
+#define ARM_SMMU_FSR_EF BIT(4)
+#define ARM_SMMU_FSR_PF BIT(3)
+#define ARM_SMMU_FSR_AFF BIT(2)
+#define ARM_SMMU_FSR_TF BIT(1)
+
+#define ARM_SMMU_FSR_IGN (ARM_SMMU_FSR_AFF | \
+ ARM_SMMU_FSR_ASF | \
+ ARM_SMMU_FSR_TLBMCF | \
+ ARM_SMMU_FSR_TLBLKF)
+
+#define ARM_SMMU_FSR_FAULT (ARM_SMMU_FSR_MULTI | \
+ ARM_SMMU_FSR_SS | \
+ ARM_SMMU_FSR_UUT | \
+ ARM_SMMU_FSR_EF | \
+ ARM_SMMU_FSR_PF | \
+ ARM_SMMU_FSR_TF | \
+ ARM_SMMU_FSR_IGN)
+
+#define ARM_SMMU_CB_FAR 0x60
+
+#define ARM_SMMU_CB_FSYNR0 0x68
+#define ARM_SMMU_FSYNR0_WNR BIT(4)
+
+#define ARM_SMMU_CB_S1_TLBIVA 0x600
+#define ARM_SMMU_CB_S1_TLBIASID 0x610
+#define ARM_SMMU_CB_S1_TLBIVAL 0x620
+#define ARM_SMMU_CB_S2_TLBIIPAS2 0x630
+#define ARM_SMMU_CB_S2_TLBIIPAS2L 0x638
+#define ARM_SMMU_CB_TLBSYNC 0x7f0
+#define ARM_SMMU_CB_TLBSTATUS 0x7f4
+#define ARM_SMMU_CB_ATS1PR 0x800
+
+#define ARM_SMMU_CB_ATSR 0x8f0
+#define ARM_SMMU_ATSR_ACTIVE BIT(0)
+
+
+/* Maximum number of context banks per SMMU */
+#define ARM_SMMU_MAX_CBS 128
+
+#define TLB_LOOP_TIMEOUT 1000000 /* 1s! */
+#define TLB_SPIN_COUNT 10
+
+/* Shared driver definitions */
+enum arm_smmu_arch_version {
+ ARM_SMMU_V1,
+ ARM_SMMU_V1_64K,
+ ARM_SMMU_V2,
+};
+
+enum arm_smmu_implementation {
+ GENERIC_SMMU,
+ ARM_MMU500,
+ CAVIUM_SMMUV2,
+ QCOM_SMMUV2,
+};
+
+struct arm_smmu_s2cr {
+ struct iommu_group *group;
+ int count;
+ enum arm_smmu_s2cr_type type;
+ enum arm_smmu_s2cr_privcfg privcfg;
+ u8 cbndx;
+};
+
+struct arm_smmu_smr {
+ u16 mask;
+ u16 id;
+ bool valid;
+ bool pinned;
+};
+
+struct arm_smmu_device {
+ struct device *dev;
+
+ void __iomem *base;
+ unsigned int numpage;
+ unsigned int pgshift;
+
+#define ARM_SMMU_FEAT_COHERENT_WALK (1 << 0)
+#define ARM_SMMU_FEAT_STREAM_MATCH (1 << 1)
+#define ARM_SMMU_FEAT_TRANS_S1 (1 << 2)
+#define ARM_SMMU_FEAT_TRANS_S2 (1 << 3)
+#define ARM_SMMU_FEAT_TRANS_NESTED (1 << 4)
+#define ARM_SMMU_FEAT_TRANS_OPS (1 << 5)
+#define ARM_SMMU_FEAT_VMID16 (1 << 6)
+#define ARM_SMMU_FEAT_FMT_AARCH64_4K (1 << 7)
+#define ARM_SMMU_FEAT_FMT_AARCH64_16K (1 << 8)
+#define ARM_SMMU_FEAT_FMT_AARCH64_64K (1 << 9)
+#define ARM_SMMU_FEAT_FMT_AARCH32_L (1 << 10)
+#define ARM_SMMU_FEAT_FMT_AARCH32_S (1 << 11)
+#define ARM_SMMU_FEAT_EXIDS (1 << 12)
+ u32 features;
+
+ enum arm_smmu_arch_version version;
+ enum arm_smmu_implementation model;
+ const struct arm_smmu_impl *impl;
+
+ u32 num_context_banks;
+ u32 num_s2_context_banks;
+ DECLARE_BITMAP(context_map, ARM_SMMU_MAX_CBS);
+ struct arm_smmu_cb *cbs;
+ atomic_t irptndx;
+
+ u32 num_mapping_groups;
+ u16 streamid_mask;
+ u16 smr_mask_mask;
+ struct arm_smmu_smr *smrs;
+ struct arm_smmu_s2cr *s2crs;
+ struct mutex stream_map_mutex;
+
+ unsigned long va_size;
+ unsigned long ipa_size;
+ unsigned long pa_size;
+ unsigned long pgsize_bitmap;
+
+ u32 num_global_irqs;
+ u32 num_context_irqs;
+ unsigned int *irqs;
+ struct clk_bulk_data *clks;
+ int num_clks;
+
+ spinlock_t global_sync_lock;
+
+ /* IOMMU core code handle */
+ struct iommu_device iommu;
+};
+
+enum arm_smmu_context_fmt {
+ ARM_SMMU_CTX_FMT_NONE,
+ ARM_SMMU_CTX_FMT_AARCH64,
+ ARM_SMMU_CTX_FMT_AARCH32_L,
+ ARM_SMMU_CTX_FMT_AARCH32_S,
+};
+
+struct arm_smmu_cfg {
+ u8 cbndx;
+ u8 irptndx;
+ union {
+ u16 asid;
+ u16 vmid;
+ };
+ enum arm_smmu_cbar_type cbar;
+ enum arm_smmu_context_fmt fmt;
+};
+#define ARM_SMMU_INVALID_IRPTNDX 0xff
+
+struct arm_smmu_cb {
+ u64 ttbr[2];
+ u32 tcr[2];
+ u32 mair[2];
+ struct arm_smmu_cfg *cfg;
+};
+
+enum arm_smmu_domain_stage {
+ ARM_SMMU_DOMAIN_S1 = 0,
+ ARM_SMMU_DOMAIN_S2,
+ ARM_SMMU_DOMAIN_NESTED,
+ ARM_SMMU_DOMAIN_BYPASS,
+};
+
+struct arm_smmu_domain {
+ struct arm_smmu_device *smmu;
+ struct io_pgtable_ops *pgtbl_ops;
+ const struct iommu_flush_ops *flush_ops;
+ struct arm_smmu_cfg cfg;
+ enum arm_smmu_domain_stage stage;
+ bool non_strict;
+ struct mutex init_mutex; /* Protects smmu pointer */
+ spinlock_t cb_lock; /* Serialises ATS1* ops and TLB syncs */
+ struct iommu_domain domain;
+};
+
+struct arm_smmu_master_cfg {
+ struct arm_smmu_device *smmu;
+ s16 smendx[];
+};
+
+static inline u32 arm_smmu_lpae_tcr(const struct io_pgtable_cfg *cfg)
+{
+ u32 tcr = FIELD_PREP(ARM_SMMU_TCR_TG0, cfg->arm_lpae_s1_cfg.tcr.tg) |
+ FIELD_PREP(ARM_SMMU_TCR_SH0, cfg->arm_lpae_s1_cfg.tcr.sh) |
+ FIELD_PREP(ARM_SMMU_TCR_ORGN0, cfg->arm_lpae_s1_cfg.tcr.orgn) |
+ FIELD_PREP(ARM_SMMU_TCR_IRGN0, cfg->arm_lpae_s1_cfg.tcr.irgn) |
+ FIELD_PREP(ARM_SMMU_TCR_T0SZ, cfg->arm_lpae_s1_cfg.tcr.tsz);
+
+ /*
+ * When TTBR1 is selected shift the TCR fields by 16 bits and disable
+ * translation in TTBR0
+ */
+ if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1) {
+ tcr = (tcr << 16) & ~ARM_SMMU_TCR_A1;
+ tcr |= ARM_SMMU_TCR_EPD0;
+ } else
+ tcr |= ARM_SMMU_TCR_EPD1;
+
+ return tcr;
+}
+
+static inline u32 arm_smmu_lpae_tcr2(const struct io_pgtable_cfg *cfg)
+{
+ return FIELD_PREP(ARM_SMMU_TCR2_PASIZE, cfg->arm_lpae_s1_cfg.tcr.ips) |
+ FIELD_PREP(ARM_SMMU_TCR2_SEP, ARM_SMMU_TCR2_SEP_UPSTREAM);
+}
+
+static inline u32 arm_smmu_lpae_vtcr(const struct io_pgtable_cfg *cfg)
+{
+ return ARM_SMMU_VTCR_RES1 |
+ FIELD_PREP(ARM_SMMU_VTCR_PS, cfg->arm_lpae_s2_cfg.vtcr.ps) |
+ FIELD_PREP(ARM_SMMU_VTCR_TG0, cfg->arm_lpae_s2_cfg.vtcr.tg) |
+ FIELD_PREP(ARM_SMMU_VTCR_SH0, cfg->arm_lpae_s2_cfg.vtcr.sh) |
+ FIELD_PREP(ARM_SMMU_VTCR_ORGN0, cfg->arm_lpae_s2_cfg.vtcr.orgn) |
+ FIELD_PREP(ARM_SMMU_VTCR_IRGN0, cfg->arm_lpae_s2_cfg.vtcr.irgn) |
+ FIELD_PREP(ARM_SMMU_VTCR_SL0, cfg->arm_lpae_s2_cfg.vtcr.sl) |
+ FIELD_PREP(ARM_SMMU_VTCR_T0SZ, cfg->arm_lpae_s2_cfg.vtcr.tsz);
+}
+
+/* Implementation details, yay! */
+struct arm_smmu_impl {
+ u32 (*read_reg)(struct arm_smmu_device *smmu, int page, int offset);
+ void (*write_reg)(struct arm_smmu_device *smmu, int page, int offset,
+ u32 val);
+ u64 (*read_reg64)(struct arm_smmu_device *smmu, int page, int offset);
+ void (*write_reg64)(struct arm_smmu_device *smmu, int page, int offset,
+ u64 val);
+ int (*cfg_probe)(struct arm_smmu_device *smmu);
+ int (*reset)(struct arm_smmu_device *smmu);
+ int (*init_context)(struct arm_smmu_domain *smmu_domain,
+ struct io_pgtable_cfg *cfg, struct device *dev);
+ void (*tlb_sync)(struct arm_smmu_device *smmu, int page, int sync,
+ int status);
+ int (*def_domain_type)(struct device *dev);
+ irqreturn_t (*global_fault)(int irq, void *dev);
+ irqreturn_t (*context_fault)(int irq, void *dev);
+ int (*alloc_context_bank)(struct arm_smmu_domain *smmu_domain,
+ struct arm_smmu_device *smmu,
+ struct device *dev, int start);
+ void (*write_s2cr)(struct arm_smmu_device *smmu, int idx);
+};
+
+#define INVALID_SMENDX -1
+#define cfg_smendx(cfg, fw, i) \
+ (i >= fw->num_ids ? INVALID_SMENDX : cfg->smendx[i])
+#define for_each_cfg_sme(cfg, fw, i, idx) \
+ for (i = 0; idx = cfg_smendx(cfg, fw, i), i < fw->num_ids; ++i)
+
+static inline int __arm_smmu_alloc_bitmap(unsigned long *map, int start, int end)
+{
+ int idx;
+
+ do {
+ idx = find_next_zero_bit(map, end, start);
+ if (idx == end)
+ return -ENOSPC;
+ } while (test_and_set_bit(idx, map));
+
+ return idx;
+}
+
+static inline void __iomem *arm_smmu_page(struct arm_smmu_device *smmu, int n)
+{
+ return smmu->base + (n << smmu->pgshift);
+}
+
+static inline u32 arm_smmu_readl(struct arm_smmu_device *smmu, int page, int offset)
+{
+ if (smmu->impl && unlikely(smmu->impl->read_reg))
+ return smmu->impl->read_reg(smmu, page, offset);
+ return readl_relaxed(arm_smmu_page(smmu, page) + offset);
+}
+
+static inline void arm_smmu_writel(struct arm_smmu_device *smmu, int page,
+ int offset, u32 val)
+{
+ if (smmu->impl && unlikely(smmu->impl->write_reg))
+ smmu->impl->write_reg(smmu, page, offset, val);
+ else
+ writel_relaxed(val, arm_smmu_page(smmu, page) + offset);
+}
+
+static inline u64 arm_smmu_readq(struct arm_smmu_device *smmu, int page, int offset)
+{
+ if (smmu->impl && unlikely(smmu->impl->read_reg64))
+ return smmu->impl->read_reg64(smmu, page, offset);
+ return readq_relaxed(arm_smmu_page(smmu, page) + offset);
+}
+
+static inline void arm_smmu_writeq(struct arm_smmu_device *smmu, int page,
+ int offset, u64 val)
+{
+ if (smmu->impl && unlikely(smmu->impl->write_reg64))
+ smmu->impl->write_reg64(smmu, page, offset, val);
+ else
+ writeq_relaxed(val, arm_smmu_page(smmu, page) + offset);
+}
+
+#define ARM_SMMU_GR0 0
+#define ARM_SMMU_GR1 1
+#define ARM_SMMU_CB(s, n) ((s)->numpage + (n))
+
+#define arm_smmu_gr0_read(s, o) \
+ arm_smmu_readl((s), ARM_SMMU_GR0, (o))
+#define arm_smmu_gr0_write(s, o, v) \
+ arm_smmu_writel((s), ARM_SMMU_GR0, (o), (v))
+
+#define arm_smmu_gr1_read(s, o) \
+ arm_smmu_readl((s), ARM_SMMU_GR1, (o))
+#define arm_smmu_gr1_write(s, o, v) \
+ arm_smmu_writel((s), ARM_SMMU_GR1, (o), (v))
+
+#define arm_smmu_cb_read(s, n, o) \
+ arm_smmu_readl((s), ARM_SMMU_CB((s), (n)), (o))
+#define arm_smmu_cb_write(s, n, o, v) \
+ arm_smmu_writel((s), ARM_SMMU_CB((s), (n)), (o), (v))
+#define arm_smmu_cb_readq(s, n, o) \
+ arm_smmu_readq((s), ARM_SMMU_CB((s), (n)), (o))
+#define arm_smmu_cb_writeq(s, n, o, v) \
+ arm_smmu_writeq((s), ARM_SMMU_CB((s), (n)), (o), (v))
+
+struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu);
+struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu);
+struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu);
+
+void arm_smmu_write_context_bank(struct arm_smmu_device *smmu, int idx);
+int arm_mmu500_reset(struct arm_smmu_device *smmu);
+
+#endif /* _ARM_SMMU_H */
diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
new file mode 100644
index 000000000..37c8f75a3
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
@@ -0,0 +1,953 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IOMMU API for QCOM secure IOMMUs. Somewhat based on arm-smmu.c
+ *
+ * Copyright (C) 2013 ARM Limited
+ * Copyright (C) 2017 Red Hat
+ */
+
+#include <linux/atomic.h>
+#include <linux/bitfield.h>
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/dma-iommu.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/io-64-nonatomic-hi-lo.h>
+#include <linux/io-pgtable.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/kconfig.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/of_iommu.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/pm_runtime.h>
+#include <linux/qcom_scm.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "arm-smmu.h"
+
+#define SMMU_INTR_SEL_NS 0x2000
+
+enum qcom_iommu_clk {
+ CLK_IFACE,
+ CLK_BUS,
+ CLK_TBU,
+ CLK_NUM,
+};
+
+struct qcom_iommu_ctx;
+
+struct qcom_iommu_dev {
+ /* IOMMU core code handle */
+ struct iommu_device iommu;
+ struct device *dev;
+ struct clk_bulk_data clks[CLK_NUM];
+ void __iomem *local_base;
+ u32 sec_id;
+ u8 num_ctxs;
+ struct qcom_iommu_ctx *ctxs[]; /* indexed by asid-1 */
+};
+
+struct qcom_iommu_ctx {
+ struct device *dev;
+ void __iomem *base;
+ bool secure_init;
+ u8 asid; /* asid and ctx bank # are 1:1 */
+ struct iommu_domain *domain;
+};
+
+struct qcom_iommu_domain {
+ struct io_pgtable_ops *pgtbl_ops;
+ spinlock_t pgtbl_lock;
+ struct mutex init_mutex; /* Protects iommu pointer */
+ struct iommu_domain domain;
+ struct qcom_iommu_dev *iommu;
+ struct iommu_fwspec *fwspec;
+};
+
+static struct qcom_iommu_domain *to_qcom_iommu_domain(struct iommu_domain *dom)
+{
+ return container_of(dom, struct qcom_iommu_domain, domain);
+}
+
+static const struct iommu_ops qcom_iommu_ops;
+
+static struct qcom_iommu_dev * to_iommu(struct device *dev)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+ if (!fwspec || fwspec->ops != &qcom_iommu_ops)
+ return NULL;
+
+ return dev_iommu_priv_get(dev);
+}
+
+static struct qcom_iommu_ctx * to_ctx(struct qcom_iommu_domain *d, unsigned asid)
+{
+ struct qcom_iommu_dev *qcom_iommu = d->iommu;
+ if (!qcom_iommu)
+ return NULL;
+ return qcom_iommu->ctxs[asid - 1];
+}
+
+static inline void
+iommu_writel(struct qcom_iommu_ctx *ctx, unsigned reg, u32 val)
+{
+ writel_relaxed(val, ctx->base + reg);
+}
+
+static inline void
+iommu_writeq(struct qcom_iommu_ctx *ctx, unsigned reg, u64 val)
+{
+ writeq_relaxed(val, ctx->base + reg);
+}
+
+static inline u32
+iommu_readl(struct qcom_iommu_ctx *ctx, unsigned reg)
+{
+ return readl_relaxed(ctx->base + reg);
+}
+
+static inline u64
+iommu_readq(struct qcom_iommu_ctx *ctx, unsigned reg)
+{
+ return readq_relaxed(ctx->base + reg);
+}
+
+static void qcom_iommu_tlb_sync(void *cookie)
+{
+ struct qcom_iommu_domain *qcom_domain = cookie;
+ struct iommu_fwspec *fwspec = qcom_domain->fwspec;
+ unsigned i;
+
+ for (i = 0; i < fwspec->num_ids; i++) {
+ struct qcom_iommu_ctx *ctx = to_ctx(qcom_domain, fwspec->ids[i]);
+ unsigned int val, ret;
+
+ iommu_writel(ctx, ARM_SMMU_CB_TLBSYNC, 0);
+
+ ret = readl_poll_timeout(ctx->base + ARM_SMMU_CB_TLBSTATUS, val,
+ (val & 0x1) == 0, 0, 5000000);
+ if (ret)
+ dev_err(ctx->dev, "timeout waiting for TLB SYNC\n");
+ }
+}
+
+static void qcom_iommu_tlb_inv_context(void *cookie)
+{
+ struct qcom_iommu_domain *qcom_domain = cookie;
+ struct iommu_fwspec *fwspec = qcom_domain->fwspec;
+ unsigned i;
+
+ for (i = 0; i < fwspec->num_ids; i++) {
+ struct qcom_iommu_ctx *ctx = to_ctx(qcom_domain, fwspec->ids[i]);
+ iommu_writel(ctx, ARM_SMMU_CB_S1_TLBIASID, ctx->asid);
+ }
+
+ qcom_iommu_tlb_sync(cookie);
+}
+
+static void qcom_iommu_tlb_inv_range_nosync(unsigned long iova, size_t size,
+ size_t granule, bool leaf, void *cookie)
+{
+ struct qcom_iommu_domain *qcom_domain = cookie;
+ struct iommu_fwspec *fwspec = qcom_domain->fwspec;
+ unsigned i, reg;
+
+ reg = leaf ? ARM_SMMU_CB_S1_TLBIVAL : ARM_SMMU_CB_S1_TLBIVA;
+
+ for (i = 0; i < fwspec->num_ids; i++) {
+ struct qcom_iommu_ctx *ctx = to_ctx(qcom_domain, fwspec->ids[i]);
+ size_t s = size;
+
+ iova = (iova >> 12) << 12;
+ iova |= ctx->asid;
+ do {
+ iommu_writel(ctx, reg, iova);
+ iova += granule;
+ } while (s -= granule);
+ }
+}
+
+static void qcom_iommu_tlb_flush_walk(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ qcom_iommu_tlb_inv_range_nosync(iova, size, granule, false, cookie);
+ qcom_iommu_tlb_sync(cookie);
+}
+
+static void qcom_iommu_tlb_flush_leaf(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ qcom_iommu_tlb_inv_range_nosync(iova, size, granule, true, cookie);
+ qcom_iommu_tlb_sync(cookie);
+}
+
+static void qcom_iommu_tlb_add_page(struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t granule,
+ void *cookie)
+{
+ qcom_iommu_tlb_inv_range_nosync(iova, granule, granule, true, cookie);
+}
+
+static const struct iommu_flush_ops qcom_flush_ops = {
+ .tlb_flush_all = qcom_iommu_tlb_inv_context,
+ .tlb_flush_walk = qcom_iommu_tlb_flush_walk,
+ .tlb_flush_leaf = qcom_iommu_tlb_flush_leaf,
+ .tlb_add_page = qcom_iommu_tlb_add_page,
+};
+
+static irqreturn_t qcom_iommu_fault(int irq, void *dev)
+{
+ struct qcom_iommu_ctx *ctx = dev;
+ u32 fsr, fsynr;
+ u64 iova;
+
+ fsr = iommu_readl(ctx, ARM_SMMU_CB_FSR);
+
+ if (!(fsr & ARM_SMMU_FSR_FAULT))
+ return IRQ_NONE;
+
+ fsynr = iommu_readl(ctx, ARM_SMMU_CB_FSYNR0);
+ iova = iommu_readq(ctx, ARM_SMMU_CB_FAR);
+
+ if (!report_iommu_fault(ctx->domain, ctx->dev, iova, 0)) {
+ dev_err_ratelimited(ctx->dev,
+ "Unhandled context fault: fsr=0x%x, "
+ "iova=0x%016llx, fsynr=0x%x, cb=%d\n",
+ fsr, iova, fsynr, ctx->asid);
+ }
+
+ iommu_writel(ctx, ARM_SMMU_CB_FSR, fsr);
+ iommu_writel(ctx, ARM_SMMU_CB_RESUME, ARM_SMMU_RESUME_TERMINATE);
+
+ return IRQ_HANDLED;
+}
+
+static int qcom_iommu_init_domain(struct iommu_domain *domain,
+ struct qcom_iommu_dev *qcom_iommu,
+ struct device *dev)
+{
+ struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct io_pgtable_ops *pgtbl_ops;
+ struct io_pgtable_cfg pgtbl_cfg;
+ int i, ret = 0;
+ u32 reg;
+
+ mutex_lock(&qcom_domain->init_mutex);
+ if (qcom_domain->iommu)
+ goto out_unlock;
+
+ pgtbl_cfg = (struct io_pgtable_cfg) {
+ .pgsize_bitmap = qcom_iommu_ops.pgsize_bitmap,
+ .ias = 32,
+ .oas = 40,
+ .tlb = &qcom_flush_ops,
+ .iommu_dev = qcom_iommu->dev,
+ };
+
+ qcom_domain->iommu = qcom_iommu;
+ qcom_domain->fwspec = fwspec;
+
+ pgtbl_ops = alloc_io_pgtable_ops(ARM_32_LPAE_S1, &pgtbl_cfg, qcom_domain);
+ if (!pgtbl_ops) {
+ dev_err(qcom_iommu->dev, "failed to allocate pagetable ops\n");
+ ret = -ENOMEM;
+ goto out_clear_iommu;
+ }
+
+ /* Update the domain's page sizes to reflect the page table format */
+ domain->pgsize_bitmap = pgtbl_cfg.pgsize_bitmap;
+ domain->geometry.aperture_end = (1ULL << pgtbl_cfg.ias) - 1;
+ domain->geometry.force_aperture = true;
+
+ for (i = 0; i < fwspec->num_ids; i++) {
+ struct qcom_iommu_ctx *ctx = to_ctx(qcom_domain, fwspec->ids[i]);
+
+ if (!ctx->secure_init) {
+ ret = qcom_scm_restore_sec_cfg(qcom_iommu->sec_id, ctx->asid);
+ if (ret) {
+ dev_err(qcom_iommu->dev, "secure init failed: %d\n", ret);
+ goto out_clear_iommu;
+ }
+ ctx->secure_init = true;
+ }
+
+ /* Disable context bank before programming */
+ iommu_writel(ctx, ARM_SMMU_CB_SCTLR, 0);
+
+ /* Clear context bank fault address fault status registers */
+ iommu_writel(ctx, ARM_SMMU_CB_FAR, 0);
+ iommu_writel(ctx, ARM_SMMU_CB_FSR, ARM_SMMU_FSR_FAULT);
+
+ /* TTBRs */
+ iommu_writeq(ctx, ARM_SMMU_CB_TTBR0,
+ pgtbl_cfg.arm_lpae_s1_cfg.ttbr |
+ FIELD_PREP(ARM_SMMU_TTBRn_ASID, ctx->asid));
+ iommu_writeq(ctx, ARM_SMMU_CB_TTBR1, 0);
+
+ /* TCR */
+ iommu_writel(ctx, ARM_SMMU_CB_TCR2,
+ arm_smmu_lpae_tcr2(&pgtbl_cfg));
+ iommu_writel(ctx, ARM_SMMU_CB_TCR,
+ arm_smmu_lpae_tcr(&pgtbl_cfg) | ARM_SMMU_TCR_EAE);
+
+ /* MAIRs (stage-1 only) */
+ iommu_writel(ctx, ARM_SMMU_CB_S1_MAIR0,
+ pgtbl_cfg.arm_lpae_s1_cfg.mair);
+ iommu_writel(ctx, ARM_SMMU_CB_S1_MAIR1,
+ pgtbl_cfg.arm_lpae_s1_cfg.mair >> 32);
+
+ /* SCTLR */
+ reg = ARM_SMMU_SCTLR_CFIE | ARM_SMMU_SCTLR_CFRE |
+ ARM_SMMU_SCTLR_AFE | ARM_SMMU_SCTLR_TRE |
+ ARM_SMMU_SCTLR_M | ARM_SMMU_SCTLR_S1_ASIDPNE |
+ ARM_SMMU_SCTLR_CFCFG;
+
+ if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
+ reg |= ARM_SMMU_SCTLR_E;
+
+ iommu_writel(ctx, ARM_SMMU_CB_SCTLR, reg);
+
+ ctx->domain = domain;
+ }
+
+ mutex_unlock(&qcom_domain->init_mutex);
+
+ /* Publish page table ops for map/unmap */
+ qcom_domain->pgtbl_ops = pgtbl_ops;
+
+ return 0;
+
+out_clear_iommu:
+ qcom_domain->iommu = NULL;
+out_unlock:
+ mutex_unlock(&qcom_domain->init_mutex);
+ return ret;
+}
+
+static struct iommu_domain *qcom_iommu_domain_alloc(unsigned type)
+{
+ struct qcom_iommu_domain *qcom_domain;
+
+ if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA)
+ return NULL;
+ /*
+ * Allocate the domain and initialise some of its data structures.
+ * We can't really do anything meaningful until we've added a
+ * master.
+ */
+ qcom_domain = kzalloc(sizeof(*qcom_domain), GFP_KERNEL);
+ if (!qcom_domain)
+ return NULL;
+
+ if (type == IOMMU_DOMAIN_DMA &&
+ iommu_get_dma_cookie(&qcom_domain->domain)) {
+ kfree(qcom_domain);
+ return NULL;
+ }
+
+ mutex_init(&qcom_domain->init_mutex);
+ spin_lock_init(&qcom_domain->pgtbl_lock);
+
+ return &qcom_domain->domain;
+}
+
+static void qcom_iommu_domain_free(struct iommu_domain *domain)
+{
+ struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+
+ iommu_put_dma_cookie(domain);
+
+ if (qcom_domain->iommu) {
+ /*
+ * NOTE: unmap can be called after client device is powered
+ * off, for example, with GPUs or anything involving dma-buf.
+ * So we cannot rely on the device_link. Make sure the IOMMU
+ * is on to avoid unclocked accesses in the TLB inv path:
+ */
+ pm_runtime_get_sync(qcom_domain->iommu->dev);
+ free_io_pgtable_ops(qcom_domain->pgtbl_ops);
+ pm_runtime_put_sync(qcom_domain->iommu->dev);
+ }
+
+ kfree(qcom_domain);
+}
+
+static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+{
+ struct qcom_iommu_dev *qcom_iommu = to_iommu(dev);
+ struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+ int ret;
+
+ if (!qcom_iommu) {
+ dev_err(dev, "cannot attach to IOMMU, is it on the same bus?\n");
+ return -ENXIO;
+ }
+
+ /* Ensure that the domain is finalized */
+ pm_runtime_get_sync(qcom_iommu->dev);
+ ret = qcom_iommu_init_domain(domain, qcom_iommu, dev);
+ pm_runtime_put_sync(qcom_iommu->dev);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Sanity check the domain. We don't support domains across
+ * different IOMMUs.
+ */
+ if (qcom_domain->iommu != qcom_iommu) {
+ dev_err(dev, "cannot attach to IOMMU %s while already "
+ "attached to domain on IOMMU %s\n",
+ dev_name(qcom_domain->iommu->dev),
+ dev_name(qcom_iommu->dev));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void qcom_iommu_detach_dev(struct iommu_domain *domain, struct device *dev)
+{
+ struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct qcom_iommu_dev *qcom_iommu = to_iommu(dev);
+ unsigned i;
+
+ if (WARN_ON(!qcom_domain->iommu))
+ return;
+
+ pm_runtime_get_sync(qcom_iommu->dev);
+ for (i = 0; i < fwspec->num_ids; i++) {
+ struct qcom_iommu_ctx *ctx = to_ctx(qcom_domain, fwspec->ids[i]);
+
+ /* Disable the context bank: */
+ iommu_writel(ctx, ARM_SMMU_CB_SCTLR, 0);
+
+ ctx->domain = NULL;
+ }
+ pm_runtime_put_sync(qcom_iommu->dev);
+}
+
+static int qcom_iommu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+ int ret;
+ unsigned long flags;
+ struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+ struct io_pgtable_ops *ops = qcom_domain->pgtbl_ops;
+
+ if (!ops)
+ return -ENODEV;
+
+ spin_lock_irqsave(&qcom_domain->pgtbl_lock, flags);
+ ret = ops->map(ops, iova, paddr, size, prot, GFP_ATOMIC);
+ spin_unlock_irqrestore(&qcom_domain->pgtbl_lock, flags);
+ return ret;
+}
+
+static size_t qcom_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
+ size_t size, struct iommu_iotlb_gather *gather)
+{
+ size_t ret;
+ unsigned long flags;
+ struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+ struct io_pgtable_ops *ops = qcom_domain->pgtbl_ops;
+
+ if (!ops)
+ return 0;
+
+ /* NOTE: unmap can be called after client device is powered off,
+ * for example, with GPUs or anything involving dma-buf. So we
+ * cannot rely on the device_link. Make sure the IOMMU is on to
+ * avoid unclocked accesses in the TLB inv path:
+ */
+ pm_runtime_get_sync(qcom_domain->iommu->dev);
+ spin_lock_irqsave(&qcom_domain->pgtbl_lock, flags);
+ ret = ops->unmap(ops, iova, size, gather);
+ spin_unlock_irqrestore(&qcom_domain->pgtbl_lock, flags);
+ pm_runtime_put_sync(qcom_domain->iommu->dev);
+
+ return ret;
+}
+
+static void qcom_iommu_flush_iotlb_all(struct iommu_domain *domain)
+{
+ struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+ struct io_pgtable *pgtable = container_of(qcom_domain->pgtbl_ops,
+ struct io_pgtable, ops);
+ if (!qcom_domain->pgtbl_ops)
+ return;
+
+ pm_runtime_get_sync(qcom_domain->iommu->dev);
+ qcom_iommu_tlb_sync(pgtable->cookie);
+ pm_runtime_put_sync(qcom_domain->iommu->dev);
+}
+
+static void qcom_iommu_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
+{
+ qcom_iommu_flush_iotlb_all(domain);
+}
+
+static phys_addr_t qcom_iommu_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ phys_addr_t ret;
+ unsigned long flags;
+ struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+ struct io_pgtable_ops *ops = qcom_domain->pgtbl_ops;
+
+ if (!ops)
+ return 0;
+
+ spin_lock_irqsave(&qcom_domain->pgtbl_lock, flags);
+ ret = ops->iova_to_phys(ops, iova);
+ spin_unlock_irqrestore(&qcom_domain->pgtbl_lock, flags);
+
+ return ret;
+}
+
+static bool qcom_iommu_capable(enum iommu_cap cap)
+{
+ switch (cap) {
+ case IOMMU_CAP_CACHE_COHERENCY:
+ /*
+ * Return true here as the SMMU can always send out coherent
+ * requests.
+ */
+ return true;
+ case IOMMU_CAP_NOEXEC:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static struct iommu_device *qcom_iommu_probe_device(struct device *dev)
+{
+ struct qcom_iommu_dev *qcom_iommu = to_iommu(dev);
+ struct device_link *link;
+
+ if (!qcom_iommu)
+ return ERR_PTR(-ENODEV);
+
+ /*
+ * Establish the link between iommu and master, so that the
+ * iommu gets runtime enabled/disabled as per the master's
+ * needs.
+ */
+ link = device_link_add(dev, qcom_iommu->dev, DL_FLAG_PM_RUNTIME);
+ if (!link) {
+ dev_err(qcom_iommu->dev, "Unable to create device link between %s and %s\n",
+ dev_name(qcom_iommu->dev), dev_name(dev));
+ return ERR_PTR(-ENODEV);
+ }
+
+ return &qcom_iommu->iommu;
+}
+
+static void qcom_iommu_release_device(struct device *dev)
+{
+ struct qcom_iommu_dev *qcom_iommu = to_iommu(dev);
+
+ if (!qcom_iommu)
+ return;
+
+ iommu_fwspec_free(dev);
+}
+
+static int qcom_iommu_of_xlate(struct device *dev, struct of_phandle_args *args)
+{
+ struct qcom_iommu_dev *qcom_iommu;
+ struct platform_device *iommu_pdev;
+ unsigned asid = args->args[0];
+
+ if (args->args_count != 1) {
+ dev_err(dev, "incorrect number of iommu params found for %s "
+ "(found %d, expected 1)\n",
+ args->np->full_name, args->args_count);
+ return -EINVAL;
+ }
+
+ iommu_pdev = of_find_device_by_node(args->np);
+ if (WARN_ON(!iommu_pdev))
+ return -EINVAL;
+
+ qcom_iommu = platform_get_drvdata(iommu_pdev);
+
+ /* make sure the asid specified in dt is valid, so we don't have
+ * to sanity check this elsewhere, since 'asid - 1' is used to
+ * index into qcom_iommu->ctxs:
+ */
+ if (WARN_ON(asid < 1) ||
+ WARN_ON(asid > qcom_iommu->num_ctxs)) {
+ put_device(&iommu_pdev->dev);
+ return -EINVAL;
+ }
+
+ if (!dev_iommu_priv_get(dev)) {
+ dev_iommu_priv_set(dev, qcom_iommu);
+ } else {
+ /* make sure devices iommus dt node isn't referring to
+ * multiple different iommu devices. Multiple context
+ * banks are ok, but multiple devices are not:
+ */
+ if (WARN_ON(qcom_iommu != dev_iommu_priv_get(dev))) {
+ put_device(&iommu_pdev->dev);
+ return -EINVAL;
+ }
+ }
+
+ return iommu_fwspec_add_ids(dev, &asid, 1);
+}
+
+static const struct iommu_ops qcom_iommu_ops = {
+ .capable = qcom_iommu_capable,
+ .domain_alloc = qcom_iommu_domain_alloc,
+ .domain_free = qcom_iommu_domain_free,
+ .attach_dev = qcom_iommu_attach_dev,
+ .detach_dev = qcom_iommu_detach_dev,
+ .map = qcom_iommu_map,
+ .unmap = qcom_iommu_unmap,
+ .flush_iotlb_all = qcom_iommu_flush_iotlb_all,
+ .iotlb_sync = qcom_iommu_iotlb_sync,
+ .iova_to_phys = qcom_iommu_iova_to_phys,
+ .probe_device = qcom_iommu_probe_device,
+ .release_device = qcom_iommu_release_device,
+ .device_group = generic_device_group,
+ .of_xlate = qcom_iommu_of_xlate,
+ .pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M,
+};
+
+static int qcom_iommu_sec_ptbl_init(struct device *dev)
+{
+ size_t psize = 0;
+ unsigned int spare = 0;
+ void *cpu_addr;
+ dma_addr_t paddr;
+ unsigned long attrs;
+ static bool allocated = false;
+ int ret;
+
+ if (allocated)
+ return 0;
+
+ ret = qcom_scm_iommu_secure_ptbl_size(spare, &psize);
+ if (ret) {
+ dev_err(dev, "failed to get iommu secure pgtable size (%d)\n",
+ ret);
+ return ret;
+ }
+
+ dev_info(dev, "iommu sec: pgtable size: %zu\n", psize);
+
+ attrs = DMA_ATTR_NO_KERNEL_MAPPING;
+
+ cpu_addr = dma_alloc_attrs(dev, psize, &paddr, GFP_KERNEL, attrs);
+ if (!cpu_addr) {
+ dev_err(dev, "failed to allocate %zu bytes for pgtable\n",
+ psize);
+ return -ENOMEM;
+ }
+
+ ret = qcom_scm_iommu_secure_ptbl_init(paddr, psize, spare);
+ if (ret) {
+ dev_err(dev, "failed to init iommu pgtable (%d)\n", ret);
+ goto free_mem;
+ }
+
+ allocated = true;
+ return 0;
+
+free_mem:
+ dma_free_attrs(dev, psize, cpu_addr, paddr, attrs);
+ return ret;
+}
+
+static int get_asid(const struct device_node *np)
+{
+ u32 reg;
+
+ /* read the "reg" property directly to get the relative address
+ * of the context bank, and calculate the asid from that:
+ */
+ if (of_property_read_u32_index(np, "reg", 0, &reg))
+ return -ENODEV;
+
+ return reg / 0x1000; /* context banks are 0x1000 apart */
+}
+
+static int qcom_iommu_ctx_probe(struct platform_device *pdev)
+{
+ struct qcom_iommu_ctx *ctx;
+ struct device *dev = &pdev->dev;
+ struct qcom_iommu_dev *qcom_iommu = dev_get_drvdata(dev->parent);
+ struct resource *res;
+ int ret, irq;
+
+ ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->dev = dev;
+ platform_set_drvdata(pdev, ctx);
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ ctx->base = devm_ioremap_resource(dev, res);
+ if (IS_ERR(ctx->base))
+ return PTR_ERR(ctx->base);
+
+ irq = platform_get_irq(pdev, 0);
+ if (irq < 0)
+ return -ENODEV;
+
+ /* clear IRQs before registering fault handler, just in case the
+ * boot-loader left us a surprise:
+ */
+ iommu_writel(ctx, ARM_SMMU_CB_FSR, iommu_readl(ctx, ARM_SMMU_CB_FSR));
+
+ ret = devm_request_irq(dev, irq,
+ qcom_iommu_fault,
+ IRQF_SHARED,
+ "qcom-iommu-fault",
+ ctx);
+ if (ret) {
+ dev_err(dev, "failed to request IRQ %u\n", irq);
+ return ret;
+ }
+
+ ret = get_asid(dev->of_node);
+ if (ret < 0) {
+ dev_err(dev, "missing reg property\n");
+ return ret;
+ }
+
+ ctx->asid = ret;
+
+ dev_dbg(dev, "found asid %u\n", ctx->asid);
+
+ qcom_iommu->ctxs[ctx->asid - 1] = ctx;
+
+ return 0;
+}
+
+static int qcom_iommu_ctx_remove(struct platform_device *pdev)
+{
+ struct qcom_iommu_dev *qcom_iommu = dev_get_drvdata(pdev->dev.parent);
+ struct qcom_iommu_ctx *ctx = platform_get_drvdata(pdev);
+
+ platform_set_drvdata(pdev, NULL);
+
+ qcom_iommu->ctxs[ctx->asid - 1] = NULL;
+
+ return 0;
+}
+
+static const struct of_device_id ctx_of_match[] = {
+ { .compatible = "qcom,msm-iommu-v1-ns" },
+ { .compatible = "qcom,msm-iommu-v1-sec" },
+ { /* sentinel */ }
+};
+
+static struct platform_driver qcom_iommu_ctx_driver = {
+ .driver = {
+ .name = "qcom-iommu-ctx",
+ .of_match_table = ctx_of_match,
+ },
+ .probe = qcom_iommu_ctx_probe,
+ .remove = qcom_iommu_ctx_remove,
+};
+
+static bool qcom_iommu_has_secure_context(struct qcom_iommu_dev *qcom_iommu)
+{
+ struct device_node *child;
+
+ for_each_child_of_node(qcom_iommu->dev->of_node, child) {
+ if (of_device_is_compatible(child, "qcom,msm-iommu-v1-sec")) {
+ of_node_put(child);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static int qcom_iommu_device_probe(struct platform_device *pdev)
+{
+ struct device_node *child;
+ struct qcom_iommu_dev *qcom_iommu;
+ struct device *dev = &pdev->dev;
+ struct resource *res;
+ struct clk *clk;
+ int ret, max_asid = 0;
+
+ /* find the max asid (which is 1:1 to ctx bank idx), so we know how
+ * many child ctx devices we have:
+ */
+ for_each_child_of_node(dev->of_node, child)
+ max_asid = max(max_asid, get_asid(child));
+
+ qcom_iommu = devm_kzalloc(dev, struct_size(qcom_iommu, ctxs, max_asid),
+ GFP_KERNEL);
+ if (!qcom_iommu)
+ return -ENOMEM;
+ qcom_iommu->num_ctxs = max_asid;
+ qcom_iommu->dev = dev;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (res) {
+ qcom_iommu->local_base = devm_ioremap_resource(dev, res);
+ if (IS_ERR(qcom_iommu->local_base))
+ return PTR_ERR(qcom_iommu->local_base);
+ }
+
+ clk = devm_clk_get(dev, "iface");
+ if (IS_ERR(clk)) {
+ dev_err(dev, "failed to get iface clock\n");
+ return PTR_ERR(clk);
+ }
+ qcom_iommu->clks[CLK_IFACE].clk = clk;
+
+ clk = devm_clk_get(dev, "bus");
+ if (IS_ERR(clk)) {
+ dev_err(dev, "failed to get bus clock\n");
+ return PTR_ERR(clk);
+ }
+ qcom_iommu->clks[CLK_BUS].clk = clk;
+
+ clk = devm_clk_get_optional(dev, "tbu");
+ if (IS_ERR(clk)) {
+ dev_err(dev, "failed to get tbu clock\n");
+ return PTR_ERR(clk);
+ }
+ qcom_iommu->clks[CLK_TBU].clk = clk;
+
+ if (of_property_read_u32(dev->of_node, "qcom,iommu-secure-id",
+ &qcom_iommu->sec_id)) {
+ dev_err(dev, "missing qcom,iommu-secure-id property\n");
+ return -ENODEV;
+ }
+
+ if (qcom_iommu_has_secure_context(qcom_iommu)) {
+ ret = qcom_iommu_sec_ptbl_init(dev);
+ if (ret) {
+ dev_err(dev, "cannot init secure pg table(%d)\n", ret);
+ return ret;
+ }
+ }
+
+ platform_set_drvdata(pdev, qcom_iommu);
+
+ pm_runtime_enable(dev);
+
+ /* register context bank devices, which are child nodes: */
+ ret = devm_of_platform_populate(dev);
+ if (ret) {
+ dev_err(dev, "Failed to populate iommu contexts\n");
+ return ret;
+ }
+
+ ret = iommu_device_sysfs_add(&qcom_iommu->iommu, dev, NULL,
+ dev_name(dev));
+ if (ret) {
+ dev_err(dev, "Failed to register iommu in sysfs\n");
+ return ret;
+ }
+
+ iommu_device_set_ops(&qcom_iommu->iommu, &qcom_iommu_ops);
+ iommu_device_set_fwnode(&qcom_iommu->iommu, dev->fwnode);
+
+ ret = iommu_device_register(&qcom_iommu->iommu);
+ if (ret) {
+ dev_err(dev, "Failed to register iommu\n");
+ return ret;
+ }
+
+ bus_set_iommu(&platform_bus_type, &qcom_iommu_ops);
+
+ if (qcom_iommu->local_base) {
+ pm_runtime_get_sync(dev);
+ writel_relaxed(0xffffffff, qcom_iommu->local_base + SMMU_INTR_SEL_NS);
+ pm_runtime_put_sync(dev);
+ }
+
+ return 0;
+}
+
+static int qcom_iommu_device_remove(struct platform_device *pdev)
+{
+ struct qcom_iommu_dev *qcom_iommu = platform_get_drvdata(pdev);
+
+ bus_set_iommu(&platform_bus_type, NULL);
+
+ pm_runtime_force_suspend(&pdev->dev);
+ platform_set_drvdata(pdev, NULL);
+ iommu_device_sysfs_remove(&qcom_iommu->iommu);
+ iommu_device_unregister(&qcom_iommu->iommu);
+
+ return 0;
+}
+
+static int __maybe_unused qcom_iommu_resume(struct device *dev)
+{
+ struct qcom_iommu_dev *qcom_iommu = dev_get_drvdata(dev);
+
+ return clk_bulk_prepare_enable(CLK_NUM, qcom_iommu->clks);
+}
+
+static int __maybe_unused qcom_iommu_suspend(struct device *dev)
+{
+ struct qcom_iommu_dev *qcom_iommu = dev_get_drvdata(dev);
+
+ clk_bulk_disable_unprepare(CLK_NUM, qcom_iommu->clks);
+
+ return 0;
+}
+
+static const struct dev_pm_ops qcom_iommu_pm_ops = {
+ SET_RUNTIME_PM_OPS(qcom_iommu_suspend, qcom_iommu_resume, NULL)
+ SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend,
+ pm_runtime_force_resume)
+};
+
+static const struct of_device_id qcom_iommu_of_match[] = {
+ { .compatible = "qcom,msm-iommu-v1" },
+ { /* sentinel */ }
+};
+
+static struct platform_driver qcom_iommu_driver = {
+ .driver = {
+ .name = "qcom-iommu",
+ .of_match_table = qcom_iommu_of_match,
+ .pm = &qcom_iommu_pm_ops,
+ },
+ .probe = qcom_iommu_device_probe,
+ .remove = qcom_iommu_device_remove,
+};
+
+static int __init qcom_iommu_init(void)
+{
+ int ret;
+
+ ret = platform_driver_register(&qcom_iommu_ctx_driver);
+ if (ret)
+ return ret;
+
+ ret = platform_driver_register(&qcom_iommu_driver);
+ if (ret)
+ platform_driver_unregister(&qcom_iommu_ctx_driver);
+
+ return ret;
+}
+device_initcall(qcom_iommu_init);
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
new file mode 100644
index 000000000..d1539b739
--- /dev/null
+++ b/drivers/iommu/dma-iommu.c
@@ -0,0 +1,1288 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * A fairly generic DMA-API to IOMMU-API glue layer.
+ *
+ * Copyright (C) 2014-2015 ARM Ltd.
+ *
+ * based in part on arch/arm/mm/dma-mapping.c:
+ * Copyright (C) 2000-2004 Russell King
+ */
+
+#include <linux/acpi_iort.h>
+#include <linux/device.h>
+#include <linux/dma-map-ops.h>
+#include <linux/dma-iommu.h>
+#include <linux/gfp.h>
+#include <linux/huge_mm.h>
+#include <linux/iommu.h>
+#include <linux/iova.h>
+#include <linux/irq.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/scatterlist.h>
+#include <linux/vmalloc.h>
+#include <linux/crash_dump.h>
+
+struct iommu_dma_msi_page {
+ struct list_head list;
+ dma_addr_t iova;
+ phys_addr_t phys;
+};
+
+enum iommu_dma_cookie_type {
+ IOMMU_DMA_IOVA_COOKIE,
+ IOMMU_DMA_MSI_COOKIE,
+};
+
+struct iommu_dma_cookie {
+ enum iommu_dma_cookie_type type;
+ union {
+ /* Full allocator for IOMMU_DMA_IOVA_COOKIE */
+ struct iova_domain iovad;
+ /* Trivial linear page allocator for IOMMU_DMA_MSI_COOKIE */
+ dma_addr_t msi_iova;
+ };
+ struct list_head msi_page_list;
+
+ /* Domain for flush queue callback; NULL if flush queue not in use */
+ struct iommu_domain *fq_domain;
+};
+
+static inline size_t cookie_msi_granule(struct iommu_dma_cookie *cookie)
+{
+ if (cookie->type == IOMMU_DMA_IOVA_COOKIE)
+ return cookie->iovad.granule;
+ return PAGE_SIZE;
+}
+
+static struct iommu_dma_cookie *cookie_alloc(enum iommu_dma_cookie_type type)
+{
+ struct iommu_dma_cookie *cookie;
+
+ cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);
+ if (cookie) {
+ INIT_LIST_HEAD(&cookie->msi_page_list);
+ cookie->type = type;
+ }
+ return cookie;
+}
+
+/**
+ * iommu_get_dma_cookie - Acquire DMA-API resources for a domain
+ * @domain: IOMMU domain to prepare for DMA-API usage
+ *
+ * IOMMU drivers should normally call this from their domain_alloc
+ * callback when domain->type == IOMMU_DOMAIN_DMA.
+ */
+int iommu_get_dma_cookie(struct iommu_domain *domain)
+{
+ if (domain->iova_cookie)
+ return -EEXIST;
+
+ domain->iova_cookie = cookie_alloc(IOMMU_DMA_IOVA_COOKIE);
+ if (!domain->iova_cookie)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL(iommu_get_dma_cookie);
+
+/**
+ * iommu_get_msi_cookie - Acquire just MSI remapping resources
+ * @domain: IOMMU domain to prepare
+ * @base: Start address of IOVA region for MSI mappings
+ *
+ * Users who manage their own IOVA allocation and do not want DMA API support,
+ * but would still like to take advantage of automatic MSI remapping, can use
+ * this to initialise their own domain appropriately. Users should reserve a
+ * contiguous IOVA region, starting at @base, large enough to accommodate the
+ * number of PAGE_SIZE mappings necessary to cover every MSI doorbell address
+ * used by the devices attached to @domain.
+ */
+int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
+{
+ struct iommu_dma_cookie *cookie;
+
+ if (domain->type != IOMMU_DOMAIN_UNMANAGED)
+ return -EINVAL;
+
+ if (domain->iova_cookie)
+ return -EEXIST;
+
+ cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE);
+ if (!cookie)
+ return -ENOMEM;
+
+ cookie->msi_iova = base;
+ domain->iova_cookie = cookie;
+ return 0;
+}
+EXPORT_SYMBOL(iommu_get_msi_cookie);
+
+/**
+ * iommu_put_dma_cookie - Release a domain's DMA mapping resources
+ * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() or
+ * iommu_get_msi_cookie()
+ *
+ * IOMMU drivers should normally call this from their domain_free callback.
+ */
+void iommu_put_dma_cookie(struct iommu_domain *domain)
+{
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct iommu_dma_msi_page *msi, *tmp;
+
+ if (!cookie)
+ return;
+
+ if (cookie->type == IOMMU_DMA_IOVA_COOKIE && cookie->iovad.granule)
+ put_iova_domain(&cookie->iovad);
+
+ list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list) {
+ list_del(&msi->list);
+ kfree(msi);
+ }
+ kfree(cookie);
+ domain->iova_cookie = NULL;
+}
+EXPORT_SYMBOL(iommu_put_dma_cookie);
+
+/**
+ * iommu_dma_get_resv_regions - Reserved region driver helper
+ * @dev: Device from iommu_get_resv_regions()
+ * @list: Reserved region list from iommu_get_resv_regions()
+ *
+ * IOMMU drivers can use this to implement their .get_resv_regions callback
+ * for general non-IOMMU-specific reservations. Currently, this covers GICv3
+ * ITS region reservation on ACPI based ARM platforms that may require HW MSI
+ * reservation.
+ */
+void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
+{
+
+ if (!is_of_node(dev_iommu_fwspec_get(dev)->iommu_fwnode))
+ iort_iommu_msi_get_resv_regions(dev, list);
+
+}
+EXPORT_SYMBOL(iommu_dma_get_resv_regions);
+
+static int cookie_init_hw_msi_region(struct iommu_dma_cookie *cookie,
+ phys_addr_t start, phys_addr_t end)
+{
+ struct iova_domain *iovad = &cookie->iovad;
+ struct iommu_dma_msi_page *msi_page;
+ int i, num_pages;
+
+ start -= iova_offset(iovad, start);
+ num_pages = iova_align(iovad, end - start) >> iova_shift(iovad);
+
+ for (i = 0; i < num_pages; i++) {
+ msi_page = kmalloc(sizeof(*msi_page), GFP_KERNEL);
+ if (!msi_page)
+ return -ENOMEM;
+
+ msi_page->phys = start;
+ msi_page->iova = start;
+ INIT_LIST_HEAD(&msi_page->list);
+ list_add(&msi_page->list, &cookie->msi_page_list);
+ start += iovad->granule;
+ }
+
+ return 0;
+}
+
+static int iova_reserve_pci_windows(struct pci_dev *dev,
+ struct iova_domain *iovad)
+{
+ struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus);
+ struct resource_entry *window;
+ unsigned long lo, hi;
+ phys_addr_t start = 0, end;
+
+ resource_list_for_each_entry(window, &bridge->windows) {
+ if (resource_type(window->res) != IORESOURCE_MEM)
+ continue;
+
+ lo = iova_pfn(iovad, window->res->start - window->offset);
+ hi = iova_pfn(iovad, window->res->end - window->offset);
+ reserve_iova(iovad, lo, hi);
+ }
+
+ /* Get reserved DMA windows from host bridge */
+ resource_list_for_each_entry(window, &bridge->dma_ranges) {
+ end = window->res->start - window->offset;
+resv_iova:
+ if (end > start) {
+ lo = iova_pfn(iovad, start);
+ hi = iova_pfn(iovad, end);
+ reserve_iova(iovad, lo, hi);
+ } else if (end < start) {
+ /* dma_ranges list should be sorted */
+ dev_err(&dev->dev,
+ "Failed to reserve IOVA [%pa-%pa]\n",
+ &start, &end);
+ return -EINVAL;
+ }
+
+ start = window->res->end - window->offset + 1;
+ /* If window is last entry */
+ if (window->node.next == &bridge->dma_ranges &&
+ end != ~(phys_addr_t)0) {
+ end = ~(phys_addr_t)0;
+ goto resv_iova;
+ }
+ }
+
+ return 0;
+}
+
+static int iova_reserve_iommu_regions(struct device *dev,
+ struct iommu_domain *domain)
+{
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct iova_domain *iovad = &cookie->iovad;
+ struct iommu_resv_region *region;
+ LIST_HEAD(resv_regions);
+ int ret = 0;
+
+ if (dev_is_pci(dev)) {
+ ret = iova_reserve_pci_windows(to_pci_dev(dev), iovad);
+ if (ret)
+ return ret;
+ }
+
+ iommu_get_resv_regions(dev, &resv_regions);
+ list_for_each_entry(region, &resv_regions, list) {
+ unsigned long lo, hi;
+
+ /* We ARE the software that manages these! */
+ if (region->type == IOMMU_RESV_SW_MSI)
+ continue;
+
+ lo = iova_pfn(iovad, region->start);
+ hi = iova_pfn(iovad, region->start + region->length - 1);
+ reserve_iova(iovad, lo, hi);
+
+ if (region->type == IOMMU_RESV_MSI)
+ ret = cookie_init_hw_msi_region(cookie, region->start,
+ region->start + region->length);
+ if (ret)
+ break;
+ }
+ iommu_put_resv_regions(dev, &resv_regions);
+
+ return ret;
+}
+
+static void iommu_dma_flush_iotlb_all(struct iova_domain *iovad)
+{
+ struct iommu_dma_cookie *cookie;
+ struct iommu_domain *domain;
+
+ cookie = container_of(iovad, struct iommu_dma_cookie, iovad);
+ domain = cookie->fq_domain;
+ /*
+ * The IOMMU driver supporting DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE
+ * implies that ops->flush_iotlb_all must be non-NULL.
+ */
+ domain->ops->flush_iotlb_all(domain);
+}
+
+/**
+ * iommu_dma_init_domain - Initialise a DMA mapping domain
+ * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
+ * @base: IOVA at which the mappable address space starts
+ * @size: Size of IOVA space
+ * @dev: Device the domain is being initialised for
+ *
+ * @base and @size should be exact multiples of IOMMU page granularity to
+ * avoid rounding surprises. If necessary, we reserve the page at address 0
+ * to ensure it is an invalid IOVA. It is safe to reinitialise a domain, but
+ * any change which could make prior IOVAs invalid will fail.
+ */
+static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
+ u64 size, struct device *dev)
+{
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ unsigned long order, base_pfn;
+ struct iova_domain *iovad;
+ int attr;
+
+ if (!cookie || cookie->type != IOMMU_DMA_IOVA_COOKIE)
+ return -EINVAL;
+
+ iovad = &cookie->iovad;
+
+ /* Use the smallest supported page size for IOVA granularity */
+ order = __ffs(domain->pgsize_bitmap);
+ base_pfn = max_t(unsigned long, 1, base >> order);
+
+ /* Check the domain allows at least some access to the device... */
+ if (domain->geometry.force_aperture) {
+ if (base > domain->geometry.aperture_end ||
+ base + size <= domain->geometry.aperture_start) {
+ pr_warn("specified DMA range outside IOMMU capability\n");
+ return -EFAULT;
+ }
+ /* ...then finally give it a kicking to make sure it fits */
+ base_pfn = max_t(unsigned long, base_pfn,
+ domain->geometry.aperture_start >> order);
+ }
+
+ /* start_pfn is always nonzero for an already-initialised domain */
+ if (iovad->start_pfn) {
+ if (1UL << order != iovad->granule ||
+ base_pfn != iovad->start_pfn) {
+ pr_warn("Incompatible range for DMA domain\n");
+ return -EFAULT;
+ }
+
+ return 0;
+ }
+
+ init_iova_domain(iovad, 1UL << order, base_pfn);
+
+ if (!cookie->fq_domain && !iommu_domain_get_attr(domain,
+ DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, &attr) && attr) {
+ if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all,
+ NULL))
+ pr_warn("iova flush queue initialization failed\n");
+ else
+ cookie->fq_domain = domain;
+ }
+
+ if (!dev)
+ return 0;
+
+ return iova_reserve_iommu_regions(dev, domain);
+}
+
+static int iommu_dma_deferred_attach(struct device *dev,
+ struct iommu_domain *domain)
+{
+ const struct iommu_ops *ops = domain->ops;
+
+ if (!is_kdump_kernel())
+ return 0;
+
+ if (unlikely(ops->is_attach_deferred &&
+ ops->is_attach_deferred(domain, dev)))
+ return iommu_attach_device(domain, dev);
+
+ return 0;
+}
+
+/**
+ * dma_info_to_prot - Translate DMA API directions and attributes to IOMMU API
+ * page flags.
+ * @dir: Direction of DMA transfer
+ * @coherent: Is the DMA master cache-coherent?
+ * @attrs: DMA attributes for the mapping
+ *
+ * Return: corresponding IOMMU API page protection flags
+ */
+static int dma_info_to_prot(enum dma_data_direction dir, bool coherent,
+ unsigned long attrs)
+{
+ int prot = coherent ? IOMMU_CACHE : 0;
+
+ if (attrs & DMA_ATTR_PRIVILEGED)
+ prot |= IOMMU_PRIV;
+
+ switch (dir) {
+ case DMA_BIDIRECTIONAL:
+ return prot | IOMMU_READ | IOMMU_WRITE;
+ case DMA_TO_DEVICE:
+ return prot | IOMMU_READ;
+ case DMA_FROM_DEVICE:
+ return prot | IOMMU_WRITE;
+ default:
+ return 0;
+ }
+}
+
+static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
+ size_t size, u64 dma_limit, struct device *dev)
+{
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct iova_domain *iovad = &cookie->iovad;
+ unsigned long shift, iova_len, iova = 0;
+
+ if (cookie->type == IOMMU_DMA_MSI_COOKIE) {
+ cookie->msi_iova += size;
+ return cookie->msi_iova - size;
+ }
+
+ shift = iova_shift(iovad);
+ iova_len = size >> shift;
+ /*
+ * Freeing non-power-of-two-sized allocations back into the IOVA caches
+ * will come back to bite us badly, so we have to waste a bit of space
+ * rounding up anything cacheable to make sure that can't happen. The
+ * order of the unadjusted size will still match upon freeing.
+ */
+ if (iova_len < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
+ iova_len = roundup_pow_of_two(iova_len);
+
+ dma_limit = min_not_zero(dma_limit, dev->bus_dma_limit);
+
+ if (domain->geometry.force_aperture)
+ dma_limit = min(dma_limit, (u64)domain->geometry.aperture_end);
+
+ /* Try to get PCI devices a SAC address */
+ if (dma_limit > DMA_BIT_MASK(32) && dev_is_pci(dev))
+ iova = alloc_iova_fast(iovad, iova_len,
+ DMA_BIT_MASK(32) >> shift, false);
+
+ if (!iova)
+ iova = alloc_iova_fast(iovad, iova_len, dma_limit >> shift,
+ true);
+
+ return (dma_addr_t)iova << shift;
+}
+
+static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie,
+ dma_addr_t iova, size_t size)
+{
+ struct iova_domain *iovad = &cookie->iovad;
+
+ /* The MSI case is only ever cleaning up its most recent allocation */
+ if (cookie->type == IOMMU_DMA_MSI_COOKIE)
+ cookie->msi_iova -= size;
+ else if (cookie->fq_domain) /* non-strict mode */
+ queue_iova(iovad, iova_pfn(iovad, iova),
+ size >> iova_shift(iovad), 0);
+ else
+ free_iova_fast(iovad, iova_pfn(iovad, iova),
+ size >> iova_shift(iovad));
+}
+
+static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr,
+ size_t size)
+{
+ struct iommu_domain *domain = iommu_get_dma_domain(dev);
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct iova_domain *iovad = &cookie->iovad;
+ size_t iova_off = iova_offset(iovad, dma_addr);
+ struct iommu_iotlb_gather iotlb_gather;
+ size_t unmapped;
+
+ dma_addr -= iova_off;
+ size = iova_align(iovad, size + iova_off);
+ iommu_iotlb_gather_init(&iotlb_gather);
+
+ unmapped = iommu_unmap_fast(domain, dma_addr, size, &iotlb_gather);
+ WARN_ON(unmapped != size);
+
+ if (!cookie->fq_domain)
+ iommu_iotlb_sync(domain, &iotlb_gather);
+ iommu_dma_free_iova(cookie, dma_addr, size);
+}
+
+static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
+ size_t size, int prot, u64 dma_mask)
+{
+ struct iommu_domain *domain = iommu_get_dma_domain(dev);
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct iova_domain *iovad = &cookie->iovad;
+ size_t iova_off = iova_offset(iovad, phys);
+ dma_addr_t iova;
+
+ if (unlikely(iommu_dma_deferred_attach(dev, domain)))
+ return DMA_MAPPING_ERROR;
+
+ size = iova_align(iovad, size + iova_off);
+
+ iova = iommu_dma_alloc_iova(domain, size, dma_mask, dev);
+ if (!iova)
+ return DMA_MAPPING_ERROR;
+
+ if (iommu_map_atomic(domain, iova, phys - iova_off, size, prot)) {
+ iommu_dma_free_iova(cookie, iova, size);
+ return DMA_MAPPING_ERROR;
+ }
+ return iova + iova_off;
+}
+
+static void __iommu_dma_free_pages(struct page **pages, int count)
+{
+ while (count--)
+ __free_page(pages[count]);
+ kvfree(pages);
+}
+
+static struct page **__iommu_dma_alloc_pages(struct device *dev,
+ unsigned int count, unsigned long order_mask, gfp_t gfp)
+{
+ struct page **pages;
+ unsigned int i = 0, nid = dev_to_node(dev);
+
+ order_mask &= (2U << MAX_ORDER) - 1;
+ if (!order_mask)
+ return NULL;
+
+ pages = kvzalloc(count * sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ return NULL;
+
+ /* IOMMU can map any pages, so himem can also be used here */
+ gfp |= __GFP_NOWARN | __GFP_HIGHMEM;
+
+ /* It makes no sense to muck about with huge pages */
+ gfp &= ~__GFP_COMP;
+
+ while (count) {
+ struct page *page = NULL;
+ unsigned int order_size;
+
+ /*
+ * Higher-order allocations are a convenience rather
+ * than a necessity, hence using __GFP_NORETRY until
+ * falling back to minimum-order allocations.
+ */
+ for (order_mask &= (2U << __fls(count)) - 1;
+ order_mask; order_mask &= ~order_size) {
+ unsigned int order = __fls(order_mask);
+ gfp_t alloc_flags = gfp;
+
+ order_size = 1U << order;
+ if (order_mask > order_size)
+ alloc_flags |= __GFP_NORETRY;
+ page = alloc_pages_node(nid, alloc_flags, order);
+ if (!page)
+ continue;
+ if (order)
+ split_page(page, order);
+ break;
+ }
+ if (!page) {
+ __iommu_dma_free_pages(pages, i);
+ return NULL;
+ }
+ count -= order_size;
+ while (order_size--)
+ pages[i++] = page++;
+ }
+ return pages;
+}
+
+/**
+ * iommu_dma_alloc_remap - Allocate and map a buffer contiguous in IOVA space
+ * @dev: Device to allocate memory for. Must be a real device
+ * attached to an iommu_dma_domain
+ * @size: Size of buffer in bytes
+ * @dma_handle: Out argument for allocated DMA handle
+ * @gfp: Allocation flags
+ * @prot: pgprot_t to use for the remapped mapping
+ * @attrs: DMA attributes for this allocation
+ *
+ * If @size is less than PAGE_SIZE, then a full CPU page will be allocated,
+ * but an IOMMU which supports smaller pages might not map the whole thing.
+ *
+ * Return: Mapped virtual address, or NULL on failure.
+ */
+static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot,
+ unsigned long attrs)
+{
+ struct iommu_domain *domain = iommu_get_dma_domain(dev);
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct iova_domain *iovad = &cookie->iovad;
+ bool coherent = dev_is_dma_coherent(dev);
+ int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
+ unsigned int count, min_size, alloc_sizes = domain->pgsize_bitmap;
+ struct page **pages;
+ struct sg_table sgt;
+ dma_addr_t iova;
+ void *vaddr;
+
+ *dma_handle = DMA_MAPPING_ERROR;
+
+ if (unlikely(iommu_dma_deferred_attach(dev, domain)))
+ return NULL;
+
+ min_size = alloc_sizes & -alloc_sizes;
+ if (min_size < PAGE_SIZE) {
+ min_size = PAGE_SIZE;
+ alloc_sizes |= PAGE_SIZE;
+ } else {
+ size = ALIGN(size, min_size);
+ }
+ if (attrs & DMA_ATTR_ALLOC_SINGLE_PAGES)
+ alloc_sizes = min_size;
+
+ count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ pages = __iommu_dma_alloc_pages(dev, count, alloc_sizes >> PAGE_SHIFT,
+ gfp);
+ if (!pages)
+ return NULL;
+
+ size = iova_align(iovad, size);
+ iova = iommu_dma_alloc_iova(domain, size, dev->coherent_dma_mask, dev);
+ if (!iova)
+ goto out_free_pages;
+
+ if (sg_alloc_table_from_pages(&sgt, pages, count, 0, size, GFP_KERNEL))
+ goto out_free_iova;
+
+ if (!(ioprot & IOMMU_CACHE)) {
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgt.sgl, sg, sgt.orig_nents, i)
+ arch_dma_prep_coherent(sg_page(sg), sg->length);
+ }
+
+ if (iommu_map_sg_atomic(domain, iova, sgt.sgl, sgt.orig_nents, ioprot)
+ < size)
+ goto out_free_sg;
+
+ vaddr = dma_common_pages_remap(pages, size, prot,
+ __builtin_return_address(0));
+ if (!vaddr)
+ goto out_unmap;
+
+ *dma_handle = iova;
+ sg_free_table(&sgt);
+ return vaddr;
+
+out_unmap:
+ __iommu_dma_unmap(dev, iova, size);
+out_free_sg:
+ sg_free_table(&sgt);
+out_free_iova:
+ iommu_dma_free_iova(cookie, iova, size);
+out_free_pages:
+ __iommu_dma_free_pages(pages, count);
+ return NULL;
+}
+
+/**
+ * __iommu_dma_mmap - Map a buffer into provided user VMA
+ * @pages: Array representing buffer from __iommu_dma_alloc()
+ * @size: Size of buffer in bytes
+ * @vma: VMA describing requested userspace mapping
+ *
+ * Maps the pages of the buffer in @pages into @vma. The caller is responsible
+ * for verifying the correct size and protection of @vma beforehand.
+ */
+static int __iommu_dma_mmap(struct page **pages, size_t size,
+ struct vm_area_struct *vma)
+{
+ return vm_map_pages(vma, pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
+}
+
+static void iommu_dma_sync_single_for_cpu(struct device *dev,
+ dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
+{
+ phys_addr_t phys;
+
+ if (dev_is_dma_coherent(dev))
+ return;
+
+ phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
+ arch_sync_dma_for_cpu(phys, size, dir);
+}
+
+static void iommu_dma_sync_single_for_device(struct device *dev,
+ dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
+{
+ phys_addr_t phys;
+
+ if (dev_is_dma_coherent(dev))
+ return;
+
+ phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
+ arch_sync_dma_for_device(phys, size, dir);
+}
+
+static void iommu_dma_sync_sg_for_cpu(struct device *dev,
+ struct scatterlist *sgl, int nelems,
+ enum dma_data_direction dir)
+{
+ struct scatterlist *sg;
+ int i;
+
+ if (dev_is_dma_coherent(dev))
+ return;
+
+ for_each_sg(sgl, sg, nelems, i)
+ arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir);
+}
+
+static void iommu_dma_sync_sg_for_device(struct device *dev,
+ struct scatterlist *sgl, int nelems,
+ enum dma_data_direction dir)
+{
+ struct scatterlist *sg;
+ int i;
+
+ if (dev_is_dma_coherent(dev))
+ return;
+
+ for_each_sg(sgl, sg, nelems, i)
+ arch_sync_dma_for_device(sg_phys(sg), sg->length, dir);
+}
+
+static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ phys_addr_t phys = page_to_phys(page) + offset;
+ bool coherent = dev_is_dma_coherent(dev);
+ int prot = dma_info_to_prot(dir, coherent, attrs);
+ dma_addr_t dma_handle;
+
+ dma_handle = __iommu_dma_map(dev, phys, size, prot, dma_get_mask(dev));
+ if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+ dma_handle != DMA_MAPPING_ERROR)
+ arch_sync_dma_for_device(phys, size, dir);
+ return dma_handle;
+}
+
+static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ iommu_dma_sync_single_for_cpu(dev, dma_handle, size, dir);
+ __iommu_dma_unmap(dev, dma_handle, size);
+}
+
+/*
+ * Prepare a successfully-mapped scatterlist to give back to the caller.
+ *
+ * At this point the segments are already laid out by iommu_dma_map_sg() to
+ * avoid individually crossing any boundaries, so we merely need to check a
+ * segment's start address to avoid concatenating across one.
+ */
+static int __finalise_sg(struct device *dev, struct scatterlist *sg, int nents,
+ dma_addr_t dma_addr)
+{
+ struct scatterlist *s, *cur = sg;
+ unsigned long seg_mask = dma_get_seg_boundary(dev);
+ unsigned int cur_len = 0, max_len = dma_get_max_seg_size(dev);
+ int i, count = 0;
+
+ for_each_sg(sg, s, nents, i) {
+ /* Restore this segment's original unaligned fields first */
+ unsigned int s_iova_off = sg_dma_address(s);
+ unsigned int s_length = sg_dma_len(s);
+ unsigned int s_iova_len = s->length;
+
+ s->offset += s_iova_off;
+ s->length = s_length;
+ sg_dma_address(s) = DMA_MAPPING_ERROR;
+ sg_dma_len(s) = 0;
+
+ /*
+ * Now fill in the real DMA data. If...
+ * - there is a valid output segment to append to
+ * - and this segment starts on an IOVA page boundary
+ * - but doesn't fall at a segment boundary
+ * - and wouldn't make the resulting output segment too long
+ */
+ if (cur_len && !s_iova_off && (dma_addr & seg_mask) &&
+ (max_len - cur_len >= s_length)) {
+ /* ...then concatenate it with the previous one */
+ cur_len += s_length;
+ } else {
+ /* Otherwise start the next output segment */
+ if (i > 0)
+ cur = sg_next(cur);
+ cur_len = s_length;
+ count++;
+
+ sg_dma_address(cur) = dma_addr + s_iova_off;
+ }
+
+ sg_dma_len(cur) = cur_len;
+ dma_addr += s_iova_len;
+
+ if (s_length + s_iova_off < s_iova_len)
+ cur_len = 0;
+ }
+ return count;
+}
+
+/*
+ * If mapping failed, then just restore the original list,
+ * but making sure the DMA fields are invalidated.
+ */
+static void __invalidate_sg(struct scatterlist *sg, int nents)
+{
+ struct scatterlist *s;
+ int i;
+
+ for_each_sg(sg, s, nents, i) {
+ if (sg_dma_address(s) != DMA_MAPPING_ERROR)
+ s->offset += sg_dma_address(s);
+ if (sg_dma_len(s))
+ s->length = sg_dma_len(s);
+ sg_dma_address(s) = DMA_MAPPING_ERROR;
+ sg_dma_len(s) = 0;
+ }
+}
+
+/*
+ * The DMA API client is passing in a scatterlist which could describe
+ * any old buffer layout, but the IOMMU API requires everything to be
+ * aligned to IOMMU pages. Hence the need for this complicated bit of
+ * impedance-matching, to be able to hand off a suitably-aligned list,
+ * but still preserve the original offsets and sizes for the caller.
+ */
+static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+ struct iommu_domain *domain = iommu_get_dma_domain(dev);
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct iova_domain *iovad = &cookie->iovad;
+ struct scatterlist *s, *prev = NULL;
+ int prot = dma_info_to_prot(dir, dev_is_dma_coherent(dev), attrs);
+ dma_addr_t iova;
+ size_t iova_len = 0;
+ unsigned long mask = dma_get_seg_boundary(dev);
+ int i;
+
+ if (unlikely(iommu_dma_deferred_attach(dev, domain)))
+ return 0;
+
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ iommu_dma_sync_sg_for_device(dev, sg, nents, dir);
+
+ /*
+ * Work out how much IOVA space we need, and align the segments to
+ * IOVA granules for the IOMMU driver to handle. With some clever
+ * trickery we can modify the list in-place, but reversibly, by
+ * stashing the unaligned parts in the as-yet-unused DMA fields.
+ */
+ for_each_sg(sg, s, nents, i) {
+ size_t s_iova_off = iova_offset(iovad, s->offset);
+ size_t s_length = s->length;
+ size_t pad_len = (mask - iova_len + 1) & mask;
+
+ sg_dma_address(s) = s_iova_off;
+ sg_dma_len(s) = s_length;
+ s->offset -= s_iova_off;
+ s_length = iova_align(iovad, s_length + s_iova_off);
+ s->length = s_length;
+
+ /*
+ * Due to the alignment of our single IOVA allocation, we can
+ * depend on these assumptions about the segment boundary mask:
+ * - If mask size >= IOVA size, then the IOVA range cannot
+ * possibly fall across a boundary, so we don't care.
+ * - If mask size < IOVA size, then the IOVA range must start
+ * exactly on a boundary, therefore we can lay things out
+ * based purely on segment lengths without needing to know
+ * the actual addresses beforehand.
+ * - The mask must be a power of 2, so pad_len == 0 if
+ * iova_len == 0, thus we cannot dereference prev the first
+ * time through here (i.e. before it has a meaningful value).
+ */
+ if (pad_len && pad_len < s_length - 1) {
+ prev->length += pad_len;
+ iova_len += pad_len;
+ }
+
+ iova_len += s_length;
+ prev = s;
+ }
+
+ iova = iommu_dma_alloc_iova(domain, iova_len, dma_get_mask(dev), dev);
+ if (!iova)
+ goto out_restore_sg;
+
+ /*
+ * We'll leave any physical concatenation to the IOMMU driver's
+ * implementation - it knows better than we do.
+ */
+ if (iommu_map_sg_atomic(domain, iova, sg, nents, prot) < iova_len)
+ goto out_free_iova;
+
+ return __finalise_sg(dev, sg, nents, iova);
+
+out_free_iova:
+ iommu_dma_free_iova(cookie, iova, iova_len);
+out_restore_sg:
+ __invalidate_sg(sg, nents);
+ return 0;
+}
+
+static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+ dma_addr_t start, end;
+ struct scatterlist *tmp;
+ int i;
+
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ iommu_dma_sync_sg_for_cpu(dev, sg, nents, dir);
+
+ /*
+ * The scatterlist segments are mapped into a single
+ * contiguous IOVA allocation, so this is incredibly easy.
+ */
+ start = sg_dma_address(sg);
+ for_each_sg(sg_next(sg), tmp, nents - 1, i) {
+ if (sg_dma_len(tmp) == 0)
+ break;
+ sg = tmp;
+ }
+ end = sg_dma_address(sg) + sg_dma_len(sg);
+ __iommu_dma_unmap(dev, start, end - start);
+}
+
+static dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ return __iommu_dma_map(dev, phys, size,
+ dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO,
+ dma_get_mask(dev));
+}
+
+static void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ __iommu_dma_unmap(dev, handle, size);
+}
+
+static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr)
+{
+ size_t alloc_size = PAGE_ALIGN(size);
+ int count = alloc_size >> PAGE_SHIFT;
+ struct page *page = NULL, **pages = NULL;
+
+ /* Non-coherent atomic allocation? Easy */
+ if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+ dma_free_from_pool(dev, cpu_addr, alloc_size))
+ return;
+
+ if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) {
+ /*
+ * If it the address is remapped, then it's either non-coherent
+ * or highmem CMA, or an iommu_dma_alloc_remap() construction.
+ */
+ pages = dma_common_find_pages(cpu_addr);
+ if (!pages)
+ page = vmalloc_to_page(cpu_addr);
+ dma_common_free_remap(cpu_addr, alloc_size);
+ } else {
+ /* Lowmem means a coherent atomic or CMA allocation */
+ page = virt_to_page(cpu_addr);
+ }
+
+ if (pages)
+ __iommu_dma_free_pages(pages, count);
+ if (page)
+ dma_free_contiguous(dev, page, alloc_size);
+}
+
+static void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr,
+ dma_addr_t handle, unsigned long attrs)
+{
+ __iommu_dma_unmap(dev, handle, size);
+ __iommu_dma_free(dev, size, cpu_addr);
+}
+
+static void *iommu_dma_alloc_pages(struct device *dev, size_t size,
+ struct page **pagep, gfp_t gfp, unsigned long attrs)
+{
+ bool coherent = dev_is_dma_coherent(dev);
+ size_t alloc_size = PAGE_ALIGN(size);
+ int node = dev_to_node(dev);
+ struct page *page = NULL;
+ void *cpu_addr;
+
+ page = dma_alloc_contiguous(dev, alloc_size, gfp);
+ if (!page)
+ page = alloc_pages_node(node, gfp, get_order(alloc_size));
+ if (!page)
+ return NULL;
+
+ if (IS_ENABLED(CONFIG_DMA_REMAP) && (!coherent || PageHighMem(page))) {
+ pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs);
+
+ cpu_addr = dma_common_contiguous_remap(page, alloc_size,
+ prot, __builtin_return_address(0));
+ if (!cpu_addr)
+ goto out_free_pages;
+
+ if (!coherent)
+ arch_dma_prep_coherent(page, size);
+ } else {
+ cpu_addr = page_address(page);
+ }
+
+ *pagep = page;
+ memset(cpu_addr, 0, alloc_size);
+ return cpu_addr;
+out_free_pages:
+ dma_free_contiguous(dev, page, alloc_size);
+ return NULL;
+}
+
+static void *iommu_dma_alloc(struct device *dev, size_t size,
+ dma_addr_t *handle, gfp_t gfp, unsigned long attrs)
+{
+ bool coherent = dev_is_dma_coherent(dev);
+ int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
+ struct page *page = NULL;
+ void *cpu_addr;
+
+ gfp |= __GFP_ZERO;
+
+ if (IS_ENABLED(CONFIG_DMA_REMAP) && gfpflags_allow_blocking(gfp) &&
+ !(attrs & DMA_ATTR_FORCE_CONTIGUOUS)) {
+ return iommu_dma_alloc_remap(dev, size, handle, gfp,
+ dma_pgprot(dev, PAGE_KERNEL, attrs), attrs);
+ }
+
+ if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+ !gfpflags_allow_blocking(gfp) && !coherent)
+ page = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &cpu_addr,
+ gfp, NULL);
+ else
+ cpu_addr = iommu_dma_alloc_pages(dev, size, &page, gfp, attrs);
+ if (!cpu_addr)
+ return NULL;
+
+ *handle = __iommu_dma_map(dev, page_to_phys(page), size, ioprot,
+ dev->coherent_dma_mask);
+ if (*handle == DMA_MAPPING_ERROR) {
+ __iommu_dma_free(dev, size, cpu_addr);
+ return NULL;
+ }
+
+ return cpu_addr;
+}
+
+#ifdef CONFIG_DMA_REMAP
+static void *iommu_dma_alloc_noncoherent(struct device *dev, size_t size,
+ dma_addr_t *handle, enum dma_data_direction dir, gfp_t gfp)
+{
+ if (!gfpflags_allow_blocking(gfp)) {
+ struct page *page;
+
+ page = dma_common_alloc_pages(dev, size, handle, dir, gfp);
+ if (!page)
+ return NULL;
+ return page_address(page);
+ }
+
+ return iommu_dma_alloc_remap(dev, size, handle, gfp | __GFP_ZERO,
+ PAGE_KERNEL, 0);
+}
+
+static void iommu_dma_free_noncoherent(struct device *dev, size_t size,
+ void *cpu_addr, dma_addr_t handle, enum dma_data_direction dir)
+{
+ __iommu_dma_unmap(dev, handle, size);
+ __iommu_dma_free(dev, size, cpu_addr);
+}
+#else
+#define iommu_dma_alloc_noncoherent NULL
+#define iommu_dma_free_noncoherent NULL
+#endif /* CONFIG_DMA_REMAP */
+
+static int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long pfn, off = vma->vm_pgoff;
+ int ret;
+
+ vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
+
+ if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
+ return ret;
+
+ if (off >= nr_pages || vma_pages(vma) > nr_pages - off)
+ return -ENXIO;
+
+ if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) {
+ struct page **pages = dma_common_find_pages(cpu_addr);
+
+ if (pages)
+ return __iommu_dma_mmap(pages, size, vma);
+ pfn = vmalloc_to_pfn(cpu_addr);
+ } else {
+ pfn = page_to_pfn(virt_to_page(cpu_addr));
+ }
+
+ return remap_pfn_range(vma, vma->vm_start, pfn + off,
+ vma->vm_end - vma->vm_start,
+ vma->vm_page_prot);
+}
+
+static int iommu_dma_get_sgtable(struct device *dev, struct sg_table *sgt,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ struct page *page;
+ int ret;
+
+ if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) {
+ struct page **pages = dma_common_find_pages(cpu_addr);
+
+ if (pages) {
+ return sg_alloc_table_from_pages(sgt, pages,
+ PAGE_ALIGN(size) >> PAGE_SHIFT,
+ 0, size, GFP_KERNEL);
+ }
+
+ page = vmalloc_to_page(cpu_addr);
+ } else {
+ page = virt_to_page(cpu_addr);
+ }
+
+ ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
+ if (!ret)
+ sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
+ return ret;
+}
+
+static unsigned long iommu_dma_get_merge_boundary(struct device *dev)
+{
+ struct iommu_domain *domain = iommu_get_dma_domain(dev);
+
+ return (1UL << __ffs(domain->pgsize_bitmap)) - 1;
+}
+
+static const struct dma_map_ops iommu_dma_ops = {
+ .alloc = iommu_dma_alloc,
+ .free = iommu_dma_free,
+ .alloc_pages = dma_common_alloc_pages,
+ .free_pages = dma_common_free_pages,
+ .alloc_noncoherent = iommu_dma_alloc_noncoherent,
+ .free_noncoherent = iommu_dma_free_noncoherent,
+ .mmap = iommu_dma_mmap,
+ .get_sgtable = iommu_dma_get_sgtable,
+ .map_page = iommu_dma_map_page,
+ .unmap_page = iommu_dma_unmap_page,
+ .map_sg = iommu_dma_map_sg,
+ .unmap_sg = iommu_dma_unmap_sg,
+ .sync_single_for_cpu = iommu_dma_sync_single_for_cpu,
+ .sync_single_for_device = iommu_dma_sync_single_for_device,
+ .sync_sg_for_cpu = iommu_dma_sync_sg_for_cpu,
+ .sync_sg_for_device = iommu_dma_sync_sg_for_device,
+ .map_resource = iommu_dma_map_resource,
+ .unmap_resource = iommu_dma_unmap_resource,
+ .get_merge_boundary = iommu_dma_get_merge_boundary,
+};
+
+/*
+ * The IOMMU core code allocates the default DMA domain, which the underlying
+ * IOMMU driver needs to support via the dma-iommu layer.
+ */
+void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size)
+{
+ struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+
+ if (!domain)
+ goto out_err;
+
+ /*
+ * The IOMMU core code allocates the default DMA domain, which the
+ * underlying IOMMU driver needs to support via the dma-iommu layer.
+ */
+ if (domain->type == IOMMU_DOMAIN_DMA) {
+ if (iommu_dma_init_domain(domain, dma_base, size, dev))
+ goto out_err;
+ dev->dma_ops = &iommu_dma_ops;
+ }
+
+ return;
+out_err:
+ pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n",
+ dev_name(dev));
+}
+
+static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
+ phys_addr_t msi_addr, struct iommu_domain *domain)
+{
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct iommu_dma_msi_page *msi_page;
+ dma_addr_t iova;
+ int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+ size_t size = cookie_msi_granule(cookie);
+
+ msi_addr &= ~(phys_addr_t)(size - 1);
+ list_for_each_entry(msi_page, &cookie->msi_page_list, list)
+ if (msi_page->phys == msi_addr)
+ return msi_page;
+
+ msi_page = kzalloc(sizeof(*msi_page), GFP_KERNEL);
+ if (!msi_page)
+ return NULL;
+
+ iova = iommu_dma_alloc_iova(domain, size, dma_get_mask(dev), dev);
+ if (!iova)
+ goto out_free_page;
+
+ if (iommu_map(domain, iova, msi_addr, size, prot))
+ goto out_free_iova;
+
+ INIT_LIST_HEAD(&msi_page->list);
+ msi_page->phys = msi_addr;
+ msi_page->iova = iova;
+ list_add(&msi_page->list, &cookie->msi_page_list);
+ return msi_page;
+
+out_free_iova:
+ iommu_dma_free_iova(cookie, iova, size);
+out_free_page:
+ kfree(msi_page);
+ return NULL;
+}
+
+int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
+{
+ struct device *dev = msi_desc_to_dev(desc);
+ struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+ struct iommu_dma_msi_page *msi_page;
+ static DEFINE_MUTEX(msi_prepare_lock); /* see below */
+
+ if (!domain || !domain->iova_cookie) {
+ desc->iommu_cookie = NULL;
+ return 0;
+ }
+
+ /*
+ * In fact the whole prepare operation should already be serialised by
+ * irq_domain_mutex further up the callchain, but that's pretty subtle
+ * on its own, so consider this locking as failsafe documentation...
+ */
+ mutex_lock(&msi_prepare_lock);
+ msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain);
+ mutex_unlock(&msi_prepare_lock);
+
+ msi_desc_set_iommu_cookie(desc, msi_page);
+
+ if (!msi_page)
+ return -ENOMEM;
+ return 0;
+}
+
+void iommu_dma_compose_msi_msg(struct msi_desc *desc,
+ struct msi_msg *msg)
+{
+ struct device *dev = msi_desc_to_dev(desc);
+ const struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+ const struct iommu_dma_msi_page *msi_page;
+
+ msi_page = msi_desc_get_iommu_cookie(desc);
+
+ if (!domain || !domain->iova_cookie || WARN_ON(!msi_page))
+ return;
+
+ msg->address_hi = upper_32_bits(msi_page->iova);
+ msg->address_lo &= cookie_msi_granule(domain->iova_cookie) - 1;
+ msg->address_lo += lower_32_bits(msi_page->iova);
+}
+
+static int iommu_dma_init(void)
+{
+ return iova_cache_get();
+}
+arch_initcall(iommu_dma_init);
diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
new file mode 100644
index 000000000..0cdb5493a
--- /dev/null
+++ b/drivers/iommu/exynos-iommu.c
@@ -0,0 +1,1392 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2011,2016 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com
+ */
+
+#ifdef CONFIG_EXYNOS_IOMMU_DEBUG
+#define DEBUG
+#endif
+
+#include <linux/clk.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/iommu.h>
+#include <linux/interrupt.h>
+#include <linux/kmemleak.h>
+#include <linux/list.h>
+#include <linux/of.h>
+#include <linux/of_iommu.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+#include <linux/dma-iommu.h>
+
+typedef u32 sysmmu_iova_t;
+typedef u32 sysmmu_pte_t;
+
+/* We do not consider super section mapping (16MB) */
+#define SECT_ORDER 20
+#define LPAGE_ORDER 16
+#define SPAGE_ORDER 12
+
+#define SECT_SIZE (1 << SECT_ORDER)
+#define LPAGE_SIZE (1 << LPAGE_ORDER)
+#define SPAGE_SIZE (1 << SPAGE_ORDER)
+
+#define SECT_MASK (~(SECT_SIZE - 1))
+#define LPAGE_MASK (~(LPAGE_SIZE - 1))
+#define SPAGE_MASK (~(SPAGE_SIZE - 1))
+
+#define lv1ent_fault(sent) ((*(sent) == ZERO_LV2LINK) || \
+ ((*(sent) & 3) == 0) || ((*(sent) & 3) == 3))
+#define lv1ent_zero(sent) (*(sent) == ZERO_LV2LINK)
+#define lv1ent_page_zero(sent) ((*(sent) & 3) == 1)
+#define lv1ent_page(sent) ((*(sent) != ZERO_LV2LINK) && \
+ ((*(sent) & 3) == 1))
+#define lv1ent_section(sent) ((*(sent) & 3) == 2)
+
+#define lv2ent_fault(pent) ((*(pent) & 3) == 0)
+#define lv2ent_small(pent) ((*(pent) & 2) == 2)
+#define lv2ent_large(pent) ((*(pent) & 3) == 1)
+
+/*
+ * v1.x - v3.x SYSMMU supports 32bit physical and 32bit virtual address spaces
+ * v5.0 introduced support for 36bit physical address space by shifting
+ * all page entry values by 4 bits.
+ * All SYSMMU controllers in the system support the address spaces of the same
+ * size, so PG_ENT_SHIFT can be initialized on first SYSMMU probe to proper
+ * value (0 or 4).
+ */
+static short PG_ENT_SHIFT = -1;
+#define SYSMMU_PG_ENT_SHIFT 0
+#define SYSMMU_V5_PG_ENT_SHIFT 4
+
+static const sysmmu_pte_t *LV1_PROT;
+static const sysmmu_pte_t SYSMMU_LV1_PROT[] = {
+ ((0 << 15) | (0 << 10)), /* no access */
+ ((1 << 15) | (1 << 10)), /* IOMMU_READ only */
+ ((0 << 15) | (1 << 10)), /* IOMMU_WRITE not supported, use read/write */
+ ((0 << 15) | (1 << 10)), /* IOMMU_READ | IOMMU_WRITE */
+};
+static const sysmmu_pte_t SYSMMU_V5_LV1_PROT[] = {
+ (0 << 4), /* no access */
+ (1 << 4), /* IOMMU_READ only */
+ (2 << 4), /* IOMMU_WRITE only */
+ (3 << 4), /* IOMMU_READ | IOMMU_WRITE */
+};
+
+static const sysmmu_pte_t *LV2_PROT;
+static const sysmmu_pte_t SYSMMU_LV2_PROT[] = {
+ ((0 << 9) | (0 << 4)), /* no access */
+ ((1 << 9) | (1 << 4)), /* IOMMU_READ only */
+ ((0 << 9) | (1 << 4)), /* IOMMU_WRITE not supported, use read/write */
+ ((0 << 9) | (1 << 4)), /* IOMMU_READ | IOMMU_WRITE */
+};
+static const sysmmu_pte_t SYSMMU_V5_LV2_PROT[] = {
+ (0 << 2), /* no access */
+ (1 << 2), /* IOMMU_READ only */
+ (2 << 2), /* IOMMU_WRITE only */
+ (3 << 2), /* IOMMU_READ | IOMMU_WRITE */
+};
+
+#define SYSMMU_SUPPORTED_PROT_BITS (IOMMU_READ | IOMMU_WRITE)
+
+#define sect_to_phys(ent) (((phys_addr_t) ent) << PG_ENT_SHIFT)
+#define section_phys(sent) (sect_to_phys(*(sent)) & SECT_MASK)
+#define section_offs(iova) (iova & (SECT_SIZE - 1))
+#define lpage_phys(pent) (sect_to_phys(*(pent)) & LPAGE_MASK)
+#define lpage_offs(iova) (iova & (LPAGE_SIZE - 1))
+#define spage_phys(pent) (sect_to_phys(*(pent)) & SPAGE_MASK)
+#define spage_offs(iova) (iova & (SPAGE_SIZE - 1))
+
+#define NUM_LV1ENTRIES 4096
+#define NUM_LV2ENTRIES (SECT_SIZE / SPAGE_SIZE)
+
+static u32 lv1ent_offset(sysmmu_iova_t iova)
+{
+ return iova >> SECT_ORDER;
+}
+
+static u32 lv2ent_offset(sysmmu_iova_t iova)
+{
+ return (iova >> SPAGE_ORDER) & (NUM_LV2ENTRIES - 1);
+}
+
+#define LV1TABLE_SIZE (NUM_LV1ENTRIES * sizeof(sysmmu_pte_t))
+#define LV2TABLE_SIZE (NUM_LV2ENTRIES * sizeof(sysmmu_pte_t))
+
+#define SPAGES_PER_LPAGE (LPAGE_SIZE / SPAGE_SIZE)
+#define lv2table_base(sent) (sect_to_phys(*(sent) & 0xFFFFFFC0))
+
+#define mk_lv1ent_sect(pa, prot) ((pa >> PG_ENT_SHIFT) | LV1_PROT[prot] | 2)
+#define mk_lv1ent_page(pa) ((pa >> PG_ENT_SHIFT) | 1)
+#define mk_lv2ent_lpage(pa, prot) ((pa >> PG_ENT_SHIFT) | LV2_PROT[prot] | 1)
+#define mk_lv2ent_spage(pa, prot) ((pa >> PG_ENT_SHIFT) | LV2_PROT[prot] | 2)
+
+#define CTRL_ENABLE 0x5
+#define CTRL_BLOCK 0x7
+#define CTRL_DISABLE 0x0
+
+#define CFG_LRU 0x1
+#define CFG_EAP (1 << 2)
+#define CFG_QOS(n) ((n & 0xF) << 7)
+#define CFG_ACGEN (1 << 24) /* System MMU 3.3 only */
+#define CFG_SYSSEL (1 << 22) /* System MMU 3.2 only */
+#define CFG_FLPDCACHE (1 << 20) /* System MMU 3.2+ only */
+
+/* common registers */
+#define REG_MMU_CTRL 0x000
+#define REG_MMU_CFG 0x004
+#define REG_MMU_STATUS 0x008
+#define REG_MMU_VERSION 0x034
+
+#define MMU_MAJ_VER(val) ((val) >> 7)
+#define MMU_MIN_VER(val) ((val) & 0x7F)
+#define MMU_RAW_VER(reg) (((reg) >> 21) & ((1 << 11) - 1)) /* 11 bits */
+
+#define MAKE_MMU_VER(maj, min) ((((maj) & 0xF) << 7) | ((min) & 0x7F))
+
+/* v1.x - v3.x registers */
+#define REG_MMU_FLUSH 0x00C
+#define REG_MMU_FLUSH_ENTRY 0x010
+#define REG_PT_BASE_ADDR 0x014
+#define REG_INT_STATUS 0x018
+#define REG_INT_CLEAR 0x01C
+
+#define REG_PAGE_FAULT_ADDR 0x024
+#define REG_AW_FAULT_ADDR 0x028
+#define REG_AR_FAULT_ADDR 0x02C
+#define REG_DEFAULT_SLAVE_ADDR 0x030
+
+/* v5.x registers */
+#define REG_V5_PT_BASE_PFN 0x00C
+#define REG_V5_MMU_FLUSH_ALL 0x010
+#define REG_V5_MMU_FLUSH_ENTRY 0x014
+#define REG_V5_MMU_FLUSH_RANGE 0x018
+#define REG_V5_MMU_FLUSH_START 0x020
+#define REG_V5_MMU_FLUSH_END 0x024
+#define REG_V5_INT_STATUS 0x060
+#define REG_V5_INT_CLEAR 0x064
+#define REG_V5_FAULT_AR_VA 0x070
+#define REG_V5_FAULT_AW_VA 0x080
+
+#define has_sysmmu(dev) (dev_iommu_priv_get(dev) != NULL)
+
+static struct device *dma_dev;
+static struct kmem_cache *lv2table_kmem_cache;
+static sysmmu_pte_t *zero_lv2_table;
+#define ZERO_LV2LINK mk_lv1ent_page(virt_to_phys(zero_lv2_table))
+
+static sysmmu_pte_t *section_entry(sysmmu_pte_t *pgtable, sysmmu_iova_t iova)
+{
+ return pgtable + lv1ent_offset(iova);
+}
+
+static sysmmu_pte_t *page_entry(sysmmu_pte_t *sent, sysmmu_iova_t iova)
+{
+ return (sysmmu_pte_t *)phys_to_virt(
+ lv2table_base(sent)) + lv2ent_offset(iova);
+}
+
+/*
+ * IOMMU fault information register
+ */
+struct sysmmu_fault_info {
+ unsigned int bit; /* bit number in STATUS register */
+ unsigned short addr_reg; /* register to read VA fault address */
+ const char *name; /* human readable fault name */
+ unsigned int type; /* fault type for report_iommu_fault */
+};
+
+static const struct sysmmu_fault_info sysmmu_faults[] = {
+ { 0, REG_PAGE_FAULT_ADDR, "PAGE", IOMMU_FAULT_READ },
+ { 1, REG_AR_FAULT_ADDR, "AR MULTI-HIT", IOMMU_FAULT_READ },
+ { 2, REG_AW_FAULT_ADDR, "AW MULTI-HIT", IOMMU_FAULT_WRITE },
+ { 3, REG_DEFAULT_SLAVE_ADDR, "BUS ERROR", IOMMU_FAULT_READ },
+ { 4, REG_AR_FAULT_ADDR, "AR SECURITY PROTECTION", IOMMU_FAULT_READ },
+ { 5, REG_AR_FAULT_ADDR, "AR ACCESS PROTECTION", IOMMU_FAULT_READ },
+ { 6, REG_AW_FAULT_ADDR, "AW SECURITY PROTECTION", IOMMU_FAULT_WRITE },
+ { 7, REG_AW_FAULT_ADDR, "AW ACCESS PROTECTION", IOMMU_FAULT_WRITE },
+};
+
+static const struct sysmmu_fault_info sysmmu_v5_faults[] = {
+ { 0, REG_V5_FAULT_AR_VA, "AR PTW", IOMMU_FAULT_READ },
+ { 1, REG_V5_FAULT_AR_VA, "AR PAGE", IOMMU_FAULT_READ },
+ { 2, REG_V5_FAULT_AR_VA, "AR MULTI-HIT", IOMMU_FAULT_READ },
+ { 3, REG_V5_FAULT_AR_VA, "AR ACCESS PROTECTION", IOMMU_FAULT_READ },
+ { 4, REG_V5_FAULT_AR_VA, "AR SECURITY PROTECTION", IOMMU_FAULT_READ },
+ { 16, REG_V5_FAULT_AW_VA, "AW PTW", IOMMU_FAULT_WRITE },
+ { 17, REG_V5_FAULT_AW_VA, "AW PAGE", IOMMU_FAULT_WRITE },
+ { 18, REG_V5_FAULT_AW_VA, "AW MULTI-HIT", IOMMU_FAULT_WRITE },
+ { 19, REG_V5_FAULT_AW_VA, "AW ACCESS PROTECTION", IOMMU_FAULT_WRITE },
+ { 20, REG_V5_FAULT_AW_VA, "AW SECURITY PROTECTION", IOMMU_FAULT_WRITE },
+};
+
+/*
+ * This structure is attached to dev->iommu->priv of the master device
+ * on device add, contains a list of SYSMMU controllers defined by device tree,
+ * which are bound to given master device. It is usually referenced by 'owner'
+ * pointer.
+*/
+struct exynos_iommu_owner {
+ struct list_head controllers; /* list of sysmmu_drvdata.owner_node */
+ struct iommu_domain *domain; /* domain this device is attached */
+ struct mutex rpm_lock; /* for runtime pm of all sysmmus */
+};
+
+/*
+ * This structure exynos specific generalization of struct iommu_domain.
+ * It contains list of SYSMMU controllers from all master devices, which has
+ * been attached to this domain and page tables of IO address space defined by
+ * it. It is usually referenced by 'domain' pointer.
+ */
+struct exynos_iommu_domain {
+ struct list_head clients; /* list of sysmmu_drvdata.domain_node */
+ sysmmu_pte_t *pgtable; /* lv1 page table, 16KB */
+ short *lv2entcnt; /* free lv2 entry counter for each section */
+ spinlock_t lock; /* lock for modyfying list of clients */
+ spinlock_t pgtablelock; /* lock for modifying page table @ pgtable */
+ struct iommu_domain domain; /* generic domain data structure */
+};
+
+/*
+ * This structure hold all data of a single SYSMMU controller, this includes
+ * hw resources like registers and clocks, pointers and list nodes to connect
+ * it to all other structures, internal state and parameters read from device
+ * tree. It is usually referenced by 'data' pointer.
+ */
+struct sysmmu_drvdata {
+ struct device *sysmmu; /* SYSMMU controller device */
+ struct device *master; /* master device (owner) */
+ struct device_link *link; /* runtime PM link to master */
+ void __iomem *sfrbase; /* our registers */
+ struct clk *clk; /* SYSMMU's clock */
+ struct clk *aclk; /* SYSMMU's aclk clock */
+ struct clk *pclk; /* SYSMMU's pclk clock */
+ struct clk *clk_master; /* master's device clock */
+ spinlock_t lock; /* lock for modyfying state */
+ bool active; /* current status */
+ struct exynos_iommu_domain *domain; /* domain we belong to */
+ struct list_head domain_node; /* node for domain clients list */
+ struct list_head owner_node; /* node for owner controllers list */
+ phys_addr_t pgtable; /* assigned page table structure */
+ unsigned int version; /* our version */
+
+ struct iommu_device iommu; /* IOMMU core handle */
+};
+
+static struct exynos_iommu_domain *to_exynos_domain(struct iommu_domain *dom)
+{
+ return container_of(dom, struct exynos_iommu_domain, domain);
+}
+
+static void sysmmu_unblock(struct sysmmu_drvdata *data)
+{
+ writel(CTRL_ENABLE, data->sfrbase + REG_MMU_CTRL);
+}
+
+static bool sysmmu_block(struct sysmmu_drvdata *data)
+{
+ int i = 120;
+
+ writel(CTRL_BLOCK, data->sfrbase + REG_MMU_CTRL);
+ while ((i > 0) && !(readl(data->sfrbase + REG_MMU_STATUS) & 1))
+ --i;
+
+ if (!(readl(data->sfrbase + REG_MMU_STATUS) & 1)) {
+ sysmmu_unblock(data);
+ return false;
+ }
+
+ return true;
+}
+
+static void __sysmmu_tlb_invalidate(struct sysmmu_drvdata *data)
+{
+ if (MMU_MAJ_VER(data->version) < 5)
+ writel(0x1, data->sfrbase + REG_MMU_FLUSH);
+ else
+ writel(0x1, data->sfrbase + REG_V5_MMU_FLUSH_ALL);
+}
+
+static void __sysmmu_tlb_invalidate_entry(struct sysmmu_drvdata *data,
+ sysmmu_iova_t iova, unsigned int num_inv)
+{
+ unsigned int i;
+
+ if (MMU_MAJ_VER(data->version) < 5) {
+ for (i = 0; i < num_inv; i++) {
+ writel((iova & SPAGE_MASK) | 1,
+ data->sfrbase + REG_MMU_FLUSH_ENTRY);
+ iova += SPAGE_SIZE;
+ }
+ } else {
+ if (num_inv == 1) {
+ writel((iova & SPAGE_MASK) | 1,
+ data->sfrbase + REG_V5_MMU_FLUSH_ENTRY);
+ } else {
+ writel((iova & SPAGE_MASK),
+ data->sfrbase + REG_V5_MMU_FLUSH_START);
+ writel((iova & SPAGE_MASK) + (num_inv - 1) * SPAGE_SIZE,
+ data->sfrbase + REG_V5_MMU_FLUSH_END);
+ writel(1, data->sfrbase + REG_V5_MMU_FLUSH_RANGE);
+ }
+ }
+}
+
+static void __sysmmu_set_ptbase(struct sysmmu_drvdata *data, phys_addr_t pgd)
+{
+ if (MMU_MAJ_VER(data->version) < 5)
+ writel(pgd, data->sfrbase + REG_PT_BASE_ADDR);
+ else
+ writel(pgd >> PAGE_SHIFT,
+ data->sfrbase + REG_V5_PT_BASE_PFN);
+
+ __sysmmu_tlb_invalidate(data);
+}
+
+static void __sysmmu_enable_clocks(struct sysmmu_drvdata *data)
+{
+ BUG_ON(clk_prepare_enable(data->clk_master));
+ BUG_ON(clk_prepare_enable(data->clk));
+ BUG_ON(clk_prepare_enable(data->pclk));
+ BUG_ON(clk_prepare_enable(data->aclk));
+}
+
+static void __sysmmu_disable_clocks(struct sysmmu_drvdata *data)
+{
+ clk_disable_unprepare(data->aclk);
+ clk_disable_unprepare(data->pclk);
+ clk_disable_unprepare(data->clk);
+ clk_disable_unprepare(data->clk_master);
+}
+
+static void __sysmmu_get_version(struct sysmmu_drvdata *data)
+{
+ u32 ver;
+
+ __sysmmu_enable_clocks(data);
+
+ ver = readl(data->sfrbase + REG_MMU_VERSION);
+
+ /* controllers on some SoCs don't report proper version */
+ if (ver == 0x80000001u)
+ data->version = MAKE_MMU_VER(1, 0);
+ else
+ data->version = MMU_RAW_VER(ver);
+
+ dev_dbg(data->sysmmu, "hardware version: %d.%d\n",
+ MMU_MAJ_VER(data->version), MMU_MIN_VER(data->version));
+
+ __sysmmu_disable_clocks(data);
+}
+
+static void show_fault_information(struct sysmmu_drvdata *data,
+ const struct sysmmu_fault_info *finfo,
+ sysmmu_iova_t fault_addr)
+{
+ sysmmu_pte_t *ent;
+
+ dev_err(data->sysmmu, "%s: %s FAULT occurred at %#x\n",
+ dev_name(data->master), finfo->name, fault_addr);
+ dev_dbg(data->sysmmu, "Page table base: %pa\n", &data->pgtable);
+ ent = section_entry(phys_to_virt(data->pgtable), fault_addr);
+ dev_dbg(data->sysmmu, "\tLv1 entry: %#x\n", *ent);
+ if (lv1ent_page(ent)) {
+ ent = page_entry(ent, fault_addr);
+ dev_dbg(data->sysmmu, "\t Lv2 entry: %#x\n", *ent);
+ }
+}
+
+static irqreturn_t exynos_sysmmu_irq(int irq, void *dev_id)
+{
+ /* SYSMMU is in blocked state when interrupt occurred. */
+ struct sysmmu_drvdata *data = dev_id;
+ const struct sysmmu_fault_info *finfo;
+ unsigned int i, n, itype;
+ sysmmu_iova_t fault_addr = -1;
+ unsigned short reg_status, reg_clear;
+ int ret = -ENOSYS;
+
+ WARN_ON(!data->active);
+
+ if (MMU_MAJ_VER(data->version) < 5) {
+ reg_status = REG_INT_STATUS;
+ reg_clear = REG_INT_CLEAR;
+ finfo = sysmmu_faults;
+ n = ARRAY_SIZE(sysmmu_faults);
+ } else {
+ reg_status = REG_V5_INT_STATUS;
+ reg_clear = REG_V5_INT_CLEAR;
+ finfo = sysmmu_v5_faults;
+ n = ARRAY_SIZE(sysmmu_v5_faults);
+ }
+
+ spin_lock(&data->lock);
+
+ clk_enable(data->clk_master);
+
+ itype = __ffs(readl(data->sfrbase + reg_status));
+ for (i = 0; i < n; i++, finfo++)
+ if (finfo->bit == itype)
+ break;
+ /* unknown/unsupported fault */
+ BUG_ON(i == n);
+
+ /* print debug message */
+ fault_addr = readl(data->sfrbase + finfo->addr_reg);
+ show_fault_information(data, finfo, fault_addr);
+
+ if (data->domain)
+ ret = report_iommu_fault(&data->domain->domain,
+ data->master, fault_addr, finfo->type);
+ /* fault is not recovered by fault handler */
+ BUG_ON(ret != 0);
+
+ writel(1 << itype, data->sfrbase + reg_clear);
+
+ sysmmu_unblock(data);
+
+ clk_disable(data->clk_master);
+
+ spin_unlock(&data->lock);
+
+ return IRQ_HANDLED;
+}
+
+static void __sysmmu_disable(struct sysmmu_drvdata *data)
+{
+ unsigned long flags;
+
+ clk_enable(data->clk_master);
+
+ spin_lock_irqsave(&data->lock, flags);
+ writel(CTRL_DISABLE, data->sfrbase + REG_MMU_CTRL);
+ writel(0, data->sfrbase + REG_MMU_CFG);
+ data->active = false;
+ spin_unlock_irqrestore(&data->lock, flags);
+
+ __sysmmu_disable_clocks(data);
+}
+
+static void __sysmmu_init_config(struct sysmmu_drvdata *data)
+{
+ unsigned int cfg;
+
+ if (data->version <= MAKE_MMU_VER(3, 1))
+ cfg = CFG_LRU | CFG_QOS(15);
+ else if (data->version <= MAKE_MMU_VER(3, 2))
+ cfg = CFG_LRU | CFG_QOS(15) | CFG_FLPDCACHE | CFG_SYSSEL;
+ else
+ cfg = CFG_QOS(15) | CFG_FLPDCACHE | CFG_ACGEN;
+
+ cfg |= CFG_EAP; /* enable access protection bits check */
+
+ writel(cfg, data->sfrbase + REG_MMU_CFG);
+}
+
+static void __sysmmu_enable(struct sysmmu_drvdata *data)
+{
+ unsigned long flags;
+
+ __sysmmu_enable_clocks(data);
+
+ spin_lock_irqsave(&data->lock, flags);
+ writel(CTRL_BLOCK, data->sfrbase + REG_MMU_CTRL);
+ __sysmmu_init_config(data);
+ __sysmmu_set_ptbase(data, data->pgtable);
+ writel(CTRL_ENABLE, data->sfrbase + REG_MMU_CTRL);
+ data->active = true;
+ spin_unlock_irqrestore(&data->lock, flags);
+
+ /*
+ * SYSMMU driver keeps master's clock enabled only for the short
+ * time, while accessing the registers. For performing address
+ * translation during DMA transaction it relies on the client
+ * driver to enable it.
+ */
+ clk_disable(data->clk_master);
+}
+
+static void sysmmu_tlb_invalidate_flpdcache(struct sysmmu_drvdata *data,
+ sysmmu_iova_t iova)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&data->lock, flags);
+ if (data->active && data->version >= MAKE_MMU_VER(3, 3)) {
+ clk_enable(data->clk_master);
+ if (sysmmu_block(data)) {
+ if (data->version >= MAKE_MMU_VER(5, 0))
+ __sysmmu_tlb_invalidate(data);
+ else
+ __sysmmu_tlb_invalidate_entry(data, iova, 1);
+ sysmmu_unblock(data);
+ }
+ clk_disable(data->clk_master);
+ }
+ spin_unlock_irqrestore(&data->lock, flags);
+}
+
+static void sysmmu_tlb_invalidate_entry(struct sysmmu_drvdata *data,
+ sysmmu_iova_t iova, size_t size)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&data->lock, flags);
+ if (data->active) {
+ unsigned int num_inv = 1;
+
+ clk_enable(data->clk_master);
+
+ /*
+ * L2TLB invalidation required
+ * 4KB page: 1 invalidation
+ * 64KB page: 16 invalidations
+ * 1MB page: 64 invalidations
+ * because it is set-associative TLB
+ * with 8-way and 64 sets.
+ * 1MB page can be cached in one of all sets.
+ * 64KB page can be one of 16 consecutive sets.
+ */
+ if (MMU_MAJ_VER(data->version) == 2)
+ num_inv = min_t(unsigned int, size / PAGE_SIZE, 64);
+
+ if (sysmmu_block(data)) {
+ __sysmmu_tlb_invalidate_entry(data, iova, num_inv);
+ sysmmu_unblock(data);
+ }
+ clk_disable(data->clk_master);
+ }
+ spin_unlock_irqrestore(&data->lock, flags);
+}
+
+static const struct iommu_ops exynos_iommu_ops;
+
+static int exynos_sysmmu_probe(struct platform_device *pdev)
+{
+ int irq, ret;
+ struct device *dev = &pdev->dev;
+ struct sysmmu_drvdata *data;
+ struct resource *res;
+
+ data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ data->sfrbase = devm_ioremap_resource(dev, res);
+ if (IS_ERR(data->sfrbase))
+ return PTR_ERR(data->sfrbase);
+
+ irq = platform_get_irq(pdev, 0);
+ if (irq <= 0)
+ return irq;
+
+ ret = devm_request_irq(dev, irq, exynos_sysmmu_irq, 0,
+ dev_name(dev), data);
+ if (ret) {
+ dev_err(dev, "Unabled to register handler of irq %d\n", irq);
+ return ret;
+ }
+
+ data->clk = devm_clk_get(dev, "sysmmu");
+ if (PTR_ERR(data->clk) == -ENOENT)
+ data->clk = NULL;
+ else if (IS_ERR(data->clk))
+ return PTR_ERR(data->clk);
+
+ data->aclk = devm_clk_get(dev, "aclk");
+ if (PTR_ERR(data->aclk) == -ENOENT)
+ data->aclk = NULL;
+ else if (IS_ERR(data->aclk))
+ return PTR_ERR(data->aclk);
+
+ data->pclk = devm_clk_get(dev, "pclk");
+ if (PTR_ERR(data->pclk) == -ENOENT)
+ data->pclk = NULL;
+ else if (IS_ERR(data->pclk))
+ return PTR_ERR(data->pclk);
+
+ if (!data->clk && (!data->aclk || !data->pclk)) {
+ dev_err(dev, "Failed to get device clock(s)!\n");
+ return -ENOSYS;
+ }
+
+ data->clk_master = devm_clk_get(dev, "master");
+ if (PTR_ERR(data->clk_master) == -ENOENT)
+ data->clk_master = NULL;
+ else if (IS_ERR(data->clk_master))
+ return PTR_ERR(data->clk_master);
+
+ data->sysmmu = dev;
+ spin_lock_init(&data->lock);
+
+ ret = iommu_device_sysfs_add(&data->iommu, &pdev->dev, NULL,
+ dev_name(data->sysmmu));
+ if (ret)
+ return ret;
+
+ iommu_device_set_ops(&data->iommu, &exynos_iommu_ops);
+ iommu_device_set_fwnode(&data->iommu, &dev->of_node->fwnode);
+
+ ret = iommu_device_register(&data->iommu);
+ if (ret)
+ goto err_iommu_register;
+
+ platform_set_drvdata(pdev, data);
+
+ __sysmmu_get_version(data);
+ if (PG_ENT_SHIFT < 0) {
+ if (MMU_MAJ_VER(data->version) < 5) {
+ PG_ENT_SHIFT = SYSMMU_PG_ENT_SHIFT;
+ LV1_PROT = SYSMMU_LV1_PROT;
+ LV2_PROT = SYSMMU_LV2_PROT;
+ } else {
+ PG_ENT_SHIFT = SYSMMU_V5_PG_ENT_SHIFT;
+ LV1_PROT = SYSMMU_V5_LV1_PROT;
+ LV2_PROT = SYSMMU_V5_LV2_PROT;
+ }
+ }
+
+ /*
+ * use the first registered sysmmu device for performing
+ * dma mapping operations on iommu page tables (cpu cache flush)
+ */
+ if (!dma_dev)
+ dma_dev = &pdev->dev;
+
+ pm_runtime_enable(dev);
+
+ return 0;
+
+err_iommu_register:
+ iommu_device_sysfs_remove(&data->iommu);
+ return ret;
+}
+
+static int __maybe_unused exynos_sysmmu_suspend(struct device *dev)
+{
+ struct sysmmu_drvdata *data = dev_get_drvdata(dev);
+ struct device *master = data->master;
+
+ if (master) {
+ struct exynos_iommu_owner *owner = dev_iommu_priv_get(master);
+
+ mutex_lock(&owner->rpm_lock);
+ if (data->domain) {
+ dev_dbg(data->sysmmu, "saving state\n");
+ __sysmmu_disable(data);
+ }
+ mutex_unlock(&owner->rpm_lock);
+ }
+ return 0;
+}
+
+static int __maybe_unused exynos_sysmmu_resume(struct device *dev)
+{
+ struct sysmmu_drvdata *data = dev_get_drvdata(dev);
+ struct device *master = data->master;
+
+ if (master) {
+ struct exynos_iommu_owner *owner = dev_iommu_priv_get(master);
+
+ mutex_lock(&owner->rpm_lock);
+ if (data->domain) {
+ dev_dbg(data->sysmmu, "restoring state\n");
+ __sysmmu_enable(data);
+ }
+ mutex_unlock(&owner->rpm_lock);
+ }
+ return 0;
+}
+
+static const struct dev_pm_ops sysmmu_pm_ops = {
+ SET_RUNTIME_PM_OPS(exynos_sysmmu_suspend, exynos_sysmmu_resume, NULL)
+ SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend,
+ pm_runtime_force_resume)
+};
+
+static const struct of_device_id sysmmu_of_match[] = {
+ { .compatible = "samsung,exynos-sysmmu", },
+ { },
+};
+
+static struct platform_driver exynos_sysmmu_driver __refdata = {
+ .probe = exynos_sysmmu_probe,
+ .driver = {
+ .name = "exynos-sysmmu",
+ .of_match_table = sysmmu_of_match,
+ .pm = &sysmmu_pm_ops,
+ .suppress_bind_attrs = true,
+ }
+};
+
+static inline void exynos_iommu_set_pte(sysmmu_pte_t *ent, sysmmu_pte_t val)
+{
+ dma_sync_single_for_cpu(dma_dev, virt_to_phys(ent), sizeof(*ent),
+ DMA_TO_DEVICE);
+ *ent = cpu_to_le32(val);
+ dma_sync_single_for_device(dma_dev, virt_to_phys(ent), sizeof(*ent),
+ DMA_TO_DEVICE);
+}
+
+static struct iommu_domain *exynos_iommu_domain_alloc(unsigned type)
+{
+ struct exynos_iommu_domain *domain;
+ dma_addr_t handle;
+ int i;
+
+ /* Check if correct PTE offsets are initialized */
+ BUG_ON(PG_ENT_SHIFT < 0 || !dma_dev);
+
+ domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+ if (!domain)
+ return NULL;
+
+ if (type == IOMMU_DOMAIN_DMA) {
+ if (iommu_get_dma_cookie(&domain->domain) != 0)
+ goto err_pgtable;
+ } else if (type != IOMMU_DOMAIN_UNMANAGED) {
+ goto err_pgtable;
+ }
+
+ domain->pgtable = (sysmmu_pte_t *)__get_free_pages(GFP_KERNEL, 2);
+ if (!domain->pgtable)
+ goto err_dma_cookie;
+
+ domain->lv2entcnt = (short *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
+ if (!domain->lv2entcnt)
+ goto err_counter;
+
+ /* Workaround for System MMU v3.3 to prevent caching 1MiB mapping */
+ for (i = 0; i < NUM_LV1ENTRIES; i++)
+ domain->pgtable[i] = ZERO_LV2LINK;
+
+ handle = dma_map_single(dma_dev, domain->pgtable, LV1TABLE_SIZE,
+ DMA_TO_DEVICE);
+ /* For mapping page table entries we rely on dma == phys */
+ BUG_ON(handle != virt_to_phys(domain->pgtable));
+ if (dma_mapping_error(dma_dev, handle))
+ goto err_lv2ent;
+
+ spin_lock_init(&domain->lock);
+ spin_lock_init(&domain->pgtablelock);
+ INIT_LIST_HEAD(&domain->clients);
+
+ domain->domain.geometry.aperture_start = 0;
+ domain->domain.geometry.aperture_end = ~0UL;
+ domain->domain.geometry.force_aperture = true;
+
+ return &domain->domain;
+
+err_lv2ent:
+ free_pages((unsigned long)domain->lv2entcnt, 1);
+err_counter:
+ free_pages((unsigned long)domain->pgtable, 2);
+err_dma_cookie:
+ if (type == IOMMU_DOMAIN_DMA)
+ iommu_put_dma_cookie(&domain->domain);
+err_pgtable:
+ kfree(domain);
+ return NULL;
+}
+
+static void exynos_iommu_domain_free(struct iommu_domain *iommu_domain)
+{
+ struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
+ struct sysmmu_drvdata *data, *next;
+ unsigned long flags;
+ int i;
+
+ WARN_ON(!list_empty(&domain->clients));
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ list_for_each_entry_safe(data, next, &domain->clients, domain_node) {
+ spin_lock(&data->lock);
+ __sysmmu_disable(data);
+ data->pgtable = 0;
+ data->domain = NULL;
+ list_del_init(&data->domain_node);
+ spin_unlock(&data->lock);
+ }
+
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ if (iommu_domain->type == IOMMU_DOMAIN_DMA)
+ iommu_put_dma_cookie(iommu_domain);
+
+ dma_unmap_single(dma_dev, virt_to_phys(domain->pgtable), LV1TABLE_SIZE,
+ DMA_TO_DEVICE);
+
+ for (i = 0; i < NUM_LV1ENTRIES; i++)
+ if (lv1ent_page(domain->pgtable + i)) {
+ phys_addr_t base = lv2table_base(domain->pgtable + i);
+
+ dma_unmap_single(dma_dev, base, LV2TABLE_SIZE,
+ DMA_TO_DEVICE);
+ kmem_cache_free(lv2table_kmem_cache,
+ phys_to_virt(base));
+ }
+
+ free_pages((unsigned long)domain->pgtable, 2);
+ free_pages((unsigned long)domain->lv2entcnt, 1);
+ kfree(domain);
+}
+
+static void exynos_iommu_detach_device(struct iommu_domain *iommu_domain,
+ struct device *dev)
+{
+ struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
+ struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
+ phys_addr_t pagetable = virt_to_phys(domain->pgtable);
+ struct sysmmu_drvdata *data, *next;
+ unsigned long flags;
+
+ if (!has_sysmmu(dev) || owner->domain != iommu_domain)
+ return;
+
+ mutex_lock(&owner->rpm_lock);
+
+ list_for_each_entry(data, &owner->controllers, owner_node) {
+ pm_runtime_get_noresume(data->sysmmu);
+ if (pm_runtime_active(data->sysmmu))
+ __sysmmu_disable(data);
+ pm_runtime_put(data->sysmmu);
+ }
+
+ spin_lock_irqsave(&domain->lock, flags);
+ list_for_each_entry_safe(data, next, &domain->clients, domain_node) {
+ spin_lock(&data->lock);
+ data->pgtable = 0;
+ data->domain = NULL;
+ list_del_init(&data->domain_node);
+ spin_unlock(&data->lock);
+ }
+ owner->domain = NULL;
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ mutex_unlock(&owner->rpm_lock);
+
+ dev_dbg(dev, "%s: Detached IOMMU with pgtable %pa\n", __func__,
+ &pagetable);
+}
+
+static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain,
+ struct device *dev)
+{
+ struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
+ struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
+ struct sysmmu_drvdata *data;
+ phys_addr_t pagetable = virt_to_phys(domain->pgtable);
+ unsigned long flags;
+
+ if (!has_sysmmu(dev))
+ return -ENODEV;
+
+ if (owner->domain)
+ exynos_iommu_detach_device(owner->domain, dev);
+
+ mutex_lock(&owner->rpm_lock);
+
+ spin_lock_irqsave(&domain->lock, flags);
+ list_for_each_entry(data, &owner->controllers, owner_node) {
+ spin_lock(&data->lock);
+ data->pgtable = pagetable;
+ data->domain = domain;
+ list_add_tail(&data->domain_node, &domain->clients);
+ spin_unlock(&data->lock);
+ }
+ owner->domain = iommu_domain;
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ list_for_each_entry(data, &owner->controllers, owner_node) {
+ pm_runtime_get_noresume(data->sysmmu);
+ if (pm_runtime_active(data->sysmmu))
+ __sysmmu_enable(data);
+ pm_runtime_put(data->sysmmu);
+ }
+
+ mutex_unlock(&owner->rpm_lock);
+
+ dev_dbg(dev, "%s: Attached IOMMU with pgtable %pa\n", __func__,
+ &pagetable);
+
+ return 0;
+}
+
+static sysmmu_pte_t *alloc_lv2entry(struct exynos_iommu_domain *domain,
+ sysmmu_pte_t *sent, sysmmu_iova_t iova, short *pgcounter)
+{
+ if (lv1ent_section(sent)) {
+ WARN(1, "Trying mapping on %#08x mapped with 1MiB page", iova);
+ return ERR_PTR(-EADDRINUSE);
+ }
+
+ if (lv1ent_fault(sent)) {
+ dma_addr_t handle;
+ sysmmu_pte_t *pent;
+ bool need_flush_flpd_cache = lv1ent_zero(sent);
+
+ pent = kmem_cache_zalloc(lv2table_kmem_cache, GFP_ATOMIC);
+ BUG_ON((uintptr_t)pent & (LV2TABLE_SIZE - 1));
+ if (!pent)
+ return ERR_PTR(-ENOMEM);
+
+ exynos_iommu_set_pte(sent, mk_lv1ent_page(virt_to_phys(pent)));
+ kmemleak_ignore(pent);
+ *pgcounter = NUM_LV2ENTRIES;
+ handle = dma_map_single(dma_dev, pent, LV2TABLE_SIZE,
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(dma_dev, handle)) {
+ kmem_cache_free(lv2table_kmem_cache, pent);
+ return ERR_PTR(-EADDRINUSE);
+ }
+
+ /*
+ * If pre-fetched SLPD is a faulty SLPD in zero_l2_table,
+ * FLPD cache may cache the address of zero_l2_table. This
+ * function replaces the zero_l2_table with new L2 page table
+ * to write valid mappings.
+ * Accessing the valid area may cause page fault since FLPD
+ * cache may still cache zero_l2_table for the valid area
+ * instead of new L2 page table that has the mapping
+ * information of the valid area.
+ * Thus any replacement of zero_l2_table with other valid L2
+ * page table must involve FLPD cache invalidation for System
+ * MMU v3.3.
+ * FLPD cache invalidation is performed with TLB invalidation
+ * by VPN without blocking. It is safe to invalidate TLB without
+ * blocking because the target address of TLB invalidation is
+ * not currently mapped.
+ */
+ if (need_flush_flpd_cache) {
+ struct sysmmu_drvdata *data;
+
+ spin_lock(&domain->lock);
+ list_for_each_entry(data, &domain->clients, domain_node)
+ sysmmu_tlb_invalidate_flpdcache(data, iova);
+ spin_unlock(&domain->lock);
+ }
+ }
+
+ return page_entry(sent, iova);
+}
+
+static int lv1set_section(struct exynos_iommu_domain *domain,
+ sysmmu_pte_t *sent, sysmmu_iova_t iova,
+ phys_addr_t paddr, int prot, short *pgcnt)
+{
+ if (lv1ent_section(sent)) {
+ WARN(1, "Trying mapping on 1MiB@%#08x that is mapped",
+ iova);
+ return -EADDRINUSE;
+ }
+
+ if (lv1ent_page(sent)) {
+ if (*pgcnt != NUM_LV2ENTRIES) {
+ WARN(1, "Trying mapping on 1MiB@%#08x that is mapped",
+ iova);
+ return -EADDRINUSE;
+ }
+
+ kmem_cache_free(lv2table_kmem_cache, page_entry(sent, 0));
+ *pgcnt = 0;
+ }
+
+ exynos_iommu_set_pte(sent, mk_lv1ent_sect(paddr, prot));
+
+ spin_lock(&domain->lock);
+ if (lv1ent_page_zero(sent)) {
+ struct sysmmu_drvdata *data;
+ /*
+ * Flushing FLPD cache in System MMU v3.3 that may cache a FLPD
+ * entry by speculative prefetch of SLPD which has no mapping.
+ */
+ list_for_each_entry(data, &domain->clients, domain_node)
+ sysmmu_tlb_invalidate_flpdcache(data, iova);
+ }
+ spin_unlock(&domain->lock);
+
+ return 0;
+}
+
+static int lv2set_page(sysmmu_pte_t *pent, phys_addr_t paddr, size_t size,
+ int prot, short *pgcnt)
+{
+ if (size == SPAGE_SIZE) {
+ if (WARN_ON(!lv2ent_fault(pent)))
+ return -EADDRINUSE;
+
+ exynos_iommu_set_pte(pent, mk_lv2ent_spage(paddr, prot));
+ *pgcnt -= 1;
+ } else { /* size == LPAGE_SIZE */
+ int i;
+ dma_addr_t pent_base = virt_to_phys(pent);
+
+ dma_sync_single_for_cpu(dma_dev, pent_base,
+ sizeof(*pent) * SPAGES_PER_LPAGE,
+ DMA_TO_DEVICE);
+ for (i = 0; i < SPAGES_PER_LPAGE; i++, pent++) {
+ if (WARN_ON(!lv2ent_fault(pent))) {
+ if (i > 0)
+ memset(pent - i, 0, sizeof(*pent) * i);
+ return -EADDRINUSE;
+ }
+
+ *pent = mk_lv2ent_lpage(paddr, prot);
+ }
+ dma_sync_single_for_device(dma_dev, pent_base,
+ sizeof(*pent) * SPAGES_PER_LPAGE,
+ DMA_TO_DEVICE);
+ *pgcnt -= SPAGES_PER_LPAGE;
+ }
+
+ return 0;
+}
+
+/*
+ * *CAUTION* to the I/O virtual memory managers that support exynos-iommu:
+ *
+ * System MMU v3.x has advanced logic to improve address translation
+ * performance with caching more page table entries by a page table walk.
+ * However, the logic has a bug that while caching faulty page table entries,
+ * System MMU reports page fault if the cached fault entry is hit even though
+ * the fault entry is updated to a valid entry after the entry is cached.
+ * To prevent caching faulty page table entries which may be updated to valid
+ * entries later, the virtual memory manager should care about the workaround
+ * for the problem. The following describes the workaround.
+ *
+ * Any two consecutive I/O virtual address regions must have a hole of 128KiB
+ * at maximum to prevent misbehavior of System MMU 3.x (workaround for h/w bug).
+ *
+ * Precisely, any start address of I/O virtual region must be aligned with
+ * the following sizes for System MMU v3.1 and v3.2.
+ * System MMU v3.1: 128KiB
+ * System MMU v3.2: 256KiB
+ *
+ * Because System MMU v3.3 caches page table entries more aggressively, it needs
+ * more workarounds.
+ * - Any two consecutive I/O virtual regions must have a hole of size larger
+ * than or equal to 128KiB.
+ * - Start address of an I/O virtual region must be aligned by 128KiB.
+ */
+static int exynos_iommu_map(struct iommu_domain *iommu_domain,
+ unsigned long l_iova, phys_addr_t paddr, size_t size,
+ int prot, gfp_t gfp)
+{
+ struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
+ sysmmu_pte_t *entry;
+ sysmmu_iova_t iova = (sysmmu_iova_t)l_iova;
+ unsigned long flags;
+ int ret = -ENOMEM;
+
+ BUG_ON(domain->pgtable == NULL);
+ prot &= SYSMMU_SUPPORTED_PROT_BITS;
+
+ spin_lock_irqsave(&domain->pgtablelock, flags);
+
+ entry = section_entry(domain->pgtable, iova);
+
+ if (size == SECT_SIZE) {
+ ret = lv1set_section(domain, entry, iova, paddr, prot,
+ &domain->lv2entcnt[lv1ent_offset(iova)]);
+ } else {
+ sysmmu_pte_t *pent;
+
+ pent = alloc_lv2entry(domain, entry, iova,
+ &domain->lv2entcnt[lv1ent_offset(iova)]);
+
+ if (IS_ERR(pent))
+ ret = PTR_ERR(pent);
+ else
+ ret = lv2set_page(pent, paddr, size, prot,
+ &domain->lv2entcnt[lv1ent_offset(iova)]);
+ }
+
+ if (ret)
+ pr_err("%s: Failed(%d) to map %#zx bytes @ %#x\n",
+ __func__, ret, size, iova);
+
+ spin_unlock_irqrestore(&domain->pgtablelock, flags);
+
+ return ret;
+}
+
+static void exynos_iommu_tlb_invalidate_entry(struct exynos_iommu_domain *domain,
+ sysmmu_iova_t iova, size_t size)
+{
+ struct sysmmu_drvdata *data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ list_for_each_entry(data, &domain->clients, domain_node)
+ sysmmu_tlb_invalidate_entry(data, iova, size);
+
+ spin_unlock_irqrestore(&domain->lock, flags);
+}
+
+static size_t exynos_iommu_unmap(struct iommu_domain *iommu_domain,
+ unsigned long l_iova, size_t size,
+ struct iommu_iotlb_gather *gather)
+{
+ struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
+ sysmmu_iova_t iova = (sysmmu_iova_t)l_iova;
+ sysmmu_pte_t *ent;
+ size_t err_pgsize;
+ unsigned long flags;
+
+ BUG_ON(domain->pgtable == NULL);
+
+ spin_lock_irqsave(&domain->pgtablelock, flags);
+
+ ent = section_entry(domain->pgtable, iova);
+
+ if (lv1ent_section(ent)) {
+ if (WARN_ON(size < SECT_SIZE)) {
+ err_pgsize = SECT_SIZE;
+ goto err;
+ }
+
+ /* workaround for h/w bug in System MMU v3.3 */
+ exynos_iommu_set_pte(ent, ZERO_LV2LINK);
+ size = SECT_SIZE;
+ goto done;
+ }
+
+ if (unlikely(lv1ent_fault(ent))) {
+ if (size > SECT_SIZE)
+ size = SECT_SIZE;
+ goto done;
+ }
+
+ /* lv1ent_page(sent) == true here */
+
+ ent = page_entry(ent, iova);
+
+ if (unlikely(lv2ent_fault(ent))) {
+ size = SPAGE_SIZE;
+ goto done;
+ }
+
+ if (lv2ent_small(ent)) {
+ exynos_iommu_set_pte(ent, 0);
+ size = SPAGE_SIZE;
+ domain->lv2entcnt[lv1ent_offset(iova)] += 1;
+ goto done;
+ }
+
+ /* lv1ent_large(ent) == true here */
+ if (WARN_ON(size < LPAGE_SIZE)) {
+ err_pgsize = LPAGE_SIZE;
+ goto err;
+ }
+
+ dma_sync_single_for_cpu(dma_dev, virt_to_phys(ent),
+ sizeof(*ent) * SPAGES_PER_LPAGE,
+ DMA_TO_DEVICE);
+ memset(ent, 0, sizeof(*ent) * SPAGES_PER_LPAGE);
+ dma_sync_single_for_device(dma_dev, virt_to_phys(ent),
+ sizeof(*ent) * SPAGES_PER_LPAGE,
+ DMA_TO_DEVICE);
+ size = LPAGE_SIZE;
+ domain->lv2entcnt[lv1ent_offset(iova)] += SPAGES_PER_LPAGE;
+done:
+ spin_unlock_irqrestore(&domain->pgtablelock, flags);
+
+ exynos_iommu_tlb_invalidate_entry(domain, iova, size);
+
+ return size;
+err:
+ spin_unlock_irqrestore(&domain->pgtablelock, flags);
+
+ pr_err("%s: Failed: size(%#zx) @ %#x is smaller than page size %#zx\n",
+ __func__, size, iova, err_pgsize);
+
+ return 0;
+}
+
+static phys_addr_t exynos_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
+ dma_addr_t iova)
+{
+ struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
+ sysmmu_pte_t *entry;
+ unsigned long flags;
+ phys_addr_t phys = 0;
+
+ spin_lock_irqsave(&domain->pgtablelock, flags);
+
+ entry = section_entry(domain->pgtable, iova);
+
+ if (lv1ent_section(entry)) {
+ phys = section_phys(entry) + section_offs(iova);
+ } else if (lv1ent_page(entry)) {
+ entry = page_entry(entry, iova);
+
+ if (lv2ent_large(entry))
+ phys = lpage_phys(entry) + lpage_offs(iova);
+ else if (lv2ent_small(entry))
+ phys = spage_phys(entry) + spage_offs(iova);
+ }
+
+ spin_unlock_irqrestore(&domain->pgtablelock, flags);
+
+ return phys;
+}
+
+static struct iommu_device *exynos_iommu_probe_device(struct device *dev)
+{
+ struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
+ struct sysmmu_drvdata *data;
+
+ if (!has_sysmmu(dev))
+ return ERR_PTR(-ENODEV);
+
+ list_for_each_entry(data, &owner->controllers, owner_node) {
+ /*
+ * SYSMMU will be runtime activated via device link
+ * (dependency) to its master device, so there are no
+ * direct calls to pm_runtime_get/put in this driver.
+ */
+ data->link = device_link_add(dev, data->sysmmu,
+ DL_FLAG_STATELESS |
+ DL_FLAG_PM_RUNTIME);
+ }
+
+ /* There is always at least one entry, see exynos_iommu_of_xlate() */
+ data = list_first_entry(&owner->controllers,
+ struct sysmmu_drvdata, owner_node);
+
+ return &data->iommu;
+}
+
+static void exynos_iommu_release_device(struct device *dev)
+{
+ struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
+ struct sysmmu_drvdata *data;
+
+ if (!has_sysmmu(dev))
+ return;
+
+ if (owner->domain) {
+ struct iommu_group *group = iommu_group_get(dev);
+
+ if (group) {
+ WARN_ON(owner->domain !=
+ iommu_group_default_domain(group));
+ exynos_iommu_detach_device(owner->domain, dev);
+ iommu_group_put(group);
+ }
+ }
+
+ list_for_each_entry(data, &owner->controllers, owner_node)
+ device_link_del(data->link);
+}
+
+static int exynos_iommu_of_xlate(struct device *dev,
+ struct of_phandle_args *spec)
+{
+ struct platform_device *sysmmu = of_find_device_by_node(spec->np);
+ struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
+ struct sysmmu_drvdata *data, *entry;
+
+ if (!sysmmu)
+ return -ENODEV;
+
+ data = platform_get_drvdata(sysmmu);
+ if (!data) {
+ put_device(&sysmmu->dev);
+ return -ENODEV;
+ }
+
+ if (!owner) {
+ owner = kzalloc(sizeof(*owner), GFP_KERNEL);
+ if (!owner) {
+ put_device(&sysmmu->dev);
+ return -ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&owner->controllers);
+ mutex_init(&owner->rpm_lock);
+ dev_iommu_priv_set(dev, owner);
+ }
+
+ list_for_each_entry(entry, &owner->controllers, owner_node)
+ if (entry == data)
+ return 0;
+
+ list_add_tail(&data->owner_node, &owner->controllers);
+ data->master = dev;
+
+ return 0;
+}
+
+static const struct iommu_ops exynos_iommu_ops = {
+ .domain_alloc = exynos_iommu_domain_alloc,
+ .domain_free = exynos_iommu_domain_free,
+ .attach_dev = exynos_iommu_attach_device,
+ .detach_dev = exynos_iommu_detach_device,
+ .map = exynos_iommu_map,
+ .unmap = exynos_iommu_unmap,
+ .iova_to_phys = exynos_iommu_iova_to_phys,
+ .device_group = generic_device_group,
+ .probe_device = exynos_iommu_probe_device,
+ .release_device = exynos_iommu_release_device,
+ .pgsize_bitmap = SECT_SIZE | LPAGE_SIZE | SPAGE_SIZE,
+ .of_xlate = exynos_iommu_of_xlate,
+};
+
+static int __init exynos_iommu_init(void)
+{
+ struct device_node *np;
+ int ret;
+
+ np = of_find_matching_node(NULL, sysmmu_of_match);
+ if (!np)
+ return 0;
+
+ of_node_put(np);
+
+ lv2table_kmem_cache = kmem_cache_create("exynos-iommu-lv2table",
+ LV2TABLE_SIZE, LV2TABLE_SIZE, 0, NULL);
+ if (!lv2table_kmem_cache) {
+ pr_err("%s: Failed to create kmem cache\n", __func__);
+ return -ENOMEM;
+ }
+
+ ret = platform_driver_register(&exynos_sysmmu_driver);
+ if (ret) {
+ pr_err("%s: Failed to register driver\n", __func__);
+ goto err_reg_driver;
+ }
+
+ zero_lv2_table = kmem_cache_zalloc(lv2table_kmem_cache, GFP_KERNEL);
+ if (zero_lv2_table == NULL) {
+ pr_err("%s: Failed to allocate zero level2 page table\n",
+ __func__);
+ ret = -ENOMEM;
+ goto err_zero_lv2;
+ }
+
+ ret = bus_set_iommu(&platform_bus_type, &exynos_iommu_ops);
+ if (ret) {
+ pr_err("%s: Failed to register exynos-iommu driver.\n",
+ __func__);
+ goto err_set_iommu;
+ }
+
+ return 0;
+err_set_iommu:
+ kmem_cache_free(lv2table_kmem_cache, zero_lv2_table);
+err_zero_lv2:
+ platform_driver_unregister(&exynos_sysmmu_driver);
+err_reg_driver:
+ kmem_cache_destroy(lv2table_kmem_cache);
+ return ret;
+}
+core_initcall(exynos_iommu_init);
diff --git a/drivers/iommu/fsl_pamu.c b/drivers/iommu/fsl_pamu.c
new file mode 100644
index 000000000..25689bdf8
--- /dev/null
+++ b/drivers/iommu/fsl_pamu.c
@@ -0,0 +1,1271 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright (C) 2013 Freescale Semiconductor, Inc.
+ */
+
+#define pr_fmt(fmt) "fsl-pamu: %s: " fmt, __func__
+
+#include "fsl_pamu.h"
+
+#include <linux/fsl/guts.h>
+#include <linux/interrupt.h>
+#include <linux/genalloc.h>
+
+#include <asm/mpc85xx.h>
+
+/* define indexes for each operation mapping scenario */
+#define OMI_QMAN 0x00
+#define OMI_FMAN 0x01
+#define OMI_QMAN_PRIV 0x02
+#define OMI_CAAM 0x03
+
+#define make64(high, low) (((u64)(high) << 32) | (low))
+
+struct pamu_isr_data {
+ void __iomem *pamu_reg_base; /* Base address of PAMU regs */
+ unsigned int count; /* The number of PAMUs */
+};
+
+static struct paace *ppaact;
+static struct paace *spaact;
+
+static bool probed; /* Has PAMU been probed? */
+
+/*
+ * Table for matching compatible strings, for device tree
+ * guts node, for QorIQ SOCs.
+ * "fsl,qoriq-device-config-2.0" corresponds to T4 & B4
+ * SOCs. For the older SOCs "fsl,qoriq-device-config-1.0"
+ * string would be used.
+ */
+static const struct of_device_id guts_device_ids[] = {
+ { .compatible = "fsl,qoriq-device-config-1.0", },
+ { .compatible = "fsl,qoriq-device-config-2.0", },
+ {}
+};
+
+/*
+ * Table for matching compatible strings, for device tree
+ * L3 cache controller node.
+ * "fsl,t4240-l3-cache-controller" corresponds to T4,
+ * "fsl,b4860-l3-cache-controller" corresponds to B4 &
+ * "fsl,p4080-l3-cache-controller" corresponds to other,
+ * SOCs.
+ */
+static const struct of_device_id l3_device_ids[] = {
+ { .compatible = "fsl,t4240-l3-cache-controller", },
+ { .compatible = "fsl,b4860-l3-cache-controller", },
+ { .compatible = "fsl,p4080-l3-cache-controller", },
+ {}
+};
+
+/* maximum subwindows permitted per liodn */
+static u32 max_subwindow_count;
+
+/* Pool for fspi allocation */
+static struct gen_pool *spaace_pool;
+
+/**
+ * pamu_get_max_subwin_cnt() - Return the maximum supported
+ * subwindow count per liodn.
+ *
+ */
+u32 pamu_get_max_subwin_cnt(void)
+{
+ return max_subwindow_count;
+}
+
+/**
+ * pamu_get_ppaace() - Return the primary PACCE
+ * @liodn: liodn PAACT index for desired PAACE
+ *
+ * Returns the ppace pointer upon success else return
+ * null.
+ */
+static struct paace *pamu_get_ppaace(int liodn)
+{
+ if (!ppaact || liodn >= PAACE_NUMBER_ENTRIES) {
+ pr_debug("PPAACT doesn't exist\n");
+ return NULL;
+ }
+
+ return &ppaact[liodn];
+}
+
+/**
+ * pamu_enable_liodn() - Set valid bit of PACCE
+ * @liodn: liodn PAACT index for desired PAACE
+ *
+ * Returns 0 upon success else error code < 0 returned
+ */
+int pamu_enable_liodn(int liodn)
+{
+ struct paace *ppaace;
+
+ ppaace = pamu_get_ppaace(liodn);
+ if (!ppaace) {
+ pr_debug("Invalid primary paace entry\n");
+ return -ENOENT;
+ }
+
+ if (!get_bf(ppaace->addr_bitfields, PPAACE_AF_WSE)) {
+ pr_debug("liodn %d not configured\n", liodn);
+ return -EINVAL;
+ }
+
+ /* Ensure that all other stores to the ppaace complete first */
+ mb();
+
+ set_bf(ppaace->addr_bitfields, PAACE_AF_V, PAACE_V_VALID);
+ mb();
+
+ return 0;
+}
+
+/**
+ * pamu_disable_liodn() - Clears valid bit of PACCE
+ * @liodn: liodn PAACT index for desired PAACE
+ *
+ * Returns 0 upon success else error code < 0 returned
+ */
+int pamu_disable_liodn(int liodn)
+{
+ struct paace *ppaace;
+
+ ppaace = pamu_get_ppaace(liodn);
+ if (!ppaace) {
+ pr_debug("Invalid primary paace entry\n");
+ return -ENOENT;
+ }
+
+ set_bf(ppaace->addr_bitfields, PAACE_AF_V, PAACE_V_INVALID);
+ mb();
+
+ return 0;
+}
+
+/* Derive the window size encoding for a particular PAACE entry */
+static unsigned int map_addrspace_size_to_wse(phys_addr_t addrspace_size)
+{
+ /* Bug if not a power of 2 */
+ BUG_ON(addrspace_size & (addrspace_size - 1));
+
+ /* window size is 2^(WSE+1) bytes */
+ return fls64(addrspace_size) - 2;
+}
+
+/* Derive the PAACE window count encoding for the subwindow count */
+static unsigned int map_subwindow_cnt_to_wce(u32 subwindow_cnt)
+{
+ /* window count is 2^(WCE+1) bytes */
+ return __ffs(subwindow_cnt) - 1;
+}
+
+/*
+ * Set the PAACE type as primary and set the coherency required domain
+ * attribute
+ */
+static void pamu_init_ppaace(struct paace *ppaace)
+{
+ set_bf(ppaace->addr_bitfields, PAACE_AF_PT, PAACE_PT_PRIMARY);
+
+ set_bf(ppaace->domain_attr.to_host.coherency_required, PAACE_DA_HOST_CR,
+ PAACE_M_COHERENCE_REQ);
+}
+
+/*
+ * Set the PAACE type as secondary and set the coherency required domain
+ * attribute.
+ */
+static void pamu_init_spaace(struct paace *spaace)
+{
+ set_bf(spaace->addr_bitfields, PAACE_AF_PT, PAACE_PT_SECONDARY);
+ set_bf(spaace->domain_attr.to_host.coherency_required, PAACE_DA_HOST_CR,
+ PAACE_M_COHERENCE_REQ);
+}
+
+/*
+ * Return the spaace (corresponding to the secondary window index)
+ * for a particular ppaace.
+ */
+static struct paace *pamu_get_spaace(struct paace *paace, u32 wnum)
+{
+ u32 subwin_cnt;
+ struct paace *spaace = NULL;
+
+ subwin_cnt = 1UL << (get_bf(paace->impl_attr, PAACE_IA_WCE) + 1);
+
+ if (wnum < subwin_cnt)
+ spaace = &spaact[paace->fspi + wnum];
+ else
+ pr_debug("secondary paace out of bounds\n");
+
+ return spaace;
+}
+
+/**
+ * pamu_get_fspi_and_allocate() - Allocates fspi index and reserves subwindows
+ * required for primary PAACE in the secondary
+ * PAACE table.
+ * @subwin_cnt: Number of subwindows to be reserved.
+ *
+ * A PPAACE entry may have a number of associated subwindows. A subwindow
+ * corresponds to a SPAACE entry in the SPAACT table. Each PAACE entry stores
+ * the index (fspi) of the first SPAACE entry in the SPAACT table. This
+ * function returns the index of the first SPAACE entry. The remaining
+ * SPAACE entries are reserved contiguously from that index.
+ *
+ * Returns a valid fspi index in the range of 0 - SPAACE_NUMBER_ENTRIES on success.
+ * If no SPAACE entry is available or the allocator can not reserve the required
+ * number of contiguous entries function returns ULONG_MAX indicating a failure.
+ *
+ */
+static unsigned long pamu_get_fspi_and_allocate(u32 subwin_cnt)
+{
+ unsigned long spaace_addr;
+
+ spaace_addr = gen_pool_alloc(spaace_pool, subwin_cnt * sizeof(struct paace));
+ if (!spaace_addr)
+ return ULONG_MAX;
+
+ return (spaace_addr - (unsigned long)spaact) / (sizeof(struct paace));
+}
+
+/* Release the subwindows reserved for a particular LIODN */
+void pamu_free_subwins(int liodn)
+{
+ struct paace *ppaace;
+ u32 subwin_cnt, size;
+
+ ppaace = pamu_get_ppaace(liodn);
+ if (!ppaace) {
+ pr_debug("Invalid liodn entry\n");
+ return;
+ }
+
+ if (get_bf(ppaace->addr_bitfields, PPAACE_AF_MW)) {
+ subwin_cnt = 1UL << (get_bf(ppaace->impl_attr, PAACE_IA_WCE) + 1);
+ size = (subwin_cnt - 1) * sizeof(struct paace);
+ gen_pool_free(spaace_pool, (unsigned long)&spaact[ppaace->fspi], size);
+ set_bf(ppaace->addr_bitfields, PPAACE_AF_MW, 0);
+ }
+}
+
+/*
+ * Function used for updating stash destination for the coressponding
+ * LIODN.
+ */
+int pamu_update_paace_stash(int liodn, u32 subwin, u32 value)
+{
+ struct paace *paace;
+
+ paace = pamu_get_ppaace(liodn);
+ if (!paace) {
+ pr_debug("Invalid liodn entry\n");
+ return -ENOENT;
+ }
+ if (subwin) {
+ paace = pamu_get_spaace(paace, subwin - 1);
+ if (!paace)
+ return -ENOENT;
+ }
+ set_bf(paace->impl_attr, PAACE_IA_CID, value);
+
+ mb();
+
+ return 0;
+}
+
+/* Disable a subwindow corresponding to the LIODN */
+int pamu_disable_spaace(int liodn, u32 subwin)
+{
+ struct paace *paace;
+
+ paace = pamu_get_ppaace(liodn);
+ if (!paace) {
+ pr_debug("Invalid liodn entry\n");
+ return -ENOENT;
+ }
+ if (subwin) {
+ paace = pamu_get_spaace(paace, subwin - 1);
+ if (!paace)
+ return -ENOENT;
+ set_bf(paace->addr_bitfields, PAACE_AF_V, PAACE_V_INVALID);
+ } else {
+ set_bf(paace->addr_bitfields, PAACE_AF_AP,
+ PAACE_AP_PERMS_DENIED);
+ }
+
+ mb();
+
+ return 0;
+}
+
+/**
+ * pamu_config_paace() - Sets up PPAACE entry for specified liodn
+ *
+ * @liodn: Logical IO device number
+ * @win_addr: starting address of DSA window
+ * @win-size: size of DSA window
+ * @omi: Operation mapping index -- if ~omi == 0 then omi not defined
+ * @rpn: real (true physical) page number
+ * @stashid: cache stash id for associated cpu -- if ~stashid == 0 then
+ * stashid not defined
+ * @snoopid: snoop id for hardware coherency -- if ~snoopid == 0 then
+ * snoopid not defined
+ * @subwin_cnt: number of sub-windows
+ * @prot: window permissions
+ *
+ * Returns 0 upon success else error code < 0 returned
+ */
+int pamu_config_ppaace(int liodn, phys_addr_t win_addr, phys_addr_t win_size,
+ u32 omi, unsigned long rpn, u32 snoopid, u32 stashid,
+ u32 subwin_cnt, int prot)
+{
+ struct paace *ppaace;
+ unsigned long fspi;
+
+ if ((win_size & (win_size - 1)) || win_size < PAMU_PAGE_SIZE) {
+ pr_debug("window size too small or not a power of two %pa\n",
+ &win_size);
+ return -EINVAL;
+ }
+
+ if (win_addr & (win_size - 1)) {
+ pr_debug("window address is not aligned with window size\n");
+ return -EINVAL;
+ }
+
+ ppaace = pamu_get_ppaace(liodn);
+ if (!ppaace)
+ return -ENOENT;
+
+ /* window size is 2^(WSE+1) bytes */
+ set_bf(ppaace->addr_bitfields, PPAACE_AF_WSE,
+ map_addrspace_size_to_wse(win_size));
+
+ pamu_init_ppaace(ppaace);
+
+ ppaace->wbah = win_addr >> (PAMU_PAGE_SHIFT + 20);
+ set_bf(ppaace->addr_bitfields, PPAACE_AF_WBAL,
+ (win_addr >> PAMU_PAGE_SHIFT));
+
+ /* set up operation mapping if it's configured */
+ if (omi < OME_NUMBER_ENTRIES) {
+ set_bf(ppaace->impl_attr, PAACE_IA_OTM, PAACE_OTM_INDEXED);
+ ppaace->op_encode.index_ot.omi = omi;
+ } else if (~omi != 0) {
+ pr_debug("bad operation mapping index: %d\n", omi);
+ return -EINVAL;
+ }
+
+ /* configure stash id */
+ if (~stashid != 0)
+ set_bf(ppaace->impl_attr, PAACE_IA_CID, stashid);
+
+ /* configure snoop id */
+ if (~snoopid != 0)
+ ppaace->domain_attr.to_host.snpid = snoopid;
+
+ if (subwin_cnt) {
+ /* The first entry is in the primary PAACE instead */
+ fspi = pamu_get_fspi_and_allocate(subwin_cnt - 1);
+ if (fspi == ULONG_MAX) {
+ pr_debug("spaace indexes exhausted\n");
+ return -EINVAL;
+ }
+
+ /* window count is 2^(WCE+1) bytes */
+ set_bf(ppaace->impl_attr, PAACE_IA_WCE,
+ map_subwindow_cnt_to_wce(subwin_cnt));
+ set_bf(ppaace->addr_bitfields, PPAACE_AF_MW, 0x1);
+ ppaace->fspi = fspi;
+ } else {
+ set_bf(ppaace->impl_attr, PAACE_IA_ATM, PAACE_ATM_WINDOW_XLATE);
+ ppaace->twbah = rpn >> 20;
+ set_bf(ppaace->win_bitfields, PAACE_WIN_TWBAL, rpn);
+ set_bf(ppaace->addr_bitfields, PAACE_AF_AP, prot);
+ set_bf(ppaace->impl_attr, PAACE_IA_WCE, 0);
+ set_bf(ppaace->addr_bitfields, PPAACE_AF_MW, 0);
+ }
+ mb();
+
+ return 0;
+}
+
+/**
+ * pamu_config_spaace() - Sets up SPAACE entry for specified subwindow
+ *
+ * @liodn: Logical IO device number
+ * @subwin_cnt: number of sub-windows associated with dma-window
+ * @subwin: subwindow index
+ * @subwin_size: size of subwindow
+ * @omi: Operation mapping index
+ * @rpn: real (true physical) page number
+ * @snoopid: snoop id for hardware coherency -- if ~snoopid == 0 then
+ * snoopid not defined
+ * @stashid: cache stash id for associated cpu
+ * @enable: enable/disable subwindow after reconfiguration
+ * @prot: sub window permissions
+ *
+ * Returns 0 upon success else error code < 0 returned
+ */
+int pamu_config_spaace(int liodn, u32 subwin_cnt, u32 subwin,
+ phys_addr_t subwin_size, u32 omi, unsigned long rpn,
+ u32 snoopid, u32 stashid, int enable, int prot)
+{
+ struct paace *paace;
+
+ /* setup sub-windows */
+ if (!subwin_cnt) {
+ pr_debug("Invalid subwindow count\n");
+ return -EINVAL;
+ }
+
+ paace = pamu_get_ppaace(liodn);
+ if (subwin > 0 && subwin < subwin_cnt && paace) {
+ paace = pamu_get_spaace(paace, subwin - 1);
+
+ if (paace && !(paace->addr_bitfields & PAACE_V_VALID)) {
+ pamu_init_spaace(paace);
+ set_bf(paace->addr_bitfields, SPAACE_AF_LIODN, liodn);
+ }
+ }
+
+ if (!paace) {
+ pr_debug("Invalid liodn entry\n");
+ return -ENOENT;
+ }
+
+ if ((subwin_size & (subwin_size - 1)) || subwin_size < PAMU_PAGE_SIZE) {
+ pr_debug("subwindow size out of range, or not a power of 2\n");
+ return -EINVAL;
+ }
+
+ if (rpn == ULONG_MAX) {
+ pr_debug("real page number out of range\n");
+ return -EINVAL;
+ }
+
+ /* window size is 2^(WSE+1) bytes */
+ set_bf(paace->win_bitfields, PAACE_WIN_SWSE,
+ map_addrspace_size_to_wse(subwin_size));
+
+ set_bf(paace->impl_attr, PAACE_IA_ATM, PAACE_ATM_WINDOW_XLATE);
+ paace->twbah = rpn >> 20;
+ set_bf(paace->win_bitfields, PAACE_WIN_TWBAL, rpn);
+ set_bf(paace->addr_bitfields, PAACE_AF_AP, prot);
+
+ /* configure snoop id */
+ if (~snoopid != 0)
+ paace->domain_attr.to_host.snpid = snoopid;
+
+ /* set up operation mapping if it's configured */
+ if (omi < OME_NUMBER_ENTRIES) {
+ set_bf(paace->impl_attr, PAACE_IA_OTM, PAACE_OTM_INDEXED);
+ paace->op_encode.index_ot.omi = omi;
+ } else if (~omi != 0) {
+ pr_debug("bad operation mapping index: %d\n", omi);
+ return -EINVAL;
+ }
+
+ if (~stashid != 0)
+ set_bf(paace->impl_attr, PAACE_IA_CID, stashid);
+
+ smp_wmb();
+
+ if (enable)
+ set_bf(paace->addr_bitfields, PAACE_AF_V, PAACE_V_VALID);
+
+ mb();
+
+ return 0;
+}
+
+/**
+ * get_ome_index() - Returns the index in the operation mapping table
+ * for device.
+ * @*omi_index: pointer for storing the index value
+ *
+ */
+void get_ome_index(u32 *omi_index, struct device *dev)
+{
+ if (of_device_is_compatible(dev->of_node, "fsl,qman-portal"))
+ *omi_index = OMI_QMAN;
+ if (of_device_is_compatible(dev->of_node, "fsl,qman"))
+ *omi_index = OMI_QMAN_PRIV;
+}
+
+/**
+ * get_stash_id - Returns stash destination id corresponding to a
+ * cache type and vcpu.
+ * @stash_dest_hint: L1, L2 or L3
+ * @vcpu: vpcu target for a particular cache type.
+ *
+ * Returs stash on success or ~(u32)0 on failure.
+ *
+ */
+u32 get_stash_id(u32 stash_dest_hint, u32 vcpu)
+{
+ const u32 *prop;
+ struct device_node *node;
+ u32 cache_level;
+ int len, found = 0;
+ int i;
+
+ /* Fastpath, exit early if L3/CPC cache is target for stashing */
+ if (stash_dest_hint == PAMU_ATTR_CACHE_L3) {
+ node = of_find_matching_node(NULL, l3_device_ids);
+ if (node) {
+ prop = of_get_property(node, "cache-stash-id", NULL);
+ if (!prop) {
+ pr_debug("missing cache-stash-id at %pOF\n",
+ node);
+ of_node_put(node);
+ return ~(u32)0;
+ }
+ of_node_put(node);
+ return be32_to_cpup(prop);
+ }
+ return ~(u32)0;
+ }
+
+ for_each_of_cpu_node(node) {
+ prop = of_get_property(node, "reg", &len);
+ for (i = 0; i < len / sizeof(u32); i++) {
+ if (be32_to_cpup(&prop[i]) == vcpu) {
+ found = 1;
+ goto found_cpu_node;
+ }
+ }
+ }
+found_cpu_node:
+
+ /* find the hwnode that represents the cache */
+ for (cache_level = PAMU_ATTR_CACHE_L1; (cache_level < PAMU_ATTR_CACHE_L3) && found; cache_level++) {
+ if (stash_dest_hint == cache_level) {
+ prop = of_get_property(node, "cache-stash-id", NULL);
+ if (!prop) {
+ pr_debug("missing cache-stash-id at %pOF\n",
+ node);
+ of_node_put(node);
+ return ~(u32)0;
+ }
+ of_node_put(node);
+ return be32_to_cpup(prop);
+ }
+
+ prop = of_get_property(node, "next-level-cache", NULL);
+ if (!prop) {
+ pr_debug("can't find next-level-cache at %pOF\n", node);
+ of_node_put(node);
+ return ~(u32)0; /* can't traverse any further */
+ }
+ of_node_put(node);
+
+ /* advance to next node in cache hierarchy */
+ node = of_find_node_by_phandle(*prop);
+ if (!node) {
+ pr_debug("Invalid node for cache hierarchy\n");
+ return ~(u32)0;
+ }
+ }
+
+ pr_debug("stash dest not found for %d on vcpu %d\n",
+ stash_dest_hint, vcpu);
+ return ~(u32)0;
+}
+
+/* Identify if the PAACT table entry belongs to QMAN, BMAN or QMAN Portal */
+#define QMAN_PAACE 1
+#define QMAN_PORTAL_PAACE 2
+#define BMAN_PAACE 3
+
+/**
+ * Setup operation mapping and stash destinations for QMAN and QMAN portal.
+ * Memory accesses to QMAN and BMAN private memory need not be coherent, so
+ * clear the PAACE entry coherency attribute for them.
+ */
+static void setup_qbman_paace(struct paace *ppaace, int paace_type)
+{
+ switch (paace_type) {
+ case QMAN_PAACE:
+ set_bf(ppaace->impl_attr, PAACE_IA_OTM, PAACE_OTM_INDEXED);
+ ppaace->op_encode.index_ot.omi = OMI_QMAN_PRIV;
+ /* setup QMAN Private data stashing for the L3 cache */
+ set_bf(ppaace->impl_attr, PAACE_IA_CID, get_stash_id(PAMU_ATTR_CACHE_L3, 0));
+ set_bf(ppaace->domain_attr.to_host.coherency_required, PAACE_DA_HOST_CR,
+ 0);
+ break;
+ case QMAN_PORTAL_PAACE:
+ set_bf(ppaace->impl_attr, PAACE_IA_OTM, PAACE_OTM_INDEXED);
+ ppaace->op_encode.index_ot.omi = OMI_QMAN;
+ /* Set DQRR and Frame stashing for the L3 cache */
+ set_bf(ppaace->impl_attr, PAACE_IA_CID, get_stash_id(PAMU_ATTR_CACHE_L3, 0));
+ break;
+ case BMAN_PAACE:
+ set_bf(ppaace->domain_attr.to_host.coherency_required, PAACE_DA_HOST_CR,
+ 0);
+ break;
+ }
+}
+
+/**
+ * Setup the operation mapping table for various devices. This is a static
+ * table where each table index corresponds to a particular device. PAMU uses
+ * this table to translate device transaction to appropriate corenet
+ * transaction.
+ */
+static void setup_omt(struct ome *omt)
+{
+ struct ome *ome;
+
+ /* Configure OMI_QMAN */
+ ome = &omt[OMI_QMAN];
+
+ ome->moe[IOE_READ_IDX] = EOE_VALID | EOE_READ;
+ ome->moe[IOE_EREAD0_IDX] = EOE_VALID | EOE_RSA;
+ ome->moe[IOE_WRITE_IDX] = EOE_VALID | EOE_WRITE;
+ ome->moe[IOE_EWRITE0_IDX] = EOE_VALID | EOE_WWSAO;
+
+ ome->moe[IOE_DIRECT0_IDX] = EOE_VALID | EOE_LDEC;
+ ome->moe[IOE_DIRECT1_IDX] = EOE_VALID | EOE_LDECPE;
+
+ /* Configure OMI_FMAN */
+ ome = &omt[OMI_FMAN];
+ ome->moe[IOE_READ_IDX] = EOE_VALID | EOE_READI;
+ ome->moe[IOE_WRITE_IDX] = EOE_VALID | EOE_WRITE;
+
+ /* Configure OMI_QMAN private */
+ ome = &omt[OMI_QMAN_PRIV];
+ ome->moe[IOE_READ_IDX] = EOE_VALID | EOE_READ;
+ ome->moe[IOE_WRITE_IDX] = EOE_VALID | EOE_WRITE;
+ ome->moe[IOE_EREAD0_IDX] = EOE_VALID | EOE_RSA;
+ ome->moe[IOE_EWRITE0_IDX] = EOE_VALID | EOE_WWSA;
+
+ /* Configure OMI_CAAM */
+ ome = &omt[OMI_CAAM];
+ ome->moe[IOE_READ_IDX] = EOE_VALID | EOE_READI;
+ ome->moe[IOE_WRITE_IDX] = EOE_VALID | EOE_WRITE;
+}
+
+/*
+ * Get the maximum number of PAACT table entries
+ * and subwindows supported by PAMU
+ */
+static void get_pamu_cap_values(unsigned long pamu_reg_base)
+{
+ u32 pc_val;
+
+ pc_val = in_be32((u32 *)(pamu_reg_base + PAMU_PC3));
+ /* Maximum number of subwindows per liodn */
+ max_subwindow_count = 1 << (1 + PAMU_PC3_MWCE(pc_val));
+}
+
+/* Setup PAMU registers pointing to PAACT, SPAACT and OMT */
+static int setup_one_pamu(unsigned long pamu_reg_base, unsigned long pamu_reg_size,
+ phys_addr_t ppaact_phys, phys_addr_t spaact_phys,
+ phys_addr_t omt_phys)
+{
+ u32 *pc;
+ struct pamu_mmap_regs *pamu_regs;
+
+ pc = (u32 *) (pamu_reg_base + PAMU_PC);
+ pamu_regs = (struct pamu_mmap_regs *)
+ (pamu_reg_base + PAMU_MMAP_REGS_BASE);
+
+ /* set up pointers to corenet control blocks */
+
+ out_be32(&pamu_regs->ppbah, upper_32_bits(ppaact_phys));
+ out_be32(&pamu_regs->ppbal, lower_32_bits(ppaact_phys));
+ ppaact_phys = ppaact_phys + PAACT_SIZE;
+ out_be32(&pamu_regs->pplah, upper_32_bits(ppaact_phys));
+ out_be32(&pamu_regs->pplal, lower_32_bits(ppaact_phys));
+
+ out_be32(&pamu_regs->spbah, upper_32_bits(spaact_phys));
+ out_be32(&pamu_regs->spbal, lower_32_bits(spaact_phys));
+ spaact_phys = spaact_phys + SPAACT_SIZE;
+ out_be32(&pamu_regs->splah, upper_32_bits(spaact_phys));
+ out_be32(&pamu_regs->splal, lower_32_bits(spaact_phys));
+
+ out_be32(&pamu_regs->obah, upper_32_bits(omt_phys));
+ out_be32(&pamu_regs->obal, lower_32_bits(omt_phys));
+ omt_phys = omt_phys + OMT_SIZE;
+ out_be32(&pamu_regs->olah, upper_32_bits(omt_phys));
+ out_be32(&pamu_regs->olal, lower_32_bits(omt_phys));
+
+ /*
+ * set PAMU enable bit,
+ * allow ppaact & omt to be cached
+ * & enable PAMU access violation interrupts.
+ */
+
+ out_be32((u32 *)(pamu_reg_base + PAMU_PICS),
+ PAMU_ACCESS_VIOLATION_ENABLE);
+ out_be32(pc, PAMU_PC_PE | PAMU_PC_OCE | PAMU_PC_SPCC | PAMU_PC_PPCC);
+ return 0;
+}
+
+/* Enable all device LIODNS */
+static void setup_liodns(void)
+{
+ int i, len;
+ struct paace *ppaace;
+ struct device_node *node = NULL;
+ const u32 *prop;
+
+ for_each_node_with_property(node, "fsl,liodn") {
+ prop = of_get_property(node, "fsl,liodn", &len);
+ for (i = 0; i < len / sizeof(u32); i++) {
+ int liodn;
+
+ liodn = be32_to_cpup(&prop[i]);
+ if (liodn >= PAACE_NUMBER_ENTRIES) {
+ pr_debug("Invalid LIODN value %d\n", liodn);
+ continue;
+ }
+ ppaace = pamu_get_ppaace(liodn);
+ pamu_init_ppaace(ppaace);
+ /* window size is 2^(WSE+1) bytes */
+ set_bf(ppaace->addr_bitfields, PPAACE_AF_WSE, 35);
+ ppaace->wbah = 0;
+ set_bf(ppaace->addr_bitfields, PPAACE_AF_WBAL, 0);
+ set_bf(ppaace->impl_attr, PAACE_IA_ATM,
+ PAACE_ATM_NO_XLATE);
+ set_bf(ppaace->addr_bitfields, PAACE_AF_AP,
+ PAACE_AP_PERMS_ALL);
+ if (of_device_is_compatible(node, "fsl,qman-portal"))
+ setup_qbman_paace(ppaace, QMAN_PORTAL_PAACE);
+ if (of_device_is_compatible(node, "fsl,qman"))
+ setup_qbman_paace(ppaace, QMAN_PAACE);
+ if (of_device_is_compatible(node, "fsl,bman"))
+ setup_qbman_paace(ppaace, BMAN_PAACE);
+ mb();
+ pamu_enable_liodn(liodn);
+ }
+ }
+}
+
+static irqreturn_t pamu_av_isr(int irq, void *arg)
+{
+ struct pamu_isr_data *data = arg;
+ phys_addr_t phys;
+ unsigned int i, j, ret;
+
+ pr_emerg("access violation interrupt\n");
+
+ for (i = 0; i < data->count; i++) {
+ void __iomem *p = data->pamu_reg_base + i * PAMU_OFFSET;
+ u32 pics = in_be32(p + PAMU_PICS);
+
+ if (pics & PAMU_ACCESS_VIOLATION_STAT) {
+ u32 avs1 = in_be32(p + PAMU_AVS1);
+ struct paace *paace;
+
+ pr_emerg("POES1=%08x\n", in_be32(p + PAMU_POES1));
+ pr_emerg("POES2=%08x\n", in_be32(p + PAMU_POES2));
+ pr_emerg("AVS1=%08x\n", avs1);
+ pr_emerg("AVS2=%08x\n", in_be32(p + PAMU_AVS2));
+ pr_emerg("AVA=%016llx\n",
+ make64(in_be32(p + PAMU_AVAH),
+ in_be32(p + PAMU_AVAL)));
+ pr_emerg("UDAD=%08x\n", in_be32(p + PAMU_UDAD));
+ pr_emerg("POEA=%016llx\n",
+ make64(in_be32(p + PAMU_POEAH),
+ in_be32(p + PAMU_POEAL)));
+
+ phys = make64(in_be32(p + PAMU_POEAH),
+ in_be32(p + PAMU_POEAL));
+
+ /* Assume that POEA points to a PAACE */
+ if (phys) {
+ u32 *paace = phys_to_virt(phys);
+
+ /* Only the first four words are relevant */
+ for (j = 0; j < 4; j++)
+ pr_emerg("PAACE[%u]=%08x\n",
+ j, in_be32(paace + j));
+ }
+
+ /* clear access violation condition */
+ out_be32(p + PAMU_AVS1, avs1 & PAMU_AV_MASK);
+ paace = pamu_get_ppaace(avs1 >> PAMU_AVS1_LIODN_SHIFT);
+ BUG_ON(!paace);
+ /* check if we got a violation for a disabled LIODN */
+ if (!get_bf(paace->addr_bitfields, PAACE_AF_V)) {
+ /*
+ * As per hardware erratum A-003638, access
+ * violation can be reported for a disabled
+ * LIODN. If we hit that condition, disable
+ * access violation reporting.
+ */
+ pics &= ~PAMU_ACCESS_VIOLATION_ENABLE;
+ } else {
+ /* Disable the LIODN */
+ ret = pamu_disable_liodn(avs1 >> PAMU_AVS1_LIODN_SHIFT);
+ BUG_ON(ret);
+ pr_emerg("Disabling liodn %x\n",
+ avs1 >> PAMU_AVS1_LIODN_SHIFT);
+ }
+ out_be32((p + PAMU_PICS), pics);
+ }
+ }
+
+ return IRQ_HANDLED;
+}
+
+#define LAWAR_EN 0x80000000
+#define LAWAR_TARGET_MASK 0x0FF00000
+#define LAWAR_TARGET_SHIFT 20
+#define LAWAR_SIZE_MASK 0x0000003F
+#define LAWAR_CSDID_MASK 0x000FF000
+#define LAWAR_CSDID_SHIFT 12
+
+#define LAW_SIZE_4K 0xb
+
+struct ccsr_law {
+ u32 lawbarh; /* LAWn base address high */
+ u32 lawbarl; /* LAWn base address low */
+ u32 lawar; /* LAWn attributes */
+ u32 reserved;
+};
+
+/*
+ * Create a coherence subdomain for a given memory block.
+ */
+static int create_csd(phys_addr_t phys, size_t size, u32 csd_port_id)
+{
+ struct device_node *np;
+ const __be32 *iprop;
+ void __iomem *lac = NULL; /* Local Access Control registers */
+ struct ccsr_law __iomem *law;
+ void __iomem *ccm = NULL;
+ u32 __iomem *csdids;
+ unsigned int i, num_laws, num_csds;
+ u32 law_target = 0;
+ u32 csd_id = 0;
+ int ret = 0;
+
+ np = of_find_compatible_node(NULL, NULL, "fsl,corenet-law");
+ if (!np)
+ return -ENODEV;
+
+ iprop = of_get_property(np, "fsl,num-laws", NULL);
+ if (!iprop) {
+ ret = -ENODEV;
+ goto error;
+ }
+
+ num_laws = be32_to_cpup(iprop);
+ if (!num_laws) {
+ ret = -ENODEV;
+ goto error;
+ }
+
+ lac = of_iomap(np, 0);
+ if (!lac) {
+ ret = -ENODEV;
+ goto error;
+ }
+
+ /* LAW registers are at offset 0xC00 */
+ law = lac + 0xC00;
+
+ of_node_put(np);
+
+ np = of_find_compatible_node(NULL, NULL, "fsl,corenet-cf");
+ if (!np) {
+ ret = -ENODEV;
+ goto error;
+ }
+
+ iprop = of_get_property(np, "fsl,ccf-num-csdids", NULL);
+ if (!iprop) {
+ ret = -ENODEV;
+ goto error;
+ }
+
+ num_csds = be32_to_cpup(iprop);
+ if (!num_csds) {
+ ret = -ENODEV;
+ goto error;
+ }
+
+ ccm = of_iomap(np, 0);
+ if (!ccm) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ /* The undocumented CSDID registers are at offset 0x600 */
+ csdids = ccm + 0x600;
+
+ of_node_put(np);
+ np = NULL;
+
+ /* Find an unused coherence subdomain ID */
+ for (csd_id = 0; csd_id < num_csds; csd_id++) {
+ if (!csdids[csd_id])
+ break;
+ }
+
+ /* Store the Port ID in the (undocumented) proper CIDMRxx register */
+ csdids[csd_id] = csd_port_id;
+
+ /* Find the DDR LAW that maps to our buffer. */
+ for (i = 0; i < num_laws; i++) {
+ if (law[i].lawar & LAWAR_EN) {
+ phys_addr_t law_start, law_end;
+
+ law_start = make64(law[i].lawbarh, law[i].lawbarl);
+ law_end = law_start +
+ (2ULL << (law[i].lawar & LAWAR_SIZE_MASK));
+
+ if (law_start <= phys && phys < law_end) {
+ law_target = law[i].lawar & LAWAR_TARGET_MASK;
+ break;
+ }
+ }
+ }
+
+ if (i == 0 || i == num_laws) {
+ /* This should never happen */
+ ret = -ENOENT;
+ goto error;
+ }
+
+ /* Find a free LAW entry */
+ while (law[--i].lawar & LAWAR_EN) {
+ if (i == 0) {
+ /* No higher priority LAW slots available */
+ ret = -ENOENT;
+ goto error;
+ }
+ }
+
+ law[i].lawbarh = upper_32_bits(phys);
+ law[i].lawbarl = lower_32_bits(phys);
+ wmb();
+ law[i].lawar = LAWAR_EN | law_target | (csd_id << LAWAR_CSDID_SHIFT) |
+ (LAW_SIZE_4K + get_order(size));
+ wmb();
+
+error:
+ if (ccm)
+ iounmap(ccm);
+
+ if (lac)
+ iounmap(lac);
+
+ if (np)
+ of_node_put(np);
+
+ return ret;
+}
+
+/*
+ * Table of SVRs and the corresponding PORT_ID values. Port ID corresponds to a
+ * bit map of snoopers for a given range of memory mapped by a LAW.
+ *
+ * All future CoreNet-enabled SOCs will have this erratum(A-004510) fixed, so this
+ * table should never need to be updated. SVRs are guaranteed to be unique, so
+ * there is no worry that a future SOC will inadvertently have one of these
+ * values.
+ */
+static const struct {
+ u32 svr;
+ u32 port_id;
+} port_id_map[] = {
+ {(SVR_P2040 << 8) | 0x10, 0xFF000000}, /* P2040 1.0 */
+ {(SVR_P2040 << 8) | 0x11, 0xFF000000}, /* P2040 1.1 */
+ {(SVR_P2041 << 8) | 0x10, 0xFF000000}, /* P2041 1.0 */
+ {(SVR_P2041 << 8) | 0x11, 0xFF000000}, /* P2041 1.1 */
+ {(SVR_P3041 << 8) | 0x10, 0xFF000000}, /* P3041 1.0 */
+ {(SVR_P3041 << 8) | 0x11, 0xFF000000}, /* P3041 1.1 */
+ {(SVR_P4040 << 8) | 0x20, 0xFFF80000}, /* P4040 2.0 */
+ {(SVR_P4080 << 8) | 0x20, 0xFFF80000}, /* P4080 2.0 */
+ {(SVR_P5010 << 8) | 0x10, 0xFC000000}, /* P5010 1.0 */
+ {(SVR_P5010 << 8) | 0x20, 0xFC000000}, /* P5010 2.0 */
+ {(SVR_P5020 << 8) | 0x10, 0xFC000000}, /* P5020 1.0 */
+ {(SVR_P5021 << 8) | 0x10, 0xFF800000}, /* P5021 1.0 */
+ {(SVR_P5040 << 8) | 0x10, 0xFF800000}, /* P5040 1.0 */
+};
+
+#define SVR_SECURITY 0x80000 /* The Security (E) bit */
+
+static int fsl_pamu_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ void __iomem *pamu_regs = NULL;
+ struct ccsr_guts __iomem *guts_regs = NULL;
+ u32 pamubypenr, pamu_counter;
+ unsigned long pamu_reg_off;
+ unsigned long pamu_reg_base;
+ struct pamu_isr_data *data = NULL;
+ struct device_node *guts_node;
+ u64 size;
+ struct page *p;
+ int ret = 0;
+ int irq;
+ phys_addr_t ppaact_phys;
+ phys_addr_t spaact_phys;
+ struct ome *omt;
+ phys_addr_t omt_phys;
+ size_t mem_size = 0;
+ unsigned int order = 0;
+ u32 csd_port_id = 0;
+ unsigned i;
+ /*
+ * enumerate all PAMUs and allocate and setup PAMU tables
+ * for each of them,
+ * NOTE : All PAMUs share the same LIODN tables.
+ */
+
+ if (WARN_ON(probed))
+ return -EBUSY;
+
+ pamu_regs = of_iomap(dev->of_node, 0);
+ if (!pamu_regs) {
+ dev_err(dev, "ioremap of PAMU node failed\n");
+ return -ENOMEM;
+ }
+ of_get_address(dev->of_node, 0, &size, NULL);
+
+ irq = irq_of_parse_and_map(dev->of_node, 0);
+ if (irq == NO_IRQ) {
+ dev_warn(dev, "no interrupts listed in PAMU node\n");
+ goto error;
+ }
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ data->pamu_reg_base = pamu_regs;
+ data->count = size / PAMU_OFFSET;
+
+ /* The ISR needs access to the regs, so we won't iounmap them */
+ ret = request_irq(irq, pamu_av_isr, 0, "pamu", data);
+ if (ret < 0) {
+ dev_err(dev, "error %i installing ISR for irq %i\n", ret, irq);
+ goto error;
+ }
+
+ guts_node = of_find_matching_node(NULL, guts_device_ids);
+ if (!guts_node) {
+ dev_err(dev, "could not find GUTS node %pOF\n", dev->of_node);
+ ret = -ENODEV;
+ goto error;
+ }
+
+ guts_regs = of_iomap(guts_node, 0);
+ of_node_put(guts_node);
+ if (!guts_regs) {
+ dev_err(dev, "ioremap of GUTS node failed\n");
+ ret = -ENODEV;
+ goto error;
+ }
+
+ /* read in the PAMU capability registers */
+ get_pamu_cap_values((unsigned long)pamu_regs);
+ /*
+ * To simplify the allocation of a coherency domain, we allocate the
+ * PAACT and the OMT in the same memory buffer. Unfortunately, this
+ * wastes more memory compared to allocating the buffers separately.
+ */
+ /* Determine how much memory we need */
+ mem_size = (PAGE_SIZE << get_order(PAACT_SIZE)) +
+ (PAGE_SIZE << get_order(SPAACT_SIZE)) +
+ (PAGE_SIZE << get_order(OMT_SIZE));
+ order = get_order(mem_size);
+
+ p = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
+ if (!p) {
+ dev_err(dev, "unable to allocate PAACT/SPAACT/OMT block\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ ppaact = page_address(p);
+ ppaact_phys = page_to_phys(p);
+
+ /* Make sure the memory is naturally aligned */
+ if (ppaact_phys & ((PAGE_SIZE << order) - 1)) {
+ dev_err(dev, "PAACT/OMT block is unaligned\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ spaact = (void *)ppaact + (PAGE_SIZE << get_order(PAACT_SIZE));
+ omt = (void *)spaact + (PAGE_SIZE << get_order(SPAACT_SIZE));
+
+ dev_dbg(dev, "ppaact virt=%p phys=%pa\n", ppaact, &ppaact_phys);
+
+ /* Check to see if we need to implement the work-around on this SOC */
+
+ /* Determine the Port ID for our coherence subdomain */
+ for (i = 0; i < ARRAY_SIZE(port_id_map); i++) {
+ if (port_id_map[i].svr == (mfspr(SPRN_SVR) & ~SVR_SECURITY)) {
+ csd_port_id = port_id_map[i].port_id;
+ dev_dbg(dev, "found matching SVR %08x\n",
+ port_id_map[i].svr);
+ break;
+ }
+ }
+
+ if (csd_port_id) {
+ dev_dbg(dev, "creating coherency subdomain at address %pa, size %zu, port id 0x%08x",
+ &ppaact_phys, mem_size, csd_port_id);
+
+ ret = create_csd(ppaact_phys, mem_size, csd_port_id);
+ if (ret) {
+ dev_err(dev, "could not create coherence subdomain\n");
+ goto error;
+ }
+ }
+
+ spaact_phys = virt_to_phys(spaact);
+ omt_phys = virt_to_phys(omt);
+
+ spaace_pool = gen_pool_create(ilog2(sizeof(struct paace)), -1);
+ if (!spaace_pool) {
+ ret = -ENOMEM;
+ dev_err(dev, "Failed to allocate spaace gen pool\n");
+ goto error;
+ }
+
+ ret = gen_pool_add(spaace_pool, (unsigned long)spaact, SPAACT_SIZE, -1);
+ if (ret)
+ goto error_genpool;
+
+ pamubypenr = in_be32(&guts_regs->pamubypenr);
+
+ for (pamu_reg_off = 0, pamu_counter = 0x80000000; pamu_reg_off < size;
+ pamu_reg_off += PAMU_OFFSET, pamu_counter >>= 1) {
+
+ pamu_reg_base = (unsigned long)pamu_regs + pamu_reg_off;
+ setup_one_pamu(pamu_reg_base, pamu_reg_off, ppaact_phys,
+ spaact_phys, omt_phys);
+ /* Disable PAMU bypass for this PAMU */
+ pamubypenr &= ~pamu_counter;
+ }
+
+ setup_omt(omt);
+
+ /* Enable all relevant PAMU(s) */
+ out_be32(&guts_regs->pamubypenr, pamubypenr);
+
+ iounmap(guts_regs);
+
+ /* Enable DMA for the LIODNs in the device tree */
+
+ setup_liodns();
+
+ probed = true;
+
+ return 0;
+
+error_genpool:
+ gen_pool_destroy(spaace_pool);
+
+error:
+ if (irq != NO_IRQ)
+ free_irq(irq, data);
+
+ kfree_sensitive(data);
+
+ if (pamu_regs)
+ iounmap(pamu_regs);
+
+ if (guts_regs)
+ iounmap(guts_regs);
+
+ if (ppaact)
+ free_pages((unsigned long)ppaact, order);
+
+ ppaact = NULL;
+
+ return ret;
+}
+
+static struct platform_driver fsl_of_pamu_driver = {
+ .driver = {
+ .name = "fsl-of-pamu",
+ },
+ .probe = fsl_pamu_probe,
+};
+
+static __init int fsl_pamu_init(void)
+{
+ struct platform_device *pdev = NULL;
+ struct device_node *np;
+ int ret;
+
+ /*
+ * The normal OF process calls the probe function at some
+ * indeterminate later time, after most drivers have loaded. This is
+ * too late for us, because PAMU clients (like the Qman driver)
+ * depend on PAMU being initialized early.
+ *
+ * So instead, we "manually" call our probe function by creating the
+ * platform devices ourselves.
+ */
+
+ /*
+ * We assume that there is only one PAMU node in the device tree. A
+ * single PAMU node represents all of the PAMU devices in the SOC
+ * already. Everything else already makes that assumption, and the
+ * binding for the PAMU nodes doesn't allow for any parent-child
+ * relationships anyway. In other words, support for more than one
+ * PAMU node would require significant changes to a lot of code.
+ */
+
+ np = of_find_compatible_node(NULL, NULL, "fsl,pamu");
+ if (!np) {
+ pr_err("could not find a PAMU node\n");
+ return -ENODEV;
+ }
+
+ ret = platform_driver_register(&fsl_of_pamu_driver);
+ if (ret) {
+ pr_err("could not register driver (err=%i)\n", ret);
+ goto error_driver_register;
+ }
+
+ pdev = platform_device_alloc("fsl-of-pamu", 0);
+ if (!pdev) {
+ pr_err("could not allocate device %pOF\n", np);
+ ret = -ENOMEM;
+ goto error_device_alloc;
+ }
+ pdev->dev.of_node = of_node_get(np);
+
+ ret = pamu_domain_init();
+ if (ret)
+ goto error_device_add;
+
+ ret = platform_device_add(pdev);
+ if (ret) {
+ pr_err("could not add device %pOF (err=%i)\n", np, ret);
+ goto error_device_add;
+ }
+
+ return 0;
+
+error_device_add:
+ of_node_put(pdev->dev.of_node);
+ pdev->dev.of_node = NULL;
+
+ platform_device_put(pdev);
+
+error_device_alloc:
+ platform_driver_unregister(&fsl_of_pamu_driver);
+
+error_driver_register:
+ of_node_put(np);
+
+ return ret;
+}
+arch_initcall(fsl_pamu_init);
diff --git a/drivers/iommu/fsl_pamu.h b/drivers/iommu/fsl_pamu.h
new file mode 100644
index 000000000..e1496ba96
--- /dev/null
+++ b/drivers/iommu/fsl_pamu.h
@@ -0,0 +1,400 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *
+ * Copyright (C) 2013 Freescale Semiconductor, Inc.
+ */
+
+#ifndef __FSL_PAMU_H
+#define __FSL_PAMU_H
+
+#include <linux/iommu.h>
+#include <linux/pci.h>
+
+#include <asm/fsl_pamu_stash.h>
+
+/* Bit Field macros
+ * v = bit field variable; m = mask, m##_SHIFT = shift, x = value to load
+ */
+#define set_bf(v, m, x) (v = ((v) & ~(m)) | (((x) << m##_SHIFT) & (m)))
+#define get_bf(v, m) (((v) & (m)) >> m##_SHIFT)
+
+/* PAMU CCSR space */
+#define PAMU_PGC 0x00000000 /* Allows all peripheral accesses */
+#define PAMU_PE 0x40000000 /* enable PAMU */
+
+/* PAMU_OFFSET to the next pamu space in ccsr */
+#define PAMU_OFFSET 0x1000
+
+#define PAMU_MMAP_REGS_BASE 0
+
+struct pamu_mmap_regs {
+ u32 ppbah;
+ u32 ppbal;
+ u32 pplah;
+ u32 pplal;
+ u32 spbah;
+ u32 spbal;
+ u32 splah;
+ u32 splal;
+ u32 obah;
+ u32 obal;
+ u32 olah;
+ u32 olal;
+};
+
+/* PAMU Error Registers */
+#define PAMU_POES1 0x0040
+#define PAMU_POES2 0x0044
+#define PAMU_POEAH 0x0048
+#define PAMU_POEAL 0x004C
+#define PAMU_AVS1 0x0050
+#define PAMU_AVS1_AV 0x1
+#define PAMU_AVS1_OTV 0x6
+#define PAMU_AVS1_APV 0x78
+#define PAMU_AVS1_WAV 0x380
+#define PAMU_AVS1_LAV 0x1c00
+#define PAMU_AVS1_GCV 0x2000
+#define PAMU_AVS1_PDV 0x4000
+#define PAMU_AV_MASK (PAMU_AVS1_AV | PAMU_AVS1_OTV | PAMU_AVS1_APV | PAMU_AVS1_WAV \
+ | PAMU_AVS1_LAV | PAMU_AVS1_GCV | PAMU_AVS1_PDV)
+#define PAMU_AVS1_LIODN_SHIFT 16
+#define PAMU_LAV_LIODN_NOT_IN_PPAACT 0x400
+
+#define PAMU_AVS2 0x0054
+#define PAMU_AVAH 0x0058
+#define PAMU_AVAL 0x005C
+#define PAMU_EECTL 0x0060
+#define PAMU_EEDIS 0x0064
+#define PAMU_EEINTEN 0x0068
+#define PAMU_EEDET 0x006C
+#define PAMU_EEATTR 0x0070
+#define PAMU_EEAHI 0x0074
+#define PAMU_EEALO 0x0078
+#define PAMU_EEDHI 0X007C
+#define PAMU_EEDLO 0x0080
+#define PAMU_EECC 0x0084
+#define PAMU_UDAD 0x0090
+
+/* PAMU Revision Registers */
+#define PAMU_PR1 0x0BF8
+#define PAMU_PR2 0x0BFC
+
+/* PAMU version mask */
+#define PAMU_PR1_MASK 0xffff
+
+/* PAMU Capabilities Registers */
+#define PAMU_PC1 0x0C00
+#define PAMU_PC2 0x0C04
+#define PAMU_PC3 0x0C08
+#define PAMU_PC4 0x0C0C
+
+/* PAMU Control Register */
+#define PAMU_PC 0x0C10
+
+/* PAMU control defs */
+#define PAMU_CONTROL 0x0C10
+#define PAMU_PC_PGC 0x80000000 /* PAMU gate closed bit */
+#define PAMU_PC_PE 0x40000000 /* PAMU enable bit */
+#define PAMU_PC_SPCC 0x00000010 /* sPAACE cache enable */
+#define PAMU_PC_PPCC 0x00000001 /* pPAACE cache enable */
+#define PAMU_PC_OCE 0x00001000 /* OMT cache enable */
+
+#define PAMU_PFA1 0x0C14
+#define PAMU_PFA2 0x0C18
+
+#define PAMU_PC2_MLIODN(X) ((X) >> 16)
+#define PAMU_PC3_MWCE(X) (((X) >> 21) & 0xf)
+
+/* PAMU Interrupt control and Status Register */
+#define PAMU_PICS 0x0C1C
+#define PAMU_ACCESS_VIOLATION_STAT 0x8
+#define PAMU_ACCESS_VIOLATION_ENABLE 0x4
+
+/* PAMU Debug Registers */
+#define PAMU_PD1 0x0F00
+#define PAMU_PD2 0x0F04
+#define PAMU_PD3 0x0F08
+#define PAMU_PD4 0x0F0C
+
+#define PAACE_AP_PERMS_DENIED 0x0
+#define PAACE_AP_PERMS_QUERY 0x1
+#define PAACE_AP_PERMS_UPDATE 0x2
+#define PAACE_AP_PERMS_ALL 0x3
+
+#define PAACE_DD_TO_HOST 0x0
+#define PAACE_DD_TO_IO 0x1
+#define PAACE_PT_PRIMARY 0x0
+#define PAACE_PT_SECONDARY 0x1
+#define PAACE_V_INVALID 0x0
+#define PAACE_V_VALID 0x1
+#define PAACE_MW_SUBWINDOWS 0x1
+
+#define PAACE_WSE_4K 0xB
+#define PAACE_WSE_8K 0xC
+#define PAACE_WSE_16K 0xD
+#define PAACE_WSE_32K 0xE
+#define PAACE_WSE_64K 0xF
+#define PAACE_WSE_128K 0x10
+#define PAACE_WSE_256K 0x11
+#define PAACE_WSE_512K 0x12
+#define PAACE_WSE_1M 0x13
+#define PAACE_WSE_2M 0x14
+#define PAACE_WSE_4M 0x15
+#define PAACE_WSE_8M 0x16
+#define PAACE_WSE_16M 0x17
+#define PAACE_WSE_32M 0x18
+#define PAACE_WSE_64M 0x19
+#define PAACE_WSE_128M 0x1A
+#define PAACE_WSE_256M 0x1B
+#define PAACE_WSE_512M 0x1C
+#define PAACE_WSE_1G 0x1D
+#define PAACE_WSE_2G 0x1E
+#define PAACE_WSE_4G 0x1F
+
+#define PAACE_DID_PCI_EXPRESS_1 0x00
+#define PAACE_DID_PCI_EXPRESS_2 0x01
+#define PAACE_DID_PCI_EXPRESS_3 0x02
+#define PAACE_DID_PCI_EXPRESS_4 0x03
+#define PAACE_DID_LOCAL_BUS 0x04
+#define PAACE_DID_SRIO 0x0C
+#define PAACE_DID_MEM_1 0x10
+#define PAACE_DID_MEM_2 0x11
+#define PAACE_DID_MEM_3 0x12
+#define PAACE_DID_MEM_4 0x13
+#define PAACE_DID_MEM_1_2 0x14
+#define PAACE_DID_MEM_3_4 0x15
+#define PAACE_DID_MEM_1_4 0x16
+#define PAACE_DID_BM_SW_PORTAL 0x18
+#define PAACE_DID_PAMU 0x1C
+#define PAACE_DID_CAAM 0x21
+#define PAACE_DID_QM_SW_PORTAL 0x3C
+#define PAACE_DID_CORE0_INST 0x80
+#define PAACE_DID_CORE0_DATA 0x81
+#define PAACE_DID_CORE1_INST 0x82
+#define PAACE_DID_CORE1_DATA 0x83
+#define PAACE_DID_CORE2_INST 0x84
+#define PAACE_DID_CORE2_DATA 0x85
+#define PAACE_DID_CORE3_INST 0x86
+#define PAACE_DID_CORE3_DATA 0x87
+#define PAACE_DID_CORE4_INST 0x88
+#define PAACE_DID_CORE4_DATA 0x89
+#define PAACE_DID_CORE5_INST 0x8A
+#define PAACE_DID_CORE5_DATA 0x8B
+#define PAACE_DID_CORE6_INST 0x8C
+#define PAACE_DID_CORE6_DATA 0x8D
+#define PAACE_DID_CORE7_INST 0x8E
+#define PAACE_DID_CORE7_DATA 0x8F
+#define PAACE_DID_BROADCAST 0xFF
+
+#define PAACE_ATM_NO_XLATE 0x00
+#define PAACE_ATM_WINDOW_XLATE 0x01
+#define PAACE_ATM_PAGE_XLATE 0x02
+#define PAACE_ATM_WIN_PG_XLATE (PAACE_ATM_WINDOW_XLATE | PAACE_ATM_PAGE_XLATE)
+#define PAACE_OTM_NO_XLATE 0x00
+#define PAACE_OTM_IMMEDIATE 0x01
+#define PAACE_OTM_INDEXED 0x02
+#define PAACE_OTM_RESERVED 0x03
+
+#define PAACE_M_COHERENCE_REQ 0x01
+
+#define PAACE_PID_0 0x0
+#define PAACE_PID_1 0x1
+#define PAACE_PID_2 0x2
+#define PAACE_PID_3 0x3
+#define PAACE_PID_4 0x4
+#define PAACE_PID_5 0x5
+#define PAACE_PID_6 0x6
+#define PAACE_PID_7 0x7
+
+#define PAACE_TCEF_FORMAT0_8B 0x00
+#define PAACE_TCEF_FORMAT1_RSVD 0x01
+/*
+ * Hard coded value for the PAACT size to accommodate
+ * maximum LIODN value generated by u-boot.
+ */
+#define PAACE_NUMBER_ENTRIES 0x500
+/* Hard coded value for the SPAACT size */
+#define SPAACE_NUMBER_ENTRIES 0x800
+
+#define OME_NUMBER_ENTRIES 16
+
+/* PAACE Bit Field Defines */
+#define PPAACE_AF_WBAL 0xfffff000
+#define PPAACE_AF_WBAL_SHIFT 12
+#define PPAACE_AF_WSE 0x00000fc0
+#define PPAACE_AF_WSE_SHIFT 6
+#define PPAACE_AF_MW 0x00000020
+#define PPAACE_AF_MW_SHIFT 5
+
+#define SPAACE_AF_LIODN 0xffff0000
+#define SPAACE_AF_LIODN_SHIFT 16
+
+#define PAACE_AF_AP 0x00000018
+#define PAACE_AF_AP_SHIFT 3
+#define PAACE_AF_DD 0x00000004
+#define PAACE_AF_DD_SHIFT 2
+#define PAACE_AF_PT 0x00000002
+#define PAACE_AF_PT_SHIFT 1
+#define PAACE_AF_V 0x00000001
+#define PAACE_AF_V_SHIFT 0
+
+#define PAACE_DA_HOST_CR 0x80
+#define PAACE_DA_HOST_CR_SHIFT 7
+
+#define PAACE_IA_CID 0x00FF0000
+#define PAACE_IA_CID_SHIFT 16
+#define PAACE_IA_WCE 0x000000F0
+#define PAACE_IA_WCE_SHIFT 4
+#define PAACE_IA_ATM 0x0000000C
+#define PAACE_IA_ATM_SHIFT 2
+#define PAACE_IA_OTM 0x00000003
+#define PAACE_IA_OTM_SHIFT 0
+
+#define PAACE_WIN_TWBAL 0xfffff000
+#define PAACE_WIN_TWBAL_SHIFT 12
+#define PAACE_WIN_SWSE 0x00000fc0
+#define PAACE_WIN_SWSE_SHIFT 6
+
+/* PAMU Data Structures */
+/* primary / secondary paact structure */
+struct paace {
+ /* PAACE Offset 0x00 */
+ u32 wbah; /* only valid for Primary PAACE */
+ u32 addr_bitfields; /* See P/S PAACE_AF_* */
+
+ /* PAACE Offset 0x08 */
+ /* Interpretation of first 32 bits dependent on DD above */
+ union {
+ struct {
+ /* Destination ID, see PAACE_DID_* defines */
+ u8 did;
+ /* Partition ID */
+ u8 pid;
+ /* Snoop ID */
+ u8 snpid;
+ /* coherency_required : 1 reserved : 7 */
+ u8 coherency_required; /* See PAACE_DA_* */
+ } to_host;
+ struct {
+ /* Destination ID, see PAACE_DID_* defines */
+ u8 did;
+ u8 reserved1;
+ u16 reserved2;
+ } to_io;
+ } domain_attr;
+
+ /* Implementation attributes + window count + address & operation translation modes */
+ u32 impl_attr; /* See PAACE_IA_* */
+
+ /* PAACE Offset 0x10 */
+ /* Translated window base address */
+ u32 twbah;
+ u32 win_bitfields; /* See PAACE_WIN_* */
+
+ /* PAACE Offset 0x18 */
+ /* first secondary paace entry */
+ u32 fspi; /* only valid for Primary PAACE */
+ union {
+ struct {
+ u8 ioea;
+ u8 moea;
+ u8 ioeb;
+ u8 moeb;
+ } immed_ot;
+ struct {
+ u16 reserved;
+ u16 omi;
+ } index_ot;
+ } op_encode;
+
+ /* PAACE Offsets 0x20-0x38 */
+ u32 reserved[8]; /* not currently implemented */
+};
+
+/* OME : Operation mapping entry
+ * MOE : Mapped Operation Encodings
+ * The operation mapping table is table containing operation mapping entries (OME).
+ * The index of a particular OME is programmed in the PAACE entry for translation
+ * in bound I/O operations corresponding to an LIODN. The OMT is used for translation
+ * specifically in case of the indexed translation mode. Each OME contains a 128
+ * byte mapped operation encoding (MOE), where each byte represents an MOE.
+ */
+#define NUM_MOE 128
+struct ome {
+ u8 moe[NUM_MOE];
+} __packed;
+
+#define PAACT_SIZE (sizeof(struct paace) * PAACE_NUMBER_ENTRIES)
+#define SPAACT_SIZE (sizeof(struct paace) * SPAACE_NUMBER_ENTRIES)
+#define OMT_SIZE (sizeof(struct ome) * OME_NUMBER_ENTRIES)
+
+#define PAMU_PAGE_SHIFT 12
+#define PAMU_PAGE_SIZE 4096ULL
+
+#define IOE_READ 0x00
+#define IOE_READ_IDX 0x00
+#define IOE_WRITE 0x81
+#define IOE_WRITE_IDX 0x01
+#define IOE_EREAD0 0x82 /* Enhanced read type 0 */
+#define IOE_EREAD0_IDX 0x02 /* Enhanced read type 0 */
+#define IOE_EWRITE0 0x83 /* Enhanced write type 0 */
+#define IOE_EWRITE0_IDX 0x03 /* Enhanced write type 0 */
+#define IOE_DIRECT0 0x84 /* Directive type 0 */
+#define IOE_DIRECT0_IDX 0x04 /* Directive type 0 */
+#define IOE_EREAD1 0x85 /* Enhanced read type 1 */
+#define IOE_EREAD1_IDX 0x05 /* Enhanced read type 1 */
+#define IOE_EWRITE1 0x86 /* Enhanced write type 1 */
+#define IOE_EWRITE1_IDX 0x06 /* Enhanced write type 1 */
+#define IOE_DIRECT1 0x87 /* Directive type 1 */
+#define IOE_DIRECT1_IDX 0x07 /* Directive type 1 */
+#define IOE_RAC 0x8c /* Read with Atomic clear */
+#define IOE_RAC_IDX 0x0c /* Read with Atomic clear */
+#define IOE_RAS 0x8d /* Read with Atomic set */
+#define IOE_RAS_IDX 0x0d /* Read with Atomic set */
+#define IOE_RAD 0x8e /* Read with Atomic decrement */
+#define IOE_RAD_IDX 0x0e /* Read with Atomic decrement */
+#define IOE_RAI 0x8f /* Read with Atomic increment */
+#define IOE_RAI_IDX 0x0f /* Read with Atomic increment */
+
+#define EOE_READ 0x00
+#define EOE_WRITE 0x01
+#define EOE_RAC 0x0c /* Read with Atomic clear */
+#define EOE_RAS 0x0d /* Read with Atomic set */
+#define EOE_RAD 0x0e /* Read with Atomic decrement */
+#define EOE_RAI 0x0f /* Read with Atomic increment */
+#define EOE_LDEC 0x10 /* Load external cache */
+#define EOE_LDECL 0x11 /* Load external cache with stash lock */
+#define EOE_LDECPE 0x12 /* Load external cache with preferred exclusive */
+#define EOE_LDECPEL 0x13 /* Load external cache with preferred exclusive and lock */
+#define EOE_LDECFE 0x14 /* Load external cache with forced exclusive */
+#define EOE_LDECFEL 0x15 /* Load external cache with forced exclusive and lock */
+#define EOE_RSA 0x16 /* Read with stash allocate */
+#define EOE_RSAU 0x17 /* Read with stash allocate and unlock */
+#define EOE_READI 0x18 /* Read with invalidate */
+#define EOE_RWNITC 0x19 /* Read with no intention to cache */
+#define EOE_WCI 0x1a /* Write cache inhibited */
+#define EOE_WWSA 0x1b /* Write with stash allocate */
+#define EOE_WWSAL 0x1c /* Write with stash allocate and lock */
+#define EOE_WWSAO 0x1d /* Write with stash allocate only */
+#define EOE_WWSAOL 0x1e /* Write with stash allocate only and lock */
+#define EOE_VALID 0x80
+
+/* Function prototypes */
+int pamu_domain_init(void);
+int pamu_enable_liodn(int liodn);
+int pamu_disable_liodn(int liodn);
+void pamu_free_subwins(int liodn);
+int pamu_config_ppaace(int liodn, phys_addr_t win_addr, phys_addr_t win_size,
+ u32 omi, unsigned long rpn, u32 snoopid, uint32_t stashid,
+ u32 subwin_cnt, int prot);
+int pamu_config_spaace(int liodn, u32 subwin_cnt, u32 subwin_addr,
+ phys_addr_t subwin_size, u32 omi, unsigned long rpn,
+ uint32_t snoopid, u32 stashid, int enable, int prot);
+
+u32 get_stash_id(u32 stash_dest_hint, u32 vcpu);
+void get_ome_index(u32 *omi_index, struct device *dev);
+int pamu_update_paace_stash(int liodn, u32 subwin, u32 value);
+int pamu_disable_spaace(int liodn, u32 subwin);
+u32 pamu_get_max_subwin_cnt(void);
+
+#endif /* __FSL_PAMU_H */
diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c
new file mode 100644
index 000000000..b2110767c
--- /dev/null
+++ b/drivers/iommu/fsl_pamu_domain.c
@@ -0,0 +1,1069 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright (C) 2013 Freescale Semiconductor, Inc.
+ * Author: Varun Sethi <varun.sethi@freescale.com>
+ */
+
+#define pr_fmt(fmt) "fsl-pamu-domain: %s: " fmt, __func__
+
+#include "fsl_pamu_domain.h"
+
+#include <sysdev/fsl_pci.h>
+
+/*
+ * Global spinlock that needs to be held while
+ * configuring PAMU.
+ */
+static DEFINE_SPINLOCK(iommu_lock);
+
+static struct kmem_cache *fsl_pamu_domain_cache;
+static struct kmem_cache *iommu_devinfo_cache;
+static DEFINE_SPINLOCK(device_domain_lock);
+
+struct iommu_device pamu_iommu; /* IOMMU core code handle */
+
+static struct fsl_dma_domain *to_fsl_dma_domain(struct iommu_domain *dom)
+{
+ return container_of(dom, struct fsl_dma_domain, iommu_domain);
+}
+
+static int __init iommu_init_mempool(void)
+{
+ fsl_pamu_domain_cache = kmem_cache_create("fsl_pamu_domain",
+ sizeof(struct fsl_dma_domain),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!fsl_pamu_domain_cache) {
+ pr_debug("Couldn't create fsl iommu_domain cache\n");
+ return -ENOMEM;
+ }
+
+ iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
+ sizeof(struct device_domain_info),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!iommu_devinfo_cache) {
+ pr_debug("Couldn't create devinfo cache\n");
+ kmem_cache_destroy(fsl_pamu_domain_cache);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static phys_addr_t get_phys_addr(struct fsl_dma_domain *dma_domain, dma_addr_t iova)
+{
+ u32 win_cnt = dma_domain->win_cnt;
+ struct dma_window *win_ptr = &dma_domain->win_arr[0];
+ struct iommu_domain_geometry *geom;
+
+ geom = &dma_domain->iommu_domain.geometry;
+
+ if (!win_cnt || !dma_domain->geom_size) {
+ pr_debug("Number of windows/geometry not configured for the domain\n");
+ return 0;
+ }
+
+ if (win_cnt > 1) {
+ u64 subwin_size;
+ dma_addr_t subwin_iova;
+ u32 wnd;
+
+ subwin_size = dma_domain->geom_size >> ilog2(win_cnt);
+ subwin_iova = iova & ~(subwin_size - 1);
+ wnd = (subwin_iova - geom->aperture_start) >> ilog2(subwin_size);
+ win_ptr = &dma_domain->win_arr[wnd];
+ }
+
+ if (win_ptr->valid)
+ return win_ptr->paddr + (iova & (win_ptr->size - 1));
+
+ return 0;
+}
+
+static int map_subwins(int liodn, struct fsl_dma_domain *dma_domain)
+{
+ struct dma_window *sub_win_ptr = &dma_domain->win_arr[0];
+ int i, ret;
+ unsigned long rpn, flags;
+
+ for (i = 0; i < dma_domain->win_cnt; i++) {
+ if (sub_win_ptr[i].valid) {
+ rpn = sub_win_ptr[i].paddr >> PAMU_PAGE_SHIFT;
+ spin_lock_irqsave(&iommu_lock, flags);
+ ret = pamu_config_spaace(liodn, dma_domain->win_cnt, i,
+ sub_win_ptr[i].size,
+ ~(u32)0,
+ rpn,
+ dma_domain->snoop_id,
+ dma_domain->stash_id,
+ (i > 0) ? 1 : 0,
+ sub_win_ptr[i].prot);
+ spin_unlock_irqrestore(&iommu_lock, flags);
+ if (ret) {
+ pr_debug("SPAACE configuration failed for liodn %d\n",
+ liodn);
+ return ret;
+ }
+ }
+ }
+
+ return ret;
+}
+
+static int map_win(int liodn, struct fsl_dma_domain *dma_domain)
+{
+ int ret;
+ struct dma_window *wnd = &dma_domain->win_arr[0];
+ phys_addr_t wnd_addr = dma_domain->iommu_domain.geometry.aperture_start;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu_lock, flags);
+ ret = pamu_config_ppaace(liodn, wnd_addr,
+ wnd->size,
+ ~(u32)0,
+ wnd->paddr >> PAMU_PAGE_SHIFT,
+ dma_domain->snoop_id, dma_domain->stash_id,
+ 0, wnd->prot);
+ spin_unlock_irqrestore(&iommu_lock, flags);
+ if (ret)
+ pr_debug("PAACE configuration failed for liodn %d\n", liodn);
+
+ return ret;
+}
+
+/* Map the DMA window corresponding to the LIODN */
+static int map_liodn(int liodn, struct fsl_dma_domain *dma_domain)
+{
+ if (dma_domain->win_cnt > 1)
+ return map_subwins(liodn, dma_domain);
+ else
+ return map_win(liodn, dma_domain);
+}
+
+/* Update window/subwindow mapping for the LIODN */
+static int update_liodn(int liodn, struct fsl_dma_domain *dma_domain, u32 wnd_nr)
+{
+ int ret;
+ struct dma_window *wnd = &dma_domain->win_arr[wnd_nr];
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu_lock, flags);
+ if (dma_domain->win_cnt > 1) {
+ ret = pamu_config_spaace(liodn, dma_domain->win_cnt, wnd_nr,
+ wnd->size,
+ ~(u32)0,
+ wnd->paddr >> PAMU_PAGE_SHIFT,
+ dma_domain->snoop_id,
+ dma_domain->stash_id,
+ (wnd_nr > 0) ? 1 : 0,
+ wnd->prot);
+ if (ret)
+ pr_debug("Subwindow reconfiguration failed for liodn %d\n",
+ liodn);
+ } else {
+ phys_addr_t wnd_addr;
+
+ wnd_addr = dma_domain->iommu_domain.geometry.aperture_start;
+
+ ret = pamu_config_ppaace(liodn, wnd_addr,
+ wnd->size,
+ ~(u32)0,
+ wnd->paddr >> PAMU_PAGE_SHIFT,
+ dma_domain->snoop_id, dma_domain->stash_id,
+ 0, wnd->prot);
+ if (ret)
+ pr_debug("Window reconfiguration failed for liodn %d\n",
+ liodn);
+ }
+
+ spin_unlock_irqrestore(&iommu_lock, flags);
+
+ return ret;
+}
+
+static int update_liodn_stash(int liodn, struct fsl_dma_domain *dma_domain,
+ u32 val)
+{
+ int ret = 0, i;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu_lock, flags);
+ if (!dma_domain->win_arr) {
+ pr_debug("Windows not configured, stash destination update failed for liodn %d\n",
+ liodn);
+ spin_unlock_irqrestore(&iommu_lock, flags);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < dma_domain->win_cnt; i++) {
+ ret = pamu_update_paace_stash(liodn, i, val);
+ if (ret) {
+ pr_debug("Failed to update SPAACE %d field for liodn %d\n ",
+ i, liodn);
+ spin_unlock_irqrestore(&iommu_lock, flags);
+ return ret;
+ }
+ }
+
+ spin_unlock_irqrestore(&iommu_lock, flags);
+
+ return ret;
+}
+
+/* Set the geometry parameters for a LIODN */
+static int pamu_set_liodn(int liodn, struct device *dev,
+ struct fsl_dma_domain *dma_domain,
+ struct iommu_domain_geometry *geom_attr,
+ u32 win_cnt)
+{
+ phys_addr_t window_addr, window_size;
+ phys_addr_t subwin_size;
+ int ret = 0, i;
+ u32 omi_index = ~(u32)0;
+ unsigned long flags;
+
+ /*
+ * Configure the omi_index at the geometry setup time.
+ * This is a static value which depends on the type of
+ * device and would not change thereafter.
+ */
+ get_ome_index(&omi_index, dev);
+
+ window_addr = geom_attr->aperture_start;
+ window_size = dma_domain->geom_size;
+
+ spin_lock_irqsave(&iommu_lock, flags);
+ ret = pamu_disable_liodn(liodn);
+ if (!ret)
+ ret = pamu_config_ppaace(liodn, window_addr, window_size, omi_index,
+ 0, dma_domain->snoop_id,
+ dma_domain->stash_id, win_cnt, 0);
+ spin_unlock_irqrestore(&iommu_lock, flags);
+ if (ret) {
+ pr_debug("PAACE configuration failed for liodn %d, win_cnt =%d\n",
+ liodn, win_cnt);
+ return ret;
+ }
+
+ if (win_cnt > 1) {
+ subwin_size = window_size >> ilog2(win_cnt);
+ for (i = 0; i < win_cnt; i++) {
+ spin_lock_irqsave(&iommu_lock, flags);
+ ret = pamu_disable_spaace(liodn, i);
+ if (!ret)
+ ret = pamu_config_spaace(liodn, win_cnt, i,
+ subwin_size, omi_index,
+ 0, dma_domain->snoop_id,
+ dma_domain->stash_id,
+ 0, 0);
+ spin_unlock_irqrestore(&iommu_lock, flags);
+ if (ret) {
+ pr_debug("SPAACE configuration failed for liodn %d\n",
+ liodn);
+ return ret;
+ }
+ }
+ }
+
+ return ret;
+}
+
+static int check_size(u64 size, dma_addr_t iova)
+{
+ /*
+ * Size must be a power of two and at least be equal
+ * to PAMU page size.
+ */
+ if ((size & (size - 1)) || size < PAMU_PAGE_SIZE) {
+ pr_debug("Size too small or not a power of two\n");
+ return -EINVAL;
+ }
+
+ /* iova must be page size aligned */
+ if (iova & (size - 1)) {
+ pr_debug("Address is not aligned with window size\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct fsl_dma_domain *iommu_alloc_dma_domain(void)
+{
+ struct fsl_dma_domain *domain;
+
+ domain = kmem_cache_zalloc(fsl_pamu_domain_cache, GFP_KERNEL);
+ if (!domain)
+ return NULL;
+
+ domain->stash_id = ~(u32)0;
+ domain->snoop_id = ~(u32)0;
+ domain->win_cnt = pamu_get_max_subwin_cnt();
+ domain->geom_size = 0;
+
+ INIT_LIST_HEAD(&domain->devices);
+
+ spin_lock_init(&domain->domain_lock);
+
+ return domain;
+}
+
+static void remove_device_ref(struct device_domain_info *info, u32 win_cnt)
+{
+ unsigned long flags;
+
+ list_del(&info->link);
+ spin_lock_irqsave(&iommu_lock, flags);
+ if (win_cnt > 1)
+ pamu_free_subwins(info->liodn);
+ pamu_disable_liodn(info->liodn);
+ spin_unlock_irqrestore(&iommu_lock, flags);
+ spin_lock_irqsave(&device_domain_lock, flags);
+ dev_iommu_priv_set(info->dev, NULL);
+ kmem_cache_free(iommu_devinfo_cache, info);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
+static void detach_device(struct device *dev, struct fsl_dma_domain *dma_domain)
+{
+ struct device_domain_info *info, *tmp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dma_domain->domain_lock, flags);
+ /* Remove the device from the domain device list */
+ list_for_each_entry_safe(info, tmp, &dma_domain->devices, link) {
+ if (!dev || (info->dev == dev))
+ remove_device_ref(info, dma_domain->win_cnt);
+ }
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+}
+
+static void attach_device(struct fsl_dma_domain *dma_domain, int liodn, struct device *dev)
+{
+ struct device_domain_info *info, *old_domain_info;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ /*
+ * Check here if the device is already attached to domain or not.
+ * If the device is already attached to a domain detach it.
+ */
+ old_domain_info = dev_iommu_priv_get(dev);
+ if (old_domain_info && old_domain_info->domain != dma_domain) {
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+ detach_device(dev, old_domain_info->domain);
+ spin_lock_irqsave(&device_domain_lock, flags);
+ }
+
+ info = kmem_cache_zalloc(iommu_devinfo_cache, GFP_ATOMIC);
+
+ info->dev = dev;
+ info->liodn = liodn;
+ info->domain = dma_domain;
+
+ list_add(&info->link, &dma_domain->devices);
+ /*
+ * In case of devices with multiple LIODNs just store
+ * the info for the first LIODN as all
+ * LIODNs share the same domain
+ */
+ if (!dev_iommu_priv_get(dev))
+ dev_iommu_priv_set(dev, info);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
+static phys_addr_t fsl_pamu_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+
+ if (iova < domain->geometry.aperture_start ||
+ iova > domain->geometry.aperture_end)
+ return 0;
+
+ return get_phys_addr(dma_domain, iova);
+}
+
+static bool fsl_pamu_capable(enum iommu_cap cap)
+{
+ return cap == IOMMU_CAP_CACHE_COHERENCY;
+}
+
+static void fsl_pamu_domain_free(struct iommu_domain *domain)
+{
+ struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+
+ /* remove all the devices from the device list */
+ detach_device(NULL, dma_domain);
+
+ dma_domain->enabled = 0;
+ dma_domain->mapped = 0;
+
+ kmem_cache_free(fsl_pamu_domain_cache, dma_domain);
+}
+
+static struct iommu_domain *fsl_pamu_domain_alloc(unsigned type)
+{
+ struct fsl_dma_domain *dma_domain;
+
+ if (type != IOMMU_DOMAIN_UNMANAGED)
+ return NULL;
+
+ dma_domain = iommu_alloc_dma_domain();
+ if (!dma_domain) {
+ pr_debug("dma_domain allocation failed\n");
+ return NULL;
+ }
+ /* defaul geometry 64 GB i.e. maximum system address */
+ dma_domain->iommu_domain. geometry.aperture_start = 0;
+ dma_domain->iommu_domain.geometry.aperture_end = (1ULL << 36) - 1;
+ dma_domain->iommu_domain.geometry.force_aperture = true;
+
+ return &dma_domain->iommu_domain;
+}
+
+/* Configure geometry settings for all LIODNs associated with domain */
+static int pamu_set_domain_geometry(struct fsl_dma_domain *dma_domain,
+ struct iommu_domain_geometry *geom_attr,
+ u32 win_cnt)
+{
+ struct device_domain_info *info;
+ int ret = 0;
+
+ list_for_each_entry(info, &dma_domain->devices, link) {
+ ret = pamu_set_liodn(info->liodn, info->dev, dma_domain,
+ geom_attr, win_cnt);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+/* Update stash destination for all LIODNs associated with the domain */
+static int update_domain_stash(struct fsl_dma_domain *dma_domain, u32 val)
+{
+ struct device_domain_info *info;
+ int ret = 0;
+
+ list_for_each_entry(info, &dma_domain->devices, link) {
+ ret = update_liodn_stash(info->liodn, dma_domain, val);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+/* Update domain mappings for all LIODNs associated with the domain */
+static int update_domain_mapping(struct fsl_dma_domain *dma_domain, u32 wnd_nr)
+{
+ struct device_domain_info *info;
+ int ret = 0;
+
+ list_for_each_entry(info, &dma_domain->devices, link) {
+ ret = update_liodn(info->liodn, dma_domain, wnd_nr);
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+static int disable_domain_win(struct fsl_dma_domain *dma_domain, u32 wnd_nr)
+{
+ struct device_domain_info *info;
+ int ret = 0;
+
+ list_for_each_entry(info, &dma_domain->devices, link) {
+ if (dma_domain->win_cnt == 1 && dma_domain->enabled) {
+ ret = pamu_disable_liodn(info->liodn);
+ if (!ret)
+ dma_domain->enabled = 0;
+ } else {
+ ret = pamu_disable_spaace(info->liodn, wnd_nr);
+ }
+ }
+
+ return ret;
+}
+
+static void fsl_pamu_window_disable(struct iommu_domain *domain, u32 wnd_nr)
+{
+ struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&dma_domain->domain_lock, flags);
+ if (!dma_domain->win_arr) {
+ pr_debug("Number of windows not configured\n");
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+ return;
+ }
+
+ if (wnd_nr >= dma_domain->win_cnt) {
+ pr_debug("Invalid window index\n");
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+ return;
+ }
+
+ if (dma_domain->win_arr[wnd_nr].valid) {
+ ret = disable_domain_win(dma_domain, wnd_nr);
+ if (!ret) {
+ dma_domain->win_arr[wnd_nr].valid = 0;
+ dma_domain->mapped--;
+ }
+ }
+
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+}
+
+static int fsl_pamu_window_enable(struct iommu_domain *domain, u32 wnd_nr,
+ phys_addr_t paddr, u64 size, int prot)
+{
+ struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+ struct dma_window *wnd;
+ int pamu_prot = 0;
+ int ret;
+ unsigned long flags;
+ u64 win_size;
+
+ if (prot & IOMMU_READ)
+ pamu_prot |= PAACE_AP_PERMS_QUERY;
+ if (prot & IOMMU_WRITE)
+ pamu_prot |= PAACE_AP_PERMS_UPDATE;
+
+ spin_lock_irqsave(&dma_domain->domain_lock, flags);
+ if (!dma_domain->win_arr) {
+ pr_debug("Number of windows not configured\n");
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+ return -ENODEV;
+ }
+
+ if (wnd_nr >= dma_domain->win_cnt) {
+ pr_debug("Invalid window index\n");
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+ return -EINVAL;
+ }
+
+ win_size = dma_domain->geom_size >> ilog2(dma_domain->win_cnt);
+ if (size > win_size) {
+ pr_debug("Invalid window size\n");
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+ return -EINVAL;
+ }
+
+ if (dma_domain->win_cnt == 1) {
+ if (dma_domain->enabled) {
+ pr_debug("Disable the window before updating the mapping\n");
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+ return -EBUSY;
+ }
+
+ ret = check_size(size, domain->geometry.aperture_start);
+ if (ret) {
+ pr_debug("Aperture start not aligned to the size\n");
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+ return -EINVAL;
+ }
+ }
+
+ wnd = &dma_domain->win_arr[wnd_nr];
+ if (!wnd->valid) {
+ wnd->paddr = paddr;
+ wnd->size = size;
+ wnd->prot = pamu_prot;
+
+ ret = update_domain_mapping(dma_domain, wnd_nr);
+ if (!ret) {
+ wnd->valid = 1;
+ dma_domain->mapped++;
+ }
+ } else {
+ pr_debug("Disable the window before updating the mapping\n");
+ ret = -EBUSY;
+ }
+
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+
+ return ret;
+}
+
+/*
+ * Attach the LIODN to the DMA domain and configure the geometry
+ * and window mappings.
+ */
+static int handle_attach_device(struct fsl_dma_domain *dma_domain,
+ struct device *dev, const u32 *liodn,
+ int num)
+{
+ unsigned long flags;
+ struct iommu_domain *domain = &dma_domain->iommu_domain;
+ int ret = 0;
+ int i;
+
+ spin_lock_irqsave(&dma_domain->domain_lock, flags);
+ for (i = 0; i < num; i++) {
+ /* Ensure that LIODN value is valid */
+ if (liodn[i] >= PAACE_NUMBER_ENTRIES) {
+ pr_debug("Invalid liodn %d, attach device failed for %pOF\n",
+ liodn[i], dev->of_node);
+ ret = -EINVAL;
+ break;
+ }
+
+ attach_device(dma_domain, liodn[i], dev);
+ /*
+ * Check if geometry has already been configured
+ * for the domain. If yes, set the geometry for
+ * the LIODN.
+ */
+ if (dma_domain->win_arr) {
+ u32 win_cnt = dma_domain->win_cnt > 1 ? dma_domain->win_cnt : 0;
+
+ ret = pamu_set_liodn(liodn[i], dev, dma_domain,
+ &domain->geometry, win_cnt);
+ if (ret)
+ break;
+ if (dma_domain->mapped) {
+ /*
+ * Create window/subwindow mapping for
+ * the LIODN.
+ */
+ ret = map_liodn(liodn[i], dma_domain);
+ if (ret)
+ break;
+ }
+ }
+ }
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+
+ return ret;
+}
+
+static int fsl_pamu_attach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+ const u32 *liodn;
+ u32 liodn_cnt;
+ int len, ret = 0;
+ struct pci_dev *pdev = NULL;
+ struct pci_controller *pci_ctl;
+
+ /*
+ * Use LIODN of the PCI controller while attaching a
+ * PCI device.
+ */
+ if (dev_is_pci(dev)) {
+ pdev = to_pci_dev(dev);
+ pci_ctl = pci_bus_to_host(pdev->bus);
+ /*
+ * make dev point to pci controller device
+ * so we can get the LIODN programmed by
+ * u-boot.
+ */
+ dev = pci_ctl->parent;
+ }
+
+ liodn = of_get_property(dev->of_node, "fsl,liodn", &len);
+ if (liodn) {
+ liodn_cnt = len / sizeof(u32);
+ ret = handle_attach_device(dma_domain, dev, liodn, liodn_cnt);
+ } else {
+ pr_debug("missing fsl,liodn property at %pOF\n", dev->of_node);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static void fsl_pamu_detach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+ const u32 *prop;
+ int len;
+ struct pci_dev *pdev = NULL;
+ struct pci_controller *pci_ctl;
+
+ /*
+ * Use LIODN of the PCI controller while detaching a
+ * PCI device.
+ */
+ if (dev_is_pci(dev)) {
+ pdev = to_pci_dev(dev);
+ pci_ctl = pci_bus_to_host(pdev->bus);
+ /*
+ * make dev point to pci controller device
+ * so we can get the LIODN programmed by
+ * u-boot.
+ */
+ dev = pci_ctl->parent;
+ }
+
+ prop = of_get_property(dev->of_node, "fsl,liodn", &len);
+ if (prop)
+ detach_device(dev, dma_domain);
+ else
+ pr_debug("missing fsl,liodn property at %pOF\n", dev->of_node);
+}
+
+static int configure_domain_geometry(struct iommu_domain *domain, void *data)
+{
+ struct iommu_domain_geometry *geom_attr = data;
+ struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+ dma_addr_t geom_size;
+ unsigned long flags;
+
+ geom_size = geom_attr->aperture_end - geom_attr->aperture_start + 1;
+ /*
+ * Sanity check the geometry size. Also, we do not support
+ * DMA outside of the geometry.
+ */
+ if (check_size(geom_size, geom_attr->aperture_start) ||
+ !geom_attr->force_aperture) {
+ pr_debug("Invalid PAMU geometry attributes\n");
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&dma_domain->domain_lock, flags);
+ if (dma_domain->enabled) {
+ pr_debug("Can't set geometry attributes as domain is active\n");
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+ return -EBUSY;
+ }
+
+ /* Copy the domain geometry information */
+ memcpy(&domain->geometry, geom_attr,
+ sizeof(struct iommu_domain_geometry));
+ dma_domain->geom_size = geom_size;
+
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+
+ return 0;
+}
+
+/* Set the domain stash attribute */
+static int configure_domain_stash(struct fsl_dma_domain *dma_domain, void *data)
+{
+ struct pamu_stash_attribute *stash_attr = data;
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&dma_domain->domain_lock, flags);
+
+ memcpy(&dma_domain->dma_stash, stash_attr,
+ sizeof(struct pamu_stash_attribute));
+
+ dma_domain->stash_id = get_stash_id(stash_attr->cache,
+ stash_attr->cpu);
+ if (dma_domain->stash_id == ~(u32)0) {
+ pr_debug("Invalid stash attributes\n");
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+ return -EINVAL;
+ }
+
+ ret = update_domain_stash(dma_domain, dma_domain->stash_id);
+
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+
+ return ret;
+}
+
+/* Configure domain dma state i.e. enable/disable DMA */
+static int configure_domain_dma_state(struct fsl_dma_domain *dma_domain, bool enable)
+{
+ struct device_domain_info *info;
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&dma_domain->domain_lock, flags);
+
+ if (enable && !dma_domain->mapped) {
+ pr_debug("Can't enable DMA domain without valid mapping\n");
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+ return -ENODEV;
+ }
+
+ dma_domain->enabled = enable;
+ list_for_each_entry(info, &dma_domain->devices, link) {
+ ret = (enable) ? pamu_enable_liodn(info->liodn) :
+ pamu_disable_liodn(info->liodn);
+ if (ret)
+ pr_debug("Unable to set dma state for liodn %d",
+ info->liodn);
+ }
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+
+ return 0;
+}
+
+static int fsl_pamu_set_windows(struct iommu_domain *domain, u32 w_count)
+{
+ struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&dma_domain->domain_lock, flags);
+ /* Ensure domain is inactive i.e. DMA should be disabled for the domain */
+ if (dma_domain->enabled) {
+ pr_debug("Can't set geometry attributes as domain is active\n");
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+ return -EBUSY;
+ }
+
+ /* Ensure that the geometry has been set for the domain */
+ if (!dma_domain->geom_size) {
+ pr_debug("Please configure geometry before setting the number of windows\n");
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+ return -EINVAL;
+ }
+
+ /*
+ * Ensure we have valid window count i.e. it should be less than
+ * maximum permissible limit and should be a power of two.
+ */
+ if (w_count > pamu_get_max_subwin_cnt() || !is_power_of_2(w_count)) {
+ pr_debug("Invalid window count\n");
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+ return -EINVAL;
+ }
+
+ ret = pamu_set_domain_geometry(dma_domain, &domain->geometry,
+ w_count > 1 ? w_count : 0);
+ if (!ret) {
+ kfree(dma_domain->win_arr);
+ dma_domain->win_arr = kcalloc(w_count,
+ sizeof(*dma_domain->win_arr),
+ GFP_ATOMIC);
+ if (!dma_domain->win_arr) {
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+ return -ENOMEM;
+ }
+ dma_domain->win_cnt = w_count;
+ }
+ spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+
+ return ret;
+}
+
+static int fsl_pamu_set_domain_attr(struct iommu_domain *domain,
+ enum iommu_attr attr_type, void *data)
+{
+ struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+ int ret = 0;
+
+ switch (attr_type) {
+ case DOMAIN_ATTR_GEOMETRY:
+ ret = configure_domain_geometry(domain, data);
+ break;
+ case DOMAIN_ATTR_FSL_PAMU_STASH:
+ ret = configure_domain_stash(dma_domain, data);
+ break;
+ case DOMAIN_ATTR_FSL_PAMU_ENABLE:
+ ret = configure_domain_dma_state(dma_domain, *(int *)data);
+ break;
+ case DOMAIN_ATTR_WINDOWS:
+ ret = fsl_pamu_set_windows(domain, *(u32 *)data);
+ break;
+ default:
+ pr_debug("Unsupported attribute type\n");
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static int fsl_pamu_get_domain_attr(struct iommu_domain *domain,
+ enum iommu_attr attr_type, void *data)
+{
+ struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+ int ret = 0;
+
+ switch (attr_type) {
+ case DOMAIN_ATTR_FSL_PAMU_STASH:
+ memcpy(data, &dma_domain->dma_stash,
+ sizeof(struct pamu_stash_attribute));
+ break;
+ case DOMAIN_ATTR_FSL_PAMU_ENABLE:
+ *(int *)data = dma_domain->enabled;
+ break;
+ case DOMAIN_ATTR_FSL_PAMUV1:
+ *(int *)data = DOMAIN_ATTR_FSL_PAMUV1;
+ break;
+ case DOMAIN_ATTR_WINDOWS:
+ *(u32 *)data = dma_domain->win_cnt;
+ break;
+ default:
+ pr_debug("Unsupported attribute type\n");
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static struct iommu_group *get_device_iommu_group(struct device *dev)
+{
+ struct iommu_group *group;
+
+ group = iommu_group_get(dev);
+ if (!group)
+ group = iommu_group_alloc();
+
+ return group;
+}
+
+static bool check_pci_ctl_endpt_part(struct pci_controller *pci_ctl)
+{
+ u32 version;
+
+ /* Check the PCI controller version number by readding BRR1 register */
+ version = in_be32(pci_ctl->cfg_addr + (PCI_FSL_BRR1 >> 2));
+ version &= PCI_FSL_BRR1_VER;
+ /* If PCI controller version is >= 0x204 we can partition endpoints */
+ return version >= 0x204;
+}
+
+/* Get iommu group information from peer devices or devices on the parent bus */
+static struct iommu_group *get_shared_pci_device_group(struct pci_dev *pdev)
+{
+ struct pci_dev *tmp;
+ struct iommu_group *group;
+ struct pci_bus *bus = pdev->bus;
+
+ /*
+ * Traverese the pci bus device list to get
+ * the shared iommu group.
+ */
+ while (bus) {
+ list_for_each_entry(tmp, &bus->devices, bus_list) {
+ if (tmp == pdev)
+ continue;
+ group = iommu_group_get(&tmp->dev);
+ if (group)
+ return group;
+ }
+
+ bus = bus->parent;
+ }
+
+ return NULL;
+}
+
+static struct iommu_group *get_pci_device_group(struct pci_dev *pdev)
+{
+ struct pci_controller *pci_ctl;
+ bool pci_endpt_partitioning;
+ struct iommu_group *group = NULL;
+
+ pci_ctl = pci_bus_to_host(pdev->bus);
+ pci_endpt_partitioning = check_pci_ctl_endpt_part(pci_ctl);
+ /* We can partition PCIe devices so assign device group to the device */
+ if (pci_endpt_partitioning) {
+ group = pci_device_group(&pdev->dev);
+
+ /*
+ * PCIe controller is not a paritionable entity
+ * free the controller device iommu_group.
+ */
+ if (pci_ctl->parent->iommu_group)
+ iommu_group_remove_device(pci_ctl->parent);
+ } else {
+ /*
+ * All devices connected to the controller will share the
+ * PCI controllers device group. If this is the first
+ * device to be probed for the pci controller, copy the
+ * device group information from the PCI controller device
+ * node and remove the PCI controller iommu group.
+ * For subsequent devices, the iommu group information can
+ * be obtained from sibling devices (i.e. from the bus_devices
+ * link list).
+ */
+ if (pci_ctl->parent->iommu_group) {
+ group = get_device_iommu_group(pci_ctl->parent);
+ iommu_group_remove_device(pci_ctl->parent);
+ } else {
+ group = get_shared_pci_device_group(pdev);
+ }
+ }
+
+ if (!group)
+ group = ERR_PTR(-ENODEV);
+
+ return group;
+}
+
+static struct iommu_group *fsl_pamu_device_group(struct device *dev)
+{
+ struct iommu_group *group = ERR_PTR(-ENODEV);
+ int len;
+
+ /*
+ * For platform devices we allocate a separate group for
+ * each of the devices.
+ */
+ if (dev_is_pci(dev))
+ group = get_pci_device_group(to_pci_dev(dev));
+ else if (of_get_property(dev->of_node, "fsl,liodn", &len))
+ group = get_device_iommu_group(dev);
+
+ return group;
+}
+
+static struct iommu_device *fsl_pamu_probe_device(struct device *dev)
+{
+ return &pamu_iommu;
+}
+
+static void fsl_pamu_release_device(struct device *dev)
+{
+}
+
+static const struct iommu_ops fsl_pamu_ops = {
+ .capable = fsl_pamu_capable,
+ .domain_alloc = fsl_pamu_domain_alloc,
+ .domain_free = fsl_pamu_domain_free,
+ .attach_dev = fsl_pamu_attach_device,
+ .detach_dev = fsl_pamu_detach_device,
+ .domain_window_enable = fsl_pamu_window_enable,
+ .domain_window_disable = fsl_pamu_window_disable,
+ .iova_to_phys = fsl_pamu_iova_to_phys,
+ .domain_set_attr = fsl_pamu_set_domain_attr,
+ .domain_get_attr = fsl_pamu_get_domain_attr,
+ .probe_device = fsl_pamu_probe_device,
+ .release_device = fsl_pamu_release_device,
+ .device_group = fsl_pamu_device_group,
+};
+
+int __init pamu_domain_init(void)
+{
+ int ret = 0;
+
+ ret = iommu_init_mempool();
+ if (ret)
+ return ret;
+
+ ret = iommu_device_sysfs_add(&pamu_iommu, NULL, NULL, "iommu0");
+ if (ret)
+ return ret;
+
+ iommu_device_set_ops(&pamu_iommu, &fsl_pamu_ops);
+
+ ret = iommu_device_register(&pamu_iommu);
+ if (ret) {
+ iommu_device_sysfs_remove(&pamu_iommu);
+ pr_err("Can't register iommu device\n");
+ return ret;
+ }
+
+ bus_set_iommu(&platform_bus_type, &fsl_pamu_ops);
+ bus_set_iommu(&pci_bus_type, &fsl_pamu_ops);
+
+ return ret;
+}
diff --git a/drivers/iommu/fsl_pamu_domain.h b/drivers/iommu/fsl_pamu_domain.h
new file mode 100644
index 000000000..2865d4278
--- /dev/null
+++ b/drivers/iommu/fsl_pamu_domain.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *
+ * Copyright (C) 2013 Freescale Semiconductor, Inc.
+ */
+
+#ifndef __FSL_PAMU_DOMAIN_H
+#define __FSL_PAMU_DOMAIN_H
+
+#include "fsl_pamu.h"
+
+struct dma_window {
+ phys_addr_t paddr;
+ u64 size;
+ int valid;
+ int prot;
+};
+
+struct fsl_dma_domain {
+ /*
+ * Indicates the geometry size for the domain.
+ * This would be set when the geometry is
+ * configured for the domain.
+ */
+ dma_addr_t geom_size;
+ /*
+ * Number of windows assocaited with this domain.
+ * During domain initialization, it is set to the
+ * the maximum number of subwindows allowed for a LIODN.
+ * Minimum value for this is 1 indicating a single PAMU
+ * window, without any sub windows. Value can be set/
+ * queried by set_attr/get_attr API for DOMAIN_ATTR_WINDOWS.
+ * Value can only be set once the geometry has been configured.
+ */
+ u32 win_cnt;
+ /*
+ * win_arr contains information of the configured
+ * windows for a domain. This is allocated only
+ * when the number of windows for the domain are
+ * set.
+ */
+ struct dma_window *win_arr;
+ /* list of devices associated with the domain */
+ struct list_head devices;
+ /* dma_domain states:
+ * mapped - A particular mapping has been created
+ * within the configured geometry.
+ * enabled - DMA has been enabled for the given
+ * domain. This translates to setting of the
+ * valid bit for the primary PAACE in the PAMU
+ * PAACT table. Domain geometry should be set and
+ * it must have a valid mapping before DMA can be
+ * enabled for it.
+ *
+ */
+ int mapped;
+ int enabled;
+ /* stash_id obtained from the stash attribute details */
+ u32 stash_id;
+ struct pamu_stash_attribute dma_stash;
+ u32 snoop_id;
+ struct iommu_domain iommu_domain;
+ spinlock_t domain_lock;
+};
+
+/* domain-device relationship */
+struct device_domain_info {
+ struct list_head link; /* link to domain siblings */
+ struct device *dev;
+ u32 liodn;
+ struct fsl_dma_domain *domain; /* pointer to domain */
+};
+#endif /* __FSL_PAMU_DOMAIN_H */
diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c
new file mode 100644
index 000000000..e09e2d734
--- /dev/null
+++ b/drivers/iommu/hyperv-iommu.c
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Hyper-V stub IOMMU driver.
+ *
+ * Copyright (C) 2019, Microsoft, Inc.
+ *
+ * Author : Lan Tianyu <Tianyu.Lan@microsoft.com>
+ */
+
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+
+#include <asm/apic.h>
+#include <asm/cpu.h>
+#include <asm/hw_irq.h>
+#include <asm/io_apic.h>
+#include <asm/irq_remapping.h>
+#include <asm/hypervisor.h>
+
+#include "irq_remapping.h"
+
+#ifdef CONFIG_IRQ_REMAP
+
+/*
+ * According 82093AA IO-APIC spec , IO APIC has a 24-entry Interrupt
+ * Redirection Table. Hyper-V exposes one single IO-APIC and so define
+ * 24 IO APIC remmapping entries.
+ */
+#define IOAPIC_REMAPPING_ENTRY 24
+
+static cpumask_t ioapic_max_cpumask = { CPU_BITS_NONE };
+static struct irq_domain *ioapic_ir_domain;
+
+static int hyperv_ir_set_affinity(struct irq_data *data,
+ const struct cpumask *mask, bool force)
+{
+ struct irq_data *parent = data->parent_data;
+ struct irq_cfg *cfg = irqd_cfg(data);
+ struct IO_APIC_route_entry *entry;
+ int ret;
+
+ /* Return error If new irq affinity is out of ioapic_max_cpumask. */
+ if (!cpumask_subset(mask, &ioapic_max_cpumask))
+ return -EINVAL;
+
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+ return ret;
+
+ entry = data->chip_data;
+ entry->dest = cfg->dest_apicid;
+ entry->vector = cfg->vector;
+ send_cleanup_vector(cfg);
+
+ return 0;
+}
+
+static struct irq_chip hyperv_ir_chip = {
+ .name = "HYPERV-IR",
+ .irq_ack = apic_ack_irq,
+ .irq_set_affinity = hyperv_ir_set_affinity,
+};
+
+static int hyperv_irq_remapping_alloc(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs,
+ void *arg)
+{
+ struct irq_alloc_info *info = arg;
+ struct irq_data *irq_data;
+ struct irq_desc *desc;
+ int ret = 0;
+
+ if (!info || info->type != X86_IRQ_ALLOC_TYPE_IOAPIC || nr_irqs > 1)
+ return -EINVAL;
+
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+
+ irq_data = irq_domain_get_irq_data(domain, virq);
+ if (!irq_data) {
+ irq_domain_free_irqs_common(domain, virq, nr_irqs);
+ return -EINVAL;
+ }
+
+ irq_data->chip = &hyperv_ir_chip;
+
+ /*
+ * If there is interrupt remapping function of IOMMU, setting irq
+ * affinity only needs to change IRTE of IOMMU. But Hyper-V doesn't
+ * support interrupt remapping function, setting irq affinity of IO-APIC
+ * interrupts still needs to change IO-APIC registers. But ioapic_
+ * configure_entry() will ignore value of cfg->vector and cfg->
+ * dest_apicid when IO-APIC's parent irq domain is not the vector
+ * domain.(See ioapic_configure_entry()) In order to setting vector
+ * and dest_apicid to IO-APIC register, IO-APIC entry pointer is saved
+ * in the chip_data and hyperv_irq_remapping_activate()/hyperv_ir_set_
+ * affinity() set vector and dest_apicid directly into IO-APIC entry.
+ */
+ irq_data->chip_data = info->ioapic.entry;
+
+ /*
+ * Hypver-V IO APIC irq affinity should be in the scope of
+ * ioapic_max_cpumask because no irq remapping support.
+ */
+ desc = irq_data_to_desc(irq_data);
+ cpumask_copy(desc->irq_common_data.affinity, &ioapic_max_cpumask);
+
+ return 0;
+}
+
+static void hyperv_irq_remapping_free(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ irq_domain_free_irqs_common(domain, virq, nr_irqs);
+}
+
+static int hyperv_irq_remapping_activate(struct irq_domain *domain,
+ struct irq_data *irq_data, bool reserve)
+{
+ struct irq_cfg *cfg = irqd_cfg(irq_data);
+ struct IO_APIC_route_entry *entry = irq_data->chip_data;
+
+ entry->dest = cfg->dest_apicid;
+ entry->vector = cfg->vector;
+
+ return 0;
+}
+
+static const struct irq_domain_ops hyperv_ir_domain_ops = {
+ .alloc = hyperv_irq_remapping_alloc,
+ .free = hyperv_irq_remapping_free,
+ .activate = hyperv_irq_remapping_activate,
+};
+
+static int __init hyperv_prepare_irq_remapping(void)
+{
+ struct fwnode_handle *fn;
+ int i;
+
+ if (!hypervisor_is_type(X86_HYPER_MS_HYPERV) ||
+ !x2apic_supported())
+ return -ENODEV;
+
+ fn = irq_domain_alloc_named_id_fwnode("HYPERV-IR", 0);
+ if (!fn)
+ return -ENOMEM;
+
+ ioapic_ir_domain =
+ irq_domain_create_hierarchy(arch_get_ir_parent_domain(),
+ 0, IOAPIC_REMAPPING_ENTRY, fn,
+ &hyperv_ir_domain_ops, NULL);
+
+ if (!ioapic_ir_domain) {
+ irq_domain_free_fwnode(fn);
+ return -ENOMEM;
+ }
+
+ /*
+ * Hyper-V doesn't provide irq remapping function for
+ * IO-APIC and so IO-APIC only accepts 8-bit APIC ID.
+ * Cpu's APIC ID is read from ACPI MADT table and APIC IDs
+ * in the MADT table on Hyper-v are sorted monotonic increasingly.
+ * APIC ID reflects cpu topology. There maybe some APIC ID
+ * gaps when cpu number in a socket is not power of two. Prepare
+ * max cpu affinity for IOAPIC irqs. Scan cpu 0-255 and set cpu
+ * into ioapic_max_cpumask if its APIC ID is less than 256.
+ */
+ for (i = min_t(unsigned int, num_possible_cpus() - 1, 255); i >= 0; i--)
+ if (cpu_physical_id(i) < 256)
+ cpumask_set_cpu(i, &ioapic_max_cpumask);
+
+ return 0;
+}
+
+static int __init hyperv_enable_irq_remapping(void)
+{
+ return IRQ_REMAP_X2APIC_MODE;
+}
+
+static struct irq_domain *hyperv_get_irq_domain(struct irq_alloc_info *info)
+{
+ if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT)
+ return ioapic_ir_domain;
+ else
+ return NULL;
+}
+
+struct irq_remap_ops hyperv_irq_remap_ops = {
+ .prepare = hyperv_prepare_irq_remapping,
+ .enable = hyperv_enable_irq_remapping,
+ .get_irq_domain = hyperv_get_irq_domain,
+};
+
+#endif
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
new file mode 100644
index 000000000..5337ee158
--- /dev/null
+++ b/drivers/iommu/intel/Kconfig
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Intel IOMMU support
+config DMAR_TABLE
+ bool
+
+config INTEL_IOMMU
+ bool "Support for Intel IOMMU using DMA Remapping Devices"
+ depends on PCI_MSI && ACPI && (X86 || IA64)
+ select DMA_OPS
+ select IOMMU_API
+ select IOMMU_IOVA
+ select NEED_DMA_MAP_STATE
+ select DMAR_TABLE
+ select SWIOTLB
+ select IOASID
+ help
+ DMA remapping (DMAR) devices support enables independent address
+ translations for Direct Memory Access (DMA) from devices.
+ These DMA remapping devices are reported via ACPI tables
+ and include PCI device scope covered by these DMA
+ remapping devices.
+
+config INTEL_IOMMU_DEBUGFS
+ bool "Export Intel IOMMU internals in Debugfs"
+ depends on INTEL_IOMMU && IOMMU_DEBUGFS
+ help
+ !!!WARNING!!!
+
+ DO NOT ENABLE THIS OPTION UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!!!
+
+ Expose Intel IOMMU internals in Debugfs.
+
+ This option is -NOT- intended for production environments, and should
+ only be enabled for debugging Intel IOMMU.
+
+config INTEL_IOMMU_SVM
+ bool "Support for Shared Virtual Memory with Intel IOMMU"
+ depends on INTEL_IOMMU && X86_64
+ select PCI_PASID
+ select PCI_PRI
+ select MMU_NOTIFIER
+ select IOASID
+ help
+ Shared Virtual Memory (SVM) provides a facility for devices
+ to access DMA resources through process address space by
+ means of a Process Address Space ID (PASID).
+
+config INTEL_IOMMU_DEFAULT_ON
+ def_bool y
+ prompt "Enable Intel DMA Remapping Devices by default"
+ depends on INTEL_IOMMU
+ help
+ Selecting this option will enable a DMAR device at boot time if
+ one is found. If this option is not selected, DMAR support can
+ be enabled by passing intel_iommu=on to the kernel.
+
+config INTEL_IOMMU_BROKEN_GFX_WA
+ bool "Workaround broken graphics drivers (going away soon)"
+ depends on INTEL_IOMMU && BROKEN && X86
+ help
+ Current Graphics drivers tend to use physical address
+ for DMA and avoid using DMA APIs. Setting this config
+ option permits the IOMMU driver to set a unity map for
+ all the OS-visible memory. Hence the driver can continue
+ to use physical addresses for DMA, at least until this
+ option is removed in the 2.6.32 kernel.
+
+config INTEL_IOMMU_FLOPPY_WA
+ def_bool y
+ depends on INTEL_IOMMU && X86
+ help
+ Floppy disk drivers are known to bypass DMA API calls
+ thereby failing to work when IOMMU is enabled. This
+ workaround will setup a 1:1 mapping for the first
+ 16MiB to make floppy (an ISA device) work.
+
+config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
+ bool "Enable Intel IOMMU scalable mode by default"
+ depends on INTEL_IOMMU
+ help
+ Selecting this option will enable by default the scalable mode if
+ hardware presents the capability. The scalable mode is defined in
+ VT-d 3.0. The scalable mode capability could be checked by reading
+ /sys/devices/virtual/iommu/dmar*/intel-iommu/ecap. If this option
+ is not selected, scalable mode support could also be enabled by
+ passing intel_iommu=sm_on to the kernel. If not sure, please use
+ the default value.
diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile
new file mode 100644
index 000000000..fb8e1e8c8
--- /dev/null
+++ b/drivers/iommu/intel/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_DMAR_TABLE) += dmar.o
+obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o
+obj-$(CONFIG_INTEL_IOMMU) += trace.o
+obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o
+obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o
+obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o
diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c
new file mode 100644
index 000000000..efea7f02a
--- /dev/null
+++ b/drivers/iommu/intel/debugfs.c
@@ -0,0 +1,559 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright © 2018 Intel Corporation.
+ *
+ * Authors: Gayatri Kammela <gayatri.kammela@intel.com>
+ * Sohil Mehta <sohil.mehta@intel.com>
+ * Jacob Pan <jacob.jun.pan@linux.intel.com>
+ * Lu Baolu <baolu.lu@linux.intel.com>
+ */
+
+#include <linux/debugfs.h>
+#include <linux/dmar.h>
+#include <linux/intel-iommu.h>
+#include <linux/pci.h>
+
+#include <asm/irq_remapping.h>
+
+#include "pasid.h"
+
+struct tbl_walk {
+ u16 bus;
+ u16 devfn;
+ u32 pasid;
+ struct root_entry *rt_entry;
+ struct context_entry *ctx_entry;
+ struct pasid_entry *pasid_tbl_entry;
+};
+
+struct iommu_regset {
+ int offset;
+ const char *regs;
+};
+
+#define IOMMU_REGSET_ENTRY(_reg_) \
+ { DMAR_##_reg_##_REG, __stringify(_reg_) }
+
+static const struct iommu_regset iommu_regs_32[] = {
+ IOMMU_REGSET_ENTRY(VER),
+ IOMMU_REGSET_ENTRY(GCMD),
+ IOMMU_REGSET_ENTRY(GSTS),
+ IOMMU_REGSET_ENTRY(FSTS),
+ IOMMU_REGSET_ENTRY(FECTL),
+ IOMMU_REGSET_ENTRY(FEDATA),
+ IOMMU_REGSET_ENTRY(FEADDR),
+ IOMMU_REGSET_ENTRY(FEUADDR),
+ IOMMU_REGSET_ENTRY(PMEN),
+ IOMMU_REGSET_ENTRY(PLMBASE),
+ IOMMU_REGSET_ENTRY(PLMLIMIT),
+ IOMMU_REGSET_ENTRY(ICS),
+ IOMMU_REGSET_ENTRY(PRS),
+ IOMMU_REGSET_ENTRY(PECTL),
+ IOMMU_REGSET_ENTRY(PEDATA),
+ IOMMU_REGSET_ENTRY(PEADDR),
+ IOMMU_REGSET_ENTRY(PEUADDR),
+};
+
+static const struct iommu_regset iommu_regs_64[] = {
+ IOMMU_REGSET_ENTRY(CAP),
+ IOMMU_REGSET_ENTRY(ECAP),
+ IOMMU_REGSET_ENTRY(RTADDR),
+ IOMMU_REGSET_ENTRY(CCMD),
+ IOMMU_REGSET_ENTRY(AFLOG),
+ IOMMU_REGSET_ENTRY(PHMBASE),
+ IOMMU_REGSET_ENTRY(PHMLIMIT),
+ IOMMU_REGSET_ENTRY(IQH),
+ IOMMU_REGSET_ENTRY(IQT),
+ IOMMU_REGSET_ENTRY(IQA),
+ IOMMU_REGSET_ENTRY(IRTA),
+ IOMMU_REGSET_ENTRY(PQH),
+ IOMMU_REGSET_ENTRY(PQT),
+ IOMMU_REGSET_ENTRY(PQA),
+ IOMMU_REGSET_ENTRY(MTRRCAP),
+ IOMMU_REGSET_ENTRY(MTRRDEF),
+ IOMMU_REGSET_ENTRY(MTRR_FIX64K_00000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX16K_80000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX16K_A0000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX4K_C0000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX4K_C8000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX4K_D0000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX4K_D8000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX4K_E0000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX4K_E8000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX4K_F0000),
+ IOMMU_REGSET_ENTRY(MTRR_FIX4K_F8000),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE0),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK0),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE1),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK1),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE2),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK2),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE3),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK3),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE4),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK4),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE5),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK5),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE6),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK6),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE7),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK7),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE8),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK8),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSBASE9),
+ IOMMU_REGSET_ENTRY(MTRR_PHYSMASK9),
+ IOMMU_REGSET_ENTRY(VCCAP),
+ IOMMU_REGSET_ENTRY(VCMD),
+ IOMMU_REGSET_ENTRY(VCRSP),
+};
+
+static int iommu_regset_show(struct seq_file *m, void *unused)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ unsigned long flag;
+ int i, ret = 0;
+ u64 value;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ if (!drhd->reg_base_addr) {
+ seq_puts(m, "IOMMU: Invalid base address\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ seq_printf(m, "IOMMU: %s Register Base Address: %llx\n",
+ iommu->name, drhd->reg_base_addr);
+ seq_puts(m, "Name\t\t\tOffset\t\tContents\n");
+ /*
+ * Publish the contents of the 64-bit hardware registers
+ * by adding the offset to the pointer (virtual address).
+ */
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ for (i = 0 ; i < ARRAY_SIZE(iommu_regs_32); i++) {
+ value = dmar_readl(iommu->reg + iommu_regs_32[i].offset);
+ seq_printf(m, "%-16s\t0x%02x\t\t0x%016llx\n",
+ iommu_regs_32[i].regs, iommu_regs_32[i].offset,
+ value);
+ }
+ for (i = 0 ; i < ARRAY_SIZE(iommu_regs_64); i++) {
+ value = dmar_readq(iommu->reg + iommu_regs_64[i].offset);
+ seq_printf(m, "%-16s\t0x%02x\t\t0x%016llx\n",
+ iommu_regs_64[i].regs, iommu_regs_64[i].offset,
+ value);
+ }
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+ seq_putc(m, '\n');
+ }
+out:
+ rcu_read_unlock();
+
+ return ret;
+}
+DEFINE_SHOW_ATTRIBUTE(iommu_regset);
+
+static inline void print_tbl_walk(struct seq_file *m)
+{
+ struct tbl_walk *tbl_wlk = m->private;
+
+ seq_printf(m, "%02x:%02x.%x\t0x%016llx:0x%016llx\t0x%016llx:0x%016llx\t",
+ tbl_wlk->bus, PCI_SLOT(tbl_wlk->devfn),
+ PCI_FUNC(tbl_wlk->devfn), tbl_wlk->rt_entry->hi,
+ tbl_wlk->rt_entry->lo, tbl_wlk->ctx_entry->hi,
+ tbl_wlk->ctx_entry->lo);
+
+ /*
+ * A legacy mode DMAR doesn't support PASID, hence default it to -1
+ * indicating that it's invalid. Also, default all PASID related fields
+ * to 0.
+ */
+ if (!tbl_wlk->pasid_tbl_entry)
+ seq_printf(m, "%-6d\t0x%016llx:0x%016llx:0x%016llx\n", -1,
+ (u64)0, (u64)0, (u64)0);
+ else
+ seq_printf(m, "%-6d\t0x%016llx:0x%016llx:0x%016llx\n",
+ tbl_wlk->pasid, tbl_wlk->pasid_tbl_entry->val[2],
+ tbl_wlk->pasid_tbl_entry->val[1],
+ tbl_wlk->pasid_tbl_entry->val[0]);
+}
+
+static void pasid_tbl_walk(struct seq_file *m, struct pasid_entry *tbl_entry,
+ u16 dir_idx)
+{
+ struct tbl_walk *tbl_wlk = m->private;
+ u8 tbl_idx;
+
+ for (tbl_idx = 0; tbl_idx < PASID_TBL_ENTRIES; tbl_idx++) {
+ if (pasid_pte_is_present(tbl_entry)) {
+ tbl_wlk->pasid_tbl_entry = tbl_entry;
+ tbl_wlk->pasid = (dir_idx << PASID_PDE_SHIFT) + tbl_idx;
+ print_tbl_walk(m);
+ }
+
+ tbl_entry++;
+ }
+}
+
+static void pasid_dir_walk(struct seq_file *m, u64 pasid_dir_ptr,
+ u16 pasid_dir_size)
+{
+ struct pasid_dir_entry *dir_entry = phys_to_virt(pasid_dir_ptr);
+ struct pasid_entry *pasid_tbl;
+ u16 dir_idx;
+
+ for (dir_idx = 0; dir_idx < pasid_dir_size; dir_idx++) {
+ pasid_tbl = get_pasid_table_from_pde(dir_entry);
+ if (pasid_tbl)
+ pasid_tbl_walk(m, pasid_tbl, dir_idx);
+
+ dir_entry++;
+ }
+}
+
+static void ctx_tbl_walk(struct seq_file *m, struct intel_iommu *iommu, u16 bus)
+{
+ struct context_entry *context;
+ u16 devfn, pasid_dir_size;
+ u64 pasid_dir_ptr;
+
+ for (devfn = 0; devfn < 256; devfn++) {
+ struct tbl_walk tbl_wlk = {0};
+
+ /*
+ * Scalable mode root entry points to upper scalable mode
+ * context table and lower scalable mode context table. Each
+ * scalable mode context table has 128 context entries where as
+ * legacy mode context table has 256 context entries. So in
+ * scalable mode, the context entries for former 128 devices are
+ * in the lower scalable mode context table, while the latter
+ * 128 devices are in the upper scalable mode context table.
+ * In scalable mode, when devfn > 127, iommu_context_addr()
+ * automatically refers to upper scalable mode context table and
+ * hence the caller doesn't have to worry about differences
+ * between scalable mode and non scalable mode.
+ */
+ context = iommu_context_addr(iommu, bus, devfn, 0);
+ if (!context)
+ return;
+
+ if (!context_present(context))
+ continue;
+
+ tbl_wlk.bus = bus;
+ tbl_wlk.devfn = devfn;
+ tbl_wlk.rt_entry = &iommu->root_entry[bus];
+ tbl_wlk.ctx_entry = context;
+ m->private = &tbl_wlk;
+
+ if (dmar_readq(iommu->reg + DMAR_RTADDR_REG) & DMA_RTADDR_SMT) {
+ pasid_dir_ptr = context->lo & VTD_PAGE_MASK;
+ pasid_dir_size = get_pasid_dir_size(context);
+ pasid_dir_walk(m, pasid_dir_ptr, pasid_dir_size);
+ continue;
+ }
+
+ print_tbl_walk(m);
+ }
+}
+
+static void root_tbl_walk(struct seq_file *m, struct intel_iommu *iommu)
+{
+ unsigned long flags;
+ u16 bus;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ seq_printf(m, "IOMMU %s: Root Table Address: 0x%llx\n", iommu->name,
+ (u64)virt_to_phys(iommu->root_entry));
+ seq_puts(m, "B.D.F\tRoot_entry\t\t\t\tContext_entry\t\t\t\tPASID\tPASID_table_entry\n");
+
+ /*
+ * No need to check if the root entry is present or not because
+ * iommu_context_addr() performs the same check before returning
+ * context entry.
+ */
+ for (bus = 0; bus < 256; bus++)
+ ctx_tbl_walk(m, iommu, bus);
+
+ spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static int dmar_translation_struct_show(struct seq_file *m, void *unused)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ u32 sts;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ sts = dmar_readl(iommu->reg + DMAR_GSTS_REG);
+ if (!(sts & DMA_GSTS_TES)) {
+ seq_printf(m, "DMA Remapping is not enabled on %s\n",
+ iommu->name);
+ continue;
+ }
+ root_tbl_walk(m, iommu);
+ seq_putc(m, '\n');
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dmar_translation_struct);
+
+static inline unsigned long level_to_directory_size(int level)
+{
+ return BIT_ULL(VTD_PAGE_SHIFT + VTD_STRIDE_SHIFT * (level - 1));
+}
+
+static inline void
+dump_page_info(struct seq_file *m, unsigned long iova, u64 *path)
+{
+ seq_printf(m, "0x%013lx |\t0x%016llx\t0x%016llx\t0x%016llx\t0x%016llx\t0x%016llx\n",
+ iova >> VTD_PAGE_SHIFT, path[5], path[4],
+ path[3], path[2], path[1]);
+}
+
+static void pgtable_walk_level(struct seq_file *m, struct dma_pte *pde,
+ int level, unsigned long start,
+ u64 *path)
+{
+ int i;
+
+ if (level > 5 || level < 1)
+ return;
+
+ for (i = 0; i < BIT_ULL(VTD_STRIDE_SHIFT);
+ i++, pde++, start += level_to_directory_size(level)) {
+ if (!dma_pte_present(pde))
+ continue;
+
+ path[level] = pde->val;
+ if (dma_pte_superpage(pde) || level == 1)
+ dump_page_info(m, start, path);
+ else
+ pgtable_walk_level(m, phys_to_virt(dma_pte_addr(pde)),
+ level - 1, start, path);
+ path[level] = 0;
+ }
+}
+
+static int show_device_domain_translation(struct device *dev, void *data)
+{
+ struct dmar_domain *domain = find_domain(dev);
+ struct seq_file *m = data;
+ u64 path[6] = { 0 };
+
+ if (!domain)
+ return 0;
+
+ seq_printf(m, "Device %s with pasid %d @0x%llx\n",
+ dev_name(dev), domain->default_pasid,
+ (u64)virt_to_phys(domain->pgd));
+ seq_puts(m, "IOVA_PFN\t\tPML5E\t\t\tPML4E\t\t\tPDPE\t\t\tPDE\t\t\tPTE\n");
+
+ pgtable_walk_level(m, domain->pgd, domain->agaw + 2, 0, path);
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static int domain_translation_struct_show(struct seq_file *m, void *unused)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ ret = bus_for_each_dev(&pci_bus_type, NULL, m,
+ show_device_domain_translation);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ return ret;
+}
+DEFINE_SHOW_ATTRIBUTE(domain_translation_struct);
+
+static void invalidation_queue_entry_show(struct seq_file *m,
+ struct intel_iommu *iommu)
+{
+ int index, shift = qi_shift(iommu);
+ struct qi_desc *desc;
+ int offset;
+
+ if (ecap_smts(iommu->ecap))
+ seq_puts(m, "Index\t\tqw0\t\t\tqw1\t\t\tqw2\t\t\tqw3\t\t\tstatus\n");
+ else
+ seq_puts(m, "Index\t\tqw0\t\t\tqw1\t\t\tstatus\n");
+
+ for (index = 0; index < QI_LENGTH; index++) {
+ offset = index << shift;
+ desc = iommu->qi->desc + offset;
+ if (ecap_smts(iommu->ecap))
+ seq_printf(m, "%5d\t%016llx\t%016llx\t%016llx\t%016llx\t%016x\n",
+ index, desc->qw0, desc->qw1,
+ desc->qw2, desc->qw3,
+ iommu->qi->desc_status[index]);
+ else
+ seq_printf(m, "%5d\t%016llx\t%016llx\t%016x\n",
+ index, desc->qw0, desc->qw1,
+ iommu->qi->desc_status[index]);
+ }
+}
+
+static int invalidation_queue_show(struct seq_file *m, void *unused)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ unsigned long flags;
+ struct q_inval *qi;
+ int shift;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ qi = iommu->qi;
+ shift = qi_shift(iommu);
+
+ if (!qi || !ecap_qis(iommu->ecap))
+ continue;
+
+ seq_printf(m, "Invalidation queue on IOMMU: %s\n", iommu->name);
+
+ raw_spin_lock_irqsave(&qi->q_lock, flags);
+ seq_printf(m, " Base: 0x%llx\tHead: %lld\tTail: %lld\n",
+ (u64)virt_to_phys(qi->desc),
+ dmar_readq(iommu->reg + DMAR_IQH_REG) >> shift,
+ dmar_readq(iommu->reg + DMAR_IQT_REG) >> shift);
+ invalidation_queue_entry_show(m, iommu);
+ raw_spin_unlock_irqrestore(&qi->q_lock, flags);
+ seq_putc(m, '\n');
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(invalidation_queue);
+
+#ifdef CONFIG_IRQ_REMAP
+static void ir_tbl_remap_entry_show(struct seq_file *m,
+ struct intel_iommu *iommu)
+{
+ struct irte *ri_entry;
+ unsigned long flags;
+ int idx;
+
+ seq_puts(m, " Entry SrcID DstID Vct IRTE_high\t\tIRTE_low\n");
+
+ raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+ for (idx = 0; idx < INTR_REMAP_TABLE_ENTRIES; idx++) {
+ ri_entry = &iommu->ir_table->base[idx];
+ if (!ri_entry->present || ri_entry->p_pst)
+ continue;
+
+ seq_printf(m, " %-5d %02x:%02x.%01x %08x %02x %016llx\t%016llx\n",
+ idx, PCI_BUS_NUM(ri_entry->sid),
+ PCI_SLOT(ri_entry->sid), PCI_FUNC(ri_entry->sid),
+ ri_entry->dest_id, ri_entry->vector,
+ ri_entry->high, ri_entry->low);
+ }
+ raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+}
+
+static void ir_tbl_posted_entry_show(struct seq_file *m,
+ struct intel_iommu *iommu)
+{
+ struct irte *pi_entry;
+ unsigned long flags;
+ int idx;
+
+ seq_puts(m, " Entry SrcID PDA_high PDA_low Vct IRTE_high\t\tIRTE_low\n");
+
+ raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+ for (idx = 0; idx < INTR_REMAP_TABLE_ENTRIES; idx++) {
+ pi_entry = &iommu->ir_table->base[idx];
+ if (!pi_entry->present || !pi_entry->p_pst)
+ continue;
+
+ seq_printf(m, " %-5d %02x:%02x.%01x %08x %08x %02x %016llx\t%016llx\n",
+ idx, PCI_BUS_NUM(pi_entry->sid),
+ PCI_SLOT(pi_entry->sid), PCI_FUNC(pi_entry->sid),
+ pi_entry->pda_h, pi_entry->pda_l << 6,
+ pi_entry->vector, pi_entry->high,
+ pi_entry->low);
+ }
+ raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+}
+
+/*
+ * For active IOMMUs go through the Interrupt remapping
+ * table and print valid entries in a table format for
+ * Remapped and Posted Interrupts.
+ */
+static int ir_translation_struct_show(struct seq_file *m, void *unused)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ u64 irta;
+ u32 sts;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ if (!ecap_ir_support(iommu->ecap))
+ continue;
+
+ seq_printf(m, "Remapped Interrupt supported on IOMMU: %s\n",
+ iommu->name);
+
+ sts = dmar_readl(iommu->reg + DMAR_GSTS_REG);
+ if (iommu->ir_table && (sts & DMA_GSTS_IRES)) {
+ irta = virt_to_phys(iommu->ir_table->base);
+ seq_printf(m, " IR table address:%llx\n", irta);
+ ir_tbl_remap_entry_show(m, iommu);
+ } else {
+ seq_puts(m, "Interrupt Remapping is not enabled\n");
+ }
+ seq_putc(m, '\n');
+ }
+
+ seq_puts(m, "****\n\n");
+
+ for_each_active_iommu(iommu, drhd) {
+ if (!cap_pi_support(iommu->cap))
+ continue;
+
+ seq_printf(m, "Posted Interrupt supported on IOMMU: %s\n",
+ iommu->name);
+
+ if (iommu->ir_table) {
+ irta = virt_to_phys(iommu->ir_table->base);
+ seq_printf(m, " IR table address:%llx\n", irta);
+ ir_tbl_posted_entry_show(m, iommu);
+ } else {
+ seq_puts(m, "Interrupt Remapping is not enabled\n");
+ }
+ seq_putc(m, '\n');
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(ir_translation_struct);
+#endif
+
+void __init intel_iommu_debugfs_init(void)
+{
+ struct dentry *intel_iommu_debug = debugfs_create_dir("intel",
+ iommu_debugfs_dir);
+
+ debugfs_create_file("iommu_regset", 0444, intel_iommu_debug, NULL,
+ &iommu_regset_fops);
+ debugfs_create_file("dmar_translation_struct", 0444, intel_iommu_debug,
+ NULL, &dmar_translation_struct_fops);
+ debugfs_create_file("domain_translation_struct", 0444,
+ intel_iommu_debug, NULL,
+ &domain_translation_struct_fops);
+ debugfs_create_file("invalidation_queue", 0444, intel_iommu_debug,
+ NULL, &invalidation_queue_fops);
+#ifdef CONFIG_IRQ_REMAP
+ debugfs_create_file("ir_translation_struct", 0444, intel_iommu_debug,
+ NULL, &ir_translation_struct_fops);
+#endif
+}
diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
new file mode 100644
index 000000000..a27765a7f
--- /dev/null
+++ b/drivers/iommu/intel/dmar.c
@@ -0,0 +1,2324 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * Copyright (C) 2006-2008 Intel Corporation
+ * Author: Ashok Raj <ashok.raj@intel.com>
+ * Author: Shaohua Li <shaohua.li@intel.com>
+ * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ *
+ * This file implements early detection/parsing of Remapping Devices
+ * reported to OS through BIOS via DMA remapping reporting (DMAR) ACPI
+ * tables.
+ *
+ * These routines are used by both DMA-remapping and Interrupt-remapping
+ */
+
+#define pr_fmt(fmt) "DMAR: " fmt
+
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/iova.h>
+#include <linux/intel-iommu.h>
+#include <linux/timer.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/tboot.h>
+#include <linux/dmi.h>
+#include <linux/slab.h>
+#include <linux/iommu.h>
+#include <linux/numa.h>
+#include <linux/limits.h>
+#include <asm/irq_remapping.h>
+#include <asm/iommu_table.h>
+
+#include "../irq_remapping.h"
+
+typedef int (*dmar_res_handler_t)(struct acpi_dmar_header *, void *);
+struct dmar_res_callback {
+ dmar_res_handler_t cb[ACPI_DMAR_TYPE_RESERVED];
+ void *arg[ACPI_DMAR_TYPE_RESERVED];
+ bool ignore_unhandled;
+ bool print_entry;
+};
+
+/*
+ * Assumptions:
+ * 1) The hotplug framework guarentees that DMAR unit will be hot-added
+ * before IO devices managed by that unit.
+ * 2) The hotplug framework guarantees that DMAR unit will be hot-removed
+ * after IO devices managed by that unit.
+ * 3) Hotplug events are rare.
+ *
+ * Locking rules for DMA and interrupt remapping related global data structures:
+ * 1) Use dmar_global_lock in process context
+ * 2) Use RCU in interrupt context
+ */
+DECLARE_RWSEM(dmar_global_lock);
+LIST_HEAD(dmar_drhd_units);
+
+struct acpi_table_header * __initdata dmar_tbl;
+static int dmar_dev_scope_status = 1;
+static unsigned long dmar_seq_ids[BITS_TO_LONGS(DMAR_UNITS_SUPPORTED)];
+
+static int alloc_iommu(struct dmar_drhd_unit *drhd);
+static void free_iommu(struct intel_iommu *iommu);
+
+extern const struct iommu_ops intel_iommu_ops;
+
+static void dmar_register_drhd_unit(struct dmar_drhd_unit *drhd)
+{
+ /*
+ * add INCLUDE_ALL at the tail, so scan the list will find it at
+ * the very end.
+ */
+ if (drhd->include_all)
+ list_add_tail_rcu(&drhd->list, &dmar_drhd_units);
+ else
+ list_add_rcu(&drhd->list, &dmar_drhd_units);
+}
+
+void *dmar_alloc_dev_scope(void *start, void *end, int *cnt)
+{
+ struct acpi_dmar_device_scope *scope;
+
+ *cnt = 0;
+ while (start < end) {
+ scope = start;
+ if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_NAMESPACE ||
+ scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT ||
+ scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE)
+ (*cnt)++;
+ else if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_IOAPIC &&
+ scope->entry_type != ACPI_DMAR_SCOPE_TYPE_HPET) {
+ pr_warn("Unsupported device scope\n");
+ }
+ start += scope->length;
+ }
+ if (*cnt == 0)
+ return NULL;
+
+ return kcalloc(*cnt, sizeof(struct dmar_dev_scope), GFP_KERNEL);
+}
+
+void dmar_free_dev_scope(struct dmar_dev_scope **devices, int *cnt)
+{
+ int i;
+ struct device *tmp_dev;
+
+ if (*devices && *cnt) {
+ for_each_active_dev_scope(*devices, *cnt, i, tmp_dev)
+ put_device(tmp_dev);
+ kfree(*devices);
+ }
+
+ *devices = NULL;
+ *cnt = 0;
+}
+
+/* Optimize out kzalloc()/kfree() for normal cases */
+static char dmar_pci_notify_info_buf[64];
+
+static struct dmar_pci_notify_info *
+dmar_alloc_pci_notify_info(struct pci_dev *dev, unsigned long event)
+{
+ int level = 0;
+ size_t size;
+ struct pci_dev *tmp;
+ struct dmar_pci_notify_info *info;
+
+ BUG_ON(dev->is_virtfn);
+
+ /*
+ * Ignore devices that have a domain number higher than what can
+ * be looked up in DMAR, e.g. VMD subdevices with domain 0x10000
+ */
+ if (pci_domain_nr(dev->bus) > U16_MAX)
+ return NULL;
+
+ /* Only generate path[] for device addition event */
+ if (event == BUS_NOTIFY_ADD_DEVICE)
+ for (tmp = dev; tmp; tmp = tmp->bus->self)
+ level++;
+
+ size = struct_size(info, path, level);
+ if (size <= sizeof(dmar_pci_notify_info_buf)) {
+ info = (struct dmar_pci_notify_info *)dmar_pci_notify_info_buf;
+ } else {
+ info = kzalloc(size, GFP_KERNEL);
+ if (!info) {
+ pr_warn("Out of memory when allocating notify_info "
+ "for %s.\n", pci_name(dev));
+ if (dmar_dev_scope_status == 0)
+ dmar_dev_scope_status = -ENOMEM;
+ return NULL;
+ }
+ }
+
+ info->event = event;
+ info->dev = dev;
+ info->seg = pci_domain_nr(dev->bus);
+ info->level = level;
+ if (event == BUS_NOTIFY_ADD_DEVICE) {
+ for (tmp = dev; tmp; tmp = tmp->bus->self) {
+ level--;
+ info->path[level].bus = tmp->bus->number;
+ info->path[level].device = PCI_SLOT(tmp->devfn);
+ info->path[level].function = PCI_FUNC(tmp->devfn);
+ if (pci_is_root_bus(tmp->bus))
+ info->bus = tmp->bus->number;
+ }
+ }
+
+ return info;
+}
+
+static inline void dmar_free_pci_notify_info(struct dmar_pci_notify_info *info)
+{
+ if ((void *)info != dmar_pci_notify_info_buf)
+ kfree(info);
+}
+
+static bool dmar_match_pci_path(struct dmar_pci_notify_info *info, int bus,
+ struct acpi_dmar_pci_path *path, int count)
+{
+ int i;
+
+ if (info->bus != bus)
+ goto fallback;
+ if (info->level != count)
+ goto fallback;
+
+ for (i = 0; i < count; i++) {
+ if (path[i].device != info->path[i].device ||
+ path[i].function != info->path[i].function)
+ goto fallback;
+ }
+
+ return true;
+
+fallback:
+
+ if (count != 1)
+ return false;
+
+ i = info->level - 1;
+ if (bus == info->path[i].bus &&
+ path[0].device == info->path[i].device &&
+ path[0].function == info->path[i].function) {
+ pr_info(FW_BUG "RMRR entry for device %02x:%02x.%x is broken - applying workaround\n",
+ bus, path[0].device, path[0].function);
+ return true;
+ }
+
+ return false;
+}
+
+/* Return: > 0 if match found, 0 if no match found, < 0 if error happens */
+int dmar_insert_dev_scope(struct dmar_pci_notify_info *info,
+ void *start, void*end, u16 segment,
+ struct dmar_dev_scope *devices,
+ int devices_cnt)
+{
+ int i, level;
+ struct device *tmp, *dev = &info->dev->dev;
+ struct acpi_dmar_device_scope *scope;
+ struct acpi_dmar_pci_path *path;
+
+ if (segment != info->seg)
+ return 0;
+
+ for (; start < end; start += scope->length) {
+ scope = start;
+ if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_ENDPOINT &&
+ scope->entry_type != ACPI_DMAR_SCOPE_TYPE_BRIDGE)
+ continue;
+
+ path = (struct acpi_dmar_pci_path *)(scope + 1);
+ level = (scope->length - sizeof(*scope)) / sizeof(*path);
+ if (!dmar_match_pci_path(info, scope->bus, path, level))
+ continue;
+
+ /*
+ * We expect devices with endpoint scope to have normal PCI
+ * headers, and devices with bridge scope to have bridge PCI
+ * headers. However PCI NTB devices may be listed in the
+ * DMAR table with bridge scope, even though they have a
+ * normal PCI header. NTB devices are identified by class
+ * "BRIDGE_OTHER" (0680h) - we don't declare a socpe mismatch
+ * for this special case.
+ */
+ if ((scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT &&
+ info->dev->hdr_type != PCI_HEADER_TYPE_NORMAL) ||
+ (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE &&
+ (info->dev->hdr_type == PCI_HEADER_TYPE_NORMAL &&
+ info->dev->class >> 16 != PCI_BASE_CLASS_BRIDGE))) {
+ pr_warn("Device scope type does not match for %s\n",
+ pci_name(info->dev));
+ return -EINVAL;
+ }
+
+ for_each_dev_scope(devices, devices_cnt, i, tmp)
+ if (tmp == NULL) {
+ devices[i].bus = info->dev->bus->number;
+ devices[i].devfn = info->dev->devfn;
+ rcu_assign_pointer(devices[i].dev,
+ get_device(dev));
+ return 1;
+ }
+ BUG_ON(i >= devices_cnt);
+ }
+
+ return 0;
+}
+
+int dmar_remove_dev_scope(struct dmar_pci_notify_info *info, u16 segment,
+ struct dmar_dev_scope *devices, int count)
+{
+ int index;
+ struct device *tmp;
+
+ if (info->seg != segment)
+ return 0;
+
+ for_each_active_dev_scope(devices, count, index, tmp)
+ if (tmp == &info->dev->dev) {
+ RCU_INIT_POINTER(devices[index].dev, NULL);
+ synchronize_rcu();
+ put_device(tmp);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int dmar_pci_bus_add_dev(struct dmar_pci_notify_info *info)
+{
+ int ret = 0;
+ struct dmar_drhd_unit *dmaru;
+ struct acpi_dmar_hardware_unit *drhd;
+
+ for_each_drhd_unit(dmaru) {
+ if (dmaru->include_all)
+ continue;
+
+ drhd = container_of(dmaru->hdr,
+ struct acpi_dmar_hardware_unit, header);
+ ret = dmar_insert_dev_scope(info, (void *)(drhd + 1),
+ ((void *)drhd) + drhd->header.length,
+ dmaru->segment,
+ dmaru->devices, dmaru->devices_cnt);
+ if (ret)
+ break;
+ }
+ if (ret >= 0)
+ ret = dmar_iommu_notify_scope_dev(info);
+ if (ret < 0 && dmar_dev_scope_status == 0)
+ dmar_dev_scope_status = ret;
+
+ if (ret >= 0)
+ intel_irq_remap_add_device(info);
+
+ return ret;
+}
+
+static void dmar_pci_bus_del_dev(struct dmar_pci_notify_info *info)
+{
+ struct dmar_drhd_unit *dmaru;
+
+ for_each_drhd_unit(dmaru)
+ if (dmar_remove_dev_scope(info, dmaru->segment,
+ dmaru->devices, dmaru->devices_cnt))
+ break;
+ dmar_iommu_notify_scope_dev(info);
+}
+
+static inline void vf_inherit_msi_domain(struct pci_dev *pdev)
+{
+ struct pci_dev *physfn = pci_physfn(pdev);
+
+ dev_set_msi_domain(&pdev->dev, dev_get_msi_domain(&physfn->dev));
+}
+
+static int dmar_pci_bus_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct pci_dev *pdev = to_pci_dev(data);
+ struct dmar_pci_notify_info *info;
+
+ /* Only care about add/remove events for physical functions.
+ * For VFs we actually do the lookup based on the corresponding
+ * PF in device_to_iommu() anyway. */
+ if (pdev->is_virtfn) {
+ /*
+ * Ensure that the VF device inherits the irq domain of the
+ * PF device. Ideally the device would inherit the domain
+ * from the bus, but DMAR can have multiple units per bus
+ * which makes this impossible. The VF 'bus' could inherit
+ * from the PF device, but that's yet another x86'sism to
+ * inflict on everybody else.
+ */
+ if (action == BUS_NOTIFY_ADD_DEVICE)
+ vf_inherit_msi_domain(pdev);
+ return NOTIFY_DONE;
+ }
+
+ if (action != BUS_NOTIFY_ADD_DEVICE &&
+ action != BUS_NOTIFY_REMOVED_DEVICE)
+ return NOTIFY_DONE;
+
+ info = dmar_alloc_pci_notify_info(pdev, action);
+ if (!info)
+ return NOTIFY_DONE;
+
+ down_write(&dmar_global_lock);
+ if (action == BUS_NOTIFY_ADD_DEVICE)
+ dmar_pci_bus_add_dev(info);
+ else if (action == BUS_NOTIFY_REMOVED_DEVICE)
+ dmar_pci_bus_del_dev(info);
+ up_write(&dmar_global_lock);
+
+ dmar_free_pci_notify_info(info);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block dmar_pci_bus_nb = {
+ .notifier_call = dmar_pci_bus_notifier,
+ .priority = 1,
+};
+
+static struct dmar_drhd_unit *
+dmar_find_dmaru(struct acpi_dmar_hardware_unit *drhd)
+{
+ struct dmar_drhd_unit *dmaru;
+
+ list_for_each_entry_rcu(dmaru, &dmar_drhd_units, list,
+ dmar_rcu_check())
+ if (dmaru->segment == drhd->segment &&
+ dmaru->reg_base_addr == drhd->address)
+ return dmaru;
+
+ return NULL;
+}
+
+/*
+ * dmar_parse_one_drhd - parses exactly one DMA remapping hardware definition
+ * structure which uniquely represent one DMA remapping hardware unit
+ * present in the platform
+ */
+static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg)
+{
+ struct acpi_dmar_hardware_unit *drhd;
+ struct dmar_drhd_unit *dmaru;
+ int ret;
+
+ drhd = (struct acpi_dmar_hardware_unit *)header;
+ dmaru = dmar_find_dmaru(drhd);
+ if (dmaru)
+ goto out;
+
+ dmaru = kzalloc(sizeof(*dmaru) + header->length, GFP_KERNEL);
+ if (!dmaru)
+ return -ENOMEM;
+
+ /*
+ * If header is allocated from slab by ACPI _DSM method, we need to
+ * copy the content because the memory buffer will be freed on return.
+ */
+ dmaru->hdr = (void *)(dmaru + 1);
+ memcpy(dmaru->hdr, header, header->length);
+ dmaru->reg_base_addr = drhd->address;
+ dmaru->segment = drhd->segment;
+ dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */
+ dmaru->devices = dmar_alloc_dev_scope((void *)(drhd + 1),
+ ((void *)drhd) + drhd->header.length,
+ &dmaru->devices_cnt);
+ if (dmaru->devices_cnt && dmaru->devices == NULL) {
+ kfree(dmaru);
+ return -ENOMEM;
+ }
+
+ ret = alloc_iommu(dmaru);
+ if (ret) {
+ dmar_free_dev_scope(&dmaru->devices,
+ &dmaru->devices_cnt);
+ kfree(dmaru);
+ return ret;
+ }
+ dmar_register_drhd_unit(dmaru);
+
+out:
+ if (arg)
+ (*(int *)arg)++;
+
+ return 0;
+}
+
+static void dmar_free_drhd(struct dmar_drhd_unit *dmaru)
+{
+ if (dmaru->devices && dmaru->devices_cnt)
+ dmar_free_dev_scope(&dmaru->devices, &dmaru->devices_cnt);
+ if (dmaru->iommu)
+ free_iommu(dmaru->iommu);
+ kfree(dmaru);
+}
+
+static int __init dmar_parse_one_andd(struct acpi_dmar_header *header,
+ void *arg)
+{
+ struct acpi_dmar_andd *andd = (void *)header;
+
+ /* Check for NUL termination within the designated length */
+ if (strnlen(andd->device_name, header->length - 8) == header->length - 8) {
+ pr_warn(FW_BUG
+ "Your BIOS is broken; ANDD object name is not NUL-terminated\n"
+ "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+ dmi_get_system_info(DMI_BIOS_VENDOR),
+ dmi_get_system_info(DMI_BIOS_VERSION),
+ dmi_get_system_info(DMI_PRODUCT_VERSION));
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+ return -EINVAL;
+ }
+ pr_info("ANDD device: %x name: %s\n", andd->device_number,
+ andd->device_name);
+
+ return 0;
+}
+
+#ifdef CONFIG_ACPI_NUMA
+static int dmar_parse_one_rhsa(struct acpi_dmar_header *header, void *arg)
+{
+ struct acpi_dmar_rhsa *rhsa;
+ struct dmar_drhd_unit *drhd;
+
+ rhsa = (struct acpi_dmar_rhsa *)header;
+ for_each_drhd_unit(drhd) {
+ if (drhd->reg_base_addr == rhsa->base_address) {
+ int node = pxm_to_node(rhsa->proximity_domain);
+
+ if (node != NUMA_NO_NODE && !node_online(node))
+ node = NUMA_NO_NODE;
+ drhd->iommu->node = node;
+ return 0;
+ }
+ }
+ pr_warn(FW_BUG
+ "Your BIOS is broken; RHSA refers to non-existent DMAR unit at %llx\n"
+ "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+ rhsa->base_address,
+ dmi_get_system_info(DMI_BIOS_VENDOR),
+ dmi_get_system_info(DMI_BIOS_VERSION),
+ dmi_get_system_info(DMI_PRODUCT_VERSION));
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+
+ return 0;
+}
+#else
+#define dmar_parse_one_rhsa dmar_res_noop
+#endif
+
+static void
+dmar_table_print_dmar_entry(struct acpi_dmar_header *header)
+{
+ struct acpi_dmar_hardware_unit *drhd;
+ struct acpi_dmar_reserved_memory *rmrr;
+ struct acpi_dmar_atsr *atsr;
+ struct acpi_dmar_rhsa *rhsa;
+
+ switch (header->type) {
+ case ACPI_DMAR_TYPE_HARDWARE_UNIT:
+ drhd = container_of(header, struct acpi_dmar_hardware_unit,
+ header);
+ pr_info("DRHD base: %#016Lx flags: %#x\n",
+ (unsigned long long)drhd->address, drhd->flags);
+ break;
+ case ACPI_DMAR_TYPE_RESERVED_MEMORY:
+ rmrr = container_of(header, struct acpi_dmar_reserved_memory,
+ header);
+ pr_info("RMRR base: %#016Lx end: %#016Lx\n",
+ (unsigned long long)rmrr->base_address,
+ (unsigned long long)rmrr->end_address);
+ break;
+ case ACPI_DMAR_TYPE_ROOT_ATS:
+ atsr = container_of(header, struct acpi_dmar_atsr, header);
+ pr_info("ATSR flags: %#x\n", atsr->flags);
+ break;
+ case ACPI_DMAR_TYPE_HARDWARE_AFFINITY:
+ rhsa = container_of(header, struct acpi_dmar_rhsa, header);
+ pr_info("RHSA base: %#016Lx proximity domain: %#x\n",
+ (unsigned long long)rhsa->base_address,
+ rhsa->proximity_domain);
+ break;
+ case ACPI_DMAR_TYPE_NAMESPACE:
+ /* We don't print this here because we need to sanity-check
+ it first. So print it in dmar_parse_one_andd() instead. */
+ break;
+ }
+}
+
+/**
+ * dmar_table_detect - checks to see if the platform supports DMAR devices
+ */
+static int __init dmar_table_detect(void)
+{
+ acpi_status status = AE_OK;
+
+ /* if we could find DMAR table, then there are DMAR devices */
+ status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
+
+ if (ACPI_SUCCESS(status) && !dmar_tbl) {
+ pr_warn("Unable to map DMAR\n");
+ status = AE_NOT_FOUND;
+ }
+
+ return ACPI_SUCCESS(status) ? 0 : -ENOENT;
+}
+
+static int dmar_walk_remapping_entries(struct acpi_dmar_header *start,
+ size_t len, struct dmar_res_callback *cb)
+{
+ struct acpi_dmar_header *iter, *next;
+ struct acpi_dmar_header *end = ((void *)start) + len;
+
+ for (iter = start; iter < end; iter = next) {
+ next = (void *)iter + iter->length;
+ if (iter->length == 0) {
+ /* Avoid looping forever on bad ACPI tables */
+ pr_debug(FW_BUG "Invalid 0-length structure\n");
+ break;
+ } else if (next > end) {
+ /* Avoid passing table end */
+ pr_warn(FW_BUG "Record passes table end\n");
+ return -EINVAL;
+ }
+
+ if (cb->print_entry)
+ dmar_table_print_dmar_entry(iter);
+
+ if (iter->type >= ACPI_DMAR_TYPE_RESERVED) {
+ /* continue for forward compatibility */
+ pr_debug("Unknown DMAR structure type %d\n",
+ iter->type);
+ } else if (cb->cb[iter->type]) {
+ int ret;
+
+ ret = cb->cb[iter->type](iter, cb->arg[iter->type]);
+ if (ret)
+ return ret;
+ } else if (!cb->ignore_unhandled) {
+ pr_warn("No handler for DMAR structure type %d\n",
+ iter->type);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static inline int dmar_walk_dmar_table(struct acpi_table_dmar *dmar,
+ struct dmar_res_callback *cb)
+{
+ return dmar_walk_remapping_entries((void *)(dmar + 1),
+ dmar->header.length - sizeof(*dmar), cb);
+}
+
+/**
+ * parse_dmar_table - parses the DMA reporting table
+ */
+static int __init
+parse_dmar_table(void)
+{
+ struct acpi_table_dmar *dmar;
+ int drhd_count = 0;
+ int ret;
+ struct dmar_res_callback cb = {
+ .print_entry = true,
+ .ignore_unhandled = true,
+ .arg[ACPI_DMAR_TYPE_HARDWARE_UNIT] = &drhd_count,
+ .cb[ACPI_DMAR_TYPE_HARDWARE_UNIT] = &dmar_parse_one_drhd,
+ .cb[ACPI_DMAR_TYPE_RESERVED_MEMORY] = &dmar_parse_one_rmrr,
+ .cb[ACPI_DMAR_TYPE_ROOT_ATS] = &dmar_parse_one_atsr,
+ .cb[ACPI_DMAR_TYPE_HARDWARE_AFFINITY] = &dmar_parse_one_rhsa,
+ .cb[ACPI_DMAR_TYPE_NAMESPACE] = &dmar_parse_one_andd,
+ };
+
+ /*
+ * Do it again, earlier dmar_tbl mapping could be mapped with
+ * fixed map.
+ */
+ dmar_table_detect();
+
+ /*
+ * ACPI tables may not be DMA protected by tboot, so use DMAR copy
+ * SINIT saved in SinitMleData in TXT heap (which is DMA protected)
+ */
+ dmar_tbl = tboot_get_dmar_table(dmar_tbl);
+
+ dmar = (struct acpi_table_dmar *)dmar_tbl;
+ if (!dmar)
+ return -ENODEV;
+
+ if (dmar->width < PAGE_SHIFT - 1) {
+ pr_warn("Invalid DMAR haw\n");
+ return -EINVAL;
+ }
+
+ pr_info("Host address width %d\n", dmar->width + 1);
+ ret = dmar_walk_dmar_table(dmar, &cb);
+ if (ret == 0 && drhd_count == 0)
+ pr_warn(FW_BUG "No DRHD structure found in DMAR table\n");
+
+ return ret;
+}
+
+static int dmar_pci_device_match(struct dmar_dev_scope devices[],
+ int cnt, struct pci_dev *dev)
+{
+ int index;
+ struct device *tmp;
+
+ while (dev) {
+ for_each_active_dev_scope(devices, cnt, index, tmp)
+ if (dev_is_pci(tmp) && dev == to_pci_dev(tmp))
+ return 1;
+
+ /* Check our parent */
+ dev = dev->bus->self;
+ }
+
+ return 0;
+}
+
+struct dmar_drhd_unit *
+dmar_find_matched_drhd_unit(struct pci_dev *dev)
+{
+ struct dmar_drhd_unit *dmaru;
+ struct acpi_dmar_hardware_unit *drhd;
+
+ dev = pci_physfn(dev);
+
+ rcu_read_lock();
+ for_each_drhd_unit(dmaru) {
+ drhd = container_of(dmaru->hdr,
+ struct acpi_dmar_hardware_unit,
+ header);
+
+ if (dmaru->include_all &&
+ drhd->segment == pci_domain_nr(dev->bus))
+ goto out;
+
+ if (dmar_pci_device_match(dmaru->devices,
+ dmaru->devices_cnt, dev))
+ goto out;
+ }
+ dmaru = NULL;
+out:
+ rcu_read_unlock();
+
+ return dmaru;
+}
+
+static void __init dmar_acpi_insert_dev_scope(u8 device_number,
+ struct acpi_device *adev)
+{
+ struct dmar_drhd_unit *dmaru;
+ struct acpi_dmar_hardware_unit *drhd;
+ struct acpi_dmar_device_scope *scope;
+ struct device *tmp;
+ int i;
+ struct acpi_dmar_pci_path *path;
+
+ for_each_drhd_unit(dmaru) {
+ drhd = container_of(dmaru->hdr,
+ struct acpi_dmar_hardware_unit,
+ header);
+
+ for (scope = (void *)(drhd + 1);
+ (unsigned long)scope < ((unsigned long)drhd) + drhd->header.length;
+ scope = ((void *)scope) + scope->length) {
+ if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_NAMESPACE)
+ continue;
+ if (scope->enumeration_id != device_number)
+ continue;
+
+ path = (void *)(scope + 1);
+ pr_info("ACPI device \"%s\" under DMAR at %llx as %02x:%02x.%d\n",
+ dev_name(&adev->dev), dmaru->reg_base_addr,
+ scope->bus, path->device, path->function);
+ for_each_dev_scope(dmaru->devices, dmaru->devices_cnt, i, tmp)
+ if (tmp == NULL) {
+ dmaru->devices[i].bus = scope->bus;
+ dmaru->devices[i].devfn = PCI_DEVFN(path->device,
+ path->function);
+ rcu_assign_pointer(dmaru->devices[i].dev,
+ get_device(&adev->dev));
+ return;
+ }
+ BUG_ON(i >= dmaru->devices_cnt);
+ }
+ }
+ pr_warn("No IOMMU scope found for ANDD enumeration ID %d (%s)\n",
+ device_number, dev_name(&adev->dev));
+}
+
+static int __init dmar_acpi_dev_scope_init(void)
+{
+ struct acpi_dmar_andd *andd;
+
+ if (dmar_tbl == NULL)
+ return -ENODEV;
+
+ for (andd = (void *)dmar_tbl + sizeof(struct acpi_table_dmar);
+ ((unsigned long)andd) < ((unsigned long)dmar_tbl) + dmar_tbl->length;
+ andd = ((void *)andd) + andd->header.length) {
+ if (andd->header.type == ACPI_DMAR_TYPE_NAMESPACE) {
+ acpi_handle h;
+ struct acpi_device *adev;
+
+ if (!ACPI_SUCCESS(acpi_get_handle(ACPI_ROOT_OBJECT,
+ andd->device_name,
+ &h))) {
+ pr_err("Failed to find handle for ACPI object %s\n",
+ andd->device_name);
+ continue;
+ }
+ if (acpi_bus_get_device(h, &adev)) {
+ pr_err("Failed to get device for ACPI object %s\n",
+ andd->device_name);
+ continue;
+ }
+ dmar_acpi_insert_dev_scope(andd->device_number, adev);
+ }
+ }
+ return 0;
+}
+
+int __init dmar_dev_scope_init(void)
+{
+ struct pci_dev *dev = NULL;
+ struct dmar_pci_notify_info *info;
+
+ if (dmar_dev_scope_status != 1)
+ return dmar_dev_scope_status;
+
+ if (list_empty(&dmar_drhd_units)) {
+ dmar_dev_scope_status = -ENODEV;
+ } else {
+ dmar_dev_scope_status = 0;
+
+ dmar_acpi_dev_scope_init();
+
+ for_each_pci_dev(dev) {
+ if (dev->is_virtfn)
+ continue;
+
+ info = dmar_alloc_pci_notify_info(dev,
+ BUS_NOTIFY_ADD_DEVICE);
+ if (!info) {
+ pci_dev_put(dev);
+ return dmar_dev_scope_status;
+ } else {
+ dmar_pci_bus_add_dev(info);
+ dmar_free_pci_notify_info(info);
+ }
+ }
+ }
+
+ return dmar_dev_scope_status;
+}
+
+void __init dmar_register_bus_notifier(void)
+{
+ bus_register_notifier(&pci_bus_type, &dmar_pci_bus_nb);
+}
+
+
+int __init dmar_table_init(void)
+{
+ static int dmar_table_initialized;
+ int ret;
+
+ if (dmar_table_initialized == 0) {
+ ret = parse_dmar_table();
+ if (ret < 0) {
+ if (ret != -ENODEV)
+ pr_info("Parse DMAR table failure.\n");
+ } else if (list_empty(&dmar_drhd_units)) {
+ pr_info("No DMAR devices found\n");
+ ret = -ENODEV;
+ }
+
+ if (ret < 0)
+ dmar_table_initialized = ret;
+ else
+ dmar_table_initialized = 1;
+ }
+
+ return dmar_table_initialized < 0 ? dmar_table_initialized : 0;
+}
+
+static void warn_invalid_dmar(u64 addr, const char *message)
+{
+ pr_warn_once(FW_BUG
+ "Your BIOS is broken; DMAR reported at address %llx%s!\n"
+ "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+ addr, message,
+ dmi_get_system_info(DMI_BIOS_VENDOR),
+ dmi_get_system_info(DMI_BIOS_VERSION),
+ dmi_get_system_info(DMI_PRODUCT_VERSION));
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+}
+
+static int __ref
+dmar_validate_one_drhd(struct acpi_dmar_header *entry, void *arg)
+{
+ struct acpi_dmar_hardware_unit *drhd;
+ void __iomem *addr;
+ u64 cap, ecap;
+
+ drhd = (void *)entry;
+ if (!drhd->address) {
+ warn_invalid_dmar(0, "");
+ return -EINVAL;
+ }
+
+ if (arg)
+ addr = ioremap(drhd->address, VTD_PAGE_SIZE);
+ else
+ addr = early_ioremap(drhd->address, VTD_PAGE_SIZE);
+ if (!addr) {
+ pr_warn("Can't validate DRHD address: %llx\n", drhd->address);
+ return -EINVAL;
+ }
+
+ cap = dmar_readq(addr + DMAR_CAP_REG);
+ ecap = dmar_readq(addr + DMAR_ECAP_REG);
+
+ if (arg)
+ iounmap(addr);
+ else
+ early_iounmap(addr, VTD_PAGE_SIZE);
+
+ if (cap == (uint64_t)-1 && ecap == (uint64_t)-1) {
+ warn_invalid_dmar(drhd->address, " returns all ones");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int __init detect_intel_iommu(void)
+{
+ int ret;
+ struct dmar_res_callback validate_drhd_cb = {
+ .cb[ACPI_DMAR_TYPE_HARDWARE_UNIT] = &dmar_validate_one_drhd,
+ .ignore_unhandled = true,
+ };
+
+ down_write(&dmar_global_lock);
+ ret = dmar_table_detect();
+ if (!ret)
+ ret = dmar_walk_dmar_table((struct acpi_table_dmar *)dmar_tbl,
+ &validate_drhd_cb);
+ if (!ret && !no_iommu && !iommu_detected &&
+ (!dmar_disabled || dmar_platform_optin())) {
+ iommu_detected = 1;
+ /* Make sure ACS will be enabled */
+ pci_request_acs();
+ }
+
+#ifdef CONFIG_X86
+ if (!ret) {
+ x86_init.iommu.iommu_init = intel_iommu_init;
+ x86_platform.iommu_shutdown = intel_iommu_shutdown;
+ }
+
+#endif
+
+ if (dmar_tbl) {
+ acpi_put_table(dmar_tbl);
+ dmar_tbl = NULL;
+ }
+ up_write(&dmar_global_lock);
+
+ return ret ? ret : 1;
+}
+
+static void unmap_iommu(struct intel_iommu *iommu)
+{
+ iounmap(iommu->reg);
+ release_mem_region(iommu->reg_phys, iommu->reg_size);
+}
+
+/**
+ * map_iommu: map the iommu's registers
+ * @iommu: the iommu to map
+ * @phys_addr: the physical address of the base resgister
+ *
+ * Memory map the iommu's registers. Start w/ a single page, and
+ * possibly expand if that turns out to be insufficent.
+ */
+static int map_iommu(struct intel_iommu *iommu, u64 phys_addr)
+{
+ int map_size, err=0;
+
+ iommu->reg_phys = phys_addr;
+ iommu->reg_size = VTD_PAGE_SIZE;
+
+ if (!request_mem_region(iommu->reg_phys, iommu->reg_size, iommu->name)) {
+ pr_err("Can't reserve memory\n");
+ err = -EBUSY;
+ goto out;
+ }
+
+ iommu->reg = ioremap(iommu->reg_phys, iommu->reg_size);
+ if (!iommu->reg) {
+ pr_err("Can't map the region\n");
+ err = -ENOMEM;
+ goto release;
+ }
+
+ iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
+ iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
+
+ if (iommu->cap == (uint64_t)-1 && iommu->ecap == (uint64_t)-1) {
+ err = -EINVAL;
+ warn_invalid_dmar(phys_addr, " returns all ones");
+ goto unmap;
+ }
+ if (ecap_vcs(iommu->ecap))
+ iommu->vccap = dmar_readq(iommu->reg + DMAR_VCCAP_REG);
+
+ /* the registers might be more than one page */
+ map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
+ cap_max_fault_reg_offset(iommu->cap));
+ map_size = VTD_PAGE_ALIGN(map_size);
+ if (map_size > iommu->reg_size) {
+ iounmap(iommu->reg);
+ release_mem_region(iommu->reg_phys, iommu->reg_size);
+ iommu->reg_size = map_size;
+ if (!request_mem_region(iommu->reg_phys, iommu->reg_size,
+ iommu->name)) {
+ pr_err("Can't reserve memory\n");
+ err = -EBUSY;
+ goto out;
+ }
+ iommu->reg = ioremap(iommu->reg_phys, iommu->reg_size);
+ if (!iommu->reg) {
+ pr_err("Can't map the region\n");
+ err = -ENOMEM;
+ goto release;
+ }
+ }
+ err = 0;
+ goto out;
+
+unmap:
+ iounmap(iommu->reg);
+release:
+ release_mem_region(iommu->reg_phys, iommu->reg_size);
+out:
+ return err;
+}
+
+static int dmar_alloc_seq_id(struct intel_iommu *iommu)
+{
+ iommu->seq_id = find_first_zero_bit(dmar_seq_ids,
+ DMAR_UNITS_SUPPORTED);
+ if (iommu->seq_id >= DMAR_UNITS_SUPPORTED) {
+ iommu->seq_id = -1;
+ } else {
+ set_bit(iommu->seq_id, dmar_seq_ids);
+ sprintf(iommu->name, "dmar%d", iommu->seq_id);
+ }
+
+ return iommu->seq_id;
+}
+
+static void dmar_free_seq_id(struct intel_iommu *iommu)
+{
+ if (iommu->seq_id >= 0) {
+ clear_bit(iommu->seq_id, dmar_seq_ids);
+ iommu->seq_id = -1;
+ }
+}
+
+static int alloc_iommu(struct dmar_drhd_unit *drhd)
+{
+ struct intel_iommu *iommu;
+ u32 ver, sts;
+ int agaw = -1;
+ int msagaw = -1;
+ int err;
+
+ if (!drhd->reg_base_addr) {
+ warn_invalid_dmar(0, "");
+ return -EINVAL;
+ }
+
+ iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
+ if (!iommu)
+ return -ENOMEM;
+
+ if (dmar_alloc_seq_id(iommu) < 0) {
+ pr_err("Failed to allocate seq_id\n");
+ err = -ENOSPC;
+ goto error;
+ }
+
+ err = map_iommu(iommu, drhd->reg_base_addr);
+ if (err) {
+ pr_err("Failed to map %s\n", iommu->name);
+ goto error_free_seq_id;
+ }
+
+ err = -EINVAL;
+ if (cap_sagaw(iommu->cap) == 0) {
+ pr_info("%s: No supported address widths. Not attempting DMA translation.\n",
+ iommu->name);
+ drhd->ignored = 1;
+ }
+
+ if (!drhd->ignored) {
+ agaw = iommu_calculate_agaw(iommu);
+ if (agaw < 0) {
+ pr_err("Cannot get a valid agaw for iommu (seq_id = %d)\n",
+ iommu->seq_id);
+ drhd->ignored = 1;
+ }
+ }
+ if (!drhd->ignored) {
+ msagaw = iommu_calculate_max_sagaw(iommu);
+ if (msagaw < 0) {
+ pr_err("Cannot get a valid max agaw for iommu (seq_id = %d)\n",
+ iommu->seq_id);
+ drhd->ignored = 1;
+ agaw = -1;
+ }
+ }
+ iommu->agaw = agaw;
+ iommu->msagaw = msagaw;
+ iommu->segment = drhd->segment;
+
+ iommu->node = NUMA_NO_NODE;
+
+ ver = readl(iommu->reg + DMAR_VER_REG);
+ pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n",
+ iommu->name,
+ (unsigned long long)drhd->reg_base_addr,
+ DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
+ (unsigned long long)iommu->cap,
+ (unsigned long long)iommu->ecap);
+
+ /* Reflect status in gcmd */
+ sts = readl(iommu->reg + DMAR_GSTS_REG);
+ if (sts & DMA_GSTS_IRES)
+ iommu->gcmd |= DMA_GCMD_IRE;
+ if (sts & DMA_GSTS_TES)
+ iommu->gcmd |= DMA_GCMD_TE;
+ if (sts & DMA_GSTS_QIES)
+ iommu->gcmd |= DMA_GCMD_QIE;
+
+ raw_spin_lock_init(&iommu->register_lock);
+
+ /*
+ * This is only for hotplug; at boot time intel_iommu_enabled won't
+ * be set yet. When intel_iommu_init() runs, it registers the units
+ * present at boot time, then sets intel_iommu_enabled.
+ */
+ if (intel_iommu_enabled && !drhd->ignored) {
+ err = iommu_device_sysfs_add(&iommu->iommu, NULL,
+ intel_iommu_groups,
+ "%s", iommu->name);
+ if (err)
+ goto err_unmap;
+
+ iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
+
+ err = iommu_device_register(&iommu->iommu);
+ if (err)
+ goto err_sysfs;
+ }
+
+ drhd->iommu = iommu;
+ iommu->drhd = drhd;
+
+ return 0;
+
+err_sysfs:
+ iommu_device_sysfs_remove(&iommu->iommu);
+err_unmap:
+ unmap_iommu(iommu);
+error_free_seq_id:
+ dmar_free_seq_id(iommu);
+error:
+ kfree(iommu);
+ return err;
+}
+
+static void free_iommu(struct intel_iommu *iommu)
+{
+ if (intel_iommu_enabled && !iommu->drhd->ignored) {
+ iommu_device_unregister(&iommu->iommu);
+ iommu_device_sysfs_remove(&iommu->iommu);
+ }
+
+ if (iommu->irq) {
+ if (iommu->pr_irq) {
+ free_irq(iommu->pr_irq, iommu);
+ dmar_free_hwirq(iommu->pr_irq);
+ iommu->pr_irq = 0;
+ }
+ free_irq(iommu->irq, iommu);
+ dmar_free_hwirq(iommu->irq);
+ iommu->irq = 0;
+ }
+
+ if (iommu->qi) {
+ free_page((unsigned long)iommu->qi->desc);
+ kfree(iommu->qi->desc_status);
+ kfree(iommu->qi);
+ }
+
+ if (iommu->reg)
+ unmap_iommu(iommu);
+
+ dmar_free_seq_id(iommu);
+ kfree(iommu);
+}
+
+/*
+ * Reclaim all the submitted descriptors which have completed its work.
+ */
+static inline void reclaim_free_desc(struct q_inval *qi)
+{
+ while (qi->desc_status[qi->free_tail] == QI_DONE ||
+ qi->desc_status[qi->free_tail] == QI_ABORT) {
+ qi->desc_status[qi->free_tail] = QI_FREE;
+ qi->free_tail = (qi->free_tail + 1) % QI_LENGTH;
+ qi->free_cnt++;
+ }
+}
+
+static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index)
+{
+ u32 fault;
+ int head, tail;
+ struct q_inval *qi = iommu->qi;
+ int shift = qi_shift(iommu);
+
+ if (qi->desc_status[wait_index] == QI_ABORT)
+ return -EAGAIN;
+
+ fault = readl(iommu->reg + DMAR_FSTS_REG);
+
+ /*
+ * If IQE happens, the head points to the descriptor associated
+ * with the error. No new descriptors are fetched until the IQE
+ * is cleared.
+ */
+ if (fault & DMA_FSTS_IQE) {
+ head = readl(iommu->reg + DMAR_IQH_REG);
+ if ((head >> shift) == index) {
+ struct qi_desc *desc = qi->desc + head;
+
+ /*
+ * desc->qw2 and desc->qw3 are either reserved or
+ * used by software as private data. We won't print
+ * out these two qw's for security consideration.
+ */
+ pr_err("VT-d detected invalid descriptor: qw0 = %llx, qw1 = %llx\n",
+ (unsigned long long)desc->qw0,
+ (unsigned long long)desc->qw1);
+ memcpy(desc, qi->desc + (wait_index << shift),
+ 1 << shift);
+ writel(DMA_FSTS_IQE, iommu->reg + DMAR_FSTS_REG);
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * If ITE happens, all pending wait_desc commands are aborted.
+ * No new descriptors are fetched until the ITE is cleared.
+ */
+ if (fault & DMA_FSTS_ITE) {
+ head = readl(iommu->reg + DMAR_IQH_REG);
+ head = ((head >> shift) - 1 + QI_LENGTH) % QI_LENGTH;
+ head |= 1;
+ tail = readl(iommu->reg + DMAR_IQT_REG);
+ tail = ((tail >> shift) - 1 + QI_LENGTH) % QI_LENGTH;
+
+ writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG);
+
+ do {
+ if (qi->desc_status[head] == QI_IN_USE)
+ qi->desc_status[head] = QI_ABORT;
+ head = (head - 2 + QI_LENGTH) % QI_LENGTH;
+ } while (head != tail);
+
+ if (qi->desc_status[wait_index] == QI_ABORT)
+ return -EAGAIN;
+ }
+
+ if (fault & DMA_FSTS_ICE)
+ writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG);
+
+ return 0;
+}
+
+/*
+ * Function to submit invalidation descriptors of all types to the queued
+ * invalidation interface(QI). Multiple descriptors can be submitted at a
+ * time, a wait descriptor will be appended to each submission to ensure
+ * hardware has completed the invalidation before return. Wait descriptors
+ * can be part of the submission but it will not be polled for completion.
+ */
+int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
+ unsigned int count, unsigned long options)
+{
+ struct q_inval *qi = iommu->qi;
+ struct qi_desc wait_desc;
+ int wait_index, index;
+ unsigned long flags;
+ int offset, shift;
+ int rc, i;
+
+ if (!qi)
+ return 0;
+
+restart:
+ rc = 0;
+
+ raw_spin_lock_irqsave(&qi->q_lock, flags);
+ /*
+ * Check if we have enough empty slots in the queue to submit,
+ * the calculation is based on:
+ * # of desc + 1 wait desc + 1 space between head and tail
+ */
+ while (qi->free_cnt < count + 2) {
+ raw_spin_unlock_irqrestore(&qi->q_lock, flags);
+ cpu_relax();
+ raw_spin_lock_irqsave(&qi->q_lock, flags);
+ }
+
+ index = qi->free_head;
+ wait_index = (index + count) % QI_LENGTH;
+ shift = qi_shift(iommu);
+
+ for (i = 0; i < count; i++) {
+ offset = ((index + i) % QI_LENGTH) << shift;
+ memcpy(qi->desc + offset, &desc[i], 1 << shift);
+ qi->desc_status[(index + i) % QI_LENGTH] = QI_IN_USE;
+ }
+ qi->desc_status[wait_index] = QI_IN_USE;
+
+ wait_desc.qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
+ QI_IWD_STATUS_WRITE | QI_IWD_TYPE;
+ if (options & QI_OPT_WAIT_DRAIN)
+ wait_desc.qw0 |= QI_IWD_PRQ_DRAIN;
+ wait_desc.qw1 = virt_to_phys(&qi->desc_status[wait_index]);
+ wait_desc.qw2 = 0;
+ wait_desc.qw3 = 0;
+
+ offset = wait_index << shift;
+ memcpy(qi->desc + offset, &wait_desc, 1 << shift);
+
+ qi->free_head = (qi->free_head + count + 1) % QI_LENGTH;
+ qi->free_cnt -= count + 1;
+
+ /*
+ * update the HW tail register indicating the presence of
+ * new descriptors.
+ */
+ writel(qi->free_head << shift, iommu->reg + DMAR_IQT_REG);
+
+ while (qi->desc_status[wait_index] != QI_DONE) {
+ /*
+ * We will leave the interrupts disabled, to prevent interrupt
+ * context to queue another cmd while a cmd is already submitted
+ * and waiting for completion on this cpu. This is to avoid
+ * a deadlock where the interrupt context can wait indefinitely
+ * for free slots in the queue.
+ */
+ rc = qi_check_fault(iommu, index, wait_index);
+ if (rc)
+ break;
+
+ raw_spin_unlock(&qi->q_lock);
+ cpu_relax();
+ raw_spin_lock(&qi->q_lock);
+ }
+
+ for (i = 0; i < count; i++)
+ qi->desc_status[(index + i) % QI_LENGTH] = QI_DONE;
+
+ reclaim_free_desc(qi);
+ raw_spin_unlock_irqrestore(&qi->q_lock, flags);
+
+ if (rc == -EAGAIN)
+ goto restart;
+
+ return rc;
+}
+
+/*
+ * Flush the global interrupt entry cache.
+ */
+void qi_global_iec(struct intel_iommu *iommu)
+{
+ struct qi_desc desc;
+
+ desc.qw0 = QI_IEC_TYPE;
+ desc.qw1 = 0;
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+
+ /* should never fail */
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
+ u64 type)
+{
+ struct qi_desc desc;
+
+ desc.qw0 = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did)
+ | QI_CC_GRAN(type) | QI_CC_TYPE;
+ desc.qw1 = 0;
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+ unsigned int size_order, u64 type)
+{
+ u8 dw = 0, dr = 0;
+
+ struct qi_desc desc;
+ int ih = 0;
+
+ if (cap_write_drain(iommu->cap))
+ dw = 1;
+
+ if (cap_read_drain(iommu->cap))
+ dr = 1;
+
+ desc.qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw)
+ | QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE;
+ desc.qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih)
+ | QI_IOTLB_AM(size_order);
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+ u16 qdep, u64 addr, unsigned mask)
+{
+ struct qi_desc desc;
+
+ if (mask) {
+ addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1;
+ desc.qw1 = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE;
+ } else
+ desc.qw1 = QI_DEV_IOTLB_ADDR(addr);
+
+ if (qdep >= QI_DEV_IOTLB_MAX_INVS)
+ qdep = 0;
+
+ desc.qw0 = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
+ QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid);
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+/* PASID-based IOTLB invalidation */
+void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr,
+ unsigned long npages, bool ih)
+{
+ struct qi_desc desc = {.qw2 = 0, .qw3 = 0};
+
+ /*
+ * npages == -1 means a PASID-selective invalidation, otherwise,
+ * a positive value for Page-selective-within-PASID invalidation.
+ * 0 is not a valid input.
+ */
+ if (WARN_ON(!npages)) {
+ pr_err("Invalid input npages = %ld\n", npages);
+ return;
+ }
+
+ if (npages == -1) {
+ desc.qw0 = QI_EIOTLB_PASID(pasid) |
+ QI_EIOTLB_DID(did) |
+ QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
+ QI_EIOTLB_TYPE;
+ desc.qw1 = 0;
+ } else {
+ int mask = ilog2(__roundup_pow_of_two(npages));
+ unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask));
+
+ if (WARN_ON_ONCE(!IS_ALIGNED(addr, align)))
+ addr = ALIGN_DOWN(addr, align);
+
+ desc.qw0 = QI_EIOTLB_PASID(pasid) |
+ QI_EIOTLB_DID(did) |
+ QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
+ QI_EIOTLB_TYPE;
+ desc.qw1 = QI_EIOTLB_ADDR(addr) |
+ QI_EIOTLB_IH(ih) |
+ QI_EIOTLB_AM(mask);
+ }
+
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+/* PASID-based device IOTLB Invalidate */
+void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+ u32 pasid, u16 qdep, u64 addr, unsigned int size_order)
+{
+ unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1);
+ struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0};
+
+ desc.qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) |
+ QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE |
+ QI_DEV_IOTLB_PFSID(pfsid);
+
+ /*
+ * If S bit is 0, we only flush a single page. If S bit is set,
+ * The least significant zero bit indicates the invalidation address
+ * range. VT-d spec 6.5.2.6.
+ * e.g. address bit 12[0] indicates 8KB, 13[0] indicates 16KB.
+ * size order = 0 is PAGE_SIZE 4KB
+ * Max Invs Pending (MIP) is set to 0 for now until we have DIT in
+ * ECAP.
+ */
+ if (!IS_ALIGNED(addr, VTD_PAGE_SIZE << size_order))
+ pr_warn_ratelimited("Invalidate non-aligned address %llx, order %d\n",
+ addr, size_order);
+
+ /* Take page address */
+ desc.qw1 = QI_DEV_EIOTLB_ADDR(addr);
+
+ if (size_order) {
+ /*
+ * Existing 0s in address below size_order may be the least
+ * significant bit, we must set them to 1s to avoid having
+ * smaller size than desired.
+ */
+ desc.qw1 |= GENMASK_ULL(size_order + VTD_PAGE_SHIFT - 1,
+ VTD_PAGE_SHIFT);
+ /* Clear size_order bit to indicate size */
+ desc.qw1 &= ~mask;
+ /* Set the S bit to indicate flushing more than 1 page */
+ desc.qw1 |= QI_DEV_EIOTLB_SIZE;
+ }
+
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did,
+ u64 granu, u32 pasid)
+{
+ struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0};
+
+ desc.qw0 = QI_PC_PASID(pasid) | QI_PC_DID(did) |
+ QI_PC_GRAN(granu) | QI_PC_TYPE;
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+/*
+ * Disable Queued Invalidation interface.
+ */
+void dmar_disable_qi(struct intel_iommu *iommu)
+{
+ unsigned long flags;
+ u32 sts;
+ cycles_t start_time = get_cycles();
+
+ if (!ecap_qis(iommu->ecap))
+ return;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+ sts = readl(iommu->reg + DMAR_GSTS_REG);
+ if (!(sts & DMA_GSTS_QIES))
+ goto end;
+
+ /*
+ * Give a chance to HW to complete the pending invalidation requests.
+ */
+ while ((readl(iommu->reg + DMAR_IQT_REG) !=
+ readl(iommu->reg + DMAR_IQH_REG)) &&
+ (DMAR_OPERATION_TIMEOUT > (get_cycles() - start_time)))
+ cpu_relax();
+
+ iommu->gcmd &= ~DMA_GCMD_QIE;
+ writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl,
+ !(sts & DMA_GSTS_QIES), sts);
+end:
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+/*
+ * Enable queued invalidation.
+ */
+static void __dmar_enable_qi(struct intel_iommu *iommu)
+{
+ u32 sts;
+ unsigned long flags;
+ struct q_inval *qi = iommu->qi;
+ u64 val = virt_to_phys(qi->desc);
+
+ qi->free_head = qi->free_tail = 0;
+ qi->free_cnt = QI_LENGTH;
+
+ /*
+ * Set DW=1 and QS=1 in IQA_REG when Scalable Mode capability
+ * is present.
+ */
+ if (ecap_smts(iommu->ecap))
+ val |= (1 << 11) | 1;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+ /* write zero to the tail reg */
+ writel(0, iommu->reg + DMAR_IQT_REG);
+
+ dmar_writeq(iommu->reg + DMAR_IQA_REG, val);
+
+ iommu->gcmd |= DMA_GCMD_QIE;
+ writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+ /* Make sure hardware complete it */
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_QIES), sts);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+/*
+ * Enable Queued Invalidation interface. This is a must to support
+ * interrupt-remapping. Also used by DMA-remapping, which replaces
+ * register based IOTLB invalidation.
+ */
+int dmar_enable_qi(struct intel_iommu *iommu)
+{
+ struct q_inval *qi;
+ struct page *desc_page;
+
+ if (!ecap_qis(iommu->ecap))
+ return -ENOENT;
+
+ /*
+ * queued invalidation is already setup and enabled.
+ */
+ if (iommu->qi)
+ return 0;
+
+ iommu->qi = kmalloc(sizeof(*qi), GFP_ATOMIC);
+ if (!iommu->qi)
+ return -ENOMEM;
+
+ qi = iommu->qi;
+
+ /*
+ * Need two pages to accommodate 256 descriptors of 256 bits each
+ * if the remapping hardware supports scalable mode translation.
+ */
+ desc_page = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO,
+ !!ecap_smts(iommu->ecap));
+ if (!desc_page) {
+ kfree(qi);
+ iommu->qi = NULL;
+ return -ENOMEM;
+ }
+
+ qi->desc = page_address(desc_page);
+
+ qi->desc_status = kcalloc(QI_LENGTH, sizeof(int), GFP_ATOMIC);
+ if (!qi->desc_status) {
+ free_page((unsigned long) qi->desc);
+ kfree(qi);
+ iommu->qi = NULL;
+ return -ENOMEM;
+ }
+
+ raw_spin_lock_init(&qi->q_lock);
+
+ __dmar_enable_qi(iommu);
+
+ return 0;
+}
+
+/* iommu interrupt handling. Most stuff are MSI-like. */
+
+enum faulttype {
+ DMA_REMAP,
+ INTR_REMAP,
+ UNKNOWN,
+};
+
+static const char *dma_remap_fault_reasons[] =
+{
+ "Software",
+ "Present bit in root entry is clear",
+ "Present bit in context entry is clear",
+ "Invalid context entry",
+ "Access beyond MGAW",
+ "PTE Write access is not set",
+ "PTE Read access is not set",
+ "Next page table ptr is invalid",
+ "Root table address invalid",
+ "Context table ptr is invalid",
+ "non-zero reserved fields in RTP",
+ "non-zero reserved fields in CTP",
+ "non-zero reserved fields in PTE",
+ "PCE for translation request specifies blocking",
+};
+
+static const char * const dma_remap_sm_fault_reasons[] = {
+ "SM: Invalid Root Table Address",
+ "SM: TTM 0 for request with PASID",
+ "SM: TTM 0 for page group request",
+ "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x33-0x37 */
+ "SM: Error attempting to access Root Entry",
+ "SM: Present bit in Root Entry is clear",
+ "SM: Non-zero reserved field set in Root Entry",
+ "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x3B-0x3F */
+ "SM: Error attempting to access Context Entry",
+ "SM: Present bit in Context Entry is clear",
+ "SM: Non-zero reserved field set in the Context Entry",
+ "SM: Invalid Context Entry",
+ "SM: DTE field in Context Entry is clear",
+ "SM: PASID Enable field in Context Entry is clear",
+ "SM: PASID is larger than the max in Context Entry",
+ "SM: PRE field in Context-Entry is clear",
+ "SM: RID_PASID field error in Context-Entry",
+ "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x49-0x4F */
+ "SM: Error attempting to access the PASID Directory Entry",
+ "SM: Present bit in Directory Entry is clear",
+ "SM: Non-zero reserved field set in PASID Directory Entry",
+ "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x53-0x57 */
+ "SM: Error attempting to access PASID Table Entry",
+ "SM: Present bit in PASID Table Entry is clear",
+ "SM: Non-zero reserved field set in PASID Table Entry",
+ "SM: Invalid Scalable-Mode PASID Table Entry",
+ "SM: ERE field is clear in PASID Table Entry",
+ "SM: SRE field is clear in PASID Table Entry",
+ "Unknown", "Unknown",/* 0x5E-0x5F */
+ "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x60-0x67 */
+ "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x68-0x6F */
+ "SM: Error attempting to access first-level paging entry",
+ "SM: Present bit in first-level paging entry is clear",
+ "SM: Non-zero reserved field set in first-level paging entry",
+ "SM: Error attempting to access FL-PML4 entry",
+ "SM: First-level entry address beyond MGAW in Nested translation",
+ "SM: Read permission error in FL-PML4 entry in Nested translation",
+ "SM: Read permission error in first-level paging entry in Nested translation",
+ "SM: Write permission error in first-level paging entry in Nested translation",
+ "SM: Error attempting to access second-level paging entry",
+ "SM: Read/Write permission error in second-level paging entry",
+ "SM: Non-zero reserved field set in second-level paging entry",
+ "SM: Invalid second-level page table pointer",
+ "SM: A/D bit update needed in second-level entry when set up in no snoop",
+ "Unknown", "Unknown", "Unknown", /* 0x7D-0x7F */
+ "SM: Address in first-level translation is not canonical",
+ "SM: U/S set 0 for first-level translation with user privilege",
+ "SM: No execute permission for request with PASID and ER=1",
+ "SM: Address beyond the DMA hardware max",
+ "SM: Second-level entry address beyond the max",
+ "SM: No write permission for Write/AtomicOp request",
+ "SM: No read permission for Read/AtomicOp request",
+ "SM: Invalid address-interrupt address",
+ "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x88-0x8F */
+ "SM: A/D bit update needed in first-level entry when set up in no snoop",
+};
+
+static const char *irq_remap_fault_reasons[] =
+{
+ "Detected reserved fields in the decoded interrupt-remapped request",
+ "Interrupt index exceeded the interrupt-remapping table size",
+ "Present field in the IRTE entry is clear",
+ "Error accessing interrupt-remapping table pointed by IRTA_REG",
+ "Detected reserved fields in the IRTE entry",
+ "Blocked a compatibility format interrupt request",
+ "Blocked an interrupt request due to source-id verification failure",
+};
+
+static const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type)
+{
+ if (fault_reason >= 0x20 && (fault_reason - 0x20 <
+ ARRAY_SIZE(irq_remap_fault_reasons))) {
+ *fault_type = INTR_REMAP;
+ return irq_remap_fault_reasons[fault_reason - 0x20];
+ } else if (fault_reason >= 0x30 && (fault_reason - 0x30 <
+ ARRAY_SIZE(dma_remap_sm_fault_reasons))) {
+ *fault_type = DMA_REMAP;
+ return dma_remap_sm_fault_reasons[fault_reason - 0x30];
+ } else if (fault_reason < ARRAY_SIZE(dma_remap_fault_reasons)) {
+ *fault_type = DMA_REMAP;
+ return dma_remap_fault_reasons[fault_reason];
+ } else {
+ *fault_type = UNKNOWN;
+ return "Unknown";
+ }
+}
+
+
+static inline int dmar_msi_reg(struct intel_iommu *iommu, int irq)
+{
+ if (iommu->irq == irq)
+ return DMAR_FECTL_REG;
+ else if (iommu->pr_irq == irq)
+ return DMAR_PECTL_REG;
+ else
+ BUG();
+}
+
+void dmar_msi_unmask(struct irq_data *data)
+{
+ struct intel_iommu *iommu = irq_data_get_irq_handler_data(data);
+ int reg = dmar_msi_reg(iommu, data->irq);
+ unsigned long flag;
+
+ /* unmask it */
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ writel(0, iommu->reg + reg);
+ /* Read a reg to force flush the post write */
+ readl(iommu->reg + reg);
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_mask(struct irq_data *data)
+{
+ struct intel_iommu *iommu = irq_data_get_irq_handler_data(data);
+ int reg = dmar_msi_reg(iommu, data->irq);
+ unsigned long flag;
+
+ /* mask it */
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ writel(DMA_FECTL_IM, iommu->reg + reg);
+ /* Read a reg to force flush the post write */
+ readl(iommu->reg + reg);
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_write(int irq, struct msi_msg *msg)
+{
+ struct intel_iommu *iommu = irq_get_handler_data(irq);
+ int reg = dmar_msi_reg(iommu, irq);
+ unsigned long flag;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ writel(msg->data, iommu->reg + reg + 4);
+ writel(msg->address_lo, iommu->reg + reg + 8);
+ writel(msg->address_hi, iommu->reg + reg + 12);
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_read(int irq, struct msi_msg *msg)
+{
+ struct intel_iommu *iommu = irq_get_handler_data(irq);
+ int reg = dmar_msi_reg(iommu, irq);
+ unsigned long flag;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ msg->data = readl(iommu->reg + reg + 4);
+ msg->address_lo = readl(iommu->reg + reg + 8);
+ msg->address_hi = readl(iommu->reg + reg + 12);
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
+ u8 fault_reason, u32 pasid, u16 source_id,
+ unsigned long long addr)
+{
+ const char *reason;
+ int fault_type;
+
+ reason = dmar_get_fault_reason(fault_reason, &fault_type);
+
+ if (fault_type == INTR_REMAP)
+ pr_err("[INTR-REMAP] Request device [%02x:%02x.%d] fault index %llx [fault reason %02d] %s\n",
+ source_id >> 8, PCI_SLOT(source_id & 0xFF),
+ PCI_FUNC(source_id & 0xFF), addr >> 48,
+ fault_reason, reason);
+ else
+ pr_err("[%s] Request device [%02x:%02x.%d] PASID %x fault addr %llx [fault reason %02d] %s\n",
+ type ? "DMA Read" : "DMA Write",
+ source_id >> 8, PCI_SLOT(source_id & 0xFF),
+ PCI_FUNC(source_id & 0xFF), pasid, addr,
+ fault_reason, reason);
+ return 0;
+}
+
+#define PRIMARY_FAULT_REG_LEN (16)
+irqreturn_t dmar_fault(int irq, void *dev_id)
+{
+ struct intel_iommu *iommu = dev_id;
+ int reg, fault_index;
+ u32 fault_status;
+ unsigned long flag;
+ static DEFINE_RATELIMIT_STATE(rs,
+ DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ fault_status = readl(iommu->reg + DMAR_FSTS_REG);
+ if (fault_status && __ratelimit(&rs))
+ pr_err("DRHD: handling fault status reg %x\n", fault_status);
+
+ /* TBD: ignore advanced fault log currently */
+ if (!(fault_status & DMA_FSTS_PPF))
+ goto unlock_exit;
+
+ fault_index = dma_fsts_fault_record_index(fault_status);
+ reg = cap_fault_reg_offset(iommu->cap);
+ while (1) {
+ /* Disable printing, simply clear the fault when ratelimited */
+ bool ratelimited = !__ratelimit(&rs);
+ u8 fault_reason;
+ u16 source_id;
+ u64 guest_addr;
+ u32 pasid;
+ int type;
+ u32 data;
+ bool pasid_present;
+
+ /* highest 32 bits */
+ data = readl(iommu->reg + reg +
+ fault_index * PRIMARY_FAULT_REG_LEN + 12);
+ if (!(data & DMA_FRCD_F))
+ break;
+
+ if (!ratelimited) {
+ fault_reason = dma_frcd_fault_reason(data);
+ type = dma_frcd_type(data);
+
+ pasid = dma_frcd_pasid_value(data);
+ data = readl(iommu->reg + reg +
+ fault_index * PRIMARY_FAULT_REG_LEN + 8);
+ source_id = dma_frcd_source_id(data);
+
+ pasid_present = dma_frcd_pasid_present(data);
+ guest_addr = dmar_readq(iommu->reg + reg +
+ fault_index * PRIMARY_FAULT_REG_LEN);
+ guest_addr = dma_frcd_page_addr(guest_addr);
+ }
+
+ /* clear the fault */
+ writel(DMA_FRCD_F, iommu->reg + reg +
+ fault_index * PRIMARY_FAULT_REG_LEN + 12);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+ if (!ratelimited)
+ /* Using pasid -1 if pasid is not present */
+ dmar_fault_do_one(iommu, type, fault_reason,
+ pasid_present ? pasid : -1,
+ source_id, guest_addr);
+
+ fault_index++;
+ if (fault_index >= cap_num_fault_regs(iommu->cap))
+ fault_index = 0;
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ }
+
+ writel(DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_PRO,
+ iommu->reg + DMAR_FSTS_REG);
+
+unlock_exit:
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+ return IRQ_HANDLED;
+}
+
+int dmar_set_interrupt(struct intel_iommu *iommu)
+{
+ int irq, ret;
+
+ /*
+ * Check if the fault interrupt is already initialized.
+ */
+ if (iommu->irq)
+ return 0;
+
+ irq = dmar_alloc_hwirq(iommu->seq_id, iommu->node, iommu);
+ if (irq > 0) {
+ iommu->irq = irq;
+ } else {
+ pr_err("No free IRQ vectors\n");
+ return -EINVAL;
+ }
+
+ ret = request_irq(irq, dmar_fault, IRQF_NO_THREAD, iommu->name, iommu);
+ if (ret)
+ pr_err("Can't request irq\n");
+ return ret;
+}
+
+int __init enable_drhd_fault_handling(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+
+ /*
+ * Enable fault control interrupt.
+ */
+ for_each_iommu(iommu, drhd) {
+ u32 fault_status;
+ int ret = dmar_set_interrupt(iommu);
+
+ if (ret) {
+ pr_err("DRHD %Lx: failed to enable fault, interrupt, ret %d\n",
+ (unsigned long long)drhd->reg_base_addr, ret);
+ return -1;
+ }
+
+ /*
+ * Clear any previous faults.
+ */
+ dmar_fault(iommu->irq, iommu);
+ fault_status = readl(iommu->reg + DMAR_FSTS_REG);
+ writel(fault_status, iommu->reg + DMAR_FSTS_REG);
+ }
+
+ return 0;
+}
+
+/*
+ * Re-enable Queued Invalidation interface.
+ */
+int dmar_reenable_qi(struct intel_iommu *iommu)
+{
+ if (!ecap_qis(iommu->ecap))
+ return -ENOENT;
+
+ if (!iommu->qi)
+ return -ENOENT;
+
+ /*
+ * First disable queued invalidation.
+ */
+ dmar_disable_qi(iommu);
+ /*
+ * Then enable queued invalidation again. Since there is no pending
+ * invalidation requests now, it's safe to re-enable queued
+ * invalidation.
+ */
+ __dmar_enable_qi(iommu);
+
+ return 0;
+}
+
+/*
+ * Check interrupt remapping support in DMAR table description.
+ */
+int __init dmar_ir_support(void)
+{
+ struct acpi_table_dmar *dmar;
+ dmar = (struct acpi_table_dmar *)dmar_tbl;
+ if (!dmar)
+ return 0;
+ return dmar->flags & 0x1;
+}
+
+/* Check whether DMAR units are in use */
+static inline bool dmar_in_use(void)
+{
+ return irq_remapping_enabled || intel_iommu_enabled;
+}
+
+static int __init dmar_free_unused_resources(void)
+{
+ struct dmar_drhd_unit *dmaru, *dmaru_n;
+
+ if (dmar_in_use())
+ return 0;
+
+ if (dmar_dev_scope_status != 1 && !list_empty(&dmar_drhd_units))
+ bus_unregister_notifier(&pci_bus_type, &dmar_pci_bus_nb);
+
+ down_write(&dmar_global_lock);
+ list_for_each_entry_safe(dmaru, dmaru_n, &dmar_drhd_units, list) {
+ list_del(&dmaru->list);
+ dmar_free_drhd(dmaru);
+ }
+ up_write(&dmar_global_lock);
+
+ return 0;
+}
+
+late_initcall(dmar_free_unused_resources);
+IOMMU_INIT_POST(detect_intel_iommu);
+
+/*
+ * DMAR Hotplug Support
+ * For more details, please refer to Intel(R) Virtualization Technology
+ * for Directed-IO Architecture Specifiction, Rev 2.2, Section 8.8
+ * "Remapping Hardware Unit Hot Plug".
+ */
+static guid_t dmar_hp_guid =
+ GUID_INIT(0xD8C1A3A6, 0xBE9B, 0x4C9B,
+ 0x91, 0xBF, 0xC3, 0xCB, 0x81, 0xFC, 0x5D, 0xAF);
+
+/*
+ * Currently there's only one revision and BIOS will not check the revision id,
+ * so use 0 for safety.
+ */
+#define DMAR_DSM_REV_ID 0
+#define DMAR_DSM_FUNC_DRHD 1
+#define DMAR_DSM_FUNC_ATSR 2
+#define DMAR_DSM_FUNC_RHSA 3
+
+static inline bool dmar_detect_dsm(acpi_handle handle, int func)
+{
+ return acpi_check_dsm(handle, &dmar_hp_guid, DMAR_DSM_REV_ID, 1 << func);
+}
+
+static int dmar_walk_dsm_resource(acpi_handle handle, int func,
+ dmar_res_handler_t handler, void *arg)
+{
+ int ret = -ENODEV;
+ union acpi_object *obj;
+ struct acpi_dmar_header *start;
+ struct dmar_res_callback callback;
+ static int res_type[] = {
+ [DMAR_DSM_FUNC_DRHD] = ACPI_DMAR_TYPE_HARDWARE_UNIT,
+ [DMAR_DSM_FUNC_ATSR] = ACPI_DMAR_TYPE_ROOT_ATS,
+ [DMAR_DSM_FUNC_RHSA] = ACPI_DMAR_TYPE_HARDWARE_AFFINITY,
+ };
+
+ if (!dmar_detect_dsm(handle, func))
+ return 0;
+
+ obj = acpi_evaluate_dsm_typed(handle, &dmar_hp_guid, DMAR_DSM_REV_ID,
+ func, NULL, ACPI_TYPE_BUFFER);
+ if (!obj)
+ return -ENODEV;
+
+ memset(&callback, 0, sizeof(callback));
+ callback.cb[res_type[func]] = handler;
+ callback.arg[res_type[func]] = arg;
+ start = (struct acpi_dmar_header *)obj->buffer.pointer;
+ ret = dmar_walk_remapping_entries(start, obj->buffer.length, &callback);
+
+ ACPI_FREE(obj);
+
+ return ret;
+}
+
+static int dmar_hp_add_drhd(struct acpi_dmar_header *header, void *arg)
+{
+ int ret;
+ struct dmar_drhd_unit *dmaru;
+
+ dmaru = dmar_find_dmaru((struct acpi_dmar_hardware_unit *)header);
+ if (!dmaru)
+ return -ENODEV;
+
+ ret = dmar_ir_hotplug(dmaru, true);
+ if (ret == 0)
+ ret = dmar_iommu_hotplug(dmaru, true);
+
+ return ret;
+}
+
+static int dmar_hp_remove_drhd(struct acpi_dmar_header *header, void *arg)
+{
+ int i, ret;
+ struct device *dev;
+ struct dmar_drhd_unit *dmaru;
+
+ dmaru = dmar_find_dmaru((struct acpi_dmar_hardware_unit *)header);
+ if (!dmaru)
+ return 0;
+
+ /*
+ * All PCI devices managed by this unit should have been destroyed.
+ */
+ if (!dmaru->include_all && dmaru->devices && dmaru->devices_cnt) {
+ for_each_active_dev_scope(dmaru->devices,
+ dmaru->devices_cnt, i, dev)
+ return -EBUSY;
+ }
+
+ ret = dmar_ir_hotplug(dmaru, false);
+ if (ret == 0)
+ ret = dmar_iommu_hotplug(dmaru, false);
+
+ return ret;
+}
+
+static int dmar_hp_release_drhd(struct acpi_dmar_header *header, void *arg)
+{
+ struct dmar_drhd_unit *dmaru;
+
+ dmaru = dmar_find_dmaru((struct acpi_dmar_hardware_unit *)header);
+ if (dmaru) {
+ list_del_rcu(&dmaru->list);
+ synchronize_rcu();
+ dmar_free_drhd(dmaru);
+ }
+
+ return 0;
+}
+
+static int dmar_hotplug_insert(acpi_handle handle)
+{
+ int ret;
+ int drhd_count = 0;
+
+ ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+ &dmar_validate_one_drhd, (void *)1);
+ if (ret)
+ goto out;
+
+ ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+ &dmar_parse_one_drhd, (void *)&drhd_count);
+ if (ret == 0 && drhd_count == 0) {
+ pr_warn(FW_BUG "No DRHD structures in buffer returned by _DSM method\n");
+ goto out;
+ } else if (ret) {
+ goto release_drhd;
+ }
+
+ ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_RHSA,
+ &dmar_parse_one_rhsa, NULL);
+ if (ret)
+ goto release_drhd;
+
+ ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR,
+ &dmar_parse_one_atsr, NULL);
+ if (ret)
+ goto release_atsr;
+
+ ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+ &dmar_hp_add_drhd, NULL);
+ if (!ret)
+ return 0;
+
+ dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+ &dmar_hp_remove_drhd, NULL);
+release_atsr:
+ dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR,
+ &dmar_release_one_atsr, NULL);
+release_drhd:
+ dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+ &dmar_hp_release_drhd, NULL);
+out:
+ return ret;
+}
+
+static int dmar_hotplug_remove(acpi_handle handle)
+{
+ int ret;
+
+ ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR,
+ &dmar_check_one_atsr, NULL);
+ if (ret)
+ return ret;
+
+ ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+ &dmar_hp_remove_drhd, NULL);
+ if (ret == 0) {
+ WARN_ON(dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR,
+ &dmar_release_one_atsr, NULL));
+ WARN_ON(dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+ &dmar_hp_release_drhd, NULL));
+ } else {
+ dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+ &dmar_hp_add_drhd, NULL);
+ }
+
+ return ret;
+}
+
+static acpi_status dmar_get_dsm_handle(acpi_handle handle, u32 lvl,
+ void *context, void **retval)
+{
+ acpi_handle *phdl = retval;
+
+ if (dmar_detect_dsm(handle, DMAR_DSM_FUNC_DRHD)) {
+ *phdl = handle;
+ return AE_CTRL_TERMINATE;
+ }
+
+ return AE_OK;
+}
+
+static int dmar_device_hotplug(acpi_handle handle, bool insert)
+{
+ int ret;
+ acpi_handle tmp = NULL;
+ acpi_status status;
+
+ if (!dmar_in_use())
+ return 0;
+
+ if (dmar_detect_dsm(handle, DMAR_DSM_FUNC_DRHD)) {
+ tmp = handle;
+ } else {
+ status = acpi_walk_namespace(ACPI_TYPE_DEVICE, handle,
+ ACPI_UINT32_MAX,
+ dmar_get_dsm_handle,
+ NULL, NULL, &tmp);
+ if (ACPI_FAILURE(status)) {
+ pr_warn("Failed to locate _DSM method.\n");
+ return -ENXIO;
+ }
+ }
+ if (tmp == NULL)
+ return 0;
+
+ down_write(&dmar_global_lock);
+ if (insert)
+ ret = dmar_hotplug_insert(tmp);
+ else
+ ret = dmar_hotplug_remove(tmp);
+ up_write(&dmar_global_lock);
+
+ return ret;
+}
+
+int dmar_device_add(acpi_handle handle)
+{
+ return dmar_device_hotplug(handle, true);
+}
+
+int dmar_device_remove(acpi_handle handle)
+{
+ return dmar_device_hotplug(handle, false);
+}
+
+/*
+ * dmar_platform_optin - Is %DMA_CTRL_PLATFORM_OPT_IN_FLAG set in DMAR table
+ *
+ * Returns true if the platform has %DMA_CTRL_PLATFORM_OPT_IN_FLAG set in
+ * the ACPI DMAR table. This means that the platform boot firmware has made
+ * sure no device can issue DMA outside of RMRR regions.
+ */
+bool dmar_platform_optin(void)
+{
+ struct acpi_table_dmar *dmar;
+ acpi_status status;
+ bool ret;
+
+ status = acpi_get_table(ACPI_SIG_DMAR, 0,
+ (struct acpi_table_header **)&dmar);
+ if (ACPI_FAILURE(status))
+ return false;
+
+ ret = !!(dmar->flags & DMAR_PLATFORM_OPT_IN);
+ acpi_put_table((struct acpi_table_header *)dmar);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dmar_platform_optin);
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
new file mode 100644
index 000000000..6be92e0af
--- /dev/null
+++ b/drivers/iommu/intel/iommu.c
@@ -0,0 +1,6406 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2006-2014 Intel Corporation.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>,
+ * Ashok Raj <ashok.raj@intel.com>,
+ * Shaohua Li <shaohua.li@intel.com>,
+ * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Joerg Roedel <jroedel@suse.de>
+ */
+
+#define pr_fmt(fmt) "DMAR: " fmt
+#define dev_fmt(fmt) pr_fmt(fmt)
+
+#include <linux/init.h>
+#include <linux/bitmap.h>
+#include <linux/debugfs.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/dma-map-ops.h>
+#include <linux/mempool.h>
+#include <linux/memory.h>
+#include <linux/cpu.h>
+#include <linux/timer.h>
+#include <linux/io.h>
+#include <linux/iova.h>
+#include <linux/iommu.h>
+#include <linux/intel-iommu.h>
+#include <linux/syscore_ops.h>
+#include <linux/tboot.h>
+#include <linux/dmi.h>
+#include <linux/pci-ats.h>
+#include <linux/memblock.h>
+#include <linux/dma-map-ops.h>
+#include <linux/dma-direct.h>
+#include <linux/crash_dump.h>
+#include <linux/numa.h>
+#include <linux/swiotlb.h>
+#include <asm/irq_remapping.h>
+#include <asm/cacheflush.h>
+#include <asm/iommu.h>
+#include <trace/events/intel_iommu.h>
+
+#include "../irq_remapping.h"
+#include "pasid.h"
+
+#define ROOT_SIZE VTD_PAGE_SIZE
+#define CONTEXT_SIZE VTD_PAGE_SIZE
+
+#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
+#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
+#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
+#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
+
+#define IOAPIC_RANGE_START (0xfee00000)
+#define IOAPIC_RANGE_END (0xfeefffff)
+#define IOVA_START_ADDR (0x1000)
+
+#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
+
+#define MAX_AGAW_WIDTH 64
+#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
+
+#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
+#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
+
+/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
+ to match. That way, we can use 'unsigned long' for PFNs with impunity. */
+#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
+ __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
+#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
+
+/* IO virtual address start page frame number */
+#define IOVA_START_PFN (1)
+
+#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
+
+/* page table handling */
+#define LEVEL_STRIDE (9)
+#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
+
+/*
+ * This bitmap is used to advertise the page sizes our hardware support
+ * to the IOMMU core, which will then use this information to split
+ * physically contiguous memory regions it is mapping into page sizes
+ * that we support.
+ *
+ * Traditionally the IOMMU core just handed us the mappings directly,
+ * after making sure the size is an order of a 4KiB page and that the
+ * mapping has natural alignment.
+ *
+ * To retain this behavior, we currently advertise that we support
+ * all page sizes that are an order of 4KiB.
+ *
+ * If at some point we'd like to utilize the IOMMU core's new behavior,
+ * we could change this to advertise the real page sizes we support.
+ */
+#define INTEL_IOMMU_PGSIZES (~0xFFFUL)
+
+static inline int agaw_to_level(int agaw)
+{
+ return agaw + 2;
+}
+
+static inline int agaw_to_width(int agaw)
+{
+ return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
+}
+
+static inline int width_to_agaw(int width)
+{
+ return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
+}
+
+static inline unsigned int level_to_offset_bits(int level)
+{
+ return (level - 1) * LEVEL_STRIDE;
+}
+
+static inline int pfn_level_offset(u64 pfn, int level)
+{
+ return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
+}
+
+static inline u64 level_mask(int level)
+{
+ return -1ULL << level_to_offset_bits(level);
+}
+
+static inline u64 level_size(int level)
+{
+ return 1ULL << level_to_offset_bits(level);
+}
+
+static inline u64 align_to_level(u64 pfn, int level)
+{
+ return (pfn + level_size(level) - 1) & level_mask(level);
+}
+
+static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
+{
+ return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
+}
+
+/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
+ are never going to work. */
+static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
+{
+ return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
+}
+
+static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
+{
+ return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
+}
+static inline unsigned long page_to_dma_pfn(struct page *pg)
+{
+ return mm_to_dma_pfn(page_to_pfn(pg));
+}
+static inline unsigned long virt_to_dma_pfn(void *p)
+{
+ return page_to_dma_pfn(virt_to_page(p));
+}
+
+/* global iommu list, set NULL for ignored DMAR units */
+static struct intel_iommu **g_iommus;
+
+static void __init check_tylersburg_isoch(void);
+static int rwbf_quirk;
+
+/*
+ * set to 1 to panic kernel if can't successfully enable VT-d
+ * (used when kernel is launched w/ TXT)
+ */
+static int force_on = 0;
+static int intel_iommu_tboot_noforce;
+static int no_platform_optin;
+
+#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
+
+/*
+ * Take a root_entry and return the Lower Context Table Pointer (LCTP)
+ * if marked present.
+ */
+static phys_addr_t root_entry_lctp(struct root_entry *re)
+{
+ if (!(re->lo & 1))
+ return 0;
+
+ return re->lo & VTD_PAGE_MASK;
+}
+
+/*
+ * Take a root_entry and return the Upper Context Table Pointer (UCTP)
+ * if marked present.
+ */
+static phys_addr_t root_entry_uctp(struct root_entry *re)
+{
+ if (!(re->hi & 1))
+ return 0;
+
+ return re->hi & VTD_PAGE_MASK;
+}
+
+static inline void context_clear_pasid_enable(struct context_entry *context)
+{
+ context->lo &= ~(1ULL << 11);
+}
+
+static inline bool context_pasid_enabled(struct context_entry *context)
+{
+ return !!(context->lo & (1ULL << 11));
+}
+
+static inline void context_set_copied(struct context_entry *context)
+{
+ context->hi |= (1ull << 3);
+}
+
+static inline bool context_copied(struct context_entry *context)
+{
+ return !!(context->hi & (1ULL << 3));
+}
+
+static inline bool __context_present(struct context_entry *context)
+{
+ return (context->lo & 1);
+}
+
+bool context_present(struct context_entry *context)
+{
+ return context_pasid_enabled(context) ?
+ __context_present(context) :
+ __context_present(context) && !context_copied(context);
+}
+
+static inline void context_set_present(struct context_entry *context)
+{
+ context->lo |= 1;
+}
+
+static inline void context_set_fault_enable(struct context_entry *context)
+{
+ context->lo &= (((u64)-1) << 2) | 1;
+}
+
+static inline void context_set_translation_type(struct context_entry *context,
+ unsigned long value)
+{
+ context->lo &= (((u64)-1) << 4) | 3;
+ context->lo |= (value & 3) << 2;
+}
+
+static inline void context_set_address_root(struct context_entry *context,
+ unsigned long value)
+{
+ context->lo &= ~VTD_PAGE_MASK;
+ context->lo |= value & VTD_PAGE_MASK;
+}
+
+static inline void context_set_address_width(struct context_entry *context,
+ unsigned long value)
+{
+ context->hi |= value & 7;
+}
+
+static inline void context_set_domain_id(struct context_entry *context,
+ unsigned long value)
+{
+ context->hi |= (value & ((1 << 16) - 1)) << 8;
+}
+
+static inline int context_domain_id(struct context_entry *c)
+{
+ return((c->hi >> 8) & 0xffff);
+}
+
+static inline void context_clear_entry(struct context_entry *context)
+{
+ context->lo = 0;
+ context->hi = 0;
+}
+
+/*
+ * This domain is a statically identity mapping domain.
+ * 1. This domain creats a static 1:1 mapping to all usable memory.
+ * 2. It maps to each iommu if successful.
+ * 3. Each iommu mapps to this domain if successful.
+ */
+static struct dmar_domain *si_domain;
+static int hw_pass_through = 1;
+
+#define for_each_domain_iommu(idx, domain) \
+ for (idx = 0; idx < g_num_of_iommus; idx++) \
+ if (domain->iommu_refcnt[idx])
+
+struct dmar_rmrr_unit {
+ struct list_head list; /* list of rmrr units */
+ struct acpi_dmar_header *hdr; /* ACPI header */
+ u64 base_address; /* reserved base address*/
+ u64 end_address; /* reserved end address */
+ struct dmar_dev_scope *devices; /* target devices */
+ int devices_cnt; /* target device count */
+};
+
+struct dmar_atsr_unit {
+ struct list_head list; /* list of ATSR units */
+ struct acpi_dmar_header *hdr; /* ACPI header */
+ struct dmar_dev_scope *devices; /* target devices */
+ int devices_cnt; /* target device count */
+ u8 include_all:1; /* include all ports */
+};
+
+static LIST_HEAD(dmar_atsr_units);
+static LIST_HEAD(dmar_rmrr_units);
+
+#define for_each_rmrr_units(rmrr) \
+ list_for_each_entry(rmrr, &dmar_rmrr_units, list)
+
+/* bitmap for indexing intel_iommus */
+static int g_num_of_iommus;
+
+static void domain_exit(struct dmar_domain *domain);
+static void domain_remove_dev_info(struct dmar_domain *domain);
+static void dmar_remove_one_dev_info(struct device *dev);
+static void __dmar_remove_one_dev_info(struct device_domain_info *info);
+static int intel_iommu_attach_device(struct iommu_domain *domain,
+ struct device *dev);
+static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t iova);
+
+#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
+int dmar_disabled = 0;
+#else
+int dmar_disabled = 1;
+#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
+
+#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
+int intel_iommu_sm = 1;
+#else
+int intel_iommu_sm;
+#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
+
+int intel_iommu_enabled = 0;
+EXPORT_SYMBOL_GPL(intel_iommu_enabled);
+
+static int dmar_map_gfx = 1;
+static int dmar_forcedac;
+static int intel_iommu_strict;
+static int intel_iommu_superpage = 1;
+static int iommu_identity_mapping;
+static int intel_no_bounce;
+static int iommu_skip_te_disable;
+
+#define IDENTMAP_GFX 2
+#define IDENTMAP_AZALIA 4
+
+int intel_iommu_gfx_mapped;
+EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
+
+#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
+struct device_domain_info *get_domain_info(struct device *dev)
+{
+ struct device_domain_info *info;
+
+ if (!dev)
+ return NULL;
+
+ info = dev_iommu_priv_get(dev);
+ if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
+ return NULL;
+
+ return info;
+}
+
+DEFINE_SPINLOCK(device_domain_lock);
+static LIST_HEAD(device_domain_list);
+
+#define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
+ to_pci_dev(d)->untrusted)
+
+/*
+ * Iterate over elements in device_domain_list and call the specified
+ * callback @fn against each element.
+ */
+int for_each_device_domain(int (*fn)(struct device_domain_info *info,
+ void *data), void *data)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct device_domain_info *info;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ list_for_each_entry(info, &device_domain_list, global) {
+ ret = fn(info, data);
+ if (ret) {
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+ return ret;
+ }
+ }
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ return 0;
+}
+
+const struct iommu_ops intel_iommu_ops;
+
+static bool translation_pre_enabled(struct intel_iommu *iommu)
+{
+ return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
+}
+
+static void clear_translation_pre_enabled(struct intel_iommu *iommu)
+{
+ iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
+}
+
+static void init_translation_status(struct intel_iommu *iommu)
+{
+ u32 gsts;
+
+ gsts = readl(iommu->reg + DMAR_GSTS_REG);
+ if (gsts & DMA_GSTS_TES)
+ iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
+}
+
+static int __init intel_iommu_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ while (*str) {
+ if (!strncmp(str, "on", 2)) {
+ dmar_disabled = 0;
+ pr_info("IOMMU enabled\n");
+ } else if (!strncmp(str, "off", 3)) {
+ dmar_disabled = 1;
+ no_platform_optin = 1;
+ pr_info("IOMMU disabled\n");
+ } else if (!strncmp(str, "igfx_off", 8)) {
+ dmar_map_gfx = 0;
+ pr_info("Disable GFX device mapping\n");
+ } else if (!strncmp(str, "forcedac", 8)) {
+ pr_info("Forcing DAC for PCI devices\n");
+ dmar_forcedac = 1;
+ } else if (!strncmp(str, "strict", 6)) {
+ pr_info("Disable batched IOTLB flush\n");
+ intel_iommu_strict = 1;
+ } else if (!strncmp(str, "sp_off", 6)) {
+ pr_info("Disable supported super page\n");
+ intel_iommu_superpage = 0;
+ } else if (!strncmp(str, "sm_on", 5)) {
+ pr_info("Intel-IOMMU: scalable mode supported\n");
+ intel_iommu_sm = 1;
+ } else if (!strncmp(str, "tboot_noforce", 13)) {
+ pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
+ intel_iommu_tboot_noforce = 1;
+ } else if (!strncmp(str, "nobounce", 8)) {
+ pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
+ intel_no_bounce = 1;
+ }
+
+ str += strcspn(str, ",");
+ while (*str == ',')
+ str++;
+ }
+ return 0;
+}
+__setup("intel_iommu=", intel_iommu_setup);
+
+static struct kmem_cache *iommu_domain_cache;
+static struct kmem_cache *iommu_devinfo_cache;
+
+static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
+{
+ struct dmar_domain **domains;
+ int idx = did >> 8;
+
+ domains = iommu->domains[idx];
+ if (!domains)
+ return NULL;
+
+ return domains[did & 0xff];
+}
+
+static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
+ struct dmar_domain *domain)
+{
+ struct dmar_domain **domains;
+ int idx = did >> 8;
+
+ if (!iommu->domains[idx]) {
+ size_t size = 256 * sizeof(struct dmar_domain *);
+ iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
+ }
+
+ domains = iommu->domains[idx];
+ if (WARN_ON(!domains))
+ return;
+ else
+ domains[did & 0xff] = domain;
+}
+
+void *alloc_pgtable_page(int node)
+{
+ struct page *page;
+ void *vaddr = NULL;
+
+ page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
+ if (page)
+ vaddr = page_address(page);
+ return vaddr;
+}
+
+void free_pgtable_page(void *vaddr)
+{
+ free_page((unsigned long)vaddr);
+}
+
+static inline void *alloc_domain_mem(void)
+{
+ return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
+}
+
+static void free_domain_mem(void *vaddr)
+{
+ kmem_cache_free(iommu_domain_cache, vaddr);
+}
+
+static inline void * alloc_devinfo_mem(void)
+{
+ return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
+}
+
+static inline void free_devinfo_mem(void *vaddr)
+{
+ kmem_cache_free(iommu_devinfo_cache, vaddr);
+}
+
+static inline int domain_type_is_si(struct dmar_domain *domain)
+{
+ return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
+}
+
+static inline bool domain_use_first_level(struct dmar_domain *domain)
+{
+ return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
+}
+
+static inline int domain_pfn_supported(struct dmar_domain *domain,
+ unsigned long pfn)
+{
+ int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+
+ return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
+}
+
+/*
+ * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
+ * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
+ * the returned SAGAW.
+ */
+static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
+{
+ unsigned long fl_sagaw, sl_sagaw;
+
+ fl_sagaw = BIT(2) | (cap_5lp_support(iommu->cap) ? BIT(3) : 0);
+ sl_sagaw = cap_sagaw(iommu->cap);
+
+ /* Second level only. */
+ if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
+ return sl_sagaw;
+
+ /* First level only. */
+ if (!ecap_slts(iommu->ecap))
+ return fl_sagaw;
+
+ return fl_sagaw & sl_sagaw;
+}
+
+static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
+{
+ unsigned long sagaw;
+ int agaw = -1;
+
+ sagaw = __iommu_calculate_sagaw(iommu);
+ for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
+ if (test_bit(agaw, &sagaw))
+ break;
+ }
+
+ return agaw;
+}
+
+/*
+ * Calculate max SAGAW for each iommu.
+ */
+int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
+{
+ return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
+}
+
+/*
+ * calculate agaw for each iommu.
+ * "SAGAW" may be different across iommus, use a default agaw, and
+ * get a supported less agaw for iommus that don't support the default agaw.
+ */
+int iommu_calculate_agaw(struct intel_iommu *iommu)
+{
+ return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
+}
+
+/* This functionin only returns single iommu in a domain */
+struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
+{
+ int iommu_id;
+
+ /* si_domain and vm domain should not get here. */
+ if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
+ return NULL;
+
+ for_each_domain_iommu(iommu_id, domain)
+ break;
+
+ if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
+ return NULL;
+
+ return g_iommus[iommu_id];
+}
+
+static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
+{
+ return sm_supported(iommu) ?
+ ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
+}
+
+static void domain_update_iommu_coherency(struct dmar_domain *domain)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ bool found = false;
+ int i;
+
+ domain->iommu_coherency = 1;
+
+ for_each_domain_iommu(i, domain) {
+ found = true;
+ if (!iommu_paging_structure_coherency(g_iommus[i])) {
+ domain->iommu_coherency = 0;
+ break;
+ }
+ }
+ if (found)
+ return;
+
+ /* No hardware attached; use lowest common denominator */
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ if (!iommu_paging_structure_coherency(iommu)) {
+ domain->iommu_coherency = 0;
+ break;
+ }
+ }
+ rcu_read_unlock();
+}
+
+static int domain_update_iommu_snooping(struct intel_iommu *skip)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ int ret = 1;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ if (iommu != skip) {
+ /*
+ * If the hardware is operating in the scalable mode,
+ * the snooping control is always supported since we
+ * always set PASID-table-entry.PGSNP bit if the domain
+ * is managed outside (UNMANAGED).
+ */
+ if (!sm_supported(iommu) &&
+ !ecap_sc_support(iommu->ecap)) {
+ ret = 0;
+ break;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int domain_update_iommu_superpage(struct dmar_domain *domain,
+ struct intel_iommu *skip)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ int mask = 0x3;
+
+ if (!intel_iommu_superpage) {
+ return 0;
+ }
+
+ /* set iommu_superpage to the smallest common denominator */
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ if (iommu != skip) {
+ if (domain && domain_use_first_level(domain)) {
+ if (!cap_fl1gp_support(iommu->cap))
+ mask = 0x1;
+ } else {
+ mask &= cap_super_page_val(iommu->cap);
+ }
+
+ if (!mask)
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return fls(mask);
+}
+
+static int domain_update_device_node(struct dmar_domain *domain)
+{
+ struct device_domain_info *info;
+ int nid = NUMA_NO_NODE;
+
+ assert_spin_locked(&device_domain_lock);
+
+ if (list_empty(&domain->devices))
+ return NUMA_NO_NODE;
+
+ list_for_each_entry(info, &domain->devices, link) {
+ if (!info->dev)
+ continue;
+
+ /*
+ * There could possibly be multiple device numa nodes as devices
+ * within the same domain may sit behind different IOMMUs. There
+ * isn't perfect answer in such situation, so we select first
+ * come first served policy.
+ */
+ nid = dev_to_node(info->dev);
+ if (nid != NUMA_NO_NODE)
+ break;
+ }
+
+ return nid;
+}
+
+/* Some capabilities may be different across iommus */
+static void domain_update_iommu_cap(struct dmar_domain *domain)
+{
+ domain_update_iommu_coherency(domain);
+ domain->iommu_snooping = domain_update_iommu_snooping(NULL);
+ domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
+
+ /*
+ * If RHSA is missing, we should default to the device numa domain
+ * as fall back.
+ */
+ if (domain->nid == NUMA_NO_NODE)
+ domain->nid = domain_update_device_node(domain);
+
+ /*
+ * First-level translation restricts the input-address to a
+ * canonical address (i.e., address bits 63:N have the same
+ * value as address bit [N-1], where N is 48-bits with 4-level
+ * paging and 57-bits with 5-level paging). Hence, skip bit
+ * [N-1].
+ */
+ if (domain_use_first_level(domain))
+ domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
+ else
+ domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
+}
+
+struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
+ u8 devfn, int alloc)
+{
+ struct root_entry *root = &iommu->root_entry[bus];
+ struct context_entry *context;
+ u64 *entry;
+
+ entry = &root->lo;
+ if (sm_supported(iommu)) {
+ if (devfn >= 0x80) {
+ devfn -= 0x80;
+ entry = &root->hi;
+ }
+ devfn *= 2;
+ }
+ if (*entry & 1)
+ context = phys_to_virt(*entry & VTD_PAGE_MASK);
+ else {
+ unsigned long phy_addr;
+ if (!alloc)
+ return NULL;
+
+ context = alloc_pgtable_page(iommu->node);
+ if (!context)
+ return NULL;
+
+ __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
+ phy_addr = virt_to_phys((void *)context);
+ *entry = phy_addr | 1;
+ __iommu_flush_cache(iommu, entry, sizeof(*entry));
+ }
+ return &context[devfn];
+}
+
+static bool attach_deferred(struct device *dev)
+{
+ return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
+}
+
+/**
+ * is_downstream_to_pci_bridge - test if a device belongs to the PCI
+ * sub-hierarchy of a candidate PCI-PCI bridge
+ * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
+ * @bridge: the candidate PCI-PCI bridge
+ *
+ * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
+ */
+static bool
+is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
+{
+ struct pci_dev *pdev, *pbridge;
+
+ if (!dev_is_pci(dev) || !dev_is_pci(bridge))
+ return false;
+
+ pdev = to_pci_dev(dev);
+ pbridge = to_pci_dev(bridge);
+
+ if (pbridge->subordinate &&
+ pbridge->subordinate->number <= pdev->bus->number &&
+ pbridge->subordinate->busn_res.end >= pdev->bus->number)
+ return true;
+
+ return false;
+}
+
+static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
+{
+ struct dmar_drhd_unit *drhd;
+ u32 vtbar;
+ int rc;
+
+ /* We know that this device on this chipset has its own IOMMU.
+ * If we find it under a different IOMMU, then the BIOS is lying
+ * to us. Hope that the IOMMU for this device is actually
+ * disabled, and it needs no translation...
+ */
+ rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
+ if (rc) {
+ /* "can't" happen */
+ dev_info(&pdev->dev, "failed to run vt-d quirk\n");
+ return false;
+ }
+ vtbar &= 0xffff0000;
+
+ /* we know that the this iommu should be at offset 0xa000 from vtbar */
+ drhd = dmar_find_matched_drhd_unit(pdev);
+ if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
+ pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+ return true;
+ }
+
+ return false;
+}
+
+static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
+{
+ if (!iommu || iommu->drhd->ignored)
+ return true;
+
+ if (dev_is_pci(dev)) {
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
+ pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
+ quirk_ioat_snb_local_iommu(pdev))
+ return true;
+ }
+
+ return false;
+}
+
+struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
+{
+ struct dmar_drhd_unit *drhd = NULL;
+ struct pci_dev *pdev = NULL;
+ struct intel_iommu *iommu;
+ struct device *tmp;
+ u16 segment = 0;
+ int i;
+
+ if (!dev)
+ return NULL;
+
+ if (dev_is_pci(dev)) {
+ struct pci_dev *pf_pdev;
+
+ pdev = pci_real_dma_dev(to_pci_dev(dev));
+
+ /* VFs aren't listed in scope tables; we need to look up
+ * the PF instead to find the IOMMU. */
+ pf_pdev = pci_physfn(pdev);
+ dev = &pf_pdev->dev;
+ segment = pci_domain_nr(pdev->bus);
+ } else if (has_acpi_companion(dev))
+ dev = &ACPI_COMPANION(dev)->dev;
+
+ rcu_read_lock();
+ for_each_iommu(iommu, drhd) {
+ if (pdev && segment != drhd->segment)
+ continue;
+
+ for_each_active_dev_scope(drhd->devices,
+ drhd->devices_cnt, i, tmp) {
+ if (tmp == dev) {
+ /* For a VF use its original BDF# not that of the PF
+ * which we used for the IOMMU lookup. Strictly speaking
+ * we could do this for all PCI devices; we only need to
+ * get the BDF# from the scope table for ACPI matches. */
+ if (pdev && pdev->is_virtfn)
+ goto got_pdev;
+
+ if (bus && devfn) {
+ *bus = drhd->devices[i].bus;
+ *devfn = drhd->devices[i].devfn;
+ }
+ goto out;
+ }
+
+ if (is_downstream_to_pci_bridge(dev, tmp))
+ goto got_pdev;
+ }
+
+ if (pdev && drhd->include_all) {
+ got_pdev:
+ if (bus && devfn) {
+ *bus = pdev->bus->number;
+ *devfn = pdev->devfn;
+ }
+ goto out;
+ }
+ }
+ iommu = NULL;
+ out:
+ if (iommu_is_dummy(iommu, dev))
+ iommu = NULL;
+
+ rcu_read_unlock();
+
+ return iommu;
+}
+
+static void domain_flush_cache(struct dmar_domain *domain,
+ void *addr, int size)
+{
+ if (!domain->iommu_coherency)
+ clflush_cache_range(addr, size);
+}
+
+static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+ struct context_entry *context;
+ int ret = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ context = iommu_context_addr(iommu, bus, devfn, 0);
+ if (context)
+ ret = context_present(context);
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ return ret;
+}
+
+static void free_context_table(struct intel_iommu *iommu)
+{
+ int i;
+ unsigned long flags;
+ struct context_entry *context;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ if (!iommu->root_entry) {
+ goto out;
+ }
+ for (i = 0; i < ROOT_ENTRY_NR; i++) {
+ context = iommu_context_addr(iommu, i, 0, 0);
+ if (context)
+ free_pgtable_page(context);
+
+ if (!sm_supported(iommu))
+ continue;
+
+ context = iommu_context_addr(iommu, i, 0x80, 0);
+ if (context)
+ free_pgtable_page(context);
+
+ }
+ free_pgtable_page(iommu->root_entry);
+ iommu->root_entry = NULL;
+out:
+ spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
+ unsigned long pfn, int *target_level)
+{
+ struct dma_pte *parent, *pte;
+ int level = agaw_to_level(domain->agaw);
+ int offset;
+
+ BUG_ON(!domain->pgd);
+
+ if (!domain_pfn_supported(domain, pfn))
+ /* Address beyond IOMMU's addressing capabilities. */
+ return NULL;
+
+ parent = domain->pgd;
+
+ while (1) {
+ void *tmp_page;
+
+ offset = pfn_level_offset(pfn, level);
+ pte = &parent[offset];
+ if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
+ break;
+ if (level == *target_level)
+ break;
+
+ if (!dma_pte_present(pte)) {
+ uint64_t pteval;
+
+ tmp_page = alloc_pgtable_page(domain->nid);
+
+ if (!tmp_page)
+ return NULL;
+
+ domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
+ pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
+ if (domain_use_first_level(domain)) {
+ pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
+ if (domain->domain.type == IOMMU_DOMAIN_DMA)
+ pteval |= DMA_FL_PTE_ACCESS;
+ }
+ if (cmpxchg64(&pte->val, 0ULL, pteval))
+ /* Someone else set it while we were thinking; use theirs. */
+ free_pgtable_page(tmp_page);
+ else
+ domain_flush_cache(domain, pte, sizeof(*pte));
+ }
+ if (level == 1)
+ break;
+
+ parent = phys_to_virt(dma_pte_addr(pte));
+ level--;
+ }
+
+ if (!*target_level)
+ *target_level = level;
+
+ return pte;
+}
+
+/* return address's pte at specific level */
+static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
+ unsigned long pfn,
+ int level, int *large_page)
+{
+ struct dma_pte *parent, *pte;
+ int total = agaw_to_level(domain->agaw);
+ int offset;
+
+ parent = domain->pgd;
+ while (level <= total) {
+ offset = pfn_level_offset(pfn, total);
+ pte = &parent[offset];
+ if (level == total)
+ return pte;
+
+ if (!dma_pte_present(pte)) {
+ *large_page = total;
+ break;
+ }
+
+ if (dma_pte_superpage(pte)) {
+ *large_page = total;
+ return pte;
+ }
+
+ parent = phys_to_virt(dma_pte_addr(pte));
+ total--;
+ }
+ return NULL;
+}
+
+/* clear last level pte, a tlb flush should be followed */
+static void dma_pte_clear_range(struct dmar_domain *domain,
+ unsigned long start_pfn,
+ unsigned long last_pfn)
+{
+ unsigned int large_page;
+ struct dma_pte *first_pte, *pte;
+
+ BUG_ON(!domain_pfn_supported(domain, start_pfn));
+ BUG_ON(!domain_pfn_supported(domain, last_pfn));
+ BUG_ON(start_pfn > last_pfn);
+
+ /* we don't need lock here; nobody else touches the iova range */
+ do {
+ large_page = 1;
+ first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
+ if (!pte) {
+ start_pfn = align_to_level(start_pfn + 1, large_page + 1);
+ continue;
+ }
+ do {
+ dma_clear_pte(pte);
+ start_pfn += lvl_to_nr_pages(large_page);
+ pte++;
+ } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
+
+ domain_flush_cache(domain, first_pte,
+ (void *)pte - (void *)first_pte);
+
+ } while (start_pfn && start_pfn <= last_pfn);
+}
+
+static void dma_pte_free_level(struct dmar_domain *domain, int level,
+ int retain_level, struct dma_pte *pte,
+ unsigned long pfn, unsigned long start_pfn,
+ unsigned long last_pfn)
+{
+ pfn = max(start_pfn, pfn);
+ pte = &pte[pfn_level_offset(pfn, level)];
+
+ do {
+ unsigned long level_pfn;
+ struct dma_pte *level_pte;
+
+ if (!dma_pte_present(pte) || dma_pte_superpage(pte))
+ goto next;
+
+ level_pfn = pfn & level_mask(level);
+ level_pte = phys_to_virt(dma_pte_addr(pte));
+
+ if (level > 2) {
+ dma_pte_free_level(domain, level - 1, retain_level,
+ level_pte, level_pfn, start_pfn,
+ last_pfn);
+ }
+
+ /*
+ * Free the page table if we're below the level we want to
+ * retain and the range covers the entire table.
+ */
+ if (level < retain_level && !(start_pfn > level_pfn ||
+ last_pfn < level_pfn + level_size(level) - 1)) {
+ dma_clear_pte(pte);
+ domain_flush_cache(domain, pte, sizeof(*pte));
+ free_pgtable_page(level_pte);
+ }
+next:
+ pfn += level_size(level);
+ } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
+}
+
+/*
+ * clear last level (leaf) ptes and free page table pages below the
+ * level we wish to keep intact.
+ */
+static void dma_pte_free_pagetable(struct dmar_domain *domain,
+ unsigned long start_pfn,
+ unsigned long last_pfn,
+ int retain_level)
+{
+ BUG_ON(!domain_pfn_supported(domain, start_pfn));
+ BUG_ON(!domain_pfn_supported(domain, last_pfn));
+ BUG_ON(start_pfn > last_pfn);
+
+ dma_pte_clear_range(domain, start_pfn, last_pfn);
+
+ /* We don't need lock here; nobody else touches the iova range */
+ dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
+ domain->pgd, 0, start_pfn, last_pfn);
+
+ /* free pgd */
+ if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
+ free_pgtable_page(domain->pgd);
+ domain->pgd = NULL;
+ }
+}
+
+/* When a page at a given level is being unlinked from its parent, we don't
+ need to *modify* it at all. All we need to do is make a list of all the
+ pages which can be freed just as soon as we've flushed the IOTLB and we
+ know the hardware page-walk will no longer touch them.
+ The 'pte' argument is the *parent* PTE, pointing to the page that is to
+ be freed. */
+static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
+ int level, struct dma_pte *pte,
+ struct page *freelist)
+{
+ struct page *pg;
+
+ pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
+ pg->freelist = freelist;
+ freelist = pg;
+
+ if (level == 1)
+ return freelist;
+
+ pte = page_address(pg);
+ do {
+ if (dma_pte_present(pte) && !dma_pte_superpage(pte))
+ freelist = dma_pte_list_pagetables(domain, level - 1,
+ pte, freelist);
+ pte++;
+ } while (!first_pte_in_page(pte));
+
+ return freelist;
+}
+
+static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
+ struct dma_pte *pte, unsigned long pfn,
+ unsigned long start_pfn,
+ unsigned long last_pfn,
+ struct page *freelist)
+{
+ struct dma_pte *first_pte = NULL, *last_pte = NULL;
+
+ pfn = max(start_pfn, pfn);
+ pte = &pte[pfn_level_offset(pfn, level)];
+
+ do {
+ unsigned long level_pfn;
+
+ if (!dma_pte_present(pte))
+ goto next;
+
+ level_pfn = pfn & level_mask(level);
+
+ /* If range covers entire pagetable, free it */
+ if (start_pfn <= level_pfn &&
+ last_pfn >= level_pfn + level_size(level) - 1) {
+ /* These suborbinate page tables are going away entirely. Don't
+ bother to clear them; we're just going to *free* them. */
+ if (level > 1 && !dma_pte_superpage(pte))
+ freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
+
+ dma_clear_pte(pte);
+ if (!first_pte)
+ first_pte = pte;
+ last_pte = pte;
+ } else if (level > 1) {
+ /* Recurse down into a level that isn't *entirely* obsolete */
+ freelist = dma_pte_clear_level(domain, level - 1,
+ phys_to_virt(dma_pte_addr(pte)),
+ level_pfn, start_pfn, last_pfn,
+ freelist);
+ }
+next:
+ pfn += level_size(level);
+ } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
+
+ if (first_pte)
+ domain_flush_cache(domain, first_pte,
+ (void *)++last_pte - (void *)first_pte);
+
+ return freelist;
+}
+
+/* We can't just free the pages because the IOMMU may still be walking
+ the page tables, and may have cached the intermediate levels. The
+ pages can only be freed after the IOTLB flush has been done. */
+static struct page *domain_unmap(struct dmar_domain *domain,
+ unsigned long start_pfn,
+ unsigned long last_pfn)
+{
+ struct page *freelist;
+
+ BUG_ON(!domain_pfn_supported(domain, start_pfn));
+ BUG_ON(!domain_pfn_supported(domain, last_pfn));
+ BUG_ON(start_pfn > last_pfn);
+
+ /* we don't need lock here; nobody else touches the iova range */
+ freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
+ domain->pgd, 0, start_pfn, last_pfn, NULL);
+
+ /* free pgd */
+ if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
+ struct page *pgd_page = virt_to_page(domain->pgd);
+ pgd_page->freelist = freelist;
+ freelist = pgd_page;
+
+ domain->pgd = NULL;
+ }
+
+ return freelist;
+}
+
+static void dma_free_pagelist(struct page *freelist)
+{
+ struct page *pg;
+
+ while ((pg = freelist)) {
+ freelist = pg->freelist;
+ free_pgtable_page(page_address(pg));
+ }
+}
+
+static void iova_entry_free(unsigned long data)
+{
+ struct page *freelist = (struct page *)data;
+
+ dma_free_pagelist(freelist);
+}
+
+/* iommu handling */
+static int iommu_alloc_root_entry(struct intel_iommu *iommu)
+{
+ struct root_entry *root;
+ unsigned long flags;
+
+ root = (struct root_entry *)alloc_pgtable_page(iommu->node);
+ if (!root) {
+ pr_err("Allocating root entry for %s failed\n",
+ iommu->name);
+ return -ENOMEM;
+ }
+
+ __iommu_flush_cache(iommu, root, ROOT_SIZE);
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ iommu->root_entry = root;
+ spin_unlock_irqrestore(&iommu->lock, flags);
+
+ return 0;
+}
+
+static void iommu_set_root_entry(struct intel_iommu *iommu)
+{
+ u64 addr;
+ u32 sts;
+ unsigned long flag;
+
+ addr = virt_to_phys(iommu->root_entry);
+ if (sm_supported(iommu))
+ addr |= DMA_RTADDR_SMT;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
+
+ writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
+
+ /* Make sure hardware complete it */
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (sts & DMA_GSTS_RTPS), sts);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+ iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
+ if (sm_supported(iommu))
+ qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
+ iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
+}
+
+void iommu_flush_write_buffer(struct intel_iommu *iommu)
+{
+ u32 val;
+ unsigned long flag;
+
+ if (!rwbf_quirk && !cap_rwbf(iommu->cap))
+ return;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
+
+ /* Make sure hardware complete it */
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (!(val & DMA_GSTS_WBFS)), val);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+/* return value determine if we need a write buffer flush */
+static void __iommu_flush_context(struct intel_iommu *iommu,
+ u16 did, u16 source_id, u8 function_mask,
+ u64 type)
+{
+ u64 val = 0;
+ unsigned long flag;
+
+ switch (type) {
+ case DMA_CCMD_GLOBAL_INVL:
+ val = DMA_CCMD_GLOBAL_INVL;
+ break;
+ case DMA_CCMD_DOMAIN_INVL:
+ val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
+ break;
+ case DMA_CCMD_DEVICE_INVL:
+ val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
+ | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
+ break;
+ default:
+ BUG();
+ }
+ val |= DMA_CCMD_ICC;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
+
+ /* Make sure hardware complete it */
+ IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
+ dmar_readq, (!(val & DMA_CCMD_ICC)), val);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+/* return value determine if we need a write buffer flush */
+static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
+ u64 addr, unsigned int size_order, u64 type)
+{
+ int tlb_offset = ecap_iotlb_offset(iommu->ecap);
+ u64 val = 0, val_iva = 0;
+ unsigned long flag;
+
+ switch (type) {
+ case DMA_TLB_GLOBAL_FLUSH:
+ /* global flush doesn't need set IVA_REG */
+ val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
+ break;
+ case DMA_TLB_DSI_FLUSH:
+ val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+ break;
+ case DMA_TLB_PSI_FLUSH:
+ val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+ /* IH bit is passed in as part of address */
+ val_iva = size_order | addr;
+ break;
+ default:
+ BUG();
+ }
+ /* Note: set drain read/write */
+#if 0
+ /*
+ * This is probably to be super secure.. Looks like we can
+ * ignore it without any impact.
+ */
+ if (cap_read_drain(iommu->cap))
+ val |= DMA_TLB_READ_DRAIN;
+#endif
+ if (cap_write_drain(iommu->cap))
+ val |= DMA_TLB_WRITE_DRAIN;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ /* Note: Only uses first TLB reg currently */
+ if (val_iva)
+ dmar_writeq(iommu->reg + tlb_offset, val_iva);
+ dmar_writeq(iommu->reg + tlb_offset + 8, val);
+
+ /* Make sure hardware complete it */
+ IOMMU_WAIT_OP(iommu, tlb_offset + 8,
+ dmar_readq, (!(val & DMA_TLB_IVT)), val);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+ /* check IOTLB invalidation granularity */
+ if (DMA_TLB_IAIG(val) == 0)
+ pr_err("Flush IOTLB failed\n");
+ if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
+ pr_debug("TLB flush request %Lx, actual %Lx\n",
+ (unsigned long long)DMA_TLB_IIRG(type),
+ (unsigned long long)DMA_TLB_IAIG(val));
+}
+
+static struct device_domain_info *
+iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
+ u8 bus, u8 devfn)
+{
+ struct device_domain_info *info;
+
+ assert_spin_locked(&device_domain_lock);
+
+ if (!iommu->qi)
+ return NULL;
+
+ list_for_each_entry(info, &domain->devices, link)
+ if (info->iommu == iommu && info->bus == bus &&
+ info->devfn == devfn) {
+ if (info->ats_supported && info->dev)
+ return info;
+ break;
+ }
+
+ return NULL;
+}
+
+static void domain_update_iotlb(struct dmar_domain *domain)
+{
+ struct device_domain_info *info;
+ bool has_iotlb_device = false;
+
+ assert_spin_locked(&device_domain_lock);
+
+ list_for_each_entry(info, &domain->devices, link) {
+ struct pci_dev *pdev;
+
+ if (!info->dev || !dev_is_pci(info->dev))
+ continue;
+
+ pdev = to_pci_dev(info->dev);
+ if (pdev->ats_enabled) {
+ has_iotlb_device = true;
+ break;
+ }
+ }
+
+ domain->has_iotlb_device = has_iotlb_device;
+}
+
+static void iommu_enable_dev_iotlb(struct device_domain_info *info)
+{
+ struct pci_dev *pdev;
+
+ assert_spin_locked(&device_domain_lock);
+
+ if (!info || !dev_is_pci(info->dev))
+ return;
+
+ pdev = to_pci_dev(info->dev);
+ /* For IOMMU that supports device IOTLB throttling (DIT), we assign
+ * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
+ * queue depth at PF level. If DIT is not set, PFSID will be treated as
+ * reserved, which should be set to 0.
+ */
+ if (!ecap_dit(info->iommu->ecap))
+ info->pfsid = 0;
+ else {
+ struct pci_dev *pf_pdev;
+
+ /* pdev will be returned if device is not a vf */
+ pf_pdev = pci_physfn(pdev);
+ info->pfsid = pci_dev_id(pf_pdev);
+ }
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+ /* The PCIe spec, in its wisdom, declares that the behaviour of
+ the device if you enable PASID support after ATS support is
+ undefined. So always enable PASID support on devices which
+ have it, even if we can't yet know if we're ever going to
+ use it. */
+ if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
+ info->pasid_enabled = 1;
+
+ if (info->pri_supported &&
+ (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
+ !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
+ info->pri_enabled = 1;
+#endif
+ if (info->ats_supported && pci_ats_page_aligned(pdev) &&
+ !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
+ info->ats_enabled = 1;
+ domain_update_iotlb(info->domain);
+ info->ats_qdep = pci_ats_queue_depth(pdev);
+ }
+}
+
+static void iommu_disable_dev_iotlb(struct device_domain_info *info)
+{
+ struct pci_dev *pdev;
+
+ assert_spin_locked(&device_domain_lock);
+
+ if (!dev_is_pci(info->dev))
+ return;
+
+ pdev = to_pci_dev(info->dev);
+
+ if (info->ats_enabled) {
+ pci_disable_ats(pdev);
+ info->ats_enabled = 0;
+ domain_update_iotlb(info->domain);
+ }
+#ifdef CONFIG_INTEL_IOMMU_SVM
+ if (info->pri_enabled) {
+ pci_disable_pri(pdev);
+ info->pri_enabled = 0;
+ }
+ if (info->pasid_enabled) {
+ pci_disable_pasid(pdev);
+ info->pasid_enabled = 0;
+ }
+#endif
+}
+
+static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
+ u64 addr, unsigned mask)
+{
+ u16 sid, qdep;
+ unsigned long flags;
+ struct device_domain_info *info;
+
+ if (!domain->has_iotlb_device)
+ return;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ list_for_each_entry(info, &domain->devices, link) {
+ if (!info->ats_enabled)
+ continue;
+
+ sid = info->bus << 8 | info->devfn;
+ qdep = info->ats_qdep;
+ qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
+ qdep, addr, mask);
+ }
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
+static void domain_flush_piotlb(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ u64 addr, unsigned long npages, bool ih)
+{
+ u16 did = domain->iommu_did[iommu->seq_id];
+
+ if (domain->default_pasid)
+ qi_flush_piotlb(iommu, did, domain->default_pasid,
+ addr, npages, ih);
+
+ if (!list_empty(&domain->devices))
+ qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
+}
+
+static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ unsigned long pfn, unsigned int pages,
+ int ih, int map)
+{
+ unsigned int aligned_pages = __roundup_pow_of_two(pages);
+ unsigned int mask = ilog2(aligned_pages);
+ uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
+ u16 did = domain->iommu_did[iommu->seq_id];
+
+ BUG_ON(pages == 0);
+
+ if (ih)
+ ih = 1 << 6;
+
+ if (domain_use_first_level(domain)) {
+ domain_flush_piotlb(iommu, domain, addr, pages, ih);
+ } else {
+ unsigned long bitmask = aligned_pages - 1;
+
+ /*
+ * PSI masks the low order bits of the base address. If the
+ * address isn't aligned to the mask, then compute a mask value
+ * needed to ensure the target range is flushed.
+ */
+ if (unlikely(bitmask & pfn)) {
+ unsigned long end_pfn = pfn + pages - 1, shared_bits;
+
+ /*
+ * Since end_pfn <= pfn + bitmask, the only way bits
+ * higher than bitmask can differ in pfn and end_pfn is
+ * by carrying. This means after masking out bitmask,
+ * high bits starting with the first set bit in
+ * shared_bits are all equal in both pfn and end_pfn.
+ */
+ shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
+ mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
+ }
+
+ /*
+ * Fallback to domain selective flush if no PSI support or
+ * the size is too big.
+ */
+ if (!cap_pgsel_inv(iommu->cap) ||
+ mask > cap_max_amask_val(iommu->cap))
+ iommu->flush.flush_iotlb(iommu, did, 0, 0,
+ DMA_TLB_DSI_FLUSH);
+ else
+ iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
+ DMA_TLB_PSI_FLUSH);
+ }
+
+ /*
+ * In caching mode, changes of pages from non-present to present require
+ * flush. However, device IOTLB doesn't need to be flushed in this case.
+ */
+ if (!cap_caching_mode(iommu->cap) || !map)
+ iommu_flush_dev_iotlb(domain, addr, mask);
+}
+
+/* Notification for newly created mappings */
+static inline void __mapping_notify_one(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ unsigned long pfn, unsigned int pages)
+{
+ /*
+ * It's a non-present to present mapping. Only flush if caching mode
+ * and second level.
+ */
+ if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
+ iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
+ else
+ iommu_flush_write_buffer(iommu);
+}
+
+static void iommu_flush_iova(struct iova_domain *iovad)
+{
+ struct dmar_domain *domain;
+ int idx;
+
+ domain = container_of(iovad, struct dmar_domain, iovad);
+
+ for_each_domain_iommu(idx, domain) {
+ struct intel_iommu *iommu = g_iommus[idx];
+ u16 did = domain->iommu_did[iommu->seq_id];
+
+ if (domain_use_first_level(domain))
+ domain_flush_piotlb(iommu, domain, 0, -1, 0);
+ else
+ iommu->flush.flush_iotlb(iommu, did, 0, 0,
+ DMA_TLB_DSI_FLUSH);
+
+ if (!cap_caching_mode(iommu->cap))
+ iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
+ 0, MAX_AGAW_PFN_WIDTH);
+ }
+}
+
+static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
+{
+ u32 pmen;
+ unsigned long flags;
+
+ if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
+ return;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+ pmen = readl(iommu->reg + DMAR_PMEN_REG);
+ pmen &= ~DMA_PMEN_EPM;
+ writel(pmen, iommu->reg + DMAR_PMEN_REG);
+
+ /* wait for the protected region status bit to clear */
+ IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
+ readl, !(pmen & DMA_PMEN_PRS), pmen);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static void iommu_enable_translation(struct intel_iommu *iommu)
+{
+ u32 sts;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+ iommu->gcmd |= DMA_GCMD_TE;
+ writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+ /* Make sure hardware complete it */
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (sts & DMA_GSTS_TES), sts);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static void iommu_disable_translation(struct intel_iommu *iommu)
+{
+ u32 sts;
+ unsigned long flag;
+
+ if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
+ (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
+ return;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ iommu->gcmd &= ~DMA_GCMD_TE;
+ writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+ /* Make sure hardware complete it */
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (!(sts & DMA_GSTS_TES)), sts);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+static int iommu_init_domains(struct intel_iommu *iommu)
+{
+ u32 ndomains, nlongs;
+ size_t size;
+
+ ndomains = cap_ndoms(iommu->cap);
+ pr_debug("%s: Number of Domains supported <%d>\n",
+ iommu->name, ndomains);
+ nlongs = BITS_TO_LONGS(ndomains);
+
+ spin_lock_init(&iommu->lock);
+
+ iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
+ if (!iommu->domain_ids) {
+ pr_err("%s: Allocating domain id array failed\n",
+ iommu->name);
+ return -ENOMEM;
+ }
+
+ size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
+ iommu->domains = kzalloc(size, GFP_KERNEL);
+
+ if (iommu->domains) {
+ size = 256 * sizeof(struct dmar_domain *);
+ iommu->domains[0] = kzalloc(size, GFP_KERNEL);
+ }
+
+ if (!iommu->domains || !iommu->domains[0]) {
+ pr_err("%s: Allocating domain array failed\n",
+ iommu->name);
+ kfree(iommu->domain_ids);
+ kfree(iommu->domains);
+ iommu->domain_ids = NULL;
+ iommu->domains = NULL;
+ return -ENOMEM;
+ }
+
+ /*
+ * If Caching mode is set, then invalid translations are tagged
+ * with domain-id 0, hence we need to pre-allocate it. We also
+ * use domain-id 0 as a marker for non-allocated domain-id, so
+ * make sure it is not used for a real domain.
+ */
+ set_bit(0, iommu->domain_ids);
+
+ /*
+ * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
+ * entry for first-level or pass-through translation modes should
+ * be programmed with a domain id different from those used for
+ * second-level or nested translation. We reserve a domain id for
+ * this purpose.
+ */
+ if (sm_supported(iommu))
+ set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
+
+ return 0;
+}
+
+static void disable_dmar_iommu(struct intel_iommu *iommu)
+{
+ struct device_domain_info *info, *tmp;
+ unsigned long flags;
+
+ if (!iommu->domains || !iommu->domain_ids)
+ return;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
+ if (info->iommu != iommu)
+ continue;
+
+ if (!info->dev || !info->domain)
+ continue;
+
+ __dmar_remove_one_dev_info(info);
+ }
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ if (iommu->gcmd & DMA_GCMD_TE)
+ iommu_disable_translation(iommu);
+}
+
+static void free_dmar_iommu(struct intel_iommu *iommu)
+{
+ if ((iommu->domains) && (iommu->domain_ids)) {
+ int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
+ int i;
+
+ for (i = 0; i < elems; i++)
+ kfree(iommu->domains[i]);
+ kfree(iommu->domains);
+ kfree(iommu->domain_ids);
+ iommu->domains = NULL;
+ iommu->domain_ids = NULL;
+ }
+
+ g_iommus[iommu->seq_id] = NULL;
+
+ /* free context mapping */
+ free_context_table(iommu);
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+ if (pasid_supported(iommu)) {
+ if (ecap_prs(iommu->ecap))
+ intel_svm_finish_prq(iommu);
+ }
+ if (vccap_pasid(iommu->vccap))
+ ioasid_unregister_allocator(&iommu->pasid_allocator);
+
+#endif
+}
+
+/*
+ * Check and return whether first level is used by default for
+ * DMA translation.
+ */
+static bool first_level_by_default(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ static int first_level_support = -1;
+
+ if (likely(first_level_support != -1))
+ return first_level_support;
+
+ first_level_support = 1;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
+ first_level_support = 0;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return first_level_support;
+}
+
+static struct dmar_domain *alloc_domain(int flags)
+{
+ struct dmar_domain *domain;
+
+ domain = alloc_domain_mem();
+ if (!domain)
+ return NULL;
+
+ memset(domain, 0, sizeof(*domain));
+ domain->nid = NUMA_NO_NODE;
+ domain->flags = flags;
+ if (first_level_by_default())
+ domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
+ domain->has_iotlb_device = false;
+ INIT_LIST_HEAD(&domain->devices);
+
+ return domain;
+}
+
+/* Must be called with iommu->lock */
+static int domain_attach_iommu(struct dmar_domain *domain,
+ struct intel_iommu *iommu)
+{
+ unsigned long ndomains;
+ int num;
+
+ assert_spin_locked(&device_domain_lock);
+ assert_spin_locked(&iommu->lock);
+
+ domain->iommu_refcnt[iommu->seq_id] += 1;
+ domain->iommu_count += 1;
+ if (domain->iommu_refcnt[iommu->seq_id] == 1) {
+ ndomains = cap_ndoms(iommu->cap);
+ num = find_first_zero_bit(iommu->domain_ids, ndomains);
+
+ if (num >= ndomains) {
+ pr_err("%s: No free domain ids\n", iommu->name);
+ domain->iommu_refcnt[iommu->seq_id] -= 1;
+ domain->iommu_count -= 1;
+ return -ENOSPC;
+ }
+
+ set_bit(num, iommu->domain_ids);
+ set_iommu_domain(iommu, num, domain);
+
+ domain->iommu_did[iommu->seq_id] = num;
+ domain->nid = iommu->node;
+
+ domain_update_iommu_cap(domain);
+ }
+
+ return 0;
+}
+
+static int domain_detach_iommu(struct dmar_domain *domain,
+ struct intel_iommu *iommu)
+{
+ int num, count;
+
+ assert_spin_locked(&device_domain_lock);
+ assert_spin_locked(&iommu->lock);
+
+ domain->iommu_refcnt[iommu->seq_id] -= 1;
+ count = --domain->iommu_count;
+ if (domain->iommu_refcnt[iommu->seq_id] == 0) {
+ num = domain->iommu_did[iommu->seq_id];
+ clear_bit(num, iommu->domain_ids);
+ set_iommu_domain(iommu, num, NULL);
+
+ domain_update_iommu_cap(domain);
+ domain->iommu_did[iommu->seq_id] = 0;
+ }
+
+ return count;
+}
+
+static struct iova_domain reserved_iova_list;
+static struct lock_class_key reserved_rbtree_key;
+
+static int dmar_init_reserved_ranges(void)
+{
+ struct pci_dev *pdev = NULL;
+ struct iova *iova;
+ int i;
+
+ init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
+
+ lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
+ &reserved_rbtree_key);
+
+ /* IOAPIC ranges shouldn't be accessed by DMA */
+ iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
+ IOVA_PFN(IOAPIC_RANGE_END));
+ if (!iova) {
+ pr_err("Reserve IOAPIC range failed\n");
+ return -ENODEV;
+ }
+
+ /* Reserve all PCI MMIO to avoid peer-to-peer access */
+ for_each_pci_dev(pdev) {
+ struct resource *r;
+
+ for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+ r = &pdev->resource[i];
+ if (!r->flags || !(r->flags & IORESOURCE_MEM))
+ continue;
+ iova = reserve_iova(&reserved_iova_list,
+ IOVA_PFN(r->start),
+ IOVA_PFN(r->end));
+ if (!iova) {
+ pci_err(pdev, "Reserve iova for %pR failed\n", r);
+ return -ENODEV;
+ }
+ }
+ }
+ return 0;
+}
+
+static inline int guestwidth_to_adjustwidth(int gaw)
+{
+ int agaw;
+ int r = (gaw - 12) % 9;
+
+ if (r == 0)
+ agaw = gaw;
+ else
+ agaw = gaw + 9 - r;
+ if (agaw > 64)
+ agaw = 64;
+ return agaw;
+}
+
+static void domain_exit(struct dmar_domain *domain)
+{
+
+ /* Remove associated devices and clear attached or cached domains */
+ domain_remove_dev_info(domain);
+
+ /* destroy iovas */
+ if (domain->domain.type == IOMMU_DOMAIN_DMA)
+ put_iova_domain(&domain->iovad);
+
+ if (domain->pgd) {
+ struct page *freelist;
+
+ freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
+ dma_free_pagelist(freelist);
+ }
+
+ free_domain_mem(domain);
+}
+
+/*
+ * Get the PASID directory size for scalable mode context entry.
+ * Value of X in the PDTS field of a scalable mode context entry
+ * indicates PASID directory with 2^(X + 7) entries.
+ */
+static inline unsigned long context_get_sm_pds(struct pasid_table *table)
+{
+ int pds, max_pde;
+
+ max_pde = table->max_pasid >> PASID_PDE_SHIFT;
+ pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
+ if (pds < 7)
+ return 0;
+
+ return pds - 7;
+}
+
+/*
+ * Set the RID_PASID field of a scalable mode context entry. The
+ * IOMMU hardware will use the PASID value set in this field for
+ * DMA translations of DMA requests without PASID.
+ */
+static inline void
+context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
+{
+ context->hi |= pasid & ((1 << 20) - 1);
+}
+
+/*
+ * Set the DTE(Device-TLB Enable) field of a scalable mode context
+ * entry.
+ */
+static inline void context_set_sm_dte(struct context_entry *context)
+{
+ context->lo |= (1 << 2);
+}
+
+/*
+ * Set the PRE(Page Request Enable) field of a scalable mode context
+ * entry.
+ */
+static inline void context_set_sm_pre(struct context_entry *context)
+{
+ context->lo |= (1 << 4);
+}
+
+/* Convert value to context PASID directory size field coding. */
+#define context_pdts(pds) (((pds) & 0x7) << 9)
+
+static int domain_context_mapping_one(struct dmar_domain *domain,
+ struct intel_iommu *iommu,
+ struct pasid_table *table,
+ u8 bus, u8 devfn)
+{
+ u16 did = domain->iommu_did[iommu->seq_id];
+ int translation = CONTEXT_TT_MULTI_LEVEL;
+ struct device_domain_info *info = NULL;
+ struct context_entry *context;
+ unsigned long flags;
+ int ret;
+
+ WARN_ON(did == 0);
+
+ if (hw_pass_through && domain_type_is_si(domain))
+ translation = CONTEXT_TT_PASS_THROUGH;
+
+ pr_debug("Set context mapping for %02x:%02x.%d\n",
+ bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+ BUG_ON(!domain->pgd);
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ spin_lock(&iommu->lock);
+
+ ret = -ENOMEM;
+ context = iommu_context_addr(iommu, bus, devfn, 1);
+ if (!context)
+ goto out_unlock;
+
+ ret = 0;
+ if (context_present(context))
+ goto out_unlock;
+
+ /*
+ * For kdump cases, old valid entries may be cached due to the
+ * in-flight DMA and copied pgtable, but there is no unmapping
+ * behaviour for them, thus we need an explicit cache flush for
+ * the newly-mapped device. For kdump, at this point, the device
+ * is supposed to finish reset at its driver probe stage, so no
+ * in-flight DMA will exist, and we don't need to worry anymore
+ * hereafter.
+ */
+ if (context_copied(context)) {
+ u16 did_old = context_domain_id(context);
+
+ if (did_old < cap_ndoms(iommu->cap)) {
+ iommu->flush.flush_context(iommu, did_old,
+ (((u16)bus) << 8) | devfn,
+ DMA_CCMD_MASK_NOBIT,
+ DMA_CCMD_DEVICE_INVL);
+ iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
+ DMA_TLB_DSI_FLUSH);
+ }
+ }
+
+ context_clear_entry(context);
+
+ if (sm_supported(iommu)) {
+ unsigned long pds;
+
+ WARN_ON(!table);
+
+ /* Setup the PASID DIR pointer: */
+ pds = context_get_sm_pds(table);
+ context->lo = (u64)virt_to_phys(table->table) |
+ context_pdts(pds);
+
+ /* Setup the RID_PASID field: */
+ context_set_sm_rid2pasid(context, PASID_RID2PASID);
+
+ /*
+ * Setup the Device-TLB enable bit and Page request
+ * Enable bit:
+ */
+ info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
+ if (info && info->ats_supported)
+ context_set_sm_dte(context);
+ if (info && info->pri_supported)
+ context_set_sm_pre(context);
+ } else {
+ struct dma_pte *pgd = domain->pgd;
+ int agaw;
+
+ context_set_domain_id(context, did);
+
+ if (translation != CONTEXT_TT_PASS_THROUGH) {
+ /*
+ * Skip top levels of page tables for iommu which has
+ * less agaw than default. Unnecessary for PT mode.
+ */
+ for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
+ ret = -ENOMEM;
+ pgd = phys_to_virt(dma_pte_addr(pgd));
+ if (!dma_pte_present(pgd))
+ goto out_unlock;
+ }
+
+ info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
+ if (info && info->ats_supported)
+ translation = CONTEXT_TT_DEV_IOTLB;
+ else
+ translation = CONTEXT_TT_MULTI_LEVEL;
+
+ context_set_address_root(context, virt_to_phys(pgd));
+ context_set_address_width(context, agaw);
+ } else {
+ /*
+ * In pass through mode, AW must be programmed to
+ * indicate the largest AGAW value supported by
+ * hardware. And ASR is ignored by hardware.
+ */
+ context_set_address_width(context, iommu->msagaw);
+ }
+
+ context_set_translation_type(context, translation);
+ }
+
+ context_set_fault_enable(context);
+ context_set_present(context);
+ if (!ecap_coherent(iommu->ecap))
+ clflush_cache_range(context, sizeof(*context));
+
+ /*
+ * It's a non-present to present mapping. If hardware doesn't cache
+ * non-present entry we only need to flush the write-buffer. If the
+ * _does_ cache non-present entries, then it does so in the special
+ * domain #0, which we have to flush:
+ */
+ if (cap_caching_mode(iommu->cap)) {
+ iommu->flush.flush_context(iommu, 0,
+ (((u16)bus) << 8) | devfn,
+ DMA_CCMD_MASK_NOBIT,
+ DMA_CCMD_DEVICE_INVL);
+ iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
+ } else {
+ iommu_flush_write_buffer(iommu);
+ }
+ iommu_enable_dev_iotlb(info);
+
+ ret = 0;
+
+out_unlock:
+ spin_unlock(&iommu->lock);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ return ret;
+}
+
+struct domain_context_mapping_data {
+ struct dmar_domain *domain;
+ struct intel_iommu *iommu;
+ struct pasid_table *table;
+};
+
+static int domain_context_mapping_cb(struct pci_dev *pdev,
+ u16 alias, void *opaque)
+{
+ struct domain_context_mapping_data *data = opaque;
+
+ return domain_context_mapping_one(data->domain, data->iommu,
+ data->table, PCI_BUS_NUM(alias),
+ alias & 0xff);
+}
+
+static int
+domain_context_mapping(struct dmar_domain *domain, struct device *dev)
+{
+ struct domain_context_mapping_data data;
+ struct pasid_table *table;
+ struct intel_iommu *iommu;
+ u8 bus, devfn;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return -ENODEV;
+
+ table = intel_pasid_get_table(dev);
+
+ if (!dev_is_pci(dev))
+ return domain_context_mapping_one(domain, iommu, table,
+ bus, devfn);
+
+ data.domain = domain;
+ data.iommu = iommu;
+ data.table = table;
+
+ return pci_for_each_dma_alias(to_pci_dev(dev),
+ &domain_context_mapping_cb, &data);
+}
+
+static int domain_context_mapped_cb(struct pci_dev *pdev,
+ u16 alias, void *opaque)
+{
+ struct intel_iommu *iommu = opaque;
+
+ return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
+}
+
+static int domain_context_mapped(struct device *dev)
+{
+ struct intel_iommu *iommu;
+ u8 bus, devfn;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return -ENODEV;
+
+ if (!dev_is_pci(dev))
+ return device_context_mapped(iommu, bus, devfn);
+
+ return !pci_for_each_dma_alias(to_pci_dev(dev),
+ domain_context_mapped_cb, iommu);
+}
+
+/* Returns a number of VTD pages, but aligned to MM page size */
+static inline unsigned long aligned_nrpages(unsigned long host_addr,
+ size_t size)
+{
+ host_addr &= ~PAGE_MASK;
+ return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
+}
+
+/* Return largest possible superpage level for a given mapping */
+static inline int hardware_largepage_caps(struct dmar_domain *domain,
+ unsigned long iov_pfn,
+ unsigned long phy_pfn,
+ unsigned long pages)
+{
+ int support, level = 1;
+ unsigned long pfnmerge;
+
+ support = domain->iommu_superpage;
+
+ /* To use a large page, the virtual *and* physical addresses
+ must be aligned to 2MiB/1GiB/etc. Lower bits set in either
+ of them will mean we have to use smaller pages. So just
+ merge them and check both at once. */
+ pfnmerge = iov_pfn | phy_pfn;
+
+ while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
+ pages >>= VTD_STRIDE_SHIFT;
+ if (!pages)
+ break;
+ pfnmerge >>= VTD_STRIDE_SHIFT;
+ level++;
+ support--;
+ }
+ return level;
+}
+
+static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
+ struct scatterlist *sg, unsigned long phys_pfn,
+ unsigned long nr_pages, int prot)
+{
+ struct dma_pte *first_pte = NULL, *pte = NULL;
+ phys_addr_t pteval;
+ unsigned long sg_res = 0;
+ unsigned int largepage_lvl = 0;
+ unsigned long lvl_pages = 0;
+ u64 attr;
+
+ BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
+
+ if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
+ return -EINVAL;
+
+ attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
+ attr |= DMA_FL_PTE_PRESENT;
+ if (domain_use_first_level(domain)) {
+ attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
+
+ if (domain->domain.type == IOMMU_DOMAIN_DMA) {
+ attr |= DMA_FL_PTE_ACCESS;
+ if (prot & DMA_PTE_WRITE)
+ attr |= DMA_FL_PTE_DIRTY;
+ }
+ }
+
+ if (!sg) {
+ sg_res = nr_pages;
+ pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
+ }
+
+ while (nr_pages > 0) {
+ uint64_t tmp;
+
+ if (!sg_res) {
+ unsigned int pgoff = sg->offset & ~PAGE_MASK;
+
+ sg_res = aligned_nrpages(sg->offset, sg->length);
+ sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
+ sg->dma_length = sg->length;
+ pteval = (sg_phys(sg) - pgoff) | attr;
+ phys_pfn = pteval >> VTD_PAGE_SHIFT;
+ }
+
+ if (!pte) {
+ largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
+
+ first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
+ if (!pte)
+ return -ENOMEM;
+ /* It is large page*/
+ if (largepage_lvl > 1) {
+ unsigned long nr_superpages, end_pfn;
+
+ pteval |= DMA_PTE_LARGE_PAGE;
+ lvl_pages = lvl_to_nr_pages(largepage_lvl);
+
+ nr_superpages = sg_res / lvl_pages;
+ end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
+
+ /*
+ * Ensure that old small page tables are
+ * removed to make room for superpage(s).
+ * We're adding new large pages, so make sure
+ * we don't remove their parent tables.
+ */
+ dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
+ largepage_lvl + 1);
+ } else {
+ pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
+ }
+
+ }
+ /* We don't need lock here, nobody else
+ * touches the iova range
+ */
+ tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
+ if (tmp) {
+ static int dumps = 5;
+ pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
+ iov_pfn, tmp, (unsigned long long)pteval);
+ if (dumps) {
+ dumps--;
+ debug_dma_dump_mappings(NULL);
+ }
+ WARN_ON(1);
+ }
+
+ lvl_pages = lvl_to_nr_pages(largepage_lvl);
+
+ BUG_ON(nr_pages < lvl_pages);
+ BUG_ON(sg_res < lvl_pages);
+
+ nr_pages -= lvl_pages;
+ iov_pfn += lvl_pages;
+ phys_pfn += lvl_pages;
+ pteval += lvl_pages * VTD_PAGE_SIZE;
+ sg_res -= lvl_pages;
+
+ /* If the next PTE would be the first in a new page, then we
+ need to flush the cache on the entries we've just written.
+ And then we'll need to recalculate 'pte', so clear it and
+ let it get set again in the if (!pte) block above.
+
+ If we're done (!nr_pages) we need to flush the cache too.
+
+ Also if we've been setting superpages, we may need to
+ recalculate 'pte' and switch back to smaller pages for the
+ end of the mapping, if the trailing size is not enough to
+ use another superpage (i.e. sg_res < lvl_pages). */
+ pte++;
+ if (!nr_pages || first_pte_in_page(pte) ||
+ (largepage_lvl > 1 && sg_res < lvl_pages)) {
+ domain_flush_cache(domain, first_pte,
+ (void *)pte - (void *)first_pte);
+ pte = NULL;
+ }
+
+ if (!sg_res && nr_pages)
+ sg = sg_next(sg);
+ }
+ return 0;
+}
+
+static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
+ struct scatterlist *sg, unsigned long phys_pfn,
+ unsigned long nr_pages, int prot)
+{
+ int iommu_id, ret;
+ struct intel_iommu *iommu;
+
+ /* Do the real mapping first */
+ ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
+ if (ret)
+ return ret;
+
+ for_each_domain_iommu(iommu_id, domain) {
+ iommu = g_iommus[iommu_id];
+ __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
+ }
+
+ return 0;
+}
+
+static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
+ struct scatterlist *sg, unsigned long nr_pages,
+ int prot)
+{
+ return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
+}
+
+static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
+ unsigned long phys_pfn, unsigned long nr_pages,
+ int prot)
+{
+ return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
+}
+
+static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+ unsigned long flags;
+ struct context_entry *context;
+ u16 did_old;
+
+ if (!iommu)
+ return;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ context = iommu_context_addr(iommu, bus, devfn, 0);
+ if (!context) {
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ return;
+ }
+ did_old = context_domain_id(context);
+ context_clear_entry(context);
+ __iommu_flush_cache(iommu, context, sizeof(*context));
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ iommu->flush.flush_context(iommu,
+ did_old,
+ (((u16)bus) << 8) | devfn,
+ DMA_CCMD_MASK_NOBIT,
+ DMA_CCMD_DEVICE_INVL);
+
+ if (sm_supported(iommu))
+ qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
+
+ iommu->flush.flush_iotlb(iommu,
+ did_old,
+ 0,
+ 0,
+ DMA_TLB_DSI_FLUSH);
+}
+
+static inline void unlink_domain_info(struct device_domain_info *info)
+{
+ assert_spin_locked(&device_domain_lock);
+ list_del(&info->link);
+ list_del(&info->global);
+ if (info->dev)
+ dev_iommu_priv_set(info->dev, NULL);
+}
+
+static void domain_remove_dev_info(struct dmar_domain *domain)
+{
+ struct device_domain_info *info, *tmp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ list_for_each_entry_safe(info, tmp, &domain->devices, link)
+ __dmar_remove_one_dev_info(info);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
+struct dmar_domain *find_domain(struct device *dev)
+{
+ struct device_domain_info *info;
+
+ if (unlikely(!dev || !dev->iommu))
+ return NULL;
+
+ if (unlikely(attach_deferred(dev)))
+ return NULL;
+
+ /* No lock here, assumes no domain exit in normal case */
+ info = get_domain_info(dev);
+ if (likely(info))
+ return info->domain;
+
+ return NULL;
+}
+
+static void do_deferred_attach(struct device *dev)
+{
+ struct iommu_domain *domain;
+
+ dev_iommu_priv_set(dev, NULL);
+ domain = iommu_get_domain_for_dev(dev);
+ if (domain)
+ intel_iommu_attach_device(domain, dev);
+}
+
+static inline struct device_domain_info *
+dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
+{
+ struct device_domain_info *info;
+
+ list_for_each_entry(info, &device_domain_list, global)
+ if (info->segment == segment && info->bus == bus &&
+ info->devfn == devfn)
+ return info;
+
+ return NULL;
+}
+
+static int domain_setup_first_level(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev,
+ u32 pasid)
+{
+ struct dma_pte *pgd = domain->pgd;
+ int agaw, level;
+ int flags = 0;
+
+ /*
+ * Skip top levels of page tables for iommu which has
+ * less agaw than default. Unnecessary for PT mode.
+ */
+ for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
+ pgd = phys_to_virt(dma_pte_addr(pgd));
+ if (!dma_pte_present(pgd))
+ return -ENOMEM;
+ }
+
+ level = agaw_to_level(agaw);
+ if (level != 4 && level != 5)
+ return -EINVAL;
+
+ if (pasid != PASID_RID2PASID)
+ flags |= PASID_FLAG_SUPERVISOR_MODE;
+ if (level == 5)
+ flags |= PASID_FLAG_FL5LP;
+
+ if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
+ flags |= PASID_FLAG_PAGE_SNOOP;
+
+ return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
+ domain->iommu_did[iommu->seq_id],
+ flags);
+}
+
+static bool dev_is_real_dma_subdevice(struct device *dev)
+{
+ return dev && dev_is_pci(dev) &&
+ pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
+}
+
+static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
+ int bus, int devfn,
+ struct device *dev,
+ struct dmar_domain *domain)
+{
+ struct dmar_domain *found = NULL;
+ struct device_domain_info *info;
+ unsigned long flags;
+ int ret;
+
+ info = alloc_devinfo_mem();
+ if (!info)
+ return NULL;
+
+ if (!dev_is_real_dma_subdevice(dev)) {
+ info->bus = bus;
+ info->devfn = devfn;
+ info->segment = iommu->segment;
+ } else {
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ info->bus = pdev->bus->number;
+ info->devfn = pdev->devfn;
+ info->segment = pci_domain_nr(pdev->bus);
+ }
+
+ info->ats_supported = info->pasid_supported = info->pri_supported = 0;
+ info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
+ info->ats_qdep = 0;
+ info->dev = dev;
+ info->domain = domain;
+ info->iommu = iommu;
+ info->pasid_table = NULL;
+ info->auxd_enabled = 0;
+ INIT_LIST_HEAD(&info->auxiliary_domains);
+
+ if (dev && dev_is_pci(dev)) {
+ struct pci_dev *pdev = to_pci_dev(info->dev);
+
+ if (ecap_dev_iotlb_support(iommu->ecap) &&
+ pci_ats_supported(pdev) &&
+ dmar_find_matched_atsr_unit(pdev))
+ info->ats_supported = 1;
+
+ if (sm_supported(iommu)) {
+ if (pasid_supported(iommu)) {
+ int features = pci_pasid_features(pdev);
+ if (features >= 0)
+ info->pasid_supported = features | 1;
+ }
+
+ if (info->ats_supported && ecap_prs(iommu->ecap) &&
+ pci_pri_supported(pdev))
+ info->pri_supported = 1;
+ }
+ }
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ if (dev)
+ found = find_domain(dev);
+
+ if (!found) {
+ struct device_domain_info *info2;
+ info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
+ info->devfn);
+ if (info2) {
+ found = info2->domain;
+ info2->dev = dev;
+ }
+ }
+
+ if (found) {
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+ free_devinfo_mem(info);
+ /* Caller must free the original domain */
+ return found;
+ }
+
+ spin_lock(&iommu->lock);
+ ret = domain_attach_iommu(domain, iommu);
+ spin_unlock(&iommu->lock);
+
+ if (ret) {
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+ free_devinfo_mem(info);
+ return NULL;
+ }
+
+ list_add(&info->link, &domain->devices);
+ list_add(&info->global, &device_domain_list);
+ if (dev)
+ dev_iommu_priv_set(dev, info);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ /* PASID table is mandatory for a PCI device in scalable mode. */
+ if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
+ ret = intel_pasid_alloc_table(dev);
+ if (ret) {
+ dev_err(dev, "PASID table allocation failed\n");
+ dmar_remove_one_dev_info(dev);
+ return NULL;
+ }
+
+ /* Setup the PASID entry for requests without PASID: */
+ spin_lock_irqsave(&iommu->lock, flags);
+ if (hw_pass_through && domain_type_is_si(domain))
+ ret = intel_pasid_setup_pass_through(iommu, domain,
+ dev, PASID_RID2PASID);
+ else if (domain_use_first_level(domain))
+ ret = domain_setup_first_level(iommu, domain, dev,
+ PASID_RID2PASID);
+ else
+ ret = intel_pasid_setup_second_level(iommu, domain,
+ dev, PASID_RID2PASID);
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ if (ret) {
+ dev_err(dev, "Setup RID2PASID failed\n");
+ dmar_remove_one_dev_info(dev);
+ return NULL;
+ }
+ }
+
+ if (dev && domain_context_mapping(domain, dev)) {
+ dev_err(dev, "Domain context map failed\n");
+ dmar_remove_one_dev_info(dev);
+ return NULL;
+ }
+
+ return domain;
+}
+
+static int iommu_domain_identity_map(struct dmar_domain *domain,
+ unsigned long first_vpfn,
+ unsigned long last_vpfn)
+{
+ /*
+ * RMRR range might have overlap with physical memory range,
+ * clear it first
+ */
+ dma_pte_clear_range(domain, first_vpfn, last_vpfn);
+
+ return __domain_mapping(domain, first_vpfn, NULL,
+ first_vpfn, last_vpfn - first_vpfn + 1,
+ DMA_PTE_READ|DMA_PTE_WRITE);
+}
+
+static int md_domain_init(struct dmar_domain *domain, int guest_width);
+
+static int __init si_domain_init(int hw)
+{
+ struct dmar_rmrr_unit *rmrr;
+ struct device *dev;
+ int i, nid, ret;
+
+ si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
+ if (!si_domain)
+ return -EFAULT;
+
+ if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
+ domain_exit(si_domain);
+ si_domain = NULL;
+ return -EFAULT;
+ }
+
+ if (hw)
+ return 0;
+
+ for_each_online_node(nid) {
+ unsigned long start_pfn, end_pfn;
+ int i;
+
+ for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+ ret = iommu_domain_identity_map(si_domain,
+ mm_to_dma_pfn(start_pfn),
+ mm_to_dma_pfn(end_pfn));
+ if (ret)
+ return ret;
+ }
+ }
+
+ /*
+ * Identity map the RMRRs so that devices with RMRRs could also use
+ * the si_domain.
+ */
+ for_each_rmrr_units(rmrr) {
+ for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
+ i, dev) {
+ unsigned long long start = rmrr->base_address;
+ unsigned long long end = rmrr->end_address;
+
+ if (WARN_ON(end < start ||
+ end >> agaw_to_width(si_domain->agaw)))
+ continue;
+
+ ret = iommu_domain_identity_map(si_domain,
+ mm_to_dma_pfn(start >> PAGE_SHIFT),
+ mm_to_dma_pfn(end >> PAGE_SHIFT));
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
+{
+ struct dmar_domain *ndomain;
+ struct intel_iommu *iommu;
+ u8 bus, devfn;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return -ENODEV;
+
+ ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
+ if (ndomain != domain)
+ return -EBUSY;
+
+ return 0;
+}
+
+static bool device_has_rmrr(struct device *dev)
+{
+ struct dmar_rmrr_unit *rmrr;
+ struct device *tmp;
+ int i;
+
+ rcu_read_lock();
+ for_each_rmrr_units(rmrr) {
+ /*
+ * Return TRUE if this RMRR contains the device that
+ * is passed in.
+ */
+ for_each_active_dev_scope(rmrr->devices,
+ rmrr->devices_cnt, i, tmp)
+ if (tmp == dev ||
+ is_downstream_to_pci_bridge(dev, tmp)) {
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+ return false;
+}
+
+/**
+ * device_rmrr_is_relaxable - Test whether the RMRR of this device
+ * is relaxable (ie. is allowed to be not enforced under some conditions)
+ * @dev: device handle
+ *
+ * We assume that PCI USB devices with RMRRs have them largely
+ * for historical reasons and that the RMRR space is not actively used post
+ * boot. This exclusion may change if vendors begin to abuse it.
+ *
+ * The same exception is made for graphics devices, with the requirement that
+ * any use of the RMRR regions will be torn down before assigning the device
+ * to a guest.
+ *
+ * Return: true if the RMRR is relaxable, false otherwise
+ */
+static bool device_rmrr_is_relaxable(struct device *dev)
+{
+ struct pci_dev *pdev;
+
+ if (!dev_is_pci(dev))
+ return false;
+
+ pdev = to_pci_dev(dev);
+ if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
+ return true;
+ else
+ return false;
+}
+
+/*
+ * There are a couple cases where we need to restrict the functionality of
+ * devices associated with RMRRs. The first is when evaluating a device for
+ * identity mapping because problems exist when devices are moved in and out
+ * of domains and their respective RMRR information is lost. This means that
+ * a device with associated RMRRs will never be in a "passthrough" domain.
+ * The second is use of the device through the IOMMU API. This interface
+ * expects to have full control of the IOVA space for the device. We cannot
+ * satisfy both the requirement that RMRR access is maintained and have an
+ * unencumbered IOVA space. We also have no ability to quiesce the device's
+ * use of the RMRR space or even inform the IOMMU API user of the restriction.
+ * We therefore prevent devices associated with an RMRR from participating in
+ * the IOMMU API, which eliminates them from device assignment.
+ *
+ * In both cases, devices which have relaxable RMRRs are not concerned by this
+ * restriction. See device_rmrr_is_relaxable comment.
+ */
+static bool device_is_rmrr_locked(struct device *dev)
+{
+ if (!device_has_rmrr(dev))
+ return false;
+
+ if (device_rmrr_is_relaxable(dev))
+ return false;
+
+ return true;
+}
+
+/*
+ * Return the required default domain type for a specific device.
+ *
+ * @dev: the device in query
+ * @startup: true if this is during early boot
+ *
+ * Returns:
+ * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
+ * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
+ * - 0: both identity and dynamic domains work for this device
+ */
+static int device_def_domain_type(struct device *dev)
+{
+ if (dev_is_pci(dev)) {
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ /*
+ * Prevent any device marked as untrusted from getting
+ * placed into the statically identity mapping domain.
+ */
+ if (pdev->untrusted)
+ return IOMMU_DOMAIN_DMA;
+
+ if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
+ return IOMMU_DOMAIN_IDENTITY;
+
+ if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
+ return IOMMU_DOMAIN_IDENTITY;
+ }
+
+ return 0;
+}
+
+static void intel_iommu_init_qi(struct intel_iommu *iommu)
+{
+ /*
+ * Start from the sane iommu hardware state.
+ * If the queued invalidation is already initialized by us
+ * (for example, while enabling interrupt-remapping) then
+ * we got the things already rolling from a sane state.
+ */
+ if (!iommu->qi) {
+ /*
+ * Clear any previous faults.
+ */
+ dmar_fault(-1, iommu);
+ /*
+ * Disable queued invalidation if supported and already enabled
+ * before OS handover.
+ */
+ dmar_disable_qi(iommu);
+ }
+
+ if (dmar_enable_qi(iommu)) {
+ /*
+ * Queued Invalidate not enabled, use Register Based Invalidate
+ */
+ iommu->flush.flush_context = __iommu_flush_context;
+ iommu->flush.flush_iotlb = __iommu_flush_iotlb;
+ pr_info("%s: Using Register based invalidation\n",
+ iommu->name);
+ } else {
+ iommu->flush.flush_context = qi_flush_context;
+ iommu->flush.flush_iotlb = qi_flush_iotlb;
+ pr_info("%s: Using Queued invalidation\n", iommu->name);
+ }
+}
+
+static int copy_context_table(struct intel_iommu *iommu,
+ struct root_entry *old_re,
+ struct context_entry **tbl,
+ int bus, bool ext)
+{
+ int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
+ struct context_entry *new_ce = NULL, ce;
+ struct context_entry *old_ce = NULL;
+ struct root_entry re;
+ phys_addr_t old_ce_phys;
+
+ tbl_idx = ext ? bus * 2 : bus;
+ memcpy(&re, old_re, sizeof(re));
+
+ for (devfn = 0; devfn < 256; devfn++) {
+ /* First calculate the correct index */
+ idx = (ext ? devfn * 2 : devfn) % 256;
+
+ if (idx == 0) {
+ /* First save what we may have and clean up */
+ if (new_ce) {
+ tbl[tbl_idx] = new_ce;
+ __iommu_flush_cache(iommu, new_ce,
+ VTD_PAGE_SIZE);
+ pos = 1;
+ }
+
+ if (old_ce)
+ memunmap(old_ce);
+
+ ret = 0;
+ if (devfn < 0x80)
+ old_ce_phys = root_entry_lctp(&re);
+ else
+ old_ce_phys = root_entry_uctp(&re);
+
+ if (!old_ce_phys) {
+ if (ext && devfn == 0) {
+ /* No LCTP, try UCTP */
+ devfn = 0x7f;
+ continue;
+ } else {
+ goto out;
+ }
+ }
+
+ ret = -ENOMEM;
+ old_ce = memremap(old_ce_phys, PAGE_SIZE,
+ MEMREMAP_WB);
+ if (!old_ce)
+ goto out;
+
+ new_ce = alloc_pgtable_page(iommu->node);
+ if (!new_ce)
+ goto out_unmap;
+
+ ret = 0;
+ }
+
+ /* Now copy the context entry */
+ memcpy(&ce, old_ce + idx, sizeof(ce));
+
+ if (!__context_present(&ce))
+ continue;
+
+ did = context_domain_id(&ce);
+ if (did >= 0 && did < cap_ndoms(iommu->cap))
+ set_bit(did, iommu->domain_ids);
+
+ /*
+ * We need a marker for copied context entries. This
+ * marker needs to work for the old format as well as
+ * for extended context entries.
+ *
+ * Bit 67 of the context entry is used. In the old
+ * format this bit is available to software, in the
+ * extended format it is the PGE bit, but PGE is ignored
+ * by HW if PASIDs are disabled (and thus still
+ * available).
+ *
+ * So disable PASIDs first and then mark the entry
+ * copied. This means that we don't copy PASID
+ * translations from the old kernel, but this is fine as
+ * faults there are not fatal.
+ */
+ context_clear_pasid_enable(&ce);
+ context_set_copied(&ce);
+
+ new_ce[idx] = ce;
+ }
+
+ tbl[tbl_idx + pos] = new_ce;
+
+ __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
+
+out_unmap:
+ memunmap(old_ce);
+
+out:
+ return ret;
+}
+
+static int copy_translation_tables(struct intel_iommu *iommu)
+{
+ struct context_entry **ctxt_tbls;
+ struct root_entry *old_rt;
+ phys_addr_t old_rt_phys;
+ int ctxt_table_entries;
+ unsigned long flags;
+ u64 rtaddr_reg;
+ int bus, ret;
+ bool new_ext, ext;
+
+ rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
+ ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
+ new_ext = !!ecap_ecs(iommu->ecap);
+
+ /*
+ * The RTT bit can only be changed when translation is disabled,
+ * but disabling translation means to open a window for data
+ * corruption. So bail out and don't copy anything if we would
+ * have to change the bit.
+ */
+ if (new_ext != ext)
+ return -EINVAL;
+
+ old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
+ if (!old_rt_phys)
+ return -EINVAL;
+
+ old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
+ if (!old_rt)
+ return -ENOMEM;
+
+ /* This is too big for the stack - allocate it from slab */
+ ctxt_table_entries = ext ? 512 : 256;
+ ret = -ENOMEM;
+ ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
+ if (!ctxt_tbls)
+ goto out_unmap;
+
+ for (bus = 0; bus < 256; bus++) {
+ ret = copy_context_table(iommu, &old_rt[bus],
+ ctxt_tbls, bus, ext);
+ if (ret) {
+ pr_err("%s: Failed to copy context table for bus %d\n",
+ iommu->name, bus);
+ continue;
+ }
+ }
+
+ spin_lock_irqsave(&iommu->lock, flags);
+
+ /* Context tables are copied, now write them to the root_entry table */
+ for (bus = 0; bus < 256; bus++) {
+ int idx = ext ? bus * 2 : bus;
+ u64 val;
+
+ if (ctxt_tbls[idx]) {
+ val = virt_to_phys(ctxt_tbls[idx]) | 1;
+ iommu->root_entry[bus].lo = val;
+ }
+
+ if (!ext || !ctxt_tbls[idx + 1])
+ continue;
+
+ val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
+ iommu->root_entry[bus].hi = val;
+ }
+
+ spin_unlock_irqrestore(&iommu->lock, flags);
+
+ kfree(ctxt_tbls);
+
+ __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
+
+ ret = 0;
+
+out_unmap:
+ memunmap(old_rt);
+
+ return ret;
+}
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
+{
+ struct intel_iommu *iommu = data;
+ ioasid_t ioasid;
+
+ if (!iommu)
+ return INVALID_IOASID;
+ /*
+ * VT-d virtual command interface always uses the full 20 bit
+ * PASID range. Host can partition guest PASID range based on
+ * policies but it is out of guest's control.
+ */
+ if (min < PASID_MIN || max > intel_pasid_max_id)
+ return INVALID_IOASID;
+
+ if (vcmd_alloc_pasid(iommu, &ioasid))
+ return INVALID_IOASID;
+
+ return ioasid;
+}
+
+static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
+{
+ struct intel_iommu *iommu = data;
+
+ if (!iommu)
+ return;
+ /*
+ * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
+ * We can only free the PASID when all the devices are unbound.
+ */
+ if (ioasid_find(NULL, ioasid, NULL)) {
+ pr_alert("Cannot free active IOASID %d\n", ioasid);
+ return;
+ }
+ vcmd_free_pasid(iommu, ioasid);
+}
+
+static void register_pasid_allocator(struct intel_iommu *iommu)
+{
+ /*
+ * If we are running in the host, no need for custom allocator
+ * in that PASIDs are allocated from the host system-wide.
+ */
+ if (!cap_caching_mode(iommu->cap))
+ return;
+
+ if (!sm_supported(iommu)) {
+ pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
+ return;
+ }
+
+ /*
+ * Register a custom PASID allocator if we are running in a guest,
+ * guest PASID must be obtained via virtual command interface.
+ * There can be multiple vIOMMUs in each guest but only one allocator
+ * is active. All vIOMMU allocators will eventually be calling the same
+ * host allocator.
+ */
+ if (!vccap_pasid(iommu->vccap))
+ return;
+
+ pr_info("Register custom PASID allocator\n");
+ iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
+ iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
+ iommu->pasid_allocator.pdata = (void *)iommu;
+ if (ioasid_register_allocator(&iommu->pasid_allocator)) {
+ pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
+ /*
+ * Disable scalable mode on this IOMMU if there
+ * is no custom allocator. Mixing SM capable vIOMMU
+ * and non-SM vIOMMU are not supported.
+ */
+ intel_iommu_sm = 0;
+ }
+}
+#endif
+
+static int __init init_dmars(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ int ret;
+
+ /*
+ * for each drhd
+ * allocate root
+ * initialize and program root entry to not present
+ * endfor
+ */
+ for_each_drhd_unit(drhd) {
+ /*
+ * lock not needed as this is only incremented in the single
+ * threaded kernel __init code path all other access are read
+ * only
+ */
+ if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
+ g_num_of_iommus++;
+ continue;
+ }
+ pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
+ }
+
+ /* Preallocate enough resources for IOMMU hot-addition */
+ if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
+ g_num_of_iommus = DMAR_UNITS_SUPPORTED;
+
+ g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
+ GFP_KERNEL);
+ if (!g_iommus) {
+ pr_err("Allocating global iommu array failed\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ for_each_iommu(iommu, drhd) {
+ if (drhd->ignored) {
+ iommu_disable_translation(iommu);
+ continue;
+ }
+
+ /*
+ * Find the max pasid size of all IOMMU's in the system.
+ * We need to ensure the system pasid table is no bigger
+ * than the smallest supported.
+ */
+ if (pasid_supported(iommu)) {
+ u32 temp = 2 << ecap_pss(iommu->ecap);
+
+ intel_pasid_max_id = min_t(u32, temp,
+ intel_pasid_max_id);
+ }
+
+ g_iommus[iommu->seq_id] = iommu;
+
+ intel_iommu_init_qi(iommu);
+
+ ret = iommu_init_domains(iommu);
+ if (ret)
+ goto free_iommu;
+
+ init_translation_status(iommu);
+
+ if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
+ iommu_disable_translation(iommu);
+ clear_translation_pre_enabled(iommu);
+ pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
+ iommu->name);
+ }
+
+ /*
+ * TBD:
+ * we could share the same root & context tables
+ * among all IOMMU's. Need to Split it later.
+ */
+ ret = iommu_alloc_root_entry(iommu);
+ if (ret)
+ goto free_iommu;
+
+ if (translation_pre_enabled(iommu)) {
+ pr_info("Translation already enabled - trying to copy translation structures\n");
+
+ ret = copy_translation_tables(iommu);
+ if (ret) {
+ /*
+ * We found the IOMMU with translation
+ * enabled - but failed to copy over the
+ * old root-entry table. Try to proceed
+ * by disabling translation now and
+ * allocating a clean root-entry table.
+ * This might cause DMAR faults, but
+ * probably the dump will still succeed.
+ */
+ pr_err("Failed to copy translation tables from previous kernel for %s\n",
+ iommu->name);
+ iommu_disable_translation(iommu);
+ clear_translation_pre_enabled(iommu);
+ } else {
+ pr_info("Copied translation tables from previous kernel for %s\n",
+ iommu->name);
+ }
+ }
+
+ if (!ecap_pass_through(iommu->ecap))
+ hw_pass_through = 0;
+
+ if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
+ pr_warn("Disable batched IOTLB flush due to virtualization");
+ intel_iommu_strict = 1;
+ }
+ intel_svm_check(iommu);
+ }
+
+ /*
+ * Now that qi is enabled on all iommus, set the root entry and flush
+ * caches. This is required on some Intel X58 chipsets, otherwise the
+ * flush_context function will loop forever and the boot hangs.
+ */
+ for_each_active_iommu(iommu, drhd) {
+ iommu_flush_write_buffer(iommu);
+#ifdef CONFIG_INTEL_IOMMU_SVM
+ register_pasid_allocator(iommu);
+#endif
+ iommu_set_root_entry(iommu);
+ }
+
+#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
+ dmar_map_gfx = 0;
+#endif
+
+ if (!dmar_map_gfx)
+ iommu_identity_mapping |= IDENTMAP_GFX;
+
+ check_tylersburg_isoch();
+
+ ret = si_domain_init(hw_pass_through);
+ if (ret)
+ goto free_iommu;
+
+ /*
+ * for each drhd
+ * enable fault log
+ * global invalidate context cache
+ * global invalidate iotlb
+ * enable translation
+ */
+ for_each_iommu(iommu, drhd) {
+ if (drhd->ignored) {
+ /*
+ * we always have to disable PMRs or DMA may fail on
+ * this device
+ */
+ if (force_on)
+ iommu_disable_protect_mem_regions(iommu);
+ continue;
+ }
+
+ iommu_flush_write_buffer(iommu);
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+ if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
+ /*
+ * Call dmar_alloc_hwirq() with dmar_global_lock held,
+ * could cause possible lock race condition.
+ */
+ up_write(&dmar_global_lock);
+ ret = intel_svm_enable_prq(iommu);
+ down_write(&dmar_global_lock);
+ if (ret)
+ goto free_iommu;
+ }
+#endif
+ ret = dmar_set_interrupt(iommu);
+ if (ret)
+ goto free_iommu;
+ }
+
+ return 0;
+
+free_iommu:
+ for_each_active_iommu(iommu, drhd) {
+ disable_dmar_iommu(iommu);
+ free_dmar_iommu(iommu);
+ }
+ if (si_domain) {
+ domain_exit(si_domain);
+ si_domain = NULL;
+ }
+
+ kfree(g_iommus);
+
+error:
+ return ret;
+}
+
+/* This takes a number of _MM_ pages, not VTD pages */
+static unsigned long intel_alloc_iova(struct device *dev,
+ struct dmar_domain *domain,
+ unsigned long nrpages, uint64_t dma_mask)
+{
+ unsigned long iova_pfn;
+
+ /*
+ * Restrict dma_mask to the width that the iommu can handle.
+ * First-level translation restricts the input-address to a
+ * canonical address (i.e., address bits 63:N have the same
+ * value as address bit [N-1], where N is 48-bits with 4-level
+ * paging and 57-bits with 5-level paging). Hence, skip bit
+ * [N-1].
+ */
+ if (domain_use_first_level(domain))
+ dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
+ dma_mask);
+ else
+ dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
+ dma_mask);
+
+ /* Ensure we reserve the whole size-aligned region */
+ nrpages = __roundup_pow_of_two(nrpages);
+
+ if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
+ /*
+ * First try to allocate an io virtual address in
+ * DMA_BIT_MASK(32) and if that fails then try allocating
+ * from higher range
+ */
+ iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
+ IOVA_PFN(DMA_BIT_MASK(32)), false);
+ if (iova_pfn)
+ return iova_pfn;
+ }
+ iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
+ IOVA_PFN(dma_mask), true);
+ if (unlikely(!iova_pfn)) {
+ dev_err_once(dev, "Allocating %ld-page iova failed\n",
+ nrpages);
+ return 0;
+ }
+
+ return iova_pfn;
+}
+
+static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
+ size_t size, int dir, u64 dma_mask)
+{
+ struct dmar_domain *domain;
+ phys_addr_t start_paddr;
+ unsigned long iova_pfn;
+ int prot = 0;
+ int ret;
+ struct intel_iommu *iommu;
+ unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
+
+ BUG_ON(dir == DMA_NONE);
+
+ if (unlikely(attach_deferred(dev)))
+ do_deferred_attach(dev);
+
+ domain = find_domain(dev);
+ if (!domain)
+ return DMA_MAPPING_ERROR;
+
+ iommu = domain_get_iommu(domain);
+ size = aligned_nrpages(paddr, size);
+
+ iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
+ if (!iova_pfn)
+ goto error;
+
+ /*
+ * Check if DMAR supports zero-length reads on write only
+ * mappings..
+ */
+ if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
+ !cap_zlr(iommu->cap))
+ prot |= DMA_PTE_READ;
+ if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+ prot |= DMA_PTE_WRITE;
+ /*
+ * paddr - (paddr + size) might be partial page, we should map the whole
+ * page. Note: if two part of one page are separately mapped, we
+ * might have two guest_addr mapping to the same host paddr, but this
+ * is not a big problem
+ */
+ ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
+ mm_to_dma_pfn(paddr_pfn), size, prot);
+ if (ret)
+ goto error;
+
+ start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
+ start_paddr += paddr & ~PAGE_MASK;
+
+ trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
+
+ return start_paddr;
+
+error:
+ if (iova_pfn)
+ free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
+ dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
+ size, (unsigned long long)paddr, dir);
+ return DMA_MAPPING_ERROR;
+}
+
+static dma_addr_t intel_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ return __intel_map_single(dev, page_to_phys(page) + offset,
+ size, dir, *dev->dma_mask);
+}
+
+static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
+}
+
+static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
+{
+ struct dmar_domain *domain;
+ unsigned long start_pfn, last_pfn;
+ unsigned long nrpages;
+ unsigned long iova_pfn;
+ struct intel_iommu *iommu;
+ struct page *freelist;
+ struct pci_dev *pdev = NULL;
+
+ domain = find_domain(dev);
+ BUG_ON(!domain);
+
+ iommu = domain_get_iommu(domain);
+
+ iova_pfn = IOVA_PFN(dev_addr);
+
+ nrpages = aligned_nrpages(dev_addr, size);
+ start_pfn = mm_to_dma_pfn(iova_pfn);
+ last_pfn = start_pfn + nrpages - 1;
+
+ if (dev_is_pci(dev))
+ pdev = to_pci_dev(dev);
+
+ freelist = domain_unmap(domain, start_pfn, last_pfn);
+ if (intel_iommu_strict || (pdev && pdev->untrusted) ||
+ !has_iova_flush_queue(&domain->iovad)) {
+ iommu_flush_iotlb_psi(iommu, domain, start_pfn,
+ nrpages, !freelist, 0);
+ /* free iova */
+ free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
+ dma_free_pagelist(freelist);
+ } else {
+ queue_iova(&domain->iovad, iova_pfn, nrpages,
+ (unsigned long)freelist);
+ /*
+ * queue up the release of the unmap to save the 1/6th of the
+ * cpu used up by the iotlb flush operation...
+ */
+ }
+
+ trace_unmap_single(dev, dev_addr, size);
+}
+
+static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ intel_unmap(dev, dev_addr, size);
+}
+
+static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ intel_unmap(dev, dev_addr, size);
+}
+
+static void *intel_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags,
+ unsigned long attrs)
+{
+ struct page *page = NULL;
+ int order;
+
+ if (unlikely(attach_deferred(dev)))
+ do_deferred_attach(dev);
+
+ size = PAGE_ALIGN(size);
+ order = get_order(size);
+
+ if (gfpflags_allow_blocking(flags)) {
+ unsigned int count = size >> PAGE_SHIFT;
+
+ page = dma_alloc_from_contiguous(dev, count, order,
+ flags & __GFP_NOWARN);
+ }
+
+ if (!page)
+ page = alloc_pages(flags, order);
+ if (!page)
+ return NULL;
+ memset(page_address(page), 0, size);
+
+ *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
+ DMA_BIDIRECTIONAL,
+ dev->coherent_dma_mask);
+ if (*dma_handle != DMA_MAPPING_ERROR)
+ return page_address(page);
+ if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
+ __free_pages(page, order);
+
+ return NULL;
+}
+
+static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_handle, unsigned long attrs)
+{
+ int order;
+ struct page *page = virt_to_page(vaddr);
+
+ size = PAGE_ALIGN(size);
+ order = get_order(size);
+
+ intel_unmap(dev, dma_handle, size);
+ if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
+ __free_pages(page, order);
+}
+
+static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
+ int nelems, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
+ unsigned long nrpages = 0;
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sglist, sg, nelems, i) {
+ nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
+ }
+
+ intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
+
+ trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
+}
+
+static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ int i;
+ struct dmar_domain *domain;
+ size_t size = 0;
+ int prot = 0;
+ unsigned long iova_pfn;
+ int ret;
+ struct scatterlist *sg;
+ unsigned long start_vpfn;
+ struct intel_iommu *iommu;
+
+ BUG_ON(dir == DMA_NONE);
+
+ if (unlikely(attach_deferred(dev)))
+ do_deferred_attach(dev);
+
+ domain = find_domain(dev);
+ if (!domain)
+ return 0;
+
+ iommu = domain_get_iommu(domain);
+
+ for_each_sg(sglist, sg, nelems, i)
+ size += aligned_nrpages(sg->offset, sg->length);
+
+ iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
+ *dev->dma_mask);
+ if (!iova_pfn) {
+ sglist->dma_length = 0;
+ return 0;
+ }
+
+ /*
+ * Check if DMAR supports zero-length reads on write only
+ * mappings..
+ */
+ if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
+ !cap_zlr(iommu->cap))
+ prot |= DMA_PTE_READ;
+ if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+ prot |= DMA_PTE_WRITE;
+
+ start_vpfn = mm_to_dma_pfn(iova_pfn);
+
+ ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
+ if (unlikely(ret)) {
+ dma_pte_free_pagetable(domain, start_vpfn,
+ start_vpfn + size - 1,
+ agaw_to_level(domain->agaw) + 1);
+ free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
+ return 0;
+ }
+
+ for_each_sg(sglist, sg, nelems, i)
+ trace_map_sg(dev, i + 1, nelems, sg);
+
+ return nelems;
+}
+
+static u64 intel_get_required_mask(struct device *dev)
+{
+ return DMA_BIT_MASK(32);
+}
+
+static const struct dma_map_ops intel_dma_ops = {
+ .alloc = intel_alloc_coherent,
+ .free = intel_free_coherent,
+ .map_sg = intel_map_sg,
+ .unmap_sg = intel_unmap_sg,
+ .map_page = intel_map_page,
+ .unmap_page = intel_unmap_page,
+ .map_resource = intel_map_resource,
+ .unmap_resource = intel_unmap_resource,
+ .dma_supported = dma_direct_supported,
+ .mmap = dma_common_mmap,
+ .get_sgtable = dma_common_get_sgtable,
+ .alloc_pages = dma_common_alloc_pages,
+ .free_pages = dma_common_free_pages,
+ .get_required_mask = intel_get_required_mask,
+};
+
+static void
+bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir, enum dma_sync_target target)
+{
+ struct dmar_domain *domain;
+ phys_addr_t tlb_addr;
+
+ domain = find_domain(dev);
+ if (WARN_ON(!domain))
+ return;
+
+ tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
+ if (is_swiotlb_buffer(tlb_addr))
+ swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
+}
+
+static dma_addr_t
+bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs,
+ u64 dma_mask)
+{
+ size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
+ struct dmar_domain *domain;
+ struct intel_iommu *iommu;
+ unsigned long iova_pfn;
+ unsigned long nrpages;
+ phys_addr_t tlb_addr;
+ int prot = 0;
+ int ret;
+
+ if (unlikely(attach_deferred(dev)))
+ do_deferred_attach(dev);
+
+ domain = find_domain(dev);
+
+ if (WARN_ON(dir == DMA_NONE || !domain))
+ return DMA_MAPPING_ERROR;
+
+ iommu = domain_get_iommu(domain);
+ if (WARN_ON(!iommu))
+ return DMA_MAPPING_ERROR;
+
+ nrpages = aligned_nrpages(0, size);
+ iova_pfn = intel_alloc_iova(dev, domain,
+ dma_to_mm_pfn(nrpages), dma_mask);
+ if (!iova_pfn)
+ return DMA_MAPPING_ERROR;
+
+ /*
+ * Check if DMAR supports zero-length reads on write only
+ * mappings..
+ */
+ if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
+ !cap_zlr(iommu->cap))
+ prot |= DMA_PTE_READ;
+ if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+ prot |= DMA_PTE_WRITE;
+
+ /*
+ * If both the physical buffer start address and size are
+ * page aligned, we don't need to use a bounce page.
+ */
+ if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
+ tlb_addr = swiotlb_tbl_map_single(dev, paddr, size,
+ aligned_size, dir, attrs);
+ if (tlb_addr == DMA_MAPPING_ERROR) {
+ goto swiotlb_error;
+ } else {
+ /* Cleanup the padding area. */
+ void *padding_start = phys_to_virt(tlb_addr);
+ size_t padding_size = aligned_size;
+
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+ (dir == DMA_TO_DEVICE ||
+ dir == DMA_BIDIRECTIONAL)) {
+ padding_start += size;
+ padding_size -= size;
+ }
+
+ memset(padding_start, 0, padding_size);
+ }
+ } else {
+ tlb_addr = paddr;
+ }
+
+ ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
+ tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
+ if (ret)
+ goto mapping_error;
+
+ trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
+
+ return (phys_addr_t)iova_pfn << PAGE_SHIFT;
+
+mapping_error:
+ if (is_swiotlb_buffer(tlb_addr))
+ swiotlb_tbl_unmap_single(dev, tlb_addr, size,
+ aligned_size, dir, attrs);
+swiotlb_error:
+ free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
+ dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
+ size, (unsigned long long)paddr, dir);
+
+ return DMA_MAPPING_ERROR;
+}
+
+static void
+bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
+ struct dmar_domain *domain;
+ phys_addr_t tlb_addr;
+
+ domain = find_domain(dev);
+ if (WARN_ON(!domain))
+ return;
+
+ tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
+ if (WARN_ON(!tlb_addr))
+ return;
+
+ intel_unmap(dev, dev_addr, size);
+ if (is_swiotlb_buffer(tlb_addr))
+ swiotlb_tbl_unmap_single(dev, tlb_addr, size,
+ aligned_size, dir, attrs);
+
+ trace_bounce_unmap_single(dev, dev_addr, size);
+}
+
+static dma_addr_t
+bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ return bounce_map_single(dev, page_to_phys(page) + offset,
+ size, dir, attrs, *dev->dma_mask);
+}
+
+static dma_addr_t
+bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ return bounce_map_single(dev, phys_addr, size,
+ dir, attrs, *dev->dma_mask);
+}
+
+static void
+bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ bounce_unmap_single(dev, dev_addr, size, dir, attrs);
+}
+
+static void
+bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ bounce_unmap_single(dev, dev_addr, size, dir, attrs);
+}
+
+static void
+bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sglist, sg, nelems, i)
+ bounce_unmap_page(dev, sg->dma_address,
+ sg_dma_len(sg), dir, attrs);
+}
+
+static int
+bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ int i;
+ struct scatterlist *sg;
+
+ for_each_sg(sglist, sg, nelems, i) {
+ sg->dma_address = bounce_map_page(dev, sg_page(sg),
+ sg->offset, sg->length,
+ dir, attrs);
+ if (sg->dma_address == DMA_MAPPING_ERROR)
+ goto out_unmap;
+ sg_dma_len(sg) = sg->length;
+ }
+
+ for_each_sg(sglist, sg, nelems, i)
+ trace_bounce_map_sg(dev, i + 1, nelems, sg);
+
+ return nelems;
+
+out_unmap:
+ bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
+ return 0;
+}
+
+static void
+bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir)
+{
+ bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
+}
+
+static void
+bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir)
+{
+ bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
+}
+
+static void
+bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
+ int nelems, enum dma_data_direction dir)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sglist, sg, nelems, i)
+ bounce_sync_single(dev, sg_dma_address(sg),
+ sg_dma_len(sg), dir, SYNC_FOR_CPU);
+}
+
+static void
+bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
+ int nelems, enum dma_data_direction dir)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sglist, sg, nelems, i)
+ bounce_sync_single(dev, sg_dma_address(sg),
+ sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
+}
+
+static const struct dma_map_ops bounce_dma_ops = {
+ .alloc = intel_alloc_coherent,
+ .free = intel_free_coherent,
+ .map_sg = bounce_map_sg,
+ .unmap_sg = bounce_unmap_sg,
+ .map_page = bounce_map_page,
+ .unmap_page = bounce_unmap_page,
+ .sync_single_for_cpu = bounce_sync_single_for_cpu,
+ .sync_single_for_device = bounce_sync_single_for_device,
+ .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
+ .sync_sg_for_device = bounce_sync_sg_for_device,
+ .map_resource = bounce_map_resource,
+ .unmap_resource = bounce_unmap_resource,
+ .alloc_pages = dma_common_alloc_pages,
+ .free_pages = dma_common_free_pages,
+ .dma_supported = dma_direct_supported,
+};
+
+static inline int iommu_domain_cache_init(void)
+{
+ int ret = 0;
+
+ iommu_domain_cache = kmem_cache_create("iommu_domain",
+ sizeof(struct dmar_domain),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+
+ NULL);
+ if (!iommu_domain_cache) {
+ pr_err("Couldn't create iommu_domain cache\n");
+ ret = -ENOMEM;
+ }
+
+ return ret;
+}
+
+static inline int iommu_devinfo_cache_init(void)
+{
+ int ret = 0;
+
+ iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
+ sizeof(struct device_domain_info),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!iommu_devinfo_cache) {
+ pr_err("Couldn't create devinfo cache\n");
+ ret = -ENOMEM;
+ }
+
+ return ret;
+}
+
+static int __init iommu_init_mempool(void)
+{
+ int ret;
+ ret = iova_cache_get();
+ if (ret)
+ return ret;
+
+ ret = iommu_domain_cache_init();
+ if (ret)
+ goto domain_error;
+
+ ret = iommu_devinfo_cache_init();
+ if (!ret)
+ return ret;
+
+ kmem_cache_destroy(iommu_domain_cache);
+domain_error:
+ iova_cache_put();
+
+ return -ENOMEM;
+}
+
+static void __init iommu_exit_mempool(void)
+{
+ kmem_cache_destroy(iommu_devinfo_cache);
+ kmem_cache_destroy(iommu_domain_cache);
+ iova_cache_put();
+}
+
+static void __init init_no_remapping_devices(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct device *dev;
+ int i;
+
+ for_each_drhd_unit(drhd) {
+ if (!drhd->include_all) {
+ for_each_active_dev_scope(drhd->devices,
+ drhd->devices_cnt, i, dev)
+ break;
+ /* ignore DMAR unit if no devices exist */
+ if (i == drhd->devices_cnt)
+ drhd->ignored = 1;
+ }
+ }
+
+ for_each_active_drhd_unit(drhd) {
+ if (drhd->include_all)
+ continue;
+
+ for_each_active_dev_scope(drhd->devices,
+ drhd->devices_cnt, i, dev)
+ if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
+ break;
+ if (i < drhd->devices_cnt)
+ continue;
+
+ /* This IOMMU has *only* gfx devices. Either bypass it or
+ set the gfx_mapped flag, as appropriate */
+ drhd->gfx_dedicated = 1;
+ if (!dmar_map_gfx)
+ drhd->ignored = 1;
+ }
+}
+
+#ifdef CONFIG_SUSPEND
+static int init_iommu_hw(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu = NULL;
+
+ for_each_active_iommu(iommu, drhd)
+ if (iommu->qi)
+ dmar_reenable_qi(iommu);
+
+ for_each_iommu(iommu, drhd) {
+ if (drhd->ignored) {
+ /*
+ * we always have to disable PMRs or DMA may fail on
+ * this device
+ */
+ if (force_on)
+ iommu_disable_protect_mem_regions(iommu);
+ continue;
+ }
+
+ iommu_flush_write_buffer(iommu);
+ iommu_set_root_entry(iommu);
+ iommu_enable_translation(iommu);
+ iommu_disable_protect_mem_regions(iommu);
+ }
+
+ return 0;
+}
+
+static void iommu_flush_all(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+
+ for_each_active_iommu(iommu, drhd) {
+ iommu->flush.flush_context(iommu, 0, 0, 0,
+ DMA_CCMD_GLOBAL_INVL);
+ iommu->flush.flush_iotlb(iommu, 0, 0, 0,
+ DMA_TLB_GLOBAL_FLUSH);
+ }
+}
+
+static int iommu_suspend(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu = NULL;
+ unsigned long flag;
+
+ for_each_active_iommu(iommu, drhd) {
+ iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
+ GFP_ATOMIC);
+ if (!iommu->iommu_state)
+ goto nomem;
+ }
+
+ iommu_flush_all();
+
+ for_each_active_iommu(iommu, drhd) {
+ iommu_disable_translation(iommu);
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+
+ iommu->iommu_state[SR_DMAR_FECTL_REG] =
+ readl(iommu->reg + DMAR_FECTL_REG);
+ iommu->iommu_state[SR_DMAR_FEDATA_REG] =
+ readl(iommu->reg + DMAR_FEDATA_REG);
+ iommu->iommu_state[SR_DMAR_FEADDR_REG] =
+ readl(iommu->reg + DMAR_FEADDR_REG);
+ iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
+ readl(iommu->reg + DMAR_FEUADDR_REG);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+ }
+ return 0;
+
+nomem:
+ for_each_active_iommu(iommu, drhd)
+ kfree(iommu->iommu_state);
+
+ return -ENOMEM;
+}
+
+static void iommu_resume(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu = NULL;
+ unsigned long flag;
+
+ if (init_iommu_hw()) {
+ if (force_on)
+ panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
+ else
+ WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
+ return;
+ }
+
+ for_each_active_iommu(iommu, drhd) {
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+
+ writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
+ iommu->reg + DMAR_FECTL_REG);
+ writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
+ iommu->reg + DMAR_FEDATA_REG);
+ writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
+ iommu->reg + DMAR_FEADDR_REG);
+ writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
+ iommu->reg + DMAR_FEUADDR_REG);
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+ }
+
+ for_each_active_iommu(iommu, drhd)
+ kfree(iommu->iommu_state);
+}
+
+static struct syscore_ops iommu_syscore_ops = {
+ .resume = iommu_resume,
+ .suspend = iommu_suspend,
+};
+
+static void __init init_iommu_pm_ops(void)
+{
+ register_syscore_ops(&iommu_syscore_ops);
+}
+
+#else
+static inline void init_iommu_pm_ops(void) {}
+#endif /* CONFIG_PM */
+
+static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
+{
+ if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
+ !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
+ rmrr->end_address <= rmrr->base_address ||
+ arch_rmrr_sanity_check(rmrr))
+ return -EINVAL;
+
+ return 0;
+}
+
+int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
+{
+ struct acpi_dmar_reserved_memory *rmrr;
+ struct dmar_rmrr_unit *rmrru;
+
+ rmrr = (struct acpi_dmar_reserved_memory *)header;
+ if (rmrr_sanity_check(rmrr)) {
+ pr_warn(FW_BUG
+ "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
+ "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+ rmrr->base_address, rmrr->end_address,
+ dmi_get_system_info(DMI_BIOS_VENDOR),
+ dmi_get_system_info(DMI_BIOS_VERSION),
+ dmi_get_system_info(DMI_PRODUCT_VERSION));
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+ }
+
+ rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
+ if (!rmrru)
+ goto out;
+
+ rmrru->hdr = header;
+
+ rmrru->base_address = rmrr->base_address;
+ rmrru->end_address = rmrr->end_address;
+
+ rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
+ ((void *)rmrr) + rmrr->header.length,
+ &rmrru->devices_cnt);
+ if (rmrru->devices_cnt && rmrru->devices == NULL)
+ goto free_rmrru;
+
+ list_add(&rmrru->list, &dmar_rmrr_units);
+
+ return 0;
+free_rmrru:
+ kfree(rmrru);
+out:
+ return -ENOMEM;
+}
+
+static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
+{
+ struct dmar_atsr_unit *atsru;
+ struct acpi_dmar_atsr *tmp;
+
+ list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
+ dmar_rcu_check()) {
+ tmp = (struct acpi_dmar_atsr *)atsru->hdr;
+ if (atsr->segment != tmp->segment)
+ continue;
+ if (atsr->header.length != tmp->header.length)
+ continue;
+ if (memcmp(atsr, tmp, atsr->header.length) == 0)
+ return atsru;
+ }
+
+ return NULL;
+}
+
+int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
+{
+ struct acpi_dmar_atsr *atsr;
+ struct dmar_atsr_unit *atsru;
+
+ if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
+ return 0;
+
+ atsr = container_of(hdr, struct acpi_dmar_atsr, header);
+ atsru = dmar_find_atsr(atsr);
+ if (atsru)
+ return 0;
+
+ atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
+ if (!atsru)
+ return -ENOMEM;
+
+ /*
+ * If memory is allocated from slab by ACPI _DSM method, we need to
+ * copy the memory content because the memory buffer will be freed
+ * on return.
+ */
+ atsru->hdr = (void *)(atsru + 1);
+ memcpy(atsru->hdr, hdr, hdr->length);
+ atsru->include_all = atsr->flags & 0x1;
+ if (!atsru->include_all) {
+ atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
+ (void *)atsr + atsr->header.length,
+ &atsru->devices_cnt);
+ if (atsru->devices_cnt && atsru->devices == NULL) {
+ kfree(atsru);
+ return -ENOMEM;
+ }
+ }
+
+ list_add_rcu(&atsru->list, &dmar_atsr_units);
+
+ return 0;
+}
+
+static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
+{
+ dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
+ kfree(atsru);
+}
+
+int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
+{
+ struct acpi_dmar_atsr *atsr;
+ struct dmar_atsr_unit *atsru;
+
+ atsr = container_of(hdr, struct acpi_dmar_atsr, header);
+ atsru = dmar_find_atsr(atsr);
+ if (atsru) {
+ list_del_rcu(&atsru->list);
+ synchronize_rcu();
+ intel_iommu_free_atsr(atsru);
+ }
+
+ return 0;
+}
+
+int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
+{
+ int i;
+ struct device *dev;
+ struct acpi_dmar_atsr *atsr;
+ struct dmar_atsr_unit *atsru;
+
+ atsr = container_of(hdr, struct acpi_dmar_atsr, header);
+ atsru = dmar_find_atsr(atsr);
+ if (!atsru)
+ return 0;
+
+ if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
+ for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
+ i, dev)
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
+{
+ int sp, ret;
+ struct intel_iommu *iommu = dmaru->iommu;
+
+ if (g_iommus[iommu->seq_id])
+ return 0;
+
+ if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
+ pr_warn("%s: Doesn't support hardware pass through.\n",
+ iommu->name);
+ return -ENXIO;
+ }
+ if (!ecap_sc_support(iommu->ecap) &&
+ domain_update_iommu_snooping(iommu)) {
+ pr_warn("%s: Doesn't support snooping.\n",
+ iommu->name);
+ return -ENXIO;
+ }
+ sp = domain_update_iommu_superpage(NULL, iommu) - 1;
+ if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
+ pr_warn("%s: Doesn't support large page.\n",
+ iommu->name);
+ return -ENXIO;
+ }
+
+ /*
+ * Disable translation if already enabled prior to OS handover.
+ */
+ if (iommu->gcmd & DMA_GCMD_TE)
+ iommu_disable_translation(iommu);
+
+ g_iommus[iommu->seq_id] = iommu;
+ ret = iommu_init_domains(iommu);
+ if (ret == 0)
+ ret = iommu_alloc_root_entry(iommu);
+ if (ret)
+ goto out;
+
+ intel_svm_check(iommu);
+
+ if (dmaru->ignored) {
+ /*
+ * we always have to disable PMRs or DMA may fail on this device
+ */
+ if (force_on)
+ iommu_disable_protect_mem_regions(iommu);
+ return 0;
+ }
+
+ intel_iommu_init_qi(iommu);
+ iommu_flush_write_buffer(iommu);
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+ if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
+ ret = intel_svm_enable_prq(iommu);
+ if (ret)
+ goto disable_iommu;
+ }
+#endif
+ ret = dmar_set_interrupt(iommu);
+ if (ret)
+ goto disable_iommu;
+
+ iommu_set_root_entry(iommu);
+ iommu_enable_translation(iommu);
+
+ iommu_disable_protect_mem_regions(iommu);
+ return 0;
+
+disable_iommu:
+ disable_dmar_iommu(iommu);
+out:
+ free_dmar_iommu(iommu);
+ return ret;
+}
+
+int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
+{
+ int ret = 0;
+ struct intel_iommu *iommu = dmaru->iommu;
+
+ if (!intel_iommu_enabled)
+ return 0;
+ if (iommu == NULL)
+ return -EINVAL;
+
+ if (insert) {
+ ret = intel_iommu_add(dmaru);
+ } else {
+ disable_dmar_iommu(iommu);
+ free_dmar_iommu(iommu);
+ }
+
+ return ret;
+}
+
+static void intel_iommu_free_dmars(void)
+{
+ struct dmar_rmrr_unit *rmrru, *rmrr_n;
+ struct dmar_atsr_unit *atsru, *atsr_n;
+
+ list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
+ list_del(&rmrru->list);
+ dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
+ kfree(rmrru);
+ }
+
+ list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
+ list_del(&atsru->list);
+ intel_iommu_free_atsr(atsru);
+ }
+}
+
+int dmar_find_matched_atsr_unit(struct pci_dev *dev)
+{
+ int i, ret = 1;
+ struct pci_bus *bus;
+ struct pci_dev *bridge = NULL;
+ struct device *tmp;
+ struct acpi_dmar_atsr *atsr;
+ struct dmar_atsr_unit *atsru;
+
+ dev = pci_physfn(dev);
+ for (bus = dev->bus; bus; bus = bus->parent) {
+ bridge = bus->self;
+ /* If it's an integrated device, allow ATS */
+ if (!bridge)
+ return 1;
+ /* Connected via non-PCIe: no ATS */
+ if (!pci_is_pcie(bridge) ||
+ pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
+ return 0;
+ /* If we found the root port, look it up in the ATSR */
+ if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
+ break;
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
+ atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
+ if (atsr->segment != pci_domain_nr(dev->bus))
+ continue;
+
+ for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
+ if (tmp == &bridge->dev)
+ goto out;
+
+ if (atsru->include_all)
+ goto out;
+ }
+ ret = 0;
+out:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
+{
+ int ret;
+ struct dmar_rmrr_unit *rmrru;
+ struct dmar_atsr_unit *atsru;
+ struct acpi_dmar_atsr *atsr;
+ struct acpi_dmar_reserved_memory *rmrr;
+
+ if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
+ return 0;
+
+ list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
+ rmrr = container_of(rmrru->hdr,
+ struct acpi_dmar_reserved_memory, header);
+ if (info->event == BUS_NOTIFY_ADD_DEVICE) {
+ ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
+ ((void *)rmrr) + rmrr->header.length,
+ rmrr->segment, rmrru->devices,
+ rmrru->devices_cnt);
+ if (ret < 0)
+ return ret;
+ } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
+ dmar_remove_dev_scope(info, rmrr->segment,
+ rmrru->devices, rmrru->devices_cnt);
+ }
+ }
+
+ list_for_each_entry(atsru, &dmar_atsr_units, list) {
+ if (atsru->include_all)
+ continue;
+
+ atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
+ if (info->event == BUS_NOTIFY_ADD_DEVICE) {
+ ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
+ (void *)atsr + atsr->header.length,
+ atsr->segment, atsru->devices,
+ atsru->devices_cnt);
+ if (ret > 0)
+ break;
+ else if (ret < 0)
+ return ret;
+ } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
+ if (dmar_remove_dev_scope(info, atsr->segment,
+ atsru->devices, atsru->devices_cnt))
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static int intel_iommu_memory_notifier(struct notifier_block *nb,
+ unsigned long val, void *v)
+{
+ struct memory_notify *mhp = v;
+ unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
+ unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
+ mhp->nr_pages - 1);
+
+ switch (val) {
+ case MEM_GOING_ONLINE:
+ if (iommu_domain_identity_map(si_domain,
+ start_vpfn, last_vpfn)) {
+ pr_warn("Failed to build identity map for [%lx-%lx]\n",
+ start_vpfn, last_vpfn);
+ return NOTIFY_BAD;
+ }
+ break;
+
+ case MEM_OFFLINE:
+ case MEM_CANCEL_ONLINE:
+ {
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ struct page *freelist;
+
+ freelist = domain_unmap(si_domain,
+ start_vpfn, last_vpfn);
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd)
+ iommu_flush_iotlb_psi(iommu, si_domain,
+ start_vpfn, mhp->nr_pages,
+ !freelist, 0);
+ rcu_read_unlock();
+ dma_free_pagelist(freelist);
+ }
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block intel_iommu_memory_nb = {
+ .notifier_call = intel_iommu_memory_notifier,
+ .priority = 0
+};
+
+static void free_all_cpu_cached_iovas(unsigned int cpu)
+{
+ int i;
+
+ for (i = 0; i < g_num_of_iommus; i++) {
+ struct intel_iommu *iommu = g_iommus[i];
+ struct dmar_domain *domain;
+ int did;
+
+ if (!iommu)
+ continue;
+
+ for (did = 0; did < cap_ndoms(iommu->cap); did++) {
+ domain = get_iommu_domain(iommu, (u16)did);
+
+ if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
+ continue;
+
+ free_cpu_cached_iovas(cpu, &domain->iovad);
+ }
+ }
+}
+
+static int intel_iommu_cpu_dead(unsigned int cpu)
+{
+ free_all_cpu_cached_iovas(cpu);
+ return 0;
+}
+
+static void intel_disable_iommus(void)
+{
+ struct intel_iommu *iommu = NULL;
+ struct dmar_drhd_unit *drhd;
+
+ for_each_iommu(iommu, drhd)
+ iommu_disable_translation(iommu);
+}
+
+void intel_iommu_shutdown(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu = NULL;
+
+ if (no_iommu || dmar_disabled)
+ return;
+
+ down_write(&dmar_global_lock);
+
+ /* Disable PMRs explicitly here. */
+ for_each_iommu(iommu, drhd)
+ iommu_disable_protect_mem_regions(iommu);
+
+ /* Make sure the IOMMUs are switched off */
+ intel_disable_iommus();
+
+ up_write(&dmar_global_lock);
+}
+
+static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
+{
+ struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
+
+ return container_of(iommu_dev, struct intel_iommu, iommu);
+}
+
+static ssize_t intel_iommu_show_version(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct intel_iommu *iommu = dev_to_intel_iommu(dev);
+ u32 ver = readl(iommu->reg + DMAR_VER_REG);
+ return sprintf(buf, "%d:%d\n",
+ DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
+}
+static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
+
+static ssize_t intel_iommu_show_address(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct intel_iommu *iommu = dev_to_intel_iommu(dev);
+ return sprintf(buf, "%llx\n", iommu->reg_phys);
+}
+static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
+
+static ssize_t intel_iommu_show_cap(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct intel_iommu *iommu = dev_to_intel_iommu(dev);
+ return sprintf(buf, "%llx\n", iommu->cap);
+}
+static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
+
+static ssize_t intel_iommu_show_ecap(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct intel_iommu *iommu = dev_to_intel_iommu(dev);
+ return sprintf(buf, "%llx\n", iommu->ecap);
+}
+static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
+
+static ssize_t intel_iommu_show_ndoms(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct intel_iommu *iommu = dev_to_intel_iommu(dev);
+ return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
+}
+static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
+
+static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct intel_iommu *iommu = dev_to_intel_iommu(dev);
+ return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
+ cap_ndoms(iommu->cap)));
+}
+static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
+
+static struct attribute *intel_iommu_attrs[] = {
+ &dev_attr_version.attr,
+ &dev_attr_address.attr,
+ &dev_attr_cap.attr,
+ &dev_attr_ecap.attr,
+ &dev_attr_domains_supported.attr,
+ &dev_attr_domains_used.attr,
+ NULL,
+};
+
+static struct attribute_group intel_iommu_group = {
+ .name = "intel-iommu",
+ .attrs = intel_iommu_attrs,
+};
+
+const struct attribute_group *intel_iommu_groups[] = {
+ &intel_iommu_group,
+ NULL,
+};
+
+static inline bool has_external_pci(void)
+{
+ struct pci_dev *pdev = NULL;
+
+ for_each_pci_dev(pdev)
+ if (pdev->external_facing) {
+ pci_dev_put(pdev);
+ return true;
+ }
+
+ return false;
+}
+
+static int __init platform_optin_force_iommu(void)
+{
+ if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
+ return 0;
+
+ if (no_iommu || dmar_disabled)
+ pr_info("Intel-IOMMU force enabled due to platform opt in\n");
+
+ /*
+ * If Intel-IOMMU is disabled by default, we will apply identity
+ * map for all devices except those marked as being untrusted.
+ */
+ if (dmar_disabled)
+ iommu_set_default_passthrough(false);
+
+ dmar_disabled = 0;
+ no_iommu = 0;
+
+ return 1;
+}
+
+static int __init probe_acpi_namespace_devices(void)
+{
+ struct dmar_drhd_unit *drhd;
+ /* To avoid a -Wunused-but-set-variable warning. */
+ struct intel_iommu *iommu __maybe_unused;
+ struct device *dev;
+ int i, ret = 0;
+
+ for_each_active_iommu(iommu, drhd) {
+ for_each_active_dev_scope(drhd->devices,
+ drhd->devices_cnt, i, dev) {
+ struct acpi_device_physical_node *pn;
+ struct iommu_group *group;
+ struct acpi_device *adev;
+
+ if (dev->bus != &acpi_bus_type)
+ continue;
+
+ adev = to_acpi_device(dev);
+ mutex_lock(&adev->physical_node_lock);
+ list_for_each_entry(pn,
+ &adev->physical_node_list, node) {
+ group = iommu_group_get(pn->dev);
+ if (group) {
+ iommu_group_put(group);
+ continue;
+ }
+
+ pn->dev->bus->iommu_ops = &intel_iommu_ops;
+ ret = iommu_probe_device(pn->dev);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&adev->physical_node_lock);
+
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+int __init intel_iommu_init(void)
+{
+ int ret = -ENODEV;
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+
+ /*
+ * Intel IOMMU is required for a TXT/tboot launch or platform
+ * opt in, so enforce that.
+ */
+ force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
+ platform_optin_force_iommu();
+
+ if (iommu_init_mempool()) {
+ if (force_on)
+ panic("tboot: Failed to initialize iommu memory\n");
+ return -ENOMEM;
+ }
+
+ down_write(&dmar_global_lock);
+ if (dmar_table_init()) {
+ if (force_on)
+ panic("tboot: Failed to initialize DMAR table\n");
+ goto out_free_dmar;
+ }
+
+ if (dmar_dev_scope_init() < 0) {
+ if (force_on)
+ panic("tboot: Failed to initialize DMAR device scope\n");
+ goto out_free_dmar;
+ }
+
+ up_write(&dmar_global_lock);
+
+ /*
+ * The bus notifier takes the dmar_global_lock, so lockdep will
+ * complain later when we register it under the lock.
+ */
+ dmar_register_bus_notifier();
+
+ down_write(&dmar_global_lock);
+
+ if (!no_iommu)
+ intel_iommu_debugfs_init();
+
+ if (no_iommu || dmar_disabled) {
+ /*
+ * We exit the function here to ensure IOMMU's remapping and
+ * mempool aren't setup, which means that the IOMMU's PMRs
+ * won't be disabled via the call to init_dmars(). So disable
+ * it explicitly here. The PMRs were setup by tboot prior to
+ * calling SENTER, but the kernel is expected to reset/tear
+ * down the PMRs.
+ */
+ if (intel_iommu_tboot_noforce) {
+ for_each_iommu(iommu, drhd)
+ iommu_disable_protect_mem_regions(iommu);
+ }
+
+ /*
+ * Make sure the IOMMUs are switched off, even when we
+ * boot into a kexec kernel and the previous kernel left
+ * them enabled
+ */
+ intel_disable_iommus();
+ goto out_free_dmar;
+ }
+
+ if (list_empty(&dmar_rmrr_units))
+ pr_info("No RMRR found\n");
+
+ if (list_empty(&dmar_atsr_units))
+ pr_info("No ATSR found\n");
+
+ if (dmar_init_reserved_ranges()) {
+ if (force_on)
+ panic("tboot: Failed to reserve iommu ranges\n");
+ goto out_free_reserved_range;
+ }
+
+ if (dmar_map_gfx)
+ intel_iommu_gfx_mapped = 1;
+
+ init_no_remapping_devices();
+
+ ret = init_dmars();
+ if (ret) {
+ if (force_on)
+ panic("tboot: Failed to initialize DMARs\n");
+ pr_err("Initialization failed\n");
+ goto out_free_reserved_range;
+ }
+ up_write(&dmar_global_lock);
+
+ init_iommu_pm_ops();
+
+ down_read(&dmar_global_lock);
+ for_each_active_iommu(iommu, drhd) {
+ iommu_device_sysfs_add(&iommu->iommu, NULL,
+ intel_iommu_groups,
+ "%s", iommu->name);
+ iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
+ iommu_device_register(&iommu->iommu);
+ }
+ up_read(&dmar_global_lock);
+
+ bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
+ if (si_domain && !hw_pass_through)
+ register_memory_notifier(&intel_iommu_memory_nb);
+ cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
+ intel_iommu_cpu_dead);
+
+ down_read(&dmar_global_lock);
+ if (probe_acpi_namespace_devices())
+ pr_warn("ACPI name space devices didn't probe correctly\n");
+
+ /* Finally, we enable the DMA remapping hardware. */
+ for_each_iommu(iommu, drhd) {
+ if (!drhd->ignored && !translation_pre_enabled(iommu))
+ iommu_enable_translation(iommu);
+
+ iommu_disable_protect_mem_regions(iommu);
+ }
+ up_read(&dmar_global_lock);
+
+ pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
+
+ intel_iommu_enabled = 1;
+
+ return 0;
+
+out_free_reserved_range:
+ put_iova_domain(&reserved_iova_list);
+out_free_dmar:
+ intel_iommu_free_dmars();
+ up_write(&dmar_global_lock);
+ iommu_exit_mempool();
+ return ret;
+}
+
+static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
+{
+ struct intel_iommu *iommu = opaque;
+
+ domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
+ return 0;
+}
+
+/*
+ * NB - intel-iommu lacks any sort of reference counting for the users of
+ * dependent devices. If multiple endpoints have intersecting dependent
+ * devices, unbinding the driver from any one of them will possibly leave
+ * the others unable to operate.
+ */
+static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
+{
+ if (!iommu || !dev || !dev_is_pci(dev))
+ return;
+
+ pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
+}
+
+static void __dmar_remove_one_dev_info(struct device_domain_info *info)
+{
+ struct dmar_domain *domain;
+ struct intel_iommu *iommu;
+ unsigned long flags;
+
+ assert_spin_locked(&device_domain_lock);
+
+ if (WARN_ON(!info))
+ return;
+
+ iommu = info->iommu;
+ domain = info->domain;
+
+ if (info->dev) {
+ if (dev_is_pci(info->dev) && sm_supported(iommu))
+ intel_pasid_tear_down_entry(iommu, info->dev,
+ PASID_RID2PASID, false);
+
+ iommu_disable_dev_iotlb(info);
+ if (!dev_is_real_dma_subdevice(info->dev))
+ domain_context_clear(iommu, info->dev);
+ intel_pasid_free_table(info->dev);
+ }
+
+ unlink_domain_info(info);
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ domain_detach_iommu(domain, iommu);
+ spin_unlock_irqrestore(&iommu->lock, flags);
+
+ free_devinfo_mem(info);
+}
+
+static void dmar_remove_one_dev_info(struct device *dev)
+{
+ struct device_domain_info *info;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ info = get_domain_info(dev);
+ if (info)
+ __dmar_remove_one_dev_info(info);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
+static int md_domain_init(struct dmar_domain *domain, int guest_width)
+{
+ int adjust_width;
+
+ /* calculate AGAW */
+ domain->gaw = guest_width;
+ adjust_width = guestwidth_to_adjustwidth(guest_width);
+ domain->agaw = width_to_agaw(adjust_width);
+
+ domain->iommu_coherency = 0;
+ domain->iommu_snooping = 0;
+ domain->iommu_superpage = 0;
+ domain->max_addr = 0;
+
+ /* always allocate the top pgd */
+ domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
+ if (!domain->pgd)
+ return -ENOMEM;
+ domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
+ return 0;
+}
+
+static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
+{
+ init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
+ copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
+
+ if (!intel_iommu_strict &&
+ init_iova_flush_queue(&dmar_domain->iovad,
+ iommu_flush_iova, iova_entry_free))
+ pr_info("iova flush queue initialization failed\n");
+}
+
+static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
+{
+ struct dmar_domain *dmar_domain;
+ struct iommu_domain *domain;
+
+ switch (type) {
+ case IOMMU_DOMAIN_DMA:
+ case IOMMU_DOMAIN_UNMANAGED:
+ dmar_domain = alloc_domain(0);
+ if (!dmar_domain) {
+ pr_err("Can't allocate dmar_domain\n");
+ return NULL;
+ }
+ if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
+ pr_err("Domain initialization failed\n");
+ domain_exit(dmar_domain);
+ return NULL;
+ }
+
+ if (type == IOMMU_DOMAIN_DMA)
+ intel_init_iova_domain(dmar_domain);
+
+ domain = &dmar_domain->domain;
+ domain->geometry.aperture_start = 0;
+ domain->geometry.aperture_end =
+ __DOMAIN_MAX_ADDR(dmar_domain->gaw);
+ domain->geometry.force_aperture = true;
+
+ return domain;
+ case IOMMU_DOMAIN_IDENTITY:
+ return &si_domain->domain;
+ default:
+ return NULL;
+ }
+
+ return NULL;
+}
+
+static void intel_iommu_domain_free(struct iommu_domain *domain)
+{
+ if (domain != &si_domain->domain)
+ domain_exit(to_dmar_domain(domain));
+}
+
+/*
+ * Check whether a @domain could be attached to the @dev through the
+ * aux-domain attach/detach APIs.
+ */
+static inline bool
+is_aux_domain(struct device *dev, struct iommu_domain *domain)
+{
+ struct device_domain_info *info = get_domain_info(dev);
+
+ return info && info->auxd_enabled &&
+ domain->type == IOMMU_DOMAIN_UNMANAGED;
+}
+
+static void auxiliary_link_device(struct dmar_domain *domain,
+ struct device *dev)
+{
+ struct device_domain_info *info = get_domain_info(dev);
+
+ assert_spin_locked(&device_domain_lock);
+ if (WARN_ON(!info))
+ return;
+
+ domain->auxd_refcnt++;
+ list_add(&domain->auxd, &info->auxiliary_domains);
+}
+
+static void auxiliary_unlink_device(struct dmar_domain *domain,
+ struct device *dev)
+{
+ struct device_domain_info *info = get_domain_info(dev);
+
+ assert_spin_locked(&device_domain_lock);
+ if (WARN_ON(!info))
+ return;
+
+ list_del(&domain->auxd);
+ domain->auxd_refcnt--;
+
+ if (!domain->auxd_refcnt && domain->default_pasid > 0)
+ ioasid_free(domain->default_pasid);
+}
+
+static int aux_domain_add_dev(struct dmar_domain *domain,
+ struct device *dev)
+{
+ int ret;
+ unsigned long flags;
+ struct intel_iommu *iommu;
+
+ iommu = device_to_iommu(dev, NULL, NULL);
+ if (!iommu)
+ return -ENODEV;
+
+ if (domain->default_pasid <= 0) {
+ u32 pasid;
+
+ /* No private data needed for the default pasid */
+ pasid = ioasid_alloc(NULL, PASID_MIN,
+ pci_max_pasids(to_pci_dev(dev)) - 1,
+ NULL);
+ if (pasid == INVALID_IOASID) {
+ pr_err("Can't allocate default pasid\n");
+ return -ENODEV;
+ }
+ domain->default_pasid = pasid;
+ }
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ /*
+ * iommu->lock must be held to attach domain to iommu and setup the
+ * pasid entry for second level translation.
+ */
+ spin_lock(&iommu->lock);
+ ret = domain_attach_iommu(domain, iommu);
+ if (ret)
+ goto attach_failed;
+
+ /* Setup the PASID entry for mediated devices: */
+ if (domain_use_first_level(domain))
+ ret = domain_setup_first_level(iommu, domain, dev,
+ domain->default_pasid);
+ else
+ ret = intel_pasid_setup_second_level(iommu, domain, dev,
+ domain->default_pasid);
+ if (ret)
+ goto table_failed;
+ spin_unlock(&iommu->lock);
+
+ auxiliary_link_device(domain, dev);
+
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ return 0;
+
+table_failed:
+ domain_detach_iommu(domain, iommu);
+attach_failed:
+ spin_unlock(&iommu->lock);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+ if (!domain->auxd_refcnt && domain->default_pasid > 0)
+ ioasid_free(domain->default_pasid);
+
+ return ret;
+}
+
+static void aux_domain_remove_dev(struct dmar_domain *domain,
+ struct device *dev)
+{
+ struct device_domain_info *info;
+ struct intel_iommu *iommu;
+ unsigned long flags;
+
+ if (!is_aux_domain(dev, &domain->domain))
+ return;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ info = get_domain_info(dev);
+ iommu = info->iommu;
+
+ auxiliary_unlink_device(domain, dev);
+
+ spin_lock(&iommu->lock);
+ intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
+ domain_detach_iommu(domain, iommu);
+ spin_unlock(&iommu->lock);
+
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
+static int prepare_domain_attach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct intel_iommu *iommu;
+ int addr_width;
+
+ iommu = device_to_iommu(dev, NULL, NULL);
+ if (!iommu)
+ return -ENODEV;
+
+ /* check if this iommu agaw is sufficient for max mapped address */
+ addr_width = agaw_to_width(iommu->agaw);
+ if (addr_width > cap_mgaw(iommu->cap))
+ addr_width = cap_mgaw(iommu->cap);
+
+ if (dmar_domain->max_addr > (1LL << addr_width)) {
+ dev_err(dev, "%s: iommu width (%d) is not "
+ "sufficient for the mapped address (%llx)\n",
+ __func__, addr_width, dmar_domain->max_addr);
+ return -EFAULT;
+ }
+ dmar_domain->gaw = addr_width;
+
+ /*
+ * Knock out extra levels of page tables if necessary
+ */
+ while (iommu->agaw < dmar_domain->agaw) {
+ struct dma_pte *pte;
+
+ pte = dmar_domain->pgd;
+ if (dma_pte_present(pte)) {
+ dmar_domain->pgd = (struct dma_pte *)
+ phys_to_virt(dma_pte_addr(pte));
+ free_pgtable_page(pte);
+ }
+ dmar_domain->agaw--;
+ }
+
+ return 0;
+}
+
+static int intel_iommu_attach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ int ret;
+
+ if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
+ device_is_rmrr_locked(dev)) {
+ dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
+ return -EPERM;
+ }
+
+ if (is_aux_domain(dev, domain))
+ return -EPERM;
+
+ /* normally dev is not mapped */
+ if (unlikely(domain_context_mapped(dev))) {
+ struct dmar_domain *old_domain;
+
+ old_domain = find_domain(dev);
+ if (old_domain)
+ dmar_remove_one_dev_info(dev);
+ }
+
+ ret = prepare_domain_attach_device(domain, dev);
+ if (ret)
+ return ret;
+
+ return domain_add_dev_info(to_dmar_domain(domain), dev);
+}
+
+static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ int ret;
+
+ if (!is_aux_domain(dev, domain))
+ return -EPERM;
+
+ ret = prepare_domain_attach_device(domain, dev);
+ if (ret)
+ return ret;
+
+ return aux_domain_add_dev(to_dmar_domain(domain), dev);
+}
+
+static void intel_iommu_detach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ dmar_remove_one_dev_info(dev);
+}
+
+static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ aux_domain_remove_dev(to_dmar_domain(domain), dev);
+}
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+/*
+ * 2D array for converting and sanitizing IOMMU generic TLB granularity to
+ * VT-d granularity. Invalidation is typically included in the unmap operation
+ * as a result of DMA or VFIO unmap. However, for assigned devices guest
+ * owns the first level page tables. Invalidations of translation caches in the
+ * guest are trapped and passed down to the host.
+ *
+ * vIOMMU in the guest will only expose first level page tables, therefore
+ * we do not support IOTLB granularity for request without PASID (second level).
+ *
+ * For example, to find the VT-d granularity encoding for IOTLB
+ * type and page selective granularity within PASID:
+ * X: indexed by iommu cache type
+ * Y: indexed by enum iommu_inv_granularity
+ * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
+ */
+
+static const int
+inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
+ /*
+ * PASID based IOTLB invalidation: PASID selective (per PASID),
+ * page selective (address granularity)
+ */
+ {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
+ /* PASID based dev TLBs */
+ {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
+ /* PASID cache */
+ {-EINVAL, -EINVAL, -EINVAL}
+};
+
+static inline int to_vtd_granularity(int type, int granu)
+{
+ return inv_type_granu_table[type][granu];
+}
+
+static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
+{
+ u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
+
+ /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
+ * IOMMU cache invalidate API passes granu_size in bytes, and number of
+ * granu size in contiguous memory.
+ */
+ return order_base_2(nr_pages);
+}
+
+static int
+intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
+ struct iommu_cache_invalidate_info *inv_info)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct device_domain_info *info;
+ struct intel_iommu *iommu;
+ unsigned long flags;
+ int cache_type;
+ u8 bus, devfn;
+ u16 did, sid;
+ int ret = 0;
+ u64 size = 0;
+
+ if (!inv_info || !dmar_domain)
+ return -EINVAL;
+
+ if (!dev || !dev_is_pci(dev))
+ return -ENODEV;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return -ENODEV;
+
+ if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
+ return -EINVAL;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ spin_lock(&iommu->lock);
+ info = get_domain_info(dev);
+ if (!info) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ did = dmar_domain->iommu_did[iommu->seq_id];
+ sid = PCI_DEVID(bus, devfn);
+
+ /* Size is only valid in address selective invalidation */
+ if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
+ size = to_vtd_size(inv_info->granu.addr_info.granule_size,
+ inv_info->granu.addr_info.nb_granules);
+
+ for_each_set_bit(cache_type,
+ (unsigned long *)&inv_info->cache,
+ IOMMU_CACHE_INV_TYPE_NR) {
+ int granu = 0;
+ u64 pasid = 0;
+ u64 addr = 0;
+
+ granu = to_vtd_granularity(cache_type, inv_info->granularity);
+ if (granu == -EINVAL) {
+ pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
+ cache_type, inv_info->granularity);
+ break;
+ }
+
+ /*
+ * PASID is stored in different locations based on the
+ * granularity.
+ */
+ if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
+ (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
+ pasid = inv_info->granu.pasid_info.pasid;
+ else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
+ (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
+ pasid = inv_info->granu.addr_info.pasid;
+
+ switch (BIT(cache_type)) {
+ case IOMMU_CACHE_INV_TYPE_IOTLB:
+ /* HW will ignore LSB bits based on address mask */
+ if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
+ size &&
+ (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
+ pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
+ inv_info->granu.addr_info.addr, size);
+ }
+
+ /*
+ * If granu is PASID-selective, address is ignored.
+ * We use npages = -1 to indicate that.
+ */
+ qi_flush_piotlb(iommu, did, pasid,
+ mm_to_dma_pfn(inv_info->granu.addr_info.addr),
+ (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
+ inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
+
+ if (!info->ats_enabled)
+ break;
+ /*
+ * Always flush device IOTLB if ATS is enabled. vIOMMU
+ * in the guest may assume IOTLB flush is inclusive,
+ * which is more efficient.
+ */
+ fallthrough;
+ case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
+ /*
+ * PASID based device TLB invalidation does not support
+ * IOMMU_INV_GRANU_PASID granularity but only supports
+ * IOMMU_INV_GRANU_ADDR.
+ * The equivalent of that is we set the size to be the
+ * entire range of 64 bit. User only provides PASID info
+ * without address info. So we set addr to 0.
+ */
+ if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
+ size = 64 - VTD_PAGE_SHIFT;
+ addr = 0;
+ } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
+ addr = inv_info->granu.addr_info.addr;
+ }
+
+ if (info->ats_enabled)
+ qi_flush_dev_iotlb_pasid(iommu, sid,
+ info->pfsid, pasid,
+ info->ats_qdep, addr,
+ size);
+ else
+ pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
+ break;
+ default:
+ dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
+ cache_type);
+ ret = -EINVAL;
+ }
+ }
+out_unlock:
+ spin_unlock(&iommu->lock);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ return ret;
+}
+#endif
+
+static int intel_iommu_map(struct iommu_domain *domain,
+ unsigned long iova, phys_addr_t hpa,
+ size_t size, int iommu_prot, gfp_t gfp)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ u64 max_addr;
+ int prot = 0;
+ int ret;
+
+ if (iommu_prot & IOMMU_READ)
+ prot |= DMA_PTE_READ;
+ if (iommu_prot & IOMMU_WRITE)
+ prot |= DMA_PTE_WRITE;
+ if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
+ prot |= DMA_PTE_SNP;
+
+ max_addr = iova + size;
+ if (dmar_domain->max_addr < max_addr) {
+ u64 end;
+
+ /* check if minimum agaw is sufficient for mapped address */
+ end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
+ if (end < max_addr) {
+ pr_err("%s: iommu width (%d) is not "
+ "sufficient for the mapped address (%llx)\n",
+ __func__, dmar_domain->gaw, max_addr);
+ return -EFAULT;
+ }
+ dmar_domain->max_addr = max_addr;
+ }
+ /* Round up size to next multiple of PAGE_SIZE, if it and
+ the low bits of hpa would take us onto the next page */
+ size = aligned_nrpages(hpa, size);
+ ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
+ hpa >> VTD_PAGE_SHIFT, size, prot);
+ return ret;
+}
+
+static size_t intel_iommu_unmap(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ struct iommu_iotlb_gather *gather)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct page *freelist = NULL;
+ unsigned long start_pfn, last_pfn;
+ unsigned int npages;
+ int iommu_id, level = 0;
+
+ /* Cope with horrid API which requires us to unmap more than the
+ size argument if it happens to be a large-page mapping. */
+ BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
+
+ if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
+ size = VTD_PAGE_SIZE << level_to_offset_bits(level);
+
+ start_pfn = iova >> VTD_PAGE_SHIFT;
+ last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
+
+ freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
+
+ npages = last_pfn - start_pfn + 1;
+
+ for_each_domain_iommu(iommu_id, dmar_domain)
+ iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
+ start_pfn, npages, !freelist, 0);
+
+ dma_free_pagelist(freelist);
+
+ if (dmar_domain->max_addr == iova + size)
+ dmar_domain->max_addr = iova;
+
+ return size;
+}
+
+static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct dma_pte *pte;
+ int level = 0;
+ u64 phys = 0;
+
+ pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
+ if (pte && dma_pte_present(pte))
+ phys = dma_pte_addr(pte) +
+ (iova & (BIT_MASK(level_to_offset_bits(level) +
+ VTD_PAGE_SHIFT) - 1));
+
+ return phys;
+}
+
+static inline bool scalable_mode_support(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ bool ret = true;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ if (!sm_supported(iommu)) {
+ ret = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline bool iommu_pasid_support(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ bool ret = true;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ if (!pasid_supported(iommu)) {
+ ret = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline bool nested_mode_support(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ bool ret = true;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
+ ret = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static bool intel_iommu_capable(enum iommu_cap cap)
+{
+ if (cap == IOMMU_CAP_CACHE_COHERENCY)
+ return domain_update_iommu_snooping(NULL) == 1;
+ if (cap == IOMMU_CAP_INTR_REMAP)
+ return irq_remapping_enabled == 1;
+
+ return false;
+}
+
+static struct iommu_device *intel_iommu_probe_device(struct device *dev)
+{
+ struct intel_iommu *iommu;
+
+ iommu = device_to_iommu(dev, NULL, NULL);
+ if (!iommu)
+ return ERR_PTR(-ENODEV);
+
+ if (translation_pre_enabled(iommu))
+ dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
+
+ return &iommu->iommu;
+}
+
+static void intel_iommu_release_device(struct device *dev)
+{
+ struct intel_iommu *iommu;
+
+ iommu = device_to_iommu(dev, NULL, NULL);
+ if (!iommu)
+ return;
+
+ dmar_remove_one_dev_info(dev);
+
+ set_dma_ops(dev, NULL);
+}
+
+static void intel_iommu_probe_finalize(struct device *dev)
+{
+ struct iommu_domain *domain;
+
+ domain = iommu_get_domain_for_dev(dev);
+ if (device_needs_bounce(dev))
+ set_dma_ops(dev, &bounce_dma_ops);
+ else if (domain && domain->type == IOMMU_DOMAIN_DMA)
+ set_dma_ops(dev, &intel_dma_ops);
+ else
+ set_dma_ops(dev, NULL);
+}
+
+static void intel_iommu_get_resv_regions(struct device *device,
+ struct list_head *head)
+{
+ int prot = DMA_PTE_READ | DMA_PTE_WRITE;
+ struct iommu_resv_region *reg;
+ struct dmar_rmrr_unit *rmrr;
+ struct device *i_dev;
+ int i;
+
+ down_read(&dmar_global_lock);
+ for_each_rmrr_units(rmrr) {
+ for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
+ i, i_dev) {
+ struct iommu_resv_region *resv;
+ enum iommu_resv_type type;
+ size_t length;
+
+ if (i_dev != device &&
+ !is_downstream_to_pci_bridge(device, i_dev))
+ continue;
+
+ length = rmrr->end_address - rmrr->base_address + 1;
+
+ type = device_rmrr_is_relaxable(device) ?
+ IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
+
+ resv = iommu_alloc_resv_region(rmrr->base_address,
+ length, prot, type);
+ if (!resv)
+ break;
+
+ list_add_tail(&resv->list, head);
+ }
+ }
+ up_read(&dmar_global_lock);
+
+#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
+ if (dev_is_pci(device)) {
+ struct pci_dev *pdev = to_pci_dev(device);
+
+ if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
+ reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
+ IOMMU_RESV_DIRECT_RELAXABLE);
+ if (reg)
+ list_add_tail(&reg->list, head);
+ }
+ }
+#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
+
+ reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
+ IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
+ 0, IOMMU_RESV_MSI);
+ if (!reg)
+ return;
+ list_add_tail(&reg->list, head);
+}
+
+int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
+{
+ struct device_domain_info *info;
+ struct context_entry *context;
+ struct dmar_domain *domain;
+ unsigned long flags;
+ u64 ctx_lo;
+ int ret;
+
+ domain = find_domain(dev);
+ if (!domain)
+ return -EINVAL;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ spin_lock(&iommu->lock);
+
+ ret = -EINVAL;
+ info = get_domain_info(dev);
+ if (!info || !info->pasid_supported)
+ goto out;
+
+ context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
+ if (WARN_ON(!context))
+ goto out;
+
+ ctx_lo = context[0].lo;
+
+ if (!(ctx_lo & CONTEXT_PASIDE)) {
+ ctx_lo |= CONTEXT_PASIDE;
+ context[0].lo = ctx_lo;
+ wmb();
+ iommu->flush.flush_context(iommu,
+ domain->iommu_did[iommu->seq_id],
+ PCI_DEVID(info->bus, info->devfn),
+ DMA_CCMD_MASK_NOBIT,
+ DMA_CCMD_DEVICE_INVL);
+ }
+
+ /* Enable PASID support in the device, if it wasn't already */
+ if (!info->pasid_enabled)
+ iommu_enable_dev_iotlb(info);
+
+ ret = 0;
+
+ out:
+ spin_unlock(&iommu->lock);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ return ret;
+}
+
+static void intel_iommu_apply_resv_region(struct device *dev,
+ struct iommu_domain *domain,
+ struct iommu_resv_region *region)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ unsigned long start, end;
+
+ start = IOVA_PFN(region->start);
+ end = IOVA_PFN(region->start + region->length - 1);
+
+ WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
+}
+
+static struct iommu_group *intel_iommu_device_group(struct device *dev)
+{
+ if (dev_is_pci(dev))
+ return pci_device_group(dev);
+ return generic_device_group(dev);
+}
+
+static int intel_iommu_enable_auxd(struct device *dev)
+{
+ struct device_domain_info *info;
+ struct intel_iommu *iommu;
+ unsigned long flags;
+ int ret;
+
+ iommu = device_to_iommu(dev, NULL, NULL);
+ if (!iommu || dmar_disabled)
+ return -EINVAL;
+
+ if (!sm_supported(iommu) || !pasid_supported(iommu))
+ return -EINVAL;
+
+ ret = intel_iommu_enable_pasid(iommu, dev);
+ if (ret)
+ return -ENODEV;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ info = get_domain_info(dev);
+ info->auxd_enabled = 1;
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ return 0;
+}
+
+static int intel_iommu_disable_auxd(struct device *dev)
+{
+ struct device_domain_info *info;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ info = get_domain_info(dev);
+ if (!WARN_ON(!info))
+ info->auxd_enabled = 0;
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ return 0;
+}
+
+/*
+ * A PCI express designated vendor specific extended capability is defined
+ * in the section 3.7 of Intel scalable I/O virtualization technical spec
+ * for system software and tools to detect endpoint devices supporting the
+ * Intel scalable IO virtualization without host driver dependency.
+ *
+ * Returns the address of the matching extended capability structure within
+ * the device's PCI configuration space or 0 if the device does not support
+ * it.
+ */
+static int siov_find_pci_dvsec(struct pci_dev *pdev)
+{
+ int pos;
+ u16 vendor, id;
+
+ pos = pci_find_next_ext_capability(pdev, 0, 0x23);
+ while (pos) {
+ pci_read_config_word(pdev, pos + 4, &vendor);
+ pci_read_config_word(pdev, pos + 8, &id);
+ if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
+ return pos;
+
+ pos = pci_find_next_ext_capability(pdev, pos, 0x23);
+ }
+
+ return 0;
+}
+
+static bool
+intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
+{
+ if (feat == IOMMU_DEV_FEAT_AUX) {
+ int ret;
+
+ if (!dev_is_pci(dev) || dmar_disabled ||
+ !scalable_mode_support() || !iommu_pasid_support())
+ return false;
+
+ ret = pci_pasid_features(to_pci_dev(dev));
+ if (ret < 0)
+ return false;
+
+ return !!siov_find_pci_dvsec(to_pci_dev(dev));
+ }
+
+ if (feat == IOMMU_DEV_FEAT_SVA) {
+ struct device_domain_info *info = get_domain_info(dev);
+
+ return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
+ info->pasid_supported && info->pri_supported &&
+ info->ats_supported;
+ }
+
+ return false;
+}
+
+static int
+intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
+{
+ if (feat == IOMMU_DEV_FEAT_AUX)
+ return intel_iommu_enable_auxd(dev);
+
+ if (feat == IOMMU_DEV_FEAT_SVA) {
+ struct device_domain_info *info = get_domain_info(dev);
+
+ if (!info)
+ return -EINVAL;
+
+ if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
+ return 0;
+ }
+
+ return -ENODEV;
+}
+
+static int
+intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
+{
+ if (feat == IOMMU_DEV_FEAT_AUX)
+ return intel_iommu_disable_auxd(dev);
+
+ return -ENODEV;
+}
+
+static bool
+intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
+{
+ struct device_domain_info *info = get_domain_info(dev);
+
+ if (feat == IOMMU_DEV_FEAT_AUX)
+ return scalable_mode_support() && info && info->auxd_enabled;
+
+ return false;
+}
+
+static int
+intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+
+ return dmar_domain->default_pasid > 0 ?
+ dmar_domain->default_pasid : -EINVAL;
+}
+
+static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
+ struct device *dev)
+{
+ return attach_deferred(dev);
+}
+
+static int
+intel_iommu_domain_set_attr(struct iommu_domain *domain,
+ enum iommu_attr attr, void *data)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ unsigned long flags;
+ int ret = 0;
+
+ if (domain->type != IOMMU_DOMAIN_UNMANAGED)
+ return -EINVAL;
+
+ switch (attr) {
+ case DOMAIN_ATTR_NESTING:
+ spin_lock_irqsave(&device_domain_lock, flags);
+ if (nested_mode_support() &&
+ list_empty(&dmar_domain->devices)) {
+ dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
+ dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
+ } else {
+ ret = -ENODEV;
+ }
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Check that the device does not live on an external facing PCI port that is
+ * marked as untrusted. Such devices should not be able to apply quirks and
+ * thus not be able to bypass the IOMMU restrictions.
+ */
+static bool risky_device(struct pci_dev *pdev)
+{
+ if (pdev->untrusted) {
+ pci_info(pdev,
+ "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
+ pdev->vendor, pdev->device);
+ pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
+ return true;
+ }
+ return false;
+}
+
+const struct iommu_ops intel_iommu_ops = {
+ .capable = intel_iommu_capable,
+ .domain_alloc = intel_iommu_domain_alloc,
+ .domain_free = intel_iommu_domain_free,
+ .domain_set_attr = intel_iommu_domain_set_attr,
+ .attach_dev = intel_iommu_attach_device,
+ .detach_dev = intel_iommu_detach_device,
+ .aux_attach_dev = intel_iommu_aux_attach_device,
+ .aux_detach_dev = intel_iommu_aux_detach_device,
+ .aux_get_pasid = intel_iommu_aux_get_pasid,
+ .map = intel_iommu_map,
+ .unmap = intel_iommu_unmap,
+ .iova_to_phys = intel_iommu_iova_to_phys,
+ .probe_device = intel_iommu_probe_device,
+ .probe_finalize = intel_iommu_probe_finalize,
+ .release_device = intel_iommu_release_device,
+ .get_resv_regions = intel_iommu_get_resv_regions,
+ .put_resv_regions = generic_iommu_put_resv_regions,
+ .apply_resv_region = intel_iommu_apply_resv_region,
+ .device_group = intel_iommu_device_group,
+ .dev_has_feat = intel_iommu_dev_has_feat,
+ .dev_feat_enabled = intel_iommu_dev_feat_enabled,
+ .dev_enable_feat = intel_iommu_dev_enable_feat,
+ .dev_disable_feat = intel_iommu_dev_disable_feat,
+ .is_attach_deferred = intel_iommu_is_attach_deferred,
+ .def_domain_type = device_def_domain_type,
+ .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
+#ifdef CONFIG_INTEL_IOMMU_SVM
+ .cache_invalidate = intel_iommu_sva_invalidate,
+ .sva_bind_gpasid = intel_svm_bind_gpasid,
+ .sva_unbind_gpasid = intel_svm_unbind_gpasid,
+ .sva_bind = intel_svm_bind,
+ .sva_unbind = intel_svm_unbind,
+ .sva_get_pasid = intel_svm_get_pasid,
+ .page_response = intel_svm_page_response,
+#endif
+};
+
+static void quirk_iommu_igfx(struct pci_dev *dev)
+{
+ if (risky_device(dev))
+ return;
+
+ pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
+ dmar_map_gfx = 0;
+}
+
+/* G4x/GM45 integrated gfx dmar support is totally busted. */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
+
+/* Broadwell igfx malfunctions with dmar */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
+
+static void quirk_iommu_rwbf(struct pci_dev *dev)
+{
+ if (risky_device(dev))
+ return;
+
+ /*
+ * Mobile 4 Series Chipset neglects to set RWBF capability,
+ * but needs it. Same seems to hold for the desktop versions.
+ */
+ pci_info(dev, "Forcing write-buffer flush capability\n");
+ rwbf_quirk = 1;
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
+
+#define GGC 0x52
+#define GGC_MEMORY_SIZE_MASK (0xf << 8)
+#define GGC_MEMORY_SIZE_NONE (0x0 << 8)
+#define GGC_MEMORY_SIZE_1M (0x1 << 8)
+#define GGC_MEMORY_SIZE_2M (0x3 << 8)
+#define GGC_MEMORY_VT_ENABLED (0x8 << 8)
+#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
+#define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
+#define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
+
+static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
+{
+ unsigned short ggc;
+
+ if (risky_device(dev))
+ return;
+
+ if (pci_read_config_word(dev, GGC, &ggc))
+ return;
+
+ if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
+ pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
+ dmar_map_gfx = 0;
+ } else if (dmar_map_gfx) {
+ /* we have to ensure the gfx device is idle before we flush */
+ pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
+ intel_iommu_strict = 1;
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
+
+static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
+{
+ unsigned short ver;
+
+ if (!IS_GFX_DEVICE(dev))
+ return;
+
+ ver = (dev->device >> 8) & 0xff;
+ if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
+ ver != 0x4e && ver != 0x8a && ver != 0x98 &&
+ ver != 0x9a && ver != 0xa7 && ver != 0x7d)
+ return;
+
+ if (risky_device(dev))
+ return;
+
+ pci_info(dev, "Skip IOMMU disabling for graphics\n");
+ iommu_skip_te_disable = 1;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
+
+/* On Tylersburg chipsets, some BIOSes have been known to enable the
+ ISOCH DMAR unit for the Azalia sound device, but not give it any
+ TLB entries, which causes it to deadlock. Check for that. We do
+ this in a function called from init_dmars(), instead of in a PCI
+ quirk, because we don't want to print the obnoxious "BIOS broken"
+ message if VT-d is actually disabled.
+*/
+static void __init check_tylersburg_isoch(void)
+{
+ struct pci_dev *pdev;
+ uint32_t vtisochctrl;
+
+ /* If there's no Azalia in the system anyway, forget it. */
+ pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
+ if (!pdev)
+ return;
+
+ if (risky_device(pdev)) {
+ pci_dev_put(pdev);
+ return;
+ }
+
+ pci_dev_put(pdev);
+
+ /* System Management Registers. Might be hidden, in which case
+ we can't do the sanity check. But that's OK, because the
+ known-broken BIOSes _don't_ actually hide it, so far. */
+ pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
+ if (!pdev)
+ return;
+
+ if (risky_device(pdev)) {
+ pci_dev_put(pdev);
+ return;
+ }
+
+ if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
+ pci_dev_put(pdev);
+ return;
+ }
+
+ pci_dev_put(pdev);
+
+ /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
+ if (vtisochctrl & 1)
+ return;
+
+ /* Drop all bits other than the number of TLB entries */
+ vtisochctrl &= 0x1c;
+
+ /* If we have the recommended number of TLB entries (16), fine. */
+ if (vtisochctrl == 0x10)
+ return;
+
+ /* Zero TLB entries? You get to ride the short bus to school. */
+ if (!vtisochctrl) {
+ WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
+ "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+ dmi_get_system_info(DMI_BIOS_VENDOR),
+ dmi_get_system_info(DMI_BIOS_VERSION),
+ dmi_get_system_info(DMI_PRODUCT_VERSION));
+ iommu_identity_mapping |= IDENTMAP_AZALIA;
+ return;
+ }
+
+ pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
+ vtisochctrl);
+}
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
new file mode 100644
index 000000000..b85388877
--- /dev/null
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -0,0 +1,1531 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "DMAR-IR: " fmt
+
+#include <linux/interrupt.h>
+#include <linux/dmar.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+#include <linux/hpet.h>
+#include <linux/pci.h>
+#include <linux/irq.h>
+#include <linux/intel-iommu.h>
+#include <linux/acpi.h>
+#include <linux/irqdomain.h>
+#include <linux/crash_dump.h>
+#include <asm/io_apic.h>
+#include <asm/apic.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <asm/irq_remapping.h>
+#include <asm/pci-direct.h>
+#include <asm/msidef.h>
+
+#include "../irq_remapping.h"
+
+enum irq_mode {
+ IRQ_REMAPPING,
+ IRQ_POSTING,
+};
+
+struct ioapic_scope {
+ struct intel_iommu *iommu;
+ unsigned int id;
+ unsigned int bus; /* PCI bus number */
+ unsigned int devfn; /* PCI devfn number */
+};
+
+struct hpet_scope {
+ struct intel_iommu *iommu;
+ u8 id;
+ unsigned int bus;
+ unsigned int devfn;
+};
+
+struct irq_2_iommu {
+ struct intel_iommu *iommu;
+ u16 irte_index;
+ u16 sub_handle;
+ u8 irte_mask;
+ enum irq_mode mode;
+};
+
+struct intel_ir_data {
+ struct irq_2_iommu irq_2_iommu;
+ struct irte irte_entry;
+ union {
+ struct msi_msg msi_entry;
+ };
+};
+
+#define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0)
+#define IRTE_DEST(dest) ((eim_mode) ? dest : dest << 8)
+
+static int __read_mostly eim_mode;
+static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
+static struct hpet_scope ir_hpet[MAX_HPET_TBS];
+
+/*
+ * Lock ordering:
+ * ->dmar_global_lock
+ * ->irq_2_ir_lock
+ * ->qi->q_lock
+ * ->iommu->register_lock
+ * Note:
+ * intel_irq_remap_ops.{supported,prepare,enable,disable,reenable} are called
+ * in single-threaded environment with interrupt disabled, so no need to tabke
+ * the dmar_global_lock.
+ */
+DEFINE_RAW_SPINLOCK(irq_2_ir_lock);
+static const struct irq_domain_ops intel_ir_domain_ops;
+
+static void iommu_disable_irq_remapping(struct intel_iommu *iommu);
+static int __init parse_ioapics_under_ir(void);
+
+static bool ir_pre_enabled(struct intel_iommu *iommu)
+{
+ return (iommu->flags & VTD_FLAG_IRQ_REMAP_PRE_ENABLED);
+}
+
+static void clear_ir_pre_enabled(struct intel_iommu *iommu)
+{
+ iommu->flags &= ~VTD_FLAG_IRQ_REMAP_PRE_ENABLED;
+}
+
+static void init_ir_status(struct intel_iommu *iommu)
+{
+ u32 gsts;
+
+ gsts = readl(iommu->reg + DMAR_GSTS_REG);
+ if (gsts & DMA_GSTS_IRES)
+ iommu->flags |= VTD_FLAG_IRQ_REMAP_PRE_ENABLED;
+}
+
+static int alloc_irte(struct intel_iommu *iommu,
+ struct irq_2_iommu *irq_iommu, u16 count)
+{
+ struct ir_table *table = iommu->ir_table;
+ unsigned int mask = 0;
+ unsigned long flags;
+ int index;
+
+ if (!count || !irq_iommu)
+ return -1;
+
+ if (count > 1) {
+ count = __roundup_pow_of_two(count);
+ mask = ilog2(count);
+ }
+
+ if (mask > ecap_max_handle_mask(iommu->ecap)) {
+ pr_err("Requested mask %x exceeds the max invalidation handle"
+ " mask value %Lx\n", mask,
+ ecap_max_handle_mask(iommu->ecap));
+ return -1;
+ }
+
+ raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+ index = bitmap_find_free_region(table->bitmap,
+ INTR_REMAP_TABLE_ENTRIES, mask);
+ if (index < 0) {
+ pr_warn("IR%d: can't allocate an IRTE\n", iommu->seq_id);
+ } else {
+ irq_iommu->iommu = iommu;
+ irq_iommu->irte_index = index;
+ irq_iommu->sub_handle = 0;
+ irq_iommu->irte_mask = mask;
+ irq_iommu->mode = IRQ_REMAPPING;
+ }
+ raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+
+ return index;
+}
+
+static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask)
+{
+ struct qi_desc desc;
+
+ desc.qw0 = QI_IEC_IIDEX(index) | QI_IEC_TYPE | QI_IEC_IM(mask)
+ | QI_IEC_SELECTIVE;
+ desc.qw1 = 0;
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+
+ return qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+static int modify_irte(struct irq_2_iommu *irq_iommu,
+ struct irte *irte_modified)
+{
+ struct intel_iommu *iommu;
+ unsigned long flags;
+ struct irte *irte;
+ int rc, index;
+
+ if (!irq_iommu)
+ return -1;
+
+ raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+
+ iommu = irq_iommu->iommu;
+
+ index = irq_iommu->irte_index + irq_iommu->sub_handle;
+ irte = &iommu->ir_table->base[index];
+
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE)
+ if ((irte->pst == 1) || (irte_modified->pst == 1)) {
+ bool ret;
+
+ ret = cmpxchg_double(&irte->low, &irte->high,
+ irte->low, irte->high,
+ irte_modified->low, irte_modified->high);
+ /*
+ * We use cmpxchg16 to atomically update the 128-bit IRTE,
+ * and it cannot be updated by the hardware or other processors
+ * behind us, so the return value of cmpxchg16 should be the
+ * same as the old value.
+ */
+ WARN_ON(!ret);
+ } else
+#endif
+ {
+ set_64bit(&irte->low, irte_modified->low);
+ set_64bit(&irte->high, irte_modified->high);
+ }
+ __iommu_flush_cache(iommu, irte, sizeof(*irte));
+
+ rc = qi_flush_iec(iommu, index, 0);
+
+ /* Update iommu mode according to the IRTE mode */
+ irq_iommu->mode = irte->pst ? IRQ_POSTING : IRQ_REMAPPING;
+ raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+
+ return rc;
+}
+
+static struct irq_domain *map_hpet_to_ir(u8 hpet_id)
+{
+ int i;
+
+ for (i = 0; i < MAX_HPET_TBS; i++) {
+ if (ir_hpet[i].id == hpet_id && ir_hpet[i].iommu)
+ return ir_hpet[i].iommu->ir_domain;
+ }
+ return NULL;
+}
+
+static struct intel_iommu *map_ioapic_to_iommu(int apic)
+{
+ int i;
+
+ for (i = 0; i < MAX_IO_APICS; i++) {
+ if (ir_ioapic[i].id == apic && ir_ioapic[i].iommu)
+ return ir_ioapic[i].iommu;
+ }
+ return NULL;
+}
+
+static struct irq_domain *map_ioapic_to_ir(int apic)
+{
+ struct intel_iommu *iommu = map_ioapic_to_iommu(apic);
+
+ return iommu ? iommu->ir_domain : NULL;
+}
+
+static struct irq_domain *map_dev_to_ir(struct pci_dev *dev)
+{
+ struct dmar_drhd_unit *drhd = dmar_find_matched_drhd_unit(dev);
+
+ return drhd ? drhd->iommu->ir_msi_domain : NULL;
+}
+
+static int clear_entries(struct irq_2_iommu *irq_iommu)
+{
+ struct irte *start, *entry, *end;
+ struct intel_iommu *iommu;
+ int index;
+
+ if (irq_iommu->sub_handle)
+ return 0;
+
+ iommu = irq_iommu->iommu;
+ index = irq_iommu->irte_index;
+
+ start = iommu->ir_table->base + index;
+ end = start + (1 << irq_iommu->irte_mask);
+
+ for (entry = start; entry < end; entry++) {
+ set_64bit(&entry->low, 0);
+ set_64bit(&entry->high, 0);
+ }
+ bitmap_release_region(iommu->ir_table->bitmap, index,
+ irq_iommu->irte_mask);
+
+ return qi_flush_iec(iommu, index, irq_iommu->irte_mask);
+}
+
+/*
+ * source validation type
+ */
+#define SVT_NO_VERIFY 0x0 /* no verification is required */
+#define SVT_VERIFY_SID_SQ 0x1 /* verify using SID and SQ fields */
+#define SVT_VERIFY_BUS 0x2 /* verify bus of request-id */
+
+/*
+ * source-id qualifier
+ */
+#define SQ_ALL_16 0x0 /* verify all 16 bits of request-id */
+#define SQ_13_IGNORE_1 0x1 /* verify most significant 13 bits, ignore
+ * the third least significant bit
+ */
+#define SQ_13_IGNORE_2 0x2 /* verify most significant 13 bits, ignore
+ * the second and third least significant bits
+ */
+#define SQ_13_IGNORE_3 0x3 /* verify most significant 13 bits, ignore
+ * the least three significant bits
+ */
+
+/*
+ * set SVT, SQ and SID fields of irte to verify
+ * source ids of interrupt requests
+ */
+static void set_irte_sid(struct irte *irte, unsigned int svt,
+ unsigned int sq, unsigned int sid)
+{
+ if (disable_sourceid_checking)
+ svt = SVT_NO_VERIFY;
+ irte->svt = svt;
+ irte->sq = sq;
+ irte->sid = sid;
+}
+
+/*
+ * Set an IRTE to match only the bus number. Interrupt requests that reference
+ * this IRTE must have a requester-id whose bus number is between or equal
+ * to the start_bus and end_bus arguments.
+ */
+static void set_irte_verify_bus(struct irte *irte, unsigned int start_bus,
+ unsigned int end_bus)
+{
+ set_irte_sid(irte, SVT_VERIFY_BUS, SQ_ALL_16,
+ (start_bus << 8) | end_bus);
+}
+
+static int set_ioapic_sid(struct irte *irte, int apic)
+{
+ int i;
+ u16 sid = 0;
+
+ if (!irte)
+ return -1;
+
+ down_read(&dmar_global_lock);
+ for (i = 0; i < MAX_IO_APICS; i++) {
+ if (ir_ioapic[i].iommu && ir_ioapic[i].id == apic) {
+ sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn;
+ break;
+ }
+ }
+ up_read(&dmar_global_lock);
+
+ if (sid == 0) {
+ pr_warn("Failed to set source-id of IOAPIC (%d)\n", apic);
+ return -1;
+ }
+
+ set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16, sid);
+
+ return 0;
+}
+
+static int set_hpet_sid(struct irte *irte, u8 id)
+{
+ int i;
+ u16 sid = 0;
+
+ if (!irte)
+ return -1;
+
+ down_read(&dmar_global_lock);
+ for (i = 0; i < MAX_HPET_TBS; i++) {
+ if (ir_hpet[i].iommu && ir_hpet[i].id == id) {
+ sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn;
+ break;
+ }
+ }
+ up_read(&dmar_global_lock);
+
+ if (sid == 0) {
+ pr_warn("Failed to set source-id of HPET block (%d)\n", id);
+ return -1;
+ }
+
+ /*
+ * Should really use SQ_ALL_16. Some platforms are broken.
+ * While we figure out the right quirks for these broken platforms, use
+ * SQ_13_IGNORE_3 for now.
+ */
+ set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_13_IGNORE_3, sid);
+
+ return 0;
+}
+
+struct set_msi_sid_data {
+ struct pci_dev *pdev;
+ u16 alias;
+ int count;
+ int busmatch_count;
+};
+
+static int set_msi_sid_cb(struct pci_dev *pdev, u16 alias, void *opaque)
+{
+ struct set_msi_sid_data *data = opaque;
+
+ if (data->count == 0 || PCI_BUS_NUM(alias) == PCI_BUS_NUM(data->alias))
+ data->busmatch_count++;
+
+ data->pdev = pdev;
+ data->alias = alias;
+ data->count++;
+
+ return 0;
+}
+
+static int set_msi_sid(struct irte *irte, struct pci_dev *dev)
+{
+ struct set_msi_sid_data data;
+
+ if (!irte || !dev)
+ return -1;
+
+ data.count = 0;
+ data.busmatch_count = 0;
+ pci_for_each_dma_alias(dev, set_msi_sid_cb, &data);
+
+ /*
+ * DMA alias provides us with a PCI device and alias. The only case
+ * where the it will return an alias on a different bus than the
+ * device is the case of a PCIe-to-PCI bridge, where the alias is for
+ * the subordinate bus. In this case we can only verify the bus.
+ *
+ * If there are multiple aliases, all with the same bus number,
+ * then all we can do is verify the bus. This is typical in NTB
+ * hardware which use proxy IDs where the device will generate traffic
+ * from multiple devfn numbers on the same bus.
+ *
+ * If the alias device is on a different bus than our source device
+ * then we have a topology based alias, use it.
+ *
+ * Otherwise, the alias is for a device DMA quirk and we cannot
+ * assume that MSI uses the same requester ID. Therefore use the
+ * original device.
+ */
+ if (PCI_BUS_NUM(data.alias) != data.pdev->bus->number)
+ set_irte_verify_bus(irte, PCI_BUS_NUM(data.alias),
+ dev->bus->number);
+ else if (data.count >= 2 && data.busmatch_count == data.count)
+ set_irte_verify_bus(irte, dev->bus->number, dev->bus->number);
+ else if (data.pdev->bus->number != dev->bus->number)
+ set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16, data.alias);
+ else
+ set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16,
+ pci_dev_id(dev));
+
+ return 0;
+}
+
+static int iommu_load_old_irte(struct intel_iommu *iommu)
+{
+ struct irte *old_ir_table;
+ phys_addr_t irt_phys;
+ unsigned int i;
+ size_t size;
+ u64 irta;
+
+ /* Check whether the old ir-table has the same size as ours */
+ irta = dmar_readq(iommu->reg + DMAR_IRTA_REG);
+ if ((irta & INTR_REMAP_TABLE_REG_SIZE_MASK)
+ != INTR_REMAP_TABLE_REG_SIZE)
+ return -EINVAL;
+
+ irt_phys = irta & VTD_PAGE_MASK;
+ size = INTR_REMAP_TABLE_ENTRIES*sizeof(struct irte);
+
+ /* Map the old IR table */
+ old_ir_table = memremap(irt_phys, size, MEMREMAP_WB);
+ if (!old_ir_table)
+ return -ENOMEM;
+
+ /* Copy data over */
+ memcpy(iommu->ir_table->base, old_ir_table, size);
+
+ __iommu_flush_cache(iommu, iommu->ir_table->base, size);
+
+ /*
+ * Now check the table for used entries and mark those as
+ * allocated in the bitmap
+ */
+ for (i = 0; i < INTR_REMAP_TABLE_ENTRIES; i++) {
+ if (iommu->ir_table->base[i].present)
+ bitmap_set(iommu->ir_table->bitmap, i, 1);
+ }
+
+ memunmap(old_ir_table);
+
+ return 0;
+}
+
+
+static void iommu_set_irq_remapping(struct intel_iommu *iommu, int mode)
+{
+ unsigned long flags;
+ u64 addr;
+ u32 sts;
+
+ addr = virt_to_phys((void *)iommu->ir_table->base);
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+ dmar_writeq(iommu->reg + DMAR_IRTA_REG,
+ (addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE);
+
+ /* Set interrupt-remapping table pointer */
+ writel(iommu->gcmd | DMA_GCMD_SIRTP, iommu->reg + DMAR_GCMD_REG);
+
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (sts & DMA_GSTS_IRTPS), sts);
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+ /*
+ * Global invalidation of interrupt entry cache to make sure the
+ * hardware uses the new irq remapping table.
+ */
+ qi_global_iec(iommu);
+}
+
+static void iommu_enable_irq_remapping(struct intel_iommu *iommu)
+{
+ unsigned long flags;
+ u32 sts;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+ /* Enable interrupt-remapping */
+ iommu->gcmd |= DMA_GCMD_IRE;
+ writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (sts & DMA_GSTS_IRES), sts);
+
+ /* Block compatibility-format MSIs */
+ if (sts & DMA_GSTS_CFIS) {
+ iommu->gcmd &= ~DMA_GCMD_CFI;
+ writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, !(sts & DMA_GSTS_CFIS), sts);
+ }
+
+ /*
+ * With CFI clear in the Global Command register, we should be
+ * protected from dangerous (i.e. compatibility) interrupts
+ * regardless of x2apic status. Check just to be sure.
+ */
+ if (sts & DMA_GSTS_CFIS)
+ WARN(1, KERN_WARNING
+ "Compatibility-format IRQs enabled despite intr remapping;\n"
+ "you are vulnerable to IRQ injection.\n");
+
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static int intel_setup_irq_remapping(struct intel_iommu *iommu)
+{
+ struct ir_table *ir_table;
+ struct fwnode_handle *fn;
+ unsigned long *bitmap;
+ struct page *pages;
+
+ if (iommu->ir_table)
+ return 0;
+
+ ir_table = kzalloc(sizeof(struct ir_table), GFP_KERNEL);
+ if (!ir_table)
+ return -ENOMEM;
+
+ pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO,
+ INTR_REMAP_PAGE_ORDER);
+ if (!pages) {
+ pr_err("IR%d: failed to allocate pages of order %d\n",
+ iommu->seq_id, INTR_REMAP_PAGE_ORDER);
+ goto out_free_table;
+ }
+
+ bitmap = bitmap_zalloc(INTR_REMAP_TABLE_ENTRIES, GFP_ATOMIC);
+ if (bitmap == NULL) {
+ pr_err("IR%d: failed to allocate bitmap\n", iommu->seq_id);
+ goto out_free_pages;
+ }
+
+ fn = irq_domain_alloc_named_id_fwnode("INTEL-IR", iommu->seq_id);
+ if (!fn)
+ goto out_free_bitmap;
+
+ iommu->ir_domain =
+ irq_domain_create_hierarchy(arch_get_ir_parent_domain(),
+ 0, INTR_REMAP_TABLE_ENTRIES,
+ fn, &intel_ir_domain_ops,
+ iommu);
+ if (!iommu->ir_domain) {
+ pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id);
+ goto out_free_fwnode;
+ }
+ iommu->ir_msi_domain =
+ arch_create_remap_msi_irq_domain(iommu->ir_domain,
+ "INTEL-IR-MSI",
+ iommu->seq_id);
+
+ ir_table->base = page_address(pages);
+ ir_table->bitmap = bitmap;
+ iommu->ir_table = ir_table;
+
+ /*
+ * If the queued invalidation is already initialized,
+ * shouldn't disable it.
+ */
+ if (!iommu->qi) {
+ /*
+ * Clear previous faults.
+ */
+ dmar_fault(-1, iommu);
+ dmar_disable_qi(iommu);
+
+ if (dmar_enable_qi(iommu)) {
+ pr_err("Failed to enable queued invalidation\n");
+ goto out_free_ir_domain;
+ }
+ }
+
+ init_ir_status(iommu);
+
+ if (ir_pre_enabled(iommu)) {
+ if (!is_kdump_kernel()) {
+ pr_warn("IRQ remapping was enabled on %s but we are not in kdump mode\n",
+ iommu->name);
+ clear_ir_pre_enabled(iommu);
+ iommu_disable_irq_remapping(iommu);
+ } else if (iommu_load_old_irte(iommu))
+ pr_err("Failed to copy IR table for %s from previous kernel\n",
+ iommu->name);
+ else
+ pr_info("Copied IR table for %s from previous kernel\n",
+ iommu->name);
+ }
+
+ iommu_set_irq_remapping(iommu, eim_mode);
+
+ return 0;
+
+out_free_ir_domain:
+ if (iommu->ir_msi_domain)
+ irq_domain_remove(iommu->ir_msi_domain);
+ iommu->ir_msi_domain = NULL;
+ irq_domain_remove(iommu->ir_domain);
+ iommu->ir_domain = NULL;
+out_free_fwnode:
+ irq_domain_free_fwnode(fn);
+out_free_bitmap:
+ bitmap_free(bitmap);
+out_free_pages:
+ __free_pages(pages, INTR_REMAP_PAGE_ORDER);
+out_free_table:
+ kfree(ir_table);
+
+ iommu->ir_table = NULL;
+
+ return -ENOMEM;
+}
+
+static void intel_teardown_irq_remapping(struct intel_iommu *iommu)
+{
+ struct fwnode_handle *fn;
+
+ if (iommu && iommu->ir_table) {
+ if (iommu->ir_msi_domain) {
+ fn = iommu->ir_msi_domain->fwnode;
+
+ irq_domain_remove(iommu->ir_msi_domain);
+ irq_domain_free_fwnode(fn);
+ iommu->ir_msi_domain = NULL;
+ }
+ if (iommu->ir_domain) {
+ fn = iommu->ir_domain->fwnode;
+
+ irq_domain_remove(iommu->ir_domain);
+ irq_domain_free_fwnode(fn);
+ iommu->ir_domain = NULL;
+ }
+ free_pages((unsigned long)iommu->ir_table->base,
+ INTR_REMAP_PAGE_ORDER);
+ bitmap_free(iommu->ir_table->bitmap);
+ kfree(iommu->ir_table);
+ iommu->ir_table = NULL;
+ }
+}
+
+/*
+ * Disable Interrupt Remapping.
+ */
+static void iommu_disable_irq_remapping(struct intel_iommu *iommu)
+{
+ unsigned long flags;
+ u32 sts;
+
+ if (!ecap_ir_support(iommu->ecap))
+ return;
+
+ /*
+ * global invalidation of interrupt entry cache before disabling
+ * interrupt-remapping.
+ */
+ qi_global_iec(iommu);
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+ sts = readl(iommu->reg + DMAR_GSTS_REG);
+ if (!(sts & DMA_GSTS_IRES))
+ goto end;
+
+ iommu->gcmd &= ~DMA_GCMD_IRE;
+ writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, !(sts & DMA_GSTS_IRES), sts);
+
+end:
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static int __init dmar_x2apic_optout(void)
+{
+ struct acpi_table_dmar *dmar;
+ dmar = (struct acpi_table_dmar *)dmar_tbl;
+ if (!dmar || no_x2apic_optout)
+ return 0;
+ return dmar->flags & DMAR_X2APIC_OPT_OUT;
+}
+
+static void __init intel_cleanup_irq_remapping(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+
+ for_each_iommu(iommu, drhd) {
+ if (ecap_ir_support(iommu->ecap)) {
+ iommu_disable_irq_remapping(iommu);
+ intel_teardown_irq_remapping(iommu);
+ }
+ }
+
+ if (x2apic_supported())
+ pr_warn("Failed to enable irq remapping. You are vulnerable to irq-injection attacks.\n");
+}
+
+static int __init intel_prepare_irq_remapping(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ int eim = 0;
+
+ if (irq_remap_broken) {
+ pr_warn("This system BIOS has enabled interrupt remapping\n"
+ "on a chipset that contains an erratum making that\n"
+ "feature unstable. To maintain system stability\n"
+ "interrupt remapping is being disabled. Please\n"
+ "contact your BIOS vendor for an update\n");
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+ return -ENODEV;
+ }
+
+ if (dmar_table_init() < 0)
+ return -ENODEV;
+
+ if (!dmar_ir_support())
+ return -ENODEV;
+
+ if (parse_ioapics_under_ir()) {
+ pr_info("Not enabling interrupt remapping\n");
+ goto error;
+ }
+
+ /* First make sure all IOMMUs support IRQ remapping */
+ for_each_iommu(iommu, drhd)
+ if (!ecap_ir_support(iommu->ecap))
+ goto error;
+
+ /* Detect remapping mode: lapic or x2apic */
+ if (x2apic_supported()) {
+ eim = !dmar_x2apic_optout();
+ if (!eim) {
+ pr_info("x2apic is disabled because BIOS sets x2apic opt out bit.");
+ pr_info("Use 'intremap=no_x2apic_optout' to override the BIOS setting.\n");
+ }
+ }
+
+ for_each_iommu(iommu, drhd) {
+ if (eim && !ecap_eim_support(iommu->ecap)) {
+ pr_info("%s does not support EIM\n", iommu->name);
+ eim = 0;
+ }
+ }
+
+ eim_mode = eim;
+ if (eim)
+ pr_info("Queued invalidation will be enabled to support x2apic and Intr-remapping.\n");
+
+ /* Do the initializations early */
+ for_each_iommu(iommu, drhd) {
+ if (intel_setup_irq_remapping(iommu)) {
+ pr_err("Failed to setup irq remapping for %s\n",
+ iommu->name);
+ goto error;
+ }
+ }
+
+ return 0;
+
+error:
+ intel_cleanup_irq_remapping();
+ return -ENODEV;
+}
+
+/*
+ * Set Posted-Interrupts capability.
+ */
+static inline void set_irq_posting_cap(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+
+ if (!disable_irq_post) {
+ /*
+ * If IRTE is in posted format, the 'pda' field goes across the
+ * 64-bit boundary, we need use cmpxchg16b to atomically update
+ * it. We only expose posted-interrupt when X86_FEATURE_CX16
+ * is supported. Actually, hardware platforms supporting PI
+ * should have X86_FEATURE_CX16 support, this has been confirmed
+ * with Intel hardware guys.
+ */
+ if (boot_cpu_has(X86_FEATURE_CX16))
+ intel_irq_remap_ops.capability |= 1 << IRQ_POSTING_CAP;
+
+ for_each_iommu(iommu, drhd)
+ if (!cap_pi_support(iommu->cap)) {
+ intel_irq_remap_ops.capability &=
+ ~(1 << IRQ_POSTING_CAP);
+ break;
+ }
+ }
+}
+
+static int __init intel_enable_irq_remapping(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ bool setup = false;
+
+ /*
+ * Setup Interrupt-remapping for all the DRHD's now.
+ */
+ for_each_iommu(iommu, drhd) {
+ if (!ir_pre_enabled(iommu))
+ iommu_enable_irq_remapping(iommu);
+ setup = true;
+ }
+
+ if (!setup)
+ goto error;
+
+ irq_remapping_enabled = 1;
+
+ set_irq_posting_cap();
+
+ pr_info("Enabled IRQ remapping in %s mode\n", eim_mode ? "x2apic" : "xapic");
+
+ return eim_mode ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE;
+
+error:
+ intel_cleanup_irq_remapping();
+ return -1;
+}
+
+static int ir_parse_one_hpet_scope(struct acpi_dmar_device_scope *scope,
+ struct intel_iommu *iommu,
+ struct acpi_dmar_hardware_unit *drhd)
+{
+ struct acpi_dmar_pci_path *path;
+ u8 bus;
+ int count, free = -1;
+
+ bus = scope->bus;
+ path = (struct acpi_dmar_pci_path *)(scope + 1);
+ count = (scope->length - sizeof(struct acpi_dmar_device_scope))
+ / sizeof(struct acpi_dmar_pci_path);
+
+ while (--count > 0) {
+ /*
+ * Access PCI directly due to the PCI
+ * subsystem isn't initialized yet.
+ */
+ bus = read_pci_config_byte(bus, path->device, path->function,
+ PCI_SECONDARY_BUS);
+ path++;
+ }
+
+ for (count = 0; count < MAX_HPET_TBS; count++) {
+ if (ir_hpet[count].iommu == iommu &&
+ ir_hpet[count].id == scope->enumeration_id)
+ return 0;
+ else if (ir_hpet[count].iommu == NULL && free == -1)
+ free = count;
+ }
+ if (free == -1) {
+ pr_warn("Exceeded Max HPET blocks\n");
+ return -ENOSPC;
+ }
+
+ ir_hpet[free].iommu = iommu;
+ ir_hpet[free].id = scope->enumeration_id;
+ ir_hpet[free].bus = bus;
+ ir_hpet[free].devfn = PCI_DEVFN(path->device, path->function);
+ pr_info("HPET id %d under DRHD base 0x%Lx\n",
+ scope->enumeration_id, drhd->address);
+
+ return 0;
+}
+
+static int ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope,
+ struct intel_iommu *iommu,
+ struct acpi_dmar_hardware_unit *drhd)
+{
+ struct acpi_dmar_pci_path *path;
+ u8 bus;
+ int count, free = -1;
+
+ bus = scope->bus;
+ path = (struct acpi_dmar_pci_path *)(scope + 1);
+ count = (scope->length - sizeof(struct acpi_dmar_device_scope))
+ / sizeof(struct acpi_dmar_pci_path);
+
+ while (--count > 0) {
+ /*
+ * Access PCI directly due to the PCI
+ * subsystem isn't initialized yet.
+ */
+ bus = read_pci_config_byte(bus, path->device, path->function,
+ PCI_SECONDARY_BUS);
+ path++;
+ }
+
+ for (count = 0; count < MAX_IO_APICS; count++) {
+ if (ir_ioapic[count].iommu == iommu &&
+ ir_ioapic[count].id == scope->enumeration_id)
+ return 0;
+ else if (ir_ioapic[count].iommu == NULL && free == -1)
+ free = count;
+ }
+ if (free == -1) {
+ pr_warn("Exceeded Max IO APICS\n");
+ return -ENOSPC;
+ }
+
+ ir_ioapic[free].bus = bus;
+ ir_ioapic[free].devfn = PCI_DEVFN(path->device, path->function);
+ ir_ioapic[free].iommu = iommu;
+ ir_ioapic[free].id = scope->enumeration_id;
+ pr_info("IOAPIC id %d under DRHD base 0x%Lx IOMMU %d\n",
+ scope->enumeration_id, drhd->address, iommu->seq_id);
+
+ return 0;
+}
+
+static int ir_parse_ioapic_hpet_scope(struct acpi_dmar_header *header,
+ struct intel_iommu *iommu)
+{
+ int ret = 0;
+ struct acpi_dmar_hardware_unit *drhd;
+ struct acpi_dmar_device_scope *scope;
+ void *start, *end;
+
+ drhd = (struct acpi_dmar_hardware_unit *)header;
+ start = (void *)(drhd + 1);
+ end = ((void *)drhd) + header->length;
+
+ while (start < end && ret == 0) {
+ scope = start;
+ if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC)
+ ret = ir_parse_one_ioapic_scope(scope, iommu, drhd);
+ else if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_HPET)
+ ret = ir_parse_one_hpet_scope(scope, iommu, drhd);
+ start += scope->length;
+ }
+
+ return ret;
+}
+
+static void ir_remove_ioapic_hpet_scope(struct intel_iommu *iommu)
+{
+ int i;
+
+ for (i = 0; i < MAX_HPET_TBS; i++)
+ if (ir_hpet[i].iommu == iommu)
+ ir_hpet[i].iommu = NULL;
+
+ for (i = 0; i < MAX_IO_APICS; i++)
+ if (ir_ioapic[i].iommu == iommu)
+ ir_ioapic[i].iommu = NULL;
+}
+
+/*
+ * Finds the assocaition between IOAPIC's and its Interrupt-remapping
+ * hardware unit.
+ */
+static int __init parse_ioapics_under_ir(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ bool ir_supported = false;
+ int ioapic_idx;
+
+ for_each_iommu(iommu, drhd) {
+ int ret;
+
+ if (!ecap_ir_support(iommu->ecap))
+ continue;
+
+ ret = ir_parse_ioapic_hpet_scope(drhd->hdr, iommu);
+ if (ret)
+ return ret;
+
+ ir_supported = true;
+ }
+
+ if (!ir_supported)
+ return -ENODEV;
+
+ for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
+ int ioapic_id = mpc_ioapic_id(ioapic_idx);
+ if (!map_ioapic_to_iommu(ioapic_id)) {
+ pr_err(FW_BUG "ioapic %d has no mapping iommu, "
+ "interrupt remapping will be disabled\n",
+ ioapic_id);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int __init ir_dev_scope_init(void)
+{
+ int ret;
+
+ if (!irq_remapping_enabled)
+ return 0;
+
+ down_write(&dmar_global_lock);
+ ret = dmar_dev_scope_init();
+ up_write(&dmar_global_lock);
+
+ return ret;
+}
+rootfs_initcall(ir_dev_scope_init);
+
+static void disable_irq_remapping(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu = NULL;
+
+ /*
+ * Disable Interrupt-remapping for all the DRHD's now.
+ */
+ for_each_iommu(iommu, drhd) {
+ if (!ecap_ir_support(iommu->ecap))
+ continue;
+
+ iommu_disable_irq_remapping(iommu);
+ }
+
+ /*
+ * Clear Posted-Interrupts capability.
+ */
+ if (!disable_irq_post)
+ intel_irq_remap_ops.capability &= ~(1 << IRQ_POSTING_CAP);
+}
+
+static int reenable_irq_remapping(int eim)
+{
+ struct dmar_drhd_unit *drhd;
+ bool setup = false;
+ struct intel_iommu *iommu = NULL;
+
+ for_each_iommu(iommu, drhd)
+ if (iommu->qi)
+ dmar_reenable_qi(iommu);
+
+ /*
+ * Setup Interrupt-remapping for all the DRHD's now.
+ */
+ for_each_iommu(iommu, drhd) {
+ if (!ecap_ir_support(iommu->ecap))
+ continue;
+
+ /* Set up interrupt remapping for iommu.*/
+ iommu_set_irq_remapping(iommu, eim);
+ iommu_enable_irq_remapping(iommu);
+ setup = true;
+ }
+
+ if (!setup)
+ goto error;
+
+ set_irq_posting_cap();
+
+ return 0;
+
+error:
+ /*
+ * handle error condition gracefully here!
+ */
+ return -1;
+}
+
+/*
+ * Store the MSI remapping domain pointer in the device if enabled.
+ *
+ * This is called from dmar_pci_bus_add_dev() so it works even when DMA
+ * remapping is disabled. Only update the pointer if the device is not
+ * already handled by a non default PCI/MSI interrupt domain. This protects
+ * e.g. VMD devices.
+ */
+void intel_irq_remap_add_device(struct dmar_pci_notify_info *info)
+{
+ if (!irq_remapping_enabled || pci_dev_has_special_msi_domain(info->dev))
+ return;
+
+ dev_set_msi_domain(&info->dev->dev, map_dev_to_ir(info->dev));
+}
+
+static void prepare_irte(struct irte *irte, int vector, unsigned int dest)
+{
+ memset(irte, 0, sizeof(*irte));
+
+ irte->present = 1;
+ irte->dst_mode = apic->irq_dest_mode;
+ /*
+ * Trigger mode in the IRTE will always be edge, and for IO-APIC, the
+ * actual level or edge trigger will be setup in the IO-APIC
+ * RTE. This will help simplify level triggered irq migration.
+ * For more details, see the comments (in io_apic.c) explainig IO-APIC
+ * irq migration in the presence of interrupt-remapping.
+ */
+ irte->trigger_mode = 0;
+ irte->dlvry_mode = apic->irq_delivery_mode;
+ irte->vector = vector;
+ irte->dest_id = IRTE_DEST(dest);
+ irte->redir_hint = 1;
+}
+
+static struct irq_domain *intel_get_irq_domain(struct irq_alloc_info *info)
+{
+ if (!info)
+ return NULL;
+
+ switch (info->type) {
+ case X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT:
+ return map_ioapic_to_ir(info->devid);
+ case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT:
+ return map_hpet_to_ir(info->devid);
+ default:
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+}
+
+struct irq_remap_ops intel_irq_remap_ops = {
+ .prepare = intel_prepare_irq_remapping,
+ .enable = intel_enable_irq_remapping,
+ .disable = disable_irq_remapping,
+ .reenable = reenable_irq_remapping,
+ .enable_faulting = enable_drhd_fault_handling,
+ .get_irq_domain = intel_get_irq_domain,
+};
+
+static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force)
+{
+ struct intel_ir_data *ir_data = irqd->chip_data;
+ struct irte *irte = &ir_data->irte_entry;
+ struct irq_cfg *cfg = irqd_cfg(irqd);
+
+ /*
+ * Atomically updates the IRTE with the new destination, vector
+ * and flushes the interrupt entry cache.
+ */
+ irte->vector = cfg->vector;
+ irte->dest_id = IRTE_DEST(cfg->dest_apicid);
+
+ /* Update the hardware only if the interrupt is in remapped mode. */
+ if (force || ir_data->irq_2_iommu.mode == IRQ_REMAPPING)
+ modify_irte(&ir_data->irq_2_iommu, irte);
+}
+
+/*
+ * Migrate the IO-APIC irq in the presence of intr-remapping.
+ *
+ * For both level and edge triggered, irq migration is a simple atomic
+ * update(of vector and cpu destination) of IRTE and flush the hardware cache.
+ *
+ * For level triggered, we eliminate the io-apic RTE modification (with the
+ * updated vector information), by using a virtual vector (io-apic pin number).
+ * Real vector that is used for interrupting cpu will be coming from
+ * the interrupt-remapping table entry.
+ *
+ * As the migration is a simple atomic update of IRTE, the same mechanism
+ * is used to migrate MSI irq's in the presence of interrupt-remapping.
+ */
+static int
+intel_ir_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
+{
+ struct irq_data *parent = data->parent_data;
+ struct irq_cfg *cfg = irqd_cfg(data);
+ int ret;
+
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+ return ret;
+
+ intel_ir_reconfigure_irte(data, false);
+ /*
+ * After this point, all the interrupts will start arriving
+ * at the new destination. So, time to cleanup the previous
+ * vector allocation.
+ */
+ send_cleanup_vector(cfg);
+
+ return IRQ_SET_MASK_OK_DONE;
+}
+
+static void intel_ir_compose_msi_msg(struct irq_data *irq_data,
+ struct msi_msg *msg)
+{
+ struct intel_ir_data *ir_data = irq_data->chip_data;
+
+ *msg = ir_data->msi_entry;
+}
+
+static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info)
+{
+ struct intel_ir_data *ir_data = data->chip_data;
+ struct vcpu_data *vcpu_pi_info = info;
+
+ /* stop posting interrupts, back to remapping mode */
+ if (!vcpu_pi_info) {
+ modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry);
+ } else {
+ struct irte irte_pi;
+
+ /*
+ * We are not caching the posted interrupt entry. We
+ * copy the data from the remapped entry and modify
+ * the fields which are relevant for posted mode. The
+ * cached remapped entry is used for switching back to
+ * remapped mode.
+ */
+ memset(&irte_pi, 0, sizeof(irte_pi));
+ dmar_copy_shared_irte(&irte_pi, &ir_data->irte_entry);
+
+ /* Update the posted mode fields */
+ irte_pi.p_pst = 1;
+ irte_pi.p_urgent = 0;
+ irte_pi.p_vector = vcpu_pi_info->vector;
+ irte_pi.pda_l = (vcpu_pi_info->pi_desc_addr >>
+ (32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT);
+ irte_pi.pda_h = (vcpu_pi_info->pi_desc_addr >> 32) &
+ ~(-1UL << PDA_HIGH_BIT);
+
+ modify_irte(&ir_data->irq_2_iommu, &irte_pi);
+ }
+
+ return 0;
+}
+
+static struct irq_chip intel_ir_chip = {
+ .name = "INTEL-IR",
+ .irq_ack = apic_ack_irq,
+ .irq_set_affinity = intel_ir_set_affinity,
+ .irq_compose_msi_msg = intel_ir_compose_msi_msg,
+ .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity,
+};
+
+static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
+ struct irq_cfg *irq_cfg,
+ struct irq_alloc_info *info,
+ int index, int sub_handle)
+{
+ struct IR_IO_APIC_route_entry *entry;
+ struct irte *irte = &data->irte_entry;
+ struct msi_msg *msg = &data->msi_entry;
+
+ prepare_irte(irte, irq_cfg->vector, irq_cfg->dest_apicid);
+ switch (info->type) {
+ case X86_IRQ_ALLOC_TYPE_IOAPIC:
+ /* Set source-id of interrupt request */
+ set_ioapic_sid(irte, info->devid);
+ apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n",
+ info->devid, irte->present, irte->fpd,
+ irte->dst_mode, irte->redir_hint,
+ irte->trigger_mode, irte->dlvry_mode,
+ irte->avail, irte->vector, irte->dest_id,
+ irte->sid, irte->sq, irte->svt);
+
+ entry = (struct IR_IO_APIC_route_entry *)info->ioapic.entry;
+ info->ioapic.entry = NULL;
+ memset(entry, 0, sizeof(*entry));
+ entry->index2 = (index >> 15) & 0x1;
+ entry->zero = 0;
+ entry->format = 1;
+ entry->index = (index & 0x7fff);
+ /*
+ * IO-APIC RTE will be configured with virtual vector.
+ * irq handler will do the explicit EOI to the io-apic.
+ */
+ entry->vector = info->ioapic.pin;
+ entry->mask = 0; /* enable IRQ */
+ entry->trigger = info->ioapic.trigger;
+ entry->polarity = info->ioapic.polarity;
+ if (info->ioapic.trigger)
+ entry->mask = 1; /* Mask level triggered irqs. */
+ break;
+
+ case X86_IRQ_ALLOC_TYPE_HPET:
+ case X86_IRQ_ALLOC_TYPE_PCI_MSI:
+ case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
+ if (info->type == X86_IRQ_ALLOC_TYPE_HPET)
+ set_hpet_sid(irte, info->devid);
+ else
+ set_msi_sid(irte, msi_desc_to_pci_dev(info->desc));
+
+ msg->address_hi = MSI_ADDR_BASE_HI;
+ msg->data = sub_handle;
+ msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
+ MSI_ADDR_IR_SHV |
+ MSI_ADDR_IR_INDEX1(index) |
+ MSI_ADDR_IR_INDEX2(index);
+ break;
+
+ default:
+ BUG_ON(1);
+ break;
+ }
+}
+
+static void intel_free_irq_resources(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ struct irq_data *irq_data;
+ struct intel_ir_data *data;
+ struct irq_2_iommu *irq_iommu;
+ unsigned long flags;
+ int i;
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_domain_get_irq_data(domain, virq + i);
+ if (irq_data && irq_data->chip_data) {
+ data = irq_data->chip_data;
+ irq_iommu = &data->irq_2_iommu;
+ raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+ clear_entries(irq_iommu);
+ raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+ irq_domain_reset_irq_data(irq_data);
+ kfree(data);
+ }
+ }
+}
+
+static int intel_irq_remapping_alloc(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs,
+ void *arg)
+{
+ struct intel_iommu *iommu = domain->host_data;
+ struct irq_alloc_info *info = arg;
+ struct intel_ir_data *data, *ird;
+ struct irq_data *irq_data;
+ struct irq_cfg *irq_cfg;
+ int i, ret, index;
+
+ if (!info || !iommu)
+ return -EINVAL;
+ if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI &&
+ info->type != X86_IRQ_ALLOC_TYPE_PCI_MSIX)
+ return -EINVAL;
+
+ /*
+ * With IRQ remapping enabled, don't need contiguous CPU vectors
+ * to support multiple MSI interrupts.
+ */
+ if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI)
+ info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
+
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+
+ ret = -ENOMEM;
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ goto out_free_parent;
+
+ down_read(&dmar_global_lock);
+ index = alloc_irte(iommu, &data->irq_2_iommu, nr_irqs);
+ up_read(&dmar_global_lock);
+ if (index < 0) {
+ pr_warn("Failed to allocate IRTE\n");
+ kfree(data);
+ goto out_free_parent;
+ }
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_domain_get_irq_data(domain, virq + i);
+ irq_cfg = irqd_cfg(irq_data);
+ if (!irq_data || !irq_cfg) {
+ if (!i)
+ kfree(data);
+ ret = -EINVAL;
+ goto out_free_data;
+ }
+
+ if (i > 0) {
+ ird = kzalloc(sizeof(*ird), GFP_KERNEL);
+ if (!ird)
+ goto out_free_data;
+ /* Initialize the common data */
+ ird->irq_2_iommu = data->irq_2_iommu;
+ ird->irq_2_iommu.sub_handle = i;
+ } else {
+ ird = data;
+ }
+
+ irq_data->hwirq = (index << 16) + i;
+ irq_data->chip_data = ird;
+ irq_data->chip = &intel_ir_chip;
+ intel_irq_remapping_prepare_irte(ird, irq_cfg, info, index, i);
+ irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
+ }
+ return 0;
+
+out_free_data:
+ intel_free_irq_resources(domain, virq, i);
+out_free_parent:
+ irq_domain_free_irqs_common(domain, virq, nr_irqs);
+ return ret;
+}
+
+static void intel_irq_remapping_free(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ intel_free_irq_resources(domain, virq, nr_irqs);
+ irq_domain_free_irqs_common(domain, virq, nr_irqs);
+}
+
+static int intel_irq_remapping_activate(struct irq_domain *domain,
+ struct irq_data *irq_data, bool reserve)
+{
+ intel_ir_reconfigure_irte(irq_data, true);
+ return 0;
+}
+
+static void intel_irq_remapping_deactivate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ struct intel_ir_data *data = irq_data->chip_data;
+ struct irte entry;
+
+ memset(&entry, 0, sizeof(entry));
+ modify_irte(&data->irq_2_iommu, &entry);
+}
+
+static const struct irq_domain_ops intel_ir_domain_ops = {
+ .alloc = intel_irq_remapping_alloc,
+ .free = intel_irq_remapping_free,
+ .activate = intel_irq_remapping_activate,
+ .deactivate = intel_irq_remapping_deactivate,
+};
+
+/*
+ * Support of Interrupt Remapping Unit Hotplug
+ */
+static int dmar_ir_add(struct dmar_drhd_unit *dmaru, struct intel_iommu *iommu)
+{
+ int ret;
+ int eim = x2apic_enabled();
+
+ if (eim && !ecap_eim_support(iommu->ecap)) {
+ pr_info("DRHD %Lx: EIM not supported by DRHD, ecap %Lx\n",
+ iommu->reg_phys, iommu->ecap);
+ return -ENODEV;
+ }
+
+ if (ir_parse_ioapic_hpet_scope(dmaru->hdr, iommu)) {
+ pr_warn("DRHD %Lx: failed to parse managed IOAPIC/HPET\n",
+ iommu->reg_phys);
+ return -ENODEV;
+ }
+
+ /* TODO: check all IOAPICs are covered by IOMMU */
+
+ /* Setup Interrupt-remapping now. */
+ ret = intel_setup_irq_remapping(iommu);
+ if (ret) {
+ pr_err("Failed to setup irq remapping for %s\n",
+ iommu->name);
+ intel_teardown_irq_remapping(iommu);
+ ir_remove_ioapic_hpet_scope(iommu);
+ } else {
+ iommu_enable_irq_remapping(iommu);
+ }
+
+ return ret;
+}
+
+int dmar_ir_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
+{
+ int ret = 0;
+ struct intel_iommu *iommu = dmaru->iommu;
+
+ if (!irq_remapping_enabled)
+ return 0;
+ if (iommu == NULL)
+ return -EINVAL;
+ if (!ecap_ir_support(iommu->ecap))
+ return 0;
+ if (irq_remapping_cap(IRQ_POSTING_CAP) &&
+ !cap_pi_support(iommu->cap))
+ return -EBUSY;
+
+ if (insert) {
+ if (!iommu->ir_table)
+ ret = dmar_ir_add(dmaru, iommu);
+ } else {
+ if (iommu->ir_table) {
+ if (!bitmap_empty(iommu->ir_table->bitmap,
+ INTR_REMAP_TABLE_ENTRIES)) {
+ ret = -EBUSY;
+ } else {
+ iommu_disable_irq_remapping(iommu);
+ intel_teardown_irq_remapping(iommu);
+ ir_remove_ioapic_hpet_scope(iommu);
+ }
+ }
+ }
+
+ return ret;
+}
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
new file mode 100644
index 000000000..9b24e8224
--- /dev/null
+++ b/drivers/iommu/intel/pasid.c
@@ -0,0 +1,883 @@
+// SPDX-License-Identifier: GPL-2.0
+/**
+ * intel-pasid.c - PASID idr, table and entry manipulation
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ */
+
+#define pr_fmt(fmt) "DMAR: " fmt
+
+#include <linux/bitops.h>
+#include <linux/cpufeature.h>
+#include <linux/dmar.h>
+#include <linux/intel-iommu.h>
+#include <linux/iommu.h>
+#include <linux/memory.h>
+#include <linux/pci.h>
+#include <linux/pci-ats.h>
+#include <linux/spinlock.h>
+
+#include "pasid.h"
+
+/*
+ * Intel IOMMU system wide PASID name space:
+ */
+u32 intel_pasid_max_id = PASID_MAX;
+
+int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid)
+{
+ unsigned long flags;
+ u8 status_code;
+ int ret = 0;
+ u64 res;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+ dmar_writeq(iommu->reg + DMAR_VCMD_REG, VCMD_CMD_ALLOC);
+ IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq,
+ !(res & VCMD_VRSP_IP), res);
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+ status_code = VCMD_VRSP_SC(res);
+ switch (status_code) {
+ case VCMD_VRSP_SC_SUCCESS:
+ *pasid = VCMD_VRSP_RESULT_PASID(res);
+ break;
+ case VCMD_VRSP_SC_NO_PASID_AVAIL:
+ pr_info("IOMMU: %s: No PASID available\n", iommu->name);
+ ret = -ENOSPC;
+ break;
+ default:
+ ret = -ENODEV;
+ pr_warn("IOMMU: %s: Unexpected error code %d\n",
+ iommu->name, status_code);
+ }
+
+ return ret;
+}
+
+void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid)
+{
+ unsigned long flags;
+ u8 status_code;
+ u64 res;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+ dmar_writeq(iommu->reg + DMAR_VCMD_REG,
+ VCMD_CMD_OPERAND(pasid) | VCMD_CMD_FREE);
+ IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq,
+ !(res & VCMD_VRSP_IP), res);
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+ status_code = VCMD_VRSP_SC(res);
+ switch (status_code) {
+ case VCMD_VRSP_SC_SUCCESS:
+ break;
+ case VCMD_VRSP_SC_INVALID_PASID:
+ pr_info("IOMMU: %s: Invalid PASID\n", iommu->name);
+ break;
+ default:
+ pr_warn("IOMMU: %s: Unexpected error code %d\n",
+ iommu->name, status_code);
+ }
+}
+
+/*
+ * Per device pasid table management:
+ */
+static inline void
+device_attach_pasid_table(struct device_domain_info *info,
+ struct pasid_table *pasid_table)
+{
+ info->pasid_table = pasid_table;
+ list_add(&info->table, &pasid_table->dev);
+}
+
+static inline void
+device_detach_pasid_table(struct device_domain_info *info,
+ struct pasid_table *pasid_table)
+{
+ info->pasid_table = NULL;
+ list_del(&info->table);
+}
+
+struct pasid_table_opaque {
+ struct pasid_table **pasid_table;
+ int segment;
+ int bus;
+ int devfn;
+};
+
+static int search_pasid_table(struct device_domain_info *info, void *opaque)
+{
+ struct pasid_table_opaque *data = opaque;
+
+ if (info->iommu->segment == data->segment &&
+ info->bus == data->bus &&
+ info->devfn == data->devfn &&
+ info->pasid_table) {
+ *data->pasid_table = info->pasid_table;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int get_alias_pasid_table(struct pci_dev *pdev, u16 alias, void *opaque)
+{
+ struct pasid_table_opaque *data = opaque;
+
+ data->segment = pci_domain_nr(pdev->bus);
+ data->bus = PCI_BUS_NUM(alias);
+ data->devfn = alias & 0xff;
+
+ return for_each_device_domain(&search_pasid_table, data);
+}
+
+/*
+ * Allocate a pasid table for @dev. It should be called in a
+ * single-thread context.
+ */
+int intel_pasid_alloc_table(struct device *dev)
+{
+ struct device_domain_info *info;
+ struct pasid_table *pasid_table;
+ struct pasid_table_opaque data;
+ struct page *pages;
+ u32 max_pasid = 0;
+ int ret, order;
+ int size;
+
+ might_sleep();
+ info = get_domain_info(dev);
+ if (WARN_ON(!info || !dev_is_pci(dev) || info->pasid_table))
+ return -EINVAL;
+
+ /* DMA alias device already has a pasid table, use it: */
+ data.pasid_table = &pasid_table;
+ ret = pci_for_each_dma_alias(to_pci_dev(dev),
+ &get_alias_pasid_table, &data);
+ if (ret)
+ goto attach_out;
+
+ pasid_table = kzalloc(sizeof(*pasid_table), GFP_KERNEL);
+ if (!pasid_table)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&pasid_table->dev);
+
+ if (info->pasid_supported)
+ max_pasid = min_t(u32, pci_max_pasids(to_pci_dev(dev)),
+ intel_pasid_max_id);
+
+ size = max_pasid >> (PASID_PDE_SHIFT - 3);
+ order = size ? get_order(size) : 0;
+ pages = alloc_pages_node(info->iommu->node,
+ GFP_KERNEL | __GFP_ZERO, order);
+ if (!pages) {
+ kfree(pasid_table);
+ return -ENOMEM;
+ }
+
+ pasid_table->table = page_address(pages);
+ pasid_table->order = order;
+ pasid_table->max_pasid = 1 << (order + PAGE_SHIFT + 3);
+
+attach_out:
+ device_attach_pasid_table(info, pasid_table);
+
+ if (!ecap_coherent(info->iommu->ecap))
+ clflush_cache_range(pasid_table->table, (1 << order) * PAGE_SIZE);
+
+ return 0;
+}
+
+void intel_pasid_free_table(struct device *dev)
+{
+ struct device_domain_info *info;
+ struct pasid_table *pasid_table;
+ struct pasid_dir_entry *dir;
+ struct pasid_entry *table;
+ int i, max_pde;
+
+ info = get_domain_info(dev);
+ if (!info || !dev_is_pci(dev) || !info->pasid_table)
+ return;
+
+ pasid_table = info->pasid_table;
+ device_detach_pasid_table(info, pasid_table);
+
+ if (!list_empty(&pasid_table->dev))
+ return;
+
+ /* Free scalable mode PASID directory tables: */
+ dir = pasid_table->table;
+ max_pde = pasid_table->max_pasid >> PASID_PDE_SHIFT;
+ for (i = 0; i < max_pde; i++) {
+ table = get_pasid_table_from_pde(&dir[i]);
+ free_pgtable_page(table);
+ }
+
+ free_pages((unsigned long)pasid_table->table, pasid_table->order);
+ kfree(pasid_table);
+}
+
+struct pasid_table *intel_pasid_get_table(struct device *dev)
+{
+ struct device_domain_info *info;
+
+ info = get_domain_info(dev);
+ if (!info)
+ return NULL;
+
+ return info->pasid_table;
+}
+
+int intel_pasid_get_dev_max_id(struct device *dev)
+{
+ struct device_domain_info *info;
+
+ info = get_domain_info(dev);
+ if (!info || !info->pasid_table)
+ return 0;
+
+ return info->pasid_table->max_pasid;
+}
+
+struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid)
+{
+ struct device_domain_info *info;
+ struct pasid_table *pasid_table;
+ struct pasid_dir_entry *dir;
+ struct pasid_entry *entries;
+ int dir_index, index;
+
+ pasid_table = intel_pasid_get_table(dev);
+ if (WARN_ON(!pasid_table || pasid >= intel_pasid_get_dev_max_id(dev)))
+ return NULL;
+
+ dir = pasid_table->table;
+ info = get_domain_info(dev);
+ dir_index = pasid >> PASID_PDE_SHIFT;
+ index = pasid & PASID_PTE_MASK;
+
+retry:
+ entries = get_pasid_table_from_pde(&dir[dir_index]);
+ if (!entries) {
+ entries = alloc_pgtable_page(info->iommu->node);
+ if (!entries)
+ return NULL;
+
+ /*
+ * The pasid directory table entry won't be freed after
+ * allocation. No worry about the race with free and
+ * clear. However, this entry might be populated by others
+ * while we are preparing it. Use theirs with a retry.
+ */
+ if (cmpxchg64(&dir[dir_index].val, 0ULL,
+ (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) {
+ free_pgtable_page(entries);
+ goto retry;
+ }
+ if (!ecap_coherent(info->iommu->ecap)) {
+ clflush_cache_range(entries, VTD_PAGE_SIZE);
+ clflush_cache_range(&dir[dir_index].val, sizeof(*dir));
+ }
+ }
+
+ return &entries[index];
+}
+
+/*
+ * Interfaces for PASID table entry manipulation:
+ */
+static inline void pasid_clear_entry(struct pasid_entry *pe)
+{
+ WRITE_ONCE(pe->val[0], 0);
+ WRITE_ONCE(pe->val[1], 0);
+ WRITE_ONCE(pe->val[2], 0);
+ WRITE_ONCE(pe->val[3], 0);
+ WRITE_ONCE(pe->val[4], 0);
+ WRITE_ONCE(pe->val[5], 0);
+ WRITE_ONCE(pe->val[6], 0);
+ WRITE_ONCE(pe->val[7], 0);
+}
+
+static inline void pasid_clear_entry_with_fpd(struct pasid_entry *pe)
+{
+ WRITE_ONCE(pe->val[0], PASID_PTE_FPD);
+ WRITE_ONCE(pe->val[1], 0);
+ WRITE_ONCE(pe->val[2], 0);
+ WRITE_ONCE(pe->val[3], 0);
+ WRITE_ONCE(pe->val[4], 0);
+ WRITE_ONCE(pe->val[5], 0);
+ WRITE_ONCE(pe->val[6], 0);
+ WRITE_ONCE(pe->val[7], 0);
+}
+
+static void
+intel_pasid_clear_entry(struct device *dev, u32 pasid, bool fault_ignore)
+{
+ struct pasid_entry *pe;
+
+ pe = intel_pasid_get_entry(dev, pasid);
+ if (WARN_ON(!pe))
+ return;
+
+ if (fault_ignore && pasid_pte_is_present(pe))
+ pasid_clear_entry_with_fpd(pe);
+ else
+ pasid_clear_entry(pe);
+}
+
+static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits)
+{
+ u64 old;
+
+ old = READ_ONCE(*ptr);
+ WRITE_ONCE(*ptr, (old & ~mask) | bits);
+}
+
+/*
+ * Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode
+ * PASID entry.
+ */
+static inline void
+pasid_set_domain_id(struct pasid_entry *pe, u64 value)
+{
+ pasid_set_bits(&pe->val[1], GENMASK_ULL(15, 0), value);
+}
+
+/*
+ * Get domain ID value of a scalable mode PASID entry.
+ */
+static inline u16
+pasid_get_domain_id(struct pasid_entry *pe)
+{
+ return (u16)(READ_ONCE(pe->val[1]) & GENMASK_ULL(15, 0));
+}
+
+/*
+ * Setup the SLPTPTR(Second Level Page Table Pointer) field (Bit 12~63)
+ * of a scalable mode PASID entry.
+ */
+static inline void
+pasid_set_slptr(struct pasid_entry *pe, u64 value)
+{
+ pasid_set_bits(&pe->val[0], VTD_PAGE_MASK, value);
+}
+
+/*
+ * Setup the AW(Address Width) field (Bit 2~4) of a scalable mode PASID
+ * entry.
+ */
+static inline void
+pasid_set_address_width(struct pasid_entry *pe, u64 value)
+{
+ pasid_set_bits(&pe->val[0], GENMASK_ULL(4, 2), value << 2);
+}
+
+/*
+ * Setup the PGTT(PASID Granular Translation Type) field (Bit 6~8)
+ * of a scalable mode PASID entry.
+ */
+static inline void
+pasid_set_translation_type(struct pasid_entry *pe, u64 value)
+{
+ pasid_set_bits(&pe->val[0], GENMASK_ULL(8, 6), value << 6);
+}
+
+/*
+ * Enable fault processing by clearing the FPD(Fault Processing
+ * Disable) field (Bit 1) of a scalable mode PASID entry.
+ */
+static inline void pasid_set_fault_enable(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[0], 1 << 1, 0);
+}
+
+/*
+ * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a
+ * scalable mode PASID entry.
+ */
+static inline void pasid_set_sre(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[2], 1 << 0, 1);
+}
+
+/*
+ * Setup the P(Present) field (Bit 0) of a scalable mode PASID
+ * entry.
+ */
+static inline void pasid_set_present(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[0], 1 << 0, 1);
+}
+
+/*
+ * Setup Page Walk Snoop bit (Bit 87) of a scalable mode PASID
+ * entry.
+ */
+static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value)
+{
+ pasid_set_bits(&pe->val[1], 1 << 23, value << 23);
+}
+
+/*
+ * Setup the Page Snoop (PGSNP) field (Bit 88) of a scalable mode
+ * PASID entry.
+ */
+static inline void
+pasid_set_pgsnp(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[1], 1ULL << 24, 1ULL << 24);
+}
+
+/*
+ * Setup the First Level Page table Pointer field (Bit 140~191)
+ * of a scalable mode PASID entry.
+ */
+static inline void
+pasid_set_flptr(struct pasid_entry *pe, u64 value)
+{
+ pasid_set_bits(&pe->val[2], VTD_PAGE_MASK, value);
+}
+
+/*
+ * Setup the First Level Paging Mode field (Bit 130~131) of a
+ * scalable mode PASID entry.
+ */
+static inline void
+pasid_set_flpm(struct pasid_entry *pe, u64 value)
+{
+ pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2);
+}
+
+/*
+ * Setup the Extended Access Flag Enable (EAFE) field (Bit 135)
+ * of a scalable mode PASID entry.
+ */
+static inline void
+pasid_set_eafe(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7);
+}
+
+static void
+pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu,
+ u16 did, u32 pasid)
+{
+ struct qi_desc desc;
+
+ desc.qw0 = QI_PC_DID(did) | QI_PC_GRAN(QI_PC_PASID_SEL) |
+ QI_PC_PASID(pasid) | QI_PC_TYPE;
+ desc.qw1 = 0;
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+static void
+devtlb_invalidation_with_pasid(struct intel_iommu *iommu,
+ struct device *dev, u32 pasid)
+{
+ struct device_domain_info *info;
+ u16 sid, qdep, pfsid;
+
+ info = get_domain_info(dev);
+ if (!info || !info->ats_enabled)
+ return;
+
+ sid = info->bus << 8 | info->devfn;
+ qdep = info->ats_qdep;
+ pfsid = info->pfsid;
+
+ /*
+ * When PASID 0 is used, it indicates RID2PASID(DMA request w/o PASID),
+ * devTLB flush w/o PASID should be used. For non-zero PASID under
+ * SVA usage, device could do DMA with multiple PASIDs. It is more
+ * efficient to flush devTLB specific to the PASID.
+ */
+ if (pasid == PASID_RID2PASID)
+ qi_flush_dev_iotlb(iommu, sid, pfsid, qdep, 0, 64 - VTD_PAGE_SHIFT);
+ else
+ qi_flush_dev_iotlb_pasid(iommu, sid, pfsid, pasid, qdep, 0, 64 - VTD_PAGE_SHIFT);
+}
+
+void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev,
+ u32 pasid, bool fault_ignore)
+{
+ struct pasid_entry *pte;
+ u16 did, pgtt;
+
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (WARN_ON(!pte))
+ return;
+
+ did = pasid_get_domain_id(pte);
+ pgtt = pasid_pte_get_pgtt(pte);
+
+ intel_pasid_clear_entry(dev, pasid, fault_ignore);
+
+ if (!ecap_coherent(iommu->ecap))
+ clflush_cache_range(pte, sizeof(*pte));
+
+ pasid_cache_invalidation_with_pasid(iommu, did, pasid);
+
+ if (pgtt == PASID_ENTRY_PGTT_PT || pgtt == PASID_ENTRY_PGTT_FL_ONLY)
+ qi_flush_piotlb(iommu, did, pasid, 0, -1, 0);
+ else
+ iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
+
+ /* Device IOTLB doesn't need to be flushed in caching mode. */
+ if (!cap_caching_mode(iommu->cap))
+ devtlb_invalidation_with_pasid(iommu, dev, pasid);
+}
+
+static void pasid_flush_caches(struct intel_iommu *iommu,
+ struct pasid_entry *pte,
+ u32 pasid, u16 did)
+{
+ if (!ecap_coherent(iommu->ecap))
+ clflush_cache_range(pte, sizeof(*pte));
+
+ if (cap_caching_mode(iommu->cap)) {
+ pasid_cache_invalidation_with_pasid(iommu, did, pasid);
+ qi_flush_piotlb(iommu, did, pasid, 0, -1, 0);
+ } else {
+ iommu_flush_write_buffer(iommu);
+ }
+}
+
+/*
+ * Set up the scalable mode pasid table entry for first only
+ * translation type.
+ */
+int intel_pasid_setup_first_level(struct intel_iommu *iommu,
+ struct device *dev, pgd_t *pgd,
+ u32 pasid, u16 did, int flags)
+{
+ struct pasid_entry *pte;
+
+ if (!ecap_flts(iommu->ecap)) {
+ pr_err("No first level translation support on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (WARN_ON(!pte))
+ return -EINVAL;
+
+ pasid_clear_entry(pte);
+
+ /* Setup the first level page table pointer: */
+ pasid_set_flptr(pte, (u64)__pa(pgd));
+ if (flags & PASID_FLAG_SUPERVISOR_MODE) {
+ if (!ecap_srs(iommu->ecap)) {
+ pr_err("No supervisor request support on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+ pasid_set_sre(pte);
+ }
+
+ if (flags & PASID_FLAG_FL5LP) {
+ if (cap_5lp_support(iommu->cap)) {
+ pasid_set_flpm(pte, 1);
+ } else {
+ pr_err("No 5-level paging support for first-level\n");
+ pasid_clear_entry(pte);
+ return -EINVAL;
+ }
+ }
+
+ if (flags & PASID_FLAG_PAGE_SNOOP)
+ pasid_set_pgsnp(pte);
+
+ pasid_set_domain_id(pte, did);
+ pasid_set_address_width(pte, iommu->agaw);
+ pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+
+ /* Setup Present and PASID Granular Transfer Type: */
+ pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY);
+ pasid_set_present(pte);
+ pasid_flush_caches(iommu, pte, pasid, did);
+
+ return 0;
+}
+
+/*
+ * Skip top levels of page tables for iommu which has less agaw
+ * than default. Unnecessary for PT mode.
+ */
+static inline int iommu_skip_agaw(struct dmar_domain *domain,
+ struct intel_iommu *iommu,
+ struct dma_pte **pgd)
+{
+ int agaw;
+
+ for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
+ *pgd = phys_to_virt(dma_pte_addr(*pgd));
+ if (!dma_pte_present(*pgd))
+ return -EINVAL;
+ }
+
+ return agaw;
+}
+
+/*
+ * Set up the scalable mode pasid entry for second only translation type.
+ */
+int intel_pasid_setup_second_level(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev, u32 pasid)
+{
+ struct pasid_entry *pte;
+ struct dma_pte *pgd;
+ u64 pgd_val;
+ int agaw;
+ u16 did;
+
+ /*
+ * If hardware advertises no support for second level
+ * translation, return directly.
+ */
+ if (!ecap_slts(iommu->ecap)) {
+ pr_err("No second level translation support on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+
+ pgd = domain->pgd;
+ agaw = iommu_skip_agaw(domain, iommu, &pgd);
+ if (agaw < 0) {
+ dev_err(dev, "Invalid domain page table\n");
+ return -EINVAL;
+ }
+
+ pgd_val = virt_to_phys(pgd);
+ did = domain->iommu_did[iommu->seq_id];
+
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (!pte) {
+ dev_err(dev, "Failed to get pasid entry of PASID %d\n", pasid);
+ return -ENODEV;
+ }
+
+ pasid_clear_entry(pte);
+ pasid_set_domain_id(pte, did);
+ pasid_set_slptr(pte, pgd_val);
+ pasid_set_address_width(pte, agaw);
+ pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
+ pasid_set_fault_enable(pte);
+ pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+
+ if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
+ pasid_set_pgsnp(pte);
+
+ /*
+ * Since it is a second level only translation setup, we should
+ * set SRE bit as well (addresses are expected to be GPAs).
+ */
+ if (pasid != PASID_RID2PASID && ecap_srs(iommu->ecap))
+ pasid_set_sre(pte);
+ pasid_set_present(pte);
+ pasid_flush_caches(iommu, pte, pasid, did);
+
+ return 0;
+}
+
+/*
+ * Set up the scalable mode pasid entry for passthrough translation type.
+ */
+int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev, u32 pasid)
+{
+ u16 did = FLPT_DEFAULT_DID;
+ struct pasid_entry *pte;
+
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (!pte) {
+ dev_err(dev, "Failed to get pasid entry of PASID %d\n", pasid);
+ return -ENODEV;
+ }
+
+ pasid_clear_entry(pte);
+ pasid_set_domain_id(pte, did);
+ pasid_set_address_width(pte, iommu->agaw);
+ pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT);
+ pasid_set_fault_enable(pte);
+ pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+
+ /*
+ * We should set SRE bit as well since the addresses are expected
+ * to be GPAs.
+ */
+ if (ecap_srs(iommu->ecap))
+ pasid_set_sre(pte);
+ pasid_set_present(pte);
+ pasid_flush_caches(iommu, pte, pasid, did);
+
+ return 0;
+}
+
+static int
+intel_pasid_setup_bind_data(struct intel_iommu *iommu, struct pasid_entry *pte,
+ struct iommu_gpasid_bind_data_vtd *pasid_data)
+{
+ /*
+ * Not all guest PASID table entry fields are passed down during bind,
+ * here we only set up the ones that are dependent on guest settings.
+ * Execution related bits such as NXE, SMEP are not supported.
+ * Other fields, such as snoop related, are set based on host needs
+ * regardless of guest settings.
+ */
+ if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_SRE) {
+ if (!ecap_srs(iommu->ecap)) {
+ pr_err_ratelimited("No supervisor request support on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+ pasid_set_sre(pte);
+ }
+
+ if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_EAFE) {
+ if (!ecap_eafs(iommu->ecap)) {
+ pr_err_ratelimited("No extended access flag support on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+ pasid_set_eafe(pte);
+ }
+
+ /*
+ * Memory type is only applicable to devices inside processor coherent
+ * domain. Will add MTS support once coherent devices are available.
+ */
+ if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_MTS_MASK) {
+ pr_warn_ratelimited("No memory type support %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * intel_pasid_setup_nested() - Set up PASID entry for nested translation.
+ * This could be used for guest shared virtual address. In this case, the
+ * first level page tables are used for GVA-GPA translation in the guest,
+ * second level page tables are used for GPA-HPA translation.
+ *
+ * @iommu: IOMMU which the device belong to
+ * @dev: Device to be set up for translation
+ * @gpgd: FLPTPTR: First Level Page translation pointer in GPA
+ * @pasid: PASID to be programmed in the device PASID table
+ * @pasid_data: Additional PASID info from the guest bind request
+ * @domain: Domain info for setting up second level page tables
+ * @addr_width: Address width of the first level (guest)
+ */
+int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
+ pgd_t *gpgd, u32 pasid,
+ struct iommu_gpasid_bind_data_vtd *pasid_data,
+ struct dmar_domain *domain, int addr_width)
+{
+ struct pasid_entry *pte;
+ struct dma_pte *pgd;
+ int ret = 0;
+ u64 pgd_val;
+ int agaw;
+ u16 did;
+
+ if (!ecap_nest(iommu->ecap)) {
+ pr_err_ratelimited("IOMMU: %s: No nested translation support\n",
+ iommu->name);
+ return -EINVAL;
+ }
+
+ if (!(domain->flags & DOMAIN_FLAG_NESTING_MODE)) {
+ pr_err_ratelimited("Domain is not in nesting mode, %x\n",
+ domain->flags);
+ return -EINVAL;
+ }
+
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (WARN_ON(!pte))
+ return -EINVAL;
+
+ /*
+ * Caller must ensure PASID entry is not in use, i.e. not bind the
+ * same PASID to the same device twice.
+ */
+ if (pasid_pte_is_present(pte))
+ return -EBUSY;
+
+ pasid_clear_entry(pte);
+
+ /* Sanity checking performed by caller to make sure address
+ * width matching in two dimensions:
+ * 1. CPU vs. IOMMU
+ * 2. Guest vs. Host.
+ */
+ switch (addr_width) {
+#ifdef CONFIG_X86
+ case ADDR_WIDTH_5LEVEL:
+ if (!cpu_feature_enabled(X86_FEATURE_LA57) ||
+ !cap_5lp_support(iommu->cap)) {
+ dev_err_ratelimited(dev,
+ "5-level paging not supported\n");
+ return -EINVAL;
+ }
+
+ pasid_set_flpm(pte, 1);
+ break;
+#endif
+ case ADDR_WIDTH_4LEVEL:
+ pasid_set_flpm(pte, 0);
+ break;
+ default:
+ dev_err_ratelimited(dev, "Invalid guest address width %d\n",
+ addr_width);
+ return -EINVAL;
+ }
+
+ /* First level PGD is in GPA, must be supported by the second level */
+ if ((uintptr_t)gpgd > domain->max_addr) {
+ dev_err_ratelimited(dev,
+ "Guest PGD %lx not supported, max %llx\n",
+ (uintptr_t)gpgd, domain->max_addr);
+ return -EINVAL;
+ }
+ pasid_set_flptr(pte, (uintptr_t)gpgd);
+
+ ret = intel_pasid_setup_bind_data(iommu, pte, pasid_data);
+ if (ret)
+ return ret;
+
+ /* Setup the second level based on the given domain */
+ pgd = domain->pgd;
+
+ agaw = iommu_skip_agaw(domain, iommu, &pgd);
+ if (agaw < 0) {
+ dev_err_ratelimited(dev, "Invalid domain page table\n");
+ return -EINVAL;
+ }
+ pgd_val = virt_to_phys(pgd);
+ pasid_set_slptr(pte, pgd_val);
+ pasid_set_fault_enable(pte);
+
+ did = domain->iommu_did[iommu->seq_id];
+ pasid_set_domain_id(pte, did);
+
+ pasid_set_address_width(pte, agaw);
+ pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+
+ pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
+ pasid_set_present(pte);
+ pasid_flush_caches(iommu, pte, pasid, did);
+
+ return ret;
+}
diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
new file mode 100644
index 000000000..35963e6bf
--- /dev/null
+++ b/drivers/iommu/intel/pasid.h
@@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * pasid.h - PASID idr, table and entry header
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ */
+
+#ifndef __INTEL_PASID_H
+#define __INTEL_PASID_H
+
+#define PASID_RID2PASID 0x0
+#define PASID_MIN 0x1
+#define PASID_MAX 0x100000
+#define PASID_PTE_MASK 0x3F
+#define PASID_PTE_PRESENT 1
+#define PASID_PTE_FPD 2
+#define PDE_PFN_MASK PAGE_MASK
+#define PASID_PDE_SHIFT 6
+#define MAX_NR_PASID_BITS 20
+#define PASID_TBL_ENTRIES BIT(PASID_PDE_SHIFT)
+
+#define is_pasid_enabled(entry) (((entry)->lo >> 3) & 0x1)
+#define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7))
+
+/* Virtual command interface for enlightened pasid management. */
+#define VCMD_CMD_ALLOC 0x1
+#define VCMD_CMD_FREE 0x2
+#define VCMD_VRSP_IP 0x1
+#define VCMD_VRSP_SC(e) (((e) & 0xff) >> 1)
+#define VCMD_VRSP_SC_SUCCESS 0
+#define VCMD_VRSP_SC_NO_PASID_AVAIL 16
+#define VCMD_VRSP_SC_INVALID_PASID 16
+#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 16) & 0xfffff)
+#define VCMD_CMD_OPERAND(e) ((e) << 16)
+/*
+ * Domain ID reserved for pasid entries programmed for first-level
+ * only and pass-through transfer modes.
+ */
+#define FLPT_DEFAULT_DID 1
+
+/*
+ * The SUPERVISOR_MODE flag indicates a first level translation which
+ * can be used for access to kernel addresses. It is valid only for
+ * access to the kernel's static 1:1 mapping of physical memory — not
+ * to vmalloc or even module mappings.
+ */
+#define PASID_FLAG_SUPERVISOR_MODE BIT(0)
+#define PASID_FLAG_NESTED BIT(1)
+#define PASID_FLAG_PAGE_SNOOP BIT(2)
+
+/*
+ * The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first-
+ * level translation, otherwise, 4-level paging will be used.
+ */
+#define PASID_FLAG_FL5LP BIT(1)
+
+struct pasid_dir_entry {
+ u64 val;
+};
+
+struct pasid_entry {
+ u64 val[8];
+};
+
+#define PASID_ENTRY_PGTT_FL_ONLY (1)
+#define PASID_ENTRY_PGTT_SL_ONLY (2)
+#define PASID_ENTRY_PGTT_NESTED (3)
+#define PASID_ENTRY_PGTT_PT (4)
+
+/* The representative of a PASID table */
+struct pasid_table {
+ void *table; /* pasid table pointer */
+ int order; /* page order of pasid table */
+ u32 max_pasid; /* max pasid */
+ struct list_head dev; /* device list */
+};
+
+/* Get PRESENT bit of a PASID directory entry. */
+static inline bool pasid_pde_is_present(struct pasid_dir_entry *pde)
+{
+ return READ_ONCE(pde->val) & PASID_PTE_PRESENT;
+}
+
+/* Get PASID table from a PASID directory entry. */
+static inline struct pasid_entry *
+get_pasid_table_from_pde(struct pasid_dir_entry *pde)
+{
+ if (!pasid_pde_is_present(pde))
+ return NULL;
+
+ return phys_to_virt(READ_ONCE(pde->val) & PDE_PFN_MASK);
+}
+
+/* Get PRESENT bit of a PASID table entry. */
+static inline bool pasid_pte_is_present(struct pasid_entry *pte)
+{
+ return READ_ONCE(pte->val[0]) & PASID_PTE_PRESENT;
+}
+
+/* Get PGTT field of a PASID table entry */
+static inline u16 pasid_pte_get_pgtt(struct pasid_entry *pte)
+{
+ return (u16)((READ_ONCE(pte->val[0]) >> 6) & 0x7);
+}
+
+extern unsigned int intel_pasid_max_id;
+int intel_pasid_alloc_id(void *ptr, int start, int end, gfp_t gfp);
+void intel_pasid_free_id(u32 pasid);
+void *intel_pasid_lookup_id(u32 pasid);
+int intel_pasid_alloc_table(struct device *dev);
+void intel_pasid_free_table(struct device *dev);
+struct pasid_table *intel_pasid_get_table(struct device *dev);
+int intel_pasid_get_dev_max_id(struct device *dev);
+struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid);
+int intel_pasid_setup_first_level(struct intel_iommu *iommu,
+ struct device *dev, pgd_t *pgd,
+ u32 pasid, u16 did, int flags);
+int intel_pasid_setup_second_level(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev, u32 pasid);
+int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev, u32 pasid);
+int intel_pasid_setup_nested(struct intel_iommu *iommu,
+ struct device *dev, pgd_t *pgd, u32 pasid,
+ struct iommu_gpasid_bind_data_vtd *pasid_data,
+ struct dmar_domain *domain, int addr_width);
+void intel_pasid_tear_down_entry(struct intel_iommu *iommu,
+ struct device *dev, u32 pasid,
+ bool fault_ignore);
+int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid);
+void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid);
+#endif /* __INTEL_PASID_H */
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
new file mode 100644
index 000000000..aabf56272
--- /dev/null
+++ b/drivers/iommu/intel/svm.c
@@ -0,0 +1,1223 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2015 Intel Corporation.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/intel-iommu.h>
+#include <linux/mmu_notifier.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/slab.h>
+#include <linux/intel-svm.h>
+#include <linux/rculist.h>
+#include <linux/pci.h>
+#include <linux/pci-ats.h>
+#include <linux/dmar.h>
+#include <linux/interrupt.h>
+#include <linux/mm_types.h>
+#include <linux/ioasid.h>
+#include <asm/page.h>
+#include <asm/fpu/api.h>
+
+#include "pasid.h"
+
+static irqreturn_t prq_event_thread(int irq, void *d);
+static void intel_svm_drain_prq(struct device *dev, u32 pasid);
+
+#define PRQ_ORDER 0
+
+int intel_svm_enable_prq(struct intel_iommu *iommu)
+{
+ struct page *pages;
+ int irq, ret;
+
+ pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
+ if (!pages) {
+ pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
+ iommu->name);
+ return -ENOMEM;
+ }
+ iommu->prq = page_address(pages);
+
+ irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
+ if (irq <= 0) {
+ pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
+ iommu->name);
+ ret = -EINVAL;
+ err:
+ free_pages((unsigned long)iommu->prq, PRQ_ORDER);
+ iommu->prq = NULL;
+ return ret;
+ }
+ iommu->pr_irq = irq;
+
+ snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);
+
+ ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
+ iommu->prq_name, iommu);
+ if (ret) {
+ pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
+ iommu->name);
+ dmar_free_hwirq(irq);
+ iommu->pr_irq = 0;
+ goto err;
+ }
+ dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
+ dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
+ dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);
+
+ init_completion(&iommu->prq_complete);
+
+ return 0;
+}
+
+int intel_svm_finish_prq(struct intel_iommu *iommu)
+{
+ dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
+ dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
+ dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);
+
+ if (iommu->pr_irq) {
+ free_irq(iommu->pr_irq, iommu);
+ dmar_free_hwirq(iommu->pr_irq);
+ iommu->pr_irq = 0;
+ }
+
+ free_pages((unsigned long)iommu->prq, PRQ_ORDER);
+ iommu->prq = NULL;
+
+ return 0;
+}
+
+static inline bool intel_svm_capable(struct intel_iommu *iommu)
+{
+ return iommu->flags & VTD_FLAG_SVM_CAPABLE;
+}
+
+void intel_svm_check(struct intel_iommu *iommu)
+{
+ if (!pasid_supported(iommu))
+ return;
+
+ if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
+ !cap_fl1gp_support(iommu->cap)) {
+ pr_err("%s SVM disabled, incompatible 1GB page capability\n",
+ iommu->name);
+ return;
+ }
+
+ if (cpu_feature_enabled(X86_FEATURE_LA57) &&
+ !cap_5lp_support(iommu->cap)) {
+ pr_err("%s SVM disabled, incompatible paging mode\n",
+ iommu->name);
+ return;
+ }
+
+ iommu->flags |= VTD_FLAG_SVM_CAPABLE;
+}
+
+static void __flush_svm_range_dev(struct intel_svm *svm,
+ struct intel_svm_dev *sdev,
+ unsigned long address,
+ unsigned long pages, int ih)
+{
+ struct device_domain_info *info = get_domain_info(sdev->dev);
+
+ if (WARN_ON(!pages))
+ return;
+
+ qi_flush_piotlb(sdev->iommu, sdev->did, svm->pasid, address, pages, ih);
+ if (info->ats_enabled)
+ qi_flush_dev_iotlb_pasid(sdev->iommu, sdev->sid, info->pfsid,
+ svm->pasid, sdev->qdep, address,
+ order_base_2(pages));
+}
+
+static void intel_flush_svm_range_dev(struct intel_svm *svm,
+ struct intel_svm_dev *sdev,
+ unsigned long address,
+ unsigned long pages, int ih)
+{
+ unsigned long shift = ilog2(__roundup_pow_of_two(pages));
+ unsigned long align = (1ULL << (VTD_PAGE_SHIFT + shift));
+ unsigned long start = ALIGN_DOWN(address, align);
+ unsigned long end = ALIGN(address + (pages << VTD_PAGE_SHIFT), align);
+
+ while (start < end) {
+ __flush_svm_range_dev(svm, sdev, start, align >> VTD_PAGE_SHIFT, ih);
+ start += align;
+ }
+}
+
+static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
+ unsigned long pages, int ih)
+{
+ struct intel_svm_dev *sdev;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdev, &svm->devs, list)
+ intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
+ rcu_read_unlock();
+}
+
+/* Pages have been freed at this point */
+static void intel_invalidate_range(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
+
+ intel_flush_svm_range(svm, start,
+ (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
+}
+
+static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
+ struct intel_svm_dev *sdev;
+
+ /* This might end up being called from exit_mmap(), *before* the page
+ * tables are cleared. And __mmu_notifier_release() will delete us from
+ * the list of notifiers so that our invalidate_range() callback doesn't
+ * get called when the page tables are cleared. So we need to protect
+ * against hardware accessing those page tables.
+ *
+ * We do it by clearing the entry in the PASID table and then flushing
+ * the IOTLB and the PASID table caches. This might upset hardware;
+ * perhaps we'll want to point the PASID to a dummy PGD (like the zero
+ * page) so that we end up taking a fault that the hardware really
+ * *has* to handle gracefully without affecting other processes.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdev, &svm->devs, list)
+ intel_pasid_tear_down_entry(sdev->iommu, sdev->dev,
+ svm->pasid, true);
+ rcu_read_unlock();
+
+}
+
+static const struct mmu_notifier_ops intel_mmuops = {
+ .release = intel_mm_release,
+ .invalidate_range = intel_invalidate_range,
+};
+
+static DEFINE_MUTEX(pasid_mutex);
+static LIST_HEAD(global_svm_list);
+
+#define for_each_svm_dev(sdev, svm, d) \
+ list_for_each_entry((sdev), &(svm)->devs, list) \
+ if ((d) != (sdev)->dev) {} else
+
+static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
+ struct intel_svm **rsvm,
+ struct intel_svm_dev **rsdev)
+{
+ struct intel_svm_dev *d, *sdev = NULL;
+ struct intel_svm *svm;
+
+ /* The caller should hold the pasid_mutex lock */
+ if (WARN_ON(!mutex_is_locked(&pasid_mutex)))
+ return -EINVAL;
+
+ if (pasid == INVALID_IOASID || pasid >= PASID_MAX)
+ return -EINVAL;
+
+ svm = ioasid_find(NULL, pasid, NULL);
+ if (IS_ERR(svm))
+ return PTR_ERR(svm);
+
+ if (!svm)
+ goto out;
+
+ /*
+ * If we found svm for the PASID, there must be at least one device
+ * bond.
+ */
+ if (WARN_ON(list_empty(&svm->devs)))
+ return -EINVAL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(d, &svm->devs, list) {
+ if (d->dev == dev) {
+ sdev = d;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+out:
+ *rsvm = svm;
+ *rsdev = sdev;
+
+ return 0;
+}
+
+int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
+ struct iommu_gpasid_bind_data *data)
+{
+ struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
+ struct intel_svm_dev *sdev = NULL;
+ struct dmar_domain *dmar_domain;
+ struct device_domain_info *info;
+ struct intel_svm *svm = NULL;
+ unsigned long iflags;
+ int ret = 0;
+
+ if (WARN_ON(!iommu) || !data)
+ return -EINVAL;
+
+ if (data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
+ return -EINVAL;
+
+ /* IOMMU core ensures argsz is more than the start of the union */
+ if (data->argsz < offsetofend(struct iommu_gpasid_bind_data, vendor.vtd))
+ return -EINVAL;
+
+ /* Make sure no undefined flags are used in vendor data */
+ if (data->vendor.vtd.flags & ~(IOMMU_SVA_VTD_GPASID_LAST - 1))
+ return -EINVAL;
+
+ if (!dev_is_pci(dev))
+ return -ENOTSUPP;
+
+ /* VT-d supports devices with full 20 bit PASIDs only */
+ if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
+ return -EINVAL;
+
+ /*
+ * We only check host PASID range, we have no knowledge to check
+ * guest PASID range.
+ */
+ if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
+ return -EINVAL;
+
+ info = get_domain_info(dev);
+ if (!info)
+ return -EINVAL;
+
+ dmar_domain = to_dmar_domain(domain);
+
+ mutex_lock(&pasid_mutex);
+ ret = pasid_to_svm_sdev(dev, data->hpasid, &svm, &sdev);
+ if (ret)
+ goto out;
+
+ if (sdev) {
+ /*
+ * Do not allow multiple bindings of the same device-PASID since
+ * there is only one SL page tables per PASID. We may revisit
+ * once sharing PGD across domains are supported.
+ */
+ dev_warn_ratelimited(dev, "Already bound with PASID %u\n",
+ svm->pasid);
+ ret = -EBUSY;
+ goto out;
+ }
+
+ if (!svm) {
+ /* We come here when PASID has never been bond to a device. */
+ svm = kzalloc(sizeof(*svm), GFP_KERNEL);
+ if (!svm) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ /* REVISIT: upper layer/VFIO can track host process that bind
+ * the PASID. ioasid_set = mm might be sufficient for vfio to
+ * check pasid VMM ownership. We can drop the following line
+ * once VFIO and IOASID set check is in place.
+ */
+ svm->mm = get_task_mm(current);
+ svm->pasid = data->hpasid;
+ if (data->flags & IOMMU_SVA_GPASID_VAL) {
+ svm->gpasid = data->gpasid;
+ svm->flags |= SVM_FLAG_GUEST_PASID;
+ }
+ ioasid_set_data(data->hpasid, svm);
+ INIT_LIST_HEAD_RCU(&svm->devs);
+ mmput(svm->mm);
+ }
+ sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
+ if (!sdev) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ sdev->dev = dev;
+ sdev->sid = PCI_DEVID(info->bus, info->devfn);
+ sdev->iommu = iommu;
+
+ /* Only count users if device has aux domains */
+ if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
+ sdev->users = 1;
+
+ /* Set up device context entry for PASID if not enabled already */
+ ret = intel_iommu_enable_pasid(iommu, sdev->dev);
+ if (ret) {
+ dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
+ kfree(sdev);
+ goto out;
+ }
+
+ /*
+ * PASID table is per device for better security. Therefore, for
+ * each bind of a new device even with an existing PASID, we need to
+ * call the nested mode setup function here.
+ */
+ spin_lock_irqsave(&iommu->lock, iflags);
+ ret = intel_pasid_setup_nested(iommu, dev,
+ (pgd_t *)(uintptr_t)data->gpgd,
+ data->hpasid, &data->vendor.vtd, dmar_domain,
+ data->addr_width);
+ spin_unlock_irqrestore(&iommu->lock, iflags);
+ if (ret) {
+ dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
+ data->hpasid, ret);
+ /*
+ * PASID entry should be in cleared state if nested mode
+ * set up failed. So we only need to clear IOASID tracking
+ * data such that free call will succeed.
+ */
+ kfree(sdev);
+ goto out;
+ }
+
+ svm->flags |= SVM_FLAG_GUEST_MODE;
+
+ init_rcu_head(&sdev->rcu);
+ list_add_rcu(&sdev->list, &svm->devs);
+ out:
+ if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
+ ioasid_set_data(data->hpasid, NULL);
+ kfree(svm);
+ }
+
+ mutex_unlock(&pasid_mutex);
+ return ret;
+}
+
+int intel_svm_unbind_gpasid(struct device *dev, u32 pasid)
+{
+ struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
+ struct intel_svm_dev *sdev;
+ struct intel_svm *svm;
+ int ret;
+
+ if (WARN_ON(!iommu))
+ return -EINVAL;
+
+ mutex_lock(&pasid_mutex);
+ ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
+ if (ret)
+ goto out;
+
+ if (sdev) {
+ if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
+ sdev->users--;
+ if (!sdev->users) {
+ list_del_rcu(&sdev->list);
+ intel_pasid_tear_down_entry(iommu, dev,
+ svm->pasid, false);
+ intel_svm_drain_prq(dev, svm->pasid);
+ kfree_rcu(sdev, rcu);
+
+ if (list_empty(&svm->devs)) {
+ /*
+ * We do not free the IOASID here in that
+ * IOMMU driver did not allocate it.
+ * Unlike native SVM, IOASID for guest use was
+ * allocated prior to the bind call.
+ * In any case, if the free call comes before
+ * the unbind, IOMMU driver will get notified
+ * and perform cleanup.
+ */
+ ioasid_set_data(pasid, NULL);
+ kfree(svm);
+ }
+ }
+ }
+out:
+ mutex_unlock(&pasid_mutex);
+ return ret;
+}
+
+static void _load_pasid(void *unused)
+{
+ update_pasid();
+}
+
+static void load_pasid(struct mm_struct *mm, u32 pasid)
+{
+ mutex_lock(&mm->context.lock);
+
+ /* Synchronize with READ_ONCE in update_pasid(). */
+ smp_store_release(&mm->pasid, pasid);
+
+ /* Update PASID MSR on all CPUs running the mm's tasks. */
+ on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true);
+
+ mutex_unlock(&mm->context.lock);
+}
+
+/* Caller must hold pasid_mutex, mm reference */
+static int
+intel_svm_bind_mm(struct device *dev, unsigned int flags,
+ struct svm_dev_ops *ops,
+ struct mm_struct *mm, struct intel_svm_dev **sd)
+{
+ struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
+ struct device_domain_info *info;
+ struct intel_svm_dev *sdev;
+ struct intel_svm *svm = NULL;
+ unsigned long iflags;
+ int pasid_max;
+ int ret;
+
+ if (!iommu || dmar_disabled)
+ return -EINVAL;
+
+ if (!intel_svm_capable(iommu))
+ return -ENOTSUPP;
+
+ if (dev_is_pci(dev)) {
+ pasid_max = pci_max_pasids(to_pci_dev(dev));
+ if (pasid_max < 0)
+ return -EINVAL;
+ } else
+ pasid_max = 1 << 20;
+
+ /* Bind supervisor PASID shuld have mm = NULL */
+ if (flags & SVM_FLAG_SUPERVISOR_MODE) {
+ if (!ecap_srs(iommu->ecap) || mm) {
+ pr_err("Supervisor PASID with user provided mm.\n");
+ return -EINVAL;
+ }
+ }
+
+ if (!(flags & SVM_FLAG_PRIVATE_PASID)) {
+ struct intel_svm *t;
+
+ list_for_each_entry(t, &global_svm_list, list) {
+ if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
+ continue;
+
+ svm = t;
+ if (svm->pasid >= pasid_max) {
+ dev_warn(dev,
+ "Limited PASID width. Cannot use existing PASID %d\n",
+ svm->pasid);
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ /* Find the matching device in svm list */
+ for_each_svm_dev(sdev, svm, dev) {
+ if (sdev->ops != ops) {
+ ret = -EBUSY;
+ goto out;
+ }
+ sdev->users++;
+ goto success;
+ }
+
+ break;
+ }
+ }
+
+ sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
+ if (!sdev) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ sdev->dev = dev;
+ sdev->iommu = iommu;
+
+ ret = intel_iommu_enable_pasid(iommu, dev);
+ if (ret) {
+ kfree(sdev);
+ goto out;
+ }
+
+ info = get_domain_info(dev);
+ sdev->did = FLPT_DEFAULT_DID;
+ sdev->sid = PCI_DEVID(info->bus, info->devfn);
+ if (info->ats_enabled) {
+ sdev->dev_iotlb = 1;
+ sdev->qdep = info->ats_qdep;
+ if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
+ sdev->qdep = 0;
+ }
+
+ /* Finish the setup now we know we're keeping it */
+ sdev->users = 1;
+ sdev->ops = ops;
+ init_rcu_head(&sdev->rcu);
+
+ if (!svm) {
+ svm = kzalloc(sizeof(*svm), GFP_KERNEL);
+ if (!svm) {
+ ret = -ENOMEM;
+ kfree(sdev);
+ goto out;
+ }
+
+ if (pasid_max > intel_pasid_max_id)
+ pasid_max = intel_pasid_max_id;
+
+ /* Do not use PASID 0, reserved for RID to PASID */
+ svm->pasid = ioasid_alloc(NULL, PASID_MIN,
+ pasid_max - 1, svm);
+ if (svm->pasid == INVALID_IOASID) {
+ kfree(svm);
+ kfree(sdev);
+ ret = -ENOSPC;
+ goto out;
+ }
+ svm->notifier.ops = &intel_mmuops;
+ svm->mm = mm;
+ svm->flags = flags;
+ INIT_LIST_HEAD_RCU(&svm->devs);
+ INIT_LIST_HEAD(&svm->list);
+ ret = -ENOMEM;
+ if (mm) {
+ ret = mmu_notifier_register(&svm->notifier, mm);
+ if (ret) {
+ ioasid_free(svm->pasid);
+ kfree(svm);
+ kfree(sdev);
+ goto out;
+ }
+ }
+
+ spin_lock_irqsave(&iommu->lock, iflags);
+ ret = intel_pasid_setup_first_level(iommu, dev,
+ mm ? mm->pgd : init_mm.pgd,
+ svm->pasid, FLPT_DEFAULT_DID,
+ (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
+ (cpu_feature_enabled(X86_FEATURE_LA57) ?
+ PASID_FLAG_FL5LP : 0));
+ spin_unlock_irqrestore(&iommu->lock, iflags);
+ if (ret) {
+ if (mm)
+ mmu_notifier_unregister(&svm->notifier, mm);
+ ioasid_free(svm->pasid);
+ kfree(svm);
+ kfree(sdev);
+ goto out;
+ }
+
+ list_add_tail(&svm->list, &global_svm_list);
+ if (mm) {
+ /* The newly allocated pasid is loaded to the mm. */
+ load_pasid(mm, svm->pasid);
+ }
+ } else {
+ /*
+ * Binding a new device with existing PASID, need to setup
+ * the PASID entry.
+ */
+ spin_lock_irqsave(&iommu->lock, iflags);
+ ret = intel_pasid_setup_first_level(iommu, dev,
+ mm ? mm->pgd : init_mm.pgd,
+ svm->pasid, FLPT_DEFAULT_DID,
+ (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
+ (cpu_feature_enabled(X86_FEATURE_LA57) ?
+ PASID_FLAG_FL5LP : 0));
+ spin_unlock_irqrestore(&iommu->lock, iflags);
+ if (ret) {
+ kfree(sdev);
+ goto out;
+ }
+ }
+ list_add_rcu(&sdev->list, &svm->devs);
+success:
+ sdev->pasid = svm->pasid;
+ sdev->sva.dev = dev;
+ if (sd)
+ *sd = sdev;
+ ret = 0;
+out:
+ return ret;
+}
+
+/* Caller must hold pasid_mutex */
+static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
+{
+ struct intel_svm_dev *sdev;
+ struct intel_iommu *iommu;
+ struct intel_svm *svm;
+ int ret = -EINVAL;
+
+ iommu = device_to_iommu(dev, NULL, NULL);
+ if (!iommu)
+ goto out;
+
+ ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
+ if (ret)
+ goto out;
+
+ if (sdev) {
+ sdev->users--;
+ if (!sdev->users) {
+ list_del_rcu(&sdev->list);
+ /* Flush the PASID cache and IOTLB for this device.
+ * Note that we do depend on the hardware *not* using
+ * the PASID any more. Just as we depend on other
+ * devices never using PASIDs that they have no right
+ * to use. We have a *shared* PASID table, because it's
+ * large and has to be physically contiguous. So it's
+ * hard to be as defensive as we might like. */
+ intel_pasid_tear_down_entry(iommu, dev,
+ svm->pasid, false);
+ intel_svm_drain_prq(dev, svm->pasid);
+ kfree_rcu(sdev, rcu);
+
+ if (list_empty(&svm->devs)) {
+ ioasid_free(svm->pasid);
+ if (svm->mm) {
+ mmu_notifier_unregister(&svm->notifier, svm->mm);
+ /* Clear mm's pasid. */
+ load_pasid(svm->mm, PASID_DISABLED);
+ }
+ list_del(&svm->list);
+ /* We mandate that no page faults may be outstanding
+ * for the PASID when intel_svm_unbind_mm() is called.
+ * If that is not obeyed, subtle errors will happen.
+ * Let's make them less subtle... */
+ memset(svm, 0x6b, sizeof(*svm));
+ kfree(svm);
+ }
+ }
+ }
+out:
+ return ret;
+}
+
+/* Page request queue descriptor */
+struct page_req_dsc {
+ union {
+ struct {
+ u64 type:8;
+ u64 pasid_present:1;
+ u64 priv_data_present:1;
+ u64 rsvd:6;
+ u64 rid:16;
+ u64 pasid:20;
+ u64 exe_req:1;
+ u64 pm_req:1;
+ u64 rsvd2:10;
+ };
+ u64 qw_0;
+ };
+ union {
+ struct {
+ u64 rd_req:1;
+ u64 wr_req:1;
+ u64 lpig:1;
+ u64 prg_index:9;
+ u64 addr:52;
+ };
+ u64 qw_1;
+ };
+ u64 priv_data[2];
+};
+
+#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20)
+
+static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
+{
+ unsigned long requested = 0;
+
+ if (req->exe_req)
+ requested |= VM_EXEC;
+
+ if (req->rd_req)
+ requested |= VM_READ;
+
+ if (req->wr_req)
+ requested |= VM_WRITE;
+
+ return (requested & ~vma->vm_flags) != 0;
+}
+
+static bool is_canonical_address(u64 addr)
+{
+ int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
+ long saddr = (long) addr;
+
+ return (((saddr << shift) >> shift) == saddr);
+}
+
+/**
+ * intel_svm_drain_prq - Drain page requests and responses for a pasid
+ * @dev: target device
+ * @pasid: pasid for draining
+ *
+ * Drain all pending page requests and responses related to @pasid in both
+ * software and hardware. This is supposed to be called after the device
+ * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
+ * and DevTLB have been invalidated.
+ *
+ * It waits until all pending page requests for @pasid in the page fault
+ * queue are completed by the prq handling thread. Then follow the steps
+ * described in VT-d spec CH7.10 to drain all page requests and page
+ * responses pending in the hardware.
+ */
+static void intel_svm_drain_prq(struct device *dev, u32 pasid)
+{
+ struct device_domain_info *info;
+ struct dmar_domain *domain;
+ struct intel_iommu *iommu;
+ struct qi_desc desc[3];
+ struct pci_dev *pdev;
+ int head, tail;
+ u16 sid, did;
+ int qdep;
+
+ info = get_domain_info(dev);
+ if (WARN_ON(!info || !dev_is_pci(dev)))
+ return;
+
+ if (!info->pri_enabled)
+ return;
+
+ iommu = info->iommu;
+ domain = info->domain;
+ pdev = to_pci_dev(dev);
+ sid = PCI_DEVID(info->bus, info->devfn);
+ did = domain->iommu_did[iommu->seq_id];
+ qdep = pci_ats_queue_depth(pdev);
+
+ /*
+ * Check and wait until all pending page requests in the queue are
+ * handled by the prq handling thread.
+ */
+prq_retry:
+ reinit_completion(&iommu->prq_complete);
+ tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
+ head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
+ while (head != tail) {
+ struct page_req_dsc *req;
+
+ req = &iommu->prq[head / sizeof(*req)];
+ if (!req->pasid_present || req->pasid != pasid) {
+ head = (head + sizeof(*req)) & PRQ_RING_MASK;
+ continue;
+ }
+
+ wait_for_completion(&iommu->prq_complete);
+ goto prq_retry;
+ }
+
+ /*
+ * Perform steps described in VT-d spec CH7.10 to drain page
+ * requests and responses in hardware.
+ */
+ memset(desc, 0, sizeof(desc));
+ desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
+ QI_IWD_FENCE |
+ QI_IWD_TYPE;
+ desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
+ QI_EIOTLB_DID(did) |
+ QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
+ QI_EIOTLB_TYPE;
+ desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
+ QI_DEV_EIOTLB_SID(sid) |
+ QI_DEV_EIOTLB_QDEP(qdep) |
+ QI_DEIOTLB_TYPE |
+ QI_DEV_IOTLB_PFSID(info->pfsid);
+qi_retry:
+ reinit_completion(&iommu->prq_complete);
+ qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
+ if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
+ wait_for_completion(&iommu->prq_complete);
+ goto qi_retry;
+ }
+}
+
+static int prq_to_iommu_prot(struct page_req_dsc *req)
+{
+ int prot = 0;
+
+ if (req->rd_req)
+ prot |= IOMMU_FAULT_PERM_READ;
+ if (req->wr_req)
+ prot |= IOMMU_FAULT_PERM_WRITE;
+ if (req->exe_req)
+ prot |= IOMMU_FAULT_PERM_EXEC;
+ if (req->pm_req)
+ prot |= IOMMU_FAULT_PERM_PRIV;
+
+ return prot;
+}
+
+static int
+intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
+{
+ struct iommu_fault_event event;
+
+ if (!dev || !dev_is_pci(dev))
+ return -ENODEV;
+
+ /* Fill in event data for device specific processing */
+ memset(&event, 0, sizeof(struct iommu_fault_event));
+ event.fault.type = IOMMU_FAULT_PAGE_REQ;
+ event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT;
+ event.fault.prm.pasid = desc->pasid;
+ event.fault.prm.grpid = desc->prg_index;
+ event.fault.prm.perm = prq_to_iommu_prot(desc);
+
+ if (desc->lpig)
+ event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
+ if (desc->pasid_present) {
+ event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
+ event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
+ }
+ if (desc->priv_data_present) {
+ /*
+ * Set last page in group bit if private data is present,
+ * page response is required as it does for LPIG.
+ * iommu_report_device_fault() doesn't understand this vendor
+ * specific requirement thus we set last_page as a workaround.
+ */
+ event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
+ event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
+ memcpy(event.fault.prm.private_data, desc->priv_data,
+ sizeof(desc->priv_data));
+ }
+
+ return iommu_report_device_fault(dev, &event);
+}
+
+static irqreturn_t prq_event_thread(int irq, void *d)
+{
+ struct intel_svm_dev *sdev = NULL;
+ struct intel_iommu *iommu = d;
+ struct intel_svm *svm = NULL;
+ int head, tail, handled = 0;
+
+ /* Clear PPR bit before reading head/tail registers, to
+ * ensure that we get a new interrupt if needed. */
+ writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);
+
+ tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
+ head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
+ while (head != tail) {
+ struct vm_area_struct *vma;
+ struct page_req_dsc *req;
+ struct qi_desc resp;
+ int result;
+ vm_fault_t ret;
+ u64 address;
+
+ handled = 1;
+
+ req = &iommu->prq[head / sizeof(*req)];
+
+ result = QI_RESP_FAILURE;
+ address = (u64)req->addr << VTD_PAGE_SHIFT;
+ if (!req->pasid_present) {
+ pr_err("%s: Page request without PASID: %08llx %08llx\n",
+ iommu->name, ((unsigned long long *)req)[0],
+ ((unsigned long long *)req)[1]);
+ goto no_pasid;
+ }
+ /* We shall not receive page request for supervisor SVM */
+ if (req->pm_req && (req->rd_req | req->wr_req)) {
+ pr_err("Unexpected page request in Privilege Mode");
+ /* No need to find the matching sdev as for bad_req */
+ goto no_pasid;
+ }
+ /* DMA read with exec requeset is not supported. */
+ if (req->exe_req && req->rd_req) {
+ pr_err("Execution request not supported\n");
+ goto no_pasid;
+ }
+ if (!svm || svm->pasid != req->pasid) {
+ rcu_read_lock();
+ svm = ioasid_find(NULL, req->pasid, NULL);
+ /* It *can't* go away, because the driver is not permitted
+ * to unbind the mm while any page faults are outstanding.
+ * So we only need RCU to protect the internal idr code. */
+ rcu_read_unlock();
+ if (IS_ERR_OR_NULL(svm)) {
+ pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
+ iommu->name, req->pasid, ((unsigned long long *)req)[0],
+ ((unsigned long long *)req)[1]);
+ goto no_pasid;
+ }
+ }
+
+ if (!sdev || sdev->sid != req->rid) {
+ struct intel_svm_dev *t;
+
+ sdev = NULL;
+ rcu_read_lock();
+ list_for_each_entry_rcu(t, &svm->devs, list) {
+ if (t->sid == req->rid) {
+ sdev = t;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ result = QI_RESP_INVALID;
+ /* Since we're using init_mm.pgd directly, we should never take
+ * any faults on kernel addresses. */
+ if (!svm->mm)
+ goto bad_req;
+
+ /* If address is not canonical, return invalid response */
+ if (!is_canonical_address(address))
+ goto bad_req;
+
+ /*
+ * If prq is to be handled outside iommu driver via receiver of
+ * the fault notifiers, we skip the page response here.
+ */
+ if (svm->flags & SVM_FLAG_GUEST_MODE) {
+ if (sdev && !intel_svm_prq_report(sdev->dev, req))
+ goto prq_advance;
+ else
+ goto bad_req;
+ }
+
+ /* If the mm is already defunct, don't handle faults. */
+ if (!mmget_not_zero(svm->mm))
+ goto bad_req;
+
+ mmap_read_lock(svm->mm);
+ vma = find_extend_vma(svm->mm, address);
+ if (!vma || address < vma->vm_start)
+ goto invalid;
+
+ if (access_error(vma, req))
+ goto invalid;
+
+ ret = handle_mm_fault(vma, address,
+ req->wr_req ? FAULT_FLAG_WRITE : 0,
+ NULL);
+ if (ret & VM_FAULT_ERROR)
+ goto invalid;
+
+ result = QI_RESP_SUCCESS;
+invalid:
+ mmap_read_unlock(svm->mm);
+ mmput(svm->mm);
+bad_req:
+ WARN_ON(!sdev);
+ if (sdev && sdev->ops && sdev->ops->fault_cb) {
+ int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
+ (req->exe_req << 1) | (req->pm_req);
+ sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
+ req->priv_data, rwxp, result);
+ }
+ /* We get here in the error case where the PASID lookup failed,
+ and these can be NULL. Do not use them below this point! */
+ sdev = NULL;
+ svm = NULL;
+no_pasid:
+ if (req->lpig || req->priv_data_present) {
+ /*
+ * Per VT-d spec. v3.0 ch7.7, system software must
+ * respond with page group response if private data
+ * is present (PDP) or last page in group (LPIG) bit
+ * is set. This is an additional VT-d feature beyond
+ * PCI ATS spec.
+ */
+ resp.qw0 = QI_PGRP_PASID(req->pasid) |
+ QI_PGRP_DID(req->rid) |
+ QI_PGRP_PASID_P(req->pasid_present) |
+ QI_PGRP_PDP(req->priv_data_present) |
+ QI_PGRP_RESP_CODE(result) |
+ QI_PGRP_RESP_TYPE;
+ resp.qw1 = QI_PGRP_IDX(req->prg_index) |
+ QI_PGRP_LPIG(req->lpig);
+ resp.qw2 = 0;
+ resp.qw3 = 0;
+
+ if (req->priv_data_present)
+ memcpy(&resp.qw2, req->priv_data,
+ sizeof(req->priv_data));
+ qi_submit_sync(iommu, &resp, 1, 0);
+ }
+prq_advance:
+ head = (head + sizeof(*req)) & PRQ_RING_MASK;
+ }
+
+ dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);
+
+ /*
+ * Clear the page request overflow bit and wake up all threads that
+ * are waiting for the completion of this handling.
+ */
+ if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
+ pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n",
+ iommu->name);
+ head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
+ tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
+ if (head == tail) {
+ writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);
+ pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared",
+ iommu->name);
+ }
+ }
+
+ if (!completion_done(&iommu->prq_complete))
+ complete(&iommu->prq_complete);
+
+ return IRQ_RETVAL(handled);
+}
+
+#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
+struct iommu_sva *
+intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
+{
+ struct iommu_sva *sva = ERR_PTR(-EINVAL);
+ struct intel_svm_dev *sdev = NULL;
+ unsigned int flags = 0;
+ int ret;
+
+ /*
+ * TODO: Consolidate with generic iommu-sva bind after it is merged.
+ * It will require shared SVM data structures, i.e. combine io_mm
+ * and intel_svm etc.
+ */
+ if (drvdata)
+ flags = *(unsigned int *)drvdata;
+ mutex_lock(&pasid_mutex);
+ ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev);
+ if (ret)
+ sva = ERR_PTR(ret);
+ else if (sdev)
+ sva = &sdev->sva;
+ else
+ WARN(!sdev, "SVM bind succeeded with no sdev!\n");
+
+ mutex_unlock(&pasid_mutex);
+
+ return sva;
+}
+
+void intel_svm_unbind(struct iommu_sva *sva)
+{
+ struct intel_svm_dev *sdev;
+
+ mutex_lock(&pasid_mutex);
+ sdev = to_intel_svm_dev(sva);
+ intel_svm_unbind_mm(sdev->dev, sdev->pasid);
+ mutex_unlock(&pasid_mutex);
+}
+
+u32 intel_svm_get_pasid(struct iommu_sva *sva)
+{
+ struct intel_svm_dev *sdev;
+ u32 pasid;
+
+ mutex_lock(&pasid_mutex);
+ sdev = to_intel_svm_dev(sva);
+ pasid = sdev->pasid;
+ mutex_unlock(&pasid_mutex);
+
+ return pasid;
+}
+
+int intel_svm_page_response(struct device *dev,
+ struct iommu_fault_event *evt,
+ struct iommu_page_response *msg)
+{
+ struct iommu_fault_page_request *prm;
+ struct intel_svm_dev *sdev = NULL;
+ struct intel_svm *svm = NULL;
+ struct intel_iommu *iommu;
+ bool private_present;
+ bool pasid_present;
+ bool last_page;
+ u8 bus, devfn;
+ int ret = 0;
+ u16 sid;
+
+ if (!dev || !dev_is_pci(dev))
+ return -ENODEV;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return -ENODEV;
+
+ if (!msg || !evt)
+ return -EINVAL;
+
+ mutex_lock(&pasid_mutex);
+
+ prm = &evt->fault.prm;
+ sid = PCI_DEVID(bus, devfn);
+ pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
+ private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
+ last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
+
+ if (!pasid_present) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (prm->pasid == 0 || prm->pasid >= PASID_MAX) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = pasid_to_svm_sdev(dev, prm->pasid, &svm, &sdev);
+ if (ret || !sdev) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ /*
+ * For responses from userspace, need to make sure that the
+ * pasid has been bound to its mm.
+ */
+ if (svm->flags & SVM_FLAG_GUEST_MODE) {
+ struct mm_struct *mm;
+
+ mm = get_task_mm(current);
+ if (!mm) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (mm != svm->mm) {
+ ret = -ENODEV;
+ mmput(mm);
+ goto out;
+ }
+
+ mmput(mm);
+ }
+
+ /*
+ * Per VT-d spec. v3.0 ch7.7, system software must respond
+ * with page group response if private data is present (PDP)
+ * or last page in group (LPIG) bit is set. This is an
+ * additional VT-d requirement beyond PCI ATS spec.
+ */
+ if (last_page || private_present) {
+ struct qi_desc desc;
+
+ desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) |
+ QI_PGRP_PASID_P(pasid_present) |
+ QI_PGRP_PDP(private_present) |
+ QI_PGRP_RESP_CODE(msg->code) |
+ QI_PGRP_RESP_TYPE;
+ desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page);
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+ if (private_present)
+ memcpy(&desc.qw2, prm->private_data,
+ sizeof(prm->private_data));
+
+ qi_submit_sync(iommu, &desc, 1, 0);
+ }
+out:
+ mutex_unlock(&pasid_mutex);
+ return ret;
+}
diff --git a/drivers/iommu/intel/trace.c b/drivers/iommu/intel/trace.c
new file mode 100644
index 000000000..bfb6a6e37
--- /dev/null
+++ b/drivers/iommu/intel/trace.c
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel IOMMU trace support
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ */
+
+#include <linux/string.h>
+#include <linux/types.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/intel_iommu.h>
diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c
new file mode 100644
index 000000000..3bcd3afe9
--- /dev/null
+++ b/drivers/iommu/io-pgtable-arm-v7s.c
@@ -0,0 +1,991 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CPU-agnostic ARM page table allocator.
+ *
+ * ARMv7 Short-descriptor format, supporting
+ * - Basic memory attributes
+ * - Simplified access permissions (AP[2:1] model)
+ * - Backwards-compatible TEX remap
+ * - Large pages/supersections (if indicated by the caller)
+ *
+ * Not supporting:
+ * - Legacy access permissions (AP[2:0] model)
+ *
+ * Almost certainly never supporting:
+ * - PXN
+ * - Domains
+ *
+ * Copyright (C) 2014-2015 ARM Limited
+ * Copyright (c) 2014-2015 MediaTek Inc.
+ */
+
+#define pr_fmt(fmt) "arm-v7s io-pgtable: " fmt
+
+#include <linux/atomic.h>
+#include <linux/dma-mapping.h>
+#include <linux/gfp.h>
+#include <linux/io-pgtable.h>
+#include <linux/iommu.h>
+#include <linux/kernel.h>
+#include <linux/kmemleak.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include <asm/barrier.h>
+
+/* Struct accessors */
+#define io_pgtable_to_data(x) \
+ container_of((x), struct arm_v7s_io_pgtable, iop)
+
+#define io_pgtable_ops_to_data(x) \
+ io_pgtable_to_data(io_pgtable_ops_to_pgtable(x))
+
+/*
+ * We have 32 bits total; 12 bits resolved at level 1, 8 bits at level 2,
+ * and 12 bits in a page. With some carefully-chosen coefficients we can
+ * hide the ugly inconsistencies behind these macros and at least let the
+ * rest of the code pretend to be somewhat sane.
+ */
+#define ARM_V7S_ADDR_BITS 32
+#define _ARM_V7S_LVL_BITS(lvl) (16 - (lvl) * 4)
+#define ARM_V7S_LVL_SHIFT(lvl) (ARM_V7S_ADDR_BITS - (4 + 8 * (lvl)))
+#define ARM_V7S_TABLE_SHIFT 10
+
+#define ARM_V7S_PTES_PER_LVL(lvl) (1 << _ARM_V7S_LVL_BITS(lvl))
+#define ARM_V7S_TABLE_SIZE(lvl) \
+ (ARM_V7S_PTES_PER_LVL(lvl) * sizeof(arm_v7s_iopte))
+
+#define ARM_V7S_BLOCK_SIZE(lvl) (1UL << ARM_V7S_LVL_SHIFT(lvl))
+#define ARM_V7S_LVL_MASK(lvl) ((u32)(~0U << ARM_V7S_LVL_SHIFT(lvl)))
+#define ARM_V7S_TABLE_MASK ((u32)(~0U << ARM_V7S_TABLE_SHIFT))
+#define _ARM_V7S_IDX_MASK(lvl) (ARM_V7S_PTES_PER_LVL(lvl) - 1)
+#define ARM_V7S_LVL_IDX(addr, lvl) ({ \
+ int _l = lvl; \
+ ((u32)(addr) >> ARM_V7S_LVL_SHIFT(_l)) & _ARM_V7S_IDX_MASK(_l); \
+})
+
+/*
+ * Large page/supersection entries are effectively a block of 16 page/section
+ * entries, along the lines of the LPAE contiguous hint, but all with the
+ * same output address. For want of a better common name we'll call them
+ * "contiguous" versions of their respective page/section entries here, but
+ * noting the distinction (WRT to TLB maintenance) that they represent *one*
+ * entry repeated 16 times, not 16 separate entries (as in the LPAE case).
+ */
+#define ARM_V7S_CONT_PAGES 16
+
+/* PTE type bits: these are all mixed up with XN/PXN bits in most cases */
+#define ARM_V7S_PTE_TYPE_TABLE 0x1
+#define ARM_V7S_PTE_TYPE_PAGE 0x2
+#define ARM_V7S_PTE_TYPE_CONT_PAGE 0x1
+
+#define ARM_V7S_PTE_IS_VALID(pte) (((pte) & 0x3) != 0)
+#define ARM_V7S_PTE_IS_TABLE(pte, lvl) \
+ ((lvl) == 1 && (((pte) & 0x3) == ARM_V7S_PTE_TYPE_TABLE))
+
+/* Page table bits */
+#define ARM_V7S_ATTR_XN(lvl) BIT(4 * (2 - (lvl)))
+#define ARM_V7S_ATTR_B BIT(2)
+#define ARM_V7S_ATTR_C BIT(3)
+#define ARM_V7S_ATTR_NS_TABLE BIT(3)
+#define ARM_V7S_ATTR_NS_SECTION BIT(19)
+
+#define ARM_V7S_CONT_SECTION BIT(18)
+#define ARM_V7S_CONT_PAGE_XN_SHIFT 15
+
+/*
+ * The attribute bits are consistently ordered*, but occupy bits [17:10] of
+ * a level 1 PTE vs. bits [11:4] at level 2. Thus we define the individual
+ * fields relative to that 8-bit block, plus a total shift relative to the PTE.
+ */
+#define ARM_V7S_ATTR_SHIFT(lvl) (16 - (lvl) * 6)
+
+#define ARM_V7S_ATTR_MASK 0xff
+#define ARM_V7S_ATTR_AP0 BIT(0)
+#define ARM_V7S_ATTR_AP1 BIT(1)
+#define ARM_V7S_ATTR_AP2 BIT(5)
+#define ARM_V7S_ATTR_S BIT(6)
+#define ARM_V7S_ATTR_NG BIT(7)
+#define ARM_V7S_TEX_SHIFT 2
+#define ARM_V7S_TEX_MASK 0x7
+#define ARM_V7S_ATTR_TEX(val) (((val) & ARM_V7S_TEX_MASK) << ARM_V7S_TEX_SHIFT)
+
+/* MediaTek extend the two bits for PA 32bit/33bit */
+#define ARM_V7S_ATTR_MTK_PA_BIT32 BIT(9)
+#define ARM_V7S_ATTR_MTK_PA_BIT33 BIT(4)
+
+/* *well, except for TEX on level 2 large pages, of course :( */
+#define ARM_V7S_CONT_PAGE_TEX_SHIFT 6
+#define ARM_V7S_CONT_PAGE_TEX_MASK (ARM_V7S_TEX_MASK << ARM_V7S_CONT_PAGE_TEX_SHIFT)
+
+/* Simplified access permissions */
+#define ARM_V7S_PTE_AF ARM_V7S_ATTR_AP0
+#define ARM_V7S_PTE_AP_UNPRIV ARM_V7S_ATTR_AP1
+#define ARM_V7S_PTE_AP_RDONLY ARM_V7S_ATTR_AP2
+
+/* Register bits */
+#define ARM_V7S_RGN_NC 0
+#define ARM_V7S_RGN_WBWA 1
+#define ARM_V7S_RGN_WT 2
+#define ARM_V7S_RGN_WB 3
+
+#define ARM_V7S_PRRR_TYPE_DEVICE 1
+#define ARM_V7S_PRRR_TYPE_NORMAL 2
+#define ARM_V7S_PRRR_TR(n, type) (((type) & 0x3) << ((n) * 2))
+#define ARM_V7S_PRRR_DS0 BIT(16)
+#define ARM_V7S_PRRR_DS1 BIT(17)
+#define ARM_V7S_PRRR_NS0 BIT(18)
+#define ARM_V7S_PRRR_NS1 BIT(19)
+#define ARM_V7S_PRRR_NOS(n) BIT((n) + 24)
+
+#define ARM_V7S_NMRR_IR(n, attr) (((attr) & 0x3) << ((n) * 2))
+#define ARM_V7S_NMRR_OR(n, attr) (((attr) & 0x3) << ((n) * 2 + 16))
+
+#define ARM_V7S_TTBR_S BIT(1)
+#define ARM_V7S_TTBR_NOS BIT(5)
+#define ARM_V7S_TTBR_ORGN_ATTR(attr) (((attr) & 0x3) << 3)
+#define ARM_V7S_TTBR_IRGN_ATTR(attr) \
+ ((((attr) & 0x1) << 6) | (((attr) & 0x2) >> 1))
+
+#ifdef CONFIG_ZONE_DMA32
+#define ARM_V7S_TABLE_GFP_DMA GFP_DMA32
+#define ARM_V7S_TABLE_SLAB_FLAGS SLAB_CACHE_DMA32
+#else
+#define ARM_V7S_TABLE_GFP_DMA GFP_DMA
+#define ARM_V7S_TABLE_SLAB_FLAGS SLAB_CACHE_DMA
+#endif
+
+typedef u32 arm_v7s_iopte;
+
+static bool selftest_running;
+
+struct arm_v7s_io_pgtable {
+ struct io_pgtable iop;
+
+ arm_v7s_iopte *pgd;
+ struct kmem_cache *l2_tables;
+ spinlock_t split_lock;
+};
+
+static bool arm_v7s_pte_is_cont(arm_v7s_iopte pte, int lvl);
+
+static dma_addr_t __arm_v7s_dma_addr(void *pages)
+{
+ return (dma_addr_t)virt_to_phys(pages);
+}
+
+static bool arm_v7s_is_mtk_enabled(struct io_pgtable_cfg *cfg)
+{
+ return IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT) &&
+ (cfg->quirks & IO_PGTABLE_QUIRK_ARM_MTK_EXT);
+}
+
+static arm_v7s_iopte paddr_to_iopte(phys_addr_t paddr, int lvl,
+ struct io_pgtable_cfg *cfg)
+{
+ arm_v7s_iopte pte = paddr & ARM_V7S_LVL_MASK(lvl);
+
+ if (!arm_v7s_is_mtk_enabled(cfg))
+ return pte;
+
+ if (paddr & BIT_ULL(32))
+ pte |= ARM_V7S_ATTR_MTK_PA_BIT32;
+ if (paddr & BIT_ULL(33))
+ pte |= ARM_V7S_ATTR_MTK_PA_BIT33;
+ return pte;
+}
+
+static phys_addr_t iopte_to_paddr(arm_v7s_iopte pte, int lvl,
+ struct io_pgtable_cfg *cfg)
+{
+ arm_v7s_iopte mask;
+ phys_addr_t paddr;
+
+ if (ARM_V7S_PTE_IS_TABLE(pte, lvl))
+ mask = ARM_V7S_TABLE_MASK;
+ else if (arm_v7s_pte_is_cont(pte, lvl))
+ mask = ARM_V7S_LVL_MASK(lvl) * ARM_V7S_CONT_PAGES;
+ else
+ mask = ARM_V7S_LVL_MASK(lvl);
+
+ paddr = pte & mask;
+ if (!arm_v7s_is_mtk_enabled(cfg))
+ return paddr;
+
+ if (pte & ARM_V7S_ATTR_MTK_PA_BIT32)
+ paddr |= BIT_ULL(32);
+ if (pte & ARM_V7S_ATTR_MTK_PA_BIT33)
+ paddr |= BIT_ULL(33);
+ return paddr;
+}
+
+static arm_v7s_iopte *iopte_deref(arm_v7s_iopte pte, int lvl,
+ struct arm_v7s_io_pgtable *data)
+{
+ return phys_to_virt(iopte_to_paddr(pte, lvl, &data->iop.cfg));
+}
+
+static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp,
+ struct arm_v7s_io_pgtable *data)
+{
+ struct io_pgtable_cfg *cfg = &data->iop.cfg;
+ struct device *dev = cfg->iommu_dev;
+ phys_addr_t phys;
+ dma_addr_t dma;
+ size_t size = ARM_V7S_TABLE_SIZE(lvl);
+ void *table = NULL;
+
+ if (lvl == 1)
+ table = (void *)__get_free_pages(
+ __GFP_ZERO | ARM_V7S_TABLE_GFP_DMA, get_order(size));
+ else if (lvl == 2)
+ table = kmem_cache_zalloc(data->l2_tables, gfp);
+
+ if (!table)
+ return NULL;
+
+ phys = virt_to_phys(table);
+ if (phys != (arm_v7s_iopte)phys) {
+ /* Doesn't fit in PTE */
+ dev_err(dev, "Page table does not fit in PTE: %pa", &phys);
+ goto out_free;
+ }
+ if (!cfg->coherent_walk) {
+ dma = dma_map_single(dev, table, size, DMA_TO_DEVICE);
+ if (dma_mapping_error(dev, dma))
+ goto out_free;
+ /*
+ * We depend on the IOMMU being able to work with any physical
+ * address directly, so if the DMA layer suggests otherwise by
+ * translating or truncating them, that bodes very badly...
+ */
+ if (dma != phys)
+ goto out_unmap;
+ }
+ if (lvl == 2)
+ kmemleak_ignore(table);
+ return table;
+
+out_unmap:
+ dev_err(dev, "Cannot accommodate DMA translation for IOMMU page tables\n");
+ dma_unmap_single(dev, dma, size, DMA_TO_DEVICE);
+out_free:
+ if (lvl == 1)
+ free_pages((unsigned long)table, get_order(size));
+ else
+ kmem_cache_free(data->l2_tables, table);
+ return NULL;
+}
+
+static void __arm_v7s_free_table(void *table, int lvl,
+ struct arm_v7s_io_pgtable *data)
+{
+ struct io_pgtable_cfg *cfg = &data->iop.cfg;
+ struct device *dev = cfg->iommu_dev;
+ size_t size = ARM_V7S_TABLE_SIZE(lvl);
+
+ if (!cfg->coherent_walk)
+ dma_unmap_single(dev, __arm_v7s_dma_addr(table), size,
+ DMA_TO_DEVICE);
+ if (lvl == 1)
+ free_pages((unsigned long)table, get_order(size));
+ else
+ kmem_cache_free(data->l2_tables, table);
+}
+
+static void __arm_v7s_pte_sync(arm_v7s_iopte *ptep, int num_entries,
+ struct io_pgtable_cfg *cfg)
+{
+ if (cfg->coherent_walk)
+ return;
+
+ dma_sync_single_for_device(cfg->iommu_dev, __arm_v7s_dma_addr(ptep),
+ num_entries * sizeof(*ptep), DMA_TO_DEVICE);
+}
+static void __arm_v7s_set_pte(arm_v7s_iopte *ptep, arm_v7s_iopte pte,
+ int num_entries, struct io_pgtable_cfg *cfg)
+{
+ int i;
+
+ for (i = 0; i < num_entries; i++)
+ ptep[i] = pte;
+
+ __arm_v7s_pte_sync(ptep, num_entries, cfg);
+}
+
+static arm_v7s_iopte arm_v7s_prot_to_pte(int prot, int lvl,
+ struct io_pgtable_cfg *cfg)
+{
+ bool ap = !(cfg->quirks & IO_PGTABLE_QUIRK_NO_PERMS);
+ arm_v7s_iopte pte = ARM_V7S_ATTR_NG | ARM_V7S_ATTR_S;
+
+ if (!(prot & IOMMU_MMIO))
+ pte |= ARM_V7S_ATTR_TEX(1);
+ if (ap) {
+ pte |= ARM_V7S_PTE_AF;
+ if (!(prot & IOMMU_PRIV))
+ pte |= ARM_V7S_PTE_AP_UNPRIV;
+ if (!(prot & IOMMU_WRITE))
+ pte |= ARM_V7S_PTE_AP_RDONLY;
+ }
+ pte <<= ARM_V7S_ATTR_SHIFT(lvl);
+
+ if ((prot & IOMMU_NOEXEC) && ap)
+ pte |= ARM_V7S_ATTR_XN(lvl);
+ if (prot & IOMMU_MMIO)
+ pte |= ARM_V7S_ATTR_B;
+ else if (prot & IOMMU_CACHE)
+ pte |= ARM_V7S_ATTR_B | ARM_V7S_ATTR_C;
+
+ pte |= ARM_V7S_PTE_TYPE_PAGE;
+ if (lvl == 1 && (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS))
+ pte |= ARM_V7S_ATTR_NS_SECTION;
+
+ return pte;
+}
+
+static int arm_v7s_pte_to_prot(arm_v7s_iopte pte, int lvl)
+{
+ int prot = IOMMU_READ;
+ arm_v7s_iopte attr = pte >> ARM_V7S_ATTR_SHIFT(lvl);
+
+ if (!(attr & ARM_V7S_PTE_AP_RDONLY))
+ prot |= IOMMU_WRITE;
+ if (!(attr & ARM_V7S_PTE_AP_UNPRIV))
+ prot |= IOMMU_PRIV;
+ if ((attr & (ARM_V7S_TEX_MASK << ARM_V7S_TEX_SHIFT)) == 0)
+ prot |= IOMMU_MMIO;
+ else if (pte & ARM_V7S_ATTR_C)
+ prot |= IOMMU_CACHE;
+ if (pte & ARM_V7S_ATTR_XN(lvl))
+ prot |= IOMMU_NOEXEC;
+
+ return prot;
+}
+
+static arm_v7s_iopte arm_v7s_pte_to_cont(arm_v7s_iopte pte, int lvl)
+{
+ if (lvl == 1) {
+ pte |= ARM_V7S_CONT_SECTION;
+ } else if (lvl == 2) {
+ arm_v7s_iopte xn = pte & ARM_V7S_ATTR_XN(lvl);
+ arm_v7s_iopte tex = pte & ARM_V7S_CONT_PAGE_TEX_MASK;
+
+ pte ^= xn | tex | ARM_V7S_PTE_TYPE_PAGE;
+ pte |= (xn << ARM_V7S_CONT_PAGE_XN_SHIFT) |
+ (tex << ARM_V7S_CONT_PAGE_TEX_SHIFT) |
+ ARM_V7S_PTE_TYPE_CONT_PAGE;
+ }
+ return pte;
+}
+
+static arm_v7s_iopte arm_v7s_cont_to_pte(arm_v7s_iopte pte, int lvl)
+{
+ if (lvl == 1) {
+ pte &= ~ARM_V7S_CONT_SECTION;
+ } else if (lvl == 2) {
+ arm_v7s_iopte xn = pte & BIT(ARM_V7S_CONT_PAGE_XN_SHIFT);
+ arm_v7s_iopte tex = pte & (ARM_V7S_CONT_PAGE_TEX_MASK <<
+ ARM_V7S_CONT_PAGE_TEX_SHIFT);
+
+ pte ^= xn | tex | ARM_V7S_PTE_TYPE_CONT_PAGE;
+ pte |= (xn >> ARM_V7S_CONT_PAGE_XN_SHIFT) |
+ (tex >> ARM_V7S_CONT_PAGE_TEX_SHIFT) |
+ ARM_V7S_PTE_TYPE_PAGE;
+ }
+ return pte;
+}
+
+static bool arm_v7s_pte_is_cont(arm_v7s_iopte pte, int lvl)
+{
+ if (lvl == 1 && !ARM_V7S_PTE_IS_TABLE(pte, lvl))
+ return pte & ARM_V7S_CONT_SECTION;
+ else if (lvl == 2)
+ return !(pte & ARM_V7S_PTE_TYPE_PAGE);
+ return false;
+}
+
+static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *,
+ struct iommu_iotlb_gather *, unsigned long,
+ size_t, int, arm_v7s_iopte *);
+
+static int arm_v7s_init_pte(struct arm_v7s_io_pgtable *data,
+ unsigned long iova, phys_addr_t paddr, int prot,
+ int lvl, int num_entries, arm_v7s_iopte *ptep)
+{
+ struct io_pgtable_cfg *cfg = &data->iop.cfg;
+ arm_v7s_iopte pte;
+ int i;
+
+ for (i = 0; i < num_entries; i++)
+ if (ARM_V7S_PTE_IS_TABLE(ptep[i], lvl)) {
+ /*
+ * We need to unmap and free the old table before
+ * overwriting it with a block entry.
+ */
+ arm_v7s_iopte *tblp;
+ size_t sz = ARM_V7S_BLOCK_SIZE(lvl);
+
+ tblp = ptep - ARM_V7S_LVL_IDX(iova, lvl);
+ if (WARN_ON(__arm_v7s_unmap(data, NULL, iova + i * sz,
+ sz, lvl, tblp) != sz))
+ return -EINVAL;
+ } else if (ptep[i]) {
+ /* We require an unmap first */
+ WARN_ON(!selftest_running);
+ return -EEXIST;
+ }
+
+ pte = arm_v7s_prot_to_pte(prot, lvl, cfg);
+ if (num_entries > 1)
+ pte = arm_v7s_pte_to_cont(pte, lvl);
+
+ pte |= paddr_to_iopte(paddr, lvl, cfg);
+
+ __arm_v7s_set_pte(ptep, pte, num_entries, cfg);
+ return 0;
+}
+
+static arm_v7s_iopte arm_v7s_install_table(arm_v7s_iopte *table,
+ arm_v7s_iopte *ptep,
+ arm_v7s_iopte curr,
+ struct io_pgtable_cfg *cfg)
+{
+ arm_v7s_iopte old, new;
+
+ new = virt_to_phys(table) | ARM_V7S_PTE_TYPE_TABLE;
+ if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)
+ new |= ARM_V7S_ATTR_NS_TABLE;
+
+ /*
+ * Ensure the table itself is visible before its PTE can be.
+ * Whilst we could get away with cmpxchg64_release below, this
+ * doesn't have any ordering semantics when !CONFIG_SMP.
+ */
+ dma_wmb();
+
+ old = cmpxchg_relaxed(ptep, curr, new);
+ __arm_v7s_pte_sync(ptep, 1, cfg);
+
+ return old;
+}
+
+static int __arm_v7s_map(struct arm_v7s_io_pgtable *data, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot,
+ int lvl, arm_v7s_iopte *ptep, gfp_t gfp)
+{
+ struct io_pgtable_cfg *cfg = &data->iop.cfg;
+ arm_v7s_iopte pte, *cptep;
+ int num_entries = size >> ARM_V7S_LVL_SHIFT(lvl);
+
+ /* Find our entry at the current level */
+ ptep += ARM_V7S_LVL_IDX(iova, lvl);
+
+ /* If we can install a leaf entry at this level, then do so */
+ if (num_entries)
+ return arm_v7s_init_pte(data, iova, paddr, prot,
+ lvl, num_entries, ptep);
+
+ /* We can't allocate tables at the final level */
+ if (WARN_ON(lvl == 2))
+ return -EINVAL;
+
+ /* Grab a pointer to the next level */
+ pte = READ_ONCE(*ptep);
+ if (!pte) {
+ cptep = __arm_v7s_alloc_table(lvl + 1, gfp, data);
+ if (!cptep)
+ return -ENOMEM;
+
+ pte = arm_v7s_install_table(cptep, ptep, 0, cfg);
+ if (pte)
+ __arm_v7s_free_table(cptep, lvl + 1, data);
+ } else {
+ /* We've no easy way of knowing if it's synced yet, so... */
+ __arm_v7s_pte_sync(ptep, 1, cfg);
+ }
+
+ if (ARM_V7S_PTE_IS_TABLE(pte, lvl)) {
+ cptep = iopte_deref(pte, lvl, data);
+ } else if (pte) {
+ /* We require an unmap first */
+ WARN_ON(!selftest_running);
+ return -EEXIST;
+ }
+
+ /* Rinse, repeat */
+ return __arm_v7s_map(data, iova, paddr, size, prot, lvl + 1, cptep, gfp);
+}
+
+static int arm_v7s_map(struct io_pgtable_ops *ops, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+ struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops);
+ struct io_pgtable *iop = &data->iop;
+ int ret;
+
+ /* If no access, then nothing to do */
+ if (!(prot & (IOMMU_READ | IOMMU_WRITE)))
+ return 0;
+
+ if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias) ||
+ paddr >= (1ULL << data->iop.cfg.oas)))
+ return -ERANGE;
+
+ ret = __arm_v7s_map(data, iova, paddr, size, prot, 1, data->pgd, gfp);
+ /*
+ * Synchronise all PTE updates for the new mapping before there's
+ * a chance for anything to kick off a table walk for the new iova.
+ */
+ if (iop->cfg.quirks & IO_PGTABLE_QUIRK_TLBI_ON_MAP) {
+ io_pgtable_tlb_flush_walk(iop, iova, size,
+ ARM_V7S_BLOCK_SIZE(2));
+ } else {
+ wmb();
+ }
+
+ return ret;
+}
+
+static void arm_v7s_free_pgtable(struct io_pgtable *iop)
+{
+ struct arm_v7s_io_pgtable *data = io_pgtable_to_data(iop);
+ int i;
+
+ for (i = 0; i < ARM_V7S_PTES_PER_LVL(1); i++) {
+ arm_v7s_iopte pte = data->pgd[i];
+
+ if (ARM_V7S_PTE_IS_TABLE(pte, 1))
+ __arm_v7s_free_table(iopte_deref(pte, 1, data),
+ 2, data);
+ }
+ __arm_v7s_free_table(data->pgd, 1, data);
+ kmem_cache_destroy(data->l2_tables);
+ kfree(data);
+}
+
+static arm_v7s_iopte arm_v7s_split_cont(struct arm_v7s_io_pgtable *data,
+ unsigned long iova, int idx, int lvl,
+ arm_v7s_iopte *ptep)
+{
+ struct io_pgtable *iop = &data->iop;
+ arm_v7s_iopte pte;
+ size_t size = ARM_V7S_BLOCK_SIZE(lvl);
+ int i;
+
+ /* Check that we didn't lose a race to get the lock */
+ pte = *ptep;
+ if (!arm_v7s_pte_is_cont(pte, lvl))
+ return pte;
+
+ ptep -= idx & (ARM_V7S_CONT_PAGES - 1);
+ pte = arm_v7s_cont_to_pte(pte, lvl);
+ for (i = 0; i < ARM_V7S_CONT_PAGES; i++)
+ ptep[i] = pte + i * size;
+
+ __arm_v7s_pte_sync(ptep, ARM_V7S_CONT_PAGES, &iop->cfg);
+
+ size *= ARM_V7S_CONT_PAGES;
+ io_pgtable_tlb_flush_leaf(iop, iova, size, size);
+ return pte;
+}
+
+static size_t arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data,
+ struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t size,
+ arm_v7s_iopte blk_pte,
+ arm_v7s_iopte *ptep)
+{
+ struct io_pgtable_cfg *cfg = &data->iop.cfg;
+ arm_v7s_iopte pte, *tablep;
+ int i, unmap_idx, num_entries, num_ptes;
+
+ tablep = __arm_v7s_alloc_table(2, GFP_ATOMIC, data);
+ if (!tablep)
+ return 0; /* Bytes unmapped */
+
+ num_ptes = ARM_V7S_PTES_PER_LVL(2);
+ num_entries = size >> ARM_V7S_LVL_SHIFT(2);
+ unmap_idx = ARM_V7S_LVL_IDX(iova, 2);
+
+ pte = arm_v7s_prot_to_pte(arm_v7s_pte_to_prot(blk_pte, 1), 2, cfg);
+ if (num_entries > 1)
+ pte = arm_v7s_pte_to_cont(pte, 2);
+
+ for (i = 0; i < num_ptes; i += num_entries, pte += size) {
+ /* Unmap! */
+ if (i == unmap_idx)
+ continue;
+
+ __arm_v7s_set_pte(&tablep[i], pte, num_entries, cfg);
+ }
+
+ pte = arm_v7s_install_table(tablep, ptep, blk_pte, cfg);
+ if (pte != blk_pte) {
+ __arm_v7s_free_table(tablep, 2, data);
+
+ if (!ARM_V7S_PTE_IS_TABLE(pte, 1))
+ return 0;
+
+ tablep = iopte_deref(pte, 1, data);
+ return __arm_v7s_unmap(data, gather, iova, size, 2, tablep);
+ }
+
+ io_pgtable_tlb_add_page(&data->iop, gather, iova, size);
+ return size;
+}
+
+static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data,
+ struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t size, int lvl,
+ arm_v7s_iopte *ptep)
+{
+ arm_v7s_iopte pte[ARM_V7S_CONT_PAGES];
+ struct io_pgtable *iop = &data->iop;
+ int idx, i = 0, num_entries = size >> ARM_V7S_LVL_SHIFT(lvl);
+
+ /* Something went horribly wrong and we ran out of page table */
+ if (WARN_ON(lvl > 2))
+ return 0;
+
+ idx = ARM_V7S_LVL_IDX(iova, lvl);
+ ptep += idx;
+ do {
+ pte[i] = READ_ONCE(ptep[i]);
+ if (WARN_ON(!ARM_V7S_PTE_IS_VALID(pte[i])))
+ return 0;
+ } while (++i < num_entries);
+
+ /*
+ * If we've hit a contiguous 'large page' entry at this level, it
+ * needs splitting first, unless we're unmapping the whole lot.
+ *
+ * For splitting, we can't rewrite 16 PTEs atomically, and since we
+ * can't necessarily assume TEX remap we don't have a software bit to
+ * mark live entries being split. In practice (i.e. DMA API code), we
+ * will never be splitting large pages anyway, so just wrap this edge
+ * case in a lock for the sake of correctness and be done with it.
+ */
+ if (num_entries <= 1 && arm_v7s_pte_is_cont(pte[0], lvl)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&data->split_lock, flags);
+ pte[0] = arm_v7s_split_cont(data, iova, idx, lvl, ptep);
+ spin_unlock_irqrestore(&data->split_lock, flags);
+ }
+
+ /* If the size matches this level, we're in the right place */
+ if (num_entries) {
+ size_t blk_size = ARM_V7S_BLOCK_SIZE(lvl);
+
+ __arm_v7s_set_pte(ptep, 0, num_entries, &iop->cfg);
+
+ for (i = 0; i < num_entries; i++) {
+ if (ARM_V7S_PTE_IS_TABLE(pte[i], lvl)) {
+ /* Also flush any partial walks */
+ io_pgtable_tlb_flush_walk(iop, iova, blk_size,
+ ARM_V7S_BLOCK_SIZE(lvl + 1));
+ ptep = iopte_deref(pte[i], lvl, data);
+ __arm_v7s_free_table(ptep, lvl + 1, data);
+ } else if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) {
+ /*
+ * Order the PTE update against queueing the IOVA, to
+ * guarantee that a flush callback from a different CPU
+ * has observed it before the TLBIALL can be issued.
+ */
+ smp_wmb();
+ } else {
+ io_pgtable_tlb_add_page(iop, gather, iova, blk_size);
+ }
+ iova += blk_size;
+ }
+ return size;
+ } else if (lvl == 1 && !ARM_V7S_PTE_IS_TABLE(pte[0], lvl)) {
+ /*
+ * Insert a table at the next level to map the old region,
+ * minus the part we want to unmap
+ */
+ return arm_v7s_split_blk_unmap(data, gather, iova, size, pte[0],
+ ptep);
+ }
+
+ /* Keep on walkin' */
+ ptep = iopte_deref(pte[0], lvl, data);
+ return __arm_v7s_unmap(data, gather, iova, size, lvl + 1, ptep);
+}
+
+static size_t arm_v7s_unmap(struct io_pgtable_ops *ops, unsigned long iova,
+ size_t size, struct iommu_iotlb_gather *gather)
+{
+ struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops);
+
+ if (WARN_ON(upper_32_bits(iova)))
+ return 0;
+
+ return __arm_v7s_unmap(data, gather, iova, size, 1, data->pgd);
+}
+
+static phys_addr_t arm_v7s_iova_to_phys(struct io_pgtable_ops *ops,
+ unsigned long iova)
+{
+ struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops);
+ arm_v7s_iopte *ptep = data->pgd, pte;
+ int lvl = 0;
+ u32 mask;
+
+ do {
+ ptep += ARM_V7S_LVL_IDX(iova, ++lvl);
+ pte = READ_ONCE(*ptep);
+ ptep = iopte_deref(pte, lvl, data);
+ } while (ARM_V7S_PTE_IS_TABLE(pte, lvl));
+
+ if (!ARM_V7S_PTE_IS_VALID(pte))
+ return 0;
+
+ mask = ARM_V7S_LVL_MASK(lvl);
+ if (arm_v7s_pte_is_cont(pte, lvl))
+ mask *= ARM_V7S_CONT_PAGES;
+ return iopte_to_paddr(pte, lvl, &data->iop.cfg) | (iova & ~mask);
+}
+
+static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg,
+ void *cookie)
+{
+ struct arm_v7s_io_pgtable *data;
+
+ if (cfg->ias > ARM_V7S_ADDR_BITS)
+ return NULL;
+
+ if (cfg->oas > (arm_v7s_is_mtk_enabled(cfg) ? 34 : ARM_V7S_ADDR_BITS))
+ return NULL;
+
+ if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
+ IO_PGTABLE_QUIRK_NO_PERMS |
+ IO_PGTABLE_QUIRK_TLBI_ON_MAP |
+ IO_PGTABLE_QUIRK_ARM_MTK_EXT |
+ IO_PGTABLE_QUIRK_NON_STRICT))
+ return NULL;
+
+ /* If ARM_MTK_4GB is enabled, the NO_PERMS is also expected. */
+ if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_MTK_EXT &&
+ !(cfg->quirks & IO_PGTABLE_QUIRK_NO_PERMS))
+ return NULL;
+
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return NULL;
+
+ spin_lock_init(&data->split_lock);
+ data->l2_tables = kmem_cache_create("io-pgtable_armv7s_l2",
+ ARM_V7S_TABLE_SIZE(2),
+ ARM_V7S_TABLE_SIZE(2),
+ ARM_V7S_TABLE_SLAB_FLAGS, NULL);
+ if (!data->l2_tables)
+ goto out_free_data;
+
+ data->iop.ops = (struct io_pgtable_ops) {
+ .map = arm_v7s_map,
+ .unmap = arm_v7s_unmap,
+ .iova_to_phys = arm_v7s_iova_to_phys,
+ };
+
+ /* We have to do this early for __arm_v7s_alloc_table to work... */
+ data->iop.cfg = *cfg;
+
+ /*
+ * Unless the IOMMU driver indicates supersection support by
+ * having SZ_16M set in the initial bitmap, they won't be used.
+ */
+ cfg->pgsize_bitmap &= SZ_4K | SZ_64K | SZ_1M | SZ_16M;
+
+ /* TCR: T0SZ=0, EAE=0 (if applicable) */
+ cfg->arm_v7s_cfg.tcr = 0;
+
+ /*
+ * TEX remap: the indices used map to the closest equivalent types
+ * under the non-TEX-remap interpretation of those attribute bits,
+ * excepting various implementation-defined aspects of shareability.
+ */
+ cfg->arm_v7s_cfg.prrr = ARM_V7S_PRRR_TR(1, ARM_V7S_PRRR_TYPE_DEVICE) |
+ ARM_V7S_PRRR_TR(4, ARM_V7S_PRRR_TYPE_NORMAL) |
+ ARM_V7S_PRRR_TR(7, ARM_V7S_PRRR_TYPE_NORMAL) |
+ ARM_V7S_PRRR_DS0 | ARM_V7S_PRRR_DS1 |
+ ARM_V7S_PRRR_NS1 | ARM_V7S_PRRR_NOS(7);
+ cfg->arm_v7s_cfg.nmrr = ARM_V7S_NMRR_IR(7, ARM_V7S_RGN_WBWA) |
+ ARM_V7S_NMRR_OR(7, ARM_V7S_RGN_WBWA);
+
+ /* Looking good; allocate a pgd */
+ data->pgd = __arm_v7s_alloc_table(1, GFP_KERNEL, data);
+ if (!data->pgd)
+ goto out_free_data;
+
+ /* Ensure the empty pgd is visible before any actual TTBR write */
+ wmb();
+
+ /* TTBR */
+ cfg->arm_v7s_cfg.ttbr = virt_to_phys(data->pgd) | ARM_V7S_TTBR_S |
+ (cfg->coherent_walk ? (ARM_V7S_TTBR_NOS |
+ ARM_V7S_TTBR_IRGN_ATTR(ARM_V7S_RGN_WBWA) |
+ ARM_V7S_TTBR_ORGN_ATTR(ARM_V7S_RGN_WBWA)) :
+ (ARM_V7S_TTBR_IRGN_ATTR(ARM_V7S_RGN_NC) |
+ ARM_V7S_TTBR_ORGN_ATTR(ARM_V7S_RGN_NC)));
+ return &data->iop;
+
+out_free_data:
+ kmem_cache_destroy(data->l2_tables);
+ kfree(data);
+ return NULL;
+}
+
+struct io_pgtable_init_fns io_pgtable_arm_v7s_init_fns = {
+ .alloc = arm_v7s_alloc_pgtable,
+ .free = arm_v7s_free_pgtable,
+};
+
+#ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S_SELFTEST
+
+static struct io_pgtable_cfg *cfg_cookie __initdata;
+
+static void __init dummy_tlb_flush_all(void *cookie)
+{
+ WARN_ON(cookie != cfg_cookie);
+}
+
+static void __init dummy_tlb_flush(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ WARN_ON(cookie != cfg_cookie);
+ WARN_ON(!(size & cfg_cookie->pgsize_bitmap));
+}
+
+static void __init dummy_tlb_add_page(struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t granule,
+ void *cookie)
+{
+ dummy_tlb_flush(iova, granule, granule, cookie);
+}
+
+static const struct iommu_flush_ops dummy_tlb_ops __initconst = {
+ .tlb_flush_all = dummy_tlb_flush_all,
+ .tlb_flush_walk = dummy_tlb_flush,
+ .tlb_flush_leaf = dummy_tlb_flush,
+ .tlb_add_page = dummy_tlb_add_page,
+};
+
+#define __FAIL(ops) ({ \
+ WARN(1, "selftest: test failed\n"); \
+ selftest_running = false; \
+ -EFAULT; \
+})
+
+static int __init arm_v7s_do_selftests(void)
+{
+ struct io_pgtable_ops *ops;
+ struct io_pgtable_cfg cfg = {
+ .tlb = &dummy_tlb_ops,
+ .oas = 32,
+ .ias = 32,
+ .coherent_walk = true,
+ .quirks = IO_PGTABLE_QUIRK_ARM_NS,
+ .pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M,
+ };
+ unsigned int iova, size, iova_start;
+ unsigned int i, loopnr = 0;
+
+ selftest_running = true;
+
+ cfg_cookie = &cfg;
+
+ ops = alloc_io_pgtable_ops(ARM_V7S, &cfg, &cfg);
+ if (!ops) {
+ pr_err("selftest: failed to allocate io pgtable ops\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Initial sanity checks.
+ * Empty page tables shouldn't provide any translations.
+ */
+ if (ops->iova_to_phys(ops, 42))
+ return __FAIL(ops);
+
+ if (ops->iova_to_phys(ops, SZ_1G + 42))
+ return __FAIL(ops);
+
+ if (ops->iova_to_phys(ops, SZ_2G + 42))
+ return __FAIL(ops);
+
+ /*
+ * Distinct mappings of different granule sizes.
+ */
+ iova = 0;
+ for_each_set_bit(i, &cfg.pgsize_bitmap, BITS_PER_LONG) {
+ size = 1UL << i;
+ if (ops->map(ops, iova, iova, size, IOMMU_READ |
+ IOMMU_WRITE |
+ IOMMU_NOEXEC |
+ IOMMU_CACHE, GFP_KERNEL))
+ return __FAIL(ops);
+
+ /* Overlapping mappings */
+ if (!ops->map(ops, iova, iova + size, size,
+ IOMMU_READ | IOMMU_NOEXEC, GFP_KERNEL))
+ return __FAIL(ops);
+
+ if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+ return __FAIL(ops);
+
+ iova += SZ_16M;
+ loopnr++;
+ }
+
+ /* Partial unmap */
+ i = 1;
+ size = 1UL << __ffs(cfg.pgsize_bitmap);
+ while (i < loopnr) {
+ iova_start = i * SZ_16M;
+ if (ops->unmap(ops, iova_start + size, size, NULL) != size)
+ return __FAIL(ops);
+
+ /* Remap of partial unmap */
+ if (ops->map(ops, iova_start + size, size, size, IOMMU_READ, GFP_KERNEL))
+ return __FAIL(ops);
+
+ if (ops->iova_to_phys(ops, iova_start + size + 42)
+ != (size + 42))
+ return __FAIL(ops);
+ i++;
+ }
+
+ /* Full unmap */
+ iova = 0;
+ for_each_set_bit(i, &cfg.pgsize_bitmap, BITS_PER_LONG) {
+ size = 1UL << i;
+
+ if (ops->unmap(ops, iova, size, NULL) != size)
+ return __FAIL(ops);
+
+ if (ops->iova_to_phys(ops, iova + 42))
+ return __FAIL(ops);
+
+ /* Remap full block */
+ if (ops->map(ops, iova, iova, size, IOMMU_WRITE, GFP_KERNEL))
+ return __FAIL(ops);
+
+ if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+ return __FAIL(ops);
+
+ iova += SZ_16M;
+ }
+
+ free_io_pgtable_ops(ops);
+
+ selftest_running = false;
+
+ pr_info("self test ok\n");
+ return 0;
+}
+subsys_initcall(arm_v7s_do_selftests);
+#endif
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
new file mode 100644
index 000000000..e1cd31c0e
--- /dev/null
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -0,0 +1,1250 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CPU-agnostic ARM page table allocator.
+ *
+ * Copyright (C) 2014 ARM Limited
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ */
+
+#define pr_fmt(fmt) "arm-lpae io-pgtable: " fmt
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/io-pgtable.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/dma-mapping.h>
+
+#include <asm/barrier.h>
+
+#include "io-pgtable-arm.h"
+
+#define ARM_LPAE_MAX_ADDR_BITS 52
+#define ARM_LPAE_S2_MAX_CONCAT_PAGES 16
+#define ARM_LPAE_MAX_LEVELS 4
+
+/* Struct accessors */
+#define io_pgtable_to_data(x) \
+ container_of((x), struct arm_lpae_io_pgtable, iop)
+
+#define io_pgtable_ops_to_data(x) \
+ io_pgtable_to_data(io_pgtable_ops_to_pgtable(x))
+
+/*
+ * Calculate the right shift amount to get to the portion describing level l
+ * in a virtual address mapped by the pagetable in d.
+ */
+#define ARM_LPAE_LVL_SHIFT(l,d) \
+ (((ARM_LPAE_MAX_LEVELS - (l)) * (d)->bits_per_level) + \
+ ilog2(sizeof(arm_lpae_iopte)))
+
+#define ARM_LPAE_GRANULE(d) \
+ (sizeof(arm_lpae_iopte) << (d)->bits_per_level)
+#define ARM_LPAE_PGD_SIZE(d) \
+ (sizeof(arm_lpae_iopte) << (d)->pgd_bits)
+
+/*
+ * Calculate the index at level l used to map virtual address a using the
+ * pagetable in d.
+ */
+#define ARM_LPAE_PGD_IDX(l,d) \
+ ((l) == (d)->start_level ? (d)->pgd_bits - (d)->bits_per_level : 0)
+
+#define ARM_LPAE_LVL_IDX(a,l,d) \
+ (((u64)(a) >> ARM_LPAE_LVL_SHIFT(l,d)) & \
+ ((1 << ((d)->bits_per_level + ARM_LPAE_PGD_IDX(l,d))) - 1))
+
+/* Calculate the block/page mapping size at level l for pagetable in d. */
+#define ARM_LPAE_BLOCK_SIZE(l,d) (1ULL << ARM_LPAE_LVL_SHIFT(l,d))
+
+/* Page table bits */
+#define ARM_LPAE_PTE_TYPE_SHIFT 0
+#define ARM_LPAE_PTE_TYPE_MASK 0x3
+
+#define ARM_LPAE_PTE_TYPE_BLOCK 1
+#define ARM_LPAE_PTE_TYPE_TABLE 3
+#define ARM_LPAE_PTE_TYPE_PAGE 3
+
+#define ARM_LPAE_PTE_ADDR_MASK GENMASK_ULL(47,12)
+
+#define ARM_LPAE_PTE_NSTABLE (((arm_lpae_iopte)1) << 63)
+#define ARM_LPAE_PTE_XN (((arm_lpae_iopte)3) << 53)
+#define ARM_LPAE_PTE_AF (((arm_lpae_iopte)1) << 10)
+#define ARM_LPAE_PTE_SH_NS (((arm_lpae_iopte)0) << 8)
+#define ARM_LPAE_PTE_SH_OS (((arm_lpae_iopte)2) << 8)
+#define ARM_LPAE_PTE_SH_IS (((arm_lpae_iopte)3) << 8)
+#define ARM_LPAE_PTE_NS (((arm_lpae_iopte)1) << 5)
+#define ARM_LPAE_PTE_VALID (((arm_lpae_iopte)1) << 0)
+
+#define ARM_LPAE_PTE_ATTR_LO_MASK (((arm_lpae_iopte)0x3ff) << 2)
+/* Ignore the contiguous bit for block splitting */
+#define ARM_LPAE_PTE_ATTR_HI_MASK (((arm_lpae_iopte)6) << 52)
+#define ARM_LPAE_PTE_ATTR_MASK (ARM_LPAE_PTE_ATTR_LO_MASK | \
+ ARM_LPAE_PTE_ATTR_HI_MASK)
+/* Software bit for solving coherency races */
+#define ARM_LPAE_PTE_SW_SYNC (((arm_lpae_iopte)1) << 55)
+
+/* Stage-1 PTE */
+#define ARM_LPAE_PTE_AP_UNPRIV (((arm_lpae_iopte)1) << 6)
+#define ARM_LPAE_PTE_AP_RDONLY (((arm_lpae_iopte)2) << 6)
+#define ARM_LPAE_PTE_ATTRINDX_SHIFT 2
+#define ARM_LPAE_PTE_nG (((arm_lpae_iopte)1) << 11)
+
+/* Stage-2 PTE */
+#define ARM_LPAE_PTE_HAP_FAULT (((arm_lpae_iopte)0) << 6)
+#define ARM_LPAE_PTE_HAP_READ (((arm_lpae_iopte)1) << 6)
+#define ARM_LPAE_PTE_HAP_WRITE (((arm_lpae_iopte)2) << 6)
+#define ARM_LPAE_PTE_MEMATTR_OIWB (((arm_lpae_iopte)0xf) << 2)
+#define ARM_LPAE_PTE_MEMATTR_NC (((arm_lpae_iopte)0x5) << 2)
+#define ARM_LPAE_PTE_MEMATTR_DEV (((arm_lpae_iopte)0x1) << 2)
+
+/* Register bits */
+#define ARM_LPAE_VTCR_SL0_MASK 0x3
+
+#define ARM_LPAE_TCR_T0SZ_SHIFT 0
+
+#define ARM_LPAE_VTCR_PS_SHIFT 16
+#define ARM_LPAE_VTCR_PS_MASK 0x7
+
+#define ARM_LPAE_MAIR_ATTR_SHIFT(n) ((n) << 3)
+#define ARM_LPAE_MAIR_ATTR_MASK 0xff
+#define ARM_LPAE_MAIR_ATTR_DEVICE 0x04
+#define ARM_LPAE_MAIR_ATTR_NC 0x44
+#define ARM_LPAE_MAIR_ATTR_INC_OWBRWA 0xf4
+#define ARM_LPAE_MAIR_ATTR_WBRWA 0xff
+#define ARM_LPAE_MAIR_ATTR_IDX_NC 0
+#define ARM_LPAE_MAIR_ATTR_IDX_CACHE 1
+#define ARM_LPAE_MAIR_ATTR_IDX_DEV 2
+#define ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE 3
+
+#define ARM_MALI_LPAE_TTBR_ADRMODE_TABLE (3u << 0)
+#define ARM_MALI_LPAE_TTBR_READ_INNER BIT(2)
+#define ARM_MALI_LPAE_TTBR_SHARE_OUTER BIT(4)
+
+#define ARM_MALI_LPAE_MEMATTR_IMP_DEF 0x88ULL
+#define ARM_MALI_LPAE_MEMATTR_WRITE_ALLOC 0x8DULL
+
+/* IOPTE accessors */
+#define iopte_deref(pte,d) __va(iopte_to_paddr(pte, d))
+
+#define iopte_type(pte,l) \
+ (((pte) >> ARM_LPAE_PTE_TYPE_SHIFT) & ARM_LPAE_PTE_TYPE_MASK)
+
+#define iopte_prot(pte) ((pte) & ARM_LPAE_PTE_ATTR_MASK)
+
+struct arm_lpae_io_pgtable {
+ struct io_pgtable iop;
+
+ int pgd_bits;
+ int start_level;
+ int bits_per_level;
+
+ void *pgd;
+};
+
+typedef u64 arm_lpae_iopte;
+
+static inline bool iopte_leaf(arm_lpae_iopte pte, int lvl,
+ enum io_pgtable_fmt fmt)
+{
+ if (lvl == (ARM_LPAE_MAX_LEVELS - 1) && fmt != ARM_MALI_LPAE)
+ return iopte_type(pte, lvl) == ARM_LPAE_PTE_TYPE_PAGE;
+
+ return iopte_type(pte, lvl) == ARM_LPAE_PTE_TYPE_BLOCK;
+}
+
+static arm_lpae_iopte paddr_to_iopte(phys_addr_t paddr,
+ struct arm_lpae_io_pgtable *data)
+{
+ arm_lpae_iopte pte = paddr;
+
+ /* Of the bits which overlap, either 51:48 or 15:12 are always RES0 */
+ return (pte | (pte >> (48 - 12))) & ARM_LPAE_PTE_ADDR_MASK;
+}
+
+static phys_addr_t iopte_to_paddr(arm_lpae_iopte pte,
+ struct arm_lpae_io_pgtable *data)
+{
+ u64 paddr = pte & ARM_LPAE_PTE_ADDR_MASK;
+
+ if (ARM_LPAE_GRANULE(data) < SZ_64K)
+ return paddr;
+
+ /* Rotate the packed high-order bits back to the top */
+ return (paddr | (paddr << (48 - 12))) & (ARM_LPAE_PTE_ADDR_MASK << 4);
+}
+
+static bool selftest_running = false;
+
+static dma_addr_t __arm_lpae_dma_addr(void *pages)
+{
+ return (dma_addr_t)virt_to_phys(pages);
+}
+
+static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp,
+ struct io_pgtable_cfg *cfg)
+{
+ struct device *dev = cfg->iommu_dev;
+ int order = get_order(size);
+ struct page *p;
+ dma_addr_t dma;
+ void *pages;
+
+ VM_BUG_ON((gfp & __GFP_HIGHMEM));
+ p = alloc_pages_node(dev ? dev_to_node(dev) : NUMA_NO_NODE,
+ gfp | __GFP_ZERO, order);
+ if (!p)
+ return NULL;
+
+ pages = page_address(p);
+ if (!cfg->coherent_walk) {
+ dma = dma_map_single(dev, pages, size, DMA_TO_DEVICE);
+ if (dma_mapping_error(dev, dma))
+ goto out_free;
+ /*
+ * We depend on the IOMMU being able to work with any physical
+ * address directly, so if the DMA layer suggests otherwise by
+ * translating or truncating them, that bodes very badly...
+ */
+ if (dma != virt_to_phys(pages))
+ goto out_unmap;
+ }
+
+ return pages;
+
+out_unmap:
+ dev_err(dev, "Cannot accommodate DMA translation for IOMMU page tables\n");
+ dma_unmap_single(dev, dma, size, DMA_TO_DEVICE);
+out_free:
+ __free_pages(p, order);
+ return NULL;
+}
+
+static void __arm_lpae_free_pages(void *pages, size_t size,
+ struct io_pgtable_cfg *cfg)
+{
+ if (!cfg->coherent_walk)
+ dma_unmap_single(cfg->iommu_dev, __arm_lpae_dma_addr(pages),
+ size, DMA_TO_DEVICE);
+ free_pages((unsigned long)pages, get_order(size));
+}
+
+static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep,
+ struct io_pgtable_cfg *cfg)
+{
+ dma_sync_single_for_device(cfg->iommu_dev, __arm_lpae_dma_addr(ptep),
+ sizeof(*ptep), DMA_TO_DEVICE);
+}
+
+static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte,
+ struct io_pgtable_cfg *cfg)
+{
+ *ptep = pte;
+
+ if (!cfg->coherent_walk)
+ __arm_lpae_sync_pte(ptep, cfg);
+}
+
+static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
+ struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t size, int lvl,
+ arm_lpae_iopte *ptep);
+
+static void __arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
+ phys_addr_t paddr, arm_lpae_iopte prot,
+ int lvl, arm_lpae_iopte *ptep)
+{
+ arm_lpae_iopte pte = prot;
+
+ if (data->iop.fmt != ARM_MALI_LPAE && lvl == ARM_LPAE_MAX_LEVELS - 1)
+ pte |= ARM_LPAE_PTE_TYPE_PAGE;
+ else
+ pte |= ARM_LPAE_PTE_TYPE_BLOCK;
+
+ pte |= paddr_to_iopte(paddr, data);
+
+ __arm_lpae_set_pte(ptep, pte, &data->iop.cfg);
+}
+
+static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
+ unsigned long iova, phys_addr_t paddr,
+ arm_lpae_iopte prot, int lvl,
+ arm_lpae_iopte *ptep)
+{
+ arm_lpae_iopte pte = *ptep;
+
+ if (iopte_leaf(pte, lvl, data->iop.fmt)) {
+ /* We require an unmap first */
+ WARN_ON(!selftest_running);
+ return -EEXIST;
+ } else if (iopte_type(pte, lvl) == ARM_LPAE_PTE_TYPE_TABLE) {
+ /*
+ * We need to unmap and free the old table before
+ * overwriting it with a block entry.
+ */
+ arm_lpae_iopte *tblp;
+ size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
+
+ tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);
+ if (__arm_lpae_unmap(data, NULL, iova, sz, lvl, tblp) != sz) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+ }
+
+ __arm_lpae_init_pte(data, paddr, prot, lvl, ptep);
+ return 0;
+}
+
+static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte *table,
+ arm_lpae_iopte *ptep,
+ arm_lpae_iopte curr,
+ struct arm_lpae_io_pgtable *data)
+{
+ arm_lpae_iopte old, new;
+ struct io_pgtable_cfg *cfg = &data->iop.cfg;
+
+ new = paddr_to_iopte(__pa(table), data) | ARM_LPAE_PTE_TYPE_TABLE;
+ if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)
+ new |= ARM_LPAE_PTE_NSTABLE;
+
+ /*
+ * Ensure the table itself is visible before its PTE can be.
+ * Whilst we could get away with cmpxchg64_release below, this
+ * doesn't have any ordering semantics when !CONFIG_SMP.
+ */
+ dma_wmb();
+
+ old = cmpxchg64_relaxed(ptep, curr, new);
+
+ if (cfg->coherent_walk || (old & ARM_LPAE_PTE_SW_SYNC))
+ return old;
+
+ /* Even if it's not ours, there's no point waiting; just kick it */
+ __arm_lpae_sync_pte(ptep, cfg);
+ if (old == curr)
+ WRITE_ONCE(*ptep, new | ARM_LPAE_PTE_SW_SYNC);
+
+ return old;
+}
+
+static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
+ phys_addr_t paddr, size_t size, arm_lpae_iopte prot,
+ int lvl, arm_lpae_iopte *ptep, gfp_t gfp)
+{
+ arm_lpae_iopte *cptep, pte;
+ size_t block_size = ARM_LPAE_BLOCK_SIZE(lvl, data);
+ size_t tblsz = ARM_LPAE_GRANULE(data);
+ struct io_pgtable_cfg *cfg = &data->iop.cfg;
+
+ /* Find our entry at the current level */
+ ptep += ARM_LPAE_LVL_IDX(iova, lvl, data);
+
+ /* If we can install a leaf entry at this level, then do so */
+ if (size == block_size)
+ return arm_lpae_init_pte(data, iova, paddr, prot, lvl, ptep);
+
+ /* We can't allocate tables at the final level */
+ if (WARN_ON(lvl >= ARM_LPAE_MAX_LEVELS - 1))
+ return -EINVAL;
+
+ /* Grab a pointer to the next level */
+ pte = READ_ONCE(*ptep);
+ if (!pte) {
+ cptep = __arm_lpae_alloc_pages(tblsz, gfp, cfg);
+ if (!cptep)
+ return -ENOMEM;
+
+ pte = arm_lpae_install_table(cptep, ptep, 0, data);
+ if (pte)
+ __arm_lpae_free_pages(cptep, tblsz, cfg);
+ } else if (!cfg->coherent_walk && !(pte & ARM_LPAE_PTE_SW_SYNC)) {
+ __arm_lpae_sync_pte(ptep, cfg);
+ }
+
+ if (pte && !iopte_leaf(pte, lvl, data->iop.fmt)) {
+ cptep = iopte_deref(pte, data);
+ } else if (pte) {
+ /* We require an unmap first */
+ WARN_ON(!selftest_running);
+ return -EEXIST;
+ }
+
+ /* Rinse, repeat */
+ return __arm_lpae_map(data, iova, paddr, size, prot, lvl + 1, cptep, gfp);
+}
+
+static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
+ int prot)
+{
+ arm_lpae_iopte pte;
+
+ if (data->iop.fmt == ARM_64_LPAE_S1 ||
+ data->iop.fmt == ARM_32_LPAE_S1) {
+ pte = ARM_LPAE_PTE_nG;
+ if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
+ pte |= ARM_LPAE_PTE_AP_RDONLY;
+ if (!(prot & IOMMU_PRIV))
+ pte |= ARM_LPAE_PTE_AP_UNPRIV;
+ } else {
+ pte = ARM_LPAE_PTE_HAP_FAULT;
+ if (prot & IOMMU_READ)
+ pte |= ARM_LPAE_PTE_HAP_READ;
+ if (prot & IOMMU_WRITE)
+ pte |= ARM_LPAE_PTE_HAP_WRITE;
+ }
+
+ /*
+ * Note that this logic is structured to accommodate Mali LPAE
+ * having stage-1-like attributes but stage-2-like permissions.
+ */
+ if (data->iop.fmt == ARM_64_LPAE_S2 ||
+ data->iop.fmt == ARM_32_LPAE_S2) {
+ if (prot & IOMMU_MMIO)
+ pte |= ARM_LPAE_PTE_MEMATTR_DEV;
+ else if (prot & IOMMU_CACHE)
+ pte |= ARM_LPAE_PTE_MEMATTR_OIWB;
+ else
+ pte |= ARM_LPAE_PTE_MEMATTR_NC;
+ } else {
+ if (prot & IOMMU_MMIO)
+ pte |= (ARM_LPAE_MAIR_ATTR_IDX_DEV
+ << ARM_LPAE_PTE_ATTRINDX_SHIFT);
+ else if (prot & IOMMU_CACHE)
+ pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE
+ << ARM_LPAE_PTE_ATTRINDX_SHIFT);
+ }
+
+ /*
+ * Also Mali has its own notions of shareability wherein its Inner
+ * domain covers the cores within the GPU, and its Outer domain is
+ * "outside the GPU" (i.e. either the Inner or System domain in CPU
+ * terms, depending on coherency).
+ */
+ if (prot & IOMMU_CACHE && data->iop.fmt != ARM_MALI_LPAE)
+ pte |= ARM_LPAE_PTE_SH_IS;
+ else
+ pte |= ARM_LPAE_PTE_SH_OS;
+
+ if (prot & IOMMU_NOEXEC)
+ pte |= ARM_LPAE_PTE_XN;
+
+ if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
+ pte |= ARM_LPAE_PTE_NS;
+
+ if (data->iop.fmt != ARM_MALI_LPAE)
+ pte |= ARM_LPAE_PTE_AF;
+
+ return pte;
+}
+
+static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova,
+ phys_addr_t paddr, size_t size, int iommu_prot, gfp_t gfp)
+{
+ struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
+ struct io_pgtable_cfg *cfg = &data->iop.cfg;
+ arm_lpae_iopte *ptep = data->pgd;
+ int ret, lvl = data->start_level;
+ arm_lpae_iopte prot;
+ long iaext = (s64)iova >> cfg->ias;
+
+ /* If no access, then nothing to do */
+ if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
+ return 0;
+
+ if (WARN_ON(!size || (size & cfg->pgsize_bitmap) != size))
+ return -EINVAL;
+
+ if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1)
+ iaext = ~iaext;
+ if (WARN_ON(iaext || paddr >> cfg->oas))
+ return -ERANGE;
+
+ prot = arm_lpae_prot_to_pte(data, iommu_prot);
+ ret = __arm_lpae_map(data, iova, paddr, size, prot, lvl, ptep, gfp);
+ /*
+ * Synchronise all PTE updates for the new mapping before there's
+ * a chance for anything to kick off a table walk for the new iova.
+ */
+ wmb();
+
+ return ret;
+}
+
+static void __arm_lpae_free_pgtable(struct arm_lpae_io_pgtable *data, int lvl,
+ arm_lpae_iopte *ptep)
+{
+ arm_lpae_iopte *start, *end;
+ unsigned long table_size;
+
+ if (lvl == data->start_level)
+ table_size = ARM_LPAE_PGD_SIZE(data);
+ else
+ table_size = ARM_LPAE_GRANULE(data);
+
+ start = ptep;
+
+ /* Only leaf entries at the last level */
+ if (lvl == ARM_LPAE_MAX_LEVELS - 1)
+ end = ptep;
+ else
+ end = (void *)ptep + table_size;
+
+ while (ptep != end) {
+ arm_lpae_iopte pte = *ptep++;
+
+ if (!pte || iopte_leaf(pte, lvl, data->iop.fmt))
+ continue;
+
+ __arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
+ }
+
+ __arm_lpae_free_pages(start, table_size, &data->iop.cfg);
+}
+
+static void arm_lpae_free_pgtable(struct io_pgtable *iop)
+{
+ struct arm_lpae_io_pgtable *data = io_pgtable_to_data(iop);
+
+ __arm_lpae_free_pgtable(data, data->start_level, data->pgd);
+ kfree(data);
+}
+
+static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
+ struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t size,
+ arm_lpae_iopte blk_pte, int lvl,
+ arm_lpae_iopte *ptep)
+{
+ struct io_pgtable_cfg *cfg = &data->iop.cfg;
+ arm_lpae_iopte pte, *tablep;
+ phys_addr_t blk_paddr;
+ size_t tablesz = ARM_LPAE_GRANULE(data);
+ size_t split_sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
+ int i, unmap_idx = -1;
+
+ if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
+ return 0;
+
+ tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg);
+ if (!tablep)
+ return 0; /* Bytes unmapped */
+
+ if (size == split_sz)
+ unmap_idx = ARM_LPAE_LVL_IDX(iova, lvl, data);
+
+ blk_paddr = iopte_to_paddr(blk_pte, data);
+ pte = iopte_prot(blk_pte);
+
+ for (i = 0; i < tablesz / sizeof(pte); i++, blk_paddr += split_sz) {
+ /* Unmap! */
+ if (i == unmap_idx)
+ continue;
+
+ __arm_lpae_init_pte(data, blk_paddr, pte, lvl, &tablep[i]);
+ }
+
+ pte = arm_lpae_install_table(tablep, ptep, blk_pte, data);
+ if (pte != blk_pte) {
+ __arm_lpae_free_pages(tablep, tablesz, cfg);
+ /*
+ * We may race against someone unmapping another part of this
+ * block, but anything else is invalid. We can't misinterpret
+ * a page entry here since we're never at the last level.
+ */
+ if (iopte_type(pte, lvl - 1) != ARM_LPAE_PTE_TYPE_TABLE)
+ return 0;
+
+ tablep = iopte_deref(pte, data);
+ } else if (unmap_idx >= 0) {
+ io_pgtable_tlb_add_page(&data->iop, gather, iova, size);
+ return size;
+ }
+
+ return __arm_lpae_unmap(data, gather, iova, size, lvl, tablep);
+}
+
+static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
+ struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t size, int lvl,
+ arm_lpae_iopte *ptep)
+{
+ arm_lpae_iopte pte;
+ struct io_pgtable *iop = &data->iop;
+
+ /* Something went horribly wrong and we ran out of page table */
+ if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
+ return 0;
+
+ ptep += ARM_LPAE_LVL_IDX(iova, lvl, data);
+ pte = READ_ONCE(*ptep);
+ if (WARN_ON(!pte))
+ return 0;
+
+ /* If the size matches this level, we're in the right place */
+ if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) {
+ __arm_lpae_set_pte(ptep, 0, &iop->cfg);
+
+ if (!iopte_leaf(pte, lvl, iop->fmt)) {
+ /* Also flush any partial walks */
+ io_pgtable_tlb_flush_walk(iop, iova, size,
+ ARM_LPAE_GRANULE(data));
+ ptep = iopte_deref(pte, data);
+ __arm_lpae_free_pgtable(data, lvl + 1, ptep);
+ } else if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) {
+ /*
+ * Order the PTE update against queueing the IOVA, to
+ * guarantee that a flush callback from a different CPU
+ * has observed it before the TLBIALL can be issued.
+ */
+ smp_wmb();
+ } else {
+ io_pgtable_tlb_add_page(iop, gather, iova, size);
+ }
+
+ return size;
+ } else if (iopte_leaf(pte, lvl, iop->fmt)) {
+ /*
+ * Insert a table at the next level to map the old region,
+ * minus the part we want to unmap
+ */
+ return arm_lpae_split_blk_unmap(data, gather, iova, size, pte,
+ lvl + 1, ptep);
+ }
+
+ /* Keep on walkin' */
+ ptep = iopte_deref(pte, data);
+ return __arm_lpae_unmap(data, gather, iova, size, lvl + 1, ptep);
+}
+
+static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,
+ size_t size, struct iommu_iotlb_gather *gather)
+{
+ struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
+ struct io_pgtable_cfg *cfg = &data->iop.cfg;
+ arm_lpae_iopte *ptep = data->pgd;
+ long iaext = (s64)iova >> cfg->ias;
+
+ if (WARN_ON(!size || (size & cfg->pgsize_bitmap) != size))
+ return 0;
+
+ if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1)
+ iaext = ~iaext;
+ if (WARN_ON(iaext))
+ return 0;
+
+ return __arm_lpae_unmap(data, gather, iova, size, data->start_level, ptep);
+}
+
+static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,
+ unsigned long iova)
+{
+ struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
+ arm_lpae_iopte pte, *ptep = data->pgd;
+ int lvl = data->start_level;
+
+ do {
+ /* Valid IOPTE pointer? */
+ if (!ptep)
+ return 0;
+
+ /* Grab the IOPTE we're interested in */
+ ptep += ARM_LPAE_LVL_IDX(iova, lvl, data);
+ pte = READ_ONCE(*ptep);
+
+ /* Valid entry? */
+ if (!pte)
+ return 0;
+
+ /* Leaf entry? */
+ if (iopte_leaf(pte, lvl, data->iop.fmt))
+ goto found_translation;
+
+ /* Take it to the next level */
+ ptep = iopte_deref(pte, data);
+ } while (++lvl < ARM_LPAE_MAX_LEVELS);
+
+ /* Ran out of page tables to walk */
+ return 0;
+
+found_translation:
+ iova &= (ARM_LPAE_BLOCK_SIZE(lvl, data) - 1);
+ return iopte_to_paddr(pte, data) | iova;
+}
+
+static void arm_lpae_restrict_pgsizes(struct io_pgtable_cfg *cfg)
+{
+ unsigned long granule, page_sizes;
+ unsigned int max_addr_bits = 48;
+
+ /*
+ * We need to restrict the supported page sizes to match the
+ * translation regime for a particular granule. Aim to match
+ * the CPU page size if possible, otherwise prefer smaller sizes.
+ * While we're at it, restrict the block sizes to match the
+ * chosen granule.
+ */
+ if (cfg->pgsize_bitmap & PAGE_SIZE)
+ granule = PAGE_SIZE;
+ else if (cfg->pgsize_bitmap & ~PAGE_MASK)
+ granule = 1UL << __fls(cfg->pgsize_bitmap & ~PAGE_MASK);
+ else if (cfg->pgsize_bitmap & PAGE_MASK)
+ granule = 1UL << __ffs(cfg->pgsize_bitmap & PAGE_MASK);
+ else
+ granule = 0;
+
+ switch (granule) {
+ case SZ_4K:
+ page_sizes = (SZ_4K | SZ_2M | SZ_1G);
+ break;
+ case SZ_16K:
+ page_sizes = (SZ_16K | SZ_32M);
+ break;
+ case SZ_64K:
+ max_addr_bits = 52;
+ page_sizes = (SZ_64K | SZ_512M);
+ if (cfg->oas > 48)
+ page_sizes |= 1ULL << 42; /* 4TB */
+ break;
+ default:
+ page_sizes = 0;
+ }
+
+ cfg->pgsize_bitmap &= page_sizes;
+ cfg->ias = min(cfg->ias, max_addr_bits);
+ cfg->oas = min(cfg->oas, max_addr_bits);
+}
+
+static struct arm_lpae_io_pgtable *
+arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg)
+{
+ struct arm_lpae_io_pgtable *data;
+ int levels, va_bits, pg_shift;
+
+ arm_lpae_restrict_pgsizes(cfg);
+
+ if (!(cfg->pgsize_bitmap & (SZ_4K | SZ_16K | SZ_64K)))
+ return NULL;
+
+ if (cfg->ias > ARM_LPAE_MAX_ADDR_BITS)
+ return NULL;
+
+ if (cfg->oas > ARM_LPAE_MAX_ADDR_BITS)
+ return NULL;
+
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return NULL;
+
+ pg_shift = __ffs(cfg->pgsize_bitmap);
+ data->bits_per_level = pg_shift - ilog2(sizeof(arm_lpae_iopte));
+
+ va_bits = cfg->ias - pg_shift;
+ levels = DIV_ROUND_UP(va_bits, data->bits_per_level);
+ data->start_level = ARM_LPAE_MAX_LEVELS - levels;
+
+ /* Calculate the actual size of our pgd (without concatenation) */
+ data->pgd_bits = va_bits - (data->bits_per_level * (levels - 1));
+
+ data->iop.ops = (struct io_pgtable_ops) {
+ .map = arm_lpae_map,
+ .unmap = arm_lpae_unmap,
+ .iova_to_phys = arm_lpae_iova_to_phys,
+ };
+
+ return data;
+}
+
+static struct io_pgtable *
+arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
+{
+ u64 reg;
+ struct arm_lpae_io_pgtable *data;
+ typeof(&cfg->arm_lpae_s1_cfg.tcr) tcr = &cfg->arm_lpae_s1_cfg.tcr;
+ bool tg1;
+
+ if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
+ IO_PGTABLE_QUIRK_NON_STRICT |
+ IO_PGTABLE_QUIRK_ARM_TTBR1))
+ return NULL;
+
+ data = arm_lpae_alloc_pgtable(cfg);
+ if (!data)
+ return NULL;
+
+ /* TCR */
+ if (cfg->coherent_walk) {
+ tcr->sh = ARM_LPAE_TCR_SH_IS;
+ tcr->irgn = ARM_LPAE_TCR_RGN_WBWA;
+ tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
+ } else {
+ tcr->sh = ARM_LPAE_TCR_SH_OS;
+ tcr->irgn = ARM_LPAE_TCR_RGN_NC;
+ tcr->orgn = ARM_LPAE_TCR_RGN_NC;
+ }
+
+ tg1 = cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1;
+ switch (ARM_LPAE_GRANULE(data)) {
+ case SZ_4K:
+ tcr->tg = tg1 ? ARM_LPAE_TCR_TG1_4K : ARM_LPAE_TCR_TG0_4K;
+ break;
+ case SZ_16K:
+ tcr->tg = tg1 ? ARM_LPAE_TCR_TG1_16K : ARM_LPAE_TCR_TG0_16K;
+ break;
+ case SZ_64K:
+ tcr->tg = tg1 ? ARM_LPAE_TCR_TG1_64K : ARM_LPAE_TCR_TG0_64K;
+ break;
+ }
+
+ switch (cfg->oas) {
+ case 32:
+ tcr->ips = ARM_LPAE_TCR_PS_32_BIT;
+ break;
+ case 36:
+ tcr->ips = ARM_LPAE_TCR_PS_36_BIT;
+ break;
+ case 40:
+ tcr->ips = ARM_LPAE_TCR_PS_40_BIT;
+ break;
+ case 42:
+ tcr->ips = ARM_LPAE_TCR_PS_42_BIT;
+ break;
+ case 44:
+ tcr->ips = ARM_LPAE_TCR_PS_44_BIT;
+ break;
+ case 48:
+ tcr->ips = ARM_LPAE_TCR_PS_48_BIT;
+ break;
+ case 52:
+ tcr->ips = ARM_LPAE_TCR_PS_52_BIT;
+ break;
+ default:
+ goto out_free_data;
+ }
+
+ tcr->tsz = 64ULL - cfg->ias;
+
+ /* MAIRs */
+ reg = (ARM_LPAE_MAIR_ATTR_NC
+ << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_NC)) |
+ (ARM_LPAE_MAIR_ATTR_WBRWA
+ << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_CACHE)) |
+ (ARM_LPAE_MAIR_ATTR_DEVICE
+ << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV)) |
+ (ARM_LPAE_MAIR_ATTR_INC_OWBRWA
+ << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE));
+
+ cfg->arm_lpae_s1_cfg.mair = reg;
+
+ /* Looking good; allocate a pgd */
+ data->pgd = __arm_lpae_alloc_pages(ARM_LPAE_PGD_SIZE(data),
+ GFP_KERNEL, cfg);
+ if (!data->pgd)
+ goto out_free_data;
+
+ /* Ensure the empty pgd is visible before any actual TTBR write */
+ wmb();
+
+ /* TTBR */
+ cfg->arm_lpae_s1_cfg.ttbr = virt_to_phys(data->pgd);
+ return &data->iop;
+
+out_free_data:
+ kfree(data);
+ return NULL;
+}
+
+static struct io_pgtable *
+arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie)
+{
+ u64 sl;
+ struct arm_lpae_io_pgtable *data;
+ typeof(&cfg->arm_lpae_s2_cfg.vtcr) vtcr = &cfg->arm_lpae_s2_cfg.vtcr;
+
+ /* The NS quirk doesn't apply at stage 2 */
+ if (cfg->quirks & ~(IO_PGTABLE_QUIRK_NON_STRICT))
+ return NULL;
+
+ data = arm_lpae_alloc_pgtable(cfg);
+ if (!data)
+ return NULL;
+
+ /*
+ * Concatenate PGDs at level 1 if possible in order to reduce
+ * the depth of the stage-2 walk.
+ */
+ if (data->start_level == 0) {
+ unsigned long pgd_pages;
+
+ pgd_pages = ARM_LPAE_PGD_SIZE(data) / sizeof(arm_lpae_iopte);
+ if (pgd_pages <= ARM_LPAE_S2_MAX_CONCAT_PAGES) {
+ data->pgd_bits += data->bits_per_level;
+ data->start_level++;
+ }
+ }
+
+ /* VTCR */
+ if (cfg->coherent_walk) {
+ vtcr->sh = ARM_LPAE_TCR_SH_IS;
+ vtcr->irgn = ARM_LPAE_TCR_RGN_WBWA;
+ vtcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
+ } else {
+ vtcr->sh = ARM_LPAE_TCR_SH_OS;
+ vtcr->irgn = ARM_LPAE_TCR_RGN_NC;
+ vtcr->orgn = ARM_LPAE_TCR_RGN_NC;
+ }
+
+ sl = data->start_level;
+
+ switch (ARM_LPAE_GRANULE(data)) {
+ case SZ_4K:
+ vtcr->tg = ARM_LPAE_TCR_TG0_4K;
+ sl++; /* SL0 format is different for 4K granule size */
+ break;
+ case SZ_16K:
+ vtcr->tg = ARM_LPAE_TCR_TG0_16K;
+ break;
+ case SZ_64K:
+ vtcr->tg = ARM_LPAE_TCR_TG0_64K;
+ break;
+ }
+
+ switch (cfg->oas) {
+ case 32:
+ vtcr->ps = ARM_LPAE_TCR_PS_32_BIT;
+ break;
+ case 36:
+ vtcr->ps = ARM_LPAE_TCR_PS_36_BIT;
+ break;
+ case 40:
+ vtcr->ps = ARM_LPAE_TCR_PS_40_BIT;
+ break;
+ case 42:
+ vtcr->ps = ARM_LPAE_TCR_PS_42_BIT;
+ break;
+ case 44:
+ vtcr->ps = ARM_LPAE_TCR_PS_44_BIT;
+ break;
+ case 48:
+ vtcr->ps = ARM_LPAE_TCR_PS_48_BIT;
+ break;
+ case 52:
+ vtcr->ps = ARM_LPAE_TCR_PS_52_BIT;
+ break;
+ default:
+ goto out_free_data;
+ }
+
+ vtcr->tsz = 64ULL - cfg->ias;
+ vtcr->sl = ~sl & ARM_LPAE_VTCR_SL0_MASK;
+
+ /* Allocate pgd pages */
+ data->pgd = __arm_lpae_alloc_pages(ARM_LPAE_PGD_SIZE(data),
+ GFP_KERNEL, cfg);
+ if (!data->pgd)
+ goto out_free_data;
+
+ /* Ensure the empty pgd is visible before any actual TTBR write */
+ wmb();
+
+ /* VTTBR */
+ cfg->arm_lpae_s2_cfg.vttbr = virt_to_phys(data->pgd);
+ return &data->iop;
+
+out_free_data:
+ kfree(data);
+ return NULL;
+}
+
+static struct io_pgtable *
+arm_32_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
+{
+ if (cfg->ias > 32 || cfg->oas > 40)
+ return NULL;
+
+ cfg->pgsize_bitmap &= (SZ_4K | SZ_2M | SZ_1G);
+ return arm_64_lpae_alloc_pgtable_s1(cfg, cookie);
+}
+
+static struct io_pgtable *
+arm_32_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie)
+{
+ if (cfg->ias > 40 || cfg->oas > 40)
+ return NULL;
+
+ cfg->pgsize_bitmap &= (SZ_4K | SZ_2M | SZ_1G);
+ return arm_64_lpae_alloc_pgtable_s2(cfg, cookie);
+}
+
+static struct io_pgtable *
+arm_mali_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
+{
+ struct arm_lpae_io_pgtable *data;
+
+ /* No quirks for Mali (hopefully) */
+ if (cfg->quirks)
+ return NULL;
+
+ if (cfg->ias > 48 || cfg->oas > 40)
+ return NULL;
+
+ cfg->pgsize_bitmap &= (SZ_4K | SZ_2M | SZ_1G);
+
+ data = arm_lpae_alloc_pgtable(cfg);
+ if (!data)
+ return NULL;
+
+ /* Mali seems to need a full 4-level table regardless of IAS */
+ if (data->start_level > 0) {
+ data->start_level = 0;
+ data->pgd_bits = 0;
+ }
+ /*
+ * MEMATTR: Mali has no actual notion of a non-cacheable type, so the
+ * best we can do is mimic the out-of-tree driver and hope that the
+ * "implementation-defined caching policy" is good enough. Similarly,
+ * we'll use it for the sake of a valid attribute for our 'device'
+ * index, although callers should never request that in practice.
+ */
+ cfg->arm_mali_lpae_cfg.memattr =
+ (ARM_MALI_LPAE_MEMATTR_IMP_DEF
+ << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_NC)) |
+ (ARM_MALI_LPAE_MEMATTR_WRITE_ALLOC
+ << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_CACHE)) |
+ (ARM_MALI_LPAE_MEMATTR_IMP_DEF
+ << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV));
+
+ data->pgd = __arm_lpae_alloc_pages(ARM_LPAE_PGD_SIZE(data), GFP_KERNEL,
+ cfg);
+ if (!data->pgd)
+ goto out_free_data;
+
+ /* Ensure the empty pgd is visible before TRANSTAB can be written */
+ wmb();
+
+ cfg->arm_mali_lpae_cfg.transtab = virt_to_phys(data->pgd) |
+ ARM_MALI_LPAE_TTBR_READ_INNER |
+ ARM_MALI_LPAE_TTBR_ADRMODE_TABLE;
+ if (cfg->coherent_walk)
+ cfg->arm_mali_lpae_cfg.transtab |= ARM_MALI_LPAE_TTBR_SHARE_OUTER;
+
+ return &data->iop;
+
+out_free_data:
+ kfree(data);
+ return NULL;
+}
+
+struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns = {
+ .alloc = arm_64_lpae_alloc_pgtable_s1,
+ .free = arm_lpae_free_pgtable,
+};
+
+struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns = {
+ .alloc = arm_64_lpae_alloc_pgtable_s2,
+ .free = arm_lpae_free_pgtable,
+};
+
+struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns = {
+ .alloc = arm_32_lpae_alloc_pgtable_s1,
+ .free = arm_lpae_free_pgtable,
+};
+
+struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns = {
+ .alloc = arm_32_lpae_alloc_pgtable_s2,
+ .free = arm_lpae_free_pgtable,
+};
+
+struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns = {
+ .alloc = arm_mali_lpae_alloc_pgtable,
+ .free = arm_lpae_free_pgtable,
+};
+
+#ifdef CONFIG_IOMMU_IO_PGTABLE_LPAE_SELFTEST
+
+static struct io_pgtable_cfg *cfg_cookie __initdata;
+
+static void __init dummy_tlb_flush_all(void *cookie)
+{
+ WARN_ON(cookie != cfg_cookie);
+}
+
+static void __init dummy_tlb_flush(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ WARN_ON(cookie != cfg_cookie);
+ WARN_ON(!(size & cfg_cookie->pgsize_bitmap));
+}
+
+static void __init dummy_tlb_add_page(struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t granule,
+ void *cookie)
+{
+ dummy_tlb_flush(iova, granule, granule, cookie);
+}
+
+static const struct iommu_flush_ops dummy_tlb_ops __initconst = {
+ .tlb_flush_all = dummy_tlb_flush_all,
+ .tlb_flush_walk = dummy_tlb_flush,
+ .tlb_flush_leaf = dummy_tlb_flush,
+ .tlb_add_page = dummy_tlb_add_page,
+};
+
+static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops)
+{
+ struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
+ struct io_pgtable_cfg *cfg = &data->iop.cfg;
+
+ pr_err("cfg: pgsize_bitmap 0x%lx, ias %u-bit\n",
+ cfg->pgsize_bitmap, cfg->ias);
+ pr_err("data: %d levels, 0x%zx pgd_size, %u pg_shift, %u bits_per_level, pgd @ %p\n",
+ ARM_LPAE_MAX_LEVELS - data->start_level, ARM_LPAE_PGD_SIZE(data),
+ ilog2(ARM_LPAE_GRANULE(data)), data->bits_per_level, data->pgd);
+}
+
+#define __FAIL(ops, i) ({ \
+ WARN(1, "selftest: test failed for fmt idx %d\n", (i)); \
+ arm_lpae_dump_ops(ops); \
+ selftest_running = false; \
+ -EFAULT; \
+})
+
+static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg)
+{
+ static const enum io_pgtable_fmt fmts[] __initconst = {
+ ARM_64_LPAE_S1,
+ ARM_64_LPAE_S2,
+ };
+
+ int i, j;
+ unsigned long iova;
+ size_t size;
+ struct io_pgtable_ops *ops;
+
+ selftest_running = true;
+
+ for (i = 0; i < ARRAY_SIZE(fmts); ++i) {
+ cfg_cookie = cfg;
+ ops = alloc_io_pgtable_ops(fmts[i], cfg, cfg);
+ if (!ops) {
+ pr_err("selftest: failed to allocate io pgtable ops\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * Initial sanity checks.
+ * Empty page tables shouldn't provide any translations.
+ */
+ if (ops->iova_to_phys(ops, 42))
+ return __FAIL(ops, i);
+
+ if (ops->iova_to_phys(ops, SZ_1G + 42))
+ return __FAIL(ops, i);
+
+ if (ops->iova_to_phys(ops, SZ_2G + 42))
+ return __FAIL(ops, i);
+
+ /*
+ * Distinct mappings of different granule sizes.
+ */
+ iova = 0;
+ for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
+ size = 1UL << j;
+
+ if (ops->map(ops, iova, iova, size, IOMMU_READ |
+ IOMMU_WRITE |
+ IOMMU_NOEXEC |
+ IOMMU_CACHE, GFP_KERNEL))
+ return __FAIL(ops, i);
+
+ /* Overlapping mappings */
+ if (!ops->map(ops, iova, iova + size, size,
+ IOMMU_READ | IOMMU_NOEXEC, GFP_KERNEL))
+ return __FAIL(ops, i);
+
+ if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+ return __FAIL(ops, i);
+
+ iova += SZ_1G;
+ }
+
+ /* Partial unmap */
+ size = 1UL << __ffs(cfg->pgsize_bitmap);
+ if (ops->unmap(ops, SZ_1G + size, size, NULL) != size)
+ return __FAIL(ops, i);
+
+ /* Remap of partial unmap */
+ if (ops->map(ops, SZ_1G + size, size, size, IOMMU_READ, GFP_KERNEL))
+ return __FAIL(ops, i);
+
+ if (ops->iova_to_phys(ops, SZ_1G + size + 42) != (size + 42))
+ return __FAIL(ops, i);
+
+ /* Full unmap */
+ iova = 0;
+ for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
+ size = 1UL << j;
+
+ if (ops->unmap(ops, iova, size, NULL) != size)
+ return __FAIL(ops, i);
+
+ if (ops->iova_to_phys(ops, iova + 42))
+ return __FAIL(ops, i);
+
+ /* Remap full block */
+ if (ops->map(ops, iova, iova, size, IOMMU_WRITE, GFP_KERNEL))
+ return __FAIL(ops, i);
+
+ if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+ return __FAIL(ops, i);
+
+ iova += SZ_1G;
+ }
+
+ free_io_pgtable_ops(ops);
+ }
+
+ selftest_running = false;
+ return 0;
+}
+
+static int __init arm_lpae_do_selftests(void)
+{
+ static const unsigned long pgsize[] __initconst = {
+ SZ_4K | SZ_2M | SZ_1G,
+ SZ_16K | SZ_32M,
+ SZ_64K | SZ_512M,
+ };
+
+ static const unsigned int ias[] __initconst = {
+ 32, 36, 40, 42, 44, 48,
+ };
+
+ int i, j, pass = 0, fail = 0;
+ struct io_pgtable_cfg cfg = {
+ .tlb = &dummy_tlb_ops,
+ .oas = 48,
+ .coherent_walk = true,
+ };
+
+ for (i = 0; i < ARRAY_SIZE(pgsize); ++i) {
+ for (j = 0; j < ARRAY_SIZE(ias); ++j) {
+ cfg.pgsize_bitmap = pgsize[i];
+ cfg.ias = ias[j];
+ pr_info("selftest: pgsize_bitmap 0x%08lx, IAS %u\n",
+ pgsize[i], ias[j]);
+ if (arm_lpae_run_tests(&cfg))
+ fail++;
+ else
+ pass++;
+ }
+ }
+
+ pr_info("selftest: completed with %d PASS %d FAIL\n", pass, fail);
+ return fail ? -EFAULT : 0;
+}
+subsys_initcall(arm_lpae_do_selftests);
+#endif
diff --git a/drivers/iommu/io-pgtable-arm.h b/drivers/iommu/io-pgtable-arm.h
new file mode 100644
index 000000000..ba7cfdf7a
--- /dev/null
+++ b/drivers/iommu/io-pgtable-arm.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef IO_PGTABLE_ARM_H_
+#define IO_PGTABLE_ARM_H_
+
+#define ARM_LPAE_TCR_TG0_4K 0
+#define ARM_LPAE_TCR_TG0_64K 1
+#define ARM_LPAE_TCR_TG0_16K 2
+
+#define ARM_LPAE_TCR_TG1_16K 1
+#define ARM_LPAE_TCR_TG1_4K 2
+#define ARM_LPAE_TCR_TG1_64K 3
+
+#define ARM_LPAE_TCR_SH_NS 0
+#define ARM_LPAE_TCR_SH_OS 2
+#define ARM_LPAE_TCR_SH_IS 3
+
+#define ARM_LPAE_TCR_RGN_NC 0
+#define ARM_LPAE_TCR_RGN_WBWA 1
+#define ARM_LPAE_TCR_RGN_WT 2
+#define ARM_LPAE_TCR_RGN_WB 3
+
+#define ARM_LPAE_TCR_PS_32_BIT 0x0ULL
+#define ARM_LPAE_TCR_PS_36_BIT 0x1ULL
+#define ARM_LPAE_TCR_PS_40_BIT 0x2ULL
+#define ARM_LPAE_TCR_PS_42_BIT 0x3ULL
+#define ARM_LPAE_TCR_PS_44_BIT 0x4ULL
+#define ARM_LPAE_TCR_PS_48_BIT 0x5ULL
+#define ARM_LPAE_TCR_PS_52_BIT 0x6ULL
+
+#endif /* IO_PGTABLE_ARM_H_ */
diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
new file mode 100644
index 000000000..94394c814
--- /dev/null
+++ b/drivers/iommu/io-pgtable.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Generic page table allocator for IOMMUs.
+ *
+ * Copyright (C) 2014 ARM Limited
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ */
+
+#include <linux/bug.h>
+#include <linux/io-pgtable.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+static const struct io_pgtable_init_fns *
+io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
+#ifdef CONFIG_IOMMU_IO_PGTABLE_LPAE
+ [ARM_32_LPAE_S1] = &io_pgtable_arm_32_lpae_s1_init_fns,
+ [ARM_32_LPAE_S2] = &io_pgtable_arm_32_lpae_s2_init_fns,
+ [ARM_64_LPAE_S1] = &io_pgtable_arm_64_lpae_s1_init_fns,
+ [ARM_64_LPAE_S2] = &io_pgtable_arm_64_lpae_s2_init_fns,
+ [ARM_MALI_LPAE] = &io_pgtable_arm_mali_lpae_init_fns,
+#endif
+#ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S
+ [ARM_V7S] = &io_pgtable_arm_v7s_init_fns,
+#endif
+};
+
+struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
+ struct io_pgtable_cfg *cfg,
+ void *cookie)
+{
+ struct io_pgtable *iop;
+ const struct io_pgtable_init_fns *fns;
+
+ if (fmt >= IO_PGTABLE_NUM_FMTS)
+ return NULL;
+
+ fns = io_pgtable_init_table[fmt];
+ if (!fns)
+ return NULL;
+
+ iop = fns->alloc(cfg, cookie);
+ if (!iop)
+ return NULL;
+
+ iop->fmt = fmt;
+ iop->cookie = cookie;
+ iop->cfg = *cfg;
+
+ return &iop->ops;
+}
+EXPORT_SYMBOL_GPL(alloc_io_pgtable_ops);
+
+/*
+ * It is the IOMMU driver's responsibility to ensure that the page table
+ * is no longer accessible to the walker by this point.
+ */
+void free_io_pgtable_ops(struct io_pgtable_ops *ops)
+{
+ struct io_pgtable *iop;
+
+ if (!ops)
+ return;
+
+ iop = io_pgtable_ops_to_pgtable(ops);
+ io_pgtable_tlb_flush_all(iop);
+ io_pgtable_init_table[iop->fmt]->free(iop);
+}
+EXPORT_SYMBOL_GPL(free_io_pgtable_ops);
diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
new file mode 100644
index 000000000..0f8dd377a
--- /dev/null
+++ b/drivers/iommu/ioasid.c
@@ -0,0 +1,422 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * I/O Address Space ID allocator. There is one global IOASID space, split into
+ * subsets. Users create a subset with DECLARE_IOASID_SET, then allocate and
+ * free IOASIDs with ioasid_alloc and ioasid_free.
+ */
+#include <linux/ioasid.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/xarray.h>
+
+struct ioasid_data {
+ ioasid_t id;
+ struct ioasid_set *set;
+ void *private;
+ struct rcu_head rcu;
+};
+
+/*
+ * struct ioasid_allocator_data - Internal data structure to hold information
+ * about an allocator. There are two types of allocators:
+ *
+ * - Default allocator always has its own XArray to track the IOASIDs allocated.
+ * - Custom allocators may share allocation helpers with different private data.
+ * Custom allocators that share the same helper functions also share the same
+ * XArray.
+ * Rules:
+ * 1. Default allocator is always available, not dynamically registered. This is
+ * to prevent race conditions with early boot code that want to register
+ * custom allocators or allocate IOASIDs.
+ * 2. Custom allocators take precedence over the default allocator.
+ * 3. When all custom allocators sharing the same helper functions are
+ * unregistered (e.g. due to hotplug), all outstanding IOASIDs must be
+ * freed. Otherwise, outstanding IOASIDs will be lost and orphaned.
+ * 4. When switching between custom allocators sharing the same helper
+ * functions, outstanding IOASIDs are preserved.
+ * 5. When switching between custom allocator and default allocator, all IOASIDs
+ * must be freed to ensure unadulterated space for the new allocator.
+ *
+ * @ops: allocator helper functions and its data
+ * @list: registered custom allocators
+ * @slist: allocators share the same ops but different data
+ * @flags: attributes of the allocator
+ * @xa: xarray holds the IOASID space
+ * @rcu: used for kfree_rcu when unregistering allocator
+ */
+struct ioasid_allocator_data {
+ struct ioasid_allocator_ops *ops;
+ struct list_head list;
+ struct list_head slist;
+#define IOASID_ALLOCATOR_CUSTOM BIT(0) /* Needs framework to track results */
+ unsigned long flags;
+ struct xarray xa;
+ struct rcu_head rcu;
+};
+
+static DEFINE_SPINLOCK(ioasid_allocator_lock);
+static LIST_HEAD(allocators_list);
+
+static ioasid_t default_alloc(ioasid_t min, ioasid_t max, void *opaque);
+static void default_free(ioasid_t ioasid, void *opaque);
+
+static struct ioasid_allocator_ops default_ops = {
+ .alloc = default_alloc,
+ .free = default_free,
+};
+
+static struct ioasid_allocator_data default_allocator = {
+ .ops = &default_ops,
+ .flags = 0,
+ .xa = XARRAY_INIT(ioasid_xa, XA_FLAGS_ALLOC),
+};
+
+static struct ioasid_allocator_data *active_allocator = &default_allocator;
+
+static ioasid_t default_alloc(ioasid_t min, ioasid_t max, void *opaque)
+{
+ ioasid_t id;
+
+ if (xa_alloc(&default_allocator.xa, &id, opaque, XA_LIMIT(min, max), GFP_ATOMIC)) {
+ pr_err("Failed to alloc ioasid from %d to %d\n", min, max);
+ return INVALID_IOASID;
+ }
+
+ return id;
+}
+
+static void default_free(ioasid_t ioasid, void *opaque)
+{
+ struct ioasid_data *ioasid_data;
+
+ ioasid_data = xa_erase(&default_allocator.xa, ioasid);
+ kfree_rcu(ioasid_data, rcu);
+}
+
+/* Allocate and initialize a new custom allocator with its helper functions */
+static struct ioasid_allocator_data *ioasid_alloc_allocator(struct ioasid_allocator_ops *ops)
+{
+ struct ioasid_allocator_data *ia_data;
+
+ ia_data = kzalloc(sizeof(*ia_data), GFP_ATOMIC);
+ if (!ia_data)
+ return NULL;
+
+ xa_init_flags(&ia_data->xa, XA_FLAGS_ALLOC);
+ INIT_LIST_HEAD(&ia_data->slist);
+ ia_data->flags |= IOASID_ALLOCATOR_CUSTOM;
+ ia_data->ops = ops;
+
+ /* For tracking custom allocators that share the same ops */
+ list_add_tail(&ops->list, &ia_data->slist);
+
+ return ia_data;
+}
+
+static bool use_same_ops(struct ioasid_allocator_ops *a, struct ioasid_allocator_ops *b)
+{
+ return (a->free == b->free) && (a->alloc == b->alloc);
+}
+
+/**
+ * ioasid_register_allocator - register a custom allocator
+ * @ops: the custom allocator ops to be registered
+ *
+ * Custom allocators take precedence over the default xarray based allocator.
+ * Private data associated with the IOASID allocated by the custom allocators
+ * are managed by IOASID framework similar to data stored in xa by default
+ * allocator.
+ *
+ * There can be multiple allocators registered but only one is active. In case
+ * of runtime removal of a custom allocator, the next one is activated based
+ * on the registration ordering.
+ *
+ * Multiple allocators can share the same alloc() function, in this case the
+ * IOASID space is shared.
+ */
+int ioasid_register_allocator(struct ioasid_allocator_ops *ops)
+{
+ struct ioasid_allocator_data *ia_data;
+ struct ioasid_allocator_data *pallocator;
+ int ret = 0;
+
+ spin_lock(&ioasid_allocator_lock);
+
+ ia_data = ioasid_alloc_allocator(ops);
+ if (!ia_data) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ /*
+ * No particular preference, we activate the first one and keep
+ * the later registered allocators in a list in case the first one gets
+ * removed due to hotplug.
+ */
+ if (list_empty(&allocators_list)) {
+ WARN_ON(active_allocator != &default_allocator);
+ /* Use this new allocator if default is not active */
+ if (xa_empty(&active_allocator->xa)) {
+ rcu_assign_pointer(active_allocator, ia_data);
+ list_add_tail(&ia_data->list, &allocators_list);
+ goto out_unlock;
+ }
+ pr_warn("Default allocator active with outstanding IOASID\n");
+ ret = -EAGAIN;
+ goto out_free;
+ }
+
+ /* Check if the allocator is already registered */
+ list_for_each_entry(pallocator, &allocators_list, list) {
+ if (pallocator->ops == ops) {
+ pr_err("IOASID allocator already registered\n");
+ ret = -EEXIST;
+ goto out_free;
+ } else if (use_same_ops(pallocator->ops, ops)) {
+ /*
+ * If the new allocator shares the same ops,
+ * then they will share the same IOASID space.
+ * We should put them under the same xarray.
+ */
+ list_add_tail(&ops->list, &pallocator->slist);
+ goto out_free;
+ }
+ }
+ list_add_tail(&ia_data->list, &allocators_list);
+
+ spin_unlock(&ioasid_allocator_lock);
+ return 0;
+out_free:
+ kfree(ia_data);
+out_unlock:
+ spin_unlock(&ioasid_allocator_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_register_allocator);
+
+/**
+ * ioasid_unregister_allocator - Remove a custom IOASID allocator ops
+ * @ops: the custom allocator to be removed
+ *
+ * Remove an allocator from the list, activate the next allocator in
+ * the order it was registered. Or revert to default allocator if all
+ * custom allocators are unregistered without outstanding IOASIDs.
+ */
+void ioasid_unregister_allocator(struct ioasid_allocator_ops *ops)
+{
+ struct ioasid_allocator_data *pallocator;
+ struct ioasid_allocator_ops *sops;
+
+ spin_lock(&ioasid_allocator_lock);
+ if (list_empty(&allocators_list)) {
+ pr_warn("No custom IOASID allocators active!\n");
+ goto exit_unlock;
+ }
+
+ list_for_each_entry(pallocator, &allocators_list, list) {
+ if (!use_same_ops(pallocator->ops, ops))
+ continue;
+
+ if (list_is_singular(&pallocator->slist)) {
+ /* No shared helper functions */
+ list_del(&pallocator->list);
+ /*
+ * All IOASIDs should have been freed before
+ * the last allocator that shares the same ops
+ * is unregistered.
+ */
+ WARN_ON(!xa_empty(&pallocator->xa));
+ if (list_empty(&allocators_list)) {
+ pr_info("No custom IOASID allocators, switch to default.\n");
+ rcu_assign_pointer(active_allocator, &default_allocator);
+ } else if (pallocator == active_allocator) {
+ rcu_assign_pointer(active_allocator,
+ list_first_entry(&allocators_list,
+ struct ioasid_allocator_data, list));
+ pr_info("IOASID allocator changed");
+ }
+ kfree_rcu(pallocator, rcu);
+ break;
+ }
+ /*
+ * Find the matching shared ops to delete,
+ * but keep outstanding IOASIDs
+ */
+ list_for_each_entry(sops, &pallocator->slist, list) {
+ if (sops == ops) {
+ list_del(&ops->list);
+ break;
+ }
+ }
+ break;
+ }
+
+exit_unlock:
+ spin_unlock(&ioasid_allocator_lock);
+}
+EXPORT_SYMBOL_GPL(ioasid_unregister_allocator);
+
+/**
+ * ioasid_set_data - Set private data for an allocated ioasid
+ * @ioasid: the ID to set data
+ * @data: the private data
+ *
+ * For IOASID that is already allocated, private data can be set
+ * via this API. Future lookup can be done via ioasid_find.
+ */
+int ioasid_set_data(ioasid_t ioasid, void *data)
+{
+ struct ioasid_data *ioasid_data;
+ int ret = 0;
+
+ spin_lock(&ioasid_allocator_lock);
+ ioasid_data = xa_load(&active_allocator->xa, ioasid);
+ if (ioasid_data)
+ rcu_assign_pointer(ioasid_data->private, data);
+ else
+ ret = -ENOENT;
+ spin_unlock(&ioasid_allocator_lock);
+
+ /*
+ * Wait for readers to stop accessing the old private data, so the
+ * caller can free it.
+ */
+ if (!ret)
+ synchronize_rcu();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_set_data);
+
+/**
+ * ioasid_alloc - Allocate an IOASID
+ * @set: the IOASID set
+ * @min: the minimum ID (inclusive)
+ * @max: the maximum ID (inclusive)
+ * @private: data private to the caller
+ *
+ * Allocate an ID between @min and @max. The @private pointer is stored
+ * internally and can be retrieved with ioasid_find().
+ *
+ * Return: the allocated ID on success, or %INVALID_IOASID on failure.
+ */
+ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
+ void *private)
+{
+ struct ioasid_data *data;
+ void *adata;
+ ioasid_t id;
+
+ data = kzalloc(sizeof(*data), GFP_ATOMIC);
+ if (!data)
+ return INVALID_IOASID;
+
+ data->set = set;
+ data->private = private;
+
+ /*
+ * Custom allocator needs allocator data to perform platform specific
+ * operations.
+ */
+ spin_lock(&ioasid_allocator_lock);
+ adata = active_allocator->flags & IOASID_ALLOCATOR_CUSTOM ? active_allocator->ops->pdata : data;
+ id = active_allocator->ops->alloc(min, max, adata);
+ if (id == INVALID_IOASID) {
+ pr_err("Failed ASID allocation %lu\n", active_allocator->flags);
+ goto exit_free;
+ }
+
+ if ((active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) &&
+ xa_alloc(&active_allocator->xa, &id, data, XA_LIMIT(id, id), GFP_ATOMIC)) {
+ /* Custom allocator needs framework to store and track allocation results */
+ pr_err("Failed to alloc ioasid from %d\n", id);
+ active_allocator->ops->free(id, active_allocator->ops->pdata);
+ goto exit_free;
+ }
+ data->id = id;
+
+ spin_unlock(&ioasid_allocator_lock);
+ return id;
+exit_free:
+ spin_unlock(&ioasid_allocator_lock);
+ kfree(data);
+ return INVALID_IOASID;
+}
+EXPORT_SYMBOL_GPL(ioasid_alloc);
+
+/**
+ * ioasid_free - Free an IOASID
+ * @ioasid: the ID to remove
+ */
+void ioasid_free(ioasid_t ioasid)
+{
+ struct ioasid_data *ioasid_data;
+
+ spin_lock(&ioasid_allocator_lock);
+ ioasid_data = xa_load(&active_allocator->xa, ioasid);
+ if (!ioasid_data) {
+ pr_err("Trying to free unknown IOASID %u\n", ioasid);
+ goto exit_unlock;
+ }
+
+ active_allocator->ops->free(ioasid, active_allocator->ops->pdata);
+ /* Custom allocator needs additional steps to free the xa element */
+ if (active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) {
+ ioasid_data = xa_erase(&active_allocator->xa, ioasid);
+ kfree_rcu(ioasid_data, rcu);
+ }
+
+exit_unlock:
+ spin_unlock(&ioasid_allocator_lock);
+}
+EXPORT_SYMBOL_GPL(ioasid_free);
+
+/**
+ * ioasid_find - Find IOASID data
+ * @set: the IOASID set
+ * @ioasid: the IOASID to find
+ * @getter: function to call on the found object
+ *
+ * The optional getter function allows to take a reference to the found object
+ * under the rcu lock. The function can also check if the object is still valid:
+ * if @getter returns false, then the object is invalid and NULL is returned.
+ *
+ * If the IOASID exists, return the private pointer passed to ioasid_alloc.
+ * Private data can be NULL if not set. Return an error if the IOASID is not
+ * found, or if @set is not NULL and the IOASID does not belong to the set.
+ */
+void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid,
+ bool (*getter)(void *))
+{
+ void *priv;
+ struct ioasid_data *ioasid_data;
+ struct ioasid_allocator_data *idata;
+
+ rcu_read_lock();
+ idata = rcu_dereference(active_allocator);
+ ioasid_data = xa_load(&idata->xa, ioasid);
+ if (!ioasid_data) {
+ priv = ERR_PTR(-ENOENT);
+ goto unlock;
+ }
+ if (set && ioasid_data->set != set) {
+ /* data found but does not belong to the set */
+ priv = ERR_PTR(-EACCES);
+ goto unlock;
+ }
+ /* Now IOASID and its set is verified, we can return the private data */
+ priv = rcu_dereference(ioasid_data->private);
+ if (getter && !getter(priv))
+ priv = NULL;
+unlock:
+ rcu_read_unlock();
+
+ return priv;
+}
+EXPORT_SYMBOL_GPL(ioasid_find);
+
+MODULE_AUTHOR("Jean-Philippe Brucker <jean-philippe.brucker@arm.com>");
+MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
+MODULE_DESCRIPTION("IO Address Space ID (IOASID) allocator");
+MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/iommu-debugfs.c b/drivers/iommu/iommu-debugfs.c
new file mode 100644
index 000000000..f03548942
--- /dev/null
+++ b/drivers/iommu/iommu-debugfs.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * IOMMU debugfs core infrastructure
+ *
+ * Copyright (C) 2018 Advanced Micro Devices, Inc.
+ *
+ * Author: Gary R Hook <gary.hook@amd.com>
+ */
+
+#include <linux/pci.h>
+#include <linux/iommu.h>
+#include <linux/debugfs.h>
+
+struct dentry *iommu_debugfs_dir;
+EXPORT_SYMBOL_GPL(iommu_debugfs_dir);
+
+/**
+ * iommu_debugfs_setup - create the top-level iommu directory in debugfs
+ *
+ * Provide base enablement for using debugfs to expose internal data of an
+ * IOMMU driver. When called, this function creates the
+ * /sys/kernel/debug/iommu directory.
+ *
+ * Emit a strong warning at boot time to indicate that this feature is
+ * enabled.
+ *
+ * This function is called from iommu_init; drivers may then use
+ * iommu_debugfs_dir to instantiate a vendor-specific directory to be used
+ * to expose internal data.
+ */
+void iommu_debugfs_setup(void)
+{
+ if (!iommu_debugfs_dir) {
+ iommu_debugfs_dir = debugfs_create_dir("iommu", NULL);
+ pr_warn("\n");
+ pr_warn("*************************************************************\n");
+ pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
+ pr_warn("** **\n");
+ pr_warn("** IOMMU DebugFS SUPPORT HAS BEEN ENABLED IN THIS KERNEL **\n");
+ pr_warn("** **\n");
+ pr_warn("** This means that this kernel is built to expose internal **\n");
+ pr_warn("** IOMMU data structures, which may compromise security on **\n");
+ pr_warn("** your system. **\n");
+ pr_warn("** **\n");
+ pr_warn("** If you see this message and you are not debugging the **\n");
+ pr_warn("** kernel, report this immediately to your vendor! **\n");
+ pr_warn("** **\n");
+ pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
+ pr_warn("*************************************************************\n");
+ }
+}
diff --git a/drivers/iommu/iommu-sysfs.c b/drivers/iommu/iommu-sysfs.c
new file mode 100644
index 000000000..99869217f
--- /dev/null
+++ b/drivers/iommu/iommu-sysfs.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IOMMU sysfs class support
+ *
+ * Copyright (C) 2014 Red Hat, Inc. All rights reserved.
+ * Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/device.h>
+#include <linux/iommu.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+
+/*
+ * We provide a common class "devices" group which initially has no attributes.
+ * As devices are added to the IOMMU, we'll add links to the group.
+ */
+static struct attribute *devices_attr[] = {
+ NULL,
+};
+
+static const struct attribute_group devices_attr_group = {
+ .name = "devices",
+ .attrs = devices_attr,
+};
+
+static const struct attribute_group *dev_groups[] = {
+ &devices_attr_group,
+ NULL,
+};
+
+static void release_device(struct device *dev)
+{
+ kfree(dev);
+}
+
+static struct class iommu_class = {
+ .name = "iommu",
+ .dev_release = release_device,
+ .dev_groups = dev_groups,
+};
+
+static int __init iommu_dev_init(void)
+{
+ return class_register(&iommu_class);
+}
+postcore_initcall(iommu_dev_init);
+
+/*
+ * Init the struct device for the IOMMU. IOMMU specific attributes can
+ * be provided as an attribute group, allowing a unique namespace per
+ * IOMMU type.
+ */
+int iommu_device_sysfs_add(struct iommu_device *iommu,
+ struct device *parent,
+ const struct attribute_group **groups,
+ const char *fmt, ...)
+{
+ va_list vargs;
+ int ret;
+
+ iommu->dev = kzalloc(sizeof(*iommu->dev), GFP_KERNEL);
+ if (!iommu->dev)
+ return -ENOMEM;
+
+ device_initialize(iommu->dev);
+
+ iommu->dev->class = &iommu_class;
+ iommu->dev->parent = parent;
+ iommu->dev->groups = groups;
+
+ va_start(vargs, fmt);
+ ret = kobject_set_name_vargs(&iommu->dev->kobj, fmt, vargs);
+ va_end(vargs);
+ if (ret)
+ goto error;
+
+ ret = device_add(iommu->dev);
+ if (ret)
+ goto error;
+
+ dev_set_drvdata(iommu->dev, iommu);
+
+ return 0;
+
+error:
+ put_device(iommu->dev);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_device_sysfs_add);
+
+void iommu_device_sysfs_remove(struct iommu_device *iommu)
+{
+ dev_set_drvdata(iommu->dev, NULL);
+ device_unregister(iommu->dev);
+ iommu->dev = NULL;
+}
+EXPORT_SYMBOL_GPL(iommu_device_sysfs_remove);
+
+/*
+ * IOMMU drivers can indicate a device is managed by a given IOMMU using
+ * this interface. A link to the device will be created in the "devices"
+ * directory of the IOMMU device in sysfs and an "iommu" link will be
+ * created under the linked device, pointing back at the IOMMU device.
+ */
+int iommu_device_link(struct iommu_device *iommu, struct device *link)
+{
+ int ret;
+
+ if (!iommu || IS_ERR(iommu))
+ return -ENODEV;
+
+ ret = sysfs_add_link_to_group(&iommu->dev->kobj, "devices",
+ &link->kobj, dev_name(link));
+ if (ret)
+ return ret;
+
+ ret = sysfs_create_link_nowarn(&link->kobj, &iommu->dev->kobj, "iommu");
+ if (ret)
+ sysfs_remove_link_from_group(&iommu->dev->kobj, "devices",
+ dev_name(link));
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_device_link);
+
+void iommu_device_unlink(struct iommu_device *iommu, struct device *link)
+{
+ if (!iommu || IS_ERR(iommu))
+ return;
+
+ sysfs_remove_link(&link->kobj, "iommu");
+ sysfs_remove_link_from_group(&iommu->dev->kobj, "devices", dev_name(link));
+}
+EXPORT_SYMBOL_GPL(iommu_device_unlink);
diff --git a/drivers/iommu/iommu-traces.c b/drivers/iommu/iommu-traces.c
new file mode 100644
index 000000000..1e9ca7789
--- /dev/null
+++ b/drivers/iommu/iommu-traces.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * iommu trace points
+ *
+ * Copyright (C) 2013 Shuah Khan <shuah.kh@samsung.com>
+ *
+ */
+
+#include <linux/string.h>
+#include <linux/types.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/iommu.h>
+
+/* iommu_group_event */
+EXPORT_TRACEPOINT_SYMBOL_GPL(add_device_to_group);
+EXPORT_TRACEPOINT_SYMBOL_GPL(remove_device_from_group);
+
+/* iommu_device_event */
+EXPORT_TRACEPOINT_SYMBOL_GPL(attach_device_to_domain);
+EXPORT_TRACEPOINT_SYMBOL_GPL(detach_device_from_domain);
+
+/* iommu_map_unmap */
+EXPORT_TRACEPOINT_SYMBOL_GPL(map);
+EXPORT_TRACEPOINT_SYMBOL_GPL(unmap);
+
+/* iommu_error */
+EXPORT_TRACEPOINT_SYMBOL_GPL(io_page_fault);
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
new file mode 100644
index 000000000..9d65557df
--- /dev/null
+++ b/drivers/iommu/iommu.c
@@ -0,0 +1,3060 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+#define pr_fmt(fmt) "iommu: " fmt
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/bug.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/iommu.h>
+#include <linux/idr.h>
+#include <linux/notifier.h>
+#include <linux/err.h>
+#include <linux/pci.h>
+#include <linux/bitops.h>
+#include <linux/property.h>
+#include <linux/fsl/mc.h>
+#include <linux/module.h>
+#include <trace/events/iommu.h>
+
+static struct kset *iommu_group_kset;
+static DEFINE_IDA(iommu_group_ida);
+
+static unsigned int iommu_def_domain_type __read_mostly;
+static bool iommu_dma_strict __read_mostly = true;
+static u32 iommu_cmd_line __read_mostly;
+
+struct iommu_group {
+ struct kobject kobj;
+ struct kobject *devices_kobj;
+ struct list_head devices;
+ struct mutex mutex;
+ struct blocking_notifier_head notifier;
+ void *iommu_data;
+ void (*iommu_data_release)(void *iommu_data);
+ char *name;
+ int id;
+ struct iommu_domain *default_domain;
+ struct iommu_domain *domain;
+ struct list_head entry;
+};
+
+struct group_device {
+ struct list_head list;
+ struct device *dev;
+ char *name;
+};
+
+struct iommu_group_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct iommu_group *group, char *buf);
+ ssize_t (*store)(struct iommu_group *group,
+ const char *buf, size_t count);
+};
+
+static const char * const iommu_group_resv_type_string[] = {
+ [IOMMU_RESV_DIRECT] = "direct",
+ [IOMMU_RESV_DIRECT_RELAXABLE] = "direct-relaxable",
+ [IOMMU_RESV_RESERVED] = "reserved",
+ [IOMMU_RESV_MSI] = "msi",
+ [IOMMU_RESV_SW_MSI] = "msi",
+};
+
+#define IOMMU_CMD_LINE_DMA_API BIT(0)
+
+static void iommu_set_cmd_line_dma_api(void)
+{
+ iommu_cmd_line |= IOMMU_CMD_LINE_DMA_API;
+}
+
+static bool iommu_cmd_line_dma_api(void)
+{
+ return !!(iommu_cmd_line & IOMMU_CMD_LINE_DMA_API);
+}
+
+static int iommu_alloc_default_domain(struct iommu_group *group,
+ struct device *dev);
+static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
+ unsigned type);
+static int __iommu_attach_device(struct iommu_domain *domain,
+ struct device *dev);
+static int __iommu_attach_group(struct iommu_domain *domain,
+ struct iommu_group *group);
+static void __iommu_detach_group(struct iommu_domain *domain,
+ struct iommu_group *group);
+static int iommu_create_device_direct_mappings(struct iommu_group *group,
+ struct device *dev);
+static struct iommu_group *iommu_group_get_for_dev(struct device *dev);
+
+#define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \
+struct iommu_group_attribute iommu_group_attr_##_name = \
+ __ATTR(_name, _mode, _show, _store)
+
+#define to_iommu_group_attr(_attr) \
+ container_of(_attr, struct iommu_group_attribute, attr)
+#define to_iommu_group(_kobj) \
+ container_of(_kobj, struct iommu_group, kobj)
+
+static LIST_HEAD(iommu_device_list);
+static DEFINE_SPINLOCK(iommu_device_lock);
+
+/*
+ * Use a function instead of an array here because the domain-type is a
+ * bit-field, so an array would waste memory.
+ */
+static const char *iommu_domain_type_str(unsigned int t)
+{
+ switch (t) {
+ case IOMMU_DOMAIN_BLOCKED:
+ return "Blocked";
+ case IOMMU_DOMAIN_IDENTITY:
+ return "Passthrough";
+ case IOMMU_DOMAIN_UNMANAGED:
+ return "Unmanaged";
+ case IOMMU_DOMAIN_DMA:
+ return "Translated";
+ default:
+ return "Unknown";
+ }
+}
+
+static int __init iommu_subsys_init(void)
+{
+ bool cmd_line = iommu_cmd_line_dma_api();
+
+ if (!cmd_line) {
+ if (IS_ENABLED(CONFIG_IOMMU_DEFAULT_PASSTHROUGH))
+ iommu_set_default_passthrough(false);
+ else
+ iommu_set_default_translated(false);
+
+ if (iommu_default_passthrough() && mem_encrypt_active()) {
+ pr_info("Memory encryption detected - Disabling default IOMMU Passthrough\n");
+ iommu_set_default_translated(false);
+ }
+ }
+
+ pr_info("Default domain type: %s %s\n",
+ iommu_domain_type_str(iommu_def_domain_type),
+ cmd_line ? "(set via kernel command line)" : "");
+
+ return 0;
+}
+subsys_initcall(iommu_subsys_init);
+
+int iommu_device_register(struct iommu_device *iommu)
+{
+ spin_lock(&iommu_device_lock);
+ list_add_tail(&iommu->list, &iommu_device_list);
+ spin_unlock(&iommu_device_lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_device_register);
+
+void iommu_device_unregister(struct iommu_device *iommu)
+{
+ spin_lock(&iommu_device_lock);
+ list_del(&iommu->list);
+ spin_unlock(&iommu_device_lock);
+}
+EXPORT_SYMBOL_GPL(iommu_device_unregister);
+
+static struct dev_iommu *dev_iommu_get(struct device *dev)
+{
+ struct dev_iommu *param = dev->iommu;
+
+ if (param)
+ return param;
+
+ param = kzalloc(sizeof(*param), GFP_KERNEL);
+ if (!param)
+ return NULL;
+
+ mutex_init(&param->lock);
+ dev->iommu = param;
+ return param;
+}
+
+static void dev_iommu_free(struct device *dev)
+{
+ struct dev_iommu *param = dev->iommu;
+
+ dev->iommu = NULL;
+ if (param->fwspec) {
+ fwnode_handle_put(param->fwspec->iommu_fwnode);
+ kfree(param->fwspec);
+ }
+ kfree(param);
+}
+
+static int __iommu_probe_device(struct device *dev, struct list_head *group_list)
+{
+ const struct iommu_ops *ops = dev->bus->iommu_ops;
+ struct iommu_device *iommu_dev;
+ struct iommu_group *group;
+ int ret;
+
+ if (!ops)
+ return -ENODEV;
+
+ if (!dev_iommu_get(dev))
+ return -ENOMEM;
+
+ if (!try_module_get(ops->owner)) {
+ ret = -EINVAL;
+ goto err_free;
+ }
+
+ iommu_dev = ops->probe_device(dev);
+ if (IS_ERR(iommu_dev)) {
+ ret = PTR_ERR(iommu_dev);
+ goto out_module_put;
+ }
+
+ dev->iommu->iommu_dev = iommu_dev;
+
+ group = iommu_group_get_for_dev(dev);
+ if (IS_ERR(group)) {
+ ret = PTR_ERR(group);
+ goto out_release;
+ }
+ iommu_group_put(group);
+
+ if (group_list && !group->default_domain && list_empty(&group->entry))
+ list_add_tail(&group->entry, group_list);
+
+ iommu_device_link(iommu_dev, dev);
+
+ return 0;
+
+out_release:
+ ops->release_device(dev);
+
+out_module_put:
+ module_put(ops->owner);
+
+err_free:
+ dev_iommu_free(dev);
+
+ return ret;
+}
+
+int iommu_probe_device(struct device *dev)
+{
+ const struct iommu_ops *ops = dev->bus->iommu_ops;
+ struct iommu_group *group;
+ int ret;
+
+ ret = __iommu_probe_device(dev, NULL);
+ if (ret)
+ goto err_out;
+
+ group = iommu_group_get(dev);
+ if (!group)
+ goto err_release;
+
+ /*
+ * Try to allocate a default domain - needs support from the
+ * IOMMU driver. There are still some drivers which don't
+ * support default domains, so the return value is not yet
+ * checked.
+ */
+ iommu_alloc_default_domain(group, dev);
+
+ if (group->default_domain) {
+ ret = __iommu_attach_device(group->default_domain, dev);
+ if (ret) {
+ iommu_group_put(group);
+ goto err_release;
+ }
+ }
+
+ iommu_create_device_direct_mappings(group, dev);
+
+ iommu_group_put(group);
+
+ if (ops->probe_finalize)
+ ops->probe_finalize(dev);
+
+ return 0;
+
+err_release:
+ iommu_release_device(dev);
+
+err_out:
+ return ret;
+
+}
+
+void iommu_release_device(struct device *dev)
+{
+ const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+ if (!dev->iommu)
+ return;
+
+ iommu_device_unlink(dev->iommu->iommu_dev, dev);
+
+ ops->release_device(dev);
+
+ iommu_group_remove_device(dev);
+ module_put(ops->owner);
+ dev_iommu_free(dev);
+}
+
+static int __init iommu_set_def_domain_type(char *str)
+{
+ bool pt;
+ int ret;
+
+ ret = kstrtobool(str, &pt);
+ if (ret)
+ return ret;
+
+ if (pt)
+ iommu_set_default_passthrough(true);
+ else
+ iommu_set_default_translated(true);
+
+ return 0;
+}
+early_param("iommu.passthrough", iommu_set_def_domain_type);
+
+static int __init iommu_dma_setup(char *str)
+{
+ return kstrtobool(str, &iommu_dma_strict);
+}
+early_param("iommu.strict", iommu_dma_setup);
+
+static ssize_t iommu_group_attr_show(struct kobject *kobj,
+ struct attribute *__attr, char *buf)
+{
+ struct iommu_group_attribute *attr = to_iommu_group_attr(__attr);
+ struct iommu_group *group = to_iommu_group(kobj);
+ ssize_t ret = -EIO;
+
+ if (attr->show)
+ ret = attr->show(group, buf);
+ return ret;
+}
+
+static ssize_t iommu_group_attr_store(struct kobject *kobj,
+ struct attribute *__attr,
+ const char *buf, size_t count)
+{
+ struct iommu_group_attribute *attr = to_iommu_group_attr(__attr);
+ struct iommu_group *group = to_iommu_group(kobj);
+ ssize_t ret = -EIO;
+
+ if (attr->store)
+ ret = attr->store(group, buf, count);
+ return ret;
+}
+
+static const struct sysfs_ops iommu_group_sysfs_ops = {
+ .show = iommu_group_attr_show,
+ .store = iommu_group_attr_store,
+};
+
+static int iommu_group_create_file(struct iommu_group *group,
+ struct iommu_group_attribute *attr)
+{
+ return sysfs_create_file(&group->kobj, &attr->attr);
+}
+
+static void iommu_group_remove_file(struct iommu_group *group,
+ struct iommu_group_attribute *attr)
+{
+ sysfs_remove_file(&group->kobj, &attr->attr);
+}
+
+static ssize_t iommu_group_show_name(struct iommu_group *group, char *buf)
+{
+ return sprintf(buf, "%s\n", group->name);
+}
+
+/**
+ * iommu_insert_resv_region - Insert a new region in the
+ * list of reserved regions.
+ * @new: new region to insert
+ * @regions: list of regions
+ *
+ * Elements are sorted by start address and overlapping segments
+ * of the same type are merged.
+ */
+static int iommu_insert_resv_region(struct iommu_resv_region *new,
+ struct list_head *regions)
+{
+ struct iommu_resv_region *iter, *tmp, *nr, *top;
+ LIST_HEAD(stack);
+
+ nr = iommu_alloc_resv_region(new->start, new->length,
+ new->prot, new->type);
+ if (!nr)
+ return -ENOMEM;
+
+ /* First add the new element based on start address sorting */
+ list_for_each_entry(iter, regions, list) {
+ if (nr->start < iter->start ||
+ (nr->start == iter->start && nr->type <= iter->type))
+ break;
+ }
+ list_add_tail(&nr->list, &iter->list);
+
+ /* Merge overlapping segments of type nr->type in @regions, if any */
+ list_for_each_entry_safe(iter, tmp, regions, list) {
+ phys_addr_t top_end, iter_end = iter->start + iter->length - 1;
+
+ /* no merge needed on elements of different types than @new */
+ if (iter->type != new->type) {
+ list_move_tail(&iter->list, &stack);
+ continue;
+ }
+
+ /* look for the last stack element of same type as @iter */
+ list_for_each_entry_reverse(top, &stack, list)
+ if (top->type == iter->type)
+ goto check_overlap;
+
+ list_move_tail(&iter->list, &stack);
+ continue;
+
+check_overlap:
+ top_end = top->start + top->length - 1;
+
+ if (iter->start > top_end + 1) {
+ list_move_tail(&iter->list, &stack);
+ } else {
+ top->length = max(top_end, iter_end) - top->start + 1;
+ list_del(&iter->list);
+ kfree(iter);
+ }
+ }
+ list_splice(&stack, regions);
+ return 0;
+}
+
+static int
+iommu_insert_device_resv_regions(struct list_head *dev_resv_regions,
+ struct list_head *group_resv_regions)
+{
+ struct iommu_resv_region *entry;
+ int ret = 0;
+
+ list_for_each_entry(entry, dev_resv_regions, list) {
+ ret = iommu_insert_resv_region(entry, group_resv_regions);
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+int iommu_get_group_resv_regions(struct iommu_group *group,
+ struct list_head *head)
+{
+ struct group_device *device;
+ int ret = 0;
+
+ mutex_lock(&group->mutex);
+ list_for_each_entry(device, &group->devices, list) {
+ struct list_head dev_resv_regions;
+
+ INIT_LIST_HEAD(&dev_resv_regions);
+ iommu_get_resv_regions(device->dev, &dev_resv_regions);
+ ret = iommu_insert_device_resv_regions(&dev_resv_regions, head);
+ iommu_put_resv_regions(device->dev, &dev_resv_regions);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&group->mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_get_group_resv_regions);
+
+static ssize_t iommu_group_show_resv_regions(struct iommu_group *group,
+ char *buf)
+{
+ struct iommu_resv_region *region, *next;
+ struct list_head group_resv_regions;
+ char *str = buf;
+
+ INIT_LIST_HEAD(&group_resv_regions);
+ iommu_get_group_resv_regions(group, &group_resv_regions);
+
+ list_for_each_entry_safe(region, next, &group_resv_regions, list) {
+ str += sprintf(str, "0x%016llx 0x%016llx %s\n",
+ (long long int)region->start,
+ (long long int)(region->start +
+ region->length - 1),
+ iommu_group_resv_type_string[region->type]);
+ kfree(region);
+ }
+
+ return (str - buf);
+}
+
+static ssize_t iommu_group_show_type(struct iommu_group *group,
+ char *buf)
+{
+ char *type = "unknown\n";
+
+ if (group->default_domain) {
+ switch (group->default_domain->type) {
+ case IOMMU_DOMAIN_BLOCKED:
+ type = "blocked\n";
+ break;
+ case IOMMU_DOMAIN_IDENTITY:
+ type = "identity\n";
+ break;
+ case IOMMU_DOMAIN_UNMANAGED:
+ type = "unmanaged\n";
+ break;
+ case IOMMU_DOMAIN_DMA:
+ type = "DMA\n";
+ break;
+ }
+ }
+ strcpy(buf, type);
+
+ return strlen(type);
+}
+
+static IOMMU_GROUP_ATTR(name, S_IRUGO, iommu_group_show_name, NULL);
+
+static IOMMU_GROUP_ATTR(reserved_regions, 0444,
+ iommu_group_show_resv_regions, NULL);
+
+static IOMMU_GROUP_ATTR(type, 0444, iommu_group_show_type, NULL);
+
+static void iommu_group_release(struct kobject *kobj)
+{
+ struct iommu_group *group = to_iommu_group(kobj);
+
+ pr_debug("Releasing group %d\n", group->id);
+
+ if (group->iommu_data_release)
+ group->iommu_data_release(group->iommu_data);
+
+ ida_simple_remove(&iommu_group_ida, group->id);
+
+ if (group->default_domain)
+ iommu_domain_free(group->default_domain);
+
+ kfree(group->name);
+ kfree(group);
+}
+
+static struct kobj_type iommu_group_ktype = {
+ .sysfs_ops = &iommu_group_sysfs_ops,
+ .release = iommu_group_release,
+};
+
+/**
+ * iommu_group_alloc - Allocate a new group
+ *
+ * This function is called by an iommu driver to allocate a new iommu
+ * group. The iommu group represents the minimum granularity of the iommu.
+ * Upon successful return, the caller holds a reference to the supplied
+ * group in order to hold the group until devices are added. Use
+ * iommu_group_put() to release this extra reference count, allowing the
+ * group to be automatically reclaimed once it has no devices or external
+ * references.
+ */
+struct iommu_group *iommu_group_alloc(void)
+{
+ struct iommu_group *group;
+ int ret;
+
+ group = kzalloc(sizeof(*group), GFP_KERNEL);
+ if (!group)
+ return ERR_PTR(-ENOMEM);
+
+ group->kobj.kset = iommu_group_kset;
+ mutex_init(&group->mutex);
+ INIT_LIST_HEAD(&group->devices);
+ INIT_LIST_HEAD(&group->entry);
+ BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
+
+ ret = ida_simple_get(&iommu_group_ida, 0, 0, GFP_KERNEL);
+ if (ret < 0) {
+ kfree(group);
+ return ERR_PTR(ret);
+ }
+ group->id = ret;
+
+ ret = kobject_init_and_add(&group->kobj, &iommu_group_ktype,
+ NULL, "%d", group->id);
+ if (ret) {
+ ida_simple_remove(&iommu_group_ida, group->id);
+ kobject_put(&group->kobj);
+ return ERR_PTR(ret);
+ }
+
+ group->devices_kobj = kobject_create_and_add("devices", &group->kobj);
+ if (!group->devices_kobj) {
+ kobject_put(&group->kobj); /* triggers .release & free */
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /*
+ * The devices_kobj holds a reference on the group kobject, so
+ * as long as that exists so will the group. We can therefore
+ * use the devices_kobj for reference counting.
+ */
+ kobject_put(&group->kobj);
+
+ ret = iommu_group_create_file(group,
+ &iommu_group_attr_reserved_regions);
+ if (ret)
+ return ERR_PTR(ret);
+
+ ret = iommu_group_create_file(group, &iommu_group_attr_type);
+ if (ret)
+ return ERR_PTR(ret);
+
+ pr_debug("Allocated group %d\n", group->id);
+
+ return group;
+}
+EXPORT_SYMBOL_GPL(iommu_group_alloc);
+
+struct iommu_group *iommu_group_get_by_id(int id)
+{
+ struct kobject *group_kobj;
+ struct iommu_group *group;
+ const char *name;
+
+ if (!iommu_group_kset)
+ return NULL;
+
+ name = kasprintf(GFP_KERNEL, "%d", id);
+ if (!name)
+ return NULL;
+
+ group_kobj = kset_find_obj(iommu_group_kset, name);
+ kfree(name);
+
+ if (!group_kobj)
+ return NULL;
+
+ group = container_of(group_kobj, struct iommu_group, kobj);
+ BUG_ON(group->id != id);
+
+ kobject_get(group->devices_kobj);
+ kobject_put(&group->kobj);
+
+ return group;
+}
+EXPORT_SYMBOL_GPL(iommu_group_get_by_id);
+
+/**
+ * iommu_group_get_iommudata - retrieve iommu_data registered for a group
+ * @group: the group
+ *
+ * iommu drivers can store data in the group for use when doing iommu
+ * operations. This function provides a way to retrieve it. Caller
+ * should hold a group reference.
+ */
+void *iommu_group_get_iommudata(struct iommu_group *group)
+{
+ return group->iommu_data;
+}
+EXPORT_SYMBOL_GPL(iommu_group_get_iommudata);
+
+/**
+ * iommu_group_set_iommudata - set iommu_data for a group
+ * @group: the group
+ * @iommu_data: new data
+ * @release: release function for iommu_data
+ *
+ * iommu drivers can store data in the group for use when doing iommu
+ * operations. This function provides a way to set the data after
+ * the group has been allocated. Caller should hold a group reference.
+ */
+void iommu_group_set_iommudata(struct iommu_group *group, void *iommu_data,
+ void (*release)(void *iommu_data))
+{
+ group->iommu_data = iommu_data;
+ group->iommu_data_release = release;
+}
+EXPORT_SYMBOL_GPL(iommu_group_set_iommudata);
+
+/**
+ * iommu_group_set_name - set name for a group
+ * @group: the group
+ * @name: name
+ *
+ * Allow iommu driver to set a name for a group. When set it will
+ * appear in a name attribute file under the group in sysfs.
+ */
+int iommu_group_set_name(struct iommu_group *group, const char *name)
+{
+ int ret;
+
+ if (group->name) {
+ iommu_group_remove_file(group, &iommu_group_attr_name);
+ kfree(group->name);
+ group->name = NULL;
+ if (!name)
+ return 0;
+ }
+
+ group->name = kstrdup(name, GFP_KERNEL);
+ if (!group->name)
+ return -ENOMEM;
+
+ ret = iommu_group_create_file(group, &iommu_group_attr_name);
+ if (ret) {
+ kfree(group->name);
+ group->name = NULL;
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_group_set_name);
+
+static int iommu_create_device_direct_mappings(struct iommu_group *group,
+ struct device *dev)
+{
+ struct iommu_domain *domain = group->default_domain;
+ struct iommu_resv_region *entry;
+ struct list_head mappings;
+ unsigned long pg_size;
+ int ret = 0;
+
+ if (!domain || domain->type != IOMMU_DOMAIN_DMA)
+ return 0;
+
+ BUG_ON(!domain->pgsize_bitmap);
+
+ pg_size = 1UL << __ffs(domain->pgsize_bitmap);
+ INIT_LIST_HEAD(&mappings);
+
+ iommu_get_resv_regions(dev, &mappings);
+
+ /* We need to consider overlapping regions for different devices */
+ list_for_each_entry(entry, &mappings, list) {
+ dma_addr_t start, end, addr;
+
+ if (domain->ops->apply_resv_region)
+ domain->ops->apply_resv_region(dev, domain, entry);
+
+ start = ALIGN(entry->start, pg_size);
+ end = ALIGN(entry->start + entry->length, pg_size);
+
+ if (entry->type != IOMMU_RESV_DIRECT &&
+ entry->type != IOMMU_RESV_DIRECT_RELAXABLE)
+ continue;
+
+ for (addr = start; addr < end; addr += pg_size) {
+ phys_addr_t phys_addr;
+
+ phys_addr = iommu_iova_to_phys(domain, addr);
+ if (phys_addr)
+ continue;
+
+ ret = iommu_map(domain, addr, addr, pg_size, entry->prot);
+ if (ret)
+ goto out;
+ }
+
+ }
+
+ iommu_flush_iotlb_all(domain);
+
+out:
+ iommu_put_resv_regions(dev, &mappings);
+
+ return ret;
+}
+
+static bool iommu_is_attach_deferred(struct iommu_domain *domain,
+ struct device *dev)
+{
+ if (domain->ops->is_attach_deferred)
+ return domain->ops->is_attach_deferred(domain, dev);
+
+ return false;
+}
+
+/**
+ * iommu_group_add_device - add a device to an iommu group
+ * @group: the group into which to add the device (reference should be held)
+ * @dev: the device
+ *
+ * This function is called by an iommu driver to add a device into a
+ * group. Adding a device increments the group reference count.
+ */
+int iommu_group_add_device(struct iommu_group *group, struct device *dev)
+{
+ int ret, i = 0;
+ struct group_device *device;
+
+ device = kzalloc(sizeof(*device), GFP_KERNEL);
+ if (!device)
+ return -ENOMEM;
+
+ device->dev = dev;
+
+ ret = sysfs_create_link(&dev->kobj, &group->kobj, "iommu_group");
+ if (ret)
+ goto err_free_device;
+
+ device->name = kasprintf(GFP_KERNEL, "%s", kobject_name(&dev->kobj));
+rename:
+ if (!device->name) {
+ ret = -ENOMEM;
+ goto err_remove_link;
+ }
+
+ ret = sysfs_create_link_nowarn(group->devices_kobj,
+ &dev->kobj, device->name);
+ if (ret) {
+ if (ret == -EEXIST && i >= 0) {
+ /*
+ * Account for the slim chance of collision
+ * and append an instance to the name.
+ */
+ kfree(device->name);
+ device->name = kasprintf(GFP_KERNEL, "%s.%d",
+ kobject_name(&dev->kobj), i++);
+ goto rename;
+ }
+ goto err_free_name;
+ }
+
+ kobject_get(group->devices_kobj);
+
+ dev->iommu_group = group;
+
+ mutex_lock(&group->mutex);
+ list_add_tail(&device->list, &group->devices);
+ if (group->domain && !iommu_is_attach_deferred(group->domain, dev))
+ ret = __iommu_attach_device(group->domain, dev);
+ mutex_unlock(&group->mutex);
+ if (ret)
+ goto err_put_group;
+
+ /* Notify any listeners about change to group. */
+ blocking_notifier_call_chain(&group->notifier,
+ IOMMU_GROUP_NOTIFY_ADD_DEVICE, dev);
+
+ trace_add_device_to_group(group->id, dev);
+
+ dev_info(dev, "Adding to iommu group %d\n", group->id);
+
+ return 0;
+
+err_put_group:
+ mutex_lock(&group->mutex);
+ list_del(&device->list);
+ mutex_unlock(&group->mutex);
+ dev->iommu_group = NULL;
+ kobject_put(group->devices_kobj);
+ sysfs_remove_link(group->devices_kobj, device->name);
+err_free_name:
+ kfree(device->name);
+err_remove_link:
+ sysfs_remove_link(&dev->kobj, "iommu_group");
+err_free_device:
+ kfree(device);
+ dev_err(dev, "Failed to add to iommu group %d: %d\n", group->id, ret);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_group_add_device);
+
+/**
+ * iommu_group_remove_device - remove a device from it's current group
+ * @dev: device to be removed
+ *
+ * This function is called by an iommu driver to remove the device from
+ * it's current group. This decrements the iommu group reference count.
+ */
+void iommu_group_remove_device(struct device *dev)
+{
+ struct iommu_group *group = dev->iommu_group;
+ struct group_device *tmp_device, *device = NULL;
+
+ if (!group)
+ return;
+
+ dev_info(dev, "Removing from iommu group %d\n", group->id);
+
+ /* Pre-notify listeners that a device is being removed. */
+ blocking_notifier_call_chain(&group->notifier,
+ IOMMU_GROUP_NOTIFY_DEL_DEVICE, dev);
+
+ mutex_lock(&group->mutex);
+ list_for_each_entry(tmp_device, &group->devices, list) {
+ if (tmp_device->dev == dev) {
+ device = tmp_device;
+ list_del(&device->list);
+ break;
+ }
+ }
+ mutex_unlock(&group->mutex);
+
+ if (!device)
+ return;
+
+ sysfs_remove_link(group->devices_kobj, device->name);
+ sysfs_remove_link(&dev->kobj, "iommu_group");
+
+ trace_remove_device_from_group(group->id, dev);
+
+ kfree(device->name);
+ kfree(device);
+ dev->iommu_group = NULL;
+ kobject_put(group->devices_kobj);
+}
+EXPORT_SYMBOL_GPL(iommu_group_remove_device);
+
+static int iommu_group_device_count(struct iommu_group *group)
+{
+ struct group_device *entry;
+ int ret = 0;
+
+ list_for_each_entry(entry, &group->devices, list)
+ ret++;
+
+ return ret;
+}
+
+/**
+ * iommu_group_for_each_dev - iterate over each device in the group
+ * @group: the group
+ * @data: caller opaque data to be passed to callback function
+ * @fn: caller supplied callback function
+ *
+ * This function is called by group users to iterate over group devices.
+ * Callers should hold a reference count to the group during callback.
+ * The group->mutex is held across callbacks, which will block calls to
+ * iommu_group_add/remove_device.
+ */
+static int __iommu_group_for_each_dev(struct iommu_group *group, void *data,
+ int (*fn)(struct device *, void *))
+{
+ struct group_device *device;
+ int ret = 0;
+
+ list_for_each_entry(device, &group->devices, list) {
+ ret = fn(device->dev, data);
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+
+int iommu_group_for_each_dev(struct iommu_group *group, void *data,
+ int (*fn)(struct device *, void *))
+{
+ int ret;
+
+ mutex_lock(&group->mutex);
+ ret = __iommu_group_for_each_dev(group, data, fn);
+ mutex_unlock(&group->mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_group_for_each_dev);
+
+/**
+ * iommu_group_get - Return the group for a device and increment reference
+ * @dev: get the group that this device belongs to
+ *
+ * This function is called by iommu drivers and users to get the group
+ * for the specified device. If found, the group is returned and the group
+ * reference in incremented, else NULL.
+ */
+struct iommu_group *iommu_group_get(struct device *dev)
+{
+ struct iommu_group *group = dev->iommu_group;
+
+ if (group)
+ kobject_get(group->devices_kobj);
+
+ return group;
+}
+EXPORT_SYMBOL_GPL(iommu_group_get);
+
+/**
+ * iommu_group_ref_get - Increment reference on a group
+ * @group: the group to use, must not be NULL
+ *
+ * This function is called by iommu drivers to take additional references on an
+ * existing group. Returns the given group for convenience.
+ */
+struct iommu_group *iommu_group_ref_get(struct iommu_group *group)
+{
+ kobject_get(group->devices_kobj);
+ return group;
+}
+EXPORT_SYMBOL_GPL(iommu_group_ref_get);
+
+/**
+ * iommu_group_put - Decrement group reference
+ * @group: the group to use
+ *
+ * This function is called by iommu drivers and users to release the
+ * iommu group. Once the reference count is zero, the group is released.
+ */
+void iommu_group_put(struct iommu_group *group)
+{
+ if (group)
+ kobject_put(group->devices_kobj);
+}
+EXPORT_SYMBOL_GPL(iommu_group_put);
+
+/**
+ * iommu_group_register_notifier - Register a notifier for group changes
+ * @group: the group to watch
+ * @nb: notifier block to signal
+ *
+ * This function allows iommu group users to track changes in a group.
+ * See include/linux/iommu.h for actions sent via this notifier. Caller
+ * should hold a reference to the group throughout notifier registration.
+ */
+int iommu_group_register_notifier(struct iommu_group *group,
+ struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&group->notifier, nb);
+}
+EXPORT_SYMBOL_GPL(iommu_group_register_notifier);
+
+/**
+ * iommu_group_unregister_notifier - Unregister a notifier
+ * @group: the group to watch
+ * @nb: notifier block to signal
+ *
+ * Unregister a previously registered group notifier block.
+ */
+int iommu_group_unregister_notifier(struct iommu_group *group,
+ struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&group->notifier, nb);
+}
+EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier);
+
+/**
+ * iommu_register_device_fault_handler() - Register a device fault handler
+ * @dev: the device
+ * @handler: the fault handler
+ * @data: private data passed as argument to the handler
+ *
+ * When an IOMMU fault event is received, this handler gets called with the
+ * fault event and data as argument. The handler should return 0 on success. If
+ * the fault is recoverable (IOMMU_FAULT_PAGE_REQ), the consumer should also
+ * complete the fault by calling iommu_page_response() with one of the following
+ * response code:
+ * - IOMMU_PAGE_RESP_SUCCESS: retry the translation
+ * - IOMMU_PAGE_RESP_INVALID: terminate the fault
+ * - IOMMU_PAGE_RESP_FAILURE: terminate the fault and stop reporting
+ * page faults if possible.
+ *
+ * Return 0 if the fault handler was installed successfully, or an error.
+ */
+int iommu_register_device_fault_handler(struct device *dev,
+ iommu_dev_fault_handler_t handler,
+ void *data)
+{
+ struct dev_iommu *param = dev->iommu;
+ int ret = 0;
+
+ if (!param)
+ return -EINVAL;
+
+ mutex_lock(&param->lock);
+ /* Only allow one fault handler registered for each device */
+ if (param->fault_param) {
+ ret = -EBUSY;
+ goto done_unlock;
+ }
+
+ get_device(dev);
+ param->fault_param = kzalloc(sizeof(*param->fault_param), GFP_KERNEL);
+ if (!param->fault_param) {
+ put_device(dev);
+ ret = -ENOMEM;
+ goto done_unlock;
+ }
+ param->fault_param->handler = handler;
+ param->fault_param->data = data;
+ mutex_init(&param->fault_param->lock);
+ INIT_LIST_HEAD(&param->fault_param->faults);
+
+done_unlock:
+ mutex_unlock(&param->lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_register_device_fault_handler);
+
+/**
+ * iommu_unregister_device_fault_handler() - Unregister the device fault handler
+ * @dev: the device
+ *
+ * Remove the device fault handler installed with
+ * iommu_register_device_fault_handler().
+ *
+ * Return 0 on success, or an error.
+ */
+int iommu_unregister_device_fault_handler(struct device *dev)
+{
+ struct dev_iommu *param = dev->iommu;
+ int ret = 0;
+
+ if (!param)
+ return -EINVAL;
+
+ mutex_lock(&param->lock);
+
+ if (!param->fault_param)
+ goto unlock;
+
+ /* we cannot unregister handler if there are pending faults */
+ if (!list_empty(&param->fault_param->faults)) {
+ ret = -EBUSY;
+ goto unlock;
+ }
+
+ kfree(param->fault_param);
+ param->fault_param = NULL;
+ put_device(dev);
+unlock:
+ mutex_unlock(&param->lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_unregister_device_fault_handler);
+
+/**
+ * iommu_report_device_fault() - Report fault event to device driver
+ * @dev: the device
+ * @evt: fault event data
+ *
+ * Called by IOMMU drivers when a fault is detected, typically in a threaded IRQ
+ * handler. When this function fails and the fault is recoverable, it is the
+ * caller's responsibility to complete the fault.
+ *
+ * Return 0 on success, or an error.
+ */
+int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
+{
+ struct dev_iommu *param = dev->iommu;
+ struct iommu_fault_event *evt_pending = NULL;
+ struct iommu_fault_param *fparam;
+ int ret = 0;
+
+ if (!param || !evt)
+ return -EINVAL;
+
+ /* we only report device fault if there is a handler registered */
+ mutex_lock(&param->lock);
+ fparam = param->fault_param;
+ if (!fparam || !fparam->handler) {
+ ret = -EINVAL;
+ goto done_unlock;
+ }
+
+ if (evt->fault.type == IOMMU_FAULT_PAGE_REQ &&
+ (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) {
+ evt_pending = kmemdup(evt, sizeof(struct iommu_fault_event),
+ GFP_KERNEL);
+ if (!evt_pending) {
+ ret = -ENOMEM;
+ goto done_unlock;
+ }
+ mutex_lock(&fparam->lock);
+ list_add_tail(&evt_pending->list, &fparam->faults);
+ mutex_unlock(&fparam->lock);
+ }
+
+ ret = fparam->handler(&evt->fault, fparam->data);
+ if (ret && evt_pending) {
+ mutex_lock(&fparam->lock);
+ list_del(&evt_pending->list);
+ mutex_unlock(&fparam->lock);
+ kfree(evt_pending);
+ }
+done_unlock:
+ mutex_unlock(&param->lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_report_device_fault);
+
+int iommu_page_response(struct device *dev,
+ struct iommu_page_response *msg)
+{
+ bool needs_pasid;
+ int ret = -EINVAL;
+ struct iommu_fault_event *evt;
+ struct iommu_fault_page_request *prm;
+ struct dev_iommu *param = dev->iommu;
+ bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID;
+ struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+
+ if (!domain || !domain->ops->page_response)
+ return -ENODEV;
+
+ if (!param || !param->fault_param)
+ return -EINVAL;
+
+ if (msg->version != IOMMU_PAGE_RESP_VERSION_1 ||
+ msg->flags & ~IOMMU_PAGE_RESP_PASID_VALID)
+ return -EINVAL;
+
+ /* Only send response if there is a fault report pending */
+ mutex_lock(&param->fault_param->lock);
+ if (list_empty(&param->fault_param->faults)) {
+ dev_warn_ratelimited(dev, "no pending PRQ, drop response\n");
+ goto done_unlock;
+ }
+ /*
+ * Check if we have a matching page request pending to respond,
+ * otherwise return -EINVAL
+ */
+ list_for_each_entry(evt, &param->fault_param->faults, list) {
+ prm = &evt->fault.prm;
+ if (prm->grpid != msg->grpid)
+ continue;
+
+ /*
+ * If the PASID is required, the corresponding request is
+ * matched using the group ID, the PASID valid bit and the PASID
+ * value. Otherwise only the group ID matches request and
+ * response.
+ */
+ needs_pasid = prm->flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
+ if (needs_pasid && (!has_pasid || msg->pasid != prm->pasid))
+ continue;
+
+ if (!needs_pasid && has_pasid) {
+ /* No big deal, just clear it. */
+ msg->flags &= ~IOMMU_PAGE_RESP_PASID_VALID;
+ msg->pasid = 0;
+ }
+
+ ret = domain->ops->page_response(dev, evt, msg);
+ list_del(&evt->list);
+ kfree(evt);
+ break;
+ }
+
+done_unlock:
+ mutex_unlock(&param->fault_param->lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_page_response);
+
+/**
+ * iommu_group_id - Return ID for a group
+ * @group: the group to ID
+ *
+ * Return the unique ID for the group matching the sysfs group number.
+ */
+int iommu_group_id(struct iommu_group *group)
+{
+ return group->id;
+}
+EXPORT_SYMBOL_GPL(iommu_group_id);
+
+static struct iommu_group *get_pci_alias_group(struct pci_dev *pdev,
+ unsigned long *devfns);
+
+/*
+ * To consider a PCI device isolated, we require ACS to support Source
+ * Validation, Request Redirection, Completer Redirection, and Upstream
+ * Forwarding. This effectively means that devices cannot spoof their
+ * requester ID, requests and completions cannot be redirected, and all
+ * transactions are forwarded upstream, even as it passes through a
+ * bridge where the target device is downstream.
+ */
+#define REQ_ACS_FLAGS (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
+
+/*
+ * For multifunction devices which are not isolated from each other, find
+ * all the other non-isolated functions and look for existing groups. For
+ * each function, we also need to look for aliases to or from other devices
+ * that may already have a group.
+ */
+static struct iommu_group *get_pci_function_alias_group(struct pci_dev *pdev,
+ unsigned long *devfns)
+{
+ struct pci_dev *tmp = NULL;
+ struct iommu_group *group;
+
+ if (!pdev->multifunction || pci_acs_enabled(pdev, REQ_ACS_FLAGS))
+ return NULL;
+
+ for_each_pci_dev(tmp) {
+ if (tmp == pdev || tmp->bus != pdev->bus ||
+ PCI_SLOT(tmp->devfn) != PCI_SLOT(pdev->devfn) ||
+ pci_acs_enabled(tmp, REQ_ACS_FLAGS))
+ continue;
+
+ group = get_pci_alias_group(tmp, devfns);
+ if (group) {
+ pci_dev_put(tmp);
+ return group;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Look for aliases to or from the given device for existing groups. DMA
+ * aliases are only supported on the same bus, therefore the search
+ * space is quite small (especially since we're really only looking at pcie
+ * device, and therefore only expect multiple slots on the root complex or
+ * downstream switch ports). It's conceivable though that a pair of
+ * multifunction devices could have aliases between them that would cause a
+ * loop. To prevent this, we use a bitmap to track where we've been.
+ */
+static struct iommu_group *get_pci_alias_group(struct pci_dev *pdev,
+ unsigned long *devfns)
+{
+ struct pci_dev *tmp = NULL;
+ struct iommu_group *group;
+
+ if (test_and_set_bit(pdev->devfn & 0xff, devfns))
+ return NULL;
+
+ group = iommu_group_get(&pdev->dev);
+ if (group)
+ return group;
+
+ for_each_pci_dev(tmp) {
+ if (tmp == pdev || tmp->bus != pdev->bus)
+ continue;
+
+ /* We alias them or they alias us */
+ if (pci_devs_are_dma_aliases(pdev, tmp)) {
+ group = get_pci_alias_group(tmp, devfns);
+ if (group) {
+ pci_dev_put(tmp);
+ return group;
+ }
+
+ group = get_pci_function_alias_group(tmp, devfns);
+ if (group) {
+ pci_dev_put(tmp);
+ return group;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+struct group_for_pci_data {
+ struct pci_dev *pdev;
+ struct iommu_group *group;
+};
+
+/*
+ * DMA alias iterator callback, return the last seen device. Stop and return
+ * the IOMMU group if we find one along the way.
+ */
+static int get_pci_alias_or_group(struct pci_dev *pdev, u16 alias, void *opaque)
+{
+ struct group_for_pci_data *data = opaque;
+
+ data->pdev = pdev;
+ data->group = iommu_group_get(&pdev->dev);
+
+ return data->group != NULL;
+}
+
+/*
+ * Generic device_group call-back function. It just allocates one
+ * iommu-group per device.
+ */
+struct iommu_group *generic_device_group(struct device *dev)
+{
+ return iommu_group_alloc();
+}
+EXPORT_SYMBOL_GPL(generic_device_group);
+
+/*
+ * Use standard PCI bus topology, isolation features, and DMA alias quirks
+ * to find or create an IOMMU group for a device.
+ */
+struct iommu_group *pci_device_group(struct device *dev)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct group_for_pci_data data;
+ struct pci_bus *bus;
+ struct iommu_group *group = NULL;
+ u64 devfns[4] = { 0 };
+
+ if (WARN_ON(!dev_is_pci(dev)))
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * Find the upstream DMA alias for the device. A device must not
+ * be aliased due to topology in order to have its own IOMMU group.
+ * If we find an alias along the way that already belongs to a
+ * group, use it.
+ */
+ if (pci_for_each_dma_alias(pdev, get_pci_alias_or_group, &data))
+ return data.group;
+
+ pdev = data.pdev;
+
+ /*
+ * Continue upstream from the point of minimum IOMMU granularity
+ * due to aliases to the point where devices are protected from
+ * peer-to-peer DMA by PCI ACS. Again, if we find an existing
+ * group, use it.
+ */
+ for (bus = pdev->bus; !pci_is_root_bus(bus); bus = bus->parent) {
+ if (!bus->self)
+ continue;
+
+ if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
+ break;
+
+ pdev = bus->self;
+
+ group = iommu_group_get(&pdev->dev);
+ if (group)
+ return group;
+ }
+
+ /*
+ * Look for existing groups on device aliases. If we alias another
+ * device or another device aliases us, use the same group.
+ */
+ group = get_pci_alias_group(pdev, (unsigned long *)devfns);
+ if (group)
+ return group;
+
+ /*
+ * Look for existing groups on non-isolated functions on the same
+ * slot and aliases of those funcions, if any. No need to clear
+ * the search bitmap, the tested devfns are still valid.
+ */
+ group = get_pci_function_alias_group(pdev, (unsigned long *)devfns);
+ if (group)
+ return group;
+
+ /* No shared group found, allocate new */
+ return iommu_group_alloc();
+}
+EXPORT_SYMBOL_GPL(pci_device_group);
+
+/* Get the IOMMU group for device on fsl-mc bus */
+struct iommu_group *fsl_mc_device_group(struct device *dev)
+{
+ struct device *cont_dev = fsl_mc_cont_dev(dev);
+ struct iommu_group *group;
+
+ group = iommu_group_get(cont_dev);
+ if (!group)
+ group = iommu_group_alloc();
+ return group;
+}
+EXPORT_SYMBOL_GPL(fsl_mc_device_group);
+
+static int iommu_get_def_domain_type(struct device *dev)
+{
+ const struct iommu_ops *ops = dev->bus->iommu_ops;
+ unsigned int type = 0;
+
+ if (ops->def_domain_type)
+ type = ops->def_domain_type(dev);
+
+ return (type == 0) ? iommu_def_domain_type : type;
+}
+
+static int iommu_group_alloc_default_domain(struct bus_type *bus,
+ struct iommu_group *group,
+ unsigned int type)
+{
+ struct iommu_domain *dom;
+
+ dom = __iommu_domain_alloc(bus, type);
+ if (!dom && type != IOMMU_DOMAIN_DMA) {
+ dom = __iommu_domain_alloc(bus, IOMMU_DOMAIN_DMA);
+ if (dom)
+ pr_warn("Failed to allocate default IOMMU domain of type %u for group %s - Falling back to IOMMU_DOMAIN_DMA",
+ type, group->name);
+ }
+
+ if (!dom)
+ return -ENOMEM;
+
+ group->default_domain = dom;
+ if (!group->domain)
+ group->domain = dom;
+
+ if (!iommu_dma_strict) {
+ int attr = 1;
+ iommu_domain_set_attr(dom,
+ DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
+ &attr);
+ }
+
+ return 0;
+}
+
+static int iommu_alloc_default_domain(struct iommu_group *group,
+ struct device *dev)
+{
+ unsigned int type;
+
+ if (group->default_domain)
+ return 0;
+
+ type = iommu_get_def_domain_type(dev);
+
+ return iommu_group_alloc_default_domain(dev->bus, group, type);
+}
+
+/**
+ * iommu_group_get_for_dev - Find or create the IOMMU group for a device
+ * @dev: target device
+ *
+ * This function is intended to be called by IOMMU drivers and extended to
+ * support common, bus-defined algorithms when determining or creating the
+ * IOMMU group for a device. On success, the caller will hold a reference
+ * to the returned IOMMU group, which will already include the provided
+ * device. The reference should be released with iommu_group_put().
+ */
+static struct iommu_group *iommu_group_get_for_dev(struct device *dev)
+{
+ const struct iommu_ops *ops = dev->bus->iommu_ops;
+ struct iommu_group *group;
+ int ret;
+
+ group = iommu_group_get(dev);
+ if (group)
+ return group;
+
+ if (!ops)
+ return ERR_PTR(-EINVAL);
+
+ group = ops->device_group(dev);
+ if (WARN_ON_ONCE(group == NULL))
+ return ERR_PTR(-EINVAL);
+
+ if (IS_ERR(group))
+ return group;
+
+ ret = iommu_group_add_device(group, dev);
+ if (ret)
+ goto out_put_group;
+
+ return group;
+
+out_put_group:
+ iommu_group_put(group);
+
+ return ERR_PTR(ret);
+}
+
+struct iommu_domain *iommu_group_default_domain(struct iommu_group *group)
+{
+ return group->default_domain;
+}
+
+static int probe_iommu_group(struct device *dev, void *data)
+{
+ struct list_head *group_list = data;
+ struct iommu_group *group;
+ int ret;
+
+ /* Device is probed already if in a group */
+ group = iommu_group_get(dev);
+ if (group) {
+ iommu_group_put(group);
+ return 0;
+ }
+
+ ret = __iommu_probe_device(dev, group_list);
+ if (ret == -ENODEV)
+ ret = 0;
+
+ return ret;
+}
+
+static int remove_iommu_group(struct device *dev, void *data)
+{
+ iommu_release_device(dev);
+
+ return 0;
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ unsigned long group_action = 0;
+ struct device *dev = data;
+ struct iommu_group *group;
+
+ /*
+ * ADD/DEL call into iommu driver ops if provided, which may
+ * result in ADD/DEL notifiers to group->notifier
+ */
+ if (action == BUS_NOTIFY_ADD_DEVICE) {
+ int ret;
+
+ ret = iommu_probe_device(dev);
+ return (ret) ? NOTIFY_DONE : NOTIFY_OK;
+ } else if (action == BUS_NOTIFY_REMOVED_DEVICE) {
+ iommu_release_device(dev);
+ return NOTIFY_OK;
+ }
+
+ /*
+ * Remaining BUS_NOTIFYs get filtered and republished to the
+ * group, if anyone is listening
+ */
+ group = iommu_group_get(dev);
+ if (!group)
+ return 0;
+
+ switch (action) {
+ case BUS_NOTIFY_BIND_DRIVER:
+ group_action = IOMMU_GROUP_NOTIFY_BIND_DRIVER;
+ break;
+ case BUS_NOTIFY_BOUND_DRIVER:
+ group_action = IOMMU_GROUP_NOTIFY_BOUND_DRIVER;
+ break;
+ case BUS_NOTIFY_UNBIND_DRIVER:
+ group_action = IOMMU_GROUP_NOTIFY_UNBIND_DRIVER;
+ break;
+ case BUS_NOTIFY_UNBOUND_DRIVER:
+ group_action = IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER;
+ break;
+ }
+
+ if (group_action)
+ blocking_notifier_call_chain(&group->notifier,
+ group_action, dev);
+
+ iommu_group_put(group);
+ return 0;
+}
+
+struct __group_domain_type {
+ struct device *dev;
+ unsigned int type;
+};
+
+static int probe_get_default_domain_type(struct device *dev, void *data)
+{
+ const struct iommu_ops *ops = dev->bus->iommu_ops;
+ struct __group_domain_type *gtype = data;
+ unsigned int type = 0;
+
+ if (ops->def_domain_type)
+ type = ops->def_domain_type(dev);
+
+ if (type) {
+ if (gtype->type && gtype->type != type) {
+ dev_warn(dev, "Device needs domain type %s, but device %s in the same iommu group requires type %s - using default\n",
+ iommu_domain_type_str(type),
+ dev_name(gtype->dev),
+ iommu_domain_type_str(gtype->type));
+ gtype->type = 0;
+ }
+
+ if (!gtype->dev) {
+ gtype->dev = dev;
+ gtype->type = type;
+ }
+ }
+
+ return 0;
+}
+
+static void probe_alloc_default_domain(struct bus_type *bus,
+ struct iommu_group *group)
+{
+ struct __group_domain_type gtype;
+
+ memset(&gtype, 0, sizeof(gtype));
+
+ /* Ask for default domain requirements of all devices in the group */
+ __iommu_group_for_each_dev(group, &gtype,
+ probe_get_default_domain_type);
+
+ if (!gtype.type)
+ gtype.type = iommu_def_domain_type;
+
+ iommu_group_alloc_default_domain(bus, group, gtype.type);
+
+}
+
+static int iommu_group_do_dma_attach(struct device *dev, void *data)
+{
+ struct iommu_domain *domain = data;
+ int ret = 0;
+
+ if (!iommu_is_attach_deferred(domain, dev))
+ ret = __iommu_attach_device(domain, dev);
+
+ return ret;
+}
+
+static int __iommu_group_dma_attach(struct iommu_group *group)
+{
+ return __iommu_group_for_each_dev(group, group->default_domain,
+ iommu_group_do_dma_attach);
+}
+
+static int iommu_group_do_probe_finalize(struct device *dev, void *data)
+{
+ struct iommu_domain *domain = data;
+
+ if (domain->ops->probe_finalize)
+ domain->ops->probe_finalize(dev);
+
+ return 0;
+}
+
+static void __iommu_group_dma_finalize(struct iommu_group *group)
+{
+ __iommu_group_for_each_dev(group, group->default_domain,
+ iommu_group_do_probe_finalize);
+}
+
+static int iommu_do_create_direct_mappings(struct device *dev, void *data)
+{
+ struct iommu_group *group = data;
+
+ iommu_create_device_direct_mappings(group, dev);
+
+ return 0;
+}
+
+static int iommu_group_create_direct_mappings(struct iommu_group *group)
+{
+ return __iommu_group_for_each_dev(group, group,
+ iommu_do_create_direct_mappings);
+}
+
+int bus_iommu_probe(struct bus_type *bus)
+{
+ struct iommu_group *group, *next;
+ LIST_HEAD(group_list);
+ int ret;
+
+ /*
+ * This code-path does not allocate the default domain when
+ * creating the iommu group, so do it after the groups are
+ * created.
+ */
+ ret = bus_for_each_dev(bus, NULL, &group_list, probe_iommu_group);
+ if (ret)
+ return ret;
+
+ list_for_each_entry_safe(group, next, &group_list, entry) {
+ /* Remove item from the list */
+ list_del_init(&group->entry);
+
+ mutex_lock(&group->mutex);
+
+ /* Try to allocate default domain */
+ probe_alloc_default_domain(bus, group);
+
+ if (!group->default_domain) {
+ mutex_unlock(&group->mutex);
+ continue;
+ }
+
+ iommu_group_create_direct_mappings(group);
+
+ ret = __iommu_group_dma_attach(group);
+
+ mutex_unlock(&group->mutex);
+
+ if (ret)
+ break;
+
+ __iommu_group_dma_finalize(group);
+ }
+
+ return ret;
+}
+
+static int iommu_bus_init(struct bus_type *bus, const struct iommu_ops *ops)
+{
+ struct notifier_block *nb;
+ int err;
+
+ nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL);
+ if (!nb)
+ return -ENOMEM;
+
+ nb->notifier_call = iommu_bus_notifier;
+
+ err = bus_register_notifier(bus, nb);
+ if (err)
+ goto out_free;
+
+ err = bus_iommu_probe(bus);
+ if (err)
+ goto out_err;
+
+
+ return 0;
+
+out_err:
+ /* Clean up */
+ bus_for_each_dev(bus, NULL, NULL, remove_iommu_group);
+ bus_unregister_notifier(bus, nb);
+
+out_free:
+ kfree(nb);
+
+ return err;
+}
+
+/**
+ * bus_set_iommu - set iommu-callbacks for the bus
+ * @bus: bus.
+ * @ops: the callbacks provided by the iommu-driver
+ *
+ * This function is called by an iommu driver to set the iommu methods
+ * used for a particular bus. Drivers for devices on that bus can use
+ * the iommu-api after these ops are registered.
+ * This special function is needed because IOMMUs are usually devices on
+ * the bus itself, so the iommu drivers are not initialized when the bus
+ * is set up. With this function the iommu-driver can set the iommu-ops
+ * afterwards.
+ */
+int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops)
+{
+ int err;
+
+ if (ops == NULL) {
+ bus->iommu_ops = NULL;
+ return 0;
+ }
+
+ if (bus->iommu_ops != NULL)
+ return -EBUSY;
+
+ bus->iommu_ops = ops;
+
+ /* Do IOMMU specific setup for this bus-type */
+ err = iommu_bus_init(bus, ops);
+ if (err)
+ bus->iommu_ops = NULL;
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(bus_set_iommu);
+
+bool iommu_present(struct bus_type *bus)
+{
+ return bus->iommu_ops != NULL;
+}
+EXPORT_SYMBOL_GPL(iommu_present);
+
+bool iommu_capable(struct bus_type *bus, enum iommu_cap cap)
+{
+ if (!bus->iommu_ops || !bus->iommu_ops->capable)
+ return false;
+
+ return bus->iommu_ops->capable(cap);
+}
+EXPORT_SYMBOL_GPL(iommu_capable);
+
+/**
+ * iommu_set_fault_handler() - set a fault handler for an iommu domain
+ * @domain: iommu domain
+ * @handler: fault handler
+ * @token: user data, will be passed back to the fault handler
+ *
+ * This function should be used by IOMMU users which want to be notified
+ * whenever an IOMMU fault happens.
+ *
+ * The fault handler itself should return 0 on success, and an appropriate
+ * error code otherwise.
+ */
+void iommu_set_fault_handler(struct iommu_domain *domain,
+ iommu_fault_handler_t handler,
+ void *token)
+{
+ BUG_ON(!domain);
+
+ domain->handler = handler;
+ domain->handler_token = token;
+}
+EXPORT_SYMBOL_GPL(iommu_set_fault_handler);
+
+static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
+ unsigned type)
+{
+ struct iommu_domain *domain;
+
+ if (bus == NULL || bus->iommu_ops == NULL)
+ return NULL;
+
+ domain = bus->iommu_ops->domain_alloc(type);
+ if (!domain)
+ return NULL;
+
+ domain->ops = bus->iommu_ops;
+ domain->type = type;
+ /* Assume all sizes by default; the driver may override this later */
+ domain->pgsize_bitmap = bus->iommu_ops->pgsize_bitmap;
+
+ return domain;
+}
+
+struct iommu_domain *iommu_domain_alloc(struct bus_type *bus)
+{
+ return __iommu_domain_alloc(bus, IOMMU_DOMAIN_UNMANAGED);
+}
+EXPORT_SYMBOL_GPL(iommu_domain_alloc);
+
+void iommu_domain_free(struct iommu_domain *domain)
+{
+ domain->ops->domain_free(domain);
+}
+EXPORT_SYMBOL_GPL(iommu_domain_free);
+
+static int __iommu_attach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ int ret;
+
+ if (unlikely(domain->ops->attach_dev == NULL))
+ return -ENODEV;
+
+ ret = domain->ops->attach_dev(domain, dev);
+ if (!ret)
+ trace_attach_device_to_domain(dev);
+ return ret;
+}
+
+int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
+{
+ struct iommu_group *group;
+ int ret;
+
+ group = iommu_group_get(dev);
+ if (!group)
+ return -ENODEV;
+
+ /*
+ * Lock the group to make sure the device-count doesn't
+ * change while we are attaching
+ */
+ mutex_lock(&group->mutex);
+ ret = -EINVAL;
+ if (iommu_group_device_count(group) != 1)
+ goto out_unlock;
+
+ ret = __iommu_attach_group(domain, group);
+
+out_unlock:
+ mutex_unlock(&group->mutex);
+ iommu_group_put(group);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_attach_device);
+
+/*
+ * Check flags and other user provided data for valid combinations. We also
+ * make sure no reserved fields or unused flags are set. This is to ensure
+ * not breaking userspace in the future when these fields or flags are used.
+ */
+static int iommu_check_cache_invl_data(struct iommu_cache_invalidate_info *info)
+{
+ u32 mask;
+ int i;
+
+ if (info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
+ return -EINVAL;
+
+ mask = (1 << IOMMU_CACHE_INV_TYPE_NR) - 1;
+ if (info->cache & ~mask)
+ return -EINVAL;
+
+ if (info->granularity >= IOMMU_INV_GRANU_NR)
+ return -EINVAL;
+
+ switch (info->granularity) {
+ case IOMMU_INV_GRANU_ADDR:
+ if (info->cache & IOMMU_CACHE_INV_TYPE_PASID)
+ return -EINVAL;
+
+ mask = IOMMU_INV_ADDR_FLAGS_PASID |
+ IOMMU_INV_ADDR_FLAGS_ARCHID |
+ IOMMU_INV_ADDR_FLAGS_LEAF;
+
+ if (info->granu.addr_info.flags & ~mask)
+ return -EINVAL;
+ break;
+ case IOMMU_INV_GRANU_PASID:
+ mask = IOMMU_INV_PASID_FLAGS_PASID |
+ IOMMU_INV_PASID_FLAGS_ARCHID;
+ if (info->granu.pasid_info.flags & ~mask)
+ return -EINVAL;
+
+ break;
+ case IOMMU_INV_GRANU_DOMAIN:
+ if (info->cache & IOMMU_CACHE_INV_TYPE_DEV_IOTLB)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* Check reserved padding fields */
+ for (i = 0; i < sizeof(info->padding); i++) {
+ if (info->padding[i])
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int iommu_uapi_cache_invalidate(struct iommu_domain *domain, struct device *dev,
+ void __user *uinfo)
+{
+ struct iommu_cache_invalidate_info inv_info = { 0 };
+ u32 minsz;
+ int ret;
+
+ if (unlikely(!domain->ops->cache_invalidate))
+ return -ENODEV;
+
+ /*
+ * No new spaces can be added before the variable sized union, the
+ * minimum size is the offset to the union.
+ */
+ minsz = offsetof(struct iommu_cache_invalidate_info, granu);
+
+ /* Copy minsz from user to get flags and argsz */
+ if (copy_from_user(&inv_info, uinfo, minsz))
+ return -EFAULT;
+
+ /* Fields before the variable size union are mandatory */
+ if (inv_info.argsz < minsz)
+ return -EINVAL;
+
+ /* PASID and address granu require additional info beyond minsz */
+ if (inv_info.granularity == IOMMU_INV_GRANU_PASID &&
+ inv_info.argsz < offsetofend(struct iommu_cache_invalidate_info, granu.pasid_info))
+ return -EINVAL;
+
+ if (inv_info.granularity == IOMMU_INV_GRANU_ADDR &&
+ inv_info.argsz < offsetofend(struct iommu_cache_invalidate_info, granu.addr_info))
+ return -EINVAL;
+
+ /*
+ * User might be using a newer UAPI header which has a larger data
+ * size, we shall support the existing flags within the current
+ * size. Copy the remaining user data _after_ minsz but not more
+ * than the current kernel supported size.
+ */
+ if (copy_from_user((void *)&inv_info + minsz, uinfo + minsz,
+ min_t(u32, inv_info.argsz, sizeof(inv_info)) - minsz))
+ return -EFAULT;
+
+ /* Now the argsz is validated, check the content */
+ ret = iommu_check_cache_invl_data(&inv_info);
+ if (ret)
+ return ret;
+
+ return domain->ops->cache_invalidate(domain, dev, &inv_info);
+}
+EXPORT_SYMBOL_GPL(iommu_uapi_cache_invalidate);
+
+static int iommu_check_bind_data(struct iommu_gpasid_bind_data *data)
+{
+ u64 mask;
+ int i;
+
+ if (data->version != IOMMU_GPASID_BIND_VERSION_1)
+ return -EINVAL;
+
+ /* Check the range of supported formats */
+ if (data->format >= IOMMU_PASID_FORMAT_LAST)
+ return -EINVAL;
+
+ /* Check all flags */
+ mask = IOMMU_SVA_GPASID_VAL;
+ if (data->flags & ~mask)
+ return -EINVAL;
+
+ /* Check reserved padding fields */
+ for (i = 0; i < sizeof(data->padding); i++) {
+ if (data->padding[i])
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int iommu_sva_prepare_bind_data(void __user *udata,
+ struct iommu_gpasid_bind_data *data)
+{
+ u32 minsz;
+
+ /*
+ * No new spaces can be added before the variable sized union, the
+ * minimum size is the offset to the union.
+ */
+ minsz = offsetof(struct iommu_gpasid_bind_data, vendor);
+
+ /* Copy minsz from user to get flags and argsz */
+ if (copy_from_user(data, udata, minsz))
+ return -EFAULT;
+
+ /* Fields before the variable size union are mandatory */
+ if (data->argsz < minsz)
+ return -EINVAL;
+ /*
+ * User might be using a newer UAPI header, we shall let IOMMU vendor
+ * driver decide on what size it needs. Since the guest PASID bind data
+ * can be vendor specific, larger argsz could be the result of extension
+ * for one vendor but it should not affect another vendor.
+ * Copy the remaining user data _after_ minsz
+ */
+ if (copy_from_user((void *)data + minsz, udata + minsz,
+ min_t(u32, data->argsz, sizeof(*data)) - minsz))
+ return -EFAULT;
+
+ return iommu_check_bind_data(data);
+}
+
+int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, struct device *dev,
+ void __user *udata)
+{
+ struct iommu_gpasid_bind_data data = { 0 };
+ int ret;
+
+ if (unlikely(!domain->ops->sva_bind_gpasid))
+ return -ENODEV;
+
+ ret = iommu_sva_prepare_bind_data(udata, &data);
+ if (ret)
+ return ret;
+
+ return domain->ops->sva_bind_gpasid(domain, dev, &data);
+}
+EXPORT_SYMBOL_GPL(iommu_uapi_sva_bind_gpasid);
+
+int iommu_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev,
+ ioasid_t pasid)
+{
+ if (unlikely(!domain->ops->sva_unbind_gpasid))
+ return -ENODEV;
+
+ return domain->ops->sva_unbind_gpasid(dev, pasid);
+}
+EXPORT_SYMBOL_GPL(iommu_sva_unbind_gpasid);
+
+int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev,
+ void __user *udata)
+{
+ struct iommu_gpasid_bind_data data = { 0 };
+ int ret;
+
+ if (unlikely(!domain->ops->sva_bind_gpasid))
+ return -ENODEV;
+
+ ret = iommu_sva_prepare_bind_data(udata, &data);
+ if (ret)
+ return ret;
+
+ return iommu_sva_unbind_gpasid(domain, dev, data.hpasid);
+}
+EXPORT_SYMBOL_GPL(iommu_uapi_sva_unbind_gpasid);
+
+static void __iommu_detach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ if (iommu_is_attach_deferred(domain, dev))
+ return;
+
+ if (unlikely(domain->ops->detach_dev == NULL))
+ return;
+
+ domain->ops->detach_dev(domain, dev);
+ trace_detach_device_from_domain(dev);
+}
+
+void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
+{
+ struct iommu_group *group;
+
+ group = iommu_group_get(dev);
+ if (!group)
+ return;
+
+ mutex_lock(&group->mutex);
+ if (iommu_group_device_count(group) != 1) {
+ WARN_ON(1);
+ goto out_unlock;
+ }
+
+ __iommu_detach_group(domain, group);
+
+out_unlock:
+ mutex_unlock(&group->mutex);
+ iommu_group_put(group);
+}
+EXPORT_SYMBOL_GPL(iommu_detach_device);
+
+struct iommu_domain *iommu_get_domain_for_dev(struct device *dev)
+{
+ struct iommu_domain *domain;
+ struct iommu_group *group;
+
+ group = iommu_group_get(dev);
+ if (!group)
+ return NULL;
+
+ domain = group->domain;
+
+ iommu_group_put(group);
+
+ return domain;
+}
+EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev);
+
+/*
+ * For IOMMU_DOMAIN_DMA implementations which already provide their own
+ * guarantees that the group and its default domain are valid and correct.
+ */
+struct iommu_domain *iommu_get_dma_domain(struct device *dev)
+{
+ return dev->iommu_group->default_domain;
+}
+
+/*
+ * IOMMU groups are really the natural working unit of the IOMMU, but
+ * the IOMMU API works on domains and devices. Bridge that gap by
+ * iterating over the devices in a group. Ideally we'd have a single
+ * device which represents the requestor ID of the group, but we also
+ * allow IOMMU drivers to create policy defined minimum sets, where
+ * the physical hardware may be able to distiguish members, but we
+ * wish to group them at a higher level (ex. untrusted multi-function
+ * PCI devices). Thus we attach each device.
+ */
+static int iommu_group_do_attach_device(struct device *dev, void *data)
+{
+ struct iommu_domain *domain = data;
+
+ return __iommu_attach_device(domain, dev);
+}
+
+static int __iommu_attach_group(struct iommu_domain *domain,
+ struct iommu_group *group)
+{
+ int ret;
+
+ if (group->default_domain && group->domain != group->default_domain)
+ return -EBUSY;
+
+ ret = __iommu_group_for_each_dev(group, domain,
+ iommu_group_do_attach_device);
+ if (ret == 0)
+ group->domain = domain;
+
+ return ret;
+}
+
+int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group)
+{
+ int ret;
+
+ mutex_lock(&group->mutex);
+ ret = __iommu_attach_group(domain, group);
+ mutex_unlock(&group->mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_attach_group);
+
+static int iommu_group_do_detach_device(struct device *dev, void *data)
+{
+ struct iommu_domain *domain = data;
+
+ __iommu_detach_device(domain, dev);
+
+ return 0;
+}
+
+static void __iommu_detach_group(struct iommu_domain *domain,
+ struct iommu_group *group)
+{
+ int ret;
+
+ if (!group->default_domain) {
+ __iommu_group_for_each_dev(group, domain,
+ iommu_group_do_detach_device);
+ group->domain = NULL;
+ return;
+ }
+
+ if (group->domain == group->default_domain)
+ return;
+
+ /* Detach by re-attaching to the default domain */
+ ret = __iommu_group_for_each_dev(group, group->default_domain,
+ iommu_group_do_attach_device);
+ if (ret != 0)
+ WARN_ON(1);
+ else
+ group->domain = group->default_domain;
+}
+
+void iommu_detach_group(struct iommu_domain *domain, struct iommu_group *group)
+{
+ mutex_lock(&group->mutex);
+ __iommu_detach_group(domain, group);
+ mutex_unlock(&group->mutex);
+}
+EXPORT_SYMBOL_GPL(iommu_detach_group);
+
+phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
+{
+ if (unlikely(domain->ops->iova_to_phys == NULL))
+ return 0;
+
+ return domain->ops->iova_to_phys(domain, iova);
+}
+EXPORT_SYMBOL_GPL(iommu_iova_to_phys);
+
+static size_t iommu_pgsize(struct iommu_domain *domain,
+ unsigned long addr_merge, size_t size)
+{
+ unsigned int pgsize_idx;
+ size_t pgsize;
+
+ /* Max page size that still fits into 'size' */
+ pgsize_idx = __fls(size);
+
+ /* need to consider alignment requirements ? */
+ if (likely(addr_merge)) {
+ /* Max page size allowed by address */
+ unsigned int align_pgsize_idx = __ffs(addr_merge);
+ pgsize_idx = min(pgsize_idx, align_pgsize_idx);
+ }
+
+ /* build a mask of acceptable page sizes */
+ pgsize = (1UL << (pgsize_idx + 1)) - 1;
+
+ /* throw away page sizes not supported by the hardware */
+ pgsize &= domain->pgsize_bitmap;
+
+ /* make sure we're still sane */
+ BUG_ON(!pgsize);
+
+ /* pick the biggest page */
+ pgsize_idx = __fls(pgsize);
+ pgsize = 1UL << pgsize_idx;
+
+ return pgsize;
+}
+
+static int __iommu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+ const struct iommu_ops *ops = domain->ops;
+ unsigned long orig_iova = iova;
+ unsigned int min_pagesz;
+ size_t orig_size = size;
+ phys_addr_t orig_paddr = paddr;
+ int ret = 0;
+
+ if (unlikely(ops->map == NULL ||
+ domain->pgsize_bitmap == 0UL))
+ return -ENODEV;
+
+ if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING)))
+ return -EINVAL;
+
+ /* find out the minimum page size supported */
+ min_pagesz = 1 << __ffs(domain->pgsize_bitmap);
+
+ /*
+ * both the virtual address and the physical one, as well as
+ * the size of the mapping, must be aligned (at least) to the
+ * size of the smallest page supported by the hardware
+ */
+ if (!IS_ALIGNED(iova | paddr | size, min_pagesz)) {
+ pr_err("unaligned: iova 0x%lx pa %pa size 0x%zx min_pagesz 0x%x\n",
+ iova, &paddr, size, min_pagesz);
+ return -EINVAL;
+ }
+
+ pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size);
+
+ while (size) {
+ size_t pgsize = iommu_pgsize(domain, iova | paddr, size);
+
+ pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx\n",
+ iova, &paddr, pgsize);
+ ret = ops->map(domain, iova, paddr, pgsize, prot, gfp);
+
+ if (ret)
+ break;
+
+ iova += pgsize;
+ paddr += pgsize;
+ size -= pgsize;
+ }
+
+ /* unroll mapping in case something went wrong */
+ if (ret)
+ iommu_unmap(domain, orig_iova, orig_size - size);
+ else
+ trace_map(orig_iova, orig_paddr, orig_size);
+
+ return ret;
+}
+
+static int _iommu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+ const struct iommu_ops *ops = domain->ops;
+ int ret;
+
+ ret = __iommu_map(domain, iova, paddr, size, prot, gfp);
+ if (ret == 0 && ops->iotlb_sync_map)
+ ops->iotlb_sync_map(domain);
+
+ return ret;
+}
+
+int iommu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot)
+{
+ might_sleep();
+ return _iommu_map(domain, iova, paddr, size, prot, GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(iommu_map);
+
+int iommu_map_atomic(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot)
+{
+ return _iommu_map(domain, iova, paddr, size, prot, GFP_ATOMIC);
+}
+EXPORT_SYMBOL_GPL(iommu_map_atomic);
+
+static size_t __iommu_unmap(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ struct iommu_iotlb_gather *iotlb_gather)
+{
+ const struct iommu_ops *ops = domain->ops;
+ size_t unmapped_page, unmapped = 0;
+ unsigned long orig_iova = iova;
+ unsigned int min_pagesz;
+
+ if (unlikely(ops->unmap == NULL ||
+ domain->pgsize_bitmap == 0UL))
+ return 0;
+
+ if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING)))
+ return 0;
+
+ /* find out the minimum page size supported */
+ min_pagesz = 1 << __ffs(domain->pgsize_bitmap);
+
+ /*
+ * The virtual address, as well as the size of the mapping, must be
+ * aligned (at least) to the size of the smallest page supported
+ * by the hardware
+ */
+ if (!IS_ALIGNED(iova | size, min_pagesz)) {
+ pr_err("unaligned: iova 0x%lx size 0x%zx min_pagesz 0x%x\n",
+ iova, size, min_pagesz);
+ return 0;
+ }
+
+ pr_debug("unmap this: iova 0x%lx size 0x%zx\n", iova, size);
+
+ /*
+ * Keep iterating until we either unmap 'size' bytes (or more)
+ * or we hit an area that isn't mapped.
+ */
+ while (unmapped < size) {
+ size_t pgsize = iommu_pgsize(domain, iova, size - unmapped);
+
+ unmapped_page = ops->unmap(domain, iova, pgsize, iotlb_gather);
+ if (!unmapped_page)
+ break;
+
+ pr_debug("unmapped: iova 0x%lx size 0x%zx\n",
+ iova, unmapped_page);
+
+ iova += unmapped_page;
+ unmapped += unmapped_page;
+ }
+
+ trace_unmap(orig_iova, size, unmapped);
+ return unmapped;
+}
+
+size_t iommu_unmap(struct iommu_domain *domain,
+ unsigned long iova, size_t size)
+{
+ struct iommu_iotlb_gather iotlb_gather;
+ size_t ret;
+
+ iommu_iotlb_gather_init(&iotlb_gather);
+ ret = __iommu_unmap(domain, iova, size, &iotlb_gather);
+ iommu_iotlb_sync(domain, &iotlb_gather);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_unmap);
+
+size_t iommu_unmap_fast(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ struct iommu_iotlb_gather *iotlb_gather)
+{
+ return __iommu_unmap(domain, iova, size, iotlb_gather);
+}
+EXPORT_SYMBOL_GPL(iommu_unmap_fast);
+
+static size_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
+ struct scatterlist *sg, unsigned int nents, int prot,
+ gfp_t gfp)
+{
+ const struct iommu_ops *ops = domain->ops;
+ size_t len = 0, mapped = 0;
+ phys_addr_t start;
+ unsigned int i = 0;
+ int ret;
+
+ while (i <= nents) {
+ phys_addr_t s_phys = sg_phys(sg);
+
+ if (len && s_phys != start + len) {
+ ret = __iommu_map(domain, iova + mapped, start,
+ len, prot, gfp);
+
+ if (ret)
+ goto out_err;
+
+ mapped += len;
+ len = 0;
+ }
+
+ if (len) {
+ len += sg->length;
+ } else {
+ len = sg->length;
+ start = s_phys;
+ }
+
+ if (++i < nents)
+ sg = sg_next(sg);
+ }
+
+ if (ops->iotlb_sync_map)
+ ops->iotlb_sync_map(domain);
+ return mapped;
+
+out_err:
+ /* undo mappings already done */
+ iommu_unmap(domain, iova, mapped);
+
+ return 0;
+
+}
+
+size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
+ struct scatterlist *sg, unsigned int nents, int prot)
+{
+ might_sleep();
+ return __iommu_map_sg(domain, iova, sg, nents, prot, GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(iommu_map_sg);
+
+size_t iommu_map_sg_atomic(struct iommu_domain *domain, unsigned long iova,
+ struct scatterlist *sg, unsigned int nents, int prot)
+{
+ return __iommu_map_sg(domain, iova, sg, nents, prot, GFP_ATOMIC);
+}
+EXPORT_SYMBOL_GPL(iommu_map_sg_atomic);
+
+int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr,
+ phys_addr_t paddr, u64 size, int prot)
+{
+ if (unlikely(domain->ops->domain_window_enable == NULL))
+ return -ENODEV;
+
+ return domain->ops->domain_window_enable(domain, wnd_nr, paddr, size,
+ prot);
+}
+EXPORT_SYMBOL_GPL(iommu_domain_window_enable);
+
+void iommu_domain_window_disable(struct iommu_domain *domain, u32 wnd_nr)
+{
+ if (unlikely(domain->ops->domain_window_disable == NULL))
+ return;
+
+ return domain->ops->domain_window_disable(domain, wnd_nr);
+}
+EXPORT_SYMBOL_GPL(iommu_domain_window_disable);
+
+/**
+ * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework
+ * @domain: the iommu domain where the fault has happened
+ * @dev: the device where the fault has happened
+ * @iova: the faulting address
+ * @flags: mmu fault flags (e.g. IOMMU_FAULT_READ/IOMMU_FAULT_WRITE/...)
+ *
+ * This function should be called by the low-level IOMMU implementations
+ * whenever IOMMU faults happen, to allow high-level users, that are
+ * interested in such events, to know about them.
+ *
+ * This event may be useful for several possible use cases:
+ * - mere logging of the event
+ * - dynamic TLB/PTE loading
+ * - if restarting of the faulting device is required
+ *
+ * Returns 0 on success and an appropriate error code otherwise (if dynamic
+ * PTE/TLB loading will one day be supported, implementations will be able
+ * to tell whether it succeeded or not according to this return value).
+ *
+ * Specifically, -ENOSYS is returned if a fault handler isn't installed
+ * (though fault handlers can also return -ENOSYS, in case they want to
+ * elicit the default behavior of the IOMMU drivers).
+ */
+int report_iommu_fault(struct iommu_domain *domain, struct device *dev,
+ unsigned long iova, int flags)
+{
+ int ret = -ENOSYS;
+
+ /*
+ * if upper layers showed interest and installed a fault handler,
+ * invoke it.
+ */
+ if (domain->handler)
+ ret = domain->handler(domain, dev, iova, flags,
+ domain->handler_token);
+
+ trace_io_page_fault(dev, iova, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(report_iommu_fault);
+
+static int __init iommu_init(void)
+{
+ iommu_group_kset = kset_create_and_add("iommu_groups",
+ NULL, kernel_kobj);
+ BUG_ON(!iommu_group_kset);
+
+ iommu_debugfs_setup();
+
+ return 0;
+}
+core_initcall(iommu_init);
+
+int iommu_domain_get_attr(struct iommu_domain *domain,
+ enum iommu_attr attr, void *data)
+{
+ struct iommu_domain_geometry *geometry;
+ bool *paging;
+ int ret = 0;
+
+ switch (attr) {
+ case DOMAIN_ATTR_GEOMETRY:
+ geometry = data;
+ *geometry = domain->geometry;
+
+ break;
+ case DOMAIN_ATTR_PAGING:
+ paging = data;
+ *paging = (domain->pgsize_bitmap != 0UL);
+ break;
+ default:
+ if (!domain->ops->domain_get_attr)
+ return -EINVAL;
+
+ ret = domain->ops->domain_get_attr(domain, attr, data);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_domain_get_attr);
+
+int iommu_domain_set_attr(struct iommu_domain *domain,
+ enum iommu_attr attr, void *data)
+{
+ int ret = 0;
+
+ switch (attr) {
+ default:
+ if (domain->ops->domain_set_attr == NULL)
+ return -EINVAL;
+
+ ret = domain->ops->domain_set_attr(domain, attr, data);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_domain_set_attr);
+
+void iommu_get_resv_regions(struct device *dev, struct list_head *list)
+{
+ const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+ if (ops && ops->get_resv_regions)
+ ops->get_resv_regions(dev, list);
+}
+
+void iommu_put_resv_regions(struct device *dev, struct list_head *list)
+{
+ const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+ if (ops && ops->put_resv_regions)
+ ops->put_resv_regions(dev, list);
+}
+
+/**
+ * generic_iommu_put_resv_regions - Reserved region driver helper
+ * @dev: device for which to free reserved regions
+ * @list: reserved region list for device
+ *
+ * IOMMU drivers can use this to implement their .put_resv_regions() callback
+ * for simple reservations. Memory allocated for each reserved region will be
+ * freed. If an IOMMU driver allocates additional resources per region, it is
+ * going to have to implement a custom callback.
+ */
+void generic_iommu_put_resv_regions(struct device *dev, struct list_head *list)
+{
+ struct iommu_resv_region *entry, *next;
+
+ list_for_each_entry_safe(entry, next, list, list)
+ kfree(entry);
+}
+EXPORT_SYMBOL(generic_iommu_put_resv_regions);
+
+struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start,
+ size_t length, int prot,
+ enum iommu_resv_type type)
+{
+ struct iommu_resv_region *region;
+
+ region = kzalloc(sizeof(*region), GFP_KERNEL);
+ if (!region)
+ return NULL;
+
+ INIT_LIST_HEAD(&region->list);
+ region->start = start;
+ region->length = length;
+ region->prot = prot;
+ region->type = type;
+ return region;
+}
+EXPORT_SYMBOL_GPL(iommu_alloc_resv_region);
+
+void iommu_set_default_passthrough(bool cmd_line)
+{
+ if (cmd_line)
+ iommu_set_cmd_line_dma_api();
+
+ iommu_def_domain_type = IOMMU_DOMAIN_IDENTITY;
+}
+
+void iommu_set_default_translated(bool cmd_line)
+{
+ if (cmd_line)
+ iommu_set_cmd_line_dma_api();
+
+ iommu_def_domain_type = IOMMU_DOMAIN_DMA;
+}
+
+bool iommu_default_passthrough(void)
+{
+ return iommu_def_domain_type == IOMMU_DOMAIN_IDENTITY;
+}
+EXPORT_SYMBOL_GPL(iommu_default_passthrough);
+
+const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode)
+{
+ const struct iommu_ops *ops = NULL;
+ struct iommu_device *iommu;
+
+ spin_lock(&iommu_device_lock);
+ list_for_each_entry(iommu, &iommu_device_list, list)
+ if (iommu->fwnode == fwnode) {
+ ops = iommu->ops;
+ break;
+ }
+ spin_unlock(&iommu_device_lock);
+ return ops;
+}
+
+int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
+ const struct iommu_ops *ops)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+ if (fwspec)
+ return ops == fwspec->ops ? 0 : -EINVAL;
+
+ if (!dev_iommu_get(dev))
+ return -ENOMEM;
+
+ /* Preallocate for the overwhelmingly common case of 1 ID */
+ fwspec = kzalloc(struct_size(fwspec, ids, 1), GFP_KERNEL);
+ if (!fwspec)
+ return -ENOMEM;
+
+ of_node_get(to_of_node(iommu_fwnode));
+ fwspec->iommu_fwnode = iommu_fwnode;
+ fwspec->ops = ops;
+ dev_iommu_fwspec_set(dev, fwspec);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_fwspec_init);
+
+void iommu_fwspec_free(struct device *dev)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+ if (fwspec) {
+ fwnode_handle_put(fwspec->iommu_fwnode);
+ kfree(fwspec);
+ dev_iommu_fwspec_set(dev, NULL);
+ }
+}
+EXPORT_SYMBOL_GPL(iommu_fwspec_free);
+
+int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ int i, new_num;
+
+ if (!fwspec)
+ return -EINVAL;
+
+ new_num = fwspec->num_ids + num_ids;
+ if (new_num > 1) {
+ fwspec = krealloc(fwspec, struct_size(fwspec, ids, new_num),
+ GFP_KERNEL);
+ if (!fwspec)
+ return -ENOMEM;
+
+ dev_iommu_fwspec_set(dev, fwspec);
+ }
+
+ for (i = 0; i < num_ids; i++)
+ fwspec->ids[fwspec->num_ids + i] = ids[i];
+
+ fwspec->num_ids = new_num;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids);
+
+/*
+ * Per device IOMMU features.
+ */
+bool iommu_dev_has_feature(struct device *dev, enum iommu_dev_features feat)
+{
+ const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+ if (ops && ops->dev_has_feat)
+ return ops->dev_has_feat(dev, feat);
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(iommu_dev_has_feature);
+
+int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat)
+{
+ if (dev->iommu && dev->iommu->iommu_dev) {
+ const struct iommu_ops *ops = dev->iommu->iommu_dev->ops;
+
+ if (ops->dev_enable_feat)
+ return ops->dev_enable_feat(dev, feat);
+ }
+
+ return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(iommu_dev_enable_feature);
+
+/*
+ * The device drivers should do the necessary cleanups before calling this.
+ * For example, before disabling the aux-domain feature, the device driver
+ * should detach all aux-domains. Otherwise, this will return -EBUSY.
+ */
+int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat)
+{
+ if (dev->iommu && dev->iommu->iommu_dev) {
+ const struct iommu_ops *ops = dev->iommu->iommu_dev->ops;
+
+ if (ops->dev_disable_feat)
+ return ops->dev_disable_feat(dev, feat);
+ }
+
+ return -EBUSY;
+}
+EXPORT_SYMBOL_GPL(iommu_dev_disable_feature);
+
+bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features feat)
+{
+ if (dev->iommu && dev->iommu->iommu_dev) {
+ const struct iommu_ops *ops = dev->iommu->iommu_dev->ops;
+
+ if (ops->dev_feat_enabled)
+ return ops->dev_feat_enabled(dev, feat);
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(iommu_dev_feature_enabled);
+
+/*
+ * Aux-domain specific attach/detach.
+ *
+ * Only works if iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX) returns
+ * true. Also, as long as domains are attached to a device through this
+ * interface, any tries to call iommu_attach_device() should fail
+ * (iommu_detach_device() can't fail, so we fail when trying to re-attach).
+ * This should make us safe against a device being attached to a guest as a
+ * whole while there are still pasid users on it (aux and sva).
+ */
+int iommu_aux_attach_device(struct iommu_domain *domain, struct device *dev)
+{
+ int ret = -ENODEV;
+
+ if (domain->ops->aux_attach_dev)
+ ret = domain->ops->aux_attach_dev(domain, dev);
+
+ if (!ret)
+ trace_attach_device_to_domain(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_aux_attach_device);
+
+void iommu_aux_detach_device(struct iommu_domain *domain, struct device *dev)
+{
+ if (domain->ops->aux_detach_dev) {
+ domain->ops->aux_detach_dev(domain, dev);
+ trace_detach_device_from_domain(dev);
+ }
+}
+EXPORT_SYMBOL_GPL(iommu_aux_detach_device);
+
+int iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
+{
+ int ret = -ENODEV;
+
+ if (domain->ops->aux_get_pasid)
+ ret = domain->ops->aux_get_pasid(domain, dev);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_aux_get_pasid);
+
+/**
+ * iommu_sva_bind_device() - Bind a process address space to a device
+ * @dev: the device
+ * @mm: the mm to bind, caller must hold a reference to it
+ *
+ * Create a bond between device and address space, allowing the device to access
+ * the mm using the returned PASID. If a bond already exists between @device and
+ * @mm, it is returned and an additional reference is taken. Caller must call
+ * iommu_sva_unbind_device() to release each reference.
+ *
+ * iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA) must be called first, to
+ * initialize the required SVA features.
+ *
+ * On error, returns an ERR_PTR value.
+ */
+struct iommu_sva *
+iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, void *drvdata)
+{
+ struct iommu_group *group;
+ struct iommu_sva *handle = ERR_PTR(-EINVAL);
+ const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+ if (!ops || !ops->sva_bind)
+ return ERR_PTR(-ENODEV);
+
+ group = iommu_group_get(dev);
+ if (!group)
+ return ERR_PTR(-ENODEV);
+
+ /* Ensure device count and domain don't change while we're binding */
+ mutex_lock(&group->mutex);
+
+ /*
+ * To keep things simple, SVA currently doesn't support IOMMU groups
+ * with more than one device. Existing SVA-capable systems are not
+ * affected by the problems that required IOMMU groups (lack of ACS
+ * isolation, device ID aliasing and other hardware issues).
+ */
+ if (iommu_group_device_count(group) != 1)
+ goto out_unlock;
+
+ handle = ops->sva_bind(dev, mm, drvdata);
+
+out_unlock:
+ mutex_unlock(&group->mutex);
+ iommu_group_put(group);
+
+ return handle;
+}
+EXPORT_SYMBOL_GPL(iommu_sva_bind_device);
+
+/**
+ * iommu_sva_unbind_device() - Remove a bond created with iommu_sva_bind_device
+ * @handle: the handle returned by iommu_sva_bind_device()
+ *
+ * Put reference to a bond between device and address space. The device should
+ * not be issuing any more transaction for this PASID. All outstanding page
+ * requests for this PASID must have been flushed to the IOMMU.
+ *
+ * Returns 0 on success, or an error value
+ */
+void iommu_sva_unbind_device(struct iommu_sva *handle)
+{
+ struct iommu_group *group;
+ struct device *dev = handle->dev;
+ const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+ if (!ops || !ops->sva_unbind)
+ return;
+
+ group = iommu_group_get(dev);
+ if (!group)
+ return;
+
+ mutex_lock(&group->mutex);
+ ops->sva_unbind(handle);
+ mutex_unlock(&group->mutex);
+
+ iommu_group_put(group);
+}
+EXPORT_SYMBOL_GPL(iommu_sva_unbind_device);
+
+u32 iommu_sva_get_pasid(struct iommu_sva *handle)
+{
+ const struct iommu_ops *ops = handle->dev->bus->iommu_ops;
+
+ if (!ops || !ops->sva_get_pasid)
+ return IOMMU_PASID_INVALID;
+
+ return ops->sva_get_pasid(handle);
+}
+EXPORT_SYMBOL_GPL(iommu_sva_get_pasid);
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
new file mode 100644
index 000000000..4600e97ac
--- /dev/null
+++ b/drivers/iommu/iova.c
@@ -0,0 +1,1050 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2006-2009, Intel Corporation.
+ *
+ * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ */
+
+#include <linux/iova.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/bitops.h>
+#include <linux/cpu.h>
+
+/* The anchor node sits above the top of the usable address space */
+#define IOVA_ANCHOR ~0UL
+
+static bool iova_rcache_insert(struct iova_domain *iovad,
+ unsigned long pfn,
+ unsigned long size);
+static unsigned long iova_rcache_get(struct iova_domain *iovad,
+ unsigned long size,
+ unsigned long limit_pfn);
+static void init_iova_rcaches(struct iova_domain *iovad);
+static void free_iova_rcaches(struct iova_domain *iovad);
+static void fq_destroy_all_entries(struct iova_domain *iovad);
+static void fq_flush_timeout(struct timer_list *t);
+
+void
+init_iova_domain(struct iova_domain *iovad, unsigned long granule,
+ unsigned long start_pfn)
+{
+ /*
+ * IOVA granularity will normally be equal to the smallest
+ * supported IOMMU page size; both *must* be capable of
+ * representing individual CPU pages exactly.
+ */
+ BUG_ON((granule > PAGE_SIZE) || !is_power_of_2(granule));
+
+ spin_lock_init(&iovad->iova_rbtree_lock);
+ iovad->rbroot = RB_ROOT;
+ iovad->cached_node = &iovad->anchor.node;
+ iovad->cached32_node = &iovad->anchor.node;
+ iovad->granule = granule;
+ iovad->start_pfn = start_pfn;
+ iovad->dma_32bit_pfn = 1UL << (32 - iova_shift(iovad));
+ iovad->max32_alloc_size = iovad->dma_32bit_pfn;
+ iovad->flush_cb = NULL;
+ iovad->fq = NULL;
+ iovad->anchor.pfn_lo = iovad->anchor.pfn_hi = IOVA_ANCHOR;
+ rb_link_node(&iovad->anchor.node, NULL, &iovad->rbroot.rb_node);
+ rb_insert_color(&iovad->anchor.node, &iovad->rbroot);
+ init_iova_rcaches(iovad);
+}
+EXPORT_SYMBOL_GPL(init_iova_domain);
+
+bool has_iova_flush_queue(struct iova_domain *iovad)
+{
+ return !!iovad->fq;
+}
+
+static void free_iova_flush_queue(struct iova_domain *iovad)
+{
+ if (!has_iova_flush_queue(iovad))
+ return;
+
+ del_timer_sync(&iovad->fq_timer);
+
+ fq_destroy_all_entries(iovad);
+
+ free_percpu(iovad->fq);
+
+ iovad->fq = NULL;
+ iovad->flush_cb = NULL;
+ iovad->entry_dtor = NULL;
+}
+
+int init_iova_flush_queue(struct iova_domain *iovad,
+ iova_flush_cb flush_cb, iova_entry_dtor entry_dtor)
+{
+ struct iova_fq __percpu *queue;
+ int cpu;
+
+ atomic64_set(&iovad->fq_flush_start_cnt, 0);
+ atomic64_set(&iovad->fq_flush_finish_cnt, 0);
+
+ queue = alloc_percpu(struct iova_fq);
+ if (!queue)
+ return -ENOMEM;
+
+ iovad->flush_cb = flush_cb;
+ iovad->entry_dtor = entry_dtor;
+
+ for_each_possible_cpu(cpu) {
+ struct iova_fq *fq;
+
+ fq = per_cpu_ptr(queue, cpu);
+ fq->head = 0;
+ fq->tail = 0;
+
+ spin_lock_init(&fq->lock);
+ }
+
+ smp_wmb();
+
+ iovad->fq = queue;
+
+ timer_setup(&iovad->fq_timer, fq_flush_timeout, 0);
+ atomic_set(&iovad->fq_timer_on, 0);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(init_iova_flush_queue);
+
+static struct rb_node *
+__get_cached_rbnode(struct iova_domain *iovad, unsigned long limit_pfn)
+{
+ if (limit_pfn <= iovad->dma_32bit_pfn)
+ return iovad->cached32_node;
+
+ return iovad->cached_node;
+}
+
+static void
+__cached_rbnode_insert_update(struct iova_domain *iovad, struct iova *new)
+{
+ if (new->pfn_hi < iovad->dma_32bit_pfn)
+ iovad->cached32_node = &new->node;
+ else
+ iovad->cached_node = &new->node;
+}
+
+static void
+__cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
+{
+ struct iova *cached_iova;
+
+ cached_iova = rb_entry(iovad->cached32_node, struct iova, node);
+ if (free == cached_iova ||
+ (free->pfn_hi < iovad->dma_32bit_pfn &&
+ free->pfn_lo >= cached_iova->pfn_lo))
+ iovad->cached32_node = rb_next(&free->node);
+
+ if (free->pfn_lo < iovad->dma_32bit_pfn)
+ iovad->max32_alloc_size = iovad->dma_32bit_pfn;
+
+ cached_iova = rb_entry(iovad->cached_node, struct iova, node);
+ if (free->pfn_lo >= cached_iova->pfn_lo)
+ iovad->cached_node = rb_next(&free->node);
+}
+
+/* Insert the iova into domain rbtree by holding writer lock */
+static void
+iova_insert_rbtree(struct rb_root *root, struct iova *iova,
+ struct rb_node *start)
+{
+ struct rb_node **new, *parent = NULL;
+
+ new = (start) ? &start : &(root->rb_node);
+ /* Figure out where to put new node */
+ while (*new) {
+ struct iova *this = rb_entry(*new, struct iova, node);
+
+ parent = *new;
+
+ if (iova->pfn_lo < this->pfn_lo)
+ new = &((*new)->rb_left);
+ else if (iova->pfn_lo > this->pfn_lo)
+ new = &((*new)->rb_right);
+ else {
+ WARN_ON(1); /* this should not happen */
+ return;
+ }
+ }
+ /* Add new node and rebalance tree. */
+ rb_link_node(&iova->node, parent, new);
+ rb_insert_color(&iova->node, root);
+}
+
+static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
+ unsigned long size, unsigned long limit_pfn,
+ struct iova *new, bool size_aligned)
+{
+ struct rb_node *curr, *prev;
+ struct iova *curr_iova;
+ unsigned long flags;
+ unsigned long new_pfn;
+ unsigned long align_mask = ~0UL;
+
+ if (size_aligned)
+ align_mask <<= fls_long(size - 1);
+
+ /* Walk the tree backwards */
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ if (limit_pfn <= iovad->dma_32bit_pfn &&
+ size >= iovad->max32_alloc_size)
+ goto iova32_full;
+
+ curr = __get_cached_rbnode(iovad, limit_pfn);
+ curr_iova = rb_entry(curr, struct iova, node);
+ do {
+ limit_pfn = min(limit_pfn, curr_iova->pfn_lo);
+ new_pfn = (limit_pfn - size) & align_mask;
+ prev = curr;
+ curr = rb_prev(curr);
+ curr_iova = rb_entry(curr, struct iova, node);
+ } while (curr && new_pfn <= curr_iova->pfn_hi);
+
+ if (limit_pfn < size || new_pfn < iovad->start_pfn) {
+ iovad->max32_alloc_size = size;
+ goto iova32_full;
+ }
+
+ /* pfn_lo will point to size aligned address if size_aligned is set */
+ new->pfn_lo = new_pfn;
+ new->pfn_hi = new->pfn_lo + size - 1;
+
+ /* If we have 'prev', it's a valid place to start the insertion. */
+ iova_insert_rbtree(&iovad->rbroot, new, prev);
+ __cached_rbnode_insert_update(iovad, new);
+
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ return 0;
+
+iova32_full:
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ return -ENOMEM;
+}
+
+static struct kmem_cache *iova_cache;
+static unsigned int iova_cache_users;
+static DEFINE_MUTEX(iova_cache_mutex);
+
+struct iova *alloc_iova_mem(void)
+{
+ return kmem_cache_zalloc(iova_cache, GFP_ATOMIC | __GFP_NOWARN);
+}
+EXPORT_SYMBOL(alloc_iova_mem);
+
+void free_iova_mem(struct iova *iova)
+{
+ if (iova->pfn_lo != IOVA_ANCHOR)
+ kmem_cache_free(iova_cache, iova);
+}
+EXPORT_SYMBOL(free_iova_mem);
+
+int iova_cache_get(void)
+{
+ mutex_lock(&iova_cache_mutex);
+ if (!iova_cache_users) {
+ iova_cache = kmem_cache_create(
+ "iommu_iova", sizeof(struct iova), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!iova_cache) {
+ mutex_unlock(&iova_cache_mutex);
+ pr_err("Couldn't create iova cache\n");
+ return -ENOMEM;
+ }
+ }
+
+ iova_cache_users++;
+ mutex_unlock(&iova_cache_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iova_cache_get);
+
+void iova_cache_put(void)
+{
+ mutex_lock(&iova_cache_mutex);
+ if (WARN_ON(!iova_cache_users)) {
+ mutex_unlock(&iova_cache_mutex);
+ return;
+ }
+ iova_cache_users--;
+ if (!iova_cache_users)
+ kmem_cache_destroy(iova_cache);
+ mutex_unlock(&iova_cache_mutex);
+}
+EXPORT_SYMBOL_GPL(iova_cache_put);
+
+/**
+ * alloc_iova - allocates an iova
+ * @iovad: - iova domain in question
+ * @size: - size of page frames to allocate
+ * @limit_pfn: - max limit address
+ * @size_aligned: - set if size_aligned address range is required
+ * This function allocates an iova in the range iovad->start_pfn to limit_pfn,
+ * searching top-down from limit_pfn to iovad->start_pfn. If the size_aligned
+ * flag is set then the allocated address iova->pfn_lo will be naturally
+ * aligned on roundup_power_of_two(size).
+ */
+struct iova *
+alloc_iova(struct iova_domain *iovad, unsigned long size,
+ unsigned long limit_pfn,
+ bool size_aligned)
+{
+ struct iova *new_iova;
+ int ret;
+
+ new_iova = alloc_iova_mem();
+ if (!new_iova)
+ return NULL;
+
+ ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn + 1,
+ new_iova, size_aligned);
+
+ if (ret) {
+ free_iova_mem(new_iova);
+ return NULL;
+ }
+
+ return new_iova;
+}
+EXPORT_SYMBOL_GPL(alloc_iova);
+
+static struct iova *
+private_find_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+ struct rb_node *node = iovad->rbroot.rb_node;
+
+ assert_spin_locked(&iovad->iova_rbtree_lock);
+
+ while (node) {
+ struct iova *iova = rb_entry(node, struct iova, node);
+
+ if (pfn < iova->pfn_lo)
+ node = node->rb_left;
+ else if (pfn > iova->pfn_hi)
+ node = node->rb_right;
+ else
+ return iova; /* pfn falls within iova's range */
+ }
+
+ return NULL;
+}
+
+static void private_free_iova(struct iova_domain *iovad, struct iova *iova)
+{
+ assert_spin_locked(&iovad->iova_rbtree_lock);
+ __cached_rbnode_delete_update(iovad, iova);
+ rb_erase(&iova->node, &iovad->rbroot);
+ free_iova_mem(iova);
+}
+
+/**
+ * find_iova - finds an iova for a given pfn
+ * @iovad: - iova domain in question.
+ * @pfn: - page frame number
+ * This function finds and returns an iova belonging to the
+ * given doamin which matches the given pfn.
+ */
+struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+ unsigned long flags;
+ struct iova *iova;
+
+ /* Take the lock so that no other thread is manipulating the rbtree */
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ iova = private_find_iova(iovad, pfn);
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ return iova;
+}
+EXPORT_SYMBOL_GPL(find_iova);
+
+/**
+ * __free_iova - frees the given iova
+ * @iovad: iova domain in question.
+ * @iova: iova in question.
+ * Frees the given iova belonging to the giving domain
+ */
+void
+__free_iova(struct iova_domain *iovad, struct iova *iova)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ private_free_iova(iovad, iova);
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+}
+EXPORT_SYMBOL_GPL(__free_iova);
+
+/**
+ * free_iova - finds and frees the iova for a given pfn
+ * @iovad: - iova domain in question.
+ * @pfn: - pfn that is allocated previously
+ * This functions finds an iova for a given pfn and then
+ * frees the iova from that domain.
+ */
+void
+free_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+ struct iova *iova = find_iova(iovad, pfn);
+
+ if (iova)
+ __free_iova(iovad, iova);
+
+}
+EXPORT_SYMBOL_GPL(free_iova);
+
+/**
+ * alloc_iova_fast - allocates an iova from rcache
+ * @iovad: - iova domain in question
+ * @size: - size of page frames to allocate
+ * @limit_pfn: - max limit address
+ * @flush_rcache: - set to flush rcache on regular allocation failure
+ * This function tries to satisfy an iova allocation from the rcache,
+ * and falls back to regular allocation on failure. If regular allocation
+ * fails too and the flush_rcache flag is set then the rcache will be flushed.
+*/
+unsigned long
+alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
+ unsigned long limit_pfn, bool flush_rcache)
+{
+ unsigned long iova_pfn;
+ struct iova *new_iova;
+
+ iova_pfn = iova_rcache_get(iovad, size, limit_pfn + 1);
+ if (iova_pfn)
+ return iova_pfn;
+
+retry:
+ new_iova = alloc_iova(iovad, size, limit_pfn, true);
+ if (!new_iova) {
+ unsigned int cpu;
+
+ if (!flush_rcache)
+ return 0;
+
+ /* Try replenishing IOVAs by flushing rcache. */
+ flush_rcache = false;
+ for_each_online_cpu(cpu)
+ free_cpu_cached_iovas(cpu, iovad);
+ goto retry;
+ }
+
+ return new_iova->pfn_lo;
+}
+EXPORT_SYMBOL_GPL(alloc_iova_fast);
+
+/**
+ * free_iova_fast - free iova pfn range into rcache
+ * @iovad: - iova domain in question.
+ * @pfn: - pfn that is allocated previously
+ * @size: - # of pages in range
+ * This functions frees an iova range by trying to put it into the rcache,
+ * falling back to regular iova deallocation via free_iova() if this fails.
+ */
+void
+free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size)
+{
+ if (iova_rcache_insert(iovad, pfn, size))
+ return;
+
+ free_iova(iovad, pfn);
+}
+EXPORT_SYMBOL_GPL(free_iova_fast);
+
+#define fq_ring_for_each(i, fq) \
+ for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) % IOVA_FQ_SIZE)
+
+static inline bool fq_full(struct iova_fq *fq)
+{
+ assert_spin_locked(&fq->lock);
+ return (((fq->tail + 1) % IOVA_FQ_SIZE) == fq->head);
+}
+
+static inline unsigned fq_ring_add(struct iova_fq *fq)
+{
+ unsigned idx = fq->tail;
+
+ assert_spin_locked(&fq->lock);
+
+ fq->tail = (idx + 1) % IOVA_FQ_SIZE;
+
+ return idx;
+}
+
+static void fq_ring_free(struct iova_domain *iovad, struct iova_fq *fq)
+{
+ u64 counter = atomic64_read(&iovad->fq_flush_finish_cnt);
+ unsigned idx;
+
+ assert_spin_locked(&fq->lock);
+
+ fq_ring_for_each(idx, fq) {
+
+ if (fq->entries[idx].counter >= counter)
+ break;
+
+ if (iovad->entry_dtor)
+ iovad->entry_dtor(fq->entries[idx].data);
+
+ free_iova_fast(iovad,
+ fq->entries[idx].iova_pfn,
+ fq->entries[idx].pages);
+
+ fq->head = (fq->head + 1) % IOVA_FQ_SIZE;
+ }
+}
+
+static void iova_domain_flush(struct iova_domain *iovad)
+{
+ atomic64_inc(&iovad->fq_flush_start_cnt);
+ iovad->flush_cb(iovad);
+ atomic64_inc(&iovad->fq_flush_finish_cnt);
+}
+
+static void fq_destroy_all_entries(struct iova_domain *iovad)
+{
+ int cpu;
+
+ /*
+ * This code runs when the iova_domain is being detroyed, so don't
+ * bother to free iovas, just call the entry_dtor on all remaining
+ * entries.
+ */
+ if (!iovad->entry_dtor)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct iova_fq *fq = per_cpu_ptr(iovad->fq, cpu);
+ int idx;
+
+ fq_ring_for_each(idx, fq)
+ iovad->entry_dtor(fq->entries[idx].data);
+ }
+}
+
+static void fq_flush_timeout(struct timer_list *t)
+{
+ struct iova_domain *iovad = from_timer(iovad, t, fq_timer);
+ int cpu;
+
+ atomic_set(&iovad->fq_timer_on, 0);
+ iova_domain_flush(iovad);
+
+ for_each_possible_cpu(cpu) {
+ unsigned long flags;
+ struct iova_fq *fq;
+
+ fq = per_cpu_ptr(iovad->fq, cpu);
+ spin_lock_irqsave(&fq->lock, flags);
+ fq_ring_free(iovad, fq);
+ spin_unlock_irqrestore(&fq->lock, flags);
+ }
+}
+
+void queue_iova(struct iova_domain *iovad,
+ unsigned long pfn, unsigned long pages,
+ unsigned long data)
+{
+ struct iova_fq *fq = raw_cpu_ptr(iovad->fq);
+ unsigned long flags;
+ unsigned idx;
+
+ spin_lock_irqsave(&fq->lock, flags);
+
+ /*
+ * First remove all entries from the flush queue that have already been
+ * flushed out on another CPU. This makes the fq_full() check below less
+ * likely to be true.
+ */
+ fq_ring_free(iovad, fq);
+
+ if (fq_full(fq)) {
+ iova_domain_flush(iovad);
+ fq_ring_free(iovad, fq);
+ }
+
+ idx = fq_ring_add(fq);
+
+ fq->entries[idx].iova_pfn = pfn;
+ fq->entries[idx].pages = pages;
+ fq->entries[idx].data = data;
+ fq->entries[idx].counter = atomic64_read(&iovad->fq_flush_start_cnt);
+
+ spin_unlock_irqrestore(&fq->lock, flags);
+
+ /* Avoid false sharing as much as possible. */
+ if (!atomic_read(&iovad->fq_timer_on) &&
+ !atomic_xchg(&iovad->fq_timer_on, 1))
+ mod_timer(&iovad->fq_timer,
+ jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT));
+}
+EXPORT_SYMBOL_GPL(queue_iova);
+
+/**
+ * put_iova_domain - destroys the iova doamin
+ * @iovad: - iova domain in question.
+ * All the iova's in that domain are destroyed.
+ */
+void put_iova_domain(struct iova_domain *iovad)
+{
+ struct iova *iova, *tmp;
+
+ free_iova_flush_queue(iovad);
+ free_iova_rcaches(iovad);
+ rbtree_postorder_for_each_entry_safe(iova, tmp, &iovad->rbroot, node)
+ free_iova_mem(iova);
+}
+EXPORT_SYMBOL_GPL(put_iova_domain);
+
+static int
+__is_range_overlap(struct rb_node *node,
+ unsigned long pfn_lo, unsigned long pfn_hi)
+{
+ struct iova *iova = rb_entry(node, struct iova, node);
+
+ if ((pfn_lo <= iova->pfn_hi) && (pfn_hi >= iova->pfn_lo))
+ return 1;
+ return 0;
+}
+
+static inline struct iova *
+alloc_and_init_iova(unsigned long pfn_lo, unsigned long pfn_hi)
+{
+ struct iova *iova;
+
+ iova = alloc_iova_mem();
+ if (iova) {
+ iova->pfn_lo = pfn_lo;
+ iova->pfn_hi = pfn_hi;
+ }
+
+ return iova;
+}
+
+static struct iova *
+__insert_new_range(struct iova_domain *iovad,
+ unsigned long pfn_lo, unsigned long pfn_hi)
+{
+ struct iova *iova;
+
+ iova = alloc_and_init_iova(pfn_lo, pfn_hi);
+ if (iova)
+ iova_insert_rbtree(&iovad->rbroot, iova, NULL);
+
+ return iova;
+}
+
+static void
+__adjust_overlap_range(struct iova *iova,
+ unsigned long *pfn_lo, unsigned long *pfn_hi)
+{
+ if (*pfn_lo < iova->pfn_lo)
+ iova->pfn_lo = *pfn_lo;
+ if (*pfn_hi > iova->pfn_hi)
+ *pfn_lo = iova->pfn_hi + 1;
+}
+
+/**
+ * reserve_iova - reserves an iova in the given range
+ * @iovad: - iova domain pointer
+ * @pfn_lo: - lower page frame address
+ * @pfn_hi:- higher pfn adderss
+ * This function allocates reserves the address range from pfn_lo to pfn_hi so
+ * that this address is not dished out as part of alloc_iova.
+ */
+struct iova *
+reserve_iova(struct iova_domain *iovad,
+ unsigned long pfn_lo, unsigned long pfn_hi)
+{
+ struct rb_node *node;
+ unsigned long flags;
+ struct iova *iova;
+ unsigned int overlap = 0;
+
+ /* Don't allow nonsensical pfns */
+ if (WARN_ON((pfn_hi | pfn_lo) > (ULLONG_MAX >> iova_shift(iovad))))
+ return NULL;
+
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) {
+ if (__is_range_overlap(node, pfn_lo, pfn_hi)) {
+ iova = rb_entry(node, struct iova, node);
+ __adjust_overlap_range(iova, &pfn_lo, &pfn_hi);
+ if ((pfn_lo >= iova->pfn_lo) &&
+ (pfn_hi <= iova->pfn_hi))
+ goto finish;
+ overlap = 1;
+
+ } else if (overlap)
+ break;
+ }
+
+ /* We are here either because this is the first reserver node
+ * or need to insert remaining non overlap addr range
+ */
+ iova = __insert_new_range(iovad, pfn_lo, pfn_hi);
+finish:
+
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ return iova;
+}
+EXPORT_SYMBOL_GPL(reserve_iova);
+
+/**
+ * copy_reserved_iova - copies the reserved between domains
+ * @from: - source doamin from where to copy
+ * @to: - destination domin where to copy
+ * This function copies reserved iova's from one doamin to
+ * other.
+ */
+void
+copy_reserved_iova(struct iova_domain *from, struct iova_domain *to)
+{
+ unsigned long flags;
+ struct rb_node *node;
+
+ spin_lock_irqsave(&from->iova_rbtree_lock, flags);
+ for (node = rb_first(&from->rbroot); node; node = rb_next(node)) {
+ struct iova *iova = rb_entry(node, struct iova, node);
+ struct iova *new_iova;
+
+ if (iova->pfn_lo == IOVA_ANCHOR)
+ continue;
+
+ new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi);
+ if (!new_iova)
+ pr_err("Reserve iova range %lx@%lx failed\n",
+ iova->pfn_lo, iova->pfn_lo);
+ }
+ spin_unlock_irqrestore(&from->iova_rbtree_lock, flags);
+}
+EXPORT_SYMBOL_GPL(copy_reserved_iova);
+
+struct iova *
+split_and_remove_iova(struct iova_domain *iovad, struct iova *iova,
+ unsigned long pfn_lo, unsigned long pfn_hi)
+{
+ unsigned long flags;
+ struct iova *prev = NULL, *next = NULL;
+
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ if (iova->pfn_lo < pfn_lo) {
+ prev = alloc_and_init_iova(iova->pfn_lo, pfn_lo - 1);
+ if (prev == NULL)
+ goto error;
+ }
+ if (iova->pfn_hi > pfn_hi) {
+ next = alloc_and_init_iova(pfn_hi + 1, iova->pfn_hi);
+ if (next == NULL)
+ goto error;
+ }
+
+ __cached_rbnode_delete_update(iovad, iova);
+ rb_erase(&iova->node, &iovad->rbroot);
+
+ if (prev) {
+ iova_insert_rbtree(&iovad->rbroot, prev, NULL);
+ iova->pfn_lo = pfn_lo;
+ }
+ if (next) {
+ iova_insert_rbtree(&iovad->rbroot, next, NULL);
+ iova->pfn_hi = pfn_hi;
+ }
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+
+ return iova;
+
+error:
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ if (prev)
+ free_iova_mem(prev);
+ return NULL;
+}
+
+/*
+ * Magazine caches for IOVA ranges. For an introduction to magazines,
+ * see the USENIX 2001 paper "Magazines and Vmem: Extending the Slab
+ * Allocator to Many CPUs and Arbitrary Resources" by Bonwick and Adams.
+ * For simplicity, we use a static magazine size and don't implement the
+ * dynamic size tuning described in the paper.
+ */
+
+#define IOVA_MAG_SIZE 128
+
+struct iova_magazine {
+ unsigned long size;
+ unsigned long pfns[IOVA_MAG_SIZE];
+};
+
+struct iova_cpu_rcache {
+ spinlock_t lock;
+ struct iova_magazine *loaded;
+ struct iova_magazine *prev;
+};
+
+static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
+{
+ return kzalloc(sizeof(struct iova_magazine), flags);
+}
+
+static void iova_magazine_free(struct iova_magazine *mag)
+{
+ kfree(mag);
+}
+
+static void
+iova_magazine_free_pfns(struct iova_magazine *mag, struct iova_domain *iovad)
+{
+ unsigned long flags;
+ int i;
+
+ if (!mag)
+ return;
+
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+
+ for (i = 0 ; i < mag->size; ++i) {
+ struct iova *iova = private_find_iova(iovad, mag->pfns[i]);
+
+ if (WARN_ON(!iova))
+ continue;
+
+ private_free_iova(iovad, iova);
+ }
+
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+
+ mag->size = 0;
+}
+
+static bool iova_magazine_full(struct iova_magazine *mag)
+{
+ return (mag && mag->size == IOVA_MAG_SIZE);
+}
+
+static bool iova_magazine_empty(struct iova_magazine *mag)
+{
+ return (!mag || mag->size == 0);
+}
+
+static unsigned long iova_magazine_pop(struct iova_magazine *mag,
+ unsigned long limit_pfn)
+{
+ int i;
+ unsigned long pfn;
+
+ BUG_ON(iova_magazine_empty(mag));
+
+ /* Only fall back to the rbtree if we have no suitable pfns at all */
+ for (i = mag->size - 1; mag->pfns[i] > limit_pfn; i--)
+ if (i == 0)
+ return 0;
+
+ /* Swap it to pop it */
+ pfn = mag->pfns[i];
+ mag->pfns[i] = mag->pfns[--mag->size];
+
+ return pfn;
+}
+
+static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn)
+{
+ BUG_ON(iova_magazine_full(mag));
+
+ mag->pfns[mag->size++] = pfn;
+}
+
+static void init_iova_rcaches(struct iova_domain *iovad)
+{
+ struct iova_cpu_rcache *cpu_rcache;
+ struct iova_rcache *rcache;
+ unsigned int cpu;
+ int i;
+
+ for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+ rcache = &iovad->rcaches[i];
+ spin_lock_init(&rcache->lock);
+ rcache->depot_size = 0;
+ rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache), cache_line_size());
+ if (WARN_ON(!rcache->cpu_rcaches))
+ continue;
+ for_each_possible_cpu(cpu) {
+ cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+ spin_lock_init(&cpu_rcache->lock);
+ cpu_rcache->loaded = iova_magazine_alloc(GFP_KERNEL);
+ cpu_rcache->prev = iova_magazine_alloc(GFP_KERNEL);
+ }
+ }
+}
+
+/*
+ * Try inserting IOVA range starting with 'iova_pfn' into 'rcache', and
+ * return true on success. Can fail if rcache is full and we can't free
+ * space, and free_iova() (our only caller) will then return the IOVA
+ * range to the rbtree instead.
+ */
+static bool __iova_rcache_insert(struct iova_domain *iovad,
+ struct iova_rcache *rcache,
+ unsigned long iova_pfn)
+{
+ struct iova_magazine *mag_to_free = NULL;
+ struct iova_cpu_rcache *cpu_rcache;
+ bool can_insert = false;
+ unsigned long flags;
+
+ cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
+ spin_lock_irqsave(&cpu_rcache->lock, flags);
+
+ if (!iova_magazine_full(cpu_rcache->loaded)) {
+ can_insert = true;
+ } else if (!iova_magazine_full(cpu_rcache->prev)) {
+ swap(cpu_rcache->prev, cpu_rcache->loaded);
+ can_insert = true;
+ } else {
+ struct iova_magazine *new_mag = iova_magazine_alloc(GFP_ATOMIC);
+
+ if (new_mag) {
+ spin_lock(&rcache->lock);
+ if (rcache->depot_size < MAX_GLOBAL_MAGS) {
+ rcache->depot[rcache->depot_size++] =
+ cpu_rcache->loaded;
+ } else {
+ mag_to_free = cpu_rcache->loaded;
+ }
+ spin_unlock(&rcache->lock);
+
+ cpu_rcache->loaded = new_mag;
+ can_insert = true;
+ }
+ }
+
+ if (can_insert)
+ iova_magazine_push(cpu_rcache->loaded, iova_pfn);
+
+ spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+
+ if (mag_to_free) {
+ iova_magazine_free_pfns(mag_to_free, iovad);
+ iova_magazine_free(mag_to_free);
+ }
+
+ return can_insert;
+}
+
+static bool iova_rcache_insert(struct iova_domain *iovad, unsigned long pfn,
+ unsigned long size)
+{
+ unsigned int log_size = order_base_2(size);
+
+ if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
+ return false;
+
+ return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn);
+}
+
+/*
+ * Caller wants to allocate a new IOVA range from 'rcache'. If we can
+ * satisfy the request, return a matching non-NULL range and remove
+ * it from the 'rcache'.
+ */
+static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
+ unsigned long limit_pfn)
+{
+ struct iova_cpu_rcache *cpu_rcache;
+ unsigned long iova_pfn = 0;
+ bool has_pfn = false;
+ unsigned long flags;
+
+ cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
+ spin_lock_irqsave(&cpu_rcache->lock, flags);
+
+ if (!iova_magazine_empty(cpu_rcache->loaded)) {
+ has_pfn = true;
+ } else if (!iova_magazine_empty(cpu_rcache->prev)) {
+ swap(cpu_rcache->prev, cpu_rcache->loaded);
+ has_pfn = true;
+ } else {
+ spin_lock(&rcache->lock);
+ if (rcache->depot_size > 0) {
+ iova_magazine_free(cpu_rcache->loaded);
+ cpu_rcache->loaded = rcache->depot[--rcache->depot_size];
+ has_pfn = true;
+ }
+ spin_unlock(&rcache->lock);
+ }
+
+ if (has_pfn)
+ iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn);
+
+ spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+
+ return iova_pfn;
+}
+
+/*
+ * Try to satisfy IOVA allocation range from rcache. Fail if requested
+ * size is too big or the DMA limit we are given isn't satisfied by the
+ * top element in the magazine.
+ */
+static unsigned long iova_rcache_get(struct iova_domain *iovad,
+ unsigned long size,
+ unsigned long limit_pfn)
+{
+ unsigned int log_size = order_base_2(size);
+
+ if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
+ return 0;
+
+ return __iova_rcache_get(&iovad->rcaches[log_size], limit_pfn - size);
+}
+
+/*
+ * free rcache data structures.
+ */
+static void free_iova_rcaches(struct iova_domain *iovad)
+{
+ struct iova_rcache *rcache;
+ struct iova_cpu_rcache *cpu_rcache;
+ unsigned int cpu;
+ int i, j;
+
+ for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+ rcache = &iovad->rcaches[i];
+ for_each_possible_cpu(cpu) {
+ cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+ iova_magazine_free(cpu_rcache->loaded);
+ iova_magazine_free(cpu_rcache->prev);
+ }
+ free_percpu(rcache->cpu_rcaches);
+ for (j = 0; j < rcache->depot_size; ++j)
+ iova_magazine_free(rcache->depot[j]);
+ }
+}
+
+/*
+ * free all the IOVA ranges cached by a cpu (used when cpu is unplugged)
+ */
+void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad)
+{
+ struct iova_cpu_rcache *cpu_rcache;
+ struct iova_rcache *rcache;
+ unsigned long flags;
+ int i;
+
+ for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+ rcache = &iovad->rcaches[i];
+ cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+ spin_lock_irqsave(&cpu_rcache->lock, flags);
+ iova_magazine_free_pfns(cpu_rcache->loaded, iovad);
+ iova_magazine_free_pfns(cpu_rcache->prev, iovad);
+ spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+ }
+}
+
+MODULE_AUTHOR("Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
new file mode 100644
index 000000000..bae6c7078
--- /dev/null
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -0,0 +1,1204 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * IOMMU API for Renesas VMSA-compatible IPMMU
+ * Author: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
+ *
+ * Copyright (C) 2014-2020 Renesas Electronics Corporation
+ */
+
+#include <linux/bitmap.h>
+#include <linux/delay.h>
+#include <linux/dma-iommu.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/io-pgtable.h>
+#include <linux/iommu.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_iommu.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/sys_soc.h>
+
+#if defined(CONFIG_ARM) && !defined(CONFIG_IOMMU_DMA)
+#include <asm/dma-iommu.h>
+#else
+#define arm_iommu_create_mapping(...) NULL
+#define arm_iommu_attach_device(...) -ENODEV
+#define arm_iommu_release_mapping(...) do {} while (0)
+#define arm_iommu_detach_device(...) do {} while (0)
+#endif
+
+#define IPMMU_CTX_MAX 8U
+#define IPMMU_CTX_INVALID -1
+
+#define IPMMU_UTLB_MAX 48U
+
+struct ipmmu_features {
+ bool use_ns_alias_offset;
+ bool has_cache_leaf_nodes;
+ unsigned int number_of_contexts;
+ unsigned int num_utlbs;
+ bool setup_imbuscr;
+ bool twobit_imttbcr_sl0;
+ bool reserved_context;
+ bool cache_snoop;
+ unsigned int ctx_offset_base;
+ unsigned int ctx_offset_stride;
+ unsigned int utlb_offset_base;
+};
+
+struct ipmmu_vmsa_device {
+ struct device *dev;
+ void __iomem *base;
+ struct iommu_device iommu;
+ struct ipmmu_vmsa_device *root;
+ const struct ipmmu_features *features;
+ unsigned int num_ctx;
+ spinlock_t lock; /* Protects ctx and domains[] */
+ DECLARE_BITMAP(ctx, IPMMU_CTX_MAX);
+ struct ipmmu_vmsa_domain *domains[IPMMU_CTX_MAX];
+ s8 utlb_ctx[IPMMU_UTLB_MAX];
+
+ struct iommu_group *group;
+ struct dma_iommu_mapping *mapping;
+};
+
+struct ipmmu_vmsa_domain {
+ struct ipmmu_vmsa_device *mmu;
+ struct iommu_domain io_domain;
+
+ struct io_pgtable_cfg cfg;
+ struct io_pgtable_ops *iop;
+
+ unsigned int context_id;
+ struct mutex mutex; /* Protects mappings */
+};
+
+static struct ipmmu_vmsa_domain *to_vmsa_domain(struct iommu_domain *dom)
+{
+ return container_of(dom, struct ipmmu_vmsa_domain, io_domain);
+}
+
+static struct ipmmu_vmsa_device *to_ipmmu(struct device *dev)
+{
+ return dev_iommu_priv_get(dev);
+}
+
+#define TLB_LOOP_TIMEOUT 100 /* 100us */
+
+/* -----------------------------------------------------------------------------
+ * Registers Definition
+ */
+
+#define IM_NS_ALIAS_OFFSET 0x800
+
+/* MMU "context" registers */
+#define IMCTR 0x0000 /* R-Car Gen2/3 */
+#define IMCTR_INTEN (1 << 2) /* R-Car Gen2/3 */
+#define IMCTR_FLUSH (1 << 1) /* R-Car Gen2/3 */
+#define IMCTR_MMUEN (1 << 0) /* R-Car Gen2/3 */
+
+#define IMTTBCR 0x0008 /* R-Car Gen2/3 */
+#define IMTTBCR_EAE (1 << 31) /* R-Car Gen2/3 */
+#define IMTTBCR_SH0_INNER_SHAREABLE (3 << 12) /* R-Car Gen2 only */
+#define IMTTBCR_ORGN0_WB_WA (1 << 10) /* R-Car Gen2 only */
+#define IMTTBCR_IRGN0_WB_WA (1 << 8) /* R-Car Gen2 only */
+#define IMTTBCR_SL0_TWOBIT_LVL_1 (2 << 6) /* R-Car Gen3 only */
+#define IMTTBCR_SL0_LVL_1 (1 << 4) /* R-Car Gen2 only */
+
+#define IMBUSCR 0x000c /* R-Car Gen2 only */
+#define IMBUSCR_DVM (1 << 2) /* R-Car Gen2 only */
+#define IMBUSCR_BUSSEL_MASK (3 << 0) /* R-Car Gen2 only */
+
+#define IMTTLBR0 0x0010 /* R-Car Gen2/3 */
+#define IMTTUBR0 0x0014 /* R-Car Gen2/3 */
+
+#define IMSTR 0x0020 /* R-Car Gen2/3 */
+#define IMSTR_MHIT (1 << 4) /* R-Car Gen2/3 */
+#define IMSTR_ABORT (1 << 2) /* R-Car Gen2/3 */
+#define IMSTR_PF (1 << 1) /* R-Car Gen2/3 */
+#define IMSTR_TF (1 << 0) /* R-Car Gen2/3 */
+
+#define IMMAIR0 0x0028 /* R-Car Gen2/3 */
+
+#define IMELAR 0x0030 /* R-Car Gen2/3, IMEAR on R-Car Gen2 */
+#define IMEUAR 0x0034 /* R-Car Gen3 only */
+
+/* uTLB registers */
+#define IMUCTR(n) ((n) < 32 ? IMUCTR0(n) : IMUCTR32(n))
+#define IMUCTR0(n) (0x0300 + ((n) * 16)) /* R-Car Gen2/3 */
+#define IMUCTR32(n) (0x0600 + (((n) - 32) * 16)) /* R-Car Gen3 only */
+#define IMUCTR_TTSEL_MMU(n) ((n) << 4) /* R-Car Gen2/3 */
+#define IMUCTR_FLUSH (1 << 1) /* R-Car Gen2/3 */
+#define IMUCTR_MMUEN (1 << 0) /* R-Car Gen2/3 */
+
+#define IMUASID(n) ((n) < 32 ? IMUASID0(n) : IMUASID32(n))
+#define IMUASID0(n) (0x0308 + ((n) * 16)) /* R-Car Gen2/3 */
+#define IMUASID32(n) (0x0608 + (((n) - 32) * 16)) /* R-Car Gen3 only */
+
+/* -----------------------------------------------------------------------------
+ * Root device handling
+ */
+
+static struct platform_driver ipmmu_driver;
+
+static bool ipmmu_is_root(struct ipmmu_vmsa_device *mmu)
+{
+ return mmu->root == mmu;
+}
+
+static int __ipmmu_check_device(struct device *dev, void *data)
+{
+ struct ipmmu_vmsa_device *mmu = dev_get_drvdata(dev);
+ struct ipmmu_vmsa_device **rootp = data;
+
+ if (ipmmu_is_root(mmu))
+ *rootp = mmu;
+
+ return 0;
+}
+
+static struct ipmmu_vmsa_device *ipmmu_find_root(void)
+{
+ struct ipmmu_vmsa_device *root = NULL;
+
+ return driver_for_each_device(&ipmmu_driver.driver, NULL, &root,
+ __ipmmu_check_device) == 0 ? root : NULL;
+}
+
+/* -----------------------------------------------------------------------------
+ * Read/Write Access
+ */
+
+static u32 ipmmu_read(struct ipmmu_vmsa_device *mmu, unsigned int offset)
+{
+ return ioread32(mmu->base + offset);
+}
+
+static void ipmmu_write(struct ipmmu_vmsa_device *mmu, unsigned int offset,
+ u32 data)
+{
+ iowrite32(data, mmu->base + offset);
+}
+
+static unsigned int ipmmu_ctx_reg(struct ipmmu_vmsa_device *mmu,
+ unsigned int context_id, unsigned int reg)
+{
+ return mmu->features->ctx_offset_base +
+ context_id * mmu->features->ctx_offset_stride + reg;
+}
+
+static u32 ipmmu_ctx_read(struct ipmmu_vmsa_device *mmu,
+ unsigned int context_id, unsigned int reg)
+{
+ return ipmmu_read(mmu, ipmmu_ctx_reg(mmu, context_id, reg));
+}
+
+static void ipmmu_ctx_write(struct ipmmu_vmsa_device *mmu,
+ unsigned int context_id, unsigned int reg, u32 data)
+{
+ ipmmu_write(mmu, ipmmu_ctx_reg(mmu, context_id, reg), data);
+}
+
+static u32 ipmmu_ctx_read_root(struct ipmmu_vmsa_domain *domain,
+ unsigned int reg)
+{
+ return ipmmu_ctx_read(domain->mmu->root, domain->context_id, reg);
+}
+
+static void ipmmu_ctx_write_root(struct ipmmu_vmsa_domain *domain,
+ unsigned int reg, u32 data)
+{
+ ipmmu_ctx_write(domain->mmu->root, domain->context_id, reg, data);
+}
+
+static void ipmmu_ctx_write_all(struct ipmmu_vmsa_domain *domain,
+ unsigned int reg, u32 data)
+{
+ if (domain->mmu != domain->mmu->root)
+ ipmmu_ctx_write(domain->mmu, domain->context_id, reg, data);
+
+ ipmmu_ctx_write(domain->mmu->root, domain->context_id, reg, data);
+}
+
+static u32 ipmmu_utlb_reg(struct ipmmu_vmsa_device *mmu, unsigned int reg)
+{
+ return mmu->features->utlb_offset_base + reg;
+}
+
+static void ipmmu_imuasid_write(struct ipmmu_vmsa_device *mmu,
+ unsigned int utlb, u32 data)
+{
+ ipmmu_write(mmu, ipmmu_utlb_reg(mmu, IMUASID(utlb)), data);
+}
+
+static void ipmmu_imuctr_write(struct ipmmu_vmsa_device *mmu,
+ unsigned int utlb, u32 data)
+{
+ ipmmu_write(mmu, ipmmu_utlb_reg(mmu, IMUCTR(utlb)), data);
+}
+
+/* -----------------------------------------------------------------------------
+ * TLB and microTLB Management
+ */
+
+/* Wait for any pending TLB invalidations to complete */
+static void ipmmu_tlb_sync(struct ipmmu_vmsa_domain *domain)
+{
+ unsigned int count = 0;
+
+ while (ipmmu_ctx_read_root(domain, IMCTR) & IMCTR_FLUSH) {
+ cpu_relax();
+ if (++count == TLB_LOOP_TIMEOUT) {
+ dev_err_ratelimited(domain->mmu->dev,
+ "TLB sync timed out -- MMU may be deadlocked\n");
+ return;
+ }
+ udelay(1);
+ }
+}
+
+static void ipmmu_tlb_invalidate(struct ipmmu_vmsa_domain *domain)
+{
+ u32 reg;
+
+ reg = ipmmu_ctx_read_root(domain, IMCTR);
+ reg |= IMCTR_FLUSH;
+ ipmmu_ctx_write_all(domain, IMCTR, reg);
+
+ ipmmu_tlb_sync(domain);
+}
+
+/*
+ * Enable MMU translation for the microTLB.
+ */
+static void ipmmu_utlb_enable(struct ipmmu_vmsa_domain *domain,
+ unsigned int utlb)
+{
+ struct ipmmu_vmsa_device *mmu = domain->mmu;
+
+ /*
+ * TODO: Reference-count the microTLB as several bus masters can be
+ * connected to the same microTLB.
+ */
+
+ /* TODO: What should we set the ASID to ? */
+ ipmmu_imuasid_write(mmu, utlb, 0);
+ /* TODO: Do we need to flush the microTLB ? */
+ ipmmu_imuctr_write(mmu, utlb, IMUCTR_TTSEL_MMU(domain->context_id) |
+ IMUCTR_FLUSH | IMUCTR_MMUEN);
+ mmu->utlb_ctx[utlb] = domain->context_id;
+}
+
+/*
+ * Disable MMU translation for the microTLB.
+ */
+static void ipmmu_utlb_disable(struct ipmmu_vmsa_domain *domain,
+ unsigned int utlb)
+{
+ struct ipmmu_vmsa_device *mmu = domain->mmu;
+
+ ipmmu_imuctr_write(mmu, utlb, 0);
+ mmu->utlb_ctx[utlb] = IPMMU_CTX_INVALID;
+}
+
+static void ipmmu_tlb_flush_all(void *cookie)
+{
+ struct ipmmu_vmsa_domain *domain = cookie;
+
+ ipmmu_tlb_invalidate(domain);
+}
+
+static void ipmmu_tlb_flush(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ ipmmu_tlb_flush_all(cookie);
+}
+
+static const struct iommu_flush_ops ipmmu_flush_ops = {
+ .tlb_flush_all = ipmmu_tlb_flush_all,
+ .tlb_flush_walk = ipmmu_tlb_flush,
+ .tlb_flush_leaf = ipmmu_tlb_flush,
+};
+
+/* -----------------------------------------------------------------------------
+ * Domain/Context Management
+ */
+
+static int ipmmu_domain_allocate_context(struct ipmmu_vmsa_device *mmu,
+ struct ipmmu_vmsa_domain *domain)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&mmu->lock, flags);
+
+ ret = find_first_zero_bit(mmu->ctx, mmu->num_ctx);
+ if (ret != mmu->num_ctx) {
+ mmu->domains[ret] = domain;
+ set_bit(ret, mmu->ctx);
+ } else
+ ret = -EBUSY;
+
+ spin_unlock_irqrestore(&mmu->lock, flags);
+
+ return ret;
+}
+
+static void ipmmu_domain_free_context(struct ipmmu_vmsa_device *mmu,
+ unsigned int context_id)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&mmu->lock, flags);
+
+ clear_bit(context_id, mmu->ctx);
+ mmu->domains[context_id] = NULL;
+
+ spin_unlock_irqrestore(&mmu->lock, flags);
+}
+
+static void ipmmu_domain_setup_context(struct ipmmu_vmsa_domain *domain)
+{
+ u64 ttbr;
+ u32 tmp;
+
+ /* TTBR0 */
+ ttbr = domain->cfg.arm_lpae_s1_cfg.ttbr;
+ ipmmu_ctx_write_root(domain, IMTTLBR0, ttbr);
+ ipmmu_ctx_write_root(domain, IMTTUBR0, ttbr >> 32);
+
+ /*
+ * TTBCR
+ * We use long descriptors and allocate the whole 32-bit VA space to
+ * TTBR0.
+ */
+ if (domain->mmu->features->twobit_imttbcr_sl0)
+ tmp = IMTTBCR_SL0_TWOBIT_LVL_1;
+ else
+ tmp = IMTTBCR_SL0_LVL_1;
+
+ if (domain->mmu->features->cache_snoop)
+ tmp |= IMTTBCR_SH0_INNER_SHAREABLE | IMTTBCR_ORGN0_WB_WA |
+ IMTTBCR_IRGN0_WB_WA;
+
+ ipmmu_ctx_write_root(domain, IMTTBCR, IMTTBCR_EAE | tmp);
+
+ /* MAIR0 */
+ ipmmu_ctx_write_root(domain, IMMAIR0,
+ domain->cfg.arm_lpae_s1_cfg.mair);
+
+ /* IMBUSCR */
+ if (domain->mmu->features->setup_imbuscr)
+ ipmmu_ctx_write_root(domain, IMBUSCR,
+ ipmmu_ctx_read_root(domain, IMBUSCR) &
+ ~(IMBUSCR_DVM | IMBUSCR_BUSSEL_MASK));
+
+ /*
+ * IMSTR
+ * Clear all interrupt flags.
+ */
+ ipmmu_ctx_write_root(domain, IMSTR, ipmmu_ctx_read_root(domain, IMSTR));
+
+ /*
+ * IMCTR
+ * Enable the MMU and interrupt generation. The long-descriptor
+ * translation table format doesn't use TEX remapping. Don't enable AF
+ * software management as we have no use for it. Flush the TLB as
+ * required when modifying the context registers.
+ */
+ ipmmu_ctx_write_all(domain, IMCTR,
+ IMCTR_INTEN | IMCTR_FLUSH | IMCTR_MMUEN);
+}
+
+static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain)
+{
+ int ret;
+
+ /*
+ * Allocate the page table operations.
+ *
+ * VMSA states in section B3.6.3 "Control of Secure or Non-secure memory
+ * access, Long-descriptor format" that the NStable bit being set in a
+ * table descriptor will result in the NStable and NS bits of all child
+ * entries being ignored and considered as being set. The IPMMU seems
+ * not to comply with this, as it generates a secure access page fault
+ * if any of the NStable and NS bits isn't set when running in
+ * non-secure mode.
+ */
+ domain->cfg.quirks = IO_PGTABLE_QUIRK_ARM_NS;
+ domain->cfg.pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K;
+ domain->cfg.ias = 32;
+ domain->cfg.oas = 40;
+ domain->cfg.tlb = &ipmmu_flush_ops;
+ domain->io_domain.geometry.aperture_end = DMA_BIT_MASK(32);
+ domain->io_domain.geometry.force_aperture = true;
+ /*
+ * TODO: Add support for coherent walk through CCI with DVM and remove
+ * cache handling. For now, delegate it to the io-pgtable code.
+ */
+ domain->cfg.coherent_walk = false;
+ domain->cfg.iommu_dev = domain->mmu->root->dev;
+
+ /*
+ * Find an unused context.
+ */
+ ret = ipmmu_domain_allocate_context(domain->mmu->root, domain);
+ if (ret < 0)
+ return ret;
+
+ domain->context_id = ret;
+
+ domain->iop = alloc_io_pgtable_ops(ARM_32_LPAE_S1, &domain->cfg,
+ domain);
+ if (!domain->iop) {
+ ipmmu_domain_free_context(domain->mmu->root,
+ domain->context_id);
+ return -EINVAL;
+ }
+
+ ipmmu_domain_setup_context(domain);
+ return 0;
+}
+
+static void ipmmu_domain_destroy_context(struct ipmmu_vmsa_domain *domain)
+{
+ if (!domain->mmu)
+ return;
+
+ /*
+ * Disable the context. Flush the TLB as required when modifying the
+ * context registers.
+ *
+ * TODO: Is TLB flush really needed ?
+ */
+ ipmmu_ctx_write_all(domain, IMCTR, IMCTR_FLUSH);
+ ipmmu_tlb_sync(domain);
+ ipmmu_domain_free_context(domain->mmu->root, domain->context_id);
+}
+
+/* -----------------------------------------------------------------------------
+ * Fault Handling
+ */
+
+static irqreturn_t ipmmu_domain_irq(struct ipmmu_vmsa_domain *domain)
+{
+ const u32 err_mask = IMSTR_MHIT | IMSTR_ABORT | IMSTR_PF | IMSTR_TF;
+ struct ipmmu_vmsa_device *mmu = domain->mmu;
+ unsigned long iova;
+ u32 status;
+
+ status = ipmmu_ctx_read_root(domain, IMSTR);
+ if (!(status & err_mask))
+ return IRQ_NONE;
+
+ iova = ipmmu_ctx_read_root(domain, IMELAR);
+ if (IS_ENABLED(CONFIG_64BIT))
+ iova |= (u64)ipmmu_ctx_read_root(domain, IMEUAR) << 32;
+
+ /*
+ * Clear the error status flags. Unlike traditional interrupt flag
+ * registers that must be cleared by writing 1, this status register
+ * seems to require 0. The error address register must be read before,
+ * otherwise its value will be 0.
+ */
+ ipmmu_ctx_write_root(domain, IMSTR, 0);
+
+ /* Log fatal errors. */
+ if (status & IMSTR_MHIT)
+ dev_err_ratelimited(mmu->dev, "Multiple TLB hits @0x%lx\n",
+ iova);
+ if (status & IMSTR_ABORT)
+ dev_err_ratelimited(mmu->dev, "Page Table Walk Abort @0x%lx\n",
+ iova);
+
+ if (!(status & (IMSTR_PF | IMSTR_TF)))
+ return IRQ_NONE;
+
+ /*
+ * Try to handle page faults and translation faults.
+ *
+ * TODO: We need to look up the faulty device based on the I/O VA. Use
+ * the IOMMU device for now.
+ */
+ if (!report_iommu_fault(&domain->io_domain, mmu->dev, iova, 0))
+ return IRQ_HANDLED;
+
+ dev_err_ratelimited(mmu->dev,
+ "Unhandled fault: status 0x%08x iova 0x%lx\n",
+ status, iova);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t ipmmu_irq(int irq, void *dev)
+{
+ struct ipmmu_vmsa_device *mmu = dev;
+ irqreturn_t status = IRQ_NONE;
+ unsigned int i;
+ unsigned long flags;
+
+ spin_lock_irqsave(&mmu->lock, flags);
+
+ /*
+ * Check interrupts for all active contexts.
+ */
+ for (i = 0; i < mmu->num_ctx; i++) {
+ if (!mmu->domains[i])
+ continue;
+ if (ipmmu_domain_irq(mmu->domains[i]) == IRQ_HANDLED)
+ status = IRQ_HANDLED;
+ }
+
+ spin_unlock_irqrestore(&mmu->lock, flags);
+
+ return status;
+}
+
+/* -----------------------------------------------------------------------------
+ * IOMMU Operations
+ */
+
+static struct iommu_domain *__ipmmu_domain_alloc(unsigned type)
+{
+ struct ipmmu_vmsa_domain *domain;
+
+ domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+ if (!domain)
+ return NULL;
+
+ mutex_init(&domain->mutex);
+
+ return &domain->io_domain;
+}
+
+static struct iommu_domain *ipmmu_domain_alloc(unsigned type)
+{
+ struct iommu_domain *io_domain = NULL;
+
+ switch (type) {
+ case IOMMU_DOMAIN_UNMANAGED:
+ io_domain = __ipmmu_domain_alloc(type);
+ break;
+
+ case IOMMU_DOMAIN_DMA:
+ io_domain = __ipmmu_domain_alloc(type);
+ if (io_domain && iommu_get_dma_cookie(io_domain)) {
+ kfree(io_domain);
+ io_domain = NULL;
+ }
+ break;
+ }
+
+ return io_domain;
+}
+
+static void ipmmu_domain_free(struct iommu_domain *io_domain)
+{
+ struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+
+ /*
+ * Free the domain resources. We assume that all devices have already
+ * been detached.
+ */
+ iommu_put_dma_cookie(io_domain);
+ ipmmu_domain_destroy_context(domain);
+ free_io_pgtable_ops(domain->iop);
+ kfree(domain);
+}
+
+static int ipmmu_attach_device(struct iommu_domain *io_domain,
+ struct device *dev)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct ipmmu_vmsa_device *mmu = to_ipmmu(dev);
+ struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+ unsigned int i;
+ int ret = 0;
+
+ if (!mmu) {
+ dev_err(dev, "Cannot attach to IPMMU\n");
+ return -ENXIO;
+ }
+
+ mutex_lock(&domain->mutex);
+
+ if (!domain->mmu) {
+ /* The domain hasn't been used yet, initialize it. */
+ domain->mmu = mmu;
+ ret = ipmmu_domain_init_context(domain);
+ if (ret < 0) {
+ dev_err(dev, "Unable to initialize IPMMU context\n");
+ domain->mmu = NULL;
+ } else {
+ dev_info(dev, "Using IPMMU context %u\n",
+ domain->context_id);
+ }
+ } else if (domain->mmu != mmu) {
+ /*
+ * Something is wrong, we can't attach two devices using
+ * different IOMMUs to the same domain.
+ */
+ dev_err(dev, "Can't attach IPMMU %s to domain on IPMMU %s\n",
+ dev_name(mmu->dev), dev_name(domain->mmu->dev));
+ ret = -EINVAL;
+ } else
+ dev_info(dev, "Reusing IPMMU context %u\n", domain->context_id);
+
+ mutex_unlock(&domain->mutex);
+
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < fwspec->num_ids; ++i)
+ ipmmu_utlb_enable(domain, fwspec->ids[i]);
+
+ return 0;
+}
+
+static void ipmmu_detach_device(struct iommu_domain *io_domain,
+ struct device *dev)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+ unsigned int i;
+
+ for (i = 0; i < fwspec->num_ids; ++i)
+ ipmmu_utlb_disable(domain, fwspec->ids[i]);
+
+ /*
+ * TODO: Optimize by disabling the context when no device is attached.
+ */
+}
+
+static int ipmmu_map(struct iommu_domain *io_domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+ struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+
+ if (!domain)
+ return -ENODEV;
+
+ return domain->iop->map(domain->iop, iova, paddr, size, prot, gfp);
+}
+
+static size_t ipmmu_unmap(struct iommu_domain *io_domain, unsigned long iova,
+ size_t size, struct iommu_iotlb_gather *gather)
+{
+ struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+
+ return domain->iop->unmap(domain->iop, iova, size, gather);
+}
+
+static void ipmmu_flush_iotlb_all(struct iommu_domain *io_domain)
+{
+ struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+
+ if (domain->mmu)
+ ipmmu_tlb_flush_all(domain);
+}
+
+static void ipmmu_iotlb_sync(struct iommu_domain *io_domain,
+ struct iommu_iotlb_gather *gather)
+{
+ ipmmu_flush_iotlb_all(io_domain);
+}
+
+static phys_addr_t ipmmu_iova_to_phys(struct iommu_domain *io_domain,
+ dma_addr_t iova)
+{
+ struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+
+ /* TODO: Is locking needed ? */
+
+ return domain->iop->iova_to_phys(domain->iop, iova);
+}
+
+static int ipmmu_init_platform_device(struct device *dev,
+ struct of_phandle_args *args)
+{
+ struct platform_device *ipmmu_pdev;
+
+ ipmmu_pdev = of_find_device_by_node(args->np);
+ if (!ipmmu_pdev)
+ return -ENODEV;
+
+ dev_iommu_priv_set(dev, platform_get_drvdata(ipmmu_pdev));
+
+ return 0;
+}
+
+static const struct soc_device_attribute soc_rcar_gen3[] = {
+ { .soc_id = "r8a774a1", },
+ { .soc_id = "r8a774b1", },
+ { .soc_id = "r8a774c0", },
+ { .soc_id = "r8a774e1", },
+ { .soc_id = "r8a7795", },
+ { .soc_id = "r8a77961", },
+ { .soc_id = "r8a7796", },
+ { .soc_id = "r8a77965", },
+ { .soc_id = "r8a77970", },
+ { .soc_id = "r8a77990", },
+ { .soc_id = "r8a77995", },
+ { /* sentinel */ }
+};
+
+static const struct soc_device_attribute soc_rcar_gen3_whitelist[] = {
+ { .soc_id = "r8a774b1", },
+ { .soc_id = "r8a774c0", },
+ { .soc_id = "r8a774e1", },
+ { .soc_id = "r8a7795", .revision = "ES3.*" },
+ { .soc_id = "r8a77961", },
+ { .soc_id = "r8a77965", },
+ { .soc_id = "r8a77990", },
+ { .soc_id = "r8a77995", },
+ { /* sentinel */ }
+};
+
+static const char * const rcar_gen3_slave_whitelist[] = {
+};
+
+static bool ipmmu_slave_whitelist(struct device *dev)
+{
+ unsigned int i;
+
+ /*
+ * For R-Car Gen3 use a white list to opt-in slave devices.
+ * For Other SoCs, this returns true anyway.
+ */
+ if (!soc_device_match(soc_rcar_gen3))
+ return true;
+
+ /* Check whether this R-Car Gen3 can use the IPMMU correctly or not */
+ if (!soc_device_match(soc_rcar_gen3_whitelist))
+ return false;
+
+ /* Check whether this slave device can work with the IPMMU */
+ for (i = 0; i < ARRAY_SIZE(rcar_gen3_slave_whitelist); i++) {
+ if (!strcmp(dev_name(dev), rcar_gen3_slave_whitelist[i]))
+ return true;
+ }
+
+ /* Otherwise, do not allow use of IPMMU */
+ return false;
+}
+
+static int ipmmu_of_xlate(struct device *dev,
+ struct of_phandle_args *spec)
+{
+ if (!ipmmu_slave_whitelist(dev))
+ return -ENODEV;
+
+ iommu_fwspec_add_ids(dev, spec->args, 1);
+
+ /* Initialize once - xlate() will call multiple times */
+ if (to_ipmmu(dev))
+ return 0;
+
+ return ipmmu_init_platform_device(dev, spec);
+}
+
+static int ipmmu_init_arm_mapping(struct device *dev)
+{
+ struct ipmmu_vmsa_device *mmu = to_ipmmu(dev);
+ int ret;
+
+ /*
+ * Create the ARM mapping, used by the ARM DMA mapping core to allocate
+ * VAs. This will allocate a corresponding IOMMU domain.
+ *
+ * TODO:
+ * - Create one mapping per context (TLB).
+ * - Make the mapping size configurable ? We currently use a 2GB mapping
+ * at a 1GB offset to ensure that NULL VAs will fault.
+ */
+ if (!mmu->mapping) {
+ struct dma_iommu_mapping *mapping;
+
+ mapping = arm_iommu_create_mapping(&platform_bus_type,
+ SZ_1G, SZ_2G);
+ if (IS_ERR(mapping)) {
+ dev_err(mmu->dev, "failed to create ARM IOMMU mapping\n");
+ ret = PTR_ERR(mapping);
+ goto error;
+ }
+
+ mmu->mapping = mapping;
+ }
+
+ /* Attach the ARM VA mapping to the device. */
+ ret = arm_iommu_attach_device(dev, mmu->mapping);
+ if (ret < 0) {
+ dev_err(dev, "Failed to attach device to VA mapping\n");
+ goto error;
+ }
+
+ return 0;
+
+error:
+ if (mmu->mapping)
+ arm_iommu_release_mapping(mmu->mapping);
+
+ return ret;
+}
+
+static struct iommu_device *ipmmu_probe_device(struct device *dev)
+{
+ struct ipmmu_vmsa_device *mmu = to_ipmmu(dev);
+
+ /*
+ * Only let through devices that have been verified in xlate()
+ */
+ if (!mmu)
+ return ERR_PTR(-ENODEV);
+
+ return &mmu->iommu;
+}
+
+static void ipmmu_probe_finalize(struct device *dev)
+{
+ int ret = 0;
+
+ if (IS_ENABLED(CONFIG_ARM) && !IS_ENABLED(CONFIG_IOMMU_DMA))
+ ret = ipmmu_init_arm_mapping(dev);
+
+ if (ret)
+ dev_err(dev, "Can't create IOMMU mapping - DMA-OPS will not work\n");
+}
+
+static void ipmmu_release_device(struct device *dev)
+{
+ arm_iommu_detach_device(dev);
+}
+
+static struct iommu_group *ipmmu_find_group(struct device *dev)
+{
+ struct ipmmu_vmsa_device *mmu = to_ipmmu(dev);
+ struct iommu_group *group;
+
+ if (mmu->group)
+ return iommu_group_ref_get(mmu->group);
+
+ group = iommu_group_alloc();
+ if (!IS_ERR(group))
+ mmu->group = group;
+
+ return group;
+}
+
+static const struct iommu_ops ipmmu_ops = {
+ .domain_alloc = ipmmu_domain_alloc,
+ .domain_free = ipmmu_domain_free,
+ .attach_dev = ipmmu_attach_device,
+ .detach_dev = ipmmu_detach_device,
+ .map = ipmmu_map,
+ .unmap = ipmmu_unmap,
+ .flush_iotlb_all = ipmmu_flush_iotlb_all,
+ .iotlb_sync = ipmmu_iotlb_sync,
+ .iova_to_phys = ipmmu_iova_to_phys,
+ .probe_device = ipmmu_probe_device,
+ .release_device = ipmmu_release_device,
+ .probe_finalize = ipmmu_probe_finalize,
+ .device_group = IS_ENABLED(CONFIG_ARM) && !IS_ENABLED(CONFIG_IOMMU_DMA)
+ ? generic_device_group : ipmmu_find_group,
+ .pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K,
+ .of_xlate = ipmmu_of_xlate,
+};
+
+/* -----------------------------------------------------------------------------
+ * Probe/remove and init
+ */
+
+static void ipmmu_device_reset(struct ipmmu_vmsa_device *mmu)
+{
+ unsigned int i;
+
+ /* Disable all contexts. */
+ for (i = 0; i < mmu->num_ctx; ++i)
+ ipmmu_ctx_write(mmu, i, IMCTR, 0);
+}
+
+static const struct ipmmu_features ipmmu_features_default = {
+ .use_ns_alias_offset = true,
+ .has_cache_leaf_nodes = false,
+ .number_of_contexts = 1, /* software only tested with one context */
+ .num_utlbs = 32,
+ .setup_imbuscr = true,
+ .twobit_imttbcr_sl0 = false,
+ .reserved_context = false,
+ .cache_snoop = true,
+ .ctx_offset_base = 0,
+ .ctx_offset_stride = 0x40,
+ .utlb_offset_base = 0,
+};
+
+static const struct ipmmu_features ipmmu_features_rcar_gen3 = {
+ .use_ns_alias_offset = false,
+ .has_cache_leaf_nodes = true,
+ .number_of_contexts = 8,
+ .num_utlbs = 48,
+ .setup_imbuscr = false,
+ .twobit_imttbcr_sl0 = true,
+ .reserved_context = true,
+ .cache_snoop = false,
+ .ctx_offset_base = 0,
+ .ctx_offset_stride = 0x40,
+ .utlb_offset_base = 0,
+};
+
+static const struct of_device_id ipmmu_of_ids[] = {
+ {
+ .compatible = "renesas,ipmmu-vmsa",
+ .data = &ipmmu_features_default,
+ }, {
+ .compatible = "renesas,ipmmu-r8a774a1",
+ .data = &ipmmu_features_rcar_gen3,
+ }, {
+ .compatible = "renesas,ipmmu-r8a774b1",
+ .data = &ipmmu_features_rcar_gen3,
+ }, {
+ .compatible = "renesas,ipmmu-r8a774c0",
+ .data = &ipmmu_features_rcar_gen3,
+ }, {
+ .compatible = "renesas,ipmmu-r8a774e1",
+ .data = &ipmmu_features_rcar_gen3,
+ }, {
+ .compatible = "renesas,ipmmu-r8a7795",
+ .data = &ipmmu_features_rcar_gen3,
+ }, {
+ .compatible = "renesas,ipmmu-r8a7796",
+ .data = &ipmmu_features_rcar_gen3,
+ }, {
+ .compatible = "renesas,ipmmu-r8a77961",
+ .data = &ipmmu_features_rcar_gen3,
+ }, {
+ .compatible = "renesas,ipmmu-r8a77965",
+ .data = &ipmmu_features_rcar_gen3,
+ }, {
+ .compatible = "renesas,ipmmu-r8a77970",
+ .data = &ipmmu_features_rcar_gen3,
+ }, {
+ .compatible = "renesas,ipmmu-r8a77990",
+ .data = &ipmmu_features_rcar_gen3,
+ }, {
+ .compatible = "renesas,ipmmu-r8a77995",
+ .data = &ipmmu_features_rcar_gen3,
+ }, {
+ /* Terminator */
+ },
+};
+
+static int ipmmu_probe(struct platform_device *pdev)
+{
+ struct ipmmu_vmsa_device *mmu;
+ struct resource *res;
+ int irq;
+ int ret;
+
+ mmu = devm_kzalloc(&pdev->dev, sizeof(*mmu), GFP_KERNEL);
+ if (!mmu) {
+ dev_err(&pdev->dev, "cannot allocate device data\n");
+ return -ENOMEM;
+ }
+
+ mmu->dev = &pdev->dev;
+ spin_lock_init(&mmu->lock);
+ bitmap_zero(mmu->ctx, IPMMU_CTX_MAX);
+ mmu->features = of_device_get_match_data(&pdev->dev);
+ memset(mmu->utlb_ctx, IPMMU_CTX_INVALID, mmu->features->num_utlbs);
+ ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(40));
+ if (ret)
+ return ret;
+
+ /* Map I/O memory and request IRQ. */
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ mmu->base = devm_ioremap_resource(&pdev->dev, res);
+ if (IS_ERR(mmu->base))
+ return PTR_ERR(mmu->base);
+
+ /*
+ * The IPMMU has two register banks, for secure and non-secure modes.
+ * The bank mapped at the beginning of the IPMMU address space
+ * corresponds to the running mode of the CPU. When running in secure
+ * mode the non-secure register bank is also available at an offset.
+ *
+ * Secure mode operation isn't clearly documented and is thus currently
+ * not implemented in the driver. Furthermore, preliminary tests of
+ * non-secure operation with the main register bank were not successful.
+ * Offset the registers base unconditionally to point to the non-secure
+ * alias space for now.
+ */
+ if (mmu->features->use_ns_alias_offset)
+ mmu->base += IM_NS_ALIAS_OFFSET;
+
+ mmu->num_ctx = min(IPMMU_CTX_MAX, mmu->features->number_of_contexts);
+
+ /*
+ * Determine if this IPMMU instance is a root device by checking for
+ * the lack of has_cache_leaf_nodes flag or renesas,ipmmu-main property.
+ */
+ if (!mmu->features->has_cache_leaf_nodes ||
+ !of_find_property(pdev->dev.of_node, "renesas,ipmmu-main", NULL))
+ mmu->root = mmu;
+ else
+ mmu->root = ipmmu_find_root();
+
+ /*
+ * Wait until the root device has been registered for sure.
+ */
+ if (!mmu->root)
+ return -EPROBE_DEFER;
+
+ /* Root devices have mandatory IRQs */
+ if (ipmmu_is_root(mmu)) {
+ irq = platform_get_irq(pdev, 0);
+ if (irq < 0)
+ return irq;
+
+ ret = devm_request_irq(&pdev->dev, irq, ipmmu_irq, 0,
+ dev_name(&pdev->dev), mmu);
+ if (ret < 0) {
+ dev_err(&pdev->dev, "failed to request IRQ %d\n", irq);
+ return ret;
+ }
+
+ ipmmu_device_reset(mmu);
+
+ if (mmu->features->reserved_context) {
+ dev_info(&pdev->dev, "IPMMU context 0 is reserved\n");
+ set_bit(0, mmu->ctx);
+ }
+ }
+
+ /*
+ * Register the IPMMU to the IOMMU subsystem in the following cases:
+ * - R-Car Gen2 IPMMU (all devices registered)
+ * - R-Car Gen3 IPMMU (leaf devices only - skip root IPMMU-MM device)
+ */
+ if (!mmu->features->has_cache_leaf_nodes || !ipmmu_is_root(mmu)) {
+ ret = iommu_device_sysfs_add(&mmu->iommu, &pdev->dev, NULL,
+ dev_name(&pdev->dev));
+ if (ret)
+ return ret;
+
+ iommu_device_set_ops(&mmu->iommu, &ipmmu_ops);
+ iommu_device_set_fwnode(&mmu->iommu,
+ &pdev->dev.of_node->fwnode);
+
+ ret = iommu_device_register(&mmu->iommu);
+ if (ret)
+ return ret;
+
+#if defined(CONFIG_IOMMU_DMA)
+ if (!iommu_present(&platform_bus_type))
+ bus_set_iommu(&platform_bus_type, &ipmmu_ops);
+#endif
+ }
+
+ /*
+ * We can't create the ARM mapping here as it requires the bus to have
+ * an IOMMU, which only happens when bus_set_iommu() is called in
+ * ipmmu_init() after the probe function returns.
+ */
+
+ platform_set_drvdata(pdev, mmu);
+
+ return 0;
+}
+
+static int ipmmu_remove(struct platform_device *pdev)
+{
+ struct ipmmu_vmsa_device *mmu = platform_get_drvdata(pdev);
+
+ iommu_device_sysfs_remove(&mmu->iommu);
+ iommu_device_unregister(&mmu->iommu);
+
+ arm_iommu_release_mapping(mmu->mapping);
+
+ ipmmu_device_reset(mmu);
+
+ return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int ipmmu_resume_noirq(struct device *dev)
+{
+ struct ipmmu_vmsa_device *mmu = dev_get_drvdata(dev);
+ unsigned int i;
+
+ /* Reset root MMU and restore contexts */
+ if (ipmmu_is_root(mmu)) {
+ ipmmu_device_reset(mmu);
+
+ for (i = 0; i < mmu->num_ctx; i++) {
+ if (!mmu->domains[i])
+ continue;
+
+ ipmmu_domain_setup_context(mmu->domains[i]);
+ }
+ }
+
+ /* Re-enable active micro-TLBs */
+ for (i = 0; i < mmu->features->num_utlbs; i++) {
+ if (mmu->utlb_ctx[i] == IPMMU_CTX_INVALID)
+ continue;
+
+ ipmmu_utlb_enable(mmu->root->domains[mmu->utlb_ctx[i]], i);
+ }
+
+ return 0;
+}
+
+static const struct dev_pm_ops ipmmu_pm = {
+ SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(NULL, ipmmu_resume_noirq)
+};
+#define DEV_PM_OPS &ipmmu_pm
+#else
+#define DEV_PM_OPS NULL
+#endif /* CONFIG_PM_SLEEP */
+
+static struct platform_driver ipmmu_driver = {
+ .driver = {
+ .name = "ipmmu-vmsa",
+ .of_match_table = of_match_ptr(ipmmu_of_ids),
+ .pm = DEV_PM_OPS,
+ },
+ .probe = ipmmu_probe,
+ .remove = ipmmu_remove,
+};
+
+static int __init ipmmu_init(void)
+{
+ struct device_node *np;
+ static bool setup_done;
+ int ret;
+
+ if (setup_done)
+ return 0;
+
+ np = of_find_matching_node(NULL, ipmmu_of_ids);
+ if (!np)
+ return 0;
+
+ of_node_put(np);
+
+ ret = platform_driver_register(&ipmmu_driver);
+ if (ret < 0)
+ return ret;
+
+#if defined(CONFIG_ARM) && !defined(CONFIG_IOMMU_DMA)
+ if (!iommu_present(&platform_bus_type))
+ bus_set_iommu(&platform_bus_type, &ipmmu_ops);
+#endif
+
+ setup_done = true;
+ return 0;
+}
+subsys_initcall(ipmmu_init);
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
new file mode 100644
index 000000000..2d84b1ed2
--- /dev/null
+++ b/drivers/iommu/irq_remapping.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/cpumask.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/msi.h>
+#include <linux/irq.h>
+#include <linux/pci.h>
+#include <linux/irqdomain.h>
+
+#include <asm/hw_irq.h>
+#include <asm/irq_remapping.h>
+#include <asm/processor.h>
+#include <asm/x86_init.h>
+#include <asm/apic.h>
+#include <asm/hpet.h>
+
+#include "irq_remapping.h"
+
+int irq_remapping_enabled;
+int irq_remap_broken;
+int disable_sourceid_checking;
+int no_x2apic_optout;
+
+int disable_irq_post = 0;
+
+static int disable_irq_remap;
+static struct irq_remap_ops *remap_ops;
+
+static void irq_remapping_restore_boot_irq_mode(void)
+{
+ /*
+ * With interrupt-remapping, for now we will use virtual wire A
+ * mode, as virtual wire B is little complex (need to configure
+ * both IOAPIC RTE as well as interrupt-remapping table entry).
+ * As this gets called during crash dump, keep this simple for
+ * now.
+ */
+ if (boot_cpu_has(X86_FEATURE_APIC) || apic_from_smp_config())
+ disconnect_bsp_APIC(0);
+}
+
+static void __init irq_remapping_modify_x86_ops(void)
+{
+ x86_apic_ops.restore = irq_remapping_restore_boot_irq_mode;
+}
+
+static __init int setup_nointremap(char *str)
+{
+ disable_irq_remap = 1;
+ return 0;
+}
+early_param("nointremap", setup_nointremap);
+
+static __init int setup_irqremap(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ while (*str) {
+ if (!strncmp(str, "on", 2)) {
+ disable_irq_remap = 0;
+ disable_irq_post = 0;
+ } else if (!strncmp(str, "off", 3)) {
+ disable_irq_remap = 1;
+ disable_irq_post = 1;
+ } else if (!strncmp(str, "nosid", 5))
+ disable_sourceid_checking = 1;
+ else if (!strncmp(str, "no_x2apic_optout", 16))
+ no_x2apic_optout = 1;
+ else if (!strncmp(str, "nopost", 6))
+ disable_irq_post = 1;
+
+ str += strcspn(str, ",");
+ while (*str == ',')
+ str++;
+ }
+
+ return 0;
+}
+early_param("intremap", setup_irqremap);
+
+void set_irq_remapping_broken(void)
+{
+ irq_remap_broken = 1;
+}
+
+bool irq_remapping_cap(enum irq_remap_cap cap)
+{
+ if (!remap_ops || disable_irq_post)
+ return false;
+
+ return (remap_ops->capability & (1 << cap));
+}
+EXPORT_SYMBOL_GPL(irq_remapping_cap);
+
+int __init irq_remapping_prepare(void)
+{
+ if (disable_irq_remap)
+ return -ENOSYS;
+
+ if (intel_irq_remap_ops.prepare() == 0)
+ remap_ops = &intel_irq_remap_ops;
+ else if (IS_ENABLED(CONFIG_AMD_IOMMU) &&
+ amd_iommu_irq_ops.prepare() == 0)
+ remap_ops = &amd_iommu_irq_ops;
+ else if (IS_ENABLED(CONFIG_HYPERV_IOMMU) &&
+ hyperv_irq_remap_ops.prepare() == 0)
+ remap_ops = &hyperv_irq_remap_ops;
+ else
+ return -ENOSYS;
+
+ return 0;
+}
+
+int __init irq_remapping_enable(void)
+{
+ int ret;
+
+ if (!remap_ops->enable)
+ return -ENODEV;
+
+ ret = remap_ops->enable();
+
+ if (irq_remapping_enabled)
+ irq_remapping_modify_x86_ops();
+
+ return ret;
+}
+
+void irq_remapping_disable(void)
+{
+ if (irq_remapping_enabled && remap_ops->disable)
+ remap_ops->disable();
+}
+
+int irq_remapping_reenable(int mode)
+{
+ if (irq_remapping_enabled && remap_ops->reenable)
+ return remap_ops->reenable(mode);
+
+ return 0;
+}
+
+int __init irq_remap_enable_fault_handling(void)
+{
+ if (!irq_remapping_enabled)
+ return 0;
+
+ if (!remap_ops->enable_faulting)
+ return -ENODEV;
+
+ return remap_ops->enable_faulting();
+}
+
+void panic_if_irq_remap(const char *msg)
+{
+ if (irq_remapping_enabled)
+ panic(msg);
+}
+
+/**
+ * irq_remapping_get_irq_domain - Get the irqdomain serving the request @info
+ * @info: interrupt allocation information, used to identify the IOMMU device
+ *
+ * Returns pointer to IRQ domain, or NULL on failure.
+ */
+struct irq_domain *irq_remapping_get_irq_domain(struct irq_alloc_info *info)
+{
+ if (!remap_ops || !remap_ops->get_irq_domain)
+ return NULL;
+
+ return remap_ops->get_irq_domain(info);
+}
diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h
new file mode 100644
index 000000000..1661b3d75
--- /dev/null
+++ b/drivers/iommu/irq_remapping.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2012 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <jroedel@suse.de>
+ *
+ * This header file contains stuff that is shared between different interrupt
+ * remapping drivers but with no need to be visible outside of the IOMMU layer.
+ */
+
+#ifndef __IRQ_REMAPPING_H
+#define __IRQ_REMAPPING_H
+
+#ifdef CONFIG_IRQ_REMAP
+
+struct irq_data;
+struct msi_msg;
+struct irq_domain;
+struct irq_alloc_info;
+
+extern int irq_remap_broken;
+extern int disable_sourceid_checking;
+extern int no_x2apic_optout;
+extern int irq_remapping_enabled;
+
+extern int disable_irq_post;
+
+struct irq_remap_ops {
+ /* The supported capabilities */
+ int capability;
+
+ /* Initializes hardware and makes it ready for remapping interrupts */
+ int (*prepare)(void);
+
+ /* Enables the remapping hardware */
+ int (*enable)(void);
+
+ /* Disables the remapping hardware */
+ void (*disable)(void);
+
+ /* Reenables the remapping hardware */
+ int (*reenable)(int);
+
+ /* Enable fault handling */
+ int (*enable_faulting)(void);
+
+ /* Get the irqdomain associated to IOMMU device */
+ struct irq_domain *(*get_irq_domain)(struct irq_alloc_info *);
+};
+
+extern struct irq_remap_ops intel_irq_remap_ops;
+extern struct irq_remap_ops amd_iommu_irq_ops;
+extern struct irq_remap_ops hyperv_irq_remap_ops;
+
+#else /* CONFIG_IRQ_REMAP */
+
+#define irq_remapping_enabled 0
+#define irq_remap_broken 0
+#define disable_irq_post 1
+
+#endif /* CONFIG_IRQ_REMAP */
+
+#endif /* __IRQ_REMAPPING_H */
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
new file mode 100644
index 000000000..149f2e53b
--- /dev/null
+++ b/drivers/iommu/msm_iommu.c
@@ -0,0 +1,852 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved.
+ *
+ * Author: Stepan Moskovchenko <stepanm@codeaurora.org>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/errno.h>
+#include <linux/io.h>
+#include <linux/io-pgtable.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/iommu.h>
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/of_iommu.h>
+
+#include <asm/cacheflush.h>
+#include <linux/sizes.h>
+
+#include "msm_iommu_hw-8xxx.h"
+#include "msm_iommu.h"
+
+#define MRC(reg, processor, op1, crn, crm, op2) \
+__asm__ __volatile__ ( \
+" mrc " #processor "," #op1 ", %0," #crn "," #crm "," #op2 "\n" \
+: "=r" (reg))
+
+/* bitmap of the page sizes currently supported */
+#define MSM_IOMMU_PGSIZES (SZ_4K | SZ_64K | SZ_1M | SZ_16M)
+
+static DEFINE_SPINLOCK(msm_iommu_lock);
+static LIST_HEAD(qcom_iommu_devices);
+static struct iommu_ops msm_iommu_ops;
+
+struct msm_priv {
+ struct list_head list_attached;
+ struct iommu_domain domain;
+ struct io_pgtable_cfg cfg;
+ struct io_pgtable_ops *iop;
+ struct device *dev;
+ spinlock_t pgtlock; /* pagetable lock */
+};
+
+static struct msm_priv *to_msm_priv(struct iommu_domain *dom)
+{
+ return container_of(dom, struct msm_priv, domain);
+}
+
+static int __enable_clocks(struct msm_iommu_dev *iommu)
+{
+ int ret;
+
+ ret = clk_enable(iommu->pclk);
+ if (ret)
+ goto fail;
+
+ if (iommu->clk) {
+ ret = clk_enable(iommu->clk);
+ if (ret)
+ clk_disable(iommu->pclk);
+ }
+fail:
+ return ret;
+}
+
+static void __disable_clocks(struct msm_iommu_dev *iommu)
+{
+ if (iommu->clk)
+ clk_disable(iommu->clk);
+ clk_disable(iommu->pclk);
+}
+
+static void msm_iommu_reset(void __iomem *base, int ncb)
+{
+ int ctx;
+
+ SET_RPUE(base, 0);
+ SET_RPUEIE(base, 0);
+ SET_ESRRESTORE(base, 0);
+ SET_TBE(base, 0);
+ SET_CR(base, 0);
+ SET_SPDMBE(base, 0);
+ SET_TESTBUSCR(base, 0);
+ SET_TLBRSW(base, 0);
+ SET_GLOBAL_TLBIALL(base, 0);
+ SET_RPU_ACR(base, 0);
+ SET_TLBLKCRWE(base, 1);
+
+ for (ctx = 0; ctx < ncb; ctx++) {
+ SET_BPRCOSH(base, ctx, 0);
+ SET_BPRCISH(base, ctx, 0);
+ SET_BPRCNSH(base, ctx, 0);
+ SET_BPSHCFG(base, ctx, 0);
+ SET_BPMTCFG(base, ctx, 0);
+ SET_ACTLR(base, ctx, 0);
+ SET_SCTLR(base, ctx, 0);
+ SET_FSRRESTORE(base, ctx, 0);
+ SET_TTBR0(base, ctx, 0);
+ SET_TTBR1(base, ctx, 0);
+ SET_TTBCR(base, ctx, 0);
+ SET_BFBCR(base, ctx, 0);
+ SET_PAR(base, ctx, 0);
+ SET_FAR(base, ctx, 0);
+ SET_CTX_TLBIALL(base, ctx, 0);
+ SET_TLBFLPTER(base, ctx, 0);
+ SET_TLBSLPTER(base, ctx, 0);
+ SET_TLBLKCR(base, ctx, 0);
+ SET_CONTEXTIDR(base, ctx, 0);
+ }
+}
+
+static void __flush_iotlb(void *cookie)
+{
+ struct msm_priv *priv = cookie;
+ struct msm_iommu_dev *iommu = NULL;
+ struct msm_iommu_ctx_dev *master;
+ int ret = 0;
+
+ list_for_each_entry(iommu, &priv->list_attached, dom_node) {
+ ret = __enable_clocks(iommu);
+ if (ret)
+ goto fail;
+
+ list_for_each_entry(master, &iommu->ctx_list, list)
+ SET_CTX_TLBIALL(iommu->base, master->num, 0);
+
+ __disable_clocks(iommu);
+ }
+fail:
+ return;
+}
+
+static void __flush_iotlb_range(unsigned long iova, size_t size,
+ size_t granule, bool leaf, void *cookie)
+{
+ struct msm_priv *priv = cookie;
+ struct msm_iommu_dev *iommu = NULL;
+ struct msm_iommu_ctx_dev *master;
+ int ret = 0;
+ int temp_size;
+
+ list_for_each_entry(iommu, &priv->list_attached, dom_node) {
+ ret = __enable_clocks(iommu);
+ if (ret)
+ goto fail;
+
+ list_for_each_entry(master, &iommu->ctx_list, list) {
+ temp_size = size;
+ do {
+ iova &= TLBIVA_VA;
+ iova |= GET_CONTEXTIDR_ASID(iommu->base,
+ master->num);
+ SET_TLBIVA(iommu->base, master->num, iova);
+ iova += granule;
+ } while (temp_size -= granule);
+ }
+
+ __disable_clocks(iommu);
+ }
+
+fail:
+ return;
+}
+
+static void __flush_iotlb_walk(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ __flush_iotlb_range(iova, size, granule, false, cookie);
+}
+
+static void __flush_iotlb_leaf(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ __flush_iotlb_range(iova, size, granule, true, cookie);
+}
+
+static void __flush_iotlb_page(struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t granule, void *cookie)
+{
+ __flush_iotlb_range(iova, granule, granule, true, cookie);
+}
+
+static const struct iommu_flush_ops msm_iommu_flush_ops = {
+ .tlb_flush_all = __flush_iotlb,
+ .tlb_flush_walk = __flush_iotlb_walk,
+ .tlb_flush_leaf = __flush_iotlb_leaf,
+ .tlb_add_page = __flush_iotlb_page,
+};
+
+static int msm_iommu_alloc_ctx(unsigned long *map, int start, int end)
+{
+ int idx;
+
+ do {
+ idx = find_next_zero_bit(map, end, start);
+ if (idx == end)
+ return -ENOSPC;
+ } while (test_and_set_bit(idx, map));
+
+ return idx;
+}
+
+static void msm_iommu_free_ctx(unsigned long *map, int idx)
+{
+ clear_bit(idx, map);
+}
+
+static void config_mids(struct msm_iommu_dev *iommu,
+ struct msm_iommu_ctx_dev *master)
+{
+ int mid, ctx, i;
+
+ for (i = 0; i < master->num_mids; i++) {
+ mid = master->mids[i];
+ ctx = master->num;
+
+ SET_M2VCBR_N(iommu->base, mid, 0);
+ SET_CBACR_N(iommu->base, ctx, 0);
+
+ /* Set VMID = 0 */
+ SET_VMID(iommu->base, mid, 0);
+
+ /* Set the context number for that MID to this context */
+ SET_CBNDX(iommu->base, mid, ctx);
+
+ /* Set MID associated with this context bank to 0*/
+ SET_CBVMID(iommu->base, ctx, 0);
+
+ /* Set the ASID for TLB tagging for this context */
+ SET_CONTEXTIDR_ASID(iommu->base, ctx, ctx);
+
+ /* Set security bit override to be Non-secure */
+ SET_NSCFG(iommu->base, mid, 3);
+ }
+}
+
+static void __reset_context(void __iomem *base, int ctx)
+{
+ SET_BPRCOSH(base, ctx, 0);
+ SET_BPRCISH(base, ctx, 0);
+ SET_BPRCNSH(base, ctx, 0);
+ SET_BPSHCFG(base, ctx, 0);
+ SET_BPMTCFG(base, ctx, 0);
+ SET_ACTLR(base, ctx, 0);
+ SET_SCTLR(base, ctx, 0);
+ SET_FSRRESTORE(base, ctx, 0);
+ SET_TTBR0(base, ctx, 0);
+ SET_TTBR1(base, ctx, 0);
+ SET_TTBCR(base, ctx, 0);
+ SET_BFBCR(base, ctx, 0);
+ SET_PAR(base, ctx, 0);
+ SET_FAR(base, ctx, 0);
+ SET_CTX_TLBIALL(base, ctx, 0);
+ SET_TLBFLPTER(base, ctx, 0);
+ SET_TLBSLPTER(base, ctx, 0);
+ SET_TLBLKCR(base, ctx, 0);
+}
+
+static void __program_context(void __iomem *base, int ctx,
+ struct msm_priv *priv)
+{
+ __reset_context(base, ctx);
+
+ /* Turn on TEX Remap */
+ SET_TRE(base, ctx, 1);
+ SET_AFE(base, ctx, 1);
+
+ /* Set up HTW mode */
+ /* TLB miss configuration: perform HTW on miss */
+ SET_TLBMCFG(base, ctx, 0x3);
+
+ /* V2P configuration: HTW for access */
+ SET_V2PCFG(base, ctx, 0x3);
+
+ SET_TTBCR(base, ctx, priv->cfg.arm_v7s_cfg.tcr);
+ SET_TTBR0(base, ctx, priv->cfg.arm_v7s_cfg.ttbr);
+ SET_TTBR1(base, ctx, 0);
+
+ /* Set prrr and nmrr */
+ SET_PRRR(base, ctx, priv->cfg.arm_v7s_cfg.prrr);
+ SET_NMRR(base, ctx, priv->cfg.arm_v7s_cfg.nmrr);
+
+ /* Invalidate the TLB for this context */
+ SET_CTX_TLBIALL(base, ctx, 0);
+
+ /* Set interrupt number to "secure" interrupt */
+ SET_IRPTNDX(base, ctx, 0);
+
+ /* Enable context fault interrupt */
+ SET_CFEIE(base, ctx, 1);
+
+ /* Stall access on a context fault and let the handler deal with it */
+ SET_CFCFG(base, ctx, 1);
+
+ /* Redirect all cacheable requests to L2 slave port. */
+ SET_RCISH(base, ctx, 1);
+ SET_RCOSH(base, ctx, 1);
+ SET_RCNSH(base, ctx, 1);
+
+ /* Turn on BFB prefetch */
+ SET_BFBDFE(base, ctx, 1);
+
+ /* Enable the MMU */
+ SET_M(base, ctx, 1);
+}
+
+static struct iommu_domain *msm_iommu_domain_alloc(unsigned type)
+{
+ struct msm_priv *priv;
+
+ if (type != IOMMU_DOMAIN_UNMANAGED)
+ return NULL;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ goto fail_nomem;
+
+ INIT_LIST_HEAD(&priv->list_attached);
+
+ priv->domain.geometry.aperture_start = 0;
+ priv->domain.geometry.aperture_end = (1ULL << 32) - 1;
+ priv->domain.geometry.force_aperture = true;
+
+ return &priv->domain;
+
+fail_nomem:
+ kfree(priv);
+ return NULL;
+}
+
+static void msm_iommu_domain_free(struct iommu_domain *domain)
+{
+ struct msm_priv *priv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&msm_iommu_lock, flags);
+ priv = to_msm_priv(domain);
+ kfree(priv);
+ spin_unlock_irqrestore(&msm_iommu_lock, flags);
+}
+
+static int msm_iommu_domain_config(struct msm_priv *priv)
+{
+ spin_lock_init(&priv->pgtlock);
+
+ priv->cfg = (struct io_pgtable_cfg) {
+ .quirks = IO_PGTABLE_QUIRK_TLBI_ON_MAP,
+ .pgsize_bitmap = msm_iommu_ops.pgsize_bitmap,
+ .ias = 32,
+ .oas = 32,
+ .tlb = &msm_iommu_flush_ops,
+ .iommu_dev = priv->dev,
+ };
+
+ priv->iop = alloc_io_pgtable_ops(ARM_V7S, &priv->cfg, priv);
+ if (!priv->iop) {
+ dev_err(priv->dev, "Failed to allocate pgtable\n");
+ return -EINVAL;
+ }
+
+ msm_iommu_ops.pgsize_bitmap = priv->cfg.pgsize_bitmap;
+
+ return 0;
+}
+
+/* Must be called under msm_iommu_lock */
+static struct msm_iommu_dev *find_iommu_for_dev(struct device *dev)
+{
+ struct msm_iommu_dev *iommu, *ret = NULL;
+ struct msm_iommu_ctx_dev *master;
+
+ list_for_each_entry(iommu, &qcom_iommu_devices, dev_node) {
+ master = list_first_entry(&iommu->ctx_list,
+ struct msm_iommu_ctx_dev,
+ list);
+ if (master->of_node == dev->of_node) {
+ ret = iommu;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static struct iommu_device *msm_iommu_probe_device(struct device *dev)
+{
+ struct msm_iommu_dev *iommu;
+ unsigned long flags;
+
+ spin_lock_irqsave(&msm_iommu_lock, flags);
+ iommu = find_iommu_for_dev(dev);
+ spin_unlock_irqrestore(&msm_iommu_lock, flags);
+
+ if (!iommu)
+ return ERR_PTR(-ENODEV);
+
+ return &iommu->iommu;
+}
+
+static void msm_iommu_release_device(struct device *dev)
+{
+}
+
+static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct msm_iommu_dev *iommu;
+ struct msm_priv *priv = to_msm_priv(domain);
+ struct msm_iommu_ctx_dev *master;
+
+ priv->dev = dev;
+ msm_iommu_domain_config(priv);
+
+ spin_lock_irqsave(&msm_iommu_lock, flags);
+ list_for_each_entry(iommu, &qcom_iommu_devices, dev_node) {
+ master = list_first_entry(&iommu->ctx_list,
+ struct msm_iommu_ctx_dev,
+ list);
+ if (master->of_node == dev->of_node) {
+ ret = __enable_clocks(iommu);
+ if (ret)
+ goto fail;
+
+ list_for_each_entry(master, &iommu->ctx_list, list) {
+ if (master->num) {
+ dev_err(dev, "domain already attached");
+ ret = -EEXIST;
+ goto fail;
+ }
+ master->num =
+ msm_iommu_alloc_ctx(iommu->context_map,
+ 0, iommu->ncb);
+ if (IS_ERR_VALUE(master->num)) {
+ ret = -ENODEV;
+ goto fail;
+ }
+ config_mids(iommu, master);
+ __program_context(iommu->base, master->num,
+ priv);
+ }
+ __disable_clocks(iommu);
+ list_add(&iommu->dom_node, &priv->list_attached);
+ }
+ }
+
+fail:
+ spin_unlock_irqrestore(&msm_iommu_lock, flags);
+
+ return ret;
+}
+
+static void msm_iommu_detach_dev(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct msm_priv *priv = to_msm_priv(domain);
+ unsigned long flags;
+ struct msm_iommu_dev *iommu;
+ struct msm_iommu_ctx_dev *master;
+ int ret;
+
+ free_io_pgtable_ops(priv->iop);
+
+ spin_lock_irqsave(&msm_iommu_lock, flags);
+ list_for_each_entry(iommu, &priv->list_attached, dom_node) {
+ ret = __enable_clocks(iommu);
+ if (ret)
+ goto fail;
+
+ list_for_each_entry(master, &iommu->ctx_list, list) {
+ msm_iommu_free_ctx(iommu->context_map, master->num);
+ __reset_context(iommu->base, master->num);
+ }
+ __disable_clocks(iommu);
+ }
+fail:
+ spin_unlock_irqrestore(&msm_iommu_lock, flags);
+}
+
+static int msm_iommu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t pa, size_t len, int prot, gfp_t gfp)
+{
+ struct msm_priv *priv = to_msm_priv(domain);
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&priv->pgtlock, flags);
+ ret = priv->iop->map(priv->iop, iova, pa, len, prot, GFP_ATOMIC);
+ spin_unlock_irqrestore(&priv->pgtlock, flags);
+
+ return ret;
+}
+
+static size_t msm_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
+ size_t len, struct iommu_iotlb_gather *gather)
+{
+ struct msm_priv *priv = to_msm_priv(domain);
+ unsigned long flags;
+
+ spin_lock_irqsave(&priv->pgtlock, flags);
+ len = priv->iop->unmap(priv->iop, iova, len, gather);
+ spin_unlock_irqrestore(&priv->pgtlock, flags);
+
+ return len;
+}
+
+static phys_addr_t msm_iommu_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t va)
+{
+ struct msm_priv *priv;
+ struct msm_iommu_dev *iommu;
+ struct msm_iommu_ctx_dev *master;
+ unsigned int par;
+ unsigned long flags;
+ phys_addr_t ret = 0;
+
+ spin_lock_irqsave(&msm_iommu_lock, flags);
+
+ priv = to_msm_priv(domain);
+ iommu = list_first_entry(&priv->list_attached,
+ struct msm_iommu_dev, dom_node);
+
+ if (list_empty(&iommu->ctx_list))
+ goto fail;
+
+ master = list_first_entry(&iommu->ctx_list,
+ struct msm_iommu_ctx_dev, list);
+ if (!master)
+ goto fail;
+
+ ret = __enable_clocks(iommu);
+ if (ret)
+ goto fail;
+
+ /* Invalidate context TLB */
+ SET_CTX_TLBIALL(iommu->base, master->num, 0);
+ SET_V2PPR(iommu->base, master->num, va & V2Pxx_VA);
+
+ par = GET_PAR(iommu->base, master->num);
+
+ /* We are dealing with a supersection */
+ if (GET_NOFAULT_SS(iommu->base, master->num))
+ ret = (par & 0xFF000000) | (va & 0x00FFFFFF);
+ else /* Upper 20 bits from PAR, lower 12 from VA */
+ ret = (par & 0xFFFFF000) | (va & 0x00000FFF);
+
+ if (GET_FAULT(iommu->base, master->num))
+ ret = 0;
+
+ __disable_clocks(iommu);
+fail:
+ spin_unlock_irqrestore(&msm_iommu_lock, flags);
+ return ret;
+}
+
+static bool msm_iommu_capable(enum iommu_cap cap)
+{
+ return false;
+}
+
+static void print_ctx_regs(void __iomem *base, int ctx)
+{
+ unsigned int fsr = GET_FSR(base, ctx);
+ pr_err("FAR = %08x PAR = %08x\n",
+ GET_FAR(base, ctx), GET_PAR(base, ctx));
+ pr_err("FSR = %08x [%s%s%s%s%s%s%s%s%s%s]\n", fsr,
+ (fsr & 0x02) ? "TF " : "",
+ (fsr & 0x04) ? "AFF " : "",
+ (fsr & 0x08) ? "APF " : "",
+ (fsr & 0x10) ? "TLBMF " : "",
+ (fsr & 0x20) ? "HTWDEEF " : "",
+ (fsr & 0x40) ? "HTWSEEF " : "",
+ (fsr & 0x80) ? "MHF " : "",
+ (fsr & 0x10000) ? "SL " : "",
+ (fsr & 0x40000000) ? "SS " : "",
+ (fsr & 0x80000000) ? "MULTI " : "");
+
+ pr_err("FSYNR0 = %08x FSYNR1 = %08x\n",
+ GET_FSYNR0(base, ctx), GET_FSYNR1(base, ctx));
+ pr_err("TTBR0 = %08x TTBR1 = %08x\n",
+ GET_TTBR0(base, ctx), GET_TTBR1(base, ctx));
+ pr_err("SCTLR = %08x ACTLR = %08x\n",
+ GET_SCTLR(base, ctx), GET_ACTLR(base, ctx));
+}
+
+static void insert_iommu_master(struct device *dev,
+ struct msm_iommu_dev **iommu,
+ struct of_phandle_args *spec)
+{
+ struct msm_iommu_ctx_dev *master = dev_iommu_priv_get(dev);
+ int sid;
+
+ if (list_empty(&(*iommu)->ctx_list)) {
+ master = kzalloc(sizeof(*master), GFP_ATOMIC);
+ master->of_node = dev->of_node;
+ list_add(&master->list, &(*iommu)->ctx_list);
+ dev_iommu_priv_set(dev, master);
+ }
+
+ for (sid = 0; sid < master->num_mids; sid++)
+ if (master->mids[sid] == spec->args[0]) {
+ dev_warn(dev, "Stream ID 0x%hx repeated; ignoring\n",
+ sid);
+ return;
+ }
+
+ master->mids[master->num_mids++] = spec->args[0];
+}
+
+static int qcom_iommu_of_xlate(struct device *dev,
+ struct of_phandle_args *spec)
+{
+ struct msm_iommu_dev *iommu = NULL, *iter;
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&msm_iommu_lock, flags);
+ list_for_each_entry(iter, &qcom_iommu_devices, dev_node) {
+ if (iter->dev->of_node == spec->np) {
+ iommu = iter;
+ break;
+ }
+ }
+
+ if (!iommu) {
+ ret = -ENODEV;
+ goto fail;
+ }
+
+ insert_iommu_master(dev, &iommu, spec);
+fail:
+ spin_unlock_irqrestore(&msm_iommu_lock, flags);
+
+ return ret;
+}
+
+irqreturn_t msm_iommu_fault_handler(int irq, void *dev_id)
+{
+ struct msm_iommu_dev *iommu = dev_id;
+ unsigned int fsr;
+ int i, ret;
+
+ spin_lock(&msm_iommu_lock);
+
+ if (!iommu) {
+ pr_err("Invalid device ID in context interrupt handler\n");
+ goto fail;
+ }
+
+ pr_err("Unexpected IOMMU page fault!\n");
+ pr_err("base = %08x\n", (unsigned int)iommu->base);
+
+ ret = __enable_clocks(iommu);
+ if (ret)
+ goto fail;
+
+ for (i = 0; i < iommu->ncb; i++) {
+ fsr = GET_FSR(iommu->base, i);
+ if (fsr) {
+ pr_err("Fault occurred in context %d.\n", i);
+ pr_err("Interesting registers:\n");
+ print_ctx_regs(iommu->base, i);
+ SET_FSR(iommu->base, i, 0x4000000F);
+ }
+ }
+ __disable_clocks(iommu);
+fail:
+ spin_unlock(&msm_iommu_lock);
+ return 0;
+}
+
+static struct iommu_ops msm_iommu_ops = {
+ .capable = msm_iommu_capable,
+ .domain_alloc = msm_iommu_domain_alloc,
+ .domain_free = msm_iommu_domain_free,
+ .attach_dev = msm_iommu_attach_dev,
+ .detach_dev = msm_iommu_detach_dev,
+ .map = msm_iommu_map,
+ .unmap = msm_iommu_unmap,
+ /*
+ * Nothing is needed here, the barrier to guarantee
+ * completion of the tlb sync operation is implicitly
+ * taken care when the iommu client does a writel before
+ * kick starting the other master.
+ */
+ .iotlb_sync = NULL,
+ .iova_to_phys = msm_iommu_iova_to_phys,
+ .probe_device = msm_iommu_probe_device,
+ .release_device = msm_iommu_release_device,
+ .device_group = generic_device_group,
+ .pgsize_bitmap = MSM_IOMMU_PGSIZES,
+ .of_xlate = qcom_iommu_of_xlate,
+};
+
+static int msm_iommu_probe(struct platform_device *pdev)
+{
+ struct resource *r;
+ resource_size_t ioaddr;
+ struct msm_iommu_dev *iommu;
+ int ret, par, val;
+
+ iommu = devm_kzalloc(&pdev->dev, sizeof(*iommu), GFP_KERNEL);
+ if (!iommu)
+ return -ENODEV;
+
+ iommu->dev = &pdev->dev;
+ INIT_LIST_HEAD(&iommu->ctx_list);
+
+ iommu->pclk = devm_clk_get(iommu->dev, "smmu_pclk");
+ if (IS_ERR(iommu->pclk)) {
+ dev_err(iommu->dev, "could not get smmu_pclk\n");
+ return PTR_ERR(iommu->pclk);
+ }
+
+ ret = clk_prepare(iommu->pclk);
+ if (ret) {
+ dev_err(iommu->dev, "could not prepare smmu_pclk\n");
+ return ret;
+ }
+
+ iommu->clk = devm_clk_get(iommu->dev, "iommu_clk");
+ if (IS_ERR(iommu->clk)) {
+ dev_err(iommu->dev, "could not get iommu_clk\n");
+ clk_unprepare(iommu->pclk);
+ return PTR_ERR(iommu->clk);
+ }
+
+ ret = clk_prepare(iommu->clk);
+ if (ret) {
+ dev_err(iommu->dev, "could not prepare iommu_clk\n");
+ clk_unprepare(iommu->pclk);
+ return ret;
+ }
+
+ r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ iommu->base = devm_ioremap_resource(iommu->dev, r);
+ if (IS_ERR(iommu->base)) {
+ dev_err(iommu->dev, "could not get iommu base\n");
+ ret = PTR_ERR(iommu->base);
+ goto fail;
+ }
+ ioaddr = r->start;
+
+ iommu->irq = platform_get_irq(pdev, 0);
+ if (iommu->irq < 0) {
+ ret = -ENODEV;
+ goto fail;
+ }
+
+ ret = of_property_read_u32(iommu->dev->of_node, "qcom,ncb", &val);
+ if (ret) {
+ dev_err(iommu->dev, "could not get ncb\n");
+ goto fail;
+ }
+ iommu->ncb = val;
+
+ msm_iommu_reset(iommu->base, iommu->ncb);
+ SET_M(iommu->base, 0, 1);
+ SET_PAR(iommu->base, 0, 0);
+ SET_V2PCFG(iommu->base, 0, 1);
+ SET_V2PPR(iommu->base, 0, 0);
+ par = GET_PAR(iommu->base, 0);
+ SET_V2PCFG(iommu->base, 0, 0);
+ SET_M(iommu->base, 0, 0);
+
+ if (!par) {
+ pr_err("Invalid PAR value detected\n");
+ ret = -ENODEV;
+ goto fail;
+ }
+
+ ret = devm_request_threaded_irq(iommu->dev, iommu->irq, NULL,
+ msm_iommu_fault_handler,
+ IRQF_ONESHOT | IRQF_SHARED,
+ "msm_iommu_secure_irpt_handler",
+ iommu);
+ if (ret) {
+ pr_err("Request IRQ %d failed with ret=%d\n", iommu->irq, ret);
+ goto fail;
+ }
+
+ list_add(&iommu->dev_node, &qcom_iommu_devices);
+
+ ret = iommu_device_sysfs_add(&iommu->iommu, iommu->dev, NULL,
+ "msm-smmu.%pa", &ioaddr);
+ if (ret) {
+ pr_err("Could not add msm-smmu at %pa to sysfs\n", &ioaddr);
+ goto fail;
+ }
+
+ iommu_device_set_ops(&iommu->iommu, &msm_iommu_ops);
+ iommu_device_set_fwnode(&iommu->iommu, &pdev->dev.of_node->fwnode);
+
+ ret = iommu_device_register(&iommu->iommu);
+ if (ret) {
+ pr_err("Could not register msm-smmu at %pa\n", &ioaddr);
+ goto fail;
+ }
+
+ bus_set_iommu(&platform_bus_type, &msm_iommu_ops);
+
+ pr_info("device mapped at %p, irq %d with %d ctx banks\n",
+ iommu->base, iommu->irq, iommu->ncb);
+
+ return ret;
+fail:
+ clk_unprepare(iommu->clk);
+ clk_unprepare(iommu->pclk);
+ return ret;
+}
+
+static const struct of_device_id msm_iommu_dt_match[] = {
+ { .compatible = "qcom,apq8064-iommu" },
+ {}
+};
+
+static int msm_iommu_remove(struct platform_device *pdev)
+{
+ struct msm_iommu_dev *iommu = platform_get_drvdata(pdev);
+
+ clk_unprepare(iommu->clk);
+ clk_unprepare(iommu->pclk);
+ return 0;
+}
+
+static struct platform_driver msm_iommu_driver = {
+ .driver = {
+ .name = "msm_iommu",
+ .of_match_table = msm_iommu_dt_match,
+ },
+ .probe = msm_iommu_probe,
+ .remove = msm_iommu_remove,
+};
+
+static int __init msm_iommu_driver_init(void)
+{
+ int ret;
+
+ ret = platform_driver_register(&msm_iommu_driver);
+ if (ret != 0)
+ pr_err("Failed to register IOMMU driver\n");
+
+ return ret;
+}
+subsys_initcall(msm_iommu_driver_init);
+
diff --git a/drivers/iommu/msm_iommu.h b/drivers/iommu/msm_iommu.h
new file mode 100644
index 000000000..ddae37809
--- /dev/null
+++ b/drivers/iommu/msm_iommu.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved.
+ */
+
+#ifndef MSM_IOMMU_H
+#define MSM_IOMMU_H
+
+#include <linux/interrupt.h>
+#include <linux/iommu.h>
+#include <linux/clk.h>
+
+/* Sharability attributes of MSM IOMMU mappings */
+#define MSM_IOMMU_ATTR_NON_SH 0x0
+#define MSM_IOMMU_ATTR_SH 0x4
+
+/* Cacheability attributes of MSM IOMMU mappings */
+#define MSM_IOMMU_ATTR_NONCACHED 0x0
+#define MSM_IOMMU_ATTR_CACHED_WB_WA 0x1
+#define MSM_IOMMU_ATTR_CACHED_WB_NWA 0x2
+#define MSM_IOMMU_ATTR_CACHED_WT 0x3
+
+/* Mask for the cache policy attribute */
+#define MSM_IOMMU_CP_MASK 0x03
+
+/* Maximum number of Machine IDs that we are allowing to be mapped to the same
+ * context bank. The number of MIDs mapped to the same CB does not affect
+ * performance, but there is a practical limit on how many distinct MIDs may
+ * be present. These mappings are typically determined at design time and are
+ * not expected to change at run time.
+ */
+#define MAX_NUM_MIDS 32
+
+/* Maximum number of context banks that can be present in IOMMU */
+#define IOMMU_MAX_CBS 128
+
+/**
+ * struct msm_iommu_dev - a single IOMMU hardware instance
+ * ncb Number of context banks present on this IOMMU HW instance
+ * dev: IOMMU device
+ * irq: Interrupt number
+ * clk: The bus clock for this IOMMU hardware instance
+ * pclk: The clock for the IOMMU bus interconnect
+ * dev_node: list head in qcom_iommu_device_list
+ * dom_node: list head for domain
+ * ctx_list: list of 'struct msm_iommu_ctx_dev'
+ * context_map: Bitmap to track allocated context banks
+ */
+struct msm_iommu_dev {
+ void __iomem *base;
+ int ncb;
+ struct device *dev;
+ int irq;
+ struct clk *clk;
+ struct clk *pclk;
+ struct list_head dev_node;
+ struct list_head dom_node;
+ struct list_head ctx_list;
+ DECLARE_BITMAP(context_map, IOMMU_MAX_CBS);
+
+ struct iommu_device iommu;
+};
+
+/**
+ * struct msm_iommu_ctx_dev - an IOMMU context bank instance
+ * of_node node ptr of client device
+ * num Index of this context bank within the hardware
+ * mids List of Machine IDs that are to be mapped into this context
+ * bank, terminated by -1. The MID is a set of signals on the
+ * AXI bus that identifies the function associated with a specific
+ * memory request. (See ARM spec).
+ * num_mids Total number of mids
+ * node list head in ctx_list
+ */
+struct msm_iommu_ctx_dev {
+ struct device_node *of_node;
+ int num;
+ int mids[MAX_NUM_MIDS];
+ int num_mids;
+ struct list_head list;
+};
+
+/*
+ * Interrupt handler for the IOMMU context fault interrupt. Hooking the
+ * interrupt is not supported in the API yet, but this will print an error
+ * message and dump useful IOMMU registers.
+ */
+irqreturn_t msm_iommu_fault_handler(int irq, void *dev_id);
+
+#endif
diff --git a/drivers/iommu/msm_iommu_hw-8xxx.h b/drivers/iommu/msm_iommu_hw-8xxx.h
new file mode 100644
index 000000000..cd957c75b
--- /dev/null
+++ b/drivers/iommu/msm_iommu_hw-8xxx.h
@@ -0,0 +1,1852 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved.
+ */
+
+#ifndef __ARCH_ARM_MACH_MSM_IOMMU_HW_8XXX_H
+#define __ARCH_ARM_MACH_MSM_IOMMU_HW_8XXX_H
+
+#define CTX_SHIFT 12
+
+#define GET_GLOBAL_REG(reg, base) (readl((base) + (reg)))
+#define GET_CTX_REG(reg, base, ctx) \
+ (readl((base) + (reg) + ((ctx) << CTX_SHIFT)))
+
+#define SET_GLOBAL_REG(reg, base, val) writel((val), ((base) + (reg)))
+
+#define SET_CTX_REG(reg, base, ctx, val) \
+ writel((val), ((base) + (reg) + ((ctx) << CTX_SHIFT)))
+
+/* Wrappers for numbered registers */
+#define SET_GLOBAL_REG_N(b, n, r, v) SET_GLOBAL_REG(b, ((r) + (n << 2)), (v))
+#define GET_GLOBAL_REG_N(b, n, r) GET_GLOBAL_REG(b, ((r) + (n << 2)))
+
+/* Field wrappers */
+#define GET_GLOBAL_FIELD(b, r, F) GET_FIELD(((b) + (r)), F##_MASK, F##_SHIFT)
+#define GET_CONTEXT_FIELD(b, c, r, F) \
+ GET_FIELD(((b) + (r) + ((c) << CTX_SHIFT)), F##_MASK, F##_SHIFT)
+
+#define SET_GLOBAL_FIELD(b, r, F, v) \
+ SET_FIELD(((b) + (r)), F##_MASK, F##_SHIFT, (v))
+#define SET_CONTEXT_FIELD(b, c, r, F, v) \
+ SET_FIELD(((b) + (r) + ((c) << CTX_SHIFT)), F##_MASK, F##_SHIFT, (v))
+
+#define GET_FIELD(addr, mask, shift) ((readl(addr) >> (shift)) & (mask))
+
+#define SET_FIELD(addr, mask, shift, v) \
+do { \
+ int t = readl(addr); \
+ writel((t & ~((mask) << (shift))) + (((v) & (mask)) << (shift)), addr);\
+} while (0)
+
+
+#define NUM_FL_PTE 4096
+#define NUM_SL_PTE 256
+#define NUM_TEX_CLASS 8
+
+/* First-level page table bits */
+#define FL_BASE_MASK 0xFFFFFC00
+#define FL_TYPE_TABLE (1 << 0)
+#define FL_TYPE_SECT (2 << 0)
+#define FL_SUPERSECTION (1 << 18)
+#define FL_AP_WRITE (1 << 10)
+#define FL_AP_READ (1 << 11)
+#define FL_SHARED (1 << 16)
+#define FL_BUFFERABLE (1 << 2)
+#define FL_CACHEABLE (1 << 3)
+#define FL_TEX0 (1 << 12)
+#define FL_OFFSET(va) (((va) & 0xFFF00000) >> 20)
+#define FL_NG (1 << 17)
+
+/* Second-level page table bits */
+#define SL_BASE_MASK_LARGE 0xFFFF0000
+#define SL_BASE_MASK_SMALL 0xFFFFF000
+#define SL_TYPE_LARGE (1 << 0)
+#define SL_TYPE_SMALL (2 << 0)
+#define SL_AP0 (1 << 4)
+#define SL_AP1 (2 << 4)
+#define SL_SHARED (1 << 10)
+#define SL_BUFFERABLE (1 << 2)
+#define SL_CACHEABLE (1 << 3)
+#define SL_TEX0 (1 << 6)
+#define SL_OFFSET(va) (((va) & 0xFF000) >> 12)
+#define SL_NG (1 << 11)
+
+/* Memory type and cache policy attributes */
+#define MT_SO 0
+#define MT_DEV 1
+#define MT_NORMAL 2
+#define CP_NONCACHED 0
+#define CP_WB_WA 1
+#define CP_WT 2
+#define CP_WB_NWA 3
+
+/* Global register setters / getters */
+#define SET_M2VCBR_N(b, N, v) SET_GLOBAL_REG_N(M2VCBR_N, N, (b), (v))
+#define SET_CBACR_N(b, N, v) SET_GLOBAL_REG_N(CBACR_N, N, (b), (v))
+#define SET_TLBRSW(b, v) SET_GLOBAL_REG(TLBRSW, (b), (v))
+#define SET_TLBTR0(b, v) SET_GLOBAL_REG(TLBTR0, (b), (v))
+#define SET_TLBTR1(b, v) SET_GLOBAL_REG(TLBTR1, (b), (v))
+#define SET_TLBTR2(b, v) SET_GLOBAL_REG(TLBTR2, (b), (v))
+#define SET_TESTBUSCR(b, v) SET_GLOBAL_REG(TESTBUSCR, (b), (v))
+#define SET_GLOBAL_TLBIALL(b, v) SET_GLOBAL_REG(GLOBAL_TLBIALL, (b), (v))
+#define SET_TLBIVMID(b, v) SET_GLOBAL_REG(TLBIVMID, (b), (v))
+#define SET_CR(b, v) SET_GLOBAL_REG(CR, (b), (v))
+#define SET_EAR(b, v) SET_GLOBAL_REG(EAR, (b), (v))
+#define SET_ESR(b, v) SET_GLOBAL_REG(ESR, (b), (v))
+#define SET_ESRRESTORE(b, v) SET_GLOBAL_REG(ESRRESTORE, (b), (v))
+#define SET_ESYNR0(b, v) SET_GLOBAL_REG(ESYNR0, (b), (v))
+#define SET_ESYNR1(b, v) SET_GLOBAL_REG(ESYNR1, (b), (v))
+#define SET_RPU_ACR(b, v) SET_GLOBAL_REG(RPU_ACR, (b), (v))
+
+#define GET_M2VCBR_N(b, N) GET_GLOBAL_REG_N(M2VCBR_N, N, (b))
+#define GET_CBACR_N(b, N) GET_GLOBAL_REG_N(CBACR_N, N, (b))
+#define GET_TLBTR0(b) GET_GLOBAL_REG(TLBTR0, (b))
+#define GET_TLBTR1(b) GET_GLOBAL_REG(TLBTR1, (b))
+#define GET_TLBTR2(b) GET_GLOBAL_REG(TLBTR2, (b))
+#define GET_TESTBUSCR(b) GET_GLOBAL_REG(TESTBUSCR, (b))
+#define GET_GLOBAL_TLBIALL(b) GET_GLOBAL_REG(GLOBAL_TLBIALL, (b))
+#define GET_TLBIVMID(b) GET_GLOBAL_REG(TLBIVMID, (b))
+#define GET_CR(b) GET_GLOBAL_REG(CR, (b))
+#define GET_EAR(b) GET_GLOBAL_REG(EAR, (b))
+#define GET_ESR(b) GET_GLOBAL_REG(ESR, (b))
+#define GET_ESRRESTORE(b) GET_GLOBAL_REG(ESRRESTORE, (b))
+#define GET_ESYNR0(b) GET_GLOBAL_REG(ESYNR0, (b))
+#define GET_ESYNR1(b) GET_GLOBAL_REG(ESYNR1, (b))
+#define GET_REV(b) GET_GLOBAL_REG(REV, (b))
+#define GET_IDR(b) GET_GLOBAL_REG(IDR, (b))
+#define GET_RPU_ACR(b) GET_GLOBAL_REG(RPU_ACR, (b))
+
+
+/* Context register setters/getters */
+#define SET_SCTLR(b, c, v) SET_CTX_REG(SCTLR, (b), (c), (v))
+#define SET_ACTLR(b, c, v) SET_CTX_REG(ACTLR, (b), (c), (v))
+#define SET_CONTEXTIDR(b, c, v) SET_CTX_REG(CONTEXTIDR, (b), (c), (v))
+#define SET_TTBR0(b, c, v) SET_CTX_REG(TTBR0, (b), (c), (v))
+#define SET_TTBR1(b, c, v) SET_CTX_REG(TTBR1, (b), (c), (v))
+#define SET_TTBCR(b, c, v) SET_CTX_REG(TTBCR, (b), (c), (v))
+#define SET_PAR(b, c, v) SET_CTX_REG(PAR, (b), (c), (v))
+#define SET_FSR(b, c, v) SET_CTX_REG(FSR, (b), (c), (v))
+#define SET_FSRRESTORE(b, c, v) SET_CTX_REG(FSRRESTORE, (b), (c), (v))
+#define SET_FAR(b, c, v) SET_CTX_REG(FAR, (b), (c), (v))
+#define SET_FSYNR0(b, c, v) SET_CTX_REG(FSYNR0, (b), (c), (v))
+#define SET_FSYNR1(b, c, v) SET_CTX_REG(FSYNR1, (b), (c), (v))
+#define SET_PRRR(b, c, v) SET_CTX_REG(PRRR, (b), (c), (v))
+#define SET_NMRR(b, c, v) SET_CTX_REG(NMRR, (b), (c), (v))
+#define SET_TLBLKCR(b, c, v) SET_CTX_REG(TLBLCKR, (b), (c), (v))
+#define SET_V2PSR(b, c, v) SET_CTX_REG(V2PSR, (b), (c), (v))
+#define SET_TLBFLPTER(b, c, v) SET_CTX_REG(TLBFLPTER, (b), (c), (v))
+#define SET_TLBSLPTER(b, c, v) SET_CTX_REG(TLBSLPTER, (b), (c), (v))
+#define SET_BFBCR(b, c, v) SET_CTX_REG(BFBCR, (b), (c), (v))
+#define SET_CTX_TLBIALL(b, c, v) SET_CTX_REG(CTX_TLBIALL, (b), (c), (v))
+#define SET_TLBIASID(b, c, v) SET_CTX_REG(TLBIASID, (b), (c), (v))
+#define SET_TLBIVA(b, c, v) SET_CTX_REG(TLBIVA, (b), (c), (v))
+#define SET_TLBIVAA(b, c, v) SET_CTX_REG(TLBIVAA, (b), (c), (v))
+#define SET_V2PPR(b, c, v) SET_CTX_REG(V2PPR, (b), (c), (v))
+#define SET_V2PPW(b, c, v) SET_CTX_REG(V2PPW, (b), (c), (v))
+#define SET_V2PUR(b, c, v) SET_CTX_REG(V2PUR, (b), (c), (v))
+#define SET_V2PUW(b, c, v) SET_CTX_REG(V2PUW, (b), (c), (v))
+#define SET_RESUME(b, c, v) SET_CTX_REG(RESUME, (b), (c), (v))
+
+#define GET_SCTLR(b, c) GET_CTX_REG(SCTLR, (b), (c))
+#define GET_ACTLR(b, c) GET_CTX_REG(ACTLR, (b), (c))
+#define GET_CONTEXTIDR(b, c) GET_CTX_REG(CONTEXTIDR, (b), (c))
+#define GET_TTBR0(b, c) GET_CTX_REG(TTBR0, (b), (c))
+#define GET_TTBR1(b, c) GET_CTX_REG(TTBR1, (b), (c))
+#define GET_TTBCR(b, c) GET_CTX_REG(TTBCR, (b), (c))
+#define GET_PAR(b, c) GET_CTX_REG(PAR, (b), (c))
+#define GET_FSR(b, c) GET_CTX_REG(FSR, (b), (c))
+#define GET_FSRRESTORE(b, c) GET_CTX_REG(FSRRESTORE, (b), (c))
+#define GET_FAR(b, c) GET_CTX_REG(FAR, (b), (c))
+#define GET_FSYNR0(b, c) GET_CTX_REG(FSYNR0, (b), (c))
+#define GET_FSYNR1(b, c) GET_CTX_REG(FSYNR1, (b), (c))
+#define GET_PRRR(b, c) GET_CTX_REG(PRRR, (b), (c))
+#define GET_NMRR(b, c) GET_CTX_REG(NMRR, (b), (c))
+#define GET_TLBLCKR(b, c) GET_CTX_REG(TLBLCKR, (b), (c))
+#define GET_V2PSR(b, c) GET_CTX_REG(V2PSR, (b), (c))
+#define GET_TLBFLPTER(b, c) GET_CTX_REG(TLBFLPTER, (b), (c))
+#define GET_TLBSLPTER(b, c) GET_CTX_REG(TLBSLPTER, (b), (c))
+#define GET_BFBCR(b, c) GET_CTX_REG(BFBCR, (b), (c))
+#define GET_CTX_TLBIALL(b, c) GET_CTX_REG(CTX_TLBIALL, (b), (c))
+#define GET_TLBIASID(b, c) GET_CTX_REG(TLBIASID, (b), (c))
+#define GET_TLBIVA(b, c) GET_CTX_REG(TLBIVA, (b), (c))
+#define GET_TLBIVAA(b, c) GET_CTX_REG(TLBIVAA, (b), (c))
+#define GET_V2PPR(b, c) GET_CTX_REG(V2PPR, (b), (c))
+#define GET_V2PPW(b, c) GET_CTX_REG(V2PPW, (b), (c))
+#define GET_V2PUR(b, c) GET_CTX_REG(V2PUR, (b), (c))
+#define GET_V2PUW(b, c) GET_CTX_REG(V2PUW, (b), (c))
+#define GET_RESUME(b, c) GET_CTX_REG(RESUME, (b), (c))
+
+
+/* Global field setters / getters */
+/* Global Field Setters: */
+/* CBACR_N */
+#define SET_RWVMID(b, n, v) SET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), RWVMID, v)
+#define SET_RWE(b, n, v) SET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), RWE, v)
+#define SET_RWGE(b, n, v) SET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), RWGE, v)
+#define SET_CBVMID(b, n, v) SET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), CBVMID, v)
+#define SET_IRPTNDX(b, n, v) SET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), IRPTNDX, v)
+
+
+/* M2VCBR_N */
+#define SET_VMID(b, n, v) SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), VMID, v)
+#define SET_CBNDX(b, n, v) SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), CBNDX, v)
+#define SET_BYPASSD(b, n, v) SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BYPASSD, v)
+#define SET_BPRCOSH(b, n, v) SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPRCOSH, v)
+#define SET_BPRCISH(b, n, v) SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPRCISH, v)
+#define SET_BPRCNSH(b, n, v) SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPRCNSH, v)
+#define SET_BPSHCFG(b, n, v) SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPSHCFG, v)
+#define SET_NSCFG(b, n, v) SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), NSCFG, v)
+#define SET_BPMTCFG(b, n, v) SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPMTCFG, v)
+#define SET_BPMEMTYPE(b, n, v) \
+ SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPMEMTYPE, v)
+
+
+/* CR */
+#define SET_RPUE(b, v) SET_GLOBAL_FIELD(b, CR, RPUE, v)
+#define SET_RPUERE(b, v) SET_GLOBAL_FIELD(b, CR, RPUERE, v)
+#define SET_RPUEIE(b, v) SET_GLOBAL_FIELD(b, CR, RPUEIE, v)
+#define SET_DCDEE(b, v) SET_GLOBAL_FIELD(b, CR, DCDEE, v)
+#define SET_CLIENTPD(b, v) SET_GLOBAL_FIELD(b, CR, CLIENTPD, v)
+#define SET_STALLD(b, v) SET_GLOBAL_FIELD(b, CR, STALLD, v)
+#define SET_TLBLKCRWE(b, v) SET_GLOBAL_FIELD(b, CR, TLBLKCRWE, v)
+#define SET_CR_TLBIALLCFG(b, v) SET_GLOBAL_FIELD(b, CR, CR_TLBIALLCFG, v)
+#define SET_TLBIVMIDCFG(b, v) SET_GLOBAL_FIELD(b, CR, TLBIVMIDCFG, v)
+#define SET_CR_HUME(b, v) SET_GLOBAL_FIELD(b, CR, CR_HUME, v)
+
+
+/* ESR */
+#define SET_CFG(b, v) SET_GLOBAL_FIELD(b, ESR, CFG, v)
+#define SET_BYPASS(b, v) SET_GLOBAL_FIELD(b, ESR, BYPASS, v)
+#define SET_ESR_MULTI(b, v) SET_GLOBAL_FIELD(b, ESR, ESR_MULTI, v)
+
+
+/* ESYNR0 */
+#define SET_ESYNR0_AMID(b, v) SET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_AMID, v)
+#define SET_ESYNR0_APID(b, v) SET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_APID, v)
+#define SET_ESYNR0_ABID(b, v) SET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_ABID, v)
+#define SET_ESYNR0_AVMID(b, v) SET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_AVMID, v)
+#define SET_ESYNR0_ATID(b, v) SET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_ATID, v)
+
+
+/* ESYNR1 */
+#define SET_ESYNR1_AMEMTYPE(b, v) \
+ SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AMEMTYPE, v)
+#define SET_ESYNR1_ASHARED(b, v) SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ASHARED, v)
+#define SET_ESYNR1_AINNERSHARED(b, v) \
+ SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AINNERSHARED, v)
+#define SET_ESYNR1_APRIV(b, v) SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_APRIV, v)
+#define SET_ESYNR1_APROTNS(b, v) SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_APROTNS, v)
+#define SET_ESYNR1_AINST(b, v) SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AINST, v)
+#define SET_ESYNR1_AWRITE(b, v) SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AWRITE, v)
+#define SET_ESYNR1_ABURST(b, v) SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ABURST, v)
+#define SET_ESYNR1_ALEN(b, v) SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ALEN, v)
+#define SET_ESYNR1_ASIZE(b, v) SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ASIZE, v)
+#define SET_ESYNR1_ALOCK(b, v) SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ALOCK, v)
+#define SET_ESYNR1_AOOO(b, v) SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AOOO, v)
+#define SET_ESYNR1_AFULL(b, v) SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AFULL, v)
+#define SET_ESYNR1_AC(b, v) SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AC, v)
+#define SET_ESYNR1_DCD(b, v) SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_DCD, v)
+
+
+/* TESTBUSCR */
+#define SET_TBE(b, v) SET_GLOBAL_FIELD(b, TESTBUSCR, TBE, v)
+#define SET_SPDMBE(b, v) SET_GLOBAL_FIELD(b, TESTBUSCR, SPDMBE, v)
+#define SET_WGSEL(b, v) SET_GLOBAL_FIELD(b, TESTBUSCR, WGSEL, v)
+#define SET_TBLSEL(b, v) SET_GLOBAL_FIELD(b, TESTBUSCR, TBLSEL, v)
+#define SET_TBHSEL(b, v) SET_GLOBAL_FIELD(b, TESTBUSCR, TBHSEL, v)
+#define SET_SPDM0SEL(b, v) SET_GLOBAL_FIELD(b, TESTBUSCR, SPDM0SEL, v)
+#define SET_SPDM1SEL(b, v) SET_GLOBAL_FIELD(b, TESTBUSCR, SPDM1SEL, v)
+#define SET_SPDM2SEL(b, v) SET_GLOBAL_FIELD(b, TESTBUSCR, SPDM2SEL, v)
+#define SET_SPDM3SEL(b, v) SET_GLOBAL_FIELD(b, TESTBUSCR, SPDM3SEL, v)
+
+
+/* TLBIVMID */
+#define SET_TLBIVMID_VMID(b, v) SET_GLOBAL_FIELD(b, TLBIVMID, TLBIVMID_VMID, v)
+
+
+/* TLBRSW */
+#define SET_TLBRSW_INDEX(b, v) SET_GLOBAL_FIELD(b, TLBRSW, TLBRSW_INDEX, v)
+#define SET_TLBBFBS(b, v) SET_GLOBAL_FIELD(b, TLBRSW, TLBBFBS, v)
+
+
+/* TLBTR0 */
+#define SET_PR(b, v) SET_GLOBAL_FIELD(b, TLBTR0, PR, v)
+#define SET_PW(b, v) SET_GLOBAL_FIELD(b, TLBTR0, PW, v)
+#define SET_UR(b, v) SET_GLOBAL_FIELD(b, TLBTR0, UR, v)
+#define SET_UW(b, v) SET_GLOBAL_FIELD(b, TLBTR0, UW, v)
+#define SET_XN(b, v) SET_GLOBAL_FIELD(b, TLBTR0, XN, v)
+#define SET_NSDESC(b, v) SET_GLOBAL_FIELD(b, TLBTR0, NSDESC, v)
+#define SET_ISH(b, v) SET_GLOBAL_FIELD(b, TLBTR0, ISH, v)
+#define SET_SH(b, v) SET_GLOBAL_FIELD(b, TLBTR0, SH, v)
+#define SET_MT(b, v) SET_GLOBAL_FIELD(b, TLBTR0, MT, v)
+#define SET_DPSIZR(b, v) SET_GLOBAL_FIELD(b, TLBTR0, DPSIZR, v)
+#define SET_DPSIZC(b, v) SET_GLOBAL_FIELD(b, TLBTR0, DPSIZC, v)
+
+
+/* TLBTR1 */
+#define SET_TLBTR1_VMID(b, v) SET_GLOBAL_FIELD(b, TLBTR1, TLBTR1_VMID, v)
+#define SET_TLBTR1_PA(b, v) SET_GLOBAL_FIELD(b, TLBTR1, TLBTR1_PA, v)
+
+
+/* TLBTR2 */
+#define SET_TLBTR2_ASID(b, v) SET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_ASID, v)
+#define SET_TLBTR2_V(b, v) SET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_V, v)
+#define SET_TLBTR2_NSTID(b, v) SET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_NSTID, v)
+#define SET_TLBTR2_NV(b, v) SET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_NV, v)
+#define SET_TLBTR2_VA(b, v) SET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_VA, v)
+
+
+/* Global Field Getters */
+/* CBACR_N */
+#define GET_RWVMID(b, n) GET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), RWVMID)
+#define GET_RWE(b, n) GET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), RWE)
+#define GET_RWGE(b, n) GET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), RWGE)
+#define GET_CBVMID(b, n) GET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), CBVMID)
+#define GET_IRPTNDX(b, n) GET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), IRPTNDX)
+
+
+/* M2VCBR_N */
+#define GET_VMID(b, n) GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), VMID)
+#define GET_CBNDX(b, n) GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), CBNDX)
+#define GET_BYPASSD(b, n) GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BYPASSD)
+#define GET_BPRCOSH(b, n) GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPRCOSH)
+#define GET_BPRCISH(b, n) GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPRCISH)
+#define GET_BPRCNSH(b, n) GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPRCNSH)
+#define GET_BPSHCFG(b, n) GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPSHCFG)
+#define GET_NSCFG(b, n) GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), NSCFG)
+#define GET_BPMTCFG(b, n) GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPMTCFG)
+#define GET_BPMEMTYPE(b, n) GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPMEMTYPE)
+
+
+/* CR */
+#define GET_RPUE(b) GET_GLOBAL_FIELD(b, CR, RPUE)
+#define GET_RPUERE(b) GET_GLOBAL_FIELD(b, CR, RPUERE)
+#define GET_RPUEIE(b) GET_GLOBAL_FIELD(b, CR, RPUEIE)
+#define GET_DCDEE(b) GET_GLOBAL_FIELD(b, CR, DCDEE)
+#define GET_CLIENTPD(b) GET_GLOBAL_FIELD(b, CR, CLIENTPD)
+#define GET_STALLD(b) GET_GLOBAL_FIELD(b, CR, STALLD)
+#define GET_TLBLKCRWE(b) GET_GLOBAL_FIELD(b, CR, TLBLKCRWE)
+#define GET_CR_TLBIALLCFG(b) GET_GLOBAL_FIELD(b, CR, CR_TLBIALLCFG)
+#define GET_TLBIVMIDCFG(b) GET_GLOBAL_FIELD(b, CR, TLBIVMIDCFG)
+#define GET_CR_HUME(b) GET_GLOBAL_FIELD(b, CR, CR_HUME)
+
+
+/* ESR */
+#define GET_CFG(b) GET_GLOBAL_FIELD(b, ESR, CFG)
+#define GET_BYPASS(b) GET_GLOBAL_FIELD(b, ESR, BYPASS)
+#define GET_ESR_MULTI(b) GET_GLOBAL_FIELD(b, ESR, ESR_MULTI)
+
+
+/* ESYNR0 */
+#define GET_ESYNR0_AMID(b) GET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_AMID)
+#define GET_ESYNR0_APID(b) GET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_APID)
+#define GET_ESYNR0_ABID(b) GET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_ABID)
+#define GET_ESYNR0_AVMID(b) GET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_AVMID)
+#define GET_ESYNR0_ATID(b) GET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_ATID)
+
+
+/* ESYNR1 */
+#define GET_ESYNR1_AMEMTYPE(b) GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AMEMTYPE)
+#define GET_ESYNR1_ASHARED(b) GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ASHARED)
+#define GET_ESYNR1_AINNERSHARED(b) \
+ GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AINNERSHARED)
+#define GET_ESYNR1_APRIV(b) GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_APRIV)
+#define GET_ESYNR1_APROTNS(b) GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_APROTNS)
+#define GET_ESYNR1_AINST(b) GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AINST)
+#define GET_ESYNR1_AWRITE(b) GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AWRITE)
+#define GET_ESYNR1_ABURST(b) GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ABURST)
+#define GET_ESYNR1_ALEN(b) GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ALEN)
+#define GET_ESYNR1_ASIZE(b) GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ASIZE)
+#define GET_ESYNR1_ALOCK(b) GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ALOCK)
+#define GET_ESYNR1_AOOO(b) GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AOOO)
+#define GET_ESYNR1_AFULL(b) GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AFULL)
+#define GET_ESYNR1_AC(b) GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AC)
+#define GET_ESYNR1_DCD(b) GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_DCD)
+
+
+/* IDR */
+#define GET_NM2VCBMT(b) GET_GLOBAL_FIELD(b, IDR, NM2VCBMT)
+#define GET_HTW(b) GET_GLOBAL_FIELD(b, IDR, HTW)
+#define GET_HUM(b) GET_GLOBAL_FIELD(b, IDR, HUM)
+#define GET_TLBSIZE(b) GET_GLOBAL_FIELD(b, IDR, TLBSIZE)
+#define GET_NCB(b) GET_GLOBAL_FIELD(b, IDR, NCB)
+#define GET_NIRPT(b) GET_GLOBAL_FIELD(b, IDR, NIRPT)
+
+
+/* REV */
+#define GET_MAJOR(b) GET_GLOBAL_FIELD(b, REV, MAJOR)
+#define GET_MINOR(b) GET_GLOBAL_FIELD(b, REV, MINOR)
+
+
+/* TESTBUSCR */
+#define GET_TBE(b) GET_GLOBAL_FIELD(b, TESTBUSCR, TBE)
+#define GET_SPDMBE(b) GET_GLOBAL_FIELD(b, TESTBUSCR, SPDMBE)
+#define GET_WGSEL(b) GET_GLOBAL_FIELD(b, TESTBUSCR, WGSEL)
+#define GET_TBLSEL(b) GET_GLOBAL_FIELD(b, TESTBUSCR, TBLSEL)
+#define GET_TBHSEL(b) GET_GLOBAL_FIELD(b, TESTBUSCR, TBHSEL)
+#define GET_SPDM0SEL(b) GET_GLOBAL_FIELD(b, TESTBUSCR, SPDM0SEL)
+#define GET_SPDM1SEL(b) GET_GLOBAL_FIELD(b, TESTBUSCR, SPDM1SEL)
+#define GET_SPDM2SEL(b) GET_GLOBAL_FIELD(b, TESTBUSCR, SPDM2SEL)
+#define GET_SPDM3SEL(b) GET_GLOBAL_FIELD(b, TESTBUSCR, SPDM3SEL)
+
+
+/* TLBIVMID */
+#define GET_TLBIVMID_VMID(b) GET_GLOBAL_FIELD(b, TLBIVMID, TLBIVMID_VMID)
+
+
+/* TLBTR0 */
+#define GET_PR(b) GET_GLOBAL_FIELD(b, TLBTR0, PR)
+#define GET_PW(b) GET_GLOBAL_FIELD(b, TLBTR0, PW)
+#define GET_UR(b) GET_GLOBAL_FIELD(b, TLBTR0, UR)
+#define GET_UW(b) GET_GLOBAL_FIELD(b, TLBTR0, UW)
+#define GET_XN(b) GET_GLOBAL_FIELD(b, TLBTR0, XN)
+#define GET_NSDESC(b) GET_GLOBAL_FIELD(b, TLBTR0, NSDESC)
+#define GET_ISH(b) GET_GLOBAL_FIELD(b, TLBTR0, ISH)
+#define GET_SH(b) GET_GLOBAL_FIELD(b, TLBTR0, SH)
+#define GET_MT(b) GET_GLOBAL_FIELD(b, TLBTR0, MT)
+#define GET_DPSIZR(b) GET_GLOBAL_FIELD(b, TLBTR0, DPSIZR)
+#define GET_DPSIZC(b) GET_GLOBAL_FIELD(b, TLBTR0, DPSIZC)
+
+
+/* TLBTR1 */
+#define GET_TLBTR1_VMID(b) GET_GLOBAL_FIELD(b, TLBTR1, TLBTR1_VMID)
+#define GET_TLBTR1_PA(b) GET_GLOBAL_FIELD(b, TLBTR1, TLBTR1_PA)
+
+
+/* TLBTR2 */
+#define GET_TLBTR2_ASID(b) GET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_ASID)
+#define GET_TLBTR2_V(b) GET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_V)
+#define GET_TLBTR2_NSTID(b) GET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_NSTID)
+#define GET_TLBTR2_NV(b) GET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_NV)
+#define GET_TLBTR2_VA(b) GET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_VA)
+
+
+/* Context Register setters / getters */
+/* Context Register setters */
+/* ACTLR */
+#define SET_CFERE(b, c, v) SET_CONTEXT_FIELD(b, c, ACTLR, CFERE, v)
+#define SET_CFEIE(b, c, v) SET_CONTEXT_FIELD(b, c, ACTLR, CFEIE, v)
+#define SET_PTSHCFG(b, c, v) SET_CONTEXT_FIELD(b, c, ACTLR, PTSHCFG, v)
+#define SET_RCOSH(b, c, v) SET_CONTEXT_FIELD(b, c, ACTLR, RCOSH, v)
+#define SET_RCISH(b, c, v) SET_CONTEXT_FIELD(b, c, ACTLR, RCISH, v)
+#define SET_RCNSH(b, c, v) SET_CONTEXT_FIELD(b, c, ACTLR, RCNSH, v)
+#define SET_PRIVCFG(b, c, v) SET_CONTEXT_FIELD(b, c, ACTLR, PRIVCFG, v)
+#define SET_DNA(b, c, v) SET_CONTEXT_FIELD(b, c, ACTLR, DNA, v)
+#define SET_DNLV2PA(b, c, v) SET_CONTEXT_FIELD(b, c, ACTLR, DNLV2PA, v)
+#define SET_TLBMCFG(b, c, v) SET_CONTEXT_FIELD(b, c, ACTLR, TLBMCFG, v)
+#define SET_CFCFG(b, c, v) SET_CONTEXT_FIELD(b, c, ACTLR, CFCFG, v)
+#define SET_TIPCF(b, c, v) SET_CONTEXT_FIELD(b, c, ACTLR, TIPCF, v)
+#define SET_V2PCFG(b, c, v) SET_CONTEXT_FIELD(b, c, ACTLR, V2PCFG, v)
+#define SET_HUME(b, c, v) SET_CONTEXT_FIELD(b, c, ACTLR, HUME, v)
+#define SET_PTMTCFG(b, c, v) SET_CONTEXT_FIELD(b, c, ACTLR, PTMTCFG, v)
+#define SET_PTMEMTYPE(b, c, v) SET_CONTEXT_FIELD(b, c, ACTLR, PTMEMTYPE, v)
+
+
+/* BFBCR */
+#define SET_BFBDFE(b, c, v) SET_CONTEXT_FIELD(b, c, BFBCR, BFBDFE, v)
+#define SET_BFBSFE(b, c, v) SET_CONTEXT_FIELD(b, c, BFBCR, BFBSFE, v)
+#define SET_SFVS(b, c, v) SET_CONTEXT_FIELD(b, c, BFBCR, SFVS, v)
+#define SET_FLVIC(b, c, v) SET_CONTEXT_FIELD(b, c, BFBCR, FLVIC, v)
+#define SET_SLVIC(b, c, v) SET_CONTEXT_FIELD(b, c, BFBCR, SLVIC, v)
+
+
+/* CONTEXTIDR */
+#define SET_CONTEXTIDR_ASID(b, c, v) \
+ SET_CONTEXT_FIELD(b, c, CONTEXTIDR, CONTEXTIDR_ASID, v)
+#define SET_CONTEXTIDR_PROCID(b, c, v) \
+ SET_CONTEXT_FIELD(b, c, CONTEXTIDR, PROCID, v)
+
+
+/* FSR */
+#define SET_TF(b, c, v) SET_CONTEXT_FIELD(b, c, FSR, TF, v)
+#define SET_AFF(b, c, v) SET_CONTEXT_FIELD(b, c, FSR, AFF, v)
+#define SET_APF(b, c, v) SET_CONTEXT_FIELD(b, c, FSR, APF, v)
+#define SET_TLBMF(b, c, v) SET_CONTEXT_FIELD(b, c, FSR, TLBMF, v)
+#define SET_HTWDEEF(b, c, v) SET_CONTEXT_FIELD(b, c, FSR, HTWDEEF, v)
+#define SET_HTWSEEF(b, c, v) SET_CONTEXT_FIELD(b, c, FSR, HTWSEEF, v)
+#define SET_MHF(b, c, v) SET_CONTEXT_FIELD(b, c, FSR, MHF, v)
+#define SET_SL(b, c, v) SET_CONTEXT_FIELD(b, c, FSR, SL, v)
+#define SET_SS(b, c, v) SET_CONTEXT_FIELD(b, c, FSR, SS, v)
+#define SET_MULTI(b, c, v) SET_CONTEXT_FIELD(b, c, FSR, MULTI, v)
+
+
+/* FSYNR0 */
+#define SET_AMID(b, c, v) SET_CONTEXT_FIELD(b, c, FSYNR0, AMID, v)
+#define SET_APID(b, c, v) SET_CONTEXT_FIELD(b, c, FSYNR0, APID, v)
+#define SET_ABID(b, c, v) SET_CONTEXT_FIELD(b, c, FSYNR0, ABID, v)
+#define SET_ATID(b, c, v) SET_CONTEXT_FIELD(b, c, FSYNR0, ATID, v)
+
+
+/* FSYNR1 */
+#define SET_AMEMTYPE(b, c, v) SET_CONTEXT_FIELD(b, c, FSYNR1, AMEMTYPE, v)
+#define SET_ASHARED(b, c, v) SET_CONTEXT_FIELD(b, c, FSYNR1, ASHARED, v)
+#define SET_AINNERSHARED(b, c, v) \
+ SET_CONTEXT_FIELD(b, c, FSYNR1, AINNERSHARED, v)
+#define SET_APRIV(b, c, v) SET_CONTEXT_FIELD(b, c, FSYNR1, APRIV, v)
+#define SET_APROTNS(b, c, v) SET_CONTEXT_FIELD(b, c, FSYNR1, APROTNS, v)
+#define SET_AINST(b, c, v) SET_CONTEXT_FIELD(b, c, FSYNR1, AINST, v)
+#define SET_AWRITE(b, c, v) SET_CONTEXT_FIELD(b, c, FSYNR1, AWRITE, v)
+#define SET_ABURST(b, c, v) SET_CONTEXT_FIELD(b, c, FSYNR1, ABURST, v)
+#define SET_ALEN(b, c, v) SET_CONTEXT_FIELD(b, c, FSYNR1, ALEN, v)
+#define SET_FSYNR1_ASIZE(b, c, v) \
+ SET_CONTEXT_FIELD(b, c, FSYNR1, FSYNR1_ASIZE, v)
+#define SET_ALOCK(b, c, v) SET_CONTEXT_FIELD(b, c, FSYNR1, ALOCK, v)
+#define SET_AFULL(b, c, v) SET_CONTEXT_FIELD(b, c, FSYNR1, AFULL, v)
+
+
+/* NMRR */
+#define SET_ICPC0(b, c, v) SET_CONTEXT_FIELD(b, c, NMRR, ICPC0, v)
+#define SET_ICPC1(b, c, v) SET_CONTEXT_FIELD(b, c, NMRR, ICPC1, v)
+#define SET_ICPC2(b, c, v) SET_CONTEXT_FIELD(b, c, NMRR, ICPC2, v)
+#define SET_ICPC3(b, c, v) SET_CONTEXT_FIELD(b, c, NMRR, ICPC3, v)
+#define SET_ICPC4(b, c, v) SET_CONTEXT_FIELD(b, c, NMRR, ICPC4, v)
+#define SET_ICPC5(b, c, v) SET_CONTEXT_FIELD(b, c, NMRR, ICPC5, v)
+#define SET_ICPC6(b, c, v) SET_CONTEXT_FIELD(b, c, NMRR, ICPC6, v)
+#define SET_ICPC7(b, c, v) SET_CONTEXT_FIELD(b, c, NMRR, ICPC7, v)
+#define SET_OCPC0(b, c, v) SET_CONTEXT_FIELD(b, c, NMRR, OCPC0, v)
+#define SET_OCPC1(b, c, v) SET_CONTEXT_FIELD(b, c, NMRR, OCPC1, v)
+#define SET_OCPC2(b, c, v) SET_CONTEXT_FIELD(b, c, NMRR, OCPC2, v)
+#define SET_OCPC3(b, c, v) SET_CONTEXT_FIELD(b, c, NMRR, OCPC3, v)
+#define SET_OCPC4(b, c, v) SET_CONTEXT_FIELD(b, c, NMRR, OCPC4, v)
+#define SET_OCPC5(b, c, v) SET_CONTEXT_FIELD(b, c, NMRR, OCPC5, v)
+#define SET_OCPC6(b, c, v) SET_CONTEXT_FIELD(b, c, NMRR, OCPC6, v)
+#define SET_OCPC7(b, c, v) SET_CONTEXT_FIELD(b, c, NMRR, OCPC7, v)
+
+
+/* PAR */
+#define SET_FAULT(b, c, v) SET_CONTEXT_FIELD(b, c, PAR, FAULT, v)
+
+#define SET_FAULT_TF(b, c, v) SET_CONTEXT_FIELD(b, c, PAR, FAULT_TF, v)
+#define SET_FAULT_AFF(b, c, v) SET_CONTEXT_FIELD(b, c, PAR, FAULT_AFF, v)
+#define SET_FAULT_APF(b, c, v) SET_CONTEXT_FIELD(b, c, PAR, FAULT_APF, v)
+#define SET_FAULT_TLBMF(b, c, v) SET_CONTEXT_FIELD(b, c, PAR, FAULT_TLBMF, v)
+#define SET_FAULT_HTWDEEF(b, c, v) \
+ SET_CONTEXT_FIELD(b, c, PAR, FAULT_HTWDEEF, v)
+#define SET_FAULT_HTWSEEF(b, c, v) \
+ SET_CONTEXT_FIELD(b, c, PAR, FAULT_HTWSEEF, v)
+#define SET_FAULT_MHF(b, c, v) SET_CONTEXT_FIELD(b, c, PAR, FAULT_MHF, v)
+#define SET_FAULT_SL(b, c, v) SET_CONTEXT_FIELD(b, c, PAR, FAULT_SL, v)
+#define SET_FAULT_SS(b, c, v) SET_CONTEXT_FIELD(b, c, PAR, FAULT_SS, v)
+
+#define SET_NOFAULT_SS(b, c, v) SET_CONTEXT_FIELD(b, c, PAR, NOFAULT_SS, v)
+#define SET_NOFAULT_MT(b, c, v) SET_CONTEXT_FIELD(b, c, PAR, NOFAULT_MT, v)
+#define SET_NOFAULT_SH(b, c, v) SET_CONTEXT_FIELD(b, c, PAR, NOFAULT_SH, v)
+#define SET_NOFAULT_NS(b, c, v) SET_CONTEXT_FIELD(b, c, PAR, NOFAULT_NS, v)
+#define SET_NOFAULT_NOS(b, c, v) SET_CONTEXT_FIELD(b, c, PAR, NOFAULT_NOS, v)
+#define SET_NPFAULT_PA(b, c, v) SET_CONTEXT_FIELD(b, c, PAR, NPFAULT_PA, v)
+
+
+/* PRRR */
+#define SET_MTC0(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, MTC0, v)
+#define SET_MTC1(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, MTC1, v)
+#define SET_MTC2(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, MTC2, v)
+#define SET_MTC3(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, MTC3, v)
+#define SET_MTC4(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, MTC4, v)
+#define SET_MTC5(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, MTC5, v)
+#define SET_MTC6(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, MTC6, v)
+#define SET_MTC7(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, MTC7, v)
+#define SET_SHDSH0(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, SHDSH0, v)
+#define SET_SHDSH1(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, SHDSH1, v)
+#define SET_SHNMSH0(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, SHNMSH0, v)
+#define SET_SHNMSH1(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, SHNMSH1, v)
+#define SET_NOS0(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, NOS0, v)
+#define SET_NOS1(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, NOS1, v)
+#define SET_NOS2(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, NOS2, v)
+#define SET_NOS3(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, NOS3, v)
+#define SET_NOS4(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, NOS4, v)
+#define SET_NOS5(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, NOS5, v)
+#define SET_NOS6(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, NOS6, v)
+#define SET_NOS7(b, c, v) SET_CONTEXT_FIELD(b, c, PRRR, NOS7, v)
+
+
+/* RESUME */
+#define SET_TNR(b, c, v) SET_CONTEXT_FIELD(b, c, RESUME, TNR, v)
+
+
+/* SCTLR */
+#define SET_M(b, c, v) SET_CONTEXT_FIELD(b, c, SCTLR, M, v)
+#define SET_TRE(b, c, v) SET_CONTEXT_FIELD(b, c, SCTLR, TRE, v)
+#define SET_AFE(b, c, v) SET_CONTEXT_FIELD(b, c, SCTLR, AFE, v)
+#define SET_HAF(b, c, v) SET_CONTEXT_FIELD(b, c, SCTLR, HAF, v)
+#define SET_BE(b, c, v) SET_CONTEXT_FIELD(b, c, SCTLR, BE, v)
+#define SET_AFFD(b, c, v) SET_CONTEXT_FIELD(b, c, SCTLR, AFFD, v)
+
+
+/* TLBLKCR */
+#define SET_LKE(b, c, v) SET_CONTEXT_FIELD(b, c, TLBLKCR, LKE, v)
+#define SET_TLBLKCR_TLBIALLCFG(b, c, v) \
+ SET_CONTEXT_FIELD(b, c, TLBLKCR, TLBLCKR_TLBIALLCFG, v)
+#define SET_TLBIASIDCFG(b, c, v) \
+ SET_CONTEXT_FIELD(b, c, TLBLKCR, TLBIASIDCFG, v)
+#define SET_TLBIVAACFG(b, c, v) SET_CONTEXT_FIELD(b, c, TLBLKCR, TLBIVAACFG, v)
+#define SET_FLOOR(b, c, v) SET_CONTEXT_FIELD(b, c, TLBLKCR, FLOOR, v)
+#define SET_VICTIM(b, c, v) SET_CONTEXT_FIELD(b, c, TLBLKCR, VICTIM, v)
+
+
+/* TTBCR */
+#define SET_N(b, c, v) SET_CONTEXT_FIELD(b, c, TTBCR, N, v)
+#define SET_PD0(b, c, v) SET_CONTEXT_FIELD(b, c, TTBCR, PD0, v)
+#define SET_PD1(b, c, v) SET_CONTEXT_FIELD(b, c, TTBCR, PD1, v)
+
+
+/* TTBR0 */
+#define SET_TTBR0_IRGNH(b, c, v) SET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_IRGNH, v)
+#define SET_TTBR0_SH(b, c, v) SET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_SH, v)
+#define SET_TTBR0_ORGN(b, c, v) SET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_ORGN, v)
+#define SET_TTBR0_NOS(b, c, v) SET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_NOS, v)
+#define SET_TTBR0_IRGNL(b, c, v) SET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_IRGNL, v)
+#define SET_TTBR0_PA(b, c, v) SET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_PA, v)
+
+
+/* TTBR1 */
+#define SET_TTBR1_IRGNH(b, c, v) SET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_IRGNH, v)
+#define SET_TTBR1_SH(b, c, v) SET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_SH, v)
+#define SET_TTBR1_ORGN(b, c, v) SET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_ORGN, v)
+#define SET_TTBR1_NOS(b, c, v) SET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_NOS, v)
+#define SET_TTBR1_IRGNL(b, c, v) SET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_IRGNL, v)
+#define SET_TTBR1_PA(b, c, v) SET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_PA, v)
+
+
+/* V2PSR */
+#define SET_HIT(b, c, v) SET_CONTEXT_FIELD(b, c, V2PSR, HIT, v)
+#define SET_INDEX(b, c, v) SET_CONTEXT_FIELD(b, c, V2PSR, INDEX, v)
+
+
+/* Context Register getters */
+/* ACTLR */
+#define GET_CFERE(b, c) GET_CONTEXT_FIELD(b, c, ACTLR, CFERE)
+#define GET_CFEIE(b, c) GET_CONTEXT_FIELD(b, c, ACTLR, CFEIE)
+#define GET_PTSHCFG(b, c) GET_CONTEXT_FIELD(b, c, ACTLR, PTSHCFG)
+#define GET_RCOSH(b, c) GET_CONTEXT_FIELD(b, c, ACTLR, RCOSH)
+#define GET_RCISH(b, c) GET_CONTEXT_FIELD(b, c, ACTLR, RCISH)
+#define GET_RCNSH(b, c) GET_CONTEXT_FIELD(b, c, ACTLR, RCNSH)
+#define GET_PRIVCFG(b, c) GET_CONTEXT_FIELD(b, c, ACTLR, PRIVCFG)
+#define GET_DNA(b, c) GET_CONTEXT_FIELD(b, c, ACTLR, DNA)
+#define GET_DNLV2PA(b, c) GET_CONTEXT_FIELD(b, c, ACTLR, DNLV2PA)
+#define GET_TLBMCFG(b, c) GET_CONTEXT_FIELD(b, c, ACTLR, TLBMCFG)
+#define GET_CFCFG(b, c) GET_CONTEXT_FIELD(b, c, ACTLR, CFCFG)
+#define GET_TIPCF(b, c) GET_CONTEXT_FIELD(b, c, ACTLR, TIPCF)
+#define GET_V2PCFG(b, c) GET_CONTEXT_FIELD(b, c, ACTLR, V2PCFG)
+#define GET_HUME(b, c) GET_CONTEXT_FIELD(b, c, ACTLR, HUME)
+#define GET_PTMTCFG(b, c) GET_CONTEXT_FIELD(b, c, ACTLR, PTMTCFG)
+#define GET_PTMEMTYPE(b, c) GET_CONTEXT_FIELD(b, c, ACTLR, PTMEMTYPE)
+
+/* BFBCR */
+#define GET_BFBDFE(b, c) GET_CONTEXT_FIELD(b, c, BFBCR, BFBDFE)
+#define GET_BFBSFE(b, c) GET_CONTEXT_FIELD(b, c, BFBCR, BFBSFE)
+#define GET_SFVS(b, c) GET_CONTEXT_FIELD(b, c, BFBCR, SFVS)
+#define GET_FLVIC(b, c) GET_CONTEXT_FIELD(b, c, BFBCR, FLVIC)
+#define GET_SLVIC(b, c) GET_CONTEXT_FIELD(b, c, BFBCR, SLVIC)
+
+
+/* CONTEXTIDR */
+#define GET_CONTEXTIDR_ASID(b, c) \
+ GET_CONTEXT_FIELD(b, c, CONTEXTIDR, CONTEXTIDR_ASID)
+#define GET_CONTEXTIDR_PROCID(b, c) GET_CONTEXT_FIELD(b, c, CONTEXTIDR, PROCID)
+
+
+/* FSR */
+#define GET_TF(b, c) GET_CONTEXT_FIELD(b, c, FSR, TF)
+#define GET_AFF(b, c) GET_CONTEXT_FIELD(b, c, FSR, AFF)
+#define GET_APF(b, c) GET_CONTEXT_FIELD(b, c, FSR, APF)
+#define GET_TLBMF(b, c) GET_CONTEXT_FIELD(b, c, FSR, TLBMF)
+#define GET_HTWDEEF(b, c) GET_CONTEXT_FIELD(b, c, FSR, HTWDEEF)
+#define GET_HTWSEEF(b, c) GET_CONTEXT_FIELD(b, c, FSR, HTWSEEF)
+#define GET_MHF(b, c) GET_CONTEXT_FIELD(b, c, FSR, MHF)
+#define GET_SL(b, c) GET_CONTEXT_FIELD(b, c, FSR, SL)
+#define GET_SS(b, c) GET_CONTEXT_FIELD(b, c, FSR, SS)
+#define GET_MULTI(b, c) GET_CONTEXT_FIELD(b, c, FSR, MULTI)
+
+
+/* FSYNR0 */
+#define GET_AMID(b, c) GET_CONTEXT_FIELD(b, c, FSYNR0, AMID)
+#define GET_APID(b, c) GET_CONTEXT_FIELD(b, c, FSYNR0, APID)
+#define GET_ABID(b, c) GET_CONTEXT_FIELD(b, c, FSYNR0, ABID)
+#define GET_ATID(b, c) GET_CONTEXT_FIELD(b, c, FSYNR0, ATID)
+
+
+/* FSYNR1 */
+#define GET_AMEMTYPE(b, c) GET_CONTEXT_FIELD(b, c, FSYNR1, AMEMTYPE)
+#define GET_ASHARED(b, c) GET_CONTEXT_FIELD(b, c, FSYNR1, ASHARED)
+#define GET_AINNERSHARED(b, c) GET_CONTEXT_FIELD(b, c, FSYNR1, AINNERSHARED)
+#define GET_APRIV(b, c) GET_CONTEXT_FIELD(b, c, FSYNR1, APRIV)
+#define GET_APROTNS(b, c) GET_CONTEXT_FIELD(b, c, FSYNR1, APROTNS)
+#define GET_AINST(b, c) GET_CONTEXT_FIELD(b, c, FSYNR1, AINST)
+#define GET_AWRITE(b, c) GET_CONTEXT_FIELD(b, c, FSYNR1, AWRITE)
+#define GET_ABURST(b, c) GET_CONTEXT_FIELD(b, c, FSYNR1, ABURST)
+#define GET_ALEN(b, c) GET_CONTEXT_FIELD(b, c, FSYNR1, ALEN)
+#define GET_FSYNR1_ASIZE(b, c) GET_CONTEXT_FIELD(b, c, FSYNR1, FSYNR1_ASIZE)
+#define GET_ALOCK(b, c) GET_CONTEXT_FIELD(b, c, FSYNR1, ALOCK)
+#define GET_AFULL(b, c) GET_CONTEXT_FIELD(b, c, FSYNR1, AFULL)
+
+
+/* NMRR */
+#define GET_ICPC0(b, c) GET_CONTEXT_FIELD(b, c, NMRR, ICPC0)
+#define GET_ICPC1(b, c) GET_CONTEXT_FIELD(b, c, NMRR, ICPC1)
+#define GET_ICPC2(b, c) GET_CONTEXT_FIELD(b, c, NMRR, ICPC2)
+#define GET_ICPC3(b, c) GET_CONTEXT_FIELD(b, c, NMRR, ICPC3)
+#define GET_ICPC4(b, c) GET_CONTEXT_FIELD(b, c, NMRR, ICPC4)
+#define GET_ICPC5(b, c) GET_CONTEXT_FIELD(b, c, NMRR, ICPC5)
+#define GET_ICPC6(b, c) GET_CONTEXT_FIELD(b, c, NMRR, ICPC6)
+#define GET_ICPC7(b, c) GET_CONTEXT_FIELD(b, c, NMRR, ICPC7)
+#define GET_OCPC0(b, c) GET_CONTEXT_FIELD(b, c, NMRR, OCPC0)
+#define GET_OCPC1(b, c) GET_CONTEXT_FIELD(b, c, NMRR, OCPC1)
+#define GET_OCPC2(b, c) GET_CONTEXT_FIELD(b, c, NMRR, OCPC2)
+#define GET_OCPC3(b, c) GET_CONTEXT_FIELD(b, c, NMRR, OCPC3)
+#define GET_OCPC4(b, c) GET_CONTEXT_FIELD(b, c, NMRR, OCPC4)
+#define GET_OCPC5(b, c) GET_CONTEXT_FIELD(b, c, NMRR, OCPC5)
+#define GET_OCPC6(b, c) GET_CONTEXT_FIELD(b, c, NMRR, OCPC6)
+#define GET_OCPC7(b, c) GET_CONTEXT_FIELD(b, c, NMRR, OCPC7)
+#define NMRR_ICP(nmrr, n) (((nmrr) & (3 << ((n) * 2))) >> ((n) * 2))
+#define NMRR_OCP(nmrr, n) (((nmrr) & (3 << ((n) * 2 + 16))) >> \
+ ((n) * 2 + 16))
+
+/* PAR */
+#define GET_FAULT(b, c) GET_CONTEXT_FIELD(b, c, PAR, FAULT)
+
+#define GET_FAULT_TF(b, c) GET_CONTEXT_FIELD(b, c, PAR, FAULT_TF)
+#define GET_FAULT_AFF(b, c) GET_CONTEXT_FIELD(b, c, PAR, FAULT_AFF)
+#define GET_FAULT_APF(b, c) GET_CONTEXT_FIELD(b, c, PAR, FAULT_APF)
+#define GET_FAULT_TLBMF(b, c) GET_CONTEXT_FIELD(b, c, PAR, FAULT_TLBMF)
+#define GET_FAULT_HTWDEEF(b, c) GET_CONTEXT_FIELD(b, c, PAR, FAULT_HTWDEEF)
+#define GET_FAULT_HTWSEEF(b, c) GET_CONTEXT_FIELD(b, c, PAR, FAULT_HTWSEEF)
+#define GET_FAULT_MHF(b, c) GET_CONTEXT_FIELD(b, c, PAR, FAULT_MHF)
+#define GET_FAULT_SL(b, c) GET_CONTEXT_FIELD(b, c, PAR, FAULT_SL)
+#define GET_FAULT_SS(b, c) GET_CONTEXT_FIELD(b, c, PAR, FAULT_SS)
+
+#define GET_NOFAULT_SS(b, c) GET_CONTEXT_FIELD(b, c, PAR, PAR_NOFAULT_SS)
+#define GET_NOFAULT_MT(b, c) GET_CONTEXT_FIELD(b, c, PAR, PAR_NOFAULT_MT)
+#define GET_NOFAULT_SH(b, c) GET_CONTEXT_FIELD(b, c, PAR, PAR_NOFAULT_SH)
+#define GET_NOFAULT_NS(b, c) GET_CONTEXT_FIELD(b, c, PAR, PAR_NOFAULT_NS)
+#define GET_NOFAULT_NOS(b, c) GET_CONTEXT_FIELD(b, c, PAR, PAR_NOFAULT_NOS)
+#define GET_NPFAULT_PA(b, c) GET_CONTEXT_FIELD(b, c, PAR, PAR_NPFAULT_PA)
+
+
+/* PRRR */
+#define GET_MTC0(b, c) GET_CONTEXT_FIELD(b, c, PRRR, MTC0)
+#define GET_MTC1(b, c) GET_CONTEXT_FIELD(b, c, PRRR, MTC1)
+#define GET_MTC2(b, c) GET_CONTEXT_FIELD(b, c, PRRR, MTC2)
+#define GET_MTC3(b, c) GET_CONTEXT_FIELD(b, c, PRRR, MTC3)
+#define GET_MTC4(b, c) GET_CONTEXT_FIELD(b, c, PRRR, MTC4)
+#define GET_MTC5(b, c) GET_CONTEXT_FIELD(b, c, PRRR, MTC5)
+#define GET_MTC6(b, c) GET_CONTEXT_FIELD(b, c, PRRR, MTC6)
+#define GET_MTC7(b, c) GET_CONTEXT_FIELD(b, c, PRRR, MTC7)
+#define GET_SHDSH0(b, c) GET_CONTEXT_FIELD(b, c, PRRR, SHDSH0)
+#define GET_SHDSH1(b, c) GET_CONTEXT_FIELD(b, c, PRRR, SHDSH1)
+#define GET_SHNMSH0(b, c) GET_CONTEXT_FIELD(b, c, PRRR, SHNMSH0)
+#define GET_SHNMSH1(b, c) GET_CONTEXT_FIELD(b, c, PRRR, SHNMSH1)
+#define GET_NOS0(b, c) GET_CONTEXT_FIELD(b, c, PRRR, NOS0)
+#define GET_NOS1(b, c) GET_CONTEXT_FIELD(b, c, PRRR, NOS1)
+#define GET_NOS2(b, c) GET_CONTEXT_FIELD(b, c, PRRR, NOS2)
+#define GET_NOS3(b, c) GET_CONTEXT_FIELD(b, c, PRRR, NOS3)
+#define GET_NOS4(b, c) GET_CONTEXT_FIELD(b, c, PRRR, NOS4)
+#define GET_NOS5(b, c) GET_CONTEXT_FIELD(b, c, PRRR, NOS5)
+#define GET_NOS6(b, c) GET_CONTEXT_FIELD(b, c, PRRR, NOS6)
+#define GET_NOS7(b, c) GET_CONTEXT_FIELD(b, c, PRRR, NOS7)
+#define PRRR_NOS(prrr, n) ((prrr) & (1 << ((n) + 24)) ? 1 : 0)
+#define PRRR_MT(prrr, n) ((((prrr) & (3 << ((n) * 2))) >> ((n) * 2)))
+
+
+/* RESUME */
+#define GET_TNR(b, c) GET_CONTEXT_FIELD(b, c, RESUME, TNR)
+
+
+/* SCTLR */
+#define GET_M(b, c) GET_CONTEXT_FIELD(b, c, SCTLR, M)
+#define GET_TRE(b, c) GET_CONTEXT_FIELD(b, c, SCTLR, TRE)
+#define GET_AFE(b, c) GET_CONTEXT_FIELD(b, c, SCTLR, AFE)
+#define GET_HAF(b, c) GET_CONTEXT_FIELD(b, c, SCTLR, HAF)
+#define GET_BE(b, c) GET_CONTEXT_FIELD(b, c, SCTLR, BE)
+#define GET_AFFD(b, c) GET_CONTEXT_FIELD(b, c, SCTLR, AFFD)
+
+
+/* TLBLKCR */
+#define GET_LKE(b, c) GET_CONTEXT_FIELD(b, c, TLBLKCR, LKE)
+#define GET_TLBLCKR_TLBIALLCFG(b, c) \
+ GET_CONTEXT_FIELD(b, c, TLBLKCR, TLBLCKR_TLBIALLCFG)
+#define GET_TLBIASIDCFG(b, c) GET_CONTEXT_FIELD(b, c, TLBLKCR, TLBIASIDCFG)
+#define GET_TLBIVAACFG(b, c) GET_CONTEXT_FIELD(b, c, TLBLKCR, TLBIVAACFG)
+#define GET_FLOOR(b, c) GET_CONTEXT_FIELD(b, c, TLBLKCR, FLOOR)
+#define GET_VICTIM(b, c) GET_CONTEXT_FIELD(b, c, TLBLKCR, VICTIM)
+
+
+/* TTBCR */
+#define GET_N(b, c) GET_CONTEXT_FIELD(b, c, TTBCR, N)
+#define GET_PD0(b, c) GET_CONTEXT_FIELD(b, c, TTBCR, PD0)
+#define GET_PD1(b, c) GET_CONTEXT_FIELD(b, c, TTBCR, PD1)
+
+
+/* TTBR0 */
+#define GET_TTBR0_IRGNH(b, c) GET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_IRGNH)
+#define GET_TTBR0_SH(b, c) GET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_SH)
+#define GET_TTBR0_ORGN(b, c) GET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_ORGN)
+#define GET_TTBR0_NOS(b, c) GET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_NOS)
+#define GET_TTBR0_IRGNL(b, c) GET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_IRGNL)
+#define GET_TTBR0_PA(b, c) GET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_PA)
+
+
+/* TTBR1 */
+#define GET_TTBR1_IRGNH(b, c) GET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_IRGNH)
+#define GET_TTBR1_SH(b, c) GET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_SH)
+#define GET_TTBR1_ORGN(b, c) GET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_ORGN)
+#define GET_TTBR1_NOS(b, c) GET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_NOS)
+#define GET_TTBR1_IRGNL(b, c) GET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_IRGNL)
+#define GET_TTBR1_PA(b, c) GET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_PA)
+
+
+/* V2PSR */
+#define GET_HIT(b, c) GET_CONTEXT_FIELD(b, c, V2PSR, HIT)
+#define GET_INDEX(b, c) GET_CONTEXT_FIELD(b, c, V2PSR, INDEX)
+
+
+/* Global Registers */
+#define M2VCBR_N (0xFF000)
+#define CBACR_N (0xFF800)
+#define TLBRSW (0xFFE00)
+#define TLBTR0 (0xFFE80)
+#define TLBTR1 (0xFFE84)
+#define TLBTR2 (0xFFE88)
+#define TESTBUSCR (0xFFE8C)
+#define GLOBAL_TLBIALL (0xFFF00)
+#define TLBIVMID (0xFFF04)
+#define CR (0xFFF80)
+#define EAR (0xFFF84)
+#define ESR (0xFFF88)
+#define ESRRESTORE (0xFFF8C)
+#define ESYNR0 (0xFFF90)
+#define ESYNR1 (0xFFF94)
+#define REV (0xFFFF4)
+#define IDR (0xFFFF8)
+#define RPU_ACR (0xFFFFC)
+
+
+/* Context Bank Registers */
+#define SCTLR (0x000)
+#define ACTLR (0x004)
+#define CONTEXTIDR (0x008)
+#define TTBR0 (0x010)
+#define TTBR1 (0x014)
+#define TTBCR (0x018)
+#define PAR (0x01C)
+#define FSR (0x020)
+#define FSRRESTORE (0x024)
+#define FAR (0x028)
+#define FSYNR0 (0x02C)
+#define FSYNR1 (0x030)
+#define PRRR (0x034)
+#define NMRR (0x038)
+#define TLBLCKR (0x03C)
+#define V2PSR (0x040)
+#define TLBFLPTER (0x044)
+#define TLBSLPTER (0x048)
+#define BFBCR (0x04C)
+#define CTX_TLBIALL (0x800)
+#define TLBIASID (0x804)
+#define TLBIVA (0x808)
+#define TLBIVAA (0x80C)
+#define V2PPR (0x810)
+#define V2PPW (0x814)
+#define V2PUR (0x818)
+#define V2PUW (0x81C)
+#define RESUME (0x820)
+
+
+/* Global Register Fields */
+/* CBACRn */
+#define RWVMID (RWVMID_MASK << RWVMID_SHIFT)
+#define RWE (RWE_MASK << RWE_SHIFT)
+#define RWGE (RWGE_MASK << RWGE_SHIFT)
+#define CBVMID (CBVMID_MASK << CBVMID_SHIFT)
+#define IRPTNDX (IRPTNDX_MASK << IRPTNDX_SHIFT)
+
+
+/* CR */
+#define RPUE (RPUE_MASK << RPUE_SHIFT)
+#define RPUERE (RPUERE_MASK << RPUERE_SHIFT)
+#define RPUEIE (RPUEIE_MASK << RPUEIE_SHIFT)
+#define DCDEE (DCDEE_MASK << DCDEE_SHIFT)
+#define CLIENTPD (CLIENTPD_MASK << CLIENTPD_SHIFT)
+#define STALLD (STALLD_MASK << STALLD_SHIFT)
+#define TLBLKCRWE (TLBLKCRWE_MASK << TLBLKCRWE_SHIFT)
+#define CR_TLBIALLCFG (CR_TLBIALLCFG_MASK << CR_TLBIALLCFG_SHIFT)
+#define TLBIVMIDCFG (TLBIVMIDCFG_MASK << TLBIVMIDCFG_SHIFT)
+#define CR_HUME (CR_HUME_MASK << CR_HUME_SHIFT)
+
+
+/* ESR */
+#define CFG (CFG_MASK << CFG_SHIFT)
+#define BYPASS (BYPASS_MASK << BYPASS_SHIFT)
+#define ESR_MULTI (ESR_MULTI_MASK << ESR_MULTI_SHIFT)
+
+
+/* ESYNR0 */
+#define ESYNR0_AMID (ESYNR0_AMID_MASK << ESYNR0_AMID_SHIFT)
+#define ESYNR0_APID (ESYNR0_APID_MASK << ESYNR0_APID_SHIFT)
+#define ESYNR0_ABID (ESYNR0_ABID_MASK << ESYNR0_ABID_SHIFT)
+#define ESYNR0_AVMID (ESYNR0_AVMID_MASK << ESYNR0_AVMID_SHIFT)
+#define ESYNR0_ATID (ESYNR0_ATID_MASK << ESYNR0_ATID_SHIFT)
+
+
+/* ESYNR1 */
+#define ESYNR1_AMEMTYPE (ESYNR1_AMEMTYPE_MASK << ESYNR1_AMEMTYPE_SHIFT)
+#define ESYNR1_ASHARED (ESYNR1_ASHARED_MASK << ESYNR1_ASHARED_SHIFT)
+#define ESYNR1_AINNERSHARED (ESYNR1_AINNERSHARED_MASK<< \
+ ESYNR1_AINNERSHARED_SHIFT)
+#define ESYNR1_APRIV (ESYNR1_APRIV_MASK << ESYNR1_APRIV_SHIFT)
+#define ESYNR1_APROTNS (ESYNR1_APROTNS_MASK << ESYNR1_APROTNS_SHIFT)
+#define ESYNR1_AINST (ESYNR1_AINST_MASK << ESYNR1_AINST_SHIFT)
+#define ESYNR1_AWRITE (ESYNR1_AWRITE_MASK << ESYNR1_AWRITE_SHIFT)
+#define ESYNR1_ABURST (ESYNR1_ABURST_MASK << ESYNR1_ABURST_SHIFT)
+#define ESYNR1_ALEN (ESYNR1_ALEN_MASK << ESYNR1_ALEN_SHIFT)
+#define ESYNR1_ASIZE (ESYNR1_ASIZE_MASK << ESYNR1_ASIZE_SHIFT)
+#define ESYNR1_ALOCK (ESYNR1_ALOCK_MASK << ESYNR1_ALOCK_SHIFT)
+#define ESYNR1_AOOO (ESYNR1_AOOO_MASK << ESYNR1_AOOO_SHIFT)
+#define ESYNR1_AFULL (ESYNR1_AFULL_MASK << ESYNR1_AFULL_SHIFT)
+#define ESYNR1_AC (ESYNR1_AC_MASK << ESYNR1_AC_SHIFT)
+#define ESYNR1_DCD (ESYNR1_DCD_MASK << ESYNR1_DCD_SHIFT)
+
+
+/* IDR */
+#define NM2VCBMT (NM2VCBMT_MASK << NM2VCBMT_SHIFT)
+#define HTW (HTW_MASK << HTW_SHIFT)
+#define HUM (HUM_MASK << HUM_SHIFT)
+#define TLBSIZE (TLBSIZE_MASK << TLBSIZE_SHIFT)
+#define NCB (NCB_MASK << NCB_SHIFT)
+#define NIRPT (NIRPT_MASK << NIRPT_SHIFT)
+
+
+/* M2VCBRn */
+#define VMID (VMID_MASK << VMID_SHIFT)
+#define CBNDX (CBNDX_MASK << CBNDX_SHIFT)
+#define BYPASSD (BYPASSD_MASK << BYPASSD_SHIFT)
+#define BPRCOSH (BPRCOSH_MASK << BPRCOSH_SHIFT)
+#define BPRCISH (BPRCISH_MASK << BPRCISH_SHIFT)
+#define BPRCNSH (BPRCNSH_MASK << BPRCNSH_SHIFT)
+#define BPSHCFG (BPSHCFG_MASK << BPSHCFG_SHIFT)
+#define NSCFG (NSCFG_MASK << NSCFG_SHIFT)
+#define BPMTCFG (BPMTCFG_MASK << BPMTCFG_SHIFT)
+#define BPMEMTYPE (BPMEMTYPE_MASK << BPMEMTYPE_SHIFT)
+
+
+/* REV */
+#define IDR_MINOR (MINOR_MASK << MINOR_SHIFT)
+#define IDR_MAJOR (MAJOR_MASK << MAJOR_SHIFT)
+
+
+/* TESTBUSCR */
+#define TBE (TBE_MASK << TBE_SHIFT)
+#define SPDMBE (SPDMBE_MASK << SPDMBE_SHIFT)
+#define WGSEL (WGSEL_MASK << WGSEL_SHIFT)
+#define TBLSEL (TBLSEL_MASK << TBLSEL_SHIFT)
+#define TBHSEL (TBHSEL_MASK << TBHSEL_SHIFT)
+#define SPDM0SEL (SPDM0SEL_MASK << SPDM0SEL_SHIFT)
+#define SPDM1SEL (SPDM1SEL_MASK << SPDM1SEL_SHIFT)
+#define SPDM2SEL (SPDM2SEL_MASK << SPDM2SEL_SHIFT)
+#define SPDM3SEL (SPDM3SEL_MASK << SPDM3SEL_SHIFT)
+
+
+/* TLBIVMID */
+#define TLBIVMID_VMID (TLBIVMID_VMID_MASK << TLBIVMID_VMID_SHIFT)
+
+
+/* TLBRSW */
+#define TLBRSW_INDEX (TLBRSW_INDEX_MASK << TLBRSW_INDEX_SHIFT)
+#define TLBBFBS (TLBBFBS_MASK << TLBBFBS_SHIFT)
+
+
+/* TLBTR0 */
+#define PR (PR_MASK << PR_SHIFT)
+#define PW (PW_MASK << PW_SHIFT)
+#define UR (UR_MASK << UR_SHIFT)
+#define UW (UW_MASK << UW_SHIFT)
+#define XN (XN_MASK << XN_SHIFT)
+#define NSDESC (NSDESC_MASK << NSDESC_SHIFT)
+#define ISH (ISH_MASK << ISH_SHIFT)
+#define SH (SH_MASK << SH_SHIFT)
+#define MT (MT_MASK << MT_SHIFT)
+#define DPSIZR (DPSIZR_MASK << DPSIZR_SHIFT)
+#define DPSIZC (DPSIZC_MASK << DPSIZC_SHIFT)
+
+
+/* TLBTR1 */
+#define TLBTR1_VMID (TLBTR1_VMID_MASK << TLBTR1_VMID_SHIFT)
+#define TLBTR1_PA (TLBTR1_PA_MASK << TLBTR1_PA_SHIFT)
+
+
+/* TLBTR2 */
+#define TLBTR2_ASID (TLBTR2_ASID_MASK << TLBTR2_ASID_SHIFT)
+#define TLBTR2_V (TLBTR2_V_MASK << TLBTR2_V_SHIFT)
+#define TLBTR2_NSTID (TLBTR2_NSTID_MASK << TLBTR2_NSTID_SHIFT)
+#define TLBTR2_NV (TLBTR2_NV_MASK << TLBTR2_NV_SHIFT)
+#define TLBTR2_VA (TLBTR2_VA_MASK << TLBTR2_VA_SHIFT)
+
+
+/* Context Register Fields */
+/* ACTLR */
+#define CFERE (CFERE_MASK << CFERE_SHIFT)
+#define CFEIE (CFEIE_MASK << CFEIE_SHIFT)
+#define PTSHCFG (PTSHCFG_MASK << PTSHCFG_SHIFT)
+#define RCOSH (RCOSH_MASK << RCOSH_SHIFT)
+#define RCISH (RCISH_MASK << RCISH_SHIFT)
+#define RCNSH (RCNSH_MASK << RCNSH_SHIFT)
+#define PRIVCFG (PRIVCFG_MASK << PRIVCFG_SHIFT)
+#define DNA (DNA_MASK << DNA_SHIFT)
+#define DNLV2PA (DNLV2PA_MASK << DNLV2PA_SHIFT)
+#define TLBMCFG (TLBMCFG_MASK << TLBMCFG_SHIFT)
+#define CFCFG (CFCFG_MASK << CFCFG_SHIFT)
+#define TIPCF (TIPCF_MASK << TIPCF_SHIFT)
+#define V2PCFG (V2PCFG_MASK << V2PCFG_SHIFT)
+#define HUME (HUME_MASK << HUME_SHIFT)
+#define PTMTCFG (PTMTCFG_MASK << PTMTCFG_SHIFT)
+#define PTMEMTYPE (PTMEMTYPE_MASK << PTMEMTYPE_SHIFT)
+
+
+/* BFBCR */
+#define BFBDFE (BFBDFE_MASK << BFBDFE_SHIFT)
+#define BFBSFE (BFBSFE_MASK << BFBSFE_SHIFT)
+#define SFVS (SFVS_MASK << SFVS_SHIFT)
+#define FLVIC (FLVIC_MASK << FLVIC_SHIFT)
+#define SLVIC (SLVIC_MASK << SLVIC_SHIFT)
+
+
+/* CONTEXTIDR */
+#define CONTEXTIDR_ASID (CONTEXTIDR_ASID_MASK << CONTEXTIDR_ASID_SHIFT)
+#define PROCID (PROCID_MASK << PROCID_SHIFT)
+
+
+/* FSR */
+#define TF (TF_MASK << TF_SHIFT)
+#define AFF (AFF_MASK << AFF_SHIFT)
+#define APF (APF_MASK << APF_SHIFT)
+#define TLBMF (TLBMF_MASK << TLBMF_SHIFT)
+#define HTWDEEF (HTWDEEF_MASK << HTWDEEF_SHIFT)
+#define HTWSEEF (HTWSEEF_MASK << HTWSEEF_SHIFT)
+#define MHF (MHF_MASK << MHF_SHIFT)
+#define SL (SL_MASK << SL_SHIFT)
+#define SS (SS_MASK << SS_SHIFT)
+#define MULTI (MULTI_MASK << MULTI_SHIFT)
+
+
+/* FSYNR0 */
+#define AMID (AMID_MASK << AMID_SHIFT)
+#define APID (APID_MASK << APID_SHIFT)
+#define ABID (ABID_MASK << ABID_SHIFT)
+#define ATID (ATID_MASK << ATID_SHIFT)
+
+
+/* FSYNR1 */
+#define AMEMTYPE (AMEMTYPE_MASK << AMEMTYPE_SHIFT)
+#define ASHARED (ASHARED_MASK << ASHARED_SHIFT)
+#define AINNERSHARED (AINNERSHARED_MASK << AINNERSHARED_SHIFT)
+#define APRIV (APRIV_MASK << APRIV_SHIFT)
+#define APROTNS (APROTNS_MASK << APROTNS_SHIFT)
+#define AINST (AINST_MASK << AINST_SHIFT)
+#define AWRITE (AWRITE_MASK << AWRITE_SHIFT)
+#define ABURST (ABURST_MASK << ABURST_SHIFT)
+#define ALEN (ALEN_MASK << ALEN_SHIFT)
+#define FSYNR1_ASIZE (FSYNR1_ASIZE_MASK << FSYNR1_ASIZE_SHIFT)
+#define ALOCK (ALOCK_MASK << ALOCK_SHIFT)
+#define AFULL (AFULL_MASK << AFULL_SHIFT)
+
+
+/* NMRR */
+#define ICPC0 (ICPC0_MASK << ICPC0_SHIFT)
+#define ICPC1 (ICPC1_MASK << ICPC1_SHIFT)
+#define ICPC2 (ICPC2_MASK << ICPC2_SHIFT)
+#define ICPC3 (ICPC3_MASK << ICPC3_SHIFT)
+#define ICPC4 (ICPC4_MASK << ICPC4_SHIFT)
+#define ICPC5 (ICPC5_MASK << ICPC5_SHIFT)
+#define ICPC6 (ICPC6_MASK << ICPC6_SHIFT)
+#define ICPC7 (ICPC7_MASK << ICPC7_SHIFT)
+#define OCPC0 (OCPC0_MASK << OCPC0_SHIFT)
+#define OCPC1 (OCPC1_MASK << OCPC1_SHIFT)
+#define OCPC2 (OCPC2_MASK << OCPC2_SHIFT)
+#define OCPC3 (OCPC3_MASK << OCPC3_SHIFT)
+#define OCPC4 (OCPC4_MASK << OCPC4_SHIFT)
+#define OCPC5 (OCPC5_MASK << OCPC5_SHIFT)
+#define OCPC6 (OCPC6_MASK << OCPC6_SHIFT)
+#define OCPC7 (OCPC7_MASK << OCPC7_SHIFT)
+
+
+/* PAR */
+#define FAULT (FAULT_MASK << FAULT_SHIFT)
+/* If a fault is present, these are the
+same as the fault fields in the FAR */
+#define FAULT_TF (FAULT_TF_MASK << FAULT_TF_SHIFT)
+#define FAULT_AFF (FAULT_AFF_MASK << FAULT_AFF_SHIFT)
+#define FAULT_APF (FAULT_APF_MASK << FAULT_APF_SHIFT)
+#define FAULT_TLBMF (FAULT_TLBMF_MASK << FAULT_TLBMF_SHIFT)
+#define FAULT_HTWDEEF (FAULT_HTWDEEF_MASK << FAULT_HTWDEEF_SHIFT)
+#define FAULT_HTWSEEF (FAULT_HTWSEEF_MASK << FAULT_HTWSEEF_SHIFT)
+#define FAULT_MHF (FAULT_MHF_MASK << FAULT_MHF_SHIFT)
+#define FAULT_SL (FAULT_SL_MASK << FAULT_SL_SHIFT)
+#define FAULT_SS (FAULT_SS_MASK << FAULT_SS_SHIFT)
+
+/* If NO fault is present, the following fields are in effect */
+/* (FAULT remains as before) */
+#define PAR_NOFAULT_SS (PAR_NOFAULT_SS_MASK << PAR_NOFAULT_SS_SHIFT)
+#define PAR_NOFAULT_MT (PAR_NOFAULT_MT_MASK << PAR_NOFAULT_MT_SHIFT)
+#define PAR_NOFAULT_SH (PAR_NOFAULT_SH_MASK << PAR_NOFAULT_SH_SHIFT)
+#define PAR_NOFAULT_NS (PAR_NOFAULT_NS_MASK << PAR_NOFAULT_NS_SHIFT)
+#define PAR_NOFAULT_NOS (PAR_NOFAULT_NOS_MASK << PAR_NOFAULT_NOS_SHIFT)
+#define PAR_NPFAULT_PA (PAR_NPFAULT_PA_MASK << PAR_NPFAULT_PA_SHIFT)
+
+
+/* PRRR */
+#define MTC0 (MTC0_MASK << MTC0_SHIFT)
+#define MTC1 (MTC1_MASK << MTC1_SHIFT)
+#define MTC2 (MTC2_MASK << MTC2_SHIFT)
+#define MTC3 (MTC3_MASK << MTC3_SHIFT)
+#define MTC4 (MTC4_MASK << MTC4_SHIFT)
+#define MTC5 (MTC5_MASK << MTC5_SHIFT)
+#define MTC6 (MTC6_MASK << MTC6_SHIFT)
+#define MTC7 (MTC7_MASK << MTC7_SHIFT)
+#define SHDSH0 (SHDSH0_MASK << SHDSH0_SHIFT)
+#define SHDSH1 (SHDSH1_MASK << SHDSH1_SHIFT)
+#define SHNMSH0 (SHNMSH0_MASK << SHNMSH0_SHIFT)
+#define SHNMSH1 (SHNMSH1_MASK << SHNMSH1_SHIFT)
+#define NOS0 (NOS0_MASK << NOS0_SHIFT)
+#define NOS1 (NOS1_MASK << NOS1_SHIFT)
+#define NOS2 (NOS2_MASK << NOS2_SHIFT)
+#define NOS3 (NOS3_MASK << NOS3_SHIFT)
+#define NOS4 (NOS4_MASK << NOS4_SHIFT)
+#define NOS5 (NOS5_MASK << NOS5_SHIFT)
+#define NOS6 (NOS6_MASK << NOS6_SHIFT)
+#define NOS7 (NOS7_MASK << NOS7_SHIFT)
+
+
+/* RESUME */
+#define TNR (TNR_MASK << TNR_SHIFT)
+
+
+/* SCTLR */
+#define M (M_MASK << M_SHIFT)
+#define TRE (TRE_MASK << TRE_SHIFT)
+#define AFE (AFE_MASK << AFE_SHIFT)
+#define HAF (HAF_MASK << HAF_SHIFT)
+#define BE (BE_MASK << BE_SHIFT)
+#define AFFD (AFFD_MASK << AFFD_SHIFT)
+
+
+/* TLBIASID */
+#define TLBIASID_ASID (TLBIASID_ASID_MASK << TLBIASID_ASID_SHIFT)
+
+
+/* TLBIVA */
+#define TLBIVA_ASID (TLBIVA_ASID_MASK << TLBIVA_ASID_SHIFT)
+#define TLBIVA_VA (TLBIVA_VA_MASK << TLBIVA_VA_SHIFT)
+
+
+/* TLBIVAA */
+#define TLBIVAA_VA (TLBIVAA_VA_MASK << TLBIVAA_VA_SHIFT)
+
+
+/* TLBLCKR */
+#define LKE (LKE_MASK << LKE_SHIFT)
+#define TLBLCKR_TLBIALLCFG (TLBLCKR_TLBIALLCFG_MASK<<TLBLCKR_TLBIALLCFG_SHIFT)
+#define TLBIASIDCFG (TLBIASIDCFG_MASK << TLBIASIDCFG_SHIFT)
+#define TLBIVAACFG (TLBIVAACFG_MASK << TLBIVAACFG_SHIFT)
+#define FLOOR (FLOOR_MASK << FLOOR_SHIFT)
+#define VICTIM (VICTIM_MASK << VICTIM_SHIFT)
+
+
+/* TTBCR */
+#define N (N_MASK << N_SHIFT)
+#define PD0 (PD0_MASK << PD0_SHIFT)
+#define PD1 (PD1_MASK << PD1_SHIFT)
+
+
+/* TTBR0 */
+#define TTBR0_IRGNH (TTBR0_IRGNH_MASK << TTBR0_IRGNH_SHIFT)
+#define TTBR0_SH (TTBR0_SH_MASK << TTBR0_SH_SHIFT)
+#define TTBR0_ORGN (TTBR0_ORGN_MASK << TTBR0_ORGN_SHIFT)
+#define TTBR0_NOS (TTBR0_NOS_MASK << TTBR0_NOS_SHIFT)
+#define TTBR0_IRGNL (TTBR0_IRGNL_MASK << TTBR0_IRGNL_SHIFT)
+#define TTBR0_PA (TTBR0_PA_MASK << TTBR0_PA_SHIFT)
+
+
+/* TTBR1 */
+#define TTBR1_IRGNH (TTBR1_IRGNH_MASK << TTBR1_IRGNH_SHIFT)
+#define TTBR1_SH (TTBR1_SH_MASK << TTBR1_SH_SHIFT)
+#define TTBR1_ORGN (TTBR1_ORGN_MASK << TTBR1_ORGN_SHIFT)
+#define TTBR1_NOS (TTBR1_NOS_MASK << TTBR1_NOS_SHIFT)
+#define TTBR1_IRGNL (TTBR1_IRGNL_MASK << TTBR1_IRGNL_SHIFT)
+#define TTBR1_PA (TTBR1_PA_MASK << TTBR1_PA_SHIFT)
+
+
+/* V2PSR */
+#define HIT (HIT_MASK << HIT_SHIFT)
+#define INDEX (INDEX_MASK << INDEX_SHIFT)
+
+
+/* V2Pxx */
+#define V2Pxx_INDEX (V2Pxx_INDEX_MASK << V2Pxx_INDEX_SHIFT)
+#define V2Pxx_VA (V2Pxx_VA_MASK << V2Pxx_VA_SHIFT)
+
+
+/* Global Register Masks */
+/* CBACRn */
+#define RWVMID_MASK 0x1F
+#define RWE_MASK 0x01
+#define RWGE_MASK 0x01
+#define CBVMID_MASK 0x1F
+#define IRPTNDX_MASK 0xFF
+
+
+/* CR */
+#define RPUE_MASK 0x01
+#define RPUERE_MASK 0x01
+#define RPUEIE_MASK 0x01
+#define DCDEE_MASK 0x01
+#define CLIENTPD_MASK 0x01
+#define STALLD_MASK 0x01
+#define TLBLKCRWE_MASK 0x01
+#define CR_TLBIALLCFG_MASK 0x01
+#define TLBIVMIDCFG_MASK 0x01
+#define CR_HUME_MASK 0x01
+
+
+/* ESR */
+#define CFG_MASK 0x01
+#define BYPASS_MASK 0x01
+#define ESR_MULTI_MASK 0x01
+
+
+/* ESYNR0 */
+#define ESYNR0_AMID_MASK 0xFF
+#define ESYNR0_APID_MASK 0x1F
+#define ESYNR0_ABID_MASK 0x07
+#define ESYNR0_AVMID_MASK 0x1F
+#define ESYNR0_ATID_MASK 0xFF
+
+
+/* ESYNR1 */
+#define ESYNR1_AMEMTYPE_MASK 0x07
+#define ESYNR1_ASHARED_MASK 0x01
+#define ESYNR1_AINNERSHARED_MASK 0x01
+#define ESYNR1_APRIV_MASK 0x01
+#define ESYNR1_APROTNS_MASK 0x01
+#define ESYNR1_AINST_MASK 0x01
+#define ESYNR1_AWRITE_MASK 0x01
+#define ESYNR1_ABURST_MASK 0x01
+#define ESYNR1_ALEN_MASK 0x0F
+#define ESYNR1_ASIZE_MASK 0x01
+#define ESYNR1_ALOCK_MASK 0x03
+#define ESYNR1_AOOO_MASK 0x01
+#define ESYNR1_AFULL_MASK 0x01
+#define ESYNR1_AC_MASK 0x01
+#define ESYNR1_DCD_MASK 0x01
+
+
+/* IDR */
+#define NM2VCBMT_MASK 0x1FF
+#define HTW_MASK 0x01
+#define HUM_MASK 0x01
+#define TLBSIZE_MASK 0x0F
+#define NCB_MASK 0xFF
+#define NIRPT_MASK 0xFF
+
+
+/* M2VCBRn */
+#define VMID_MASK 0x1F
+#define CBNDX_MASK 0xFF
+#define BYPASSD_MASK 0x01
+#define BPRCOSH_MASK 0x01
+#define BPRCISH_MASK 0x01
+#define BPRCNSH_MASK 0x01
+#define BPSHCFG_MASK 0x03
+#define NSCFG_MASK 0x03
+#define BPMTCFG_MASK 0x01
+#define BPMEMTYPE_MASK 0x07
+
+
+/* REV */
+#define MINOR_MASK 0x0F
+#define MAJOR_MASK 0x0F
+
+
+/* TESTBUSCR */
+#define TBE_MASK 0x01
+#define SPDMBE_MASK 0x01
+#define WGSEL_MASK 0x03
+#define TBLSEL_MASK 0x03
+#define TBHSEL_MASK 0x03
+#define SPDM0SEL_MASK 0x0F
+#define SPDM1SEL_MASK 0x0F
+#define SPDM2SEL_MASK 0x0F
+#define SPDM3SEL_MASK 0x0F
+
+
+/* TLBIMID */
+#define TLBIVMID_VMID_MASK 0x1F
+
+
+/* TLBRSW */
+#define TLBRSW_INDEX_MASK 0xFF
+#define TLBBFBS_MASK 0x03
+
+
+/* TLBTR0 */
+#define PR_MASK 0x01
+#define PW_MASK 0x01
+#define UR_MASK 0x01
+#define UW_MASK 0x01
+#define XN_MASK 0x01
+#define NSDESC_MASK 0x01
+#define ISH_MASK 0x01
+#define SH_MASK 0x01
+#define MT_MASK 0x07
+#define DPSIZR_MASK 0x07
+#define DPSIZC_MASK 0x07
+
+
+/* TLBTR1 */
+#define TLBTR1_VMID_MASK 0x1F
+#define TLBTR1_PA_MASK 0x000FFFFF
+
+
+/* TLBTR2 */
+#define TLBTR2_ASID_MASK 0xFF
+#define TLBTR2_V_MASK 0x01
+#define TLBTR2_NSTID_MASK 0x01
+#define TLBTR2_NV_MASK 0x01
+#define TLBTR2_VA_MASK 0x000FFFFF
+
+
+/* Global Register Shifts */
+/* CBACRn */
+#define RWVMID_SHIFT 0
+#define RWE_SHIFT 8
+#define RWGE_SHIFT 9
+#define CBVMID_SHIFT 16
+#define IRPTNDX_SHIFT 24
+
+
+/* CR */
+#define RPUE_SHIFT 0
+#define RPUERE_SHIFT 1
+#define RPUEIE_SHIFT 2
+#define DCDEE_SHIFT 3
+#define CLIENTPD_SHIFT 4
+#define STALLD_SHIFT 5
+#define TLBLKCRWE_SHIFT 6
+#define CR_TLBIALLCFG_SHIFT 7
+#define TLBIVMIDCFG_SHIFT 8
+#define CR_HUME_SHIFT 9
+
+
+/* ESR */
+#define CFG_SHIFT 0
+#define BYPASS_SHIFT 1
+#define ESR_MULTI_SHIFT 31
+
+
+/* ESYNR0 */
+#define ESYNR0_AMID_SHIFT 0
+#define ESYNR0_APID_SHIFT 8
+#define ESYNR0_ABID_SHIFT 13
+#define ESYNR0_AVMID_SHIFT 16
+#define ESYNR0_ATID_SHIFT 24
+
+
+/* ESYNR1 */
+#define ESYNR1_AMEMTYPE_SHIFT 0
+#define ESYNR1_ASHARED_SHIFT 3
+#define ESYNR1_AINNERSHARED_SHIFT 4
+#define ESYNR1_APRIV_SHIFT 5
+#define ESYNR1_APROTNS_SHIFT 6
+#define ESYNR1_AINST_SHIFT 7
+#define ESYNR1_AWRITE_SHIFT 8
+#define ESYNR1_ABURST_SHIFT 10
+#define ESYNR1_ALEN_SHIFT 12
+#define ESYNR1_ASIZE_SHIFT 16
+#define ESYNR1_ALOCK_SHIFT 20
+#define ESYNR1_AOOO_SHIFT 22
+#define ESYNR1_AFULL_SHIFT 24
+#define ESYNR1_AC_SHIFT 30
+#define ESYNR1_DCD_SHIFT 31
+
+
+/* IDR */
+#define NM2VCBMT_SHIFT 0
+#define HTW_SHIFT 9
+#define HUM_SHIFT 10
+#define TLBSIZE_SHIFT 12
+#define NCB_SHIFT 16
+#define NIRPT_SHIFT 24
+
+
+/* M2VCBRn */
+#define VMID_SHIFT 0
+#define CBNDX_SHIFT 8
+#define BYPASSD_SHIFT 16
+#define BPRCOSH_SHIFT 17
+#define BPRCISH_SHIFT 18
+#define BPRCNSH_SHIFT 19
+#define BPSHCFG_SHIFT 20
+#define NSCFG_SHIFT 22
+#define BPMTCFG_SHIFT 24
+#define BPMEMTYPE_SHIFT 25
+
+
+/* REV */
+#define MINOR_SHIFT 0
+#define MAJOR_SHIFT 4
+
+
+/* TESTBUSCR */
+#define TBE_SHIFT 0
+#define SPDMBE_SHIFT 1
+#define WGSEL_SHIFT 8
+#define TBLSEL_SHIFT 12
+#define TBHSEL_SHIFT 14
+#define SPDM0SEL_SHIFT 16
+#define SPDM1SEL_SHIFT 20
+#define SPDM2SEL_SHIFT 24
+#define SPDM3SEL_SHIFT 28
+
+
+/* TLBIMID */
+#define TLBIVMID_VMID_SHIFT 0
+
+
+/* TLBRSW */
+#define TLBRSW_INDEX_SHIFT 0
+#define TLBBFBS_SHIFT 8
+
+
+/* TLBTR0 */
+#define PR_SHIFT 0
+#define PW_SHIFT 1
+#define UR_SHIFT 2
+#define UW_SHIFT 3
+#define XN_SHIFT 4
+#define NSDESC_SHIFT 6
+#define ISH_SHIFT 7
+#define SH_SHIFT 8
+#define MT_SHIFT 9
+#define DPSIZR_SHIFT 16
+#define DPSIZC_SHIFT 20
+
+
+/* TLBTR1 */
+#define TLBTR1_VMID_SHIFT 0
+#define TLBTR1_PA_SHIFT 12
+
+
+/* TLBTR2 */
+#define TLBTR2_ASID_SHIFT 0
+#define TLBTR2_V_SHIFT 8
+#define TLBTR2_NSTID_SHIFT 9
+#define TLBTR2_NV_SHIFT 10
+#define TLBTR2_VA_SHIFT 12
+
+
+/* Context Register Masks */
+/* ACTLR */
+#define CFERE_MASK 0x01
+#define CFEIE_MASK 0x01
+#define PTSHCFG_MASK 0x03
+#define RCOSH_MASK 0x01
+#define RCISH_MASK 0x01
+#define RCNSH_MASK 0x01
+#define PRIVCFG_MASK 0x03
+#define DNA_MASK 0x01
+#define DNLV2PA_MASK 0x01
+#define TLBMCFG_MASK 0x03
+#define CFCFG_MASK 0x01
+#define TIPCF_MASK 0x01
+#define V2PCFG_MASK 0x03
+#define HUME_MASK 0x01
+#define PTMTCFG_MASK 0x01
+#define PTMEMTYPE_MASK 0x07
+
+
+/* BFBCR */
+#define BFBDFE_MASK 0x01
+#define BFBSFE_MASK 0x01
+#define SFVS_MASK 0x01
+#define FLVIC_MASK 0x0F
+#define SLVIC_MASK 0x0F
+
+
+/* CONTEXTIDR */
+#define CONTEXTIDR_ASID_MASK 0xFF
+#define PROCID_MASK 0x00FFFFFF
+
+
+/* FSR */
+#define TF_MASK 0x01
+#define AFF_MASK 0x01
+#define APF_MASK 0x01
+#define TLBMF_MASK 0x01
+#define HTWDEEF_MASK 0x01
+#define HTWSEEF_MASK 0x01
+#define MHF_MASK 0x01
+#define SL_MASK 0x01
+#define SS_MASK 0x01
+#define MULTI_MASK 0x01
+
+
+/* FSYNR0 */
+#define AMID_MASK 0xFF
+#define APID_MASK 0x1F
+#define ABID_MASK 0x07
+#define ATID_MASK 0xFF
+
+
+/* FSYNR1 */
+#define AMEMTYPE_MASK 0x07
+#define ASHARED_MASK 0x01
+#define AINNERSHARED_MASK 0x01
+#define APRIV_MASK 0x01
+#define APROTNS_MASK 0x01
+#define AINST_MASK 0x01
+#define AWRITE_MASK 0x01
+#define ABURST_MASK 0x01
+#define ALEN_MASK 0x0F
+#define FSYNR1_ASIZE_MASK 0x07
+#define ALOCK_MASK 0x03
+#define AFULL_MASK 0x01
+
+
+/* NMRR */
+#define ICPC0_MASK 0x03
+#define ICPC1_MASK 0x03
+#define ICPC2_MASK 0x03
+#define ICPC3_MASK 0x03
+#define ICPC4_MASK 0x03
+#define ICPC5_MASK 0x03
+#define ICPC6_MASK 0x03
+#define ICPC7_MASK 0x03
+#define OCPC0_MASK 0x03
+#define OCPC1_MASK 0x03
+#define OCPC2_MASK 0x03
+#define OCPC3_MASK 0x03
+#define OCPC4_MASK 0x03
+#define OCPC5_MASK 0x03
+#define OCPC6_MASK 0x03
+#define OCPC7_MASK 0x03
+
+
+/* PAR */
+#define FAULT_MASK 0x01
+/* If a fault is present, these are the
+same as the fault fields in the FAR */
+#define FAULT_TF_MASK 0x01
+#define FAULT_AFF_MASK 0x01
+#define FAULT_APF_MASK 0x01
+#define FAULT_TLBMF_MASK 0x01
+#define FAULT_HTWDEEF_MASK 0x01
+#define FAULT_HTWSEEF_MASK 0x01
+#define FAULT_MHF_MASK 0x01
+#define FAULT_SL_MASK 0x01
+#define FAULT_SS_MASK 0x01
+
+/* If NO fault is present, the following
+ * fields are in effect
+ * (FAULT remains as before) */
+#define PAR_NOFAULT_SS_MASK 0x01
+#define PAR_NOFAULT_MT_MASK 0x07
+#define PAR_NOFAULT_SH_MASK 0x01
+#define PAR_NOFAULT_NS_MASK 0x01
+#define PAR_NOFAULT_NOS_MASK 0x01
+#define PAR_NPFAULT_PA_MASK 0x000FFFFF
+
+
+/* PRRR */
+#define MTC0_MASK 0x03
+#define MTC1_MASK 0x03
+#define MTC2_MASK 0x03
+#define MTC3_MASK 0x03
+#define MTC4_MASK 0x03
+#define MTC5_MASK 0x03
+#define MTC6_MASK 0x03
+#define MTC7_MASK 0x03
+#define SHDSH0_MASK 0x01
+#define SHDSH1_MASK 0x01
+#define SHNMSH0_MASK 0x01
+#define SHNMSH1_MASK 0x01
+#define NOS0_MASK 0x01
+#define NOS1_MASK 0x01
+#define NOS2_MASK 0x01
+#define NOS3_MASK 0x01
+#define NOS4_MASK 0x01
+#define NOS5_MASK 0x01
+#define NOS6_MASK 0x01
+#define NOS7_MASK 0x01
+
+
+/* RESUME */
+#define TNR_MASK 0x01
+
+
+/* SCTLR */
+#define M_MASK 0x01
+#define TRE_MASK 0x01
+#define AFE_MASK 0x01
+#define HAF_MASK 0x01
+#define BE_MASK 0x01
+#define AFFD_MASK 0x01
+
+
+/* TLBIASID */
+#define TLBIASID_ASID_MASK 0xFF
+
+
+/* TLBIVA */
+#define TLBIVA_ASID_MASK 0xFF
+#define TLBIVA_VA_MASK 0x000FFFFF
+
+
+/* TLBIVAA */
+#define TLBIVAA_VA_MASK 0x000FFFFF
+
+
+/* TLBLCKR */
+#define LKE_MASK 0x01
+#define TLBLCKR_TLBIALLCFG_MASK 0x01
+#define TLBIASIDCFG_MASK 0x01
+#define TLBIVAACFG_MASK 0x01
+#define FLOOR_MASK 0xFF
+#define VICTIM_MASK 0xFF
+
+
+/* TTBCR */
+#define N_MASK 0x07
+#define PD0_MASK 0x01
+#define PD1_MASK 0x01
+
+
+/* TTBR0 */
+#define TTBR0_IRGNH_MASK 0x01
+#define TTBR0_SH_MASK 0x01
+#define TTBR0_ORGN_MASK 0x03
+#define TTBR0_NOS_MASK 0x01
+#define TTBR0_IRGNL_MASK 0x01
+#define TTBR0_PA_MASK 0x0003FFFF
+
+
+/* TTBR1 */
+#define TTBR1_IRGNH_MASK 0x01
+#define TTBR1_SH_MASK 0x01
+#define TTBR1_ORGN_MASK 0x03
+#define TTBR1_NOS_MASK 0x01
+#define TTBR1_IRGNL_MASK 0x01
+#define TTBR1_PA_MASK 0x0003FFFF
+
+
+/* V2PSR */
+#define HIT_MASK 0x01
+#define INDEX_MASK 0xFF
+
+
+/* V2Pxx */
+#define V2Pxx_INDEX_MASK 0xFF
+#define V2Pxx_VA_MASK 0x000FFFFF
+
+
+/* Context Register Shifts */
+/* ACTLR */
+#define CFERE_SHIFT 0
+#define CFEIE_SHIFT 1
+#define PTSHCFG_SHIFT 2
+#define RCOSH_SHIFT 4
+#define RCISH_SHIFT 5
+#define RCNSH_SHIFT 6
+#define PRIVCFG_SHIFT 8
+#define DNA_SHIFT 10
+#define DNLV2PA_SHIFT 11
+#define TLBMCFG_SHIFT 12
+#define CFCFG_SHIFT 14
+#define TIPCF_SHIFT 15
+#define V2PCFG_SHIFT 16
+#define HUME_SHIFT 18
+#define PTMTCFG_SHIFT 20
+#define PTMEMTYPE_SHIFT 21
+
+
+/* BFBCR */
+#define BFBDFE_SHIFT 0
+#define BFBSFE_SHIFT 1
+#define SFVS_SHIFT 2
+#define FLVIC_SHIFT 4
+#define SLVIC_SHIFT 8
+
+
+/* CONTEXTIDR */
+#define CONTEXTIDR_ASID_SHIFT 0
+#define PROCID_SHIFT 8
+
+
+/* FSR */
+#define TF_SHIFT 1
+#define AFF_SHIFT 2
+#define APF_SHIFT 3
+#define TLBMF_SHIFT 4
+#define HTWDEEF_SHIFT 5
+#define HTWSEEF_SHIFT 6
+#define MHF_SHIFT 7
+#define SL_SHIFT 16
+#define SS_SHIFT 30
+#define MULTI_SHIFT 31
+
+
+/* FSYNR0 */
+#define AMID_SHIFT 0
+#define APID_SHIFT 8
+#define ABID_SHIFT 13
+#define ATID_SHIFT 24
+
+
+/* FSYNR1 */
+#define AMEMTYPE_SHIFT 0
+#define ASHARED_SHIFT 3
+#define AINNERSHARED_SHIFT 4
+#define APRIV_SHIFT 5
+#define APROTNS_SHIFT 6
+#define AINST_SHIFT 7
+#define AWRITE_SHIFT 8
+#define ABURST_SHIFT 10
+#define ALEN_SHIFT 12
+#define FSYNR1_ASIZE_SHIFT 16
+#define ALOCK_SHIFT 20
+#define AFULL_SHIFT 24
+
+
+/* NMRR */
+#define ICPC0_SHIFT 0
+#define ICPC1_SHIFT 2
+#define ICPC2_SHIFT 4
+#define ICPC3_SHIFT 6
+#define ICPC4_SHIFT 8
+#define ICPC5_SHIFT 10
+#define ICPC6_SHIFT 12
+#define ICPC7_SHIFT 14
+#define OCPC0_SHIFT 16
+#define OCPC1_SHIFT 18
+#define OCPC2_SHIFT 20
+#define OCPC3_SHIFT 22
+#define OCPC4_SHIFT 24
+#define OCPC5_SHIFT 26
+#define OCPC6_SHIFT 28
+#define OCPC7_SHIFT 30
+
+
+/* PAR */
+#define FAULT_SHIFT 0
+/* If a fault is present, these are the
+same as the fault fields in the FAR */
+#define FAULT_TF_SHIFT 1
+#define FAULT_AFF_SHIFT 2
+#define FAULT_APF_SHIFT 3
+#define FAULT_TLBMF_SHIFT 4
+#define FAULT_HTWDEEF_SHIFT 5
+#define FAULT_HTWSEEF_SHIFT 6
+#define FAULT_MHF_SHIFT 7
+#define FAULT_SL_SHIFT 16
+#define FAULT_SS_SHIFT 30
+
+/* If NO fault is present, the following
+ * fields are in effect
+ * (FAULT remains as before) */
+#define PAR_NOFAULT_SS_SHIFT 1
+#define PAR_NOFAULT_MT_SHIFT 4
+#define PAR_NOFAULT_SH_SHIFT 7
+#define PAR_NOFAULT_NS_SHIFT 9
+#define PAR_NOFAULT_NOS_SHIFT 10
+#define PAR_NPFAULT_PA_SHIFT 12
+
+
+/* PRRR */
+#define MTC0_SHIFT 0
+#define MTC1_SHIFT 2
+#define MTC2_SHIFT 4
+#define MTC3_SHIFT 6
+#define MTC4_SHIFT 8
+#define MTC5_SHIFT 10
+#define MTC6_SHIFT 12
+#define MTC7_SHIFT 14
+#define SHDSH0_SHIFT 16
+#define SHDSH1_SHIFT 17
+#define SHNMSH0_SHIFT 18
+#define SHNMSH1_SHIFT 19
+#define NOS0_SHIFT 24
+#define NOS1_SHIFT 25
+#define NOS2_SHIFT 26
+#define NOS3_SHIFT 27
+#define NOS4_SHIFT 28
+#define NOS5_SHIFT 29
+#define NOS6_SHIFT 30
+#define NOS7_SHIFT 31
+
+
+/* RESUME */
+#define TNR_SHIFT 0
+
+
+/* SCTLR */
+#define M_SHIFT 0
+#define TRE_SHIFT 1
+#define AFE_SHIFT 2
+#define HAF_SHIFT 3
+#define BE_SHIFT 4
+#define AFFD_SHIFT 5
+
+
+/* TLBIASID */
+#define TLBIASID_ASID_SHIFT 0
+
+
+/* TLBIVA */
+#define TLBIVA_ASID_SHIFT 0
+#define TLBIVA_VA_SHIFT 12
+
+
+/* TLBIVAA */
+#define TLBIVAA_VA_SHIFT 12
+
+
+/* TLBLCKR */
+#define LKE_SHIFT 0
+#define TLBLCKR_TLBIALLCFG_SHIFT 1
+#define TLBIASIDCFG_SHIFT 2
+#define TLBIVAACFG_SHIFT 3
+#define FLOOR_SHIFT 8
+#define VICTIM_SHIFT 8
+
+
+/* TTBCR */
+#define N_SHIFT 3
+#define PD0_SHIFT 4
+#define PD1_SHIFT 5
+
+
+/* TTBR0 */
+#define TTBR0_IRGNH_SHIFT 0
+#define TTBR0_SH_SHIFT 1
+#define TTBR0_ORGN_SHIFT 3
+#define TTBR0_NOS_SHIFT 5
+#define TTBR0_IRGNL_SHIFT 6
+#define TTBR0_PA_SHIFT 14
+
+
+/* TTBR1 */
+#define TTBR1_IRGNH_SHIFT 0
+#define TTBR1_SH_SHIFT 1
+#define TTBR1_ORGN_SHIFT 3
+#define TTBR1_NOS_SHIFT 5
+#define TTBR1_IRGNL_SHIFT 6
+#define TTBR1_PA_SHIFT 14
+
+
+/* V2PSR */
+#define HIT_SHIFT 0
+#define INDEX_SHIFT 8
+
+
+/* V2Pxx */
+#define V2Pxx_INDEX_SHIFT 0
+#define V2Pxx_VA_SHIFT 12
+
+#endif
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
new file mode 100644
index 000000000..051815c9d
--- /dev/null
+++ b/drivers/iommu/mtk_iommu.c
@@ -0,0 +1,894 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2015-2016 MediaTek Inc.
+ * Author: Yong Wu <yong.wu@mediatek.com>
+ */
+#include <linux/bug.h>
+#include <linux/clk.h>
+#include <linux/component.h>
+#include <linux/device.h>
+#include <linux/dma-iommu.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/list.h>
+#include <linux/mfd/syscon.h>
+#include <linux/of_address.h>
+#include <linux/of_iommu.h>
+#include <linux/of_irq.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/soc/mediatek/infracfg.h>
+#include <asm/barrier.h>
+#include <soc/mediatek/smi.h>
+
+#include "mtk_iommu.h"
+
+#define REG_MMU_PT_BASE_ADDR 0x000
+#define MMU_PT_ADDR_MASK GENMASK(31, 7)
+
+#define REG_MMU_INVALIDATE 0x020
+#define F_ALL_INVLD 0x2
+#define F_MMU_INV_RANGE 0x1
+
+#define REG_MMU_INVLD_START_A 0x024
+#define REG_MMU_INVLD_END_A 0x028
+
+#define REG_MMU_INV_SEL_GEN2 0x02c
+#define REG_MMU_INV_SEL_GEN1 0x038
+#define F_INVLD_EN0 BIT(0)
+#define F_INVLD_EN1 BIT(1)
+
+#define REG_MMU_MISC_CTRL 0x048
+#define F_MMU_IN_ORDER_WR_EN_MASK (BIT(1) | BIT(17))
+#define F_MMU_STANDARD_AXI_MODE_MASK (BIT(3) | BIT(19))
+
+#define REG_MMU_DCM_DIS 0x050
+#define REG_MMU_WR_LEN_CTRL 0x054
+#define F_MMU_WR_THROT_DIS_MASK (BIT(5) | BIT(21))
+
+#define REG_MMU_CTRL_REG 0x110
+#define F_MMU_TF_PROT_TO_PROGRAM_ADDR (2 << 4)
+#define F_MMU_PREFETCH_RT_REPLACE_MOD BIT(4)
+#define F_MMU_TF_PROT_TO_PROGRAM_ADDR_MT8173 (2 << 5)
+
+#define REG_MMU_IVRP_PADDR 0x114
+
+#define REG_MMU_VLD_PA_RNG 0x118
+#define F_MMU_VLD_PA_RNG(EA, SA) (((EA) << 8) | (SA))
+
+#define REG_MMU_INT_CONTROL0 0x120
+#define F_L2_MULIT_HIT_EN BIT(0)
+#define F_TABLE_WALK_FAULT_INT_EN BIT(1)
+#define F_PREETCH_FIFO_OVERFLOW_INT_EN BIT(2)
+#define F_MISS_FIFO_OVERFLOW_INT_EN BIT(3)
+#define F_PREFETCH_FIFO_ERR_INT_EN BIT(5)
+#define F_MISS_FIFO_ERR_INT_EN BIT(6)
+#define F_INT_CLR_BIT BIT(12)
+
+#define REG_MMU_INT_MAIN_CONTROL 0x124
+ /* mmu0 | mmu1 */
+#define F_INT_TRANSLATION_FAULT (BIT(0) | BIT(7))
+#define F_INT_MAIN_MULTI_HIT_FAULT (BIT(1) | BIT(8))
+#define F_INT_INVALID_PA_FAULT (BIT(2) | BIT(9))
+#define F_INT_ENTRY_REPLACEMENT_FAULT (BIT(3) | BIT(10))
+#define F_INT_TLB_MISS_FAULT (BIT(4) | BIT(11))
+#define F_INT_MISS_TRANSACTION_FIFO_FAULT (BIT(5) | BIT(12))
+#define F_INT_PRETETCH_TRANSATION_FIFO_FAULT (BIT(6) | BIT(13))
+
+#define REG_MMU_CPE_DONE 0x12C
+
+#define REG_MMU_FAULT_ST1 0x134
+#define F_REG_MMU0_FAULT_MASK GENMASK(6, 0)
+#define F_REG_MMU1_FAULT_MASK GENMASK(13, 7)
+
+#define REG_MMU0_FAULT_VA 0x13c
+#define F_MMU_FAULT_VA_WRITE_BIT BIT(1)
+#define F_MMU_FAULT_VA_LAYER_BIT BIT(0)
+
+#define REG_MMU0_INVLD_PA 0x140
+#define REG_MMU1_FAULT_VA 0x144
+#define REG_MMU1_INVLD_PA 0x148
+#define REG_MMU0_INT_ID 0x150
+#define REG_MMU1_INT_ID 0x154
+#define F_MMU_INT_ID_COMM_ID(a) (((a) >> 9) & 0x7)
+#define F_MMU_INT_ID_SUB_COMM_ID(a) (((a) >> 7) & 0x3)
+#define F_MMU_INT_ID_LARB_ID(a) (((a) >> 7) & 0x7)
+#define F_MMU_INT_ID_PORT_ID(a) (((a) >> 2) & 0x1f)
+
+#define MTK_PROTECT_PA_ALIGN 256
+
+/*
+ * Get the local arbiter ID and the portid within the larb arbiter
+ * from mtk_m4u_id which is defined by MTK_M4U_ID.
+ */
+#define MTK_M4U_TO_LARB(id) (((id) >> 5) & 0xf)
+#define MTK_M4U_TO_PORT(id) ((id) & 0x1f)
+
+#define HAS_4GB_MODE BIT(0)
+/* HW will use the EMI clock if there isn't the "bclk". */
+#define HAS_BCLK BIT(1)
+#define HAS_VLD_PA_RNG BIT(2)
+#define RESET_AXI BIT(3)
+#define OUT_ORDER_WR_EN BIT(4)
+#define HAS_SUB_COMM BIT(5)
+#define WR_THROT_EN BIT(6)
+#define HAS_LEGACY_IVRP_PADDR BIT(7)
+
+#define MTK_IOMMU_HAS_FLAG(pdata, _x) \
+ ((((pdata)->flags) & (_x)) == (_x))
+
+struct mtk_iommu_domain {
+ struct io_pgtable_cfg cfg;
+ struct io_pgtable_ops *iop;
+
+ struct iommu_domain domain;
+};
+
+static const struct iommu_ops mtk_iommu_ops;
+
+/*
+ * In M4U 4GB mode, the physical address is remapped as below:
+ *
+ * CPU Physical address:
+ * ====================
+ *
+ * 0 1G 2G 3G 4G 5G
+ * |---A---|---B---|---C---|---D---|---E---|
+ * +--I/O--+------------Memory-------------+
+ *
+ * IOMMU output physical address:
+ * =============================
+ *
+ * 4G 5G 6G 7G 8G
+ * |---E---|---B---|---C---|---D---|
+ * +------------Memory-------------+
+ *
+ * The Region 'A'(I/O) can NOT be mapped by M4U; For Region 'B'/'C'/'D', the
+ * bit32 of the CPU physical address always is needed to set, and for Region
+ * 'E', the CPU physical address keep as is.
+ * Additionally, The iommu consumers always use the CPU phyiscal address.
+ */
+#define MTK_IOMMU_4GB_MODE_REMAP_BASE 0x140000000UL
+
+static LIST_HEAD(m4ulist); /* List all the M4U HWs */
+
+#define for_each_m4u(data) list_for_each_entry(data, &m4ulist, list)
+
+/*
+ * There may be 1 or 2 M4U HWs, But we always expect they are in the same domain
+ * for the performance.
+ *
+ * Here always return the mtk_iommu_data of the first probed M4U where the
+ * iommu domain information is recorded.
+ */
+static struct mtk_iommu_data *mtk_iommu_get_m4u_data(void)
+{
+ struct mtk_iommu_data *data;
+
+ for_each_m4u(data)
+ return data;
+
+ return NULL;
+}
+
+static struct mtk_iommu_domain *to_mtk_domain(struct iommu_domain *dom)
+{
+ return container_of(dom, struct mtk_iommu_domain, domain);
+}
+
+static void mtk_iommu_tlb_flush_all(void *cookie)
+{
+ struct mtk_iommu_data *data = cookie;
+
+ for_each_m4u(data) {
+ writel_relaxed(F_INVLD_EN1 | F_INVLD_EN0,
+ data->base + data->plat_data->inv_sel_reg);
+ writel_relaxed(F_ALL_INVLD, data->base + REG_MMU_INVALIDATE);
+ wmb(); /* Make sure the tlb flush all done */
+ }
+}
+
+static void mtk_iommu_tlb_flush_range_sync(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ struct mtk_iommu_data *data = cookie;
+ unsigned long flags;
+ int ret;
+ u32 tmp;
+
+ for_each_m4u(data) {
+ spin_lock_irqsave(&data->tlb_lock, flags);
+ writel_relaxed(F_INVLD_EN1 | F_INVLD_EN0,
+ data->base + data->plat_data->inv_sel_reg);
+
+ writel_relaxed(iova, data->base + REG_MMU_INVLD_START_A);
+ writel_relaxed(iova + size - 1,
+ data->base + REG_MMU_INVLD_END_A);
+ writel_relaxed(F_MMU_INV_RANGE,
+ data->base + REG_MMU_INVALIDATE);
+
+ /* tlb sync */
+ ret = readl_poll_timeout_atomic(data->base + REG_MMU_CPE_DONE,
+ tmp, tmp != 0, 10, 1000);
+ if (ret) {
+ dev_warn(data->dev,
+ "Partial TLB flush timed out, falling back to full flush\n");
+ mtk_iommu_tlb_flush_all(cookie);
+ }
+ /* Clear the CPE status */
+ writel_relaxed(0, data->base + REG_MMU_CPE_DONE);
+ spin_unlock_irqrestore(&data->tlb_lock, flags);
+ }
+}
+
+static void mtk_iommu_tlb_flush_page_nosync(struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t granule,
+ void *cookie)
+{
+ struct mtk_iommu_data *data = cookie;
+ struct iommu_domain *domain = &data->m4u_dom->domain;
+
+ iommu_iotlb_gather_add_page(domain, gather, iova, granule);
+}
+
+static const struct iommu_flush_ops mtk_iommu_flush_ops = {
+ .tlb_flush_all = mtk_iommu_tlb_flush_all,
+ .tlb_flush_walk = mtk_iommu_tlb_flush_range_sync,
+ .tlb_flush_leaf = mtk_iommu_tlb_flush_range_sync,
+ .tlb_add_page = mtk_iommu_tlb_flush_page_nosync,
+};
+
+static irqreturn_t mtk_iommu_isr(int irq, void *dev_id)
+{
+ struct mtk_iommu_data *data = dev_id;
+ struct mtk_iommu_domain *dom = data->m4u_dom;
+ u32 int_state, regval, fault_iova, fault_pa;
+ unsigned int fault_larb, fault_port, sub_comm = 0;
+ bool layer, write;
+
+ /* Read error info from registers */
+ int_state = readl_relaxed(data->base + REG_MMU_FAULT_ST1);
+ if (int_state & F_REG_MMU0_FAULT_MASK) {
+ regval = readl_relaxed(data->base + REG_MMU0_INT_ID);
+ fault_iova = readl_relaxed(data->base + REG_MMU0_FAULT_VA);
+ fault_pa = readl_relaxed(data->base + REG_MMU0_INVLD_PA);
+ } else {
+ regval = readl_relaxed(data->base + REG_MMU1_INT_ID);
+ fault_iova = readl_relaxed(data->base + REG_MMU1_FAULT_VA);
+ fault_pa = readl_relaxed(data->base + REG_MMU1_INVLD_PA);
+ }
+ layer = fault_iova & F_MMU_FAULT_VA_LAYER_BIT;
+ write = fault_iova & F_MMU_FAULT_VA_WRITE_BIT;
+ fault_port = F_MMU_INT_ID_PORT_ID(regval);
+ if (MTK_IOMMU_HAS_FLAG(data->plat_data, HAS_SUB_COMM)) {
+ fault_larb = F_MMU_INT_ID_COMM_ID(regval);
+ sub_comm = F_MMU_INT_ID_SUB_COMM_ID(regval);
+ } else {
+ fault_larb = F_MMU_INT_ID_LARB_ID(regval);
+ }
+ fault_larb = data->plat_data->larbid_remap[fault_larb][sub_comm];
+
+ if (report_iommu_fault(&dom->domain, data->dev, fault_iova,
+ write ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ)) {
+ dev_err_ratelimited(
+ data->dev,
+ "fault type=0x%x iova=0x%x pa=0x%x larb=%d port=%d layer=%d %s\n",
+ int_state, fault_iova, fault_pa, fault_larb, fault_port,
+ layer, write ? "write" : "read");
+ }
+
+ /* Interrupt clear */
+ regval = readl_relaxed(data->base + REG_MMU_INT_CONTROL0);
+ regval |= F_INT_CLR_BIT;
+ writel_relaxed(regval, data->base + REG_MMU_INT_CONTROL0);
+
+ mtk_iommu_tlb_flush_all(data);
+
+ return IRQ_HANDLED;
+}
+
+static void mtk_iommu_config(struct mtk_iommu_data *data,
+ struct device *dev, bool enable)
+{
+ struct mtk_smi_larb_iommu *larb_mmu;
+ unsigned int larbid, portid;
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ int i;
+
+ for (i = 0; i < fwspec->num_ids; ++i) {
+ larbid = MTK_M4U_TO_LARB(fwspec->ids[i]);
+ portid = MTK_M4U_TO_PORT(fwspec->ids[i]);
+ larb_mmu = &data->larb_imu[larbid];
+
+ dev_dbg(dev, "%s iommu port: %d\n",
+ enable ? "enable" : "disable", portid);
+
+ if (enable)
+ larb_mmu->mmu |= MTK_SMI_MMU_EN(portid);
+ else
+ larb_mmu->mmu &= ~MTK_SMI_MMU_EN(portid);
+ }
+}
+
+static int mtk_iommu_domain_finalise(struct mtk_iommu_domain *dom)
+{
+ struct mtk_iommu_data *data = mtk_iommu_get_m4u_data();
+
+ dom->cfg = (struct io_pgtable_cfg) {
+ .quirks = IO_PGTABLE_QUIRK_ARM_NS |
+ IO_PGTABLE_QUIRK_NO_PERMS |
+ IO_PGTABLE_QUIRK_TLBI_ON_MAP |
+ IO_PGTABLE_QUIRK_ARM_MTK_EXT,
+ .pgsize_bitmap = mtk_iommu_ops.pgsize_bitmap,
+ .ias = 32,
+ .oas = 34,
+ .tlb = &mtk_iommu_flush_ops,
+ .iommu_dev = data->dev,
+ };
+
+ dom->iop = alloc_io_pgtable_ops(ARM_V7S, &dom->cfg, data);
+ if (!dom->iop) {
+ dev_err(data->dev, "Failed to alloc io pgtable\n");
+ return -EINVAL;
+ }
+
+ /* Update our support page sizes bitmap */
+ dom->domain.pgsize_bitmap = dom->cfg.pgsize_bitmap;
+ return 0;
+}
+
+static struct iommu_domain *mtk_iommu_domain_alloc(unsigned type)
+{
+ struct mtk_iommu_domain *dom;
+
+ if (type != IOMMU_DOMAIN_DMA)
+ return NULL;
+
+ dom = kzalloc(sizeof(*dom), GFP_KERNEL);
+ if (!dom)
+ return NULL;
+
+ if (iommu_get_dma_cookie(&dom->domain))
+ goto free_dom;
+
+ if (mtk_iommu_domain_finalise(dom))
+ goto put_dma_cookie;
+
+ dom->domain.geometry.aperture_start = 0;
+ dom->domain.geometry.aperture_end = DMA_BIT_MASK(32);
+ dom->domain.geometry.force_aperture = true;
+
+ return &dom->domain;
+
+put_dma_cookie:
+ iommu_put_dma_cookie(&dom->domain);
+free_dom:
+ kfree(dom);
+ return NULL;
+}
+
+static void mtk_iommu_domain_free(struct iommu_domain *domain)
+{
+ struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+
+ free_io_pgtable_ops(dom->iop);
+ iommu_put_dma_cookie(domain);
+ kfree(to_mtk_domain(domain));
+}
+
+static int mtk_iommu_attach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
+ struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+
+ if (!data)
+ return -ENODEV;
+
+ /* Update the pgtable base address register of the M4U HW */
+ if (!data->m4u_dom) {
+ data->m4u_dom = dom;
+ writel(dom->cfg.arm_v7s_cfg.ttbr & MMU_PT_ADDR_MASK,
+ data->base + REG_MMU_PT_BASE_ADDR);
+ }
+
+ mtk_iommu_config(data, dev, true);
+ return 0;
+}
+
+static void mtk_iommu_detach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
+
+ if (!data)
+ return;
+
+ mtk_iommu_config(data, dev, false);
+}
+
+static int mtk_iommu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+ struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+ struct mtk_iommu_data *data = mtk_iommu_get_m4u_data();
+
+ /* The "4GB mode" M4U physically can not use the lower remap of Dram. */
+ if (data->enable_4GB)
+ paddr |= BIT_ULL(32);
+
+ /* Synchronize with the tlb_lock */
+ return dom->iop->map(dom->iop, iova, paddr, size, prot, gfp);
+}
+
+static size_t mtk_iommu_unmap(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ struct iommu_iotlb_gather *gather)
+{
+ struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+
+ return dom->iop->unmap(dom->iop, iova, size, gather);
+}
+
+static void mtk_iommu_flush_iotlb_all(struct iommu_domain *domain)
+{
+ mtk_iommu_tlb_flush_all(mtk_iommu_get_m4u_data());
+}
+
+static void mtk_iommu_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
+{
+ struct mtk_iommu_data *data = mtk_iommu_get_m4u_data();
+ size_t length = gather->end - gather->start + 1;
+
+ if (gather->start == ULONG_MAX)
+ return;
+
+ mtk_iommu_tlb_flush_range_sync(gather->start, length, gather->pgsize,
+ data);
+}
+
+static phys_addr_t mtk_iommu_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+ struct mtk_iommu_data *data = mtk_iommu_get_m4u_data();
+ phys_addr_t pa;
+
+ pa = dom->iop->iova_to_phys(dom->iop, iova);
+ if (data->enable_4GB && pa >= MTK_IOMMU_4GB_MODE_REMAP_BASE)
+ pa &= ~BIT_ULL(32);
+
+ return pa;
+}
+
+static struct iommu_device *mtk_iommu_probe_device(struct device *dev)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct mtk_iommu_data *data;
+
+ if (!fwspec || fwspec->ops != &mtk_iommu_ops)
+ return ERR_PTR(-ENODEV); /* Not a iommu client device */
+
+ data = dev_iommu_priv_get(dev);
+
+ return &data->iommu;
+}
+
+static void mtk_iommu_release_device(struct device *dev)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+ if (!fwspec || fwspec->ops != &mtk_iommu_ops)
+ return;
+
+ iommu_fwspec_free(dev);
+}
+
+static struct iommu_group *mtk_iommu_device_group(struct device *dev)
+{
+ struct mtk_iommu_data *data = mtk_iommu_get_m4u_data();
+
+ if (!data)
+ return ERR_PTR(-ENODEV);
+
+ /* All the client devices are in the same m4u iommu-group */
+ if (!data->m4u_group) {
+ data->m4u_group = iommu_group_alloc();
+ if (IS_ERR(data->m4u_group))
+ dev_err(dev, "Failed to allocate M4U IOMMU group\n");
+ } else {
+ iommu_group_ref_get(data->m4u_group);
+ }
+ return data->m4u_group;
+}
+
+static int mtk_iommu_of_xlate(struct device *dev, struct of_phandle_args *args)
+{
+ struct platform_device *m4updev;
+
+ if (args->args_count != 1) {
+ dev_err(dev, "invalid #iommu-cells(%d) property for IOMMU\n",
+ args->args_count);
+ return -EINVAL;
+ }
+
+ if (!dev_iommu_priv_get(dev)) {
+ /* Get the m4u device */
+ m4updev = of_find_device_by_node(args->np);
+ if (WARN_ON(!m4updev))
+ return -EINVAL;
+
+ dev_iommu_priv_set(dev, platform_get_drvdata(m4updev));
+ }
+
+ return iommu_fwspec_add_ids(dev, args->args, 1);
+}
+
+static const struct iommu_ops mtk_iommu_ops = {
+ .domain_alloc = mtk_iommu_domain_alloc,
+ .domain_free = mtk_iommu_domain_free,
+ .attach_dev = mtk_iommu_attach_device,
+ .detach_dev = mtk_iommu_detach_device,
+ .map = mtk_iommu_map,
+ .unmap = mtk_iommu_unmap,
+ .flush_iotlb_all = mtk_iommu_flush_iotlb_all,
+ .iotlb_sync = mtk_iommu_iotlb_sync,
+ .iova_to_phys = mtk_iommu_iova_to_phys,
+ .probe_device = mtk_iommu_probe_device,
+ .release_device = mtk_iommu_release_device,
+ .device_group = mtk_iommu_device_group,
+ .of_xlate = mtk_iommu_of_xlate,
+ .pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M,
+};
+
+static int mtk_iommu_hw_init(const struct mtk_iommu_data *data)
+{
+ u32 regval;
+ int ret;
+
+ ret = clk_prepare_enable(data->bclk);
+ if (ret) {
+ dev_err(data->dev, "Failed to enable iommu bclk(%d)\n", ret);
+ return ret;
+ }
+
+ if (data->plat_data->m4u_plat == M4U_MT8173) {
+ regval = F_MMU_PREFETCH_RT_REPLACE_MOD |
+ F_MMU_TF_PROT_TO_PROGRAM_ADDR_MT8173;
+ } else {
+ regval = readl_relaxed(data->base + REG_MMU_CTRL_REG);
+ regval |= F_MMU_TF_PROT_TO_PROGRAM_ADDR;
+ }
+ writel_relaxed(regval, data->base + REG_MMU_CTRL_REG);
+
+ regval = F_L2_MULIT_HIT_EN |
+ F_TABLE_WALK_FAULT_INT_EN |
+ F_PREETCH_FIFO_OVERFLOW_INT_EN |
+ F_MISS_FIFO_OVERFLOW_INT_EN |
+ F_PREFETCH_FIFO_ERR_INT_EN |
+ F_MISS_FIFO_ERR_INT_EN;
+ writel_relaxed(regval, data->base + REG_MMU_INT_CONTROL0);
+
+ regval = F_INT_TRANSLATION_FAULT |
+ F_INT_MAIN_MULTI_HIT_FAULT |
+ F_INT_INVALID_PA_FAULT |
+ F_INT_ENTRY_REPLACEMENT_FAULT |
+ F_INT_TLB_MISS_FAULT |
+ F_INT_MISS_TRANSACTION_FIFO_FAULT |
+ F_INT_PRETETCH_TRANSATION_FIFO_FAULT;
+ writel_relaxed(regval, data->base + REG_MMU_INT_MAIN_CONTROL);
+
+ if (MTK_IOMMU_HAS_FLAG(data->plat_data, HAS_LEGACY_IVRP_PADDR))
+ regval = (data->protect_base >> 1) | (data->enable_4GB << 31);
+ else
+ regval = lower_32_bits(data->protect_base) |
+ upper_32_bits(data->protect_base);
+ writel_relaxed(regval, data->base + REG_MMU_IVRP_PADDR);
+
+ if (data->enable_4GB &&
+ MTK_IOMMU_HAS_FLAG(data->plat_data, HAS_VLD_PA_RNG)) {
+ /*
+ * If 4GB mode is enabled, the validate PA range is from
+ * 0x1_0000_0000 to 0x1_ffff_ffff. here record bit[32:30].
+ */
+ regval = F_MMU_VLD_PA_RNG(7, 4);
+ writel_relaxed(regval, data->base + REG_MMU_VLD_PA_RNG);
+ }
+ writel_relaxed(0, data->base + REG_MMU_DCM_DIS);
+ if (MTK_IOMMU_HAS_FLAG(data->plat_data, WR_THROT_EN)) {
+ /* write command throttling mode */
+ regval = readl_relaxed(data->base + REG_MMU_WR_LEN_CTRL);
+ regval &= ~F_MMU_WR_THROT_DIS_MASK;
+ writel_relaxed(regval, data->base + REG_MMU_WR_LEN_CTRL);
+ }
+
+ if (MTK_IOMMU_HAS_FLAG(data->plat_data, RESET_AXI)) {
+ /* The register is called STANDARD_AXI_MODE in this case */
+ regval = 0;
+ } else {
+ regval = readl_relaxed(data->base + REG_MMU_MISC_CTRL);
+ regval &= ~F_MMU_STANDARD_AXI_MODE_MASK;
+ if (MTK_IOMMU_HAS_FLAG(data->plat_data, OUT_ORDER_WR_EN))
+ regval &= ~F_MMU_IN_ORDER_WR_EN_MASK;
+ }
+ writel_relaxed(regval, data->base + REG_MMU_MISC_CTRL);
+
+ if (devm_request_irq(data->dev, data->irq, mtk_iommu_isr, 0,
+ dev_name(data->dev), (void *)data)) {
+ writel_relaxed(0, data->base + REG_MMU_PT_BASE_ADDR);
+ clk_disable_unprepare(data->bclk);
+ dev_err(data->dev, "Failed @ IRQ-%d Request\n", data->irq);
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+static const struct component_master_ops mtk_iommu_com_ops = {
+ .bind = mtk_iommu_bind,
+ .unbind = mtk_iommu_unbind,
+};
+
+static int mtk_iommu_probe(struct platform_device *pdev)
+{
+ struct mtk_iommu_data *data;
+ struct device *dev = &pdev->dev;
+ struct resource *res;
+ resource_size_t ioaddr;
+ struct component_match *match = NULL;
+ struct regmap *infracfg;
+ void *protect;
+ int i, larb_nr, ret;
+ u32 val;
+ char *p;
+
+ data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ data->dev = dev;
+ data->plat_data = of_device_get_match_data(dev);
+
+ /* Protect memory. HW will access here while translation fault.*/
+ protect = devm_kzalloc(dev, MTK_PROTECT_PA_ALIGN * 2, GFP_KERNEL);
+ if (!protect)
+ return -ENOMEM;
+ data->protect_base = ALIGN(virt_to_phys(protect), MTK_PROTECT_PA_ALIGN);
+
+ if (MTK_IOMMU_HAS_FLAG(data->plat_data, HAS_4GB_MODE)) {
+ switch (data->plat_data->m4u_plat) {
+ case M4U_MT2712:
+ p = "mediatek,mt2712-infracfg";
+ break;
+ case M4U_MT8173:
+ p = "mediatek,mt8173-infracfg";
+ break;
+ default:
+ p = NULL;
+ }
+
+ infracfg = syscon_regmap_lookup_by_compatible(p);
+
+ if (IS_ERR(infracfg))
+ return PTR_ERR(infracfg);
+
+ ret = regmap_read(infracfg, REG_INFRA_MISC, &val);
+ if (ret)
+ return ret;
+ data->enable_4GB = !!(val & F_DDR_4GB_SUPPORT_EN);
+ }
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ data->base = devm_ioremap_resource(dev, res);
+ if (IS_ERR(data->base))
+ return PTR_ERR(data->base);
+ ioaddr = res->start;
+
+ data->irq = platform_get_irq(pdev, 0);
+ if (data->irq < 0)
+ return data->irq;
+
+ if (MTK_IOMMU_HAS_FLAG(data->plat_data, HAS_BCLK)) {
+ data->bclk = devm_clk_get(dev, "bclk");
+ if (IS_ERR(data->bclk))
+ return PTR_ERR(data->bclk);
+ }
+
+ larb_nr = of_count_phandle_with_args(dev->of_node,
+ "mediatek,larbs", NULL);
+ if (larb_nr < 0)
+ return larb_nr;
+
+ for (i = 0; i < larb_nr; i++) {
+ struct device_node *larbnode;
+ struct platform_device *plarbdev;
+ u32 id;
+
+ larbnode = of_parse_phandle(dev->of_node, "mediatek,larbs", i);
+ if (!larbnode)
+ return -EINVAL;
+
+ if (!of_device_is_available(larbnode)) {
+ of_node_put(larbnode);
+ continue;
+ }
+
+ ret = of_property_read_u32(larbnode, "mediatek,larb-id", &id);
+ if (ret)/* The id is consecutive if there is no this property */
+ id = i;
+
+ plarbdev = of_find_device_by_node(larbnode);
+ if (!plarbdev) {
+ of_node_put(larbnode);
+ return -EPROBE_DEFER;
+ }
+ data->larb_imu[id].dev = &plarbdev->dev;
+
+ component_match_add_release(dev, &match, release_of,
+ compare_of, larbnode);
+ }
+
+ platform_set_drvdata(pdev, data);
+
+ ret = mtk_iommu_hw_init(data);
+ if (ret)
+ return ret;
+
+ ret = iommu_device_sysfs_add(&data->iommu, dev, NULL,
+ "mtk-iommu.%pa", &ioaddr);
+ if (ret)
+ return ret;
+
+ iommu_device_set_ops(&data->iommu, &mtk_iommu_ops);
+ iommu_device_set_fwnode(&data->iommu, &pdev->dev.of_node->fwnode);
+
+ ret = iommu_device_register(&data->iommu);
+ if (ret)
+ return ret;
+
+ spin_lock_init(&data->tlb_lock);
+ list_add_tail(&data->list, &m4ulist);
+
+ if (!iommu_present(&platform_bus_type))
+ bus_set_iommu(&platform_bus_type, &mtk_iommu_ops);
+
+ return component_master_add_with_match(dev, &mtk_iommu_com_ops, match);
+}
+
+static int mtk_iommu_remove(struct platform_device *pdev)
+{
+ struct mtk_iommu_data *data = platform_get_drvdata(pdev);
+
+ iommu_device_sysfs_remove(&data->iommu);
+ iommu_device_unregister(&data->iommu);
+
+ list_del(&data->list);
+
+ clk_disable_unprepare(data->bclk);
+ devm_free_irq(&pdev->dev, data->irq, data);
+ component_master_del(&pdev->dev, &mtk_iommu_com_ops);
+ return 0;
+}
+
+static int __maybe_unused mtk_iommu_suspend(struct device *dev)
+{
+ struct mtk_iommu_data *data = dev_get_drvdata(dev);
+ struct mtk_iommu_suspend_reg *reg = &data->reg;
+ void __iomem *base = data->base;
+
+ reg->wr_len_ctrl = readl_relaxed(base + REG_MMU_WR_LEN_CTRL);
+ reg->misc_ctrl = readl_relaxed(base + REG_MMU_MISC_CTRL);
+ reg->dcm_dis = readl_relaxed(base + REG_MMU_DCM_DIS);
+ reg->ctrl_reg = readl_relaxed(base + REG_MMU_CTRL_REG);
+ reg->int_control0 = readl_relaxed(base + REG_MMU_INT_CONTROL0);
+ reg->int_main_control = readl_relaxed(base + REG_MMU_INT_MAIN_CONTROL);
+ reg->ivrp_paddr = readl_relaxed(base + REG_MMU_IVRP_PADDR);
+ reg->vld_pa_rng = readl_relaxed(base + REG_MMU_VLD_PA_RNG);
+ clk_disable_unprepare(data->bclk);
+ return 0;
+}
+
+static int __maybe_unused mtk_iommu_resume(struct device *dev)
+{
+ struct mtk_iommu_data *data = dev_get_drvdata(dev);
+ struct mtk_iommu_suspend_reg *reg = &data->reg;
+ struct mtk_iommu_domain *m4u_dom = data->m4u_dom;
+ void __iomem *base = data->base;
+ int ret;
+
+ ret = clk_prepare_enable(data->bclk);
+ if (ret) {
+ dev_err(data->dev, "Failed to enable clk(%d) in resume\n", ret);
+ return ret;
+ }
+ writel_relaxed(reg->wr_len_ctrl, base + REG_MMU_WR_LEN_CTRL);
+ writel_relaxed(reg->misc_ctrl, base + REG_MMU_MISC_CTRL);
+ writel_relaxed(reg->dcm_dis, base + REG_MMU_DCM_DIS);
+ writel_relaxed(reg->ctrl_reg, base + REG_MMU_CTRL_REG);
+ writel_relaxed(reg->int_control0, base + REG_MMU_INT_CONTROL0);
+ writel_relaxed(reg->int_main_control, base + REG_MMU_INT_MAIN_CONTROL);
+ writel_relaxed(reg->ivrp_paddr, base + REG_MMU_IVRP_PADDR);
+ writel_relaxed(reg->vld_pa_rng, base + REG_MMU_VLD_PA_RNG);
+ if (m4u_dom)
+ writel(m4u_dom->cfg.arm_v7s_cfg.ttbr & MMU_PT_ADDR_MASK,
+ base + REG_MMU_PT_BASE_ADDR);
+ return 0;
+}
+
+static const struct dev_pm_ops mtk_iommu_pm_ops = {
+ SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(mtk_iommu_suspend, mtk_iommu_resume)
+};
+
+static const struct mtk_iommu_plat_data mt2712_data = {
+ .m4u_plat = M4U_MT2712,
+ .flags = HAS_4GB_MODE | HAS_BCLK | HAS_VLD_PA_RNG,
+ .inv_sel_reg = REG_MMU_INV_SEL_GEN1,
+ .larbid_remap = {{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}},
+};
+
+static const struct mtk_iommu_plat_data mt6779_data = {
+ .m4u_plat = M4U_MT6779,
+ .flags = HAS_SUB_COMM | OUT_ORDER_WR_EN | WR_THROT_EN,
+ .inv_sel_reg = REG_MMU_INV_SEL_GEN2,
+ .larbid_remap = {{0}, {1}, {2}, {3}, {5}, {7, 8}, {10}, {9}},
+};
+
+static const struct mtk_iommu_plat_data mt8167_data = {
+ .m4u_plat = M4U_MT8167,
+ .flags = RESET_AXI | HAS_LEGACY_IVRP_PADDR,
+ .inv_sel_reg = REG_MMU_INV_SEL_GEN1,
+ .larbid_remap = {{0}, {1}, {2}}, /* Linear mapping. */
+};
+
+static const struct mtk_iommu_plat_data mt8173_data = {
+ .m4u_plat = M4U_MT8173,
+ .flags = HAS_4GB_MODE | HAS_BCLK | RESET_AXI |
+ HAS_LEGACY_IVRP_PADDR,
+ .inv_sel_reg = REG_MMU_INV_SEL_GEN1,
+ .larbid_remap = {{0}, {1}, {2}, {3}, {4}, {5}}, /* Linear mapping. */
+};
+
+static const struct mtk_iommu_plat_data mt8183_data = {
+ .m4u_plat = M4U_MT8183,
+ .flags = RESET_AXI,
+ .inv_sel_reg = REG_MMU_INV_SEL_GEN1,
+ .larbid_remap = {{0}, {4}, {5}, {6}, {7}, {2}, {3}, {1}},
+};
+
+static const struct of_device_id mtk_iommu_of_ids[] = {
+ { .compatible = "mediatek,mt2712-m4u", .data = &mt2712_data},
+ { .compatible = "mediatek,mt6779-m4u", .data = &mt6779_data},
+ { .compatible = "mediatek,mt8167-m4u", .data = &mt8167_data},
+ { .compatible = "mediatek,mt8173-m4u", .data = &mt8173_data},
+ { .compatible = "mediatek,mt8183-m4u", .data = &mt8183_data},
+ {}
+};
+
+static struct platform_driver mtk_iommu_driver = {
+ .probe = mtk_iommu_probe,
+ .remove = mtk_iommu_remove,
+ .driver = {
+ .name = "mtk-iommu",
+ .of_match_table = mtk_iommu_of_ids,
+ .pm = &mtk_iommu_pm_ops,
+ }
+};
+
+static int __init mtk_iommu_init(void)
+{
+ int ret;
+
+ ret = platform_driver_register(&mtk_iommu_driver);
+ if (ret != 0)
+ pr_err("Failed to register MTK IOMMU driver\n");
+
+ return ret;
+}
+
+subsys_initcall(mtk_iommu_init)
diff --git a/drivers/iommu/mtk_iommu.h b/drivers/iommu/mtk_iommu.h
new file mode 100644
index 000000000..df32b3e34
--- /dev/null
+++ b/drivers/iommu/mtk_iommu.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2015-2016 MediaTek Inc.
+ * Author: Honghui Zhang <honghui.zhang@mediatek.com>
+ */
+
+#ifndef _MTK_IOMMU_H_
+#define _MTK_IOMMU_H_
+
+#include <linux/clk.h>
+#include <linux/component.h>
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/io-pgtable.h>
+#include <linux/iommu.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/dma-mapping.h>
+#include <soc/mediatek/smi.h>
+
+#define MTK_LARB_COM_MAX 8
+#define MTK_LARB_SUBCOM_MAX 4
+
+struct mtk_iommu_suspend_reg {
+ union {
+ u32 standard_axi_mode;/* v1 */
+ u32 misc_ctrl;/* v2 */
+ };
+ u32 dcm_dis;
+ u32 ctrl_reg;
+ u32 int_control0;
+ u32 int_main_control;
+ u32 ivrp_paddr;
+ u32 vld_pa_rng;
+ u32 wr_len_ctrl;
+};
+
+enum mtk_iommu_plat {
+ M4U_MT2701,
+ M4U_MT2712,
+ M4U_MT6779,
+ M4U_MT8167,
+ M4U_MT8173,
+ M4U_MT8183,
+};
+
+struct mtk_iommu_plat_data {
+ enum mtk_iommu_plat m4u_plat;
+ u32 flags;
+ u32 inv_sel_reg;
+ unsigned char larbid_remap[MTK_LARB_COM_MAX][MTK_LARB_SUBCOM_MAX];
+};
+
+struct mtk_iommu_domain;
+
+struct mtk_iommu_data {
+ void __iomem *base;
+ int irq;
+ struct device *dev;
+ struct clk *bclk;
+ phys_addr_t protect_base; /* protect memory base */
+ struct mtk_iommu_suspend_reg reg;
+ struct mtk_iommu_domain *m4u_dom;
+ struct iommu_group *m4u_group;
+ bool enable_4GB;
+ spinlock_t tlb_lock; /* lock for tlb range flush */
+
+ struct iommu_device iommu;
+ const struct mtk_iommu_plat_data *plat_data;
+
+ struct dma_iommu_mapping *mapping; /* For mtk_iommu_v1.c */
+
+ struct list_head list;
+ struct mtk_smi_larb_iommu larb_imu[MTK_LARB_NR_MAX];
+};
+
+static inline int compare_of(struct device *dev, void *data)
+{
+ return dev->of_node == data;
+}
+
+static inline void release_of(struct device *dev, void *data)
+{
+ of_node_put(data);
+}
+
+static inline int mtk_iommu_bind(struct device *dev)
+{
+ struct mtk_iommu_data *data = dev_get_drvdata(dev);
+
+ return component_bind_all(dev, &data->larb_imu);
+}
+
+static inline void mtk_iommu_unbind(struct device *dev)
+{
+ struct mtk_iommu_data *data = dev_get_drvdata(dev);
+
+ component_unbind_all(dev, &data->larb_imu);
+}
+
+#endif
diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c
new file mode 100644
index 000000000..2abbdd71d
--- /dev/null
+++ b/drivers/iommu/mtk_iommu_v1.c
@@ -0,0 +1,715 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IOMMU API for MTK architected m4u v1 implementations
+ *
+ * Copyright (c) 2015-2016 MediaTek Inc.
+ * Author: Honghui Zhang <honghui.zhang@mediatek.com>
+ *
+ * Based on driver/iommu/mtk_iommu.c
+ */
+#include <linux/memblock.h>
+#include <linux/bug.h>
+#include <linux/clk.h>
+#include <linux/component.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/dma-iommu.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/list.h>
+#include <linux/of_address.h>
+#include <linux/of_iommu.h>
+#include <linux/of_irq.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <asm/barrier.h>
+#include <asm/dma-iommu.h>
+#include <linux/init.h>
+#include <dt-bindings/memory/mt2701-larb-port.h>
+#include <soc/mediatek/smi.h>
+#include "mtk_iommu.h"
+
+#define REG_MMU_PT_BASE_ADDR 0x000
+
+#define F_ALL_INVLD 0x2
+#define F_MMU_INV_RANGE 0x1
+#define F_INVLD_EN0 BIT(0)
+#define F_INVLD_EN1 BIT(1)
+
+#define F_MMU_FAULT_VA_MSK 0xfffff000
+#define MTK_PROTECT_PA_ALIGN 128
+
+#define REG_MMU_CTRL_REG 0x210
+#define F_MMU_CTRL_COHERENT_EN BIT(8)
+#define REG_MMU_IVRP_PADDR 0x214
+#define REG_MMU_INT_CONTROL 0x220
+#define F_INT_TRANSLATION_FAULT BIT(0)
+#define F_INT_MAIN_MULTI_HIT_FAULT BIT(1)
+#define F_INT_INVALID_PA_FAULT BIT(2)
+#define F_INT_ENTRY_REPLACEMENT_FAULT BIT(3)
+#define F_INT_TABLE_WALK_FAULT BIT(4)
+#define F_INT_TLB_MISS_FAULT BIT(5)
+#define F_INT_PFH_DMA_FIFO_OVERFLOW BIT(6)
+#define F_INT_MISS_DMA_FIFO_OVERFLOW BIT(7)
+
+#define F_MMU_TF_PROTECT_SEL(prot) (((prot) & 0x3) << 5)
+#define F_INT_CLR_BIT BIT(12)
+
+#define REG_MMU_FAULT_ST 0x224
+#define REG_MMU_FAULT_VA 0x228
+#define REG_MMU_INVLD_PA 0x22C
+#define REG_MMU_INT_ID 0x388
+#define REG_MMU_INVALIDATE 0x5c0
+#define REG_MMU_INVLD_START_A 0x5c4
+#define REG_MMU_INVLD_END_A 0x5c8
+
+#define REG_MMU_INV_SEL 0x5d8
+#define REG_MMU_STANDARD_AXI_MODE 0x5e8
+
+#define REG_MMU_DCM 0x5f0
+#define F_MMU_DCM_ON BIT(1)
+#define REG_MMU_CPE_DONE 0x60c
+#define F_DESC_VALID 0x2
+#define F_DESC_NONSEC BIT(3)
+#define MT2701_M4U_TF_LARB(TF) (6 - (((TF) >> 13) & 0x7))
+#define MT2701_M4U_TF_PORT(TF) (((TF) >> 8) & 0xF)
+/* MTK generation one iommu HW only support 4K size mapping */
+#define MT2701_IOMMU_PAGE_SHIFT 12
+#define MT2701_IOMMU_PAGE_SIZE (1UL << MT2701_IOMMU_PAGE_SHIFT)
+
+/*
+ * MTK m4u support 4GB iova address space, and only support 4K page
+ * mapping. So the pagetable size should be exactly as 4M.
+ */
+#define M2701_IOMMU_PGT_SIZE SZ_4M
+
+struct mtk_iommu_domain {
+ spinlock_t pgtlock; /* lock for page table */
+ struct iommu_domain domain;
+ u32 *pgt_va;
+ dma_addr_t pgt_pa;
+ struct mtk_iommu_data *data;
+};
+
+static struct mtk_iommu_domain *to_mtk_domain(struct iommu_domain *dom)
+{
+ return container_of(dom, struct mtk_iommu_domain, domain);
+}
+
+static const int mt2701_m4u_in_larb[] = {
+ LARB0_PORT_OFFSET, LARB1_PORT_OFFSET,
+ LARB2_PORT_OFFSET, LARB3_PORT_OFFSET
+};
+
+static inline int mt2701_m4u_to_larb(int id)
+{
+ int i;
+
+ for (i = ARRAY_SIZE(mt2701_m4u_in_larb) - 1; i >= 0; i--)
+ if ((id) >= mt2701_m4u_in_larb[i])
+ return i;
+
+ return 0;
+}
+
+static inline int mt2701_m4u_to_port(int id)
+{
+ int larb = mt2701_m4u_to_larb(id);
+
+ return id - mt2701_m4u_in_larb[larb];
+}
+
+static void mtk_iommu_tlb_flush_all(struct mtk_iommu_data *data)
+{
+ writel_relaxed(F_INVLD_EN1 | F_INVLD_EN0,
+ data->base + REG_MMU_INV_SEL);
+ writel_relaxed(F_ALL_INVLD, data->base + REG_MMU_INVALIDATE);
+ wmb(); /* Make sure the tlb flush all done */
+}
+
+static void mtk_iommu_tlb_flush_range(struct mtk_iommu_data *data,
+ unsigned long iova, size_t size)
+{
+ int ret;
+ u32 tmp;
+
+ writel_relaxed(F_INVLD_EN1 | F_INVLD_EN0,
+ data->base + REG_MMU_INV_SEL);
+ writel_relaxed(iova & F_MMU_FAULT_VA_MSK,
+ data->base + REG_MMU_INVLD_START_A);
+ writel_relaxed((iova + size - 1) & F_MMU_FAULT_VA_MSK,
+ data->base + REG_MMU_INVLD_END_A);
+ writel_relaxed(F_MMU_INV_RANGE, data->base + REG_MMU_INVALIDATE);
+
+ ret = readl_poll_timeout_atomic(data->base + REG_MMU_CPE_DONE,
+ tmp, tmp != 0, 10, 100000);
+ if (ret) {
+ dev_warn(data->dev,
+ "Partial TLB flush timed out, falling back to full flush\n");
+ mtk_iommu_tlb_flush_all(data);
+ }
+ /* Clear the CPE status */
+ writel_relaxed(0, data->base + REG_MMU_CPE_DONE);
+}
+
+static irqreturn_t mtk_iommu_isr(int irq, void *dev_id)
+{
+ struct mtk_iommu_data *data = dev_id;
+ struct mtk_iommu_domain *dom = data->m4u_dom;
+ u32 int_state, regval, fault_iova, fault_pa;
+ unsigned int fault_larb, fault_port;
+
+ /* Read error information from registers */
+ int_state = readl_relaxed(data->base + REG_MMU_FAULT_ST);
+ fault_iova = readl_relaxed(data->base + REG_MMU_FAULT_VA);
+
+ fault_iova &= F_MMU_FAULT_VA_MSK;
+ fault_pa = readl_relaxed(data->base + REG_MMU_INVLD_PA);
+ regval = readl_relaxed(data->base + REG_MMU_INT_ID);
+ fault_larb = MT2701_M4U_TF_LARB(regval);
+ fault_port = MT2701_M4U_TF_PORT(regval);
+
+ /*
+ * MTK v1 iommu HW could not determine whether the fault is read or
+ * write fault, report as read fault.
+ */
+ if (report_iommu_fault(&dom->domain, data->dev, fault_iova,
+ IOMMU_FAULT_READ))
+ dev_err_ratelimited(data->dev,
+ "fault type=0x%x iova=0x%x pa=0x%x larb=%d port=%d\n",
+ int_state, fault_iova, fault_pa,
+ fault_larb, fault_port);
+
+ /* Interrupt clear */
+ regval = readl_relaxed(data->base + REG_MMU_INT_CONTROL);
+ regval |= F_INT_CLR_BIT;
+ writel_relaxed(regval, data->base + REG_MMU_INT_CONTROL);
+
+ mtk_iommu_tlb_flush_all(data);
+
+ return IRQ_HANDLED;
+}
+
+static void mtk_iommu_config(struct mtk_iommu_data *data,
+ struct device *dev, bool enable)
+{
+ struct mtk_smi_larb_iommu *larb_mmu;
+ unsigned int larbid, portid;
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ int i;
+
+ for (i = 0; i < fwspec->num_ids; ++i) {
+ larbid = mt2701_m4u_to_larb(fwspec->ids[i]);
+ portid = mt2701_m4u_to_port(fwspec->ids[i]);
+ larb_mmu = &data->larb_imu[larbid];
+
+ dev_dbg(dev, "%s iommu port: %d\n",
+ enable ? "enable" : "disable", portid);
+
+ if (enable)
+ larb_mmu->mmu |= MTK_SMI_MMU_EN(portid);
+ else
+ larb_mmu->mmu &= ~MTK_SMI_MMU_EN(portid);
+ }
+}
+
+static int mtk_iommu_domain_finalise(struct mtk_iommu_data *data)
+{
+ struct mtk_iommu_domain *dom = data->m4u_dom;
+
+ spin_lock_init(&dom->pgtlock);
+
+ dom->pgt_va = dma_alloc_coherent(data->dev, M2701_IOMMU_PGT_SIZE,
+ &dom->pgt_pa, GFP_KERNEL);
+ if (!dom->pgt_va)
+ return -ENOMEM;
+
+ writel(dom->pgt_pa, data->base + REG_MMU_PT_BASE_ADDR);
+
+ dom->data = data;
+
+ return 0;
+}
+
+static struct iommu_domain *mtk_iommu_domain_alloc(unsigned type)
+{
+ struct mtk_iommu_domain *dom;
+
+ if (type != IOMMU_DOMAIN_UNMANAGED)
+ return NULL;
+
+ dom = kzalloc(sizeof(*dom), GFP_KERNEL);
+ if (!dom)
+ return NULL;
+
+ return &dom->domain;
+}
+
+static void mtk_iommu_domain_free(struct iommu_domain *domain)
+{
+ struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+ struct mtk_iommu_data *data = dom->data;
+
+ dma_free_coherent(data->dev, M2701_IOMMU_PGT_SIZE,
+ dom->pgt_va, dom->pgt_pa);
+ kfree(to_mtk_domain(domain));
+}
+
+static int mtk_iommu_attach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
+ struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+ struct dma_iommu_mapping *mtk_mapping;
+ int ret;
+
+ /* Only allow the domain created internally. */
+ mtk_mapping = data->mapping;
+ if (mtk_mapping->domain != domain)
+ return 0;
+
+ if (!data->m4u_dom) {
+ data->m4u_dom = dom;
+ ret = mtk_iommu_domain_finalise(data);
+ if (ret) {
+ data->m4u_dom = NULL;
+ return ret;
+ }
+ }
+
+ mtk_iommu_config(data, dev, true);
+ return 0;
+}
+
+static void mtk_iommu_detach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
+
+ mtk_iommu_config(data, dev, false);
+}
+
+static int mtk_iommu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+ struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+ unsigned int page_num = size >> MT2701_IOMMU_PAGE_SHIFT;
+ unsigned long flags;
+ unsigned int i;
+ u32 *pgt_base_iova = dom->pgt_va + (iova >> MT2701_IOMMU_PAGE_SHIFT);
+ u32 pabase = (u32)paddr;
+ int map_size = 0;
+
+ spin_lock_irqsave(&dom->pgtlock, flags);
+ for (i = 0; i < page_num; i++) {
+ if (pgt_base_iova[i]) {
+ memset(pgt_base_iova, 0, i * sizeof(u32));
+ break;
+ }
+ pgt_base_iova[i] = pabase | F_DESC_VALID | F_DESC_NONSEC;
+ pabase += MT2701_IOMMU_PAGE_SIZE;
+ map_size += MT2701_IOMMU_PAGE_SIZE;
+ }
+
+ spin_unlock_irqrestore(&dom->pgtlock, flags);
+
+ mtk_iommu_tlb_flush_range(dom->data, iova, size);
+
+ return map_size == size ? 0 : -EEXIST;
+}
+
+static size_t mtk_iommu_unmap(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ struct iommu_iotlb_gather *gather)
+{
+ struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+ unsigned long flags;
+ u32 *pgt_base_iova = dom->pgt_va + (iova >> MT2701_IOMMU_PAGE_SHIFT);
+ unsigned int page_num = size >> MT2701_IOMMU_PAGE_SHIFT;
+
+ spin_lock_irqsave(&dom->pgtlock, flags);
+ memset(pgt_base_iova, 0, page_num * sizeof(u32));
+ spin_unlock_irqrestore(&dom->pgtlock, flags);
+
+ mtk_iommu_tlb_flush_range(dom->data, iova, size);
+
+ return size;
+}
+
+static phys_addr_t mtk_iommu_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+ unsigned long flags;
+ phys_addr_t pa;
+
+ spin_lock_irqsave(&dom->pgtlock, flags);
+ pa = *(dom->pgt_va + (iova >> MT2701_IOMMU_PAGE_SHIFT));
+ pa = pa & (~(MT2701_IOMMU_PAGE_SIZE - 1));
+ spin_unlock_irqrestore(&dom->pgtlock, flags);
+
+ return pa;
+}
+
+static const struct iommu_ops mtk_iommu_ops;
+
+/*
+ * MTK generation one iommu HW only support one iommu domain, and all the client
+ * sharing the same iova address space.
+ */
+static int mtk_iommu_create_mapping(struct device *dev,
+ struct of_phandle_args *args)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct mtk_iommu_data *data;
+ struct platform_device *m4updev;
+ struct dma_iommu_mapping *mtk_mapping;
+ int ret;
+
+ if (args->args_count != 1) {
+ dev_err(dev, "invalid #iommu-cells(%d) property for IOMMU\n",
+ args->args_count);
+ return -EINVAL;
+ }
+
+ if (!fwspec) {
+ ret = iommu_fwspec_init(dev, &args->np->fwnode, &mtk_iommu_ops);
+ if (ret)
+ return ret;
+ fwspec = dev_iommu_fwspec_get(dev);
+ } else if (dev_iommu_fwspec_get(dev)->ops != &mtk_iommu_ops) {
+ return -EINVAL;
+ }
+
+ if (!dev_iommu_priv_get(dev)) {
+ /* Get the m4u device */
+ m4updev = of_find_device_by_node(args->np);
+ if (WARN_ON(!m4updev))
+ return -EINVAL;
+
+ dev_iommu_priv_set(dev, platform_get_drvdata(m4updev));
+ }
+
+ ret = iommu_fwspec_add_ids(dev, args->args, 1);
+ if (ret)
+ return ret;
+
+ data = dev_iommu_priv_get(dev);
+ mtk_mapping = data->mapping;
+ if (!mtk_mapping) {
+ /* MTK iommu support 4GB iova address space. */
+ mtk_mapping = arm_iommu_create_mapping(&platform_bus_type,
+ 0, 1ULL << 32);
+ if (IS_ERR(mtk_mapping))
+ return PTR_ERR(mtk_mapping);
+
+ data->mapping = mtk_mapping;
+ }
+
+ return 0;
+}
+
+static int mtk_iommu_def_domain_type(struct device *dev)
+{
+ return IOMMU_DOMAIN_UNMANAGED;
+}
+
+static struct iommu_device *mtk_iommu_probe_device(struct device *dev)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct of_phandle_args iommu_spec;
+ struct of_phandle_iterator it;
+ struct mtk_iommu_data *data;
+ int err;
+
+ of_for_each_phandle(&it, err, dev->of_node, "iommus",
+ "#iommu-cells", -1) {
+ int count = of_phandle_iterator_args(&it, iommu_spec.args,
+ MAX_PHANDLE_ARGS);
+ iommu_spec.np = of_node_get(it.node);
+ iommu_spec.args_count = count;
+
+ mtk_iommu_create_mapping(dev, &iommu_spec);
+
+ /* dev->iommu_fwspec might have changed */
+ fwspec = dev_iommu_fwspec_get(dev);
+
+ of_node_put(iommu_spec.np);
+ }
+
+ if (!fwspec || fwspec->ops != &mtk_iommu_ops)
+ return ERR_PTR(-ENODEV); /* Not a iommu client device */
+
+ data = dev_iommu_priv_get(dev);
+
+ return &data->iommu;
+}
+
+static void mtk_iommu_probe_finalize(struct device *dev)
+{
+ struct dma_iommu_mapping *mtk_mapping;
+ struct mtk_iommu_data *data;
+ int err;
+
+ data = dev_iommu_priv_get(dev);
+ mtk_mapping = data->mapping;
+
+ err = arm_iommu_attach_device(dev, mtk_mapping);
+ if (err)
+ dev_err(dev, "Can't create IOMMU mapping - DMA-OPS will not work\n");
+}
+
+static void mtk_iommu_release_device(struct device *dev)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+ if (!fwspec || fwspec->ops != &mtk_iommu_ops)
+ return;
+
+ iommu_fwspec_free(dev);
+}
+
+static int mtk_iommu_hw_init(const struct mtk_iommu_data *data)
+{
+ u32 regval;
+ int ret;
+
+ ret = clk_prepare_enable(data->bclk);
+ if (ret) {
+ dev_err(data->dev, "Failed to enable iommu bclk(%d)\n", ret);
+ return ret;
+ }
+
+ regval = F_MMU_CTRL_COHERENT_EN | F_MMU_TF_PROTECT_SEL(2);
+ writel_relaxed(regval, data->base + REG_MMU_CTRL_REG);
+
+ regval = F_INT_TRANSLATION_FAULT |
+ F_INT_MAIN_MULTI_HIT_FAULT |
+ F_INT_INVALID_PA_FAULT |
+ F_INT_ENTRY_REPLACEMENT_FAULT |
+ F_INT_TABLE_WALK_FAULT |
+ F_INT_TLB_MISS_FAULT |
+ F_INT_PFH_DMA_FIFO_OVERFLOW |
+ F_INT_MISS_DMA_FIFO_OVERFLOW;
+ writel_relaxed(regval, data->base + REG_MMU_INT_CONTROL);
+
+ /* protect memory,hw will write here while translation fault */
+ writel_relaxed(data->protect_base,
+ data->base + REG_MMU_IVRP_PADDR);
+
+ writel_relaxed(F_MMU_DCM_ON, data->base + REG_MMU_DCM);
+
+ if (devm_request_irq(data->dev, data->irq, mtk_iommu_isr, 0,
+ dev_name(data->dev), (void *)data)) {
+ writel_relaxed(0, data->base + REG_MMU_PT_BASE_ADDR);
+ clk_disable_unprepare(data->bclk);
+ dev_err(data->dev, "Failed @ IRQ-%d Request\n", data->irq);
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+static const struct iommu_ops mtk_iommu_ops = {
+ .domain_alloc = mtk_iommu_domain_alloc,
+ .domain_free = mtk_iommu_domain_free,
+ .attach_dev = mtk_iommu_attach_device,
+ .detach_dev = mtk_iommu_detach_device,
+ .map = mtk_iommu_map,
+ .unmap = mtk_iommu_unmap,
+ .iova_to_phys = mtk_iommu_iova_to_phys,
+ .probe_device = mtk_iommu_probe_device,
+ .probe_finalize = mtk_iommu_probe_finalize,
+ .release_device = mtk_iommu_release_device,
+ .def_domain_type = mtk_iommu_def_domain_type,
+ .device_group = generic_device_group,
+ .pgsize_bitmap = ~0UL << MT2701_IOMMU_PAGE_SHIFT,
+};
+
+static const struct of_device_id mtk_iommu_of_ids[] = {
+ { .compatible = "mediatek,mt2701-m4u", },
+ {}
+};
+
+static const struct component_master_ops mtk_iommu_com_ops = {
+ .bind = mtk_iommu_bind,
+ .unbind = mtk_iommu_unbind,
+};
+
+static int mtk_iommu_probe(struct platform_device *pdev)
+{
+ struct mtk_iommu_data *data;
+ struct device *dev = &pdev->dev;
+ struct resource *res;
+ struct component_match *match = NULL;
+ struct of_phandle_args larb_spec;
+ struct of_phandle_iterator it;
+ void *protect;
+ int larb_nr, ret, err;
+
+ data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->dev = dev;
+
+ /* Protect memory. HW will access here while translation fault.*/
+ protect = devm_kzalloc(dev, MTK_PROTECT_PA_ALIGN * 2,
+ GFP_KERNEL | GFP_DMA);
+ if (!protect)
+ return -ENOMEM;
+ data->protect_base = ALIGN(virt_to_phys(protect), MTK_PROTECT_PA_ALIGN);
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ data->base = devm_ioremap_resource(dev, res);
+ if (IS_ERR(data->base))
+ return PTR_ERR(data->base);
+
+ data->irq = platform_get_irq(pdev, 0);
+ if (data->irq < 0)
+ return data->irq;
+
+ data->bclk = devm_clk_get(dev, "bclk");
+ if (IS_ERR(data->bclk))
+ return PTR_ERR(data->bclk);
+
+ larb_nr = 0;
+ of_for_each_phandle(&it, err, dev->of_node,
+ "mediatek,larbs", NULL, 0) {
+ struct platform_device *plarbdev;
+ int count = of_phandle_iterator_args(&it, larb_spec.args,
+ MAX_PHANDLE_ARGS);
+
+ if (count)
+ continue;
+
+ larb_spec.np = of_node_get(it.node);
+ if (!of_device_is_available(larb_spec.np))
+ continue;
+
+ plarbdev = of_find_device_by_node(larb_spec.np);
+ if (!plarbdev) {
+ plarbdev = of_platform_device_create(
+ larb_spec.np, NULL,
+ platform_bus_type.dev_root);
+ if (!plarbdev) {
+ of_node_put(larb_spec.np);
+ return -EPROBE_DEFER;
+ }
+ }
+
+ data->larb_imu[larb_nr].dev = &plarbdev->dev;
+ component_match_add_release(dev, &match, release_of,
+ compare_of, larb_spec.np);
+ larb_nr++;
+ }
+
+ platform_set_drvdata(pdev, data);
+
+ ret = mtk_iommu_hw_init(data);
+ if (ret)
+ return ret;
+
+ ret = iommu_device_sysfs_add(&data->iommu, &pdev->dev, NULL,
+ dev_name(&pdev->dev));
+ if (ret)
+ goto out_clk_unprepare;
+
+ iommu_device_set_ops(&data->iommu, &mtk_iommu_ops);
+
+ ret = iommu_device_register(&data->iommu);
+ if (ret)
+ goto out_sysfs_remove;
+
+ if (!iommu_present(&platform_bus_type)) {
+ ret = bus_set_iommu(&platform_bus_type, &mtk_iommu_ops);
+ if (ret)
+ goto out_dev_unreg;
+ }
+
+ ret = component_master_add_with_match(dev, &mtk_iommu_com_ops, match);
+ if (ret)
+ goto out_bus_set_null;
+ return ret;
+
+out_bus_set_null:
+ bus_set_iommu(&platform_bus_type, NULL);
+out_dev_unreg:
+ iommu_device_unregister(&data->iommu);
+out_sysfs_remove:
+ iommu_device_sysfs_remove(&data->iommu);
+out_clk_unprepare:
+ clk_disable_unprepare(data->bclk);
+ return ret;
+}
+
+static int mtk_iommu_remove(struct platform_device *pdev)
+{
+ struct mtk_iommu_data *data = platform_get_drvdata(pdev);
+
+ iommu_device_sysfs_remove(&data->iommu);
+ iommu_device_unregister(&data->iommu);
+
+ if (iommu_present(&platform_bus_type))
+ bus_set_iommu(&platform_bus_type, NULL);
+
+ clk_disable_unprepare(data->bclk);
+ devm_free_irq(&pdev->dev, data->irq, data);
+ component_master_del(&pdev->dev, &mtk_iommu_com_ops);
+ return 0;
+}
+
+static int __maybe_unused mtk_iommu_suspend(struct device *dev)
+{
+ struct mtk_iommu_data *data = dev_get_drvdata(dev);
+ struct mtk_iommu_suspend_reg *reg = &data->reg;
+ void __iomem *base = data->base;
+
+ reg->standard_axi_mode = readl_relaxed(base +
+ REG_MMU_STANDARD_AXI_MODE);
+ reg->dcm_dis = readl_relaxed(base + REG_MMU_DCM);
+ reg->ctrl_reg = readl_relaxed(base + REG_MMU_CTRL_REG);
+ reg->int_control0 = readl_relaxed(base + REG_MMU_INT_CONTROL);
+ return 0;
+}
+
+static int __maybe_unused mtk_iommu_resume(struct device *dev)
+{
+ struct mtk_iommu_data *data = dev_get_drvdata(dev);
+ struct mtk_iommu_suspend_reg *reg = &data->reg;
+ void __iomem *base = data->base;
+
+ writel_relaxed(data->m4u_dom->pgt_pa, base + REG_MMU_PT_BASE_ADDR);
+ writel_relaxed(reg->standard_axi_mode,
+ base + REG_MMU_STANDARD_AXI_MODE);
+ writel_relaxed(reg->dcm_dis, base + REG_MMU_DCM);
+ writel_relaxed(reg->ctrl_reg, base + REG_MMU_CTRL_REG);
+ writel_relaxed(reg->int_control0, base + REG_MMU_INT_CONTROL);
+ writel_relaxed(data->protect_base, base + REG_MMU_IVRP_PADDR);
+ return 0;
+}
+
+static const struct dev_pm_ops mtk_iommu_pm_ops = {
+ SET_SYSTEM_SLEEP_PM_OPS(mtk_iommu_suspend, mtk_iommu_resume)
+};
+
+static struct platform_driver mtk_iommu_driver = {
+ .probe = mtk_iommu_probe,
+ .remove = mtk_iommu_remove,
+ .driver = {
+ .name = "mtk-iommu-v1",
+ .of_match_table = mtk_iommu_of_ids,
+ .pm = &mtk_iommu_pm_ops,
+ }
+};
+
+static int __init m4u_init(void)
+{
+ return platform_driver_register(&mtk_iommu_driver);
+}
+subsys_initcall(m4u_init);
diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
new file mode 100644
index 000000000..e505b9130
--- /dev/null
+++ b/drivers/iommu/of_iommu.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * OF helpers for IOMMU
+ *
+ * Copyright (c) 2012, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#include <linux/export.h>
+#include <linux/iommu.h>
+#include <linux/limits.h>
+#include <linux/module.h>
+#include <linux/msi.h>
+#include <linux/of.h>
+#include <linux/of_iommu.h>
+#include <linux/of_pci.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/fsl/mc.h>
+
+#define NO_IOMMU 1
+
+/**
+ * of_get_dma_window - Parse *dma-window property and returns 0 if found.
+ *
+ * @dn: device node
+ * @prefix: prefix for property name if any
+ * @index: index to start to parse
+ * @busno: Returns busno if supported. Otherwise pass NULL
+ * @addr: Returns address that DMA starts
+ * @size: Returns the range that DMA can handle
+ *
+ * This supports different formats flexibly. "prefix" can be
+ * configured if any. "busno" and "index" are optionally
+ * specified. Set 0(or NULL) if not used.
+ */
+int of_get_dma_window(struct device_node *dn, const char *prefix, int index,
+ unsigned long *busno, dma_addr_t *addr, size_t *size)
+{
+ const __be32 *dma_window, *end;
+ int bytes, cur_index = 0;
+ char propname[NAME_MAX], addrname[NAME_MAX], sizename[NAME_MAX];
+
+ if (!dn || !addr || !size)
+ return -EINVAL;
+
+ if (!prefix)
+ prefix = "";
+
+ snprintf(propname, sizeof(propname), "%sdma-window", prefix);
+ snprintf(addrname, sizeof(addrname), "%s#dma-address-cells", prefix);
+ snprintf(sizename, sizeof(sizename), "%s#dma-size-cells", prefix);
+
+ dma_window = of_get_property(dn, propname, &bytes);
+ if (!dma_window)
+ return -ENODEV;
+ end = dma_window + bytes / sizeof(*dma_window);
+
+ while (dma_window < end) {
+ u32 cells;
+ const void *prop;
+
+ /* busno is one cell if supported */
+ if (busno)
+ *busno = be32_to_cpup(dma_window++);
+
+ prop = of_get_property(dn, addrname, NULL);
+ if (!prop)
+ prop = of_get_property(dn, "#address-cells", NULL);
+
+ cells = prop ? be32_to_cpup(prop) : of_n_addr_cells(dn);
+ if (!cells)
+ return -EINVAL;
+ *addr = of_read_number(dma_window, cells);
+ dma_window += cells;
+
+ prop = of_get_property(dn, sizename, NULL);
+ cells = prop ? be32_to_cpup(prop) : of_n_size_cells(dn);
+ if (!cells)
+ return -EINVAL;
+ *size = of_read_number(dma_window, cells);
+ dma_window += cells;
+
+ if (cur_index++ == index)
+ break;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(of_get_dma_window);
+
+static int of_iommu_xlate(struct device *dev,
+ struct of_phandle_args *iommu_spec)
+{
+ const struct iommu_ops *ops;
+ struct fwnode_handle *fwnode = &iommu_spec->np->fwnode;
+ int ret;
+
+ ops = iommu_ops_from_fwnode(fwnode);
+ if ((ops && !ops->of_xlate) ||
+ !of_device_is_available(iommu_spec->np))
+ return NO_IOMMU;
+
+ ret = iommu_fwspec_init(dev, &iommu_spec->np->fwnode, ops);
+ if (ret)
+ return ret;
+ /*
+ * The otherwise-empty fwspec handily serves to indicate the specific
+ * IOMMU device we're waiting for, which will be useful if we ever get
+ * a proper probe-ordering dependency mechanism in future.
+ */
+ if (!ops)
+ return driver_deferred_probe_check_state(dev);
+
+ if (!try_module_get(ops->owner))
+ return -ENODEV;
+
+ ret = ops->of_xlate(dev, iommu_spec);
+ module_put(ops->owner);
+ return ret;
+}
+
+static int of_iommu_configure_dev_id(struct device_node *master_np,
+ struct device *dev,
+ const u32 *id)
+{
+ struct of_phandle_args iommu_spec = { .args_count = 1 };
+ int err;
+
+ err = of_map_id(master_np, *id, "iommu-map",
+ "iommu-map-mask", &iommu_spec.np,
+ iommu_spec.args);
+ if (err)
+ return err == -ENODEV ? NO_IOMMU : err;
+
+ err = of_iommu_xlate(dev, &iommu_spec);
+ of_node_put(iommu_spec.np);
+ return err;
+}
+
+static int of_iommu_configure_dev(struct device_node *master_np,
+ struct device *dev)
+{
+ struct of_phandle_args iommu_spec;
+ int err = NO_IOMMU, idx = 0;
+
+ while (!of_parse_phandle_with_args(master_np, "iommus",
+ "#iommu-cells",
+ idx, &iommu_spec)) {
+ err = of_iommu_xlate(dev, &iommu_spec);
+ of_node_put(iommu_spec.np);
+ idx++;
+ if (err)
+ break;
+ }
+
+ return err;
+}
+
+struct of_pci_iommu_alias_info {
+ struct device *dev;
+ struct device_node *np;
+};
+
+static int of_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data)
+{
+ struct of_pci_iommu_alias_info *info = data;
+ u32 input_id = alias;
+
+ return of_iommu_configure_dev_id(info->np, info->dev, &input_id);
+}
+
+static int of_iommu_configure_device(struct device_node *master_np,
+ struct device *dev, const u32 *id)
+{
+ return (id) ? of_iommu_configure_dev_id(master_np, dev, id) :
+ of_iommu_configure_dev(master_np, dev);
+}
+
+const struct iommu_ops *of_iommu_configure(struct device *dev,
+ struct device_node *master_np,
+ const u32 *id)
+{
+ const struct iommu_ops *ops = NULL;
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ int err = NO_IOMMU;
+
+ if (!master_np)
+ return NULL;
+
+ if (fwspec) {
+ if (fwspec->ops)
+ return fwspec->ops;
+
+ /* In the deferred case, start again from scratch */
+ iommu_fwspec_free(dev);
+ }
+
+ /*
+ * We don't currently walk up the tree looking for a parent IOMMU.
+ * See the `Notes:' section of
+ * Documentation/devicetree/bindings/iommu/iommu.txt
+ */
+ if (dev_is_pci(dev)) {
+ struct of_pci_iommu_alias_info info = {
+ .dev = dev,
+ .np = master_np,
+ };
+
+ pci_request_acs();
+ err = pci_for_each_dma_alias(to_pci_dev(dev),
+ of_pci_iommu_init, &info);
+ } else {
+ err = of_iommu_configure_device(master_np, dev, id);
+
+ fwspec = dev_iommu_fwspec_get(dev);
+ if (!err && fwspec)
+ of_property_read_u32(master_np, "pasid-num-bits",
+ &fwspec->num_pasid_bits);
+ }
+
+ /*
+ * Two success conditions can be represented by non-negative err here:
+ * >0 : there is no IOMMU, or one was unavailable for non-fatal reasons
+ * 0 : we found an IOMMU, and dev->fwspec is initialised appropriately
+ * <0 : any actual error
+ */
+ if (!err) {
+ /* The fwspec pointer changed, read it again */
+ fwspec = dev_iommu_fwspec_get(dev);
+ ops = fwspec->ops;
+ }
+ /*
+ * If we have reason to believe the IOMMU driver missed the initial
+ * probe for dev, replay it to get things in order.
+ */
+ if (!err && dev->bus && !device_iommu_mapped(dev))
+ err = iommu_probe_device(dev);
+
+ /* Ignore all other errors apart from EPROBE_DEFER */
+ if (err == -EPROBE_DEFER) {
+ ops = ERR_PTR(err);
+ } else if (err < 0) {
+ dev_dbg(dev, "Adding to IOMMU failed: %d\n", err);
+ ops = NULL;
+ }
+
+ return ops;
+}
diff --git a/drivers/iommu/omap-iommu-debug.c b/drivers/iommu/omap-iommu-debug.c
new file mode 100644
index 000000000..259f65291
--- /dev/null
+++ b/drivers/iommu/omap-iommu-debug.c
@@ -0,0 +1,274 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * omap iommu: debugfs interface
+ *
+ * Copyright (C) 2008-2009 Nokia Corporation
+ *
+ * Written by Hiroshi DOYU <Hiroshi.DOYU@nokia.com>
+ */
+
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/pm_runtime.h>
+#include <linux/debugfs.h>
+#include <linux/platform_data/iommu-omap.h>
+
+#include "omap-iopgtable.h"
+#include "omap-iommu.h"
+
+static DEFINE_MUTEX(iommu_debug_lock);
+
+static struct dentry *iommu_debug_root;
+
+static inline bool is_omap_iommu_detached(struct omap_iommu *obj)
+{
+ return !obj->domain;
+}
+
+#define pr_reg(name) \
+ do { \
+ ssize_t bytes; \
+ const char *str = "%20s: %08x\n"; \
+ const int maxcol = 32; \
+ if (len < maxcol) \
+ goto out; \
+ bytes = scnprintf(p, maxcol, str, __stringify(name), \
+ iommu_read_reg(obj, MMU_##name)); \
+ p += bytes; \
+ len -= bytes; \
+ } while (0)
+
+static ssize_t
+omap2_iommu_dump_ctx(struct omap_iommu *obj, char *buf, ssize_t len)
+{
+ char *p = buf;
+
+ pr_reg(REVISION);
+ pr_reg(IRQSTATUS);
+ pr_reg(IRQENABLE);
+ pr_reg(WALKING_ST);
+ pr_reg(CNTL);
+ pr_reg(FAULT_AD);
+ pr_reg(TTB);
+ pr_reg(LOCK);
+ pr_reg(LD_TLB);
+ pr_reg(CAM);
+ pr_reg(RAM);
+ pr_reg(GFLUSH);
+ pr_reg(FLUSH_ENTRY);
+ pr_reg(READ_CAM);
+ pr_reg(READ_RAM);
+ pr_reg(EMU_FAULT_AD);
+out:
+ return p - buf;
+}
+
+static ssize_t omap_iommu_dump_ctx(struct omap_iommu *obj, char *buf,
+ ssize_t bytes)
+{
+ if (!obj || !buf)
+ return -EINVAL;
+
+ pm_runtime_get_sync(obj->dev);
+
+ bytes = omap2_iommu_dump_ctx(obj, buf, bytes);
+
+ pm_runtime_put_sync(obj->dev);
+
+ return bytes;
+}
+
+static ssize_t debug_read_regs(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct omap_iommu *obj = file->private_data;
+ char *p, *buf;
+ ssize_t bytes;
+
+ if (is_omap_iommu_detached(obj))
+ return -EPERM;
+
+ buf = kmalloc(count, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ p = buf;
+
+ mutex_lock(&iommu_debug_lock);
+
+ bytes = omap_iommu_dump_ctx(obj, p, count);
+ if (bytes < 0)
+ goto err;
+ bytes = simple_read_from_buffer(userbuf, count, ppos, buf, bytes);
+
+err:
+ mutex_unlock(&iommu_debug_lock);
+ kfree(buf);
+
+ return bytes;
+}
+
+static int
+__dump_tlb_entries(struct omap_iommu *obj, struct cr_regs *crs, int num)
+{
+ int i;
+ struct iotlb_lock saved;
+ struct cr_regs tmp;
+ struct cr_regs *p = crs;
+
+ pm_runtime_get_sync(obj->dev);
+ iotlb_lock_get(obj, &saved);
+
+ for_each_iotlb_cr(obj, num, i, tmp) {
+ if (!iotlb_cr_valid(&tmp))
+ continue;
+ *p++ = tmp;
+ }
+
+ iotlb_lock_set(obj, &saved);
+ pm_runtime_put_sync(obj->dev);
+
+ return p - crs;
+}
+
+static ssize_t iotlb_dump_cr(struct omap_iommu *obj, struct cr_regs *cr,
+ struct seq_file *s)
+{
+ seq_printf(s, "%08x %08x %01x\n", cr->cam, cr->ram,
+ (cr->cam & MMU_CAM_P) ? 1 : 0);
+ return 0;
+}
+
+static size_t omap_dump_tlb_entries(struct omap_iommu *obj, struct seq_file *s)
+{
+ int i, num;
+ struct cr_regs *cr;
+
+ num = obj->nr_tlb_entries;
+
+ cr = kcalloc(num, sizeof(*cr), GFP_KERNEL);
+ if (!cr)
+ return 0;
+
+ num = __dump_tlb_entries(obj, cr, num);
+ for (i = 0; i < num; i++)
+ iotlb_dump_cr(obj, cr + i, s);
+ kfree(cr);
+
+ return 0;
+}
+
+static int tlb_show(struct seq_file *s, void *data)
+{
+ struct omap_iommu *obj = s->private;
+
+ if (is_omap_iommu_detached(obj))
+ return -EPERM;
+
+ mutex_lock(&iommu_debug_lock);
+
+ seq_printf(s, "%8s %8s\n", "cam:", "ram:");
+ seq_puts(s, "-----------------------------------------\n");
+ omap_dump_tlb_entries(obj, s);
+
+ mutex_unlock(&iommu_debug_lock);
+
+ return 0;
+}
+
+static void dump_ioptable(struct seq_file *s)
+{
+ int i, j;
+ u32 da;
+ u32 *iopgd, *iopte;
+ struct omap_iommu *obj = s->private;
+
+ spin_lock(&obj->page_table_lock);
+
+ iopgd = iopgd_offset(obj, 0);
+ for (i = 0; i < PTRS_PER_IOPGD; i++, iopgd++) {
+ if (!*iopgd)
+ continue;
+
+ if (!(*iopgd & IOPGD_TABLE)) {
+ da = i << IOPGD_SHIFT;
+ seq_printf(s, "1: 0x%08x 0x%08x\n", da, *iopgd);
+ continue;
+ }
+
+ iopte = iopte_offset(iopgd, 0);
+ for (j = 0; j < PTRS_PER_IOPTE; j++, iopte++) {
+ if (!*iopte)
+ continue;
+
+ da = (i << IOPGD_SHIFT) + (j << IOPTE_SHIFT);
+ seq_printf(s, "2: 0x%08x 0x%08x\n", da, *iopte);
+ }
+ }
+
+ spin_unlock(&obj->page_table_lock);
+}
+
+static int pagetable_show(struct seq_file *s, void *data)
+{
+ struct omap_iommu *obj = s->private;
+
+ if (is_omap_iommu_detached(obj))
+ return -EPERM;
+
+ mutex_lock(&iommu_debug_lock);
+
+ seq_printf(s, "L: %8s %8s\n", "da:", "pte:");
+ seq_puts(s, "--------------------------\n");
+ dump_ioptable(s);
+
+ mutex_unlock(&iommu_debug_lock);
+
+ return 0;
+}
+
+#define DEBUG_FOPS_RO(name) \
+ static const struct file_operations name##_fops = { \
+ .open = simple_open, \
+ .read = debug_read_##name, \
+ .llseek = generic_file_llseek, \
+ }
+
+DEBUG_FOPS_RO(regs);
+DEFINE_SHOW_ATTRIBUTE(tlb);
+DEFINE_SHOW_ATTRIBUTE(pagetable);
+
+void omap_iommu_debugfs_add(struct omap_iommu *obj)
+{
+ struct dentry *d;
+
+ if (!iommu_debug_root)
+ return;
+
+ d = debugfs_create_dir(obj->name, iommu_debug_root);
+ obj->debug_dir = d;
+
+ debugfs_create_u32("nr_tlb_entries", 0400, d, &obj->nr_tlb_entries);
+ debugfs_create_file("regs", 0400, d, obj, &regs_fops);
+ debugfs_create_file("tlb", 0400, d, obj, &tlb_fops);
+ debugfs_create_file("pagetable", 0400, d, obj, &pagetable_fops);
+}
+
+void omap_iommu_debugfs_remove(struct omap_iommu *obj)
+{
+ if (!obj->debug_dir)
+ return;
+
+ debugfs_remove_recursive(obj->debug_dir);
+}
+
+void __init omap_iommu_debugfs_init(void)
+{
+ iommu_debug_root = debugfs_create_dir("omap_iommu", NULL);
+}
+
+void __exit omap_iommu_debugfs_exit(void)
+{
+ debugfs_remove(iommu_debug_root);
+}
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
new file mode 100644
index 000000000..ff2c692c0
--- /dev/null
+++ b/drivers/iommu/omap-iommu.c
@@ -0,0 +1,1794 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * omap iommu: tlb and pagetable primitives
+ *
+ * Copyright (C) 2008-2010 Nokia Corporation
+ * Copyright (C) 2013-2017 Texas Instruments Incorporated - https://www.ti.com/
+ *
+ * Written by Hiroshi DOYU <Hiroshi.DOYU@nokia.com>,
+ * Paul Mundt and Toshihiro Kobayashi
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/ioport.h>
+#include <linux/platform_device.h>
+#include <linux/iommu.h>
+#include <linux/omap-iommu.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/io.h>
+#include <linux/pm_runtime.h>
+#include <linux/of.h>
+#include <linux/of_iommu.h>
+#include <linux/of_irq.h>
+#include <linux/of_platform.h>
+#include <linux/regmap.h>
+#include <linux/mfd/syscon.h>
+
+#include <linux/platform_data/iommu-omap.h>
+
+#include "omap-iopgtable.h"
+#include "omap-iommu.h"
+
+static const struct iommu_ops omap_iommu_ops;
+
+#define to_iommu(dev) ((struct omap_iommu *)dev_get_drvdata(dev))
+
+/* bitmap of the page sizes currently supported */
+#define OMAP_IOMMU_PGSIZES (SZ_4K | SZ_64K | SZ_1M | SZ_16M)
+
+#define MMU_LOCK_BASE_SHIFT 10
+#define MMU_LOCK_BASE_MASK (0x1f << MMU_LOCK_BASE_SHIFT)
+#define MMU_LOCK_BASE(x) \
+ ((x & MMU_LOCK_BASE_MASK) >> MMU_LOCK_BASE_SHIFT)
+
+#define MMU_LOCK_VICT_SHIFT 4
+#define MMU_LOCK_VICT_MASK (0x1f << MMU_LOCK_VICT_SHIFT)
+#define MMU_LOCK_VICT(x) \
+ ((x & MMU_LOCK_VICT_MASK) >> MMU_LOCK_VICT_SHIFT)
+
+static struct platform_driver omap_iommu_driver;
+static struct kmem_cache *iopte_cachep;
+
+/**
+ * to_omap_domain - Get struct omap_iommu_domain from generic iommu_domain
+ * @dom: generic iommu domain handle
+ **/
+static struct omap_iommu_domain *to_omap_domain(struct iommu_domain *dom)
+{
+ return container_of(dom, struct omap_iommu_domain, domain);
+}
+
+/**
+ * omap_iommu_save_ctx - Save registers for pm off-mode support
+ * @dev: client device
+ *
+ * This should be treated as an deprecated API. It is preserved only
+ * to maintain existing functionality for OMAP3 ISP driver.
+ **/
+void omap_iommu_save_ctx(struct device *dev)
+{
+ struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev);
+ struct omap_iommu *obj;
+ u32 *p;
+ int i;
+
+ if (!arch_data)
+ return;
+
+ while (arch_data->iommu_dev) {
+ obj = arch_data->iommu_dev;
+ p = obj->ctx;
+ for (i = 0; i < (MMU_REG_SIZE / sizeof(u32)); i++) {
+ p[i] = iommu_read_reg(obj, i * sizeof(u32));
+ dev_dbg(obj->dev, "%s\t[%02d] %08x\n", __func__, i,
+ p[i]);
+ }
+ arch_data++;
+ }
+}
+EXPORT_SYMBOL_GPL(omap_iommu_save_ctx);
+
+/**
+ * omap_iommu_restore_ctx - Restore registers for pm off-mode support
+ * @dev: client device
+ *
+ * This should be treated as an deprecated API. It is preserved only
+ * to maintain existing functionality for OMAP3 ISP driver.
+ **/
+void omap_iommu_restore_ctx(struct device *dev)
+{
+ struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev);
+ struct omap_iommu *obj;
+ u32 *p;
+ int i;
+
+ if (!arch_data)
+ return;
+
+ while (arch_data->iommu_dev) {
+ obj = arch_data->iommu_dev;
+ p = obj->ctx;
+ for (i = 0; i < (MMU_REG_SIZE / sizeof(u32)); i++) {
+ iommu_write_reg(obj, p[i], i * sizeof(u32));
+ dev_dbg(obj->dev, "%s\t[%02d] %08x\n", __func__, i,
+ p[i]);
+ }
+ arch_data++;
+ }
+}
+EXPORT_SYMBOL_GPL(omap_iommu_restore_ctx);
+
+static void dra7_cfg_dspsys_mmu(struct omap_iommu *obj, bool enable)
+{
+ u32 val, mask;
+
+ if (!obj->syscfg)
+ return;
+
+ mask = (1 << (obj->id * DSP_SYS_MMU_CONFIG_EN_SHIFT));
+ val = enable ? mask : 0;
+ regmap_update_bits(obj->syscfg, DSP_SYS_MMU_CONFIG, mask, val);
+}
+
+static void __iommu_set_twl(struct omap_iommu *obj, bool on)
+{
+ u32 l = iommu_read_reg(obj, MMU_CNTL);
+
+ if (on)
+ iommu_write_reg(obj, MMU_IRQ_TWL_MASK, MMU_IRQENABLE);
+ else
+ iommu_write_reg(obj, MMU_IRQ_TLB_MISS_MASK, MMU_IRQENABLE);
+
+ l &= ~MMU_CNTL_MASK;
+ if (on)
+ l |= (MMU_CNTL_MMU_EN | MMU_CNTL_TWL_EN);
+ else
+ l |= (MMU_CNTL_MMU_EN);
+
+ iommu_write_reg(obj, l, MMU_CNTL);
+}
+
+static int omap2_iommu_enable(struct omap_iommu *obj)
+{
+ u32 l, pa;
+
+ if (!obj->iopgd || !IS_ALIGNED((unsigned long)obj->iopgd, SZ_16K))
+ return -EINVAL;
+
+ pa = virt_to_phys(obj->iopgd);
+ if (!IS_ALIGNED(pa, SZ_16K))
+ return -EINVAL;
+
+ l = iommu_read_reg(obj, MMU_REVISION);
+ dev_info(obj->dev, "%s: version %d.%d\n", obj->name,
+ (l >> 4) & 0xf, l & 0xf);
+
+ iommu_write_reg(obj, pa, MMU_TTB);
+
+ dra7_cfg_dspsys_mmu(obj, true);
+
+ if (obj->has_bus_err_back)
+ iommu_write_reg(obj, MMU_GP_REG_BUS_ERR_BACK_EN, MMU_GP_REG);
+
+ __iommu_set_twl(obj, true);
+
+ return 0;
+}
+
+static void omap2_iommu_disable(struct omap_iommu *obj)
+{
+ u32 l = iommu_read_reg(obj, MMU_CNTL);
+
+ l &= ~MMU_CNTL_MASK;
+ iommu_write_reg(obj, l, MMU_CNTL);
+ dra7_cfg_dspsys_mmu(obj, false);
+
+ dev_dbg(obj->dev, "%s is shutting down\n", obj->name);
+}
+
+static int iommu_enable(struct omap_iommu *obj)
+{
+ int ret;
+
+ ret = pm_runtime_get_sync(obj->dev);
+ if (ret < 0)
+ pm_runtime_put_noidle(obj->dev);
+
+ return ret < 0 ? ret : 0;
+}
+
+static void iommu_disable(struct omap_iommu *obj)
+{
+ pm_runtime_put_sync(obj->dev);
+}
+
+/*
+ * TLB operations
+ */
+static u32 iotlb_cr_to_virt(struct cr_regs *cr)
+{
+ u32 page_size = cr->cam & MMU_CAM_PGSZ_MASK;
+ u32 mask = get_cam_va_mask(cr->cam & page_size);
+
+ return cr->cam & mask;
+}
+
+static u32 get_iopte_attr(struct iotlb_entry *e)
+{
+ u32 attr;
+
+ attr = e->mixed << 5;
+ attr |= e->endian;
+ attr |= e->elsz >> 3;
+ attr <<= (((e->pgsz == MMU_CAM_PGSZ_4K) ||
+ (e->pgsz == MMU_CAM_PGSZ_64K)) ? 0 : 6);
+ return attr;
+}
+
+static u32 iommu_report_fault(struct omap_iommu *obj, u32 *da)
+{
+ u32 status, fault_addr;
+
+ status = iommu_read_reg(obj, MMU_IRQSTATUS);
+ status &= MMU_IRQ_MASK;
+ if (!status) {
+ *da = 0;
+ return 0;
+ }
+
+ fault_addr = iommu_read_reg(obj, MMU_FAULT_AD);
+ *da = fault_addr;
+
+ iommu_write_reg(obj, status, MMU_IRQSTATUS);
+
+ return status;
+}
+
+void iotlb_lock_get(struct omap_iommu *obj, struct iotlb_lock *l)
+{
+ u32 val;
+
+ val = iommu_read_reg(obj, MMU_LOCK);
+
+ l->base = MMU_LOCK_BASE(val);
+ l->vict = MMU_LOCK_VICT(val);
+}
+
+void iotlb_lock_set(struct omap_iommu *obj, struct iotlb_lock *l)
+{
+ u32 val;
+
+ val = (l->base << MMU_LOCK_BASE_SHIFT);
+ val |= (l->vict << MMU_LOCK_VICT_SHIFT);
+
+ iommu_write_reg(obj, val, MMU_LOCK);
+}
+
+static void iotlb_read_cr(struct omap_iommu *obj, struct cr_regs *cr)
+{
+ cr->cam = iommu_read_reg(obj, MMU_READ_CAM);
+ cr->ram = iommu_read_reg(obj, MMU_READ_RAM);
+}
+
+static void iotlb_load_cr(struct omap_iommu *obj, struct cr_regs *cr)
+{
+ iommu_write_reg(obj, cr->cam | MMU_CAM_V, MMU_CAM);
+ iommu_write_reg(obj, cr->ram, MMU_RAM);
+
+ iommu_write_reg(obj, 1, MMU_FLUSH_ENTRY);
+ iommu_write_reg(obj, 1, MMU_LD_TLB);
+}
+
+/* only used in iotlb iteration for-loop */
+struct cr_regs __iotlb_read_cr(struct omap_iommu *obj, int n)
+{
+ struct cr_regs cr;
+ struct iotlb_lock l;
+
+ iotlb_lock_get(obj, &l);
+ l.vict = n;
+ iotlb_lock_set(obj, &l);
+ iotlb_read_cr(obj, &cr);
+
+ return cr;
+}
+
+#ifdef PREFETCH_IOTLB
+static struct cr_regs *iotlb_alloc_cr(struct omap_iommu *obj,
+ struct iotlb_entry *e)
+{
+ struct cr_regs *cr;
+
+ if (!e)
+ return NULL;
+
+ if (e->da & ~(get_cam_va_mask(e->pgsz))) {
+ dev_err(obj->dev, "%s:\twrong alignment: %08x\n", __func__,
+ e->da);
+ return ERR_PTR(-EINVAL);
+ }
+
+ cr = kmalloc(sizeof(*cr), GFP_KERNEL);
+ if (!cr)
+ return ERR_PTR(-ENOMEM);
+
+ cr->cam = (e->da & MMU_CAM_VATAG_MASK) | e->prsvd | e->pgsz | e->valid;
+ cr->ram = e->pa | e->endian | e->elsz | e->mixed;
+
+ return cr;
+}
+
+/**
+ * load_iotlb_entry - Set an iommu tlb entry
+ * @obj: target iommu
+ * @e: an iommu tlb entry info
+ **/
+static int load_iotlb_entry(struct omap_iommu *obj, struct iotlb_entry *e)
+{
+ int err = 0;
+ struct iotlb_lock l;
+ struct cr_regs *cr;
+
+ if (!obj || !obj->nr_tlb_entries || !e)
+ return -EINVAL;
+
+ pm_runtime_get_sync(obj->dev);
+
+ iotlb_lock_get(obj, &l);
+ if (l.base == obj->nr_tlb_entries) {
+ dev_warn(obj->dev, "%s: preserve entries full\n", __func__);
+ err = -EBUSY;
+ goto out;
+ }
+ if (!e->prsvd) {
+ int i;
+ struct cr_regs tmp;
+
+ for_each_iotlb_cr(obj, obj->nr_tlb_entries, i, tmp)
+ if (!iotlb_cr_valid(&tmp))
+ break;
+
+ if (i == obj->nr_tlb_entries) {
+ dev_dbg(obj->dev, "%s: full: no entry\n", __func__);
+ err = -EBUSY;
+ goto out;
+ }
+
+ iotlb_lock_get(obj, &l);
+ } else {
+ l.vict = l.base;
+ iotlb_lock_set(obj, &l);
+ }
+
+ cr = iotlb_alloc_cr(obj, e);
+ if (IS_ERR(cr)) {
+ pm_runtime_put_sync(obj->dev);
+ return PTR_ERR(cr);
+ }
+
+ iotlb_load_cr(obj, cr);
+ kfree(cr);
+
+ if (e->prsvd)
+ l.base++;
+ /* increment victim for next tlb load */
+ if (++l.vict == obj->nr_tlb_entries)
+ l.vict = l.base;
+ iotlb_lock_set(obj, &l);
+out:
+ pm_runtime_put_sync(obj->dev);
+ return err;
+}
+
+#else /* !PREFETCH_IOTLB */
+
+static int load_iotlb_entry(struct omap_iommu *obj, struct iotlb_entry *e)
+{
+ return 0;
+}
+
+#endif /* !PREFETCH_IOTLB */
+
+static int prefetch_iotlb_entry(struct omap_iommu *obj, struct iotlb_entry *e)
+{
+ return load_iotlb_entry(obj, e);
+}
+
+/**
+ * flush_iotlb_page - Clear an iommu tlb entry
+ * @obj: target iommu
+ * @da: iommu device virtual address
+ *
+ * Clear an iommu tlb entry which includes 'da' address.
+ **/
+static void flush_iotlb_page(struct omap_iommu *obj, u32 da)
+{
+ int i;
+ struct cr_regs cr;
+
+ pm_runtime_get_sync(obj->dev);
+
+ for_each_iotlb_cr(obj, obj->nr_tlb_entries, i, cr) {
+ u32 start;
+ size_t bytes;
+
+ if (!iotlb_cr_valid(&cr))
+ continue;
+
+ start = iotlb_cr_to_virt(&cr);
+ bytes = iopgsz_to_bytes(cr.cam & 3);
+
+ if ((start <= da) && (da < start + bytes)) {
+ dev_dbg(obj->dev, "%s: %08x<=%08x(%zx)\n",
+ __func__, start, da, bytes);
+ iotlb_load_cr(obj, &cr);
+ iommu_write_reg(obj, 1, MMU_FLUSH_ENTRY);
+ break;
+ }
+ }
+ pm_runtime_put_sync(obj->dev);
+
+ if (i == obj->nr_tlb_entries)
+ dev_dbg(obj->dev, "%s: no page for %08x\n", __func__, da);
+}
+
+/**
+ * flush_iotlb_all - Clear all iommu tlb entries
+ * @obj: target iommu
+ **/
+static void flush_iotlb_all(struct omap_iommu *obj)
+{
+ struct iotlb_lock l;
+
+ pm_runtime_get_sync(obj->dev);
+
+ l.base = 0;
+ l.vict = 0;
+ iotlb_lock_set(obj, &l);
+
+ iommu_write_reg(obj, 1, MMU_GFLUSH);
+
+ pm_runtime_put_sync(obj->dev);
+}
+
+/*
+ * H/W pagetable operations
+ */
+static void flush_iopte_range(struct device *dev, dma_addr_t dma,
+ unsigned long offset, int num_entries)
+{
+ size_t size = num_entries * sizeof(u32);
+
+ dma_sync_single_range_for_device(dev, dma, offset, size, DMA_TO_DEVICE);
+}
+
+static void iopte_free(struct omap_iommu *obj, u32 *iopte, bool dma_valid)
+{
+ dma_addr_t pt_dma;
+
+ /* Note: freed iopte's must be clean ready for re-use */
+ if (iopte) {
+ if (dma_valid) {
+ pt_dma = virt_to_phys(iopte);
+ dma_unmap_single(obj->dev, pt_dma, IOPTE_TABLE_SIZE,
+ DMA_TO_DEVICE);
+ }
+
+ kmem_cache_free(iopte_cachep, iopte);
+ }
+}
+
+static u32 *iopte_alloc(struct omap_iommu *obj, u32 *iopgd,
+ dma_addr_t *pt_dma, u32 da)
+{
+ u32 *iopte;
+ unsigned long offset = iopgd_index(da) * sizeof(da);
+
+ /* a table has already existed */
+ if (*iopgd)
+ goto pte_ready;
+
+ /*
+ * do the allocation outside the page table lock
+ */
+ spin_unlock(&obj->page_table_lock);
+ iopte = kmem_cache_zalloc(iopte_cachep, GFP_KERNEL);
+ spin_lock(&obj->page_table_lock);
+
+ if (!*iopgd) {
+ if (!iopte)
+ return ERR_PTR(-ENOMEM);
+
+ *pt_dma = dma_map_single(obj->dev, iopte, IOPTE_TABLE_SIZE,
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(obj->dev, *pt_dma)) {
+ dev_err(obj->dev, "DMA map error for L2 table\n");
+ iopte_free(obj, iopte, false);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /*
+ * we rely on dma address and the physical address to be
+ * the same for mapping the L2 table
+ */
+ if (WARN_ON(*pt_dma != virt_to_phys(iopte))) {
+ dev_err(obj->dev, "DMA translation error for L2 table\n");
+ dma_unmap_single(obj->dev, *pt_dma, IOPTE_TABLE_SIZE,
+ DMA_TO_DEVICE);
+ iopte_free(obj, iopte, false);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ *iopgd = virt_to_phys(iopte) | IOPGD_TABLE;
+
+ flush_iopte_range(obj->dev, obj->pd_dma, offset, 1);
+ dev_vdbg(obj->dev, "%s: a new pte:%p\n", __func__, iopte);
+ } else {
+ /* We raced, free the reduniovant table */
+ iopte_free(obj, iopte, false);
+ }
+
+pte_ready:
+ iopte = iopte_offset(iopgd, da);
+ *pt_dma = iopgd_page_paddr(iopgd);
+ dev_vdbg(obj->dev,
+ "%s: da:%08x pgd:%p *pgd:%08x pte:%p *pte:%08x\n",
+ __func__, da, iopgd, *iopgd, iopte, *iopte);
+
+ return iopte;
+}
+
+static int iopgd_alloc_section(struct omap_iommu *obj, u32 da, u32 pa, u32 prot)
+{
+ u32 *iopgd = iopgd_offset(obj, da);
+ unsigned long offset = iopgd_index(da) * sizeof(da);
+
+ if ((da | pa) & ~IOSECTION_MASK) {
+ dev_err(obj->dev, "%s: %08x:%08x should aligned on %08lx\n",
+ __func__, da, pa, IOSECTION_SIZE);
+ return -EINVAL;
+ }
+
+ *iopgd = (pa & IOSECTION_MASK) | prot | IOPGD_SECTION;
+ flush_iopte_range(obj->dev, obj->pd_dma, offset, 1);
+ return 0;
+}
+
+static int iopgd_alloc_super(struct omap_iommu *obj, u32 da, u32 pa, u32 prot)
+{
+ u32 *iopgd = iopgd_offset(obj, da);
+ unsigned long offset = iopgd_index(da) * sizeof(da);
+ int i;
+
+ if ((da | pa) & ~IOSUPER_MASK) {
+ dev_err(obj->dev, "%s: %08x:%08x should aligned on %08lx\n",
+ __func__, da, pa, IOSUPER_SIZE);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < 16; i++)
+ *(iopgd + i) = (pa & IOSUPER_MASK) | prot | IOPGD_SUPER;
+ flush_iopte_range(obj->dev, obj->pd_dma, offset, 16);
+ return 0;
+}
+
+static int iopte_alloc_page(struct omap_iommu *obj, u32 da, u32 pa, u32 prot)
+{
+ u32 *iopgd = iopgd_offset(obj, da);
+ dma_addr_t pt_dma;
+ u32 *iopte = iopte_alloc(obj, iopgd, &pt_dma, da);
+ unsigned long offset = iopte_index(da) * sizeof(da);
+
+ if (IS_ERR(iopte))
+ return PTR_ERR(iopte);
+
+ *iopte = (pa & IOPAGE_MASK) | prot | IOPTE_SMALL;
+ flush_iopte_range(obj->dev, pt_dma, offset, 1);
+
+ dev_vdbg(obj->dev, "%s: da:%08x pa:%08x pte:%p *pte:%08x\n",
+ __func__, da, pa, iopte, *iopte);
+
+ return 0;
+}
+
+static int iopte_alloc_large(struct omap_iommu *obj, u32 da, u32 pa, u32 prot)
+{
+ u32 *iopgd = iopgd_offset(obj, da);
+ dma_addr_t pt_dma;
+ u32 *iopte = iopte_alloc(obj, iopgd, &pt_dma, da);
+ unsigned long offset = iopte_index(da) * sizeof(da);
+ int i;
+
+ if ((da | pa) & ~IOLARGE_MASK) {
+ dev_err(obj->dev, "%s: %08x:%08x should aligned on %08lx\n",
+ __func__, da, pa, IOLARGE_SIZE);
+ return -EINVAL;
+ }
+
+ if (IS_ERR(iopte))
+ return PTR_ERR(iopte);
+
+ for (i = 0; i < 16; i++)
+ *(iopte + i) = (pa & IOLARGE_MASK) | prot | IOPTE_LARGE;
+ flush_iopte_range(obj->dev, pt_dma, offset, 16);
+ return 0;
+}
+
+static int
+iopgtable_store_entry_core(struct omap_iommu *obj, struct iotlb_entry *e)
+{
+ int (*fn)(struct omap_iommu *, u32, u32, u32);
+ u32 prot;
+ int err;
+
+ if (!obj || !e)
+ return -EINVAL;
+
+ switch (e->pgsz) {
+ case MMU_CAM_PGSZ_16M:
+ fn = iopgd_alloc_super;
+ break;
+ case MMU_CAM_PGSZ_1M:
+ fn = iopgd_alloc_section;
+ break;
+ case MMU_CAM_PGSZ_64K:
+ fn = iopte_alloc_large;
+ break;
+ case MMU_CAM_PGSZ_4K:
+ fn = iopte_alloc_page;
+ break;
+ default:
+ fn = NULL;
+ break;
+ }
+
+ if (WARN_ON(!fn))
+ return -EINVAL;
+
+ prot = get_iopte_attr(e);
+
+ spin_lock(&obj->page_table_lock);
+ err = fn(obj, e->da, e->pa, prot);
+ spin_unlock(&obj->page_table_lock);
+
+ return err;
+}
+
+/**
+ * omap_iopgtable_store_entry - Make an iommu pte entry
+ * @obj: target iommu
+ * @e: an iommu tlb entry info
+ **/
+static int
+omap_iopgtable_store_entry(struct omap_iommu *obj, struct iotlb_entry *e)
+{
+ int err;
+
+ flush_iotlb_page(obj, e->da);
+ err = iopgtable_store_entry_core(obj, e);
+ if (!err)
+ prefetch_iotlb_entry(obj, e);
+ return err;
+}
+
+/**
+ * iopgtable_lookup_entry - Lookup an iommu pte entry
+ * @obj: target iommu
+ * @da: iommu device virtual address
+ * @ppgd: iommu pgd entry pointer to be returned
+ * @ppte: iommu pte entry pointer to be returned
+ **/
+static void
+iopgtable_lookup_entry(struct omap_iommu *obj, u32 da, u32 **ppgd, u32 **ppte)
+{
+ u32 *iopgd, *iopte = NULL;
+
+ iopgd = iopgd_offset(obj, da);
+ if (!*iopgd)
+ goto out;
+
+ if (iopgd_is_table(*iopgd))
+ iopte = iopte_offset(iopgd, da);
+out:
+ *ppgd = iopgd;
+ *ppte = iopte;
+}
+
+static size_t iopgtable_clear_entry_core(struct omap_iommu *obj, u32 da)
+{
+ size_t bytes;
+ u32 *iopgd = iopgd_offset(obj, da);
+ int nent = 1;
+ dma_addr_t pt_dma;
+ unsigned long pd_offset = iopgd_index(da) * sizeof(da);
+ unsigned long pt_offset = iopte_index(da) * sizeof(da);
+
+ if (!*iopgd)
+ return 0;
+
+ if (iopgd_is_table(*iopgd)) {
+ int i;
+ u32 *iopte = iopte_offset(iopgd, da);
+
+ bytes = IOPTE_SIZE;
+ if (*iopte & IOPTE_LARGE) {
+ nent *= 16;
+ /* rewind to the 1st entry */
+ iopte = iopte_offset(iopgd, (da & IOLARGE_MASK));
+ }
+ bytes *= nent;
+ memset(iopte, 0, nent * sizeof(*iopte));
+ pt_dma = iopgd_page_paddr(iopgd);
+ flush_iopte_range(obj->dev, pt_dma, pt_offset, nent);
+
+ /*
+ * do table walk to check if this table is necessary or not
+ */
+ iopte = iopte_offset(iopgd, 0);
+ for (i = 0; i < PTRS_PER_IOPTE; i++)
+ if (iopte[i])
+ goto out;
+
+ iopte_free(obj, iopte, true);
+ nent = 1; /* for the next L1 entry */
+ } else {
+ bytes = IOPGD_SIZE;
+ if ((*iopgd & IOPGD_SUPER) == IOPGD_SUPER) {
+ nent *= 16;
+ /* rewind to the 1st entry */
+ iopgd = iopgd_offset(obj, (da & IOSUPER_MASK));
+ }
+ bytes *= nent;
+ }
+ memset(iopgd, 0, nent * sizeof(*iopgd));
+ flush_iopte_range(obj->dev, obj->pd_dma, pd_offset, nent);
+out:
+ return bytes;
+}
+
+/**
+ * iopgtable_clear_entry - Remove an iommu pte entry
+ * @obj: target iommu
+ * @da: iommu device virtual address
+ **/
+static size_t iopgtable_clear_entry(struct omap_iommu *obj, u32 da)
+{
+ size_t bytes;
+
+ spin_lock(&obj->page_table_lock);
+
+ bytes = iopgtable_clear_entry_core(obj, da);
+ flush_iotlb_page(obj, da);
+
+ spin_unlock(&obj->page_table_lock);
+
+ return bytes;
+}
+
+static void iopgtable_clear_entry_all(struct omap_iommu *obj)
+{
+ unsigned long offset;
+ int i;
+
+ spin_lock(&obj->page_table_lock);
+
+ for (i = 0; i < PTRS_PER_IOPGD; i++) {
+ u32 da;
+ u32 *iopgd;
+
+ da = i << IOPGD_SHIFT;
+ iopgd = iopgd_offset(obj, da);
+ offset = iopgd_index(da) * sizeof(da);
+
+ if (!*iopgd)
+ continue;
+
+ if (iopgd_is_table(*iopgd))
+ iopte_free(obj, iopte_offset(iopgd, 0), true);
+
+ *iopgd = 0;
+ flush_iopte_range(obj->dev, obj->pd_dma, offset, 1);
+ }
+
+ flush_iotlb_all(obj);
+
+ spin_unlock(&obj->page_table_lock);
+}
+
+/*
+ * Device IOMMU generic operations
+ */
+static irqreturn_t iommu_fault_handler(int irq, void *data)
+{
+ u32 da, errs;
+ u32 *iopgd, *iopte;
+ struct omap_iommu *obj = data;
+ struct iommu_domain *domain = obj->domain;
+ struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
+
+ if (!omap_domain->dev)
+ return IRQ_NONE;
+
+ errs = iommu_report_fault(obj, &da);
+ if (errs == 0)
+ return IRQ_HANDLED;
+
+ /* Fault callback or TLB/PTE Dynamic loading */
+ if (!report_iommu_fault(domain, obj->dev, da, 0))
+ return IRQ_HANDLED;
+
+ iommu_write_reg(obj, 0, MMU_IRQENABLE);
+
+ iopgd = iopgd_offset(obj, da);
+
+ if (!iopgd_is_table(*iopgd)) {
+ dev_err(obj->dev, "%s: errs:0x%08x da:0x%08x pgd:0x%p *pgd:px%08x\n",
+ obj->name, errs, da, iopgd, *iopgd);
+ return IRQ_NONE;
+ }
+
+ iopte = iopte_offset(iopgd, da);
+
+ dev_err(obj->dev, "%s: errs:0x%08x da:0x%08x pgd:0x%p *pgd:0x%08x pte:0x%p *pte:0x%08x\n",
+ obj->name, errs, da, iopgd, *iopgd, iopte, *iopte);
+
+ return IRQ_NONE;
+}
+
+/**
+ * omap_iommu_attach() - attach iommu device to an iommu domain
+ * @obj: target omap iommu device
+ * @iopgd: page table
+ **/
+static int omap_iommu_attach(struct omap_iommu *obj, u32 *iopgd)
+{
+ int err;
+
+ spin_lock(&obj->iommu_lock);
+
+ obj->pd_dma = dma_map_single(obj->dev, iopgd, IOPGD_TABLE_SIZE,
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(obj->dev, obj->pd_dma)) {
+ dev_err(obj->dev, "DMA map error for L1 table\n");
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ obj->iopgd = iopgd;
+ err = iommu_enable(obj);
+ if (err)
+ goto out_err;
+ flush_iotlb_all(obj);
+
+ spin_unlock(&obj->iommu_lock);
+
+ dev_dbg(obj->dev, "%s: %s\n", __func__, obj->name);
+
+ return 0;
+
+out_err:
+ spin_unlock(&obj->iommu_lock);
+
+ return err;
+}
+
+/**
+ * omap_iommu_detach - release iommu device
+ * @obj: target iommu
+ **/
+static void omap_iommu_detach(struct omap_iommu *obj)
+{
+ if (!obj || IS_ERR(obj))
+ return;
+
+ spin_lock(&obj->iommu_lock);
+
+ dma_unmap_single(obj->dev, obj->pd_dma, IOPGD_TABLE_SIZE,
+ DMA_TO_DEVICE);
+ obj->pd_dma = 0;
+ obj->iopgd = NULL;
+ iommu_disable(obj);
+
+ spin_unlock(&obj->iommu_lock);
+
+ dev_dbg(obj->dev, "%s: %s\n", __func__, obj->name);
+}
+
+static void omap_iommu_save_tlb_entries(struct omap_iommu *obj)
+{
+ struct iotlb_lock lock;
+ struct cr_regs cr;
+ struct cr_regs *tmp;
+ int i;
+
+ /* check if there are any locked tlbs to save */
+ iotlb_lock_get(obj, &lock);
+ obj->num_cr_ctx = lock.base;
+ if (!obj->num_cr_ctx)
+ return;
+
+ tmp = obj->cr_ctx;
+ for_each_iotlb_cr(obj, obj->num_cr_ctx, i, cr)
+ * tmp++ = cr;
+}
+
+static void omap_iommu_restore_tlb_entries(struct omap_iommu *obj)
+{
+ struct iotlb_lock l;
+ struct cr_regs *tmp;
+ int i;
+
+ /* no locked tlbs to restore */
+ if (!obj->num_cr_ctx)
+ return;
+
+ l.base = 0;
+ tmp = obj->cr_ctx;
+ for (i = 0; i < obj->num_cr_ctx; i++, tmp++) {
+ l.vict = i;
+ iotlb_lock_set(obj, &l);
+ iotlb_load_cr(obj, tmp);
+ }
+ l.base = obj->num_cr_ctx;
+ l.vict = i;
+ iotlb_lock_set(obj, &l);
+}
+
+/**
+ * omap_iommu_domain_deactivate - deactivate attached iommu devices
+ * @domain: iommu domain attached to the target iommu device
+ *
+ * This API allows the client devices of IOMMU devices to suspend
+ * the IOMMUs they control at runtime, after they are idled and
+ * suspended all activity. System Suspend will leverage the PM
+ * driver late callbacks.
+ **/
+int omap_iommu_domain_deactivate(struct iommu_domain *domain)
+{
+ struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
+ struct omap_iommu_device *iommu;
+ struct omap_iommu *oiommu;
+ int i;
+
+ if (!omap_domain->dev)
+ return 0;
+
+ iommu = omap_domain->iommus;
+ iommu += (omap_domain->num_iommus - 1);
+ for (i = 0; i < omap_domain->num_iommus; i++, iommu--) {
+ oiommu = iommu->iommu_dev;
+ pm_runtime_put_sync(oiommu->dev);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(omap_iommu_domain_deactivate);
+
+/**
+ * omap_iommu_domain_activate - activate attached iommu devices
+ * @domain: iommu domain attached to the target iommu device
+ *
+ * This API allows the client devices of IOMMU devices to resume the
+ * IOMMUs they control at runtime, before they can resume operations.
+ * System Resume will leverage the PM driver late callbacks.
+ **/
+int omap_iommu_domain_activate(struct iommu_domain *domain)
+{
+ struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
+ struct omap_iommu_device *iommu;
+ struct omap_iommu *oiommu;
+ int i;
+
+ if (!omap_domain->dev)
+ return 0;
+
+ iommu = omap_domain->iommus;
+ for (i = 0; i < omap_domain->num_iommus; i++, iommu++) {
+ oiommu = iommu->iommu_dev;
+ pm_runtime_get_sync(oiommu->dev);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(omap_iommu_domain_activate);
+
+/**
+ * omap_iommu_runtime_suspend - disable an iommu device
+ * @dev: iommu device
+ *
+ * This function performs all that is necessary to disable an
+ * IOMMU device, either during final detachment from a client
+ * device, or during system/runtime suspend of the device. This
+ * includes programming all the appropriate IOMMU registers, and
+ * managing the associated omap_hwmod's state and the device's
+ * reset line. This function also saves the context of any
+ * locked TLBs if suspending.
+ **/
+static __maybe_unused int omap_iommu_runtime_suspend(struct device *dev)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+ struct iommu_platform_data *pdata = dev_get_platdata(dev);
+ struct omap_iommu *obj = to_iommu(dev);
+ int ret;
+
+ /* save the TLBs only during suspend, and not for power down */
+ if (obj->domain && obj->iopgd)
+ omap_iommu_save_tlb_entries(obj);
+
+ omap2_iommu_disable(obj);
+
+ if (pdata && pdata->device_idle)
+ pdata->device_idle(pdev);
+
+ if (pdata && pdata->assert_reset)
+ pdata->assert_reset(pdev, pdata->reset_name);
+
+ if (pdata && pdata->set_pwrdm_constraint) {
+ ret = pdata->set_pwrdm_constraint(pdev, false, &obj->pwrst);
+ if (ret) {
+ dev_warn(obj->dev, "pwrdm_constraint failed to be reset, status = %d\n",
+ ret);
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * omap_iommu_runtime_resume - enable an iommu device
+ * @dev: iommu device
+ *
+ * This function performs all that is necessary to enable an
+ * IOMMU device, either during initial attachment to a client
+ * device, or during system/runtime resume of the device. This
+ * includes programming all the appropriate IOMMU registers, and
+ * managing the associated omap_hwmod's state and the device's
+ * reset line. The function also restores any locked TLBs if
+ * resuming after a suspend.
+ **/
+static __maybe_unused int omap_iommu_runtime_resume(struct device *dev)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+ struct iommu_platform_data *pdata = dev_get_platdata(dev);
+ struct omap_iommu *obj = to_iommu(dev);
+ int ret = 0;
+
+ if (pdata && pdata->set_pwrdm_constraint) {
+ ret = pdata->set_pwrdm_constraint(pdev, true, &obj->pwrst);
+ if (ret) {
+ dev_warn(obj->dev, "pwrdm_constraint failed to be set, status = %d\n",
+ ret);
+ }
+ }
+
+ if (pdata && pdata->deassert_reset) {
+ ret = pdata->deassert_reset(pdev, pdata->reset_name);
+ if (ret) {
+ dev_err(dev, "deassert_reset failed: %d\n", ret);
+ return ret;
+ }
+ }
+
+ if (pdata && pdata->device_enable)
+ pdata->device_enable(pdev);
+
+ /* restore the TLBs only during resume, and not for power up */
+ if (obj->domain)
+ omap_iommu_restore_tlb_entries(obj);
+
+ ret = omap2_iommu_enable(obj);
+
+ return ret;
+}
+
+/**
+ * omap_iommu_suspend_prepare - prepare() dev_pm_ops implementation
+ * @dev: iommu device
+ *
+ * This function performs the necessary checks to determine if the IOMMU
+ * device needs suspending or not. The function checks if the runtime_pm
+ * status of the device is suspended, and returns 1 in that case. This
+ * results in the PM core to skip invoking any of the Sleep PM callbacks
+ * (suspend, suspend_late, resume, resume_early etc).
+ */
+static int omap_iommu_prepare(struct device *dev)
+{
+ if (pm_runtime_status_suspended(dev))
+ return 1;
+ return 0;
+}
+
+static bool omap_iommu_can_register(struct platform_device *pdev)
+{
+ struct device_node *np = pdev->dev.of_node;
+
+ if (!of_device_is_compatible(np, "ti,dra7-dsp-iommu"))
+ return true;
+
+ /*
+ * restrict IOMMU core registration only for processor-port MDMA MMUs
+ * on DRA7 DSPs
+ */
+ if ((!strcmp(dev_name(&pdev->dev), "40d01000.mmu")) ||
+ (!strcmp(dev_name(&pdev->dev), "41501000.mmu")))
+ return true;
+
+ return false;
+}
+
+static int omap_iommu_dra7_get_dsp_system_cfg(struct platform_device *pdev,
+ struct omap_iommu *obj)
+{
+ struct device_node *np = pdev->dev.of_node;
+ int ret;
+
+ if (!of_device_is_compatible(np, "ti,dra7-dsp-iommu"))
+ return 0;
+
+ if (!of_property_read_bool(np, "ti,syscon-mmuconfig")) {
+ dev_err(&pdev->dev, "ti,syscon-mmuconfig property is missing\n");
+ return -EINVAL;
+ }
+
+ obj->syscfg =
+ syscon_regmap_lookup_by_phandle(np, "ti,syscon-mmuconfig");
+ if (IS_ERR(obj->syscfg)) {
+ /* can fail with -EPROBE_DEFER */
+ ret = PTR_ERR(obj->syscfg);
+ return ret;
+ }
+
+ if (of_property_read_u32_index(np, "ti,syscon-mmuconfig", 1,
+ &obj->id)) {
+ dev_err(&pdev->dev, "couldn't get the IOMMU instance id within subsystem\n");
+ return -EINVAL;
+ }
+
+ if (obj->id != 0 && obj->id != 1) {
+ dev_err(&pdev->dev, "invalid IOMMU instance id\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * OMAP Device MMU(IOMMU) detection
+ */
+static int omap_iommu_probe(struct platform_device *pdev)
+{
+ int err = -ENODEV;
+ int irq;
+ struct omap_iommu *obj;
+ struct resource *res;
+ struct device_node *of = pdev->dev.of_node;
+
+ if (!of) {
+ pr_err("%s: only DT-based devices are supported\n", __func__);
+ return -ENODEV;
+ }
+
+ obj = devm_kzalloc(&pdev->dev, sizeof(*obj) + MMU_REG_SIZE, GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+
+ /*
+ * self-manage the ordering dependencies between omap_device_enable/idle
+ * and omap_device_assert/deassert_hardreset API
+ */
+ if (pdev->dev.pm_domain) {
+ dev_dbg(&pdev->dev, "device pm_domain is being reset\n");
+ pdev->dev.pm_domain = NULL;
+ }
+
+ obj->name = dev_name(&pdev->dev);
+ obj->nr_tlb_entries = 32;
+ err = of_property_read_u32(of, "ti,#tlb-entries", &obj->nr_tlb_entries);
+ if (err && err != -EINVAL)
+ return err;
+ if (obj->nr_tlb_entries != 32 && obj->nr_tlb_entries != 8)
+ return -EINVAL;
+ if (of_find_property(of, "ti,iommu-bus-err-back", NULL))
+ obj->has_bus_err_back = MMU_GP_REG_BUS_ERR_BACK_EN;
+
+ obj->dev = &pdev->dev;
+ obj->ctx = (void *)obj + sizeof(*obj);
+ obj->cr_ctx = devm_kzalloc(&pdev->dev,
+ sizeof(*obj->cr_ctx) * obj->nr_tlb_entries,
+ GFP_KERNEL);
+ if (!obj->cr_ctx)
+ return -ENOMEM;
+
+ spin_lock_init(&obj->iommu_lock);
+ spin_lock_init(&obj->page_table_lock);
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ obj->regbase = devm_ioremap_resource(obj->dev, res);
+ if (IS_ERR(obj->regbase))
+ return PTR_ERR(obj->regbase);
+
+ err = omap_iommu_dra7_get_dsp_system_cfg(pdev, obj);
+ if (err)
+ return err;
+
+ irq = platform_get_irq(pdev, 0);
+ if (irq < 0)
+ return -ENODEV;
+
+ err = devm_request_irq(obj->dev, irq, iommu_fault_handler, IRQF_SHARED,
+ dev_name(obj->dev), obj);
+ if (err < 0)
+ return err;
+ platform_set_drvdata(pdev, obj);
+
+ if (omap_iommu_can_register(pdev)) {
+ obj->group = iommu_group_alloc();
+ if (IS_ERR(obj->group))
+ return PTR_ERR(obj->group);
+
+ err = iommu_device_sysfs_add(&obj->iommu, obj->dev, NULL,
+ obj->name);
+ if (err)
+ goto out_group;
+
+ iommu_device_set_ops(&obj->iommu, &omap_iommu_ops);
+ iommu_device_set_fwnode(&obj->iommu, &of->fwnode);
+
+ err = iommu_device_register(&obj->iommu);
+ if (err)
+ goto out_sysfs;
+ }
+
+ pm_runtime_enable(obj->dev);
+
+ omap_iommu_debugfs_add(obj);
+
+ dev_info(&pdev->dev, "%s registered\n", obj->name);
+
+ /* Re-probe bus to probe device attached to this IOMMU */
+ bus_iommu_probe(&platform_bus_type);
+
+ return 0;
+
+out_sysfs:
+ iommu_device_sysfs_remove(&obj->iommu);
+out_group:
+ iommu_group_put(obj->group);
+ return err;
+}
+
+static int omap_iommu_remove(struct platform_device *pdev)
+{
+ struct omap_iommu *obj = platform_get_drvdata(pdev);
+
+ if (obj->group) {
+ iommu_group_put(obj->group);
+ obj->group = NULL;
+
+ iommu_device_sysfs_remove(&obj->iommu);
+ iommu_device_unregister(&obj->iommu);
+ }
+
+ omap_iommu_debugfs_remove(obj);
+
+ pm_runtime_disable(obj->dev);
+
+ dev_info(&pdev->dev, "%s removed\n", obj->name);
+ return 0;
+}
+
+static const struct dev_pm_ops omap_iommu_pm_ops = {
+ .prepare = omap_iommu_prepare,
+ SET_LATE_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend,
+ pm_runtime_force_resume)
+ SET_RUNTIME_PM_OPS(omap_iommu_runtime_suspend,
+ omap_iommu_runtime_resume, NULL)
+};
+
+static const struct of_device_id omap_iommu_of_match[] = {
+ { .compatible = "ti,omap2-iommu" },
+ { .compatible = "ti,omap4-iommu" },
+ { .compatible = "ti,dra7-iommu" },
+ { .compatible = "ti,dra7-dsp-iommu" },
+ {},
+};
+
+static struct platform_driver omap_iommu_driver = {
+ .probe = omap_iommu_probe,
+ .remove = omap_iommu_remove,
+ .driver = {
+ .name = "omap-iommu",
+ .pm = &omap_iommu_pm_ops,
+ .of_match_table = of_match_ptr(omap_iommu_of_match),
+ },
+};
+
+static u32 iotlb_init_entry(struct iotlb_entry *e, u32 da, u32 pa, int pgsz)
+{
+ memset(e, 0, sizeof(*e));
+
+ e->da = da;
+ e->pa = pa;
+ e->valid = MMU_CAM_V;
+ e->pgsz = pgsz;
+ e->endian = MMU_RAM_ENDIAN_LITTLE;
+ e->elsz = MMU_RAM_ELSZ_8;
+ e->mixed = 0;
+
+ return iopgsz_to_bytes(e->pgsz);
+}
+
+static int omap_iommu_map(struct iommu_domain *domain, unsigned long da,
+ phys_addr_t pa, size_t bytes, int prot, gfp_t gfp)
+{
+ struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
+ struct device *dev = omap_domain->dev;
+ struct omap_iommu_device *iommu;
+ struct omap_iommu *oiommu;
+ struct iotlb_entry e;
+ int omap_pgsz;
+ u32 ret = -EINVAL;
+ int i;
+
+ omap_pgsz = bytes_to_iopgsz(bytes);
+ if (omap_pgsz < 0) {
+ dev_err(dev, "invalid size to map: %zu\n", bytes);
+ return -EINVAL;
+ }
+
+ dev_dbg(dev, "mapping da 0x%lx to pa %pa size 0x%zx\n", da, &pa, bytes);
+
+ iotlb_init_entry(&e, da, pa, omap_pgsz);
+
+ iommu = omap_domain->iommus;
+ for (i = 0; i < omap_domain->num_iommus; i++, iommu++) {
+ oiommu = iommu->iommu_dev;
+ ret = omap_iopgtable_store_entry(oiommu, &e);
+ if (ret) {
+ dev_err(dev, "omap_iopgtable_store_entry failed: %d\n",
+ ret);
+ break;
+ }
+ }
+
+ if (ret) {
+ while (i--) {
+ iommu--;
+ oiommu = iommu->iommu_dev;
+ iopgtable_clear_entry(oiommu, da);
+ }
+ }
+
+ return ret;
+}
+
+static size_t omap_iommu_unmap(struct iommu_domain *domain, unsigned long da,
+ size_t size, struct iommu_iotlb_gather *gather)
+{
+ struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
+ struct device *dev = omap_domain->dev;
+ struct omap_iommu_device *iommu;
+ struct omap_iommu *oiommu;
+ bool error = false;
+ size_t bytes = 0;
+ int i;
+
+ dev_dbg(dev, "unmapping da 0x%lx size %zu\n", da, size);
+
+ iommu = omap_domain->iommus;
+ for (i = 0; i < omap_domain->num_iommus; i++, iommu++) {
+ oiommu = iommu->iommu_dev;
+ bytes = iopgtable_clear_entry(oiommu, da);
+ if (!bytes)
+ error = true;
+ }
+
+ /*
+ * simplify return - we are only checking if any of the iommus
+ * reported an error, but not if all of them are unmapping the
+ * same number of entries. This should not occur due to the
+ * mirror programming.
+ */
+ return error ? 0 : bytes;
+}
+
+static int omap_iommu_count(struct device *dev)
+{
+ struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev);
+ int count = 0;
+
+ while (arch_data->iommu_dev) {
+ count++;
+ arch_data++;
+ }
+
+ return count;
+}
+
+/* caller should call cleanup if this function fails */
+static int omap_iommu_attach_init(struct device *dev,
+ struct omap_iommu_domain *odomain)
+{
+ struct omap_iommu_device *iommu;
+ int i;
+
+ odomain->num_iommus = omap_iommu_count(dev);
+ if (!odomain->num_iommus)
+ return -EINVAL;
+
+ odomain->iommus = kcalloc(odomain->num_iommus, sizeof(*iommu),
+ GFP_ATOMIC);
+ if (!odomain->iommus)
+ return -ENOMEM;
+
+ iommu = odomain->iommus;
+ for (i = 0; i < odomain->num_iommus; i++, iommu++) {
+ iommu->pgtable = kzalloc(IOPGD_TABLE_SIZE, GFP_ATOMIC);
+ if (!iommu->pgtable)
+ return -ENOMEM;
+
+ /*
+ * should never fail, but please keep this around to ensure
+ * we keep the hardware happy
+ */
+ if (WARN_ON(!IS_ALIGNED((long)iommu->pgtable,
+ IOPGD_TABLE_SIZE)))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void omap_iommu_detach_fini(struct omap_iommu_domain *odomain)
+{
+ int i;
+ struct omap_iommu_device *iommu = odomain->iommus;
+
+ for (i = 0; iommu && i < odomain->num_iommus; i++, iommu++)
+ kfree(iommu->pgtable);
+
+ kfree(odomain->iommus);
+ odomain->num_iommus = 0;
+ odomain->iommus = NULL;
+}
+
+static int
+omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+{
+ struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev);
+ struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
+ struct omap_iommu_device *iommu;
+ struct omap_iommu *oiommu;
+ int ret = 0;
+ int i;
+
+ if (!arch_data || !arch_data->iommu_dev) {
+ dev_err(dev, "device doesn't have an associated iommu\n");
+ return -EINVAL;
+ }
+
+ spin_lock(&omap_domain->lock);
+
+ /* only a single client device can be attached to a domain */
+ if (omap_domain->dev) {
+ dev_err(dev, "iommu domain is already attached\n");
+ ret = -EBUSY;
+ goto out;
+ }
+
+ ret = omap_iommu_attach_init(dev, omap_domain);
+ if (ret) {
+ dev_err(dev, "failed to allocate required iommu data %d\n",
+ ret);
+ goto init_fail;
+ }
+
+ iommu = omap_domain->iommus;
+ for (i = 0; i < omap_domain->num_iommus; i++, iommu++, arch_data++) {
+ /* configure and enable the omap iommu */
+ oiommu = arch_data->iommu_dev;
+ ret = omap_iommu_attach(oiommu, iommu->pgtable);
+ if (ret) {
+ dev_err(dev, "can't get omap iommu: %d\n", ret);
+ goto attach_fail;
+ }
+
+ oiommu->domain = domain;
+ iommu->iommu_dev = oiommu;
+ }
+
+ omap_domain->dev = dev;
+
+ goto out;
+
+attach_fail:
+ while (i--) {
+ iommu--;
+ arch_data--;
+ oiommu = iommu->iommu_dev;
+ omap_iommu_detach(oiommu);
+ iommu->iommu_dev = NULL;
+ oiommu->domain = NULL;
+ }
+init_fail:
+ omap_iommu_detach_fini(omap_domain);
+out:
+ spin_unlock(&omap_domain->lock);
+ return ret;
+}
+
+static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain,
+ struct device *dev)
+{
+ struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev);
+ struct omap_iommu_device *iommu = omap_domain->iommus;
+ struct omap_iommu *oiommu;
+ int i;
+
+ if (!omap_domain->dev) {
+ dev_err(dev, "domain has no attached device\n");
+ return;
+ }
+
+ /* only a single device is supported per domain for now */
+ if (omap_domain->dev != dev) {
+ dev_err(dev, "invalid attached device\n");
+ return;
+ }
+
+ /*
+ * cleanup in the reverse order of attachment - this addresses
+ * any h/w dependencies between multiple instances, if any
+ */
+ iommu += (omap_domain->num_iommus - 1);
+ arch_data += (omap_domain->num_iommus - 1);
+ for (i = 0; i < omap_domain->num_iommus; i++, iommu--, arch_data--) {
+ oiommu = iommu->iommu_dev;
+ iopgtable_clear_entry_all(oiommu);
+
+ omap_iommu_detach(oiommu);
+ iommu->iommu_dev = NULL;
+ oiommu->domain = NULL;
+ }
+
+ omap_iommu_detach_fini(omap_domain);
+
+ omap_domain->dev = NULL;
+}
+
+static void omap_iommu_detach_dev(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
+
+ spin_lock(&omap_domain->lock);
+ _omap_iommu_detach_dev(omap_domain, dev);
+ spin_unlock(&omap_domain->lock);
+}
+
+static struct iommu_domain *omap_iommu_domain_alloc(unsigned type)
+{
+ struct omap_iommu_domain *omap_domain;
+
+ if (type != IOMMU_DOMAIN_UNMANAGED)
+ return NULL;
+
+ omap_domain = kzalloc(sizeof(*omap_domain), GFP_KERNEL);
+ if (!omap_domain)
+ return NULL;
+
+ spin_lock_init(&omap_domain->lock);
+
+ omap_domain->domain.geometry.aperture_start = 0;
+ omap_domain->domain.geometry.aperture_end = (1ULL << 32) - 1;
+ omap_domain->domain.geometry.force_aperture = true;
+
+ return &omap_domain->domain;
+}
+
+static void omap_iommu_domain_free(struct iommu_domain *domain)
+{
+ struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
+
+ /*
+ * An iommu device is still attached
+ * (currently, only one device can be attached) ?
+ */
+ if (omap_domain->dev)
+ _omap_iommu_detach_dev(omap_domain, omap_domain->dev);
+
+ kfree(omap_domain);
+}
+
+static phys_addr_t omap_iommu_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t da)
+{
+ struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
+ struct omap_iommu_device *iommu = omap_domain->iommus;
+ struct omap_iommu *oiommu = iommu->iommu_dev;
+ struct device *dev = oiommu->dev;
+ u32 *pgd, *pte;
+ phys_addr_t ret = 0;
+
+ /*
+ * all the iommus within the domain will have identical programming,
+ * so perform the lookup using just the first iommu
+ */
+ iopgtable_lookup_entry(oiommu, da, &pgd, &pte);
+
+ if (pte) {
+ if (iopte_is_small(*pte))
+ ret = omap_iommu_translate(*pte, da, IOPTE_MASK);
+ else if (iopte_is_large(*pte))
+ ret = omap_iommu_translate(*pte, da, IOLARGE_MASK);
+ else
+ dev_err(dev, "bogus pte 0x%x, da 0x%llx", *pte,
+ (unsigned long long)da);
+ } else {
+ if (iopgd_is_section(*pgd))
+ ret = omap_iommu_translate(*pgd, da, IOSECTION_MASK);
+ else if (iopgd_is_super(*pgd))
+ ret = omap_iommu_translate(*pgd, da, IOSUPER_MASK);
+ else
+ dev_err(dev, "bogus pgd 0x%x, da 0x%llx", *pgd,
+ (unsigned long long)da);
+ }
+
+ return ret;
+}
+
+static struct iommu_device *omap_iommu_probe_device(struct device *dev)
+{
+ struct omap_iommu_arch_data *arch_data, *tmp;
+ struct platform_device *pdev;
+ struct omap_iommu *oiommu;
+ struct device_node *np;
+ int num_iommus, i;
+
+ /*
+ * Allocate the per-device iommu structure for DT-based devices.
+ *
+ * TODO: Simplify this when removing non-DT support completely from the
+ * IOMMU users.
+ */
+ if (!dev->of_node)
+ return ERR_PTR(-ENODEV);
+
+ /*
+ * retrieve the count of IOMMU nodes using phandle size as element size
+ * since #iommu-cells = 0 for OMAP
+ */
+ num_iommus = of_property_count_elems_of_size(dev->of_node, "iommus",
+ sizeof(phandle));
+ if (num_iommus < 0)
+ return ERR_PTR(-ENODEV);
+
+ arch_data = kcalloc(num_iommus + 1, sizeof(*arch_data), GFP_KERNEL);
+ if (!arch_data)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0, tmp = arch_data; i < num_iommus; i++, tmp++) {
+ np = of_parse_phandle(dev->of_node, "iommus", i);
+ if (!np) {
+ kfree(arch_data);
+ return ERR_PTR(-EINVAL);
+ }
+
+ pdev = of_find_device_by_node(np);
+ if (!pdev) {
+ of_node_put(np);
+ kfree(arch_data);
+ return ERR_PTR(-ENODEV);
+ }
+
+ oiommu = platform_get_drvdata(pdev);
+ if (!oiommu) {
+ of_node_put(np);
+ kfree(arch_data);
+ return ERR_PTR(-EINVAL);
+ }
+
+ tmp->iommu_dev = oiommu;
+ tmp->dev = &pdev->dev;
+
+ of_node_put(np);
+ }
+
+ dev_iommu_priv_set(dev, arch_data);
+
+ /*
+ * use the first IOMMU alone for the sysfs device linking.
+ * TODO: Evaluate if a single iommu_group needs to be
+ * maintained for both IOMMUs
+ */
+ oiommu = arch_data->iommu_dev;
+
+ return &oiommu->iommu;
+}
+
+static void omap_iommu_release_device(struct device *dev)
+{
+ struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev);
+
+ if (!dev->of_node || !arch_data)
+ return;
+
+ dev_iommu_priv_set(dev, NULL);
+ kfree(arch_data);
+
+}
+
+static struct iommu_group *omap_iommu_device_group(struct device *dev)
+{
+ struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev);
+ struct iommu_group *group = ERR_PTR(-EINVAL);
+
+ if (!arch_data)
+ return ERR_PTR(-ENODEV);
+
+ if (arch_data->iommu_dev)
+ group = iommu_group_ref_get(arch_data->iommu_dev->group);
+
+ return group;
+}
+
+static const struct iommu_ops omap_iommu_ops = {
+ .domain_alloc = omap_iommu_domain_alloc,
+ .domain_free = omap_iommu_domain_free,
+ .attach_dev = omap_iommu_attach_dev,
+ .detach_dev = omap_iommu_detach_dev,
+ .map = omap_iommu_map,
+ .unmap = omap_iommu_unmap,
+ .iova_to_phys = omap_iommu_iova_to_phys,
+ .probe_device = omap_iommu_probe_device,
+ .release_device = omap_iommu_release_device,
+ .device_group = omap_iommu_device_group,
+ .pgsize_bitmap = OMAP_IOMMU_PGSIZES,
+};
+
+static int __init omap_iommu_init(void)
+{
+ struct kmem_cache *p;
+ const slab_flags_t flags = SLAB_HWCACHE_ALIGN;
+ size_t align = 1 << 10; /* L2 pagetable alignement */
+ struct device_node *np;
+ int ret;
+
+ np = of_find_matching_node(NULL, omap_iommu_of_match);
+ if (!np)
+ return 0;
+
+ of_node_put(np);
+
+ p = kmem_cache_create("iopte_cache", IOPTE_TABLE_SIZE, align, flags,
+ NULL);
+ if (!p)
+ return -ENOMEM;
+ iopte_cachep = p;
+
+ omap_iommu_debugfs_init();
+
+ ret = platform_driver_register(&omap_iommu_driver);
+ if (ret) {
+ pr_err("%s: failed to register driver\n", __func__);
+ goto fail_driver;
+ }
+
+ ret = bus_set_iommu(&platform_bus_type, &omap_iommu_ops);
+ if (ret)
+ goto fail_bus;
+
+ return 0;
+
+fail_bus:
+ platform_driver_unregister(&omap_iommu_driver);
+fail_driver:
+ kmem_cache_destroy(iopte_cachep);
+ return ret;
+}
+subsys_initcall(omap_iommu_init);
+/* must be ready before omap3isp is probed */
diff --git a/drivers/iommu/omap-iommu.h b/drivers/iommu/omap-iommu.h
new file mode 100644
index 000000000..18ee713ed
--- /dev/null
+++ b/drivers/iommu/omap-iommu.h
@@ -0,0 +1,274 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * omap iommu: main structures
+ *
+ * Copyright (C) 2008-2009 Nokia Corporation
+ *
+ * Written by Hiroshi DOYU <Hiroshi.DOYU@nokia.com>
+ */
+
+#ifndef _OMAP_IOMMU_H
+#define _OMAP_IOMMU_H
+
+#include <linux/bitops.h>
+#include <linux/iommu.h>
+
+#define for_each_iotlb_cr(obj, n, __i, cr) \
+ for (__i = 0; \
+ (__i < (n)) && (cr = __iotlb_read_cr((obj), __i), true); \
+ __i++)
+
+struct iotlb_entry {
+ u32 da;
+ u32 pa;
+ u32 pgsz, prsvd, valid;
+ u32 endian, elsz, mixed;
+};
+
+/**
+ * struct omap_iommu_device - omap iommu device data
+ * @pgtable: page table used by an omap iommu attached to a domain
+ * @iommu_dev: pointer to store an omap iommu instance attached to a domain
+ */
+struct omap_iommu_device {
+ u32 *pgtable;
+ struct omap_iommu *iommu_dev;
+};
+
+/**
+ * struct omap_iommu_domain - omap iommu domain
+ * @num_iommus: number of iommus in this domain
+ * @iommus: omap iommu device data for all iommus in this domain
+ * @dev: Device using this domain.
+ * @lock: domain lock, should be taken when attaching/detaching
+ * @domain: generic domain handle used by iommu core code
+ */
+struct omap_iommu_domain {
+ u32 num_iommus;
+ struct omap_iommu_device *iommus;
+ struct device *dev;
+ spinlock_t lock;
+ struct iommu_domain domain;
+};
+
+struct omap_iommu {
+ const char *name;
+ void __iomem *regbase;
+ struct regmap *syscfg;
+ struct device *dev;
+ struct iommu_domain *domain;
+ struct dentry *debug_dir;
+
+ spinlock_t iommu_lock; /* global for this whole object */
+
+ /*
+ * We don't change iopgd for a situation like pgd for a task,
+ * but share it globally for each iommu.
+ */
+ u32 *iopgd;
+ spinlock_t page_table_lock; /* protect iopgd */
+ dma_addr_t pd_dma;
+
+ int nr_tlb_entries;
+
+ void *ctx; /* iommu context: registres saved area */
+
+ struct cr_regs *cr_ctx;
+ u32 num_cr_ctx;
+
+ int has_bus_err_back;
+ u32 id;
+
+ struct iommu_device iommu;
+ struct iommu_group *group;
+
+ u8 pwrst;
+};
+
+/**
+ * struct omap_iommu_arch_data - omap iommu private data
+ * @iommu_dev: handle of the OMAP iommu device
+ * @dev: handle of the iommu device
+ *
+ * This is an omap iommu private data object, which binds an iommu user
+ * to its iommu device. This object should be placed at the iommu user's
+ * dev_archdata so generic IOMMU API can be used without having to
+ * utilize omap-specific plumbing anymore.
+ */
+struct omap_iommu_arch_data {
+ struct omap_iommu *iommu_dev;
+ struct device *dev;
+};
+
+struct cr_regs {
+ u32 cam;
+ u32 ram;
+};
+
+struct iotlb_lock {
+ short base;
+ short vict;
+};
+
+/*
+ * MMU Register offsets
+ */
+#define MMU_REVISION 0x00
+#define MMU_IRQSTATUS 0x18
+#define MMU_IRQENABLE 0x1c
+#define MMU_WALKING_ST 0x40
+#define MMU_CNTL 0x44
+#define MMU_FAULT_AD 0x48
+#define MMU_TTB 0x4c
+#define MMU_LOCK 0x50
+#define MMU_LD_TLB 0x54
+#define MMU_CAM 0x58
+#define MMU_RAM 0x5c
+#define MMU_GFLUSH 0x60
+#define MMU_FLUSH_ENTRY 0x64
+#define MMU_READ_CAM 0x68
+#define MMU_READ_RAM 0x6c
+#define MMU_EMU_FAULT_AD 0x70
+#define MMU_GP_REG 0x88
+
+#define MMU_REG_SIZE 256
+
+/*
+ * MMU Register bit definitions
+ */
+/* IRQSTATUS & IRQENABLE */
+#define MMU_IRQ_MULTIHITFAULT BIT(4)
+#define MMU_IRQ_TABLEWALKFAULT BIT(3)
+#define MMU_IRQ_EMUMISS BIT(2)
+#define MMU_IRQ_TRANSLATIONFAULT BIT(1)
+#define MMU_IRQ_TLBMISS BIT(0)
+
+#define __MMU_IRQ_FAULT \
+ (MMU_IRQ_MULTIHITFAULT | MMU_IRQ_EMUMISS | MMU_IRQ_TRANSLATIONFAULT)
+#define MMU_IRQ_MASK \
+ (__MMU_IRQ_FAULT | MMU_IRQ_TABLEWALKFAULT | MMU_IRQ_TLBMISS)
+#define MMU_IRQ_TWL_MASK (__MMU_IRQ_FAULT | MMU_IRQ_TABLEWALKFAULT)
+#define MMU_IRQ_TLB_MISS_MASK (__MMU_IRQ_FAULT | MMU_IRQ_TLBMISS)
+
+/* MMU_CNTL */
+#define MMU_CNTL_SHIFT 1
+#define MMU_CNTL_MASK (7 << MMU_CNTL_SHIFT)
+#define MMU_CNTL_EML_TLB BIT(3)
+#define MMU_CNTL_TWL_EN BIT(2)
+#define MMU_CNTL_MMU_EN BIT(1)
+
+/* CAM */
+#define MMU_CAM_VATAG_SHIFT 12
+#define MMU_CAM_VATAG_MASK \
+ ((~0UL >> MMU_CAM_VATAG_SHIFT) << MMU_CAM_VATAG_SHIFT)
+#define MMU_CAM_P BIT(3)
+#define MMU_CAM_V BIT(2)
+#define MMU_CAM_PGSZ_MASK 3
+#define MMU_CAM_PGSZ_1M (0 << 0)
+#define MMU_CAM_PGSZ_64K (1 << 0)
+#define MMU_CAM_PGSZ_4K (2 << 0)
+#define MMU_CAM_PGSZ_16M (3 << 0)
+
+/* RAM */
+#define MMU_RAM_PADDR_SHIFT 12
+#define MMU_RAM_PADDR_MASK \
+ ((~0UL >> MMU_RAM_PADDR_SHIFT) << MMU_RAM_PADDR_SHIFT)
+
+#define MMU_RAM_ENDIAN_SHIFT 9
+#define MMU_RAM_ENDIAN_MASK BIT(MMU_RAM_ENDIAN_SHIFT)
+#define MMU_RAM_ENDIAN_LITTLE (0 << MMU_RAM_ENDIAN_SHIFT)
+#define MMU_RAM_ENDIAN_BIG BIT(MMU_RAM_ENDIAN_SHIFT)
+
+#define MMU_RAM_ELSZ_SHIFT 7
+#define MMU_RAM_ELSZ_MASK (3 << MMU_RAM_ELSZ_SHIFT)
+#define MMU_RAM_ELSZ_8 (0 << MMU_RAM_ELSZ_SHIFT)
+#define MMU_RAM_ELSZ_16 (1 << MMU_RAM_ELSZ_SHIFT)
+#define MMU_RAM_ELSZ_32 (2 << MMU_RAM_ELSZ_SHIFT)
+#define MMU_RAM_ELSZ_NONE (3 << MMU_RAM_ELSZ_SHIFT)
+#define MMU_RAM_MIXED_SHIFT 6
+#define MMU_RAM_MIXED_MASK BIT(MMU_RAM_MIXED_SHIFT)
+#define MMU_RAM_MIXED MMU_RAM_MIXED_MASK
+
+#define MMU_GP_REG_BUS_ERR_BACK_EN 0x1
+
+#define get_cam_va_mask(pgsz) \
+ (((pgsz) == MMU_CAM_PGSZ_16M) ? 0xff000000 : \
+ ((pgsz) == MMU_CAM_PGSZ_1M) ? 0xfff00000 : \
+ ((pgsz) == MMU_CAM_PGSZ_64K) ? 0xffff0000 : \
+ ((pgsz) == MMU_CAM_PGSZ_4K) ? 0xfffff000 : 0)
+
+/*
+ * DSP_SYSTEM registers and bit definitions (applicable only for DRA7xx DSP)
+ */
+#define DSP_SYS_REVISION 0x00
+#define DSP_SYS_MMU_CONFIG 0x18
+#define DSP_SYS_MMU_CONFIG_EN_SHIFT 4
+
+/*
+ * utilities for super page(16MB, 1MB, 64KB and 4KB)
+ */
+
+#define iopgsz_max(bytes) \
+ (((bytes) >= SZ_16M) ? SZ_16M : \
+ ((bytes) >= SZ_1M) ? SZ_1M : \
+ ((bytes) >= SZ_64K) ? SZ_64K : \
+ ((bytes) >= SZ_4K) ? SZ_4K : 0)
+
+#define bytes_to_iopgsz(bytes) \
+ (((bytes) == SZ_16M) ? MMU_CAM_PGSZ_16M : \
+ ((bytes) == SZ_1M) ? MMU_CAM_PGSZ_1M : \
+ ((bytes) == SZ_64K) ? MMU_CAM_PGSZ_64K : \
+ ((bytes) == SZ_4K) ? MMU_CAM_PGSZ_4K : -1)
+
+#define iopgsz_to_bytes(iopgsz) \
+ (((iopgsz) == MMU_CAM_PGSZ_16M) ? SZ_16M : \
+ ((iopgsz) == MMU_CAM_PGSZ_1M) ? SZ_1M : \
+ ((iopgsz) == MMU_CAM_PGSZ_64K) ? SZ_64K : \
+ ((iopgsz) == MMU_CAM_PGSZ_4K) ? SZ_4K : 0)
+
+#define iopgsz_ok(bytes) (bytes_to_iopgsz(bytes) >= 0)
+
+/*
+ * global functions
+ */
+
+struct cr_regs __iotlb_read_cr(struct omap_iommu *obj, int n);
+void iotlb_lock_get(struct omap_iommu *obj, struct iotlb_lock *l);
+void iotlb_lock_set(struct omap_iommu *obj, struct iotlb_lock *l);
+
+#ifdef CONFIG_OMAP_IOMMU_DEBUG
+void omap_iommu_debugfs_init(void);
+void omap_iommu_debugfs_exit(void);
+
+void omap_iommu_debugfs_add(struct omap_iommu *obj);
+void omap_iommu_debugfs_remove(struct omap_iommu *obj);
+#else
+static inline void omap_iommu_debugfs_init(void) { }
+static inline void omap_iommu_debugfs_exit(void) { }
+
+static inline void omap_iommu_debugfs_add(struct omap_iommu *obj) { }
+static inline void omap_iommu_debugfs_remove(struct omap_iommu *obj) { }
+#endif
+
+/*
+ * register accessors
+ */
+static inline u32 iommu_read_reg(struct omap_iommu *obj, size_t offs)
+{
+ return __raw_readl(obj->regbase + offs);
+}
+
+static inline void iommu_write_reg(struct omap_iommu *obj, u32 val, size_t offs)
+{
+ __raw_writel(val, obj->regbase + offs);
+}
+
+static inline int iotlb_cr_valid(struct cr_regs *cr)
+{
+ if (!cr)
+ return -EINVAL;
+
+ return cr->cam & MMU_CAM_V;
+}
+
+#endif /* _OMAP_IOMMU_H */
diff --git a/drivers/iommu/omap-iopgtable.h b/drivers/iommu/omap-iopgtable.h
new file mode 100644
index 000000000..51d74002c
--- /dev/null
+++ b/drivers/iommu/omap-iopgtable.h
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * omap iommu: pagetable definitions
+ *
+ * Copyright (C) 2008-2010 Nokia Corporation
+ *
+ * Written by Hiroshi DOYU <Hiroshi.DOYU@nokia.com>
+ */
+
+#ifndef _OMAP_IOPGTABLE_H
+#define _OMAP_IOPGTABLE_H
+
+#include <linux/bitops.h>
+
+/*
+ * "L2 table" address mask and size definitions.
+ */
+#define IOPGD_SHIFT 20
+#define IOPGD_SIZE BIT(IOPGD_SHIFT)
+#define IOPGD_MASK (~(IOPGD_SIZE - 1))
+
+/*
+ * "section" address mask and size definitions.
+ */
+#define IOSECTION_SHIFT 20
+#define IOSECTION_SIZE BIT(IOSECTION_SHIFT)
+#define IOSECTION_MASK (~(IOSECTION_SIZE - 1))
+
+/*
+ * "supersection" address mask and size definitions.
+ */
+#define IOSUPER_SHIFT 24
+#define IOSUPER_SIZE BIT(IOSUPER_SHIFT)
+#define IOSUPER_MASK (~(IOSUPER_SIZE - 1))
+
+#define PTRS_PER_IOPGD (1UL << (32 - IOPGD_SHIFT))
+#define IOPGD_TABLE_SIZE (PTRS_PER_IOPGD * sizeof(u32))
+
+/*
+ * "small page" address mask and size definitions.
+ */
+#define IOPTE_SHIFT 12
+#define IOPTE_SIZE BIT(IOPTE_SHIFT)
+#define IOPTE_MASK (~(IOPTE_SIZE - 1))
+
+/*
+ * "large page" address mask and size definitions.
+ */
+#define IOLARGE_SHIFT 16
+#define IOLARGE_SIZE BIT(IOLARGE_SHIFT)
+#define IOLARGE_MASK (~(IOLARGE_SIZE - 1))
+
+#define PTRS_PER_IOPTE (1UL << (IOPGD_SHIFT - IOPTE_SHIFT))
+#define IOPTE_TABLE_SIZE (PTRS_PER_IOPTE * sizeof(u32))
+
+#define IOPAGE_MASK IOPTE_MASK
+
+/**
+ * omap_iommu_translate() - va to pa translation
+ * @d: omap iommu descriptor
+ * @va: virtual address
+ * @mask: omap iommu descriptor mask
+ *
+ * va to pa translation
+ */
+static inline phys_addr_t omap_iommu_translate(unsigned long d, dma_addr_t va,
+ dma_addr_t mask)
+{
+ return (d & mask) | (va & (~mask));
+}
+
+/*
+ * some descriptor attributes.
+ */
+#define IOPGD_TABLE (1)
+#define IOPGD_SECTION (2)
+#define IOPGD_SUPER (BIT(18) | IOPGD_SECTION)
+
+#define iopgd_is_table(x) (((x) & 3) == IOPGD_TABLE)
+#define iopgd_is_section(x) (((x) & (1 << 18 | 3)) == IOPGD_SECTION)
+#define iopgd_is_super(x) (((x) & (1 << 18 | 3)) == IOPGD_SUPER)
+
+#define IOPTE_SMALL (2)
+#define IOPTE_LARGE (1)
+
+#define iopte_is_small(x) (((x) & 2) == IOPTE_SMALL)
+#define iopte_is_large(x) (((x) & 3) == IOPTE_LARGE)
+
+/* to find an entry in a page-table-directory */
+#define iopgd_index(da) (((da) >> IOPGD_SHIFT) & (PTRS_PER_IOPGD - 1))
+#define iopgd_offset(obj, da) ((obj)->iopgd + iopgd_index(da))
+
+#define iopgd_page_paddr(iopgd) (*iopgd & ~((1 << 10) - 1))
+#define iopgd_page_vaddr(iopgd) ((u32 *)phys_to_virt(iopgd_page_paddr(iopgd)))
+
+/* to find an entry in the second-level page table. */
+#define iopte_index(da) (((da) >> IOPTE_SHIFT) & (PTRS_PER_IOPTE - 1))
+#define iopte_offset(iopgd, da) (iopgd_page_vaddr(iopgd) + iopte_index(da))
+
+#endif /* _OMAP_IOPGTABLE_H */
diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
new file mode 100644
index 000000000..12551dc11
--- /dev/null
+++ b/drivers/iommu/rockchip-iommu.c
@@ -0,0 +1,1305 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IOMMU API for Rockchip
+ *
+ * Module Authors: Simon Xue <xxm@rock-chips.com>
+ * Daniel Kurtz <djkurtz@chromium.org>
+ */
+
+#include <linux/clk.h>
+#include <linux/compiler.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-iommu.h>
+#include <linux/dma-mapping.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/of_iommu.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+/** MMU register offsets */
+#define RK_MMU_DTE_ADDR 0x00 /* Directory table address */
+#define RK_MMU_STATUS 0x04
+#define RK_MMU_COMMAND 0x08
+#define RK_MMU_PAGE_FAULT_ADDR 0x0C /* IOVA of last page fault */
+#define RK_MMU_ZAP_ONE_LINE 0x10 /* Shootdown one IOTLB entry */
+#define RK_MMU_INT_RAWSTAT 0x14 /* IRQ status ignoring mask */
+#define RK_MMU_INT_CLEAR 0x18 /* Acknowledge and re-arm irq */
+#define RK_MMU_INT_MASK 0x1C /* IRQ enable */
+#define RK_MMU_INT_STATUS 0x20 /* IRQ status after masking */
+#define RK_MMU_AUTO_GATING 0x24
+
+#define DTE_ADDR_DUMMY 0xCAFEBABE
+
+#define RK_MMU_POLL_PERIOD_US 100
+#define RK_MMU_FORCE_RESET_TIMEOUT_US 100000
+#define RK_MMU_POLL_TIMEOUT_US 1000
+
+/* RK_MMU_STATUS fields */
+#define RK_MMU_STATUS_PAGING_ENABLED BIT(0)
+#define RK_MMU_STATUS_PAGE_FAULT_ACTIVE BIT(1)
+#define RK_MMU_STATUS_STALL_ACTIVE BIT(2)
+#define RK_MMU_STATUS_IDLE BIT(3)
+#define RK_MMU_STATUS_REPLAY_BUFFER_EMPTY BIT(4)
+#define RK_MMU_STATUS_PAGE_FAULT_IS_WRITE BIT(5)
+#define RK_MMU_STATUS_STALL_NOT_ACTIVE BIT(31)
+
+/* RK_MMU_COMMAND command values */
+#define RK_MMU_CMD_ENABLE_PAGING 0 /* Enable memory translation */
+#define RK_MMU_CMD_DISABLE_PAGING 1 /* Disable memory translation */
+#define RK_MMU_CMD_ENABLE_STALL 2 /* Stall paging to allow other cmds */
+#define RK_MMU_CMD_DISABLE_STALL 3 /* Stop stall re-enables paging */
+#define RK_MMU_CMD_ZAP_CACHE 4 /* Shoot down entire IOTLB */
+#define RK_MMU_CMD_PAGE_FAULT_DONE 5 /* Clear page fault */
+#define RK_MMU_CMD_FORCE_RESET 6 /* Reset all registers */
+
+/* RK_MMU_INT_* register fields */
+#define RK_MMU_IRQ_PAGE_FAULT 0x01 /* page fault */
+#define RK_MMU_IRQ_BUS_ERROR 0x02 /* bus read error */
+#define RK_MMU_IRQ_MASK (RK_MMU_IRQ_PAGE_FAULT | RK_MMU_IRQ_BUS_ERROR)
+
+#define NUM_DT_ENTRIES 1024
+#define NUM_PT_ENTRIES 1024
+
+#define SPAGE_ORDER 12
+#define SPAGE_SIZE (1 << SPAGE_ORDER)
+
+ /*
+ * Support mapping any size that fits in one page table:
+ * 4 KiB to 4 MiB
+ */
+#define RK_IOMMU_PGSIZE_BITMAP 0x007ff000
+
+struct rk_iommu_domain {
+ struct list_head iommus;
+ u32 *dt; /* page directory table */
+ dma_addr_t dt_dma;
+ spinlock_t iommus_lock; /* lock for iommus list */
+ spinlock_t dt_lock; /* lock for modifying page directory table */
+
+ struct iommu_domain domain;
+};
+
+/* list of clocks required by IOMMU */
+static const char * const rk_iommu_clocks[] = {
+ "aclk", "iface",
+};
+
+struct rk_iommu {
+ struct device *dev;
+ void __iomem **bases;
+ int num_mmu;
+ int num_irq;
+ struct clk_bulk_data *clocks;
+ int num_clocks;
+ bool reset_disabled;
+ struct iommu_device iommu;
+ struct list_head node; /* entry in rk_iommu_domain.iommus */
+ struct iommu_domain *domain; /* domain to which iommu is attached */
+ struct iommu_group *group;
+};
+
+struct rk_iommudata {
+ struct device_link *link; /* runtime PM link from IOMMU to master */
+ struct rk_iommu *iommu;
+};
+
+static struct device *dma_dev;
+
+static inline void rk_table_flush(struct rk_iommu_domain *dom, dma_addr_t dma,
+ unsigned int count)
+{
+ size_t size = count * sizeof(u32); /* count of u32 entry */
+
+ dma_sync_single_for_device(dma_dev, dma, size, DMA_TO_DEVICE);
+}
+
+static struct rk_iommu_domain *to_rk_domain(struct iommu_domain *dom)
+{
+ return container_of(dom, struct rk_iommu_domain, domain);
+}
+
+/*
+ * The Rockchip rk3288 iommu uses a 2-level page table.
+ * The first level is the "Directory Table" (DT).
+ * The DT consists of 1024 4-byte Directory Table Entries (DTEs), each pointing
+ * to a "Page Table".
+ * The second level is the 1024 Page Tables (PT).
+ * Each PT consists of 1024 4-byte Page Table Entries (PTEs), each pointing to
+ * a 4 KB page of physical memory.
+ *
+ * The DT and each PT fits in a single 4 KB page (4-bytes * 1024 entries).
+ * Each iommu device has a MMU_DTE_ADDR register that contains the physical
+ * address of the start of the DT page.
+ *
+ * The structure of the page table is as follows:
+ *
+ * DT
+ * MMU_DTE_ADDR -> +-----+
+ * | |
+ * +-----+ PT
+ * | DTE | -> +-----+
+ * +-----+ | | Memory
+ * | | +-----+ Page
+ * | | | PTE | -> +-----+
+ * +-----+ +-----+ | |
+ * | | | |
+ * | | | |
+ * +-----+ | |
+ * | |
+ * | |
+ * +-----+
+ */
+
+/*
+ * Each DTE has a PT address and a valid bit:
+ * +---------------------+-----------+-+
+ * | PT address | Reserved |V|
+ * +---------------------+-----------+-+
+ * 31:12 - PT address (PTs always starts on a 4 KB boundary)
+ * 11: 1 - Reserved
+ * 0 - 1 if PT @ PT address is valid
+ */
+#define RK_DTE_PT_ADDRESS_MASK 0xfffff000
+#define RK_DTE_PT_VALID BIT(0)
+
+static inline phys_addr_t rk_dte_pt_address(u32 dte)
+{
+ return (phys_addr_t)dte & RK_DTE_PT_ADDRESS_MASK;
+}
+
+static inline bool rk_dte_is_pt_valid(u32 dte)
+{
+ return dte & RK_DTE_PT_VALID;
+}
+
+static inline u32 rk_mk_dte(dma_addr_t pt_dma)
+{
+ return (pt_dma & RK_DTE_PT_ADDRESS_MASK) | RK_DTE_PT_VALID;
+}
+
+/*
+ * Each PTE has a Page address, some flags and a valid bit:
+ * +---------------------+---+-------+-+
+ * | Page address |Rsv| Flags |V|
+ * +---------------------+---+-------+-+
+ * 31:12 - Page address (Pages always start on a 4 KB boundary)
+ * 11: 9 - Reserved
+ * 8: 1 - Flags
+ * 8 - Read allocate - allocate cache space on read misses
+ * 7 - Read cache - enable cache & prefetch of data
+ * 6 - Write buffer - enable delaying writes on their way to memory
+ * 5 - Write allocate - allocate cache space on write misses
+ * 4 - Write cache - different writes can be merged together
+ * 3 - Override cache attributes
+ * if 1, bits 4-8 control cache attributes
+ * if 0, the system bus defaults are used
+ * 2 - Writable
+ * 1 - Readable
+ * 0 - 1 if Page @ Page address is valid
+ */
+#define RK_PTE_PAGE_ADDRESS_MASK 0xfffff000
+#define RK_PTE_PAGE_FLAGS_MASK 0x000001fe
+#define RK_PTE_PAGE_WRITABLE BIT(2)
+#define RK_PTE_PAGE_READABLE BIT(1)
+#define RK_PTE_PAGE_VALID BIT(0)
+
+static inline phys_addr_t rk_pte_page_address(u32 pte)
+{
+ return (phys_addr_t)pte & RK_PTE_PAGE_ADDRESS_MASK;
+}
+
+static inline bool rk_pte_is_page_valid(u32 pte)
+{
+ return pte & RK_PTE_PAGE_VALID;
+}
+
+/* TODO: set cache flags per prot IOMMU_CACHE */
+static u32 rk_mk_pte(phys_addr_t page, int prot)
+{
+ u32 flags = 0;
+ flags |= (prot & IOMMU_READ) ? RK_PTE_PAGE_READABLE : 0;
+ flags |= (prot & IOMMU_WRITE) ? RK_PTE_PAGE_WRITABLE : 0;
+ page &= RK_PTE_PAGE_ADDRESS_MASK;
+ return page | flags | RK_PTE_PAGE_VALID;
+}
+
+static u32 rk_mk_pte_invalid(u32 pte)
+{
+ return pte & ~RK_PTE_PAGE_VALID;
+}
+
+/*
+ * rk3288 iova (IOMMU Virtual Address) format
+ * 31 22.21 12.11 0
+ * +-----------+-----------+-------------+
+ * | DTE index | PTE index | Page offset |
+ * +-----------+-----------+-------------+
+ * 31:22 - DTE index - index of DTE in DT
+ * 21:12 - PTE index - index of PTE in PT @ DTE.pt_address
+ * 11: 0 - Page offset - offset into page @ PTE.page_address
+ */
+#define RK_IOVA_DTE_MASK 0xffc00000
+#define RK_IOVA_DTE_SHIFT 22
+#define RK_IOVA_PTE_MASK 0x003ff000
+#define RK_IOVA_PTE_SHIFT 12
+#define RK_IOVA_PAGE_MASK 0x00000fff
+#define RK_IOVA_PAGE_SHIFT 0
+
+static u32 rk_iova_dte_index(dma_addr_t iova)
+{
+ return (u32)(iova & RK_IOVA_DTE_MASK) >> RK_IOVA_DTE_SHIFT;
+}
+
+static u32 rk_iova_pte_index(dma_addr_t iova)
+{
+ return (u32)(iova & RK_IOVA_PTE_MASK) >> RK_IOVA_PTE_SHIFT;
+}
+
+static u32 rk_iova_page_offset(dma_addr_t iova)
+{
+ return (u32)(iova & RK_IOVA_PAGE_MASK) >> RK_IOVA_PAGE_SHIFT;
+}
+
+static u32 rk_iommu_read(void __iomem *base, u32 offset)
+{
+ return readl(base + offset);
+}
+
+static void rk_iommu_write(void __iomem *base, u32 offset, u32 value)
+{
+ writel(value, base + offset);
+}
+
+static void rk_iommu_command(struct rk_iommu *iommu, u32 command)
+{
+ int i;
+
+ for (i = 0; i < iommu->num_mmu; i++)
+ writel(command, iommu->bases[i] + RK_MMU_COMMAND);
+}
+
+static void rk_iommu_base_command(void __iomem *base, u32 command)
+{
+ writel(command, base + RK_MMU_COMMAND);
+}
+static void rk_iommu_zap_lines(struct rk_iommu *iommu, dma_addr_t iova_start,
+ size_t size)
+{
+ int i;
+ dma_addr_t iova_end = iova_start + size;
+ /*
+ * TODO(djkurtz): Figure out when it is more efficient to shootdown the
+ * entire iotlb rather than iterate over individual iovas.
+ */
+ for (i = 0; i < iommu->num_mmu; i++) {
+ dma_addr_t iova;
+
+ for (iova = iova_start; iova < iova_end; iova += SPAGE_SIZE)
+ rk_iommu_write(iommu->bases[i], RK_MMU_ZAP_ONE_LINE, iova);
+ }
+}
+
+static bool rk_iommu_is_stall_active(struct rk_iommu *iommu)
+{
+ bool active = true;
+ int i;
+
+ for (i = 0; i < iommu->num_mmu; i++)
+ active &= !!(rk_iommu_read(iommu->bases[i], RK_MMU_STATUS) &
+ RK_MMU_STATUS_STALL_ACTIVE);
+
+ return active;
+}
+
+static bool rk_iommu_is_paging_enabled(struct rk_iommu *iommu)
+{
+ bool enable = true;
+ int i;
+
+ for (i = 0; i < iommu->num_mmu; i++)
+ enable &= !!(rk_iommu_read(iommu->bases[i], RK_MMU_STATUS) &
+ RK_MMU_STATUS_PAGING_ENABLED);
+
+ return enable;
+}
+
+static bool rk_iommu_is_reset_done(struct rk_iommu *iommu)
+{
+ bool done = true;
+ int i;
+
+ for (i = 0; i < iommu->num_mmu; i++)
+ done &= rk_iommu_read(iommu->bases[i], RK_MMU_DTE_ADDR) == 0;
+
+ return done;
+}
+
+static int rk_iommu_enable_stall(struct rk_iommu *iommu)
+{
+ int ret, i;
+ bool val;
+
+ if (rk_iommu_is_stall_active(iommu))
+ return 0;
+
+ /* Stall can only be enabled if paging is enabled */
+ if (!rk_iommu_is_paging_enabled(iommu))
+ return 0;
+
+ rk_iommu_command(iommu, RK_MMU_CMD_ENABLE_STALL);
+
+ ret = readx_poll_timeout(rk_iommu_is_stall_active, iommu, val,
+ val, RK_MMU_POLL_PERIOD_US,
+ RK_MMU_POLL_TIMEOUT_US);
+ if (ret)
+ for (i = 0; i < iommu->num_mmu; i++)
+ dev_err(iommu->dev, "Enable stall request timed out, status: %#08x\n",
+ rk_iommu_read(iommu->bases[i], RK_MMU_STATUS));
+
+ return ret;
+}
+
+static int rk_iommu_disable_stall(struct rk_iommu *iommu)
+{
+ int ret, i;
+ bool val;
+
+ if (!rk_iommu_is_stall_active(iommu))
+ return 0;
+
+ rk_iommu_command(iommu, RK_MMU_CMD_DISABLE_STALL);
+
+ ret = readx_poll_timeout(rk_iommu_is_stall_active, iommu, val,
+ !val, RK_MMU_POLL_PERIOD_US,
+ RK_MMU_POLL_TIMEOUT_US);
+ if (ret)
+ for (i = 0; i < iommu->num_mmu; i++)
+ dev_err(iommu->dev, "Disable stall request timed out, status: %#08x\n",
+ rk_iommu_read(iommu->bases[i], RK_MMU_STATUS));
+
+ return ret;
+}
+
+static int rk_iommu_enable_paging(struct rk_iommu *iommu)
+{
+ int ret, i;
+ bool val;
+
+ if (rk_iommu_is_paging_enabled(iommu))
+ return 0;
+
+ rk_iommu_command(iommu, RK_MMU_CMD_ENABLE_PAGING);
+
+ ret = readx_poll_timeout(rk_iommu_is_paging_enabled, iommu, val,
+ val, RK_MMU_POLL_PERIOD_US,
+ RK_MMU_POLL_TIMEOUT_US);
+ if (ret)
+ for (i = 0; i < iommu->num_mmu; i++)
+ dev_err(iommu->dev, "Enable paging request timed out, status: %#08x\n",
+ rk_iommu_read(iommu->bases[i], RK_MMU_STATUS));
+
+ return ret;
+}
+
+static int rk_iommu_disable_paging(struct rk_iommu *iommu)
+{
+ int ret, i;
+ bool val;
+
+ if (!rk_iommu_is_paging_enabled(iommu))
+ return 0;
+
+ rk_iommu_command(iommu, RK_MMU_CMD_DISABLE_PAGING);
+
+ ret = readx_poll_timeout(rk_iommu_is_paging_enabled, iommu, val,
+ !val, RK_MMU_POLL_PERIOD_US,
+ RK_MMU_POLL_TIMEOUT_US);
+ if (ret)
+ for (i = 0; i < iommu->num_mmu; i++)
+ dev_err(iommu->dev, "Disable paging request timed out, status: %#08x\n",
+ rk_iommu_read(iommu->bases[i], RK_MMU_STATUS));
+
+ return ret;
+}
+
+static int rk_iommu_force_reset(struct rk_iommu *iommu)
+{
+ int ret, i;
+ u32 dte_addr;
+ bool val;
+
+ if (iommu->reset_disabled)
+ return 0;
+
+ /*
+ * Check if register DTE_ADDR is working by writing DTE_ADDR_DUMMY
+ * and verifying that upper 5 nybbles are read back.
+ */
+ for (i = 0; i < iommu->num_mmu; i++) {
+ rk_iommu_write(iommu->bases[i], RK_MMU_DTE_ADDR, DTE_ADDR_DUMMY);
+
+ dte_addr = rk_iommu_read(iommu->bases[i], RK_MMU_DTE_ADDR);
+ if (dte_addr != (DTE_ADDR_DUMMY & RK_DTE_PT_ADDRESS_MASK)) {
+ dev_err(iommu->dev, "Error during raw reset. MMU_DTE_ADDR is not functioning\n");
+ return -EFAULT;
+ }
+ }
+
+ rk_iommu_command(iommu, RK_MMU_CMD_FORCE_RESET);
+
+ ret = readx_poll_timeout(rk_iommu_is_reset_done, iommu, val,
+ val, RK_MMU_FORCE_RESET_TIMEOUT_US,
+ RK_MMU_POLL_TIMEOUT_US);
+ if (ret) {
+ dev_err(iommu->dev, "FORCE_RESET command timed out\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+static void log_iova(struct rk_iommu *iommu, int index, dma_addr_t iova)
+{
+ void __iomem *base = iommu->bases[index];
+ u32 dte_index, pte_index, page_offset;
+ u32 mmu_dte_addr;
+ phys_addr_t mmu_dte_addr_phys, dte_addr_phys;
+ u32 *dte_addr;
+ u32 dte;
+ phys_addr_t pte_addr_phys = 0;
+ u32 *pte_addr = NULL;
+ u32 pte = 0;
+ phys_addr_t page_addr_phys = 0;
+ u32 page_flags = 0;
+
+ dte_index = rk_iova_dte_index(iova);
+ pte_index = rk_iova_pte_index(iova);
+ page_offset = rk_iova_page_offset(iova);
+
+ mmu_dte_addr = rk_iommu_read(base, RK_MMU_DTE_ADDR);
+ mmu_dte_addr_phys = (phys_addr_t)mmu_dte_addr;
+
+ dte_addr_phys = mmu_dte_addr_phys + (4 * dte_index);
+ dte_addr = phys_to_virt(dte_addr_phys);
+ dte = *dte_addr;
+
+ if (!rk_dte_is_pt_valid(dte))
+ goto print_it;
+
+ pte_addr_phys = rk_dte_pt_address(dte) + (pte_index * 4);
+ pte_addr = phys_to_virt(pte_addr_phys);
+ pte = *pte_addr;
+
+ if (!rk_pte_is_page_valid(pte))
+ goto print_it;
+
+ page_addr_phys = rk_pte_page_address(pte) + page_offset;
+ page_flags = pte & RK_PTE_PAGE_FLAGS_MASK;
+
+print_it:
+ dev_err(iommu->dev, "iova = %pad: dte_index: %#03x pte_index: %#03x page_offset: %#03x\n",
+ &iova, dte_index, pte_index, page_offset);
+ dev_err(iommu->dev, "mmu_dte_addr: %pa dte@%pa: %#08x valid: %u pte@%pa: %#08x valid: %u page@%pa flags: %#03x\n",
+ &mmu_dte_addr_phys, &dte_addr_phys, dte,
+ rk_dte_is_pt_valid(dte), &pte_addr_phys, pte,
+ rk_pte_is_page_valid(pte), &page_addr_phys, page_flags);
+}
+
+static irqreturn_t rk_iommu_irq(int irq, void *dev_id)
+{
+ struct rk_iommu *iommu = dev_id;
+ u32 status;
+ u32 int_status;
+ dma_addr_t iova;
+ irqreturn_t ret = IRQ_NONE;
+ int i, err;
+
+ err = pm_runtime_get_if_in_use(iommu->dev);
+ if (!err || WARN_ON_ONCE(err < 0))
+ return ret;
+
+ if (WARN_ON(clk_bulk_enable(iommu->num_clocks, iommu->clocks)))
+ goto out;
+
+ for (i = 0; i < iommu->num_mmu; i++) {
+ int_status = rk_iommu_read(iommu->bases[i], RK_MMU_INT_STATUS);
+ if (int_status == 0)
+ continue;
+
+ ret = IRQ_HANDLED;
+ iova = rk_iommu_read(iommu->bases[i], RK_MMU_PAGE_FAULT_ADDR);
+
+ if (int_status & RK_MMU_IRQ_PAGE_FAULT) {
+ int flags;
+
+ status = rk_iommu_read(iommu->bases[i], RK_MMU_STATUS);
+ flags = (status & RK_MMU_STATUS_PAGE_FAULT_IS_WRITE) ?
+ IOMMU_FAULT_WRITE : IOMMU_FAULT_READ;
+
+ dev_err(iommu->dev, "Page fault at %pad of type %s\n",
+ &iova,
+ (flags == IOMMU_FAULT_WRITE) ? "write" : "read");
+
+ log_iova(iommu, i, iova);
+
+ /*
+ * Report page fault to any installed handlers.
+ * Ignore the return code, though, since we always zap cache
+ * and clear the page fault anyway.
+ */
+ if (iommu->domain)
+ report_iommu_fault(iommu->domain, iommu->dev, iova,
+ flags);
+ else
+ dev_err(iommu->dev, "Page fault while iommu not attached to domain?\n");
+
+ rk_iommu_base_command(iommu->bases[i], RK_MMU_CMD_ZAP_CACHE);
+ rk_iommu_base_command(iommu->bases[i], RK_MMU_CMD_PAGE_FAULT_DONE);
+ }
+
+ if (int_status & RK_MMU_IRQ_BUS_ERROR)
+ dev_err(iommu->dev, "BUS_ERROR occurred at %pad\n", &iova);
+
+ if (int_status & ~RK_MMU_IRQ_MASK)
+ dev_err(iommu->dev, "unexpected int_status: %#08x\n",
+ int_status);
+
+ rk_iommu_write(iommu->bases[i], RK_MMU_INT_CLEAR, int_status);
+ }
+
+ clk_bulk_disable(iommu->num_clocks, iommu->clocks);
+
+out:
+ pm_runtime_put(iommu->dev);
+ return ret;
+}
+
+static phys_addr_t rk_iommu_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
+ unsigned long flags;
+ phys_addr_t pt_phys, phys = 0;
+ u32 dte, pte;
+ u32 *page_table;
+
+ spin_lock_irqsave(&rk_domain->dt_lock, flags);
+
+ dte = rk_domain->dt[rk_iova_dte_index(iova)];
+ if (!rk_dte_is_pt_valid(dte))
+ goto out;
+
+ pt_phys = rk_dte_pt_address(dte);
+ page_table = (u32 *)phys_to_virt(pt_phys);
+ pte = page_table[rk_iova_pte_index(iova)];
+ if (!rk_pte_is_page_valid(pte))
+ goto out;
+
+ phys = rk_pte_page_address(pte) + rk_iova_page_offset(iova);
+out:
+ spin_unlock_irqrestore(&rk_domain->dt_lock, flags);
+
+ return phys;
+}
+
+static void rk_iommu_zap_iova(struct rk_iommu_domain *rk_domain,
+ dma_addr_t iova, size_t size)
+{
+ struct list_head *pos;
+ unsigned long flags;
+
+ /* shootdown these iova from all iommus using this domain */
+ spin_lock_irqsave(&rk_domain->iommus_lock, flags);
+ list_for_each(pos, &rk_domain->iommus) {
+ struct rk_iommu *iommu;
+ int ret;
+
+ iommu = list_entry(pos, struct rk_iommu, node);
+
+ /* Only zap TLBs of IOMMUs that are powered on. */
+ ret = pm_runtime_get_if_in_use(iommu->dev);
+ if (WARN_ON_ONCE(ret < 0))
+ continue;
+ if (ret) {
+ WARN_ON(clk_bulk_enable(iommu->num_clocks,
+ iommu->clocks));
+ rk_iommu_zap_lines(iommu, iova, size);
+ clk_bulk_disable(iommu->num_clocks, iommu->clocks);
+ pm_runtime_put(iommu->dev);
+ }
+ }
+ spin_unlock_irqrestore(&rk_domain->iommus_lock, flags);
+}
+
+static void rk_iommu_zap_iova_first_last(struct rk_iommu_domain *rk_domain,
+ dma_addr_t iova, size_t size)
+{
+ rk_iommu_zap_iova(rk_domain, iova, SPAGE_SIZE);
+ if (size > SPAGE_SIZE)
+ rk_iommu_zap_iova(rk_domain, iova + size - SPAGE_SIZE,
+ SPAGE_SIZE);
+}
+
+static u32 *rk_dte_get_page_table(struct rk_iommu_domain *rk_domain,
+ dma_addr_t iova)
+{
+ u32 *page_table, *dte_addr;
+ u32 dte_index, dte;
+ phys_addr_t pt_phys;
+ dma_addr_t pt_dma;
+
+ assert_spin_locked(&rk_domain->dt_lock);
+
+ dte_index = rk_iova_dte_index(iova);
+ dte_addr = &rk_domain->dt[dte_index];
+ dte = *dte_addr;
+ if (rk_dte_is_pt_valid(dte))
+ goto done;
+
+ page_table = (u32 *)get_zeroed_page(GFP_ATOMIC | GFP_DMA32);
+ if (!page_table)
+ return ERR_PTR(-ENOMEM);
+
+ pt_dma = dma_map_single(dma_dev, page_table, SPAGE_SIZE, DMA_TO_DEVICE);
+ if (dma_mapping_error(dma_dev, pt_dma)) {
+ dev_err(dma_dev, "DMA mapping error while allocating page table\n");
+ free_page((unsigned long)page_table);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ dte = rk_mk_dte(pt_dma);
+ *dte_addr = dte;
+
+ rk_table_flush(rk_domain, pt_dma, NUM_PT_ENTRIES);
+ rk_table_flush(rk_domain,
+ rk_domain->dt_dma + dte_index * sizeof(u32), 1);
+done:
+ pt_phys = rk_dte_pt_address(dte);
+ return (u32 *)phys_to_virt(pt_phys);
+}
+
+static size_t rk_iommu_unmap_iova(struct rk_iommu_domain *rk_domain,
+ u32 *pte_addr, dma_addr_t pte_dma,
+ size_t size)
+{
+ unsigned int pte_count;
+ unsigned int pte_total = size / SPAGE_SIZE;
+
+ assert_spin_locked(&rk_domain->dt_lock);
+
+ for (pte_count = 0; pte_count < pte_total; pte_count++) {
+ u32 pte = pte_addr[pte_count];
+ if (!rk_pte_is_page_valid(pte))
+ break;
+
+ pte_addr[pte_count] = rk_mk_pte_invalid(pte);
+ }
+
+ rk_table_flush(rk_domain, pte_dma, pte_count);
+
+ return pte_count * SPAGE_SIZE;
+}
+
+static int rk_iommu_map_iova(struct rk_iommu_domain *rk_domain, u32 *pte_addr,
+ dma_addr_t pte_dma, dma_addr_t iova,
+ phys_addr_t paddr, size_t size, int prot)
+{
+ unsigned int pte_count;
+ unsigned int pte_total = size / SPAGE_SIZE;
+ phys_addr_t page_phys;
+
+ assert_spin_locked(&rk_domain->dt_lock);
+
+ for (pte_count = 0; pte_count < pte_total; pte_count++) {
+ u32 pte = pte_addr[pte_count];
+
+ if (rk_pte_is_page_valid(pte))
+ goto unwind;
+
+ pte_addr[pte_count] = rk_mk_pte(paddr, prot);
+
+ paddr += SPAGE_SIZE;
+ }
+
+ rk_table_flush(rk_domain, pte_dma, pte_total);
+
+ /*
+ * Zap the first and last iova to evict from iotlb any previously
+ * mapped cachelines holding stale values for its dte and pte.
+ * We only zap the first and last iova, since only they could have
+ * dte or pte shared with an existing mapping.
+ */
+ rk_iommu_zap_iova_first_last(rk_domain, iova, size);
+
+ return 0;
+unwind:
+ /* Unmap the range of iovas that we just mapped */
+ rk_iommu_unmap_iova(rk_domain, pte_addr, pte_dma,
+ pte_count * SPAGE_SIZE);
+
+ iova += pte_count * SPAGE_SIZE;
+ page_phys = rk_pte_page_address(pte_addr[pte_count]);
+ pr_err("iova: %pad already mapped to %pa cannot remap to phys: %pa prot: %#x\n",
+ &iova, &page_phys, &paddr, prot);
+
+ return -EADDRINUSE;
+}
+
+static int rk_iommu_map(struct iommu_domain *domain, unsigned long _iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+ struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
+ unsigned long flags;
+ dma_addr_t pte_dma, iova = (dma_addr_t)_iova;
+ u32 *page_table, *pte_addr;
+ u32 dte_index, pte_index;
+ int ret;
+
+ spin_lock_irqsave(&rk_domain->dt_lock, flags);
+
+ /*
+ * pgsize_bitmap specifies iova sizes that fit in one page table
+ * (1024 4-KiB pages = 4 MiB).
+ * So, size will always be 4096 <= size <= 4194304.
+ * Since iommu_map() guarantees that both iova and size will be
+ * aligned, we will always only be mapping from a single dte here.
+ */
+ page_table = rk_dte_get_page_table(rk_domain, iova);
+ if (IS_ERR(page_table)) {
+ spin_unlock_irqrestore(&rk_domain->dt_lock, flags);
+ return PTR_ERR(page_table);
+ }
+
+ dte_index = rk_domain->dt[rk_iova_dte_index(iova)];
+ pte_index = rk_iova_pte_index(iova);
+ pte_addr = &page_table[pte_index];
+ pte_dma = rk_dte_pt_address(dte_index) + pte_index * sizeof(u32);
+ ret = rk_iommu_map_iova(rk_domain, pte_addr, pte_dma, iova,
+ paddr, size, prot);
+
+ spin_unlock_irqrestore(&rk_domain->dt_lock, flags);
+
+ return ret;
+}
+
+static size_t rk_iommu_unmap(struct iommu_domain *domain, unsigned long _iova,
+ size_t size, struct iommu_iotlb_gather *gather)
+{
+ struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
+ unsigned long flags;
+ dma_addr_t pte_dma, iova = (dma_addr_t)_iova;
+ phys_addr_t pt_phys;
+ u32 dte;
+ u32 *pte_addr;
+ size_t unmap_size;
+
+ spin_lock_irqsave(&rk_domain->dt_lock, flags);
+
+ /*
+ * pgsize_bitmap specifies iova sizes that fit in one page table
+ * (1024 4-KiB pages = 4 MiB).
+ * So, size will always be 4096 <= size <= 4194304.
+ * Since iommu_unmap() guarantees that both iova and size will be
+ * aligned, we will always only be unmapping from a single dte here.
+ */
+ dte = rk_domain->dt[rk_iova_dte_index(iova)];
+ /* Just return 0 if iova is unmapped */
+ if (!rk_dte_is_pt_valid(dte)) {
+ spin_unlock_irqrestore(&rk_domain->dt_lock, flags);
+ return 0;
+ }
+
+ pt_phys = rk_dte_pt_address(dte);
+ pte_addr = (u32 *)phys_to_virt(pt_phys) + rk_iova_pte_index(iova);
+ pte_dma = pt_phys + rk_iova_pte_index(iova) * sizeof(u32);
+ unmap_size = rk_iommu_unmap_iova(rk_domain, pte_addr, pte_dma, size);
+
+ spin_unlock_irqrestore(&rk_domain->dt_lock, flags);
+
+ /* Shootdown iotlb entries for iova range that was just unmapped */
+ rk_iommu_zap_iova(rk_domain, iova, unmap_size);
+
+ return unmap_size;
+}
+
+static struct rk_iommu *rk_iommu_from_dev(struct device *dev)
+{
+ struct rk_iommudata *data = dev_iommu_priv_get(dev);
+
+ return data ? data->iommu : NULL;
+}
+
+/* Must be called with iommu powered on and attached */
+static void rk_iommu_disable(struct rk_iommu *iommu)
+{
+ int i;
+
+ /* Ignore error while disabling, just keep going */
+ WARN_ON(clk_bulk_enable(iommu->num_clocks, iommu->clocks));
+ rk_iommu_enable_stall(iommu);
+ rk_iommu_disable_paging(iommu);
+ for (i = 0; i < iommu->num_mmu; i++) {
+ rk_iommu_write(iommu->bases[i], RK_MMU_INT_MASK, 0);
+ rk_iommu_write(iommu->bases[i], RK_MMU_DTE_ADDR, 0);
+ }
+ rk_iommu_disable_stall(iommu);
+ clk_bulk_disable(iommu->num_clocks, iommu->clocks);
+}
+
+/* Must be called with iommu powered on and attached */
+static int rk_iommu_enable(struct rk_iommu *iommu)
+{
+ struct iommu_domain *domain = iommu->domain;
+ struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
+ int ret, i;
+
+ ret = clk_bulk_enable(iommu->num_clocks, iommu->clocks);
+ if (ret)
+ return ret;
+
+ ret = rk_iommu_enable_stall(iommu);
+ if (ret)
+ goto out_disable_clocks;
+
+ ret = rk_iommu_force_reset(iommu);
+ if (ret)
+ goto out_disable_stall;
+
+ for (i = 0; i < iommu->num_mmu; i++) {
+ rk_iommu_write(iommu->bases[i], RK_MMU_DTE_ADDR,
+ rk_domain->dt_dma);
+ rk_iommu_base_command(iommu->bases[i], RK_MMU_CMD_ZAP_CACHE);
+ rk_iommu_write(iommu->bases[i], RK_MMU_INT_MASK, RK_MMU_IRQ_MASK);
+ }
+
+ ret = rk_iommu_enable_paging(iommu);
+
+out_disable_stall:
+ rk_iommu_disable_stall(iommu);
+out_disable_clocks:
+ clk_bulk_disable(iommu->num_clocks, iommu->clocks);
+ return ret;
+}
+
+static void rk_iommu_detach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct rk_iommu *iommu;
+ struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
+ unsigned long flags;
+ int ret;
+
+ /* Allow 'virtual devices' (eg drm) to detach from domain */
+ iommu = rk_iommu_from_dev(dev);
+ if (!iommu)
+ return;
+
+ dev_dbg(dev, "Detaching from iommu domain\n");
+
+ /* iommu already detached */
+ if (iommu->domain != domain)
+ return;
+
+ iommu->domain = NULL;
+
+ spin_lock_irqsave(&rk_domain->iommus_lock, flags);
+ list_del_init(&iommu->node);
+ spin_unlock_irqrestore(&rk_domain->iommus_lock, flags);
+
+ ret = pm_runtime_get_if_in_use(iommu->dev);
+ WARN_ON_ONCE(ret < 0);
+ if (ret > 0) {
+ rk_iommu_disable(iommu);
+ pm_runtime_put(iommu->dev);
+ }
+}
+
+static int rk_iommu_attach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct rk_iommu *iommu;
+ struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
+ unsigned long flags;
+ int ret;
+
+ /*
+ * Allow 'virtual devices' (e.g., drm) to attach to domain.
+ * Such a device does not belong to an iommu group.
+ */
+ iommu = rk_iommu_from_dev(dev);
+ if (!iommu)
+ return 0;
+
+ dev_dbg(dev, "Attaching to iommu domain\n");
+
+ /* iommu already attached */
+ if (iommu->domain == domain)
+ return 0;
+
+ if (iommu->domain)
+ rk_iommu_detach_device(iommu->domain, dev);
+
+ iommu->domain = domain;
+
+ spin_lock_irqsave(&rk_domain->iommus_lock, flags);
+ list_add_tail(&iommu->node, &rk_domain->iommus);
+ spin_unlock_irqrestore(&rk_domain->iommus_lock, flags);
+
+ ret = pm_runtime_get_if_in_use(iommu->dev);
+ if (!ret || WARN_ON_ONCE(ret < 0))
+ return 0;
+
+ ret = rk_iommu_enable(iommu);
+ if (ret)
+ rk_iommu_detach_device(iommu->domain, dev);
+
+ pm_runtime_put(iommu->dev);
+
+ return ret;
+}
+
+static struct iommu_domain *rk_iommu_domain_alloc(unsigned type)
+{
+ struct rk_iommu_domain *rk_domain;
+
+ if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA)
+ return NULL;
+
+ if (!dma_dev)
+ return NULL;
+
+ rk_domain = kzalloc(sizeof(*rk_domain), GFP_KERNEL);
+ if (!rk_domain)
+ return NULL;
+
+ if (type == IOMMU_DOMAIN_DMA &&
+ iommu_get_dma_cookie(&rk_domain->domain))
+ goto err_free_domain;
+
+ /*
+ * rk32xx iommus use a 2 level pagetable.
+ * Each level1 (dt) and level2 (pt) table has 1024 4-byte entries.
+ * Allocate one 4 KiB page for each table.
+ */
+ rk_domain->dt = (u32 *)get_zeroed_page(GFP_KERNEL | GFP_DMA32);
+ if (!rk_domain->dt)
+ goto err_put_cookie;
+
+ rk_domain->dt_dma = dma_map_single(dma_dev, rk_domain->dt,
+ SPAGE_SIZE, DMA_TO_DEVICE);
+ if (dma_mapping_error(dma_dev, rk_domain->dt_dma)) {
+ dev_err(dma_dev, "DMA map error for DT\n");
+ goto err_free_dt;
+ }
+
+ rk_table_flush(rk_domain, rk_domain->dt_dma, NUM_DT_ENTRIES);
+
+ spin_lock_init(&rk_domain->iommus_lock);
+ spin_lock_init(&rk_domain->dt_lock);
+ INIT_LIST_HEAD(&rk_domain->iommus);
+
+ rk_domain->domain.geometry.aperture_start = 0;
+ rk_domain->domain.geometry.aperture_end = DMA_BIT_MASK(32);
+ rk_domain->domain.geometry.force_aperture = true;
+
+ return &rk_domain->domain;
+
+err_free_dt:
+ free_page((unsigned long)rk_domain->dt);
+err_put_cookie:
+ if (type == IOMMU_DOMAIN_DMA)
+ iommu_put_dma_cookie(&rk_domain->domain);
+err_free_domain:
+ kfree(rk_domain);
+
+ return NULL;
+}
+
+static void rk_iommu_domain_free(struct iommu_domain *domain)
+{
+ struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
+ int i;
+
+ WARN_ON(!list_empty(&rk_domain->iommus));
+
+ for (i = 0; i < NUM_DT_ENTRIES; i++) {
+ u32 dte = rk_domain->dt[i];
+ if (rk_dte_is_pt_valid(dte)) {
+ phys_addr_t pt_phys = rk_dte_pt_address(dte);
+ u32 *page_table = phys_to_virt(pt_phys);
+ dma_unmap_single(dma_dev, pt_phys,
+ SPAGE_SIZE, DMA_TO_DEVICE);
+ free_page((unsigned long)page_table);
+ }
+ }
+
+ dma_unmap_single(dma_dev, rk_domain->dt_dma,
+ SPAGE_SIZE, DMA_TO_DEVICE);
+ free_page((unsigned long)rk_domain->dt);
+
+ if (domain->type == IOMMU_DOMAIN_DMA)
+ iommu_put_dma_cookie(&rk_domain->domain);
+ kfree(rk_domain);
+}
+
+static struct iommu_device *rk_iommu_probe_device(struct device *dev)
+{
+ struct rk_iommudata *data;
+ struct rk_iommu *iommu;
+
+ data = dev_iommu_priv_get(dev);
+ if (!data)
+ return ERR_PTR(-ENODEV);
+
+ iommu = rk_iommu_from_dev(dev);
+
+ data->link = device_link_add(dev, iommu->dev,
+ DL_FLAG_STATELESS | DL_FLAG_PM_RUNTIME);
+
+ return &iommu->iommu;
+}
+
+static void rk_iommu_release_device(struct device *dev)
+{
+ struct rk_iommudata *data = dev_iommu_priv_get(dev);
+
+ device_link_del(data->link);
+}
+
+static struct iommu_group *rk_iommu_device_group(struct device *dev)
+{
+ struct rk_iommu *iommu;
+
+ iommu = rk_iommu_from_dev(dev);
+
+ return iommu_group_ref_get(iommu->group);
+}
+
+static int rk_iommu_of_xlate(struct device *dev,
+ struct of_phandle_args *args)
+{
+ struct platform_device *iommu_dev;
+ struct rk_iommudata *data;
+
+ data = devm_kzalloc(dma_dev, sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ iommu_dev = of_find_device_by_node(args->np);
+
+ data->iommu = platform_get_drvdata(iommu_dev);
+ dev_iommu_priv_set(dev, data);
+
+ platform_device_put(iommu_dev);
+
+ return 0;
+}
+
+static const struct iommu_ops rk_iommu_ops = {
+ .domain_alloc = rk_iommu_domain_alloc,
+ .domain_free = rk_iommu_domain_free,
+ .attach_dev = rk_iommu_attach_device,
+ .detach_dev = rk_iommu_detach_device,
+ .map = rk_iommu_map,
+ .unmap = rk_iommu_unmap,
+ .probe_device = rk_iommu_probe_device,
+ .release_device = rk_iommu_release_device,
+ .iova_to_phys = rk_iommu_iova_to_phys,
+ .device_group = rk_iommu_device_group,
+ .pgsize_bitmap = RK_IOMMU_PGSIZE_BITMAP,
+ .of_xlate = rk_iommu_of_xlate,
+};
+
+static int rk_iommu_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct rk_iommu *iommu;
+ struct resource *res;
+ int num_res = pdev->num_resources;
+ int err, i;
+
+ iommu = devm_kzalloc(dev, sizeof(*iommu), GFP_KERNEL);
+ if (!iommu)
+ return -ENOMEM;
+
+ platform_set_drvdata(pdev, iommu);
+ iommu->dev = dev;
+ iommu->num_mmu = 0;
+
+ iommu->bases = devm_kcalloc(dev, num_res, sizeof(*iommu->bases),
+ GFP_KERNEL);
+ if (!iommu->bases)
+ return -ENOMEM;
+
+ for (i = 0; i < num_res; i++) {
+ res = platform_get_resource(pdev, IORESOURCE_MEM, i);
+ if (!res)
+ continue;
+ iommu->bases[i] = devm_ioremap_resource(&pdev->dev, res);
+ if (IS_ERR(iommu->bases[i]))
+ continue;
+ iommu->num_mmu++;
+ }
+ if (iommu->num_mmu == 0)
+ return PTR_ERR(iommu->bases[0]);
+
+ iommu->num_irq = platform_irq_count(pdev);
+ if (iommu->num_irq < 0)
+ return iommu->num_irq;
+
+ iommu->reset_disabled = device_property_read_bool(dev,
+ "rockchip,disable-mmu-reset");
+
+ iommu->num_clocks = ARRAY_SIZE(rk_iommu_clocks);
+ iommu->clocks = devm_kcalloc(iommu->dev, iommu->num_clocks,
+ sizeof(*iommu->clocks), GFP_KERNEL);
+ if (!iommu->clocks)
+ return -ENOMEM;
+
+ for (i = 0; i < iommu->num_clocks; ++i)
+ iommu->clocks[i].id = rk_iommu_clocks[i];
+
+ /*
+ * iommu clocks should be present for all new devices and devicetrees
+ * but there are older devicetrees without clocks out in the wild.
+ * So clocks as optional for the time being.
+ */
+ err = devm_clk_bulk_get(iommu->dev, iommu->num_clocks, iommu->clocks);
+ if (err == -ENOENT)
+ iommu->num_clocks = 0;
+ else if (err)
+ return err;
+
+ err = clk_bulk_prepare(iommu->num_clocks, iommu->clocks);
+ if (err)
+ return err;
+
+ iommu->group = iommu_group_alloc();
+ if (IS_ERR(iommu->group)) {
+ err = PTR_ERR(iommu->group);
+ goto err_unprepare_clocks;
+ }
+
+ err = iommu_device_sysfs_add(&iommu->iommu, dev, NULL, dev_name(dev));
+ if (err)
+ goto err_put_group;
+
+ iommu_device_set_ops(&iommu->iommu, &rk_iommu_ops);
+ iommu_device_set_fwnode(&iommu->iommu, &dev->of_node->fwnode);
+
+ err = iommu_device_register(&iommu->iommu);
+ if (err)
+ goto err_remove_sysfs;
+
+ /*
+ * Use the first registered IOMMU device for domain to use with DMA
+ * API, since a domain might not physically correspond to a single
+ * IOMMU device..
+ */
+ if (!dma_dev)
+ dma_dev = &pdev->dev;
+
+ bus_set_iommu(&platform_bus_type, &rk_iommu_ops);
+
+ pm_runtime_enable(dev);
+
+ for (i = 0; i < iommu->num_irq; i++) {
+ int irq = platform_get_irq(pdev, i);
+
+ if (irq < 0) {
+ err = irq;
+ goto err_pm_disable;
+ }
+
+ err = devm_request_irq(iommu->dev, irq, rk_iommu_irq,
+ IRQF_SHARED, dev_name(dev), iommu);
+ if (err)
+ goto err_pm_disable;
+ }
+
+ return 0;
+err_pm_disable:
+ pm_runtime_disable(dev);
+err_remove_sysfs:
+ iommu_device_sysfs_remove(&iommu->iommu);
+err_put_group:
+ iommu_group_put(iommu->group);
+err_unprepare_clocks:
+ clk_bulk_unprepare(iommu->num_clocks, iommu->clocks);
+ return err;
+}
+
+static void rk_iommu_shutdown(struct platform_device *pdev)
+{
+ struct rk_iommu *iommu = platform_get_drvdata(pdev);
+ int i;
+
+ for (i = 0; i < iommu->num_irq; i++) {
+ int irq = platform_get_irq(pdev, i);
+
+ devm_free_irq(iommu->dev, irq, iommu);
+ }
+
+ pm_runtime_force_suspend(&pdev->dev);
+}
+
+static int __maybe_unused rk_iommu_suspend(struct device *dev)
+{
+ struct rk_iommu *iommu = dev_get_drvdata(dev);
+
+ if (!iommu->domain)
+ return 0;
+
+ rk_iommu_disable(iommu);
+ return 0;
+}
+
+static int __maybe_unused rk_iommu_resume(struct device *dev)
+{
+ struct rk_iommu *iommu = dev_get_drvdata(dev);
+
+ if (!iommu->domain)
+ return 0;
+
+ return rk_iommu_enable(iommu);
+}
+
+static const struct dev_pm_ops rk_iommu_pm_ops = {
+ SET_RUNTIME_PM_OPS(rk_iommu_suspend, rk_iommu_resume, NULL)
+ SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend,
+ pm_runtime_force_resume)
+};
+
+static const struct of_device_id rk_iommu_dt_ids[] = {
+ { .compatible = "rockchip,iommu" },
+ { /* sentinel */ }
+};
+
+static struct platform_driver rk_iommu_driver = {
+ .probe = rk_iommu_probe,
+ .shutdown = rk_iommu_shutdown,
+ .driver = {
+ .name = "rk_iommu",
+ .of_match_table = rk_iommu_dt_ids,
+ .pm = &rk_iommu_pm_ops,
+ .suppress_bind_attrs = true,
+ },
+};
+
+static int __init rk_iommu_init(void)
+{
+ return platform_driver_register(&rk_iommu_driver);
+}
+subsys_initcall(rk_iommu_init);
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
new file mode 100644
index 000000000..8895dbb70
--- /dev/null
+++ b/drivers/iommu/s390-iommu.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * IOMMU API for s390 PCI devices
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com>
+ */
+
+#include <linux/pci.h>
+#include <linux/iommu.h>
+#include <linux/iommu-helper.h>
+#include <linux/sizes.h>
+#include <asm/pci_dma.h>
+
+/*
+ * Physically contiguous memory regions can be mapped with 4 KiB alignment,
+ * we allow all page sizes that are an order of 4KiB (no special large page
+ * support so far).
+ */
+#define S390_IOMMU_PGSIZES (~0xFFFUL)
+
+static const struct iommu_ops s390_iommu_ops;
+
+struct s390_domain {
+ struct iommu_domain domain;
+ struct list_head devices;
+ unsigned long *dma_table;
+ spinlock_t dma_table_lock;
+ spinlock_t list_lock;
+};
+
+struct s390_domain_device {
+ struct list_head list;
+ struct zpci_dev *zdev;
+};
+
+static struct s390_domain *to_s390_domain(struct iommu_domain *dom)
+{
+ return container_of(dom, struct s390_domain, domain);
+}
+
+static bool s390_iommu_capable(enum iommu_cap cap)
+{
+ switch (cap) {
+ case IOMMU_CAP_CACHE_COHERENCY:
+ return true;
+ case IOMMU_CAP_INTR_REMAP:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static struct iommu_domain *s390_domain_alloc(unsigned domain_type)
+{
+ struct s390_domain *s390_domain;
+
+ if (domain_type != IOMMU_DOMAIN_UNMANAGED)
+ return NULL;
+
+ s390_domain = kzalloc(sizeof(*s390_domain), GFP_KERNEL);
+ if (!s390_domain)
+ return NULL;
+
+ s390_domain->dma_table = dma_alloc_cpu_table();
+ if (!s390_domain->dma_table) {
+ kfree(s390_domain);
+ return NULL;
+ }
+
+ spin_lock_init(&s390_domain->dma_table_lock);
+ spin_lock_init(&s390_domain->list_lock);
+ INIT_LIST_HEAD(&s390_domain->devices);
+
+ return &s390_domain->domain;
+}
+
+static void s390_domain_free(struct iommu_domain *domain)
+{
+ struct s390_domain *s390_domain = to_s390_domain(domain);
+
+ dma_cleanup_tables(s390_domain->dma_table);
+ kfree(s390_domain);
+}
+
+static int s390_iommu_attach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct s390_domain *s390_domain = to_s390_domain(domain);
+ struct zpci_dev *zdev = to_zpci_dev(dev);
+ struct s390_domain_device *domain_device;
+ unsigned long flags;
+ int rc;
+
+ if (!zdev)
+ return -ENODEV;
+
+ domain_device = kzalloc(sizeof(*domain_device), GFP_KERNEL);
+ if (!domain_device)
+ return -ENOMEM;
+
+ if (zdev->dma_table)
+ zpci_dma_exit_device(zdev);
+
+ zdev->dma_table = s390_domain->dma_table;
+ rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
+ (u64) zdev->dma_table);
+ if (rc)
+ goto out_restore;
+
+ spin_lock_irqsave(&s390_domain->list_lock, flags);
+ /* First device defines the DMA range limits */
+ if (list_empty(&s390_domain->devices)) {
+ domain->geometry.aperture_start = zdev->start_dma;
+ domain->geometry.aperture_end = zdev->end_dma;
+ domain->geometry.force_aperture = true;
+ /* Allow only devices with identical DMA range limits */
+ } else if (domain->geometry.aperture_start != zdev->start_dma ||
+ domain->geometry.aperture_end != zdev->end_dma) {
+ rc = -EINVAL;
+ spin_unlock_irqrestore(&s390_domain->list_lock, flags);
+ goto out_restore;
+ }
+ domain_device->zdev = zdev;
+ zdev->s390_domain = s390_domain;
+ list_add(&domain_device->list, &s390_domain->devices);
+ spin_unlock_irqrestore(&s390_domain->list_lock, flags);
+
+ return 0;
+
+out_restore:
+ zpci_dma_init_device(zdev);
+ kfree(domain_device);
+
+ return rc;
+}
+
+static void s390_iommu_detach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct s390_domain *s390_domain = to_s390_domain(domain);
+ struct zpci_dev *zdev = to_zpci_dev(dev);
+ struct s390_domain_device *domain_device, *tmp;
+ unsigned long flags;
+ int found = 0;
+
+ if (!zdev)
+ return;
+
+ spin_lock_irqsave(&s390_domain->list_lock, flags);
+ list_for_each_entry_safe(domain_device, tmp, &s390_domain->devices,
+ list) {
+ if (domain_device->zdev == zdev) {
+ list_del(&domain_device->list);
+ kfree(domain_device);
+ found = 1;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&s390_domain->list_lock, flags);
+
+ if (found) {
+ zdev->s390_domain = NULL;
+ zpci_unregister_ioat(zdev, 0);
+ zpci_dma_init_device(zdev);
+ }
+}
+
+static struct iommu_device *s390_iommu_probe_device(struct device *dev)
+{
+ struct zpci_dev *zdev = to_zpci_dev(dev);
+
+ return &zdev->iommu_dev;
+}
+
+static void s390_iommu_release_device(struct device *dev)
+{
+ struct zpci_dev *zdev = to_zpci_dev(dev);
+ struct iommu_domain *domain;
+
+ /*
+ * This is a workaround for a scenario where the IOMMU API common code
+ * "forgets" to call the detach_dev callback: After binding a device
+ * to vfio-pci and completing the VFIO_SET_IOMMU ioctl (which triggers
+ * the attach_dev), removing the device via
+ * "echo 1 > /sys/bus/pci/devices/.../remove" won't trigger detach_dev,
+ * only release_device will be called via the BUS_NOTIFY_REMOVED_DEVICE
+ * notifier.
+ *
+ * So let's call detach_dev from here if it hasn't been called before.
+ */
+ if (zdev && zdev->s390_domain) {
+ domain = iommu_get_domain_for_dev(dev);
+ if (domain)
+ s390_iommu_detach_device(domain, dev);
+ }
+}
+
+static int s390_iommu_update_trans(struct s390_domain *s390_domain,
+ unsigned long pa, dma_addr_t dma_addr,
+ size_t size, int flags)
+{
+ struct s390_domain_device *domain_device;
+ u8 *page_addr = (u8 *) (pa & PAGE_MASK);
+ dma_addr_t start_dma_addr = dma_addr;
+ unsigned long irq_flags, nr_pages, i;
+ unsigned long *entry;
+ int rc = 0;
+
+ if (dma_addr < s390_domain->domain.geometry.aperture_start ||
+ dma_addr + size > s390_domain->domain.geometry.aperture_end)
+ return -EINVAL;
+
+ nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ if (!nr_pages)
+ return 0;
+
+ spin_lock_irqsave(&s390_domain->dma_table_lock, irq_flags);
+ for (i = 0; i < nr_pages; i++) {
+ entry = dma_walk_cpu_trans(s390_domain->dma_table, dma_addr);
+ if (!entry) {
+ rc = -ENOMEM;
+ goto undo_cpu_trans;
+ }
+ dma_update_cpu_trans(entry, page_addr, flags);
+ page_addr += PAGE_SIZE;
+ dma_addr += PAGE_SIZE;
+ }
+
+ spin_lock(&s390_domain->list_lock);
+ list_for_each_entry(domain_device, &s390_domain->devices, list) {
+ rc = zpci_refresh_trans((u64) domain_device->zdev->fh << 32,
+ start_dma_addr, nr_pages * PAGE_SIZE);
+ if (rc)
+ break;
+ }
+ spin_unlock(&s390_domain->list_lock);
+
+undo_cpu_trans:
+ if (rc && ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID)) {
+ flags = ZPCI_PTE_INVALID;
+ while (i-- > 0) {
+ page_addr -= PAGE_SIZE;
+ dma_addr -= PAGE_SIZE;
+ entry = dma_walk_cpu_trans(s390_domain->dma_table,
+ dma_addr);
+ if (!entry)
+ break;
+ dma_update_cpu_trans(entry, page_addr, flags);
+ }
+ }
+ spin_unlock_irqrestore(&s390_domain->dma_table_lock, irq_flags);
+
+ return rc;
+}
+
+static int s390_iommu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+ struct s390_domain *s390_domain = to_s390_domain(domain);
+ int flags = ZPCI_PTE_VALID, rc = 0;
+
+ if (!(prot & IOMMU_READ))
+ return -EINVAL;
+
+ if (!(prot & IOMMU_WRITE))
+ flags |= ZPCI_TABLE_PROTECTED;
+
+ rc = s390_iommu_update_trans(s390_domain, (unsigned long) paddr, iova,
+ size, flags);
+
+ return rc;
+}
+
+static phys_addr_t s390_iommu_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct s390_domain *s390_domain = to_s390_domain(domain);
+ unsigned long *sto, *pto, *rto, flags;
+ unsigned int rtx, sx, px;
+ phys_addr_t phys = 0;
+
+ if (iova < domain->geometry.aperture_start ||
+ iova > domain->geometry.aperture_end)
+ return 0;
+
+ rtx = calc_rtx(iova);
+ sx = calc_sx(iova);
+ px = calc_px(iova);
+ rto = s390_domain->dma_table;
+
+ spin_lock_irqsave(&s390_domain->dma_table_lock, flags);
+ if (rto && reg_entry_isvalid(rto[rtx])) {
+ sto = get_rt_sto(rto[rtx]);
+ if (sto && reg_entry_isvalid(sto[sx])) {
+ pto = get_st_pto(sto[sx]);
+ if (pto && pt_entry_isvalid(pto[px]))
+ phys = pto[px] & ZPCI_PTE_ADDR_MASK;
+ }
+ }
+ spin_unlock_irqrestore(&s390_domain->dma_table_lock, flags);
+
+ return phys;
+}
+
+static size_t s390_iommu_unmap(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ struct iommu_iotlb_gather *gather)
+{
+ struct s390_domain *s390_domain = to_s390_domain(domain);
+ int flags = ZPCI_PTE_INVALID;
+ phys_addr_t paddr;
+ int rc;
+
+ paddr = s390_iommu_iova_to_phys(domain, iova);
+ if (!paddr)
+ return 0;
+
+ rc = s390_iommu_update_trans(s390_domain, (unsigned long) paddr, iova,
+ size, flags);
+ if (rc)
+ return 0;
+
+ return size;
+}
+
+int zpci_init_iommu(struct zpci_dev *zdev)
+{
+ int rc = 0;
+
+ rc = iommu_device_sysfs_add(&zdev->iommu_dev, NULL, NULL,
+ "s390-iommu.%08x", zdev->fid);
+ if (rc)
+ goto out_err;
+
+ iommu_device_set_ops(&zdev->iommu_dev, &s390_iommu_ops);
+
+ rc = iommu_device_register(&zdev->iommu_dev);
+ if (rc)
+ goto out_sysfs;
+
+ return 0;
+
+out_sysfs:
+ iommu_device_sysfs_remove(&zdev->iommu_dev);
+
+out_err:
+ return rc;
+}
+
+void zpci_destroy_iommu(struct zpci_dev *zdev)
+{
+ iommu_device_unregister(&zdev->iommu_dev);
+ iommu_device_sysfs_remove(&zdev->iommu_dev);
+}
+
+static const struct iommu_ops s390_iommu_ops = {
+ .capable = s390_iommu_capable,
+ .domain_alloc = s390_domain_alloc,
+ .domain_free = s390_domain_free,
+ .attach_dev = s390_iommu_attach_device,
+ .detach_dev = s390_iommu_detach_device,
+ .map = s390_iommu_map,
+ .unmap = s390_iommu_unmap,
+ .iova_to_phys = s390_iommu_iova_to_phys,
+ .probe_device = s390_iommu_probe_device,
+ .release_device = s390_iommu_release_device,
+ .device_group = generic_device_group,
+ .pgsize_bitmap = S390_IOMMU_PGSIZES,
+};
+
+static int __init s390_iommu_init(void)
+{
+ return bus_set_iommu(&pci_bus_type, &s390_iommu_ops);
+}
+subsys_initcall(s390_iommu_init);
diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c
new file mode 100644
index 000000000..65aa30d55
--- /dev/null
+++ b/drivers/iommu/sun50i-iommu.c
@@ -0,0 +1,1024 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (C) 2016-2018, Allwinner Technology CO., LTD.
+// Copyright (C) 2019-2020, Cerno
+
+#include <linux/bitfield.h>
+#include <linux/bug.h>
+#include <linux/clk.h>
+#include <linux/device.h>
+#include <linux/dma-direction.h>
+#include <linux/dma-iommu.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/ioport.h>
+#include <linux/log2.h>
+#include <linux/module.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/pm_runtime.h>
+#include <linux/reset.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#define IOMMU_RESET_REG 0x010
+#define IOMMU_RESET_RELEASE_ALL 0xffffffff
+#define IOMMU_ENABLE_REG 0x020
+#define IOMMU_ENABLE_ENABLE BIT(0)
+
+#define IOMMU_BYPASS_REG 0x030
+#define IOMMU_AUTO_GATING_REG 0x040
+#define IOMMU_AUTO_GATING_ENABLE BIT(0)
+
+#define IOMMU_WBUF_CTRL_REG 0x044
+#define IOMMU_OOO_CTRL_REG 0x048
+#define IOMMU_4KB_BDY_PRT_CTRL_REG 0x04c
+#define IOMMU_TTB_REG 0x050
+#define IOMMU_TLB_ENABLE_REG 0x060
+#define IOMMU_TLB_PREFETCH_REG 0x070
+#define IOMMU_TLB_PREFETCH_MASTER_ENABLE(m) BIT(m)
+
+#define IOMMU_TLB_FLUSH_REG 0x080
+#define IOMMU_TLB_FLUSH_PTW_CACHE BIT(17)
+#define IOMMU_TLB_FLUSH_MACRO_TLB BIT(16)
+#define IOMMU_TLB_FLUSH_MICRO_TLB(i) (BIT(i) & GENMASK(5, 0))
+
+#define IOMMU_TLB_IVLD_ADDR_REG 0x090
+#define IOMMU_TLB_IVLD_ADDR_MASK_REG 0x094
+#define IOMMU_TLB_IVLD_ENABLE_REG 0x098
+#define IOMMU_TLB_IVLD_ENABLE_ENABLE BIT(0)
+
+#define IOMMU_PC_IVLD_ADDR_REG 0x0a0
+#define IOMMU_PC_IVLD_ENABLE_REG 0x0a8
+#define IOMMU_PC_IVLD_ENABLE_ENABLE BIT(0)
+
+#define IOMMU_DM_AUT_CTRL_REG(d) (0x0b0 + ((d) / 2) * 4)
+#define IOMMU_DM_AUT_CTRL_RD_UNAVAIL(d, m) (1 << (((d & 1) * 16) + ((m) * 2)))
+#define IOMMU_DM_AUT_CTRL_WR_UNAVAIL(d, m) (1 << (((d & 1) * 16) + ((m) * 2) + 1))
+
+#define IOMMU_DM_AUT_OVWT_REG 0x0d0
+#define IOMMU_INT_ENABLE_REG 0x100
+#define IOMMU_INT_CLR_REG 0x104
+#define IOMMU_INT_STA_REG 0x108
+#define IOMMU_INT_ERR_ADDR_REG(i) (0x110 + (i) * 4)
+#define IOMMU_INT_ERR_ADDR_L1_REG 0x130
+#define IOMMU_INT_ERR_ADDR_L2_REG 0x134
+#define IOMMU_INT_ERR_DATA_REG(i) (0x150 + (i) * 4)
+#define IOMMU_L1PG_INT_REG 0x0180
+#define IOMMU_L2PG_INT_REG 0x0184
+
+#define IOMMU_INT_INVALID_L2PG BIT(17)
+#define IOMMU_INT_INVALID_L1PG BIT(16)
+#define IOMMU_INT_MASTER_PERMISSION(m) BIT(m)
+#define IOMMU_INT_MASTER_MASK (IOMMU_INT_MASTER_PERMISSION(0) | \
+ IOMMU_INT_MASTER_PERMISSION(1) | \
+ IOMMU_INT_MASTER_PERMISSION(2) | \
+ IOMMU_INT_MASTER_PERMISSION(3) | \
+ IOMMU_INT_MASTER_PERMISSION(4) | \
+ IOMMU_INT_MASTER_PERMISSION(5))
+#define IOMMU_INT_MASK (IOMMU_INT_INVALID_L1PG | \
+ IOMMU_INT_INVALID_L2PG | \
+ IOMMU_INT_MASTER_MASK)
+
+#define PT_ENTRY_SIZE sizeof(u32)
+
+#define NUM_DT_ENTRIES 4096
+#define DT_SIZE (NUM_DT_ENTRIES * PT_ENTRY_SIZE)
+
+#define NUM_PT_ENTRIES 256
+#define PT_SIZE (NUM_PT_ENTRIES * PT_ENTRY_SIZE)
+
+struct sun50i_iommu {
+ struct iommu_device iommu;
+
+ /* Lock to modify the IOMMU registers */
+ spinlock_t iommu_lock;
+
+ struct device *dev;
+ void __iomem *base;
+ struct reset_control *reset;
+ struct clk *clk;
+
+ struct iommu_domain *domain;
+ struct iommu_group *group;
+ struct kmem_cache *pt_pool;
+};
+
+struct sun50i_iommu_domain {
+ struct iommu_domain domain;
+
+ /* Number of devices attached to the domain */
+ refcount_t refcnt;
+
+ /* L1 Page Table */
+ u32 *dt;
+ dma_addr_t dt_dma;
+
+ struct sun50i_iommu *iommu;
+};
+
+static struct sun50i_iommu_domain *to_sun50i_domain(struct iommu_domain *domain)
+{
+ return container_of(domain, struct sun50i_iommu_domain, domain);
+}
+
+static struct sun50i_iommu *sun50i_iommu_from_dev(struct device *dev)
+{
+ return dev_iommu_priv_get(dev);
+}
+
+static u32 iommu_read(struct sun50i_iommu *iommu, u32 offset)
+{
+ return readl(iommu->base + offset);
+}
+
+static void iommu_write(struct sun50i_iommu *iommu, u32 offset, u32 value)
+{
+ writel(value, iommu->base + offset);
+}
+
+/*
+ * The Allwinner H6 IOMMU uses a 2-level page table.
+ *
+ * The first level is the usual Directory Table (DT), that consists of
+ * 4096 4-bytes Directory Table Entries (DTE), each pointing to a Page
+ * Table (PT).
+ *
+ * Each PT consits of 256 4-bytes Page Table Entries (PTE), each
+ * pointing to a 4kB page of physical memory.
+ *
+ * The IOMMU supports a single DT, pointed by the IOMMU_TTB_REG
+ * register that contains its physical address.
+ */
+
+#define SUN50I_IOVA_DTE_MASK GENMASK(31, 20)
+#define SUN50I_IOVA_PTE_MASK GENMASK(19, 12)
+#define SUN50I_IOVA_PAGE_MASK GENMASK(11, 0)
+
+static u32 sun50i_iova_get_dte_index(dma_addr_t iova)
+{
+ return FIELD_GET(SUN50I_IOVA_DTE_MASK, iova);
+}
+
+static u32 sun50i_iova_get_pte_index(dma_addr_t iova)
+{
+ return FIELD_GET(SUN50I_IOVA_PTE_MASK, iova);
+}
+
+static u32 sun50i_iova_get_page_offset(dma_addr_t iova)
+{
+ return FIELD_GET(SUN50I_IOVA_PAGE_MASK, iova);
+}
+
+/*
+ * Each Directory Table Entry has a Page Table address and a valid
+ * bit:
+
+ * +---------------------+-----------+-+
+ * | PT address | Reserved |V|
+ * +---------------------+-----------+-+
+ * 31:10 - Page Table address
+ * 9:2 - Reserved
+ * 1:0 - 1 if the entry is valid
+ */
+
+#define SUN50I_DTE_PT_ADDRESS_MASK GENMASK(31, 10)
+#define SUN50I_DTE_PT_ATTRS GENMASK(1, 0)
+#define SUN50I_DTE_PT_VALID 1
+
+static phys_addr_t sun50i_dte_get_pt_address(u32 dte)
+{
+ return (phys_addr_t)dte & SUN50I_DTE_PT_ADDRESS_MASK;
+}
+
+static bool sun50i_dte_is_pt_valid(u32 dte)
+{
+ return (dte & SUN50I_DTE_PT_ATTRS) == SUN50I_DTE_PT_VALID;
+}
+
+static u32 sun50i_mk_dte(dma_addr_t pt_dma)
+{
+ return (pt_dma & SUN50I_DTE_PT_ADDRESS_MASK) | SUN50I_DTE_PT_VALID;
+}
+
+/*
+ * Each PTE has a Page address, an authority index and a valid bit:
+ *
+ * +----------------+-----+-----+-----+---+-----+
+ * | Page address | Rsv | ACI | Rsv | V | Rsv |
+ * +----------------+-----+-----+-----+---+-----+
+ * 31:12 - Page address
+ * 11:8 - Reserved
+ * 7:4 - Authority Control Index
+ * 3:2 - Reserved
+ * 1 - 1 if the entry is valid
+ * 0 - Reserved
+ *
+ * The way permissions work is that the IOMMU has 16 "domains" that
+ * can be configured to give each masters either read or write
+ * permissions through the IOMMU_DM_AUT_CTRL_REG registers. The domain
+ * 0 seems like the default domain, and its permissions in the
+ * IOMMU_DM_AUT_CTRL_REG are only read-only, so it's not really
+ * useful to enforce any particular permission.
+ *
+ * Each page entry will then have a reference to the domain they are
+ * affected to, so that we can actually enforce them on a per-page
+ * basis.
+ *
+ * In order to make it work with the IOMMU framework, we will be using
+ * 4 different domains, starting at 1: RD_WR, RD, WR and NONE
+ * depending on the permission we want to enforce. Each domain will
+ * have each master setup in the same way, since the IOMMU framework
+ * doesn't seem to restrict page access on a per-device basis. And
+ * then we will use the relevant domain index when generating the page
+ * table entry depending on the permissions we want to be enforced.
+ */
+
+enum sun50i_iommu_aci {
+ SUN50I_IOMMU_ACI_DO_NOT_USE = 0,
+ SUN50I_IOMMU_ACI_NONE,
+ SUN50I_IOMMU_ACI_RD,
+ SUN50I_IOMMU_ACI_WR,
+ SUN50I_IOMMU_ACI_RD_WR,
+};
+
+#define SUN50I_PTE_PAGE_ADDRESS_MASK GENMASK(31, 12)
+#define SUN50I_PTE_ACI_MASK GENMASK(7, 4)
+#define SUN50I_PTE_PAGE_VALID BIT(1)
+
+static phys_addr_t sun50i_pte_get_page_address(u32 pte)
+{
+ return (phys_addr_t)pte & SUN50I_PTE_PAGE_ADDRESS_MASK;
+}
+
+static enum sun50i_iommu_aci sun50i_get_pte_aci(u32 pte)
+{
+ return FIELD_GET(SUN50I_PTE_ACI_MASK, pte);
+}
+
+static bool sun50i_pte_is_page_valid(u32 pte)
+{
+ return pte & SUN50I_PTE_PAGE_VALID;
+}
+
+static u32 sun50i_mk_pte(phys_addr_t page, int prot)
+{
+ enum sun50i_iommu_aci aci;
+ u32 flags = 0;
+
+ if ((prot & (IOMMU_READ | IOMMU_WRITE)) == (IOMMU_READ | IOMMU_WRITE))
+ aci = SUN50I_IOMMU_ACI_RD_WR;
+ else if (prot & IOMMU_READ)
+ aci = SUN50I_IOMMU_ACI_RD;
+ else if (prot & IOMMU_WRITE)
+ aci = SUN50I_IOMMU_ACI_WR;
+ else
+ aci = SUN50I_IOMMU_ACI_NONE;
+
+ flags |= FIELD_PREP(SUN50I_PTE_ACI_MASK, aci);
+ page &= SUN50I_PTE_PAGE_ADDRESS_MASK;
+ return page | flags | SUN50I_PTE_PAGE_VALID;
+}
+
+static void sun50i_table_flush(struct sun50i_iommu_domain *sun50i_domain,
+ void *vaddr, unsigned int count)
+{
+ struct sun50i_iommu *iommu = sun50i_domain->iommu;
+ dma_addr_t dma = virt_to_phys(vaddr);
+ size_t size = count * PT_ENTRY_SIZE;
+
+ dma_sync_single_for_device(iommu->dev, dma, size, DMA_TO_DEVICE);
+}
+
+static int sun50i_iommu_flush_all_tlb(struct sun50i_iommu *iommu)
+{
+ u32 reg;
+ int ret;
+
+ assert_spin_locked(&iommu->iommu_lock);
+
+ iommu_write(iommu,
+ IOMMU_TLB_FLUSH_REG,
+ IOMMU_TLB_FLUSH_PTW_CACHE |
+ IOMMU_TLB_FLUSH_MACRO_TLB |
+ IOMMU_TLB_FLUSH_MICRO_TLB(5) |
+ IOMMU_TLB_FLUSH_MICRO_TLB(4) |
+ IOMMU_TLB_FLUSH_MICRO_TLB(3) |
+ IOMMU_TLB_FLUSH_MICRO_TLB(2) |
+ IOMMU_TLB_FLUSH_MICRO_TLB(1) |
+ IOMMU_TLB_FLUSH_MICRO_TLB(0));
+
+ ret = readl_poll_timeout_atomic(iommu->base + IOMMU_TLB_FLUSH_REG,
+ reg, !reg,
+ 1, 2000);
+ if (ret)
+ dev_warn(iommu->dev, "TLB Flush timed out!\n");
+
+ return ret;
+}
+
+static void sun50i_iommu_flush_iotlb_all(struct iommu_domain *domain)
+{
+ struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
+ struct sun50i_iommu *iommu = sun50i_domain->iommu;
+ unsigned long flags;
+
+ /*
+ * At boot, we'll have a first call into .flush_iotlb_all right after
+ * .probe_device, and since we link our (single) domain to our iommu in
+ * the .attach_device callback, we don't have that pointer set.
+ *
+ * It shouldn't really be any trouble to ignore it though since we flush
+ * all caches as part of the device powerup.
+ */
+ if (!iommu)
+ return;
+
+ spin_lock_irqsave(&iommu->iommu_lock, flags);
+ sun50i_iommu_flush_all_tlb(iommu);
+ spin_unlock_irqrestore(&iommu->iommu_lock, flags);
+}
+
+static void sun50i_iommu_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
+{
+ sun50i_iommu_flush_iotlb_all(domain);
+}
+
+static int sun50i_iommu_enable(struct sun50i_iommu *iommu)
+{
+ struct sun50i_iommu_domain *sun50i_domain;
+ unsigned long flags;
+ int ret;
+
+ if (!iommu->domain)
+ return 0;
+
+ sun50i_domain = to_sun50i_domain(iommu->domain);
+
+ ret = reset_control_deassert(iommu->reset);
+ if (ret)
+ return ret;
+
+ ret = clk_prepare_enable(iommu->clk);
+ if (ret)
+ goto err_reset_assert;
+
+ spin_lock_irqsave(&iommu->iommu_lock, flags);
+
+ iommu_write(iommu, IOMMU_TTB_REG, sun50i_domain->dt_dma);
+ iommu_write(iommu, IOMMU_TLB_PREFETCH_REG,
+ IOMMU_TLB_PREFETCH_MASTER_ENABLE(0) |
+ IOMMU_TLB_PREFETCH_MASTER_ENABLE(1) |
+ IOMMU_TLB_PREFETCH_MASTER_ENABLE(2) |
+ IOMMU_TLB_PREFETCH_MASTER_ENABLE(3) |
+ IOMMU_TLB_PREFETCH_MASTER_ENABLE(4) |
+ IOMMU_TLB_PREFETCH_MASTER_ENABLE(5));
+ iommu_write(iommu, IOMMU_INT_ENABLE_REG, IOMMU_INT_MASK);
+ iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(SUN50I_IOMMU_ACI_NONE),
+ IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 0) |
+ IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 0) |
+ IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 1) |
+ IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 1) |
+ IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 2) |
+ IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 2) |
+ IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 3) |
+ IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 3) |
+ IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 4) |
+ IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 4) |
+ IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 5) |
+ IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 5));
+
+ iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(SUN50I_IOMMU_ACI_RD),
+ IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_RD, 0) |
+ IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_RD, 1) |
+ IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_RD, 2) |
+ IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_RD, 3) |
+ IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_RD, 4) |
+ IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_RD, 5));
+
+ iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(SUN50I_IOMMU_ACI_WR),
+ IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_WR, 0) |
+ IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_WR, 1) |
+ IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_WR, 2) |
+ IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_WR, 3) |
+ IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_WR, 4) |
+ IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_WR, 5));
+
+ ret = sun50i_iommu_flush_all_tlb(iommu);
+ if (ret) {
+ spin_unlock_irqrestore(&iommu->iommu_lock, flags);
+ goto err_clk_disable;
+ }
+
+ iommu_write(iommu, IOMMU_AUTO_GATING_REG, IOMMU_AUTO_GATING_ENABLE);
+ iommu_write(iommu, IOMMU_ENABLE_REG, IOMMU_ENABLE_ENABLE);
+
+ spin_unlock_irqrestore(&iommu->iommu_lock, flags);
+
+ return 0;
+
+err_clk_disable:
+ clk_disable_unprepare(iommu->clk);
+
+err_reset_assert:
+ reset_control_assert(iommu->reset);
+
+ return ret;
+}
+
+static void sun50i_iommu_disable(struct sun50i_iommu *iommu)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu->iommu_lock, flags);
+
+ iommu_write(iommu, IOMMU_ENABLE_REG, 0);
+ iommu_write(iommu, IOMMU_TTB_REG, 0);
+
+ spin_unlock_irqrestore(&iommu->iommu_lock, flags);
+
+ clk_disable_unprepare(iommu->clk);
+ reset_control_assert(iommu->reset);
+}
+
+static void *sun50i_iommu_alloc_page_table(struct sun50i_iommu *iommu,
+ gfp_t gfp)
+{
+ dma_addr_t pt_dma;
+ u32 *page_table;
+
+ page_table = kmem_cache_zalloc(iommu->pt_pool, gfp);
+ if (!page_table)
+ return ERR_PTR(-ENOMEM);
+
+ pt_dma = dma_map_single(iommu->dev, page_table, PT_SIZE, DMA_TO_DEVICE);
+ if (dma_mapping_error(iommu->dev, pt_dma)) {
+ dev_err(iommu->dev, "Couldn't map L2 Page Table\n");
+ kmem_cache_free(iommu->pt_pool, page_table);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* We rely on the physical address and DMA address being the same */
+ WARN_ON(pt_dma != virt_to_phys(page_table));
+
+ return page_table;
+}
+
+static void sun50i_iommu_free_page_table(struct sun50i_iommu *iommu,
+ u32 *page_table)
+{
+ phys_addr_t pt_phys = virt_to_phys(page_table);
+
+ dma_unmap_single(iommu->dev, pt_phys, PT_SIZE, DMA_TO_DEVICE);
+ kmem_cache_free(iommu->pt_pool, page_table);
+}
+
+static u32 *sun50i_dte_get_page_table(struct sun50i_iommu_domain *sun50i_domain,
+ dma_addr_t iova, gfp_t gfp)
+{
+ struct sun50i_iommu *iommu = sun50i_domain->iommu;
+ u32 *page_table;
+ u32 *dte_addr;
+ u32 old_dte;
+ u32 dte;
+
+ dte_addr = &sun50i_domain->dt[sun50i_iova_get_dte_index(iova)];
+ dte = *dte_addr;
+ if (sun50i_dte_is_pt_valid(dte)) {
+ phys_addr_t pt_phys = sun50i_dte_get_pt_address(dte);
+ return (u32 *)phys_to_virt(pt_phys);
+ }
+
+ page_table = sun50i_iommu_alloc_page_table(iommu, gfp);
+ if (IS_ERR(page_table))
+ return page_table;
+
+ dte = sun50i_mk_dte(virt_to_phys(page_table));
+ old_dte = cmpxchg(dte_addr, 0, dte);
+ if (old_dte) {
+ phys_addr_t installed_pt_phys =
+ sun50i_dte_get_pt_address(old_dte);
+ u32 *installed_pt = phys_to_virt(installed_pt_phys);
+ u32 *drop_pt = page_table;
+
+ page_table = installed_pt;
+ dte = old_dte;
+ sun50i_iommu_free_page_table(iommu, drop_pt);
+ }
+
+ sun50i_table_flush(sun50i_domain, page_table, NUM_PT_ENTRIES);
+ sun50i_table_flush(sun50i_domain, dte_addr, 1);
+
+ return page_table;
+}
+
+static int sun50i_iommu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+ struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
+ struct sun50i_iommu *iommu = sun50i_domain->iommu;
+ u32 pte_index;
+ u32 *page_table, *pte_addr;
+ int ret = 0;
+
+ page_table = sun50i_dte_get_page_table(sun50i_domain, iova, gfp);
+ if (IS_ERR(page_table)) {
+ ret = PTR_ERR(page_table);
+ goto out;
+ }
+
+ pte_index = sun50i_iova_get_pte_index(iova);
+ pte_addr = &page_table[pte_index];
+ if (unlikely(sun50i_pte_is_page_valid(*pte_addr))) {
+ phys_addr_t page_phys = sun50i_pte_get_page_address(*pte_addr);
+ dev_err(iommu->dev,
+ "iova %pad already mapped to %pa cannot remap to %pa prot: %#x\n",
+ &iova, &page_phys, &paddr, prot);
+ ret = -EBUSY;
+ goto out;
+ }
+
+ *pte_addr = sun50i_mk_pte(paddr, prot);
+ sun50i_table_flush(sun50i_domain, pte_addr, 1);
+
+out:
+ return ret;
+}
+
+static size_t sun50i_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
+ size_t size, struct iommu_iotlb_gather *gather)
+{
+ struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
+ phys_addr_t pt_phys;
+ u32 *pte_addr;
+ u32 dte;
+
+ dte = sun50i_domain->dt[sun50i_iova_get_dte_index(iova)];
+ if (!sun50i_dte_is_pt_valid(dte))
+ return 0;
+
+ pt_phys = sun50i_dte_get_pt_address(dte);
+ pte_addr = (u32 *)phys_to_virt(pt_phys) + sun50i_iova_get_pte_index(iova);
+
+ if (!sun50i_pte_is_page_valid(*pte_addr))
+ return 0;
+
+ memset(pte_addr, 0, sizeof(*pte_addr));
+ sun50i_table_flush(sun50i_domain, pte_addr, 1);
+
+ return SZ_4K;
+}
+
+static phys_addr_t sun50i_iommu_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
+ phys_addr_t pt_phys;
+ u32 *page_table;
+ u32 dte, pte;
+
+ dte = sun50i_domain->dt[sun50i_iova_get_dte_index(iova)];
+ if (!sun50i_dte_is_pt_valid(dte))
+ return 0;
+
+ pt_phys = sun50i_dte_get_pt_address(dte);
+ page_table = (u32 *)phys_to_virt(pt_phys);
+ pte = page_table[sun50i_iova_get_pte_index(iova)];
+ if (!sun50i_pte_is_page_valid(pte))
+ return 0;
+
+ return sun50i_pte_get_page_address(pte) +
+ sun50i_iova_get_page_offset(iova);
+}
+
+static struct iommu_domain *sun50i_iommu_domain_alloc(unsigned type)
+{
+ struct sun50i_iommu_domain *sun50i_domain;
+
+ if (type != IOMMU_DOMAIN_DMA &&
+ type != IOMMU_DOMAIN_UNMANAGED)
+ return NULL;
+
+ sun50i_domain = kzalloc(sizeof(*sun50i_domain), GFP_KERNEL);
+ if (!sun50i_domain)
+ return NULL;
+
+ if (type == IOMMU_DOMAIN_DMA &&
+ iommu_get_dma_cookie(&sun50i_domain->domain))
+ goto err_free_domain;
+
+ sun50i_domain->dt = (u32 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(DT_SIZE));
+ if (!sun50i_domain->dt)
+ goto err_put_cookie;
+
+ refcount_set(&sun50i_domain->refcnt, 1);
+
+ sun50i_domain->domain.geometry.aperture_start = 0;
+ sun50i_domain->domain.geometry.aperture_end = DMA_BIT_MASK(32);
+ sun50i_domain->domain.geometry.force_aperture = true;
+
+ return &sun50i_domain->domain;
+
+err_put_cookie:
+ if (type == IOMMU_DOMAIN_DMA)
+ iommu_put_dma_cookie(&sun50i_domain->domain);
+
+err_free_domain:
+ kfree(sun50i_domain);
+
+ return NULL;
+}
+
+static void sun50i_iommu_domain_free(struct iommu_domain *domain)
+{
+ struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
+
+ free_pages((unsigned long)sun50i_domain->dt, get_order(DT_SIZE));
+ sun50i_domain->dt = NULL;
+
+ iommu_put_dma_cookie(domain);
+
+ kfree(sun50i_domain);
+}
+
+static int sun50i_iommu_attach_domain(struct sun50i_iommu *iommu,
+ struct sun50i_iommu_domain *sun50i_domain)
+{
+ iommu->domain = &sun50i_domain->domain;
+ sun50i_domain->iommu = iommu;
+
+ sun50i_domain->dt_dma = dma_map_single(iommu->dev, sun50i_domain->dt,
+ DT_SIZE, DMA_TO_DEVICE);
+ if (dma_mapping_error(iommu->dev, sun50i_domain->dt_dma)) {
+ dev_err(iommu->dev, "Couldn't map L1 Page Table\n");
+ return -ENOMEM;
+ }
+
+ return sun50i_iommu_enable(iommu);
+}
+
+static void sun50i_iommu_detach_domain(struct sun50i_iommu *iommu,
+ struct sun50i_iommu_domain *sun50i_domain)
+{
+ unsigned int i;
+
+ for (i = 0; i < NUM_DT_ENTRIES; i++) {
+ phys_addr_t pt_phys;
+ u32 *page_table;
+ u32 *dte_addr;
+ u32 dte;
+
+ dte_addr = &sun50i_domain->dt[i];
+ dte = *dte_addr;
+ if (!sun50i_dte_is_pt_valid(dte))
+ continue;
+
+ memset(dte_addr, 0, sizeof(*dte_addr));
+ sun50i_table_flush(sun50i_domain, dte_addr, 1);
+
+ pt_phys = sun50i_dte_get_pt_address(dte);
+ page_table = phys_to_virt(pt_phys);
+ sun50i_iommu_free_page_table(iommu, page_table);
+ }
+
+
+ sun50i_iommu_disable(iommu);
+
+ dma_unmap_single(iommu->dev, virt_to_phys(sun50i_domain->dt),
+ DT_SIZE, DMA_TO_DEVICE);
+
+ iommu->domain = NULL;
+}
+
+static void sun50i_iommu_detach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
+ struct sun50i_iommu *iommu = dev_iommu_priv_get(dev);
+
+ dev_dbg(dev, "Detaching from IOMMU domain\n");
+
+ if (iommu->domain != domain)
+ return;
+
+ if (refcount_dec_and_test(&sun50i_domain->refcnt))
+ sun50i_iommu_detach_domain(iommu, sun50i_domain);
+}
+
+static int sun50i_iommu_attach_device(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
+ struct sun50i_iommu *iommu;
+
+ iommu = sun50i_iommu_from_dev(dev);
+ if (!iommu)
+ return -ENODEV;
+
+ dev_dbg(dev, "Attaching to IOMMU domain\n");
+
+ refcount_inc(&sun50i_domain->refcnt);
+
+ if (iommu->domain == domain)
+ return 0;
+
+ if (iommu->domain)
+ sun50i_iommu_detach_device(iommu->domain, dev);
+
+ sun50i_iommu_attach_domain(iommu, sun50i_domain);
+
+ return 0;
+}
+
+static struct iommu_device *sun50i_iommu_probe_device(struct device *dev)
+{
+ struct sun50i_iommu *iommu;
+
+ iommu = sun50i_iommu_from_dev(dev);
+ if (!iommu)
+ return ERR_PTR(-ENODEV);
+
+ return &iommu->iommu;
+}
+
+static void sun50i_iommu_release_device(struct device *dev) {}
+
+static struct iommu_group *sun50i_iommu_device_group(struct device *dev)
+{
+ struct sun50i_iommu *iommu = sun50i_iommu_from_dev(dev);
+
+ return iommu_group_ref_get(iommu->group);
+}
+
+static int sun50i_iommu_of_xlate(struct device *dev,
+ struct of_phandle_args *args)
+{
+ struct platform_device *iommu_pdev = of_find_device_by_node(args->np);
+ unsigned id = args->args[0];
+
+ dev_iommu_priv_set(dev, platform_get_drvdata(iommu_pdev));
+
+ return iommu_fwspec_add_ids(dev, &id, 1);
+}
+
+static const struct iommu_ops sun50i_iommu_ops = {
+ .pgsize_bitmap = SZ_4K,
+ .attach_dev = sun50i_iommu_attach_device,
+ .detach_dev = sun50i_iommu_detach_device,
+ .device_group = sun50i_iommu_device_group,
+ .domain_alloc = sun50i_iommu_domain_alloc,
+ .domain_free = sun50i_iommu_domain_free,
+ .flush_iotlb_all = sun50i_iommu_flush_iotlb_all,
+ .iotlb_sync = sun50i_iommu_iotlb_sync,
+ .iova_to_phys = sun50i_iommu_iova_to_phys,
+ .map = sun50i_iommu_map,
+ .of_xlate = sun50i_iommu_of_xlate,
+ .probe_device = sun50i_iommu_probe_device,
+ .release_device = sun50i_iommu_release_device,
+ .unmap = sun50i_iommu_unmap,
+};
+
+static void sun50i_iommu_report_fault(struct sun50i_iommu *iommu,
+ unsigned master, phys_addr_t iova,
+ unsigned prot)
+{
+ dev_err(iommu->dev, "Page fault for %pad (master %d, dir %s)\n",
+ &iova, master, (prot == IOMMU_FAULT_WRITE) ? "wr" : "rd");
+
+ if (iommu->domain)
+ report_iommu_fault(iommu->domain, iommu->dev, iova, prot);
+ else
+ dev_err(iommu->dev, "Page fault while iommu not attached to any domain?\n");
+}
+
+static phys_addr_t sun50i_iommu_handle_pt_irq(struct sun50i_iommu *iommu,
+ unsigned addr_reg,
+ unsigned blame_reg)
+{
+ phys_addr_t iova;
+ unsigned master;
+ u32 blame;
+
+ assert_spin_locked(&iommu->iommu_lock);
+
+ iova = iommu_read(iommu, addr_reg);
+ blame = iommu_read(iommu, blame_reg);
+ master = ilog2(blame & IOMMU_INT_MASTER_MASK);
+
+ /*
+ * If the address is not in the page table, we can't get what
+ * operation triggered the fault. Assume it's a read
+ * operation.
+ */
+ sun50i_iommu_report_fault(iommu, master, iova, IOMMU_FAULT_READ);
+
+ return iova;
+}
+
+static phys_addr_t sun50i_iommu_handle_perm_irq(struct sun50i_iommu *iommu)
+{
+ enum sun50i_iommu_aci aci;
+ phys_addr_t iova;
+ unsigned master;
+ unsigned dir;
+ u32 blame;
+
+ assert_spin_locked(&iommu->iommu_lock);
+
+ blame = iommu_read(iommu, IOMMU_INT_STA_REG);
+ master = ilog2(blame & IOMMU_INT_MASTER_MASK);
+ iova = iommu_read(iommu, IOMMU_INT_ERR_ADDR_REG(master));
+ aci = sun50i_get_pte_aci(iommu_read(iommu,
+ IOMMU_INT_ERR_DATA_REG(master)));
+
+ switch (aci) {
+ /*
+ * If we are in the read-only domain, then it means we
+ * tried to write.
+ */
+ case SUN50I_IOMMU_ACI_RD:
+ dir = IOMMU_FAULT_WRITE;
+ break;
+
+ /*
+ * If we are in the write-only domain, then it means
+ * we tried to read.
+ */
+ case SUN50I_IOMMU_ACI_WR:
+
+ /*
+ * If we are in the domain without any permission, we
+ * can't really tell. Let's default to a read
+ * operation.
+ */
+ case SUN50I_IOMMU_ACI_NONE:
+
+ /* WTF? */
+ case SUN50I_IOMMU_ACI_RD_WR:
+ default:
+ dir = IOMMU_FAULT_READ;
+ break;
+ }
+
+ /*
+ * If the address is not in the page table, we can't get what
+ * operation triggered the fault. Assume it's a read
+ * operation.
+ */
+ sun50i_iommu_report_fault(iommu, master, iova, dir);
+
+ return iova;
+}
+
+static irqreturn_t sun50i_iommu_irq(int irq, void *dev_id)
+{
+ u32 status, l1_status, l2_status, resets;
+ struct sun50i_iommu *iommu = dev_id;
+
+ spin_lock(&iommu->iommu_lock);
+
+ status = iommu_read(iommu, IOMMU_INT_STA_REG);
+ if (!(status & IOMMU_INT_MASK)) {
+ spin_unlock(&iommu->iommu_lock);
+ return IRQ_NONE;
+ }
+
+ l1_status = iommu_read(iommu, IOMMU_L1PG_INT_REG);
+ l2_status = iommu_read(iommu, IOMMU_L2PG_INT_REG);
+
+ if (status & IOMMU_INT_INVALID_L2PG)
+ sun50i_iommu_handle_pt_irq(iommu,
+ IOMMU_INT_ERR_ADDR_L2_REG,
+ IOMMU_L2PG_INT_REG);
+ else if (status & IOMMU_INT_INVALID_L1PG)
+ sun50i_iommu_handle_pt_irq(iommu,
+ IOMMU_INT_ERR_ADDR_L1_REG,
+ IOMMU_L1PG_INT_REG);
+ else
+ sun50i_iommu_handle_perm_irq(iommu);
+
+ iommu_write(iommu, IOMMU_INT_CLR_REG, status);
+
+ resets = (status | l1_status | l2_status) & IOMMU_INT_MASTER_MASK;
+ iommu_write(iommu, IOMMU_RESET_REG, ~resets);
+ iommu_write(iommu, IOMMU_RESET_REG, IOMMU_RESET_RELEASE_ALL);
+
+ spin_unlock(&iommu->iommu_lock);
+
+ return IRQ_HANDLED;
+}
+
+static int sun50i_iommu_probe(struct platform_device *pdev)
+{
+ struct sun50i_iommu *iommu;
+ int ret, irq;
+
+ iommu = devm_kzalloc(&pdev->dev, sizeof(*iommu), GFP_KERNEL);
+ if (!iommu)
+ return -ENOMEM;
+ spin_lock_init(&iommu->iommu_lock);
+ platform_set_drvdata(pdev, iommu);
+ iommu->dev = &pdev->dev;
+
+ iommu->pt_pool = kmem_cache_create(dev_name(&pdev->dev),
+ PT_SIZE, PT_SIZE,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!iommu->pt_pool)
+ return -ENOMEM;
+
+ iommu->group = iommu_group_alloc();
+ if (IS_ERR(iommu->group)) {
+ ret = PTR_ERR(iommu->group);
+ goto err_free_cache;
+ }
+
+ iommu->base = devm_platform_ioremap_resource(pdev, 0);
+ if (IS_ERR(iommu->base)) {
+ ret = PTR_ERR(iommu->base);
+ goto err_free_group;
+ }
+
+ irq = platform_get_irq(pdev, 0);
+ if (irq < 0) {
+ ret = irq;
+ goto err_free_group;
+ }
+
+ iommu->clk = devm_clk_get(&pdev->dev, NULL);
+ if (IS_ERR(iommu->clk)) {
+ dev_err(&pdev->dev, "Couldn't get our clock.\n");
+ ret = PTR_ERR(iommu->clk);
+ goto err_free_group;
+ }
+
+ iommu->reset = devm_reset_control_get(&pdev->dev, NULL);
+ if (IS_ERR(iommu->reset)) {
+ dev_err(&pdev->dev, "Couldn't get our reset line.\n");
+ ret = PTR_ERR(iommu->reset);
+ goto err_free_group;
+ }
+
+ ret = iommu_device_sysfs_add(&iommu->iommu, &pdev->dev,
+ NULL, dev_name(&pdev->dev));
+ if (ret)
+ goto err_free_group;
+
+ iommu_device_set_ops(&iommu->iommu, &sun50i_iommu_ops);
+ iommu_device_set_fwnode(&iommu->iommu, &pdev->dev.of_node->fwnode);
+
+ ret = iommu_device_register(&iommu->iommu);
+ if (ret)
+ goto err_remove_sysfs;
+
+ ret = devm_request_irq(&pdev->dev, irq, sun50i_iommu_irq, 0,
+ dev_name(&pdev->dev), iommu);
+ if (ret < 0)
+ goto err_unregister;
+
+ bus_set_iommu(&platform_bus_type, &sun50i_iommu_ops);
+
+ return 0;
+
+err_unregister:
+ iommu_device_unregister(&iommu->iommu);
+
+err_remove_sysfs:
+ iommu_device_sysfs_remove(&iommu->iommu);
+
+err_free_group:
+ iommu_group_put(iommu->group);
+
+err_free_cache:
+ kmem_cache_destroy(iommu->pt_pool);
+
+ return ret;
+}
+
+static const struct of_device_id sun50i_iommu_dt[] = {
+ { .compatible = "allwinner,sun50i-h6-iommu", },
+ { /* sentinel */ },
+};
+MODULE_DEVICE_TABLE(of, sun50i_iommu_dt);
+
+static struct platform_driver sun50i_iommu_driver = {
+ .driver = {
+ .name = "sun50i-iommu",
+ .of_match_table = sun50i_iommu_dt,
+ .suppress_bind_attrs = true,
+ }
+};
+builtin_platform_driver_probe(sun50i_iommu_driver, sun50i_iommu_probe);
+
+MODULE_DESCRIPTION("Allwinner H6 IOMMU driver");
+MODULE_AUTHOR("Maxime Ripard <maxime@cerno.tech>");
+MODULE_AUTHOR("zhuxianbin <zhuxianbin@allwinnertech.com>");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
new file mode 100644
index 000000000..fac720273
--- /dev/null
+++ b/drivers/iommu/tegra-gart.c
@@ -0,0 +1,380 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IOMMU API for Graphics Address Relocation Table on Tegra20
+ *
+ * Copyright (c) 2010-2012, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Author: Hiroshi DOYU <hdoyu@nvidia.com>
+ */
+
+#define dev_fmt(fmt) "gart: " fmt
+
+#include <linux/io.h>
+#include <linux/iommu.h>
+#include <linux/moduleparam.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+
+#include <soc/tegra/mc.h>
+
+#define GART_REG_BASE 0x24
+#define GART_CONFIG (0x24 - GART_REG_BASE)
+#define GART_ENTRY_ADDR (0x28 - GART_REG_BASE)
+#define GART_ENTRY_DATA (0x2c - GART_REG_BASE)
+
+#define GART_ENTRY_PHYS_ADDR_VALID BIT(31)
+
+#define GART_PAGE_SHIFT 12
+#define GART_PAGE_SIZE (1 << GART_PAGE_SHIFT)
+#define GART_PAGE_MASK GENMASK(30, GART_PAGE_SHIFT)
+
+/* bitmap of the page sizes currently supported */
+#define GART_IOMMU_PGSIZES (GART_PAGE_SIZE)
+
+struct gart_device {
+ void __iomem *regs;
+ u32 *savedata;
+ unsigned long iovmm_base; /* offset to vmm_area start */
+ unsigned long iovmm_end; /* offset to vmm_area end */
+ spinlock_t pte_lock; /* for pagetable */
+ spinlock_t dom_lock; /* for active domain */
+ unsigned int active_devices; /* number of active devices */
+ struct iommu_domain *active_domain; /* current active domain */
+ struct iommu_device iommu; /* IOMMU Core handle */
+ struct device *dev;
+};
+
+static struct gart_device *gart_handle; /* unique for a system */
+
+static bool gart_debug;
+
+/*
+ * Any interaction between any block on PPSB and a block on APB or AHB
+ * must have these read-back to ensure the APB/AHB bus transaction is
+ * complete before initiating activity on the PPSB block.
+ */
+#define FLUSH_GART_REGS(gart) readl_relaxed((gart)->regs + GART_CONFIG)
+
+#define for_each_gart_pte(gart, iova) \
+ for (iova = gart->iovmm_base; \
+ iova < gart->iovmm_end; \
+ iova += GART_PAGE_SIZE)
+
+static inline void gart_set_pte(struct gart_device *gart,
+ unsigned long iova, unsigned long pte)
+{
+ writel_relaxed(iova, gart->regs + GART_ENTRY_ADDR);
+ writel_relaxed(pte, gart->regs + GART_ENTRY_DATA);
+}
+
+static inline unsigned long gart_read_pte(struct gart_device *gart,
+ unsigned long iova)
+{
+ unsigned long pte;
+
+ writel_relaxed(iova, gart->regs + GART_ENTRY_ADDR);
+ pte = readl_relaxed(gart->regs + GART_ENTRY_DATA);
+
+ return pte;
+}
+
+static void do_gart_setup(struct gart_device *gart, const u32 *data)
+{
+ unsigned long iova;
+
+ for_each_gart_pte(gart, iova)
+ gart_set_pte(gart, iova, data ? *(data++) : 0);
+
+ writel_relaxed(1, gart->regs + GART_CONFIG);
+ FLUSH_GART_REGS(gart);
+}
+
+static inline bool gart_iova_range_invalid(struct gart_device *gart,
+ unsigned long iova, size_t bytes)
+{
+ return unlikely(iova < gart->iovmm_base || bytes != GART_PAGE_SIZE ||
+ iova + bytes > gart->iovmm_end);
+}
+
+static inline bool gart_pte_valid(struct gart_device *gart, unsigned long iova)
+{
+ return !!(gart_read_pte(gart, iova) & GART_ENTRY_PHYS_ADDR_VALID);
+}
+
+static int gart_iommu_attach_dev(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct gart_device *gart = gart_handle;
+ int ret = 0;
+
+ spin_lock(&gart->dom_lock);
+
+ if (gart->active_domain && gart->active_domain != domain) {
+ ret = -EBUSY;
+ } else if (dev_iommu_priv_get(dev) != domain) {
+ dev_iommu_priv_set(dev, domain);
+ gart->active_domain = domain;
+ gart->active_devices++;
+ }
+
+ spin_unlock(&gart->dom_lock);
+
+ return ret;
+}
+
+static void gart_iommu_detach_dev(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct gart_device *gart = gart_handle;
+
+ spin_lock(&gart->dom_lock);
+
+ if (dev_iommu_priv_get(dev) == domain) {
+ dev_iommu_priv_set(dev, NULL);
+
+ if (--gart->active_devices == 0)
+ gart->active_domain = NULL;
+ }
+
+ spin_unlock(&gart->dom_lock);
+}
+
+static struct iommu_domain *gart_iommu_domain_alloc(unsigned type)
+{
+ struct iommu_domain *domain;
+
+ if (type != IOMMU_DOMAIN_UNMANAGED)
+ return NULL;
+
+ domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+ if (domain) {
+ domain->geometry.aperture_start = gart_handle->iovmm_base;
+ domain->geometry.aperture_end = gart_handle->iovmm_end - 1;
+ domain->geometry.force_aperture = true;
+ }
+
+ return domain;
+}
+
+static void gart_iommu_domain_free(struct iommu_domain *domain)
+{
+ WARN_ON(gart_handle->active_domain == domain);
+ kfree(domain);
+}
+
+static inline int __gart_iommu_map(struct gart_device *gart, unsigned long iova,
+ unsigned long pa)
+{
+ if (unlikely(gart_debug && gart_pte_valid(gart, iova))) {
+ dev_err(gart->dev, "Page entry is in-use\n");
+ return -EINVAL;
+ }
+
+ gart_set_pte(gart, iova, GART_ENTRY_PHYS_ADDR_VALID | pa);
+
+ return 0;
+}
+
+static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t pa, size_t bytes, int prot, gfp_t gfp)
+{
+ struct gart_device *gart = gart_handle;
+ int ret;
+
+ if (gart_iova_range_invalid(gart, iova, bytes))
+ return -EINVAL;
+
+ spin_lock(&gart->pte_lock);
+ ret = __gart_iommu_map(gart, iova, (unsigned long)pa);
+ spin_unlock(&gart->pte_lock);
+
+ return ret;
+}
+
+static inline int __gart_iommu_unmap(struct gart_device *gart,
+ unsigned long iova)
+{
+ if (unlikely(gart_debug && !gart_pte_valid(gart, iova))) {
+ dev_err(gart->dev, "Page entry is invalid\n");
+ return -EINVAL;
+ }
+
+ gart_set_pte(gart, iova, 0);
+
+ return 0;
+}
+
+static size_t gart_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
+ size_t bytes, struct iommu_iotlb_gather *gather)
+{
+ struct gart_device *gart = gart_handle;
+ int err;
+
+ if (gart_iova_range_invalid(gart, iova, bytes))
+ return 0;
+
+ spin_lock(&gart->pte_lock);
+ err = __gart_iommu_unmap(gart, iova);
+ spin_unlock(&gart->pte_lock);
+
+ return err ? 0 : bytes;
+}
+
+static phys_addr_t gart_iommu_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct gart_device *gart = gart_handle;
+ unsigned long pte;
+
+ if (gart_iova_range_invalid(gart, iova, GART_PAGE_SIZE))
+ return -EINVAL;
+
+ spin_lock(&gart->pte_lock);
+ pte = gart_read_pte(gart, iova);
+ spin_unlock(&gart->pte_lock);
+
+ return pte & GART_PAGE_MASK;
+}
+
+static bool gart_iommu_capable(enum iommu_cap cap)
+{
+ return false;
+}
+
+static struct iommu_device *gart_iommu_probe_device(struct device *dev)
+{
+ if (!dev_iommu_fwspec_get(dev))
+ return ERR_PTR(-ENODEV);
+
+ return &gart_handle->iommu;
+}
+
+static void gart_iommu_release_device(struct device *dev)
+{
+}
+
+static int gart_iommu_of_xlate(struct device *dev,
+ struct of_phandle_args *args)
+{
+ return 0;
+}
+
+static void gart_iommu_sync_map(struct iommu_domain *domain)
+{
+ FLUSH_GART_REGS(gart_handle);
+}
+
+static void gart_iommu_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
+{
+ gart_iommu_sync_map(domain);
+}
+
+static const struct iommu_ops gart_iommu_ops = {
+ .capable = gart_iommu_capable,
+ .domain_alloc = gart_iommu_domain_alloc,
+ .domain_free = gart_iommu_domain_free,
+ .attach_dev = gart_iommu_attach_dev,
+ .detach_dev = gart_iommu_detach_dev,
+ .probe_device = gart_iommu_probe_device,
+ .release_device = gart_iommu_release_device,
+ .device_group = generic_device_group,
+ .map = gart_iommu_map,
+ .unmap = gart_iommu_unmap,
+ .iova_to_phys = gart_iommu_iova_to_phys,
+ .pgsize_bitmap = GART_IOMMU_PGSIZES,
+ .of_xlate = gart_iommu_of_xlate,
+ .iotlb_sync_map = gart_iommu_sync_map,
+ .iotlb_sync = gart_iommu_sync,
+};
+
+int tegra_gart_suspend(struct gart_device *gart)
+{
+ u32 *data = gart->savedata;
+ unsigned long iova;
+
+ /*
+ * All GART users shall be suspended at this point. Disable
+ * address translation to trap all GART accesses as invalid
+ * memory accesses.
+ */
+ writel_relaxed(0, gart->regs + GART_CONFIG);
+ FLUSH_GART_REGS(gart);
+
+ for_each_gart_pte(gart, iova)
+ *(data++) = gart_read_pte(gart, iova);
+
+ return 0;
+}
+
+int tegra_gart_resume(struct gart_device *gart)
+{
+ do_gart_setup(gart, gart->savedata);
+
+ return 0;
+}
+
+struct gart_device *tegra_gart_probe(struct device *dev, struct tegra_mc *mc)
+{
+ struct gart_device *gart;
+ struct resource *res;
+ int err;
+
+ BUILD_BUG_ON(PAGE_SHIFT != GART_PAGE_SHIFT);
+
+ /* the GART memory aperture is required */
+ res = platform_get_resource(to_platform_device(dev), IORESOURCE_MEM, 1);
+ if (!res) {
+ dev_err(dev, "Memory aperture resource unavailable\n");
+ return ERR_PTR(-ENXIO);
+ }
+
+ gart = kzalloc(sizeof(*gart), GFP_KERNEL);
+ if (!gart)
+ return ERR_PTR(-ENOMEM);
+
+ gart_handle = gart;
+
+ gart->dev = dev;
+ gart->regs = mc->regs + GART_REG_BASE;
+ gart->iovmm_base = res->start;
+ gart->iovmm_end = res->end + 1;
+ spin_lock_init(&gart->pte_lock);
+ spin_lock_init(&gart->dom_lock);
+
+ do_gart_setup(gart, NULL);
+
+ err = iommu_device_sysfs_add(&gart->iommu, dev, NULL, "gart");
+ if (err)
+ goto free_gart;
+
+ iommu_device_set_ops(&gart->iommu, &gart_iommu_ops);
+ iommu_device_set_fwnode(&gart->iommu, dev->fwnode);
+
+ err = iommu_device_register(&gart->iommu);
+ if (err)
+ goto remove_sysfs;
+
+ gart->savedata = vmalloc(resource_size(res) / GART_PAGE_SIZE *
+ sizeof(u32));
+ if (!gart->savedata) {
+ err = -ENOMEM;
+ goto unregister_iommu;
+ }
+
+ return gart;
+
+unregister_iommu:
+ iommu_device_unregister(&gart->iommu);
+remove_sysfs:
+ iommu_device_sysfs_remove(&gart->iommu);
+free_gart:
+ kfree(gart);
+
+ return ERR_PTR(err);
+}
+
+module_param(gart_debug, bool, 0644);
+MODULE_PARM_DESC(gart_debug, "Enable GART debugging");
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
new file mode 100644
index 000000000..0becdbfea
--- /dev/null
+++ b/drivers/iommu/tegra-smmu.c
@@ -0,0 +1,1182 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2011-2014 NVIDIA CORPORATION. All rights reserved.
+ */
+
+#include <linux/bitops.h>
+#include <linux/debugfs.h>
+#include <linux/err.h>
+#include <linux/iommu.h>
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/dma-mapping.h>
+
+#include <soc/tegra/ahb.h>
+#include <soc/tegra/mc.h>
+
+struct tegra_smmu_group {
+ struct list_head list;
+ struct tegra_smmu *smmu;
+ const struct tegra_smmu_group_soc *soc;
+ struct iommu_group *group;
+ unsigned int swgroup;
+};
+
+struct tegra_smmu {
+ void __iomem *regs;
+ struct device *dev;
+
+ struct tegra_mc *mc;
+ const struct tegra_smmu_soc *soc;
+
+ struct list_head groups;
+
+ unsigned long pfn_mask;
+ unsigned long tlb_mask;
+
+ unsigned long *asids;
+ struct mutex lock;
+
+ struct list_head list;
+
+ struct dentry *debugfs;
+
+ struct iommu_device iommu; /* IOMMU Core code handle */
+};
+
+struct tegra_smmu_as {
+ struct iommu_domain domain;
+ struct tegra_smmu *smmu;
+ unsigned int use_count;
+ spinlock_t lock;
+ u32 *count;
+ struct page **pts;
+ struct page *pd;
+ dma_addr_t pd_dma;
+ unsigned id;
+ u32 attr;
+};
+
+static struct tegra_smmu_as *to_smmu_as(struct iommu_domain *dom)
+{
+ return container_of(dom, struct tegra_smmu_as, domain);
+}
+
+static inline void smmu_writel(struct tegra_smmu *smmu, u32 value,
+ unsigned long offset)
+{
+ writel(value, smmu->regs + offset);
+}
+
+static inline u32 smmu_readl(struct tegra_smmu *smmu, unsigned long offset)
+{
+ return readl(smmu->regs + offset);
+}
+
+#define SMMU_CONFIG 0x010
+#define SMMU_CONFIG_ENABLE (1 << 0)
+
+#define SMMU_TLB_CONFIG 0x14
+#define SMMU_TLB_CONFIG_HIT_UNDER_MISS (1 << 29)
+#define SMMU_TLB_CONFIG_ROUND_ROBIN_ARBITRATION (1 << 28)
+#define SMMU_TLB_CONFIG_ACTIVE_LINES(smmu) \
+ ((smmu)->soc->num_tlb_lines & (smmu)->tlb_mask)
+
+#define SMMU_PTC_CONFIG 0x18
+#define SMMU_PTC_CONFIG_ENABLE (1 << 29)
+#define SMMU_PTC_CONFIG_REQ_LIMIT(x) (((x) & 0x0f) << 24)
+#define SMMU_PTC_CONFIG_INDEX_MAP(x) ((x) & 0x3f)
+
+#define SMMU_PTB_ASID 0x01c
+#define SMMU_PTB_ASID_VALUE(x) ((x) & 0x7f)
+
+#define SMMU_PTB_DATA 0x020
+#define SMMU_PTB_DATA_VALUE(dma, attr) ((dma) >> 12 | (attr))
+
+#define SMMU_MK_PDE(dma, attr) ((dma) >> SMMU_PTE_SHIFT | (attr))
+
+#define SMMU_TLB_FLUSH 0x030
+#define SMMU_TLB_FLUSH_VA_MATCH_ALL (0 << 0)
+#define SMMU_TLB_FLUSH_VA_MATCH_SECTION (2 << 0)
+#define SMMU_TLB_FLUSH_VA_MATCH_GROUP (3 << 0)
+#define SMMU_TLB_FLUSH_VA_SECTION(addr) ((((addr) & 0xffc00000) >> 12) | \
+ SMMU_TLB_FLUSH_VA_MATCH_SECTION)
+#define SMMU_TLB_FLUSH_VA_GROUP(addr) ((((addr) & 0xffffc000) >> 12) | \
+ SMMU_TLB_FLUSH_VA_MATCH_GROUP)
+#define SMMU_TLB_FLUSH_ASID_MATCH (1 << 31)
+
+#define SMMU_PTC_FLUSH 0x034
+#define SMMU_PTC_FLUSH_TYPE_ALL (0 << 0)
+#define SMMU_PTC_FLUSH_TYPE_ADR (1 << 0)
+
+#define SMMU_PTC_FLUSH_HI 0x9b8
+#define SMMU_PTC_FLUSH_HI_MASK 0x3
+
+/* per-SWGROUP SMMU_*_ASID register */
+#define SMMU_ASID_ENABLE (1 << 31)
+#define SMMU_ASID_MASK 0x7f
+#define SMMU_ASID_VALUE(x) ((x) & SMMU_ASID_MASK)
+
+/* page table definitions */
+#define SMMU_NUM_PDE 1024
+#define SMMU_NUM_PTE 1024
+
+#define SMMU_SIZE_PD (SMMU_NUM_PDE * 4)
+#define SMMU_SIZE_PT (SMMU_NUM_PTE * 4)
+
+#define SMMU_PDE_SHIFT 22
+#define SMMU_PTE_SHIFT 12
+
+#define SMMU_PAGE_MASK (~(SMMU_SIZE_PT-1))
+#define SMMU_OFFSET_IN_PAGE(x) ((unsigned long)(x) & ~SMMU_PAGE_MASK)
+#define SMMU_PFN_PHYS(x) ((phys_addr_t)(x) << SMMU_PTE_SHIFT)
+#define SMMU_PHYS_PFN(x) ((unsigned long)((x) >> SMMU_PTE_SHIFT))
+
+#define SMMU_PD_READABLE (1 << 31)
+#define SMMU_PD_WRITABLE (1 << 30)
+#define SMMU_PD_NONSECURE (1 << 29)
+
+#define SMMU_PDE_READABLE (1 << 31)
+#define SMMU_PDE_WRITABLE (1 << 30)
+#define SMMU_PDE_NONSECURE (1 << 29)
+#define SMMU_PDE_NEXT (1 << 28)
+
+#define SMMU_PTE_READABLE (1 << 31)
+#define SMMU_PTE_WRITABLE (1 << 30)
+#define SMMU_PTE_NONSECURE (1 << 29)
+
+#define SMMU_PDE_ATTR (SMMU_PDE_READABLE | SMMU_PDE_WRITABLE | \
+ SMMU_PDE_NONSECURE)
+
+static unsigned int iova_pd_index(unsigned long iova)
+{
+ return (iova >> SMMU_PDE_SHIFT) & (SMMU_NUM_PDE - 1);
+}
+
+static unsigned int iova_pt_index(unsigned long iova)
+{
+ return (iova >> SMMU_PTE_SHIFT) & (SMMU_NUM_PTE - 1);
+}
+
+static bool smmu_dma_addr_valid(struct tegra_smmu *smmu, dma_addr_t addr)
+{
+ addr >>= 12;
+ return (addr & smmu->pfn_mask) == addr;
+}
+
+static dma_addr_t smmu_pde_to_dma(struct tegra_smmu *smmu, u32 pde)
+{
+ return (dma_addr_t)(pde & smmu->pfn_mask) << 12;
+}
+
+static void smmu_flush_ptc_all(struct tegra_smmu *smmu)
+{
+ smmu_writel(smmu, SMMU_PTC_FLUSH_TYPE_ALL, SMMU_PTC_FLUSH);
+}
+
+static inline void smmu_flush_ptc(struct tegra_smmu *smmu, dma_addr_t dma,
+ unsigned long offset)
+{
+ u32 value;
+
+ offset &= ~(smmu->mc->soc->atom_size - 1);
+
+ if (smmu->mc->soc->num_address_bits > 32) {
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
+ value = (dma >> 32) & SMMU_PTC_FLUSH_HI_MASK;
+#else
+ value = 0;
+#endif
+ smmu_writel(smmu, value, SMMU_PTC_FLUSH_HI);
+ }
+
+ value = (dma + offset) | SMMU_PTC_FLUSH_TYPE_ADR;
+ smmu_writel(smmu, value, SMMU_PTC_FLUSH);
+}
+
+static inline void smmu_flush_tlb(struct tegra_smmu *smmu)
+{
+ smmu_writel(smmu, SMMU_TLB_FLUSH_VA_MATCH_ALL, SMMU_TLB_FLUSH);
+}
+
+static inline void smmu_flush_tlb_asid(struct tegra_smmu *smmu,
+ unsigned long asid)
+{
+ u32 value;
+
+ if (smmu->soc->num_asids == 4)
+ value = (asid & 0x3) << 29;
+ else
+ value = (asid & 0x7f) << 24;
+
+ value |= SMMU_TLB_FLUSH_ASID_MATCH | SMMU_TLB_FLUSH_VA_MATCH_ALL;
+ smmu_writel(smmu, value, SMMU_TLB_FLUSH);
+}
+
+static inline void smmu_flush_tlb_section(struct tegra_smmu *smmu,
+ unsigned long asid,
+ unsigned long iova)
+{
+ u32 value;
+
+ if (smmu->soc->num_asids == 4)
+ value = (asid & 0x3) << 29;
+ else
+ value = (asid & 0x7f) << 24;
+
+ value |= SMMU_TLB_FLUSH_ASID_MATCH | SMMU_TLB_FLUSH_VA_SECTION(iova);
+ smmu_writel(smmu, value, SMMU_TLB_FLUSH);
+}
+
+static inline void smmu_flush_tlb_group(struct tegra_smmu *smmu,
+ unsigned long asid,
+ unsigned long iova)
+{
+ u32 value;
+
+ if (smmu->soc->num_asids == 4)
+ value = (asid & 0x3) << 29;
+ else
+ value = (asid & 0x7f) << 24;
+
+ value |= SMMU_TLB_FLUSH_ASID_MATCH | SMMU_TLB_FLUSH_VA_GROUP(iova);
+ smmu_writel(smmu, value, SMMU_TLB_FLUSH);
+}
+
+static inline void smmu_flush(struct tegra_smmu *smmu)
+{
+ smmu_readl(smmu, SMMU_PTB_ASID);
+}
+
+static int tegra_smmu_alloc_asid(struct tegra_smmu *smmu, unsigned int *idp)
+{
+ unsigned long id;
+
+ mutex_lock(&smmu->lock);
+
+ id = find_first_zero_bit(smmu->asids, smmu->soc->num_asids);
+ if (id >= smmu->soc->num_asids) {
+ mutex_unlock(&smmu->lock);
+ return -ENOSPC;
+ }
+
+ set_bit(id, smmu->asids);
+ *idp = id;
+
+ mutex_unlock(&smmu->lock);
+ return 0;
+}
+
+static void tegra_smmu_free_asid(struct tegra_smmu *smmu, unsigned int id)
+{
+ mutex_lock(&smmu->lock);
+ clear_bit(id, smmu->asids);
+ mutex_unlock(&smmu->lock);
+}
+
+static bool tegra_smmu_capable(enum iommu_cap cap)
+{
+ return false;
+}
+
+static struct iommu_domain *tegra_smmu_domain_alloc(unsigned type)
+{
+ struct tegra_smmu_as *as;
+
+ if (type != IOMMU_DOMAIN_UNMANAGED)
+ return NULL;
+
+ as = kzalloc(sizeof(*as), GFP_KERNEL);
+ if (!as)
+ return NULL;
+
+ as->attr = SMMU_PD_READABLE | SMMU_PD_WRITABLE | SMMU_PD_NONSECURE;
+
+ as->pd = alloc_page(GFP_KERNEL | __GFP_DMA | __GFP_ZERO);
+ if (!as->pd) {
+ kfree(as);
+ return NULL;
+ }
+
+ as->count = kcalloc(SMMU_NUM_PDE, sizeof(u32), GFP_KERNEL);
+ if (!as->count) {
+ __free_page(as->pd);
+ kfree(as);
+ return NULL;
+ }
+
+ as->pts = kcalloc(SMMU_NUM_PDE, sizeof(*as->pts), GFP_KERNEL);
+ if (!as->pts) {
+ kfree(as->count);
+ __free_page(as->pd);
+ kfree(as);
+ return NULL;
+ }
+
+ spin_lock_init(&as->lock);
+
+ /* setup aperture */
+ as->domain.geometry.aperture_start = 0;
+ as->domain.geometry.aperture_end = 0xffffffff;
+ as->domain.geometry.force_aperture = true;
+
+ return &as->domain;
+}
+
+static void tegra_smmu_domain_free(struct iommu_domain *domain)
+{
+ struct tegra_smmu_as *as = to_smmu_as(domain);
+
+ /* TODO: free page directory and page tables */
+
+ WARN_ON_ONCE(as->use_count);
+ kfree(as->count);
+ kfree(as->pts);
+ kfree(as);
+}
+
+static const struct tegra_smmu_swgroup *
+tegra_smmu_find_swgroup(struct tegra_smmu *smmu, unsigned int swgroup)
+{
+ const struct tegra_smmu_swgroup *group = NULL;
+ unsigned int i;
+
+ for (i = 0; i < smmu->soc->num_swgroups; i++) {
+ if (smmu->soc->swgroups[i].swgroup == swgroup) {
+ group = &smmu->soc->swgroups[i];
+ break;
+ }
+ }
+
+ return group;
+}
+
+static void tegra_smmu_enable(struct tegra_smmu *smmu, unsigned int swgroup,
+ unsigned int asid)
+{
+ const struct tegra_smmu_swgroup *group;
+ unsigned int i;
+ u32 value;
+
+ group = tegra_smmu_find_swgroup(smmu, swgroup);
+ if (group) {
+ value = smmu_readl(smmu, group->reg);
+ value &= ~SMMU_ASID_MASK;
+ value |= SMMU_ASID_VALUE(asid);
+ value |= SMMU_ASID_ENABLE;
+ smmu_writel(smmu, value, group->reg);
+ } else {
+ pr_warn("%s group from swgroup %u not found\n", __func__,
+ swgroup);
+ /* No point moving ahead if group was not found */
+ return;
+ }
+
+ for (i = 0; i < smmu->soc->num_clients; i++) {
+ const struct tegra_mc_client *client = &smmu->soc->clients[i];
+
+ if (client->swgroup != swgroup)
+ continue;
+
+ value = smmu_readl(smmu, client->smmu.reg);
+ value |= BIT(client->smmu.bit);
+ smmu_writel(smmu, value, client->smmu.reg);
+ }
+}
+
+static void tegra_smmu_disable(struct tegra_smmu *smmu, unsigned int swgroup,
+ unsigned int asid)
+{
+ const struct tegra_smmu_swgroup *group;
+ unsigned int i;
+ u32 value;
+
+ group = tegra_smmu_find_swgroup(smmu, swgroup);
+ if (group) {
+ value = smmu_readl(smmu, group->reg);
+ value &= ~SMMU_ASID_MASK;
+ value |= SMMU_ASID_VALUE(asid);
+ value &= ~SMMU_ASID_ENABLE;
+ smmu_writel(smmu, value, group->reg);
+ }
+
+ for (i = 0; i < smmu->soc->num_clients; i++) {
+ const struct tegra_mc_client *client = &smmu->soc->clients[i];
+
+ if (client->swgroup != swgroup)
+ continue;
+
+ value = smmu_readl(smmu, client->smmu.reg);
+ value &= ~BIT(client->smmu.bit);
+ smmu_writel(smmu, value, client->smmu.reg);
+ }
+}
+
+static int tegra_smmu_as_prepare(struct tegra_smmu *smmu,
+ struct tegra_smmu_as *as)
+{
+ u32 value;
+ int err;
+
+ if (as->use_count > 0) {
+ as->use_count++;
+ return 0;
+ }
+
+ as->pd_dma = dma_map_page(smmu->dev, as->pd, 0, SMMU_SIZE_PD,
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(smmu->dev, as->pd_dma))
+ return -ENOMEM;
+
+ /* We can't handle 64-bit DMA addresses */
+ if (!smmu_dma_addr_valid(smmu, as->pd_dma)) {
+ err = -ENOMEM;
+ goto err_unmap;
+ }
+
+ err = tegra_smmu_alloc_asid(smmu, &as->id);
+ if (err < 0)
+ goto err_unmap;
+
+ smmu_flush_ptc(smmu, as->pd_dma, 0);
+ smmu_flush_tlb_asid(smmu, as->id);
+
+ smmu_writel(smmu, as->id & 0x7f, SMMU_PTB_ASID);
+ value = SMMU_PTB_DATA_VALUE(as->pd_dma, as->attr);
+ smmu_writel(smmu, value, SMMU_PTB_DATA);
+ smmu_flush(smmu);
+
+ as->smmu = smmu;
+ as->use_count++;
+
+ return 0;
+
+err_unmap:
+ dma_unmap_page(smmu->dev, as->pd_dma, SMMU_SIZE_PD, DMA_TO_DEVICE);
+ return err;
+}
+
+static void tegra_smmu_as_unprepare(struct tegra_smmu *smmu,
+ struct tegra_smmu_as *as)
+{
+ if (--as->use_count > 0)
+ return;
+
+ tegra_smmu_free_asid(smmu, as->id);
+
+ dma_unmap_page(smmu->dev, as->pd_dma, SMMU_SIZE_PD, DMA_TO_DEVICE);
+
+ as->smmu = NULL;
+}
+
+static int tegra_smmu_attach_dev(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct tegra_smmu *smmu = dev_iommu_priv_get(dev);
+ struct tegra_smmu_as *as = to_smmu_as(domain);
+ struct device_node *np = dev->of_node;
+ struct of_phandle_args args;
+ unsigned int index = 0;
+ int err = 0;
+
+ while (!of_parse_phandle_with_args(np, "iommus", "#iommu-cells", index,
+ &args)) {
+ unsigned int swgroup = args.args[0];
+
+ if (args.np != smmu->dev->of_node) {
+ of_node_put(args.np);
+ continue;
+ }
+
+ of_node_put(args.np);
+
+ err = tegra_smmu_as_prepare(smmu, as);
+ if (err < 0)
+ return err;
+
+ tegra_smmu_enable(smmu, swgroup, as->id);
+ index++;
+ }
+
+ if (index == 0)
+ return -ENODEV;
+
+ return 0;
+}
+
+static void tegra_smmu_detach_dev(struct iommu_domain *domain, struct device *dev)
+{
+ struct tegra_smmu_as *as = to_smmu_as(domain);
+ struct device_node *np = dev->of_node;
+ struct tegra_smmu *smmu = as->smmu;
+ struct of_phandle_args args;
+ unsigned int index = 0;
+
+ while (!of_parse_phandle_with_args(np, "iommus", "#iommu-cells", index,
+ &args)) {
+ unsigned int swgroup = args.args[0];
+
+ if (args.np != smmu->dev->of_node) {
+ of_node_put(args.np);
+ continue;
+ }
+
+ of_node_put(args.np);
+
+ tegra_smmu_disable(smmu, swgroup, as->id);
+ tegra_smmu_as_unprepare(smmu, as);
+ index++;
+ }
+}
+
+static void tegra_smmu_set_pde(struct tegra_smmu_as *as, unsigned long iova,
+ u32 value)
+{
+ unsigned int pd_index = iova_pd_index(iova);
+ struct tegra_smmu *smmu = as->smmu;
+ u32 *pd = page_address(as->pd);
+ unsigned long offset = pd_index * sizeof(*pd);
+
+ /* Set the page directory entry first */
+ pd[pd_index] = value;
+
+ /* The flush the page directory entry from caches */
+ dma_sync_single_range_for_device(smmu->dev, as->pd_dma, offset,
+ sizeof(*pd), DMA_TO_DEVICE);
+
+ /* And flush the iommu */
+ smmu_flush_ptc(smmu, as->pd_dma, offset);
+ smmu_flush_tlb_section(smmu, as->id, iova);
+ smmu_flush(smmu);
+}
+
+static u32 *tegra_smmu_pte_offset(struct page *pt_page, unsigned long iova)
+{
+ u32 *pt = page_address(pt_page);
+
+ return pt + iova_pt_index(iova);
+}
+
+static u32 *tegra_smmu_pte_lookup(struct tegra_smmu_as *as, unsigned long iova,
+ dma_addr_t *dmap)
+{
+ unsigned int pd_index = iova_pd_index(iova);
+ struct tegra_smmu *smmu = as->smmu;
+ struct page *pt_page;
+ u32 *pd;
+
+ pt_page = as->pts[pd_index];
+ if (!pt_page)
+ return NULL;
+
+ pd = page_address(as->pd);
+ *dmap = smmu_pde_to_dma(smmu, pd[pd_index]);
+
+ return tegra_smmu_pte_offset(pt_page, iova);
+}
+
+static u32 *as_get_pte(struct tegra_smmu_as *as, dma_addr_t iova,
+ dma_addr_t *dmap, struct page *page)
+{
+ unsigned int pde = iova_pd_index(iova);
+ struct tegra_smmu *smmu = as->smmu;
+
+ if (!as->pts[pde]) {
+ dma_addr_t dma;
+
+ dma = dma_map_page(smmu->dev, page, 0, SMMU_SIZE_PT,
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(smmu->dev, dma)) {
+ __free_page(page);
+ return NULL;
+ }
+
+ if (!smmu_dma_addr_valid(smmu, dma)) {
+ dma_unmap_page(smmu->dev, dma, SMMU_SIZE_PT,
+ DMA_TO_DEVICE);
+ __free_page(page);
+ return NULL;
+ }
+
+ as->pts[pde] = page;
+
+ tegra_smmu_set_pde(as, iova, SMMU_MK_PDE(dma, SMMU_PDE_ATTR |
+ SMMU_PDE_NEXT));
+
+ *dmap = dma;
+ } else {
+ u32 *pd = page_address(as->pd);
+
+ *dmap = smmu_pde_to_dma(smmu, pd[pde]);
+ }
+
+ return tegra_smmu_pte_offset(as->pts[pde], iova);
+}
+
+static void tegra_smmu_pte_get_use(struct tegra_smmu_as *as, unsigned long iova)
+{
+ unsigned int pd_index = iova_pd_index(iova);
+
+ as->count[pd_index]++;
+}
+
+static void tegra_smmu_pte_put_use(struct tegra_smmu_as *as, unsigned long iova)
+{
+ unsigned int pde = iova_pd_index(iova);
+ struct page *page = as->pts[pde];
+
+ /*
+ * When no entries in this page table are used anymore, return the
+ * memory page to the system.
+ */
+ if (--as->count[pde] == 0) {
+ struct tegra_smmu *smmu = as->smmu;
+ u32 *pd = page_address(as->pd);
+ dma_addr_t pte_dma = smmu_pde_to_dma(smmu, pd[pde]);
+
+ tegra_smmu_set_pde(as, iova, 0);
+
+ dma_unmap_page(smmu->dev, pte_dma, SMMU_SIZE_PT, DMA_TO_DEVICE);
+ __free_page(page);
+ as->pts[pde] = NULL;
+ }
+}
+
+static void tegra_smmu_set_pte(struct tegra_smmu_as *as, unsigned long iova,
+ u32 *pte, dma_addr_t pte_dma, u32 val)
+{
+ struct tegra_smmu *smmu = as->smmu;
+ unsigned long offset = SMMU_OFFSET_IN_PAGE(pte);
+
+ *pte = val;
+
+ dma_sync_single_range_for_device(smmu->dev, pte_dma, offset,
+ 4, DMA_TO_DEVICE);
+ smmu_flush_ptc(smmu, pte_dma, offset);
+ smmu_flush_tlb_group(smmu, as->id, iova);
+ smmu_flush(smmu);
+}
+
+static struct page *as_get_pde_page(struct tegra_smmu_as *as,
+ unsigned long iova, gfp_t gfp,
+ unsigned long *flags)
+{
+ unsigned int pde = iova_pd_index(iova);
+ struct page *page = as->pts[pde];
+
+ /* at first check whether allocation needs to be done at all */
+ if (page)
+ return page;
+
+ /*
+ * In order to prevent exhaustion of the atomic memory pool, we
+ * allocate page in a sleeping context if GFP flags permit. Hence
+ * spinlock needs to be unlocked and re-locked after allocation.
+ */
+ if (!(gfp & __GFP_ATOMIC))
+ spin_unlock_irqrestore(&as->lock, *flags);
+
+ page = alloc_page(gfp | __GFP_DMA | __GFP_ZERO);
+
+ if (!(gfp & __GFP_ATOMIC))
+ spin_lock_irqsave(&as->lock, *flags);
+
+ /*
+ * In a case of blocking allocation, a concurrent mapping may win
+ * the PDE allocation. In this case the allocated page isn't needed
+ * if allocation succeeded and the allocation failure isn't fatal.
+ */
+ if (as->pts[pde]) {
+ if (page)
+ __free_page(page);
+
+ page = as->pts[pde];
+ }
+
+ return page;
+}
+
+static int
+__tegra_smmu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp,
+ unsigned long *flags)
+{
+ struct tegra_smmu_as *as = to_smmu_as(domain);
+ dma_addr_t pte_dma;
+ struct page *page;
+ u32 pte_attrs;
+ u32 *pte;
+
+ page = as_get_pde_page(as, iova, gfp, flags);
+ if (!page)
+ return -ENOMEM;
+
+ pte = as_get_pte(as, iova, &pte_dma, page);
+ if (!pte)
+ return -ENOMEM;
+
+ /* If we aren't overwriting a pre-existing entry, increment use */
+ if (*pte == 0)
+ tegra_smmu_pte_get_use(as, iova);
+
+ pte_attrs = SMMU_PTE_NONSECURE;
+
+ if (prot & IOMMU_READ)
+ pte_attrs |= SMMU_PTE_READABLE;
+
+ if (prot & IOMMU_WRITE)
+ pte_attrs |= SMMU_PTE_WRITABLE;
+
+ tegra_smmu_set_pte(as, iova, pte, pte_dma,
+ SMMU_PHYS_PFN(paddr) | pte_attrs);
+
+ return 0;
+}
+
+static size_t
+__tegra_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
+ size_t size, struct iommu_iotlb_gather *gather)
+{
+ struct tegra_smmu_as *as = to_smmu_as(domain);
+ dma_addr_t pte_dma;
+ u32 *pte;
+
+ pte = tegra_smmu_pte_lookup(as, iova, &pte_dma);
+ if (!pte || !*pte)
+ return 0;
+
+ tegra_smmu_set_pte(as, iova, pte, pte_dma, 0);
+ tegra_smmu_pte_put_use(as, iova);
+
+ return size;
+}
+
+static int tegra_smmu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+ struct tegra_smmu_as *as = to_smmu_as(domain);
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&as->lock, flags);
+ ret = __tegra_smmu_map(domain, iova, paddr, size, prot, gfp, &flags);
+ spin_unlock_irqrestore(&as->lock, flags);
+
+ return ret;
+}
+
+static size_t tegra_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
+ size_t size, struct iommu_iotlb_gather *gather)
+{
+ struct tegra_smmu_as *as = to_smmu_as(domain);
+ unsigned long flags;
+
+ spin_lock_irqsave(&as->lock, flags);
+ size = __tegra_smmu_unmap(domain, iova, size, gather);
+ spin_unlock_irqrestore(&as->lock, flags);
+
+ return size;
+}
+
+static phys_addr_t tegra_smmu_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct tegra_smmu_as *as = to_smmu_as(domain);
+ unsigned long pfn;
+ dma_addr_t pte_dma;
+ u32 *pte;
+
+ pte = tegra_smmu_pte_lookup(as, iova, &pte_dma);
+ if (!pte || !*pte)
+ return 0;
+
+ pfn = *pte & as->smmu->pfn_mask;
+
+ return SMMU_PFN_PHYS(pfn) + SMMU_OFFSET_IN_PAGE(iova);
+}
+
+static struct tegra_smmu *tegra_smmu_find(struct device_node *np)
+{
+ struct platform_device *pdev;
+ struct tegra_mc *mc;
+
+ pdev = of_find_device_by_node(np);
+ if (!pdev)
+ return NULL;
+
+ mc = platform_get_drvdata(pdev);
+ if (!mc)
+ return NULL;
+
+ return mc->smmu;
+}
+
+static int tegra_smmu_configure(struct tegra_smmu *smmu, struct device *dev,
+ struct of_phandle_args *args)
+{
+ const struct iommu_ops *ops = smmu->iommu.ops;
+ int err;
+
+ err = iommu_fwspec_init(dev, &dev->of_node->fwnode, ops);
+ if (err < 0) {
+ dev_err(dev, "failed to initialize fwspec: %d\n", err);
+ return err;
+ }
+
+ err = ops->of_xlate(dev, args);
+ if (err < 0) {
+ dev_err(dev, "failed to parse SW group ID: %d\n", err);
+ iommu_fwspec_free(dev);
+ return err;
+ }
+
+ return 0;
+}
+
+static struct iommu_device *tegra_smmu_probe_device(struct device *dev)
+{
+ struct device_node *np = dev->of_node;
+ struct tegra_smmu *smmu = NULL;
+ struct of_phandle_args args;
+ unsigned int index = 0;
+ int err;
+
+ while (of_parse_phandle_with_args(np, "iommus", "#iommu-cells", index,
+ &args) == 0) {
+ smmu = tegra_smmu_find(args.np);
+ if (smmu) {
+ err = tegra_smmu_configure(smmu, dev, &args);
+ of_node_put(args.np);
+
+ if (err < 0)
+ return ERR_PTR(err);
+
+ /*
+ * Only a single IOMMU master interface is currently
+ * supported by the Linux kernel, so abort after the
+ * first match.
+ */
+ dev_iommu_priv_set(dev, smmu);
+
+ break;
+ }
+
+ of_node_put(args.np);
+ index++;
+ }
+
+ if (!smmu)
+ return ERR_PTR(-ENODEV);
+
+ return &smmu->iommu;
+}
+
+static void tegra_smmu_release_device(struct device *dev)
+{
+ dev_iommu_priv_set(dev, NULL);
+}
+
+static const struct tegra_smmu_group_soc *
+tegra_smmu_find_group(struct tegra_smmu *smmu, unsigned int swgroup)
+{
+ unsigned int i, j;
+
+ for (i = 0; i < smmu->soc->num_groups; i++)
+ for (j = 0; j < smmu->soc->groups[i].num_swgroups; j++)
+ if (smmu->soc->groups[i].swgroups[j] == swgroup)
+ return &smmu->soc->groups[i];
+
+ return NULL;
+}
+
+static void tegra_smmu_group_release(void *iommu_data)
+{
+ struct tegra_smmu_group *group = iommu_data;
+ struct tegra_smmu *smmu = group->smmu;
+
+ mutex_lock(&smmu->lock);
+ list_del(&group->list);
+ mutex_unlock(&smmu->lock);
+}
+
+static struct iommu_group *tegra_smmu_group_get(struct tegra_smmu *smmu,
+ unsigned int swgroup)
+{
+ const struct tegra_smmu_group_soc *soc;
+ struct tegra_smmu_group *group;
+ struct iommu_group *grp;
+
+ /* Find group_soc associating with swgroup */
+ soc = tegra_smmu_find_group(smmu, swgroup);
+
+ mutex_lock(&smmu->lock);
+
+ /* Find existing iommu_group associating with swgroup or group_soc */
+ list_for_each_entry(group, &smmu->groups, list)
+ if ((group->swgroup == swgroup) || (soc && group->soc == soc)) {
+ grp = iommu_group_ref_get(group->group);
+ mutex_unlock(&smmu->lock);
+ return grp;
+ }
+
+ group = devm_kzalloc(smmu->dev, sizeof(*group), GFP_KERNEL);
+ if (!group) {
+ mutex_unlock(&smmu->lock);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&group->list);
+ group->swgroup = swgroup;
+ group->smmu = smmu;
+ group->soc = soc;
+
+ group->group = iommu_group_alloc();
+ if (IS_ERR(group->group)) {
+ devm_kfree(smmu->dev, group);
+ mutex_unlock(&smmu->lock);
+ return NULL;
+ }
+
+ iommu_group_set_iommudata(group->group, group, tegra_smmu_group_release);
+ if (soc)
+ iommu_group_set_name(group->group, soc->name);
+ list_add_tail(&group->list, &smmu->groups);
+ mutex_unlock(&smmu->lock);
+
+ return group->group;
+}
+
+static struct iommu_group *tegra_smmu_device_group(struct device *dev)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct tegra_smmu *smmu = dev_iommu_priv_get(dev);
+ struct iommu_group *group;
+
+ group = tegra_smmu_group_get(smmu, fwspec->ids[0]);
+ if (!group)
+ group = generic_device_group(dev);
+
+ return group;
+}
+
+static int tegra_smmu_of_xlate(struct device *dev,
+ struct of_phandle_args *args)
+{
+ u32 id = args->args[0];
+
+ return iommu_fwspec_add_ids(dev, &id, 1);
+}
+
+static const struct iommu_ops tegra_smmu_ops = {
+ .capable = tegra_smmu_capable,
+ .domain_alloc = tegra_smmu_domain_alloc,
+ .domain_free = tegra_smmu_domain_free,
+ .attach_dev = tegra_smmu_attach_dev,
+ .detach_dev = tegra_smmu_detach_dev,
+ .probe_device = tegra_smmu_probe_device,
+ .release_device = tegra_smmu_release_device,
+ .device_group = tegra_smmu_device_group,
+ .map = tegra_smmu_map,
+ .unmap = tegra_smmu_unmap,
+ .iova_to_phys = tegra_smmu_iova_to_phys,
+ .of_xlate = tegra_smmu_of_xlate,
+ .pgsize_bitmap = SZ_4K,
+};
+
+static void tegra_smmu_ahb_enable(void)
+{
+ static const struct of_device_id ahb_match[] = {
+ { .compatible = "nvidia,tegra30-ahb", },
+ { }
+ };
+ struct device_node *ahb;
+
+ ahb = of_find_matching_node(NULL, ahb_match);
+ if (ahb) {
+ tegra_ahb_enable_smmu(ahb);
+ of_node_put(ahb);
+ }
+}
+
+static int tegra_smmu_swgroups_show(struct seq_file *s, void *data)
+{
+ struct tegra_smmu *smmu = s->private;
+ unsigned int i;
+ u32 value;
+
+ seq_printf(s, "swgroup enabled ASID\n");
+ seq_printf(s, "------------------------\n");
+
+ for (i = 0; i < smmu->soc->num_swgroups; i++) {
+ const struct tegra_smmu_swgroup *group = &smmu->soc->swgroups[i];
+ const char *status;
+ unsigned int asid;
+
+ value = smmu_readl(smmu, group->reg);
+
+ if (value & SMMU_ASID_ENABLE)
+ status = "yes";
+ else
+ status = "no";
+
+ asid = value & SMMU_ASID_MASK;
+
+ seq_printf(s, "%-9s %-7s %#04x\n", group->name, status,
+ asid);
+ }
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(tegra_smmu_swgroups);
+
+static int tegra_smmu_clients_show(struct seq_file *s, void *data)
+{
+ struct tegra_smmu *smmu = s->private;
+ unsigned int i;
+ u32 value;
+
+ seq_printf(s, "client enabled\n");
+ seq_printf(s, "--------------------\n");
+
+ for (i = 0; i < smmu->soc->num_clients; i++) {
+ const struct tegra_mc_client *client = &smmu->soc->clients[i];
+ const char *status;
+
+ value = smmu_readl(smmu, client->smmu.reg);
+
+ if (value & BIT(client->smmu.bit))
+ status = "yes";
+ else
+ status = "no";
+
+ seq_printf(s, "%-12s %s\n", client->name, status);
+ }
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(tegra_smmu_clients);
+
+static void tegra_smmu_debugfs_init(struct tegra_smmu *smmu)
+{
+ smmu->debugfs = debugfs_create_dir("smmu", NULL);
+ if (!smmu->debugfs)
+ return;
+
+ debugfs_create_file("swgroups", S_IRUGO, smmu->debugfs, smmu,
+ &tegra_smmu_swgroups_fops);
+ debugfs_create_file("clients", S_IRUGO, smmu->debugfs, smmu,
+ &tegra_smmu_clients_fops);
+}
+
+static void tegra_smmu_debugfs_exit(struct tegra_smmu *smmu)
+{
+ debugfs_remove_recursive(smmu->debugfs);
+}
+
+struct tegra_smmu *tegra_smmu_probe(struct device *dev,
+ const struct tegra_smmu_soc *soc,
+ struct tegra_mc *mc)
+{
+ struct tegra_smmu *smmu;
+ size_t size;
+ u32 value;
+ int err;
+
+ smmu = devm_kzalloc(dev, sizeof(*smmu), GFP_KERNEL);
+ if (!smmu)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * This is a bit of a hack. Ideally we'd want to simply return this
+ * value. However the IOMMU registration process will attempt to add
+ * all devices to the IOMMU when bus_set_iommu() is called. In order
+ * not to rely on global variables to track the IOMMU instance, we
+ * set it here so that it can be looked up from the .probe_device()
+ * callback via the IOMMU device's .drvdata field.
+ */
+ mc->smmu = smmu;
+
+ size = BITS_TO_LONGS(soc->num_asids) * sizeof(long);
+
+ smmu->asids = devm_kzalloc(dev, size, GFP_KERNEL);
+ if (!smmu->asids)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&smmu->groups);
+ mutex_init(&smmu->lock);
+
+ smmu->regs = mc->regs;
+ smmu->soc = soc;
+ smmu->dev = dev;
+ smmu->mc = mc;
+
+ smmu->pfn_mask =
+ BIT_MASK(mc->soc->num_address_bits - SMMU_PTE_SHIFT) - 1;
+ dev_dbg(dev, "address bits: %u, PFN mask: %#lx\n",
+ mc->soc->num_address_bits, smmu->pfn_mask);
+ smmu->tlb_mask = (1 << fls(smmu->soc->num_tlb_lines)) - 1;
+ dev_dbg(dev, "TLB lines: %u, mask: %#lx\n", smmu->soc->num_tlb_lines,
+ smmu->tlb_mask);
+
+ value = SMMU_PTC_CONFIG_ENABLE | SMMU_PTC_CONFIG_INDEX_MAP(0x3f);
+
+ if (soc->supports_request_limit)
+ value |= SMMU_PTC_CONFIG_REQ_LIMIT(8);
+
+ smmu_writel(smmu, value, SMMU_PTC_CONFIG);
+
+ value = SMMU_TLB_CONFIG_HIT_UNDER_MISS |
+ SMMU_TLB_CONFIG_ACTIVE_LINES(smmu);
+
+ if (soc->supports_round_robin_arbitration)
+ value |= SMMU_TLB_CONFIG_ROUND_ROBIN_ARBITRATION;
+
+ smmu_writel(smmu, value, SMMU_TLB_CONFIG);
+
+ smmu_flush_ptc_all(smmu);
+ smmu_flush_tlb(smmu);
+ smmu_writel(smmu, SMMU_CONFIG_ENABLE, SMMU_CONFIG);
+ smmu_flush(smmu);
+
+ tegra_smmu_ahb_enable();
+
+ err = iommu_device_sysfs_add(&smmu->iommu, dev, NULL, dev_name(dev));
+ if (err)
+ return ERR_PTR(err);
+
+ iommu_device_set_ops(&smmu->iommu, &tegra_smmu_ops);
+ iommu_device_set_fwnode(&smmu->iommu, dev->fwnode);
+
+ err = iommu_device_register(&smmu->iommu);
+ if (err) {
+ iommu_device_sysfs_remove(&smmu->iommu);
+ return ERR_PTR(err);
+ }
+
+ err = bus_set_iommu(&platform_bus_type, &tegra_smmu_ops);
+ if (err < 0) {
+ iommu_device_unregister(&smmu->iommu);
+ iommu_device_sysfs_remove(&smmu->iommu);
+ return ERR_PTR(err);
+ }
+
+ if (IS_ENABLED(CONFIG_DEBUG_FS))
+ tegra_smmu_debugfs_init(smmu);
+
+ return smmu;
+}
+
+void tegra_smmu_remove(struct tegra_smmu *smmu)
+{
+ iommu_device_unregister(&smmu->iommu);
+ iommu_device_sysfs_remove(&smmu->iommu);
+
+ if (IS_ENABLED(CONFIG_DEBUG_FS))
+ tegra_smmu_debugfs_exit(smmu);
+}
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
new file mode 100644
index 000000000..81dea4caf
--- /dev/null
+++ b/drivers/iommu/virtio-iommu.c
@@ -0,0 +1,1158 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Virtio driver for the paravirtualized IOMMU
+ *
+ * Copyright (C) 2019 Arm Limited
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/amba/bus.h>
+#include <linux/delay.h>
+#include <linux/dma-iommu.h>
+#include <linux/freezer.h>
+#include <linux/interval_tree.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/of_iommu.h>
+#include <linux/of_platform.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_ids.h>
+#include <linux/wait.h>
+
+#include <uapi/linux/virtio_iommu.h>
+
+#define MSI_IOVA_BASE 0x8000000
+#define MSI_IOVA_LENGTH 0x100000
+
+#define VIOMMU_REQUEST_VQ 0
+#define VIOMMU_EVENT_VQ 1
+#define VIOMMU_NR_VQS 2
+
+struct viommu_dev {
+ struct iommu_device iommu;
+ struct device *dev;
+ struct virtio_device *vdev;
+
+ struct ida domain_ids;
+
+ struct virtqueue *vqs[VIOMMU_NR_VQS];
+ spinlock_t request_lock;
+ struct list_head requests;
+ void *evts;
+
+ /* Device configuration */
+ struct iommu_domain_geometry geometry;
+ u64 pgsize_bitmap;
+ u32 first_domain;
+ u32 last_domain;
+ /* Supported MAP flags */
+ u32 map_flags;
+ u32 probe_size;
+};
+
+struct viommu_mapping {
+ phys_addr_t paddr;
+ struct interval_tree_node iova;
+ u32 flags;
+};
+
+struct viommu_domain {
+ struct iommu_domain domain;
+ struct viommu_dev *viommu;
+ struct mutex mutex; /* protects viommu pointer */
+ unsigned int id;
+ u32 map_flags;
+
+ spinlock_t mappings_lock;
+ struct rb_root_cached mappings;
+
+ unsigned long nr_endpoints;
+};
+
+struct viommu_endpoint {
+ struct device *dev;
+ struct viommu_dev *viommu;
+ struct viommu_domain *vdomain;
+ struct list_head resv_regions;
+};
+
+struct viommu_request {
+ struct list_head list;
+ void *writeback;
+ unsigned int write_offset;
+ unsigned int len;
+ char buf[];
+};
+
+#define VIOMMU_FAULT_RESV_MASK 0xffffff00
+
+struct viommu_event {
+ union {
+ u32 head;
+ struct virtio_iommu_fault fault;
+ };
+};
+
+#define to_viommu_domain(domain) \
+ container_of(domain, struct viommu_domain, domain)
+
+static int viommu_get_req_errno(void *buf, size_t len)
+{
+ struct virtio_iommu_req_tail *tail = buf + len - sizeof(*tail);
+
+ switch (tail->status) {
+ case VIRTIO_IOMMU_S_OK:
+ return 0;
+ case VIRTIO_IOMMU_S_UNSUPP:
+ return -ENOSYS;
+ case VIRTIO_IOMMU_S_INVAL:
+ return -EINVAL;
+ case VIRTIO_IOMMU_S_RANGE:
+ return -ERANGE;
+ case VIRTIO_IOMMU_S_NOENT:
+ return -ENOENT;
+ case VIRTIO_IOMMU_S_FAULT:
+ return -EFAULT;
+ case VIRTIO_IOMMU_S_NOMEM:
+ return -ENOMEM;
+ case VIRTIO_IOMMU_S_IOERR:
+ case VIRTIO_IOMMU_S_DEVERR:
+ default:
+ return -EIO;
+ }
+}
+
+static void viommu_set_req_status(void *buf, size_t len, int status)
+{
+ struct virtio_iommu_req_tail *tail = buf + len - sizeof(*tail);
+
+ tail->status = status;
+}
+
+static off_t viommu_get_write_desc_offset(struct viommu_dev *viommu,
+ struct virtio_iommu_req_head *req,
+ size_t len)
+{
+ size_t tail_size = sizeof(struct virtio_iommu_req_tail);
+
+ if (req->type == VIRTIO_IOMMU_T_PROBE)
+ return len - viommu->probe_size - tail_size;
+
+ return len - tail_size;
+}
+
+/*
+ * __viommu_sync_req - Complete all in-flight requests
+ *
+ * Wait for all added requests to complete. When this function returns, all
+ * requests that were in-flight at the time of the call have completed.
+ */
+static int __viommu_sync_req(struct viommu_dev *viommu)
+{
+ unsigned int len;
+ size_t write_len;
+ struct viommu_request *req;
+ struct virtqueue *vq = viommu->vqs[VIOMMU_REQUEST_VQ];
+
+ assert_spin_locked(&viommu->request_lock);
+
+ virtqueue_kick(vq);
+
+ while (!list_empty(&viommu->requests)) {
+ len = 0;
+ req = virtqueue_get_buf(vq, &len);
+ if (!req)
+ continue;
+
+ if (!len)
+ viommu_set_req_status(req->buf, req->len,
+ VIRTIO_IOMMU_S_IOERR);
+
+ write_len = req->len - req->write_offset;
+ if (req->writeback && len == write_len)
+ memcpy(req->writeback, req->buf + req->write_offset,
+ write_len);
+
+ list_del(&req->list);
+ kfree(req);
+ }
+
+ return 0;
+}
+
+static int viommu_sync_req(struct viommu_dev *viommu)
+{
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&viommu->request_lock, flags);
+ ret = __viommu_sync_req(viommu);
+ if (ret)
+ dev_dbg(viommu->dev, "could not sync requests (%d)\n", ret);
+ spin_unlock_irqrestore(&viommu->request_lock, flags);
+
+ return ret;
+}
+
+/*
+ * __viommu_add_request - Add one request to the queue
+ * @buf: pointer to the request buffer
+ * @len: length of the request buffer
+ * @writeback: copy data back to the buffer when the request completes.
+ *
+ * Add a request to the queue. Only synchronize the queue if it's already full.
+ * Otherwise don't kick the queue nor wait for requests to complete.
+ *
+ * When @writeback is true, data written by the device, including the request
+ * status, is copied into @buf after the request completes. This is unsafe if
+ * the caller allocates @buf on stack and drops the lock between add_req() and
+ * sync_req().
+ *
+ * Return 0 if the request was successfully added to the queue.
+ */
+static int __viommu_add_req(struct viommu_dev *viommu, void *buf, size_t len,
+ bool writeback)
+{
+ int ret;
+ off_t write_offset;
+ struct viommu_request *req;
+ struct scatterlist top_sg, bottom_sg;
+ struct scatterlist *sg[2] = { &top_sg, &bottom_sg };
+ struct virtqueue *vq = viommu->vqs[VIOMMU_REQUEST_VQ];
+
+ assert_spin_locked(&viommu->request_lock);
+
+ write_offset = viommu_get_write_desc_offset(viommu, buf, len);
+ if (write_offset <= 0)
+ return -EINVAL;
+
+ req = kzalloc(sizeof(*req) + len, GFP_ATOMIC);
+ if (!req)
+ return -ENOMEM;
+
+ req->len = len;
+ if (writeback) {
+ req->writeback = buf + write_offset;
+ req->write_offset = write_offset;
+ }
+ memcpy(&req->buf, buf, write_offset);
+
+ sg_init_one(&top_sg, req->buf, write_offset);
+ sg_init_one(&bottom_sg, req->buf + write_offset, len - write_offset);
+
+ ret = virtqueue_add_sgs(vq, sg, 1, 1, req, GFP_ATOMIC);
+ if (ret == -ENOSPC) {
+ /* If the queue is full, sync and retry */
+ if (!__viommu_sync_req(viommu))
+ ret = virtqueue_add_sgs(vq, sg, 1, 1, req, GFP_ATOMIC);
+ }
+ if (ret)
+ goto err_free;
+
+ list_add_tail(&req->list, &viommu->requests);
+ return 0;
+
+err_free:
+ kfree(req);
+ return ret;
+}
+
+static int viommu_add_req(struct viommu_dev *viommu, void *buf, size_t len)
+{
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&viommu->request_lock, flags);
+ ret = __viommu_add_req(viommu, buf, len, false);
+ if (ret)
+ dev_dbg(viommu->dev, "could not add request: %d\n", ret);
+ spin_unlock_irqrestore(&viommu->request_lock, flags);
+
+ return ret;
+}
+
+/*
+ * Send a request and wait for it to complete. Return the request status (as an
+ * errno)
+ */
+static int viommu_send_req_sync(struct viommu_dev *viommu, void *buf,
+ size_t len)
+{
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&viommu->request_lock, flags);
+
+ ret = __viommu_add_req(viommu, buf, len, true);
+ if (ret) {
+ dev_dbg(viommu->dev, "could not add request (%d)\n", ret);
+ goto out_unlock;
+ }
+
+ ret = __viommu_sync_req(viommu);
+ if (ret) {
+ dev_dbg(viommu->dev, "could not sync requests (%d)\n", ret);
+ /* Fall-through (get the actual request status) */
+ }
+
+ ret = viommu_get_req_errno(buf, len);
+out_unlock:
+ spin_unlock_irqrestore(&viommu->request_lock, flags);
+ return ret;
+}
+
+/*
+ * viommu_add_mapping - add a mapping to the internal tree
+ *
+ * On success, return the new mapping. Otherwise return NULL.
+ */
+static int viommu_add_mapping(struct viommu_domain *vdomain, unsigned long iova,
+ phys_addr_t paddr, size_t size, u32 flags)
+{
+ unsigned long irqflags;
+ struct viommu_mapping *mapping;
+
+ mapping = kzalloc(sizeof(*mapping), GFP_ATOMIC);
+ if (!mapping)
+ return -ENOMEM;
+
+ mapping->paddr = paddr;
+ mapping->iova.start = iova;
+ mapping->iova.last = iova + size - 1;
+ mapping->flags = flags;
+
+ spin_lock_irqsave(&vdomain->mappings_lock, irqflags);
+ interval_tree_insert(&mapping->iova, &vdomain->mappings);
+ spin_unlock_irqrestore(&vdomain->mappings_lock, irqflags);
+
+ return 0;
+}
+
+/*
+ * viommu_del_mappings - remove mappings from the internal tree
+ *
+ * @vdomain: the domain
+ * @iova: start of the range
+ * @size: size of the range. A size of 0 corresponds to the entire address
+ * space.
+ *
+ * On success, returns the number of unmapped bytes (>= size)
+ */
+static size_t viommu_del_mappings(struct viommu_domain *vdomain,
+ unsigned long iova, size_t size)
+{
+ size_t unmapped = 0;
+ unsigned long flags;
+ unsigned long last = iova + size - 1;
+ struct viommu_mapping *mapping = NULL;
+ struct interval_tree_node *node, *next;
+
+ spin_lock_irqsave(&vdomain->mappings_lock, flags);
+ next = interval_tree_iter_first(&vdomain->mappings, iova, last);
+ while (next) {
+ node = next;
+ mapping = container_of(node, struct viommu_mapping, iova);
+ next = interval_tree_iter_next(node, iova, last);
+
+ /* Trying to split a mapping? */
+ if (mapping->iova.start < iova)
+ break;
+
+ /*
+ * Virtio-iommu doesn't allow UNMAP to split a mapping created
+ * with a single MAP request, so remove the full mapping.
+ */
+ unmapped += mapping->iova.last - mapping->iova.start + 1;
+
+ interval_tree_remove(node, &vdomain->mappings);
+ kfree(mapping);
+ }
+ spin_unlock_irqrestore(&vdomain->mappings_lock, flags);
+
+ return unmapped;
+}
+
+/*
+ * viommu_replay_mappings - re-send MAP requests
+ *
+ * When reattaching a domain that was previously detached from all endpoints,
+ * mappings were deleted from the device. Re-create the mappings available in
+ * the internal tree.
+ */
+static int viommu_replay_mappings(struct viommu_domain *vdomain)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct viommu_mapping *mapping;
+ struct interval_tree_node *node;
+ struct virtio_iommu_req_map map;
+
+ spin_lock_irqsave(&vdomain->mappings_lock, flags);
+ node = interval_tree_iter_first(&vdomain->mappings, 0, -1UL);
+ while (node) {
+ mapping = container_of(node, struct viommu_mapping, iova);
+ map = (struct virtio_iommu_req_map) {
+ .head.type = VIRTIO_IOMMU_T_MAP,
+ .domain = cpu_to_le32(vdomain->id),
+ .virt_start = cpu_to_le64(mapping->iova.start),
+ .virt_end = cpu_to_le64(mapping->iova.last),
+ .phys_start = cpu_to_le64(mapping->paddr),
+ .flags = cpu_to_le32(mapping->flags),
+ };
+
+ ret = viommu_send_req_sync(vdomain->viommu, &map, sizeof(map));
+ if (ret)
+ break;
+
+ node = interval_tree_iter_next(node, 0, -1UL);
+ }
+ spin_unlock_irqrestore(&vdomain->mappings_lock, flags);
+
+ return ret;
+}
+
+static int viommu_add_resv_mem(struct viommu_endpoint *vdev,
+ struct virtio_iommu_probe_resv_mem *mem,
+ size_t len)
+{
+ size_t size;
+ u64 start64, end64;
+ phys_addr_t start, end;
+ struct iommu_resv_region *region = NULL;
+ unsigned long prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+
+ start = start64 = le64_to_cpu(mem->start);
+ end = end64 = le64_to_cpu(mem->end);
+ size = end64 - start64 + 1;
+
+ /* Catch any overflow, including the unlikely end64 - start64 + 1 = 0 */
+ if (start != start64 || end != end64 || size < end64 - start64)
+ return -EOVERFLOW;
+
+ if (len < sizeof(*mem))
+ return -EINVAL;
+
+ switch (mem->subtype) {
+ default:
+ dev_warn(vdev->dev, "unknown resv mem subtype 0x%x\n",
+ mem->subtype);
+ fallthrough;
+ case VIRTIO_IOMMU_RESV_MEM_T_RESERVED:
+ region = iommu_alloc_resv_region(start, size, 0,
+ IOMMU_RESV_RESERVED);
+ break;
+ case VIRTIO_IOMMU_RESV_MEM_T_MSI:
+ region = iommu_alloc_resv_region(start, size, prot,
+ IOMMU_RESV_MSI);
+ break;
+ }
+ if (!region)
+ return -ENOMEM;
+
+ list_add(&region->list, &vdev->resv_regions);
+ return 0;
+}
+
+static int viommu_probe_endpoint(struct viommu_dev *viommu, struct device *dev)
+{
+ int ret;
+ u16 type, len;
+ size_t cur = 0;
+ size_t probe_len;
+ struct virtio_iommu_req_probe *probe;
+ struct virtio_iommu_probe_property *prop;
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
+
+ if (!fwspec->num_ids)
+ return -EINVAL;
+
+ probe_len = sizeof(*probe) + viommu->probe_size +
+ sizeof(struct virtio_iommu_req_tail);
+ probe = kzalloc(probe_len, GFP_KERNEL);
+ if (!probe)
+ return -ENOMEM;
+
+ probe->head.type = VIRTIO_IOMMU_T_PROBE;
+ /*
+ * For now, assume that properties of an endpoint that outputs multiple
+ * IDs are consistent. Only probe the first one.
+ */
+ probe->endpoint = cpu_to_le32(fwspec->ids[0]);
+
+ ret = viommu_send_req_sync(viommu, probe, probe_len);
+ if (ret)
+ goto out_free;
+
+ prop = (void *)probe->properties;
+ type = le16_to_cpu(prop->type) & VIRTIO_IOMMU_PROBE_T_MASK;
+
+ while (type != VIRTIO_IOMMU_PROBE_T_NONE &&
+ cur < viommu->probe_size) {
+ len = le16_to_cpu(prop->length) + sizeof(*prop);
+
+ switch (type) {
+ case VIRTIO_IOMMU_PROBE_T_RESV_MEM:
+ ret = viommu_add_resv_mem(vdev, (void *)prop, len);
+ break;
+ default:
+ dev_err(dev, "unknown viommu prop 0x%x\n", type);
+ }
+
+ if (ret)
+ dev_err(dev, "failed to parse viommu prop 0x%x\n", type);
+
+ cur += len;
+ if (cur >= viommu->probe_size)
+ break;
+
+ prop = (void *)probe->properties + cur;
+ type = le16_to_cpu(prop->type) & VIRTIO_IOMMU_PROBE_T_MASK;
+ }
+
+out_free:
+ kfree(probe);
+ return ret;
+}
+
+static int viommu_fault_handler(struct viommu_dev *viommu,
+ struct virtio_iommu_fault *fault)
+{
+ char *reason_str;
+
+ u8 reason = fault->reason;
+ u32 flags = le32_to_cpu(fault->flags);
+ u32 endpoint = le32_to_cpu(fault->endpoint);
+ u64 address = le64_to_cpu(fault->address);
+
+ switch (reason) {
+ case VIRTIO_IOMMU_FAULT_R_DOMAIN:
+ reason_str = "domain";
+ break;
+ case VIRTIO_IOMMU_FAULT_R_MAPPING:
+ reason_str = "page";
+ break;
+ case VIRTIO_IOMMU_FAULT_R_UNKNOWN:
+ default:
+ reason_str = "unknown";
+ break;
+ }
+
+ /* TODO: find EP by ID and report_iommu_fault */
+ if (flags & VIRTIO_IOMMU_FAULT_F_ADDRESS)
+ dev_err_ratelimited(viommu->dev, "%s fault from EP %u at %#llx [%s%s%s]\n",
+ reason_str, endpoint, address,
+ flags & VIRTIO_IOMMU_FAULT_F_READ ? "R" : "",
+ flags & VIRTIO_IOMMU_FAULT_F_WRITE ? "W" : "",
+ flags & VIRTIO_IOMMU_FAULT_F_EXEC ? "X" : "");
+ else
+ dev_err_ratelimited(viommu->dev, "%s fault from EP %u\n",
+ reason_str, endpoint);
+ return 0;
+}
+
+static void viommu_event_handler(struct virtqueue *vq)
+{
+ int ret;
+ unsigned int len;
+ struct scatterlist sg[1];
+ struct viommu_event *evt;
+ struct viommu_dev *viommu = vq->vdev->priv;
+
+ while ((evt = virtqueue_get_buf(vq, &len)) != NULL) {
+ if (len > sizeof(*evt)) {
+ dev_err(viommu->dev,
+ "invalid event buffer (len %u != %zu)\n",
+ len, sizeof(*evt));
+ } else if (!(evt->head & VIOMMU_FAULT_RESV_MASK)) {
+ viommu_fault_handler(viommu, &evt->fault);
+ }
+
+ sg_init_one(sg, evt, sizeof(*evt));
+ ret = virtqueue_add_inbuf(vq, sg, 1, evt, GFP_ATOMIC);
+ if (ret)
+ dev_err(viommu->dev, "could not add event buffer\n");
+ }
+
+ virtqueue_kick(vq);
+}
+
+/* IOMMU API */
+
+static struct iommu_domain *viommu_domain_alloc(unsigned type)
+{
+ struct viommu_domain *vdomain;
+
+ if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA)
+ return NULL;
+
+ vdomain = kzalloc(sizeof(*vdomain), GFP_KERNEL);
+ if (!vdomain)
+ return NULL;
+
+ mutex_init(&vdomain->mutex);
+ spin_lock_init(&vdomain->mappings_lock);
+ vdomain->mappings = RB_ROOT_CACHED;
+
+ if (type == IOMMU_DOMAIN_DMA &&
+ iommu_get_dma_cookie(&vdomain->domain)) {
+ kfree(vdomain);
+ return NULL;
+ }
+
+ return &vdomain->domain;
+}
+
+static int viommu_domain_finalise(struct viommu_endpoint *vdev,
+ struct iommu_domain *domain)
+{
+ int ret;
+ unsigned long viommu_page_size;
+ struct viommu_dev *viommu = vdev->viommu;
+ struct viommu_domain *vdomain = to_viommu_domain(domain);
+
+ viommu_page_size = 1UL << __ffs(viommu->pgsize_bitmap);
+ if (viommu_page_size > PAGE_SIZE) {
+ dev_err(vdev->dev,
+ "granule 0x%lx larger than system page size 0x%lx\n",
+ viommu_page_size, PAGE_SIZE);
+ return -EINVAL;
+ }
+
+ ret = ida_alloc_range(&viommu->domain_ids, viommu->first_domain,
+ viommu->last_domain, GFP_KERNEL);
+ if (ret < 0)
+ return ret;
+
+ vdomain->id = (unsigned int)ret;
+
+ domain->pgsize_bitmap = viommu->pgsize_bitmap;
+ domain->geometry = viommu->geometry;
+
+ vdomain->map_flags = viommu->map_flags;
+ vdomain->viommu = viommu;
+
+ return 0;
+}
+
+static void viommu_domain_free(struct iommu_domain *domain)
+{
+ struct viommu_domain *vdomain = to_viommu_domain(domain);
+
+ iommu_put_dma_cookie(domain);
+
+ /* Free all remaining mappings (size 2^64) */
+ viommu_del_mappings(vdomain, 0, 0);
+
+ if (vdomain->viommu)
+ ida_free(&vdomain->viommu->domain_ids, vdomain->id);
+
+ kfree(vdomain);
+}
+
+static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+{
+ int i;
+ int ret = 0;
+ struct virtio_iommu_req_attach req;
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
+ struct viommu_domain *vdomain = to_viommu_domain(domain);
+
+ mutex_lock(&vdomain->mutex);
+ if (!vdomain->viommu) {
+ /*
+ * Properly initialize the domain now that we know which viommu
+ * owns it.
+ */
+ ret = viommu_domain_finalise(vdev, domain);
+ } else if (vdomain->viommu != vdev->viommu) {
+ dev_err(dev, "cannot attach to foreign vIOMMU\n");
+ ret = -EXDEV;
+ }
+ mutex_unlock(&vdomain->mutex);
+
+ if (ret)
+ return ret;
+
+ /*
+ * In the virtio-iommu device, when attaching the endpoint to a new
+ * domain, it is detached from the old one and, if as as a result the
+ * old domain isn't attached to any endpoint, all mappings are removed
+ * from the old domain and it is freed.
+ *
+ * In the driver the old domain still exists, and its mappings will be
+ * recreated if it gets reattached to an endpoint. Otherwise it will be
+ * freed explicitly.
+ *
+ * vdev->vdomain is protected by group->mutex
+ */
+ if (vdev->vdomain)
+ vdev->vdomain->nr_endpoints--;
+
+ req = (struct virtio_iommu_req_attach) {
+ .head.type = VIRTIO_IOMMU_T_ATTACH,
+ .domain = cpu_to_le32(vdomain->id),
+ };
+
+ for (i = 0; i < fwspec->num_ids; i++) {
+ req.endpoint = cpu_to_le32(fwspec->ids[i]);
+
+ ret = viommu_send_req_sync(vdomain->viommu, &req, sizeof(req));
+ if (ret)
+ return ret;
+ }
+
+ if (!vdomain->nr_endpoints) {
+ /*
+ * This endpoint is the first to be attached to the domain.
+ * Replay existing mappings (e.g. SW MSI).
+ */
+ ret = viommu_replay_mappings(vdomain);
+ if (ret)
+ return ret;
+ }
+
+ vdomain->nr_endpoints++;
+ vdev->vdomain = vdomain;
+
+ return 0;
+}
+
+static int viommu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+ int ret;
+ u32 flags;
+ struct virtio_iommu_req_map map;
+ struct viommu_domain *vdomain = to_viommu_domain(domain);
+
+ flags = (prot & IOMMU_READ ? VIRTIO_IOMMU_MAP_F_READ : 0) |
+ (prot & IOMMU_WRITE ? VIRTIO_IOMMU_MAP_F_WRITE : 0) |
+ (prot & IOMMU_MMIO ? VIRTIO_IOMMU_MAP_F_MMIO : 0);
+
+ if (flags & ~vdomain->map_flags)
+ return -EINVAL;
+
+ ret = viommu_add_mapping(vdomain, iova, paddr, size, flags);
+ if (ret)
+ return ret;
+
+ map = (struct virtio_iommu_req_map) {
+ .head.type = VIRTIO_IOMMU_T_MAP,
+ .domain = cpu_to_le32(vdomain->id),
+ .virt_start = cpu_to_le64(iova),
+ .phys_start = cpu_to_le64(paddr),
+ .virt_end = cpu_to_le64(iova + size - 1),
+ .flags = cpu_to_le32(flags),
+ };
+
+ if (!vdomain->nr_endpoints)
+ return 0;
+
+ ret = viommu_send_req_sync(vdomain->viommu, &map, sizeof(map));
+ if (ret)
+ viommu_del_mappings(vdomain, iova, size);
+
+ return ret;
+}
+
+static size_t viommu_unmap(struct iommu_domain *domain, unsigned long iova,
+ size_t size, struct iommu_iotlb_gather *gather)
+{
+ int ret = 0;
+ size_t unmapped;
+ struct virtio_iommu_req_unmap unmap;
+ struct viommu_domain *vdomain = to_viommu_domain(domain);
+
+ unmapped = viommu_del_mappings(vdomain, iova, size);
+ if (unmapped < size)
+ return 0;
+
+ /* Device already removed all mappings after detach. */
+ if (!vdomain->nr_endpoints)
+ return unmapped;
+
+ unmap = (struct virtio_iommu_req_unmap) {
+ .head.type = VIRTIO_IOMMU_T_UNMAP,
+ .domain = cpu_to_le32(vdomain->id),
+ .virt_start = cpu_to_le64(iova),
+ .virt_end = cpu_to_le64(iova + unmapped - 1),
+ };
+
+ ret = viommu_add_req(vdomain->viommu, &unmap, sizeof(unmap));
+ return ret ? 0 : unmapped;
+}
+
+static phys_addr_t viommu_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ u64 paddr = 0;
+ unsigned long flags;
+ struct viommu_mapping *mapping;
+ struct interval_tree_node *node;
+ struct viommu_domain *vdomain = to_viommu_domain(domain);
+
+ spin_lock_irqsave(&vdomain->mappings_lock, flags);
+ node = interval_tree_iter_first(&vdomain->mappings, iova, iova);
+ if (node) {
+ mapping = container_of(node, struct viommu_mapping, iova);
+ paddr = mapping->paddr + (iova - mapping->iova.start);
+ }
+ spin_unlock_irqrestore(&vdomain->mappings_lock, flags);
+
+ return paddr;
+}
+
+static void viommu_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
+{
+ struct viommu_domain *vdomain = to_viommu_domain(domain);
+
+ viommu_sync_req(vdomain->viommu);
+}
+
+static void viommu_get_resv_regions(struct device *dev, struct list_head *head)
+{
+ struct iommu_resv_region *entry, *new_entry, *msi = NULL;
+ struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
+ int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+
+ list_for_each_entry(entry, &vdev->resv_regions, list) {
+ if (entry->type == IOMMU_RESV_MSI)
+ msi = entry;
+
+ new_entry = kmemdup(entry, sizeof(*entry), GFP_KERNEL);
+ if (!new_entry)
+ return;
+ list_add_tail(&new_entry->list, head);
+ }
+
+ /*
+ * If the device didn't register any bypass MSI window, add a
+ * software-mapped region.
+ */
+ if (!msi) {
+ msi = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH,
+ prot, IOMMU_RESV_SW_MSI);
+ if (!msi)
+ return;
+
+ list_add_tail(&msi->list, head);
+ }
+
+ iommu_dma_get_resv_regions(dev, head);
+}
+
+static struct iommu_ops viommu_ops;
+static struct virtio_driver virtio_iommu_drv;
+
+static int viommu_match_node(struct device *dev, const void *data)
+{
+ return dev->parent->fwnode == data;
+}
+
+static struct viommu_dev *viommu_get_by_fwnode(struct fwnode_handle *fwnode)
+{
+ struct device *dev = driver_find_device(&virtio_iommu_drv.driver, NULL,
+ fwnode, viommu_match_node);
+ put_device(dev);
+
+ return dev ? dev_to_virtio(dev)->priv : NULL;
+}
+
+static struct iommu_device *viommu_probe_device(struct device *dev)
+{
+ int ret;
+ struct viommu_endpoint *vdev;
+ struct viommu_dev *viommu = NULL;
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+ if (!fwspec || fwspec->ops != &viommu_ops)
+ return ERR_PTR(-ENODEV);
+
+ viommu = viommu_get_by_fwnode(fwspec->iommu_fwnode);
+ if (!viommu)
+ return ERR_PTR(-ENODEV);
+
+ vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+ if (!vdev)
+ return ERR_PTR(-ENOMEM);
+
+ vdev->dev = dev;
+ vdev->viommu = viommu;
+ INIT_LIST_HEAD(&vdev->resv_regions);
+ dev_iommu_priv_set(dev, vdev);
+
+ if (viommu->probe_size) {
+ /* Get additional information for this endpoint */
+ ret = viommu_probe_endpoint(viommu, dev);
+ if (ret)
+ goto err_free_dev;
+ }
+
+ return &viommu->iommu;
+
+err_free_dev:
+ generic_iommu_put_resv_regions(dev, &vdev->resv_regions);
+ kfree(vdev);
+
+ return ERR_PTR(ret);
+}
+
+static void viommu_release_device(struct device *dev)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct viommu_endpoint *vdev;
+
+ if (!fwspec || fwspec->ops != &viommu_ops)
+ return;
+
+ vdev = dev_iommu_priv_get(dev);
+
+ generic_iommu_put_resv_regions(dev, &vdev->resv_regions);
+ kfree(vdev);
+}
+
+static struct iommu_group *viommu_device_group(struct device *dev)
+{
+ if (dev_is_pci(dev))
+ return pci_device_group(dev);
+ else
+ return generic_device_group(dev);
+}
+
+static int viommu_of_xlate(struct device *dev, struct of_phandle_args *args)
+{
+ return iommu_fwspec_add_ids(dev, args->args, 1);
+}
+
+static struct iommu_ops viommu_ops = {
+ .domain_alloc = viommu_domain_alloc,
+ .domain_free = viommu_domain_free,
+ .attach_dev = viommu_attach_dev,
+ .map = viommu_map,
+ .unmap = viommu_unmap,
+ .iova_to_phys = viommu_iova_to_phys,
+ .iotlb_sync = viommu_iotlb_sync,
+ .probe_device = viommu_probe_device,
+ .release_device = viommu_release_device,
+ .device_group = viommu_device_group,
+ .get_resv_regions = viommu_get_resv_regions,
+ .put_resv_regions = generic_iommu_put_resv_regions,
+ .of_xlate = viommu_of_xlate,
+};
+
+static int viommu_init_vqs(struct viommu_dev *viommu)
+{
+ struct virtio_device *vdev = dev_to_virtio(viommu->dev);
+ const char *names[] = { "request", "event" };
+ vq_callback_t *callbacks[] = {
+ NULL, /* No async requests */
+ viommu_event_handler,
+ };
+
+ return virtio_find_vqs(vdev, VIOMMU_NR_VQS, viommu->vqs, callbacks,
+ names, NULL);
+}
+
+static int viommu_fill_evtq(struct viommu_dev *viommu)
+{
+ int i, ret;
+ struct scatterlist sg[1];
+ struct viommu_event *evts;
+ struct virtqueue *vq = viommu->vqs[VIOMMU_EVENT_VQ];
+ size_t nr_evts = vq->num_free;
+
+ viommu->evts = evts = devm_kmalloc_array(viommu->dev, nr_evts,
+ sizeof(*evts), GFP_KERNEL);
+ if (!evts)
+ return -ENOMEM;
+
+ for (i = 0; i < nr_evts; i++) {
+ sg_init_one(sg, &evts[i], sizeof(*evts));
+ ret = virtqueue_add_inbuf(vq, sg, 1, &evts[i], GFP_KERNEL);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int viommu_probe(struct virtio_device *vdev)
+{
+ struct device *parent_dev = vdev->dev.parent;
+ struct viommu_dev *viommu = NULL;
+ struct device *dev = &vdev->dev;
+ u64 input_start = 0;
+ u64 input_end = -1UL;
+ int ret;
+
+ if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1) ||
+ !virtio_has_feature(vdev, VIRTIO_IOMMU_F_MAP_UNMAP))
+ return -ENODEV;
+
+ viommu = devm_kzalloc(dev, sizeof(*viommu), GFP_KERNEL);
+ if (!viommu)
+ return -ENOMEM;
+
+ spin_lock_init(&viommu->request_lock);
+ ida_init(&viommu->domain_ids);
+ viommu->dev = dev;
+ viommu->vdev = vdev;
+ INIT_LIST_HEAD(&viommu->requests);
+
+ ret = viommu_init_vqs(viommu);
+ if (ret)
+ return ret;
+
+ virtio_cread_le(vdev, struct virtio_iommu_config, page_size_mask,
+ &viommu->pgsize_bitmap);
+
+ if (!viommu->pgsize_bitmap) {
+ ret = -EINVAL;
+ goto err_free_vqs;
+ }
+
+ viommu->map_flags = VIRTIO_IOMMU_MAP_F_READ | VIRTIO_IOMMU_MAP_F_WRITE;
+ viommu->last_domain = ~0U;
+
+ /* Optional features */
+ virtio_cread_le_feature(vdev, VIRTIO_IOMMU_F_INPUT_RANGE,
+ struct virtio_iommu_config, input_range.start,
+ &input_start);
+
+ virtio_cread_le_feature(vdev, VIRTIO_IOMMU_F_INPUT_RANGE,
+ struct virtio_iommu_config, input_range.end,
+ &input_end);
+
+ virtio_cread_le_feature(vdev, VIRTIO_IOMMU_F_DOMAIN_RANGE,
+ struct virtio_iommu_config, domain_range.start,
+ &viommu->first_domain);
+
+ virtio_cread_le_feature(vdev, VIRTIO_IOMMU_F_DOMAIN_RANGE,
+ struct virtio_iommu_config, domain_range.end,
+ &viommu->last_domain);
+
+ virtio_cread_le_feature(vdev, VIRTIO_IOMMU_F_PROBE,
+ struct virtio_iommu_config, probe_size,
+ &viommu->probe_size);
+
+ viommu->geometry = (struct iommu_domain_geometry) {
+ .aperture_start = input_start,
+ .aperture_end = input_end,
+ .force_aperture = true,
+ };
+
+ if (virtio_has_feature(vdev, VIRTIO_IOMMU_F_MMIO))
+ viommu->map_flags |= VIRTIO_IOMMU_MAP_F_MMIO;
+
+ viommu_ops.pgsize_bitmap = viommu->pgsize_bitmap;
+
+ virtio_device_ready(vdev);
+
+ /* Populate the event queue with buffers */
+ ret = viommu_fill_evtq(viommu);
+ if (ret)
+ goto err_free_vqs;
+
+ ret = iommu_device_sysfs_add(&viommu->iommu, dev, NULL, "%s",
+ virtio_bus_name(vdev));
+ if (ret)
+ goto err_free_vqs;
+
+ iommu_device_set_ops(&viommu->iommu, &viommu_ops);
+ iommu_device_set_fwnode(&viommu->iommu, parent_dev->fwnode);
+
+ iommu_device_register(&viommu->iommu);
+
+#ifdef CONFIG_PCI
+ if (pci_bus_type.iommu_ops != &viommu_ops) {
+ ret = bus_set_iommu(&pci_bus_type, &viommu_ops);
+ if (ret)
+ goto err_unregister;
+ }
+#endif
+#ifdef CONFIG_ARM_AMBA
+ if (amba_bustype.iommu_ops != &viommu_ops) {
+ ret = bus_set_iommu(&amba_bustype, &viommu_ops);
+ if (ret)
+ goto err_unregister;
+ }
+#endif
+ if (platform_bus_type.iommu_ops != &viommu_ops) {
+ ret = bus_set_iommu(&platform_bus_type, &viommu_ops);
+ if (ret)
+ goto err_unregister;
+ }
+
+ vdev->priv = viommu;
+
+ dev_info(dev, "input address: %u bits\n",
+ order_base_2(viommu->geometry.aperture_end));
+ dev_info(dev, "page mask: %#llx\n", viommu->pgsize_bitmap);
+
+ return 0;
+
+err_unregister:
+ iommu_device_sysfs_remove(&viommu->iommu);
+ iommu_device_unregister(&viommu->iommu);
+err_free_vqs:
+ vdev->config->del_vqs(vdev);
+
+ return ret;
+}
+
+static void viommu_remove(struct virtio_device *vdev)
+{
+ struct viommu_dev *viommu = vdev->priv;
+
+ iommu_device_sysfs_remove(&viommu->iommu);
+ iommu_device_unregister(&viommu->iommu);
+
+ /* Stop all virtqueues */
+ vdev->config->reset(vdev);
+ vdev->config->del_vqs(vdev);
+
+ dev_info(&vdev->dev, "device removed\n");
+}
+
+static void viommu_config_changed(struct virtio_device *vdev)
+{
+ dev_warn(&vdev->dev, "config changed\n");
+}
+
+static unsigned int features[] = {
+ VIRTIO_IOMMU_F_MAP_UNMAP,
+ VIRTIO_IOMMU_F_INPUT_RANGE,
+ VIRTIO_IOMMU_F_DOMAIN_RANGE,
+ VIRTIO_IOMMU_F_PROBE,
+ VIRTIO_IOMMU_F_MMIO,
+};
+
+static struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_IOMMU, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+MODULE_DEVICE_TABLE(virtio, id_table);
+
+static struct virtio_driver virtio_iommu_drv = {
+ .driver.name = KBUILD_MODNAME,
+ .driver.owner = THIS_MODULE,
+ .id_table = id_table,
+ .feature_table = features,
+ .feature_table_size = ARRAY_SIZE(features),
+ .probe = viommu_probe,
+ .remove = viommu_remove,
+ .config_changed = viommu_config_changed,
+};
+
+module_virtio_driver(virtio_iommu_drv);
+
+MODULE_DESCRIPTION("Virtio IOMMU driver");
+MODULE_AUTHOR("Jean-Philippe Brucker <jean-philippe.brucker@arm.com>");
+MODULE_LICENSE("GPL v2");