53 files changed, 49291 insertions, 0 deletions
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
new file mode 100644
index 000000000..c60395b74
--- /dev/null
+++ b/drivers/iommu/Kconfig
@@ -0,0 +1,417 @@
+# IOMMU_API always gets selected by whoever wants it.
+config IOMMU_API
+	bool
+
+menuconfig IOMMU_SUPPORT
+	bool "IOMMU Hardware Support"
+	depends on MMU
+	default y
+	---help---
+	  Say Y here if you want to compile device drivers for IO Memory
+	  Management Units into the kernel. These devices usually allow to
+	  remap DMA requests and/or remap interrupts from other devices on the
+	  system.
+
+if IOMMU_SUPPORT
+
+menu "Generic IOMMU Pagetable Support"
+
+# Selected by the actual pagetable implementations
+config IOMMU_IO_PGTABLE
+	bool
+
+config IOMMU_IO_PGTABLE_LPAE
+	bool "ARMv7/v8 Long Descriptor Format"
+	select IOMMU_IO_PGTABLE
+	depends on ARM || ARM64 || (COMPILE_TEST && !GENERIC_ATOMIC64)
+	help
+	  Enable support for the ARM long descriptor pagetable format.
+	  This allocator supports 4K/2M/1G, 16K/32M and 64K/512M page
+	  sizes at both stage-1 and stage-2, as well as address spaces
+	  up to 48-bits in size.
+
+config IOMMU_IO_PGTABLE_LPAE_SELFTEST
+	bool "LPAE selftests"
+	depends on IOMMU_IO_PGTABLE_LPAE
+	help
+	  Enable self-tests for LPAE page table allocator. This performs
+	  a series of page-table consistency checks during boot.
+
+	  If unsure, say N here.
+
+config IOMMU_IO_PGTABLE_ARMV7S
+	bool "ARMv7/v8 Short Descriptor Format"
+	select IOMMU_IO_PGTABLE
+	depends on ARM || ARM64 || COMPILE_TEST
+	help
+	  Enable support for the ARM Short-descriptor pagetable format.
+	  This supports 32-bit virtual and physical addresses mapped using
+	  2-level tables with 4KB pages/1MB sections, and contiguous entries
+	  for 64KB pages/16MB supersections if indicated by the IOMMU driver.
+
+config IOMMU_IO_PGTABLE_ARMV7S_SELFTEST
+	bool "ARMv7s selftests"
+	depends on IOMMU_IO_PGTABLE_ARMV7S
+	help
+	  Enable self-tests for ARMv7s page table allocator. This performs
+	  a series of page-table consistency checks during boot.
+
+	  If unsure, say N here.
+
+endmenu
+
+config IOMMU_DEBUGFS
+	bool "Export IOMMU internals in DebugFS"
+	depends on DEBUG_FS
+	help
+	  Allows exposure of IOMMU device internals. This option enables
+	  the use of debugfs by IOMMU drivers as required. Devices can,
+	  at initialization time, cause the IOMMU code to create a top-level
+	  debug/iommu directory, and then populate a subdirectory with
+	  entries as required.
+
+config IOMMU_DEFAULT_PASSTHROUGH
+	bool "IOMMU passthrough by default"
+	depends on IOMMU_API
+        help
+	  Enable passthrough by default, removing the need to pass in
+	  iommu.passthrough=on or iommu=pt through command line. If this
+	  is enabled, you can still disable with iommu.passthrough=off
+	  or iommu=nopt depending on the architecture.
+
+	  If unsure, say N here.
+
+config IOMMU_IOVA
+	tristate
+
+config OF_IOMMU
+       def_bool y
+       depends on OF && IOMMU_API
+
+# IOMMU-agnostic DMA-mapping layer
+config IOMMU_DMA
+	bool
+	select IOMMU_API
+	select IOMMU_IOVA
+	select NEED_SG_DMA_LENGTH
+
+config FSL_PAMU
+	bool "Freescale IOMMU support"
+	depends on PCI
+	depends on PHYS_64BIT
+	depends on PPC_E500MC || (COMPILE_TEST && PPC)
+	select IOMMU_API
+	select GENERIC_ALLOCATOR
+	help
+	  Freescale PAMU support. PAMU is the IOMMU present on Freescale QorIQ platforms.
+	  PAMU can authorize memory access, remap the memory address, and remap I/O
+	  transaction types.
+
+# MSM IOMMU support
+config MSM_IOMMU
+	bool "MSM IOMMU Support"
+	depends on ARM
+	depends on ARCH_MSM8X60 || ARCH_MSM8960 || COMPILE_TEST
+	select IOMMU_API
+	select IOMMU_IO_PGTABLE_ARMV7S
+	help
+	  Support for the IOMMUs found on certain Qualcomm SOCs.
+	  These IOMMUs allow virtualization of the address space used by most
+	  cores within the multimedia subsystem.
+
+	  If unsure, say N here.
+
+config IOMMU_PGTABLES_L2
+	def_bool y
+	depends on MSM_IOMMU && MMU && SMP && CPU_DCACHE_DISABLE=n
+
+# AMD IOMMU support
+config AMD_IOMMU
+	bool "AMD IOMMU support"
+	select SWIOTLB
+	select PCI_MSI
+	select PCI_ATS
+	select PCI_PRI
+	select PCI_PASID
+	select IOMMU_API
+	select IOMMU_IOVA
+	depends on X86_64 && PCI && ACPI
+	---help---
+	  With this option you can enable support for AMD IOMMU hardware in
+	  your system. An IOMMU is a hardware component which provides
+	  remapping of DMA memory accesses from devices. With an AMD IOMMU you
+	  can isolate the DMA memory of different devices and protect the
+	  system from misbehaving device drivers or hardware.
+
+	  You can find out if your system has an AMD IOMMU if you look into
+	  your BIOS for an option to enable it or if you have an IVRS ACPI
+	  table.
+
+config AMD_IOMMU_V2
+	tristate "AMD IOMMU Version 2 driver"
+	depends on AMD_IOMMU
+	select MMU_NOTIFIER
+	---help---
+	  This option enables support for the AMD IOMMUv2 features of the IOMMU
+	  hardware. Select this option if you want to use devices that support
+	  the PCI PRI and PASID interface.
+
+config AMD_IOMMU_DEBUGFS
+	bool "Enable AMD IOMMU internals in DebugFS"
+	depends on AMD_IOMMU && IOMMU_DEBUGFS
+	---help---
+	  !!!WARNING!!!  !!!WARNING!!!  !!!WARNING!!!  !!!WARNING!!!
+
+	  DO NOT ENABLE THIS OPTION UNLESS YOU REALLY, -REALLY- KNOW WHAT YOU ARE DOING!!!
+	  Exposes AMD IOMMU device internals in DebugFS.
+
+	  This option is -NOT- intended for production environments, and should
+	  not generally be enabled.
+
+# Intel IOMMU support
+config DMAR_TABLE
+	bool
+
+config INTEL_IOMMU
+	bool "Support for Intel IOMMU using DMA Remapping Devices"
+	depends on PCI_MSI && ACPI && (X86 || IA64_GENERIC)
+	select IOMMU_API
+	select IOMMU_IOVA
+	select NEED_DMA_MAP_STATE
+	select DMAR_TABLE
+	help
+	  DMA remapping (DMAR) devices support enables independent address
+	  translations for Direct Memory Access (DMA) from devices.
+	  These DMA remapping devices are reported via ACPI tables
+	  and include PCI device scope covered by these DMA
+	  remapping devices.
+
+config INTEL_IOMMU_SVM
+	bool "Support for Shared Virtual Memory with Intel IOMMU"
+	depends on INTEL_IOMMU && X86
+	select PCI_PASID
+	select MMU_NOTIFIER
+	help
+	  Shared Virtual Memory (SVM) provides a facility for devices
+	  to access DMA resources through process address space by
+	  means of a Process Address Space ID (PASID).
+
+config INTEL_IOMMU_DEFAULT_ON
+	def_bool y
+	prompt "Enable Intel DMA Remapping Devices by default"
+	depends on INTEL_IOMMU
+	help
+	  Selecting this option will enable a DMAR device at boot time if
+	  one is found. If this option is not selected, DMAR support can
+	  be enabled by passing intel_iommu=on to the kernel.
+
+config INTEL_IOMMU_BROKEN_GFX_WA
+	bool "Workaround broken graphics drivers (going away soon)"
+	depends on INTEL_IOMMU && BROKEN && X86
+	---help---
+	  Current Graphics drivers tend to use physical address
+	  for DMA and avoid using DMA APIs. Setting this config
+	  option permits the IOMMU driver to set a unity map for
+	  all the OS-visible memory. Hence the driver can continue
+	  to use physical addresses for DMA, at least until this
+	  option is removed in the 2.6.32 kernel.
+
+config INTEL_IOMMU_FLOPPY_WA
+	def_bool y
+	depends on INTEL_IOMMU && X86
+	---help---
+	  Floppy disk drivers are known to bypass DMA API calls
+	  thereby failing to work when IOMMU is enabled. This
+	  workaround will setup a 1:1 mapping for the first
+	  16MiB to make floppy (an ISA device) work.
+
+config IRQ_REMAP
+	bool "Support for Interrupt Remapping"
+	depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI
+	select DMAR_TABLE
+	---help---
+	  Supports Interrupt remapping for IO-APIC and MSI devices.
+	  To use x2apic mode in the CPU's which support x2APIC enhancements or
+	  to support platforms with CPU's having > 8 bit APIC ID, say Y.
+
+# OMAP IOMMU support
+config OMAP_IOMMU
+	bool "OMAP IOMMU Support"
+	depends on ARM && MMU
+	depends on ARCH_OMAP2PLUS || COMPILE_TEST
+	select IOMMU_API
+	---help---
+	  The OMAP3 media platform drivers depend on iommu support,
+	  if you need them say Y here.
+
+config OMAP_IOMMU_DEBUG
+	bool "Export OMAP IOMMU internals in DebugFS"
+	depends on OMAP_IOMMU && DEBUG_FS
+	---help---
+	  Select this to see extensive information about
+	  the internal state of OMAP IOMMU in debugfs.
+
+	  Say N unless you know you need this.
+
+config ROCKCHIP_IOMMU
+	bool "Rockchip IOMMU Support"
+	depends on ARM || ARM64
+	depends on ARCH_ROCKCHIP || COMPILE_TEST
+	select IOMMU_API
+	select ARM_DMA_USE_IOMMU
+	help
+	  Support for IOMMUs found on Rockchip rk32xx SOCs.
+	  These IOMMUs allow virtualization of the address space used by most
+	  cores within the multimedia subsystem.
+	  Say Y here if you are using a Rockchip SoC that includes an IOMMU
+	  device.
+
+config TEGRA_IOMMU_GART
+	bool "Tegra GART IOMMU Support"
+	depends on ARCH_TEGRA_2x_SOC
+	select IOMMU_API
+	help
+	  Enables support for remapping discontiguous physical memory
+	  shared with the operating system into contiguous I/O virtual
+	  space through the GART (Graphics Address Relocation Table)
+	  hardware included on Tegra SoCs.
+
+config TEGRA_IOMMU_SMMU
+	bool "NVIDIA Tegra SMMU Support"
+	depends on ARCH_TEGRA
+	depends on TEGRA_AHB
+	depends on TEGRA_MC
+	select IOMMU_API
+	help
+	  This driver supports the IOMMU hardware (SMMU) found on NVIDIA Tegra
+	  SoCs (Tegra30 up to Tegra210).
+
+config EXYNOS_IOMMU
+	bool "Exynos IOMMU Support"
+	depends on ARCH_EXYNOS && MMU
+	depends on !CPU_BIG_ENDIAN # revisit driver if we can enable big-endian ptes
+	select IOMMU_API
+	select ARM_DMA_USE_IOMMU
+	help
+	  Support for the IOMMU (System MMU) of Samsung Exynos application
+	  processor family. This enables H/W multimedia accelerators to see
+	  non-linear physical memory chunks as linear memory in their
+	  address space.
+
+	  If unsure, say N here.
+
+config EXYNOS_IOMMU_DEBUG
+	bool "Debugging log for Exynos IOMMU"
+	depends on EXYNOS_IOMMU
+	help
+	  Select this to see the detailed log message that shows what
+	  happens in the IOMMU driver.
+
+	  Say N unless you need kernel log message for IOMMU debugging.
+
+config IPMMU_VMSA
+	bool "Renesas VMSA-compatible IPMMU"
+	depends on ARM || IOMMU_DMA
+	depends on ARCH_RENESAS || (COMPILE_TEST && !GENERIC_ATOMIC64)
+	select IOMMU_API
+	select IOMMU_IO_PGTABLE_LPAE
+	select ARM_DMA_USE_IOMMU
+	help
+	  Support for the Renesas VMSA-compatible IPMMU found in the R-Mobile
+	  APE6, R-Car Gen2, and R-Car Gen3 SoCs.
+
+	  If unsure, say N.
+
+config SPAPR_TCE_IOMMU
+	bool "sPAPR TCE IOMMU Support"
+	depends on PPC_POWERNV || PPC_PSERIES
+	select IOMMU_API
+	help
+	  Enables bits of IOMMU API required by VFIO. The iommu_ops
+	  is not implemented as it is not necessary for VFIO.
+
+# ARM IOMMU support
+config ARM_SMMU
+	bool "ARM Ltd. System MMU (SMMU) Support"
+	depends on (ARM64 || ARM) && MMU
+	select IOMMU_API
+	select IOMMU_IO_PGTABLE_LPAE
+	select ARM_DMA_USE_IOMMU if ARM
+	help
+	  Support for implementations of the ARM System MMU architecture
+	  versions 1 and 2.
+
+	  Say Y here if your SoC includes an IOMMU device implementing
+	  the ARM SMMU architecture.
+
+config ARM_SMMU_V3
+	bool "ARM Ltd. System MMU Version 3 (SMMUv3) Support"
+	depends on ARM64
+	select IOMMU_API
+	select IOMMU_IO_PGTABLE_LPAE
+	select GENERIC_MSI_IRQ_DOMAIN
+	help
+	  Support for implementations of the ARM System MMU architecture
+	  version 3 providing translation support to a PCIe root complex.
+
+	  Say Y here if your system includes an IOMMU device implementing
+	  the ARM SMMUv3 architecture.
+
+config S390_IOMMU
+	def_bool y if S390 && PCI
+	depends on S390 && PCI
+	select IOMMU_API
+	help
+	  Support for the IOMMU API for s390 PCI devices.
+
+config S390_CCW_IOMMU
+	bool "S390 CCW IOMMU Support"
+	depends on S390 && CCW
+	select IOMMU_API
+	help
+	  Enables bits of IOMMU API required by VFIO. The iommu_ops
+	  is not implemented as it is not necessary for VFIO.
+
+config MTK_IOMMU
+	bool "MTK IOMMU Support"
+	depends on ARM || ARM64
+	depends on ARCH_MEDIATEK || COMPILE_TEST
+	select ARM_DMA_USE_IOMMU
+	select IOMMU_API
+	select IOMMU_DMA
+	select IOMMU_IO_PGTABLE_ARMV7S
+	select MEMORY
+	select MTK_SMI
+	help
+	  Support for the M4U on certain Mediatek SOCs. M4U is MultiMedia
+	  Memory Management Unit. This option enables remapping of DMA memory
+	  accesses for the multimedia subsystem.
+
+	  If unsure, say N here.
+
+config MTK_IOMMU_V1
+	bool "MTK IOMMU Version 1 (M4U gen1) Support"
+	depends on ARM
+	depends on ARCH_MEDIATEK || COMPILE_TEST
+	select ARM_DMA_USE_IOMMU
+	select IOMMU_API
+	select MEMORY
+	select MTK_SMI
+	help
+	  Support for the M4U on certain Mediatek SoCs. M4U generation 1 HW is
+	  Multimedia Memory Managememt Unit. This option enables remapping of
+	  DMA memory accesses for the multimedia subsystem.
+
+	  if unsure, say N here.
+
+config QCOM_IOMMU
+	# Note: iommu drivers cannot (yet?) be built as modules
+	bool "Qualcomm IOMMU Support"
+	depends on ARCH_QCOM || (COMPILE_TEST && !GENERIC_ATOMIC64)
+	select IOMMU_API
+	select IOMMU_IO_PGTABLE_LPAE
+	select ARM_DMA_USE_IOMMU
+	help
+	  Support for IOMMU on certain Qualcomm SoCs.
+
+endif # IOMMU_SUPPORT
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
new file mode 100644
index 000000000..e13ea199f
--- /dev/null
+++ b/drivers/iommu/Makefile
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_IOMMU_API) += iommu.o
+obj-$(CONFIG_IOMMU_API) += iommu-traces.o
+obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
+obj-$(CONFIG_IOMMU_DEBUGFS) += iommu-debugfs.o
+obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
+obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
+obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
+obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
+obj-$(CONFIG_IOMMU_IOVA) += iova.o
+obj-$(CONFIG_OF_IOMMU)	+= of_iommu.o
+obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o
+obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o amd_iommu_quirks.o
+obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += amd_iommu_debugfs.o
+obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o
+obj-$(CONFIG_ARM_SMMU) += arm-smmu.o
+obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o
+obj-$(CONFIG_DMAR_TABLE) += dmar.o
+obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o intel-pasid.o
+obj-$(CONFIG_INTEL_IOMMU_SVM) += intel-svm.o
+obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o
+obj-$(CONFIG_IRQ_REMAP) += intel_irq_remapping.o irq_remapping.o
+obj-$(CONFIG_MTK_IOMMU) += mtk_iommu.o
+obj-$(CONFIG_MTK_IOMMU_V1) += mtk_iommu_v1.o
+obj-$(CONFIG_OMAP_IOMMU) += omap-iommu.o
+obj-$(CONFIG_OMAP_IOMMU_DEBUG) += omap-iommu-debug.o
+obj-$(CONFIG_ROCKCHIP_IOMMU) += rockchip-iommu.o
+obj-$(CONFIG_TEGRA_IOMMU_GART) += tegra-gart.o
+obj-$(CONFIG_TEGRA_IOMMU_SMMU) += tegra-smmu.o
+obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o
+obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o
+obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
+obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
new file mode 100644
index 000000000..5d5941aca
--- /dev/null
+++ b/drivers/iommu/amd_iommu.c
@@ -0,0 +1,4567 @@
+/*
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <jroedel@suse.de>
+ *         Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/ratelimit.h>
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/amba/bus.h>
+#include <linux/platform_device.h>
+#include <linux/pci-ats.h>
+#include <linux/bitmap.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/scatterlist.h>
+#include <linux/dma-mapping.h>
+#include <linux/dma-direct.h>
+#include <linux/iommu-helper.h>
+#include <linux/iommu.h>
+#include <linux/delay.h>
+#include <linux/amd-iommu.h>
+#include <linux/notifier.h>
+#include <linux/export.h>
+#include <linux/irq.h>
+#include <linux/msi.h>
+#include <linux/dma-contiguous.h>
+#include <linux/irqdomain.h>
+#include <linux/percpu.h>
+#include <linux/iova.h>
+#include <asm/irq_remapping.h>
+#include <asm/io_apic.h>
+#include <asm/apic.h>
+#include <asm/hw_irq.h>
+#include <asm/msidef.h>
+#include <asm/proto.h>
+#include <asm/iommu.h>
+#include <asm/gart.h>
+#include <asm/dma.h>
+
+#include "amd_iommu_proto.h"
+#include "amd_iommu_types.h"
+#include "irq_remapping.h"
+
+#define AMD_IOMMU_MAPPING_ERROR	0
+
+#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
+
+#define LOOP_TIMEOUT	100000
+
+/* IO virtual address start page frame number */
+#define IOVA_START_PFN		(1)
+#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
+
+/* Reserved IOVA ranges */
+#define MSI_RANGE_START		(0xfee00000)
+#define MSI_RANGE_END		(0xfeefffff)
+#define HT_RANGE_START		(0xfd00000000ULL)
+#define HT_RANGE_END		(0xffffffffffULL)
+
+/*
+ * This bitmap is used to advertise the page sizes our hardware support
+ * to the IOMMU core, which will then use this information to split
+ * physically contiguous memory regions it is mapping into page sizes
+ * that we support.
+ *
+ * 512GB Pages are not supported due to a hardware bug
+ */
+#define AMD_IOMMU_PGSIZES	((~0xFFFUL) & ~(2ULL << 38))
+
+static DEFINE_SPINLOCK(amd_iommu_devtable_lock);
+static DEFINE_SPINLOCK(pd_bitmap_lock);
+
+/* List of all available dev_data structures */
+static LLIST_HEAD(dev_data_list);
+
+LIST_HEAD(ioapic_map);
+LIST_HEAD(hpet_map);
+LIST_HEAD(acpihid_map);
+
+/*
+ * Domain for untranslated devices - only allocated
+ * if iommu=pt passed on kernel cmd line.
+ */
+const struct iommu_ops amd_iommu_ops;
+
+static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
+int amd_iommu_max_glx_val = -1;
+
+static const struct dma_map_ops amd_iommu_dma_ops;
+
+/*
+ * general struct to manage commands send to an IOMMU
+ */
+struct iommu_cmd {
+	u32 data[4];
+};
+
+struct kmem_cache *amd_iommu_irq_cache;
+
+static void update_domain(struct protection_domain *domain);
+static int protection_domain_init(struct protection_domain *domain);
+static void detach_device(struct device *dev);
+static void iova_domain_flush_tlb(struct iova_domain *iovad);
+
+/*
+ * Data container for a dma_ops specific protection domain
+ */
+struct dma_ops_domain {
+	/* generic protection domain information */
+	struct protection_domain domain;
+
+	/* IOVA RB-Tree */
+	struct iova_domain iovad;
+};
+
+static struct iova_domain reserved_iova_ranges;
+static struct lock_class_key reserved_rbtree_key;
+
+/****************************************************************************
+ *
+ * Helper functions
+ *
+ ****************************************************************************/
+
+static inline int match_hid_uid(struct device *dev,
+				struct acpihid_map_entry *entry)
+{
+	struct acpi_device *adev = ACPI_COMPANION(dev);
+	const char *hid, *uid;
+
+	if (!adev)
+		return -ENODEV;
+
+	hid = acpi_device_hid(adev);
+	uid = acpi_device_uid(adev);
+
+	if (!hid || !(*hid))
+		return -ENODEV;
+
+	if (!uid || !(*uid))
+		return strcmp(hid, entry->hid);
+
+	if (!(*entry->uid))
+		return strcmp(hid, entry->hid);
+
+	return (strcmp(hid, entry->hid) || strcmp(uid, entry->uid));
+}
+
+static inline u16 get_pci_device_id(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	return PCI_DEVID(pdev->bus->number, pdev->devfn);
+}
+
+static inline int get_acpihid_device_id(struct device *dev,
+					struct acpihid_map_entry **entry)
+{
+	struct acpihid_map_entry *p;
+
+	list_for_each_entry(p, &acpihid_map, list) {
+		if (!match_hid_uid(dev, p)) {
+			if (entry)
+				*entry = p;
+			return p->devid;
+		}
+	}
+	return -EINVAL;
+}
+
+static inline int get_device_id(struct device *dev)
+{
+	int devid;
+
+	if (dev_is_pci(dev))
+		devid = get_pci_device_id(dev);
+	else
+		devid = get_acpihid_device_id(dev, NULL);
+
+	return devid;
+}
+
+static struct protection_domain *to_pdomain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct protection_domain, domain);
+}
+
+static struct dma_ops_domain* to_dma_ops_domain(struct protection_domain *domain)
+{
+	BUG_ON(domain->flags != PD_DMA_OPS_MASK);
+	return container_of(domain, struct dma_ops_domain, domain);
+}
+
+static struct iommu_dev_data *alloc_dev_data(u16 devid)
+{
+	struct iommu_dev_data *dev_data;
+
+	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
+	if (!dev_data)
+		return NULL;
+
+	dev_data->devid = devid;
+	ratelimit_default_init(&dev_data->rs);
+
+	llist_add(&dev_data->dev_data_list, &dev_data_list);
+	return dev_data;
+}
+
+static struct iommu_dev_data *search_dev_data(u16 devid)
+{
+	struct iommu_dev_data *dev_data;
+	struct llist_node *node;
+
+	if (llist_empty(&dev_data_list))
+		return NULL;
+
+	node = dev_data_list.first;
+	llist_for_each_entry(dev_data, node, dev_data_list) {
+		if (dev_data->devid == devid)
+			return dev_data;
+	}
+
+	return NULL;
+}
+
+static int __last_alias(struct pci_dev *pdev, u16 alias, void *data)
+{
+	*(u16 *)data = alias;
+	return 0;
+}
+
+static u16 get_alias(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	u16 devid, ivrs_alias, pci_alias;
+
+	/* The callers make sure that get_device_id() does not fail here */
+	devid = get_device_id(dev);
+
+	/* For ACPI HID devices, we simply return the devid as such */
+	if (!dev_is_pci(dev))
+		return devid;
+
+	ivrs_alias = amd_iommu_alias_table[devid];
+
+	pci_for_each_dma_alias(pdev, __last_alias, &pci_alias);
+
+	if (ivrs_alias == pci_alias)
+		return ivrs_alias;
+
+	/*
+	 * DMA alias showdown
+	 *
+	 * The IVRS is fairly reliable in telling us about aliases, but it
+	 * can't know about every screwy device.  If we don't have an IVRS
+	 * reported alias, use the PCI reported alias.  In that case we may
+	 * still need to initialize the rlookup and dev_table entries if the
+	 * alias is to a non-existent device.
+	 */
+	if (ivrs_alias == devid) {
+		if (!amd_iommu_rlookup_table[pci_alias]) {
+			amd_iommu_rlookup_table[pci_alias] =
+				amd_iommu_rlookup_table[devid];
+			memcpy(amd_iommu_dev_table[pci_alias].data,
+			       amd_iommu_dev_table[devid].data,
+			       sizeof(amd_iommu_dev_table[pci_alias].data));
+		}
+
+		return pci_alias;
+	}
+
+	pr_info("AMD-Vi: Using IVRS reported alias %02x:%02x.%d "
+		"for device %s[%04x:%04x], kernel reported alias "
+		"%02x:%02x.%d\n", PCI_BUS_NUM(ivrs_alias), PCI_SLOT(ivrs_alias),
+		PCI_FUNC(ivrs_alias), dev_name(dev), pdev->vendor, pdev->device,
+		PCI_BUS_NUM(pci_alias), PCI_SLOT(pci_alias),
+		PCI_FUNC(pci_alias));
+
+	/*
+	 * If we don't have a PCI DMA alias and the IVRS alias is on the same
+	 * bus, then the IVRS table may know about a quirk that we don't.
+	 */
+	if (pci_alias == devid &&
+	    PCI_BUS_NUM(ivrs_alias) == pdev->bus->number) {
+		pci_add_dma_alias(pdev, ivrs_alias & 0xff);
+		pr_info("AMD-Vi: Added PCI DMA alias %02x.%d for %s\n",
+			PCI_SLOT(ivrs_alias), PCI_FUNC(ivrs_alias),
+			dev_name(dev));
+	}
+
+	return ivrs_alias;
+}
+
+static struct iommu_dev_data *find_dev_data(u16 devid)
+{
+	struct iommu_dev_data *dev_data;
+	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+	dev_data = search_dev_data(devid);
+
+	if (dev_data == NULL) {
+		dev_data = alloc_dev_data(devid);
+		if (!dev_data)
+			return NULL;
+
+		if (translation_pre_enabled(iommu))
+			dev_data->defer_attach = true;
+	}
+
+	return dev_data;
+}
+
+struct iommu_dev_data *get_dev_data(struct device *dev)
+{
+	return dev->archdata.iommu;
+}
+EXPORT_SYMBOL(get_dev_data);
+
+/*
+* Find or create an IOMMU group for a acpihid device.
+*/
+static struct iommu_group *acpihid_device_group(struct device *dev)
+{
+	struct acpihid_map_entry *p, *entry = NULL;
+	int devid;
+
+	devid = get_acpihid_device_id(dev, &entry);
+	if (devid < 0)
+		return ERR_PTR(devid);
+
+	list_for_each_entry(p, &acpihid_map, list) {
+		if ((devid == p->devid) && p->group)
+			entry->group = p->group;
+	}
+
+	if (!entry->group)
+		entry->group = generic_device_group(dev);
+	else
+		iommu_group_ref_get(entry->group);
+
+	return entry->group;
+}
+
+static bool pci_iommuv2_capable(struct pci_dev *pdev)
+{
+	static const int caps[] = {
+		PCI_EXT_CAP_ID_ATS,
+		PCI_EXT_CAP_ID_PRI,
+		PCI_EXT_CAP_ID_PASID,
+	};
+	int i, pos;
+
+	if (pci_ats_disabled())
+		return false;
+
+	for (i = 0; i < 3; ++i) {
+		pos = pci_find_ext_capability(pdev, caps[i]);
+		if (pos == 0)
+			return false;
+	}
+
+	return true;
+}
+
+static bool pdev_pri_erratum(struct pci_dev *pdev, u32 erratum)
+{
+	struct iommu_dev_data *dev_data;
+
+	dev_data = get_dev_data(&pdev->dev);
+
+	return dev_data->errata & (1 << erratum) ? true : false;
+}
+
+/*
+ * This function checks if the driver got a valid device from the caller to
+ * avoid dereferencing invalid pointers.
+ */
+static bool check_device(struct device *dev)
+{
+	int devid;
+
+	if (!dev || !dev->dma_mask)
+		return false;
+
+	devid = get_device_id(dev);
+	if (devid < 0)
+		return false;
+
+	/* Out of our scope? */
+	if (devid > amd_iommu_last_bdf)
+		return false;
+
+	if (amd_iommu_rlookup_table[devid] == NULL)
+		return false;
+
+	return true;
+}
+
+static void init_iommu_group(struct device *dev)
+{
+	struct iommu_group *group;
+
+	group = iommu_group_get_for_dev(dev);
+	if (IS_ERR(group))
+		return;
+
+	iommu_group_put(group);
+}
+
+static int iommu_init_device(struct device *dev)
+{
+	struct iommu_dev_data *dev_data;
+	struct amd_iommu *iommu;
+	int devid;
+
+	if (dev->archdata.iommu)
+		return 0;
+
+	devid = get_device_id(dev);
+	if (devid < 0)
+		return devid;
+
+	iommu = amd_iommu_rlookup_table[devid];
+
+	dev_data = find_dev_data(devid);
+	if (!dev_data)
+		return -ENOMEM;
+
+	dev_data->alias = get_alias(dev);
+
+	/*
+	 * By default we use passthrough mode for IOMMUv2 capable device.
+	 * But if amd_iommu=force_isolation is set (e.g. to debug DMA to
+	 * invalid address), we ignore the capability for the device so
+	 * it'll be forced to go into translation mode.
+	 */
+	if ((iommu_pass_through || !amd_iommu_force_isolation) &&
+	    dev_is_pci(dev) && pci_iommuv2_capable(to_pci_dev(dev))) {
+		struct amd_iommu *iommu;
+
+		iommu = amd_iommu_rlookup_table[dev_data->devid];
+		dev_data->iommu_v2 = iommu->is_iommu_v2;
+	}
+
+	dev->archdata.iommu = dev_data;
+
+	iommu_device_link(&iommu->iommu, dev);
+
+	return 0;
+}
+
+static void iommu_ignore_device(struct device *dev)
+{
+	u16 alias;
+	int devid;
+
+	devid = get_device_id(dev);
+	if (devid < 0)
+		return;
+
+	alias = get_alias(dev);
+
+	memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry));
+	memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry));
+
+	amd_iommu_rlookup_table[devid] = NULL;
+	amd_iommu_rlookup_table[alias] = NULL;
+}
+
+static void iommu_uninit_device(struct device *dev)
+{
+	struct iommu_dev_data *dev_data;
+	struct amd_iommu *iommu;
+	int devid;
+
+	devid = get_device_id(dev);
+	if (devid < 0)
+		return;
+
+	iommu = amd_iommu_rlookup_table[devid];
+
+	dev_data = search_dev_data(devid);
+	if (!dev_data)
+		return;
+
+	if (dev_data->domain)
+		detach_device(dev);
+
+	iommu_device_unlink(&iommu->iommu, dev);
+
+	iommu_group_remove_device(dev);
+
+	/* Remove dma-ops */
+	dev->dma_ops = NULL;
+
+	/*
+	 * We keep dev_data around for unplugged devices and reuse it when the
+	 * device is re-plugged - not doing so would introduce a ton of races.
+	 */
+}
+
+/****************************************************************************
+ *
+ * Interrupt handling functions
+ *
+ ****************************************************************************/
+
+static void dump_dte_entry(u16 devid)
+{
+	int i;
+
+	for (i = 0; i < 4; ++i)
+		pr_err("AMD-Vi: DTE[%d]: %016llx\n", i,
+			amd_iommu_dev_table[devid].data[i]);
+}
+
+static void dump_command(unsigned long phys_addr)
+{
+	struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr);
+	int i;
+
+	for (i = 0; i < 4; ++i)
+		pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
+}
+
+static void amd_iommu_report_page_fault(u16 devid, u16 domain_id,
+					u64 address, int flags)
+{
+	struct iommu_dev_data *dev_data = NULL;
+	struct pci_dev *pdev;
+
+	pdev = pci_get_domain_bus_and_slot(0, PCI_BUS_NUM(devid),
+					   devid & 0xff);
+	if (pdev)
+		dev_data = get_dev_data(&pdev->dev);
+
+	if (dev_data && __ratelimit(&dev_data->rs)) {
+		dev_err(&pdev->dev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%016llx flags=0x%04x]\n",
+			domain_id, address, flags);
+	} else if (printk_ratelimit()) {
+		pr_err("AMD-Vi: Event logged [IO_PAGE_FAULT device=%02x:%02x.%x domain=0x%04x address=0x%016llx flags=0x%04x]\n",
+			PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+			domain_id, address, flags);
+	}
+
+	if (pdev)
+		pci_dev_put(pdev);
+}
+
+static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
+{
+	struct device *dev = iommu->iommu.dev;
+	int type, devid, pasid, flags, tag;
+	volatile u32 *event = __evt;
+	int count = 0;
+	u64 address;
+
+retry:
+	type    = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
+	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
+	pasid   = PPR_PASID(*(u64 *)&event[0]);
+	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
+	address = (u64)(((u64)event[3]) << 32) | event[2];
+
+	if (type == 0) {
+		/* Did we hit the erratum? */
+		if (++count == LOOP_TIMEOUT) {
+			pr_err("AMD-Vi: No event written to event log\n");
+			return;
+		}
+		udelay(1);
+		goto retry;
+	}
+
+	if (type == EVENT_TYPE_IO_FAULT) {
+		amd_iommu_report_page_fault(devid, pasid, address, flags);
+		return;
+	}
+
+	switch (type) {
+	case EVENT_TYPE_ILL_DEV:
+		dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x pasid=0x%05x address=0x%016llx flags=0x%04x]\n",
+			PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+			pasid, address, flags);
+		dump_dte_entry(devid);
+		break;
+	case EVENT_TYPE_DEV_TAB_ERR:
+		dev_err(dev, "Event logged [DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
+			"address=0x%016llx flags=0x%04x]\n",
+			PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+			address, flags);
+		break;
+	case EVENT_TYPE_PAGE_TAB_ERR:
+		dev_err(dev, "Event logged [PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x domain=0x%04x address=0x%016llx flags=0x%04x]\n",
+			PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+			pasid, address, flags);
+		break;
+	case EVENT_TYPE_ILL_CMD:
+		dev_err(dev, "Event logged [ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
+		dump_command(address);
+		break;
+	case EVENT_TYPE_CMD_HARD_ERR:
+		dev_err(dev, "Event logged [COMMAND_HARDWARE_ERROR address=0x%016llx flags=0x%04x]\n",
+			address, flags);
+		break;
+	case EVENT_TYPE_IOTLB_INV_TO:
+		dev_err(dev, "Event logged [IOTLB_INV_TIMEOUT device=%02x:%02x.%x address=0x%016llx]\n",
+			PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+			address);
+		break;
+	case EVENT_TYPE_INV_DEV_REQ:
+		dev_err(dev, "Event logged [INVALID_DEVICE_REQUEST device=%02x:%02x.%x pasid=0x%05x address=0x%016llx flags=0x%04x]\n",
+			PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+			pasid, address, flags);
+		break;
+	case EVENT_TYPE_INV_PPR_REQ:
+		pasid = ((event[0] >> 16) & 0xFFFF)
+			| ((event[1] << 6) & 0xF0000);
+		tag = event[1] & 0x03FF;
+		dev_err(dev, "Event logged [INVALID_PPR_REQUEST device=%02x:%02x.%x pasid=0x%05x address=0x%016llx flags=0x%04x]\n",
+			PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+			pasid, address, flags);
+		break;
+	default:
+		dev_err(dev, "Event logged [UNKNOWN event[0]=0x%08x event[1]=0x%08x event[2]=0x%08x event[3]=0x%08x\n",
+			event[0], event[1], event[2], event[3]);
+	}
+
+	memset(__evt, 0, 4 * sizeof(u32));
+}
+
+static void iommu_poll_events(struct amd_iommu *iommu)
+{
+	u32 head, tail;
+
+	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
+
+	while (head != tail) {
+		iommu_print_event(iommu, iommu->evt_buf + head);
+		head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE;
+	}
+
+	writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+}
+
+static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw)
+{
+	struct amd_iommu_fault fault;
+
+	if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) {
+		pr_err_ratelimited("AMD-Vi: Unknown PPR request received\n");
+		return;
+	}
+
+	fault.address   = raw[1];
+	fault.pasid     = PPR_PASID(raw[0]);
+	fault.device_id = PPR_DEVID(raw[0]);
+	fault.tag       = PPR_TAG(raw[0]);
+	fault.flags     = PPR_FLAGS(raw[0]);
+
+	atomic_notifier_call_chain(&ppr_notifier, 0, &fault);
+}
+
+static void iommu_poll_ppr_log(struct amd_iommu *iommu)
+{
+	u32 head, tail;
+
+	if (iommu->ppr_log == NULL)
+		return;
+
+	head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
+	tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
+
+	while (head != tail) {
+		volatile u64 *raw;
+		u64 entry[2];
+		int i;
+
+		raw = (u64 *)(iommu->ppr_log + head);
+
+		/*
+		 * Hardware bug: Interrupt may arrive before the entry is
+		 * written to memory. If this happens we need to wait for the
+		 * entry to arrive.
+		 */
+		for (i = 0; i < LOOP_TIMEOUT; ++i) {
+			if (PPR_REQ_TYPE(raw[0]) != 0)
+				break;
+			udelay(1);
+		}
+
+		/* Avoid memcpy function-call overhead */
+		entry[0] = raw[0];
+		entry[1] = raw[1];
+
+		/*
+		 * To detect the hardware bug we need to clear the entry
+		 * back to zero.
+		 */
+		raw[0] = raw[1] = 0UL;
+
+		/* Update head pointer of hardware ring-buffer */
+		head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
+		writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
+
+		/* Handle PPR entry */
+		iommu_handle_ppr_entry(iommu, entry);
+
+		/* Refresh ring-buffer information */
+		head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
+		tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
+	}
+}
+
+#ifdef CONFIG_IRQ_REMAP
+static int (*iommu_ga_log_notifier)(u32);
+
+int amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
+{
+	iommu_ga_log_notifier = notifier;
+
+	return 0;
+}
+EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier);
+
+static void iommu_poll_ga_log(struct amd_iommu *iommu)
+{
+	u32 head, tail, cnt = 0;
+
+	if (iommu->ga_log == NULL)
+		return;
+
+	head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
+	tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
+
+	while (head != tail) {
+		volatile u64 *raw;
+		u64 log_entry;
+
+		raw = (u64 *)(iommu->ga_log + head);
+		cnt++;
+
+		/* Avoid memcpy function-call overhead */
+		log_entry = *raw;
+
+		/* Update head pointer of hardware ring-buffer */
+		head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE;
+		writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
+
+		/* Handle GA entry */
+		switch (GA_REQ_TYPE(log_entry)) {
+		case GA_GUEST_NR:
+			if (!iommu_ga_log_notifier)
+				break;
+
+			pr_debug("AMD-Vi: %s: devid=%#x, ga_tag=%#x\n",
+				 __func__, GA_DEVID(log_entry),
+				 GA_TAG(log_entry));
+
+			if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0)
+				pr_err("AMD-Vi: GA log notifier failed.\n");
+			break;
+		default:
+			break;
+		}
+	}
+}
+#endif /* CONFIG_IRQ_REMAP */
+
+#define AMD_IOMMU_INT_MASK	\
+	(MMIO_STATUS_EVT_INT_MASK | \
+	 MMIO_STATUS_PPR_INT_MASK | \
+	 MMIO_STATUS_GALOG_INT_MASK)
+
+irqreturn_t amd_iommu_int_thread(int irq, void *data)
+{
+	struct amd_iommu *iommu = (struct amd_iommu *) data;
+	u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+
+	while (status & AMD_IOMMU_INT_MASK) {
+		/* Enable EVT and PPR and GA interrupts again */
+		writel(AMD_IOMMU_INT_MASK,
+			iommu->mmio_base + MMIO_STATUS_OFFSET);
+
+		if (status & MMIO_STATUS_EVT_INT_MASK) {
+			pr_devel("AMD-Vi: Processing IOMMU Event Log\n");
+			iommu_poll_events(iommu);
+		}
+
+		if (status & MMIO_STATUS_PPR_INT_MASK) {
+			pr_devel("AMD-Vi: Processing IOMMU PPR Log\n");
+			iommu_poll_ppr_log(iommu);
+		}
+
+#ifdef CONFIG_IRQ_REMAP
+		if (status & MMIO_STATUS_GALOG_INT_MASK) {
+			pr_devel("AMD-Vi: Processing IOMMU GA Log\n");
+			iommu_poll_ga_log(iommu);
+		}
+#endif
+
+		/*
+		 * Hardware bug: ERBT1312
+		 * When re-enabling interrupt (by writing 1
+		 * to clear the bit), the hardware might also try to set
+		 * the interrupt bit in the event status register.
+		 * In this scenario, the bit will be set, and disable
+		 * subsequent interrupts.
+		 *
+		 * Workaround: The IOMMU driver should read back the
+		 * status register and check if the interrupt bits are cleared.
+		 * If not, driver will need to go through the interrupt handler
+		 * again and re-clear the bits
+		 */
+		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+	}
+	return IRQ_HANDLED;
+}
+
+irqreturn_t amd_iommu_int_handler(int irq, void *data)
+{
+	return IRQ_WAKE_THREAD;
+}
+
+/****************************************************************************
+ *
+ * IOMMU command queuing functions
+ *
+ ****************************************************************************/
+
+static int wait_on_sem(volatile u64 *sem)
+{
+	int i = 0;
+
+	while (*sem == 0 && i < LOOP_TIMEOUT) {
+		udelay(1);
+		i += 1;
+	}
+
+	if (i == LOOP_TIMEOUT) {
+		pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static void copy_cmd_to_buffer(struct amd_iommu *iommu,
+			       struct iommu_cmd *cmd)
+{
+	u8 *target;
+
+	target = iommu->cmd_buf + iommu->cmd_buf_tail;
+
+	iommu->cmd_buf_tail += sizeof(*cmd);
+	iommu->cmd_buf_tail %= CMD_BUFFER_SIZE;
+
+	/* Copy command to buffer */
+	memcpy(target, cmd, sizeof(*cmd));
+
+	/* Tell the IOMMU about it */
+	writel(iommu->cmd_buf_tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+}
+
+static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
+{
+	u64 paddr = iommu_virt_to_phys((void *)address);
+
+	WARN_ON(address & 0x7ULL);
+
+	memset(cmd, 0, sizeof(*cmd));
+	cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
+	cmd->data[1] = upper_32_bits(paddr);
+	cmd->data[2] = 1;
+	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
+}
+
+static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
+{
+	memset(cmd, 0, sizeof(*cmd));
+	cmd->data[0] = devid;
+	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
+}
+
+static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
+				  size_t size, u16 domid, int pde)
+{
+	u64 pages;
+	bool s;
+
+	pages = iommu_num_pages(address, size, PAGE_SIZE);
+	s     = false;
+
+	if (pages > 1) {
+		/*
+		 * If we have to flush more than one page, flush all
+		 * TLB entries for this domain
+		 */
+		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+		s = true;
+	}
+
+	address &= PAGE_MASK;
+
+	memset(cmd, 0, sizeof(*cmd));
+	cmd->data[1] |= domid;
+	cmd->data[2]  = lower_32_bits(address);
+	cmd->data[3]  = upper_32_bits(address);
+	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
+	if (s) /* size bit - we flush more than one 4kb page */
+		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+	if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
+		cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+}
+
+static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
+				  u64 address, size_t size)
+{
+	u64 pages;
+	bool s;
+
+	pages = iommu_num_pages(address, size, PAGE_SIZE);
+	s     = false;
+
+	if (pages > 1) {
+		/*
+		 * If we have to flush more than one page, flush all
+		 * TLB entries for this domain
+		 */
+		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+		s = true;
+	}
+
+	address &= PAGE_MASK;
+
+	memset(cmd, 0, sizeof(*cmd));
+	cmd->data[0]  = devid;
+	cmd->data[0] |= (qdep & 0xff) << 24;
+	cmd->data[1]  = devid;
+	cmd->data[2]  = lower_32_bits(address);
+	cmd->data[3]  = upper_32_bits(address);
+	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
+	if (s)
+		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+}
+
+static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, int pasid,
+				  u64 address, bool size)
+{
+	memset(cmd, 0, sizeof(*cmd));
+
+	address &= ~(0xfffULL);
+
+	cmd->data[0]  = pasid;
+	cmd->data[1]  = domid;
+	cmd->data[2]  = lower_32_bits(address);
+	cmd->data[3]  = upper_32_bits(address);
+	cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+	cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
+	if (size)
+		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
+}
+
+static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, int pasid,
+				  int qdep, u64 address, bool size)
+{
+	memset(cmd, 0, sizeof(*cmd));
+
+	address &= ~(0xfffULL);
+
+	cmd->data[0]  = devid;
+	cmd->data[0] |= ((pasid >> 8) & 0xff) << 16;
+	cmd->data[0] |= (qdep  & 0xff) << 24;
+	cmd->data[1]  = devid;
+	cmd->data[1] |= (pasid & 0xff) << 16;
+	cmd->data[2]  = lower_32_bits(address);
+	cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
+	cmd->data[3]  = upper_32_bits(address);
+	if (size)
+		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
+}
+
+static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, int pasid,
+			       int status, int tag, bool gn)
+{
+	memset(cmd, 0, sizeof(*cmd));
+
+	cmd->data[0]  = devid;
+	if (gn) {
+		cmd->data[1]  = pasid;
+		cmd->data[2]  = CMD_INV_IOMMU_PAGES_GN_MASK;
+	}
+	cmd->data[3]  = tag & 0x1ff;
+	cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;
+
+	CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
+}
+
+static void build_inv_all(struct iommu_cmd *cmd)
+{
+	memset(cmd, 0, sizeof(*cmd));
+	CMD_SET_TYPE(cmd, CMD_INV_ALL);
+}
+
+static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
+{
+	memset(cmd, 0, sizeof(*cmd));
+	cmd->data[0] = devid;
+	CMD_SET_TYPE(cmd, CMD_INV_IRT);
+}
+
+/*
+ * Writes the command to the IOMMUs command buffer and informs the
+ * hardware about the new command.
+ */
+static int __iommu_queue_command_sync(struct amd_iommu *iommu,
+				      struct iommu_cmd *cmd,
+				      bool sync)
+{
+	unsigned int count = 0;
+	u32 left, next_tail;
+
+	next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
+again:
+	left      = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE;
+
+	if (left <= 0x20) {
+		/* Skip udelay() the first time around */
+		if (count++) {
+			if (count == LOOP_TIMEOUT) {
+				pr_err("AMD-Vi: Command buffer timeout\n");
+				return -EIO;
+			}
+
+			udelay(1);
+		}
+
+		/* Update head and recheck remaining space */
+		iommu->cmd_buf_head = readl(iommu->mmio_base +
+					    MMIO_CMD_HEAD_OFFSET);
+
+		goto again;
+	}
+
+	copy_cmd_to_buffer(iommu, cmd);
+
+	/* Do we need to make sure all commands are processed? */
+	iommu->need_sync = sync;
+
+	return 0;
+}
+
+static int iommu_queue_command_sync(struct amd_iommu *iommu,
+				    struct iommu_cmd *cmd,
+				    bool sync)
+{
+	unsigned long flags;
+	int ret;
+
+	raw_spin_lock_irqsave(&iommu->lock, flags);
+	ret = __iommu_queue_command_sync(iommu, cmd, sync);
+	raw_spin_unlock_irqrestore(&iommu->lock, flags);
+
+	return ret;
+}
+
+static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
+{
+	return iommu_queue_command_sync(iommu, cmd, true);
+}
+
+/*
+ * This function queues a completion wait command into the command
+ * buffer of an IOMMU
+ */
+static int iommu_completion_wait(struct amd_iommu *iommu)
+{
+	struct iommu_cmd cmd;
+	unsigned long flags;
+	int ret;
+
+	if (!iommu->need_sync)
+		return 0;
+
+
+	build_completion_wait(&cmd, (u64)&iommu->cmd_sem);
+
+	raw_spin_lock_irqsave(&iommu->lock, flags);
+
+	iommu->cmd_sem = 0;
+
+	ret = __iommu_queue_command_sync(iommu, &cmd, false);
+	if (ret)
+		goto out_unlock;
+
+	ret = wait_on_sem(&iommu->cmd_sem);
+
+out_unlock:
+	raw_spin_unlock_irqrestore(&iommu->lock, flags);
+
+	return ret;
+}
+
+static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
+{
+	struct iommu_cmd cmd;
+
+	build_inv_dte(&cmd, devid);
+
+	return iommu_queue_command(iommu, &cmd);
+}
+
+static void amd_iommu_flush_dte_all(struct amd_iommu *iommu)
+{
+	u32 devid;
+
+	for (devid = 0; devid <= 0xffff; ++devid)
+		iommu_flush_dte(iommu, devid);
+
+	iommu_completion_wait(iommu);
+}
+
+/*
+ * This function uses heavy locking and may disable irqs for some time. But
+ * this is no issue because it is only called during resume.
+ */
+static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu)
+{
+	u32 dom_id;
+
+	for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
+		struct iommu_cmd cmd;
+		build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
+				      dom_id, 1);
+		iommu_queue_command(iommu, &cmd);
+	}
+
+	iommu_completion_wait(iommu);
+}
+
+static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id)
+{
+	struct iommu_cmd cmd;
+
+	build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
+			      dom_id, 1);
+	iommu_queue_command(iommu, &cmd);
+
+	iommu_completion_wait(iommu);
+}
+
+static void amd_iommu_flush_all(struct amd_iommu *iommu)
+{
+	struct iommu_cmd cmd;
+
+	build_inv_all(&cmd);
+
+	iommu_queue_command(iommu, &cmd);
+	iommu_completion_wait(iommu);
+}
+
+static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid)
+{
+	struct iommu_cmd cmd;
+
+	build_inv_irt(&cmd, devid);
+
+	iommu_queue_command(iommu, &cmd);
+}
+
+static void amd_iommu_flush_irt_all(struct amd_iommu *iommu)
+{
+	u32 devid;
+
+	for (devid = 0; devid <= MAX_DEV_TABLE_ENTRIES; devid++)
+		iommu_flush_irt(iommu, devid);
+
+	iommu_completion_wait(iommu);
+}
+
+void iommu_flush_all_caches(struct amd_iommu *iommu)
+{
+	if (iommu_feature(iommu, FEATURE_IA)) {
+		amd_iommu_flush_all(iommu);
+	} else {
+		amd_iommu_flush_dte_all(iommu);
+		amd_iommu_flush_irt_all(iommu);
+		amd_iommu_flush_tlb_all(iommu);
+	}
+}
+
+/*
+ * Command send function for flushing on-device TLB
+ */
+static int device_flush_iotlb(struct iommu_dev_data *dev_data,
+			      u64 address, size_t size)
+{
+	struct amd_iommu *iommu;
+	struct iommu_cmd cmd;
+	int qdep;
+
+	qdep     = dev_data->ats.qdep;
+	iommu    = amd_iommu_rlookup_table[dev_data->devid];
+
+	build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, size);
+
+	return iommu_queue_command(iommu, &cmd);
+}
+
+/*
+ * Command send function for invalidating a device table entry
+ */
+static int device_flush_dte(struct iommu_dev_data *dev_data)
+{
+	struct amd_iommu *iommu;
+	u16 alias;
+	int ret;
+
+	iommu = amd_iommu_rlookup_table[dev_data->devid];
+	alias = dev_data->alias;
+
+	ret = iommu_flush_dte(iommu, dev_data->devid);
+	if (!ret && alias != dev_data->devid)
+		ret = iommu_flush_dte(iommu, alias);
+	if (ret)
+		return ret;
+
+	if (dev_data->ats.enabled)
+		ret = device_flush_iotlb(dev_data, 0, ~0UL);
+
+	return ret;
+}
+
+/*
+ * TLB invalidation function which is called from the mapping functions.
+ * It invalidates a single PTE if the range to flush is within a single
+ * page. Otherwise it flushes the whole TLB of the IOMMU.
+ */
+static void __domain_flush_pages(struct protection_domain *domain,
+				 u64 address, size_t size, int pde)
+{
+	struct iommu_dev_data *dev_data;
+	struct iommu_cmd cmd;
+	int ret = 0, i;
+
+	build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
+
+	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
+		if (!domain->dev_iommu[i])
+			continue;
+
+		/*
+		 * Devices of this domain are behind this IOMMU
+		 * We need a TLB flush
+		 */
+		ret |= iommu_queue_command(amd_iommus[i], &cmd);
+	}
+
+	list_for_each_entry(dev_data, &domain->dev_list, list) {
+
+		if (!dev_data->ats.enabled)
+			continue;
+
+		ret |= device_flush_iotlb(dev_data, address, size);
+	}
+
+	WARN_ON(ret);
+}
+
+static void domain_flush_pages(struct protection_domain *domain,
+			       u64 address, size_t size)
+{
+	__domain_flush_pages(domain, address, size, 0);
+}
+
+/* Flush the whole IO/TLB for a given protection domain */
+static void domain_flush_tlb(struct protection_domain *domain)
+{
+	__domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
+}
+
+/* Flush the whole IO/TLB for a given protection domain - including PDE */
+static void domain_flush_tlb_pde(struct protection_domain *domain)
+{
+	__domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
+}
+
+static void domain_flush_complete(struct protection_domain *domain)
+{
+	int i;
+
+	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
+		if (domain && !domain->dev_iommu[i])
+			continue;
+
+		/*
+		 * Devices of this domain are behind this IOMMU
+		 * We need to wait for completion of all commands.
+		 */
+		iommu_completion_wait(amd_iommus[i]);
+	}
+}
+
+
+/*
+ * This function flushes the DTEs for all devices in domain
+ */
+static void domain_flush_devices(struct protection_domain *domain)
+{
+	struct iommu_dev_data *dev_data;
+
+	list_for_each_entry(dev_data, &domain->dev_list, list)
+		device_flush_dte(dev_data);
+}
+
+/****************************************************************************
+ *
+ * The functions below are used the create the page table mappings for
+ * unity mapped regions.
+ *
+ ****************************************************************************/
+
+/*
+ * This function is used to add another level to an IO page table. Adding
+ * another level increases the size of the address space by 9 bits to a size up
+ * to 64 bits.
+ */
+static void increase_address_space(struct protection_domain *domain,
+				   gfp_t gfp)
+{
+	unsigned long flags;
+	u64 *pte;
+
+	pte = (void *)get_zeroed_page(gfp);
+	if (!pte)
+		return;
+
+	spin_lock_irqsave(&domain->lock, flags);
+
+	if (WARN_ON_ONCE(domain->mode == PAGE_MODE_6_LEVEL))
+		/* address space already 64 bit large */
+		goto out;
+
+	*pte             = PM_LEVEL_PDE(domain->mode,
+					iommu_virt_to_phys(domain->pt_root));
+	domain->pt_root  = pte;
+	domain->mode    += 1;
+	domain->updated  = true;
+	pte              = NULL;
+
+out:
+	spin_unlock_irqrestore(&domain->lock, flags);
+	free_page((unsigned long)pte);
+
+	return;
+}
+
+static u64 *alloc_pte(struct protection_domain *domain,
+		      unsigned long address,
+		      unsigned long page_size,
+		      u64 **pte_page,
+		      gfp_t gfp)
+{
+	int level, end_lvl;
+	u64 *pte, *page;
+
+	BUG_ON(!is_power_of_2(page_size));
+
+	while (address > PM_LEVEL_SIZE(domain->mode))
+		increase_address_space(domain, gfp);
+
+	level   = domain->mode - 1;
+	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+	address = PAGE_SIZE_ALIGN(address, page_size);
+	end_lvl = PAGE_SIZE_LEVEL(page_size);
+
+	while (level > end_lvl) {
+		u64 __pte, __npte;
+
+		__pte = *pte;
+
+		if (!IOMMU_PTE_PRESENT(__pte)) {
+			page = (u64 *)get_zeroed_page(gfp);
+			if (!page)
+				return NULL;
+
+			__npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));
+
+			/* pte could have been changed somewhere. */
+			if (cmpxchg64(pte, __pte, __npte) != __pte) {
+				free_page((unsigned long)page);
+				continue;
+			}
+		}
+
+		/* No level skipping support yet */
+		if (PM_PTE_LEVEL(*pte) != level)
+			return NULL;
+
+		level -= 1;
+
+		pte = IOMMU_PTE_PAGE(*pte);
+
+		if (pte_page && level == end_lvl)
+			*pte_page = pte;
+
+		pte = &pte[PM_LEVEL_INDEX(level, address)];
+	}
+
+	return pte;
+}
+
+/*
+ * This function checks if there is a PTE for a given dma address. If
+ * there is one, it returns the pointer to it.
+ */
+static u64 *fetch_pte(struct protection_domain *domain,
+		      unsigned long address,
+		      unsigned long *page_size)
+{
+	int level;
+	u64 *pte;
+
+	*page_size = 0;
+
+	if (address > PM_LEVEL_SIZE(domain->mode))
+		return NULL;
+
+	level	   =  domain->mode - 1;
+	pte	   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+	*page_size =  PTE_LEVEL_PAGE_SIZE(level);
+
+	while (level > 0) {
+
+		/* Not Present */
+		if (!IOMMU_PTE_PRESENT(*pte))
+			return NULL;
+
+		/* Large PTE */
+		if (PM_PTE_LEVEL(*pte) == 7 ||
+		    PM_PTE_LEVEL(*pte) == 0)
+			break;
+
+		/* No level skipping support yet */
+		if (PM_PTE_LEVEL(*pte) != level)
+			return NULL;
+
+		level -= 1;
+
+		/* Walk to the next level */
+		pte	   = IOMMU_PTE_PAGE(*pte);
+		pte	   = &pte[PM_LEVEL_INDEX(level, address)];
+		*page_size = PTE_LEVEL_PAGE_SIZE(level);
+	}
+
+	if (PM_PTE_LEVEL(*pte) == 0x07) {
+		unsigned long pte_mask;
+
+		/*
+		 * If we have a series of large PTEs, make
+		 * sure to return a pointer to the first one.
+		 */
+		*page_size = pte_mask = PTE_PAGE_SIZE(*pte);
+		pte_mask   = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
+		pte        = (u64 *)(((unsigned long)pte) & pte_mask);
+	}
+
+	return pte;
+}
+
+/*
+ * Generic mapping functions. It maps a physical address into a DMA
+ * address space. It allocates the page table pages if necessary.
+ * In the future it can be extended to a generic mapping function
+ * supporting all features of AMD IOMMU page tables like level skipping
+ * and full 64 bit address spaces.
+ */
+static int iommu_map_page(struct protection_domain *dom,
+			  unsigned long bus_addr,
+			  unsigned long phys_addr,
+			  unsigned long page_size,
+			  int prot,
+			  gfp_t gfp)
+{
+	u64 __pte, *pte;
+	int i, count;
+
+	BUG_ON(!IS_ALIGNED(bus_addr, page_size));
+	BUG_ON(!IS_ALIGNED(phys_addr, page_size));
+
+	if (!(prot & IOMMU_PROT_MASK))
+		return -EINVAL;
+
+	count = PAGE_SIZE_PTE_COUNT(page_size);
+	pte   = alloc_pte(dom, bus_addr, page_size, NULL, gfp);
+
+	if (!pte)
+		return -ENOMEM;
+
+	for (i = 0; i < count; ++i)
+		if (IOMMU_PTE_PRESENT(pte[i]))
+			return -EBUSY;
+
+	if (count > 1) {
+		__pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size);
+		__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
+	} else
+		__pte = __sme_set(phys_addr) | IOMMU_PTE_PR | IOMMU_PTE_FC;
+
+	if (prot & IOMMU_PROT_IR)
+		__pte |= IOMMU_PTE_IR;
+	if (prot & IOMMU_PROT_IW)
+		__pte |= IOMMU_PTE_IW;
+
+	for (i = 0; i < count; ++i)
+		pte[i] = __pte;
+
+	update_domain(dom);
+
+	return 0;
+}
+
+static unsigned long iommu_unmap_page(struct protection_domain *dom,
+				      unsigned long bus_addr,
+				      unsigned long page_size)
+{
+	unsigned long long unmapped;
+	unsigned long unmap_size;
+	u64 *pte;
+
+	BUG_ON(!is_power_of_2(page_size));
+
+	unmapped = 0;
+
+	while (unmapped < page_size) {
+
+		pte = fetch_pte(dom, bus_addr, &unmap_size);
+
+		if (pte) {
+			int i, count;
+
+			count = PAGE_SIZE_PTE_COUNT(unmap_size);
+			for (i = 0; i < count; i++)
+				pte[i] = 0ULL;
+		}
+
+		bus_addr  = (bus_addr & ~(unmap_size - 1)) + unmap_size;
+		unmapped += unmap_size;
+	}
+
+	BUG_ON(unmapped && !is_power_of_2(unmapped));
+
+	return unmapped;
+}
+
+/****************************************************************************
+ *
+ * The next functions belong to the address allocator for the dma_ops
+ * interface functions.
+ *
+ ****************************************************************************/
+
+
+static unsigned long dma_ops_alloc_iova(struct device *dev,
+					struct dma_ops_domain *dma_dom,
+					unsigned int pages, u64 dma_mask)
+{
+	unsigned long pfn = 0;
+
+	pages = __roundup_pow_of_two(pages);
+
+	if (dma_mask > DMA_BIT_MASK(32))
+		pfn = alloc_iova_fast(&dma_dom->iovad, pages,
+				      IOVA_PFN(DMA_BIT_MASK(32)), false);
+
+	if (!pfn)
+		pfn = alloc_iova_fast(&dma_dom->iovad, pages,
+				      IOVA_PFN(dma_mask), true);
+
+	return (pfn << PAGE_SHIFT);
+}
+
+static void dma_ops_free_iova(struct dma_ops_domain *dma_dom,
+			      unsigned long address,
+			      unsigned int pages)
+{
+	pages = __roundup_pow_of_two(pages);
+	address >>= PAGE_SHIFT;
+
+	free_iova_fast(&dma_dom->iovad, address, pages);
+}
+
+/****************************************************************************
+ *
+ * The next functions belong to the domain allocation. A domain is
+ * allocated for every IOMMU as the default domain. If device isolation
+ * is enabled, every device get its own domain. The most important thing
+ * about domains is the page table mapping the DMA address space they
+ * contain.
+ *
+ ****************************************************************************/
+
+/*
+ * This function adds a protection domain to the global protection domain list
+ */
+static void add_domain_to_list(struct protection_domain *domain)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
+	list_add(&domain->list, &amd_iommu_pd_list);
+	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
+}
+
+/*
+ * This function removes a protection domain to the global
+ * protection domain list
+ */
+static void del_domain_from_list(struct protection_domain *domain)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
+	list_del(&domain->list);
+	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
+}
+
+static u16 domain_id_alloc(void)
+{
+	int id;
+
+	spin_lock(&pd_bitmap_lock);
+	id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
+	BUG_ON(id == 0);
+	if (id > 0 && id < MAX_DOMAIN_ID)
+		__set_bit(id, amd_iommu_pd_alloc_bitmap);
+	else
+		id = 0;
+	spin_unlock(&pd_bitmap_lock);
+
+	return id;
+}
+
+static void domain_id_free(int id)
+{
+	spin_lock(&pd_bitmap_lock);
+	if (id > 0 && id < MAX_DOMAIN_ID)
+		__clear_bit(id, amd_iommu_pd_alloc_bitmap);
+	spin_unlock(&pd_bitmap_lock);
+}
+
+#define DEFINE_FREE_PT_FN(LVL, FN)				\
+static void free_pt_##LVL (unsigned long __pt)			\
+{								\
+	unsigned long p;					\
+	u64 *pt;						\
+	int i;							\
+								\
+	pt = (u64 *)__pt;					\
+								\
+	for (i = 0; i < 512; ++i) {				\
+		/* PTE present? */				\
+		if (!IOMMU_PTE_PRESENT(pt[i]))			\
+			continue;				\
+								\
+		/* Large PTE? */				\
+		if (PM_PTE_LEVEL(pt[i]) == 0 ||			\
+		    PM_PTE_LEVEL(pt[i]) == 7)			\
+			continue;				\
+								\
+		p = (unsigned long)IOMMU_PTE_PAGE(pt[i]);	\
+		FN(p);						\
+	}							\
+	free_page((unsigned long)pt);				\
+}
+
+DEFINE_FREE_PT_FN(l2, free_page)
+DEFINE_FREE_PT_FN(l3, free_pt_l2)
+DEFINE_FREE_PT_FN(l4, free_pt_l3)
+DEFINE_FREE_PT_FN(l5, free_pt_l4)
+DEFINE_FREE_PT_FN(l6, free_pt_l5)
+
+static void free_pagetable(struct protection_domain *domain)
+{
+	unsigned long root = (unsigned long)domain->pt_root;
+
+	switch (domain->mode) {
+	case PAGE_MODE_NONE:
+		break;
+	case PAGE_MODE_1_LEVEL:
+		free_page(root);
+		break;
+	case PAGE_MODE_2_LEVEL:
+		free_pt_l2(root);
+		break;
+	case PAGE_MODE_3_LEVEL:
+		free_pt_l3(root);
+		break;
+	case PAGE_MODE_4_LEVEL:
+		free_pt_l4(root);
+		break;
+	case PAGE_MODE_5_LEVEL:
+		free_pt_l5(root);
+		break;
+	case PAGE_MODE_6_LEVEL:
+		free_pt_l6(root);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static void free_gcr3_tbl_level1(u64 *tbl)
+{
+	u64 *ptr;
+	int i;
+
+	for (i = 0; i < 512; ++i) {
+		if (!(tbl[i] & GCR3_VALID))
+			continue;
+
+		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
+
+		free_page((unsigned long)ptr);
+	}
+}
+
+static void free_gcr3_tbl_level2(u64 *tbl)
+{
+	u64 *ptr;
+	int i;
+
+	for (i = 0; i < 512; ++i) {
+		if (!(tbl[i] & GCR3_VALID))
+			continue;
+
+		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
+
+		free_gcr3_tbl_level1(ptr);
+	}
+}
+
+static void free_gcr3_table(struct protection_domain *domain)
+{
+	if (domain->glx == 2)
+		free_gcr3_tbl_level2(domain->gcr3_tbl);
+	else if (domain->glx == 1)
+		free_gcr3_tbl_level1(domain->gcr3_tbl);
+	else
+		BUG_ON(domain->glx != 0);
+
+	free_page((unsigned long)domain->gcr3_tbl);
+}
+
+static void dma_ops_domain_flush_tlb(struct dma_ops_domain *dom)
+{
+	domain_flush_tlb(&dom->domain);
+	domain_flush_complete(&dom->domain);
+}
+
+static void iova_domain_flush_tlb(struct iova_domain *iovad)
+{
+	struct dma_ops_domain *dom;
+
+	dom = container_of(iovad, struct dma_ops_domain, iovad);
+
+	dma_ops_domain_flush_tlb(dom);
+}
+
+/*
+ * Free a domain, only used if something went wrong in the
+ * allocation path and we need to free an already allocated page table
+ */
+static void dma_ops_domain_free(struct dma_ops_domain *dom)
+{
+	if (!dom)
+		return;
+
+	del_domain_from_list(&dom->domain);
+
+	put_iova_domain(&dom->iovad);
+
+	free_pagetable(&dom->domain);
+
+	if (dom->domain.id)
+		domain_id_free(dom->domain.id);
+
+	kfree(dom);
+}
+
+/*
+ * Allocates a new protection domain usable for the dma_ops functions.
+ * It also initializes the page table and the address allocator data
+ * structures required for the dma_ops interface
+ */
+static struct dma_ops_domain *dma_ops_domain_alloc(void)
+{
+	struct dma_ops_domain *dma_dom;
+
+	dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
+	if (!dma_dom)
+		return NULL;
+
+	if (protection_domain_init(&dma_dom->domain))
+		goto free_dma_dom;
+
+	dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
+	dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
+	dma_dom->domain.flags = PD_DMA_OPS_MASK;
+	if (!dma_dom->domain.pt_root)
+		goto free_dma_dom;
+
+	init_iova_domain(&dma_dom->iovad, PAGE_SIZE, IOVA_START_PFN);
+
+	if (init_iova_flush_queue(&dma_dom->iovad, iova_domain_flush_tlb, NULL))
+		goto free_dma_dom;
+
+	/* Initialize reserved ranges */
+	copy_reserved_iova(&reserved_iova_ranges, &dma_dom->iovad);
+
+	add_domain_to_list(&dma_dom->domain);
+
+	return dma_dom;
+
+free_dma_dom:
+	dma_ops_domain_free(dma_dom);
+
+	return NULL;
+}
+
+/*
+ * little helper function to check whether a given protection domain is a
+ * dma_ops domain
+ */
+static bool dma_ops_domain(struct protection_domain *domain)
+{
+	return domain->flags & PD_DMA_OPS_MASK;
+}
+
+static void set_dte_entry(u16 devid, struct protection_domain *domain,
+			  bool ats, bool ppr)
+{
+	u64 pte_root = 0;
+	u64 flags = 0;
+	u32 old_domid;
+
+	if (domain->mode != PAGE_MODE_NONE)
+		pte_root = iommu_virt_to_phys(domain->pt_root);
+
+	pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
+		    << DEV_ENTRY_MODE_SHIFT;
+	pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V | DTE_FLAG_TV;
+
+	flags = amd_iommu_dev_table[devid].data[1];
+
+	if (ats)
+		flags |= DTE_FLAG_IOTLB;
+
+	if (ppr) {
+		struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+		if (iommu_feature(iommu, FEATURE_EPHSUP))
+			pte_root |= 1ULL << DEV_ENTRY_PPR;
+	}
+
+	if (domain->flags & PD_IOMMUV2_MASK) {
+		u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
+		u64 glx  = domain->glx;
+		u64 tmp;
+
+		pte_root |= DTE_FLAG_GV;
+		pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
+
+		/* First mask out possible old values for GCR3 table */
+		tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
+		flags    &= ~tmp;
+
+		tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
+		flags    &= ~tmp;
+
+		/* Encode GCR3 table into DTE */
+		tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
+		pte_root |= tmp;
+
+		tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
+		flags    |= tmp;
+
+		tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
+		flags    |= tmp;
+	}
+
+	flags &= ~DEV_DOMID_MASK;
+	flags |= domain->id;
+
+	old_domid = amd_iommu_dev_table[devid].data[1] & DEV_DOMID_MASK;
+	amd_iommu_dev_table[devid].data[1]  = flags;
+	amd_iommu_dev_table[devid].data[0]  = pte_root;
+
+	/*
+	 * A kdump kernel might be replacing a domain ID that was copied from
+	 * the previous kernel--if so, it needs to flush the translation cache
+	 * entries for the old domain ID that is being overwritten
+	 */
+	if (old_domid) {
+		struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+		amd_iommu_flush_tlb_domid(iommu, old_domid);
+	}
+}
+
+static void clear_dte_entry(u16 devid)
+{
+	/* remove entry from the device table seen by the hardware */
+	amd_iommu_dev_table[devid].data[0]  = DTE_FLAG_V | DTE_FLAG_TV;
+	amd_iommu_dev_table[devid].data[1] &= DTE_FLAG_MASK;
+
+	amd_iommu_apply_erratum_63(devid);
+}
+
+static void do_attach(struct iommu_dev_data *dev_data,
+		      struct protection_domain *domain)
+{
+	struct amd_iommu *iommu;
+	u16 alias;
+	bool ats;
+
+	iommu = amd_iommu_rlookup_table[dev_data->devid];
+	alias = dev_data->alias;
+	ats   = dev_data->ats.enabled;
+
+	/* Update data structures */
+	dev_data->domain = domain;
+	list_add(&dev_data->list, &domain->dev_list);
+
+	/* Do reference counting */
+	domain->dev_iommu[iommu->index] += 1;
+	domain->dev_cnt                 += 1;
+
+	/* Update device table */
+	set_dte_entry(dev_data->devid, domain, ats, dev_data->iommu_v2);
+	if (alias != dev_data->devid)
+		set_dte_entry(alias, domain, ats, dev_data->iommu_v2);
+
+	device_flush_dte(dev_data);
+}
+
+static void do_detach(struct iommu_dev_data *dev_data)
+{
+	struct protection_domain *domain = dev_data->domain;
+	struct amd_iommu *iommu;
+	u16 alias;
+
+	iommu = amd_iommu_rlookup_table[dev_data->devid];
+	alias = dev_data->alias;
+
+	/* Update data structures */
+	dev_data->domain = NULL;
+	list_del(&dev_data->list);
+	clear_dte_entry(dev_data->devid);
+	if (alias != dev_data->devid)
+		clear_dte_entry(alias);
+
+	/* Flush the DTE entry */
+	device_flush_dte(dev_data);
+
+	/* Flush IOTLB */
+	domain_flush_tlb_pde(domain);
+
+	/* Wait for the flushes to finish */
+	domain_flush_complete(domain);
+
+	/* decrease reference counters - needs to happen after the flushes */
+	domain->dev_iommu[iommu->index] -= 1;
+	domain->dev_cnt                 -= 1;
+}
+
+/*
+ * If a device is not yet associated with a domain, this function makes the
+ * device visible in the domain
+ */
+static int __attach_device(struct iommu_dev_data *dev_data,
+			   struct protection_domain *domain)
+{
+	int ret;
+
+	/* lock domain */
+	spin_lock(&domain->lock);
+
+	ret = -EBUSY;
+	if (dev_data->domain != NULL)
+		goto out_unlock;
+
+	/* Attach alias group root */
+	do_attach(dev_data, domain);
+
+	ret = 0;
+
+out_unlock:
+
+	/* ready */
+	spin_unlock(&domain->lock);
+
+	return ret;
+}
+
+
+static void pdev_iommuv2_disable(struct pci_dev *pdev)
+{
+	pci_disable_ats(pdev);
+	pci_disable_pri(pdev);
+	pci_disable_pasid(pdev);
+}
+
+/* FIXME: Change generic reset-function to do the same */
+static int pri_reset_while_enabled(struct pci_dev *pdev)
+{
+	u16 control;
+	int pos;
+
+	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
+	if (!pos)
+		return -EINVAL;
+
+	pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control);
+	control |= PCI_PRI_CTRL_RESET;
+	pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control);
+
+	return 0;
+}
+
+static int pdev_iommuv2_enable(struct pci_dev *pdev)
+{
+	bool reset_enable;
+	int reqs, ret;
+
+	/* FIXME: Hardcode number of outstanding requests for now */
+	reqs = 32;
+	if (pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE))
+		reqs = 1;
+	reset_enable = pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_ENABLE_RESET);
+
+	/* Only allow access to user-accessible pages */
+	ret = pci_enable_pasid(pdev, 0);
+	if (ret)
+		goto out_err;
+
+	/* First reset the PRI state of the device */
+	ret = pci_reset_pri(pdev);
+	if (ret)
+		goto out_err;
+
+	/* Enable PRI */
+	ret = pci_enable_pri(pdev, reqs);
+	if (ret)
+		goto out_err;
+
+	if (reset_enable) {
+		ret = pri_reset_while_enabled(pdev);
+		if (ret)
+			goto out_err;
+	}
+
+	ret = pci_enable_ats(pdev, PAGE_SHIFT);
+	if (ret)
+		goto out_err;
+
+	return 0;
+
+out_err:
+	pci_disable_pri(pdev);
+	pci_disable_pasid(pdev);
+
+	return ret;
+}
+
+/* FIXME: Move this to PCI code */
+#define PCI_PRI_TLP_OFF		(1 << 15)
+
+static bool pci_pri_tlp_required(struct pci_dev *pdev)
+{
+	u16 status;
+	int pos;
+
+	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
+	if (!pos)
+		return false;
+
+	pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status);
+
+	return (status & PCI_PRI_TLP_OFF) ? true : false;
+}
+
+/*
+ * If a device is not yet associated with a domain, this function makes the
+ * device visible in the domain
+ */
+static int attach_device(struct device *dev,
+			 struct protection_domain *domain)
+{
+	struct pci_dev *pdev;
+	struct iommu_dev_data *dev_data;
+	unsigned long flags;
+	int ret;
+
+	dev_data = get_dev_data(dev);
+
+	if (!dev_is_pci(dev))
+		goto skip_ats_check;
+
+	pdev = to_pci_dev(dev);
+	if (domain->flags & PD_IOMMUV2_MASK) {
+		if (!dev_data->passthrough)
+			return -EINVAL;
+
+		if (dev_data->iommu_v2) {
+			if (pdev_iommuv2_enable(pdev) != 0)
+				return -EINVAL;
+
+			dev_data->ats.enabled = true;
+			dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
+			dev_data->pri_tlp     = pci_pri_tlp_required(pdev);
+		}
+	} else if (amd_iommu_iotlb_sup &&
+		   pci_enable_ats(pdev, PAGE_SHIFT) == 0) {
+		dev_data->ats.enabled = true;
+		dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
+	}
+
+skip_ats_check:
+	spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
+	ret = __attach_device(dev_data, domain);
+	spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+	/*
+	 * We might boot into a crash-kernel here. The crashed kernel
+	 * left the caches in the IOMMU dirty. So we have to flush
+	 * here to evict all dirty stuff.
+	 */
+	domain_flush_tlb_pde(domain);
+
+	domain_flush_complete(domain);
+
+	return ret;
+}
+
+/*
+ * Removes a device from a protection domain (unlocked)
+ */
+static void __detach_device(struct iommu_dev_data *dev_data)
+{
+	struct protection_domain *domain;
+
+	domain = dev_data->domain;
+
+	spin_lock(&domain->lock);
+
+	do_detach(dev_data);
+
+	spin_unlock(&domain->lock);
+}
+
+/*
+ * Removes a device from a protection domain (with devtable_lock held)
+ */
+static void detach_device(struct device *dev)
+{
+	struct protection_domain *domain;
+	struct iommu_dev_data *dev_data;
+	unsigned long flags;
+
+	dev_data = get_dev_data(dev);
+	domain   = dev_data->domain;
+
+	/*
+	 * First check if the device is still attached. It might already
+	 * be detached from its domain because the generic
+	 * iommu_detach_group code detached it and we try again here in
+	 * our alias handling.
+	 */
+	if (WARN_ON(!dev_data->domain))
+		return;
+
+	/* lock device table */
+	spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
+	__detach_device(dev_data);
+	spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+	if (!dev_is_pci(dev))
+		return;
+
+	if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2)
+		pdev_iommuv2_disable(to_pci_dev(dev));
+	else if (dev_data->ats.enabled)
+		pci_disable_ats(to_pci_dev(dev));
+
+	dev_data->ats.enabled = false;
+}
+
+static int amd_iommu_add_device(struct device *dev)
+{
+	struct iommu_dev_data *dev_data;
+	struct iommu_domain *domain;
+	struct amd_iommu *iommu;
+	int ret, devid;
+
+	if (!check_device(dev) || get_dev_data(dev))
+		return 0;
+
+	devid = get_device_id(dev);
+	if (devid < 0)
+		return devid;
+
+	iommu = amd_iommu_rlookup_table[devid];
+
+	ret = iommu_init_device(dev);
+	if (ret) {
+		if (ret != -ENOTSUPP)
+			pr_err("Failed to initialize device %s - trying to proceed anyway\n",
+				dev_name(dev));
+
+		iommu_ignore_device(dev);
+		dev->dma_ops = &dma_direct_ops;
+		goto out;
+	}
+	init_iommu_group(dev);
+
+	dev_data = get_dev_data(dev);
+
+	BUG_ON(!dev_data);
+
+	if (iommu_pass_through || dev_data->iommu_v2)
+		iommu_request_dm_for_dev(dev);
+
+	/* Domains are initialized for this device - have a look what we ended up with */
+	domain = iommu_get_domain_for_dev(dev);
+	if (domain->type == IOMMU_DOMAIN_IDENTITY)
+		dev_data->passthrough = true;
+	else
+		dev->dma_ops = &amd_iommu_dma_ops;
+
+out:
+	iommu_completion_wait(iommu);
+
+	return 0;
+}
+
+static void amd_iommu_remove_device(struct device *dev)
+{
+	struct amd_iommu *iommu;
+	int devid;
+
+	if (!check_device(dev))
+		return;
+
+	devid = get_device_id(dev);
+	if (devid < 0)
+		return;
+
+	iommu = amd_iommu_rlookup_table[devid];
+
+	iommu_uninit_device(dev);
+	iommu_completion_wait(iommu);
+}
+
+static struct iommu_group *amd_iommu_device_group(struct device *dev)
+{
+	if (dev_is_pci(dev))
+		return pci_device_group(dev);
+
+	return acpihid_device_group(dev);
+}
+
+/*****************************************************************************
+ *
+ * The next functions belong to the dma_ops mapping/unmapping code.
+ *
+ *****************************************************************************/
+
+/*
+ * In the dma_ops path we only have the struct device. This function
+ * finds the corresponding IOMMU, the protection domain and the
+ * requestor id for a given device.
+ * If the device is not yet associated with a domain this is also done
+ * in this function.
+ */
+static struct protection_domain *get_domain(struct device *dev)
+{
+	struct protection_domain *domain;
+	struct iommu_domain *io_domain;
+
+	if (!check_device(dev))
+		return ERR_PTR(-EINVAL);
+
+	domain = get_dev_data(dev)->domain;
+	if (domain == NULL && get_dev_data(dev)->defer_attach) {
+		get_dev_data(dev)->defer_attach = false;
+		io_domain = iommu_get_domain_for_dev(dev);
+		domain = to_pdomain(io_domain);
+		attach_device(dev, domain);
+	}
+	if (domain == NULL)
+		return ERR_PTR(-EBUSY);
+
+	if (!dma_ops_domain(domain))
+		return ERR_PTR(-EBUSY);
+
+	return domain;
+}
+
+static void update_device_table(struct protection_domain *domain)
+{
+	struct iommu_dev_data *dev_data;
+
+	list_for_each_entry(dev_data, &domain->dev_list, list) {
+		set_dte_entry(dev_data->devid, domain, dev_data->ats.enabled,
+			      dev_data->iommu_v2);
+
+		if (dev_data->devid == dev_data->alias)
+			continue;
+
+		/* There is an alias, update device table entry for it */
+		set_dte_entry(dev_data->alias, domain, dev_data->ats.enabled,
+			      dev_data->iommu_v2);
+	}
+}
+
+static void update_domain(struct protection_domain *domain)
+{
+	if (!domain->updated)
+		return;
+
+	update_device_table(domain);
+
+	domain_flush_devices(domain);
+	domain_flush_tlb_pde(domain);
+
+	domain->updated = false;
+}
+
+static int dir2prot(enum dma_data_direction direction)
+{
+	if (direction == DMA_TO_DEVICE)
+		return IOMMU_PROT_IR;
+	else if (direction == DMA_FROM_DEVICE)
+		return IOMMU_PROT_IW;
+	else if (direction == DMA_BIDIRECTIONAL)
+		return IOMMU_PROT_IW | IOMMU_PROT_IR;
+	else
+		return 0;
+}
+
+/*
+ * This function contains common code for mapping of a physically
+ * contiguous memory region into DMA address space. It is used by all
+ * mapping functions provided with this IOMMU driver.
+ * Must be called with the domain lock held.
+ */
+static dma_addr_t __map_single(struct device *dev,
+			       struct dma_ops_domain *dma_dom,
+			       phys_addr_t paddr,
+			       size_t size,
+			       enum dma_data_direction direction,
+			       u64 dma_mask)
+{
+	dma_addr_t offset = paddr & ~PAGE_MASK;
+	dma_addr_t address, start, ret;
+	unsigned int pages;
+	int prot = 0;
+	int i;
+
+	pages = iommu_num_pages(paddr, size, PAGE_SIZE);
+	paddr &= PAGE_MASK;
+
+	address = dma_ops_alloc_iova(dev, dma_dom, pages, dma_mask);
+	if (address == AMD_IOMMU_MAPPING_ERROR)
+		goto out;
+
+	prot = dir2prot(direction);
+
+	start = address;
+	for (i = 0; i < pages; ++i) {
+		ret = iommu_map_page(&dma_dom->domain, start, paddr,
+				     PAGE_SIZE, prot, GFP_ATOMIC);
+		if (ret)
+			goto out_unmap;
+
+		paddr += PAGE_SIZE;
+		start += PAGE_SIZE;
+	}
+	address += offset;
+
+	if (unlikely(amd_iommu_np_cache)) {
+		domain_flush_pages(&dma_dom->domain, address, size);
+		domain_flush_complete(&dma_dom->domain);
+	}
+
+out:
+	return address;
+
+out_unmap:
+
+	for (--i; i >= 0; --i) {
+		start -= PAGE_SIZE;
+		iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE);
+	}
+
+	domain_flush_tlb(&dma_dom->domain);
+	domain_flush_complete(&dma_dom->domain);
+
+	dma_ops_free_iova(dma_dom, address, pages);
+
+	return AMD_IOMMU_MAPPING_ERROR;
+}
+
+/*
+ * Does the reverse of the __map_single function. Must be called with
+ * the domain lock held too
+ */
+static void __unmap_single(struct dma_ops_domain *dma_dom,
+			   dma_addr_t dma_addr,
+			   size_t size,
+			   int dir)
+{
+	dma_addr_t i, start;
+	unsigned int pages;
+
+	pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
+	dma_addr &= PAGE_MASK;
+	start = dma_addr;
+
+	for (i = 0; i < pages; ++i) {
+		iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE);
+		start += PAGE_SIZE;
+	}
+
+	if (amd_iommu_unmap_flush) {
+		domain_flush_tlb(&dma_dom->domain);
+		domain_flush_complete(&dma_dom->domain);
+		dma_ops_free_iova(dma_dom, dma_addr, pages);
+	} else {
+		pages = __roundup_pow_of_two(pages);
+		queue_iova(&dma_dom->iovad, dma_addr >> PAGE_SHIFT, pages, 0);
+	}
+}
+
+/*
+ * The exported map_single function for dma_ops.
+ */
+static dma_addr_t map_page(struct device *dev, struct page *page,
+			   unsigned long offset, size_t size,
+			   enum dma_data_direction dir,
+			   unsigned long attrs)
+{
+	phys_addr_t paddr = page_to_phys(page) + offset;
+	struct protection_domain *domain;
+	struct dma_ops_domain *dma_dom;
+	u64 dma_mask;
+
+	domain = get_domain(dev);
+	if (PTR_ERR(domain) == -EINVAL)
+		return (dma_addr_t)paddr;
+	else if (IS_ERR(domain))
+		return AMD_IOMMU_MAPPING_ERROR;
+
+	dma_mask = *dev->dma_mask;
+	dma_dom = to_dma_ops_domain(domain);
+
+	return __map_single(dev, dma_dom, paddr, size, dir, dma_mask);
+}
+
+/*
+ * The exported unmap_single function for dma_ops.
+ */
+static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
+		       enum dma_data_direction dir, unsigned long attrs)
+{
+	struct protection_domain *domain;
+	struct dma_ops_domain *dma_dom;
+
+	domain = get_domain(dev);
+	if (IS_ERR(domain))
+		return;
+
+	dma_dom = to_dma_ops_domain(domain);
+
+	__unmap_single(dma_dom, dma_addr, size, dir);
+}
+
+static int sg_num_pages(struct device *dev,
+			struct scatterlist *sglist,
+			int nelems)
+{
+	unsigned long mask, boundary_size;
+	struct scatterlist *s;
+	int i, npages = 0;
+
+	mask          = dma_get_seg_boundary(dev);
+	boundary_size = mask + 1 ? ALIGN(mask + 1, PAGE_SIZE) >> PAGE_SHIFT :
+				   1UL << (BITS_PER_LONG - PAGE_SHIFT);
+
+	for_each_sg(sglist, s, nelems, i) {
+		int p, n;
+
+		s->dma_address = npages << PAGE_SHIFT;
+		p = npages % boundary_size;
+		n = iommu_num_pages(sg_phys(s), s->length, PAGE_SIZE);
+		if (p + n > boundary_size)
+			npages += boundary_size - p;
+		npages += n;
+	}
+
+	return npages;
+}
+
+/*
+ * The exported map_sg function for dma_ops (handles scatter-gather
+ * lists).
+ */
+static int map_sg(struct device *dev, struct scatterlist *sglist,
+		  int nelems, enum dma_data_direction direction,
+		  unsigned long attrs)
+{
+	int mapped_pages = 0, npages = 0, prot = 0, i;
+	struct protection_domain *domain;
+	struct dma_ops_domain *dma_dom;
+	struct scatterlist *s;
+	unsigned long address;
+	u64 dma_mask;
+
+	domain = get_domain(dev);
+	if (IS_ERR(domain))
+		return 0;
+
+	dma_dom  = to_dma_ops_domain(domain);
+	dma_mask = *dev->dma_mask;
+
+	npages = sg_num_pages(dev, sglist, nelems);
+
+	address = dma_ops_alloc_iova(dev, dma_dom, npages, dma_mask);
+	if (address == AMD_IOMMU_MAPPING_ERROR)
+		goto out_err;
+
+	prot = dir2prot(direction);
+
+	/* Map all sg entries */
+	for_each_sg(sglist, s, nelems, i) {
+		int j, pages = iommu_num_pages(sg_phys(s), s->length, PAGE_SIZE);
+
+		for (j = 0; j < pages; ++j) {
+			unsigned long bus_addr, phys_addr;
+			int ret;
+
+			bus_addr  = address + s->dma_address + (j << PAGE_SHIFT);
+			phys_addr = (sg_phys(s) & PAGE_MASK) + (j << PAGE_SHIFT);
+			ret = iommu_map_page(domain, bus_addr, phys_addr,
+					     PAGE_SIZE, prot,
+					     GFP_ATOMIC | __GFP_NOWARN);
+			if (ret)
+				goto out_unmap;
+
+			mapped_pages += 1;
+		}
+	}
+
+	/* Everything is mapped - write the right values into s->dma_address */
+	for_each_sg(sglist, s, nelems, i) {
+		/*
+		 * Add in the remaining piece of the scatter-gather offset that
+		 * was masked out when we were determining the physical address
+		 * via (sg_phys(s) & PAGE_MASK) earlier.
+		 */
+		s->dma_address += address + (s->offset & ~PAGE_MASK);
+		s->dma_length   = s->length;
+	}
+
+	return nelems;
+
+out_unmap:
+	pr_err("%s: IOMMU mapping error in map_sg (io-pages: %d)\n",
+	       dev_name(dev), npages);
+
+	for_each_sg(sglist, s, nelems, i) {
+		int j, pages = iommu_num_pages(sg_phys(s), s->length, PAGE_SIZE);
+
+		for (j = 0; j < pages; ++j) {
+			unsigned long bus_addr;
+
+			bus_addr  = address + s->dma_address + (j << PAGE_SHIFT);
+			iommu_unmap_page(domain, bus_addr, PAGE_SIZE);
+
+			if (--mapped_pages == 0)
+				goto out_free_iova;
+		}
+	}
+
+out_free_iova:
+	free_iova_fast(&dma_dom->iovad, address >> PAGE_SHIFT, npages);
+
+out_err:
+	return 0;
+}
+
+/*
+ * The exported map_sg function for dma_ops (handles scatter-gather
+ * lists).
+ */
+static void unmap_sg(struct device *dev, struct scatterlist *sglist,
+		     int nelems, enum dma_data_direction dir,
+		     unsigned long attrs)
+{
+	struct protection_domain *domain;
+	struct dma_ops_domain *dma_dom;
+	unsigned long startaddr;
+	int npages = 2;
+
+	domain = get_domain(dev);
+	if (IS_ERR(domain))
+		return;
+
+	startaddr = sg_dma_address(sglist) & PAGE_MASK;
+	dma_dom   = to_dma_ops_domain(domain);
+	npages    = sg_num_pages(dev, sglist, nelems);
+
+	__unmap_single(dma_dom, startaddr, npages << PAGE_SHIFT, dir);
+}
+
+/*
+ * The exported alloc_coherent function for dma_ops.
+ */
+static void *alloc_coherent(struct device *dev, size_t size,
+			    dma_addr_t *dma_addr, gfp_t flag,
+			    unsigned long attrs)
+{
+	u64 dma_mask = dev->coherent_dma_mask;
+	struct protection_domain *domain;
+	struct dma_ops_domain *dma_dom;
+	struct page *page;
+
+	domain = get_domain(dev);
+	if (PTR_ERR(domain) == -EINVAL) {
+		page = alloc_pages(flag, get_order(size));
+		*dma_addr = page_to_phys(page);
+		return page_address(page);
+	} else if (IS_ERR(domain))
+		return NULL;
+
+	dma_dom   = to_dma_ops_domain(domain);
+	size	  = PAGE_ALIGN(size);
+	dma_mask  = dev->coherent_dma_mask;
+	flag     &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+	flag     |= __GFP_ZERO;
+
+	page = alloc_pages(flag | __GFP_NOWARN,  get_order(size));
+	if (!page) {
+		if (!gfpflags_allow_blocking(flag))
+			return NULL;
+
+		page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
+					get_order(size), flag & __GFP_NOWARN);
+		if (!page)
+			return NULL;
+	}
+
+	if (!dma_mask)
+		dma_mask = *dev->dma_mask;
+
+	*dma_addr = __map_single(dev, dma_dom, page_to_phys(page),
+				 size, DMA_BIDIRECTIONAL, dma_mask);
+
+	if (*dma_addr == AMD_IOMMU_MAPPING_ERROR)
+		goto out_free;
+
+	return page_address(page);
+
+out_free:
+
+	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
+		__free_pages(page, get_order(size));
+
+	return NULL;
+}
+
+/*
+ * The exported free_coherent function for dma_ops.
+ */
+static void free_coherent(struct device *dev, size_t size,
+			  void *virt_addr, dma_addr_t dma_addr,
+			  unsigned long attrs)
+{
+	struct protection_domain *domain;
+	struct dma_ops_domain *dma_dom;
+	struct page *page;
+
+	page = virt_to_page(virt_addr);
+	size = PAGE_ALIGN(size);
+
+	domain = get_domain(dev);
+	if (IS_ERR(domain))
+		goto free_mem;
+
+	dma_dom = to_dma_ops_domain(domain);
+
+	__unmap_single(dma_dom, dma_addr, size, DMA_BIDIRECTIONAL);
+
+free_mem:
+	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
+		__free_pages(page, get_order(size));
+}
+
+/*
+ * This function is called by the DMA layer to find out if we can handle a
+ * particular device. It is part of the dma_ops.
+ */
+static int amd_iommu_dma_supported(struct device *dev, u64 mask)
+{
+	if (!dma_direct_supported(dev, mask))
+		return 0;
+	return check_device(dev);
+}
+
+static int amd_iommu_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+	return dma_addr == AMD_IOMMU_MAPPING_ERROR;
+}
+
+static const struct dma_map_ops amd_iommu_dma_ops = {
+	.alloc		= alloc_coherent,
+	.free		= free_coherent,
+	.map_page	= map_page,
+	.unmap_page	= unmap_page,
+	.map_sg		= map_sg,
+	.unmap_sg	= unmap_sg,
+	.dma_supported	= amd_iommu_dma_supported,
+	.mapping_error	= amd_iommu_mapping_error,
+};
+
+static int init_reserved_iova_ranges(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iova *val;
+
+	init_iova_domain(&reserved_iova_ranges, PAGE_SIZE, IOVA_START_PFN);
+
+	lockdep_set_class(&reserved_iova_ranges.iova_rbtree_lock,
+			  &reserved_rbtree_key);
+
+	/* MSI memory range */
+	val = reserve_iova(&reserved_iova_ranges,
+			   IOVA_PFN(MSI_RANGE_START), IOVA_PFN(MSI_RANGE_END));
+	if (!val) {
+		pr_err("Reserving MSI range failed\n");
+		return -ENOMEM;
+	}
+
+	/* HT memory range */
+	val = reserve_iova(&reserved_iova_ranges,
+			   IOVA_PFN(HT_RANGE_START), IOVA_PFN(HT_RANGE_END));
+	if (!val) {
+		pr_err("Reserving HT range failed\n");
+		return -ENOMEM;
+	}
+
+	/*
+	 * Memory used for PCI resources
+	 * FIXME: Check whether we can reserve the PCI-hole completly
+	 */
+	for_each_pci_dev(pdev) {
+		int i;
+
+		for (i = 0; i < PCI_NUM_RESOURCES; ++i) {
+			struct resource *r = &pdev->resource[i];
+
+			if (!(r->flags & IORESOURCE_MEM))
+				continue;
+
+			val = reserve_iova(&reserved_iova_ranges,
+					   IOVA_PFN(r->start),
+					   IOVA_PFN(r->end));
+			if (!val) {
+				pr_err("Reserve pci-resource range failed\n");
+				return -ENOMEM;
+			}
+		}
+	}
+
+	return 0;
+}
+
+int __init amd_iommu_init_api(void)
+{
+	int ret, err = 0;
+
+	ret = iova_cache_get();
+	if (ret)
+		return ret;
+
+	ret = init_reserved_iova_ranges();
+	if (ret)
+		return ret;
+
+	err = bus_set_iommu(&pci_bus_type, &amd_iommu_ops);
+	if (err)
+		return err;
+#ifdef CONFIG_ARM_AMBA
+	err = bus_set_iommu(&amba_bustype, &amd_iommu_ops);
+	if (err)
+		return err;
+#endif
+	err = bus_set_iommu(&platform_bus_type, &amd_iommu_ops);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+int __init amd_iommu_init_dma_ops(void)
+{
+	swiotlb        = (iommu_pass_through || sme_me_mask) ? 1 : 0;
+	iommu_detected = 1;
+
+	/*
+	 * In case we don't initialize SWIOTLB (actually the common case
+	 * when AMD IOMMU is enabled and SME is not active), make sure there
+	 * are global dma_ops set as a fall-back for devices not handled by
+	 * this driver (for example non-PCI devices). When SME is active,
+	 * make sure that swiotlb variable remains set so the global dma_ops
+	 * continue to be SWIOTLB.
+	 */
+	if (!swiotlb)
+		dma_ops = &dma_direct_ops;
+
+	if (amd_iommu_unmap_flush)
+		pr_info("AMD-Vi: IO/TLB flush on unmap enabled\n");
+	else
+		pr_info("AMD-Vi: Lazy IO/TLB flushing enabled\n");
+
+	return 0;
+
+}
+
+/*****************************************************************************
+ *
+ * The following functions belong to the exported interface of AMD IOMMU
+ *
+ * This interface allows access to lower level functions of the IOMMU
+ * like protection domain handling and assignement of devices to domains
+ * which is not possible with the dma_ops interface.
+ *
+ *****************************************************************************/
+
+static void cleanup_domain(struct protection_domain *domain)
+{
+	struct iommu_dev_data *entry;
+	unsigned long flags;
+
+	spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
+
+	while (!list_empty(&domain->dev_list)) {
+		entry = list_first_entry(&domain->dev_list,
+					 struct iommu_dev_data, list);
+		BUG_ON(!entry->domain);
+		__detach_device(entry);
+	}
+
+	spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+}
+
+static void protection_domain_free(struct protection_domain *domain)
+{
+	if (!domain)
+		return;
+
+	del_domain_from_list(domain);
+
+	if (domain->id)
+		domain_id_free(domain->id);
+
+	kfree(domain);
+}
+
+static int protection_domain_init(struct protection_domain *domain)
+{
+	spin_lock_init(&domain->lock);
+	mutex_init(&domain->api_lock);
+	domain->id = domain_id_alloc();
+	if (!domain->id)
+		return -ENOMEM;
+	INIT_LIST_HEAD(&domain->dev_list);
+
+	return 0;
+}
+
+static struct protection_domain *protection_domain_alloc(void)
+{
+	struct protection_domain *domain;
+
+	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+	if (!domain)
+		return NULL;
+
+	if (protection_domain_init(domain))
+		goto out_err;
+
+	add_domain_to_list(domain);
+
+	return domain;
+
+out_err:
+	kfree(domain);
+
+	return NULL;
+}
+
+static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
+{
+	struct protection_domain *pdomain;
+	struct dma_ops_domain *dma_domain;
+
+	switch (type) {
+	case IOMMU_DOMAIN_UNMANAGED:
+		pdomain = protection_domain_alloc();
+		if (!pdomain)
+			return NULL;
+
+		pdomain->mode    = PAGE_MODE_3_LEVEL;
+		pdomain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
+		if (!pdomain->pt_root) {
+			protection_domain_free(pdomain);
+			return NULL;
+		}
+
+		pdomain->domain.geometry.aperture_start = 0;
+		pdomain->domain.geometry.aperture_end   = ~0ULL;
+		pdomain->domain.geometry.force_aperture = true;
+
+		break;
+	case IOMMU_DOMAIN_DMA:
+		dma_domain = dma_ops_domain_alloc();
+		if (!dma_domain) {
+			pr_err("AMD-Vi: Failed to allocate\n");
+			return NULL;
+		}
+		pdomain = &dma_domain->domain;
+		break;
+	case IOMMU_DOMAIN_IDENTITY:
+		pdomain = protection_domain_alloc();
+		if (!pdomain)
+			return NULL;
+
+		pdomain->mode = PAGE_MODE_NONE;
+		break;
+	default:
+		return NULL;
+	}
+
+	return &pdomain->domain;
+}
+
+static void amd_iommu_domain_free(struct iommu_domain *dom)
+{
+	struct protection_domain *domain;
+	struct dma_ops_domain *dma_dom;
+
+	domain = to_pdomain(dom);
+
+	if (domain->dev_cnt > 0)
+		cleanup_domain(domain);
+
+	BUG_ON(domain->dev_cnt != 0);
+
+	if (!dom)
+		return;
+
+	switch (dom->type) {
+	case IOMMU_DOMAIN_DMA:
+		/* Now release the domain */
+		dma_dom = to_dma_ops_domain(domain);
+		dma_ops_domain_free(dma_dom);
+		break;
+	default:
+		if (domain->mode != PAGE_MODE_NONE)
+			free_pagetable(domain);
+
+		if (domain->flags & PD_IOMMUV2_MASK)
+			free_gcr3_table(domain);
+
+		protection_domain_free(domain);
+		break;
+	}
+}
+
+static void amd_iommu_detach_device(struct iommu_domain *dom,
+				    struct device *dev)
+{
+	struct iommu_dev_data *dev_data = dev->archdata.iommu;
+	struct amd_iommu *iommu;
+	int devid;
+
+	if (!check_device(dev))
+		return;
+
+	devid = get_device_id(dev);
+	if (devid < 0)
+		return;
+
+	if (dev_data->domain != NULL)
+		detach_device(dev);
+
+	iommu = amd_iommu_rlookup_table[devid];
+	if (!iommu)
+		return;
+
+#ifdef CONFIG_IRQ_REMAP
+	if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
+	    (dom->type == IOMMU_DOMAIN_UNMANAGED))
+		dev_data->use_vapic = 0;
+#endif
+
+	iommu_completion_wait(iommu);
+}
+
+static int amd_iommu_attach_device(struct iommu_domain *dom,
+				   struct device *dev)
+{
+	struct protection_domain *domain = to_pdomain(dom);
+	struct iommu_dev_data *dev_data;
+	struct amd_iommu *iommu;
+	int ret;
+
+	if (!check_device(dev))
+		return -EINVAL;
+
+	dev_data = dev->archdata.iommu;
+
+	iommu = amd_iommu_rlookup_table[dev_data->devid];
+	if (!iommu)
+		return -EINVAL;
+
+	if (dev_data->domain)
+		detach_device(dev);
+
+	ret = attach_device(dev, domain);
+
+#ifdef CONFIG_IRQ_REMAP
+	if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
+		if (dom->type == IOMMU_DOMAIN_UNMANAGED)
+			dev_data->use_vapic = 1;
+		else
+			dev_data->use_vapic = 0;
+	}
+#endif
+
+	iommu_completion_wait(iommu);
+
+	return ret;
+}
+
+static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
+			 phys_addr_t paddr, size_t page_size, int iommu_prot)
+{
+	struct protection_domain *domain = to_pdomain(dom);
+	int prot = 0;
+	int ret;
+
+	if (domain->mode == PAGE_MODE_NONE)
+		return -EINVAL;
+
+	if (iommu_prot & IOMMU_READ)
+		prot |= IOMMU_PROT_IR;
+	if (iommu_prot & IOMMU_WRITE)
+		prot |= IOMMU_PROT_IW;
+
+	mutex_lock(&domain->api_lock);
+	ret = iommu_map_page(domain, iova, paddr, page_size, prot, GFP_KERNEL);
+	mutex_unlock(&domain->api_lock);
+
+	return ret;
+}
+
+static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
+			   size_t page_size)
+{
+	struct protection_domain *domain = to_pdomain(dom);
+	size_t unmap_size;
+
+	if (domain->mode == PAGE_MODE_NONE)
+		return 0;
+
+	mutex_lock(&domain->api_lock);
+	unmap_size = iommu_unmap_page(domain, iova, page_size);
+	mutex_unlock(&domain->api_lock);
+
+	return unmap_size;
+}
+
+static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
+					  dma_addr_t iova)
+{
+	struct protection_domain *domain = to_pdomain(dom);
+	unsigned long offset_mask, pte_pgsize;
+	u64 *pte, __pte;
+
+	if (domain->mode == PAGE_MODE_NONE)
+		return iova;
+
+	pte = fetch_pte(domain, iova, &pte_pgsize);
+
+	if (!pte || !IOMMU_PTE_PRESENT(*pte))
+		return 0;
+
+	offset_mask = pte_pgsize - 1;
+	__pte	    = __sme_clr(*pte & PM_ADDR_MASK);
+
+	return (__pte & ~offset_mask) | (iova & offset_mask);
+}
+
+static bool amd_iommu_capable(enum iommu_cap cap)
+{
+	switch (cap) {
+	case IOMMU_CAP_CACHE_COHERENCY:
+		return true;
+	case IOMMU_CAP_INTR_REMAP:
+		return (irq_remapping_enabled == 1);
+	case IOMMU_CAP_NOEXEC:
+		return false;
+	}
+
+	return false;
+}
+
+static void amd_iommu_get_resv_regions(struct device *dev,
+				       struct list_head *head)
+{
+	struct iommu_resv_region *region;
+	struct unity_map_entry *entry;
+	int devid;
+
+	devid = get_device_id(dev);
+	if (devid < 0)
+		return;
+
+	list_for_each_entry(entry, &amd_iommu_unity_map, list) {
+		int type, prot = 0;
+		size_t length;
+
+		if (devid < entry->devid_start || devid > entry->devid_end)
+			continue;
+
+		type   = IOMMU_RESV_DIRECT;
+		length = entry->address_end - entry->address_start;
+		if (entry->prot & IOMMU_PROT_IR)
+			prot |= IOMMU_READ;
+		if (entry->prot & IOMMU_PROT_IW)
+			prot |= IOMMU_WRITE;
+		if (entry->prot & IOMMU_UNITY_MAP_FLAG_EXCL_RANGE)
+			/* Exclusion range */
+			type = IOMMU_RESV_RESERVED;
+
+		region = iommu_alloc_resv_region(entry->address_start,
+						 length, prot, type);
+		if (!region) {
+			pr_err("Out of memory allocating dm-regions for %s\n",
+				dev_name(dev));
+			return;
+		}
+		list_add_tail(&region->list, head);
+	}
+
+	region = iommu_alloc_resv_region(MSI_RANGE_START,
+					 MSI_RANGE_END - MSI_RANGE_START + 1,
+					 0, IOMMU_RESV_MSI);
+	if (!region)
+		return;
+	list_add_tail(&region->list, head);
+
+	region = iommu_alloc_resv_region(HT_RANGE_START,
+					 HT_RANGE_END - HT_RANGE_START + 1,
+					 0, IOMMU_RESV_RESERVED);
+	if (!region)
+		return;
+	list_add_tail(&region->list, head);
+}
+
+static void amd_iommu_put_resv_regions(struct device *dev,
+				     struct list_head *head)
+{
+	struct iommu_resv_region *entry, *next;
+
+	list_for_each_entry_safe(entry, next, head, list)
+		kfree(entry);
+}
+
+static void amd_iommu_apply_resv_region(struct device *dev,
+				      struct iommu_domain *domain,
+				      struct iommu_resv_region *region)
+{
+	struct dma_ops_domain *dma_dom = to_dma_ops_domain(to_pdomain(domain));
+	unsigned long start, end;
+
+	start = IOVA_PFN(region->start);
+	end   = IOVA_PFN(region->start + region->length - 1);
+
+	WARN_ON_ONCE(reserve_iova(&dma_dom->iovad, start, end) == NULL);
+}
+
+static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain,
+					 struct device *dev)
+{
+	struct iommu_dev_data *dev_data = dev->archdata.iommu;
+	return dev_data->defer_attach;
+}
+
+static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
+{
+	struct protection_domain *dom = to_pdomain(domain);
+
+	domain_flush_tlb_pde(dom);
+	domain_flush_complete(dom);
+}
+
+static void amd_iommu_iotlb_range_add(struct iommu_domain *domain,
+				      unsigned long iova, size_t size)
+{
+}
+
+const struct iommu_ops amd_iommu_ops = {
+	.capable = amd_iommu_capable,
+	.domain_alloc = amd_iommu_domain_alloc,
+	.domain_free  = amd_iommu_domain_free,
+	.attach_dev = amd_iommu_attach_device,
+	.detach_dev = amd_iommu_detach_device,
+	.map = amd_iommu_map,
+	.unmap = amd_iommu_unmap,
+	.iova_to_phys = amd_iommu_iova_to_phys,
+	.add_device = amd_iommu_add_device,
+	.remove_device = amd_iommu_remove_device,
+	.device_group = amd_iommu_device_group,
+	.get_resv_regions = amd_iommu_get_resv_regions,
+	.put_resv_regions = amd_iommu_put_resv_regions,
+	.apply_resv_region = amd_iommu_apply_resv_region,
+	.is_attach_deferred = amd_iommu_is_attach_deferred,
+	.pgsize_bitmap	= AMD_IOMMU_PGSIZES,
+	.flush_iotlb_all = amd_iommu_flush_iotlb_all,
+	.iotlb_range_add = amd_iommu_iotlb_range_add,
+	.iotlb_sync = amd_iommu_flush_iotlb_all,
+};
+
+/*****************************************************************************
+ *
+ * The next functions do a basic initialization of IOMMU for pass through
+ * mode
+ *
+ * In passthrough mode the IOMMU is initialized and enabled but not used for
+ * DMA-API translation.
+ *
+ *****************************************************************************/
+
+/* IOMMUv2 specific functions */
+int amd_iommu_register_ppr_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_register(&ppr_notifier, nb);
+}
+EXPORT_SYMBOL(amd_iommu_register_ppr_notifier);
+
+int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&ppr_notifier, nb);
+}
+EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier);
+
+void amd_iommu_domain_direct_map(struct iommu_domain *dom)
+{
+	struct protection_domain *domain = to_pdomain(dom);
+	unsigned long flags;
+
+	spin_lock_irqsave(&domain->lock, flags);
+
+	/* Update data structure */
+	domain->mode    = PAGE_MODE_NONE;
+	domain->updated = true;
+
+	/* Make changes visible to IOMMUs */
+	update_domain(domain);
+
+	/* Page-table is not visible to IOMMU anymore, so free it */
+	free_pagetable(domain);
+
+	spin_unlock_irqrestore(&domain->lock, flags);
+}
+EXPORT_SYMBOL(amd_iommu_domain_direct_map);
+
+int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids)
+{
+	struct protection_domain *domain = to_pdomain(dom);
+	unsigned long flags;
+	int levels, ret;
+
+	if (pasids <= 0 || pasids > (PASID_MASK + 1))
+		return -EINVAL;
+
+	/* Number of GCR3 table levels required */
+	for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9)
+		levels += 1;
+
+	if (levels > amd_iommu_max_glx_val)
+		return -EINVAL;
+
+	spin_lock_irqsave(&domain->lock, flags);
+
+	/*
+	 * Save us all sanity checks whether devices already in the
+	 * domain support IOMMUv2. Just force that the domain has no
+	 * devices attached when it is switched into IOMMUv2 mode.
+	 */
+	ret = -EBUSY;
+	if (domain->dev_cnt > 0 || domain->flags & PD_IOMMUV2_MASK)
+		goto out;
+
+	ret = -ENOMEM;
+	domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC);
+	if (domain->gcr3_tbl == NULL)
+		goto out;
+
+	domain->glx      = levels;
+	domain->flags   |= PD_IOMMUV2_MASK;
+	domain->updated  = true;
+
+	update_domain(domain);
+
+	ret = 0;
+
+out:
+	spin_unlock_irqrestore(&domain->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(amd_iommu_domain_enable_v2);
+
+static int __flush_pasid(struct protection_domain *domain, int pasid,
+			 u64 address, bool size)
+{
+	struct iommu_dev_data *dev_data;
+	struct iommu_cmd cmd;
+	int i, ret;
+
+	if (!(domain->flags & PD_IOMMUV2_MASK))
+		return -EINVAL;
+
+	build_inv_iommu_pasid(&cmd, domain->id, pasid, address, size);
+
+	/*
+	 * IOMMU TLB needs to be flushed before Device TLB to
+	 * prevent device TLB refill from IOMMU TLB
+	 */
+	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
+		if (domain->dev_iommu[i] == 0)
+			continue;
+
+		ret = iommu_queue_command(amd_iommus[i], &cmd);
+		if (ret != 0)
+			goto out;
+	}
+
+	/* Wait until IOMMU TLB flushes are complete */
+	domain_flush_complete(domain);
+
+	/* Now flush device TLBs */
+	list_for_each_entry(dev_data, &domain->dev_list, list) {
+		struct amd_iommu *iommu;
+		int qdep;
+
+		/*
+		   There might be non-IOMMUv2 capable devices in an IOMMUv2
+		 * domain.
+		 */
+		if (!dev_data->ats.enabled)
+			continue;
+
+		qdep  = dev_data->ats.qdep;
+		iommu = amd_iommu_rlookup_table[dev_data->devid];
+
+		build_inv_iotlb_pasid(&cmd, dev_data->devid, pasid,
+				      qdep, address, size);
+
+		ret = iommu_queue_command(iommu, &cmd);
+		if (ret != 0)
+			goto out;
+	}
+
+	/* Wait until all device TLBs are flushed */
+	domain_flush_complete(domain);
+
+	ret = 0;
+
+out:
+
+	return ret;
+}
+
+static int __amd_iommu_flush_page(struct protection_domain *domain, int pasid,
+				  u64 address)
+{
+	return __flush_pasid(domain, pasid, address, false);
+}
+
+int amd_iommu_flush_page(struct iommu_domain *dom, int pasid,
+			 u64 address)
+{
+	struct protection_domain *domain = to_pdomain(dom);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&domain->lock, flags);
+	ret = __amd_iommu_flush_page(domain, pasid, address);
+	spin_unlock_irqrestore(&domain->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(amd_iommu_flush_page);
+
+static int __amd_iommu_flush_tlb(struct protection_domain *domain, int pasid)
+{
+	return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
+			     true);
+}
+
+int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid)
+{
+	struct protection_domain *domain = to_pdomain(dom);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&domain->lock, flags);
+	ret = __amd_iommu_flush_tlb(domain, pasid);
+	spin_unlock_irqrestore(&domain->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(amd_iommu_flush_tlb);
+
+static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc)
+{
+	int index;
+	u64 *pte;
+
+	while (true) {
+
+		index = (pasid >> (9 * level)) & 0x1ff;
+		pte   = &root[index];
+
+		if (level == 0)
+			break;
+
+		if (!(*pte & GCR3_VALID)) {
+			if (!alloc)
+				return NULL;
+
+			root = (void *)get_zeroed_page(GFP_ATOMIC);
+			if (root == NULL)
+				return NULL;
+
+			*pte = iommu_virt_to_phys(root) | GCR3_VALID;
+		}
+
+		root = iommu_phys_to_virt(*pte & PAGE_MASK);
+
+		level -= 1;
+	}
+
+	return pte;
+}
+
+static int __set_gcr3(struct protection_domain *domain, int pasid,
+		      unsigned long cr3)
+{
+	u64 *pte;
+
+	if (domain->mode != PAGE_MODE_NONE)
+		return -EINVAL;
+
+	pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true);
+	if (pte == NULL)
+		return -ENOMEM;
+
+	*pte = (cr3 & PAGE_MASK) | GCR3_VALID;
+
+	return __amd_iommu_flush_tlb(domain, pasid);
+}
+
+static int __clear_gcr3(struct protection_domain *domain, int pasid)
+{
+	u64 *pte;
+
+	if (domain->mode != PAGE_MODE_NONE)
+		return -EINVAL;
+
+	pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false);
+	if (pte == NULL)
+		return 0;
+
+	*pte = 0;
+
+	return __amd_iommu_flush_tlb(domain, pasid);
+}
+
+int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid,
+			      unsigned long cr3)
+{
+	struct protection_domain *domain = to_pdomain(dom);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&domain->lock, flags);
+	ret = __set_gcr3(domain, pasid, cr3);
+	spin_unlock_irqrestore(&domain->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(amd_iommu_domain_set_gcr3);
+
+int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid)
+{
+	struct protection_domain *domain = to_pdomain(dom);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&domain->lock, flags);
+	ret = __clear_gcr3(domain, pasid);
+	spin_unlock_irqrestore(&domain->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3);
+
+int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid,
+			   int status, int tag)
+{
+	struct iommu_dev_data *dev_data;
+	struct amd_iommu *iommu;
+	struct iommu_cmd cmd;
+
+	dev_data = get_dev_data(&pdev->dev);
+	iommu    = amd_iommu_rlookup_table[dev_data->devid];
+
+	build_complete_ppr(&cmd, dev_data->devid, pasid, status,
+			   tag, dev_data->pri_tlp);
+
+	return iommu_queue_command(iommu, &cmd);
+}
+EXPORT_SYMBOL(amd_iommu_complete_ppr);
+
+struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev)
+{
+	struct protection_domain *pdomain;
+
+	pdomain = get_domain(&pdev->dev);
+	if (IS_ERR(pdomain))
+		return NULL;
+
+	/* Only return IOMMUv2 domains */
+	if (!(pdomain->flags & PD_IOMMUV2_MASK))
+		return NULL;
+
+	return &pdomain->domain;
+}
+EXPORT_SYMBOL(amd_iommu_get_v2_domain);
+
+void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum)
+{
+	struct iommu_dev_data *dev_data;
+
+	if (!amd_iommu_v2_supported())
+		return;
+
+	dev_data = get_dev_data(&pdev->dev);
+	dev_data->errata |= (1 << erratum);
+}
+EXPORT_SYMBOL(amd_iommu_enable_device_erratum);
+
+int amd_iommu_device_info(struct pci_dev *pdev,
+                          struct amd_iommu_device_info *info)
+{
+	int max_pasids;
+	int pos;
+
+	if (pdev == NULL || info == NULL)
+		return -EINVAL;
+
+	if (!amd_iommu_v2_supported())
+		return -EINVAL;
+
+	memset(info, 0, sizeof(*info));
+
+	if (!pci_ats_disabled()) {
+		pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS);
+		if (pos)
+			info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP;
+	}
+
+	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
+	if (pos)
+		info->flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP;
+
+	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID);
+	if (pos) {
+		int features;
+
+		max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1));
+		max_pasids = min(max_pasids, (1 << 20));
+
+		info->flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
+		info->max_pasids = min(pci_max_pasids(pdev), max_pasids);
+
+		features = pci_pasid_features(pdev);
+		if (features & PCI_PASID_CAP_EXEC)
+			info->flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP;
+		if (features & PCI_PASID_CAP_PRIV)
+			info->flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(amd_iommu_device_info);
+
+#ifdef CONFIG_IRQ_REMAP
+
+/*****************************************************************************
+ *
+ * Interrupt Remapping Implementation
+ *
+ *****************************************************************************/
+
+static struct irq_chip amd_ir_chip;
+static DEFINE_SPINLOCK(iommu_table_lock);
+
+static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table)
+{
+	u64 dte;
+
+	dte	= amd_iommu_dev_table[devid].data[2];
+	dte	&= ~DTE_IRQ_PHYS_ADDR_MASK;
+	dte	|= iommu_virt_to_phys(table->table);
+	dte	|= DTE_IRQ_REMAP_INTCTL;
+	dte	|= DTE_IRQ_TABLE_LEN;
+	dte	|= DTE_IRQ_REMAP_ENABLE;
+
+	amd_iommu_dev_table[devid].data[2] = dte;
+}
+
+static struct irq_remap_table *get_irq_table(u16 devid)
+{
+	struct irq_remap_table *table;
+
+	if (WARN_ONCE(!amd_iommu_rlookup_table[devid],
+		      "%s: no iommu for devid %x\n", __func__, devid))
+		return NULL;
+
+	table = irq_lookup_table[devid];
+	if (WARN_ONCE(!table, "%s: no table for devid %x\n", __func__, devid))
+		return NULL;
+
+	return table;
+}
+
+static struct irq_remap_table *__alloc_irq_table(void)
+{
+	struct irq_remap_table *table;
+
+	table = kzalloc(sizeof(*table), GFP_KERNEL);
+	if (!table)
+		return NULL;
+
+	table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL);
+	if (!table->table) {
+		kfree(table);
+		return NULL;
+	}
+	raw_spin_lock_init(&table->lock);
+
+	if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
+		memset(table->table, 0,
+		       MAX_IRQS_PER_TABLE * sizeof(u32));
+	else
+		memset(table->table, 0,
+		       (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
+	return table;
+}
+
+static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid,
+				  struct irq_remap_table *table)
+{
+	irq_lookup_table[devid] = table;
+	set_dte_irq_entry(devid, table);
+	iommu_flush_dte(iommu, devid);
+}
+
+static int set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias,
+				       void *data)
+{
+	struct irq_remap_table *table = data;
+
+	irq_lookup_table[alias] = table;
+	set_dte_irq_entry(alias, table);
+
+	iommu_flush_dte(amd_iommu_rlookup_table[alias], alias);
+
+	return 0;
+}
+
+static struct irq_remap_table *alloc_irq_table(u16 devid, struct pci_dev *pdev)
+{
+	struct irq_remap_table *table = NULL;
+	struct irq_remap_table *new_table = NULL;
+	struct amd_iommu *iommu;
+	unsigned long flags;
+	u16 alias;
+
+	spin_lock_irqsave(&iommu_table_lock, flags);
+
+	iommu = amd_iommu_rlookup_table[devid];
+	if (!iommu)
+		goto out_unlock;
+
+	table = irq_lookup_table[devid];
+	if (table)
+		goto out_unlock;
+
+	alias = amd_iommu_alias_table[devid];
+	table = irq_lookup_table[alias];
+	if (table) {
+		set_remap_table_entry(iommu, devid, table);
+		goto out_wait;
+	}
+	spin_unlock_irqrestore(&iommu_table_lock, flags);
+
+	/* Nothing there yet, allocate new irq remapping table */
+	new_table = __alloc_irq_table();
+	if (!new_table)
+		return NULL;
+
+	spin_lock_irqsave(&iommu_table_lock, flags);
+
+	table = irq_lookup_table[devid];
+	if (table)
+		goto out_unlock;
+
+	table = irq_lookup_table[alias];
+	if (table) {
+		set_remap_table_entry(iommu, devid, table);
+		goto out_wait;
+	}
+
+	table = new_table;
+	new_table = NULL;
+
+	if (pdev)
+		pci_for_each_dma_alias(pdev, set_remap_table_entry_alias,
+				       table);
+	else
+		set_remap_table_entry(iommu, devid, table);
+
+	if (devid != alias)
+		set_remap_table_entry(iommu, alias, table);
+
+out_wait:
+	iommu_completion_wait(iommu);
+
+out_unlock:
+	spin_unlock_irqrestore(&iommu_table_lock, flags);
+
+	if (new_table) {
+		kmem_cache_free(amd_iommu_irq_cache, new_table->table);
+		kfree(new_table);
+	}
+	return table;
+}
+
+static int alloc_irq_index(u16 devid, int count, bool align,
+			   struct pci_dev *pdev)
+{
+	struct irq_remap_table *table;
+	int index, c, alignment = 1;
+	unsigned long flags;
+	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+	if (!iommu)
+		return -ENODEV;
+
+	table = alloc_irq_table(devid, pdev);
+	if (!table)
+		return -ENODEV;
+
+	if (align)
+		alignment = roundup_pow_of_two(count);
+
+	raw_spin_lock_irqsave(&table->lock, flags);
+
+	/* Scan table for free entries */
+	for (index = ALIGN(table->min_index, alignment), c = 0;
+	     index < MAX_IRQS_PER_TABLE;) {
+		if (!iommu->irte_ops->is_allocated(table, index)) {
+			c += 1;
+		} else {
+			c     = 0;
+			index = ALIGN(index + 1, alignment);
+			continue;
+		}
+
+		if (c == count)	{
+			for (; c != 0; --c)
+				iommu->irte_ops->set_allocated(table, index - c + 1);
+
+			index -= count - 1;
+			goto out;
+		}
+
+		index++;
+	}
+
+	index = -ENOSPC;
+
+out:
+	raw_spin_unlock_irqrestore(&table->lock, flags);
+
+	return index;
+}
+
+static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte,
+			  struct amd_ir_data *data)
+{
+	struct irq_remap_table *table;
+	struct amd_iommu *iommu;
+	unsigned long flags;
+	struct irte_ga *entry;
+
+	iommu = amd_iommu_rlookup_table[devid];
+	if (iommu == NULL)
+		return -EINVAL;
+
+	table = get_irq_table(devid);
+	if (!table)
+		return -ENOMEM;
+
+	raw_spin_lock_irqsave(&table->lock, flags);
+
+	entry = (struct irte_ga *)table->table;
+	entry = &entry[index];
+	entry->lo.fields_remap.valid = 0;
+	entry->hi.val = irte->hi.val;
+	entry->lo.val = irte->lo.val;
+	entry->lo.fields_remap.valid = 1;
+	if (data)
+		data->ref = entry;
+
+	raw_spin_unlock_irqrestore(&table->lock, flags);
+
+	iommu_flush_irt(iommu, devid);
+	iommu_completion_wait(iommu);
+
+	return 0;
+}
+
+static int modify_irte(u16 devid, int index, union irte *irte)
+{
+	struct irq_remap_table *table;
+	struct amd_iommu *iommu;
+	unsigned long flags;
+
+	iommu = amd_iommu_rlookup_table[devid];
+	if (iommu == NULL)
+		return -EINVAL;
+
+	table = get_irq_table(devid);
+	if (!table)
+		return -ENOMEM;
+
+	raw_spin_lock_irqsave(&table->lock, flags);
+	table->table[index] = irte->val;
+	raw_spin_unlock_irqrestore(&table->lock, flags);
+
+	iommu_flush_irt(iommu, devid);
+	iommu_completion_wait(iommu);
+
+	return 0;
+}
+
+static void free_irte(u16 devid, int index)
+{
+	struct irq_remap_table *table;
+	struct amd_iommu *iommu;
+	unsigned long flags;
+
+	iommu = amd_iommu_rlookup_table[devid];
+	if (iommu == NULL)
+		return;
+
+	table = get_irq_table(devid);
+	if (!table)
+		return;
+
+	raw_spin_lock_irqsave(&table->lock, flags);
+	iommu->irte_ops->clear_allocated(table, index);
+	raw_spin_unlock_irqrestore(&table->lock, flags);
+
+	iommu_flush_irt(iommu, devid);
+	iommu_completion_wait(iommu);
+}
+
+static void irte_prepare(void *entry,
+			 u32 delivery_mode, u32 dest_mode,
+			 u8 vector, u32 dest_apicid, int devid)
+{
+	union irte *irte = (union irte *) entry;
+
+	irte->val                = 0;
+	irte->fields.vector      = vector;
+	irte->fields.int_type    = delivery_mode;
+	irte->fields.destination = dest_apicid;
+	irte->fields.dm          = dest_mode;
+	irte->fields.valid       = 1;
+}
+
+static void irte_ga_prepare(void *entry,
+			    u32 delivery_mode, u32 dest_mode,
+			    u8 vector, u32 dest_apicid, int devid)
+{
+	struct irte_ga *irte = (struct irte_ga *) entry;
+
+	irte->lo.val                      = 0;
+	irte->hi.val                      = 0;
+	irte->lo.fields_remap.int_type    = delivery_mode;
+	irte->lo.fields_remap.dm          = dest_mode;
+	irte->hi.fields.vector            = vector;
+	irte->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(dest_apicid);
+	irte->hi.fields.destination       = APICID_TO_IRTE_DEST_HI(dest_apicid);
+	irte->lo.fields_remap.valid       = 1;
+}
+
+static void irte_activate(void *entry, u16 devid, u16 index)
+{
+	union irte *irte = (union irte *) entry;
+
+	irte->fields.valid = 1;
+	modify_irte(devid, index, irte);
+}
+
+static void irte_ga_activate(void *entry, u16 devid, u16 index)
+{
+	struct irte_ga *irte = (struct irte_ga *) entry;
+
+	irte->lo.fields_remap.valid = 1;
+	modify_irte_ga(devid, index, irte, NULL);
+}
+
+static void irte_deactivate(void *entry, u16 devid, u16 index)
+{
+	union irte *irte = (union irte *) entry;
+
+	irte->fields.valid = 0;
+	modify_irte(devid, index, irte);
+}
+
+static void irte_ga_deactivate(void *entry, u16 devid, u16 index)
+{
+	struct irte_ga *irte = (struct irte_ga *) entry;
+
+	irte->lo.fields_remap.valid = 0;
+	modify_irte_ga(devid, index, irte, NULL);
+}
+
+static void irte_set_affinity(void *entry, u16 devid, u16 index,
+			      u8 vector, u32 dest_apicid)
+{
+	union irte *irte = (union irte *) entry;
+
+	irte->fields.vector = vector;
+	irte->fields.destination = dest_apicid;
+	modify_irte(devid, index, irte);
+}
+
+static void irte_ga_set_affinity(void *entry, u16 devid, u16 index,
+				 u8 vector, u32 dest_apicid)
+{
+	struct irte_ga *irte = (struct irte_ga *) entry;
+
+	if (!irte->lo.fields_remap.guest_mode) {
+		irte->hi.fields.vector = vector;
+		irte->lo.fields_remap.destination =
+					APICID_TO_IRTE_DEST_LO(dest_apicid);
+		irte->hi.fields.destination =
+					APICID_TO_IRTE_DEST_HI(dest_apicid);
+		modify_irte_ga(devid, index, irte, NULL);
+	}
+}
+
+#define IRTE_ALLOCATED (~1U)
+static void irte_set_allocated(struct irq_remap_table *table, int index)
+{
+	table->table[index] = IRTE_ALLOCATED;
+}
+
+static void irte_ga_set_allocated(struct irq_remap_table *table, int index)
+{
+	struct irte_ga *ptr = (struct irte_ga *)table->table;
+	struct irte_ga *irte = &ptr[index];
+
+	memset(&irte->lo.val, 0, sizeof(u64));
+	memset(&irte->hi.val, 0, sizeof(u64));
+	irte->hi.fields.vector = 0xff;
+}
+
+static bool irte_is_allocated(struct irq_remap_table *table, int index)
+{
+	union irte *ptr = (union irte *)table->table;
+	union irte *irte = &ptr[index];
+
+	return irte->val != 0;
+}
+
+static bool irte_ga_is_allocated(struct irq_remap_table *table, int index)
+{
+	struct irte_ga *ptr = (struct irte_ga *)table->table;
+	struct irte_ga *irte = &ptr[index];
+
+	return irte->hi.fields.vector != 0;
+}
+
+static void irte_clear_allocated(struct irq_remap_table *table, int index)
+{
+	table->table[index] = 0;
+}
+
+static void irte_ga_clear_allocated(struct irq_remap_table *table, int index)
+{
+	struct irte_ga *ptr = (struct irte_ga *)table->table;
+	struct irte_ga *irte = &ptr[index];
+
+	memset(&irte->lo.val, 0, sizeof(u64));
+	memset(&irte->hi.val, 0, sizeof(u64));
+}
+
+static int get_devid(struct irq_alloc_info *info)
+{
+	int devid = -1;
+
+	switch (info->type) {
+	case X86_IRQ_ALLOC_TYPE_IOAPIC:
+		devid     = get_ioapic_devid(info->ioapic_id);
+		break;
+	case X86_IRQ_ALLOC_TYPE_HPET:
+		devid     = get_hpet_devid(info->hpet_id);
+		break;
+	case X86_IRQ_ALLOC_TYPE_MSI:
+	case X86_IRQ_ALLOC_TYPE_MSIX:
+		devid = get_device_id(&info->msi_dev->dev);
+		break;
+	default:
+		BUG_ON(1);
+		break;
+	}
+
+	return devid;
+}
+
+static struct irq_domain *get_ir_irq_domain(struct irq_alloc_info *info)
+{
+	struct amd_iommu *iommu;
+	int devid;
+
+	if (!info)
+		return NULL;
+
+	devid = get_devid(info);
+	if (devid >= 0) {
+		iommu = amd_iommu_rlookup_table[devid];
+		if (iommu)
+			return iommu->ir_domain;
+	}
+
+	return NULL;
+}
+
+static struct irq_domain *get_irq_domain(struct irq_alloc_info *info)
+{
+	struct amd_iommu *iommu;
+	int devid;
+
+	if (!info)
+		return NULL;
+
+	switch (info->type) {
+	case X86_IRQ_ALLOC_TYPE_MSI:
+	case X86_IRQ_ALLOC_TYPE_MSIX:
+		devid = get_device_id(&info->msi_dev->dev);
+		if (devid < 0)
+			return NULL;
+
+		iommu = amd_iommu_rlookup_table[devid];
+		if (iommu)
+			return iommu->msi_domain;
+		break;
+	default:
+		break;
+	}
+
+	return NULL;
+}
+
+struct irq_remap_ops amd_iommu_irq_ops = {
+	.prepare		= amd_iommu_prepare,
+	.enable			= amd_iommu_enable,
+	.disable		= amd_iommu_disable,
+	.reenable		= amd_iommu_reenable,
+	.enable_faulting	= amd_iommu_enable_faulting,
+	.get_ir_irq_domain	= get_ir_irq_domain,
+	.get_irq_domain		= get_irq_domain,
+};
+
+static void irq_remapping_prepare_irte(struct amd_ir_data *data,
+				       struct irq_cfg *irq_cfg,
+				       struct irq_alloc_info *info,
+				       int devid, int index, int sub_handle)
+{
+	struct irq_2_irte *irte_info = &data->irq_2_irte;
+	struct msi_msg *msg = &data->msi_entry;
+	struct IO_APIC_route_entry *entry;
+	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+	if (!iommu)
+		return;
+
+	data->irq_2_irte.devid = devid;
+	data->irq_2_irte.index = index + sub_handle;
+	iommu->irte_ops->prepare(data->entry, apic->irq_delivery_mode,
+				 apic->irq_dest_mode, irq_cfg->vector,
+				 irq_cfg->dest_apicid, devid);
+
+	switch (info->type) {
+	case X86_IRQ_ALLOC_TYPE_IOAPIC:
+		/* Setup IOAPIC entry */
+		entry = info->ioapic_entry;
+		info->ioapic_entry = NULL;
+		memset(entry, 0, sizeof(*entry));
+		entry->vector        = index;
+		entry->mask          = 0;
+		entry->trigger       = info->ioapic_trigger;
+		entry->polarity      = info->ioapic_polarity;
+		/* Mask level triggered irqs. */
+		if (info->ioapic_trigger)
+			entry->mask = 1;
+		break;
+
+	case X86_IRQ_ALLOC_TYPE_HPET:
+	case X86_IRQ_ALLOC_TYPE_MSI:
+	case X86_IRQ_ALLOC_TYPE_MSIX:
+		msg->address_hi = MSI_ADDR_BASE_HI;
+		msg->address_lo = MSI_ADDR_BASE_LO;
+		msg->data = irte_info->index;
+		break;
+
+	default:
+		BUG_ON(1);
+		break;
+	}
+}
+
+struct amd_irte_ops irte_32_ops = {
+	.prepare = irte_prepare,
+	.activate = irte_activate,
+	.deactivate = irte_deactivate,
+	.set_affinity = irte_set_affinity,
+	.set_allocated = irte_set_allocated,
+	.is_allocated = irte_is_allocated,
+	.clear_allocated = irte_clear_allocated,
+};
+
+struct amd_irte_ops irte_128_ops = {
+	.prepare = irte_ga_prepare,
+	.activate = irte_ga_activate,
+	.deactivate = irte_ga_deactivate,
+	.set_affinity = irte_ga_set_affinity,
+	.set_allocated = irte_ga_set_allocated,
+	.is_allocated = irte_ga_is_allocated,
+	.clear_allocated = irte_ga_clear_allocated,
+};
+
+static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
+			       unsigned int nr_irqs, void *arg)
+{
+	struct irq_alloc_info *info = arg;
+	struct irq_data *irq_data;
+	struct amd_ir_data *data = NULL;
+	struct irq_cfg *cfg;
+	int i, ret, devid;
+	int index;
+
+	if (!info)
+		return -EINVAL;
+	if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_MSI &&
+	    info->type != X86_IRQ_ALLOC_TYPE_MSIX)
+		return -EINVAL;
+
+	/*
+	 * With IRQ remapping enabled, don't need contiguous CPU vectors
+	 * to support multiple MSI interrupts.
+	 */
+	if (info->type == X86_IRQ_ALLOC_TYPE_MSI)
+		info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
+
+	devid = get_devid(info);
+	if (devid < 0)
+		return -EINVAL;
+
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+	if (ret < 0)
+		return ret;
+
+	if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
+		struct irq_remap_table *table;
+		struct amd_iommu *iommu;
+
+		table = alloc_irq_table(devid, NULL);
+		if (table) {
+			if (!table->min_index) {
+				/*
+				 * Keep the first 32 indexes free for IOAPIC
+				 * interrupts.
+				 */
+				table->min_index = 32;
+				iommu = amd_iommu_rlookup_table[devid];
+				for (i = 0; i < 32; ++i)
+					iommu->irte_ops->set_allocated(table, i);
+			}
+			WARN_ON(table->min_index != 32);
+			index = info->ioapic_pin;
+		} else {
+			index = -ENOMEM;
+		}
+	} else if (info->type == X86_IRQ_ALLOC_TYPE_MSI ||
+		   info->type == X86_IRQ_ALLOC_TYPE_MSIX) {
+		bool align = (info->type == X86_IRQ_ALLOC_TYPE_MSI);
+
+		index = alloc_irq_index(devid, nr_irqs, align, info->msi_dev);
+	} else {
+		index = alloc_irq_index(devid, nr_irqs, false, NULL);
+	}
+
+	if (index < 0) {
+		pr_warn("Failed to allocate IRTE\n");
+		ret = index;
+		goto out_free_parent;
+	}
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_data = irq_domain_get_irq_data(domain, virq + i);
+		cfg = irqd_cfg(irq_data);
+		if (!irq_data || !cfg) {
+			ret = -EINVAL;
+			goto out_free_data;
+		}
+
+		ret = -ENOMEM;
+		data = kzalloc(sizeof(*data), GFP_KERNEL);
+		if (!data)
+			goto out_free_data;
+
+		if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
+			data->entry = kzalloc(sizeof(union irte), GFP_KERNEL);
+		else
+			data->entry = kzalloc(sizeof(struct irte_ga),
+						     GFP_KERNEL);
+		if (!data->entry) {
+			kfree(data);
+			goto out_free_data;
+		}
+
+		irq_data->hwirq = (devid << 16) + i;
+		irq_data->chip_data = data;
+		irq_data->chip = &amd_ir_chip;
+		irq_remapping_prepare_irte(data, cfg, info, devid, index, i);
+		irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
+	}
+
+	return 0;
+
+out_free_data:
+	for (i--; i >= 0; i--) {
+		irq_data = irq_domain_get_irq_data(domain, virq + i);
+		if (irq_data)
+			kfree(irq_data->chip_data);
+	}
+	for (i = 0; i < nr_irqs; i++)
+		free_irte(devid, index + i);
+out_free_parent:
+	irq_domain_free_irqs_common(domain, virq, nr_irqs);
+	return ret;
+}
+
+static void irq_remapping_free(struct irq_domain *domain, unsigned int virq,
+			       unsigned int nr_irqs)
+{
+	struct irq_2_irte *irte_info;
+	struct irq_data *irq_data;
+	struct amd_ir_data *data;
+	int i;
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_data = irq_domain_get_irq_data(domain, virq  + i);
+		if (irq_data && irq_data->chip_data) {
+			data = irq_data->chip_data;
+			irte_info = &data->irq_2_irte;
+			free_irte(irte_info->devid, irte_info->index);
+			kfree(data->entry);
+			kfree(data);
+		}
+	}
+	irq_domain_free_irqs_common(domain, virq, nr_irqs);
+}
+
+static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
+			       struct amd_ir_data *ir_data,
+			       struct irq_2_irte *irte_info,
+			       struct irq_cfg *cfg);
+
+static int irq_remapping_activate(struct irq_domain *domain,
+				  struct irq_data *irq_data, bool reserve)
+{
+	struct amd_ir_data *data = irq_data->chip_data;
+	struct irq_2_irte *irte_info = &data->irq_2_irte;
+	struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
+	struct irq_cfg *cfg = irqd_cfg(irq_data);
+
+	if (!iommu)
+		return 0;
+
+	iommu->irte_ops->activate(data->entry, irte_info->devid,
+				  irte_info->index);
+	amd_ir_update_irte(irq_data, iommu, data, irte_info, cfg);
+	return 0;
+}
+
+static void irq_remapping_deactivate(struct irq_domain *domain,
+				     struct irq_data *irq_data)
+{
+	struct amd_ir_data *data = irq_data->chip_data;
+	struct irq_2_irte *irte_info = &data->irq_2_irte;
+	struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
+
+	if (iommu)
+		iommu->irte_ops->deactivate(data->entry, irte_info->devid,
+					    irte_info->index);
+}
+
+static const struct irq_domain_ops amd_ir_domain_ops = {
+	.alloc = irq_remapping_alloc,
+	.free = irq_remapping_free,
+	.activate = irq_remapping_activate,
+	.deactivate = irq_remapping_deactivate,
+};
+
+static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
+{
+	struct amd_iommu *iommu;
+	struct amd_iommu_pi_data *pi_data = vcpu_info;
+	struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
+	struct amd_ir_data *ir_data = data->chip_data;
+	struct irte_ga *irte = (struct irte_ga *) ir_data->entry;
+	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
+	struct iommu_dev_data *dev_data = search_dev_data(irte_info->devid);
+
+	/* Note:
+	 * This device has never been set up for guest mode.
+	 * we should not modify the IRTE
+	 */
+	if (!dev_data || !dev_data->use_vapic)
+		return 0;
+
+	pi_data->ir_data = ir_data;
+
+	/* Note:
+	 * SVM tries to set up for VAPIC mode, but we are in
+	 * legacy mode. So, we force legacy mode instead.
+	 */
+	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
+		pr_debug("AMD-Vi: %s: Fall back to using intr legacy remap\n",
+			 __func__);
+		pi_data->is_guest_mode = false;
+	}
+
+	iommu = amd_iommu_rlookup_table[irte_info->devid];
+	if (iommu == NULL)
+		return -EINVAL;
+
+	pi_data->prev_ga_tag = ir_data->cached_ga_tag;
+	if (pi_data->is_guest_mode) {
+		/* Setting */
+		irte->hi.fields.ga_root_ptr = (pi_data->base >> 12);
+		irte->hi.fields.vector = vcpu_pi_info->vector;
+		irte->lo.fields_vapic.ga_log_intr = 1;
+		irte->lo.fields_vapic.guest_mode = 1;
+		irte->lo.fields_vapic.ga_tag = pi_data->ga_tag;
+
+		ir_data->cached_ga_tag = pi_data->ga_tag;
+	} else {
+		/* Un-Setting */
+		struct irq_cfg *cfg = irqd_cfg(data);
+
+		irte->hi.val = 0;
+		irte->lo.val = 0;
+		irte->hi.fields.vector = cfg->vector;
+		irte->lo.fields_remap.guest_mode = 0;
+		irte->lo.fields_remap.destination =
+				APICID_TO_IRTE_DEST_LO(cfg->dest_apicid);
+		irte->hi.fields.destination =
+				APICID_TO_IRTE_DEST_HI(cfg->dest_apicid);
+		irte->lo.fields_remap.int_type = apic->irq_delivery_mode;
+		irte->lo.fields_remap.dm = apic->irq_dest_mode;
+
+		/*
+		 * This communicates the ga_tag back to the caller
+		 * so that it can do all the necessary clean up.
+		 */
+		ir_data->cached_ga_tag = 0;
+	}
+
+	return modify_irte_ga(irte_info->devid, irte_info->index, irte, ir_data);
+}
+
+
+static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
+			       struct amd_ir_data *ir_data,
+			       struct irq_2_irte *irte_info,
+			       struct irq_cfg *cfg)
+{
+
+	/*
+	 * Atomically updates the IRTE with the new destination, vector
+	 * and flushes the interrupt entry cache.
+	 */
+	iommu->irte_ops->set_affinity(ir_data->entry, irte_info->devid,
+				      irte_info->index, cfg->vector,
+				      cfg->dest_apicid);
+}
+
+static int amd_ir_set_affinity(struct irq_data *data,
+			       const struct cpumask *mask, bool force)
+{
+	struct amd_ir_data *ir_data = data->chip_data;
+	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
+	struct irq_cfg *cfg = irqd_cfg(data);
+	struct irq_data *parent = data->parent_data;
+	struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
+	int ret;
+
+	if (!iommu)
+		return -ENODEV;
+
+	ret = parent->chip->irq_set_affinity(parent, mask, force);
+	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+		return ret;
+
+	amd_ir_update_irte(data, iommu, ir_data, irte_info, cfg);
+	/*
+	 * After this point, all the interrupts will start arriving
+	 * at the new destination. So, time to cleanup the previous
+	 * vector allocation.
+	 */
+	send_cleanup_vector(cfg);
+
+	return IRQ_SET_MASK_OK_DONE;
+}
+
+static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
+{
+	struct amd_ir_data *ir_data = irq_data->chip_data;
+
+	*msg = ir_data->msi_entry;
+}
+
+static struct irq_chip amd_ir_chip = {
+	.name			= "AMD-IR",
+	.irq_ack		= apic_ack_irq,
+	.irq_set_affinity	= amd_ir_set_affinity,
+	.irq_set_vcpu_affinity	= amd_ir_set_vcpu_affinity,
+	.irq_compose_msi_msg	= ir_compose_msi_msg,
+};
+
+int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
+{
+	struct fwnode_handle *fn;
+
+	fn = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index);
+	if (!fn)
+		return -ENOMEM;
+	iommu->ir_domain = irq_domain_create_tree(fn, &amd_ir_domain_ops, iommu);
+	if (!iommu->ir_domain) {
+		irq_domain_free_fwnode(fn);
+		return -ENOMEM;
+	}
+
+	iommu->ir_domain->parent = arch_get_ir_parent_domain();
+	iommu->msi_domain = arch_create_remap_msi_irq_domain(iommu->ir_domain,
+							     "AMD-IR-MSI",
+							     iommu->index);
+	return 0;
+}
+
+int amd_iommu_update_ga(int cpu, bool is_run, void *data)
+{
+	unsigned long flags;
+	struct amd_iommu *iommu;
+	struct irq_remap_table *table;
+	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
+	int devid = ir_data->irq_2_irte.devid;
+	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
+	struct irte_ga *ref = (struct irte_ga *) ir_data->ref;
+
+	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
+	    !ref || !entry || !entry->lo.fields_vapic.guest_mode)
+		return 0;
+
+	iommu = amd_iommu_rlookup_table[devid];
+	if (!iommu)
+		return -ENODEV;
+
+	table = get_irq_table(devid);
+	if (!table)
+		return -ENODEV;
+
+	raw_spin_lock_irqsave(&table->lock, flags);
+
+	if (ref->lo.fields_vapic.guest_mode) {
+		if (cpu >= 0) {
+			ref->lo.fields_vapic.destination =
+						APICID_TO_IRTE_DEST_LO(cpu);
+			ref->hi.fields.destination =
+						APICID_TO_IRTE_DEST_HI(cpu);
+		}
+		ref->lo.fields_vapic.is_run = is_run;
+		barrier();
+	}
+
+	raw_spin_unlock_irqrestore(&table->lock, flags);
+
+	iommu_flush_irt(iommu, devid);
+	iommu_completion_wait(iommu);
+	return 0;
+}
+EXPORT_SYMBOL(amd_iommu_update_ga);
+#endif
diff --git a/drivers/iommu/amd_iommu.h b/drivers/iommu/amd_iommu.h
new file mode 100644
index 000000000..12d540d9b
--- /dev/null
+++ b/drivers/iommu/amd_iommu.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef AMD_IOMMU_H
+#define AMD_IOMMU_H
+
+int __init add_special_device(u8 type, u8 id, u16 *devid, bool cmd_line);
+
+#ifdef CONFIG_DMI
+void amd_iommu_apply_ivrs_quirks(void);
+#else
+static void amd_iommu_apply_ivrs_quirks(void) { }
+#endif
+
+#endif
diff --git a/drivers/iommu/amd_iommu_debugfs.c b/drivers/iommu/amd_iommu_debugfs.c
new file mode 100644
index 000000000..c6a5c737e
--- /dev/null
+++ b/drivers/iommu/amd_iommu_debugfs.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD IOMMU driver
+ *
+ * Copyright (C) 2018 Advanced Micro Devices, Inc.
+ *
+ * Author: Gary R Hook <gary.hook@amd.com>
+ */
+
+#include <linux/debugfs.h>
+#include <linux/iommu.h>
+#include <linux/pci.h>
+#include "amd_iommu_proto.h"
+#include "amd_iommu_types.h"
+
+static struct dentry *amd_iommu_debugfs;
+static DEFINE_MUTEX(amd_iommu_debugfs_lock);
+
+#define	MAX_NAME_LEN	20
+
+void amd_iommu_debugfs_setup(struct amd_iommu *iommu)
+{
+	char name[MAX_NAME_LEN + 1];
+
+	mutex_lock(&amd_iommu_debugfs_lock);
+	if (!amd_iommu_debugfs)
+		amd_iommu_debugfs = debugfs_create_dir("amd",
+						       iommu_debugfs_dir);
+	mutex_unlock(&amd_iommu_debugfs_lock);
+
+	snprintf(name, MAX_NAME_LEN, "iommu%02d", iommu->index);
+	iommu->debugfs = debugfs_create_dir(name, amd_iommu_debugfs);
+}
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
new file mode 100644
index 000000000..1c61cd0b1
--- /dev/null
+++ b/drivers/iommu/amd_iommu_init.c
@@ -0,0 +1,3078 @@
+/*
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <jroedel@suse.de>
+ *         Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/list.h>
+#include <linux/bitmap.h>
+#include <linux/slab.h>
+#include <linux/syscore_ops.h>
+#include <linux/interrupt.h>
+#include <linux/msi.h>
+#include <linux/amd-iommu.h>
+#include <linux/export.h>
+#include <linux/iommu.h>
+#include <linux/kmemleak.h>
+#include <linux/mem_encrypt.h>
+#include <linux/iopoll.h>
+#include <asm/pci-direct.h>
+#include <asm/iommu.h>
+#include <asm/gart.h>
+#include <asm/x86_init.h>
+#include <asm/iommu_table.h>
+#include <asm/io_apic.h>
+#include <asm/irq_remapping.h>
+
+#include <linux/crash_dump.h>
+#include "amd_iommu.h"
+#include "amd_iommu_proto.h"
+#include "amd_iommu_types.h"
+#include "irq_remapping.h"
+
+/*
+ * definitions for the ACPI scanning code
+ */
+#define IVRS_HEADER_LENGTH 48
+
+#define ACPI_IVHD_TYPE_MAX_SUPPORTED	0x40
+#define ACPI_IVMD_TYPE_ALL              0x20
+#define ACPI_IVMD_TYPE                  0x21
+#define ACPI_IVMD_TYPE_RANGE            0x22
+
+#define IVHD_DEV_ALL                    0x01
+#define IVHD_DEV_SELECT                 0x02
+#define IVHD_DEV_SELECT_RANGE_START     0x03
+#define IVHD_DEV_RANGE_END              0x04
+#define IVHD_DEV_ALIAS                  0x42
+#define IVHD_DEV_ALIAS_RANGE            0x43
+#define IVHD_DEV_EXT_SELECT             0x46
+#define IVHD_DEV_EXT_SELECT_RANGE       0x47
+#define IVHD_DEV_SPECIAL		0x48
+#define IVHD_DEV_ACPI_HID		0xf0
+
+#define UID_NOT_PRESENT                 0
+#define UID_IS_INTEGER                  1
+#define UID_IS_CHARACTER                2
+
+#define IVHD_SPECIAL_IOAPIC		1
+#define IVHD_SPECIAL_HPET		2
+
+#define IVHD_FLAG_HT_TUN_EN_MASK        0x01
+#define IVHD_FLAG_PASSPW_EN_MASK        0x02
+#define IVHD_FLAG_RESPASSPW_EN_MASK     0x04
+#define IVHD_FLAG_ISOC_EN_MASK          0x08
+
+#define IVMD_FLAG_EXCL_RANGE            0x08
+#define IVMD_FLAG_UNITY_MAP             0x01
+
+#define ACPI_DEVFLAG_INITPASS           0x01
+#define ACPI_DEVFLAG_EXTINT             0x02
+#define ACPI_DEVFLAG_NMI                0x04
+#define ACPI_DEVFLAG_SYSMGT1            0x10
+#define ACPI_DEVFLAG_SYSMGT2            0x20
+#define ACPI_DEVFLAG_LINT0              0x40
+#define ACPI_DEVFLAG_LINT1              0x80
+#define ACPI_DEVFLAG_ATSDIS             0x10000000
+
+#define LOOP_TIMEOUT	2000000
+/*
+ * ACPI table definitions
+ *
+ * These data structures are laid over the table to parse the important values
+ * out of it.
+ */
+
+extern const struct iommu_ops amd_iommu_ops;
+
+/*
+ * structure describing one IOMMU in the ACPI table. Typically followed by one
+ * or more ivhd_entrys.
+ */
+struct ivhd_header {
+	u8 type;
+	u8 flags;
+	u16 length;
+	u16 devid;
+	u16 cap_ptr;
+	u64 mmio_phys;
+	u16 pci_seg;
+	u16 info;
+	u32 efr_attr;
+
+	/* Following only valid on IVHD type 11h and 40h */
+	u64 efr_reg; /* Exact copy of MMIO_EXT_FEATURES */
+	u64 res;
+} __attribute__((packed));
+
+/*
+ * A device entry describing which devices a specific IOMMU translates and
+ * which requestor ids they use.
+ */
+struct ivhd_entry {
+	u8 type;
+	u16 devid;
+	u8 flags;
+	u32 ext;
+	u32 hidh;
+	u64 cid;
+	u8 uidf;
+	u8 uidl;
+	u8 uid;
+} __attribute__((packed));
+
+/*
+ * An AMD IOMMU memory definition structure. It defines things like exclusion
+ * ranges for devices and regions that should be unity mapped.
+ */
+struct ivmd_header {
+	u8 type;
+	u8 flags;
+	u16 length;
+	u16 devid;
+	u16 aux;
+	u64 resv;
+	u64 range_start;
+	u64 range_length;
+} __attribute__((packed));
+
+bool amd_iommu_dump;
+bool amd_iommu_irq_remap __read_mostly;
+
+int amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_VAPIC;
+static int amd_iommu_xt_mode = IRQ_REMAP_X2APIC_MODE;
+
+static bool amd_iommu_detected;
+static bool __initdata amd_iommu_disabled;
+static int amd_iommu_target_ivhd_type;
+
+u16 amd_iommu_last_bdf;			/* largest PCI device id we have
+					   to handle */
+LIST_HEAD(amd_iommu_unity_map);		/* a list of required unity mappings
+					   we find in ACPI */
+bool amd_iommu_unmap_flush;		/* if true, flush on every unmap */
+
+LIST_HEAD(amd_iommu_list);		/* list of all AMD IOMMUs in the
+					   system */
+
+/* Array to assign indices to IOMMUs*/
+struct amd_iommu *amd_iommus[MAX_IOMMUS];
+
+/* Number of IOMMUs present in the system */
+static int amd_iommus_present;
+
+/* IOMMUs have a non-present cache? */
+bool amd_iommu_np_cache __read_mostly;
+bool amd_iommu_iotlb_sup __read_mostly = true;
+
+u32 amd_iommu_max_pasid __read_mostly = ~0;
+
+bool amd_iommu_v2_present __read_mostly;
+static bool amd_iommu_pc_present __read_mostly;
+
+bool amd_iommu_force_isolation __read_mostly;
+
+/*
+ * List of protection domains - used during resume
+ */
+LIST_HEAD(amd_iommu_pd_list);
+spinlock_t amd_iommu_pd_lock;
+
+/*
+ * Pointer to the device table which is shared by all AMD IOMMUs
+ * it is indexed by the PCI device id or the HT unit id and contains
+ * information about the domain the device belongs to as well as the
+ * page table root pointer.
+ */
+struct dev_table_entry *amd_iommu_dev_table;
+/*
+ * Pointer to a device table which the content of old device table
+ * will be copied to. It's only be used in kdump kernel.
+ */
+static struct dev_table_entry *old_dev_tbl_cpy;
+
+/*
+ * The alias table is a driver specific data structure which contains the
+ * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
+ * More than one device can share the same requestor id.
+ */
+u16 *amd_iommu_alias_table;
+
+/*
+ * The rlookup table is used to find the IOMMU which is responsible
+ * for a specific device. It is also indexed by the PCI device id.
+ */
+struct amd_iommu **amd_iommu_rlookup_table;
+EXPORT_SYMBOL(amd_iommu_rlookup_table);
+
+/*
+ * This table is used to find the irq remapping table for a given device id
+ * quickly.
+ */
+struct irq_remap_table **irq_lookup_table;
+
+/*
+ * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap
+ * to know which ones are already in use.
+ */
+unsigned long *amd_iommu_pd_alloc_bitmap;
+
+static u32 dev_table_size;	/* size of the device table */
+static u32 alias_table_size;	/* size of the alias table */
+static u32 rlookup_table_size;	/* size if the rlookup table */
+
+enum iommu_init_state {
+	IOMMU_START_STATE,
+	IOMMU_IVRS_DETECTED,
+	IOMMU_ACPI_FINISHED,
+	IOMMU_ENABLED,
+	IOMMU_PCI_INIT,
+	IOMMU_INTERRUPTS_EN,
+	IOMMU_DMA_OPS,
+	IOMMU_INITIALIZED,
+	IOMMU_NOT_FOUND,
+	IOMMU_INIT_ERROR,
+	IOMMU_CMDLINE_DISABLED,
+};
+
+/* Early ioapic and hpet maps from kernel command line */
+#define EARLY_MAP_SIZE		4
+static struct devid_map __initdata early_ioapic_map[EARLY_MAP_SIZE];
+static struct devid_map __initdata early_hpet_map[EARLY_MAP_SIZE];
+static struct acpihid_map_entry __initdata early_acpihid_map[EARLY_MAP_SIZE];
+
+static int __initdata early_ioapic_map_size;
+static int __initdata early_hpet_map_size;
+static int __initdata early_acpihid_map_size;
+
+static bool __initdata cmdline_maps;
+
+static enum iommu_init_state init_state = IOMMU_START_STATE;
+
+static int amd_iommu_enable_interrupts(void);
+static int __init iommu_go_to_state(enum iommu_init_state state);
+static void init_device_table_dma(void);
+
+static bool amd_iommu_pre_enabled = true;
+
+bool translation_pre_enabled(struct amd_iommu *iommu)
+{
+	return (iommu->flags & AMD_IOMMU_FLAG_TRANS_PRE_ENABLED);
+}
+EXPORT_SYMBOL(translation_pre_enabled);
+
+static void clear_translation_pre_enabled(struct amd_iommu *iommu)
+{
+	iommu->flags &= ~AMD_IOMMU_FLAG_TRANS_PRE_ENABLED;
+}
+
+static void init_translation_status(struct amd_iommu *iommu)
+{
+	u64 ctrl;
+
+	ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+	if (ctrl & (1<<CONTROL_IOMMU_EN))
+		iommu->flags |= AMD_IOMMU_FLAG_TRANS_PRE_ENABLED;
+}
+
+static inline void update_last_devid(u16 devid)
+{
+	if (devid > amd_iommu_last_bdf)
+		amd_iommu_last_bdf = devid;
+}
+
+static inline unsigned long tbl_size(int entry_size)
+{
+	unsigned shift = PAGE_SHIFT +
+			 get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
+
+	return 1UL << shift;
+}
+
+int amd_iommu_get_num_iommus(void)
+{
+	return amd_iommus_present;
+}
+
+/* Access to l1 and l2 indexed register spaces */
+
+static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
+{
+	u32 val;
+
+	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+	pci_read_config_dword(iommu->dev, 0xfc, &val);
+	return val;
+}
+
+static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
+{
+	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
+	pci_write_config_dword(iommu->dev, 0xfc, val);
+	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+}
+
+static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
+{
+	u32 val;
+
+	pci_write_config_dword(iommu->dev, 0xf0, address);
+	pci_read_config_dword(iommu->dev, 0xf4, &val);
+	return val;
+}
+
+static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
+{
+	pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
+	pci_write_config_dword(iommu->dev, 0xf4, val);
+}
+
+/****************************************************************************
+ *
+ * AMD IOMMU MMIO register space handling functions
+ *
+ * These functions are used to program the IOMMU device registers in
+ * MMIO space required for that driver.
+ *
+ ****************************************************************************/
+
+/*
+ * This function set the exclusion range in the IOMMU. DMA accesses to the
+ * exclusion range are passed through untranslated
+ */
+static void iommu_set_exclusion_range(struct amd_iommu *iommu)
+{
+	u64 start = iommu->exclusion_start & PAGE_MASK;
+	u64 limit = (start + iommu->exclusion_length - 1) & PAGE_MASK;
+	u64 entry;
+
+	if (!iommu->exclusion_start)
+		return;
+
+	entry = start | MMIO_EXCL_ENABLE_MASK;
+	memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
+			&entry, sizeof(entry));
+
+	entry = limit;
+	memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
+			&entry, sizeof(entry));
+}
+
+/* Programs the physical address of the device table into the IOMMU hardware */
+static void iommu_set_device_table(struct amd_iommu *iommu)
+{
+	u64 entry;
+
+	BUG_ON(iommu->mmio_base == NULL);
+
+	entry = iommu_virt_to_phys(amd_iommu_dev_table);
+	entry |= (dev_table_size >> 12) - 1;
+	memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
+			&entry, sizeof(entry));
+}
+
+/* Generic functions to enable/disable certain features of the IOMMU. */
+static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
+{
+	u64 ctrl;
+
+	ctrl = readq(iommu->mmio_base +  MMIO_CONTROL_OFFSET);
+	ctrl |= (1ULL << bit);
+	writeq(ctrl, iommu->mmio_base +  MMIO_CONTROL_OFFSET);
+}
+
+static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
+{
+	u64 ctrl;
+
+	ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+	ctrl &= ~(1ULL << bit);
+	writeq(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+}
+
+static void iommu_set_inv_tlb_timeout(struct amd_iommu *iommu, int timeout)
+{
+	u64 ctrl;
+
+	ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+	ctrl &= ~CTRL_INV_TO_MASK;
+	ctrl |= (timeout << CONTROL_INV_TIMEOUT) & CTRL_INV_TO_MASK;
+	writeq(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+}
+
+/* Function to enable the hardware */
+static void iommu_enable(struct amd_iommu *iommu)
+{
+	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
+}
+
+static void iommu_disable(struct amd_iommu *iommu)
+{
+	if (!iommu->mmio_base)
+		return;
+
+	/* Disable command buffer */
+	iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
+
+	/* Disable event logging and event interrupts */
+	iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
+	iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
+
+	/* Disable IOMMU GA_LOG */
+	iommu_feature_disable(iommu, CONTROL_GALOG_EN);
+	iommu_feature_disable(iommu, CONTROL_GAINT_EN);
+
+	/* Disable IOMMU hardware itself */
+	iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
+}
+
+/*
+ * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
+ * the system has one.
+ */
+static u8 __iomem * __init iommu_map_mmio_space(u64 address, u64 end)
+{
+	if (!request_mem_region(address, end, "amd_iommu")) {
+		pr_err("AMD-Vi: Can not reserve memory region %llx-%llx for mmio\n",
+			address, end);
+		pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n");
+		return NULL;
+	}
+
+	return (u8 __iomem *)ioremap_nocache(address, end);
+}
+
+static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
+{
+	if (iommu->mmio_base)
+		iounmap(iommu->mmio_base);
+	release_mem_region(iommu->mmio_phys, iommu->mmio_phys_end);
+}
+
+static inline u32 get_ivhd_header_size(struct ivhd_header *h)
+{
+	u32 size = 0;
+
+	switch (h->type) {
+	case 0x10:
+		size = 24;
+		break;
+	case 0x11:
+	case 0x40:
+		size = 40;
+		break;
+	}
+	return size;
+}
+
+/****************************************************************************
+ *
+ * The functions below belong to the first pass of AMD IOMMU ACPI table
+ * parsing. In this pass we try to find out the highest device id this
+ * code has to handle. Upon this information the size of the shared data
+ * structures is determined later.
+ *
+ ****************************************************************************/
+
+/*
+ * This function calculates the length of a given IVHD entry
+ */
+static inline int ivhd_entry_length(u8 *ivhd)
+{
+	u32 type = ((struct ivhd_entry *)ivhd)->type;
+
+	if (type < 0x80) {
+		return 0x04 << (*ivhd >> 6);
+	} else if (type == IVHD_DEV_ACPI_HID) {
+		/* For ACPI_HID, offset 21 is uid len */
+		return *((u8 *)ivhd + 21) + 22;
+	}
+	return 0;
+}
+
+/*
+ * After reading the highest device id from the IOMMU PCI capability header
+ * this function looks if there is a higher device id defined in the ACPI table
+ */
+static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
+{
+	u8 *p = (void *)h, *end = (void *)h;
+	struct ivhd_entry *dev;
+
+	u32 ivhd_size = get_ivhd_header_size(h);
+
+	if (!ivhd_size) {
+		pr_err("AMD-Vi: Unsupported IVHD type %#x\n", h->type);
+		return -EINVAL;
+	}
+
+	p += ivhd_size;
+	end += h->length;
+
+	while (p < end) {
+		dev = (struct ivhd_entry *)p;
+		switch (dev->type) {
+		case IVHD_DEV_ALL:
+			/* Use maximum BDF value for DEV_ALL */
+			update_last_devid(0xffff);
+			break;
+		case IVHD_DEV_SELECT:
+		case IVHD_DEV_RANGE_END:
+		case IVHD_DEV_ALIAS:
+		case IVHD_DEV_EXT_SELECT:
+			/* all the above subfield types refer to device ids */
+			update_last_devid(dev->devid);
+			break;
+		default:
+			break;
+		}
+		p += ivhd_entry_length(p);
+	}
+
+	WARN_ON(p != end);
+
+	return 0;
+}
+
+static int __init check_ivrs_checksum(struct acpi_table_header *table)
+{
+	int i;
+	u8 checksum = 0, *p = (u8 *)table;
+
+	for (i = 0; i < table->length; ++i)
+		checksum += p[i];
+	if (checksum != 0) {
+		/* ACPI table corrupt */
+		pr_err(FW_BUG "AMD-Vi: IVRS invalid checksum\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+/*
+ * Iterate over all IVHD entries in the ACPI table and find the highest device
+ * id which we need to handle. This is the first of three functions which parse
+ * the ACPI table. So we check the checksum here.
+ */
+static int __init find_last_devid_acpi(struct acpi_table_header *table)
+{
+	u8 *p = (u8 *)table, *end = (u8 *)table;
+	struct ivhd_header *h;
+
+	p += IVRS_HEADER_LENGTH;
+
+	end += table->length;
+	while (p < end) {
+		h = (struct ivhd_header *)p;
+		if (h->type == amd_iommu_target_ivhd_type) {
+			int ret = find_last_devid_from_ivhd(h);
+
+			if (ret)
+				return ret;
+		}
+		p += h->length;
+	}
+	WARN_ON(p != end);
+
+	return 0;
+}
+
+/****************************************************************************
+ *
+ * The following functions belong to the code path which parses the ACPI table
+ * the second time. In this ACPI parsing iteration we allocate IOMMU specific
+ * data structures, initialize the device/alias/rlookup table and also
+ * basically initialize the hardware.
+ *
+ ****************************************************************************/
+
+/*
+ * Allocates the command buffer. This buffer is per AMD IOMMU. We can
+ * write commands to that buffer later and the IOMMU will execute them
+ * asynchronously
+ */
+static int __init alloc_command_buffer(struct amd_iommu *iommu)
+{
+	iommu->cmd_buf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+						  get_order(CMD_BUFFER_SIZE));
+
+	return iommu->cmd_buf ? 0 : -ENOMEM;
+}
+
+/*
+ * This function resets the command buffer if the IOMMU stopped fetching
+ * commands from it.
+ */
+void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
+{
+	iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
+
+	writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+	writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+	iommu->cmd_buf_head = 0;
+	iommu->cmd_buf_tail = 0;
+
+	iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
+}
+
+/*
+ * This function writes the command buffer address to the hardware and
+ * enables it.
+ */
+static void iommu_enable_command_buffer(struct amd_iommu *iommu)
+{
+	u64 entry;
+
+	BUG_ON(iommu->cmd_buf == NULL);
+
+	entry = iommu_virt_to_phys(iommu->cmd_buf);
+	entry |= MMIO_CMD_SIZE_512;
+
+	memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
+		    &entry, sizeof(entry));
+
+	amd_iommu_reset_cmd_buffer(iommu);
+}
+
+/*
+ * This function disables the command buffer
+ */
+static void iommu_disable_command_buffer(struct amd_iommu *iommu)
+{
+	iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
+}
+
+static void __init free_command_buffer(struct amd_iommu *iommu)
+{
+	free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE));
+}
+
+/* allocates the memory where the IOMMU will log its events to */
+static int __init alloc_event_buffer(struct amd_iommu *iommu)
+{
+	iommu->evt_buf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+						  get_order(EVT_BUFFER_SIZE));
+
+	return iommu->evt_buf ? 0 : -ENOMEM;
+}
+
+static void iommu_enable_event_buffer(struct amd_iommu *iommu)
+{
+	u64 entry;
+
+	BUG_ON(iommu->evt_buf == NULL);
+
+	entry = iommu_virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
+
+	memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
+		    &entry, sizeof(entry));
+
+	/* set head and tail to zero manually */
+	writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+	writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
+
+	iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
+}
+
+/*
+ * This function disables the event log buffer
+ */
+static void iommu_disable_event_buffer(struct amd_iommu *iommu)
+{
+	iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
+}
+
+static void __init free_event_buffer(struct amd_iommu *iommu)
+{
+	free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
+}
+
+/* allocates the memory where the IOMMU will log its events to */
+static int __init alloc_ppr_log(struct amd_iommu *iommu)
+{
+	iommu->ppr_log = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+						  get_order(PPR_LOG_SIZE));
+
+	return iommu->ppr_log ? 0 : -ENOMEM;
+}
+
+static void iommu_enable_ppr_log(struct amd_iommu *iommu)
+{
+	u64 entry;
+
+	if (iommu->ppr_log == NULL)
+		return;
+
+	entry = iommu_virt_to_phys(iommu->ppr_log) | PPR_LOG_SIZE_512;
+
+	memcpy_toio(iommu->mmio_base + MMIO_PPR_LOG_OFFSET,
+		    &entry, sizeof(entry));
+
+	/* set head and tail to zero manually */
+	writel(0x00, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
+	writel(0x00, iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
+
+	iommu_feature_enable(iommu, CONTROL_PPFLOG_EN);
+	iommu_feature_enable(iommu, CONTROL_PPR_EN);
+}
+
+static void __init free_ppr_log(struct amd_iommu *iommu)
+{
+	if (iommu->ppr_log == NULL)
+		return;
+
+	free_pages((unsigned long)iommu->ppr_log, get_order(PPR_LOG_SIZE));
+}
+
+static void free_ga_log(struct amd_iommu *iommu)
+{
+#ifdef CONFIG_IRQ_REMAP
+	if (iommu->ga_log)
+		free_pages((unsigned long)iommu->ga_log,
+			    get_order(GA_LOG_SIZE));
+	if (iommu->ga_log_tail)
+		free_pages((unsigned long)iommu->ga_log_tail,
+			    get_order(8));
+#endif
+}
+
+static int iommu_ga_log_enable(struct amd_iommu *iommu)
+{
+#ifdef CONFIG_IRQ_REMAP
+	u32 status, i;
+
+	if (!iommu->ga_log)
+		return -EINVAL;
+
+	status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+
+	/* Check if already running */
+	if (status & (MMIO_STATUS_GALOG_RUN_MASK))
+		return 0;
+
+	iommu_feature_enable(iommu, CONTROL_GAINT_EN);
+	iommu_feature_enable(iommu, CONTROL_GALOG_EN);
+
+	for (i = 0; i < LOOP_TIMEOUT; ++i) {
+		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+		if (status & (MMIO_STATUS_GALOG_RUN_MASK))
+			break;
+		udelay(10);
+	}
+
+	if (i >= LOOP_TIMEOUT)
+		return -EINVAL;
+#endif /* CONFIG_IRQ_REMAP */
+	return 0;
+}
+
+#ifdef CONFIG_IRQ_REMAP
+static int iommu_init_ga_log(struct amd_iommu *iommu)
+{
+	u64 entry;
+
+	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+		return 0;
+
+	iommu->ga_log = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+					get_order(GA_LOG_SIZE));
+	if (!iommu->ga_log)
+		goto err_out;
+
+	iommu->ga_log_tail = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+					get_order(8));
+	if (!iommu->ga_log_tail)
+		goto err_out;
+
+	entry = iommu_virt_to_phys(iommu->ga_log) | GA_LOG_SIZE_512;
+	memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_BASE_OFFSET,
+		    &entry, sizeof(entry));
+	entry = (iommu_virt_to_phys(iommu->ga_log_tail) &
+		 (BIT_ULL(52)-1)) & ~7ULL;
+	memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_TAIL_OFFSET,
+		    &entry, sizeof(entry));
+	writel(0x00, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
+	writel(0x00, iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
+
+	return 0;
+err_out:
+	free_ga_log(iommu);
+	return -EINVAL;
+}
+#endif /* CONFIG_IRQ_REMAP */
+
+static int iommu_init_ga(struct amd_iommu *iommu)
+{
+	int ret = 0;
+
+#ifdef CONFIG_IRQ_REMAP
+	/* Note: We have already checked GASup from IVRS table.
+	 *       Now, we need to make sure that GAMSup is set.
+	 */
+	if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
+	    !iommu_feature(iommu, FEATURE_GAM_VAPIC))
+		amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY_GA;
+
+	ret = iommu_init_ga_log(iommu);
+#endif /* CONFIG_IRQ_REMAP */
+
+	return ret;
+}
+
+static void iommu_enable_xt(struct amd_iommu *iommu)
+{
+#ifdef CONFIG_IRQ_REMAP
+	/*
+	 * XT mode (32-bit APIC destination ID) requires
+	 * GA mode (128-bit IRTE support) as a prerequisite.
+	 */
+	if (AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir) &&
+	    amd_iommu_xt_mode == IRQ_REMAP_X2APIC_MODE)
+		iommu_feature_enable(iommu, CONTROL_XT_EN);
+#endif /* CONFIG_IRQ_REMAP */
+}
+
+static void iommu_enable_gt(struct amd_iommu *iommu)
+{
+	if (!iommu_feature(iommu, FEATURE_GT))
+		return;
+
+	iommu_feature_enable(iommu, CONTROL_GT_EN);
+}
+
+/* sets a specific bit in the device table entry. */
+static void set_dev_entry_bit(u16 devid, u8 bit)
+{
+	int i = (bit >> 6) & 0x03;
+	int _bit = bit & 0x3f;
+
+	amd_iommu_dev_table[devid].data[i] |= (1UL << _bit);
+}
+
+static int get_dev_entry_bit(u16 devid, u8 bit)
+{
+	int i = (bit >> 6) & 0x03;
+	int _bit = bit & 0x3f;
+
+	return (amd_iommu_dev_table[devid].data[i] & (1UL << _bit)) >> _bit;
+}
+
+
+static bool copy_device_table(void)
+{
+	u64 int_ctl, int_tab_len, entry = 0, last_entry = 0;
+	struct dev_table_entry *old_devtb = NULL;
+	u32 lo, hi, devid, old_devtb_size;
+	phys_addr_t old_devtb_phys;
+	struct amd_iommu *iommu;
+	u16 dom_id, dte_v, irq_v;
+	gfp_t gfp_flag;
+	u64 tmp;
+
+	if (!amd_iommu_pre_enabled)
+		return false;
+
+	pr_warn("Translation is already enabled - trying to copy translation structures\n");
+	for_each_iommu(iommu) {
+		/* All IOMMUs should use the same device table with the same size */
+		lo = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET);
+		hi = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET + 4);
+		entry = (((u64) hi) << 32) + lo;
+		if (last_entry && last_entry != entry) {
+			pr_err("IOMMU:%d should use the same dev table as others!\n",
+				iommu->index);
+			return false;
+		}
+		last_entry = entry;
+
+		old_devtb_size = ((entry & ~PAGE_MASK) + 1) << 12;
+		if (old_devtb_size != dev_table_size) {
+			pr_err("The device table size of IOMMU:%d is not expected!\n",
+				iommu->index);
+			return false;
+		}
+	}
+
+	old_devtb_phys = entry & PAGE_MASK;
+	if (old_devtb_phys >= 0x100000000ULL) {
+		pr_err("The address of old device table is above 4G, not trustworthy!\n");
+		return false;
+	}
+	old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+	if (!old_devtb)
+		return false;
+
+	gfp_flag = GFP_KERNEL | __GFP_ZERO | GFP_DMA32;
+	old_dev_tbl_cpy = (void *)__get_free_pages(gfp_flag,
+				get_order(dev_table_size));
+	if (old_dev_tbl_cpy == NULL) {
+		pr_err("Failed to allocate memory for copying old device table!\n");
+		return false;
+	}
+
+	for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
+		old_dev_tbl_cpy[devid] = old_devtb[devid];
+		dom_id = old_devtb[devid].data[1] & DEV_DOMID_MASK;
+		dte_v = old_devtb[devid].data[0] & DTE_FLAG_V;
+
+		if (dte_v && dom_id) {
+			old_dev_tbl_cpy[devid].data[0] = old_devtb[devid].data[0];
+			old_dev_tbl_cpy[devid].data[1] = old_devtb[devid].data[1];
+			__set_bit(dom_id, amd_iommu_pd_alloc_bitmap);
+			/* If gcr3 table existed, mask it out */
+			if (old_devtb[devid].data[0] & DTE_FLAG_GV) {
+				tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
+				tmp |= DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
+				old_dev_tbl_cpy[devid].data[1] &= ~tmp;
+				tmp = DTE_GCR3_VAL_A(~0ULL) << DTE_GCR3_SHIFT_A;
+				tmp |= DTE_FLAG_GV;
+				old_dev_tbl_cpy[devid].data[0] &= ~tmp;
+			}
+		}
+
+		irq_v = old_devtb[devid].data[2] & DTE_IRQ_REMAP_ENABLE;
+		int_ctl = old_devtb[devid].data[2] & DTE_IRQ_REMAP_INTCTL_MASK;
+		int_tab_len = old_devtb[devid].data[2] & DTE_IRQ_TABLE_LEN_MASK;
+		if (irq_v && (int_ctl || int_tab_len)) {
+			if ((int_ctl != DTE_IRQ_REMAP_INTCTL) ||
+			    (int_tab_len != DTE_IRQ_TABLE_LEN)) {
+				pr_err("Wrong old irq remapping flag: %#x\n", devid);
+				return false;
+			}
+
+		        old_dev_tbl_cpy[devid].data[2] = old_devtb[devid].data[2];
+		}
+	}
+	memunmap(old_devtb);
+
+	return true;
+}
+
+void amd_iommu_apply_erratum_63(u16 devid)
+{
+	int sysmgt;
+
+	sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
+		 (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
+
+	if (sysmgt == 0x01)
+		set_dev_entry_bit(devid, DEV_ENTRY_IW);
+}
+
+/* Writes the specific IOMMU for a device into the rlookup table */
+static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
+{
+	amd_iommu_rlookup_table[devid] = iommu;
+}
+
+/*
+ * This function takes the device specific flags read from the ACPI
+ * table and sets up the device table entry with that information
+ */
+static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
+					   u16 devid, u32 flags, u32 ext_flags)
+{
+	if (flags & ACPI_DEVFLAG_INITPASS)
+		set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
+	if (flags & ACPI_DEVFLAG_EXTINT)
+		set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
+	if (flags & ACPI_DEVFLAG_NMI)
+		set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
+	if (flags & ACPI_DEVFLAG_SYSMGT1)
+		set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
+	if (flags & ACPI_DEVFLAG_SYSMGT2)
+		set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
+	if (flags & ACPI_DEVFLAG_LINT0)
+		set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
+	if (flags & ACPI_DEVFLAG_LINT1)
+		set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
+
+	amd_iommu_apply_erratum_63(devid);
+
+	set_iommu_for_device(iommu, devid);
+}
+
+int __init add_special_device(u8 type, u8 id, u16 *devid, bool cmd_line)
+{
+	struct devid_map *entry;
+	struct list_head *list;
+
+	if (type == IVHD_SPECIAL_IOAPIC)
+		list = &ioapic_map;
+	else if (type == IVHD_SPECIAL_HPET)
+		list = &hpet_map;
+	else
+		return -EINVAL;
+
+	list_for_each_entry(entry, list, list) {
+		if (!(entry->id == id && entry->cmd_line))
+			continue;
+
+		pr_info("AMD-Vi: Command-line override present for %s id %d - ignoring\n",
+			type == IVHD_SPECIAL_IOAPIC ? "IOAPIC" : "HPET", id);
+
+		*devid = entry->devid;
+
+		return 0;
+	}
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->id	= id;
+	entry->devid	= *devid;
+	entry->cmd_line	= cmd_line;
+
+	list_add_tail(&entry->list, list);
+
+	return 0;
+}
+
+static int __init add_acpi_hid_device(u8 *hid, u8 *uid, u16 *devid,
+				      bool cmd_line)
+{
+	struct acpihid_map_entry *entry;
+	struct list_head *list = &acpihid_map;
+
+	list_for_each_entry(entry, list, list) {
+		if (strcmp(entry->hid, hid) ||
+		    (*uid && *entry->uid && strcmp(entry->uid, uid)) ||
+		    !entry->cmd_line)
+			continue;
+
+		pr_info("AMD-Vi: Command-line override for hid:%s uid:%s\n",
+			hid, uid);
+		*devid = entry->devid;
+		return 0;
+	}
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	memcpy(entry->uid, uid, strlen(uid));
+	memcpy(entry->hid, hid, strlen(hid));
+	entry->devid = *devid;
+	entry->cmd_line	= cmd_line;
+	entry->root_devid = (entry->devid & (~0x7));
+
+	pr_info("AMD-Vi:%s, add hid:%s, uid:%s, rdevid:%d\n",
+		entry->cmd_line ? "cmd" : "ivrs",
+		entry->hid, entry->uid, entry->root_devid);
+
+	list_add_tail(&entry->list, list);
+	return 0;
+}
+
+static int __init add_early_maps(void)
+{
+	int i, ret;
+
+	for (i = 0; i < early_ioapic_map_size; ++i) {
+		ret = add_special_device(IVHD_SPECIAL_IOAPIC,
+					 early_ioapic_map[i].id,
+					 &early_ioapic_map[i].devid,
+					 early_ioapic_map[i].cmd_line);
+		if (ret)
+			return ret;
+	}
+
+	for (i = 0; i < early_hpet_map_size; ++i) {
+		ret = add_special_device(IVHD_SPECIAL_HPET,
+					 early_hpet_map[i].id,
+					 &early_hpet_map[i].devid,
+					 early_hpet_map[i].cmd_line);
+		if (ret)
+			return ret;
+	}
+
+	for (i = 0; i < early_acpihid_map_size; ++i) {
+		ret = add_acpi_hid_device(early_acpihid_map[i].hid,
+					  early_acpihid_map[i].uid,
+					  &early_acpihid_map[i].devid,
+					  early_acpihid_map[i].cmd_line);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Reads the device exclusion range from ACPI and initializes the IOMMU with
+ * it
+ */
+static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
+{
+	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+	if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
+		return;
+
+	if (iommu) {
+		/*
+		 * We only can configure exclusion ranges per IOMMU, not
+		 * per device. But we can enable the exclusion range per
+		 * device. This is done here
+		 */
+		set_dev_entry_bit(devid, DEV_ENTRY_EX);
+		iommu->exclusion_start = m->range_start;
+		iommu->exclusion_length = m->range_length;
+	}
+}
+
+/*
+ * Takes a pointer to an AMD IOMMU entry in the ACPI table and
+ * initializes the hardware and our data structures with it.
+ */
+static int __init init_iommu_from_acpi(struct amd_iommu *iommu,
+					struct ivhd_header *h)
+{
+	u8 *p = (u8 *)h;
+	u8 *end = p, flags = 0;
+	u16 devid = 0, devid_start = 0, devid_to = 0;
+	u32 dev_i, ext_flags = 0;
+	bool alias = false;
+	struct ivhd_entry *e;
+	u32 ivhd_size;
+	int ret;
+
+
+	ret = add_early_maps();
+	if (ret)
+		return ret;
+
+	amd_iommu_apply_ivrs_quirks();
+
+	/*
+	 * First save the recommended feature enable bits from ACPI
+	 */
+	iommu->acpi_flags = h->flags;
+
+	/*
+	 * Done. Now parse the device entries
+	 */
+	ivhd_size = get_ivhd_header_size(h);
+	if (!ivhd_size) {
+		pr_err("AMD-Vi: Unsupported IVHD type %#x\n", h->type);
+		return -EINVAL;
+	}
+
+	p += ivhd_size;
+
+	end += h->length;
+
+
+	while (p < end) {
+		e = (struct ivhd_entry *)p;
+		switch (e->type) {
+		case IVHD_DEV_ALL:
+
+			DUMP_printk("  DEV_ALL\t\t\tflags: %02x\n", e->flags);
+
+			for (dev_i = 0; dev_i <= amd_iommu_last_bdf; ++dev_i)
+				set_dev_entry_from_acpi(iommu, dev_i, e->flags, 0);
+			break;
+		case IVHD_DEV_SELECT:
+
+			DUMP_printk("  DEV_SELECT\t\t\t devid: %02x:%02x.%x "
+				    "flags: %02x\n",
+				    PCI_BUS_NUM(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags);
+
+			devid = e->devid;
+			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
+			break;
+		case IVHD_DEV_SELECT_RANGE_START:
+
+			DUMP_printk("  DEV_SELECT_RANGE_START\t "
+				    "devid: %02x:%02x.%x flags: %02x\n",
+				    PCI_BUS_NUM(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags);
+
+			devid_start = e->devid;
+			flags = e->flags;
+			ext_flags = 0;
+			alias = false;
+			break;
+		case IVHD_DEV_ALIAS:
+
+			DUMP_printk("  DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
+				    "flags: %02x devid_to: %02x:%02x.%x\n",
+				    PCI_BUS_NUM(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags,
+				    PCI_BUS_NUM(e->ext >> 8),
+				    PCI_SLOT(e->ext >> 8),
+				    PCI_FUNC(e->ext >> 8));
+
+			devid = e->devid;
+			devid_to = e->ext >> 8;
+			set_dev_entry_from_acpi(iommu, devid   , e->flags, 0);
+			set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
+			amd_iommu_alias_table[devid] = devid_to;
+			break;
+		case IVHD_DEV_ALIAS_RANGE:
+
+			DUMP_printk("  DEV_ALIAS_RANGE\t\t "
+				    "devid: %02x:%02x.%x flags: %02x "
+				    "devid_to: %02x:%02x.%x\n",
+				    PCI_BUS_NUM(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags,
+				    PCI_BUS_NUM(e->ext >> 8),
+				    PCI_SLOT(e->ext >> 8),
+				    PCI_FUNC(e->ext >> 8));
+
+			devid_start = e->devid;
+			flags = e->flags;
+			devid_to = e->ext >> 8;
+			ext_flags = 0;
+			alias = true;
+			break;
+		case IVHD_DEV_EXT_SELECT:
+
+			DUMP_printk("  DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
+				    "flags: %02x ext: %08x\n",
+				    PCI_BUS_NUM(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags, e->ext);
+
+			devid = e->devid;
+			set_dev_entry_from_acpi(iommu, devid, e->flags,
+						e->ext);
+			break;
+		case IVHD_DEV_EXT_SELECT_RANGE:
+
+			DUMP_printk("  DEV_EXT_SELECT_RANGE\t devid: "
+				    "%02x:%02x.%x flags: %02x ext: %08x\n",
+				    PCI_BUS_NUM(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags, e->ext);
+
+			devid_start = e->devid;
+			flags = e->flags;
+			ext_flags = e->ext;
+			alias = false;
+			break;
+		case IVHD_DEV_RANGE_END:
+
+			DUMP_printk("  DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
+				    PCI_BUS_NUM(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid));
+
+			devid = e->devid;
+			for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
+				if (alias) {
+					amd_iommu_alias_table[dev_i] = devid_to;
+					set_dev_entry_from_acpi(iommu,
+						devid_to, flags, ext_flags);
+				}
+				set_dev_entry_from_acpi(iommu, dev_i,
+							flags, ext_flags);
+			}
+			break;
+		case IVHD_DEV_SPECIAL: {
+			u8 handle, type;
+			const char *var;
+			u16 devid;
+			int ret;
+
+			handle = e->ext & 0xff;
+			devid  = (e->ext >>  8) & 0xffff;
+			type   = (e->ext >> 24) & 0xff;
+
+			if (type == IVHD_SPECIAL_IOAPIC)
+				var = "IOAPIC";
+			else if (type == IVHD_SPECIAL_HPET)
+				var = "HPET";
+			else
+				var = "UNKNOWN";
+
+			DUMP_printk("  DEV_SPECIAL(%s[%d])\t\tdevid: %02x:%02x.%x\n",
+				    var, (int)handle,
+				    PCI_BUS_NUM(devid),
+				    PCI_SLOT(devid),
+				    PCI_FUNC(devid));
+
+			ret = add_special_device(type, handle, &devid, false);
+			if (ret)
+				return ret;
+
+			/*
+			 * add_special_device might update the devid in case a
+			 * command-line override is present. So call
+			 * set_dev_entry_from_acpi after add_special_device.
+			 */
+			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
+
+			break;
+		}
+		case IVHD_DEV_ACPI_HID: {
+			u16 devid;
+			u8 hid[ACPIHID_HID_LEN];
+			u8 uid[ACPIHID_UID_LEN];
+			int ret;
+
+			if (h->type != 0x40) {
+				pr_err(FW_BUG "Invalid IVHD device type %#x\n",
+				       e->type);
+				break;
+			}
+
+			memcpy(hid, (u8 *)(&e->ext), ACPIHID_HID_LEN - 1);
+			hid[ACPIHID_HID_LEN - 1] = '\0';
+
+			if (!(*hid)) {
+				pr_err(FW_BUG "Invalid HID.\n");
+				break;
+			}
+
+			uid[0] = '\0';
+			switch (e->uidf) {
+			case UID_NOT_PRESENT:
+
+				if (e->uidl != 0)
+					pr_warn(FW_BUG "Invalid UID length.\n");
+
+				break;
+			case UID_IS_INTEGER:
+
+				sprintf(uid, "%d", e->uid);
+
+				break;
+			case UID_IS_CHARACTER:
+
+				memcpy(uid, &e->uid, e->uidl);
+				uid[e->uidl] = '\0';
+
+				break;
+			default:
+				break;
+			}
+
+			devid = e->devid;
+			DUMP_printk("  DEV_ACPI_HID(%s[%s])\t\tdevid: %02x:%02x.%x\n",
+				    hid, uid,
+				    PCI_BUS_NUM(devid),
+				    PCI_SLOT(devid),
+				    PCI_FUNC(devid));
+
+			flags = e->flags;
+
+			ret = add_acpi_hid_device(hid, uid, &devid, false);
+			if (ret)
+				return ret;
+
+			/*
+			 * add_special_device might update the devid in case a
+			 * command-line override is present. So call
+			 * set_dev_entry_from_acpi after add_special_device.
+			 */
+			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
+
+			break;
+		}
+		default:
+			break;
+		}
+
+		p += ivhd_entry_length(p);
+	}
+
+	return 0;
+}
+
+static void __init free_iommu_one(struct amd_iommu *iommu)
+{
+	free_command_buffer(iommu);
+	free_event_buffer(iommu);
+	free_ppr_log(iommu);
+	free_ga_log(iommu);
+	iommu_unmap_mmio_space(iommu);
+}
+
+static void __init free_iommu_all(void)
+{
+	struct amd_iommu *iommu, *next;
+
+	for_each_iommu_safe(iommu, next) {
+		list_del(&iommu->list);
+		free_iommu_one(iommu);
+		kfree(iommu);
+	}
+}
+
+/*
+ * Family15h Model 10h-1fh erratum 746 (IOMMU Logging May Stall Translations)
+ * Workaround:
+ *     BIOS should disable L2B micellaneous clock gating by setting
+ *     L2_L2B_CK_GATE_CONTROL[CKGateL2BMiscDisable](D0F2xF4_x90[2]) = 1b
+ */
+static void amd_iommu_erratum_746_workaround(struct amd_iommu *iommu)
+{
+	u32 value;
+
+	if ((boot_cpu_data.x86 != 0x15) ||
+	    (boot_cpu_data.x86_model < 0x10) ||
+	    (boot_cpu_data.x86_model > 0x1f))
+		return;
+
+	pci_write_config_dword(iommu->dev, 0xf0, 0x90);
+	pci_read_config_dword(iommu->dev, 0xf4, &value);
+
+	if (value & BIT(2))
+		return;
+
+	/* Select NB indirect register 0x90 and enable writing */
+	pci_write_config_dword(iommu->dev, 0xf0, 0x90 | (1 << 8));
+
+	pci_write_config_dword(iommu->dev, 0xf4, value | 0x4);
+	pr_info("AMD-Vi: Applying erratum 746 workaround for IOMMU at %s\n",
+		dev_name(&iommu->dev->dev));
+
+	/* Clear the enable writing bit */
+	pci_write_config_dword(iommu->dev, 0xf0, 0x90);
+}
+
+/*
+ * Family15h Model 30h-3fh (IOMMU Mishandles ATS Write Permission)
+ * Workaround:
+ *     BIOS should enable ATS write permission check by setting
+ *     L2_DEBUG_3[AtsIgnoreIWDis](D0F2xF4_x47[0]) = 1b
+ */
+static void amd_iommu_ats_write_check_workaround(struct amd_iommu *iommu)
+{
+	u32 value;
+
+	if ((boot_cpu_data.x86 != 0x15) ||
+	    (boot_cpu_data.x86_model < 0x30) ||
+	    (boot_cpu_data.x86_model > 0x3f))
+		return;
+
+	/* Test L2_DEBUG_3[AtsIgnoreIWDis] == 1 */
+	value = iommu_read_l2(iommu, 0x47);
+
+	if (value & BIT(0))
+		return;
+
+	/* Set L2_DEBUG_3[AtsIgnoreIWDis] = 1 */
+	iommu_write_l2(iommu, 0x47, value | BIT(0));
+
+	pr_info("AMD-Vi: Applying ATS write check workaround for IOMMU at %s\n",
+		dev_name(&iommu->dev->dev));
+}
+
+/*
+ * This function clues the initialization function for one IOMMU
+ * together and also allocates the command buffer and programs the
+ * hardware. It does NOT enable the IOMMU. This is done afterwards.
+ */
+static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
+{
+	int ret;
+
+	raw_spin_lock_init(&iommu->lock);
+
+	/* Add IOMMU to internal data structures */
+	list_add_tail(&iommu->list, &amd_iommu_list);
+	iommu->index = amd_iommus_present++;
+
+	if (unlikely(iommu->index >= MAX_IOMMUS)) {
+		WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
+		return -ENOSYS;
+	}
+
+	/* Index is fine - add IOMMU to the array */
+	amd_iommus[iommu->index] = iommu;
+
+	/*
+	 * Copy data from ACPI table entry to the iommu struct
+	 */
+	iommu->devid   = h->devid;
+	iommu->cap_ptr = h->cap_ptr;
+	iommu->pci_seg = h->pci_seg;
+	iommu->mmio_phys = h->mmio_phys;
+
+	switch (h->type) {
+	case 0x10:
+		/* Check if IVHD EFR contains proper max banks/counters */
+		if ((h->efr_attr != 0) &&
+		    ((h->efr_attr & (0xF << 13)) != 0) &&
+		    ((h->efr_attr & (0x3F << 17)) != 0))
+			iommu->mmio_phys_end = MMIO_REG_END_OFFSET;
+		else
+			iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET;
+		if (((h->efr_attr & (0x1 << IOMMU_FEAT_GASUP_SHIFT)) == 0))
+			amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY;
+		if (((h->efr_attr & (0x1 << IOMMU_FEAT_XTSUP_SHIFT)) == 0))
+			amd_iommu_xt_mode = IRQ_REMAP_XAPIC_MODE;
+		break;
+	case 0x11:
+	case 0x40:
+		if (h->efr_reg & (1 << 9))
+			iommu->mmio_phys_end = MMIO_REG_END_OFFSET;
+		else
+			iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET;
+		if (((h->efr_reg & (0x1 << IOMMU_EFR_GASUP_SHIFT)) == 0))
+			amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY;
+		if (((h->efr_reg & (0x1 << IOMMU_EFR_XTSUP_SHIFT)) == 0))
+			amd_iommu_xt_mode = IRQ_REMAP_XAPIC_MODE;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	iommu->mmio_base = iommu_map_mmio_space(iommu->mmio_phys,
+						iommu->mmio_phys_end);
+	if (!iommu->mmio_base)
+		return -ENOMEM;
+
+	if (alloc_command_buffer(iommu))
+		return -ENOMEM;
+
+	if (alloc_event_buffer(iommu))
+		return -ENOMEM;
+
+	iommu->int_enabled = false;
+
+	init_translation_status(iommu);
+	if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
+		iommu_disable(iommu);
+		clear_translation_pre_enabled(iommu);
+		pr_warn("Translation was enabled for IOMMU:%d but we are not in kdump mode\n",
+			iommu->index);
+	}
+	if (amd_iommu_pre_enabled)
+		amd_iommu_pre_enabled = translation_pre_enabled(iommu);
+
+	ret = init_iommu_from_acpi(iommu, h);
+	if (ret)
+		return ret;
+
+	ret = amd_iommu_create_irq_domain(iommu);
+	if (ret)
+		return ret;
+
+	/*
+	 * Make sure IOMMU is not considered to translate itself. The IVRS
+	 * table tells us so, but this is a lie!
+	 */
+	amd_iommu_rlookup_table[iommu->devid] = NULL;
+
+	return 0;
+}
+
+/**
+ * get_highest_supported_ivhd_type - Look up the appropriate IVHD type
+ * @ivrs          Pointer to the IVRS header
+ *
+ * This function search through all IVDB of the maximum supported IVHD
+ */
+static u8 get_highest_supported_ivhd_type(struct acpi_table_header *ivrs)
+{
+	u8 *base = (u8 *)ivrs;
+	struct ivhd_header *ivhd = (struct ivhd_header *)
+					(base + IVRS_HEADER_LENGTH);
+	u8 last_type = ivhd->type;
+	u16 devid = ivhd->devid;
+
+	while (((u8 *)ivhd - base < ivrs->length) &&
+	       (ivhd->type <= ACPI_IVHD_TYPE_MAX_SUPPORTED)) {
+		u8 *p = (u8 *) ivhd;
+
+		if (ivhd->devid == devid)
+			last_type = ivhd->type;
+		ivhd = (struct ivhd_header *)(p + ivhd->length);
+	}
+
+	return last_type;
+}
+
+/*
+ * Iterates over all IOMMU entries in the ACPI table, allocates the
+ * IOMMU structure and initializes it with init_iommu_one()
+ */
+static int __init init_iommu_all(struct acpi_table_header *table)
+{
+	u8 *p = (u8 *)table, *end = (u8 *)table;
+	struct ivhd_header *h;
+	struct amd_iommu *iommu;
+	int ret;
+
+	end += table->length;
+	p += IVRS_HEADER_LENGTH;
+
+	while (p < end) {
+		h = (struct ivhd_header *)p;
+		if (*p == amd_iommu_target_ivhd_type) {
+
+			DUMP_printk("device: %02x:%02x.%01x cap: %04x "
+				    "seg: %d flags: %01x info %04x\n",
+				    PCI_BUS_NUM(h->devid), PCI_SLOT(h->devid),
+				    PCI_FUNC(h->devid), h->cap_ptr,
+				    h->pci_seg, h->flags, h->info);
+			DUMP_printk("       mmio-addr: %016llx\n",
+				    h->mmio_phys);
+
+			iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
+			if (iommu == NULL)
+				return -ENOMEM;
+
+			ret = init_iommu_one(iommu, h);
+			if (ret)
+				return ret;
+		}
+		p += h->length;
+
+	}
+	WARN_ON(p != end);
+
+	return 0;
+}
+
+static int iommu_pc_get_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
+				u8 fxn, u64 *value, bool is_write);
+
+static void init_iommu_perf_ctr(struct amd_iommu *iommu)
+{
+	u64 val = 0xabcd, val2 = 0;
+
+	if (!iommu_feature(iommu, FEATURE_PC))
+		return;
+
+	amd_iommu_pc_present = true;
+
+	/* Check if the performance counters can be written to */
+	if ((iommu_pc_get_set_reg(iommu, 0, 0, 0, &val, true)) ||
+	    (iommu_pc_get_set_reg(iommu, 0, 0, 0, &val2, false)) ||
+	    (val != val2)) {
+		pr_err("AMD-Vi: Unable to write to IOMMU perf counter.\n");
+		amd_iommu_pc_present = false;
+		return;
+	}
+
+	pr_info("AMD-Vi: IOMMU performance counters supported\n");
+
+	val = readl(iommu->mmio_base + MMIO_CNTR_CONF_OFFSET);
+	iommu->max_banks = (u8) ((val >> 12) & 0x3f);
+	iommu->max_counters = (u8) ((val >> 7) & 0xf);
+}
+
+static ssize_t amd_iommu_show_cap(struct device *dev,
+				  struct device_attribute *attr,
+				  char *buf)
+{
+	struct amd_iommu *iommu = dev_to_amd_iommu(dev);
+	return sprintf(buf, "%x\n", iommu->cap);
+}
+static DEVICE_ATTR(cap, S_IRUGO, amd_iommu_show_cap, NULL);
+
+static ssize_t amd_iommu_show_features(struct device *dev,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	struct amd_iommu *iommu = dev_to_amd_iommu(dev);
+	return sprintf(buf, "%llx\n", iommu->features);
+}
+static DEVICE_ATTR(features, S_IRUGO, amd_iommu_show_features, NULL);
+
+static struct attribute *amd_iommu_attrs[] = {
+	&dev_attr_cap.attr,
+	&dev_attr_features.attr,
+	NULL,
+};
+
+static struct attribute_group amd_iommu_group = {
+	.name = "amd-iommu",
+	.attrs = amd_iommu_attrs,
+};
+
+static const struct attribute_group *amd_iommu_groups[] = {
+	&amd_iommu_group,
+	NULL,
+};
+
+static int __init iommu_init_pci(struct amd_iommu *iommu)
+{
+	int cap_ptr = iommu->cap_ptr;
+	u32 range, misc, low, high;
+	int ret;
+
+	iommu->dev = pci_get_domain_bus_and_slot(0, PCI_BUS_NUM(iommu->devid),
+						 iommu->devid & 0xff);
+	if (!iommu->dev)
+		return -ENODEV;
+
+	/* Prevent binding other PCI device drivers to IOMMU devices */
+	iommu->dev->match_driver = false;
+
+	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
+			      &iommu->cap);
+	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
+			      &range);
+	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
+			      &misc);
+
+	if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB)))
+		amd_iommu_iotlb_sup = false;
+
+	/* read extended feature bits */
+	low  = readl(iommu->mmio_base + MMIO_EXT_FEATURES);
+	high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4);
+
+	iommu->features = ((u64)high << 32) | low;
+
+	if (iommu_feature(iommu, FEATURE_GT)) {
+		int glxval;
+		u32 max_pasid;
+		u64 pasmax;
+
+		pasmax = iommu->features & FEATURE_PASID_MASK;
+		pasmax >>= FEATURE_PASID_SHIFT;
+		max_pasid  = (1 << (pasmax + 1)) - 1;
+
+		amd_iommu_max_pasid = min(amd_iommu_max_pasid, max_pasid);
+
+		BUG_ON(amd_iommu_max_pasid & ~PASID_MASK);
+
+		glxval   = iommu->features & FEATURE_GLXVAL_MASK;
+		glxval >>= FEATURE_GLXVAL_SHIFT;
+
+		if (amd_iommu_max_glx_val == -1)
+			amd_iommu_max_glx_val = glxval;
+		else
+			amd_iommu_max_glx_val = min(amd_iommu_max_glx_val, glxval);
+	}
+
+	if (iommu_feature(iommu, FEATURE_GT) &&
+	    iommu_feature(iommu, FEATURE_PPR)) {
+		iommu->is_iommu_v2   = true;
+		amd_iommu_v2_present = true;
+	}
+
+	if (iommu_feature(iommu, FEATURE_PPR) && alloc_ppr_log(iommu))
+		return -ENOMEM;
+
+	ret = iommu_init_ga(iommu);
+	if (ret)
+		return ret;
+
+	if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
+		amd_iommu_np_cache = true;
+
+	init_iommu_perf_ctr(iommu);
+
+	if (is_rd890_iommu(iommu->dev)) {
+		int i, j;
+
+		iommu->root_pdev =
+			pci_get_domain_bus_and_slot(0, iommu->dev->bus->number,
+						    PCI_DEVFN(0, 0));
+
+		/*
+		 * Some rd890 systems may not be fully reconfigured by the
+		 * BIOS, so it's necessary for us to store this information so
+		 * it can be reprogrammed on resume
+		 */
+		pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
+				&iommu->stored_addr_lo);
+		pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
+				&iommu->stored_addr_hi);
+
+		/* Low bit locks writes to configuration space */
+		iommu->stored_addr_lo &= ~1;
+
+		for (i = 0; i < 6; i++)
+			for (j = 0; j < 0x12; j++)
+				iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
+
+		for (i = 0; i < 0x83; i++)
+			iommu->stored_l2[i] = iommu_read_l2(iommu, i);
+	}
+
+	amd_iommu_erratum_746_workaround(iommu);
+	amd_iommu_ats_write_check_workaround(iommu);
+
+	iommu_device_sysfs_add(&iommu->iommu, &iommu->dev->dev,
+			       amd_iommu_groups, "ivhd%d", iommu->index);
+	iommu_device_set_ops(&iommu->iommu, &amd_iommu_ops);
+	iommu_device_register(&iommu->iommu);
+
+	return pci_enable_device(iommu->dev);
+}
+
+static void print_iommu_info(void)
+{
+	static const char * const feat_str[] = {
+		"PreF", "PPR", "X2APIC", "NX", "GT", "[5]",
+		"IA", "GA", "HE", "PC"
+	};
+	struct amd_iommu *iommu;
+
+	for_each_iommu(iommu) {
+		int i;
+
+		pr_info("AMD-Vi: Found IOMMU at %s cap 0x%hx\n",
+			dev_name(&iommu->dev->dev), iommu->cap_ptr);
+
+		if (iommu->cap & (1 << IOMMU_CAP_EFR)) {
+			pr_info("AMD-Vi: Extended features (%#llx):\n",
+				iommu->features);
+			for (i = 0; i < ARRAY_SIZE(feat_str); ++i) {
+				if (iommu_feature(iommu, (1ULL << i)))
+					pr_cont(" %s", feat_str[i]);
+			}
+
+			if (iommu->features & FEATURE_GAM_VAPIC)
+				pr_cont(" GA_vAPIC");
+
+			pr_cont("\n");
+		}
+	}
+	if (irq_remapping_enabled) {
+		pr_info("AMD-Vi: Interrupt remapping enabled\n");
+		if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+			pr_info("AMD-Vi: virtual APIC enabled\n");
+		if (amd_iommu_xt_mode == IRQ_REMAP_X2APIC_MODE)
+			pr_info("AMD-Vi: X2APIC enabled\n");
+	}
+}
+
+static int __init amd_iommu_init_pci(void)
+{
+	struct amd_iommu *iommu;
+	int ret = 0;
+
+	for_each_iommu(iommu) {
+		ret = iommu_init_pci(iommu);
+		if (ret)
+			break;
+	}
+
+	/*
+	 * Order is important here to make sure any unity map requirements are
+	 * fulfilled. The unity mappings are created and written to the device
+	 * table during the amd_iommu_init_api() call.
+	 *
+	 * After that we call init_device_table_dma() to make sure any
+	 * uninitialized DTE will block DMA, and in the end we flush the caches
+	 * of all IOMMUs to make sure the changes to the device table are
+	 * active.
+	 */
+	ret = amd_iommu_init_api();
+
+	init_device_table_dma();
+
+	for_each_iommu(iommu)
+		iommu_flush_all_caches(iommu);
+
+	if (!ret)
+		print_iommu_info();
+
+	return ret;
+}
+
+/****************************************************************************
+ *
+ * The following functions initialize the MSI interrupts for all IOMMUs
+ * in the system. It's a bit challenging because there could be multiple
+ * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
+ * pci_dev.
+ *
+ ****************************************************************************/
+
+static int iommu_setup_msi(struct amd_iommu *iommu)
+{
+	int r;
+
+	r = pci_enable_msi(iommu->dev);
+	if (r)
+		return r;
+
+	r = request_threaded_irq(iommu->dev->irq,
+				 amd_iommu_int_handler,
+				 amd_iommu_int_thread,
+				 0, "AMD-Vi",
+				 iommu);
+
+	if (r) {
+		pci_disable_msi(iommu->dev);
+		return r;
+	}
+
+	iommu->int_enabled = true;
+
+	return 0;
+}
+
+static int iommu_init_msi(struct amd_iommu *iommu)
+{
+	int ret;
+
+	if (iommu->int_enabled)
+		goto enable_faults;
+
+	if (iommu->dev->msi_cap)
+		ret = iommu_setup_msi(iommu);
+	else
+		ret = -ENODEV;
+
+	if (ret)
+		return ret;
+
+enable_faults:
+	iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+
+	if (iommu->ppr_log != NULL)
+		iommu_feature_enable(iommu, CONTROL_PPFINT_EN);
+
+	iommu_ga_log_enable(iommu);
+
+	return 0;
+}
+
+/****************************************************************************
+ *
+ * The next functions belong to the third pass of parsing the ACPI
+ * table. In this last pass the memory mapping requirements are
+ * gathered (like exclusion and unity mapping ranges).
+ *
+ ****************************************************************************/
+
+static void __init free_unity_maps(void)
+{
+	struct unity_map_entry *entry, *next;
+
+	list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+}
+
+/* called when we find an exclusion range definition in ACPI */
+static int __init init_exclusion_range(struct ivmd_header *m)
+{
+	int i;
+
+	switch (m->type) {
+	case ACPI_IVMD_TYPE:
+		set_device_exclusion_range(m->devid, m);
+		break;
+	case ACPI_IVMD_TYPE_ALL:
+		for (i = 0; i <= amd_iommu_last_bdf; ++i)
+			set_device_exclusion_range(i, m);
+		break;
+	case ACPI_IVMD_TYPE_RANGE:
+		for (i = m->devid; i <= m->aux; ++i)
+			set_device_exclusion_range(i, m);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/* called for unity map ACPI definition */
+static int __init init_unity_map_range(struct ivmd_header *m)
+{
+	struct unity_map_entry *e = NULL;
+	char *s;
+
+	e = kzalloc(sizeof(*e), GFP_KERNEL);
+	if (e == NULL)
+		return -ENOMEM;
+
+	if (m->flags & IVMD_FLAG_EXCL_RANGE)
+		init_exclusion_range(m);
+
+	switch (m->type) {
+	default:
+		kfree(e);
+		return 0;
+	case ACPI_IVMD_TYPE:
+		s = "IVMD_TYPEi\t\t\t";
+		e->devid_start = e->devid_end = m->devid;
+		break;
+	case ACPI_IVMD_TYPE_ALL:
+		s = "IVMD_TYPE_ALL\t\t";
+		e->devid_start = 0;
+		e->devid_end = amd_iommu_last_bdf;
+		break;
+	case ACPI_IVMD_TYPE_RANGE:
+		s = "IVMD_TYPE_RANGE\t\t";
+		e->devid_start = m->devid;
+		e->devid_end = m->aux;
+		break;
+	}
+	e->address_start = PAGE_ALIGN(m->range_start);
+	e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
+	e->prot = m->flags >> 1;
+
+	DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
+		    " range_start: %016llx range_end: %016llx flags: %x\n", s,
+		    PCI_BUS_NUM(e->devid_start), PCI_SLOT(e->devid_start),
+		    PCI_FUNC(e->devid_start), PCI_BUS_NUM(e->devid_end),
+		    PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
+		    e->address_start, e->address_end, m->flags);
+
+	list_add_tail(&e->list, &amd_iommu_unity_map);
+
+	return 0;
+}
+
+/* iterates over all memory definitions we find in the ACPI table */
+static int __init init_memory_definitions(struct acpi_table_header *table)
+{
+	u8 *p = (u8 *)table, *end = (u8 *)table;
+	struct ivmd_header *m;
+
+	end += table->length;
+	p += IVRS_HEADER_LENGTH;
+
+	while (p < end) {
+		m = (struct ivmd_header *)p;
+		if (m->flags & (IVMD_FLAG_UNITY_MAP | IVMD_FLAG_EXCL_RANGE))
+			init_unity_map_range(m);
+
+		p += m->length;
+	}
+
+	return 0;
+}
+
+/*
+ * Init the device table to not allow DMA access for devices
+ */
+static void init_device_table_dma(void)
+{
+	u32 devid;
+
+	for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
+		set_dev_entry_bit(devid, DEV_ENTRY_VALID);
+		set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
+	}
+}
+
+static void __init uninit_device_table_dma(void)
+{
+	u32 devid;
+
+	for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
+		amd_iommu_dev_table[devid].data[0] = 0ULL;
+		amd_iommu_dev_table[devid].data[1] = 0ULL;
+	}
+}
+
+static void init_device_table(void)
+{
+	u32 devid;
+
+	if (!amd_iommu_irq_remap)
+		return;
+
+	for (devid = 0; devid <= amd_iommu_last_bdf; ++devid)
+		set_dev_entry_bit(devid, DEV_ENTRY_IRQ_TBL_EN);
+}
+
+static void iommu_init_flags(struct amd_iommu *iommu)
+{
+	iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ?
+		iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
+		iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
+
+	iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ?
+		iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
+		iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
+
+	iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
+		iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
+		iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
+
+	iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ?
+		iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
+		iommu_feature_disable(iommu, CONTROL_ISOC_EN);
+
+	/*
+	 * make IOMMU memory accesses cache coherent
+	 */
+	iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
+
+	/* Set IOTLB invalidation timeout to 1s */
+	iommu_set_inv_tlb_timeout(iommu, CTRL_INV_TO_1S);
+}
+
+static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
+{
+	int i, j;
+	u32 ioc_feature_control;
+	struct pci_dev *pdev = iommu->root_pdev;
+
+	/* RD890 BIOSes may not have completely reconfigured the iommu */
+	if (!is_rd890_iommu(iommu->dev) || !pdev)
+		return;
+
+	/*
+	 * First, we need to ensure that the iommu is enabled. This is
+	 * controlled by a register in the northbridge
+	 */
+
+	/* Select Northbridge indirect register 0x75 and enable writing */
+	pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
+	pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
+
+	/* Enable the iommu */
+	if (!(ioc_feature_control & 0x1))
+		pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
+
+	/* Restore the iommu BAR */
+	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+			       iommu->stored_addr_lo);
+	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
+			       iommu->stored_addr_hi);
+
+	/* Restore the l1 indirect regs for each of the 6 l1s */
+	for (i = 0; i < 6; i++)
+		for (j = 0; j < 0x12; j++)
+			iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
+
+	/* Restore the l2 indirect regs */
+	for (i = 0; i < 0x83; i++)
+		iommu_write_l2(iommu, i, iommu->stored_l2[i]);
+
+	/* Lock PCI setup registers */
+	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+			       iommu->stored_addr_lo | 1);
+}
+
+static void iommu_enable_ga(struct amd_iommu *iommu)
+{
+#ifdef CONFIG_IRQ_REMAP
+	switch (amd_iommu_guest_ir) {
+	case AMD_IOMMU_GUEST_IR_VAPIC:
+		iommu_feature_enable(iommu, CONTROL_GAM_EN);
+		/* Fall through */
+	case AMD_IOMMU_GUEST_IR_LEGACY_GA:
+		iommu_feature_enable(iommu, CONTROL_GA_EN);
+		iommu->irte_ops = &irte_128_ops;
+		break;
+	default:
+		iommu->irte_ops = &irte_32_ops;
+		break;
+	}
+#endif
+}
+
+static void early_enable_iommu(struct amd_iommu *iommu)
+{
+	iommu_disable(iommu);
+	iommu_init_flags(iommu);
+	iommu_set_device_table(iommu);
+	iommu_enable_command_buffer(iommu);
+	iommu_enable_event_buffer(iommu);
+	iommu_set_exclusion_range(iommu);
+	iommu_enable_ga(iommu);
+	iommu_enable_xt(iommu);
+	iommu_enable(iommu);
+	iommu_flush_all_caches(iommu);
+}
+
+/*
+ * This function finally enables all IOMMUs found in the system after
+ * they have been initialized.
+ *
+ * Or if in kdump kernel and IOMMUs are all pre-enabled, try to copy
+ * the old content of device table entries. Not this case or copy failed,
+ * just continue as normal kernel does.
+ */
+static void early_enable_iommus(void)
+{
+	struct amd_iommu *iommu;
+
+
+	if (!copy_device_table()) {
+		/*
+		 * If come here because of failure in copying device table from old
+		 * kernel with all IOMMUs enabled, print error message and try to
+		 * free allocated old_dev_tbl_cpy.
+		 */
+		if (amd_iommu_pre_enabled)
+			pr_err("Failed to copy DEV table from previous kernel.\n");
+		if (old_dev_tbl_cpy != NULL)
+			free_pages((unsigned long)old_dev_tbl_cpy,
+					get_order(dev_table_size));
+
+		for_each_iommu(iommu) {
+			clear_translation_pre_enabled(iommu);
+			early_enable_iommu(iommu);
+		}
+	} else {
+		pr_info("Copied DEV table from previous kernel.\n");
+		free_pages((unsigned long)amd_iommu_dev_table,
+				get_order(dev_table_size));
+		amd_iommu_dev_table = old_dev_tbl_cpy;
+		for_each_iommu(iommu) {
+			iommu_disable_command_buffer(iommu);
+			iommu_disable_event_buffer(iommu);
+			iommu_enable_command_buffer(iommu);
+			iommu_enable_event_buffer(iommu);
+			iommu_enable_ga(iommu);
+			iommu_enable_xt(iommu);
+			iommu_set_device_table(iommu);
+			iommu_flush_all_caches(iommu);
+		}
+	}
+
+#ifdef CONFIG_IRQ_REMAP
+	if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+		amd_iommu_irq_ops.capability |= (1 << IRQ_POSTING_CAP);
+#endif
+}
+
+static void enable_iommus_v2(void)
+{
+	struct amd_iommu *iommu;
+
+	for_each_iommu(iommu) {
+		iommu_enable_ppr_log(iommu);
+		iommu_enable_gt(iommu);
+	}
+}
+
+static void enable_iommus(void)
+{
+	early_enable_iommus();
+
+	enable_iommus_v2();
+}
+
+static void disable_iommus(void)
+{
+	struct amd_iommu *iommu;
+
+	for_each_iommu(iommu)
+		iommu_disable(iommu);
+
+#ifdef CONFIG_IRQ_REMAP
+	if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+		amd_iommu_irq_ops.capability &= ~(1 << IRQ_POSTING_CAP);
+#endif
+}
+
+/*
+ * Suspend/Resume support
+ * disable suspend until real resume implemented
+ */
+
+static void amd_iommu_resume(void)
+{
+	struct amd_iommu *iommu;
+
+	for_each_iommu(iommu)
+		iommu_apply_resume_quirks(iommu);
+
+	/* re-load the hardware */
+	enable_iommus();
+
+	amd_iommu_enable_interrupts();
+}
+
+static int amd_iommu_suspend(void)
+{
+	/* disable IOMMUs to go out of the way for BIOS */
+	disable_iommus();
+
+	return 0;
+}
+
+static struct syscore_ops amd_iommu_syscore_ops = {
+	.suspend = amd_iommu_suspend,
+	.resume = amd_iommu_resume,
+};
+
+static void __init free_iommu_resources(void)
+{
+	kmemleak_free(irq_lookup_table);
+	free_pages((unsigned long)irq_lookup_table,
+		   get_order(rlookup_table_size));
+	irq_lookup_table = NULL;
+
+	kmem_cache_destroy(amd_iommu_irq_cache);
+	amd_iommu_irq_cache = NULL;
+
+	free_pages((unsigned long)amd_iommu_rlookup_table,
+		   get_order(rlookup_table_size));
+	amd_iommu_rlookup_table = NULL;
+
+	free_pages((unsigned long)amd_iommu_alias_table,
+		   get_order(alias_table_size));
+	amd_iommu_alias_table = NULL;
+
+	free_pages((unsigned long)amd_iommu_dev_table,
+		   get_order(dev_table_size));
+	amd_iommu_dev_table = NULL;
+
+	free_iommu_all();
+
+#ifdef CONFIG_GART_IOMMU
+	/*
+	 * We failed to initialize the AMD IOMMU - try fallback to GART
+	 * if possible.
+	 */
+	gart_iommu_init();
+
+#endif
+}
+
+/* SB IOAPIC is always on this device in AMD systems */
+#define IOAPIC_SB_DEVID		((0x00 << 8) | PCI_DEVFN(0x14, 0))
+
+static bool __init check_ioapic_information(void)
+{
+	const char *fw_bug = FW_BUG;
+	bool ret, has_sb_ioapic;
+	int idx;
+
+	has_sb_ioapic = false;
+	ret           = false;
+
+	/*
+	 * If we have map overrides on the kernel command line the
+	 * messages in this function might not describe firmware bugs
+	 * anymore - so be careful
+	 */
+	if (cmdline_maps)
+		fw_bug = "";
+
+	for (idx = 0; idx < nr_ioapics; idx++) {
+		int devid, id = mpc_ioapic_id(idx);
+
+		devid = get_ioapic_devid(id);
+		if (devid < 0) {
+			pr_err("%sAMD-Vi: IOAPIC[%d] not in IVRS table\n",
+				fw_bug, id);
+			ret = false;
+		} else if (devid == IOAPIC_SB_DEVID) {
+			has_sb_ioapic = true;
+			ret           = true;
+		}
+	}
+
+	if (!has_sb_ioapic) {
+		/*
+		 * We expect the SB IOAPIC to be listed in the IVRS
+		 * table. The system timer is connected to the SB IOAPIC
+		 * and if we don't have it in the list the system will
+		 * panic at boot time.  This situation usually happens
+		 * when the BIOS is buggy and provides us the wrong
+		 * device id for the IOAPIC in the system.
+		 */
+		pr_err("%sAMD-Vi: No southbridge IOAPIC found\n", fw_bug);
+	}
+
+	if (!ret)
+		pr_err("AMD-Vi: Disabling interrupt remapping\n");
+
+	return ret;
+}
+
+static void __init free_dma_resources(void)
+{
+	free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
+		   get_order(MAX_DOMAIN_ID/8));
+	amd_iommu_pd_alloc_bitmap = NULL;
+
+	free_unity_maps();
+}
+
+/*
+ * This is the hardware init function for AMD IOMMU in the system.
+ * This function is called either from amd_iommu_init or from the interrupt
+ * remapping setup code.
+ *
+ * This function basically parses the ACPI table for AMD IOMMU (IVRS)
+ * four times:
+ *
+ *	1 pass) Discover the most comprehensive IVHD type to use.
+ *
+ *	2 pass) Find the highest PCI device id the driver has to handle.
+ *		Upon this information the size of the data structures is
+ *		determined that needs to be allocated.
+ *
+ *	3 pass) Initialize the data structures just allocated with the
+ *		information in the ACPI table about available AMD IOMMUs
+ *		in the system. It also maps the PCI devices in the
+ *		system to specific IOMMUs
+ *
+ *	4 pass) After the basic data structures are allocated and
+ *		initialized we update them with information about memory
+ *		remapping requirements parsed out of the ACPI table in
+ *		this last pass.
+ *
+ * After everything is set up the IOMMUs are enabled and the necessary
+ * hotplug and suspend notifiers are registered.
+ */
+static int __init early_amd_iommu_init(void)
+{
+	struct acpi_table_header *ivrs_base;
+	acpi_status status;
+	int i, remap_cache_sz, ret = 0;
+
+	if (!amd_iommu_detected)
+		return -ENODEV;
+
+	status = acpi_get_table("IVRS", 0, &ivrs_base);
+	if (status == AE_NOT_FOUND)
+		return -ENODEV;
+	else if (ACPI_FAILURE(status)) {
+		const char *err = acpi_format_exception(status);
+		pr_err("AMD-Vi: IVRS table error: %s\n", err);
+		return -EINVAL;
+	}
+
+	/*
+	 * Validate checksum here so we don't need to do it when
+	 * we actually parse the table
+	 */
+	ret = check_ivrs_checksum(ivrs_base);
+	if (ret)
+		goto out;
+
+	amd_iommu_target_ivhd_type = get_highest_supported_ivhd_type(ivrs_base);
+	DUMP_printk("Using IVHD type %#x\n", amd_iommu_target_ivhd_type);
+
+	/*
+	 * First parse ACPI tables to find the largest Bus/Dev/Func
+	 * we need to handle. Upon this information the shared data
+	 * structures for the IOMMUs in the system will be allocated
+	 */
+	ret = find_last_devid_acpi(ivrs_base);
+	if (ret)
+		goto out;
+
+	dev_table_size     = tbl_size(DEV_TABLE_ENTRY_SIZE);
+	alias_table_size   = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
+	rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
+
+	/* Device table - directly used by all IOMMUs */
+	ret = -ENOMEM;
+	amd_iommu_dev_table = (void *)__get_free_pages(
+				      GFP_KERNEL | __GFP_ZERO | GFP_DMA32,
+				      get_order(dev_table_size));
+	if (amd_iommu_dev_table == NULL)
+		goto out;
+
+	/*
+	 * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
+	 * IOMMU see for that device
+	 */
+	amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
+			get_order(alias_table_size));
+	if (amd_iommu_alias_table == NULL)
+		goto out;
+
+	/* IOMMU rlookup table - find the IOMMU for a specific device */
+	amd_iommu_rlookup_table = (void *)__get_free_pages(
+			GFP_KERNEL | __GFP_ZERO,
+			get_order(rlookup_table_size));
+	if (amd_iommu_rlookup_table == NULL)
+		goto out;
+
+	amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
+					    GFP_KERNEL | __GFP_ZERO,
+					    get_order(MAX_DOMAIN_ID/8));
+	if (amd_iommu_pd_alloc_bitmap == NULL)
+		goto out;
+
+	/*
+	 * let all alias entries point to itself
+	 */
+	for (i = 0; i <= amd_iommu_last_bdf; ++i)
+		amd_iommu_alias_table[i] = i;
+
+	/*
+	 * never allocate domain 0 because its used as the non-allocated and
+	 * error value placeholder
+	 */
+	__set_bit(0, amd_iommu_pd_alloc_bitmap);
+
+	spin_lock_init(&amd_iommu_pd_lock);
+
+	/*
+	 * now the data structures are allocated and basically initialized
+	 * start the real acpi table scan
+	 */
+	ret = init_iommu_all(ivrs_base);
+	if (ret)
+		goto out;
+
+	/* Disable any previously enabled IOMMUs */
+	if (!is_kdump_kernel() || amd_iommu_disabled)
+		disable_iommus();
+
+	if (amd_iommu_irq_remap)
+		amd_iommu_irq_remap = check_ioapic_information();
+
+	if (amd_iommu_irq_remap) {
+		/*
+		 * Interrupt remapping enabled, create kmem_cache for the
+		 * remapping tables.
+		 */
+		ret = -ENOMEM;
+		if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
+			remap_cache_sz = MAX_IRQS_PER_TABLE * sizeof(u32);
+		else
+			remap_cache_sz = MAX_IRQS_PER_TABLE * (sizeof(u64) * 2);
+		amd_iommu_irq_cache = kmem_cache_create("irq_remap_cache",
+							remap_cache_sz,
+							IRQ_TABLE_ALIGNMENT,
+							0, NULL);
+		if (!amd_iommu_irq_cache)
+			goto out;
+
+		irq_lookup_table = (void *)__get_free_pages(
+				GFP_KERNEL | __GFP_ZERO,
+				get_order(rlookup_table_size));
+		kmemleak_alloc(irq_lookup_table, rlookup_table_size,
+			       1, GFP_KERNEL);
+		if (!irq_lookup_table)
+			goto out;
+	}
+
+	ret = init_memory_definitions(ivrs_base);
+	if (ret)
+		goto out;
+
+	/* init the device table */
+	init_device_table();
+
+out:
+	/* Don't leak any ACPI memory */
+	acpi_put_table(ivrs_base);
+	ivrs_base = NULL;
+
+	return ret;
+}
+
+static int amd_iommu_enable_interrupts(void)
+{
+	struct amd_iommu *iommu;
+	int ret = 0;
+
+	for_each_iommu(iommu) {
+		ret = iommu_init_msi(iommu);
+		if (ret)
+			goto out;
+	}
+
+out:
+	return ret;
+}
+
+static bool detect_ivrs(void)
+{
+	struct acpi_table_header *ivrs_base;
+	acpi_status status;
+
+	status = acpi_get_table("IVRS", 0, &ivrs_base);
+	if (status == AE_NOT_FOUND)
+		return false;
+	else if (ACPI_FAILURE(status)) {
+		const char *err = acpi_format_exception(status);
+		pr_err("AMD-Vi: IVRS table error: %s\n", err);
+		return false;
+	}
+
+	acpi_put_table(ivrs_base);
+
+	/* Make sure ACS will be enabled during PCI probe */
+	pci_request_acs();
+
+	return true;
+}
+
+/****************************************************************************
+ *
+ * AMD IOMMU Initialization State Machine
+ *
+ ****************************************************************************/
+
+static int __init state_next(void)
+{
+	int ret = 0;
+
+	switch (init_state) {
+	case IOMMU_START_STATE:
+		if (!detect_ivrs()) {
+			init_state	= IOMMU_NOT_FOUND;
+			ret		= -ENODEV;
+		} else {
+			init_state	= IOMMU_IVRS_DETECTED;
+		}
+		break;
+	case IOMMU_IVRS_DETECTED:
+		ret = early_amd_iommu_init();
+		init_state = ret ? IOMMU_INIT_ERROR : IOMMU_ACPI_FINISHED;
+		if (init_state == IOMMU_ACPI_FINISHED && amd_iommu_disabled) {
+			pr_info("AMD-Vi: AMD IOMMU disabled on kernel command-line\n");
+			free_dma_resources();
+			free_iommu_resources();
+			init_state = IOMMU_CMDLINE_DISABLED;
+			ret = -EINVAL;
+		}
+		break;
+	case IOMMU_ACPI_FINISHED:
+		early_enable_iommus();
+		x86_platform.iommu_shutdown = disable_iommus;
+		init_state = IOMMU_ENABLED;
+		break;
+	case IOMMU_ENABLED:
+		register_syscore_ops(&amd_iommu_syscore_ops);
+		ret = amd_iommu_init_pci();
+		init_state = ret ? IOMMU_INIT_ERROR : IOMMU_PCI_INIT;
+		enable_iommus_v2();
+		break;
+	case IOMMU_PCI_INIT:
+		ret = amd_iommu_enable_interrupts();
+		init_state = ret ? IOMMU_INIT_ERROR : IOMMU_INTERRUPTS_EN;
+		break;
+	case IOMMU_INTERRUPTS_EN:
+		ret = amd_iommu_init_dma_ops();
+		init_state = ret ? IOMMU_INIT_ERROR : IOMMU_DMA_OPS;
+		break;
+	case IOMMU_DMA_OPS:
+		init_state = IOMMU_INITIALIZED;
+		break;
+	case IOMMU_INITIALIZED:
+		/* Nothing to do */
+		break;
+	case IOMMU_NOT_FOUND:
+	case IOMMU_INIT_ERROR:
+	case IOMMU_CMDLINE_DISABLED:
+		/* Error states => do nothing */
+		ret = -EINVAL;
+		break;
+	default:
+		/* Unknown state */
+		BUG();
+	}
+
+	return ret;
+}
+
+static int __init iommu_go_to_state(enum iommu_init_state state)
+{
+	int ret = -EINVAL;
+
+	while (init_state != state) {
+		if (init_state == IOMMU_NOT_FOUND         ||
+		    init_state == IOMMU_INIT_ERROR        ||
+		    init_state == IOMMU_CMDLINE_DISABLED)
+			break;
+		ret = state_next();
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_IRQ_REMAP
+int __init amd_iommu_prepare(void)
+{
+	int ret;
+
+	amd_iommu_irq_remap = true;
+
+	ret = iommu_go_to_state(IOMMU_ACPI_FINISHED);
+	if (ret)
+		return ret;
+	return amd_iommu_irq_remap ? 0 : -ENODEV;
+}
+
+int __init amd_iommu_enable(void)
+{
+	int ret;
+
+	ret = iommu_go_to_state(IOMMU_ENABLED);
+	if (ret)
+		return ret;
+
+	irq_remapping_enabled = 1;
+	return amd_iommu_xt_mode;
+}
+
+void amd_iommu_disable(void)
+{
+	amd_iommu_suspend();
+}
+
+int amd_iommu_reenable(int mode)
+{
+	amd_iommu_resume();
+
+	return 0;
+}
+
+int __init amd_iommu_enable_faulting(void)
+{
+	/* We enable MSI later when PCI is initialized */
+	return 0;
+}
+#endif
+
+/*
+ * This is the core init function for AMD IOMMU hardware in the system.
+ * This function is called from the generic x86 DMA layer initialization
+ * code.
+ */
+static int __init amd_iommu_init(void)
+{
+	struct amd_iommu *iommu;
+	int ret;
+
+	ret = iommu_go_to_state(IOMMU_INITIALIZED);
+	if (ret) {
+		free_dma_resources();
+		if (!irq_remapping_enabled) {
+			disable_iommus();
+			free_iommu_resources();
+		} else {
+			uninit_device_table_dma();
+			for_each_iommu(iommu)
+				iommu_flush_all_caches(iommu);
+		}
+	}
+
+	for_each_iommu(iommu)
+		amd_iommu_debugfs_setup(iommu);
+
+	return ret;
+}
+
+static bool amd_iommu_sme_check(void)
+{
+	if (!sme_active() || (boot_cpu_data.x86 != 0x17))
+		return true;
+
+	/* For Fam17h, a specific level of support is required */
+	if (boot_cpu_data.microcode >= 0x08001205)
+		return true;
+
+	if ((boot_cpu_data.microcode >= 0x08001126) &&
+	    (boot_cpu_data.microcode <= 0x080011ff))
+		return true;
+
+	pr_notice("AMD-Vi: IOMMU not currently supported when SME is active\n");
+
+	return false;
+}
+
+/****************************************************************************
+ *
+ * Early detect code. This code runs at IOMMU detection time in the DMA
+ * layer. It just looks if there is an IVRS ACPI table to detect AMD
+ * IOMMUs
+ *
+ ****************************************************************************/
+int __init amd_iommu_detect(void)
+{
+	int ret;
+
+	if (no_iommu || (iommu_detected && !gart_iommu_aperture))
+		return -ENODEV;
+
+	if (!amd_iommu_sme_check())
+		return -ENODEV;
+
+	ret = iommu_go_to_state(IOMMU_IVRS_DETECTED);
+	if (ret)
+		return ret;
+
+	amd_iommu_detected = true;
+	iommu_detected = 1;
+	x86_init.iommu.iommu_init = amd_iommu_init;
+
+	return 1;
+}
+
+/****************************************************************************
+ *
+ * Parsing functions for the AMD IOMMU specific kernel command line
+ * options.
+ *
+ ****************************************************************************/
+
+static int __init parse_amd_iommu_dump(char *str)
+{
+	amd_iommu_dump = true;
+
+	return 1;
+}
+
+static int __init parse_amd_iommu_intr(char *str)
+{
+	for (; *str; ++str) {
+		if (strncmp(str, "legacy", 6) == 0) {
+			amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY_GA;
+			break;
+		}
+		if (strncmp(str, "vapic", 5) == 0) {
+			amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_VAPIC;
+			break;
+		}
+	}
+	return 1;
+}
+
+static int __init parse_amd_iommu_options(char *str)
+{
+	for (; *str; ++str) {
+		if (strncmp(str, "fullflush", 9) == 0)
+			amd_iommu_unmap_flush = true;
+		if (strncmp(str, "off", 3) == 0)
+			amd_iommu_disabled = true;
+		if (strncmp(str, "force_isolation", 15) == 0)
+			amd_iommu_force_isolation = true;
+	}
+
+	return 1;
+}
+
+static int __init parse_ivrs_ioapic(char *str)
+{
+	unsigned int bus, dev, fn;
+	int ret, id, i;
+	u16 devid;
+
+	ret = sscanf(str, "[%d]=%x:%x.%x", &id, &bus, &dev, &fn);
+
+	if (ret != 4) {
+		pr_err("AMD-Vi: Invalid command line: ivrs_ioapic%s\n", str);
+		return 1;
+	}
+
+	if (early_ioapic_map_size == EARLY_MAP_SIZE) {
+		pr_err("AMD-Vi: Early IOAPIC map overflow - ignoring ivrs_ioapic%s\n",
+			str);
+		return 1;
+	}
+
+	devid = ((bus & 0xff) << 8) | ((dev & 0x1f) << 3) | (fn & 0x7);
+
+	cmdline_maps			= true;
+	i				= early_ioapic_map_size++;
+	early_ioapic_map[i].id		= id;
+	early_ioapic_map[i].devid	= devid;
+	early_ioapic_map[i].cmd_line	= true;
+
+	return 1;
+}
+
+static int __init parse_ivrs_hpet(char *str)
+{
+	unsigned int bus, dev, fn;
+	int ret, id, i;
+	u16 devid;
+
+	ret = sscanf(str, "[%d]=%x:%x.%x", &id, &bus, &dev, &fn);
+
+	if (ret != 4) {
+		pr_err("AMD-Vi: Invalid command line: ivrs_hpet%s\n", str);
+		return 1;
+	}
+
+	if (early_hpet_map_size == EARLY_MAP_SIZE) {
+		pr_err("AMD-Vi: Early HPET map overflow - ignoring ivrs_hpet%s\n",
+			str);
+		return 1;
+	}
+
+	devid = ((bus & 0xff) << 8) | ((dev & 0x1f) << 3) | (fn & 0x7);
+
+	cmdline_maps			= true;
+	i				= early_hpet_map_size++;
+	early_hpet_map[i].id		= id;
+	early_hpet_map[i].devid		= devid;
+	early_hpet_map[i].cmd_line	= true;
+
+	return 1;
+}
+
+static int __init parse_ivrs_acpihid(char *str)
+{
+	u32 bus, dev, fn;
+	char *hid, *uid, *p;
+	char acpiid[ACPIHID_UID_LEN + ACPIHID_HID_LEN] = {0};
+	int ret, i;
+
+	ret = sscanf(str, "[%x:%x.%x]=%s", &bus, &dev, &fn, acpiid);
+	if (ret != 4) {
+		pr_err("AMD-Vi: Invalid command line: ivrs_acpihid(%s)\n", str);
+		return 1;
+	}
+
+	p = acpiid;
+	hid = strsep(&p, ":");
+	uid = p;
+
+	if (!hid || !(*hid) || !uid) {
+		pr_err("AMD-Vi: Invalid command line: hid or uid\n");
+		return 1;
+	}
+
+	i = early_acpihid_map_size++;
+	memcpy(early_acpihid_map[i].hid, hid, strlen(hid));
+	memcpy(early_acpihid_map[i].uid, uid, strlen(uid));
+	early_acpihid_map[i].devid =
+		((bus & 0xff) << 8) | ((dev & 0x1f) << 3) | (fn & 0x7);
+	early_acpihid_map[i].cmd_line	= true;
+
+	return 1;
+}
+
+__setup("amd_iommu_dump",	parse_amd_iommu_dump);
+__setup("amd_iommu=",		parse_amd_iommu_options);
+__setup("amd_iommu_intr=",	parse_amd_iommu_intr);
+__setup("ivrs_ioapic",		parse_ivrs_ioapic);
+__setup("ivrs_hpet",		parse_ivrs_hpet);
+__setup("ivrs_acpihid",		parse_ivrs_acpihid);
+
+IOMMU_INIT_FINISH(amd_iommu_detect,
+		  gart_iommu_hole_init,
+		  NULL,
+		  NULL);
+
+bool amd_iommu_v2_supported(void)
+{
+	return amd_iommu_v2_present;
+}
+EXPORT_SYMBOL(amd_iommu_v2_supported);
+
+struct amd_iommu *get_amd_iommu(unsigned int idx)
+{
+	unsigned int i = 0;
+	struct amd_iommu *iommu;
+
+	for_each_iommu(iommu)
+		if (i++ == idx)
+			return iommu;
+	return NULL;
+}
+EXPORT_SYMBOL(get_amd_iommu);
+
+/****************************************************************************
+ *
+ * IOMMU EFR Performance Counter support functionality. This code allows
+ * access to the IOMMU PC functionality.
+ *
+ ****************************************************************************/
+
+u8 amd_iommu_pc_get_max_banks(unsigned int idx)
+{
+	struct amd_iommu *iommu = get_amd_iommu(idx);
+
+	if (iommu)
+		return iommu->max_banks;
+
+	return 0;
+}
+EXPORT_SYMBOL(amd_iommu_pc_get_max_banks);
+
+bool amd_iommu_pc_supported(void)
+{
+	return amd_iommu_pc_present;
+}
+EXPORT_SYMBOL(amd_iommu_pc_supported);
+
+u8 amd_iommu_pc_get_max_counters(unsigned int idx)
+{
+	struct amd_iommu *iommu = get_amd_iommu(idx);
+
+	if (iommu)
+		return iommu->max_counters;
+
+	return 0;
+}
+EXPORT_SYMBOL(amd_iommu_pc_get_max_counters);
+
+static int iommu_pc_get_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
+				u8 fxn, u64 *value, bool is_write)
+{
+	u32 offset;
+	u32 max_offset_lim;
+
+	/* Make sure the IOMMU PC resource is available */
+	if (!amd_iommu_pc_present)
+		return -ENODEV;
+
+	/* Check for valid iommu and pc register indexing */
+	if (WARN_ON(!iommu || (fxn > 0x28) || (fxn & 7)))
+		return -ENODEV;
+
+	offset = (u32)(((0x40 | bank) << 12) | (cntr << 8) | fxn);
+
+	/* Limit the offset to the hw defined mmio region aperture */
+	max_offset_lim = (u32)(((0x40 | iommu->max_banks) << 12) |
+				(iommu->max_counters << 8) | 0x28);
+	if ((offset < MMIO_CNTR_REG_OFFSET) ||
+	    (offset > max_offset_lim))
+		return -EINVAL;
+
+	if (is_write) {
+		u64 val = *value & GENMASK_ULL(47, 0);
+
+		writel((u32)val, iommu->mmio_base + offset);
+		writel((val >> 32), iommu->mmio_base + offset + 4);
+	} else {
+		*value = readl(iommu->mmio_base + offset + 4);
+		*value <<= 32;
+		*value |= readl(iommu->mmio_base + offset);
+		*value &= GENMASK_ULL(47, 0);
+	}
+
+	return 0;
+}
+
+int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64 *value)
+{
+	if (!iommu)
+		return -EINVAL;
+
+	return iommu_pc_get_set_reg(iommu, bank, cntr, fxn, value, false);
+}
+EXPORT_SYMBOL(amd_iommu_pc_get_reg);
+
+int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64 *value)
+{
+	if (!iommu)
+		return -EINVAL;
+
+	return iommu_pc_get_set_reg(iommu, bank, cntr, fxn, value, true);
+}
+EXPORT_SYMBOL(amd_iommu_pc_set_reg);
diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h
new file mode 100644
index 000000000..a8cd0296f
--- /dev/null
+++ b/drivers/iommu/amd_iommu_proto.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <jroedel@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef _ASM_X86_AMD_IOMMU_PROTO_H
+#define _ASM_X86_AMD_IOMMU_PROTO_H
+
+#include "amd_iommu_types.h"
+
+extern int amd_iommu_get_num_iommus(void);
+extern int amd_iommu_init_dma_ops(void);
+extern int amd_iommu_init_passthrough(void);
+extern irqreturn_t amd_iommu_int_thread(int irq, void *data);
+extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
+extern void amd_iommu_apply_erratum_63(u16 devid);
+extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
+extern int amd_iommu_init_devices(void);
+extern void amd_iommu_uninit_devices(void);
+extern void amd_iommu_init_notifier(void);
+extern int amd_iommu_init_api(void);
+
+#ifdef CONFIG_AMD_IOMMU_DEBUGFS
+void amd_iommu_debugfs_setup(struct amd_iommu *iommu);
+#else
+static inline void amd_iommu_debugfs_setup(struct amd_iommu *iommu) {}
+#endif
+
+/* Needed for interrupt remapping */
+extern int amd_iommu_prepare(void);
+extern int amd_iommu_enable(void);
+extern void amd_iommu_disable(void);
+extern int amd_iommu_reenable(int);
+extern int amd_iommu_enable_faulting(void);
+extern int amd_iommu_guest_ir;
+
+/* IOMMUv2 specific functions */
+struct iommu_domain;
+
+extern bool amd_iommu_v2_supported(void);
+extern int amd_iommu_register_ppr_notifier(struct notifier_block *nb);
+extern int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb);
+extern void amd_iommu_domain_direct_map(struct iommu_domain *dom);
+extern int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids);
+extern int amd_iommu_flush_page(struct iommu_domain *dom, int pasid,
+				u64 address);
+extern int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid);
+extern int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid,
+				     unsigned long cr3);
+extern int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid);
+extern struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev);
+
+#ifdef CONFIG_IRQ_REMAP
+extern int amd_iommu_create_irq_domain(struct amd_iommu *iommu);
+#else
+static inline int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
+{
+	return 0;
+}
+#endif
+
+#define PPR_SUCCESS			0x0
+#define PPR_INVALID			0x1
+#define PPR_FAILURE			0xf
+
+extern int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid,
+				  int status, int tag);
+
+static inline bool is_rd890_iommu(struct pci_dev *pdev)
+{
+	return (pdev->vendor == PCI_VENDOR_ID_ATI) &&
+	       (pdev->device == PCI_DEVICE_ID_RD890_IOMMU);
+}
+
+static inline bool iommu_feature(struct amd_iommu *iommu, u64 f)
+{
+	if (!(iommu->cap & (1 << IOMMU_CAP_EFR)))
+		return false;
+
+	return !!(iommu->features & f);
+}
+
+static inline u64 iommu_virt_to_phys(void *vaddr)
+{
+	return (u64)__sme_set(virt_to_phys(vaddr));
+}
+
+static inline void *iommu_phys_to_virt(unsigned long paddr)
+{
+	return phys_to_virt(__sme_clr(paddr));
+}
+
+extern bool translation_pre_enabled(struct amd_iommu *iommu);
+extern struct iommu_dev_data *get_dev_data(struct device *dev);
+#endif /* _ASM_X86_AMD_IOMMU_PROTO_H  */
diff --git a/drivers/iommu/amd_iommu_quirks.c b/drivers/iommu/amd_iommu_quirks.c
new file mode 100644
index 000000000..5120ce4fd
--- /dev/null
+++ b/drivers/iommu/amd_iommu_quirks.c
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+/*
+ * Quirks for AMD IOMMU
+ *
+ * Copyright (C) 2019 Kai-Heng Feng <kai.heng.feng@canonical.com>
+ */
+
+#ifdef CONFIG_DMI
+#include <linux/dmi.h>
+
+#include "amd_iommu.h"
+
+#define IVHD_SPECIAL_IOAPIC		1
+
+struct ivrs_quirk_entry {
+	u8 id;
+	u16 devid;
+};
+
+enum {
+	DELL_INSPIRON_7375 = 0,
+	DELL_LATITUDE_5495,
+	LENOVO_IDEAPAD_330S_15ARR,
+};
+
+static const struct ivrs_quirk_entry ivrs_ioapic_quirks[][3] __initconst = {
+	/* ivrs_ioapic[4]=00:14.0 ivrs_ioapic[5]=00:00.2 */
+	[DELL_INSPIRON_7375] = {
+		{ .id = 4, .devid = 0xa0 },
+		{ .id = 5, .devid = 0x2 },
+		{}
+	},
+	/* ivrs_ioapic[4]=00:14.0 */
+	[DELL_LATITUDE_5495] = {
+		{ .id = 4, .devid = 0xa0 },
+		{}
+	},
+	/* ivrs_ioapic[32]=00:14.0 */
+	[LENOVO_IDEAPAD_330S_15ARR] = {
+		{ .id = 32, .devid = 0xa0 },
+		{}
+	},
+	{}
+};
+
+static int __init ivrs_ioapic_quirk_cb(const struct dmi_system_id *d)
+{
+	const struct ivrs_quirk_entry *i;
+
+	for (i = d->driver_data; i->id != 0 && i->devid != 0; i++)
+		add_special_device(IVHD_SPECIAL_IOAPIC, i->id, (u16 *)&i->devid, 0);
+
+	return 0;
+}
+
+static const struct dmi_system_id ivrs_quirks[] __initconst = {
+	{
+		.callback = ivrs_ioapic_quirk_cb,
+		.ident = "Dell Inspiron 7375",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 7375"),
+		},
+		.driver_data = (void *)&ivrs_ioapic_quirks[DELL_INSPIRON_7375],
+	},
+	{
+		.callback = ivrs_ioapic_quirk_cb,
+		.ident = "Dell Latitude 5495",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude 5495"),
+		},
+		.driver_data = (void *)&ivrs_ioapic_quirks[DELL_LATITUDE_5495],
+	},
+	{
+		/*
+		 * Acer Aspire A315-41 requires the very same workaround as
+		 * Dell Latitude 5495
+		 */
+		.callback = ivrs_ioapic_quirk_cb,
+		.ident = "Acer Aspire A315-41",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Aspire A315-41"),
+		},
+		.driver_data = (void *)&ivrs_ioapic_quirks[DELL_LATITUDE_5495],
+	},
+	{
+		.callback = ivrs_ioapic_quirk_cb,
+		.ident = "Lenovo ideapad 330S-15ARR",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "81FB"),
+		},
+		.driver_data = (void *)&ivrs_ioapic_quirks[LENOVO_IDEAPAD_330S_15ARR],
+	},
+	{}
+};
+
+void __init amd_iommu_apply_ivrs_quirks(void)
+{
+	dmi_check_system(ivrs_quirks);
+}
+#endif
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
new file mode 100644
index 000000000..0948c425d
--- /dev/null
+++ b/drivers/iommu/amd_iommu_types.h
@@ -0,0 +1,906 @@
+/*
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <jroedel@suse.de>
+ *         Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef _ASM_X86_AMD_IOMMU_TYPES_H
+#define _ASM_X86_AMD_IOMMU_TYPES_H
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/msi.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/irqreturn.h>
+
+/*
+ * Maximum number of IOMMUs supported
+ */
+#define MAX_IOMMUS	32
+
+/*
+ * some size calculation constants
+ */
+#define DEV_TABLE_ENTRY_SIZE		32
+#define ALIAS_TABLE_ENTRY_SIZE		2
+#define RLOOKUP_TABLE_ENTRY_SIZE	(sizeof(void *))
+
+/* Capability offsets used by the driver */
+#define MMIO_CAP_HDR_OFFSET	0x00
+#define MMIO_RANGE_OFFSET	0x0c
+#define MMIO_MISC_OFFSET	0x10
+
+/* Masks, shifts and macros to parse the device range capability */
+#define MMIO_RANGE_LD_MASK	0xff000000
+#define MMIO_RANGE_FD_MASK	0x00ff0000
+#define MMIO_RANGE_BUS_MASK	0x0000ff00
+#define MMIO_RANGE_LD_SHIFT	24
+#define MMIO_RANGE_FD_SHIFT	16
+#define MMIO_RANGE_BUS_SHIFT	8
+#define MMIO_GET_LD(x)  (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT)
+#define MMIO_GET_FD(x)  (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT)
+#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT)
+#define MMIO_MSI_NUM(x)	((x) & 0x1f)
+
+/* Flag masks for the AMD IOMMU exclusion range */
+#define MMIO_EXCL_ENABLE_MASK 0x01ULL
+#define MMIO_EXCL_ALLOW_MASK  0x02ULL
+
+/* Used offsets into the MMIO space */
+#define MMIO_DEV_TABLE_OFFSET   0x0000
+#define MMIO_CMD_BUF_OFFSET     0x0008
+#define MMIO_EVT_BUF_OFFSET     0x0010
+#define MMIO_CONTROL_OFFSET     0x0018
+#define MMIO_EXCL_BASE_OFFSET   0x0020
+#define MMIO_EXCL_LIMIT_OFFSET  0x0028
+#define MMIO_EXT_FEATURES	0x0030
+#define MMIO_PPR_LOG_OFFSET	0x0038
+#define MMIO_GA_LOG_BASE_OFFSET	0x00e0
+#define MMIO_GA_LOG_TAIL_OFFSET	0x00e8
+#define MMIO_CMD_HEAD_OFFSET	0x2000
+#define MMIO_CMD_TAIL_OFFSET	0x2008
+#define MMIO_EVT_HEAD_OFFSET	0x2010
+#define MMIO_EVT_TAIL_OFFSET	0x2018
+#define MMIO_STATUS_OFFSET	0x2020
+#define MMIO_PPR_HEAD_OFFSET	0x2030
+#define MMIO_PPR_TAIL_OFFSET	0x2038
+#define MMIO_GA_HEAD_OFFSET	0x2040
+#define MMIO_GA_TAIL_OFFSET	0x2048
+#define MMIO_CNTR_CONF_OFFSET	0x4000
+#define MMIO_CNTR_REG_OFFSET	0x40000
+#define MMIO_REG_END_OFFSET	0x80000
+
+
+
+/* Extended Feature Bits */
+#define FEATURE_PREFETCH	(1ULL<<0)
+#define FEATURE_PPR		(1ULL<<1)
+#define FEATURE_X2APIC		(1ULL<<2)
+#define FEATURE_NX		(1ULL<<3)
+#define FEATURE_GT		(1ULL<<4)
+#define FEATURE_IA		(1ULL<<6)
+#define FEATURE_GA		(1ULL<<7)
+#define FEATURE_HE		(1ULL<<8)
+#define FEATURE_PC		(1ULL<<9)
+#define FEATURE_GAM_VAPIC	(1ULL<<21)
+#define FEATURE_EPHSUP		(1ULL<<50)
+
+#define FEATURE_PASID_SHIFT	32
+#define FEATURE_PASID_MASK	(0x1fULL << FEATURE_PASID_SHIFT)
+
+#define FEATURE_GLXVAL_SHIFT	14
+#define FEATURE_GLXVAL_MASK	(0x03ULL << FEATURE_GLXVAL_SHIFT)
+
+/* Note:
+ * The current driver only support 16-bit PASID.
+ * Currently, hardware only implement upto 16-bit PASID
+ * even though the spec says it could have upto 20 bits.
+ */
+#define PASID_MASK		0x0000ffff
+
+/* MMIO status bits */
+#define MMIO_STATUS_EVT_INT_MASK	(1 << 1)
+#define MMIO_STATUS_COM_WAIT_INT_MASK	(1 << 2)
+#define MMIO_STATUS_PPR_INT_MASK	(1 << 6)
+#define MMIO_STATUS_GALOG_RUN_MASK	(1 << 8)
+#define MMIO_STATUS_GALOG_OVERFLOW_MASK	(1 << 9)
+#define MMIO_STATUS_GALOG_INT_MASK	(1 << 10)
+
+/* event logging constants */
+#define EVENT_ENTRY_SIZE	0x10
+#define EVENT_TYPE_SHIFT	28
+#define EVENT_TYPE_MASK		0xf
+#define EVENT_TYPE_ILL_DEV	0x1
+#define EVENT_TYPE_IO_FAULT	0x2
+#define EVENT_TYPE_DEV_TAB_ERR	0x3
+#define EVENT_TYPE_PAGE_TAB_ERR	0x4
+#define EVENT_TYPE_ILL_CMD	0x5
+#define EVENT_TYPE_CMD_HARD_ERR	0x6
+#define EVENT_TYPE_IOTLB_INV_TO	0x7
+#define EVENT_TYPE_INV_DEV_REQ	0x8
+#define EVENT_TYPE_INV_PPR_REQ	0x9
+#define EVENT_DEVID_MASK	0xffff
+#define EVENT_DEVID_SHIFT	0
+#define EVENT_DOMID_MASK	0xffff
+#define EVENT_DOMID_SHIFT	0
+#define EVENT_FLAGS_MASK	0xfff
+#define EVENT_FLAGS_SHIFT	0x10
+
+/* feature control bits */
+#define CONTROL_IOMMU_EN        0x00ULL
+#define CONTROL_HT_TUN_EN       0x01ULL
+#define CONTROL_EVT_LOG_EN      0x02ULL
+#define CONTROL_EVT_INT_EN      0x03ULL
+#define CONTROL_COMWAIT_EN      0x04ULL
+#define CONTROL_INV_TIMEOUT	0x05ULL
+#define CONTROL_PASSPW_EN       0x08ULL
+#define CONTROL_RESPASSPW_EN    0x09ULL
+#define CONTROL_COHERENT_EN     0x0aULL
+#define CONTROL_ISOC_EN         0x0bULL
+#define CONTROL_CMDBUF_EN       0x0cULL
+#define CONTROL_PPFLOG_EN       0x0dULL
+#define CONTROL_PPFINT_EN       0x0eULL
+#define CONTROL_PPR_EN          0x0fULL
+#define CONTROL_GT_EN           0x10ULL
+#define CONTROL_GA_EN           0x11ULL
+#define CONTROL_GAM_EN          0x19ULL
+#define CONTROL_GALOG_EN        0x1CULL
+#define CONTROL_GAINT_EN        0x1DULL
+#define CONTROL_XT_EN           0x32ULL
+
+#define CTRL_INV_TO_MASK	(7 << CONTROL_INV_TIMEOUT)
+#define CTRL_INV_TO_NONE	0
+#define CTRL_INV_TO_1MS		1
+#define CTRL_INV_TO_10MS	2
+#define CTRL_INV_TO_100MS	3
+#define CTRL_INV_TO_1S		4
+#define CTRL_INV_TO_10S		5
+#define CTRL_INV_TO_100S	6
+
+/* command specific defines */
+#define CMD_COMPL_WAIT          0x01
+#define CMD_INV_DEV_ENTRY       0x02
+#define CMD_INV_IOMMU_PAGES	0x03
+#define CMD_INV_IOTLB_PAGES	0x04
+#define CMD_INV_IRT		0x05
+#define CMD_COMPLETE_PPR	0x07
+#define CMD_INV_ALL		0x08
+
+#define CMD_COMPL_WAIT_STORE_MASK	0x01
+#define CMD_COMPL_WAIT_INT_MASK		0x02
+#define CMD_INV_IOMMU_PAGES_SIZE_MASK	0x01
+#define CMD_INV_IOMMU_PAGES_PDE_MASK	0x02
+#define CMD_INV_IOMMU_PAGES_GN_MASK	0x04
+
+#define PPR_STATUS_MASK			0xf
+#define PPR_STATUS_SHIFT		12
+
+#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS	0x7fffffffffffffffULL
+
+/* macros and definitions for device table entries */
+#define DEV_ENTRY_VALID         0x00
+#define DEV_ENTRY_TRANSLATION   0x01
+#define DEV_ENTRY_PPR           0x34
+#define DEV_ENTRY_IR            0x3d
+#define DEV_ENTRY_IW            0x3e
+#define DEV_ENTRY_NO_PAGE_FAULT	0x62
+#define DEV_ENTRY_EX            0x67
+#define DEV_ENTRY_SYSMGT1       0x68
+#define DEV_ENTRY_SYSMGT2       0x69
+#define DEV_ENTRY_IRQ_TBL_EN	0x80
+#define DEV_ENTRY_INIT_PASS     0xb8
+#define DEV_ENTRY_EINT_PASS     0xb9
+#define DEV_ENTRY_NMI_PASS      0xba
+#define DEV_ENTRY_LINT0_PASS    0xbe
+#define DEV_ENTRY_LINT1_PASS    0xbf
+#define DEV_ENTRY_MODE_MASK	0x07
+#define DEV_ENTRY_MODE_SHIFT	0x09
+
+#define MAX_DEV_TABLE_ENTRIES	0xffff
+
+/* constants to configure the command buffer */
+#define CMD_BUFFER_SIZE    8192
+#define CMD_BUFFER_UNINITIALIZED 1
+#define CMD_BUFFER_ENTRIES 512
+#define MMIO_CMD_SIZE_SHIFT 56
+#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
+
+/* constants for event buffer handling */
+#define EVT_BUFFER_SIZE		8192 /* 512 entries */
+#define EVT_LEN_MASK		(0x9ULL << 56)
+
+/* Constants for PPR Log handling */
+#define PPR_LOG_ENTRIES		512
+#define PPR_LOG_SIZE_SHIFT	56
+#define PPR_LOG_SIZE_512	(0x9ULL << PPR_LOG_SIZE_SHIFT)
+#define PPR_ENTRY_SIZE		16
+#define PPR_LOG_SIZE		(PPR_ENTRY_SIZE * PPR_LOG_ENTRIES)
+
+#define PPR_REQ_TYPE(x)		(((x) >> 60) & 0xfULL)
+#define PPR_FLAGS(x)		(((x) >> 48) & 0xfffULL)
+#define PPR_DEVID(x)		((x) & 0xffffULL)
+#define PPR_TAG(x)		(((x) >> 32) & 0x3ffULL)
+#define PPR_PASID1(x)		(((x) >> 16) & 0xffffULL)
+#define PPR_PASID2(x)		(((x) >> 42) & 0xfULL)
+#define PPR_PASID(x)		((PPR_PASID2(x) << 16) | PPR_PASID1(x))
+
+#define PPR_REQ_FAULT		0x01
+
+/* Constants for GA Log handling */
+#define GA_LOG_ENTRIES		512
+#define GA_LOG_SIZE_SHIFT	56
+#define GA_LOG_SIZE_512		(0x8ULL << GA_LOG_SIZE_SHIFT)
+#define GA_ENTRY_SIZE		8
+#define GA_LOG_SIZE		(GA_ENTRY_SIZE * GA_LOG_ENTRIES)
+
+#define GA_TAG(x)		(u32)(x & 0xffffffffULL)
+#define GA_DEVID(x)		(u16)(((x) >> 32) & 0xffffULL)
+#define GA_REQ_TYPE(x)		(((x) >> 60) & 0xfULL)
+
+#define GA_GUEST_NR		0x1
+
+/* Bit value definition for dte irq remapping fields*/
+#define DTE_IRQ_PHYS_ADDR_MASK	(((1ULL << 45)-1) << 6)
+#define DTE_IRQ_REMAP_INTCTL_MASK	(0x3ULL << 60)
+#define DTE_IRQ_TABLE_LEN_MASK	(0xfULL << 1)
+#define DTE_IRQ_REMAP_INTCTL    (2ULL << 60)
+#define DTE_IRQ_TABLE_LEN       (9ULL << 1)
+#define DTE_IRQ_REMAP_ENABLE    1ULL
+
+#define PAGE_MODE_NONE    0x00
+#define PAGE_MODE_1_LEVEL 0x01
+#define PAGE_MODE_2_LEVEL 0x02
+#define PAGE_MODE_3_LEVEL 0x03
+#define PAGE_MODE_4_LEVEL 0x04
+#define PAGE_MODE_5_LEVEL 0x05
+#define PAGE_MODE_6_LEVEL 0x06
+
+#define PM_LEVEL_SHIFT(x)	(12 + ((x) * 9))
+#define PM_LEVEL_SIZE(x)	(((x) < 6) ? \
+				  ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \
+				   (0xffffffffffffffffULL))
+#define PM_LEVEL_INDEX(x, a)	(((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
+#define PM_LEVEL_ENC(x)		(((x) << 9) & 0xe00ULL)
+#define PM_LEVEL_PDE(x, a)	((a) | PM_LEVEL_ENC((x)) | \
+				 IOMMU_PTE_PR | IOMMU_PTE_IR | IOMMU_PTE_IW)
+#define PM_PTE_LEVEL(pte)	(((pte) >> 9) & 0x7ULL)
+
+#define PM_MAP_4k		0
+#define PM_ADDR_MASK		0x000ffffffffff000ULL
+#define PM_MAP_MASK(lvl)	(PM_ADDR_MASK & \
+				(~((1ULL << (12 + ((lvl) * 9))) - 1)))
+#define PM_ALIGNED(lvl, addr)	((PM_MAP_MASK(lvl) & (addr)) == (addr))
+
+/*
+ * Returns the page table level to use for a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_LEVEL(pagesize) \
+		((__ffs(pagesize) - 12) / 9)
+/*
+ * Returns the number of ptes to use for a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_PTE_COUNT(pagesize) \
+		(1ULL << ((__ffs(pagesize) - 12) % 9))
+
+/*
+ * Aligns a given io-virtual address to a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_ALIGN(address, pagesize) \
+		((address) & ~((pagesize) - 1))
+/*
+ * Creates an IOMMU PTE for an address and a given pagesize
+ * The PTE has no permission bits set
+ * Pagesize is expected to be a power-of-two larger than 4096
+ */
+#define PAGE_SIZE_PTE(address, pagesize)		\
+		(((address) | ((pagesize) - 1)) &	\
+		 (~(pagesize >> 1)) & PM_ADDR_MASK)
+
+/*
+ * Takes a PTE value with mode=0x07 and returns the page size it maps
+ */
+#define PTE_PAGE_SIZE(pte) \
+	(1ULL << (1 + ffz(((pte) | 0xfffULL))))
+
+/*
+ * Takes a page-table level and returns the default page-size for this level
+ */
+#define PTE_LEVEL_PAGE_SIZE(level)			\
+	(1ULL << (12 + (9 * (level))))
+
+/*
+ * Bit value definition for I/O PTE fields
+ */
+#define IOMMU_PTE_PR (1ULL << 0)
+#define IOMMU_PTE_U  (1ULL << 59)
+#define IOMMU_PTE_FC (1ULL << 60)
+#define IOMMU_PTE_IR (1ULL << 61)
+#define IOMMU_PTE_IW (1ULL << 62)
+
+/*
+ * Bit value definition for DTE fields
+ */
+#define DTE_FLAG_V  (1ULL << 0)
+#define DTE_FLAG_TV (1ULL << 1)
+#define DTE_FLAG_IR (1ULL << 61)
+#define DTE_FLAG_IW (1ULL << 62)
+
+#define DTE_FLAG_IOTLB	(1ULL << 32)
+#define DTE_FLAG_GV	(1ULL << 55)
+#define DTE_FLAG_MASK	(0x3ffULL << 32)
+#define DTE_GLX_SHIFT	(56)
+#define DTE_GLX_MASK	(3)
+#define DEV_DOMID_MASK	0xffffULL
+
+#define DTE_GCR3_VAL_A(x)	(((x) >> 12) & 0x00007ULL)
+#define DTE_GCR3_VAL_B(x)	(((x) >> 15) & 0x0ffffULL)
+#define DTE_GCR3_VAL_C(x)	(((x) >> 31) & 0x1fffffULL)
+
+#define DTE_GCR3_INDEX_A	0
+#define DTE_GCR3_INDEX_B	1
+#define DTE_GCR3_INDEX_C	1
+
+#define DTE_GCR3_SHIFT_A	58
+#define DTE_GCR3_SHIFT_B	16
+#define DTE_GCR3_SHIFT_C	43
+
+#define GCR3_VALID		0x01ULL
+
+#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
+#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR)
+#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
+#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
+
+#define IOMMU_PROT_MASK 0x03
+#define IOMMU_PROT_IR 0x01
+#define IOMMU_PROT_IW 0x02
+
+#define IOMMU_UNITY_MAP_FLAG_EXCL_RANGE	(1 << 2)
+
+/* IOMMU capabilities */
+#define IOMMU_CAP_IOTLB   24
+#define IOMMU_CAP_NPCACHE 26
+#define IOMMU_CAP_EFR     27
+
+/* IOMMU Feature Reporting Field (for IVHD type 10h */
+#define IOMMU_FEAT_XTSUP_SHIFT	0
+#define IOMMU_FEAT_GASUP_SHIFT	6
+
+/* IOMMU Extended Feature Register (EFR) */
+#define IOMMU_EFR_XTSUP_SHIFT	2
+#define IOMMU_EFR_GASUP_SHIFT	7
+
+#define MAX_DOMAIN_ID 65536
+
+/* Protection domain flags */
+#define PD_DMA_OPS_MASK		(1UL << 0) /* domain used for dma_ops */
+#define PD_DEFAULT_MASK		(1UL << 1) /* domain is a default dma_ops
+					      domain for an IOMMU */
+#define PD_PASSTHROUGH_MASK	(1UL << 2) /* domain has no page
+					      translation */
+#define PD_IOMMUV2_MASK		(1UL << 3) /* domain has gcr3 table */
+
+extern bool amd_iommu_dump;
+#define DUMP_printk(format, arg...)					\
+	do {								\
+		if (amd_iommu_dump)						\
+			printk(KERN_INFO "AMD-Vi: " format, ## arg);	\
+	} while(0);
+
+/* global flag if IOMMUs cache non-present entries */
+extern bool amd_iommu_np_cache;
+/* Only true if all IOMMUs support device IOTLBs */
+extern bool amd_iommu_iotlb_sup;
+
+/*
+ * AMD IOMMU hardware only support 512 IRTEs despite
+ * the architectural limitation of 2048 entries.
+ */
+#define MAX_IRQS_PER_TABLE	512
+#define IRQ_TABLE_ALIGNMENT	128
+
+struct irq_remap_table {
+	raw_spinlock_t lock;
+	unsigned min_index;
+	u32 *table;
+};
+
+extern struct irq_remap_table **irq_lookup_table;
+
+/* Interrupt remapping feature used? */
+extern bool amd_iommu_irq_remap;
+
+/* kmem_cache to get tables with 128 byte alignement */
+extern struct kmem_cache *amd_iommu_irq_cache;
+
+/*
+ * Make iterating over all IOMMUs easier
+ */
+#define for_each_iommu(iommu) \
+	list_for_each_entry((iommu), &amd_iommu_list, list)
+#define for_each_iommu_safe(iommu, next) \
+	list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
+
+#define APERTURE_RANGE_SHIFT	27	/* 128 MB */
+#define APERTURE_RANGE_SIZE	(1ULL << APERTURE_RANGE_SHIFT)
+#define APERTURE_RANGE_PAGES	(APERTURE_RANGE_SIZE >> PAGE_SHIFT)
+#define APERTURE_MAX_RANGES	32	/* allows 4GB of DMA address space */
+#define APERTURE_RANGE_INDEX(a)	((a) >> APERTURE_RANGE_SHIFT)
+#define APERTURE_PAGE_INDEX(a)	(((a) >> 21) & 0x3fULL)
+
+/*
+ * This struct is used to pass information about
+ * incoming PPR faults around.
+ */
+struct amd_iommu_fault {
+	u64 address;    /* IO virtual address of the fault*/
+	u32 pasid;      /* Address space identifier */
+	u16 device_id;  /* Originating PCI device id */
+	u16 tag;        /* PPR tag */
+	u16 flags;      /* Fault flags */
+
+};
+
+
+struct iommu_domain;
+struct irq_domain;
+struct amd_irte_ops;
+
+#define AMD_IOMMU_FLAG_TRANS_PRE_ENABLED      (1 << 0)
+
+/*
+ * This structure contains generic data for  IOMMU protection domains
+ * independent of their use.
+ */
+struct protection_domain {
+	struct list_head list;  /* for list of all protection domains */
+	struct list_head dev_list; /* List of all devices in this domain */
+	struct iommu_domain domain; /* generic domain handle used by
+				       iommu core code */
+	spinlock_t lock;	/* mostly used to lock the page table*/
+	struct mutex api_lock;	/* protect page tables in the iommu-api path */
+	u16 id;			/* the domain id written to the device table */
+	int mode;		/* paging mode (0-6 levels) */
+	u64 *pt_root;		/* page table root pointer */
+	int glx;		/* Number of levels for GCR3 table */
+	u64 *gcr3_tbl;		/* Guest CR3 table */
+	unsigned long flags;	/* flags to find out type of domain */
+	bool updated;		/* complete domain flush required */
+	unsigned dev_cnt;	/* devices assigned to this domain */
+	unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
+};
+
+/*
+ * Structure where we save information about one hardware AMD IOMMU in the
+ * system.
+ */
+struct amd_iommu {
+	struct list_head list;
+
+	/* Index within the IOMMU array */
+	int index;
+
+	/* locks the accesses to the hardware */
+	raw_spinlock_t lock;
+
+	/* Pointer to PCI device of this IOMMU */
+	struct pci_dev *dev;
+
+	/* Cache pdev to root device for resume quirks */
+	struct pci_dev *root_pdev;
+
+	/* physical address of MMIO space */
+	u64 mmio_phys;
+
+	/* physical end address of MMIO space */
+	u64 mmio_phys_end;
+
+	/* virtual address of MMIO space */
+	u8 __iomem *mmio_base;
+
+	/* capabilities of that IOMMU read from ACPI */
+	u32 cap;
+
+	/* flags read from acpi table */
+	u8 acpi_flags;
+
+	/* Extended features */
+	u64 features;
+
+	/* IOMMUv2 */
+	bool is_iommu_v2;
+
+	/* PCI device id of the IOMMU device */
+	u16 devid;
+
+	/*
+	 * Capability pointer. There could be more than one IOMMU per PCI
+	 * device function if there are more than one AMD IOMMU capability
+	 * pointers.
+	 */
+	u16 cap_ptr;
+
+	/* pci domain of this IOMMU */
+	u16 pci_seg;
+
+	/* start of exclusion range of that IOMMU */
+	u64 exclusion_start;
+	/* length of exclusion range of that IOMMU */
+	u64 exclusion_length;
+
+	/* command buffer virtual address */
+	u8 *cmd_buf;
+	u32 cmd_buf_head;
+	u32 cmd_buf_tail;
+
+	/* event buffer virtual address */
+	u8 *evt_buf;
+
+	/* Base of the PPR log, if present */
+	u8 *ppr_log;
+
+	/* Base of the GA log, if present */
+	u8 *ga_log;
+
+	/* Tail of the GA log, if present */
+	u8 *ga_log_tail;
+
+	/* true if interrupts for this IOMMU are already enabled */
+	bool int_enabled;
+
+	/* if one, we need to send a completion wait command */
+	bool need_sync;
+
+	/* Handle for IOMMU core code */
+	struct iommu_device iommu;
+
+	/*
+	 * We can't rely on the BIOS to restore all values on reinit, so we
+	 * need to stash them
+	 */
+
+	/* The iommu BAR */
+	u32 stored_addr_lo;
+	u32 stored_addr_hi;
+
+	/*
+	 * Each iommu has 6 l1s, each of which is documented as having 0x12
+	 * registers
+	 */
+	u32 stored_l1[6][0x12];
+
+	/* The l2 indirect registers */
+	u32 stored_l2[0x83];
+
+	/* The maximum PC banks and counters/bank (PCSup=1) */
+	u8 max_banks;
+	u8 max_counters;
+#ifdef CONFIG_IRQ_REMAP
+	struct irq_domain *ir_domain;
+	struct irq_domain *msi_domain;
+
+	struct amd_irte_ops *irte_ops;
+#endif
+
+	u32 flags;
+	volatile u64 __aligned(8) cmd_sem;
+
+#ifdef CONFIG_AMD_IOMMU_DEBUGFS
+	/* DebugFS Info */
+	struct dentry *debugfs;
+#endif
+};
+
+static inline struct amd_iommu *dev_to_amd_iommu(struct device *dev)
+{
+	struct iommu_device *iommu = dev_to_iommu_device(dev);
+
+	return container_of(iommu, struct amd_iommu, iommu);
+}
+
+#define ACPIHID_UID_LEN 256
+#define ACPIHID_HID_LEN 9
+
+struct acpihid_map_entry {
+	struct list_head list;
+	u8 uid[ACPIHID_UID_LEN];
+	u8 hid[ACPIHID_HID_LEN];
+	u16 devid;
+	u16 root_devid;
+	bool cmd_line;
+	struct iommu_group *group;
+};
+
+struct devid_map {
+	struct list_head list;
+	u8 id;
+	u16 devid;
+	bool cmd_line;
+};
+
+/*
+ * This struct contains device specific data for the IOMMU
+ */
+struct iommu_dev_data {
+	struct list_head list;		  /* For domain->dev_list */
+	struct llist_node dev_data_list;  /* For global dev_data_list */
+	struct protection_domain *domain; /* Domain the device is bound to */
+	u16 devid;			  /* PCI Device ID */
+	u16 alias;			  /* Alias Device ID */
+	bool iommu_v2;			  /* Device can make use of IOMMUv2 */
+	bool passthrough;		  /* Device is identity mapped */
+	struct {
+		bool enabled;
+		int qdep;
+	} ats;				  /* ATS state */
+	bool pri_tlp;			  /* PASID TLB required for
+					     PPR completions */
+	u32 errata;			  /* Bitmap for errata to apply */
+	bool use_vapic;			  /* Enable device to use vapic mode */
+	bool defer_attach;
+
+	struct ratelimit_state rs;        /* Ratelimit IOPF messages */
+};
+
+/* Map HPET and IOAPIC ids to the devid used by the IOMMU */
+extern struct list_head ioapic_map;
+extern struct list_head hpet_map;
+extern struct list_head acpihid_map;
+
+/*
+ * List with all IOMMUs in the system. This list is not locked because it is
+ * only written and read at driver initialization or suspend time
+ */
+extern struct list_head amd_iommu_list;
+
+/*
+ * Array with pointers to each IOMMU struct
+ * The indices are referenced in the protection domains
+ */
+extern struct amd_iommu *amd_iommus[MAX_IOMMUS];
+
+/*
+ * Declarations for the global list of all protection domains
+ */
+extern spinlock_t amd_iommu_pd_lock;
+extern struct list_head amd_iommu_pd_list;
+
+/*
+ * Structure defining one entry in the device table
+ */
+struct dev_table_entry {
+	u64 data[4];
+};
+
+/*
+ * One entry for unity mappings parsed out of the ACPI table.
+ */
+struct unity_map_entry {
+	struct list_head list;
+
+	/* starting device id this entry is used for (including) */
+	u16 devid_start;
+	/* end device id this entry is used for (including) */
+	u16 devid_end;
+
+	/* start address to unity map (including) */
+	u64 address_start;
+	/* end address to unity map (including) */
+	u64 address_end;
+
+	/* required protection */
+	int prot;
+};
+
+/*
+ * List of all unity mappings. It is not locked because as runtime it is only
+ * read. It is created at ACPI table parsing time.
+ */
+extern struct list_head amd_iommu_unity_map;
+
+/*
+ * Data structures for device handling
+ */
+
+/*
+ * Device table used by hardware. Read and write accesses by software are
+ * locked with the amd_iommu_pd_table lock.
+ */
+extern struct dev_table_entry *amd_iommu_dev_table;
+
+/*
+ * Alias table to find requestor ids to device ids. Not locked because only
+ * read on runtime.
+ */
+extern u16 *amd_iommu_alias_table;
+
+/*
+ * Reverse lookup table to find the IOMMU which translates a specific device.
+ */
+extern struct amd_iommu **amd_iommu_rlookup_table;
+
+/* size of the dma_ops aperture as power of 2 */
+extern unsigned amd_iommu_aperture_order;
+
+/* largest PCI device id we expect translation requests for */
+extern u16 amd_iommu_last_bdf;
+
+/* allocation bitmap for domain ids */
+extern unsigned long *amd_iommu_pd_alloc_bitmap;
+
+/*
+ * If true, the addresses will be flushed on unmap time, not when
+ * they are reused
+ */
+extern bool amd_iommu_unmap_flush;
+
+/* Smallest max PASID supported by any IOMMU in the system */
+extern u32 amd_iommu_max_pasid;
+
+extern bool amd_iommu_v2_present;
+
+extern bool amd_iommu_force_isolation;
+
+/* Max levels of glxval supported */
+extern int amd_iommu_max_glx_val;
+
+/*
+ * This function flushes all internal caches of
+ * the IOMMU used by this driver.
+ */
+extern void iommu_flush_all_caches(struct amd_iommu *iommu);
+
+static inline int get_ioapic_devid(int id)
+{
+	struct devid_map *entry;
+
+	list_for_each_entry(entry, &ioapic_map, list) {
+		if (entry->id == id)
+			return entry->devid;
+	}
+
+	return -EINVAL;
+}
+
+static inline int get_hpet_devid(int id)
+{
+	struct devid_map *entry;
+
+	list_for_each_entry(entry, &hpet_map, list) {
+		if (entry->id == id)
+			return entry->devid;
+	}
+
+	return -EINVAL;
+}
+
+enum amd_iommu_intr_mode_type {
+	AMD_IOMMU_GUEST_IR_LEGACY,
+
+	/* This mode is not visible to users. It is used when
+	 * we cannot fully enable vAPIC and fallback to only support
+	 * legacy interrupt remapping via 128-bit IRTE.
+	 */
+	AMD_IOMMU_GUEST_IR_LEGACY_GA,
+	AMD_IOMMU_GUEST_IR_VAPIC,
+};
+
+#define AMD_IOMMU_GUEST_IR_GA(x)	(x == AMD_IOMMU_GUEST_IR_VAPIC || \
+					 x == AMD_IOMMU_GUEST_IR_LEGACY_GA)
+
+#define AMD_IOMMU_GUEST_IR_VAPIC(x)	(x == AMD_IOMMU_GUEST_IR_VAPIC)
+
+union irte {
+	u32 val;
+	struct {
+		u32 valid	: 1,
+		    no_fault	: 1,
+		    int_type	: 3,
+		    rq_eoi	: 1,
+		    dm		: 1,
+		    rsvd_1	: 1,
+		    destination	: 8,
+		    vector	: 8,
+		    rsvd_2	: 8;
+	} fields;
+};
+
+#define APICID_TO_IRTE_DEST_LO(x)    (x & 0xffffff)
+#define APICID_TO_IRTE_DEST_HI(x)    ((x >> 24) & 0xff)
+
+union irte_ga_lo {
+	u64 val;
+
+	/* For int remapping */
+	struct {
+		u64 valid	: 1,
+		    no_fault	: 1,
+		    /* ------ */
+		    int_type	: 3,
+		    rq_eoi	: 1,
+		    dm		: 1,
+		    /* ------ */
+		    guest_mode	: 1,
+		    destination	: 24,
+		    ga_tag	: 32;
+	} fields_remap;
+
+	/* For guest vAPIC */
+	struct {
+		u64 valid	: 1,
+		    no_fault	: 1,
+		    /* ------ */
+		    ga_log_intr	: 1,
+		    rsvd1	: 3,
+		    is_run	: 1,
+		    /* ------ */
+		    guest_mode	: 1,
+		    destination	: 24,
+		    ga_tag	: 32;
+	} fields_vapic;
+};
+
+union irte_ga_hi {
+	u64 val;
+	struct {
+		u64 vector	: 8,
+		    rsvd_1	: 4,
+		    ga_root_ptr	: 40,
+		    rsvd_2	: 4,
+		    destination : 8;
+	} fields;
+};
+
+struct irte_ga {
+	union irte_ga_lo lo;
+	union irte_ga_hi hi;
+};
+
+struct irq_2_irte {
+	u16 devid; /* Device ID for IRTE table */
+	u16 index; /* Index into IRTE table*/
+};
+
+struct amd_ir_data {
+	u32 cached_ga_tag;
+	struct irq_2_irte irq_2_irte;
+	struct msi_msg msi_entry;
+	void *entry;    /* Pointer to union irte or struct irte_ga */
+	void *ref;      /* Pointer to the actual irte */
+};
+
+struct amd_irte_ops {
+	void (*prepare)(void *, u32, u32, u8, u32, int);
+	void (*activate)(void *, u16, u16);
+	void (*deactivate)(void *, u16, u16);
+	void (*set_affinity)(void *, u16, u16, u8, u32);
+	void *(*get)(struct irq_remap_table *, int);
+	void (*set_allocated)(struct irq_remap_table *, int);
+	bool (*is_allocated)(struct irq_remap_table *, int);
+	void (*clear_allocated)(struct irq_remap_table *, int);
+};
+
+#ifdef CONFIG_IRQ_REMAP
+extern struct amd_irte_ops irte_32_ops;
+extern struct amd_irte_ops irte_128_ops;
+#endif
+
+#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
new file mode 100644
index 000000000..7a59a8eba
--- /dev/null
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -0,0 +1,1027 @@
+/*
+ * Copyright (C) 2010-2012 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <jroedel@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/mmu_notifier.h>
+#include <linux/amd-iommu.h>
+#include <linux/mm_types.h>
+#include <linux/profile.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/iommu.h>
+#include <linux/wait.h>
+#include <linux/pci.h>
+#include <linux/gfp.h>
+
+#include "amd_iommu_types.h"
+#include "amd_iommu_proto.h"
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Joerg Roedel <jroedel@suse.de>");
+
+#define MAX_DEVICES		0x10000
+#define PRI_QUEUE_SIZE		512
+
+struct pri_queue {
+	atomic_t inflight;
+	bool finish;
+	int status;
+};
+
+struct pasid_state {
+	struct list_head list;			/* For global state-list */
+	atomic_t count;				/* Reference count */
+	unsigned mmu_notifier_count;		/* Counting nested mmu_notifier
+						   calls */
+	struct mm_struct *mm;			/* mm_struct for the faults */
+	struct mmu_notifier mn;                 /* mmu_notifier handle */
+	struct pri_queue pri[PRI_QUEUE_SIZE];	/* PRI tag states */
+	struct device_state *device_state;	/* Link to our device_state */
+	int pasid;				/* PASID index */
+	bool invalid;				/* Used during setup and
+						   teardown of the pasid */
+	spinlock_t lock;			/* Protect pri_queues and
+						   mmu_notifer_count */
+	wait_queue_head_t wq;			/* To wait for count == 0 */
+};
+
+struct device_state {
+	struct list_head list;
+	u16 devid;
+	atomic_t count;
+	struct pci_dev *pdev;
+	struct pasid_state **states;
+	struct iommu_domain *domain;
+	int pasid_levels;
+	int max_pasids;
+	amd_iommu_invalid_ppr_cb inv_ppr_cb;
+	amd_iommu_invalidate_ctx inv_ctx_cb;
+	spinlock_t lock;
+	wait_queue_head_t wq;
+};
+
+struct fault {
+	struct work_struct work;
+	struct device_state *dev_state;
+	struct pasid_state *state;
+	struct mm_struct *mm;
+	u64 address;
+	u16 devid;
+	u16 pasid;
+	u16 tag;
+	u16 finish;
+	u16 flags;
+};
+
+static LIST_HEAD(state_list);
+static spinlock_t state_lock;
+
+static struct workqueue_struct *iommu_wq;
+
+static void free_pasid_states(struct device_state *dev_state);
+
+static u16 device_id(struct pci_dev *pdev)
+{
+	u16 devid;
+
+	devid = pdev->bus->number;
+	devid = (devid << 8) | pdev->devfn;
+
+	return devid;
+}
+
+static struct device_state *__get_device_state(u16 devid)
+{
+	struct device_state *dev_state;
+
+	list_for_each_entry(dev_state, &state_list, list) {
+		if (dev_state->devid == devid)
+			return dev_state;
+	}
+
+	return NULL;
+}
+
+static struct device_state *get_device_state(u16 devid)
+{
+	struct device_state *dev_state;
+	unsigned long flags;
+
+	spin_lock_irqsave(&state_lock, flags);
+	dev_state = __get_device_state(devid);
+	if (dev_state != NULL)
+		atomic_inc(&dev_state->count);
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	return dev_state;
+}
+
+static void free_device_state(struct device_state *dev_state)
+{
+	struct iommu_group *group;
+
+	/*
+	 * First detach device from domain - No more PRI requests will arrive
+	 * from that device after it is unbound from the IOMMUv2 domain.
+	 */
+	group = iommu_group_get(&dev_state->pdev->dev);
+	if (WARN_ON(!group))
+		return;
+
+	iommu_detach_group(dev_state->domain, group);
+
+	iommu_group_put(group);
+
+	/* Everything is down now, free the IOMMUv2 domain */
+	iommu_domain_free(dev_state->domain);
+
+	/* Finally get rid of the device-state */
+	kfree(dev_state);
+}
+
+static void put_device_state(struct device_state *dev_state)
+{
+	if (atomic_dec_and_test(&dev_state->count))
+		wake_up(&dev_state->wq);
+}
+
+/* Must be called under dev_state->lock */
+static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state,
+						  int pasid, bool alloc)
+{
+	struct pasid_state **root, **ptr;
+	int level, index;
+
+	level = dev_state->pasid_levels;
+	root  = dev_state->states;
+
+	while (true) {
+
+		index = (pasid >> (9 * level)) & 0x1ff;
+		ptr   = &root[index];
+
+		if (level == 0)
+			break;
+
+		if (*ptr == NULL) {
+			if (!alloc)
+				return NULL;
+
+			*ptr = (void *)get_zeroed_page(GFP_ATOMIC);
+			if (*ptr == NULL)
+				return NULL;
+		}
+
+		root   = (struct pasid_state **)*ptr;
+		level -= 1;
+	}
+
+	return ptr;
+}
+
+static int set_pasid_state(struct device_state *dev_state,
+			   struct pasid_state *pasid_state,
+			   int pasid)
+{
+	struct pasid_state **ptr;
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&dev_state->lock, flags);
+	ptr = __get_pasid_state_ptr(dev_state, pasid, true);
+
+	ret = -ENOMEM;
+	if (ptr == NULL)
+		goto out_unlock;
+
+	ret = -ENOMEM;
+	if (*ptr != NULL)
+		goto out_unlock;
+
+	*ptr = pasid_state;
+
+	ret = 0;
+
+out_unlock:
+	spin_unlock_irqrestore(&dev_state->lock, flags);
+
+	return ret;
+}
+
+static void clear_pasid_state(struct device_state *dev_state, int pasid)
+{
+	struct pasid_state **ptr;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev_state->lock, flags);
+	ptr = __get_pasid_state_ptr(dev_state, pasid, true);
+
+	if (ptr == NULL)
+		goto out_unlock;
+
+	*ptr = NULL;
+
+out_unlock:
+	spin_unlock_irqrestore(&dev_state->lock, flags);
+}
+
+static struct pasid_state *get_pasid_state(struct device_state *dev_state,
+					   int pasid)
+{
+	struct pasid_state **ptr, *ret = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev_state->lock, flags);
+	ptr = __get_pasid_state_ptr(dev_state, pasid, false);
+
+	if (ptr == NULL)
+		goto out_unlock;
+
+	ret = *ptr;
+	if (ret)
+		atomic_inc(&ret->count);
+
+out_unlock:
+	spin_unlock_irqrestore(&dev_state->lock, flags);
+
+	return ret;
+}
+
+static void free_pasid_state(struct pasid_state *pasid_state)
+{
+	kfree(pasid_state);
+}
+
+static void put_pasid_state(struct pasid_state *pasid_state)
+{
+	if (atomic_dec_and_test(&pasid_state->count))
+		wake_up(&pasid_state->wq);
+}
+
+static void put_pasid_state_wait(struct pasid_state *pasid_state)
+{
+	atomic_dec(&pasid_state->count);
+	wait_event(pasid_state->wq, !atomic_read(&pasid_state->count));
+	free_pasid_state(pasid_state);
+}
+
+static void unbind_pasid(struct pasid_state *pasid_state)
+{
+	struct iommu_domain *domain;
+
+	domain = pasid_state->device_state->domain;
+
+	/*
+	 * Mark pasid_state as invalid, no more faults will we added to the
+	 * work queue after this is visible everywhere.
+	 */
+	pasid_state->invalid = true;
+
+	/* Make sure this is visible */
+	smp_wmb();
+
+	/* After this the device/pasid can't access the mm anymore */
+	amd_iommu_domain_clear_gcr3(domain, pasid_state->pasid);
+
+	/* Make sure no more pending faults are in the queue */
+	flush_workqueue(iommu_wq);
+}
+
+static void free_pasid_states_level1(struct pasid_state **tbl)
+{
+	int i;
+
+	for (i = 0; i < 512; ++i) {
+		if (tbl[i] == NULL)
+			continue;
+
+		free_page((unsigned long)tbl[i]);
+	}
+}
+
+static void free_pasid_states_level2(struct pasid_state **tbl)
+{
+	struct pasid_state **ptr;
+	int i;
+
+	for (i = 0; i < 512; ++i) {
+		if (tbl[i] == NULL)
+			continue;
+
+		ptr = (struct pasid_state **)tbl[i];
+		free_pasid_states_level1(ptr);
+	}
+}
+
+static void free_pasid_states(struct device_state *dev_state)
+{
+	struct pasid_state *pasid_state;
+	int i;
+
+	for (i = 0; i < dev_state->max_pasids; ++i) {
+		pasid_state = get_pasid_state(dev_state, i);
+		if (pasid_state == NULL)
+			continue;
+
+		put_pasid_state(pasid_state);
+
+		/*
+		 * This will call the mn_release function and
+		 * unbind the PASID
+		 */
+		mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);
+
+		put_pasid_state_wait(pasid_state); /* Reference taken in
+						      amd_iommu_bind_pasid */
+
+		/* Drop reference taken in amd_iommu_bind_pasid */
+		put_device_state(dev_state);
+	}
+
+	if (dev_state->pasid_levels == 2)
+		free_pasid_states_level2(dev_state->states);
+	else if (dev_state->pasid_levels == 1)
+		free_pasid_states_level1(dev_state->states);
+	else
+		BUG_ON(dev_state->pasid_levels != 0);
+
+	free_page((unsigned long)dev_state->states);
+}
+
+static struct pasid_state *mn_to_state(struct mmu_notifier *mn)
+{
+	return container_of(mn, struct pasid_state, mn);
+}
+
+static void __mn_flush_page(struct mmu_notifier *mn,
+			    unsigned long address)
+{
+	struct pasid_state *pasid_state;
+	struct device_state *dev_state;
+
+	pasid_state = mn_to_state(mn);
+	dev_state   = pasid_state->device_state;
+
+	amd_iommu_flush_page(dev_state->domain, pasid_state->pasid, address);
+}
+
+static int mn_clear_flush_young(struct mmu_notifier *mn,
+				struct mm_struct *mm,
+				unsigned long start,
+				unsigned long end)
+{
+	for (; start < end; start += PAGE_SIZE)
+		__mn_flush_page(mn, start);
+
+	return 0;
+}
+
+static void mn_invalidate_range(struct mmu_notifier *mn,
+				struct mm_struct *mm,
+				unsigned long start, unsigned long end)
+{
+	struct pasid_state *pasid_state;
+	struct device_state *dev_state;
+
+	pasid_state = mn_to_state(mn);
+	dev_state   = pasid_state->device_state;
+
+	if ((start ^ (end - 1)) < PAGE_SIZE)
+		amd_iommu_flush_page(dev_state->domain, pasid_state->pasid,
+				     start);
+	else
+		amd_iommu_flush_tlb(dev_state->domain, pasid_state->pasid);
+}
+
+static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	struct pasid_state *pasid_state;
+	struct device_state *dev_state;
+	bool run_inv_ctx_cb;
+
+	might_sleep();
+
+	pasid_state    = mn_to_state(mn);
+	dev_state      = pasid_state->device_state;
+	run_inv_ctx_cb = !pasid_state->invalid;
+
+	if (run_inv_ctx_cb && dev_state->inv_ctx_cb)
+		dev_state->inv_ctx_cb(dev_state->pdev, pasid_state->pasid);
+
+	unbind_pasid(pasid_state);
+}
+
+static const struct mmu_notifier_ops iommu_mn = {
+	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
+	.release		= mn_release,
+	.clear_flush_young      = mn_clear_flush_young,
+	.invalidate_range       = mn_invalidate_range,
+};
+
+static void set_pri_tag_status(struct pasid_state *pasid_state,
+			       u16 tag, int status)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pasid_state->lock, flags);
+	pasid_state->pri[tag].status = status;
+	spin_unlock_irqrestore(&pasid_state->lock, flags);
+}
+
+static void finish_pri_tag(struct device_state *dev_state,
+			   struct pasid_state *pasid_state,
+			   u16 tag)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pasid_state->lock, flags);
+	if (atomic_dec_and_test(&pasid_state->pri[tag].inflight) &&
+	    pasid_state->pri[tag].finish) {
+		amd_iommu_complete_ppr(dev_state->pdev, pasid_state->pasid,
+				       pasid_state->pri[tag].status, tag);
+		pasid_state->pri[tag].finish = false;
+		pasid_state->pri[tag].status = PPR_SUCCESS;
+	}
+	spin_unlock_irqrestore(&pasid_state->lock, flags);
+}
+
+static void handle_fault_error(struct fault *fault)
+{
+	int status;
+
+	if (!fault->dev_state->inv_ppr_cb) {
+		set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
+		return;
+	}
+
+	status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev,
+					      fault->pasid,
+					      fault->address,
+					      fault->flags);
+	switch (status) {
+	case AMD_IOMMU_INV_PRI_RSP_SUCCESS:
+		set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS);
+		break;
+	case AMD_IOMMU_INV_PRI_RSP_INVALID:
+		set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
+		break;
+	case AMD_IOMMU_INV_PRI_RSP_FAIL:
+		set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static bool access_error(struct vm_area_struct *vma, struct fault *fault)
+{
+	unsigned long requested = 0;
+
+	if (fault->flags & PPR_FAULT_EXEC)
+		requested |= VM_EXEC;
+
+	if (fault->flags & PPR_FAULT_READ)
+		requested |= VM_READ;
+
+	if (fault->flags & PPR_FAULT_WRITE)
+		requested |= VM_WRITE;
+
+	return (requested & ~vma->vm_flags) != 0;
+}
+
+static void do_fault(struct work_struct *work)
+{
+	struct fault *fault = container_of(work, struct fault, work);
+	struct vm_area_struct *vma;
+	vm_fault_t ret = VM_FAULT_ERROR;
+	unsigned int flags = 0;
+	struct mm_struct *mm;
+	u64 address;
+
+	mm = fault->state->mm;
+	address = fault->address;
+
+	if (fault->flags & PPR_FAULT_USER)
+		flags |= FAULT_FLAG_USER;
+	if (fault->flags & PPR_FAULT_WRITE)
+		flags |= FAULT_FLAG_WRITE;
+	flags |= FAULT_FLAG_REMOTE;
+
+	down_read(&mm->mmap_sem);
+	vma = find_extend_vma(mm, address);
+	if (!vma || address < vma->vm_start)
+		/* failed to get a vma in the right range */
+		goto out;
+
+	/* Check if we have the right permissions on the vma */
+	if (access_error(vma, fault))
+		goto out;
+
+	ret = handle_mm_fault(vma, address, flags);
+out:
+	up_read(&mm->mmap_sem);
+
+	if (ret & VM_FAULT_ERROR)
+		/* failed to service fault */
+		handle_fault_error(fault);
+
+	finish_pri_tag(fault->dev_state, fault->state, fault->tag);
+
+	put_pasid_state(fault->state);
+
+	kfree(fault);
+}
+
+static int ppr_notifier(struct notifier_block *nb, unsigned long e, void *data)
+{
+	struct amd_iommu_fault *iommu_fault;
+	struct pasid_state *pasid_state;
+	struct device_state *dev_state;
+	unsigned long flags;
+	struct fault *fault;
+	bool finish;
+	u16 tag, devid;
+	int ret;
+	struct iommu_dev_data *dev_data;
+	struct pci_dev *pdev = NULL;
+
+	iommu_fault = data;
+	tag         = iommu_fault->tag & 0x1ff;
+	finish      = (iommu_fault->tag >> 9) & 1;
+
+	devid = iommu_fault->device_id;
+	pdev = pci_get_domain_bus_and_slot(0, PCI_BUS_NUM(devid),
+					   devid & 0xff);
+	if (!pdev)
+		return -ENODEV;
+	dev_data = get_dev_data(&pdev->dev);
+
+	/* In kdump kernel pci dev is not initialized yet -> send INVALID */
+	ret = NOTIFY_DONE;
+	if (translation_pre_enabled(amd_iommu_rlookup_table[devid])
+		&& dev_data->defer_attach) {
+		amd_iommu_complete_ppr(pdev, iommu_fault->pasid,
+				       PPR_INVALID, tag);
+		goto out;
+	}
+
+	dev_state = get_device_state(iommu_fault->device_id);
+	if (dev_state == NULL)
+		goto out;
+
+	pasid_state = get_pasid_state(dev_state, iommu_fault->pasid);
+	if (pasid_state == NULL || pasid_state->invalid) {
+		/* We know the device but not the PASID -> send INVALID */
+		amd_iommu_complete_ppr(dev_state->pdev, iommu_fault->pasid,
+				       PPR_INVALID, tag);
+		goto out_drop_state;
+	}
+
+	spin_lock_irqsave(&pasid_state->lock, flags);
+	atomic_inc(&pasid_state->pri[tag].inflight);
+	if (finish)
+		pasid_state->pri[tag].finish = true;
+	spin_unlock_irqrestore(&pasid_state->lock, flags);
+
+	fault = kzalloc(sizeof(*fault), GFP_ATOMIC);
+	if (fault == NULL) {
+		/* We are OOM - send success and let the device re-fault */
+		finish_pri_tag(dev_state, pasid_state, tag);
+		goto out_drop_state;
+	}
+
+	fault->dev_state = dev_state;
+	fault->address   = iommu_fault->address;
+	fault->state     = pasid_state;
+	fault->tag       = tag;
+	fault->finish    = finish;
+	fault->pasid     = iommu_fault->pasid;
+	fault->flags     = iommu_fault->flags;
+	INIT_WORK(&fault->work, do_fault);
+
+	queue_work(iommu_wq, &fault->work);
+
+	ret = NOTIFY_OK;
+
+out_drop_state:
+
+	if (ret != NOTIFY_OK && pasid_state)
+		put_pasid_state(pasid_state);
+
+	put_device_state(dev_state);
+
+out:
+	return ret;
+}
+
+static struct notifier_block ppr_nb = {
+	.notifier_call = ppr_notifier,
+};
+
+int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid,
+			 struct task_struct *task)
+{
+	struct pasid_state *pasid_state;
+	struct device_state *dev_state;
+	struct mm_struct *mm;
+	u16 devid;
+	int ret;
+
+	might_sleep();
+
+	if (!amd_iommu_v2_supported())
+		return -ENODEV;
+
+	devid     = device_id(pdev);
+	dev_state = get_device_state(devid);
+
+	if (dev_state == NULL)
+		return -EINVAL;
+
+	ret = -EINVAL;
+	if (pasid < 0 || pasid >= dev_state->max_pasids)
+		goto out;
+
+	ret = -ENOMEM;
+	pasid_state = kzalloc(sizeof(*pasid_state), GFP_KERNEL);
+	if (pasid_state == NULL)
+		goto out;
+
+
+	atomic_set(&pasid_state->count, 1);
+	init_waitqueue_head(&pasid_state->wq);
+	spin_lock_init(&pasid_state->lock);
+
+	mm                        = get_task_mm(task);
+	pasid_state->mm           = mm;
+	pasid_state->device_state = dev_state;
+	pasid_state->pasid        = pasid;
+	pasid_state->invalid      = true; /* Mark as valid only if we are
+					     done with setting up the pasid */
+	pasid_state->mn.ops       = &iommu_mn;
+
+	if (pasid_state->mm == NULL)
+		goto out_free;
+
+	mmu_notifier_register(&pasid_state->mn, mm);
+
+	ret = set_pasid_state(dev_state, pasid_state, pasid);
+	if (ret)
+		goto out_unregister;
+
+	ret = amd_iommu_domain_set_gcr3(dev_state->domain, pasid,
+					__pa(pasid_state->mm->pgd));
+	if (ret)
+		goto out_clear_state;
+
+	/* Now we are ready to handle faults */
+	pasid_state->invalid = false;
+
+	/*
+	 * Drop the reference to the mm_struct here. We rely on the
+	 * mmu_notifier release call-back to inform us when the mm
+	 * is going away.
+	 */
+	mmput(mm);
+
+	return 0;
+
+out_clear_state:
+	clear_pasid_state(dev_state, pasid);
+
+out_unregister:
+	mmu_notifier_unregister(&pasid_state->mn, mm);
+	mmput(mm);
+
+out_free:
+	free_pasid_state(pasid_state);
+
+out:
+	put_device_state(dev_state);
+
+	return ret;
+}
+EXPORT_SYMBOL(amd_iommu_bind_pasid);
+
+void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid)
+{
+	struct pasid_state *pasid_state;
+	struct device_state *dev_state;
+	u16 devid;
+
+	might_sleep();
+
+	if (!amd_iommu_v2_supported())
+		return;
+
+	devid = device_id(pdev);
+	dev_state = get_device_state(devid);
+	if (dev_state == NULL)
+		return;
+
+	if (pasid < 0 || pasid >= dev_state->max_pasids)
+		goto out;
+
+	pasid_state = get_pasid_state(dev_state, pasid);
+	if (pasid_state == NULL)
+		goto out;
+	/*
+	 * Drop reference taken here. We are safe because we still hold
+	 * the reference taken in the amd_iommu_bind_pasid function.
+	 */
+	put_pasid_state(pasid_state);
+
+	/* Clear the pasid state so that the pasid can be re-used */
+	clear_pasid_state(dev_state, pasid_state->pasid);
+
+	/*
+	 * Call mmu_notifier_unregister to drop our reference
+	 * to pasid_state->mm
+	 */
+	mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);
+
+	put_pasid_state_wait(pasid_state); /* Reference taken in
+					      amd_iommu_bind_pasid */
+out:
+	/* Drop reference taken in this function */
+	put_device_state(dev_state);
+
+	/* Drop reference taken in amd_iommu_bind_pasid */
+	put_device_state(dev_state);
+}
+EXPORT_SYMBOL(amd_iommu_unbind_pasid);
+
+int amd_iommu_init_device(struct pci_dev *pdev, int pasids)
+{
+	struct device_state *dev_state;
+	struct iommu_group *group;
+	unsigned long flags;
+	int ret, tmp;
+	u16 devid;
+
+	might_sleep();
+
+	/*
+	 * When memory encryption is active the device is likely not in a
+	 * direct-mapped domain. Forbid using IOMMUv2 functionality for now.
+	 */
+	if (mem_encrypt_active())
+		return -ENODEV;
+
+	if (!amd_iommu_v2_supported())
+		return -ENODEV;
+
+	if (pasids <= 0 || pasids > (PASID_MASK + 1))
+		return -EINVAL;
+
+	devid = device_id(pdev);
+
+	dev_state = kzalloc(sizeof(*dev_state), GFP_KERNEL);
+	if (dev_state == NULL)
+		return -ENOMEM;
+
+	spin_lock_init(&dev_state->lock);
+	init_waitqueue_head(&dev_state->wq);
+	dev_state->pdev  = pdev;
+	dev_state->devid = devid;
+
+	tmp = pasids;
+	for (dev_state->pasid_levels = 0; (tmp - 1) & ~0x1ff; tmp >>= 9)
+		dev_state->pasid_levels += 1;
+
+	atomic_set(&dev_state->count, 1);
+	dev_state->max_pasids = pasids;
+
+	ret = -ENOMEM;
+	dev_state->states = (void *)get_zeroed_page(GFP_KERNEL);
+	if (dev_state->states == NULL)
+		goto out_free_dev_state;
+
+	dev_state->domain = iommu_domain_alloc(&pci_bus_type);
+	if (dev_state->domain == NULL)
+		goto out_free_states;
+
+	amd_iommu_domain_direct_map(dev_state->domain);
+
+	ret = amd_iommu_domain_enable_v2(dev_state->domain, pasids);
+	if (ret)
+		goto out_free_domain;
+
+	group = iommu_group_get(&pdev->dev);
+	if (!group) {
+		ret = -EINVAL;
+		goto out_free_domain;
+	}
+
+	ret = iommu_attach_group(dev_state->domain, group);
+	if (ret != 0)
+		goto out_drop_group;
+
+	iommu_group_put(group);
+
+	spin_lock_irqsave(&state_lock, flags);
+
+	if (__get_device_state(devid) != NULL) {
+		spin_unlock_irqrestore(&state_lock, flags);
+		ret = -EBUSY;
+		goto out_free_domain;
+	}
+
+	list_add_tail(&dev_state->list, &state_list);
+
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	return 0;
+
+out_drop_group:
+	iommu_group_put(group);
+
+out_free_domain:
+	iommu_domain_free(dev_state->domain);
+
+out_free_states:
+	free_page((unsigned long)dev_state->states);
+
+out_free_dev_state:
+	kfree(dev_state);
+
+	return ret;
+}
+EXPORT_SYMBOL(amd_iommu_init_device);
+
+void amd_iommu_free_device(struct pci_dev *pdev)
+{
+	struct device_state *dev_state;
+	unsigned long flags;
+	u16 devid;
+
+	if (!amd_iommu_v2_supported())
+		return;
+
+	devid = device_id(pdev);
+
+	spin_lock_irqsave(&state_lock, flags);
+
+	dev_state = __get_device_state(devid);
+	if (dev_state == NULL) {
+		spin_unlock_irqrestore(&state_lock, flags);
+		return;
+	}
+
+	list_del(&dev_state->list);
+
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	/* Get rid of any remaining pasid states */
+	free_pasid_states(dev_state);
+
+	put_device_state(dev_state);
+	/*
+	 * Wait until the last reference is dropped before freeing
+	 * the device state.
+	 */
+	wait_event(dev_state->wq, !atomic_read(&dev_state->count));
+	free_device_state(dev_state);
+}
+EXPORT_SYMBOL(amd_iommu_free_device);
+
+int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev,
+				 amd_iommu_invalid_ppr_cb cb)
+{
+	struct device_state *dev_state;
+	unsigned long flags;
+	u16 devid;
+	int ret;
+
+	if (!amd_iommu_v2_supported())
+		return -ENODEV;
+
+	devid = device_id(pdev);
+
+	spin_lock_irqsave(&state_lock, flags);
+
+	ret = -EINVAL;
+	dev_state = __get_device_state(devid);
+	if (dev_state == NULL)
+		goto out_unlock;
+
+	dev_state->inv_ppr_cb = cb;
+
+	ret = 0;
+
+out_unlock:
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(amd_iommu_set_invalid_ppr_cb);
+
+int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev,
+				    amd_iommu_invalidate_ctx cb)
+{
+	struct device_state *dev_state;
+	unsigned long flags;
+	u16 devid;
+	int ret;
+
+	if (!amd_iommu_v2_supported())
+		return -ENODEV;
+
+	devid = device_id(pdev);
+
+	spin_lock_irqsave(&state_lock, flags);
+
+	ret = -EINVAL;
+	dev_state = __get_device_state(devid);
+	if (dev_state == NULL)
+		goto out_unlock;
+
+	dev_state->inv_ctx_cb = cb;
+
+	ret = 0;
+
+out_unlock:
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(amd_iommu_set_invalidate_ctx_cb);
+
+static int __init amd_iommu_v2_init(void)
+{
+	int ret;
+
+	pr_info("AMD IOMMUv2 driver by Joerg Roedel <jroedel@suse.de>\n");
+
+	if (!amd_iommu_v2_supported()) {
+		pr_info("AMD IOMMUv2 functionality not available on this system\n");
+		/*
+		 * Load anyway to provide the symbols to other modules
+		 * which may use AMD IOMMUv2 optionally.
+		 */
+		return 0;
+	}
+
+	spin_lock_init(&state_lock);
+
+	ret = -ENOMEM;
+	iommu_wq = alloc_workqueue("amd_iommu_v2", WQ_MEM_RECLAIM, 0);
+	if (iommu_wq == NULL)
+		goto out;
+
+	amd_iommu_register_ppr_notifier(&ppr_nb);
+
+	return 0;
+
+out:
+	return ret;
+}
+
+static void __exit amd_iommu_v2_exit(void)
+{
+	struct device_state *dev_state;
+	int i;
+
+	if (!amd_iommu_v2_supported())
+		return;
+
+	amd_iommu_unregister_ppr_notifier(&ppr_nb);
+
+	flush_workqueue(iommu_wq);
+
+	/*
+	 * The loop below might call flush_workqueue(), so call
+	 * destroy_workqueue() after it
+	 */
+	for (i = 0; i < MAX_DEVICES; ++i) {
+		dev_state = get_device_state(i);
+
+		if (dev_state == NULL)
+			continue;
+
+		WARN_ON_ONCE(1);
+
+		put_device_state(dev_state);
+		amd_iommu_free_device(dev_state->pdev);
+	}
+
+	destroy_workqueue(iommu_wq);
+}
+
+module_init(amd_iommu_v2_init);
+module_exit(amd_iommu_v2_exit);
diff --git a/drivers/iommu/arm-smmu-regs.h b/drivers/iommu/arm-smmu-regs.h
new file mode 100644
index 000000000..a1226e4ab
--- /dev/null
+++ b/drivers/iommu/arm-smmu-regs.h
@@ -0,0 +1,220 @@
+/*
+ * IOMMU API for ARM architected SMMU implementations.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2013 ARM Limited
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ */
+
+#ifndef _ARM_SMMU_REGS_H
+#define _ARM_SMMU_REGS_H
+
+/* Configuration registers */
+#define ARM_SMMU_GR0_sCR0		0x0
+#define sCR0_CLIENTPD			(1 << 0)
+#define sCR0_GFRE			(1 << 1)
+#define sCR0_GFIE			(1 << 2)
+#define sCR0_EXIDENABLE			(1 << 3)
+#define sCR0_GCFGFRE			(1 << 4)
+#define sCR0_GCFGFIE			(1 << 5)
+#define sCR0_USFCFG			(1 << 10)
+#define sCR0_VMIDPNE			(1 << 11)
+#define sCR0_PTM			(1 << 12)
+#define sCR0_FB				(1 << 13)
+#define sCR0_VMID16EN			(1 << 31)
+#define sCR0_BSU_SHIFT			14
+#define sCR0_BSU_MASK			0x3
+
+/* Auxiliary Configuration register */
+#define ARM_SMMU_GR0_sACR		0x10
+
+/* Identification registers */
+#define ARM_SMMU_GR0_ID0		0x20
+#define ARM_SMMU_GR0_ID1		0x24
+#define ARM_SMMU_GR0_ID2		0x28
+#define ARM_SMMU_GR0_ID3		0x2c
+#define ARM_SMMU_GR0_ID4		0x30
+#define ARM_SMMU_GR0_ID5		0x34
+#define ARM_SMMU_GR0_ID6		0x38
+#define ARM_SMMU_GR0_ID7		0x3c
+#define ARM_SMMU_GR0_sGFSR		0x48
+#define ARM_SMMU_GR0_sGFSYNR0		0x50
+#define ARM_SMMU_GR0_sGFSYNR1		0x54
+#define ARM_SMMU_GR0_sGFSYNR2		0x58
+
+#define ID0_S1TS			(1 << 30)
+#define ID0_S2TS			(1 << 29)
+#define ID0_NTS				(1 << 28)
+#define ID0_SMS				(1 << 27)
+#define ID0_ATOSNS			(1 << 26)
+#define ID0_PTFS_NO_AARCH32		(1 << 25)
+#define ID0_PTFS_NO_AARCH32S		(1 << 24)
+#define ID0_CTTW			(1 << 14)
+#define ID0_NUMIRPT_SHIFT		16
+#define ID0_NUMIRPT_MASK		0xff
+#define ID0_NUMSIDB_SHIFT		9
+#define ID0_NUMSIDB_MASK		0xf
+#define ID0_EXIDS			(1 << 8)
+#define ID0_NUMSMRG_SHIFT		0
+#define ID0_NUMSMRG_MASK		0xff
+
+#define ID1_PAGESIZE			(1 << 31)
+#define ID1_NUMPAGENDXB_SHIFT		28
+#define ID1_NUMPAGENDXB_MASK		7
+#define ID1_NUMS2CB_SHIFT		16
+#define ID1_NUMS2CB_MASK		0xff
+#define ID1_NUMCB_SHIFT			0
+#define ID1_NUMCB_MASK			0xff
+
+#define ID2_OAS_SHIFT			4
+#define ID2_OAS_MASK			0xf
+#define ID2_IAS_SHIFT			0
+#define ID2_IAS_MASK			0xf
+#define ID2_UBS_SHIFT			8
+#define ID2_UBS_MASK			0xf
+#define ID2_PTFS_4K			(1 << 12)
+#define ID2_PTFS_16K			(1 << 13)
+#define ID2_PTFS_64K			(1 << 14)
+#define ID2_VMID16			(1 << 15)
+
+#define ID7_MAJOR_SHIFT			4
+#define ID7_MAJOR_MASK			0xf
+
+/* Global TLB invalidation */
+#define ARM_SMMU_GR0_TLBIVMID		0x64
+#define ARM_SMMU_GR0_TLBIALLNSNH	0x68
+#define ARM_SMMU_GR0_TLBIALLH		0x6c
+#define ARM_SMMU_GR0_sTLBGSYNC		0x70
+#define ARM_SMMU_GR0_sTLBGSTATUS	0x74
+#define sTLBGSTATUS_GSACTIVE		(1 << 0)
+
+/* Stream mapping registers */
+#define ARM_SMMU_GR0_SMR(n)		(0x800 + ((n) << 2))
+#define SMR_VALID			(1 << 31)
+#define SMR_MASK_SHIFT			16
+#define SMR_ID_SHIFT			0
+
+#define ARM_SMMU_GR0_S2CR(n)		(0xc00 + ((n) << 2))
+#define S2CR_CBNDX_SHIFT		0
+#define S2CR_CBNDX_MASK			0xff
+#define S2CR_EXIDVALID			(1 << 10)
+#define S2CR_TYPE_SHIFT			16
+#define S2CR_TYPE_MASK			0x3
+enum arm_smmu_s2cr_type {
+	S2CR_TYPE_TRANS,
+	S2CR_TYPE_BYPASS,
+	S2CR_TYPE_FAULT,
+};
+
+#define S2CR_PRIVCFG_SHIFT		24
+#define S2CR_PRIVCFG_MASK		0x3
+enum arm_smmu_s2cr_privcfg {
+	S2CR_PRIVCFG_DEFAULT,
+	S2CR_PRIVCFG_DIPAN,
+	S2CR_PRIVCFG_UNPRIV,
+	S2CR_PRIVCFG_PRIV,
+};
+
+/* Context bank attribute registers */
+#define ARM_SMMU_GR1_CBAR(n)		(0x0 + ((n) << 2))
+#define CBAR_VMID_SHIFT			0
+#define CBAR_VMID_MASK			0xff
+#define CBAR_S1_BPSHCFG_SHIFT		8
+#define CBAR_S1_BPSHCFG_MASK		3
+#define CBAR_S1_BPSHCFG_NSH		3
+#define CBAR_S1_MEMATTR_SHIFT		12
+#define CBAR_S1_MEMATTR_MASK		0xf
+#define CBAR_S1_MEMATTR_WB		0xf
+#define CBAR_TYPE_SHIFT			16
+#define CBAR_TYPE_MASK			0x3
+#define CBAR_TYPE_S2_TRANS		(0 << CBAR_TYPE_SHIFT)
+#define CBAR_TYPE_S1_TRANS_S2_BYPASS	(1 << CBAR_TYPE_SHIFT)
+#define CBAR_TYPE_S1_TRANS_S2_FAULT	(2 << CBAR_TYPE_SHIFT)
+#define CBAR_TYPE_S1_TRANS_S2_TRANS	(3 << CBAR_TYPE_SHIFT)
+#define CBAR_IRPTNDX_SHIFT		24
+#define CBAR_IRPTNDX_MASK		0xff
+
+#define ARM_SMMU_GR1_CBA2R(n)		(0x800 + ((n) << 2))
+#define CBA2R_RW64_32BIT		(0 << 0)
+#define CBA2R_RW64_64BIT		(1 << 0)
+#define CBA2R_VMID_SHIFT		16
+#define CBA2R_VMID_MASK			0xffff
+
+#define ARM_SMMU_CB_SCTLR		0x0
+#define ARM_SMMU_CB_ACTLR		0x4
+#define ARM_SMMU_CB_RESUME		0x8
+#define ARM_SMMU_CB_TTBCR2		0x10
+#define ARM_SMMU_CB_TTBR0		0x20
+#define ARM_SMMU_CB_TTBR1		0x28
+#define ARM_SMMU_CB_TTBCR		0x30
+#define ARM_SMMU_CB_CONTEXTIDR		0x34
+#define ARM_SMMU_CB_S1_MAIR0		0x38
+#define ARM_SMMU_CB_S1_MAIR1		0x3c
+#define ARM_SMMU_CB_PAR			0x50
+#define ARM_SMMU_CB_FSR			0x58
+#define ARM_SMMU_CB_FAR			0x60
+#define ARM_SMMU_CB_FSYNR0		0x68
+#define ARM_SMMU_CB_S1_TLBIVA		0x600
+#define ARM_SMMU_CB_S1_TLBIASID		0x610
+#define ARM_SMMU_CB_S1_TLBIVAL		0x620
+#define ARM_SMMU_CB_S2_TLBIIPAS2	0x630
+#define ARM_SMMU_CB_S2_TLBIIPAS2L	0x638
+#define ARM_SMMU_CB_TLBSYNC		0x7f0
+#define ARM_SMMU_CB_TLBSTATUS		0x7f4
+#define ARM_SMMU_CB_ATS1PR		0x800
+#define ARM_SMMU_CB_ATSR		0x8f0
+
+#define SCTLR_S1_ASIDPNE		(1 << 12)
+#define SCTLR_CFCFG			(1 << 7)
+#define SCTLR_CFIE			(1 << 6)
+#define SCTLR_CFRE			(1 << 5)
+#define SCTLR_E				(1 << 4)
+#define SCTLR_AFE			(1 << 2)
+#define SCTLR_TRE			(1 << 1)
+#define SCTLR_M				(1 << 0)
+
+#define CB_PAR_F			(1 << 0)
+
+#define ATSR_ACTIVE			(1 << 0)
+
+#define RESUME_RETRY			(0 << 0)
+#define RESUME_TERMINATE		(1 << 0)
+
+#define TTBCR2_SEP_SHIFT		15
+#define TTBCR2_SEP_UPSTREAM		(0x7 << TTBCR2_SEP_SHIFT)
+#define TTBCR2_AS			(1 << 4)
+
+#define TTBRn_ASID_SHIFT		48
+
+#define FSR_MULTI			(1 << 31)
+#define FSR_SS				(1 << 30)
+#define FSR_UUT				(1 << 8)
+#define FSR_ASF				(1 << 7)
+#define FSR_TLBLKF			(1 << 6)
+#define FSR_TLBMCF			(1 << 5)
+#define FSR_EF				(1 << 4)
+#define FSR_PF				(1 << 3)
+#define FSR_AFF				(1 << 2)
+#define FSR_TF				(1 << 1)
+
+#define FSR_IGN				(FSR_AFF | FSR_ASF | \
+					 FSR_TLBMCF | FSR_TLBLKF)
+#define FSR_FAULT			(FSR_MULTI | FSR_SS | FSR_UUT | \
+					 FSR_EF | FSR_PF | FSR_TF | FSR_IGN)
+
+#define FSYNR0_WNR			(1 << 4)
+
+#endif /* _ARM_SMMU_REGS_H */
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
new file mode 100644
index 000000000..9f16f47e7
--- /dev/null
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -0,0 +1,2945 @@
+/*
+ * IOMMU API for ARM architected SMMUv3 implementations.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright (C) 2015 ARM Limited
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ *
+ * This driver is powered by bad coffee and bombay mix.
+ */
+
+#include <linux/acpi.h>
+#include <linux/acpi_iort.h>
+#include <linux/bitfield.h>
+#include <linux/bitops.h>
+#include <linux/crash_dump.h>
+#include <linux/delay.h>
+#include <linux/dma-iommu.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/msi.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_iommu.h>
+#include <linux/of_platform.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+
+#include <linux/amba/bus.h>
+
+#include "io-pgtable.h"
+
+/* MMIO registers */
+#define ARM_SMMU_IDR0			0x0
+#define IDR0_ST_LVL			GENMASK(28, 27)
+#define IDR0_ST_LVL_2LVL		1
+#define IDR0_STALL_MODEL		GENMASK(25, 24)
+#define IDR0_STALL_MODEL_STALL		0
+#define IDR0_STALL_MODEL_FORCE		2
+#define IDR0_TTENDIAN			GENMASK(22, 21)
+#define IDR0_TTENDIAN_MIXED		0
+#define IDR0_TTENDIAN_LE		2
+#define IDR0_TTENDIAN_BE		3
+#define IDR0_CD2L			(1 << 19)
+#define IDR0_VMID16			(1 << 18)
+#define IDR0_PRI			(1 << 16)
+#define IDR0_SEV			(1 << 14)
+#define IDR0_MSI			(1 << 13)
+#define IDR0_ASID16			(1 << 12)
+#define IDR0_ATS			(1 << 10)
+#define IDR0_HYP			(1 << 9)
+#define IDR0_COHACC			(1 << 4)
+#define IDR0_TTF			GENMASK(3, 2)
+#define IDR0_TTF_AARCH64		2
+#define IDR0_TTF_AARCH32_64		3
+#define IDR0_S1P			(1 << 1)
+#define IDR0_S2P			(1 << 0)
+
+#define ARM_SMMU_IDR1			0x4
+#define IDR1_TABLES_PRESET		(1 << 30)
+#define IDR1_QUEUES_PRESET		(1 << 29)
+#define IDR1_REL			(1 << 28)
+#define IDR1_CMDQS			GENMASK(25, 21)
+#define IDR1_EVTQS			GENMASK(20, 16)
+#define IDR1_PRIQS			GENMASK(15, 11)
+#define IDR1_SSIDSIZE			GENMASK(10, 6)
+#define IDR1_SIDSIZE			GENMASK(5, 0)
+
+#define ARM_SMMU_IDR5			0x14
+#define IDR5_STALL_MAX			GENMASK(31, 16)
+#define IDR5_GRAN64K			(1 << 6)
+#define IDR5_GRAN16K			(1 << 5)
+#define IDR5_GRAN4K			(1 << 4)
+#define IDR5_OAS			GENMASK(2, 0)
+#define IDR5_OAS_32_BIT			0
+#define IDR5_OAS_36_BIT			1
+#define IDR5_OAS_40_BIT			2
+#define IDR5_OAS_42_BIT			3
+#define IDR5_OAS_44_BIT			4
+#define IDR5_OAS_48_BIT			5
+#define IDR5_OAS_52_BIT			6
+#define IDR5_VAX			GENMASK(11, 10)
+#define IDR5_VAX_52_BIT			1
+
+#define ARM_SMMU_CR0			0x20
+#define CR0_CMDQEN			(1 << 3)
+#define CR0_EVTQEN			(1 << 2)
+#define CR0_PRIQEN			(1 << 1)
+#define CR0_SMMUEN			(1 << 0)
+
+#define ARM_SMMU_CR0ACK			0x24
+
+#define ARM_SMMU_CR1			0x28
+#define CR1_TABLE_SH			GENMASK(11, 10)
+#define CR1_TABLE_OC			GENMASK(9, 8)
+#define CR1_TABLE_IC			GENMASK(7, 6)
+#define CR1_QUEUE_SH			GENMASK(5, 4)
+#define CR1_QUEUE_OC			GENMASK(3, 2)
+#define CR1_QUEUE_IC			GENMASK(1, 0)
+/* CR1 cacheability fields don't quite follow the usual TCR-style encoding */
+#define CR1_CACHE_NC			0
+#define CR1_CACHE_WB			1
+#define CR1_CACHE_WT			2
+
+#define ARM_SMMU_CR2			0x2c
+#define CR2_PTM				(1 << 2)
+#define CR2_RECINVSID			(1 << 1)
+#define CR2_E2H				(1 << 0)
+
+#define ARM_SMMU_GBPA			0x44
+#define GBPA_UPDATE			(1 << 31)
+#define GBPA_ABORT			(1 << 20)
+
+#define ARM_SMMU_IRQ_CTRL		0x50
+#define IRQ_CTRL_EVTQ_IRQEN		(1 << 2)
+#define IRQ_CTRL_PRIQ_IRQEN		(1 << 1)
+#define IRQ_CTRL_GERROR_IRQEN		(1 << 0)
+
+#define ARM_SMMU_IRQ_CTRLACK		0x54
+
+#define ARM_SMMU_GERROR			0x60
+#define GERROR_SFM_ERR			(1 << 8)
+#define GERROR_MSI_GERROR_ABT_ERR	(1 << 7)
+#define GERROR_MSI_PRIQ_ABT_ERR		(1 << 6)
+#define GERROR_MSI_EVTQ_ABT_ERR		(1 << 5)
+#define GERROR_MSI_CMDQ_ABT_ERR		(1 << 4)
+#define GERROR_PRIQ_ABT_ERR		(1 << 3)
+#define GERROR_EVTQ_ABT_ERR		(1 << 2)
+#define GERROR_CMDQ_ERR			(1 << 0)
+#define GERROR_ERR_MASK			0xfd
+
+#define ARM_SMMU_GERRORN		0x64
+
+#define ARM_SMMU_GERROR_IRQ_CFG0	0x68
+#define ARM_SMMU_GERROR_IRQ_CFG1	0x70
+#define ARM_SMMU_GERROR_IRQ_CFG2	0x74
+
+#define ARM_SMMU_STRTAB_BASE		0x80
+#define STRTAB_BASE_RA			(1UL << 62)
+#define STRTAB_BASE_ADDR_MASK		GENMASK_ULL(51, 6)
+
+#define ARM_SMMU_STRTAB_BASE_CFG	0x88
+#define STRTAB_BASE_CFG_FMT		GENMASK(17, 16)
+#define STRTAB_BASE_CFG_FMT_LINEAR	0
+#define STRTAB_BASE_CFG_FMT_2LVL	1
+#define STRTAB_BASE_CFG_SPLIT		GENMASK(10, 6)
+#define STRTAB_BASE_CFG_LOG2SIZE	GENMASK(5, 0)
+
+#define ARM_SMMU_CMDQ_BASE		0x90
+#define ARM_SMMU_CMDQ_PROD		0x98
+#define ARM_SMMU_CMDQ_CONS		0x9c
+
+#define ARM_SMMU_EVTQ_BASE		0xa0
+#define ARM_SMMU_EVTQ_PROD		0x100a8
+#define ARM_SMMU_EVTQ_CONS		0x100ac
+#define ARM_SMMU_EVTQ_IRQ_CFG0		0xb0
+#define ARM_SMMU_EVTQ_IRQ_CFG1		0xb8
+#define ARM_SMMU_EVTQ_IRQ_CFG2		0xbc
+
+#define ARM_SMMU_PRIQ_BASE		0xc0
+#define ARM_SMMU_PRIQ_PROD		0x100c8
+#define ARM_SMMU_PRIQ_CONS		0x100cc
+#define ARM_SMMU_PRIQ_IRQ_CFG0		0xd0
+#define ARM_SMMU_PRIQ_IRQ_CFG1		0xd8
+#define ARM_SMMU_PRIQ_IRQ_CFG2		0xdc
+
+/* Common MSI config fields */
+#define MSI_CFG0_ADDR_MASK		GENMASK_ULL(51, 2)
+#define MSI_CFG2_SH			GENMASK(5, 4)
+#define MSI_CFG2_MEMATTR		GENMASK(3, 0)
+
+/* Common memory attribute values */
+#define ARM_SMMU_SH_NSH			0
+#define ARM_SMMU_SH_OSH			2
+#define ARM_SMMU_SH_ISH			3
+#define ARM_SMMU_MEMATTR_DEVICE_nGnRE	0x1
+#define ARM_SMMU_MEMATTR_OIWB		0xf
+
+#define Q_IDX(q, p)			((p) & ((1 << (q)->max_n_shift) - 1))
+#define Q_WRP(q, p)			((p) & (1 << (q)->max_n_shift))
+#define Q_OVERFLOW_FLAG			(1 << 31)
+#define Q_OVF(q, p)			((p) & Q_OVERFLOW_FLAG)
+#define Q_ENT(q, p)			((q)->base +			\
+					 Q_IDX(q, p) * (q)->ent_dwords)
+
+#define Q_BASE_RWA			(1UL << 62)
+#define Q_BASE_ADDR_MASK		GENMASK_ULL(51, 5)
+#define Q_BASE_LOG2SIZE			GENMASK(4, 0)
+
+/*
+ * Stream table.
+ *
+ * Linear: Enough to cover 1 << IDR1.SIDSIZE entries
+ * 2lvl: 128k L1 entries,
+ *       256 lazy entries per table (each table covers a PCI bus)
+ */
+#define STRTAB_L1_SZ_SHIFT		20
+#define STRTAB_SPLIT			8
+
+#define STRTAB_L1_DESC_DWORDS		1
+#define STRTAB_L1_DESC_SPAN		GENMASK_ULL(4, 0)
+#define STRTAB_L1_DESC_L2PTR_MASK	GENMASK_ULL(51, 6)
+
+#define STRTAB_STE_DWORDS		8
+#define STRTAB_STE_0_V			(1UL << 0)
+#define STRTAB_STE_0_CFG		GENMASK_ULL(3, 1)
+#define STRTAB_STE_0_CFG_ABORT		0
+#define STRTAB_STE_0_CFG_BYPASS		4
+#define STRTAB_STE_0_CFG_S1_TRANS	5
+#define STRTAB_STE_0_CFG_S2_TRANS	6
+
+#define STRTAB_STE_0_S1FMT		GENMASK_ULL(5, 4)
+#define STRTAB_STE_0_S1FMT_LINEAR	0
+#define STRTAB_STE_0_S1CTXPTR_MASK	GENMASK_ULL(51, 6)
+#define STRTAB_STE_0_S1CDMAX		GENMASK_ULL(63, 59)
+
+#define STRTAB_STE_1_S1C_CACHE_NC	0UL
+#define STRTAB_STE_1_S1C_CACHE_WBRA	1UL
+#define STRTAB_STE_1_S1C_CACHE_WT	2UL
+#define STRTAB_STE_1_S1C_CACHE_WB	3UL
+#define STRTAB_STE_1_S1CIR		GENMASK_ULL(3, 2)
+#define STRTAB_STE_1_S1COR		GENMASK_ULL(5, 4)
+#define STRTAB_STE_1_S1CSH		GENMASK_ULL(7, 6)
+
+#define STRTAB_STE_1_S1STALLD		(1UL << 27)
+
+#define STRTAB_STE_1_EATS		GENMASK_ULL(29, 28)
+#define STRTAB_STE_1_EATS_ABT		0UL
+#define STRTAB_STE_1_EATS_TRANS		1UL
+#define STRTAB_STE_1_EATS_S1CHK		2UL
+
+#define STRTAB_STE_1_STRW		GENMASK_ULL(31, 30)
+#define STRTAB_STE_1_STRW_NSEL1		0UL
+#define STRTAB_STE_1_STRW_EL2		2UL
+
+#define STRTAB_STE_1_SHCFG		GENMASK_ULL(45, 44)
+#define STRTAB_STE_1_SHCFG_INCOMING	1UL
+
+#define STRTAB_STE_2_S2VMID		GENMASK_ULL(15, 0)
+#define STRTAB_STE_2_VTCR		GENMASK_ULL(50, 32)
+#define STRTAB_STE_2_S2AA64		(1UL << 51)
+#define STRTAB_STE_2_S2ENDI		(1UL << 52)
+#define STRTAB_STE_2_S2PTW		(1UL << 54)
+#define STRTAB_STE_2_S2R		(1UL << 58)
+
+#define STRTAB_STE_3_S2TTB_MASK		GENMASK_ULL(51, 4)
+
+/* Context descriptor (stage-1 only) */
+#define CTXDESC_CD_DWORDS		8
+#define CTXDESC_CD_0_TCR_T0SZ		GENMASK_ULL(5, 0)
+#define ARM64_TCR_T0SZ			GENMASK_ULL(5, 0)
+#define CTXDESC_CD_0_TCR_TG0		GENMASK_ULL(7, 6)
+#define ARM64_TCR_TG0			GENMASK_ULL(15, 14)
+#define CTXDESC_CD_0_TCR_IRGN0		GENMASK_ULL(9, 8)
+#define ARM64_TCR_IRGN0			GENMASK_ULL(9, 8)
+#define CTXDESC_CD_0_TCR_ORGN0		GENMASK_ULL(11, 10)
+#define ARM64_TCR_ORGN0			GENMASK_ULL(11, 10)
+#define CTXDESC_CD_0_TCR_SH0		GENMASK_ULL(13, 12)
+#define ARM64_TCR_SH0			GENMASK_ULL(13, 12)
+#define CTXDESC_CD_0_TCR_EPD0		(1ULL << 14)
+#define ARM64_TCR_EPD0			(1ULL << 7)
+#define CTXDESC_CD_0_TCR_EPD1		(1ULL << 30)
+#define ARM64_TCR_EPD1			(1ULL << 23)
+
+#define CTXDESC_CD_0_ENDI		(1UL << 15)
+#define CTXDESC_CD_0_V			(1UL << 31)
+
+#define CTXDESC_CD_0_TCR_IPS		GENMASK_ULL(34, 32)
+#define ARM64_TCR_IPS			GENMASK_ULL(34, 32)
+#define CTXDESC_CD_0_TCR_TBI0		(1ULL << 38)
+#define ARM64_TCR_TBI0			(1ULL << 37)
+
+#define CTXDESC_CD_0_AA64		(1UL << 41)
+#define CTXDESC_CD_0_S			(1UL << 44)
+#define CTXDESC_CD_0_R			(1UL << 45)
+#define CTXDESC_CD_0_A			(1UL << 46)
+#define CTXDESC_CD_0_ASET		(1UL << 47)
+#define CTXDESC_CD_0_ASID		GENMASK_ULL(63, 48)
+
+#define CTXDESC_CD_1_TTB0_MASK		GENMASK_ULL(51, 4)
+
+/* Convert between AArch64 (CPU) TCR format and SMMU CD format */
+#define ARM_SMMU_TCR2CD(tcr, fld)	FIELD_PREP(CTXDESC_CD_0_TCR_##fld, \
+					FIELD_GET(ARM64_TCR_##fld, tcr))
+
+/* Command queue */
+#define CMDQ_ENT_DWORDS			2
+#define CMDQ_MAX_SZ_SHIFT		8
+
+#define CMDQ_CONS_ERR			GENMASK(30, 24)
+#define CMDQ_ERR_CERROR_NONE_IDX	0
+#define CMDQ_ERR_CERROR_ILL_IDX		1
+#define CMDQ_ERR_CERROR_ABT_IDX		2
+
+#define CMDQ_0_OP			GENMASK_ULL(7, 0)
+#define CMDQ_0_SSV			(1UL << 11)
+
+#define CMDQ_PREFETCH_0_SID		GENMASK_ULL(63, 32)
+#define CMDQ_PREFETCH_1_SIZE		GENMASK_ULL(4, 0)
+#define CMDQ_PREFETCH_1_ADDR_MASK	GENMASK_ULL(63, 12)
+
+#define CMDQ_CFGI_0_SID			GENMASK_ULL(63, 32)
+#define CMDQ_CFGI_1_LEAF		(1UL << 0)
+#define CMDQ_CFGI_1_RANGE		GENMASK_ULL(4, 0)
+
+#define CMDQ_TLBI_0_VMID		GENMASK_ULL(47, 32)
+#define CMDQ_TLBI_0_ASID		GENMASK_ULL(63, 48)
+#define CMDQ_TLBI_1_LEAF		(1UL << 0)
+#define CMDQ_TLBI_1_VA_MASK		GENMASK_ULL(63, 12)
+#define CMDQ_TLBI_1_IPA_MASK		GENMASK_ULL(51, 12)
+
+#define CMDQ_PRI_0_SSID			GENMASK_ULL(31, 12)
+#define CMDQ_PRI_0_SID			GENMASK_ULL(63, 32)
+#define CMDQ_PRI_1_GRPID		GENMASK_ULL(8, 0)
+#define CMDQ_PRI_1_RESP			GENMASK_ULL(13, 12)
+
+#define CMDQ_SYNC_0_CS			GENMASK_ULL(13, 12)
+#define CMDQ_SYNC_0_CS_NONE		0
+#define CMDQ_SYNC_0_CS_IRQ		1
+#define CMDQ_SYNC_0_CS_SEV		2
+#define CMDQ_SYNC_0_MSH			GENMASK_ULL(23, 22)
+#define CMDQ_SYNC_0_MSIATTR		GENMASK_ULL(27, 24)
+#define CMDQ_SYNC_0_MSIDATA		GENMASK_ULL(63, 32)
+#define CMDQ_SYNC_1_MSIADDR_MASK	GENMASK_ULL(51, 2)
+
+/* Event queue */
+#define EVTQ_ENT_DWORDS			4
+#define EVTQ_MAX_SZ_SHIFT		7
+
+#define EVTQ_0_ID			GENMASK_ULL(7, 0)
+
+/* PRI queue */
+#define PRIQ_ENT_DWORDS			2
+#define PRIQ_MAX_SZ_SHIFT		8
+
+#define PRIQ_0_SID			GENMASK_ULL(31, 0)
+#define PRIQ_0_SSID			GENMASK_ULL(51, 32)
+#define PRIQ_0_PERM_PRIV		(1UL << 58)
+#define PRIQ_0_PERM_EXEC		(1UL << 59)
+#define PRIQ_0_PERM_READ		(1UL << 60)
+#define PRIQ_0_PERM_WRITE		(1UL << 61)
+#define PRIQ_0_PRG_LAST			(1UL << 62)
+#define PRIQ_0_SSID_V			(1UL << 63)
+
+#define PRIQ_1_PRG_IDX			GENMASK_ULL(8, 0)
+#define PRIQ_1_ADDR_MASK		GENMASK_ULL(63, 12)
+
+/* High-level queue structures */
+#define ARM_SMMU_POLL_TIMEOUT_US	100
+#define ARM_SMMU_CMDQ_SYNC_TIMEOUT_US	1000000 /* 1s! */
+#define ARM_SMMU_CMDQ_SYNC_SPIN_COUNT	10
+
+#define MSI_IOVA_BASE			0x8000000
+#define MSI_IOVA_LENGTH			0x100000
+
+static bool disable_bypass = 1;
+module_param_named(disable_bypass, disable_bypass, bool, S_IRUGO);
+MODULE_PARM_DESC(disable_bypass,
+	"Disable bypass streams such that incoming transactions from devices that are not attached to an iommu domain will report an abort back to the device and will not be allowed to pass through the SMMU.");
+
+enum pri_resp {
+	PRI_RESP_DENY = 0,
+	PRI_RESP_FAIL = 1,
+	PRI_RESP_SUCC = 2,
+};
+
+enum arm_smmu_msi_index {
+	EVTQ_MSI_INDEX,
+	GERROR_MSI_INDEX,
+	PRIQ_MSI_INDEX,
+	ARM_SMMU_MAX_MSIS,
+};
+
+static phys_addr_t arm_smmu_msi_cfg[ARM_SMMU_MAX_MSIS][3] = {
+	[EVTQ_MSI_INDEX] = {
+		ARM_SMMU_EVTQ_IRQ_CFG0,
+		ARM_SMMU_EVTQ_IRQ_CFG1,
+		ARM_SMMU_EVTQ_IRQ_CFG2,
+	},
+	[GERROR_MSI_INDEX] = {
+		ARM_SMMU_GERROR_IRQ_CFG0,
+		ARM_SMMU_GERROR_IRQ_CFG1,
+		ARM_SMMU_GERROR_IRQ_CFG2,
+	},
+	[PRIQ_MSI_INDEX] = {
+		ARM_SMMU_PRIQ_IRQ_CFG0,
+		ARM_SMMU_PRIQ_IRQ_CFG1,
+		ARM_SMMU_PRIQ_IRQ_CFG2,
+	},
+};
+
+struct arm_smmu_cmdq_ent {
+	/* Common fields */
+	u8				opcode;
+	bool				substream_valid;
+
+	/* Command-specific fields */
+	union {
+		#define CMDQ_OP_PREFETCH_CFG	0x1
+		struct {
+			u32			sid;
+			u8			size;
+			u64			addr;
+		} prefetch;
+
+		#define CMDQ_OP_CFGI_STE	0x3
+		#define CMDQ_OP_CFGI_ALL	0x4
+		struct {
+			u32			sid;
+			union {
+				bool		leaf;
+				u8		span;
+			};
+		} cfgi;
+
+		#define CMDQ_OP_TLBI_NH_ASID	0x11
+		#define CMDQ_OP_TLBI_NH_VA	0x12
+		#define CMDQ_OP_TLBI_EL2_ALL	0x20
+		#define CMDQ_OP_TLBI_S12_VMALL	0x28
+		#define CMDQ_OP_TLBI_S2_IPA	0x2a
+		#define CMDQ_OP_TLBI_NSNH_ALL	0x30
+		struct {
+			u16			asid;
+			u16			vmid;
+			bool			leaf;
+			u64			addr;
+		} tlbi;
+
+		#define CMDQ_OP_PRI_RESP	0x41
+		struct {
+			u32			sid;
+			u32			ssid;
+			u16			grpid;
+			enum pri_resp		resp;
+		} pri;
+
+		#define CMDQ_OP_CMD_SYNC	0x46
+		struct {
+			u32			msidata;
+			u64			msiaddr;
+		} sync;
+	};
+};
+
+struct arm_smmu_queue {
+	int				irq; /* Wired interrupt */
+
+	__le64				*base;
+	dma_addr_t			base_dma;
+	u64				q_base;
+
+	size_t				ent_dwords;
+	u32				max_n_shift;
+	u32				prod;
+	u32				cons;
+
+	u32 __iomem			*prod_reg;
+	u32 __iomem			*cons_reg;
+};
+
+struct arm_smmu_cmdq {
+	struct arm_smmu_queue		q;
+	spinlock_t			lock;
+};
+
+struct arm_smmu_evtq {
+	struct arm_smmu_queue		q;
+	u32				max_stalls;
+};
+
+struct arm_smmu_priq {
+	struct arm_smmu_queue		q;
+};
+
+/* High-level stream table and context descriptor structures */
+struct arm_smmu_strtab_l1_desc {
+	u8				span;
+
+	__le64				*l2ptr;
+	dma_addr_t			l2ptr_dma;
+};
+
+struct arm_smmu_s1_cfg {
+	__le64				*cdptr;
+	dma_addr_t			cdptr_dma;
+
+	struct arm_smmu_ctx_desc {
+		u16	asid;
+		u64	ttbr;
+		u64	tcr;
+		u64	mair;
+	}				cd;
+};
+
+struct arm_smmu_s2_cfg {
+	u16				vmid;
+	u64				vttbr;
+	u64				vtcr;
+};
+
+struct arm_smmu_strtab_ent {
+	/*
+	 * An STE is "assigned" if the master emitting the corresponding SID
+	 * is attached to a domain. The behaviour of an unassigned STE is
+	 * determined by the disable_bypass parameter, whereas an assigned
+	 * STE behaves according to s1_cfg/s2_cfg, which themselves are
+	 * configured according to the domain type.
+	 */
+	bool				assigned;
+	struct arm_smmu_s1_cfg		*s1_cfg;
+	struct arm_smmu_s2_cfg		*s2_cfg;
+};
+
+struct arm_smmu_strtab_cfg {
+	__le64				*strtab;
+	dma_addr_t			strtab_dma;
+	struct arm_smmu_strtab_l1_desc	*l1_desc;
+	unsigned int			num_l1_ents;
+
+	u64				strtab_base;
+	u32				strtab_base_cfg;
+};
+
+/* An SMMUv3 instance */
+struct arm_smmu_device {
+	struct device			*dev;
+	void __iomem			*base;
+
+#define ARM_SMMU_FEAT_2_LVL_STRTAB	(1 << 0)
+#define ARM_SMMU_FEAT_2_LVL_CDTAB	(1 << 1)
+#define ARM_SMMU_FEAT_TT_LE		(1 << 2)
+#define ARM_SMMU_FEAT_TT_BE		(1 << 3)
+#define ARM_SMMU_FEAT_PRI		(1 << 4)
+#define ARM_SMMU_FEAT_ATS		(1 << 5)
+#define ARM_SMMU_FEAT_SEV		(1 << 6)
+#define ARM_SMMU_FEAT_MSI		(1 << 7)
+#define ARM_SMMU_FEAT_COHERENCY		(1 << 8)
+#define ARM_SMMU_FEAT_TRANS_S1		(1 << 9)
+#define ARM_SMMU_FEAT_TRANS_S2		(1 << 10)
+#define ARM_SMMU_FEAT_STALLS		(1 << 11)
+#define ARM_SMMU_FEAT_HYP		(1 << 12)
+#define ARM_SMMU_FEAT_STALL_FORCE	(1 << 13)
+#define ARM_SMMU_FEAT_VAX		(1 << 14)
+	u32				features;
+
+#define ARM_SMMU_OPT_SKIP_PREFETCH	(1 << 0)
+#define ARM_SMMU_OPT_PAGE0_REGS_ONLY	(1 << 1)
+	u32				options;
+
+	struct arm_smmu_cmdq		cmdq;
+	struct arm_smmu_evtq		evtq;
+	struct arm_smmu_priq		priq;
+
+	int				gerr_irq;
+	int				combined_irq;
+	u32				sync_nr;
+
+	unsigned long			ias; /* IPA */
+	unsigned long			oas; /* PA */
+	unsigned long			pgsize_bitmap;
+
+#define ARM_SMMU_MAX_ASIDS		(1 << 16)
+	unsigned int			asid_bits;
+	DECLARE_BITMAP(asid_map, ARM_SMMU_MAX_ASIDS);
+
+#define ARM_SMMU_MAX_VMIDS		(1 << 16)
+	unsigned int			vmid_bits;
+	DECLARE_BITMAP(vmid_map, ARM_SMMU_MAX_VMIDS);
+
+	unsigned int			ssid_bits;
+	unsigned int			sid_bits;
+
+	struct arm_smmu_strtab_cfg	strtab_cfg;
+
+	/* Hi16xx adds an extra 32 bits of goodness to its MSI payload */
+	union {
+		u32			sync_count;
+		u64			padding;
+	};
+
+	/* IOMMU core code handle */
+	struct iommu_device		iommu;
+};
+
+/* SMMU private data for each master */
+struct arm_smmu_master_data {
+	struct arm_smmu_device		*smmu;
+	struct arm_smmu_strtab_ent	ste;
+};
+
+/* SMMU private data for an IOMMU domain */
+enum arm_smmu_domain_stage {
+	ARM_SMMU_DOMAIN_S1 = 0,
+	ARM_SMMU_DOMAIN_S2,
+	ARM_SMMU_DOMAIN_NESTED,
+	ARM_SMMU_DOMAIN_BYPASS,
+};
+
+struct arm_smmu_domain {
+	struct arm_smmu_device		*smmu;
+	struct mutex			init_mutex; /* Protects smmu pointer */
+
+	struct io_pgtable_ops		*pgtbl_ops;
+
+	enum arm_smmu_domain_stage	stage;
+	union {
+		struct arm_smmu_s1_cfg	s1_cfg;
+		struct arm_smmu_s2_cfg	s2_cfg;
+	};
+
+	struct iommu_domain		domain;
+};
+
+struct arm_smmu_option_prop {
+	u32 opt;
+	const char *prop;
+};
+
+static struct arm_smmu_option_prop arm_smmu_options[] = {
+	{ ARM_SMMU_OPT_SKIP_PREFETCH, "hisilicon,broken-prefetch-cmd" },
+	{ ARM_SMMU_OPT_PAGE0_REGS_ONLY, "cavium,cn9900-broken-page1-regspace"},
+	{ 0, NULL},
+};
+
+static inline void __iomem *arm_smmu_page1_fixup(unsigned long offset,
+						 struct arm_smmu_device *smmu)
+{
+	if ((offset > SZ_64K) &&
+	    (smmu->options & ARM_SMMU_OPT_PAGE0_REGS_ONLY))
+		offset -= SZ_64K;
+
+	return smmu->base + offset;
+}
+
+static struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct arm_smmu_domain, domain);
+}
+
+static void parse_driver_options(struct arm_smmu_device *smmu)
+{
+	int i = 0;
+
+	do {
+		if (of_property_read_bool(smmu->dev->of_node,
+						arm_smmu_options[i].prop)) {
+			smmu->options |= arm_smmu_options[i].opt;
+			dev_notice(smmu->dev, "option %s\n",
+				arm_smmu_options[i].prop);
+		}
+	} while (arm_smmu_options[++i].opt);
+}
+
+/* Low-level queue manipulation functions */
+static bool queue_full(struct arm_smmu_queue *q)
+{
+	return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&
+	       Q_WRP(q, q->prod) != Q_WRP(q, q->cons);
+}
+
+static bool queue_empty(struct arm_smmu_queue *q)
+{
+	return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&
+	       Q_WRP(q, q->prod) == Q_WRP(q, q->cons);
+}
+
+static void queue_sync_cons(struct arm_smmu_queue *q)
+{
+	q->cons = readl_relaxed(q->cons_reg);
+}
+
+static void queue_inc_cons(struct arm_smmu_queue *q)
+{
+	u32 cons = (Q_WRP(q, q->cons) | Q_IDX(q, q->cons)) + 1;
+
+	q->cons = Q_OVF(q, q->cons) | Q_WRP(q, cons) | Q_IDX(q, cons);
+
+	/*
+	 * Ensure that all CPU accesses (reads and writes) to the queue
+	 * are complete before we update the cons pointer.
+	 */
+	mb();
+	writel_relaxed(q->cons, q->cons_reg);
+}
+
+static int queue_sync_prod(struct arm_smmu_queue *q)
+{
+	int ret = 0;
+	u32 prod = readl_relaxed(q->prod_reg);
+
+	if (Q_OVF(q, prod) != Q_OVF(q, q->prod))
+		ret = -EOVERFLOW;
+
+	q->prod = prod;
+	return ret;
+}
+
+static void queue_inc_prod(struct arm_smmu_queue *q)
+{
+	u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1;
+
+	q->prod = Q_OVF(q, q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
+	writel(q->prod, q->prod_reg);
+}
+
+/*
+ * Wait for the SMMU to consume items. If drain is true, wait until the queue
+ * is empty. Otherwise, wait until there is at least one free slot.
+ */
+static int queue_poll_cons(struct arm_smmu_queue *q, bool sync, bool wfe)
+{
+	ktime_t timeout;
+	unsigned int delay = 1, spin_cnt = 0;
+
+	/* Wait longer if it's a CMD_SYNC */
+	timeout = ktime_add_us(ktime_get(), sync ?
+					    ARM_SMMU_CMDQ_SYNC_TIMEOUT_US :
+					    ARM_SMMU_POLL_TIMEOUT_US);
+
+	while (queue_sync_cons(q), (sync ? !queue_empty(q) : queue_full(q))) {
+		if (ktime_compare(ktime_get(), timeout) > 0)
+			return -ETIMEDOUT;
+
+		if (wfe) {
+			wfe();
+		} else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) {
+			cpu_relax();
+			continue;
+		} else {
+			udelay(delay);
+			delay *= 2;
+			spin_cnt = 0;
+		}
+	}
+
+	return 0;
+}
+
+static void queue_write(__le64 *dst, u64 *src, size_t n_dwords)
+{
+	int i;
+
+	for (i = 0; i < n_dwords; ++i)
+		*dst++ = cpu_to_le64(*src++);
+}
+
+static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
+{
+	if (queue_full(q))
+		return -ENOSPC;
+
+	queue_write(Q_ENT(q, q->prod), ent, q->ent_dwords);
+	queue_inc_prod(q);
+	return 0;
+}
+
+static void queue_read(__le64 *dst, u64 *src, size_t n_dwords)
+{
+	int i;
+
+	for (i = 0; i < n_dwords; ++i)
+		*dst++ = le64_to_cpu(*src++);
+}
+
+static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent)
+{
+	if (queue_empty(q))
+		return -EAGAIN;
+
+	queue_read(ent, Q_ENT(q, q->cons), q->ent_dwords);
+	queue_inc_cons(q);
+	return 0;
+}
+
+/* High-level queue accessors */
+static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
+{
+	memset(cmd, 0, CMDQ_ENT_DWORDS << 3);
+	cmd[0] |= FIELD_PREP(CMDQ_0_OP, ent->opcode);
+
+	switch (ent->opcode) {
+	case CMDQ_OP_TLBI_EL2_ALL:
+	case CMDQ_OP_TLBI_NSNH_ALL:
+		break;
+	case CMDQ_OP_PREFETCH_CFG:
+		cmd[0] |= FIELD_PREP(CMDQ_PREFETCH_0_SID, ent->prefetch.sid);
+		cmd[1] |= FIELD_PREP(CMDQ_PREFETCH_1_SIZE, ent->prefetch.size);
+		cmd[1] |= ent->prefetch.addr & CMDQ_PREFETCH_1_ADDR_MASK;
+		break;
+	case CMDQ_OP_CFGI_STE:
+		cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SID, ent->cfgi.sid);
+		cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_LEAF, ent->cfgi.leaf);
+		break;
+	case CMDQ_OP_CFGI_ALL:
+		/* Cover the entire SID range */
+		cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_RANGE, 31);
+		break;
+	case CMDQ_OP_TLBI_NH_VA:
+		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
+		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
+		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
+		cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK;
+		break;
+	case CMDQ_OP_TLBI_S2_IPA:
+		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
+		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
+		cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_IPA_MASK;
+		break;
+	case CMDQ_OP_TLBI_NH_ASID:
+		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
+		/* Fallthrough */
+	case CMDQ_OP_TLBI_S12_VMALL:
+		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
+		break;
+	case CMDQ_OP_PRI_RESP:
+		cmd[0] |= FIELD_PREP(CMDQ_0_SSV, ent->substream_valid);
+		cmd[0] |= FIELD_PREP(CMDQ_PRI_0_SSID, ent->pri.ssid);
+		cmd[0] |= FIELD_PREP(CMDQ_PRI_0_SID, ent->pri.sid);
+		cmd[1] |= FIELD_PREP(CMDQ_PRI_1_GRPID, ent->pri.grpid);
+		switch (ent->pri.resp) {
+		case PRI_RESP_DENY:
+		case PRI_RESP_FAIL:
+		case PRI_RESP_SUCC:
+			break;
+		default:
+			return -EINVAL;
+		}
+		cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp);
+		break;
+	case CMDQ_OP_CMD_SYNC:
+		if (ent->sync.msiaddr)
+			cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_IRQ);
+		else
+			cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV);
+		cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH);
+		cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB);
+		/*
+		 * Commands are written little-endian, but we want the SMMU to
+		 * receive MSIData, and thus write it back to memory, in CPU
+		 * byte order, so big-endian needs an extra byteswap here.
+		 */
+		cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIDATA,
+				     cpu_to_le32(ent->sync.msidata));
+		cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
+		break;
+	default:
+		return -ENOENT;
+	}
+
+	return 0;
+}
+
+static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
+{
+	static const char *cerror_str[] = {
+		[CMDQ_ERR_CERROR_NONE_IDX]	= "No error",
+		[CMDQ_ERR_CERROR_ILL_IDX]	= "Illegal command",
+		[CMDQ_ERR_CERROR_ABT_IDX]	= "Abort on command fetch",
+	};
+
+	int i;
+	u64 cmd[CMDQ_ENT_DWORDS];
+	struct arm_smmu_queue *q = &smmu->cmdq.q;
+	u32 cons = readl_relaxed(q->cons_reg);
+	u32 idx = FIELD_GET(CMDQ_CONS_ERR, cons);
+	struct arm_smmu_cmdq_ent cmd_sync = {
+		.opcode = CMDQ_OP_CMD_SYNC,
+	};
+
+	dev_err(smmu->dev, "CMDQ error (cons 0x%08x): %s\n", cons,
+		idx < ARRAY_SIZE(cerror_str) ?  cerror_str[idx] : "Unknown");
+
+	switch (idx) {
+	case CMDQ_ERR_CERROR_ABT_IDX:
+		dev_err(smmu->dev, "retrying command fetch\n");
+	case CMDQ_ERR_CERROR_NONE_IDX:
+		return;
+	case CMDQ_ERR_CERROR_ILL_IDX:
+		/* Fallthrough */
+	default:
+		break;
+	}
+
+	/*
+	 * We may have concurrent producers, so we need to be careful
+	 * not to touch any of the shadow cmdq state.
+	 */
+	queue_read(cmd, Q_ENT(q, cons), q->ent_dwords);
+	dev_err(smmu->dev, "skipping command in error state:\n");
+	for (i = 0; i < ARRAY_SIZE(cmd); ++i)
+		dev_err(smmu->dev, "\t0x%016llx\n", (unsigned long long)cmd[i]);
+
+	/* Convert the erroneous command into a CMD_SYNC */
+	if (arm_smmu_cmdq_build_cmd(cmd, &cmd_sync)) {
+		dev_err(smmu->dev, "failed to convert to CMD_SYNC\n");
+		return;
+	}
+
+	queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
+}
+
+static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
+{
+	struct arm_smmu_queue *q = &smmu->cmdq.q;
+	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
+
+	while (queue_insert_raw(q, cmd) == -ENOSPC) {
+		if (queue_poll_cons(q, false, wfe))
+			dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
+	}
+}
+
+static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
+				    struct arm_smmu_cmdq_ent *ent)
+{
+	u64 cmd[CMDQ_ENT_DWORDS];
+	unsigned long flags;
+
+	if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
+		dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
+			 ent->opcode);
+		return;
+	}
+
+	spin_lock_irqsave(&smmu->cmdq.lock, flags);
+	arm_smmu_cmdq_insert_cmd(smmu, cmd);
+	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
+}
+
+/*
+ * The difference between val and sync_idx is bounded by the maximum size of
+ * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic.
+ */
+static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
+{
+	ktime_t timeout;
+	u32 val;
+
+	timeout = ktime_add_us(ktime_get(), ARM_SMMU_CMDQ_SYNC_TIMEOUT_US);
+	val = smp_cond_load_acquire(&smmu->sync_count,
+				    (int)(VAL - sync_idx) >= 0 ||
+				    !ktime_before(ktime_get(), timeout));
+
+	return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;
+}
+
+static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
+{
+	u64 cmd[CMDQ_ENT_DWORDS];
+	unsigned long flags;
+	struct arm_smmu_cmdq_ent ent = {
+		.opcode = CMDQ_OP_CMD_SYNC,
+		.sync	= {
+			.msiaddr = virt_to_phys(&smmu->sync_count),
+		},
+	};
+
+	spin_lock_irqsave(&smmu->cmdq.lock, flags);
+	ent.sync.msidata = ++smmu->sync_nr;
+	arm_smmu_cmdq_build_cmd(cmd, &ent);
+	arm_smmu_cmdq_insert_cmd(smmu, cmd);
+	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
+
+	return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
+}
+
+static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
+{
+	u64 cmd[CMDQ_ENT_DWORDS];
+	unsigned long flags;
+	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
+	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
+	int ret;
+
+	arm_smmu_cmdq_build_cmd(cmd, &ent);
+
+	spin_lock_irqsave(&smmu->cmdq.lock, flags);
+	arm_smmu_cmdq_insert_cmd(smmu, cmd);
+	ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
+	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
+
+	return ret;
+}
+
+static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
+{
+	int ret;
+	bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
+		   (smmu->features & ARM_SMMU_FEAT_COHERENCY);
+
+	ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu)
+		  : __arm_smmu_cmdq_issue_sync(smmu);
+	if (ret)
+		dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
+}
+
+/* Context descriptor manipulation functions */
+static u64 arm_smmu_cpu_tcr_to_cd(u64 tcr)
+{
+	u64 val = 0;
+
+	/* Repack the TCR. Just care about TTBR0 for now */
+	val |= ARM_SMMU_TCR2CD(tcr, T0SZ);
+	val |= ARM_SMMU_TCR2CD(tcr, TG0);
+	val |= ARM_SMMU_TCR2CD(tcr, IRGN0);
+	val |= ARM_SMMU_TCR2CD(tcr, ORGN0);
+	val |= ARM_SMMU_TCR2CD(tcr, SH0);
+	val |= ARM_SMMU_TCR2CD(tcr, EPD0);
+	val |= ARM_SMMU_TCR2CD(tcr, EPD1);
+	val |= ARM_SMMU_TCR2CD(tcr, IPS);
+	val |= ARM_SMMU_TCR2CD(tcr, TBI0);
+
+	return val;
+}
+
+static void arm_smmu_write_ctx_desc(struct arm_smmu_device *smmu,
+				    struct arm_smmu_s1_cfg *cfg)
+{
+	u64 val;
+
+	/*
+	 * We don't need to issue any invalidation here, as we'll invalidate
+	 * the STE when installing the new entry anyway.
+	 */
+	val = arm_smmu_cpu_tcr_to_cd(cfg->cd.tcr) |
+#ifdef __BIG_ENDIAN
+	      CTXDESC_CD_0_ENDI |
+#endif
+	      CTXDESC_CD_0_R | CTXDESC_CD_0_A | CTXDESC_CD_0_ASET |
+	      CTXDESC_CD_0_AA64 | FIELD_PREP(CTXDESC_CD_0_ASID, cfg->cd.asid) |
+	      CTXDESC_CD_0_V;
+
+	/* STALL_MODEL==0b10 && CD.S==0 is ILLEGAL */
+	if (smmu->features & ARM_SMMU_FEAT_STALL_FORCE)
+		val |= CTXDESC_CD_0_S;
+
+	cfg->cdptr[0] = cpu_to_le64(val);
+
+	val = cfg->cd.ttbr & CTXDESC_CD_1_TTB0_MASK;
+	cfg->cdptr[1] = cpu_to_le64(val);
+
+	cfg->cdptr[3] = cpu_to_le64(cfg->cd.mair);
+}
+
+/* Stream table manipulation functions */
+static void
+arm_smmu_write_strtab_l1_desc(__le64 *dst, struct arm_smmu_strtab_l1_desc *desc)
+{
+	u64 val = 0;
+
+	val |= FIELD_PREP(STRTAB_L1_DESC_SPAN, desc->span);
+	val |= desc->l2ptr_dma & STRTAB_L1_DESC_L2PTR_MASK;
+
+	*dst = cpu_to_le64(val);
+}
+
+static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid)
+{
+	struct arm_smmu_cmdq_ent cmd = {
+		.opcode	= CMDQ_OP_CFGI_STE,
+		.cfgi	= {
+			.sid	= sid,
+			.leaf	= true,
+		},
+	};
+
+	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+	arm_smmu_cmdq_issue_sync(smmu);
+}
+
+static void arm_smmu_write_strtab_ent(struct arm_smmu_device *smmu, u32 sid,
+				      __le64 *dst, struct arm_smmu_strtab_ent *ste)
+{
+	/*
+	 * This is hideously complicated, but we only really care about
+	 * three cases at the moment:
+	 *
+	 * 1. Invalid (all zero) -> bypass/fault (init)
+	 * 2. Bypass/fault -> translation/bypass (attach)
+	 * 3. Translation/bypass -> bypass/fault (detach)
+	 *
+	 * Given that we can't update the STE atomically and the SMMU
+	 * doesn't read the thing in a defined order, that leaves us
+	 * with the following maintenance requirements:
+	 *
+	 * 1. Update Config, return (init time STEs aren't live)
+	 * 2. Write everything apart from dword 0, sync, write dword 0, sync
+	 * 3. Update Config, sync
+	 */
+	u64 val = le64_to_cpu(dst[0]);
+	bool ste_live = false;
+	struct arm_smmu_cmdq_ent prefetch_cmd = {
+		.opcode		= CMDQ_OP_PREFETCH_CFG,
+		.prefetch	= {
+			.sid	= sid,
+		},
+	};
+
+	if (val & STRTAB_STE_0_V) {
+		switch (FIELD_GET(STRTAB_STE_0_CFG, val)) {
+		case STRTAB_STE_0_CFG_BYPASS:
+			break;
+		case STRTAB_STE_0_CFG_S1_TRANS:
+		case STRTAB_STE_0_CFG_S2_TRANS:
+			ste_live = true;
+			break;
+		case STRTAB_STE_0_CFG_ABORT:
+			if (disable_bypass)
+				break;
+		default:
+			BUG(); /* STE corruption */
+		}
+	}
+
+	/* Nuke the existing STE_0 value, as we're going to rewrite it */
+	val = STRTAB_STE_0_V;
+
+	/* Bypass/fault */
+	if (!ste->assigned || !(ste->s1_cfg || ste->s2_cfg)) {
+		if (!ste->assigned && disable_bypass)
+			val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT);
+		else
+			val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS);
+
+		dst[0] = cpu_to_le64(val);
+		dst[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG,
+						STRTAB_STE_1_SHCFG_INCOMING));
+		dst[2] = 0; /* Nuke the VMID */
+		/*
+		 * The SMMU can perform negative caching, so we must sync
+		 * the STE regardless of whether the old value was live.
+		 */
+		if (smmu)
+			arm_smmu_sync_ste_for_sid(smmu, sid);
+		return;
+	}
+
+	if (ste->s1_cfg) {
+		BUG_ON(ste_live);
+		dst[1] = cpu_to_le64(
+			 FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) |
+			 FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) |
+			 FIELD_PREP(STRTAB_STE_1_S1CSH, ARM_SMMU_SH_ISH) |
+#ifdef CONFIG_PCI_ATS
+			 FIELD_PREP(STRTAB_STE_1_EATS, STRTAB_STE_1_EATS_TRANS) |
+#endif
+			 FIELD_PREP(STRTAB_STE_1_STRW, STRTAB_STE_1_STRW_NSEL1));
+
+		if (smmu->features & ARM_SMMU_FEAT_STALLS &&
+		   !(smmu->features & ARM_SMMU_FEAT_STALL_FORCE))
+			dst[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD);
+
+		val |= (ste->s1_cfg->cdptr_dma & STRTAB_STE_0_S1CTXPTR_MASK) |
+			FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS);
+	}
+
+	if (ste->s2_cfg) {
+		BUG_ON(ste_live);
+		dst[2] = cpu_to_le64(
+			 FIELD_PREP(STRTAB_STE_2_S2VMID, ste->s2_cfg->vmid) |
+			 FIELD_PREP(STRTAB_STE_2_VTCR, ste->s2_cfg->vtcr) |
+#ifdef __BIG_ENDIAN
+			 STRTAB_STE_2_S2ENDI |
+#endif
+			 STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2AA64 |
+			 STRTAB_STE_2_S2R);
+
+		dst[3] = cpu_to_le64(ste->s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK);
+
+		val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S2_TRANS);
+	}
+
+	arm_smmu_sync_ste_for_sid(smmu, sid);
+	/* See comment in arm_smmu_write_ctx_desc() */
+	WRITE_ONCE(dst[0], cpu_to_le64(val));
+	arm_smmu_sync_ste_for_sid(smmu, sid);
+
+	/* It's likely that we'll want to use the new STE soon */
+	if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH))
+		arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd);
+}
+
+static void arm_smmu_init_bypass_stes(u64 *strtab, unsigned int nent)
+{
+	unsigned int i;
+	struct arm_smmu_strtab_ent ste = { .assigned = false };
+
+	for (i = 0; i < nent; ++i) {
+		arm_smmu_write_strtab_ent(NULL, -1, strtab, &ste);
+		strtab += STRTAB_STE_DWORDS;
+	}
+}
+
+static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid)
+{
+	size_t size;
+	void *strtab;
+	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+	struct arm_smmu_strtab_l1_desc *desc = &cfg->l1_desc[sid >> STRTAB_SPLIT];
+
+	if (desc->l2ptr)
+		return 0;
+
+	size = 1 << (STRTAB_SPLIT + ilog2(STRTAB_STE_DWORDS) + 3);
+	strtab = &cfg->strtab[(sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS];
+
+	desc->span = STRTAB_SPLIT + 1;
+	desc->l2ptr = dmam_alloc_coherent(smmu->dev, size, &desc->l2ptr_dma,
+					  GFP_KERNEL | __GFP_ZERO);
+	if (!desc->l2ptr) {
+		dev_err(smmu->dev,
+			"failed to allocate l2 stream table for SID %u\n",
+			sid);
+		return -ENOMEM;
+	}
+
+	arm_smmu_init_bypass_stes(desc->l2ptr, 1 << STRTAB_SPLIT);
+	arm_smmu_write_strtab_l1_desc(strtab, desc);
+	return 0;
+}
+
+/* IRQ and event handlers */
+static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
+{
+	int i;
+	struct arm_smmu_device *smmu = dev;
+	struct arm_smmu_queue *q = &smmu->evtq.q;
+	u64 evt[EVTQ_ENT_DWORDS];
+
+	do {
+		while (!queue_remove_raw(q, evt)) {
+			u8 id = FIELD_GET(EVTQ_0_ID, evt[0]);
+
+			dev_info(smmu->dev, "event 0x%02x received:\n", id);
+			for (i = 0; i < ARRAY_SIZE(evt); ++i)
+				dev_info(smmu->dev, "\t0x%016llx\n",
+					 (unsigned long long)evt[i]);
+
+			cond_resched();
+		}
+
+		/*
+		 * Not much we can do on overflow, so scream and pretend we're
+		 * trying harder.
+		 */
+		if (queue_sync_prod(q) == -EOVERFLOW)
+			dev_err(smmu->dev, "EVTQ overflow detected -- events lost\n");
+	} while (!queue_empty(q));
+
+	/* Sync our overflow flag, as we believe we're up to speed */
+	q->cons = Q_OVF(q, q->prod) | Q_WRP(q, q->cons) | Q_IDX(q, q->cons);
+	return IRQ_HANDLED;
+}
+
+static void arm_smmu_handle_ppr(struct arm_smmu_device *smmu, u64 *evt)
+{
+	u32 sid, ssid;
+	u16 grpid;
+	bool ssv, last;
+
+	sid = FIELD_GET(PRIQ_0_SID, evt[0]);
+	ssv = FIELD_GET(PRIQ_0_SSID_V, evt[0]);
+	ssid = ssv ? FIELD_GET(PRIQ_0_SSID, evt[0]) : 0;
+	last = FIELD_GET(PRIQ_0_PRG_LAST, evt[0]);
+	grpid = FIELD_GET(PRIQ_1_PRG_IDX, evt[1]);
+
+	dev_info(smmu->dev, "unexpected PRI request received:\n");
+	dev_info(smmu->dev,
+		 "\tsid 0x%08x.0x%05x: [%u%s] %sprivileged %s%s%s access at iova 0x%016llx\n",
+		 sid, ssid, grpid, last ? "L" : "",
+		 evt[0] & PRIQ_0_PERM_PRIV ? "" : "un",
+		 evt[0] & PRIQ_0_PERM_READ ? "R" : "",
+		 evt[0] & PRIQ_0_PERM_WRITE ? "W" : "",
+		 evt[0] & PRIQ_0_PERM_EXEC ? "X" : "",
+		 evt[1] & PRIQ_1_ADDR_MASK);
+
+	if (last) {
+		struct arm_smmu_cmdq_ent cmd = {
+			.opcode			= CMDQ_OP_PRI_RESP,
+			.substream_valid	= ssv,
+			.pri			= {
+				.sid	= sid,
+				.ssid	= ssid,
+				.grpid	= grpid,
+				.resp	= PRI_RESP_DENY,
+			},
+		};
+
+		arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+	}
+}
+
+static irqreturn_t arm_smmu_priq_thread(int irq, void *dev)
+{
+	struct arm_smmu_device *smmu = dev;
+	struct arm_smmu_queue *q = &smmu->priq.q;
+	u64 evt[PRIQ_ENT_DWORDS];
+
+	do {
+		while (!queue_remove_raw(q, evt))
+			arm_smmu_handle_ppr(smmu, evt);
+
+		if (queue_sync_prod(q) == -EOVERFLOW)
+			dev_err(smmu->dev, "PRIQ overflow detected -- requests lost\n");
+	} while (!queue_empty(q));
+
+	/* Sync our overflow flag, as we believe we're up to speed */
+	q->cons = Q_OVF(q, q->prod) | Q_WRP(q, q->cons) | Q_IDX(q, q->cons);
+	writel(q->cons, q->cons_reg);
+	return IRQ_HANDLED;
+}
+
+static int arm_smmu_device_disable(struct arm_smmu_device *smmu);
+
+static irqreturn_t arm_smmu_gerror_handler(int irq, void *dev)
+{
+	u32 gerror, gerrorn, active;
+	struct arm_smmu_device *smmu = dev;
+
+	gerror = readl_relaxed(smmu->base + ARM_SMMU_GERROR);
+	gerrorn = readl_relaxed(smmu->base + ARM_SMMU_GERRORN);
+
+	active = gerror ^ gerrorn;
+	if (!(active & GERROR_ERR_MASK))
+		return IRQ_NONE; /* No errors pending */
+
+	dev_warn(smmu->dev,
+		 "unexpected global error reported (0x%08x), this could be serious\n",
+		 active);
+
+	if (active & GERROR_SFM_ERR) {
+		dev_err(smmu->dev, "device has entered Service Failure Mode!\n");
+		arm_smmu_device_disable(smmu);
+	}
+
+	if (active & GERROR_MSI_GERROR_ABT_ERR)
+		dev_warn(smmu->dev, "GERROR MSI write aborted\n");
+
+	if (active & GERROR_MSI_PRIQ_ABT_ERR)
+		dev_warn(smmu->dev, "PRIQ MSI write aborted\n");
+
+	if (active & GERROR_MSI_EVTQ_ABT_ERR)
+		dev_warn(smmu->dev, "EVTQ MSI write aborted\n");
+
+	if (active & GERROR_MSI_CMDQ_ABT_ERR)
+		dev_warn(smmu->dev, "CMDQ MSI write aborted\n");
+
+	if (active & GERROR_PRIQ_ABT_ERR)
+		dev_err(smmu->dev, "PRIQ write aborted -- events may have been lost\n");
+
+	if (active & GERROR_EVTQ_ABT_ERR)
+		dev_err(smmu->dev, "EVTQ write aborted -- events may have been lost\n");
+
+	if (active & GERROR_CMDQ_ERR)
+		arm_smmu_cmdq_skip_err(smmu);
+
+	writel(gerror, smmu->base + ARM_SMMU_GERRORN);
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t arm_smmu_combined_irq_thread(int irq, void *dev)
+{
+	struct arm_smmu_device *smmu = dev;
+
+	arm_smmu_evtq_thread(irq, dev);
+	if (smmu->features & ARM_SMMU_FEAT_PRI)
+		arm_smmu_priq_thread(irq, dev);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t arm_smmu_combined_irq_handler(int irq, void *dev)
+{
+	arm_smmu_gerror_handler(irq, dev);
+	return IRQ_WAKE_THREAD;
+}
+
+/* IO_PGTABLE API */
+static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu)
+{
+	arm_smmu_cmdq_issue_sync(smmu);
+}
+
+static void arm_smmu_tlb_sync(void *cookie)
+{
+	struct arm_smmu_domain *smmu_domain = cookie;
+	__arm_smmu_tlb_sync(smmu_domain->smmu);
+}
+
+static void arm_smmu_tlb_inv_context(void *cookie)
+{
+	struct arm_smmu_domain *smmu_domain = cookie;
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	struct arm_smmu_cmdq_ent cmd;
+
+	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
+		cmd.opcode	= CMDQ_OP_TLBI_NH_ASID;
+		cmd.tlbi.asid	= smmu_domain->s1_cfg.cd.asid;
+		cmd.tlbi.vmid	= 0;
+	} else {
+		cmd.opcode	= CMDQ_OP_TLBI_S12_VMALL;
+		cmd.tlbi.vmid	= smmu_domain->s2_cfg.vmid;
+	}
+
+	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+	__arm_smmu_tlb_sync(smmu);
+}
+
+static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
+					  size_t granule, bool leaf, void *cookie)
+{
+	struct arm_smmu_domain *smmu_domain = cookie;
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	struct arm_smmu_cmdq_ent cmd = {
+		.tlbi = {
+			.leaf	= leaf,
+			.addr	= iova,
+		},
+	};
+
+	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
+		cmd.opcode	= CMDQ_OP_TLBI_NH_VA;
+		cmd.tlbi.asid	= smmu_domain->s1_cfg.cd.asid;
+	} else {
+		cmd.opcode	= CMDQ_OP_TLBI_S2_IPA;
+		cmd.tlbi.vmid	= smmu_domain->s2_cfg.vmid;
+	}
+
+	do {
+		arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+		cmd.tlbi.addr += granule;
+	} while (size -= granule);
+}
+
+static const struct iommu_gather_ops arm_smmu_gather_ops = {
+	.tlb_flush_all	= arm_smmu_tlb_inv_context,
+	.tlb_add_flush	= arm_smmu_tlb_inv_range_nosync,
+	.tlb_sync	= arm_smmu_tlb_sync,
+};
+
+/* IOMMU API */
+static bool arm_smmu_capable(enum iommu_cap cap)
+{
+	switch (cap) {
+	case IOMMU_CAP_CACHE_COHERENCY:
+		return true;
+	case IOMMU_CAP_NOEXEC:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static struct iommu_domain *arm_smmu_domain_alloc(unsigned type)
+{
+	struct arm_smmu_domain *smmu_domain;
+
+	if (type != IOMMU_DOMAIN_UNMANAGED &&
+	    type != IOMMU_DOMAIN_DMA &&
+	    type != IOMMU_DOMAIN_IDENTITY)
+		return NULL;
+
+	/*
+	 * Allocate the domain and initialise some of its data structures.
+	 * We can't really do anything meaningful until we've added a
+	 * master.
+	 */
+	smmu_domain = kzalloc(sizeof(*smmu_domain), GFP_KERNEL);
+	if (!smmu_domain)
+		return NULL;
+
+	if (type == IOMMU_DOMAIN_DMA &&
+	    iommu_get_dma_cookie(&smmu_domain->domain)) {
+		kfree(smmu_domain);
+		return NULL;
+	}
+
+	mutex_init(&smmu_domain->init_mutex);
+	return &smmu_domain->domain;
+}
+
+static int arm_smmu_bitmap_alloc(unsigned long *map, int span)
+{
+	int idx, size = 1 << span;
+
+	do {
+		idx = find_first_zero_bit(map, size);
+		if (idx == size)
+			return -ENOSPC;
+	} while (test_and_set_bit(idx, map));
+
+	return idx;
+}
+
+static void arm_smmu_bitmap_free(unsigned long *map, int idx)
+{
+	clear_bit(idx, map);
+}
+
+static void arm_smmu_domain_free(struct iommu_domain *domain)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+
+	iommu_put_dma_cookie(domain);
+	free_io_pgtable_ops(smmu_domain->pgtbl_ops);
+
+	/* Free the CD and ASID, if we allocated them */
+	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
+		struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg;
+
+		if (cfg->cdptr) {
+			dmam_free_coherent(smmu_domain->smmu->dev,
+					   CTXDESC_CD_DWORDS << 3,
+					   cfg->cdptr,
+					   cfg->cdptr_dma);
+
+			arm_smmu_bitmap_free(smmu->asid_map, cfg->cd.asid);
+		}
+	} else {
+		struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg;
+		if (cfg->vmid)
+			arm_smmu_bitmap_free(smmu->vmid_map, cfg->vmid);
+	}
+
+	kfree(smmu_domain);
+}
+
+static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain,
+				       struct io_pgtable_cfg *pgtbl_cfg)
+{
+	int ret;
+	int asid;
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg;
+
+	asid = arm_smmu_bitmap_alloc(smmu->asid_map, smmu->asid_bits);
+	if (asid < 0)
+		return asid;
+
+	cfg->cdptr = dmam_alloc_coherent(smmu->dev, CTXDESC_CD_DWORDS << 3,
+					 &cfg->cdptr_dma,
+					 GFP_KERNEL | __GFP_ZERO);
+	if (!cfg->cdptr) {
+		dev_warn(smmu->dev, "failed to allocate context descriptor\n");
+		ret = -ENOMEM;
+		goto out_free_asid;
+	}
+
+	cfg->cd.asid	= (u16)asid;
+	cfg->cd.ttbr	= pgtbl_cfg->arm_lpae_s1_cfg.ttbr[0];
+	cfg->cd.tcr	= pgtbl_cfg->arm_lpae_s1_cfg.tcr;
+	cfg->cd.mair	= pgtbl_cfg->arm_lpae_s1_cfg.mair[0];
+	return 0;
+
+out_free_asid:
+	arm_smmu_bitmap_free(smmu->asid_map, asid);
+	return ret;
+}
+
+static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain,
+				       struct io_pgtable_cfg *pgtbl_cfg)
+{
+	int vmid;
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg;
+
+	vmid = arm_smmu_bitmap_alloc(smmu->vmid_map, smmu->vmid_bits);
+	if (vmid < 0)
+		return vmid;
+
+	cfg->vmid	= (u16)vmid;
+	cfg->vttbr	= pgtbl_cfg->arm_lpae_s2_cfg.vttbr;
+	cfg->vtcr	= pgtbl_cfg->arm_lpae_s2_cfg.vtcr;
+	return 0;
+}
+
+static int arm_smmu_domain_finalise(struct iommu_domain *domain)
+{
+	int ret;
+	unsigned long ias, oas;
+	enum io_pgtable_fmt fmt;
+	struct io_pgtable_cfg pgtbl_cfg;
+	struct io_pgtable_ops *pgtbl_ops;
+	int (*finalise_stage_fn)(struct arm_smmu_domain *,
+				 struct io_pgtable_cfg *);
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+
+	if (domain->type == IOMMU_DOMAIN_IDENTITY) {
+		smmu_domain->stage = ARM_SMMU_DOMAIN_BYPASS;
+		return 0;
+	}
+
+	/* Restrict the stage to what we can actually support */
+	if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S1))
+		smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
+	if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S2))
+		smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
+
+	switch (smmu_domain->stage) {
+	case ARM_SMMU_DOMAIN_S1:
+		ias = (smmu->features & ARM_SMMU_FEAT_VAX) ? 52 : 48;
+		ias = min_t(unsigned long, ias, VA_BITS);
+		oas = smmu->ias;
+		fmt = ARM_64_LPAE_S1;
+		finalise_stage_fn = arm_smmu_domain_finalise_s1;
+		break;
+	case ARM_SMMU_DOMAIN_NESTED:
+	case ARM_SMMU_DOMAIN_S2:
+		ias = smmu->ias;
+		oas = smmu->oas;
+		fmt = ARM_64_LPAE_S2;
+		finalise_stage_fn = arm_smmu_domain_finalise_s2;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	pgtbl_cfg = (struct io_pgtable_cfg) {
+		.pgsize_bitmap	= smmu->pgsize_bitmap,
+		.ias		= ias,
+		.oas		= oas,
+		.tlb		= &arm_smmu_gather_ops,
+		.iommu_dev	= smmu->dev,
+	};
+
+	if (smmu->features & ARM_SMMU_FEAT_COHERENCY)
+		pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_NO_DMA;
+
+	pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
+	if (!pgtbl_ops)
+		return -ENOMEM;
+
+	domain->pgsize_bitmap = pgtbl_cfg.pgsize_bitmap;
+	domain->geometry.aperture_end = (1UL << pgtbl_cfg.ias) - 1;
+	domain->geometry.force_aperture = true;
+
+	ret = finalise_stage_fn(smmu_domain, &pgtbl_cfg);
+	if (ret < 0) {
+		free_io_pgtable_ops(pgtbl_ops);
+		return ret;
+	}
+
+	smmu_domain->pgtbl_ops = pgtbl_ops;
+	return 0;
+}
+
+static __le64 *arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid)
+{
+	__le64 *step;
+	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+
+	if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) {
+		struct arm_smmu_strtab_l1_desc *l1_desc;
+		int idx;
+
+		/* Two-level walk */
+		idx = (sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS;
+		l1_desc = &cfg->l1_desc[idx];
+		idx = (sid & ((1 << STRTAB_SPLIT) - 1)) * STRTAB_STE_DWORDS;
+		step = &l1_desc->l2ptr[idx];
+	} else {
+		/* Simple linear lookup */
+		step = &cfg->strtab[sid * STRTAB_STE_DWORDS];
+	}
+
+	return step;
+}
+
+static void arm_smmu_install_ste_for_dev(struct iommu_fwspec *fwspec)
+{
+	int i, j;
+	struct arm_smmu_master_data *master = fwspec->iommu_priv;
+	struct arm_smmu_device *smmu = master->smmu;
+
+	for (i = 0; i < fwspec->num_ids; ++i) {
+		u32 sid = fwspec->ids[i];
+		__le64 *step = arm_smmu_get_step_for_sid(smmu, sid);
+
+		/* Bridged PCI devices may end up with duplicated IDs */
+		for (j = 0; j < i; j++)
+			if (fwspec->ids[j] == sid)
+				break;
+		if (j < i)
+			continue;
+
+		arm_smmu_write_strtab_ent(smmu, sid, step, &master->ste);
+	}
+}
+
+static void arm_smmu_detach_dev(struct device *dev)
+{
+	struct arm_smmu_master_data *master = dev->iommu_fwspec->iommu_priv;
+
+	master->ste.assigned = false;
+	arm_smmu_install_ste_for_dev(dev->iommu_fwspec);
+}
+
+static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
+{
+	int ret = 0;
+	struct arm_smmu_device *smmu;
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct arm_smmu_master_data *master;
+	struct arm_smmu_strtab_ent *ste;
+
+	if (!dev->iommu_fwspec)
+		return -ENOENT;
+
+	master = dev->iommu_fwspec->iommu_priv;
+	smmu = master->smmu;
+	ste = &master->ste;
+
+	/* Already attached to a different domain? */
+	if (ste->assigned)
+		arm_smmu_detach_dev(dev);
+
+	mutex_lock(&smmu_domain->init_mutex);
+
+	if (!smmu_domain->smmu) {
+		smmu_domain->smmu = smmu;
+		ret = arm_smmu_domain_finalise(domain);
+		if (ret) {
+			smmu_domain->smmu = NULL;
+			goto out_unlock;
+		}
+	} else if (smmu_domain->smmu != smmu) {
+		dev_err(dev,
+			"cannot attach to SMMU %s (upstream of %s)\n",
+			dev_name(smmu_domain->smmu->dev),
+			dev_name(smmu->dev));
+		ret = -ENXIO;
+		goto out_unlock;
+	}
+
+	ste->assigned = true;
+
+	if (smmu_domain->stage == ARM_SMMU_DOMAIN_BYPASS) {
+		ste->s1_cfg = NULL;
+		ste->s2_cfg = NULL;
+	} else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
+		ste->s1_cfg = &smmu_domain->s1_cfg;
+		ste->s2_cfg = NULL;
+		arm_smmu_write_ctx_desc(smmu, ste->s1_cfg);
+	} else {
+		ste->s1_cfg = NULL;
+		ste->s2_cfg = &smmu_domain->s2_cfg;
+	}
+
+	arm_smmu_install_ste_for_dev(dev->iommu_fwspec);
+out_unlock:
+	mutex_unlock(&smmu_domain->init_mutex);
+	return ret;
+}
+
+static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
+			phys_addr_t paddr, size_t size, int prot)
+{
+	struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
+
+	if (!ops)
+		return -ENODEV;
+
+	return ops->map(ops, iova, paddr, size, prot);
+}
+
+static size_t
+arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size)
+{
+	struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
+
+	if (!ops)
+		return 0;
+
+	return ops->unmap(ops, iova, size);
+}
+
+static void arm_smmu_iotlb_sync(struct iommu_domain *domain)
+{
+	struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
+
+	if (smmu)
+		__arm_smmu_tlb_sync(smmu);
+}
+
+static phys_addr_t
+arm_smmu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
+{
+	struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
+
+	if (domain->type == IOMMU_DOMAIN_IDENTITY)
+		return iova;
+
+	if (!ops)
+		return 0;
+
+	return ops->iova_to_phys(ops, iova);
+}
+
+static struct platform_driver arm_smmu_driver;
+
+static int arm_smmu_match_node(struct device *dev, void *data)
+{
+	return dev->fwnode == data;
+}
+
+static
+struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode)
+{
+	struct device *dev = driver_find_device(&arm_smmu_driver.driver, NULL,
+						fwnode, arm_smmu_match_node);
+	put_device(dev);
+	return dev ? dev_get_drvdata(dev) : NULL;
+}
+
+static bool arm_smmu_sid_in_range(struct arm_smmu_device *smmu, u32 sid)
+{
+	unsigned long limit = smmu->strtab_cfg.num_l1_ents;
+
+	if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB)
+		limit *= 1UL << STRTAB_SPLIT;
+
+	return sid < limit;
+}
+
+static struct iommu_ops arm_smmu_ops;
+
+static int arm_smmu_add_device(struct device *dev)
+{
+	int i, ret;
+	struct arm_smmu_device *smmu;
+	struct arm_smmu_master_data *master;
+	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+	struct iommu_group *group;
+
+	if (!fwspec || fwspec->ops != &arm_smmu_ops)
+		return -ENODEV;
+	/*
+	 * We _can_ actually withstand dodgy bus code re-calling add_device()
+	 * without an intervening remove_device()/of_xlate() sequence, but
+	 * we're not going to do so quietly...
+	 */
+	if (WARN_ON_ONCE(fwspec->iommu_priv)) {
+		master = fwspec->iommu_priv;
+		smmu = master->smmu;
+	} else {
+		smmu = arm_smmu_get_by_fwnode(fwspec->iommu_fwnode);
+		if (!smmu)
+			return -ENODEV;
+		master = kzalloc(sizeof(*master), GFP_KERNEL);
+		if (!master)
+			return -ENOMEM;
+
+		master->smmu = smmu;
+		fwspec->iommu_priv = master;
+	}
+
+	/* Check the SIDs are in range of the SMMU and our stream table */
+	for (i = 0; i < fwspec->num_ids; i++) {
+		u32 sid = fwspec->ids[i];
+
+		if (!arm_smmu_sid_in_range(smmu, sid))
+			return -ERANGE;
+
+		/* Ensure l2 strtab is initialised */
+		if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) {
+			ret = arm_smmu_init_l2_strtab(smmu, sid);
+			if (ret)
+				return ret;
+		}
+	}
+
+	group = iommu_group_get_for_dev(dev);
+	if (!IS_ERR(group)) {
+		iommu_group_put(group);
+		iommu_device_link(&smmu->iommu, dev);
+	}
+
+	return PTR_ERR_OR_ZERO(group);
+}
+
+static void arm_smmu_remove_device(struct device *dev)
+{
+	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+	struct arm_smmu_master_data *master;
+	struct arm_smmu_device *smmu;
+
+	if (!fwspec || fwspec->ops != &arm_smmu_ops)
+		return;
+
+	master = fwspec->iommu_priv;
+	smmu = master->smmu;
+	if (master && master->ste.assigned)
+		arm_smmu_detach_dev(dev);
+	iommu_group_remove_device(dev);
+	iommu_device_unlink(&smmu->iommu, dev);
+	kfree(master);
+	iommu_fwspec_free(dev);
+}
+
+static struct iommu_group *arm_smmu_device_group(struct device *dev)
+{
+	struct iommu_group *group;
+
+	/*
+	 * We don't support devices sharing stream IDs other than PCI RID
+	 * aliases, since the necessary ID-to-device lookup becomes rather
+	 * impractical given a potential sparse 32-bit stream ID space.
+	 */
+	if (dev_is_pci(dev))
+		group = pci_device_group(dev);
+	else
+		group = generic_device_group(dev);
+
+	return group;
+}
+
+static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
+				    enum iommu_attr attr, void *data)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
+		return -EINVAL;
+
+	switch (attr) {
+	case DOMAIN_ATTR_NESTING:
+		*(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
+		return 0;
+	default:
+		return -ENODEV;
+	}
+}
+
+static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
+				    enum iommu_attr attr, void *data)
+{
+	int ret = 0;
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
+		return -EINVAL;
+
+	mutex_lock(&smmu_domain->init_mutex);
+
+	switch (attr) {
+	case DOMAIN_ATTR_NESTING:
+		if (smmu_domain->smmu) {
+			ret = -EPERM;
+			goto out_unlock;
+		}
+
+		if (*(int *)data)
+			smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
+		else
+			smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
+
+		break;
+	default:
+		ret = -ENODEV;
+	}
+
+out_unlock:
+	mutex_unlock(&smmu_domain->init_mutex);
+	return ret;
+}
+
+static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
+{
+	return iommu_fwspec_add_ids(dev, args->args, 1);
+}
+
+static void arm_smmu_get_resv_regions(struct device *dev,
+				      struct list_head *head)
+{
+	struct iommu_resv_region *region;
+	int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+
+	region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH,
+					 prot, IOMMU_RESV_SW_MSI);
+	if (!region)
+		return;
+
+	list_add_tail(&region->list, head);
+
+	iommu_dma_get_resv_regions(dev, head);
+}
+
+static void arm_smmu_put_resv_regions(struct device *dev,
+				      struct list_head *head)
+{
+	struct iommu_resv_region *entry, *next;
+
+	list_for_each_entry_safe(entry, next, head, list)
+		kfree(entry);
+}
+
+static struct iommu_ops arm_smmu_ops = {
+	.capable		= arm_smmu_capable,
+	.domain_alloc		= arm_smmu_domain_alloc,
+	.domain_free		= arm_smmu_domain_free,
+	.attach_dev		= arm_smmu_attach_dev,
+	.map			= arm_smmu_map,
+	.unmap			= arm_smmu_unmap,
+	.flush_iotlb_all	= arm_smmu_iotlb_sync,
+	.iotlb_sync		= arm_smmu_iotlb_sync,
+	.iova_to_phys		= arm_smmu_iova_to_phys,
+	.add_device		= arm_smmu_add_device,
+	.remove_device		= arm_smmu_remove_device,
+	.device_group		= arm_smmu_device_group,
+	.domain_get_attr	= arm_smmu_domain_get_attr,
+	.domain_set_attr	= arm_smmu_domain_set_attr,
+	.of_xlate		= arm_smmu_of_xlate,
+	.get_resv_regions	= arm_smmu_get_resv_regions,
+	.put_resv_regions	= arm_smmu_put_resv_regions,
+	.pgsize_bitmap		= -1UL, /* Restricted during device attach */
+};
+
+/* Probing and initialisation functions */
+static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
+				   struct arm_smmu_queue *q,
+				   unsigned long prod_off,
+				   unsigned long cons_off,
+				   size_t dwords)
+{
+	size_t qsz = ((1 << q->max_n_shift) * dwords) << 3;
+
+	q->base = dmam_alloc_coherent(smmu->dev, qsz, &q->base_dma, GFP_KERNEL);
+	if (!q->base) {
+		dev_err(smmu->dev, "failed to allocate queue (0x%zx bytes)\n",
+			qsz);
+		return -ENOMEM;
+	}
+
+	q->prod_reg	= arm_smmu_page1_fixup(prod_off, smmu);
+	q->cons_reg	= arm_smmu_page1_fixup(cons_off, smmu);
+	q->ent_dwords	= dwords;
+
+	q->q_base  = Q_BASE_RWA;
+	q->q_base |= q->base_dma & Q_BASE_ADDR_MASK;
+	q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->max_n_shift);
+
+	q->prod = q->cons = 0;
+	return 0;
+}
+
+static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
+{
+	int ret;
+
+	/* cmdq */
+	spin_lock_init(&smmu->cmdq.lock);
+	ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD,
+				      ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS);
+	if (ret)
+		return ret;
+
+	/* evtq */
+	ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD,
+				      ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS);
+	if (ret)
+		return ret;
+
+	/* priq */
+	if (!(smmu->features & ARM_SMMU_FEAT_PRI))
+		return 0;
+
+	return arm_smmu_init_one_queue(smmu, &smmu->priq.q, ARM_SMMU_PRIQ_PROD,
+				       ARM_SMMU_PRIQ_CONS, PRIQ_ENT_DWORDS);
+}
+
+static int arm_smmu_init_l1_strtab(struct arm_smmu_device *smmu)
+{
+	unsigned int i;
+	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+	size_t size = sizeof(*cfg->l1_desc) * cfg->num_l1_ents;
+	void *strtab = smmu->strtab_cfg.strtab;
+
+	cfg->l1_desc = devm_kzalloc(smmu->dev, size, GFP_KERNEL);
+	if (!cfg->l1_desc) {
+		dev_err(smmu->dev, "failed to allocate l1 stream table desc\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < cfg->num_l1_ents; ++i) {
+		arm_smmu_write_strtab_l1_desc(strtab, &cfg->l1_desc[i]);
+		strtab += STRTAB_L1_DESC_DWORDS << 3;
+	}
+
+	return 0;
+}
+
+static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu)
+{
+	void *strtab;
+	u64 reg;
+	u32 size, l1size;
+	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+
+	/* Calculate the L1 size, capped to the SIDSIZE. */
+	size = STRTAB_L1_SZ_SHIFT - (ilog2(STRTAB_L1_DESC_DWORDS) + 3);
+	size = min(size, smmu->sid_bits - STRTAB_SPLIT);
+	cfg->num_l1_ents = 1 << size;
+
+	size += STRTAB_SPLIT;
+	if (size < smmu->sid_bits)
+		dev_warn(smmu->dev,
+			 "2-level strtab only covers %u/%u bits of SID\n",
+			 size, smmu->sid_bits);
+
+	l1size = cfg->num_l1_ents * (STRTAB_L1_DESC_DWORDS << 3);
+	strtab = dmam_alloc_coherent(smmu->dev, l1size, &cfg->strtab_dma,
+				     GFP_KERNEL | __GFP_ZERO);
+	if (!strtab) {
+		dev_err(smmu->dev,
+			"failed to allocate l1 stream table (%u bytes)\n",
+			size);
+		return -ENOMEM;
+	}
+	cfg->strtab = strtab;
+
+	/* Configure strtab_base_cfg for 2 levels */
+	reg  = FIELD_PREP(STRTAB_BASE_CFG_FMT, STRTAB_BASE_CFG_FMT_2LVL);
+	reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, size);
+	reg |= FIELD_PREP(STRTAB_BASE_CFG_SPLIT, STRTAB_SPLIT);
+	cfg->strtab_base_cfg = reg;
+
+	return arm_smmu_init_l1_strtab(smmu);
+}
+
+static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu)
+{
+	void *strtab;
+	u64 reg;
+	u32 size;
+	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+
+	size = (1 << smmu->sid_bits) * (STRTAB_STE_DWORDS << 3);
+	strtab = dmam_alloc_coherent(smmu->dev, size, &cfg->strtab_dma,
+				     GFP_KERNEL | __GFP_ZERO);
+	if (!strtab) {
+		dev_err(smmu->dev,
+			"failed to allocate linear stream table (%u bytes)\n",
+			size);
+		return -ENOMEM;
+	}
+	cfg->strtab = strtab;
+	cfg->num_l1_ents = 1 << smmu->sid_bits;
+
+	/* Configure strtab_base_cfg for a linear table covering all SIDs */
+	reg  = FIELD_PREP(STRTAB_BASE_CFG_FMT, STRTAB_BASE_CFG_FMT_LINEAR);
+	reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, smmu->sid_bits);
+	cfg->strtab_base_cfg = reg;
+
+	arm_smmu_init_bypass_stes(strtab, cfg->num_l1_ents);
+	return 0;
+}
+
+static int arm_smmu_init_strtab(struct arm_smmu_device *smmu)
+{
+	u64 reg;
+	int ret;
+
+	if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB)
+		ret = arm_smmu_init_strtab_2lvl(smmu);
+	else
+		ret = arm_smmu_init_strtab_linear(smmu);
+
+	if (ret)
+		return ret;
+
+	/* Set the strtab base address */
+	reg  = smmu->strtab_cfg.strtab_dma & STRTAB_BASE_ADDR_MASK;
+	reg |= STRTAB_BASE_RA;
+	smmu->strtab_cfg.strtab_base = reg;
+
+	/* Allocate the first VMID for stage-2 bypass STEs */
+	set_bit(0, smmu->vmid_map);
+	return 0;
+}
+
+static int arm_smmu_init_structures(struct arm_smmu_device *smmu)
+{
+	int ret;
+
+	ret = arm_smmu_init_queues(smmu);
+	if (ret)
+		return ret;
+
+	return arm_smmu_init_strtab(smmu);
+}
+
+static int arm_smmu_write_reg_sync(struct arm_smmu_device *smmu, u32 val,
+				   unsigned int reg_off, unsigned int ack_off)
+{
+	u32 reg;
+
+	writel_relaxed(val, smmu->base + reg_off);
+	return readl_relaxed_poll_timeout(smmu->base + ack_off, reg, reg == val,
+					  1, ARM_SMMU_POLL_TIMEOUT_US);
+}
+
+/* GBPA is "special" */
+static int arm_smmu_update_gbpa(struct arm_smmu_device *smmu, u32 set, u32 clr)
+{
+	int ret;
+	u32 reg, __iomem *gbpa = smmu->base + ARM_SMMU_GBPA;
+
+	ret = readl_relaxed_poll_timeout(gbpa, reg, !(reg & GBPA_UPDATE),
+					 1, ARM_SMMU_POLL_TIMEOUT_US);
+	if (ret)
+		return ret;
+
+	reg &= ~clr;
+	reg |= set;
+	writel_relaxed(reg | GBPA_UPDATE, gbpa);
+	ret = readl_relaxed_poll_timeout(gbpa, reg, !(reg & GBPA_UPDATE),
+					 1, ARM_SMMU_POLL_TIMEOUT_US);
+
+	if (ret)
+		dev_err(smmu->dev, "GBPA not responding to update\n");
+	return ret;
+}
+
+static void arm_smmu_free_msis(void *data)
+{
+	struct device *dev = data;
+	platform_msi_domain_free_irqs(dev);
+}
+
+static void arm_smmu_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
+{
+	phys_addr_t doorbell;
+	struct device *dev = msi_desc_to_dev(desc);
+	struct arm_smmu_device *smmu = dev_get_drvdata(dev);
+	phys_addr_t *cfg = arm_smmu_msi_cfg[desc->platform.msi_index];
+
+	doorbell = (((u64)msg->address_hi) << 32) | msg->address_lo;
+	doorbell &= MSI_CFG0_ADDR_MASK;
+
+	writeq_relaxed(doorbell, smmu->base + cfg[0]);
+	writel_relaxed(msg->data, smmu->base + cfg[1]);
+	writel_relaxed(ARM_SMMU_MEMATTR_DEVICE_nGnRE, smmu->base + cfg[2]);
+}
+
+static void arm_smmu_setup_msis(struct arm_smmu_device *smmu)
+{
+	struct msi_desc *desc;
+	int ret, nvec = ARM_SMMU_MAX_MSIS;
+	struct device *dev = smmu->dev;
+
+	/* Clear the MSI address regs */
+	writeq_relaxed(0, smmu->base + ARM_SMMU_GERROR_IRQ_CFG0);
+	writeq_relaxed(0, smmu->base + ARM_SMMU_EVTQ_IRQ_CFG0);
+
+	if (smmu->features & ARM_SMMU_FEAT_PRI)
+		writeq_relaxed(0, smmu->base + ARM_SMMU_PRIQ_IRQ_CFG0);
+	else
+		nvec--;
+
+	if (!(smmu->features & ARM_SMMU_FEAT_MSI))
+		return;
+
+	if (!dev->msi_domain) {
+		dev_info(smmu->dev, "msi_domain absent - falling back to wired irqs\n");
+		return;
+	}
+
+	/* Allocate MSIs for evtq, gerror and priq. Ignore cmdq */
+	ret = platform_msi_domain_alloc_irqs(dev, nvec, arm_smmu_write_msi_msg);
+	if (ret) {
+		dev_warn(dev, "failed to allocate MSIs - falling back to wired irqs\n");
+		return;
+	}
+
+	for_each_msi_entry(desc, dev) {
+		switch (desc->platform.msi_index) {
+		case EVTQ_MSI_INDEX:
+			smmu->evtq.q.irq = desc->irq;
+			break;
+		case GERROR_MSI_INDEX:
+			smmu->gerr_irq = desc->irq;
+			break;
+		case PRIQ_MSI_INDEX:
+			smmu->priq.q.irq = desc->irq;
+			break;
+		default:	/* Unknown */
+			continue;
+		}
+	}
+
+	/* Add callback to free MSIs on teardown */
+	devm_add_action(dev, arm_smmu_free_msis, dev);
+}
+
+static void arm_smmu_setup_unique_irqs(struct arm_smmu_device *smmu)
+{
+	int irq, ret;
+
+	arm_smmu_setup_msis(smmu);
+
+	/* Request interrupt lines */
+	irq = smmu->evtq.q.irq;
+	if (irq) {
+		ret = devm_request_threaded_irq(smmu->dev, irq, NULL,
+						arm_smmu_evtq_thread,
+						IRQF_ONESHOT,
+						"arm-smmu-v3-evtq", smmu);
+		if (ret < 0)
+			dev_warn(smmu->dev, "failed to enable evtq irq\n");
+	} else {
+		dev_warn(smmu->dev, "no evtq irq - events will not be reported!\n");
+	}
+
+	irq = smmu->gerr_irq;
+	if (irq) {
+		ret = devm_request_irq(smmu->dev, irq, arm_smmu_gerror_handler,
+				       0, "arm-smmu-v3-gerror", smmu);
+		if (ret < 0)
+			dev_warn(smmu->dev, "failed to enable gerror irq\n");
+	} else {
+		dev_warn(smmu->dev, "no gerr irq - errors will not be reported!\n");
+	}
+
+	if (smmu->features & ARM_SMMU_FEAT_PRI) {
+		irq = smmu->priq.q.irq;
+		if (irq) {
+			ret = devm_request_threaded_irq(smmu->dev, irq, NULL,
+							arm_smmu_priq_thread,
+							IRQF_ONESHOT,
+							"arm-smmu-v3-priq",
+							smmu);
+			if (ret < 0)
+				dev_warn(smmu->dev,
+					 "failed to enable priq irq\n");
+		} else {
+			dev_warn(smmu->dev, "no priq irq - PRI will be broken\n");
+		}
+	}
+}
+
+static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu)
+{
+	int ret, irq;
+	u32 irqen_flags = IRQ_CTRL_EVTQ_IRQEN | IRQ_CTRL_GERROR_IRQEN;
+
+	/* Disable IRQs first */
+	ret = arm_smmu_write_reg_sync(smmu, 0, ARM_SMMU_IRQ_CTRL,
+				      ARM_SMMU_IRQ_CTRLACK);
+	if (ret) {
+		dev_err(smmu->dev, "failed to disable irqs\n");
+		return ret;
+	}
+
+	irq = smmu->combined_irq;
+	if (irq) {
+		/*
+		 * Cavium ThunderX2 implementation doesn't not support unique
+		 * irq lines. Use single irq line for all the SMMUv3 interrupts.
+		 */
+		ret = devm_request_threaded_irq(smmu->dev, irq,
+					arm_smmu_combined_irq_handler,
+					arm_smmu_combined_irq_thread,
+					IRQF_ONESHOT,
+					"arm-smmu-v3-combined-irq", smmu);
+		if (ret < 0)
+			dev_warn(smmu->dev, "failed to enable combined irq\n");
+	} else
+		arm_smmu_setup_unique_irqs(smmu);
+
+	if (smmu->features & ARM_SMMU_FEAT_PRI)
+		irqen_flags |= IRQ_CTRL_PRIQ_IRQEN;
+
+	/* Enable interrupt generation on the SMMU */
+	ret = arm_smmu_write_reg_sync(smmu, irqen_flags,
+				      ARM_SMMU_IRQ_CTRL, ARM_SMMU_IRQ_CTRLACK);
+	if (ret)
+		dev_warn(smmu->dev, "failed to enable irqs\n");
+
+	return 0;
+}
+
+static int arm_smmu_device_disable(struct arm_smmu_device *smmu)
+{
+	int ret;
+
+	ret = arm_smmu_write_reg_sync(smmu, 0, ARM_SMMU_CR0, ARM_SMMU_CR0ACK);
+	if (ret)
+		dev_err(smmu->dev, "failed to clear cr0\n");
+
+	return ret;
+}
+
+static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
+{
+	int ret;
+	u32 reg, enables;
+	struct arm_smmu_cmdq_ent cmd;
+
+	/* Clear CR0 and sync (disables SMMU and queue processing) */
+	reg = readl_relaxed(smmu->base + ARM_SMMU_CR0);
+	if (reg & CR0_SMMUEN) {
+		dev_warn(smmu->dev, "SMMU currently enabled! Resetting...\n");
+		WARN_ON(is_kdump_kernel() && !disable_bypass);
+		arm_smmu_update_gbpa(smmu, GBPA_ABORT, 0);
+	}
+
+	ret = arm_smmu_device_disable(smmu);
+	if (ret)
+		return ret;
+
+	/* CR1 (table and queue memory attributes) */
+	reg = FIELD_PREP(CR1_TABLE_SH, ARM_SMMU_SH_ISH) |
+	      FIELD_PREP(CR1_TABLE_OC, CR1_CACHE_WB) |
+	      FIELD_PREP(CR1_TABLE_IC, CR1_CACHE_WB) |
+	      FIELD_PREP(CR1_QUEUE_SH, ARM_SMMU_SH_ISH) |
+	      FIELD_PREP(CR1_QUEUE_OC, CR1_CACHE_WB) |
+	      FIELD_PREP(CR1_QUEUE_IC, CR1_CACHE_WB);
+	writel_relaxed(reg, smmu->base + ARM_SMMU_CR1);
+
+	/* CR2 (random crap) */
+	reg = CR2_PTM | CR2_RECINVSID | CR2_E2H;
+	writel_relaxed(reg, smmu->base + ARM_SMMU_CR2);
+
+	/* Stream table */
+	writeq_relaxed(smmu->strtab_cfg.strtab_base,
+		       smmu->base + ARM_SMMU_STRTAB_BASE);
+	writel_relaxed(smmu->strtab_cfg.strtab_base_cfg,
+		       smmu->base + ARM_SMMU_STRTAB_BASE_CFG);
+
+	/* Command queue */
+	writeq_relaxed(smmu->cmdq.q.q_base, smmu->base + ARM_SMMU_CMDQ_BASE);
+	writel_relaxed(smmu->cmdq.q.prod, smmu->base + ARM_SMMU_CMDQ_PROD);
+	writel_relaxed(smmu->cmdq.q.cons, smmu->base + ARM_SMMU_CMDQ_CONS);
+
+	enables = CR0_CMDQEN;
+	ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0,
+				      ARM_SMMU_CR0ACK);
+	if (ret) {
+		dev_err(smmu->dev, "failed to enable command queue\n");
+		return ret;
+	}
+
+	/* Invalidate any cached configuration */
+	cmd.opcode = CMDQ_OP_CFGI_ALL;
+	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+	arm_smmu_cmdq_issue_sync(smmu);
+
+	/* Invalidate any stale TLB entries */
+	if (smmu->features & ARM_SMMU_FEAT_HYP) {
+		cmd.opcode = CMDQ_OP_TLBI_EL2_ALL;
+		arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+	}
+
+	cmd.opcode = CMDQ_OP_TLBI_NSNH_ALL;
+	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+	arm_smmu_cmdq_issue_sync(smmu);
+
+	/* Event queue */
+	writeq_relaxed(smmu->evtq.q.q_base, smmu->base + ARM_SMMU_EVTQ_BASE);
+	writel_relaxed(smmu->evtq.q.prod,
+		       arm_smmu_page1_fixup(ARM_SMMU_EVTQ_PROD, smmu));
+	writel_relaxed(smmu->evtq.q.cons,
+		       arm_smmu_page1_fixup(ARM_SMMU_EVTQ_CONS, smmu));
+
+	enables |= CR0_EVTQEN;
+	ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0,
+				      ARM_SMMU_CR0ACK);
+	if (ret) {
+		dev_err(smmu->dev, "failed to enable event queue\n");
+		return ret;
+	}
+
+	/* PRI queue */
+	if (smmu->features & ARM_SMMU_FEAT_PRI) {
+		writeq_relaxed(smmu->priq.q.q_base,
+			       smmu->base + ARM_SMMU_PRIQ_BASE);
+		writel_relaxed(smmu->priq.q.prod,
+			       arm_smmu_page1_fixup(ARM_SMMU_PRIQ_PROD, smmu));
+		writel_relaxed(smmu->priq.q.cons,
+			       arm_smmu_page1_fixup(ARM_SMMU_PRIQ_CONS, smmu));
+
+		enables |= CR0_PRIQEN;
+		ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0,
+					      ARM_SMMU_CR0ACK);
+		if (ret) {
+			dev_err(smmu->dev, "failed to enable PRI queue\n");
+			return ret;
+		}
+	}
+
+	ret = arm_smmu_setup_irqs(smmu);
+	if (ret) {
+		dev_err(smmu->dev, "failed to setup irqs\n");
+		return ret;
+	}
+
+	if (is_kdump_kernel())
+		enables &= ~(CR0_EVTQEN | CR0_PRIQEN);
+
+	/* Enable the SMMU interface, or ensure bypass */
+	if (!bypass || disable_bypass) {
+		enables |= CR0_SMMUEN;
+	} else {
+		ret = arm_smmu_update_gbpa(smmu, 0, GBPA_ABORT);
+		if (ret)
+			return ret;
+	}
+	ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0,
+				      ARM_SMMU_CR0ACK);
+	if (ret) {
+		dev_err(smmu->dev, "failed to enable SMMU interface\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
+{
+	u32 reg;
+	bool coherent = smmu->features & ARM_SMMU_FEAT_COHERENCY;
+
+	/* IDR0 */
+	reg = readl_relaxed(smmu->base + ARM_SMMU_IDR0);
+
+	/* 2-level structures */
+	if (FIELD_GET(IDR0_ST_LVL, reg) == IDR0_ST_LVL_2LVL)
+		smmu->features |= ARM_SMMU_FEAT_2_LVL_STRTAB;
+
+	if (reg & IDR0_CD2L)
+		smmu->features |= ARM_SMMU_FEAT_2_LVL_CDTAB;
+
+	/*
+	 * Translation table endianness.
+	 * We currently require the same endianness as the CPU, but this
+	 * could be changed later by adding a new IO_PGTABLE_QUIRK.
+	 */
+	switch (FIELD_GET(IDR0_TTENDIAN, reg)) {
+	case IDR0_TTENDIAN_MIXED:
+		smmu->features |= ARM_SMMU_FEAT_TT_LE | ARM_SMMU_FEAT_TT_BE;
+		break;
+#ifdef __BIG_ENDIAN
+	case IDR0_TTENDIAN_BE:
+		smmu->features |= ARM_SMMU_FEAT_TT_BE;
+		break;
+#else
+	case IDR0_TTENDIAN_LE:
+		smmu->features |= ARM_SMMU_FEAT_TT_LE;
+		break;
+#endif
+	default:
+		dev_err(smmu->dev, "unknown/unsupported TT endianness!\n");
+		return -ENXIO;
+	}
+
+	/* Boolean feature flags */
+	if (IS_ENABLED(CONFIG_PCI_PRI) && reg & IDR0_PRI)
+		smmu->features |= ARM_SMMU_FEAT_PRI;
+
+	if (IS_ENABLED(CONFIG_PCI_ATS) && reg & IDR0_ATS)
+		smmu->features |= ARM_SMMU_FEAT_ATS;
+
+	if (reg & IDR0_SEV)
+		smmu->features |= ARM_SMMU_FEAT_SEV;
+
+	if (reg & IDR0_MSI)
+		smmu->features |= ARM_SMMU_FEAT_MSI;
+
+	if (reg & IDR0_HYP)
+		smmu->features |= ARM_SMMU_FEAT_HYP;
+
+	/*
+	 * The coherency feature as set by FW is used in preference to the ID
+	 * register, but warn on mismatch.
+	 */
+	if (!!(reg & IDR0_COHACC) != coherent)
+		dev_warn(smmu->dev, "IDR0.COHACC overridden by FW configuration (%s)\n",
+			 coherent ? "true" : "false");
+
+	switch (FIELD_GET(IDR0_STALL_MODEL, reg)) {
+	case IDR0_STALL_MODEL_FORCE:
+		smmu->features |= ARM_SMMU_FEAT_STALL_FORCE;
+		/* Fallthrough */
+	case IDR0_STALL_MODEL_STALL:
+		smmu->features |= ARM_SMMU_FEAT_STALLS;
+	}
+
+	if (reg & IDR0_S1P)
+		smmu->features |= ARM_SMMU_FEAT_TRANS_S1;
+
+	if (reg & IDR0_S2P)
+		smmu->features |= ARM_SMMU_FEAT_TRANS_S2;
+
+	if (!(reg & (IDR0_S1P | IDR0_S2P))) {
+		dev_err(smmu->dev, "no translation support!\n");
+		return -ENXIO;
+	}
+
+	/* We only support the AArch64 table format at present */
+	switch (FIELD_GET(IDR0_TTF, reg)) {
+	case IDR0_TTF_AARCH32_64:
+		smmu->ias = 40;
+		/* Fallthrough */
+	case IDR0_TTF_AARCH64:
+		break;
+	default:
+		dev_err(smmu->dev, "AArch64 table format not supported!\n");
+		return -ENXIO;
+	}
+
+	/* ASID/VMID sizes */
+	smmu->asid_bits = reg & IDR0_ASID16 ? 16 : 8;
+	smmu->vmid_bits = reg & IDR0_VMID16 ? 16 : 8;
+
+	/* IDR1 */
+	reg = readl_relaxed(smmu->base + ARM_SMMU_IDR1);
+	if (reg & (IDR1_TABLES_PRESET | IDR1_QUEUES_PRESET | IDR1_REL)) {
+		dev_err(smmu->dev, "embedded implementation not supported\n");
+		return -ENXIO;
+	}
+
+	/* Queue sizes, capped at 4k */
+	smmu->cmdq.q.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT,
+					 FIELD_GET(IDR1_CMDQS, reg));
+	if (!smmu->cmdq.q.max_n_shift) {
+		/* Odd alignment restrictions on the base, so ignore for now */
+		dev_err(smmu->dev, "unit-length command queue not supported\n");
+		return -ENXIO;
+	}
+
+	smmu->evtq.q.max_n_shift = min_t(u32, EVTQ_MAX_SZ_SHIFT,
+					 FIELD_GET(IDR1_EVTQS, reg));
+	smmu->priq.q.max_n_shift = min_t(u32, PRIQ_MAX_SZ_SHIFT,
+					 FIELD_GET(IDR1_PRIQS, reg));
+
+	/* SID/SSID sizes */
+	smmu->ssid_bits = FIELD_GET(IDR1_SSIDSIZE, reg);
+	smmu->sid_bits = FIELD_GET(IDR1_SIDSIZE, reg);
+
+	/*
+	 * If the SMMU supports fewer bits than would fill a single L2 stream
+	 * table, use a linear table instead.
+	 */
+	if (smmu->sid_bits <= STRTAB_SPLIT)
+		smmu->features &= ~ARM_SMMU_FEAT_2_LVL_STRTAB;
+
+	/* IDR5 */
+	reg = readl_relaxed(smmu->base + ARM_SMMU_IDR5);
+
+	/* Maximum number of outstanding stalls */
+	smmu->evtq.max_stalls = FIELD_GET(IDR5_STALL_MAX, reg);
+
+	/* Page sizes */
+	if (reg & IDR5_GRAN64K)
+		smmu->pgsize_bitmap |= SZ_64K | SZ_512M;
+	if (reg & IDR5_GRAN16K)
+		smmu->pgsize_bitmap |= SZ_16K | SZ_32M;
+	if (reg & IDR5_GRAN4K)
+		smmu->pgsize_bitmap |= SZ_4K | SZ_2M | SZ_1G;
+
+	/* Input address size */
+	if (FIELD_GET(IDR5_VAX, reg) == IDR5_VAX_52_BIT)
+		smmu->features |= ARM_SMMU_FEAT_VAX;
+
+	/* Output address size */
+	switch (FIELD_GET(IDR5_OAS, reg)) {
+	case IDR5_OAS_32_BIT:
+		smmu->oas = 32;
+		break;
+	case IDR5_OAS_36_BIT:
+		smmu->oas = 36;
+		break;
+	case IDR5_OAS_40_BIT:
+		smmu->oas = 40;
+		break;
+	case IDR5_OAS_42_BIT:
+		smmu->oas = 42;
+		break;
+	case IDR5_OAS_44_BIT:
+		smmu->oas = 44;
+		break;
+	case IDR5_OAS_52_BIT:
+		smmu->oas = 52;
+		smmu->pgsize_bitmap |= 1ULL << 42; /* 4TB */
+		break;
+	default:
+		dev_info(smmu->dev,
+			"unknown output address size. Truncating to 48-bit\n");
+		/* Fallthrough */
+	case IDR5_OAS_48_BIT:
+		smmu->oas = 48;
+	}
+
+	if (arm_smmu_ops.pgsize_bitmap == -1UL)
+		arm_smmu_ops.pgsize_bitmap = smmu->pgsize_bitmap;
+	else
+		arm_smmu_ops.pgsize_bitmap |= smmu->pgsize_bitmap;
+
+	/* Set the DMA mask for our table walker */
+	if (dma_set_mask_and_coherent(smmu->dev, DMA_BIT_MASK(smmu->oas)))
+		dev_warn(smmu->dev,
+			 "failed to set DMA mask for table walker\n");
+
+	smmu->ias = max(smmu->ias, smmu->oas);
+
+	dev_info(smmu->dev, "ias %lu-bit, oas %lu-bit (features 0x%08x)\n",
+		 smmu->ias, smmu->oas, smmu->features);
+	return 0;
+}
+
+#ifdef CONFIG_ACPI
+static void acpi_smmu_get_options(u32 model, struct arm_smmu_device *smmu)
+{
+	switch (model) {
+	case ACPI_IORT_SMMU_V3_CAVIUM_CN99XX:
+		smmu->options |= ARM_SMMU_OPT_PAGE0_REGS_ONLY;
+		break;
+	case ACPI_IORT_SMMU_V3_HISILICON_HI161X:
+		smmu->options |= ARM_SMMU_OPT_SKIP_PREFETCH;
+		break;
+	}
+
+	dev_notice(smmu->dev, "option mask 0x%x\n", smmu->options);
+}
+
+static int arm_smmu_device_acpi_probe(struct platform_device *pdev,
+				      struct arm_smmu_device *smmu)
+{
+	struct acpi_iort_smmu_v3 *iort_smmu;
+	struct device *dev = smmu->dev;
+	struct acpi_iort_node *node;
+
+	node = *(struct acpi_iort_node **)dev_get_platdata(dev);
+
+	/* Retrieve SMMUv3 specific data */
+	iort_smmu = (struct acpi_iort_smmu_v3 *)node->node_data;
+
+	acpi_smmu_get_options(iort_smmu->model, smmu);
+
+	if (iort_smmu->flags & ACPI_IORT_SMMU_V3_COHACC_OVERRIDE)
+		smmu->features |= ARM_SMMU_FEAT_COHERENCY;
+
+	return 0;
+}
+#else
+static inline int arm_smmu_device_acpi_probe(struct platform_device *pdev,
+					     struct arm_smmu_device *smmu)
+{
+	return -ENODEV;
+}
+#endif
+
+static int arm_smmu_device_dt_probe(struct platform_device *pdev,
+				    struct arm_smmu_device *smmu)
+{
+	struct device *dev = &pdev->dev;
+	u32 cells;
+	int ret = -EINVAL;
+
+	if (of_property_read_u32(dev->of_node, "#iommu-cells", &cells))
+		dev_err(dev, "missing #iommu-cells property\n");
+	else if (cells != 1)
+		dev_err(dev, "invalid #iommu-cells value (%d)\n", cells);
+	else
+		ret = 0;
+
+	parse_driver_options(smmu);
+
+	if (of_dma_is_coherent(dev->of_node))
+		smmu->features |= ARM_SMMU_FEAT_COHERENCY;
+
+	return ret;
+}
+
+static unsigned long arm_smmu_resource_size(struct arm_smmu_device *smmu)
+{
+	if (smmu->options & ARM_SMMU_OPT_PAGE0_REGS_ONLY)
+		return SZ_64K;
+	else
+		return SZ_128K;
+}
+
+static int arm_smmu_device_probe(struct platform_device *pdev)
+{
+	int irq, ret;
+	struct resource *res;
+	resource_size_t ioaddr;
+	struct arm_smmu_device *smmu;
+	struct device *dev = &pdev->dev;
+	bool bypass;
+
+	smmu = devm_kzalloc(dev, sizeof(*smmu), GFP_KERNEL);
+	if (!smmu) {
+		dev_err(dev, "failed to allocate arm_smmu_device\n");
+		return -ENOMEM;
+	}
+	smmu->dev = dev;
+
+	if (dev->of_node) {
+		ret = arm_smmu_device_dt_probe(pdev, smmu);
+	} else {
+		ret = arm_smmu_device_acpi_probe(pdev, smmu);
+		if (ret == -ENODEV)
+			return ret;
+	}
+
+	/* Set bypass mode according to firmware probing result */
+	bypass = !!ret;
+
+	/* Base address */
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (resource_size(res) + 1 < arm_smmu_resource_size(smmu)) {
+		dev_err(dev, "MMIO region too small (%pr)\n", res);
+		return -EINVAL;
+	}
+	ioaddr = res->start;
+
+	smmu->base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(smmu->base))
+		return PTR_ERR(smmu->base);
+
+	/* Interrupt lines */
+
+	irq = platform_get_irq_byname(pdev, "combined");
+	if (irq > 0)
+		smmu->combined_irq = irq;
+	else {
+		irq = platform_get_irq_byname(pdev, "eventq");
+		if (irq > 0)
+			smmu->evtq.q.irq = irq;
+
+		irq = platform_get_irq_byname(pdev, "priq");
+		if (irq > 0)
+			smmu->priq.q.irq = irq;
+
+		irq = platform_get_irq_byname(pdev, "gerror");
+		if (irq > 0)
+			smmu->gerr_irq = irq;
+	}
+	/* Probe the h/w */
+	ret = arm_smmu_device_hw_probe(smmu);
+	if (ret)
+		return ret;
+
+	/* Initialise in-memory data structures */
+	ret = arm_smmu_init_structures(smmu);
+	if (ret)
+		return ret;
+
+	/* Record our private device structure */
+	platform_set_drvdata(pdev, smmu);
+
+	/* Reset the device */
+	ret = arm_smmu_device_reset(smmu, bypass);
+	if (ret)
+		return ret;
+
+	/* And we're up. Go go go! */
+	ret = iommu_device_sysfs_add(&smmu->iommu, dev, NULL,
+				     "smmu3.%pa", &ioaddr);
+	if (ret)
+		return ret;
+
+	iommu_device_set_ops(&smmu->iommu, &arm_smmu_ops);
+	iommu_device_set_fwnode(&smmu->iommu, dev->fwnode);
+
+	ret = iommu_device_register(&smmu->iommu);
+	if (ret) {
+		dev_err(dev, "Failed to register iommu\n");
+		return ret;
+	}
+
+#ifdef CONFIG_PCI
+	if (pci_bus_type.iommu_ops != &arm_smmu_ops) {
+		pci_request_acs();
+		ret = bus_set_iommu(&pci_bus_type, &arm_smmu_ops);
+		if (ret)
+			return ret;
+	}
+#endif
+#ifdef CONFIG_ARM_AMBA
+	if (amba_bustype.iommu_ops != &arm_smmu_ops) {
+		ret = bus_set_iommu(&amba_bustype, &arm_smmu_ops);
+		if (ret)
+			return ret;
+	}
+#endif
+	if (platform_bus_type.iommu_ops != &arm_smmu_ops) {
+		ret = bus_set_iommu(&platform_bus_type, &arm_smmu_ops);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+static int arm_smmu_device_remove(struct platform_device *pdev)
+{
+	struct arm_smmu_device *smmu = platform_get_drvdata(pdev);
+
+	arm_smmu_device_disable(smmu);
+
+	return 0;
+}
+
+static void arm_smmu_device_shutdown(struct platform_device *pdev)
+{
+	arm_smmu_device_remove(pdev);
+}
+
+static const struct of_device_id arm_smmu_of_match[] = {
+	{ .compatible = "arm,smmu-v3", },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, arm_smmu_of_match);
+
+static struct platform_driver arm_smmu_driver = {
+	.driver	= {
+		.name		= "arm-smmu-v3",
+		.of_match_table	= of_match_ptr(arm_smmu_of_match),
+	},
+	.probe	= arm_smmu_device_probe,
+	.remove	= arm_smmu_device_remove,
+	.shutdown = arm_smmu_device_shutdown,
+};
+module_platform_driver(arm_smmu_driver);
+
+MODULE_DESCRIPTION("IOMMU API for ARM architected SMMUv3 implementations");
+MODULE_AUTHOR("Will Deacon <will.deacon@arm.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
new file mode 100644
index 000000000..0c3b8f1c7
--- /dev/null
+++ b/drivers/iommu/arm-smmu.c
@@ -0,0 +1,2237 @@
+/*
+ * IOMMU API for ARM architected SMMU implementations.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2013 ARM Limited
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ *
+ * This driver currently supports:
+ *	- SMMUv1 and v2 implementations
+ *	- Stream-matching and stream-indexing
+ *	- v7/v8 long-descriptor format
+ *	- Non-secure access to the SMMU
+ *	- Context fault reporting
+ *	- Extended Stream ID (16 bit)
+ */
+
+#define pr_fmt(fmt) "arm-smmu: " fmt
+
+#include <linux/acpi.h>
+#include <linux/acpi_iort.h>
+#include <linux/atomic.h>
+#include <linux/delay.h>
+#include <linux/dma-iommu.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/io-64-nonatomic-hi-lo.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/of_iommu.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include <linux/amba/bus.h>
+
+#include "io-pgtable.h"
+#include "arm-smmu-regs.h"
+
+/*
+ * Apparently, some Qualcomm arm64 platforms which appear to expose their SMMU
+ * global register space are still, in fact, using a hypervisor to mediate it
+ * by trapping and emulating register accesses. Sadly, some deployed versions
+ * of said trapping code have bugs wherein they go horribly wrong for stores
+ * using r31 (i.e. XZR/WZR) as the source register.
+ */
+#define QCOM_DUMMY_VAL -1
+
+#define ARM_MMU500_ACTLR_CPRE		(1 << 1)
+
+#define ARM_MMU500_ACR_CACHE_LOCK	(1 << 26)
+#define ARM_MMU500_ACR_S2CRB_TLBEN	(1 << 10)
+#define ARM_MMU500_ACR_SMTNMB_TLBEN	(1 << 8)
+
+#define TLB_LOOP_TIMEOUT		1000000	/* 1s! */
+#define TLB_SPIN_COUNT			10
+
+/* Maximum number of context banks per SMMU */
+#define ARM_SMMU_MAX_CBS		128
+
+/* SMMU global address space */
+#define ARM_SMMU_GR0(smmu)		((smmu)->base)
+#define ARM_SMMU_GR1(smmu)		((smmu)->base + (1 << (smmu)->pgshift))
+
+/*
+ * SMMU global address space with conditional offset to access secure
+ * aliases of non-secure registers (e.g. nsCR0: 0x400, nsGFSR: 0x448,
+ * nsGFSYNR0: 0x450)
+ */
+#define ARM_SMMU_GR0_NS(smmu)						\
+	((smmu)->base +							\
+		((smmu->options & ARM_SMMU_OPT_SECURE_CFG_ACCESS)	\
+			? 0x400 : 0))
+
+/*
+ * Some 64-bit registers only make sense to write atomically, but in such
+ * cases all the data relevant to AArch32 formats lies within the lower word,
+ * therefore this actually makes more sense than it might first appear.
+ */
+#ifdef CONFIG_64BIT
+#define smmu_write_atomic_lq		writeq_relaxed
+#else
+#define smmu_write_atomic_lq		writel_relaxed
+#endif
+
+/* Translation context bank */
+#define ARM_SMMU_CB(smmu, n)	((smmu)->cb_base + ((n) << (smmu)->pgshift))
+
+#define MSI_IOVA_BASE			0x8000000
+#define MSI_IOVA_LENGTH			0x100000
+
+static int force_stage;
+module_param(force_stage, int, S_IRUGO);
+MODULE_PARM_DESC(force_stage,
+	"Force SMMU mappings to be installed at a particular stage of translation. A value of '1' or '2' forces the corresponding stage. All other values are ignored (i.e. no stage is forced). Note that selecting a specific stage will disable support for nested translation.");
+static bool disable_bypass;
+module_param(disable_bypass, bool, S_IRUGO);
+MODULE_PARM_DESC(disable_bypass,
+	"Disable bypass streams such that incoming transactions from devices that are not attached to an iommu domain will report an abort back to the device and will not be allowed to pass through the SMMU.");
+
+enum arm_smmu_arch_version {
+	ARM_SMMU_V1,
+	ARM_SMMU_V1_64K,
+	ARM_SMMU_V2,
+};
+
+enum arm_smmu_implementation {
+	GENERIC_SMMU,
+	ARM_MMU500,
+	CAVIUM_SMMUV2,
+	QCOM_SMMUV2,
+};
+
+struct arm_smmu_s2cr {
+	struct iommu_group		*group;
+	int				count;
+	enum arm_smmu_s2cr_type		type;
+	enum arm_smmu_s2cr_privcfg	privcfg;
+	u8				cbndx;
+};
+
+#define s2cr_init_val (struct arm_smmu_s2cr){				\
+	.type = disable_bypass ? S2CR_TYPE_FAULT : S2CR_TYPE_BYPASS,	\
+}
+
+struct arm_smmu_smr {
+	u16				mask;
+	u16				id;
+	bool				valid;
+};
+
+struct arm_smmu_cb {
+	u64				ttbr[2];
+	u32				tcr[2];
+	u32				mair[2];
+	struct arm_smmu_cfg		*cfg;
+};
+
+struct arm_smmu_master_cfg {
+	struct arm_smmu_device		*smmu;
+	s16				smendx[];
+};
+#define INVALID_SMENDX			-1
+#define __fwspec_cfg(fw) ((struct arm_smmu_master_cfg *)fw->iommu_priv)
+#define fwspec_smmu(fw)  (__fwspec_cfg(fw)->smmu)
+#define fwspec_smendx(fw, i) \
+	(i >= fw->num_ids ? INVALID_SMENDX : __fwspec_cfg(fw)->smendx[i])
+#define for_each_cfg_sme(fw, i, idx) \
+	for (i = 0; idx = fwspec_smendx(fw, i), i < fw->num_ids; ++i)
+
+struct arm_smmu_device {
+	struct device			*dev;
+
+	void __iomem			*base;
+	void __iomem			*cb_base;
+	unsigned long			pgshift;
+
+#define ARM_SMMU_FEAT_COHERENT_WALK	(1 << 0)
+#define ARM_SMMU_FEAT_STREAM_MATCH	(1 << 1)
+#define ARM_SMMU_FEAT_TRANS_S1		(1 << 2)
+#define ARM_SMMU_FEAT_TRANS_S2		(1 << 3)
+#define ARM_SMMU_FEAT_TRANS_NESTED	(1 << 4)
+#define ARM_SMMU_FEAT_TRANS_OPS		(1 << 5)
+#define ARM_SMMU_FEAT_VMID16		(1 << 6)
+#define ARM_SMMU_FEAT_FMT_AARCH64_4K	(1 << 7)
+#define ARM_SMMU_FEAT_FMT_AARCH64_16K	(1 << 8)
+#define ARM_SMMU_FEAT_FMT_AARCH64_64K	(1 << 9)
+#define ARM_SMMU_FEAT_FMT_AARCH32_L	(1 << 10)
+#define ARM_SMMU_FEAT_FMT_AARCH32_S	(1 << 11)
+#define ARM_SMMU_FEAT_EXIDS		(1 << 12)
+	u32				features;
+
+#define ARM_SMMU_OPT_SECURE_CFG_ACCESS (1 << 0)
+	u32				options;
+	enum arm_smmu_arch_version	version;
+	enum arm_smmu_implementation	model;
+
+	u32				num_context_banks;
+	u32				num_s2_context_banks;
+	DECLARE_BITMAP(context_map, ARM_SMMU_MAX_CBS);
+	struct arm_smmu_cb		*cbs;
+	atomic_t			irptndx;
+
+	u32				num_mapping_groups;
+	u16				streamid_mask;
+	u16				smr_mask_mask;
+	struct arm_smmu_smr		*smrs;
+	struct arm_smmu_s2cr		*s2crs;
+	struct mutex			stream_map_mutex;
+
+	unsigned long			va_size;
+	unsigned long			ipa_size;
+	unsigned long			pa_size;
+	unsigned long			pgsize_bitmap;
+
+	u32				num_global_irqs;
+	u32				num_context_irqs;
+	unsigned int			*irqs;
+
+	u32				cavium_id_base; /* Specific to Cavium */
+
+	spinlock_t			global_sync_lock;
+
+	/* IOMMU core code handle */
+	struct iommu_device		iommu;
+};
+
+enum arm_smmu_context_fmt {
+	ARM_SMMU_CTX_FMT_NONE,
+	ARM_SMMU_CTX_FMT_AARCH64,
+	ARM_SMMU_CTX_FMT_AARCH32_L,
+	ARM_SMMU_CTX_FMT_AARCH32_S,
+};
+
+struct arm_smmu_cfg {
+	u8				cbndx;
+	u8				irptndx;
+	union {
+		u16			asid;
+		u16			vmid;
+	};
+	u32				cbar;
+	enum arm_smmu_context_fmt	fmt;
+};
+#define INVALID_IRPTNDX			0xff
+
+enum arm_smmu_domain_stage {
+	ARM_SMMU_DOMAIN_S1 = 0,
+	ARM_SMMU_DOMAIN_S2,
+	ARM_SMMU_DOMAIN_NESTED,
+	ARM_SMMU_DOMAIN_BYPASS,
+};
+
+struct arm_smmu_domain {
+	struct arm_smmu_device		*smmu;
+	struct io_pgtable_ops		*pgtbl_ops;
+	const struct iommu_gather_ops	*tlb_ops;
+	struct arm_smmu_cfg		cfg;
+	enum arm_smmu_domain_stage	stage;
+	struct mutex			init_mutex; /* Protects smmu pointer */
+	spinlock_t			cb_lock; /* Serialises ATS1* ops and TLB syncs */
+	struct iommu_domain		domain;
+};
+
+struct arm_smmu_option_prop {
+	u32 opt;
+	const char *prop;
+};
+
+static atomic_t cavium_smmu_context_count = ATOMIC_INIT(0);
+
+static bool using_legacy_binding, using_generic_binding;
+
+static struct arm_smmu_option_prop arm_smmu_options[] = {
+	{ ARM_SMMU_OPT_SECURE_CFG_ACCESS, "calxeda,smmu-secure-config-access" },
+	{ 0, NULL},
+};
+
+static struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct arm_smmu_domain, domain);
+}
+
+static void parse_driver_options(struct arm_smmu_device *smmu)
+{
+	int i = 0;
+
+	do {
+		if (of_property_read_bool(smmu->dev->of_node,
+						arm_smmu_options[i].prop)) {
+			smmu->options |= arm_smmu_options[i].opt;
+			dev_notice(smmu->dev, "option %s\n",
+				arm_smmu_options[i].prop);
+		}
+	} while (arm_smmu_options[++i].opt);
+}
+
+static struct device_node *dev_get_dev_node(struct device *dev)
+{
+	if (dev_is_pci(dev)) {
+		struct pci_bus *bus = to_pci_dev(dev)->bus;
+
+		while (!pci_is_root_bus(bus))
+			bus = bus->parent;
+		return of_node_get(bus->bridge->parent->of_node);
+	}
+
+	return of_node_get(dev->of_node);
+}
+
+static int __arm_smmu_get_pci_sid(struct pci_dev *pdev, u16 alias, void *data)
+{
+	*((__be32 *)data) = cpu_to_be32(alias);
+	return 0; /* Continue walking */
+}
+
+static int __find_legacy_master_phandle(struct device *dev, void *data)
+{
+	struct of_phandle_iterator *it = *(void **)data;
+	struct device_node *np = it->node;
+	int err;
+
+	of_for_each_phandle(it, err, dev->of_node, "mmu-masters",
+			    "#stream-id-cells", 0)
+		if (it->node == np) {
+			*(void **)data = dev;
+			return 1;
+		}
+	it->node = np;
+	return err == -ENOENT ? 0 : err;
+}
+
+static struct platform_driver arm_smmu_driver;
+static struct iommu_ops arm_smmu_ops;
+
+static int arm_smmu_register_legacy_master(struct device *dev,
+					   struct arm_smmu_device **smmu)
+{
+	struct device *smmu_dev;
+	struct device_node *np;
+	struct of_phandle_iterator it;
+	void *data = &it;
+	u32 *sids;
+	__be32 pci_sid;
+	int err;
+
+	np = dev_get_dev_node(dev);
+	if (!np || !of_find_property(np, "#stream-id-cells", NULL)) {
+		of_node_put(np);
+		return -ENODEV;
+	}
+
+	it.node = np;
+	err = driver_for_each_device(&arm_smmu_driver.driver, NULL, &data,
+				     __find_legacy_master_phandle);
+	smmu_dev = data;
+	of_node_put(np);
+	if (err == 0)
+		return -ENODEV;
+	if (err < 0)
+		return err;
+
+	if (dev_is_pci(dev)) {
+		/* "mmu-masters" assumes Stream ID == Requester ID */
+		pci_for_each_dma_alias(to_pci_dev(dev), __arm_smmu_get_pci_sid,
+				       &pci_sid);
+		it.cur = &pci_sid;
+		it.cur_count = 1;
+	}
+
+	err = iommu_fwspec_init(dev, &smmu_dev->of_node->fwnode,
+				&arm_smmu_ops);
+	if (err)
+		return err;
+
+	sids = kcalloc(it.cur_count, sizeof(*sids), GFP_KERNEL);
+	if (!sids)
+		return -ENOMEM;
+
+	*smmu = dev_get_drvdata(smmu_dev);
+	of_phandle_iterator_args(&it, sids, it.cur_count);
+	err = iommu_fwspec_add_ids(dev, sids, it.cur_count);
+	kfree(sids);
+	return err;
+}
+
+static int __arm_smmu_alloc_bitmap(unsigned long *map, int start, int end)
+{
+	int idx;
+
+	do {
+		idx = find_next_zero_bit(map, end, start);
+		if (idx == end)
+			return -ENOSPC;
+	} while (test_and_set_bit(idx, map));
+
+	return idx;
+}
+
+static void __arm_smmu_free_bitmap(unsigned long *map, int idx)
+{
+	clear_bit(idx, map);
+}
+
+/* Wait for any pending TLB invalidations to complete */
+static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu,
+				void __iomem *sync, void __iomem *status)
+{
+	unsigned int spin_cnt, delay;
+
+	writel_relaxed(QCOM_DUMMY_VAL, sync);
+	for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
+		for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
+			if (!(readl_relaxed(status) & sTLBGSTATUS_GSACTIVE))
+				return;
+			cpu_relax();
+		}
+		udelay(delay);
+	}
+	dev_err_ratelimited(smmu->dev,
+			    "TLB sync timed out -- SMMU may be deadlocked\n");
+}
+
+static void arm_smmu_tlb_sync_global(struct arm_smmu_device *smmu)
+{
+	void __iomem *base = ARM_SMMU_GR0(smmu);
+	unsigned long flags;
+
+	spin_lock_irqsave(&smmu->global_sync_lock, flags);
+	__arm_smmu_tlb_sync(smmu, base + ARM_SMMU_GR0_sTLBGSYNC,
+			    base + ARM_SMMU_GR0_sTLBGSTATUS);
+	spin_unlock_irqrestore(&smmu->global_sync_lock, flags);
+}
+
+static void arm_smmu_tlb_sync_context(void *cookie)
+{
+	struct arm_smmu_domain *smmu_domain = cookie;
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	void __iomem *base = ARM_SMMU_CB(smmu, smmu_domain->cfg.cbndx);
+	unsigned long flags;
+
+	spin_lock_irqsave(&smmu_domain->cb_lock, flags);
+	__arm_smmu_tlb_sync(smmu, base + ARM_SMMU_CB_TLBSYNC,
+			    base + ARM_SMMU_CB_TLBSTATUS);
+	spin_unlock_irqrestore(&smmu_domain->cb_lock, flags);
+}
+
+static void arm_smmu_tlb_sync_vmid(void *cookie)
+{
+	struct arm_smmu_domain *smmu_domain = cookie;
+
+	arm_smmu_tlb_sync_global(smmu_domain->smmu);
+}
+
+static void arm_smmu_tlb_inv_context_s1(void *cookie)
+{
+	struct arm_smmu_domain *smmu_domain = cookie;
+	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+	void __iomem *base = ARM_SMMU_CB(smmu_domain->smmu, cfg->cbndx);
+
+	writel_relaxed(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID);
+	arm_smmu_tlb_sync_context(cookie);
+}
+
+static void arm_smmu_tlb_inv_context_s2(void *cookie)
+{
+	struct arm_smmu_domain *smmu_domain = cookie;
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	void __iomem *base = ARM_SMMU_GR0(smmu);
+
+	writel_relaxed(smmu_domain->cfg.vmid, base + ARM_SMMU_GR0_TLBIVMID);
+	arm_smmu_tlb_sync_global(smmu);
+}
+
+static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
+					  size_t granule, bool leaf, void *cookie)
+{
+	struct arm_smmu_domain *smmu_domain = cookie;
+	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+	bool stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
+	void __iomem *reg = ARM_SMMU_CB(smmu_domain->smmu, cfg->cbndx);
+
+	if (smmu_domain->smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
+		wmb();
+
+	if (stage1) {
+		reg += leaf ? ARM_SMMU_CB_S1_TLBIVAL : ARM_SMMU_CB_S1_TLBIVA;
+
+		if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
+			iova &= ~12UL;
+			iova |= cfg->asid;
+			do {
+				writel_relaxed(iova, reg);
+				iova += granule;
+			} while (size -= granule);
+		} else {
+			iova >>= 12;
+			iova |= (u64)cfg->asid << 48;
+			do {
+				writeq_relaxed(iova, reg);
+				iova += granule >> 12;
+			} while (size -= granule);
+		}
+	} else {
+		reg += leaf ? ARM_SMMU_CB_S2_TLBIIPAS2L :
+			      ARM_SMMU_CB_S2_TLBIIPAS2;
+		iova >>= 12;
+		do {
+			smmu_write_atomic_lq(iova, reg);
+			iova += granule >> 12;
+		} while (size -= granule);
+	}
+}
+
+/*
+ * On MMU-401 at least, the cost of firing off multiple TLBIVMIDs appears
+ * almost negligible, but the benefit of getting the first one in as far ahead
+ * of the sync as possible is significant, hence we don't just make this a
+ * no-op and set .tlb_sync to arm_smmu_inv_context_s2() as you might think.
+ */
+static void arm_smmu_tlb_inv_vmid_nosync(unsigned long iova, size_t size,
+					 size_t granule, bool leaf, void *cookie)
+{
+	struct arm_smmu_domain *smmu_domain = cookie;
+	void __iomem *base = ARM_SMMU_GR0(smmu_domain->smmu);
+
+	if (smmu_domain->smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
+		wmb();
+
+	writel_relaxed(smmu_domain->cfg.vmid, base + ARM_SMMU_GR0_TLBIVMID);
+}
+
+static const struct iommu_gather_ops arm_smmu_s1_tlb_ops = {
+	.tlb_flush_all	= arm_smmu_tlb_inv_context_s1,
+	.tlb_add_flush	= arm_smmu_tlb_inv_range_nosync,
+	.tlb_sync	= arm_smmu_tlb_sync_context,
+};
+
+static const struct iommu_gather_ops arm_smmu_s2_tlb_ops_v2 = {
+	.tlb_flush_all	= arm_smmu_tlb_inv_context_s2,
+	.tlb_add_flush	= arm_smmu_tlb_inv_range_nosync,
+	.tlb_sync	= arm_smmu_tlb_sync_context,
+};
+
+static const struct iommu_gather_ops arm_smmu_s2_tlb_ops_v1 = {
+	.tlb_flush_all	= arm_smmu_tlb_inv_context_s2,
+	.tlb_add_flush	= arm_smmu_tlb_inv_vmid_nosync,
+	.tlb_sync	= arm_smmu_tlb_sync_vmid,
+};
+
+static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
+{
+	u32 fsr, fsynr;
+	unsigned long iova;
+	struct iommu_domain *domain = dev;
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	void __iomem *cb_base;
+
+	cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
+	fsr = readl_relaxed(cb_base + ARM_SMMU_CB_FSR);
+
+	if (!(fsr & FSR_FAULT))
+		return IRQ_NONE;
+
+	fsynr = readl_relaxed(cb_base + ARM_SMMU_CB_FSYNR0);
+	iova = readq_relaxed(cb_base + ARM_SMMU_CB_FAR);
+
+	dev_err_ratelimited(smmu->dev,
+	"Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cb=%d\n",
+			    fsr, iova, fsynr, cfg->cbndx);
+
+	writel(fsr, cb_base + ARM_SMMU_CB_FSR);
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t arm_smmu_global_fault(int irq, void *dev)
+{
+	u32 gfsr, gfsynr0, gfsynr1, gfsynr2;
+	struct arm_smmu_device *smmu = dev;
+	void __iomem *gr0_base = ARM_SMMU_GR0_NS(smmu);
+
+	gfsr = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSR);
+	gfsynr0 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR0);
+	gfsynr1 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR1);
+	gfsynr2 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR2);
+
+	if (!gfsr)
+		return IRQ_NONE;
+
+	dev_err_ratelimited(smmu->dev,
+		"Unexpected global fault, this could be serious\n");
+	dev_err_ratelimited(smmu->dev,
+		"\tGFSR 0x%08x, GFSYNR0 0x%08x, GFSYNR1 0x%08x, GFSYNR2 0x%08x\n",
+		gfsr, gfsynr0, gfsynr1, gfsynr2);
+
+	writel(gfsr, gr0_base + ARM_SMMU_GR0_sGFSR);
+	return IRQ_HANDLED;
+}
+
+static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain,
+				       struct io_pgtable_cfg *pgtbl_cfg)
+{
+	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+	struct arm_smmu_cb *cb = &smmu_domain->smmu->cbs[cfg->cbndx];
+	bool stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
+
+	cb->cfg = cfg;
+
+	/* TTBCR */
+	if (stage1) {
+		if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_S) {
+			cb->tcr[0] = pgtbl_cfg->arm_v7s_cfg.tcr;
+		} else {
+			cb->tcr[0] = pgtbl_cfg->arm_lpae_s1_cfg.tcr;
+			cb->tcr[1] = pgtbl_cfg->arm_lpae_s1_cfg.tcr >> 32;
+			cb->tcr[1] |= TTBCR2_SEP_UPSTREAM;
+			if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64)
+				cb->tcr[1] |= TTBCR2_AS;
+		}
+	} else {
+		cb->tcr[0] = pgtbl_cfg->arm_lpae_s2_cfg.vtcr;
+	}
+
+	/* TTBRs */
+	if (stage1) {
+		if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_S) {
+			cb->ttbr[0] = pgtbl_cfg->arm_v7s_cfg.ttbr[0];
+			cb->ttbr[1] = pgtbl_cfg->arm_v7s_cfg.ttbr[1];
+		} else {
+			cb->ttbr[0] = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[0];
+			cb->ttbr[0] |= (u64)cfg->asid << TTBRn_ASID_SHIFT;
+			cb->ttbr[1] = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[1];
+			cb->ttbr[1] |= (u64)cfg->asid << TTBRn_ASID_SHIFT;
+		}
+	} else {
+		cb->ttbr[0] = pgtbl_cfg->arm_lpae_s2_cfg.vttbr;
+	}
+
+	/* MAIRs (stage-1 only) */
+	if (stage1) {
+		if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_S) {
+			cb->mair[0] = pgtbl_cfg->arm_v7s_cfg.prrr;
+			cb->mair[1] = pgtbl_cfg->arm_v7s_cfg.nmrr;
+		} else {
+			cb->mair[0] = pgtbl_cfg->arm_lpae_s1_cfg.mair[0];
+			cb->mair[1] = pgtbl_cfg->arm_lpae_s1_cfg.mair[1];
+		}
+	}
+}
+
+static void arm_smmu_write_context_bank(struct arm_smmu_device *smmu, int idx)
+{
+	u32 reg;
+	bool stage1;
+	struct arm_smmu_cb *cb = &smmu->cbs[idx];
+	struct arm_smmu_cfg *cfg = cb->cfg;
+	void __iomem *cb_base, *gr1_base;
+
+	cb_base = ARM_SMMU_CB(smmu, idx);
+
+	/* Unassigned context banks only need disabling */
+	if (!cfg) {
+		writel_relaxed(0, cb_base + ARM_SMMU_CB_SCTLR);
+		return;
+	}
+
+	gr1_base = ARM_SMMU_GR1(smmu);
+	stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
+
+	/* CBA2R */
+	if (smmu->version > ARM_SMMU_V1) {
+		if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64)
+			reg = CBA2R_RW64_64BIT;
+		else
+			reg = CBA2R_RW64_32BIT;
+		/* 16-bit VMIDs live in CBA2R */
+		if (smmu->features & ARM_SMMU_FEAT_VMID16)
+			reg |= cfg->vmid << CBA2R_VMID_SHIFT;
+
+		writel_relaxed(reg, gr1_base + ARM_SMMU_GR1_CBA2R(idx));
+	}
+
+	/* CBAR */
+	reg = cfg->cbar;
+	if (smmu->version < ARM_SMMU_V2)
+		reg |= cfg->irptndx << CBAR_IRPTNDX_SHIFT;
+
+	/*
+	 * Use the weakest shareability/memory types, so they are
+	 * overridden by the ttbcr/pte.
+	 */
+	if (stage1) {
+		reg |= (CBAR_S1_BPSHCFG_NSH << CBAR_S1_BPSHCFG_SHIFT) |
+			(CBAR_S1_MEMATTR_WB << CBAR_S1_MEMATTR_SHIFT);
+	} else if (!(smmu->features & ARM_SMMU_FEAT_VMID16)) {
+		/* 8-bit VMIDs live in CBAR */
+		reg |= cfg->vmid << CBAR_VMID_SHIFT;
+	}
+	writel_relaxed(reg, gr1_base + ARM_SMMU_GR1_CBAR(idx));
+
+	/*
+	 * TTBCR
+	 * We must write this before the TTBRs, since it determines the
+	 * access behaviour of some fields (in particular, ASID[15:8]).
+	 */
+	if (stage1 && smmu->version > ARM_SMMU_V1)
+		writel_relaxed(cb->tcr[1], cb_base + ARM_SMMU_CB_TTBCR2);
+	writel_relaxed(cb->tcr[0], cb_base + ARM_SMMU_CB_TTBCR);
+
+	/* TTBRs */
+	if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_S) {
+		writel_relaxed(cfg->asid, cb_base + ARM_SMMU_CB_CONTEXTIDR);
+		writel_relaxed(cb->ttbr[0], cb_base + ARM_SMMU_CB_TTBR0);
+		writel_relaxed(cb->ttbr[1], cb_base + ARM_SMMU_CB_TTBR1);
+	} else {
+		writeq_relaxed(cb->ttbr[0], cb_base + ARM_SMMU_CB_TTBR0);
+		if (stage1)
+			writeq_relaxed(cb->ttbr[1], cb_base + ARM_SMMU_CB_TTBR1);
+	}
+
+	/* MAIRs (stage-1 only) */
+	if (stage1) {
+		writel_relaxed(cb->mair[0], cb_base + ARM_SMMU_CB_S1_MAIR0);
+		writel_relaxed(cb->mair[1], cb_base + ARM_SMMU_CB_S1_MAIR1);
+	}
+
+	/* SCTLR */
+	reg = SCTLR_CFIE | SCTLR_CFRE | SCTLR_AFE | SCTLR_TRE | SCTLR_M;
+	if (stage1)
+		reg |= SCTLR_S1_ASIDPNE;
+	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
+		reg |= SCTLR_E;
+
+	writel_relaxed(reg, cb_base + ARM_SMMU_CB_SCTLR);
+}
+
+static int arm_smmu_init_domain_context(struct iommu_domain *domain,
+					struct arm_smmu_device *smmu)
+{
+	int irq, start, ret = 0;
+	unsigned long ias, oas;
+	struct io_pgtable_ops *pgtbl_ops;
+	struct io_pgtable_cfg pgtbl_cfg;
+	enum io_pgtable_fmt fmt;
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+
+	mutex_lock(&smmu_domain->init_mutex);
+	if (smmu_domain->smmu)
+		goto out_unlock;
+
+	if (domain->type == IOMMU_DOMAIN_IDENTITY) {
+		smmu_domain->stage = ARM_SMMU_DOMAIN_BYPASS;
+		smmu_domain->smmu = smmu;
+		goto out_unlock;
+	}
+
+	/*
+	 * Mapping the requested stage onto what we support is surprisingly
+	 * complicated, mainly because the spec allows S1+S2 SMMUs without
+	 * support for nested translation. That means we end up with the
+	 * following table:
+	 *
+	 * Requested        Supported        Actual
+	 *     S1               N              S1
+	 *     S1             S1+S2            S1
+	 *     S1               S2             S2
+	 *     S1               S1             S1
+	 *     N                N              N
+	 *     N              S1+S2            S2
+	 *     N                S2             S2
+	 *     N                S1             S1
+	 *
+	 * Note that you can't actually request stage-2 mappings.
+	 */
+	if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S1))
+		smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
+	if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S2))
+		smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
+
+	/*
+	 * Choosing a suitable context format is even more fiddly. Until we
+	 * grow some way for the caller to express a preference, and/or move
+	 * the decision into the io-pgtable code where it arguably belongs,
+	 * just aim for the closest thing to the rest of the system, and hope
+	 * that the hardware isn't esoteric enough that we can't assume AArch64
+	 * support to be a superset of AArch32 support...
+	 */
+	if (smmu->features & ARM_SMMU_FEAT_FMT_AARCH32_L)
+		cfg->fmt = ARM_SMMU_CTX_FMT_AARCH32_L;
+	if (IS_ENABLED(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) &&
+	    !IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_ARM_LPAE) &&
+	    (smmu->features & ARM_SMMU_FEAT_FMT_AARCH32_S) &&
+	    (smmu_domain->stage == ARM_SMMU_DOMAIN_S1))
+		cfg->fmt = ARM_SMMU_CTX_FMT_AARCH32_S;
+	if ((IS_ENABLED(CONFIG_64BIT) || cfg->fmt == ARM_SMMU_CTX_FMT_NONE) &&
+	    (smmu->features & (ARM_SMMU_FEAT_FMT_AARCH64_64K |
+			       ARM_SMMU_FEAT_FMT_AARCH64_16K |
+			       ARM_SMMU_FEAT_FMT_AARCH64_4K)))
+		cfg->fmt = ARM_SMMU_CTX_FMT_AARCH64;
+
+	if (cfg->fmt == ARM_SMMU_CTX_FMT_NONE) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	switch (smmu_domain->stage) {
+	case ARM_SMMU_DOMAIN_S1:
+		cfg->cbar = CBAR_TYPE_S1_TRANS_S2_BYPASS;
+		start = smmu->num_s2_context_banks;
+		ias = smmu->va_size;
+		oas = smmu->ipa_size;
+		if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64) {
+			fmt = ARM_64_LPAE_S1;
+		} else if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_L) {
+			fmt = ARM_32_LPAE_S1;
+			ias = min(ias, 32UL);
+			oas = min(oas, 40UL);
+		} else {
+			fmt = ARM_V7S;
+			ias = min(ias, 32UL);
+			oas = min(oas, 32UL);
+		}
+		smmu_domain->tlb_ops = &arm_smmu_s1_tlb_ops;
+		break;
+	case ARM_SMMU_DOMAIN_NESTED:
+		/*
+		 * We will likely want to change this if/when KVM gets
+		 * involved.
+		 */
+	case ARM_SMMU_DOMAIN_S2:
+		cfg->cbar = CBAR_TYPE_S2_TRANS;
+		start = 0;
+		ias = smmu->ipa_size;
+		oas = smmu->pa_size;
+		if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64) {
+			fmt = ARM_64_LPAE_S2;
+		} else {
+			fmt = ARM_32_LPAE_S2;
+			ias = min(ias, 40UL);
+			oas = min(oas, 40UL);
+		}
+		if (smmu->version == ARM_SMMU_V2)
+			smmu_domain->tlb_ops = &arm_smmu_s2_tlb_ops_v2;
+		else
+			smmu_domain->tlb_ops = &arm_smmu_s2_tlb_ops_v1;
+		break;
+	default:
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+	ret = __arm_smmu_alloc_bitmap(smmu->context_map, start,
+				      smmu->num_context_banks);
+	if (ret < 0)
+		goto out_unlock;
+
+	cfg->cbndx = ret;
+	if (smmu->version < ARM_SMMU_V2) {
+		cfg->irptndx = atomic_inc_return(&smmu->irptndx);
+		cfg->irptndx %= smmu->num_context_irqs;
+	} else {
+		cfg->irptndx = cfg->cbndx;
+	}
+
+	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2)
+		cfg->vmid = cfg->cbndx + 1 + smmu->cavium_id_base;
+	else
+		cfg->asid = cfg->cbndx + smmu->cavium_id_base;
+
+	pgtbl_cfg = (struct io_pgtable_cfg) {
+		.pgsize_bitmap	= smmu->pgsize_bitmap,
+		.ias		= ias,
+		.oas		= oas,
+		.tlb		= smmu_domain->tlb_ops,
+		.iommu_dev	= smmu->dev,
+	};
+
+	if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
+		pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_NO_DMA;
+
+	smmu_domain->smmu = smmu;
+	pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
+	if (!pgtbl_ops) {
+		ret = -ENOMEM;
+		goto out_clear_smmu;
+	}
+
+	/* Update the domain's page sizes to reflect the page table format */
+	domain->pgsize_bitmap = pgtbl_cfg.pgsize_bitmap;
+	domain->geometry.aperture_end = (1UL << ias) - 1;
+	domain->geometry.force_aperture = true;
+
+	/* Initialise the context bank with our page table cfg */
+	arm_smmu_init_context_bank(smmu_domain, &pgtbl_cfg);
+	arm_smmu_write_context_bank(smmu, cfg->cbndx);
+
+	/*
+	 * Request context fault interrupt. Do this last to avoid the
+	 * handler seeing a half-initialised domain state.
+	 */
+	irq = smmu->irqs[smmu->num_global_irqs + cfg->irptndx];
+	ret = devm_request_irq(smmu->dev, irq, arm_smmu_context_fault,
+			       IRQF_SHARED, "arm-smmu-context-fault", domain);
+	if (ret < 0) {
+		dev_err(smmu->dev, "failed to request context IRQ %d (%u)\n",
+			cfg->irptndx, irq);
+		cfg->irptndx = INVALID_IRPTNDX;
+	}
+
+	mutex_unlock(&smmu_domain->init_mutex);
+
+	/* Publish page table ops for map/unmap */
+	smmu_domain->pgtbl_ops = pgtbl_ops;
+	return 0;
+
+out_clear_smmu:
+	smmu_domain->smmu = NULL;
+out_unlock:
+	mutex_unlock(&smmu_domain->init_mutex);
+	return ret;
+}
+
+static void arm_smmu_destroy_domain_context(struct iommu_domain *domain)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+	int irq;
+
+	if (!smmu || domain->type == IOMMU_DOMAIN_IDENTITY)
+		return;
+
+	/*
+	 * Disable the context bank and free the page tables before freeing
+	 * it.
+	 */
+	smmu->cbs[cfg->cbndx].cfg = NULL;
+	arm_smmu_write_context_bank(smmu, cfg->cbndx);
+
+	if (cfg->irptndx != INVALID_IRPTNDX) {
+		irq = smmu->irqs[smmu->num_global_irqs + cfg->irptndx];
+		devm_free_irq(smmu->dev, irq, domain);
+	}
+
+	free_io_pgtable_ops(smmu_domain->pgtbl_ops);
+	__arm_smmu_free_bitmap(smmu->context_map, cfg->cbndx);
+}
+
+static struct iommu_domain *arm_smmu_domain_alloc(unsigned type)
+{
+	struct arm_smmu_domain *smmu_domain;
+
+	if (type != IOMMU_DOMAIN_UNMANAGED &&
+	    type != IOMMU_DOMAIN_DMA &&
+	    type != IOMMU_DOMAIN_IDENTITY)
+		return NULL;
+	/*
+	 * Allocate the domain and initialise some of its data structures.
+	 * We can't really do anything meaningful until we've added a
+	 * master.
+	 */
+	smmu_domain = kzalloc(sizeof(*smmu_domain), GFP_KERNEL);
+	if (!smmu_domain)
+		return NULL;
+
+	if (type == IOMMU_DOMAIN_DMA && (using_legacy_binding ||
+	    iommu_get_dma_cookie(&smmu_domain->domain))) {
+		kfree(smmu_domain);
+		return NULL;
+	}
+
+	mutex_init(&smmu_domain->init_mutex);
+	spin_lock_init(&smmu_domain->cb_lock);
+
+	return &smmu_domain->domain;
+}
+
+static void arm_smmu_domain_free(struct iommu_domain *domain)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+	/*
+	 * Free the domain resources. We assume that all devices have
+	 * already been detached.
+	 */
+	iommu_put_dma_cookie(domain);
+	arm_smmu_destroy_domain_context(domain);
+	kfree(smmu_domain);
+}
+
+static void arm_smmu_write_smr(struct arm_smmu_device *smmu, int idx)
+{
+	struct arm_smmu_smr *smr = smmu->smrs + idx;
+	u32 reg = smr->id << SMR_ID_SHIFT | smr->mask << SMR_MASK_SHIFT;
+
+	if (!(smmu->features & ARM_SMMU_FEAT_EXIDS) && smr->valid)
+		reg |= SMR_VALID;
+	writel_relaxed(reg, ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_SMR(idx));
+}
+
+static void arm_smmu_write_s2cr(struct arm_smmu_device *smmu, int idx)
+{
+	struct arm_smmu_s2cr *s2cr = smmu->s2crs + idx;
+	u32 reg = (s2cr->type & S2CR_TYPE_MASK) << S2CR_TYPE_SHIFT |
+		  (s2cr->cbndx & S2CR_CBNDX_MASK) << S2CR_CBNDX_SHIFT |
+		  (s2cr->privcfg & S2CR_PRIVCFG_MASK) << S2CR_PRIVCFG_SHIFT;
+
+	if (smmu->features & ARM_SMMU_FEAT_EXIDS && smmu->smrs &&
+	    smmu->smrs[idx].valid)
+		reg |= S2CR_EXIDVALID;
+	writel_relaxed(reg, ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_S2CR(idx));
+}
+
+static void arm_smmu_write_sme(struct arm_smmu_device *smmu, int idx)
+{
+	arm_smmu_write_s2cr(smmu, idx);
+	if (smmu->smrs)
+		arm_smmu_write_smr(smmu, idx);
+}
+
+/*
+ * The width of SMR's mask field depends on sCR0_EXIDENABLE, so this function
+ * should be called after sCR0 is written.
+ */
+static void arm_smmu_test_smr_masks(struct arm_smmu_device *smmu)
+{
+	void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
+	u32 smr;
+
+	if (!smmu->smrs)
+		return;
+
+	/*
+	 * SMR.ID bits may not be preserved if the corresponding MASK
+	 * bits are set, so check each one separately. We can reject
+	 * masters later if they try to claim IDs outside these masks.
+	 */
+	smr = smmu->streamid_mask << SMR_ID_SHIFT;
+	writel_relaxed(smr, gr0_base + ARM_SMMU_GR0_SMR(0));
+	smr = readl_relaxed(gr0_base + ARM_SMMU_GR0_SMR(0));
+	smmu->streamid_mask = smr >> SMR_ID_SHIFT;
+
+	smr = smmu->streamid_mask << SMR_MASK_SHIFT;
+	writel_relaxed(smr, gr0_base + ARM_SMMU_GR0_SMR(0));
+	smr = readl_relaxed(gr0_base + ARM_SMMU_GR0_SMR(0));
+	smmu->smr_mask_mask = smr >> SMR_MASK_SHIFT;
+}
+
+static int arm_smmu_find_sme(struct arm_smmu_device *smmu, u16 id, u16 mask)
+{
+	struct arm_smmu_smr *smrs = smmu->smrs;
+	int i, free_idx = -ENOSPC;
+
+	/* Stream indexing is blissfully easy */
+	if (!smrs)
+		return id;
+
+	/* Validating SMRs is... less so */
+	for (i = 0; i < smmu->num_mapping_groups; ++i) {
+		if (!smrs[i].valid) {
+			/*
+			 * Note the first free entry we come across, which
+			 * we'll claim in the end if nothing else matches.
+			 */
+			if (free_idx < 0)
+				free_idx = i;
+			continue;
+		}
+		/*
+		 * If the new entry is _entirely_ matched by an existing entry,
+		 * then reuse that, with the guarantee that there also cannot
+		 * be any subsequent conflicting entries. In normal use we'd
+		 * expect simply identical entries for this case, but there's
+		 * no harm in accommodating the generalisation.
+		 */
+		if ((mask & smrs[i].mask) == mask &&
+		    !((id ^ smrs[i].id) & ~smrs[i].mask))
+			return i;
+		/*
+		 * If the new entry has any other overlap with an existing one,
+		 * though, then there always exists at least one stream ID
+		 * which would cause a conflict, and we can't allow that risk.
+		 */
+		if (!((id ^ smrs[i].id) & ~(smrs[i].mask | mask)))
+			return -EINVAL;
+	}
+
+	return free_idx;
+}
+
+static bool arm_smmu_free_sme(struct arm_smmu_device *smmu, int idx)
+{
+	if (--smmu->s2crs[idx].count)
+		return false;
+
+	smmu->s2crs[idx] = s2cr_init_val;
+	if (smmu->smrs)
+		smmu->smrs[idx].valid = false;
+
+	return true;
+}
+
+static int arm_smmu_master_alloc_smes(struct device *dev)
+{
+	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+	struct arm_smmu_master_cfg *cfg = fwspec->iommu_priv;
+	struct arm_smmu_device *smmu = cfg->smmu;
+	struct arm_smmu_smr *smrs = smmu->smrs;
+	struct iommu_group *group;
+	int i, idx, ret;
+
+	mutex_lock(&smmu->stream_map_mutex);
+	/* Figure out a viable stream map entry allocation */
+	for_each_cfg_sme(fwspec, i, idx) {
+		u16 sid = fwspec->ids[i];
+		u16 mask = fwspec->ids[i] >> SMR_MASK_SHIFT;
+
+		if (idx != INVALID_SMENDX) {
+			ret = -EEXIST;
+			goto out_err;
+		}
+
+		ret = arm_smmu_find_sme(smmu, sid, mask);
+		if (ret < 0)
+			goto out_err;
+
+		idx = ret;
+		if (smrs && smmu->s2crs[idx].count == 0) {
+			smrs[idx].id = sid;
+			smrs[idx].mask = mask;
+			smrs[idx].valid = true;
+		}
+		smmu->s2crs[idx].count++;
+		cfg->smendx[i] = (s16)idx;
+	}
+
+	group = iommu_group_get_for_dev(dev);
+	if (!group)
+		group = ERR_PTR(-ENOMEM);
+	if (IS_ERR(group)) {
+		ret = PTR_ERR(group);
+		goto out_err;
+	}
+	iommu_group_put(group);
+
+	/* It worked! Now, poke the actual hardware */
+	for_each_cfg_sme(fwspec, i, idx) {
+		arm_smmu_write_sme(smmu, idx);
+		smmu->s2crs[idx].group = group;
+	}
+
+	mutex_unlock(&smmu->stream_map_mutex);
+	return 0;
+
+out_err:
+	while (i--) {
+		arm_smmu_free_sme(smmu, cfg->smendx[i]);
+		cfg->smendx[i] = INVALID_SMENDX;
+	}
+	mutex_unlock(&smmu->stream_map_mutex);
+	return ret;
+}
+
+static void arm_smmu_master_free_smes(struct iommu_fwspec *fwspec)
+{
+	struct arm_smmu_device *smmu = fwspec_smmu(fwspec);
+	struct arm_smmu_master_cfg *cfg = fwspec->iommu_priv;
+	int i, idx;
+
+	mutex_lock(&smmu->stream_map_mutex);
+	for_each_cfg_sme(fwspec, i, idx) {
+		if (arm_smmu_free_sme(smmu, idx))
+			arm_smmu_write_sme(smmu, idx);
+		cfg->smendx[i] = INVALID_SMENDX;
+	}
+	mutex_unlock(&smmu->stream_map_mutex);
+}
+
+static int arm_smmu_domain_add_master(struct arm_smmu_domain *smmu_domain,
+				      struct iommu_fwspec *fwspec)
+{
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	struct arm_smmu_s2cr *s2cr = smmu->s2crs;
+	u8 cbndx = smmu_domain->cfg.cbndx;
+	enum arm_smmu_s2cr_type type;
+	int i, idx;
+
+	if (smmu_domain->stage == ARM_SMMU_DOMAIN_BYPASS)
+		type = S2CR_TYPE_BYPASS;
+	else
+		type = S2CR_TYPE_TRANS;
+
+	for_each_cfg_sme(fwspec, i, idx) {
+		if (type == s2cr[idx].type && cbndx == s2cr[idx].cbndx)
+			continue;
+
+		s2cr[idx].type = type;
+		s2cr[idx].privcfg = S2CR_PRIVCFG_DEFAULT;
+		s2cr[idx].cbndx = cbndx;
+		arm_smmu_write_s2cr(smmu, idx);
+	}
+	return 0;
+}
+
+static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
+{
+	int ret;
+	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+	struct arm_smmu_device *smmu;
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+	if (!fwspec || fwspec->ops != &arm_smmu_ops) {
+		dev_err(dev, "cannot attach to SMMU, is it on the same bus?\n");
+		return -ENXIO;
+	}
+
+	/*
+	 * FIXME: The arch/arm DMA API code tries to attach devices to its own
+	 * domains between of_xlate() and add_device() - we have no way to cope
+	 * with that, so until ARM gets converted to rely on groups and default
+	 * domains, just say no (but more politely than by dereferencing NULL).
+	 * This should be at least a WARN_ON once that's sorted.
+	 */
+	if (!fwspec->iommu_priv)
+		return -ENODEV;
+
+	smmu = fwspec_smmu(fwspec);
+	/* Ensure that the domain is finalised */
+	ret = arm_smmu_init_domain_context(domain, smmu);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Sanity check the domain. We don't support domains across
+	 * different SMMUs.
+	 */
+	if (smmu_domain->smmu != smmu) {
+		dev_err(dev,
+			"cannot attach to SMMU %s whilst already attached to domain on SMMU %s\n",
+			dev_name(smmu_domain->smmu->dev), dev_name(smmu->dev));
+		return -EINVAL;
+	}
+
+	/* Looks ok, so add the device to the domain */
+	return arm_smmu_domain_add_master(smmu_domain, fwspec);
+}
+
+static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
+			phys_addr_t paddr, size_t size, int prot)
+{
+	struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
+
+	if (!ops)
+		return -ENODEV;
+
+	return ops->map(ops, iova, paddr, size, prot);
+}
+
+static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
+			     size_t size)
+{
+	struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
+
+	if (!ops)
+		return 0;
+
+	return ops->unmap(ops, iova, size);
+}
+
+static void arm_smmu_iotlb_sync(struct iommu_domain *domain)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+	if (smmu_domain->tlb_ops)
+		smmu_domain->tlb_ops->tlb_sync(smmu_domain);
+}
+
+static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain,
+					      dma_addr_t iova)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+	struct io_pgtable_ops *ops= smmu_domain->pgtbl_ops;
+	struct device *dev = smmu->dev;
+	void __iomem *cb_base;
+	u32 tmp;
+	u64 phys;
+	unsigned long va, flags;
+
+	cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
+
+	spin_lock_irqsave(&smmu_domain->cb_lock, flags);
+	/* ATS1 registers can only be written atomically */
+	va = iova & ~0xfffUL;
+	if (smmu->version == ARM_SMMU_V2)
+		smmu_write_atomic_lq(va, cb_base + ARM_SMMU_CB_ATS1PR);
+	else /* Register is only 32-bit in v1 */
+		writel_relaxed(va, cb_base + ARM_SMMU_CB_ATS1PR);
+
+	if (readl_poll_timeout_atomic(cb_base + ARM_SMMU_CB_ATSR, tmp,
+				      !(tmp & ATSR_ACTIVE), 5, 50)) {
+		spin_unlock_irqrestore(&smmu_domain->cb_lock, flags);
+		dev_err(dev,
+			"iova to phys timed out on %pad. Falling back to software table walk.\n",
+			&iova);
+		return ops->iova_to_phys(ops, iova);
+	}
+
+	phys = readq_relaxed(cb_base + ARM_SMMU_CB_PAR);
+	spin_unlock_irqrestore(&smmu_domain->cb_lock, flags);
+	if (phys & CB_PAR_F) {
+		dev_err(dev, "translation fault!\n");
+		dev_err(dev, "PAR = 0x%llx\n", phys);
+		return 0;
+	}
+
+	return (phys & GENMASK_ULL(39, 12)) | (iova & 0xfff);
+}
+
+static phys_addr_t arm_smmu_iova_to_phys(struct iommu_domain *domain,
+					dma_addr_t iova)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
+
+	if (domain->type == IOMMU_DOMAIN_IDENTITY)
+		return iova;
+
+	if (!ops)
+		return 0;
+
+	if (smmu_domain->smmu->features & ARM_SMMU_FEAT_TRANS_OPS &&
+			smmu_domain->stage == ARM_SMMU_DOMAIN_S1)
+		return arm_smmu_iova_to_phys_hard(domain, iova);
+
+	return ops->iova_to_phys(ops, iova);
+}
+
+static bool arm_smmu_capable(enum iommu_cap cap)
+{
+	switch (cap) {
+	case IOMMU_CAP_CACHE_COHERENCY:
+		/*
+		 * Return true here as the SMMU can always send out coherent
+		 * requests.
+		 */
+		return true;
+	case IOMMU_CAP_NOEXEC:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static int arm_smmu_match_node(struct device *dev, void *data)
+{
+	return dev->fwnode == data;
+}
+
+static
+struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode)
+{
+	struct device *dev = driver_find_device(&arm_smmu_driver.driver, NULL,
+						fwnode, arm_smmu_match_node);
+	put_device(dev);
+	return dev ? dev_get_drvdata(dev) : NULL;
+}
+
+static int arm_smmu_add_device(struct device *dev)
+{
+	struct arm_smmu_device *smmu;
+	struct arm_smmu_master_cfg *cfg;
+	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+	int i, ret;
+
+	if (using_legacy_binding) {
+		ret = arm_smmu_register_legacy_master(dev, &smmu);
+
+		/*
+		 * If dev->iommu_fwspec is initally NULL, arm_smmu_register_legacy_master()
+		 * will allocate/initialise a new one. Thus we need to update fwspec for
+		 * later use.
+		 */
+		fwspec = dev->iommu_fwspec;
+		if (ret)
+			goto out_free;
+	} else if (fwspec && fwspec->ops == &arm_smmu_ops) {
+		smmu = arm_smmu_get_by_fwnode(fwspec->iommu_fwnode);
+	} else {
+		return -ENODEV;
+	}
+
+	ret = -EINVAL;
+	for (i = 0; i < fwspec->num_ids; i++) {
+		u16 sid = fwspec->ids[i];
+		u16 mask = fwspec->ids[i] >> SMR_MASK_SHIFT;
+
+		if (sid & ~smmu->streamid_mask) {
+			dev_err(dev, "stream ID 0x%x out of range for SMMU (0x%x)\n",
+				sid, smmu->streamid_mask);
+			goto out_free;
+		}
+		if (mask & ~smmu->smr_mask_mask) {
+			dev_err(dev, "SMR mask 0x%x out of range for SMMU (0x%x)\n",
+				mask, smmu->smr_mask_mask);
+			goto out_free;
+		}
+	}
+
+	ret = -ENOMEM;
+	cfg = kzalloc(offsetof(struct arm_smmu_master_cfg, smendx[i]),
+		      GFP_KERNEL);
+	if (!cfg)
+		goto out_free;
+
+	cfg->smmu = smmu;
+	fwspec->iommu_priv = cfg;
+	while (i--)
+		cfg->smendx[i] = INVALID_SMENDX;
+
+	ret = arm_smmu_master_alloc_smes(dev);
+	if (ret)
+		goto out_cfg_free;
+
+	iommu_device_link(&smmu->iommu, dev);
+
+	return 0;
+
+out_cfg_free:
+	kfree(cfg);
+out_free:
+	iommu_fwspec_free(dev);
+	return ret;
+}
+
+static void arm_smmu_remove_device(struct device *dev)
+{
+	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+	struct arm_smmu_master_cfg *cfg;
+	struct arm_smmu_device *smmu;
+
+
+	if (!fwspec || fwspec->ops != &arm_smmu_ops)
+		return;
+
+	cfg  = fwspec->iommu_priv;
+	smmu = cfg->smmu;
+
+	iommu_device_unlink(&smmu->iommu, dev);
+	arm_smmu_master_free_smes(fwspec);
+	iommu_group_remove_device(dev);
+	kfree(fwspec->iommu_priv);
+	iommu_fwspec_free(dev);
+}
+
+static struct iommu_group *arm_smmu_device_group(struct device *dev)
+{
+	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+	struct arm_smmu_device *smmu = fwspec_smmu(fwspec);
+	struct iommu_group *group = NULL;
+	int i, idx;
+
+	for_each_cfg_sme(fwspec, i, idx) {
+		if (group && smmu->s2crs[idx].group &&
+		    group != smmu->s2crs[idx].group)
+			return ERR_PTR(-EINVAL);
+
+		group = smmu->s2crs[idx].group;
+	}
+
+	if (group)
+		return iommu_group_ref_get(group);
+
+	if (dev_is_pci(dev))
+		group = pci_device_group(dev);
+	else
+		group = generic_device_group(dev);
+
+	return group;
+}
+
+static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
+				    enum iommu_attr attr, void *data)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
+		return -EINVAL;
+
+	switch (attr) {
+	case DOMAIN_ATTR_NESTING:
+		*(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
+		return 0;
+	default:
+		return -ENODEV;
+	}
+}
+
+static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
+				    enum iommu_attr attr, void *data)
+{
+	int ret = 0;
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
+		return -EINVAL;
+
+	mutex_lock(&smmu_domain->init_mutex);
+
+	switch (attr) {
+	case DOMAIN_ATTR_NESTING:
+		if (smmu_domain->smmu) {
+			ret = -EPERM;
+			goto out_unlock;
+		}
+
+		if (*(int *)data)
+			smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
+		else
+			smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
+
+		break;
+	default:
+		ret = -ENODEV;
+	}
+
+out_unlock:
+	mutex_unlock(&smmu_domain->init_mutex);
+	return ret;
+}
+
+static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
+{
+	u32 mask, fwid = 0;
+
+	if (args->args_count > 0)
+		fwid |= (u16)args->args[0];
+
+	if (args->args_count > 1)
+		fwid |= (u16)args->args[1] << SMR_MASK_SHIFT;
+	else if (!of_property_read_u32(args->np, "stream-match-mask", &mask))
+		fwid |= (u16)mask << SMR_MASK_SHIFT;
+
+	return iommu_fwspec_add_ids(dev, &fwid, 1);
+}
+
+static void arm_smmu_get_resv_regions(struct device *dev,
+				      struct list_head *head)
+{
+	struct iommu_resv_region *region;
+	int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+
+	region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH,
+					 prot, IOMMU_RESV_SW_MSI);
+	if (!region)
+		return;
+
+	list_add_tail(&region->list, head);
+
+	iommu_dma_get_resv_regions(dev, head);
+}
+
+static void arm_smmu_put_resv_regions(struct device *dev,
+				      struct list_head *head)
+{
+	struct iommu_resv_region *entry, *next;
+
+	list_for_each_entry_safe(entry, next, head, list)
+		kfree(entry);
+}
+
+static struct iommu_ops arm_smmu_ops = {
+	.capable		= arm_smmu_capable,
+	.domain_alloc		= arm_smmu_domain_alloc,
+	.domain_free		= arm_smmu_domain_free,
+	.attach_dev		= arm_smmu_attach_dev,
+	.map			= arm_smmu_map,
+	.unmap			= arm_smmu_unmap,
+	.flush_iotlb_all	= arm_smmu_iotlb_sync,
+	.iotlb_sync		= arm_smmu_iotlb_sync,
+	.iova_to_phys		= arm_smmu_iova_to_phys,
+	.add_device		= arm_smmu_add_device,
+	.remove_device		= arm_smmu_remove_device,
+	.device_group		= arm_smmu_device_group,
+	.domain_get_attr	= arm_smmu_domain_get_attr,
+	.domain_set_attr	= arm_smmu_domain_set_attr,
+	.of_xlate		= arm_smmu_of_xlate,
+	.get_resv_regions	= arm_smmu_get_resv_regions,
+	.put_resv_regions	= arm_smmu_put_resv_regions,
+	.pgsize_bitmap		= -1UL, /* Restricted during device attach */
+};
+
+static void arm_smmu_device_reset(struct arm_smmu_device *smmu)
+{
+	void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
+	int i;
+	u32 reg, major;
+
+	/* clear global FSR */
+	reg = readl_relaxed(ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sGFSR);
+	writel(reg, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sGFSR);
+
+	/*
+	 * Reset stream mapping groups: Initial values mark all SMRn as
+	 * invalid and all S2CRn as bypass unless overridden.
+	 */
+	for (i = 0; i < smmu->num_mapping_groups; ++i)
+		arm_smmu_write_sme(smmu, i);
+
+	if (smmu->model == ARM_MMU500) {
+		/*
+		 * Before clearing ARM_MMU500_ACTLR_CPRE, need to
+		 * clear CACHE_LOCK bit of ACR first. And, CACHE_LOCK
+		 * bit is only present in MMU-500r2 onwards.
+		 */
+		reg = readl_relaxed(gr0_base + ARM_SMMU_GR0_ID7);
+		major = (reg >> ID7_MAJOR_SHIFT) & ID7_MAJOR_MASK;
+		reg = readl_relaxed(gr0_base + ARM_SMMU_GR0_sACR);
+		if (major >= 2)
+			reg &= ~ARM_MMU500_ACR_CACHE_LOCK;
+		/*
+		 * Allow unmatched Stream IDs to allocate bypass
+		 * TLB entries for reduced latency.
+		 */
+		reg |= ARM_MMU500_ACR_SMTNMB_TLBEN | ARM_MMU500_ACR_S2CRB_TLBEN;
+		writel_relaxed(reg, gr0_base + ARM_SMMU_GR0_sACR);
+	}
+
+	/* Make sure all context banks are disabled and clear CB_FSR  */
+	for (i = 0; i < smmu->num_context_banks; ++i) {
+		void __iomem *cb_base = ARM_SMMU_CB(smmu, i);
+
+		arm_smmu_write_context_bank(smmu, i);
+		writel_relaxed(FSR_FAULT, cb_base + ARM_SMMU_CB_FSR);
+		/*
+		 * Disable MMU-500's not-particularly-beneficial next-page
+		 * prefetcher for the sake of errata #841119 and #826419.
+		 */
+		if (smmu->model == ARM_MMU500) {
+			reg = readl_relaxed(cb_base + ARM_SMMU_CB_ACTLR);
+			reg &= ~ARM_MMU500_ACTLR_CPRE;
+			writel_relaxed(reg, cb_base + ARM_SMMU_CB_ACTLR);
+		}
+	}
+
+	/* Invalidate the TLB, just in case */
+	writel_relaxed(QCOM_DUMMY_VAL, gr0_base + ARM_SMMU_GR0_TLBIALLH);
+	writel_relaxed(QCOM_DUMMY_VAL, gr0_base + ARM_SMMU_GR0_TLBIALLNSNH);
+
+	reg = readl_relaxed(ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0);
+
+	/* Enable fault reporting */
+	reg |= (sCR0_GFRE | sCR0_GFIE | sCR0_GCFGFRE | sCR0_GCFGFIE);
+
+	/* Disable TLB broadcasting. */
+	reg |= (sCR0_VMIDPNE | sCR0_PTM);
+
+	/* Enable client access, handling unmatched streams as appropriate */
+	reg &= ~sCR0_CLIENTPD;
+	if (disable_bypass)
+		reg |= sCR0_USFCFG;
+	else
+		reg &= ~sCR0_USFCFG;
+
+	/* Disable forced broadcasting */
+	reg &= ~sCR0_FB;
+
+	/* Don't upgrade barriers */
+	reg &= ~(sCR0_BSU_MASK << sCR0_BSU_SHIFT);
+
+	if (smmu->features & ARM_SMMU_FEAT_VMID16)
+		reg |= sCR0_VMID16EN;
+
+	if (smmu->features & ARM_SMMU_FEAT_EXIDS)
+		reg |= sCR0_EXIDENABLE;
+
+	/* Push the button */
+	arm_smmu_tlb_sync_global(smmu);
+	writel(reg, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0);
+}
+
+static int arm_smmu_id_size_to_bits(int size)
+{
+	switch (size) {
+	case 0:
+		return 32;
+	case 1:
+		return 36;
+	case 2:
+		return 40;
+	case 3:
+		return 42;
+	case 4:
+		return 44;
+	case 5:
+	default:
+		return 48;
+	}
+}
+
+static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
+{
+	unsigned long size;
+	void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
+	u32 id;
+	bool cttw_reg, cttw_fw = smmu->features & ARM_SMMU_FEAT_COHERENT_WALK;
+	int i;
+
+	dev_notice(smmu->dev, "probing hardware configuration...\n");
+	dev_notice(smmu->dev, "SMMUv%d with:\n",
+			smmu->version == ARM_SMMU_V2 ? 2 : 1);
+
+	/* ID0 */
+	id = readl_relaxed(gr0_base + ARM_SMMU_GR0_ID0);
+
+	/* Restrict available stages based on module parameter */
+	if (force_stage == 1)
+		id &= ~(ID0_S2TS | ID0_NTS);
+	else if (force_stage == 2)
+		id &= ~(ID0_S1TS | ID0_NTS);
+
+	if (id & ID0_S1TS) {
+		smmu->features |= ARM_SMMU_FEAT_TRANS_S1;
+		dev_notice(smmu->dev, "\tstage 1 translation\n");
+	}
+
+	if (id & ID0_S2TS) {
+		smmu->features |= ARM_SMMU_FEAT_TRANS_S2;
+		dev_notice(smmu->dev, "\tstage 2 translation\n");
+	}
+
+	if (id & ID0_NTS) {
+		smmu->features |= ARM_SMMU_FEAT_TRANS_NESTED;
+		dev_notice(smmu->dev, "\tnested translation\n");
+	}
+
+	if (!(smmu->features &
+		(ARM_SMMU_FEAT_TRANS_S1 | ARM_SMMU_FEAT_TRANS_S2))) {
+		dev_err(smmu->dev, "\tno translation support!\n");
+		return -ENODEV;
+	}
+
+	if ((id & ID0_S1TS) &&
+		((smmu->version < ARM_SMMU_V2) || !(id & ID0_ATOSNS))) {
+		smmu->features |= ARM_SMMU_FEAT_TRANS_OPS;
+		dev_notice(smmu->dev, "\taddress translation ops\n");
+	}
+
+	/*
+	 * In order for DMA API calls to work properly, we must defer to what
+	 * the FW says about coherency, regardless of what the hardware claims.
+	 * Fortunately, this also opens up a workaround for systems where the
+	 * ID register value has ended up configured incorrectly.
+	 */
+	cttw_reg = !!(id & ID0_CTTW);
+	if (cttw_fw || cttw_reg)
+		dev_notice(smmu->dev, "\t%scoherent table walk\n",
+			   cttw_fw ? "" : "non-");
+	if (cttw_fw != cttw_reg)
+		dev_notice(smmu->dev,
+			   "\t(IDR0.CTTW overridden by FW configuration)\n");
+
+	/* Max. number of entries we have for stream matching/indexing */
+	if (smmu->version == ARM_SMMU_V2 && id & ID0_EXIDS) {
+		smmu->features |= ARM_SMMU_FEAT_EXIDS;
+		size = 1 << 16;
+	} else {
+		size = 1 << ((id >> ID0_NUMSIDB_SHIFT) & ID0_NUMSIDB_MASK);
+	}
+	smmu->streamid_mask = size - 1;
+	if (id & ID0_SMS) {
+		smmu->features |= ARM_SMMU_FEAT_STREAM_MATCH;
+		size = (id >> ID0_NUMSMRG_SHIFT) & ID0_NUMSMRG_MASK;
+		if (size == 0) {
+			dev_err(smmu->dev,
+				"stream-matching supported, but no SMRs present!\n");
+			return -ENODEV;
+		}
+
+		/* Zero-initialised to mark as invalid */
+		smmu->smrs = devm_kcalloc(smmu->dev, size, sizeof(*smmu->smrs),
+					  GFP_KERNEL);
+		if (!smmu->smrs)
+			return -ENOMEM;
+
+		dev_notice(smmu->dev,
+			   "\tstream matching with %lu register groups", size);
+	}
+	/* s2cr->type == 0 means translation, so initialise explicitly */
+	smmu->s2crs = devm_kmalloc_array(smmu->dev, size, sizeof(*smmu->s2crs),
+					 GFP_KERNEL);
+	if (!smmu->s2crs)
+		return -ENOMEM;
+	for (i = 0; i < size; i++)
+		smmu->s2crs[i] = s2cr_init_val;
+
+	smmu->num_mapping_groups = size;
+	mutex_init(&smmu->stream_map_mutex);
+	spin_lock_init(&smmu->global_sync_lock);
+
+	if (smmu->version < ARM_SMMU_V2 || !(id & ID0_PTFS_NO_AARCH32)) {
+		smmu->features |= ARM_SMMU_FEAT_FMT_AARCH32_L;
+		if (!(id & ID0_PTFS_NO_AARCH32S))
+			smmu->features |= ARM_SMMU_FEAT_FMT_AARCH32_S;
+	}
+
+	/* ID1 */
+	id = readl_relaxed(gr0_base + ARM_SMMU_GR0_ID1);
+	smmu->pgshift = (id & ID1_PAGESIZE) ? 16 : 12;
+
+	/* Check for size mismatch of SMMU address space from mapped region */
+	size = 1 << (((id >> ID1_NUMPAGENDXB_SHIFT) & ID1_NUMPAGENDXB_MASK) + 1);
+	size <<= smmu->pgshift;
+	if (smmu->cb_base != gr0_base + size)
+		dev_warn(smmu->dev,
+			"SMMU address space size (0x%lx) differs from mapped region size (0x%tx)!\n",
+			size * 2, (smmu->cb_base - gr0_base) * 2);
+
+	smmu->num_s2_context_banks = (id >> ID1_NUMS2CB_SHIFT) & ID1_NUMS2CB_MASK;
+	smmu->num_context_banks = (id >> ID1_NUMCB_SHIFT) & ID1_NUMCB_MASK;
+	if (smmu->num_s2_context_banks > smmu->num_context_banks) {
+		dev_err(smmu->dev, "impossible number of S2 context banks!\n");
+		return -ENODEV;
+	}
+	dev_notice(smmu->dev, "\t%u context banks (%u stage-2 only)\n",
+		   smmu->num_context_banks, smmu->num_s2_context_banks);
+	/*
+	 * Cavium CN88xx erratum #27704.
+	 * Ensure ASID and VMID allocation is unique across all SMMUs in
+	 * the system.
+	 */
+	if (smmu->model == CAVIUM_SMMUV2) {
+		smmu->cavium_id_base =
+			atomic_add_return(smmu->num_context_banks,
+					  &cavium_smmu_context_count);
+		smmu->cavium_id_base -= smmu->num_context_banks;
+		dev_notice(smmu->dev, "\tenabling workaround for Cavium erratum 27704\n");
+	}
+	smmu->cbs = devm_kcalloc(smmu->dev, smmu->num_context_banks,
+				 sizeof(*smmu->cbs), GFP_KERNEL);
+	if (!smmu->cbs)
+		return -ENOMEM;
+
+	/* ID2 */
+	id = readl_relaxed(gr0_base + ARM_SMMU_GR0_ID2);
+	size = arm_smmu_id_size_to_bits((id >> ID2_IAS_SHIFT) & ID2_IAS_MASK);
+	smmu->ipa_size = size;
+
+	/* The output mask is also applied for bypass */
+	size = arm_smmu_id_size_to_bits((id >> ID2_OAS_SHIFT) & ID2_OAS_MASK);
+	smmu->pa_size = size;
+
+	if (id & ID2_VMID16)
+		smmu->features |= ARM_SMMU_FEAT_VMID16;
+
+	/*
+	 * What the page table walker can address actually depends on which
+	 * descriptor format is in use, but since a) we don't know that yet,
+	 * and b) it can vary per context bank, this will have to do...
+	 */
+	if (dma_set_mask_and_coherent(smmu->dev, DMA_BIT_MASK(size)))
+		dev_warn(smmu->dev,
+			 "failed to set DMA mask for table walker\n");
+
+	if (smmu->version < ARM_SMMU_V2) {
+		smmu->va_size = smmu->ipa_size;
+		if (smmu->version == ARM_SMMU_V1_64K)
+			smmu->features |= ARM_SMMU_FEAT_FMT_AARCH64_64K;
+	} else {
+		size = (id >> ID2_UBS_SHIFT) & ID2_UBS_MASK;
+		smmu->va_size = arm_smmu_id_size_to_bits(size);
+		if (id & ID2_PTFS_4K)
+			smmu->features |= ARM_SMMU_FEAT_FMT_AARCH64_4K;
+		if (id & ID2_PTFS_16K)
+			smmu->features |= ARM_SMMU_FEAT_FMT_AARCH64_16K;
+		if (id & ID2_PTFS_64K)
+			smmu->features |= ARM_SMMU_FEAT_FMT_AARCH64_64K;
+	}
+
+	/* Now we've corralled the various formats, what'll it do? */
+	if (smmu->features & ARM_SMMU_FEAT_FMT_AARCH32_S)
+		smmu->pgsize_bitmap |= SZ_4K | SZ_64K | SZ_1M | SZ_16M;
+	if (smmu->features &
+	    (ARM_SMMU_FEAT_FMT_AARCH32_L | ARM_SMMU_FEAT_FMT_AARCH64_4K))
+		smmu->pgsize_bitmap |= SZ_4K | SZ_2M | SZ_1G;
+	if (smmu->features & ARM_SMMU_FEAT_FMT_AARCH64_16K)
+		smmu->pgsize_bitmap |= SZ_16K | SZ_32M;
+	if (smmu->features & ARM_SMMU_FEAT_FMT_AARCH64_64K)
+		smmu->pgsize_bitmap |= SZ_64K | SZ_512M;
+
+	if (arm_smmu_ops.pgsize_bitmap == -1UL)
+		arm_smmu_ops.pgsize_bitmap = smmu->pgsize_bitmap;
+	else
+		arm_smmu_ops.pgsize_bitmap |= smmu->pgsize_bitmap;
+	dev_notice(smmu->dev, "\tSupported page sizes: 0x%08lx\n",
+		   smmu->pgsize_bitmap);
+
+
+	if (smmu->features & ARM_SMMU_FEAT_TRANS_S1)
+		dev_notice(smmu->dev, "\tStage-1: %lu-bit VA -> %lu-bit IPA\n",
+			   smmu->va_size, smmu->ipa_size);
+
+	if (smmu->features & ARM_SMMU_FEAT_TRANS_S2)
+		dev_notice(smmu->dev, "\tStage-2: %lu-bit IPA -> %lu-bit PA\n",
+			   smmu->ipa_size, smmu->pa_size);
+
+	return 0;
+}
+
+struct arm_smmu_match_data {
+	enum arm_smmu_arch_version version;
+	enum arm_smmu_implementation model;
+};
+
+#define ARM_SMMU_MATCH_DATA(name, ver, imp)	\
+static struct arm_smmu_match_data name = { .version = ver, .model = imp }
+
+ARM_SMMU_MATCH_DATA(smmu_generic_v1, ARM_SMMU_V1, GENERIC_SMMU);
+ARM_SMMU_MATCH_DATA(smmu_generic_v2, ARM_SMMU_V2, GENERIC_SMMU);
+ARM_SMMU_MATCH_DATA(arm_mmu401, ARM_SMMU_V1_64K, GENERIC_SMMU);
+ARM_SMMU_MATCH_DATA(arm_mmu500, ARM_SMMU_V2, ARM_MMU500);
+ARM_SMMU_MATCH_DATA(cavium_smmuv2, ARM_SMMU_V2, CAVIUM_SMMUV2);
+ARM_SMMU_MATCH_DATA(qcom_smmuv2, ARM_SMMU_V2, QCOM_SMMUV2);
+
+static const struct of_device_id arm_smmu_of_match[] = {
+	{ .compatible = "arm,smmu-v1", .data = &smmu_generic_v1 },
+	{ .compatible = "arm,smmu-v2", .data = &smmu_generic_v2 },
+	{ .compatible = "arm,mmu-400", .data = &smmu_generic_v1 },
+	{ .compatible = "arm,mmu-401", .data = &arm_mmu401 },
+	{ .compatible = "arm,mmu-500", .data = &arm_mmu500 },
+	{ .compatible = "cavium,smmu-v2", .data = &cavium_smmuv2 },
+	{ .compatible = "qcom,smmu-v2", .data = &qcom_smmuv2 },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, arm_smmu_of_match);
+
+#ifdef CONFIG_ACPI
+static int acpi_smmu_get_data(u32 model, struct arm_smmu_device *smmu)
+{
+	int ret = 0;
+
+	switch (model) {
+	case ACPI_IORT_SMMU_V1:
+	case ACPI_IORT_SMMU_CORELINK_MMU400:
+		smmu->version = ARM_SMMU_V1;
+		smmu->model = GENERIC_SMMU;
+		break;
+	case ACPI_IORT_SMMU_CORELINK_MMU401:
+		smmu->version = ARM_SMMU_V1_64K;
+		smmu->model = GENERIC_SMMU;
+		break;
+	case ACPI_IORT_SMMU_V2:
+		smmu->version = ARM_SMMU_V2;
+		smmu->model = GENERIC_SMMU;
+		break;
+	case ACPI_IORT_SMMU_CORELINK_MMU500:
+		smmu->version = ARM_SMMU_V2;
+		smmu->model = ARM_MMU500;
+		break;
+	case ACPI_IORT_SMMU_CAVIUM_THUNDERX:
+		smmu->version = ARM_SMMU_V2;
+		smmu->model = CAVIUM_SMMUV2;
+		break;
+	default:
+		ret = -ENODEV;
+	}
+
+	return ret;
+}
+
+static int arm_smmu_device_acpi_probe(struct platform_device *pdev,
+				      struct arm_smmu_device *smmu)
+{
+	struct device *dev = smmu->dev;
+	struct acpi_iort_node *node =
+		*(struct acpi_iort_node **)dev_get_platdata(dev);
+	struct acpi_iort_smmu *iort_smmu;
+	int ret;
+
+	/* Retrieve SMMU1/2 specific data */
+	iort_smmu = (struct acpi_iort_smmu *)node->node_data;
+
+	ret = acpi_smmu_get_data(iort_smmu->model, smmu);
+	if (ret < 0)
+		return ret;
+
+	/* Ignore the configuration access interrupt */
+	smmu->num_global_irqs = 1;
+
+	if (iort_smmu->flags & ACPI_IORT_SMMU_COHERENT_WALK)
+		smmu->features |= ARM_SMMU_FEAT_COHERENT_WALK;
+
+	return 0;
+}
+#else
+static inline int arm_smmu_device_acpi_probe(struct platform_device *pdev,
+					     struct arm_smmu_device *smmu)
+{
+	return -ENODEV;
+}
+#endif
+
+static int arm_smmu_device_dt_probe(struct platform_device *pdev,
+				    struct arm_smmu_device *smmu)
+{
+	const struct arm_smmu_match_data *data;
+	struct device *dev = &pdev->dev;
+	bool legacy_binding;
+
+	if (of_property_read_u32(dev->of_node, "#global-interrupts",
+				 &smmu->num_global_irqs)) {
+		dev_err(dev, "missing #global-interrupts property\n");
+		return -ENODEV;
+	}
+
+	data = of_device_get_match_data(dev);
+	smmu->version = data->version;
+	smmu->model = data->model;
+
+	parse_driver_options(smmu);
+
+	legacy_binding = of_find_property(dev->of_node, "mmu-masters", NULL);
+	if (legacy_binding && !using_generic_binding) {
+		if (!using_legacy_binding)
+			pr_notice("deprecated \"mmu-masters\" DT property in use; DMA API support unavailable\n");
+		using_legacy_binding = true;
+	} else if (!legacy_binding && !using_legacy_binding) {
+		using_generic_binding = true;
+	} else {
+		dev_err(dev, "not probing due to mismatched DT properties\n");
+		return -ENODEV;
+	}
+
+	if (of_dma_is_coherent(dev->of_node))
+		smmu->features |= ARM_SMMU_FEAT_COHERENT_WALK;
+
+	return 0;
+}
+
+static void arm_smmu_bus_init(void)
+{
+	/* Oh, for a proper bus abstraction */
+	if (!iommu_present(&platform_bus_type))
+		bus_set_iommu(&platform_bus_type, &arm_smmu_ops);
+#ifdef CONFIG_ARM_AMBA
+	if (!iommu_present(&amba_bustype))
+		bus_set_iommu(&amba_bustype, &arm_smmu_ops);
+#endif
+#ifdef CONFIG_PCI
+	if (!iommu_present(&pci_bus_type)) {
+		pci_request_acs();
+		bus_set_iommu(&pci_bus_type, &arm_smmu_ops);
+	}
+#endif
+}
+
+static int arm_smmu_device_probe(struct platform_device *pdev)
+{
+	struct resource *res;
+	resource_size_t ioaddr;
+	struct arm_smmu_device *smmu;
+	struct device *dev = &pdev->dev;
+	int num_irqs, i, err;
+
+	smmu = devm_kzalloc(dev, sizeof(*smmu), GFP_KERNEL);
+	if (!smmu) {
+		dev_err(dev, "failed to allocate arm_smmu_device\n");
+		return -ENOMEM;
+	}
+	smmu->dev = dev;
+
+	if (dev->of_node)
+		err = arm_smmu_device_dt_probe(pdev, smmu);
+	else
+		err = arm_smmu_device_acpi_probe(pdev, smmu);
+
+	if (err)
+		return err;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	ioaddr = res->start;
+	smmu->base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(smmu->base))
+		return PTR_ERR(smmu->base);
+	smmu->cb_base = smmu->base + resource_size(res) / 2;
+
+	num_irqs = 0;
+	while ((res = platform_get_resource(pdev, IORESOURCE_IRQ, num_irqs))) {
+		num_irqs++;
+		if (num_irqs > smmu->num_global_irqs)
+			smmu->num_context_irqs++;
+	}
+
+	if (!smmu->num_context_irqs) {
+		dev_err(dev, "found %d interrupts but expected at least %d\n",
+			num_irqs, smmu->num_global_irqs + 1);
+		return -ENODEV;
+	}
+
+	smmu->irqs = devm_kcalloc(dev, num_irqs, sizeof(*smmu->irqs),
+				  GFP_KERNEL);
+	if (!smmu->irqs) {
+		dev_err(dev, "failed to allocate %d irqs\n", num_irqs);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < num_irqs; ++i) {
+		int irq = platform_get_irq(pdev, i);
+
+		if (irq < 0) {
+			dev_err(dev, "failed to get irq index %d\n", i);
+			return -ENODEV;
+		}
+		smmu->irqs[i] = irq;
+	}
+
+	err = arm_smmu_device_cfg_probe(smmu);
+	if (err)
+		return err;
+
+	if (smmu->version == ARM_SMMU_V2) {
+		if (smmu->num_context_banks > smmu->num_context_irqs) {
+			dev_err(dev,
+			      "found only %d context irq(s) but %d required\n",
+			      smmu->num_context_irqs, smmu->num_context_banks);
+			return -ENODEV;
+		}
+
+		/* Ignore superfluous interrupts */
+		smmu->num_context_irqs = smmu->num_context_banks;
+	}
+
+	for (i = 0; i < smmu->num_global_irqs; ++i) {
+		err = devm_request_irq(smmu->dev, smmu->irqs[i],
+				       arm_smmu_global_fault,
+				       IRQF_SHARED,
+				       "arm-smmu global fault",
+				       smmu);
+		if (err) {
+			dev_err(dev, "failed to request global IRQ %d (%u)\n",
+				i, smmu->irqs[i]);
+			return err;
+		}
+	}
+
+	err = iommu_device_sysfs_add(&smmu->iommu, smmu->dev, NULL,
+				     "smmu.%pa", &ioaddr);
+	if (err) {
+		dev_err(dev, "Failed to register iommu in sysfs\n");
+		return err;
+	}
+
+	iommu_device_set_ops(&smmu->iommu, &arm_smmu_ops);
+	iommu_device_set_fwnode(&smmu->iommu, dev->fwnode);
+
+	err = iommu_device_register(&smmu->iommu);
+	if (err) {
+		dev_err(dev, "Failed to register iommu\n");
+		return err;
+	}
+
+	platform_set_drvdata(pdev, smmu);
+	arm_smmu_device_reset(smmu);
+	arm_smmu_test_smr_masks(smmu);
+
+	/*
+	 * For ACPI and generic DT bindings, an SMMU will be probed before
+	 * any device which might need it, so we want the bus ops in place
+	 * ready to handle default domain setup as soon as any SMMU exists.
+	 */
+	if (!using_legacy_binding)
+		arm_smmu_bus_init();
+
+	return 0;
+}
+
+/*
+ * With the legacy DT binding in play, though, we have no guarantees about
+ * probe order, but then we're also not doing default domains, so we can
+ * delay setting bus ops until we're sure every possible SMMU is ready,
+ * and that way ensure that no add_device() calls get missed.
+ */
+static int arm_smmu_legacy_bus_init(void)
+{
+	if (using_legacy_binding)
+		arm_smmu_bus_init();
+	return 0;
+}
+device_initcall_sync(arm_smmu_legacy_bus_init);
+
+static int arm_smmu_device_remove(struct platform_device *pdev)
+{
+	struct arm_smmu_device *smmu = platform_get_drvdata(pdev);
+
+	if (!smmu)
+		return -ENODEV;
+
+	if (!bitmap_empty(smmu->context_map, ARM_SMMU_MAX_CBS))
+		dev_err(&pdev->dev, "removing device with active domains!\n");
+
+	/* Turn the thing off */
+	writel(sCR0_CLIENTPD, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0);
+	return 0;
+}
+
+static void arm_smmu_device_shutdown(struct platform_device *pdev)
+{
+	arm_smmu_device_remove(pdev);
+}
+
+static int __maybe_unused arm_smmu_pm_resume(struct device *dev)
+{
+	struct arm_smmu_device *smmu = dev_get_drvdata(dev);
+
+	arm_smmu_device_reset(smmu);
+	return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(arm_smmu_pm_ops, NULL, arm_smmu_pm_resume);
+
+static struct platform_driver arm_smmu_driver = {
+	.driver	= {
+		.name		= "arm-smmu",
+		.of_match_table	= of_match_ptr(arm_smmu_of_match),
+		.pm		= &arm_smmu_pm_ops,
+	},
+	.probe	= arm_smmu_device_probe,
+	.remove	= arm_smmu_device_remove,
+	.shutdown = arm_smmu_device_shutdown,
+};
+module_platform_driver(arm_smmu_driver);
+
+MODULE_DESCRIPTION("IOMMU API for ARM architected SMMU implementations");
+MODULE_AUTHOR("Will Deacon <will.deacon@arm.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
new file mode 100644
index 000000000..31ba8f925
--- /dev/null
+++ b/drivers/iommu/dma-iommu.c
@@ -0,0 +1,906 @@
+/*
+ * A fairly generic DMA-API to IOMMU-API glue layer.
+ *
+ * Copyright (C) 2014-2015 ARM Ltd.
+ *
+ * based in part on arch/arm/mm/dma-mapping.c:
+ * Copyright (C) 2000-2004 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/acpi_iort.h>
+#include <linux/device.h>
+#include <linux/dma-iommu.h>
+#include <linux/gfp.h>
+#include <linux/huge_mm.h>
+#include <linux/iommu.h>
+#include <linux/iova.h>
+#include <linux/irq.h>
+#include <linux/mm.h>
+#include <linux/pci.h>
+#include <linux/scatterlist.h>
+#include <linux/vmalloc.h>
+
+#define IOMMU_MAPPING_ERROR	0
+
+struct iommu_dma_msi_page {
+	struct list_head	list;
+	dma_addr_t		iova;
+	phys_addr_t		phys;
+};
+
+enum iommu_dma_cookie_type {
+	IOMMU_DMA_IOVA_COOKIE,
+	IOMMU_DMA_MSI_COOKIE,
+};
+
+struct iommu_dma_cookie {
+	enum iommu_dma_cookie_type	type;
+	union {
+		/* Full allocator for IOMMU_DMA_IOVA_COOKIE */
+		struct iova_domain	iovad;
+		/* Trivial linear page allocator for IOMMU_DMA_MSI_COOKIE */
+		dma_addr_t		msi_iova;
+	};
+	struct list_head		msi_page_list;
+	spinlock_t			msi_lock;
+};
+
+static inline size_t cookie_msi_granule(struct iommu_dma_cookie *cookie)
+{
+	if (cookie->type == IOMMU_DMA_IOVA_COOKIE)
+		return cookie->iovad.granule;
+	return PAGE_SIZE;
+}
+
+static struct iommu_dma_cookie *cookie_alloc(enum iommu_dma_cookie_type type)
+{
+	struct iommu_dma_cookie *cookie;
+
+	cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);
+	if (cookie) {
+		spin_lock_init(&cookie->msi_lock);
+		INIT_LIST_HEAD(&cookie->msi_page_list);
+		cookie->type = type;
+	}
+	return cookie;
+}
+
+int iommu_dma_init(void)
+{
+	return iova_cache_get();
+}
+
+/**
+ * iommu_get_dma_cookie - Acquire DMA-API resources for a domain
+ * @domain: IOMMU domain to prepare for DMA-API usage
+ *
+ * IOMMU drivers should normally call this from their domain_alloc
+ * callback when domain->type == IOMMU_DOMAIN_DMA.
+ */
+int iommu_get_dma_cookie(struct iommu_domain *domain)
+{
+	if (domain->iova_cookie)
+		return -EEXIST;
+
+	domain->iova_cookie = cookie_alloc(IOMMU_DMA_IOVA_COOKIE);
+	if (!domain->iova_cookie)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL(iommu_get_dma_cookie);
+
+/**
+ * iommu_get_msi_cookie - Acquire just MSI remapping resources
+ * @domain: IOMMU domain to prepare
+ * @base: Start address of IOVA region for MSI mappings
+ *
+ * Users who manage their own IOVA allocation and do not want DMA API support,
+ * but would still like to take advantage of automatic MSI remapping, can use
+ * this to initialise their own domain appropriately. Users should reserve a
+ * contiguous IOVA region, starting at @base, large enough to accommodate the
+ * number of PAGE_SIZE mappings necessary to cover every MSI doorbell address
+ * used by the devices attached to @domain.
+ */
+int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
+{
+	struct iommu_dma_cookie *cookie;
+
+	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
+		return -EINVAL;
+
+	if (domain->iova_cookie)
+		return -EEXIST;
+
+	cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE);
+	if (!cookie)
+		return -ENOMEM;
+
+	cookie->msi_iova = base;
+	domain->iova_cookie = cookie;
+	return 0;
+}
+EXPORT_SYMBOL(iommu_get_msi_cookie);
+
+/**
+ * iommu_put_dma_cookie - Release a domain's DMA mapping resources
+ * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() or
+ *          iommu_get_msi_cookie()
+ *
+ * IOMMU drivers should normally call this from their domain_free callback.
+ */
+void iommu_put_dma_cookie(struct iommu_domain *domain)
+{
+	struct iommu_dma_cookie *cookie = domain->iova_cookie;
+	struct iommu_dma_msi_page *msi, *tmp;
+
+	if (!cookie)
+		return;
+
+	if (cookie->type == IOMMU_DMA_IOVA_COOKIE && cookie->iovad.granule)
+		put_iova_domain(&cookie->iovad);
+
+	list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list) {
+		list_del(&msi->list);
+		kfree(msi);
+	}
+	kfree(cookie);
+	domain->iova_cookie = NULL;
+}
+EXPORT_SYMBOL(iommu_put_dma_cookie);
+
+/**
+ * iommu_dma_get_resv_regions - Reserved region driver helper
+ * @dev: Device from iommu_get_resv_regions()
+ * @list: Reserved region list from iommu_get_resv_regions()
+ *
+ * IOMMU drivers can use this to implement their .get_resv_regions callback
+ * for general non-IOMMU-specific reservations. Currently, this covers GICv3
+ * ITS region reservation on ACPI based ARM platforms that may require HW MSI
+ * reservation.
+ */
+void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
+{
+
+	if (!is_of_node(dev->iommu_fwspec->iommu_fwnode))
+		iort_iommu_msi_get_resv_regions(dev, list);
+
+}
+EXPORT_SYMBOL(iommu_dma_get_resv_regions);
+
+static int cookie_init_hw_msi_region(struct iommu_dma_cookie *cookie,
+		phys_addr_t start, phys_addr_t end)
+{
+	struct iova_domain *iovad = &cookie->iovad;
+	struct iommu_dma_msi_page *msi_page;
+	int i, num_pages;
+
+	start -= iova_offset(iovad, start);
+	num_pages = iova_align(iovad, end - start) >> iova_shift(iovad);
+
+	for (i = 0; i < num_pages; i++) {
+		msi_page = kmalloc(sizeof(*msi_page), GFP_KERNEL);
+		if (!msi_page)
+			return -ENOMEM;
+
+		msi_page->phys = start;
+		msi_page->iova = start;
+		INIT_LIST_HEAD(&msi_page->list);
+		list_add(&msi_page->list, &cookie->msi_page_list);
+		start += iovad->granule;
+	}
+
+	return 0;
+}
+
+static void iova_reserve_pci_windows(struct pci_dev *dev,
+		struct iova_domain *iovad)
+{
+	struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus);
+	struct resource_entry *window;
+	unsigned long lo, hi;
+
+	resource_list_for_each_entry(window, &bridge->windows) {
+		if (resource_type(window->res) != IORESOURCE_MEM)
+			continue;
+
+		lo = iova_pfn(iovad, window->res->start - window->offset);
+		hi = iova_pfn(iovad, window->res->end - window->offset);
+		reserve_iova(iovad, lo, hi);
+	}
+}
+
+static int iova_reserve_iommu_regions(struct device *dev,
+		struct iommu_domain *domain)
+{
+	struct iommu_dma_cookie *cookie = domain->iova_cookie;
+	struct iova_domain *iovad = &cookie->iovad;
+	struct iommu_resv_region *region;
+	LIST_HEAD(resv_regions);
+	int ret = 0;
+
+	if (dev_is_pci(dev))
+		iova_reserve_pci_windows(to_pci_dev(dev), iovad);
+
+	iommu_get_resv_regions(dev, &resv_regions);
+	list_for_each_entry(region, &resv_regions, list) {
+		unsigned long lo, hi;
+
+		/* We ARE the software that manages these! */
+		if (region->type == IOMMU_RESV_SW_MSI)
+			continue;
+
+		lo = iova_pfn(iovad, region->start);
+		hi = iova_pfn(iovad, region->start + region->length - 1);
+		reserve_iova(iovad, lo, hi);
+
+		if (region->type == IOMMU_RESV_MSI)
+			ret = cookie_init_hw_msi_region(cookie, region->start,
+					region->start + region->length);
+		if (ret)
+			break;
+	}
+	iommu_put_resv_regions(dev, &resv_regions);
+
+	return ret;
+}
+
+/**
+ * iommu_dma_init_domain - Initialise a DMA mapping domain
+ * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
+ * @base: IOVA at which the mappable address space starts
+ * @size: Size of IOVA space
+ * @dev: Device the domain is being initialised for
+ *
+ * @base and @size should be exact multiples of IOMMU page granularity to
+ * avoid rounding surprises. If necessary, we reserve the page at address 0
+ * to ensure it is an invalid IOVA. It is safe to reinitialise a domain, but
+ * any change which could make prior IOVAs invalid will fail.
+ */
+int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
+		u64 size, struct device *dev)
+{
+	struct iommu_dma_cookie *cookie = domain->iova_cookie;
+	struct iova_domain *iovad = &cookie->iovad;
+	unsigned long order, base_pfn, end_pfn;
+
+	if (!cookie || cookie->type != IOMMU_DMA_IOVA_COOKIE)
+		return -EINVAL;
+
+	/* Use the smallest supported page size for IOVA granularity */
+	order = __ffs(domain->pgsize_bitmap);
+	base_pfn = max_t(unsigned long, 1, base >> order);
+	end_pfn = (base + size - 1) >> order;
+
+	/* Check the domain allows at least some access to the device... */
+	if (domain->geometry.force_aperture) {
+		if (base > domain->geometry.aperture_end ||
+		    base + size <= domain->geometry.aperture_start) {
+			pr_warn("specified DMA range outside IOMMU capability\n");
+			return -EFAULT;
+		}
+		/* ...then finally give it a kicking to make sure it fits */
+		base_pfn = max_t(unsigned long, base_pfn,
+				domain->geometry.aperture_start >> order);
+	}
+
+	/* start_pfn is always nonzero for an already-initialised domain */
+	if (iovad->start_pfn) {
+		if (1UL << order != iovad->granule ||
+		    base_pfn != iovad->start_pfn) {
+			pr_warn("Incompatible range for DMA domain\n");
+			return -EFAULT;
+		}
+
+		return 0;
+	}
+
+	init_iova_domain(iovad, 1UL << order, base_pfn);
+	if (!dev)
+		return 0;
+
+	return iova_reserve_iommu_regions(dev, domain);
+}
+EXPORT_SYMBOL(iommu_dma_init_domain);
+
+/**
+ * dma_info_to_prot - Translate DMA API directions and attributes to IOMMU API
+ *                    page flags.
+ * @dir: Direction of DMA transfer
+ * @coherent: Is the DMA master cache-coherent?
+ * @attrs: DMA attributes for the mapping
+ *
+ * Return: corresponding IOMMU API page protection flags
+ */
+int dma_info_to_prot(enum dma_data_direction dir, bool coherent,
+		     unsigned long attrs)
+{
+	int prot = coherent ? IOMMU_CACHE : 0;
+
+	if (attrs & DMA_ATTR_PRIVILEGED)
+		prot |= IOMMU_PRIV;
+
+	switch (dir) {
+	case DMA_BIDIRECTIONAL:
+		return prot | IOMMU_READ | IOMMU_WRITE;
+	case DMA_TO_DEVICE:
+		return prot | IOMMU_READ;
+	case DMA_FROM_DEVICE:
+		return prot | IOMMU_WRITE;
+	default:
+		return 0;
+	}
+}
+
+static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
+		size_t size, dma_addr_t dma_limit, struct device *dev)
+{
+	struct iommu_dma_cookie *cookie = domain->iova_cookie;
+	struct iova_domain *iovad = &cookie->iovad;
+	unsigned long shift, iova_len, iova = 0;
+
+	if (cookie->type == IOMMU_DMA_MSI_COOKIE) {
+		cookie->msi_iova += size;
+		return cookie->msi_iova - size;
+	}
+
+	shift = iova_shift(iovad);
+	iova_len = size >> shift;
+	/*
+	 * Freeing non-power-of-two-sized allocations back into the IOVA caches
+	 * will come back to bite us badly, so we have to waste a bit of space
+	 * rounding up anything cacheable to make sure that can't happen. The
+	 * order of the unadjusted size will still match upon freeing.
+	 */
+	if (iova_len < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
+		iova_len = roundup_pow_of_two(iova_len);
+
+	if (dev->bus_dma_mask)
+		dma_limit &= dev->bus_dma_mask;
+
+	if (domain->geometry.force_aperture)
+		dma_limit = min(dma_limit, domain->geometry.aperture_end);
+
+	/* Try to get PCI devices a SAC address */
+	if (dma_limit > DMA_BIT_MASK(32) && dev_is_pci(dev))
+		iova = alloc_iova_fast(iovad, iova_len,
+				       DMA_BIT_MASK(32) >> shift, false);
+
+	if (!iova)
+		iova = alloc_iova_fast(iovad, iova_len, dma_limit >> shift,
+				       true);
+
+	return (dma_addr_t)iova << shift;
+}
+
+static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie,
+		dma_addr_t iova, size_t size)
+{
+	struct iova_domain *iovad = &cookie->iovad;
+
+	/* The MSI case is only ever cleaning up its most recent allocation */
+	if (cookie->type == IOMMU_DMA_MSI_COOKIE)
+		cookie->msi_iova -= size;
+	else
+		free_iova_fast(iovad, iova_pfn(iovad, iova),
+				size >> iova_shift(iovad));
+}
+
+static void __iommu_dma_unmap(struct iommu_domain *domain, dma_addr_t dma_addr,
+		size_t size)
+{
+	struct iommu_dma_cookie *cookie = domain->iova_cookie;
+	struct iova_domain *iovad = &cookie->iovad;
+	size_t iova_off = iova_offset(iovad, dma_addr);
+
+	dma_addr -= iova_off;
+	size = iova_align(iovad, size + iova_off);
+
+	WARN_ON(iommu_unmap(domain, dma_addr, size) != size);
+	iommu_dma_free_iova(cookie, dma_addr, size);
+}
+
+static void __iommu_dma_free_pages(struct page **pages, int count)
+{
+	while (count--)
+		__free_page(pages[count]);
+	kvfree(pages);
+}
+
+static struct page **__iommu_dma_alloc_pages(unsigned int count,
+		unsigned long order_mask, gfp_t gfp)
+{
+	struct page **pages;
+	unsigned int i = 0, array_size = count * sizeof(*pages);
+
+	order_mask &= (2U << MAX_ORDER) - 1;
+	if (!order_mask)
+		return NULL;
+
+	if (array_size <= PAGE_SIZE)
+		pages = kzalloc(array_size, GFP_KERNEL);
+	else
+		pages = vzalloc(array_size);
+	if (!pages)
+		return NULL;
+
+	/* IOMMU can map any pages, so himem can also be used here */
+	gfp |= __GFP_NOWARN | __GFP_HIGHMEM;
+
+	while (count) {
+		struct page *page = NULL;
+		unsigned int order_size;
+
+		/*
+		 * Higher-order allocations are a convenience rather
+		 * than a necessity, hence using __GFP_NORETRY until
+		 * falling back to minimum-order allocations.
+		 */
+		for (order_mask &= (2U << __fls(count)) - 1;
+		     order_mask; order_mask &= ~order_size) {
+			unsigned int order = __fls(order_mask);
+
+			order_size = 1U << order;
+			page = alloc_pages((order_mask - order_size) ?
+					   gfp | __GFP_NORETRY : gfp, order);
+			if (!page)
+				continue;
+			if (!order)
+				break;
+			if (!PageCompound(page)) {
+				split_page(page, order);
+				break;
+			} else if (!split_huge_page(page)) {
+				break;
+			}
+			__free_pages(page, order);
+		}
+		if (!page) {
+			__iommu_dma_free_pages(pages, i);
+			return NULL;
+		}
+		count -= order_size;
+		while (order_size--)
+			pages[i++] = page++;
+	}
+	return pages;
+}
+
+/**
+ * iommu_dma_free - Free a buffer allocated by iommu_dma_alloc()
+ * @dev: Device which owns this buffer
+ * @pages: Array of buffer pages as returned by iommu_dma_alloc()
+ * @size: Size of buffer in bytes
+ * @handle: DMA address of buffer
+ *
+ * Frees both the pages associated with the buffer, and the array
+ * describing them
+ */
+void iommu_dma_free(struct device *dev, struct page **pages, size_t size,
+		dma_addr_t *handle)
+{
+	__iommu_dma_unmap(iommu_get_domain_for_dev(dev), *handle, size);
+	__iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
+	*handle = IOMMU_MAPPING_ERROR;
+}
+
+/**
+ * iommu_dma_alloc - Allocate and map a buffer contiguous in IOVA space
+ * @dev: Device to allocate memory for. Must be a real device
+ *	 attached to an iommu_dma_domain
+ * @size: Size of buffer in bytes
+ * @gfp: Allocation flags
+ * @attrs: DMA attributes for this allocation
+ * @prot: IOMMU mapping flags
+ * @handle: Out argument for allocated DMA handle
+ * @flush_page: Arch callback which must ensure PAGE_SIZE bytes from the
+ *		given VA/PA are visible to the given non-coherent device.
+ *
+ * If @size is less than PAGE_SIZE, then a full CPU page will be allocated,
+ * but an IOMMU which supports smaller pages might not map the whole thing.
+ *
+ * Return: Array of struct page pointers describing the buffer,
+ *	   or NULL on failure.
+ */
+struct page **iommu_dma_alloc(struct device *dev, size_t size, gfp_t gfp,
+		unsigned long attrs, int prot, dma_addr_t *handle,
+		void (*flush_page)(struct device *, const void *, phys_addr_t))
+{
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+	struct iommu_dma_cookie *cookie = domain->iova_cookie;
+	struct iova_domain *iovad = &cookie->iovad;
+	struct page **pages;
+	struct sg_table sgt;
+	dma_addr_t iova;
+	unsigned int count, min_size, alloc_sizes = domain->pgsize_bitmap;
+
+	*handle = IOMMU_MAPPING_ERROR;
+
+	min_size = alloc_sizes & -alloc_sizes;
+	if (min_size < PAGE_SIZE) {
+		min_size = PAGE_SIZE;
+		alloc_sizes |= PAGE_SIZE;
+	} else {
+		size = ALIGN(size, min_size);
+	}
+	if (attrs & DMA_ATTR_ALLOC_SINGLE_PAGES)
+		alloc_sizes = min_size;
+
+	count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	pages = __iommu_dma_alloc_pages(count, alloc_sizes >> PAGE_SHIFT, gfp);
+	if (!pages)
+		return NULL;
+
+	size = iova_align(iovad, size);
+	iova = iommu_dma_alloc_iova(domain, size, dev->coherent_dma_mask, dev);
+	if (!iova)
+		goto out_free_pages;
+
+	if (sg_alloc_table_from_pages(&sgt, pages, count, 0, size, GFP_KERNEL))
+		goto out_free_iova;
+
+	if (!(prot & IOMMU_CACHE)) {
+		struct sg_mapping_iter miter;
+		/*
+		 * The CPU-centric flushing implied by SG_MITER_TO_SG isn't
+		 * sufficient here, so skip it by using the "wrong" direction.
+		 */
+		sg_miter_start(&miter, sgt.sgl, sgt.orig_nents, SG_MITER_FROM_SG);
+		while (sg_miter_next(&miter))
+			flush_page(dev, miter.addr, page_to_phys(miter.page));
+		sg_miter_stop(&miter);
+	}
+
+	if (iommu_map_sg(domain, iova, sgt.sgl, sgt.orig_nents, prot)
+			< size)
+		goto out_free_sg;
+
+	*handle = iova;
+	sg_free_table(&sgt);
+	return pages;
+
+out_free_sg:
+	sg_free_table(&sgt);
+out_free_iova:
+	iommu_dma_free_iova(cookie, iova, size);
+out_free_pages:
+	__iommu_dma_free_pages(pages, count);
+	return NULL;
+}
+
+/**
+ * iommu_dma_mmap - Map a buffer into provided user VMA
+ * @pages: Array representing buffer from iommu_dma_alloc()
+ * @size: Size of buffer in bytes
+ * @vma: VMA describing requested userspace mapping
+ *
+ * Maps the pages of the buffer in @pages into @vma. The caller is responsible
+ * for verifying the correct size and protection of @vma beforehand.
+ */
+
+int iommu_dma_mmap(struct page **pages, size_t size, struct vm_area_struct *vma)
+{
+	unsigned long uaddr = vma->vm_start;
+	unsigned int i, count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	int ret = -ENXIO;
+
+	for (i = vma->vm_pgoff; i < count && uaddr < vma->vm_end; i++) {
+		ret = vm_insert_page(vma, uaddr, pages[i]);
+		if (ret)
+			break;
+		uaddr += PAGE_SIZE;
+	}
+	return ret;
+}
+
+static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
+		size_t size, int prot)
+{
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+	struct iommu_dma_cookie *cookie = domain->iova_cookie;
+	size_t iova_off = 0;
+	dma_addr_t iova;
+
+	if (cookie->type == IOMMU_DMA_IOVA_COOKIE) {
+		iova_off = iova_offset(&cookie->iovad, phys);
+		size = iova_align(&cookie->iovad, size + iova_off);
+	}
+
+	iova = iommu_dma_alloc_iova(domain, size, dma_get_mask(dev), dev);
+	if (!iova)
+		return IOMMU_MAPPING_ERROR;
+
+	if (iommu_map(domain, iova, phys - iova_off, size, prot)) {
+		iommu_dma_free_iova(cookie, iova, size);
+		return IOMMU_MAPPING_ERROR;
+	}
+	return iova + iova_off;
+}
+
+dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
+		unsigned long offset, size_t size, int prot)
+{
+	return __iommu_dma_map(dev, page_to_phys(page) + offset, size, prot);
+}
+
+void iommu_dma_unmap_page(struct device *dev, dma_addr_t handle, size_t size,
+		enum dma_data_direction dir, unsigned long attrs)
+{
+	__iommu_dma_unmap(iommu_get_domain_for_dev(dev), handle, size);
+}
+
+/*
+ * Prepare a successfully-mapped scatterlist to give back to the caller.
+ *
+ * At this point the segments are already laid out by iommu_dma_map_sg() to
+ * avoid individually crossing any boundaries, so we merely need to check a
+ * segment's start address to avoid concatenating across one.
+ */
+static int __finalise_sg(struct device *dev, struct scatterlist *sg, int nents,
+		dma_addr_t dma_addr)
+{
+	struct scatterlist *s, *cur = sg;
+	unsigned long seg_mask = dma_get_seg_boundary(dev);
+	unsigned int cur_len = 0, max_len = dma_get_max_seg_size(dev);
+	int i, count = 0;
+
+	for_each_sg(sg, s, nents, i) {
+		/* Restore this segment's original unaligned fields first */
+		unsigned int s_iova_off = sg_dma_address(s);
+		unsigned int s_length = sg_dma_len(s);
+		unsigned int s_iova_len = s->length;
+
+		s->offset += s_iova_off;
+		s->length = s_length;
+		sg_dma_address(s) = IOMMU_MAPPING_ERROR;
+		sg_dma_len(s) = 0;
+
+		/*
+		 * Now fill in the real DMA data. If...
+		 * - there is a valid output segment to append to
+		 * - and this segment starts on an IOVA page boundary
+		 * - but doesn't fall at a segment boundary
+		 * - and wouldn't make the resulting output segment too long
+		 */
+		if (cur_len && !s_iova_off && (dma_addr & seg_mask) &&
+		    (max_len - cur_len >= s_length)) {
+			/* ...then concatenate it with the previous one */
+			cur_len += s_length;
+		} else {
+			/* Otherwise start the next output segment */
+			if (i > 0)
+				cur = sg_next(cur);
+			cur_len = s_length;
+			count++;
+
+			sg_dma_address(cur) = dma_addr + s_iova_off;
+		}
+
+		sg_dma_len(cur) = cur_len;
+		dma_addr += s_iova_len;
+
+		if (s_length + s_iova_off < s_iova_len)
+			cur_len = 0;
+	}
+	return count;
+}
+
+/*
+ * If mapping failed, then just restore the original list,
+ * but making sure the DMA fields are invalidated.
+ */
+static void __invalidate_sg(struct scatterlist *sg, int nents)
+{
+	struct scatterlist *s;
+	int i;
+
+	for_each_sg(sg, s, nents, i) {
+		if (sg_dma_address(s) != IOMMU_MAPPING_ERROR)
+			s->offset += sg_dma_address(s);
+		if (sg_dma_len(s))
+			s->length = sg_dma_len(s);
+		sg_dma_address(s) = IOMMU_MAPPING_ERROR;
+		sg_dma_len(s) = 0;
+	}
+}
+
+/*
+ * The DMA API client is passing in a scatterlist which could describe
+ * any old buffer layout, but the IOMMU API requires everything to be
+ * aligned to IOMMU pages. Hence the need for this complicated bit of
+ * impedance-matching, to be able to hand off a suitably-aligned list,
+ * but still preserve the original offsets and sizes for the caller.
+ */
+int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
+		int nents, int prot)
+{
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+	struct iommu_dma_cookie *cookie = domain->iova_cookie;
+	struct iova_domain *iovad = &cookie->iovad;
+	struct scatterlist *s, *prev = NULL;
+	dma_addr_t iova;
+	size_t iova_len = 0;
+	unsigned long mask = dma_get_seg_boundary(dev);
+	int i;
+
+	/*
+	 * Work out how much IOVA space we need, and align the segments to
+	 * IOVA granules for the IOMMU driver to handle. With some clever
+	 * trickery we can modify the list in-place, but reversibly, by
+	 * stashing the unaligned parts in the as-yet-unused DMA fields.
+	 */
+	for_each_sg(sg, s, nents, i) {
+		size_t s_iova_off = iova_offset(iovad, s->offset);
+		size_t s_length = s->length;
+		size_t pad_len = (mask - iova_len + 1) & mask;
+
+		sg_dma_address(s) = s_iova_off;
+		sg_dma_len(s) = s_length;
+		s->offset -= s_iova_off;
+		s_length = iova_align(iovad, s_length + s_iova_off);
+		s->length = s_length;
+
+		/*
+		 * Due to the alignment of our single IOVA allocation, we can
+		 * depend on these assumptions about the segment boundary mask:
+		 * - If mask size >= IOVA size, then the IOVA range cannot
+		 *   possibly fall across a boundary, so we don't care.
+		 * - If mask size < IOVA size, then the IOVA range must start
+		 *   exactly on a boundary, therefore we can lay things out
+		 *   based purely on segment lengths without needing to know
+		 *   the actual addresses beforehand.
+		 * - The mask must be a power of 2, so pad_len == 0 if
+		 *   iova_len == 0, thus we cannot dereference prev the first
+		 *   time through here (i.e. before it has a meaningful value).
+		 */
+		if (pad_len && pad_len < s_length - 1) {
+			prev->length += pad_len;
+			iova_len += pad_len;
+		}
+
+		iova_len += s_length;
+		prev = s;
+	}
+
+	iova = iommu_dma_alloc_iova(domain, iova_len, dma_get_mask(dev), dev);
+	if (!iova)
+		goto out_restore_sg;
+
+	/*
+	 * We'll leave any physical concatenation to the IOMMU driver's
+	 * implementation - it knows better than we do.
+	 */
+	if (iommu_map_sg(domain, iova, sg, nents, prot) < iova_len)
+		goto out_free_iova;
+
+	return __finalise_sg(dev, sg, nents, iova);
+
+out_free_iova:
+	iommu_dma_free_iova(cookie, iova, iova_len);
+out_restore_sg:
+	__invalidate_sg(sg, nents);
+	return 0;
+}
+
+void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
+		enum dma_data_direction dir, unsigned long attrs)
+{
+	dma_addr_t start, end;
+	struct scatterlist *tmp;
+	int i;
+	/*
+	 * The scatterlist segments are mapped into a single
+	 * contiguous IOVA allocation, so this is incredibly easy.
+	 */
+	start = sg_dma_address(sg);
+	for_each_sg(sg_next(sg), tmp, nents - 1, i) {
+		if (sg_dma_len(tmp) == 0)
+			break;
+		sg = tmp;
+	}
+	end = sg_dma_address(sg) + sg_dma_len(sg);
+	__iommu_dma_unmap(iommu_get_domain_for_dev(dev), start, end - start);
+}
+
+dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys,
+		size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+	return __iommu_dma_map(dev, phys, size,
+			dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO);
+}
+
+void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle,
+		size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+	__iommu_dma_unmap(iommu_get_domain_for_dev(dev), handle, size);
+}
+
+int iommu_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+	return dma_addr == IOMMU_MAPPING_ERROR;
+}
+
+static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
+		phys_addr_t msi_addr, struct iommu_domain *domain)
+{
+	struct iommu_dma_cookie *cookie = domain->iova_cookie;
+	struct iommu_dma_msi_page *msi_page;
+	dma_addr_t iova;
+	int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+	size_t size = cookie_msi_granule(cookie);
+
+	msi_addr &= ~(phys_addr_t)(size - 1);
+	list_for_each_entry(msi_page, &cookie->msi_page_list, list)
+		if (msi_page->phys == msi_addr)
+			return msi_page;
+
+	msi_page = kzalloc(sizeof(*msi_page), GFP_ATOMIC);
+	if (!msi_page)
+		return NULL;
+
+	iova = __iommu_dma_map(dev, msi_addr, size, prot);
+	if (iommu_dma_mapping_error(dev, iova))
+		goto out_free_page;
+
+	INIT_LIST_HEAD(&msi_page->list);
+	msi_page->phys = msi_addr;
+	msi_page->iova = iova;
+	list_add(&msi_page->list, &cookie->msi_page_list);
+	return msi_page;
+
+out_free_page:
+	kfree(msi_page);
+	return NULL;
+}
+
+void iommu_dma_map_msi_msg(int irq, struct msi_msg *msg)
+{
+	struct device *dev = msi_desc_to_dev(irq_get_msi_desc(irq));
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+	struct iommu_dma_cookie *cookie;
+	struct iommu_dma_msi_page *msi_page;
+	phys_addr_t msi_addr = (u64)msg->address_hi << 32 | msg->address_lo;
+	unsigned long flags;
+
+	if (!domain || !domain->iova_cookie)
+		return;
+
+	cookie = domain->iova_cookie;
+
+	/*
+	 * We disable IRQs to rule out a possible inversion against
+	 * irq_desc_lock if, say, someone tries to retarget the affinity
+	 * of an MSI from within an IPI handler.
+	 */
+	spin_lock_irqsave(&cookie->msi_lock, flags);
+	msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain);
+	spin_unlock_irqrestore(&cookie->msi_lock, flags);
+
+	if (WARN_ON(!msi_page)) {
+		/*
+		 * We're called from a void callback, so the best we can do is
+		 * 'fail' by filling the message with obviously bogus values.
+		 * Since we got this far due to an IOMMU being present, it's
+		 * not like the existing address would have worked anyway...
+		 */
+		msg->address_hi = ~0U;
+		msg->address_lo = ~0U;
+		msg->data = ~0U;
+	} else {
+		msg->address_hi = upper_32_bits(msi_page->iova);
+		msg->address_lo &= cookie_msi_granule(cookie) - 1;
+		msg->address_lo += lower_32_bits(msi_page->iova);
+	}
+}
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
new file mode 100644
index 000000000..bc565f8e8
--- /dev/null
+++ b/drivers/iommu/dmar.c
@@ -0,0 +1,2071 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) 2006-2008 Intel Corporation
+ * Author: Ashok Raj <ashok.raj@intel.com>
+ * Author: Shaohua Li <shaohua.li@intel.com>
+ * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ *
+ * This file implements early detection/parsing of Remapping Devices
+ * reported to OS through BIOS via DMA remapping reporting (DMAR) ACPI
+ * tables.
+ *
+ * These routines are used by both DMA-remapping and Interrupt-remapping
+ */
+
+#define pr_fmt(fmt)     "DMAR: " fmt
+
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/iova.h>
+#include <linux/intel-iommu.h>
+#include <linux/timer.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/tboot.h>
+#include <linux/dmi.h>
+#include <linux/slab.h>
+#include <linux/iommu.h>
+#include <linux/limits.h>
+#include <asm/irq_remapping.h>
+#include <asm/iommu_table.h>
+
+#include "irq_remapping.h"
+
+typedef int (*dmar_res_handler_t)(struct acpi_dmar_header *, void *);
+struct dmar_res_callback {
+	dmar_res_handler_t	cb[ACPI_DMAR_TYPE_RESERVED];
+	void			*arg[ACPI_DMAR_TYPE_RESERVED];
+	bool			ignore_unhandled;
+	bool			print_entry;
+};
+
+/*
+ * Assumptions:
+ * 1) The hotplug framework guarentees that DMAR unit will be hot-added
+ *    before IO devices managed by that unit.
+ * 2) The hotplug framework guarantees that DMAR unit will be hot-removed
+ *    after IO devices managed by that unit.
+ * 3) Hotplug events are rare.
+ *
+ * Locking rules for DMA and interrupt remapping related global data structures:
+ * 1) Use dmar_global_lock in process context
+ * 2) Use RCU in interrupt context
+ */
+DECLARE_RWSEM(dmar_global_lock);
+LIST_HEAD(dmar_drhd_units);
+
+struct acpi_table_header * __initdata dmar_tbl;
+static int dmar_dev_scope_status = 1;
+static unsigned long dmar_seq_ids[BITS_TO_LONGS(DMAR_UNITS_SUPPORTED)];
+
+static int alloc_iommu(struct dmar_drhd_unit *drhd);
+static void free_iommu(struct intel_iommu *iommu);
+
+extern const struct iommu_ops intel_iommu_ops;
+
+static void dmar_register_drhd_unit(struct dmar_drhd_unit *drhd)
+{
+	/*
+	 * add INCLUDE_ALL at the tail, so scan the list will find it at
+	 * the very end.
+	 */
+	if (drhd->include_all)
+		list_add_tail_rcu(&drhd->list, &dmar_drhd_units);
+	else
+		list_add_rcu(&drhd->list, &dmar_drhd_units);
+}
+
+void *dmar_alloc_dev_scope(void *start, void *end, int *cnt)
+{
+	struct acpi_dmar_device_scope *scope;
+
+	*cnt = 0;
+	while (start < end) {
+		scope = start;
+		if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_NAMESPACE ||
+		    scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT ||
+		    scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE)
+			(*cnt)++;
+		else if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_IOAPIC &&
+			scope->entry_type != ACPI_DMAR_SCOPE_TYPE_HPET) {
+			pr_warn("Unsupported device scope\n");
+		}
+		start += scope->length;
+	}
+	if (*cnt == 0)
+		return NULL;
+
+	return kcalloc(*cnt, sizeof(struct dmar_dev_scope), GFP_KERNEL);
+}
+
+void dmar_free_dev_scope(struct dmar_dev_scope **devices, int *cnt)
+{
+	int i;
+	struct device *tmp_dev;
+
+	if (*devices && *cnt) {
+		for_each_active_dev_scope(*devices, *cnt, i, tmp_dev)
+			put_device(tmp_dev);
+		kfree(*devices);
+	}
+
+	*devices = NULL;
+	*cnt = 0;
+}
+
+/* Optimize out kzalloc()/kfree() for normal cases */
+static char dmar_pci_notify_info_buf[64];
+
+static struct dmar_pci_notify_info *
+dmar_alloc_pci_notify_info(struct pci_dev *dev, unsigned long event)
+{
+	int level = 0;
+	size_t size;
+	struct pci_dev *tmp;
+	struct dmar_pci_notify_info *info;
+
+	BUG_ON(dev->is_virtfn);
+
+	/*
+	 * Ignore devices that have a domain number higher than what can
+	 * be looked up in DMAR, e.g. VMD subdevices with domain 0x10000
+	 */
+	if (pci_domain_nr(dev->bus) > U16_MAX)
+		return NULL;
+
+	/* Only generate path[] for device addition event */
+	if (event == BUS_NOTIFY_ADD_DEVICE)
+		for (tmp = dev; tmp; tmp = tmp->bus->self)
+			level++;
+
+	size = sizeof(*info) + level * sizeof(info->path[0]);
+	if (size <= sizeof(dmar_pci_notify_info_buf)) {
+		info = (struct dmar_pci_notify_info *)dmar_pci_notify_info_buf;
+	} else {
+		info = kzalloc(size, GFP_KERNEL);
+		if (!info) {
+			pr_warn("Out of memory when allocating notify_info "
+				"for %s.\n", pci_name(dev));
+			if (dmar_dev_scope_status == 0)
+				dmar_dev_scope_status = -ENOMEM;
+			return NULL;
+		}
+	}
+
+	info->event = event;
+	info->dev = dev;
+	info->seg = pci_domain_nr(dev->bus);
+	info->level = level;
+	if (event == BUS_NOTIFY_ADD_DEVICE) {
+		for (tmp = dev; tmp; tmp = tmp->bus->self) {
+			level--;
+			info->path[level].bus = tmp->bus->number;
+			info->path[level].device = PCI_SLOT(tmp->devfn);
+			info->path[level].function = PCI_FUNC(tmp->devfn);
+			if (pci_is_root_bus(tmp->bus))
+				info->bus = tmp->bus->number;
+		}
+	}
+
+	return info;
+}
+
+static inline void dmar_free_pci_notify_info(struct dmar_pci_notify_info *info)
+{
+	if ((void *)info != dmar_pci_notify_info_buf)
+		kfree(info);
+}
+
+static bool dmar_match_pci_path(struct dmar_pci_notify_info *info, int bus,
+				struct acpi_dmar_pci_path *path, int count)
+{
+	int i;
+
+	if (info->bus != bus)
+		goto fallback;
+	if (info->level != count)
+		goto fallback;
+
+	for (i = 0; i < count; i++) {
+		if (path[i].device != info->path[i].device ||
+		    path[i].function != info->path[i].function)
+			goto fallback;
+	}
+
+	return true;
+
+fallback:
+
+	if (count != 1)
+		return false;
+
+	i = info->level - 1;
+	if (bus              == info->path[i].bus &&
+	    path[0].device   == info->path[i].device &&
+	    path[0].function == info->path[i].function) {
+		pr_info(FW_BUG "RMRR entry for device %02x:%02x.%x is broken - applying workaround\n",
+			bus, path[0].device, path[0].function);
+		return true;
+	}
+
+	return false;
+}
+
+/* Return: > 0 if match found, 0 if no match found, < 0 if error happens */
+int dmar_insert_dev_scope(struct dmar_pci_notify_info *info,
+			  void *start, void*end, u16 segment,
+			  struct dmar_dev_scope *devices,
+			  int devices_cnt)
+{
+	int i, level;
+	struct device *tmp, *dev = &info->dev->dev;
+	struct acpi_dmar_device_scope *scope;
+	struct acpi_dmar_pci_path *path;
+
+	if (segment != info->seg)
+		return 0;
+
+	for (; start < end; start += scope->length) {
+		scope = start;
+		if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_ENDPOINT &&
+		    scope->entry_type != ACPI_DMAR_SCOPE_TYPE_BRIDGE)
+			continue;
+
+		path = (struct acpi_dmar_pci_path *)(scope + 1);
+		level = (scope->length - sizeof(*scope)) / sizeof(*path);
+		if (!dmar_match_pci_path(info, scope->bus, path, level))
+			continue;
+
+		/*
+		 * We expect devices with endpoint scope to have normal PCI
+		 * headers, and devices with bridge scope to have bridge PCI
+		 * headers.  However PCI NTB devices may be listed in the
+		 * DMAR table with bridge scope, even though they have a
+		 * normal PCI header.  NTB devices are identified by class
+		 * "BRIDGE_OTHER" (0680h) - we don't declare a socpe mismatch
+		 * for this special case.
+		 */
+		if ((scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT &&
+		     info->dev->hdr_type != PCI_HEADER_TYPE_NORMAL) ||
+		    (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE &&
+		     (info->dev->hdr_type == PCI_HEADER_TYPE_NORMAL &&
+		      info->dev->class >> 8 != PCI_CLASS_BRIDGE_OTHER))) {
+			pr_warn("Device scope type does not match for %s\n",
+				pci_name(info->dev));
+			return -EINVAL;
+		}
+
+		for_each_dev_scope(devices, devices_cnt, i, tmp)
+			if (tmp == NULL) {
+				devices[i].bus = info->dev->bus->number;
+				devices[i].devfn = info->dev->devfn;
+				rcu_assign_pointer(devices[i].dev,
+						   get_device(dev));
+				return 1;
+			}
+		BUG_ON(i >= devices_cnt);
+	}
+
+	return 0;
+}
+
+int dmar_remove_dev_scope(struct dmar_pci_notify_info *info, u16 segment,
+			  struct dmar_dev_scope *devices, int count)
+{
+	int index;
+	struct device *tmp;
+
+	if (info->seg != segment)
+		return 0;
+
+	for_each_active_dev_scope(devices, count, index, tmp)
+		if (tmp == &info->dev->dev) {
+			RCU_INIT_POINTER(devices[index].dev, NULL);
+			synchronize_rcu();
+			put_device(tmp);
+			return 1;
+		}
+
+	return 0;
+}
+
+static int dmar_pci_bus_add_dev(struct dmar_pci_notify_info *info)
+{
+	int ret = 0;
+	struct dmar_drhd_unit *dmaru;
+	struct acpi_dmar_hardware_unit *drhd;
+
+	for_each_drhd_unit(dmaru) {
+		if (dmaru->include_all)
+			continue;
+
+		drhd = container_of(dmaru->hdr,
+				    struct acpi_dmar_hardware_unit, header);
+		ret = dmar_insert_dev_scope(info, (void *)(drhd + 1),
+				((void *)drhd) + drhd->header.length,
+				dmaru->segment,
+				dmaru->devices, dmaru->devices_cnt);
+		if (ret)
+			break;
+	}
+	if (ret >= 0)
+		ret = dmar_iommu_notify_scope_dev(info);
+	if (ret < 0 && dmar_dev_scope_status == 0)
+		dmar_dev_scope_status = ret;
+
+	return ret;
+}
+
+static void  dmar_pci_bus_del_dev(struct dmar_pci_notify_info *info)
+{
+	struct dmar_drhd_unit *dmaru;
+
+	for_each_drhd_unit(dmaru)
+		if (dmar_remove_dev_scope(info, dmaru->segment,
+			dmaru->devices, dmaru->devices_cnt))
+			break;
+	dmar_iommu_notify_scope_dev(info);
+}
+
+static int dmar_pci_bus_notifier(struct notifier_block *nb,
+				 unsigned long action, void *data)
+{
+	struct pci_dev *pdev = to_pci_dev(data);
+	struct dmar_pci_notify_info *info;
+
+	/* Only care about add/remove events for physical functions.
+	 * For VFs we actually do the lookup based on the corresponding
+	 * PF in device_to_iommu() anyway. */
+	if (pdev->is_virtfn)
+		return NOTIFY_DONE;
+	if (action != BUS_NOTIFY_ADD_DEVICE &&
+	    action != BUS_NOTIFY_REMOVED_DEVICE)
+		return NOTIFY_DONE;
+
+	info = dmar_alloc_pci_notify_info(pdev, action);
+	if (!info)
+		return NOTIFY_DONE;
+
+	down_write(&dmar_global_lock);
+	if (action == BUS_NOTIFY_ADD_DEVICE)
+		dmar_pci_bus_add_dev(info);
+	else if (action == BUS_NOTIFY_REMOVED_DEVICE)
+		dmar_pci_bus_del_dev(info);
+	up_write(&dmar_global_lock);
+
+	dmar_free_pci_notify_info(info);
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block dmar_pci_bus_nb = {
+	.notifier_call = dmar_pci_bus_notifier,
+	.priority = INT_MIN,
+};
+
+static struct dmar_drhd_unit *
+dmar_find_dmaru(struct acpi_dmar_hardware_unit *drhd)
+{
+	struct dmar_drhd_unit *dmaru;
+
+	list_for_each_entry_rcu(dmaru, &dmar_drhd_units, list)
+		if (dmaru->segment == drhd->segment &&
+		    dmaru->reg_base_addr == drhd->address)
+			return dmaru;
+
+	return NULL;
+}
+
+/**
+ * dmar_parse_one_drhd - parses exactly one DMA remapping hardware definition
+ * structure which uniquely represent one DMA remapping hardware unit
+ * present in the platform
+ */
+static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg)
+{
+	struct acpi_dmar_hardware_unit *drhd;
+	struct dmar_drhd_unit *dmaru;
+	int ret;
+
+	drhd = (struct acpi_dmar_hardware_unit *)header;
+	dmaru = dmar_find_dmaru(drhd);
+	if (dmaru)
+		goto out;
+
+	dmaru = kzalloc(sizeof(*dmaru) + header->length, GFP_KERNEL);
+	if (!dmaru)
+		return -ENOMEM;
+
+	/*
+	 * If header is allocated from slab by ACPI _DSM method, we need to
+	 * copy the content because the memory buffer will be freed on return.
+	 */
+	dmaru->hdr = (void *)(dmaru + 1);
+	memcpy(dmaru->hdr, header, header->length);
+	dmaru->reg_base_addr = drhd->address;
+	dmaru->segment = drhd->segment;
+	dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */
+	dmaru->devices = dmar_alloc_dev_scope((void *)(drhd + 1),
+					      ((void *)drhd) + drhd->header.length,
+					      &dmaru->devices_cnt);
+	if (dmaru->devices_cnt && dmaru->devices == NULL) {
+		kfree(dmaru);
+		return -ENOMEM;
+	}
+
+	ret = alloc_iommu(dmaru);
+	if (ret) {
+		dmar_free_dev_scope(&dmaru->devices,
+				    &dmaru->devices_cnt);
+		kfree(dmaru);
+		return ret;
+	}
+	dmar_register_drhd_unit(dmaru);
+
+out:
+	if (arg)
+		(*(int *)arg)++;
+
+	return 0;
+}
+
+static void dmar_free_drhd(struct dmar_drhd_unit *dmaru)
+{
+	if (dmaru->devices && dmaru->devices_cnt)
+		dmar_free_dev_scope(&dmaru->devices, &dmaru->devices_cnt);
+	if (dmaru->iommu)
+		free_iommu(dmaru->iommu);
+	kfree(dmaru);
+}
+
+static int __init dmar_parse_one_andd(struct acpi_dmar_header *header,
+				      void *arg)
+{
+	struct acpi_dmar_andd *andd = (void *)header;
+
+	/* Check for NUL termination within the designated length */
+	if (strnlen(andd->device_name, header->length - 8) == header->length - 8) {
+		pr_warn(FW_BUG
+			   "Your BIOS is broken; ANDD object name is not NUL-terminated\n"
+			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+			   dmi_get_system_info(DMI_BIOS_VENDOR),
+			   dmi_get_system_info(DMI_BIOS_VERSION),
+			   dmi_get_system_info(DMI_PRODUCT_VERSION));
+		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+		return -EINVAL;
+	}
+	pr_info("ANDD device: %x name: %s\n", andd->device_number,
+		andd->device_name);
+
+	return 0;
+}
+
+#ifdef CONFIG_ACPI_NUMA
+static int dmar_parse_one_rhsa(struct acpi_dmar_header *header, void *arg)
+{
+	struct acpi_dmar_rhsa *rhsa;
+	struct dmar_drhd_unit *drhd;
+
+	rhsa = (struct acpi_dmar_rhsa *)header;
+	for_each_drhd_unit(drhd) {
+		if (drhd->reg_base_addr == rhsa->base_address) {
+			int node = acpi_map_pxm_to_node(rhsa->proximity_domain);
+
+			if (!node_online(node))
+				node = -1;
+			drhd->iommu->node = node;
+			return 0;
+		}
+	}
+	pr_warn(FW_BUG
+		"Your BIOS is broken; RHSA refers to non-existent DMAR unit at %llx\n"
+		"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+		rhsa->base_address,
+		dmi_get_system_info(DMI_BIOS_VENDOR),
+		dmi_get_system_info(DMI_BIOS_VERSION),
+		dmi_get_system_info(DMI_PRODUCT_VERSION));
+	add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+
+	return 0;
+}
+#else
+#define	dmar_parse_one_rhsa		dmar_res_noop
+#endif
+
+static void
+dmar_table_print_dmar_entry(struct acpi_dmar_header *header)
+{
+	struct acpi_dmar_hardware_unit *drhd;
+	struct acpi_dmar_reserved_memory *rmrr;
+	struct acpi_dmar_atsr *atsr;
+	struct acpi_dmar_rhsa *rhsa;
+
+	switch (header->type) {
+	case ACPI_DMAR_TYPE_HARDWARE_UNIT:
+		drhd = container_of(header, struct acpi_dmar_hardware_unit,
+				    header);
+		pr_info("DRHD base: %#016Lx flags: %#x\n",
+			(unsigned long long)drhd->address, drhd->flags);
+		break;
+	case ACPI_DMAR_TYPE_RESERVED_MEMORY:
+		rmrr = container_of(header, struct acpi_dmar_reserved_memory,
+				    header);
+		pr_info("RMRR base: %#016Lx end: %#016Lx\n",
+			(unsigned long long)rmrr->base_address,
+			(unsigned long long)rmrr->end_address);
+		break;
+	case ACPI_DMAR_TYPE_ROOT_ATS:
+		atsr = container_of(header, struct acpi_dmar_atsr, header);
+		pr_info("ATSR flags: %#x\n", atsr->flags);
+		break;
+	case ACPI_DMAR_TYPE_HARDWARE_AFFINITY:
+		rhsa = container_of(header, struct acpi_dmar_rhsa, header);
+		pr_info("RHSA base: %#016Lx proximity domain: %#x\n",
+		       (unsigned long long)rhsa->base_address,
+		       rhsa->proximity_domain);
+		break;
+	case ACPI_DMAR_TYPE_NAMESPACE:
+		/* We don't print this here because we need to sanity-check
+		   it first. So print it in dmar_parse_one_andd() instead. */
+		break;
+	}
+}
+
+/**
+ * dmar_table_detect - checks to see if the platform supports DMAR devices
+ */
+static int __init dmar_table_detect(void)
+{
+	acpi_status status = AE_OK;
+
+	/* if we could find DMAR table, then there are DMAR devices */
+	status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
+
+	if (ACPI_SUCCESS(status) && !dmar_tbl) {
+		pr_warn("Unable to map DMAR\n");
+		status = AE_NOT_FOUND;
+	}
+
+	return ACPI_SUCCESS(status) ? 0 : -ENOENT;
+}
+
+static int dmar_walk_remapping_entries(struct acpi_dmar_header *start,
+				       size_t len, struct dmar_res_callback *cb)
+{
+	struct acpi_dmar_header *iter, *next;
+	struct acpi_dmar_header *end = ((void *)start) + len;
+
+	for (iter = start; iter < end; iter = next) {
+		next = (void *)iter + iter->length;
+		if (iter->length == 0) {
+			/* Avoid looping forever on bad ACPI tables */
+			pr_debug(FW_BUG "Invalid 0-length structure\n");
+			break;
+		} else if (next > end) {
+			/* Avoid passing table end */
+			pr_warn(FW_BUG "Record passes table end\n");
+			return -EINVAL;
+		}
+
+		if (cb->print_entry)
+			dmar_table_print_dmar_entry(iter);
+
+		if (iter->type >= ACPI_DMAR_TYPE_RESERVED) {
+			/* continue for forward compatibility */
+			pr_debug("Unknown DMAR structure type %d\n",
+				 iter->type);
+		} else if (cb->cb[iter->type]) {
+			int ret;
+
+			ret = cb->cb[iter->type](iter, cb->arg[iter->type]);
+			if (ret)
+				return ret;
+		} else if (!cb->ignore_unhandled) {
+			pr_warn("No handler for DMAR structure type %d\n",
+				iter->type);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static inline int dmar_walk_dmar_table(struct acpi_table_dmar *dmar,
+				       struct dmar_res_callback *cb)
+{
+	return dmar_walk_remapping_entries((void *)(dmar + 1),
+			dmar->header.length - sizeof(*dmar), cb);
+}
+
+/**
+ * parse_dmar_table - parses the DMA reporting table
+ */
+static int __init
+parse_dmar_table(void)
+{
+	struct acpi_table_dmar *dmar;
+	int drhd_count = 0;
+	int ret;
+	struct dmar_res_callback cb = {
+		.print_entry = true,
+		.ignore_unhandled = true,
+		.arg[ACPI_DMAR_TYPE_HARDWARE_UNIT] = &drhd_count,
+		.cb[ACPI_DMAR_TYPE_HARDWARE_UNIT] = &dmar_parse_one_drhd,
+		.cb[ACPI_DMAR_TYPE_RESERVED_MEMORY] = &dmar_parse_one_rmrr,
+		.cb[ACPI_DMAR_TYPE_ROOT_ATS] = &dmar_parse_one_atsr,
+		.cb[ACPI_DMAR_TYPE_HARDWARE_AFFINITY] = &dmar_parse_one_rhsa,
+		.cb[ACPI_DMAR_TYPE_NAMESPACE] = &dmar_parse_one_andd,
+	};
+
+	/*
+	 * Do it again, earlier dmar_tbl mapping could be mapped with
+	 * fixed map.
+	 */
+	dmar_table_detect();
+
+	/*
+	 * ACPI tables may not be DMA protected by tboot, so use DMAR copy
+	 * SINIT saved in SinitMleData in TXT heap (which is DMA protected)
+	 */
+	dmar_tbl = tboot_get_dmar_table(dmar_tbl);
+
+	dmar = (struct acpi_table_dmar *)dmar_tbl;
+	if (!dmar)
+		return -ENODEV;
+
+	if (dmar->width < PAGE_SHIFT - 1) {
+		pr_warn("Invalid DMAR haw\n");
+		return -EINVAL;
+	}
+
+	pr_info("Host address width %d\n", dmar->width + 1);
+	ret = dmar_walk_dmar_table(dmar, &cb);
+	if (ret == 0 && drhd_count == 0)
+		pr_warn(FW_BUG "No DRHD structure found in DMAR table\n");
+
+	return ret;
+}
+
+static int dmar_pci_device_match(struct dmar_dev_scope devices[],
+				 int cnt, struct pci_dev *dev)
+{
+	int index;
+	struct device *tmp;
+
+	while (dev) {
+		for_each_active_dev_scope(devices, cnt, index, tmp)
+			if (dev_is_pci(tmp) && dev == to_pci_dev(tmp))
+				return 1;
+
+		/* Check our parent */
+		dev = dev->bus->self;
+	}
+
+	return 0;
+}
+
+struct dmar_drhd_unit *
+dmar_find_matched_drhd_unit(struct pci_dev *dev)
+{
+	struct dmar_drhd_unit *dmaru;
+	struct acpi_dmar_hardware_unit *drhd;
+
+	dev = pci_physfn(dev);
+
+	rcu_read_lock();
+	for_each_drhd_unit(dmaru) {
+		drhd = container_of(dmaru->hdr,
+				    struct acpi_dmar_hardware_unit,
+				    header);
+
+		if (dmaru->include_all &&
+		    drhd->segment == pci_domain_nr(dev->bus))
+			goto out;
+
+		if (dmar_pci_device_match(dmaru->devices,
+					  dmaru->devices_cnt, dev))
+			goto out;
+	}
+	dmaru = NULL;
+out:
+	rcu_read_unlock();
+
+	return dmaru;
+}
+
+static void __init dmar_acpi_insert_dev_scope(u8 device_number,
+					      struct acpi_device *adev)
+{
+	struct dmar_drhd_unit *dmaru;
+	struct acpi_dmar_hardware_unit *drhd;
+	struct acpi_dmar_device_scope *scope;
+	struct device *tmp;
+	int i;
+	struct acpi_dmar_pci_path *path;
+
+	for_each_drhd_unit(dmaru) {
+		drhd = container_of(dmaru->hdr,
+				    struct acpi_dmar_hardware_unit,
+				    header);
+
+		for (scope = (void *)(drhd + 1);
+		     (unsigned long)scope < ((unsigned long)drhd) + drhd->header.length;
+		     scope = ((void *)scope) + scope->length) {
+			if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_NAMESPACE)
+				continue;
+			if (scope->enumeration_id != device_number)
+				continue;
+
+			path = (void *)(scope + 1);
+			pr_info("ACPI device \"%s\" under DMAR at %llx as %02x:%02x.%d\n",
+				dev_name(&adev->dev), dmaru->reg_base_addr,
+				scope->bus, path->device, path->function);
+			for_each_dev_scope(dmaru->devices, dmaru->devices_cnt, i, tmp)
+				if (tmp == NULL) {
+					dmaru->devices[i].bus = scope->bus;
+					dmaru->devices[i].devfn = PCI_DEVFN(path->device,
+									    path->function);
+					rcu_assign_pointer(dmaru->devices[i].dev,
+							   get_device(&adev->dev));
+					return;
+				}
+			BUG_ON(i >= dmaru->devices_cnt);
+		}
+	}
+	pr_warn("No IOMMU scope found for ANDD enumeration ID %d (%s)\n",
+		device_number, dev_name(&adev->dev));
+}
+
+static int __init dmar_acpi_dev_scope_init(void)
+{
+	struct acpi_dmar_andd *andd;
+
+	if (dmar_tbl == NULL)
+		return -ENODEV;
+
+	for (andd = (void *)dmar_tbl + sizeof(struct acpi_table_dmar);
+	     ((unsigned long)andd) < ((unsigned long)dmar_tbl) + dmar_tbl->length;
+	     andd = ((void *)andd) + andd->header.length) {
+		if (andd->header.type == ACPI_DMAR_TYPE_NAMESPACE) {
+			acpi_handle h;
+			struct acpi_device *adev;
+
+			if (!ACPI_SUCCESS(acpi_get_handle(ACPI_ROOT_OBJECT,
+							  andd->device_name,
+							  &h))) {
+				pr_err("Failed to find handle for ACPI object %s\n",
+				       andd->device_name);
+				continue;
+			}
+			if (acpi_bus_get_device(h, &adev)) {
+				pr_err("Failed to get device for ACPI object %s\n",
+				       andd->device_name);
+				continue;
+			}
+			dmar_acpi_insert_dev_scope(andd->device_number, adev);
+		}
+	}
+	return 0;
+}
+
+int __init dmar_dev_scope_init(void)
+{
+	struct pci_dev *dev = NULL;
+	struct dmar_pci_notify_info *info;
+
+	if (dmar_dev_scope_status != 1)
+		return dmar_dev_scope_status;
+
+	if (list_empty(&dmar_drhd_units)) {
+		dmar_dev_scope_status = -ENODEV;
+	} else {
+		dmar_dev_scope_status = 0;
+
+		dmar_acpi_dev_scope_init();
+
+		for_each_pci_dev(dev) {
+			if (dev->is_virtfn)
+				continue;
+
+			info = dmar_alloc_pci_notify_info(dev,
+					BUS_NOTIFY_ADD_DEVICE);
+			if (!info) {
+				return dmar_dev_scope_status;
+			} else {
+				dmar_pci_bus_add_dev(info);
+				dmar_free_pci_notify_info(info);
+			}
+		}
+	}
+
+	return dmar_dev_scope_status;
+}
+
+void __init dmar_register_bus_notifier(void)
+{
+	bus_register_notifier(&pci_bus_type, &dmar_pci_bus_nb);
+}
+
+
+int __init dmar_table_init(void)
+{
+	static int dmar_table_initialized;
+	int ret;
+
+	if (dmar_table_initialized == 0) {
+		ret = parse_dmar_table();
+		if (ret < 0) {
+			if (ret != -ENODEV)
+				pr_info("Parse DMAR table failure.\n");
+		} else  if (list_empty(&dmar_drhd_units)) {
+			pr_info("No DMAR devices found\n");
+			ret = -ENODEV;
+		}
+
+		if (ret < 0)
+			dmar_table_initialized = ret;
+		else
+			dmar_table_initialized = 1;
+	}
+
+	return dmar_table_initialized < 0 ? dmar_table_initialized : 0;
+}
+
+static void warn_invalid_dmar(u64 addr, const char *message)
+{
+	pr_warn_once(FW_BUG
+		"Your BIOS is broken; DMAR reported at address %llx%s!\n"
+		"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+		addr, message,
+		dmi_get_system_info(DMI_BIOS_VENDOR),
+		dmi_get_system_info(DMI_BIOS_VERSION),
+		dmi_get_system_info(DMI_PRODUCT_VERSION));
+	add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+}
+
+static int __ref
+dmar_validate_one_drhd(struct acpi_dmar_header *entry, void *arg)
+{
+	struct acpi_dmar_hardware_unit *drhd;
+	void __iomem *addr;
+	u64 cap, ecap;
+
+	drhd = (void *)entry;
+	if (!drhd->address) {
+		warn_invalid_dmar(0, "");
+		return -EINVAL;
+	}
+
+	if (arg)
+		addr = ioremap(drhd->address, VTD_PAGE_SIZE);
+	else
+		addr = early_ioremap(drhd->address, VTD_PAGE_SIZE);
+	if (!addr) {
+		pr_warn("Can't validate DRHD address: %llx\n", drhd->address);
+		return -EINVAL;
+	}
+
+	cap = dmar_readq(addr + DMAR_CAP_REG);
+	ecap = dmar_readq(addr + DMAR_ECAP_REG);
+
+	if (arg)
+		iounmap(addr);
+	else
+		early_iounmap(addr, VTD_PAGE_SIZE);
+
+	if (cap == (uint64_t)-1 && ecap == (uint64_t)-1) {
+		warn_invalid_dmar(drhd->address, " returns all ones");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int __init detect_intel_iommu(void)
+{
+	int ret;
+	struct dmar_res_callback validate_drhd_cb = {
+		.cb[ACPI_DMAR_TYPE_HARDWARE_UNIT] = &dmar_validate_one_drhd,
+		.ignore_unhandled = true,
+	};
+
+	down_write(&dmar_global_lock);
+	ret = dmar_table_detect();
+	if (!ret)
+		ret = dmar_walk_dmar_table((struct acpi_table_dmar *)dmar_tbl,
+					   &validate_drhd_cb);
+	if (!ret && !no_iommu && !iommu_detected && !dmar_disabled) {
+		iommu_detected = 1;
+		/* Make sure ACS will be enabled */
+		pci_request_acs();
+	}
+
+#ifdef CONFIG_X86
+	if (!ret)
+		x86_init.iommu.iommu_init = intel_iommu_init;
+#endif
+
+	if (dmar_tbl) {
+		acpi_put_table(dmar_tbl);
+		dmar_tbl = NULL;
+	}
+	up_write(&dmar_global_lock);
+
+	return ret ? ret : 1;
+}
+
+static void unmap_iommu(struct intel_iommu *iommu)
+{
+	iounmap(iommu->reg);
+	release_mem_region(iommu->reg_phys, iommu->reg_size);
+}
+
+/**
+ * map_iommu: map the iommu's registers
+ * @iommu: the iommu to map
+ * @phys_addr: the physical address of the base resgister
+ *
+ * Memory map the iommu's registers.  Start w/ a single page, and
+ * possibly expand if that turns out to be insufficent.
+ */
+static int map_iommu(struct intel_iommu *iommu, u64 phys_addr)
+{
+	int map_size, err=0;
+
+	iommu->reg_phys = phys_addr;
+	iommu->reg_size = VTD_PAGE_SIZE;
+
+	if (!request_mem_region(iommu->reg_phys, iommu->reg_size, iommu->name)) {
+		pr_err("Can't reserve memory\n");
+		err = -EBUSY;
+		goto out;
+	}
+
+	iommu->reg = ioremap(iommu->reg_phys, iommu->reg_size);
+	if (!iommu->reg) {
+		pr_err("Can't map the region\n");
+		err = -ENOMEM;
+		goto release;
+	}
+
+	iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
+	iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
+
+	if (iommu->cap == (uint64_t)-1 && iommu->ecap == (uint64_t)-1) {
+		err = -EINVAL;
+		warn_invalid_dmar(phys_addr, " returns all ones");
+		goto unmap;
+	}
+
+	/* the registers might be more than one page */
+	map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
+			 cap_max_fault_reg_offset(iommu->cap));
+	map_size = VTD_PAGE_ALIGN(map_size);
+	if (map_size > iommu->reg_size) {
+		iounmap(iommu->reg);
+		release_mem_region(iommu->reg_phys, iommu->reg_size);
+		iommu->reg_size = map_size;
+		if (!request_mem_region(iommu->reg_phys, iommu->reg_size,
+					iommu->name)) {
+			pr_err("Can't reserve memory\n");
+			err = -EBUSY;
+			goto out;
+		}
+		iommu->reg = ioremap(iommu->reg_phys, iommu->reg_size);
+		if (!iommu->reg) {
+			pr_err("Can't map the region\n");
+			err = -ENOMEM;
+			goto release;
+		}
+	}
+	err = 0;
+	goto out;
+
+unmap:
+	iounmap(iommu->reg);
+release:
+	release_mem_region(iommu->reg_phys, iommu->reg_size);
+out:
+	return err;
+}
+
+static int dmar_alloc_seq_id(struct intel_iommu *iommu)
+{
+	iommu->seq_id = find_first_zero_bit(dmar_seq_ids,
+					    DMAR_UNITS_SUPPORTED);
+	if (iommu->seq_id >= DMAR_UNITS_SUPPORTED) {
+		iommu->seq_id = -1;
+	} else {
+		set_bit(iommu->seq_id, dmar_seq_ids);
+		sprintf(iommu->name, "dmar%d", iommu->seq_id);
+	}
+
+	return iommu->seq_id;
+}
+
+static void dmar_free_seq_id(struct intel_iommu *iommu)
+{
+	if (iommu->seq_id >= 0) {
+		clear_bit(iommu->seq_id, dmar_seq_ids);
+		iommu->seq_id = -1;
+	}
+}
+
+static int alloc_iommu(struct dmar_drhd_unit *drhd)
+{
+	struct intel_iommu *iommu;
+	u32 ver, sts;
+	int agaw = -1;
+	int msagaw = -1;
+	int err;
+
+	if (!drhd->reg_base_addr) {
+		warn_invalid_dmar(0, "");
+		return -EINVAL;
+	}
+
+	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
+	if (!iommu)
+		return -ENOMEM;
+
+	if (dmar_alloc_seq_id(iommu) < 0) {
+		pr_err("Failed to allocate seq_id\n");
+		err = -ENOSPC;
+		goto error;
+	}
+
+	err = map_iommu(iommu, drhd->reg_base_addr);
+	if (err) {
+		pr_err("Failed to map %s\n", iommu->name);
+		goto error_free_seq_id;
+	}
+
+	err = -EINVAL;
+	if (cap_sagaw(iommu->cap) == 0) {
+		pr_info("%s: No supported address widths. Not attempting DMA translation.\n",
+			iommu->name);
+		drhd->ignored = 1;
+	}
+
+	if (!drhd->ignored) {
+		agaw = iommu_calculate_agaw(iommu);
+		if (agaw < 0) {
+			pr_err("Cannot get a valid agaw for iommu (seq_id = %d)\n",
+			       iommu->seq_id);
+			drhd->ignored = 1;
+		}
+	}
+	if (!drhd->ignored) {
+		msagaw = iommu_calculate_max_sagaw(iommu);
+		if (msagaw < 0) {
+			pr_err("Cannot get a valid max agaw for iommu (seq_id = %d)\n",
+			       iommu->seq_id);
+			drhd->ignored = 1;
+			agaw = -1;
+		}
+	}
+	iommu->agaw = agaw;
+	iommu->msagaw = msagaw;
+	iommu->segment = drhd->segment;
+
+	iommu->node = -1;
+
+	ver = readl(iommu->reg + DMAR_VER_REG);
+	pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n",
+		iommu->name,
+		(unsigned long long)drhd->reg_base_addr,
+		DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
+		(unsigned long long)iommu->cap,
+		(unsigned long long)iommu->ecap);
+
+	/* Reflect status in gcmd */
+	sts = readl(iommu->reg + DMAR_GSTS_REG);
+	if (sts & DMA_GSTS_IRES)
+		iommu->gcmd |= DMA_GCMD_IRE;
+	if (sts & DMA_GSTS_TES)
+		iommu->gcmd |= DMA_GCMD_TE;
+	if (sts & DMA_GSTS_QIES)
+		iommu->gcmd |= DMA_GCMD_QIE;
+
+	raw_spin_lock_init(&iommu->register_lock);
+
+	/*
+	 * This is only for hotplug; at boot time intel_iommu_enabled won't
+	 * be set yet. When intel_iommu_init() runs, it registers the units
+	 * present at boot time, then sets intel_iommu_enabled.
+	 */
+	if (intel_iommu_enabled && !drhd->ignored) {
+		err = iommu_device_sysfs_add(&iommu->iommu, NULL,
+					     intel_iommu_groups,
+					     "%s", iommu->name);
+		if (err)
+			goto err_unmap;
+
+		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
+
+		err = iommu_device_register(&iommu->iommu);
+		if (err)
+			goto err_sysfs;
+	}
+
+	drhd->iommu = iommu;
+	iommu->drhd = drhd;
+
+	return 0;
+
+err_sysfs:
+	iommu_device_sysfs_remove(&iommu->iommu);
+err_unmap:
+	unmap_iommu(iommu);
+error_free_seq_id:
+	dmar_free_seq_id(iommu);
+error:
+	kfree(iommu);
+	return err;
+}
+
+static void free_iommu(struct intel_iommu *iommu)
+{
+	if (intel_iommu_enabled && !iommu->drhd->ignored) {
+		iommu_device_unregister(&iommu->iommu);
+		iommu_device_sysfs_remove(&iommu->iommu);
+	}
+
+	if (iommu->irq) {
+		if (iommu->pr_irq) {
+			free_irq(iommu->pr_irq, iommu);
+			dmar_free_hwirq(iommu->pr_irq);
+			iommu->pr_irq = 0;
+		}
+		free_irq(iommu->irq, iommu);
+		dmar_free_hwirq(iommu->irq);
+		iommu->irq = 0;
+	}
+
+	if (iommu->qi) {
+		free_page((unsigned long)iommu->qi->desc);
+		kfree(iommu->qi->desc_status);
+		kfree(iommu->qi);
+	}
+
+	if (iommu->reg)
+		unmap_iommu(iommu);
+
+	dmar_free_seq_id(iommu);
+	kfree(iommu);
+}
+
+/*
+ * Reclaim all the submitted descriptors which have completed its work.
+ */
+static inline void reclaim_free_desc(struct q_inval *qi)
+{
+	while (qi->desc_status[qi->free_tail] == QI_DONE ||
+	       qi->desc_status[qi->free_tail] == QI_ABORT) {
+		qi->desc_status[qi->free_tail] = QI_FREE;
+		qi->free_tail = (qi->free_tail + 1) % QI_LENGTH;
+		qi->free_cnt++;
+	}
+}
+
+static int qi_check_fault(struct intel_iommu *iommu, int index)
+{
+	u32 fault;
+	int head, tail;
+	struct q_inval *qi = iommu->qi;
+	int wait_index = (index + 1) % QI_LENGTH;
+
+	if (qi->desc_status[wait_index] == QI_ABORT)
+		return -EAGAIN;
+
+	fault = readl(iommu->reg + DMAR_FSTS_REG);
+
+	/*
+	 * If IQE happens, the head points to the descriptor associated
+	 * with the error. No new descriptors are fetched until the IQE
+	 * is cleared.
+	 */
+	if (fault & DMA_FSTS_IQE) {
+		head = readl(iommu->reg + DMAR_IQH_REG);
+		if ((head >> DMAR_IQ_SHIFT) == index) {
+			pr_err("VT-d detected invalid descriptor: "
+				"low=%llx, high=%llx\n",
+				(unsigned long long)qi->desc[index].low,
+				(unsigned long long)qi->desc[index].high);
+			memcpy(&qi->desc[index], &qi->desc[wait_index],
+					sizeof(struct qi_desc));
+			writel(DMA_FSTS_IQE, iommu->reg + DMAR_FSTS_REG);
+			return -EINVAL;
+		}
+	}
+
+	/*
+	 * If ITE happens, all pending wait_desc commands are aborted.
+	 * No new descriptors are fetched until the ITE is cleared.
+	 */
+	if (fault & DMA_FSTS_ITE) {
+		head = readl(iommu->reg + DMAR_IQH_REG);
+		head = ((head >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH;
+		head |= 1;
+		tail = readl(iommu->reg + DMAR_IQT_REG);
+		tail = ((tail >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH;
+
+		writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG);
+
+		do {
+			if (qi->desc_status[head] == QI_IN_USE)
+				qi->desc_status[head] = QI_ABORT;
+			head = (head - 2 + QI_LENGTH) % QI_LENGTH;
+		} while (head != tail);
+
+		if (qi->desc_status[wait_index] == QI_ABORT)
+			return -EAGAIN;
+	}
+
+	if (fault & DMA_FSTS_ICE)
+		writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG);
+
+	return 0;
+}
+
+/*
+ * Submit the queued invalidation descriptor to the remapping
+ * hardware unit and wait for its completion.
+ */
+int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
+{
+	int rc;
+	struct q_inval *qi = iommu->qi;
+	struct qi_desc *hw, wait_desc;
+	int wait_index, index;
+	unsigned long flags;
+
+	if (!qi)
+		return 0;
+
+	hw = qi->desc;
+
+restart:
+	rc = 0;
+
+	raw_spin_lock_irqsave(&qi->q_lock, flags);
+	while (qi->free_cnt < 3) {
+		raw_spin_unlock_irqrestore(&qi->q_lock, flags);
+		cpu_relax();
+		raw_spin_lock_irqsave(&qi->q_lock, flags);
+	}
+
+	index = qi->free_head;
+	wait_index = (index + 1) % QI_LENGTH;
+
+	qi->desc_status[index] = qi->desc_status[wait_index] = QI_IN_USE;
+
+	hw[index] = *desc;
+
+	wait_desc.low = QI_IWD_STATUS_DATA(QI_DONE) |
+			QI_IWD_STATUS_WRITE | QI_IWD_TYPE;
+	wait_desc.high = virt_to_phys(&qi->desc_status[wait_index]);
+
+	hw[wait_index] = wait_desc;
+
+	qi->free_head = (qi->free_head + 2) % QI_LENGTH;
+	qi->free_cnt -= 2;
+
+	/*
+	 * update the HW tail register indicating the presence of
+	 * new descriptors.
+	 */
+	writel(qi->free_head << DMAR_IQ_SHIFT, iommu->reg + DMAR_IQT_REG);
+
+	while (qi->desc_status[wait_index] != QI_DONE) {
+		/*
+		 * We will leave the interrupts disabled, to prevent interrupt
+		 * context to queue another cmd while a cmd is already submitted
+		 * and waiting for completion on this cpu. This is to avoid
+		 * a deadlock where the interrupt context can wait indefinitely
+		 * for free slots in the queue.
+		 */
+		rc = qi_check_fault(iommu, index);
+		if (rc)
+			break;
+
+		raw_spin_unlock(&qi->q_lock);
+		cpu_relax();
+		raw_spin_lock(&qi->q_lock);
+	}
+
+	qi->desc_status[index] = QI_DONE;
+
+	reclaim_free_desc(qi);
+	raw_spin_unlock_irqrestore(&qi->q_lock, flags);
+
+	if (rc == -EAGAIN)
+		goto restart;
+
+	return rc;
+}
+
+/*
+ * Flush the global interrupt entry cache.
+ */
+void qi_global_iec(struct intel_iommu *iommu)
+{
+	struct qi_desc desc;
+
+	desc.low = QI_IEC_TYPE;
+	desc.high = 0;
+
+	/* should never fail */
+	qi_submit_sync(&desc, iommu);
+}
+
+void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
+		      u64 type)
+{
+	struct qi_desc desc;
+
+	desc.low = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did)
+			| QI_CC_GRAN(type) | QI_CC_TYPE;
+	desc.high = 0;
+
+	qi_submit_sync(&desc, iommu);
+}
+
+void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+		    unsigned int size_order, u64 type)
+{
+	u8 dw = 0, dr = 0;
+
+	struct qi_desc desc;
+	int ih = 0;
+
+	if (cap_write_drain(iommu->cap))
+		dw = 1;
+
+	if (cap_read_drain(iommu->cap))
+		dr = 1;
+
+	desc.low = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw)
+		| QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE;
+	desc.high = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih)
+		| QI_IOTLB_AM(size_order);
+
+	qi_submit_sync(&desc, iommu);
+}
+
+void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+			u16 qdep, u64 addr, unsigned mask)
+{
+	struct qi_desc desc;
+
+	if (mask) {
+		addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1;
+		desc.high = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE;
+	} else
+		desc.high = QI_DEV_IOTLB_ADDR(addr);
+
+	if (qdep >= QI_DEV_IOTLB_MAX_INVS)
+		qdep = 0;
+
+	desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
+		   QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid);
+
+	qi_submit_sync(&desc, iommu);
+}
+
+/*
+ * Disable Queued Invalidation interface.
+ */
+void dmar_disable_qi(struct intel_iommu *iommu)
+{
+	unsigned long flags;
+	u32 sts;
+	cycles_t start_time = get_cycles();
+
+	if (!ecap_qis(iommu->ecap))
+		return;
+
+	raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+	sts =  readl(iommu->reg + DMAR_GSTS_REG);
+	if (!(sts & DMA_GSTS_QIES))
+		goto end;
+
+	/*
+	 * Give a chance to HW to complete the pending invalidation requests.
+	 */
+	while ((readl(iommu->reg + DMAR_IQT_REG) !=
+		readl(iommu->reg + DMAR_IQH_REG)) &&
+		(DMAR_OPERATION_TIMEOUT > (get_cycles() - start_time)))
+		cpu_relax();
+
+	iommu->gcmd &= ~DMA_GCMD_QIE;
+	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl,
+		      !(sts & DMA_GSTS_QIES), sts);
+end:
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+/*
+ * Enable queued invalidation.
+ */
+static void __dmar_enable_qi(struct intel_iommu *iommu)
+{
+	u32 sts;
+	unsigned long flags;
+	struct q_inval *qi = iommu->qi;
+
+	qi->free_head = qi->free_tail = 0;
+	qi->free_cnt = QI_LENGTH;
+
+	raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+	/* write zero to the tail reg */
+	writel(0, iommu->reg + DMAR_IQT_REG);
+
+	dmar_writeq(iommu->reg + DMAR_IQA_REG, virt_to_phys(qi->desc));
+
+	iommu->gcmd |= DMA_GCMD_QIE;
+	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+	/* Make sure hardware complete it */
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_QIES), sts);
+
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+/*
+ * Enable Queued Invalidation interface. This is a must to support
+ * interrupt-remapping. Also used by DMA-remapping, which replaces
+ * register based IOTLB invalidation.
+ */
+int dmar_enable_qi(struct intel_iommu *iommu)
+{
+	struct q_inval *qi;
+	struct page *desc_page;
+
+	if (!ecap_qis(iommu->ecap))
+		return -ENOENT;
+
+	/*
+	 * queued invalidation is already setup and enabled.
+	 */
+	if (iommu->qi)
+		return 0;
+
+	iommu->qi = kmalloc(sizeof(*qi), GFP_ATOMIC);
+	if (!iommu->qi)
+		return -ENOMEM;
+
+	qi = iommu->qi;
+
+
+	desc_page = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO, 0);
+	if (!desc_page) {
+		kfree(qi);
+		iommu->qi = NULL;
+		return -ENOMEM;
+	}
+
+	qi->desc = page_address(desc_page);
+
+	qi->desc_status = kcalloc(QI_LENGTH, sizeof(int), GFP_ATOMIC);
+	if (!qi->desc_status) {
+		free_page((unsigned long) qi->desc);
+		kfree(qi);
+		iommu->qi = NULL;
+		return -ENOMEM;
+	}
+
+	raw_spin_lock_init(&qi->q_lock);
+
+	__dmar_enable_qi(iommu);
+
+	return 0;
+}
+
+/* iommu interrupt handling. Most stuff are MSI-like. */
+
+enum faulttype {
+	DMA_REMAP,
+	INTR_REMAP,
+	UNKNOWN,
+};
+
+static const char *dma_remap_fault_reasons[] =
+{
+	"Software",
+	"Present bit in root entry is clear",
+	"Present bit in context entry is clear",
+	"Invalid context entry",
+	"Access beyond MGAW",
+	"PTE Write access is not set",
+	"PTE Read access is not set",
+	"Next page table ptr is invalid",
+	"Root table address invalid",
+	"Context table ptr is invalid",
+	"non-zero reserved fields in RTP",
+	"non-zero reserved fields in CTP",
+	"non-zero reserved fields in PTE",
+	"PCE for translation request specifies blocking",
+};
+
+static const char *irq_remap_fault_reasons[] =
+{
+	"Detected reserved fields in the decoded interrupt-remapped request",
+	"Interrupt index exceeded the interrupt-remapping table size",
+	"Present field in the IRTE entry is clear",
+	"Error accessing interrupt-remapping table pointed by IRTA_REG",
+	"Detected reserved fields in the IRTE entry",
+	"Blocked a compatibility format interrupt request",
+	"Blocked an interrupt request due to source-id verification failure",
+};
+
+static const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type)
+{
+	if (fault_reason >= 0x20 && (fault_reason - 0x20 <
+					ARRAY_SIZE(irq_remap_fault_reasons))) {
+		*fault_type = INTR_REMAP;
+		return irq_remap_fault_reasons[fault_reason - 0x20];
+	} else if (fault_reason < ARRAY_SIZE(dma_remap_fault_reasons)) {
+		*fault_type = DMA_REMAP;
+		return dma_remap_fault_reasons[fault_reason];
+	} else {
+		*fault_type = UNKNOWN;
+		return "Unknown";
+	}
+}
+
+
+static inline int dmar_msi_reg(struct intel_iommu *iommu, int irq)
+{
+	if (iommu->irq == irq)
+		return DMAR_FECTL_REG;
+	else if (iommu->pr_irq == irq)
+		return DMAR_PECTL_REG;
+	else
+		BUG();
+}
+
+void dmar_msi_unmask(struct irq_data *data)
+{
+	struct intel_iommu *iommu = irq_data_get_irq_handler_data(data);
+	int reg = dmar_msi_reg(iommu, data->irq);
+	unsigned long flag;
+
+	/* unmask it */
+	raw_spin_lock_irqsave(&iommu->register_lock, flag);
+	writel(0, iommu->reg + reg);
+	/* Read a reg to force flush the post write */
+	readl(iommu->reg + reg);
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_mask(struct irq_data *data)
+{
+	struct intel_iommu *iommu = irq_data_get_irq_handler_data(data);
+	int reg = dmar_msi_reg(iommu, data->irq);
+	unsigned long flag;
+
+	/* mask it */
+	raw_spin_lock_irqsave(&iommu->register_lock, flag);
+	writel(DMA_FECTL_IM, iommu->reg + reg);
+	/* Read a reg to force flush the post write */
+	readl(iommu->reg + reg);
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_write(int irq, struct msi_msg *msg)
+{
+	struct intel_iommu *iommu = irq_get_handler_data(irq);
+	int reg = dmar_msi_reg(iommu, irq);
+	unsigned long flag;
+
+	raw_spin_lock_irqsave(&iommu->register_lock, flag);
+	writel(msg->data, iommu->reg + reg + 4);
+	writel(msg->address_lo, iommu->reg + reg + 8);
+	writel(msg->address_hi, iommu->reg + reg + 12);
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_read(int irq, struct msi_msg *msg)
+{
+	struct intel_iommu *iommu = irq_get_handler_data(irq);
+	int reg = dmar_msi_reg(iommu, irq);
+	unsigned long flag;
+
+	raw_spin_lock_irqsave(&iommu->register_lock, flag);
+	msg->data = readl(iommu->reg + reg + 4);
+	msg->address_lo = readl(iommu->reg + reg + 8);
+	msg->address_hi = readl(iommu->reg + reg + 12);
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
+		u8 fault_reason, u16 source_id, unsigned long long addr)
+{
+	const char *reason;
+	int fault_type;
+
+	reason = dmar_get_fault_reason(fault_reason, &fault_type);
+
+	if (fault_type == INTR_REMAP)
+		pr_err("[INTR-REMAP] Request device [%02x:%02x.%d] fault index %llx [fault reason %02d] %s\n",
+			source_id >> 8, PCI_SLOT(source_id & 0xFF),
+			PCI_FUNC(source_id & 0xFF), addr >> 48,
+			fault_reason, reason);
+	else
+		pr_err("[%s] Request device [%02x:%02x.%d] fault addr %llx [fault reason %02d] %s\n",
+		       type ? "DMA Read" : "DMA Write",
+		       source_id >> 8, PCI_SLOT(source_id & 0xFF),
+		       PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
+	return 0;
+}
+
+#define PRIMARY_FAULT_REG_LEN (16)
+irqreturn_t dmar_fault(int irq, void *dev_id)
+{
+	struct intel_iommu *iommu = dev_id;
+	int reg, fault_index;
+	u32 fault_status;
+	unsigned long flag;
+	static DEFINE_RATELIMIT_STATE(rs,
+				      DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+
+	raw_spin_lock_irqsave(&iommu->register_lock, flag);
+	fault_status = readl(iommu->reg + DMAR_FSTS_REG);
+	if (fault_status && __ratelimit(&rs))
+		pr_err("DRHD: handling fault status reg %x\n", fault_status);
+
+	/* TBD: ignore advanced fault log currently */
+	if (!(fault_status & DMA_FSTS_PPF))
+		goto unlock_exit;
+
+	fault_index = dma_fsts_fault_record_index(fault_status);
+	reg = cap_fault_reg_offset(iommu->cap);
+	while (1) {
+		/* Disable printing, simply clear the fault when ratelimited */
+		bool ratelimited = !__ratelimit(&rs);
+		u8 fault_reason;
+		u16 source_id;
+		u64 guest_addr;
+		int type;
+		u32 data;
+
+		/* highest 32 bits */
+		data = readl(iommu->reg + reg +
+				fault_index * PRIMARY_FAULT_REG_LEN + 12);
+		if (!(data & DMA_FRCD_F))
+			break;
+
+		if (!ratelimited) {
+			fault_reason = dma_frcd_fault_reason(data);
+			type = dma_frcd_type(data);
+
+			data = readl(iommu->reg + reg +
+				     fault_index * PRIMARY_FAULT_REG_LEN + 8);
+			source_id = dma_frcd_source_id(data);
+
+			guest_addr = dmar_readq(iommu->reg + reg +
+					fault_index * PRIMARY_FAULT_REG_LEN);
+			guest_addr = dma_frcd_page_addr(guest_addr);
+		}
+
+		/* clear the fault */
+		writel(DMA_FRCD_F, iommu->reg + reg +
+			fault_index * PRIMARY_FAULT_REG_LEN + 12);
+
+		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+		if (!ratelimited)
+			dmar_fault_do_one(iommu, type, fault_reason,
+					  source_id, guest_addr);
+
+		fault_index++;
+		if (fault_index >= cap_num_fault_regs(iommu->cap))
+			fault_index = 0;
+		raw_spin_lock_irqsave(&iommu->register_lock, flag);
+	}
+
+	writel(DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_PRO,
+	       iommu->reg + DMAR_FSTS_REG);
+
+unlock_exit:
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+	return IRQ_HANDLED;
+}
+
+int dmar_set_interrupt(struct intel_iommu *iommu)
+{
+	int irq, ret;
+
+	/*
+	 * Check if the fault interrupt is already initialized.
+	 */
+	if (iommu->irq)
+		return 0;
+
+	irq = dmar_alloc_hwirq(iommu->seq_id, iommu->node, iommu);
+	if (irq > 0) {
+		iommu->irq = irq;
+	} else {
+		pr_err("No free IRQ vectors\n");
+		return -EINVAL;
+	}
+
+	ret = request_irq(irq, dmar_fault, IRQF_NO_THREAD, iommu->name, iommu);
+	if (ret)
+		pr_err("Can't request irq\n");
+	return ret;
+}
+
+int __init enable_drhd_fault_handling(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu;
+
+	/*
+	 * Enable fault control interrupt.
+	 */
+	for_each_iommu(iommu, drhd) {
+		u32 fault_status;
+		int ret = dmar_set_interrupt(iommu);
+
+		if (ret) {
+			pr_err("DRHD %Lx: failed to enable fault, interrupt, ret %d\n",
+			       (unsigned long long)drhd->reg_base_addr, ret);
+			return -1;
+		}
+
+		/*
+		 * Clear any previous faults.
+		 */
+		dmar_fault(iommu->irq, iommu);
+		fault_status = readl(iommu->reg + DMAR_FSTS_REG);
+		writel(fault_status, iommu->reg + DMAR_FSTS_REG);
+	}
+
+	return 0;
+}
+
+/*
+ * Re-enable Queued Invalidation interface.
+ */
+int dmar_reenable_qi(struct intel_iommu *iommu)
+{
+	if (!ecap_qis(iommu->ecap))
+		return -ENOENT;
+
+	if (!iommu->qi)
+		return -ENOENT;
+
+	/*
+	 * First disable queued invalidation.
+	 */
+	dmar_disable_qi(iommu);
+	/*
+	 * Then enable queued invalidation again. Since there is no pending
+	 * invalidation requests now, it's safe to re-enable queued
+	 * invalidation.
+	 */
+	__dmar_enable_qi(iommu);
+
+	return 0;
+}
+
+/*
+ * Check interrupt remapping support in DMAR table description.
+ */
+int __init dmar_ir_support(void)
+{
+	struct acpi_table_dmar *dmar;
+	dmar = (struct acpi_table_dmar *)dmar_tbl;
+	if (!dmar)
+		return 0;
+	return dmar->flags & 0x1;
+}
+
+/* Check whether DMAR units are in use */
+static inline bool dmar_in_use(void)
+{
+	return irq_remapping_enabled || intel_iommu_enabled;
+}
+
+static int __init dmar_free_unused_resources(void)
+{
+	struct dmar_drhd_unit *dmaru, *dmaru_n;
+
+	if (dmar_in_use())
+		return 0;
+
+	if (dmar_dev_scope_status != 1 && !list_empty(&dmar_drhd_units))
+		bus_unregister_notifier(&pci_bus_type, &dmar_pci_bus_nb);
+
+	down_write(&dmar_global_lock);
+	list_for_each_entry_safe(dmaru, dmaru_n, &dmar_drhd_units, list) {
+		list_del(&dmaru->list);
+		dmar_free_drhd(dmaru);
+	}
+	up_write(&dmar_global_lock);
+
+	return 0;
+}
+
+late_initcall(dmar_free_unused_resources);
+IOMMU_INIT_POST(detect_intel_iommu);
+
+/*
+ * DMAR Hotplug Support
+ * For more details, please refer to Intel(R) Virtualization Technology
+ * for Directed-IO Architecture Specifiction, Rev 2.2, Section 8.8
+ * "Remapping Hardware Unit Hot Plug".
+ */
+static guid_t dmar_hp_guid =
+	GUID_INIT(0xD8C1A3A6, 0xBE9B, 0x4C9B,
+		  0x91, 0xBF, 0xC3, 0xCB, 0x81, 0xFC, 0x5D, 0xAF);
+
+/*
+ * Currently there's only one revision and BIOS will not check the revision id,
+ * so use 0 for safety.
+ */
+#define	DMAR_DSM_REV_ID			0
+#define	DMAR_DSM_FUNC_DRHD		1
+#define	DMAR_DSM_FUNC_ATSR		2
+#define	DMAR_DSM_FUNC_RHSA		3
+
+static inline bool dmar_detect_dsm(acpi_handle handle, int func)
+{
+	return acpi_check_dsm(handle, &dmar_hp_guid, DMAR_DSM_REV_ID, 1 << func);
+}
+
+static int dmar_walk_dsm_resource(acpi_handle handle, int func,
+				  dmar_res_handler_t handler, void *arg)
+{
+	int ret = -ENODEV;
+	union acpi_object *obj;
+	struct acpi_dmar_header *start;
+	struct dmar_res_callback callback;
+	static int res_type[] = {
+		[DMAR_DSM_FUNC_DRHD] = ACPI_DMAR_TYPE_HARDWARE_UNIT,
+		[DMAR_DSM_FUNC_ATSR] = ACPI_DMAR_TYPE_ROOT_ATS,
+		[DMAR_DSM_FUNC_RHSA] = ACPI_DMAR_TYPE_HARDWARE_AFFINITY,
+	};
+
+	if (!dmar_detect_dsm(handle, func))
+		return 0;
+
+	obj = acpi_evaluate_dsm_typed(handle, &dmar_hp_guid, DMAR_DSM_REV_ID,
+				      func, NULL, ACPI_TYPE_BUFFER);
+	if (!obj)
+		return -ENODEV;
+
+	memset(&callback, 0, sizeof(callback));
+	callback.cb[res_type[func]] = handler;
+	callback.arg[res_type[func]] = arg;
+	start = (struct acpi_dmar_header *)obj->buffer.pointer;
+	ret = dmar_walk_remapping_entries(start, obj->buffer.length, &callback);
+
+	ACPI_FREE(obj);
+
+	return ret;
+}
+
+static int dmar_hp_add_drhd(struct acpi_dmar_header *header, void *arg)
+{
+	int ret;
+	struct dmar_drhd_unit *dmaru;
+
+	dmaru = dmar_find_dmaru((struct acpi_dmar_hardware_unit *)header);
+	if (!dmaru)
+		return -ENODEV;
+
+	ret = dmar_ir_hotplug(dmaru, true);
+	if (ret == 0)
+		ret = dmar_iommu_hotplug(dmaru, true);
+
+	return ret;
+}
+
+static int dmar_hp_remove_drhd(struct acpi_dmar_header *header, void *arg)
+{
+	int i, ret;
+	struct device *dev;
+	struct dmar_drhd_unit *dmaru;
+
+	dmaru = dmar_find_dmaru((struct acpi_dmar_hardware_unit *)header);
+	if (!dmaru)
+		return 0;
+
+	/*
+	 * All PCI devices managed by this unit should have been destroyed.
+	 */
+	if (!dmaru->include_all && dmaru->devices && dmaru->devices_cnt) {
+		for_each_active_dev_scope(dmaru->devices,
+					  dmaru->devices_cnt, i, dev)
+			return -EBUSY;
+	}
+
+	ret = dmar_ir_hotplug(dmaru, false);
+	if (ret == 0)
+		ret = dmar_iommu_hotplug(dmaru, false);
+
+	return ret;
+}
+
+static int dmar_hp_release_drhd(struct acpi_dmar_header *header, void *arg)
+{
+	struct dmar_drhd_unit *dmaru;
+
+	dmaru = dmar_find_dmaru((struct acpi_dmar_hardware_unit *)header);
+	if (dmaru) {
+		list_del_rcu(&dmaru->list);
+		synchronize_rcu();
+		dmar_free_drhd(dmaru);
+	}
+
+	return 0;
+}
+
+static int dmar_hotplug_insert(acpi_handle handle)
+{
+	int ret;
+	int drhd_count = 0;
+
+	ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+				     &dmar_validate_one_drhd, (void *)1);
+	if (ret)
+		goto out;
+
+	ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+				     &dmar_parse_one_drhd, (void *)&drhd_count);
+	if (ret == 0 && drhd_count == 0) {
+		pr_warn(FW_BUG "No DRHD structures in buffer returned by _DSM method\n");
+		goto out;
+	} else if (ret) {
+		goto release_drhd;
+	}
+
+	ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_RHSA,
+				     &dmar_parse_one_rhsa, NULL);
+	if (ret)
+		goto release_drhd;
+
+	ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR,
+				     &dmar_parse_one_atsr, NULL);
+	if (ret)
+		goto release_atsr;
+
+	ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+				     &dmar_hp_add_drhd, NULL);
+	if (!ret)
+		return 0;
+
+	dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+			       &dmar_hp_remove_drhd, NULL);
+release_atsr:
+	dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR,
+			       &dmar_release_one_atsr, NULL);
+release_drhd:
+	dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+			       &dmar_hp_release_drhd, NULL);
+out:
+	return ret;
+}
+
+static int dmar_hotplug_remove(acpi_handle handle)
+{
+	int ret;
+
+	ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR,
+				     &dmar_check_one_atsr, NULL);
+	if (ret)
+		return ret;
+
+	ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+				     &dmar_hp_remove_drhd, NULL);
+	if (ret == 0) {
+		WARN_ON(dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR,
+					       &dmar_release_one_atsr, NULL));
+		WARN_ON(dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+					       &dmar_hp_release_drhd, NULL));
+	} else {
+		dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD,
+				       &dmar_hp_add_drhd, NULL);
+	}
+
+	return ret;
+}
+
+static acpi_status dmar_get_dsm_handle(acpi_handle handle, u32 lvl,
+				       void *context, void **retval)
+{
+	acpi_handle *phdl = retval;
+
+	if (dmar_detect_dsm(handle, DMAR_DSM_FUNC_DRHD)) {
+		*phdl = handle;
+		return AE_CTRL_TERMINATE;
+	}
+
+	return AE_OK;
+}
+
+static int dmar_device_hotplug(acpi_handle handle, bool insert)
+{
+	int ret;
+	acpi_handle tmp = NULL;
+	acpi_status status;
+
+	if (!dmar_in_use())
+		return 0;
+
+	if (dmar_detect_dsm(handle, DMAR_DSM_FUNC_DRHD)) {
+		tmp = handle;
+	} else {
+		status = acpi_walk_namespace(ACPI_TYPE_DEVICE, handle,
+					     ACPI_UINT32_MAX,
+					     dmar_get_dsm_handle,
+					     NULL, NULL, &tmp);
+		if (ACPI_FAILURE(status)) {
+			pr_warn("Failed to locate _DSM method.\n");
+			return -ENXIO;
+		}
+	}
+	if (tmp == NULL)
+		return 0;
+
+	down_write(&dmar_global_lock);
+	if (insert)
+		ret = dmar_hotplug_insert(tmp);
+	else
+		ret = dmar_hotplug_remove(tmp);
+	up_write(&dmar_global_lock);
+
+	return ret;
+}
+
+int dmar_device_add(acpi_handle handle)
+{
+	return dmar_device_hotplug(handle, true);
+}
+
+int dmar_device_remove(acpi_handle handle)
+{
+	return dmar_device_hotplug(handle, false);
+}
diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
new file mode 100644
index 000000000..4bf6049dd
--- /dev/null
+++ b/drivers/iommu/exynos-iommu.c
@@ -0,0 +1,1395 @@
+/*
+ * Copyright (c) 2011,2016 Samsung Electronics Co., Ltd.
+ *		http://www.samsung.com
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifdef CONFIG_EXYNOS_IOMMU_DEBUG
+#define DEBUG
+#endif
+
+#include <linux/clk.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/iommu.h>
+#include <linux/interrupt.h>
+#include <linux/kmemleak.h>
+#include <linux/list.h>
+#include <linux/of.h>
+#include <linux/of_iommu.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+#include <linux/dma-iommu.h>
+
+typedef u32 sysmmu_iova_t;
+typedef u32 sysmmu_pte_t;
+
+/* We do not consider super section mapping (16MB) */
+#define SECT_ORDER 20
+#define LPAGE_ORDER 16
+#define SPAGE_ORDER 12
+
+#define SECT_SIZE (1 << SECT_ORDER)
+#define LPAGE_SIZE (1 << LPAGE_ORDER)
+#define SPAGE_SIZE (1 << SPAGE_ORDER)
+
+#define SECT_MASK (~(SECT_SIZE - 1))
+#define LPAGE_MASK (~(LPAGE_SIZE - 1))
+#define SPAGE_MASK (~(SPAGE_SIZE - 1))
+
+#define lv1ent_fault(sent) ((*(sent) == ZERO_LV2LINK) || \
+			   ((*(sent) & 3) == 0) || ((*(sent) & 3) == 3))
+#define lv1ent_zero(sent) (*(sent) == ZERO_LV2LINK)
+#define lv1ent_page_zero(sent) ((*(sent) & 3) == 1)
+#define lv1ent_page(sent) ((*(sent) != ZERO_LV2LINK) && \
+			  ((*(sent) & 3) == 1))
+#define lv1ent_section(sent) ((*(sent) & 3) == 2)
+
+#define lv2ent_fault(pent) ((*(pent) & 3) == 0)
+#define lv2ent_small(pent) ((*(pent) & 2) == 2)
+#define lv2ent_large(pent) ((*(pent) & 3) == 1)
+
+/*
+ * v1.x - v3.x SYSMMU supports 32bit physical and 32bit virtual address spaces
+ * v5.0 introduced support for 36bit physical address space by shifting
+ * all page entry values by 4 bits.
+ * All SYSMMU controllers in the system support the address spaces of the same
+ * size, so PG_ENT_SHIFT can be initialized on first SYSMMU probe to proper
+ * value (0 or 4).
+ */
+static short PG_ENT_SHIFT = -1;
+#define SYSMMU_PG_ENT_SHIFT 0
+#define SYSMMU_V5_PG_ENT_SHIFT 4
+
+static const sysmmu_pte_t *LV1_PROT;
+static const sysmmu_pte_t SYSMMU_LV1_PROT[] = {
+	((0 << 15) | (0 << 10)), /* no access */
+	((1 << 15) | (1 << 10)), /* IOMMU_READ only */
+	((0 << 15) | (1 << 10)), /* IOMMU_WRITE not supported, use read/write */
+	((0 << 15) | (1 << 10)), /* IOMMU_READ | IOMMU_WRITE */
+};
+static const sysmmu_pte_t SYSMMU_V5_LV1_PROT[] = {
+	(0 << 4), /* no access */
+	(1 << 4), /* IOMMU_READ only */
+	(2 << 4), /* IOMMU_WRITE only */
+	(3 << 4), /* IOMMU_READ | IOMMU_WRITE */
+};
+
+static const sysmmu_pte_t *LV2_PROT;
+static const sysmmu_pte_t SYSMMU_LV2_PROT[] = {
+	((0 << 9) | (0 << 4)), /* no access */
+	((1 << 9) | (1 << 4)), /* IOMMU_READ only */
+	((0 << 9) | (1 << 4)), /* IOMMU_WRITE not supported, use read/write */
+	((0 << 9) | (1 << 4)), /* IOMMU_READ | IOMMU_WRITE */
+};
+static const sysmmu_pte_t SYSMMU_V5_LV2_PROT[] = {
+	(0 << 2), /* no access */
+	(1 << 2), /* IOMMU_READ only */
+	(2 << 2), /* IOMMU_WRITE only */
+	(3 << 2), /* IOMMU_READ | IOMMU_WRITE */
+};
+
+#define SYSMMU_SUPPORTED_PROT_BITS (IOMMU_READ | IOMMU_WRITE)
+
+#define sect_to_phys(ent) (((phys_addr_t) ent) << PG_ENT_SHIFT)
+#define section_phys(sent) (sect_to_phys(*(sent)) & SECT_MASK)
+#define section_offs(iova) (iova & (SECT_SIZE - 1))
+#define lpage_phys(pent) (sect_to_phys(*(pent)) & LPAGE_MASK)
+#define lpage_offs(iova) (iova & (LPAGE_SIZE - 1))
+#define spage_phys(pent) (sect_to_phys(*(pent)) & SPAGE_MASK)
+#define spage_offs(iova) (iova & (SPAGE_SIZE - 1))
+
+#define NUM_LV1ENTRIES 4096
+#define NUM_LV2ENTRIES (SECT_SIZE / SPAGE_SIZE)
+
+static u32 lv1ent_offset(sysmmu_iova_t iova)
+{
+	return iova >> SECT_ORDER;
+}
+
+static u32 lv2ent_offset(sysmmu_iova_t iova)
+{
+	return (iova >> SPAGE_ORDER) & (NUM_LV2ENTRIES - 1);
+}
+
+#define LV1TABLE_SIZE (NUM_LV1ENTRIES * sizeof(sysmmu_pte_t))
+#define LV2TABLE_SIZE (NUM_LV2ENTRIES * sizeof(sysmmu_pte_t))
+
+#define SPAGES_PER_LPAGE (LPAGE_SIZE / SPAGE_SIZE)
+#define lv2table_base(sent) (sect_to_phys(*(sent) & 0xFFFFFFC0))
+
+#define mk_lv1ent_sect(pa, prot) ((pa >> PG_ENT_SHIFT) | LV1_PROT[prot] | 2)
+#define mk_lv1ent_page(pa) ((pa >> PG_ENT_SHIFT) | 1)
+#define mk_lv2ent_lpage(pa, prot) ((pa >> PG_ENT_SHIFT) | LV2_PROT[prot] | 1)
+#define mk_lv2ent_spage(pa, prot) ((pa >> PG_ENT_SHIFT) | LV2_PROT[prot] | 2)
+
+#define CTRL_ENABLE	0x5
+#define CTRL_BLOCK	0x7
+#define CTRL_DISABLE	0x0
+
+#define CFG_LRU		0x1
+#define CFG_EAP		(1 << 2)
+#define CFG_QOS(n)	((n & 0xF) << 7)
+#define CFG_ACGEN	(1 << 24) /* System MMU 3.3 only */
+#define CFG_SYSSEL	(1 << 22) /* System MMU 3.2 only */
+#define CFG_FLPDCACHE	(1 << 20) /* System MMU 3.2+ only */
+
+/* common registers */
+#define REG_MMU_CTRL		0x000
+#define REG_MMU_CFG		0x004
+#define REG_MMU_STATUS		0x008
+#define REG_MMU_VERSION		0x034
+
+#define MMU_MAJ_VER(val)	((val) >> 7)
+#define MMU_MIN_VER(val)	((val) & 0x7F)
+#define MMU_RAW_VER(reg)	(((reg) >> 21) & ((1 << 11) - 1)) /* 11 bits */
+
+#define MAKE_MMU_VER(maj, min)	((((maj) & 0xF) << 7) | ((min) & 0x7F))
+
+/* v1.x - v3.x registers */
+#define REG_MMU_FLUSH		0x00C
+#define REG_MMU_FLUSH_ENTRY	0x010
+#define REG_PT_BASE_ADDR	0x014
+#define REG_INT_STATUS		0x018
+#define REG_INT_CLEAR		0x01C
+
+#define REG_PAGE_FAULT_ADDR	0x024
+#define REG_AW_FAULT_ADDR	0x028
+#define REG_AR_FAULT_ADDR	0x02C
+#define REG_DEFAULT_SLAVE_ADDR	0x030
+
+/* v5.x registers */
+#define REG_V5_PT_BASE_PFN	0x00C
+#define REG_V5_MMU_FLUSH_ALL	0x010
+#define REG_V5_MMU_FLUSH_ENTRY	0x014
+#define REG_V5_MMU_FLUSH_RANGE	0x018
+#define REG_V5_MMU_FLUSH_START	0x020
+#define REG_V5_MMU_FLUSH_END	0x024
+#define REG_V5_INT_STATUS	0x060
+#define REG_V5_INT_CLEAR	0x064
+#define REG_V5_FAULT_AR_VA	0x070
+#define REG_V5_FAULT_AW_VA	0x080
+
+#define has_sysmmu(dev)		(dev->archdata.iommu != NULL)
+
+static struct device *dma_dev;
+static struct kmem_cache *lv2table_kmem_cache;
+static sysmmu_pte_t *zero_lv2_table;
+#define ZERO_LV2LINK mk_lv1ent_page(virt_to_phys(zero_lv2_table))
+
+static sysmmu_pte_t *section_entry(sysmmu_pte_t *pgtable, sysmmu_iova_t iova)
+{
+	return pgtable + lv1ent_offset(iova);
+}
+
+static sysmmu_pte_t *page_entry(sysmmu_pte_t *sent, sysmmu_iova_t iova)
+{
+	return (sysmmu_pte_t *)phys_to_virt(
+				lv2table_base(sent)) + lv2ent_offset(iova);
+}
+
+/*
+ * IOMMU fault information register
+ */
+struct sysmmu_fault_info {
+	unsigned int bit;	/* bit number in STATUS register */
+	unsigned short addr_reg; /* register to read VA fault address */
+	const char *name;	/* human readable fault name */
+	unsigned int type;	/* fault type for report_iommu_fault */
+};
+
+static const struct sysmmu_fault_info sysmmu_faults[] = {
+	{ 0, REG_PAGE_FAULT_ADDR, "PAGE", IOMMU_FAULT_READ },
+	{ 1, REG_AR_FAULT_ADDR, "AR MULTI-HIT", IOMMU_FAULT_READ },
+	{ 2, REG_AW_FAULT_ADDR, "AW MULTI-HIT", IOMMU_FAULT_WRITE },
+	{ 3, REG_DEFAULT_SLAVE_ADDR, "BUS ERROR", IOMMU_FAULT_READ },
+	{ 4, REG_AR_FAULT_ADDR, "AR SECURITY PROTECTION", IOMMU_FAULT_READ },
+	{ 5, REG_AR_FAULT_ADDR, "AR ACCESS PROTECTION", IOMMU_FAULT_READ },
+	{ 6, REG_AW_FAULT_ADDR, "AW SECURITY PROTECTION", IOMMU_FAULT_WRITE },
+	{ 7, REG_AW_FAULT_ADDR, "AW ACCESS PROTECTION", IOMMU_FAULT_WRITE },
+};
+
+static const struct sysmmu_fault_info sysmmu_v5_faults[] = {
+	{ 0, REG_V5_FAULT_AR_VA, "AR PTW", IOMMU_FAULT_READ },
+	{ 1, REG_V5_FAULT_AR_VA, "AR PAGE", IOMMU_FAULT_READ },
+	{ 2, REG_V5_FAULT_AR_VA, "AR MULTI-HIT", IOMMU_FAULT_READ },
+	{ 3, REG_V5_FAULT_AR_VA, "AR ACCESS PROTECTION", IOMMU_FAULT_READ },
+	{ 4, REG_V5_FAULT_AR_VA, "AR SECURITY PROTECTION", IOMMU_FAULT_READ },
+	{ 16, REG_V5_FAULT_AW_VA, "AW PTW", IOMMU_FAULT_WRITE },
+	{ 17, REG_V5_FAULT_AW_VA, "AW PAGE", IOMMU_FAULT_WRITE },
+	{ 18, REG_V5_FAULT_AW_VA, "AW MULTI-HIT", IOMMU_FAULT_WRITE },
+	{ 19, REG_V5_FAULT_AW_VA, "AW ACCESS PROTECTION", IOMMU_FAULT_WRITE },
+	{ 20, REG_V5_FAULT_AW_VA, "AW SECURITY PROTECTION", IOMMU_FAULT_WRITE },
+};
+
+/*
+ * This structure is attached to dev.archdata.iommu of the master device
+ * on device add, contains a list of SYSMMU controllers defined by device tree,
+ * which are bound to given master device. It is usually referenced by 'owner'
+ * pointer.
+*/
+struct exynos_iommu_owner {
+	struct list_head controllers;	/* list of sysmmu_drvdata.owner_node */
+	struct iommu_domain *domain;	/* domain this device is attached */
+	struct mutex rpm_lock;		/* for runtime pm of all sysmmus */
+};
+
+/*
+ * This structure exynos specific generalization of struct iommu_domain.
+ * It contains list of SYSMMU controllers from all master devices, which has
+ * been attached to this domain and page tables of IO address space defined by
+ * it. It is usually referenced by 'domain' pointer.
+ */
+struct exynos_iommu_domain {
+	struct list_head clients; /* list of sysmmu_drvdata.domain_node */
+	sysmmu_pte_t *pgtable;	/* lv1 page table, 16KB */
+	short *lv2entcnt;	/* free lv2 entry counter for each section */
+	spinlock_t lock;	/* lock for modyfying list of clients */
+	spinlock_t pgtablelock;	/* lock for modifying page table @ pgtable */
+	struct iommu_domain domain; /* generic domain data structure */
+};
+
+/*
+ * This structure hold all data of a single SYSMMU controller, this includes
+ * hw resources like registers and clocks, pointers and list nodes to connect
+ * it to all other structures, internal state and parameters read from device
+ * tree. It is usually referenced by 'data' pointer.
+ */
+struct sysmmu_drvdata {
+	struct device *sysmmu;		/* SYSMMU controller device */
+	struct device *master;		/* master device (owner) */
+	struct device_link *link;	/* runtime PM link to master */
+	void __iomem *sfrbase;		/* our registers */
+	struct clk *clk;		/* SYSMMU's clock */
+	struct clk *aclk;		/* SYSMMU's aclk clock */
+	struct clk *pclk;		/* SYSMMU's pclk clock */
+	struct clk *clk_master;		/* master's device clock */
+	spinlock_t lock;		/* lock for modyfying state */
+	bool active;			/* current status */
+	struct exynos_iommu_domain *domain; /* domain we belong to */
+	struct list_head domain_node;	/* node for domain clients list */
+	struct list_head owner_node;	/* node for owner controllers list */
+	phys_addr_t pgtable;		/* assigned page table structure */
+	unsigned int version;		/* our version */
+
+	struct iommu_device iommu;	/* IOMMU core handle */
+};
+
+static struct exynos_iommu_domain *to_exynos_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct exynos_iommu_domain, domain);
+}
+
+static void sysmmu_unblock(struct sysmmu_drvdata *data)
+{
+	writel(CTRL_ENABLE, data->sfrbase + REG_MMU_CTRL);
+}
+
+static bool sysmmu_block(struct sysmmu_drvdata *data)
+{
+	int i = 120;
+
+	writel(CTRL_BLOCK, data->sfrbase + REG_MMU_CTRL);
+	while ((i > 0) && !(readl(data->sfrbase + REG_MMU_STATUS) & 1))
+		--i;
+
+	if (!(readl(data->sfrbase + REG_MMU_STATUS) & 1)) {
+		sysmmu_unblock(data);
+		return false;
+	}
+
+	return true;
+}
+
+static void __sysmmu_tlb_invalidate(struct sysmmu_drvdata *data)
+{
+	if (MMU_MAJ_VER(data->version) < 5)
+		writel(0x1, data->sfrbase + REG_MMU_FLUSH);
+	else
+		writel(0x1, data->sfrbase + REG_V5_MMU_FLUSH_ALL);
+}
+
+static void __sysmmu_tlb_invalidate_entry(struct sysmmu_drvdata *data,
+				sysmmu_iova_t iova, unsigned int num_inv)
+{
+	unsigned int i;
+
+	if (MMU_MAJ_VER(data->version) < 5) {
+		for (i = 0; i < num_inv; i++) {
+			writel((iova & SPAGE_MASK) | 1,
+				     data->sfrbase + REG_MMU_FLUSH_ENTRY);
+			iova += SPAGE_SIZE;
+		}
+	} else {
+		if (num_inv == 1) {
+			writel((iova & SPAGE_MASK) | 1,
+				     data->sfrbase + REG_V5_MMU_FLUSH_ENTRY);
+		} else {
+			writel((iova & SPAGE_MASK),
+				     data->sfrbase + REG_V5_MMU_FLUSH_START);
+			writel((iova & SPAGE_MASK) + (num_inv - 1) * SPAGE_SIZE,
+				     data->sfrbase + REG_V5_MMU_FLUSH_END);
+			writel(1, data->sfrbase + REG_V5_MMU_FLUSH_RANGE);
+		}
+	}
+}
+
+static void __sysmmu_set_ptbase(struct sysmmu_drvdata *data, phys_addr_t pgd)
+{
+	if (MMU_MAJ_VER(data->version) < 5)
+		writel(pgd, data->sfrbase + REG_PT_BASE_ADDR);
+	else
+		writel(pgd >> PAGE_SHIFT,
+			     data->sfrbase + REG_V5_PT_BASE_PFN);
+
+	__sysmmu_tlb_invalidate(data);
+}
+
+static void __sysmmu_enable_clocks(struct sysmmu_drvdata *data)
+{
+	BUG_ON(clk_prepare_enable(data->clk_master));
+	BUG_ON(clk_prepare_enable(data->clk));
+	BUG_ON(clk_prepare_enable(data->pclk));
+	BUG_ON(clk_prepare_enable(data->aclk));
+}
+
+static void __sysmmu_disable_clocks(struct sysmmu_drvdata *data)
+{
+	clk_disable_unprepare(data->aclk);
+	clk_disable_unprepare(data->pclk);
+	clk_disable_unprepare(data->clk);
+	clk_disable_unprepare(data->clk_master);
+}
+
+static void __sysmmu_get_version(struct sysmmu_drvdata *data)
+{
+	u32 ver;
+
+	__sysmmu_enable_clocks(data);
+
+	ver = readl(data->sfrbase + REG_MMU_VERSION);
+
+	/* controllers on some SoCs don't report proper version */
+	if (ver == 0x80000001u)
+		data->version = MAKE_MMU_VER(1, 0);
+	else
+		data->version = MMU_RAW_VER(ver);
+
+	dev_dbg(data->sysmmu, "hardware version: %d.%d\n",
+		MMU_MAJ_VER(data->version), MMU_MIN_VER(data->version));
+
+	__sysmmu_disable_clocks(data);
+}
+
+static void show_fault_information(struct sysmmu_drvdata *data,
+				   const struct sysmmu_fault_info *finfo,
+				   sysmmu_iova_t fault_addr)
+{
+	sysmmu_pte_t *ent;
+
+	dev_err(data->sysmmu, "%s: %s FAULT occurred at %#x\n",
+		dev_name(data->master), finfo->name, fault_addr);
+	dev_dbg(data->sysmmu, "Page table base: %pa\n", &data->pgtable);
+	ent = section_entry(phys_to_virt(data->pgtable), fault_addr);
+	dev_dbg(data->sysmmu, "\tLv1 entry: %#x\n", *ent);
+	if (lv1ent_page(ent)) {
+		ent = page_entry(ent, fault_addr);
+		dev_dbg(data->sysmmu, "\t Lv2 entry: %#x\n", *ent);
+	}
+}
+
+static irqreturn_t exynos_sysmmu_irq(int irq, void *dev_id)
+{
+	/* SYSMMU is in blocked state when interrupt occurred. */
+	struct sysmmu_drvdata *data = dev_id;
+	const struct sysmmu_fault_info *finfo;
+	unsigned int i, n, itype;
+	sysmmu_iova_t fault_addr = -1;
+	unsigned short reg_status, reg_clear;
+	int ret = -ENOSYS;
+
+	WARN_ON(!data->active);
+
+	if (MMU_MAJ_VER(data->version) < 5) {
+		reg_status = REG_INT_STATUS;
+		reg_clear = REG_INT_CLEAR;
+		finfo = sysmmu_faults;
+		n = ARRAY_SIZE(sysmmu_faults);
+	} else {
+		reg_status = REG_V5_INT_STATUS;
+		reg_clear = REG_V5_INT_CLEAR;
+		finfo = sysmmu_v5_faults;
+		n = ARRAY_SIZE(sysmmu_v5_faults);
+	}
+
+	spin_lock(&data->lock);
+
+	clk_enable(data->clk_master);
+
+	itype = __ffs(readl(data->sfrbase + reg_status));
+	for (i = 0; i < n; i++, finfo++)
+		if (finfo->bit == itype)
+			break;
+	/* unknown/unsupported fault */
+	BUG_ON(i == n);
+
+	/* print debug message */
+	fault_addr = readl(data->sfrbase + finfo->addr_reg);
+	show_fault_information(data, finfo, fault_addr);
+
+	if (data->domain)
+		ret = report_iommu_fault(&data->domain->domain,
+					data->master, fault_addr, finfo->type);
+	/* fault is not recovered by fault handler */
+	BUG_ON(ret != 0);
+
+	writel(1 << itype, data->sfrbase + reg_clear);
+
+	sysmmu_unblock(data);
+
+	clk_disable(data->clk_master);
+
+	spin_unlock(&data->lock);
+
+	return IRQ_HANDLED;
+}
+
+static void __sysmmu_disable(struct sysmmu_drvdata *data)
+{
+	unsigned long flags;
+
+	clk_enable(data->clk_master);
+
+	spin_lock_irqsave(&data->lock, flags);
+	writel(CTRL_DISABLE, data->sfrbase + REG_MMU_CTRL);
+	writel(0, data->sfrbase + REG_MMU_CFG);
+	data->active = false;
+	spin_unlock_irqrestore(&data->lock, flags);
+
+	__sysmmu_disable_clocks(data);
+}
+
+static void __sysmmu_init_config(struct sysmmu_drvdata *data)
+{
+	unsigned int cfg;
+
+	if (data->version <= MAKE_MMU_VER(3, 1))
+		cfg = CFG_LRU | CFG_QOS(15);
+	else if (data->version <= MAKE_MMU_VER(3, 2))
+		cfg = CFG_LRU | CFG_QOS(15) | CFG_FLPDCACHE | CFG_SYSSEL;
+	else
+		cfg = CFG_QOS(15) | CFG_FLPDCACHE | CFG_ACGEN;
+
+	cfg |= CFG_EAP; /* enable access protection bits check */
+
+	writel(cfg, data->sfrbase + REG_MMU_CFG);
+}
+
+static void __sysmmu_enable(struct sysmmu_drvdata *data)
+{
+	unsigned long flags;
+
+	__sysmmu_enable_clocks(data);
+
+	spin_lock_irqsave(&data->lock, flags);
+	writel(CTRL_BLOCK, data->sfrbase + REG_MMU_CTRL);
+	__sysmmu_init_config(data);
+	__sysmmu_set_ptbase(data, data->pgtable);
+	writel(CTRL_ENABLE, data->sfrbase + REG_MMU_CTRL);
+	data->active = true;
+	spin_unlock_irqrestore(&data->lock, flags);
+
+	/*
+	 * SYSMMU driver keeps master's clock enabled only for the short
+	 * time, while accessing the registers. For performing address
+	 * translation during DMA transaction it relies on the client
+	 * driver to enable it.
+	 */
+	clk_disable(data->clk_master);
+}
+
+static void sysmmu_tlb_invalidate_flpdcache(struct sysmmu_drvdata *data,
+					    sysmmu_iova_t iova)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&data->lock, flags);
+	if (data->active && data->version >= MAKE_MMU_VER(3, 3)) {
+		clk_enable(data->clk_master);
+		if (sysmmu_block(data)) {
+			if (data->version >= MAKE_MMU_VER(5, 0))
+				__sysmmu_tlb_invalidate(data);
+			else
+				__sysmmu_tlb_invalidate_entry(data, iova, 1);
+			sysmmu_unblock(data);
+		}
+		clk_disable(data->clk_master);
+	}
+	spin_unlock_irqrestore(&data->lock, flags);
+}
+
+static void sysmmu_tlb_invalidate_entry(struct sysmmu_drvdata *data,
+					sysmmu_iova_t iova, size_t size)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&data->lock, flags);
+	if (data->active) {
+		unsigned int num_inv = 1;
+
+		clk_enable(data->clk_master);
+
+		/*
+		 * L2TLB invalidation required
+		 * 4KB page: 1 invalidation
+		 * 64KB page: 16 invalidations
+		 * 1MB page: 64 invalidations
+		 * because it is set-associative TLB
+		 * with 8-way and 64 sets.
+		 * 1MB page can be cached in one of all sets.
+		 * 64KB page can be one of 16 consecutive sets.
+		 */
+		if (MMU_MAJ_VER(data->version) == 2)
+			num_inv = min_t(unsigned int, size / PAGE_SIZE, 64);
+
+		if (sysmmu_block(data)) {
+			__sysmmu_tlb_invalidate_entry(data, iova, num_inv);
+			sysmmu_unblock(data);
+		}
+		clk_disable(data->clk_master);
+	}
+	spin_unlock_irqrestore(&data->lock, flags);
+}
+
+static const struct iommu_ops exynos_iommu_ops;
+
+static int __init exynos_sysmmu_probe(struct platform_device *pdev)
+{
+	int irq, ret;
+	struct device *dev = &pdev->dev;
+	struct sysmmu_drvdata *data;
+	struct resource *res;
+
+	data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	data->sfrbase = devm_ioremap_resource(dev, res);
+	if (IS_ERR(data->sfrbase))
+		return PTR_ERR(data->sfrbase);
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq <= 0) {
+		dev_err(dev, "Unable to find IRQ resource\n");
+		return irq;
+	}
+
+	ret = devm_request_irq(dev, irq, exynos_sysmmu_irq, 0,
+				dev_name(dev), data);
+	if (ret) {
+		dev_err(dev, "Unabled to register handler of irq %d\n", irq);
+		return ret;
+	}
+
+	data->clk = devm_clk_get(dev, "sysmmu");
+	if (PTR_ERR(data->clk) == -ENOENT)
+		data->clk = NULL;
+	else if (IS_ERR(data->clk))
+		return PTR_ERR(data->clk);
+
+	data->aclk = devm_clk_get(dev, "aclk");
+	if (PTR_ERR(data->aclk) == -ENOENT)
+		data->aclk = NULL;
+	else if (IS_ERR(data->aclk))
+		return PTR_ERR(data->aclk);
+
+	data->pclk = devm_clk_get(dev, "pclk");
+	if (PTR_ERR(data->pclk) == -ENOENT)
+		data->pclk = NULL;
+	else if (IS_ERR(data->pclk))
+		return PTR_ERR(data->pclk);
+
+	if (!data->clk && (!data->aclk || !data->pclk)) {
+		dev_err(dev, "Failed to get device clock(s)!\n");
+		return -ENOSYS;
+	}
+
+	data->clk_master = devm_clk_get(dev, "master");
+	if (PTR_ERR(data->clk_master) == -ENOENT)
+		data->clk_master = NULL;
+	else if (IS_ERR(data->clk_master))
+		return PTR_ERR(data->clk_master);
+
+	data->sysmmu = dev;
+	spin_lock_init(&data->lock);
+
+	ret = iommu_device_sysfs_add(&data->iommu, &pdev->dev, NULL,
+				     dev_name(data->sysmmu));
+	if (ret)
+		return ret;
+
+	iommu_device_set_ops(&data->iommu, &exynos_iommu_ops);
+	iommu_device_set_fwnode(&data->iommu, &dev->of_node->fwnode);
+
+	ret = iommu_device_register(&data->iommu);
+	if (ret)
+		return ret;
+
+	platform_set_drvdata(pdev, data);
+
+	__sysmmu_get_version(data);
+	if (PG_ENT_SHIFT < 0) {
+		if (MMU_MAJ_VER(data->version) < 5) {
+			PG_ENT_SHIFT = SYSMMU_PG_ENT_SHIFT;
+			LV1_PROT = SYSMMU_LV1_PROT;
+			LV2_PROT = SYSMMU_LV2_PROT;
+		} else {
+			PG_ENT_SHIFT = SYSMMU_V5_PG_ENT_SHIFT;
+			LV1_PROT = SYSMMU_V5_LV1_PROT;
+			LV2_PROT = SYSMMU_V5_LV2_PROT;
+		}
+	}
+
+	/*
+	 * use the first registered sysmmu device for performing
+	 * dma mapping operations on iommu page tables (cpu cache flush)
+	 */
+	if (!dma_dev)
+		dma_dev = &pdev->dev;
+
+	pm_runtime_enable(dev);
+
+	return 0;
+}
+
+static int __maybe_unused exynos_sysmmu_suspend(struct device *dev)
+{
+	struct sysmmu_drvdata *data = dev_get_drvdata(dev);
+	struct device *master = data->master;
+
+	if (master) {
+		struct exynos_iommu_owner *owner = master->archdata.iommu;
+
+		mutex_lock(&owner->rpm_lock);
+		if (data->domain) {
+			dev_dbg(data->sysmmu, "saving state\n");
+			__sysmmu_disable(data);
+		}
+		mutex_unlock(&owner->rpm_lock);
+	}
+	return 0;
+}
+
+static int __maybe_unused exynos_sysmmu_resume(struct device *dev)
+{
+	struct sysmmu_drvdata *data = dev_get_drvdata(dev);
+	struct device *master = data->master;
+
+	if (master) {
+		struct exynos_iommu_owner *owner = master->archdata.iommu;
+
+		mutex_lock(&owner->rpm_lock);
+		if (data->domain) {
+			dev_dbg(data->sysmmu, "restoring state\n");
+			__sysmmu_enable(data);
+		}
+		mutex_unlock(&owner->rpm_lock);
+	}
+	return 0;
+}
+
+static const struct dev_pm_ops sysmmu_pm_ops = {
+	SET_RUNTIME_PM_OPS(exynos_sysmmu_suspend, exynos_sysmmu_resume, NULL)
+	SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend,
+				pm_runtime_force_resume)
+};
+
+static const struct of_device_id sysmmu_of_match[] = {
+	{ .compatible	= "samsung,exynos-sysmmu", },
+	{ },
+};
+
+static struct platform_driver exynos_sysmmu_driver __refdata = {
+	.probe	= exynos_sysmmu_probe,
+	.driver	= {
+		.name		= "exynos-sysmmu",
+		.of_match_table	= sysmmu_of_match,
+		.pm		= &sysmmu_pm_ops,
+		.suppress_bind_attrs = true,
+	}
+};
+
+static inline void update_pte(sysmmu_pte_t *ent, sysmmu_pte_t val)
+{
+	dma_sync_single_for_cpu(dma_dev, virt_to_phys(ent), sizeof(*ent),
+				DMA_TO_DEVICE);
+	*ent = cpu_to_le32(val);
+	dma_sync_single_for_device(dma_dev, virt_to_phys(ent), sizeof(*ent),
+				   DMA_TO_DEVICE);
+}
+
+static struct iommu_domain *exynos_iommu_domain_alloc(unsigned type)
+{
+	struct exynos_iommu_domain *domain;
+	dma_addr_t handle;
+	int i;
+
+	/* Check if correct PTE offsets are initialized */
+	BUG_ON(PG_ENT_SHIFT < 0 || !dma_dev);
+
+	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+	if (!domain)
+		return NULL;
+
+	if (type == IOMMU_DOMAIN_DMA) {
+		if (iommu_get_dma_cookie(&domain->domain) != 0)
+			goto err_pgtable;
+	} else if (type != IOMMU_DOMAIN_UNMANAGED) {
+		goto err_pgtable;
+	}
+
+	domain->pgtable = (sysmmu_pte_t *)__get_free_pages(GFP_KERNEL, 2);
+	if (!domain->pgtable)
+		goto err_dma_cookie;
+
+	domain->lv2entcnt = (short *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
+	if (!domain->lv2entcnt)
+		goto err_counter;
+
+	/* Workaround for System MMU v3.3 to prevent caching 1MiB mapping */
+	for (i = 0; i < NUM_LV1ENTRIES; i++)
+		domain->pgtable[i] = ZERO_LV2LINK;
+
+	handle = dma_map_single(dma_dev, domain->pgtable, LV1TABLE_SIZE,
+				DMA_TO_DEVICE);
+	/* For mapping page table entries we rely on dma == phys */
+	BUG_ON(handle != virt_to_phys(domain->pgtable));
+	if (dma_mapping_error(dma_dev, handle))
+		goto err_lv2ent;
+
+	spin_lock_init(&domain->lock);
+	spin_lock_init(&domain->pgtablelock);
+	INIT_LIST_HEAD(&domain->clients);
+
+	domain->domain.geometry.aperture_start = 0;
+	domain->domain.geometry.aperture_end   = ~0UL;
+	domain->domain.geometry.force_aperture = true;
+
+	return &domain->domain;
+
+err_lv2ent:
+	free_pages((unsigned long)domain->lv2entcnt, 1);
+err_counter:
+	free_pages((unsigned long)domain->pgtable, 2);
+err_dma_cookie:
+	if (type == IOMMU_DOMAIN_DMA)
+		iommu_put_dma_cookie(&domain->domain);
+err_pgtable:
+	kfree(domain);
+	return NULL;
+}
+
+static void exynos_iommu_domain_free(struct iommu_domain *iommu_domain)
+{
+	struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
+	struct sysmmu_drvdata *data, *next;
+	unsigned long flags;
+	int i;
+
+	WARN_ON(!list_empty(&domain->clients));
+
+	spin_lock_irqsave(&domain->lock, flags);
+
+	list_for_each_entry_safe(data, next, &domain->clients, domain_node) {
+		spin_lock(&data->lock);
+		__sysmmu_disable(data);
+		data->pgtable = 0;
+		data->domain = NULL;
+		list_del_init(&data->domain_node);
+		spin_unlock(&data->lock);
+	}
+
+	spin_unlock_irqrestore(&domain->lock, flags);
+
+	if (iommu_domain->type == IOMMU_DOMAIN_DMA)
+		iommu_put_dma_cookie(iommu_domain);
+
+	dma_unmap_single(dma_dev, virt_to_phys(domain->pgtable), LV1TABLE_SIZE,
+			 DMA_TO_DEVICE);
+
+	for (i = 0; i < NUM_LV1ENTRIES; i++)
+		if (lv1ent_page(domain->pgtable + i)) {
+			phys_addr_t base = lv2table_base(domain->pgtable + i);
+
+			dma_unmap_single(dma_dev, base, LV2TABLE_SIZE,
+					 DMA_TO_DEVICE);
+			kmem_cache_free(lv2table_kmem_cache,
+					phys_to_virt(base));
+		}
+
+	free_pages((unsigned long)domain->pgtable, 2);
+	free_pages((unsigned long)domain->lv2entcnt, 1);
+	kfree(domain);
+}
+
+static void exynos_iommu_detach_device(struct iommu_domain *iommu_domain,
+				    struct device *dev)
+{
+	struct exynos_iommu_owner *owner = dev->archdata.iommu;
+	struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
+	phys_addr_t pagetable = virt_to_phys(domain->pgtable);
+	struct sysmmu_drvdata *data, *next;
+	unsigned long flags;
+
+	if (!has_sysmmu(dev) || owner->domain != iommu_domain)
+		return;
+
+	mutex_lock(&owner->rpm_lock);
+
+	list_for_each_entry(data, &owner->controllers, owner_node) {
+		pm_runtime_get_noresume(data->sysmmu);
+		if (pm_runtime_active(data->sysmmu))
+			__sysmmu_disable(data);
+		pm_runtime_put(data->sysmmu);
+	}
+
+	spin_lock_irqsave(&domain->lock, flags);
+	list_for_each_entry_safe(data, next, &domain->clients, domain_node) {
+		spin_lock(&data->lock);
+		data->pgtable = 0;
+		data->domain = NULL;
+		list_del_init(&data->domain_node);
+		spin_unlock(&data->lock);
+	}
+	owner->domain = NULL;
+	spin_unlock_irqrestore(&domain->lock, flags);
+
+	mutex_unlock(&owner->rpm_lock);
+
+	dev_dbg(dev, "%s: Detached IOMMU with pgtable %pa\n", __func__,
+		&pagetable);
+}
+
+static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain,
+				   struct device *dev)
+{
+	struct exynos_iommu_owner *owner = dev->archdata.iommu;
+	struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
+	struct sysmmu_drvdata *data;
+	phys_addr_t pagetable = virt_to_phys(domain->pgtable);
+	unsigned long flags;
+
+	if (!has_sysmmu(dev))
+		return -ENODEV;
+
+	if (owner->domain)
+		exynos_iommu_detach_device(owner->domain, dev);
+
+	mutex_lock(&owner->rpm_lock);
+
+	spin_lock_irqsave(&domain->lock, flags);
+	list_for_each_entry(data, &owner->controllers, owner_node) {
+		spin_lock(&data->lock);
+		data->pgtable = pagetable;
+		data->domain = domain;
+		list_add_tail(&data->domain_node, &domain->clients);
+		spin_unlock(&data->lock);
+	}
+	owner->domain = iommu_domain;
+	spin_unlock_irqrestore(&domain->lock, flags);
+
+	list_for_each_entry(data, &owner->controllers, owner_node) {
+		pm_runtime_get_noresume(data->sysmmu);
+		if (pm_runtime_active(data->sysmmu))
+			__sysmmu_enable(data);
+		pm_runtime_put(data->sysmmu);
+	}
+
+	mutex_unlock(&owner->rpm_lock);
+
+	dev_dbg(dev, "%s: Attached IOMMU with pgtable %pa\n", __func__,
+		&pagetable);
+
+	return 0;
+}
+
+static sysmmu_pte_t *alloc_lv2entry(struct exynos_iommu_domain *domain,
+		sysmmu_pte_t *sent, sysmmu_iova_t iova, short *pgcounter)
+{
+	if (lv1ent_section(sent)) {
+		WARN(1, "Trying mapping on %#08x mapped with 1MiB page", iova);
+		return ERR_PTR(-EADDRINUSE);
+	}
+
+	if (lv1ent_fault(sent)) {
+		dma_addr_t handle;
+		sysmmu_pte_t *pent;
+		bool need_flush_flpd_cache = lv1ent_zero(sent);
+
+		pent = kmem_cache_zalloc(lv2table_kmem_cache, GFP_ATOMIC);
+		BUG_ON((uintptr_t)pent & (LV2TABLE_SIZE - 1));
+		if (!pent)
+			return ERR_PTR(-ENOMEM);
+
+		update_pte(sent, mk_lv1ent_page(virt_to_phys(pent)));
+		kmemleak_ignore(pent);
+		*pgcounter = NUM_LV2ENTRIES;
+		handle = dma_map_single(dma_dev, pent, LV2TABLE_SIZE,
+					DMA_TO_DEVICE);
+		if (dma_mapping_error(dma_dev, handle)) {
+			kmem_cache_free(lv2table_kmem_cache, pent);
+			return ERR_PTR(-EADDRINUSE);
+		}
+
+		/*
+		 * If pre-fetched SLPD is a faulty SLPD in zero_l2_table,
+		 * FLPD cache may cache the address of zero_l2_table. This
+		 * function replaces the zero_l2_table with new L2 page table
+		 * to write valid mappings.
+		 * Accessing the valid area may cause page fault since FLPD
+		 * cache may still cache zero_l2_table for the valid area
+		 * instead of new L2 page table that has the mapping
+		 * information of the valid area.
+		 * Thus any replacement of zero_l2_table with other valid L2
+		 * page table must involve FLPD cache invalidation for System
+		 * MMU v3.3.
+		 * FLPD cache invalidation is performed with TLB invalidation
+		 * by VPN without blocking. It is safe to invalidate TLB without
+		 * blocking because the target address of TLB invalidation is
+		 * not currently mapped.
+		 */
+		if (need_flush_flpd_cache) {
+			struct sysmmu_drvdata *data;
+
+			spin_lock(&domain->lock);
+			list_for_each_entry(data, &domain->clients, domain_node)
+				sysmmu_tlb_invalidate_flpdcache(data, iova);
+			spin_unlock(&domain->lock);
+		}
+	}
+
+	return page_entry(sent, iova);
+}
+
+static int lv1set_section(struct exynos_iommu_domain *domain,
+			  sysmmu_pte_t *sent, sysmmu_iova_t iova,
+			  phys_addr_t paddr, int prot, short *pgcnt)
+{
+	if (lv1ent_section(sent)) {
+		WARN(1, "Trying mapping on 1MiB@%#08x that is mapped",
+			iova);
+		return -EADDRINUSE;
+	}
+
+	if (lv1ent_page(sent)) {
+		if (*pgcnt != NUM_LV2ENTRIES) {
+			WARN(1, "Trying mapping on 1MiB@%#08x that is mapped",
+				iova);
+			return -EADDRINUSE;
+		}
+
+		kmem_cache_free(lv2table_kmem_cache, page_entry(sent, 0));
+		*pgcnt = 0;
+	}
+
+	update_pte(sent, mk_lv1ent_sect(paddr, prot));
+
+	spin_lock(&domain->lock);
+	if (lv1ent_page_zero(sent)) {
+		struct sysmmu_drvdata *data;
+		/*
+		 * Flushing FLPD cache in System MMU v3.3 that may cache a FLPD
+		 * entry by speculative prefetch of SLPD which has no mapping.
+		 */
+		list_for_each_entry(data, &domain->clients, domain_node)
+			sysmmu_tlb_invalidate_flpdcache(data, iova);
+	}
+	spin_unlock(&domain->lock);
+
+	return 0;
+}
+
+static int lv2set_page(sysmmu_pte_t *pent, phys_addr_t paddr, size_t size,
+		       int prot, short *pgcnt)
+{
+	if (size == SPAGE_SIZE) {
+		if (WARN_ON(!lv2ent_fault(pent)))
+			return -EADDRINUSE;
+
+		update_pte(pent, mk_lv2ent_spage(paddr, prot));
+		*pgcnt -= 1;
+	} else { /* size == LPAGE_SIZE */
+		int i;
+		dma_addr_t pent_base = virt_to_phys(pent);
+
+		dma_sync_single_for_cpu(dma_dev, pent_base,
+					sizeof(*pent) * SPAGES_PER_LPAGE,
+					DMA_TO_DEVICE);
+		for (i = 0; i < SPAGES_PER_LPAGE; i++, pent++) {
+			if (WARN_ON(!lv2ent_fault(pent))) {
+				if (i > 0)
+					memset(pent - i, 0, sizeof(*pent) * i);
+				return -EADDRINUSE;
+			}
+
+			*pent = mk_lv2ent_lpage(paddr, prot);
+		}
+		dma_sync_single_for_device(dma_dev, pent_base,
+					   sizeof(*pent) * SPAGES_PER_LPAGE,
+					   DMA_TO_DEVICE);
+		*pgcnt -= SPAGES_PER_LPAGE;
+	}
+
+	return 0;
+}
+
+/*
+ * *CAUTION* to the I/O virtual memory managers that support exynos-iommu:
+ *
+ * System MMU v3.x has advanced logic to improve address translation
+ * performance with caching more page table entries by a page table walk.
+ * However, the logic has a bug that while caching faulty page table entries,
+ * System MMU reports page fault if the cached fault entry is hit even though
+ * the fault entry is updated to a valid entry after the entry is cached.
+ * To prevent caching faulty page table entries which may be updated to valid
+ * entries later, the virtual memory manager should care about the workaround
+ * for the problem. The following describes the workaround.
+ *
+ * Any two consecutive I/O virtual address regions must have a hole of 128KiB
+ * at maximum to prevent misbehavior of System MMU 3.x (workaround for h/w bug).
+ *
+ * Precisely, any start address of I/O virtual region must be aligned with
+ * the following sizes for System MMU v3.1 and v3.2.
+ * System MMU v3.1: 128KiB
+ * System MMU v3.2: 256KiB
+ *
+ * Because System MMU v3.3 caches page table entries more aggressively, it needs
+ * more workarounds.
+ * - Any two consecutive I/O virtual regions must have a hole of size larger
+ *   than or equal to 128KiB.
+ * - Start address of an I/O virtual region must be aligned by 128KiB.
+ */
+static int exynos_iommu_map(struct iommu_domain *iommu_domain,
+			    unsigned long l_iova, phys_addr_t paddr, size_t size,
+			    int prot)
+{
+	struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
+	sysmmu_pte_t *entry;
+	sysmmu_iova_t iova = (sysmmu_iova_t)l_iova;
+	unsigned long flags;
+	int ret = -ENOMEM;
+
+	BUG_ON(domain->pgtable == NULL);
+	prot &= SYSMMU_SUPPORTED_PROT_BITS;
+
+	spin_lock_irqsave(&domain->pgtablelock, flags);
+
+	entry = section_entry(domain->pgtable, iova);
+
+	if (size == SECT_SIZE) {
+		ret = lv1set_section(domain, entry, iova, paddr, prot,
+				     &domain->lv2entcnt[lv1ent_offset(iova)]);
+	} else {
+		sysmmu_pte_t *pent;
+
+		pent = alloc_lv2entry(domain, entry, iova,
+				      &domain->lv2entcnt[lv1ent_offset(iova)]);
+
+		if (IS_ERR(pent))
+			ret = PTR_ERR(pent);
+		else
+			ret = lv2set_page(pent, paddr, size, prot,
+				       &domain->lv2entcnt[lv1ent_offset(iova)]);
+	}
+
+	if (ret)
+		pr_err("%s: Failed(%d) to map %#zx bytes @ %#x\n",
+			__func__, ret, size, iova);
+
+	spin_unlock_irqrestore(&domain->pgtablelock, flags);
+
+	return ret;
+}
+
+static void exynos_iommu_tlb_invalidate_entry(struct exynos_iommu_domain *domain,
+					      sysmmu_iova_t iova, size_t size)
+{
+	struct sysmmu_drvdata *data;
+	unsigned long flags;
+
+	spin_lock_irqsave(&domain->lock, flags);
+
+	list_for_each_entry(data, &domain->clients, domain_node)
+		sysmmu_tlb_invalidate_entry(data, iova, size);
+
+	spin_unlock_irqrestore(&domain->lock, flags);
+}
+
+static size_t exynos_iommu_unmap(struct iommu_domain *iommu_domain,
+				 unsigned long l_iova, size_t size)
+{
+	struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
+	sysmmu_iova_t iova = (sysmmu_iova_t)l_iova;
+	sysmmu_pte_t *ent;
+	size_t err_pgsize;
+	unsigned long flags;
+
+	BUG_ON(domain->pgtable == NULL);
+
+	spin_lock_irqsave(&domain->pgtablelock, flags);
+
+	ent = section_entry(domain->pgtable, iova);
+
+	if (lv1ent_section(ent)) {
+		if (WARN_ON(size < SECT_SIZE)) {
+			err_pgsize = SECT_SIZE;
+			goto err;
+		}
+
+		/* workaround for h/w bug in System MMU v3.3 */
+		update_pte(ent, ZERO_LV2LINK);
+		size = SECT_SIZE;
+		goto done;
+	}
+
+	if (unlikely(lv1ent_fault(ent))) {
+		if (size > SECT_SIZE)
+			size = SECT_SIZE;
+		goto done;
+	}
+
+	/* lv1ent_page(sent) == true here */
+
+	ent = page_entry(ent, iova);
+
+	if (unlikely(lv2ent_fault(ent))) {
+		size = SPAGE_SIZE;
+		goto done;
+	}
+
+	if (lv2ent_small(ent)) {
+		update_pte(ent, 0);
+		size = SPAGE_SIZE;
+		domain->lv2entcnt[lv1ent_offset(iova)] += 1;
+		goto done;
+	}
+
+	/* lv1ent_large(ent) == true here */
+	if (WARN_ON(size < LPAGE_SIZE)) {
+		err_pgsize = LPAGE_SIZE;
+		goto err;
+	}
+
+	dma_sync_single_for_cpu(dma_dev, virt_to_phys(ent),
+				sizeof(*ent) * SPAGES_PER_LPAGE,
+				DMA_TO_DEVICE);
+	memset(ent, 0, sizeof(*ent) * SPAGES_PER_LPAGE);
+	dma_sync_single_for_device(dma_dev, virt_to_phys(ent),
+				   sizeof(*ent) * SPAGES_PER_LPAGE,
+				   DMA_TO_DEVICE);
+	size = LPAGE_SIZE;
+	domain->lv2entcnt[lv1ent_offset(iova)] += SPAGES_PER_LPAGE;
+done:
+	spin_unlock_irqrestore(&domain->pgtablelock, flags);
+
+	exynos_iommu_tlb_invalidate_entry(domain, iova, size);
+
+	return size;
+err:
+	spin_unlock_irqrestore(&domain->pgtablelock, flags);
+
+	pr_err("%s: Failed: size(%#zx) @ %#x is smaller than page size %#zx\n",
+		__func__, size, iova, err_pgsize);
+
+	return 0;
+}
+
+static phys_addr_t exynos_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
+					  dma_addr_t iova)
+{
+	struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
+	sysmmu_pte_t *entry;
+	unsigned long flags;
+	phys_addr_t phys = 0;
+
+	spin_lock_irqsave(&domain->pgtablelock, flags);
+
+	entry = section_entry(domain->pgtable, iova);
+
+	if (lv1ent_section(entry)) {
+		phys = section_phys(entry) + section_offs(iova);
+	} else if (lv1ent_page(entry)) {
+		entry = page_entry(entry, iova);
+
+		if (lv2ent_large(entry))
+			phys = lpage_phys(entry) + lpage_offs(iova);
+		else if (lv2ent_small(entry))
+			phys = spage_phys(entry) + spage_offs(iova);
+	}
+
+	spin_unlock_irqrestore(&domain->pgtablelock, flags);
+
+	return phys;
+}
+
+static int exynos_iommu_add_device(struct device *dev)
+{
+	struct exynos_iommu_owner *owner = dev->archdata.iommu;
+	struct sysmmu_drvdata *data;
+	struct iommu_group *group;
+
+	if (!has_sysmmu(dev))
+		return -ENODEV;
+
+	group = iommu_group_get_for_dev(dev);
+
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	list_for_each_entry(data, &owner->controllers, owner_node) {
+		/*
+		 * SYSMMU will be runtime activated via device link
+		 * (dependency) to its master device, so there are no
+		 * direct calls to pm_runtime_get/put in this driver.
+		 */
+		data->link = device_link_add(dev, data->sysmmu,
+					     DL_FLAG_PM_RUNTIME);
+	}
+	iommu_group_put(group);
+
+	return 0;
+}
+
+static void exynos_iommu_remove_device(struct device *dev)
+{
+	struct exynos_iommu_owner *owner = dev->archdata.iommu;
+	struct sysmmu_drvdata *data;
+
+	if (!has_sysmmu(dev))
+		return;
+
+	if (owner->domain) {
+		struct iommu_group *group = iommu_group_get(dev);
+
+		if (group) {
+			WARN_ON(owner->domain !=
+				iommu_group_default_domain(group));
+			exynos_iommu_detach_device(owner->domain, dev);
+			iommu_group_put(group);
+		}
+	}
+	iommu_group_remove_device(dev);
+
+	list_for_each_entry(data, &owner->controllers, owner_node)
+		device_link_del(data->link);
+}
+
+static int exynos_iommu_of_xlate(struct device *dev,
+				 struct of_phandle_args *spec)
+{
+	struct exynos_iommu_owner *owner = dev->archdata.iommu;
+	struct platform_device *sysmmu = of_find_device_by_node(spec->np);
+	struct sysmmu_drvdata *data, *entry;
+
+	if (!sysmmu)
+		return -ENODEV;
+
+	data = platform_get_drvdata(sysmmu);
+	if (!data) {
+		put_device(&sysmmu->dev);
+		return -ENODEV;
+	}
+
+	if (!owner) {
+		owner = kzalloc(sizeof(*owner), GFP_KERNEL);
+		if (!owner) {
+			put_device(&sysmmu->dev);
+			return -ENOMEM;
+		}
+
+		INIT_LIST_HEAD(&owner->controllers);
+		mutex_init(&owner->rpm_lock);
+		dev->archdata.iommu = owner;
+	}
+
+	list_for_each_entry(entry, &owner->controllers, owner_node)
+		if (entry == data)
+			return 0;
+
+	list_add_tail(&data->owner_node, &owner->controllers);
+	data->master = dev;
+
+	return 0;
+}
+
+static const struct iommu_ops exynos_iommu_ops = {
+	.domain_alloc = exynos_iommu_domain_alloc,
+	.domain_free = exynos_iommu_domain_free,
+	.attach_dev = exynos_iommu_attach_device,
+	.detach_dev = exynos_iommu_detach_device,
+	.map = exynos_iommu_map,
+	.unmap = exynos_iommu_unmap,
+	.iova_to_phys = exynos_iommu_iova_to_phys,
+	.device_group = generic_device_group,
+	.add_device = exynos_iommu_add_device,
+	.remove_device = exynos_iommu_remove_device,
+	.pgsize_bitmap = SECT_SIZE | LPAGE_SIZE | SPAGE_SIZE,
+	.of_xlate = exynos_iommu_of_xlate,
+};
+
+static int __init exynos_iommu_init(void)
+{
+	struct device_node *np;
+	int ret;
+
+	np = of_find_matching_node(NULL, sysmmu_of_match);
+	if (!np)
+		return 0;
+
+	of_node_put(np);
+
+	lv2table_kmem_cache = kmem_cache_create("exynos-iommu-lv2table",
+				LV2TABLE_SIZE, LV2TABLE_SIZE, 0, NULL);
+	if (!lv2table_kmem_cache) {
+		pr_err("%s: Failed to create kmem cache\n", __func__);
+		return -ENOMEM;
+	}
+
+	ret = platform_driver_register(&exynos_sysmmu_driver);
+	if (ret) {
+		pr_err("%s: Failed to register driver\n", __func__);
+		goto err_reg_driver;
+	}
+
+	zero_lv2_table = kmem_cache_zalloc(lv2table_kmem_cache, GFP_KERNEL);
+	if (zero_lv2_table == NULL) {
+		pr_err("%s: Failed to allocate zero level2 page table\n",
+			__func__);
+		ret = -ENOMEM;
+		goto err_zero_lv2;
+	}
+
+	ret = bus_set_iommu(&platform_bus_type, &exynos_iommu_ops);
+	if (ret) {
+		pr_err("%s: Failed to register exynos-iommu driver.\n",
+								__func__);
+		goto err_set_iommu;
+	}
+
+	return 0;
+err_set_iommu:
+	kmem_cache_free(lv2table_kmem_cache, zero_lv2_table);
+err_zero_lv2:
+	platform_driver_unregister(&exynos_sysmmu_driver);
+err_reg_driver:
+	kmem_cache_destroy(lv2table_kmem_cache);
+	return ret;
+}
+core_initcall(exynos_iommu_init);
diff --git a/drivers/iommu/fsl_pamu.c b/drivers/iommu/fsl_pamu.c
new file mode 100644
index 000000000..854062579
--- /dev/null
+++ b/drivers/iommu/fsl_pamu.c
@@ -0,0 +1,1286 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2013 Freescale Semiconductor, Inc.
+ *
+ */
+
+#define pr_fmt(fmt)    "fsl-pamu: %s: " fmt, __func__
+
+#include "fsl_pamu.h"
+
+#include <linux/fsl/guts.h>
+#include <linux/interrupt.h>
+#include <linux/genalloc.h>
+
+#include <asm/mpc85xx.h>
+
+/* define indexes for each operation mapping scenario */
+#define OMI_QMAN        0x00
+#define OMI_FMAN        0x01
+#define OMI_QMAN_PRIV   0x02
+#define OMI_CAAM        0x03
+
+#define make64(high, low) (((u64)(high) << 32) | (low))
+
+struct pamu_isr_data {
+	void __iomem *pamu_reg_base;	/* Base address of PAMU regs */
+	unsigned int count;		/* The number of PAMUs */
+};
+
+static struct paace *ppaact;
+static struct paace *spaact;
+
+static bool probed;			/* Has PAMU been probed? */
+
+/*
+ * Table for matching compatible strings, for device tree
+ * guts node, for QorIQ SOCs.
+ * "fsl,qoriq-device-config-2.0" corresponds to T4 & B4
+ * SOCs. For the older SOCs "fsl,qoriq-device-config-1.0"
+ * string would be used.
+ */
+static const struct of_device_id guts_device_ids[] = {
+	{ .compatible = "fsl,qoriq-device-config-1.0", },
+	{ .compatible = "fsl,qoriq-device-config-2.0", },
+	{}
+};
+
+/*
+ * Table for matching compatible strings, for device tree
+ * L3 cache controller node.
+ * "fsl,t4240-l3-cache-controller" corresponds to T4,
+ * "fsl,b4860-l3-cache-controller" corresponds to B4 &
+ * "fsl,p4080-l3-cache-controller" corresponds to other,
+ * SOCs.
+ */
+static const struct of_device_id l3_device_ids[] = {
+	{ .compatible = "fsl,t4240-l3-cache-controller", },
+	{ .compatible = "fsl,b4860-l3-cache-controller", },
+	{ .compatible = "fsl,p4080-l3-cache-controller", },
+	{}
+};
+
+/* maximum subwindows permitted per liodn */
+static u32 max_subwindow_count;
+
+/* Pool for fspi allocation */
+static struct gen_pool *spaace_pool;
+
+/**
+ * pamu_get_max_subwin_cnt() - Return the maximum supported
+ * subwindow count per liodn.
+ *
+ */
+u32 pamu_get_max_subwin_cnt(void)
+{
+	return max_subwindow_count;
+}
+
+/**
+ * pamu_get_ppaace() - Return the primary PACCE
+ * @liodn: liodn PAACT index for desired PAACE
+ *
+ * Returns the ppace pointer upon success else return
+ * null.
+ */
+static struct paace *pamu_get_ppaace(int liodn)
+{
+	if (!ppaact || liodn >= PAACE_NUMBER_ENTRIES) {
+		pr_debug("PPAACT doesn't exist\n");
+		return NULL;
+	}
+
+	return &ppaact[liodn];
+}
+
+/**
+ * pamu_enable_liodn() - Set valid bit of PACCE
+ * @liodn: liodn PAACT index for desired PAACE
+ *
+ * Returns 0 upon success else error code < 0 returned
+ */
+int pamu_enable_liodn(int liodn)
+{
+	struct paace *ppaace;
+
+	ppaace = pamu_get_ppaace(liodn);
+	if (!ppaace) {
+		pr_debug("Invalid primary paace entry\n");
+		return -ENOENT;
+	}
+
+	if (!get_bf(ppaace->addr_bitfields, PPAACE_AF_WSE)) {
+		pr_debug("liodn %d not configured\n", liodn);
+		return -EINVAL;
+	}
+
+	/* Ensure that all other stores to the ppaace complete first */
+	mb();
+
+	set_bf(ppaace->addr_bitfields, PAACE_AF_V, PAACE_V_VALID);
+	mb();
+
+	return 0;
+}
+
+/**
+ * pamu_disable_liodn() - Clears valid bit of PACCE
+ * @liodn: liodn PAACT index for desired PAACE
+ *
+ * Returns 0 upon success else error code < 0 returned
+ */
+int pamu_disable_liodn(int liodn)
+{
+	struct paace *ppaace;
+
+	ppaace = pamu_get_ppaace(liodn);
+	if (!ppaace) {
+		pr_debug("Invalid primary paace entry\n");
+		return -ENOENT;
+	}
+
+	set_bf(ppaace->addr_bitfields, PAACE_AF_V, PAACE_V_INVALID);
+	mb();
+
+	return 0;
+}
+
+/* Derive the window size encoding for a particular PAACE entry */
+static unsigned int map_addrspace_size_to_wse(phys_addr_t addrspace_size)
+{
+	/* Bug if not a power of 2 */
+	BUG_ON(addrspace_size & (addrspace_size - 1));
+
+	/* window size is 2^(WSE+1) bytes */
+	return fls64(addrspace_size) - 2;
+}
+
+/* Derive the PAACE window count encoding for the subwindow count */
+static unsigned int map_subwindow_cnt_to_wce(u32 subwindow_cnt)
+{
+	/* window count is 2^(WCE+1) bytes */
+	return __ffs(subwindow_cnt) - 1;
+}
+
+/*
+ * Set the PAACE type as primary and set the coherency required domain
+ * attribute
+ */
+static void pamu_init_ppaace(struct paace *ppaace)
+{
+	set_bf(ppaace->addr_bitfields, PAACE_AF_PT, PAACE_PT_PRIMARY);
+
+	set_bf(ppaace->domain_attr.to_host.coherency_required, PAACE_DA_HOST_CR,
+	       PAACE_M_COHERENCE_REQ);
+}
+
+/*
+ * Set the PAACE type as secondary and set the coherency required domain
+ * attribute.
+ */
+static void pamu_init_spaace(struct paace *spaace)
+{
+	set_bf(spaace->addr_bitfields, PAACE_AF_PT, PAACE_PT_SECONDARY);
+	set_bf(spaace->domain_attr.to_host.coherency_required, PAACE_DA_HOST_CR,
+	       PAACE_M_COHERENCE_REQ);
+}
+
+/*
+ * Return the spaace (corresponding to the secondary window index)
+ * for a particular ppaace.
+ */
+static struct paace *pamu_get_spaace(struct paace *paace, u32 wnum)
+{
+	u32 subwin_cnt;
+	struct paace *spaace = NULL;
+
+	subwin_cnt = 1UL << (get_bf(paace->impl_attr, PAACE_IA_WCE) + 1);
+
+	if (wnum < subwin_cnt)
+		spaace = &spaact[paace->fspi + wnum];
+	else
+		pr_debug("secondary paace out of bounds\n");
+
+	return spaace;
+}
+
+/**
+ * pamu_get_fspi_and_allocate() - Allocates fspi index and reserves subwindows
+ *                                required for primary PAACE in the secondary
+ *                                PAACE table.
+ * @subwin_cnt: Number of subwindows to be reserved.
+ *
+ * A PPAACE entry may have a number of associated subwindows. A subwindow
+ * corresponds to a SPAACE entry in the SPAACT table. Each PAACE entry stores
+ * the index (fspi) of the first SPAACE entry in the SPAACT table. This
+ * function returns the index of the first SPAACE entry. The remaining
+ * SPAACE entries are reserved contiguously from that index.
+ *
+ * Returns a valid fspi index in the range of 0 - SPAACE_NUMBER_ENTRIES on success.
+ * If no SPAACE entry is available or the allocator can not reserve the required
+ * number of contiguous entries function returns ULONG_MAX indicating a failure.
+ *
+ */
+static unsigned long pamu_get_fspi_and_allocate(u32 subwin_cnt)
+{
+	unsigned long spaace_addr;
+
+	spaace_addr = gen_pool_alloc(spaace_pool, subwin_cnt * sizeof(struct paace));
+	if (!spaace_addr)
+		return ULONG_MAX;
+
+	return (spaace_addr - (unsigned long)spaact) / (sizeof(struct paace));
+}
+
+/* Release the subwindows reserved for a particular LIODN */
+void pamu_free_subwins(int liodn)
+{
+	struct paace *ppaace;
+	u32 subwin_cnt, size;
+
+	ppaace = pamu_get_ppaace(liodn);
+	if (!ppaace) {
+		pr_debug("Invalid liodn entry\n");
+		return;
+	}
+
+	if (get_bf(ppaace->addr_bitfields, PPAACE_AF_MW)) {
+		subwin_cnt = 1UL << (get_bf(ppaace->impl_attr, PAACE_IA_WCE) + 1);
+		size = (subwin_cnt - 1) * sizeof(struct paace);
+		gen_pool_free(spaace_pool, (unsigned long)&spaact[ppaace->fspi], size);
+		set_bf(ppaace->addr_bitfields, PPAACE_AF_MW, 0);
+	}
+}
+
+/*
+ * Function used for updating stash destination for the coressponding
+ * LIODN.
+ */
+int  pamu_update_paace_stash(int liodn, u32 subwin, u32 value)
+{
+	struct paace *paace;
+
+	paace = pamu_get_ppaace(liodn);
+	if (!paace) {
+		pr_debug("Invalid liodn entry\n");
+		return -ENOENT;
+	}
+	if (subwin) {
+		paace = pamu_get_spaace(paace, subwin - 1);
+		if (!paace)
+			return -ENOENT;
+	}
+	set_bf(paace->impl_attr, PAACE_IA_CID, value);
+
+	mb();
+
+	return 0;
+}
+
+/* Disable a subwindow corresponding to the LIODN */
+int pamu_disable_spaace(int liodn, u32 subwin)
+{
+	struct paace *paace;
+
+	paace = pamu_get_ppaace(liodn);
+	if (!paace) {
+		pr_debug("Invalid liodn entry\n");
+		return -ENOENT;
+	}
+	if (subwin) {
+		paace = pamu_get_spaace(paace, subwin - 1);
+		if (!paace)
+			return -ENOENT;
+		set_bf(paace->addr_bitfields, PAACE_AF_V, PAACE_V_INVALID);
+	} else {
+		set_bf(paace->addr_bitfields, PAACE_AF_AP,
+		       PAACE_AP_PERMS_DENIED);
+	}
+
+	mb();
+
+	return 0;
+}
+
+/**
+ * pamu_config_paace() - Sets up PPAACE entry for specified liodn
+ *
+ * @liodn: Logical IO device number
+ * @win_addr: starting address of DSA window
+ * @win-size: size of DSA window
+ * @omi: Operation mapping index -- if ~omi == 0 then omi not defined
+ * @rpn: real (true physical) page number
+ * @stashid: cache stash id for associated cpu -- if ~stashid == 0 then
+ *	     stashid not defined
+ * @snoopid: snoop id for hardware coherency -- if ~snoopid == 0 then
+ *	     snoopid not defined
+ * @subwin_cnt: number of sub-windows
+ * @prot: window permissions
+ *
+ * Returns 0 upon success else error code < 0 returned
+ */
+int pamu_config_ppaace(int liodn, phys_addr_t win_addr, phys_addr_t win_size,
+		       u32 omi, unsigned long rpn, u32 snoopid, u32 stashid,
+		       u32 subwin_cnt, int prot)
+{
+	struct paace *ppaace;
+	unsigned long fspi;
+
+	if ((win_size & (win_size - 1)) || win_size < PAMU_PAGE_SIZE) {
+		pr_debug("window size too small or not a power of two %pa\n",
+			 &win_size);
+		return -EINVAL;
+	}
+
+	if (win_addr & (win_size - 1)) {
+		pr_debug("window address is not aligned with window size\n");
+		return -EINVAL;
+	}
+
+	ppaace = pamu_get_ppaace(liodn);
+	if (!ppaace)
+		return -ENOENT;
+
+	/* window size is 2^(WSE+1) bytes */
+	set_bf(ppaace->addr_bitfields, PPAACE_AF_WSE,
+	       map_addrspace_size_to_wse(win_size));
+
+	pamu_init_ppaace(ppaace);
+
+	ppaace->wbah = win_addr >> (PAMU_PAGE_SHIFT + 20);
+	set_bf(ppaace->addr_bitfields, PPAACE_AF_WBAL,
+	       (win_addr >> PAMU_PAGE_SHIFT));
+
+	/* set up operation mapping if it's configured */
+	if (omi < OME_NUMBER_ENTRIES) {
+		set_bf(ppaace->impl_attr, PAACE_IA_OTM, PAACE_OTM_INDEXED);
+		ppaace->op_encode.index_ot.omi = omi;
+	} else if (~omi != 0) {
+		pr_debug("bad operation mapping index: %d\n", omi);
+		return -EINVAL;
+	}
+
+	/* configure stash id */
+	if (~stashid != 0)
+		set_bf(ppaace->impl_attr, PAACE_IA_CID, stashid);
+
+	/* configure snoop id */
+	if (~snoopid != 0)
+		ppaace->domain_attr.to_host.snpid = snoopid;
+
+	if (subwin_cnt) {
+		/* The first entry is in the primary PAACE instead */
+		fspi = pamu_get_fspi_and_allocate(subwin_cnt - 1);
+		if (fspi == ULONG_MAX) {
+			pr_debug("spaace indexes exhausted\n");
+			return -EINVAL;
+		}
+
+		/* window count is 2^(WCE+1) bytes */
+		set_bf(ppaace->impl_attr, PAACE_IA_WCE,
+		       map_subwindow_cnt_to_wce(subwin_cnt));
+		set_bf(ppaace->addr_bitfields, PPAACE_AF_MW, 0x1);
+		ppaace->fspi = fspi;
+	} else {
+		set_bf(ppaace->impl_attr, PAACE_IA_ATM, PAACE_ATM_WINDOW_XLATE);
+		ppaace->twbah = rpn >> 20;
+		set_bf(ppaace->win_bitfields, PAACE_WIN_TWBAL, rpn);
+		set_bf(ppaace->addr_bitfields, PAACE_AF_AP, prot);
+		set_bf(ppaace->impl_attr, PAACE_IA_WCE, 0);
+		set_bf(ppaace->addr_bitfields, PPAACE_AF_MW, 0);
+	}
+	mb();
+
+	return 0;
+}
+
+/**
+ * pamu_config_spaace() - Sets up SPAACE entry for specified subwindow
+ *
+ * @liodn:  Logical IO device number
+ * @subwin_cnt:  number of sub-windows associated with dma-window
+ * @subwin: subwindow index
+ * @subwin_size: size of subwindow
+ * @omi: Operation mapping index
+ * @rpn: real (true physical) page number
+ * @snoopid: snoop id for hardware coherency -- if ~snoopid == 0 then
+ *			  snoopid not defined
+ * @stashid: cache stash id for associated cpu
+ * @enable: enable/disable subwindow after reconfiguration
+ * @prot: sub window permissions
+ *
+ * Returns 0 upon success else error code < 0 returned
+ */
+int pamu_config_spaace(int liodn, u32 subwin_cnt, u32 subwin,
+		       phys_addr_t subwin_size, u32 omi, unsigned long rpn,
+		       u32 snoopid, u32 stashid, int enable, int prot)
+{
+	struct paace *paace;
+
+	/* setup sub-windows */
+	if (!subwin_cnt) {
+		pr_debug("Invalid subwindow count\n");
+		return -EINVAL;
+	}
+
+	paace = pamu_get_ppaace(liodn);
+	if (subwin > 0 && subwin < subwin_cnt && paace) {
+		paace = pamu_get_spaace(paace, subwin - 1);
+
+		if (paace && !(paace->addr_bitfields & PAACE_V_VALID)) {
+			pamu_init_spaace(paace);
+			set_bf(paace->addr_bitfields, SPAACE_AF_LIODN, liodn);
+		}
+	}
+
+	if (!paace) {
+		pr_debug("Invalid liodn entry\n");
+		return -ENOENT;
+	}
+
+	if ((subwin_size & (subwin_size - 1)) || subwin_size < PAMU_PAGE_SIZE) {
+		pr_debug("subwindow size out of range, or not a power of 2\n");
+		return -EINVAL;
+	}
+
+	if (rpn == ULONG_MAX) {
+		pr_debug("real page number out of range\n");
+		return -EINVAL;
+	}
+
+	/* window size is 2^(WSE+1) bytes */
+	set_bf(paace->win_bitfields, PAACE_WIN_SWSE,
+	       map_addrspace_size_to_wse(subwin_size));
+
+	set_bf(paace->impl_attr, PAACE_IA_ATM, PAACE_ATM_WINDOW_XLATE);
+	paace->twbah = rpn >> 20;
+	set_bf(paace->win_bitfields, PAACE_WIN_TWBAL, rpn);
+	set_bf(paace->addr_bitfields, PAACE_AF_AP, prot);
+
+	/* configure snoop id */
+	if (~snoopid != 0)
+		paace->domain_attr.to_host.snpid = snoopid;
+
+	/* set up operation mapping if it's configured */
+	if (omi < OME_NUMBER_ENTRIES) {
+		set_bf(paace->impl_attr, PAACE_IA_OTM, PAACE_OTM_INDEXED);
+		paace->op_encode.index_ot.omi = omi;
+	} else if (~omi != 0) {
+		pr_debug("bad operation mapping index: %d\n", omi);
+		return -EINVAL;
+	}
+
+	if (~stashid != 0)
+		set_bf(paace->impl_attr, PAACE_IA_CID, stashid);
+
+	smp_wmb();
+
+	if (enable)
+		set_bf(paace->addr_bitfields, PAACE_AF_V, PAACE_V_VALID);
+
+	mb();
+
+	return 0;
+}
+
+/**
+ * get_ome_index() - Returns the index in the operation mapping table
+ *                   for device.
+ * @*omi_index: pointer for storing the index value
+ *
+ */
+void get_ome_index(u32 *omi_index, struct device *dev)
+{
+	if (of_device_is_compatible(dev->of_node, "fsl,qman-portal"))
+		*omi_index = OMI_QMAN;
+	if (of_device_is_compatible(dev->of_node, "fsl,qman"))
+		*omi_index = OMI_QMAN_PRIV;
+}
+
+/**
+ * get_stash_id - Returns stash destination id corresponding to a
+ *                cache type and vcpu.
+ * @stash_dest_hint: L1, L2 or L3
+ * @vcpu: vpcu target for a particular cache type.
+ *
+ * Returs stash on success or ~(u32)0 on failure.
+ *
+ */
+u32 get_stash_id(u32 stash_dest_hint, u32 vcpu)
+{
+	const u32 *prop;
+	struct device_node *node;
+	u32 cache_level;
+	int len, found = 0;
+	int i;
+
+	/* Fastpath, exit early if L3/CPC cache is target for stashing */
+	if (stash_dest_hint == PAMU_ATTR_CACHE_L3) {
+		node = of_find_matching_node(NULL, l3_device_ids);
+		if (node) {
+			prop = of_get_property(node, "cache-stash-id", NULL);
+			if (!prop) {
+				pr_debug("missing cache-stash-id at %pOF\n",
+					 node);
+				of_node_put(node);
+				return ~(u32)0;
+			}
+			of_node_put(node);
+			return be32_to_cpup(prop);
+		}
+		return ~(u32)0;
+	}
+
+	for_each_node_by_type(node, "cpu") {
+		prop = of_get_property(node, "reg", &len);
+		for (i = 0; i < len / sizeof(u32); i++) {
+			if (be32_to_cpup(&prop[i]) == vcpu) {
+				found = 1;
+				goto found_cpu_node;
+			}
+		}
+	}
+found_cpu_node:
+
+	/* find the hwnode that represents the cache */
+	for (cache_level = PAMU_ATTR_CACHE_L1; (cache_level < PAMU_ATTR_CACHE_L3) && found; cache_level++) {
+		if (stash_dest_hint == cache_level) {
+			prop = of_get_property(node, "cache-stash-id", NULL);
+			if (!prop) {
+				pr_debug("missing cache-stash-id at %pOF\n",
+					 node);
+				of_node_put(node);
+				return ~(u32)0;
+			}
+			of_node_put(node);
+			return be32_to_cpup(prop);
+		}
+
+		prop = of_get_property(node, "next-level-cache", NULL);
+		if (!prop) {
+			pr_debug("can't find next-level-cache at %pOF\n", node);
+			of_node_put(node);
+			return ~(u32)0;  /* can't traverse any further */
+		}
+		of_node_put(node);
+
+		/* advance to next node in cache hierarchy */
+		node = of_find_node_by_phandle(*prop);
+		if (!node) {
+			pr_debug("Invalid node for cache hierarchy\n");
+			return ~(u32)0;
+		}
+	}
+
+	pr_debug("stash dest not found for %d on vcpu %d\n",
+		 stash_dest_hint, vcpu);
+	return ~(u32)0;
+}
+
+/* Identify if the PAACT table entry belongs to QMAN, BMAN or QMAN Portal */
+#define QMAN_PAACE 1
+#define QMAN_PORTAL_PAACE 2
+#define BMAN_PAACE 3
+
+/**
+ * Setup operation mapping and stash destinations for QMAN and QMAN portal.
+ * Memory accesses to QMAN and BMAN private memory need not be coherent, so
+ * clear the PAACE entry coherency attribute for them.
+ */
+static void setup_qbman_paace(struct paace *ppaace, int  paace_type)
+{
+	switch (paace_type) {
+	case QMAN_PAACE:
+		set_bf(ppaace->impl_attr, PAACE_IA_OTM, PAACE_OTM_INDEXED);
+		ppaace->op_encode.index_ot.omi = OMI_QMAN_PRIV;
+		/* setup QMAN Private data stashing for the L3 cache */
+		set_bf(ppaace->impl_attr, PAACE_IA_CID, get_stash_id(PAMU_ATTR_CACHE_L3, 0));
+		set_bf(ppaace->domain_attr.to_host.coherency_required, PAACE_DA_HOST_CR,
+		       0);
+		break;
+	case QMAN_PORTAL_PAACE:
+		set_bf(ppaace->impl_attr, PAACE_IA_OTM, PAACE_OTM_INDEXED);
+		ppaace->op_encode.index_ot.omi = OMI_QMAN;
+		/* Set DQRR and Frame stashing for the L3 cache */
+		set_bf(ppaace->impl_attr, PAACE_IA_CID, get_stash_id(PAMU_ATTR_CACHE_L3, 0));
+		break;
+	case BMAN_PAACE:
+		set_bf(ppaace->domain_attr.to_host.coherency_required, PAACE_DA_HOST_CR,
+		       0);
+		break;
+	}
+}
+
+/**
+ * Setup the operation mapping table for various devices. This is a static
+ * table where each table index corresponds to a particular device. PAMU uses
+ * this table to translate device transaction to appropriate corenet
+ * transaction.
+ */
+static void setup_omt(struct ome *omt)
+{
+	struct ome *ome;
+
+	/* Configure OMI_QMAN */
+	ome = &omt[OMI_QMAN];
+
+	ome->moe[IOE_READ_IDX] = EOE_VALID | EOE_READ;
+	ome->moe[IOE_EREAD0_IDX] = EOE_VALID | EOE_RSA;
+	ome->moe[IOE_WRITE_IDX] = EOE_VALID | EOE_WRITE;
+	ome->moe[IOE_EWRITE0_IDX] = EOE_VALID | EOE_WWSAO;
+
+	ome->moe[IOE_DIRECT0_IDX] = EOE_VALID | EOE_LDEC;
+	ome->moe[IOE_DIRECT1_IDX] = EOE_VALID | EOE_LDECPE;
+
+	/* Configure OMI_FMAN */
+	ome = &omt[OMI_FMAN];
+	ome->moe[IOE_READ_IDX]  = EOE_VALID | EOE_READI;
+	ome->moe[IOE_WRITE_IDX] = EOE_VALID | EOE_WRITE;
+
+	/* Configure OMI_QMAN private */
+	ome = &omt[OMI_QMAN_PRIV];
+	ome->moe[IOE_READ_IDX]  = EOE_VALID | EOE_READ;
+	ome->moe[IOE_WRITE_IDX] = EOE_VALID | EOE_WRITE;
+	ome->moe[IOE_EREAD0_IDX] = EOE_VALID | EOE_RSA;
+	ome->moe[IOE_EWRITE0_IDX] = EOE_VALID | EOE_WWSA;
+
+	/* Configure OMI_CAAM */
+	ome = &omt[OMI_CAAM];
+	ome->moe[IOE_READ_IDX]  = EOE_VALID | EOE_READI;
+	ome->moe[IOE_WRITE_IDX] = EOE_VALID | EOE_WRITE;
+}
+
+/*
+ * Get the maximum number of PAACT table entries
+ * and subwindows supported by PAMU
+ */
+static void get_pamu_cap_values(unsigned long pamu_reg_base)
+{
+	u32 pc_val;
+
+	pc_val = in_be32((u32 *)(pamu_reg_base + PAMU_PC3));
+	/* Maximum number of subwindows per liodn */
+	max_subwindow_count = 1 << (1 + PAMU_PC3_MWCE(pc_val));
+}
+
+/* Setup PAMU registers pointing to PAACT, SPAACT and OMT */
+static int setup_one_pamu(unsigned long pamu_reg_base, unsigned long pamu_reg_size,
+			  phys_addr_t ppaact_phys, phys_addr_t spaact_phys,
+			  phys_addr_t omt_phys)
+{
+	u32 *pc;
+	struct pamu_mmap_regs *pamu_regs;
+
+	pc = (u32 *) (pamu_reg_base + PAMU_PC);
+	pamu_regs = (struct pamu_mmap_regs *)
+		(pamu_reg_base + PAMU_MMAP_REGS_BASE);
+
+	/* set up pointers to corenet control blocks */
+
+	out_be32(&pamu_regs->ppbah, upper_32_bits(ppaact_phys));
+	out_be32(&pamu_regs->ppbal, lower_32_bits(ppaact_phys));
+	ppaact_phys = ppaact_phys + PAACT_SIZE;
+	out_be32(&pamu_regs->pplah, upper_32_bits(ppaact_phys));
+	out_be32(&pamu_regs->pplal, lower_32_bits(ppaact_phys));
+
+	out_be32(&pamu_regs->spbah, upper_32_bits(spaact_phys));
+	out_be32(&pamu_regs->spbal, lower_32_bits(spaact_phys));
+	spaact_phys = spaact_phys + SPAACT_SIZE;
+	out_be32(&pamu_regs->splah, upper_32_bits(spaact_phys));
+	out_be32(&pamu_regs->splal, lower_32_bits(spaact_phys));
+
+	out_be32(&pamu_regs->obah, upper_32_bits(omt_phys));
+	out_be32(&pamu_regs->obal, lower_32_bits(omt_phys));
+	omt_phys = omt_phys + OMT_SIZE;
+	out_be32(&pamu_regs->olah, upper_32_bits(omt_phys));
+	out_be32(&pamu_regs->olal, lower_32_bits(omt_phys));
+
+	/*
+	 * set PAMU enable bit,
+	 * allow ppaact & omt to be cached
+	 * & enable PAMU access violation interrupts.
+	 */
+
+	out_be32((u32 *)(pamu_reg_base + PAMU_PICS),
+		 PAMU_ACCESS_VIOLATION_ENABLE);
+	out_be32(pc, PAMU_PC_PE | PAMU_PC_OCE | PAMU_PC_SPCC | PAMU_PC_PPCC);
+	return 0;
+}
+
+/* Enable all device LIODNS */
+static void setup_liodns(void)
+{
+	int i, len;
+	struct paace *ppaace;
+	struct device_node *node = NULL;
+	const u32 *prop;
+
+	for_each_node_with_property(node, "fsl,liodn") {
+		prop = of_get_property(node, "fsl,liodn", &len);
+		for (i = 0; i < len / sizeof(u32); i++) {
+			int liodn;
+
+			liodn = be32_to_cpup(&prop[i]);
+			if (liodn >= PAACE_NUMBER_ENTRIES) {
+				pr_debug("Invalid LIODN value %d\n", liodn);
+				continue;
+			}
+			ppaace = pamu_get_ppaace(liodn);
+			pamu_init_ppaace(ppaace);
+			/* window size is 2^(WSE+1) bytes */
+			set_bf(ppaace->addr_bitfields, PPAACE_AF_WSE, 35);
+			ppaace->wbah = 0;
+			set_bf(ppaace->addr_bitfields, PPAACE_AF_WBAL, 0);
+			set_bf(ppaace->impl_attr, PAACE_IA_ATM,
+			       PAACE_ATM_NO_XLATE);
+			set_bf(ppaace->addr_bitfields, PAACE_AF_AP,
+			       PAACE_AP_PERMS_ALL);
+			if (of_device_is_compatible(node, "fsl,qman-portal"))
+				setup_qbman_paace(ppaace, QMAN_PORTAL_PAACE);
+			if (of_device_is_compatible(node, "fsl,qman"))
+				setup_qbman_paace(ppaace, QMAN_PAACE);
+			if (of_device_is_compatible(node, "fsl,bman"))
+				setup_qbman_paace(ppaace, BMAN_PAACE);
+			mb();
+			pamu_enable_liodn(liodn);
+		}
+	}
+}
+
+static irqreturn_t pamu_av_isr(int irq, void *arg)
+{
+	struct pamu_isr_data *data = arg;
+	phys_addr_t phys;
+	unsigned int i, j, ret;
+
+	pr_emerg("access violation interrupt\n");
+
+	for (i = 0; i < data->count; i++) {
+		void __iomem *p = data->pamu_reg_base + i * PAMU_OFFSET;
+		u32 pics = in_be32(p + PAMU_PICS);
+
+		if (pics & PAMU_ACCESS_VIOLATION_STAT) {
+			u32 avs1 = in_be32(p + PAMU_AVS1);
+			struct paace *paace;
+
+			pr_emerg("POES1=%08x\n", in_be32(p + PAMU_POES1));
+			pr_emerg("POES2=%08x\n", in_be32(p + PAMU_POES2));
+			pr_emerg("AVS1=%08x\n", avs1);
+			pr_emerg("AVS2=%08x\n", in_be32(p + PAMU_AVS2));
+			pr_emerg("AVA=%016llx\n",
+				 make64(in_be32(p + PAMU_AVAH),
+					in_be32(p + PAMU_AVAL)));
+			pr_emerg("UDAD=%08x\n", in_be32(p + PAMU_UDAD));
+			pr_emerg("POEA=%016llx\n",
+				 make64(in_be32(p + PAMU_POEAH),
+					in_be32(p + PAMU_POEAL)));
+
+			phys = make64(in_be32(p + PAMU_POEAH),
+				      in_be32(p + PAMU_POEAL));
+
+			/* Assume that POEA points to a PAACE */
+			if (phys) {
+				u32 *paace = phys_to_virt(phys);
+
+				/* Only the first four words are relevant */
+				for (j = 0; j < 4; j++)
+					pr_emerg("PAACE[%u]=%08x\n",
+						 j, in_be32(paace + j));
+			}
+
+			/* clear access violation condition */
+			out_be32(p + PAMU_AVS1, avs1 & PAMU_AV_MASK);
+			paace = pamu_get_ppaace(avs1 >> PAMU_AVS1_LIODN_SHIFT);
+			BUG_ON(!paace);
+			/* check if we got a violation for a disabled LIODN */
+			if (!get_bf(paace->addr_bitfields, PAACE_AF_V)) {
+				/*
+				 * As per hardware erratum A-003638, access
+				 * violation can be reported for a disabled
+				 * LIODN. If we hit that condition, disable
+				 * access violation reporting.
+				 */
+				pics &= ~PAMU_ACCESS_VIOLATION_ENABLE;
+			} else {
+				/* Disable the LIODN */
+				ret = pamu_disable_liodn(avs1 >> PAMU_AVS1_LIODN_SHIFT);
+				BUG_ON(ret);
+				pr_emerg("Disabling liodn %x\n",
+					 avs1 >> PAMU_AVS1_LIODN_SHIFT);
+			}
+			out_be32((p + PAMU_PICS), pics);
+		}
+	}
+
+	return IRQ_HANDLED;
+}
+
+#define LAWAR_EN		0x80000000
+#define LAWAR_TARGET_MASK	0x0FF00000
+#define LAWAR_TARGET_SHIFT	20
+#define LAWAR_SIZE_MASK		0x0000003F
+#define LAWAR_CSDID_MASK	0x000FF000
+#define LAWAR_CSDID_SHIFT	12
+
+#define LAW_SIZE_4K		0xb
+
+struct ccsr_law {
+	u32	lawbarh;	/* LAWn base address high */
+	u32	lawbarl;	/* LAWn base address low */
+	u32	lawar;		/* LAWn attributes */
+	u32	reserved;
+};
+
+/*
+ * Create a coherence subdomain for a given memory block.
+ */
+static int create_csd(phys_addr_t phys, size_t size, u32 csd_port_id)
+{
+	struct device_node *np;
+	const __be32 *iprop;
+	void __iomem *lac = NULL;	/* Local Access Control registers */
+	struct ccsr_law __iomem *law;
+	void __iomem *ccm = NULL;
+	u32 __iomem *csdids;
+	unsigned int i, num_laws, num_csds;
+	u32 law_target = 0;
+	u32 csd_id = 0;
+	int ret = 0;
+
+	np = of_find_compatible_node(NULL, NULL, "fsl,corenet-law");
+	if (!np)
+		return -ENODEV;
+
+	iprop = of_get_property(np, "fsl,num-laws", NULL);
+	if (!iprop) {
+		ret = -ENODEV;
+		goto error;
+	}
+
+	num_laws = be32_to_cpup(iprop);
+	if (!num_laws) {
+		ret = -ENODEV;
+		goto error;
+	}
+
+	lac = of_iomap(np, 0);
+	if (!lac) {
+		ret = -ENODEV;
+		goto error;
+	}
+
+	/* LAW registers are at offset 0xC00 */
+	law = lac + 0xC00;
+
+	of_node_put(np);
+
+	np = of_find_compatible_node(NULL, NULL, "fsl,corenet-cf");
+	if (!np) {
+		ret = -ENODEV;
+		goto error;
+	}
+
+	iprop = of_get_property(np, "fsl,ccf-num-csdids", NULL);
+	if (!iprop) {
+		ret = -ENODEV;
+		goto error;
+	}
+
+	num_csds = be32_to_cpup(iprop);
+	if (!num_csds) {
+		ret = -ENODEV;
+		goto error;
+	}
+
+	ccm = of_iomap(np, 0);
+	if (!ccm) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	/* The undocumented CSDID registers are at offset 0x600 */
+	csdids = ccm + 0x600;
+
+	of_node_put(np);
+	np = NULL;
+
+	/* Find an unused coherence subdomain ID */
+	for (csd_id = 0; csd_id < num_csds; csd_id++) {
+		if (!csdids[csd_id])
+			break;
+	}
+
+	/* Store the Port ID in the (undocumented) proper CIDMRxx register */
+	csdids[csd_id] = csd_port_id;
+
+	/* Find the DDR LAW that maps to our buffer. */
+	for (i = 0; i < num_laws; i++) {
+		if (law[i].lawar & LAWAR_EN) {
+			phys_addr_t law_start, law_end;
+
+			law_start = make64(law[i].lawbarh, law[i].lawbarl);
+			law_end = law_start +
+				(2ULL << (law[i].lawar & LAWAR_SIZE_MASK));
+
+			if (law_start <= phys && phys < law_end) {
+				law_target = law[i].lawar & LAWAR_TARGET_MASK;
+				break;
+			}
+		}
+	}
+
+	if (i == 0 || i == num_laws) {
+		/* This should never happen */
+		ret = -ENOENT;
+		goto error;
+	}
+
+	/* Find a free LAW entry */
+	while (law[--i].lawar & LAWAR_EN) {
+		if (i == 0) {
+			/* No higher priority LAW slots available */
+			ret = -ENOENT;
+			goto error;
+		}
+	}
+
+	law[i].lawbarh = upper_32_bits(phys);
+	law[i].lawbarl = lower_32_bits(phys);
+	wmb();
+	law[i].lawar = LAWAR_EN | law_target | (csd_id << LAWAR_CSDID_SHIFT) |
+		(LAW_SIZE_4K + get_order(size));
+	wmb();
+
+error:
+	if (ccm)
+		iounmap(ccm);
+
+	if (lac)
+		iounmap(lac);
+
+	if (np)
+		of_node_put(np);
+
+	return ret;
+}
+
+/*
+ * Table of SVRs and the corresponding PORT_ID values. Port ID corresponds to a
+ * bit map of snoopers for a given range of memory mapped by a LAW.
+ *
+ * All future CoreNet-enabled SOCs will have this erratum(A-004510) fixed, so this
+ * table should never need to be updated.  SVRs are guaranteed to be unique, so
+ * there is no worry that a future SOC will inadvertently have one of these
+ * values.
+ */
+static const struct {
+	u32 svr;
+	u32 port_id;
+} port_id_map[] = {
+	{(SVR_P2040 << 8) | 0x10, 0xFF000000},	/* P2040 1.0 */
+	{(SVR_P2040 << 8) | 0x11, 0xFF000000},	/* P2040 1.1 */
+	{(SVR_P2041 << 8) | 0x10, 0xFF000000},	/* P2041 1.0 */
+	{(SVR_P2041 << 8) | 0x11, 0xFF000000},	/* P2041 1.1 */
+	{(SVR_P3041 << 8) | 0x10, 0xFF000000},	/* P3041 1.0 */
+	{(SVR_P3041 << 8) | 0x11, 0xFF000000},	/* P3041 1.1 */
+	{(SVR_P4040 << 8) | 0x20, 0xFFF80000},	/* P4040 2.0 */
+	{(SVR_P4080 << 8) | 0x20, 0xFFF80000},	/* P4080 2.0 */
+	{(SVR_P5010 << 8) | 0x10, 0xFC000000},	/* P5010 1.0 */
+	{(SVR_P5010 << 8) | 0x20, 0xFC000000},	/* P5010 2.0 */
+	{(SVR_P5020 << 8) | 0x10, 0xFC000000},	/* P5020 1.0 */
+	{(SVR_P5021 << 8) | 0x10, 0xFF800000},	/* P5021 1.0 */
+	{(SVR_P5040 << 8) | 0x10, 0xFF800000},	/* P5040 1.0 */
+};
+
+#define SVR_SECURITY	0x80000	/* The Security (E) bit */
+
+static int fsl_pamu_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	void __iomem *pamu_regs = NULL;
+	struct ccsr_guts __iomem *guts_regs = NULL;
+	u32 pamubypenr, pamu_counter;
+	unsigned long pamu_reg_off;
+	unsigned long pamu_reg_base;
+	struct pamu_isr_data *data = NULL;
+	struct device_node *guts_node;
+	u64 size;
+	struct page *p;
+	int ret = 0;
+	int irq;
+	phys_addr_t ppaact_phys;
+	phys_addr_t spaact_phys;
+	struct ome *omt;
+	phys_addr_t omt_phys;
+	size_t mem_size = 0;
+	unsigned int order = 0;
+	u32 csd_port_id = 0;
+	unsigned i;
+	/*
+	 * enumerate all PAMUs and allocate and setup PAMU tables
+	 * for each of them,
+	 * NOTE : All PAMUs share the same LIODN tables.
+	 */
+
+	if (WARN_ON(probed))
+		return -EBUSY;
+
+	pamu_regs = of_iomap(dev->of_node, 0);
+	if (!pamu_regs) {
+		dev_err(dev, "ioremap of PAMU node failed\n");
+		return -ENOMEM;
+	}
+	of_get_address(dev->of_node, 0, &size, NULL);
+
+	irq = irq_of_parse_and_map(dev->of_node, 0);
+	if (irq == NO_IRQ) {
+		dev_warn(dev, "no interrupts listed in PAMU node\n");
+		goto error;
+	}
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data) {
+		ret = -ENOMEM;
+		goto error;
+	}
+	data->pamu_reg_base = pamu_regs;
+	data->count = size / PAMU_OFFSET;
+
+	/* The ISR needs access to the regs, so we won't iounmap them */
+	ret = request_irq(irq, pamu_av_isr, 0, "pamu", data);
+	if (ret < 0) {
+		dev_err(dev, "error %i installing ISR for irq %i\n", ret, irq);
+		goto error;
+	}
+
+	guts_node = of_find_matching_node(NULL, guts_device_ids);
+	if (!guts_node) {
+		dev_err(dev, "could not find GUTS node %pOF\n", dev->of_node);
+		ret = -ENODEV;
+		goto error;
+	}
+
+	guts_regs = of_iomap(guts_node, 0);
+	of_node_put(guts_node);
+	if (!guts_regs) {
+		dev_err(dev, "ioremap of GUTS node failed\n");
+		ret = -ENODEV;
+		goto error;
+	}
+
+	/* read in the PAMU capability registers */
+	get_pamu_cap_values((unsigned long)pamu_regs);
+	/*
+	 * To simplify the allocation of a coherency domain, we allocate the
+	 * PAACT and the OMT in the same memory buffer.  Unfortunately, this
+	 * wastes more memory compared to allocating the buffers separately.
+	 */
+	/* Determine how much memory we need */
+	mem_size = (PAGE_SIZE << get_order(PAACT_SIZE)) +
+		(PAGE_SIZE << get_order(SPAACT_SIZE)) +
+		(PAGE_SIZE << get_order(OMT_SIZE));
+	order = get_order(mem_size);
+
+	p = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
+	if (!p) {
+		dev_err(dev, "unable to allocate PAACT/SPAACT/OMT block\n");
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	ppaact = page_address(p);
+	ppaact_phys = page_to_phys(p);
+
+	/* Make sure the memory is naturally aligned */
+	if (ppaact_phys & ((PAGE_SIZE << order) - 1)) {
+		dev_err(dev, "PAACT/OMT block is unaligned\n");
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	spaact = (void *)ppaact + (PAGE_SIZE << get_order(PAACT_SIZE));
+	omt = (void *)spaact + (PAGE_SIZE << get_order(SPAACT_SIZE));
+
+	dev_dbg(dev, "ppaact virt=%p phys=%pa\n", ppaact, &ppaact_phys);
+
+	/* Check to see if we need to implement the work-around on this SOC */
+
+	/* Determine the Port ID for our coherence subdomain */
+	for (i = 0; i < ARRAY_SIZE(port_id_map); i++) {
+		if (port_id_map[i].svr == (mfspr(SPRN_SVR) & ~SVR_SECURITY)) {
+			csd_port_id = port_id_map[i].port_id;
+			dev_dbg(dev, "found matching SVR %08x\n",
+				port_id_map[i].svr);
+			break;
+		}
+	}
+
+	if (csd_port_id) {
+		dev_dbg(dev, "creating coherency subdomain at address %pa, size %zu, port id 0x%08x",
+			&ppaact_phys, mem_size, csd_port_id);
+
+		ret = create_csd(ppaact_phys, mem_size, csd_port_id);
+		if (ret) {
+			dev_err(dev, "could not create coherence subdomain\n");
+			return ret;
+		}
+	}
+
+	spaact_phys = virt_to_phys(spaact);
+	omt_phys = virt_to_phys(omt);
+
+	spaace_pool = gen_pool_create(ilog2(sizeof(struct paace)), -1);
+	if (!spaace_pool) {
+		ret = -ENOMEM;
+		dev_err(dev, "Failed to allocate spaace gen pool\n");
+		goto error;
+	}
+
+	ret = gen_pool_add(spaace_pool, (unsigned long)spaact, SPAACT_SIZE, -1);
+	if (ret)
+		goto error_genpool;
+
+	pamubypenr = in_be32(&guts_regs->pamubypenr);
+
+	for (pamu_reg_off = 0, pamu_counter = 0x80000000; pamu_reg_off < size;
+	     pamu_reg_off += PAMU_OFFSET, pamu_counter >>= 1) {
+
+		pamu_reg_base = (unsigned long)pamu_regs + pamu_reg_off;
+		setup_one_pamu(pamu_reg_base, pamu_reg_off, ppaact_phys,
+			       spaact_phys, omt_phys);
+		/* Disable PAMU bypass for this PAMU */
+		pamubypenr &= ~pamu_counter;
+	}
+
+	setup_omt(omt);
+
+	/* Enable all relevant PAMU(s) */
+	out_be32(&guts_regs->pamubypenr, pamubypenr);
+
+	iounmap(guts_regs);
+
+	/* Enable DMA for the LIODNs in the device tree */
+
+	setup_liodns();
+
+	probed = true;
+
+	return 0;
+
+error_genpool:
+	gen_pool_destroy(spaace_pool);
+
+error:
+	if (irq != NO_IRQ)
+		free_irq(irq, data);
+
+	if (data) {
+		memset(data, 0, sizeof(struct pamu_isr_data));
+		kfree(data);
+	}
+
+	if (pamu_regs)
+		iounmap(pamu_regs);
+
+	if (guts_regs)
+		iounmap(guts_regs);
+
+	if (ppaact)
+		free_pages((unsigned long)ppaact, order);
+
+	ppaact = NULL;
+
+	return ret;
+}
+
+static struct platform_driver fsl_of_pamu_driver = {
+	.driver = {
+		.name = "fsl-of-pamu",
+	},
+	.probe = fsl_pamu_probe,
+};
+
+static __init int fsl_pamu_init(void)
+{
+	struct platform_device *pdev = NULL;
+	struct device_node *np;
+	int ret;
+
+	/*
+	 * The normal OF process calls the probe function at some
+	 * indeterminate later time, after most drivers have loaded.  This is
+	 * too late for us, because PAMU clients (like the Qman driver)
+	 * depend on PAMU being initialized early.
+	 *
+	 * So instead, we "manually" call our probe function by creating the
+	 * platform devices ourselves.
+	 */
+
+	/*
+	 * We assume that there is only one PAMU node in the device tree.  A
+	 * single PAMU node represents all of the PAMU devices in the SOC
+	 * already.   Everything else already makes that assumption, and the
+	 * binding for the PAMU nodes doesn't allow for any parent-child
+	 * relationships anyway.  In other words, support for more than one
+	 * PAMU node would require significant changes to a lot of code.
+	 */
+
+	np = of_find_compatible_node(NULL, NULL, "fsl,pamu");
+	if (!np) {
+		pr_err("could not find a PAMU node\n");
+		return -ENODEV;
+	}
+
+	ret = platform_driver_register(&fsl_of_pamu_driver);
+	if (ret) {
+		pr_err("could not register driver (err=%i)\n", ret);
+		goto error_driver_register;
+	}
+
+	pdev = platform_device_alloc("fsl-of-pamu", 0);
+	if (!pdev) {
+		pr_err("could not allocate device %pOF\n", np);
+		ret = -ENOMEM;
+		goto error_device_alloc;
+	}
+	pdev->dev.of_node = of_node_get(np);
+
+	ret = pamu_domain_init();
+	if (ret)
+		goto error_device_add;
+
+	ret = platform_device_add(pdev);
+	if (ret) {
+		pr_err("could not add device %pOF (err=%i)\n", np, ret);
+		goto error_device_add;
+	}
+
+	return 0;
+
+error_device_add:
+	of_node_put(pdev->dev.of_node);
+	pdev->dev.of_node = NULL;
+
+	platform_device_put(pdev);
+
+error_device_alloc:
+	platform_driver_unregister(&fsl_of_pamu_driver);
+
+error_driver_register:
+	of_node_put(np);
+
+	return ret;
+}
+arch_initcall(fsl_pamu_init);
diff --git a/drivers/iommu/fsl_pamu.h b/drivers/iommu/fsl_pamu.h
new file mode 100644
index 000000000..c3434f29c
--- /dev/null
+++ b/drivers/iommu/fsl_pamu.h
@@ -0,0 +1,412 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2013 Freescale Semiconductor, Inc.
+ *
+ */
+
+#ifndef __FSL_PAMU_H
+#define __FSL_PAMU_H
+
+#include <linux/iommu.h>
+#include <linux/pci.h>
+
+#include <asm/fsl_pamu_stash.h>
+
+/* Bit Field macros
+ *	v = bit field variable; m = mask, m##_SHIFT = shift, x = value to load
+ */
+#define set_bf(v, m, x)		(v = ((v) & ~(m)) | (((x) << m##_SHIFT) & (m)))
+#define get_bf(v, m)		(((v) & (m)) >> m##_SHIFT)
+
+/* PAMU CCSR space */
+#define PAMU_PGC 0x00000000     /* Allows all peripheral accesses */
+#define PAMU_PE 0x40000000      /* enable PAMU                    */
+
+/* PAMU_OFFSET to the next pamu space in ccsr */
+#define PAMU_OFFSET 0x1000
+
+#define PAMU_MMAP_REGS_BASE 0
+
+struct pamu_mmap_regs {
+	u32 ppbah;
+	u32 ppbal;
+	u32 pplah;
+	u32 pplal;
+	u32 spbah;
+	u32 spbal;
+	u32 splah;
+	u32 splal;
+	u32 obah;
+	u32 obal;
+	u32 olah;
+	u32 olal;
+};
+
+/* PAMU Error Registers */
+#define PAMU_POES1 0x0040
+#define PAMU_POES2 0x0044
+#define PAMU_POEAH 0x0048
+#define PAMU_POEAL 0x004C
+#define PAMU_AVS1  0x0050
+#define PAMU_AVS1_AV    0x1
+#define PAMU_AVS1_OTV   0x6
+#define PAMU_AVS1_APV   0x78
+#define PAMU_AVS1_WAV   0x380
+#define PAMU_AVS1_LAV   0x1c00
+#define PAMU_AVS1_GCV   0x2000
+#define PAMU_AVS1_PDV   0x4000
+#define PAMU_AV_MASK    (PAMU_AVS1_AV | PAMU_AVS1_OTV | PAMU_AVS1_APV | PAMU_AVS1_WAV \
+			 | PAMU_AVS1_LAV | PAMU_AVS1_GCV | PAMU_AVS1_PDV)
+#define PAMU_AVS1_LIODN_SHIFT 16
+#define PAMU_LAV_LIODN_NOT_IN_PPAACT 0x400
+
+#define PAMU_AVS2  0x0054
+#define PAMU_AVAH  0x0058
+#define PAMU_AVAL  0x005C
+#define PAMU_EECTL 0x0060
+#define PAMU_EEDIS 0x0064
+#define PAMU_EEINTEN 0x0068
+#define PAMU_EEDET 0x006C
+#define PAMU_EEATTR 0x0070
+#define PAMU_EEAHI 0x0074
+#define PAMU_EEALO 0x0078
+#define PAMU_EEDHI 0X007C
+#define PAMU_EEDLO 0x0080
+#define PAMU_EECC  0x0084
+#define PAMU_UDAD  0x0090
+
+/* PAMU Revision Registers */
+#define PAMU_PR1 0x0BF8
+#define PAMU_PR2 0x0BFC
+
+/* PAMU version mask */
+#define PAMU_PR1_MASK 0xffff
+
+/* PAMU Capabilities Registers */
+#define PAMU_PC1 0x0C00
+#define PAMU_PC2 0x0C04
+#define PAMU_PC3 0x0C08
+#define PAMU_PC4 0x0C0C
+
+/* PAMU Control Register */
+#define PAMU_PC 0x0C10
+
+/* PAMU control defs */
+#define PAMU_CONTROL 0x0C10
+#define PAMU_PC_PGC 0x80000000  /* PAMU gate closed bit */
+#define PAMU_PC_PE   0x40000000 /* PAMU enable bit */
+#define PAMU_PC_SPCC 0x00000010 /* sPAACE cache enable */
+#define PAMU_PC_PPCC 0x00000001 /* pPAACE cache enable */
+#define PAMU_PC_OCE  0x00001000 /* OMT cache enable */
+
+#define PAMU_PFA1 0x0C14
+#define PAMU_PFA2 0x0C18
+
+#define PAMU_PC2_MLIODN(X) ((X) >> 16)
+#define PAMU_PC3_MWCE(X) (((X) >> 21) & 0xf)
+
+/* PAMU Interrupt control and Status Register */
+#define PAMU_PICS 0x0C1C
+#define PAMU_ACCESS_VIOLATION_STAT   0x8
+#define PAMU_ACCESS_VIOLATION_ENABLE 0x4
+
+/* PAMU Debug Registers */
+#define PAMU_PD1 0x0F00
+#define PAMU_PD2 0x0F04
+#define PAMU_PD3 0x0F08
+#define PAMU_PD4 0x0F0C
+
+#define PAACE_AP_PERMS_DENIED  0x0
+#define PAACE_AP_PERMS_QUERY   0x1
+#define PAACE_AP_PERMS_UPDATE  0x2
+#define PAACE_AP_PERMS_ALL     0x3
+
+#define PAACE_DD_TO_HOST       0x0
+#define PAACE_DD_TO_IO         0x1
+#define PAACE_PT_PRIMARY       0x0
+#define PAACE_PT_SECONDARY     0x1
+#define PAACE_V_INVALID        0x0
+#define PAACE_V_VALID          0x1
+#define PAACE_MW_SUBWINDOWS    0x1
+
+#define PAACE_WSE_4K           0xB
+#define PAACE_WSE_8K           0xC
+#define PAACE_WSE_16K          0xD
+#define PAACE_WSE_32K          0xE
+#define PAACE_WSE_64K          0xF
+#define PAACE_WSE_128K         0x10
+#define PAACE_WSE_256K         0x11
+#define PAACE_WSE_512K         0x12
+#define PAACE_WSE_1M           0x13
+#define PAACE_WSE_2M           0x14
+#define PAACE_WSE_4M           0x15
+#define PAACE_WSE_8M           0x16
+#define PAACE_WSE_16M          0x17
+#define PAACE_WSE_32M          0x18
+#define PAACE_WSE_64M          0x19
+#define PAACE_WSE_128M         0x1A
+#define PAACE_WSE_256M         0x1B
+#define PAACE_WSE_512M         0x1C
+#define PAACE_WSE_1G           0x1D
+#define PAACE_WSE_2G           0x1E
+#define PAACE_WSE_4G           0x1F
+
+#define PAACE_DID_PCI_EXPRESS_1 0x00
+#define PAACE_DID_PCI_EXPRESS_2 0x01
+#define PAACE_DID_PCI_EXPRESS_3 0x02
+#define PAACE_DID_PCI_EXPRESS_4 0x03
+#define PAACE_DID_LOCAL_BUS     0x04
+#define PAACE_DID_SRIO          0x0C
+#define PAACE_DID_MEM_1         0x10
+#define PAACE_DID_MEM_2         0x11
+#define PAACE_DID_MEM_3         0x12
+#define PAACE_DID_MEM_4         0x13
+#define PAACE_DID_MEM_1_2       0x14
+#define PAACE_DID_MEM_3_4       0x15
+#define PAACE_DID_MEM_1_4       0x16
+#define PAACE_DID_BM_SW_PORTAL  0x18
+#define PAACE_DID_PAMU          0x1C
+#define PAACE_DID_CAAM          0x21
+#define PAACE_DID_QM_SW_PORTAL  0x3C
+#define PAACE_DID_CORE0_INST    0x80
+#define PAACE_DID_CORE0_DATA    0x81
+#define PAACE_DID_CORE1_INST    0x82
+#define PAACE_DID_CORE1_DATA    0x83
+#define PAACE_DID_CORE2_INST    0x84
+#define PAACE_DID_CORE2_DATA    0x85
+#define PAACE_DID_CORE3_INST    0x86
+#define PAACE_DID_CORE3_DATA    0x87
+#define PAACE_DID_CORE4_INST    0x88
+#define PAACE_DID_CORE4_DATA    0x89
+#define PAACE_DID_CORE5_INST    0x8A
+#define PAACE_DID_CORE5_DATA    0x8B
+#define PAACE_DID_CORE6_INST    0x8C
+#define PAACE_DID_CORE6_DATA    0x8D
+#define PAACE_DID_CORE7_INST    0x8E
+#define PAACE_DID_CORE7_DATA    0x8F
+#define PAACE_DID_BROADCAST     0xFF
+
+#define PAACE_ATM_NO_XLATE      0x00
+#define PAACE_ATM_WINDOW_XLATE  0x01
+#define PAACE_ATM_PAGE_XLATE    0x02
+#define PAACE_ATM_WIN_PG_XLATE  (PAACE_ATM_WINDOW_XLATE | PAACE_ATM_PAGE_XLATE)
+#define PAACE_OTM_NO_XLATE      0x00
+#define PAACE_OTM_IMMEDIATE     0x01
+#define PAACE_OTM_INDEXED       0x02
+#define PAACE_OTM_RESERVED      0x03
+
+#define PAACE_M_COHERENCE_REQ   0x01
+
+#define PAACE_PID_0             0x0
+#define PAACE_PID_1             0x1
+#define PAACE_PID_2             0x2
+#define PAACE_PID_3             0x3
+#define PAACE_PID_4             0x4
+#define PAACE_PID_5             0x5
+#define PAACE_PID_6             0x6
+#define PAACE_PID_7             0x7
+
+#define PAACE_TCEF_FORMAT0_8B   0x00
+#define PAACE_TCEF_FORMAT1_RSVD 0x01
+/*
+ * Hard coded value for the PAACT size to accommodate
+ * maximum LIODN value generated by u-boot.
+ */
+#define PAACE_NUMBER_ENTRIES    0x500
+/* Hard coded value for the SPAACT size */
+#define SPAACE_NUMBER_ENTRIES	0x800
+
+#define	OME_NUMBER_ENTRIES      16
+
+/* PAACE Bit Field Defines */
+#define PPAACE_AF_WBAL			0xfffff000
+#define PPAACE_AF_WBAL_SHIFT		12
+#define PPAACE_AF_WSE			0x00000fc0
+#define PPAACE_AF_WSE_SHIFT		6
+#define PPAACE_AF_MW			0x00000020
+#define PPAACE_AF_MW_SHIFT		5
+
+#define SPAACE_AF_LIODN			0xffff0000
+#define SPAACE_AF_LIODN_SHIFT		16
+
+#define PAACE_AF_AP			0x00000018
+#define PAACE_AF_AP_SHIFT		3
+#define PAACE_AF_DD			0x00000004
+#define PAACE_AF_DD_SHIFT		2
+#define PAACE_AF_PT			0x00000002
+#define PAACE_AF_PT_SHIFT		1
+#define PAACE_AF_V			0x00000001
+#define PAACE_AF_V_SHIFT		0
+
+#define PAACE_DA_HOST_CR		0x80
+#define PAACE_DA_HOST_CR_SHIFT		7
+
+#define PAACE_IA_CID			0x00FF0000
+#define PAACE_IA_CID_SHIFT		16
+#define PAACE_IA_WCE			0x000000F0
+#define PAACE_IA_WCE_SHIFT		4
+#define PAACE_IA_ATM			0x0000000C
+#define PAACE_IA_ATM_SHIFT		2
+#define PAACE_IA_OTM			0x00000003
+#define PAACE_IA_OTM_SHIFT		0
+
+#define PAACE_WIN_TWBAL			0xfffff000
+#define PAACE_WIN_TWBAL_SHIFT		12
+#define PAACE_WIN_SWSE			0x00000fc0
+#define PAACE_WIN_SWSE_SHIFT		6
+
+/* PAMU Data Structures */
+/* primary / secondary paact structure */
+struct paace {
+	/* PAACE Offset 0x00 */
+	u32 wbah;				/* only valid for Primary PAACE */
+	u32 addr_bitfields;		/* See P/S PAACE_AF_* */
+
+	/* PAACE Offset 0x08 */
+	/* Interpretation of first 32 bits dependent on DD above */
+	union {
+		struct {
+			/* Destination ID, see PAACE_DID_* defines */
+			u8 did;
+			/* Partition ID */
+			u8 pid;
+			/* Snoop ID */
+			u8 snpid;
+			/* coherency_required : 1 reserved : 7 */
+			u8 coherency_required; /* See PAACE_DA_* */
+		} to_host;
+		struct {
+			/* Destination ID, see PAACE_DID_* defines */
+			u8  did;
+			u8  reserved1;
+			u16 reserved2;
+		} to_io;
+	} domain_attr;
+
+	/* Implementation attributes + window count + address & operation translation modes */
+	u32 impl_attr;			/* See PAACE_IA_* */
+
+	/* PAACE Offset 0x10 */
+	/* Translated window base address */
+	u32 twbah;
+	u32 win_bitfields;			/* See PAACE_WIN_* */
+
+	/* PAACE Offset 0x18 */
+	/* first secondary paace entry */
+	u32 fspi;				/* only valid for Primary PAACE */
+	union {
+		struct {
+			u8 ioea;
+			u8 moea;
+			u8 ioeb;
+			u8 moeb;
+		} immed_ot;
+		struct {
+			u16 reserved;
+			u16 omi;
+		} index_ot;
+	} op_encode;
+
+	/* PAACE Offsets 0x20-0x38 */
+	u32 reserved[8];			/* not currently implemented */
+};
+
+/* OME : Operation mapping entry
+ * MOE : Mapped Operation Encodings
+ * The operation mapping table is table containing operation mapping entries (OME).
+ * The index of a particular OME is programmed in the PAACE entry for translation
+ * in bound I/O operations corresponding to an LIODN. The OMT is used for translation
+ * specifically in case of the indexed translation mode. Each OME contains a 128
+ * byte mapped operation encoding (MOE), where each byte represents an MOE.
+ */
+#define NUM_MOE 128
+struct ome {
+	u8 moe[NUM_MOE];
+} __packed;
+
+#define PAACT_SIZE              (sizeof(struct paace) * PAACE_NUMBER_ENTRIES)
+#define SPAACT_SIZE              (sizeof(struct paace) * SPAACE_NUMBER_ENTRIES)
+#define OMT_SIZE                (sizeof(struct ome) * OME_NUMBER_ENTRIES)
+
+#define PAMU_PAGE_SHIFT 12
+#define PAMU_PAGE_SIZE  4096ULL
+
+#define IOE_READ        0x00
+#define IOE_READ_IDX    0x00
+#define IOE_WRITE       0x81
+#define IOE_WRITE_IDX   0x01
+#define IOE_EREAD0      0x82    /* Enhanced read type 0 */
+#define IOE_EREAD0_IDX  0x02    /* Enhanced read type 0 */
+#define IOE_EWRITE0     0x83    /* Enhanced write type 0 */
+#define IOE_EWRITE0_IDX 0x03    /* Enhanced write type 0 */
+#define IOE_DIRECT0     0x84    /* Directive type 0 */
+#define IOE_DIRECT0_IDX 0x04    /* Directive type 0 */
+#define IOE_EREAD1      0x85    /* Enhanced read type 1 */
+#define IOE_EREAD1_IDX  0x05    /* Enhanced read type 1 */
+#define IOE_EWRITE1     0x86    /* Enhanced write type 1 */
+#define IOE_EWRITE1_IDX 0x06    /* Enhanced write type 1 */
+#define IOE_DIRECT1     0x87    /* Directive type 1 */
+#define IOE_DIRECT1_IDX 0x07    /* Directive type 1 */
+#define IOE_RAC         0x8c    /* Read with Atomic clear */
+#define IOE_RAC_IDX     0x0c    /* Read with Atomic clear */
+#define IOE_RAS         0x8d    /* Read with Atomic set */
+#define IOE_RAS_IDX     0x0d    /* Read with Atomic set */
+#define IOE_RAD         0x8e    /* Read with Atomic decrement */
+#define IOE_RAD_IDX     0x0e    /* Read with Atomic decrement */
+#define IOE_RAI         0x8f    /* Read with Atomic increment */
+#define IOE_RAI_IDX     0x0f    /* Read with Atomic increment */
+
+#define EOE_READ        0x00
+#define EOE_WRITE       0x01
+#define EOE_RAC         0x0c    /* Read with Atomic clear */
+#define EOE_RAS         0x0d    /* Read with Atomic set */
+#define EOE_RAD         0x0e    /* Read with Atomic decrement */
+#define EOE_RAI         0x0f    /* Read with Atomic increment */
+#define EOE_LDEC        0x10    /* Load external cache */
+#define EOE_LDECL       0x11    /* Load external cache with stash lock */
+#define EOE_LDECPE      0x12    /* Load external cache with preferred exclusive */
+#define EOE_LDECPEL     0x13    /* Load external cache with preferred exclusive and lock */
+#define EOE_LDECFE      0x14    /* Load external cache with forced exclusive */
+#define EOE_LDECFEL     0x15    /* Load external cache with forced exclusive and lock */
+#define EOE_RSA         0x16    /* Read with stash allocate */
+#define EOE_RSAU        0x17    /* Read with stash allocate and unlock */
+#define EOE_READI       0x18    /* Read with invalidate */
+#define EOE_RWNITC      0x19    /* Read with no intention to cache */
+#define EOE_WCI         0x1a    /* Write cache inhibited */
+#define EOE_WWSA        0x1b    /* Write with stash allocate */
+#define EOE_WWSAL       0x1c    /* Write with stash allocate and lock */
+#define EOE_WWSAO       0x1d    /* Write with stash allocate only */
+#define EOE_WWSAOL      0x1e    /* Write with stash allocate only and lock */
+#define EOE_VALID       0x80
+
+/* Function prototypes */
+int pamu_domain_init(void);
+int pamu_enable_liodn(int liodn);
+int pamu_disable_liodn(int liodn);
+void pamu_free_subwins(int liodn);
+int pamu_config_ppaace(int liodn, phys_addr_t win_addr, phys_addr_t win_size,
+		       u32 omi, unsigned long rpn, u32 snoopid, uint32_t stashid,
+		       u32 subwin_cnt, int prot);
+int pamu_config_spaace(int liodn, u32 subwin_cnt, u32 subwin_addr,
+		       phys_addr_t subwin_size, u32 omi, unsigned long rpn,
+		       uint32_t snoopid, u32 stashid, int enable, int prot);
+
+u32 get_stash_id(u32 stash_dest_hint, u32 vcpu);
+void get_ome_index(u32 *omi_index, struct device *dev);
+int  pamu_update_paace_stash(int liodn, u32 subwin, u32 value);
+int pamu_disable_spaace(int liodn, u32 subwin);
+u32 pamu_get_max_subwin_cnt(void);
+
+#endif  /* __FSL_PAMU_H */
diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c
new file mode 100644
index 000000000..f089136e9
--- /dev/null
+++ b/drivers/iommu/fsl_pamu_domain.c
@@ -0,0 +1,1096 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2013 Freescale Semiconductor, Inc.
+ * Author: Varun Sethi <varun.sethi@freescale.com>
+ *
+ */
+
+#define pr_fmt(fmt)    "fsl-pamu-domain: %s: " fmt, __func__
+
+#include "fsl_pamu_domain.h"
+
+#include <sysdev/fsl_pci.h>
+
+/*
+ * Global spinlock that needs to be held while
+ * configuring PAMU.
+ */
+static DEFINE_SPINLOCK(iommu_lock);
+
+static struct kmem_cache *fsl_pamu_domain_cache;
+static struct kmem_cache *iommu_devinfo_cache;
+static DEFINE_SPINLOCK(device_domain_lock);
+
+struct iommu_device pamu_iommu;	/* IOMMU core code handle */
+
+static struct fsl_dma_domain *to_fsl_dma_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct fsl_dma_domain, iommu_domain);
+}
+
+static int __init iommu_init_mempool(void)
+{
+	fsl_pamu_domain_cache = kmem_cache_create("fsl_pamu_domain",
+						  sizeof(struct fsl_dma_domain),
+						  0,
+						  SLAB_HWCACHE_ALIGN,
+						  NULL);
+	if (!fsl_pamu_domain_cache) {
+		pr_debug("Couldn't create fsl iommu_domain cache\n");
+		return -ENOMEM;
+	}
+
+	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
+						sizeof(struct device_domain_info),
+						0,
+						SLAB_HWCACHE_ALIGN,
+						NULL);
+	if (!iommu_devinfo_cache) {
+		pr_debug("Couldn't create devinfo cache\n");
+		kmem_cache_destroy(fsl_pamu_domain_cache);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static phys_addr_t get_phys_addr(struct fsl_dma_domain *dma_domain, dma_addr_t iova)
+{
+	u32 win_cnt = dma_domain->win_cnt;
+	struct dma_window *win_ptr = &dma_domain->win_arr[0];
+	struct iommu_domain_geometry *geom;
+
+	geom = &dma_domain->iommu_domain.geometry;
+
+	if (!win_cnt || !dma_domain->geom_size) {
+		pr_debug("Number of windows/geometry not configured for the domain\n");
+		return 0;
+	}
+
+	if (win_cnt > 1) {
+		u64 subwin_size;
+		dma_addr_t subwin_iova;
+		u32 wnd;
+
+		subwin_size = dma_domain->geom_size >> ilog2(win_cnt);
+		subwin_iova = iova & ~(subwin_size - 1);
+		wnd = (subwin_iova - geom->aperture_start) >> ilog2(subwin_size);
+		win_ptr = &dma_domain->win_arr[wnd];
+	}
+
+	if (win_ptr->valid)
+		return win_ptr->paddr + (iova & (win_ptr->size - 1));
+
+	return 0;
+}
+
+static int map_subwins(int liodn, struct fsl_dma_domain *dma_domain)
+{
+	struct dma_window *sub_win_ptr = &dma_domain->win_arr[0];
+	int i, ret;
+	unsigned long rpn, flags;
+
+	for (i = 0; i < dma_domain->win_cnt; i++) {
+		if (sub_win_ptr[i].valid) {
+			rpn = sub_win_ptr[i].paddr >> PAMU_PAGE_SHIFT;
+			spin_lock_irqsave(&iommu_lock, flags);
+			ret = pamu_config_spaace(liodn, dma_domain->win_cnt, i,
+						 sub_win_ptr[i].size,
+						 ~(u32)0,
+						 rpn,
+						 dma_domain->snoop_id,
+						 dma_domain->stash_id,
+						 (i > 0) ? 1 : 0,
+						 sub_win_ptr[i].prot);
+			spin_unlock_irqrestore(&iommu_lock, flags);
+			if (ret) {
+				pr_debug("SPAACE configuration failed for liodn %d\n",
+					 liodn);
+				return ret;
+			}
+		}
+	}
+
+	return ret;
+}
+
+static int map_win(int liodn, struct fsl_dma_domain *dma_domain)
+{
+	int ret;
+	struct dma_window *wnd = &dma_domain->win_arr[0];
+	phys_addr_t wnd_addr = dma_domain->iommu_domain.geometry.aperture_start;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iommu_lock, flags);
+	ret = pamu_config_ppaace(liodn, wnd_addr,
+				 wnd->size,
+				 ~(u32)0,
+				 wnd->paddr >> PAMU_PAGE_SHIFT,
+				 dma_domain->snoop_id, dma_domain->stash_id,
+				 0, wnd->prot);
+	spin_unlock_irqrestore(&iommu_lock, flags);
+	if (ret)
+		pr_debug("PAACE configuration failed for liodn %d\n", liodn);
+
+	return ret;
+}
+
+/* Map the DMA window corresponding to the LIODN */
+static int map_liodn(int liodn, struct fsl_dma_domain *dma_domain)
+{
+	if (dma_domain->win_cnt > 1)
+		return map_subwins(liodn, dma_domain);
+	else
+		return map_win(liodn, dma_domain);
+}
+
+/* Update window/subwindow mapping for the LIODN */
+static int update_liodn(int liodn, struct fsl_dma_domain *dma_domain, u32 wnd_nr)
+{
+	int ret;
+	struct dma_window *wnd = &dma_domain->win_arr[wnd_nr];
+	unsigned long flags;
+
+	spin_lock_irqsave(&iommu_lock, flags);
+	if (dma_domain->win_cnt > 1) {
+		ret = pamu_config_spaace(liodn, dma_domain->win_cnt, wnd_nr,
+					 wnd->size,
+					 ~(u32)0,
+					 wnd->paddr >> PAMU_PAGE_SHIFT,
+					 dma_domain->snoop_id,
+					 dma_domain->stash_id,
+					 (wnd_nr > 0) ? 1 : 0,
+					 wnd->prot);
+		if (ret)
+			pr_debug("Subwindow reconfiguration failed for liodn %d\n",
+				 liodn);
+	} else {
+		phys_addr_t wnd_addr;
+
+		wnd_addr = dma_domain->iommu_domain.geometry.aperture_start;
+
+		ret = pamu_config_ppaace(liodn, wnd_addr,
+					 wnd->size,
+					 ~(u32)0,
+					 wnd->paddr >> PAMU_PAGE_SHIFT,
+					 dma_domain->snoop_id, dma_domain->stash_id,
+					 0, wnd->prot);
+		if (ret)
+			pr_debug("Window reconfiguration failed for liodn %d\n",
+				 liodn);
+	}
+
+	spin_unlock_irqrestore(&iommu_lock, flags);
+
+	return ret;
+}
+
+static int update_liodn_stash(int liodn, struct fsl_dma_domain *dma_domain,
+			      u32 val)
+{
+	int ret = 0, i;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iommu_lock, flags);
+	if (!dma_domain->win_arr) {
+		pr_debug("Windows not configured, stash destination update failed for liodn %d\n",
+			 liodn);
+		spin_unlock_irqrestore(&iommu_lock, flags);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < dma_domain->win_cnt; i++) {
+		ret = pamu_update_paace_stash(liodn, i, val);
+		if (ret) {
+			pr_debug("Failed to update SPAACE %d field for liodn %d\n ",
+				 i, liodn);
+			spin_unlock_irqrestore(&iommu_lock, flags);
+			return ret;
+		}
+	}
+
+	spin_unlock_irqrestore(&iommu_lock, flags);
+
+	return ret;
+}
+
+/* Set the geometry parameters for a LIODN */
+static int pamu_set_liodn(int liodn, struct device *dev,
+			  struct fsl_dma_domain *dma_domain,
+			  struct iommu_domain_geometry *geom_attr,
+			  u32 win_cnt)
+{
+	phys_addr_t window_addr, window_size;
+	phys_addr_t subwin_size;
+	int ret = 0, i;
+	u32 omi_index = ~(u32)0;
+	unsigned long flags;
+
+	/*
+	 * Configure the omi_index at the geometry setup time.
+	 * This is a static value which depends on the type of
+	 * device and would not change thereafter.
+	 */
+	get_ome_index(&omi_index, dev);
+
+	window_addr = geom_attr->aperture_start;
+	window_size = dma_domain->geom_size;
+
+	spin_lock_irqsave(&iommu_lock, flags);
+	ret = pamu_disable_liodn(liodn);
+	if (!ret)
+		ret = pamu_config_ppaace(liodn, window_addr, window_size, omi_index,
+					 0, dma_domain->snoop_id,
+					 dma_domain->stash_id, win_cnt, 0);
+	spin_unlock_irqrestore(&iommu_lock, flags);
+	if (ret) {
+		pr_debug("PAACE configuration failed for liodn %d, win_cnt =%d\n",
+			 liodn, win_cnt);
+		return ret;
+	}
+
+	if (win_cnt > 1) {
+		subwin_size = window_size >> ilog2(win_cnt);
+		for (i = 0; i < win_cnt; i++) {
+			spin_lock_irqsave(&iommu_lock, flags);
+			ret = pamu_disable_spaace(liodn, i);
+			if (!ret)
+				ret = pamu_config_spaace(liodn, win_cnt, i,
+							 subwin_size, omi_index,
+							 0, dma_domain->snoop_id,
+							 dma_domain->stash_id,
+							 0, 0);
+			spin_unlock_irqrestore(&iommu_lock, flags);
+			if (ret) {
+				pr_debug("SPAACE configuration failed for liodn %d\n",
+					 liodn);
+				return ret;
+			}
+		}
+	}
+
+	return ret;
+}
+
+static int check_size(u64 size, dma_addr_t iova)
+{
+	/*
+	 * Size must be a power of two and at least be equal
+	 * to PAMU page size.
+	 */
+	if ((size & (size - 1)) || size < PAMU_PAGE_SIZE) {
+		pr_debug("Size too small or not a power of two\n");
+		return -EINVAL;
+	}
+
+	/* iova must be page size aligned */
+	if (iova & (size - 1)) {
+		pr_debug("Address is not aligned with window size\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct fsl_dma_domain *iommu_alloc_dma_domain(void)
+{
+	struct fsl_dma_domain *domain;
+
+	domain = kmem_cache_zalloc(fsl_pamu_domain_cache, GFP_KERNEL);
+	if (!domain)
+		return NULL;
+
+	domain->stash_id = ~(u32)0;
+	domain->snoop_id = ~(u32)0;
+	domain->win_cnt = pamu_get_max_subwin_cnt();
+	domain->geom_size = 0;
+
+	INIT_LIST_HEAD(&domain->devices);
+
+	spin_lock_init(&domain->domain_lock);
+
+	return domain;
+}
+
+static void remove_device_ref(struct device_domain_info *info, u32 win_cnt)
+{
+	unsigned long flags;
+
+	list_del(&info->link);
+	spin_lock_irqsave(&iommu_lock, flags);
+	if (win_cnt > 1)
+		pamu_free_subwins(info->liodn);
+	pamu_disable_liodn(info->liodn);
+	spin_unlock_irqrestore(&iommu_lock, flags);
+	spin_lock_irqsave(&device_domain_lock, flags);
+	info->dev->archdata.iommu_domain = NULL;
+	kmem_cache_free(iommu_devinfo_cache, info);
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
+static void detach_device(struct device *dev, struct fsl_dma_domain *dma_domain)
+{
+	struct device_domain_info *info, *tmp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dma_domain->domain_lock, flags);
+	/* Remove the device from the domain device list */
+	list_for_each_entry_safe(info, tmp, &dma_domain->devices, link) {
+		if (!dev || (info->dev == dev))
+			remove_device_ref(info, dma_domain->win_cnt);
+	}
+	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+}
+
+static void attach_device(struct fsl_dma_domain *dma_domain, int liodn, struct device *dev)
+{
+	struct device_domain_info *info, *old_domain_info;
+	unsigned long flags;
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	/*
+	 * Check here if the device is already attached to domain or not.
+	 * If the device is already attached to a domain detach it.
+	 */
+	old_domain_info = dev->archdata.iommu_domain;
+	if (old_domain_info && old_domain_info->domain != dma_domain) {
+		spin_unlock_irqrestore(&device_domain_lock, flags);
+		detach_device(dev, old_domain_info->domain);
+		spin_lock_irqsave(&device_domain_lock, flags);
+	}
+
+	info = kmem_cache_zalloc(iommu_devinfo_cache, GFP_ATOMIC);
+
+	info->dev = dev;
+	info->liodn = liodn;
+	info->domain = dma_domain;
+
+	list_add(&info->link, &dma_domain->devices);
+	/*
+	 * In case of devices with multiple LIODNs just store
+	 * the info for the first LIODN as all
+	 * LIODNs share the same domain
+	 */
+	if (!dev->archdata.iommu_domain)
+		dev->archdata.iommu_domain = info;
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
+static phys_addr_t fsl_pamu_iova_to_phys(struct iommu_domain *domain,
+					 dma_addr_t iova)
+{
+	struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+
+	if (iova < domain->geometry.aperture_start ||
+	    iova > domain->geometry.aperture_end)
+		return 0;
+
+	return get_phys_addr(dma_domain, iova);
+}
+
+static bool fsl_pamu_capable(enum iommu_cap cap)
+{
+	return cap == IOMMU_CAP_CACHE_COHERENCY;
+}
+
+static void fsl_pamu_domain_free(struct iommu_domain *domain)
+{
+	struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+
+	/* remove all the devices from the device list */
+	detach_device(NULL, dma_domain);
+
+	dma_domain->enabled = 0;
+	dma_domain->mapped = 0;
+
+	kmem_cache_free(fsl_pamu_domain_cache, dma_domain);
+}
+
+static struct iommu_domain *fsl_pamu_domain_alloc(unsigned type)
+{
+	struct fsl_dma_domain *dma_domain;
+
+	if (type != IOMMU_DOMAIN_UNMANAGED)
+		return NULL;
+
+	dma_domain = iommu_alloc_dma_domain();
+	if (!dma_domain) {
+		pr_debug("dma_domain allocation failed\n");
+		return NULL;
+	}
+	/* defaul geometry 64 GB i.e. maximum system address */
+	dma_domain->iommu_domain. geometry.aperture_start = 0;
+	dma_domain->iommu_domain.geometry.aperture_end = (1ULL << 36) - 1;
+	dma_domain->iommu_domain.geometry.force_aperture = true;
+
+	return &dma_domain->iommu_domain;
+}
+
+/* Configure geometry settings for all LIODNs associated with domain */
+static int pamu_set_domain_geometry(struct fsl_dma_domain *dma_domain,
+				    struct iommu_domain_geometry *geom_attr,
+				    u32 win_cnt)
+{
+	struct device_domain_info *info;
+	int ret = 0;
+
+	list_for_each_entry(info, &dma_domain->devices, link) {
+		ret = pamu_set_liodn(info->liodn, info->dev, dma_domain,
+				     geom_attr, win_cnt);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+/* Update stash destination for all LIODNs associated with the domain */
+static int update_domain_stash(struct fsl_dma_domain *dma_domain, u32 val)
+{
+	struct device_domain_info *info;
+	int ret = 0;
+
+	list_for_each_entry(info, &dma_domain->devices, link) {
+		ret = update_liodn_stash(info->liodn, dma_domain, val);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+/* Update domain mappings for all LIODNs associated with the domain */
+static int update_domain_mapping(struct fsl_dma_domain *dma_domain, u32 wnd_nr)
+{
+	struct device_domain_info *info;
+	int ret = 0;
+
+	list_for_each_entry(info, &dma_domain->devices, link) {
+		ret = update_liodn(info->liodn, dma_domain, wnd_nr);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
+static int disable_domain_win(struct fsl_dma_domain *dma_domain, u32 wnd_nr)
+{
+	struct device_domain_info *info;
+	int ret = 0;
+
+	list_for_each_entry(info, &dma_domain->devices, link) {
+		if (dma_domain->win_cnt == 1 && dma_domain->enabled) {
+			ret = pamu_disable_liodn(info->liodn);
+			if (!ret)
+				dma_domain->enabled = 0;
+		} else {
+			ret = pamu_disable_spaace(info->liodn, wnd_nr);
+		}
+	}
+
+	return ret;
+}
+
+static void fsl_pamu_window_disable(struct iommu_domain *domain, u32 wnd_nr)
+{
+	struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&dma_domain->domain_lock, flags);
+	if (!dma_domain->win_arr) {
+		pr_debug("Number of windows not configured\n");
+		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+		return;
+	}
+
+	if (wnd_nr >= dma_domain->win_cnt) {
+		pr_debug("Invalid window index\n");
+		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+		return;
+	}
+
+	if (dma_domain->win_arr[wnd_nr].valid) {
+		ret = disable_domain_win(dma_domain, wnd_nr);
+		if (!ret) {
+			dma_domain->win_arr[wnd_nr].valid = 0;
+			dma_domain->mapped--;
+		}
+	}
+
+	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+}
+
+static int fsl_pamu_window_enable(struct iommu_domain *domain, u32 wnd_nr,
+				  phys_addr_t paddr, u64 size, int prot)
+{
+	struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+	struct dma_window *wnd;
+	int pamu_prot = 0;
+	int ret;
+	unsigned long flags;
+	u64 win_size;
+
+	if (prot & IOMMU_READ)
+		pamu_prot |= PAACE_AP_PERMS_QUERY;
+	if (prot & IOMMU_WRITE)
+		pamu_prot |= PAACE_AP_PERMS_UPDATE;
+
+	spin_lock_irqsave(&dma_domain->domain_lock, flags);
+	if (!dma_domain->win_arr) {
+		pr_debug("Number of windows not configured\n");
+		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+		return -ENODEV;
+	}
+
+	if (wnd_nr >= dma_domain->win_cnt) {
+		pr_debug("Invalid window index\n");
+		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+		return -EINVAL;
+	}
+
+	win_size = dma_domain->geom_size >> ilog2(dma_domain->win_cnt);
+	if (size > win_size) {
+		pr_debug("Invalid window size\n");
+		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+		return -EINVAL;
+	}
+
+	if (dma_domain->win_cnt == 1) {
+		if (dma_domain->enabled) {
+			pr_debug("Disable the window before updating the mapping\n");
+			spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+			return -EBUSY;
+		}
+
+		ret = check_size(size, domain->geometry.aperture_start);
+		if (ret) {
+			pr_debug("Aperture start not aligned to the size\n");
+			spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+			return -EINVAL;
+		}
+	}
+
+	wnd = &dma_domain->win_arr[wnd_nr];
+	if (!wnd->valid) {
+		wnd->paddr = paddr;
+		wnd->size = size;
+		wnd->prot = pamu_prot;
+
+		ret = update_domain_mapping(dma_domain, wnd_nr);
+		if (!ret) {
+			wnd->valid = 1;
+			dma_domain->mapped++;
+		}
+	} else {
+		pr_debug("Disable the window before updating the mapping\n");
+		ret = -EBUSY;
+	}
+
+	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+
+	return ret;
+}
+
+/*
+ * Attach the LIODN to the DMA domain and configure the geometry
+ * and window mappings.
+ */
+static int handle_attach_device(struct fsl_dma_domain *dma_domain,
+				struct device *dev, const u32 *liodn,
+				int num)
+{
+	unsigned long flags;
+	struct iommu_domain *domain = &dma_domain->iommu_domain;
+	int ret = 0;
+	int i;
+
+	spin_lock_irqsave(&dma_domain->domain_lock, flags);
+	for (i = 0; i < num; i++) {
+		/* Ensure that LIODN value is valid */
+		if (liodn[i] >= PAACE_NUMBER_ENTRIES) {
+			pr_debug("Invalid liodn %d, attach device failed for %pOF\n",
+				 liodn[i], dev->of_node);
+			ret = -EINVAL;
+			break;
+		}
+
+		attach_device(dma_domain, liodn[i], dev);
+		/*
+		 * Check if geometry has already been configured
+		 * for the domain. If yes, set the geometry for
+		 * the LIODN.
+		 */
+		if (dma_domain->win_arr) {
+			u32 win_cnt = dma_domain->win_cnt > 1 ? dma_domain->win_cnt : 0;
+
+			ret = pamu_set_liodn(liodn[i], dev, dma_domain,
+					     &domain->geometry, win_cnt);
+			if (ret)
+				break;
+			if (dma_domain->mapped) {
+				/*
+				 * Create window/subwindow mapping for
+				 * the LIODN.
+				 */
+				ret = map_liodn(liodn[i], dma_domain);
+				if (ret)
+					break;
+			}
+		}
+	}
+	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+
+	return ret;
+}
+
+static int fsl_pamu_attach_device(struct iommu_domain *domain,
+				  struct device *dev)
+{
+	struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+	const u32 *liodn;
+	u32 liodn_cnt;
+	int len, ret = 0;
+	struct pci_dev *pdev = NULL;
+	struct pci_controller *pci_ctl;
+
+	/*
+	 * Use LIODN of the PCI controller while attaching a
+	 * PCI device.
+	 */
+	if (dev_is_pci(dev)) {
+		pdev = to_pci_dev(dev);
+		pci_ctl = pci_bus_to_host(pdev->bus);
+		/*
+		 * make dev point to pci controller device
+		 * so we can get the LIODN programmed by
+		 * u-boot.
+		 */
+		dev = pci_ctl->parent;
+	}
+
+	liodn = of_get_property(dev->of_node, "fsl,liodn", &len);
+	if (liodn) {
+		liodn_cnt = len / sizeof(u32);
+		ret = handle_attach_device(dma_domain, dev, liodn, liodn_cnt);
+	} else {
+		pr_debug("missing fsl,liodn property at %pOF\n", dev->of_node);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+static void fsl_pamu_detach_device(struct iommu_domain *domain,
+				   struct device *dev)
+{
+	struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+	const u32 *prop;
+	int len;
+	struct pci_dev *pdev = NULL;
+	struct pci_controller *pci_ctl;
+
+	/*
+	 * Use LIODN of the PCI controller while detaching a
+	 * PCI device.
+	 */
+	if (dev_is_pci(dev)) {
+		pdev = to_pci_dev(dev);
+		pci_ctl = pci_bus_to_host(pdev->bus);
+		/*
+		 * make dev point to pci controller device
+		 * so we can get the LIODN programmed by
+		 * u-boot.
+		 */
+		dev = pci_ctl->parent;
+	}
+
+	prop = of_get_property(dev->of_node, "fsl,liodn", &len);
+	if (prop)
+		detach_device(dev, dma_domain);
+	else
+		pr_debug("missing fsl,liodn property at %pOF\n", dev->of_node);
+}
+
+static  int configure_domain_geometry(struct iommu_domain *domain, void *data)
+{
+	struct iommu_domain_geometry *geom_attr = data;
+	struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+	dma_addr_t geom_size;
+	unsigned long flags;
+
+	geom_size = geom_attr->aperture_end - geom_attr->aperture_start + 1;
+	/*
+	 * Sanity check the geometry size. Also, we do not support
+	 * DMA outside of the geometry.
+	 */
+	if (check_size(geom_size, geom_attr->aperture_start) ||
+	    !geom_attr->force_aperture) {
+		pr_debug("Invalid PAMU geometry attributes\n");
+		return -EINVAL;
+	}
+
+	spin_lock_irqsave(&dma_domain->domain_lock, flags);
+	if (dma_domain->enabled) {
+		pr_debug("Can't set geometry attributes as domain is active\n");
+		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+		return  -EBUSY;
+	}
+
+	/* Copy the domain geometry information */
+	memcpy(&domain->geometry, geom_attr,
+	       sizeof(struct iommu_domain_geometry));
+	dma_domain->geom_size = geom_size;
+
+	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+
+	return 0;
+}
+
+/* Set the domain stash attribute */
+static int configure_domain_stash(struct fsl_dma_domain *dma_domain, void *data)
+{
+	struct pamu_stash_attribute *stash_attr = data;
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&dma_domain->domain_lock, flags);
+
+	memcpy(&dma_domain->dma_stash, stash_attr,
+	       sizeof(struct pamu_stash_attribute));
+
+	dma_domain->stash_id = get_stash_id(stash_attr->cache,
+					    stash_attr->cpu);
+	if (dma_domain->stash_id == ~(u32)0) {
+		pr_debug("Invalid stash attributes\n");
+		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+		return -EINVAL;
+	}
+
+	ret = update_domain_stash(dma_domain, dma_domain->stash_id);
+
+	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+
+	return ret;
+}
+
+/* Configure domain dma state i.e. enable/disable DMA */
+static int configure_domain_dma_state(struct fsl_dma_domain *dma_domain, bool enable)
+{
+	struct device_domain_info *info;
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&dma_domain->domain_lock, flags);
+
+	if (enable && !dma_domain->mapped) {
+		pr_debug("Can't enable DMA domain without valid mapping\n");
+		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+		return -ENODEV;
+	}
+
+	dma_domain->enabled = enable;
+	list_for_each_entry(info, &dma_domain->devices, link) {
+		ret = (enable) ? pamu_enable_liodn(info->liodn) :
+			pamu_disable_liodn(info->liodn);
+		if (ret)
+			pr_debug("Unable to set dma state for liodn %d",
+				 info->liodn);
+	}
+	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+
+	return 0;
+}
+
+static int fsl_pamu_set_domain_attr(struct iommu_domain *domain,
+				    enum iommu_attr attr_type, void *data)
+{
+	struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+	int ret = 0;
+
+	switch (attr_type) {
+	case DOMAIN_ATTR_GEOMETRY:
+		ret = configure_domain_geometry(domain, data);
+		break;
+	case DOMAIN_ATTR_FSL_PAMU_STASH:
+		ret = configure_domain_stash(dma_domain, data);
+		break;
+	case DOMAIN_ATTR_FSL_PAMU_ENABLE:
+		ret = configure_domain_dma_state(dma_domain, *(int *)data);
+		break;
+	default:
+		pr_debug("Unsupported attribute type\n");
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static int fsl_pamu_get_domain_attr(struct iommu_domain *domain,
+				    enum iommu_attr attr_type, void *data)
+{
+	struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+	int ret = 0;
+
+	switch (attr_type) {
+	case DOMAIN_ATTR_FSL_PAMU_STASH:
+		memcpy(data, &dma_domain->dma_stash,
+		       sizeof(struct pamu_stash_attribute));
+		break;
+	case DOMAIN_ATTR_FSL_PAMU_ENABLE:
+		*(int *)data = dma_domain->enabled;
+		break;
+	case DOMAIN_ATTR_FSL_PAMUV1:
+		*(int *)data = DOMAIN_ATTR_FSL_PAMUV1;
+		break;
+	default:
+		pr_debug("Unsupported attribute type\n");
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static struct iommu_group *get_device_iommu_group(struct device *dev)
+{
+	struct iommu_group *group;
+
+	group = iommu_group_get(dev);
+	if (!group)
+		group = iommu_group_alloc();
+
+	return group;
+}
+
+static  bool check_pci_ctl_endpt_part(struct pci_controller *pci_ctl)
+{
+	u32 version;
+
+	/* Check the PCI controller version number by readding BRR1 register */
+	version = in_be32(pci_ctl->cfg_addr + (PCI_FSL_BRR1 >> 2));
+	version &= PCI_FSL_BRR1_VER;
+	/* If PCI controller version is >= 0x204 we can partition endpoints */
+	return version >= 0x204;
+}
+
+/* Get iommu group information from peer devices or devices on the parent bus */
+static struct iommu_group *get_shared_pci_device_group(struct pci_dev *pdev)
+{
+	struct pci_dev *tmp;
+	struct iommu_group *group;
+	struct pci_bus *bus = pdev->bus;
+
+	/*
+	 * Traverese the pci bus device list to get
+	 * the shared iommu group.
+	 */
+	while (bus) {
+		list_for_each_entry(tmp, &bus->devices, bus_list) {
+			if (tmp == pdev)
+				continue;
+			group = iommu_group_get(&tmp->dev);
+			if (group)
+				return group;
+		}
+
+		bus = bus->parent;
+	}
+
+	return NULL;
+}
+
+static struct iommu_group *get_pci_device_group(struct pci_dev *pdev)
+{
+	struct pci_controller *pci_ctl;
+	bool pci_endpt_partioning;
+	struct iommu_group *group = NULL;
+
+	pci_ctl = pci_bus_to_host(pdev->bus);
+	pci_endpt_partioning = check_pci_ctl_endpt_part(pci_ctl);
+	/* We can partition PCIe devices so assign device group to the device */
+	if (pci_endpt_partioning) {
+		group = pci_device_group(&pdev->dev);
+
+		/*
+		 * PCIe controller is not a paritionable entity
+		 * free the controller device iommu_group.
+		 */
+		if (pci_ctl->parent->iommu_group)
+			iommu_group_remove_device(pci_ctl->parent);
+	} else {
+		/*
+		 * All devices connected to the controller will share the
+		 * PCI controllers device group. If this is the first
+		 * device to be probed for the pci controller, copy the
+		 * device group information from the PCI controller device
+		 * node and remove the PCI controller iommu group.
+		 * For subsequent devices, the iommu group information can
+		 * be obtained from sibling devices (i.e. from the bus_devices
+		 * link list).
+		 */
+		if (pci_ctl->parent->iommu_group) {
+			group = get_device_iommu_group(pci_ctl->parent);
+			iommu_group_remove_device(pci_ctl->parent);
+		} else {
+			group = get_shared_pci_device_group(pdev);
+		}
+	}
+
+	if (!group)
+		group = ERR_PTR(-ENODEV);
+
+	return group;
+}
+
+static struct iommu_group *fsl_pamu_device_group(struct device *dev)
+{
+	struct iommu_group *group = ERR_PTR(-ENODEV);
+	int len;
+
+	/*
+	 * For platform devices we allocate a separate group for
+	 * each of the devices.
+	 */
+	if (dev_is_pci(dev))
+		group = get_pci_device_group(to_pci_dev(dev));
+	else if (of_get_property(dev->of_node, "fsl,liodn", &len))
+		group = get_device_iommu_group(dev);
+
+	return group;
+}
+
+static int fsl_pamu_add_device(struct device *dev)
+{
+	struct iommu_group *group;
+
+	group = iommu_group_get_for_dev(dev);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	iommu_group_put(group);
+
+	iommu_device_link(&pamu_iommu, dev);
+
+	return 0;
+}
+
+static void fsl_pamu_remove_device(struct device *dev)
+{
+	iommu_device_unlink(&pamu_iommu, dev);
+	iommu_group_remove_device(dev);
+}
+
+static int fsl_pamu_set_windows(struct iommu_domain *domain, u32 w_count)
+{
+	struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&dma_domain->domain_lock, flags);
+	/* Ensure domain is inactive i.e. DMA should be disabled for the domain */
+	if (dma_domain->enabled) {
+		pr_debug("Can't set geometry attributes as domain is active\n");
+		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+		return  -EBUSY;
+	}
+
+	/* Ensure that the geometry has been set for the domain */
+	if (!dma_domain->geom_size) {
+		pr_debug("Please configure geometry before setting the number of windows\n");
+		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+		return -EINVAL;
+	}
+
+	/*
+	 * Ensure we have valid window count i.e. it should be less than
+	 * maximum permissible limit and should be a power of two.
+	 */
+	if (w_count > pamu_get_max_subwin_cnt() || !is_power_of_2(w_count)) {
+		pr_debug("Invalid window count\n");
+		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+		return -EINVAL;
+	}
+
+	ret = pamu_set_domain_geometry(dma_domain, &domain->geometry,
+				       w_count > 1 ? w_count : 0);
+	if (!ret) {
+		kfree(dma_domain->win_arr);
+		dma_domain->win_arr = kcalloc(w_count,
+					      sizeof(*dma_domain->win_arr),
+					      GFP_ATOMIC);
+		if (!dma_domain->win_arr) {
+			spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+			return -ENOMEM;
+		}
+		dma_domain->win_cnt = w_count;
+	}
+	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+
+	return ret;
+}
+
+static u32 fsl_pamu_get_windows(struct iommu_domain *domain)
+{
+	struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+
+	return dma_domain->win_cnt;
+}
+
+static const struct iommu_ops fsl_pamu_ops = {
+	.capable	= fsl_pamu_capable,
+	.domain_alloc	= fsl_pamu_domain_alloc,
+	.domain_free    = fsl_pamu_domain_free,
+	.attach_dev	= fsl_pamu_attach_device,
+	.detach_dev	= fsl_pamu_detach_device,
+	.domain_window_enable = fsl_pamu_window_enable,
+	.domain_window_disable = fsl_pamu_window_disable,
+	.domain_get_windows = fsl_pamu_get_windows,
+	.domain_set_windows = fsl_pamu_set_windows,
+	.iova_to_phys	= fsl_pamu_iova_to_phys,
+	.domain_set_attr = fsl_pamu_set_domain_attr,
+	.domain_get_attr = fsl_pamu_get_domain_attr,
+	.add_device	= fsl_pamu_add_device,
+	.remove_device	= fsl_pamu_remove_device,
+	.device_group   = fsl_pamu_device_group,
+};
+
+int __init pamu_domain_init(void)
+{
+	int ret = 0;
+
+	ret = iommu_init_mempool();
+	if (ret)
+		return ret;
+
+	ret = iommu_device_sysfs_add(&pamu_iommu, NULL, NULL, "iommu0");
+	if (ret)
+		return ret;
+
+	iommu_device_set_ops(&pamu_iommu, &fsl_pamu_ops);
+
+	ret = iommu_device_register(&pamu_iommu);
+	if (ret) {
+		iommu_device_sysfs_remove(&pamu_iommu);
+		pr_err("Can't register iommu device\n");
+		return ret;
+	}
+
+	bus_set_iommu(&platform_bus_type, &fsl_pamu_ops);
+	bus_set_iommu(&pci_bus_type, &fsl_pamu_ops);
+
+	return ret;
+}
diff --git a/drivers/iommu/fsl_pamu_domain.h b/drivers/iommu/fsl_pamu_domain.h
new file mode 100644
index 000000000..f2b0f741d
--- /dev/null
+++ b/drivers/iommu/fsl_pamu_domain.h
@@ -0,0 +1,85 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2013 Freescale Semiconductor, Inc.
+ *
+ */
+
+#ifndef __FSL_PAMU_DOMAIN_H
+#define __FSL_PAMU_DOMAIN_H
+
+#include "fsl_pamu.h"
+
+struct dma_window {
+	phys_addr_t paddr;
+	u64 size;
+	int valid;
+	int prot;
+};
+
+struct fsl_dma_domain {
+	/*
+	 * Indicates the geometry size for the domain.
+	 * This would be set when the geometry is
+	 * configured for the domain.
+	 */
+	dma_addr_t			geom_size;
+	/*
+	 * Number of windows assocaited with this domain.
+	 * During domain initialization, it is set to the
+	 * the maximum number of subwindows allowed for a LIODN.
+	 * Minimum value for this is 1 indicating a single PAMU
+	 * window, without any sub windows. Value can be set/
+	 * queried by set_attr/get_attr API for DOMAIN_ATTR_WINDOWS.
+	 * Value can only be set once the geometry has been configured.
+	 */
+	u32				win_cnt;
+	/*
+	 * win_arr contains information of the configured
+	 * windows for a domain. This is allocated only
+	 * when the number of windows for the domain are
+	 * set.
+	 */
+	struct dma_window		*win_arr;
+	/* list of devices associated with the domain */
+	struct list_head		devices;
+	/* dma_domain states:
+	 * mapped - A particular mapping has been created
+	 * within the configured geometry.
+	 * enabled - DMA has been enabled for the given
+	 * domain. This translates to setting of the
+	 * valid bit for the primary PAACE in the PAMU
+	 * PAACT table. Domain geometry should be set and
+	 * it must have a valid mapping before DMA can be
+	 * enabled for it.
+	 *
+	 */
+	int				mapped;
+	int				enabled;
+	/* stash_id obtained from the stash attribute details */
+	u32				stash_id;
+	struct pamu_stash_attribute	dma_stash;
+	u32				snoop_id;
+	struct iommu_domain		iommu_domain;
+	spinlock_t			domain_lock;
+};
+
+/* domain-device relationship */
+struct device_domain_info {
+	struct list_head link;	/* link to domain siblings */
+	struct device *dev;
+	u32 liodn;
+	struct fsl_dma_domain *domain; /* pointer to domain */
+};
+#endif  /* __FSL_PAMU_DOMAIN_H */
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
new file mode 100644
index 000000000..dcb865d19
--- /dev/null
+++ b/drivers/iommu/intel-iommu.c
@@ -0,0 +1,5511 @@
+/*
+ * Copyright © 2006-2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>,
+ *          Ashok Raj <ashok.raj@intel.com>,
+ *          Shaohua Li <shaohua.li@intel.com>,
+ *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
+ *          Fenghua Yu <fenghua.yu@intel.com>
+ *          Joerg Roedel <jroedel@suse.de>
+ */
+
+#define pr_fmt(fmt)     "DMAR: " fmt
+
+#include <linux/init.h>
+#include <linux/bitmap.h>
+#include <linux/debugfs.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/dma-mapping.h>
+#include <linux/mempool.h>
+#include <linux/memory.h>
+#include <linux/cpu.h>
+#include <linux/timer.h>
+#include <linux/io.h>
+#include <linux/iova.h>
+#include <linux/iommu.h>
+#include <linux/intel-iommu.h>
+#include <linux/syscore_ops.h>
+#include <linux/tboot.h>
+#include <linux/dmi.h>
+#include <linux/pci-ats.h>
+#include <linux/memblock.h>
+#include <linux/dma-contiguous.h>
+#include <linux/dma-direct.h>
+#include <linux/crash_dump.h>
+#include <asm/irq_remapping.h>
+#include <asm/cacheflush.h>
+#include <asm/iommu.h>
+
+#include "irq_remapping.h"
+#include "intel-pasid.h"
+
+#define ROOT_SIZE		VTD_PAGE_SIZE
+#define CONTEXT_SIZE		VTD_PAGE_SIZE
+
+#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
+#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
+#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
+#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
+
+#define IOAPIC_RANGE_START	(0xfee00000)
+#define IOAPIC_RANGE_END	(0xfeefffff)
+#define IOVA_START_ADDR		(0x1000)
+
+#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
+
+#define MAX_AGAW_WIDTH 64
+#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
+
+#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
+#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
+
+/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
+   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
+#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
+				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
+#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
+
+/* IO virtual address start page frame number */
+#define IOVA_START_PFN		(1)
+
+#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
+
+/* page table handling */
+#define LEVEL_STRIDE		(9)
+#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
+
+/*
+ * This bitmap is used to advertise the page sizes our hardware support
+ * to the IOMMU core, which will then use this information to split
+ * physically contiguous memory regions it is mapping into page sizes
+ * that we support.
+ *
+ * Traditionally the IOMMU core just handed us the mappings directly,
+ * after making sure the size is an order of a 4KiB page and that the
+ * mapping has natural alignment.
+ *
+ * To retain this behavior, we currently advertise that we support
+ * all page sizes that are an order of 4KiB.
+ *
+ * If at some point we'd like to utilize the IOMMU core's new behavior,
+ * we could change this to advertise the real page sizes we support.
+ */
+#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
+
+static inline int agaw_to_level(int agaw)
+{
+	return agaw + 2;
+}
+
+static inline int agaw_to_width(int agaw)
+{
+	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
+}
+
+static inline int width_to_agaw(int width)
+{
+	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
+}
+
+static inline unsigned int level_to_offset_bits(int level)
+{
+	return (level - 1) * LEVEL_STRIDE;
+}
+
+static inline int pfn_level_offset(unsigned long pfn, int level)
+{
+	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
+}
+
+static inline unsigned long level_mask(int level)
+{
+	return -1UL << level_to_offset_bits(level);
+}
+
+static inline unsigned long level_size(int level)
+{
+	return 1UL << level_to_offset_bits(level);
+}
+
+static inline unsigned long align_to_level(unsigned long pfn, int level)
+{
+	return (pfn + level_size(level) - 1) & level_mask(level);
+}
+
+static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
+{
+	return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
+}
+
+/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
+   are never going to work. */
+static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
+{
+	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
+}
+
+static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
+{
+	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
+}
+static inline unsigned long page_to_dma_pfn(struct page *pg)
+{
+	return mm_to_dma_pfn(page_to_pfn(pg));
+}
+static inline unsigned long virt_to_dma_pfn(void *p)
+{
+	return page_to_dma_pfn(virt_to_page(p));
+}
+
+/* global iommu list, set NULL for ignored DMAR units */
+static struct intel_iommu **g_iommus;
+
+static void __init check_tylersburg_isoch(void);
+static int rwbf_quirk;
+
+/*
+ * set to 1 to panic kernel if can't successfully enable VT-d
+ * (used when kernel is launched w/ TXT)
+ */
+static int force_on = 0;
+int intel_iommu_tboot_noforce;
+
+/*
+ * 0: Present
+ * 1-11: Reserved
+ * 12-63: Context Ptr (12 - (haw-1))
+ * 64-127: Reserved
+ */
+struct root_entry {
+	u64	lo;
+	u64	hi;
+};
+#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
+
+/*
+ * Take a root_entry and return the Lower Context Table Pointer (LCTP)
+ * if marked present.
+ */
+static phys_addr_t root_entry_lctp(struct root_entry *re)
+{
+	if (!(re->lo & 1))
+		return 0;
+
+	return re->lo & VTD_PAGE_MASK;
+}
+
+/*
+ * Take a root_entry and return the Upper Context Table Pointer (UCTP)
+ * if marked present.
+ */
+static phys_addr_t root_entry_uctp(struct root_entry *re)
+{
+	if (!(re->hi & 1))
+		return 0;
+
+	return re->hi & VTD_PAGE_MASK;
+}
+/*
+ * low 64 bits:
+ * 0: present
+ * 1: fault processing disable
+ * 2-3: translation type
+ * 12-63: address space root
+ * high 64 bits:
+ * 0-2: address width
+ * 3-6: aval
+ * 8-23: domain id
+ */
+struct context_entry {
+	u64 lo;
+	u64 hi;
+};
+
+static inline void context_clear_pasid_enable(struct context_entry *context)
+{
+	context->lo &= ~(1ULL << 11);
+}
+
+static inline bool context_pasid_enabled(struct context_entry *context)
+{
+	return !!(context->lo & (1ULL << 11));
+}
+
+static inline void context_set_copied(struct context_entry *context)
+{
+	context->hi |= (1ull << 3);
+}
+
+static inline bool context_copied(struct context_entry *context)
+{
+	return !!(context->hi & (1ULL << 3));
+}
+
+static inline bool __context_present(struct context_entry *context)
+{
+	return (context->lo & 1);
+}
+
+static inline bool context_present(struct context_entry *context)
+{
+	return context_pasid_enabled(context) ?
+	     __context_present(context) :
+	     __context_present(context) && !context_copied(context);
+}
+
+static inline void context_set_present(struct context_entry *context)
+{
+	context->lo |= 1;
+}
+
+static inline void context_set_fault_enable(struct context_entry *context)
+{
+	context->lo &= (((u64)-1) << 2) | 1;
+}
+
+static inline void context_set_translation_type(struct context_entry *context,
+						unsigned long value)
+{
+	context->lo &= (((u64)-1) << 4) | 3;
+	context->lo |= (value & 3) << 2;
+}
+
+static inline void context_set_address_root(struct context_entry *context,
+					    unsigned long value)
+{
+	context->lo &= ~VTD_PAGE_MASK;
+	context->lo |= value & VTD_PAGE_MASK;
+}
+
+static inline void context_set_address_width(struct context_entry *context,
+					     unsigned long value)
+{
+	context->hi |= value & 7;
+}
+
+static inline void context_set_domain_id(struct context_entry *context,
+					 unsigned long value)
+{
+	context->hi |= (value & ((1 << 16) - 1)) << 8;
+}
+
+static inline int context_domain_id(struct context_entry *c)
+{
+	return((c->hi >> 8) & 0xffff);
+}
+
+static inline void context_clear_entry(struct context_entry *context)
+{
+	context->lo = 0;
+	context->hi = 0;
+}
+
+/*
+ * 0: readable
+ * 1: writable
+ * 2-6: reserved
+ * 7: super page
+ * 8-10: available
+ * 11: snoop behavior
+ * 12-63: Host physcial address
+ */
+struct dma_pte {
+	u64 val;
+};
+
+static inline void dma_clear_pte(struct dma_pte *pte)
+{
+	pte->val = 0;
+}
+
+static inline u64 dma_pte_addr(struct dma_pte *pte)
+{
+#ifdef CONFIG_64BIT
+	return pte->val & VTD_PAGE_MASK;
+#else
+	/* Must have a full atomic 64-bit read */
+	return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
+#endif
+}
+
+static inline bool dma_pte_present(struct dma_pte *pte)
+{
+	return (pte->val & 3) != 0;
+}
+
+static inline bool dma_pte_superpage(struct dma_pte *pte)
+{
+	return (pte->val & DMA_PTE_LARGE_PAGE);
+}
+
+static inline int first_pte_in_page(struct dma_pte *pte)
+{
+	return !((unsigned long)pte & ~VTD_PAGE_MASK);
+}
+
+/*
+ * This domain is a statically identity mapping domain.
+ *	1. This domain creats a static 1:1 mapping to all usable memory.
+ * 	2. It maps to each iommu if successful.
+ *	3. Each iommu mapps to this domain if successful.
+ */
+static struct dmar_domain *si_domain;
+static int hw_pass_through = 1;
+
+/*
+ * Domain represents a virtual machine, more than one devices
+ * across iommus may be owned in one domain, e.g. kvm guest.
+ */
+#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 0)
+
+/* si_domain contains mulitple devices */
+#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 1)
+
+#define for_each_domain_iommu(idx, domain)			\
+	for (idx = 0; idx < g_num_of_iommus; idx++)		\
+		if (domain->iommu_refcnt[idx])
+
+struct dmar_rmrr_unit {
+	struct list_head list;		/* list of rmrr units	*/
+	struct acpi_dmar_header *hdr;	/* ACPI header		*/
+	u64	base_address;		/* reserved base address*/
+	u64	end_address;		/* reserved end address */
+	struct dmar_dev_scope *devices;	/* target devices */
+	int	devices_cnt;		/* target device count */
+};
+
+struct dmar_atsr_unit {
+	struct list_head list;		/* list of ATSR units */
+	struct acpi_dmar_header *hdr;	/* ACPI header */
+	struct dmar_dev_scope *devices;	/* target devices */
+	int devices_cnt;		/* target device count */
+	u8 include_all:1;		/* include all ports */
+};
+
+static LIST_HEAD(dmar_atsr_units);
+static LIST_HEAD(dmar_rmrr_units);
+
+#define for_each_rmrr_units(rmrr) \
+	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
+
+/* bitmap for indexing intel_iommus */
+static int g_num_of_iommus;
+
+static void domain_exit(struct dmar_domain *domain);
+static void domain_remove_dev_info(struct dmar_domain *domain);
+static void dmar_remove_one_dev_info(struct dmar_domain *domain,
+				     struct device *dev);
+static void __dmar_remove_one_dev_info(struct device_domain_info *info);
+static void domain_context_clear(struct intel_iommu *iommu,
+				 struct device *dev);
+static int domain_detach_iommu(struct dmar_domain *domain,
+			       struct intel_iommu *iommu);
+
+#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
+int dmar_disabled = 0;
+#else
+int dmar_disabled = 1;
+#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
+
+int intel_iommu_enabled = 0;
+EXPORT_SYMBOL_GPL(intel_iommu_enabled);
+
+static int dmar_map_gfx = 1;
+static int dmar_forcedac;
+static int intel_iommu_strict;
+static int intel_iommu_superpage = 1;
+static int intel_iommu_ecs = 1;
+static int intel_iommu_pasid28;
+static int iommu_identity_mapping;
+
+#define IDENTMAP_ALL		1
+#define IDENTMAP_GFX		2
+#define IDENTMAP_AZALIA		4
+
+/* Broadwell and Skylake have broken ECS support — normal so-called "second
+ * level" translation of DMA requests-without-PASID doesn't actually happen
+ * unless you also set the NESTE bit in an extended context-entry. Which of
+ * course means that SVM doesn't work because it's trying to do nested
+ * translation of the physical addresses it finds in the process page tables,
+ * through the IOVA->phys mapping found in the "second level" page tables.
+ *
+ * The VT-d specification was retroactively changed to change the definition
+ * of the capability bits and pretend that Broadwell/Skylake never happened...
+ * but unfortunately the wrong bit was changed. It's ECS which is broken, but
+ * for some reason it was the PASID capability bit which was redefined (from
+ * bit 28 on BDW/SKL to bit 40 in future).
+ *
+ * So our test for ECS needs to eschew those implementations which set the old
+ * PASID capabiity bit 28, since those are the ones on which ECS is broken.
+ * Unless we are working around the 'pasid28' limitations, that is, by putting
+ * the device into passthrough mode for normal DMA and thus masking the bug.
+ */
+#define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
+			    (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
+/* PASID support is thus enabled if ECS is enabled and *either* of the old
+ * or new capability bits are set. */
+#define pasid_enabled(iommu) (ecs_enabled(iommu) &&			\
+			      (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
+
+int intel_iommu_gfx_mapped;
+EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
+
+#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
+static DEFINE_SPINLOCK(device_domain_lock);
+static LIST_HEAD(device_domain_list);
+
+/*
+ * Iterate over elements in device_domain_list and call the specified
+ * callback @fn against each element. This helper should only be used
+ * in the context where the device_domain_lock has already been holden.
+ */
+int for_each_device_domain(int (*fn)(struct device_domain_info *info,
+				     void *data), void *data)
+{
+	int ret = 0;
+	struct device_domain_info *info;
+
+	assert_spin_locked(&device_domain_lock);
+	list_for_each_entry(info, &device_domain_list, global) {
+		ret = fn(info, data);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+const struct iommu_ops intel_iommu_ops;
+
+static bool translation_pre_enabled(struct intel_iommu *iommu)
+{
+	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
+}
+
+static void clear_translation_pre_enabled(struct intel_iommu *iommu)
+{
+	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
+}
+
+static void init_translation_status(struct intel_iommu *iommu)
+{
+	u32 gsts;
+
+	gsts = readl(iommu->reg + DMAR_GSTS_REG);
+	if (gsts & DMA_GSTS_TES)
+		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
+}
+
+/* Convert generic 'struct iommu_domain to private struct dmar_domain */
+static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct dmar_domain, domain);
+}
+
+static int __init intel_iommu_setup(char *str)
+{
+	if (!str)
+		return -EINVAL;
+	while (*str) {
+		if (!strncmp(str, "on", 2)) {
+			dmar_disabled = 0;
+			pr_info("IOMMU enabled\n");
+		} else if (!strncmp(str, "off", 3)) {
+			dmar_disabled = 1;
+			pr_info("IOMMU disabled\n");
+		} else if (!strncmp(str, "igfx_off", 8)) {
+			dmar_map_gfx = 0;
+			pr_info("Disable GFX device mapping\n");
+		} else if (!strncmp(str, "forcedac", 8)) {
+			pr_info("Forcing DAC for PCI devices\n");
+			dmar_forcedac = 1;
+		} else if (!strncmp(str, "strict", 6)) {
+			pr_info("Disable batched IOTLB flush\n");
+			intel_iommu_strict = 1;
+		} else if (!strncmp(str, "sp_off", 6)) {
+			pr_info("Disable supported super page\n");
+			intel_iommu_superpage = 0;
+		} else if (!strncmp(str, "ecs_off", 7)) {
+			printk(KERN_INFO
+				"Intel-IOMMU: disable extended context table support\n");
+			intel_iommu_ecs = 0;
+		} else if (!strncmp(str, "pasid28", 7)) {
+			printk(KERN_INFO
+				"Intel-IOMMU: enable pre-production PASID support\n");
+			intel_iommu_pasid28 = 1;
+			iommu_identity_mapping |= IDENTMAP_GFX;
+		} else if (!strncmp(str, "tboot_noforce", 13)) {
+			printk(KERN_INFO
+				"Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
+			intel_iommu_tboot_noforce = 1;
+		}
+
+		str += strcspn(str, ",");
+		while (*str == ',')
+			str++;
+	}
+	return 0;
+}
+__setup("intel_iommu=", intel_iommu_setup);
+
+static struct kmem_cache *iommu_domain_cache;
+static struct kmem_cache *iommu_devinfo_cache;
+
+static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
+{
+	struct dmar_domain **domains;
+	int idx = did >> 8;
+
+	domains = iommu->domains[idx];
+	if (!domains)
+		return NULL;
+
+	return domains[did & 0xff];
+}
+
+static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
+			     struct dmar_domain *domain)
+{
+	struct dmar_domain **domains;
+	int idx = did >> 8;
+
+	if (!iommu->domains[idx]) {
+		size_t size = 256 * sizeof(struct dmar_domain *);
+		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
+	}
+
+	domains = iommu->domains[idx];
+	if (WARN_ON(!domains))
+		return;
+	else
+		domains[did & 0xff] = domain;
+}
+
+void *alloc_pgtable_page(int node)
+{
+	struct page *page;
+	void *vaddr = NULL;
+
+	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
+	if (page)
+		vaddr = page_address(page);
+	return vaddr;
+}
+
+void free_pgtable_page(void *vaddr)
+{
+	free_page((unsigned long)vaddr);
+}
+
+static inline void *alloc_domain_mem(void)
+{
+	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
+}
+
+static void free_domain_mem(void *vaddr)
+{
+	kmem_cache_free(iommu_domain_cache, vaddr);
+}
+
+static inline void * alloc_devinfo_mem(void)
+{
+	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
+}
+
+static inline void free_devinfo_mem(void *vaddr)
+{
+	kmem_cache_free(iommu_devinfo_cache, vaddr);
+}
+
+static inline int domain_type_is_vm(struct dmar_domain *domain)
+{
+	return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
+}
+
+static inline int domain_type_is_si(struct dmar_domain *domain)
+{
+	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
+}
+
+static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
+{
+	return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
+				DOMAIN_FLAG_STATIC_IDENTITY);
+}
+
+static inline int domain_pfn_supported(struct dmar_domain *domain,
+				       unsigned long pfn)
+{
+	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+
+	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
+}
+
+static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
+{
+	unsigned long sagaw;
+	int agaw = -1;
+
+	sagaw = cap_sagaw(iommu->cap);
+	for (agaw = width_to_agaw(max_gaw);
+	     agaw >= 0; agaw--) {
+		if (test_bit(agaw, &sagaw))
+			break;
+	}
+
+	return agaw;
+}
+
+/*
+ * Calculate max SAGAW for each iommu.
+ */
+int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
+{
+	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
+}
+
+/*
+ * calculate agaw for each iommu.
+ * "SAGAW" may be different across iommus, use a default agaw, and
+ * get a supported less agaw for iommus that don't support the default agaw.
+ */
+int iommu_calculate_agaw(struct intel_iommu *iommu)
+{
+	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
+}
+
+/* This functionin only returns single iommu in a domain */
+struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
+{
+	int iommu_id;
+
+	/* si_domain and vm domain should not get here. */
+	BUG_ON(domain_type_is_vm_or_si(domain));
+	for_each_domain_iommu(iommu_id, domain)
+		break;
+
+	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
+		return NULL;
+
+	return g_iommus[iommu_id];
+}
+
+static void domain_update_iommu_coherency(struct dmar_domain *domain)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu;
+	bool found = false;
+	int i;
+
+	domain->iommu_coherency = 1;
+
+	for_each_domain_iommu(i, domain) {
+		found = true;
+		if (!ecap_coherent(g_iommus[i]->ecap)) {
+			domain->iommu_coherency = 0;
+			break;
+		}
+	}
+	if (found)
+		return;
+
+	/* No hardware attached; use lowest common denominator */
+	rcu_read_lock();
+	for_each_active_iommu(iommu, drhd) {
+		if (!ecap_coherent(iommu->ecap)) {
+			domain->iommu_coherency = 0;
+			break;
+		}
+	}
+	rcu_read_unlock();
+}
+
+static int domain_update_iommu_snooping(struct intel_iommu *skip)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu;
+	int ret = 1;
+
+	rcu_read_lock();
+	for_each_active_iommu(iommu, drhd) {
+		if (iommu != skip) {
+			if (!ecap_sc_support(iommu->ecap)) {
+				ret = 0;
+				break;
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int domain_update_iommu_superpage(struct intel_iommu *skip)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu;
+	int mask = 0xf;
+
+	if (!intel_iommu_superpage) {
+		return 0;
+	}
+
+	/* set iommu_superpage to the smallest common denominator */
+	rcu_read_lock();
+	for_each_active_iommu(iommu, drhd) {
+		if (iommu != skip) {
+			mask &= cap_super_page_val(iommu->cap);
+			if (!mask)
+				break;
+		}
+	}
+	rcu_read_unlock();
+
+	return fls(mask);
+}
+
+/* Some capabilities may be different across iommus */
+static void domain_update_iommu_cap(struct dmar_domain *domain)
+{
+	domain_update_iommu_coherency(domain);
+	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
+	domain->iommu_superpage = domain_update_iommu_superpage(NULL);
+}
+
+static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
+						       u8 bus, u8 devfn, int alloc)
+{
+	struct root_entry *root = &iommu->root_entry[bus];
+	struct context_entry *context;
+	u64 *entry;
+
+	entry = &root->lo;
+	if (ecs_enabled(iommu)) {
+		if (devfn >= 0x80) {
+			devfn -= 0x80;
+			entry = &root->hi;
+		}
+		devfn *= 2;
+	}
+	if (*entry & 1)
+		context = phys_to_virt(*entry & VTD_PAGE_MASK);
+	else {
+		unsigned long phy_addr;
+		if (!alloc)
+			return NULL;
+
+		context = alloc_pgtable_page(iommu->node);
+		if (!context)
+			return NULL;
+
+		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
+		phy_addr = virt_to_phys((void *)context);
+		*entry = phy_addr | 1;
+		__iommu_flush_cache(iommu, entry, sizeof(*entry));
+	}
+	return &context[devfn];
+}
+
+static int iommu_dummy(struct device *dev)
+{
+	return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
+}
+
+static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
+{
+	struct dmar_drhd_unit *drhd = NULL;
+	struct intel_iommu *iommu;
+	struct device *tmp;
+	struct pci_dev *ptmp, *pdev = NULL;
+	u16 segment = 0;
+	int i;
+
+	if (iommu_dummy(dev))
+		return NULL;
+
+	if (dev_is_pci(dev)) {
+		struct pci_dev *pf_pdev;
+
+		pdev = to_pci_dev(dev);
+
+#ifdef CONFIG_X86
+		/* VMD child devices currently cannot be handled individually */
+		if (is_vmd(pdev->bus))
+			return NULL;
+#endif
+
+		/* VFs aren't listed in scope tables; we need to look up
+		 * the PF instead to find the IOMMU. */
+		pf_pdev = pci_physfn(pdev);
+		dev = &pf_pdev->dev;
+		segment = pci_domain_nr(pdev->bus);
+	} else if (has_acpi_companion(dev))
+		dev = &ACPI_COMPANION(dev)->dev;
+
+	rcu_read_lock();
+	for_each_active_iommu(iommu, drhd) {
+		if (pdev && segment != drhd->segment)
+			continue;
+
+		for_each_active_dev_scope(drhd->devices,
+					  drhd->devices_cnt, i, tmp) {
+			if (tmp == dev) {
+				/* For a VF use its original BDF# not that of the PF
+				 * which we used for the IOMMU lookup. Strictly speaking
+				 * we could do this for all PCI devices; we only need to
+				 * get the BDF# from the scope table for ACPI matches. */
+				if (pdev && pdev->is_virtfn)
+					goto got_pdev;
+
+				*bus = drhd->devices[i].bus;
+				*devfn = drhd->devices[i].devfn;
+				goto out;
+			}
+
+			if (!pdev || !dev_is_pci(tmp))
+				continue;
+
+			ptmp = to_pci_dev(tmp);
+			if (ptmp->subordinate &&
+			    ptmp->subordinate->number <= pdev->bus->number &&
+			    ptmp->subordinate->busn_res.end >= pdev->bus->number)
+				goto got_pdev;
+		}
+
+		if (pdev && drhd->include_all) {
+		got_pdev:
+			*bus = pdev->bus->number;
+			*devfn = pdev->devfn;
+			goto out;
+		}
+	}
+	iommu = NULL;
+ out:
+	rcu_read_unlock();
+
+	return iommu;
+}
+
+static void domain_flush_cache(struct dmar_domain *domain,
+			       void *addr, int size)
+{
+	if (!domain->iommu_coherency)
+		clflush_cache_range(addr, size);
+}
+
+static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+	struct context_entry *context;
+	int ret = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iommu->lock, flags);
+	context = iommu_context_addr(iommu, bus, devfn, 0);
+	if (context)
+		ret = context_present(context);
+	spin_unlock_irqrestore(&iommu->lock, flags);
+	return ret;
+}
+
+static void free_context_table(struct intel_iommu *iommu)
+{
+	int i;
+	unsigned long flags;
+	struct context_entry *context;
+
+	spin_lock_irqsave(&iommu->lock, flags);
+	if (!iommu->root_entry) {
+		goto out;
+	}
+	for (i = 0; i < ROOT_ENTRY_NR; i++) {
+		context = iommu_context_addr(iommu, i, 0, 0);
+		if (context)
+			free_pgtable_page(context);
+
+		if (!ecs_enabled(iommu))
+			continue;
+
+		context = iommu_context_addr(iommu, i, 0x80, 0);
+		if (context)
+			free_pgtable_page(context);
+
+	}
+	free_pgtable_page(iommu->root_entry);
+	iommu->root_entry = NULL;
+out:
+	spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
+				      unsigned long pfn, int *target_level)
+{
+	struct dma_pte *parent, *pte = NULL;
+	int level = agaw_to_level(domain->agaw);
+	int offset;
+
+	BUG_ON(!domain->pgd);
+
+	if (!domain_pfn_supported(domain, pfn))
+		/* Address beyond IOMMU's addressing capabilities. */
+		return NULL;
+
+	parent = domain->pgd;
+
+	while (1) {
+		void *tmp_page;
+
+		offset = pfn_level_offset(pfn, level);
+		pte = &parent[offset];
+		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
+			break;
+		if (level == *target_level)
+			break;
+
+		if (!dma_pte_present(pte)) {
+			uint64_t pteval;
+
+			tmp_page = alloc_pgtable_page(domain->nid);
+
+			if (!tmp_page)
+				return NULL;
+
+			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
+			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
+			if (cmpxchg64(&pte->val, 0ULL, pteval))
+				/* Someone else set it while we were thinking; use theirs. */
+				free_pgtable_page(tmp_page);
+			else
+				domain_flush_cache(domain, pte, sizeof(*pte));
+		}
+		if (level == 1)
+			break;
+
+		parent = phys_to_virt(dma_pte_addr(pte));
+		level--;
+	}
+
+	if (!*target_level)
+		*target_level = level;
+
+	return pte;
+}
+
+
+/* return address's pte at specific level */
+static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
+					 unsigned long pfn,
+					 int level, int *large_page)
+{
+	struct dma_pte *parent, *pte = NULL;
+	int total = agaw_to_level(domain->agaw);
+	int offset;
+
+	parent = domain->pgd;
+	while (level <= total) {
+		offset = pfn_level_offset(pfn, total);
+		pte = &parent[offset];
+		if (level == total)
+			return pte;
+
+		if (!dma_pte_present(pte)) {
+			*large_page = total;
+			break;
+		}
+
+		if (dma_pte_superpage(pte)) {
+			*large_page = total;
+			return pte;
+		}
+
+		parent = phys_to_virt(dma_pte_addr(pte));
+		total--;
+	}
+	return NULL;
+}
+
+/* clear last level pte, a tlb flush should be followed */
+static void dma_pte_clear_range(struct dmar_domain *domain,
+				unsigned long start_pfn,
+				unsigned long last_pfn)
+{
+	unsigned int large_page = 1;
+	struct dma_pte *first_pte, *pte;
+
+	BUG_ON(!domain_pfn_supported(domain, start_pfn));
+	BUG_ON(!domain_pfn_supported(domain, last_pfn));
+	BUG_ON(start_pfn > last_pfn);
+
+	/* we don't need lock here; nobody else touches the iova range */
+	do {
+		large_page = 1;
+		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
+		if (!pte) {
+			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
+			continue;
+		}
+		do {
+			dma_clear_pte(pte);
+			start_pfn += lvl_to_nr_pages(large_page);
+			pte++;
+		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
+
+		domain_flush_cache(domain, first_pte,
+				   (void *)pte - (void *)first_pte);
+
+	} while (start_pfn && start_pfn <= last_pfn);
+}
+
+static void dma_pte_free_level(struct dmar_domain *domain, int level,
+			       int retain_level, struct dma_pte *pte,
+			       unsigned long pfn, unsigned long start_pfn,
+			       unsigned long last_pfn)
+{
+	pfn = max(start_pfn, pfn);
+	pte = &pte[pfn_level_offset(pfn, level)];
+
+	do {
+		unsigned long level_pfn;
+		struct dma_pte *level_pte;
+
+		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
+			goto next;
+
+		level_pfn = pfn & level_mask(level);
+		level_pte = phys_to_virt(dma_pte_addr(pte));
+
+		if (level > 2) {
+			dma_pte_free_level(domain, level - 1, retain_level,
+					   level_pte, level_pfn, start_pfn,
+					   last_pfn);
+		}
+
+		/*
+		 * Free the page table if we're below the level we want to
+		 * retain and the range covers the entire table.
+		 */
+		if (level < retain_level && !(start_pfn > level_pfn ||
+		      last_pfn < level_pfn + level_size(level) - 1)) {
+			dma_clear_pte(pte);
+			domain_flush_cache(domain, pte, sizeof(*pte));
+			free_pgtable_page(level_pte);
+		}
+next:
+		pfn += level_size(level);
+	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
+}
+
+/*
+ * clear last level (leaf) ptes and free page table pages below the
+ * level we wish to keep intact.
+ */
+static void dma_pte_free_pagetable(struct dmar_domain *domain,
+				   unsigned long start_pfn,
+				   unsigned long last_pfn,
+				   int retain_level)
+{
+	BUG_ON(!domain_pfn_supported(domain, start_pfn));
+	BUG_ON(!domain_pfn_supported(domain, last_pfn));
+	BUG_ON(start_pfn > last_pfn);
+
+	dma_pte_clear_range(domain, start_pfn, last_pfn);
+
+	/* We don't need lock here; nobody else touches the iova range */
+	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
+			   domain->pgd, 0, start_pfn, last_pfn);
+
+	/* free pgd */
+	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
+		free_pgtable_page(domain->pgd);
+		domain->pgd = NULL;
+	}
+}
+
+/* When a page at a given level is being unlinked from its parent, we don't
+   need to *modify* it at all. All we need to do is make a list of all the
+   pages which can be freed just as soon as we've flushed the IOTLB and we
+   know the hardware page-walk will no longer touch them.
+   The 'pte' argument is the *parent* PTE, pointing to the page that is to
+   be freed. */
+static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
+					    int level, struct dma_pte *pte,
+					    struct page *freelist)
+{
+	struct page *pg;
+
+	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
+	pg->freelist = freelist;
+	freelist = pg;
+
+	if (level == 1)
+		return freelist;
+
+	pte = page_address(pg);
+	do {
+		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
+			freelist = dma_pte_list_pagetables(domain, level - 1,
+							   pte, freelist);
+		pte++;
+	} while (!first_pte_in_page(pte));
+
+	return freelist;
+}
+
+static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
+					struct dma_pte *pte, unsigned long pfn,
+					unsigned long start_pfn,
+					unsigned long last_pfn,
+					struct page *freelist)
+{
+	struct dma_pte *first_pte = NULL, *last_pte = NULL;
+
+	pfn = max(start_pfn, pfn);
+	pte = &pte[pfn_level_offset(pfn, level)];
+
+	do {
+		unsigned long level_pfn;
+
+		if (!dma_pte_present(pte))
+			goto next;
+
+		level_pfn = pfn & level_mask(level);
+
+		/* If range covers entire pagetable, free it */
+		if (start_pfn <= level_pfn &&
+		    last_pfn >= level_pfn + level_size(level) - 1) {
+			/* These suborbinate page tables are going away entirely. Don't
+			   bother to clear them; we're just going to *free* them. */
+			if (level > 1 && !dma_pte_superpage(pte))
+				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
+
+			dma_clear_pte(pte);
+			if (!first_pte)
+				first_pte = pte;
+			last_pte = pte;
+		} else if (level > 1) {
+			/* Recurse down into a level that isn't *entirely* obsolete */
+			freelist = dma_pte_clear_level(domain, level - 1,
+						       phys_to_virt(dma_pte_addr(pte)),
+						       level_pfn, start_pfn, last_pfn,
+						       freelist);
+		}
+next:
+		pfn += level_size(level);
+	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
+
+	if (first_pte)
+		domain_flush_cache(domain, first_pte,
+				   (void *)++last_pte - (void *)first_pte);
+
+	return freelist;
+}
+
+/* We can't just free the pages because the IOMMU may still be walking
+   the page tables, and may have cached the intermediate levels. The
+   pages can only be freed after the IOTLB flush has been done. */
+static struct page *domain_unmap(struct dmar_domain *domain,
+				 unsigned long start_pfn,
+				 unsigned long last_pfn)
+{
+	struct page *freelist = NULL;
+
+	BUG_ON(!domain_pfn_supported(domain, start_pfn));
+	BUG_ON(!domain_pfn_supported(domain, last_pfn));
+	BUG_ON(start_pfn > last_pfn);
+
+	/* we don't need lock here; nobody else touches the iova range */
+	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
+				       domain->pgd, 0, start_pfn, last_pfn, NULL);
+
+	/* free pgd */
+	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
+		struct page *pgd_page = virt_to_page(domain->pgd);
+		pgd_page->freelist = freelist;
+		freelist = pgd_page;
+
+		domain->pgd = NULL;
+	}
+
+	return freelist;
+}
+
+static void dma_free_pagelist(struct page *freelist)
+{
+	struct page *pg;
+
+	while ((pg = freelist)) {
+		freelist = pg->freelist;
+		free_pgtable_page(page_address(pg));
+	}
+}
+
+static void iova_entry_free(unsigned long data)
+{
+	struct page *freelist = (struct page *)data;
+
+	dma_free_pagelist(freelist);
+}
+
+/* iommu handling */
+static int iommu_alloc_root_entry(struct intel_iommu *iommu)
+{
+	struct root_entry *root;
+	unsigned long flags;
+
+	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
+	if (!root) {
+		pr_err("Allocating root entry for %s failed\n",
+			iommu->name);
+		return -ENOMEM;
+	}
+
+	__iommu_flush_cache(iommu, root, ROOT_SIZE);
+
+	spin_lock_irqsave(&iommu->lock, flags);
+	iommu->root_entry = root;
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	return 0;
+}
+
+static void iommu_set_root_entry(struct intel_iommu *iommu)
+{
+	u64 addr;
+	u32 sts;
+	unsigned long flag;
+
+	addr = virt_to_phys(iommu->root_entry);
+	if (ecs_enabled(iommu))
+		addr |= DMA_RTADDR_RTT;
+
+	raw_spin_lock_irqsave(&iommu->register_lock, flag);
+	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
+
+	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
+
+	/* Make sure hardware complete it */
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+		      readl, (sts & DMA_GSTS_RTPS), sts);
+
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+static void iommu_flush_write_buffer(struct intel_iommu *iommu)
+{
+	u32 val;
+	unsigned long flag;
+
+	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
+		return;
+
+	raw_spin_lock_irqsave(&iommu->register_lock, flag);
+	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
+
+	/* Make sure hardware complete it */
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+		      readl, (!(val & DMA_GSTS_WBFS)), val);
+
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+/* return value determine if we need a write buffer flush */
+static void __iommu_flush_context(struct intel_iommu *iommu,
+				  u16 did, u16 source_id, u8 function_mask,
+				  u64 type)
+{
+	u64 val = 0;
+	unsigned long flag;
+
+	switch (type) {
+	case DMA_CCMD_GLOBAL_INVL:
+		val = DMA_CCMD_GLOBAL_INVL;
+		break;
+	case DMA_CCMD_DOMAIN_INVL:
+		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
+		break;
+	case DMA_CCMD_DEVICE_INVL:
+		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
+			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
+		break;
+	default:
+		BUG();
+	}
+	val |= DMA_CCMD_ICC;
+
+	raw_spin_lock_irqsave(&iommu->register_lock, flag);
+	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
+
+	/* Make sure hardware complete it */
+	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
+		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
+
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+/* return value determine if we need a write buffer flush */
+static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
+				u64 addr, unsigned int size_order, u64 type)
+{
+	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
+	u64 val = 0, val_iva = 0;
+	unsigned long flag;
+
+	switch (type) {
+	case DMA_TLB_GLOBAL_FLUSH:
+		/* global flush doesn't need set IVA_REG */
+		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
+		break;
+	case DMA_TLB_DSI_FLUSH:
+		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+		break;
+	case DMA_TLB_PSI_FLUSH:
+		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+		/* IH bit is passed in as part of address */
+		val_iva = size_order | addr;
+		break;
+	default:
+		BUG();
+	}
+	/* Note: set drain read/write */
+#if 0
+	/*
+	 * This is probably to be super secure.. Looks like we can
+	 * ignore it without any impact.
+	 */
+	if (cap_read_drain(iommu->cap))
+		val |= DMA_TLB_READ_DRAIN;
+#endif
+	if (cap_write_drain(iommu->cap))
+		val |= DMA_TLB_WRITE_DRAIN;
+
+	raw_spin_lock_irqsave(&iommu->register_lock, flag);
+	/* Note: Only uses first TLB reg currently */
+	if (val_iva)
+		dmar_writeq(iommu->reg + tlb_offset, val_iva);
+	dmar_writeq(iommu->reg + tlb_offset + 8, val);
+
+	/* Make sure hardware complete it */
+	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
+		dmar_readq, (!(val & DMA_TLB_IVT)), val);
+
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+	/* check IOTLB invalidation granularity */
+	if (DMA_TLB_IAIG(val) == 0)
+		pr_err("Flush IOTLB failed\n");
+	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
+		pr_debug("TLB flush request %Lx, actual %Lx\n",
+			(unsigned long long)DMA_TLB_IIRG(type),
+			(unsigned long long)DMA_TLB_IAIG(val));
+}
+
+static struct device_domain_info *
+iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
+			 u8 bus, u8 devfn)
+{
+	struct device_domain_info *info;
+
+	assert_spin_locked(&device_domain_lock);
+
+	if (!iommu->qi)
+		return NULL;
+
+	list_for_each_entry(info, &domain->devices, link)
+		if (info->iommu == iommu && info->bus == bus &&
+		    info->devfn == devfn) {
+			if (info->ats_supported && info->dev)
+				return info;
+			break;
+		}
+
+	return NULL;
+}
+
+static void domain_update_iotlb(struct dmar_domain *domain)
+{
+	struct device_domain_info *info;
+	bool has_iotlb_device = false;
+
+	assert_spin_locked(&device_domain_lock);
+
+	list_for_each_entry(info, &domain->devices, link) {
+		struct pci_dev *pdev;
+
+		if (!info->dev || !dev_is_pci(info->dev))
+			continue;
+
+		pdev = to_pci_dev(info->dev);
+		if (pdev->ats_enabled) {
+			has_iotlb_device = true;
+			break;
+		}
+	}
+
+	domain->has_iotlb_device = has_iotlb_device;
+}
+
+static void iommu_enable_dev_iotlb(struct device_domain_info *info)
+{
+	struct pci_dev *pdev;
+
+	assert_spin_locked(&device_domain_lock);
+
+	if (!info || !dev_is_pci(info->dev))
+		return;
+
+	pdev = to_pci_dev(info->dev);
+	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
+	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
+	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
+	 * reserved, which should be set to 0.
+	 */
+	if (!ecap_dit(info->iommu->ecap))
+		info->pfsid = 0;
+	else {
+		struct pci_dev *pf_pdev;
+
+		/* pdev will be returned if device is not a vf */
+		pf_pdev = pci_physfn(pdev);
+		info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
+	}
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+	/* The PCIe spec, in its wisdom, declares that the behaviour of
+	   the device if you enable PASID support after ATS support is
+	   undefined. So always enable PASID support on devices which
+	   have it, even if we can't yet know if we're ever going to
+	   use it. */
+	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
+		info->pasid_enabled = 1;
+
+	if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
+		info->pri_enabled = 1;
+#endif
+	if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
+		info->ats_enabled = 1;
+		domain_update_iotlb(info->domain);
+		info->ats_qdep = pci_ats_queue_depth(pdev);
+	}
+}
+
+static void iommu_disable_dev_iotlb(struct device_domain_info *info)
+{
+	struct pci_dev *pdev;
+
+	assert_spin_locked(&device_domain_lock);
+
+	if (!dev_is_pci(info->dev))
+		return;
+
+	pdev = to_pci_dev(info->dev);
+
+	if (info->ats_enabled) {
+		pci_disable_ats(pdev);
+		info->ats_enabled = 0;
+		domain_update_iotlb(info->domain);
+	}
+#ifdef CONFIG_INTEL_IOMMU_SVM
+	if (info->pri_enabled) {
+		pci_disable_pri(pdev);
+		info->pri_enabled = 0;
+	}
+	if (info->pasid_enabled) {
+		pci_disable_pasid(pdev);
+		info->pasid_enabled = 0;
+	}
+#endif
+}
+
+static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
+				  u64 addr, unsigned mask)
+{
+	u16 sid, qdep;
+	unsigned long flags;
+	struct device_domain_info *info;
+
+	if (!domain->has_iotlb_device)
+		return;
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	list_for_each_entry(info, &domain->devices, link) {
+		if (!info->ats_enabled)
+			continue;
+
+		sid = info->bus << 8 | info->devfn;
+		qdep = info->ats_qdep;
+		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
+				qdep, addr, mask);
+	}
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
+static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
+				  struct dmar_domain *domain,
+				  unsigned long pfn, unsigned int pages,
+				  int ih, int map)
+{
+	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
+	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
+	u16 did = domain->iommu_did[iommu->seq_id];
+
+	BUG_ON(pages == 0);
+
+	if (ih)
+		ih = 1 << 6;
+	/*
+	 * Fallback to domain selective flush if no PSI support or the size is
+	 * too big.
+	 * PSI requires page size to be 2 ^ x, and the base address is naturally
+	 * aligned to the size
+	 */
+	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
+		iommu->flush.flush_iotlb(iommu, did, 0, 0,
+						DMA_TLB_DSI_FLUSH);
+	else
+		iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
+						DMA_TLB_PSI_FLUSH);
+
+	/*
+	 * In caching mode, changes of pages from non-present to present require
+	 * flush. However, device IOTLB doesn't need to be flushed in this case.
+	 */
+	if (!cap_caching_mode(iommu->cap) || !map)
+		iommu_flush_dev_iotlb(domain, addr, mask);
+}
+
+/* Notification for newly created mappings */
+static inline void __mapping_notify_one(struct intel_iommu *iommu,
+					struct dmar_domain *domain,
+					unsigned long pfn, unsigned int pages)
+{
+	/* It's a non-present to present mapping. Only flush if caching mode */
+	if (cap_caching_mode(iommu->cap))
+		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
+	else
+		iommu_flush_write_buffer(iommu);
+}
+
+static void iommu_flush_iova(struct iova_domain *iovad)
+{
+	struct dmar_domain *domain;
+	int idx;
+
+	domain = container_of(iovad, struct dmar_domain, iovad);
+
+	for_each_domain_iommu(idx, domain) {
+		struct intel_iommu *iommu = g_iommus[idx];
+		u16 did = domain->iommu_did[iommu->seq_id];
+
+		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
+
+		if (!cap_caching_mode(iommu->cap))
+			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
+					      0, MAX_AGAW_PFN_WIDTH);
+	}
+}
+
+static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
+{
+	u32 pmen;
+	unsigned long flags;
+
+	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
+		return;
+
+	raw_spin_lock_irqsave(&iommu->register_lock, flags);
+	pmen = readl(iommu->reg + DMAR_PMEN_REG);
+	pmen &= ~DMA_PMEN_EPM;
+	writel(pmen, iommu->reg + DMAR_PMEN_REG);
+
+	/* wait for the protected region status bit to clear */
+	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
+		readl, !(pmen & DMA_PMEN_PRS), pmen);
+
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static void iommu_enable_translation(struct intel_iommu *iommu)
+{
+	u32 sts;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&iommu->register_lock, flags);
+	iommu->gcmd |= DMA_GCMD_TE;
+	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+	/* Make sure hardware complete it */
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+		      readl, (sts & DMA_GSTS_TES), sts);
+
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static void iommu_disable_translation(struct intel_iommu *iommu)
+{
+	u32 sts;
+	unsigned long flag;
+
+	raw_spin_lock_irqsave(&iommu->register_lock, flag);
+	iommu->gcmd &= ~DMA_GCMD_TE;
+	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+	/* Make sure hardware complete it */
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+		      readl, (!(sts & DMA_GSTS_TES)), sts);
+
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+
+static int iommu_init_domains(struct intel_iommu *iommu)
+{
+	u32 ndomains, nlongs;
+	size_t size;
+
+	ndomains = cap_ndoms(iommu->cap);
+	pr_debug("%s: Number of Domains supported <%d>\n",
+		 iommu->name, ndomains);
+	nlongs = BITS_TO_LONGS(ndomains);
+
+	spin_lock_init(&iommu->lock);
+
+	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
+	if (!iommu->domain_ids) {
+		pr_err("%s: Allocating domain id array failed\n",
+		       iommu->name);
+		return -ENOMEM;
+	}
+
+	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
+	iommu->domains = kzalloc(size, GFP_KERNEL);
+
+	if (iommu->domains) {
+		size = 256 * sizeof(struct dmar_domain *);
+		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
+	}
+
+	if (!iommu->domains || !iommu->domains[0]) {
+		pr_err("%s: Allocating domain array failed\n",
+		       iommu->name);
+		kfree(iommu->domain_ids);
+		kfree(iommu->domains);
+		iommu->domain_ids = NULL;
+		iommu->domains    = NULL;
+		return -ENOMEM;
+	}
+
+
+
+	/*
+	 * If Caching mode is set, then invalid translations are tagged
+	 * with domain-id 0, hence we need to pre-allocate it. We also
+	 * use domain-id 0 as a marker for non-allocated domain-id, so
+	 * make sure it is not used for a real domain.
+	 */
+	set_bit(0, iommu->domain_ids);
+
+	return 0;
+}
+
+static void disable_dmar_iommu(struct intel_iommu *iommu)
+{
+	struct device_domain_info *info, *tmp;
+	unsigned long flags;
+
+	if (!iommu->domains || !iommu->domain_ids)
+		return;
+
+again:
+	spin_lock_irqsave(&device_domain_lock, flags);
+	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
+		struct dmar_domain *domain;
+
+		if (info->iommu != iommu)
+			continue;
+
+		if (!info->dev || !info->domain)
+			continue;
+
+		domain = info->domain;
+
+		__dmar_remove_one_dev_info(info);
+
+		if (!domain_type_is_vm_or_si(domain)) {
+			/*
+			 * The domain_exit() function  can't be called under
+			 * device_domain_lock, as it takes this lock itself.
+			 * So release the lock here and re-run the loop
+			 * afterwards.
+			 */
+			spin_unlock_irqrestore(&device_domain_lock, flags);
+			domain_exit(domain);
+			goto again;
+		}
+	}
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+
+	if (iommu->gcmd & DMA_GCMD_TE)
+		iommu_disable_translation(iommu);
+}
+
+static void free_dmar_iommu(struct intel_iommu *iommu)
+{
+	if ((iommu->domains) && (iommu->domain_ids)) {
+		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
+		int i;
+
+		for (i = 0; i < elems; i++)
+			kfree(iommu->domains[i]);
+		kfree(iommu->domains);
+		kfree(iommu->domain_ids);
+		iommu->domains = NULL;
+		iommu->domain_ids = NULL;
+	}
+
+	g_iommus[iommu->seq_id] = NULL;
+
+	/* free context mapping */
+	free_context_table(iommu);
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+	if (pasid_enabled(iommu)) {
+		if (ecap_prs(iommu->ecap))
+			intel_svm_finish_prq(iommu);
+		intel_svm_exit(iommu);
+	}
+#endif
+}
+
+static struct dmar_domain *alloc_domain(int flags)
+{
+	struct dmar_domain *domain;
+
+	domain = alloc_domain_mem();
+	if (!domain)
+		return NULL;
+
+	memset(domain, 0, sizeof(*domain));
+	domain->nid = -1;
+	domain->flags = flags;
+	domain->has_iotlb_device = false;
+	INIT_LIST_HEAD(&domain->devices);
+
+	return domain;
+}
+
+/* Must be called with iommu->lock */
+static int domain_attach_iommu(struct dmar_domain *domain,
+			       struct intel_iommu *iommu)
+{
+	unsigned long ndomains;
+	int num;
+
+	assert_spin_locked(&device_domain_lock);
+	assert_spin_locked(&iommu->lock);
+
+	domain->iommu_refcnt[iommu->seq_id] += 1;
+	domain->iommu_count += 1;
+	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
+		ndomains = cap_ndoms(iommu->cap);
+		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
+
+		if (num >= ndomains) {
+			pr_err("%s: No free domain ids\n", iommu->name);
+			domain->iommu_refcnt[iommu->seq_id] -= 1;
+			domain->iommu_count -= 1;
+			return -ENOSPC;
+		}
+
+		set_bit(num, iommu->domain_ids);
+		set_iommu_domain(iommu, num, domain);
+
+		domain->iommu_did[iommu->seq_id] = num;
+		domain->nid			 = iommu->node;
+
+		domain_update_iommu_cap(domain);
+	}
+
+	return 0;
+}
+
+static int domain_detach_iommu(struct dmar_domain *domain,
+			       struct intel_iommu *iommu)
+{
+	int num, count = INT_MAX;
+
+	assert_spin_locked(&device_domain_lock);
+	assert_spin_locked(&iommu->lock);
+
+	domain->iommu_refcnt[iommu->seq_id] -= 1;
+	count = --domain->iommu_count;
+	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
+		num = domain->iommu_did[iommu->seq_id];
+		clear_bit(num, iommu->domain_ids);
+		set_iommu_domain(iommu, num, NULL);
+
+		domain_update_iommu_cap(domain);
+		domain->iommu_did[iommu->seq_id] = 0;
+	}
+
+	return count;
+}
+
+static struct iova_domain reserved_iova_list;
+static struct lock_class_key reserved_rbtree_key;
+
+static int dmar_init_reserved_ranges(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iova *iova;
+	int i;
+
+	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
+
+	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
+		&reserved_rbtree_key);
+
+	/* IOAPIC ranges shouldn't be accessed by DMA */
+	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
+		IOVA_PFN(IOAPIC_RANGE_END));
+	if (!iova) {
+		pr_err("Reserve IOAPIC range failed\n");
+		return -ENODEV;
+	}
+
+	/* Reserve all PCI MMIO to avoid peer-to-peer access */
+	for_each_pci_dev(pdev) {
+		struct resource *r;
+
+		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+			r = &pdev->resource[i];
+			if (!r->flags || !(r->flags & IORESOURCE_MEM))
+				continue;
+			iova = reserve_iova(&reserved_iova_list,
+					    IOVA_PFN(r->start),
+					    IOVA_PFN(r->end));
+			if (!iova) {
+				pr_err("Reserve iova failed\n");
+				return -ENODEV;
+			}
+		}
+	}
+	return 0;
+}
+
+static void domain_reserve_special_ranges(struct dmar_domain *domain)
+{
+	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
+}
+
+static inline int guestwidth_to_adjustwidth(int gaw)
+{
+	int agaw;
+	int r = (gaw - 12) % 9;
+
+	if (r == 0)
+		agaw = gaw;
+	else
+		agaw = gaw + 9 - r;
+	if (agaw > 64)
+		agaw = 64;
+	return agaw;
+}
+
+static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
+		       int guest_width)
+{
+	int adjust_width, agaw, cap_width;
+	unsigned long sagaw;
+	int err;
+
+	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
+
+	err = init_iova_flush_queue(&domain->iovad,
+				    iommu_flush_iova, iova_entry_free);
+	if (err)
+		return err;
+
+	domain_reserve_special_ranges(domain);
+
+	/* calculate AGAW */
+	cap_width = min_t(int, cap_mgaw(iommu->cap), agaw_to_width(iommu->agaw));
+	if (guest_width > cap_width)
+		guest_width = cap_width;
+	domain->gaw = guest_width;
+	adjust_width = guestwidth_to_adjustwidth(guest_width);
+	agaw = width_to_agaw(adjust_width);
+	sagaw = cap_sagaw(iommu->cap);
+	if (!test_bit(agaw, &sagaw)) {
+		/* hardware doesn't support it, choose a bigger one */
+		pr_debug("Hardware doesn't support agaw %d\n", agaw);
+		agaw = find_next_bit(&sagaw, 5, agaw);
+		if (agaw >= 5)
+			return -ENODEV;
+	}
+	domain->agaw = agaw;
+
+	if (ecap_coherent(iommu->ecap))
+		domain->iommu_coherency = 1;
+	else
+		domain->iommu_coherency = 0;
+
+	if (ecap_sc_support(iommu->ecap))
+		domain->iommu_snooping = 1;
+	else
+		domain->iommu_snooping = 0;
+
+	if (intel_iommu_superpage)
+		domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
+	else
+		domain->iommu_superpage = 0;
+
+	domain->nid = iommu->node;
+
+	/* always allocate the top pgd */
+	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
+	if (!domain->pgd)
+		return -ENOMEM;
+	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
+	return 0;
+}
+
+static void domain_exit(struct dmar_domain *domain)
+{
+	struct page *freelist = NULL;
+
+	/* Domain 0 is reserved, so dont process it */
+	if (!domain)
+		return;
+
+	/* Remove associated devices and clear attached or cached domains */
+	rcu_read_lock();
+	domain_remove_dev_info(domain);
+	rcu_read_unlock();
+
+	/* destroy iovas */
+	put_iova_domain(&domain->iovad);
+
+	freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
+
+	dma_free_pagelist(freelist);
+
+	free_domain_mem(domain);
+}
+
+static int domain_context_mapping_one(struct dmar_domain *domain,
+				      struct intel_iommu *iommu,
+				      u8 bus, u8 devfn)
+{
+	u16 did = domain->iommu_did[iommu->seq_id];
+	int translation = CONTEXT_TT_MULTI_LEVEL;
+	struct device_domain_info *info = NULL;
+	struct context_entry *context;
+	unsigned long flags;
+	struct dma_pte *pgd;
+	int ret, agaw;
+
+	WARN_ON(did == 0);
+
+	if (hw_pass_through && domain_type_is_si(domain))
+		translation = CONTEXT_TT_PASS_THROUGH;
+
+	pr_debug("Set context mapping for %02x:%02x.%d\n",
+		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+	BUG_ON(!domain->pgd);
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	spin_lock(&iommu->lock);
+
+	ret = -ENOMEM;
+	context = iommu_context_addr(iommu, bus, devfn, 1);
+	if (!context)
+		goto out_unlock;
+
+	ret = 0;
+	if (context_present(context))
+		goto out_unlock;
+
+	/*
+	 * For kdump cases, old valid entries may be cached due to the
+	 * in-flight DMA and copied pgtable, but there is no unmapping
+	 * behaviour for them, thus we need an explicit cache flush for
+	 * the newly-mapped device. For kdump, at this point, the device
+	 * is supposed to finish reset at its driver probe stage, so no
+	 * in-flight DMA will exist, and we don't need to worry anymore
+	 * hereafter.
+	 */
+	if (context_copied(context)) {
+		u16 did_old = context_domain_id(context);
+
+		if (did_old < cap_ndoms(iommu->cap)) {
+			iommu->flush.flush_context(iommu, did_old,
+						   (((u16)bus) << 8) | devfn,
+						   DMA_CCMD_MASK_NOBIT,
+						   DMA_CCMD_DEVICE_INVL);
+			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
+						 DMA_TLB_DSI_FLUSH);
+		}
+	}
+
+	pgd = domain->pgd;
+
+	context_clear_entry(context);
+	context_set_domain_id(context, did);
+
+	/*
+	 * Skip top levels of page tables for iommu which has less agaw
+	 * than default.  Unnecessary for PT mode.
+	 */
+	if (translation != CONTEXT_TT_PASS_THROUGH) {
+		for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
+			ret = -ENOMEM;
+			pgd = phys_to_virt(dma_pte_addr(pgd));
+			if (!dma_pte_present(pgd))
+				goto out_unlock;
+		}
+
+		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
+		if (info && info->ats_supported)
+			translation = CONTEXT_TT_DEV_IOTLB;
+		else
+			translation = CONTEXT_TT_MULTI_LEVEL;
+
+		context_set_address_root(context, virt_to_phys(pgd));
+		context_set_address_width(context, agaw);
+	} else {
+		/*
+		 * In pass through mode, AW must be programmed to
+		 * indicate the largest AGAW value supported by
+		 * hardware. And ASR is ignored by hardware.
+		 */
+		context_set_address_width(context, iommu->msagaw);
+	}
+
+	context_set_translation_type(context, translation);
+	context_set_fault_enable(context);
+	context_set_present(context);
+	domain_flush_cache(domain, context, sizeof(*context));
+
+	/*
+	 * It's a non-present to present mapping. If hardware doesn't cache
+	 * non-present entry we only need to flush the write-buffer. If the
+	 * _does_ cache non-present entries, then it does so in the special
+	 * domain #0, which we have to flush:
+	 */
+	if (cap_caching_mode(iommu->cap)) {
+		iommu->flush.flush_context(iommu, 0,
+					   (((u16)bus) << 8) | devfn,
+					   DMA_CCMD_MASK_NOBIT,
+					   DMA_CCMD_DEVICE_INVL);
+		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
+	} else {
+		iommu_flush_write_buffer(iommu);
+	}
+	iommu_enable_dev_iotlb(info);
+
+	ret = 0;
+
+out_unlock:
+	spin_unlock(&iommu->lock);
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+
+	return ret;
+}
+
+struct domain_context_mapping_data {
+	struct dmar_domain *domain;
+	struct intel_iommu *iommu;
+};
+
+static int domain_context_mapping_cb(struct pci_dev *pdev,
+				     u16 alias, void *opaque)
+{
+	struct domain_context_mapping_data *data = opaque;
+
+	return domain_context_mapping_one(data->domain, data->iommu,
+					  PCI_BUS_NUM(alias), alias & 0xff);
+}
+
+static int
+domain_context_mapping(struct dmar_domain *domain, struct device *dev)
+{
+	struct intel_iommu *iommu;
+	u8 bus, devfn;
+	struct domain_context_mapping_data data;
+
+	iommu = device_to_iommu(dev, &bus, &devfn);
+	if (!iommu)
+		return -ENODEV;
+
+	if (!dev_is_pci(dev))
+		return domain_context_mapping_one(domain, iommu, bus, devfn);
+
+	data.domain = domain;
+	data.iommu = iommu;
+
+	return pci_for_each_dma_alias(to_pci_dev(dev),
+				      &domain_context_mapping_cb, &data);
+}
+
+static int domain_context_mapped_cb(struct pci_dev *pdev,
+				    u16 alias, void *opaque)
+{
+	struct intel_iommu *iommu = opaque;
+
+	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
+}
+
+static int domain_context_mapped(struct device *dev)
+{
+	struct intel_iommu *iommu;
+	u8 bus, devfn;
+
+	iommu = device_to_iommu(dev, &bus, &devfn);
+	if (!iommu)
+		return -ENODEV;
+
+	if (!dev_is_pci(dev))
+		return device_context_mapped(iommu, bus, devfn);
+
+	return !pci_for_each_dma_alias(to_pci_dev(dev),
+				       domain_context_mapped_cb, iommu);
+}
+
+/* Returns a number of VTD pages, but aligned to MM page size */
+static inline unsigned long aligned_nrpages(unsigned long host_addr,
+					    size_t size)
+{
+	host_addr &= ~PAGE_MASK;
+	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
+}
+
+/* Return largest possible superpage level for a given mapping */
+static inline int hardware_largepage_caps(struct dmar_domain *domain,
+					  unsigned long iov_pfn,
+					  unsigned long phy_pfn,
+					  unsigned long pages)
+{
+	int support, level = 1;
+	unsigned long pfnmerge;
+
+	support = domain->iommu_superpage;
+
+	/* To use a large page, the virtual *and* physical addresses
+	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
+	   of them will mean we have to use smaller pages. So just
+	   merge them and check both at once. */
+	pfnmerge = iov_pfn | phy_pfn;
+
+	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
+		pages >>= VTD_STRIDE_SHIFT;
+		if (!pages)
+			break;
+		pfnmerge >>= VTD_STRIDE_SHIFT;
+		level++;
+		support--;
+	}
+	return level;
+}
+
+static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
+			    struct scatterlist *sg, unsigned long phys_pfn,
+			    unsigned long nr_pages, int prot)
+{
+	struct dma_pte *first_pte = NULL, *pte = NULL;
+	phys_addr_t uninitialized_var(pteval);
+	unsigned long sg_res = 0;
+	unsigned int largepage_lvl = 0;
+	unsigned long lvl_pages = 0;
+
+	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
+
+	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
+		return -EINVAL;
+
+	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
+
+	if (!sg) {
+		sg_res = nr_pages;
+		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
+	}
+
+	while (nr_pages > 0) {
+		uint64_t tmp;
+
+		if (!sg_res) {
+			unsigned int pgoff = sg->offset & ~PAGE_MASK;
+
+			sg_res = aligned_nrpages(sg->offset, sg->length);
+			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
+			sg->dma_length = sg->length;
+			pteval = (sg_phys(sg) - pgoff) | prot;
+			phys_pfn = pteval >> VTD_PAGE_SHIFT;
+		}
+
+		if (!pte) {
+			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
+
+			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
+			if (!pte)
+				return -ENOMEM;
+			/* It is large page*/
+			if (largepage_lvl > 1) {
+				unsigned long nr_superpages, end_pfn;
+
+				pteval |= DMA_PTE_LARGE_PAGE;
+				lvl_pages = lvl_to_nr_pages(largepage_lvl);
+
+				nr_superpages = sg_res / lvl_pages;
+				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
+
+				/*
+				 * Ensure that old small page tables are
+				 * removed to make room for superpage(s).
+				 * We're adding new large pages, so make sure
+				 * we don't remove their parent tables.
+				 */
+				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
+						       largepage_lvl + 1);
+			} else {
+				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
+			}
+
+		}
+		/* We don't need lock here, nobody else
+		 * touches the iova range
+		 */
+		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
+		if (tmp) {
+			static int dumps = 5;
+			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
+				iov_pfn, tmp, (unsigned long long)pteval);
+			if (dumps) {
+				dumps--;
+				debug_dma_dump_mappings(NULL);
+			}
+			WARN_ON(1);
+		}
+
+		lvl_pages = lvl_to_nr_pages(largepage_lvl);
+
+		BUG_ON(nr_pages < lvl_pages);
+		BUG_ON(sg_res < lvl_pages);
+
+		nr_pages -= lvl_pages;
+		iov_pfn += lvl_pages;
+		phys_pfn += lvl_pages;
+		pteval += lvl_pages * VTD_PAGE_SIZE;
+		sg_res -= lvl_pages;
+
+		/* If the next PTE would be the first in a new page, then we
+		   need to flush the cache on the entries we've just written.
+		   And then we'll need to recalculate 'pte', so clear it and
+		   let it get set again in the if (!pte) block above.
+
+		   If we're done (!nr_pages) we need to flush the cache too.
+
+		   Also if we've been setting superpages, we may need to
+		   recalculate 'pte' and switch back to smaller pages for the
+		   end of the mapping, if the trailing size is not enough to
+		   use another superpage (i.e. sg_res < lvl_pages). */
+		pte++;
+		if (!nr_pages || first_pte_in_page(pte) ||
+		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
+			domain_flush_cache(domain, first_pte,
+					   (void *)pte - (void *)first_pte);
+			pte = NULL;
+		}
+
+		if (!sg_res && nr_pages)
+			sg = sg_next(sg);
+	}
+	return 0;
+}
+
+static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
+                         struct scatterlist *sg, unsigned long phys_pfn,
+                         unsigned long nr_pages, int prot)
+{
+       int ret;
+       struct intel_iommu *iommu;
+
+       /* Do the real mapping first */
+       ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
+       if (ret)
+               return ret;
+
+       /* Notify about the new mapping */
+       if (domain_type_is_vm(domain)) {
+	       /* VM typed domains can have more than one IOMMUs */
+	       int iommu_id;
+	       for_each_domain_iommu(iommu_id, domain) {
+		       iommu = g_iommus[iommu_id];
+		       __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
+	       }
+       } else {
+	       /* General domains only have one IOMMU */
+	       iommu = domain_get_iommu(domain);
+	       __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
+       }
+
+       return 0;
+}
+
+static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
+				    struct scatterlist *sg, unsigned long nr_pages,
+				    int prot)
+{
+	return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
+}
+
+static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
+				     unsigned long phys_pfn, unsigned long nr_pages,
+				     int prot)
+{
+	return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
+}
+
+static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+	unsigned long flags;
+	struct context_entry *context;
+	u16 did_old;
+
+	if (!iommu)
+		return;
+
+	spin_lock_irqsave(&iommu->lock, flags);
+	context = iommu_context_addr(iommu, bus, devfn, 0);
+	if (!context) {
+		spin_unlock_irqrestore(&iommu->lock, flags);
+		return;
+	}
+	did_old = context_domain_id(context);
+	context_clear_entry(context);
+	__iommu_flush_cache(iommu, context, sizeof(*context));
+	spin_unlock_irqrestore(&iommu->lock, flags);
+	iommu->flush.flush_context(iommu,
+				   did_old,
+				   (((u16)bus) << 8) | devfn,
+				   DMA_CCMD_MASK_NOBIT,
+				   DMA_CCMD_DEVICE_INVL);
+	iommu->flush.flush_iotlb(iommu,
+				 did_old,
+				 0,
+				 0,
+				 DMA_TLB_DSI_FLUSH);
+}
+
+static inline void unlink_domain_info(struct device_domain_info *info)
+{
+	assert_spin_locked(&device_domain_lock);
+	list_del(&info->link);
+	list_del(&info->global);
+	if (info->dev)
+		info->dev->archdata.iommu = NULL;
+}
+
+static void domain_remove_dev_info(struct dmar_domain *domain)
+{
+	struct device_domain_info *info, *tmp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	list_for_each_entry_safe(info, tmp, &domain->devices, link)
+		__dmar_remove_one_dev_info(info);
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
+/*
+ * find_domain
+ * Note: we use struct device->archdata.iommu stores the info
+ */
+static struct dmar_domain *find_domain(struct device *dev)
+{
+	struct device_domain_info *info;
+
+	/* No lock here, assumes no domain exit in normal case */
+	info = dev->archdata.iommu;
+	if (likely(info))
+		return info->domain;
+	return NULL;
+}
+
+static inline struct device_domain_info *
+dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
+{
+	struct device_domain_info *info;
+
+	list_for_each_entry(info, &device_domain_list, global)
+		if (info->iommu->segment == segment && info->bus == bus &&
+		    info->devfn == devfn)
+			return info;
+
+	return NULL;
+}
+
+static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
+						    int bus, int devfn,
+						    struct device *dev,
+						    struct dmar_domain *domain)
+{
+	struct dmar_domain *found = NULL;
+	struct device_domain_info *info;
+	unsigned long flags;
+	int ret;
+
+	info = alloc_devinfo_mem();
+	if (!info)
+		return NULL;
+
+	info->bus = bus;
+	info->devfn = devfn;
+	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
+	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
+	info->ats_qdep = 0;
+	info->dev = dev;
+	info->domain = domain;
+	info->iommu = iommu;
+	info->pasid_table = NULL;
+
+	if (dev && dev_is_pci(dev)) {
+		struct pci_dev *pdev = to_pci_dev(info->dev);
+
+		if (!pci_ats_disabled() &&
+		    ecap_dev_iotlb_support(iommu->ecap) &&
+		    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
+		    dmar_find_matched_atsr_unit(pdev))
+			info->ats_supported = 1;
+
+		if (ecs_enabled(iommu)) {
+			if (pasid_enabled(iommu)) {
+				int features = pci_pasid_features(pdev);
+				if (features >= 0)
+					info->pasid_supported = features | 1;
+			}
+
+			if (info->ats_supported && ecap_prs(iommu->ecap) &&
+			    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
+				info->pri_supported = 1;
+		}
+	}
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	if (dev)
+		found = find_domain(dev);
+
+	if (!found) {
+		struct device_domain_info *info2;
+		info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
+		if (info2) {
+			found      = info2->domain;
+			info2->dev = dev;
+		}
+	}
+
+	if (found) {
+		spin_unlock_irqrestore(&device_domain_lock, flags);
+		free_devinfo_mem(info);
+		/* Caller must free the original domain */
+		return found;
+	}
+
+	spin_lock(&iommu->lock);
+	ret = domain_attach_iommu(domain, iommu);
+	spin_unlock(&iommu->lock);
+
+	if (ret) {
+		spin_unlock_irqrestore(&device_domain_lock, flags);
+		free_devinfo_mem(info);
+		return NULL;
+	}
+
+	list_add(&info->link, &domain->devices);
+	list_add(&info->global, &device_domain_list);
+	if (dev)
+		dev->archdata.iommu = info;
+
+	if (dev && dev_is_pci(dev) && info->pasid_supported) {
+		ret = intel_pasid_alloc_table(dev);
+		if (ret) {
+			pr_warn("No pasid table for %s, pasid disabled\n",
+				dev_name(dev));
+			info->pasid_supported = 0;
+		}
+	}
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+
+	if (dev && domain_context_mapping(domain, dev)) {
+		pr_err("Domain context map for %s failed\n", dev_name(dev));
+		dmar_remove_one_dev_info(domain, dev);
+		return NULL;
+	}
+
+	return domain;
+}
+
+static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
+{
+	*(u16 *)opaque = alias;
+	return 0;
+}
+
+static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
+{
+	struct device_domain_info *info = NULL;
+	struct dmar_domain *domain = NULL;
+	struct intel_iommu *iommu;
+	u16 dma_alias;
+	unsigned long flags;
+	u8 bus, devfn;
+
+	iommu = device_to_iommu(dev, &bus, &devfn);
+	if (!iommu)
+		return NULL;
+
+	if (dev_is_pci(dev)) {
+		struct pci_dev *pdev = to_pci_dev(dev);
+
+		pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
+
+		spin_lock_irqsave(&device_domain_lock, flags);
+		info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
+						      PCI_BUS_NUM(dma_alias),
+						      dma_alias & 0xff);
+		if (info) {
+			iommu = info->iommu;
+			domain = info->domain;
+		}
+		spin_unlock_irqrestore(&device_domain_lock, flags);
+
+		/* DMA alias already has a domain, use it */
+		if (info)
+			goto out;
+	}
+
+	/* Allocate and initialize new domain for the device */
+	domain = alloc_domain(0);
+	if (!domain)
+		return NULL;
+	if (domain_init(domain, iommu, gaw)) {
+		domain_exit(domain);
+		return NULL;
+	}
+
+out:
+
+	return domain;
+}
+
+static struct dmar_domain *set_domain_for_dev(struct device *dev,
+					      struct dmar_domain *domain)
+{
+	struct intel_iommu *iommu;
+	struct dmar_domain *tmp;
+	u16 req_id, dma_alias;
+	u8 bus, devfn;
+
+	iommu = device_to_iommu(dev, &bus, &devfn);
+	if (!iommu)
+		return NULL;
+
+	req_id = ((u16)bus << 8) | devfn;
+
+	if (dev_is_pci(dev)) {
+		struct pci_dev *pdev = to_pci_dev(dev);
+
+		pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
+
+		/* register PCI DMA alias device */
+		if (req_id != dma_alias) {
+			tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
+					dma_alias & 0xff, NULL, domain);
+
+			if (!tmp || tmp != domain)
+				return tmp;
+		}
+	}
+
+	tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
+	if (!tmp || tmp != domain)
+		return tmp;
+
+	return domain;
+}
+
+static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
+{
+	struct dmar_domain *domain, *tmp;
+
+	domain = find_domain(dev);
+	if (domain)
+		goto out;
+
+	domain = find_or_alloc_domain(dev, gaw);
+	if (!domain)
+		goto out;
+
+	tmp = set_domain_for_dev(dev, domain);
+	if (!tmp || domain != tmp) {
+		domain_exit(domain);
+		domain = tmp;
+	}
+
+out:
+
+	return domain;
+}
+
+static int iommu_domain_identity_map(struct dmar_domain *domain,
+				     unsigned long long start,
+				     unsigned long long end)
+{
+	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
+	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
+
+	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
+			  dma_to_mm_pfn(last_vpfn))) {
+		pr_err("Reserving iova failed\n");
+		return -ENOMEM;
+	}
+
+	pr_debug("Mapping reserved region %llx-%llx\n", start, end);
+	/*
+	 * RMRR range might have overlap with physical memory range,
+	 * clear it first
+	 */
+	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
+
+	return __domain_mapping(domain, first_vpfn, NULL,
+				first_vpfn, last_vpfn - first_vpfn + 1,
+				DMA_PTE_READ|DMA_PTE_WRITE);
+}
+
+static int domain_prepare_identity_map(struct device *dev,
+				       struct dmar_domain *domain,
+				       unsigned long long start,
+				       unsigned long long end)
+{
+	/* For _hardware_ passthrough, don't bother. But for software
+	   passthrough, we do it anyway -- it may indicate a memory
+	   range which is reserved in E820, so which didn't get set
+	   up to start with in si_domain */
+	if (domain == si_domain && hw_pass_through) {
+		pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
+			dev_name(dev), start, end);
+		return 0;
+	}
+
+	pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
+		dev_name(dev), start, end);
+
+	if (end < start) {
+		WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
+			"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+			dmi_get_system_info(DMI_BIOS_VENDOR),
+			dmi_get_system_info(DMI_BIOS_VERSION),
+		     dmi_get_system_info(DMI_PRODUCT_VERSION));
+		return -EIO;
+	}
+
+	if (end >> agaw_to_width(domain->agaw)) {
+		WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
+		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+		     agaw_to_width(domain->agaw),
+		     dmi_get_system_info(DMI_BIOS_VENDOR),
+		     dmi_get_system_info(DMI_BIOS_VERSION),
+		     dmi_get_system_info(DMI_PRODUCT_VERSION));
+		return -EIO;
+	}
+
+	return iommu_domain_identity_map(domain, start, end);
+}
+
+static int iommu_prepare_identity_map(struct device *dev,
+				      unsigned long long start,
+				      unsigned long long end)
+{
+	struct dmar_domain *domain;
+	int ret;
+
+	domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
+	if (!domain)
+		return -ENOMEM;
+
+	ret = domain_prepare_identity_map(dev, domain, start, end);
+	if (ret)
+		domain_exit(domain);
+
+	return ret;
+}
+
+static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
+					 struct device *dev)
+{
+	if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
+		return 0;
+	return iommu_prepare_identity_map(dev, rmrr->base_address,
+					  rmrr->end_address);
+}
+
+#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
+static inline void iommu_prepare_isa(void)
+{
+	struct pci_dev *pdev;
+	int ret;
+
+	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
+	if (!pdev)
+		return;
+
+	pr_info("Prepare 0-16MiB unity mapping for LPC\n");
+	ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
+
+	if (ret)
+		pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
+
+	pci_dev_put(pdev);
+}
+#else
+static inline void iommu_prepare_isa(void)
+{
+	return;
+}
+#endif /* !CONFIG_INTEL_IOMMU_FLPY_WA */
+
+static int md_domain_init(struct dmar_domain *domain, int guest_width);
+
+static int __init si_domain_init(int hw)
+{
+	int nid, ret = 0;
+
+	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
+	if (!si_domain)
+		return -EFAULT;
+
+	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
+		domain_exit(si_domain);
+		return -EFAULT;
+	}
+
+	pr_debug("Identity mapping domain allocated\n");
+
+	if (hw)
+		return 0;
+
+	for_each_online_node(nid) {
+		unsigned long start_pfn, end_pfn;
+		int i;
+
+		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+			ret = iommu_domain_identity_map(si_domain,
+					PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int identity_mapping(struct device *dev)
+{
+	struct device_domain_info *info;
+
+	if (likely(!iommu_identity_mapping))
+		return 0;
+
+	info = dev->archdata.iommu;
+	if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
+		return (info->domain == si_domain);
+
+	return 0;
+}
+
+static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
+{
+	struct dmar_domain *ndomain;
+	struct intel_iommu *iommu;
+	u8 bus, devfn;
+
+	iommu = device_to_iommu(dev, &bus, &devfn);
+	if (!iommu)
+		return -ENODEV;
+
+	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
+	if (ndomain != domain)
+		return -EBUSY;
+
+	return 0;
+}
+
+static bool device_has_rmrr(struct device *dev)
+{
+	struct dmar_rmrr_unit *rmrr;
+	struct device *tmp;
+	int i;
+
+	rcu_read_lock();
+	for_each_rmrr_units(rmrr) {
+		/*
+		 * Return TRUE if this RMRR contains the device that
+		 * is passed in.
+		 */
+		for_each_active_dev_scope(rmrr->devices,
+					  rmrr->devices_cnt, i, tmp)
+			if (tmp == dev) {
+				rcu_read_unlock();
+				return true;
+			}
+	}
+	rcu_read_unlock();
+	return false;
+}
+
+/*
+ * There are a couple cases where we need to restrict the functionality of
+ * devices associated with RMRRs.  The first is when evaluating a device for
+ * identity mapping because problems exist when devices are moved in and out
+ * of domains and their respective RMRR information is lost.  This means that
+ * a device with associated RMRRs will never be in a "passthrough" domain.
+ * The second is use of the device through the IOMMU API.  This interface
+ * expects to have full control of the IOVA space for the device.  We cannot
+ * satisfy both the requirement that RMRR access is maintained and have an
+ * unencumbered IOVA space.  We also have no ability to quiesce the device's
+ * use of the RMRR space or even inform the IOMMU API user of the restriction.
+ * We therefore prevent devices associated with an RMRR from participating in
+ * the IOMMU API, which eliminates them from device assignment.
+ *
+ * In both cases we assume that PCI USB devices with RMRRs have them largely
+ * for historical reasons and that the RMRR space is not actively used post
+ * boot.  This exclusion may change if vendors begin to abuse it.
+ *
+ * The same exception is made for graphics devices, with the requirement that
+ * any use of the RMRR regions will be torn down before assigning the device
+ * to a guest.
+ */
+static bool device_is_rmrr_locked(struct device *dev)
+{
+	if (!device_has_rmrr(dev))
+		return false;
+
+	if (dev_is_pci(dev)) {
+		struct pci_dev *pdev = to_pci_dev(dev);
+
+		if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
+			return false;
+	}
+
+	return true;
+}
+
+static int iommu_should_identity_map(struct device *dev, int startup)
+{
+
+	if (dev_is_pci(dev)) {
+		struct pci_dev *pdev = to_pci_dev(dev);
+
+		if (device_is_rmrr_locked(dev))
+			return 0;
+
+		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
+			return 1;
+
+		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
+			return 1;
+
+		if (!(iommu_identity_mapping & IDENTMAP_ALL))
+			return 0;
+
+		/*
+		 * We want to start off with all devices in the 1:1 domain, and
+		 * take them out later if we find they can't access all of memory.
+		 *
+		 * However, we can't do this for PCI devices behind bridges,
+		 * because all PCI devices behind the same bridge will end up
+		 * with the same source-id on their transactions.
+		 *
+		 * Practically speaking, we can't change things around for these
+		 * devices at run-time, because we can't be sure there'll be no
+		 * DMA transactions in flight for any of their siblings.
+		 *
+		 * So PCI devices (unless they're on the root bus) as well as
+		 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
+		 * the 1:1 domain, just in _case_ one of their siblings turns out
+		 * not to be able to map all of memory.
+		 */
+		if (!pci_is_pcie(pdev)) {
+			if (!pci_is_root_bus(pdev->bus))
+				return 0;
+			if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
+				return 0;
+		} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
+			return 0;
+	} else {
+		if (device_has_rmrr(dev))
+			return 0;
+	}
+
+	/*
+	 * At boot time, we don't yet know if devices will be 64-bit capable.
+	 * Assume that they will — if they turn out not to be, then we can
+	 * take them out of the 1:1 domain later.
+	 */
+	if (!startup) {
+		/*
+		 * If the device's dma_mask is less than the system's memory
+		 * size then this is not a candidate for identity mapping.
+		 */
+		u64 dma_mask = *dev->dma_mask;
+
+		if (dev->coherent_dma_mask &&
+		    dev->coherent_dma_mask < dma_mask)
+			dma_mask = dev->coherent_dma_mask;
+
+		return dma_mask >= dma_get_required_mask(dev);
+	}
+
+	return 1;
+}
+
+static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
+{
+	int ret;
+
+	if (!iommu_should_identity_map(dev, 1))
+		return 0;
+
+	ret = domain_add_dev_info(si_domain, dev);
+	if (!ret)
+		pr_info("%s identity mapping for device %s\n",
+			hw ? "Hardware" : "Software", dev_name(dev));
+	else if (ret == -ENODEV)
+		/* device not associated with an iommu */
+		ret = 0;
+
+	return ret;
+}
+
+
+static int __init iommu_prepare_static_identity_mapping(int hw)
+{
+	struct pci_dev *pdev = NULL;
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu;
+	struct device *dev;
+	int i;
+	int ret = 0;
+
+	for_each_pci_dev(pdev) {
+		ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
+		if (ret)
+			return ret;
+	}
+
+	for_each_active_iommu(iommu, drhd)
+		for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
+			struct acpi_device_physical_node *pn;
+			struct acpi_device *adev;
+
+			if (dev->bus != &acpi_bus_type)
+				continue;
+
+			adev= to_acpi_device(dev);
+			mutex_lock(&adev->physical_node_lock);
+			list_for_each_entry(pn, &adev->physical_node_list, node) {
+				ret = dev_prepare_static_identity_mapping(pn->dev, hw);
+				if (ret)
+					break;
+			}
+			mutex_unlock(&adev->physical_node_lock);
+			if (ret)
+				return ret;
+		}
+
+	return 0;
+}
+
+static void intel_iommu_init_qi(struct intel_iommu *iommu)
+{
+	/*
+	 * Start from the sane iommu hardware state.
+	 * If the queued invalidation is already initialized by us
+	 * (for example, while enabling interrupt-remapping) then
+	 * we got the things already rolling from a sane state.
+	 */
+	if (!iommu->qi) {
+		/*
+		 * Clear any previous faults.
+		 */
+		dmar_fault(-1, iommu);
+		/*
+		 * Disable queued invalidation if supported and already enabled
+		 * before OS handover.
+		 */
+		dmar_disable_qi(iommu);
+	}
+
+	if (dmar_enable_qi(iommu)) {
+		/*
+		 * Queued Invalidate not enabled, use Register Based Invalidate
+		 */
+		iommu->flush.flush_context = __iommu_flush_context;
+		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
+		pr_info("%s: Using Register based invalidation\n",
+			iommu->name);
+	} else {
+		iommu->flush.flush_context = qi_flush_context;
+		iommu->flush.flush_iotlb = qi_flush_iotlb;
+		pr_info("%s: Using Queued invalidation\n", iommu->name);
+	}
+}
+
+static int copy_context_table(struct intel_iommu *iommu,
+			      struct root_entry *old_re,
+			      struct context_entry **tbl,
+			      int bus, bool ext)
+{
+	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
+	struct context_entry *new_ce = NULL, ce;
+	struct context_entry *old_ce = NULL;
+	struct root_entry re;
+	phys_addr_t old_ce_phys;
+
+	tbl_idx = ext ? bus * 2 : bus;
+	memcpy(&re, old_re, sizeof(re));
+
+	for (devfn = 0; devfn < 256; devfn++) {
+		/* First calculate the correct index */
+		idx = (ext ? devfn * 2 : devfn) % 256;
+
+		if (idx == 0) {
+			/* First save what we may have and clean up */
+			if (new_ce) {
+				tbl[tbl_idx] = new_ce;
+				__iommu_flush_cache(iommu, new_ce,
+						    VTD_PAGE_SIZE);
+				pos = 1;
+			}
+
+			if (old_ce)
+				memunmap(old_ce);
+
+			ret = 0;
+			if (devfn < 0x80)
+				old_ce_phys = root_entry_lctp(&re);
+			else
+				old_ce_phys = root_entry_uctp(&re);
+
+			if (!old_ce_phys) {
+				if (ext && devfn == 0) {
+					/* No LCTP, try UCTP */
+					devfn = 0x7f;
+					continue;
+				} else {
+					goto out;
+				}
+			}
+
+			ret = -ENOMEM;
+			old_ce = memremap(old_ce_phys, PAGE_SIZE,
+					MEMREMAP_WB);
+			if (!old_ce)
+				goto out;
+
+			new_ce = alloc_pgtable_page(iommu->node);
+			if (!new_ce)
+				goto out_unmap;
+
+			ret = 0;
+		}
+
+		/* Now copy the context entry */
+		memcpy(&ce, old_ce + idx, sizeof(ce));
+
+		if (!__context_present(&ce))
+			continue;
+
+		did = context_domain_id(&ce);
+		if (did >= 0 && did < cap_ndoms(iommu->cap))
+			set_bit(did, iommu->domain_ids);
+
+		/*
+		 * We need a marker for copied context entries. This
+		 * marker needs to work for the old format as well as
+		 * for extended context entries.
+		 *
+		 * Bit 67 of the context entry is used. In the old
+		 * format this bit is available to software, in the
+		 * extended format it is the PGE bit, but PGE is ignored
+		 * by HW if PASIDs are disabled (and thus still
+		 * available).
+		 *
+		 * So disable PASIDs first and then mark the entry
+		 * copied. This means that we don't copy PASID
+		 * translations from the old kernel, but this is fine as
+		 * faults there are not fatal.
+		 */
+		context_clear_pasid_enable(&ce);
+		context_set_copied(&ce);
+
+		new_ce[idx] = ce;
+	}
+
+	tbl[tbl_idx + pos] = new_ce;
+
+	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
+
+out_unmap:
+	memunmap(old_ce);
+
+out:
+	return ret;
+}
+
+static int copy_translation_tables(struct intel_iommu *iommu)
+{
+	struct context_entry **ctxt_tbls;
+	struct root_entry *old_rt;
+	phys_addr_t old_rt_phys;
+	int ctxt_table_entries;
+	unsigned long flags;
+	u64 rtaddr_reg;
+	int bus, ret;
+	bool new_ext, ext;
+
+	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
+	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
+	new_ext    = !!ecap_ecs(iommu->ecap);
+
+	/*
+	 * The RTT bit can only be changed when translation is disabled,
+	 * but disabling translation means to open a window for data
+	 * corruption. So bail out and don't copy anything if we would
+	 * have to change the bit.
+	 */
+	if (new_ext != ext)
+		return -EINVAL;
+
+	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
+	if (!old_rt_phys)
+		return -EINVAL;
+
+	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
+	if (!old_rt)
+		return -ENOMEM;
+
+	/* This is too big for the stack - allocate it from slab */
+	ctxt_table_entries = ext ? 512 : 256;
+	ret = -ENOMEM;
+	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
+	if (!ctxt_tbls)
+		goto out_unmap;
+
+	for (bus = 0; bus < 256; bus++) {
+		ret = copy_context_table(iommu, &old_rt[bus],
+					 ctxt_tbls, bus, ext);
+		if (ret) {
+			pr_err("%s: Failed to copy context table for bus %d\n",
+				iommu->name, bus);
+			continue;
+		}
+	}
+
+	spin_lock_irqsave(&iommu->lock, flags);
+
+	/* Context tables are copied, now write them to the root_entry table */
+	for (bus = 0; bus < 256; bus++) {
+		int idx = ext ? bus * 2 : bus;
+		u64 val;
+
+		if (ctxt_tbls[idx]) {
+			val = virt_to_phys(ctxt_tbls[idx]) | 1;
+			iommu->root_entry[bus].lo = val;
+		}
+
+		if (!ext || !ctxt_tbls[idx + 1])
+			continue;
+
+		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
+		iommu->root_entry[bus].hi = val;
+	}
+
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	kfree(ctxt_tbls);
+
+	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
+
+	ret = 0;
+
+out_unmap:
+	memunmap(old_rt);
+
+	return ret;
+}
+
+static int __init init_dmars(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct dmar_rmrr_unit *rmrr;
+	bool copied_tables = false;
+	struct device *dev;
+	struct intel_iommu *iommu;
+	int i, ret;
+
+	/*
+	 * for each drhd
+	 *    allocate root
+	 *    initialize and program root entry to not present
+	 * endfor
+	 */
+	for_each_drhd_unit(drhd) {
+		/*
+		 * lock not needed as this is only incremented in the single
+		 * threaded kernel __init code path all other access are read
+		 * only
+		 */
+		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
+			g_num_of_iommus++;
+			continue;
+		}
+		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
+	}
+
+	/* Preallocate enough resources for IOMMU hot-addition */
+	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
+		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
+
+	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
+			GFP_KERNEL);
+	if (!g_iommus) {
+		pr_err("Allocating global iommu array failed\n");
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	for_each_active_iommu(iommu, drhd) {
+		/*
+		 * Find the max pasid size of all IOMMU's in the system.
+		 * We need to ensure the system pasid table is no bigger
+		 * than the smallest supported.
+		 */
+		if (pasid_enabled(iommu)) {
+			u32 temp = 2 << ecap_pss(iommu->ecap);
+
+			intel_pasid_max_id = min_t(u32, temp,
+						   intel_pasid_max_id);
+		}
+
+		g_iommus[iommu->seq_id] = iommu;
+
+		intel_iommu_init_qi(iommu);
+
+		ret = iommu_init_domains(iommu);
+		if (ret)
+			goto free_iommu;
+
+		init_translation_status(iommu);
+
+		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
+			iommu_disable_translation(iommu);
+			clear_translation_pre_enabled(iommu);
+			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
+				iommu->name);
+		}
+
+		/*
+		 * TBD:
+		 * we could share the same root & context tables
+		 * among all IOMMU's. Need to Split it later.
+		 */
+		ret = iommu_alloc_root_entry(iommu);
+		if (ret)
+			goto free_iommu;
+
+		if (translation_pre_enabled(iommu)) {
+			pr_info("Translation already enabled - trying to copy translation structures\n");
+
+			ret = copy_translation_tables(iommu);
+			if (ret) {
+				/*
+				 * We found the IOMMU with translation
+				 * enabled - but failed to copy over the
+				 * old root-entry table. Try to proceed
+				 * by disabling translation now and
+				 * allocating a clean root-entry table.
+				 * This might cause DMAR faults, but
+				 * probably the dump will still succeed.
+				 */
+				pr_err("Failed to copy translation tables from previous kernel for %s\n",
+				       iommu->name);
+				iommu_disable_translation(iommu);
+				clear_translation_pre_enabled(iommu);
+			} else {
+				pr_info("Copied translation tables from previous kernel for %s\n",
+					iommu->name);
+				copied_tables = true;
+			}
+		}
+
+		if (!ecap_pass_through(iommu->ecap))
+			hw_pass_through = 0;
+
+		if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
+			pr_info("Disable batched IOTLB flush due to virtualization");
+			intel_iommu_strict = 1;
+		}
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+		if (pasid_enabled(iommu))
+			intel_svm_init(iommu);
+#endif
+	}
+
+	/*
+	 * Now that qi is enabled on all iommus, set the root entry and flush
+	 * caches. This is required on some Intel X58 chipsets, otherwise the
+	 * flush_context function will loop forever and the boot hangs.
+	 */
+	for_each_active_iommu(iommu, drhd) {
+		iommu_flush_write_buffer(iommu);
+		iommu_set_root_entry(iommu);
+		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
+		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
+	}
+
+	if (iommu_pass_through)
+		iommu_identity_mapping |= IDENTMAP_ALL;
+
+#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
+	dmar_map_gfx = 0;
+#endif
+
+	if (!dmar_map_gfx)
+		iommu_identity_mapping |= IDENTMAP_GFX;
+
+	check_tylersburg_isoch();
+
+	if (iommu_identity_mapping) {
+		ret = si_domain_init(hw_pass_through);
+		if (ret)
+			goto free_iommu;
+	}
+
+
+	/*
+	 * If we copied translations from a previous kernel in the kdump
+	 * case, we can not assign the devices to domains now, as that
+	 * would eliminate the old mappings. So skip this part and defer
+	 * the assignment to device driver initialization time.
+	 */
+	if (copied_tables)
+		goto domains_done;
+
+	/*
+	 * If pass through is not set or not enabled, setup context entries for
+	 * identity mappings for rmrr, gfx, and isa and may fall back to static
+	 * identity mapping if iommu_identity_mapping is set.
+	 */
+	if (iommu_identity_mapping) {
+		ret = iommu_prepare_static_identity_mapping(hw_pass_through);
+		if (ret) {
+			pr_crit("Failed to setup IOMMU pass-through\n");
+			goto free_iommu;
+		}
+	}
+	/*
+	 * For each rmrr
+	 *   for each dev attached to rmrr
+	 *   do
+	 *     locate drhd for dev, alloc domain for dev
+	 *     allocate free domain
+	 *     allocate page table entries for rmrr
+	 *     if context not allocated for bus
+	 *           allocate and init context
+	 *           set present in root table for this bus
+	 *     init context with domain, translation etc
+	 *    endfor
+	 * endfor
+	 */
+	pr_info("Setting RMRR:\n");
+	for_each_rmrr_units(rmrr) {
+		/* some BIOS lists non-exist devices in DMAR table. */
+		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
+					  i, dev) {
+			ret = iommu_prepare_rmrr_dev(rmrr, dev);
+			if (ret)
+				pr_err("Mapping reserved region failed\n");
+		}
+	}
+
+	iommu_prepare_isa();
+
+domains_done:
+
+	/*
+	 * for each drhd
+	 *   enable fault log
+	 *   global invalidate context cache
+	 *   global invalidate iotlb
+	 *   enable translation
+	 */
+	for_each_iommu(iommu, drhd) {
+		if (drhd->ignored) {
+			/*
+			 * we always have to disable PMRs or DMA may fail on
+			 * this device
+			 */
+			if (force_on)
+				iommu_disable_protect_mem_regions(iommu);
+			continue;
+		}
+
+		iommu_flush_write_buffer(iommu);
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+		if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
+			ret = intel_svm_enable_prq(iommu);
+			if (ret)
+				goto free_iommu;
+		}
+#endif
+		ret = dmar_set_interrupt(iommu);
+		if (ret)
+			goto free_iommu;
+
+		if (!translation_pre_enabled(iommu))
+			iommu_enable_translation(iommu);
+
+		iommu_disable_protect_mem_regions(iommu);
+	}
+
+	return 0;
+
+free_iommu:
+	for_each_active_iommu(iommu, drhd) {
+		disable_dmar_iommu(iommu);
+		free_dmar_iommu(iommu);
+	}
+
+	kfree(g_iommus);
+
+error:
+	return ret;
+}
+
+/* This takes a number of _MM_ pages, not VTD pages */
+static unsigned long intel_alloc_iova(struct device *dev,
+				     struct dmar_domain *domain,
+				     unsigned long nrpages, uint64_t dma_mask)
+{
+	unsigned long iova_pfn = 0;
+
+	/* Restrict dma_mask to the width that the iommu can handle */
+	dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
+	/* Ensure we reserve the whole size-aligned region */
+	nrpages = __roundup_pow_of_two(nrpages);
+
+	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
+		/*
+		 * First try to allocate an io virtual address in
+		 * DMA_BIT_MASK(32) and if that fails then try allocating
+		 * from higher range
+		 */
+		iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
+					   IOVA_PFN(DMA_BIT_MASK(32)), false);
+		if (iova_pfn)
+			return iova_pfn;
+	}
+	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
+				   IOVA_PFN(dma_mask), true);
+	if (unlikely(!iova_pfn)) {
+		pr_err("Allocating %ld-page iova for %s failed",
+		       nrpages, dev_name(dev));
+		return 0;
+	}
+
+	return iova_pfn;
+}
+
+struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
+{
+	struct dmar_domain *domain, *tmp;
+	struct dmar_rmrr_unit *rmrr;
+	struct device *i_dev;
+	int i, ret;
+
+	domain = find_domain(dev);
+	if (domain)
+		goto out;
+
+	domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
+	if (!domain)
+		goto out;
+
+	/* We have a new domain - setup possible RMRRs for the device */
+	rcu_read_lock();
+	for_each_rmrr_units(rmrr) {
+		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
+					  i, i_dev) {
+			if (i_dev != dev)
+				continue;
+
+			ret = domain_prepare_identity_map(dev, domain,
+							  rmrr->base_address,
+							  rmrr->end_address);
+			if (ret)
+				dev_err(dev, "Mapping reserved region failed\n");
+		}
+	}
+	rcu_read_unlock();
+
+	tmp = set_domain_for_dev(dev, domain);
+	if (!tmp || domain != tmp) {
+		domain_exit(domain);
+		domain = tmp;
+	}
+
+out:
+
+	if (!domain)
+		pr_err("Allocating domain for %s failed\n", dev_name(dev));
+
+
+	return domain;
+}
+
+/* Check if the dev needs to go through non-identity map and unmap process.*/
+static int iommu_no_mapping(struct device *dev)
+{
+	int found;
+
+	if (iommu_dummy(dev))
+		return 1;
+
+	if (!iommu_identity_mapping)
+		return 0;
+
+	found = identity_mapping(dev);
+	if (found) {
+		if (iommu_should_identity_map(dev, 0))
+			return 1;
+		else {
+			/*
+			 * 32 bit DMA is removed from si_domain and fall back
+			 * to non-identity mapping.
+			 */
+			dmar_remove_one_dev_info(si_domain, dev);
+			pr_info("32bit %s uses non-identity mapping\n",
+				dev_name(dev));
+			return 0;
+		}
+	} else {
+		/*
+		 * In case of a detached 64 bit DMA device from vm, the device
+		 * is put into si_domain for identity mapping.
+		 */
+		if (iommu_should_identity_map(dev, 0)) {
+			int ret;
+			ret = domain_add_dev_info(si_domain, dev);
+			if (!ret) {
+				pr_info("64bit %s uses identity mapping\n",
+					dev_name(dev));
+				return 1;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
+				     size_t size, int dir, u64 dma_mask)
+{
+	struct dmar_domain *domain;
+	phys_addr_t start_paddr;
+	unsigned long iova_pfn;
+	int prot = 0;
+	int ret;
+	struct intel_iommu *iommu;
+	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
+
+	BUG_ON(dir == DMA_NONE);
+
+	if (iommu_no_mapping(dev))
+		return paddr;
+
+	domain = get_valid_domain_for_dev(dev);
+	if (!domain)
+		return 0;
+
+	iommu = domain_get_iommu(domain);
+	size = aligned_nrpages(paddr, size);
+
+	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
+	if (!iova_pfn)
+		goto error;
+
+	/*
+	 * Check if DMAR supports zero-length reads on write only
+	 * mappings..
+	 */
+	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
+			!cap_zlr(iommu->cap))
+		prot |= DMA_PTE_READ;
+	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+		prot |= DMA_PTE_WRITE;
+	/*
+	 * paddr - (paddr + size) might be partial page, we should map the whole
+	 * page.  Note: if two part of one page are separately mapped, we
+	 * might have two guest_addr mapping to the same host paddr, but this
+	 * is not a big problem
+	 */
+	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
+				 mm_to_dma_pfn(paddr_pfn), size, prot);
+	if (ret)
+		goto error;
+
+	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
+	start_paddr += paddr & ~PAGE_MASK;
+	return start_paddr;
+
+error:
+	if (iova_pfn)
+		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
+	pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
+		dev_name(dev), size, (unsigned long long)paddr, dir);
+	return 0;
+}
+
+static dma_addr_t intel_map_page(struct device *dev, struct page *page,
+				 unsigned long offset, size_t size,
+				 enum dma_data_direction dir,
+				 unsigned long attrs)
+{
+	return __intel_map_single(dev, page_to_phys(page) + offset, size,
+				  dir, *dev->dma_mask);
+}
+
+static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
+{
+	struct dmar_domain *domain;
+	unsigned long start_pfn, last_pfn;
+	unsigned long nrpages;
+	unsigned long iova_pfn;
+	struct intel_iommu *iommu;
+	struct page *freelist;
+
+	if (iommu_no_mapping(dev))
+		return;
+
+	domain = find_domain(dev);
+	BUG_ON(!domain);
+
+	iommu = domain_get_iommu(domain);
+
+	iova_pfn = IOVA_PFN(dev_addr);
+
+	nrpages = aligned_nrpages(dev_addr, size);
+	start_pfn = mm_to_dma_pfn(iova_pfn);
+	last_pfn = start_pfn + nrpages - 1;
+
+	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
+		 dev_name(dev), start_pfn, last_pfn);
+
+	freelist = domain_unmap(domain, start_pfn, last_pfn);
+
+	if (intel_iommu_strict || !has_iova_flush_queue(&domain->iovad)) {
+		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
+				      nrpages, !freelist, 0);
+		/* free iova */
+		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
+		dma_free_pagelist(freelist);
+	} else {
+		queue_iova(&domain->iovad, iova_pfn, nrpages,
+			   (unsigned long)freelist);
+		/*
+		 * queue up the release of the unmap to save the 1/6th of the
+		 * cpu used up by the iotlb flush operation...
+		 */
+	}
+}
+
+static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
+			     size_t size, enum dma_data_direction dir,
+			     unsigned long attrs)
+{
+	intel_unmap(dev, dev_addr, size);
+}
+
+static void *intel_alloc_coherent(struct device *dev, size_t size,
+				  dma_addr_t *dma_handle, gfp_t flags,
+				  unsigned long attrs)
+{
+	struct page *page = NULL;
+	int order;
+
+	size = PAGE_ALIGN(size);
+	order = get_order(size);
+
+	if (!iommu_no_mapping(dev))
+		flags &= ~(GFP_DMA | GFP_DMA32);
+	else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
+		if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
+			flags |= GFP_DMA;
+		else
+			flags |= GFP_DMA32;
+	}
+
+	if (gfpflags_allow_blocking(flags)) {
+		unsigned int count = size >> PAGE_SHIFT;
+
+		page = dma_alloc_from_contiguous(dev, count, order,
+						 flags & __GFP_NOWARN);
+		if (page && iommu_no_mapping(dev) &&
+		    page_to_phys(page) + size > dev->coherent_dma_mask) {
+			dma_release_from_contiguous(dev, page, count);
+			page = NULL;
+		}
+	}
+
+	if (!page)
+		page = alloc_pages(flags, order);
+	if (!page)
+		return NULL;
+	memset(page_address(page), 0, size);
+
+	*dma_handle = __intel_map_single(dev, page_to_phys(page), size,
+					 DMA_BIDIRECTIONAL,
+					 dev->coherent_dma_mask);
+	if (*dma_handle)
+		return page_address(page);
+	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
+		__free_pages(page, order);
+
+	return NULL;
+}
+
+static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
+				dma_addr_t dma_handle, unsigned long attrs)
+{
+	int order;
+	struct page *page = virt_to_page(vaddr);
+
+	size = PAGE_ALIGN(size);
+	order = get_order(size);
+
+	intel_unmap(dev, dma_handle, size);
+	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
+		__free_pages(page, order);
+}
+
+static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
+			   int nelems, enum dma_data_direction dir,
+			   unsigned long attrs)
+{
+	dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
+	unsigned long nrpages = 0;
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sglist, sg, nelems, i) {
+		nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
+	}
+
+	intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
+}
+
+static int intel_nontranslate_map_sg(struct device *hddev,
+	struct scatterlist *sglist, int nelems, int dir)
+{
+	int i;
+	struct scatterlist *sg;
+
+	for_each_sg(sglist, sg, nelems, i) {
+		BUG_ON(!sg_page(sg));
+		sg->dma_address = sg_phys(sg);
+		sg->dma_length = sg->length;
+	}
+	return nelems;
+}
+
+static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
+			enum dma_data_direction dir, unsigned long attrs)
+{
+	int i;
+	struct dmar_domain *domain;
+	size_t size = 0;
+	int prot = 0;
+	unsigned long iova_pfn;
+	int ret;
+	struct scatterlist *sg;
+	unsigned long start_vpfn;
+	struct intel_iommu *iommu;
+
+	BUG_ON(dir == DMA_NONE);
+	if (iommu_no_mapping(dev))
+		return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
+
+	domain = get_valid_domain_for_dev(dev);
+	if (!domain)
+		return 0;
+
+	iommu = domain_get_iommu(domain);
+
+	for_each_sg(sglist, sg, nelems, i)
+		size += aligned_nrpages(sg->offset, sg->length);
+
+	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
+				*dev->dma_mask);
+	if (!iova_pfn) {
+		sglist->dma_length = 0;
+		return 0;
+	}
+
+	/*
+	 * Check if DMAR supports zero-length reads on write only
+	 * mappings..
+	 */
+	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
+			!cap_zlr(iommu->cap))
+		prot |= DMA_PTE_READ;
+	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+		prot |= DMA_PTE_WRITE;
+
+	start_vpfn = mm_to_dma_pfn(iova_pfn);
+
+	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
+	if (unlikely(ret)) {
+		dma_pte_free_pagetable(domain, start_vpfn,
+				       start_vpfn + size - 1,
+				       agaw_to_level(domain->agaw) + 1);
+		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
+		return 0;
+	}
+
+	return nelems;
+}
+
+static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+	return !dma_addr;
+}
+
+const struct dma_map_ops intel_dma_ops = {
+	.alloc = intel_alloc_coherent,
+	.free = intel_free_coherent,
+	.map_sg = intel_map_sg,
+	.unmap_sg = intel_unmap_sg,
+	.map_page = intel_map_page,
+	.unmap_page = intel_unmap_page,
+	.mapping_error = intel_mapping_error,
+#ifdef CONFIG_X86
+	.dma_supported = dma_direct_supported,
+#endif
+};
+
+static inline int iommu_domain_cache_init(void)
+{
+	int ret = 0;
+
+	iommu_domain_cache = kmem_cache_create("iommu_domain",
+					 sizeof(struct dmar_domain),
+					 0,
+					 SLAB_HWCACHE_ALIGN,
+
+					 NULL);
+	if (!iommu_domain_cache) {
+		pr_err("Couldn't create iommu_domain cache\n");
+		ret = -ENOMEM;
+	}
+
+	return ret;
+}
+
+static inline int iommu_devinfo_cache_init(void)
+{
+	int ret = 0;
+
+	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
+					 sizeof(struct device_domain_info),
+					 0,
+					 SLAB_HWCACHE_ALIGN,
+					 NULL);
+	if (!iommu_devinfo_cache) {
+		pr_err("Couldn't create devinfo cache\n");
+		ret = -ENOMEM;
+	}
+
+	return ret;
+}
+
+static int __init iommu_init_mempool(void)
+{
+	int ret;
+	ret = iova_cache_get();
+	if (ret)
+		return ret;
+
+	ret = iommu_domain_cache_init();
+	if (ret)
+		goto domain_error;
+
+	ret = iommu_devinfo_cache_init();
+	if (!ret)
+		return ret;
+
+	kmem_cache_destroy(iommu_domain_cache);
+domain_error:
+	iova_cache_put();
+
+	return -ENOMEM;
+}
+
+static void __init iommu_exit_mempool(void)
+{
+	kmem_cache_destroy(iommu_devinfo_cache);
+	kmem_cache_destroy(iommu_domain_cache);
+	iova_cache_put();
+}
+
+static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
+{
+	struct dmar_drhd_unit *drhd;
+	u32 vtbar;
+	int rc;
+
+	/* We know that this device on this chipset has its own IOMMU.
+	 * If we find it under a different IOMMU, then the BIOS is lying
+	 * to us. Hope that the IOMMU for this device is actually
+	 * disabled, and it needs no translation...
+	 */
+	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
+	if (rc) {
+		/* "can't" happen */
+		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
+		return;
+	}
+	vtbar &= 0xffff0000;
+
+	/* we know that the this iommu should be at offset 0xa000 from vtbar */
+	drhd = dmar_find_matched_drhd_unit(pdev);
+	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
+		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
+		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
+	}
+}
+DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
+
+static void __init init_no_remapping_devices(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct device *dev;
+	int i;
+
+	for_each_drhd_unit(drhd) {
+		if (!drhd->include_all) {
+			for_each_active_dev_scope(drhd->devices,
+						  drhd->devices_cnt, i, dev)
+				break;
+			/* ignore DMAR unit if no devices exist */
+			if (i == drhd->devices_cnt)
+				drhd->ignored = 1;
+		}
+	}
+
+	for_each_active_drhd_unit(drhd) {
+		if (drhd->include_all)
+			continue;
+
+		for_each_active_dev_scope(drhd->devices,
+					  drhd->devices_cnt, i, dev)
+			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
+				break;
+		if (i < drhd->devices_cnt)
+			continue;
+
+		/* This IOMMU has *only* gfx devices. Either bypass it or
+		   set the gfx_mapped flag, as appropriate */
+		if (!dmar_map_gfx) {
+			drhd->ignored = 1;
+			for_each_active_dev_scope(drhd->devices,
+						  drhd->devices_cnt, i, dev)
+				dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
+		}
+	}
+}
+
+#ifdef CONFIG_SUSPEND
+static int init_iommu_hw(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu = NULL;
+
+	for_each_active_iommu(iommu, drhd)
+		if (iommu->qi)
+			dmar_reenable_qi(iommu);
+
+	for_each_iommu(iommu, drhd) {
+		if (drhd->ignored) {
+			/*
+			 * we always have to disable PMRs or DMA may fail on
+			 * this device
+			 */
+			if (force_on)
+				iommu_disable_protect_mem_regions(iommu);
+			continue;
+		}
+	
+		iommu_flush_write_buffer(iommu);
+
+		iommu_set_root_entry(iommu);
+
+		iommu->flush.flush_context(iommu, 0, 0, 0,
+					   DMA_CCMD_GLOBAL_INVL);
+		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
+		iommu_enable_translation(iommu);
+		iommu_disable_protect_mem_regions(iommu);
+	}
+
+	return 0;
+}
+
+static void iommu_flush_all(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu;
+
+	for_each_active_iommu(iommu, drhd) {
+		iommu->flush.flush_context(iommu, 0, 0, 0,
+					   DMA_CCMD_GLOBAL_INVL);
+		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
+					 DMA_TLB_GLOBAL_FLUSH);
+	}
+}
+
+static int iommu_suspend(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu = NULL;
+	unsigned long flag;
+
+	for_each_active_iommu(iommu, drhd) {
+		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
+						 GFP_ATOMIC);
+		if (!iommu->iommu_state)
+			goto nomem;
+	}
+
+	iommu_flush_all();
+
+	for_each_active_iommu(iommu, drhd) {
+		iommu_disable_translation(iommu);
+
+		raw_spin_lock_irqsave(&iommu->register_lock, flag);
+
+		iommu->iommu_state[SR_DMAR_FECTL_REG] =
+			readl(iommu->reg + DMAR_FECTL_REG);
+		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
+			readl(iommu->reg + DMAR_FEDATA_REG);
+		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
+			readl(iommu->reg + DMAR_FEADDR_REG);
+		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
+			readl(iommu->reg + DMAR_FEUADDR_REG);
+
+		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+	}
+	return 0;
+
+nomem:
+	for_each_active_iommu(iommu, drhd)
+		kfree(iommu->iommu_state);
+
+	return -ENOMEM;
+}
+
+static void iommu_resume(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu = NULL;
+	unsigned long flag;
+
+	if (init_iommu_hw()) {
+		if (force_on)
+			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
+		else
+			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
+		return;
+	}
+
+	for_each_active_iommu(iommu, drhd) {
+
+		raw_spin_lock_irqsave(&iommu->register_lock, flag);
+
+		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
+			iommu->reg + DMAR_FECTL_REG);
+		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
+			iommu->reg + DMAR_FEDATA_REG);
+		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
+			iommu->reg + DMAR_FEADDR_REG);
+		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
+			iommu->reg + DMAR_FEUADDR_REG);
+
+		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+	}
+
+	for_each_active_iommu(iommu, drhd)
+		kfree(iommu->iommu_state);
+}
+
+static struct syscore_ops iommu_syscore_ops = {
+	.resume		= iommu_resume,
+	.suspend	= iommu_suspend,
+};
+
+static void __init init_iommu_pm_ops(void)
+{
+	register_syscore_ops(&iommu_syscore_ops);
+}
+
+#else
+static inline void init_iommu_pm_ops(void) {}
+#endif	/* CONFIG_PM */
+
+
+int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
+{
+	struct acpi_dmar_reserved_memory *rmrr;
+	struct dmar_rmrr_unit *rmrru;
+	size_t length;
+
+	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
+	if (!rmrru)
+		goto out;
+
+	rmrru->hdr = header;
+	rmrr = (struct acpi_dmar_reserved_memory *)header;
+	rmrru->base_address = rmrr->base_address;
+	rmrru->end_address = rmrr->end_address;
+
+	length = rmrr->end_address - rmrr->base_address + 1;
+
+	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
+				((void *)rmrr) + rmrr->header.length,
+				&rmrru->devices_cnt);
+	if (rmrru->devices_cnt && rmrru->devices == NULL)
+		goto free_rmrru;
+
+	list_add(&rmrru->list, &dmar_rmrr_units);
+
+	return 0;
+free_rmrru:
+	kfree(rmrru);
+out:
+	return -ENOMEM;
+}
+
+static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
+{
+	struct dmar_atsr_unit *atsru;
+	struct acpi_dmar_atsr *tmp;
+
+	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
+		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
+		if (atsr->segment != tmp->segment)
+			continue;
+		if (atsr->header.length != tmp->header.length)
+			continue;
+		if (memcmp(atsr, tmp, atsr->header.length) == 0)
+			return atsru;
+	}
+
+	return NULL;
+}
+
+int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
+{
+	struct acpi_dmar_atsr *atsr;
+	struct dmar_atsr_unit *atsru;
+
+	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
+		return 0;
+
+	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
+	atsru = dmar_find_atsr(atsr);
+	if (atsru)
+		return 0;
+
+	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
+	if (!atsru)
+		return -ENOMEM;
+
+	/*
+	 * If memory is allocated from slab by ACPI _DSM method, we need to
+	 * copy the memory content because the memory buffer will be freed
+	 * on return.
+	 */
+	atsru->hdr = (void *)(atsru + 1);
+	memcpy(atsru->hdr, hdr, hdr->length);
+	atsru->include_all = atsr->flags & 0x1;
+	if (!atsru->include_all) {
+		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
+				(void *)atsr + atsr->header.length,
+				&atsru->devices_cnt);
+		if (atsru->devices_cnt && atsru->devices == NULL) {
+			kfree(atsru);
+			return -ENOMEM;
+		}
+	}
+
+	list_add_rcu(&atsru->list, &dmar_atsr_units);
+
+	return 0;
+}
+
+static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
+{
+	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
+	kfree(atsru);
+}
+
+int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
+{
+	struct acpi_dmar_atsr *atsr;
+	struct dmar_atsr_unit *atsru;
+
+	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
+	atsru = dmar_find_atsr(atsr);
+	if (atsru) {
+		list_del_rcu(&atsru->list);
+		synchronize_rcu();
+		intel_iommu_free_atsr(atsru);
+	}
+
+	return 0;
+}
+
+int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
+{
+	int i;
+	struct device *dev;
+	struct acpi_dmar_atsr *atsr;
+	struct dmar_atsr_unit *atsru;
+
+	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
+	atsru = dmar_find_atsr(atsr);
+	if (!atsru)
+		return 0;
+
+	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
+		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
+					  i, dev)
+			return -EBUSY;
+	}
+
+	return 0;
+}
+
+static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
+{
+	int sp, ret = 0;
+	struct intel_iommu *iommu = dmaru->iommu;
+
+	if (g_iommus[iommu->seq_id])
+		return 0;
+
+	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
+		pr_warn("%s: Doesn't support hardware pass through.\n",
+			iommu->name);
+		return -ENXIO;
+	}
+	if (!ecap_sc_support(iommu->ecap) &&
+	    domain_update_iommu_snooping(iommu)) {
+		pr_warn("%s: Doesn't support snooping.\n",
+			iommu->name);
+		return -ENXIO;
+	}
+	sp = domain_update_iommu_superpage(iommu) - 1;
+	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
+		pr_warn("%s: Doesn't support large page.\n",
+			iommu->name);
+		return -ENXIO;
+	}
+
+	/*
+	 * Disable translation if already enabled prior to OS handover.
+	 */
+	if (iommu->gcmd & DMA_GCMD_TE)
+		iommu_disable_translation(iommu);
+
+	g_iommus[iommu->seq_id] = iommu;
+	ret = iommu_init_domains(iommu);
+	if (ret == 0)
+		ret = iommu_alloc_root_entry(iommu);
+	if (ret)
+		goto out;
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+	if (pasid_enabled(iommu))
+		intel_svm_init(iommu);
+#endif
+
+	if (dmaru->ignored) {
+		/*
+		 * we always have to disable PMRs or DMA may fail on this device
+		 */
+		if (force_on)
+			iommu_disable_protect_mem_regions(iommu);
+		return 0;
+	}
+
+	intel_iommu_init_qi(iommu);
+	iommu_flush_write_buffer(iommu);
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+	if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
+		ret = intel_svm_enable_prq(iommu);
+		if (ret)
+			goto disable_iommu;
+	}
+#endif
+	ret = dmar_set_interrupt(iommu);
+	if (ret)
+		goto disable_iommu;
+
+	iommu_set_root_entry(iommu);
+	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
+	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
+	iommu_enable_translation(iommu);
+
+	iommu_disable_protect_mem_regions(iommu);
+	return 0;
+
+disable_iommu:
+	disable_dmar_iommu(iommu);
+out:
+	free_dmar_iommu(iommu);
+	return ret;
+}
+
+int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
+{
+	int ret = 0;
+	struct intel_iommu *iommu = dmaru->iommu;
+
+	if (!intel_iommu_enabled)
+		return 0;
+	if (iommu == NULL)
+		return -EINVAL;
+
+	if (insert) {
+		ret = intel_iommu_add(dmaru);
+	} else {
+		disable_dmar_iommu(iommu);
+		free_dmar_iommu(iommu);
+	}
+
+	return ret;
+}
+
+static void intel_iommu_free_dmars(void)
+{
+	struct dmar_rmrr_unit *rmrru, *rmrr_n;
+	struct dmar_atsr_unit *atsru, *atsr_n;
+
+	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
+		list_del(&rmrru->list);
+		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
+		kfree(rmrru);
+	}
+
+	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
+		list_del(&atsru->list);
+		intel_iommu_free_atsr(atsru);
+	}
+}
+
+int dmar_find_matched_atsr_unit(struct pci_dev *dev)
+{
+	int i, ret = 1;
+	struct pci_bus *bus;
+	struct pci_dev *bridge = NULL;
+	struct device *tmp;
+	struct acpi_dmar_atsr *atsr;
+	struct dmar_atsr_unit *atsru;
+
+	dev = pci_physfn(dev);
+	for (bus = dev->bus; bus; bus = bus->parent) {
+		bridge = bus->self;
+		/* If it's an integrated device, allow ATS */
+		if (!bridge)
+			return 1;
+		/* Connected via non-PCIe: no ATS */
+		if (!pci_is_pcie(bridge) ||
+		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
+			return 0;
+		/* If we found the root port, look it up in the ATSR */
+		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
+			break;
+	}
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
+		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
+		if (atsr->segment != pci_domain_nr(dev->bus))
+			continue;
+
+		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
+			if (tmp == &bridge->dev)
+				goto out;
+
+		if (atsru->include_all)
+			goto out;
+	}
+	ret = 0;
+out:
+	rcu_read_unlock();
+
+	return ret;
+}
+
+int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
+{
+	int ret = 0;
+	struct dmar_rmrr_unit *rmrru;
+	struct dmar_atsr_unit *atsru;
+	struct acpi_dmar_atsr *atsr;
+	struct acpi_dmar_reserved_memory *rmrr;
+
+	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
+		return 0;
+
+	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
+		rmrr = container_of(rmrru->hdr,
+				    struct acpi_dmar_reserved_memory, header);
+		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
+			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
+				((void *)rmrr) + rmrr->header.length,
+				rmrr->segment, rmrru->devices,
+				rmrru->devices_cnt);
+			if(ret < 0)
+				return ret;
+		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
+			dmar_remove_dev_scope(info, rmrr->segment,
+				rmrru->devices, rmrru->devices_cnt);
+		}
+	}
+
+	list_for_each_entry(atsru, &dmar_atsr_units, list) {
+		if (atsru->include_all)
+			continue;
+
+		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
+		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
+			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
+					(void *)atsr + atsr->header.length,
+					atsr->segment, atsru->devices,
+					atsru->devices_cnt);
+			if (ret > 0)
+				break;
+			else if(ret < 0)
+				return ret;
+		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
+			if (dmar_remove_dev_scope(info, atsr->segment,
+					atsru->devices, atsru->devices_cnt))
+				break;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Here we only respond to action of unbound device from driver.
+ *
+ * Added device is not attached to its DMAR domain here yet. That will happen
+ * when mapping the device to iova.
+ */
+static int device_notifier(struct notifier_block *nb,
+				  unsigned long action, void *data)
+{
+	struct device *dev = data;
+	struct dmar_domain *domain;
+
+	if (iommu_dummy(dev))
+		return 0;
+
+	if (action != BUS_NOTIFY_REMOVED_DEVICE)
+		return 0;
+
+	domain = find_domain(dev);
+	if (!domain)
+		return 0;
+
+	dmar_remove_one_dev_info(domain, dev);
+	if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
+		domain_exit(domain);
+
+	return 0;
+}
+
+static struct notifier_block device_nb = {
+	.notifier_call = device_notifier,
+};
+
+static int intel_iommu_memory_notifier(struct notifier_block *nb,
+				       unsigned long val, void *v)
+{
+	struct memory_notify *mhp = v;
+	unsigned long long start, end;
+	unsigned long start_vpfn, last_vpfn;
+
+	switch (val) {
+	case MEM_GOING_ONLINE:
+		start = mhp->start_pfn << PAGE_SHIFT;
+		end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
+		if (iommu_domain_identity_map(si_domain, start, end)) {
+			pr_warn("Failed to build identity map for [%llx-%llx]\n",
+				start, end);
+			return NOTIFY_BAD;
+		}
+		break;
+
+	case MEM_OFFLINE:
+	case MEM_CANCEL_ONLINE:
+		start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
+		last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
+		while (start_vpfn <= last_vpfn) {
+			struct iova *iova;
+			struct dmar_drhd_unit *drhd;
+			struct intel_iommu *iommu;
+			struct page *freelist;
+
+			iova = find_iova(&si_domain->iovad, start_vpfn);
+			if (iova == NULL) {
+				pr_debug("Failed get IOVA for PFN %lx\n",
+					 start_vpfn);
+				break;
+			}
+
+			iova = split_and_remove_iova(&si_domain->iovad, iova,
+						     start_vpfn, last_vpfn);
+			if (iova == NULL) {
+				pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
+					start_vpfn, last_vpfn);
+				return NOTIFY_BAD;
+			}
+
+			freelist = domain_unmap(si_domain, iova->pfn_lo,
+					       iova->pfn_hi);
+
+			rcu_read_lock();
+			for_each_active_iommu(iommu, drhd)
+				iommu_flush_iotlb_psi(iommu, si_domain,
+					iova->pfn_lo, iova_size(iova),
+					!freelist, 0);
+			rcu_read_unlock();
+			dma_free_pagelist(freelist);
+
+			start_vpfn = iova->pfn_hi + 1;
+			free_iova_mem(iova);
+		}
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block intel_iommu_memory_nb = {
+	.notifier_call = intel_iommu_memory_notifier,
+	.priority = 0
+};
+
+static void free_all_cpu_cached_iovas(unsigned int cpu)
+{
+	int i;
+
+	for (i = 0; i < g_num_of_iommus; i++) {
+		struct intel_iommu *iommu = g_iommus[i];
+		struct dmar_domain *domain;
+		int did;
+
+		if (!iommu)
+			continue;
+
+		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
+			domain = get_iommu_domain(iommu, (u16)did);
+
+			if (!domain)
+				continue;
+			free_cpu_cached_iovas(cpu, &domain->iovad);
+		}
+	}
+}
+
+static int intel_iommu_cpu_dead(unsigned int cpu)
+{
+	free_all_cpu_cached_iovas(cpu);
+	return 0;
+}
+
+static void intel_disable_iommus(void)
+{
+	struct intel_iommu *iommu = NULL;
+	struct dmar_drhd_unit *drhd;
+
+	for_each_iommu(iommu, drhd)
+		iommu_disable_translation(iommu);
+}
+
+static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
+{
+	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
+
+	return container_of(iommu_dev, struct intel_iommu, iommu);
+}
+
+static ssize_t intel_iommu_show_version(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
+	u32 ver = readl(iommu->reg + DMAR_VER_REG);
+	return sprintf(buf, "%d:%d\n",
+		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
+}
+static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
+
+static ssize_t intel_iommu_show_address(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
+	return sprintf(buf, "%llx\n", iommu->reg_phys);
+}
+static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
+
+static ssize_t intel_iommu_show_cap(struct device *dev,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
+	return sprintf(buf, "%llx\n", iommu->cap);
+}
+static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
+
+static ssize_t intel_iommu_show_ecap(struct device *dev,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
+	return sprintf(buf, "%llx\n", iommu->ecap);
+}
+static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
+
+static ssize_t intel_iommu_show_ndoms(struct device *dev,
+				      struct device_attribute *attr,
+				      char *buf)
+{
+	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
+	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
+}
+static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
+
+static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
+	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
+						  cap_ndoms(iommu->cap)));
+}
+static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
+
+static struct attribute *intel_iommu_attrs[] = {
+	&dev_attr_version.attr,
+	&dev_attr_address.attr,
+	&dev_attr_cap.attr,
+	&dev_attr_ecap.attr,
+	&dev_attr_domains_supported.attr,
+	&dev_attr_domains_used.attr,
+	NULL,
+};
+
+static struct attribute_group intel_iommu_group = {
+	.name = "intel-iommu",
+	.attrs = intel_iommu_attrs,
+};
+
+const struct attribute_group *intel_iommu_groups[] = {
+	&intel_iommu_group,
+	NULL,
+};
+
+int __init intel_iommu_init(void)
+{
+	int ret = -ENODEV;
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu;
+
+	/* VT-d is required for a TXT/tboot launch, so enforce that */
+	force_on = tboot_force_iommu();
+
+	if (iommu_init_mempool()) {
+		if (force_on)
+			panic("tboot: Failed to initialize iommu memory\n");
+		return -ENOMEM;
+	}
+
+	down_write(&dmar_global_lock);
+	if (dmar_table_init()) {
+		if (force_on)
+			panic("tboot: Failed to initialize DMAR table\n");
+		goto out_free_dmar;
+	}
+
+	if (dmar_dev_scope_init() < 0) {
+		if (force_on)
+			panic("tboot: Failed to initialize DMAR device scope\n");
+		goto out_free_dmar;
+	}
+
+	up_write(&dmar_global_lock);
+
+	/*
+	 * The bus notifier takes the dmar_global_lock, so lockdep will
+	 * complain later when we register it under the lock.
+	 */
+	dmar_register_bus_notifier();
+
+	down_write(&dmar_global_lock);
+
+	if (no_iommu || dmar_disabled) {
+		/*
+		 * We exit the function here to ensure IOMMU's remapping and
+		 * mempool aren't setup, which means that the IOMMU's PMRs
+		 * won't be disabled via the call to init_dmars(). So disable
+		 * it explicitly here. The PMRs were setup by tboot prior to
+		 * calling SENTER, but the kernel is expected to reset/tear
+		 * down the PMRs.
+		 */
+		if (intel_iommu_tboot_noforce) {
+			for_each_iommu(iommu, drhd)
+				iommu_disable_protect_mem_regions(iommu);
+		}
+
+		/*
+		 * Make sure the IOMMUs are switched off, even when we
+		 * boot into a kexec kernel and the previous kernel left
+		 * them enabled
+		 */
+		intel_disable_iommus();
+		goto out_free_dmar;
+	}
+
+	if (list_empty(&dmar_rmrr_units))
+		pr_info("No RMRR found\n");
+
+	if (list_empty(&dmar_atsr_units))
+		pr_info("No ATSR found\n");
+
+	if (dmar_init_reserved_ranges()) {
+		if (force_on)
+			panic("tboot: Failed to reserve iommu ranges\n");
+		goto out_free_reserved_range;
+	}
+
+	if (dmar_map_gfx)
+		intel_iommu_gfx_mapped = 1;
+
+	init_no_remapping_devices();
+
+	ret = init_dmars();
+	if (ret) {
+		if (force_on)
+			panic("tboot: Failed to initialize DMARs\n");
+		pr_err("Initialization failed\n");
+		goto out_free_reserved_range;
+	}
+	up_write(&dmar_global_lock);
+	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
+
+#if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
+	swiotlb = 0;
+#endif
+	dma_ops = &intel_dma_ops;
+
+	init_iommu_pm_ops();
+
+	for_each_active_iommu(iommu, drhd) {
+		iommu_device_sysfs_add(&iommu->iommu, NULL,
+				       intel_iommu_groups,
+				       "%s", iommu->name);
+		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
+		iommu_device_register(&iommu->iommu);
+	}
+
+	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
+	bus_register_notifier(&pci_bus_type, &device_nb);
+	if (si_domain && !hw_pass_through)
+		register_memory_notifier(&intel_iommu_memory_nb);
+	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
+			  intel_iommu_cpu_dead);
+	intel_iommu_enabled = 1;
+
+	return 0;
+
+out_free_reserved_range:
+	put_iova_domain(&reserved_iova_list);
+out_free_dmar:
+	intel_iommu_free_dmars();
+	up_write(&dmar_global_lock);
+	iommu_exit_mempool();
+	return ret;
+}
+
+static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
+{
+	struct intel_iommu *iommu = opaque;
+
+	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
+	return 0;
+}
+
+/*
+ * NB - intel-iommu lacks any sort of reference counting for the users of
+ * dependent devices.  If multiple endpoints have intersecting dependent
+ * devices, unbinding the driver from any one of them will possibly leave
+ * the others unable to operate.
+ */
+static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
+{
+	if (!iommu || !dev || !dev_is_pci(dev))
+		return;
+
+	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
+}
+
+static void __dmar_remove_one_dev_info(struct device_domain_info *info)
+{
+	struct intel_iommu *iommu;
+	unsigned long flags;
+
+	assert_spin_locked(&device_domain_lock);
+
+	if (WARN_ON(!info))
+		return;
+
+	iommu = info->iommu;
+
+	if (info->dev) {
+		iommu_disable_dev_iotlb(info);
+		domain_context_clear(iommu, info->dev);
+		intel_pasid_free_table(info->dev);
+	}
+
+	unlink_domain_info(info);
+
+	spin_lock_irqsave(&iommu->lock, flags);
+	domain_detach_iommu(info->domain, iommu);
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	free_devinfo_mem(info);
+}
+
+static void dmar_remove_one_dev_info(struct dmar_domain *domain,
+				     struct device *dev)
+{
+	struct device_domain_info *info;
+	unsigned long flags;
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	info = dev->archdata.iommu;
+	__dmar_remove_one_dev_info(info);
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
+static int md_domain_init(struct dmar_domain *domain, int guest_width)
+{
+	int adjust_width;
+
+	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
+	domain_reserve_special_ranges(domain);
+
+	/* calculate AGAW */
+	domain->gaw = guest_width;
+	adjust_width = guestwidth_to_adjustwidth(guest_width);
+	domain->agaw = width_to_agaw(adjust_width);
+
+	domain->iommu_coherency = 0;
+	domain->iommu_snooping = 0;
+	domain->iommu_superpage = 0;
+	domain->max_addr = 0;
+
+	/* always allocate the top pgd */
+	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
+	if (!domain->pgd)
+		return -ENOMEM;
+	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
+	return 0;
+}
+
+static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
+{
+	struct dmar_domain *dmar_domain;
+	struct iommu_domain *domain;
+
+	if (type != IOMMU_DOMAIN_UNMANAGED)
+		return NULL;
+
+	dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
+	if (!dmar_domain) {
+		pr_err("Can't allocate dmar_domain\n");
+		return NULL;
+	}
+	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
+		pr_err("Domain initialization failed\n");
+		domain_exit(dmar_domain);
+		return NULL;
+	}
+	domain_update_iommu_cap(dmar_domain);
+
+	domain = &dmar_domain->domain;
+	domain->geometry.aperture_start = 0;
+	domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
+	domain->geometry.force_aperture = true;
+
+	return domain;
+}
+
+static void intel_iommu_domain_free(struct iommu_domain *domain)
+{
+	domain_exit(to_dmar_domain(domain));
+}
+
+static int intel_iommu_attach_device(struct iommu_domain *domain,
+				     struct device *dev)
+{
+	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+	struct intel_iommu *iommu;
+	int addr_width;
+	u8 bus, devfn;
+
+	if (device_is_rmrr_locked(dev)) {
+		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
+		return -EPERM;
+	}
+
+	/* normally dev is not mapped */
+	if (unlikely(domain_context_mapped(dev))) {
+		struct dmar_domain *old_domain;
+
+		old_domain = find_domain(dev);
+		if (old_domain) {
+			rcu_read_lock();
+			dmar_remove_one_dev_info(old_domain, dev);
+			rcu_read_unlock();
+
+			if (!domain_type_is_vm_or_si(old_domain) &&
+			     list_empty(&old_domain->devices))
+				domain_exit(old_domain);
+		}
+	}
+
+	iommu = device_to_iommu(dev, &bus, &devfn);
+	if (!iommu)
+		return -ENODEV;
+
+	/* check if this iommu agaw is sufficient for max mapped address */
+	addr_width = agaw_to_width(iommu->agaw);
+	if (addr_width > cap_mgaw(iommu->cap))
+		addr_width = cap_mgaw(iommu->cap);
+
+	if (dmar_domain->max_addr > (1LL << addr_width)) {
+		pr_err("%s: iommu width (%d) is not "
+		       "sufficient for the mapped address (%llx)\n",
+		       __func__, addr_width, dmar_domain->max_addr);
+		return -EFAULT;
+	}
+	dmar_domain->gaw = addr_width;
+
+	/*
+	 * Knock out extra levels of page tables if necessary
+	 */
+	while (iommu->agaw < dmar_domain->agaw) {
+		struct dma_pte *pte;
+
+		pte = dmar_domain->pgd;
+		if (dma_pte_present(pte)) {
+			dmar_domain->pgd = (struct dma_pte *)
+				phys_to_virt(dma_pte_addr(pte));
+			free_pgtable_page(pte);
+		}
+		dmar_domain->agaw--;
+	}
+
+	return domain_add_dev_info(dmar_domain, dev);
+}
+
+static void intel_iommu_detach_device(struct iommu_domain *domain,
+				      struct device *dev)
+{
+	dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
+}
+
+static int intel_iommu_map(struct iommu_domain *domain,
+			   unsigned long iova, phys_addr_t hpa,
+			   size_t size, int iommu_prot)
+{
+	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+	u64 max_addr;
+	int prot = 0;
+	int ret;
+
+	if (iommu_prot & IOMMU_READ)
+		prot |= DMA_PTE_READ;
+	if (iommu_prot & IOMMU_WRITE)
+		prot |= DMA_PTE_WRITE;
+	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
+		prot |= DMA_PTE_SNP;
+
+	max_addr = iova + size;
+	if (dmar_domain->max_addr < max_addr) {
+		u64 end;
+
+		/* check if minimum agaw is sufficient for mapped address */
+		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
+		if (end < max_addr) {
+			pr_err("%s: iommu width (%d) is not "
+			       "sufficient for the mapped address (%llx)\n",
+			       __func__, dmar_domain->gaw, max_addr);
+			return -EFAULT;
+		}
+		dmar_domain->max_addr = max_addr;
+	}
+	/* Round up size to next multiple of PAGE_SIZE, if it and
+	   the low bits of hpa would take us onto the next page */
+	size = aligned_nrpages(hpa, size);
+	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
+				 hpa >> VTD_PAGE_SHIFT, size, prot);
+	return ret;
+}
+
+static size_t intel_iommu_unmap(struct iommu_domain *domain,
+				unsigned long iova, size_t size)
+{
+	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+	struct page *freelist = NULL;
+	unsigned long start_pfn, last_pfn;
+	unsigned int npages;
+	int iommu_id, level = 0;
+
+	/* Cope with horrid API which requires us to unmap more than the
+	   size argument if it happens to be a large-page mapping. */
+	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
+
+	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
+		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
+
+	start_pfn = iova >> VTD_PAGE_SHIFT;
+	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
+
+	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
+
+	npages = last_pfn - start_pfn + 1;
+
+	for_each_domain_iommu(iommu_id, dmar_domain)
+		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
+				      start_pfn, npages, !freelist, 0);
+
+	dma_free_pagelist(freelist);
+
+	if (dmar_domain->max_addr == iova + size)
+		dmar_domain->max_addr = iova;
+
+	return size;
+}
+
+static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
+					    dma_addr_t iova)
+{
+	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+	struct dma_pte *pte;
+	int level = 0;
+	u64 phys = 0;
+
+	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
+	if (pte && dma_pte_present(pte))
+		phys = dma_pte_addr(pte) +
+			(iova & (BIT_MASK(level_to_offset_bits(level) +
+						VTD_PAGE_SHIFT) - 1));
+
+	return phys;
+}
+
+static bool intel_iommu_capable(enum iommu_cap cap)
+{
+	if (cap == IOMMU_CAP_CACHE_COHERENCY)
+		return domain_update_iommu_snooping(NULL) == 1;
+	if (cap == IOMMU_CAP_INTR_REMAP)
+		return irq_remapping_enabled == 1;
+
+	return false;
+}
+
+static int intel_iommu_add_device(struct device *dev)
+{
+	struct intel_iommu *iommu;
+	struct iommu_group *group;
+	u8 bus, devfn;
+
+	iommu = device_to_iommu(dev, &bus, &devfn);
+	if (!iommu)
+		return -ENODEV;
+
+	iommu_device_link(&iommu->iommu, dev);
+
+	group = iommu_group_get_for_dev(dev);
+
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	iommu_group_put(group);
+	return 0;
+}
+
+static void intel_iommu_remove_device(struct device *dev)
+{
+	struct intel_iommu *iommu;
+	u8 bus, devfn;
+
+	iommu = device_to_iommu(dev, &bus, &devfn);
+	if (!iommu)
+		return;
+
+	iommu_group_remove_device(dev);
+
+	iommu_device_unlink(&iommu->iommu, dev);
+}
+
+static void intel_iommu_get_resv_regions(struct device *device,
+					 struct list_head *head)
+{
+	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
+	struct iommu_resv_region *reg;
+	struct dmar_rmrr_unit *rmrr;
+	struct device *i_dev;
+	int i;
+
+	down_read(&dmar_global_lock);
+	for_each_rmrr_units(rmrr) {
+		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
+					  i, i_dev) {
+			struct iommu_resv_region *resv;
+			size_t length;
+
+			if (i_dev != device)
+				continue;
+
+			length = rmrr->end_address - rmrr->base_address + 1;
+			resv = iommu_alloc_resv_region(rmrr->base_address,
+						       length, prot,
+						       IOMMU_RESV_DIRECT);
+			if (!resv)
+				break;
+
+			list_add_tail(&resv->list, head);
+		}
+	}
+	up_read(&dmar_global_lock);
+
+	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
+				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
+				      0, IOMMU_RESV_MSI);
+	if (!reg)
+		return;
+	list_add_tail(&reg->list, head);
+}
+
+static void intel_iommu_put_resv_regions(struct device *dev,
+					 struct list_head *head)
+{
+	struct iommu_resv_region *entry, *next;
+
+	list_for_each_entry_safe(entry, next, head, list)
+		kfree(entry);
+}
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+#define MAX_NR_PASID_BITS (20)
+static inline unsigned long intel_iommu_get_pts(struct device *dev)
+{
+	int pts, max_pasid;
+
+	max_pasid = intel_pasid_get_dev_max_id(dev);
+	pts = find_first_bit((unsigned long *)&max_pasid, MAX_NR_PASID_BITS);
+	if (pts < 5)
+		return 0;
+
+	return pts - 5;
+}
+
+int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
+{
+	struct device_domain_info *info;
+	struct context_entry *context;
+	struct dmar_domain *domain;
+	unsigned long flags;
+	u64 ctx_lo;
+	int ret;
+
+	domain = get_valid_domain_for_dev(sdev->dev);
+	if (!domain)
+		return -EINVAL;
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	spin_lock(&iommu->lock);
+
+	ret = -EINVAL;
+	info = sdev->dev->archdata.iommu;
+	if (!info || !info->pasid_supported)
+		goto out;
+
+	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
+	if (WARN_ON(!context))
+		goto out;
+
+	ctx_lo = context[0].lo;
+
+	sdev->did = domain->iommu_did[iommu->seq_id];
+	sdev->sid = PCI_DEVID(info->bus, info->devfn);
+
+	if (!(ctx_lo & CONTEXT_PASIDE)) {
+		if (iommu->pasid_state_table)
+			context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
+		context[1].lo = (u64)virt_to_phys(info->pasid_table->table) |
+			intel_iommu_get_pts(sdev->dev);
+
+		wmb();
+		/* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
+		 * extended to permit requests-with-PASID if the PASIDE bit
+		 * is set. which makes sense. For CONTEXT_TT_PASS_THROUGH,
+		 * however, the PASIDE bit is ignored and requests-with-PASID
+		 * are unconditionally blocked. Which makes less sense.
+		 * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
+		 * "guest mode" translation types depending on whether ATS
+		 * is available or not. Annoyingly, we can't use the new
+		 * modes *unless* PASIDE is set. */
+		if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
+			ctx_lo &= ~CONTEXT_TT_MASK;
+			if (info->ats_supported)
+				ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
+			else
+				ctx_lo |= CONTEXT_TT_PT_PASID << 2;
+		}
+		ctx_lo |= CONTEXT_PASIDE;
+		if (iommu->pasid_state_table)
+			ctx_lo |= CONTEXT_DINVE;
+		if (info->pri_supported)
+			ctx_lo |= CONTEXT_PRS;
+		context[0].lo = ctx_lo;
+		wmb();
+		iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
+					   DMA_CCMD_MASK_NOBIT,
+					   DMA_CCMD_DEVICE_INVL);
+	}
+
+	/* Enable PASID support in the device, if it wasn't already */
+	if (!info->pasid_enabled)
+		iommu_enable_dev_iotlb(info);
+
+	if (info->ats_enabled) {
+		sdev->dev_iotlb = 1;
+		sdev->qdep = info->ats_qdep;
+		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
+			sdev->qdep = 0;
+	}
+	ret = 0;
+
+ out:
+	spin_unlock(&iommu->lock);
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+
+	return ret;
+}
+
+struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
+{
+	struct intel_iommu *iommu;
+	u8 bus, devfn;
+
+	if (iommu_dummy(dev)) {
+		dev_warn(dev,
+			 "No IOMMU translation for device; cannot enable SVM\n");
+		return NULL;
+	}
+
+	iommu = device_to_iommu(dev, &bus, &devfn);
+	if ((!iommu)) {
+		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
+		return NULL;
+	}
+
+	return iommu;
+}
+#endif /* CONFIG_INTEL_IOMMU_SVM */
+
+const struct iommu_ops intel_iommu_ops = {
+	.capable		= intel_iommu_capable,
+	.domain_alloc		= intel_iommu_domain_alloc,
+	.domain_free		= intel_iommu_domain_free,
+	.attach_dev		= intel_iommu_attach_device,
+	.detach_dev		= intel_iommu_detach_device,
+	.map			= intel_iommu_map,
+	.unmap			= intel_iommu_unmap,
+	.iova_to_phys		= intel_iommu_iova_to_phys,
+	.add_device		= intel_iommu_add_device,
+	.remove_device		= intel_iommu_remove_device,
+	.get_resv_regions	= intel_iommu_get_resv_regions,
+	.put_resv_regions	= intel_iommu_put_resv_regions,
+	.device_group		= pci_device_group,
+	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
+};
+
+static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
+{
+	/* G4x/GM45 integrated gfx dmar support is totally busted. */
+	pr_info("Disabling IOMMU for graphics on this chipset\n");
+	dmar_map_gfx = 0;
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
+
+static void quirk_iommu_rwbf(struct pci_dev *dev)
+{
+	/*
+	 * Mobile 4 Series Chipset neglects to set RWBF capability,
+	 * but needs it. Same seems to hold for the desktop versions.
+	 */
+	pr_info("Forcing write-buffer flush capability\n");
+	rwbf_quirk = 1;
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
+
+#define GGC 0x52
+#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
+#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
+#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
+#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
+#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
+#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
+#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
+#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
+
+static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
+{
+	unsigned short ggc;
+
+	if (pci_read_config_word(dev, GGC, &ggc))
+		return;
+
+	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
+		pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
+		dmar_map_gfx = 0;
+	} else if (dmar_map_gfx) {
+		/* we have to ensure the gfx device is idle before we flush */
+		pr_info("Disabling batched IOTLB flush on Ironlake\n");
+		intel_iommu_strict = 1;
+       }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
+
+/* On Tylersburg chipsets, some BIOSes have been known to enable the
+   ISOCH DMAR unit for the Azalia sound device, but not give it any
+   TLB entries, which causes it to deadlock. Check for that.  We do
+   this in a function called from init_dmars(), instead of in a PCI
+   quirk, because we don't want to print the obnoxious "BIOS broken"
+   message if VT-d is actually disabled.
+*/
+static void __init check_tylersburg_isoch(void)
+{
+	struct pci_dev *pdev;
+	uint32_t vtisochctrl;
+
+	/* If there's no Azalia in the system anyway, forget it. */
+	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
+	if (!pdev)
+		return;
+	pci_dev_put(pdev);
+
+	/* System Management Registers. Might be hidden, in which case
+	   we can't do the sanity check. But that's OK, because the
+	   known-broken BIOSes _don't_ actually hide it, so far. */
+	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
+	if (!pdev)
+		return;
+
+	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
+		pci_dev_put(pdev);
+		return;
+	}
+
+	pci_dev_put(pdev);
+
+	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
+	if (vtisochctrl & 1)
+		return;
+
+	/* Drop all bits other than the number of TLB entries */
+	vtisochctrl &= 0x1c;
+
+	/* If we have the recommended number of TLB entries (16), fine. */
+	if (vtisochctrl == 0x10)
+		return;
+
+	/* Zero TLB entries? You get to ride the short bus to school. */
+	if (!vtisochctrl) {
+		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
+		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+		     dmi_get_system_info(DMI_BIOS_VENDOR),
+		     dmi_get_system_info(DMI_BIOS_VERSION),
+		     dmi_get_system_info(DMI_PRODUCT_VERSION));
+		iommu_identity_mapping |= IDENTMAP_AZALIA;
+		return;
+	}
+
+	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
+	       vtisochctrl);
+}
diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c
new file mode 100644
index 000000000..fe95c9bd4
--- /dev/null
+++ b/drivers/iommu/intel-pasid.c
@@ -0,0 +1,239 @@
+// SPDX-License-Identifier: GPL-2.0
+/**
+ * intel-pasid.c - PASID idr, table and entry manipulation
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ */
+
+#define pr_fmt(fmt)	"DMAR: " fmt
+
+#include <linux/dmar.h>
+#include <linux/intel-iommu.h>
+#include <linux/iommu.h>
+#include <linux/memory.h>
+#include <linux/pci.h>
+#include <linux/pci-ats.h>
+#include <linux/spinlock.h>
+
+#include "intel-pasid.h"
+
+/*
+ * Intel IOMMU system wide PASID name space:
+ */
+static DEFINE_SPINLOCK(pasid_lock);
+u32 intel_pasid_max_id = PASID_MAX;
+static DEFINE_IDR(pasid_idr);
+
+int intel_pasid_alloc_id(void *ptr, int start, int end, gfp_t gfp)
+{
+	int ret, min, max;
+
+	min = max_t(int, start, PASID_MIN);
+	max = min_t(int, end, intel_pasid_max_id);
+
+	WARN_ON(in_interrupt());
+	idr_preload(gfp);
+	spin_lock(&pasid_lock);
+	ret = idr_alloc(&pasid_idr, ptr, min, max, GFP_ATOMIC);
+	spin_unlock(&pasid_lock);
+	idr_preload_end();
+
+	return ret;
+}
+
+void intel_pasid_free_id(int pasid)
+{
+	spin_lock(&pasid_lock);
+	idr_remove(&pasid_idr, pasid);
+	spin_unlock(&pasid_lock);
+}
+
+void *intel_pasid_lookup_id(int pasid)
+{
+	void *p;
+
+	spin_lock(&pasid_lock);
+	p = idr_find(&pasid_idr, pasid);
+	spin_unlock(&pasid_lock);
+
+	return p;
+}
+
+/*
+ * Per device pasid table management:
+ */
+static inline void
+device_attach_pasid_table(struct device_domain_info *info,
+			  struct pasid_table *pasid_table)
+{
+	info->pasid_table = pasid_table;
+	list_add(&info->table, &pasid_table->dev);
+}
+
+static inline void
+device_detach_pasid_table(struct device_domain_info *info,
+			  struct pasid_table *pasid_table)
+{
+	info->pasid_table = NULL;
+	list_del(&info->table);
+}
+
+struct pasid_table_opaque {
+	struct pasid_table	**pasid_table;
+	int			segment;
+	int			bus;
+	int			devfn;
+};
+
+static int search_pasid_table(struct device_domain_info *info, void *opaque)
+{
+	struct pasid_table_opaque *data = opaque;
+
+	if (info->iommu->segment == data->segment &&
+	    info->bus == data->bus &&
+	    info->devfn == data->devfn &&
+	    info->pasid_table) {
+		*data->pasid_table = info->pasid_table;
+		return 1;
+	}
+
+	return 0;
+}
+
+static int get_alias_pasid_table(struct pci_dev *pdev, u16 alias, void *opaque)
+{
+	struct pasid_table_opaque *data = opaque;
+
+	data->segment = pci_domain_nr(pdev->bus);
+	data->bus = PCI_BUS_NUM(alias);
+	data->devfn = alias & 0xff;
+
+	return for_each_device_domain(&search_pasid_table, data);
+}
+
+/*
+ * Allocate a pasid table for @dev. It should be called in a
+ * single-thread context.
+ */
+int intel_pasid_alloc_table(struct device *dev)
+{
+	struct device_domain_info *info;
+	struct pasid_table *pasid_table;
+	struct pasid_table_opaque data;
+	struct page *pages;
+	size_t size, count;
+	int ret, order;
+
+	info = dev->archdata.iommu;
+	if (WARN_ON(!info || !dev_is_pci(dev) ||
+		    !info->pasid_supported || info->pasid_table))
+		return -EINVAL;
+
+	/* DMA alias device already has a pasid table, use it: */
+	data.pasid_table = &pasid_table;
+	ret = pci_for_each_dma_alias(to_pci_dev(dev),
+				     &get_alias_pasid_table, &data);
+	if (ret)
+		goto attach_out;
+
+	pasid_table = kzalloc(sizeof(*pasid_table), GFP_ATOMIC);
+	if (!pasid_table)
+		return -ENOMEM;
+	INIT_LIST_HEAD(&pasid_table->dev);
+
+	size = sizeof(struct pasid_entry);
+	count = min_t(int, pci_max_pasids(to_pci_dev(dev)), intel_pasid_max_id);
+	order = get_order(size * count);
+	pages = alloc_pages_node(info->iommu->node,
+				 GFP_ATOMIC | __GFP_ZERO,
+				 order);
+	if (!pages)
+		return -ENOMEM;
+
+	pasid_table->table = page_address(pages);
+	pasid_table->order = order;
+	pasid_table->max_pasid = count;
+
+attach_out:
+	device_attach_pasid_table(info, pasid_table);
+
+	return 0;
+}
+
+void intel_pasid_free_table(struct device *dev)
+{
+	struct device_domain_info *info;
+	struct pasid_table *pasid_table;
+
+	info = dev->archdata.iommu;
+	if (!info || !dev_is_pci(dev) ||
+	    !info->pasid_supported || !info->pasid_table)
+		return;
+
+	pasid_table = info->pasid_table;
+	device_detach_pasid_table(info, pasid_table);
+
+	if (!list_empty(&pasid_table->dev))
+		return;
+
+	free_pages((unsigned long)pasid_table->table, pasid_table->order);
+	kfree(pasid_table);
+}
+
+struct pasid_table *intel_pasid_get_table(struct device *dev)
+{
+	struct device_domain_info *info;
+
+	info = dev->archdata.iommu;
+	if (!info)
+		return NULL;
+
+	return info->pasid_table;
+}
+
+int intel_pasid_get_dev_max_id(struct device *dev)
+{
+	struct device_domain_info *info;
+
+	info = dev->archdata.iommu;
+	if (!info || !info->pasid_table)
+		return 0;
+
+	return info->pasid_table->max_pasid;
+}
+
+struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid)
+{
+	struct pasid_table *pasid_table;
+	struct pasid_entry *entries;
+
+	pasid_table = intel_pasid_get_table(dev);
+	if (WARN_ON(!pasid_table || pasid < 0 ||
+		    pasid >= intel_pasid_get_dev_max_id(dev)))
+		return NULL;
+
+	entries = pasid_table->table;
+
+	return &entries[pasid];
+}
+
+/*
+ * Interfaces for PASID table entry manipulation:
+ */
+static inline void pasid_clear_entry(struct pasid_entry *pe)
+{
+	WRITE_ONCE(pe->val, 0);
+}
+
+void intel_pasid_clear_entry(struct device *dev, int pasid)
+{
+	struct pasid_entry *pe;
+
+	pe = intel_pasid_get_entry(dev, pasid);
+	if (WARN_ON(!pe))
+		return;
+
+	pasid_clear_entry(pe);
+}
diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h
new file mode 100644
index 000000000..1fb5e12b0
--- /dev/null
+++ b/drivers/iommu/intel-pasid.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * intel-pasid.h - PASID idr, table and entry header
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ */
+
+#ifndef __INTEL_PASID_H
+#define __INTEL_PASID_H
+
+#define PASID_MIN			0x1
+#define PASID_MAX			0x20000
+
+struct pasid_entry {
+	u64 val;
+};
+
+/* The representative of a PASID table */
+struct pasid_table {
+	void			*table;		/* pasid table pointer */
+	int			order;		/* page order of pasid table */
+	int			max_pasid;	/* max pasid */
+	struct list_head	dev;		/* device list */
+};
+
+extern u32 intel_pasid_max_id;
+int intel_pasid_alloc_id(void *ptr, int start, int end, gfp_t gfp);
+void intel_pasid_free_id(int pasid);
+void *intel_pasid_lookup_id(int pasid);
+int intel_pasid_alloc_table(struct device *dev);
+void intel_pasid_free_table(struct device *dev);
+struct pasid_table *intel_pasid_get_table(struct device *dev);
+int intel_pasid_get_dev_max_id(struct device *dev);
+struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid);
+void intel_pasid_clear_entry(struct device *dev, int pasid);
+
+#endif /* __INTEL_PASID_H */
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
new file mode 100644
index 000000000..ef3aadec9
--- /dev/null
+++ b/drivers/iommu/intel-svm.c
@@ -0,0 +1,701 @@
+/*
+ * Copyright © 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/intel-iommu.h>
+#include <linux/mmu_notifier.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/slab.h>
+#include <linux/intel-svm.h>
+#include <linux/rculist.h>
+#include <linux/pci.h>
+#include <linux/pci-ats.h>
+#include <linux/dmar.h>
+#include <linux/interrupt.h>
+#include <linux/mm_types.h>
+#include <asm/page.h>
+
+#include "intel-pasid.h"
+
+#define PASID_ENTRY_P		BIT_ULL(0)
+#define PASID_ENTRY_FLPM_5LP	BIT_ULL(9)
+#define PASID_ENTRY_SRE		BIT_ULL(11)
+
+static irqreturn_t prq_event_thread(int irq, void *d);
+
+struct pasid_state_entry {
+	u64 val;
+};
+
+int intel_svm_init(struct intel_iommu *iommu)
+{
+	struct page *pages;
+	int order;
+
+	if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
+			!cap_fl1gp_support(iommu->cap))
+		return -EINVAL;
+
+	if (cpu_feature_enabled(X86_FEATURE_LA57) &&
+			!cap_5lp_support(iommu->cap))
+		return -EINVAL;
+
+	/* Start at 2 because it's defined as 2^(1+PSS) */
+	iommu->pasid_max = 2 << ecap_pss(iommu->ecap);
+
+	/* Eventually I'm promised we will get a multi-level PASID table
+	 * and it won't have to be physically contiguous. Until then,
+	 * limit the size because 8MiB contiguous allocations can be hard
+	 * to come by. The limit of 0x20000, which is 1MiB for each of
+	 * the PASID and PASID-state tables, is somewhat arbitrary. */
+	if (iommu->pasid_max > 0x20000)
+		iommu->pasid_max = 0x20000;
+
+	order = get_order(sizeof(struct pasid_entry) * iommu->pasid_max);
+	if (ecap_dis(iommu->ecap)) {
+		/* Just making it explicit... */
+		BUILD_BUG_ON(sizeof(struct pasid_entry) != sizeof(struct pasid_state_entry));
+		pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
+		if (pages)
+			iommu->pasid_state_table = page_address(pages);
+		else
+			pr_warn("IOMMU: %s: Failed to allocate PASID state table\n",
+				iommu->name);
+	}
+
+	return 0;
+}
+
+int intel_svm_exit(struct intel_iommu *iommu)
+{
+	int order = get_order(sizeof(struct pasid_entry) * iommu->pasid_max);
+
+	if (iommu->pasid_state_table) {
+		free_pages((unsigned long)iommu->pasid_state_table, order);
+		iommu->pasid_state_table = NULL;
+	}
+
+	return 0;
+}
+
+#define PRQ_ORDER 0
+
+int intel_svm_enable_prq(struct intel_iommu *iommu)
+{
+	struct page *pages;
+	int irq, ret;
+
+	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
+	if (!pages) {
+		pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
+			iommu->name);
+		return -ENOMEM;
+	}
+	iommu->prq = page_address(pages);
+
+	irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
+	if (irq <= 0) {
+		pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
+		       iommu->name);
+		ret = -EINVAL;
+	err:
+		free_pages((unsigned long)iommu->prq, PRQ_ORDER);
+		iommu->prq = NULL;
+		return ret;
+	}
+	iommu->pr_irq = irq;
+
+	snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);
+
+	ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
+				   iommu->prq_name, iommu);
+	if (ret) {
+		pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
+		       iommu->name);
+		dmar_free_hwirq(irq);
+		iommu->pr_irq = 0;
+		goto err;
+	}
+	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
+	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
+	dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);
+
+	return 0;
+}
+
+int intel_svm_finish_prq(struct intel_iommu *iommu)
+{
+	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
+	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
+	dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);
+
+	if (iommu->pr_irq) {
+		free_irq(iommu->pr_irq, iommu);
+		dmar_free_hwirq(iommu->pr_irq);
+		iommu->pr_irq = 0;
+	}
+
+	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
+	iommu->prq = NULL;
+
+	return 0;
+}
+
+static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_dev *sdev,
+				       unsigned long address, unsigned long pages, int ih, int gl)
+{
+	struct qi_desc desc;
+
+	if (pages == -1) {
+		/* For global kernel pages we have to flush them in *all* PASIDs
+		 * because that's the only option the hardware gives us. Despite
+		 * the fact that they are actually only accessible through one. */
+		if (gl)
+			desc.low = QI_EIOTLB_PASID(svm->pasid) | QI_EIOTLB_DID(sdev->did) |
+				QI_EIOTLB_GRAN(QI_GRAN_ALL_ALL) | QI_EIOTLB_TYPE;
+		else
+			desc.low = QI_EIOTLB_PASID(svm->pasid) | QI_EIOTLB_DID(sdev->did) |
+				QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | QI_EIOTLB_TYPE;
+		desc.high = 0;
+	} else {
+		int mask = ilog2(__roundup_pow_of_two(pages));
+
+		desc.low = QI_EIOTLB_PASID(svm->pasid) | QI_EIOTLB_DID(sdev->did) |
+			QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | QI_EIOTLB_TYPE;
+		desc.high = QI_EIOTLB_ADDR(address) | QI_EIOTLB_GL(gl) |
+			QI_EIOTLB_IH(ih) | QI_EIOTLB_AM(mask);
+	}
+	qi_submit_sync(&desc, svm->iommu);
+
+	if (sdev->dev_iotlb) {
+		desc.low = QI_DEV_EIOTLB_PASID(svm->pasid) | QI_DEV_EIOTLB_SID(sdev->sid) |
+			QI_DEV_EIOTLB_QDEP(sdev->qdep) | QI_DEIOTLB_TYPE;
+		if (pages == -1) {
+			desc.high = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) | QI_DEV_EIOTLB_SIZE;
+		} else if (pages > 1) {
+			/* The least significant zero bit indicates the size. So,
+			 * for example, an "address" value of 0x12345f000 will
+			 * flush from 0x123440000 to 0x12347ffff (256KiB). */
+			unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT);
+			unsigned long mask = __rounddown_pow_of_two(address ^ last);
+
+			desc.high = QI_DEV_EIOTLB_ADDR((address & ~mask) | (mask - 1)) | QI_DEV_EIOTLB_SIZE;
+		} else {
+			desc.high = QI_DEV_EIOTLB_ADDR(address);
+		}
+		qi_submit_sync(&desc, svm->iommu);
+	}
+}
+
+static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
+				  unsigned long pages, int ih, int gl)
+{
+	struct intel_svm_dev *sdev;
+
+	/* Try deferred invalidate if available */
+	if (svm->iommu->pasid_state_table &&
+	    !cmpxchg64(&svm->iommu->pasid_state_table[svm->pasid].val, 0, 1ULL << 63))
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(sdev, &svm->devs, list)
+		intel_flush_svm_range_dev(svm, sdev, address, pages, ih, gl);
+	rcu_read_unlock();
+}
+
+static void intel_change_pte(struct mmu_notifier *mn, struct mm_struct *mm,
+			     unsigned long address, pte_t pte)
+{
+	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
+
+	intel_flush_svm_range(svm, address, 1, 1, 0);
+}
+
+/* Pages have been freed at this point */
+static void intel_invalidate_range(struct mmu_notifier *mn,
+				   struct mm_struct *mm,
+				   unsigned long start, unsigned long end)
+{
+	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
+
+	intel_flush_svm_range(svm, start,
+			      (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0, 0);
+}
+
+
+static void intel_flush_pasid_dev(struct intel_svm *svm, struct intel_svm_dev *sdev, int pasid)
+{
+	struct qi_desc desc;
+
+	desc.high = 0;
+	desc.low = QI_PC_TYPE | QI_PC_DID(sdev->did) | QI_PC_PASID_SEL | QI_PC_PASID(pasid);
+
+	qi_submit_sync(&desc, svm->iommu);
+}
+
+static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
+	struct intel_svm_dev *sdev;
+
+	/* This might end up being called from exit_mmap(), *before* the page
+	 * tables are cleared. And __mmu_notifier_release() will delete us from
+	 * the list of notifiers so that our invalidate_range() callback doesn't
+	 * get called when the page tables are cleared. So we need to protect
+	 * against hardware accessing those page tables.
+	 *
+	 * We do it by clearing the entry in the PASID table and then flushing
+	 * the IOTLB and the PASID table caches. This might upset hardware;
+	 * perhaps we'll want to point the PASID to a dummy PGD (like the zero
+	 * page) so that we end up taking a fault that the hardware really
+	 * *has* to handle gracefully without affecting other processes.
+	 */
+	rcu_read_lock();
+	list_for_each_entry_rcu(sdev, &svm->devs, list) {
+		intel_pasid_clear_entry(sdev->dev, svm->pasid);
+		intel_flush_pasid_dev(svm, sdev, svm->pasid);
+		intel_flush_svm_range_dev(svm, sdev, 0, -1, 0, !svm->mm);
+	}
+	rcu_read_unlock();
+
+}
+
+static const struct mmu_notifier_ops intel_mmuops = {
+	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
+	.release = intel_mm_release,
+	.change_pte = intel_change_pte,
+	.invalidate_range = intel_invalidate_range,
+};
+
+static DEFINE_MUTEX(pasid_mutex);
+static LIST_HEAD(global_svm_list);
+
+int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops)
+{
+	struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
+	struct pasid_entry *entry;
+	struct intel_svm_dev *sdev;
+	struct intel_svm *svm = NULL;
+	struct mm_struct *mm = NULL;
+	u64 pasid_entry_val;
+	int pasid_max;
+	int ret;
+
+	if (!iommu || dmar_disabled)
+		return -EINVAL;
+
+	if (dev_is_pci(dev)) {
+		pasid_max = pci_max_pasids(to_pci_dev(dev));
+		if (pasid_max < 0)
+			return -EINVAL;
+	} else
+		pasid_max = 1 << 20;
+
+	if (flags & SVM_FLAG_SUPERVISOR_MODE) {
+		if (!ecap_srs(iommu->ecap))
+			return -EINVAL;
+	} else if (pasid) {
+		mm = get_task_mm(current);
+		BUG_ON(!mm);
+	}
+
+	mutex_lock(&pasid_mutex);
+	if (pasid && !(flags & SVM_FLAG_PRIVATE_PASID)) {
+		struct intel_svm *t;
+
+		list_for_each_entry(t, &global_svm_list, list) {
+			if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
+				continue;
+
+			svm = t;
+			if (svm->pasid >= pasid_max) {
+				dev_warn(dev,
+					 "Limited PASID width. Cannot use existing PASID %d\n",
+					 svm->pasid);
+				ret = -ENOSPC;
+				goto out;
+			}
+
+			list_for_each_entry(sdev, &svm->devs, list) {
+				if (dev == sdev->dev) {
+					if (sdev->ops != ops) {
+						ret = -EBUSY;
+						goto out;
+					}
+					sdev->users++;
+					goto success;
+				}
+			}
+
+			break;
+		}
+	}
+
+	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
+	if (!sdev) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	sdev->dev = dev;
+
+	ret = intel_iommu_enable_pasid(iommu, sdev);
+	if (ret || !pasid) {
+		/* If they don't actually want to assign a PASID, this is
+		 * just an enabling check/preparation. */
+		kfree(sdev);
+		goto out;
+	}
+	/* Finish the setup now we know we're keeping it */
+	sdev->users = 1;
+	sdev->ops = ops;
+	init_rcu_head(&sdev->rcu);
+
+	if (!svm) {
+		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
+		if (!svm) {
+			ret = -ENOMEM;
+			kfree(sdev);
+			goto out;
+		}
+		svm->iommu = iommu;
+
+		if (pasid_max > intel_pasid_max_id)
+			pasid_max = intel_pasid_max_id;
+
+		/* Do not use PASID 0 in caching mode (virtualised IOMMU) */
+		ret = intel_pasid_alloc_id(svm,
+					   !!cap_caching_mode(iommu->cap),
+					   pasid_max, GFP_KERNEL);
+		if (ret < 0) {
+			kfree(svm);
+			kfree(sdev);
+			goto out;
+		}
+		svm->pasid = ret;
+		svm->notifier.ops = &intel_mmuops;
+		svm->mm = mm;
+		svm->flags = flags;
+		INIT_LIST_HEAD_RCU(&svm->devs);
+		INIT_LIST_HEAD(&svm->list);
+		ret = -ENOMEM;
+		if (mm) {
+			ret = mmu_notifier_register(&svm->notifier, mm);
+			if (ret) {
+				intel_pasid_free_id(svm->pasid);
+				kfree(svm);
+				kfree(sdev);
+				goto out;
+			}
+			pasid_entry_val = (u64)__pa(mm->pgd) | PASID_ENTRY_P;
+		} else
+			pasid_entry_val = (u64)__pa(init_mm.pgd) |
+					  PASID_ENTRY_P | PASID_ENTRY_SRE;
+		if (cpu_feature_enabled(X86_FEATURE_LA57))
+			pasid_entry_val |= PASID_ENTRY_FLPM_5LP;
+
+		entry = intel_pasid_get_entry(dev, svm->pasid);
+		entry->val = pasid_entry_val;
+
+		wmb();
+
+		/*
+		 * Flush PASID cache when a PASID table entry becomes
+		 * present.
+		 */
+		if (cap_caching_mode(iommu->cap))
+			intel_flush_pasid_dev(svm, sdev, svm->pasid);
+
+		list_add_tail(&svm->list, &global_svm_list);
+	}
+	list_add_rcu(&sdev->list, &svm->devs);
+
+ success:
+	*pasid = svm->pasid;
+	ret = 0;
+ out:
+	mutex_unlock(&pasid_mutex);
+	if (mm)
+		mmput(mm);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(intel_svm_bind_mm);
+
+int intel_svm_unbind_mm(struct device *dev, int pasid)
+{
+	struct intel_svm_dev *sdev;
+	struct intel_iommu *iommu;
+	struct intel_svm *svm;
+	int ret = -EINVAL;
+
+	mutex_lock(&pasid_mutex);
+	iommu = intel_svm_device_to_iommu(dev);
+	if (!iommu)
+		goto out;
+
+	svm = intel_pasid_lookup_id(pasid);
+	if (!svm)
+		goto out;
+
+	list_for_each_entry(sdev, &svm->devs, list) {
+		if (dev == sdev->dev) {
+			ret = 0;
+			sdev->users--;
+			if (!sdev->users) {
+				list_del_rcu(&sdev->list);
+				/* Flush the PASID cache and IOTLB for this device.
+				 * Note that we do depend on the hardware *not* using
+				 * the PASID any more. Just as we depend on other
+				 * devices never using PASIDs that they have no right
+				 * to use. We have a *shared* PASID table, because it's
+				 * large and has to be physically contiguous. So it's
+				 * hard to be as defensive as we might like. */
+				intel_flush_pasid_dev(svm, sdev, svm->pasid);
+				intel_flush_svm_range_dev(svm, sdev, 0, -1, 0, !svm->mm);
+				kfree_rcu(sdev, rcu);
+				intel_pasid_clear_entry(dev, svm->pasid);
+
+				if (list_empty(&svm->devs)) {
+					intel_pasid_free_id(svm->pasid);
+					if (svm->mm)
+						mmu_notifier_unregister(&svm->notifier, svm->mm);
+
+					list_del(&svm->list);
+
+					/* We mandate that no page faults may be outstanding
+					 * for the PASID when intel_svm_unbind_mm() is called.
+					 * If that is not obeyed, subtle errors will happen.
+					 * Let's make them less subtle... */
+					memset(svm, 0x6b, sizeof(*svm));
+					kfree(svm);
+				}
+			}
+			break;
+		}
+	}
+ out:
+	mutex_unlock(&pasid_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(intel_svm_unbind_mm);
+
+int intel_svm_is_pasid_valid(struct device *dev, int pasid)
+{
+	struct intel_iommu *iommu;
+	struct intel_svm *svm;
+	int ret = -EINVAL;
+
+	mutex_lock(&pasid_mutex);
+	iommu = intel_svm_device_to_iommu(dev);
+	if (!iommu)
+		goto out;
+
+	svm = intel_pasid_lookup_id(pasid);
+	if (!svm)
+		goto out;
+
+	/* init_mm is used in this case */
+	if (!svm->mm)
+		ret = 1;
+	else if (atomic_read(&svm->mm->mm_users) > 0)
+		ret = 1;
+	else
+		ret = 0;
+
+ out:
+	mutex_unlock(&pasid_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(intel_svm_is_pasid_valid);
+
+/* Page request queue descriptor */
+struct page_req_dsc {
+	u64 srr:1;
+	u64 bof:1;
+	u64 pasid_present:1;
+	u64 lpig:1;
+	u64 pasid:20;
+	u64 bus:8;
+	u64 private:23;
+	u64 prg_index:9;
+	u64 rd_req:1;
+	u64 wr_req:1;
+	u64 exe_req:1;
+	u64 priv_req:1;
+	u64 devfn:8;
+	u64 addr:52;
+};
+
+#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x10)
+
+static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
+{
+	unsigned long requested = 0;
+
+	if (req->exe_req)
+		requested |= VM_EXEC;
+
+	if (req->rd_req)
+		requested |= VM_READ;
+
+	if (req->wr_req)
+		requested |= VM_WRITE;
+
+	return (requested & ~vma->vm_flags) != 0;
+}
+
+static bool is_canonical_address(u64 addr)
+{
+	int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
+	long saddr = (long) addr;
+
+	return (((saddr << shift) >> shift) == saddr);
+}
+
+static irqreturn_t prq_event_thread(int irq, void *d)
+{
+	struct intel_iommu *iommu = d;
+	struct intel_svm *svm = NULL;
+	int head, tail, handled = 0;
+
+	/* Clear PPR bit before reading head/tail registers, to
+	 * ensure that we get a new interrupt if needed. */
+	writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);
+
+	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
+	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
+	while (head != tail) {
+		struct intel_svm_dev *sdev;
+		struct vm_area_struct *vma;
+		struct page_req_dsc *req;
+		struct qi_desc resp;
+		int result;
+		vm_fault_t ret;
+		u64 address;
+
+		handled = 1;
+
+		req = &iommu->prq[head / sizeof(*req)];
+
+		result = QI_RESP_FAILURE;
+		address = (u64)req->addr << VTD_PAGE_SHIFT;
+		if (!req->pasid_present) {
+			pr_err("%s: Page request without PASID: %08llx %08llx\n",
+			       iommu->name, ((unsigned long long *)req)[0],
+			       ((unsigned long long *)req)[1]);
+			goto no_pasid;
+		}
+
+		if (!svm || svm->pasid != req->pasid) {
+			rcu_read_lock();
+			svm = intel_pasid_lookup_id(req->pasid);
+			/* It *can't* go away, because the driver is not permitted
+			 * to unbind the mm while any page faults are outstanding.
+			 * So we only need RCU to protect the internal idr code. */
+			rcu_read_unlock();
+
+			if (!svm) {
+				pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
+				       iommu->name, req->pasid, ((unsigned long long *)req)[0],
+				       ((unsigned long long *)req)[1]);
+				goto no_pasid;
+			}
+		}
+
+		result = QI_RESP_INVALID;
+		/* Since we're using init_mm.pgd directly, we should never take
+		 * any faults on kernel addresses. */
+		if (!svm->mm)
+			goto bad_req;
+
+		/* If address is not canonical, return invalid response */
+		if (!is_canonical_address(address))
+			goto bad_req;
+
+		/* If the mm is already defunct, don't handle faults. */
+		if (!mmget_not_zero(svm->mm))
+			goto bad_req;
+
+		down_read(&svm->mm->mmap_sem);
+		vma = find_extend_vma(svm->mm, address);
+		if (!vma || address < vma->vm_start)
+			goto invalid;
+
+		if (access_error(vma, req))
+			goto invalid;
+
+		ret = handle_mm_fault(vma, address,
+				      req->wr_req ? FAULT_FLAG_WRITE : 0);
+		if (ret & VM_FAULT_ERROR)
+			goto invalid;
+
+		result = QI_RESP_SUCCESS;
+	invalid:
+		up_read(&svm->mm->mmap_sem);
+		mmput(svm->mm);
+	bad_req:
+		/* Accounting for major/minor faults? */
+		rcu_read_lock();
+		list_for_each_entry_rcu(sdev, &svm->devs, list) {
+			if (sdev->sid == PCI_DEVID(req->bus, req->devfn))
+				break;
+		}
+		/* Other devices can go away, but the drivers are not permitted
+		 * to unbind while any page faults might be in flight. So it's
+		 * OK to drop the 'lock' here now we have it. */
+		rcu_read_unlock();
+
+		if (WARN_ON(&sdev->list == &svm->devs))
+			sdev = NULL;
+
+		if (sdev && sdev->ops && sdev->ops->fault_cb) {
+			int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
+				(req->exe_req << 1) | (req->priv_req);
+			sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr, req->private, rwxp, result);
+		}
+		/* We get here in the error case where the PASID lookup failed,
+		   and these can be NULL. Do not use them below this point! */
+		sdev = NULL;
+		svm = NULL;
+	no_pasid:
+		if (req->lpig) {
+			/* Page Group Response */
+			resp.low = QI_PGRP_PASID(req->pasid) |
+				QI_PGRP_DID((req->bus << 8) | req->devfn) |
+				QI_PGRP_PASID_P(req->pasid_present) |
+				QI_PGRP_RESP_TYPE;
+			resp.high = QI_PGRP_IDX(req->prg_index) |
+				QI_PGRP_PRIV(req->private) | QI_PGRP_RESP_CODE(result);
+
+			qi_submit_sync(&resp, iommu);
+		} else if (req->srr) {
+			/* Page Stream Response */
+			resp.low = QI_PSTRM_IDX(req->prg_index) |
+				QI_PSTRM_PRIV(req->private) | QI_PSTRM_BUS(req->bus) |
+				QI_PSTRM_PASID(req->pasid) | QI_PSTRM_RESP_TYPE;
+			resp.high = QI_PSTRM_ADDR(address) | QI_PSTRM_DEVFN(req->devfn) |
+				QI_PSTRM_RESP_CODE(result);
+
+			qi_submit_sync(&resp, iommu);
+		}
+
+		head = (head + sizeof(*req)) & PRQ_RING_MASK;
+	}
+
+	dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);
+
+	return IRQ_RETVAL(handled);
+}
diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c
new file mode 100644
index 000000000..17fc262f2
--- /dev/null
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -0,0 +1,1514 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt)     "DMAR-IR: " fmt
+
+#include <linux/interrupt.h>
+#include <linux/dmar.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+#include <linux/hpet.h>
+#include <linux/pci.h>
+#include <linux/irq.h>
+#include <linux/intel-iommu.h>
+#include <linux/acpi.h>
+#include <linux/irqdomain.h>
+#include <linux/crash_dump.h>
+#include <asm/io_apic.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <asm/irq_remapping.h>
+#include <asm/pci-direct.h>
+#include <asm/msidef.h>
+
+#include "irq_remapping.h"
+
+enum irq_mode {
+	IRQ_REMAPPING,
+	IRQ_POSTING,
+};
+
+struct ioapic_scope {
+	struct intel_iommu *iommu;
+	unsigned int id;
+	unsigned int bus;	/* PCI bus number */
+	unsigned int devfn;	/* PCI devfn number */
+};
+
+struct hpet_scope {
+	struct intel_iommu *iommu;
+	u8 id;
+	unsigned int bus;
+	unsigned int devfn;
+};
+
+struct irq_2_iommu {
+	struct intel_iommu *iommu;
+	u16 irte_index;
+	u16 sub_handle;
+	u8  irte_mask;
+	enum irq_mode mode;
+};
+
+struct intel_ir_data {
+	struct irq_2_iommu			irq_2_iommu;
+	struct irte				irte_entry;
+	union {
+		struct msi_msg			msi_entry;
+	};
+};
+
+#define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0)
+#define IRTE_DEST(dest) ((eim_mode) ? dest : dest << 8)
+
+static int __read_mostly eim_mode;
+static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
+static struct hpet_scope ir_hpet[MAX_HPET_TBS];
+
+/*
+ * Lock ordering:
+ * ->dmar_global_lock
+ *	->irq_2_ir_lock
+ *		->qi->q_lock
+ *	->iommu->register_lock
+ * Note:
+ * intel_irq_remap_ops.{supported,prepare,enable,disable,reenable} are called
+ * in single-threaded environment with interrupt disabled, so no need to tabke
+ * the dmar_global_lock.
+ */
+static DEFINE_RAW_SPINLOCK(irq_2_ir_lock);
+static const struct irq_domain_ops intel_ir_domain_ops;
+
+static void iommu_disable_irq_remapping(struct intel_iommu *iommu);
+static int __init parse_ioapics_under_ir(void);
+
+static bool ir_pre_enabled(struct intel_iommu *iommu)
+{
+	return (iommu->flags & VTD_FLAG_IRQ_REMAP_PRE_ENABLED);
+}
+
+static void clear_ir_pre_enabled(struct intel_iommu *iommu)
+{
+	iommu->flags &= ~VTD_FLAG_IRQ_REMAP_PRE_ENABLED;
+}
+
+static void init_ir_status(struct intel_iommu *iommu)
+{
+	u32 gsts;
+
+	gsts = readl(iommu->reg + DMAR_GSTS_REG);
+	if (gsts & DMA_GSTS_IRES)
+		iommu->flags |= VTD_FLAG_IRQ_REMAP_PRE_ENABLED;
+}
+
+static int alloc_irte(struct intel_iommu *iommu, int irq,
+		      struct irq_2_iommu *irq_iommu, u16 count)
+{
+	struct ir_table *table = iommu->ir_table;
+	unsigned int mask = 0;
+	unsigned long flags;
+	int index;
+
+	if (!count || !irq_iommu)
+		return -1;
+
+	if (count > 1) {
+		count = __roundup_pow_of_two(count);
+		mask = ilog2(count);
+	}
+
+	if (mask > ecap_max_handle_mask(iommu->ecap)) {
+		pr_err("Requested mask %x exceeds the max invalidation handle"
+		       " mask value %Lx\n", mask,
+		       ecap_max_handle_mask(iommu->ecap));
+		return -1;
+	}
+
+	raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+	index = bitmap_find_free_region(table->bitmap,
+					INTR_REMAP_TABLE_ENTRIES, mask);
+	if (index < 0) {
+		pr_warn("IR%d: can't allocate an IRTE\n", iommu->seq_id);
+	} else {
+		irq_iommu->iommu = iommu;
+		irq_iommu->irte_index =  index;
+		irq_iommu->sub_handle = 0;
+		irq_iommu->irte_mask = mask;
+		irq_iommu->mode = IRQ_REMAPPING;
+	}
+	raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+
+	return index;
+}
+
+static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask)
+{
+	struct qi_desc desc;
+
+	desc.low = QI_IEC_IIDEX(index) | QI_IEC_TYPE | QI_IEC_IM(mask)
+		   | QI_IEC_SELECTIVE;
+	desc.high = 0;
+
+	return qi_submit_sync(&desc, iommu);
+}
+
+static int modify_irte(struct irq_2_iommu *irq_iommu,
+		       struct irte *irte_modified)
+{
+	struct intel_iommu *iommu;
+	unsigned long flags;
+	struct irte *irte;
+	int rc, index;
+
+	if (!irq_iommu)
+		return -1;
+
+	raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+
+	iommu = irq_iommu->iommu;
+
+	index = irq_iommu->irte_index + irq_iommu->sub_handle;
+	irte = &iommu->ir_table->base[index];
+
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE)
+	if ((irte->pst == 1) || (irte_modified->pst == 1)) {
+		bool ret;
+
+		ret = cmpxchg_double(&irte->low, &irte->high,
+				     irte->low, irte->high,
+				     irte_modified->low, irte_modified->high);
+		/*
+		 * We use cmpxchg16 to atomically update the 128-bit IRTE,
+		 * and it cannot be updated by the hardware or other processors
+		 * behind us, so the return value of cmpxchg16 should be the
+		 * same as the old value.
+		 */
+		WARN_ON(!ret);
+	} else
+#endif
+	{
+		set_64bit(&irte->low, irte_modified->low);
+		set_64bit(&irte->high, irte_modified->high);
+	}
+	__iommu_flush_cache(iommu, irte, sizeof(*irte));
+
+	rc = qi_flush_iec(iommu, index, 0);
+
+	/* Update iommu mode according to the IRTE mode */
+	irq_iommu->mode = irte->pst ? IRQ_POSTING : IRQ_REMAPPING;
+	raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+
+	return rc;
+}
+
+static struct intel_iommu *map_hpet_to_ir(u8 hpet_id)
+{
+	int i;
+
+	for (i = 0; i < MAX_HPET_TBS; i++)
+		if (ir_hpet[i].id == hpet_id && ir_hpet[i].iommu)
+			return ir_hpet[i].iommu;
+	return NULL;
+}
+
+static struct intel_iommu *map_ioapic_to_ir(int apic)
+{
+	int i;
+
+	for (i = 0; i < MAX_IO_APICS; i++)
+		if (ir_ioapic[i].id == apic && ir_ioapic[i].iommu)
+			return ir_ioapic[i].iommu;
+	return NULL;
+}
+
+static struct intel_iommu *map_dev_to_ir(struct pci_dev *dev)
+{
+	struct dmar_drhd_unit *drhd;
+
+	drhd = dmar_find_matched_drhd_unit(dev);
+	if (!drhd)
+		return NULL;
+
+	return drhd->iommu;
+}
+
+static int clear_entries(struct irq_2_iommu *irq_iommu)
+{
+	struct irte *start, *entry, *end;
+	struct intel_iommu *iommu;
+	int index;
+
+	if (irq_iommu->sub_handle)
+		return 0;
+
+	iommu = irq_iommu->iommu;
+	index = irq_iommu->irte_index;
+
+	start = iommu->ir_table->base + index;
+	end = start + (1 << irq_iommu->irte_mask);
+
+	for (entry = start; entry < end; entry++) {
+		set_64bit(&entry->low, 0);
+		set_64bit(&entry->high, 0);
+	}
+	bitmap_release_region(iommu->ir_table->bitmap, index,
+			      irq_iommu->irte_mask);
+
+	return qi_flush_iec(iommu, index, irq_iommu->irte_mask);
+}
+
+/*
+ * source validation type
+ */
+#define SVT_NO_VERIFY		0x0  /* no verification is required */
+#define SVT_VERIFY_SID_SQ	0x1  /* verify using SID and SQ fields */
+#define SVT_VERIFY_BUS		0x2  /* verify bus of request-id */
+
+/*
+ * source-id qualifier
+ */
+#define SQ_ALL_16	0x0  /* verify all 16 bits of request-id */
+#define SQ_13_IGNORE_1	0x1  /* verify most significant 13 bits, ignore
+			      * the third least significant bit
+			      */
+#define SQ_13_IGNORE_2	0x2  /* verify most significant 13 bits, ignore
+			      * the second and third least significant bits
+			      */
+#define SQ_13_IGNORE_3	0x3  /* verify most significant 13 bits, ignore
+			      * the least three significant bits
+			      */
+
+/*
+ * set SVT, SQ and SID fields of irte to verify
+ * source ids of interrupt requests
+ */
+static void set_irte_sid(struct irte *irte, unsigned int svt,
+			 unsigned int sq, unsigned int sid)
+{
+	if (disable_sourceid_checking)
+		svt = SVT_NO_VERIFY;
+	irte->svt = svt;
+	irte->sq = sq;
+	irte->sid = sid;
+}
+
+static int set_ioapic_sid(struct irte *irte, int apic)
+{
+	int i;
+	u16 sid = 0;
+
+	if (!irte)
+		return -1;
+
+	down_read(&dmar_global_lock);
+	for (i = 0; i < MAX_IO_APICS; i++) {
+		if (ir_ioapic[i].iommu && ir_ioapic[i].id == apic) {
+			sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn;
+			break;
+		}
+	}
+	up_read(&dmar_global_lock);
+
+	if (sid == 0) {
+		pr_warn("Failed to set source-id of IOAPIC (%d)\n", apic);
+		return -1;
+	}
+
+	set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16, sid);
+
+	return 0;
+}
+
+static int set_hpet_sid(struct irte *irte, u8 id)
+{
+	int i;
+	u16 sid = 0;
+
+	if (!irte)
+		return -1;
+
+	down_read(&dmar_global_lock);
+	for (i = 0; i < MAX_HPET_TBS; i++) {
+		if (ir_hpet[i].iommu && ir_hpet[i].id == id) {
+			sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn;
+			break;
+		}
+	}
+	up_read(&dmar_global_lock);
+
+	if (sid == 0) {
+		pr_warn("Failed to set source-id of HPET block (%d)\n", id);
+		return -1;
+	}
+
+	/*
+	 * Should really use SQ_ALL_16. Some platforms are broken.
+	 * While we figure out the right quirks for these broken platforms, use
+	 * SQ_13_IGNORE_3 for now.
+	 */
+	set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_13_IGNORE_3, sid);
+
+	return 0;
+}
+
+struct set_msi_sid_data {
+	struct pci_dev *pdev;
+	u16 alias;
+};
+
+static int set_msi_sid_cb(struct pci_dev *pdev, u16 alias, void *opaque)
+{
+	struct set_msi_sid_data *data = opaque;
+
+	data->pdev = pdev;
+	data->alias = alias;
+
+	return 0;
+}
+
+static int set_msi_sid(struct irte *irte, struct pci_dev *dev)
+{
+	struct set_msi_sid_data data;
+
+	if (!irte || !dev)
+		return -1;
+
+	pci_for_each_dma_alias(dev, set_msi_sid_cb, &data);
+
+	/*
+	 * DMA alias provides us with a PCI device and alias.  The only case
+	 * where the it will return an alias on a different bus than the
+	 * device is the case of a PCIe-to-PCI bridge, where the alias is for
+	 * the subordinate bus.  In this case we can only verify the bus.
+	 *
+	 * If the alias device is on a different bus than our source device
+	 * then we have a topology based alias, use it.
+	 *
+	 * Otherwise, the alias is for a device DMA quirk and we cannot
+	 * assume that MSI uses the same requester ID.  Therefore use the
+	 * original device.
+	 */
+	if (PCI_BUS_NUM(data.alias) != data.pdev->bus->number)
+		set_irte_sid(irte, SVT_VERIFY_BUS, SQ_ALL_16,
+			     PCI_DEVID(PCI_BUS_NUM(data.alias),
+				       dev->bus->number));
+	else if (data.pdev->bus->number != dev->bus->number)
+		set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16, data.alias);
+	else
+		set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16,
+			     PCI_DEVID(dev->bus->number, dev->devfn));
+
+	return 0;
+}
+
+static int iommu_load_old_irte(struct intel_iommu *iommu)
+{
+	struct irte *old_ir_table;
+	phys_addr_t irt_phys;
+	unsigned int i;
+	size_t size;
+	u64 irta;
+
+	/* Check whether the old ir-table has the same size as ours */
+	irta = dmar_readq(iommu->reg + DMAR_IRTA_REG);
+	if ((irta & INTR_REMAP_TABLE_REG_SIZE_MASK)
+	     != INTR_REMAP_TABLE_REG_SIZE)
+		return -EINVAL;
+
+	irt_phys = irta & VTD_PAGE_MASK;
+	size     = INTR_REMAP_TABLE_ENTRIES*sizeof(struct irte);
+
+	/* Map the old IR table */
+	old_ir_table = memremap(irt_phys, size, MEMREMAP_WB);
+	if (!old_ir_table)
+		return -ENOMEM;
+
+	/* Copy data over */
+	memcpy(iommu->ir_table->base, old_ir_table, size);
+
+	__iommu_flush_cache(iommu, iommu->ir_table->base, size);
+
+	/*
+	 * Now check the table for used entries and mark those as
+	 * allocated in the bitmap
+	 */
+	for (i = 0; i < INTR_REMAP_TABLE_ENTRIES; i++) {
+		if (iommu->ir_table->base[i].present)
+			bitmap_set(iommu->ir_table->bitmap, i, 1);
+	}
+
+	memunmap(old_ir_table);
+
+	return 0;
+}
+
+
+static void iommu_set_irq_remapping(struct intel_iommu *iommu, int mode)
+{
+	unsigned long flags;
+	u64 addr;
+	u32 sts;
+
+	addr = virt_to_phys((void *)iommu->ir_table->base);
+
+	raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+	dmar_writeq(iommu->reg + DMAR_IRTA_REG,
+		    (addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE);
+
+	/* Set interrupt-remapping table pointer */
+	writel(iommu->gcmd | DMA_GCMD_SIRTP, iommu->reg + DMAR_GCMD_REG);
+
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+		      readl, (sts & DMA_GSTS_IRTPS), sts);
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+	/*
+	 * Global invalidation of interrupt entry cache to make sure the
+	 * hardware uses the new irq remapping table.
+	 */
+	qi_global_iec(iommu);
+}
+
+static void iommu_enable_irq_remapping(struct intel_iommu *iommu)
+{
+	unsigned long flags;
+	u32 sts;
+
+	raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+	/* Enable interrupt-remapping */
+	iommu->gcmd |= DMA_GCMD_IRE;
+	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+		      readl, (sts & DMA_GSTS_IRES), sts);
+
+	/* Block compatibility-format MSIs */
+	if (sts & DMA_GSTS_CFIS) {
+		iommu->gcmd &= ~DMA_GCMD_CFI;
+		writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+		IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+			      readl, !(sts & DMA_GSTS_CFIS), sts);
+	}
+
+	/*
+	 * With CFI clear in the Global Command register, we should be
+	 * protected from dangerous (i.e. compatibility) interrupts
+	 * regardless of x2apic status.  Check just to be sure.
+	 */
+	if (sts & DMA_GSTS_CFIS)
+		WARN(1, KERN_WARNING
+			"Compatibility-format IRQs enabled despite intr remapping;\n"
+			"you are vulnerable to IRQ injection.\n");
+
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static int intel_setup_irq_remapping(struct intel_iommu *iommu)
+{
+	struct ir_table *ir_table;
+	struct fwnode_handle *fn;
+	unsigned long *bitmap;
+	struct page *pages;
+
+	if (iommu->ir_table)
+		return 0;
+
+	ir_table = kzalloc(sizeof(struct ir_table), GFP_KERNEL);
+	if (!ir_table)
+		return -ENOMEM;
+
+	pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO,
+				 INTR_REMAP_PAGE_ORDER);
+	if (!pages) {
+		pr_err("IR%d: failed to allocate pages of order %d\n",
+		       iommu->seq_id, INTR_REMAP_PAGE_ORDER);
+		goto out_free_table;
+	}
+
+	bitmap = kcalloc(BITS_TO_LONGS(INTR_REMAP_TABLE_ENTRIES),
+			 sizeof(long), GFP_ATOMIC);
+	if (bitmap == NULL) {
+		pr_err("IR%d: failed to allocate bitmap\n", iommu->seq_id);
+		goto out_free_pages;
+	}
+
+	fn = irq_domain_alloc_named_id_fwnode("INTEL-IR", iommu->seq_id);
+	if (!fn)
+		goto out_free_bitmap;
+
+	iommu->ir_domain =
+		irq_domain_create_hierarchy(arch_get_ir_parent_domain(),
+					    0, INTR_REMAP_TABLE_ENTRIES,
+					    fn, &intel_ir_domain_ops,
+					    iommu);
+	if (!iommu->ir_domain) {
+		pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id);
+		goto out_free_fwnode;
+	}
+	iommu->ir_msi_domain =
+		arch_create_remap_msi_irq_domain(iommu->ir_domain,
+						 "INTEL-IR-MSI",
+						 iommu->seq_id);
+
+	ir_table->base = page_address(pages);
+	ir_table->bitmap = bitmap;
+	iommu->ir_table = ir_table;
+
+	/*
+	 * If the queued invalidation is already initialized,
+	 * shouldn't disable it.
+	 */
+	if (!iommu->qi) {
+		/*
+		 * Clear previous faults.
+		 */
+		dmar_fault(-1, iommu);
+		dmar_disable_qi(iommu);
+
+		if (dmar_enable_qi(iommu)) {
+			pr_err("Failed to enable queued invalidation\n");
+			goto out_free_ir_domain;
+		}
+	}
+
+	init_ir_status(iommu);
+
+	if (ir_pre_enabled(iommu)) {
+		if (!is_kdump_kernel()) {
+			pr_warn("IRQ remapping was enabled on %s but we are not in kdump mode\n",
+				iommu->name);
+			clear_ir_pre_enabled(iommu);
+			iommu_disable_irq_remapping(iommu);
+		} else if (iommu_load_old_irte(iommu))
+			pr_err("Failed to copy IR table for %s from previous kernel\n",
+			       iommu->name);
+		else
+			pr_info("Copied IR table for %s from previous kernel\n",
+				iommu->name);
+	}
+
+	iommu_set_irq_remapping(iommu, eim_mode);
+
+	return 0;
+
+out_free_ir_domain:
+	if (iommu->ir_msi_domain)
+		irq_domain_remove(iommu->ir_msi_domain);
+	iommu->ir_msi_domain = NULL;
+	irq_domain_remove(iommu->ir_domain);
+	iommu->ir_domain = NULL;
+out_free_fwnode:
+	irq_domain_free_fwnode(fn);
+out_free_bitmap:
+	kfree(bitmap);
+out_free_pages:
+	__free_pages(pages, INTR_REMAP_PAGE_ORDER);
+out_free_table:
+	kfree(ir_table);
+
+	iommu->ir_table  = NULL;
+
+	return -ENOMEM;
+}
+
+static void intel_teardown_irq_remapping(struct intel_iommu *iommu)
+{
+	struct fwnode_handle *fn;
+
+	if (iommu && iommu->ir_table) {
+		if (iommu->ir_msi_domain) {
+			fn = iommu->ir_msi_domain->fwnode;
+
+			irq_domain_remove(iommu->ir_msi_domain);
+			irq_domain_free_fwnode(fn);
+			iommu->ir_msi_domain = NULL;
+		}
+		if (iommu->ir_domain) {
+			fn = iommu->ir_domain->fwnode;
+
+			irq_domain_remove(iommu->ir_domain);
+			irq_domain_free_fwnode(fn);
+			iommu->ir_domain = NULL;
+		}
+		free_pages((unsigned long)iommu->ir_table->base,
+			   INTR_REMAP_PAGE_ORDER);
+		kfree(iommu->ir_table->bitmap);
+		kfree(iommu->ir_table);
+		iommu->ir_table = NULL;
+	}
+}
+
+/*
+ * Disable Interrupt Remapping.
+ */
+static void iommu_disable_irq_remapping(struct intel_iommu *iommu)
+{
+	unsigned long flags;
+	u32 sts;
+
+	if (!ecap_ir_support(iommu->ecap))
+		return;
+
+	/*
+	 * global invalidation of interrupt entry cache before disabling
+	 * interrupt-remapping.
+	 */
+	qi_global_iec(iommu);
+
+	raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+	sts = readl(iommu->reg + DMAR_GSTS_REG);
+	if (!(sts & DMA_GSTS_IRES))
+		goto end;
+
+	iommu->gcmd &= ~DMA_GCMD_IRE;
+	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+		      readl, !(sts & DMA_GSTS_IRES), sts);
+
+end:
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static int __init dmar_x2apic_optout(void)
+{
+	struct acpi_table_dmar *dmar;
+	dmar = (struct acpi_table_dmar *)dmar_tbl;
+	if (!dmar || no_x2apic_optout)
+		return 0;
+	return dmar->flags & DMAR_X2APIC_OPT_OUT;
+}
+
+static void __init intel_cleanup_irq_remapping(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu;
+
+	for_each_iommu(iommu, drhd) {
+		if (ecap_ir_support(iommu->ecap)) {
+			iommu_disable_irq_remapping(iommu);
+			intel_teardown_irq_remapping(iommu);
+		}
+	}
+
+	if (x2apic_supported())
+		pr_warn("Failed to enable irq remapping. You are vulnerable to irq-injection attacks.\n");
+}
+
+static int __init intel_prepare_irq_remapping(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu;
+	int eim = 0;
+
+	if (irq_remap_broken) {
+		pr_warn("This system BIOS has enabled interrupt remapping\n"
+			"on a chipset that contains an erratum making that\n"
+			"feature unstable.  To maintain system stability\n"
+			"interrupt remapping is being disabled.  Please\n"
+			"contact your BIOS vendor for an update\n");
+		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+		return -ENODEV;
+	}
+
+	if (dmar_table_init() < 0)
+		return -ENODEV;
+
+	if (!dmar_ir_support())
+		return -ENODEV;
+
+	if (parse_ioapics_under_ir()) {
+		pr_info("Not enabling interrupt remapping\n");
+		goto error;
+	}
+
+	/* First make sure all IOMMUs support IRQ remapping */
+	for_each_iommu(iommu, drhd)
+		if (!ecap_ir_support(iommu->ecap))
+			goto error;
+
+	/* Detect remapping mode: lapic or x2apic */
+	if (x2apic_supported()) {
+		eim = !dmar_x2apic_optout();
+		if (!eim) {
+			pr_info("x2apic is disabled because BIOS sets x2apic opt out bit.");
+			pr_info("Use 'intremap=no_x2apic_optout' to override the BIOS setting.\n");
+		}
+	}
+
+	for_each_iommu(iommu, drhd) {
+		if (eim && !ecap_eim_support(iommu->ecap)) {
+			pr_info("%s does not support EIM\n", iommu->name);
+			eim = 0;
+		}
+	}
+
+	eim_mode = eim;
+	if (eim)
+		pr_info("Queued invalidation will be enabled to support x2apic and Intr-remapping.\n");
+
+	/* Do the initializations early */
+	for_each_iommu(iommu, drhd) {
+		if (intel_setup_irq_remapping(iommu)) {
+			pr_err("Failed to setup irq remapping for %s\n",
+			       iommu->name);
+			goto error;
+		}
+	}
+
+	return 0;
+
+error:
+	intel_cleanup_irq_remapping();
+	return -ENODEV;
+}
+
+/*
+ * Set Posted-Interrupts capability.
+ */
+static inline void set_irq_posting_cap(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu;
+
+	if (!disable_irq_post) {
+		/*
+		 * If IRTE is in posted format, the 'pda' field goes across the
+		 * 64-bit boundary, we need use cmpxchg16b to atomically update
+		 * it. We only expose posted-interrupt when X86_FEATURE_CX16
+		 * is supported. Actually, hardware platforms supporting PI
+		 * should have X86_FEATURE_CX16 support, this has been confirmed
+		 * with Intel hardware guys.
+		 */
+		if (boot_cpu_has(X86_FEATURE_CX16))
+			intel_irq_remap_ops.capability |= 1 << IRQ_POSTING_CAP;
+
+		for_each_iommu(iommu, drhd)
+			if (!cap_pi_support(iommu->cap)) {
+				intel_irq_remap_ops.capability &=
+						~(1 << IRQ_POSTING_CAP);
+				break;
+			}
+	}
+}
+
+static int __init intel_enable_irq_remapping(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu;
+	bool setup = false;
+
+	/*
+	 * Setup Interrupt-remapping for all the DRHD's now.
+	 */
+	for_each_iommu(iommu, drhd) {
+		if (!ir_pre_enabled(iommu))
+			iommu_enable_irq_remapping(iommu);
+		setup = true;
+	}
+
+	if (!setup)
+		goto error;
+
+	irq_remapping_enabled = 1;
+
+	set_irq_posting_cap();
+
+	pr_info("Enabled IRQ remapping in %s mode\n", eim_mode ? "x2apic" : "xapic");
+
+	return eim_mode ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE;
+
+error:
+	intel_cleanup_irq_remapping();
+	return -1;
+}
+
+static int ir_parse_one_hpet_scope(struct acpi_dmar_device_scope *scope,
+				   struct intel_iommu *iommu,
+				   struct acpi_dmar_hardware_unit *drhd)
+{
+	struct acpi_dmar_pci_path *path;
+	u8 bus;
+	int count, free = -1;
+
+	bus = scope->bus;
+	path = (struct acpi_dmar_pci_path *)(scope + 1);
+	count = (scope->length - sizeof(struct acpi_dmar_device_scope))
+		/ sizeof(struct acpi_dmar_pci_path);
+
+	while (--count > 0) {
+		/*
+		 * Access PCI directly due to the PCI
+		 * subsystem isn't initialized yet.
+		 */
+		bus = read_pci_config_byte(bus, path->device, path->function,
+					   PCI_SECONDARY_BUS);
+		path++;
+	}
+
+	for (count = 0; count < MAX_HPET_TBS; count++) {
+		if (ir_hpet[count].iommu == iommu &&
+		    ir_hpet[count].id == scope->enumeration_id)
+			return 0;
+		else if (ir_hpet[count].iommu == NULL && free == -1)
+			free = count;
+	}
+	if (free == -1) {
+		pr_warn("Exceeded Max HPET blocks\n");
+		return -ENOSPC;
+	}
+
+	ir_hpet[free].iommu = iommu;
+	ir_hpet[free].id    = scope->enumeration_id;
+	ir_hpet[free].bus   = bus;
+	ir_hpet[free].devfn = PCI_DEVFN(path->device, path->function);
+	pr_info("HPET id %d under DRHD base 0x%Lx\n",
+		scope->enumeration_id, drhd->address);
+
+	return 0;
+}
+
+static int ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope,
+				     struct intel_iommu *iommu,
+				     struct acpi_dmar_hardware_unit *drhd)
+{
+	struct acpi_dmar_pci_path *path;
+	u8 bus;
+	int count, free = -1;
+
+	bus = scope->bus;
+	path = (struct acpi_dmar_pci_path *)(scope + 1);
+	count = (scope->length - sizeof(struct acpi_dmar_device_scope))
+		/ sizeof(struct acpi_dmar_pci_path);
+
+	while (--count > 0) {
+		/*
+		 * Access PCI directly due to the PCI
+		 * subsystem isn't initialized yet.
+		 */
+		bus = read_pci_config_byte(bus, path->device, path->function,
+					   PCI_SECONDARY_BUS);
+		path++;
+	}
+
+	for (count = 0; count < MAX_IO_APICS; count++) {
+		if (ir_ioapic[count].iommu == iommu &&
+		    ir_ioapic[count].id == scope->enumeration_id)
+			return 0;
+		else if (ir_ioapic[count].iommu == NULL && free == -1)
+			free = count;
+	}
+	if (free == -1) {
+		pr_warn("Exceeded Max IO APICS\n");
+		return -ENOSPC;
+	}
+
+	ir_ioapic[free].bus   = bus;
+	ir_ioapic[free].devfn = PCI_DEVFN(path->device, path->function);
+	ir_ioapic[free].iommu = iommu;
+	ir_ioapic[free].id    = scope->enumeration_id;
+	pr_info("IOAPIC id %d under DRHD base  0x%Lx IOMMU %d\n",
+		scope->enumeration_id, drhd->address, iommu->seq_id);
+
+	return 0;
+}
+
+static int ir_parse_ioapic_hpet_scope(struct acpi_dmar_header *header,
+				      struct intel_iommu *iommu)
+{
+	int ret = 0;
+	struct acpi_dmar_hardware_unit *drhd;
+	struct acpi_dmar_device_scope *scope;
+	void *start, *end;
+
+	drhd = (struct acpi_dmar_hardware_unit *)header;
+	start = (void *)(drhd + 1);
+	end = ((void *)drhd) + header->length;
+
+	while (start < end && ret == 0) {
+		scope = start;
+		if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC)
+			ret = ir_parse_one_ioapic_scope(scope, iommu, drhd);
+		else if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_HPET)
+			ret = ir_parse_one_hpet_scope(scope, iommu, drhd);
+		start += scope->length;
+	}
+
+	return ret;
+}
+
+static void ir_remove_ioapic_hpet_scope(struct intel_iommu *iommu)
+{
+	int i;
+
+	for (i = 0; i < MAX_HPET_TBS; i++)
+		if (ir_hpet[i].iommu == iommu)
+			ir_hpet[i].iommu = NULL;
+
+	for (i = 0; i < MAX_IO_APICS; i++)
+		if (ir_ioapic[i].iommu == iommu)
+			ir_ioapic[i].iommu = NULL;
+}
+
+/*
+ * Finds the assocaition between IOAPIC's and its Interrupt-remapping
+ * hardware unit.
+ */
+static int __init parse_ioapics_under_ir(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu;
+	bool ir_supported = false;
+	int ioapic_idx;
+
+	for_each_iommu(iommu, drhd) {
+		int ret;
+
+		if (!ecap_ir_support(iommu->ecap))
+			continue;
+
+		ret = ir_parse_ioapic_hpet_scope(drhd->hdr, iommu);
+		if (ret)
+			return ret;
+
+		ir_supported = true;
+	}
+
+	if (!ir_supported)
+		return -ENODEV;
+
+	for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
+		int ioapic_id = mpc_ioapic_id(ioapic_idx);
+		if (!map_ioapic_to_ir(ioapic_id)) {
+			pr_err(FW_BUG "ioapic %d has no mapping iommu, "
+			       "interrupt remapping will be disabled\n",
+			       ioapic_id);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+static int __init ir_dev_scope_init(void)
+{
+	int ret;
+
+	if (!irq_remapping_enabled)
+		return 0;
+
+	down_write(&dmar_global_lock);
+	ret = dmar_dev_scope_init();
+	up_write(&dmar_global_lock);
+
+	return ret;
+}
+rootfs_initcall(ir_dev_scope_init);
+
+static void disable_irq_remapping(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu = NULL;
+
+	/*
+	 * Disable Interrupt-remapping for all the DRHD's now.
+	 */
+	for_each_iommu(iommu, drhd) {
+		if (!ecap_ir_support(iommu->ecap))
+			continue;
+
+		iommu_disable_irq_remapping(iommu);
+	}
+
+	/*
+	 * Clear Posted-Interrupts capability.
+	 */
+	if (!disable_irq_post)
+		intel_irq_remap_ops.capability &= ~(1 << IRQ_POSTING_CAP);
+}
+
+static int reenable_irq_remapping(int eim)
+{
+	struct dmar_drhd_unit *drhd;
+	bool setup = false;
+	struct intel_iommu *iommu = NULL;
+
+	for_each_iommu(iommu, drhd)
+		if (iommu->qi)
+			dmar_reenable_qi(iommu);
+
+	/*
+	 * Setup Interrupt-remapping for all the DRHD's now.
+	 */
+	for_each_iommu(iommu, drhd) {
+		if (!ecap_ir_support(iommu->ecap))
+			continue;
+
+		/* Set up interrupt remapping for iommu.*/
+		iommu_set_irq_remapping(iommu, eim);
+		iommu_enable_irq_remapping(iommu);
+		setup = true;
+	}
+
+	if (!setup)
+		goto error;
+
+	set_irq_posting_cap();
+
+	return 0;
+
+error:
+	/*
+	 * handle error condition gracefully here!
+	 */
+	return -1;
+}
+
+static void prepare_irte(struct irte *irte, int vector, unsigned int dest)
+{
+	memset(irte, 0, sizeof(*irte));
+
+	irte->present = 1;
+	irte->dst_mode = apic->irq_dest_mode;
+	/*
+	 * Trigger mode in the IRTE will always be edge, and for IO-APIC, the
+	 * actual level or edge trigger will be setup in the IO-APIC
+	 * RTE. This will help simplify level triggered irq migration.
+	 * For more details, see the comments (in io_apic.c) explainig IO-APIC
+	 * irq migration in the presence of interrupt-remapping.
+	*/
+	irte->trigger_mode = 0;
+	irte->dlvry_mode = apic->irq_delivery_mode;
+	irte->vector = vector;
+	irte->dest_id = IRTE_DEST(dest);
+	irte->redir_hint = 1;
+}
+
+static struct irq_domain *intel_get_ir_irq_domain(struct irq_alloc_info *info)
+{
+	struct intel_iommu *iommu = NULL;
+
+	if (!info)
+		return NULL;
+
+	switch (info->type) {
+	case X86_IRQ_ALLOC_TYPE_IOAPIC:
+		iommu = map_ioapic_to_ir(info->ioapic_id);
+		break;
+	case X86_IRQ_ALLOC_TYPE_HPET:
+		iommu = map_hpet_to_ir(info->hpet_id);
+		break;
+	case X86_IRQ_ALLOC_TYPE_MSI:
+	case X86_IRQ_ALLOC_TYPE_MSIX:
+		iommu = map_dev_to_ir(info->msi_dev);
+		break;
+	default:
+		BUG_ON(1);
+		break;
+	}
+
+	return iommu ? iommu->ir_domain : NULL;
+}
+
+static struct irq_domain *intel_get_irq_domain(struct irq_alloc_info *info)
+{
+	struct intel_iommu *iommu;
+
+	if (!info)
+		return NULL;
+
+	switch (info->type) {
+	case X86_IRQ_ALLOC_TYPE_MSI:
+	case X86_IRQ_ALLOC_TYPE_MSIX:
+		iommu = map_dev_to_ir(info->msi_dev);
+		if (iommu)
+			return iommu->ir_msi_domain;
+		break;
+	default:
+		break;
+	}
+
+	return NULL;
+}
+
+struct irq_remap_ops intel_irq_remap_ops = {
+	.prepare		= intel_prepare_irq_remapping,
+	.enable			= intel_enable_irq_remapping,
+	.disable		= disable_irq_remapping,
+	.reenable		= reenable_irq_remapping,
+	.enable_faulting	= enable_drhd_fault_handling,
+	.get_ir_irq_domain	= intel_get_ir_irq_domain,
+	.get_irq_domain		= intel_get_irq_domain,
+};
+
+static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force)
+{
+	struct intel_ir_data *ir_data = irqd->chip_data;
+	struct irte *irte = &ir_data->irte_entry;
+	struct irq_cfg *cfg = irqd_cfg(irqd);
+
+	/*
+	 * Atomically updates the IRTE with the new destination, vector
+	 * and flushes the interrupt entry cache.
+	 */
+	irte->vector = cfg->vector;
+	irte->dest_id = IRTE_DEST(cfg->dest_apicid);
+
+	/* Update the hardware only if the interrupt is in remapped mode. */
+	if (force || ir_data->irq_2_iommu.mode == IRQ_REMAPPING)
+		modify_irte(&ir_data->irq_2_iommu, irte);
+}
+
+/*
+ * Migrate the IO-APIC irq in the presence of intr-remapping.
+ *
+ * For both level and edge triggered, irq migration is a simple atomic
+ * update(of vector and cpu destination) of IRTE and flush the hardware cache.
+ *
+ * For level triggered, we eliminate the io-apic RTE modification (with the
+ * updated vector information), by using a virtual vector (io-apic pin number).
+ * Real vector that is used for interrupting cpu will be coming from
+ * the interrupt-remapping table entry.
+ *
+ * As the migration is a simple atomic update of IRTE, the same mechanism
+ * is used to migrate MSI irq's in the presence of interrupt-remapping.
+ */
+static int
+intel_ir_set_affinity(struct irq_data *data, const struct cpumask *mask,
+		      bool force)
+{
+	struct irq_data *parent = data->parent_data;
+	struct irq_cfg *cfg = irqd_cfg(data);
+	int ret;
+
+	ret = parent->chip->irq_set_affinity(parent, mask, force);
+	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+		return ret;
+
+	intel_ir_reconfigure_irte(data, false);
+	/*
+	 * After this point, all the interrupts will start arriving
+	 * at the new destination. So, time to cleanup the previous
+	 * vector allocation.
+	 */
+	send_cleanup_vector(cfg);
+
+	return IRQ_SET_MASK_OK_DONE;
+}
+
+static void intel_ir_compose_msi_msg(struct irq_data *irq_data,
+				     struct msi_msg *msg)
+{
+	struct intel_ir_data *ir_data = irq_data->chip_data;
+
+	*msg = ir_data->msi_entry;
+}
+
+static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info)
+{
+	struct intel_ir_data *ir_data = data->chip_data;
+	struct vcpu_data *vcpu_pi_info = info;
+
+	/* stop posting interrupts, back to remapping mode */
+	if (!vcpu_pi_info) {
+		modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry);
+	} else {
+		struct irte irte_pi;
+
+		/*
+		 * We are not caching the posted interrupt entry. We
+		 * copy the data from the remapped entry and modify
+		 * the fields which are relevant for posted mode. The
+		 * cached remapped entry is used for switching back to
+		 * remapped mode.
+		 */
+		memset(&irte_pi, 0, sizeof(irte_pi));
+		dmar_copy_shared_irte(&irte_pi, &ir_data->irte_entry);
+
+		/* Update the posted mode fields */
+		irte_pi.p_pst = 1;
+		irte_pi.p_urgent = 0;
+		irte_pi.p_vector = vcpu_pi_info->vector;
+		irte_pi.pda_l = (vcpu_pi_info->pi_desc_addr >>
+				(32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT);
+		irte_pi.pda_h = (vcpu_pi_info->pi_desc_addr >> 32) &
+				~(-1UL << PDA_HIGH_BIT);
+
+		modify_irte(&ir_data->irq_2_iommu, &irte_pi);
+	}
+
+	return 0;
+}
+
+static struct irq_chip intel_ir_chip = {
+	.name			= "INTEL-IR",
+	.irq_ack		= apic_ack_irq,
+	.irq_set_affinity	= intel_ir_set_affinity,
+	.irq_compose_msi_msg	= intel_ir_compose_msi_msg,
+	.irq_set_vcpu_affinity	= intel_ir_set_vcpu_affinity,
+};
+
+static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
+					     struct irq_cfg *irq_cfg,
+					     struct irq_alloc_info *info,
+					     int index, int sub_handle)
+{
+	struct IR_IO_APIC_route_entry *entry;
+	struct irte *irte = &data->irte_entry;
+	struct msi_msg *msg = &data->msi_entry;
+
+	prepare_irte(irte, irq_cfg->vector, irq_cfg->dest_apicid);
+	switch (info->type) {
+	case X86_IRQ_ALLOC_TYPE_IOAPIC:
+		/* Set source-id of interrupt request */
+		set_ioapic_sid(irte, info->ioapic_id);
+		apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n",
+			info->ioapic_id, irte->present, irte->fpd,
+			irte->dst_mode, irte->redir_hint,
+			irte->trigger_mode, irte->dlvry_mode,
+			irte->avail, irte->vector, irte->dest_id,
+			irte->sid, irte->sq, irte->svt);
+
+		entry = (struct IR_IO_APIC_route_entry *)info->ioapic_entry;
+		info->ioapic_entry = NULL;
+		memset(entry, 0, sizeof(*entry));
+		entry->index2	= (index >> 15) & 0x1;
+		entry->zero	= 0;
+		entry->format	= 1;
+		entry->index	= (index & 0x7fff);
+		/*
+		 * IO-APIC RTE will be configured with virtual vector.
+		 * irq handler will do the explicit EOI to the io-apic.
+		 */
+		entry->vector	= info->ioapic_pin;
+		entry->mask	= 0;			/* enable IRQ */
+		entry->trigger	= info->ioapic_trigger;
+		entry->polarity	= info->ioapic_polarity;
+		if (info->ioapic_trigger)
+			entry->mask = 1; /* Mask level triggered irqs. */
+		break;
+
+	case X86_IRQ_ALLOC_TYPE_HPET:
+	case X86_IRQ_ALLOC_TYPE_MSI:
+	case X86_IRQ_ALLOC_TYPE_MSIX:
+		if (info->type == X86_IRQ_ALLOC_TYPE_HPET)
+			set_hpet_sid(irte, info->hpet_id);
+		else
+			set_msi_sid(irte, info->msi_dev);
+
+		msg->address_hi = MSI_ADDR_BASE_HI;
+		msg->data = sub_handle;
+		msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
+				  MSI_ADDR_IR_SHV |
+				  MSI_ADDR_IR_INDEX1(index) |
+				  MSI_ADDR_IR_INDEX2(index);
+		break;
+
+	default:
+		BUG_ON(1);
+		break;
+	}
+}
+
+static void intel_free_irq_resources(struct irq_domain *domain,
+				     unsigned int virq, unsigned int nr_irqs)
+{
+	struct irq_data *irq_data;
+	struct intel_ir_data *data;
+	struct irq_2_iommu *irq_iommu;
+	unsigned long flags;
+	int i;
+	for (i = 0; i < nr_irqs; i++) {
+		irq_data = irq_domain_get_irq_data(domain, virq  + i);
+		if (irq_data && irq_data->chip_data) {
+			data = irq_data->chip_data;
+			irq_iommu = &data->irq_2_iommu;
+			raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+			clear_entries(irq_iommu);
+			raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+			irq_domain_reset_irq_data(irq_data);
+			kfree(data);
+		}
+	}
+}
+
+static int intel_irq_remapping_alloc(struct irq_domain *domain,
+				     unsigned int virq, unsigned int nr_irqs,
+				     void *arg)
+{
+	struct intel_iommu *iommu = domain->host_data;
+	struct irq_alloc_info *info = arg;
+	struct intel_ir_data *data, *ird;
+	struct irq_data *irq_data;
+	struct irq_cfg *irq_cfg;
+	int i, ret, index;
+
+	if (!info || !iommu)
+		return -EINVAL;
+	if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_MSI &&
+	    info->type != X86_IRQ_ALLOC_TYPE_MSIX)
+		return -EINVAL;
+
+	/*
+	 * With IRQ remapping enabled, don't need contiguous CPU vectors
+	 * to support multiple MSI interrupts.
+	 */
+	if (info->type == X86_IRQ_ALLOC_TYPE_MSI)
+		info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
+
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+	if (ret < 0)
+		return ret;
+
+	ret = -ENOMEM;
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		goto out_free_parent;
+
+	down_read(&dmar_global_lock);
+	index = alloc_irte(iommu, virq, &data->irq_2_iommu, nr_irqs);
+	up_read(&dmar_global_lock);
+	if (index < 0) {
+		pr_warn("Failed to allocate IRTE\n");
+		kfree(data);
+		goto out_free_parent;
+	}
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_data = irq_domain_get_irq_data(domain, virq + i);
+		irq_cfg = irqd_cfg(irq_data);
+		if (!irq_data || !irq_cfg) {
+			if (!i)
+				kfree(data);
+			ret = -EINVAL;
+			goto out_free_data;
+		}
+
+		if (i > 0) {
+			ird = kzalloc(sizeof(*ird), GFP_KERNEL);
+			if (!ird)
+				goto out_free_data;
+			/* Initialize the common data */
+			ird->irq_2_iommu = data->irq_2_iommu;
+			ird->irq_2_iommu.sub_handle = i;
+		} else {
+			ird = data;
+		}
+
+		irq_data->hwirq = (index << 16) + i;
+		irq_data->chip_data = ird;
+		irq_data->chip = &intel_ir_chip;
+		intel_irq_remapping_prepare_irte(ird, irq_cfg, info, index, i);
+		irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
+	}
+	return 0;
+
+out_free_data:
+	intel_free_irq_resources(domain, virq, i);
+out_free_parent:
+	irq_domain_free_irqs_common(domain, virq, nr_irqs);
+	return ret;
+}
+
+static void intel_irq_remapping_free(struct irq_domain *domain,
+				     unsigned int virq, unsigned int nr_irqs)
+{
+	intel_free_irq_resources(domain, virq, nr_irqs);
+	irq_domain_free_irqs_common(domain, virq, nr_irqs);
+}
+
+static int intel_irq_remapping_activate(struct irq_domain *domain,
+					struct irq_data *irq_data, bool reserve)
+{
+	intel_ir_reconfigure_irte(irq_data, true);
+	return 0;
+}
+
+static void intel_irq_remapping_deactivate(struct irq_domain *domain,
+					   struct irq_data *irq_data)
+{
+	struct intel_ir_data *data = irq_data->chip_data;
+	struct irte entry;
+
+	memset(&entry, 0, sizeof(entry));
+	modify_irte(&data->irq_2_iommu, &entry);
+}
+
+static const struct irq_domain_ops intel_ir_domain_ops = {
+	.alloc = intel_irq_remapping_alloc,
+	.free = intel_irq_remapping_free,
+	.activate = intel_irq_remapping_activate,
+	.deactivate = intel_irq_remapping_deactivate,
+};
+
+/*
+ * Support of Interrupt Remapping Unit Hotplug
+ */
+static int dmar_ir_add(struct dmar_drhd_unit *dmaru, struct intel_iommu *iommu)
+{
+	int ret;
+	int eim = x2apic_enabled();
+
+	if (eim && !ecap_eim_support(iommu->ecap)) {
+		pr_info("DRHD %Lx: EIM not supported by DRHD, ecap %Lx\n",
+			iommu->reg_phys, iommu->ecap);
+		return -ENODEV;
+	}
+
+	if (ir_parse_ioapic_hpet_scope(dmaru->hdr, iommu)) {
+		pr_warn("DRHD %Lx: failed to parse managed IOAPIC/HPET\n",
+			iommu->reg_phys);
+		return -ENODEV;
+	}
+
+	/* TODO: check all IOAPICs are covered by IOMMU */
+
+	/* Setup Interrupt-remapping now. */
+	ret = intel_setup_irq_remapping(iommu);
+	if (ret) {
+		pr_err("Failed to setup irq remapping for %s\n",
+		       iommu->name);
+		intel_teardown_irq_remapping(iommu);
+		ir_remove_ioapic_hpet_scope(iommu);
+	} else {
+		iommu_enable_irq_remapping(iommu);
+	}
+
+	return ret;
+}
+
+int dmar_ir_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
+{
+	int ret = 0;
+	struct intel_iommu *iommu = dmaru->iommu;
+
+	if (!irq_remapping_enabled)
+		return 0;
+	if (iommu == NULL)
+		return -EINVAL;
+	if (!ecap_ir_support(iommu->ecap))
+		return 0;
+	if (irq_remapping_cap(IRQ_POSTING_CAP) &&
+	    !cap_pi_support(iommu->cap))
+		return -EBUSY;
+
+	if (insert) {
+		if (!iommu->ir_table)
+			ret = dmar_ir_add(dmaru, iommu);
+	} else {
+		if (iommu->ir_table) {
+			if (!bitmap_empty(iommu->ir_table->bitmap,
+					  INTR_REMAP_TABLE_ENTRIES)) {
+				ret = -EBUSY;
+			} else {
+				iommu_disable_irq_remapping(iommu);
+				intel_teardown_irq_remapping(iommu);
+				ir_remove_ioapic_hpet_scope(iommu);
+			}
+		}
+	}
+
+	return ret;
+}
diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c
new file mode 100644
index 000000000..48d4709a8
--- /dev/null
+++ b/drivers/iommu/io-pgtable-arm-v7s.c
@@ -0,0 +1,945 @@
+/*
+ * CPU-agnostic ARM page table allocator.
+ *
+ * ARMv7 Short-descriptor format, supporting
+ * - Basic memory attributes
+ * - Simplified access permissions (AP[2:1] model)
+ * - Backwards-compatible TEX remap
+ * - Large pages/supersections (if indicated by the caller)
+ *
+ * Not supporting:
+ * - Legacy access permissions (AP[2:0] model)
+ *
+ * Almost certainly never supporting:
+ * - PXN
+ * - Domains
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright (C) 2014-2015 ARM Limited
+ * Copyright (c) 2014-2015 MediaTek Inc.
+ */
+
+#define pr_fmt(fmt)	"arm-v7s io-pgtable: " fmt
+
+#include <linux/atomic.h>
+#include <linux/dma-mapping.h>
+#include <linux/gfp.h>
+#include <linux/iommu.h>
+#include <linux/kernel.h>
+#include <linux/kmemleak.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include <asm/barrier.h>
+
+#include "io-pgtable.h"
+
+/* Struct accessors */
+#define io_pgtable_to_data(x)						\
+	container_of((x), struct arm_v7s_io_pgtable, iop)
+
+#define io_pgtable_ops_to_data(x)					\
+	io_pgtable_to_data(io_pgtable_ops_to_pgtable(x))
+
+/*
+ * We have 32 bits total; 12 bits resolved at level 1, 8 bits at level 2,
+ * and 12 bits in a page. With some carefully-chosen coefficients we can
+ * hide the ugly inconsistencies behind these macros and at least let the
+ * rest of the code pretend to be somewhat sane.
+ */
+#define ARM_V7S_ADDR_BITS		32
+#define _ARM_V7S_LVL_BITS(lvl)		(16 - (lvl) * 4)
+#define ARM_V7S_LVL_SHIFT(lvl)		(ARM_V7S_ADDR_BITS - (4 + 8 * (lvl)))
+#define ARM_V7S_TABLE_SHIFT		10
+
+#define ARM_V7S_PTES_PER_LVL(lvl)	(1 << _ARM_V7S_LVL_BITS(lvl))
+#define ARM_V7S_TABLE_SIZE(lvl)						\
+	(ARM_V7S_PTES_PER_LVL(lvl) * sizeof(arm_v7s_iopte))
+
+#define ARM_V7S_BLOCK_SIZE(lvl)		(1UL << ARM_V7S_LVL_SHIFT(lvl))
+#define ARM_V7S_LVL_MASK(lvl)		((u32)(~0U << ARM_V7S_LVL_SHIFT(lvl)))
+#define ARM_V7S_TABLE_MASK		((u32)(~0U << ARM_V7S_TABLE_SHIFT))
+#define _ARM_V7S_IDX_MASK(lvl)		(ARM_V7S_PTES_PER_LVL(lvl) - 1)
+#define ARM_V7S_LVL_IDX(addr, lvl)	({				\
+	int _l = lvl;							\
+	((u32)(addr) >> ARM_V7S_LVL_SHIFT(_l)) & _ARM_V7S_IDX_MASK(_l); \
+})
+
+/*
+ * Large page/supersection entries are effectively a block of 16 page/section
+ * entries, along the lines of the LPAE contiguous hint, but all with the
+ * same output address. For want of a better common name we'll call them
+ * "contiguous" versions of their respective page/section entries here, but
+ * noting the distinction (WRT to TLB maintenance) that they represent *one*
+ * entry repeated 16 times, not 16 separate entries (as in the LPAE case).
+ */
+#define ARM_V7S_CONT_PAGES		16
+
+/* PTE type bits: these are all mixed up with XN/PXN bits in most cases */
+#define ARM_V7S_PTE_TYPE_TABLE		0x1
+#define ARM_V7S_PTE_TYPE_PAGE		0x2
+#define ARM_V7S_PTE_TYPE_CONT_PAGE	0x1
+
+#define ARM_V7S_PTE_IS_VALID(pte)	(((pte) & 0x3) != 0)
+#define ARM_V7S_PTE_IS_TABLE(pte, lvl) \
+	((lvl) == 1 && (((pte) & 0x3) == ARM_V7S_PTE_TYPE_TABLE))
+
+/* Page table bits */
+#define ARM_V7S_ATTR_XN(lvl)		BIT(4 * (2 - (lvl)))
+#define ARM_V7S_ATTR_B			BIT(2)
+#define ARM_V7S_ATTR_C			BIT(3)
+#define ARM_V7S_ATTR_NS_TABLE		BIT(3)
+#define ARM_V7S_ATTR_NS_SECTION		BIT(19)
+
+#define ARM_V7S_CONT_SECTION		BIT(18)
+#define ARM_V7S_CONT_PAGE_XN_SHIFT	15
+
+/*
+ * The attribute bits are consistently ordered*, but occupy bits [17:10] of
+ * a level 1 PTE vs. bits [11:4] at level 2. Thus we define the individual
+ * fields relative to that 8-bit block, plus a total shift relative to the PTE.
+ */
+#define ARM_V7S_ATTR_SHIFT(lvl)		(16 - (lvl) * 6)
+
+#define ARM_V7S_ATTR_MASK		0xff
+#define ARM_V7S_ATTR_AP0		BIT(0)
+#define ARM_V7S_ATTR_AP1		BIT(1)
+#define ARM_V7S_ATTR_AP2		BIT(5)
+#define ARM_V7S_ATTR_S			BIT(6)
+#define ARM_V7S_ATTR_NG			BIT(7)
+#define ARM_V7S_TEX_SHIFT		2
+#define ARM_V7S_TEX_MASK		0x7
+#define ARM_V7S_ATTR_TEX(val)		(((val) & ARM_V7S_TEX_MASK) << ARM_V7S_TEX_SHIFT)
+
+#define ARM_V7S_ATTR_MTK_4GB		BIT(9) /* MTK extend it for 4GB mode */
+
+/* *well, except for TEX on level 2 large pages, of course :( */
+#define ARM_V7S_CONT_PAGE_TEX_SHIFT	6
+#define ARM_V7S_CONT_PAGE_TEX_MASK	(ARM_V7S_TEX_MASK << ARM_V7S_CONT_PAGE_TEX_SHIFT)
+
+/* Simplified access permissions */
+#define ARM_V7S_PTE_AF			ARM_V7S_ATTR_AP0
+#define ARM_V7S_PTE_AP_UNPRIV		ARM_V7S_ATTR_AP1
+#define ARM_V7S_PTE_AP_RDONLY		ARM_V7S_ATTR_AP2
+
+/* Register bits */
+#define ARM_V7S_RGN_NC			0
+#define ARM_V7S_RGN_WBWA		1
+#define ARM_V7S_RGN_WT			2
+#define ARM_V7S_RGN_WB			3
+
+#define ARM_V7S_PRRR_TYPE_DEVICE	1
+#define ARM_V7S_PRRR_TYPE_NORMAL	2
+#define ARM_V7S_PRRR_TR(n, type)	(((type) & 0x3) << ((n) * 2))
+#define ARM_V7S_PRRR_DS0		BIT(16)
+#define ARM_V7S_PRRR_DS1		BIT(17)
+#define ARM_V7S_PRRR_NS0		BIT(18)
+#define ARM_V7S_PRRR_NS1		BIT(19)
+#define ARM_V7S_PRRR_NOS(n)		BIT((n) + 24)
+
+#define ARM_V7S_NMRR_IR(n, attr)	(((attr) & 0x3) << ((n) * 2))
+#define ARM_V7S_NMRR_OR(n, attr)	(((attr) & 0x3) << ((n) * 2 + 16))
+
+#define ARM_V7S_TTBR_S			BIT(1)
+#define ARM_V7S_TTBR_NOS		BIT(5)
+#define ARM_V7S_TTBR_ORGN_ATTR(attr)	(((attr) & 0x3) << 3)
+#define ARM_V7S_TTBR_IRGN_ATTR(attr)					\
+	((((attr) & 0x1) << 6) | (((attr) & 0x2) >> 1))
+
+#define ARM_V7S_TCR_PD1			BIT(5)
+
+#ifdef CONFIG_ZONE_DMA32
+#define ARM_V7S_TABLE_GFP_DMA GFP_DMA32
+#define ARM_V7S_TABLE_SLAB_FLAGS SLAB_CACHE_DMA32
+#else
+#define ARM_V7S_TABLE_GFP_DMA GFP_DMA
+#define ARM_V7S_TABLE_SLAB_FLAGS SLAB_CACHE_DMA
+#endif
+
+typedef u32 arm_v7s_iopte;
+
+static bool selftest_running;
+
+struct arm_v7s_io_pgtable {
+	struct io_pgtable	iop;
+
+	arm_v7s_iopte		*pgd;
+	struct kmem_cache	*l2_tables;
+	spinlock_t		split_lock;
+};
+
+static dma_addr_t __arm_v7s_dma_addr(void *pages)
+{
+	return (dma_addr_t)virt_to_phys(pages);
+}
+
+static arm_v7s_iopte *iopte_deref(arm_v7s_iopte pte, int lvl)
+{
+	if (ARM_V7S_PTE_IS_TABLE(pte, lvl))
+		pte &= ARM_V7S_TABLE_MASK;
+	else
+		pte &= ARM_V7S_LVL_MASK(lvl);
+	return phys_to_virt(pte);
+}
+
+static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp,
+				   struct arm_v7s_io_pgtable *data)
+{
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
+	struct device *dev = cfg->iommu_dev;
+	phys_addr_t phys;
+	dma_addr_t dma;
+	size_t size = ARM_V7S_TABLE_SIZE(lvl);
+	void *table = NULL;
+
+	if (lvl == 1)
+		table = (void *)__get_free_pages(
+			__GFP_ZERO | ARM_V7S_TABLE_GFP_DMA, get_order(size));
+	else if (lvl == 2)
+		table = kmem_cache_zalloc(data->l2_tables, gfp);
+	phys = virt_to_phys(table);
+	if (phys != (arm_v7s_iopte)phys) {
+		/* Doesn't fit in PTE */
+		dev_err(dev, "Page table does not fit in PTE: %pa", &phys);
+		goto out_free;
+	}
+	if (table && !(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)) {
+		dma = dma_map_single(dev, table, size, DMA_TO_DEVICE);
+		if (dma_mapping_error(dev, dma))
+			goto out_free;
+		/*
+		 * We depend on the IOMMU being able to work with any physical
+		 * address directly, so if the DMA layer suggests otherwise by
+		 * translating or truncating them, that bodes very badly...
+		 */
+		if (dma != phys)
+			goto out_unmap;
+	}
+	if (lvl == 2)
+		kmemleak_ignore(table);
+	return table;
+
+out_unmap:
+	dev_err(dev, "Cannot accommodate DMA translation for IOMMU page tables\n");
+	dma_unmap_single(dev, dma, size, DMA_TO_DEVICE);
+out_free:
+	if (lvl == 1)
+		free_pages((unsigned long)table, get_order(size));
+	else
+		kmem_cache_free(data->l2_tables, table);
+	return NULL;
+}
+
+static void __arm_v7s_free_table(void *table, int lvl,
+				 struct arm_v7s_io_pgtable *data)
+{
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
+	struct device *dev = cfg->iommu_dev;
+	size_t size = ARM_V7S_TABLE_SIZE(lvl);
+
+	if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA))
+		dma_unmap_single(dev, __arm_v7s_dma_addr(table), size,
+				 DMA_TO_DEVICE);
+	if (lvl == 1)
+		free_pages((unsigned long)table, get_order(size));
+	else
+		kmem_cache_free(data->l2_tables, table);
+}
+
+static void __arm_v7s_pte_sync(arm_v7s_iopte *ptep, int num_entries,
+			       struct io_pgtable_cfg *cfg)
+{
+	if (cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)
+		return;
+
+	dma_sync_single_for_device(cfg->iommu_dev, __arm_v7s_dma_addr(ptep),
+				   num_entries * sizeof(*ptep), DMA_TO_DEVICE);
+}
+static void __arm_v7s_set_pte(arm_v7s_iopte *ptep, arm_v7s_iopte pte,
+			      int num_entries, struct io_pgtable_cfg *cfg)
+{
+	int i;
+
+	for (i = 0; i < num_entries; i++)
+		ptep[i] = pte;
+
+	__arm_v7s_pte_sync(ptep, num_entries, cfg);
+}
+
+static arm_v7s_iopte arm_v7s_prot_to_pte(int prot, int lvl,
+					 struct io_pgtable_cfg *cfg)
+{
+	bool ap = !(cfg->quirks & IO_PGTABLE_QUIRK_NO_PERMS);
+	arm_v7s_iopte pte = ARM_V7S_ATTR_NG | ARM_V7S_ATTR_S;
+
+	if (!(prot & IOMMU_MMIO))
+		pte |= ARM_V7S_ATTR_TEX(1);
+	if (ap) {
+		pte |= ARM_V7S_PTE_AF;
+		if (!(prot & IOMMU_PRIV))
+			pte |= ARM_V7S_PTE_AP_UNPRIV;
+		if (!(prot & IOMMU_WRITE))
+			pte |= ARM_V7S_PTE_AP_RDONLY;
+	}
+	pte <<= ARM_V7S_ATTR_SHIFT(lvl);
+
+	if ((prot & IOMMU_NOEXEC) && ap)
+		pte |= ARM_V7S_ATTR_XN(lvl);
+	if (prot & IOMMU_MMIO)
+		pte |= ARM_V7S_ATTR_B;
+	else if (prot & IOMMU_CACHE)
+		pte |= ARM_V7S_ATTR_B | ARM_V7S_ATTR_C;
+
+	pte |= ARM_V7S_PTE_TYPE_PAGE;
+	if (lvl == 1 && (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS))
+		pte |= ARM_V7S_ATTR_NS_SECTION;
+
+	if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_MTK_4GB)
+		pte |= ARM_V7S_ATTR_MTK_4GB;
+
+	return pte;
+}
+
+static int arm_v7s_pte_to_prot(arm_v7s_iopte pte, int lvl)
+{
+	int prot = IOMMU_READ;
+	arm_v7s_iopte attr = pte >> ARM_V7S_ATTR_SHIFT(lvl);
+
+	if (!(attr & ARM_V7S_PTE_AP_RDONLY))
+		prot |= IOMMU_WRITE;
+	if (!(attr & ARM_V7S_PTE_AP_UNPRIV))
+		prot |= IOMMU_PRIV;
+	if ((attr & (ARM_V7S_TEX_MASK << ARM_V7S_TEX_SHIFT)) == 0)
+		prot |= IOMMU_MMIO;
+	else if (pte & ARM_V7S_ATTR_C)
+		prot |= IOMMU_CACHE;
+	if (pte & ARM_V7S_ATTR_XN(lvl))
+		prot |= IOMMU_NOEXEC;
+
+	return prot;
+}
+
+static arm_v7s_iopte arm_v7s_pte_to_cont(arm_v7s_iopte pte, int lvl)
+{
+	if (lvl == 1) {
+		pte |= ARM_V7S_CONT_SECTION;
+	} else if (lvl == 2) {
+		arm_v7s_iopte xn = pte & ARM_V7S_ATTR_XN(lvl);
+		arm_v7s_iopte tex = pte & ARM_V7S_CONT_PAGE_TEX_MASK;
+
+		pte ^= xn | tex | ARM_V7S_PTE_TYPE_PAGE;
+		pte |= (xn << ARM_V7S_CONT_PAGE_XN_SHIFT) |
+		       (tex << ARM_V7S_CONT_PAGE_TEX_SHIFT) |
+		       ARM_V7S_PTE_TYPE_CONT_PAGE;
+	}
+	return pte;
+}
+
+static arm_v7s_iopte arm_v7s_cont_to_pte(arm_v7s_iopte pte, int lvl)
+{
+	if (lvl == 1) {
+		pte &= ~ARM_V7S_CONT_SECTION;
+	} else if (lvl == 2) {
+		arm_v7s_iopte xn = pte & BIT(ARM_V7S_CONT_PAGE_XN_SHIFT);
+		arm_v7s_iopte tex = pte & (ARM_V7S_CONT_PAGE_TEX_MASK <<
+					   ARM_V7S_CONT_PAGE_TEX_SHIFT);
+
+		pte ^= xn | tex | ARM_V7S_PTE_TYPE_CONT_PAGE;
+		pte |= (xn >> ARM_V7S_CONT_PAGE_XN_SHIFT) |
+		       (tex >> ARM_V7S_CONT_PAGE_TEX_SHIFT) |
+		       ARM_V7S_PTE_TYPE_PAGE;
+	}
+	return pte;
+}
+
+static bool arm_v7s_pte_is_cont(arm_v7s_iopte pte, int lvl)
+{
+	if (lvl == 1 && !ARM_V7S_PTE_IS_TABLE(pte, lvl))
+		return pte & ARM_V7S_CONT_SECTION;
+	else if (lvl == 2)
+		return !(pte & ARM_V7S_PTE_TYPE_PAGE);
+	return false;
+}
+
+static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *, unsigned long,
+			      size_t, int, arm_v7s_iopte *);
+
+static int arm_v7s_init_pte(struct arm_v7s_io_pgtable *data,
+			    unsigned long iova, phys_addr_t paddr, int prot,
+			    int lvl, int num_entries, arm_v7s_iopte *ptep)
+{
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
+	arm_v7s_iopte pte;
+	int i;
+
+	for (i = 0; i < num_entries; i++)
+		if (ARM_V7S_PTE_IS_TABLE(ptep[i], lvl)) {
+			/*
+			 * We need to unmap and free the old table before
+			 * overwriting it with a block entry.
+			 */
+			arm_v7s_iopte *tblp;
+			size_t sz = ARM_V7S_BLOCK_SIZE(lvl);
+
+			tblp = ptep - ARM_V7S_LVL_IDX(iova, lvl);
+			if (WARN_ON(__arm_v7s_unmap(data, iova + i * sz,
+						    sz, lvl, tblp) != sz))
+				return -EINVAL;
+		} else if (ptep[i]) {
+			/* We require an unmap first */
+			WARN_ON(!selftest_running);
+			return -EEXIST;
+		}
+
+	pte = arm_v7s_prot_to_pte(prot, lvl, cfg);
+	if (num_entries > 1)
+		pte = arm_v7s_pte_to_cont(pte, lvl);
+
+	pte |= paddr & ARM_V7S_LVL_MASK(lvl);
+
+	__arm_v7s_set_pte(ptep, pte, num_entries, cfg);
+	return 0;
+}
+
+static arm_v7s_iopte arm_v7s_install_table(arm_v7s_iopte *table,
+					   arm_v7s_iopte *ptep,
+					   arm_v7s_iopte curr,
+					   struct io_pgtable_cfg *cfg)
+{
+	arm_v7s_iopte old, new;
+
+	new = virt_to_phys(table) | ARM_V7S_PTE_TYPE_TABLE;
+	if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)
+		new |= ARM_V7S_ATTR_NS_TABLE;
+
+	/*
+	 * Ensure the table itself is visible before its PTE can be.
+	 * Whilst we could get away with cmpxchg64_release below, this
+	 * doesn't have any ordering semantics when !CONFIG_SMP.
+	 */
+	dma_wmb();
+
+	old = cmpxchg_relaxed(ptep, curr, new);
+	__arm_v7s_pte_sync(ptep, 1, cfg);
+
+	return old;
+}
+
+static int __arm_v7s_map(struct arm_v7s_io_pgtable *data, unsigned long iova,
+			 phys_addr_t paddr, size_t size, int prot,
+			 int lvl, arm_v7s_iopte *ptep)
+{
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
+	arm_v7s_iopte pte, *cptep;
+	int num_entries = size >> ARM_V7S_LVL_SHIFT(lvl);
+
+	/* Find our entry at the current level */
+	ptep += ARM_V7S_LVL_IDX(iova, lvl);
+
+	/* If we can install a leaf entry at this level, then do so */
+	if (num_entries)
+		return arm_v7s_init_pte(data, iova, paddr, prot,
+					lvl, num_entries, ptep);
+
+	/* We can't allocate tables at the final level */
+	if (WARN_ON(lvl == 2))
+		return -EINVAL;
+
+	/* Grab a pointer to the next level */
+	pte = READ_ONCE(*ptep);
+	if (!pte) {
+		cptep = __arm_v7s_alloc_table(lvl + 1, GFP_ATOMIC, data);
+		if (!cptep)
+			return -ENOMEM;
+
+		pte = arm_v7s_install_table(cptep, ptep, 0, cfg);
+		if (pte)
+			__arm_v7s_free_table(cptep, lvl + 1, data);
+	} else {
+		/* We've no easy way of knowing if it's synced yet, so... */
+		__arm_v7s_pte_sync(ptep, 1, cfg);
+	}
+
+	if (ARM_V7S_PTE_IS_TABLE(pte, lvl)) {
+		cptep = iopte_deref(pte, lvl);
+	} else if (pte) {
+		/* We require an unmap first */
+		WARN_ON(!selftest_running);
+		return -EEXIST;
+	}
+
+	/* Rinse, repeat */
+	return __arm_v7s_map(data, iova, paddr, size, prot, lvl + 1, cptep);
+}
+
+static int arm_v7s_map(struct io_pgtable_ops *ops, unsigned long iova,
+			phys_addr_t paddr, size_t size, int prot)
+{
+	struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops);
+	struct io_pgtable *iop = &data->iop;
+	int ret;
+
+	/* If no access, then nothing to do */
+	if (!(prot & (IOMMU_READ | IOMMU_WRITE)))
+		return 0;
+
+	if (WARN_ON(upper_32_bits(iova) || upper_32_bits(paddr)))
+		return -ERANGE;
+
+	ret = __arm_v7s_map(data, iova, paddr, size, prot, 1, data->pgd);
+	/*
+	 * Synchronise all PTE updates for the new mapping before there's
+	 * a chance for anything to kick off a table walk for the new iova.
+	 */
+	if (iop->cfg.quirks & IO_PGTABLE_QUIRK_TLBI_ON_MAP) {
+		io_pgtable_tlb_add_flush(iop, iova, size,
+					 ARM_V7S_BLOCK_SIZE(2), false);
+		io_pgtable_tlb_sync(iop);
+	} else {
+		wmb();
+	}
+
+	return ret;
+}
+
+static void arm_v7s_free_pgtable(struct io_pgtable *iop)
+{
+	struct arm_v7s_io_pgtable *data = io_pgtable_to_data(iop);
+	int i;
+
+	for (i = 0; i < ARM_V7S_PTES_PER_LVL(1); i++) {
+		arm_v7s_iopte pte = data->pgd[i];
+
+		if (ARM_V7S_PTE_IS_TABLE(pte, 1))
+			__arm_v7s_free_table(iopte_deref(pte, 1), 2, data);
+	}
+	__arm_v7s_free_table(data->pgd, 1, data);
+	kmem_cache_destroy(data->l2_tables);
+	kfree(data);
+}
+
+static arm_v7s_iopte arm_v7s_split_cont(struct arm_v7s_io_pgtable *data,
+					unsigned long iova, int idx, int lvl,
+					arm_v7s_iopte *ptep)
+{
+	struct io_pgtable *iop = &data->iop;
+	arm_v7s_iopte pte;
+	size_t size = ARM_V7S_BLOCK_SIZE(lvl);
+	int i;
+
+	/* Check that we didn't lose a race to get the lock */
+	pte = *ptep;
+	if (!arm_v7s_pte_is_cont(pte, lvl))
+		return pte;
+
+	ptep -= idx & (ARM_V7S_CONT_PAGES - 1);
+	pte = arm_v7s_cont_to_pte(pte, lvl);
+	for (i = 0; i < ARM_V7S_CONT_PAGES; i++)
+		ptep[i] = pte + i * size;
+
+	__arm_v7s_pte_sync(ptep, ARM_V7S_CONT_PAGES, &iop->cfg);
+
+	size *= ARM_V7S_CONT_PAGES;
+	io_pgtable_tlb_add_flush(iop, iova, size, size, true);
+	io_pgtable_tlb_sync(iop);
+	return pte;
+}
+
+static size_t arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data,
+				      unsigned long iova, size_t size,
+				      arm_v7s_iopte blk_pte,
+				      arm_v7s_iopte *ptep)
+{
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
+	arm_v7s_iopte pte, *tablep;
+	int i, unmap_idx, num_entries, num_ptes;
+
+	tablep = __arm_v7s_alloc_table(2, GFP_ATOMIC, data);
+	if (!tablep)
+		return 0; /* Bytes unmapped */
+
+	num_ptes = ARM_V7S_PTES_PER_LVL(2);
+	num_entries = size >> ARM_V7S_LVL_SHIFT(2);
+	unmap_idx = ARM_V7S_LVL_IDX(iova, 2);
+
+	pte = arm_v7s_prot_to_pte(arm_v7s_pte_to_prot(blk_pte, 1), 2, cfg);
+	if (num_entries > 1)
+		pte = arm_v7s_pte_to_cont(pte, 2);
+
+	for (i = 0; i < num_ptes; i += num_entries, pte += size) {
+		/* Unmap! */
+		if (i == unmap_idx)
+			continue;
+
+		__arm_v7s_set_pte(&tablep[i], pte, num_entries, cfg);
+	}
+
+	pte = arm_v7s_install_table(tablep, ptep, blk_pte, cfg);
+	if (pte != blk_pte) {
+		__arm_v7s_free_table(tablep, 2, data);
+
+		if (!ARM_V7S_PTE_IS_TABLE(pte, 1))
+			return 0;
+
+		tablep = iopte_deref(pte, 1);
+		return __arm_v7s_unmap(data, iova, size, 2, tablep);
+	}
+
+	io_pgtable_tlb_add_flush(&data->iop, iova, size, size, true);
+	return size;
+}
+
+static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data,
+			      unsigned long iova, size_t size, int lvl,
+			      arm_v7s_iopte *ptep)
+{
+	arm_v7s_iopte pte[ARM_V7S_CONT_PAGES];
+	struct io_pgtable *iop = &data->iop;
+	int idx, i = 0, num_entries = size >> ARM_V7S_LVL_SHIFT(lvl);
+
+	/* Something went horribly wrong and we ran out of page table */
+	if (WARN_ON(lvl > 2))
+		return 0;
+
+	idx = ARM_V7S_LVL_IDX(iova, lvl);
+	ptep += idx;
+	do {
+		pte[i] = READ_ONCE(ptep[i]);
+		if (WARN_ON(!ARM_V7S_PTE_IS_VALID(pte[i])))
+			return 0;
+	} while (++i < num_entries);
+
+	/*
+	 * If we've hit a contiguous 'large page' entry at this level, it
+	 * needs splitting first, unless we're unmapping the whole lot.
+	 *
+	 * For splitting, we can't rewrite 16 PTEs atomically, and since we
+	 * can't necessarily assume TEX remap we don't have a software bit to
+	 * mark live entries being split. In practice (i.e. DMA API code), we
+	 * will never be splitting large pages anyway, so just wrap this edge
+	 * case in a lock for the sake of correctness and be done with it.
+	 */
+	if (num_entries <= 1 && arm_v7s_pte_is_cont(pte[0], lvl)) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&data->split_lock, flags);
+		pte[0] = arm_v7s_split_cont(data, iova, idx, lvl, ptep);
+		spin_unlock_irqrestore(&data->split_lock, flags);
+	}
+
+	/* If the size matches this level, we're in the right place */
+	if (num_entries) {
+		size_t blk_size = ARM_V7S_BLOCK_SIZE(lvl);
+
+		__arm_v7s_set_pte(ptep, 0, num_entries, &iop->cfg);
+
+		for (i = 0; i < num_entries; i++) {
+			if (ARM_V7S_PTE_IS_TABLE(pte[i], lvl)) {
+				/* Also flush any partial walks */
+				io_pgtable_tlb_add_flush(iop, iova, blk_size,
+					ARM_V7S_BLOCK_SIZE(lvl + 1), false);
+				io_pgtable_tlb_sync(iop);
+				ptep = iopte_deref(pte[i], lvl);
+				__arm_v7s_free_table(ptep, lvl + 1, data);
+			} else {
+				io_pgtable_tlb_add_flush(iop, iova, blk_size,
+							 blk_size, true);
+			}
+			iova += blk_size;
+		}
+		return size;
+	} else if (lvl == 1 && !ARM_V7S_PTE_IS_TABLE(pte[0], lvl)) {
+		/*
+		 * Insert a table at the next level to map the old region,
+		 * minus the part we want to unmap
+		 */
+		return arm_v7s_split_blk_unmap(data, iova, size, pte[0], ptep);
+	}
+
+	/* Keep on walkin' */
+	ptep = iopte_deref(pte[0], lvl);
+	return __arm_v7s_unmap(data, iova, size, lvl + 1, ptep);
+}
+
+static size_t arm_v7s_unmap(struct io_pgtable_ops *ops, unsigned long iova,
+			    size_t size)
+{
+	struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops);
+
+	if (WARN_ON(upper_32_bits(iova)))
+		return 0;
+
+	return __arm_v7s_unmap(data, iova, size, 1, data->pgd);
+}
+
+static phys_addr_t arm_v7s_iova_to_phys(struct io_pgtable_ops *ops,
+					unsigned long iova)
+{
+	struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops);
+	arm_v7s_iopte *ptep = data->pgd, pte;
+	int lvl = 0;
+	u32 mask;
+
+	do {
+		ptep += ARM_V7S_LVL_IDX(iova, ++lvl);
+		pte = READ_ONCE(*ptep);
+		ptep = iopte_deref(pte, lvl);
+	} while (ARM_V7S_PTE_IS_TABLE(pte, lvl));
+
+	if (!ARM_V7S_PTE_IS_VALID(pte))
+		return 0;
+
+	mask = ARM_V7S_LVL_MASK(lvl);
+	if (arm_v7s_pte_is_cont(pte, lvl))
+		mask *= ARM_V7S_CONT_PAGES;
+	return (pte & mask) | (iova & ~mask);
+}
+
+static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg,
+						void *cookie)
+{
+	struct arm_v7s_io_pgtable *data;
+
+#ifdef PHYS_OFFSET
+	if (upper_32_bits(PHYS_OFFSET))
+		return NULL;
+#endif
+	if (cfg->ias > ARM_V7S_ADDR_BITS || cfg->oas > ARM_V7S_ADDR_BITS)
+		return NULL;
+
+	if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
+			    IO_PGTABLE_QUIRK_NO_PERMS |
+			    IO_PGTABLE_QUIRK_TLBI_ON_MAP |
+			    IO_PGTABLE_QUIRK_ARM_MTK_4GB |
+			    IO_PGTABLE_QUIRK_NO_DMA))
+		return NULL;
+
+	/* If ARM_MTK_4GB is enabled, the NO_PERMS is also expected. */
+	if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_MTK_4GB &&
+	    !(cfg->quirks & IO_PGTABLE_QUIRK_NO_PERMS))
+			return NULL;
+
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return NULL;
+
+	spin_lock_init(&data->split_lock);
+	data->l2_tables = kmem_cache_create("io-pgtable_armv7s_l2",
+					    ARM_V7S_TABLE_SIZE(2),
+					    ARM_V7S_TABLE_SIZE(2),
+					    ARM_V7S_TABLE_SLAB_FLAGS, NULL);
+	if (!data->l2_tables)
+		goto out_free_data;
+
+	data->iop.ops = (struct io_pgtable_ops) {
+		.map		= arm_v7s_map,
+		.unmap		= arm_v7s_unmap,
+		.iova_to_phys	= arm_v7s_iova_to_phys,
+	};
+
+	/* We have to do this early for __arm_v7s_alloc_table to work... */
+	data->iop.cfg = *cfg;
+
+	/*
+	 * Unless the IOMMU driver indicates supersection support by
+	 * having SZ_16M set in the initial bitmap, they won't be used.
+	 */
+	cfg->pgsize_bitmap &= SZ_4K | SZ_64K | SZ_1M | SZ_16M;
+
+	/* TCR: T0SZ=0, disable TTBR1 */
+	cfg->arm_v7s_cfg.tcr = ARM_V7S_TCR_PD1;
+
+	/*
+	 * TEX remap: the indices used map to the closest equivalent types
+	 * under the non-TEX-remap interpretation of those attribute bits,
+	 * excepting various implementation-defined aspects of shareability.
+	 */
+	cfg->arm_v7s_cfg.prrr = ARM_V7S_PRRR_TR(1, ARM_V7S_PRRR_TYPE_DEVICE) |
+				ARM_V7S_PRRR_TR(4, ARM_V7S_PRRR_TYPE_NORMAL) |
+				ARM_V7S_PRRR_TR(7, ARM_V7S_PRRR_TYPE_NORMAL) |
+				ARM_V7S_PRRR_DS0 | ARM_V7S_PRRR_DS1 |
+				ARM_V7S_PRRR_NS1 | ARM_V7S_PRRR_NOS(7);
+	cfg->arm_v7s_cfg.nmrr = ARM_V7S_NMRR_IR(7, ARM_V7S_RGN_WBWA) |
+				ARM_V7S_NMRR_OR(7, ARM_V7S_RGN_WBWA);
+
+	/* Looking good; allocate a pgd */
+	data->pgd = __arm_v7s_alloc_table(1, GFP_KERNEL, data);
+	if (!data->pgd)
+		goto out_free_data;
+
+	/* Ensure the empty pgd is visible before any actual TTBR write */
+	wmb();
+
+	/* TTBRs */
+	cfg->arm_v7s_cfg.ttbr[0] = virt_to_phys(data->pgd) |
+				   ARM_V7S_TTBR_S | ARM_V7S_TTBR_NOS |
+				   ARM_V7S_TTBR_IRGN_ATTR(ARM_V7S_RGN_WBWA) |
+				   ARM_V7S_TTBR_ORGN_ATTR(ARM_V7S_RGN_WBWA);
+	cfg->arm_v7s_cfg.ttbr[1] = 0;
+	return &data->iop;
+
+out_free_data:
+	kmem_cache_destroy(data->l2_tables);
+	kfree(data);
+	return NULL;
+}
+
+struct io_pgtable_init_fns io_pgtable_arm_v7s_init_fns = {
+	.alloc	= arm_v7s_alloc_pgtable,
+	.free	= arm_v7s_free_pgtable,
+};
+
+#ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S_SELFTEST
+
+static struct io_pgtable_cfg *cfg_cookie;
+
+static void dummy_tlb_flush_all(void *cookie)
+{
+	WARN_ON(cookie != cfg_cookie);
+}
+
+static void dummy_tlb_add_flush(unsigned long iova, size_t size,
+				size_t granule, bool leaf, void *cookie)
+{
+	WARN_ON(cookie != cfg_cookie);
+	WARN_ON(!(size & cfg_cookie->pgsize_bitmap));
+}
+
+static void dummy_tlb_sync(void *cookie)
+{
+	WARN_ON(cookie != cfg_cookie);
+}
+
+static const struct iommu_gather_ops dummy_tlb_ops = {
+	.tlb_flush_all	= dummy_tlb_flush_all,
+	.tlb_add_flush	= dummy_tlb_add_flush,
+	.tlb_sync	= dummy_tlb_sync,
+};
+
+#define __FAIL(ops)	({				\
+		WARN(1, "selftest: test failed\n");	\
+		selftest_running = false;		\
+		-EFAULT;				\
+})
+
+static int __init arm_v7s_do_selftests(void)
+{
+	struct io_pgtable_ops *ops;
+	struct io_pgtable_cfg cfg = {
+		.tlb = &dummy_tlb_ops,
+		.oas = 32,
+		.ias = 32,
+		.quirks = IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA,
+		.pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M,
+	};
+	unsigned int iova, size, iova_start;
+	unsigned int i, loopnr = 0;
+
+	selftest_running = true;
+
+	cfg_cookie = &cfg;
+
+	ops = alloc_io_pgtable_ops(ARM_V7S, &cfg, &cfg);
+	if (!ops) {
+		pr_err("selftest: failed to allocate io pgtable ops\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Initial sanity checks.
+	 * Empty page tables shouldn't provide any translations.
+	 */
+	if (ops->iova_to_phys(ops, 42))
+		return __FAIL(ops);
+
+	if (ops->iova_to_phys(ops, SZ_1G + 42))
+		return __FAIL(ops);
+
+	if (ops->iova_to_phys(ops, SZ_2G + 42))
+		return __FAIL(ops);
+
+	/*
+	 * Distinct mappings of different granule sizes.
+	 */
+	iova = 0;
+	for_each_set_bit(i, &cfg.pgsize_bitmap, BITS_PER_LONG) {
+		size = 1UL << i;
+		if (ops->map(ops, iova, iova, size, IOMMU_READ |
+						    IOMMU_WRITE |
+						    IOMMU_NOEXEC |
+						    IOMMU_CACHE))
+			return __FAIL(ops);
+
+		/* Overlapping mappings */
+		if (!ops->map(ops, iova, iova + size, size,
+			      IOMMU_READ | IOMMU_NOEXEC))
+			return __FAIL(ops);
+
+		if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+			return __FAIL(ops);
+
+		iova += SZ_16M;
+		loopnr++;
+	}
+
+	/* Partial unmap */
+	i = 1;
+	size = 1UL << __ffs(cfg.pgsize_bitmap);
+	while (i < loopnr) {
+		iova_start = i * SZ_16M;
+		if (ops->unmap(ops, iova_start + size, size) != size)
+			return __FAIL(ops);
+
+		/* Remap of partial unmap */
+		if (ops->map(ops, iova_start + size, size, size, IOMMU_READ))
+			return __FAIL(ops);
+
+		if (ops->iova_to_phys(ops, iova_start + size + 42)
+		    != (size + 42))
+			return __FAIL(ops);
+		i++;
+	}
+
+	/* Full unmap */
+	iova = 0;
+	for_each_set_bit(i, &cfg.pgsize_bitmap, BITS_PER_LONG) {
+		size = 1UL << i;
+
+		if (ops->unmap(ops, iova, size) != size)
+			return __FAIL(ops);
+
+		if (ops->iova_to_phys(ops, iova + 42))
+			return __FAIL(ops);
+
+		/* Remap full block */
+		if (ops->map(ops, iova, iova, size, IOMMU_WRITE))
+			return __FAIL(ops);
+
+		if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+			return __FAIL(ops);
+
+		iova += SZ_16M;
+	}
+
+	free_io_pgtable_ops(ops);
+
+	selftest_running = false;
+
+	pr_info("self test ok\n");
+	return 0;
+}
+subsys_initcall(arm_v7s_do_selftests);
+#endif
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
new file mode 100644
index 000000000..4bd2dd70a
--- /dev/null
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -0,0 +1,1191 @@
+/*
+ * CPU-agnostic ARM page table allocator.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright (C) 2014 ARM Limited
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ */
+
+#define pr_fmt(fmt)	"arm-lpae io-pgtable: " fmt
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/iommu.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/dma-mapping.h>
+
+#include <asm/barrier.h>
+
+#include "io-pgtable.h"
+
+#define ARM_LPAE_MAX_ADDR_BITS		52
+#define ARM_LPAE_S2_MAX_CONCAT_PAGES	16
+#define ARM_LPAE_MAX_LEVELS		4
+
+/* Struct accessors */
+#define io_pgtable_to_data(x)						\
+	container_of((x), struct arm_lpae_io_pgtable, iop)
+
+#define io_pgtable_ops_to_data(x)					\
+	io_pgtable_to_data(io_pgtable_ops_to_pgtable(x))
+
+/*
+ * For consistency with the architecture, we always consider
+ * ARM_LPAE_MAX_LEVELS levels, with the walk starting at level n >=0
+ */
+#define ARM_LPAE_START_LVL(d)		(ARM_LPAE_MAX_LEVELS - (d)->levels)
+
+/*
+ * Calculate the right shift amount to get to the portion describing level l
+ * in a virtual address mapped by the pagetable in d.
+ */
+#define ARM_LPAE_LVL_SHIFT(l,d)						\
+	((((d)->levels - ((l) - ARM_LPAE_START_LVL(d) + 1))		\
+	  * (d)->bits_per_level) + (d)->pg_shift)
+
+#define ARM_LPAE_GRANULE(d)		(1UL << (d)->pg_shift)
+
+#define ARM_LPAE_PAGES_PER_PGD(d)					\
+	DIV_ROUND_UP((d)->pgd_size, ARM_LPAE_GRANULE(d))
+
+/*
+ * Calculate the index at level l used to map virtual address a using the
+ * pagetable in d.
+ */
+#define ARM_LPAE_PGD_IDX(l,d)						\
+	((l) == ARM_LPAE_START_LVL(d) ? ilog2(ARM_LPAE_PAGES_PER_PGD(d)) : 0)
+
+#define ARM_LPAE_LVL_IDX(a,l,d)						\
+	(((u64)(a) >> ARM_LPAE_LVL_SHIFT(l,d)) &			\
+	 ((1 << ((d)->bits_per_level + ARM_LPAE_PGD_IDX(l,d))) - 1))
+
+/* Calculate the block/page mapping size at level l for pagetable in d. */
+#define ARM_LPAE_BLOCK_SIZE(l,d)					\
+	(1ULL << (ilog2(sizeof(arm_lpae_iopte)) +			\
+		((ARM_LPAE_MAX_LEVELS - (l)) * (d)->bits_per_level)))
+
+/* Page table bits */
+#define ARM_LPAE_PTE_TYPE_SHIFT		0
+#define ARM_LPAE_PTE_TYPE_MASK		0x3
+
+#define ARM_LPAE_PTE_TYPE_BLOCK		1
+#define ARM_LPAE_PTE_TYPE_TABLE		3
+#define ARM_LPAE_PTE_TYPE_PAGE		3
+
+#define ARM_LPAE_PTE_ADDR_MASK		GENMASK_ULL(47,12)
+
+#define ARM_LPAE_PTE_NSTABLE		(((arm_lpae_iopte)1) << 63)
+#define ARM_LPAE_PTE_XN			(((arm_lpae_iopte)3) << 53)
+#define ARM_LPAE_PTE_AF			(((arm_lpae_iopte)1) << 10)
+#define ARM_LPAE_PTE_SH_NS		(((arm_lpae_iopte)0) << 8)
+#define ARM_LPAE_PTE_SH_OS		(((arm_lpae_iopte)2) << 8)
+#define ARM_LPAE_PTE_SH_IS		(((arm_lpae_iopte)3) << 8)
+#define ARM_LPAE_PTE_NS			(((arm_lpae_iopte)1) << 5)
+#define ARM_LPAE_PTE_VALID		(((arm_lpae_iopte)1) << 0)
+
+#define ARM_LPAE_PTE_ATTR_LO_MASK	(((arm_lpae_iopte)0x3ff) << 2)
+/* Ignore the contiguous bit for block splitting */
+#define ARM_LPAE_PTE_ATTR_HI_MASK	(((arm_lpae_iopte)6) << 52)
+#define ARM_LPAE_PTE_ATTR_MASK		(ARM_LPAE_PTE_ATTR_LO_MASK |	\
+					 ARM_LPAE_PTE_ATTR_HI_MASK)
+/* Software bit for solving coherency races */
+#define ARM_LPAE_PTE_SW_SYNC		(((arm_lpae_iopte)1) << 55)
+
+/* Stage-1 PTE */
+#define ARM_LPAE_PTE_AP_UNPRIV		(((arm_lpae_iopte)1) << 6)
+#define ARM_LPAE_PTE_AP_RDONLY		(((arm_lpae_iopte)2) << 6)
+#define ARM_LPAE_PTE_ATTRINDX_SHIFT	2
+#define ARM_LPAE_PTE_nG			(((arm_lpae_iopte)1) << 11)
+
+/* Stage-2 PTE */
+#define ARM_LPAE_PTE_HAP_FAULT		(((arm_lpae_iopte)0) << 6)
+#define ARM_LPAE_PTE_HAP_READ		(((arm_lpae_iopte)1) << 6)
+#define ARM_LPAE_PTE_HAP_WRITE		(((arm_lpae_iopte)2) << 6)
+#define ARM_LPAE_PTE_MEMATTR_OIWB	(((arm_lpae_iopte)0xf) << 2)
+#define ARM_LPAE_PTE_MEMATTR_NC		(((arm_lpae_iopte)0x5) << 2)
+#define ARM_LPAE_PTE_MEMATTR_DEV	(((arm_lpae_iopte)0x1) << 2)
+
+/* Register bits */
+#define ARM_32_LPAE_TCR_EAE		(1 << 31)
+#define ARM_64_LPAE_S2_TCR_RES1		(1 << 31)
+
+#define ARM_LPAE_TCR_EPD1		(1 << 23)
+
+#define ARM_LPAE_TCR_TG0_4K		(0 << 14)
+#define ARM_LPAE_TCR_TG0_64K		(1 << 14)
+#define ARM_LPAE_TCR_TG0_16K		(2 << 14)
+
+#define ARM_LPAE_TCR_SH0_SHIFT		12
+#define ARM_LPAE_TCR_SH0_MASK		0x3
+#define ARM_LPAE_TCR_SH_NS		0
+#define ARM_LPAE_TCR_SH_OS		2
+#define ARM_LPAE_TCR_SH_IS		3
+
+#define ARM_LPAE_TCR_ORGN0_SHIFT	10
+#define ARM_LPAE_TCR_IRGN0_SHIFT	8
+#define ARM_LPAE_TCR_RGN_MASK		0x3
+#define ARM_LPAE_TCR_RGN_NC		0
+#define ARM_LPAE_TCR_RGN_WBWA		1
+#define ARM_LPAE_TCR_RGN_WT		2
+#define ARM_LPAE_TCR_RGN_WB		3
+
+#define ARM_LPAE_TCR_SL0_SHIFT		6
+#define ARM_LPAE_TCR_SL0_MASK		0x3
+
+#define ARM_LPAE_TCR_T0SZ_SHIFT		0
+#define ARM_LPAE_TCR_SZ_MASK		0xf
+
+#define ARM_LPAE_TCR_PS_SHIFT		16
+#define ARM_LPAE_TCR_PS_MASK		0x7
+
+#define ARM_LPAE_TCR_IPS_SHIFT		32
+#define ARM_LPAE_TCR_IPS_MASK		0x7
+
+#define ARM_LPAE_TCR_PS_32_BIT		0x0ULL
+#define ARM_LPAE_TCR_PS_36_BIT		0x1ULL
+#define ARM_LPAE_TCR_PS_40_BIT		0x2ULL
+#define ARM_LPAE_TCR_PS_42_BIT		0x3ULL
+#define ARM_LPAE_TCR_PS_44_BIT		0x4ULL
+#define ARM_LPAE_TCR_PS_48_BIT		0x5ULL
+#define ARM_LPAE_TCR_PS_52_BIT		0x6ULL
+
+#define ARM_LPAE_MAIR_ATTR_SHIFT(n)	((n) << 3)
+#define ARM_LPAE_MAIR_ATTR_MASK		0xff
+#define ARM_LPAE_MAIR_ATTR_DEVICE	0x04
+#define ARM_LPAE_MAIR_ATTR_NC		0x44
+#define ARM_LPAE_MAIR_ATTR_WBRWA	0xff
+#define ARM_LPAE_MAIR_ATTR_IDX_NC	0
+#define ARM_LPAE_MAIR_ATTR_IDX_CACHE	1
+#define ARM_LPAE_MAIR_ATTR_IDX_DEV	2
+
+/* IOPTE accessors */
+#define iopte_deref(pte,d) __va(iopte_to_paddr(pte, d))
+
+#define iopte_type(pte,l)					\
+	(((pte) >> ARM_LPAE_PTE_TYPE_SHIFT) & ARM_LPAE_PTE_TYPE_MASK)
+
+#define iopte_prot(pte)	((pte) & ARM_LPAE_PTE_ATTR_MASK)
+
+#define iopte_leaf(pte,l)					\
+	(l == (ARM_LPAE_MAX_LEVELS - 1) ?			\
+		(iopte_type(pte,l) == ARM_LPAE_PTE_TYPE_PAGE) :	\
+		(iopte_type(pte,l) == ARM_LPAE_PTE_TYPE_BLOCK))
+
+struct arm_lpae_io_pgtable {
+	struct io_pgtable	iop;
+
+	int			levels;
+	size_t			pgd_size;
+	unsigned long		pg_shift;
+	unsigned long		bits_per_level;
+
+	void			*pgd;
+};
+
+typedef u64 arm_lpae_iopte;
+
+static arm_lpae_iopte paddr_to_iopte(phys_addr_t paddr,
+				     struct arm_lpae_io_pgtable *data)
+{
+	arm_lpae_iopte pte = paddr;
+
+	/* Of the bits which overlap, either 51:48 or 15:12 are always RES0 */
+	return (pte | (pte >> (48 - 12))) & ARM_LPAE_PTE_ADDR_MASK;
+}
+
+static phys_addr_t iopte_to_paddr(arm_lpae_iopte pte,
+				  struct arm_lpae_io_pgtable *data)
+{
+	u64 paddr = pte & ARM_LPAE_PTE_ADDR_MASK;
+
+	if (data->pg_shift < 16)
+		return paddr;
+
+	/* Rotate the packed high-order bits back to the top */
+	return (paddr | (paddr << (48 - 12))) & (ARM_LPAE_PTE_ADDR_MASK << 4);
+}
+
+static bool selftest_running = false;
+
+static dma_addr_t __arm_lpae_dma_addr(void *pages)
+{
+	return (dma_addr_t)virt_to_phys(pages);
+}
+
+static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp,
+				    struct io_pgtable_cfg *cfg)
+{
+	struct device *dev = cfg->iommu_dev;
+	int order = get_order(size);
+	struct page *p;
+	dma_addr_t dma;
+	void *pages;
+
+	VM_BUG_ON((gfp & __GFP_HIGHMEM));
+	p = alloc_pages_node(dev ? dev_to_node(dev) : NUMA_NO_NODE,
+			     gfp | __GFP_ZERO, order);
+	if (!p)
+		return NULL;
+
+	pages = page_address(p);
+	if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)) {
+		dma = dma_map_single(dev, pages, size, DMA_TO_DEVICE);
+		if (dma_mapping_error(dev, dma))
+			goto out_free;
+		/*
+		 * We depend on the IOMMU being able to work with any physical
+		 * address directly, so if the DMA layer suggests otherwise by
+		 * translating or truncating them, that bodes very badly...
+		 */
+		if (dma != virt_to_phys(pages))
+			goto out_unmap;
+	}
+
+	return pages;
+
+out_unmap:
+	dev_err(dev, "Cannot accommodate DMA translation for IOMMU page tables\n");
+	dma_unmap_single(dev, dma, size, DMA_TO_DEVICE);
+out_free:
+	__free_pages(p, order);
+	return NULL;
+}
+
+static void __arm_lpae_free_pages(void *pages, size_t size,
+				  struct io_pgtable_cfg *cfg)
+{
+	if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA))
+		dma_unmap_single(cfg->iommu_dev, __arm_lpae_dma_addr(pages),
+				 size, DMA_TO_DEVICE);
+	free_pages((unsigned long)pages, get_order(size));
+}
+
+static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep,
+				struct io_pgtable_cfg *cfg)
+{
+	dma_sync_single_for_device(cfg->iommu_dev, __arm_lpae_dma_addr(ptep),
+				   sizeof(*ptep), DMA_TO_DEVICE);
+}
+
+static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte,
+			       struct io_pgtable_cfg *cfg)
+{
+	*ptep = pte;
+
+	if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA))
+		__arm_lpae_sync_pte(ptep, cfg);
+}
+
+static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
+			       unsigned long iova, size_t size, int lvl,
+			       arm_lpae_iopte *ptep);
+
+static void __arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
+				phys_addr_t paddr, arm_lpae_iopte prot,
+				int lvl, arm_lpae_iopte *ptep)
+{
+	arm_lpae_iopte pte = prot;
+
+	if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
+		pte |= ARM_LPAE_PTE_NS;
+
+	if (lvl == ARM_LPAE_MAX_LEVELS - 1)
+		pte |= ARM_LPAE_PTE_TYPE_PAGE;
+	else
+		pte |= ARM_LPAE_PTE_TYPE_BLOCK;
+
+	pte |= ARM_LPAE_PTE_AF | ARM_LPAE_PTE_SH_IS;
+	pte |= paddr_to_iopte(paddr, data);
+
+	__arm_lpae_set_pte(ptep, pte, &data->iop.cfg);
+}
+
+static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
+			     unsigned long iova, phys_addr_t paddr,
+			     arm_lpae_iopte prot, int lvl,
+			     arm_lpae_iopte *ptep)
+{
+	arm_lpae_iopte pte = *ptep;
+
+	if (iopte_leaf(pte, lvl)) {
+		/* We require an unmap first */
+		WARN_ON(!selftest_running);
+		return -EEXIST;
+	} else if (iopte_type(pte, lvl) == ARM_LPAE_PTE_TYPE_TABLE) {
+		/*
+		 * We need to unmap and free the old table before
+		 * overwriting it with a block entry.
+		 */
+		arm_lpae_iopte *tblp;
+		size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
+
+		tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);
+		if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp) != sz))
+			return -EINVAL;
+	}
+
+	__arm_lpae_init_pte(data, paddr, prot, lvl, ptep);
+	return 0;
+}
+
+static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte *table,
+					     arm_lpae_iopte *ptep,
+					     arm_lpae_iopte curr,
+					     struct arm_lpae_io_pgtable *data)
+{
+	arm_lpae_iopte old, new;
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
+
+	new = paddr_to_iopte(__pa(table), data) | ARM_LPAE_PTE_TYPE_TABLE;
+	if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)
+		new |= ARM_LPAE_PTE_NSTABLE;
+
+	/*
+	 * Ensure the table itself is visible before its PTE can be.
+	 * Whilst we could get away with cmpxchg64_release below, this
+	 * doesn't have any ordering semantics when !CONFIG_SMP.
+	 */
+	dma_wmb();
+
+	old = cmpxchg64_relaxed(ptep, curr, new);
+
+	if ((cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA) ||
+	    (old & ARM_LPAE_PTE_SW_SYNC))
+		return old;
+
+	/* Even if it's not ours, there's no point waiting; just kick it */
+	__arm_lpae_sync_pte(ptep, cfg);
+	if (old == curr)
+		WRITE_ONCE(*ptep, new | ARM_LPAE_PTE_SW_SYNC);
+
+	return old;
+}
+
+static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
+			  phys_addr_t paddr, size_t size, arm_lpae_iopte prot,
+			  int lvl, arm_lpae_iopte *ptep)
+{
+	arm_lpae_iopte *cptep, pte;
+	size_t block_size = ARM_LPAE_BLOCK_SIZE(lvl, data);
+	size_t tblsz = ARM_LPAE_GRANULE(data);
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
+
+	/* Find our entry at the current level */
+	ptep += ARM_LPAE_LVL_IDX(iova, lvl, data);
+
+	/* If we can install a leaf entry at this level, then do so */
+	if (size == block_size && (size & cfg->pgsize_bitmap))
+		return arm_lpae_init_pte(data, iova, paddr, prot, lvl, ptep);
+
+	/* We can't allocate tables at the final level */
+	if (WARN_ON(lvl >= ARM_LPAE_MAX_LEVELS - 1))
+		return -EINVAL;
+
+	/* Grab a pointer to the next level */
+	pte = READ_ONCE(*ptep);
+	if (!pte) {
+		cptep = __arm_lpae_alloc_pages(tblsz, GFP_ATOMIC, cfg);
+		if (!cptep)
+			return -ENOMEM;
+
+		pte = arm_lpae_install_table(cptep, ptep, 0, data);
+		if (pte)
+			__arm_lpae_free_pages(cptep, tblsz, cfg);
+	} else if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA) &&
+		   !(pte & ARM_LPAE_PTE_SW_SYNC)) {
+		__arm_lpae_sync_pte(ptep, cfg);
+	}
+
+	if (pte && !iopte_leaf(pte, lvl)) {
+		cptep = iopte_deref(pte, data);
+	} else if (pte) {
+		/* We require an unmap first */
+		WARN_ON(!selftest_running);
+		return -EEXIST;
+	}
+
+	/* Rinse, repeat */
+	return __arm_lpae_map(data, iova, paddr, size, prot, lvl + 1, cptep);
+}
+
+static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
+					   int prot)
+{
+	arm_lpae_iopte pte;
+
+	if (data->iop.fmt == ARM_64_LPAE_S1 ||
+	    data->iop.fmt == ARM_32_LPAE_S1) {
+		pte = ARM_LPAE_PTE_nG;
+
+		if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
+			pte |= ARM_LPAE_PTE_AP_RDONLY;
+
+		if (!(prot & IOMMU_PRIV))
+			pte |= ARM_LPAE_PTE_AP_UNPRIV;
+
+		if (prot & IOMMU_MMIO)
+			pte |= (ARM_LPAE_MAIR_ATTR_IDX_DEV
+				<< ARM_LPAE_PTE_ATTRINDX_SHIFT);
+		else if (prot & IOMMU_CACHE)
+			pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE
+				<< ARM_LPAE_PTE_ATTRINDX_SHIFT);
+	} else {
+		pte = ARM_LPAE_PTE_HAP_FAULT;
+		if (prot & IOMMU_READ)
+			pte |= ARM_LPAE_PTE_HAP_READ;
+		if (prot & IOMMU_WRITE)
+			pte |= ARM_LPAE_PTE_HAP_WRITE;
+		if (prot & IOMMU_MMIO)
+			pte |= ARM_LPAE_PTE_MEMATTR_DEV;
+		else if (prot & IOMMU_CACHE)
+			pte |= ARM_LPAE_PTE_MEMATTR_OIWB;
+		else
+			pte |= ARM_LPAE_PTE_MEMATTR_NC;
+	}
+
+	if (prot & IOMMU_NOEXEC)
+		pte |= ARM_LPAE_PTE_XN;
+
+	return pte;
+}
+
+static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova,
+			phys_addr_t paddr, size_t size, int iommu_prot)
+{
+	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
+	arm_lpae_iopte *ptep = data->pgd;
+	int ret, lvl = ARM_LPAE_START_LVL(data);
+	arm_lpae_iopte prot;
+
+	/* If no access, then nothing to do */
+	if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
+		return 0;
+
+	if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias) ||
+		    paddr >= (1ULL << data->iop.cfg.oas)))
+		return -ERANGE;
+
+	prot = arm_lpae_prot_to_pte(data, iommu_prot);
+	ret = __arm_lpae_map(data, iova, paddr, size, prot, lvl, ptep);
+	/*
+	 * Synchronise all PTE updates for the new mapping before there's
+	 * a chance for anything to kick off a table walk for the new iova.
+	 */
+	wmb();
+
+	return ret;
+}
+
+static void __arm_lpae_free_pgtable(struct arm_lpae_io_pgtable *data, int lvl,
+				    arm_lpae_iopte *ptep)
+{
+	arm_lpae_iopte *start, *end;
+	unsigned long table_size;
+
+	if (lvl == ARM_LPAE_START_LVL(data))
+		table_size = data->pgd_size;
+	else
+		table_size = ARM_LPAE_GRANULE(data);
+
+	start = ptep;
+
+	/* Only leaf entries at the last level */
+	if (lvl == ARM_LPAE_MAX_LEVELS - 1)
+		end = ptep;
+	else
+		end = (void *)ptep + table_size;
+
+	while (ptep != end) {
+		arm_lpae_iopte pte = *ptep++;
+
+		if (!pte || iopte_leaf(pte, lvl))
+			continue;
+
+		__arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
+	}
+
+	__arm_lpae_free_pages(start, table_size, &data->iop.cfg);
+}
+
+static void arm_lpae_free_pgtable(struct io_pgtable *iop)
+{
+	struct arm_lpae_io_pgtable *data = io_pgtable_to_data(iop);
+
+	__arm_lpae_free_pgtable(data, ARM_LPAE_START_LVL(data), data->pgd);
+	kfree(data);
+}
+
+static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
+				       unsigned long iova, size_t size,
+				       arm_lpae_iopte blk_pte, int lvl,
+				       arm_lpae_iopte *ptep)
+{
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
+	arm_lpae_iopte pte, *tablep;
+	phys_addr_t blk_paddr;
+	size_t tablesz = ARM_LPAE_GRANULE(data);
+	size_t split_sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
+	int i, unmap_idx = -1;
+
+	if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
+		return 0;
+
+	tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg);
+	if (!tablep)
+		return 0; /* Bytes unmapped */
+
+	if (size == split_sz)
+		unmap_idx = ARM_LPAE_LVL_IDX(iova, lvl, data);
+
+	blk_paddr = iopte_to_paddr(blk_pte, data);
+	pte = iopte_prot(blk_pte);
+
+	for (i = 0; i < tablesz / sizeof(pte); i++, blk_paddr += split_sz) {
+		/* Unmap! */
+		if (i == unmap_idx)
+			continue;
+
+		__arm_lpae_init_pte(data, blk_paddr, pte, lvl, &tablep[i]);
+	}
+
+	pte = arm_lpae_install_table(tablep, ptep, blk_pte, data);
+	if (pte != blk_pte) {
+		__arm_lpae_free_pages(tablep, tablesz, cfg);
+		/*
+		 * We may race against someone unmapping another part of this
+		 * block, but anything else is invalid. We can't misinterpret
+		 * a page entry here since we're never at the last level.
+		 */
+		if (iopte_type(pte, lvl - 1) != ARM_LPAE_PTE_TYPE_TABLE)
+			return 0;
+
+		tablep = iopte_deref(pte, data);
+	} else if (unmap_idx >= 0) {
+		io_pgtable_tlb_add_flush(&data->iop, iova, size, size, true);
+		return size;
+	}
+
+	return __arm_lpae_unmap(data, iova, size, lvl, tablep);
+}
+
+static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
+			       unsigned long iova, size_t size, int lvl,
+			       arm_lpae_iopte *ptep)
+{
+	arm_lpae_iopte pte;
+	struct io_pgtable *iop = &data->iop;
+
+	/* Something went horribly wrong and we ran out of page table */
+	if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
+		return 0;
+
+	ptep += ARM_LPAE_LVL_IDX(iova, lvl, data);
+	pte = READ_ONCE(*ptep);
+	if (WARN_ON(!pte))
+		return 0;
+
+	/* If the size matches this level, we're in the right place */
+	if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) {
+		__arm_lpae_set_pte(ptep, 0, &iop->cfg);
+
+		if (!iopte_leaf(pte, lvl)) {
+			/* Also flush any partial walks */
+			io_pgtable_tlb_add_flush(iop, iova, size,
+						ARM_LPAE_GRANULE(data), false);
+			io_pgtable_tlb_sync(iop);
+			ptep = iopte_deref(pte, data);
+			__arm_lpae_free_pgtable(data, lvl + 1, ptep);
+		} else {
+			io_pgtable_tlb_add_flush(iop, iova, size, size, true);
+		}
+
+		return size;
+	} else if (iopte_leaf(pte, lvl)) {
+		/*
+		 * Insert a table at the next level to map the old region,
+		 * minus the part we want to unmap
+		 */
+		return arm_lpae_split_blk_unmap(data, iova, size, pte,
+						lvl + 1, ptep);
+	}
+
+	/* Keep on walkin' */
+	ptep = iopte_deref(pte, data);
+	return __arm_lpae_unmap(data, iova, size, lvl + 1, ptep);
+}
+
+static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,
+			     size_t size)
+{
+	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
+	arm_lpae_iopte *ptep = data->pgd;
+	int lvl = ARM_LPAE_START_LVL(data);
+
+	if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias)))
+		return 0;
+
+	return __arm_lpae_unmap(data, iova, size, lvl, ptep);
+}
+
+static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,
+					 unsigned long iova)
+{
+	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
+	arm_lpae_iopte pte, *ptep = data->pgd;
+	int lvl = ARM_LPAE_START_LVL(data);
+
+	do {
+		/* Valid IOPTE pointer? */
+		if (!ptep)
+			return 0;
+
+		/* Grab the IOPTE we're interested in */
+		ptep += ARM_LPAE_LVL_IDX(iova, lvl, data);
+		pte = READ_ONCE(*ptep);
+
+		/* Valid entry? */
+		if (!pte)
+			return 0;
+
+		/* Leaf entry? */
+		if (iopte_leaf(pte,lvl))
+			goto found_translation;
+
+		/* Take it to the next level */
+		ptep = iopte_deref(pte, data);
+	} while (++lvl < ARM_LPAE_MAX_LEVELS);
+
+	/* Ran out of page tables to walk */
+	return 0;
+
+found_translation:
+	iova &= (ARM_LPAE_BLOCK_SIZE(lvl, data) - 1);
+	return iopte_to_paddr(pte, data) | iova;
+}
+
+static void arm_lpae_restrict_pgsizes(struct io_pgtable_cfg *cfg)
+{
+	unsigned long granule, page_sizes;
+	unsigned int max_addr_bits = 48;
+
+	/*
+	 * We need to restrict the supported page sizes to match the
+	 * translation regime for a particular granule. Aim to match
+	 * the CPU page size if possible, otherwise prefer smaller sizes.
+	 * While we're at it, restrict the block sizes to match the
+	 * chosen granule.
+	 */
+	if (cfg->pgsize_bitmap & PAGE_SIZE)
+		granule = PAGE_SIZE;
+	else if (cfg->pgsize_bitmap & ~PAGE_MASK)
+		granule = 1UL << __fls(cfg->pgsize_bitmap & ~PAGE_MASK);
+	else if (cfg->pgsize_bitmap & PAGE_MASK)
+		granule = 1UL << __ffs(cfg->pgsize_bitmap & PAGE_MASK);
+	else
+		granule = 0;
+
+	switch (granule) {
+	case SZ_4K:
+		page_sizes = (SZ_4K | SZ_2M | SZ_1G);
+		break;
+	case SZ_16K:
+		page_sizes = (SZ_16K | SZ_32M);
+		break;
+	case SZ_64K:
+		max_addr_bits = 52;
+		page_sizes = (SZ_64K | SZ_512M);
+		if (cfg->oas > 48)
+			page_sizes |= 1ULL << 42; /* 4TB */
+		break;
+	default:
+		page_sizes = 0;
+	}
+
+	cfg->pgsize_bitmap &= page_sizes;
+	cfg->ias = min(cfg->ias, max_addr_bits);
+	cfg->oas = min(cfg->oas, max_addr_bits);
+}
+
+static struct arm_lpae_io_pgtable *
+arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg)
+{
+	unsigned long va_bits, pgd_bits;
+	struct arm_lpae_io_pgtable *data;
+
+	arm_lpae_restrict_pgsizes(cfg);
+
+	if (!(cfg->pgsize_bitmap & (SZ_4K | SZ_16K | SZ_64K)))
+		return NULL;
+
+	if (cfg->ias > ARM_LPAE_MAX_ADDR_BITS)
+		return NULL;
+
+	if (cfg->oas > ARM_LPAE_MAX_ADDR_BITS)
+		return NULL;
+
+	if (!selftest_running && cfg->iommu_dev->dma_pfn_offset) {
+		dev_err(cfg->iommu_dev, "Cannot accommodate DMA offset for IOMMU page tables\n");
+		return NULL;
+	}
+
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return NULL;
+
+	data->pg_shift = __ffs(cfg->pgsize_bitmap);
+	data->bits_per_level = data->pg_shift - ilog2(sizeof(arm_lpae_iopte));
+
+	va_bits = cfg->ias - data->pg_shift;
+	data->levels = DIV_ROUND_UP(va_bits, data->bits_per_level);
+
+	/* Calculate the actual size of our pgd (without concatenation) */
+	pgd_bits = va_bits - (data->bits_per_level * (data->levels - 1));
+	data->pgd_size = 1UL << (pgd_bits + ilog2(sizeof(arm_lpae_iopte)));
+
+	data->iop.ops = (struct io_pgtable_ops) {
+		.map		= arm_lpae_map,
+		.unmap		= arm_lpae_unmap,
+		.iova_to_phys	= arm_lpae_iova_to_phys,
+	};
+
+	return data;
+}
+
+static struct io_pgtable *
+arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
+{
+	u64 reg;
+	struct arm_lpae_io_pgtable *data;
+
+	if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA))
+		return NULL;
+
+	data = arm_lpae_alloc_pgtable(cfg);
+	if (!data)
+		return NULL;
+
+	/* TCR */
+	reg = (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) |
+	      (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT) |
+	      (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT);
+
+	switch (ARM_LPAE_GRANULE(data)) {
+	case SZ_4K:
+		reg |= ARM_LPAE_TCR_TG0_4K;
+		break;
+	case SZ_16K:
+		reg |= ARM_LPAE_TCR_TG0_16K;
+		break;
+	case SZ_64K:
+		reg |= ARM_LPAE_TCR_TG0_64K;
+		break;
+	}
+
+	switch (cfg->oas) {
+	case 32:
+		reg |= (ARM_LPAE_TCR_PS_32_BIT << ARM_LPAE_TCR_IPS_SHIFT);
+		break;
+	case 36:
+		reg |= (ARM_LPAE_TCR_PS_36_BIT << ARM_LPAE_TCR_IPS_SHIFT);
+		break;
+	case 40:
+		reg |= (ARM_LPAE_TCR_PS_40_BIT << ARM_LPAE_TCR_IPS_SHIFT);
+		break;
+	case 42:
+		reg |= (ARM_LPAE_TCR_PS_42_BIT << ARM_LPAE_TCR_IPS_SHIFT);
+		break;
+	case 44:
+		reg |= (ARM_LPAE_TCR_PS_44_BIT << ARM_LPAE_TCR_IPS_SHIFT);
+		break;
+	case 48:
+		reg |= (ARM_LPAE_TCR_PS_48_BIT << ARM_LPAE_TCR_IPS_SHIFT);
+		break;
+	case 52:
+		reg |= (ARM_LPAE_TCR_PS_52_BIT << ARM_LPAE_TCR_IPS_SHIFT);
+		break;
+	default:
+		goto out_free_data;
+	}
+
+	reg |= (64ULL - cfg->ias) << ARM_LPAE_TCR_T0SZ_SHIFT;
+
+	/* Disable speculative walks through TTBR1 */
+	reg |= ARM_LPAE_TCR_EPD1;
+	cfg->arm_lpae_s1_cfg.tcr = reg;
+
+	/* MAIRs */
+	reg = (ARM_LPAE_MAIR_ATTR_NC
+	       << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_NC)) |
+	      (ARM_LPAE_MAIR_ATTR_WBRWA
+	       << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_CACHE)) |
+	      (ARM_LPAE_MAIR_ATTR_DEVICE
+	       << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV));
+
+	cfg->arm_lpae_s1_cfg.mair[0] = reg;
+	cfg->arm_lpae_s1_cfg.mair[1] = 0;
+
+	/* Looking good; allocate a pgd */
+	data->pgd = __arm_lpae_alloc_pages(data->pgd_size, GFP_KERNEL, cfg);
+	if (!data->pgd)
+		goto out_free_data;
+
+	/* Ensure the empty pgd is visible before any actual TTBR write */
+	wmb();
+
+	/* TTBRs */
+	cfg->arm_lpae_s1_cfg.ttbr[0] = virt_to_phys(data->pgd);
+	cfg->arm_lpae_s1_cfg.ttbr[1] = 0;
+	return &data->iop;
+
+out_free_data:
+	kfree(data);
+	return NULL;
+}
+
+static struct io_pgtable *
+arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie)
+{
+	u64 reg, sl;
+	struct arm_lpae_io_pgtable *data;
+
+	/* The NS quirk doesn't apply at stage 2 */
+	if (cfg->quirks & ~IO_PGTABLE_QUIRK_NO_DMA)
+		return NULL;
+
+	data = arm_lpae_alloc_pgtable(cfg);
+	if (!data)
+		return NULL;
+
+	/*
+	 * Concatenate PGDs at level 1 if possible in order to reduce
+	 * the depth of the stage-2 walk.
+	 */
+	if (data->levels == ARM_LPAE_MAX_LEVELS) {
+		unsigned long pgd_pages;
+
+		pgd_pages = data->pgd_size >> ilog2(sizeof(arm_lpae_iopte));
+		if (pgd_pages <= ARM_LPAE_S2_MAX_CONCAT_PAGES) {
+			data->pgd_size = pgd_pages << data->pg_shift;
+			data->levels--;
+		}
+	}
+
+	/* VTCR */
+	reg = ARM_64_LPAE_S2_TCR_RES1 |
+	     (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) |
+	     (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT) |
+	     (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT);
+
+	sl = ARM_LPAE_START_LVL(data);
+
+	switch (ARM_LPAE_GRANULE(data)) {
+	case SZ_4K:
+		reg |= ARM_LPAE_TCR_TG0_4K;
+		sl++; /* SL0 format is different for 4K granule size */
+		break;
+	case SZ_16K:
+		reg |= ARM_LPAE_TCR_TG0_16K;
+		break;
+	case SZ_64K:
+		reg |= ARM_LPAE_TCR_TG0_64K;
+		break;
+	}
+
+	switch (cfg->oas) {
+	case 32:
+		reg |= (ARM_LPAE_TCR_PS_32_BIT << ARM_LPAE_TCR_PS_SHIFT);
+		break;
+	case 36:
+		reg |= (ARM_LPAE_TCR_PS_36_BIT << ARM_LPAE_TCR_PS_SHIFT);
+		break;
+	case 40:
+		reg |= (ARM_LPAE_TCR_PS_40_BIT << ARM_LPAE_TCR_PS_SHIFT);
+		break;
+	case 42:
+		reg |= (ARM_LPAE_TCR_PS_42_BIT << ARM_LPAE_TCR_PS_SHIFT);
+		break;
+	case 44:
+		reg |= (ARM_LPAE_TCR_PS_44_BIT << ARM_LPAE_TCR_PS_SHIFT);
+		break;
+	case 48:
+		reg |= (ARM_LPAE_TCR_PS_48_BIT << ARM_LPAE_TCR_PS_SHIFT);
+		break;
+	case 52:
+		reg |= (ARM_LPAE_TCR_PS_52_BIT << ARM_LPAE_TCR_PS_SHIFT);
+		break;
+	default:
+		goto out_free_data;
+	}
+
+	reg |= (64ULL - cfg->ias) << ARM_LPAE_TCR_T0SZ_SHIFT;
+	reg |= (~sl & ARM_LPAE_TCR_SL0_MASK) << ARM_LPAE_TCR_SL0_SHIFT;
+	cfg->arm_lpae_s2_cfg.vtcr = reg;
+
+	/* Allocate pgd pages */
+	data->pgd = __arm_lpae_alloc_pages(data->pgd_size, GFP_KERNEL, cfg);
+	if (!data->pgd)
+		goto out_free_data;
+
+	/* Ensure the empty pgd is visible before any actual TTBR write */
+	wmb();
+
+	/* VTTBR */
+	cfg->arm_lpae_s2_cfg.vttbr = virt_to_phys(data->pgd);
+	return &data->iop;
+
+out_free_data:
+	kfree(data);
+	return NULL;
+}
+
+static struct io_pgtable *
+arm_32_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
+{
+	struct io_pgtable *iop;
+
+	if (cfg->ias > 32 || cfg->oas > 40)
+		return NULL;
+
+	cfg->pgsize_bitmap &= (SZ_4K | SZ_2M | SZ_1G);
+	iop = arm_64_lpae_alloc_pgtable_s1(cfg, cookie);
+	if (iop) {
+		cfg->arm_lpae_s1_cfg.tcr |= ARM_32_LPAE_TCR_EAE;
+		cfg->arm_lpae_s1_cfg.tcr &= 0xffffffff;
+	}
+
+	return iop;
+}
+
+static struct io_pgtable *
+arm_32_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie)
+{
+	struct io_pgtable *iop;
+
+	if (cfg->ias > 40 || cfg->oas > 40)
+		return NULL;
+
+	cfg->pgsize_bitmap &= (SZ_4K | SZ_2M | SZ_1G);
+	iop = arm_64_lpae_alloc_pgtable_s2(cfg, cookie);
+	if (iop)
+		cfg->arm_lpae_s2_cfg.vtcr &= 0xffffffff;
+
+	return iop;
+}
+
+struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns = {
+	.alloc	= arm_64_lpae_alloc_pgtable_s1,
+	.free	= arm_lpae_free_pgtable,
+};
+
+struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns = {
+	.alloc	= arm_64_lpae_alloc_pgtable_s2,
+	.free	= arm_lpae_free_pgtable,
+};
+
+struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns = {
+	.alloc	= arm_32_lpae_alloc_pgtable_s1,
+	.free	= arm_lpae_free_pgtable,
+};
+
+struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns = {
+	.alloc	= arm_32_lpae_alloc_pgtable_s2,
+	.free	= arm_lpae_free_pgtable,
+};
+
+#ifdef CONFIG_IOMMU_IO_PGTABLE_LPAE_SELFTEST
+
+static struct io_pgtable_cfg *cfg_cookie;
+
+static void dummy_tlb_flush_all(void *cookie)
+{
+	WARN_ON(cookie != cfg_cookie);
+}
+
+static void dummy_tlb_add_flush(unsigned long iova, size_t size,
+				size_t granule, bool leaf, void *cookie)
+{
+	WARN_ON(cookie != cfg_cookie);
+	WARN_ON(!(size & cfg_cookie->pgsize_bitmap));
+}
+
+static void dummy_tlb_sync(void *cookie)
+{
+	WARN_ON(cookie != cfg_cookie);
+}
+
+static const struct iommu_gather_ops dummy_tlb_ops __initconst = {
+	.tlb_flush_all	= dummy_tlb_flush_all,
+	.tlb_add_flush	= dummy_tlb_add_flush,
+	.tlb_sync	= dummy_tlb_sync,
+};
+
+static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops)
+{
+	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
+
+	pr_err("cfg: pgsize_bitmap 0x%lx, ias %u-bit\n",
+		cfg->pgsize_bitmap, cfg->ias);
+	pr_err("data: %d levels, 0x%zx pgd_size, %lu pg_shift, %lu bits_per_level, pgd @ %p\n",
+		data->levels, data->pgd_size, data->pg_shift,
+		data->bits_per_level, data->pgd);
+}
+
+#define __FAIL(ops, i)	({						\
+		WARN(1, "selftest: test failed for fmt idx %d\n", (i));	\
+		arm_lpae_dump_ops(ops);					\
+		selftest_running = false;				\
+		-EFAULT;						\
+})
+
+static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg)
+{
+	static const enum io_pgtable_fmt fmts[] = {
+		ARM_64_LPAE_S1,
+		ARM_64_LPAE_S2,
+	};
+
+	int i, j;
+	unsigned long iova;
+	size_t size;
+	struct io_pgtable_ops *ops;
+
+	selftest_running = true;
+
+	for (i = 0; i < ARRAY_SIZE(fmts); ++i) {
+		cfg_cookie = cfg;
+		ops = alloc_io_pgtable_ops(fmts[i], cfg, cfg);
+		if (!ops) {
+			pr_err("selftest: failed to allocate io pgtable ops\n");
+			return -ENOMEM;
+		}
+
+		/*
+		 * Initial sanity checks.
+		 * Empty page tables shouldn't provide any translations.
+		 */
+		if (ops->iova_to_phys(ops, 42))
+			return __FAIL(ops, i);
+
+		if (ops->iova_to_phys(ops, SZ_1G + 42))
+			return __FAIL(ops, i);
+
+		if (ops->iova_to_phys(ops, SZ_2G + 42))
+			return __FAIL(ops, i);
+
+		/*
+		 * Distinct mappings of different granule sizes.
+		 */
+		iova = 0;
+		for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
+			size = 1UL << j;
+
+			if (ops->map(ops, iova, iova, size, IOMMU_READ |
+							    IOMMU_WRITE |
+							    IOMMU_NOEXEC |
+							    IOMMU_CACHE))
+				return __FAIL(ops, i);
+
+			/* Overlapping mappings */
+			if (!ops->map(ops, iova, iova + size, size,
+				      IOMMU_READ | IOMMU_NOEXEC))
+				return __FAIL(ops, i);
+
+			if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+				return __FAIL(ops, i);
+
+			iova += SZ_1G;
+		}
+
+		/* Partial unmap */
+		size = 1UL << __ffs(cfg->pgsize_bitmap);
+		if (ops->unmap(ops, SZ_1G + size, size) != size)
+			return __FAIL(ops, i);
+
+		/* Remap of partial unmap */
+		if (ops->map(ops, SZ_1G + size, size, size, IOMMU_READ))
+			return __FAIL(ops, i);
+
+		if (ops->iova_to_phys(ops, SZ_1G + size + 42) != (size + 42))
+			return __FAIL(ops, i);
+
+		/* Full unmap */
+		iova = 0;
+		for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
+			size = 1UL << j;
+
+			if (ops->unmap(ops, iova, size) != size)
+				return __FAIL(ops, i);
+
+			if (ops->iova_to_phys(ops, iova + 42))
+				return __FAIL(ops, i);
+
+			/* Remap full block */
+			if (ops->map(ops, iova, iova, size, IOMMU_WRITE))
+				return __FAIL(ops, i);
+
+			if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+				return __FAIL(ops, i);
+
+			iova += SZ_1G;
+		}
+
+		free_io_pgtable_ops(ops);
+	}
+
+	selftest_running = false;
+	return 0;
+}
+
+static int __init arm_lpae_do_selftests(void)
+{
+	static const unsigned long pgsize[] = {
+		SZ_4K | SZ_2M | SZ_1G,
+		SZ_16K | SZ_32M,
+		SZ_64K | SZ_512M,
+	};
+
+	static const unsigned int ias[] = {
+		32, 36, 40, 42, 44, 48,
+	};
+
+	int i, j, pass = 0, fail = 0;
+	struct io_pgtable_cfg cfg = {
+		.tlb = &dummy_tlb_ops,
+		.oas = 48,
+		.quirks = IO_PGTABLE_QUIRK_NO_DMA,
+	};
+
+	for (i = 0; i < ARRAY_SIZE(pgsize); ++i) {
+		for (j = 0; j < ARRAY_SIZE(ias); ++j) {
+			cfg.pgsize_bitmap = pgsize[i];
+			cfg.ias = ias[j];
+			pr_info("selftest: pgsize_bitmap 0x%08lx, IAS %u\n",
+				pgsize[i], ias[j]);
+			if (arm_lpae_run_tests(&cfg))
+				fail++;
+			else
+				pass++;
+		}
+	}
+
+	pr_info("selftest: completed with %d PASS %d FAIL\n", pass, fail);
+	return fail ? -EFAULT : 0;
+}
+subsys_initcall(arm_lpae_do_selftests);
+#endif
diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
new file mode 100644
index 000000000..127558d83
--- /dev/null
+++ b/drivers/iommu/io-pgtable.c
@@ -0,0 +1,79 @@
+/*
+ * Generic page table allocator for IOMMUs.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright (C) 2014 ARM Limited
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ */
+
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include "io-pgtable.h"
+
+static const struct io_pgtable_init_fns *
+io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
+#ifdef CONFIG_IOMMU_IO_PGTABLE_LPAE
+	[ARM_32_LPAE_S1] = &io_pgtable_arm_32_lpae_s1_init_fns,
+	[ARM_32_LPAE_S2] = &io_pgtable_arm_32_lpae_s2_init_fns,
+	[ARM_64_LPAE_S1] = &io_pgtable_arm_64_lpae_s1_init_fns,
+	[ARM_64_LPAE_S2] = &io_pgtable_arm_64_lpae_s2_init_fns,
+#endif
+#ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S
+	[ARM_V7S] = &io_pgtable_arm_v7s_init_fns,
+#endif
+};
+
+struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
+					    struct io_pgtable_cfg *cfg,
+					    void *cookie)
+{
+	struct io_pgtable *iop;
+	const struct io_pgtable_init_fns *fns;
+
+	if (fmt >= IO_PGTABLE_NUM_FMTS)
+		return NULL;
+
+	fns = io_pgtable_init_table[fmt];
+	if (!fns)
+		return NULL;
+
+	iop = fns->alloc(cfg, cookie);
+	if (!iop)
+		return NULL;
+
+	iop->fmt	= fmt;
+	iop->cookie	= cookie;
+	iop->cfg	= *cfg;
+
+	return &iop->ops;
+}
+
+/*
+ * It is the IOMMU driver's responsibility to ensure that the page table
+ * is no longer accessible to the walker by this point.
+ */
+void free_io_pgtable_ops(struct io_pgtable_ops *ops)
+{
+	struct io_pgtable *iop;
+
+	if (!ops)
+		return;
+
+	iop = container_of(ops, struct io_pgtable, ops);
+	io_pgtable_tlb_flush_all(iop);
+	io_pgtable_init_table[iop->fmt]->free(iop);
+}
diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h
new file mode 100644
index 000000000..2df79093c
--- /dev/null
+++ b/drivers/iommu/io-pgtable.h
@@ -0,0 +1,208 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __IO_PGTABLE_H
+#define __IO_PGTABLE_H
+#include <linux/bitops.h>
+
+/*
+ * Public API for use by IOMMU drivers
+ */
+enum io_pgtable_fmt {
+	ARM_32_LPAE_S1,
+	ARM_32_LPAE_S2,
+	ARM_64_LPAE_S1,
+	ARM_64_LPAE_S2,
+	ARM_V7S,
+	IO_PGTABLE_NUM_FMTS,
+};
+
+/**
+ * struct iommu_gather_ops - IOMMU callbacks for TLB and page table management.
+ *
+ * @tlb_flush_all: Synchronously invalidate the entire TLB context.
+ * @tlb_add_flush: Queue up a TLB invalidation for a virtual address range.
+ * @tlb_sync:      Ensure any queued TLB invalidation has taken effect, and
+ *                 any corresponding page table updates are visible to the
+ *                 IOMMU.
+ *
+ * Note that these can all be called in atomic context and must therefore
+ * not block.
+ */
+struct iommu_gather_ops {
+	void (*tlb_flush_all)(void *cookie);
+	void (*tlb_add_flush)(unsigned long iova, size_t size, size_t granule,
+			      bool leaf, void *cookie);
+	void (*tlb_sync)(void *cookie);
+};
+
+/**
+ * struct io_pgtable_cfg - Configuration data for a set of page tables.
+ *
+ * @quirks:        A bitmap of hardware quirks that require some special
+ *                 action by the low-level page table allocator.
+ * @pgsize_bitmap: A bitmap of page sizes supported by this set of page
+ *                 tables.
+ * @ias:           Input address (iova) size, in bits.
+ * @oas:           Output address (paddr) size, in bits.
+ * @tlb:           TLB management callbacks for this set of tables.
+ * @iommu_dev:     The device representing the DMA configuration for the
+ *                 page table walker.
+ */
+struct io_pgtable_cfg {
+	/*
+	 * IO_PGTABLE_QUIRK_ARM_NS: (ARM formats) Set NS and NSTABLE bits in
+	 *	stage 1 PTEs, for hardware which insists on validating them
+	 *	even in	non-secure state where they should normally be ignored.
+	 *
+	 * IO_PGTABLE_QUIRK_NO_PERMS: Ignore the IOMMU_READ, IOMMU_WRITE and
+	 *	IOMMU_NOEXEC flags and map everything with full access, for
+	 *	hardware which does not implement the permissions of a given
+	 *	format, and/or requires some format-specific default value.
+	 *
+	 * IO_PGTABLE_QUIRK_TLBI_ON_MAP: If the format forbids caching invalid
+	 *	(unmapped) entries but the hardware might do so anyway, perform
+	 *	TLB maintenance when mapping as well as when unmapping.
+	 *
+	 * IO_PGTABLE_QUIRK_ARM_MTK_4GB: (ARM v7s format) Set bit 9 in all
+	 *	PTEs, for Mediatek IOMMUs which treat it as a 33rd address bit
+	 *	when the SoC is in "4GB mode" and they can only access the high
+	 *	remap of DRAM (0x1_00000000 to 0x1_ffffffff).
+	 *
+	 * IO_PGTABLE_QUIRK_NO_DMA: Guarantees that the tables will only ever
+	 *	be accessed by a fully cache-coherent IOMMU or CPU (e.g. for a
+	 *	software-emulated IOMMU), such that pagetable updates need not
+	 *	be treated as explicit DMA data.
+	 */
+	#define IO_PGTABLE_QUIRK_ARM_NS		BIT(0)
+	#define IO_PGTABLE_QUIRK_NO_PERMS	BIT(1)
+	#define IO_PGTABLE_QUIRK_TLBI_ON_MAP	BIT(2)
+	#define IO_PGTABLE_QUIRK_ARM_MTK_4GB	BIT(3)
+	#define IO_PGTABLE_QUIRK_NO_DMA		BIT(4)
+	unsigned long			quirks;
+	unsigned long			pgsize_bitmap;
+	unsigned int			ias;
+	unsigned int			oas;
+	const struct iommu_gather_ops	*tlb;
+	struct device			*iommu_dev;
+
+	/* Low-level data specific to the table format */
+	union {
+		struct {
+			u64	ttbr[2];
+			u64	tcr;
+			u64	mair[2];
+		} arm_lpae_s1_cfg;
+
+		struct {
+			u64	vttbr;
+			u64	vtcr;
+		} arm_lpae_s2_cfg;
+
+		struct {
+			u32	ttbr[2];
+			u32	tcr;
+			u32	nmrr;
+			u32	prrr;
+		} arm_v7s_cfg;
+	};
+};
+
+/**
+ * struct io_pgtable_ops - Page table manipulation API for IOMMU drivers.
+ *
+ * @map:          Map a physically contiguous memory region.
+ * @unmap:        Unmap a physically contiguous memory region.
+ * @iova_to_phys: Translate iova to physical address.
+ *
+ * These functions map directly onto the iommu_ops member functions with
+ * the same names.
+ */
+struct io_pgtable_ops {
+	int (*map)(struct io_pgtable_ops *ops, unsigned long iova,
+		   phys_addr_t paddr, size_t size, int prot);
+	size_t (*unmap)(struct io_pgtable_ops *ops, unsigned long iova,
+			size_t size);
+	phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops,
+				    unsigned long iova);
+};
+
+/**
+ * alloc_io_pgtable_ops() - Allocate a page table allocator for use by an IOMMU.
+ *
+ * @fmt:    The page table format.
+ * @cfg:    The page table configuration. This will be modified to represent
+ *          the configuration actually provided by the allocator (e.g. the
+ *          pgsize_bitmap may be restricted).
+ * @cookie: An opaque token provided by the IOMMU driver and passed back to
+ *          the callback routines in cfg->tlb.
+ */
+struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
+					    struct io_pgtable_cfg *cfg,
+					    void *cookie);
+
+/**
+ * free_io_pgtable_ops() - Free an io_pgtable_ops structure. The caller
+ *                         *must* ensure that the page table is no longer
+ *                         live, but the TLB can be dirty.
+ *
+ * @ops: The ops returned from alloc_io_pgtable_ops.
+ */
+void free_io_pgtable_ops(struct io_pgtable_ops *ops);
+
+
+/*
+ * Internal structures for page table allocator implementations.
+ */
+
+/**
+ * struct io_pgtable - Internal structure describing a set of page tables.
+ *
+ * @fmt:    The page table format.
+ * @cookie: An opaque token provided by the IOMMU driver and passed back to
+ *          any callback routines.
+ * @cfg:    A copy of the page table configuration.
+ * @ops:    The page table operations in use for this set of page tables.
+ */
+struct io_pgtable {
+	enum io_pgtable_fmt	fmt;
+	void			*cookie;
+	struct io_pgtable_cfg	cfg;
+	struct io_pgtable_ops	ops;
+};
+
+#define io_pgtable_ops_to_pgtable(x) container_of((x), struct io_pgtable, ops)
+
+static inline void io_pgtable_tlb_flush_all(struct io_pgtable *iop)
+{
+	iop->cfg.tlb->tlb_flush_all(iop->cookie);
+}
+
+static inline void io_pgtable_tlb_add_flush(struct io_pgtable *iop,
+		unsigned long iova, size_t size, size_t granule, bool leaf)
+{
+	iop->cfg.tlb->tlb_add_flush(iova, size, granule, leaf, iop->cookie);
+}
+
+static inline void io_pgtable_tlb_sync(struct io_pgtable *iop)
+{
+	iop->cfg.tlb->tlb_sync(iop->cookie);
+}
+
+/**
+ * struct io_pgtable_init_fns - Alloc/free a set of page tables for a
+ *                              particular format.
+ *
+ * @alloc: Allocate a set of page tables described by cfg.
+ * @free:  Free the page tables associated with iop.
+ */
+struct io_pgtable_init_fns {
+	struct io_pgtable *(*alloc)(struct io_pgtable_cfg *cfg, void *cookie);
+	void (*free)(struct io_pgtable *iop);
+};
+
+extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_arm_v7s_init_fns;
+
+#endif /* __IO_PGTABLE_H */
diff --git a/drivers/iommu/iommu-debugfs.c b/drivers/iommu/iommu-debugfs.c
new file mode 100644
index 000000000..f03548942
--- /dev/null
+++ b/drivers/iommu/iommu-debugfs.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * IOMMU debugfs core infrastructure
+ *
+ * Copyright (C) 2018 Advanced Micro Devices, Inc.
+ *
+ * Author: Gary R Hook <gary.hook@amd.com>
+ */
+
+#include <linux/pci.h>
+#include <linux/iommu.h>
+#include <linux/debugfs.h>
+
+struct dentry *iommu_debugfs_dir;
+EXPORT_SYMBOL_GPL(iommu_debugfs_dir);
+
+/**
+ * iommu_debugfs_setup - create the top-level iommu directory in debugfs
+ *
+ * Provide base enablement for using debugfs to expose internal data of an
+ * IOMMU driver. When called, this function creates the
+ * /sys/kernel/debug/iommu directory.
+ *
+ * Emit a strong warning at boot time to indicate that this feature is
+ * enabled.
+ *
+ * This function is called from iommu_init; drivers may then use
+ * iommu_debugfs_dir to instantiate a vendor-specific directory to be used
+ * to expose internal data.
+ */
+void iommu_debugfs_setup(void)
+{
+	if (!iommu_debugfs_dir) {
+		iommu_debugfs_dir = debugfs_create_dir("iommu", NULL);
+		pr_warn("\n");
+		pr_warn("*************************************************************\n");
+		pr_warn("**     NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE    **\n");
+		pr_warn("**                                                         **\n");
+		pr_warn("**  IOMMU DebugFS SUPPORT HAS BEEN ENABLED IN THIS KERNEL  **\n");
+		pr_warn("**                                                         **\n");
+		pr_warn("** This means that this kernel is built to expose internal **\n");
+		pr_warn("** IOMMU data structures, which may compromise security on **\n");
+		pr_warn("** your system.                                            **\n");
+		pr_warn("**                                                         **\n");
+		pr_warn("** If you see this message and you are not debugging the   **\n");
+		pr_warn("** kernel, report this immediately to your vendor!         **\n");
+		pr_warn("**                                                         **\n");
+		pr_warn("**     NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE    **\n");
+		pr_warn("*************************************************************\n");
+	}
+}
diff --git a/drivers/iommu/iommu-sysfs.c b/drivers/iommu/iommu-sysfs.c
new file mode 100644
index 000000000..36d1a7ce7
--- /dev/null
+++ b/drivers/iommu/iommu-sysfs.c
@@ -0,0 +1,133 @@
+/*
+ * IOMMU sysfs class support
+ *
+ * Copyright (C) 2014 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/device.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+/*
+ * We provide a common class "devices" group which initially has no attributes.
+ * As devices are added to the IOMMU, we'll add links to the group.
+ */
+static struct attribute *devices_attr[] = {
+	NULL,
+};
+
+static const struct attribute_group iommu_devices_attr_group = {
+	.name = "devices",
+	.attrs = devices_attr,
+};
+
+static const struct attribute_group *iommu_dev_groups[] = {
+	&iommu_devices_attr_group,
+	NULL,
+};
+
+static void iommu_release_device(struct device *dev)
+{
+	kfree(dev);
+}
+
+static struct class iommu_class = {
+	.name = "iommu",
+	.dev_release = iommu_release_device,
+	.dev_groups = iommu_dev_groups,
+};
+
+static int __init iommu_dev_init(void)
+{
+	return class_register(&iommu_class);
+}
+postcore_initcall(iommu_dev_init);
+
+/*
+ * Init the struct device for the IOMMU. IOMMU specific attributes can
+ * be provided as an attribute group, allowing a unique namespace per
+ * IOMMU type.
+ */
+int iommu_device_sysfs_add(struct iommu_device *iommu,
+			   struct device *parent,
+			   const struct attribute_group **groups,
+			   const char *fmt, ...)
+{
+	va_list vargs;
+	int ret;
+
+	iommu->dev = kzalloc(sizeof(*iommu->dev), GFP_KERNEL);
+	if (!iommu->dev)
+		return -ENOMEM;
+
+	device_initialize(iommu->dev);
+
+	iommu->dev->class = &iommu_class;
+	iommu->dev->parent = parent;
+	iommu->dev->groups = groups;
+
+	va_start(vargs, fmt);
+	ret = kobject_set_name_vargs(&iommu->dev->kobj, fmt, vargs);
+	va_end(vargs);
+	if (ret)
+		goto error;
+
+	ret = device_add(iommu->dev);
+	if (ret)
+		goto error;
+
+	dev_set_drvdata(iommu->dev, iommu);
+
+	return 0;
+
+error:
+	put_device(iommu->dev);
+	return ret;
+}
+
+void iommu_device_sysfs_remove(struct iommu_device *iommu)
+{
+	dev_set_drvdata(iommu->dev, NULL);
+	device_unregister(iommu->dev);
+	iommu->dev = NULL;
+}
+/*
+ * IOMMU drivers can indicate a device is managed by a given IOMMU using
+ * this interface.  A link to the device will be created in the "devices"
+ * directory of the IOMMU device in sysfs and an "iommu" link will be
+ * created under the linked device, pointing back at the IOMMU device.
+ */
+int iommu_device_link(struct iommu_device *iommu, struct device *link)
+{
+	int ret;
+
+	if (!iommu || IS_ERR(iommu))
+		return -ENODEV;
+
+	ret = sysfs_add_link_to_group(&iommu->dev->kobj, "devices",
+				      &link->kobj, dev_name(link));
+	if (ret)
+		return ret;
+
+	ret = sysfs_create_link_nowarn(&link->kobj, &iommu->dev->kobj, "iommu");
+	if (ret)
+		sysfs_remove_link_from_group(&iommu->dev->kobj, "devices",
+					     dev_name(link));
+
+	return ret;
+}
+
+void iommu_device_unlink(struct iommu_device *iommu, struct device *link)
+{
+	if (!iommu || IS_ERR(iommu))
+		return;
+
+	sysfs_remove_link(&link->kobj, "iommu");
+	sysfs_remove_link_from_group(&iommu->dev->kobj, "devices", dev_name(link));
+}
diff --git a/drivers/iommu/iommu-traces.c b/drivers/iommu/iommu-traces.c
new file mode 100644
index 000000000..1e9ca7789
--- /dev/null
+++ b/drivers/iommu/iommu-traces.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * iommu trace points
+ *
+ * Copyright (C) 2013 Shuah Khan <shuah.kh@samsung.com>
+ *
+ */
+
+#include <linux/string.h>
+#include <linux/types.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/iommu.h>
+
+/* iommu_group_event */
+EXPORT_TRACEPOINT_SYMBOL_GPL(add_device_to_group);
+EXPORT_TRACEPOINT_SYMBOL_GPL(remove_device_from_group);
+
+/* iommu_device_event */
+EXPORT_TRACEPOINT_SYMBOL_GPL(attach_device_to_domain);
+EXPORT_TRACEPOINT_SYMBOL_GPL(detach_device_from_domain);
+
+/* iommu_map_unmap */
+EXPORT_TRACEPOINT_SYMBOL_GPL(map);
+EXPORT_TRACEPOINT_SYMBOL_GPL(unmap);
+
+/* iommu_error */
+EXPORT_TRACEPOINT_SYMBOL_GPL(io_page_fault);
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
new file mode 100644
index 000000000..85ef6c9bc
--- /dev/null
+++ b/drivers/iommu/iommu.c
@@ -0,0 +1,2019 @@
+/*
+ * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <jroedel@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#define pr_fmt(fmt)    "iommu: " fmt
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/bug.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/iommu.h>
+#include <linux/idr.h>
+#include <linux/notifier.h>
+#include <linux/err.h>
+#include <linux/pci.h>
+#include <linux/bitops.h>
+#include <linux/property.h>
+#include <trace/events/iommu.h>
+
+static struct kset *iommu_group_kset;
+static DEFINE_IDA(iommu_group_ida);
+#ifdef CONFIG_IOMMU_DEFAULT_PASSTHROUGH
+static unsigned int iommu_def_domain_type = IOMMU_DOMAIN_IDENTITY;
+#else
+static unsigned int iommu_def_domain_type = IOMMU_DOMAIN_DMA;
+#endif
+
+struct iommu_callback_data {
+	const struct iommu_ops *ops;
+};
+
+struct iommu_group {
+	struct kobject kobj;
+	struct kobject *devices_kobj;
+	struct list_head devices;
+	struct mutex mutex;
+	struct blocking_notifier_head notifier;
+	void *iommu_data;
+	void (*iommu_data_release)(void *iommu_data);
+	char *name;
+	int id;
+	struct iommu_domain *default_domain;
+	struct iommu_domain *domain;
+};
+
+struct group_device {
+	struct list_head list;
+	struct device *dev;
+	char *name;
+};
+
+struct iommu_group_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct iommu_group *group, char *buf);
+	ssize_t (*store)(struct iommu_group *group,
+			 const char *buf, size_t count);
+};
+
+static const char * const iommu_group_resv_type_string[] = {
+	[IOMMU_RESV_DIRECT]	= "direct",
+	[IOMMU_RESV_RESERVED]	= "reserved",
+	[IOMMU_RESV_MSI]	= "msi",
+	[IOMMU_RESV_SW_MSI]	= "msi",
+};
+
+#define IOMMU_GROUP_ATTR(_name, _mode, _show, _store)		\
+struct iommu_group_attribute iommu_group_attr_##_name =		\
+	__ATTR(_name, _mode, _show, _store)
+
+#define to_iommu_group_attr(_attr)	\
+	container_of(_attr, struct iommu_group_attribute, attr)
+#define to_iommu_group(_kobj)		\
+	container_of(_kobj, struct iommu_group, kobj)
+
+static LIST_HEAD(iommu_device_list);
+static DEFINE_SPINLOCK(iommu_device_lock);
+
+int iommu_device_register(struct iommu_device *iommu)
+{
+	spin_lock(&iommu_device_lock);
+	list_add_tail(&iommu->list, &iommu_device_list);
+	spin_unlock(&iommu_device_lock);
+
+	return 0;
+}
+
+void iommu_device_unregister(struct iommu_device *iommu)
+{
+	spin_lock(&iommu_device_lock);
+	list_del(&iommu->list);
+	spin_unlock(&iommu_device_lock);
+}
+
+static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
+						 unsigned type);
+static int __iommu_attach_device(struct iommu_domain *domain,
+				 struct device *dev);
+static int __iommu_attach_group(struct iommu_domain *domain,
+				struct iommu_group *group);
+static void __iommu_detach_group(struct iommu_domain *domain,
+				 struct iommu_group *group);
+
+static int __init iommu_set_def_domain_type(char *str)
+{
+	bool pt;
+	int ret;
+
+	ret = kstrtobool(str, &pt);
+	if (ret)
+		return ret;
+
+	iommu_def_domain_type = pt ? IOMMU_DOMAIN_IDENTITY : IOMMU_DOMAIN_DMA;
+	return 0;
+}
+early_param("iommu.passthrough", iommu_set_def_domain_type);
+
+static ssize_t iommu_group_attr_show(struct kobject *kobj,
+				     struct attribute *__attr, char *buf)
+{
+	struct iommu_group_attribute *attr = to_iommu_group_attr(__attr);
+	struct iommu_group *group = to_iommu_group(kobj);
+	ssize_t ret = -EIO;
+
+	if (attr->show)
+		ret = attr->show(group, buf);
+	return ret;
+}
+
+static ssize_t iommu_group_attr_store(struct kobject *kobj,
+				      struct attribute *__attr,
+				      const char *buf, size_t count)
+{
+	struct iommu_group_attribute *attr = to_iommu_group_attr(__attr);
+	struct iommu_group *group = to_iommu_group(kobj);
+	ssize_t ret = -EIO;
+
+	if (attr->store)
+		ret = attr->store(group, buf, count);
+	return ret;
+}
+
+static const struct sysfs_ops iommu_group_sysfs_ops = {
+	.show = iommu_group_attr_show,
+	.store = iommu_group_attr_store,
+};
+
+static int iommu_group_create_file(struct iommu_group *group,
+				   struct iommu_group_attribute *attr)
+{
+	return sysfs_create_file(&group->kobj, &attr->attr);
+}
+
+static void iommu_group_remove_file(struct iommu_group *group,
+				    struct iommu_group_attribute *attr)
+{
+	sysfs_remove_file(&group->kobj, &attr->attr);
+}
+
+static ssize_t iommu_group_show_name(struct iommu_group *group, char *buf)
+{
+	return sprintf(buf, "%s\n", group->name);
+}
+
+/**
+ * iommu_insert_resv_region - Insert a new region in the
+ * list of reserved regions.
+ * @new: new region to insert
+ * @regions: list of regions
+ *
+ * The new element is sorted by address with respect to the other
+ * regions of the same type. In case it overlaps with another
+ * region of the same type, regions are merged. In case it
+ * overlaps with another region of different type, regions are
+ * not merged.
+ */
+static int iommu_insert_resv_region(struct iommu_resv_region *new,
+				    struct list_head *regions)
+{
+	struct iommu_resv_region *region;
+	phys_addr_t start = new->start;
+	phys_addr_t end = new->start + new->length - 1;
+	struct list_head *pos = regions->next;
+
+	while (pos != regions) {
+		struct iommu_resv_region *entry =
+			list_entry(pos, struct iommu_resv_region, list);
+		phys_addr_t a = entry->start;
+		phys_addr_t b = entry->start + entry->length - 1;
+		int type = entry->type;
+
+		if (end < a) {
+			goto insert;
+		} else if (start > b) {
+			pos = pos->next;
+		} else if ((start >= a) && (end <= b)) {
+			if (new->type == type)
+				return 0;
+			else
+				pos = pos->next;
+		} else {
+			if (new->type == type) {
+				phys_addr_t new_start = min(a, start);
+				phys_addr_t new_end = max(b, end);
+				int ret;
+
+				list_del(&entry->list);
+				entry->start = new_start;
+				entry->length = new_end - new_start + 1;
+				ret = iommu_insert_resv_region(entry, regions);
+				kfree(entry);
+				return ret;
+			} else {
+				pos = pos->next;
+			}
+		}
+	}
+insert:
+	region = iommu_alloc_resv_region(new->start, new->length,
+					 new->prot, new->type);
+	if (!region)
+		return -ENOMEM;
+
+	list_add_tail(&region->list, pos);
+	return 0;
+}
+
+static int
+iommu_insert_device_resv_regions(struct list_head *dev_resv_regions,
+				 struct list_head *group_resv_regions)
+{
+	struct iommu_resv_region *entry;
+	int ret = 0;
+
+	list_for_each_entry(entry, dev_resv_regions, list) {
+		ret = iommu_insert_resv_region(entry, group_resv_regions);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
+int iommu_get_group_resv_regions(struct iommu_group *group,
+				 struct list_head *head)
+{
+	struct group_device *device;
+	int ret = 0;
+
+	mutex_lock(&group->mutex);
+	list_for_each_entry(device, &group->devices, list) {
+		struct list_head dev_resv_regions;
+
+		INIT_LIST_HEAD(&dev_resv_regions);
+		iommu_get_resv_regions(device->dev, &dev_resv_regions);
+		ret = iommu_insert_device_resv_regions(&dev_resv_regions, head);
+		iommu_put_resv_regions(device->dev, &dev_resv_regions);
+		if (ret)
+			break;
+	}
+	mutex_unlock(&group->mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_get_group_resv_regions);
+
+static ssize_t iommu_group_show_resv_regions(struct iommu_group *group,
+					     char *buf)
+{
+	struct iommu_resv_region *region, *next;
+	struct list_head group_resv_regions;
+	char *str = buf;
+
+	INIT_LIST_HEAD(&group_resv_regions);
+	iommu_get_group_resv_regions(group, &group_resv_regions);
+
+	list_for_each_entry_safe(region, next, &group_resv_regions, list) {
+		str += sprintf(str, "0x%016llx 0x%016llx %s\n",
+			       (long long int)region->start,
+			       (long long int)(region->start +
+						region->length - 1),
+			       iommu_group_resv_type_string[region->type]);
+		kfree(region);
+	}
+
+	return (str - buf);
+}
+
+static ssize_t iommu_group_show_type(struct iommu_group *group,
+				     char *buf)
+{
+	char *type = "unknown\n";
+
+	if (group->default_domain) {
+		switch (group->default_domain->type) {
+		case IOMMU_DOMAIN_BLOCKED:
+			type = "blocked\n";
+			break;
+		case IOMMU_DOMAIN_IDENTITY:
+			type = "identity\n";
+			break;
+		case IOMMU_DOMAIN_UNMANAGED:
+			type = "unmanaged\n";
+			break;
+		case IOMMU_DOMAIN_DMA:
+			type = "DMA\n";
+			break;
+		}
+	}
+	strcpy(buf, type);
+
+	return strlen(type);
+}
+
+static IOMMU_GROUP_ATTR(name, S_IRUGO, iommu_group_show_name, NULL);
+
+static IOMMU_GROUP_ATTR(reserved_regions, 0444,
+			iommu_group_show_resv_regions, NULL);
+
+static IOMMU_GROUP_ATTR(type, 0444, iommu_group_show_type, NULL);
+
+static void iommu_group_release(struct kobject *kobj)
+{
+	struct iommu_group *group = to_iommu_group(kobj);
+
+	pr_debug("Releasing group %d\n", group->id);
+
+	if (group->iommu_data_release)
+		group->iommu_data_release(group->iommu_data);
+
+	ida_simple_remove(&iommu_group_ida, group->id);
+
+	if (group->default_domain)
+		iommu_domain_free(group->default_domain);
+
+	kfree(group->name);
+	kfree(group);
+}
+
+static struct kobj_type iommu_group_ktype = {
+	.sysfs_ops = &iommu_group_sysfs_ops,
+	.release = iommu_group_release,
+};
+
+/**
+ * iommu_group_alloc - Allocate a new group
+ *
+ * This function is called by an iommu driver to allocate a new iommu
+ * group.  The iommu group represents the minimum granularity of the iommu.
+ * Upon successful return, the caller holds a reference to the supplied
+ * group in order to hold the group until devices are added.  Use
+ * iommu_group_put() to release this extra reference count, allowing the
+ * group to be automatically reclaimed once it has no devices or external
+ * references.
+ */
+struct iommu_group *iommu_group_alloc(void)
+{
+	struct iommu_group *group;
+	int ret;
+
+	group = kzalloc(sizeof(*group), GFP_KERNEL);
+	if (!group)
+		return ERR_PTR(-ENOMEM);
+
+	group->kobj.kset = iommu_group_kset;
+	mutex_init(&group->mutex);
+	INIT_LIST_HEAD(&group->devices);
+	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
+
+	ret = ida_simple_get(&iommu_group_ida, 0, 0, GFP_KERNEL);
+	if (ret < 0) {
+		kfree(group);
+		return ERR_PTR(ret);
+	}
+	group->id = ret;
+
+	ret = kobject_init_and_add(&group->kobj, &iommu_group_ktype,
+				   NULL, "%d", group->id);
+	if (ret) {
+		ida_simple_remove(&iommu_group_ida, group->id);
+		kobject_put(&group->kobj);
+		return ERR_PTR(ret);
+	}
+
+	group->devices_kobj = kobject_create_and_add("devices", &group->kobj);
+	if (!group->devices_kobj) {
+		kobject_put(&group->kobj); /* triggers .release & free */
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/*
+	 * The devices_kobj holds a reference on the group kobject, so
+	 * as long as that exists so will the group.  We can therefore
+	 * use the devices_kobj for reference counting.
+	 */
+	kobject_put(&group->kobj);
+
+	ret = iommu_group_create_file(group,
+				      &iommu_group_attr_reserved_regions);
+	if (ret)
+		return ERR_PTR(ret);
+
+	ret = iommu_group_create_file(group, &iommu_group_attr_type);
+	if (ret)
+		return ERR_PTR(ret);
+
+	pr_debug("Allocated group %d\n", group->id);
+
+	return group;
+}
+EXPORT_SYMBOL_GPL(iommu_group_alloc);
+
+struct iommu_group *iommu_group_get_by_id(int id)
+{
+	struct kobject *group_kobj;
+	struct iommu_group *group;
+	const char *name;
+
+	if (!iommu_group_kset)
+		return NULL;
+
+	name = kasprintf(GFP_KERNEL, "%d", id);
+	if (!name)
+		return NULL;
+
+	group_kobj = kset_find_obj(iommu_group_kset, name);
+	kfree(name);
+
+	if (!group_kobj)
+		return NULL;
+
+	group = container_of(group_kobj, struct iommu_group, kobj);
+	BUG_ON(group->id != id);
+
+	kobject_get(group->devices_kobj);
+	kobject_put(&group->kobj);
+
+	return group;
+}
+EXPORT_SYMBOL_GPL(iommu_group_get_by_id);
+
+/**
+ * iommu_group_get_iommudata - retrieve iommu_data registered for a group
+ * @group: the group
+ *
+ * iommu drivers can store data in the group for use when doing iommu
+ * operations.  This function provides a way to retrieve it.  Caller
+ * should hold a group reference.
+ */
+void *iommu_group_get_iommudata(struct iommu_group *group)
+{
+	return group->iommu_data;
+}
+EXPORT_SYMBOL_GPL(iommu_group_get_iommudata);
+
+/**
+ * iommu_group_set_iommudata - set iommu_data for a group
+ * @group: the group
+ * @iommu_data: new data
+ * @release: release function for iommu_data
+ *
+ * iommu drivers can store data in the group for use when doing iommu
+ * operations.  This function provides a way to set the data after
+ * the group has been allocated.  Caller should hold a group reference.
+ */
+void iommu_group_set_iommudata(struct iommu_group *group, void *iommu_data,
+			       void (*release)(void *iommu_data))
+{
+	group->iommu_data = iommu_data;
+	group->iommu_data_release = release;
+}
+EXPORT_SYMBOL_GPL(iommu_group_set_iommudata);
+
+/**
+ * iommu_group_set_name - set name for a group
+ * @group: the group
+ * @name: name
+ *
+ * Allow iommu driver to set a name for a group.  When set it will
+ * appear in a name attribute file under the group in sysfs.
+ */
+int iommu_group_set_name(struct iommu_group *group, const char *name)
+{
+	int ret;
+
+	if (group->name) {
+		iommu_group_remove_file(group, &iommu_group_attr_name);
+		kfree(group->name);
+		group->name = NULL;
+		if (!name)
+			return 0;
+	}
+
+	group->name = kstrdup(name, GFP_KERNEL);
+	if (!group->name)
+		return -ENOMEM;
+
+	ret = iommu_group_create_file(group, &iommu_group_attr_name);
+	if (ret) {
+		kfree(group->name);
+		group->name = NULL;
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_group_set_name);
+
+static int iommu_group_create_direct_mappings(struct iommu_group *group,
+					      struct device *dev)
+{
+	struct iommu_domain *domain = group->default_domain;
+	struct iommu_resv_region *entry;
+	struct list_head mappings;
+	unsigned long pg_size;
+	int ret = 0;
+
+	if (!domain || domain->type != IOMMU_DOMAIN_DMA)
+		return 0;
+
+	BUG_ON(!domain->pgsize_bitmap);
+
+	pg_size = 1UL << __ffs(domain->pgsize_bitmap);
+	INIT_LIST_HEAD(&mappings);
+
+	iommu_get_resv_regions(dev, &mappings);
+
+	/* We need to consider overlapping regions for different devices */
+	list_for_each_entry(entry, &mappings, list) {
+		dma_addr_t start, end, addr;
+
+		if (domain->ops->apply_resv_region)
+			domain->ops->apply_resv_region(dev, domain, entry);
+
+		start = ALIGN(entry->start, pg_size);
+		end   = ALIGN(entry->start + entry->length, pg_size);
+
+		if (entry->type != IOMMU_RESV_DIRECT)
+			continue;
+
+		for (addr = start; addr < end; addr += pg_size) {
+			phys_addr_t phys_addr;
+
+			phys_addr = iommu_iova_to_phys(domain, addr);
+			if (phys_addr)
+				continue;
+
+			ret = iommu_map(domain, addr, addr, pg_size, entry->prot);
+			if (ret)
+				goto out;
+		}
+
+	}
+
+	iommu_flush_tlb_all(domain);
+
+out:
+	iommu_put_resv_regions(dev, &mappings);
+
+	return ret;
+}
+
+/**
+ * iommu_group_add_device - add a device to an iommu group
+ * @group: the group into which to add the device (reference should be held)
+ * @dev: the device
+ *
+ * This function is called by an iommu driver to add a device into a
+ * group.  Adding a device increments the group reference count.
+ */
+int iommu_group_add_device(struct iommu_group *group, struct device *dev)
+{
+	int ret, i = 0;
+	struct group_device *device;
+
+	device = kzalloc(sizeof(*device), GFP_KERNEL);
+	if (!device)
+		return -ENOMEM;
+
+	device->dev = dev;
+
+	ret = sysfs_create_link(&dev->kobj, &group->kobj, "iommu_group");
+	if (ret)
+		goto err_free_device;
+
+	device->name = kasprintf(GFP_KERNEL, "%s", kobject_name(&dev->kobj));
+rename:
+	if (!device->name) {
+		ret = -ENOMEM;
+		goto err_remove_link;
+	}
+
+	ret = sysfs_create_link_nowarn(group->devices_kobj,
+				       &dev->kobj, device->name);
+	if (ret) {
+		if (ret == -EEXIST && i >= 0) {
+			/*
+			 * Account for the slim chance of collision
+			 * and append an instance to the name.
+			 */
+			kfree(device->name);
+			device->name = kasprintf(GFP_KERNEL, "%s.%d",
+						 kobject_name(&dev->kobj), i++);
+			goto rename;
+		}
+		goto err_free_name;
+	}
+
+	kobject_get(group->devices_kobj);
+
+	dev->iommu_group = group;
+
+	iommu_group_create_direct_mappings(group, dev);
+
+	mutex_lock(&group->mutex);
+	list_add_tail(&device->list, &group->devices);
+	if (group->domain)
+		ret = __iommu_attach_device(group->domain, dev);
+	mutex_unlock(&group->mutex);
+	if (ret)
+		goto err_put_group;
+
+	/* Notify any listeners about change to group. */
+	blocking_notifier_call_chain(&group->notifier,
+				     IOMMU_GROUP_NOTIFY_ADD_DEVICE, dev);
+
+	trace_add_device_to_group(group->id, dev);
+
+	pr_info("Adding device %s to group %d\n", dev_name(dev), group->id);
+
+	return 0;
+
+err_put_group:
+	mutex_lock(&group->mutex);
+	list_del(&device->list);
+	mutex_unlock(&group->mutex);
+	dev->iommu_group = NULL;
+	kobject_put(group->devices_kobj);
+	sysfs_remove_link(group->devices_kobj, device->name);
+err_free_name:
+	kfree(device->name);
+err_remove_link:
+	sysfs_remove_link(&dev->kobj, "iommu_group");
+err_free_device:
+	kfree(device);
+	pr_err("Failed to add device %s to group %d: %d\n", dev_name(dev), group->id, ret);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_group_add_device);
+
+/**
+ * iommu_group_remove_device - remove a device from it's current group
+ * @dev: device to be removed
+ *
+ * This function is called by an iommu driver to remove the device from
+ * it's current group.  This decrements the iommu group reference count.
+ */
+void iommu_group_remove_device(struct device *dev)
+{
+	struct iommu_group *group = dev->iommu_group;
+	struct group_device *tmp_device, *device = NULL;
+
+	pr_info("Removing device %s from group %d\n", dev_name(dev), group->id);
+
+	/* Pre-notify listeners that a device is being removed. */
+	blocking_notifier_call_chain(&group->notifier,
+				     IOMMU_GROUP_NOTIFY_DEL_DEVICE, dev);
+
+	mutex_lock(&group->mutex);
+	list_for_each_entry(tmp_device, &group->devices, list) {
+		if (tmp_device->dev == dev) {
+			device = tmp_device;
+			list_del(&device->list);
+			break;
+		}
+	}
+	mutex_unlock(&group->mutex);
+
+	if (!device)
+		return;
+
+	sysfs_remove_link(group->devices_kobj, device->name);
+	sysfs_remove_link(&dev->kobj, "iommu_group");
+
+	trace_remove_device_from_group(group->id, dev);
+
+	kfree(device->name);
+	kfree(device);
+	dev->iommu_group = NULL;
+	kobject_put(group->devices_kobj);
+}
+EXPORT_SYMBOL_GPL(iommu_group_remove_device);
+
+static int iommu_group_device_count(struct iommu_group *group)
+{
+	struct group_device *entry;
+	int ret = 0;
+
+	list_for_each_entry(entry, &group->devices, list)
+		ret++;
+
+	return ret;
+}
+
+/**
+ * iommu_group_for_each_dev - iterate over each device in the group
+ * @group: the group
+ * @data: caller opaque data to be passed to callback function
+ * @fn: caller supplied callback function
+ *
+ * This function is called by group users to iterate over group devices.
+ * Callers should hold a reference count to the group during callback.
+ * The group->mutex is held across callbacks, which will block calls to
+ * iommu_group_add/remove_device.
+ */
+static int __iommu_group_for_each_dev(struct iommu_group *group, void *data,
+				      int (*fn)(struct device *, void *))
+{
+	struct group_device *device;
+	int ret = 0;
+
+	list_for_each_entry(device, &group->devices, list) {
+		ret = fn(device->dev, data);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
+
+int iommu_group_for_each_dev(struct iommu_group *group, void *data,
+			     int (*fn)(struct device *, void *))
+{
+	int ret;
+
+	mutex_lock(&group->mutex);
+	ret = __iommu_group_for_each_dev(group, data, fn);
+	mutex_unlock(&group->mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_group_for_each_dev);
+
+/**
+ * iommu_group_get - Return the group for a device and increment reference
+ * @dev: get the group that this device belongs to
+ *
+ * This function is called by iommu drivers and users to get the group
+ * for the specified device.  If found, the group is returned and the group
+ * reference in incremented, else NULL.
+ */
+struct iommu_group *iommu_group_get(struct device *dev)
+{
+	struct iommu_group *group = dev->iommu_group;
+
+	if (group)
+		kobject_get(group->devices_kobj);
+
+	return group;
+}
+EXPORT_SYMBOL_GPL(iommu_group_get);
+
+/**
+ * iommu_group_ref_get - Increment reference on a group
+ * @group: the group to use, must not be NULL
+ *
+ * This function is called by iommu drivers to take additional references on an
+ * existing group.  Returns the given group for convenience.
+ */
+struct iommu_group *iommu_group_ref_get(struct iommu_group *group)
+{
+	kobject_get(group->devices_kobj);
+	return group;
+}
+
+/**
+ * iommu_group_put - Decrement group reference
+ * @group: the group to use
+ *
+ * This function is called by iommu drivers and users to release the
+ * iommu group.  Once the reference count is zero, the group is released.
+ */
+void iommu_group_put(struct iommu_group *group)
+{
+	if (group)
+		kobject_put(group->devices_kobj);
+}
+EXPORT_SYMBOL_GPL(iommu_group_put);
+
+/**
+ * iommu_group_register_notifier - Register a notifier for group changes
+ * @group: the group to watch
+ * @nb: notifier block to signal
+ *
+ * This function allows iommu group users to track changes in a group.
+ * See include/linux/iommu.h for actions sent via this notifier.  Caller
+ * should hold a reference to the group throughout notifier registration.
+ */
+int iommu_group_register_notifier(struct iommu_group *group,
+				  struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&group->notifier, nb);
+}
+EXPORT_SYMBOL_GPL(iommu_group_register_notifier);
+
+/**
+ * iommu_group_unregister_notifier - Unregister a notifier
+ * @group: the group to watch
+ * @nb: notifier block to signal
+ *
+ * Unregister a previously registered group notifier block.
+ */
+int iommu_group_unregister_notifier(struct iommu_group *group,
+				    struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&group->notifier, nb);
+}
+EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier);
+
+/**
+ * iommu_group_id - Return ID for a group
+ * @group: the group to ID
+ *
+ * Return the unique ID for the group matching the sysfs group number.
+ */
+int iommu_group_id(struct iommu_group *group)
+{
+	return group->id;
+}
+EXPORT_SYMBOL_GPL(iommu_group_id);
+
+static struct iommu_group *get_pci_alias_group(struct pci_dev *pdev,
+					       unsigned long *devfns);
+
+/*
+ * To consider a PCI device isolated, we require ACS to support Source
+ * Validation, Request Redirection, Completer Redirection, and Upstream
+ * Forwarding.  This effectively means that devices cannot spoof their
+ * requester ID, requests and completions cannot be redirected, and all
+ * transactions are forwarded upstream, even as it passes through a
+ * bridge where the target device is downstream.
+ */
+#define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
+
+/*
+ * For multifunction devices which are not isolated from each other, find
+ * all the other non-isolated functions and look for existing groups.  For
+ * each function, we also need to look for aliases to or from other devices
+ * that may already have a group.
+ */
+static struct iommu_group *get_pci_function_alias_group(struct pci_dev *pdev,
+							unsigned long *devfns)
+{
+	struct pci_dev *tmp = NULL;
+	struct iommu_group *group;
+
+	if (!pdev->multifunction || pci_acs_enabled(pdev, REQ_ACS_FLAGS))
+		return NULL;
+
+	for_each_pci_dev(tmp) {
+		if (tmp == pdev || tmp->bus != pdev->bus ||
+		    PCI_SLOT(tmp->devfn) != PCI_SLOT(pdev->devfn) ||
+		    pci_acs_enabled(tmp, REQ_ACS_FLAGS))
+			continue;
+
+		group = get_pci_alias_group(tmp, devfns);
+		if (group) {
+			pci_dev_put(tmp);
+			return group;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Look for aliases to or from the given device for existing groups. DMA
+ * aliases are only supported on the same bus, therefore the search
+ * space is quite small (especially since we're really only looking at pcie
+ * device, and therefore only expect multiple slots on the root complex or
+ * downstream switch ports).  It's conceivable though that a pair of
+ * multifunction devices could have aliases between them that would cause a
+ * loop.  To prevent this, we use a bitmap to track where we've been.
+ */
+static struct iommu_group *get_pci_alias_group(struct pci_dev *pdev,
+					       unsigned long *devfns)
+{
+	struct pci_dev *tmp = NULL;
+	struct iommu_group *group;
+
+	if (test_and_set_bit(pdev->devfn & 0xff, devfns))
+		return NULL;
+
+	group = iommu_group_get(&pdev->dev);
+	if (group)
+		return group;
+
+	for_each_pci_dev(tmp) {
+		if (tmp == pdev || tmp->bus != pdev->bus)
+			continue;
+
+		/* We alias them or they alias us */
+		if (pci_devs_are_dma_aliases(pdev, tmp)) {
+			group = get_pci_alias_group(tmp, devfns);
+			if (group) {
+				pci_dev_put(tmp);
+				return group;
+			}
+
+			group = get_pci_function_alias_group(tmp, devfns);
+			if (group) {
+				pci_dev_put(tmp);
+				return group;
+			}
+		}
+	}
+
+	return NULL;
+}
+
+struct group_for_pci_data {
+	struct pci_dev *pdev;
+	struct iommu_group *group;
+};
+
+/*
+ * DMA alias iterator callback, return the last seen device.  Stop and return
+ * the IOMMU group if we find one along the way.
+ */
+static int get_pci_alias_or_group(struct pci_dev *pdev, u16 alias, void *opaque)
+{
+	struct group_for_pci_data *data = opaque;
+
+	data->pdev = pdev;
+	data->group = iommu_group_get(&pdev->dev);
+
+	return data->group != NULL;
+}
+
+/*
+ * Generic device_group call-back function. It just allocates one
+ * iommu-group per device.
+ */
+struct iommu_group *generic_device_group(struct device *dev)
+{
+	return iommu_group_alloc();
+}
+
+/*
+ * Use standard PCI bus topology, isolation features, and DMA alias quirks
+ * to find or create an IOMMU group for a device.
+ */
+struct iommu_group *pci_device_group(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct group_for_pci_data data;
+	struct pci_bus *bus;
+	struct iommu_group *group = NULL;
+	u64 devfns[4] = { 0 };
+
+	if (WARN_ON(!dev_is_pci(dev)))
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Find the upstream DMA alias for the device.  A device must not
+	 * be aliased due to topology in order to have its own IOMMU group.
+	 * If we find an alias along the way that already belongs to a
+	 * group, use it.
+	 */
+	if (pci_for_each_dma_alias(pdev, get_pci_alias_or_group, &data))
+		return data.group;
+
+	pdev = data.pdev;
+
+	/*
+	 * Continue upstream from the point of minimum IOMMU granularity
+	 * due to aliases to the point where devices are protected from
+	 * peer-to-peer DMA by PCI ACS.  Again, if we find an existing
+	 * group, use it.
+	 */
+	for (bus = pdev->bus; !pci_is_root_bus(bus); bus = bus->parent) {
+		if (!bus->self)
+			continue;
+
+		if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
+			break;
+
+		pdev = bus->self;
+
+		group = iommu_group_get(&pdev->dev);
+		if (group)
+			return group;
+	}
+
+	/*
+	 * Look for existing groups on device aliases.  If we alias another
+	 * device or another device aliases us, use the same group.
+	 */
+	group = get_pci_alias_group(pdev, (unsigned long *)devfns);
+	if (group)
+		return group;
+
+	/*
+	 * Look for existing groups on non-isolated functions on the same
+	 * slot and aliases of those funcions, if any.  No need to clear
+	 * the search bitmap, the tested devfns are still valid.
+	 */
+	group = get_pci_function_alias_group(pdev, (unsigned long *)devfns);
+	if (group)
+		return group;
+
+	/* No shared group found, allocate new */
+	return iommu_group_alloc();
+}
+
+/**
+ * iommu_group_get_for_dev - Find or create the IOMMU group for a device
+ * @dev: target device
+ *
+ * This function is intended to be called by IOMMU drivers and extended to
+ * support common, bus-defined algorithms when determining or creating the
+ * IOMMU group for a device.  On success, the caller will hold a reference
+ * to the returned IOMMU group, which will already include the provided
+ * device.  The reference should be released with iommu_group_put().
+ */
+struct iommu_group *iommu_group_get_for_dev(struct device *dev)
+{
+	const struct iommu_ops *ops = dev->bus->iommu_ops;
+	struct iommu_group *group;
+	int ret;
+
+	group = iommu_group_get(dev);
+	if (group)
+		return group;
+
+	if (!ops)
+		return ERR_PTR(-EINVAL);
+
+	group = ops->device_group(dev);
+	if (WARN_ON_ONCE(group == NULL))
+		return ERR_PTR(-EINVAL);
+
+	if (IS_ERR(group))
+		return group;
+
+	/*
+	 * Try to allocate a default domain - needs support from the
+	 * IOMMU driver.
+	 */
+	if (!group->default_domain) {
+		struct iommu_domain *dom;
+
+		dom = __iommu_domain_alloc(dev->bus, iommu_def_domain_type);
+		if (!dom && iommu_def_domain_type != IOMMU_DOMAIN_DMA) {
+			dev_warn(dev,
+				 "failed to allocate default IOMMU domain of type %u; falling back to IOMMU_DOMAIN_DMA",
+				 iommu_def_domain_type);
+			dom = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_DMA);
+		}
+
+		group->default_domain = dom;
+		if (!group->domain)
+			group->domain = dom;
+	}
+
+	ret = iommu_group_add_device(group, dev);
+	if (ret) {
+		iommu_group_put(group);
+		return ERR_PTR(ret);
+	}
+
+	return group;
+}
+
+struct iommu_domain *iommu_group_default_domain(struct iommu_group *group)
+{
+	return group->default_domain;
+}
+
+static int add_iommu_group(struct device *dev, void *data)
+{
+	struct iommu_callback_data *cb = data;
+	const struct iommu_ops *ops = cb->ops;
+	int ret;
+
+	if (!ops->add_device)
+		return 0;
+
+	WARN_ON(dev->iommu_group);
+
+	ret = ops->add_device(dev);
+
+	/*
+	 * We ignore -ENODEV errors for now, as they just mean that the
+	 * device is not translated by an IOMMU. We still care about
+	 * other errors and fail to initialize when they happen.
+	 */
+	if (ret == -ENODEV)
+		ret = 0;
+
+	return ret;
+}
+
+static int remove_iommu_group(struct device *dev, void *data)
+{
+	struct iommu_callback_data *cb = data;
+	const struct iommu_ops *ops = cb->ops;
+
+	if (ops->remove_device && dev->iommu_group)
+		ops->remove_device(dev);
+
+	return 0;
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct device *dev = data;
+	const struct iommu_ops *ops = dev->bus->iommu_ops;
+	struct iommu_group *group;
+	unsigned long group_action = 0;
+
+	/*
+	 * ADD/DEL call into iommu driver ops if provided, which may
+	 * result in ADD/DEL notifiers to group->notifier
+	 */
+	if (action == BUS_NOTIFY_ADD_DEVICE) {
+		if (ops->add_device) {
+			int ret;
+
+			ret = ops->add_device(dev);
+			return (ret) ? NOTIFY_DONE : NOTIFY_OK;
+		}
+	} else if (action == BUS_NOTIFY_REMOVED_DEVICE) {
+		if (ops->remove_device && dev->iommu_group) {
+			ops->remove_device(dev);
+			return 0;
+		}
+	}
+
+	/*
+	 * Remaining BUS_NOTIFYs get filtered and republished to the
+	 * group, if anyone is listening
+	 */
+	group = iommu_group_get(dev);
+	if (!group)
+		return 0;
+
+	switch (action) {
+	case BUS_NOTIFY_BIND_DRIVER:
+		group_action = IOMMU_GROUP_NOTIFY_BIND_DRIVER;
+		break;
+	case BUS_NOTIFY_BOUND_DRIVER:
+		group_action = IOMMU_GROUP_NOTIFY_BOUND_DRIVER;
+		break;
+	case BUS_NOTIFY_UNBIND_DRIVER:
+		group_action = IOMMU_GROUP_NOTIFY_UNBIND_DRIVER;
+		break;
+	case BUS_NOTIFY_UNBOUND_DRIVER:
+		group_action = IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER;
+		break;
+	}
+
+	if (group_action)
+		blocking_notifier_call_chain(&group->notifier,
+					     group_action, dev);
+
+	iommu_group_put(group);
+	return 0;
+}
+
+static int iommu_bus_init(struct bus_type *bus, const struct iommu_ops *ops)
+{
+	int err;
+	struct notifier_block *nb;
+	struct iommu_callback_data cb = {
+		.ops = ops,
+	};
+
+	nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL);
+	if (!nb)
+		return -ENOMEM;
+
+	nb->notifier_call = iommu_bus_notifier;
+
+	err = bus_register_notifier(bus, nb);
+	if (err)
+		goto out_free;
+
+	err = bus_for_each_dev(bus, NULL, &cb, add_iommu_group);
+	if (err)
+		goto out_err;
+
+
+	return 0;
+
+out_err:
+	/* Clean up */
+	bus_for_each_dev(bus, NULL, &cb, remove_iommu_group);
+	bus_unregister_notifier(bus, nb);
+
+out_free:
+	kfree(nb);
+
+	return err;
+}
+
+/**
+ * bus_set_iommu - set iommu-callbacks for the bus
+ * @bus: bus.
+ * @ops: the callbacks provided by the iommu-driver
+ *
+ * This function is called by an iommu driver to set the iommu methods
+ * used for a particular bus. Drivers for devices on that bus can use
+ * the iommu-api after these ops are registered.
+ * This special function is needed because IOMMUs are usually devices on
+ * the bus itself, so the iommu drivers are not initialized when the bus
+ * is set up. With this function the iommu-driver can set the iommu-ops
+ * afterwards.
+ */
+int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops)
+{
+	int err;
+
+	if (bus->iommu_ops != NULL)
+		return -EBUSY;
+
+	bus->iommu_ops = ops;
+
+	/* Do IOMMU specific setup for this bus-type */
+	err = iommu_bus_init(bus, ops);
+	if (err)
+		bus->iommu_ops = NULL;
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(bus_set_iommu);
+
+bool iommu_present(struct bus_type *bus)
+{
+	return bus->iommu_ops != NULL;
+}
+EXPORT_SYMBOL_GPL(iommu_present);
+
+bool iommu_capable(struct bus_type *bus, enum iommu_cap cap)
+{
+	if (!bus->iommu_ops || !bus->iommu_ops->capable)
+		return false;
+
+	return bus->iommu_ops->capable(cap);
+}
+EXPORT_SYMBOL_GPL(iommu_capable);
+
+/**
+ * iommu_set_fault_handler() - set a fault handler for an iommu domain
+ * @domain: iommu domain
+ * @handler: fault handler
+ * @token: user data, will be passed back to the fault handler
+ *
+ * This function should be used by IOMMU users which want to be notified
+ * whenever an IOMMU fault happens.
+ *
+ * The fault handler itself should return 0 on success, and an appropriate
+ * error code otherwise.
+ */
+void iommu_set_fault_handler(struct iommu_domain *domain,
+					iommu_fault_handler_t handler,
+					void *token)
+{
+	BUG_ON(!domain);
+
+	domain->handler = handler;
+	domain->handler_token = token;
+}
+EXPORT_SYMBOL_GPL(iommu_set_fault_handler);
+
+static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
+						 unsigned type)
+{
+	struct iommu_domain *domain;
+
+	if (bus == NULL || bus->iommu_ops == NULL)
+		return NULL;
+
+	domain = bus->iommu_ops->domain_alloc(type);
+	if (!domain)
+		return NULL;
+
+	domain->ops  = bus->iommu_ops;
+	domain->type = type;
+	/* Assume all sizes by default; the driver may override this later */
+	domain->pgsize_bitmap  = bus->iommu_ops->pgsize_bitmap;
+
+	return domain;
+}
+
+struct iommu_domain *iommu_domain_alloc(struct bus_type *bus)
+{
+	return __iommu_domain_alloc(bus, IOMMU_DOMAIN_UNMANAGED);
+}
+EXPORT_SYMBOL_GPL(iommu_domain_alloc);
+
+void iommu_domain_free(struct iommu_domain *domain)
+{
+	domain->ops->domain_free(domain);
+}
+EXPORT_SYMBOL_GPL(iommu_domain_free);
+
+static int __iommu_attach_device(struct iommu_domain *domain,
+				 struct device *dev)
+{
+	int ret;
+	if ((domain->ops->is_attach_deferred != NULL) &&
+	    domain->ops->is_attach_deferred(domain, dev))
+		return 0;
+
+	if (unlikely(domain->ops->attach_dev == NULL))
+		return -ENODEV;
+
+	ret = domain->ops->attach_dev(domain, dev);
+	if (!ret)
+		trace_attach_device_to_domain(dev);
+	return ret;
+}
+
+int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
+{
+	struct iommu_group *group;
+	int ret;
+
+	group = iommu_group_get(dev);
+	if (!group)
+		return -ENODEV;
+
+	/*
+	 * Lock the group to make sure the device-count doesn't
+	 * change while we are attaching
+	 */
+	mutex_lock(&group->mutex);
+	ret = -EINVAL;
+	if (iommu_group_device_count(group) != 1)
+		goto out_unlock;
+
+	ret = __iommu_attach_group(domain, group);
+
+out_unlock:
+	mutex_unlock(&group->mutex);
+	iommu_group_put(group);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_attach_device);
+
+static void __iommu_detach_device(struct iommu_domain *domain,
+				  struct device *dev)
+{
+	if ((domain->ops->is_attach_deferred != NULL) &&
+	    domain->ops->is_attach_deferred(domain, dev))
+		return;
+
+	if (unlikely(domain->ops->detach_dev == NULL))
+		return;
+
+	domain->ops->detach_dev(domain, dev);
+	trace_detach_device_from_domain(dev);
+}
+
+void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
+{
+	struct iommu_group *group;
+
+	group = iommu_group_get(dev);
+	if (!group)
+		return;
+
+	mutex_lock(&group->mutex);
+	if (iommu_group_device_count(group) != 1) {
+		WARN_ON(1);
+		goto out_unlock;
+	}
+
+	__iommu_detach_group(domain, group);
+
+out_unlock:
+	mutex_unlock(&group->mutex);
+	iommu_group_put(group);
+}
+EXPORT_SYMBOL_GPL(iommu_detach_device);
+
+struct iommu_domain *iommu_get_domain_for_dev(struct device *dev)
+{
+	struct iommu_domain *domain;
+	struct iommu_group *group;
+
+	group = iommu_group_get(dev);
+	if (!group)
+		return NULL;
+
+	domain = group->domain;
+
+	iommu_group_put(group);
+
+	return domain;
+}
+EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev);
+
+/*
+ * IOMMU groups are really the natrual working unit of the IOMMU, but
+ * the IOMMU API works on domains and devices.  Bridge that gap by
+ * iterating over the devices in a group.  Ideally we'd have a single
+ * device which represents the requestor ID of the group, but we also
+ * allow IOMMU drivers to create policy defined minimum sets, where
+ * the physical hardware may be able to distiguish members, but we
+ * wish to group them at a higher level (ex. untrusted multi-function
+ * PCI devices).  Thus we attach each device.
+ */
+static int iommu_group_do_attach_device(struct device *dev, void *data)
+{
+	struct iommu_domain *domain = data;
+
+	return __iommu_attach_device(domain, dev);
+}
+
+static int __iommu_attach_group(struct iommu_domain *domain,
+				struct iommu_group *group)
+{
+	int ret;
+
+	if (group->default_domain && group->domain != group->default_domain)
+		return -EBUSY;
+
+	ret = __iommu_group_for_each_dev(group, domain,
+					 iommu_group_do_attach_device);
+	if (ret == 0)
+		group->domain = domain;
+
+	return ret;
+}
+
+int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group)
+{
+	int ret;
+
+	mutex_lock(&group->mutex);
+	ret = __iommu_attach_group(domain, group);
+	mutex_unlock(&group->mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_attach_group);
+
+static int iommu_group_do_detach_device(struct device *dev, void *data)
+{
+	struct iommu_domain *domain = data;
+
+	__iommu_detach_device(domain, dev);
+
+	return 0;
+}
+
+static void __iommu_detach_group(struct iommu_domain *domain,
+				 struct iommu_group *group)
+{
+	int ret;
+
+	if (!group->default_domain) {
+		__iommu_group_for_each_dev(group, domain,
+					   iommu_group_do_detach_device);
+		group->domain = NULL;
+		return;
+	}
+
+	if (group->domain == group->default_domain)
+		return;
+
+	/* Detach by re-attaching to the default domain */
+	ret = __iommu_group_for_each_dev(group, group->default_domain,
+					 iommu_group_do_attach_device);
+	if (ret != 0)
+		WARN_ON(1);
+	else
+		group->domain = group->default_domain;
+}
+
+void iommu_detach_group(struct iommu_domain *domain, struct iommu_group *group)
+{
+	mutex_lock(&group->mutex);
+	__iommu_detach_group(domain, group);
+	mutex_unlock(&group->mutex);
+}
+EXPORT_SYMBOL_GPL(iommu_detach_group);
+
+phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
+{
+	if (unlikely(domain->ops->iova_to_phys == NULL))
+		return 0;
+
+	return domain->ops->iova_to_phys(domain, iova);
+}
+EXPORT_SYMBOL_GPL(iommu_iova_to_phys);
+
+static size_t iommu_pgsize(struct iommu_domain *domain,
+			   unsigned long addr_merge, size_t size)
+{
+	unsigned int pgsize_idx;
+	size_t pgsize;
+
+	/* Max page size that still fits into 'size' */
+	pgsize_idx = __fls(size);
+
+	/* need to consider alignment requirements ? */
+	if (likely(addr_merge)) {
+		/* Max page size allowed by address */
+		unsigned int align_pgsize_idx = __ffs(addr_merge);
+		pgsize_idx = min(pgsize_idx, align_pgsize_idx);
+	}
+
+	/* build a mask of acceptable page sizes */
+	pgsize = (1UL << (pgsize_idx + 1)) - 1;
+
+	/* throw away page sizes not supported by the hardware */
+	pgsize &= domain->pgsize_bitmap;
+
+	/* make sure we're still sane */
+	BUG_ON(!pgsize);
+
+	/* pick the biggest page */
+	pgsize_idx = __fls(pgsize);
+	pgsize = 1UL << pgsize_idx;
+
+	return pgsize;
+}
+
+int iommu_map(struct iommu_domain *domain, unsigned long iova,
+	      phys_addr_t paddr, size_t size, int prot)
+{
+	unsigned long orig_iova = iova;
+	unsigned int min_pagesz;
+	size_t orig_size = size;
+	phys_addr_t orig_paddr = paddr;
+	int ret = 0;
+
+	if (unlikely(domain->ops->map == NULL ||
+		     domain->pgsize_bitmap == 0UL))
+		return -ENODEV;
+
+	if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING)))
+		return -EINVAL;
+
+	/* find out the minimum page size supported */
+	min_pagesz = 1 << __ffs(domain->pgsize_bitmap);
+
+	/*
+	 * both the virtual address and the physical one, as well as
+	 * the size of the mapping, must be aligned (at least) to the
+	 * size of the smallest page supported by the hardware
+	 */
+	if (!IS_ALIGNED(iova | paddr | size, min_pagesz)) {
+		pr_err("unaligned: iova 0x%lx pa %pa size 0x%zx min_pagesz 0x%x\n",
+		       iova, &paddr, size, min_pagesz);
+		return -EINVAL;
+	}
+
+	pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size);
+
+	while (size) {
+		size_t pgsize = iommu_pgsize(domain, iova | paddr, size);
+
+		pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx\n",
+			 iova, &paddr, pgsize);
+
+		ret = domain->ops->map(domain, iova, paddr, pgsize, prot);
+		if (ret)
+			break;
+
+		iova += pgsize;
+		paddr += pgsize;
+		size -= pgsize;
+	}
+
+	/* unroll mapping in case something went wrong */
+	if (ret)
+		iommu_unmap(domain, orig_iova, orig_size - size);
+	else
+		trace_map(orig_iova, orig_paddr, orig_size);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_map);
+
+static size_t __iommu_unmap(struct iommu_domain *domain,
+			    unsigned long iova, size_t size,
+			    bool sync)
+{
+	const struct iommu_ops *ops = domain->ops;
+	size_t unmapped_page, unmapped = 0;
+	unsigned long orig_iova = iova;
+	unsigned int min_pagesz;
+
+	if (unlikely(ops->unmap == NULL ||
+		     domain->pgsize_bitmap == 0UL))
+		return 0;
+
+	if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING)))
+		return 0;
+
+	/* find out the minimum page size supported */
+	min_pagesz = 1 << __ffs(domain->pgsize_bitmap);
+
+	/*
+	 * The virtual address, as well as the size of the mapping, must be
+	 * aligned (at least) to the size of the smallest page supported
+	 * by the hardware
+	 */
+	if (!IS_ALIGNED(iova | size, min_pagesz)) {
+		pr_err("unaligned: iova 0x%lx size 0x%zx min_pagesz 0x%x\n",
+		       iova, size, min_pagesz);
+		return 0;
+	}
+
+	pr_debug("unmap this: iova 0x%lx size 0x%zx\n", iova, size);
+
+	/*
+	 * Keep iterating until we either unmap 'size' bytes (or more)
+	 * or we hit an area that isn't mapped.
+	 */
+	while (unmapped < size) {
+		size_t pgsize = iommu_pgsize(domain, iova, size - unmapped);
+
+		unmapped_page = ops->unmap(domain, iova, pgsize);
+		if (!unmapped_page)
+			break;
+
+		if (sync && ops->iotlb_range_add)
+			ops->iotlb_range_add(domain, iova, pgsize);
+
+		pr_debug("unmapped: iova 0x%lx size 0x%zx\n",
+			 iova, unmapped_page);
+
+		iova += unmapped_page;
+		unmapped += unmapped_page;
+	}
+
+	if (sync && ops->iotlb_sync)
+		ops->iotlb_sync(domain);
+
+	trace_unmap(orig_iova, size, unmapped);
+	return unmapped;
+}
+
+size_t iommu_unmap(struct iommu_domain *domain,
+		   unsigned long iova, size_t size)
+{
+	return __iommu_unmap(domain, iova, size, true);
+}
+EXPORT_SYMBOL_GPL(iommu_unmap);
+
+size_t iommu_unmap_fast(struct iommu_domain *domain,
+			unsigned long iova, size_t size)
+{
+	return __iommu_unmap(domain, iova, size, false);
+}
+EXPORT_SYMBOL_GPL(iommu_unmap_fast);
+
+size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
+		    struct scatterlist *sg, unsigned int nents, int prot)
+{
+	struct scatterlist *s;
+	size_t mapped = 0;
+	unsigned int i, min_pagesz;
+	int ret;
+
+	if (unlikely(domain->pgsize_bitmap == 0UL))
+		return 0;
+
+	min_pagesz = 1 << __ffs(domain->pgsize_bitmap);
+
+	for_each_sg(sg, s, nents, i) {
+		phys_addr_t phys = page_to_phys(sg_page(s)) + s->offset;
+
+		/*
+		 * We are mapping on IOMMU page boundaries, so offset within
+		 * the page must be 0. However, the IOMMU may support pages
+		 * smaller than PAGE_SIZE, so s->offset may still represent
+		 * an offset of that boundary within the CPU page.
+		 */
+		if (!IS_ALIGNED(s->offset, min_pagesz))
+			goto out_err;
+
+		ret = iommu_map(domain, iova + mapped, phys, s->length, prot);
+		if (ret)
+			goto out_err;
+
+		mapped += s->length;
+	}
+
+	return mapped;
+
+out_err:
+	/* undo mappings already done */
+	iommu_unmap(domain, iova, mapped);
+
+	return 0;
+
+}
+EXPORT_SYMBOL_GPL(iommu_map_sg);
+
+int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr,
+			       phys_addr_t paddr, u64 size, int prot)
+{
+	if (unlikely(domain->ops->domain_window_enable == NULL))
+		return -ENODEV;
+
+	return domain->ops->domain_window_enable(domain, wnd_nr, paddr, size,
+						 prot);
+}
+EXPORT_SYMBOL_GPL(iommu_domain_window_enable);
+
+void iommu_domain_window_disable(struct iommu_domain *domain, u32 wnd_nr)
+{
+	if (unlikely(domain->ops->domain_window_disable == NULL))
+		return;
+
+	return domain->ops->domain_window_disable(domain, wnd_nr);
+}
+EXPORT_SYMBOL_GPL(iommu_domain_window_disable);
+
+/**
+ * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework
+ * @domain: the iommu domain where the fault has happened
+ * @dev: the device where the fault has happened
+ * @iova: the faulting address
+ * @flags: mmu fault flags (e.g. IOMMU_FAULT_READ/IOMMU_FAULT_WRITE/...)
+ *
+ * This function should be called by the low-level IOMMU implementations
+ * whenever IOMMU faults happen, to allow high-level users, that are
+ * interested in such events, to know about them.
+ *
+ * This event may be useful for several possible use cases:
+ * - mere logging of the event
+ * - dynamic TLB/PTE loading
+ * - if restarting of the faulting device is required
+ *
+ * Returns 0 on success and an appropriate error code otherwise (if dynamic
+ * PTE/TLB loading will one day be supported, implementations will be able
+ * to tell whether it succeeded or not according to this return value).
+ *
+ * Specifically, -ENOSYS is returned if a fault handler isn't installed
+ * (though fault handlers can also return -ENOSYS, in case they want to
+ * elicit the default behavior of the IOMMU drivers).
+ */
+int report_iommu_fault(struct iommu_domain *domain, struct device *dev,
+		       unsigned long iova, int flags)
+{
+	int ret = -ENOSYS;
+
+	/*
+	 * if upper layers showed interest and installed a fault handler,
+	 * invoke it.
+	 */
+	if (domain->handler)
+		ret = domain->handler(domain, dev, iova, flags,
+						domain->handler_token);
+
+	trace_io_page_fault(dev, iova, flags);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(report_iommu_fault);
+
+static int __init iommu_init(void)
+{
+	iommu_group_kset = kset_create_and_add("iommu_groups",
+					       NULL, kernel_kobj);
+	BUG_ON(!iommu_group_kset);
+
+	iommu_debugfs_setup();
+
+	return 0;
+}
+core_initcall(iommu_init);
+
+int iommu_domain_get_attr(struct iommu_domain *domain,
+			  enum iommu_attr attr, void *data)
+{
+	struct iommu_domain_geometry *geometry;
+	bool *paging;
+	int ret = 0;
+	u32 *count;
+
+	switch (attr) {
+	case DOMAIN_ATTR_GEOMETRY:
+		geometry  = data;
+		*geometry = domain->geometry;
+
+		break;
+	case DOMAIN_ATTR_PAGING:
+		paging  = data;
+		*paging = (domain->pgsize_bitmap != 0UL);
+		break;
+	case DOMAIN_ATTR_WINDOWS:
+		count = data;
+
+		if (domain->ops->domain_get_windows != NULL)
+			*count = domain->ops->domain_get_windows(domain);
+		else
+			ret = -ENODEV;
+
+		break;
+	default:
+		if (!domain->ops->domain_get_attr)
+			return -EINVAL;
+
+		ret = domain->ops->domain_get_attr(domain, attr, data);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_domain_get_attr);
+
+int iommu_domain_set_attr(struct iommu_domain *domain,
+			  enum iommu_attr attr, void *data)
+{
+	int ret = 0;
+	u32 *count;
+
+	switch (attr) {
+	case DOMAIN_ATTR_WINDOWS:
+		count = data;
+
+		if (domain->ops->domain_set_windows != NULL)
+			ret = domain->ops->domain_set_windows(domain, *count);
+		else
+			ret = -ENODEV;
+
+		break;
+	default:
+		if (domain->ops->domain_set_attr == NULL)
+			return -EINVAL;
+
+		ret = domain->ops->domain_set_attr(domain, attr, data);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_domain_set_attr);
+
+void iommu_get_resv_regions(struct device *dev, struct list_head *list)
+{
+	const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+	if (ops && ops->get_resv_regions)
+		ops->get_resv_regions(dev, list);
+}
+
+void iommu_put_resv_regions(struct device *dev, struct list_head *list)
+{
+	const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+	if (ops && ops->put_resv_regions)
+		ops->put_resv_regions(dev, list);
+}
+
+struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start,
+						  size_t length, int prot,
+						  enum iommu_resv_type type)
+{
+	struct iommu_resv_region *region;
+
+	region = kzalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return NULL;
+
+	INIT_LIST_HEAD(&region->list);
+	region->start = start;
+	region->length = length;
+	region->prot = prot;
+	region->type = type;
+	return region;
+}
+
+/* Request that a device is direct mapped by the IOMMU */
+int iommu_request_dm_for_dev(struct device *dev)
+{
+	struct iommu_domain *dm_domain;
+	struct iommu_group *group;
+	int ret;
+
+	/* Device must already be in a group before calling this function */
+	group = iommu_group_get(dev);
+	if (!group)
+		return -EINVAL;
+
+	mutex_lock(&group->mutex);
+
+	/* Check if the default domain is already direct mapped */
+	ret = 0;
+	if (group->default_domain &&
+	    group->default_domain->type == IOMMU_DOMAIN_IDENTITY)
+		goto out;
+
+	/* Don't change mappings of existing devices */
+	ret = -EBUSY;
+	if (iommu_group_device_count(group) != 1)
+		goto out;
+
+	/* Allocate a direct mapped domain */
+	ret = -ENOMEM;
+	dm_domain = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_IDENTITY);
+	if (!dm_domain)
+		goto out;
+
+	/* Attach the device to the domain */
+	ret = __iommu_attach_group(dm_domain, group);
+	if (ret) {
+		iommu_domain_free(dm_domain);
+		goto out;
+	}
+
+	/* Make the direct mapped domain the default for this group */
+	if (group->default_domain)
+		iommu_domain_free(group->default_domain);
+	group->default_domain = dm_domain;
+
+	pr_info("Using direct mapping for device %s\n", dev_name(dev));
+
+	ret = 0;
+out:
+	mutex_unlock(&group->mutex);
+	iommu_group_put(group);
+
+	return ret;
+}
+
+const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode)
+{
+	const struct iommu_ops *ops = NULL;
+	struct iommu_device *iommu;
+
+	spin_lock(&iommu_device_lock);
+	list_for_each_entry(iommu, &iommu_device_list, list)
+		if (iommu->fwnode == fwnode) {
+			ops = iommu->ops;
+			break;
+		}
+	spin_unlock(&iommu_device_lock);
+	return ops;
+}
+
+int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
+		      const struct iommu_ops *ops)
+{
+	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+
+	if (fwspec)
+		return ops == fwspec->ops ? 0 : -EINVAL;
+
+	fwspec = kzalloc(sizeof(*fwspec), GFP_KERNEL);
+	if (!fwspec)
+		return -ENOMEM;
+
+	of_node_get(to_of_node(iommu_fwnode));
+	fwspec->iommu_fwnode = iommu_fwnode;
+	fwspec->ops = ops;
+	dev->iommu_fwspec = fwspec;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_fwspec_init);
+
+void iommu_fwspec_free(struct device *dev)
+{
+	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+
+	if (fwspec) {
+		fwnode_handle_put(fwspec->iommu_fwnode);
+		kfree(fwspec);
+		dev->iommu_fwspec = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(iommu_fwspec_free);
+
+int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids)
+{
+	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+	size_t size;
+	int i;
+
+	if (!fwspec)
+		return -EINVAL;
+
+	size = offsetof(struct iommu_fwspec, ids[fwspec->num_ids + num_ids]);
+	if (size > sizeof(*fwspec)) {
+		fwspec = krealloc(dev->iommu_fwspec, size, GFP_KERNEL);
+		if (!fwspec)
+			return -ENOMEM;
+
+		dev->iommu_fwspec = fwspec;
+	}
+
+	for (i = 0; i < num_ids; i++)
+		fwspec->ids[fwspec->num_ids + i] = ids[i];
+
+	fwspec->num_ids += num_ids;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids);
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
new file mode 100644
index 000000000..fdd68d8e8
--- /dev/null
+++ b/drivers/iommu/iova.c
@@ -0,0 +1,1052 @@
+/*
+ * Copyright © 2006-2009, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ */
+
+#include <linux/iova.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/bitops.h>
+#include <linux/cpu.h>
+
+/* The anchor node sits above the top of the usable address space */
+#define IOVA_ANCHOR	~0UL
+
+static bool iova_rcache_insert(struct iova_domain *iovad,
+			       unsigned long pfn,
+			       unsigned long size);
+static unsigned long iova_rcache_get(struct iova_domain *iovad,
+				     unsigned long size,
+				     unsigned long limit_pfn);
+static void init_iova_rcaches(struct iova_domain *iovad);
+static void free_iova_rcaches(struct iova_domain *iovad);
+static void fq_destroy_all_entries(struct iova_domain *iovad);
+static void fq_flush_timeout(struct timer_list *t);
+
+void
+init_iova_domain(struct iova_domain *iovad, unsigned long granule,
+	unsigned long start_pfn)
+{
+	/*
+	 * IOVA granularity will normally be equal to the smallest
+	 * supported IOMMU page size; both *must* be capable of
+	 * representing individual CPU pages exactly.
+	 */
+	BUG_ON((granule > PAGE_SIZE) || !is_power_of_2(granule));
+
+	spin_lock_init(&iovad->iova_rbtree_lock);
+	iovad->rbroot = RB_ROOT;
+	iovad->cached_node = &iovad->anchor.node;
+	iovad->cached32_node = &iovad->anchor.node;
+	iovad->granule = granule;
+	iovad->start_pfn = start_pfn;
+	iovad->dma_32bit_pfn = 1UL << (32 - iova_shift(iovad));
+	iovad->flush_cb = NULL;
+	iovad->fq = NULL;
+	iovad->anchor.pfn_lo = iovad->anchor.pfn_hi = IOVA_ANCHOR;
+	rb_link_node(&iovad->anchor.node, NULL, &iovad->rbroot.rb_node);
+	rb_insert_color(&iovad->anchor.node, &iovad->rbroot);
+	init_iova_rcaches(iovad);
+}
+EXPORT_SYMBOL_GPL(init_iova_domain);
+
+bool has_iova_flush_queue(struct iova_domain *iovad)
+{
+	return !!iovad->fq;
+}
+
+static void free_iova_flush_queue(struct iova_domain *iovad)
+{
+	if (!has_iova_flush_queue(iovad))
+		return;
+
+	del_timer_sync(&iovad->fq_timer);
+
+	fq_destroy_all_entries(iovad);
+
+	free_percpu(iovad->fq);
+
+	iovad->fq         = NULL;
+	iovad->flush_cb   = NULL;
+	iovad->entry_dtor = NULL;
+}
+
+int init_iova_flush_queue(struct iova_domain *iovad,
+			  iova_flush_cb flush_cb, iova_entry_dtor entry_dtor)
+{
+	struct iova_fq __percpu *queue;
+	int cpu;
+
+	atomic64_set(&iovad->fq_flush_start_cnt,  0);
+	atomic64_set(&iovad->fq_flush_finish_cnt, 0);
+
+	queue = alloc_percpu(struct iova_fq);
+	if (!queue)
+		return -ENOMEM;
+
+	iovad->flush_cb   = flush_cb;
+	iovad->entry_dtor = entry_dtor;
+
+	for_each_possible_cpu(cpu) {
+		struct iova_fq *fq;
+
+		fq = per_cpu_ptr(queue, cpu);
+		fq->head = 0;
+		fq->tail = 0;
+
+		spin_lock_init(&fq->lock);
+	}
+
+	smp_wmb();
+
+	iovad->fq = queue;
+
+	timer_setup(&iovad->fq_timer, fq_flush_timeout, 0);
+	atomic_set(&iovad->fq_timer_on, 0);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(init_iova_flush_queue);
+
+static struct rb_node *
+__get_cached_rbnode(struct iova_domain *iovad, unsigned long limit_pfn)
+{
+	if (limit_pfn <= iovad->dma_32bit_pfn)
+		return iovad->cached32_node;
+
+	return iovad->cached_node;
+}
+
+static void
+__cached_rbnode_insert_update(struct iova_domain *iovad, struct iova *new)
+{
+	if (new->pfn_hi < iovad->dma_32bit_pfn)
+		iovad->cached32_node = &new->node;
+	else
+		iovad->cached_node = &new->node;
+}
+
+static void
+__cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
+{
+	struct iova *cached_iova;
+
+	cached_iova = rb_entry(iovad->cached32_node, struct iova, node);
+	if (free == cached_iova ||
+	    (free->pfn_hi < iovad->dma_32bit_pfn &&
+	     free->pfn_lo >= cached_iova->pfn_lo))
+		iovad->cached32_node = rb_next(&free->node);
+
+	cached_iova = rb_entry(iovad->cached_node, struct iova, node);
+	if (free->pfn_lo >= cached_iova->pfn_lo)
+		iovad->cached_node = rb_next(&free->node);
+}
+
+/* Insert the iova into domain rbtree by holding writer lock */
+static void
+iova_insert_rbtree(struct rb_root *root, struct iova *iova,
+		   struct rb_node *start)
+{
+	struct rb_node **new, *parent = NULL;
+
+	new = (start) ? &start : &(root->rb_node);
+	/* Figure out where to put new node */
+	while (*new) {
+		struct iova *this = rb_entry(*new, struct iova, node);
+
+		parent = *new;
+
+		if (iova->pfn_lo < this->pfn_lo)
+			new = &((*new)->rb_left);
+		else if (iova->pfn_lo > this->pfn_lo)
+			new = &((*new)->rb_right);
+		else {
+			WARN_ON(1); /* this should not happen */
+			return;
+		}
+	}
+	/* Add new node and rebalance tree. */
+	rb_link_node(&iova->node, parent, new);
+	rb_insert_color(&iova->node, root);
+}
+
+static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
+		unsigned long size, unsigned long limit_pfn,
+			struct iova *new, bool size_aligned)
+{
+	struct rb_node *curr, *prev;
+	struct iova *curr_iova;
+	unsigned long flags;
+	unsigned long new_pfn;
+	unsigned long align_mask = ~0UL;
+
+	if (size_aligned)
+		align_mask <<= fls_long(size - 1);
+
+	/* Walk the tree backwards */
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	curr = __get_cached_rbnode(iovad, limit_pfn);
+	curr_iova = rb_entry(curr, struct iova, node);
+	do {
+		limit_pfn = min(limit_pfn, curr_iova->pfn_lo);
+		new_pfn = (limit_pfn - size) & align_mask;
+		prev = curr;
+		curr = rb_prev(curr);
+		curr_iova = rb_entry(curr, struct iova, node);
+	} while (curr && new_pfn <= curr_iova->pfn_hi);
+
+	if (limit_pfn < size || new_pfn < iovad->start_pfn) {
+		spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+		return -ENOMEM;
+	}
+
+	/* pfn_lo will point to size aligned address if size_aligned is set */
+	new->pfn_lo = new_pfn;
+	new->pfn_hi = new->pfn_lo + size - 1;
+
+	/* If we have 'prev', it's a valid place to start the insertion. */
+	iova_insert_rbtree(&iovad->rbroot, new, prev);
+	__cached_rbnode_insert_update(iovad, new);
+
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+
+
+	return 0;
+}
+
+static struct kmem_cache *iova_cache;
+static unsigned int iova_cache_users;
+static DEFINE_MUTEX(iova_cache_mutex);
+
+struct iova *alloc_iova_mem(void)
+{
+	return kmem_cache_zalloc(iova_cache, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(alloc_iova_mem);
+
+void free_iova_mem(struct iova *iova)
+{
+	if (iova->pfn_lo != IOVA_ANCHOR)
+		kmem_cache_free(iova_cache, iova);
+}
+EXPORT_SYMBOL(free_iova_mem);
+
+int iova_cache_get(void)
+{
+	mutex_lock(&iova_cache_mutex);
+	if (!iova_cache_users) {
+		iova_cache = kmem_cache_create(
+			"iommu_iova", sizeof(struct iova), 0,
+			SLAB_HWCACHE_ALIGN, NULL);
+		if (!iova_cache) {
+			mutex_unlock(&iova_cache_mutex);
+			printk(KERN_ERR "Couldn't create iova cache\n");
+			return -ENOMEM;
+		}
+	}
+
+	iova_cache_users++;
+	mutex_unlock(&iova_cache_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iova_cache_get);
+
+void iova_cache_put(void)
+{
+	mutex_lock(&iova_cache_mutex);
+	if (WARN_ON(!iova_cache_users)) {
+		mutex_unlock(&iova_cache_mutex);
+		return;
+	}
+	iova_cache_users--;
+	if (!iova_cache_users)
+		kmem_cache_destroy(iova_cache);
+	mutex_unlock(&iova_cache_mutex);
+}
+EXPORT_SYMBOL_GPL(iova_cache_put);
+
+/**
+ * alloc_iova - allocates an iova
+ * @iovad: - iova domain in question
+ * @size: - size of page frames to allocate
+ * @limit_pfn: - max limit address
+ * @size_aligned: - set if size_aligned address range is required
+ * This function allocates an iova in the range iovad->start_pfn to limit_pfn,
+ * searching top-down from limit_pfn to iovad->start_pfn. If the size_aligned
+ * flag is set then the allocated address iova->pfn_lo will be naturally
+ * aligned on roundup_power_of_two(size).
+ */
+struct iova *
+alloc_iova(struct iova_domain *iovad, unsigned long size,
+	unsigned long limit_pfn,
+	bool size_aligned)
+{
+	struct iova *new_iova;
+	int ret;
+
+	new_iova = alloc_iova_mem();
+	if (!new_iova)
+		return NULL;
+
+	ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn + 1,
+			new_iova, size_aligned);
+
+	if (ret) {
+		free_iova_mem(new_iova);
+		return NULL;
+	}
+
+	return new_iova;
+}
+EXPORT_SYMBOL_GPL(alloc_iova);
+
+static struct iova *
+private_find_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+	struct rb_node *node = iovad->rbroot.rb_node;
+
+	assert_spin_locked(&iovad->iova_rbtree_lock);
+
+	while (node) {
+		struct iova *iova = rb_entry(node, struct iova, node);
+
+		if (pfn < iova->pfn_lo)
+			node = node->rb_left;
+		else if (pfn > iova->pfn_hi)
+			node = node->rb_right;
+		else
+			return iova;	/* pfn falls within iova's range */
+	}
+
+	return NULL;
+}
+
+static void private_free_iova(struct iova_domain *iovad, struct iova *iova)
+{
+	assert_spin_locked(&iovad->iova_rbtree_lock);
+	__cached_rbnode_delete_update(iovad, iova);
+	rb_erase(&iova->node, &iovad->rbroot);
+	free_iova_mem(iova);
+}
+
+/**
+ * find_iova - finds an iova for a given pfn
+ * @iovad: - iova domain in question.
+ * @pfn: - page frame number
+ * This function finds and returns an iova belonging to the
+ * given doamin which matches the given pfn.
+ */
+struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+	unsigned long flags;
+	struct iova *iova;
+
+	/* Take the lock so that no other thread is manipulating the rbtree */
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	iova = private_find_iova(iovad, pfn);
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+	return iova;
+}
+EXPORT_SYMBOL_GPL(find_iova);
+
+/**
+ * __free_iova - frees the given iova
+ * @iovad: iova domain in question.
+ * @iova: iova in question.
+ * Frees the given iova belonging to the giving domain
+ */
+void
+__free_iova(struct iova_domain *iovad, struct iova *iova)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	private_free_iova(iovad, iova);
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+}
+EXPORT_SYMBOL_GPL(__free_iova);
+
+/**
+ * free_iova - finds and frees the iova for a given pfn
+ * @iovad: - iova domain in question.
+ * @pfn: - pfn that is allocated previously
+ * This functions finds an iova for a given pfn and then
+ * frees the iova from that domain.
+ */
+void
+free_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+	struct iova *iova = find_iova(iovad, pfn);
+
+	if (iova)
+		__free_iova(iovad, iova);
+
+}
+EXPORT_SYMBOL_GPL(free_iova);
+
+/**
+ * alloc_iova_fast - allocates an iova from rcache
+ * @iovad: - iova domain in question
+ * @size: - size of page frames to allocate
+ * @limit_pfn: - max limit address
+ * @flush_rcache: - set to flush rcache on regular allocation failure
+ * This function tries to satisfy an iova allocation from the rcache,
+ * and falls back to regular allocation on failure. If regular allocation
+ * fails too and the flush_rcache flag is set then the rcache will be flushed.
+*/
+unsigned long
+alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
+		unsigned long limit_pfn, bool flush_rcache)
+{
+	unsigned long iova_pfn;
+	struct iova *new_iova;
+
+	iova_pfn = iova_rcache_get(iovad, size, limit_pfn + 1);
+	if (iova_pfn)
+		return iova_pfn;
+
+retry:
+	new_iova = alloc_iova(iovad, size, limit_pfn, true);
+	if (!new_iova) {
+		unsigned int cpu;
+
+		if (!flush_rcache)
+			return 0;
+
+		/* Try replenishing IOVAs by flushing rcache. */
+		flush_rcache = false;
+		for_each_online_cpu(cpu)
+			free_cpu_cached_iovas(cpu, iovad);
+		goto retry;
+	}
+
+	return new_iova->pfn_lo;
+}
+EXPORT_SYMBOL_GPL(alloc_iova_fast);
+
+/**
+ * free_iova_fast - free iova pfn range into rcache
+ * @iovad: - iova domain in question.
+ * @pfn: - pfn that is allocated previously
+ * @size: - # of pages in range
+ * This functions frees an iova range by trying to put it into the rcache,
+ * falling back to regular iova deallocation via free_iova() if this fails.
+ */
+void
+free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size)
+{
+	if (iova_rcache_insert(iovad, pfn, size))
+		return;
+
+	free_iova(iovad, pfn);
+}
+EXPORT_SYMBOL_GPL(free_iova_fast);
+
+#define fq_ring_for_each(i, fq) \
+	for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) % IOVA_FQ_SIZE)
+
+static inline bool fq_full(struct iova_fq *fq)
+{
+	assert_spin_locked(&fq->lock);
+	return (((fq->tail + 1) % IOVA_FQ_SIZE) == fq->head);
+}
+
+static inline unsigned fq_ring_add(struct iova_fq *fq)
+{
+	unsigned idx = fq->tail;
+
+	assert_spin_locked(&fq->lock);
+
+	fq->tail = (idx + 1) % IOVA_FQ_SIZE;
+
+	return idx;
+}
+
+static void fq_ring_free(struct iova_domain *iovad, struct iova_fq *fq)
+{
+	u64 counter = atomic64_read(&iovad->fq_flush_finish_cnt);
+	unsigned idx;
+
+	assert_spin_locked(&fq->lock);
+
+	fq_ring_for_each(idx, fq) {
+
+		if (fq->entries[idx].counter >= counter)
+			break;
+
+		if (iovad->entry_dtor)
+			iovad->entry_dtor(fq->entries[idx].data);
+
+		free_iova_fast(iovad,
+			       fq->entries[idx].iova_pfn,
+			       fq->entries[idx].pages);
+
+		fq->head = (fq->head + 1) % IOVA_FQ_SIZE;
+	}
+}
+
+static void iova_domain_flush(struct iova_domain *iovad)
+{
+	atomic64_inc(&iovad->fq_flush_start_cnt);
+	iovad->flush_cb(iovad);
+	atomic64_inc(&iovad->fq_flush_finish_cnt);
+}
+
+static void fq_destroy_all_entries(struct iova_domain *iovad)
+{
+	int cpu;
+
+	/*
+	 * This code runs when the iova_domain is being detroyed, so don't
+	 * bother to free iovas, just call the entry_dtor on all remaining
+	 * entries.
+	 */
+	if (!iovad->entry_dtor)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct iova_fq *fq = per_cpu_ptr(iovad->fq, cpu);
+		int idx;
+
+		fq_ring_for_each(idx, fq)
+			iovad->entry_dtor(fq->entries[idx].data);
+	}
+}
+
+static void fq_flush_timeout(struct timer_list *t)
+{
+	struct iova_domain *iovad = from_timer(iovad, t, fq_timer);
+	int cpu;
+
+	atomic_set(&iovad->fq_timer_on, 0);
+	iova_domain_flush(iovad);
+
+	for_each_possible_cpu(cpu) {
+		unsigned long flags;
+		struct iova_fq *fq;
+
+		fq = per_cpu_ptr(iovad->fq, cpu);
+		spin_lock_irqsave(&fq->lock, flags);
+		fq_ring_free(iovad, fq);
+		spin_unlock_irqrestore(&fq->lock, flags);
+	}
+}
+
+void queue_iova(struct iova_domain *iovad,
+		unsigned long pfn, unsigned long pages,
+		unsigned long data)
+{
+	struct iova_fq *fq = raw_cpu_ptr(iovad->fq);
+	unsigned long flags;
+	unsigned idx;
+
+	spin_lock_irqsave(&fq->lock, flags);
+
+	/*
+	 * First remove all entries from the flush queue that have already been
+	 * flushed out on another CPU. This makes the fq_full() check below less
+	 * likely to be true.
+	 */
+	fq_ring_free(iovad, fq);
+
+	if (fq_full(fq)) {
+		iova_domain_flush(iovad);
+		fq_ring_free(iovad, fq);
+	}
+
+	idx = fq_ring_add(fq);
+
+	fq->entries[idx].iova_pfn = pfn;
+	fq->entries[idx].pages    = pages;
+	fq->entries[idx].data     = data;
+	fq->entries[idx].counter  = atomic64_read(&iovad->fq_flush_start_cnt);
+
+	spin_unlock_irqrestore(&fq->lock, flags);
+
+	/* Avoid false sharing as much as possible. */
+	if (!atomic_read(&iovad->fq_timer_on) &&
+	    !atomic_cmpxchg(&iovad->fq_timer_on, 0, 1))
+		mod_timer(&iovad->fq_timer,
+			  jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT));
+}
+EXPORT_SYMBOL_GPL(queue_iova);
+
+/**
+ * put_iova_domain - destroys the iova doamin
+ * @iovad: - iova domain in question.
+ * All the iova's in that domain are destroyed.
+ */
+void put_iova_domain(struct iova_domain *iovad)
+{
+	struct iova *iova, *tmp;
+
+	free_iova_flush_queue(iovad);
+	free_iova_rcaches(iovad);
+	rbtree_postorder_for_each_entry_safe(iova, tmp, &iovad->rbroot, node)
+		free_iova_mem(iova);
+}
+EXPORT_SYMBOL_GPL(put_iova_domain);
+
+static int
+__is_range_overlap(struct rb_node *node,
+	unsigned long pfn_lo, unsigned long pfn_hi)
+{
+	struct iova *iova = rb_entry(node, struct iova, node);
+
+	if ((pfn_lo <= iova->pfn_hi) && (pfn_hi >= iova->pfn_lo))
+		return 1;
+	return 0;
+}
+
+static inline struct iova *
+alloc_and_init_iova(unsigned long pfn_lo, unsigned long pfn_hi)
+{
+	struct iova *iova;
+
+	iova = alloc_iova_mem();
+	if (iova) {
+		iova->pfn_lo = pfn_lo;
+		iova->pfn_hi = pfn_hi;
+	}
+
+	return iova;
+}
+
+static struct iova *
+__insert_new_range(struct iova_domain *iovad,
+	unsigned long pfn_lo, unsigned long pfn_hi)
+{
+	struct iova *iova;
+
+	iova = alloc_and_init_iova(pfn_lo, pfn_hi);
+	if (iova)
+		iova_insert_rbtree(&iovad->rbroot, iova, NULL);
+
+	return iova;
+}
+
+static void
+__adjust_overlap_range(struct iova *iova,
+	unsigned long *pfn_lo, unsigned long *pfn_hi)
+{
+	if (*pfn_lo < iova->pfn_lo)
+		iova->pfn_lo = *pfn_lo;
+	if (*pfn_hi > iova->pfn_hi)
+		*pfn_lo = iova->pfn_hi + 1;
+}
+
+/**
+ * reserve_iova - reserves an iova in the given range
+ * @iovad: - iova domain pointer
+ * @pfn_lo: - lower page frame address
+ * @pfn_hi:- higher pfn adderss
+ * This function allocates reserves the address range from pfn_lo to pfn_hi so
+ * that this address is not dished out as part of alloc_iova.
+ */
+struct iova *
+reserve_iova(struct iova_domain *iovad,
+	unsigned long pfn_lo, unsigned long pfn_hi)
+{
+	struct rb_node *node;
+	unsigned long flags;
+	struct iova *iova;
+	unsigned int overlap = 0;
+
+	/* Don't allow nonsensical pfns */
+	if (WARN_ON((pfn_hi | pfn_lo) > (ULLONG_MAX >> iova_shift(iovad))))
+		return NULL;
+
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) {
+		if (__is_range_overlap(node, pfn_lo, pfn_hi)) {
+			iova = rb_entry(node, struct iova, node);
+			__adjust_overlap_range(iova, &pfn_lo, &pfn_hi);
+			if ((pfn_lo >= iova->pfn_lo) &&
+				(pfn_hi <= iova->pfn_hi))
+				goto finish;
+			overlap = 1;
+
+		} else if (overlap)
+				break;
+	}
+
+	/* We are here either because this is the first reserver node
+	 * or need to insert remaining non overlap addr range
+	 */
+	iova = __insert_new_range(iovad, pfn_lo, pfn_hi);
+finish:
+
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+	return iova;
+}
+EXPORT_SYMBOL_GPL(reserve_iova);
+
+/**
+ * copy_reserved_iova - copies the reserved between domains
+ * @from: - source doamin from where to copy
+ * @to: - destination domin where to copy
+ * This function copies reserved iova's from one doamin to
+ * other.
+ */
+void
+copy_reserved_iova(struct iova_domain *from, struct iova_domain *to)
+{
+	unsigned long flags;
+	struct rb_node *node;
+
+	spin_lock_irqsave(&from->iova_rbtree_lock, flags);
+	for (node = rb_first(&from->rbroot); node; node = rb_next(node)) {
+		struct iova *iova = rb_entry(node, struct iova, node);
+		struct iova *new_iova;
+
+		if (iova->pfn_lo == IOVA_ANCHOR)
+			continue;
+
+		new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi);
+		if (!new_iova)
+			printk(KERN_ERR "Reserve iova range %lx@%lx failed\n",
+				iova->pfn_lo, iova->pfn_lo);
+	}
+	spin_unlock_irqrestore(&from->iova_rbtree_lock, flags);
+}
+EXPORT_SYMBOL_GPL(copy_reserved_iova);
+
+struct iova *
+split_and_remove_iova(struct iova_domain *iovad, struct iova *iova,
+		      unsigned long pfn_lo, unsigned long pfn_hi)
+{
+	unsigned long flags;
+	struct iova *prev = NULL, *next = NULL;
+
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	if (iova->pfn_lo < pfn_lo) {
+		prev = alloc_and_init_iova(iova->pfn_lo, pfn_lo - 1);
+		if (prev == NULL)
+			goto error;
+	}
+	if (iova->pfn_hi > pfn_hi) {
+		next = alloc_and_init_iova(pfn_hi + 1, iova->pfn_hi);
+		if (next == NULL)
+			goto error;
+	}
+
+	__cached_rbnode_delete_update(iovad, iova);
+	rb_erase(&iova->node, &iovad->rbroot);
+
+	if (prev) {
+		iova_insert_rbtree(&iovad->rbroot, prev, NULL);
+		iova->pfn_lo = pfn_lo;
+	}
+	if (next) {
+		iova_insert_rbtree(&iovad->rbroot, next, NULL);
+		iova->pfn_hi = pfn_hi;
+	}
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+
+	return iova;
+
+error:
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+	if (prev)
+		free_iova_mem(prev);
+	return NULL;
+}
+
+/*
+ * Magazine caches for IOVA ranges.  For an introduction to magazines,
+ * see the USENIX 2001 paper "Magazines and Vmem: Extending the Slab
+ * Allocator to Many CPUs and Arbitrary Resources" by Bonwick and Adams.
+ * For simplicity, we use a static magazine size and don't implement the
+ * dynamic size tuning described in the paper.
+ */
+
+#define IOVA_MAG_SIZE 128
+
+struct iova_magazine {
+	unsigned long size;
+	unsigned long pfns[IOVA_MAG_SIZE];
+};
+
+struct iova_cpu_rcache {
+	spinlock_t lock;
+	struct iova_magazine *loaded;
+	struct iova_magazine *prev;
+};
+
+static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
+{
+	return kzalloc(sizeof(struct iova_magazine), flags);
+}
+
+static void iova_magazine_free(struct iova_magazine *mag)
+{
+	kfree(mag);
+}
+
+static void
+iova_magazine_free_pfns(struct iova_magazine *mag, struct iova_domain *iovad)
+{
+	unsigned long flags;
+	int i;
+
+	if (!mag)
+		return;
+
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+
+	for (i = 0 ; i < mag->size; ++i) {
+		struct iova *iova = private_find_iova(iovad, mag->pfns[i]);
+
+		if (WARN_ON(!iova))
+			continue;
+
+		private_free_iova(iovad, iova);
+	}
+
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+
+	mag->size = 0;
+}
+
+static bool iova_magazine_full(struct iova_magazine *mag)
+{
+	return (mag && mag->size == IOVA_MAG_SIZE);
+}
+
+static bool iova_magazine_empty(struct iova_magazine *mag)
+{
+	return (!mag || mag->size == 0);
+}
+
+static unsigned long iova_magazine_pop(struct iova_magazine *mag,
+				       unsigned long limit_pfn)
+{
+	int i;
+	unsigned long pfn;
+
+	BUG_ON(iova_magazine_empty(mag));
+
+	/* Only fall back to the rbtree if we have no suitable pfns at all */
+	for (i = mag->size - 1; mag->pfns[i] > limit_pfn; i--)
+		if (i == 0)
+			return 0;
+
+	/* Swap it to pop it */
+	pfn = mag->pfns[i];
+	mag->pfns[i] = mag->pfns[--mag->size];
+
+	return pfn;
+}
+
+static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn)
+{
+	BUG_ON(iova_magazine_full(mag));
+
+	mag->pfns[mag->size++] = pfn;
+}
+
+static void init_iova_rcaches(struct iova_domain *iovad)
+{
+	struct iova_cpu_rcache *cpu_rcache;
+	struct iova_rcache *rcache;
+	unsigned int cpu;
+	int i;
+
+	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+		rcache = &iovad->rcaches[i];
+		spin_lock_init(&rcache->lock);
+		rcache->depot_size = 0;
+		rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache), cache_line_size());
+		if (WARN_ON(!rcache->cpu_rcaches))
+			continue;
+		for_each_possible_cpu(cpu) {
+			cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+			spin_lock_init(&cpu_rcache->lock);
+			cpu_rcache->loaded = iova_magazine_alloc(GFP_KERNEL);
+			cpu_rcache->prev = iova_magazine_alloc(GFP_KERNEL);
+		}
+	}
+}
+
+/*
+ * Try inserting IOVA range starting with 'iova_pfn' into 'rcache', and
+ * return true on success.  Can fail if rcache is full and we can't free
+ * space, and free_iova() (our only caller) will then return the IOVA
+ * range to the rbtree instead.
+ */
+static bool __iova_rcache_insert(struct iova_domain *iovad,
+				 struct iova_rcache *rcache,
+				 unsigned long iova_pfn)
+{
+	struct iova_magazine *mag_to_free = NULL;
+	struct iova_cpu_rcache *cpu_rcache;
+	bool can_insert = false;
+	unsigned long flags;
+
+	cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
+	spin_lock_irqsave(&cpu_rcache->lock, flags);
+
+	if (!iova_magazine_full(cpu_rcache->loaded)) {
+		can_insert = true;
+	} else if (!iova_magazine_full(cpu_rcache->prev)) {
+		swap(cpu_rcache->prev, cpu_rcache->loaded);
+		can_insert = true;
+	} else {
+		struct iova_magazine *new_mag = iova_magazine_alloc(GFP_ATOMIC);
+
+		if (new_mag) {
+			spin_lock(&rcache->lock);
+			if (rcache->depot_size < MAX_GLOBAL_MAGS) {
+				rcache->depot[rcache->depot_size++] =
+						cpu_rcache->loaded;
+			} else {
+				mag_to_free = cpu_rcache->loaded;
+			}
+			spin_unlock(&rcache->lock);
+
+			cpu_rcache->loaded = new_mag;
+			can_insert = true;
+		}
+	}
+
+	if (can_insert)
+		iova_magazine_push(cpu_rcache->loaded, iova_pfn);
+
+	spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+
+	if (mag_to_free) {
+		iova_magazine_free_pfns(mag_to_free, iovad);
+		iova_magazine_free(mag_to_free);
+	}
+
+	return can_insert;
+}
+
+static bool iova_rcache_insert(struct iova_domain *iovad, unsigned long pfn,
+			       unsigned long size)
+{
+	unsigned int log_size = order_base_2(size);
+
+	if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
+		return false;
+
+	return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn);
+}
+
+/*
+ * Caller wants to allocate a new IOVA range from 'rcache'.  If we can
+ * satisfy the request, return a matching non-NULL range and remove
+ * it from the 'rcache'.
+ */
+static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
+				       unsigned long limit_pfn)
+{
+	struct iova_cpu_rcache *cpu_rcache;
+	unsigned long iova_pfn = 0;
+	bool has_pfn = false;
+	unsigned long flags;
+
+	cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
+	spin_lock_irqsave(&cpu_rcache->lock, flags);
+
+	if (!iova_magazine_empty(cpu_rcache->loaded)) {
+		has_pfn = true;
+	} else if (!iova_magazine_empty(cpu_rcache->prev)) {
+		swap(cpu_rcache->prev, cpu_rcache->loaded);
+		has_pfn = true;
+	} else {
+		spin_lock(&rcache->lock);
+		if (rcache->depot_size > 0) {
+			iova_magazine_free(cpu_rcache->loaded);
+			cpu_rcache->loaded = rcache->depot[--rcache->depot_size];
+			has_pfn = true;
+		}
+		spin_unlock(&rcache->lock);
+	}
+
+	if (has_pfn)
+		iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn);
+
+	spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+
+	return iova_pfn;
+}
+
+/*
+ * Try to satisfy IOVA allocation range from rcache.  Fail if requested
+ * size is too big or the DMA limit we are given isn't satisfied by the
+ * top element in the magazine.
+ */
+static unsigned long iova_rcache_get(struct iova_domain *iovad,
+				     unsigned long size,
+				     unsigned long limit_pfn)
+{
+	unsigned int log_size = order_base_2(size);
+
+	if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
+		return 0;
+
+	return __iova_rcache_get(&iovad->rcaches[log_size], limit_pfn - size);
+}
+
+/*
+ * free rcache data structures.
+ */
+static void free_iova_rcaches(struct iova_domain *iovad)
+{
+	struct iova_rcache *rcache;
+	struct iova_cpu_rcache *cpu_rcache;
+	unsigned int cpu;
+	int i, j;
+
+	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+		rcache = &iovad->rcaches[i];
+		for_each_possible_cpu(cpu) {
+			cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+			iova_magazine_free(cpu_rcache->loaded);
+			iova_magazine_free(cpu_rcache->prev);
+		}
+		free_percpu(rcache->cpu_rcaches);
+		for (j = 0; j < rcache->depot_size; ++j)
+			iova_magazine_free(rcache->depot[j]);
+	}
+}
+
+/*
+ * free all the IOVA ranges cached by a cpu (used when cpu is unplugged)
+ */
+void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad)
+{
+	struct iova_cpu_rcache *cpu_rcache;
+	struct iova_rcache *rcache;
+	unsigned long flags;
+	int i;
+
+	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+		rcache = &iovad->rcaches[i];
+		cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+		spin_lock_irqsave(&cpu_rcache->lock, flags);
+		iova_magazine_free_pfns(cpu_rcache->loaded, iovad);
+		iova_magazine_free_pfns(cpu_rcache->prev, iovad);
+		spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+	}
+}
+
+MODULE_AUTHOR("Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
new file mode 100644
index 000000000..d8598e44e
--- /dev/null
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -0,0 +1,1149 @@
+/*
+ * IPMMU VMSA
+ *
+ * Copyright (C) 2014 Renesas Electronics Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ */
+
+#include <linux/bitmap.h>
+#include <linux/delay.h>
+#include <linux/dma-iommu.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_iommu.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/sys_soc.h>
+
+#if defined(CONFIG_ARM) && !defined(CONFIG_IOMMU_DMA)
+#include <asm/dma-iommu.h>
+#include <asm/pgalloc.h>
+#else
+#define arm_iommu_create_mapping(...)	NULL
+#define arm_iommu_attach_device(...)	-ENODEV
+#define arm_iommu_release_mapping(...)	do {} while (0)
+#define arm_iommu_detach_device(...)	do {} while (0)
+#endif
+
+#include "io-pgtable.h"
+
+#define IPMMU_CTX_MAX 8
+
+struct ipmmu_features {
+	bool use_ns_alias_offset;
+	bool has_cache_leaf_nodes;
+	unsigned int number_of_contexts;
+	bool setup_imbuscr;
+	bool twobit_imttbcr_sl0;
+	bool reserved_context;
+};
+
+struct ipmmu_vmsa_device {
+	struct device *dev;
+	void __iomem *base;
+	struct iommu_device iommu;
+	struct ipmmu_vmsa_device *root;
+	const struct ipmmu_features *features;
+	unsigned int num_utlbs;
+	unsigned int num_ctx;
+	spinlock_t lock;			/* Protects ctx and domains[] */
+	DECLARE_BITMAP(ctx, IPMMU_CTX_MAX);
+	struct ipmmu_vmsa_domain *domains[IPMMU_CTX_MAX];
+
+	struct iommu_group *group;
+	struct dma_iommu_mapping *mapping;
+};
+
+struct ipmmu_vmsa_domain {
+	struct ipmmu_vmsa_device *mmu;
+	struct iommu_domain io_domain;
+
+	struct io_pgtable_cfg cfg;
+	struct io_pgtable_ops *iop;
+
+	unsigned int context_id;
+	struct mutex mutex;			/* Protects mappings */
+};
+
+static struct ipmmu_vmsa_domain *to_vmsa_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct ipmmu_vmsa_domain, io_domain);
+}
+
+static struct ipmmu_vmsa_device *to_ipmmu(struct device *dev)
+{
+	return dev->iommu_fwspec ? dev->iommu_fwspec->iommu_priv : NULL;
+}
+
+#define TLB_LOOP_TIMEOUT		100	/* 100us */
+
+/* -----------------------------------------------------------------------------
+ * Registers Definition
+ */
+
+#define IM_NS_ALIAS_OFFSET		0x800
+
+#define IM_CTX_SIZE			0x40
+
+#define IMCTR				0x0000
+#define IMCTR_TRE			(1 << 17)
+#define IMCTR_AFE			(1 << 16)
+#define IMCTR_RTSEL_MASK		(3 << 4)
+#define IMCTR_RTSEL_SHIFT		4
+#define IMCTR_TREN			(1 << 3)
+#define IMCTR_INTEN			(1 << 2)
+#define IMCTR_FLUSH			(1 << 1)
+#define IMCTR_MMUEN			(1 << 0)
+
+#define IMCAAR				0x0004
+
+#define IMTTBCR				0x0008
+#define IMTTBCR_EAE			(1 << 31)
+#define IMTTBCR_PMB			(1 << 30)
+#define IMTTBCR_SH1_NON_SHAREABLE	(0 << 28)
+#define IMTTBCR_SH1_OUTER_SHAREABLE	(2 << 28)
+#define IMTTBCR_SH1_INNER_SHAREABLE	(3 << 28)
+#define IMTTBCR_SH1_MASK		(3 << 28)
+#define IMTTBCR_ORGN1_NC		(0 << 26)
+#define IMTTBCR_ORGN1_WB_WA		(1 << 26)
+#define IMTTBCR_ORGN1_WT		(2 << 26)
+#define IMTTBCR_ORGN1_WB		(3 << 26)
+#define IMTTBCR_ORGN1_MASK		(3 << 26)
+#define IMTTBCR_IRGN1_NC		(0 << 24)
+#define IMTTBCR_IRGN1_WB_WA		(1 << 24)
+#define IMTTBCR_IRGN1_WT		(2 << 24)
+#define IMTTBCR_IRGN1_WB		(3 << 24)
+#define IMTTBCR_IRGN1_MASK		(3 << 24)
+#define IMTTBCR_TSZ1_MASK		(7 << 16)
+#define IMTTBCR_TSZ1_SHIFT		16
+#define IMTTBCR_SH0_NON_SHAREABLE	(0 << 12)
+#define IMTTBCR_SH0_OUTER_SHAREABLE	(2 << 12)
+#define IMTTBCR_SH0_INNER_SHAREABLE	(3 << 12)
+#define IMTTBCR_SH0_MASK		(3 << 12)
+#define IMTTBCR_ORGN0_NC		(0 << 10)
+#define IMTTBCR_ORGN0_WB_WA		(1 << 10)
+#define IMTTBCR_ORGN0_WT		(2 << 10)
+#define IMTTBCR_ORGN0_WB		(3 << 10)
+#define IMTTBCR_ORGN0_MASK		(3 << 10)
+#define IMTTBCR_IRGN0_NC		(0 << 8)
+#define IMTTBCR_IRGN0_WB_WA		(1 << 8)
+#define IMTTBCR_IRGN0_WT		(2 << 8)
+#define IMTTBCR_IRGN0_WB		(3 << 8)
+#define IMTTBCR_IRGN0_MASK		(3 << 8)
+#define IMTTBCR_SL0_LVL_2		(0 << 4)
+#define IMTTBCR_SL0_LVL_1		(1 << 4)
+#define IMTTBCR_TSZ0_MASK		(7 << 0)
+#define IMTTBCR_TSZ0_SHIFT		O
+
+#define IMTTBCR_SL0_TWOBIT_LVL_3	(0 << 6)
+#define IMTTBCR_SL0_TWOBIT_LVL_2	(1 << 6)
+#define IMTTBCR_SL0_TWOBIT_LVL_1	(2 << 6)
+
+#define IMBUSCR				0x000c
+#define IMBUSCR_DVM			(1 << 2)
+#define IMBUSCR_BUSSEL_SYS		(0 << 0)
+#define IMBUSCR_BUSSEL_CCI		(1 << 0)
+#define IMBUSCR_BUSSEL_IMCAAR		(2 << 0)
+#define IMBUSCR_BUSSEL_CCI_IMCAAR	(3 << 0)
+#define IMBUSCR_BUSSEL_MASK		(3 << 0)
+
+#define IMTTLBR0			0x0010
+#define IMTTUBR0			0x0014
+#define IMTTLBR1			0x0018
+#define IMTTUBR1			0x001c
+
+#define IMSTR				0x0020
+#define IMSTR_ERRLVL_MASK		(3 << 12)
+#define IMSTR_ERRLVL_SHIFT		12
+#define IMSTR_ERRCODE_TLB_FORMAT	(1 << 8)
+#define IMSTR_ERRCODE_ACCESS_PERM	(4 << 8)
+#define IMSTR_ERRCODE_SECURE_ACCESS	(5 << 8)
+#define IMSTR_ERRCODE_MASK		(7 << 8)
+#define IMSTR_MHIT			(1 << 4)
+#define IMSTR_ABORT			(1 << 2)
+#define IMSTR_PF			(1 << 1)
+#define IMSTR_TF			(1 << 0)
+
+#define IMMAIR0				0x0028
+#define IMMAIR1				0x002c
+#define IMMAIR_ATTR_MASK		0xff
+#define IMMAIR_ATTR_DEVICE		0x04
+#define IMMAIR_ATTR_NC			0x44
+#define IMMAIR_ATTR_WBRWA		0xff
+#define IMMAIR_ATTR_SHIFT(n)		((n) << 3)
+#define IMMAIR_ATTR_IDX_NC		0
+#define IMMAIR_ATTR_IDX_WBRWA		1
+#define IMMAIR_ATTR_IDX_DEV		2
+
+#define IMEAR				0x0030
+
+#define IMPCTR				0x0200
+#define IMPSTR				0x0208
+#define IMPEAR				0x020c
+#define IMPMBA(n)			(0x0280 + ((n) * 4))
+#define IMPMBD(n)			(0x02c0 + ((n) * 4))
+
+#define IMUCTR(n)			((n) < 32 ? IMUCTR0(n) : IMUCTR32(n))
+#define IMUCTR0(n)			(0x0300 + ((n) * 16))
+#define IMUCTR32(n)			(0x0600 + (((n) - 32) * 16))
+#define IMUCTR_FIXADDEN			(1 << 31)
+#define IMUCTR_FIXADD_MASK		(0xff << 16)
+#define IMUCTR_FIXADD_SHIFT		16
+#define IMUCTR_TTSEL_MMU(n)		((n) << 4)
+#define IMUCTR_TTSEL_PMB		(8 << 4)
+#define IMUCTR_TTSEL_MASK		(15 << 4)
+#define IMUCTR_FLUSH			(1 << 1)
+#define IMUCTR_MMUEN			(1 << 0)
+
+#define IMUASID(n)			((n) < 32 ? IMUASID0(n) : IMUASID32(n))
+#define IMUASID0(n)			(0x0308 + ((n) * 16))
+#define IMUASID32(n)			(0x0608 + (((n) - 32) * 16))
+#define IMUASID_ASID8_MASK		(0xff << 8)
+#define IMUASID_ASID8_SHIFT		8
+#define IMUASID_ASID0_MASK		(0xff << 0)
+#define IMUASID_ASID0_SHIFT		0
+
+/* -----------------------------------------------------------------------------
+ * Root device handling
+ */
+
+static struct platform_driver ipmmu_driver;
+
+static bool ipmmu_is_root(struct ipmmu_vmsa_device *mmu)
+{
+	return mmu->root == mmu;
+}
+
+static int __ipmmu_check_device(struct device *dev, void *data)
+{
+	struct ipmmu_vmsa_device *mmu = dev_get_drvdata(dev);
+	struct ipmmu_vmsa_device **rootp = data;
+
+	if (ipmmu_is_root(mmu))
+		*rootp = mmu;
+
+	return 0;
+}
+
+static struct ipmmu_vmsa_device *ipmmu_find_root(void)
+{
+	struct ipmmu_vmsa_device *root = NULL;
+
+	return driver_for_each_device(&ipmmu_driver.driver, NULL, &root,
+				      __ipmmu_check_device) == 0 ? root : NULL;
+}
+
+/* -----------------------------------------------------------------------------
+ * Read/Write Access
+ */
+
+static u32 ipmmu_read(struct ipmmu_vmsa_device *mmu, unsigned int offset)
+{
+	return ioread32(mmu->base + offset);
+}
+
+static void ipmmu_write(struct ipmmu_vmsa_device *mmu, unsigned int offset,
+			u32 data)
+{
+	iowrite32(data, mmu->base + offset);
+}
+
+static u32 ipmmu_ctx_read_root(struct ipmmu_vmsa_domain *domain,
+			       unsigned int reg)
+{
+	return ipmmu_read(domain->mmu->root,
+			  domain->context_id * IM_CTX_SIZE + reg);
+}
+
+static void ipmmu_ctx_write_root(struct ipmmu_vmsa_domain *domain,
+				 unsigned int reg, u32 data)
+{
+	ipmmu_write(domain->mmu->root,
+		    domain->context_id * IM_CTX_SIZE + reg, data);
+}
+
+static void ipmmu_ctx_write_all(struct ipmmu_vmsa_domain *domain,
+				unsigned int reg, u32 data)
+{
+	if (domain->mmu != domain->mmu->root)
+		ipmmu_write(domain->mmu,
+			    domain->context_id * IM_CTX_SIZE + reg, data);
+
+	ipmmu_write(domain->mmu->root,
+		    domain->context_id * IM_CTX_SIZE + reg, data);
+}
+
+/* -----------------------------------------------------------------------------
+ * TLB and microTLB Management
+ */
+
+/* Wait for any pending TLB invalidations to complete */
+static void ipmmu_tlb_sync(struct ipmmu_vmsa_domain *domain)
+{
+	unsigned int count = 0;
+
+	while (ipmmu_ctx_read_root(domain, IMCTR) & IMCTR_FLUSH) {
+		cpu_relax();
+		if (++count == TLB_LOOP_TIMEOUT) {
+			dev_err_ratelimited(domain->mmu->dev,
+			"TLB sync timed out -- MMU may be deadlocked\n");
+			return;
+		}
+		udelay(1);
+	}
+}
+
+static void ipmmu_tlb_invalidate(struct ipmmu_vmsa_domain *domain)
+{
+	u32 reg;
+
+	reg = ipmmu_ctx_read_root(domain, IMCTR);
+	reg |= IMCTR_FLUSH;
+	ipmmu_ctx_write_all(domain, IMCTR, reg);
+
+	ipmmu_tlb_sync(domain);
+}
+
+/*
+ * Enable MMU translation for the microTLB.
+ */
+static void ipmmu_utlb_enable(struct ipmmu_vmsa_domain *domain,
+			      unsigned int utlb)
+{
+	struct ipmmu_vmsa_device *mmu = domain->mmu;
+
+	/*
+	 * TODO: Reference-count the microTLB as several bus masters can be
+	 * connected to the same microTLB.
+	 */
+
+	/* TODO: What should we set the ASID to ? */
+	ipmmu_write(mmu, IMUASID(utlb), 0);
+	/* TODO: Do we need to flush the microTLB ? */
+	ipmmu_write(mmu, IMUCTR(utlb),
+		    IMUCTR_TTSEL_MMU(domain->context_id) | IMUCTR_FLUSH |
+		    IMUCTR_MMUEN);
+}
+
+/*
+ * Disable MMU translation for the microTLB.
+ */
+static void ipmmu_utlb_disable(struct ipmmu_vmsa_domain *domain,
+			       unsigned int utlb)
+{
+	struct ipmmu_vmsa_device *mmu = domain->mmu;
+
+	ipmmu_write(mmu, IMUCTR(utlb), 0);
+}
+
+static void ipmmu_tlb_flush_all(void *cookie)
+{
+	struct ipmmu_vmsa_domain *domain = cookie;
+
+	ipmmu_tlb_invalidate(domain);
+}
+
+static void ipmmu_tlb_add_flush(unsigned long iova, size_t size,
+				size_t granule, bool leaf, void *cookie)
+{
+	/* The hardware doesn't support selective TLB flush. */
+}
+
+static const struct iommu_gather_ops ipmmu_gather_ops = {
+	.tlb_flush_all = ipmmu_tlb_flush_all,
+	.tlb_add_flush = ipmmu_tlb_add_flush,
+	.tlb_sync = ipmmu_tlb_flush_all,
+};
+
+/* -----------------------------------------------------------------------------
+ * Domain/Context Management
+ */
+
+static int ipmmu_domain_allocate_context(struct ipmmu_vmsa_device *mmu,
+					 struct ipmmu_vmsa_domain *domain)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&mmu->lock, flags);
+
+	ret = find_first_zero_bit(mmu->ctx, mmu->num_ctx);
+	if (ret != mmu->num_ctx) {
+		mmu->domains[ret] = domain;
+		set_bit(ret, mmu->ctx);
+	} else
+		ret = -EBUSY;
+
+	spin_unlock_irqrestore(&mmu->lock, flags);
+
+	return ret;
+}
+
+static void ipmmu_domain_free_context(struct ipmmu_vmsa_device *mmu,
+				      unsigned int context_id)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&mmu->lock, flags);
+
+	clear_bit(context_id, mmu->ctx);
+	mmu->domains[context_id] = NULL;
+
+	spin_unlock_irqrestore(&mmu->lock, flags);
+}
+
+static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain)
+{
+	u64 ttbr;
+	u32 tmp;
+	int ret;
+
+	/*
+	 * Allocate the page table operations.
+	 *
+	 * VMSA states in section B3.6.3 "Control of Secure or Non-secure memory
+	 * access, Long-descriptor format" that the NStable bit being set in a
+	 * table descriptor will result in the NStable and NS bits of all child
+	 * entries being ignored and considered as being set. The IPMMU seems
+	 * not to comply with this, as it generates a secure access page fault
+	 * if any of the NStable and NS bits isn't set when running in
+	 * non-secure mode.
+	 */
+	domain->cfg.quirks = IO_PGTABLE_QUIRK_ARM_NS;
+	domain->cfg.pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K;
+	domain->cfg.ias = 32;
+	domain->cfg.oas = 40;
+	domain->cfg.tlb = &ipmmu_gather_ops;
+	domain->io_domain.geometry.aperture_end = DMA_BIT_MASK(32);
+	domain->io_domain.geometry.force_aperture = true;
+	/*
+	 * TODO: Add support for coherent walk through CCI with DVM and remove
+	 * cache handling. For now, delegate it to the io-pgtable code.
+	 */
+	domain->cfg.iommu_dev = domain->mmu->root->dev;
+
+	/*
+	 * Find an unused context.
+	 */
+	ret = ipmmu_domain_allocate_context(domain->mmu->root, domain);
+	if (ret < 0)
+		return ret;
+
+	domain->context_id = ret;
+
+	domain->iop = alloc_io_pgtable_ops(ARM_32_LPAE_S1, &domain->cfg,
+					   domain);
+	if (!domain->iop) {
+		ipmmu_domain_free_context(domain->mmu->root,
+					  domain->context_id);
+		return -EINVAL;
+	}
+
+	/* TTBR0 */
+	ttbr = domain->cfg.arm_lpae_s1_cfg.ttbr[0];
+	ipmmu_ctx_write_root(domain, IMTTLBR0, ttbr);
+	ipmmu_ctx_write_root(domain, IMTTUBR0, ttbr >> 32);
+
+	/*
+	 * TTBCR
+	 * We use long descriptors with inner-shareable WBWA tables and allocate
+	 * the whole 32-bit VA space to TTBR0.
+	 */
+	if (domain->mmu->features->twobit_imttbcr_sl0)
+		tmp = IMTTBCR_SL0_TWOBIT_LVL_1;
+	else
+		tmp = IMTTBCR_SL0_LVL_1;
+
+	ipmmu_ctx_write_root(domain, IMTTBCR, IMTTBCR_EAE |
+			     IMTTBCR_SH0_INNER_SHAREABLE | IMTTBCR_ORGN0_WB_WA |
+			     IMTTBCR_IRGN0_WB_WA | tmp);
+
+	/* MAIR0 */
+	ipmmu_ctx_write_root(domain, IMMAIR0,
+			     domain->cfg.arm_lpae_s1_cfg.mair[0]);
+
+	/* IMBUSCR */
+	if (domain->mmu->features->setup_imbuscr)
+		ipmmu_ctx_write_root(domain, IMBUSCR,
+				     ipmmu_ctx_read_root(domain, IMBUSCR) &
+				     ~(IMBUSCR_DVM | IMBUSCR_BUSSEL_MASK));
+
+	/*
+	 * IMSTR
+	 * Clear all interrupt flags.
+	 */
+	ipmmu_ctx_write_root(domain, IMSTR, ipmmu_ctx_read_root(domain, IMSTR));
+
+	/*
+	 * IMCTR
+	 * Enable the MMU and interrupt generation. The long-descriptor
+	 * translation table format doesn't use TEX remapping. Don't enable AF
+	 * software management as we have no use for it. Flush the TLB as
+	 * required when modifying the context registers.
+	 */
+	ipmmu_ctx_write_all(domain, IMCTR,
+			    IMCTR_INTEN | IMCTR_FLUSH | IMCTR_MMUEN);
+
+	return 0;
+}
+
+static void ipmmu_domain_destroy_context(struct ipmmu_vmsa_domain *domain)
+{
+	if (!domain->mmu)
+		return;
+
+	/*
+	 * Disable the context. Flush the TLB as required when modifying the
+	 * context registers.
+	 *
+	 * TODO: Is TLB flush really needed ?
+	 */
+	ipmmu_ctx_write_all(domain, IMCTR, IMCTR_FLUSH);
+	ipmmu_tlb_sync(domain);
+	ipmmu_domain_free_context(domain->mmu->root, domain->context_id);
+}
+
+/* -----------------------------------------------------------------------------
+ * Fault Handling
+ */
+
+static irqreturn_t ipmmu_domain_irq(struct ipmmu_vmsa_domain *domain)
+{
+	const u32 err_mask = IMSTR_MHIT | IMSTR_ABORT | IMSTR_PF | IMSTR_TF;
+	struct ipmmu_vmsa_device *mmu = domain->mmu;
+	u32 status;
+	u32 iova;
+
+	status = ipmmu_ctx_read_root(domain, IMSTR);
+	if (!(status & err_mask))
+		return IRQ_NONE;
+
+	iova = ipmmu_ctx_read_root(domain, IMEAR);
+
+	/*
+	 * Clear the error status flags. Unlike traditional interrupt flag
+	 * registers that must be cleared by writing 1, this status register
+	 * seems to require 0. The error address register must be read before,
+	 * otherwise its value will be 0.
+	 */
+	ipmmu_ctx_write_root(domain, IMSTR, 0);
+
+	/* Log fatal errors. */
+	if (status & IMSTR_MHIT)
+		dev_err_ratelimited(mmu->dev, "Multiple TLB hits @0x%08x\n",
+				    iova);
+	if (status & IMSTR_ABORT)
+		dev_err_ratelimited(mmu->dev, "Page Table Walk Abort @0x%08x\n",
+				    iova);
+
+	if (!(status & (IMSTR_PF | IMSTR_TF)))
+		return IRQ_NONE;
+
+	/*
+	 * Try to handle page faults and translation faults.
+	 *
+	 * TODO: We need to look up the faulty device based on the I/O VA. Use
+	 * the IOMMU device for now.
+	 */
+	if (!report_iommu_fault(&domain->io_domain, mmu->dev, iova, 0))
+		return IRQ_HANDLED;
+
+	dev_err_ratelimited(mmu->dev,
+			    "Unhandled fault: status 0x%08x iova 0x%08x\n",
+			    status, iova);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t ipmmu_irq(int irq, void *dev)
+{
+	struct ipmmu_vmsa_device *mmu = dev;
+	irqreturn_t status = IRQ_NONE;
+	unsigned int i;
+	unsigned long flags;
+
+	spin_lock_irqsave(&mmu->lock, flags);
+
+	/*
+	 * Check interrupts for all active contexts.
+	 */
+	for (i = 0; i < mmu->num_ctx; i++) {
+		if (!mmu->domains[i])
+			continue;
+		if (ipmmu_domain_irq(mmu->domains[i]) == IRQ_HANDLED)
+			status = IRQ_HANDLED;
+	}
+
+	spin_unlock_irqrestore(&mmu->lock, flags);
+
+	return status;
+}
+
+/* -----------------------------------------------------------------------------
+ * IOMMU Operations
+ */
+
+static struct iommu_domain *__ipmmu_domain_alloc(unsigned type)
+{
+	struct ipmmu_vmsa_domain *domain;
+
+	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+	if (!domain)
+		return NULL;
+
+	mutex_init(&domain->mutex);
+
+	return &domain->io_domain;
+}
+
+static struct iommu_domain *ipmmu_domain_alloc(unsigned type)
+{
+	struct iommu_domain *io_domain = NULL;
+
+	switch (type) {
+	case IOMMU_DOMAIN_UNMANAGED:
+		io_domain = __ipmmu_domain_alloc(type);
+		break;
+
+	case IOMMU_DOMAIN_DMA:
+		io_domain = __ipmmu_domain_alloc(type);
+		if (io_domain && iommu_get_dma_cookie(io_domain)) {
+			kfree(io_domain);
+			io_domain = NULL;
+		}
+		break;
+	}
+
+	return io_domain;
+}
+
+static void ipmmu_domain_free(struct iommu_domain *io_domain)
+{
+	struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+
+	/*
+	 * Free the domain resources. We assume that all devices have already
+	 * been detached.
+	 */
+	iommu_put_dma_cookie(io_domain);
+	ipmmu_domain_destroy_context(domain);
+	free_io_pgtable_ops(domain->iop);
+	kfree(domain);
+}
+
+static int ipmmu_attach_device(struct iommu_domain *io_domain,
+			       struct device *dev)
+{
+	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+	struct ipmmu_vmsa_device *mmu = to_ipmmu(dev);
+	struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+	unsigned int i;
+	int ret = 0;
+
+	if (!mmu) {
+		dev_err(dev, "Cannot attach to IPMMU\n");
+		return -ENXIO;
+	}
+
+	mutex_lock(&domain->mutex);
+
+	if (!domain->mmu) {
+		/* The domain hasn't been used yet, initialize it. */
+		domain->mmu = mmu;
+		ret = ipmmu_domain_init_context(domain);
+		if (ret < 0) {
+			dev_err(dev, "Unable to initialize IPMMU context\n");
+			domain->mmu = NULL;
+		} else {
+			dev_info(dev, "Using IPMMU context %u\n",
+				 domain->context_id);
+		}
+	} else if (domain->mmu != mmu) {
+		/*
+		 * Something is wrong, we can't attach two devices using
+		 * different IOMMUs to the same domain.
+		 */
+		dev_err(dev, "Can't attach IPMMU %s to domain on IPMMU %s\n",
+			dev_name(mmu->dev), dev_name(domain->mmu->dev));
+		ret = -EINVAL;
+	} else
+		dev_info(dev, "Reusing IPMMU context %u\n", domain->context_id);
+
+	mutex_unlock(&domain->mutex);
+
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < fwspec->num_ids; ++i)
+		ipmmu_utlb_enable(domain, fwspec->ids[i]);
+
+	return 0;
+}
+
+static void ipmmu_detach_device(struct iommu_domain *io_domain,
+				struct device *dev)
+{
+	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+	struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+	unsigned int i;
+
+	for (i = 0; i < fwspec->num_ids; ++i)
+		ipmmu_utlb_disable(domain, fwspec->ids[i]);
+
+	/*
+	 * TODO: Optimize by disabling the context when no device is attached.
+	 */
+}
+
+static int ipmmu_map(struct iommu_domain *io_domain, unsigned long iova,
+		     phys_addr_t paddr, size_t size, int prot)
+{
+	struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+
+	if (!domain)
+		return -ENODEV;
+
+	return domain->iop->map(domain->iop, iova, paddr, size, prot);
+}
+
+static size_t ipmmu_unmap(struct iommu_domain *io_domain, unsigned long iova,
+			  size_t size)
+{
+	struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+
+	return domain->iop->unmap(domain->iop, iova, size);
+}
+
+static void ipmmu_iotlb_sync(struct iommu_domain *io_domain)
+{
+	struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+
+	if (domain->mmu)
+		ipmmu_tlb_flush_all(domain);
+}
+
+static phys_addr_t ipmmu_iova_to_phys(struct iommu_domain *io_domain,
+				      dma_addr_t iova)
+{
+	struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+
+	/* TODO: Is locking needed ? */
+
+	return domain->iop->iova_to_phys(domain->iop, iova);
+}
+
+static int ipmmu_init_platform_device(struct device *dev,
+				      struct of_phandle_args *args)
+{
+	struct platform_device *ipmmu_pdev;
+
+	ipmmu_pdev = of_find_device_by_node(args->np);
+	if (!ipmmu_pdev)
+		return -ENODEV;
+
+	dev->iommu_fwspec->iommu_priv = platform_get_drvdata(ipmmu_pdev);
+	return 0;
+}
+
+static bool ipmmu_slave_whitelist(struct device *dev)
+{
+	/* By default, do not allow use of IPMMU */
+	return false;
+}
+
+static const struct soc_device_attribute soc_rcar_gen3[] = {
+	{ .soc_id = "r8a7795", },
+	{ .soc_id = "r8a7796", },
+	{ .soc_id = "r8a77965", },
+	{ .soc_id = "r8a77970", },
+	{ .soc_id = "r8a77995", },
+	{ /* sentinel */ }
+};
+
+static int ipmmu_of_xlate(struct device *dev,
+			  struct of_phandle_args *spec)
+{
+	/* For R-Car Gen3 use a white list to opt-in slave devices */
+	if (soc_device_match(soc_rcar_gen3) && !ipmmu_slave_whitelist(dev))
+		return -ENODEV;
+
+	iommu_fwspec_add_ids(dev, spec->args, 1);
+
+	/* Initialize once - xlate() will call multiple times */
+	if (to_ipmmu(dev))
+		return 0;
+
+	return ipmmu_init_platform_device(dev, spec);
+}
+
+static int ipmmu_init_arm_mapping(struct device *dev)
+{
+	struct ipmmu_vmsa_device *mmu = to_ipmmu(dev);
+	struct iommu_group *group;
+	int ret;
+
+	/* Create a device group and add the device to it. */
+	group = iommu_group_alloc();
+	if (IS_ERR(group)) {
+		dev_err(dev, "Failed to allocate IOMMU group\n");
+		return PTR_ERR(group);
+	}
+
+	ret = iommu_group_add_device(group, dev);
+	iommu_group_put(group);
+
+	if (ret < 0) {
+		dev_err(dev, "Failed to add device to IPMMU group\n");
+		return ret;
+	}
+
+	/*
+	 * Create the ARM mapping, used by the ARM DMA mapping core to allocate
+	 * VAs. This will allocate a corresponding IOMMU domain.
+	 *
+	 * TODO:
+	 * - Create one mapping per context (TLB).
+	 * - Make the mapping size configurable ? We currently use a 2GB mapping
+	 *   at a 1GB offset to ensure that NULL VAs will fault.
+	 */
+	if (!mmu->mapping) {
+		struct dma_iommu_mapping *mapping;
+
+		mapping = arm_iommu_create_mapping(&platform_bus_type,
+						   SZ_1G, SZ_2G);
+		if (IS_ERR(mapping)) {
+			dev_err(mmu->dev, "failed to create ARM IOMMU mapping\n");
+			ret = PTR_ERR(mapping);
+			goto error;
+		}
+
+		mmu->mapping = mapping;
+	}
+
+	/* Attach the ARM VA mapping to the device. */
+	ret = arm_iommu_attach_device(dev, mmu->mapping);
+	if (ret < 0) {
+		dev_err(dev, "Failed to attach device to VA mapping\n");
+		goto error;
+	}
+
+	return 0;
+
+error:
+	iommu_group_remove_device(dev);
+	if (mmu->mapping)
+		arm_iommu_release_mapping(mmu->mapping);
+
+	return ret;
+}
+
+static int ipmmu_add_device(struct device *dev)
+{
+	struct iommu_group *group;
+
+	/*
+	 * Only let through devices that have been verified in xlate()
+	 */
+	if (!to_ipmmu(dev))
+		return -ENODEV;
+
+	if (IS_ENABLED(CONFIG_ARM) && !IS_ENABLED(CONFIG_IOMMU_DMA))
+		return ipmmu_init_arm_mapping(dev);
+
+	group = iommu_group_get_for_dev(dev);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	iommu_group_put(group);
+	return 0;
+}
+
+static void ipmmu_remove_device(struct device *dev)
+{
+	arm_iommu_detach_device(dev);
+	iommu_group_remove_device(dev);
+}
+
+static struct iommu_group *ipmmu_find_group(struct device *dev)
+{
+	struct ipmmu_vmsa_device *mmu = to_ipmmu(dev);
+	struct iommu_group *group;
+
+	if (mmu->group)
+		return iommu_group_ref_get(mmu->group);
+
+	group = iommu_group_alloc();
+	if (!IS_ERR(group))
+		mmu->group = group;
+
+	return group;
+}
+
+static const struct iommu_ops ipmmu_ops = {
+	.domain_alloc = ipmmu_domain_alloc,
+	.domain_free = ipmmu_domain_free,
+	.attach_dev = ipmmu_attach_device,
+	.detach_dev = ipmmu_detach_device,
+	.map = ipmmu_map,
+	.unmap = ipmmu_unmap,
+	.flush_iotlb_all = ipmmu_iotlb_sync,
+	.iotlb_sync = ipmmu_iotlb_sync,
+	.iova_to_phys = ipmmu_iova_to_phys,
+	.add_device = ipmmu_add_device,
+	.remove_device = ipmmu_remove_device,
+	.device_group = ipmmu_find_group,
+	.pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K,
+	.of_xlate = ipmmu_of_xlate,
+};
+
+/* -----------------------------------------------------------------------------
+ * Probe/remove and init
+ */
+
+static void ipmmu_device_reset(struct ipmmu_vmsa_device *mmu)
+{
+	unsigned int i;
+
+	/* Disable all contexts. */
+	for (i = 0; i < mmu->num_ctx; ++i)
+		ipmmu_write(mmu, i * IM_CTX_SIZE + IMCTR, 0);
+}
+
+static const struct ipmmu_features ipmmu_features_default = {
+	.use_ns_alias_offset = true,
+	.has_cache_leaf_nodes = false,
+	.number_of_contexts = 1, /* software only tested with one context */
+	.setup_imbuscr = true,
+	.twobit_imttbcr_sl0 = false,
+	.reserved_context = false,
+};
+
+static const struct ipmmu_features ipmmu_features_rcar_gen3 = {
+	.use_ns_alias_offset = false,
+	.has_cache_leaf_nodes = true,
+	.number_of_contexts = 8,
+	.setup_imbuscr = false,
+	.twobit_imttbcr_sl0 = true,
+	.reserved_context = true,
+};
+
+static const struct of_device_id ipmmu_of_ids[] = {
+	{
+		.compatible = "renesas,ipmmu-vmsa",
+		.data = &ipmmu_features_default,
+	}, {
+		.compatible = "renesas,ipmmu-r8a7795",
+		.data = &ipmmu_features_rcar_gen3,
+	}, {
+		.compatible = "renesas,ipmmu-r8a7796",
+		.data = &ipmmu_features_rcar_gen3,
+	}, {
+		.compatible = "renesas,ipmmu-r8a77965",
+		.data = &ipmmu_features_rcar_gen3,
+	}, {
+		.compatible = "renesas,ipmmu-r8a77970",
+		.data = &ipmmu_features_rcar_gen3,
+	}, {
+		.compatible = "renesas,ipmmu-r8a77995",
+		.data = &ipmmu_features_rcar_gen3,
+	}, {
+		/* Terminator */
+	},
+};
+
+MODULE_DEVICE_TABLE(of, ipmmu_of_ids);
+
+static int ipmmu_probe(struct platform_device *pdev)
+{
+	struct ipmmu_vmsa_device *mmu;
+	struct resource *res;
+	int irq;
+	int ret;
+
+	mmu = devm_kzalloc(&pdev->dev, sizeof(*mmu), GFP_KERNEL);
+	if (!mmu) {
+		dev_err(&pdev->dev, "cannot allocate device data\n");
+		return -ENOMEM;
+	}
+
+	mmu->dev = &pdev->dev;
+	mmu->num_utlbs = 48;
+	spin_lock_init(&mmu->lock);
+	bitmap_zero(mmu->ctx, IPMMU_CTX_MAX);
+	mmu->features = of_device_get_match_data(&pdev->dev);
+	dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(40));
+
+	/* Map I/O memory and request IRQ. */
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	mmu->base = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(mmu->base))
+		return PTR_ERR(mmu->base);
+
+	/*
+	 * The IPMMU has two register banks, for secure and non-secure modes.
+	 * The bank mapped at the beginning of the IPMMU address space
+	 * corresponds to the running mode of the CPU. When running in secure
+	 * mode the non-secure register bank is also available at an offset.
+	 *
+	 * Secure mode operation isn't clearly documented and is thus currently
+	 * not implemented in the driver. Furthermore, preliminary tests of
+	 * non-secure operation with the main register bank were not successful.
+	 * Offset the registers base unconditionally to point to the non-secure
+	 * alias space for now.
+	 */
+	if (mmu->features->use_ns_alias_offset)
+		mmu->base += IM_NS_ALIAS_OFFSET;
+
+	mmu->num_ctx = min_t(unsigned int, IPMMU_CTX_MAX,
+			     mmu->features->number_of_contexts);
+
+	irq = platform_get_irq(pdev, 0);
+
+	/*
+	 * Determine if this IPMMU instance is a root device by checking for
+	 * the lack of has_cache_leaf_nodes flag or renesas,ipmmu-main property.
+	 */
+	if (!mmu->features->has_cache_leaf_nodes ||
+	    !of_find_property(pdev->dev.of_node, "renesas,ipmmu-main", NULL))
+		mmu->root = mmu;
+	else
+		mmu->root = ipmmu_find_root();
+
+	/*
+	 * Wait until the root device has been registered for sure.
+	 */
+	if (!mmu->root)
+		return -EPROBE_DEFER;
+
+	/* Root devices have mandatory IRQs */
+	if (ipmmu_is_root(mmu)) {
+		if (irq < 0) {
+			dev_err(&pdev->dev, "no IRQ found\n");
+			return irq;
+		}
+
+		ret = devm_request_irq(&pdev->dev, irq, ipmmu_irq, 0,
+				       dev_name(&pdev->dev), mmu);
+		if (ret < 0) {
+			dev_err(&pdev->dev, "failed to request IRQ %d\n", irq);
+			return ret;
+		}
+
+		ipmmu_device_reset(mmu);
+
+		if (mmu->features->reserved_context) {
+			dev_info(&pdev->dev, "IPMMU context 0 is reserved\n");
+			set_bit(0, mmu->ctx);
+		}
+	}
+
+	/*
+	 * Register the IPMMU to the IOMMU subsystem in the following cases:
+	 * - R-Car Gen2 IPMMU (all devices registered)
+	 * - R-Car Gen3 IPMMU (leaf devices only - skip root IPMMU-MM device)
+	 */
+	if (!mmu->features->has_cache_leaf_nodes || !ipmmu_is_root(mmu)) {
+		ret = iommu_device_sysfs_add(&mmu->iommu, &pdev->dev, NULL,
+					     dev_name(&pdev->dev));
+		if (ret)
+			return ret;
+
+		iommu_device_set_ops(&mmu->iommu, &ipmmu_ops);
+		iommu_device_set_fwnode(&mmu->iommu,
+					&pdev->dev.of_node->fwnode);
+
+		ret = iommu_device_register(&mmu->iommu);
+		if (ret)
+			return ret;
+
+#if defined(CONFIG_IOMMU_DMA)
+		if (!iommu_present(&platform_bus_type))
+			bus_set_iommu(&platform_bus_type, &ipmmu_ops);
+#endif
+	}
+
+	/*
+	 * We can't create the ARM mapping here as it requires the bus to have
+	 * an IOMMU, which only happens when bus_set_iommu() is called in
+	 * ipmmu_init() after the probe function returns.
+	 */
+
+	platform_set_drvdata(pdev, mmu);
+
+	return 0;
+}
+
+static int ipmmu_remove(struct platform_device *pdev)
+{
+	struct ipmmu_vmsa_device *mmu = platform_get_drvdata(pdev);
+
+	iommu_device_sysfs_remove(&mmu->iommu);
+	iommu_device_unregister(&mmu->iommu);
+
+	arm_iommu_release_mapping(mmu->mapping);
+
+	ipmmu_device_reset(mmu);
+
+	return 0;
+}
+
+static struct platform_driver ipmmu_driver = {
+	.driver = {
+		.name = "ipmmu-vmsa",
+		.of_match_table = of_match_ptr(ipmmu_of_ids),
+	},
+	.probe = ipmmu_probe,
+	.remove	= ipmmu_remove,
+};
+
+static int __init ipmmu_init(void)
+{
+	struct device_node *np;
+	static bool setup_done;
+	int ret;
+
+	if (setup_done)
+		return 0;
+
+	np = of_find_matching_node(NULL, ipmmu_of_ids);
+	if (!np)
+		return 0;
+
+	of_node_put(np);
+
+	ret = platform_driver_register(&ipmmu_driver);
+	if (ret < 0)
+		return ret;
+
+#if defined(CONFIG_ARM) && !defined(CONFIG_IOMMU_DMA)
+	if (!iommu_present(&platform_bus_type))
+		bus_set_iommu(&platform_bus_type, &ipmmu_ops);
+#endif
+
+	setup_done = true;
+	return 0;
+}
+
+static void __exit ipmmu_exit(void)
+{
+	return platform_driver_unregister(&ipmmu_driver);
+}
+
+subsys_initcall(ipmmu_init);
+module_exit(ipmmu_exit);
+
+MODULE_DESCRIPTION("IOMMU API for Renesas VMSA-compatible IPMMU");
+MODULE_AUTHOR("Laurent Pinchart <laurent.pinchart@ideasonboard.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
new file mode 100644
index 000000000..7d0f3074d
--- /dev/null
+++ b/drivers/iommu/irq_remapping.c
@@ -0,0 +1,192 @@
+#include <linux/seq_file.h>
+#include <linux/cpumask.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/msi.h>
+#include <linux/irq.h>
+#include <linux/pci.h>
+#include <linux/irqdomain.h>
+
+#include <asm/hw_irq.h>
+#include <asm/irq_remapping.h>
+#include <asm/processor.h>
+#include <asm/x86_init.h>
+#include <asm/apic.h>
+#include <asm/hpet.h>
+
+#include "irq_remapping.h"
+
+int irq_remapping_enabled;
+int irq_remap_broken;
+int disable_sourceid_checking;
+int no_x2apic_optout;
+
+int disable_irq_post = 0;
+
+static int disable_irq_remap;
+static struct irq_remap_ops *remap_ops;
+
+static void irq_remapping_restore_boot_irq_mode(void)
+{
+	/*
+	 * With interrupt-remapping, for now we will use virtual wire A
+	 * mode, as virtual wire B is little complex (need to configure
+	 * both IOAPIC RTE as well as interrupt-remapping table entry).
+	 * As this gets called during crash dump, keep this simple for
+	 * now.
+	 */
+	if (boot_cpu_has(X86_FEATURE_APIC) || apic_from_smp_config())
+		disconnect_bsp_APIC(0);
+}
+
+static void __init irq_remapping_modify_x86_ops(void)
+{
+	x86_apic_ops.restore = irq_remapping_restore_boot_irq_mode;
+}
+
+static __init int setup_nointremap(char *str)
+{
+	disable_irq_remap = 1;
+	return 0;
+}
+early_param("nointremap", setup_nointremap);
+
+static __init int setup_irqremap(char *str)
+{
+	if (!str)
+		return -EINVAL;
+
+	while (*str) {
+		if (!strncmp(str, "on", 2)) {
+			disable_irq_remap = 0;
+			disable_irq_post = 0;
+		} else if (!strncmp(str, "off", 3)) {
+			disable_irq_remap = 1;
+			disable_irq_post = 1;
+		} else if (!strncmp(str, "nosid", 5))
+			disable_sourceid_checking = 1;
+		else if (!strncmp(str, "no_x2apic_optout", 16))
+			no_x2apic_optout = 1;
+		else if (!strncmp(str, "nopost", 6))
+			disable_irq_post = 1;
+
+		str += strcspn(str, ",");
+		while (*str == ',')
+			str++;
+	}
+
+	return 0;
+}
+early_param("intremap", setup_irqremap);
+
+void set_irq_remapping_broken(void)
+{
+	irq_remap_broken = 1;
+}
+
+bool irq_remapping_cap(enum irq_remap_cap cap)
+{
+	if (!remap_ops || disable_irq_post)
+		return false;
+
+	return (remap_ops->capability & (1 << cap));
+}
+EXPORT_SYMBOL_GPL(irq_remapping_cap);
+
+int __init irq_remapping_prepare(void)
+{
+	if (disable_irq_remap)
+		return -ENOSYS;
+
+	if (intel_irq_remap_ops.prepare() == 0)
+		remap_ops = &intel_irq_remap_ops;
+	else if (IS_ENABLED(CONFIG_AMD_IOMMU) &&
+		 amd_iommu_irq_ops.prepare() == 0)
+		remap_ops = &amd_iommu_irq_ops;
+	else
+		return -ENOSYS;
+
+	return 0;
+}
+
+int __init irq_remapping_enable(void)
+{
+	int ret;
+
+	if (!remap_ops->enable)
+		return -ENODEV;
+
+	ret = remap_ops->enable();
+
+	if (irq_remapping_enabled)
+		irq_remapping_modify_x86_ops();
+
+	return ret;
+}
+
+void irq_remapping_disable(void)
+{
+	if (irq_remapping_enabled && remap_ops->disable)
+		remap_ops->disable();
+}
+
+int irq_remapping_reenable(int mode)
+{
+	if (irq_remapping_enabled && remap_ops->reenable)
+		return remap_ops->reenable(mode);
+
+	return 0;
+}
+
+int __init irq_remap_enable_fault_handling(void)
+{
+	if (!irq_remapping_enabled)
+		return 0;
+
+	if (!remap_ops->enable_faulting)
+		return -ENODEV;
+
+	return remap_ops->enable_faulting();
+}
+
+void panic_if_irq_remap(const char *msg)
+{
+	if (irq_remapping_enabled)
+		panic(msg);
+}
+
+/**
+ * irq_remapping_get_ir_irq_domain - Get the irqdomain associated with the IOMMU
+ *				     device serving request @info
+ * @info: interrupt allocation information, used to identify the IOMMU device
+ *
+ * It's used to get parent irqdomain for HPET and IOAPIC irqdomains.
+ * Returns pointer to IRQ domain, or NULL on failure.
+ */
+struct irq_domain *
+irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info)
+{
+	if (!remap_ops || !remap_ops->get_ir_irq_domain)
+		return NULL;
+
+	return remap_ops->get_ir_irq_domain(info);
+}
+
+/**
+ * irq_remapping_get_irq_domain - Get the irqdomain serving the request @info
+ * @info: interrupt allocation information, used to identify the IOMMU device
+ *
+ * There will be one PCI MSI/MSIX irqdomain associated with each interrupt
+ * remapping device, so this interface is used to retrieve the PCI MSI/MSIX
+ * irqdomain serving request @info.
+ * Returns pointer to IRQ domain, or NULL on failure.
+ */
+struct irq_domain *
+irq_remapping_get_irq_domain(struct irq_alloc_info *info)
+{
+	if (!remap_ops || !remap_ops->get_irq_domain)
+		return NULL;
+
+	return remap_ops->get_irq_domain(info);
+}
diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h
new file mode 100644
index 000000000..0afef6e43
--- /dev/null
+++ b/drivers/iommu/irq_remapping.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2012 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <jroedel@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * This header file contains stuff that is shared between different interrupt
+ * remapping drivers but with no need to be visible outside of the IOMMU layer.
+ */
+
+#ifndef __IRQ_REMAPPING_H
+#define __IRQ_REMAPPING_H
+
+#ifdef CONFIG_IRQ_REMAP
+
+struct irq_data;
+struct msi_msg;
+struct irq_domain;
+struct irq_alloc_info;
+
+extern int irq_remap_broken;
+extern int disable_sourceid_checking;
+extern int no_x2apic_optout;
+extern int irq_remapping_enabled;
+
+extern int disable_irq_post;
+
+struct irq_remap_ops {
+	/* The supported capabilities */
+	int capability;
+
+	/* Initializes hardware and makes it ready for remapping interrupts */
+	int  (*prepare)(void);
+
+	/* Enables the remapping hardware */
+	int  (*enable)(void);
+
+	/* Disables the remapping hardware */
+	void (*disable)(void);
+
+	/* Reenables the remapping hardware */
+	int  (*reenable)(int);
+
+	/* Enable fault handling */
+	int  (*enable_faulting)(void);
+
+	/* Get the irqdomain associated the IOMMU device */
+	struct irq_domain *(*get_ir_irq_domain)(struct irq_alloc_info *);
+
+	/* Get the MSI irqdomain associated with the IOMMU device */
+	struct irq_domain *(*get_irq_domain)(struct irq_alloc_info *);
+};
+
+extern struct irq_remap_ops intel_irq_remap_ops;
+extern struct irq_remap_ops amd_iommu_irq_ops;
+
+#else  /* CONFIG_IRQ_REMAP */
+
+#define irq_remapping_enabled 0
+#define irq_remap_broken      0
+#define disable_irq_post      1
+
+#endif /* CONFIG_IRQ_REMAP */
+
+#endif /* __IRQ_REMAPPING_H */
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
new file mode 100644
index 000000000..bf7ff0219
--- /dev/null
+++ b/drivers/iommu/msm_iommu.c
@@ -0,0 +1,877 @@
+/* Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/errno.h>
+#include <linux/io.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/iommu.h>
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/of_iommu.h>
+
+#include <asm/cacheflush.h>
+#include <asm/sizes.h>
+
+#include "msm_iommu_hw-8xxx.h"
+#include "msm_iommu.h"
+#include "io-pgtable.h"
+
+#define MRC(reg, processor, op1, crn, crm, op2)				\
+__asm__ __volatile__ (							\
+"   mrc   "   #processor "," #op1 ", %0,"  #crn "," #crm "," #op2 "\n"  \
+: "=r" (reg))
+
+/* bitmap of the page sizes currently supported */
+#define MSM_IOMMU_PGSIZES	(SZ_4K | SZ_64K | SZ_1M | SZ_16M)
+
+DEFINE_SPINLOCK(msm_iommu_lock);
+static LIST_HEAD(qcom_iommu_devices);
+static struct iommu_ops msm_iommu_ops;
+
+struct msm_priv {
+	struct list_head list_attached;
+	struct iommu_domain domain;
+	struct io_pgtable_cfg	cfg;
+	struct io_pgtable_ops	*iop;
+	struct device		*dev;
+	spinlock_t		pgtlock; /* pagetable lock */
+};
+
+static struct msm_priv *to_msm_priv(struct iommu_domain *dom)
+{
+	return container_of(dom, struct msm_priv, domain);
+}
+
+static int __enable_clocks(struct msm_iommu_dev *iommu)
+{
+	int ret;
+
+	ret = clk_enable(iommu->pclk);
+	if (ret)
+		goto fail;
+
+	if (iommu->clk) {
+		ret = clk_enable(iommu->clk);
+		if (ret)
+			clk_disable(iommu->pclk);
+	}
+fail:
+	return ret;
+}
+
+static void __disable_clocks(struct msm_iommu_dev *iommu)
+{
+	if (iommu->clk)
+		clk_disable(iommu->clk);
+	clk_disable(iommu->pclk);
+}
+
+static void msm_iommu_reset(void __iomem *base, int ncb)
+{
+	int ctx;
+
+	SET_RPUE(base, 0);
+	SET_RPUEIE(base, 0);
+	SET_ESRRESTORE(base, 0);
+	SET_TBE(base, 0);
+	SET_CR(base, 0);
+	SET_SPDMBE(base, 0);
+	SET_TESTBUSCR(base, 0);
+	SET_TLBRSW(base, 0);
+	SET_GLOBAL_TLBIALL(base, 0);
+	SET_RPU_ACR(base, 0);
+	SET_TLBLKCRWE(base, 1);
+
+	for (ctx = 0; ctx < ncb; ctx++) {
+		SET_BPRCOSH(base, ctx, 0);
+		SET_BPRCISH(base, ctx, 0);
+		SET_BPRCNSH(base, ctx, 0);
+		SET_BPSHCFG(base, ctx, 0);
+		SET_BPMTCFG(base, ctx, 0);
+		SET_ACTLR(base, ctx, 0);
+		SET_SCTLR(base, ctx, 0);
+		SET_FSRRESTORE(base, ctx, 0);
+		SET_TTBR0(base, ctx, 0);
+		SET_TTBR1(base, ctx, 0);
+		SET_TTBCR(base, ctx, 0);
+		SET_BFBCR(base, ctx, 0);
+		SET_PAR(base, ctx, 0);
+		SET_FAR(base, ctx, 0);
+		SET_CTX_TLBIALL(base, ctx, 0);
+		SET_TLBFLPTER(base, ctx, 0);
+		SET_TLBSLPTER(base, ctx, 0);
+		SET_TLBLKCR(base, ctx, 0);
+		SET_CONTEXTIDR(base, ctx, 0);
+	}
+}
+
+static void __flush_iotlb(void *cookie)
+{
+	struct msm_priv *priv = cookie;
+	struct msm_iommu_dev *iommu = NULL;
+	struct msm_iommu_ctx_dev *master;
+	int ret = 0;
+
+	list_for_each_entry(iommu, &priv->list_attached, dom_node) {
+		ret = __enable_clocks(iommu);
+		if (ret)
+			goto fail;
+
+		list_for_each_entry(master, &iommu->ctx_list, list)
+			SET_CTX_TLBIALL(iommu->base, master->num, 0);
+
+		__disable_clocks(iommu);
+	}
+fail:
+	return;
+}
+
+static void __flush_iotlb_range(unsigned long iova, size_t size,
+				size_t granule, bool leaf, void *cookie)
+{
+	struct msm_priv *priv = cookie;
+	struct msm_iommu_dev *iommu = NULL;
+	struct msm_iommu_ctx_dev *master;
+	int ret = 0;
+	int temp_size;
+
+	list_for_each_entry(iommu, &priv->list_attached, dom_node) {
+		ret = __enable_clocks(iommu);
+		if (ret)
+			goto fail;
+
+		list_for_each_entry(master, &iommu->ctx_list, list) {
+			temp_size = size;
+			do {
+				iova &= TLBIVA_VA;
+				iova |= GET_CONTEXTIDR_ASID(iommu->base,
+							    master->num);
+				SET_TLBIVA(iommu->base, master->num, iova);
+				iova += granule;
+			} while (temp_size -= granule);
+		}
+
+		__disable_clocks(iommu);
+	}
+
+fail:
+	return;
+}
+
+static void __flush_iotlb_sync(void *cookie)
+{
+	/*
+	 * Nothing is needed here, the barrier to guarantee
+	 * completion of the tlb sync operation is implicitly
+	 * taken care when the iommu client does a writel before
+	 * kick starting the other master.
+	 */
+}
+
+static const struct iommu_gather_ops msm_iommu_gather_ops = {
+	.tlb_flush_all = __flush_iotlb,
+	.tlb_add_flush = __flush_iotlb_range,
+	.tlb_sync = __flush_iotlb_sync,
+};
+
+static int msm_iommu_alloc_ctx(unsigned long *map, int start, int end)
+{
+	int idx;
+
+	do {
+		idx = find_next_zero_bit(map, end, start);
+		if (idx == end)
+			return -ENOSPC;
+	} while (test_and_set_bit(idx, map));
+
+	return idx;
+}
+
+static void msm_iommu_free_ctx(unsigned long *map, int idx)
+{
+	clear_bit(idx, map);
+}
+
+static void config_mids(struct msm_iommu_dev *iommu,
+			struct msm_iommu_ctx_dev *master)
+{
+	int mid, ctx, i;
+
+	for (i = 0; i < master->num_mids; i++) {
+		mid = master->mids[i];
+		ctx = master->num;
+
+		SET_M2VCBR_N(iommu->base, mid, 0);
+		SET_CBACR_N(iommu->base, ctx, 0);
+
+		/* Set VMID = 0 */
+		SET_VMID(iommu->base, mid, 0);
+
+		/* Set the context number for that MID to this context */
+		SET_CBNDX(iommu->base, mid, ctx);
+
+		/* Set MID associated with this context bank to 0*/
+		SET_CBVMID(iommu->base, ctx, 0);
+
+		/* Set the ASID for TLB tagging for this context */
+		SET_CONTEXTIDR_ASID(iommu->base, ctx, ctx);
+
+		/* Set security bit override to be Non-secure */
+		SET_NSCFG(iommu->base, mid, 3);
+	}
+}
+
+static void __reset_context(void __iomem *base, int ctx)
+{
+	SET_BPRCOSH(base, ctx, 0);
+	SET_BPRCISH(base, ctx, 0);
+	SET_BPRCNSH(base, ctx, 0);
+	SET_BPSHCFG(base, ctx, 0);
+	SET_BPMTCFG(base, ctx, 0);
+	SET_ACTLR(base, ctx, 0);
+	SET_SCTLR(base, ctx, 0);
+	SET_FSRRESTORE(base, ctx, 0);
+	SET_TTBR0(base, ctx, 0);
+	SET_TTBR1(base, ctx, 0);
+	SET_TTBCR(base, ctx, 0);
+	SET_BFBCR(base, ctx, 0);
+	SET_PAR(base, ctx, 0);
+	SET_FAR(base, ctx, 0);
+	SET_CTX_TLBIALL(base, ctx, 0);
+	SET_TLBFLPTER(base, ctx, 0);
+	SET_TLBSLPTER(base, ctx, 0);
+	SET_TLBLKCR(base, ctx, 0);
+}
+
+static void __program_context(void __iomem *base, int ctx,
+			      struct msm_priv *priv)
+{
+	__reset_context(base, ctx);
+
+	/* Turn on TEX Remap */
+	SET_TRE(base, ctx, 1);
+	SET_AFE(base, ctx, 1);
+
+	/* Set up HTW mode */
+	/* TLB miss configuration: perform HTW on miss */
+	SET_TLBMCFG(base, ctx, 0x3);
+
+	/* V2P configuration: HTW for access */
+	SET_V2PCFG(base, ctx, 0x3);
+
+	SET_TTBCR(base, ctx, priv->cfg.arm_v7s_cfg.tcr);
+	SET_TTBR0(base, ctx, priv->cfg.arm_v7s_cfg.ttbr[0]);
+	SET_TTBR1(base, ctx, priv->cfg.arm_v7s_cfg.ttbr[1]);
+
+	/* Set prrr and nmrr */
+	SET_PRRR(base, ctx, priv->cfg.arm_v7s_cfg.prrr);
+	SET_NMRR(base, ctx, priv->cfg.arm_v7s_cfg.nmrr);
+
+	/* Invalidate the TLB for this context */
+	SET_CTX_TLBIALL(base, ctx, 0);
+
+	/* Set interrupt number to "secure" interrupt */
+	SET_IRPTNDX(base, ctx, 0);
+
+	/* Enable context fault interrupt */
+	SET_CFEIE(base, ctx, 1);
+
+	/* Stall access on a context fault and let the handler deal with it */
+	SET_CFCFG(base, ctx, 1);
+
+	/* Redirect all cacheable requests to L2 slave port. */
+	SET_RCISH(base, ctx, 1);
+	SET_RCOSH(base, ctx, 1);
+	SET_RCNSH(base, ctx, 1);
+
+	/* Turn on BFB prefetch */
+	SET_BFBDFE(base, ctx, 1);
+
+	/* Enable the MMU */
+	SET_M(base, ctx, 1);
+}
+
+static struct iommu_domain *msm_iommu_domain_alloc(unsigned type)
+{
+	struct msm_priv *priv;
+
+	if (type != IOMMU_DOMAIN_UNMANAGED)
+		return NULL;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		goto fail_nomem;
+
+	INIT_LIST_HEAD(&priv->list_attached);
+
+	priv->domain.geometry.aperture_start = 0;
+	priv->domain.geometry.aperture_end   = (1ULL << 32) - 1;
+	priv->domain.geometry.force_aperture = true;
+
+	return &priv->domain;
+
+fail_nomem:
+	kfree(priv);
+	return NULL;
+}
+
+static void msm_iommu_domain_free(struct iommu_domain *domain)
+{
+	struct msm_priv *priv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&msm_iommu_lock, flags);
+	priv = to_msm_priv(domain);
+	kfree(priv);
+	spin_unlock_irqrestore(&msm_iommu_lock, flags);
+}
+
+static int msm_iommu_domain_config(struct msm_priv *priv)
+{
+	spin_lock_init(&priv->pgtlock);
+
+	priv->cfg = (struct io_pgtable_cfg) {
+		.quirks = IO_PGTABLE_QUIRK_TLBI_ON_MAP,
+		.pgsize_bitmap = msm_iommu_ops.pgsize_bitmap,
+		.ias = 32,
+		.oas = 32,
+		.tlb = &msm_iommu_gather_ops,
+		.iommu_dev = priv->dev,
+	};
+
+	priv->iop = alloc_io_pgtable_ops(ARM_V7S, &priv->cfg, priv);
+	if (!priv->iop) {
+		dev_err(priv->dev, "Failed to allocate pgtable\n");
+		return -EINVAL;
+	}
+
+	msm_iommu_ops.pgsize_bitmap = priv->cfg.pgsize_bitmap;
+
+	return 0;
+}
+
+/* Must be called under msm_iommu_lock */
+static struct msm_iommu_dev *find_iommu_for_dev(struct device *dev)
+{
+	struct msm_iommu_dev *iommu, *ret = NULL;
+	struct msm_iommu_ctx_dev *master;
+
+	list_for_each_entry(iommu, &qcom_iommu_devices, dev_node) {
+		master = list_first_entry(&iommu->ctx_list,
+					  struct msm_iommu_ctx_dev,
+					  list);
+		if (master->of_node == dev->of_node) {
+			ret = iommu;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int msm_iommu_add_device(struct device *dev)
+{
+	struct msm_iommu_dev *iommu;
+	struct iommu_group *group;
+	unsigned long flags;
+
+	spin_lock_irqsave(&msm_iommu_lock, flags);
+	iommu = find_iommu_for_dev(dev);
+	spin_unlock_irqrestore(&msm_iommu_lock, flags);
+
+	if (iommu)
+		iommu_device_link(&iommu->iommu, dev);
+	else
+		return -ENODEV;
+
+	group = iommu_group_get_for_dev(dev);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	iommu_group_put(group);
+
+	return 0;
+}
+
+static void msm_iommu_remove_device(struct device *dev)
+{
+	struct msm_iommu_dev *iommu;
+	unsigned long flags;
+
+	spin_lock_irqsave(&msm_iommu_lock, flags);
+	iommu = find_iommu_for_dev(dev);
+	spin_unlock_irqrestore(&msm_iommu_lock, flags);
+
+	if (iommu)
+		iommu_device_unlink(&iommu->iommu, dev);
+
+	iommu_group_remove_device(dev);
+}
+
+static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+{
+	int ret = 0;
+	unsigned long flags;
+	struct msm_iommu_dev *iommu;
+	struct msm_priv *priv = to_msm_priv(domain);
+	struct msm_iommu_ctx_dev *master;
+
+	priv->dev = dev;
+	msm_iommu_domain_config(priv);
+
+	spin_lock_irqsave(&msm_iommu_lock, flags);
+	list_for_each_entry(iommu, &qcom_iommu_devices, dev_node) {
+		master = list_first_entry(&iommu->ctx_list,
+					  struct msm_iommu_ctx_dev,
+					  list);
+		if (master->of_node == dev->of_node) {
+			ret = __enable_clocks(iommu);
+			if (ret)
+				goto fail;
+
+			list_for_each_entry(master, &iommu->ctx_list, list) {
+				if (master->num) {
+					dev_err(dev, "domain already attached");
+					ret = -EEXIST;
+					goto fail;
+				}
+				master->num =
+					msm_iommu_alloc_ctx(iommu->context_map,
+							    0, iommu->ncb);
+					if (IS_ERR_VALUE(master->num)) {
+						ret = -ENODEV;
+						goto fail;
+					}
+				config_mids(iommu, master);
+				__program_context(iommu->base, master->num,
+						  priv);
+			}
+			__disable_clocks(iommu);
+			list_add(&iommu->dom_node, &priv->list_attached);
+		}
+	}
+
+fail:
+	spin_unlock_irqrestore(&msm_iommu_lock, flags);
+
+	return ret;
+}
+
+static void msm_iommu_detach_dev(struct iommu_domain *domain,
+				 struct device *dev)
+{
+	struct msm_priv *priv = to_msm_priv(domain);
+	unsigned long flags;
+	struct msm_iommu_dev *iommu;
+	struct msm_iommu_ctx_dev *master;
+	int ret;
+
+	free_io_pgtable_ops(priv->iop);
+
+	spin_lock_irqsave(&msm_iommu_lock, flags);
+	list_for_each_entry(iommu, &priv->list_attached, dom_node) {
+		ret = __enable_clocks(iommu);
+		if (ret)
+			goto fail;
+
+		list_for_each_entry(master, &iommu->ctx_list, list) {
+			msm_iommu_free_ctx(iommu->context_map, master->num);
+			__reset_context(iommu->base, master->num);
+		}
+		__disable_clocks(iommu);
+	}
+fail:
+	spin_unlock_irqrestore(&msm_iommu_lock, flags);
+}
+
+static int msm_iommu_map(struct iommu_domain *domain, unsigned long iova,
+			 phys_addr_t pa, size_t len, int prot)
+{
+	struct msm_priv *priv = to_msm_priv(domain);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&priv->pgtlock, flags);
+	ret = priv->iop->map(priv->iop, iova, pa, len, prot);
+	spin_unlock_irqrestore(&priv->pgtlock, flags);
+
+	return ret;
+}
+
+static size_t msm_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
+			      size_t len)
+{
+	struct msm_priv *priv = to_msm_priv(domain);
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->pgtlock, flags);
+	len = priv->iop->unmap(priv->iop, iova, len);
+	spin_unlock_irqrestore(&priv->pgtlock, flags);
+
+	return len;
+}
+
+static phys_addr_t msm_iommu_iova_to_phys(struct iommu_domain *domain,
+					  dma_addr_t va)
+{
+	struct msm_priv *priv;
+	struct msm_iommu_dev *iommu;
+	struct msm_iommu_ctx_dev *master;
+	unsigned int par;
+	unsigned long flags;
+	phys_addr_t ret = 0;
+
+	spin_lock_irqsave(&msm_iommu_lock, flags);
+
+	priv = to_msm_priv(domain);
+	iommu = list_first_entry(&priv->list_attached,
+				 struct msm_iommu_dev, dom_node);
+
+	if (list_empty(&iommu->ctx_list))
+		goto fail;
+
+	master = list_first_entry(&iommu->ctx_list,
+				  struct msm_iommu_ctx_dev, list);
+	if (!master)
+		goto fail;
+
+	ret = __enable_clocks(iommu);
+	if (ret)
+		goto fail;
+
+	/* Invalidate context TLB */
+	SET_CTX_TLBIALL(iommu->base, master->num, 0);
+	SET_V2PPR(iommu->base, master->num, va & V2Pxx_VA);
+
+	par = GET_PAR(iommu->base, master->num);
+
+	/* We are dealing with a supersection */
+	if (GET_NOFAULT_SS(iommu->base, master->num))
+		ret = (par & 0xFF000000) | (va & 0x00FFFFFF);
+	else	/* Upper 20 bits from PAR, lower 12 from VA */
+		ret = (par & 0xFFFFF000) | (va & 0x00000FFF);
+
+	if (GET_FAULT(iommu->base, master->num))
+		ret = 0;
+
+	__disable_clocks(iommu);
+fail:
+	spin_unlock_irqrestore(&msm_iommu_lock, flags);
+	return ret;
+}
+
+static bool msm_iommu_capable(enum iommu_cap cap)
+{
+	return false;
+}
+
+static void print_ctx_regs(void __iomem *base, int ctx)
+{
+	unsigned int fsr = GET_FSR(base, ctx);
+	pr_err("FAR    = %08x    PAR    = %08x\n",
+	       GET_FAR(base, ctx), GET_PAR(base, ctx));
+	pr_err("FSR    = %08x [%s%s%s%s%s%s%s%s%s%s]\n", fsr,
+			(fsr & 0x02) ? "TF " : "",
+			(fsr & 0x04) ? "AFF " : "",
+			(fsr & 0x08) ? "APF " : "",
+			(fsr & 0x10) ? "TLBMF " : "",
+			(fsr & 0x20) ? "HTWDEEF " : "",
+			(fsr & 0x40) ? "HTWSEEF " : "",
+			(fsr & 0x80) ? "MHF " : "",
+			(fsr & 0x10000) ? "SL " : "",
+			(fsr & 0x40000000) ? "SS " : "",
+			(fsr & 0x80000000) ? "MULTI " : "");
+
+	pr_err("FSYNR0 = %08x    FSYNR1 = %08x\n",
+	       GET_FSYNR0(base, ctx), GET_FSYNR1(base, ctx));
+	pr_err("TTBR0  = %08x    TTBR1  = %08x\n",
+	       GET_TTBR0(base, ctx), GET_TTBR1(base, ctx));
+	pr_err("SCTLR  = %08x    ACTLR  = %08x\n",
+	       GET_SCTLR(base, ctx), GET_ACTLR(base, ctx));
+}
+
+static void insert_iommu_master(struct device *dev,
+				struct msm_iommu_dev **iommu,
+				struct of_phandle_args *spec)
+{
+	struct msm_iommu_ctx_dev *master = dev->archdata.iommu;
+	int sid;
+
+	if (list_empty(&(*iommu)->ctx_list)) {
+		master = kzalloc(sizeof(*master), GFP_ATOMIC);
+		master->of_node = dev->of_node;
+		list_add(&master->list, &(*iommu)->ctx_list);
+		dev->archdata.iommu = master;
+	}
+
+	for (sid = 0; sid < master->num_mids; sid++)
+		if (master->mids[sid] == spec->args[0]) {
+			dev_warn(dev, "Stream ID 0x%hx repeated; ignoring\n",
+				 sid);
+			return;
+		}
+
+	master->mids[master->num_mids++] = spec->args[0];
+}
+
+static int qcom_iommu_of_xlate(struct device *dev,
+			       struct of_phandle_args *spec)
+{
+	struct msm_iommu_dev *iommu = NULL, *iter;
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&msm_iommu_lock, flags);
+	list_for_each_entry(iter, &qcom_iommu_devices, dev_node) {
+		if (iter->dev->of_node == spec->np) {
+			iommu = iter;
+			break;
+		}
+	}
+
+	if (!iommu) {
+		ret = -ENODEV;
+		goto fail;
+	}
+
+	insert_iommu_master(dev, &iommu, spec);
+fail:
+	spin_unlock_irqrestore(&msm_iommu_lock, flags);
+
+	return ret;
+}
+
+irqreturn_t msm_iommu_fault_handler(int irq, void *dev_id)
+{
+	struct msm_iommu_dev *iommu = dev_id;
+	unsigned int fsr;
+	int i, ret;
+
+	spin_lock(&msm_iommu_lock);
+
+	if (!iommu) {
+		pr_err("Invalid device ID in context interrupt handler\n");
+		goto fail;
+	}
+
+	pr_err("Unexpected IOMMU page fault!\n");
+	pr_err("base = %08x\n", (unsigned int)iommu->base);
+
+	ret = __enable_clocks(iommu);
+	if (ret)
+		goto fail;
+
+	for (i = 0; i < iommu->ncb; i++) {
+		fsr = GET_FSR(iommu->base, i);
+		if (fsr) {
+			pr_err("Fault occurred in context %d.\n", i);
+			pr_err("Interesting registers:\n");
+			print_ctx_regs(iommu->base, i);
+			SET_FSR(iommu->base, i, 0x4000000F);
+		}
+	}
+	__disable_clocks(iommu);
+fail:
+	spin_unlock(&msm_iommu_lock);
+	return 0;
+}
+
+static struct iommu_ops msm_iommu_ops = {
+	.capable = msm_iommu_capable,
+	.domain_alloc = msm_iommu_domain_alloc,
+	.domain_free = msm_iommu_domain_free,
+	.attach_dev = msm_iommu_attach_dev,
+	.detach_dev = msm_iommu_detach_dev,
+	.map = msm_iommu_map,
+	.unmap = msm_iommu_unmap,
+	.iova_to_phys = msm_iommu_iova_to_phys,
+	.add_device = msm_iommu_add_device,
+	.remove_device = msm_iommu_remove_device,
+	.device_group = generic_device_group,
+	.pgsize_bitmap = MSM_IOMMU_PGSIZES,
+	.of_xlate = qcom_iommu_of_xlate,
+};
+
+static int msm_iommu_probe(struct platform_device *pdev)
+{
+	struct resource *r;
+	resource_size_t ioaddr;
+	struct msm_iommu_dev *iommu;
+	int ret, par, val;
+
+	iommu = devm_kzalloc(&pdev->dev, sizeof(*iommu), GFP_KERNEL);
+	if (!iommu)
+		return -ENODEV;
+
+	iommu->dev = &pdev->dev;
+	INIT_LIST_HEAD(&iommu->ctx_list);
+
+	iommu->pclk = devm_clk_get(iommu->dev, "smmu_pclk");
+	if (IS_ERR(iommu->pclk)) {
+		dev_err(iommu->dev, "could not get smmu_pclk\n");
+		return PTR_ERR(iommu->pclk);
+	}
+
+	ret = clk_prepare(iommu->pclk);
+	if (ret) {
+		dev_err(iommu->dev, "could not prepare smmu_pclk\n");
+		return ret;
+	}
+
+	iommu->clk = devm_clk_get(iommu->dev, "iommu_clk");
+	if (IS_ERR(iommu->clk)) {
+		dev_err(iommu->dev, "could not get iommu_clk\n");
+		clk_unprepare(iommu->pclk);
+		return PTR_ERR(iommu->clk);
+	}
+
+	ret = clk_prepare(iommu->clk);
+	if (ret) {
+		dev_err(iommu->dev, "could not prepare iommu_clk\n");
+		clk_unprepare(iommu->pclk);
+		return ret;
+	}
+
+	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	iommu->base = devm_ioremap_resource(iommu->dev, r);
+	if (IS_ERR(iommu->base)) {
+		dev_err(iommu->dev, "could not get iommu base\n");
+		ret = PTR_ERR(iommu->base);
+		goto fail;
+	}
+	ioaddr = r->start;
+
+	iommu->irq = platform_get_irq(pdev, 0);
+	if (iommu->irq < 0) {
+		dev_err(iommu->dev, "could not get iommu irq\n");
+		ret = -ENODEV;
+		goto fail;
+	}
+
+	ret = of_property_read_u32(iommu->dev->of_node, "qcom,ncb", &val);
+	if (ret) {
+		dev_err(iommu->dev, "could not get ncb\n");
+		goto fail;
+	}
+	iommu->ncb = val;
+
+	msm_iommu_reset(iommu->base, iommu->ncb);
+	SET_M(iommu->base, 0, 1);
+	SET_PAR(iommu->base, 0, 0);
+	SET_V2PCFG(iommu->base, 0, 1);
+	SET_V2PPR(iommu->base, 0, 0);
+	par = GET_PAR(iommu->base, 0);
+	SET_V2PCFG(iommu->base, 0, 0);
+	SET_M(iommu->base, 0, 0);
+
+	if (!par) {
+		pr_err("Invalid PAR value detected\n");
+		ret = -ENODEV;
+		goto fail;
+	}
+
+	ret = devm_request_threaded_irq(iommu->dev, iommu->irq, NULL,
+					msm_iommu_fault_handler,
+					IRQF_ONESHOT | IRQF_SHARED,
+					"msm_iommu_secure_irpt_handler",
+					iommu);
+	if (ret) {
+		pr_err("Request IRQ %d failed with ret=%d\n", iommu->irq, ret);
+		goto fail;
+	}
+
+	list_add(&iommu->dev_node, &qcom_iommu_devices);
+
+	ret = iommu_device_sysfs_add(&iommu->iommu, iommu->dev, NULL,
+				     "msm-smmu.%pa", &ioaddr);
+	if (ret) {
+		pr_err("Could not add msm-smmu at %pa to sysfs\n", &ioaddr);
+		goto fail;
+	}
+
+	iommu_device_set_ops(&iommu->iommu, &msm_iommu_ops);
+	iommu_device_set_fwnode(&iommu->iommu, &pdev->dev.of_node->fwnode);
+
+	ret = iommu_device_register(&iommu->iommu);
+	if (ret) {
+		pr_err("Could not register msm-smmu at %pa\n", &ioaddr);
+		goto fail;
+	}
+
+	bus_set_iommu(&platform_bus_type, &msm_iommu_ops);
+
+	pr_info("device mapped at %p, irq %d with %d ctx banks\n",
+		iommu->base, iommu->irq, iommu->ncb);
+
+	return ret;
+fail:
+	clk_unprepare(iommu->clk);
+	clk_unprepare(iommu->pclk);
+	return ret;
+}
+
+static const struct of_device_id msm_iommu_dt_match[] = {
+	{ .compatible = "qcom,apq8064-iommu" },
+	{}
+};
+
+static int msm_iommu_remove(struct platform_device *pdev)
+{
+	struct msm_iommu_dev *iommu = platform_get_drvdata(pdev);
+
+	clk_unprepare(iommu->clk);
+	clk_unprepare(iommu->pclk);
+	return 0;
+}
+
+static struct platform_driver msm_iommu_driver = {
+	.driver = {
+		.name	= "msm_iommu",
+		.of_match_table = msm_iommu_dt_match,
+	},
+	.probe		= msm_iommu_probe,
+	.remove		= msm_iommu_remove,
+};
+
+static int __init msm_iommu_driver_init(void)
+{
+	int ret;
+
+	ret = platform_driver_register(&msm_iommu_driver);
+	if (ret != 0)
+		pr_err("Failed to register IOMMU driver\n");
+
+	return ret;
+}
+
+static void __exit msm_iommu_driver_exit(void)
+{
+	platform_driver_unregister(&msm_iommu_driver);
+}
+
+subsys_initcall(msm_iommu_driver_init);
+module_exit(msm_iommu_driver_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Stepan Moskovchenko <stepanm@codeaurora.org>");
diff --git a/drivers/iommu/msm_iommu.h b/drivers/iommu/msm_iommu.h
new file mode 100644
index 000000000..ae92d2779
--- /dev/null
+++ b/drivers/iommu/msm_iommu.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef MSM_IOMMU_H
+#define MSM_IOMMU_H
+
+#include <linux/interrupt.h>
+#include <linux/iommu.h>
+#include <linux/clk.h>
+
+/* Sharability attributes of MSM IOMMU mappings */
+#define MSM_IOMMU_ATTR_NON_SH		0x0
+#define MSM_IOMMU_ATTR_SH		0x4
+
+/* Cacheability attributes of MSM IOMMU mappings */
+#define MSM_IOMMU_ATTR_NONCACHED	0x0
+#define MSM_IOMMU_ATTR_CACHED_WB_WA	0x1
+#define MSM_IOMMU_ATTR_CACHED_WB_NWA	0x2
+#define MSM_IOMMU_ATTR_CACHED_WT	0x3
+
+/* Mask for the cache policy attribute */
+#define MSM_IOMMU_CP_MASK		0x03
+
+/* Maximum number of Machine IDs that we are allowing to be mapped to the same
+ * context bank. The number of MIDs mapped to the same CB does not affect
+ * performance, but there is a practical limit on how many distinct MIDs may
+ * be present. These mappings are typically determined at design time and are
+ * not expected to change at run time.
+ */
+#define MAX_NUM_MIDS	32
+
+/* Maximum number of context banks that can be present in IOMMU */
+#define IOMMU_MAX_CBS	128
+
+/**
+ * struct msm_iommu_dev - a single IOMMU hardware instance
+ * ncb		Number of context banks present on this IOMMU HW instance
+ * dev:		IOMMU device
+ * irq:		Interrupt number
+ * clk:		The bus clock for this IOMMU hardware instance
+ * pclk:	The clock for the IOMMU bus interconnect
+ * dev_node:	list head in qcom_iommu_device_list
+ * dom_node:	list head for domain
+ * ctx_list:	list of 'struct msm_iommu_ctx_dev'
+ * context_map: Bitmap to track allocated context banks
+ */
+struct msm_iommu_dev {
+	void __iomem *base;
+	int ncb;
+	struct device *dev;
+	int irq;
+	struct clk *clk;
+	struct clk *pclk;
+	struct list_head dev_node;
+	struct list_head dom_node;
+	struct list_head ctx_list;
+	DECLARE_BITMAP(context_map, IOMMU_MAX_CBS);
+
+	struct iommu_device iommu;
+};
+
+/**
+ * struct msm_iommu_ctx_dev - an IOMMU context bank instance
+ * of_node	node ptr of client device
+ * num		Index of this context bank within the hardware
+ * mids		List of Machine IDs that are to be mapped into this context
+ *		bank, terminated by -1. The MID is a set of signals on the
+ *		AXI bus that identifies the function associated with a specific
+ *		memory request. (See ARM spec).
+ * num_mids	Total number of mids
+ * node		list head in ctx_list
+ */
+struct msm_iommu_ctx_dev {
+	struct device_node *of_node;
+	int num;
+	int mids[MAX_NUM_MIDS];
+	int num_mids;
+	struct list_head list;
+};
+
+/*
+ * Interrupt handler for the IOMMU context fault interrupt. Hooking the
+ * interrupt is not supported in the API yet, but this will print an error
+ * message and dump useful IOMMU registers.
+ */
+irqreturn_t msm_iommu_fault_handler(int irq, void *dev_id);
+
+#endif
diff --git a/drivers/iommu/msm_iommu_hw-8xxx.h b/drivers/iommu/msm_iommu_hw-8xxx.h
new file mode 100644
index 000000000..fc160101d
--- /dev/null
+++ b/drivers/iommu/msm_iommu_hw-8xxx.h
@@ -0,0 +1,1865 @@
+/* Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __ARCH_ARM_MACH_MSM_IOMMU_HW_8XXX_H
+#define __ARCH_ARM_MACH_MSM_IOMMU_HW_8XXX_H
+
+#define CTX_SHIFT 12
+
+#define GET_GLOBAL_REG(reg, base) (readl((base) + (reg)))
+#define GET_CTX_REG(reg, base, ctx) \
+				(readl((base) + (reg) + ((ctx) << CTX_SHIFT)))
+
+#define SET_GLOBAL_REG(reg, base, val)	writel((val), ((base) + (reg)))
+
+#define SET_CTX_REG(reg, base, ctx, val) \
+			writel((val), ((base) + (reg) + ((ctx) << CTX_SHIFT)))
+
+/* Wrappers for numbered registers */
+#define SET_GLOBAL_REG_N(b, n, r, v) SET_GLOBAL_REG(b, ((r) + (n << 2)), (v))
+#define GET_GLOBAL_REG_N(b, n, r)    GET_GLOBAL_REG(b, ((r) + (n << 2)))
+
+/* Field wrappers */
+#define GET_GLOBAL_FIELD(b, r, F)    GET_FIELD(((b) + (r)), F##_MASK, F##_SHIFT)
+#define GET_CONTEXT_FIELD(b, c, r, F)	\
+	GET_FIELD(((b) + (r) + ((c) << CTX_SHIFT)), F##_MASK, F##_SHIFT)
+
+#define SET_GLOBAL_FIELD(b, r, F, v) \
+	SET_FIELD(((b) + (r)), F##_MASK, F##_SHIFT, (v))
+#define SET_CONTEXT_FIELD(b, c, r, F, v)	\
+	SET_FIELD(((b) + (r) + ((c) << CTX_SHIFT)), F##_MASK, F##_SHIFT, (v))
+
+#define GET_FIELD(addr, mask, shift)  ((readl(addr) >> (shift)) & (mask))
+
+#define SET_FIELD(addr, mask, shift, v) \
+do { \
+	int t = readl(addr); \
+	writel((t & ~((mask) << (shift))) + (((v) & (mask)) << (shift)), addr);\
+} while (0)
+
+
+#define NUM_FL_PTE	4096
+#define NUM_SL_PTE	256
+#define NUM_TEX_CLASS	8
+
+/* First-level page table bits */
+#define FL_BASE_MASK		0xFFFFFC00
+#define FL_TYPE_TABLE		(1 << 0)
+#define FL_TYPE_SECT		(2 << 0)
+#define FL_SUPERSECTION		(1 << 18)
+#define FL_AP_WRITE		(1 << 10)
+#define FL_AP_READ		(1 << 11)
+#define FL_SHARED		(1 << 16)
+#define FL_BUFFERABLE		(1 << 2)
+#define FL_CACHEABLE		(1 << 3)
+#define FL_TEX0			(1 << 12)
+#define FL_OFFSET(va)		(((va) & 0xFFF00000) >> 20)
+#define FL_NG			(1 << 17)
+
+/* Second-level page table bits */
+#define SL_BASE_MASK_LARGE	0xFFFF0000
+#define SL_BASE_MASK_SMALL	0xFFFFF000
+#define SL_TYPE_LARGE		(1 << 0)
+#define SL_TYPE_SMALL		(2 << 0)
+#define SL_AP0			(1 << 4)
+#define SL_AP1			(2 << 4)
+#define SL_SHARED		(1 << 10)
+#define SL_BUFFERABLE		(1 << 2)
+#define SL_CACHEABLE		(1 << 3)
+#define SL_TEX0			(1 << 6)
+#define SL_OFFSET(va)		(((va) & 0xFF000) >> 12)
+#define SL_NG			(1 << 11)
+
+/* Memory type and cache policy attributes */
+#define MT_SO			0
+#define MT_DEV			1
+#define MT_NORMAL		2
+#define CP_NONCACHED		0
+#define CP_WB_WA		1
+#define CP_WT			2
+#define CP_WB_NWA		3
+
+/* Global register setters / getters */
+#define SET_M2VCBR_N(b, N, v)	 SET_GLOBAL_REG_N(M2VCBR_N, N, (b), (v))
+#define SET_CBACR_N(b, N, v)	 SET_GLOBAL_REG_N(CBACR_N, N, (b), (v))
+#define SET_TLBRSW(b, v)	 SET_GLOBAL_REG(TLBRSW, (b), (v))
+#define SET_TLBTR0(b, v)	 SET_GLOBAL_REG(TLBTR0, (b), (v))
+#define SET_TLBTR1(b, v)	 SET_GLOBAL_REG(TLBTR1, (b), (v))
+#define SET_TLBTR2(b, v)	 SET_GLOBAL_REG(TLBTR2, (b), (v))
+#define SET_TESTBUSCR(b, v)	 SET_GLOBAL_REG(TESTBUSCR, (b), (v))
+#define SET_GLOBAL_TLBIALL(b, v) SET_GLOBAL_REG(GLOBAL_TLBIALL, (b), (v))
+#define SET_TLBIVMID(b, v)	 SET_GLOBAL_REG(TLBIVMID, (b), (v))
+#define SET_CR(b, v)		 SET_GLOBAL_REG(CR, (b), (v))
+#define SET_EAR(b, v)		 SET_GLOBAL_REG(EAR, (b), (v))
+#define SET_ESR(b, v)		 SET_GLOBAL_REG(ESR, (b), (v))
+#define SET_ESRRESTORE(b, v)	 SET_GLOBAL_REG(ESRRESTORE, (b), (v))
+#define SET_ESYNR0(b, v)	 SET_GLOBAL_REG(ESYNR0, (b), (v))
+#define SET_ESYNR1(b, v)	 SET_GLOBAL_REG(ESYNR1, (b), (v))
+#define SET_RPU_ACR(b, v)	 SET_GLOBAL_REG(RPU_ACR, (b), (v))
+
+#define GET_M2VCBR_N(b, N)	 GET_GLOBAL_REG_N(M2VCBR_N, N, (b))
+#define GET_CBACR_N(b, N)	 GET_GLOBAL_REG_N(CBACR_N, N, (b))
+#define GET_TLBTR0(b)		 GET_GLOBAL_REG(TLBTR0, (b))
+#define GET_TLBTR1(b)		 GET_GLOBAL_REG(TLBTR1, (b))
+#define GET_TLBTR2(b)		 GET_GLOBAL_REG(TLBTR2, (b))
+#define GET_TESTBUSCR(b)	 GET_GLOBAL_REG(TESTBUSCR, (b))
+#define GET_GLOBAL_TLBIALL(b)	 GET_GLOBAL_REG(GLOBAL_TLBIALL, (b))
+#define GET_TLBIVMID(b)		 GET_GLOBAL_REG(TLBIVMID, (b))
+#define GET_CR(b)		 GET_GLOBAL_REG(CR, (b))
+#define GET_EAR(b)		 GET_GLOBAL_REG(EAR, (b))
+#define GET_ESR(b)		 GET_GLOBAL_REG(ESR, (b))
+#define GET_ESRRESTORE(b)	 GET_GLOBAL_REG(ESRRESTORE, (b))
+#define GET_ESYNR0(b)		 GET_GLOBAL_REG(ESYNR0, (b))
+#define GET_ESYNR1(b)		 GET_GLOBAL_REG(ESYNR1, (b))
+#define GET_REV(b)		 GET_GLOBAL_REG(REV, (b))
+#define GET_IDR(b)		 GET_GLOBAL_REG(IDR, (b))
+#define GET_RPU_ACR(b)		 GET_GLOBAL_REG(RPU_ACR, (b))
+
+
+/* Context register setters/getters */
+#define SET_SCTLR(b, c, v)	 SET_CTX_REG(SCTLR, (b), (c), (v))
+#define SET_ACTLR(b, c, v)	 SET_CTX_REG(ACTLR, (b), (c), (v))
+#define SET_CONTEXTIDR(b, c, v)	 SET_CTX_REG(CONTEXTIDR, (b), (c), (v))
+#define SET_TTBR0(b, c, v)	 SET_CTX_REG(TTBR0, (b), (c), (v))
+#define SET_TTBR1(b, c, v)	 SET_CTX_REG(TTBR1, (b), (c), (v))
+#define SET_TTBCR(b, c, v)	 SET_CTX_REG(TTBCR, (b), (c), (v))
+#define SET_PAR(b, c, v)	 SET_CTX_REG(PAR, (b), (c), (v))
+#define SET_FSR(b, c, v)	 SET_CTX_REG(FSR, (b), (c), (v))
+#define SET_FSRRESTORE(b, c, v)	 SET_CTX_REG(FSRRESTORE, (b), (c), (v))
+#define SET_FAR(b, c, v)	 SET_CTX_REG(FAR, (b), (c), (v))
+#define SET_FSYNR0(b, c, v)	 SET_CTX_REG(FSYNR0, (b), (c), (v))
+#define SET_FSYNR1(b, c, v)	 SET_CTX_REG(FSYNR1, (b), (c), (v))
+#define SET_PRRR(b, c, v)	 SET_CTX_REG(PRRR, (b), (c), (v))
+#define SET_NMRR(b, c, v)	 SET_CTX_REG(NMRR, (b), (c), (v))
+#define SET_TLBLKCR(b, c, v)	 SET_CTX_REG(TLBLCKR, (b), (c), (v))
+#define SET_V2PSR(b, c, v)	 SET_CTX_REG(V2PSR, (b), (c), (v))
+#define SET_TLBFLPTER(b, c, v)	 SET_CTX_REG(TLBFLPTER, (b), (c), (v))
+#define SET_TLBSLPTER(b, c, v)	 SET_CTX_REG(TLBSLPTER, (b), (c), (v))
+#define SET_BFBCR(b, c, v)	 SET_CTX_REG(BFBCR, (b), (c), (v))
+#define SET_CTX_TLBIALL(b, c, v) SET_CTX_REG(CTX_TLBIALL, (b), (c), (v))
+#define SET_TLBIASID(b, c, v)	 SET_CTX_REG(TLBIASID, (b), (c), (v))
+#define SET_TLBIVA(b, c, v)	 SET_CTX_REG(TLBIVA, (b), (c), (v))
+#define SET_TLBIVAA(b, c, v)	 SET_CTX_REG(TLBIVAA, (b), (c), (v))
+#define SET_V2PPR(b, c, v)	 SET_CTX_REG(V2PPR, (b), (c), (v))
+#define SET_V2PPW(b, c, v)	 SET_CTX_REG(V2PPW, (b), (c), (v))
+#define SET_V2PUR(b, c, v)	 SET_CTX_REG(V2PUR, (b), (c), (v))
+#define SET_V2PUW(b, c, v)	 SET_CTX_REG(V2PUW, (b), (c), (v))
+#define SET_RESUME(b, c, v)	 SET_CTX_REG(RESUME, (b), (c), (v))
+
+#define GET_SCTLR(b, c)		 GET_CTX_REG(SCTLR, (b), (c))
+#define GET_ACTLR(b, c)		 GET_CTX_REG(ACTLR, (b), (c))
+#define GET_CONTEXTIDR(b, c)	 GET_CTX_REG(CONTEXTIDR, (b), (c))
+#define GET_TTBR0(b, c)		 GET_CTX_REG(TTBR0, (b), (c))
+#define GET_TTBR1(b, c)		 GET_CTX_REG(TTBR1, (b), (c))
+#define GET_TTBCR(b, c)		 GET_CTX_REG(TTBCR, (b), (c))
+#define GET_PAR(b, c)		 GET_CTX_REG(PAR, (b), (c))
+#define GET_FSR(b, c)		 GET_CTX_REG(FSR, (b), (c))
+#define GET_FSRRESTORE(b, c)	 GET_CTX_REG(FSRRESTORE, (b), (c))
+#define GET_FAR(b, c)		 GET_CTX_REG(FAR, (b), (c))
+#define GET_FSYNR0(b, c)	 GET_CTX_REG(FSYNR0, (b), (c))
+#define GET_FSYNR1(b, c)	 GET_CTX_REG(FSYNR1, (b), (c))
+#define GET_PRRR(b, c)		 GET_CTX_REG(PRRR, (b), (c))
+#define GET_NMRR(b, c)		 GET_CTX_REG(NMRR, (b), (c))
+#define GET_TLBLCKR(b, c)	 GET_CTX_REG(TLBLCKR, (b), (c))
+#define GET_V2PSR(b, c)		 GET_CTX_REG(V2PSR, (b), (c))
+#define GET_TLBFLPTER(b, c)	 GET_CTX_REG(TLBFLPTER, (b), (c))
+#define GET_TLBSLPTER(b, c)	 GET_CTX_REG(TLBSLPTER, (b), (c))
+#define GET_BFBCR(b, c)		 GET_CTX_REG(BFBCR, (b), (c))
+#define GET_CTX_TLBIALL(b, c)	 GET_CTX_REG(CTX_TLBIALL, (b), (c))
+#define GET_TLBIASID(b, c)	 GET_CTX_REG(TLBIASID, (b), (c))
+#define GET_TLBIVA(b, c)	 GET_CTX_REG(TLBIVA, (b), (c))
+#define GET_TLBIVAA(b, c)	 GET_CTX_REG(TLBIVAA, (b), (c))
+#define GET_V2PPR(b, c)		 GET_CTX_REG(V2PPR, (b), (c))
+#define GET_V2PPW(b, c)		 GET_CTX_REG(V2PPW, (b), (c))
+#define GET_V2PUR(b, c)		 GET_CTX_REG(V2PUR, (b), (c))
+#define GET_V2PUW(b, c)		 GET_CTX_REG(V2PUW, (b), (c))
+#define GET_RESUME(b, c)	 GET_CTX_REG(RESUME, (b), (c))
+
+
+/* Global field setters / getters */
+/* Global Field Setters: */
+/* CBACR_N */
+#define SET_RWVMID(b, n, v)   SET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), RWVMID, v)
+#define SET_RWE(b, n, v)      SET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), RWE, v)
+#define SET_RWGE(b, n, v)     SET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), RWGE, v)
+#define SET_CBVMID(b, n, v)   SET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), CBVMID, v)
+#define SET_IRPTNDX(b, n, v)  SET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), IRPTNDX, v)
+
+
+/* M2VCBR_N */
+#define SET_VMID(b, n, v)     SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), VMID, v)
+#define SET_CBNDX(b, n, v)    SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), CBNDX, v)
+#define SET_BYPASSD(b, n, v)  SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BYPASSD, v)
+#define SET_BPRCOSH(b, n, v)  SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPRCOSH, v)
+#define SET_BPRCISH(b, n, v)  SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPRCISH, v)
+#define SET_BPRCNSH(b, n, v)  SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPRCNSH, v)
+#define SET_BPSHCFG(b, n, v)  SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPSHCFG, v)
+#define SET_NSCFG(b, n, v)    SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), NSCFG, v)
+#define SET_BPMTCFG(b, n, v)  SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPMTCFG, v)
+#define SET_BPMEMTYPE(b, n, v) \
+	SET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPMEMTYPE, v)
+
+
+/* CR */
+#define SET_RPUE(b, v)		 SET_GLOBAL_FIELD(b, CR, RPUE, v)
+#define SET_RPUERE(b, v)	 SET_GLOBAL_FIELD(b, CR, RPUERE, v)
+#define SET_RPUEIE(b, v)	 SET_GLOBAL_FIELD(b, CR, RPUEIE, v)
+#define SET_DCDEE(b, v)		 SET_GLOBAL_FIELD(b, CR, DCDEE, v)
+#define SET_CLIENTPD(b, v)       SET_GLOBAL_FIELD(b, CR, CLIENTPD, v)
+#define SET_STALLD(b, v)	 SET_GLOBAL_FIELD(b, CR, STALLD, v)
+#define SET_TLBLKCRWE(b, v)      SET_GLOBAL_FIELD(b, CR, TLBLKCRWE, v)
+#define SET_CR_TLBIALLCFG(b, v)  SET_GLOBAL_FIELD(b, CR, CR_TLBIALLCFG, v)
+#define SET_TLBIVMIDCFG(b, v)    SET_GLOBAL_FIELD(b, CR, TLBIVMIDCFG, v)
+#define SET_CR_HUME(b, v)        SET_GLOBAL_FIELD(b, CR, CR_HUME, v)
+
+
+/* ESR */
+#define SET_CFG(b, v)		 SET_GLOBAL_FIELD(b, ESR, CFG, v)
+#define SET_BYPASS(b, v)	 SET_GLOBAL_FIELD(b, ESR, BYPASS, v)
+#define SET_ESR_MULTI(b, v)      SET_GLOBAL_FIELD(b, ESR, ESR_MULTI, v)
+
+
+/* ESYNR0 */
+#define SET_ESYNR0_AMID(b, v)    SET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_AMID, v)
+#define SET_ESYNR0_APID(b, v)    SET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_APID, v)
+#define SET_ESYNR0_ABID(b, v)    SET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_ABID, v)
+#define SET_ESYNR0_AVMID(b, v)   SET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_AVMID, v)
+#define SET_ESYNR0_ATID(b, v)    SET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_ATID, v)
+
+
+/* ESYNR1 */
+#define SET_ESYNR1_AMEMTYPE(b, v) \
+			SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AMEMTYPE, v)
+#define SET_ESYNR1_ASHARED(b, v)  SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ASHARED, v)
+#define SET_ESYNR1_AINNERSHARED(b, v) \
+			SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AINNERSHARED, v)
+#define SET_ESYNR1_APRIV(b, v)   SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_APRIV, v)
+#define SET_ESYNR1_APROTNS(b, v) SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_APROTNS, v)
+#define SET_ESYNR1_AINST(b, v)   SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AINST, v)
+#define SET_ESYNR1_AWRITE(b, v)  SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AWRITE, v)
+#define SET_ESYNR1_ABURST(b, v)  SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ABURST, v)
+#define SET_ESYNR1_ALEN(b, v)    SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ALEN, v)
+#define SET_ESYNR1_ASIZE(b, v)   SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ASIZE, v)
+#define SET_ESYNR1_ALOCK(b, v)   SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ALOCK, v)
+#define SET_ESYNR1_AOOO(b, v)    SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AOOO, v)
+#define SET_ESYNR1_AFULL(b, v)   SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AFULL, v)
+#define SET_ESYNR1_AC(b, v)      SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AC, v)
+#define SET_ESYNR1_DCD(b, v)     SET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_DCD, v)
+
+
+/* TESTBUSCR */
+#define SET_TBE(b, v)		 SET_GLOBAL_FIELD(b, TESTBUSCR, TBE, v)
+#define SET_SPDMBE(b, v)	 SET_GLOBAL_FIELD(b, TESTBUSCR, SPDMBE, v)
+#define SET_WGSEL(b, v)		 SET_GLOBAL_FIELD(b, TESTBUSCR, WGSEL, v)
+#define SET_TBLSEL(b, v)	 SET_GLOBAL_FIELD(b, TESTBUSCR, TBLSEL, v)
+#define SET_TBHSEL(b, v)	 SET_GLOBAL_FIELD(b, TESTBUSCR, TBHSEL, v)
+#define SET_SPDM0SEL(b, v)       SET_GLOBAL_FIELD(b, TESTBUSCR, SPDM0SEL, v)
+#define SET_SPDM1SEL(b, v)       SET_GLOBAL_FIELD(b, TESTBUSCR, SPDM1SEL, v)
+#define SET_SPDM2SEL(b, v)       SET_GLOBAL_FIELD(b, TESTBUSCR, SPDM2SEL, v)
+#define SET_SPDM3SEL(b, v)       SET_GLOBAL_FIELD(b, TESTBUSCR, SPDM3SEL, v)
+
+
+/* TLBIVMID */
+#define SET_TLBIVMID_VMID(b, v)  SET_GLOBAL_FIELD(b, TLBIVMID, TLBIVMID_VMID, v)
+
+
+/* TLBRSW */
+#define SET_TLBRSW_INDEX(b, v)   SET_GLOBAL_FIELD(b, TLBRSW, TLBRSW_INDEX, v)
+#define SET_TLBBFBS(b, v)	 SET_GLOBAL_FIELD(b, TLBRSW, TLBBFBS, v)
+
+
+/* TLBTR0 */
+#define SET_PR(b, v)		 SET_GLOBAL_FIELD(b, TLBTR0, PR, v)
+#define SET_PW(b, v)		 SET_GLOBAL_FIELD(b, TLBTR0, PW, v)
+#define SET_UR(b, v)		 SET_GLOBAL_FIELD(b, TLBTR0, UR, v)
+#define SET_UW(b, v)		 SET_GLOBAL_FIELD(b, TLBTR0, UW, v)
+#define SET_XN(b, v)		 SET_GLOBAL_FIELD(b, TLBTR0, XN, v)
+#define SET_NSDESC(b, v)	 SET_GLOBAL_FIELD(b, TLBTR0, NSDESC, v)
+#define SET_ISH(b, v)		 SET_GLOBAL_FIELD(b, TLBTR0, ISH, v)
+#define SET_SH(b, v)		 SET_GLOBAL_FIELD(b, TLBTR0, SH, v)
+#define SET_MT(b, v)		 SET_GLOBAL_FIELD(b, TLBTR0, MT, v)
+#define SET_DPSIZR(b, v)	 SET_GLOBAL_FIELD(b, TLBTR0, DPSIZR, v)
+#define SET_DPSIZC(b, v)	 SET_GLOBAL_FIELD(b, TLBTR0, DPSIZC, v)
+
+
+/* TLBTR1 */
+#define SET_TLBTR1_VMID(b, v)    SET_GLOBAL_FIELD(b, TLBTR1, TLBTR1_VMID, v)
+#define SET_TLBTR1_PA(b, v)      SET_GLOBAL_FIELD(b, TLBTR1, TLBTR1_PA, v)
+
+
+/* TLBTR2 */
+#define SET_TLBTR2_ASID(b, v)    SET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_ASID, v)
+#define SET_TLBTR2_V(b, v)       SET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_V, v)
+#define SET_TLBTR2_NSTID(b, v)   SET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_NSTID, v)
+#define SET_TLBTR2_NV(b, v)      SET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_NV, v)
+#define SET_TLBTR2_VA(b, v)      SET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_VA, v)
+
+
+/* Global Field Getters */
+/* CBACR_N */
+#define GET_RWVMID(b, n)	 GET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), RWVMID)
+#define GET_RWE(b, n)		 GET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), RWE)
+#define GET_RWGE(b, n)		 GET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), RWGE)
+#define GET_CBVMID(b, n)	 GET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), CBVMID)
+#define GET_IRPTNDX(b, n)	 GET_GLOBAL_FIELD(b, (n<<2)|(CBACR_N), IRPTNDX)
+
+
+/* M2VCBR_N */
+#define GET_VMID(b, n)       GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), VMID)
+#define GET_CBNDX(b, n)      GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), CBNDX)
+#define GET_BYPASSD(b, n)    GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BYPASSD)
+#define GET_BPRCOSH(b, n)    GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPRCOSH)
+#define GET_BPRCISH(b, n)    GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPRCISH)
+#define GET_BPRCNSH(b, n)    GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPRCNSH)
+#define GET_BPSHCFG(b, n)    GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPSHCFG)
+#define GET_NSCFG(b, n)      GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), NSCFG)
+#define GET_BPMTCFG(b, n)    GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPMTCFG)
+#define GET_BPMEMTYPE(b, n)  GET_GLOBAL_FIELD(b, (n<<2)|(M2VCBR_N), BPMEMTYPE)
+
+
+/* CR */
+#define GET_RPUE(b)		 GET_GLOBAL_FIELD(b, CR, RPUE)
+#define GET_RPUERE(b)		 GET_GLOBAL_FIELD(b, CR, RPUERE)
+#define GET_RPUEIE(b)		 GET_GLOBAL_FIELD(b, CR, RPUEIE)
+#define GET_DCDEE(b)		 GET_GLOBAL_FIELD(b, CR, DCDEE)
+#define GET_CLIENTPD(b)		 GET_GLOBAL_FIELD(b, CR, CLIENTPD)
+#define GET_STALLD(b)		 GET_GLOBAL_FIELD(b, CR, STALLD)
+#define GET_TLBLKCRWE(b)	 GET_GLOBAL_FIELD(b, CR, TLBLKCRWE)
+#define GET_CR_TLBIALLCFG(b)	 GET_GLOBAL_FIELD(b, CR, CR_TLBIALLCFG)
+#define GET_TLBIVMIDCFG(b)	 GET_GLOBAL_FIELD(b, CR, TLBIVMIDCFG)
+#define GET_CR_HUME(b)		 GET_GLOBAL_FIELD(b, CR, CR_HUME)
+
+
+/* ESR */
+#define GET_CFG(b)		 GET_GLOBAL_FIELD(b, ESR, CFG)
+#define GET_BYPASS(b)		 GET_GLOBAL_FIELD(b, ESR, BYPASS)
+#define GET_ESR_MULTI(b)	 GET_GLOBAL_FIELD(b, ESR, ESR_MULTI)
+
+
+/* ESYNR0 */
+#define GET_ESYNR0_AMID(b)	 GET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_AMID)
+#define GET_ESYNR0_APID(b)	 GET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_APID)
+#define GET_ESYNR0_ABID(b)	 GET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_ABID)
+#define GET_ESYNR0_AVMID(b)	 GET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_AVMID)
+#define GET_ESYNR0_ATID(b)	 GET_GLOBAL_FIELD(b, ESYNR0, ESYNR0_ATID)
+
+
+/* ESYNR1 */
+#define GET_ESYNR1_AMEMTYPE(b)   GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AMEMTYPE)
+#define GET_ESYNR1_ASHARED(b)    GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ASHARED)
+#define GET_ESYNR1_AINNERSHARED(b) \
+			GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AINNERSHARED)
+#define GET_ESYNR1_APRIV(b)      GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_APRIV)
+#define GET_ESYNR1_APROTNS(b)	 GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_APROTNS)
+#define GET_ESYNR1_AINST(b)	 GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AINST)
+#define GET_ESYNR1_AWRITE(b)	 GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AWRITE)
+#define GET_ESYNR1_ABURST(b)	 GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ABURST)
+#define GET_ESYNR1_ALEN(b)	 GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ALEN)
+#define GET_ESYNR1_ASIZE(b)	 GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ASIZE)
+#define GET_ESYNR1_ALOCK(b)	 GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_ALOCK)
+#define GET_ESYNR1_AOOO(b)	 GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AOOO)
+#define GET_ESYNR1_AFULL(b)	 GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AFULL)
+#define GET_ESYNR1_AC(b)	 GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_AC)
+#define GET_ESYNR1_DCD(b)	 GET_GLOBAL_FIELD(b, ESYNR1, ESYNR1_DCD)
+
+
+/* IDR */
+#define GET_NM2VCBMT(b)		 GET_GLOBAL_FIELD(b, IDR, NM2VCBMT)
+#define GET_HTW(b)		 GET_GLOBAL_FIELD(b, IDR, HTW)
+#define GET_HUM(b)		 GET_GLOBAL_FIELD(b, IDR, HUM)
+#define GET_TLBSIZE(b)		 GET_GLOBAL_FIELD(b, IDR, TLBSIZE)
+#define GET_NCB(b)		 GET_GLOBAL_FIELD(b, IDR, NCB)
+#define GET_NIRPT(b)		 GET_GLOBAL_FIELD(b, IDR, NIRPT)
+
+
+/* REV */
+#define GET_MAJOR(b)		 GET_GLOBAL_FIELD(b, REV, MAJOR)
+#define GET_MINOR(b)		 GET_GLOBAL_FIELD(b, REV, MINOR)
+
+
+/* TESTBUSCR */
+#define GET_TBE(b)		 GET_GLOBAL_FIELD(b, TESTBUSCR, TBE)
+#define GET_SPDMBE(b)		 GET_GLOBAL_FIELD(b, TESTBUSCR, SPDMBE)
+#define GET_WGSEL(b)		 GET_GLOBAL_FIELD(b, TESTBUSCR, WGSEL)
+#define GET_TBLSEL(b)		 GET_GLOBAL_FIELD(b, TESTBUSCR, TBLSEL)
+#define GET_TBHSEL(b)		 GET_GLOBAL_FIELD(b, TESTBUSCR, TBHSEL)
+#define GET_SPDM0SEL(b)		 GET_GLOBAL_FIELD(b, TESTBUSCR, SPDM0SEL)
+#define GET_SPDM1SEL(b)		 GET_GLOBAL_FIELD(b, TESTBUSCR, SPDM1SEL)
+#define GET_SPDM2SEL(b)		 GET_GLOBAL_FIELD(b, TESTBUSCR, SPDM2SEL)
+#define GET_SPDM3SEL(b)		 GET_GLOBAL_FIELD(b, TESTBUSCR, SPDM3SEL)
+
+
+/* TLBIVMID */
+#define GET_TLBIVMID_VMID(b)	 GET_GLOBAL_FIELD(b, TLBIVMID, TLBIVMID_VMID)
+
+
+/* TLBTR0 */
+#define GET_PR(b)		 GET_GLOBAL_FIELD(b, TLBTR0, PR)
+#define GET_PW(b)		 GET_GLOBAL_FIELD(b, TLBTR0, PW)
+#define GET_UR(b)		 GET_GLOBAL_FIELD(b, TLBTR0, UR)
+#define GET_UW(b)		 GET_GLOBAL_FIELD(b, TLBTR0, UW)
+#define GET_XN(b)		 GET_GLOBAL_FIELD(b, TLBTR0, XN)
+#define GET_NSDESC(b)		 GET_GLOBAL_FIELD(b, TLBTR0, NSDESC)
+#define GET_ISH(b)		 GET_GLOBAL_FIELD(b, TLBTR0, ISH)
+#define GET_SH(b)		 GET_GLOBAL_FIELD(b, TLBTR0, SH)
+#define GET_MT(b)		 GET_GLOBAL_FIELD(b, TLBTR0, MT)
+#define GET_DPSIZR(b)		 GET_GLOBAL_FIELD(b, TLBTR0, DPSIZR)
+#define GET_DPSIZC(b)		 GET_GLOBAL_FIELD(b, TLBTR0, DPSIZC)
+
+
+/* TLBTR1 */
+#define GET_TLBTR1_VMID(b)	 GET_GLOBAL_FIELD(b, TLBTR1, TLBTR1_VMID)
+#define GET_TLBTR1_PA(b)	 GET_GLOBAL_FIELD(b, TLBTR1, TLBTR1_PA)
+
+
+/* TLBTR2 */
+#define GET_TLBTR2_ASID(b)	 GET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_ASID)
+#define GET_TLBTR2_V(b)		 GET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_V)
+#define GET_TLBTR2_NSTID(b)	 GET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_NSTID)
+#define GET_TLBTR2_NV(b)	 GET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_NV)
+#define GET_TLBTR2_VA(b)	 GET_GLOBAL_FIELD(b, TLBTR2, TLBTR2_VA)
+
+
+/* Context Register setters / getters */
+/* Context Register setters */
+/* ACTLR */
+#define SET_CFERE(b, c, v)	 SET_CONTEXT_FIELD(b, c, ACTLR, CFERE, v)
+#define SET_CFEIE(b, c, v)	 SET_CONTEXT_FIELD(b, c, ACTLR, CFEIE, v)
+#define SET_PTSHCFG(b, c, v)	 SET_CONTEXT_FIELD(b, c, ACTLR, PTSHCFG, v)
+#define SET_RCOSH(b, c, v)	 SET_CONTEXT_FIELD(b, c, ACTLR, RCOSH, v)
+#define SET_RCISH(b, c, v)	 SET_CONTEXT_FIELD(b, c, ACTLR, RCISH, v)
+#define SET_RCNSH(b, c, v)	 SET_CONTEXT_FIELD(b, c, ACTLR, RCNSH, v)
+#define SET_PRIVCFG(b, c, v)	 SET_CONTEXT_FIELD(b, c, ACTLR, PRIVCFG, v)
+#define SET_DNA(b, c, v)	 SET_CONTEXT_FIELD(b, c, ACTLR, DNA, v)
+#define SET_DNLV2PA(b, c, v)	 SET_CONTEXT_FIELD(b, c, ACTLR, DNLV2PA, v)
+#define SET_TLBMCFG(b, c, v)	 SET_CONTEXT_FIELD(b, c, ACTLR, TLBMCFG, v)
+#define SET_CFCFG(b, c, v)	 SET_CONTEXT_FIELD(b, c, ACTLR, CFCFG, v)
+#define SET_TIPCF(b, c, v)	 SET_CONTEXT_FIELD(b, c, ACTLR, TIPCF, v)
+#define SET_V2PCFG(b, c, v)	 SET_CONTEXT_FIELD(b, c, ACTLR, V2PCFG, v)
+#define SET_HUME(b, c, v)	 SET_CONTEXT_FIELD(b, c, ACTLR, HUME, v)
+#define SET_PTMTCFG(b, c, v)	 SET_CONTEXT_FIELD(b, c, ACTLR, PTMTCFG, v)
+#define SET_PTMEMTYPE(b, c, v)	 SET_CONTEXT_FIELD(b, c, ACTLR, PTMEMTYPE, v)
+
+
+/* BFBCR */
+#define SET_BFBDFE(b, c, v)	 SET_CONTEXT_FIELD(b, c, BFBCR, BFBDFE, v)
+#define SET_BFBSFE(b, c, v)	 SET_CONTEXT_FIELD(b, c, BFBCR, BFBSFE, v)
+#define SET_SFVS(b, c, v)	 SET_CONTEXT_FIELD(b, c, BFBCR, SFVS, v)
+#define SET_FLVIC(b, c, v)	 SET_CONTEXT_FIELD(b, c, BFBCR, FLVIC, v)
+#define SET_SLVIC(b, c, v)	 SET_CONTEXT_FIELD(b, c, BFBCR, SLVIC, v)
+
+
+/* CONTEXTIDR */
+#define SET_CONTEXTIDR_ASID(b, c, v)   \
+		SET_CONTEXT_FIELD(b, c, CONTEXTIDR, CONTEXTIDR_ASID, v)
+#define SET_CONTEXTIDR_PROCID(b, c, v) \
+		SET_CONTEXT_FIELD(b, c, CONTEXTIDR, PROCID, v)
+
+
+/* FSR */
+#define SET_TF(b, c, v)		 SET_CONTEXT_FIELD(b, c, FSR, TF, v)
+#define SET_AFF(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSR, AFF, v)
+#define SET_APF(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSR, APF, v)
+#define SET_TLBMF(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSR, TLBMF, v)
+#define SET_HTWDEEF(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSR, HTWDEEF, v)
+#define SET_HTWSEEF(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSR, HTWSEEF, v)
+#define SET_MHF(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSR, MHF, v)
+#define SET_SL(b, c, v)		 SET_CONTEXT_FIELD(b, c, FSR, SL, v)
+#define SET_SS(b, c, v)		 SET_CONTEXT_FIELD(b, c, FSR, SS, v)
+#define SET_MULTI(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSR, MULTI, v)
+
+
+/* FSYNR0 */
+#define SET_AMID(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSYNR0, AMID, v)
+#define SET_APID(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSYNR0, APID, v)
+#define SET_ABID(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSYNR0, ABID, v)
+#define SET_ATID(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSYNR0, ATID, v)
+
+
+/* FSYNR1 */
+#define SET_AMEMTYPE(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSYNR1, AMEMTYPE, v)
+#define SET_ASHARED(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSYNR1, ASHARED, v)
+#define SET_AINNERSHARED(b, c, v)  \
+				SET_CONTEXT_FIELD(b, c, FSYNR1, AINNERSHARED, v)
+#define SET_APRIV(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSYNR1, APRIV, v)
+#define SET_APROTNS(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSYNR1, APROTNS, v)
+#define SET_AINST(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSYNR1, AINST, v)
+#define SET_AWRITE(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSYNR1, AWRITE, v)
+#define SET_ABURST(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSYNR1, ABURST, v)
+#define SET_ALEN(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSYNR1, ALEN, v)
+#define SET_FSYNR1_ASIZE(b, c, v) \
+				SET_CONTEXT_FIELD(b, c, FSYNR1, FSYNR1_ASIZE, v)
+#define SET_ALOCK(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSYNR1, ALOCK, v)
+#define SET_AFULL(b, c, v)	 SET_CONTEXT_FIELD(b, c, FSYNR1, AFULL, v)
+
+
+/* NMRR */
+#define SET_ICPC0(b, c, v)	 SET_CONTEXT_FIELD(b, c, NMRR, ICPC0, v)
+#define SET_ICPC1(b, c, v)	 SET_CONTEXT_FIELD(b, c, NMRR, ICPC1, v)
+#define SET_ICPC2(b, c, v)	 SET_CONTEXT_FIELD(b, c, NMRR, ICPC2, v)
+#define SET_ICPC3(b, c, v)	 SET_CONTEXT_FIELD(b, c, NMRR, ICPC3, v)
+#define SET_ICPC4(b, c, v)	 SET_CONTEXT_FIELD(b, c, NMRR, ICPC4, v)
+#define SET_ICPC5(b, c, v)	 SET_CONTEXT_FIELD(b, c, NMRR, ICPC5, v)
+#define SET_ICPC6(b, c, v)	 SET_CONTEXT_FIELD(b, c, NMRR, ICPC6, v)
+#define SET_ICPC7(b, c, v)	 SET_CONTEXT_FIELD(b, c, NMRR, ICPC7, v)
+#define SET_OCPC0(b, c, v)	 SET_CONTEXT_FIELD(b, c, NMRR, OCPC0, v)
+#define SET_OCPC1(b, c, v)	 SET_CONTEXT_FIELD(b, c, NMRR, OCPC1, v)
+#define SET_OCPC2(b, c, v)	 SET_CONTEXT_FIELD(b, c, NMRR, OCPC2, v)
+#define SET_OCPC3(b, c, v)	 SET_CONTEXT_FIELD(b, c, NMRR, OCPC3, v)
+#define SET_OCPC4(b, c, v)	 SET_CONTEXT_FIELD(b, c, NMRR, OCPC4, v)
+#define SET_OCPC5(b, c, v)	 SET_CONTEXT_FIELD(b, c, NMRR, OCPC5, v)
+#define SET_OCPC6(b, c, v)	 SET_CONTEXT_FIELD(b, c, NMRR, OCPC6, v)
+#define SET_OCPC7(b, c, v)	 SET_CONTEXT_FIELD(b, c, NMRR, OCPC7, v)
+
+
+/* PAR */
+#define SET_FAULT(b, c, v)	 SET_CONTEXT_FIELD(b, c, PAR, FAULT, v)
+
+#define SET_FAULT_TF(b, c, v)	 SET_CONTEXT_FIELD(b, c, PAR, FAULT_TF, v)
+#define SET_FAULT_AFF(b, c, v)	 SET_CONTEXT_FIELD(b, c, PAR, FAULT_AFF, v)
+#define SET_FAULT_APF(b, c, v)	 SET_CONTEXT_FIELD(b, c, PAR, FAULT_APF, v)
+#define SET_FAULT_TLBMF(b, c, v) SET_CONTEXT_FIELD(b, c, PAR, FAULT_TLBMF, v)
+#define SET_FAULT_HTWDEEF(b, c, v) \
+				SET_CONTEXT_FIELD(b, c, PAR, FAULT_HTWDEEF, v)
+#define SET_FAULT_HTWSEEF(b, c, v) \
+				SET_CONTEXT_FIELD(b, c, PAR, FAULT_HTWSEEF, v)
+#define SET_FAULT_MHF(b, c, v)	 SET_CONTEXT_FIELD(b, c, PAR, FAULT_MHF, v)
+#define SET_FAULT_SL(b, c, v)	 SET_CONTEXT_FIELD(b, c, PAR, FAULT_SL, v)
+#define SET_FAULT_SS(b, c, v)	 SET_CONTEXT_FIELD(b, c, PAR, FAULT_SS, v)
+
+#define SET_NOFAULT_SS(b, c, v)	 SET_CONTEXT_FIELD(b, c, PAR, NOFAULT_SS, v)
+#define SET_NOFAULT_MT(b, c, v)	 SET_CONTEXT_FIELD(b, c, PAR, NOFAULT_MT, v)
+#define SET_NOFAULT_SH(b, c, v)	 SET_CONTEXT_FIELD(b, c, PAR, NOFAULT_SH, v)
+#define SET_NOFAULT_NS(b, c, v)	 SET_CONTEXT_FIELD(b, c, PAR, NOFAULT_NS, v)
+#define SET_NOFAULT_NOS(b, c, v) SET_CONTEXT_FIELD(b, c, PAR, NOFAULT_NOS, v)
+#define SET_NPFAULT_PA(b, c, v)	 SET_CONTEXT_FIELD(b, c, PAR, NPFAULT_PA, v)
+
+
+/* PRRR */
+#define SET_MTC0(b, c, v)	 SET_CONTEXT_FIELD(b, c, PRRR, MTC0, v)
+#define SET_MTC1(b, c, v)	 SET_CONTEXT_FIELD(b, c, PRRR, MTC1, v)
+#define SET_MTC2(b, c, v)	 SET_CONTEXT_FIELD(b, c, PRRR, MTC2, v)
+#define SET_MTC3(b, c, v)	 SET_CONTEXT_FIELD(b, c, PRRR, MTC3, v)
+#define SET_MTC4(b, c, v)	 SET_CONTEXT_FIELD(b, c, PRRR, MTC4, v)
+#define SET_MTC5(b, c, v)	 SET_CONTEXT_FIELD(b, c, PRRR, MTC5, v)
+#define SET_MTC6(b, c, v)	 SET_CONTEXT_FIELD(b, c, PRRR, MTC6, v)
+#define SET_MTC7(b, c, v)	 SET_CONTEXT_FIELD(b, c, PRRR, MTC7, v)
+#define SET_SHDSH0(b, c, v)	 SET_CONTEXT_FIELD(b, c, PRRR, SHDSH0, v)
+#define SET_SHDSH1(b, c, v)	 SET_CONTEXT_FIELD(b, c, PRRR, SHDSH1, v)
+#define SET_SHNMSH0(b, c, v)	 SET_CONTEXT_FIELD(b, c, PRRR, SHNMSH0, v)
+#define SET_SHNMSH1(b, c, v)     SET_CONTEXT_FIELD(b, c, PRRR, SHNMSH1, v)
+#define SET_NOS0(b, c, v)	 SET_CONTEXT_FIELD(b, c, PRRR, NOS0, v)
+#define SET_NOS1(b, c, v)	 SET_CONTEXT_FIELD(b, c, PRRR, NOS1, v)
+#define SET_NOS2(b, c, v)	 SET_CONTEXT_FIELD(b, c, PRRR, NOS2, v)
+#define SET_NOS3(b, c, v)	 SET_CONTEXT_FIELD(b, c, PRRR, NOS3, v)
+#define SET_NOS4(b, c, v)	 SET_CONTEXT_FIELD(b, c, PRRR, NOS4, v)
+#define SET_NOS5(b, c, v)	 SET_CONTEXT_FIELD(b, c, PRRR, NOS5, v)
+#define SET_NOS6(b, c, v)	 SET_CONTEXT_FIELD(b, c, PRRR, NOS6, v)
+#define SET_NOS7(b, c, v)	 SET_CONTEXT_FIELD(b, c, PRRR, NOS7, v)
+
+
+/* RESUME */
+#define SET_TNR(b, c, v)	 SET_CONTEXT_FIELD(b, c, RESUME, TNR, v)
+
+
+/* SCTLR */
+#define SET_M(b, c, v)		 SET_CONTEXT_FIELD(b, c, SCTLR, M, v)
+#define SET_TRE(b, c, v)	 SET_CONTEXT_FIELD(b, c, SCTLR, TRE, v)
+#define SET_AFE(b, c, v)	 SET_CONTEXT_FIELD(b, c, SCTLR, AFE, v)
+#define SET_HAF(b, c, v)	 SET_CONTEXT_FIELD(b, c, SCTLR, HAF, v)
+#define SET_BE(b, c, v)		 SET_CONTEXT_FIELD(b, c, SCTLR, BE, v)
+#define SET_AFFD(b, c, v)	 SET_CONTEXT_FIELD(b, c, SCTLR, AFFD, v)
+
+
+/* TLBLKCR */
+#define SET_LKE(b, c, v)	   SET_CONTEXT_FIELD(b, c, TLBLKCR, LKE, v)
+#define SET_TLBLKCR_TLBIALLCFG(b, c, v) \
+			SET_CONTEXT_FIELD(b, c, TLBLKCR, TLBLCKR_TLBIALLCFG, v)
+#define SET_TLBIASIDCFG(b, c, v) \
+			SET_CONTEXT_FIELD(b, c, TLBLKCR, TLBIASIDCFG, v)
+#define SET_TLBIVAACFG(b, c, v)	SET_CONTEXT_FIELD(b, c, TLBLKCR, TLBIVAACFG, v)
+#define SET_FLOOR(b, c, v)	SET_CONTEXT_FIELD(b, c, TLBLKCR, FLOOR, v)
+#define SET_VICTIM(b, c, v)	SET_CONTEXT_FIELD(b, c, TLBLKCR, VICTIM, v)
+
+
+/* TTBCR */
+#define SET_N(b, c, v)	         SET_CONTEXT_FIELD(b, c, TTBCR, N, v)
+#define SET_PD0(b, c, v)	 SET_CONTEXT_FIELD(b, c, TTBCR, PD0, v)
+#define SET_PD1(b, c, v)	 SET_CONTEXT_FIELD(b, c, TTBCR, PD1, v)
+
+
+/* TTBR0 */
+#define SET_TTBR0_IRGNH(b, c, v) SET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_IRGNH, v)
+#define SET_TTBR0_SH(b, c, v)	 SET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_SH, v)
+#define SET_TTBR0_ORGN(b, c, v)	 SET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_ORGN, v)
+#define SET_TTBR0_NOS(b, c, v)	 SET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_NOS, v)
+#define SET_TTBR0_IRGNL(b, c, v) SET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_IRGNL, v)
+#define SET_TTBR0_PA(b, c, v)	 SET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_PA, v)
+
+
+/* TTBR1 */
+#define SET_TTBR1_IRGNH(b, c, v) SET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_IRGNH, v)
+#define SET_TTBR1_SH(b, c, v)	 SET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_SH, v)
+#define SET_TTBR1_ORGN(b, c, v)	 SET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_ORGN, v)
+#define SET_TTBR1_NOS(b, c, v)	 SET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_NOS, v)
+#define SET_TTBR1_IRGNL(b, c, v) SET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_IRGNL, v)
+#define SET_TTBR1_PA(b, c, v)	 SET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_PA, v)
+
+
+/* V2PSR */
+#define SET_HIT(b, c, v)	 SET_CONTEXT_FIELD(b, c, V2PSR, HIT, v)
+#define SET_INDEX(b, c, v)	 SET_CONTEXT_FIELD(b, c, V2PSR, INDEX, v)
+
+
+/* Context Register getters */
+/* ACTLR */
+#define GET_CFERE(b, c)	        GET_CONTEXT_FIELD(b, c, ACTLR, CFERE)
+#define GET_CFEIE(b, c)	        GET_CONTEXT_FIELD(b, c, ACTLR, CFEIE)
+#define GET_PTSHCFG(b, c)       GET_CONTEXT_FIELD(b, c, ACTLR, PTSHCFG)
+#define GET_RCOSH(b, c)	        GET_CONTEXT_FIELD(b, c, ACTLR, RCOSH)
+#define GET_RCISH(b, c)	        GET_CONTEXT_FIELD(b, c, ACTLR, RCISH)
+#define GET_RCNSH(b, c)	        GET_CONTEXT_FIELD(b, c, ACTLR, RCNSH)
+#define GET_PRIVCFG(b, c)       GET_CONTEXT_FIELD(b, c, ACTLR, PRIVCFG)
+#define GET_DNA(b, c)	        GET_CONTEXT_FIELD(b, c, ACTLR, DNA)
+#define GET_DNLV2PA(b, c)       GET_CONTEXT_FIELD(b, c, ACTLR, DNLV2PA)
+#define GET_TLBMCFG(b, c)       GET_CONTEXT_FIELD(b, c, ACTLR, TLBMCFG)
+#define GET_CFCFG(b, c)	        GET_CONTEXT_FIELD(b, c, ACTLR, CFCFG)
+#define GET_TIPCF(b, c)	        GET_CONTEXT_FIELD(b, c, ACTLR, TIPCF)
+#define GET_V2PCFG(b, c)        GET_CONTEXT_FIELD(b, c, ACTLR, V2PCFG)
+#define GET_HUME(b, c)	        GET_CONTEXT_FIELD(b, c, ACTLR, HUME)
+#define GET_PTMTCFG(b, c)       GET_CONTEXT_FIELD(b, c, ACTLR, PTMTCFG)
+#define GET_PTMEMTYPE(b, c)     GET_CONTEXT_FIELD(b, c, ACTLR, PTMEMTYPE)
+
+/* BFBCR */
+#define GET_BFBDFE(b, c)	GET_CONTEXT_FIELD(b, c, BFBCR, BFBDFE)
+#define GET_BFBSFE(b, c)	GET_CONTEXT_FIELD(b, c, BFBCR, BFBSFE)
+#define GET_SFVS(b, c)		GET_CONTEXT_FIELD(b, c, BFBCR, SFVS)
+#define GET_FLVIC(b, c)		GET_CONTEXT_FIELD(b, c, BFBCR, FLVIC)
+#define GET_SLVIC(b, c)		GET_CONTEXT_FIELD(b, c, BFBCR, SLVIC)
+
+
+/* CONTEXTIDR */
+#define GET_CONTEXTIDR_ASID(b, c) \
+			GET_CONTEXT_FIELD(b, c, CONTEXTIDR, CONTEXTIDR_ASID)
+#define GET_CONTEXTIDR_PROCID(b, c) GET_CONTEXT_FIELD(b, c, CONTEXTIDR, PROCID)
+
+
+/* FSR */
+#define GET_TF(b, c)		GET_CONTEXT_FIELD(b, c, FSR, TF)
+#define GET_AFF(b, c)		GET_CONTEXT_FIELD(b, c, FSR, AFF)
+#define GET_APF(b, c)		GET_CONTEXT_FIELD(b, c, FSR, APF)
+#define GET_TLBMF(b, c)		GET_CONTEXT_FIELD(b, c, FSR, TLBMF)
+#define GET_HTWDEEF(b, c)	GET_CONTEXT_FIELD(b, c, FSR, HTWDEEF)
+#define GET_HTWSEEF(b, c)	GET_CONTEXT_FIELD(b, c, FSR, HTWSEEF)
+#define GET_MHF(b, c)		GET_CONTEXT_FIELD(b, c, FSR, MHF)
+#define GET_SL(b, c)		GET_CONTEXT_FIELD(b, c, FSR, SL)
+#define GET_SS(b, c)		GET_CONTEXT_FIELD(b, c, FSR, SS)
+#define GET_MULTI(b, c)		GET_CONTEXT_FIELD(b, c, FSR, MULTI)
+
+
+/* FSYNR0 */
+#define GET_AMID(b, c)		GET_CONTEXT_FIELD(b, c, FSYNR0, AMID)
+#define GET_APID(b, c)		GET_CONTEXT_FIELD(b, c, FSYNR0, APID)
+#define GET_ABID(b, c)		GET_CONTEXT_FIELD(b, c, FSYNR0, ABID)
+#define GET_ATID(b, c)		GET_CONTEXT_FIELD(b, c, FSYNR0, ATID)
+
+
+/* FSYNR1 */
+#define GET_AMEMTYPE(b, c)	GET_CONTEXT_FIELD(b, c, FSYNR1, AMEMTYPE)
+#define GET_ASHARED(b, c)	GET_CONTEXT_FIELD(b, c, FSYNR1, ASHARED)
+#define GET_AINNERSHARED(b, c)  GET_CONTEXT_FIELD(b, c, FSYNR1, AINNERSHARED)
+#define GET_APRIV(b, c)		GET_CONTEXT_FIELD(b, c, FSYNR1, APRIV)
+#define GET_APROTNS(b, c)	GET_CONTEXT_FIELD(b, c, FSYNR1, APROTNS)
+#define GET_AINST(b, c)		GET_CONTEXT_FIELD(b, c, FSYNR1, AINST)
+#define GET_AWRITE(b, c)	GET_CONTEXT_FIELD(b, c, FSYNR1, AWRITE)
+#define GET_ABURST(b, c)	GET_CONTEXT_FIELD(b, c, FSYNR1, ABURST)
+#define GET_ALEN(b, c)		GET_CONTEXT_FIELD(b, c, FSYNR1, ALEN)
+#define GET_FSYNR1_ASIZE(b, c)	GET_CONTEXT_FIELD(b, c, FSYNR1, FSYNR1_ASIZE)
+#define GET_ALOCK(b, c)		GET_CONTEXT_FIELD(b, c, FSYNR1, ALOCK)
+#define GET_AFULL(b, c)		GET_CONTEXT_FIELD(b, c, FSYNR1, AFULL)
+
+
+/* NMRR */
+#define GET_ICPC0(b, c)		GET_CONTEXT_FIELD(b, c, NMRR, ICPC0)
+#define GET_ICPC1(b, c)		GET_CONTEXT_FIELD(b, c, NMRR, ICPC1)
+#define GET_ICPC2(b, c)		GET_CONTEXT_FIELD(b, c, NMRR, ICPC2)
+#define GET_ICPC3(b, c)		GET_CONTEXT_FIELD(b, c, NMRR, ICPC3)
+#define GET_ICPC4(b, c)		GET_CONTEXT_FIELD(b, c, NMRR, ICPC4)
+#define GET_ICPC5(b, c)		GET_CONTEXT_FIELD(b, c, NMRR, ICPC5)
+#define GET_ICPC6(b, c)		GET_CONTEXT_FIELD(b, c, NMRR, ICPC6)
+#define GET_ICPC7(b, c)		GET_CONTEXT_FIELD(b, c, NMRR, ICPC7)
+#define GET_OCPC0(b, c)		GET_CONTEXT_FIELD(b, c, NMRR, OCPC0)
+#define GET_OCPC1(b, c)		GET_CONTEXT_FIELD(b, c, NMRR, OCPC1)
+#define GET_OCPC2(b, c)		GET_CONTEXT_FIELD(b, c, NMRR, OCPC2)
+#define GET_OCPC3(b, c)		GET_CONTEXT_FIELD(b, c, NMRR, OCPC3)
+#define GET_OCPC4(b, c)		GET_CONTEXT_FIELD(b, c, NMRR, OCPC4)
+#define GET_OCPC5(b, c)		GET_CONTEXT_FIELD(b, c, NMRR, OCPC5)
+#define GET_OCPC6(b, c)		GET_CONTEXT_FIELD(b, c, NMRR, OCPC6)
+#define GET_OCPC7(b, c)		GET_CONTEXT_FIELD(b, c, NMRR, OCPC7)
+#define NMRR_ICP(nmrr, n)	(((nmrr) & (3 << ((n) * 2))) >> ((n) * 2))
+#define NMRR_OCP(nmrr, n)	(((nmrr) & (3 << ((n) * 2 + 16))) >> \
+								((n) * 2 + 16))
+
+/* PAR */
+#define GET_FAULT(b, c)		GET_CONTEXT_FIELD(b, c, PAR, FAULT)
+
+#define GET_FAULT_TF(b, c)	GET_CONTEXT_FIELD(b, c, PAR, FAULT_TF)
+#define GET_FAULT_AFF(b, c)	GET_CONTEXT_FIELD(b, c, PAR, FAULT_AFF)
+#define GET_FAULT_APF(b, c)	GET_CONTEXT_FIELD(b, c, PAR, FAULT_APF)
+#define GET_FAULT_TLBMF(b, c)   GET_CONTEXT_FIELD(b, c, PAR, FAULT_TLBMF)
+#define GET_FAULT_HTWDEEF(b, c) GET_CONTEXT_FIELD(b, c, PAR, FAULT_HTWDEEF)
+#define GET_FAULT_HTWSEEF(b, c) GET_CONTEXT_FIELD(b, c, PAR, FAULT_HTWSEEF)
+#define GET_FAULT_MHF(b, c)	GET_CONTEXT_FIELD(b, c, PAR, FAULT_MHF)
+#define GET_FAULT_SL(b, c)	GET_CONTEXT_FIELD(b, c, PAR, FAULT_SL)
+#define GET_FAULT_SS(b, c)	GET_CONTEXT_FIELD(b, c, PAR, FAULT_SS)
+
+#define GET_NOFAULT_SS(b, c)	GET_CONTEXT_FIELD(b, c, PAR, PAR_NOFAULT_SS)
+#define GET_NOFAULT_MT(b, c)	GET_CONTEXT_FIELD(b, c, PAR, PAR_NOFAULT_MT)
+#define GET_NOFAULT_SH(b, c)	GET_CONTEXT_FIELD(b, c, PAR, PAR_NOFAULT_SH)
+#define GET_NOFAULT_NS(b, c)	GET_CONTEXT_FIELD(b, c, PAR, PAR_NOFAULT_NS)
+#define GET_NOFAULT_NOS(b, c)   GET_CONTEXT_FIELD(b, c, PAR, PAR_NOFAULT_NOS)
+#define GET_NPFAULT_PA(b, c)	GET_CONTEXT_FIELD(b, c, PAR, PAR_NPFAULT_PA)
+
+
+/* PRRR */
+#define GET_MTC0(b, c)		GET_CONTEXT_FIELD(b, c, PRRR, MTC0)
+#define GET_MTC1(b, c)		GET_CONTEXT_FIELD(b, c, PRRR, MTC1)
+#define GET_MTC2(b, c)		GET_CONTEXT_FIELD(b, c, PRRR, MTC2)
+#define GET_MTC3(b, c)		GET_CONTEXT_FIELD(b, c, PRRR, MTC3)
+#define GET_MTC4(b, c)		GET_CONTEXT_FIELD(b, c, PRRR, MTC4)
+#define GET_MTC5(b, c)		GET_CONTEXT_FIELD(b, c, PRRR, MTC5)
+#define GET_MTC6(b, c)		GET_CONTEXT_FIELD(b, c, PRRR, MTC6)
+#define GET_MTC7(b, c)		GET_CONTEXT_FIELD(b, c, PRRR, MTC7)
+#define GET_SHDSH0(b, c)	GET_CONTEXT_FIELD(b, c, PRRR, SHDSH0)
+#define GET_SHDSH1(b, c)	GET_CONTEXT_FIELD(b, c, PRRR, SHDSH1)
+#define GET_SHNMSH0(b, c)	GET_CONTEXT_FIELD(b, c, PRRR, SHNMSH0)
+#define GET_SHNMSH1(b, c)	GET_CONTEXT_FIELD(b, c, PRRR, SHNMSH1)
+#define GET_NOS0(b, c)		GET_CONTEXT_FIELD(b, c, PRRR, NOS0)
+#define GET_NOS1(b, c)		GET_CONTEXT_FIELD(b, c, PRRR, NOS1)
+#define GET_NOS2(b, c)		GET_CONTEXT_FIELD(b, c, PRRR, NOS2)
+#define GET_NOS3(b, c)		GET_CONTEXT_FIELD(b, c, PRRR, NOS3)
+#define GET_NOS4(b, c)		GET_CONTEXT_FIELD(b, c, PRRR, NOS4)
+#define GET_NOS5(b, c)		GET_CONTEXT_FIELD(b, c, PRRR, NOS5)
+#define GET_NOS6(b, c)		GET_CONTEXT_FIELD(b, c, PRRR, NOS6)
+#define GET_NOS7(b, c)		GET_CONTEXT_FIELD(b, c, PRRR, NOS7)
+#define PRRR_NOS(prrr, n)	 ((prrr) & (1 << ((n) + 24)) ? 1 : 0)
+#define PRRR_MT(prrr, n)	 ((((prrr) & (3 << ((n) * 2))) >> ((n) * 2)))
+
+
+/* RESUME */
+#define GET_TNR(b, c)		GET_CONTEXT_FIELD(b, c, RESUME, TNR)
+
+
+/* SCTLR */
+#define GET_M(b, c)		GET_CONTEXT_FIELD(b, c, SCTLR, M)
+#define GET_TRE(b, c)		GET_CONTEXT_FIELD(b, c, SCTLR, TRE)
+#define GET_AFE(b, c)		GET_CONTEXT_FIELD(b, c, SCTLR, AFE)
+#define GET_HAF(b, c)		GET_CONTEXT_FIELD(b, c, SCTLR, HAF)
+#define GET_BE(b, c)		GET_CONTEXT_FIELD(b, c, SCTLR, BE)
+#define GET_AFFD(b, c)		GET_CONTEXT_FIELD(b, c, SCTLR, AFFD)
+
+
+/* TLBLKCR */
+#define GET_LKE(b, c)		GET_CONTEXT_FIELD(b, c, TLBLKCR, LKE)
+#define GET_TLBLCKR_TLBIALLCFG(b, c) \
+			GET_CONTEXT_FIELD(b, c, TLBLKCR, TLBLCKR_TLBIALLCFG)
+#define GET_TLBIASIDCFG(b, c)   GET_CONTEXT_FIELD(b, c, TLBLKCR, TLBIASIDCFG)
+#define GET_TLBIVAACFG(b, c)	GET_CONTEXT_FIELD(b, c, TLBLKCR, TLBIVAACFG)
+#define GET_FLOOR(b, c)		GET_CONTEXT_FIELD(b, c, TLBLKCR, FLOOR)
+#define GET_VICTIM(b, c)	GET_CONTEXT_FIELD(b, c, TLBLKCR, VICTIM)
+
+
+/* TTBCR */
+#define GET_N(b, c)		GET_CONTEXT_FIELD(b, c, TTBCR, N)
+#define GET_PD0(b, c)		GET_CONTEXT_FIELD(b, c, TTBCR, PD0)
+#define GET_PD1(b, c)		GET_CONTEXT_FIELD(b, c, TTBCR, PD1)
+
+
+/* TTBR0 */
+#define GET_TTBR0_IRGNH(b, c)	GET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_IRGNH)
+#define GET_TTBR0_SH(b, c)	GET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_SH)
+#define GET_TTBR0_ORGN(b, c)	GET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_ORGN)
+#define GET_TTBR0_NOS(b, c)	GET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_NOS)
+#define GET_TTBR0_IRGNL(b, c)	GET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_IRGNL)
+#define GET_TTBR0_PA(b, c)	GET_CONTEXT_FIELD(b, c, TTBR0, TTBR0_PA)
+
+
+/* TTBR1 */
+#define GET_TTBR1_IRGNH(b, c)	GET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_IRGNH)
+#define GET_TTBR1_SH(b, c)	GET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_SH)
+#define GET_TTBR1_ORGN(b, c)	GET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_ORGN)
+#define GET_TTBR1_NOS(b, c)	GET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_NOS)
+#define GET_TTBR1_IRGNL(b, c)	GET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_IRGNL)
+#define GET_TTBR1_PA(b, c)	GET_CONTEXT_FIELD(b, c, TTBR1, TTBR1_PA)
+
+
+/* V2PSR */
+#define GET_HIT(b, c)		GET_CONTEXT_FIELD(b, c, V2PSR, HIT)
+#define GET_INDEX(b, c)		GET_CONTEXT_FIELD(b, c, V2PSR, INDEX)
+
+
+/* Global Registers */
+#define M2VCBR_N	(0xFF000)
+#define CBACR_N		(0xFF800)
+#define TLBRSW		(0xFFE00)
+#define TLBTR0		(0xFFE80)
+#define TLBTR1		(0xFFE84)
+#define TLBTR2		(0xFFE88)
+#define TESTBUSCR	(0xFFE8C)
+#define GLOBAL_TLBIALL	(0xFFF00)
+#define TLBIVMID	(0xFFF04)
+#define CR		(0xFFF80)
+#define EAR		(0xFFF84)
+#define ESR		(0xFFF88)
+#define ESRRESTORE	(0xFFF8C)
+#define ESYNR0		(0xFFF90)
+#define ESYNR1		(0xFFF94)
+#define REV		(0xFFFF4)
+#define IDR		(0xFFFF8)
+#define RPU_ACR		(0xFFFFC)
+
+
+/* Context Bank Registers */
+#define SCTLR		(0x000)
+#define ACTLR		(0x004)
+#define CONTEXTIDR	(0x008)
+#define TTBR0		(0x010)
+#define TTBR1		(0x014)
+#define TTBCR		(0x018)
+#define PAR		(0x01C)
+#define FSR		(0x020)
+#define FSRRESTORE	(0x024)
+#define FAR		(0x028)
+#define FSYNR0		(0x02C)
+#define FSYNR1		(0x030)
+#define PRRR		(0x034)
+#define NMRR		(0x038)
+#define TLBLCKR		(0x03C)
+#define V2PSR		(0x040)
+#define TLBFLPTER	(0x044)
+#define TLBSLPTER	(0x048)
+#define BFBCR		(0x04C)
+#define CTX_TLBIALL	(0x800)
+#define TLBIASID	(0x804)
+#define TLBIVA		(0x808)
+#define TLBIVAA		(0x80C)
+#define V2PPR		(0x810)
+#define V2PPW		(0x814)
+#define V2PUR		(0x818)
+#define V2PUW		(0x81C)
+#define RESUME		(0x820)
+
+
+/* Global Register Fields */
+/* CBACRn */
+#define RWVMID        (RWVMID_MASK       << RWVMID_SHIFT)
+#define RWE           (RWE_MASK          << RWE_SHIFT)
+#define RWGE          (RWGE_MASK         << RWGE_SHIFT)
+#define CBVMID        (CBVMID_MASK       << CBVMID_SHIFT)
+#define IRPTNDX       (IRPTNDX_MASK      << IRPTNDX_SHIFT)
+
+
+/* CR */
+#define RPUE          (RPUE_MASK          << RPUE_SHIFT)
+#define RPUERE        (RPUERE_MASK        << RPUERE_SHIFT)
+#define RPUEIE        (RPUEIE_MASK        << RPUEIE_SHIFT)
+#define DCDEE         (DCDEE_MASK         << DCDEE_SHIFT)
+#define CLIENTPD      (CLIENTPD_MASK      << CLIENTPD_SHIFT)
+#define STALLD        (STALLD_MASK        << STALLD_SHIFT)
+#define TLBLKCRWE     (TLBLKCRWE_MASK     << TLBLKCRWE_SHIFT)
+#define CR_TLBIALLCFG (CR_TLBIALLCFG_MASK << CR_TLBIALLCFG_SHIFT)
+#define TLBIVMIDCFG   (TLBIVMIDCFG_MASK   << TLBIVMIDCFG_SHIFT)
+#define CR_HUME       (CR_HUME_MASK       << CR_HUME_SHIFT)
+
+
+/* ESR */
+#define CFG           (CFG_MASK          << CFG_SHIFT)
+#define BYPASS        (BYPASS_MASK       << BYPASS_SHIFT)
+#define ESR_MULTI     (ESR_MULTI_MASK    << ESR_MULTI_SHIFT)
+
+
+/* ESYNR0 */
+#define ESYNR0_AMID   (ESYNR0_AMID_MASK  << ESYNR0_AMID_SHIFT)
+#define ESYNR0_APID   (ESYNR0_APID_MASK  << ESYNR0_APID_SHIFT)
+#define ESYNR0_ABID   (ESYNR0_ABID_MASK  << ESYNR0_ABID_SHIFT)
+#define ESYNR0_AVMID  (ESYNR0_AVMID_MASK << ESYNR0_AVMID_SHIFT)
+#define ESYNR0_ATID   (ESYNR0_ATID_MASK  << ESYNR0_ATID_SHIFT)
+
+
+/* ESYNR1 */
+#define ESYNR1_AMEMTYPE      (ESYNR1_AMEMTYPE_MASK    << ESYNR1_AMEMTYPE_SHIFT)
+#define ESYNR1_ASHARED       (ESYNR1_ASHARED_MASK     << ESYNR1_ASHARED_SHIFT)
+#define ESYNR1_AINNERSHARED  (ESYNR1_AINNERSHARED_MASK<< \
+						ESYNR1_AINNERSHARED_SHIFT)
+#define ESYNR1_APRIV         (ESYNR1_APRIV_MASK       << ESYNR1_APRIV_SHIFT)
+#define ESYNR1_APROTNS       (ESYNR1_APROTNS_MASK     << ESYNR1_APROTNS_SHIFT)
+#define ESYNR1_AINST         (ESYNR1_AINST_MASK       << ESYNR1_AINST_SHIFT)
+#define ESYNR1_AWRITE        (ESYNR1_AWRITE_MASK      << ESYNR1_AWRITE_SHIFT)
+#define ESYNR1_ABURST        (ESYNR1_ABURST_MASK      << ESYNR1_ABURST_SHIFT)
+#define ESYNR1_ALEN          (ESYNR1_ALEN_MASK        << ESYNR1_ALEN_SHIFT)
+#define ESYNR1_ASIZE         (ESYNR1_ASIZE_MASK       << ESYNR1_ASIZE_SHIFT)
+#define ESYNR1_ALOCK         (ESYNR1_ALOCK_MASK       << ESYNR1_ALOCK_SHIFT)
+#define ESYNR1_AOOO          (ESYNR1_AOOO_MASK        << ESYNR1_AOOO_SHIFT)
+#define ESYNR1_AFULL         (ESYNR1_AFULL_MASK       << ESYNR1_AFULL_SHIFT)
+#define ESYNR1_AC            (ESYNR1_AC_MASK          << ESYNR1_AC_SHIFT)
+#define ESYNR1_DCD           (ESYNR1_DCD_MASK         << ESYNR1_DCD_SHIFT)
+
+
+/* IDR */
+#define NM2VCBMT      (NM2VCBMT_MASK     << NM2VCBMT_SHIFT)
+#define HTW           (HTW_MASK          << HTW_SHIFT)
+#define HUM           (HUM_MASK          << HUM_SHIFT)
+#define TLBSIZE       (TLBSIZE_MASK      << TLBSIZE_SHIFT)
+#define NCB           (NCB_MASK          << NCB_SHIFT)
+#define NIRPT         (NIRPT_MASK        << NIRPT_SHIFT)
+
+
+/* M2VCBRn */
+#define VMID          (VMID_MASK         << VMID_SHIFT)
+#define CBNDX         (CBNDX_MASK        << CBNDX_SHIFT)
+#define BYPASSD       (BYPASSD_MASK      << BYPASSD_SHIFT)
+#define BPRCOSH       (BPRCOSH_MASK      << BPRCOSH_SHIFT)
+#define BPRCISH       (BPRCISH_MASK      << BPRCISH_SHIFT)
+#define BPRCNSH       (BPRCNSH_MASK      << BPRCNSH_SHIFT)
+#define BPSHCFG       (BPSHCFG_MASK      << BPSHCFG_SHIFT)
+#define NSCFG         (NSCFG_MASK        << NSCFG_SHIFT)
+#define BPMTCFG       (BPMTCFG_MASK      << BPMTCFG_SHIFT)
+#define BPMEMTYPE     (BPMEMTYPE_MASK    << BPMEMTYPE_SHIFT)
+
+
+/* REV */
+#define IDR_MINOR     (MINOR_MASK        << MINOR_SHIFT)
+#define IDR_MAJOR     (MAJOR_MASK        << MAJOR_SHIFT)
+
+
+/* TESTBUSCR */
+#define TBE           (TBE_MASK          << TBE_SHIFT)
+#define SPDMBE        (SPDMBE_MASK       << SPDMBE_SHIFT)
+#define WGSEL         (WGSEL_MASK        << WGSEL_SHIFT)
+#define TBLSEL        (TBLSEL_MASK       << TBLSEL_SHIFT)
+#define TBHSEL        (TBHSEL_MASK       << TBHSEL_SHIFT)
+#define SPDM0SEL      (SPDM0SEL_MASK     << SPDM0SEL_SHIFT)
+#define SPDM1SEL      (SPDM1SEL_MASK     << SPDM1SEL_SHIFT)
+#define SPDM2SEL      (SPDM2SEL_MASK     << SPDM2SEL_SHIFT)
+#define SPDM3SEL      (SPDM3SEL_MASK     << SPDM3SEL_SHIFT)
+
+
+/* TLBIVMID */
+#define TLBIVMID_VMID (TLBIVMID_VMID_MASK << TLBIVMID_VMID_SHIFT)
+
+
+/* TLBRSW */
+#define TLBRSW_INDEX  (TLBRSW_INDEX_MASK << TLBRSW_INDEX_SHIFT)
+#define TLBBFBS       (TLBBFBS_MASK      << TLBBFBS_SHIFT)
+
+
+/* TLBTR0 */
+#define PR            (PR_MASK           << PR_SHIFT)
+#define PW            (PW_MASK           << PW_SHIFT)
+#define UR            (UR_MASK           << UR_SHIFT)
+#define UW            (UW_MASK           << UW_SHIFT)
+#define XN            (XN_MASK           << XN_SHIFT)
+#define NSDESC        (NSDESC_MASK       << NSDESC_SHIFT)
+#define ISH           (ISH_MASK          << ISH_SHIFT)
+#define SH            (SH_MASK           << SH_SHIFT)
+#define MT            (MT_MASK           << MT_SHIFT)
+#define DPSIZR        (DPSIZR_MASK       << DPSIZR_SHIFT)
+#define DPSIZC        (DPSIZC_MASK       << DPSIZC_SHIFT)
+
+
+/* TLBTR1 */
+#define TLBTR1_VMID   (TLBTR1_VMID_MASK  << TLBTR1_VMID_SHIFT)
+#define TLBTR1_PA     (TLBTR1_PA_MASK    << TLBTR1_PA_SHIFT)
+
+
+/* TLBTR2 */
+#define TLBTR2_ASID   (TLBTR2_ASID_MASK  << TLBTR2_ASID_SHIFT)
+#define TLBTR2_V      (TLBTR2_V_MASK     << TLBTR2_V_SHIFT)
+#define TLBTR2_NSTID  (TLBTR2_NSTID_MASK << TLBTR2_NSTID_SHIFT)
+#define TLBTR2_NV     (TLBTR2_NV_MASK    << TLBTR2_NV_SHIFT)
+#define TLBTR2_VA     (TLBTR2_VA_MASK    << TLBTR2_VA_SHIFT)
+
+
+/* Context Register Fields */
+/* ACTLR */
+#define CFERE              (CFERE_MASK              << CFERE_SHIFT)
+#define CFEIE              (CFEIE_MASK              << CFEIE_SHIFT)
+#define PTSHCFG            (PTSHCFG_MASK            << PTSHCFG_SHIFT)
+#define RCOSH              (RCOSH_MASK              << RCOSH_SHIFT)
+#define RCISH              (RCISH_MASK              << RCISH_SHIFT)
+#define RCNSH              (RCNSH_MASK              << RCNSH_SHIFT)
+#define PRIVCFG            (PRIVCFG_MASK            << PRIVCFG_SHIFT)
+#define DNA                (DNA_MASK                << DNA_SHIFT)
+#define DNLV2PA            (DNLV2PA_MASK            << DNLV2PA_SHIFT)
+#define TLBMCFG            (TLBMCFG_MASK            << TLBMCFG_SHIFT)
+#define CFCFG              (CFCFG_MASK              << CFCFG_SHIFT)
+#define TIPCF              (TIPCF_MASK              << TIPCF_SHIFT)
+#define V2PCFG             (V2PCFG_MASK             << V2PCFG_SHIFT)
+#define HUME               (HUME_MASK               << HUME_SHIFT)
+#define PTMTCFG            (PTMTCFG_MASK            << PTMTCFG_SHIFT)
+#define PTMEMTYPE          (PTMEMTYPE_MASK          << PTMEMTYPE_SHIFT)
+
+
+/* BFBCR */
+#define BFBDFE             (BFBDFE_MASK             << BFBDFE_SHIFT)
+#define BFBSFE             (BFBSFE_MASK             << BFBSFE_SHIFT)
+#define SFVS               (SFVS_MASK               << SFVS_SHIFT)
+#define FLVIC              (FLVIC_MASK              << FLVIC_SHIFT)
+#define SLVIC              (SLVIC_MASK              << SLVIC_SHIFT)
+
+
+/* CONTEXTIDR */
+#define CONTEXTIDR_ASID    (CONTEXTIDR_ASID_MASK    << CONTEXTIDR_ASID_SHIFT)
+#define PROCID             (PROCID_MASK             << PROCID_SHIFT)
+
+
+/* FSR */
+#define TF                 (TF_MASK                 << TF_SHIFT)
+#define AFF                (AFF_MASK                << AFF_SHIFT)
+#define APF                (APF_MASK                << APF_SHIFT)
+#define TLBMF              (TLBMF_MASK              << TLBMF_SHIFT)
+#define HTWDEEF            (HTWDEEF_MASK            << HTWDEEF_SHIFT)
+#define HTWSEEF            (HTWSEEF_MASK            << HTWSEEF_SHIFT)
+#define MHF                (MHF_MASK                << MHF_SHIFT)
+#define SL                 (SL_MASK                 << SL_SHIFT)
+#define SS                 (SS_MASK                 << SS_SHIFT)
+#define MULTI              (MULTI_MASK              << MULTI_SHIFT)
+
+
+/* FSYNR0 */
+#define AMID               (AMID_MASK               << AMID_SHIFT)
+#define APID               (APID_MASK               << APID_SHIFT)
+#define ABID               (ABID_MASK               << ABID_SHIFT)
+#define ATID               (ATID_MASK               << ATID_SHIFT)
+
+
+/* FSYNR1 */
+#define AMEMTYPE           (AMEMTYPE_MASK           << AMEMTYPE_SHIFT)
+#define ASHARED            (ASHARED_MASK            << ASHARED_SHIFT)
+#define AINNERSHARED       (AINNERSHARED_MASK       << AINNERSHARED_SHIFT)
+#define APRIV              (APRIV_MASK              << APRIV_SHIFT)
+#define APROTNS            (APROTNS_MASK            << APROTNS_SHIFT)
+#define AINST              (AINST_MASK              << AINST_SHIFT)
+#define AWRITE             (AWRITE_MASK             << AWRITE_SHIFT)
+#define ABURST             (ABURST_MASK             << ABURST_SHIFT)
+#define ALEN               (ALEN_MASK               << ALEN_SHIFT)
+#define FSYNR1_ASIZE       (FSYNR1_ASIZE_MASK       << FSYNR1_ASIZE_SHIFT)
+#define ALOCK              (ALOCK_MASK              << ALOCK_SHIFT)
+#define AFULL              (AFULL_MASK              << AFULL_SHIFT)
+
+
+/* NMRR */
+#define ICPC0              (ICPC0_MASK              << ICPC0_SHIFT)
+#define ICPC1              (ICPC1_MASK              << ICPC1_SHIFT)
+#define ICPC2              (ICPC2_MASK              << ICPC2_SHIFT)
+#define ICPC3              (ICPC3_MASK              << ICPC3_SHIFT)
+#define ICPC4              (ICPC4_MASK              << ICPC4_SHIFT)
+#define ICPC5              (ICPC5_MASK              << ICPC5_SHIFT)
+#define ICPC6              (ICPC6_MASK              << ICPC6_SHIFT)
+#define ICPC7              (ICPC7_MASK              << ICPC7_SHIFT)
+#define OCPC0              (OCPC0_MASK              << OCPC0_SHIFT)
+#define OCPC1              (OCPC1_MASK              << OCPC1_SHIFT)
+#define OCPC2              (OCPC2_MASK              << OCPC2_SHIFT)
+#define OCPC3              (OCPC3_MASK              << OCPC3_SHIFT)
+#define OCPC4              (OCPC4_MASK              << OCPC4_SHIFT)
+#define OCPC5              (OCPC5_MASK              << OCPC5_SHIFT)
+#define OCPC6              (OCPC6_MASK              << OCPC6_SHIFT)
+#define OCPC7              (OCPC7_MASK              << OCPC7_SHIFT)
+
+
+/* PAR */
+#define FAULT              (FAULT_MASK              << FAULT_SHIFT)
+/* If a fault is present, these are the
+same as the fault fields in the FAR */
+#define FAULT_TF           (FAULT_TF_MASK           << FAULT_TF_SHIFT)
+#define FAULT_AFF          (FAULT_AFF_MASK          << FAULT_AFF_SHIFT)
+#define FAULT_APF          (FAULT_APF_MASK          << FAULT_APF_SHIFT)
+#define FAULT_TLBMF        (FAULT_TLBMF_MASK        << FAULT_TLBMF_SHIFT)
+#define FAULT_HTWDEEF      (FAULT_HTWDEEF_MASK      << FAULT_HTWDEEF_SHIFT)
+#define FAULT_HTWSEEF      (FAULT_HTWSEEF_MASK      << FAULT_HTWSEEF_SHIFT)
+#define FAULT_MHF          (FAULT_MHF_MASK          << FAULT_MHF_SHIFT)
+#define FAULT_SL           (FAULT_SL_MASK           << FAULT_SL_SHIFT)
+#define FAULT_SS           (FAULT_SS_MASK           << FAULT_SS_SHIFT)
+
+/* If NO fault is present, the following fields are in effect */
+/* (FAULT remains as before) */
+#define PAR_NOFAULT_SS     (PAR_NOFAULT_SS_MASK     << PAR_NOFAULT_SS_SHIFT)
+#define PAR_NOFAULT_MT     (PAR_NOFAULT_MT_MASK     << PAR_NOFAULT_MT_SHIFT)
+#define PAR_NOFAULT_SH     (PAR_NOFAULT_SH_MASK     << PAR_NOFAULT_SH_SHIFT)
+#define PAR_NOFAULT_NS     (PAR_NOFAULT_NS_MASK     << PAR_NOFAULT_NS_SHIFT)
+#define PAR_NOFAULT_NOS    (PAR_NOFAULT_NOS_MASK    << PAR_NOFAULT_NOS_SHIFT)
+#define PAR_NPFAULT_PA     (PAR_NPFAULT_PA_MASK     << PAR_NPFAULT_PA_SHIFT)
+
+
+/* PRRR */
+#define MTC0               (MTC0_MASK               << MTC0_SHIFT)
+#define MTC1               (MTC1_MASK               << MTC1_SHIFT)
+#define MTC2               (MTC2_MASK               << MTC2_SHIFT)
+#define MTC3               (MTC3_MASK               << MTC3_SHIFT)
+#define MTC4               (MTC4_MASK               << MTC4_SHIFT)
+#define MTC5               (MTC5_MASK               << MTC5_SHIFT)
+#define MTC6               (MTC6_MASK               << MTC6_SHIFT)
+#define MTC7               (MTC7_MASK               << MTC7_SHIFT)
+#define SHDSH0             (SHDSH0_MASK             << SHDSH0_SHIFT)
+#define SHDSH1             (SHDSH1_MASK             << SHDSH1_SHIFT)
+#define SHNMSH0            (SHNMSH0_MASK            << SHNMSH0_SHIFT)
+#define SHNMSH1            (SHNMSH1_MASK            << SHNMSH1_SHIFT)
+#define NOS0               (NOS0_MASK               << NOS0_SHIFT)
+#define NOS1               (NOS1_MASK               << NOS1_SHIFT)
+#define NOS2               (NOS2_MASK               << NOS2_SHIFT)
+#define NOS3               (NOS3_MASK               << NOS3_SHIFT)
+#define NOS4               (NOS4_MASK               << NOS4_SHIFT)
+#define NOS5               (NOS5_MASK               << NOS5_SHIFT)
+#define NOS6               (NOS6_MASK               << NOS6_SHIFT)
+#define NOS7               (NOS7_MASK               << NOS7_SHIFT)
+
+
+/* RESUME */
+#define TNR                (TNR_MASK                << TNR_SHIFT)
+
+
+/* SCTLR */
+#define M                  (M_MASK                  << M_SHIFT)
+#define TRE                (TRE_MASK                << TRE_SHIFT)
+#define AFE                (AFE_MASK                << AFE_SHIFT)
+#define HAF                (HAF_MASK                << HAF_SHIFT)
+#define BE                 (BE_MASK                 << BE_SHIFT)
+#define AFFD               (AFFD_MASK               << AFFD_SHIFT)
+
+
+/* TLBIASID */
+#define TLBIASID_ASID      (TLBIASID_ASID_MASK      << TLBIASID_ASID_SHIFT)
+
+
+/* TLBIVA */
+#define TLBIVA_ASID        (TLBIVA_ASID_MASK        << TLBIVA_ASID_SHIFT)
+#define TLBIVA_VA          (TLBIVA_VA_MASK          << TLBIVA_VA_SHIFT)
+
+
+/* TLBIVAA */
+#define TLBIVAA_VA         (TLBIVAA_VA_MASK         << TLBIVAA_VA_SHIFT)
+
+
+/* TLBLCKR */
+#define LKE                (LKE_MASK                << LKE_SHIFT)
+#define TLBLCKR_TLBIALLCFG (TLBLCKR_TLBIALLCFG_MASK<<TLBLCKR_TLBIALLCFG_SHIFT)
+#define TLBIASIDCFG        (TLBIASIDCFG_MASK        << TLBIASIDCFG_SHIFT)
+#define TLBIVAACFG         (TLBIVAACFG_MASK         << TLBIVAACFG_SHIFT)
+#define FLOOR              (FLOOR_MASK              << FLOOR_SHIFT)
+#define VICTIM             (VICTIM_MASK             << VICTIM_SHIFT)
+
+
+/* TTBCR */
+#define N                  (N_MASK                  << N_SHIFT)
+#define PD0                (PD0_MASK                << PD0_SHIFT)
+#define PD1                (PD1_MASK                << PD1_SHIFT)
+
+
+/* TTBR0 */
+#define TTBR0_IRGNH        (TTBR0_IRGNH_MASK        << TTBR0_IRGNH_SHIFT)
+#define TTBR0_SH           (TTBR0_SH_MASK           << TTBR0_SH_SHIFT)
+#define TTBR0_ORGN         (TTBR0_ORGN_MASK         << TTBR0_ORGN_SHIFT)
+#define TTBR0_NOS          (TTBR0_NOS_MASK          << TTBR0_NOS_SHIFT)
+#define TTBR0_IRGNL        (TTBR0_IRGNL_MASK        << TTBR0_IRGNL_SHIFT)
+#define TTBR0_PA           (TTBR0_PA_MASK           << TTBR0_PA_SHIFT)
+
+
+/* TTBR1 */
+#define TTBR1_IRGNH        (TTBR1_IRGNH_MASK        << TTBR1_IRGNH_SHIFT)
+#define TTBR1_SH           (TTBR1_SH_MASK           << TTBR1_SH_SHIFT)
+#define TTBR1_ORGN         (TTBR1_ORGN_MASK         << TTBR1_ORGN_SHIFT)
+#define TTBR1_NOS          (TTBR1_NOS_MASK          << TTBR1_NOS_SHIFT)
+#define TTBR1_IRGNL        (TTBR1_IRGNL_MASK        << TTBR1_IRGNL_SHIFT)
+#define TTBR1_PA           (TTBR1_PA_MASK           << TTBR1_PA_SHIFT)
+
+
+/* V2PSR */
+#define HIT                (HIT_MASK                << HIT_SHIFT)
+#define INDEX              (INDEX_MASK              << INDEX_SHIFT)
+
+
+/* V2Pxx */
+#define V2Pxx_INDEX        (V2Pxx_INDEX_MASK        << V2Pxx_INDEX_SHIFT)
+#define V2Pxx_VA           (V2Pxx_VA_MASK           << V2Pxx_VA_SHIFT)
+
+
+/* Global Register Masks */
+/* CBACRn */
+#define RWVMID_MASK               0x1F
+#define RWE_MASK                  0x01
+#define RWGE_MASK                 0x01
+#define CBVMID_MASK               0x1F
+#define IRPTNDX_MASK              0xFF
+
+
+/* CR */
+#define RPUE_MASK                 0x01
+#define RPUERE_MASK               0x01
+#define RPUEIE_MASK               0x01
+#define DCDEE_MASK                0x01
+#define CLIENTPD_MASK             0x01
+#define STALLD_MASK               0x01
+#define TLBLKCRWE_MASK            0x01
+#define CR_TLBIALLCFG_MASK        0x01
+#define TLBIVMIDCFG_MASK          0x01
+#define CR_HUME_MASK              0x01
+
+
+/* ESR */
+#define CFG_MASK                  0x01
+#define BYPASS_MASK               0x01
+#define ESR_MULTI_MASK            0x01
+
+
+/* ESYNR0 */
+#define ESYNR0_AMID_MASK          0xFF
+#define ESYNR0_APID_MASK          0x1F
+#define ESYNR0_ABID_MASK          0x07
+#define ESYNR0_AVMID_MASK         0x1F
+#define ESYNR0_ATID_MASK          0xFF
+
+
+/* ESYNR1 */
+#define ESYNR1_AMEMTYPE_MASK             0x07
+#define ESYNR1_ASHARED_MASK              0x01
+#define ESYNR1_AINNERSHARED_MASK         0x01
+#define ESYNR1_APRIV_MASK                0x01
+#define ESYNR1_APROTNS_MASK              0x01
+#define ESYNR1_AINST_MASK                0x01
+#define ESYNR1_AWRITE_MASK               0x01
+#define ESYNR1_ABURST_MASK               0x01
+#define ESYNR1_ALEN_MASK                 0x0F
+#define ESYNR1_ASIZE_MASK                0x01
+#define ESYNR1_ALOCK_MASK                0x03
+#define ESYNR1_AOOO_MASK                 0x01
+#define ESYNR1_AFULL_MASK                0x01
+#define ESYNR1_AC_MASK                   0x01
+#define ESYNR1_DCD_MASK                  0x01
+
+
+/* IDR */
+#define NM2VCBMT_MASK             0x1FF
+#define HTW_MASK                  0x01
+#define HUM_MASK                  0x01
+#define TLBSIZE_MASK              0x0F
+#define NCB_MASK                  0xFF
+#define NIRPT_MASK                0xFF
+
+
+/* M2VCBRn */
+#define VMID_MASK                 0x1F
+#define CBNDX_MASK                0xFF
+#define BYPASSD_MASK              0x01
+#define BPRCOSH_MASK              0x01
+#define BPRCISH_MASK              0x01
+#define BPRCNSH_MASK              0x01
+#define BPSHCFG_MASK              0x03
+#define NSCFG_MASK                0x03
+#define BPMTCFG_MASK              0x01
+#define BPMEMTYPE_MASK            0x07
+
+
+/* REV */
+#define MINOR_MASK                0x0F
+#define MAJOR_MASK                0x0F
+
+
+/* TESTBUSCR */
+#define TBE_MASK                  0x01
+#define SPDMBE_MASK               0x01
+#define WGSEL_MASK                0x03
+#define TBLSEL_MASK               0x03
+#define TBHSEL_MASK               0x03
+#define SPDM0SEL_MASK             0x0F
+#define SPDM1SEL_MASK             0x0F
+#define SPDM2SEL_MASK             0x0F
+#define SPDM3SEL_MASK             0x0F
+
+
+/* TLBIMID */
+#define TLBIVMID_VMID_MASK        0x1F
+
+
+/* TLBRSW */
+#define TLBRSW_INDEX_MASK         0xFF
+#define TLBBFBS_MASK              0x03
+
+
+/* TLBTR0 */
+#define PR_MASK                   0x01
+#define PW_MASK                   0x01
+#define UR_MASK                   0x01
+#define UW_MASK                   0x01
+#define XN_MASK                   0x01
+#define NSDESC_MASK               0x01
+#define ISH_MASK                  0x01
+#define SH_MASK                   0x01
+#define MT_MASK                   0x07
+#define DPSIZR_MASK               0x07
+#define DPSIZC_MASK               0x07
+
+
+/* TLBTR1 */
+#define TLBTR1_VMID_MASK          0x1F
+#define TLBTR1_PA_MASK            0x000FFFFF
+
+
+/* TLBTR2 */
+#define TLBTR2_ASID_MASK          0xFF
+#define TLBTR2_V_MASK             0x01
+#define TLBTR2_NSTID_MASK         0x01
+#define TLBTR2_NV_MASK            0x01
+#define TLBTR2_VA_MASK            0x000FFFFF
+
+
+/* Global Register Shifts */
+/* CBACRn */
+#define RWVMID_SHIFT             0
+#define RWE_SHIFT                8
+#define RWGE_SHIFT               9
+#define CBVMID_SHIFT             16
+#define IRPTNDX_SHIFT            24
+
+
+/* CR */
+#define RPUE_SHIFT               0
+#define RPUERE_SHIFT             1
+#define RPUEIE_SHIFT             2
+#define DCDEE_SHIFT              3
+#define CLIENTPD_SHIFT           4
+#define STALLD_SHIFT             5
+#define TLBLKCRWE_SHIFT          6
+#define CR_TLBIALLCFG_SHIFT      7
+#define TLBIVMIDCFG_SHIFT        8
+#define CR_HUME_SHIFT            9
+
+
+/* ESR */
+#define CFG_SHIFT                0
+#define BYPASS_SHIFT             1
+#define ESR_MULTI_SHIFT          31
+
+
+/* ESYNR0 */
+#define ESYNR0_AMID_SHIFT        0
+#define ESYNR0_APID_SHIFT        8
+#define ESYNR0_ABID_SHIFT        13
+#define ESYNR0_AVMID_SHIFT       16
+#define ESYNR0_ATID_SHIFT        24
+
+
+/* ESYNR1 */
+#define ESYNR1_AMEMTYPE_SHIFT           0
+#define ESYNR1_ASHARED_SHIFT            3
+#define ESYNR1_AINNERSHARED_SHIFT       4
+#define ESYNR1_APRIV_SHIFT              5
+#define ESYNR1_APROTNS_SHIFT            6
+#define ESYNR1_AINST_SHIFT              7
+#define ESYNR1_AWRITE_SHIFT             8
+#define ESYNR1_ABURST_SHIFT             10
+#define ESYNR1_ALEN_SHIFT               12
+#define ESYNR1_ASIZE_SHIFT              16
+#define ESYNR1_ALOCK_SHIFT              20
+#define ESYNR1_AOOO_SHIFT               22
+#define ESYNR1_AFULL_SHIFT              24
+#define ESYNR1_AC_SHIFT                 30
+#define ESYNR1_DCD_SHIFT                31
+
+
+/* IDR */
+#define NM2VCBMT_SHIFT           0
+#define HTW_SHIFT                9
+#define HUM_SHIFT                10
+#define TLBSIZE_SHIFT            12
+#define NCB_SHIFT                16
+#define NIRPT_SHIFT              24
+
+
+/* M2VCBRn */
+#define VMID_SHIFT               0
+#define CBNDX_SHIFT              8
+#define BYPASSD_SHIFT            16
+#define BPRCOSH_SHIFT            17
+#define BPRCISH_SHIFT            18
+#define BPRCNSH_SHIFT            19
+#define BPSHCFG_SHIFT            20
+#define NSCFG_SHIFT              22
+#define BPMTCFG_SHIFT            24
+#define BPMEMTYPE_SHIFT          25
+
+
+/* REV */
+#define MINOR_SHIFT              0
+#define MAJOR_SHIFT              4
+
+
+/* TESTBUSCR */
+#define TBE_SHIFT                0
+#define SPDMBE_SHIFT             1
+#define WGSEL_SHIFT              8
+#define TBLSEL_SHIFT             12
+#define TBHSEL_SHIFT             14
+#define SPDM0SEL_SHIFT           16
+#define SPDM1SEL_SHIFT           20
+#define SPDM2SEL_SHIFT           24
+#define SPDM3SEL_SHIFT           28
+
+
+/* TLBIMID */
+#define TLBIVMID_VMID_SHIFT      0
+
+
+/* TLBRSW */
+#define TLBRSW_INDEX_SHIFT       0
+#define TLBBFBS_SHIFT            8
+
+
+/* TLBTR0 */
+#define PR_SHIFT                 0
+#define PW_SHIFT                 1
+#define UR_SHIFT                 2
+#define UW_SHIFT                 3
+#define XN_SHIFT                 4
+#define NSDESC_SHIFT             6
+#define ISH_SHIFT                7
+#define SH_SHIFT                 8
+#define MT_SHIFT                 9
+#define DPSIZR_SHIFT             16
+#define DPSIZC_SHIFT             20
+
+
+/* TLBTR1 */
+#define TLBTR1_VMID_SHIFT        0
+#define TLBTR1_PA_SHIFT          12
+
+
+/* TLBTR2 */
+#define TLBTR2_ASID_SHIFT        0
+#define TLBTR2_V_SHIFT           8
+#define TLBTR2_NSTID_SHIFT       9
+#define TLBTR2_NV_SHIFT          10
+#define TLBTR2_VA_SHIFT          12
+
+
+/* Context Register Masks */
+/* ACTLR */
+#define CFERE_MASK                       0x01
+#define CFEIE_MASK                       0x01
+#define PTSHCFG_MASK                     0x03
+#define RCOSH_MASK                       0x01
+#define RCISH_MASK                       0x01
+#define RCNSH_MASK                       0x01
+#define PRIVCFG_MASK                     0x03
+#define DNA_MASK                         0x01
+#define DNLV2PA_MASK                     0x01
+#define TLBMCFG_MASK                     0x03
+#define CFCFG_MASK                       0x01
+#define TIPCF_MASK                       0x01
+#define V2PCFG_MASK                      0x03
+#define HUME_MASK                        0x01
+#define PTMTCFG_MASK                     0x01
+#define PTMEMTYPE_MASK                   0x07
+
+
+/* BFBCR */
+#define BFBDFE_MASK                      0x01
+#define BFBSFE_MASK                      0x01
+#define SFVS_MASK                        0x01
+#define FLVIC_MASK                       0x0F
+#define SLVIC_MASK                       0x0F
+
+
+/* CONTEXTIDR */
+#define CONTEXTIDR_ASID_MASK             0xFF
+#define PROCID_MASK                      0x00FFFFFF
+
+
+/* FSR */
+#define TF_MASK                          0x01
+#define AFF_MASK                         0x01
+#define APF_MASK                         0x01
+#define TLBMF_MASK                       0x01
+#define HTWDEEF_MASK                     0x01
+#define HTWSEEF_MASK                     0x01
+#define MHF_MASK                         0x01
+#define SL_MASK                          0x01
+#define SS_MASK                          0x01
+#define MULTI_MASK                       0x01
+
+
+/* FSYNR0 */
+#define AMID_MASK                        0xFF
+#define APID_MASK                        0x1F
+#define ABID_MASK                        0x07
+#define ATID_MASK                        0xFF
+
+
+/* FSYNR1 */
+#define AMEMTYPE_MASK                    0x07
+#define ASHARED_MASK                     0x01
+#define AINNERSHARED_MASK                0x01
+#define APRIV_MASK                       0x01
+#define APROTNS_MASK                     0x01
+#define AINST_MASK                       0x01
+#define AWRITE_MASK                      0x01
+#define ABURST_MASK                      0x01
+#define ALEN_MASK                        0x0F
+#define FSYNR1_ASIZE_MASK                0x07
+#define ALOCK_MASK                       0x03
+#define AFULL_MASK                       0x01
+
+
+/* NMRR */
+#define ICPC0_MASK                       0x03
+#define ICPC1_MASK                       0x03
+#define ICPC2_MASK                       0x03
+#define ICPC3_MASK                       0x03
+#define ICPC4_MASK                       0x03
+#define ICPC5_MASK                       0x03
+#define ICPC6_MASK                       0x03
+#define ICPC7_MASK                       0x03
+#define OCPC0_MASK                       0x03
+#define OCPC1_MASK                       0x03
+#define OCPC2_MASK                       0x03
+#define OCPC3_MASK                       0x03
+#define OCPC4_MASK                       0x03
+#define OCPC5_MASK                       0x03
+#define OCPC6_MASK                       0x03
+#define OCPC7_MASK                       0x03
+
+
+/* PAR */
+#define FAULT_MASK                       0x01
+/* If a fault is present, these are the
+same as the fault fields in the FAR */
+#define FAULT_TF_MASK                    0x01
+#define FAULT_AFF_MASK                   0x01
+#define FAULT_APF_MASK                   0x01
+#define FAULT_TLBMF_MASK                 0x01
+#define FAULT_HTWDEEF_MASK               0x01
+#define FAULT_HTWSEEF_MASK               0x01
+#define FAULT_MHF_MASK                   0x01
+#define FAULT_SL_MASK                    0x01
+#define FAULT_SS_MASK                    0x01
+
+/* If NO fault is present, the following
+ * fields are in effect
+ * (FAULT remains as before) */
+#define PAR_NOFAULT_SS_MASK              0x01
+#define PAR_NOFAULT_MT_MASK              0x07
+#define PAR_NOFAULT_SH_MASK              0x01
+#define PAR_NOFAULT_NS_MASK              0x01
+#define PAR_NOFAULT_NOS_MASK             0x01
+#define PAR_NPFAULT_PA_MASK              0x000FFFFF
+
+
+/* PRRR */
+#define MTC0_MASK                        0x03
+#define MTC1_MASK                        0x03
+#define MTC2_MASK                        0x03
+#define MTC3_MASK                        0x03
+#define MTC4_MASK                        0x03
+#define MTC5_MASK                        0x03
+#define MTC6_MASK                        0x03
+#define MTC7_MASK                        0x03
+#define SHDSH0_MASK                      0x01
+#define SHDSH1_MASK                      0x01
+#define SHNMSH0_MASK                     0x01
+#define SHNMSH1_MASK                     0x01
+#define NOS0_MASK                        0x01
+#define NOS1_MASK                        0x01
+#define NOS2_MASK                        0x01
+#define NOS3_MASK                        0x01
+#define NOS4_MASK                        0x01
+#define NOS5_MASK                        0x01
+#define NOS6_MASK                        0x01
+#define NOS7_MASK                        0x01
+
+
+/* RESUME */
+#define TNR_MASK                         0x01
+
+
+/* SCTLR */
+#define M_MASK                           0x01
+#define TRE_MASK                         0x01
+#define AFE_MASK                         0x01
+#define HAF_MASK                         0x01
+#define BE_MASK                          0x01
+#define AFFD_MASK                        0x01
+
+
+/* TLBIASID */
+#define TLBIASID_ASID_MASK               0xFF
+
+
+/* TLBIVA */
+#define TLBIVA_ASID_MASK                 0xFF
+#define TLBIVA_VA_MASK                   0x000FFFFF
+
+
+/* TLBIVAA */
+#define TLBIVAA_VA_MASK                  0x000FFFFF
+
+
+/* TLBLCKR */
+#define LKE_MASK                         0x01
+#define TLBLCKR_TLBIALLCFG_MASK          0x01
+#define TLBIASIDCFG_MASK                 0x01
+#define TLBIVAACFG_MASK                  0x01
+#define FLOOR_MASK                       0xFF
+#define VICTIM_MASK                      0xFF
+
+
+/* TTBCR */
+#define N_MASK                           0x07
+#define PD0_MASK                         0x01
+#define PD1_MASK                         0x01
+
+
+/* TTBR0 */
+#define TTBR0_IRGNH_MASK                 0x01
+#define TTBR0_SH_MASK                    0x01
+#define TTBR0_ORGN_MASK                  0x03
+#define TTBR0_NOS_MASK                   0x01
+#define TTBR0_IRGNL_MASK                 0x01
+#define TTBR0_PA_MASK                    0x0003FFFF
+
+
+/* TTBR1 */
+#define TTBR1_IRGNH_MASK                 0x01
+#define TTBR1_SH_MASK                    0x01
+#define TTBR1_ORGN_MASK                  0x03
+#define TTBR1_NOS_MASK                   0x01
+#define TTBR1_IRGNL_MASK                 0x01
+#define TTBR1_PA_MASK                    0x0003FFFF
+
+
+/* V2PSR */
+#define HIT_MASK                         0x01
+#define INDEX_MASK                       0xFF
+
+
+/* V2Pxx */
+#define V2Pxx_INDEX_MASK                 0xFF
+#define V2Pxx_VA_MASK                    0x000FFFFF
+
+
+/* Context Register Shifts */
+/* ACTLR */
+#define CFERE_SHIFT                    0
+#define CFEIE_SHIFT                    1
+#define PTSHCFG_SHIFT                  2
+#define RCOSH_SHIFT                    4
+#define RCISH_SHIFT                    5
+#define RCNSH_SHIFT                    6
+#define PRIVCFG_SHIFT                  8
+#define DNA_SHIFT                      10
+#define DNLV2PA_SHIFT                  11
+#define TLBMCFG_SHIFT                  12
+#define CFCFG_SHIFT                    14
+#define TIPCF_SHIFT                    15
+#define V2PCFG_SHIFT                   16
+#define HUME_SHIFT                     18
+#define PTMTCFG_SHIFT                  20
+#define PTMEMTYPE_SHIFT                21
+
+
+/* BFBCR */
+#define BFBDFE_SHIFT                   0
+#define BFBSFE_SHIFT                   1
+#define SFVS_SHIFT                     2
+#define FLVIC_SHIFT                    4
+#define SLVIC_SHIFT                    8
+
+
+/* CONTEXTIDR */
+#define CONTEXTIDR_ASID_SHIFT          0
+#define PROCID_SHIFT                   8
+
+
+/* FSR */
+#define TF_SHIFT                       1
+#define AFF_SHIFT                      2
+#define APF_SHIFT                      3
+#define TLBMF_SHIFT                    4
+#define HTWDEEF_SHIFT                  5
+#define HTWSEEF_SHIFT                  6
+#define MHF_SHIFT                      7
+#define SL_SHIFT                       16
+#define SS_SHIFT                       30
+#define MULTI_SHIFT                    31
+
+
+/* FSYNR0 */
+#define AMID_SHIFT                     0
+#define APID_SHIFT                     8
+#define ABID_SHIFT                     13
+#define ATID_SHIFT                     24
+
+
+/* FSYNR1 */
+#define AMEMTYPE_SHIFT                 0
+#define ASHARED_SHIFT                  3
+#define AINNERSHARED_SHIFT             4
+#define APRIV_SHIFT                    5
+#define APROTNS_SHIFT                  6
+#define AINST_SHIFT                    7
+#define AWRITE_SHIFT                   8
+#define ABURST_SHIFT                   10
+#define ALEN_SHIFT                     12
+#define FSYNR1_ASIZE_SHIFT             16
+#define ALOCK_SHIFT                    20
+#define AFULL_SHIFT                    24
+
+
+/* NMRR */
+#define ICPC0_SHIFT                    0
+#define ICPC1_SHIFT                    2
+#define ICPC2_SHIFT                    4
+#define ICPC3_SHIFT                    6
+#define ICPC4_SHIFT                    8
+#define ICPC5_SHIFT                    10
+#define ICPC6_SHIFT                    12
+#define ICPC7_SHIFT                    14
+#define OCPC0_SHIFT                    16
+#define OCPC1_SHIFT                    18
+#define OCPC2_SHIFT                    20
+#define OCPC3_SHIFT                    22
+#define OCPC4_SHIFT                    24
+#define OCPC5_SHIFT                    26
+#define OCPC6_SHIFT                    28
+#define OCPC7_SHIFT                    30
+
+
+/* PAR */
+#define FAULT_SHIFT                    0
+/* If a fault is present, these are the
+same as the fault fields in the FAR */
+#define FAULT_TF_SHIFT                 1
+#define FAULT_AFF_SHIFT                2
+#define FAULT_APF_SHIFT                3
+#define FAULT_TLBMF_SHIFT              4
+#define FAULT_HTWDEEF_SHIFT            5
+#define FAULT_HTWSEEF_SHIFT            6
+#define FAULT_MHF_SHIFT                7
+#define FAULT_SL_SHIFT                 16
+#define FAULT_SS_SHIFT                 30
+
+/* If NO fault is present, the following
+ * fields are in effect
+ * (FAULT remains as before) */
+#define PAR_NOFAULT_SS_SHIFT           1
+#define PAR_NOFAULT_MT_SHIFT           4
+#define PAR_NOFAULT_SH_SHIFT           7
+#define PAR_NOFAULT_NS_SHIFT           9
+#define PAR_NOFAULT_NOS_SHIFT          10
+#define PAR_NPFAULT_PA_SHIFT           12
+
+
+/* PRRR */
+#define MTC0_SHIFT                     0
+#define MTC1_SHIFT                     2
+#define MTC2_SHIFT                     4
+#define MTC3_SHIFT                     6
+#define MTC4_SHIFT                     8
+#define MTC5_SHIFT                     10
+#define MTC6_SHIFT                     12
+#define MTC7_SHIFT                     14
+#define SHDSH0_SHIFT                   16
+#define SHDSH1_SHIFT                   17
+#define SHNMSH0_SHIFT                  18
+#define SHNMSH1_SHIFT                  19
+#define NOS0_SHIFT                     24
+#define NOS1_SHIFT                     25
+#define NOS2_SHIFT                     26
+#define NOS3_SHIFT                     27
+#define NOS4_SHIFT                     28
+#define NOS5_SHIFT                     29
+#define NOS6_SHIFT                     30
+#define NOS7_SHIFT                     31
+
+
+/* RESUME */
+#define TNR_SHIFT                      0
+
+
+/* SCTLR */
+#define M_SHIFT                        0
+#define TRE_SHIFT                      1
+#define AFE_SHIFT                      2
+#define HAF_SHIFT                      3
+#define BE_SHIFT                       4
+#define AFFD_SHIFT                     5
+
+
+/* TLBIASID */
+#define TLBIASID_ASID_SHIFT            0
+
+
+/* TLBIVA */
+#define TLBIVA_ASID_SHIFT              0
+#define TLBIVA_VA_SHIFT                12
+
+
+/* TLBIVAA */
+#define TLBIVAA_VA_SHIFT               12
+
+
+/* TLBLCKR */
+#define LKE_SHIFT                      0
+#define TLBLCKR_TLBIALLCFG_SHIFT       1
+#define TLBIASIDCFG_SHIFT              2
+#define TLBIVAACFG_SHIFT               3
+#define FLOOR_SHIFT                    8
+#define VICTIM_SHIFT                   8
+
+
+/* TTBCR */
+#define N_SHIFT                        3
+#define PD0_SHIFT                      4
+#define PD1_SHIFT                      5
+
+
+/* TTBR0 */
+#define TTBR0_IRGNH_SHIFT              0
+#define TTBR0_SH_SHIFT                 1
+#define TTBR0_ORGN_SHIFT               3
+#define TTBR0_NOS_SHIFT                5
+#define TTBR0_IRGNL_SHIFT              6
+#define TTBR0_PA_SHIFT                 14
+
+
+/* TTBR1 */
+#define TTBR1_IRGNH_SHIFT              0
+#define TTBR1_SH_SHIFT                 1
+#define TTBR1_ORGN_SHIFT               3
+#define TTBR1_NOS_SHIFT                5
+#define TTBR1_IRGNL_SHIFT              6
+#define TTBR1_PA_SHIFT                 14
+
+
+/* V2PSR */
+#define HIT_SHIFT                      0
+#define INDEX_SHIFT                    8
+
+
+/* V2Pxx */
+#define V2Pxx_INDEX_SHIFT              0
+#define V2Pxx_VA_SHIFT                 12
+
+#endif
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
new file mode 100644
index 000000000..7304ad88f
--- /dev/null
+++ b/drivers/iommu/mtk_iommu.c
@@ -0,0 +1,786 @@
+/*
+ * Copyright (c) 2015-2016 MediaTek Inc.
+ * Author: Yong Wu <yong.wu@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#include <linux/bootmem.h>
+#include <linux/bug.h>
+#include <linux/clk.h>
+#include <linux/component.h>
+#include <linux/device.h>
+#include <linux/dma-iommu.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/list.h>
+#include <linux/of_address.h>
+#include <linux/of_iommu.h>
+#include <linux/of_irq.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <asm/barrier.h>
+#include <soc/mediatek/smi.h>
+
+#include "mtk_iommu.h"
+
+#define REG_MMU_PT_BASE_ADDR			0x000
+
+#define REG_MMU_INVALIDATE			0x020
+#define F_ALL_INVLD				0x2
+#define F_MMU_INV_RANGE				0x1
+
+#define REG_MMU_INVLD_START_A			0x024
+#define REG_MMU_INVLD_END_A			0x028
+
+#define REG_MMU_INV_SEL				0x038
+#define F_INVLD_EN0				BIT(0)
+#define F_INVLD_EN1				BIT(1)
+
+#define REG_MMU_STANDARD_AXI_MODE		0x048
+#define REG_MMU_DCM_DIS				0x050
+
+#define REG_MMU_CTRL_REG			0x110
+#define F_MMU_PREFETCH_RT_REPLACE_MOD		BIT(4)
+#define F_MMU_TF_PROTECT_SEL_SHIFT(data) \
+	((data)->m4u_plat == M4U_MT2712 ? 4 : 5)
+/* It's named by F_MMU_TF_PROT_SEL in mt2712. */
+#define F_MMU_TF_PROTECT_SEL(prot, data) \
+	(((prot) & 0x3) << F_MMU_TF_PROTECT_SEL_SHIFT(data))
+
+#define REG_MMU_IVRP_PADDR			0x114
+
+#define REG_MMU_VLD_PA_RNG			0x118
+#define F_MMU_VLD_PA_RNG(EA, SA)		(((EA) << 8) | (SA))
+
+#define REG_MMU_INT_CONTROL0			0x120
+#define F_L2_MULIT_HIT_EN			BIT(0)
+#define F_TABLE_WALK_FAULT_INT_EN		BIT(1)
+#define F_PREETCH_FIFO_OVERFLOW_INT_EN		BIT(2)
+#define F_MISS_FIFO_OVERFLOW_INT_EN		BIT(3)
+#define F_PREFETCH_FIFO_ERR_INT_EN		BIT(5)
+#define F_MISS_FIFO_ERR_INT_EN			BIT(6)
+#define F_INT_CLR_BIT				BIT(12)
+
+#define REG_MMU_INT_MAIN_CONTROL		0x124
+#define F_INT_TRANSLATION_FAULT			BIT(0)
+#define F_INT_MAIN_MULTI_HIT_FAULT		BIT(1)
+#define F_INT_INVALID_PA_FAULT			BIT(2)
+#define F_INT_ENTRY_REPLACEMENT_FAULT		BIT(3)
+#define F_INT_TLB_MISS_FAULT			BIT(4)
+#define F_INT_MISS_TRANSACTION_FIFO_FAULT	BIT(5)
+#define F_INT_PRETETCH_TRANSATION_FIFO_FAULT	BIT(6)
+
+#define REG_MMU_CPE_DONE			0x12C
+
+#define REG_MMU_FAULT_ST1			0x134
+
+#define REG_MMU_FAULT_VA			0x13c
+#define F_MMU_FAULT_VA_WRITE_BIT		BIT(1)
+#define F_MMU_FAULT_VA_LAYER_BIT		BIT(0)
+
+#define REG_MMU_INVLD_PA			0x140
+#define REG_MMU_INT_ID				0x150
+#define F_MMU0_INT_ID_LARB_ID(a)		(((a) >> 7) & 0x7)
+#define F_MMU0_INT_ID_PORT_ID(a)		(((a) >> 2) & 0x1f)
+
+#define MTK_PROTECT_PA_ALIGN			128
+
+/*
+ * Get the local arbiter ID and the portid within the larb arbiter
+ * from mtk_m4u_id which is defined by MTK_M4U_ID.
+ */
+#define MTK_M4U_TO_LARB(id)		(((id) >> 5) & 0xf)
+#define MTK_M4U_TO_PORT(id)		((id) & 0x1f)
+
+struct mtk_iommu_domain {
+	spinlock_t			pgtlock; /* lock for page table */
+
+	struct io_pgtable_cfg		cfg;
+	struct io_pgtable_ops		*iop;
+
+	struct iommu_domain		domain;
+};
+
+static struct iommu_ops mtk_iommu_ops;
+
+/*
+ * In M4U 4GB mode, the physical address is remapped as below:
+ *
+ * CPU Physical address:
+ * ====================
+ *
+ * 0      1G       2G     3G       4G     5G
+ * |---A---|---B---|---C---|---D---|---E---|
+ * +--I/O--+------------Memory-------------+
+ *
+ * IOMMU output physical address:
+ *  =============================
+ *
+ *                                 4G      5G     6G      7G      8G
+ *                                 |---E---|---B---|---C---|---D---|
+ *                                 +------------Memory-------------+
+ *
+ * The Region 'A'(I/O) can NOT be mapped by M4U; For Region 'B'/'C'/'D', the
+ * bit32 of the CPU physical address always is needed to set, and for Region
+ * 'E', the CPU physical address keep as is.
+ * Additionally, The iommu consumers always use the CPU phyiscal address.
+ */
+#define MTK_IOMMU_4GB_MODE_REMAP_BASE	 0x40000000
+
+static LIST_HEAD(m4ulist);	/* List all the M4U HWs */
+
+#define for_each_m4u(data)	list_for_each_entry(data, &m4ulist, list)
+
+/*
+ * There may be 1 or 2 M4U HWs, But we always expect they are in the same domain
+ * for the performance.
+ *
+ * Here always return the mtk_iommu_data of the first probed M4U where the
+ * iommu domain information is recorded.
+ */
+static struct mtk_iommu_data *mtk_iommu_get_m4u_data(void)
+{
+	struct mtk_iommu_data *data;
+
+	for_each_m4u(data)
+		return data;
+
+	return NULL;
+}
+
+static struct mtk_iommu_domain *to_mtk_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct mtk_iommu_domain, domain);
+}
+
+static void mtk_iommu_tlb_flush_all(void *cookie)
+{
+	struct mtk_iommu_data *data = cookie;
+
+	for_each_m4u(data) {
+		writel_relaxed(F_INVLD_EN1 | F_INVLD_EN0,
+			       data->base + REG_MMU_INV_SEL);
+		writel_relaxed(F_ALL_INVLD, data->base + REG_MMU_INVALIDATE);
+		wmb(); /* Make sure the tlb flush all done */
+	}
+}
+
+static void mtk_iommu_tlb_add_flush_nosync(unsigned long iova, size_t size,
+					   size_t granule, bool leaf,
+					   void *cookie)
+{
+	struct mtk_iommu_data *data = cookie;
+
+	for_each_m4u(data) {
+		writel_relaxed(F_INVLD_EN1 | F_INVLD_EN0,
+			       data->base + REG_MMU_INV_SEL);
+
+		writel_relaxed(iova, data->base + REG_MMU_INVLD_START_A);
+		writel_relaxed(iova + size - 1,
+			       data->base + REG_MMU_INVLD_END_A);
+		writel_relaxed(F_MMU_INV_RANGE,
+			       data->base + REG_MMU_INVALIDATE);
+		data->tlb_flush_active = true;
+	}
+}
+
+static void mtk_iommu_tlb_sync(void *cookie)
+{
+	struct mtk_iommu_data *data = cookie;
+	int ret;
+	u32 tmp;
+
+	for_each_m4u(data) {
+		/* Avoid timing out if there's nothing to wait for */
+		if (!data->tlb_flush_active)
+			return;
+
+		ret = readl_poll_timeout_atomic(data->base + REG_MMU_CPE_DONE,
+						tmp, tmp != 0, 10, 100000);
+		if (ret) {
+			dev_warn(data->dev,
+				 "Partial TLB flush timed out, falling back to full flush\n");
+			mtk_iommu_tlb_flush_all(cookie);
+		}
+		/* Clear the CPE status */
+		writel_relaxed(0, data->base + REG_MMU_CPE_DONE);
+		data->tlb_flush_active = false;
+	}
+}
+
+static const struct iommu_gather_ops mtk_iommu_gather_ops = {
+	.tlb_flush_all = mtk_iommu_tlb_flush_all,
+	.tlb_add_flush = mtk_iommu_tlb_add_flush_nosync,
+	.tlb_sync = mtk_iommu_tlb_sync,
+};
+
+static irqreturn_t mtk_iommu_isr(int irq, void *dev_id)
+{
+	struct mtk_iommu_data *data = dev_id;
+	struct mtk_iommu_domain *dom = data->m4u_dom;
+	u32 int_state, regval, fault_iova, fault_pa;
+	unsigned int fault_larb, fault_port;
+	bool layer, write;
+
+	/* Read error info from registers */
+	int_state = readl_relaxed(data->base + REG_MMU_FAULT_ST1);
+	fault_iova = readl_relaxed(data->base + REG_MMU_FAULT_VA);
+	layer = fault_iova & F_MMU_FAULT_VA_LAYER_BIT;
+	write = fault_iova & F_MMU_FAULT_VA_WRITE_BIT;
+	fault_pa = readl_relaxed(data->base + REG_MMU_INVLD_PA);
+	regval = readl_relaxed(data->base + REG_MMU_INT_ID);
+	fault_larb = F_MMU0_INT_ID_LARB_ID(regval);
+	fault_port = F_MMU0_INT_ID_PORT_ID(regval);
+
+	if (report_iommu_fault(&dom->domain, data->dev, fault_iova,
+			       write ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ)) {
+		dev_err_ratelimited(
+			data->dev,
+			"fault type=0x%x iova=0x%x pa=0x%x larb=%d port=%d layer=%d %s\n",
+			int_state, fault_iova, fault_pa, fault_larb, fault_port,
+			layer, write ? "write" : "read");
+	}
+
+	/* Interrupt clear */
+	regval = readl_relaxed(data->base + REG_MMU_INT_CONTROL0);
+	regval |= F_INT_CLR_BIT;
+	writel_relaxed(regval, data->base + REG_MMU_INT_CONTROL0);
+
+	mtk_iommu_tlb_flush_all(data);
+
+	return IRQ_HANDLED;
+}
+
+static void mtk_iommu_config(struct mtk_iommu_data *data,
+			     struct device *dev, bool enable)
+{
+	struct mtk_smi_larb_iommu    *larb_mmu;
+	unsigned int                 larbid, portid;
+	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+	int i;
+
+	for (i = 0; i < fwspec->num_ids; ++i) {
+		larbid = MTK_M4U_TO_LARB(fwspec->ids[i]);
+		portid = MTK_M4U_TO_PORT(fwspec->ids[i]);
+		larb_mmu = &data->smi_imu.larb_imu[larbid];
+
+		dev_dbg(dev, "%s iommu port: %d\n",
+			enable ? "enable" : "disable", portid);
+
+		if (enable)
+			larb_mmu->mmu |= MTK_SMI_MMU_EN(portid);
+		else
+			larb_mmu->mmu &= ~MTK_SMI_MMU_EN(portid);
+	}
+}
+
+static int mtk_iommu_domain_finalise(struct mtk_iommu_domain *dom)
+{
+	struct mtk_iommu_data *data = mtk_iommu_get_m4u_data();
+
+	spin_lock_init(&dom->pgtlock);
+
+	dom->cfg = (struct io_pgtable_cfg) {
+		.quirks = IO_PGTABLE_QUIRK_ARM_NS |
+			IO_PGTABLE_QUIRK_NO_PERMS |
+			IO_PGTABLE_QUIRK_TLBI_ON_MAP,
+		.pgsize_bitmap = mtk_iommu_ops.pgsize_bitmap,
+		.ias = 32,
+		.oas = 32,
+		.tlb = &mtk_iommu_gather_ops,
+		.iommu_dev = data->dev,
+	};
+
+	if (data->enable_4GB)
+		dom->cfg.quirks |= IO_PGTABLE_QUIRK_ARM_MTK_4GB;
+
+	dom->iop = alloc_io_pgtable_ops(ARM_V7S, &dom->cfg, data);
+	if (!dom->iop) {
+		dev_err(data->dev, "Failed to alloc io pgtable\n");
+		return -EINVAL;
+	}
+
+	/* Update our support page sizes bitmap */
+	dom->domain.pgsize_bitmap = dom->cfg.pgsize_bitmap;
+	return 0;
+}
+
+static struct iommu_domain *mtk_iommu_domain_alloc(unsigned type)
+{
+	struct mtk_iommu_domain *dom;
+
+	if (type != IOMMU_DOMAIN_DMA)
+		return NULL;
+
+	dom = kzalloc(sizeof(*dom), GFP_KERNEL);
+	if (!dom)
+		return NULL;
+
+	if (iommu_get_dma_cookie(&dom->domain))
+		goto  free_dom;
+
+	if (mtk_iommu_domain_finalise(dom))
+		goto  put_dma_cookie;
+
+	dom->domain.geometry.aperture_start = 0;
+	dom->domain.geometry.aperture_end = DMA_BIT_MASK(32);
+	dom->domain.geometry.force_aperture = true;
+
+	return &dom->domain;
+
+put_dma_cookie:
+	iommu_put_dma_cookie(&dom->domain);
+free_dom:
+	kfree(dom);
+	return NULL;
+}
+
+static void mtk_iommu_domain_free(struct iommu_domain *domain)
+{
+	struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+
+	free_io_pgtable_ops(dom->iop);
+	iommu_put_dma_cookie(domain);
+	kfree(to_mtk_domain(domain));
+}
+
+static int mtk_iommu_attach_device(struct iommu_domain *domain,
+				   struct device *dev)
+{
+	struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+	struct mtk_iommu_data *data = dev->iommu_fwspec->iommu_priv;
+
+	if (!data)
+		return -ENODEV;
+
+	/* Update the pgtable base address register of the M4U HW */
+	if (!data->m4u_dom) {
+		data->m4u_dom = dom;
+		writel(dom->cfg.arm_v7s_cfg.ttbr[0],
+		       data->base + REG_MMU_PT_BASE_ADDR);
+	}
+
+	mtk_iommu_config(data, dev, true);
+	return 0;
+}
+
+static void mtk_iommu_detach_device(struct iommu_domain *domain,
+				    struct device *dev)
+{
+	struct mtk_iommu_data *data = dev->iommu_fwspec->iommu_priv;
+
+	if (!data)
+		return;
+
+	mtk_iommu_config(data, dev, false);
+}
+
+static int mtk_iommu_map(struct iommu_domain *domain, unsigned long iova,
+			 phys_addr_t paddr, size_t size, int prot)
+{
+	struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&dom->pgtlock, flags);
+	ret = dom->iop->map(dom->iop, iova, paddr & DMA_BIT_MASK(32),
+			    size, prot);
+	spin_unlock_irqrestore(&dom->pgtlock, flags);
+
+	return ret;
+}
+
+static size_t mtk_iommu_unmap(struct iommu_domain *domain,
+			      unsigned long iova, size_t size)
+{
+	struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+	unsigned long flags;
+	size_t unmapsz;
+
+	spin_lock_irqsave(&dom->pgtlock, flags);
+	unmapsz = dom->iop->unmap(dom->iop, iova, size);
+	spin_unlock_irqrestore(&dom->pgtlock, flags);
+
+	return unmapsz;
+}
+
+static void mtk_iommu_iotlb_sync(struct iommu_domain *domain)
+{
+	mtk_iommu_tlb_flush_all(mtk_iommu_get_m4u_data());
+}
+
+static phys_addr_t mtk_iommu_iova_to_phys(struct iommu_domain *domain,
+					  dma_addr_t iova)
+{
+	struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+	struct mtk_iommu_data *data = mtk_iommu_get_m4u_data();
+	unsigned long flags;
+	phys_addr_t pa;
+
+	spin_lock_irqsave(&dom->pgtlock, flags);
+	pa = dom->iop->iova_to_phys(dom->iop, iova);
+	spin_unlock_irqrestore(&dom->pgtlock, flags);
+
+	if (data->enable_4GB && pa < MTK_IOMMU_4GB_MODE_REMAP_BASE)
+		pa |= BIT_ULL(32);
+
+	return pa;
+}
+
+static int mtk_iommu_add_device(struct device *dev)
+{
+	struct mtk_iommu_data *data;
+	struct iommu_group *group;
+
+	if (!dev->iommu_fwspec || dev->iommu_fwspec->ops != &mtk_iommu_ops)
+		return -ENODEV; /* Not a iommu client device */
+
+	data = dev->iommu_fwspec->iommu_priv;
+	iommu_device_link(&data->iommu, dev);
+
+	group = iommu_group_get_for_dev(dev);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	iommu_group_put(group);
+	return 0;
+}
+
+static void mtk_iommu_remove_device(struct device *dev)
+{
+	struct mtk_iommu_data *data;
+
+	if (!dev->iommu_fwspec || dev->iommu_fwspec->ops != &mtk_iommu_ops)
+		return;
+
+	data = dev->iommu_fwspec->iommu_priv;
+	iommu_device_unlink(&data->iommu, dev);
+
+	iommu_group_remove_device(dev);
+	iommu_fwspec_free(dev);
+}
+
+static struct iommu_group *mtk_iommu_device_group(struct device *dev)
+{
+	struct mtk_iommu_data *data = mtk_iommu_get_m4u_data();
+
+	if (!data)
+		return ERR_PTR(-ENODEV);
+
+	/* All the client devices are in the same m4u iommu-group */
+	if (!data->m4u_group) {
+		data->m4u_group = iommu_group_alloc();
+		if (IS_ERR(data->m4u_group))
+			dev_err(dev, "Failed to allocate M4U IOMMU group\n");
+	} else {
+		iommu_group_ref_get(data->m4u_group);
+	}
+	return data->m4u_group;
+}
+
+static int mtk_iommu_of_xlate(struct device *dev, struct of_phandle_args *args)
+{
+	struct platform_device *m4updev;
+
+	if (args->args_count != 1) {
+		dev_err(dev, "invalid #iommu-cells(%d) property for IOMMU\n",
+			args->args_count);
+		return -EINVAL;
+	}
+
+	if (!dev->iommu_fwspec->iommu_priv) {
+		/* Get the m4u device */
+		m4updev = of_find_device_by_node(args->np);
+		if (WARN_ON(!m4updev))
+			return -EINVAL;
+
+		dev->iommu_fwspec->iommu_priv = platform_get_drvdata(m4updev);
+	}
+
+	return iommu_fwspec_add_ids(dev, args->args, 1);
+}
+
+static struct iommu_ops mtk_iommu_ops = {
+	.domain_alloc	= mtk_iommu_domain_alloc,
+	.domain_free	= mtk_iommu_domain_free,
+	.attach_dev	= mtk_iommu_attach_device,
+	.detach_dev	= mtk_iommu_detach_device,
+	.map		= mtk_iommu_map,
+	.unmap		= mtk_iommu_unmap,
+	.flush_iotlb_all = mtk_iommu_iotlb_sync,
+	.iotlb_sync	= mtk_iommu_iotlb_sync,
+	.iova_to_phys	= mtk_iommu_iova_to_phys,
+	.add_device	= mtk_iommu_add_device,
+	.remove_device	= mtk_iommu_remove_device,
+	.device_group	= mtk_iommu_device_group,
+	.of_xlate	= mtk_iommu_of_xlate,
+	.pgsize_bitmap	= SZ_4K | SZ_64K | SZ_1M | SZ_16M,
+};
+
+static int mtk_iommu_hw_init(const struct mtk_iommu_data *data)
+{
+	u32 regval;
+	int ret;
+
+	ret = clk_prepare_enable(data->bclk);
+	if (ret) {
+		dev_err(data->dev, "Failed to enable iommu bclk(%d)\n", ret);
+		return ret;
+	}
+
+	regval = F_MMU_TF_PROTECT_SEL(2, data);
+	if (data->m4u_plat == M4U_MT8173)
+		regval |= F_MMU_PREFETCH_RT_REPLACE_MOD;
+	writel_relaxed(regval, data->base + REG_MMU_CTRL_REG);
+
+	regval = F_L2_MULIT_HIT_EN |
+		F_TABLE_WALK_FAULT_INT_EN |
+		F_PREETCH_FIFO_OVERFLOW_INT_EN |
+		F_MISS_FIFO_OVERFLOW_INT_EN |
+		F_PREFETCH_FIFO_ERR_INT_EN |
+		F_MISS_FIFO_ERR_INT_EN;
+	writel_relaxed(regval, data->base + REG_MMU_INT_CONTROL0);
+
+	regval = F_INT_TRANSLATION_FAULT |
+		F_INT_MAIN_MULTI_HIT_FAULT |
+		F_INT_INVALID_PA_FAULT |
+		F_INT_ENTRY_REPLACEMENT_FAULT |
+		F_INT_TLB_MISS_FAULT |
+		F_INT_MISS_TRANSACTION_FIFO_FAULT |
+		F_INT_PRETETCH_TRANSATION_FIFO_FAULT;
+	writel_relaxed(regval, data->base + REG_MMU_INT_MAIN_CONTROL);
+
+	if (data->m4u_plat == M4U_MT8173)
+		regval = (data->protect_base >> 1) | (data->enable_4GB << 31);
+	else
+		regval = lower_32_bits(data->protect_base) |
+			 upper_32_bits(data->protect_base);
+	writel_relaxed(regval, data->base + REG_MMU_IVRP_PADDR);
+
+	if (data->enable_4GB && data->m4u_plat != M4U_MT8173) {
+		/*
+		 * If 4GB mode is enabled, the validate PA range is from
+		 * 0x1_0000_0000 to 0x1_ffff_ffff. here record bit[32:30].
+		 */
+		regval = F_MMU_VLD_PA_RNG(7, 4);
+		writel_relaxed(regval, data->base + REG_MMU_VLD_PA_RNG);
+	}
+	writel_relaxed(0, data->base + REG_MMU_DCM_DIS);
+
+	/* It's MISC control register whose default value is ok except mt8173.*/
+	if (data->m4u_plat == M4U_MT8173)
+		writel_relaxed(0, data->base + REG_MMU_STANDARD_AXI_MODE);
+
+	if (devm_request_irq(data->dev, data->irq, mtk_iommu_isr, 0,
+			     dev_name(data->dev), (void *)data)) {
+		writel_relaxed(0, data->base + REG_MMU_PT_BASE_ADDR);
+		clk_disable_unprepare(data->bclk);
+		dev_err(data->dev, "Failed @ IRQ-%d Request\n", data->irq);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+static const struct component_master_ops mtk_iommu_com_ops = {
+	.bind		= mtk_iommu_bind,
+	.unbind		= mtk_iommu_unbind,
+};
+
+static int mtk_iommu_probe(struct platform_device *pdev)
+{
+	struct mtk_iommu_data   *data;
+	struct device           *dev = &pdev->dev;
+	struct resource         *res;
+	resource_size_t		ioaddr;
+	struct component_match  *match = NULL;
+	void                    *protect;
+	int                     i, larb_nr, ret;
+
+	data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+	data->dev = dev;
+	data->m4u_plat = (enum mtk_iommu_plat)of_device_get_match_data(dev);
+
+	/* Protect memory. HW will access here while translation fault.*/
+	protect = devm_kzalloc(dev, MTK_PROTECT_PA_ALIGN * 2, GFP_KERNEL);
+	if (!protect)
+		return -ENOMEM;
+	data->protect_base = ALIGN(virt_to_phys(protect), MTK_PROTECT_PA_ALIGN);
+
+	/* Whether the current dram is over 4GB */
+	data->enable_4GB = !!(max_pfn > (BIT_ULL(32) >> PAGE_SHIFT));
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	data->base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(data->base))
+		return PTR_ERR(data->base);
+	ioaddr = res->start;
+
+	data->irq = platform_get_irq(pdev, 0);
+	if (data->irq < 0)
+		return data->irq;
+
+	data->bclk = devm_clk_get(dev, "bclk");
+	if (IS_ERR(data->bclk))
+		return PTR_ERR(data->bclk);
+
+	larb_nr = of_count_phandle_with_args(dev->of_node,
+					     "mediatek,larbs", NULL);
+	if (larb_nr < 0)
+		return larb_nr;
+	data->smi_imu.larb_nr = larb_nr;
+
+	for (i = 0; i < larb_nr; i++) {
+		struct device_node *larbnode;
+		struct platform_device *plarbdev;
+		u32 id;
+
+		larbnode = of_parse_phandle(dev->of_node, "mediatek,larbs", i);
+		if (!larbnode)
+			return -EINVAL;
+
+		if (!of_device_is_available(larbnode))
+			continue;
+
+		ret = of_property_read_u32(larbnode, "mediatek,larb-id", &id);
+		if (ret)/* The id is consecutive if there is no this property */
+			id = i;
+
+		plarbdev = of_find_device_by_node(larbnode);
+		if (!plarbdev)
+			return -EPROBE_DEFER;
+		data->smi_imu.larb_imu[id].dev = &plarbdev->dev;
+
+		component_match_add_release(dev, &match, release_of,
+					    compare_of, larbnode);
+	}
+
+	platform_set_drvdata(pdev, data);
+
+	ret = mtk_iommu_hw_init(data);
+	if (ret)
+		return ret;
+
+	ret = iommu_device_sysfs_add(&data->iommu, dev, NULL,
+				     "mtk-iommu.%pa", &ioaddr);
+	if (ret)
+		return ret;
+
+	iommu_device_set_ops(&data->iommu, &mtk_iommu_ops);
+	iommu_device_set_fwnode(&data->iommu, &pdev->dev.of_node->fwnode);
+
+	ret = iommu_device_register(&data->iommu);
+	if (ret)
+		return ret;
+
+	list_add_tail(&data->list, &m4ulist);
+
+	if (!iommu_present(&platform_bus_type))
+		bus_set_iommu(&platform_bus_type, &mtk_iommu_ops);
+
+	return component_master_add_with_match(dev, &mtk_iommu_com_ops, match);
+}
+
+static int mtk_iommu_remove(struct platform_device *pdev)
+{
+	struct mtk_iommu_data *data = platform_get_drvdata(pdev);
+
+	iommu_device_sysfs_remove(&data->iommu);
+	iommu_device_unregister(&data->iommu);
+
+	list_del(&data->list);
+
+	clk_disable_unprepare(data->bclk);
+	devm_free_irq(&pdev->dev, data->irq, data);
+	component_master_del(&pdev->dev, &mtk_iommu_com_ops);
+	return 0;
+}
+
+static int __maybe_unused mtk_iommu_suspend(struct device *dev)
+{
+	struct mtk_iommu_data *data = dev_get_drvdata(dev);
+	struct mtk_iommu_suspend_reg *reg = &data->reg;
+	void __iomem *base = data->base;
+
+	reg->standard_axi_mode = readl_relaxed(base +
+					       REG_MMU_STANDARD_AXI_MODE);
+	reg->dcm_dis = readl_relaxed(base + REG_MMU_DCM_DIS);
+	reg->ctrl_reg = readl_relaxed(base + REG_MMU_CTRL_REG);
+	reg->int_control0 = readl_relaxed(base + REG_MMU_INT_CONTROL0);
+	reg->int_main_control = readl_relaxed(base + REG_MMU_INT_MAIN_CONTROL);
+	reg->ivrp_paddr = readl_relaxed(base + REG_MMU_IVRP_PADDR);
+	clk_disable_unprepare(data->bclk);
+	return 0;
+}
+
+static int __maybe_unused mtk_iommu_resume(struct device *dev)
+{
+	struct mtk_iommu_data *data = dev_get_drvdata(dev);
+	struct mtk_iommu_suspend_reg *reg = &data->reg;
+	void __iomem *base = data->base;
+	int ret;
+
+	ret = clk_prepare_enable(data->bclk);
+	if (ret) {
+		dev_err(data->dev, "Failed to enable clk(%d) in resume\n", ret);
+		return ret;
+	}
+	writel_relaxed(reg->standard_axi_mode,
+		       base + REG_MMU_STANDARD_AXI_MODE);
+	writel_relaxed(reg->dcm_dis, base + REG_MMU_DCM_DIS);
+	writel_relaxed(reg->ctrl_reg, base + REG_MMU_CTRL_REG);
+	writel_relaxed(reg->int_control0, base + REG_MMU_INT_CONTROL0);
+	writel_relaxed(reg->int_main_control, base + REG_MMU_INT_MAIN_CONTROL);
+	writel_relaxed(reg->ivrp_paddr, base + REG_MMU_IVRP_PADDR);
+	if (data->m4u_dom)
+		writel(data->m4u_dom->cfg.arm_v7s_cfg.ttbr[0],
+		       base + REG_MMU_PT_BASE_ADDR);
+	return 0;
+}
+
+static const struct dev_pm_ops mtk_iommu_pm_ops = {
+	SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(mtk_iommu_suspend, mtk_iommu_resume)
+};
+
+static const struct of_device_id mtk_iommu_of_ids[] = {
+	{ .compatible = "mediatek,mt2712-m4u", .data = (void *)M4U_MT2712},
+	{ .compatible = "mediatek,mt8173-m4u", .data = (void *)M4U_MT8173},
+	{}
+};
+
+static struct platform_driver mtk_iommu_driver = {
+	.probe	= mtk_iommu_probe,
+	.remove	= mtk_iommu_remove,
+	.driver	= {
+		.name = "mtk-iommu",
+		.of_match_table = of_match_ptr(mtk_iommu_of_ids),
+		.pm = &mtk_iommu_pm_ops,
+	}
+};
+
+static int __init mtk_iommu_init(void)
+{
+	int ret;
+
+	ret = platform_driver_register(&mtk_iommu_driver);
+	if (ret != 0)
+		pr_err("Failed to register MTK IOMMU driver\n");
+
+	return ret;
+}
+
+subsys_initcall(mtk_iommu_init)
diff --git a/drivers/iommu/mtk_iommu.h b/drivers/iommu/mtk_iommu.h
new file mode 100644
index 000000000..778498b86
--- /dev/null
+++ b/drivers/iommu/mtk_iommu.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2015-2016 MediaTek Inc.
+ * Author: Honghui Zhang <honghui.zhang@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _MTK_IOMMU_H_
+#define _MTK_IOMMU_H_
+
+#include <linux/clk.h>
+#include <linux/component.h>
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/iommu.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <soc/mediatek/smi.h>
+
+#include "io-pgtable.h"
+
+struct mtk_iommu_suspend_reg {
+	u32				standard_axi_mode;
+	u32				dcm_dis;
+	u32				ctrl_reg;
+	u32				int_control0;
+	u32				int_main_control;
+	u32				ivrp_paddr;
+};
+
+enum mtk_iommu_plat {
+	M4U_MT2701,
+	M4U_MT2712,
+	M4U_MT8173,
+};
+
+struct mtk_iommu_domain;
+
+struct mtk_iommu_data {
+	void __iomem			*base;
+	int				irq;
+	struct device			*dev;
+	struct clk			*bclk;
+	phys_addr_t			protect_base; /* protect memory base */
+	struct mtk_iommu_suspend_reg	reg;
+	struct mtk_iommu_domain		*m4u_dom;
+	struct iommu_group		*m4u_group;
+	struct mtk_smi_iommu		smi_imu;      /* SMI larb iommu info */
+	bool                            enable_4GB;
+	bool				tlb_flush_active;
+
+	struct iommu_device		iommu;
+	enum mtk_iommu_plat		m4u_plat;
+
+	struct list_head		list;
+};
+
+static inline int compare_of(struct device *dev, void *data)
+{
+	return dev->of_node == data;
+}
+
+static inline void release_of(struct device *dev, void *data)
+{
+	of_node_put(data);
+}
+
+static inline int mtk_iommu_bind(struct device *dev)
+{
+	struct mtk_iommu_data *data = dev_get_drvdata(dev);
+
+	return component_bind_all(dev, &data->smi_imu);
+}
+
+static inline void mtk_iommu_unbind(struct device *dev)
+{
+	struct mtk_iommu_data *data = dev_get_drvdata(dev);
+
+	component_unbind_all(dev, &data->smi_imu);
+}
+
+#endif
diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c
new file mode 100644
index 000000000..676c02949
--- /dev/null
+++ b/drivers/iommu/mtk_iommu_v1.c
@@ -0,0 +1,718 @@
+/*
+ * Copyright (c) 2015-2016 MediaTek Inc.
+ * Author: Honghui Zhang <honghui.zhang@mediatek.com>
+ *
+ * Based on driver/iommu/mtk_iommu.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#include <linux/bootmem.h>
+#include <linux/bug.h>
+#include <linux/clk.h>
+#include <linux/component.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/dma-iommu.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/list.h>
+#include <linux/of_address.h>
+#include <linux/of_iommu.h>
+#include <linux/of_irq.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <asm/barrier.h>
+#include <asm/dma-iommu.h>
+#include <linux/module.h>
+#include <dt-bindings/memory/mt2701-larb-port.h>
+#include <soc/mediatek/smi.h>
+#include "mtk_iommu.h"
+
+#define REG_MMU_PT_BASE_ADDR			0x000
+
+#define F_ALL_INVLD				0x2
+#define F_MMU_INV_RANGE				0x1
+#define F_INVLD_EN0				BIT(0)
+#define F_INVLD_EN1				BIT(1)
+
+#define F_MMU_FAULT_VA_MSK			0xfffff000
+#define MTK_PROTECT_PA_ALIGN			128
+
+#define REG_MMU_CTRL_REG			0x210
+#define F_MMU_CTRL_COHERENT_EN			BIT(8)
+#define REG_MMU_IVRP_PADDR			0x214
+#define REG_MMU_INT_CONTROL			0x220
+#define F_INT_TRANSLATION_FAULT			BIT(0)
+#define F_INT_MAIN_MULTI_HIT_FAULT		BIT(1)
+#define F_INT_INVALID_PA_FAULT			BIT(2)
+#define F_INT_ENTRY_REPLACEMENT_FAULT		BIT(3)
+#define F_INT_TABLE_WALK_FAULT			BIT(4)
+#define F_INT_TLB_MISS_FAULT			BIT(5)
+#define F_INT_PFH_DMA_FIFO_OVERFLOW		BIT(6)
+#define F_INT_MISS_DMA_FIFO_OVERFLOW		BIT(7)
+
+#define F_MMU_TF_PROTECT_SEL(prot)		(((prot) & 0x3) << 5)
+#define F_INT_CLR_BIT				BIT(12)
+
+#define REG_MMU_FAULT_ST			0x224
+#define REG_MMU_FAULT_VA			0x228
+#define REG_MMU_INVLD_PA			0x22C
+#define REG_MMU_INT_ID				0x388
+#define REG_MMU_INVALIDATE			0x5c0
+#define REG_MMU_INVLD_START_A			0x5c4
+#define REG_MMU_INVLD_END_A			0x5c8
+
+#define REG_MMU_INV_SEL				0x5d8
+#define REG_MMU_STANDARD_AXI_MODE		0x5e8
+
+#define REG_MMU_DCM				0x5f0
+#define F_MMU_DCM_ON				BIT(1)
+#define REG_MMU_CPE_DONE			0x60c
+#define F_DESC_VALID				0x2
+#define F_DESC_NONSEC				BIT(3)
+#define MT2701_M4U_TF_LARB(TF)			(6 - (((TF) >> 13) & 0x7))
+#define MT2701_M4U_TF_PORT(TF)			(((TF) >> 8) & 0xF)
+/* MTK generation one iommu HW only support 4K size mapping */
+#define MT2701_IOMMU_PAGE_SHIFT			12
+#define MT2701_IOMMU_PAGE_SIZE			(1UL << MT2701_IOMMU_PAGE_SHIFT)
+
+/*
+ * MTK m4u support 4GB iova address space, and only support 4K page
+ * mapping. So the pagetable size should be exactly as 4M.
+ */
+#define M2701_IOMMU_PGT_SIZE			SZ_4M
+
+struct mtk_iommu_domain {
+	spinlock_t			pgtlock; /* lock for page table */
+	struct iommu_domain		domain;
+	u32				*pgt_va;
+	dma_addr_t			pgt_pa;
+	struct mtk_iommu_data		*data;
+};
+
+static struct mtk_iommu_domain *to_mtk_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct mtk_iommu_domain, domain);
+}
+
+static const int mt2701_m4u_in_larb[] = {
+	LARB0_PORT_OFFSET, LARB1_PORT_OFFSET,
+	LARB2_PORT_OFFSET, LARB3_PORT_OFFSET
+};
+
+static inline int mt2701_m4u_to_larb(int id)
+{
+	int i;
+
+	for (i = ARRAY_SIZE(mt2701_m4u_in_larb) - 1; i >= 0; i--)
+		if ((id) >= mt2701_m4u_in_larb[i])
+			return i;
+
+	return 0;
+}
+
+static inline int mt2701_m4u_to_port(int id)
+{
+	int larb = mt2701_m4u_to_larb(id);
+
+	return id - mt2701_m4u_in_larb[larb];
+}
+
+static void mtk_iommu_tlb_flush_all(struct mtk_iommu_data *data)
+{
+	writel_relaxed(F_INVLD_EN1 | F_INVLD_EN0,
+			data->base + REG_MMU_INV_SEL);
+	writel_relaxed(F_ALL_INVLD, data->base + REG_MMU_INVALIDATE);
+	wmb(); /* Make sure the tlb flush all done */
+}
+
+static void mtk_iommu_tlb_flush_range(struct mtk_iommu_data *data,
+				unsigned long iova, size_t size)
+{
+	int ret;
+	u32 tmp;
+
+	writel_relaxed(F_INVLD_EN1 | F_INVLD_EN0,
+		data->base + REG_MMU_INV_SEL);
+	writel_relaxed(iova & F_MMU_FAULT_VA_MSK,
+		data->base + REG_MMU_INVLD_START_A);
+	writel_relaxed((iova + size - 1) & F_MMU_FAULT_VA_MSK,
+		data->base + REG_MMU_INVLD_END_A);
+	writel_relaxed(F_MMU_INV_RANGE, data->base + REG_MMU_INVALIDATE);
+
+	ret = readl_poll_timeout_atomic(data->base + REG_MMU_CPE_DONE,
+				tmp, tmp != 0, 10, 100000);
+	if (ret) {
+		dev_warn(data->dev,
+			 "Partial TLB flush timed out, falling back to full flush\n");
+		mtk_iommu_tlb_flush_all(data);
+	}
+	/* Clear the CPE status */
+	writel_relaxed(0, data->base + REG_MMU_CPE_DONE);
+}
+
+static irqreturn_t mtk_iommu_isr(int irq, void *dev_id)
+{
+	struct mtk_iommu_data *data = dev_id;
+	struct mtk_iommu_domain *dom = data->m4u_dom;
+	u32 int_state, regval, fault_iova, fault_pa;
+	unsigned int fault_larb, fault_port;
+
+	/* Read error information from registers */
+	int_state = readl_relaxed(data->base + REG_MMU_FAULT_ST);
+	fault_iova = readl_relaxed(data->base + REG_MMU_FAULT_VA);
+
+	fault_iova &= F_MMU_FAULT_VA_MSK;
+	fault_pa = readl_relaxed(data->base + REG_MMU_INVLD_PA);
+	regval = readl_relaxed(data->base + REG_MMU_INT_ID);
+	fault_larb = MT2701_M4U_TF_LARB(regval);
+	fault_port = MT2701_M4U_TF_PORT(regval);
+
+	/*
+	 * MTK v1 iommu HW could not determine whether the fault is read or
+	 * write fault, report as read fault.
+	 */
+	if (report_iommu_fault(&dom->domain, data->dev, fault_iova,
+			IOMMU_FAULT_READ))
+		dev_err_ratelimited(data->dev,
+			"fault type=0x%x iova=0x%x pa=0x%x larb=%d port=%d\n",
+			int_state, fault_iova, fault_pa,
+			fault_larb, fault_port);
+
+	/* Interrupt clear */
+	regval = readl_relaxed(data->base + REG_MMU_INT_CONTROL);
+	regval |= F_INT_CLR_BIT;
+	writel_relaxed(regval, data->base + REG_MMU_INT_CONTROL);
+
+	mtk_iommu_tlb_flush_all(data);
+
+	return IRQ_HANDLED;
+}
+
+static void mtk_iommu_config(struct mtk_iommu_data *data,
+			     struct device *dev, bool enable)
+{
+	struct mtk_smi_larb_iommu    *larb_mmu;
+	unsigned int                 larbid, portid;
+	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+	int i;
+
+	for (i = 0; i < fwspec->num_ids; ++i) {
+		larbid = mt2701_m4u_to_larb(fwspec->ids[i]);
+		portid = mt2701_m4u_to_port(fwspec->ids[i]);
+		larb_mmu = &data->smi_imu.larb_imu[larbid];
+
+		dev_dbg(dev, "%s iommu port: %d\n",
+			enable ? "enable" : "disable", portid);
+
+		if (enable)
+			larb_mmu->mmu |= MTK_SMI_MMU_EN(portid);
+		else
+			larb_mmu->mmu &= ~MTK_SMI_MMU_EN(portid);
+	}
+}
+
+static int mtk_iommu_domain_finalise(struct mtk_iommu_data *data)
+{
+	struct mtk_iommu_domain *dom = data->m4u_dom;
+
+	spin_lock_init(&dom->pgtlock);
+
+	dom->pgt_va = dma_zalloc_coherent(data->dev,
+				M2701_IOMMU_PGT_SIZE,
+				&dom->pgt_pa, GFP_KERNEL);
+	if (!dom->pgt_va)
+		return -ENOMEM;
+
+	writel(dom->pgt_pa, data->base + REG_MMU_PT_BASE_ADDR);
+
+	dom->data = data;
+
+	return 0;
+}
+
+static struct iommu_domain *mtk_iommu_domain_alloc(unsigned type)
+{
+	struct mtk_iommu_domain *dom;
+
+	if (type != IOMMU_DOMAIN_UNMANAGED)
+		return NULL;
+
+	dom = kzalloc(sizeof(*dom), GFP_KERNEL);
+	if (!dom)
+		return NULL;
+
+	return &dom->domain;
+}
+
+static void mtk_iommu_domain_free(struct iommu_domain *domain)
+{
+	struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+	struct mtk_iommu_data *data = dom->data;
+
+	dma_free_coherent(data->dev, M2701_IOMMU_PGT_SIZE,
+			dom->pgt_va, dom->pgt_pa);
+	kfree(to_mtk_domain(domain));
+}
+
+static int mtk_iommu_attach_device(struct iommu_domain *domain,
+				   struct device *dev)
+{
+	struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+	struct mtk_iommu_data *data = dev->iommu_fwspec->iommu_priv;
+	int ret;
+
+	if (!data)
+		return -ENODEV;
+
+	if (!data->m4u_dom) {
+		data->m4u_dom = dom;
+		ret = mtk_iommu_domain_finalise(data);
+		if (ret) {
+			data->m4u_dom = NULL;
+			return ret;
+		}
+	}
+
+	mtk_iommu_config(data, dev, true);
+	return 0;
+}
+
+static void mtk_iommu_detach_device(struct iommu_domain *domain,
+				    struct device *dev)
+{
+	struct mtk_iommu_data *data = dev->iommu_fwspec->iommu_priv;
+
+	if (!data)
+		return;
+
+	mtk_iommu_config(data, dev, false);
+}
+
+static int mtk_iommu_map(struct iommu_domain *domain, unsigned long iova,
+			 phys_addr_t paddr, size_t size, int prot)
+{
+	struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+	unsigned int page_num = size >> MT2701_IOMMU_PAGE_SHIFT;
+	unsigned long flags;
+	unsigned int i;
+	u32 *pgt_base_iova = dom->pgt_va + (iova  >> MT2701_IOMMU_PAGE_SHIFT);
+	u32 pabase = (u32)paddr;
+	int map_size = 0;
+
+	spin_lock_irqsave(&dom->pgtlock, flags);
+	for (i = 0; i < page_num; i++) {
+		if (pgt_base_iova[i]) {
+			memset(pgt_base_iova, 0, i * sizeof(u32));
+			break;
+		}
+		pgt_base_iova[i] = pabase | F_DESC_VALID | F_DESC_NONSEC;
+		pabase += MT2701_IOMMU_PAGE_SIZE;
+		map_size += MT2701_IOMMU_PAGE_SIZE;
+	}
+
+	spin_unlock_irqrestore(&dom->pgtlock, flags);
+
+	mtk_iommu_tlb_flush_range(dom->data, iova, size);
+
+	return map_size == size ? 0 : -EEXIST;
+}
+
+static size_t mtk_iommu_unmap(struct iommu_domain *domain,
+			      unsigned long iova, size_t size)
+{
+	struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+	unsigned long flags;
+	u32 *pgt_base_iova = dom->pgt_va + (iova  >> MT2701_IOMMU_PAGE_SHIFT);
+	unsigned int page_num = size >> MT2701_IOMMU_PAGE_SHIFT;
+
+	spin_lock_irqsave(&dom->pgtlock, flags);
+	memset(pgt_base_iova, 0, page_num * sizeof(u32));
+	spin_unlock_irqrestore(&dom->pgtlock, flags);
+
+	mtk_iommu_tlb_flush_range(dom->data, iova, size);
+
+	return size;
+}
+
+static phys_addr_t mtk_iommu_iova_to_phys(struct iommu_domain *domain,
+					  dma_addr_t iova)
+{
+	struct mtk_iommu_domain *dom = to_mtk_domain(domain);
+	unsigned long flags;
+	phys_addr_t pa;
+
+	spin_lock_irqsave(&dom->pgtlock, flags);
+	pa = *(dom->pgt_va + (iova >> MT2701_IOMMU_PAGE_SHIFT));
+	pa = pa & (~(MT2701_IOMMU_PAGE_SIZE - 1));
+	spin_unlock_irqrestore(&dom->pgtlock, flags);
+
+	return pa;
+}
+
+static struct iommu_ops mtk_iommu_ops;
+
+/*
+ * MTK generation one iommu HW only support one iommu domain, and all the client
+ * sharing the same iova address space.
+ */
+static int mtk_iommu_create_mapping(struct device *dev,
+				    struct of_phandle_args *args)
+{
+	struct mtk_iommu_data *data;
+	struct platform_device *m4updev;
+	struct dma_iommu_mapping *mtk_mapping;
+	struct device *m4udev;
+	int ret;
+
+	if (args->args_count != 1) {
+		dev_err(dev, "invalid #iommu-cells(%d) property for IOMMU\n",
+			args->args_count);
+		return -EINVAL;
+	}
+
+	if (!dev->iommu_fwspec) {
+		ret = iommu_fwspec_init(dev, &args->np->fwnode, &mtk_iommu_ops);
+		if (ret)
+			return ret;
+	} else if (dev->iommu_fwspec->ops != &mtk_iommu_ops) {
+		return -EINVAL;
+	}
+
+	if (!dev->iommu_fwspec->iommu_priv) {
+		/* Get the m4u device */
+		m4updev = of_find_device_by_node(args->np);
+		if (WARN_ON(!m4updev))
+			return -EINVAL;
+
+		dev->iommu_fwspec->iommu_priv = platform_get_drvdata(m4updev);
+	}
+
+	ret = iommu_fwspec_add_ids(dev, args->args, 1);
+	if (ret)
+		return ret;
+
+	data = dev->iommu_fwspec->iommu_priv;
+	m4udev = data->dev;
+	mtk_mapping = m4udev->archdata.iommu;
+	if (!mtk_mapping) {
+		/* MTK iommu support 4GB iova address space. */
+		mtk_mapping = arm_iommu_create_mapping(&platform_bus_type,
+						0, 1ULL << 32);
+		if (IS_ERR(mtk_mapping))
+			return PTR_ERR(mtk_mapping);
+
+		m4udev->archdata.iommu = mtk_mapping;
+	}
+
+	return 0;
+}
+
+static int mtk_iommu_add_device(struct device *dev)
+{
+	struct dma_iommu_mapping *mtk_mapping;
+	struct of_phandle_args iommu_spec;
+	struct of_phandle_iterator it;
+	struct mtk_iommu_data *data;
+	struct iommu_group *group;
+	int err;
+
+	of_for_each_phandle(&it, err, dev->of_node, "iommus",
+			"#iommu-cells", 0) {
+		int count = of_phandle_iterator_args(&it, iommu_spec.args,
+					MAX_PHANDLE_ARGS);
+		iommu_spec.np = of_node_get(it.node);
+		iommu_spec.args_count = count;
+
+		mtk_iommu_create_mapping(dev, &iommu_spec);
+		of_node_put(iommu_spec.np);
+	}
+
+	if (!dev->iommu_fwspec || dev->iommu_fwspec->ops != &mtk_iommu_ops)
+		return -ENODEV; /* Not a iommu client device */
+
+	/*
+	 * This is a short-term bodge because the ARM DMA code doesn't
+	 * understand multi-device groups, but we have to call into it
+	 * successfully (and not just rely on a normal IOMMU API attach
+	 * here) in order to set the correct DMA API ops on @dev.
+	 */
+	group = iommu_group_alloc();
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	err = iommu_group_add_device(group, dev);
+	iommu_group_put(group);
+	if (err)
+		return err;
+
+	data = dev->iommu_fwspec->iommu_priv;
+	mtk_mapping = data->dev->archdata.iommu;
+	err = arm_iommu_attach_device(dev, mtk_mapping);
+	if (err) {
+		iommu_group_remove_device(dev);
+		return err;
+	}
+
+	return iommu_device_link(&data->iommu, dev);;
+}
+
+static void mtk_iommu_remove_device(struct device *dev)
+{
+	struct mtk_iommu_data *data;
+
+	if (!dev->iommu_fwspec || dev->iommu_fwspec->ops != &mtk_iommu_ops)
+		return;
+
+	data = dev->iommu_fwspec->iommu_priv;
+	iommu_device_unlink(&data->iommu, dev);
+
+	iommu_group_remove_device(dev);
+	iommu_fwspec_free(dev);
+}
+
+static int mtk_iommu_hw_init(const struct mtk_iommu_data *data)
+{
+	u32 regval;
+	int ret;
+
+	ret = clk_prepare_enable(data->bclk);
+	if (ret) {
+		dev_err(data->dev, "Failed to enable iommu bclk(%d)\n", ret);
+		return ret;
+	}
+
+	regval = F_MMU_CTRL_COHERENT_EN | F_MMU_TF_PROTECT_SEL(2);
+	writel_relaxed(regval, data->base + REG_MMU_CTRL_REG);
+
+	regval = F_INT_TRANSLATION_FAULT |
+		F_INT_MAIN_MULTI_HIT_FAULT |
+		F_INT_INVALID_PA_FAULT |
+		F_INT_ENTRY_REPLACEMENT_FAULT |
+		F_INT_TABLE_WALK_FAULT |
+		F_INT_TLB_MISS_FAULT |
+		F_INT_PFH_DMA_FIFO_OVERFLOW |
+		F_INT_MISS_DMA_FIFO_OVERFLOW;
+	writel_relaxed(regval, data->base + REG_MMU_INT_CONTROL);
+
+	/* protect memory,hw will write here while translation fault */
+	writel_relaxed(data->protect_base,
+			data->base + REG_MMU_IVRP_PADDR);
+
+	writel_relaxed(F_MMU_DCM_ON, data->base + REG_MMU_DCM);
+
+	if (devm_request_irq(data->dev, data->irq, mtk_iommu_isr, 0,
+			     dev_name(data->dev), (void *)data)) {
+		writel_relaxed(0, data->base + REG_MMU_PT_BASE_ADDR);
+		clk_disable_unprepare(data->bclk);
+		dev_err(data->dev, "Failed @ IRQ-%d Request\n", data->irq);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+static struct iommu_ops mtk_iommu_ops = {
+	.domain_alloc	= mtk_iommu_domain_alloc,
+	.domain_free	= mtk_iommu_domain_free,
+	.attach_dev	= mtk_iommu_attach_device,
+	.detach_dev	= mtk_iommu_detach_device,
+	.map		= mtk_iommu_map,
+	.unmap		= mtk_iommu_unmap,
+	.iova_to_phys	= mtk_iommu_iova_to_phys,
+	.add_device	= mtk_iommu_add_device,
+	.remove_device	= mtk_iommu_remove_device,
+	.pgsize_bitmap	= ~0UL << MT2701_IOMMU_PAGE_SHIFT,
+};
+
+static const struct of_device_id mtk_iommu_of_ids[] = {
+	{ .compatible = "mediatek,mt2701-m4u", },
+	{}
+};
+
+static const struct component_master_ops mtk_iommu_com_ops = {
+	.bind		= mtk_iommu_bind,
+	.unbind		= mtk_iommu_unbind,
+};
+
+static int mtk_iommu_probe(struct platform_device *pdev)
+{
+	struct mtk_iommu_data		*data;
+	struct device			*dev = &pdev->dev;
+	struct resource			*res;
+	struct component_match		*match = NULL;
+	struct of_phandle_args		larb_spec;
+	struct of_phandle_iterator	it;
+	void				*protect;
+	int				larb_nr, ret, err;
+
+	data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->dev = dev;
+
+	/* Protect memory. HW will access here while translation fault.*/
+	protect = devm_kzalloc(dev, MTK_PROTECT_PA_ALIGN * 2,
+			GFP_KERNEL | GFP_DMA);
+	if (!protect)
+		return -ENOMEM;
+	data->protect_base = ALIGN(virt_to_phys(protect), MTK_PROTECT_PA_ALIGN);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	data->base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(data->base))
+		return PTR_ERR(data->base);
+
+	data->irq = platform_get_irq(pdev, 0);
+	if (data->irq < 0)
+		return data->irq;
+
+	data->bclk = devm_clk_get(dev, "bclk");
+	if (IS_ERR(data->bclk))
+		return PTR_ERR(data->bclk);
+
+	larb_nr = 0;
+	of_for_each_phandle(&it, err, dev->of_node,
+			"mediatek,larbs", NULL, 0) {
+		struct platform_device *plarbdev;
+		int count = of_phandle_iterator_args(&it, larb_spec.args,
+					MAX_PHANDLE_ARGS);
+
+		if (count)
+			continue;
+
+		larb_spec.np = of_node_get(it.node);
+		if (!of_device_is_available(larb_spec.np))
+			continue;
+
+		plarbdev = of_find_device_by_node(larb_spec.np);
+		if (!plarbdev) {
+			plarbdev = of_platform_device_create(
+						larb_spec.np, NULL,
+						platform_bus_type.dev_root);
+			if (!plarbdev) {
+				of_node_put(larb_spec.np);
+				return -EPROBE_DEFER;
+			}
+		}
+
+		data->smi_imu.larb_imu[larb_nr].dev = &plarbdev->dev;
+		component_match_add_release(dev, &match, release_of,
+					    compare_of, larb_spec.np);
+		larb_nr++;
+	}
+
+	data->smi_imu.larb_nr = larb_nr;
+
+	platform_set_drvdata(pdev, data);
+
+	ret = mtk_iommu_hw_init(data);
+	if (ret)
+		return ret;
+
+	ret = iommu_device_sysfs_add(&data->iommu, &pdev->dev, NULL,
+				     dev_name(&pdev->dev));
+	if (ret)
+		return ret;
+
+	iommu_device_set_ops(&data->iommu, &mtk_iommu_ops);
+
+	ret = iommu_device_register(&data->iommu);
+	if (ret)
+		return ret;
+
+	if (!iommu_present(&platform_bus_type))
+		bus_set_iommu(&platform_bus_type,  &mtk_iommu_ops);
+
+	return component_master_add_with_match(dev, &mtk_iommu_com_ops, match);
+}
+
+static int mtk_iommu_remove(struct platform_device *pdev)
+{
+	struct mtk_iommu_data *data = platform_get_drvdata(pdev);
+
+	iommu_device_sysfs_remove(&data->iommu);
+	iommu_device_unregister(&data->iommu);
+
+	if (iommu_present(&platform_bus_type))
+		bus_set_iommu(&platform_bus_type, NULL);
+
+	clk_disable_unprepare(data->bclk);
+	devm_free_irq(&pdev->dev, data->irq, data);
+	component_master_del(&pdev->dev, &mtk_iommu_com_ops);
+	return 0;
+}
+
+static int __maybe_unused mtk_iommu_suspend(struct device *dev)
+{
+	struct mtk_iommu_data *data = dev_get_drvdata(dev);
+	struct mtk_iommu_suspend_reg *reg = &data->reg;
+	void __iomem *base = data->base;
+
+	reg->standard_axi_mode = readl_relaxed(base +
+					       REG_MMU_STANDARD_AXI_MODE);
+	reg->dcm_dis = readl_relaxed(base + REG_MMU_DCM);
+	reg->ctrl_reg = readl_relaxed(base + REG_MMU_CTRL_REG);
+	reg->int_control0 = readl_relaxed(base + REG_MMU_INT_CONTROL);
+	return 0;
+}
+
+static int __maybe_unused mtk_iommu_resume(struct device *dev)
+{
+	struct mtk_iommu_data *data = dev_get_drvdata(dev);
+	struct mtk_iommu_suspend_reg *reg = &data->reg;
+	void __iomem *base = data->base;
+
+	writel_relaxed(data->m4u_dom->pgt_pa, base + REG_MMU_PT_BASE_ADDR);
+	writel_relaxed(reg->standard_axi_mode,
+		       base + REG_MMU_STANDARD_AXI_MODE);
+	writel_relaxed(reg->dcm_dis, base + REG_MMU_DCM);
+	writel_relaxed(reg->ctrl_reg, base + REG_MMU_CTRL_REG);
+	writel_relaxed(reg->int_control0, base + REG_MMU_INT_CONTROL);
+	writel_relaxed(data->protect_base, base + REG_MMU_IVRP_PADDR);
+	return 0;
+}
+
+static const struct dev_pm_ops mtk_iommu_pm_ops = {
+	SET_SYSTEM_SLEEP_PM_OPS(mtk_iommu_suspend, mtk_iommu_resume)
+};
+
+static struct platform_driver mtk_iommu_driver = {
+	.probe	= mtk_iommu_probe,
+	.remove	= mtk_iommu_remove,
+	.driver	= {
+		.name = "mtk-iommu-v1",
+		.of_match_table = mtk_iommu_of_ids,
+		.pm = &mtk_iommu_pm_ops,
+	}
+};
+
+static int __init m4u_init(void)
+{
+	return platform_driver_register(&mtk_iommu_driver);
+}
+
+static void __exit m4u_exit(void)
+{
+	return platform_driver_unregister(&mtk_iommu_driver);
+}
+
+subsys_initcall(m4u_init);
+module_exit(m4u_exit);
+
+MODULE_DESCRIPTION("IOMMU API for MTK architected m4u v1 implementations");
+MODULE_AUTHOR("Honghui Zhang <honghui.zhang@mediatek.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
new file mode 100644
index 000000000..f7787e757
--- /dev/null
+++ b/drivers/iommu/of_iommu.c
@@ -0,0 +1,216 @@
+/*
+ * OF helpers for IOMMU
+ *
+ * Copyright (c) 2012, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/export.h>
+#include <linux/iommu.h>
+#include <linux/limits.h>
+#include <linux/of.h>
+#include <linux/of_iommu.h>
+#include <linux/of_pci.h>
+#include <linux/slab.h>
+
+#define NO_IOMMU	1
+
+/**
+ * of_get_dma_window - Parse *dma-window property and returns 0 if found.
+ *
+ * @dn: device node
+ * @prefix: prefix for property name if any
+ * @index: index to start to parse
+ * @busno: Returns busno if supported. Otherwise pass NULL
+ * @addr: Returns address that DMA starts
+ * @size: Returns the range that DMA can handle
+ *
+ * This supports different formats flexibly. "prefix" can be
+ * configured if any. "busno" and "index" are optionally
+ * specified. Set 0(or NULL) if not used.
+ */
+int of_get_dma_window(struct device_node *dn, const char *prefix, int index,
+		      unsigned long *busno, dma_addr_t *addr, size_t *size)
+{
+	const __be32 *dma_window, *end;
+	int bytes, cur_index = 0;
+	char propname[NAME_MAX], addrname[NAME_MAX], sizename[NAME_MAX];
+
+	if (!dn || !addr || !size)
+		return -EINVAL;
+
+	if (!prefix)
+		prefix = "";
+
+	snprintf(propname, sizeof(propname), "%sdma-window", prefix);
+	snprintf(addrname, sizeof(addrname), "%s#dma-address-cells", prefix);
+	snprintf(sizename, sizeof(sizename), "%s#dma-size-cells", prefix);
+
+	dma_window = of_get_property(dn, propname, &bytes);
+	if (!dma_window)
+		return -ENODEV;
+	end = dma_window + bytes / sizeof(*dma_window);
+
+	while (dma_window < end) {
+		u32 cells;
+		const void *prop;
+
+		/* busno is one cell if supported */
+		if (busno)
+			*busno = be32_to_cpup(dma_window++);
+
+		prop = of_get_property(dn, addrname, NULL);
+		if (!prop)
+			prop = of_get_property(dn, "#address-cells", NULL);
+
+		cells = prop ? be32_to_cpup(prop) : of_n_addr_cells(dn);
+		if (!cells)
+			return -EINVAL;
+		*addr = of_read_number(dma_window, cells);
+		dma_window += cells;
+
+		prop = of_get_property(dn, sizename, NULL);
+		cells = prop ? be32_to_cpup(prop) : of_n_size_cells(dn);
+		if (!cells)
+			return -EINVAL;
+		*size = of_read_number(dma_window, cells);
+		dma_window += cells;
+
+		if (cur_index++ == index)
+			break;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(of_get_dma_window);
+
+static int of_iommu_xlate(struct device *dev,
+			  struct of_phandle_args *iommu_spec)
+{
+	const struct iommu_ops *ops;
+	struct fwnode_handle *fwnode = &iommu_spec->np->fwnode;
+	int err;
+
+	ops = iommu_ops_from_fwnode(fwnode);
+	if ((ops && !ops->of_xlate) ||
+	    !of_device_is_available(iommu_spec->np))
+		return NO_IOMMU;
+
+	err = iommu_fwspec_init(dev, &iommu_spec->np->fwnode, ops);
+	if (err)
+		return err;
+	/*
+	 * The otherwise-empty fwspec handily serves to indicate the specific
+	 * IOMMU device we're waiting for, which will be useful if we ever get
+	 * a proper probe-ordering dependency mechanism in future.
+	 */
+	if (!ops)
+		return driver_deferred_probe_check_state(dev);
+
+	return ops->of_xlate(dev, iommu_spec);
+}
+
+struct of_pci_iommu_alias_info {
+	struct device *dev;
+	struct device_node *np;
+};
+
+static int of_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data)
+{
+	struct of_pci_iommu_alias_info *info = data;
+	struct of_phandle_args iommu_spec = { .args_count = 1 };
+	int err;
+
+	err = of_pci_map_rid(info->np, alias, "iommu-map",
+			     "iommu-map-mask", &iommu_spec.np,
+			     iommu_spec.args);
+	if (err)
+		return err == -ENODEV ? NO_IOMMU : err;
+
+	err = of_iommu_xlate(info->dev, &iommu_spec);
+	of_node_put(iommu_spec.np);
+	return err;
+}
+
+const struct iommu_ops *of_iommu_configure(struct device *dev,
+					   struct device_node *master_np)
+{
+	const struct iommu_ops *ops = NULL;
+	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+	int err = NO_IOMMU;
+
+	if (!master_np)
+		return NULL;
+
+	if (fwspec) {
+		if (fwspec->ops)
+			return fwspec->ops;
+
+		/* In the deferred case, start again from scratch */
+		iommu_fwspec_free(dev);
+	}
+
+	/*
+	 * We don't currently walk up the tree looking for a parent IOMMU.
+	 * See the `Notes:' section of
+	 * Documentation/devicetree/bindings/iommu/iommu.txt
+	 */
+	if (dev_is_pci(dev)) {
+		struct of_pci_iommu_alias_info info = {
+			.dev = dev,
+			.np = master_np,
+		};
+
+		err = pci_for_each_dma_alias(to_pci_dev(dev),
+					     of_pci_iommu_init, &info);
+	} else {
+		struct of_phandle_args iommu_spec;
+		int idx = 0;
+
+		while (!of_parse_phandle_with_args(master_np, "iommus",
+						   "#iommu-cells",
+						   idx, &iommu_spec)) {
+			err = of_iommu_xlate(dev, &iommu_spec);
+			of_node_put(iommu_spec.np);
+			idx++;
+			if (err)
+				break;
+		}
+	}
+
+	/*
+	 * Two success conditions can be represented by non-negative err here:
+	 * >0 : there is no IOMMU, or one was unavailable for non-fatal reasons
+	 *  0 : we found an IOMMU, and dev->fwspec is initialised appropriately
+	 * <0 : any actual error
+	 */
+	if (!err)
+		ops = dev->iommu_fwspec->ops;
+	/*
+	 * If we have reason to believe the IOMMU driver missed the initial
+	 * add_device callback for dev, replay it to get things in order.
+	 */
+	if (ops && ops->add_device && dev->bus && !dev->iommu_group)
+		err = ops->add_device(dev);
+
+	/* Ignore all other errors apart from EPROBE_DEFER */
+	if (err == -EPROBE_DEFER) {
+		ops = ERR_PTR(err);
+	} else if (err < 0) {
+		dev_dbg(dev, "Adding to IOMMU failed: %d\n", err);
+		ops = NULL;
+	}
+
+	return ops;
+}
diff --git a/drivers/iommu/omap-iommu-debug.c b/drivers/iommu/omap-iommu-debug.c
new file mode 100644
index 000000000..5ce55fabc
--- /dev/null
+++ b/drivers/iommu/omap-iommu-debug.c
@@ -0,0 +1,313 @@
+/*
+ * omap iommu: debugfs interface
+ *
+ * Copyright (C) 2008-2009 Nokia Corporation
+ *
+ * Written by Hiroshi DOYU <Hiroshi.DOYU@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/pm_runtime.h>
+#include <linux/debugfs.h>
+#include <linux/platform_data/iommu-omap.h>
+
+#include "omap-iopgtable.h"
+#include "omap-iommu.h"
+
+static DEFINE_MUTEX(iommu_debug_lock);
+
+static struct dentry *iommu_debug_root;
+
+static inline bool is_omap_iommu_detached(struct omap_iommu *obj)
+{
+	return !obj->domain;
+}
+
+#define pr_reg(name)							\
+	do {								\
+		ssize_t bytes;						\
+		const char *str = "%20s: %08x\n";			\
+		const int maxcol = 32;					\
+		bytes = snprintf(p, maxcol, str, __stringify(name),	\
+				 iommu_read_reg(obj, MMU_##name));	\
+		p += bytes;						\
+		len -= bytes;						\
+		if (len < maxcol)					\
+			goto out;					\
+	} while (0)
+
+static ssize_t
+omap2_iommu_dump_ctx(struct omap_iommu *obj, char *buf, ssize_t len)
+{
+	char *p = buf;
+
+	pr_reg(REVISION);
+	pr_reg(IRQSTATUS);
+	pr_reg(IRQENABLE);
+	pr_reg(WALKING_ST);
+	pr_reg(CNTL);
+	pr_reg(FAULT_AD);
+	pr_reg(TTB);
+	pr_reg(LOCK);
+	pr_reg(LD_TLB);
+	pr_reg(CAM);
+	pr_reg(RAM);
+	pr_reg(GFLUSH);
+	pr_reg(FLUSH_ENTRY);
+	pr_reg(READ_CAM);
+	pr_reg(READ_RAM);
+	pr_reg(EMU_FAULT_AD);
+out:
+	return p - buf;
+}
+
+static ssize_t omap_iommu_dump_ctx(struct omap_iommu *obj, char *buf,
+				   ssize_t bytes)
+{
+	if (!obj || !buf)
+		return -EINVAL;
+
+	pm_runtime_get_sync(obj->dev);
+
+	bytes = omap2_iommu_dump_ctx(obj, buf, bytes);
+
+	pm_runtime_put_sync(obj->dev);
+
+	return bytes;
+}
+
+static ssize_t debug_read_regs(struct file *file, char __user *userbuf,
+			       size_t count, loff_t *ppos)
+{
+	struct omap_iommu *obj = file->private_data;
+	char *p, *buf;
+	ssize_t bytes;
+
+	if (is_omap_iommu_detached(obj))
+		return -EPERM;
+
+	buf = kmalloc(count, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	p = buf;
+
+	mutex_lock(&iommu_debug_lock);
+
+	bytes = omap_iommu_dump_ctx(obj, p, count);
+	if (bytes < 0)
+		goto err;
+	bytes = simple_read_from_buffer(userbuf, count, ppos, buf, bytes);
+
+err:
+	mutex_unlock(&iommu_debug_lock);
+	kfree(buf);
+
+	return bytes;
+}
+
+static int
+__dump_tlb_entries(struct omap_iommu *obj, struct cr_regs *crs, int num)
+{
+	int i;
+	struct iotlb_lock saved;
+	struct cr_regs tmp;
+	struct cr_regs *p = crs;
+
+	pm_runtime_get_sync(obj->dev);
+	iotlb_lock_get(obj, &saved);
+
+	for_each_iotlb_cr(obj, num, i, tmp) {
+		if (!iotlb_cr_valid(&tmp))
+			continue;
+		*p++ = tmp;
+	}
+
+	iotlb_lock_set(obj, &saved);
+	pm_runtime_put_sync(obj->dev);
+
+	return  p - crs;
+}
+
+static ssize_t iotlb_dump_cr(struct omap_iommu *obj, struct cr_regs *cr,
+			     struct seq_file *s)
+{
+	seq_printf(s, "%08x %08x %01x\n", cr->cam, cr->ram,
+		   (cr->cam & MMU_CAM_P) ? 1 : 0);
+	return 0;
+}
+
+static size_t omap_dump_tlb_entries(struct omap_iommu *obj, struct seq_file *s)
+{
+	int i, num;
+	struct cr_regs *cr;
+
+	num = obj->nr_tlb_entries;
+
+	cr = kcalloc(num, sizeof(*cr), GFP_KERNEL);
+	if (!cr)
+		return 0;
+
+	num = __dump_tlb_entries(obj, cr, num);
+	for (i = 0; i < num; i++)
+		iotlb_dump_cr(obj, cr + i, s);
+	kfree(cr);
+
+	return 0;
+}
+
+static int debug_read_tlb(struct seq_file *s, void *data)
+{
+	struct omap_iommu *obj = s->private;
+
+	if (is_omap_iommu_detached(obj))
+		return -EPERM;
+
+	mutex_lock(&iommu_debug_lock);
+
+	seq_printf(s, "%8s %8s\n", "cam:", "ram:");
+	seq_puts(s, "-----------------------------------------\n");
+	omap_dump_tlb_entries(obj, s);
+
+	mutex_unlock(&iommu_debug_lock);
+
+	return 0;
+}
+
+static void dump_ioptable(struct seq_file *s)
+{
+	int i, j;
+	u32 da;
+	u32 *iopgd, *iopte;
+	struct omap_iommu *obj = s->private;
+
+	spin_lock(&obj->page_table_lock);
+
+	iopgd = iopgd_offset(obj, 0);
+	for (i = 0; i < PTRS_PER_IOPGD; i++, iopgd++) {
+		if (!*iopgd)
+			continue;
+
+		if (!(*iopgd & IOPGD_TABLE)) {
+			da = i << IOPGD_SHIFT;
+			seq_printf(s, "1: 0x%08x 0x%08x\n", da, *iopgd);
+			continue;
+		}
+
+		iopte = iopte_offset(iopgd, 0);
+		for (j = 0; j < PTRS_PER_IOPTE; j++, iopte++) {
+			if (!*iopte)
+				continue;
+
+			da = (i << IOPGD_SHIFT) + (j << IOPTE_SHIFT);
+			seq_printf(s, "2: 0x%08x 0x%08x\n", da, *iopte);
+		}
+	}
+
+	spin_unlock(&obj->page_table_lock);
+}
+
+static int debug_read_pagetable(struct seq_file *s, void *data)
+{
+	struct omap_iommu *obj = s->private;
+
+	if (is_omap_iommu_detached(obj))
+		return -EPERM;
+
+	mutex_lock(&iommu_debug_lock);
+
+	seq_printf(s, "L: %8s %8s\n", "da:", "pte:");
+	seq_puts(s, "--------------------------\n");
+	dump_ioptable(s);
+
+	mutex_unlock(&iommu_debug_lock);
+
+	return 0;
+}
+
+#define DEBUG_SEQ_FOPS_RO(name)						       \
+	static int debug_open_##name(struct inode *inode, struct file *file)   \
+	{								       \
+		return single_open(file, debug_read_##name, inode->i_private); \
+	}								       \
+									       \
+	static const struct file_operations debug_##name##_fops = {	       \
+		.open		= debug_open_##name,			       \
+		.read		= seq_read,				       \
+		.llseek		= seq_lseek,				       \
+		.release	= single_release,			       \
+	}
+
+#define DEBUG_FOPS_RO(name)						\
+	static const struct file_operations debug_##name##_fops = {	\
+		.open = simple_open,					\
+		.read = debug_read_##name,				\
+		.llseek = generic_file_llseek,				\
+	}
+
+DEBUG_FOPS_RO(regs);
+DEBUG_SEQ_FOPS_RO(tlb);
+DEBUG_SEQ_FOPS_RO(pagetable);
+
+#define __DEBUG_ADD_FILE(attr, mode)					\
+	{								\
+		struct dentry *dent;					\
+		dent = debugfs_create_file(#attr, mode, obj->debug_dir,	\
+					   obj, &debug_##attr##_fops);	\
+		if (!dent)						\
+			goto err;					\
+	}
+
+#define DEBUG_ADD_FILE_RO(name) __DEBUG_ADD_FILE(name, 0400)
+
+void omap_iommu_debugfs_add(struct omap_iommu *obj)
+{
+	struct dentry *d;
+
+	if (!iommu_debug_root)
+		return;
+
+	obj->debug_dir = debugfs_create_dir(obj->name, iommu_debug_root);
+	if (!obj->debug_dir)
+		return;
+
+	d = debugfs_create_u32("nr_tlb_entries", 0400, obj->debug_dir,
+			       &obj->nr_tlb_entries);
+	if (!d)
+		return;
+
+	DEBUG_ADD_FILE_RO(regs);
+	DEBUG_ADD_FILE_RO(tlb);
+	DEBUG_ADD_FILE_RO(pagetable);
+
+	return;
+
+err:
+	debugfs_remove_recursive(obj->debug_dir);
+}
+
+void omap_iommu_debugfs_remove(struct omap_iommu *obj)
+{
+	if (!obj->debug_dir)
+		return;
+
+	debugfs_remove_recursive(obj->debug_dir);
+}
+
+void __init omap_iommu_debugfs_init(void)
+{
+	iommu_debug_root = debugfs_create_dir("omap_iommu", NULL);
+	if (!iommu_debug_root)
+		pr_err("can't create debugfs dir\n");
+}
+
+void __exit omap_iommu_debugfs_exit(void)
+{
+	debugfs_remove(iommu_debug_root);
+}
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
new file mode 100644
index 000000000..d2fb347aa
--- /dev/null
+++ b/drivers/iommu/omap-iommu.c
@@ -0,0 +1,1599 @@
+/*
+ * omap iommu: tlb and pagetable primitives
+ *
+ * Copyright (C) 2008-2010 Nokia Corporation
+ * Copyright (C) 2013-2017 Texas Instruments Incorporated - http://www.ti.com/
+ *
+ * Written by Hiroshi DOYU <Hiroshi.DOYU@nokia.com>,
+ *		Paul Mundt and Toshihiro Kobayashi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/ioport.h>
+#include <linux/platform_device.h>
+#include <linux/iommu.h>
+#include <linux/omap-iommu.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/io.h>
+#include <linux/pm_runtime.h>
+#include <linux/of.h>
+#include <linux/of_iommu.h>
+#include <linux/of_irq.h>
+#include <linux/of_platform.h>
+#include <linux/regmap.h>
+#include <linux/mfd/syscon.h>
+
+#include <linux/platform_data/iommu-omap.h>
+
+#include "omap-iopgtable.h"
+#include "omap-iommu.h"
+
+static const struct iommu_ops omap_iommu_ops;
+
+#define to_iommu(dev)							\
+	((struct omap_iommu *)platform_get_drvdata(to_platform_device(dev)))
+
+/* bitmap of the page sizes currently supported */
+#define OMAP_IOMMU_PGSIZES	(SZ_4K | SZ_64K | SZ_1M | SZ_16M)
+
+#define MMU_LOCK_BASE_SHIFT	10
+#define MMU_LOCK_BASE_MASK	(0x1f << MMU_LOCK_BASE_SHIFT)
+#define MMU_LOCK_BASE(x)	\
+	((x & MMU_LOCK_BASE_MASK) >> MMU_LOCK_BASE_SHIFT)
+
+#define MMU_LOCK_VICT_SHIFT	4
+#define MMU_LOCK_VICT_MASK	(0x1f << MMU_LOCK_VICT_SHIFT)
+#define MMU_LOCK_VICT(x)	\
+	((x & MMU_LOCK_VICT_MASK) >> MMU_LOCK_VICT_SHIFT)
+
+static struct platform_driver omap_iommu_driver;
+static struct kmem_cache *iopte_cachep;
+
+/**
+ * to_omap_domain - Get struct omap_iommu_domain from generic iommu_domain
+ * @dom:	generic iommu domain handle
+ **/
+static struct omap_iommu_domain *to_omap_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct omap_iommu_domain, domain);
+}
+
+/**
+ * omap_iommu_save_ctx - Save registers for pm off-mode support
+ * @dev:	client device
+ **/
+void omap_iommu_save_ctx(struct device *dev)
+{
+	struct omap_iommu_arch_data *arch_data = dev->archdata.iommu;
+	struct omap_iommu *obj;
+	u32 *p;
+	int i;
+
+	if (!arch_data)
+		return;
+
+	while (arch_data->iommu_dev) {
+		obj = arch_data->iommu_dev;
+		p = obj->ctx;
+		for (i = 0; i < (MMU_REG_SIZE / sizeof(u32)); i++) {
+			p[i] = iommu_read_reg(obj, i * sizeof(u32));
+			dev_dbg(obj->dev, "%s\t[%02d] %08x\n", __func__, i,
+				p[i]);
+		}
+		arch_data++;
+	}
+}
+EXPORT_SYMBOL_GPL(omap_iommu_save_ctx);
+
+/**
+ * omap_iommu_restore_ctx - Restore registers for pm off-mode support
+ * @dev:	client device
+ **/
+void omap_iommu_restore_ctx(struct device *dev)
+{
+	struct omap_iommu_arch_data *arch_data = dev->archdata.iommu;
+	struct omap_iommu *obj;
+	u32 *p;
+	int i;
+
+	if (!arch_data)
+		return;
+
+	while (arch_data->iommu_dev) {
+		obj = arch_data->iommu_dev;
+		p = obj->ctx;
+		for (i = 0; i < (MMU_REG_SIZE / sizeof(u32)); i++) {
+			iommu_write_reg(obj, p[i], i * sizeof(u32));
+			dev_dbg(obj->dev, "%s\t[%02d] %08x\n", __func__, i,
+				p[i]);
+		}
+		arch_data++;
+	}
+}
+EXPORT_SYMBOL_GPL(omap_iommu_restore_ctx);
+
+static void dra7_cfg_dspsys_mmu(struct omap_iommu *obj, bool enable)
+{
+	u32 val, mask;
+
+	if (!obj->syscfg)
+		return;
+
+	mask = (1 << (obj->id * DSP_SYS_MMU_CONFIG_EN_SHIFT));
+	val = enable ? mask : 0;
+	regmap_update_bits(obj->syscfg, DSP_SYS_MMU_CONFIG, mask, val);
+}
+
+static void __iommu_set_twl(struct omap_iommu *obj, bool on)
+{
+	u32 l = iommu_read_reg(obj, MMU_CNTL);
+
+	if (on)
+		iommu_write_reg(obj, MMU_IRQ_TWL_MASK, MMU_IRQENABLE);
+	else
+		iommu_write_reg(obj, MMU_IRQ_TLB_MISS_MASK, MMU_IRQENABLE);
+
+	l &= ~MMU_CNTL_MASK;
+	if (on)
+		l |= (MMU_CNTL_MMU_EN | MMU_CNTL_TWL_EN);
+	else
+		l |= (MMU_CNTL_MMU_EN);
+
+	iommu_write_reg(obj, l, MMU_CNTL);
+}
+
+static int omap2_iommu_enable(struct omap_iommu *obj)
+{
+	u32 l, pa;
+
+	if (!obj->iopgd || !IS_ALIGNED((u32)obj->iopgd,  SZ_16K))
+		return -EINVAL;
+
+	pa = virt_to_phys(obj->iopgd);
+	if (!IS_ALIGNED(pa, SZ_16K))
+		return -EINVAL;
+
+	l = iommu_read_reg(obj, MMU_REVISION);
+	dev_info(obj->dev, "%s: version %d.%d\n", obj->name,
+		 (l >> 4) & 0xf, l & 0xf);
+
+	iommu_write_reg(obj, pa, MMU_TTB);
+
+	dra7_cfg_dspsys_mmu(obj, true);
+
+	if (obj->has_bus_err_back)
+		iommu_write_reg(obj, MMU_GP_REG_BUS_ERR_BACK_EN, MMU_GP_REG);
+
+	__iommu_set_twl(obj, true);
+
+	return 0;
+}
+
+static void omap2_iommu_disable(struct omap_iommu *obj)
+{
+	u32 l = iommu_read_reg(obj, MMU_CNTL);
+
+	l &= ~MMU_CNTL_MASK;
+	iommu_write_reg(obj, l, MMU_CNTL);
+	dra7_cfg_dspsys_mmu(obj, false);
+
+	dev_dbg(obj->dev, "%s is shutting down\n", obj->name);
+}
+
+static int iommu_enable(struct omap_iommu *obj)
+{
+	int err;
+	struct platform_device *pdev = to_platform_device(obj->dev);
+	struct iommu_platform_data *pdata = dev_get_platdata(&pdev->dev);
+
+	if (pdata && pdata->deassert_reset) {
+		err = pdata->deassert_reset(pdev, pdata->reset_name);
+		if (err) {
+			dev_err(obj->dev, "deassert_reset failed: %d\n", err);
+			return err;
+		}
+	}
+
+	pm_runtime_get_sync(obj->dev);
+
+	err = omap2_iommu_enable(obj);
+
+	return err;
+}
+
+static void iommu_disable(struct omap_iommu *obj)
+{
+	struct platform_device *pdev = to_platform_device(obj->dev);
+	struct iommu_platform_data *pdata = dev_get_platdata(&pdev->dev);
+
+	omap2_iommu_disable(obj);
+
+	pm_runtime_put_sync(obj->dev);
+
+	if (pdata && pdata->assert_reset)
+		pdata->assert_reset(pdev, pdata->reset_name);
+}
+
+/*
+ *	TLB operations
+ */
+static u32 iotlb_cr_to_virt(struct cr_regs *cr)
+{
+	u32 page_size = cr->cam & MMU_CAM_PGSZ_MASK;
+	u32 mask = get_cam_va_mask(cr->cam & page_size);
+
+	return cr->cam & mask;
+}
+
+static u32 get_iopte_attr(struct iotlb_entry *e)
+{
+	u32 attr;
+
+	attr = e->mixed << 5;
+	attr |= e->endian;
+	attr |= e->elsz >> 3;
+	attr <<= (((e->pgsz == MMU_CAM_PGSZ_4K) ||
+			(e->pgsz == MMU_CAM_PGSZ_64K)) ? 0 : 6);
+	return attr;
+}
+
+static u32 iommu_report_fault(struct omap_iommu *obj, u32 *da)
+{
+	u32 status, fault_addr;
+
+	status = iommu_read_reg(obj, MMU_IRQSTATUS);
+	status &= MMU_IRQ_MASK;
+	if (!status) {
+		*da = 0;
+		return 0;
+	}
+
+	fault_addr = iommu_read_reg(obj, MMU_FAULT_AD);
+	*da = fault_addr;
+
+	iommu_write_reg(obj, status, MMU_IRQSTATUS);
+
+	return status;
+}
+
+void iotlb_lock_get(struct omap_iommu *obj, struct iotlb_lock *l)
+{
+	u32 val;
+
+	val = iommu_read_reg(obj, MMU_LOCK);
+
+	l->base = MMU_LOCK_BASE(val);
+	l->vict = MMU_LOCK_VICT(val);
+}
+
+void iotlb_lock_set(struct omap_iommu *obj, struct iotlb_lock *l)
+{
+	u32 val;
+
+	val = (l->base << MMU_LOCK_BASE_SHIFT);
+	val |= (l->vict << MMU_LOCK_VICT_SHIFT);
+
+	iommu_write_reg(obj, val, MMU_LOCK);
+}
+
+static void iotlb_read_cr(struct omap_iommu *obj, struct cr_regs *cr)
+{
+	cr->cam = iommu_read_reg(obj, MMU_READ_CAM);
+	cr->ram = iommu_read_reg(obj, MMU_READ_RAM);
+}
+
+static void iotlb_load_cr(struct omap_iommu *obj, struct cr_regs *cr)
+{
+	iommu_write_reg(obj, cr->cam | MMU_CAM_V, MMU_CAM);
+	iommu_write_reg(obj, cr->ram, MMU_RAM);
+
+	iommu_write_reg(obj, 1, MMU_FLUSH_ENTRY);
+	iommu_write_reg(obj, 1, MMU_LD_TLB);
+}
+
+/* only used in iotlb iteration for-loop */
+struct cr_regs __iotlb_read_cr(struct omap_iommu *obj, int n)
+{
+	struct cr_regs cr;
+	struct iotlb_lock l;
+
+	iotlb_lock_get(obj, &l);
+	l.vict = n;
+	iotlb_lock_set(obj, &l);
+	iotlb_read_cr(obj, &cr);
+
+	return cr;
+}
+
+#ifdef PREFETCH_IOTLB
+static struct cr_regs *iotlb_alloc_cr(struct omap_iommu *obj,
+				      struct iotlb_entry *e)
+{
+	struct cr_regs *cr;
+
+	if (!e)
+		return NULL;
+
+	if (e->da & ~(get_cam_va_mask(e->pgsz))) {
+		dev_err(obj->dev, "%s:\twrong alignment: %08x\n", __func__,
+			e->da);
+		return ERR_PTR(-EINVAL);
+	}
+
+	cr = kmalloc(sizeof(*cr), GFP_KERNEL);
+	if (!cr)
+		return ERR_PTR(-ENOMEM);
+
+	cr->cam = (e->da & MMU_CAM_VATAG_MASK) | e->prsvd | e->pgsz | e->valid;
+	cr->ram = e->pa | e->endian | e->elsz | e->mixed;
+
+	return cr;
+}
+
+/**
+ * load_iotlb_entry - Set an iommu tlb entry
+ * @obj:	target iommu
+ * @e:		an iommu tlb entry info
+ **/
+static int load_iotlb_entry(struct omap_iommu *obj, struct iotlb_entry *e)
+{
+	int err = 0;
+	struct iotlb_lock l;
+	struct cr_regs *cr;
+
+	if (!obj || !obj->nr_tlb_entries || !e)
+		return -EINVAL;
+
+	pm_runtime_get_sync(obj->dev);
+
+	iotlb_lock_get(obj, &l);
+	if (l.base == obj->nr_tlb_entries) {
+		dev_warn(obj->dev, "%s: preserve entries full\n", __func__);
+		err = -EBUSY;
+		goto out;
+	}
+	if (!e->prsvd) {
+		int i;
+		struct cr_regs tmp;
+
+		for_each_iotlb_cr(obj, obj->nr_tlb_entries, i, tmp)
+			if (!iotlb_cr_valid(&tmp))
+				break;
+
+		if (i == obj->nr_tlb_entries) {
+			dev_dbg(obj->dev, "%s: full: no entry\n", __func__);
+			err = -EBUSY;
+			goto out;
+		}
+
+		iotlb_lock_get(obj, &l);
+	} else {
+		l.vict = l.base;
+		iotlb_lock_set(obj, &l);
+	}
+
+	cr = iotlb_alloc_cr(obj, e);
+	if (IS_ERR(cr)) {
+		pm_runtime_put_sync(obj->dev);
+		return PTR_ERR(cr);
+	}
+
+	iotlb_load_cr(obj, cr);
+	kfree(cr);
+
+	if (e->prsvd)
+		l.base++;
+	/* increment victim for next tlb load */
+	if (++l.vict == obj->nr_tlb_entries)
+		l.vict = l.base;
+	iotlb_lock_set(obj, &l);
+out:
+	pm_runtime_put_sync(obj->dev);
+	return err;
+}
+
+#else /* !PREFETCH_IOTLB */
+
+static int load_iotlb_entry(struct omap_iommu *obj, struct iotlb_entry *e)
+{
+	return 0;
+}
+
+#endif /* !PREFETCH_IOTLB */
+
+static int prefetch_iotlb_entry(struct omap_iommu *obj, struct iotlb_entry *e)
+{
+	return load_iotlb_entry(obj, e);
+}
+
+/**
+ * flush_iotlb_page - Clear an iommu tlb entry
+ * @obj:	target iommu
+ * @da:		iommu device virtual address
+ *
+ * Clear an iommu tlb entry which includes 'da' address.
+ **/
+static void flush_iotlb_page(struct omap_iommu *obj, u32 da)
+{
+	int i;
+	struct cr_regs cr;
+
+	pm_runtime_get_sync(obj->dev);
+
+	for_each_iotlb_cr(obj, obj->nr_tlb_entries, i, cr) {
+		u32 start;
+		size_t bytes;
+
+		if (!iotlb_cr_valid(&cr))
+			continue;
+
+		start = iotlb_cr_to_virt(&cr);
+		bytes = iopgsz_to_bytes(cr.cam & 3);
+
+		if ((start <= da) && (da < start + bytes)) {
+			dev_dbg(obj->dev, "%s: %08x<=%08x(%x)\n",
+				__func__, start, da, bytes);
+			iotlb_load_cr(obj, &cr);
+			iommu_write_reg(obj, 1, MMU_FLUSH_ENTRY);
+			break;
+		}
+	}
+	pm_runtime_put_sync(obj->dev);
+
+	if (i == obj->nr_tlb_entries)
+		dev_dbg(obj->dev, "%s: no page for %08x\n", __func__, da);
+}
+
+/**
+ * flush_iotlb_all - Clear all iommu tlb entries
+ * @obj:	target iommu
+ **/
+static void flush_iotlb_all(struct omap_iommu *obj)
+{
+	struct iotlb_lock l;
+
+	pm_runtime_get_sync(obj->dev);
+
+	l.base = 0;
+	l.vict = 0;
+	iotlb_lock_set(obj, &l);
+
+	iommu_write_reg(obj, 1, MMU_GFLUSH);
+
+	pm_runtime_put_sync(obj->dev);
+}
+
+/*
+ *	H/W pagetable operations
+ */
+static void flush_iopte_range(struct device *dev, dma_addr_t dma,
+			      unsigned long offset, int num_entries)
+{
+	size_t size = num_entries * sizeof(u32);
+
+	dma_sync_single_range_for_device(dev, dma, offset, size, DMA_TO_DEVICE);
+}
+
+static void iopte_free(struct omap_iommu *obj, u32 *iopte, bool dma_valid)
+{
+	dma_addr_t pt_dma;
+
+	/* Note: freed iopte's must be clean ready for re-use */
+	if (iopte) {
+		if (dma_valid) {
+			pt_dma = virt_to_phys(iopte);
+			dma_unmap_single(obj->dev, pt_dma, IOPTE_TABLE_SIZE,
+					 DMA_TO_DEVICE);
+		}
+
+		kmem_cache_free(iopte_cachep, iopte);
+	}
+}
+
+static u32 *iopte_alloc(struct omap_iommu *obj, u32 *iopgd,
+			dma_addr_t *pt_dma, u32 da)
+{
+	u32 *iopte;
+	unsigned long offset = iopgd_index(da) * sizeof(da);
+
+	/* a table has already existed */
+	if (*iopgd)
+		goto pte_ready;
+
+	/*
+	 * do the allocation outside the page table lock
+	 */
+	spin_unlock(&obj->page_table_lock);
+	iopte = kmem_cache_zalloc(iopte_cachep, GFP_KERNEL);
+	spin_lock(&obj->page_table_lock);
+
+	if (!*iopgd) {
+		if (!iopte)
+			return ERR_PTR(-ENOMEM);
+
+		*pt_dma = dma_map_single(obj->dev, iopte, IOPTE_TABLE_SIZE,
+					 DMA_TO_DEVICE);
+		if (dma_mapping_error(obj->dev, *pt_dma)) {
+			dev_err(obj->dev, "DMA map error for L2 table\n");
+			iopte_free(obj, iopte, false);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		/*
+		 * we rely on dma address and the physical address to be
+		 * the same for mapping the L2 table
+		 */
+		if (WARN_ON(*pt_dma != virt_to_phys(iopte))) {
+			dev_err(obj->dev, "DMA translation error for L2 table\n");
+			dma_unmap_single(obj->dev, *pt_dma, IOPTE_TABLE_SIZE,
+					 DMA_TO_DEVICE);
+			iopte_free(obj, iopte, false);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		*iopgd = virt_to_phys(iopte) | IOPGD_TABLE;
+
+		flush_iopte_range(obj->dev, obj->pd_dma, offset, 1);
+		dev_vdbg(obj->dev, "%s: a new pte:%p\n", __func__, iopte);
+	} else {
+		/* We raced, free the reduniovant table */
+		iopte_free(obj, iopte, false);
+	}
+
+pte_ready:
+	iopte = iopte_offset(iopgd, da);
+	*pt_dma = iopgd_page_paddr(iopgd);
+	dev_vdbg(obj->dev,
+		 "%s: da:%08x pgd:%p *pgd:%08x pte:%p *pte:%08x\n",
+		 __func__, da, iopgd, *iopgd, iopte, *iopte);
+
+	return iopte;
+}
+
+static int iopgd_alloc_section(struct omap_iommu *obj, u32 da, u32 pa, u32 prot)
+{
+	u32 *iopgd = iopgd_offset(obj, da);
+	unsigned long offset = iopgd_index(da) * sizeof(da);
+
+	if ((da | pa) & ~IOSECTION_MASK) {
+		dev_err(obj->dev, "%s: %08x:%08x should aligned on %08lx\n",
+			__func__, da, pa, IOSECTION_SIZE);
+		return -EINVAL;
+	}
+
+	*iopgd = (pa & IOSECTION_MASK) | prot | IOPGD_SECTION;
+	flush_iopte_range(obj->dev, obj->pd_dma, offset, 1);
+	return 0;
+}
+
+static int iopgd_alloc_super(struct omap_iommu *obj, u32 da, u32 pa, u32 prot)
+{
+	u32 *iopgd = iopgd_offset(obj, da);
+	unsigned long offset = iopgd_index(da) * sizeof(da);
+	int i;
+
+	if ((da | pa) & ~IOSUPER_MASK) {
+		dev_err(obj->dev, "%s: %08x:%08x should aligned on %08lx\n",
+			__func__, da, pa, IOSUPER_SIZE);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < 16; i++)
+		*(iopgd + i) = (pa & IOSUPER_MASK) | prot | IOPGD_SUPER;
+	flush_iopte_range(obj->dev, obj->pd_dma, offset, 16);
+	return 0;
+}
+
+static int iopte_alloc_page(struct omap_iommu *obj, u32 da, u32 pa, u32 prot)
+{
+	u32 *iopgd = iopgd_offset(obj, da);
+	dma_addr_t pt_dma;
+	u32 *iopte = iopte_alloc(obj, iopgd, &pt_dma, da);
+	unsigned long offset = iopte_index(da) * sizeof(da);
+
+	if (IS_ERR(iopte))
+		return PTR_ERR(iopte);
+
+	*iopte = (pa & IOPAGE_MASK) | prot | IOPTE_SMALL;
+	flush_iopte_range(obj->dev, pt_dma, offset, 1);
+
+	dev_vdbg(obj->dev, "%s: da:%08x pa:%08x pte:%p *pte:%08x\n",
+		 __func__, da, pa, iopte, *iopte);
+
+	return 0;
+}
+
+static int iopte_alloc_large(struct omap_iommu *obj, u32 da, u32 pa, u32 prot)
+{
+	u32 *iopgd = iopgd_offset(obj, da);
+	dma_addr_t pt_dma;
+	u32 *iopte = iopte_alloc(obj, iopgd, &pt_dma, da);
+	unsigned long offset = iopte_index(da) * sizeof(da);
+	int i;
+
+	if ((da | pa) & ~IOLARGE_MASK) {
+		dev_err(obj->dev, "%s: %08x:%08x should aligned on %08lx\n",
+			__func__, da, pa, IOLARGE_SIZE);
+		return -EINVAL;
+	}
+
+	if (IS_ERR(iopte))
+		return PTR_ERR(iopte);
+
+	for (i = 0; i < 16; i++)
+		*(iopte + i) = (pa & IOLARGE_MASK) | prot | IOPTE_LARGE;
+	flush_iopte_range(obj->dev, pt_dma, offset, 16);
+	return 0;
+}
+
+static int
+iopgtable_store_entry_core(struct omap_iommu *obj, struct iotlb_entry *e)
+{
+	int (*fn)(struct omap_iommu *, u32, u32, u32);
+	u32 prot;
+	int err;
+
+	if (!obj || !e)
+		return -EINVAL;
+
+	switch (e->pgsz) {
+	case MMU_CAM_PGSZ_16M:
+		fn = iopgd_alloc_super;
+		break;
+	case MMU_CAM_PGSZ_1M:
+		fn = iopgd_alloc_section;
+		break;
+	case MMU_CAM_PGSZ_64K:
+		fn = iopte_alloc_large;
+		break;
+	case MMU_CAM_PGSZ_4K:
+		fn = iopte_alloc_page;
+		break;
+	default:
+		fn = NULL;
+		break;
+	}
+
+	if (WARN_ON(!fn))
+		return -EINVAL;
+
+	prot = get_iopte_attr(e);
+
+	spin_lock(&obj->page_table_lock);
+	err = fn(obj, e->da, e->pa, prot);
+	spin_unlock(&obj->page_table_lock);
+
+	return err;
+}
+
+/**
+ * omap_iopgtable_store_entry - Make an iommu pte entry
+ * @obj:	target iommu
+ * @e:		an iommu tlb entry info
+ **/
+static int
+omap_iopgtable_store_entry(struct omap_iommu *obj, struct iotlb_entry *e)
+{
+	int err;
+
+	flush_iotlb_page(obj, e->da);
+	err = iopgtable_store_entry_core(obj, e);
+	if (!err)
+		prefetch_iotlb_entry(obj, e);
+	return err;
+}
+
+/**
+ * iopgtable_lookup_entry - Lookup an iommu pte entry
+ * @obj:	target iommu
+ * @da:		iommu device virtual address
+ * @ppgd:	iommu pgd entry pointer to be returned
+ * @ppte:	iommu pte entry pointer to be returned
+ **/
+static void
+iopgtable_lookup_entry(struct omap_iommu *obj, u32 da, u32 **ppgd, u32 **ppte)
+{
+	u32 *iopgd, *iopte = NULL;
+
+	iopgd = iopgd_offset(obj, da);
+	if (!*iopgd)
+		goto out;
+
+	if (iopgd_is_table(*iopgd))
+		iopte = iopte_offset(iopgd, da);
+out:
+	*ppgd = iopgd;
+	*ppte = iopte;
+}
+
+static size_t iopgtable_clear_entry_core(struct omap_iommu *obj, u32 da)
+{
+	size_t bytes;
+	u32 *iopgd = iopgd_offset(obj, da);
+	int nent = 1;
+	dma_addr_t pt_dma;
+	unsigned long pd_offset = iopgd_index(da) * sizeof(da);
+	unsigned long pt_offset = iopte_index(da) * sizeof(da);
+
+	if (!*iopgd)
+		return 0;
+
+	if (iopgd_is_table(*iopgd)) {
+		int i;
+		u32 *iopte = iopte_offset(iopgd, da);
+
+		bytes = IOPTE_SIZE;
+		if (*iopte & IOPTE_LARGE) {
+			nent *= 16;
+			/* rewind to the 1st entry */
+			iopte = iopte_offset(iopgd, (da & IOLARGE_MASK));
+		}
+		bytes *= nent;
+		memset(iopte, 0, nent * sizeof(*iopte));
+		pt_dma = iopgd_page_paddr(iopgd);
+		flush_iopte_range(obj->dev, pt_dma, pt_offset, nent);
+
+		/*
+		 * do table walk to check if this table is necessary or not
+		 */
+		iopte = iopte_offset(iopgd, 0);
+		for (i = 0; i < PTRS_PER_IOPTE; i++)
+			if (iopte[i])
+				goto out;
+
+		iopte_free(obj, iopte, true);
+		nent = 1; /* for the next L1 entry */
+	} else {
+		bytes = IOPGD_SIZE;
+		if ((*iopgd & IOPGD_SUPER) == IOPGD_SUPER) {
+			nent *= 16;
+			/* rewind to the 1st entry */
+			iopgd = iopgd_offset(obj, (da & IOSUPER_MASK));
+		}
+		bytes *= nent;
+	}
+	memset(iopgd, 0, nent * sizeof(*iopgd));
+	flush_iopte_range(obj->dev, obj->pd_dma, pd_offset, nent);
+out:
+	return bytes;
+}
+
+/**
+ * iopgtable_clear_entry - Remove an iommu pte entry
+ * @obj:	target iommu
+ * @da:		iommu device virtual address
+ **/
+static size_t iopgtable_clear_entry(struct omap_iommu *obj, u32 da)
+{
+	size_t bytes;
+
+	spin_lock(&obj->page_table_lock);
+
+	bytes = iopgtable_clear_entry_core(obj, da);
+	flush_iotlb_page(obj, da);
+
+	spin_unlock(&obj->page_table_lock);
+
+	return bytes;
+}
+
+static void iopgtable_clear_entry_all(struct omap_iommu *obj)
+{
+	unsigned long offset;
+	int i;
+
+	spin_lock(&obj->page_table_lock);
+
+	for (i = 0; i < PTRS_PER_IOPGD; i++) {
+		u32 da;
+		u32 *iopgd;
+
+		da = i << IOPGD_SHIFT;
+		iopgd = iopgd_offset(obj, da);
+		offset = iopgd_index(da) * sizeof(da);
+
+		if (!*iopgd)
+			continue;
+
+		if (iopgd_is_table(*iopgd))
+			iopte_free(obj, iopte_offset(iopgd, 0), true);
+
+		*iopgd = 0;
+		flush_iopte_range(obj->dev, obj->pd_dma, offset, 1);
+	}
+
+	flush_iotlb_all(obj);
+
+	spin_unlock(&obj->page_table_lock);
+}
+
+/*
+ *	Device IOMMU generic operations
+ */
+static irqreturn_t iommu_fault_handler(int irq, void *data)
+{
+	u32 da, errs;
+	u32 *iopgd, *iopte;
+	struct omap_iommu *obj = data;
+	struct iommu_domain *domain = obj->domain;
+	struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
+
+	if (!omap_domain->dev)
+		return IRQ_NONE;
+
+	errs = iommu_report_fault(obj, &da);
+	if (errs == 0)
+		return IRQ_HANDLED;
+
+	/* Fault callback or TLB/PTE Dynamic loading */
+	if (!report_iommu_fault(domain, obj->dev, da, 0))
+		return IRQ_HANDLED;
+
+	iommu_write_reg(obj, 0, MMU_IRQENABLE);
+
+	iopgd = iopgd_offset(obj, da);
+
+	if (!iopgd_is_table(*iopgd)) {
+		dev_err(obj->dev, "%s: errs:0x%08x da:0x%08x pgd:0x%p *pgd:px%08x\n",
+			obj->name, errs, da, iopgd, *iopgd);
+		return IRQ_NONE;
+	}
+
+	iopte = iopte_offset(iopgd, da);
+
+	dev_err(obj->dev, "%s: errs:0x%08x da:0x%08x pgd:0x%p *pgd:0x%08x pte:0x%p *pte:0x%08x\n",
+		obj->name, errs, da, iopgd, *iopgd, iopte, *iopte);
+
+	return IRQ_NONE;
+}
+
+/**
+ * omap_iommu_attach() - attach iommu device to an iommu domain
+ * @obj:	target omap iommu device
+ * @iopgd:	page table
+ **/
+static int omap_iommu_attach(struct omap_iommu *obj, u32 *iopgd)
+{
+	int err;
+
+	spin_lock(&obj->iommu_lock);
+
+	obj->pd_dma = dma_map_single(obj->dev, iopgd, IOPGD_TABLE_SIZE,
+				     DMA_TO_DEVICE);
+	if (dma_mapping_error(obj->dev, obj->pd_dma)) {
+		dev_err(obj->dev, "DMA map error for L1 table\n");
+		err = -ENOMEM;
+		goto out_err;
+	}
+
+	obj->iopgd = iopgd;
+	err = iommu_enable(obj);
+	if (err)
+		goto out_err;
+	flush_iotlb_all(obj);
+
+	spin_unlock(&obj->iommu_lock);
+
+	dev_dbg(obj->dev, "%s: %s\n", __func__, obj->name);
+
+	return 0;
+
+out_err:
+	spin_unlock(&obj->iommu_lock);
+
+	return err;
+}
+
+/**
+ * omap_iommu_detach - release iommu device
+ * @obj:	target iommu
+ **/
+static void omap_iommu_detach(struct omap_iommu *obj)
+{
+	if (!obj || IS_ERR(obj))
+		return;
+
+	spin_lock(&obj->iommu_lock);
+
+	dma_unmap_single(obj->dev, obj->pd_dma, IOPGD_TABLE_SIZE,
+			 DMA_TO_DEVICE);
+	iommu_disable(obj);
+	obj->pd_dma = 0;
+	obj->iopgd = NULL;
+
+	spin_unlock(&obj->iommu_lock);
+
+	dev_dbg(obj->dev, "%s: %s\n", __func__, obj->name);
+}
+
+static bool omap_iommu_can_register(struct platform_device *pdev)
+{
+	struct device_node *np = pdev->dev.of_node;
+
+	if (!of_device_is_compatible(np, "ti,dra7-dsp-iommu"))
+		return true;
+
+	/*
+	 * restrict IOMMU core registration only for processor-port MDMA MMUs
+	 * on DRA7 DSPs
+	 */
+	if ((!strcmp(dev_name(&pdev->dev), "40d01000.mmu")) ||
+	    (!strcmp(dev_name(&pdev->dev), "41501000.mmu")))
+		return true;
+
+	return false;
+}
+
+static int omap_iommu_dra7_get_dsp_system_cfg(struct platform_device *pdev,
+					      struct omap_iommu *obj)
+{
+	struct device_node *np = pdev->dev.of_node;
+	int ret;
+
+	if (!of_device_is_compatible(np, "ti,dra7-dsp-iommu"))
+		return 0;
+
+	if (!of_property_read_bool(np, "ti,syscon-mmuconfig")) {
+		dev_err(&pdev->dev, "ti,syscon-mmuconfig property is missing\n");
+		return -EINVAL;
+	}
+
+	obj->syscfg =
+		syscon_regmap_lookup_by_phandle(np, "ti,syscon-mmuconfig");
+	if (IS_ERR(obj->syscfg)) {
+		/* can fail with -EPROBE_DEFER */
+		ret = PTR_ERR(obj->syscfg);
+		return ret;
+	}
+
+	if (of_property_read_u32_index(np, "ti,syscon-mmuconfig", 1,
+				       &obj->id)) {
+		dev_err(&pdev->dev, "couldn't get the IOMMU instance id within subsystem\n");
+		return -EINVAL;
+	}
+
+	if (obj->id != 0 && obj->id != 1) {
+		dev_err(&pdev->dev, "invalid IOMMU instance id\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ *	OMAP Device MMU(IOMMU) detection
+ */
+static int omap_iommu_probe(struct platform_device *pdev)
+{
+	int err = -ENODEV;
+	int irq;
+	struct omap_iommu *obj;
+	struct resource *res;
+	struct device_node *of = pdev->dev.of_node;
+
+	if (!of) {
+		pr_err("%s: only DT-based devices are supported\n", __func__);
+		return -ENODEV;
+	}
+
+	obj = devm_kzalloc(&pdev->dev, sizeof(*obj) + MMU_REG_SIZE, GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	obj->name = dev_name(&pdev->dev);
+	obj->nr_tlb_entries = 32;
+	err = of_property_read_u32(of, "ti,#tlb-entries", &obj->nr_tlb_entries);
+	if (err && err != -EINVAL)
+		return err;
+	if (obj->nr_tlb_entries != 32 && obj->nr_tlb_entries != 8)
+		return -EINVAL;
+	if (of_find_property(of, "ti,iommu-bus-err-back", NULL))
+		obj->has_bus_err_back = MMU_GP_REG_BUS_ERR_BACK_EN;
+
+	obj->dev = &pdev->dev;
+	obj->ctx = (void *)obj + sizeof(*obj);
+
+	spin_lock_init(&obj->iommu_lock);
+	spin_lock_init(&obj->page_table_lock);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	obj->regbase = devm_ioremap_resource(obj->dev, res);
+	if (IS_ERR(obj->regbase))
+		return PTR_ERR(obj->regbase);
+
+	err = omap_iommu_dra7_get_dsp_system_cfg(pdev, obj);
+	if (err)
+		return err;
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0)
+		return -ENODEV;
+
+	err = devm_request_irq(obj->dev, irq, iommu_fault_handler, IRQF_SHARED,
+			       dev_name(obj->dev), obj);
+	if (err < 0)
+		return err;
+	platform_set_drvdata(pdev, obj);
+
+	if (omap_iommu_can_register(pdev)) {
+		obj->group = iommu_group_alloc();
+		if (IS_ERR(obj->group))
+			return PTR_ERR(obj->group);
+
+		err = iommu_device_sysfs_add(&obj->iommu, obj->dev, NULL,
+					     obj->name);
+		if (err)
+			goto out_group;
+
+		iommu_device_set_ops(&obj->iommu, &omap_iommu_ops);
+
+		err = iommu_device_register(&obj->iommu);
+		if (err)
+			goto out_sysfs;
+	}
+
+	pm_runtime_irq_safe(obj->dev);
+	pm_runtime_enable(obj->dev);
+
+	omap_iommu_debugfs_add(obj);
+
+	dev_info(&pdev->dev, "%s registered\n", obj->name);
+
+	return 0;
+
+out_sysfs:
+	iommu_device_sysfs_remove(&obj->iommu);
+out_group:
+	iommu_group_put(obj->group);
+	return err;
+}
+
+static int omap_iommu_remove(struct platform_device *pdev)
+{
+	struct omap_iommu *obj = platform_get_drvdata(pdev);
+
+	if (obj->group) {
+		iommu_group_put(obj->group);
+		obj->group = NULL;
+
+		iommu_device_sysfs_remove(&obj->iommu);
+		iommu_device_unregister(&obj->iommu);
+	}
+
+	omap_iommu_debugfs_remove(obj);
+
+	pm_runtime_disable(obj->dev);
+
+	dev_info(&pdev->dev, "%s removed\n", obj->name);
+	return 0;
+}
+
+static const struct of_device_id omap_iommu_of_match[] = {
+	{ .compatible = "ti,omap2-iommu" },
+	{ .compatible = "ti,omap4-iommu" },
+	{ .compatible = "ti,dra7-iommu"	},
+	{ .compatible = "ti,dra7-dsp-iommu" },
+	{},
+};
+
+static struct platform_driver omap_iommu_driver = {
+	.probe	= omap_iommu_probe,
+	.remove	= omap_iommu_remove,
+	.driver	= {
+		.name	= "omap-iommu",
+		.of_match_table = of_match_ptr(omap_iommu_of_match),
+	},
+};
+
+static u32 iotlb_init_entry(struct iotlb_entry *e, u32 da, u32 pa, int pgsz)
+{
+	memset(e, 0, sizeof(*e));
+
+	e->da		= da;
+	e->pa		= pa;
+	e->valid	= MMU_CAM_V;
+	e->pgsz		= pgsz;
+	e->endian	= MMU_RAM_ENDIAN_LITTLE;
+	e->elsz		= MMU_RAM_ELSZ_8;
+	e->mixed	= 0;
+
+	return iopgsz_to_bytes(e->pgsz);
+}
+
+static int omap_iommu_map(struct iommu_domain *domain, unsigned long da,
+			  phys_addr_t pa, size_t bytes, int prot)
+{
+	struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
+	struct device *dev = omap_domain->dev;
+	struct omap_iommu_device *iommu;
+	struct omap_iommu *oiommu;
+	struct iotlb_entry e;
+	int omap_pgsz;
+	u32 ret = -EINVAL;
+	int i;
+
+	omap_pgsz = bytes_to_iopgsz(bytes);
+	if (omap_pgsz < 0) {
+		dev_err(dev, "invalid size to map: %d\n", bytes);
+		return -EINVAL;
+	}
+
+	dev_dbg(dev, "mapping da 0x%lx to pa %pa size 0x%x\n", da, &pa, bytes);
+
+	iotlb_init_entry(&e, da, pa, omap_pgsz);
+
+	iommu = omap_domain->iommus;
+	for (i = 0; i < omap_domain->num_iommus; i++, iommu++) {
+		oiommu = iommu->iommu_dev;
+		ret = omap_iopgtable_store_entry(oiommu, &e);
+		if (ret) {
+			dev_err(dev, "omap_iopgtable_store_entry failed: %d\n",
+				ret);
+			break;
+		}
+	}
+
+	if (ret) {
+		while (i--) {
+			iommu--;
+			oiommu = iommu->iommu_dev;
+			iopgtable_clear_entry(oiommu, da);
+		}
+	}
+
+	return ret;
+}
+
+static size_t omap_iommu_unmap(struct iommu_domain *domain, unsigned long da,
+			       size_t size)
+{
+	struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
+	struct device *dev = omap_domain->dev;
+	struct omap_iommu_device *iommu;
+	struct omap_iommu *oiommu;
+	bool error = false;
+	size_t bytes = 0;
+	int i;
+
+	dev_dbg(dev, "unmapping da 0x%lx size %u\n", da, size);
+
+	iommu = omap_domain->iommus;
+	for (i = 0; i < omap_domain->num_iommus; i++, iommu++) {
+		oiommu = iommu->iommu_dev;
+		bytes = iopgtable_clear_entry(oiommu, da);
+		if (!bytes)
+			error = true;
+	}
+
+	/*
+	 * simplify return - we are only checking if any of the iommus
+	 * reported an error, but not if all of them are unmapping the
+	 * same number of entries. This should not occur due to the
+	 * mirror programming.
+	 */
+	return error ? 0 : bytes;
+}
+
+static int omap_iommu_count(struct device *dev)
+{
+	struct omap_iommu_arch_data *arch_data = dev->archdata.iommu;
+	int count = 0;
+
+	while (arch_data->iommu_dev) {
+		count++;
+		arch_data++;
+	}
+
+	return count;
+}
+
+/* caller should call cleanup if this function fails */
+static int omap_iommu_attach_init(struct device *dev,
+				  struct omap_iommu_domain *odomain)
+{
+	struct omap_iommu_device *iommu;
+	int i;
+
+	odomain->num_iommus = omap_iommu_count(dev);
+	if (!odomain->num_iommus)
+		return -EINVAL;
+
+	odomain->iommus = kcalloc(odomain->num_iommus, sizeof(*iommu),
+				  GFP_ATOMIC);
+	if (!odomain->iommus)
+		return -ENOMEM;
+
+	iommu = odomain->iommus;
+	for (i = 0; i < odomain->num_iommus; i++, iommu++) {
+		iommu->pgtable = kzalloc(IOPGD_TABLE_SIZE, GFP_ATOMIC);
+		if (!iommu->pgtable)
+			return -ENOMEM;
+
+		/*
+		 * should never fail, but please keep this around to ensure
+		 * we keep the hardware happy
+		 */
+		if (WARN_ON(!IS_ALIGNED((long)iommu->pgtable,
+					IOPGD_TABLE_SIZE)))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void omap_iommu_detach_fini(struct omap_iommu_domain *odomain)
+{
+	int i;
+	struct omap_iommu_device *iommu = odomain->iommus;
+
+	for (i = 0; iommu && i < odomain->num_iommus; i++, iommu++)
+		kfree(iommu->pgtable);
+
+	kfree(odomain->iommus);
+	odomain->num_iommus = 0;
+	odomain->iommus = NULL;
+}
+
+static int
+omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+{
+	struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
+	struct omap_iommu_arch_data *arch_data = dev->archdata.iommu;
+	struct omap_iommu_device *iommu;
+	struct omap_iommu *oiommu;
+	int ret = 0;
+	int i;
+
+	if (!arch_data || !arch_data->iommu_dev) {
+		dev_err(dev, "device doesn't have an associated iommu\n");
+		return -EINVAL;
+	}
+
+	spin_lock(&omap_domain->lock);
+
+	/* only a single client device can be attached to a domain */
+	if (omap_domain->dev) {
+		dev_err(dev, "iommu domain is already attached\n");
+		ret = -EBUSY;
+		goto out;
+	}
+
+	ret = omap_iommu_attach_init(dev, omap_domain);
+	if (ret) {
+		dev_err(dev, "failed to allocate required iommu data %d\n",
+			ret);
+		goto init_fail;
+	}
+
+	iommu = omap_domain->iommus;
+	for (i = 0; i < omap_domain->num_iommus; i++, iommu++, arch_data++) {
+		/* configure and enable the omap iommu */
+		oiommu = arch_data->iommu_dev;
+		ret = omap_iommu_attach(oiommu, iommu->pgtable);
+		if (ret) {
+			dev_err(dev, "can't get omap iommu: %d\n", ret);
+			goto attach_fail;
+		}
+
+		oiommu->domain = domain;
+		iommu->iommu_dev = oiommu;
+	}
+
+	omap_domain->dev = dev;
+
+	goto out;
+
+attach_fail:
+	while (i--) {
+		iommu--;
+		arch_data--;
+		oiommu = iommu->iommu_dev;
+		omap_iommu_detach(oiommu);
+		iommu->iommu_dev = NULL;
+		oiommu->domain = NULL;
+	}
+init_fail:
+	omap_iommu_detach_fini(omap_domain);
+out:
+	spin_unlock(&omap_domain->lock);
+	return ret;
+}
+
+static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain,
+				   struct device *dev)
+{
+	struct omap_iommu_arch_data *arch_data = dev->archdata.iommu;
+	struct omap_iommu_device *iommu = omap_domain->iommus;
+	struct omap_iommu *oiommu;
+	int i;
+
+	if (!omap_domain->dev) {
+		dev_err(dev, "domain has no attached device\n");
+		return;
+	}
+
+	/* only a single device is supported per domain for now */
+	if (omap_domain->dev != dev) {
+		dev_err(dev, "invalid attached device\n");
+		return;
+	}
+
+	/*
+	 * cleanup in the reverse order of attachment - this addresses
+	 * any h/w dependencies between multiple instances, if any
+	 */
+	iommu += (omap_domain->num_iommus - 1);
+	arch_data += (omap_domain->num_iommus - 1);
+	for (i = 0; i < omap_domain->num_iommus; i++, iommu--, arch_data--) {
+		oiommu = iommu->iommu_dev;
+		iopgtable_clear_entry_all(oiommu);
+
+		omap_iommu_detach(oiommu);
+		iommu->iommu_dev = NULL;
+		oiommu->domain = NULL;
+	}
+
+	omap_iommu_detach_fini(omap_domain);
+
+	omap_domain->dev = NULL;
+}
+
+static void omap_iommu_detach_dev(struct iommu_domain *domain,
+				  struct device *dev)
+{
+	struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
+
+	spin_lock(&omap_domain->lock);
+	_omap_iommu_detach_dev(omap_domain, dev);
+	spin_unlock(&omap_domain->lock);
+}
+
+static struct iommu_domain *omap_iommu_domain_alloc(unsigned type)
+{
+	struct omap_iommu_domain *omap_domain;
+
+	if (type != IOMMU_DOMAIN_UNMANAGED)
+		return NULL;
+
+	omap_domain = kzalloc(sizeof(*omap_domain), GFP_KERNEL);
+	if (!omap_domain)
+		return NULL;
+
+	spin_lock_init(&omap_domain->lock);
+
+	omap_domain->domain.geometry.aperture_start = 0;
+	omap_domain->domain.geometry.aperture_end   = (1ULL << 32) - 1;
+	omap_domain->domain.geometry.force_aperture = true;
+
+	return &omap_domain->domain;
+}
+
+static void omap_iommu_domain_free(struct iommu_domain *domain)
+{
+	struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
+
+	/*
+	 * An iommu device is still attached
+	 * (currently, only one device can be attached) ?
+	 */
+	if (omap_domain->dev)
+		_omap_iommu_detach_dev(omap_domain, omap_domain->dev);
+
+	kfree(omap_domain);
+}
+
+static phys_addr_t omap_iommu_iova_to_phys(struct iommu_domain *domain,
+					   dma_addr_t da)
+{
+	struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
+	struct omap_iommu_device *iommu = omap_domain->iommus;
+	struct omap_iommu *oiommu = iommu->iommu_dev;
+	struct device *dev = oiommu->dev;
+	u32 *pgd, *pte;
+	phys_addr_t ret = 0;
+
+	/*
+	 * all the iommus within the domain will have identical programming,
+	 * so perform the lookup using just the first iommu
+	 */
+	iopgtable_lookup_entry(oiommu, da, &pgd, &pte);
+
+	if (pte) {
+		if (iopte_is_small(*pte))
+			ret = omap_iommu_translate(*pte, da, IOPTE_MASK);
+		else if (iopte_is_large(*pte))
+			ret = omap_iommu_translate(*pte, da, IOLARGE_MASK);
+		else
+			dev_err(dev, "bogus pte 0x%x, da 0x%llx", *pte,
+				(unsigned long long)da);
+	} else {
+		if (iopgd_is_section(*pgd))
+			ret = omap_iommu_translate(*pgd, da, IOSECTION_MASK);
+		else if (iopgd_is_super(*pgd))
+			ret = omap_iommu_translate(*pgd, da, IOSUPER_MASK);
+		else
+			dev_err(dev, "bogus pgd 0x%x, da 0x%llx", *pgd,
+				(unsigned long long)da);
+	}
+
+	return ret;
+}
+
+static int omap_iommu_add_device(struct device *dev)
+{
+	struct omap_iommu_arch_data *arch_data, *tmp;
+	struct omap_iommu *oiommu;
+	struct iommu_group *group;
+	struct device_node *np;
+	struct platform_device *pdev;
+	int num_iommus, i;
+	int ret;
+
+	/*
+	 * Allocate the archdata iommu structure for DT-based devices.
+	 *
+	 * TODO: Simplify this when removing non-DT support completely from the
+	 * IOMMU users.
+	 */
+	if (!dev->of_node)
+		return 0;
+
+	/*
+	 * retrieve the count of IOMMU nodes using phandle size as element size
+	 * since #iommu-cells = 0 for OMAP
+	 */
+	num_iommus = of_property_count_elems_of_size(dev->of_node, "iommus",
+						     sizeof(phandle));
+	if (num_iommus < 0)
+		return 0;
+
+	arch_data = kcalloc(num_iommus + 1, sizeof(*arch_data), GFP_KERNEL);
+	if (!arch_data)
+		return -ENOMEM;
+
+	for (i = 0, tmp = arch_data; i < num_iommus; i++, tmp++) {
+		np = of_parse_phandle(dev->of_node, "iommus", i);
+		if (!np) {
+			kfree(arch_data);
+			return -EINVAL;
+		}
+
+		pdev = of_find_device_by_node(np);
+		if (WARN_ON(!pdev)) {
+			of_node_put(np);
+			kfree(arch_data);
+			return -EINVAL;
+		}
+
+		oiommu = platform_get_drvdata(pdev);
+		if (!oiommu) {
+			of_node_put(np);
+			kfree(arch_data);
+			return -EINVAL;
+		}
+
+		tmp->iommu_dev = oiommu;
+
+		of_node_put(np);
+	}
+
+	/*
+	 * use the first IOMMU alone for the sysfs device linking.
+	 * TODO: Evaluate if a single iommu_group needs to be
+	 * maintained for both IOMMUs
+	 */
+	oiommu = arch_data->iommu_dev;
+	ret = iommu_device_link(&oiommu->iommu, dev);
+	if (ret) {
+		kfree(arch_data);
+		return ret;
+	}
+
+	dev->archdata.iommu = arch_data;
+
+	/*
+	 * IOMMU group initialization calls into omap_iommu_device_group, which
+	 * needs a valid dev->archdata.iommu pointer
+	 */
+	group = iommu_group_get_for_dev(dev);
+	if (IS_ERR(group)) {
+		iommu_device_unlink(&oiommu->iommu, dev);
+		dev->archdata.iommu = NULL;
+		kfree(arch_data);
+		return PTR_ERR(group);
+	}
+	iommu_group_put(group);
+
+	return 0;
+}
+
+static void omap_iommu_remove_device(struct device *dev)
+{
+	struct omap_iommu_arch_data *arch_data = dev->archdata.iommu;
+
+	if (!dev->of_node || !arch_data)
+		return;
+
+	iommu_device_unlink(&arch_data->iommu_dev->iommu, dev);
+	iommu_group_remove_device(dev);
+
+	dev->archdata.iommu = NULL;
+	kfree(arch_data);
+
+}
+
+static struct iommu_group *omap_iommu_device_group(struct device *dev)
+{
+	struct omap_iommu_arch_data *arch_data = dev->archdata.iommu;
+	struct iommu_group *group = ERR_PTR(-EINVAL);
+
+	if (arch_data->iommu_dev)
+		group = iommu_group_ref_get(arch_data->iommu_dev->group);
+
+	return group;
+}
+
+static const struct iommu_ops omap_iommu_ops = {
+	.domain_alloc	= omap_iommu_domain_alloc,
+	.domain_free	= omap_iommu_domain_free,
+	.attach_dev	= omap_iommu_attach_dev,
+	.detach_dev	= omap_iommu_detach_dev,
+	.map		= omap_iommu_map,
+	.unmap		= omap_iommu_unmap,
+	.iova_to_phys	= omap_iommu_iova_to_phys,
+	.add_device	= omap_iommu_add_device,
+	.remove_device	= omap_iommu_remove_device,
+	.device_group	= omap_iommu_device_group,
+	.pgsize_bitmap	= OMAP_IOMMU_PGSIZES,
+};
+
+static int __init omap_iommu_init(void)
+{
+	struct kmem_cache *p;
+	const unsigned long flags = SLAB_HWCACHE_ALIGN;
+	size_t align = 1 << 10; /* L2 pagetable alignement */
+	struct device_node *np;
+	int ret;
+
+	np = of_find_matching_node(NULL, omap_iommu_of_match);
+	if (!np)
+		return 0;
+
+	of_node_put(np);
+
+	p = kmem_cache_create("iopte_cache", IOPTE_TABLE_SIZE, align, flags,
+			      NULL);
+	if (!p)
+		return -ENOMEM;
+	iopte_cachep = p;
+
+	omap_iommu_debugfs_init();
+
+	ret = platform_driver_register(&omap_iommu_driver);
+	if (ret) {
+		pr_err("%s: failed to register driver\n", __func__);
+		goto fail_driver;
+	}
+
+	ret = bus_set_iommu(&platform_bus_type, &omap_iommu_ops);
+	if (ret)
+		goto fail_bus;
+
+	return 0;
+
+fail_bus:
+	platform_driver_unregister(&omap_iommu_driver);
+fail_driver:
+	kmem_cache_destroy(iopte_cachep);
+	return ret;
+}
+subsys_initcall(omap_iommu_init);
+/* must be ready before omap3isp is probed */
diff --git a/drivers/iommu/omap-iommu.h b/drivers/iommu/omap-iommu.h
new file mode 100644
index 000000000..1703159ef
--- /dev/null
+++ b/drivers/iommu/omap-iommu.h
@@ -0,0 +1,270 @@
+/*
+ * omap iommu: main structures
+ *
+ * Copyright (C) 2008-2009 Nokia Corporation
+ *
+ * Written by Hiroshi DOYU <Hiroshi.DOYU@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _OMAP_IOMMU_H
+#define _OMAP_IOMMU_H
+
+#include <linux/bitops.h>
+#include <linux/iommu.h>
+
+#define for_each_iotlb_cr(obj, n, __i, cr)				\
+	for (__i = 0;							\
+	     (__i < (n)) && (cr = __iotlb_read_cr((obj), __i), true);	\
+	     __i++)
+
+struct iotlb_entry {
+	u32 da;
+	u32 pa;
+	u32 pgsz, prsvd, valid;
+	u32 endian, elsz, mixed;
+};
+
+/**
+ * struct omap_iommu_device - omap iommu device data
+ * @pgtable:	page table used by an omap iommu attached to a domain
+ * @iommu_dev:	pointer to store an omap iommu instance attached to a domain
+ */
+struct omap_iommu_device {
+	u32 *pgtable;
+	struct omap_iommu *iommu_dev;
+};
+
+/**
+ * struct omap_iommu_domain - omap iommu domain
+ * @num_iommus: number of iommus in this domain
+ * @iommus:	omap iommu device data for all iommus in this domain
+ * @dev:	Device using this domain.
+ * @lock:	domain lock, should be taken when attaching/detaching
+ * @domain:	generic domain handle used by iommu core code
+ */
+struct omap_iommu_domain {
+	u32 num_iommus;
+	struct omap_iommu_device *iommus;
+	struct device *dev;
+	spinlock_t lock;
+	struct iommu_domain domain;
+};
+
+struct omap_iommu {
+	const char	*name;
+	void __iomem	*regbase;
+	struct regmap	*syscfg;
+	struct device	*dev;
+	struct iommu_domain *domain;
+	struct dentry	*debug_dir;
+
+	spinlock_t	iommu_lock;	/* global for this whole object */
+
+	/*
+	 * We don't change iopgd for a situation like pgd for a task,
+	 * but share it globally for each iommu.
+	 */
+	u32		*iopgd;
+	spinlock_t	page_table_lock; /* protect iopgd */
+	dma_addr_t	pd_dma;
+
+	int		nr_tlb_entries;
+
+	void *ctx; /* iommu context: registres saved area */
+
+	int has_bus_err_back;
+	u32 id;
+
+	struct iommu_device iommu;
+	struct iommu_group *group;
+};
+
+/**
+ * struct omap_iommu_arch_data - omap iommu private data
+ * @iommu_dev: handle of the iommu device
+ *
+ * This is an omap iommu private data object, which binds an iommu user
+ * to its iommu device. This object should be placed at the iommu user's
+ * dev_archdata so generic IOMMU API can be used without having to
+ * utilize omap-specific plumbing anymore.
+ */
+struct omap_iommu_arch_data {
+	struct omap_iommu *iommu_dev;
+};
+
+struct cr_regs {
+	u32 cam;
+	u32 ram;
+};
+
+struct iotlb_lock {
+	short base;
+	short vict;
+};
+
+/*
+ * MMU Register offsets
+ */
+#define MMU_REVISION		0x00
+#define MMU_IRQSTATUS		0x18
+#define MMU_IRQENABLE		0x1c
+#define MMU_WALKING_ST		0x40
+#define MMU_CNTL		0x44
+#define MMU_FAULT_AD		0x48
+#define MMU_TTB			0x4c
+#define MMU_LOCK		0x50
+#define MMU_LD_TLB		0x54
+#define MMU_CAM			0x58
+#define MMU_RAM			0x5c
+#define MMU_GFLUSH		0x60
+#define MMU_FLUSH_ENTRY		0x64
+#define MMU_READ_CAM		0x68
+#define MMU_READ_RAM		0x6c
+#define MMU_EMU_FAULT_AD	0x70
+#define MMU_GP_REG		0x88
+
+#define MMU_REG_SIZE		256
+
+/*
+ * MMU Register bit definitions
+ */
+/* IRQSTATUS & IRQENABLE */
+#define MMU_IRQ_MULTIHITFAULT	BIT(4)
+#define MMU_IRQ_TABLEWALKFAULT	BIT(3)
+#define MMU_IRQ_EMUMISS		BIT(2)
+#define MMU_IRQ_TRANSLATIONFAULT	BIT(1)
+#define MMU_IRQ_TLBMISS		BIT(0)
+
+#define __MMU_IRQ_FAULT		\
+	(MMU_IRQ_MULTIHITFAULT | MMU_IRQ_EMUMISS | MMU_IRQ_TRANSLATIONFAULT)
+#define MMU_IRQ_MASK		\
+	(__MMU_IRQ_FAULT | MMU_IRQ_TABLEWALKFAULT | MMU_IRQ_TLBMISS)
+#define MMU_IRQ_TWL_MASK	(__MMU_IRQ_FAULT | MMU_IRQ_TABLEWALKFAULT)
+#define MMU_IRQ_TLB_MISS_MASK	(__MMU_IRQ_FAULT | MMU_IRQ_TLBMISS)
+
+/* MMU_CNTL */
+#define MMU_CNTL_SHIFT		1
+#define MMU_CNTL_MASK		(7 << MMU_CNTL_SHIFT)
+#define MMU_CNTL_EML_TLB	BIT(3)
+#define MMU_CNTL_TWL_EN		BIT(2)
+#define MMU_CNTL_MMU_EN		BIT(1)
+
+/* CAM */
+#define MMU_CAM_VATAG_SHIFT	12
+#define MMU_CAM_VATAG_MASK \
+	((~0UL >> MMU_CAM_VATAG_SHIFT) << MMU_CAM_VATAG_SHIFT)
+#define MMU_CAM_P		BIT(3)
+#define MMU_CAM_V		BIT(2)
+#define MMU_CAM_PGSZ_MASK	3
+#define MMU_CAM_PGSZ_1M		(0 << 0)
+#define MMU_CAM_PGSZ_64K	(1 << 0)
+#define MMU_CAM_PGSZ_4K		(2 << 0)
+#define MMU_CAM_PGSZ_16M	(3 << 0)
+
+/* RAM */
+#define MMU_RAM_PADDR_SHIFT	12
+#define MMU_RAM_PADDR_MASK \
+	((~0UL >> MMU_RAM_PADDR_SHIFT) << MMU_RAM_PADDR_SHIFT)
+
+#define MMU_RAM_ENDIAN_SHIFT	9
+#define MMU_RAM_ENDIAN_MASK	BIT(MMU_RAM_ENDIAN_SHIFT)
+#define MMU_RAM_ENDIAN_LITTLE	(0 << MMU_RAM_ENDIAN_SHIFT)
+#define MMU_RAM_ENDIAN_BIG	BIT(MMU_RAM_ENDIAN_SHIFT)
+
+#define MMU_RAM_ELSZ_SHIFT	7
+#define MMU_RAM_ELSZ_MASK	(3 << MMU_RAM_ELSZ_SHIFT)
+#define MMU_RAM_ELSZ_8		(0 << MMU_RAM_ELSZ_SHIFT)
+#define MMU_RAM_ELSZ_16		(1 << MMU_RAM_ELSZ_SHIFT)
+#define MMU_RAM_ELSZ_32		(2 << MMU_RAM_ELSZ_SHIFT)
+#define MMU_RAM_ELSZ_NONE	(3 << MMU_RAM_ELSZ_SHIFT)
+#define MMU_RAM_MIXED_SHIFT	6
+#define MMU_RAM_MIXED_MASK	BIT(MMU_RAM_MIXED_SHIFT)
+#define MMU_RAM_MIXED		MMU_RAM_MIXED_MASK
+
+#define MMU_GP_REG_BUS_ERR_BACK_EN	0x1
+
+#define get_cam_va_mask(pgsz)				\
+	(((pgsz) == MMU_CAM_PGSZ_16M) ? 0xff000000 :	\
+	 ((pgsz) == MMU_CAM_PGSZ_1M)  ? 0xfff00000 :	\
+	 ((pgsz) == MMU_CAM_PGSZ_64K) ? 0xffff0000 :	\
+	 ((pgsz) == MMU_CAM_PGSZ_4K)  ? 0xfffff000 : 0)
+
+/*
+ * DSP_SYSTEM registers and bit definitions (applicable only for DRA7xx DSP)
+ */
+#define DSP_SYS_REVISION		0x00
+#define DSP_SYS_MMU_CONFIG		0x18
+#define DSP_SYS_MMU_CONFIG_EN_SHIFT	4
+
+/*
+ * utilities for super page(16MB, 1MB, 64KB and 4KB)
+ */
+
+#define iopgsz_max(bytes)			\
+	(((bytes) >= SZ_16M) ? SZ_16M :		\
+	 ((bytes) >= SZ_1M)  ? SZ_1M  :		\
+	 ((bytes) >= SZ_64K) ? SZ_64K :		\
+	 ((bytes) >= SZ_4K)  ? SZ_4K  :	0)
+
+#define bytes_to_iopgsz(bytes)				\
+	(((bytes) == SZ_16M) ? MMU_CAM_PGSZ_16M :	\
+	 ((bytes) == SZ_1M)  ? MMU_CAM_PGSZ_1M  :	\
+	 ((bytes) == SZ_64K) ? MMU_CAM_PGSZ_64K :	\
+	 ((bytes) == SZ_4K)  ? MMU_CAM_PGSZ_4K  : -1)
+
+#define iopgsz_to_bytes(iopgsz)				\
+	(((iopgsz) == MMU_CAM_PGSZ_16M)	? SZ_16M :	\
+	 ((iopgsz) == MMU_CAM_PGSZ_1M)	? SZ_1M  :	\
+	 ((iopgsz) == MMU_CAM_PGSZ_64K)	? SZ_64K :	\
+	 ((iopgsz) == MMU_CAM_PGSZ_4K)	? SZ_4K  : 0)
+
+#define iopgsz_ok(bytes) (bytes_to_iopgsz(bytes) >= 0)
+
+/*
+ * global functions
+ */
+
+struct cr_regs __iotlb_read_cr(struct omap_iommu *obj, int n);
+void iotlb_lock_get(struct omap_iommu *obj, struct iotlb_lock *l);
+void iotlb_lock_set(struct omap_iommu *obj, struct iotlb_lock *l);
+
+#ifdef CONFIG_OMAP_IOMMU_DEBUG
+void omap_iommu_debugfs_init(void);
+void omap_iommu_debugfs_exit(void);
+
+void omap_iommu_debugfs_add(struct omap_iommu *obj);
+void omap_iommu_debugfs_remove(struct omap_iommu *obj);
+#else
+static inline void omap_iommu_debugfs_init(void) { }
+static inline void omap_iommu_debugfs_exit(void) { }
+
+static inline void omap_iommu_debugfs_add(struct omap_iommu *obj) { }
+static inline void omap_iommu_debugfs_remove(struct omap_iommu *obj) { }
+#endif
+
+/*
+ * register accessors
+ */
+static inline u32 iommu_read_reg(struct omap_iommu *obj, size_t offs)
+{
+	return __raw_readl(obj->regbase + offs);
+}
+
+static inline void iommu_write_reg(struct omap_iommu *obj, u32 val, size_t offs)
+{
+	__raw_writel(val, obj->regbase + offs);
+}
+
+static inline int iotlb_cr_valid(struct cr_regs *cr)
+{
+	if (!cr)
+		return -EINVAL;
+
+	return cr->cam & MMU_CAM_V;
+}
+
+#endif /* _OMAP_IOMMU_H */
diff --git a/drivers/iommu/omap-iopgtable.h b/drivers/iommu/omap-iopgtable.h
new file mode 100644
index 000000000..01a315227
--- /dev/null
+++ b/drivers/iommu/omap-iopgtable.h
@@ -0,0 +1,102 @@
+/*
+ * omap iommu: pagetable definitions
+ *
+ * Copyright (C) 2008-2010 Nokia Corporation
+ *
+ * Written by Hiroshi DOYU <Hiroshi.DOYU@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _OMAP_IOPGTABLE_H
+#define _OMAP_IOPGTABLE_H
+
+#include <linux/bitops.h>
+
+/*
+ * "L2 table" address mask and size definitions.
+ */
+#define IOPGD_SHIFT		20
+#define IOPGD_SIZE		BIT(IOPGD_SHIFT)
+#define IOPGD_MASK		(~(IOPGD_SIZE - 1))
+
+/*
+ * "section" address mask and size definitions.
+ */
+#define IOSECTION_SHIFT		20
+#define IOSECTION_SIZE		BIT(IOSECTION_SHIFT)
+#define IOSECTION_MASK		(~(IOSECTION_SIZE - 1))
+
+/*
+ * "supersection" address mask and size definitions.
+ */
+#define IOSUPER_SHIFT		24
+#define IOSUPER_SIZE		BIT(IOSUPER_SHIFT)
+#define IOSUPER_MASK		(~(IOSUPER_SIZE - 1))
+
+#define PTRS_PER_IOPGD		(1UL << (32 - IOPGD_SHIFT))
+#define IOPGD_TABLE_SIZE	(PTRS_PER_IOPGD * sizeof(u32))
+
+/*
+ * "small page" address mask and size definitions.
+ */
+#define IOPTE_SHIFT		12
+#define IOPTE_SIZE		BIT(IOPTE_SHIFT)
+#define IOPTE_MASK		(~(IOPTE_SIZE - 1))
+
+/*
+ * "large page" address mask and size definitions.
+ */
+#define IOLARGE_SHIFT		16
+#define IOLARGE_SIZE		BIT(IOLARGE_SHIFT)
+#define IOLARGE_MASK		(~(IOLARGE_SIZE - 1))
+
+#define PTRS_PER_IOPTE		(1UL << (IOPGD_SHIFT - IOPTE_SHIFT))
+#define IOPTE_TABLE_SIZE	(PTRS_PER_IOPTE * sizeof(u32))
+
+#define IOPAGE_MASK		IOPTE_MASK
+
+/**
+ * omap_iommu_translate() - va to pa translation
+ * @d:		omap iommu descriptor
+ * @va:		virtual address
+ * @mask:	omap iommu descriptor mask
+ *
+ * va to pa translation
+ */
+static inline phys_addr_t omap_iommu_translate(u32 d, u32 va, u32 mask)
+{
+	return (d & mask) | (va & (~mask));
+}
+
+/*
+ * some descriptor attributes.
+ */
+#define IOPGD_TABLE		(1)
+#define IOPGD_SECTION		(2)
+#define IOPGD_SUPER		(BIT(18) | IOPGD_SECTION)
+
+#define iopgd_is_table(x)	(((x) & 3) == IOPGD_TABLE)
+#define iopgd_is_section(x)	(((x) & (1 << 18 | 3)) == IOPGD_SECTION)
+#define iopgd_is_super(x)	(((x) & (1 << 18 | 3)) == IOPGD_SUPER)
+
+#define IOPTE_SMALL		(2)
+#define IOPTE_LARGE		(1)
+
+#define iopte_is_small(x)	(((x) & 2) == IOPTE_SMALL)
+#define iopte_is_large(x)	(((x) & 3) == IOPTE_LARGE)
+
+/* to find an entry in a page-table-directory */
+#define iopgd_index(da)		(((da) >> IOPGD_SHIFT) & (PTRS_PER_IOPGD - 1))
+#define iopgd_offset(obj, da)	((obj)->iopgd + iopgd_index(da))
+
+#define iopgd_page_paddr(iopgd)	(*iopgd & ~((1 << 10) - 1))
+#define iopgd_page_vaddr(iopgd)	((u32 *)phys_to_virt(iopgd_page_paddr(iopgd)))
+
+/* to find an entry in the second-level page table. */
+#define iopte_index(da)		(((da) >> IOPTE_SHIFT) & (PTRS_PER_IOPTE - 1))
+#define iopte_offset(iopgd, da)	(iopgd_page_vaddr(iopgd) + iopte_index(da))
+
+#endif /* _OMAP_IOPGTABLE_H */
diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c
new file mode 100644
index 000000000..b0a4a3d2f
--- /dev/null
+++ b/drivers/iommu/qcom_iommu.c
@@ -0,0 +1,947 @@
+/*
+ * IOMMU API for QCOM secure IOMMUs.  Somewhat based on arm-smmu.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright (C) 2013 ARM Limited
+ * Copyright (C) 2017 Red Hat
+ */
+
+#include <linux/atomic.h>
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/dma-iommu.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/io-64-nonatomic-hi-lo.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/kconfig.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/of_iommu.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/pm_runtime.h>
+#include <linux/qcom_scm.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "io-pgtable.h"
+#include "arm-smmu-regs.h"
+
+#define SMMU_INTR_SEL_NS     0x2000
+
+struct qcom_iommu_ctx;
+
+struct qcom_iommu_dev {
+	/* IOMMU core code handle */
+	struct iommu_device	 iommu;
+	struct device		*dev;
+	struct clk		*iface_clk;
+	struct clk		*bus_clk;
+	void __iomem		*local_base;
+	u32			 sec_id;
+	u8			 num_ctxs;
+	struct qcom_iommu_ctx	*ctxs[0];   /* indexed by asid-1 */
+};
+
+struct qcom_iommu_ctx {
+	struct device		*dev;
+	void __iomem		*base;
+	bool			 secure_init;
+	u8			 asid;      /* asid and ctx bank # are 1:1 */
+	struct iommu_domain	*domain;
+};
+
+struct qcom_iommu_domain {
+	struct io_pgtable_ops	*pgtbl_ops;
+	spinlock_t		 pgtbl_lock;
+	struct mutex		 init_mutex; /* Protects iommu pointer */
+	struct iommu_domain	 domain;
+	struct qcom_iommu_dev	*iommu;
+};
+
+static struct qcom_iommu_domain *to_qcom_iommu_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct qcom_iommu_domain, domain);
+}
+
+static const struct iommu_ops qcom_iommu_ops;
+
+static struct qcom_iommu_dev * to_iommu(struct iommu_fwspec *fwspec)
+{
+	if (!fwspec || fwspec->ops != &qcom_iommu_ops)
+		return NULL;
+	return fwspec->iommu_priv;
+}
+
+static struct qcom_iommu_ctx * to_ctx(struct iommu_fwspec *fwspec, unsigned asid)
+{
+	struct qcom_iommu_dev *qcom_iommu = to_iommu(fwspec);
+	if (!qcom_iommu)
+		return NULL;
+	return qcom_iommu->ctxs[asid - 1];
+}
+
+static inline void
+iommu_writel(struct qcom_iommu_ctx *ctx, unsigned reg, u32 val)
+{
+	writel_relaxed(val, ctx->base + reg);
+}
+
+static inline void
+iommu_writeq(struct qcom_iommu_ctx *ctx, unsigned reg, u64 val)
+{
+	writeq_relaxed(val, ctx->base + reg);
+}
+
+static inline u32
+iommu_readl(struct qcom_iommu_ctx *ctx, unsigned reg)
+{
+	return readl_relaxed(ctx->base + reg);
+}
+
+static inline u64
+iommu_readq(struct qcom_iommu_ctx *ctx, unsigned reg)
+{
+	return readq_relaxed(ctx->base + reg);
+}
+
+static void qcom_iommu_tlb_sync(void *cookie)
+{
+	struct iommu_fwspec *fwspec = cookie;
+	unsigned i;
+
+	for (i = 0; i < fwspec->num_ids; i++) {
+		struct qcom_iommu_ctx *ctx = to_ctx(fwspec, fwspec->ids[i]);
+		unsigned int val, ret;
+
+		iommu_writel(ctx, ARM_SMMU_CB_TLBSYNC, 0);
+
+		ret = readl_poll_timeout(ctx->base + ARM_SMMU_CB_TLBSTATUS, val,
+					 (val & 0x1) == 0, 0, 5000000);
+		if (ret)
+			dev_err(ctx->dev, "timeout waiting for TLB SYNC\n");
+	}
+}
+
+static void qcom_iommu_tlb_inv_context(void *cookie)
+{
+	struct iommu_fwspec *fwspec = cookie;
+	unsigned i;
+
+	for (i = 0; i < fwspec->num_ids; i++) {
+		struct qcom_iommu_ctx *ctx = to_ctx(fwspec, fwspec->ids[i]);
+		iommu_writel(ctx, ARM_SMMU_CB_S1_TLBIASID, ctx->asid);
+	}
+
+	qcom_iommu_tlb_sync(cookie);
+}
+
+static void qcom_iommu_tlb_inv_range_nosync(unsigned long iova, size_t size,
+					    size_t granule, bool leaf, void *cookie)
+{
+	struct iommu_fwspec *fwspec = cookie;
+	unsigned i, reg;
+
+	reg = leaf ? ARM_SMMU_CB_S1_TLBIVAL : ARM_SMMU_CB_S1_TLBIVA;
+
+	for (i = 0; i < fwspec->num_ids; i++) {
+		struct qcom_iommu_ctx *ctx = to_ctx(fwspec, fwspec->ids[i]);
+		size_t s = size;
+
+		iova &= ~12UL;
+		iova |= ctx->asid;
+		do {
+			iommu_writel(ctx, reg, iova);
+			iova += granule;
+		} while (s -= granule);
+	}
+}
+
+static const struct iommu_gather_ops qcom_gather_ops = {
+	.tlb_flush_all	= qcom_iommu_tlb_inv_context,
+	.tlb_add_flush	= qcom_iommu_tlb_inv_range_nosync,
+	.tlb_sync	= qcom_iommu_tlb_sync,
+};
+
+static irqreturn_t qcom_iommu_fault(int irq, void *dev)
+{
+	struct qcom_iommu_ctx *ctx = dev;
+	u32 fsr, fsynr;
+	u64 iova;
+
+	fsr = iommu_readl(ctx, ARM_SMMU_CB_FSR);
+
+	if (!(fsr & FSR_FAULT))
+		return IRQ_NONE;
+
+	fsynr = iommu_readl(ctx, ARM_SMMU_CB_FSYNR0);
+	iova = iommu_readq(ctx, ARM_SMMU_CB_FAR);
+
+	if (!report_iommu_fault(ctx->domain, ctx->dev, iova, 0)) {
+		dev_err_ratelimited(ctx->dev,
+				    "Unhandled context fault: fsr=0x%x, "
+				    "iova=0x%016llx, fsynr=0x%x, cb=%d\n",
+				    fsr, iova, fsynr, ctx->asid);
+	}
+
+	iommu_writel(ctx, ARM_SMMU_CB_FSR, fsr);
+	iommu_writel(ctx, ARM_SMMU_CB_RESUME, RESUME_TERMINATE);
+
+	return IRQ_HANDLED;
+}
+
+static int qcom_iommu_init_domain(struct iommu_domain *domain,
+				  struct qcom_iommu_dev *qcom_iommu,
+				  struct iommu_fwspec *fwspec)
+{
+	struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+	struct io_pgtable_ops *pgtbl_ops;
+	struct io_pgtable_cfg pgtbl_cfg;
+	int i, ret = 0;
+	u32 reg;
+
+	mutex_lock(&qcom_domain->init_mutex);
+	if (qcom_domain->iommu)
+		goto out_unlock;
+
+	pgtbl_cfg = (struct io_pgtable_cfg) {
+		.pgsize_bitmap	= qcom_iommu_ops.pgsize_bitmap,
+		.ias		= 32,
+		.oas		= 40,
+		.tlb		= &qcom_gather_ops,
+		.iommu_dev	= qcom_iommu->dev,
+	};
+
+	qcom_domain->iommu = qcom_iommu;
+	pgtbl_ops = alloc_io_pgtable_ops(ARM_32_LPAE_S1, &pgtbl_cfg, fwspec);
+	if (!pgtbl_ops) {
+		dev_err(qcom_iommu->dev, "failed to allocate pagetable ops\n");
+		ret = -ENOMEM;
+		goto out_clear_iommu;
+	}
+
+	/* Update the domain's page sizes to reflect the page table format */
+	domain->pgsize_bitmap = pgtbl_cfg.pgsize_bitmap;
+	domain->geometry.aperture_end = (1ULL << pgtbl_cfg.ias) - 1;
+	domain->geometry.force_aperture = true;
+
+	for (i = 0; i < fwspec->num_ids; i++) {
+		struct qcom_iommu_ctx *ctx = to_ctx(fwspec, fwspec->ids[i]);
+
+		if (!ctx->secure_init) {
+			ret = qcom_scm_restore_sec_cfg(qcom_iommu->sec_id, ctx->asid);
+			if (ret) {
+				dev_err(qcom_iommu->dev, "secure init failed: %d\n", ret);
+				goto out_clear_iommu;
+			}
+			ctx->secure_init = true;
+		}
+
+		/* TTBRs */
+		iommu_writeq(ctx, ARM_SMMU_CB_TTBR0,
+				pgtbl_cfg.arm_lpae_s1_cfg.ttbr[0] |
+				((u64)ctx->asid << TTBRn_ASID_SHIFT));
+		iommu_writeq(ctx, ARM_SMMU_CB_TTBR1,
+				pgtbl_cfg.arm_lpae_s1_cfg.ttbr[1] |
+				((u64)ctx->asid << TTBRn_ASID_SHIFT));
+
+		/* TTBCR */
+		iommu_writel(ctx, ARM_SMMU_CB_TTBCR2,
+				(pgtbl_cfg.arm_lpae_s1_cfg.tcr >> 32) |
+				TTBCR2_SEP_UPSTREAM);
+		iommu_writel(ctx, ARM_SMMU_CB_TTBCR,
+				pgtbl_cfg.arm_lpae_s1_cfg.tcr);
+
+		/* MAIRs (stage-1 only) */
+		iommu_writel(ctx, ARM_SMMU_CB_S1_MAIR0,
+				pgtbl_cfg.arm_lpae_s1_cfg.mair[0]);
+		iommu_writel(ctx, ARM_SMMU_CB_S1_MAIR1,
+				pgtbl_cfg.arm_lpae_s1_cfg.mair[1]);
+
+		/* SCTLR */
+		reg = SCTLR_CFIE | SCTLR_CFRE | SCTLR_AFE | SCTLR_TRE |
+			SCTLR_M | SCTLR_S1_ASIDPNE | SCTLR_CFCFG;
+
+		if (IS_ENABLED(CONFIG_BIG_ENDIAN))
+			reg |= SCTLR_E;
+
+		iommu_writel(ctx, ARM_SMMU_CB_SCTLR, reg);
+
+		ctx->domain = domain;
+	}
+
+	mutex_unlock(&qcom_domain->init_mutex);
+
+	/* Publish page table ops for map/unmap */
+	qcom_domain->pgtbl_ops = pgtbl_ops;
+
+	return 0;
+
+out_clear_iommu:
+	qcom_domain->iommu = NULL;
+out_unlock:
+	mutex_unlock(&qcom_domain->init_mutex);
+	return ret;
+}
+
+static struct iommu_domain *qcom_iommu_domain_alloc(unsigned type)
+{
+	struct qcom_iommu_domain *qcom_domain;
+
+	if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA)
+		return NULL;
+	/*
+	 * Allocate the domain and initialise some of its data structures.
+	 * We can't really do anything meaningful until we've added a
+	 * master.
+	 */
+	qcom_domain = kzalloc(sizeof(*qcom_domain), GFP_KERNEL);
+	if (!qcom_domain)
+		return NULL;
+
+	if (type == IOMMU_DOMAIN_DMA &&
+	    iommu_get_dma_cookie(&qcom_domain->domain)) {
+		kfree(qcom_domain);
+		return NULL;
+	}
+
+	mutex_init(&qcom_domain->init_mutex);
+	spin_lock_init(&qcom_domain->pgtbl_lock);
+
+	return &qcom_domain->domain;
+}
+
+static void qcom_iommu_domain_free(struct iommu_domain *domain)
+{
+	struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+
+	iommu_put_dma_cookie(domain);
+
+	if (qcom_domain->iommu) {
+		/*
+		 * NOTE: unmap can be called after client device is powered
+		 * off, for example, with GPUs or anything involving dma-buf.
+		 * So we cannot rely on the device_link.  Make sure the IOMMU
+		 * is on to avoid unclocked accesses in the TLB inv path:
+		 */
+		pm_runtime_get_sync(qcom_domain->iommu->dev);
+		free_io_pgtable_ops(qcom_domain->pgtbl_ops);
+		pm_runtime_put_sync(qcom_domain->iommu->dev);
+	}
+
+	kfree(qcom_domain);
+}
+
+static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+{
+	struct qcom_iommu_dev *qcom_iommu = to_iommu(dev->iommu_fwspec);
+	struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+	int ret;
+
+	if (!qcom_iommu) {
+		dev_err(dev, "cannot attach to IOMMU, is it on the same bus?\n");
+		return -ENXIO;
+	}
+
+	/* Ensure that the domain is finalized */
+	pm_runtime_get_sync(qcom_iommu->dev);
+	ret = qcom_iommu_init_domain(domain, qcom_iommu, dev->iommu_fwspec);
+	pm_runtime_put_sync(qcom_iommu->dev);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Sanity check the domain. We don't support domains across
+	 * different IOMMUs.
+	 */
+	if (qcom_domain->iommu != qcom_iommu) {
+		dev_err(dev, "cannot attach to IOMMU %s while already "
+			"attached to domain on IOMMU %s\n",
+			dev_name(qcom_domain->iommu->dev),
+			dev_name(qcom_iommu->dev));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void qcom_iommu_detach_dev(struct iommu_domain *domain, struct device *dev)
+{
+	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+	struct qcom_iommu_dev *qcom_iommu = to_iommu(fwspec);
+	struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+	unsigned i;
+
+	if (WARN_ON(!qcom_domain->iommu))
+		return;
+
+	pm_runtime_get_sync(qcom_iommu->dev);
+	for (i = 0; i < fwspec->num_ids; i++) {
+		struct qcom_iommu_ctx *ctx = to_ctx(fwspec, fwspec->ids[i]);
+
+		/* Disable the context bank: */
+		iommu_writel(ctx, ARM_SMMU_CB_SCTLR, 0);
+
+		ctx->domain = NULL;
+	}
+	pm_runtime_put_sync(qcom_iommu->dev);
+}
+
+static int qcom_iommu_map(struct iommu_domain *domain, unsigned long iova,
+			  phys_addr_t paddr, size_t size, int prot)
+{
+	int ret;
+	unsigned long flags;
+	struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+	struct io_pgtable_ops *ops = qcom_domain->pgtbl_ops;
+
+	if (!ops)
+		return -ENODEV;
+
+	spin_lock_irqsave(&qcom_domain->pgtbl_lock, flags);
+	ret = ops->map(ops, iova, paddr, size, prot);
+	spin_unlock_irqrestore(&qcom_domain->pgtbl_lock, flags);
+	return ret;
+}
+
+static size_t qcom_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
+			       size_t size)
+{
+	size_t ret;
+	unsigned long flags;
+	struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+	struct io_pgtable_ops *ops = qcom_domain->pgtbl_ops;
+
+	if (!ops)
+		return 0;
+
+	/* NOTE: unmap can be called after client device is powered off,
+	 * for example, with GPUs or anything involving dma-buf.  So we
+	 * cannot rely on the device_link.  Make sure the IOMMU is on to
+	 * avoid unclocked accesses in the TLB inv path:
+	 */
+	pm_runtime_get_sync(qcom_domain->iommu->dev);
+	spin_lock_irqsave(&qcom_domain->pgtbl_lock, flags);
+	ret = ops->unmap(ops, iova, size);
+	spin_unlock_irqrestore(&qcom_domain->pgtbl_lock, flags);
+	pm_runtime_put_sync(qcom_domain->iommu->dev);
+
+	return ret;
+}
+
+static void qcom_iommu_iotlb_sync(struct iommu_domain *domain)
+{
+	struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+	struct io_pgtable *pgtable = container_of(qcom_domain->pgtbl_ops,
+						  struct io_pgtable, ops);
+	if (!qcom_domain->pgtbl_ops)
+		return;
+
+	pm_runtime_get_sync(qcom_domain->iommu->dev);
+	qcom_iommu_tlb_sync(pgtable->cookie);
+	pm_runtime_put_sync(qcom_domain->iommu->dev);
+}
+
+static phys_addr_t qcom_iommu_iova_to_phys(struct iommu_domain *domain,
+					   dma_addr_t iova)
+{
+	phys_addr_t ret;
+	unsigned long flags;
+	struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+	struct io_pgtable_ops *ops = qcom_domain->pgtbl_ops;
+
+	if (!ops)
+		return 0;
+
+	spin_lock_irqsave(&qcom_domain->pgtbl_lock, flags);
+	ret = ops->iova_to_phys(ops, iova);
+	spin_unlock_irqrestore(&qcom_domain->pgtbl_lock, flags);
+
+	return ret;
+}
+
+static bool qcom_iommu_capable(enum iommu_cap cap)
+{
+	switch (cap) {
+	case IOMMU_CAP_CACHE_COHERENCY:
+		/*
+		 * Return true here as the SMMU can always send out coherent
+		 * requests.
+		 */
+		return true;
+	case IOMMU_CAP_NOEXEC:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static int qcom_iommu_add_device(struct device *dev)
+{
+	struct qcom_iommu_dev *qcom_iommu = to_iommu(dev->iommu_fwspec);
+	struct iommu_group *group;
+	struct device_link *link;
+
+	if (!qcom_iommu)
+		return -ENODEV;
+
+	/*
+	 * Establish the link between iommu and master, so that the
+	 * iommu gets runtime enabled/disabled as per the master's
+	 * needs.
+	 */
+	link = device_link_add(dev, qcom_iommu->dev, DL_FLAG_PM_RUNTIME);
+	if (!link) {
+		dev_err(qcom_iommu->dev, "Unable to create device link between %s and %s\n",
+			dev_name(qcom_iommu->dev), dev_name(dev));
+		return -ENODEV;
+	}
+
+	group = iommu_group_get_for_dev(dev);
+	if (IS_ERR_OR_NULL(group))
+		return PTR_ERR_OR_ZERO(group);
+
+	iommu_group_put(group);
+	iommu_device_link(&qcom_iommu->iommu, dev);
+
+	return 0;
+}
+
+static void qcom_iommu_remove_device(struct device *dev)
+{
+	struct qcom_iommu_dev *qcom_iommu = to_iommu(dev->iommu_fwspec);
+
+	if (!qcom_iommu)
+		return;
+
+	iommu_device_unlink(&qcom_iommu->iommu, dev);
+	iommu_group_remove_device(dev);
+	iommu_fwspec_free(dev);
+}
+
+static int qcom_iommu_of_xlate(struct device *dev, struct of_phandle_args *args)
+{
+	struct qcom_iommu_dev *qcom_iommu;
+	struct platform_device *iommu_pdev;
+	unsigned asid = args->args[0];
+
+	if (args->args_count != 1) {
+		dev_err(dev, "incorrect number of iommu params found for %s "
+			"(found %d, expected 1)\n",
+			args->np->full_name, args->args_count);
+		return -EINVAL;
+	}
+
+	iommu_pdev = of_find_device_by_node(args->np);
+	if (WARN_ON(!iommu_pdev))
+		return -EINVAL;
+
+	qcom_iommu = platform_get_drvdata(iommu_pdev);
+
+	/* make sure the asid specified in dt is valid, so we don't have
+	 * to sanity check this elsewhere, since 'asid - 1' is used to
+	 * index into qcom_iommu->ctxs:
+	 */
+	if (WARN_ON(asid < 1) ||
+	    WARN_ON(asid > qcom_iommu->num_ctxs))
+		return -EINVAL;
+
+	if (!dev->iommu_fwspec->iommu_priv) {
+		dev->iommu_fwspec->iommu_priv = qcom_iommu;
+	} else {
+		/* make sure devices iommus dt node isn't referring to
+		 * multiple different iommu devices.  Multiple context
+		 * banks are ok, but multiple devices are not:
+		 */
+		if (WARN_ON(qcom_iommu != dev->iommu_fwspec->iommu_priv))
+			return -EINVAL;
+	}
+
+	return iommu_fwspec_add_ids(dev, &asid, 1);
+}
+
+static const struct iommu_ops qcom_iommu_ops = {
+	.capable	= qcom_iommu_capable,
+	.domain_alloc	= qcom_iommu_domain_alloc,
+	.domain_free	= qcom_iommu_domain_free,
+	.attach_dev	= qcom_iommu_attach_dev,
+	.detach_dev	= qcom_iommu_detach_dev,
+	.map		= qcom_iommu_map,
+	.unmap		= qcom_iommu_unmap,
+	.flush_iotlb_all = qcom_iommu_iotlb_sync,
+	.iotlb_sync	= qcom_iommu_iotlb_sync,
+	.iova_to_phys	= qcom_iommu_iova_to_phys,
+	.add_device	= qcom_iommu_add_device,
+	.remove_device	= qcom_iommu_remove_device,
+	.device_group	= generic_device_group,
+	.of_xlate	= qcom_iommu_of_xlate,
+	.pgsize_bitmap	= SZ_4K | SZ_64K | SZ_1M | SZ_16M,
+};
+
+static int qcom_iommu_enable_clocks(struct qcom_iommu_dev *qcom_iommu)
+{
+	int ret;
+
+	ret = clk_prepare_enable(qcom_iommu->iface_clk);
+	if (ret) {
+		dev_err(qcom_iommu->dev, "Couldn't enable iface_clk\n");
+		return ret;
+	}
+
+	ret = clk_prepare_enable(qcom_iommu->bus_clk);
+	if (ret) {
+		dev_err(qcom_iommu->dev, "Couldn't enable bus_clk\n");
+		clk_disable_unprepare(qcom_iommu->iface_clk);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void qcom_iommu_disable_clocks(struct qcom_iommu_dev *qcom_iommu)
+{
+	clk_disable_unprepare(qcom_iommu->bus_clk);
+	clk_disable_unprepare(qcom_iommu->iface_clk);
+}
+
+static int qcom_iommu_sec_ptbl_init(struct device *dev)
+{
+	size_t psize = 0;
+	unsigned int spare = 0;
+	void *cpu_addr;
+	dma_addr_t paddr;
+	unsigned long attrs;
+	static bool allocated = false;
+	int ret;
+
+	if (allocated)
+		return 0;
+
+	ret = qcom_scm_iommu_secure_ptbl_size(spare, &psize);
+	if (ret) {
+		dev_err(dev, "failed to get iommu secure pgtable size (%d)\n",
+			ret);
+		return ret;
+	}
+
+	dev_info(dev, "iommu sec: pgtable size: %zu\n", psize);
+
+	attrs = DMA_ATTR_NO_KERNEL_MAPPING;
+
+	cpu_addr = dma_alloc_attrs(dev, psize, &paddr, GFP_KERNEL, attrs);
+	if (!cpu_addr) {
+		dev_err(dev, "failed to allocate %zu bytes for pgtable\n",
+			psize);
+		return -ENOMEM;
+	}
+
+	ret = qcom_scm_iommu_secure_ptbl_init(paddr, psize, spare);
+	if (ret) {
+		dev_err(dev, "failed to init iommu pgtable (%d)\n", ret);
+		goto free_mem;
+	}
+
+	allocated = true;
+	return 0;
+
+free_mem:
+	dma_free_attrs(dev, psize, cpu_addr, paddr, attrs);
+	return ret;
+}
+
+static int get_asid(const struct device_node *np)
+{
+	u32 reg;
+
+	/* read the "reg" property directly to get the relative address
+	 * of the context bank, and calculate the asid from that:
+	 */
+	if (of_property_read_u32_index(np, "reg", 0, &reg))
+		return -ENODEV;
+
+	return reg / 0x1000;      /* context banks are 0x1000 apart */
+}
+
+static int qcom_iommu_ctx_probe(struct platform_device *pdev)
+{
+	struct qcom_iommu_ctx *ctx;
+	struct device *dev = &pdev->dev;
+	struct qcom_iommu_dev *qcom_iommu = dev_get_drvdata(dev->parent);
+	struct resource *res;
+	int ret, irq;
+
+	ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->dev = dev;
+	platform_set_drvdata(pdev, ctx);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	ctx->base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(ctx->base))
+		return PTR_ERR(ctx->base);
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0) {
+		dev_err(dev, "failed to get irq\n");
+		return -ENODEV;
+	}
+
+	/* clear IRQs before registering fault handler, just in case the
+	 * boot-loader left us a surprise:
+	 */
+	iommu_writel(ctx, ARM_SMMU_CB_FSR, iommu_readl(ctx, ARM_SMMU_CB_FSR));
+
+	ret = devm_request_irq(dev, irq,
+			       qcom_iommu_fault,
+			       IRQF_SHARED,
+			       "qcom-iommu-fault",
+			       ctx);
+	if (ret) {
+		dev_err(dev, "failed to request IRQ %u\n", irq);
+		return ret;
+	}
+
+	ret = get_asid(dev->of_node);
+	if (ret < 0) {
+		dev_err(dev, "missing reg property\n");
+		return ret;
+	}
+
+	ctx->asid = ret;
+
+	dev_dbg(dev, "found asid %u\n", ctx->asid);
+
+	qcom_iommu->ctxs[ctx->asid - 1] = ctx;
+
+	return 0;
+}
+
+static int qcom_iommu_ctx_remove(struct platform_device *pdev)
+{
+	struct qcom_iommu_dev *qcom_iommu = dev_get_drvdata(pdev->dev.parent);
+	struct qcom_iommu_ctx *ctx = platform_get_drvdata(pdev);
+
+	platform_set_drvdata(pdev, NULL);
+
+	qcom_iommu->ctxs[ctx->asid - 1] = NULL;
+
+	return 0;
+}
+
+static const struct of_device_id ctx_of_match[] = {
+	{ .compatible = "qcom,msm-iommu-v1-ns" },
+	{ .compatible = "qcom,msm-iommu-v1-sec" },
+	{ /* sentinel */ }
+};
+
+static struct platform_driver qcom_iommu_ctx_driver = {
+	.driver	= {
+		.name		= "qcom-iommu-ctx",
+		.of_match_table	= of_match_ptr(ctx_of_match),
+	},
+	.probe	= qcom_iommu_ctx_probe,
+	.remove = qcom_iommu_ctx_remove,
+};
+
+static bool qcom_iommu_has_secure_context(struct qcom_iommu_dev *qcom_iommu)
+{
+	struct device_node *child;
+
+	for_each_child_of_node(qcom_iommu->dev->of_node, child)
+		if (of_device_is_compatible(child, "qcom,msm-iommu-v1-sec"))
+			return true;
+
+	return false;
+}
+
+static int qcom_iommu_device_probe(struct platform_device *pdev)
+{
+	struct device_node *child;
+	struct qcom_iommu_dev *qcom_iommu;
+	struct device *dev = &pdev->dev;
+	struct resource *res;
+	int ret, sz, max_asid = 0;
+
+	/* find the max asid (which is 1:1 to ctx bank idx), so we know how
+	 * many child ctx devices we have:
+	 */
+	for_each_child_of_node(dev->of_node, child)
+		max_asid = max(max_asid, get_asid(child));
+
+	sz = sizeof(*qcom_iommu) + (max_asid * sizeof(qcom_iommu->ctxs[0]));
+
+	qcom_iommu = devm_kzalloc(dev, sz, GFP_KERNEL);
+	if (!qcom_iommu)
+		return -ENOMEM;
+	qcom_iommu->num_ctxs = max_asid;
+	qcom_iommu->dev = dev;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (res) {
+		qcom_iommu->local_base = devm_ioremap_resource(dev, res);
+		if (IS_ERR(qcom_iommu->local_base))
+			return PTR_ERR(qcom_iommu->local_base);
+	}
+
+	qcom_iommu->iface_clk = devm_clk_get(dev, "iface");
+	if (IS_ERR(qcom_iommu->iface_clk)) {
+		dev_err(dev, "failed to get iface clock\n");
+		return PTR_ERR(qcom_iommu->iface_clk);
+	}
+
+	qcom_iommu->bus_clk = devm_clk_get(dev, "bus");
+	if (IS_ERR(qcom_iommu->bus_clk)) {
+		dev_err(dev, "failed to get bus clock\n");
+		return PTR_ERR(qcom_iommu->bus_clk);
+	}
+
+	if (of_property_read_u32(dev->of_node, "qcom,iommu-secure-id",
+				 &qcom_iommu->sec_id)) {
+		dev_err(dev, "missing qcom,iommu-secure-id property\n");
+		return -ENODEV;
+	}
+
+	if (qcom_iommu_has_secure_context(qcom_iommu)) {
+		ret = qcom_iommu_sec_ptbl_init(dev);
+		if (ret) {
+			dev_err(dev, "cannot init secure pg table(%d)\n", ret);
+			return ret;
+		}
+	}
+
+	platform_set_drvdata(pdev, qcom_iommu);
+
+	pm_runtime_enable(dev);
+
+	/* register context bank devices, which are child nodes: */
+	ret = devm_of_platform_populate(dev);
+	if (ret) {
+		dev_err(dev, "Failed to populate iommu contexts\n");
+		return ret;
+	}
+
+	ret = iommu_device_sysfs_add(&qcom_iommu->iommu, dev, NULL,
+				     dev_name(dev));
+	if (ret) {
+		dev_err(dev, "Failed to register iommu in sysfs\n");
+		return ret;
+	}
+
+	iommu_device_set_ops(&qcom_iommu->iommu, &qcom_iommu_ops);
+	iommu_device_set_fwnode(&qcom_iommu->iommu, dev->fwnode);
+
+	ret = iommu_device_register(&qcom_iommu->iommu);
+	if (ret) {
+		dev_err(dev, "Failed to register iommu\n");
+		return ret;
+	}
+
+	bus_set_iommu(&platform_bus_type, &qcom_iommu_ops);
+
+	if (qcom_iommu->local_base) {
+		pm_runtime_get_sync(dev);
+		writel_relaxed(0xffffffff, qcom_iommu->local_base + SMMU_INTR_SEL_NS);
+		pm_runtime_put_sync(dev);
+	}
+
+	return 0;
+}
+
+static int qcom_iommu_device_remove(struct platform_device *pdev)
+{
+	struct qcom_iommu_dev *qcom_iommu = platform_get_drvdata(pdev);
+
+	bus_set_iommu(&platform_bus_type, NULL);
+
+	pm_runtime_force_suspend(&pdev->dev);
+	platform_set_drvdata(pdev, NULL);
+	iommu_device_sysfs_remove(&qcom_iommu->iommu);
+	iommu_device_unregister(&qcom_iommu->iommu);
+
+	return 0;
+}
+
+static int __maybe_unused qcom_iommu_resume(struct device *dev)
+{
+	struct qcom_iommu_dev *qcom_iommu = dev_get_drvdata(dev);
+
+	return qcom_iommu_enable_clocks(qcom_iommu);
+}
+
+static int __maybe_unused qcom_iommu_suspend(struct device *dev)
+{
+	struct qcom_iommu_dev *qcom_iommu = dev_get_drvdata(dev);
+
+	qcom_iommu_disable_clocks(qcom_iommu);
+
+	return 0;
+}
+
+static const struct dev_pm_ops qcom_iommu_pm_ops = {
+	SET_RUNTIME_PM_OPS(qcom_iommu_suspend, qcom_iommu_resume, NULL)
+	SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend,
+				pm_runtime_force_resume)
+};
+
+static const struct of_device_id qcom_iommu_of_match[] = {
+	{ .compatible = "qcom,msm-iommu-v1" },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, qcom_iommu_of_match);
+
+static struct platform_driver qcom_iommu_driver = {
+	.driver	= {
+		.name		= "qcom-iommu",
+		.of_match_table	= of_match_ptr(qcom_iommu_of_match),
+		.pm		= &qcom_iommu_pm_ops,
+	},
+	.probe	= qcom_iommu_device_probe,
+	.remove	= qcom_iommu_device_remove,
+};
+
+static int __init qcom_iommu_init(void)
+{
+	int ret;
+
+	ret = platform_driver_register(&qcom_iommu_ctx_driver);
+	if (ret)
+		return ret;
+
+	ret = platform_driver_register(&qcom_iommu_driver);
+	if (ret)
+		platform_driver_unregister(&qcom_iommu_ctx_driver);
+
+	return ret;
+}
+
+static void __exit qcom_iommu_exit(void)
+{
+	platform_driver_unregister(&qcom_iommu_driver);
+	platform_driver_unregister(&qcom_iommu_ctx_driver);
+}
+
+module_init(qcom_iommu_init);
+module_exit(qcom_iommu_exit);
+
+MODULE_DESCRIPTION("IOMMU API for QCOM IOMMU v1 implementations");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
new file mode 100644
index 000000000..140b287e8
--- /dev/null
+++ b/drivers/iommu/rockchip-iommu.c
@@ -0,0 +1,1309 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/clk.h>
+#include <linux/compiler.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-iommu.h>
+#include <linux/dma-mapping.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_iommu.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+/** MMU register offsets */
+#define RK_MMU_DTE_ADDR		0x00	/* Directory table address */
+#define RK_MMU_STATUS		0x04
+#define RK_MMU_COMMAND		0x08
+#define RK_MMU_PAGE_FAULT_ADDR	0x0C	/* IOVA of last page fault */
+#define RK_MMU_ZAP_ONE_LINE	0x10	/* Shootdown one IOTLB entry */
+#define RK_MMU_INT_RAWSTAT	0x14	/* IRQ status ignoring mask */
+#define RK_MMU_INT_CLEAR	0x18	/* Acknowledge and re-arm irq */
+#define RK_MMU_INT_MASK		0x1C	/* IRQ enable */
+#define RK_MMU_INT_STATUS	0x20	/* IRQ status after masking */
+#define RK_MMU_AUTO_GATING	0x24
+
+#define DTE_ADDR_DUMMY		0xCAFEBABE
+
+#define RK_MMU_POLL_PERIOD_US		100
+#define RK_MMU_FORCE_RESET_TIMEOUT_US	100000
+#define RK_MMU_POLL_TIMEOUT_US		1000
+
+/* RK_MMU_STATUS fields */
+#define RK_MMU_STATUS_PAGING_ENABLED       BIT(0)
+#define RK_MMU_STATUS_PAGE_FAULT_ACTIVE    BIT(1)
+#define RK_MMU_STATUS_STALL_ACTIVE         BIT(2)
+#define RK_MMU_STATUS_IDLE                 BIT(3)
+#define RK_MMU_STATUS_REPLAY_BUFFER_EMPTY  BIT(4)
+#define RK_MMU_STATUS_PAGE_FAULT_IS_WRITE  BIT(5)
+#define RK_MMU_STATUS_STALL_NOT_ACTIVE     BIT(31)
+
+/* RK_MMU_COMMAND command values */
+#define RK_MMU_CMD_ENABLE_PAGING    0  /* Enable memory translation */
+#define RK_MMU_CMD_DISABLE_PAGING   1  /* Disable memory translation */
+#define RK_MMU_CMD_ENABLE_STALL     2  /* Stall paging to allow other cmds */
+#define RK_MMU_CMD_DISABLE_STALL    3  /* Stop stall re-enables paging */
+#define RK_MMU_CMD_ZAP_CACHE        4  /* Shoot down entire IOTLB */
+#define RK_MMU_CMD_PAGE_FAULT_DONE  5  /* Clear page fault */
+#define RK_MMU_CMD_FORCE_RESET      6  /* Reset all registers */
+
+/* RK_MMU_INT_* register fields */
+#define RK_MMU_IRQ_PAGE_FAULT    0x01  /* page fault */
+#define RK_MMU_IRQ_BUS_ERROR     0x02  /* bus read error */
+#define RK_MMU_IRQ_MASK          (RK_MMU_IRQ_PAGE_FAULT | RK_MMU_IRQ_BUS_ERROR)
+
+#define NUM_DT_ENTRIES 1024
+#define NUM_PT_ENTRIES 1024
+
+#define SPAGE_ORDER 12
+#define SPAGE_SIZE (1 << SPAGE_ORDER)
+
+ /*
+  * Support mapping any size that fits in one page table:
+  *   4 KiB to 4 MiB
+  */
+#define RK_IOMMU_PGSIZE_BITMAP 0x007ff000
+
+struct rk_iommu_domain {
+	struct list_head iommus;
+	u32 *dt; /* page directory table */
+	dma_addr_t dt_dma;
+	spinlock_t iommus_lock; /* lock for iommus list */
+	spinlock_t dt_lock; /* lock for modifying page directory table */
+
+	struct iommu_domain domain;
+};
+
+/* list of clocks required by IOMMU */
+static const char * const rk_iommu_clocks[] = {
+	"aclk", "iface",
+};
+
+struct rk_iommu {
+	struct device *dev;
+	void __iomem **bases;
+	int num_mmu;
+	struct clk_bulk_data *clocks;
+	int num_clocks;
+	bool reset_disabled;
+	struct iommu_device iommu;
+	struct list_head node; /* entry in rk_iommu_domain.iommus */
+	struct iommu_domain *domain; /* domain to which iommu is attached */
+	struct iommu_group *group;
+};
+
+struct rk_iommudata {
+	struct device_link *link; /* runtime PM link from IOMMU to master */
+	struct rk_iommu *iommu;
+};
+
+static struct device *dma_dev;
+
+static inline void rk_table_flush(struct rk_iommu_domain *dom, dma_addr_t dma,
+				  unsigned int count)
+{
+	size_t size = count * sizeof(u32); /* count of u32 entry */
+
+	dma_sync_single_for_device(dma_dev, dma, size, DMA_TO_DEVICE);
+}
+
+static struct rk_iommu_domain *to_rk_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct rk_iommu_domain, domain);
+}
+
+/*
+ * The Rockchip rk3288 iommu uses a 2-level page table.
+ * The first level is the "Directory Table" (DT).
+ * The DT consists of 1024 4-byte Directory Table Entries (DTEs), each pointing
+ * to a "Page Table".
+ * The second level is the 1024 Page Tables (PT).
+ * Each PT consists of 1024 4-byte Page Table Entries (PTEs), each pointing to
+ * a 4 KB page of physical memory.
+ *
+ * The DT and each PT fits in a single 4 KB page (4-bytes * 1024 entries).
+ * Each iommu device has a MMU_DTE_ADDR register that contains the physical
+ * address of the start of the DT page.
+ *
+ * The structure of the page table is as follows:
+ *
+ *                   DT
+ * MMU_DTE_ADDR -> +-----+
+ *                 |     |
+ *                 +-----+     PT
+ *                 | DTE | -> +-----+
+ *                 +-----+    |     |     Memory
+ *                 |     |    +-----+     Page
+ *                 |     |    | PTE | -> +-----+
+ *                 +-----+    +-----+    |     |
+ *                            |     |    |     |
+ *                            |     |    |     |
+ *                            +-----+    |     |
+ *                                       |     |
+ *                                       |     |
+ *                                       +-----+
+ */
+
+/*
+ * Each DTE has a PT address and a valid bit:
+ * +---------------------+-----------+-+
+ * | PT address          | Reserved  |V|
+ * +---------------------+-----------+-+
+ *  31:12 - PT address (PTs always starts on a 4 KB boundary)
+ *  11: 1 - Reserved
+ *      0 - 1 if PT @ PT address is valid
+ */
+#define RK_DTE_PT_ADDRESS_MASK    0xfffff000
+#define RK_DTE_PT_VALID           BIT(0)
+
+static inline phys_addr_t rk_dte_pt_address(u32 dte)
+{
+	return (phys_addr_t)dte & RK_DTE_PT_ADDRESS_MASK;
+}
+
+static inline bool rk_dte_is_pt_valid(u32 dte)
+{
+	return dte & RK_DTE_PT_VALID;
+}
+
+static inline u32 rk_mk_dte(dma_addr_t pt_dma)
+{
+	return (pt_dma & RK_DTE_PT_ADDRESS_MASK) | RK_DTE_PT_VALID;
+}
+
+/*
+ * Each PTE has a Page address, some flags and a valid bit:
+ * +---------------------+---+-------+-+
+ * | Page address        |Rsv| Flags |V|
+ * +---------------------+---+-------+-+
+ *  31:12 - Page address (Pages always start on a 4 KB boundary)
+ *  11: 9 - Reserved
+ *   8: 1 - Flags
+ *      8 - Read allocate - allocate cache space on read misses
+ *      7 - Read cache - enable cache & prefetch of data
+ *      6 - Write buffer - enable delaying writes on their way to memory
+ *      5 - Write allocate - allocate cache space on write misses
+ *      4 - Write cache - different writes can be merged together
+ *      3 - Override cache attributes
+ *          if 1, bits 4-8 control cache attributes
+ *          if 0, the system bus defaults are used
+ *      2 - Writable
+ *      1 - Readable
+ *      0 - 1 if Page @ Page address is valid
+ */
+#define RK_PTE_PAGE_ADDRESS_MASK  0xfffff000
+#define RK_PTE_PAGE_FLAGS_MASK    0x000001fe
+#define RK_PTE_PAGE_WRITABLE      BIT(2)
+#define RK_PTE_PAGE_READABLE      BIT(1)
+#define RK_PTE_PAGE_VALID         BIT(0)
+
+static inline phys_addr_t rk_pte_page_address(u32 pte)
+{
+	return (phys_addr_t)pte & RK_PTE_PAGE_ADDRESS_MASK;
+}
+
+static inline bool rk_pte_is_page_valid(u32 pte)
+{
+	return pte & RK_PTE_PAGE_VALID;
+}
+
+/* TODO: set cache flags per prot IOMMU_CACHE */
+static u32 rk_mk_pte(phys_addr_t page, int prot)
+{
+	u32 flags = 0;
+	flags |= (prot & IOMMU_READ) ? RK_PTE_PAGE_READABLE : 0;
+	flags |= (prot & IOMMU_WRITE) ? RK_PTE_PAGE_WRITABLE : 0;
+	page &= RK_PTE_PAGE_ADDRESS_MASK;
+	return page | flags | RK_PTE_PAGE_VALID;
+}
+
+static u32 rk_mk_pte_invalid(u32 pte)
+{
+	return pte & ~RK_PTE_PAGE_VALID;
+}
+
+/*
+ * rk3288 iova (IOMMU Virtual Address) format
+ *  31       22.21       12.11          0
+ * +-----------+-----------+-------------+
+ * | DTE index | PTE index | Page offset |
+ * +-----------+-----------+-------------+
+ *  31:22 - DTE index   - index of DTE in DT
+ *  21:12 - PTE index   - index of PTE in PT @ DTE.pt_address
+ *  11: 0 - Page offset - offset into page @ PTE.page_address
+ */
+#define RK_IOVA_DTE_MASK    0xffc00000
+#define RK_IOVA_DTE_SHIFT   22
+#define RK_IOVA_PTE_MASK    0x003ff000
+#define RK_IOVA_PTE_SHIFT   12
+#define RK_IOVA_PAGE_MASK   0x00000fff
+#define RK_IOVA_PAGE_SHIFT  0
+
+static u32 rk_iova_dte_index(dma_addr_t iova)
+{
+	return (u32)(iova & RK_IOVA_DTE_MASK) >> RK_IOVA_DTE_SHIFT;
+}
+
+static u32 rk_iova_pte_index(dma_addr_t iova)
+{
+	return (u32)(iova & RK_IOVA_PTE_MASK) >> RK_IOVA_PTE_SHIFT;
+}
+
+static u32 rk_iova_page_offset(dma_addr_t iova)
+{
+	return (u32)(iova & RK_IOVA_PAGE_MASK) >> RK_IOVA_PAGE_SHIFT;
+}
+
+static u32 rk_iommu_read(void __iomem *base, u32 offset)
+{
+	return readl(base + offset);
+}
+
+static void rk_iommu_write(void __iomem *base, u32 offset, u32 value)
+{
+	writel(value, base + offset);
+}
+
+static void rk_iommu_command(struct rk_iommu *iommu, u32 command)
+{
+	int i;
+
+	for (i = 0; i < iommu->num_mmu; i++)
+		writel(command, iommu->bases[i] + RK_MMU_COMMAND);
+}
+
+static void rk_iommu_base_command(void __iomem *base, u32 command)
+{
+	writel(command, base + RK_MMU_COMMAND);
+}
+static void rk_iommu_zap_lines(struct rk_iommu *iommu, dma_addr_t iova_start,
+			       size_t size)
+{
+	int i;
+	dma_addr_t iova_end = iova_start + size;
+	/*
+	 * TODO(djkurtz): Figure out when it is more efficient to shootdown the
+	 * entire iotlb rather than iterate over individual iovas.
+	 */
+	for (i = 0; i < iommu->num_mmu; i++) {
+		dma_addr_t iova;
+
+		for (iova = iova_start; iova < iova_end; iova += SPAGE_SIZE)
+			rk_iommu_write(iommu->bases[i], RK_MMU_ZAP_ONE_LINE, iova);
+	}
+}
+
+static bool rk_iommu_is_stall_active(struct rk_iommu *iommu)
+{
+	bool active = true;
+	int i;
+
+	for (i = 0; i < iommu->num_mmu; i++)
+		active &= !!(rk_iommu_read(iommu->bases[i], RK_MMU_STATUS) &
+					   RK_MMU_STATUS_STALL_ACTIVE);
+
+	return active;
+}
+
+static bool rk_iommu_is_paging_enabled(struct rk_iommu *iommu)
+{
+	bool enable = true;
+	int i;
+
+	for (i = 0; i < iommu->num_mmu; i++)
+		enable &= !!(rk_iommu_read(iommu->bases[i], RK_MMU_STATUS) &
+					   RK_MMU_STATUS_PAGING_ENABLED);
+
+	return enable;
+}
+
+static bool rk_iommu_is_reset_done(struct rk_iommu *iommu)
+{
+	bool done = true;
+	int i;
+
+	for (i = 0; i < iommu->num_mmu; i++)
+		done &= rk_iommu_read(iommu->bases[i], RK_MMU_DTE_ADDR) == 0;
+
+	return done;
+}
+
+static int rk_iommu_enable_stall(struct rk_iommu *iommu)
+{
+	int ret, i;
+	bool val;
+
+	if (rk_iommu_is_stall_active(iommu))
+		return 0;
+
+	/* Stall can only be enabled if paging is enabled */
+	if (!rk_iommu_is_paging_enabled(iommu))
+		return 0;
+
+	rk_iommu_command(iommu, RK_MMU_CMD_ENABLE_STALL);
+
+	ret = readx_poll_timeout(rk_iommu_is_stall_active, iommu, val,
+				 val, RK_MMU_POLL_PERIOD_US,
+				 RK_MMU_POLL_TIMEOUT_US);
+	if (ret)
+		for (i = 0; i < iommu->num_mmu; i++)
+			dev_err(iommu->dev, "Enable stall request timed out, status: %#08x\n",
+				rk_iommu_read(iommu->bases[i], RK_MMU_STATUS));
+
+	return ret;
+}
+
+static int rk_iommu_disable_stall(struct rk_iommu *iommu)
+{
+	int ret, i;
+	bool val;
+
+	if (!rk_iommu_is_stall_active(iommu))
+		return 0;
+
+	rk_iommu_command(iommu, RK_MMU_CMD_DISABLE_STALL);
+
+	ret = readx_poll_timeout(rk_iommu_is_stall_active, iommu, val,
+				 !val, RK_MMU_POLL_PERIOD_US,
+				 RK_MMU_POLL_TIMEOUT_US);
+	if (ret)
+		for (i = 0; i < iommu->num_mmu; i++)
+			dev_err(iommu->dev, "Disable stall request timed out, status: %#08x\n",
+				rk_iommu_read(iommu->bases[i], RK_MMU_STATUS));
+
+	return ret;
+}
+
+static int rk_iommu_enable_paging(struct rk_iommu *iommu)
+{
+	int ret, i;
+	bool val;
+
+	if (rk_iommu_is_paging_enabled(iommu))
+		return 0;
+
+	rk_iommu_command(iommu, RK_MMU_CMD_ENABLE_PAGING);
+
+	ret = readx_poll_timeout(rk_iommu_is_paging_enabled, iommu, val,
+				 val, RK_MMU_POLL_PERIOD_US,
+				 RK_MMU_POLL_TIMEOUT_US);
+	if (ret)
+		for (i = 0; i < iommu->num_mmu; i++)
+			dev_err(iommu->dev, "Enable paging request timed out, status: %#08x\n",
+				rk_iommu_read(iommu->bases[i], RK_MMU_STATUS));
+
+	return ret;
+}
+
+static int rk_iommu_disable_paging(struct rk_iommu *iommu)
+{
+	int ret, i;
+	bool val;
+
+	if (!rk_iommu_is_paging_enabled(iommu))
+		return 0;
+
+	rk_iommu_command(iommu, RK_MMU_CMD_DISABLE_PAGING);
+
+	ret = readx_poll_timeout(rk_iommu_is_paging_enabled, iommu, val,
+				 !val, RK_MMU_POLL_PERIOD_US,
+				 RK_MMU_POLL_TIMEOUT_US);
+	if (ret)
+		for (i = 0; i < iommu->num_mmu; i++)
+			dev_err(iommu->dev, "Disable paging request timed out, status: %#08x\n",
+				rk_iommu_read(iommu->bases[i], RK_MMU_STATUS));
+
+	return ret;
+}
+
+static int rk_iommu_force_reset(struct rk_iommu *iommu)
+{
+	int ret, i;
+	u32 dte_addr;
+	bool val;
+
+	if (iommu->reset_disabled)
+		return 0;
+
+	/*
+	 * Check if register DTE_ADDR is working by writing DTE_ADDR_DUMMY
+	 * and verifying that upper 5 nybbles are read back.
+	 */
+	for (i = 0; i < iommu->num_mmu; i++) {
+		rk_iommu_write(iommu->bases[i], RK_MMU_DTE_ADDR, DTE_ADDR_DUMMY);
+
+		dte_addr = rk_iommu_read(iommu->bases[i], RK_MMU_DTE_ADDR);
+		if (dte_addr != (DTE_ADDR_DUMMY & RK_DTE_PT_ADDRESS_MASK)) {
+			dev_err(iommu->dev, "Error during raw reset. MMU_DTE_ADDR is not functioning\n");
+			return -EFAULT;
+		}
+	}
+
+	rk_iommu_command(iommu, RK_MMU_CMD_FORCE_RESET);
+
+	ret = readx_poll_timeout(rk_iommu_is_reset_done, iommu, val,
+				 val, RK_MMU_FORCE_RESET_TIMEOUT_US,
+				 RK_MMU_POLL_TIMEOUT_US);
+	if (ret) {
+		dev_err(iommu->dev, "FORCE_RESET command timed out\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static void log_iova(struct rk_iommu *iommu, int index, dma_addr_t iova)
+{
+	void __iomem *base = iommu->bases[index];
+	u32 dte_index, pte_index, page_offset;
+	u32 mmu_dte_addr;
+	phys_addr_t mmu_dte_addr_phys, dte_addr_phys;
+	u32 *dte_addr;
+	u32 dte;
+	phys_addr_t pte_addr_phys = 0;
+	u32 *pte_addr = NULL;
+	u32 pte = 0;
+	phys_addr_t page_addr_phys = 0;
+	u32 page_flags = 0;
+
+	dte_index = rk_iova_dte_index(iova);
+	pte_index = rk_iova_pte_index(iova);
+	page_offset = rk_iova_page_offset(iova);
+
+	mmu_dte_addr = rk_iommu_read(base, RK_MMU_DTE_ADDR);
+	mmu_dte_addr_phys = (phys_addr_t)mmu_dte_addr;
+
+	dte_addr_phys = mmu_dte_addr_phys + (4 * dte_index);
+	dte_addr = phys_to_virt(dte_addr_phys);
+	dte = *dte_addr;
+
+	if (!rk_dte_is_pt_valid(dte))
+		goto print_it;
+
+	pte_addr_phys = rk_dte_pt_address(dte) + (pte_index * 4);
+	pte_addr = phys_to_virt(pte_addr_phys);
+	pte = *pte_addr;
+
+	if (!rk_pte_is_page_valid(pte))
+		goto print_it;
+
+	page_addr_phys = rk_pte_page_address(pte) + page_offset;
+	page_flags = pte & RK_PTE_PAGE_FLAGS_MASK;
+
+print_it:
+	dev_err(iommu->dev, "iova = %pad: dte_index: %#03x pte_index: %#03x page_offset: %#03x\n",
+		&iova, dte_index, pte_index, page_offset);
+	dev_err(iommu->dev, "mmu_dte_addr: %pa dte@%pa: %#08x valid: %u pte@%pa: %#08x valid: %u page@%pa flags: %#03x\n",
+		&mmu_dte_addr_phys, &dte_addr_phys, dte,
+		rk_dte_is_pt_valid(dte), &pte_addr_phys, pte,
+		rk_pte_is_page_valid(pte), &page_addr_phys, page_flags);
+}
+
+static irqreturn_t rk_iommu_irq(int irq, void *dev_id)
+{
+	struct rk_iommu *iommu = dev_id;
+	u32 status;
+	u32 int_status;
+	dma_addr_t iova;
+	irqreturn_t ret = IRQ_NONE;
+	int i, err;
+
+	err = pm_runtime_get_if_in_use(iommu->dev);
+	if (WARN_ON_ONCE(err <= 0))
+		return ret;
+
+	if (WARN_ON(clk_bulk_enable(iommu->num_clocks, iommu->clocks)))
+		goto out;
+
+	for (i = 0; i < iommu->num_mmu; i++) {
+		int_status = rk_iommu_read(iommu->bases[i], RK_MMU_INT_STATUS);
+		if (int_status == 0)
+			continue;
+
+		ret = IRQ_HANDLED;
+		iova = rk_iommu_read(iommu->bases[i], RK_MMU_PAGE_FAULT_ADDR);
+
+		if (int_status & RK_MMU_IRQ_PAGE_FAULT) {
+			int flags;
+
+			status = rk_iommu_read(iommu->bases[i], RK_MMU_STATUS);
+			flags = (status & RK_MMU_STATUS_PAGE_FAULT_IS_WRITE) ?
+					IOMMU_FAULT_WRITE : IOMMU_FAULT_READ;
+
+			dev_err(iommu->dev, "Page fault at %pad of type %s\n",
+				&iova,
+				(flags == IOMMU_FAULT_WRITE) ? "write" : "read");
+
+			log_iova(iommu, i, iova);
+
+			/*
+			 * Report page fault to any installed handlers.
+			 * Ignore the return code, though, since we always zap cache
+			 * and clear the page fault anyway.
+			 */
+			if (iommu->domain)
+				report_iommu_fault(iommu->domain, iommu->dev, iova,
+						   flags);
+			else
+				dev_err(iommu->dev, "Page fault while iommu not attached to domain?\n");
+
+			rk_iommu_base_command(iommu->bases[i], RK_MMU_CMD_ZAP_CACHE);
+			rk_iommu_base_command(iommu->bases[i], RK_MMU_CMD_PAGE_FAULT_DONE);
+		}
+
+		if (int_status & RK_MMU_IRQ_BUS_ERROR)
+			dev_err(iommu->dev, "BUS_ERROR occurred at %pad\n", &iova);
+
+		if (int_status & ~RK_MMU_IRQ_MASK)
+			dev_err(iommu->dev, "unexpected int_status: %#08x\n",
+				int_status);
+
+		rk_iommu_write(iommu->bases[i], RK_MMU_INT_CLEAR, int_status);
+	}
+
+	clk_bulk_disable(iommu->num_clocks, iommu->clocks);
+
+out:
+	pm_runtime_put(iommu->dev);
+	return ret;
+}
+
+static phys_addr_t rk_iommu_iova_to_phys(struct iommu_domain *domain,
+					 dma_addr_t iova)
+{
+	struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
+	unsigned long flags;
+	phys_addr_t pt_phys, phys = 0;
+	u32 dte, pte;
+	u32 *page_table;
+
+	spin_lock_irqsave(&rk_domain->dt_lock, flags);
+
+	dte = rk_domain->dt[rk_iova_dte_index(iova)];
+	if (!rk_dte_is_pt_valid(dte))
+		goto out;
+
+	pt_phys = rk_dte_pt_address(dte);
+	page_table = (u32 *)phys_to_virt(pt_phys);
+	pte = page_table[rk_iova_pte_index(iova)];
+	if (!rk_pte_is_page_valid(pte))
+		goto out;
+
+	phys = rk_pte_page_address(pte) + rk_iova_page_offset(iova);
+out:
+	spin_unlock_irqrestore(&rk_domain->dt_lock, flags);
+
+	return phys;
+}
+
+static void rk_iommu_zap_iova(struct rk_iommu_domain *rk_domain,
+			      dma_addr_t iova, size_t size)
+{
+	struct list_head *pos;
+	unsigned long flags;
+
+	/* shootdown these iova from all iommus using this domain */
+	spin_lock_irqsave(&rk_domain->iommus_lock, flags);
+	list_for_each(pos, &rk_domain->iommus) {
+		struct rk_iommu *iommu;
+		int ret;
+
+		iommu = list_entry(pos, struct rk_iommu, node);
+
+		/* Only zap TLBs of IOMMUs that are powered on. */
+		ret = pm_runtime_get_if_in_use(iommu->dev);
+		if (WARN_ON_ONCE(ret < 0))
+			continue;
+		if (ret) {
+			WARN_ON(clk_bulk_enable(iommu->num_clocks,
+						iommu->clocks));
+			rk_iommu_zap_lines(iommu, iova, size);
+			clk_bulk_disable(iommu->num_clocks, iommu->clocks);
+			pm_runtime_put(iommu->dev);
+		}
+	}
+	spin_unlock_irqrestore(&rk_domain->iommus_lock, flags);
+}
+
+static void rk_iommu_zap_iova_first_last(struct rk_iommu_domain *rk_domain,
+					 dma_addr_t iova, size_t size)
+{
+	rk_iommu_zap_iova(rk_domain, iova, SPAGE_SIZE);
+	if (size > SPAGE_SIZE)
+		rk_iommu_zap_iova(rk_domain, iova + size - SPAGE_SIZE,
+					SPAGE_SIZE);
+}
+
+static u32 *rk_dte_get_page_table(struct rk_iommu_domain *rk_domain,
+				  dma_addr_t iova)
+{
+	u32 *page_table, *dte_addr;
+	u32 dte_index, dte;
+	phys_addr_t pt_phys;
+	dma_addr_t pt_dma;
+
+	assert_spin_locked(&rk_domain->dt_lock);
+
+	dte_index = rk_iova_dte_index(iova);
+	dte_addr = &rk_domain->dt[dte_index];
+	dte = *dte_addr;
+	if (rk_dte_is_pt_valid(dte))
+		goto done;
+
+	page_table = (u32 *)get_zeroed_page(GFP_ATOMIC | GFP_DMA32);
+	if (!page_table)
+		return ERR_PTR(-ENOMEM);
+
+	pt_dma = dma_map_single(dma_dev, page_table, SPAGE_SIZE, DMA_TO_DEVICE);
+	if (dma_mapping_error(dma_dev, pt_dma)) {
+		dev_err(dma_dev, "DMA mapping error while allocating page table\n");
+		free_page((unsigned long)page_table);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	dte = rk_mk_dte(pt_dma);
+	*dte_addr = dte;
+
+	rk_table_flush(rk_domain, pt_dma, NUM_PT_ENTRIES);
+	rk_table_flush(rk_domain,
+		       rk_domain->dt_dma + dte_index * sizeof(u32), 1);
+done:
+	pt_phys = rk_dte_pt_address(dte);
+	return (u32 *)phys_to_virt(pt_phys);
+}
+
+static size_t rk_iommu_unmap_iova(struct rk_iommu_domain *rk_domain,
+				  u32 *pte_addr, dma_addr_t pte_dma,
+				  size_t size)
+{
+	unsigned int pte_count;
+	unsigned int pte_total = size / SPAGE_SIZE;
+
+	assert_spin_locked(&rk_domain->dt_lock);
+
+	for (pte_count = 0; pte_count < pte_total; pte_count++) {
+		u32 pte = pte_addr[pte_count];
+		if (!rk_pte_is_page_valid(pte))
+			break;
+
+		pte_addr[pte_count] = rk_mk_pte_invalid(pte);
+	}
+
+	rk_table_flush(rk_domain, pte_dma, pte_count);
+
+	return pte_count * SPAGE_SIZE;
+}
+
+static int rk_iommu_map_iova(struct rk_iommu_domain *rk_domain, u32 *pte_addr,
+			     dma_addr_t pte_dma, dma_addr_t iova,
+			     phys_addr_t paddr, size_t size, int prot)
+{
+	unsigned int pte_count;
+	unsigned int pte_total = size / SPAGE_SIZE;
+	phys_addr_t page_phys;
+
+	assert_spin_locked(&rk_domain->dt_lock);
+
+	for (pte_count = 0; pte_count < pte_total; pte_count++) {
+		u32 pte = pte_addr[pte_count];
+
+		if (rk_pte_is_page_valid(pte))
+			goto unwind;
+
+		pte_addr[pte_count] = rk_mk_pte(paddr, prot);
+
+		paddr += SPAGE_SIZE;
+	}
+
+	rk_table_flush(rk_domain, pte_dma, pte_total);
+
+	/*
+	 * Zap the first and last iova to evict from iotlb any previously
+	 * mapped cachelines holding stale values for its dte and pte.
+	 * We only zap the first and last iova, since only they could have
+	 * dte or pte shared with an existing mapping.
+	 */
+	rk_iommu_zap_iova_first_last(rk_domain, iova, size);
+
+	return 0;
+unwind:
+	/* Unmap the range of iovas that we just mapped */
+	rk_iommu_unmap_iova(rk_domain, pte_addr, pte_dma,
+			    pte_count * SPAGE_SIZE);
+
+	iova += pte_count * SPAGE_SIZE;
+	page_phys = rk_pte_page_address(pte_addr[pte_count]);
+	pr_err("iova: %pad already mapped to %pa cannot remap to phys: %pa prot: %#x\n",
+	       &iova, &page_phys, &paddr, prot);
+
+	return -EADDRINUSE;
+}
+
+static int rk_iommu_map(struct iommu_domain *domain, unsigned long _iova,
+			phys_addr_t paddr, size_t size, int prot)
+{
+	struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
+	unsigned long flags;
+	dma_addr_t pte_dma, iova = (dma_addr_t)_iova;
+	u32 *page_table, *pte_addr;
+	u32 dte_index, pte_index;
+	int ret;
+
+	spin_lock_irqsave(&rk_domain->dt_lock, flags);
+
+	/*
+	 * pgsize_bitmap specifies iova sizes that fit in one page table
+	 * (1024 4-KiB pages = 4 MiB).
+	 * So, size will always be 4096 <= size <= 4194304.
+	 * Since iommu_map() guarantees that both iova and size will be
+	 * aligned, we will always only be mapping from a single dte here.
+	 */
+	page_table = rk_dte_get_page_table(rk_domain, iova);
+	if (IS_ERR(page_table)) {
+		spin_unlock_irqrestore(&rk_domain->dt_lock, flags);
+		return PTR_ERR(page_table);
+	}
+
+	dte_index = rk_domain->dt[rk_iova_dte_index(iova)];
+	pte_index = rk_iova_pte_index(iova);
+	pte_addr = &page_table[pte_index];
+	pte_dma = rk_dte_pt_address(dte_index) + pte_index * sizeof(u32);
+	ret = rk_iommu_map_iova(rk_domain, pte_addr, pte_dma, iova,
+				paddr, size, prot);
+
+	spin_unlock_irqrestore(&rk_domain->dt_lock, flags);
+
+	return ret;
+}
+
+static size_t rk_iommu_unmap(struct iommu_domain *domain, unsigned long _iova,
+			     size_t size)
+{
+	struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
+	unsigned long flags;
+	dma_addr_t pte_dma, iova = (dma_addr_t)_iova;
+	phys_addr_t pt_phys;
+	u32 dte;
+	u32 *pte_addr;
+	size_t unmap_size;
+
+	spin_lock_irqsave(&rk_domain->dt_lock, flags);
+
+	/*
+	 * pgsize_bitmap specifies iova sizes that fit in one page table
+	 * (1024 4-KiB pages = 4 MiB).
+	 * So, size will always be 4096 <= size <= 4194304.
+	 * Since iommu_unmap() guarantees that both iova and size will be
+	 * aligned, we will always only be unmapping from a single dte here.
+	 */
+	dte = rk_domain->dt[rk_iova_dte_index(iova)];
+	/* Just return 0 if iova is unmapped */
+	if (!rk_dte_is_pt_valid(dte)) {
+		spin_unlock_irqrestore(&rk_domain->dt_lock, flags);
+		return 0;
+	}
+
+	pt_phys = rk_dte_pt_address(dte);
+	pte_addr = (u32 *)phys_to_virt(pt_phys) + rk_iova_pte_index(iova);
+	pte_dma = pt_phys + rk_iova_pte_index(iova) * sizeof(u32);
+	unmap_size = rk_iommu_unmap_iova(rk_domain, pte_addr, pte_dma, size);
+
+	spin_unlock_irqrestore(&rk_domain->dt_lock, flags);
+
+	/* Shootdown iotlb entries for iova range that was just unmapped */
+	rk_iommu_zap_iova(rk_domain, iova, unmap_size);
+
+	return unmap_size;
+}
+
+static struct rk_iommu *rk_iommu_from_dev(struct device *dev)
+{
+	struct rk_iommudata *data = dev->archdata.iommu;
+
+	return data ? data->iommu : NULL;
+}
+
+/* Must be called with iommu powered on and attached */
+static void rk_iommu_disable(struct rk_iommu *iommu)
+{
+	int i;
+
+	/* Ignore error while disabling, just keep going */
+	WARN_ON(clk_bulk_enable(iommu->num_clocks, iommu->clocks));
+	rk_iommu_enable_stall(iommu);
+	rk_iommu_disable_paging(iommu);
+	for (i = 0; i < iommu->num_mmu; i++) {
+		rk_iommu_write(iommu->bases[i], RK_MMU_INT_MASK, 0);
+		rk_iommu_write(iommu->bases[i], RK_MMU_DTE_ADDR, 0);
+	}
+	rk_iommu_disable_stall(iommu);
+	clk_bulk_disable(iommu->num_clocks, iommu->clocks);
+}
+
+/* Must be called with iommu powered on and attached */
+static int rk_iommu_enable(struct rk_iommu *iommu)
+{
+	struct iommu_domain *domain = iommu->domain;
+	struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
+	int ret, i;
+
+	ret = clk_bulk_enable(iommu->num_clocks, iommu->clocks);
+	if (ret)
+		return ret;
+
+	ret = rk_iommu_enable_stall(iommu);
+	if (ret)
+		goto out_disable_clocks;
+
+	ret = rk_iommu_force_reset(iommu);
+	if (ret)
+		goto out_disable_stall;
+
+	for (i = 0; i < iommu->num_mmu; i++) {
+		rk_iommu_write(iommu->bases[i], RK_MMU_DTE_ADDR,
+			       rk_domain->dt_dma);
+		rk_iommu_base_command(iommu->bases[i], RK_MMU_CMD_ZAP_CACHE);
+		rk_iommu_write(iommu->bases[i], RK_MMU_INT_MASK, RK_MMU_IRQ_MASK);
+	}
+
+	ret = rk_iommu_enable_paging(iommu);
+
+out_disable_stall:
+	rk_iommu_disable_stall(iommu);
+out_disable_clocks:
+	clk_bulk_disable(iommu->num_clocks, iommu->clocks);
+	return ret;
+}
+
+static void rk_iommu_detach_device(struct iommu_domain *domain,
+				   struct device *dev)
+{
+	struct rk_iommu *iommu;
+	struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
+	unsigned long flags;
+	int ret;
+
+	/* Allow 'virtual devices' (eg drm) to detach from domain */
+	iommu = rk_iommu_from_dev(dev);
+	if (!iommu)
+		return;
+
+	dev_dbg(dev, "Detaching from iommu domain\n");
+
+	/* iommu already detached */
+	if (iommu->domain != domain)
+		return;
+
+	iommu->domain = NULL;
+
+	spin_lock_irqsave(&rk_domain->iommus_lock, flags);
+	list_del_init(&iommu->node);
+	spin_unlock_irqrestore(&rk_domain->iommus_lock, flags);
+
+	ret = pm_runtime_get_if_in_use(iommu->dev);
+	WARN_ON_ONCE(ret < 0);
+	if (ret > 0) {
+		rk_iommu_disable(iommu);
+		pm_runtime_put(iommu->dev);
+	}
+}
+
+static int rk_iommu_attach_device(struct iommu_domain *domain,
+		struct device *dev)
+{
+	struct rk_iommu *iommu;
+	struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
+	unsigned long flags;
+	int ret;
+
+	/*
+	 * Allow 'virtual devices' (e.g., drm) to attach to domain.
+	 * Such a device does not belong to an iommu group.
+	 */
+	iommu = rk_iommu_from_dev(dev);
+	if (!iommu)
+		return 0;
+
+	dev_dbg(dev, "Attaching to iommu domain\n");
+
+	/* iommu already attached */
+	if (iommu->domain == domain)
+		return 0;
+
+	if (iommu->domain)
+		rk_iommu_detach_device(iommu->domain, dev);
+
+	iommu->domain = domain;
+
+	spin_lock_irqsave(&rk_domain->iommus_lock, flags);
+	list_add_tail(&iommu->node, &rk_domain->iommus);
+	spin_unlock_irqrestore(&rk_domain->iommus_lock, flags);
+
+	ret = pm_runtime_get_if_in_use(iommu->dev);
+	if (!ret || WARN_ON_ONCE(ret < 0))
+		return 0;
+
+	ret = rk_iommu_enable(iommu);
+	if (ret)
+		rk_iommu_detach_device(iommu->domain, dev);
+
+	pm_runtime_put(iommu->dev);
+
+	return ret;
+}
+
+static struct iommu_domain *rk_iommu_domain_alloc(unsigned type)
+{
+	struct rk_iommu_domain *rk_domain;
+
+	if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA)
+		return NULL;
+
+	if (!dma_dev)
+		return NULL;
+
+	rk_domain = kzalloc(sizeof(*rk_domain), GFP_KERNEL);
+	if (!rk_domain)
+		return NULL;
+
+	if (type == IOMMU_DOMAIN_DMA &&
+	    iommu_get_dma_cookie(&rk_domain->domain))
+		goto err_free_domain;
+
+	/*
+	 * rk32xx iommus use a 2 level pagetable.
+	 * Each level1 (dt) and level2 (pt) table has 1024 4-byte entries.
+	 * Allocate one 4 KiB page for each table.
+	 */
+	rk_domain->dt = (u32 *)get_zeroed_page(GFP_KERNEL | GFP_DMA32);
+	if (!rk_domain->dt)
+		goto err_put_cookie;
+
+	rk_domain->dt_dma = dma_map_single(dma_dev, rk_domain->dt,
+					   SPAGE_SIZE, DMA_TO_DEVICE);
+	if (dma_mapping_error(dma_dev, rk_domain->dt_dma)) {
+		dev_err(dma_dev, "DMA map error for DT\n");
+		goto err_free_dt;
+	}
+
+	rk_table_flush(rk_domain, rk_domain->dt_dma, NUM_DT_ENTRIES);
+
+	spin_lock_init(&rk_domain->iommus_lock);
+	spin_lock_init(&rk_domain->dt_lock);
+	INIT_LIST_HEAD(&rk_domain->iommus);
+
+	rk_domain->domain.geometry.aperture_start = 0;
+	rk_domain->domain.geometry.aperture_end   = DMA_BIT_MASK(32);
+	rk_domain->domain.geometry.force_aperture = true;
+
+	return &rk_domain->domain;
+
+err_free_dt:
+	free_page((unsigned long)rk_domain->dt);
+err_put_cookie:
+	if (type == IOMMU_DOMAIN_DMA)
+		iommu_put_dma_cookie(&rk_domain->domain);
+err_free_domain:
+	kfree(rk_domain);
+
+	return NULL;
+}
+
+static void rk_iommu_domain_free(struct iommu_domain *domain)
+{
+	struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
+	int i;
+
+	WARN_ON(!list_empty(&rk_domain->iommus));
+
+	for (i = 0; i < NUM_DT_ENTRIES; i++) {
+		u32 dte = rk_domain->dt[i];
+		if (rk_dte_is_pt_valid(dte)) {
+			phys_addr_t pt_phys = rk_dte_pt_address(dte);
+			u32 *page_table = phys_to_virt(pt_phys);
+			dma_unmap_single(dma_dev, pt_phys,
+					 SPAGE_SIZE, DMA_TO_DEVICE);
+			free_page((unsigned long)page_table);
+		}
+	}
+
+	dma_unmap_single(dma_dev, rk_domain->dt_dma,
+			 SPAGE_SIZE, DMA_TO_DEVICE);
+	free_page((unsigned long)rk_domain->dt);
+
+	if (domain->type == IOMMU_DOMAIN_DMA)
+		iommu_put_dma_cookie(&rk_domain->domain);
+	kfree(rk_domain);
+}
+
+static int rk_iommu_add_device(struct device *dev)
+{
+	struct iommu_group *group;
+	struct rk_iommu *iommu;
+	struct rk_iommudata *data;
+
+	data = dev->archdata.iommu;
+	if (!data)
+		return -ENODEV;
+
+	iommu = rk_iommu_from_dev(dev);
+
+	group = iommu_group_get_for_dev(dev);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+	iommu_group_put(group);
+
+	iommu_device_link(&iommu->iommu, dev);
+	data->link = device_link_add(dev, iommu->dev, DL_FLAG_PM_RUNTIME);
+
+	return 0;
+}
+
+static void rk_iommu_remove_device(struct device *dev)
+{
+	struct rk_iommu *iommu;
+	struct rk_iommudata *data = dev->archdata.iommu;
+
+	iommu = rk_iommu_from_dev(dev);
+
+	device_link_del(data->link);
+	iommu_device_unlink(&iommu->iommu, dev);
+	iommu_group_remove_device(dev);
+}
+
+static struct iommu_group *rk_iommu_device_group(struct device *dev)
+{
+	struct rk_iommu *iommu;
+
+	iommu = rk_iommu_from_dev(dev);
+
+	return iommu_group_ref_get(iommu->group);
+}
+
+static int rk_iommu_of_xlate(struct device *dev,
+			     struct of_phandle_args *args)
+{
+	struct platform_device *iommu_dev;
+	struct rk_iommudata *data;
+
+	data = devm_kzalloc(dma_dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	iommu_dev = of_find_device_by_node(args->np);
+
+	data->iommu = platform_get_drvdata(iommu_dev);
+	dev->archdata.iommu = data;
+
+	platform_device_put(iommu_dev);
+
+	return 0;
+}
+
+static const struct iommu_ops rk_iommu_ops = {
+	.domain_alloc = rk_iommu_domain_alloc,
+	.domain_free = rk_iommu_domain_free,
+	.attach_dev = rk_iommu_attach_device,
+	.detach_dev = rk_iommu_detach_device,
+	.map = rk_iommu_map,
+	.unmap = rk_iommu_unmap,
+	.add_device = rk_iommu_add_device,
+	.remove_device = rk_iommu_remove_device,
+	.iova_to_phys = rk_iommu_iova_to_phys,
+	.device_group = rk_iommu_device_group,
+	.pgsize_bitmap = RK_IOMMU_PGSIZE_BITMAP,
+	.of_xlate = rk_iommu_of_xlate,
+};
+
+static int rk_iommu_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct rk_iommu *iommu;
+	struct resource *res;
+	int num_res = pdev->num_resources;
+	int err, i, irq;
+
+	iommu = devm_kzalloc(dev, sizeof(*iommu), GFP_KERNEL);
+	if (!iommu)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, iommu);
+	iommu->dev = dev;
+	iommu->num_mmu = 0;
+
+	iommu->bases = devm_kcalloc(dev, num_res, sizeof(*iommu->bases),
+				    GFP_KERNEL);
+	if (!iommu->bases)
+		return -ENOMEM;
+
+	for (i = 0; i < num_res; i++) {
+		res = platform_get_resource(pdev, IORESOURCE_MEM, i);
+		if (!res)
+			continue;
+		iommu->bases[i] = devm_ioremap_resource(&pdev->dev, res);
+		if (IS_ERR(iommu->bases[i]))
+			continue;
+		iommu->num_mmu++;
+	}
+	if (iommu->num_mmu == 0)
+		return PTR_ERR(iommu->bases[0]);
+
+	iommu->reset_disabled = device_property_read_bool(dev,
+					"rockchip,disable-mmu-reset");
+
+	iommu->num_clocks = ARRAY_SIZE(rk_iommu_clocks);
+	iommu->clocks = devm_kcalloc(iommu->dev, iommu->num_clocks,
+				     sizeof(*iommu->clocks), GFP_KERNEL);
+	if (!iommu->clocks)
+		return -ENOMEM;
+
+	for (i = 0; i < iommu->num_clocks; ++i)
+		iommu->clocks[i].id = rk_iommu_clocks[i];
+
+	/*
+	 * iommu clocks should be present for all new devices and devicetrees
+	 * but there are older devicetrees without clocks out in the wild.
+	 * So clocks as optional for the time being.
+	 */
+	err = devm_clk_bulk_get(iommu->dev, iommu->num_clocks, iommu->clocks);
+	if (err == -ENOENT)
+		iommu->num_clocks = 0;
+	else if (err)
+		return err;
+
+	err = clk_bulk_prepare(iommu->num_clocks, iommu->clocks);
+	if (err)
+		return err;
+
+	iommu->group = iommu_group_alloc();
+	if (IS_ERR(iommu->group)) {
+		err = PTR_ERR(iommu->group);
+		goto err_unprepare_clocks;
+	}
+
+	err = iommu_device_sysfs_add(&iommu->iommu, dev, NULL, dev_name(dev));
+	if (err)
+		goto err_put_group;
+
+	iommu_device_set_ops(&iommu->iommu, &rk_iommu_ops);
+	iommu_device_set_fwnode(&iommu->iommu, &dev->of_node->fwnode);
+
+	err = iommu_device_register(&iommu->iommu);
+	if (err)
+		goto err_remove_sysfs;
+
+	/*
+	 * Use the first registered IOMMU device for domain to use with DMA
+	 * API, since a domain might not physically correspond to a single
+	 * IOMMU device..
+	 */
+	if (!dma_dev)
+		dma_dev = &pdev->dev;
+
+	bus_set_iommu(&platform_bus_type, &rk_iommu_ops);
+
+	pm_runtime_enable(dev);
+
+	i = 0;
+	while ((irq = platform_get_irq(pdev, i++)) != -ENXIO) {
+		if (irq < 0)
+			return irq;
+
+		err = devm_request_irq(iommu->dev, irq, rk_iommu_irq,
+				       IRQF_SHARED, dev_name(dev), iommu);
+		if (err) {
+			pm_runtime_disable(dev);
+			goto err_remove_sysfs;
+		}
+	}
+
+	return 0;
+err_remove_sysfs:
+	iommu_device_sysfs_remove(&iommu->iommu);
+err_put_group:
+	iommu_group_put(iommu->group);
+err_unprepare_clocks:
+	clk_bulk_unprepare(iommu->num_clocks, iommu->clocks);
+	return err;
+}
+
+static void rk_iommu_shutdown(struct platform_device *pdev)
+{
+	struct rk_iommu *iommu = platform_get_drvdata(pdev);
+	int i = 0, irq;
+
+	while ((irq = platform_get_irq(pdev, i++)) != -ENXIO)
+		devm_free_irq(iommu->dev, irq, iommu);
+
+	pm_runtime_force_suspend(&pdev->dev);
+}
+
+static int __maybe_unused rk_iommu_suspend(struct device *dev)
+{
+	struct rk_iommu *iommu = dev_get_drvdata(dev);
+
+	if (!iommu->domain)
+		return 0;
+
+	rk_iommu_disable(iommu);
+	return 0;
+}
+
+static int __maybe_unused rk_iommu_resume(struct device *dev)
+{
+	struct rk_iommu *iommu = dev_get_drvdata(dev);
+
+	if (!iommu->domain)
+		return 0;
+
+	return rk_iommu_enable(iommu);
+}
+
+static const struct dev_pm_ops rk_iommu_pm_ops = {
+	SET_RUNTIME_PM_OPS(rk_iommu_suspend, rk_iommu_resume, NULL)
+	SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend,
+				pm_runtime_force_resume)
+};
+
+static const struct of_device_id rk_iommu_dt_ids[] = {
+	{ .compatible = "rockchip,iommu" },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, rk_iommu_dt_ids);
+
+static struct platform_driver rk_iommu_driver = {
+	.probe = rk_iommu_probe,
+	.shutdown = rk_iommu_shutdown,
+	.driver = {
+		   .name = "rk_iommu",
+		   .of_match_table = rk_iommu_dt_ids,
+		   .pm = &rk_iommu_pm_ops,
+		   .suppress_bind_attrs = true,
+	},
+};
+
+static int __init rk_iommu_init(void)
+{
+	return platform_driver_register(&rk_iommu_driver);
+}
+subsys_initcall(rk_iommu_init);
+
+MODULE_DESCRIPTION("IOMMU API for Rockchip");
+MODULE_AUTHOR("Simon Xue <xxm@rock-chips.com> and Daniel Kurtz <djkurtz@chromium.org>");
+MODULE_ALIAS("platform:rockchip-iommu");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
new file mode 100644
index 000000000..22d4db302
--- /dev/null
+++ b/drivers/iommu/s390-iommu.c
@@ -0,0 +1,385 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * IOMMU API for s390 PCI devices
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com>
+ */
+
+#include <linux/pci.h>
+#include <linux/iommu.h>
+#include <linux/iommu-helper.h>
+#include <linux/sizes.h>
+#include <asm/pci_dma.h>
+
+/*
+ * Physically contiguous memory regions can be mapped with 4 KiB alignment,
+ * we allow all page sizes that are an order of 4KiB (no special large page
+ * support so far).
+ */
+#define S390_IOMMU_PGSIZES	(~0xFFFUL)
+
+static const struct iommu_ops s390_iommu_ops;
+
+struct s390_domain {
+	struct iommu_domain	domain;
+	struct list_head	devices;
+	unsigned long		*dma_table;
+	spinlock_t		dma_table_lock;
+	spinlock_t		list_lock;
+};
+
+struct s390_domain_device {
+	struct list_head	list;
+	struct zpci_dev		*zdev;
+};
+
+static struct s390_domain *to_s390_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct s390_domain, domain);
+}
+
+static bool s390_iommu_capable(enum iommu_cap cap)
+{
+	switch (cap) {
+	case IOMMU_CAP_CACHE_COHERENCY:
+		return true;
+	case IOMMU_CAP_INTR_REMAP:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static struct iommu_domain *s390_domain_alloc(unsigned domain_type)
+{
+	struct s390_domain *s390_domain;
+
+	if (domain_type != IOMMU_DOMAIN_UNMANAGED)
+		return NULL;
+
+	s390_domain = kzalloc(sizeof(*s390_domain), GFP_KERNEL);
+	if (!s390_domain)
+		return NULL;
+
+	s390_domain->dma_table = dma_alloc_cpu_table();
+	if (!s390_domain->dma_table) {
+		kfree(s390_domain);
+		return NULL;
+	}
+
+	spin_lock_init(&s390_domain->dma_table_lock);
+	spin_lock_init(&s390_domain->list_lock);
+	INIT_LIST_HEAD(&s390_domain->devices);
+
+	return &s390_domain->domain;
+}
+
+static void s390_domain_free(struct iommu_domain *domain)
+{
+	struct s390_domain *s390_domain = to_s390_domain(domain);
+
+	dma_cleanup_tables(s390_domain->dma_table);
+	kfree(s390_domain);
+}
+
+static int s390_iommu_attach_device(struct iommu_domain *domain,
+				    struct device *dev)
+{
+	struct s390_domain *s390_domain = to_s390_domain(domain);
+	struct zpci_dev *zdev = to_pci_dev(dev)->sysdata;
+	struct s390_domain_device *domain_device;
+	unsigned long flags;
+	int rc;
+
+	if (!zdev)
+		return -ENODEV;
+
+	domain_device = kzalloc(sizeof(*domain_device), GFP_KERNEL);
+	if (!domain_device)
+		return -ENOMEM;
+
+	if (zdev->dma_table)
+		zpci_dma_exit_device(zdev);
+
+	zdev->dma_table = s390_domain->dma_table;
+	rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
+				(u64) zdev->dma_table);
+	if (rc)
+		goto out_restore;
+
+	spin_lock_irqsave(&s390_domain->list_lock, flags);
+	/* First device defines the DMA range limits */
+	if (list_empty(&s390_domain->devices)) {
+		domain->geometry.aperture_start = zdev->start_dma;
+		domain->geometry.aperture_end = zdev->end_dma;
+		domain->geometry.force_aperture = true;
+	/* Allow only devices with identical DMA range limits */
+	} else if (domain->geometry.aperture_start != zdev->start_dma ||
+		   domain->geometry.aperture_end != zdev->end_dma) {
+		rc = -EINVAL;
+		spin_unlock_irqrestore(&s390_domain->list_lock, flags);
+		goto out_restore;
+	}
+	domain_device->zdev = zdev;
+	zdev->s390_domain = s390_domain;
+	list_add(&domain_device->list, &s390_domain->devices);
+	spin_unlock_irqrestore(&s390_domain->list_lock, flags);
+
+	return 0;
+
+out_restore:
+	zpci_dma_init_device(zdev);
+	kfree(domain_device);
+
+	return rc;
+}
+
+static void s390_iommu_detach_device(struct iommu_domain *domain,
+				     struct device *dev)
+{
+	struct s390_domain *s390_domain = to_s390_domain(domain);
+	struct zpci_dev *zdev = to_pci_dev(dev)->sysdata;
+	struct s390_domain_device *domain_device, *tmp;
+	unsigned long flags;
+	int found = 0;
+
+	if (!zdev)
+		return;
+
+	spin_lock_irqsave(&s390_domain->list_lock, flags);
+	list_for_each_entry_safe(domain_device, tmp, &s390_domain->devices,
+				 list) {
+		if (domain_device->zdev == zdev) {
+			list_del(&domain_device->list);
+			kfree(domain_device);
+			found = 1;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&s390_domain->list_lock, flags);
+
+	if (found) {
+		zdev->s390_domain = NULL;
+		zpci_unregister_ioat(zdev, 0);
+		zpci_dma_init_device(zdev);
+	}
+}
+
+static int s390_iommu_add_device(struct device *dev)
+{
+	struct iommu_group *group = iommu_group_get_for_dev(dev);
+	struct zpci_dev *zdev = to_pci_dev(dev)->sysdata;
+
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	iommu_group_put(group);
+	iommu_device_link(&zdev->iommu_dev, dev);
+
+	return 0;
+}
+
+static void s390_iommu_remove_device(struct device *dev)
+{
+	struct zpci_dev *zdev = to_pci_dev(dev)->sysdata;
+	struct iommu_domain *domain;
+
+	/*
+	 * This is a workaround for a scenario where the IOMMU API common code
+	 * "forgets" to call the detach_dev callback: After binding a device
+	 * to vfio-pci and completing the VFIO_SET_IOMMU ioctl (which triggers
+	 * the attach_dev), removing the device via
+	 * "echo 1 > /sys/bus/pci/devices/.../remove" won't trigger detach_dev,
+	 * only remove_device will be called via the BUS_NOTIFY_REMOVED_DEVICE
+	 * notifier.
+	 *
+	 * So let's call detach_dev from here if it hasn't been called before.
+	 */
+	if (zdev && zdev->s390_domain) {
+		domain = iommu_get_domain_for_dev(dev);
+		if (domain)
+			s390_iommu_detach_device(domain, dev);
+	}
+
+	iommu_device_unlink(&zdev->iommu_dev, dev);
+	iommu_group_remove_device(dev);
+}
+
+static int s390_iommu_update_trans(struct s390_domain *s390_domain,
+				   unsigned long pa, dma_addr_t dma_addr,
+				   size_t size, int flags)
+{
+	struct s390_domain_device *domain_device;
+	u8 *page_addr = (u8 *) (pa & PAGE_MASK);
+	dma_addr_t start_dma_addr = dma_addr;
+	unsigned long irq_flags, nr_pages, i;
+	unsigned long *entry;
+	int rc = 0;
+
+	if (dma_addr < s390_domain->domain.geometry.aperture_start ||
+	    dma_addr + size > s390_domain->domain.geometry.aperture_end)
+		return -EINVAL;
+
+	nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	if (!nr_pages)
+		return 0;
+
+	spin_lock_irqsave(&s390_domain->dma_table_lock, irq_flags);
+	for (i = 0; i < nr_pages; i++) {
+		entry = dma_walk_cpu_trans(s390_domain->dma_table, dma_addr);
+		if (!entry) {
+			rc = -ENOMEM;
+			goto undo_cpu_trans;
+		}
+		dma_update_cpu_trans(entry, page_addr, flags);
+		page_addr += PAGE_SIZE;
+		dma_addr += PAGE_SIZE;
+	}
+
+	spin_lock(&s390_domain->list_lock);
+	list_for_each_entry(domain_device, &s390_domain->devices, list) {
+		rc = zpci_refresh_trans((u64) domain_device->zdev->fh << 32,
+					start_dma_addr, nr_pages * PAGE_SIZE);
+		if (rc)
+			break;
+	}
+	spin_unlock(&s390_domain->list_lock);
+
+undo_cpu_trans:
+	if (rc && ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID)) {
+		flags = ZPCI_PTE_INVALID;
+		while (i-- > 0) {
+			page_addr -= PAGE_SIZE;
+			dma_addr -= PAGE_SIZE;
+			entry = dma_walk_cpu_trans(s390_domain->dma_table,
+						   dma_addr);
+			if (!entry)
+				break;
+			dma_update_cpu_trans(entry, page_addr, flags);
+		}
+	}
+	spin_unlock_irqrestore(&s390_domain->dma_table_lock, irq_flags);
+
+	return rc;
+}
+
+static int s390_iommu_map(struct iommu_domain *domain, unsigned long iova,
+			  phys_addr_t paddr, size_t size, int prot)
+{
+	struct s390_domain *s390_domain = to_s390_domain(domain);
+	int flags = ZPCI_PTE_VALID, rc = 0;
+
+	if (!(prot & IOMMU_READ))
+		return -EINVAL;
+
+	if (!(prot & IOMMU_WRITE))
+		flags |= ZPCI_TABLE_PROTECTED;
+
+	rc = s390_iommu_update_trans(s390_domain, (unsigned long) paddr, iova,
+				     size, flags);
+
+	return rc;
+}
+
+static phys_addr_t s390_iommu_iova_to_phys(struct iommu_domain *domain,
+					   dma_addr_t iova)
+{
+	struct s390_domain *s390_domain = to_s390_domain(domain);
+	unsigned long *sto, *pto, *rto, flags;
+	unsigned int rtx, sx, px;
+	phys_addr_t phys = 0;
+
+	if (iova < domain->geometry.aperture_start ||
+	    iova > domain->geometry.aperture_end)
+		return 0;
+
+	rtx = calc_rtx(iova);
+	sx = calc_sx(iova);
+	px = calc_px(iova);
+	rto = s390_domain->dma_table;
+
+	spin_lock_irqsave(&s390_domain->dma_table_lock, flags);
+	if (rto && reg_entry_isvalid(rto[rtx])) {
+		sto = get_rt_sto(rto[rtx]);
+		if (sto && reg_entry_isvalid(sto[sx])) {
+			pto = get_st_pto(sto[sx]);
+			if (pto && pt_entry_isvalid(pto[px]))
+				phys = pto[px] & ZPCI_PTE_ADDR_MASK;
+		}
+	}
+	spin_unlock_irqrestore(&s390_domain->dma_table_lock, flags);
+
+	return phys;
+}
+
+static size_t s390_iommu_unmap(struct iommu_domain *domain,
+			       unsigned long iova, size_t size)
+{
+	struct s390_domain *s390_domain = to_s390_domain(domain);
+	int flags = ZPCI_PTE_INVALID;
+	phys_addr_t paddr;
+	int rc;
+
+	paddr = s390_iommu_iova_to_phys(domain, iova);
+	if (!paddr)
+		return 0;
+
+	rc = s390_iommu_update_trans(s390_domain, (unsigned long) paddr, iova,
+				     size, flags);
+	if (rc)
+		return 0;
+
+	return size;
+}
+
+int zpci_init_iommu(struct zpci_dev *zdev)
+{
+	int rc = 0;
+
+	rc = iommu_device_sysfs_add(&zdev->iommu_dev, NULL, NULL,
+				    "s390-iommu.%08x", zdev->fid);
+	if (rc)
+		goto out_err;
+
+	iommu_device_set_ops(&zdev->iommu_dev, &s390_iommu_ops);
+
+	rc = iommu_device_register(&zdev->iommu_dev);
+	if (rc)
+		goto out_sysfs;
+
+	return 0;
+
+out_sysfs:
+	iommu_device_sysfs_remove(&zdev->iommu_dev);
+
+out_err:
+	return rc;
+}
+
+void zpci_destroy_iommu(struct zpci_dev *zdev)
+{
+	iommu_device_unregister(&zdev->iommu_dev);
+	iommu_device_sysfs_remove(&zdev->iommu_dev);
+}
+
+static const struct iommu_ops s390_iommu_ops = {
+	.capable = s390_iommu_capable,
+	.domain_alloc = s390_domain_alloc,
+	.domain_free = s390_domain_free,
+	.attach_dev = s390_iommu_attach_device,
+	.detach_dev = s390_iommu_detach_device,
+	.map = s390_iommu_map,
+	.unmap = s390_iommu_unmap,
+	.iova_to_phys = s390_iommu_iova_to_phys,
+	.add_device = s390_iommu_add_device,
+	.remove_device = s390_iommu_remove_device,
+	.device_group = generic_device_group,
+	.pgsize_bitmap = S390_IOMMU_PGSIZES,
+};
+
+static int __init s390_iommu_init(void)
+{
+	return bus_set_iommu(&pci_bus_type, &s390_iommu_ops);
+}
+subsys_initcall(s390_iommu_init);
diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
new file mode 100644
index 000000000..7b1361d57
--- /dev/null
+++ b/drivers/iommu/tegra-gart.c
@@ -0,0 +1,534 @@
+/*
+ * IOMMU API for GART in Tegra20
+ *
+ * Copyright (c) 2010-2012, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#define pr_fmt(fmt)	"%s(): " fmt, __func__
+
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/list.h>
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/iommu.h>
+#include <linux/of.h>
+
+#include <asm/cacheflush.h>
+
+/* bitmap of the page sizes currently supported */
+#define GART_IOMMU_PGSIZES	(SZ_4K)
+
+#define GART_REG_BASE		0x24
+#define GART_CONFIG		(0x24 - GART_REG_BASE)
+#define GART_ENTRY_ADDR		(0x28 - GART_REG_BASE)
+#define GART_ENTRY_DATA		(0x2c - GART_REG_BASE)
+#define GART_ENTRY_PHYS_ADDR_VALID	(1 << 31)
+
+#define GART_PAGE_SHIFT		12
+#define GART_PAGE_SIZE		(1 << GART_PAGE_SHIFT)
+#define GART_PAGE_MASK						\
+	(~(GART_PAGE_SIZE - 1) & ~GART_ENTRY_PHYS_ADDR_VALID)
+
+struct gart_client {
+	struct device		*dev;
+	struct list_head	list;
+};
+
+struct gart_device {
+	void __iomem		*regs;
+	u32			*savedata;
+	u32			page_count;	/* total remappable size */
+	dma_addr_t		iovmm_base;	/* offset to vmm_area */
+	spinlock_t		pte_lock;	/* for pagetable */
+	struct list_head	client;
+	spinlock_t		client_lock;	/* for client list */
+	struct device		*dev;
+
+	struct iommu_device	iommu;		/* IOMMU Core handle */
+};
+
+struct gart_domain {
+	struct iommu_domain domain;		/* generic domain handle */
+	struct gart_device *gart;		/* link to gart device   */
+};
+
+static struct gart_device *gart_handle; /* unique for a system */
+
+static bool gart_debug;
+
+#define GART_PTE(_pfn)						\
+	(GART_ENTRY_PHYS_ADDR_VALID | ((_pfn) << PAGE_SHIFT))
+
+static struct gart_domain *to_gart_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct gart_domain, domain);
+}
+
+/*
+ * Any interaction between any block on PPSB and a block on APB or AHB
+ * must have these read-back to ensure the APB/AHB bus transaction is
+ * complete before initiating activity on the PPSB block.
+ */
+#define FLUSH_GART_REGS(gart)	((void)readl((gart)->regs + GART_CONFIG))
+
+#define for_each_gart_pte(gart, iova)					\
+	for (iova = gart->iovmm_base;					\
+	     iova < gart->iovmm_base + GART_PAGE_SIZE * gart->page_count; \
+	     iova += GART_PAGE_SIZE)
+
+static inline void gart_set_pte(struct gart_device *gart,
+				unsigned long offs, u32 pte)
+{
+	writel(offs, gart->regs + GART_ENTRY_ADDR);
+	writel(pte, gart->regs + GART_ENTRY_DATA);
+
+	dev_dbg(gart->dev, "%s %08lx:%08x\n",
+		 pte ? "map" : "unmap", offs, pte & GART_PAGE_MASK);
+}
+
+static inline unsigned long gart_read_pte(struct gart_device *gart,
+					  unsigned long offs)
+{
+	unsigned long pte;
+
+	writel(offs, gart->regs + GART_ENTRY_ADDR);
+	pte = readl(gart->regs + GART_ENTRY_DATA);
+
+	return pte;
+}
+
+static void do_gart_setup(struct gart_device *gart, const u32 *data)
+{
+	unsigned long iova;
+
+	for_each_gart_pte(gart, iova)
+		gart_set_pte(gart, iova, data ? *(data++) : 0);
+
+	writel(1, gart->regs + GART_CONFIG);
+	FLUSH_GART_REGS(gart);
+}
+
+#ifdef DEBUG
+static void gart_dump_table(struct gart_device *gart)
+{
+	unsigned long iova;
+	unsigned long flags;
+
+	spin_lock_irqsave(&gart->pte_lock, flags);
+	for_each_gart_pte(gart, iova) {
+		unsigned long pte;
+
+		pte = gart_read_pte(gart, iova);
+
+		dev_dbg(gart->dev, "%s %08lx:%08lx\n",
+			(GART_ENTRY_PHYS_ADDR_VALID & pte) ? "v" : " ",
+			iova, pte & GART_PAGE_MASK);
+	}
+	spin_unlock_irqrestore(&gart->pte_lock, flags);
+}
+#else
+static inline void gart_dump_table(struct gart_device *gart)
+{
+}
+#endif
+
+static inline bool gart_iova_range_valid(struct gart_device *gart,
+					 unsigned long iova, size_t bytes)
+{
+	unsigned long iova_start, iova_end, gart_start, gart_end;
+
+	iova_start = iova;
+	iova_end = iova_start + bytes - 1;
+	gart_start = gart->iovmm_base;
+	gart_end = gart_start + gart->page_count * GART_PAGE_SIZE - 1;
+
+	if (iova_start < gart_start)
+		return false;
+	if (iova_end > gart_end)
+		return false;
+	return true;
+}
+
+static int gart_iommu_attach_dev(struct iommu_domain *domain,
+				 struct device *dev)
+{
+	struct gart_domain *gart_domain = to_gart_domain(domain);
+	struct gart_device *gart = gart_domain->gart;
+	struct gart_client *client, *c;
+	int err = 0;
+
+	client = devm_kzalloc(gart->dev, sizeof(*c), GFP_KERNEL);
+	if (!client)
+		return -ENOMEM;
+	client->dev = dev;
+
+	spin_lock(&gart->client_lock);
+	list_for_each_entry(c, &gart->client, list) {
+		if (c->dev == dev) {
+			dev_err(gart->dev,
+				"%s is already attached\n", dev_name(dev));
+			err = -EINVAL;
+			goto fail;
+		}
+	}
+	list_add(&client->list, &gart->client);
+	spin_unlock(&gart->client_lock);
+	dev_dbg(gart->dev, "Attached %s\n", dev_name(dev));
+	return 0;
+
+fail:
+	devm_kfree(gart->dev, client);
+	spin_unlock(&gart->client_lock);
+	return err;
+}
+
+static void gart_iommu_detach_dev(struct iommu_domain *domain,
+				  struct device *dev)
+{
+	struct gart_domain *gart_domain = to_gart_domain(domain);
+	struct gart_device *gart = gart_domain->gart;
+	struct gart_client *c;
+
+	spin_lock(&gart->client_lock);
+
+	list_for_each_entry(c, &gart->client, list) {
+		if (c->dev == dev) {
+			list_del(&c->list);
+			devm_kfree(gart->dev, c);
+			dev_dbg(gart->dev, "Detached %s\n", dev_name(dev));
+			goto out;
+		}
+	}
+	dev_err(gart->dev, "Couldn't find\n");
+out:
+	spin_unlock(&gart->client_lock);
+}
+
+static struct iommu_domain *gart_iommu_domain_alloc(unsigned type)
+{
+	struct gart_domain *gart_domain;
+	struct gart_device *gart;
+
+	if (type != IOMMU_DOMAIN_UNMANAGED)
+		return NULL;
+
+	gart = gart_handle;
+	if (!gart)
+		return NULL;
+
+	gart_domain = kzalloc(sizeof(*gart_domain), GFP_KERNEL);
+	if (!gart_domain)
+		return NULL;
+
+	gart_domain->gart = gart;
+	gart_domain->domain.geometry.aperture_start = gart->iovmm_base;
+	gart_domain->domain.geometry.aperture_end = gart->iovmm_base +
+					gart->page_count * GART_PAGE_SIZE - 1;
+	gart_domain->domain.geometry.force_aperture = true;
+
+	return &gart_domain->domain;
+}
+
+static void gart_iommu_domain_free(struct iommu_domain *domain)
+{
+	struct gart_domain *gart_domain = to_gart_domain(domain);
+	struct gart_device *gart = gart_domain->gart;
+
+	if (gart) {
+		spin_lock(&gart->client_lock);
+		if (!list_empty(&gart->client)) {
+			struct gart_client *c;
+
+			list_for_each_entry(c, &gart->client, list)
+				gart_iommu_detach_dev(domain, c->dev);
+		}
+		spin_unlock(&gart->client_lock);
+	}
+
+	kfree(gart_domain);
+}
+
+static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova,
+			  phys_addr_t pa, size_t bytes, int prot)
+{
+	struct gart_domain *gart_domain = to_gart_domain(domain);
+	struct gart_device *gart = gart_domain->gart;
+	unsigned long flags;
+	unsigned long pfn;
+	unsigned long pte;
+
+	if (!gart_iova_range_valid(gart, iova, bytes))
+		return -EINVAL;
+
+	spin_lock_irqsave(&gart->pte_lock, flags);
+	pfn = __phys_to_pfn(pa);
+	if (!pfn_valid(pfn)) {
+		dev_err(gart->dev, "Invalid page: %pa\n", &pa);
+		spin_unlock_irqrestore(&gart->pte_lock, flags);
+		return -EINVAL;
+	}
+	if (gart_debug) {
+		pte = gart_read_pte(gart, iova);
+		if (pte & GART_ENTRY_PHYS_ADDR_VALID) {
+			spin_unlock_irqrestore(&gart->pte_lock, flags);
+			dev_err(gart->dev, "Page entry is in-use\n");
+			return -EBUSY;
+		}
+	}
+	gart_set_pte(gart, iova, GART_PTE(pfn));
+	FLUSH_GART_REGS(gart);
+	spin_unlock_irqrestore(&gart->pte_lock, flags);
+	return 0;
+}
+
+static size_t gart_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
+			       size_t bytes)
+{
+	struct gart_domain *gart_domain = to_gart_domain(domain);
+	struct gart_device *gart = gart_domain->gart;
+	unsigned long flags;
+
+	if (!gart_iova_range_valid(gart, iova, bytes))
+		return 0;
+
+	spin_lock_irqsave(&gart->pte_lock, flags);
+	gart_set_pte(gart, iova, 0);
+	FLUSH_GART_REGS(gart);
+	spin_unlock_irqrestore(&gart->pte_lock, flags);
+	return bytes;
+}
+
+static phys_addr_t gart_iommu_iova_to_phys(struct iommu_domain *domain,
+					   dma_addr_t iova)
+{
+	struct gart_domain *gart_domain = to_gart_domain(domain);
+	struct gart_device *gart = gart_domain->gart;
+	unsigned long pte;
+	phys_addr_t pa;
+	unsigned long flags;
+
+	if (!gart_iova_range_valid(gart, iova, 0))
+		return -EINVAL;
+
+	spin_lock_irqsave(&gart->pte_lock, flags);
+	pte = gart_read_pte(gart, iova);
+	spin_unlock_irqrestore(&gart->pte_lock, flags);
+
+	pa = (pte & GART_PAGE_MASK);
+	if (!pfn_valid(__phys_to_pfn(pa))) {
+		dev_err(gart->dev, "No entry for %08llx:%pa\n",
+			 (unsigned long long)iova, &pa);
+		gart_dump_table(gart);
+		return -EINVAL;
+	}
+	return pa;
+}
+
+static bool gart_iommu_capable(enum iommu_cap cap)
+{
+	return false;
+}
+
+static int gart_iommu_add_device(struct device *dev)
+{
+	struct iommu_group *group = iommu_group_get_for_dev(dev);
+
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	iommu_group_put(group);
+
+	iommu_device_link(&gart_handle->iommu, dev);
+
+	return 0;
+}
+
+static void gart_iommu_remove_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+	iommu_device_unlink(&gart_handle->iommu, dev);
+}
+
+static const struct iommu_ops gart_iommu_ops = {
+	.capable	= gart_iommu_capable,
+	.domain_alloc	= gart_iommu_domain_alloc,
+	.domain_free	= gart_iommu_domain_free,
+	.attach_dev	= gart_iommu_attach_dev,
+	.detach_dev	= gart_iommu_detach_dev,
+	.add_device	= gart_iommu_add_device,
+	.remove_device	= gart_iommu_remove_device,
+	.device_group	= generic_device_group,
+	.map		= gart_iommu_map,
+	.unmap		= gart_iommu_unmap,
+	.iova_to_phys	= gart_iommu_iova_to_phys,
+	.pgsize_bitmap	= GART_IOMMU_PGSIZES,
+};
+
+static int tegra_gart_suspend(struct device *dev)
+{
+	struct gart_device *gart = dev_get_drvdata(dev);
+	unsigned long iova;
+	u32 *data = gart->savedata;
+	unsigned long flags;
+
+	spin_lock_irqsave(&gart->pte_lock, flags);
+	for_each_gart_pte(gart, iova)
+		*(data++) = gart_read_pte(gart, iova);
+	spin_unlock_irqrestore(&gart->pte_lock, flags);
+	return 0;
+}
+
+static int tegra_gart_resume(struct device *dev)
+{
+	struct gart_device *gart = dev_get_drvdata(dev);
+	unsigned long flags;
+
+	spin_lock_irqsave(&gart->pte_lock, flags);
+	do_gart_setup(gart, gart->savedata);
+	spin_unlock_irqrestore(&gart->pte_lock, flags);
+	return 0;
+}
+
+static int tegra_gart_probe(struct platform_device *pdev)
+{
+	struct gart_device *gart;
+	struct resource *res, *res_remap;
+	void __iomem *gart_regs;
+	struct device *dev = &pdev->dev;
+	int ret;
+
+	if (gart_handle)
+		return -EIO;
+
+	BUILD_BUG_ON(PAGE_SHIFT != GART_PAGE_SHIFT);
+
+	/* the GART memory aperture is required */
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	res_remap = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+	if (!res || !res_remap) {
+		dev_err(dev, "GART memory aperture expected\n");
+		return -ENXIO;
+	}
+
+	gart = devm_kzalloc(dev, sizeof(*gart), GFP_KERNEL);
+	if (!gart) {
+		dev_err(dev, "failed to allocate gart_device\n");
+		return -ENOMEM;
+	}
+
+	gart_regs = devm_ioremap(dev, res->start, resource_size(res));
+	if (!gart_regs) {
+		dev_err(dev, "failed to remap GART registers\n");
+		return -ENXIO;
+	}
+
+	ret = iommu_device_sysfs_add(&gart->iommu, &pdev->dev, NULL,
+				     dev_name(&pdev->dev));
+	if (ret) {
+		dev_err(dev, "Failed to register IOMMU in sysfs\n");
+		return ret;
+	}
+
+	iommu_device_set_ops(&gart->iommu, &gart_iommu_ops);
+
+	ret = iommu_device_register(&gart->iommu);
+	if (ret) {
+		dev_err(dev, "Failed to register IOMMU\n");
+		iommu_device_sysfs_remove(&gart->iommu);
+		return ret;
+	}
+
+	gart->dev = &pdev->dev;
+	spin_lock_init(&gart->pte_lock);
+	spin_lock_init(&gart->client_lock);
+	INIT_LIST_HEAD(&gart->client);
+	gart->regs = gart_regs;
+	gart->iovmm_base = (dma_addr_t)res_remap->start;
+	gart->page_count = (resource_size(res_remap) >> GART_PAGE_SHIFT);
+
+	gart->savedata = vmalloc(array_size(sizeof(u32), gart->page_count));
+	if (!gart->savedata) {
+		dev_err(dev, "failed to allocate context save area\n");
+		return -ENOMEM;
+	}
+
+	platform_set_drvdata(pdev, gart);
+	do_gart_setup(gart, NULL);
+
+	gart_handle = gart;
+
+	return 0;
+}
+
+static int tegra_gart_remove(struct platform_device *pdev)
+{
+	struct gart_device *gart = platform_get_drvdata(pdev);
+
+	iommu_device_unregister(&gart->iommu);
+	iommu_device_sysfs_remove(&gart->iommu);
+
+	writel(0, gart->regs + GART_CONFIG);
+	if (gart->savedata)
+		vfree(gart->savedata);
+	gart_handle = NULL;
+	return 0;
+}
+
+static const struct dev_pm_ops tegra_gart_pm_ops = {
+	.suspend	= tegra_gart_suspend,
+	.resume		= tegra_gart_resume,
+};
+
+static const struct of_device_id tegra_gart_of_match[] = {
+	{ .compatible = "nvidia,tegra20-gart", },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, tegra_gart_of_match);
+
+static struct platform_driver tegra_gart_driver = {
+	.probe		= tegra_gart_probe,
+	.remove		= tegra_gart_remove,
+	.driver = {
+		.name	= "tegra-gart",
+		.pm	= &tegra_gart_pm_ops,
+		.of_match_table = tegra_gart_of_match,
+	},
+};
+
+static int tegra_gart_init(void)
+{
+	return platform_driver_register(&tegra_gart_driver);
+}
+
+static void __exit tegra_gart_exit(void)
+{
+	platform_driver_unregister(&tegra_gart_driver);
+}
+
+subsys_initcall(tegra_gart_init);
+module_exit(tegra_gart_exit);
+module_param(gart_debug, bool, 0644);
+
+MODULE_PARM_DESC(gart_debug, "Enable GART debugging");
+MODULE_DESCRIPTION("IOMMU API for GART in Tegra20");
+MODULE_AUTHOR("Hiroshi DOYU <hdoyu@nvidia.com>");
+MODULE_ALIAS("platform:tegra-gart");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
new file mode 100644
index 000000000..fa0ecb5e6
--- /dev/null
+++ b/drivers/iommu/tegra-smmu.c
@@ -0,0 +1,1111 @@
+/*
+ * Copyright (C) 2011-2014 NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/bitops.h>
+#include <linux/debugfs.h>
+#include <linux/err.h>
+#include <linux/iommu.h>
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/dma-mapping.h>
+
+#include <soc/tegra/ahb.h>
+#include <soc/tegra/mc.h>
+
+struct tegra_smmu_group {
+	struct list_head list;
+	const struct tegra_smmu_group_soc *soc;
+	struct iommu_group *group;
+};
+
+struct tegra_smmu {
+	void __iomem *regs;
+	struct device *dev;
+
+	struct tegra_mc *mc;
+	const struct tegra_smmu_soc *soc;
+
+	struct list_head groups;
+
+	unsigned long pfn_mask;
+	unsigned long tlb_mask;
+
+	unsigned long *asids;
+	struct mutex lock;
+
+	struct list_head list;
+
+	struct dentry *debugfs;
+
+	struct iommu_device iommu;	/* IOMMU Core code handle */
+};
+
+struct tegra_smmu_as {
+	struct iommu_domain domain;
+	struct tegra_smmu *smmu;
+	unsigned int use_count;
+	u32 *count;
+	struct page **pts;
+	struct page *pd;
+	dma_addr_t pd_dma;
+	unsigned id;
+	u32 attr;
+};
+
+static struct tegra_smmu_as *to_smmu_as(struct iommu_domain *dom)
+{
+	return container_of(dom, struct tegra_smmu_as, domain);
+}
+
+static inline void smmu_writel(struct tegra_smmu *smmu, u32 value,
+			       unsigned long offset)
+{
+	writel(value, smmu->regs + offset);
+}
+
+static inline u32 smmu_readl(struct tegra_smmu *smmu, unsigned long offset)
+{
+	return readl(smmu->regs + offset);
+}
+
+#define SMMU_CONFIG 0x010
+#define  SMMU_CONFIG_ENABLE (1 << 0)
+
+#define SMMU_TLB_CONFIG 0x14
+#define  SMMU_TLB_CONFIG_HIT_UNDER_MISS (1 << 29)
+#define  SMMU_TLB_CONFIG_ROUND_ROBIN_ARBITRATION (1 << 28)
+#define  SMMU_TLB_CONFIG_ACTIVE_LINES(smmu) \
+	((smmu)->soc->num_tlb_lines & (smmu)->tlb_mask)
+
+#define SMMU_PTC_CONFIG 0x18
+#define  SMMU_PTC_CONFIG_ENABLE (1 << 29)
+#define  SMMU_PTC_CONFIG_REQ_LIMIT(x) (((x) & 0x0f) << 24)
+#define  SMMU_PTC_CONFIG_INDEX_MAP(x) ((x) & 0x3f)
+
+#define SMMU_PTB_ASID 0x01c
+#define  SMMU_PTB_ASID_VALUE(x) ((x) & 0x7f)
+
+#define SMMU_PTB_DATA 0x020
+#define  SMMU_PTB_DATA_VALUE(dma, attr) ((dma) >> 12 | (attr))
+
+#define SMMU_MK_PDE(dma, attr) ((dma) >> SMMU_PTE_SHIFT | (attr))
+
+#define SMMU_TLB_FLUSH 0x030
+#define  SMMU_TLB_FLUSH_VA_MATCH_ALL     (0 << 0)
+#define  SMMU_TLB_FLUSH_VA_MATCH_SECTION (2 << 0)
+#define  SMMU_TLB_FLUSH_VA_MATCH_GROUP   (3 << 0)
+#define  SMMU_TLB_FLUSH_VA_SECTION(addr) ((((addr) & 0xffc00000) >> 12) | \
+					  SMMU_TLB_FLUSH_VA_MATCH_SECTION)
+#define  SMMU_TLB_FLUSH_VA_GROUP(addr)   ((((addr) & 0xffffc000) >> 12) | \
+					  SMMU_TLB_FLUSH_VA_MATCH_GROUP)
+#define  SMMU_TLB_FLUSH_ASID_MATCH       (1 << 31)
+
+#define SMMU_PTC_FLUSH 0x034
+#define  SMMU_PTC_FLUSH_TYPE_ALL (0 << 0)
+#define  SMMU_PTC_FLUSH_TYPE_ADR (1 << 0)
+
+#define SMMU_PTC_FLUSH_HI 0x9b8
+#define  SMMU_PTC_FLUSH_HI_MASK 0x3
+
+/* per-SWGROUP SMMU_*_ASID register */
+#define SMMU_ASID_ENABLE (1 << 31)
+#define SMMU_ASID_MASK 0x7f
+#define SMMU_ASID_VALUE(x) ((x) & SMMU_ASID_MASK)
+
+/* page table definitions */
+#define SMMU_NUM_PDE 1024
+#define SMMU_NUM_PTE 1024
+
+#define SMMU_SIZE_PD (SMMU_NUM_PDE * 4)
+#define SMMU_SIZE_PT (SMMU_NUM_PTE * 4)
+
+#define SMMU_PDE_SHIFT 22
+#define SMMU_PTE_SHIFT 12
+
+#define SMMU_PD_READABLE	(1 << 31)
+#define SMMU_PD_WRITABLE	(1 << 30)
+#define SMMU_PD_NONSECURE	(1 << 29)
+
+#define SMMU_PDE_READABLE	(1 << 31)
+#define SMMU_PDE_WRITABLE	(1 << 30)
+#define SMMU_PDE_NONSECURE	(1 << 29)
+#define SMMU_PDE_NEXT		(1 << 28)
+
+#define SMMU_PTE_READABLE	(1 << 31)
+#define SMMU_PTE_WRITABLE	(1 << 30)
+#define SMMU_PTE_NONSECURE	(1 << 29)
+
+#define SMMU_PDE_ATTR		(SMMU_PDE_READABLE | SMMU_PDE_WRITABLE | \
+				 SMMU_PDE_NONSECURE)
+#define SMMU_PTE_ATTR		(SMMU_PTE_READABLE | SMMU_PTE_WRITABLE | \
+				 SMMU_PTE_NONSECURE)
+
+static unsigned int iova_pd_index(unsigned long iova)
+{
+	return (iova >> SMMU_PDE_SHIFT) & (SMMU_NUM_PDE - 1);
+}
+
+static unsigned int iova_pt_index(unsigned long iova)
+{
+	return (iova >> SMMU_PTE_SHIFT) & (SMMU_NUM_PTE - 1);
+}
+
+static bool smmu_dma_addr_valid(struct tegra_smmu *smmu, dma_addr_t addr)
+{
+	addr >>= 12;
+	return (addr & smmu->pfn_mask) == addr;
+}
+
+static dma_addr_t smmu_pde_to_dma(struct tegra_smmu *smmu, u32 pde)
+{
+	return (dma_addr_t)(pde & smmu->pfn_mask) << 12;
+}
+
+static void smmu_flush_ptc_all(struct tegra_smmu *smmu)
+{
+	smmu_writel(smmu, SMMU_PTC_FLUSH_TYPE_ALL, SMMU_PTC_FLUSH);
+}
+
+static inline void smmu_flush_ptc(struct tegra_smmu *smmu, dma_addr_t dma,
+				  unsigned long offset)
+{
+	u32 value;
+
+	offset &= ~(smmu->mc->soc->atom_size - 1);
+
+	if (smmu->mc->soc->num_address_bits > 32) {
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
+		value = (dma >> 32) & SMMU_PTC_FLUSH_HI_MASK;
+#else
+		value = 0;
+#endif
+		smmu_writel(smmu, value, SMMU_PTC_FLUSH_HI);
+	}
+
+	value = (dma + offset) | SMMU_PTC_FLUSH_TYPE_ADR;
+	smmu_writel(smmu, value, SMMU_PTC_FLUSH);
+}
+
+static inline void smmu_flush_tlb(struct tegra_smmu *smmu)
+{
+	smmu_writel(smmu, SMMU_TLB_FLUSH_VA_MATCH_ALL, SMMU_TLB_FLUSH);
+}
+
+static inline void smmu_flush_tlb_asid(struct tegra_smmu *smmu,
+				       unsigned long asid)
+{
+	u32 value;
+
+	if (smmu->soc->num_asids == 4)
+		value = (asid & 0x3) << 29;
+	else
+		value = (asid & 0x7f) << 24;
+
+	value |= SMMU_TLB_FLUSH_ASID_MATCH | SMMU_TLB_FLUSH_VA_MATCH_ALL;
+	smmu_writel(smmu, value, SMMU_TLB_FLUSH);
+}
+
+static inline void smmu_flush_tlb_section(struct tegra_smmu *smmu,
+					  unsigned long asid,
+					  unsigned long iova)
+{
+	u32 value;
+
+	if (smmu->soc->num_asids == 4)
+		value = (asid & 0x3) << 29;
+	else
+		value = (asid & 0x7f) << 24;
+
+	value |= SMMU_TLB_FLUSH_ASID_MATCH | SMMU_TLB_FLUSH_VA_SECTION(iova);
+	smmu_writel(smmu, value, SMMU_TLB_FLUSH);
+}
+
+static inline void smmu_flush_tlb_group(struct tegra_smmu *smmu,
+					unsigned long asid,
+					unsigned long iova)
+{
+	u32 value;
+
+	if (smmu->soc->num_asids == 4)
+		value = (asid & 0x3) << 29;
+	else
+		value = (asid & 0x7f) << 24;
+
+	value |= SMMU_TLB_FLUSH_ASID_MATCH | SMMU_TLB_FLUSH_VA_GROUP(iova);
+	smmu_writel(smmu, value, SMMU_TLB_FLUSH);
+}
+
+static inline void smmu_flush(struct tegra_smmu *smmu)
+{
+	smmu_readl(smmu, SMMU_CONFIG);
+}
+
+static int tegra_smmu_alloc_asid(struct tegra_smmu *smmu, unsigned int *idp)
+{
+	unsigned long id;
+
+	mutex_lock(&smmu->lock);
+
+	id = find_first_zero_bit(smmu->asids, smmu->soc->num_asids);
+	if (id >= smmu->soc->num_asids) {
+		mutex_unlock(&smmu->lock);
+		return -ENOSPC;
+	}
+
+	set_bit(id, smmu->asids);
+	*idp = id;
+
+	mutex_unlock(&smmu->lock);
+	return 0;
+}
+
+static void tegra_smmu_free_asid(struct tegra_smmu *smmu, unsigned int id)
+{
+	mutex_lock(&smmu->lock);
+	clear_bit(id, smmu->asids);
+	mutex_unlock(&smmu->lock);
+}
+
+static bool tegra_smmu_capable(enum iommu_cap cap)
+{
+	return false;
+}
+
+static struct iommu_domain *tegra_smmu_domain_alloc(unsigned type)
+{
+	struct tegra_smmu_as *as;
+
+	if (type != IOMMU_DOMAIN_UNMANAGED)
+		return NULL;
+
+	as = kzalloc(sizeof(*as), GFP_KERNEL);
+	if (!as)
+		return NULL;
+
+	as->attr = SMMU_PD_READABLE | SMMU_PD_WRITABLE | SMMU_PD_NONSECURE;
+
+	as->pd = alloc_page(GFP_KERNEL | __GFP_DMA | __GFP_ZERO);
+	if (!as->pd) {
+		kfree(as);
+		return NULL;
+	}
+
+	as->count = kcalloc(SMMU_NUM_PDE, sizeof(u32), GFP_KERNEL);
+	if (!as->count) {
+		__free_page(as->pd);
+		kfree(as);
+		return NULL;
+	}
+
+	as->pts = kcalloc(SMMU_NUM_PDE, sizeof(*as->pts), GFP_KERNEL);
+	if (!as->pts) {
+		kfree(as->count);
+		__free_page(as->pd);
+		kfree(as);
+		return NULL;
+	}
+
+	/* setup aperture */
+	as->domain.geometry.aperture_start = 0;
+	as->domain.geometry.aperture_end = 0xffffffff;
+	as->domain.geometry.force_aperture = true;
+
+	return &as->domain;
+}
+
+static void tegra_smmu_domain_free(struct iommu_domain *domain)
+{
+	struct tegra_smmu_as *as = to_smmu_as(domain);
+
+	/* TODO: free page directory and page tables */
+
+	kfree(as);
+}
+
+static const struct tegra_smmu_swgroup *
+tegra_smmu_find_swgroup(struct tegra_smmu *smmu, unsigned int swgroup)
+{
+	const struct tegra_smmu_swgroup *group = NULL;
+	unsigned int i;
+
+	for (i = 0; i < smmu->soc->num_swgroups; i++) {
+		if (smmu->soc->swgroups[i].swgroup == swgroup) {
+			group = &smmu->soc->swgroups[i];
+			break;
+		}
+	}
+
+	return group;
+}
+
+static void tegra_smmu_enable(struct tegra_smmu *smmu, unsigned int swgroup,
+			      unsigned int asid)
+{
+	const struct tegra_smmu_swgroup *group;
+	unsigned int i;
+	u32 value;
+
+	for (i = 0; i < smmu->soc->num_clients; i++) {
+		const struct tegra_mc_client *client = &smmu->soc->clients[i];
+
+		if (client->swgroup != swgroup)
+			continue;
+
+		value = smmu_readl(smmu, client->smmu.reg);
+		value |= BIT(client->smmu.bit);
+		smmu_writel(smmu, value, client->smmu.reg);
+	}
+
+	group = tegra_smmu_find_swgroup(smmu, swgroup);
+	if (group) {
+		value = smmu_readl(smmu, group->reg);
+		value &= ~SMMU_ASID_MASK;
+		value |= SMMU_ASID_VALUE(asid);
+		value |= SMMU_ASID_ENABLE;
+		smmu_writel(smmu, value, group->reg);
+	}
+}
+
+static void tegra_smmu_disable(struct tegra_smmu *smmu, unsigned int swgroup,
+			       unsigned int asid)
+{
+	const struct tegra_smmu_swgroup *group;
+	unsigned int i;
+	u32 value;
+
+	group = tegra_smmu_find_swgroup(smmu, swgroup);
+	if (group) {
+		value = smmu_readl(smmu, group->reg);
+		value &= ~SMMU_ASID_MASK;
+		value |= SMMU_ASID_VALUE(asid);
+		value &= ~SMMU_ASID_ENABLE;
+		smmu_writel(smmu, value, group->reg);
+	}
+
+	for (i = 0; i < smmu->soc->num_clients; i++) {
+		const struct tegra_mc_client *client = &smmu->soc->clients[i];
+
+		if (client->swgroup != swgroup)
+			continue;
+
+		value = smmu_readl(smmu, client->smmu.reg);
+		value &= ~BIT(client->smmu.bit);
+		smmu_writel(smmu, value, client->smmu.reg);
+	}
+}
+
+static int tegra_smmu_as_prepare(struct tegra_smmu *smmu,
+				 struct tegra_smmu_as *as)
+{
+	u32 value;
+	int err;
+
+	if (as->use_count > 0) {
+		as->use_count++;
+		return 0;
+	}
+
+	as->pd_dma = dma_map_page(smmu->dev, as->pd, 0, SMMU_SIZE_PD,
+				  DMA_TO_DEVICE);
+	if (dma_mapping_error(smmu->dev, as->pd_dma))
+		return -ENOMEM;
+
+	/* We can't handle 64-bit DMA addresses */
+	if (!smmu_dma_addr_valid(smmu, as->pd_dma)) {
+		err = -ENOMEM;
+		goto err_unmap;
+	}
+
+	err = tegra_smmu_alloc_asid(smmu, &as->id);
+	if (err < 0)
+		goto err_unmap;
+
+	smmu_flush_ptc(smmu, as->pd_dma, 0);
+	smmu_flush_tlb_asid(smmu, as->id);
+
+	smmu_writel(smmu, as->id & 0x7f, SMMU_PTB_ASID);
+	value = SMMU_PTB_DATA_VALUE(as->pd_dma, as->attr);
+	smmu_writel(smmu, value, SMMU_PTB_DATA);
+	smmu_flush(smmu);
+
+	as->smmu = smmu;
+	as->use_count++;
+
+	return 0;
+
+err_unmap:
+	dma_unmap_page(smmu->dev, as->pd_dma, SMMU_SIZE_PD, DMA_TO_DEVICE);
+	return err;
+}
+
+static void tegra_smmu_as_unprepare(struct tegra_smmu *smmu,
+				    struct tegra_smmu_as *as)
+{
+	if (--as->use_count > 0)
+		return;
+
+	tegra_smmu_free_asid(smmu, as->id);
+
+	dma_unmap_page(smmu->dev, as->pd_dma, SMMU_SIZE_PD, DMA_TO_DEVICE);
+
+	as->smmu = NULL;
+}
+
+static int tegra_smmu_attach_dev(struct iommu_domain *domain,
+				 struct device *dev)
+{
+	struct tegra_smmu *smmu = dev->archdata.iommu;
+	struct tegra_smmu_as *as = to_smmu_as(domain);
+	struct device_node *np = dev->of_node;
+	struct of_phandle_args args;
+	unsigned int index = 0;
+	int err = 0;
+
+	while (!of_parse_phandle_with_args(np, "iommus", "#iommu-cells", index,
+					   &args)) {
+		unsigned int swgroup = args.args[0];
+
+		if (args.np != smmu->dev->of_node) {
+			of_node_put(args.np);
+			continue;
+		}
+
+		of_node_put(args.np);
+
+		err = tegra_smmu_as_prepare(smmu, as);
+		if (err < 0)
+			return err;
+
+		tegra_smmu_enable(smmu, swgroup, as->id);
+		index++;
+	}
+
+	if (index == 0)
+		return -ENODEV;
+
+	return 0;
+}
+
+static void tegra_smmu_detach_dev(struct iommu_domain *domain, struct device *dev)
+{
+	struct tegra_smmu_as *as = to_smmu_as(domain);
+	struct device_node *np = dev->of_node;
+	struct tegra_smmu *smmu = as->smmu;
+	struct of_phandle_args args;
+	unsigned int index = 0;
+
+	while (!of_parse_phandle_with_args(np, "iommus", "#iommu-cells", index,
+					   &args)) {
+		unsigned int swgroup = args.args[0];
+
+		if (args.np != smmu->dev->of_node) {
+			of_node_put(args.np);
+			continue;
+		}
+
+		of_node_put(args.np);
+
+		tegra_smmu_disable(smmu, swgroup, as->id);
+		tegra_smmu_as_unprepare(smmu, as);
+		index++;
+	}
+}
+
+static void tegra_smmu_set_pde(struct tegra_smmu_as *as, unsigned long iova,
+			       u32 value)
+{
+	unsigned int pd_index = iova_pd_index(iova);
+	struct tegra_smmu *smmu = as->smmu;
+	u32 *pd = page_address(as->pd);
+	unsigned long offset = pd_index * sizeof(*pd);
+
+	/* Set the page directory entry first */
+	pd[pd_index] = value;
+
+	/* The flush the page directory entry from caches */
+	dma_sync_single_range_for_device(smmu->dev, as->pd_dma, offset,
+					 sizeof(*pd), DMA_TO_DEVICE);
+
+	/* And flush the iommu */
+	smmu_flush_ptc(smmu, as->pd_dma, offset);
+	smmu_flush_tlb_section(smmu, as->id, iova);
+	smmu_flush(smmu);
+}
+
+static u32 *tegra_smmu_pte_offset(struct page *pt_page, unsigned long iova)
+{
+	u32 *pt = page_address(pt_page);
+
+	return pt + iova_pt_index(iova);
+}
+
+static u32 *tegra_smmu_pte_lookup(struct tegra_smmu_as *as, unsigned long iova,
+				  dma_addr_t *dmap)
+{
+	unsigned int pd_index = iova_pd_index(iova);
+	struct tegra_smmu *smmu = as->smmu;
+	struct page *pt_page;
+	u32 *pd;
+
+	pt_page = as->pts[pd_index];
+	if (!pt_page)
+		return NULL;
+
+	pd = page_address(as->pd);
+	*dmap = smmu_pde_to_dma(smmu, pd[pd_index]);
+
+	return tegra_smmu_pte_offset(pt_page, iova);
+}
+
+static u32 *as_get_pte(struct tegra_smmu_as *as, dma_addr_t iova,
+		       dma_addr_t *dmap)
+{
+	unsigned int pde = iova_pd_index(iova);
+	struct tegra_smmu *smmu = as->smmu;
+
+	if (!as->pts[pde]) {
+		struct page *page;
+		dma_addr_t dma;
+
+		page = alloc_page(GFP_KERNEL | __GFP_DMA | __GFP_ZERO);
+		if (!page)
+			return NULL;
+
+		dma = dma_map_page(smmu->dev, page, 0, SMMU_SIZE_PT,
+				   DMA_TO_DEVICE);
+		if (dma_mapping_error(smmu->dev, dma)) {
+			__free_page(page);
+			return NULL;
+		}
+
+		if (!smmu_dma_addr_valid(smmu, dma)) {
+			dma_unmap_page(smmu->dev, dma, SMMU_SIZE_PT,
+				       DMA_TO_DEVICE);
+			__free_page(page);
+			return NULL;
+		}
+
+		as->pts[pde] = page;
+
+		tegra_smmu_set_pde(as, iova, SMMU_MK_PDE(dma, SMMU_PDE_ATTR |
+							      SMMU_PDE_NEXT));
+
+		*dmap = dma;
+	} else {
+		u32 *pd = page_address(as->pd);
+
+		*dmap = smmu_pde_to_dma(smmu, pd[pde]);
+	}
+
+	return tegra_smmu_pte_offset(as->pts[pde], iova);
+}
+
+static void tegra_smmu_pte_get_use(struct tegra_smmu_as *as, unsigned long iova)
+{
+	unsigned int pd_index = iova_pd_index(iova);
+
+	as->count[pd_index]++;
+}
+
+static void tegra_smmu_pte_put_use(struct tegra_smmu_as *as, unsigned long iova)
+{
+	unsigned int pde = iova_pd_index(iova);
+	struct page *page = as->pts[pde];
+
+	/*
+	 * When no entries in this page table are used anymore, return the
+	 * memory page to the system.
+	 */
+	if (--as->count[pde] == 0) {
+		struct tegra_smmu *smmu = as->smmu;
+		u32 *pd = page_address(as->pd);
+		dma_addr_t pte_dma = smmu_pde_to_dma(smmu, pd[pde]);
+
+		tegra_smmu_set_pde(as, iova, 0);
+
+		dma_unmap_page(smmu->dev, pte_dma, SMMU_SIZE_PT, DMA_TO_DEVICE);
+		__free_page(page);
+		as->pts[pde] = NULL;
+	}
+}
+
+static void tegra_smmu_set_pte(struct tegra_smmu_as *as, unsigned long iova,
+			       u32 *pte, dma_addr_t pte_dma, u32 val)
+{
+	struct tegra_smmu *smmu = as->smmu;
+	unsigned long offset = offset_in_page(pte);
+
+	*pte = val;
+
+	dma_sync_single_range_for_device(smmu->dev, pte_dma, offset,
+					 4, DMA_TO_DEVICE);
+	smmu_flush_ptc(smmu, pte_dma, offset);
+	smmu_flush_tlb_group(smmu, as->id, iova);
+	smmu_flush(smmu);
+}
+
+static int tegra_smmu_map(struct iommu_domain *domain, unsigned long iova,
+			  phys_addr_t paddr, size_t size, int prot)
+{
+	struct tegra_smmu_as *as = to_smmu_as(domain);
+	dma_addr_t pte_dma;
+	u32 *pte;
+
+	pte = as_get_pte(as, iova, &pte_dma);
+	if (!pte)
+		return -ENOMEM;
+
+	/* If we aren't overwriting a pre-existing entry, increment use */
+	if (*pte == 0)
+		tegra_smmu_pte_get_use(as, iova);
+
+	tegra_smmu_set_pte(as, iova, pte, pte_dma,
+			   __phys_to_pfn(paddr) | SMMU_PTE_ATTR);
+
+	return 0;
+}
+
+static size_t tegra_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
+			       size_t size)
+{
+	struct tegra_smmu_as *as = to_smmu_as(domain);
+	dma_addr_t pte_dma;
+	u32 *pte;
+
+	pte = tegra_smmu_pte_lookup(as, iova, &pte_dma);
+	if (!pte || !*pte)
+		return 0;
+
+	tegra_smmu_set_pte(as, iova, pte, pte_dma, 0);
+	tegra_smmu_pte_put_use(as, iova);
+
+	return size;
+}
+
+static phys_addr_t tegra_smmu_iova_to_phys(struct iommu_domain *domain,
+					   dma_addr_t iova)
+{
+	struct tegra_smmu_as *as = to_smmu_as(domain);
+	unsigned long pfn;
+	dma_addr_t pte_dma;
+	u32 *pte;
+
+	pte = tegra_smmu_pte_lookup(as, iova, &pte_dma);
+	if (!pte || !*pte)
+		return 0;
+
+	pfn = *pte & as->smmu->pfn_mask;
+
+	return PFN_PHYS(pfn);
+}
+
+static struct tegra_smmu *tegra_smmu_find(struct device_node *np)
+{
+	struct platform_device *pdev;
+	struct tegra_mc *mc;
+
+	pdev = of_find_device_by_node(np);
+	if (!pdev)
+		return NULL;
+
+	mc = platform_get_drvdata(pdev);
+	if (!mc)
+		return NULL;
+
+	return mc->smmu;
+}
+
+static int tegra_smmu_configure(struct tegra_smmu *smmu, struct device *dev,
+				struct of_phandle_args *args)
+{
+	const struct iommu_ops *ops = smmu->iommu.ops;
+	int err;
+
+	err = iommu_fwspec_init(dev, &dev->of_node->fwnode, ops);
+	if (err < 0) {
+		dev_err(dev, "failed to initialize fwspec: %d\n", err);
+		return err;
+	}
+
+	err = ops->of_xlate(dev, args);
+	if (err < 0) {
+		dev_err(dev, "failed to parse SW group ID: %d\n", err);
+		iommu_fwspec_free(dev);
+		return err;
+	}
+
+	return 0;
+}
+
+static int tegra_smmu_add_device(struct device *dev)
+{
+	struct device_node *np = dev->of_node;
+	struct tegra_smmu *smmu = NULL;
+	struct iommu_group *group;
+	struct of_phandle_args args;
+	unsigned int index = 0;
+	int err;
+
+	while (of_parse_phandle_with_args(np, "iommus", "#iommu-cells", index,
+					  &args) == 0) {
+		smmu = tegra_smmu_find(args.np);
+		if (smmu) {
+			err = tegra_smmu_configure(smmu, dev, &args);
+			of_node_put(args.np);
+
+			if (err < 0)
+				return err;
+
+			/*
+			 * Only a single IOMMU master interface is currently
+			 * supported by the Linux kernel, so abort after the
+			 * first match.
+			 */
+			dev->archdata.iommu = smmu;
+
+			iommu_device_link(&smmu->iommu, dev);
+
+			break;
+		}
+
+		of_node_put(args.np);
+		index++;
+	}
+
+	if (!smmu)
+		return -ENODEV;
+
+	group = iommu_group_get_for_dev(dev);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	iommu_group_put(group);
+
+	return 0;
+}
+
+static void tegra_smmu_remove_device(struct device *dev)
+{
+	struct tegra_smmu *smmu = dev->archdata.iommu;
+
+	if (smmu)
+		iommu_device_unlink(&smmu->iommu, dev);
+
+	dev->archdata.iommu = NULL;
+	iommu_group_remove_device(dev);
+}
+
+static const struct tegra_smmu_group_soc *
+tegra_smmu_find_group(struct tegra_smmu *smmu, unsigned int swgroup)
+{
+	unsigned int i, j;
+
+	for (i = 0; i < smmu->soc->num_groups; i++)
+		for (j = 0; j < smmu->soc->groups[i].num_swgroups; j++)
+			if (smmu->soc->groups[i].swgroups[j] == swgroup)
+				return &smmu->soc->groups[i];
+
+	return NULL;
+}
+
+static struct iommu_group *tegra_smmu_group_get(struct tegra_smmu *smmu,
+						unsigned int swgroup)
+{
+	const struct tegra_smmu_group_soc *soc;
+	struct tegra_smmu_group *group;
+
+	soc = tegra_smmu_find_group(smmu, swgroup);
+	if (!soc)
+		return NULL;
+
+	mutex_lock(&smmu->lock);
+
+	list_for_each_entry(group, &smmu->groups, list)
+		if (group->soc == soc) {
+			mutex_unlock(&smmu->lock);
+			return group->group;
+		}
+
+	group = devm_kzalloc(smmu->dev, sizeof(*group), GFP_KERNEL);
+	if (!group) {
+		mutex_unlock(&smmu->lock);
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&group->list);
+	group->soc = soc;
+
+	group->group = iommu_group_alloc();
+	if (IS_ERR(group->group)) {
+		devm_kfree(smmu->dev, group);
+		mutex_unlock(&smmu->lock);
+		return NULL;
+	}
+
+	list_add_tail(&group->list, &smmu->groups);
+	mutex_unlock(&smmu->lock);
+
+	return group->group;
+}
+
+static struct iommu_group *tegra_smmu_device_group(struct device *dev)
+{
+	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+	struct tegra_smmu *smmu = dev->archdata.iommu;
+	struct iommu_group *group;
+
+	group = tegra_smmu_group_get(smmu, fwspec->ids[0]);
+	if (!group)
+		group = generic_device_group(dev);
+
+	return group;
+}
+
+static int tegra_smmu_of_xlate(struct device *dev,
+			       struct of_phandle_args *args)
+{
+	u32 id = args->args[0];
+
+	return iommu_fwspec_add_ids(dev, &id, 1);
+}
+
+static const struct iommu_ops tegra_smmu_ops = {
+	.capable = tegra_smmu_capable,
+	.domain_alloc = tegra_smmu_domain_alloc,
+	.domain_free = tegra_smmu_domain_free,
+	.attach_dev = tegra_smmu_attach_dev,
+	.detach_dev = tegra_smmu_detach_dev,
+	.add_device = tegra_smmu_add_device,
+	.remove_device = tegra_smmu_remove_device,
+	.device_group = tegra_smmu_device_group,
+	.map = tegra_smmu_map,
+	.unmap = tegra_smmu_unmap,
+	.iova_to_phys = tegra_smmu_iova_to_phys,
+	.of_xlate = tegra_smmu_of_xlate,
+	.pgsize_bitmap = SZ_4K,
+};
+
+static void tegra_smmu_ahb_enable(void)
+{
+	static const struct of_device_id ahb_match[] = {
+		{ .compatible = "nvidia,tegra30-ahb", },
+		{ }
+	};
+	struct device_node *ahb;
+
+	ahb = of_find_matching_node(NULL, ahb_match);
+	if (ahb) {
+		tegra_ahb_enable_smmu(ahb);
+		of_node_put(ahb);
+	}
+}
+
+static int tegra_smmu_swgroups_show(struct seq_file *s, void *data)
+{
+	struct tegra_smmu *smmu = s->private;
+	unsigned int i;
+	u32 value;
+
+	seq_printf(s, "swgroup    enabled  ASID\n");
+	seq_printf(s, "------------------------\n");
+
+	for (i = 0; i < smmu->soc->num_swgroups; i++) {
+		const struct tegra_smmu_swgroup *group = &smmu->soc->swgroups[i];
+		const char *status;
+		unsigned int asid;
+
+		value = smmu_readl(smmu, group->reg);
+
+		if (value & SMMU_ASID_ENABLE)
+			status = "yes";
+		else
+			status = "no";
+
+		asid = value & SMMU_ASID_MASK;
+
+		seq_printf(s, "%-9s  %-7s  %#04x\n", group->name, status,
+			   asid);
+	}
+
+	return 0;
+}
+
+static int tegra_smmu_swgroups_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, tegra_smmu_swgroups_show, inode->i_private);
+}
+
+static const struct file_operations tegra_smmu_swgroups_fops = {
+	.open = tegra_smmu_swgroups_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static int tegra_smmu_clients_show(struct seq_file *s, void *data)
+{
+	struct tegra_smmu *smmu = s->private;
+	unsigned int i;
+	u32 value;
+
+	seq_printf(s, "client       enabled\n");
+	seq_printf(s, "--------------------\n");
+
+	for (i = 0; i < smmu->soc->num_clients; i++) {
+		const struct tegra_mc_client *client = &smmu->soc->clients[i];
+		const char *status;
+
+		value = smmu_readl(smmu, client->smmu.reg);
+
+		if (value & BIT(client->smmu.bit))
+			status = "yes";
+		else
+			status = "no";
+
+		seq_printf(s, "%-12s %s\n", client->name, status);
+	}
+
+	return 0;
+}
+
+static int tegra_smmu_clients_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, tegra_smmu_clients_show, inode->i_private);
+}
+
+static const struct file_operations tegra_smmu_clients_fops = {
+	.open = tegra_smmu_clients_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static void tegra_smmu_debugfs_init(struct tegra_smmu *smmu)
+{
+	smmu->debugfs = debugfs_create_dir("smmu", NULL);
+	if (!smmu->debugfs)
+		return;
+
+	debugfs_create_file("swgroups", S_IRUGO, smmu->debugfs, smmu,
+			    &tegra_smmu_swgroups_fops);
+	debugfs_create_file("clients", S_IRUGO, smmu->debugfs, smmu,
+			    &tegra_smmu_clients_fops);
+}
+
+static void tegra_smmu_debugfs_exit(struct tegra_smmu *smmu)
+{
+	debugfs_remove_recursive(smmu->debugfs);
+}
+
+struct tegra_smmu *tegra_smmu_probe(struct device *dev,
+				    const struct tegra_smmu_soc *soc,
+				    struct tegra_mc *mc)
+{
+	struct tegra_smmu *smmu;
+	size_t size;
+	u32 value;
+	int err;
+
+	/* This can happen on Tegra20 which doesn't have an SMMU */
+	if (!soc)
+		return NULL;
+
+	smmu = devm_kzalloc(dev, sizeof(*smmu), GFP_KERNEL);
+	if (!smmu)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * This is a bit of a hack. Ideally we'd want to simply return this
+	 * value. However the IOMMU registration process will attempt to add
+	 * all devices to the IOMMU when bus_set_iommu() is called. In order
+	 * not to rely on global variables to track the IOMMU instance, we
+	 * set it here so that it can be looked up from the .add_device()
+	 * callback via the IOMMU device's .drvdata field.
+	 */
+	mc->smmu = smmu;
+
+	size = BITS_TO_LONGS(soc->num_asids) * sizeof(long);
+
+	smmu->asids = devm_kzalloc(dev, size, GFP_KERNEL);
+	if (!smmu->asids)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&smmu->groups);
+	mutex_init(&smmu->lock);
+
+	smmu->regs = mc->regs;
+	smmu->soc = soc;
+	smmu->dev = dev;
+	smmu->mc = mc;
+
+	smmu->pfn_mask = BIT_MASK(mc->soc->num_address_bits - PAGE_SHIFT) - 1;
+	dev_dbg(dev, "address bits: %u, PFN mask: %#lx\n",
+		mc->soc->num_address_bits, smmu->pfn_mask);
+	smmu->tlb_mask = (smmu->soc->num_tlb_lines << 1) - 1;
+	dev_dbg(dev, "TLB lines: %u, mask: %#lx\n", smmu->soc->num_tlb_lines,
+		smmu->tlb_mask);
+
+	value = SMMU_PTC_CONFIG_ENABLE | SMMU_PTC_CONFIG_INDEX_MAP(0x3f);
+
+	if (soc->supports_request_limit)
+		value |= SMMU_PTC_CONFIG_REQ_LIMIT(8);
+
+	smmu_writel(smmu, value, SMMU_PTC_CONFIG);
+
+	value = SMMU_TLB_CONFIG_HIT_UNDER_MISS |
+		SMMU_TLB_CONFIG_ACTIVE_LINES(smmu);
+
+	if (soc->supports_round_robin_arbitration)
+		value |= SMMU_TLB_CONFIG_ROUND_ROBIN_ARBITRATION;
+
+	smmu_writel(smmu, value, SMMU_TLB_CONFIG);
+
+	smmu_flush_ptc_all(smmu);
+	smmu_flush_tlb(smmu);
+	smmu_writel(smmu, SMMU_CONFIG_ENABLE, SMMU_CONFIG);
+	smmu_flush(smmu);
+
+	tegra_smmu_ahb_enable();
+
+	err = iommu_device_sysfs_add(&smmu->iommu, dev, NULL, dev_name(dev));
+	if (err)
+		return ERR_PTR(err);
+
+	iommu_device_set_ops(&smmu->iommu, &tegra_smmu_ops);
+	iommu_device_set_fwnode(&smmu->iommu, dev->fwnode);
+
+	err = iommu_device_register(&smmu->iommu);
+	if (err) {
+		iommu_device_sysfs_remove(&smmu->iommu);
+		return ERR_PTR(err);
+	}
+
+	err = bus_set_iommu(&platform_bus_type, &tegra_smmu_ops);
+	if (err < 0) {
+		iommu_device_unregister(&smmu->iommu);
+		iommu_device_sysfs_remove(&smmu->iommu);
+		return ERR_PTR(err);
+	}
+
+	if (IS_ENABLED(CONFIG_DEBUG_FS))
+		tegra_smmu_debugfs_init(smmu);
+
+	return smmu;
+}
+
+void tegra_smmu_remove(struct tegra_smmu *smmu)
+{
+	iommu_device_unregister(&smmu->iommu);
+	iommu_device_sysfs_remove(&smmu->iommu);
+
+	if (IS_ENABLED(CONFIG_DEBUG_FS))
+		tegra_smmu_debugfs_exit(smmu);
+}