58 files changed, 24753 insertions, 0 deletions
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
new file mode 100644
index 000000000..938803eab
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: GPL-2.0
+config PPC_POWERNV
+	depends on PPC64 && PPC_BOOK3S
+	bool "IBM PowerNV (Non-Virtualized) platform support"
+	select PPC_NATIVE
+	select PPC_XICS
+	select PPC_ICP_NATIVE
+	select PPC_XIVE_NATIVE
+	select PPC_P7_NAP
+	select FORCE_PCI
+	select PCI_MSI
+	select EPAPR_BOOT
+	select PPC_INDIRECT_PIO
+	select PPC_UDBG_16550
+	select ARCH_RANDOM
+	select CPU_FREQ
+	select PPC_DOORBELL
+	select MMU_NOTIFIER
+	select FORCE_SMP
+	default y
+
+config OPAL_PRD
+	tristate 'OPAL PRD driver'
+	depends on PPC_POWERNV
+	help
+	  This enables the opal-prd driver, a facility to run processor
+	  recovery diagnostics on OpenPower machines
+
+config PPC_MEMTRACE
+	bool "Enable removal of RAM from kernel mappings for tracing"
+	depends on PPC_POWERNV && MEMORY_HOTREMOVE
+	help
+	  Enabling this option allows for the removal of memory (RAM)
+	  from the kernel mappings to be used for hardware tracing.
+
+config PPC_VAS
+	bool "IBM Virtual Accelerator Switchboard (VAS)"
+	depends on PPC_POWERNV && PPC_64K_PAGES
+	default y
+	help
+	  This enables support for IBM Virtual Accelerator Switchboard (VAS).
+
+	  VAS allows accelerators in co-processors like NX-GZIP and NX-842
+	  to be accessible to kernel subsystems and user processes.
+
+	  VAS adapters are found in POWER9 based systems.
+
+	  If unsure, say N.
+
+config SCOM_DEBUGFS
+	bool "Expose SCOM controllers via debugfs"
+	depends on DEBUG_FS
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
new file mode 100644
index 000000000..2eb6ae150
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y			+= setup.o opal-call.o opal-wrappers.o opal.o opal-async.o
+obj-y			+= idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
+obj-y			+= rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
+obj-y			+= opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
+obj-y			+= opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o
+obj-y			+= ultravisor.o
+
+obj-$(CONFIG_SMP)	+= smp.o subcore.o subcore-asm.o
+obj-$(CONFIG_FA_DUMP)	+= opal-fadump.o
+obj-$(CONFIG_PRESERVE_FA_DUMP)	+= opal-fadump.o
+obj-$(CONFIG_OPAL_CORE)	+= opal-core.o
+obj-$(CONFIG_PCI)	+= pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
+obj-$(CONFIG_PCI_IOV)   += pci-sriov.o
+obj-$(CONFIG_CXL_BASE)	+= pci-cxl.o
+obj-$(CONFIG_EEH)	+= eeh-powernv.o
+obj-$(CONFIG_MEMORY_FAILURE)	+= opal-memory-errors.o
+obj-$(CONFIG_OPAL_PRD)	+= opal-prd.o
+obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
+obj-$(CONFIG_PPC_MEMTRACE)	+= memtrace.o
+obj-$(CONFIG_PPC_VAS)	+= vas.o vas-window.o vas-debug.o vas-fault.o vas-api.o
+obj-$(CONFIG_OCXL_BASE)	+= ocxl.o
+obj-$(CONFIG_SCOM_DEBUGFS) += opal-xscom.o
+obj-$(CONFIG_PPC_SECURE_BOOT) += opal-secvar.o
diff --git a/arch/powerpc/platforms/powernv/copy-paste.h b/arch/powerpc/platforms/powernv/copy-paste.h
new file mode 100644
index 000000000..f063807ed
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/copy-paste.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2016-17 IBM Corp.
+ */
+#include <asm/ppc-opcode.h>
+#include <asm/reg.h>
+
+/*
+ * Copy/paste instructions:
+ *
+ *	copy RA,RB
+ *		Copy contents of address (RA) + effective_address(RB)
+ *		to internal copy-buffer.
+ *
+ *	paste RA,RB
+ *		Paste contents of internal copy-buffer to the address
+ *		(RA) + effective_address(RB)
+ */
+static inline int vas_copy(void *crb, int offset)
+{
+	asm volatile(PPC_COPY(%0, %1)";"
+		:
+		: "b" (offset), "b" (crb)
+		: "memory");
+
+	return 0;
+}
+
+static inline int vas_paste(void *paste_address, int offset)
+{
+	u32 cr;
+
+	cr = 0;
+	asm volatile(PPC_PASTE(%1, %2)";"
+		"mfocrf %0, 0x80;"
+		: "=r" (cr)
+		: "b" (offset), "b" (paste_address)
+		: "memory", "cr0");
+
+	/* We mask with 0xE to ignore SO */
+	return (cr >> CR0_SHIFT) & 0xE;
+}
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
new file mode 100644
index 000000000..89e22c460
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -0,0 +1,1714 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerNV Platform dependent EEH operations
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013.
+ */
+
+#include <linux/atomic.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/msi.h>
+#include <linux/of.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/firmware.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/machdep.h>
+#include <asm/msi_bitmap.h>
+#include <asm/opal.h>
+#include <asm/ppc-pci.h>
+#include <asm/pnv-pci.h>
+
+#include "powernv.h"
+#include "pci.h"
+#include "../../../../drivers/pci/pci.h"
+
+static int eeh_event_irq = -EINVAL;
+
+static void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
+{
+	dev_dbg(&pdev->dev, "EEH: Setting up device\n");
+	eeh_probe_device(pdev);
+}
+
+static irqreturn_t pnv_eeh_event(int irq, void *data)
+{
+	/*
+	 * We simply send a special EEH event if EEH has been
+	 * enabled. We don't care about EEH events until we've
+	 * finished processing the outstanding ones. Event processing
+	 * gets unmasked in next_error() if EEH is enabled.
+	 */
+	disable_irq_nosync(irq);
+
+	if (eeh_enabled())
+		eeh_send_failure_event(NULL);
+
+	return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static ssize_t pnv_eeh_ei_write(struct file *filp,
+				const char __user *user_buf,
+				size_t count, loff_t *ppos)
+{
+	struct pci_controller *hose = filp->private_data;
+	struct eeh_pe *pe;
+	int pe_no, type, func;
+	unsigned long addr, mask;
+	char buf[50];
+	int ret;
+
+	if (!eeh_ops || !eeh_ops->err_inject)
+		return -ENXIO;
+
+	/* Copy over argument buffer */
+	ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count);
+	if (!ret)
+		return -EFAULT;
+
+	/* Retrieve parameters */
+	ret = sscanf(buf, "%x:%x:%x:%lx:%lx",
+		     &pe_no, &type, &func, &addr, &mask);
+	if (ret != 5)
+		return -EINVAL;
+
+	/* Retrieve PE */
+	pe = eeh_pe_get(hose, pe_no);
+	if (!pe)
+		return -ENODEV;
+
+	/* Do error injection */
+	ret = eeh_ops->err_inject(pe, type, func, addr, mask);
+	return ret < 0 ? ret : count;
+}
+
+static const struct file_operations pnv_eeh_ei_fops = {
+	.open	= simple_open,
+	.llseek	= no_llseek,
+	.write	= pnv_eeh_ei_write,
+};
+
+static int pnv_eeh_dbgfs_set(void *data, int offset, u64 val)
+{
+	struct pci_controller *hose = data;
+	struct pnv_phb *phb = hose->private_data;
+
+	out_be64(phb->regs + offset, val);
+	return 0;
+}
+
+static int pnv_eeh_dbgfs_get(void *data, int offset, u64 *val)
+{
+	struct pci_controller *hose = data;
+	struct pnv_phb *phb = hose->private_data;
+
+	*val = in_be64(phb->regs + offset);
+	return 0;
+}
+
+#define PNV_EEH_DBGFS_ENTRY(name, reg)				\
+static int pnv_eeh_dbgfs_set_##name(void *data, u64 val)	\
+{								\
+	return pnv_eeh_dbgfs_set(data, reg, val);		\
+}								\
+								\
+static int pnv_eeh_dbgfs_get_##name(void *data, u64 *val)	\
+{								\
+	return pnv_eeh_dbgfs_get(data, reg, val);		\
+}								\
+								\
+DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_dbgfs_ops_##name,		\
+			pnv_eeh_dbgfs_get_##name,		\
+                        pnv_eeh_dbgfs_set_##name,		\
+			"0x%llx\n")
+
+PNV_EEH_DBGFS_ENTRY(outb, 0xD10);
+PNV_EEH_DBGFS_ENTRY(inbA, 0xD90);
+PNV_EEH_DBGFS_ENTRY(inbB, 0xE10);
+
+#endif /* CONFIG_DEBUG_FS */
+
+static void pnv_eeh_enable_phbs(void)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+
+	list_for_each_entry(hose, &hose_list, list_node) {
+		phb = hose->private_data;
+		/*
+		 * If EEH is enabled, we're going to rely on that.
+		 * Otherwise, we restore to conventional mechanism
+		 * to clear frozen PE during PCI config access.
+		 */
+		if (eeh_enabled())
+			phb->flags |= PNV_PHB_FLAG_EEH;
+		else
+			phb->flags &= ~PNV_PHB_FLAG_EEH;
+	}
+}
+
+/**
+ * pnv_eeh_post_init - EEH platform dependent post initialization
+ *
+ * EEH platform dependent post initialization on powernv. When
+ * the function is called, the EEH PEs and devices should have
+ * been built. If the I/O cache staff has been built, EEH is
+ * ready to supply service.
+ */
+int pnv_eeh_post_init(void)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	int ret = 0;
+
+	eeh_show_enabled();
+
+	/* Register OPAL event notifier */
+	eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR));
+	if (eeh_event_irq < 0) {
+		pr_err("%s: Can't register OPAL event interrupt (%d)\n",
+		       __func__, eeh_event_irq);
+		return eeh_event_irq;
+	}
+
+	ret = request_irq(eeh_event_irq, pnv_eeh_event,
+			  IRQ_TYPE_LEVEL_HIGH, "opal-eeh", NULL);
+	if (ret < 0) {
+		irq_dispose_mapping(eeh_event_irq);
+		pr_err("%s: Can't request OPAL event interrupt (%d)\n",
+		       __func__, eeh_event_irq);
+		return ret;
+	}
+
+	if (!eeh_enabled())
+		disable_irq(eeh_event_irq);
+
+	pnv_eeh_enable_phbs();
+
+	list_for_each_entry(hose, &hose_list, list_node) {
+		phb = hose->private_data;
+
+		/* Create debugfs entries */
+#ifdef CONFIG_DEBUG_FS
+		if (phb->has_dbgfs || !phb->dbgfs)
+			continue;
+
+		phb->has_dbgfs = 1;
+		debugfs_create_file("err_injct", 0200,
+				    phb->dbgfs, hose,
+				    &pnv_eeh_ei_fops);
+
+		debugfs_create_file("err_injct_outbound", 0600,
+				    phb->dbgfs, hose,
+				    &pnv_eeh_dbgfs_ops_outb);
+		debugfs_create_file("err_injct_inboundA", 0600,
+				    phb->dbgfs, hose,
+				    &pnv_eeh_dbgfs_ops_inbA);
+		debugfs_create_file("err_injct_inboundB", 0600,
+				    phb->dbgfs, hose,
+				    &pnv_eeh_dbgfs_ops_inbB);
+#endif /* CONFIG_DEBUG_FS */
+	}
+
+	return ret;
+}
+
+static int pnv_eeh_find_cap(struct pci_dn *pdn, int cap)
+{
+	int pos = PCI_CAPABILITY_LIST;
+	int cnt = 48;   /* Maximal number of capabilities */
+	u32 status, id;
+
+	if (!pdn)
+		return 0;
+
+	/* Check if the device supports capabilities */
+	pnv_pci_cfg_read(pdn, PCI_STATUS, 2, &status);
+	if (!(status & PCI_STATUS_CAP_LIST))
+		return 0;
+
+	while (cnt--) {
+		pnv_pci_cfg_read(pdn, pos, 1, &pos);
+		if (pos < 0x40)
+			break;
+
+		pos &= ~3;
+		pnv_pci_cfg_read(pdn, pos + PCI_CAP_LIST_ID, 1, &id);
+		if (id == 0xff)
+			break;
+
+		/* Found */
+		if (id == cap)
+			return pos;
+
+		/* Next one */
+		pos += PCI_CAP_LIST_NEXT;
+	}
+
+	return 0;
+}
+
+static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap)
+{
+	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+	u32 header;
+	int pos = 256, ttl = (4096 - 256) / 8;
+
+	if (!edev || !edev->pcie_cap)
+		return 0;
+	if (pnv_pci_cfg_read(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL)
+		return 0;
+	else if (!header)
+		return 0;
+
+	while (ttl-- > 0) {
+		if (PCI_EXT_CAP_ID(header) == cap && pos)
+			return pos;
+
+		pos = PCI_EXT_CAP_NEXT(header);
+		if (pos < 256)
+			break;
+
+		if (pnv_pci_cfg_read(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL)
+			break;
+	}
+
+	return 0;
+}
+
+static struct eeh_pe *pnv_eeh_get_upstream_pe(struct pci_dev *pdev)
+{
+	struct pci_controller *hose = pdev->bus->sysdata;
+	struct pnv_phb *phb = hose->private_data;
+	struct pci_dev *parent = pdev->bus->self;
+
+#ifdef CONFIG_PCI_IOV
+	/* for VFs we use the PF's PE as the upstream PE */
+	if (pdev->is_virtfn)
+		parent = pdev->physfn;
+#endif
+
+	/* otherwise use the PE of our parent bridge */
+	if (parent) {
+		struct pnv_ioda_pe *ioda_pe = pnv_ioda_get_pe(parent);
+
+		return eeh_pe_get(phb->hose, ioda_pe->pe_number);
+	}
+
+	return NULL;
+}
+
+/**
+ * pnv_eeh_probe - Do probe on PCI device
+ * @pdev: pci_dev to probe
+ *
+ * Create, or find the existing, eeh_dev for this pci_dev.
+ */
+static struct eeh_dev *pnv_eeh_probe(struct pci_dev *pdev)
+{
+	struct pci_dn *pdn = pci_get_pdn(pdev);
+	struct pci_controller *hose = pdn->phb;
+	struct pnv_phb *phb = hose->private_data;
+	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+	struct eeh_pe *upstream_pe;
+	uint32_t pcie_flags;
+	int ret;
+	int config_addr = (pdn->busno << 8) | (pdn->devfn);
+
+	/*
+	 * When probing the root bridge, which doesn't have any
+	 * subordinate PCI devices. We don't have OF node for
+	 * the root bridge. So it's not reasonable to continue
+	 * the probing.
+	 */
+	if (!edev || edev->pe)
+		return NULL;
+
+	/* already configured? */
+	if (edev->pdev) {
+		pr_debug("%s: found existing edev for %04x:%02x:%02x.%01x\n",
+			__func__, hose->global_number, config_addr >> 8,
+			PCI_SLOT(config_addr), PCI_FUNC(config_addr));
+		return edev;
+	}
+
+	/* Skip for PCI-ISA bridge */
+	if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
+		return NULL;
+
+	eeh_edev_dbg(edev, "Probing device\n");
+
+	/* Initialize eeh device */
+	edev->mode	&= 0xFFFFFF00;
+	edev->pcix_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_PCIX);
+	edev->pcie_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_EXP);
+	edev->af_cap   = pnv_eeh_find_cap(pdn, PCI_CAP_ID_AF);
+	edev->aer_cap  = pnv_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR);
+	if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_PCI) {
+		edev->mode |= EEH_DEV_BRIDGE;
+		if (edev->pcie_cap) {
+			pnv_pci_cfg_read(pdn, edev->pcie_cap + PCI_EXP_FLAGS,
+					 2, &pcie_flags);
+			pcie_flags = (pcie_flags & PCI_EXP_FLAGS_TYPE) >> 4;
+			if (pcie_flags == PCI_EXP_TYPE_ROOT_PORT)
+				edev->mode |= EEH_DEV_ROOT_PORT;
+			else if (pcie_flags == PCI_EXP_TYPE_DOWNSTREAM)
+				edev->mode |= EEH_DEV_DS_PORT;
+		}
+	}
+
+	edev->pe_config_addr = phb->ioda.pe_rmap[config_addr];
+
+	upstream_pe = pnv_eeh_get_upstream_pe(pdev);
+
+	/* Create PE */
+	ret = eeh_pe_tree_insert(edev, upstream_pe);
+	if (ret) {
+		eeh_edev_warn(edev, "Failed to add device to PE (code %d)\n", ret);
+		return NULL;
+	}
+
+	/*
+	 * If the PE contains any one of following adapters, the
+	 * PCI config space can't be accessed when dumping EEH log.
+	 * Otherwise, we will run into fenced PHB caused by shortage
+	 * of outbound credits in the adapter. The PCI config access
+	 * should be blocked until PE reset. MMIO access is dropped
+	 * by hardware certainly. In order to drop PCI config requests,
+	 * one more flag (EEH_PE_CFG_RESTRICTED) is introduced, which
+	 * will be checked in the backend for PE state retrival. If
+	 * the PE becomes frozen for the first time and the flag has
+	 * been set for the PE, we will set EEH_PE_CFG_BLOCKED for
+	 * that PE to block its config space.
+	 *
+	 * Broadcom BCM5718 2-ports NICs (14e4:1656)
+	 * Broadcom Austin 4-ports NICs (14e4:1657)
+	 * Broadcom Shiner 4-ports 1G NICs (14e4:168a)
+	 * Broadcom Shiner 2-ports 10G NICs (14e4:168e)
+	 */
+	if ((pdn->vendor_id == PCI_VENDOR_ID_BROADCOM &&
+	     pdn->device_id == 0x1656) ||
+	    (pdn->vendor_id == PCI_VENDOR_ID_BROADCOM &&
+	     pdn->device_id == 0x1657) ||
+	    (pdn->vendor_id == PCI_VENDOR_ID_BROADCOM &&
+	     pdn->device_id == 0x168a) ||
+	    (pdn->vendor_id == PCI_VENDOR_ID_BROADCOM &&
+	     pdn->device_id == 0x168e))
+		edev->pe->state |= EEH_PE_CFG_RESTRICTED;
+
+	/*
+	 * Cache the PE primary bus, which can't be fetched when
+	 * full hotplug is in progress. In that case, all child
+	 * PCI devices of the PE are expected to be removed prior
+	 * to PE reset.
+	 */
+	if (!(edev->pe->state & EEH_PE_PRI_BUS)) {
+		edev->pe->bus = pci_find_bus(hose->global_number,
+					     pdn->busno);
+		if (edev->pe->bus)
+			edev->pe->state |= EEH_PE_PRI_BUS;
+	}
+
+	/*
+	 * Enable EEH explicitly so that we will do EEH check
+	 * while accessing I/O stuff
+	 */
+	if (!eeh_has_flag(EEH_ENABLED)) {
+		enable_irq(eeh_event_irq);
+		pnv_eeh_enable_phbs();
+		eeh_add_flag(EEH_ENABLED);
+	}
+
+	/* Save memory bars */
+	eeh_save_bars(edev);
+
+	eeh_edev_dbg(edev, "EEH enabled on device\n");
+
+	return edev;
+}
+
+/**
+ * pnv_eeh_set_option - Initialize EEH or MMIO/DMA reenable
+ * @pe: EEH PE
+ * @option: operation to be issued
+ *
+ * The function is used to control the EEH functionality globally.
+ * Currently, following options are support according to PAPR:
+ * Enable EEH, Disable EEH, Enable MMIO and Enable DMA
+ */
+static int pnv_eeh_set_option(struct eeh_pe *pe, int option)
+{
+	struct pci_controller *hose = pe->phb;
+	struct pnv_phb *phb = hose->private_data;
+	bool freeze_pe = false;
+	int opt;
+	s64 rc;
+
+	switch (option) {
+	case EEH_OPT_DISABLE:
+		return -EPERM;
+	case EEH_OPT_ENABLE:
+		return 0;
+	case EEH_OPT_THAW_MMIO:
+		opt = OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO;
+		break;
+	case EEH_OPT_THAW_DMA:
+		opt = OPAL_EEH_ACTION_CLEAR_FREEZE_DMA;
+		break;
+	case EEH_OPT_FREEZE_PE:
+		freeze_pe = true;
+		opt = OPAL_EEH_ACTION_SET_FREEZE_ALL;
+		break;
+	default:
+		pr_warn("%s: Invalid option %d\n", __func__, option);
+		return -EINVAL;
+	}
+
+	/* Freeze master and slave PEs if PHB supports compound PEs */
+	if (freeze_pe) {
+		if (phb->freeze_pe) {
+			phb->freeze_pe(phb, pe->addr);
+			return 0;
+		}
+
+		rc = opal_pci_eeh_freeze_set(phb->opal_id, pe->addr, opt);
+		if (rc != OPAL_SUCCESS) {
+			pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
+				__func__, rc, phb->hose->global_number,
+				pe->addr);
+			return -EIO;
+		}
+
+		return 0;
+	}
+
+	/* Unfreeze master and slave PEs if PHB supports */
+	if (phb->unfreeze_pe)
+		return phb->unfreeze_pe(phb, pe->addr, opt);
+
+	rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe->addr, opt);
+	if (rc != OPAL_SUCCESS) {
+		pr_warn("%s: Failure %lld enable %d for PHB#%x-PE#%x\n",
+			__func__, rc, option, phb->hose->global_number,
+			pe->addr);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static void pnv_eeh_get_phb_diag(struct eeh_pe *pe)
+{
+	struct pnv_phb *phb = pe->phb->private_data;
+	s64 rc;
+
+	rc = opal_pci_get_phb_diag_data2(phb->opal_id, pe->data,
+					 phb->diag_data_size);
+	if (rc != OPAL_SUCCESS)
+		pr_warn("%s: Failure %lld getting PHB#%x diag-data\n",
+			__func__, rc, pe->phb->global_number);
+}
+
+static int pnv_eeh_get_phb_state(struct eeh_pe *pe)
+{
+	struct pnv_phb *phb = pe->phb->private_data;
+	u8 fstate = 0;
+	__be16 pcierr = 0;
+	s64 rc;
+	int result = 0;
+
+	rc = opal_pci_eeh_freeze_status(phb->opal_id,
+					pe->addr,
+					&fstate,
+					&pcierr,
+					NULL);
+	if (rc != OPAL_SUCCESS) {
+		pr_warn("%s: Failure %lld getting PHB#%x state\n",
+			__func__, rc, phb->hose->global_number);
+		return EEH_STATE_NOT_SUPPORT;
+	}
+
+	/*
+	 * Check PHB state. If the PHB is frozen for the
+	 * first time, to dump the PHB diag-data.
+	 */
+	if (be16_to_cpu(pcierr) != OPAL_EEH_PHB_ERROR) {
+		result = (EEH_STATE_MMIO_ACTIVE  |
+			  EEH_STATE_DMA_ACTIVE   |
+			  EEH_STATE_MMIO_ENABLED |
+			  EEH_STATE_DMA_ENABLED);
+	} else if (!(pe->state & EEH_PE_ISOLATED)) {
+		eeh_pe_mark_isolated(pe);
+		pnv_eeh_get_phb_diag(pe);
+
+		if (eeh_has_flag(EEH_EARLY_DUMP_LOG))
+			pnv_pci_dump_phb_diag_data(pe->phb, pe->data);
+	}
+
+	return result;
+}
+
+static int pnv_eeh_get_pe_state(struct eeh_pe *pe)
+{
+	struct pnv_phb *phb = pe->phb->private_data;
+	u8 fstate = 0;
+	__be16 pcierr = 0;
+	s64 rc;
+	int result;
+
+	/*
+	 * We don't clobber hardware frozen state until PE
+	 * reset is completed. In order to keep EEH core
+	 * moving forward, we have to return operational
+	 * state during PE reset.
+	 */
+	if (pe->state & EEH_PE_RESET) {
+		result = (EEH_STATE_MMIO_ACTIVE  |
+			  EEH_STATE_DMA_ACTIVE   |
+			  EEH_STATE_MMIO_ENABLED |
+			  EEH_STATE_DMA_ENABLED);
+		return result;
+	}
+
+	/*
+	 * Fetch PE state from hardware. If the PHB
+	 * supports compound PE, let it handle that.
+	 */
+	if (phb->get_pe_state) {
+		fstate = phb->get_pe_state(phb, pe->addr);
+	} else {
+		rc = opal_pci_eeh_freeze_status(phb->opal_id,
+						pe->addr,
+						&fstate,
+						&pcierr,
+						NULL);
+		if (rc != OPAL_SUCCESS) {
+			pr_warn("%s: Failure %lld getting PHB#%x-PE%x state\n",
+				__func__, rc, phb->hose->global_number,
+				pe->addr);
+			return EEH_STATE_NOT_SUPPORT;
+		}
+	}
+
+	/* Figure out state */
+	switch (fstate) {
+	case OPAL_EEH_STOPPED_NOT_FROZEN:
+		result = (EEH_STATE_MMIO_ACTIVE  |
+			  EEH_STATE_DMA_ACTIVE   |
+			  EEH_STATE_MMIO_ENABLED |
+			  EEH_STATE_DMA_ENABLED);
+		break;
+	case OPAL_EEH_STOPPED_MMIO_FREEZE:
+		result = (EEH_STATE_DMA_ACTIVE |
+			  EEH_STATE_DMA_ENABLED);
+		break;
+	case OPAL_EEH_STOPPED_DMA_FREEZE:
+		result = (EEH_STATE_MMIO_ACTIVE |
+			  EEH_STATE_MMIO_ENABLED);
+		break;
+	case OPAL_EEH_STOPPED_MMIO_DMA_FREEZE:
+		result = 0;
+		break;
+	case OPAL_EEH_STOPPED_RESET:
+		result = EEH_STATE_RESET_ACTIVE;
+		break;
+	case OPAL_EEH_STOPPED_TEMP_UNAVAIL:
+		result = EEH_STATE_UNAVAILABLE;
+		break;
+	case OPAL_EEH_STOPPED_PERM_UNAVAIL:
+		result = EEH_STATE_NOT_SUPPORT;
+		break;
+	default:
+		result = EEH_STATE_NOT_SUPPORT;
+		pr_warn("%s: Invalid PHB#%x-PE#%x state %x\n",
+			__func__, phb->hose->global_number,
+			pe->addr, fstate);
+	}
+
+	/*
+	 * If PHB supports compound PE, to freeze all
+	 * slave PEs for consistency.
+	 *
+	 * If the PE is switching to frozen state for the
+	 * first time, to dump the PHB diag-data.
+	 */
+	if (!(result & EEH_STATE_NOT_SUPPORT) &&
+	    !(result & EEH_STATE_UNAVAILABLE) &&
+	    !(result & EEH_STATE_MMIO_ACTIVE) &&
+	    !(result & EEH_STATE_DMA_ACTIVE)  &&
+	    !(pe->state & EEH_PE_ISOLATED)) {
+		if (phb->freeze_pe)
+			phb->freeze_pe(phb, pe->addr);
+
+		eeh_pe_mark_isolated(pe);
+		pnv_eeh_get_phb_diag(pe);
+
+		if (eeh_has_flag(EEH_EARLY_DUMP_LOG))
+			pnv_pci_dump_phb_diag_data(pe->phb, pe->data);
+	}
+
+	return result;
+}
+
+/**
+ * pnv_eeh_get_state - Retrieve PE state
+ * @pe: EEH PE
+ * @delay: delay while PE state is temporarily unavailable
+ *
+ * Retrieve the state of the specified PE. For IODA-compitable
+ * platform, it should be retrieved from IODA table. Therefore,
+ * we prefer passing down to hardware implementation to handle
+ * it.
+ */
+static int pnv_eeh_get_state(struct eeh_pe *pe, int *delay)
+{
+	int ret;
+
+	if (pe->type & EEH_PE_PHB)
+		ret = pnv_eeh_get_phb_state(pe);
+	else
+		ret = pnv_eeh_get_pe_state(pe);
+
+	if (!delay)
+		return ret;
+
+	/*
+	 * If the PE state is temporarily unavailable,
+	 * to inform the EEH core delay for default
+	 * period (1 second)
+	 */
+	*delay = 0;
+	if (ret & EEH_STATE_UNAVAILABLE)
+		*delay = 1000;
+
+	return ret;
+}
+
+static s64 pnv_eeh_poll(unsigned long id)
+{
+	s64 rc = OPAL_HARDWARE;
+
+	while (1) {
+		rc = opal_pci_poll(id);
+		if (rc <= 0)
+			break;
+
+		if (system_state < SYSTEM_RUNNING)
+			udelay(1000 * rc);
+		else
+			msleep(rc);
+	}
+
+	return rc;
+}
+
+int pnv_eeh_phb_reset(struct pci_controller *hose, int option)
+{
+	struct pnv_phb *phb = hose->private_data;
+	s64 rc = OPAL_HARDWARE;
+
+	pr_debug("%s: Reset PHB#%x, option=%d\n",
+		 __func__, hose->global_number, option);
+
+	/* Issue PHB complete reset request */
+	if (option == EEH_RESET_FUNDAMENTAL ||
+	    option == EEH_RESET_HOT)
+		rc = opal_pci_reset(phb->opal_id,
+				    OPAL_RESET_PHB_COMPLETE,
+				    OPAL_ASSERT_RESET);
+	else if (option == EEH_RESET_DEACTIVATE)
+		rc = opal_pci_reset(phb->opal_id,
+				    OPAL_RESET_PHB_COMPLETE,
+				    OPAL_DEASSERT_RESET);
+	if (rc < 0)
+		goto out;
+
+	/*
+	 * Poll state of the PHB until the request is done
+	 * successfully. The PHB reset is usually PHB complete
+	 * reset followed by hot reset on root bus. So we also
+	 * need the PCI bus settlement delay.
+	 */
+	if (rc > 0)
+		rc = pnv_eeh_poll(phb->opal_id);
+	if (option == EEH_RESET_DEACTIVATE) {
+		if (system_state < SYSTEM_RUNNING)
+			udelay(1000 * EEH_PE_RST_SETTLE_TIME);
+		else
+			msleep(EEH_PE_RST_SETTLE_TIME);
+	}
+out:
+	if (rc != OPAL_SUCCESS)
+		return -EIO;
+
+	return 0;
+}
+
+static int pnv_eeh_root_reset(struct pci_controller *hose, int option)
+{
+	struct pnv_phb *phb = hose->private_data;
+	s64 rc = OPAL_HARDWARE;
+
+	pr_debug("%s: Reset PHB#%x, option=%d\n",
+		 __func__, hose->global_number, option);
+
+	/*
+	 * During the reset deassert time, we needn't care
+	 * the reset scope because the firmware does nothing
+	 * for fundamental or hot reset during deassert phase.
+	 */
+	if (option == EEH_RESET_FUNDAMENTAL)
+		rc = opal_pci_reset(phb->opal_id,
+				    OPAL_RESET_PCI_FUNDAMENTAL,
+				    OPAL_ASSERT_RESET);
+	else if (option == EEH_RESET_HOT)
+		rc = opal_pci_reset(phb->opal_id,
+				    OPAL_RESET_PCI_HOT,
+				    OPAL_ASSERT_RESET);
+	else if (option == EEH_RESET_DEACTIVATE)
+		rc = opal_pci_reset(phb->opal_id,
+				    OPAL_RESET_PCI_HOT,
+				    OPAL_DEASSERT_RESET);
+	if (rc < 0)
+		goto out;
+
+	/* Poll state of the PHB until the request is done */
+	if (rc > 0)
+		rc = pnv_eeh_poll(phb->opal_id);
+	if (option == EEH_RESET_DEACTIVATE)
+		msleep(EEH_PE_RST_SETTLE_TIME);
+out:
+	if (rc != OPAL_SUCCESS)
+		return -EIO;
+
+	return 0;
+}
+
+static int __pnv_eeh_bridge_reset(struct pci_dev *dev, int option)
+{
+	struct pci_dn *pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
+	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+	int aer = edev ? edev->aer_cap : 0;
+	u32 ctrl;
+
+	pr_debug("%s: Secondary Reset PCI bus %04x:%02x with option %d\n",
+		 __func__, pci_domain_nr(dev->bus),
+		 dev->bus->number, option);
+
+	switch (option) {
+	case EEH_RESET_FUNDAMENTAL:
+	case EEH_RESET_HOT:
+		/* Don't report linkDown event */
+		if (aer) {
+			eeh_ops->read_config(edev, aer + PCI_ERR_UNCOR_MASK,
+					     4, &ctrl);
+			ctrl |= PCI_ERR_UNC_SURPDN;
+			eeh_ops->write_config(edev, aer + PCI_ERR_UNCOR_MASK,
+					      4, ctrl);
+		}
+
+		eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, &ctrl);
+		ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
+		eeh_ops->write_config(edev, PCI_BRIDGE_CONTROL, 2, ctrl);
+
+		msleep(EEH_PE_RST_HOLD_TIME);
+		break;
+	case EEH_RESET_DEACTIVATE:
+		eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, &ctrl);
+		ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
+		eeh_ops->write_config(edev, PCI_BRIDGE_CONTROL, 2, ctrl);
+
+		msleep(EEH_PE_RST_SETTLE_TIME);
+
+		/* Continue reporting linkDown event */
+		if (aer) {
+			eeh_ops->read_config(edev, aer + PCI_ERR_UNCOR_MASK,
+					     4, &ctrl);
+			ctrl &= ~PCI_ERR_UNC_SURPDN;
+			eeh_ops->write_config(edev, aer + PCI_ERR_UNCOR_MASK,
+					      4, ctrl);
+		}
+
+		break;
+	}
+
+	return 0;
+}
+
+static int pnv_eeh_bridge_reset(struct pci_dev *pdev, int option)
+{
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct device_node *dn = pci_device_to_OF_node(pdev);
+	uint64_t id = PCI_SLOT_ID(phb->opal_id,
+				  (pdev->bus->number << 8) | pdev->devfn);
+	uint8_t scope;
+	int64_t rc;
+
+	/* Hot reset to the bus if firmware cannot handle */
+	if (!dn || !of_get_property(dn, "ibm,reset-by-firmware", NULL))
+		return __pnv_eeh_bridge_reset(pdev, option);
+
+	pr_debug("%s: FW reset PCI bus %04x:%02x with option %d\n",
+		 __func__, pci_domain_nr(pdev->bus),
+		 pdev->bus->number, option);
+
+	switch (option) {
+	case EEH_RESET_FUNDAMENTAL:
+		scope = OPAL_RESET_PCI_FUNDAMENTAL;
+		break;
+	case EEH_RESET_HOT:
+		scope = OPAL_RESET_PCI_HOT;
+		break;
+	case EEH_RESET_DEACTIVATE:
+		return 0;
+	default:
+		dev_dbg(&pdev->dev, "%s: Unsupported reset %d\n",
+			__func__, option);
+		return -EINVAL;
+	}
+
+	rc = opal_pci_reset(id, scope, OPAL_ASSERT_RESET);
+	if (rc <= OPAL_SUCCESS)
+		goto out;
+
+	rc = pnv_eeh_poll(id);
+out:
+	return (rc == OPAL_SUCCESS) ? 0 : -EIO;
+}
+
+void pnv_pci_reset_secondary_bus(struct pci_dev *dev)
+{
+	struct pci_controller *hose;
+
+	if (pci_is_root_bus(dev->bus)) {
+		hose = pci_bus_to_host(dev->bus);
+		pnv_eeh_root_reset(hose, EEH_RESET_HOT);
+		pnv_eeh_root_reset(hose, EEH_RESET_DEACTIVATE);
+	} else {
+		pnv_eeh_bridge_reset(dev, EEH_RESET_HOT);
+		pnv_eeh_bridge_reset(dev, EEH_RESET_DEACTIVATE);
+	}
+}
+
+static void pnv_eeh_wait_for_pending(struct pci_dn *pdn, const char *type,
+				     int pos, u16 mask)
+{
+	struct eeh_dev *edev = pdn->edev;
+	int i, status = 0;
+
+	/* Wait for Transaction Pending bit to be cleared */
+	for (i = 0; i < 4; i++) {
+		eeh_ops->read_config(edev, pos, 2, &status);
+		if (!(status & mask))
+			return;
+
+		msleep((1 << i) * 100);
+	}
+
+	pr_warn("%s: Pending transaction while issuing %sFLR to %04x:%02x:%02x.%01x\n",
+		__func__, type,
+		pdn->phb->global_number, pdn->busno,
+		PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+}
+
+static int pnv_eeh_do_flr(struct pci_dn *pdn, int option)
+{
+	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+	u32 reg = 0;
+
+	if (WARN_ON(!edev->pcie_cap))
+		return -ENOTTY;
+
+	eeh_ops->read_config(edev, edev->pcie_cap + PCI_EXP_DEVCAP, 4, &reg);
+	if (!(reg & PCI_EXP_DEVCAP_FLR))
+		return -ENOTTY;
+
+	switch (option) {
+	case EEH_RESET_HOT:
+	case EEH_RESET_FUNDAMENTAL:
+		pnv_eeh_wait_for_pending(pdn, "",
+					 edev->pcie_cap + PCI_EXP_DEVSTA,
+					 PCI_EXP_DEVSTA_TRPND);
+		eeh_ops->read_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL,
+				     4, &reg);
+		reg |= PCI_EXP_DEVCTL_BCR_FLR;
+		eeh_ops->write_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL,
+				      4, reg);
+		msleep(EEH_PE_RST_HOLD_TIME);
+		break;
+	case EEH_RESET_DEACTIVATE:
+		eeh_ops->read_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL,
+				     4, &reg);
+		reg &= ~PCI_EXP_DEVCTL_BCR_FLR;
+		eeh_ops->write_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL,
+				      4, reg);
+		msleep(EEH_PE_RST_SETTLE_TIME);
+		break;
+	}
+
+	return 0;
+}
+
+static int pnv_eeh_do_af_flr(struct pci_dn *pdn, int option)
+{
+	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+	u32 cap = 0;
+
+	if (WARN_ON(!edev->af_cap))
+		return -ENOTTY;
+
+	eeh_ops->read_config(edev, edev->af_cap + PCI_AF_CAP, 1, &cap);
+	if (!(cap & PCI_AF_CAP_TP) || !(cap & PCI_AF_CAP_FLR))
+		return -ENOTTY;
+
+	switch (option) {
+	case EEH_RESET_HOT:
+	case EEH_RESET_FUNDAMENTAL:
+		/*
+		 * Wait for Transaction Pending bit to clear. A word-aligned
+		 * test is used, so we use the conrol offset rather than status
+		 * and shift the test bit to match.
+		 */
+		pnv_eeh_wait_for_pending(pdn, "AF",
+					 edev->af_cap + PCI_AF_CTRL,
+					 PCI_AF_STATUS_TP << 8);
+		eeh_ops->write_config(edev, edev->af_cap + PCI_AF_CTRL,
+				      1, PCI_AF_CTRL_FLR);
+		msleep(EEH_PE_RST_HOLD_TIME);
+		break;
+	case EEH_RESET_DEACTIVATE:
+		eeh_ops->write_config(edev, edev->af_cap + PCI_AF_CTRL, 1, 0);
+		msleep(EEH_PE_RST_SETTLE_TIME);
+		break;
+	}
+
+	return 0;
+}
+
+static int pnv_eeh_reset_vf_pe(struct eeh_pe *pe, int option)
+{
+	struct eeh_dev *edev;
+	struct pci_dn *pdn;
+	int ret;
+
+	/* The VF PE should have only one child device */
+	edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry);
+	pdn = eeh_dev_to_pdn(edev);
+	if (!pdn)
+		return -ENXIO;
+
+	ret = pnv_eeh_do_flr(pdn, option);
+	if (!ret)
+		return ret;
+
+	return pnv_eeh_do_af_flr(pdn, option);
+}
+
+/**
+ * pnv_eeh_reset - Reset the specified PE
+ * @pe: EEH PE
+ * @option: reset option
+ *
+ * Do reset on the indicated PE. For PCI bus sensitive PE,
+ * we need to reset the parent p2p bridge. The PHB has to
+ * be reinitialized if the p2p bridge is root bridge. For
+ * PCI device sensitive PE, we will try to reset the device
+ * through FLR. For now, we don't have OPAL APIs to do HARD
+ * reset yet, so all reset would be SOFT (HOT) reset.
+ */
+static int pnv_eeh_reset(struct eeh_pe *pe, int option)
+{
+	struct pci_controller *hose = pe->phb;
+	struct pnv_phb *phb;
+	struct pci_bus *bus;
+	int64_t rc;
+
+	/*
+	 * For PHB reset, we always have complete reset. For those PEs whose
+	 * primary bus derived from root complex (root bus) or root port
+	 * (usually bus#1), we apply hot or fundamental reset on the root port.
+	 * For other PEs, we always have hot reset on the PE primary bus.
+	 *
+	 * Here, we have different design to pHyp, which always clear the
+	 * frozen state during PE reset. However, the good idea here from
+	 * benh is to keep frozen state before we get PE reset done completely
+	 * (until BAR restore). With the frozen state, HW drops illegal IO
+	 * or MMIO access, which can incur recrusive frozen PE during PE
+	 * reset. The side effect is that EEH core has to clear the frozen
+	 * state explicitly after BAR restore.
+	 */
+	if (pe->type & EEH_PE_PHB)
+		return pnv_eeh_phb_reset(hose, option);
+
+	/*
+	 * The frozen PE might be caused by PAPR error injection
+	 * registers, which are expected to be cleared after hitting
+	 * frozen PE as stated in the hardware spec. Unfortunately,
+	 * that's not true on P7IOC. So we have to clear it manually
+	 * to avoid recursive EEH errors during recovery.
+	 */
+	phb = hose->private_data;
+	if (phb->model == PNV_PHB_MODEL_P7IOC &&
+	    (option == EEH_RESET_HOT ||
+	     option == EEH_RESET_FUNDAMENTAL)) {
+		rc = opal_pci_reset(phb->opal_id,
+				    OPAL_RESET_PHB_ERROR,
+				    OPAL_ASSERT_RESET);
+		if (rc != OPAL_SUCCESS) {
+			pr_warn("%s: Failure %lld clearing error injection registers\n",
+				__func__, rc);
+			return -EIO;
+		}
+	}
+
+	if (pe->type & EEH_PE_VF)
+		return pnv_eeh_reset_vf_pe(pe, option);
+
+	bus = eeh_pe_bus_get(pe);
+	if (!bus) {
+		pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
+			__func__, pe->phb->global_number, pe->addr);
+		return -EIO;
+	}
+
+	if (pci_is_root_bus(bus))
+		return pnv_eeh_root_reset(hose, option);
+
+	/*
+	 * For hot resets try use the generic PCI error recovery reset
+	 * functions. These correctly handles the case where the secondary
+	 * bus is behind a hotplug slot and it will use the slot provided
+	 * reset methods to prevent spurious hotplug events during the reset.
+	 *
+	 * Fundemental resets need to be handled internally to EEH since the
+	 * PCI core doesn't really have a concept of a fundemental reset,
+	 * mainly because there's no standard way to generate one. Only a
+	 * few devices require an FRESET so it should be fine.
+	 */
+	if (option != EEH_RESET_FUNDAMENTAL) {
+		/*
+		 * NB: Skiboot and pnv_eeh_bridge_reset() also no-op the
+		 *     de-assert step. It's like the OPAL reset API was
+		 *     poorly designed or something...
+		 */
+		if (option == EEH_RESET_DEACTIVATE)
+			return 0;
+
+		rc = pci_bus_error_reset(bus->self);
+		if (!rc)
+			return 0;
+	}
+
+	/* otherwise, use the generic bridge reset. this might call into FW */
+	if (pci_is_root_bus(bus->parent))
+		return pnv_eeh_root_reset(hose, option);
+	return pnv_eeh_bridge_reset(bus->self, option);
+}
+
+/**
+ * pnv_eeh_get_log - Retrieve error log
+ * @pe: EEH PE
+ * @severity: temporary or permanent error log
+ * @drv_log: driver log to be combined with retrieved error log
+ * @len: length of driver log
+ *
+ * Retrieve the temporary or permanent error from the PE.
+ */
+static int pnv_eeh_get_log(struct eeh_pe *pe, int severity,
+			   char *drv_log, unsigned long len)
+{
+	if (!eeh_has_flag(EEH_EARLY_DUMP_LOG))
+		pnv_pci_dump_phb_diag_data(pe->phb, pe->data);
+
+	return 0;
+}
+
+/**
+ * pnv_eeh_configure_bridge - Configure PCI bridges in the indicated PE
+ * @pe: EEH PE
+ *
+ * The function will be called to reconfigure the bridges included
+ * in the specified PE so that the mulfunctional PE would be recovered
+ * again.
+ */
+static int pnv_eeh_configure_bridge(struct eeh_pe *pe)
+{
+	return 0;
+}
+
+/**
+ * pnv_pe_err_inject - Inject specified error to the indicated PE
+ * @pe: the indicated PE
+ * @type: error type
+ * @func: specific error type
+ * @addr: address
+ * @mask: address mask
+ *
+ * The routine is called to inject specified error, which is
+ * determined by @type and @func, to the indicated PE for
+ * testing purpose.
+ */
+static int pnv_eeh_err_inject(struct eeh_pe *pe, int type, int func,
+			      unsigned long addr, unsigned long mask)
+{
+	struct pci_controller *hose = pe->phb;
+	struct pnv_phb *phb = hose->private_data;
+	s64 rc;
+
+	if (type != OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR &&
+	    type != OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64) {
+		pr_warn("%s: Invalid error type %d\n",
+			__func__, type);
+		return -ERANGE;
+	}
+
+	if (func < OPAL_ERR_INJECT_FUNC_IOA_LD_MEM_ADDR ||
+	    func > OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_TARGET) {
+		pr_warn("%s: Invalid error function %d\n",
+			__func__, func);
+		return -ERANGE;
+	}
+
+	/* Firmware supports error injection ? */
+	if (!opal_check_token(OPAL_PCI_ERR_INJECT)) {
+		pr_warn("%s: Firmware doesn't support error injection\n",
+			__func__);
+		return -ENXIO;
+	}
+
+	/* Do error injection */
+	rc = opal_pci_err_inject(phb->opal_id, pe->addr,
+				 type, func, addr, mask);
+	if (rc != OPAL_SUCCESS) {
+		pr_warn("%s: Failure %lld injecting error "
+			"%d-%d to PHB#%x-PE#%x\n",
+			__func__, rc, type, func,
+			hose->global_number, pe->addr);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static inline bool pnv_eeh_cfg_blocked(struct pci_dn *pdn)
+{
+	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+
+	if (!edev || !edev->pe)
+		return false;
+
+	/*
+	 * We will issue FLR or AF FLR to all VFs, which are contained
+	 * in VF PE. It relies on the EEH PCI config accessors. So we
+	 * can't block them during the window.
+	 */
+	if (edev->physfn && (edev->pe->state & EEH_PE_RESET))
+		return false;
+
+	if (edev->pe->state & EEH_PE_CFG_BLOCKED)
+		return true;
+
+	return false;
+}
+
+static int pnv_eeh_read_config(struct eeh_dev *edev,
+			       int where, int size, u32 *val)
+{
+	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+
+	if (!pdn)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	if (pnv_eeh_cfg_blocked(pdn)) {
+		*val = 0xFFFFFFFF;
+		return PCIBIOS_SET_FAILED;
+	}
+
+	return pnv_pci_cfg_read(pdn, where, size, val);
+}
+
+static int pnv_eeh_write_config(struct eeh_dev *edev,
+				int where, int size, u32 val)
+{
+	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+
+	if (!pdn)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	if (pnv_eeh_cfg_blocked(pdn))
+		return PCIBIOS_SET_FAILED;
+
+	return pnv_pci_cfg_write(pdn, where, size, val);
+}
+
+static void pnv_eeh_dump_hub_diag_common(struct OpalIoP7IOCErrorData *data)
+{
+	/* GEM */
+	if (data->gemXfir || data->gemRfir ||
+	    data->gemRirqfir || data->gemMask || data->gemRwof)
+		pr_info("  GEM: %016llx %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->gemXfir),
+			be64_to_cpu(data->gemRfir),
+			be64_to_cpu(data->gemRirqfir),
+			be64_to_cpu(data->gemMask),
+			be64_to_cpu(data->gemRwof));
+
+	/* LEM */
+	if (data->lemFir || data->lemErrMask ||
+	    data->lemAction0 || data->lemAction1 || data->lemWof)
+		pr_info("  LEM: %016llx %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->lemFir),
+			be64_to_cpu(data->lemErrMask),
+			be64_to_cpu(data->lemAction0),
+			be64_to_cpu(data->lemAction1),
+			be64_to_cpu(data->lemWof));
+}
+
+static void pnv_eeh_get_and_dump_hub_diag(struct pci_controller *hose)
+{
+	struct pnv_phb *phb = hose->private_data;
+	struct OpalIoP7IOCErrorData *data =
+		(struct OpalIoP7IOCErrorData*)phb->diag_data;
+	long rc;
+
+	rc = opal_pci_get_hub_diag_data(phb->hub_id, data, sizeof(*data));
+	if (rc != OPAL_SUCCESS) {
+		pr_warn("%s: Failed to get HUB#%llx diag-data (%ld)\n",
+			__func__, phb->hub_id, rc);
+		return;
+	}
+
+	switch (be16_to_cpu(data->type)) {
+	case OPAL_P7IOC_DIAG_TYPE_RGC:
+		pr_info("P7IOC diag-data for RGC\n\n");
+		pnv_eeh_dump_hub_diag_common(data);
+		if (data->rgc.rgcStatus || data->rgc.rgcLdcp)
+			pr_info("  RGC: %016llx %016llx\n",
+				be64_to_cpu(data->rgc.rgcStatus),
+				be64_to_cpu(data->rgc.rgcLdcp));
+		break;
+	case OPAL_P7IOC_DIAG_TYPE_BI:
+		pr_info("P7IOC diag-data for BI %s\n\n",
+			data->bi.biDownbound ? "Downbound" : "Upbound");
+		pnv_eeh_dump_hub_diag_common(data);
+		if (data->bi.biLdcp0 || data->bi.biLdcp1 ||
+		    data->bi.biLdcp2 || data->bi.biFenceStatus)
+			pr_info("  BI:  %016llx %016llx %016llx %016llx\n",
+				be64_to_cpu(data->bi.biLdcp0),
+				be64_to_cpu(data->bi.biLdcp1),
+				be64_to_cpu(data->bi.biLdcp2),
+				be64_to_cpu(data->bi.biFenceStatus));
+		break;
+	case OPAL_P7IOC_DIAG_TYPE_CI:
+		pr_info("P7IOC diag-data for CI Port %d\n\n",
+			data->ci.ciPort);
+		pnv_eeh_dump_hub_diag_common(data);
+		if (data->ci.ciPortStatus || data->ci.ciPortLdcp)
+			pr_info("  CI:  %016llx %016llx\n",
+				be64_to_cpu(data->ci.ciPortStatus),
+				be64_to_cpu(data->ci.ciPortLdcp));
+		break;
+	case OPAL_P7IOC_DIAG_TYPE_MISC:
+		pr_info("P7IOC diag-data for MISC\n\n");
+		pnv_eeh_dump_hub_diag_common(data);
+		break;
+	case OPAL_P7IOC_DIAG_TYPE_I2C:
+		pr_info("P7IOC diag-data for I2C\n\n");
+		pnv_eeh_dump_hub_diag_common(data);
+		break;
+	default:
+		pr_warn("%s: Invalid type of HUB#%llx diag-data (%d)\n",
+			__func__, phb->hub_id, data->type);
+	}
+}
+
+static int pnv_eeh_get_pe(struct pci_controller *hose,
+			  u16 pe_no, struct eeh_pe **pe)
+{
+	struct pnv_phb *phb = hose->private_data;
+	struct pnv_ioda_pe *pnv_pe;
+	struct eeh_pe *dev_pe;
+
+	/*
+	 * If PHB supports compound PE, to fetch
+	 * the master PE because slave PE is invisible
+	 * to EEH core.
+	 */
+	pnv_pe = &phb->ioda.pe_array[pe_no];
+	if (pnv_pe->flags & PNV_IODA_PE_SLAVE) {
+		pnv_pe = pnv_pe->master;
+		WARN_ON(!pnv_pe ||
+			!(pnv_pe->flags & PNV_IODA_PE_MASTER));
+		pe_no = pnv_pe->pe_number;
+	}
+
+	/* Find the PE according to PE# */
+	dev_pe = eeh_pe_get(hose, pe_no);
+	if (!dev_pe)
+		return -EEXIST;
+
+	/* Freeze the (compound) PE */
+	*pe = dev_pe;
+	if (!(dev_pe->state & EEH_PE_ISOLATED))
+		phb->freeze_pe(phb, pe_no);
+
+	/*
+	 * At this point, we're sure the (compound) PE should
+	 * have been frozen. However, we still need poke until
+	 * hitting the frozen PE on top level.
+	 */
+	dev_pe = dev_pe->parent;
+	while (dev_pe && !(dev_pe->type & EEH_PE_PHB)) {
+		int ret;
+		ret = eeh_ops->get_state(dev_pe, NULL);
+		if (ret <= 0 || eeh_state_active(ret)) {
+			dev_pe = dev_pe->parent;
+			continue;
+		}
+
+		/* Frozen parent PE */
+		*pe = dev_pe;
+		if (!(dev_pe->state & EEH_PE_ISOLATED))
+			phb->freeze_pe(phb, dev_pe->addr);
+
+		/* Next one */
+		dev_pe = dev_pe->parent;
+	}
+
+	return 0;
+}
+
+/**
+ * pnv_eeh_next_error - Retrieve next EEH error to handle
+ * @pe: Affected PE
+ *
+ * The function is expected to be called by EEH core while it gets
+ * special EEH event (without binding PE). The function calls to
+ * OPAL APIs for next error to handle. The informational error is
+ * handled internally by platform. However, the dead IOC, dead PHB,
+ * fenced PHB and frozen PE should be handled by EEH core eventually.
+ */
+static int pnv_eeh_next_error(struct eeh_pe **pe)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	struct eeh_pe *phb_pe, *parent_pe;
+	__be64 frozen_pe_no;
+	__be16 err_type, severity;
+	long rc;
+	int state, ret = EEH_NEXT_ERR_NONE;
+
+	/*
+	 * While running here, it's safe to purge the event queue. The
+	 * event should still be masked.
+	 */
+	eeh_remove_event(NULL, false);
+
+	list_for_each_entry(hose, &hose_list, list_node) {
+		/*
+		 * If the subordinate PCI buses of the PHB has been
+		 * removed or is exactly under error recovery, we
+		 * needn't take care of it any more.
+		 */
+		phb = hose->private_data;
+		phb_pe = eeh_phb_pe_get(hose);
+		if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED))
+			continue;
+
+		rc = opal_pci_next_error(phb->opal_id,
+					 &frozen_pe_no, &err_type, &severity);
+		if (rc != OPAL_SUCCESS) {
+			pr_devel("%s: Invalid return value on "
+				 "PHB#%x (0x%lx) from opal_pci_next_error",
+				 __func__, hose->global_number, rc);
+			continue;
+		}
+
+		/* If the PHB doesn't have error, stop processing */
+		if (be16_to_cpu(err_type) == OPAL_EEH_NO_ERROR ||
+		    be16_to_cpu(severity) == OPAL_EEH_SEV_NO_ERROR) {
+			pr_devel("%s: No error found on PHB#%x\n",
+				 __func__, hose->global_number);
+			continue;
+		}
+
+		/*
+		 * Processing the error. We're expecting the error with
+		 * highest priority reported upon multiple errors on the
+		 * specific PHB.
+		 */
+		pr_devel("%s: Error (%d, %d, %llu) on PHB#%x\n",
+			__func__, be16_to_cpu(err_type),
+			be16_to_cpu(severity), be64_to_cpu(frozen_pe_no),
+			hose->global_number);
+		switch (be16_to_cpu(err_type)) {
+		case OPAL_EEH_IOC_ERROR:
+			if (be16_to_cpu(severity) == OPAL_EEH_SEV_IOC_DEAD) {
+				pr_err("EEH: dead IOC detected\n");
+				ret = EEH_NEXT_ERR_DEAD_IOC;
+			} else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) {
+				pr_info("EEH: IOC informative error "
+					"detected\n");
+				pnv_eeh_get_and_dump_hub_diag(hose);
+				ret = EEH_NEXT_ERR_NONE;
+			}
+
+			break;
+		case OPAL_EEH_PHB_ERROR:
+			if (be16_to_cpu(severity) == OPAL_EEH_SEV_PHB_DEAD) {
+				*pe = phb_pe;
+				pr_err("EEH: dead PHB#%x detected, "
+				       "location: %s\n",
+					hose->global_number,
+					eeh_pe_loc_get(phb_pe));
+				ret = EEH_NEXT_ERR_DEAD_PHB;
+			} else if (be16_to_cpu(severity) ==
+				   OPAL_EEH_SEV_PHB_FENCED) {
+				*pe = phb_pe;
+				pr_err("EEH: Fenced PHB#%x detected, "
+				       "location: %s\n",
+					hose->global_number,
+					eeh_pe_loc_get(phb_pe));
+				ret = EEH_NEXT_ERR_FENCED_PHB;
+			} else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) {
+				pr_info("EEH: PHB#%x informative error "
+					"detected, location: %s\n",
+					hose->global_number,
+					eeh_pe_loc_get(phb_pe));
+				pnv_eeh_get_phb_diag(phb_pe);
+				pnv_pci_dump_phb_diag_data(hose, phb_pe->data);
+				ret = EEH_NEXT_ERR_NONE;
+			}
+
+			break;
+		case OPAL_EEH_PE_ERROR:
+			/*
+			 * If we can't find the corresponding PE, we
+			 * just try to unfreeze.
+			 */
+			if (pnv_eeh_get_pe(hose,
+				be64_to_cpu(frozen_pe_no), pe)) {
+				pr_info("EEH: Clear non-existing PHB#%x-PE#%llx\n",
+					hose->global_number, be64_to_cpu(frozen_pe_no));
+				pr_info("EEH: PHB location: %s\n",
+					eeh_pe_loc_get(phb_pe));
+
+				/* Dump PHB diag-data */
+				rc = opal_pci_get_phb_diag_data2(phb->opal_id,
+					phb->diag_data, phb->diag_data_size);
+				if (rc == OPAL_SUCCESS)
+					pnv_pci_dump_phb_diag_data(hose,
+							phb->diag_data);
+
+				/* Try best to clear it */
+				opal_pci_eeh_freeze_clear(phb->opal_id,
+					be64_to_cpu(frozen_pe_no),
+					OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+				ret = EEH_NEXT_ERR_NONE;
+			} else if ((*pe)->state & EEH_PE_ISOLATED ||
+				   eeh_pe_passed(*pe)) {
+				ret = EEH_NEXT_ERR_NONE;
+			} else {
+				pr_err("EEH: Frozen PE#%x "
+				       "on PHB#%x detected\n",
+				       (*pe)->addr,
+					(*pe)->phb->global_number);
+				pr_err("EEH: PE location: %s, "
+				       "PHB location: %s\n",
+				       eeh_pe_loc_get(*pe),
+				       eeh_pe_loc_get(phb_pe));
+				ret = EEH_NEXT_ERR_FROZEN_PE;
+			}
+
+			break;
+		default:
+			pr_warn("%s: Unexpected error type %d\n",
+				__func__, be16_to_cpu(err_type));
+		}
+
+		/*
+		 * EEH core will try recover from fenced PHB or
+		 * frozen PE. In the time for frozen PE, EEH core
+		 * enable IO path for that before collecting logs,
+		 * but it ruins the site. So we have to dump the
+		 * log in advance here.
+		 */
+		if ((ret == EEH_NEXT_ERR_FROZEN_PE  ||
+		    ret == EEH_NEXT_ERR_FENCED_PHB) &&
+		    !((*pe)->state & EEH_PE_ISOLATED)) {
+			eeh_pe_mark_isolated(*pe);
+			pnv_eeh_get_phb_diag(*pe);
+
+			if (eeh_has_flag(EEH_EARLY_DUMP_LOG))
+				pnv_pci_dump_phb_diag_data((*pe)->phb,
+							   (*pe)->data);
+		}
+
+		/*
+		 * We probably have the frozen parent PE out there and
+		 * we need have to handle frozen parent PE firstly.
+		 */
+		if (ret == EEH_NEXT_ERR_FROZEN_PE) {
+			parent_pe = (*pe)->parent;
+			while (parent_pe) {
+				/* Hit the ceiling ? */
+				if (parent_pe->type & EEH_PE_PHB)
+					break;
+
+				/* Frozen parent PE ? */
+				state = eeh_ops->get_state(parent_pe, NULL);
+				if (state > 0 && !eeh_state_active(state))
+					*pe = parent_pe;
+
+				/* Next parent level */
+				parent_pe = parent_pe->parent;
+			}
+
+			/* We possibly migrate to another PE */
+			eeh_pe_mark_isolated(*pe);
+		}
+
+		/*
+		 * If we have no errors on the specific PHB or only
+		 * informative error there, we continue poking it.
+		 * Otherwise, we need actions to be taken by upper
+		 * layer.
+		 */
+		if (ret > EEH_NEXT_ERR_INF)
+			break;
+	}
+
+	/* Unmask the event */
+	if (ret == EEH_NEXT_ERR_NONE && eeh_enabled())
+		enable_irq(eeh_event_irq);
+
+	return ret;
+}
+
+static int pnv_eeh_restore_config(struct eeh_dev *edev)
+{
+	struct pnv_phb *phb;
+	s64 ret = 0;
+
+	if (!edev)
+		return -EEXIST;
+
+	if (edev->physfn)
+		return 0;
+
+	phb = edev->controller->private_data;
+	ret = opal_pci_reinit(phb->opal_id,
+			      OPAL_REINIT_PCI_DEV, edev->bdfn);
+
+	if (ret) {
+		pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n",
+			__func__, edev->bdfn, ret);
+		return -EIO;
+	}
+
+	return ret;
+}
+
+static struct eeh_ops pnv_eeh_ops = {
+	.name                   = "powernv",
+	.probe			= pnv_eeh_probe,
+	.set_option             = pnv_eeh_set_option,
+	.get_state              = pnv_eeh_get_state,
+	.reset                  = pnv_eeh_reset,
+	.get_log                = pnv_eeh_get_log,
+	.configure_bridge       = pnv_eeh_configure_bridge,
+	.err_inject		= pnv_eeh_err_inject,
+	.read_config            = pnv_eeh_read_config,
+	.write_config           = pnv_eeh_write_config,
+	.next_error		= pnv_eeh_next_error,
+	.restore_config		= pnv_eeh_restore_config,
+	.notify_resume		= NULL
+};
+
+#ifdef CONFIG_PCI_IOV
+static void pnv_pci_fixup_vf_mps(struct pci_dev *pdev)
+{
+	struct pci_dn *pdn = pci_get_pdn(pdev);
+	int parent_mps;
+
+	if (!pdev->is_virtfn)
+		return;
+
+	/* Synchronize MPS for VF and PF */
+	parent_mps = pcie_get_mps(pdev->physfn);
+	if ((128 << pdev->pcie_mpss) >= parent_mps)
+		pcie_set_mps(pdev, parent_mps);
+	pdn->mps = pcie_get_mps(pdev);
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pnv_pci_fixup_vf_mps);
+#endif /* CONFIG_PCI_IOV */
+
+/**
+ * eeh_powernv_init - Register platform dependent EEH operations
+ *
+ * EEH initialization on powernv platform. This function should be
+ * called before any EEH related functions.
+ */
+static int __init eeh_powernv_init(void)
+{
+	int max_diag_size = PNV_PCI_DIAG_BUF_SIZE;
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	int ret = -EINVAL;
+
+	if (!firmware_has_feature(FW_FEATURE_OPAL)) {
+		pr_warn("%s: OPAL is required !\n", __func__);
+		return -EINVAL;
+	}
+
+	/* Set probe mode */
+	eeh_add_flag(EEH_PROBE_MODE_DEV);
+
+	/*
+	 * P7IOC blocks PCI config access to frozen PE, but PHB3
+	 * doesn't do that. So we have to selectively enable I/O
+	 * prior to collecting error log.
+	 */
+	list_for_each_entry(hose, &hose_list, list_node) {
+		phb = hose->private_data;
+
+		if (phb->model == PNV_PHB_MODEL_P7IOC)
+			eeh_add_flag(EEH_ENABLE_IO_FOR_LOG);
+
+		if (phb->diag_data_size > max_diag_size)
+			max_diag_size = phb->diag_data_size;
+
+		break;
+	}
+
+	/*
+	 * eeh_init() allocates the eeh_pe and its aux data buf so the
+	 * size needs to be set before calling eeh_init().
+	 */
+	eeh_set_pe_aux_size(max_diag_size);
+	ppc_md.pcibios_bus_add_device = pnv_pcibios_bus_add_device;
+
+	ret = eeh_init(&pnv_eeh_ops);
+	if (!ret)
+		pr_info("EEH: PowerNV platform initialized\n");
+	else
+		pr_info("EEH: Failed to initialize PowerNV platform (%d)\n", ret);
+
+	return ret;
+}
+machine_arch_initcall(powernv, eeh_powernv_init);
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
new file mode 100644
index 000000000..1ed7c5286
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -0,0 +1,1535 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerNV cpuidle code
+ *
+ * Copyright 2015 IBM Corp.
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+
+#include <asm/asm-prototypes.h>
+#include <asm/firmware.h>
+#include <asm/machdep.h>
+#include <asm/opal.h>
+#include <asm/cputhreads.h>
+#include <asm/cpuidle.h>
+#include <asm/code-patching.h>
+#include <asm/smp.h>
+#include <asm/runlatch.h>
+#include <asm/dbell.h>
+
+#include "powernv.h"
+#include "subcore.h"
+
+/* Power ISA 3.0 allows for stop states 0x0 - 0xF */
+#define MAX_STOP_STATE	0xF
+
+#define P9_STOP_SPR_MSR 2000
+#define P9_STOP_SPR_PSSCR      855
+
+static u32 supported_cpuidle_states;
+struct pnv_idle_states_t *pnv_idle_states;
+int nr_pnv_idle_states;
+
+/*
+ * The default stop state that will be used by ppc_md.power_save
+ * function on platforms that support stop instruction.
+ */
+static u64 pnv_default_stop_val;
+static u64 pnv_default_stop_mask;
+static bool default_stop_found;
+
+/*
+ * First stop state levels when SPR and TB loss can occur.
+ */
+static u64 pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
+static u64 deep_spr_loss_state = MAX_STOP_STATE + 1;
+
+/*
+ * psscr value and mask of the deepest stop idle state.
+ * Used when a cpu is offlined.
+ */
+static u64 pnv_deepest_stop_psscr_val;
+static u64 pnv_deepest_stop_psscr_mask;
+static u64 pnv_deepest_stop_flag;
+static bool deepest_stop_found;
+
+static unsigned long power7_offline_type;
+
+static int pnv_save_sprs_for_deep_states(void)
+{
+	int cpu;
+	int rc;
+
+	/*
+	 * hid0, hid1, hid4, hid5, hmeer and lpcr values are symmetric across
+	 * all cpus at boot. Get these reg values of current cpu and use the
+	 * same across all cpus.
+	 */
+	uint64_t lpcr_val	= mfspr(SPRN_LPCR);
+	uint64_t hid0_val	= mfspr(SPRN_HID0);
+	uint64_t hmeer_val	= mfspr(SPRN_HMEER);
+	uint64_t msr_val = MSR_IDLE;
+	uint64_t psscr_val = pnv_deepest_stop_psscr_val;
+
+	for_each_present_cpu(cpu) {
+		uint64_t pir = get_hard_smp_processor_id(cpu);
+		uint64_t hsprg0_val = (uint64_t)paca_ptrs[cpu];
+
+		rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val);
+		if (rc != 0)
+			return rc;
+
+		rc = opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val);
+		if (rc != 0)
+			return rc;
+
+		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+			rc = opal_slw_set_reg(pir, P9_STOP_SPR_MSR, msr_val);
+			if (rc)
+				return rc;
+
+			rc = opal_slw_set_reg(pir,
+					      P9_STOP_SPR_PSSCR, psscr_val);
+
+			if (rc)
+				return rc;
+		}
+
+		/* HIDs are per core registers */
+		if (cpu_thread_in_core(cpu) == 0) {
+
+			rc = opal_slw_set_reg(pir, SPRN_HMEER, hmeer_val);
+			if (rc != 0)
+				return rc;
+
+			rc = opal_slw_set_reg(pir, SPRN_HID0, hid0_val);
+			if (rc != 0)
+				return rc;
+
+			/* Only p8 needs to set extra HID regiters */
+			if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+				uint64_t hid1_val = mfspr(SPRN_HID1);
+				uint64_t hid4_val = mfspr(SPRN_HID4);
+				uint64_t hid5_val = mfspr(SPRN_HID5);
+
+				rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val);
+				if (rc != 0)
+					return rc;
+
+				rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val);
+				if (rc != 0)
+					return rc;
+
+				rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val);
+				if (rc != 0)
+					return rc;
+			}
+		}
+	}
+
+	return 0;
+}
+
+u32 pnv_get_supported_cpuidle_states(void)
+{
+	return supported_cpuidle_states;
+}
+EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states);
+
+static void pnv_fastsleep_workaround_apply(void *info)
+
+{
+	int rc;
+	int *err = info;
+
+	rc = opal_config_cpu_idle_state(OPAL_CONFIG_IDLE_FASTSLEEP,
+					OPAL_CONFIG_IDLE_APPLY);
+	if (rc)
+		*err = 1;
+}
+
+static bool power7_fastsleep_workaround_entry = true;
+static bool power7_fastsleep_workaround_exit = true;
+
+/*
+ * Used to store fastsleep workaround state
+ * 0 - Workaround applied/undone at fastsleep entry/exit path (Default)
+ * 1 - Workaround applied once, never undone.
+ */
+static u8 fastsleep_workaround_applyonce;
+
+static ssize_t show_fastsleep_workaround_applyonce(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", fastsleep_workaround_applyonce);
+}
+
+static ssize_t store_fastsleep_workaround_applyonce(struct device *dev,
+		struct device_attribute *attr, const char *buf,
+		size_t count)
+{
+	cpumask_t primary_thread_mask;
+	int err;
+	u8 val;
+
+	if (kstrtou8(buf, 0, &val) || val != 1)
+		return -EINVAL;
+
+	if (fastsleep_workaround_applyonce == 1)
+		return count;
+
+	/*
+	 * fastsleep_workaround_applyonce = 1 implies
+	 * fastsleep workaround needs to be left in 'applied' state on all
+	 * the cores. Do this by-
+	 * 1. Disable the 'undo' workaround in fastsleep exit path
+	 * 2. Sendi IPIs to all the cores which have at least one online thread
+	 * 3. Disable the 'apply' workaround in fastsleep entry path
+	 *
+	 * There is no need to send ipi to cores which have all threads
+	 * offlined, as last thread of the core entering fastsleep or deeper
+	 * state would have applied workaround.
+	 */
+	power7_fastsleep_workaround_exit = false;
+
+	get_online_cpus();
+	primary_thread_mask = cpu_online_cores_map();
+	on_each_cpu_mask(&primary_thread_mask,
+				pnv_fastsleep_workaround_apply,
+				&err, 1);
+	put_online_cpus();
+	if (err) {
+		pr_err("fastsleep_workaround_applyonce change failed while running pnv_fastsleep_workaround_apply");
+		goto fail;
+	}
+
+	power7_fastsleep_workaround_entry = false;
+
+	fastsleep_workaround_applyonce = 1;
+
+	return count;
+fail:
+	return -EIO;
+}
+
+static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600,
+			show_fastsleep_workaround_applyonce,
+			store_fastsleep_workaround_applyonce);
+
+static inline void atomic_start_thread_idle(void)
+{
+	int cpu = raw_smp_processor_id();
+	int first = cpu_first_thread_sibling(cpu);
+	int thread_nr = cpu_thread_in_core(cpu);
+	unsigned long *state = &paca_ptrs[first]->idle_state;
+
+	clear_bit(thread_nr, state);
+}
+
+static inline void atomic_stop_thread_idle(void)
+{
+	int cpu = raw_smp_processor_id();
+	int first = cpu_first_thread_sibling(cpu);
+	int thread_nr = cpu_thread_in_core(cpu);
+	unsigned long *state = &paca_ptrs[first]->idle_state;
+
+	set_bit(thread_nr, state);
+}
+
+static inline void atomic_lock_thread_idle(void)
+{
+	int cpu = raw_smp_processor_id();
+	int first = cpu_first_thread_sibling(cpu);
+	unsigned long *state = &paca_ptrs[first]->idle_state;
+
+	while (unlikely(test_and_set_bit_lock(NR_PNV_CORE_IDLE_LOCK_BIT, state)))
+		barrier();
+}
+
+static inline void atomic_unlock_and_stop_thread_idle(void)
+{
+	int cpu = raw_smp_processor_id();
+	int first = cpu_first_thread_sibling(cpu);
+	unsigned long thread = 1UL << cpu_thread_in_core(cpu);
+	unsigned long *state = &paca_ptrs[first]->idle_state;
+	u64 s = READ_ONCE(*state);
+	u64 new, tmp;
+
+	BUG_ON(!(s & PNV_CORE_IDLE_LOCK_BIT));
+	BUG_ON(s & thread);
+
+again:
+	new = (s | thread) & ~PNV_CORE_IDLE_LOCK_BIT;
+	tmp = cmpxchg(state, s, new);
+	if (unlikely(tmp != s)) {
+		s = tmp;
+		goto again;
+	}
+}
+
+static inline void atomic_unlock_thread_idle(void)
+{
+	int cpu = raw_smp_processor_id();
+	int first = cpu_first_thread_sibling(cpu);
+	unsigned long *state = &paca_ptrs[first]->idle_state;
+
+	BUG_ON(!test_bit(NR_PNV_CORE_IDLE_LOCK_BIT, state));
+	clear_bit_unlock(NR_PNV_CORE_IDLE_LOCK_BIT, state);
+}
+
+/* P7 and P8 */
+struct p7_sprs {
+	/* per core */
+	u64 tscr;
+	u64 worc;
+
+	/* per subcore */
+	u64 sdr1;
+	u64 rpr;
+
+	/* per thread */
+	u64 lpcr;
+	u64 hfscr;
+	u64 fscr;
+	u64 purr;
+	u64 spurr;
+	u64 dscr;
+	u64 wort;
+
+	/* per thread SPRs that get lost in shallow states */
+	u64 amr;
+	u64 iamr;
+	u64 amor;
+	u64 uamor;
+};
+
+static unsigned long power7_idle_insn(unsigned long type)
+{
+	int cpu = raw_smp_processor_id();
+	int first = cpu_first_thread_sibling(cpu);
+	unsigned long *state = &paca_ptrs[first]->idle_state;
+	unsigned long thread = 1UL << cpu_thread_in_core(cpu);
+	unsigned long core_thread_mask = (1UL << threads_per_core) - 1;
+	unsigned long srr1;
+	bool full_winkle;
+	struct p7_sprs sprs = {}; /* avoid false use-uninitialised */
+	bool sprs_saved = false;
+	int rc;
+
+	if (unlikely(type != PNV_THREAD_NAP)) {
+		atomic_lock_thread_idle();
+
+		BUG_ON(!(*state & thread));
+		*state &= ~thread;
+
+		if (power7_fastsleep_workaround_entry) {
+			if ((*state & core_thread_mask) == 0) {
+				rc = opal_config_cpu_idle_state(
+						OPAL_CONFIG_IDLE_FASTSLEEP,
+						OPAL_CONFIG_IDLE_APPLY);
+				BUG_ON(rc);
+			}
+		}
+
+		if (type == PNV_THREAD_WINKLE) {
+			sprs.tscr	= mfspr(SPRN_TSCR);
+			sprs.worc	= mfspr(SPRN_WORC);
+
+			sprs.sdr1	= mfspr(SPRN_SDR1);
+			sprs.rpr	= mfspr(SPRN_RPR);
+
+			sprs.lpcr	= mfspr(SPRN_LPCR);
+			if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+				sprs.hfscr	= mfspr(SPRN_HFSCR);
+				sprs.fscr	= mfspr(SPRN_FSCR);
+			}
+			sprs.purr	= mfspr(SPRN_PURR);
+			sprs.spurr	= mfspr(SPRN_SPURR);
+			sprs.dscr	= mfspr(SPRN_DSCR);
+			sprs.wort	= mfspr(SPRN_WORT);
+
+			sprs_saved = true;
+
+			/*
+			 * Increment winkle counter and set all winkle bits if
+			 * all threads are winkling. This allows wakeup side to
+			 * distinguish between fast sleep and winkle state
+			 * loss. Fast sleep still has to resync the timebase so
+			 * this may not be a really big win.
+			 */
+			*state += 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT;
+			if ((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS)
+					>> PNV_CORE_IDLE_WINKLE_COUNT_SHIFT
+					== threads_per_core)
+				*state |= PNV_CORE_IDLE_THREAD_WINKLE_BITS;
+			WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0);
+		}
+
+		atomic_unlock_thread_idle();
+	}
+
+	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+		sprs.amr	= mfspr(SPRN_AMR);
+		sprs.iamr	= mfspr(SPRN_IAMR);
+		sprs.amor	= mfspr(SPRN_AMOR);
+		sprs.uamor	= mfspr(SPRN_UAMOR);
+	}
+
+	local_paca->thread_idle_state = type;
+	srr1 = isa206_idle_insn_mayloss(type);		/* go idle */
+	local_paca->thread_idle_state = PNV_THREAD_RUNNING;
+
+	WARN_ON_ONCE(!srr1);
+	WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR));
+
+	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+		if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) {
+			/*
+			 * We don't need an isync after the mtsprs here because
+			 * the upcoming mtmsrd is execution synchronizing.
+			 */
+			mtspr(SPRN_AMR,		sprs.amr);
+			mtspr(SPRN_IAMR,	sprs.iamr);
+			mtspr(SPRN_AMOR,	sprs.amor);
+			mtspr(SPRN_UAMOR,	sprs.uamor);
+		}
+	}
+
+	if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI))
+		hmi_exception_realmode(NULL);
+
+	if (likely((srr1 & SRR1_WAKESTATE) != SRR1_WS_HVLOSS)) {
+		if (unlikely(type != PNV_THREAD_NAP)) {
+			atomic_lock_thread_idle();
+			if (type == PNV_THREAD_WINKLE) {
+				WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0);
+				*state -= 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT;
+				*state &= ~(thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT);
+			}
+			atomic_unlock_and_stop_thread_idle();
+		}
+		return srr1;
+	}
+
+	/* HV state loss */
+	BUG_ON(type == PNV_THREAD_NAP);
+
+	atomic_lock_thread_idle();
+
+	full_winkle = false;
+	if (type == PNV_THREAD_WINKLE) {
+		WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0);
+		*state -= 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT;
+		if (*state & (thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT)) {
+			*state &= ~(thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT);
+			full_winkle = true;
+			BUG_ON(!sprs_saved);
+		}
+	}
+
+	WARN_ON(*state & thread);
+
+	if ((*state & core_thread_mask) != 0)
+		goto core_woken;
+
+	/* Per-core SPRs */
+	if (full_winkle) {
+		mtspr(SPRN_TSCR,	sprs.tscr);
+		mtspr(SPRN_WORC,	sprs.worc);
+	}
+
+	if (power7_fastsleep_workaround_exit) {
+		rc = opal_config_cpu_idle_state(OPAL_CONFIG_IDLE_FASTSLEEP,
+						OPAL_CONFIG_IDLE_UNDO);
+		BUG_ON(rc);
+	}
+
+	/* TB */
+	if (opal_resync_timebase() != OPAL_SUCCESS)
+		BUG();
+
+core_woken:
+	if (!full_winkle)
+		goto subcore_woken;
+
+	if ((*state & local_paca->subcore_sibling_mask) != 0)
+		goto subcore_woken;
+
+	/* Per-subcore SPRs */
+	mtspr(SPRN_SDR1,	sprs.sdr1);
+	mtspr(SPRN_RPR,		sprs.rpr);
+
+subcore_woken:
+	/*
+	 * isync after restoring shared SPRs and before unlocking. Unlock
+	 * only contains hwsync which does not necessarily do the right
+	 * thing for SPRs.
+	 */
+	isync();
+	atomic_unlock_and_stop_thread_idle();
+
+	/* Fast sleep does not lose SPRs */
+	if (!full_winkle)
+		return srr1;
+
+	/* Per-thread SPRs */
+	mtspr(SPRN_LPCR,	sprs.lpcr);
+	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+		mtspr(SPRN_HFSCR,	sprs.hfscr);
+		mtspr(SPRN_FSCR,	sprs.fscr);
+	}
+	mtspr(SPRN_PURR,	sprs.purr);
+	mtspr(SPRN_SPURR,	sprs.spurr);
+	mtspr(SPRN_DSCR,	sprs.dscr);
+	mtspr(SPRN_WORT,	sprs.wort);
+
+	mtspr(SPRN_SPRG3,	local_paca->sprg_vdso);
+
+	/*
+	 * The SLB has to be restored here, but it sometimes still
+	 * contains entries, so the __ variant must be used to prevent
+	 * multi hits.
+	 */
+	__slb_restore_bolted_realmode();
+
+	return srr1;
+}
+
+extern unsigned long idle_kvm_start_guest(unsigned long srr1);
+
+#ifdef CONFIG_HOTPLUG_CPU
+static unsigned long power7_offline(void)
+{
+	unsigned long srr1;
+
+	mtmsr(MSR_IDLE);
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	/* Tell KVM we're entering idle. */
+	/******************************************************/
+	/*  N O T E   W E L L    ! ! !    N O T E   W E L L   */
+	/* The following store to HSTATE_HWTHREAD_STATE(r13)  */
+	/* MUST occur in real mode, i.e. with the MMU off,    */
+	/* and the MMU must stay off until we clear this flag */
+	/* and test HSTATE_HWTHREAD_REQ(r13) in               */
+	/* pnv_powersave_wakeup in this file.                 */
+	/* The reason is that another thread can switch the   */
+	/* MMU to a guest context whenever this flag is set   */
+	/* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on,    */
+	/* that would potentially cause this thread to start  */
+	/* executing instructions from guest memory in        */
+	/* hypervisor mode, leading to a host crash or data   */
+	/* corruption, or worse.                              */
+	/******************************************************/
+	local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE;
+#endif
+
+	__ppc64_runlatch_off();
+	srr1 = power7_idle_insn(power7_offline_type);
+	__ppc64_runlatch_on();
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL;
+	/* Order setting hwthread_state vs. testing hwthread_req */
+	smp_mb();
+	if (local_paca->kvm_hstate.hwthread_req)
+		srr1 = idle_kvm_start_guest(srr1);
+#endif
+
+	mtmsr(MSR_KERNEL);
+
+	return srr1;
+}
+#endif
+
+void power7_idle_type(unsigned long type)
+{
+	unsigned long srr1;
+
+	if (!prep_irq_for_idle_irqsoff())
+		return;
+
+	mtmsr(MSR_IDLE);
+	__ppc64_runlatch_off();
+	srr1 = power7_idle_insn(type);
+	__ppc64_runlatch_on();
+	mtmsr(MSR_KERNEL);
+
+	fini_irq_for_idle_irqsoff();
+	irq_set_pending_from_srr1(srr1);
+}
+
+static void power7_idle(void)
+{
+	if (!powersave_nap)
+		return;
+
+	power7_idle_type(PNV_THREAD_NAP);
+}
+
+struct p9_sprs {
+	/* per core */
+	u64 ptcr;
+	u64 rpr;
+	u64 tscr;
+	u64 ldbar;
+
+	/* per thread */
+	u64 lpcr;
+	u64 hfscr;
+	u64 fscr;
+	u64 pid;
+	u64 purr;
+	u64 spurr;
+	u64 dscr;
+	u64 wort;
+
+	u64 mmcra;
+	u32 mmcr0;
+	u32 mmcr1;
+	u64 mmcr2;
+
+	/* per thread SPRs that get lost in shallow states */
+	u64 amr;
+	u64 iamr;
+	u64 amor;
+	u64 uamor;
+};
+
+static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
+{
+	int cpu = raw_smp_processor_id();
+	int first = cpu_first_thread_sibling(cpu);
+	unsigned long *state = &paca_ptrs[first]->idle_state;
+	unsigned long core_thread_mask = (1UL << threads_per_core) - 1;
+	unsigned long srr1;
+	unsigned long pls;
+	unsigned long mmcr0 = 0;
+	unsigned long mmcra = 0;
+	struct p9_sprs sprs = {}; /* avoid false used-uninitialised */
+	bool sprs_saved = false;
+
+	if (!(psscr & (PSSCR_EC|PSSCR_ESL))) {
+		/* EC=ESL=0 case */
+
+		BUG_ON(!mmu_on);
+
+		/*
+		 * Wake synchronously. SRESET via xscom may still cause
+		 * a 0x100 powersave wakeup with SRR1 reason!
+		 */
+		srr1 = isa300_idle_stop_noloss(psscr);		/* go idle */
+		if (likely(!srr1))
+			return 0;
+
+		/*
+		 * Registers not saved, can't recover!
+		 * This would be a hardware bug
+		 */
+		BUG_ON((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS);
+
+		goto out;
+	}
+
+	/* EC=ESL=1 case */
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	if (cpu_has_feature(CPU_FTR_P9_TM_XER_SO_BUG)) {
+		local_paca->requested_psscr = psscr;
+		/* order setting requested_psscr vs testing dont_stop */
+		smp_mb();
+		if (atomic_read(&local_paca->dont_stop)) {
+			local_paca->requested_psscr = 0;
+			return 0;
+		}
+	}
+#endif
+
+	if (!cpu_has_feature(CPU_FTR_POWER9_DD2_1)) {
+		 /*
+		  * POWER9 DD2 can incorrectly set PMAO when waking up
+		  * after a state-loss idle. Saving and restoring MMCR0
+		  * over idle is a workaround.
+		  */
+		mmcr0		= mfspr(SPRN_MMCR0);
+	}
+
+	if ((psscr & PSSCR_RL_MASK) >= deep_spr_loss_state) {
+		sprs.lpcr	= mfspr(SPRN_LPCR);
+		sprs.hfscr	= mfspr(SPRN_HFSCR);
+		sprs.fscr	= mfspr(SPRN_FSCR);
+		sprs.pid	= mfspr(SPRN_PID);
+		sprs.purr	= mfspr(SPRN_PURR);
+		sprs.spurr	= mfspr(SPRN_SPURR);
+		sprs.dscr	= mfspr(SPRN_DSCR);
+		sprs.wort	= mfspr(SPRN_WORT);
+
+		sprs.mmcra	= mfspr(SPRN_MMCRA);
+		sprs.mmcr0	= mfspr(SPRN_MMCR0);
+		sprs.mmcr1	= mfspr(SPRN_MMCR1);
+		sprs.mmcr2	= mfspr(SPRN_MMCR2);
+
+		sprs.ptcr	= mfspr(SPRN_PTCR);
+		sprs.rpr	= mfspr(SPRN_RPR);
+		sprs.tscr	= mfspr(SPRN_TSCR);
+		if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+			sprs.ldbar = mfspr(SPRN_LDBAR);
+
+		sprs_saved = true;
+
+		atomic_start_thread_idle();
+	}
+
+	sprs.amr	= mfspr(SPRN_AMR);
+	sprs.iamr	= mfspr(SPRN_IAMR);
+	sprs.amor	= mfspr(SPRN_AMOR);
+	sprs.uamor	= mfspr(SPRN_UAMOR);
+
+	srr1 = isa300_idle_stop_mayloss(psscr);		/* go idle */
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	local_paca->requested_psscr = 0;
+#endif
+
+	psscr = mfspr(SPRN_PSSCR);
+
+	WARN_ON_ONCE(!srr1);
+	WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR));
+
+	if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) {
+		/*
+		 * We don't need an isync after the mtsprs here because the
+		 * upcoming mtmsrd is execution synchronizing.
+		 */
+		mtspr(SPRN_AMR,		sprs.amr);
+		mtspr(SPRN_IAMR,	sprs.iamr);
+		mtspr(SPRN_AMOR,	sprs.amor);
+		mtspr(SPRN_UAMOR,	sprs.uamor);
+
+		/*
+		 * Workaround for POWER9 DD2.0, if we lost resources, the ERAT
+		 * might have been corrupted and needs flushing. We also need
+		 * to reload MMCR0 (see mmcr0 comment above).
+		 */
+		if (!cpu_has_feature(CPU_FTR_POWER9_DD2_1)) {
+			asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT);
+			mtspr(SPRN_MMCR0, mmcr0);
+		}
+
+		/*
+		 * DD2.2 and earlier need to set then clear bit 60 in MMCRA
+		 * to ensure the PMU starts running.
+		 */
+		mmcra = mfspr(SPRN_MMCRA);
+		mmcra |= PPC_BIT(60);
+		mtspr(SPRN_MMCRA, mmcra);
+		mmcra &= ~PPC_BIT(60);
+		mtspr(SPRN_MMCRA, mmcra);
+	}
+
+	if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI))
+		hmi_exception_realmode(NULL);
+
+	/*
+	 * On POWER9, SRR1 bits do not match exactly as expected.
+	 * SRR1_WS_GPRLOSS (10b) can also result in SPR loss, so
+	 * just always test PSSCR for SPR/TB state loss.
+	 */
+	pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT;
+	if (likely(pls < deep_spr_loss_state)) {
+		if (sprs_saved)
+			atomic_stop_thread_idle();
+		goto out;
+	}
+
+	/* HV state loss */
+	BUG_ON(!sprs_saved);
+
+	atomic_lock_thread_idle();
+
+	if ((*state & core_thread_mask) != 0)
+		goto core_woken;
+
+	/* Per-core SPRs */
+	mtspr(SPRN_PTCR,	sprs.ptcr);
+	mtspr(SPRN_RPR,		sprs.rpr);
+	mtspr(SPRN_TSCR,	sprs.tscr);
+
+	if (pls >= pnv_first_tb_loss_level) {
+		/* TB loss */
+		if (opal_resync_timebase() != OPAL_SUCCESS)
+			BUG();
+	}
+
+	/*
+	 * isync after restoring shared SPRs and before unlocking. Unlock
+	 * only contains hwsync which does not necessarily do the right
+	 * thing for SPRs.
+	 */
+	isync();
+
+core_woken:
+	atomic_unlock_and_stop_thread_idle();
+
+	/* Per-thread SPRs */
+	mtspr(SPRN_LPCR,	sprs.lpcr);
+	mtspr(SPRN_HFSCR,	sprs.hfscr);
+	mtspr(SPRN_FSCR,	sprs.fscr);
+	mtspr(SPRN_PID,		sprs.pid);
+	mtspr(SPRN_PURR,	sprs.purr);
+	mtspr(SPRN_SPURR,	sprs.spurr);
+	mtspr(SPRN_DSCR,	sprs.dscr);
+	mtspr(SPRN_WORT,	sprs.wort);
+
+	mtspr(SPRN_MMCRA,	sprs.mmcra);
+	mtspr(SPRN_MMCR0,	sprs.mmcr0);
+	mtspr(SPRN_MMCR1,	sprs.mmcr1);
+	mtspr(SPRN_MMCR2,	sprs.mmcr2);
+	if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+		mtspr(SPRN_LDBAR, sprs.ldbar);
+
+	mtspr(SPRN_SPRG3,	local_paca->sprg_vdso);
+
+	if (!radix_enabled())
+		__slb_restore_bolted_realmode();
+
+out:
+	if (mmu_on)
+		mtmsr(MSR_KERNEL);
+
+	return srr1;
+}
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+/*
+ * This is used in working around bugs in thread reconfiguration
+ * on POWER9 (at least up to Nimbus DD2.2) relating to transactional
+ * memory and the way that XER[SO] is checkpointed.
+ * This function forces the core into SMT4 in order by asking
+ * all other threads not to stop, and sending a message to any
+ * that are in a stop state.
+ * Must be called with preemption disabled.
+ */
+void pnv_power9_force_smt4_catch(void)
+{
+	int cpu, cpu0, thr;
+	int awake_threads = 1;		/* this thread is awake */
+	int poke_threads = 0;
+	int need_awake = threads_per_core;
+
+	cpu = smp_processor_id();
+	cpu0 = cpu & ~(threads_per_core - 1);
+	for (thr = 0; thr < threads_per_core; ++thr) {
+		if (cpu != cpu0 + thr)
+			atomic_inc(&paca_ptrs[cpu0+thr]->dont_stop);
+	}
+	/* order setting dont_stop vs testing requested_psscr */
+	smp_mb();
+	for (thr = 0; thr < threads_per_core; ++thr) {
+		if (!paca_ptrs[cpu0+thr]->requested_psscr)
+			++awake_threads;
+		else
+			poke_threads |= (1 << thr);
+	}
+
+	/* If at least 3 threads are awake, the core is in SMT4 already */
+	if (awake_threads < need_awake) {
+		/* We have to wake some threads; we'll use msgsnd */
+		for (thr = 0; thr < threads_per_core; ++thr) {
+			if (poke_threads & (1 << thr)) {
+				ppc_msgsnd_sync();
+				ppc_msgsnd(PPC_DBELL_MSGTYPE, 0,
+					   paca_ptrs[cpu0+thr]->hw_cpu_id);
+			}
+		}
+		/* now spin until at least 3 threads are awake */
+		do {
+			for (thr = 0; thr < threads_per_core; ++thr) {
+				if ((poke_threads & (1 << thr)) &&
+				    !paca_ptrs[cpu0+thr]->requested_psscr) {
+					++awake_threads;
+					poke_threads &= ~(1 << thr);
+				}
+			}
+		} while (awake_threads < need_awake);
+	}
+}
+EXPORT_SYMBOL_GPL(pnv_power9_force_smt4_catch);
+
+void pnv_power9_force_smt4_release(void)
+{
+	int cpu, cpu0, thr;
+
+	cpu = smp_processor_id();
+	cpu0 = cpu & ~(threads_per_core - 1);
+
+	/* clear all the dont_stop flags */
+	for (thr = 0; thr < threads_per_core; ++thr) {
+		if (cpu != cpu0 + thr)
+			atomic_dec(&paca_ptrs[cpu0+thr]->dont_stop);
+	}
+}
+EXPORT_SYMBOL_GPL(pnv_power9_force_smt4_release);
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
+
+struct p10_sprs {
+	/*
+	 * SPRs that get lost in shallow states:
+	 *
+	 * P10 loses CR, LR, CTR, FPSCR, VSCR, XER, TAR, SPRG2, and HSPRG1
+	 * isa300 idle routines restore CR, LR.
+	 * CTR is volatile
+	 * idle thread doesn't use FP or VEC
+	 * kernel doesn't use TAR
+	 * HSPRG1 is only live in HV interrupt entry
+	 * SPRG2 is only live in KVM guests, KVM handles it.
+	 */
+};
+
+static unsigned long power10_idle_stop(unsigned long psscr, bool mmu_on)
+{
+	int cpu = raw_smp_processor_id();
+	int first = cpu_first_thread_sibling(cpu);
+	unsigned long *state = &paca_ptrs[first]->idle_state;
+	unsigned long core_thread_mask = (1UL << threads_per_core) - 1;
+	unsigned long srr1;
+	unsigned long pls;
+//	struct p10_sprs sprs = {}; /* avoid false used-uninitialised */
+	bool sprs_saved = false;
+
+	if (!(psscr & (PSSCR_EC|PSSCR_ESL))) {
+		/* EC=ESL=0 case */
+
+		BUG_ON(!mmu_on);
+
+		/*
+		 * Wake synchronously. SRESET via xscom may still cause
+		 * a 0x100 powersave wakeup with SRR1 reason!
+		 */
+		srr1 = isa300_idle_stop_noloss(psscr);		/* go idle */
+		if (likely(!srr1))
+			return 0;
+
+		/*
+		 * Registers not saved, can't recover!
+		 * This would be a hardware bug
+		 */
+		BUG_ON((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS);
+
+		goto out;
+	}
+
+	/* EC=ESL=1 case */
+	if ((psscr & PSSCR_RL_MASK) >= deep_spr_loss_state) {
+		/* XXX: save SPRs for deep state loss here. */
+
+		sprs_saved = true;
+
+		atomic_start_thread_idle();
+	}
+
+	srr1 = isa300_idle_stop_mayloss(psscr);		/* go idle */
+
+	psscr = mfspr(SPRN_PSSCR);
+
+	WARN_ON_ONCE(!srr1);
+	WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR));
+
+	if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI))
+		hmi_exception_realmode(NULL);
+
+	/*
+	 * On POWER10, SRR1 bits do not match exactly as expected.
+	 * SRR1_WS_GPRLOSS (10b) can also result in SPR loss, so
+	 * just always test PSSCR for SPR/TB state loss.
+	 */
+	pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT;
+	if (likely(pls < deep_spr_loss_state)) {
+		if (sprs_saved)
+			atomic_stop_thread_idle();
+		goto out;
+	}
+
+	/* HV state loss */
+	BUG_ON(!sprs_saved);
+
+	atomic_lock_thread_idle();
+
+	if ((*state & core_thread_mask) != 0)
+		goto core_woken;
+
+	/* XXX: restore per-core SPRs here */
+
+	if (pls >= pnv_first_tb_loss_level) {
+		/* TB loss */
+		if (opal_resync_timebase() != OPAL_SUCCESS)
+			BUG();
+	}
+
+	/*
+	 * isync after restoring shared SPRs and before unlocking. Unlock
+	 * only contains hwsync which does not necessarily do the right
+	 * thing for SPRs.
+	 */
+	isync();
+
+core_woken:
+	atomic_unlock_and_stop_thread_idle();
+
+	/* XXX: restore per-thread SPRs here */
+
+	if (!radix_enabled())
+		__slb_restore_bolted_realmode();
+
+out:
+	if (mmu_on)
+		mtmsr(MSR_KERNEL);
+
+	return srr1;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static unsigned long arch300_offline_stop(unsigned long psscr)
+{
+	unsigned long srr1;
+
+#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	__ppc64_runlatch_off();
+	if (cpu_has_feature(CPU_FTR_ARCH_31))
+		srr1 = power10_idle_stop(psscr, true);
+	else
+		srr1 = power9_idle_stop(psscr, true);
+	__ppc64_runlatch_on();
+#else
+	/*
+	 * Tell KVM we're entering idle.
+	 * This does not have to be done in real mode because the P9 MMU
+	 * is independent per-thread. Some steppings share radix/hash mode
+	 * between threads, but in that case KVM has a barrier sync in real
+	 * mode before and after switching between radix and hash.
+	 *
+	 * kvm_start_guest must still be called in real mode though, hence
+	 * the false argument.
+	 */
+	local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE;
+
+	__ppc64_runlatch_off();
+	if (cpu_has_feature(CPU_FTR_ARCH_31))
+		srr1 = power10_idle_stop(psscr, false);
+	else
+		srr1 = power9_idle_stop(psscr, false);
+	__ppc64_runlatch_on();
+
+	local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL;
+	/* Order setting hwthread_state vs. testing hwthread_req */
+	smp_mb();
+	if (local_paca->kvm_hstate.hwthread_req)
+		srr1 = idle_kvm_start_guest(srr1);
+	mtmsr(MSR_KERNEL);
+#endif
+
+	return srr1;
+}
+#endif
+
+void arch300_idle_type(unsigned long stop_psscr_val,
+				      unsigned long stop_psscr_mask)
+{
+	unsigned long psscr;
+	unsigned long srr1;
+
+	if (!prep_irq_for_idle_irqsoff())
+		return;
+
+	psscr = mfspr(SPRN_PSSCR);
+	psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;
+
+	__ppc64_runlatch_off();
+	if (cpu_has_feature(CPU_FTR_ARCH_31))
+		srr1 = power10_idle_stop(psscr, true);
+	else
+		srr1 = power9_idle_stop(psscr, true);
+	__ppc64_runlatch_on();
+
+	fini_irq_for_idle_irqsoff();
+
+	irq_set_pending_from_srr1(srr1);
+}
+
+/*
+ * Used for ppc_md.power_save which needs a function with no parameters
+ */
+static void arch300_idle(void)
+{
+	arch300_idle_type(pnv_default_stop_val, pnv_default_stop_mask);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val)
+{
+	u64 pir = get_hard_smp_processor_id(cpu);
+
+	mtspr(SPRN_LPCR, lpcr_val);
+
+	/*
+	 * Program the LPCR via stop-api only if the deepest stop state
+	 * can lose hypervisor context.
+	 */
+	if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT)
+		opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val);
+}
+
+/*
+ * pnv_cpu_offline: A function that puts the CPU into the deepest
+ * available platform idle state on a CPU-Offline.
+ * interrupts hard disabled and no lazy irq pending.
+ */
+unsigned long pnv_cpu_offline(unsigned int cpu)
+{
+	unsigned long srr1;
+
+	__ppc64_runlatch_off();
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) {
+		unsigned long psscr;
+
+		psscr = mfspr(SPRN_PSSCR);
+		psscr = (psscr & ~pnv_deepest_stop_psscr_mask) |
+						pnv_deepest_stop_psscr_val;
+		srr1 = arch300_offline_stop(psscr);
+	} else if (cpu_has_feature(CPU_FTR_ARCH_206) && power7_offline_type) {
+		srr1 = power7_offline();
+	} else {
+		/* This is the fallback method. We emulate snooze */
+		while (!generic_check_cpu_restart(cpu)) {
+			HMT_low();
+			HMT_very_low();
+		}
+		srr1 = 0;
+		HMT_medium();
+	}
+
+	__ppc64_runlatch_on();
+
+	return srr1;
+}
+#endif
+
+/*
+ * Power ISA 3.0 idle initialization.
+ *
+ * POWER ISA 3.0 defines a new SPR Processor stop Status and Control
+ * Register (PSSCR) to control idle behavior.
+ *
+ * PSSCR layout:
+ * ----------------------------------------------------------
+ * | PLS | /// | SD | ESL | EC | PSLL | /// | TR | MTL | RL |
+ * ----------------------------------------------------------
+ * 0      4     41   42    43   44     48    54   56    60
+ *
+ * PSSCR key fields:
+ *	Bits 0:3  - Power-Saving Level Status (PLS). This field indicates the
+ *	lowest power-saving state the thread entered since stop instruction was
+ *	last executed.
+ *
+ *	Bit 41 - Status Disable(SD)
+ *	0 - Shows PLS entries
+ *	1 - PLS entries are all 0
+ *
+ *	Bit 42 - Enable State Loss
+ *	0 - No state is lost irrespective of other fields
+ *	1 - Allows state loss
+ *
+ *	Bit 43 - Exit Criterion
+ *	0 - Exit from power-save mode on any interrupt
+ *	1 - Exit from power-save mode controlled by LPCR's PECE bits
+ *
+ *	Bits 44:47 - Power-Saving Level Limit
+ *	This limits the power-saving level that can be entered into.
+ *
+ *	Bits 60:63 - Requested Level
+ *	Used to specify which power-saving level must be entered on executing
+ *	stop instruction
+ */
+
+int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags)
+{
+	int err = 0;
+
+	/*
+	 * psscr_mask == 0xf indicates an older firmware.
+	 * Set remaining fields of psscr to the default values.
+	 * See NOTE above definition of PSSCR_HV_DEFAULT_VAL
+	 */
+	if (*psscr_mask == 0xf) {
+		*psscr_val = *psscr_val | PSSCR_HV_DEFAULT_VAL;
+		*psscr_mask = PSSCR_HV_DEFAULT_MASK;
+		return err;
+	}
+
+	/*
+	 * New firmware is expected to set the psscr_val bits correctly.
+	 * Validate that the following invariants are correctly maintained by
+	 * the new firmware.
+	 * - ESL bit value matches the EC bit value.
+	 * - ESL bit is set for all the deep stop states.
+	 */
+	if (GET_PSSCR_ESL(*psscr_val) != GET_PSSCR_EC(*psscr_val)) {
+		err = ERR_EC_ESL_MISMATCH;
+	} else if ((flags & OPAL_PM_LOSE_FULL_CONTEXT) &&
+		GET_PSSCR_ESL(*psscr_val) == 0) {
+		err = ERR_DEEP_STATE_ESL_MISMATCH;
+	}
+
+	return err;
+}
+
+/*
+ * pnv_arch300_idle_init: Initializes the default idle state, first
+ *                        deep idle state and deepest idle state on
+ *                        ISA 3.0 CPUs.
+ *
+ * @np: /ibm,opal/power-mgt device node
+ * @flags: cpu-idle-state-flags array
+ * @dt_idle_states: Number of idle state entries
+ * Returns 0 on success
+ */
+static void __init pnv_arch300_idle_init(void)
+{
+	u64 max_residency_ns = 0;
+	int i;
+
+	/* stop is not really architected, we only have p9,p10 drivers */
+	if (!pvr_version_is(PVR_POWER10) && !pvr_version_is(PVR_POWER9))
+		return;
+
+	/*
+	 * pnv_deepest_stop_{val,mask} should be set to values corresponding to
+	 * the deepest stop state.
+	 *
+	 * pnv_default_stop_{val,mask} should be set to values corresponding to
+	 * the deepest loss-less (OPAL_PM_STOP_INST_FAST) stop state.
+	 */
+	pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
+	deep_spr_loss_state = MAX_STOP_STATE + 1;
+	for (i = 0; i < nr_pnv_idle_states; i++) {
+		int err;
+		struct pnv_idle_states_t *state = &pnv_idle_states[i];
+		u64 psscr_rl = state->psscr_val & PSSCR_RL_MASK;
+
+		/* No deep loss driver implemented for POWER10 yet */
+		if (pvr_version_is(PVR_POWER10) &&
+				state->flags & (OPAL_PM_TIMEBASE_STOP|OPAL_PM_LOSE_FULL_CONTEXT))
+			continue;
+
+		if ((state->flags & OPAL_PM_TIMEBASE_STOP) &&
+		     (pnv_first_tb_loss_level > psscr_rl))
+			pnv_first_tb_loss_level = psscr_rl;
+
+		if ((state->flags & OPAL_PM_LOSE_FULL_CONTEXT) &&
+		     (deep_spr_loss_state > psscr_rl))
+			deep_spr_loss_state = psscr_rl;
+
+		/*
+		 * The idle code does not deal with TB loss occurring
+		 * in a shallower state than SPR loss, so force it to
+		 * behave like SPRs are lost if TB is lost. POWER9 would
+		 * never encouter this, but a POWER8 core would if it
+		 * implemented the stop instruction. So this is for forward
+		 * compatibility.
+		 */
+		if ((state->flags & OPAL_PM_TIMEBASE_STOP) &&
+		     (deep_spr_loss_state > psscr_rl))
+			deep_spr_loss_state = psscr_rl;
+
+		err = validate_psscr_val_mask(&state->psscr_val,
+					      &state->psscr_mask,
+					      state->flags);
+		if (err) {
+			report_invalid_psscr_val(state->psscr_val, err);
+			continue;
+		}
+
+		state->valid = true;
+
+		if (max_residency_ns < state->residency_ns) {
+			max_residency_ns = state->residency_ns;
+			pnv_deepest_stop_psscr_val = state->psscr_val;
+			pnv_deepest_stop_psscr_mask = state->psscr_mask;
+			pnv_deepest_stop_flag = state->flags;
+			deepest_stop_found = true;
+		}
+
+		if (!default_stop_found &&
+		    (state->flags & OPAL_PM_STOP_INST_FAST)) {
+			pnv_default_stop_val = state->psscr_val;
+			pnv_default_stop_mask = state->psscr_mask;
+			default_stop_found = true;
+			WARN_ON(state->flags & OPAL_PM_LOSE_FULL_CONTEXT);
+		}
+	}
+
+	if (unlikely(!default_stop_found)) {
+		pr_warn("cpuidle-powernv: No suitable default stop state found. Disabling platform idle.\n");
+	} else {
+		ppc_md.power_save = arch300_idle;
+		pr_info("cpuidle-powernv: Default stop: psscr = 0x%016llx,mask=0x%016llx\n",
+			pnv_default_stop_val, pnv_default_stop_mask);
+	}
+
+	if (unlikely(!deepest_stop_found)) {
+		pr_warn("cpuidle-powernv: No suitable stop state for CPU-Hotplug. Offlined CPUs will busy wait");
+	} else {
+		pr_info("cpuidle-powernv: Deepest stop: psscr = 0x%016llx,mask=0x%016llx\n",
+			pnv_deepest_stop_psscr_val,
+			pnv_deepest_stop_psscr_mask);
+	}
+
+	pr_info("cpuidle-powernv: First stop level that may lose SPRs = 0x%llx\n",
+		deep_spr_loss_state);
+
+	pr_info("cpuidle-powernv: First stop level that may lose timebase = 0x%llx\n",
+		pnv_first_tb_loss_level);
+}
+
+static void __init pnv_disable_deep_states(void)
+{
+	/*
+	 * The stop-api is unable to restore hypervisor
+	 * resources on wakeup from platform idle states which
+	 * lose full context. So disable such states.
+	 */
+	supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT;
+	pr_warn("cpuidle-powernv: Disabling idle states that lose full context\n");
+	pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n");
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+	    (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) {
+		/*
+		 * Use the default stop state for CPU-Hotplug
+		 * if available.
+		 */
+		if (default_stop_found) {
+			pnv_deepest_stop_psscr_val = pnv_default_stop_val;
+			pnv_deepest_stop_psscr_mask = pnv_default_stop_mask;
+			pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n",
+				pnv_deepest_stop_psscr_val);
+		} else { /* Fallback to snooze loop for CPU-Hotplug */
+			deepest_stop_found = false;
+			pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n");
+		}
+	}
+}
+
+/*
+ * Probe device tree for supported idle states
+ */
+static void __init pnv_probe_idle_states(void)
+{
+	int i;
+
+	if (nr_pnv_idle_states < 0) {
+		pr_warn("cpuidle-powernv: no idle states found in the DT\n");
+		return;
+	}
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		pnv_arch300_idle_init();
+
+	for (i = 0; i < nr_pnv_idle_states; i++)
+		supported_cpuidle_states |= pnv_idle_states[i].flags;
+}
+
+/*
+ * This function parses device-tree and populates all the information
+ * into pnv_idle_states structure. It also sets up nr_pnv_idle_states
+ * which is the number of cpuidle states discovered through device-tree.
+ */
+
+static int pnv_parse_cpuidle_dt(void)
+{
+	struct device_node *np;
+	int nr_idle_states, i;
+	int rc = 0;
+	u32 *temp_u32;
+	u64 *temp_u64;
+	const char **temp_string;
+
+	np = of_find_node_by_path("/ibm,opal/power-mgt");
+	if (!np) {
+		pr_warn("opal: PowerMgmt Node not found\n");
+		return -ENODEV;
+	}
+	nr_idle_states = of_property_count_u32_elems(np,
+						"ibm,cpu-idle-state-flags");
+
+	pnv_idle_states = kcalloc(nr_idle_states, sizeof(*pnv_idle_states),
+				  GFP_KERNEL);
+	temp_u32 = kcalloc(nr_idle_states, sizeof(u32),  GFP_KERNEL);
+	temp_u64 = kcalloc(nr_idle_states, sizeof(u64),  GFP_KERNEL);
+	temp_string = kcalloc(nr_idle_states, sizeof(char *),  GFP_KERNEL);
+
+	if (!(pnv_idle_states && temp_u32 && temp_u64 && temp_string)) {
+		pr_err("Could not allocate memory for dt parsing\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* Read flags */
+	if (of_property_read_u32_array(np, "ibm,cpu-idle-state-flags",
+				       temp_u32, nr_idle_states)) {
+		pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-flags in DT\n");
+		rc = -EINVAL;
+		goto out;
+	}
+	for (i = 0; i < nr_idle_states; i++)
+		pnv_idle_states[i].flags = temp_u32[i];
+
+	/* Read latencies */
+	if (of_property_read_u32_array(np, "ibm,cpu-idle-state-latencies-ns",
+				       temp_u32, nr_idle_states)) {
+		pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-latencies-ns in DT\n");
+		rc = -EINVAL;
+		goto out;
+	}
+	for (i = 0; i < nr_idle_states; i++)
+		pnv_idle_states[i].latency_ns = temp_u32[i];
+
+	/* Read residencies */
+	if (of_property_read_u32_array(np, "ibm,cpu-idle-state-residency-ns",
+				       temp_u32, nr_idle_states)) {
+		pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-residency-ns in DT\n");
+		rc = -EINVAL;
+		goto out;
+	}
+	for (i = 0; i < nr_idle_states; i++)
+		pnv_idle_states[i].residency_ns = temp_u32[i];
+
+	/* For power9 and later */
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		/* Read pm_crtl_val */
+		if (of_property_read_u64_array(np, "ibm,cpu-idle-state-psscr",
+					       temp_u64, nr_idle_states)) {
+			pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr in DT\n");
+			rc = -EINVAL;
+			goto out;
+		}
+		for (i = 0; i < nr_idle_states; i++)
+			pnv_idle_states[i].psscr_val = temp_u64[i];
+
+		/* Read pm_crtl_mask */
+		if (of_property_read_u64_array(np, "ibm,cpu-idle-state-psscr-mask",
+					       temp_u64, nr_idle_states)) {
+			pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr-mask in DT\n");
+			rc = -EINVAL;
+			goto out;
+		}
+		for (i = 0; i < nr_idle_states; i++)
+			pnv_idle_states[i].psscr_mask = temp_u64[i];
+	}
+
+	/*
+	 * power8 specific properties ibm,cpu-idle-state-pmicr-mask and
+	 * ibm,cpu-idle-state-pmicr-val were never used and there is no
+	 * plan to use it in near future. Hence, not parsing these properties
+	 */
+
+	if (of_property_read_string_array(np, "ibm,cpu-idle-state-names",
+					  temp_string, nr_idle_states) < 0) {
+		pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-names in DT\n");
+		rc = -EINVAL;
+		goto out;
+	}
+	for (i = 0; i < nr_idle_states; i++)
+		strlcpy(pnv_idle_states[i].name, temp_string[i],
+			PNV_IDLE_NAME_LEN);
+	nr_pnv_idle_states = nr_idle_states;
+	rc = 0;
+out:
+	kfree(temp_u32);
+	kfree(temp_u64);
+	kfree(temp_string);
+	return rc;
+}
+
+static int __init pnv_init_idle_states(void)
+{
+	int cpu;
+	int rc = 0;
+
+	/* Set up PACA fields */
+	for_each_present_cpu(cpu) {
+		struct paca_struct *p = paca_ptrs[cpu];
+
+		p->idle_state = 0;
+		if (cpu == cpu_first_thread_sibling(cpu))
+			p->idle_state = (1 << threads_per_core) - 1;
+
+		if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+			/* P7/P8 nap */
+			p->thread_idle_state = PNV_THREAD_RUNNING;
+		} else if (pvr_version_is(PVR_POWER9)) {
+			/* P9 stop workarounds */
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+			p->requested_psscr = 0;
+			atomic_set(&p->dont_stop, 0);
+#endif
+		}
+	}
+
+	/* In case we error out nr_pnv_idle_states will be zero */
+	nr_pnv_idle_states = 0;
+	supported_cpuidle_states = 0;
+
+	if (cpuidle_disable != IDLE_NO_OVERRIDE)
+		goto out;
+	rc = pnv_parse_cpuidle_dt();
+	if (rc)
+		return rc;
+	pnv_probe_idle_states();
+
+	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+		if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
+			power7_fastsleep_workaround_entry = false;
+			power7_fastsleep_workaround_exit = false;
+		} else {
+			/*
+			 * OPAL_PM_SLEEP_ENABLED_ER1 is set. It indicates that
+			 * workaround is needed to use fastsleep. Provide sysfs
+			 * control to choose how this workaround has to be
+			 * applied.
+			 */
+			device_create_file(cpu_subsys.dev_root,
+				&dev_attr_fastsleep_workaround_applyonce);
+		}
+
+		update_subcore_sibling_mask();
+
+		if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED) {
+			ppc_md.power_save = power7_idle;
+			power7_offline_type = PNV_THREAD_NAP;
+		}
+
+		if ((supported_cpuidle_states & OPAL_PM_WINKLE_ENABLED) &&
+			   (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT))
+			power7_offline_type = PNV_THREAD_WINKLE;
+		else if ((supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED) ||
+			   (supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1))
+			power7_offline_type = PNV_THREAD_SLEEP;
+	}
+
+	if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) {
+		if (pnv_save_sprs_for_deep_states())
+			pnv_disable_deep_states();
+	}
+
+out:
+	return 0;
+}
+machine_subsys_initcall(powernv, pnv_init_idle_states);
diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
new file mode 100644
index 000000000..0e42fe2d7
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -0,0 +1,342 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) IBM Corporation, 2014, 2017
+ * Anton Blanchard, Rashmica Gupta.
+ */
+
+#define pr_fmt(fmt) "memtrace: " fmt
+
+#include <linux/bitops.h>
+#include <linux/string.h>
+#include <linux/memblock.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+#include <linux/numa.h>
+#include <asm/machdep.h>
+#include <asm/debugfs.h>
+
+/* This enables us to keep track of the memory removed from each node. */
+struct memtrace_entry {
+	void *mem;
+	u64 start;
+	u64 size;
+	u32 nid;
+	struct dentry *dir;
+	char name[16];
+};
+
+static DEFINE_MUTEX(memtrace_mutex);
+static u64 memtrace_size;
+
+static struct memtrace_entry *memtrace_array;
+static unsigned int memtrace_array_nr;
+
+
+static ssize_t memtrace_read(struct file *filp, char __user *ubuf,
+			     size_t count, loff_t *ppos)
+{
+	struct memtrace_entry *ent = filp->private_data;
+
+	return simple_read_from_buffer(ubuf, count, ppos, ent->mem, ent->size);
+}
+
+static const struct file_operations memtrace_fops = {
+	.llseek = default_llseek,
+	.read	= memtrace_read,
+	.open	= simple_open,
+};
+
+static int check_memblock_online(struct memory_block *mem, void *arg)
+{
+	if (mem->state != MEM_ONLINE)
+		return -1;
+
+	return 0;
+}
+
+static int change_memblock_state(struct memory_block *mem, void *arg)
+{
+	unsigned long state = (unsigned long)arg;
+
+	mem->state = state;
+
+	return 0;
+}
+
+static void memtrace_clear_range(unsigned long start_pfn,
+				 unsigned long nr_pages)
+{
+	unsigned long pfn;
+
+	/*
+	 * As pages are offline, we cannot trust the memmap anymore. As HIGHMEM
+	 * does not apply, avoid passing around "struct page" and use
+	 * clear_page() instead directly.
+	 */
+	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
+		if (IS_ALIGNED(pfn, PAGES_PER_SECTION))
+			cond_resched();
+		clear_page(__va(PFN_PHYS(pfn)));
+	}
+}
+
+/* called with device_hotplug_lock held */
+static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
+{
+	const unsigned long start = PFN_PHYS(start_pfn);
+	const unsigned long size = PFN_PHYS(nr_pages);
+
+	if (walk_memory_blocks(start, size, NULL, check_memblock_online))
+		return false;
+
+	walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
+			   change_memblock_state);
+
+	if (offline_pages(start_pfn, nr_pages)) {
+		walk_memory_blocks(start, size, (void *)MEM_ONLINE,
+				   change_memblock_state);
+		return false;
+	}
+
+	walk_memory_blocks(start, size, (void *)MEM_OFFLINE,
+			   change_memblock_state);
+
+
+	return true;
+}
+
+static u64 memtrace_alloc_node(u32 nid, u64 size)
+{
+	u64 start_pfn, end_pfn, nr_pages, pfn;
+	u64 base_pfn;
+	u64 bytes = memory_block_size_bytes();
+
+	if (!node_spanned_pages(nid))
+		return 0;
+
+	start_pfn = node_start_pfn(nid);
+	end_pfn = node_end_pfn(nid);
+	nr_pages = size >> PAGE_SHIFT;
+
+	/* Trace memory needs to be aligned to the size */
+	end_pfn = round_down(end_pfn - nr_pages, nr_pages);
+
+	lock_device_hotplug();
+	for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) {
+		if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) {
+			/*
+			 * Clear the range while we still have a linear
+			 * mapping.
+			 */
+			memtrace_clear_range(base_pfn, nr_pages);
+			/*
+			 * Remove memory in memory block size chunks so that
+			 * iomem resources are always split to the same size and
+			 * we never try to remove memory that spans two iomem
+			 * resources.
+			 */
+			end_pfn = base_pfn + nr_pages;
+			for (pfn = base_pfn; pfn < end_pfn; pfn += bytes>> PAGE_SHIFT) {
+				__remove_memory(nid, pfn << PAGE_SHIFT, bytes);
+			}
+			unlock_device_hotplug();
+			return base_pfn << PAGE_SHIFT;
+		}
+	}
+	unlock_device_hotplug();
+
+	return 0;
+}
+
+static int memtrace_init_regions_runtime(u64 size)
+{
+	u32 nid;
+	u64 m;
+
+	memtrace_array = kcalloc(num_online_nodes(),
+				sizeof(struct memtrace_entry), GFP_KERNEL);
+	if (!memtrace_array) {
+		pr_err("Failed to allocate memtrace_array\n");
+		return -EINVAL;
+	}
+
+	for_each_online_node(nid) {
+		m = memtrace_alloc_node(nid, size);
+
+		/*
+		 * A node might not have any local memory, so warn but
+		 * continue on.
+		 */
+		if (!m) {
+			pr_err("Failed to allocate trace memory on node %d\n", nid);
+			continue;
+		}
+
+		pr_info("Allocated trace memory on node %d at 0x%016llx\n", nid, m);
+
+		memtrace_array[memtrace_array_nr].start = m;
+		memtrace_array[memtrace_array_nr].size = size;
+		memtrace_array[memtrace_array_nr].nid = nid;
+		memtrace_array_nr++;
+	}
+
+	return 0;
+}
+
+static struct dentry *memtrace_debugfs_dir;
+
+static int memtrace_init_debugfs(void)
+{
+	int ret = 0;
+	int i;
+
+	for (i = 0; i < memtrace_array_nr; i++) {
+		struct dentry *dir;
+		struct memtrace_entry *ent = &memtrace_array[i];
+
+		ent->mem = ioremap(ent->start, ent->size);
+		/* Warn but continue on */
+		if (!ent->mem) {
+			pr_err("Failed to map trace memory at 0x%llx\n",
+				 ent->start);
+			ret = -1;
+			continue;
+		}
+
+		snprintf(ent->name, 16, "%08x", ent->nid);
+		dir = debugfs_create_dir(ent->name, memtrace_debugfs_dir);
+
+		ent->dir = dir;
+		debugfs_create_file("trace", 0400, dir, ent, &memtrace_fops);
+		debugfs_create_x64("start", 0400, dir, &ent->start);
+		debugfs_create_x64("size", 0400, dir, &ent->size);
+	}
+
+	return ret;
+}
+
+static int online_mem_block(struct memory_block *mem, void *arg)
+{
+	return device_online(&mem->dev);
+}
+
+/*
+ * Iterate through the chunks of memory we have removed from the kernel
+ * and attempt to add them back to the kernel.
+ */
+static int memtrace_online(void)
+{
+	int i, ret = 0;
+	struct memtrace_entry *ent;
+
+	for (i = memtrace_array_nr - 1; i >= 0; i--) {
+		ent = &memtrace_array[i];
+
+		/* We have onlined this chunk previously */
+		if (ent->nid == NUMA_NO_NODE)
+			continue;
+
+		/* Remove from io mappings */
+		if (ent->mem) {
+			iounmap(ent->mem);
+			ent->mem = 0;
+		}
+
+		if (add_memory(ent->nid, ent->start, ent->size, MHP_NONE)) {
+			pr_err("Failed to add trace memory to node %d\n",
+				ent->nid);
+			ret += 1;
+			continue;
+		}
+
+		lock_device_hotplug();
+		walk_memory_blocks(ent->start, ent->size, NULL,
+				   online_mem_block);
+		unlock_device_hotplug();
+
+		/*
+		 * Memory was added successfully so clean up references to it
+		 * so on reentry we can tell that this chunk was added.
+		 */
+		debugfs_remove_recursive(ent->dir);
+		pr_info("Added trace memory back to node %d\n", ent->nid);
+		ent->size = ent->start = ent->nid = NUMA_NO_NODE;
+	}
+	if (ret)
+		return ret;
+
+	/* If all chunks of memory were added successfully, reset globals */
+	kfree(memtrace_array);
+	memtrace_array = NULL;
+	memtrace_size = 0;
+	memtrace_array_nr = 0;
+	return 0;
+}
+
+static int memtrace_enable_set(void *data, u64 val)
+{
+	int rc = -EAGAIN;
+	u64 bytes;
+
+	/*
+	 * Don't attempt to do anything if size isn't aligned to a memory
+	 * block or equal to zero.
+	 */
+	bytes = memory_block_size_bytes();
+	if (val & (bytes - 1)) {
+		pr_err("Value must be aligned with 0x%llx\n", bytes);
+		return -EINVAL;
+	}
+
+	mutex_lock(&memtrace_mutex);
+
+	/* Re-add/online previously removed/offlined memory */
+	if (memtrace_size) {
+		if (memtrace_online())
+			goto out_unlock;
+	}
+
+	if (!val) {
+		rc = 0;
+		goto out_unlock;
+	}
+
+	/* Offline and remove memory */
+	if (memtrace_init_regions_runtime(val))
+		goto out_unlock;
+
+	if (memtrace_init_debugfs())
+		goto out_unlock;
+
+	memtrace_size = val;
+	rc = 0;
+out_unlock:
+	mutex_unlock(&memtrace_mutex);
+	return rc;
+}
+
+static int memtrace_enable_get(void *data, u64 *val)
+{
+	*val = memtrace_size;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(memtrace_init_fops, memtrace_enable_get,
+					memtrace_enable_set, "0x%016llx\n");
+
+static int memtrace_init(void)
+{
+	memtrace_debugfs_dir = debugfs_create_dir("memtrace",
+						  powerpc_debugfs_root);
+
+	debugfs_create_file("enable", 0600, memtrace_debugfs_dir,
+			    NULL, &memtrace_init_fops);
+
+	return 0;
+}
+machine_device_initcall(powernv, memtrace_init);
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
new file mode 100644
index 000000000..b711dc326
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -0,0 +1,705 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This file implements the DMA operations for NVLink devices. The NPU
+ * devices all point to the same iommu table as the parent PCI device.
+ *
+ * Copyright Alistair Popple, IBM Corporation 2015.
+ */
+
+#include <linux/mmu_notifier.h>
+#include <linux/mmu_context.h>
+#include <linux/of.h>
+#include <linux/pci.h>
+#include <linux/memblock.h>
+#include <linux/sizes.h>
+
+#include <asm/debugfs.h>
+#include <asm/powernv.h>
+#include <asm/ppc-pci.h>
+#include <asm/opal.h>
+
+#include "pci.h"
+
+static struct pci_dev *get_pci_dev(struct device_node *dn)
+{
+	struct pci_dn *pdn = PCI_DN(dn);
+	struct pci_dev *pdev;
+
+	pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
+					   pdn->busno, pdn->devfn);
+
+	/*
+	 * pci_get_domain_bus_and_slot() increased the reference count of
+	 * the PCI device, but callers don't need that actually as the PE
+	 * already holds a reference to the device. Since callers aren't
+	 * aware of the reference count change, call pci_dev_put() now to
+	 * avoid leaks.
+	 */
+	if (pdev)
+		pci_dev_put(pdev);
+
+	return pdev;
+}
+
+/* Given a NPU device get the associated PCI device. */
+struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev)
+{
+	struct device_node *dn;
+	struct pci_dev *gpdev;
+
+	if (WARN_ON(!npdev))
+		return NULL;
+
+	if (WARN_ON(!npdev->dev.of_node))
+		return NULL;
+
+	/* Get assoicated PCI device */
+	dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0);
+	if (!dn)
+		return NULL;
+
+	gpdev = get_pci_dev(dn);
+	of_node_put(dn);
+
+	return gpdev;
+}
+EXPORT_SYMBOL(pnv_pci_get_gpu_dev);
+
+/* Given the real PCI device get a linked NPU device. */
+struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
+{
+	struct device_node *dn;
+	struct pci_dev *npdev;
+
+	if (WARN_ON(!gpdev))
+		return NULL;
+
+	/* Not all PCI devices have device-tree nodes */
+	if (!gpdev->dev.of_node)
+		return NULL;
+
+	/* Get assoicated PCI device */
+	dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index);
+	if (!dn)
+		return NULL;
+
+	npdev = get_pci_dev(dn);
+	of_node_put(dn);
+
+	return npdev;
+}
+EXPORT_SYMBOL(pnv_pci_get_npu_dev);
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * Returns the PE assoicated with the PCI device of the given
+ * NPU. Returns the linked pci device if pci_dev != NULL.
+ */
+static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
+						  struct pci_dev **gpdev)
+{
+	struct pnv_phb *phb;
+	struct pci_controller *hose;
+	struct pci_dev *pdev;
+	struct pnv_ioda_pe *pe;
+	struct pci_dn *pdn;
+
+	pdev = pnv_pci_get_gpu_dev(npe->pdev);
+	if (!pdev)
+		return NULL;
+
+	pdn = pci_get_pdn(pdev);
+	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+		return NULL;
+
+	hose = pci_bus_to_host(pdev->bus);
+	phb = hose->private_data;
+	pe = &phb->ioda.pe_array[pdn->pe_number];
+
+	if (gpdev)
+		*gpdev = pdev;
+
+	return pe;
+}
+
+static long pnv_npu_unset_window(struct iommu_table_group *table_group,
+		int num);
+
+static long pnv_npu_set_window(struct iommu_table_group *table_group, int num,
+		struct iommu_table *tbl)
+{
+	struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
+	struct pnv_phb *phb = npe->phb;
+	int64_t rc;
+	const unsigned long size = tbl->it_indirect_levels ?
+		tbl->it_level_size : tbl->it_size;
+	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
+	const __u64 win_size = tbl->it_size << tbl->it_page_shift;
+	int num2 = (num == 0) ? 1 : 0;
+
+	/* NPU has just one TVE so if there is another table, remove it first */
+	if (npe->table_group.tables[num2])
+		pnv_npu_unset_window(&npe->table_group, num2);
+
+	pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
+			start_addr, start_addr + win_size - 1,
+			IOMMU_PAGE_SIZE(tbl));
+
+	rc = opal_pci_map_pe_dma_window(phb->opal_id,
+			npe->pe_number,
+			npe->pe_number,
+			tbl->it_indirect_levels + 1,
+			__pa(tbl->it_base),
+			size << 3,
+			IOMMU_PAGE_SIZE(tbl));
+	if (rc) {
+		pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
+		return rc;
+	}
+	pnv_pci_ioda2_tce_invalidate_entire(phb, false);
+
+	/* Add the table to the list so its TCE cache will get invalidated */
+	pnv_pci_link_table_and_group(phb->hose->node, num,
+			tbl, &npe->table_group);
+
+	return 0;
+}
+
+static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num)
+{
+	struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
+	struct pnv_phb *phb = npe->phb;
+	int64_t rc;
+
+	if (!npe->table_group.tables[num])
+		return 0;
+
+	pe_info(npe, "Removing DMA window\n");
+
+	rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
+			npe->pe_number,
+			0/* levels */, 0/* table address */,
+			0/* table size */, 0/* page size */);
+	if (rc) {
+		pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
+		return rc;
+	}
+	pnv_pci_ioda2_tce_invalidate_entire(phb, false);
+
+	pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
+			&npe->table_group);
+
+	return 0;
+}
+
+/* Switch ownership from platform code to external user (e.g. VFIO) */
+static void pnv_npu_take_ownership(struct iommu_table_group *table_group)
+{
+	struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
+	struct pnv_phb *phb = npe->phb;
+	int64_t rc;
+	struct pci_dev *gpdev = NULL;
+
+	/*
+	 * Note: NPU has just a single TVE in the hardware which means that
+	 * while used by the kernel, it can have either 32bit window or
+	 * DMA bypass but never both. So we deconfigure 32bit window only
+	 * if it was enabled at the moment of ownership change.
+	 */
+	if (npe->table_group.tables[0]) {
+		pnv_npu_unset_window(&npe->table_group, 0);
+		return;
+	}
+
+	/* Disable bypass */
+	rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
+			npe->pe_number, npe->pe_number,
+			0 /* bypass base */, 0);
+	if (rc) {
+		pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
+		return;
+	}
+	pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
+
+	get_gpu_pci_dev_and_pe(npe, &gpdev);
+	if (gpdev)
+		pnv_npu2_unmap_lpar_dev(gpdev);
+}
+
+static void pnv_npu_release_ownership(struct iommu_table_group *table_group)
+{
+	struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
+	struct pci_dev *gpdev = NULL;
+
+	get_gpu_pci_dev_and_pe(npe, &gpdev);
+	if (gpdev)
+		pnv_npu2_map_lpar_dev(gpdev, 0, MSR_DR | MSR_PR | MSR_HV);
+}
+
+static struct iommu_table_group_ops pnv_pci_npu_ops = {
+	.set_window = pnv_npu_set_window,
+	.unset_window = pnv_npu_unset_window,
+	.take_ownership = pnv_npu_take_ownership,
+	.release_ownership = pnv_npu_release_ownership,
+};
+#endif /* !CONFIG_IOMMU_API */
+
+/*
+ * NPU2 ATS
+ */
+/* Maximum possible number of ATSD MMIO registers per NPU */
+#define NV_NMMU_ATSD_REGS 8
+#define NV_NPU_MAX_PE_NUM	16
+
+/*
+ * A compound NPU IOMMU group which might consist of 1 GPU + 2xNPUs (POWER8) or
+ * up to 3 x (GPU + 2xNPUs) (POWER9).
+ */
+struct npu_comp {
+	struct iommu_table_group table_group;
+	int pe_num;
+	struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM];
+};
+
+/* An NPU descriptor, valid for POWER9 only */
+struct npu {
+	int index;
+	struct npu_comp npucomp;
+};
+
+#ifdef CONFIG_IOMMU_API
+static long pnv_npu_peers_create_table_userspace(
+		struct iommu_table_group *table_group,
+		int num, __u32 page_shift, __u64 window_size, __u32 levels,
+		struct iommu_table **ptbl)
+{
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+			table_group);
+
+	if (!npucomp->pe_num || !npucomp->pe[0] ||
+			!npucomp->pe[0]->table_group.ops ||
+			!npucomp->pe[0]->table_group.ops->create_table)
+		return -EFAULT;
+
+	return npucomp->pe[0]->table_group.ops->create_table(
+			&npucomp->pe[0]->table_group, num, page_shift,
+			window_size, levels, ptbl);
+}
+
+static long pnv_npu_peers_set_window(struct iommu_table_group *table_group,
+		int num, struct iommu_table *tbl)
+{
+	int i, j;
+	long ret = 0;
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+			table_group);
+
+	for (i = 0; i < npucomp->pe_num; ++i) {
+		struct pnv_ioda_pe *pe = npucomp->pe[i];
+
+		if (!pe->table_group.ops->set_window)
+			continue;
+
+		ret = pe->table_group.ops->set_window(&pe->table_group,
+				num, tbl);
+		if (ret)
+			break;
+	}
+
+	if (ret) {
+		for (j = 0; j < i; ++j) {
+			struct pnv_ioda_pe *pe = npucomp->pe[j];
+
+			if (!pe->table_group.ops->unset_window)
+				continue;
+
+			ret = pe->table_group.ops->unset_window(
+					&pe->table_group, num);
+			if (ret)
+				break;
+		}
+	} else {
+		table_group->tables[num] = iommu_tce_table_get(tbl);
+	}
+
+	return ret;
+}
+
+static long pnv_npu_peers_unset_window(struct iommu_table_group *table_group,
+		int num)
+{
+	int i, j;
+	long ret = 0;
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+			table_group);
+
+	for (i = 0; i < npucomp->pe_num; ++i) {
+		struct pnv_ioda_pe *pe = npucomp->pe[i];
+
+		WARN_ON(npucomp->table_group.tables[num] !=
+				table_group->tables[num]);
+		if (!npucomp->table_group.tables[num])
+			continue;
+
+		if (!pe->table_group.ops->unset_window)
+			continue;
+
+		ret = pe->table_group.ops->unset_window(&pe->table_group, num);
+		if (ret)
+			break;
+	}
+
+	if (ret) {
+		for (j = 0; j < i; ++j) {
+			struct pnv_ioda_pe *pe = npucomp->pe[j];
+
+			if (!npucomp->table_group.tables[num])
+				continue;
+
+			if (!pe->table_group.ops->set_window)
+				continue;
+
+			ret = pe->table_group.ops->set_window(&pe->table_group,
+					num, table_group->tables[num]);
+			if (ret)
+				break;
+		}
+	} else if (table_group->tables[num]) {
+		iommu_tce_table_put(table_group->tables[num]);
+		table_group->tables[num] = NULL;
+	}
+
+	return ret;
+}
+
+static void pnv_npu_peers_take_ownership(struct iommu_table_group *table_group)
+{
+	int i;
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+			table_group);
+
+	for (i = 0; i < npucomp->pe_num; ++i) {
+		struct pnv_ioda_pe *pe = npucomp->pe[i];
+
+		if (!pe->table_group.ops ||
+		    !pe->table_group.ops->take_ownership)
+			continue;
+		pe->table_group.ops->take_ownership(&pe->table_group);
+	}
+}
+
+static void pnv_npu_peers_release_ownership(
+		struct iommu_table_group *table_group)
+{
+	int i;
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+			table_group);
+
+	for (i = 0; i < npucomp->pe_num; ++i) {
+		struct pnv_ioda_pe *pe = npucomp->pe[i];
+
+		if (!pe->table_group.ops ||
+		    !pe->table_group.ops->release_ownership)
+			continue;
+		pe->table_group.ops->release_ownership(&pe->table_group);
+	}
+}
+
+static struct iommu_table_group_ops pnv_npu_peers_ops = {
+	.get_table_size = pnv_pci_ioda2_get_table_size,
+	.create_table = pnv_npu_peers_create_table_userspace,
+	.set_window = pnv_npu_peers_set_window,
+	.unset_window = pnv_npu_peers_unset_window,
+	.take_ownership = pnv_npu_peers_take_ownership,
+	.release_ownership = pnv_npu_peers_release_ownership,
+};
+
+static void pnv_comp_attach_table_group(struct npu_comp *npucomp,
+		struct pnv_ioda_pe *pe)
+{
+	if (WARN_ON(npucomp->pe_num == NV_NPU_MAX_PE_NUM))
+		return;
+
+	npucomp->pe[npucomp->pe_num] = pe;
+	++npucomp->pe_num;
+}
+
+static struct iommu_table_group *
+	pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
+{
+	struct iommu_table_group *compound_group;
+	struct npu_comp *npucomp;
+	struct pci_dev *gpdev = NULL;
+	struct pci_controller *hose;
+	struct pci_dev *npdev = NULL;
+
+	list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) {
+		npdev = pnv_pci_get_npu_dev(gpdev, 0);
+		if (npdev)
+			break;
+	}
+
+	if (!npdev)
+		/* It is not an NPU attached device, skip */
+		return NULL;
+
+	hose = pci_bus_to_host(npdev->bus);
+
+	if (hose->npu) {
+		/* P9 case: compound group is per-NPU (all gpus, all links) */
+		npucomp = &hose->npu->npucomp;
+	} else {
+		/* P8 case: Compound group is per-GPU (1 gpu, 2 links) */
+		npucomp = pe->npucomp = kzalloc(sizeof(*npucomp), GFP_KERNEL);
+	}
+
+	compound_group = &npucomp->table_group;
+	if (!compound_group->group) {
+		compound_group->ops = &pnv_npu_peers_ops;
+		iommu_register_group(compound_group, hose->global_number,
+				pe->pe_number);
+
+		/* Steal capabilities from a GPU PE */
+		compound_group->max_dynamic_windows_supported =
+			pe->table_group.max_dynamic_windows_supported;
+		compound_group->tce32_start = pe->table_group.tce32_start;
+		compound_group->tce32_size = pe->table_group.tce32_size;
+		compound_group->max_levels = pe->table_group.max_levels;
+		if (!compound_group->pgsizes)
+			compound_group->pgsizes = pe->table_group.pgsizes;
+	}
+
+	/*
+	 * The gpu would have been added to the iommu group that's created
+	 * for the PE. Pull it out now.
+	 */
+	iommu_del_device(&gpdev->dev);
+
+       /*
+	* I'm not sure this is strictly required, but it's probably a good idea
+	* since the table_group for the PE is going to be attached to the
+	* compound table group. If we leave the PE's iommu group active then
+	* we might have the same table_group being modifiable via two sepeate
+	* iommu groups.
+	*/
+	iommu_group_put(pe->table_group.group);
+
+	/* now put the GPU into the compound group */
+	pnv_comp_attach_table_group(npucomp, pe);
+	iommu_add_device(compound_group, &gpdev->dev);
+
+	return compound_group;
+}
+
+static struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
+{
+	struct iommu_table_group *table_group;
+	struct npu_comp *npucomp;
+	struct pci_dev *gpdev = NULL;
+	struct pci_dev *npdev;
+	struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(pe, &gpdev);
+
+	WARN_ON(!(pe->flags & PNV_IODA_PE_DEV));
+	if (!gpe)
+		return NULL;
+
+	/*
+	 * IODA2 bridges get this set up from pci_controller_ops::setup_bridge
+	 * but NPU bridges do not have this hook defined so we do it here.
+	 * We do not setup other table group parameters as they won't be used
+	 * anyway - NVLink bridges are subordinate PEs.
+	 */
+	pe->table_group.ops = &pnv_pci_npu_ops;
+
+	table_group = iommu_group_get_iommudata(
+			iommu_group_get(&gpdev->dev));
+
+	/*
+	 * On P9 NPU PHB and PCI PHB support different page sizes,
+	 * keep only matching. We expect here that NVLink bridge PE pgsizes is
+	 * initialized by the caller.
+	 */
+	table_group->pgsizes &= pe->table_group.pgsizes;
+	npucomp = container_of(table_group, struct npu_comp, table_group);
+	pnv_comp_attach_table_group(npucomp, pe);
+
+	list_for_each_entry(npdev, &pe->phb->hose->bus->devices, bus_list) {
+		struct pci_dev *gpdevtmp = pnv_pci_get_gpu_dev(npdev);
+
+		if (gpdevtmp != gpdev)
+			continue;
+
+		iommu_add_device(table_group, &npdev->dev);
+	}
+
+	return table_group;
+}
+
+void pnv_pci_npu_setup_iommu_groups(void)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	struct pnv_ioda_pe *pe;
+
+	/*
+	 * For non-nvlink devices the IOMMU group is registered when the PE is
+	 * configured and devices are added to the group when the per-device
+	 * DMA setup is run. That's done in hose->ops.dma_dev_setup() which is
+	 * only initialise for "normal" IODA PHBs.
+	 *
+	 * For NVLink devices we need to ensure the NVLinks and the GPU end up
+	 * in the same IOMMU group, so that's handled here.
+	 */
+	list_for_each_entry(hose, &hose_list, list_node) {
+		phb = hose->private_data;
+
+		if (phb->type == PNV_PHB_IODA2)
+			list_for_each_entry(pe, &phb->ioda.pe_list, list)
+				pnv_try_setup_npu_table_group(pe);
+	}
+
+	/*
+	 * Now we have all PHBs discovered, time to add NPU devices to
+	 * the corresponding IOMMU groups.
+	 */
+	list_for_each_entry(hose, &hose_list, list_node) {
+		unsigned long  pgsizes;
+
+		phb = hose->private_data;
+
+		if (phb->type != PNV_PHB_NPU_NVLINK)
+			continue;
+
+		pgsizes = pnv_ioda_parse_tce_sizes(phb);
+		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+			/*
+			 * IODA2 bridges get this set up from
+			 * pci_controller_ops::setup_bridge but NPU bridges
+			 * do not have this hook defined so we do it here.
+			 */
+			pe->table_group.pgsizes = pgsizes;
+			pnv_npu_compound_attach(pe);
+		}
+	}
+}
+#endif /* CONFIG_IOMMU_API */
+
+int pnv_npu2_init(struct pci_controller *hose)
+{
+	static int npu_index;
+	struct npu *npu;
+	int ret;
+
+	npu = kzalloc(sizeof(*npu), GFP_KERNEL);
+	if (!npu)
+		return -ENOMEM;
+
+	npu_index++;
+	if (WARN_ON(npu_index >= NV_MAX_NPUS)) {
+		ret = -ENOSPC;
+		goto fail_exit;
+	}
+	npu->index = npu_index;
+	hose->npu = npu;
+
+	return 0;
+
+fail_exit:
+	kfree(npu);
+	return ret;
+}
+
+int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
+		unsigned long msr)
+{
+	int ret;
+	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+	struct pci_controller *hose;
+	struct pnv_phb *nphb;
+
+	if (!npdev)
+		return -ENODEV;
+
+	hose = pci_bus_to_host(npdev->bus);
+	if (hose->npu == NULL) {
+		dev_info_once(&npdev->dev, "Nvlink1 does not support contexts");
+		return 0;
+	}
+
+	nphb = hose->private_data;
+
+	dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=%u\n",
+			nphb->opal_id, lparid);
+	/*
+	 * Currently we only support radix and non-zero LPCR only makes sense
+	 * for hash tables so skiboot expects the LPCR parameter to be a zero.
+	 */
+	ret = opal_npu_map_lpar(nphb->opal_id, pci_dev_id(gpdev), lparid,
+				0 /* LPCR bits */);
+	if (ret) {
+		dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
+		return ret;
+	}
+
+	dev_dbg(&gpdev->dev, "init context opalid=%llu msr=%lx\n",
+			nphb->opal_id, msr);
+	ret = opal_npu_init_context(nphb->opal_id, 0/*__unused*/, msr,
+				    pci_dev_id(gpdev));
+	if (ret < 0)
+		dev_err(&gpdev->dev, "Failed to init context: %d\n", ret);
+	else
+		ret = 0;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_npu2_map_lpar_dev);
+
+void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr)
+{
+	struct pci_dev *gpdev;
+
+	list_for_each_entry(gpdev, &gpe->pbus->devices, bus_list)
+		pnv_npu2_map_lpar_dev(gpdev, 0, msr);
+}
+
+int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev)
+{
+	int ret;
+	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+	struct pci_controller *hose;
+	struct pnv_phb *nphb;
+
+	if (!npdev)
+		return -ENODEV;
+
+	hose = pci_bus_to_host(npdev->bus);
+	if (hose->npu == NULL) {
+		dev_info_once(&npdev->dev, "Nvlink1 does not support contexts");
+		return 0;
+	}
+
+	nphb = hose->private_data;
+
+	dev_dbg(&gpdev->dev, "destroy context opalid=%llu\n",
+			nphb->opal_id);
+	ret = opal_npu_destroy_context(nphb->opal_id, 0/*__unused*/,
+				       pci_dev_id(gpdev));
+	if (ret < 0) {
+		dev_err(&gpdev->dev, "Failed to destroy context: %d\n", ret);
+		return ret;
+	}
+
+	/* Set LPID to 0 anyway, just to be safe */
+	dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=0\n", nphb->opal_id);
+	ret = opal_npu_map_lpar(nphb->opal_id, pci_dev_id(gpdev), 0 /*LPID*/,
+				0 /* LPCR bits */);
+	if (ret)
+		dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pnv_npu2_unmap_lpar_dev);
diff --git a/arch/powerpc/platforms/powernv/ocxl.c b/arch/powerpc/platforms/powernv/ocxl.c
new file mode 100644
index 000000000..ecdad219d
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/ocxl.c
@@ -0,0 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright 2017 IBM Corp.
+#include <asm/pnv-ocxl.h>
+#include <asm/opal.h>
+#include <misc/ocxl-config.h>
+#include "pci.h"
+
+#define PNV_OCXL_TL_P9_RECV_CAP		0x000000000000000Full
+#define PNV_OCXL_ACTAG_MAX		64
+/* PASIDs are 20-bit, but on P9, NPU can only handle 15 bits */
+#define PNV_OCXL_PASID_BITS		15
+#define PNV_OCXL_PASID_MAX		((1 << PNV_OCXL_PASID_BITS) - 1)
+
+#define AFU_PRESENT (1 << 31)
+#define AFU_INDEX_MASK 0x3F000000
+#define AFU_INDEX_SHIFT 24
+#define ACTAG_MASK 0xFFF
+
+
+struct actag_range {
+	u16 start;
+	u16 count;
+};
+
+struct npu_link {
+	struct list_head list;
+	int domain;
+	int bus;
+	int dev;
+	u16 fn_desired_actags[8];
+	struct actag_range fn_actags[8];
+	bool assignment_done;
+};
+static struct list_head links_list = LIST_HEAD_INIT(links_list);
+static DEFINE_MUTEX(links_list_lock);
+
+
+/*
+ * opencapi actags handling:
+ *
+ * When sending commands, the opencapi device references the memory
+ * context it's targeting with an 'actag', which is really an alias
+ * for a (BDF, pasid) combination. When it receives a command, the NPU
+ * must do a lookup of the actag to identify the memory context. The
+ * hardware supports a finite number of actags per link (64 for
+ * POWER9).
+ *
+ * The device can carry multiple functions, and each function can have
+ * multiple AFUs. Each AFU advertises in its config space the number
+ * of desired actags. The host must configure in the config space of
+ * the AFU how many actags the AFU is really allowed to use (which can
+ * be less than what the AFU desires).
+ *
+ * When a PCI function is probed by the driver, it has no visibility
+ * about the other PCI functions and how many actags they'd like,
+ * which makes it impossible to distribute actags fairly among AFUs.
+ *
+ * Unfortunately, the only way to know how many actags a function
+ * desires is by looking at the data for each AFU in the config space
+ * and add them up. Similarly, the only way to know how many actags
+ * all the functions of the physical device desire is by adding the
+ * previously computed function counts. Then we can match that against
+ * what the hardware supports.
+ *
+ * To get a comprehensive view, we use a 'pci fixup': at the end of
+ * PCI enumeration, each function counts how many actags its AFUs
+ * desire and we save it in a 'npu_link' structure, shared between all
+ * the PCI functions of a same device. Therefore, when the first
+ * function is probed by the driver, we can get an idea of the total
+ * count of desired actags for the device, and assign the actags to
+ * the AFUs, by pro-rating if needed.
+ */
+
+static int find_dvsec_from_pos(struct pci_dev *dev, int dvsec_id, int pos)
+{
+	int vsec = pos;
+	u16 vendor, id;
+
+	while ((vsec = pci_find_next_ext_capability(dev, vsec,
+						    OCXL_EXT_CAP_ID_DVSEC))) {
+		pci_read_config_word(dev, vsec + OCXL_DVSEC_VENDOR_OFFSET,
+				&vendor);
+		pci_read_config_word(dev, vsec + OCXL_DVSEC_ID_OFFSET, &id);
+		if (vendor == PCI_VENDOR_ID_IBM && id == dvsec_id)
+			return vsec;
+	}
+	return 0;
+}
+
+static int find_dvsec_afu_ctrl(struct pci_dev *dev, u8 afu_idx)
+{
+	int vsec = 0;
+	u8 idx;
+
+	while ((vsec = find_dvsec_from_pos(dev, OCXL_DVSEC_AFU_CTRL_ID,
+					   vsec))) {
+		pci_read_config_byte(dev, vsec + OCXL_DVSEC_AFU_CTRL_AFU_IDX,
+				&idx);
+		if (idx == afu_idx)
+			return vsec;
+	}
+	return 0;
+}
+
+static int get_max_afu_index(struct pci_dev *dev, int *afu_idx)
+{
+	int pos;
+	u32 val;
+
+	pos = find_dvsec_from_pos(dev, OCXL_DVSEC_FUNC_ID, 0);
+	if (!pos)
+		return -ESRCH;
+
+	pci_read_config_dword(dev, pos + OCXL_DVSEC_FUNC_OFF_INDEX, &val);
+	if (val & AFU_PRESENT)
+		*afu_idx = (val & AFU_INDEX_MASK) >> AFU_INDEX_SHIFT;
+	else
+		*afu_idx = -1;
+	return 0;
+}
+
+static int get_actag_count(struct pci_dev *dev, int afu_idx, int *actag)
+{
+	int pos;
+	u16 actag_sup;
+
+	pos = find_dvsec_afu_ctrl(dev, afu_idx);
+	if (!pos)
+		return -ESRCH;
+
+	pci_read_config_word(dev, pos + OCXL_DVSEC_AFU_CTRL_ACTAG_SUP,
+			&actag_sup);
+	*actag = actag_sup & ACTAG_MASK;
+	return 0;
+}
+
+static struct npu_link *find_link(struct pci_dev *dev)
+{
+	struct npu_link *link;
+
+	list_for_each_entry(link, &links_list, list) {
+		/* The functions of a device all share the same link */
+		if (link->domain == pci_domain_nr(dev->bus) &&
+			link->bus == dev->bus->number &&
+			link->dev == PCI_SLOT(dev->devfn)) {
+			return link;
+		}
+	}
+
+	/* link doesn't exist yet. Allocate one */
+	link = kzalloc(sizeof(struct npu_link), GFP_KERNEL);
+	if (!link)
+		return NULL;
+	link->domain = pci_domain_nr(dev->bus);
+	link->bus = dev->bus->number;
+	link->dev = PCI_SLOT(dev->devfn);
+	list_add(&link->list, &links_list);
+	return link;
+}
+
+static void pnv_ocxl_fixup_actag(struct pci_dev *dev)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct npu_link *link;
+	int rc, afu_idx = -1, i, actag;
+
+	if (!machine_is(powernv))
+		return;
+
+	if (phb->type != PNV_PHB_NPU_OCAPI)
+		return;
+
+	mutex_lock(&links_list_lock);
+
+	link = find_link(dev);
+	if (!link) {
+		dev_warn(&dev->dev, "couldn't update actag information\n");
+		mutex_unlock(&links_list_lock);
+		return;
+	}
+
+	/*
+	 * Check how many actags are desired for the AFUs under that
+	 * function and add it to the count for the link
+	 */
+	rc = get_max_afu_index(dev, &afu_idx);
+	if (rc) {
+		/* Most likely an invalid config space */
+		dev_dbg(&dev->dev, "couldn't find AFU information\n");
+		afu_idx = -1;
+	}
+
+	link->fn_desired_actags[PCI_FUNC(dev->devfn)] = 0;
+	for (i = 0; i <= afu_idx; i++) {
+		/*
+		 * AFU index 'holes' are allowed. So don't fail if we
+		 * can't read the actag info for an index
+		 */
+		rc = get_actag_count(dev, i, &actag);
+		if (rc)
+			continue;
+		link->fn_desired_actags[PCI_FUNC(dev->devfn)] += actag;
+	}
+	dev_dbg(&dev->dev, "total actags for function: %d\n",
+		link->fn_desired_actags[PCI_FUNC(dev->devfn)]);
+
+	mutex_unlock(&links_list_lock);
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pnv_ocxl_fixup_actag);
+
+static u16 assign_fn_actags(u16 desired, u16 total)
+{
+	u16 count;
+
+	if (total <= PNV_OCXL_ACTAG_MAX)
+		count = desired;
+	else
+		count = PNV_OCXL_ACTAG_MAX * desired / total;
+
+	return count;
+}
+
+static void assign_actags(struct npu_link *link)
+{
+	u16 actag_count, range_start = 0, total_desired = 0;
+	int i;
+
+	for (i = 0; i < 8; i++)
+		total_desired += link->fn_desired_actags[i];
+
+	for (i = 0; i < 8; i++) {
+		if (link->fn_desired_actags[i]) {
+			actag_count = assign_fn_actags(
+				link->fn_desired_actags[i],
+				total_desired);
+			link->fn_actags[i].start = range_start;
+			link->fn_actags[i].count = actag_count;
+			range_start += actag_count;
+			WARN_ON(range_start >= PNV_OCXL_ACTAG_MAX);
+		}
+		pr_debug("link %x:%x:%x fct %d actags: start=%d count=%d (desired=%d)\n",
+			link->domain, link->bus, link->dev, i,
+			link->fn_actags[i].start, link->fn_actags[i].count,
+			link->fn_desired_actags[i]);
+	}
+	link->assignment_done = true;
+}
+
+int pnv_ocxl_get_actag(struct pci_dev *dev, u16 *base, u16 *enabled,
+		u16 *supported)
+{
+	struct npu_link *link;
+
+	mutex_lock(&links_list_lock);
+
+	link = find_link(dev);
+	if (!link) {
+		dev_err(&dev->dev, "actag information not found\n");
+		mutex_unlock(&links_list_lock);
+		return -ENODEV;
+	}
+	/*
+	 * On p9, we only have 64 actags per link, so they must be
+	 * shared by all the functions of the same adapter. We counted
+	 * the desired actag counts during PCI enumeration, so that we
+	 * can allocate a pro-rated number of actags to each function.
+	 */
+	if (!link->assignment_done)
+		assign_actags(link);
+
+	*base      = link->fn_actags[PCI_FUNC(dev->devfn)].start;
+	*enabled   = link->fn_actags[PCI_FUNC(dev->devfn)].count;
+	*supported = link->fn_desired_actags[PCI_FUNC(dev->devfn)];
+
+	mutex_unlock(&links_list_lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_get_actag);
+
+int pnv_ocxl_get_pasid_count(struct pci_dev *dev, int *count)
+{
+	struct npu_link *link;
+	int i, rc = -EINVAL;
+
+	/*
+	 * The number of PASIDs (process address space ID) which can
+	 * be used by a function depends on how many functions exist
+	 * on the device. The NPU needs to be configured to know how
+	 * many bits are available to PASIDs and how many are to be
+	 * used by the function BDF indentifier.
+	 *
+	 * We only support one AFU-carrying function for now.
+	 */
+	mutex_lock(&links_list_lock);
+
+	link = find_link(dev);
+	if (!link) {
+		dev_err(&dev->dev, "actag information not found\n");
+		mutex_unlock(&links_list_lock);
+		return -ENODEV;
+	}
+
+	for (i = 0; i < 8; i++)
+		if (link->fn_desired_actags[i] && (i == PCI_FUNC(dev->devfn))) {
+			*count = PNV_OCXL_PASID_MAX;
+			rc = 0;
+			break;
+		}
+
+	mutex_unlock(&links_list_lock);
+	dev_dbg(&dev->dev, "%d PASIDs available for function\n",
+		rc ? 0 : *count);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_get_pasid_count);
+
+static void set_templ_rate(unsigned int templ, unsigned int rate, char *buf)
+{
+	int shift, idx;
+
+	WARN_ON(templ > PNV_OCXL_TL_MAX_TEMPLATE);
+	idx = (PNV_OCXL_TL_MAX_TEMPLATE - templ) / 2;
+	shift = 4 * (1 - ((PNV_OCXL_TL_MAX_TEMPLATE - templ) % 2));
+	buf[idx] |= rate << shift;
+}
+
+int pnv_ocxl_get_tl_cap(struct pci_dev *dev, long *cap,
+			char *rate_buf, int rate_buf_size)
+{
+	if (rate_buf_size != PNV_OCXL_TL_RATE_BUF_SIZE)
+		return -EINVAL;
+	/*
+	 * The TL capabilities are a characteristic of the NPU, so
+	 * we go with hard-coded values.
+	 *
+	 * The receiving rate of each template is encoded on 4 bits.
+	 *
+	 * On P9:
+	 * - templates 0 -> 3 are supported
+	 * - templates 0, 1 and 3 have a 0 receiving rate
+	 * - template 2 has receiving rate of 1 (extra cycle)
+	 */
+	memset(rate_buf, 0, rate_buf_size);
+	set_templ_rate(2, 1, rate_buf);
+	*cap = PNV_OCXL_TL_P9_RECV_CAP;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_get_tl_cap);
+
+int pnv_ocxl_set_tl_conf(struct pci_dev *dev, long cap,
+			uint64_t rate_buf_phys, int rate_buf_size)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	int rc;
+
+	if (rate_buf_size != PNV_OCXL_TL_RATE_BUF_SIZE)
+		return -EINVAL;
+
+	rc = opal_npu_tl_set(phb->opal_id, dev->devfn, cap,
+			rate_buf_phys, rate_buf_size);
+	if (rc) {
+		dev_err(&dev->dev, "Can't configure host TL: %d\n", rc);
+		return -EINVAL;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_set_tl_conf);
+
+int pnv_ocxl_get_xsl_irq(struct pci_dev *dev, int *hwirq)
+{
+	int rc;
+
+	rc = of_property_read_u32(dev->dev.of_node, "ibm,opal-xsl-irq", hwirq);
+	if (rc) {
+		dev_err(&dev->dev,
+			"Can't get translation interrupt for device\n");
+		return rc;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_get_xsl_irq);
+
+void pnv_ocxl_unmap_xsl_regs(void __iomem *dsisr, void __iomem *dar,
+			void __iomem *tfc, void __iomem *pe_handle)
+{
+	iounmap(dsisr);
+	iounmap(dar);
+	iounmap(tfc);
+	iounmap(pe_handle);
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_unmap_xsl_regs);
+
+int pnv_ocxl_map_xsl_regs(struct pci_dev *dev, void __iomem **dsisr,
+			void __iomem **dar, void __iomem **tfc,
+			void __iomem **pe_handle)
+{
+	u64 reg;
+	int i, j, rc = 0;
+	void __iomem *regs[4];
+
+	/*
+	 * opal stores the mmio addresses of the DSISR, DAR, TFC and
+	 * PE_HANDLE registers in a device tree property, in that
+	 * order
+	 */
+	for (i = 0; i < 4; i++) {
+		rc = of_property_read_u64_index(dev->dev.of_node,
+						"ibm,opal-xsl-mmio", i, &reg);
+		if (rc)
+			break;
+		regs[i] = ioremap(reg, 8);
+		if (!regs[i]) {
+			rc = -EINVAL;
+			break;
+		}
+	}
+	if (rc) {
+		dev_err(&dev->dev, "Can't map translation mmio registers\n");
+		for (j = i - 1; j >= 0; j--)
+			iounmap(regs[j]);
+	} else {
+		*dsisr = regs[0];
+		*dar = regs[1];
+		*tfc = regs[2];
+		*pe_handle = regs[3];
+	}
+	return rc;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_map_xsl_regs);
+
+struct spa_data {
+	u64 phb_opal_id;
+	u32 bdfn;
+};
+
+int pnv_ocxl_spa_setup(struct pci_dev *dev, void *spa_mem, int PE_mask,
+		void **platform_data)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct spa_data *data;
+	u32 bdfn;
+	int rc;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	bdfn = (dev->bus->number << 8) | dev->devfn;
+	rc = opal_npu_spa_setup(phb->opal_id, bdfn, virt_to_phys(spa_mem),
+				PE_mask);
+	if (rc) {
+		dev_err(&dev->dev, "Can't setup Shared Process Area: %d\n", rc);
+		kfree(data);
+		return rc;
+	}
+	data->phb_opal_id = phb->opal_id;
+	data->bdfn = bdfn;
+	*platform_data = (void *) data;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_spa_setup);
+
+void pnv_ocxl_spa_release(void *platform_data)
+{
+	struct spa_data *data = (struct spa_data *) platform_data;
+	int rc;
+
+	rc = opal_npu_spa_setup(data->phb_opal_id, data->bdfn, 0, 0);
+	WARN_ON(rc);
+	kfree(data);
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_spa_release);
+
+int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle)
+{
+	struct spa_data *data = (struct spa_data *) platform_data;
+	int rc;
+
+	rc = opal_npu_spa_clear_cache(data->phb_opal_id, data->bdfn, pe_handle);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_spa_remove_pe_from_cache);
diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c
new file mode 100644
index 000000000..c094fdf58
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-async.c
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerNV OPAL asynchronous completion interfaces
+ *
+ * Copyright 2013-2017 IBM Corp.
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <linux/gfp.h>
+#include <linux/of.h>
+#include <asm/machdep.h>
+#include <asm/opal.h>
+
+enum opal_async_token_state {
+	ASYNC_TOKEN_UNALLOCATED = 0,
+	ASYNC_TOKEN_ALLOCATED,
+	ASYNC_TOKEN_DISPATCHED,
+	ASYNC_TOKEN_ABANDONED,
+	ASYNC_TOKEN_COMPLETED
+};
+
+struct opal_async_token {
+	enum opal_async_token_state state;
+	struct opal_msg response;
+};
+
+static DECLARE_WAIT_QUEUE_HEAD(opal_async_wait);
+static DEFINE_SPINLOCK(opal_async_comp_lock);
+static struct semaphore opal_async_sem;
+static unsigned int opal_max_async_tokens;
+static struct opal_async_token *opal_async_tokens;
+
+static int __opal_async_get_token(void)
+{
+	unsigned long flags;
+	int i, token = -EBUSY;
+
+	spin_lock_irqsave(&opal_async_comp_lock, flags);
+
+	for (i = 0; i < opal_max_async_tokens; i++) {
+		if (opal_async_tokens[i].state == ASYNC_TOKEN_UNALLOCATED) {
+			opal_async_tokens[i].state = ASYNC_TOKEN_ALLOCATED;
+			token = i;
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&opal_async_comp_lock, flags);
+	return token;
+}
+
+/*
+ * Note: If the returned token is used in an opal call and opal returns
+ * OPAL_ASYNC_COMPLETION you MUST call one of opal_async_wait_response() or
+ * opal_async_wait_response_interruptible() at least once before calling another
+ * opal_async_* function
+ */
+int opal_async_get_token_interruptible(void)
+{
+	int token;
+
+	/* Wait until a token is available */
+	if (down_interruptible(&opal_async_sem))
+		return -ERESTARTSYS;
+
+	token = __opal_async_get_token();
+	if (token < 0)
+		up(&opal_async_sem);
+
+	return token;
+}
+EXPORT_SYMBOL_GPL(opal_async_get_token_interruptible);
+
+static int __opal_async_release_token(int token)
+{
+	unsigned long flags;
+	int rc;
+
+	if (token < 0 || token >= opal_max_async_tokens) {
+		pr_err("%s: Passed token is out of range, token %d\n",
+				__func__, token);
+		return -EINVAL;
+	}
+
+	spin_lock_irqsave(&opal_async_comp_lock, flags);
+	switch (opal_async_tokens[token].state) {
+	case ASYNC_TOKEN_COMPLETED:
+	case ASYNC_TOKEN_ALLOCATED:
+		opal_async_tokens[token].state = ASYNC_TOKEN_UNALLOCATED;
+		rc = 0;
+		break;
+	/*
+	 * DISPATCHED and ABANDONED tokens must wait for OPAL to respond.
+	 * Mark a DISPATCHED token as ABANDONED so that the response handling
+	 * code knows no one cares and that it can free it then.
+	 */
+	case ASYNC_TOKEN_DISPATCHED:
+		opal_async_tokens[token].state = ASYNC_TOKEN_ABANDONED;
+		fallthrough;
+	default:
+		rc = 1;
+	}
+	spin_unlock_irqrestore(&opal_async_comp_lock, flags);
+
+	return rc;
+}
+
+int opal_async_release_token(int token)
+{
+	int ret;
+
+	ret = __opal_async_release_token(token);
+	if (!ret)
+		up(&opal_async_sem);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(opal_async_release_token);
+
+int opal_async_wait_response(uint64_t token, struct opal_msg *msg)
+{
+	if (token >= opal_max_async_tokens) {
+		pr_err("%s: Invalid token passed\n", __func__);
+		return -EINVAL;
+	}
+
+	if (!msg) {
+		pr_err("%s: Invalid message pointer passed\n", __func__);
+		return -EINVAL;
+	}
+
+	/*
+	 * There is no need to mark the token as dispatched, wait_event()
+	 * will block until the token completes.
+	 *
+	 * Wakeup the poller before we wait for events to speed things
+	 * up on platforms or simulators where the interrupts aren't
+	 * functional.
+	 */
+	opal_wake_poller();
+	wait_event(opal_async_wait, opal_async_tokens[token].state
+			== ASYNC_TOKEN_COMPLETED);
+	memcpy(msg, &opal_async_tokens[token].response, sizeof(*msg));
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(opal_async_wait_response);
+
+int opal_async_wait_response_interruptible(uint64_t token, struct opal_msg *msg)
+{
+	unsigned long flags;
+	int ret;
+
+	if (token >= opal_max_async_tokens) {
+		pr_err("%s: Invalid token passed\n", __func__);
+		return -EINVAL;
+	}
+
+	if (!msg) {
+		pr_err("%s: Invalid message pointer passed\n", __func__);
+		return -EINVAL;
+	}
+
+	/*
+	 * The first time this gets called we mark the token as DISPATCHED
+	 * so that if wait_event_interruptible() returns not zero and the
+	 * caller frees the token, we know not to actually free the token
+	 * until the response comes.
+	 *
+	 * Only change if the token is ALLOCATED - it may have been
+	 * completed even before the caller gets around to calling this
+	 * the first time.
+	 *
+	 * There is also a dirty great comment at the token allocation
+	 * function that if the opal call returns OPAL_ASYNC_COMPLETION to
+	 * the caller then the caller *must* call this or the not
+	 * interruptible version before doing anything else with the
+	 * token.
+	 */
+	if (opal_async_tokens[token].state == ASYNC_TOKEN_ALLOCATED) {
+		spin_lock_irqsave(&opal_async_comp_lock, flags);
+		if (opal_async_tokens[token].state == ASYNC_TOKEN_ALLOCATED)
+			opal_async_tokens[token].state = ASYNC_TOKEN_DISPATCHED;
+		spin_unlock_irqrestore(&opal_async_comp_lock, flags);
+	}
+
+	/*
+	 * Wakeup the poller before we wait for events to speed things
+	 * up on platforms or simulators where the interrupts aren't
+	 * functional.
+	 */
+	opal_wake_poller();
+	ret = wait_event_interruptible(opal_async_wait,
+			opal_async_tokens[token].state ==
+			ASYNC_TOKEN_COMPLETED);
+	if (!ret)
+		memcpy(msg, &opal_async_tokens[token].response, sizeof(*msg));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(opal_async_wait_response_interruptible);
+
+/* Called from interrupt context */
+static int opal_async_comp_event(struct notifier_block *nb,
+		unsigned long msg_type, void *msg)
+{
+	struct opal_msg *comp_msg = msg;
+	enum opal_async_token_state state;
+	unsigned long flags;
+	uint64_t token;
+
+	if (msg_type != OPAL_MSG_ASYNC_COMP)
+		return 0;
+
+	token = be64_to_cpu(comp_msg->params[0]);
+	spin_lock_irqsave(&opal_async_comp_lock, flags);
+	state = opal_async_tokens[token].state;
+	opal_async_tokens[token].state = ASYNC_TOKEN_COMPLETED;
+	spin_unlock_irqrestore(&opal_async_comp_lock, flags);
+
+	if (state == ASYNC_TOKEN_ABANDONED) {
+		/* Free the token, no one else will */
+		opal_async_release_token(token);
+		return 0;
+	}
+	memcpy(&opal_async_tokens[token].response, comp_msg, sizeof(*comp_msg));
+	wake_up(&opal_async_wait);
+
+	return 0;
+}
+
+static struct notifier_block opal_async_comp_nb = {
+		.notifier_call	= opal_async_comp_event,
+		.next		= NULL,
+		.priority	= 0,
+};
+
+int __init opal_async_comp_init(void)
+{
+	struct device_node *opal_node;
+	const __be32 *async;
+	int err;
+
+	opal_node = of_find_node_by_path("/ibm,opal");
+	if (!opal_node) {
+		pr_err("%s: Opal node not found\n", __func__);
+		err = -ENOENT;
+		goto out;
+	}
+
+	async = of_get_property(opal_node, "opal-msg-async-num", NULL);
+	if (!async) {
+		pr_err("%s: %pOF has no opal-msg-async-num\n",
+				__func__, opal_node);
+		err = -ENOENT;
+		goto out_opal_node;
+	}
+
+	opal_max_async_tokens = be32_to_cpup(async);
+	opal_async_tokens = kcalloc(opal_max_async_tokens,
+			sizeof(*opal_async_tokens), GFP_KERNEL);
+	if (!opal_async_tokens) {
+		err = -ENOMEM;
+		goto out_opal_node;
+	}
+
+	err = opal_message_notifier_register(OPAL_MSG_ASYNC_COMP,
+			&opal_async_comp_nb);
+	if (err) {
+		pr_err("%s: Can't register OPAL event notifier (%d)\n",
+				__func__, err);
+		kfree(opal_async_tokens);
+		goto out_opal_node;
+	}
+
+	sema_init(&opal_async_sem, opal_max_async_tokens);
+
+out_opal_node:
+	of_node_put(opal_node);
+out:
+	return err;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c
new file mode 100644
index 000000000..5cd0f52d2
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-call.c
@@ -0,0 +1,295 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/percpu.h>
+#include <linux/jump_label.h>
+#include <asm/opal-api.h>
+#include <asm/trace.h>
+#include <asm/asm-prototypes.h>
+
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * Since the tracing code might execute OPAL calls we need to guard against
+ * recursion.
+ */
+static DEFINE_PER_CPU(unsigned int, opal_trace_depth);
+
+static void __trace_opal_entry(s64 a0, s64 a1, s64 a2, s64 a3,
+			       s64 a4, s64 a5, s64 a6, s64 a7,
+			       unsigned long opcode)
+{
+	unsigned int *depth;
+	unsigned long args[8];
+
+	depth = this_cpu_ptr(&opal_trace_depth);
+
+	if (*depth)
+		return;
+
+	args[0] = a0;
+	args[1] = a1;
+	args[2] = a2;
+	args[3] = a3;
+	args[4] = a4;
+	args[5] = a5;
+	args[6] = a6;
+	args[7] = a7;
+
+	(*depth)++;
+	trace_opal_entry(opcode, &args[0]);
+	(*depth)--;
+}
+
+static void __trace_opal_exit(unsigned long opcode, unsigned long retval)
+{
+	unsigned int *depth;
+
+	depth = this_cpu_ptr(&opal_trace_depth);
+
+	if (*depth)
+		return;
+
+	(*depth)++;
+	trace_opal_exit(opcode, retval);
+	(*depth)--;
+}
+
+static DEFINE_STATIC_KEY_FALSE(opal_tracepoint_key);
+
+int opal_tracepoint_regfunc(void)
+{
+	static_branch_inc(&opal_tracepoint_key);
+	return 0;
+}
+
+void opal_tracepoint_unregfunc(void)
+{
+	static_branch_dec(&opal_tracepoint_key);
+}
+
+static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3,
+			     s64 a4, s64 a5, s64 a6, s64 a7,
+			      unsigned long opcode, unsigned long msr)
+{
+	s64 ret;
+
+	__trace_opal_entry(a0, a1, a2, a3, a4, a5, a6, a7, opcode);
+	ret = __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
+	__trace_opal_exit(opcode, ret);
+
+	return ret;
+}
+
+#define DO_TRACE (static_branch_unlikely(&opal_tracepoint_key))
+
+#else /* CONFIG_TRACEPOINTS */
+
+static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3,
+			     s64 a4, s64 a5, s64 a6, s64 a7,
+			      unsigned long opcode, unsigned long msr)
+{
+	return 0;
+}
+
+#define DO_TRACE false
+#endif /* CONFIG_TRACEPOINTS */
+
+static int64_t opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
+	     int64_t a4, int64_t a5, int64_t a6, int64_t a7, int64_t opcode)
+{
+	unsigned long flags;
+	unsigned long msr = mfmsr();
+	bool mmu = (msr & (MSR_IR|MSR_DR));
+	int64_t ret;
+
+	msr &= ~MSR_EE;
+
+	if (unlikely(!mmu))
+		return __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
+
+	local_save_flags(flags);
+	hard_irq_disable();
+
+	if (DO_TRACE) {
+		ret = __opal_call_trace(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
+	} else {
+		ret = __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
+	}
+
+	local_irq_restore(flags);
+
+	return ret;
+}
+
+#define OPAL_CALL(name, opcode)					\
+int64_t name(int64_t a0, int64_t a1, int64_t a2, int64_t a3,	\
+	     int64_t a4, int64_t a5, int64_t a6, int64_t a7);	\
+int64_t name(int64_t a0, int64_t a1, int64_t a2, int64_t a3,	\
+	     int64_t a4, int64_t a5, int64_t a6, int64_t a7)	\
+{								\
+	return opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode); \
+}
+
+OPAL_CALL(opal_invalid_call,			OPAL_INVALID_CALL);
+OPAL_CALL(opal_console_write,			OPAL_CONSOLE_WRITE);
+OPAL_CALL(opal_console_read,			OPAL_CONSOLE_READ);
+OPAL_CALL(opal_console_write_buffer_space,	OPAL_CONSOLE_WRITE_BUFFER_SPACE);
+OPAL_CALL(opal_rtc_read,			OPAL_RTC_READ);
+OPAL_CALL(opal_rtc_write,			OPAL_RTC_WRITE);
+OPAL_CALL(opal_cec_power_down,			OPAL_CEC_POWER_DOWN);
+OPAL_CALL(opal_cec_reboot,			OPAL_CEC_REBOOT);
+OPAL_CALL(opal_cec_reboot2,			OPAL_CEC_REBOOT2);
+OPAL_CALL(opal_read_nvram,			OPAL_READ_NVRAM);
+OPAL_CALL(opal_write_nvram,			OPAL_WRITE_NVRAM);
+OPAL_CALL(opal_handle_interrupt,		OPAL_HANDLE_INTERRUPT);
+OPAL_CALL(opal_poll_events,			OPAL_POLL_EVENTS);
+OPAL_CALL(opal_pci_set_hub_tce_memory,		OPAL_PCI_SET_HUB_TCE_MEMORY);
+OPAL_CALL(opal_pci_set_phb_tce_memory,		OPAL_PCI_SET_PHB_TCE_MEMORY);
+OPAL_CALL(opal_pci_config_read_byte,		OPAL_PCI_CONFIG_READ_BYTE);
+OPAL_CALL(opal_pci_config_read_half_word,	OPAL_PCI_CONFIG_READ_HALF_WORD);
+OPAL_CALL(opal_pci_config_read_word,		OPAL_PCI_CONFIG_READ_WORD);
+OPAL_CALL(opal_pci_config_write_byte,		OPAL_PCI_CONFIG_WRITE_BYTE);
+OPAL_CALL(opal_pci_config_write_half_word,	OPAL_PCI_CONFIG_WRITE_HALF_WORD);
+OPAL_CALL(opal_pci_config_write_word,		OPAL_PCI_CONFIG_WRITE_WORD);
+OPAL_CALL(opal_set_xive,			OPAL_SET_XIVE);
+OPAL_CALL(opal_get_xive,			OPAL_GET_XIVE);
+OPAL_CALL(opal_register_exception_handler,	OPAL_REGISTER_OPAL_EXCEPTION_HANDLER);
+OPAL_CALL(opal_pci_eeh_freeze_status,		OPAL_PCI_EEH_FREEZE_STATUS);
+OPAL_CALL(opal_pci_eeh_freeze_clear,		OPAL_PCI_EEH_FREEZE_CLEAR);
+OPAL_CALL(opal_pci_eeh_freeze_set,		OPAL_PCI_EEH_FREEZE_SET);
+OPAL_CALL(opal_pci_err_inject,			OPAL_PCI_ERR_INJECT);
+OPAL_CALL(opal_pci_shpc,			OPAL_PCI_SHPC);
+OPAL_CALL(opal_pci_phb_mmio_enable,		OPAL_PCI_PHB_MMIO_ENABLE);
+OPAL_CALL(opal_pci_set_phb_mem_window,		OPAL_PCI_SET_PHB_MEM_WINDOW);
+OPAL_CALL(opal_pci_map_pe_mmio_window,		OPAL_PCI_MAP_PE_MMIO_WINDOW);
+OPAL_CALL(opal_pci_set_phb_table_memory,	OPAL_PCI_SET_PHB_TABLE_MEMORY);
+OPAL_CALL(opal_pci_set_pe,			OPAL_PCI_SET_PE);
+OPAL_CALL(opal_pci_set_peltv,			OPAL_PCI_SET_PELTV);
+OPAL_CALL(opal_pci_set_mve,			OPAL_PCI_SET_MVE);
+OPAL_CALL(opal_pci_set_mve_enable,		OPAL_PCI_SET_MVE_ENABLE);
+OPAL_CALL(opal_pci_get_xive_reissue,		OPAL_PCI_GET_XIVE_REISSUE);
+OPAL_CALL(opal_pci_set_xive_reissue,		OPAL_PCI_SET_XIVE_REISSUE);
+OPAL_CALL(opal_pci_set_xive_pe,			OPAL_PCI_SET_XIVE_PE);
+OPAL_CALL(opal_get_xive_source,			OPAL_GET_XIVE_SOURCE);
+OPAL_CALL(opal_get_msi_32,			OPAL_GET_MSI_32);
+OPAL_CALL(opal_get_msi_64,			OPAL_GET_MSI_64);
+OPAL_CALL(opal_start_cpu,			OPAL_START_CPU);
+OPAL_CALL(opal_query_cpu_status,		OPAL_QUERY_CPU_STATUS);
+OPAL_CALL(opal_write_oppanel,			OPAL_WRITE_OPPANEL);
+OPAL_CALL(opal_pci_map_pe_dma_window,		OPAL_PCI_MAP_PE_DMA_WINDOW);
+OPAL_CALL(opal_pci_map_pe_dma_window_real,	OPAL_PCI_MAP_PE_DMA_WINDOW_REAL);
+OPAL_CALL(opal_pci_reset,			OPAL_PCI_RESET);
+OPAL_CALL(opal_pci_get_hub_diag_data,		OPAL_PCI_GET_HUB_DIAG_DATA);
+OPAL_CALL(opal_pci_get_phb_diag_data,		OPAL_PCI_GET_PHB_DIAG_DATA);
+OPAL_CALL(opal_pci_fence_phb,			OPAL_PCI_FENCE_PHB);
+OPAL_CALL(opal_pci_reinit,			OPAL_PCI_REINIT);
+OPAL_CALL(opal_pci_mask_pe_error,		OPAL_PCI_MASK_PE_ERROR);
+OPAL_CALL(opal_set_slot_led_status,		OPAL_SET_SLOT_LED_STATUS);
+OPAL_CALL(opal_get_epow_status,			OPAL_GET_EPOW_STATUS);
+OPAL_CALL(opal_get_dpo_status,			OPAL_GET_DPO_STATUS);
+OPAL_CALL(opal_set_system_attention_led,	OPAL_SET_SYSTEM_ATTENTION_LED);
+OPAL_CALL(opal_pci_next_error,			OPAL_PCI_NEXT_ERROR);
+OPAL_CALL(opal_pci_poll,			OPAL_PCI_POLL);
+OPAL_CALL(opal_pci_msi_eoi,			OPAL_PCI_MSI_EOI);
+OPAL_CALL(opal_pci_get_phb_diag_data2,		OPAL_PCI_GET_PHB_DIAG_DATA2);
+OPAL_CALL(opal_xscom_read,			OPAL_XSCOM_READ);
+OPAL_CALL(opal_xscom_write,			OPAL_XSCOM_WRITE);
+OPAL_CALL(opal_lpc_read,			OPAL_LPC_READ);
+OPAL_CALL(opal_lpc_write,			OPAL_LPC_WRITE);
+OPAL_CALL(opal_return_cpu,			OPAL_RETURN_CPU);
+OPAL_CALL(opal_reinit_cpus,			OPAL_REINIT_CPUS);
+OPAL_CALL(opal_read_elog,			OPAL_ELOG_READ);
+OPAL_CALL(opal_send_ack_elog,			OPAL_ELOG_ACK);
+OPAL_CALL(opal_get_elog_size,			OPAL_ELOG_SIZE);
+OPAL_CALL(opal_resend_pending_logs,		OPAL_ELOG_RESEND);
+OPAL_CALL(opal_write_elog,			OPAL_ELOG_WRITE);
+OPAL_CALL(opal_validate_flash,			OPAL_FLASH_VALIDATE);
+OPAL_CALL(opal_manage_flash,			OPAL_FLASH_MANAGE);
+OPAL_CALL(opal_update_flash,			OPAL_FLASH_UPDATE);
+OPAL_CALL(opal_resync_timebase,			OPAL_RESYNC_TIMEBASE);
+OPAL_CALL(opal_check_token,			OPAL_CHECK_TOKEN);
+OPAL_CALL(opal_dump_init,			OPAL_DUMP_INIT);
+OPAL_CALL(opal_dump_info,			OPAL_DUMP_INFO);
+OPAL_CALL(opal_dump_info2,			OPAL_DUMP_INFO2);
+OPAL_CALL(opal_dump_read,			OPAL_DUMP_READ);
+OPAL_CALL(opal_dump_ack,			OPAL_DUMP_ACK);
+OPAL_CALL(opal_get_msg,				OPAL_GET_MSG);
+OPAL_CALL(opal_write_oppanel_async,		OPAL_WRITE_OPPANEL_ASYNC);
+OPAL_CALL(opal_check_completion,		OPAL_CHECK_ASYNC_COMPLETION);
+OPAL_CALL(opal_dump_resend_notification,	OPAL_DUMP_RESEND);
+OPAL_CALL(opal_sync_host_reboot,		OPAL_SYNC_HOST_REBOOT);
+OPAL_CALL(opal_sensor_read,			OPAL_SENSOR_READ);
+OPAL_CALL(opal_get_param,			OPAL_GET_PARAM);
+OPAL_CALL(opal_set_param,			OPAL_SET_PARAM);
+OPAL_CALL(opal_handle_hmi,			OPAL_HANDLE_HMI);
+OPAL_CALL(opal_handle_hmi2,			OPAL_HANDLE_HMI2);
+OPAL_CALL(opal_config_cpu_idle_state,		OPAL_CONFIG_CPU_IDLE_STATE);
+OPAL_CALL(opal_slw_set_reg,			OPAL_SLW_SET_REG);
+OPAL_CALL(opal_register_dump_region,		OPAL_REGISTER_DUMP_REGION);
+OPAL_CALL(opal_unregister_dump_region,		OPAL_UNREGISTER_DUMP_REGION);
+OPAL_CALL(opal_pci_set_phb_cxl_mode,		OPAL_PCI_SET_PHB_CAPI_MODE);
+OPAL_CALL(opal_tpo_write,			OPAL_WRITE_TPO);
+OPAL_CALL(opal_tpo_read,			OPAL_READ_TPO);
+OPAL_CALL(opal_ipmi_send,			OPAL_IPMI_SEND);
+OPAL_CALL(opal_ipmi_recv,			OPAL_IPMI_RECV);
+OPAL_CALL(opal_i2c_request,			OPAL_I2C_REQUEST);
+OPAL_CALL(opal_flash_read,			OPAL_FLASH_READ);
+OPAL_CALL(opal_flash_write,			OPAL_FLASH_WRITE);
+OPAL_CALL(opal_flash_erase,			OPAL_FLASH_ERASE);
+OPAL_CALL(opal_prd_msg,				OPAL_PRD_MSG);
+OPAL_CALL(opal_leds_get_ind,			OPAL_LEDS_GET_INDICATOR);
+OPAL_CALL(opal_leds_set_ind,			OPAL_LEDS_SET_INDICATOR);
+OPAL_CALL(opal_console_flush,			OPAL_CONSOLE_FLUSH);
+OPAL_CALL(opal_get_device_tree,			OPAL_GET_DEVICE_TREE);
+OPAL_CALL(opal_pci_get_presence_state,		OPAL_PCI_GET_PRESENCE_STATE);
+OPAL_CALL(opal_pci_get_power_state,		OPAL_PCI_GET_POWER_STATE);
+OPAL_CALL(opal_pci_set_power_state,		OPAL_PCI_SET_POWER_STATE);
+OPAL_CALL(opal_int_get_xirr,			OPAL_INT_GET_XIRR);
+OPAL_CALL(opal_int_set_cppr,			OPAL_INT_SET_CPPR);
+OPAL_CALL(opal_int_eoi,				OPAL_INT_EOI);
+OPAL_CALL(opal_int_set_mfrr,			OPAL_INT_SET_MFRR);
+OPAL_CALL(opal_pci_tce_kill,			OPAL_PCI_TCE_KILL);
+OPAL_CALL(opal_nmmu_set_ptcr,			OPAL_NMMU_SET_PTCR);
+OPAL_CALL(opal_xive_reset,			OPAL_XIVE_RESET);
+OPAL_CALL(opal_xive_get_irq_info,		OPAL_XIVE_GET_IRQ_INFO);
+OPAL_CALL(opal_xive_get_irq_config,		OPAL_XIVE_GET_IRQ_CONFIG);
+OPAL_CALL(opal_xive_set_irq_config,		OPAL_XIVE_SET_IRQ_CONFIG);
+OPAL_CALL(opal_xive_get_queue_info,		OPAL_XIVE_GET_QUEUE_INFO);
+OPAL_CALL(opal_xive_set_queue_info,		OPAL_XIVE_SET_QUEUE_INFO);
+OPAL_CALL(opal_xive_donate_page,		OPAL_XIVE_DONATE_PAGE);
+OPAL_CALL(opal_xive_alloc_vp_block,		OPAL_XIVE_ALLOCATE_VP_BLOCK);
+OPAL_CALL(opal_xive_free_vp_block,		OPAL_XIVE_FREE_VP_BLOCK);
+OPAL_CALL(opal_xive_allocate_irq_raw,		OPAL_XIVE_ALLOCATE_IRQ);
+OPAL_CALL(opal_xive_free_irq,			OPAL_XIVE_FREE_IRQ);
+OPAL_CALL(opal_xive_get_vp_info,		OPAL_XIVE_GET_VP_INFO);
+OPAL_CALL(opal_xive_set_vp_info,		OPAL_XIVE_SET_VP_INFO);
+OPAL_CALL(opal_xive_sync,			OPAL_XIVE_SYNC);
+OPAL_CALL(opal_xive_dump,			OPAL_XIVE_DUMP);
+OPAL_CALL(opal_xive_get_queue_state,		OPAL_XIVE_GET_QUEUE_STATE);
+OPAL_CALL(opal_xive_set_queue_state,		OPAL_XIVE_SET_QUEUE_STATE);
+OPAL_CALL(opal_xive_get_vp_state,		OPAL_XIVE_GET_VP_STATE);
+OPAL_CALL(opal_signal_system_reset,		OPAL_SIGNAL_SYSTEM_RESET);
+OPAL_CALL(opal_npu_init_context,		OPAL_NPU_INIT_CONTEXT);
+OPAL_CALL(opal_npu_destroy_context,		OPAL_NPU_DESTROY_CONTEXT);
+OPAL_CALL(opal_npu_map_lpar,			OPAL_NPU_MAP_LPAR);
+OPAL_CALL(opal_imc_counters_init,		OPAL_IMC_COUNTERS_INIT);
+OPAL_CALL(opal_imc_counters_start,		OPAL_IMC_COUNTERS_START);
+OPAL_CALL(opal_imc_counters_stop,		OPAL_IMC_COUNTERS_STOP);
+OPAL_CALL(opal_get_powercap,			OPAL_GET_POWERCAP);
+OPAL_CALL(opal_set_powercap,			OPAL_SET_POWERCAP);
+OPAL_CALL(opal_get_power_shift_ratio,		OPAL_GET_POWER_SHIFT_RATIO);
+OPAL_CALL(opal_set_power_shift_ratio,		OPAL_SET_POWER_SHIFT_RATIO);
+OPAL_CALL(opal_sensor_group_clear,		OPAL_SENSOR_GROUP_CLEAR);
+OPAL_CALL(opal_quiesce,				OPAL_QUIESCE);
+OPAL_CALL(opal_npu_spa_setup,			OPAL_NPU_SPA_SETUP);
+OPAL_CALL(opal_npu_spa_clear_cache,		OPAL_NPU_SPA_CLEAR_CACHE);
+OPAL_CALL(opal_npu_tl_set,			OPAL_NPU_TL_SET);
+OPAL_CALL(opal_pci_get_pbcq_tunnel_bar,		OPAL_PCI_GET_PBCQ_TUNNEL_BAR);
+OPAL_CALL(opal_pci_set_pbcq_tunnel_bar,		OPAL_PCI_SET_PBCQ_TUNNEL_BAR);
+OPAL_CALL(opal_sensor_read_u64,			OPAL_SENSOR_READ_U64);
+OPAL_CALL(opal_sensor_group_enable,		OPAL_SENSOR_GROUP_ENABLE);
+OPAL_CALL(opal_nx_coproc_init,			OPAL_NX_COPROC_INIT);
+OPAL_CALL(opal_mpipl_update,			OPAL_MPIPL_UPDATE);
+OPAL_CALL(opal_mpipl_register_tag,		OPAL_MPIPL_REGISTER_TAG);
+OPAL_CALL(opal_mpipl_query_tag,			OPAL_MPIPL_QUERY_TAG);
+OPAL_CALL(opal_secvar_get,			OPAL_SECVAR_GET);
+OPAL_CALL(opal_secvar_get_next,			OPAL_SECVAR_GET_NEXT);
+OPAL_CALL(opal_secvar_enqueue_update,		OPAL_SECVAR_ENQUEUE_UPDATE);
diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c
new file mode 100644
index 000000000..23571f0b5
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-core.c
@@ -0,0 +1,661 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Interface for exporting the OPAL ELF core.
+ * Heavily inspired from fs/proc/vmcore.c
+ *
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "opal core: " fmt
+
+#include <linux/memblock.h>
+#include <linux/uaccess.h>
+#include <linux/proc_fs.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/crash_core.h>
+#include <linux/of.h>
+
+#include <asm/page.h>
+#include <asm/opal.h>
+#include <asm/fadump-internal.h>
+
+#include "opal-fadump.h"
+
+#define MAX_PT_LOAD_CNT		8
+
+/* NT_AUXV note related info */
+#define AUXV_CNT		1
+#define AUXV_DESC_SZ		(((2 * AUXV_CNT) + 1) * sizeof(Elf64_Off))
+
+struct opalcore_config {
+	u32			num_cpus;
+	/* PIR value of crashing CPU */
+	u32			crashing_cpu;
+
+	/* CPU state data info from F/W */
+	u64			cpu_state_destination_vaddr;
+	u64			cpu_state_data_size;
+	u64			cpu_state_entry_size;
+
+	/* OPAL memory to be exported as PT_LOAD segments */
+	u64			ptload_addr[MAX_PT_LOAD_CNT];
+	u64			ptload_size[MAX_PT_LOAD_CNT];
+	u64			ptload_cnt;
+
+	/* Pointer to the first PT_LOAD in the ELF core file */
+	Elf64_Phdr		*ptload_phdr;
+
+	/* Total size of opalcore file. */
+	size_t			opalcore_size;
+
+	/* Buffer for all the ELF core headers and the PT_NOTE */
+	size_t			opalcorebuf_sz;
+	char			*opalcorebuf;
+
+	/* NT_AUXV buffer */
+	char			auxv_buf[AUXV_DESC_SZ];
+};
+
+struct opalcore {
+	struct list_head	list;
+	u64			paddr;
+	size_t			size;
+	loff_t			offset;
+};
+
+static LIST_HEAD(opalcore_list);
+static struct opalcore_config *oc_conf;
+static const struct opal_mpipl_fadump *opalc_metadata;
+static const struct opal_mpipl_fadump *opalc_cpu_metadata;
+struct kobject *mpipl_kobj;
+
+/*
+ * Set crashing CPU's signal to SIGUSR1. if the kernel is triggered
+ * by kernel, SIGTERM otherwise.
+ */
+bool kernel_initiated;
+
+static struct opalcore * __init get_new_element(void)
+{
+	return kzalloc(sizeof(struct opalcore), GFP_KERNEL);
+}
+
+static inline int is_opalcore_usable(void)
+{
+	return (oc_conf && oc_conf->opalcorebuf != NULL) ? 1 : 0;
+}
+
+static Elf64_Word *append_elf64_note(Elf64_Word *buf, char *name,
+				     u32 type, void *data,
+				     size_t data_len)
+{
+	Elf64_Nhdr *note = (Elf64_Nhdr *)buf;
+	Elf64_Word namesz = strlen(name) + 1;
+
+	note->n_namesz = cpu_to_be32(namesz);
+	note->n_descsz = cpu_to_be32(data_len);
+	note->n_type   = cpu_to_be32(type);
+	buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf64_Word));
+	memcpy(buf, name, namesz);
+	buf += DIV_ROUND_UP(namesz, sizeof(Elf64_Word));
+	memcpy(buf, data, data_len);
+	buf += DIV_ROUND_UP(data_len, sizeof(Elf64_Word));
+
+	return buf;
+}
+
+static void fill_prstatus(struct elf_prstatus *prstatus, int pir,
+			  struct pt_regs *regs)
+{
+	memset(prstatus, 0, sizeof(struct elf_prstatus));
+	elf_core_copy_kernel_regs(&(prstatus->pr_reg), regs);
+
+	/*
+	 * Overload PID with PIR value.
+	 * As a PIR value could also be '0', add an offset of '100'
+	 * to every PIR to avoid misinterpretations in GDB.
+	 */
+	prstatus->pr_pid  = cpu_to_be32(100 + pir);
+	prstatus->pr_ppid = cpu_to_be32(1);
+
+	/*
+	 * Indicate SIGUSR1 for crash initiated from kernel.
+	 * SIGTERM otherwise.
+	 */
+	if (pir == oc_conf->crashing_cpu) {
+		short sig;
+
+		sig = kernel_initiated ? SIGUSR1 : SIGTERM;
+		prstatus->pr_cursig = cpu_to_be16(sig);
+	}
+}
+
+static Elf64_Word *auxv_to_elf64_notes(Elf64_Word *buf,
+				       u64 opal_boot_entry)
+{
+	Elf64_Off *bufp = (Elf64_Off *)oc_conf->auxv_buf;
+	int idx = 0;
+
+	memset(bufp, 0, AUXV_DESC_SZ);
+
+	/* Entry point of OPAL */
+	bufp[idx++] = cpu_to_be64(AT_ENTRY);
+	bufp[idx++] = cpu_to_be64(opal_boot_entry);
+
+	/* end of vector */
+	bufp[idx++] = cpu_to_be64(AT_NULL);
+
+	buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_AUXV,
+				oc_conf->auxv_buf, AUXV_DESC_SZ);
+	return buf;
+}
+
+/*
+ * Read from the ELF header and then the crash dump.
+ * Returns number of bytes read on success, -errno on failure.
+ */
+static ssize_t read_opalcore(struct file *file, struct kobject *kobj,
+			     struct bin_attribute *bin_attr, char *to,
+			     loff_t pos, size_t count)
+{
+	struct opalcore *m;
+	ssize_t tsz, avail;
+	loff_t tpos = pos;
+
+	if (pos >= oc_conf->opalcore_size)
+		return 0;
+
+	/* Adjust count if it goes beyond opalcore size */
+	avail = oc_conf->opalcore_size - pos;
+	if (count > avail)
+		count = avail;
+
+	if (count == 0)
+		return 0;
+
+	/* Read ELF core header and/or PT_NOTE segment */
+	if (tpos < oc_conf->opalcorebuf_sz) {
+		tsz = min_t(size_t, oc_conf->opalcorebuf_sz - tpos, count);
+		memcpy(to, oc_conf->opalcorebuf + tpos, tsz);
+		to += tsz;
+		tpos += tsz;
+		count -= tsz;
+	}
+
+	list_for_each_entry(m, &opalcore_list, list) {
+		/* nothing more to read here */
+		if (count == 0)
+			break;
+
+		if (tpos < m->offset + m->size) {
+			void *addr;
+
+			tsz = min_t(size_t, m->offset + m->size - tpos, count);
+			addr = (void *)(m->paddr + tpos - m->offset);
+			memcpy(to, __va(addr), tsz);
+			to += tsz;
+			tpos += tsz;
+			count -= tsz;
+		}
+	}
+
+	return (tpos - pos);
+}
+
+static struct bin_attribute opal_core_attr = {
+	.attr = {.name = "core", .mode = 0400},
+	.read = read_opalcore
+};
+
+/*
+ * Read CPU state dump data and convert it into ELF notes.
+ *
+ * Each register entry is of 16 bytes, A numerical identifier along with
+ * a GPR/SPR flag in the first 8 bytes and the register value in the next
+ * 8 bytes. For more details refer to F/W documentation.
+ */
+static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf)
+{
+	u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize;
+	struct hdat_fadump_thread_hdr *thdr;
+	struct elf_prstatus prstatus;
+	Elf64_Word *first_cpu_note;
+	struct pt_regs regs;
+	char *bufp;
+	int i;
+
+	size_per_thread = oc_conf->cpu_state_entry_size;
+	bufp = __va(oc_conf->cpu_state_destination_vaddr);
+
+	/*
+	 * Offset for register entries, entry size and registers count is
+	 * duplicated in every thread header in keeping with HDAT format.
+	 * Use these values from the first thread header.
+	 */
+	thdr = (struct hdat_fadump_thread_hdr *)bufp;
+	regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) +
+		       be32_to_cpu(thdr->offset));
+	reg_esize = be32_to_cpu(thdr->esize);
+	regs_cnt  = be32_to_cpu(thdr->ecnt);
+
+	pr_debug("--------CPU State Data------------\n");
+	pr_debug("NumCpus     : %u\n", oc_conf->num_cpus);
+	pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n",
+		 regs_offset, reg_esize, regs_cnt);
+
+	/*
+	 * Skip past the first CPU note. Fill this note with the
+	 * crashing CPU's prstatus.
+	 */
+	first_cpu_note = buf;
+	buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS,
+				&prstatus, sizeof(prstatus));
+
+	for (i = 0; i < oc_conf->num_cpus; i++, bufp += size_per_thread) {
+		thdr = (struct hdat_fadump_thread_hdr *)bufp;
+		thread_pir = be32_to_cpu(thdr->pir);
+
+		pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n",
+			 i, thread_pir, thdr->core_state);
+
+		/*
+		 * Register state data of MAX cores is provided by firmware,
+		 * but some of this cores may not be active. So, while
+		 * processing register state data, check core state and
+		 * skip threads that belong to inactive cores.
+		 */
+		if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE)
+			continue;
+
+		opal_fadump_read_regs((bufp + regs_offset), regs_cnt,
+				      reg_esize, false, &regs);
+
+		pr_debug("PIR 0x%x - R1 : 0x%llx, NIP : 0x%llx\n", thread_pir,
+			 be64_to_cpu(regs.gpr[1]), be64_to_cpu(regs.nip));
+		fill_prstatus(&prstatus, thread_pir, &regs);
+
+		if (thread_pir != oc_conf->crashing_cpu) {
+			buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME,
+						NT_PRSTATUS, &prstatus,
+						sizeof(prstatus));
+		} else {
+			/*
+			 * Add crashing CPU as the first NT_PRSTATUS note for
+			 * GDB to process the core file appropriately.
+			 */
+			append_elf64_note(first_cpu_note, CRASH_CORE_NOTE_NAME,
+					  NT_PRSTATUS, &prstatus,
+					  sizeof(prstatus));
+		}
+	}
+
+	return buf;
+}
+
+static int __init create_opalcore(void)
+{
+	u64 opal_boot_entry, opal_base_addr, paddr;
+	u32 hdr_size, cpu_notes_size, count;
+	struct device_node *dn;
+	struct opalcore *new;
+	loff_t opalcore_off;
+	struct page *page;
+	Elf64_Phdr *phdr;
+	Elf64_Ehdr *elf;
+	int i, ret;
+	char *bufp;
+
+	/* Get size of header & CPU notes for OPAL core */
+	hdr_size = (sizeof(Elf64_Ehdr) +
+		    ((oc_conf->ptload_cnt + 1) * sizeof(Elf64_Phdr)));
+	cpu_notes_size = ((oc_conf->num_cpus * (CRASH_CORE_NOTE_HEAD_BYTES +
+			  CRASH_CORE_NOTE_NAME_BYTES +
+			  CRASH_CORE_NOTE_DESC_BYTES)) +
+			  (CRASH_CORE_NOTE_HEAD_BYTES +
+			  CRASH_CORE_NOTE_NAME_BYTES + AUXV_DESC_SZ));
+
+	/* Allocate buffer to setup OPAL core */
+	oc_conf->opalcorebuf_sz = PAGE_ALIGN(hdr_size + cpu_notes_size);
+	oc_conf->opalcorebuf = alloc_pages_exact(oc_conf->opalcorebuf_sz,
+						 GFP_KERNEL | __GFP_ZERO);
+	if (!oc_conf->opalcorebuf) {
+		pr_err("Not enough memory to setup OPAL core (size: %lu)\n",
+		       oc_conf->opalcorebuf_sz);
+		oc_conf->opalcorebuf_sz = 0;
+		return -ENOMEM;
+	}
+	count = oc_conf->opalcorebuf_sz / PAGE_SIZE;
+	page = virt_to_page(oc_conf->opalcorebuf);
+	for (i = 0; i < count; i++)
+		mark_page_reserved(page + i);
+
+	pr_debug("opalcorebuf = 0x%llx\n", (u64)oc_conf->opalcorebuf);
+
+	/* Read OPAL related device-tree entries */
+	dn = of_find_node_by_name(NULL, "ibm,opal");
+	if (dn) {
+		ret = of_property_read_u64(dn, "opal-base-address",
+					   &opal_base_addr);
+		pr_debug("opal-base-address: %llx\n", opal_base_addr);
+		ret |= of_property_read_u64(dn, "opal-boot-address",
+					    &opal_boot_entry);
+		pr_debug("opal-boot-address: %llx\n", opal_boot_entry);
+	}
+	if (!dn || ret)
+		pr_warn("WARNING: Failed to read OPAL base & entry values\n");
+
+	/* Use count to keep track of the program headers */
+	count = 0;
+
+	bufp = oc_conf->opalcorebuf;
+	elf = (Elf64_Ehdr *)bufp;
+	bufp += sizeof(Elf64_Ehdr);
+	memcpy(elf->e_ident, ELFMAG, SELFMAG);
+	elf->e_ident[EI_CLASS] = ELF_CLASS;
+	elf->e_ident[EI_DATA] = ELFDATA2MSB;
+	elf->e_ident[EI_VERSION] = EV_CURRENT;
+	elf->e_ident[EI_OSABI] = ELF_OSABI;
+	memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
+	elf->e_type = cpu_to_be16(ET_CORE);
+	elf->e_machine = cpu_to_be16(ELF_ARCH);
+	elf->e_version = cpu_to_be32(EV_CURRENT);
+	elf->e_entry = 0;
+	elf->e_phoff = cpu_to_be64(sizeof(Elf64_Ehdr));
+	elf->e_shoff = 0;
+	elf->e_flags = 0;
+
+	elf->e_ehsize = cpu_to_be16(sizeof(Elf64_Ehdr));
+	elf->e_phentsize = cpu_to_be16(sizeof(Elf64_Phdr));
+	elf->e_phnum = 0;
+	elf->e_shentsize = 0;
+	elf->e_shnum = 0;
+	elf->e_shstrndx = 0;
+
+	phdr = (Elf64_Phdr *)bufp;
+	bufp += sizeof(Elf64_Phdr);
+	phdr->p_type	= cpu_to_be32(PT_NOTE);
+	phdr->p_flags	= 0;
+	phdr->p_align	= 0;
+	phdr->p_paddr	= phdr->p_vaddr = 0;
+	phdr->p_offset	= cpu_to_be64(hdr_size);
+	phdr->p_filesz	= phdr->p_memsz = cpu_to_be64(cpu_notes_size);
+	count++;
+
+	opalcore_off = oc_conf->opalcorebuf_sz;
+	oc_conf->ptload_phdr  = (Elf64_Phdr *)bufp;
+	paddr = 0;
+	for (i = 0; i < oc_conf->ptload_cnt; i++) {
+		phdr = (Elf64_Phdr *)bufp;
+		bufp += sizeof(Elf64_Phdr);
+		phdr->p_type	= cpu_to_be32(PT_LOAD);
+		phdr->p_flags	= cpu_to_be32(PF_R|PF_W|PF_X);
+		phdr->p_align	= 0;
+
+		new = get_new_element();
+		if (!new)
+			return -ENOMEM;
+		new->paddr  = oc_conf->ptload_addr[i];
+		new->size   = oc_conf->ptload_size[i];
+		new->offset = opalcore_off;
+		list_add_tail(&new->list, &opalcore_list);
+
+		phdr->p_paddr	= cpu_to_be64(paddr);
+		phdr->p_vaddr	= cpu_to_be64(opal_base_addr + paddr);
+		phdr->p_filesz	= phdr->p_memsz  =
+			cpu_to_be64(oc_conf->ptload_size[i]);
+		phdr->p_offset	= cpu_to_be64(opalcore_off);
+
+		count++;
+		opalcore_off += oc_conf->ptload_size[i];
+		paddr += oc_conf->ptload_size[i];
+	}
+
+	elf->e_phnum = cpu_to_be16(count);
+
+	bufp = (char *)opalcore_append_cpu_notes((Elf64_Word *)bufp);
+	bufp = (char *)auxv_to_elf64_notes((Elf64_Word *)bufp, opal_boot_entry);
+
+	oc_conf->opalcore_size = opalcore_off;
+	return 0;
+}
+
+static void opalcore_cleanup(void)
+{
+	if (oc_conf == NULL)
+		return;
+
+	/* Remove OPAL core sysfs file */
+	sysfs_remove_bin_file(mpipl_kobj, &opal_core_attr);
+	oc_conf->ptload_phdr = NULL;
+	oc_conf->ptload_cnt = 0;
+
+	/* free the buffer used for setting up OPAL core */
+	if (oc_conf->opalcorebuf) {
+		void *end = (void *)((u64)oc_conf->opalcorebuf +
+				     oc_conf->opalcorebuf_sz);
+
+		free_reserved_area(oc_conf->opalcorebuf, end, -1, NULL);
+		oc_conf->opalcorebuf = NULL;
+		oc_conf->opalcorebuf_sz = 0;
+	}
+
+	kfree(oc_conf);
+	oc_conf = NULL;
+}
+__exitcall(opalcore_cleanup);
+
+static void __init opalcore_config_init(void)
+{
+	u32 idx, cpu_data_version;
+	struct device_node *np;
+	const __be32 *prop;
+	u64 addr = 0;
+	int i, ret;
+
+	np = of_find_node_by_path("/ibm,opal/dump");
+	if (np == NULL)
+		return;
+
+	if (!of_device_is_compatible(np, "ibm,opal-dump")) {
+		pr_warn("Support missing for this f/w version!\n");
+		return;
+	}
+
+	/* Check if dump has been initiated on last reboot */
+	prop = of_get_property(np, "mpipl-boot", NULL);
+	if (!prop) {
+		of_node_put(np);
+		return;
+	}
+
+	/* Get OPAL metadata */
+	ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_OPAL, &addr);
+	if ((ret != OPAL_SUCCESS) || !addr) {
+		pr_err("Failed to get OPAL metadata (%d)\n", ret);
+		goto error_out;
+	}
+
+	addr = be64_to_cpu(addr);
+	pr_debug("OPAL metadata addr: %llx\n", addr);
+	opalc_metadata = __va(addr);
+
+	/* Get OPAL CPU metadata */
+	ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr);
+	if ((ret != OPAL_SUCCESS) || !addr) {
+		pr_err("Failed to get OPAL CPU metadata (%d)\n", ret);
+		goto error_out;
+	}
+
+	addr = be64_to_cpu(addr);
+	pr_debug("CPU metadata addr: %llx\n", addr);
+	opalc_cpu_metadata = __va(addr);
+
+	/* Allocate memory for config buffer */
+	oc_conf = kzalloc(sizeof(struct opalcore_config), GFP_KERNEL);
+	if (oc_conf == NULL)
+		goto error_out;
+
+	/* Parse OPAL metadata */
+	if (opalc_metadata->version != OPAL_MPIPL_VERSION) {
+		pr_warn("Supported OPAL metadata version: %u, found: %u!\n",
+			OPAL_MPIPL_VERSION, opalc_metadata->version);
+		pr_warn("WARNING: F/W using newer OPAL metadata format!!\n");
+	}
+
+	oc_conf->ptload_cnt = 0;
+	idx = be32_to_cpu(opalc_metadata->region_cnt);
+	if (idx > MAX_PT_LOAD_CNT) {
+		pr_warn("WARNING: OPAL regions count (%d) adjusted to limit (%d)",
+			idx, MAX_PT_LOAD_CNT);
+		idx = MAX_PT_LOAD_CNT;
+	}
+	for (i = 0; i < idx; i++) {
+		oc_conf->ptload_addr[oc_conf->ptload_cnt] =
+				be64_to_cpu(opalc_metadata->region[i].dest);
+		oc_conf->ptload_size[oc_conf->ptload_cnt++] =
+				be64_to_cpu(opalc_metadata->region[i].size);
+	}
+	oc_conf->ptload_cnt = i;
+	oc_conf->crashing_cpu = be32_to_cpu(opalc_metadata->crashing_pir);
+
+	if (!oc_conf->ptload_cnt) {
+		pr_err("OPAL memory regions not found\n");
+		goto error_out;
+	}
+
+	/* Parse OPAL CPU metadata */
+	cpu_data_version = be32_to_cpu(opalc_cpu_metadata->cpu_data_version);
+	if (cpu_data_version != HDAT_FADUMP_CPU_DATA_VER) {
+		pr_warn("Supported CPU data version: %u, found: %u!\n",
+			HDAT_FADUMP_CPU_DATA_VER, cpu_data_version);
+		pr_warn("WARNING: F/W using newer CPU state data format!!\n");
+	}
+
+	addr = be64_to_cpu(opalc_cpu_metadata->region[0].dest);
+	if (!addr) {
+		pr_err("CPU state data not found!\n");
+		goto error_out;
+	}
+	oc_conf->cpu_state_destination_vaddr = (u64)__va(addr);
+
+	oc_conf->cpu_state_data_size =
+			be64_to_cpu(opalc_cpu_metadata->region[0].size);
+	oc_conf->cpu_state_entry_size =
+			be32_to_cpu(opalc_cpu_metadata->cpu_data_size);
+
+	if ((oc_conf->cpu_state_entry_size == 0) ||
+	    (oc_conf->cpu_state_entry_size > oc_conf->cpu_state_data_size)) {
+		pr_err("CPU state data is invalid.\n");
+		goto error_out;
+	}
+	oc_conf->num_cpus = (oc_conf->cpu_state_data_size /
+			     oc_conf->cpu_state_entry_size);
+
+	of_node_put(np);
+	return;
+
+error_out:
+	pr_err("Could not export /sys/firmware/opal/core\n");
+	opalcore_cleanup();
+	of_node_put(np);
+}
+
+static ssize_t release_core_store(struct kobject *kobj,
+				  struct kobj_attribute *attr,
+				  const char *buf, size_t count)
+{
+	int input = -1;
+
+	if (kstrtoint(buf, 0, &input))
+		return -EINVAL;
+
+	if (input == 1) {
+		if (oc_conf == NULL) {
+			pr_err("'/sys/firmware/opal/core' file not accessible!\n");
+			return -EPERM;
+		}
+
+		/*
+		 * Take away '/sys/firmware/opal/core' and release all memory
+		 * used for exporting this file.
+		 */
+		opalcore_cleanup();
+	} else
+		return -EINVAL;
+
+	return count;
+}
+
+static struct kobj_attribute opalcore_rel_attr = __ATTR_WO(release_core);
+
+static struct attribute *mpipl_attr[] = {
+	&opalcore_rel_attr.attr,
+	NULL,
+};
+
+static struct bin_attribute *mpipl_bin_attr[] = {
+	&opal_core_attr,
+	NULL,
+
+};
+
+static struct attribute_group mpipl_group = {
+	.attrs = mpipl_attr,
+	.bin_attrs =  mpipl_bin_attr,
+};
+
+static int __init opalcore_init(void)
+{
+	int rc = -1;
+
+	opalcore_config_init();
+
+	if (oc_conf == NULL)
+		return rc;
+
+	create_opalcore();
+
+	/*
+	 * If oc_conf->opalcorebuf= is set in the 2nd kernel,
+	 * then capture the dump.
+	 */
+	if (!(is_opalcore_usable())) {
+		pr_err("Failed to export /sys/firmware/opal/mpipl/core\n");
+		opalcore_cleanup();
+		return rc;
+	}
+
+	/* Set OPAL core file size */
+	opal_core_attr.size = oc_conf->opalcore_size;
+
+	mpipl_kobj = kobject_create_and_add("mpipl", opal_kobj);
+	if (!mpipl_kobj) {
+		pr_err("unable to create mpipl kobject\n");
+		return -ENOMEM;
+	}
+
+	/* Export OPAL core sysfs file */
+	rc = sysfs_create_group(mpipl_kobj, &mpipl_group);
+	if (rc) {
+		pr_err("mpipl sysfs group creation failed (%d)", rc);
+		opalcore_cleanup();
+		return rc;
+	}
+	/* The /sys/firmware/opal/core is moved to /sys/firmware/opal/mpipl/
+	 * directory, need to create symlink at old location to maintain
+	 * backward compatibility.
+	 */
+	rc = compat_only_sysfs_link_entry_to_kobj(opal_kobj, mpipl_kobj,
+						  "core", NULL);
+	if (rc) {
+		pr_err("unable to create core symlink (%d)\n", rc);
+		return rc;
+	}
+
+	return 0;
+}
+fs_initcall(opalcore_init);
diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c
new file mode 100644
index 000000000..00c5a59d8
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-dump.c
@@ -0,0 +1,458 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerNV OPAL Dump Interface
+ *
+ * Copyright 2013,2014 IBM Corp.
+ */
+
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+
+#include <asm/opal.h>
+
+#define DUMP_TYPE_FSP	0x01
+
+struct dump_obj {
+	struct kobject  kobj;
+	struct bin_attribute dump_attr;
+	uint32_t	id;  /* becomes object name */
+	uint32_t	type;
+	uint32_t	size;
+	char		*buffer;
+};
+#define to_dump_obj(x) container_of(x, struct dump_obj, kobj)
+
+struct dump_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct dump_obj *dump, struct dump_attribute *attr,
+			char *buf);
+	ssize_t (*store)(struct dump_obj *dump, struct dump_attribute *attr,
+			 const char *buf, size_t count);
+};
+#define to_dump_attr(x) container_of(x, struct dump_attribute, attr)
+
+static ssize_t dump_id_show(struct dump_obj *dump_obj,
+			    struct dump_attribute *attr,
+			    char *buf)
+{
+	return sprintf(buf, "0x%x\n", dump_obj->id);
+}
+
+static const char* dump_type_to_string(uint32_t type)
+{
+	switch (type) {
+	case 0x01: return "SP Dump";
+	case 0x02: return "System/Platform Dump";
+	case 0x03: return "SMA Dump";
+	default: return "unknown";
+	}
+}
+
+static ssize_t dump_type_show(struct dump_obj *dump_obj,
+			      struct dump_attribute *attr,
+			      char *buf)
+{
+
+	return sprintf(buf, "0x%x %s\n", dump_obj->type,
+		       dump_type_to_string(dump_obj->type));
+}
+
+static ssize_t dump_ack_show(struct dump_obj *dump_obj,
+			     struct dump_attribute *attr,
+			     char *buf)
+{
+	return sprintf(buf, "ack - acknowledge dump\n");
+}
+
+/*
+ * Send acknowledgement to OPAL
+ */
+static int64_t dump_send_ack(uint32_t dump_id)
+{
+	int rc;
+
+	rc = opal_dump_ack(dump_id);
+	if (rc)
+		pr_warn("%s: Failed to send ack to Dump ID 0x%x (%d)\n",
+			__func__, dump_id, rc);
+	return rc;
+}
+
+static ssize_t dump_ack_store(struct dump_obj *dump_obj,
+			      struct dump_attribute *attr,
+			      const char *buf,
+			      size_t count)
+{
+	/*
+	 * Try to self remove this attribute. If we are successful,
+	 * delete the kobject itself.
+	 */
+	if (sysfs_remove_file_self(&dump_obj->kobj, &attr->attr)) {
+		dump_send_ack(dump_obj->id);
+		kobject_put(&dump_obj->kobj);
+	}
+	return count;
+}
+
+/* Attributes of a dump
+ * The binary attribute of the dump itself is dynamic
+ * due to the dynamic size of the dump
+ */
+static struct dump_attribute id_attribute =
+	__ATTR(id, 0444, dump_id_show, NULL);
+static struct dump_attribute type_attribute =
+	__ATTR(type, 0444, dump_type_show, NULL);
+static struct dump_attribute ack_attribute =
+	__ATTR(acknowledge, 0660, dump_ack_show, dump_ack_store);
+
+static ssize_t init_dump_show(struct dump_obj *dump_obj,
+			      struct dump_attribute *attr,
+			      char *buf)
+{
+	return sprintf(buf, "1 - initiate Service Processor(FSP) dump\n");
+}
+
+static int64_t dump_fips_init(uint8_t type)
+{
+	int rc;
+
+	rc = opal_dump_init(type);
+	if (rc)
+		pr_warn("%s: Failed to initiate FSP dump (%d)\n",
+			__func__, rc);
+	return rc;
+}
+
+static ssize_t init_dump_store(struct dump_obj *dump_obj,
+			       struct dump_attribute *attr,
+			       const char *buf,
+			       size_t count)
+{
+	int rc;
+
+	rc = dump_fips_init(DUMP_TYPE_FSP);
+	if (rc == OPAL_SUCCESS)
+		pr_info("%s: Initiated FSP dump\n", __func__);
+
+	return count;
+}
+
+static struct dump_attribute initiate_attribute =
+	__ATTR(initiate_dump, 0600, init_dump_show, init_dump_store);
+
+static struct attribute *initiate_attrs[] = {
+	&initiate_attribute.attr,
+	NULL,
+};
+
+static struct attribute_group initiate_attr_group = {
+	.attrs = initiate_attrs,
+};
+
+static struct kset *dump_kset;
+
+static ssize_t dump_attr_show(struct kobject *kobj,
+			      struct attribute *attr,
+			      char *buf)
+{
+	struct dump_attribute *attribute;
+	struct dump_obj *dump;
+
+	attribute = to_dump_attr(attr);
+	dump = to_dump_obj(kobj);
+
+	if (!attribute->show)
+		return -EIO;
+
+	return attribute->show(dump, attribute, buf);
+}
+
+static ssize_t dump_attr_store(struct kobject *kobj,
+			       struct attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct dump_attribute *attribute;
+	struct dump_obj *dump;
+
+	attribute = to_dump_attr(attr);
+	dump = to_dump_obj(kobj);
+
+	if (!attribute->store)
+		return -EIO;
+
+	return attribute->store(dump, attribute, buf, len);
+}
+
+static const struct sysfs_ops dump_sysfs_ops = {
+	.show = dump_attr_show,
+	.store = dump_attr_store,
+};
+
+static void dump_release(struct kobject *kobj)
+{
+	struct dump_obj *dump;
+
+	dump = to_dump_obj(kobj);
+	vfree(dump->buffer);
+	kfree(dump);
+}
+
+static struct attribute *dump_default_attrs[] = {
+	&id_attribute.attr,
+	&type_attribute.attr,
+	&ack_attribute.attr,
+	NULL,
+};
+
+static struct kobj_type dump_ktype = {
+	.sysfs_ops = &dump_sysfs_ops,
+	.release = &dump_release,
+	.default_attrs = dump_default_attrs,
+};
+
+static int64_t dump_read_info(uint32_t *dump_id, uint32_t *dump_size, uint32_t *dump_type)
+{
+	__be32 id, size, type;
+	int rc;
+
+	type = cpu_to_be32(0xffffffff);
+
+	rc = opal_dump_info2(&id, &size, &type);
+	if (rc == OPAL_PARAMETER)
+		rc = opal_dump_info(&id, &size);
+
+	if (rc) {
+		pr_warn("%s: Failed to get dump info (%d)\n",
+			__func__, rc);
+		return rc;
+	}
+
+	*dump_id = be32_to_cpu(id);
+	*dump_size = be32_to_cpu(size);
+	*dump_type = be32_to_cpu(type);
+
+	return rc;
+}
+
+static int64_t dump_read_data(struct dump_obj *dump)
+{
+	struct opal_sg_list *list;
+	uint64_t addr;
+	int64_t rc;
+
+	/* Allocate memory */
+	dump->buffer = vzalloc(PAGE_ALIGN(dump->size));
+	if (!dump->buffer) {
+		pr_err("%s : Failed to allocate memory\n", __func__);
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* Generate SG list */
+	list = opal_vmalloc_to_sg_list(dump->buffer, dump->size);
+	if (!list) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* First entry address */
+	addr = __pa(list);
+
+	/* Fetch data */
+	rc = OPAL_BUSY_EVENT;
+	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+		rc = opal_dump_read(dump->id, addr);
+		if (rc == OPAL_BUSY_EVENT) {
+			opal_poll_events(NULL);
+			msleep(20);
+		}
+	}
+
+	if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL)
+		pr_warn("%s: Extract dump failed for ID 0x%x\n",
+			__func__, dump->id);
+
+	/* Free SG list */
+	opal_free_sg_list(list);
+
+out:
+	return rc;
+}
+
+static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj,
+			      struct bin_attribute *bin_attr,
+			      char *buffer, loff_t pos, size_t count)
+{
+	ssize_t rc;
+
+	struct dump_obj *dump = to_dump_obj(kobj);
+
+	if (!dump->buffer) {
+		rc = dump_read_data(dump);
+
+		if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL) {
+			vfree(dump->buffer);
+			dump->buffer = NULL;
+
+			return -EIO;
+		}
+		if (rc == OPAL_PARTIAL) {
+			/* On a partial read, we just return EIO
+			 * and rely on userspace to ask us to try
+			 * again.
+			 */
+			pr_info("%s: Platform dump partially read. ID = 0x%x\n",
+				__func__, dump->id);
+			return -EIO;
+		}
+	}
+
+	memcpy(buffer, dump->buffer + pos, count);
+
+	/* You may think we could free the dump buffer now and retrieve
+	 * it again later if needed, but due to current firmware limitation,
+	 * that's not the case. So, once read into userspace once,
+	 * we keep the dump around until it's acknowledged by userspace.
+	 */
+
+	return count;
+}
+
+static void create_dump_obj(uint32_t id, size_t size, uint32_t type)
+{
+	struct dump_obj *dump;
+	int rc;
+
+	dump = kzalloc(sizeof(*dump), GFP_KERNEL);
+	if (!dump)
+		return;
+
+	dump->kobj.kset = dump_kset;
+
+	kobject_init(&dump->kobj, &dump_ktype);
+
+	sysfs_bin_attr_init(&dump->dump_attr);
+
+	dump->dump_attr.attr.name = "dump";
+	dump->dump_attr.attr.mode = 0400;
+	dump->dump_attr.size = size;
+	dump->dump_attr.read = dump_attr_read;
+
+	dump->id = id;
+	dump->size = size;
+	dump->type = type;
+
+	rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id);
+	if (rc) {
+		kobject_put(&dump->kobj);
+		return;
+	}
+
+	/*
+	 * As soon as the sysfs file for this dump is created/activated there is
+	 * a chance the opal_errd daemon (or any userspace) might read and
+	 * acknowledge the dump before kobject_uevent() is called. If that
+	 * happens then there is a potential race between
+	 * dump_ack_store->kobject_put() and kobject_uevent() which leads to a
+	 * use-after-free of a kernfs object resulting in a kernel crash.
+	 *
+	 * To avoid that, we need to take a reference on behalf of the bin file,
+	 * so that our reference remains valid while we call kobject_uevent().
+	 * We then drop our reference before exiting the function, leaving the
+	 * bin file to drop the last reference (if it hasn't already).
+	 */
+
+	/* Take a reference for the bin file */
+	kobject_get(&dump->kobj);
+	rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr);
+	if (rc == 0) {
+		kobject_uevent(&dump->kobj, KOBJ_ADD);
+
+		pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
+			__func__, dump->id, dump->size);
+	} else {
+		/* Drop reference count taken for bin file */
+		kobject_put(&dump->kobj);
+	}
+
+	/* Drop our reference */
+	kobject_put(&dump->kobj);
+	return;
+}
+
+static irqreturn_t process_dump(int irq, void *data)
+{
+	int rc;
+	uint32_t dump_id, dump_size, dump_type;
+	char name[22];
+	struct kobject *kobj;
+
+	rc = dump_read_info(&dump_id, &dump_size, &dump_type);
+	if (rc != OPAL_SUCCESS)
+		return IRQ_HANDLED;
+
+	sprintf(name, "0x%x-0x%x", dump_type, dump_id);
+
+	/* we may get notified twice, let's handle
+	 * that gracefully and not create two conflicting
+	 * entries.
+	 */
+	kobj = kset_find_obj(dump_kset, name);
+	if (kobj) {
+		/* Drop reference added by kset_find_obj() */
+		kobject_put(kobj);
+		return IRQ_HANDLED;
+	}
+
+	create_dump_obj(dump_id, dump_size, dump_type);
+
+	return IRQ_HANDLED;
+}
+
+void __init opal_platform_dump_init(void)
+{
+	int rc;
+	int dump_irq;
+
+	/* ELOG not supported by firmware */
+	if (!opal_check_token(OPAL_DUMP_READ))
+		return;
+
+	dump_kset = kset_create_and_add("dump", NULL, opal_kobj);
+	if (!dump_kset) {
+		pr_warn("%s: Failed to create dump kset\n", __func__);
+		return;
+	}
+
+	rc = sysfs_create_group(&dump_kset->kobj, &initiate_attr_group);
+	if (rc) {
+		pr_warn("%s: Failed to create initiate dump attr group\n",
+			__func__);
+		kobject_put(&dump_kset->kobj);
+		return;
+	}
+
+	dump_irq = opal_event_request(ilog2(OPAL_EVENT_DUMP_AVAIL));
+	if (!dump_irq) {
+		pr_err("%s: Can't register OPAL event irq (%d)\n",
+		       __func__, dump_irq);
+		return;
+	}
+
+	rc = request_threaded_irq(dump_irq, NULL, process_dump,
+				IRQF_TRIGGER_HIGH | IRQF_ONESHOT,
+				"opal-dump", NULL);
+	if (rc) {
+		pr_err("%s: Can't request OPAL event irq (%d)\n",
+		       __func__, rc);
+		return;
+	}
+
+	if (opal_check_token(OPAL_DUMP_RESEND))
+		opal_dump_resend_notification();
+}
diff --git a/arch/powerpc/platforms/powernv/opal-elog.c b/arch/powerpc/platforms/powernv/opal-elog.c
new file mode 100644
index 000000000..37b380eef
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-elog.c
@@ -0,0 +1,339 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Error log support on PowerNV.
+ *
+ * Copyright 2013,2014 IBM Corp.
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+#include <linux/fcntl.h>
+#include <linux/kobject.h>
+#include <linux/uaccess.h>
+#include <asm/opal.h>
+
+struct elog_obj {
+	struct kobject kobj;
+	struct bin_attribute raw_attr;
+	uint64_t id;
+	uint64_t type;
+	size_t size;
+	char *buffer;
+};
+#define to_elog_obj(x) container_of(x, struct elog_obj, kobj)
+
+struct elog_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct elog_obj *elog, struct elog_attribute *attr,
+			char *buf);
+	ssize_t (*store)(struct elog_obj *elog, struct elog_attribute *attr,
+			 const char *buf, size_t count);
+};
+#define to_elog_attr(x) container_of(x, struct elog_attribute, attr)
+
+static ssize_t elog_id_show(struct elog_obj *elog_obj,
+			    struct elog_attribute *attr,
+			    char *buf)
+{
+	return sprintf(buf, "0x%llx\n", elog_obj->id);
+}
+
+static const char *elog_type_to_string(uint64_t type)
+{
+	switch (type) {
+	case 0: return "PEL";
+	default: return "unknown";
+	}
+}
+
+static ssize_t elog_type_show(struct elog_obj *elog_obj,
+			      struct elog_attribute *attr,
+			      char *buf)
+{
+	return sprintf(buf, "0x%llx %s\n",
+		       elog_obj->type,
+		       elog_type_to_string(elog_obj->type));
+}
+
+static ssize_t elog_ack_show(struct elog_obj *elog_obj,
+			     struct elog_attribute *attr,
+			     char *buf)
+{
+	return sprintf(buf, "ack - acknowledge log message\n");
+}
+
+static ssize_t elog_ack_store(struct elog_obj *elog_obj,
+			      struct elog_attribute *attr,
+			      const char *buf,
+			      size_t count)
+{
+	/*
+	 * Try to self remove this attribute. If we are successful,
+	 * delete the kobject itself.
+	 */
+	if (sysfs_remove_file_self(&elog_obj->kobj, &attr->attr)) {
+		opal_send_ack_elog(elog_obj->id);
+		kobject_put(&elog_obj->kobj);
+	}
+	return count;
+}
+
+static struct elog_attribute id_attribute =
+	__ATTR(id, 0444, elog_id_show, NULL);
+static struct elog_attribute type_attribute =
+	__ATTR(type, 0444, elog_type_show, NULL);
+static struct elog_attribute ack_attribute =
+	__ATTR(acknowledge, 0660, elog_ack_show, elog_ack_store);
+
+static struct kset *elog_kset;
+
+static ssize_t elog_attr_show(struct kobject *kobj,
+			      struct attribute *attr,
+			      char *buf)
+{
+	struct elog_attribute *attribute;
+	struct elog_obj *elog;
+
+	attribute = to_elog_attr(attr);
+	elog = to_elog_obj(kobj);
+
+	if (!attribute->show)
+		return -EIO;
+
+	return attribute->show(elog, attribute, buf);
+}
+
+static ssize_t elog_attr_store(struct kobject *kobj,
+			       struct attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct elog_attribute *attribute;
+	struct elog_obj *elog;
+
+	attribute = to_elog_attr(attr);
+	elog = to_elog_obj(kobj);
+
+	if (!attribute->store)
+		return -EIO;
+
+	return attribute->store(elog, attribute, buf, len);
+}
+
+static const struct sysfs_ops elog_sysfs_ops = {
+	.show = elog_attr_show,
+	.store = elog_attr_store,
+};
+
+static void elog_release(struct kobject *kobj)
+{
+	struct elog_obj *elog;
+
+	elog = to_elog_obj(kobj);
+	kfree(elog->buffer);
+	kfree(elog);
+}
+
+static struct attribute *elog_default_attrs[] = {
+	&id_attribute.attr,
+	&type_attribute.attr,
+	&ack_attribute.attr,
+	NULL,
+};
+
+static struct kobj_type elog_ktype = {
+	.sysfs_ops = &elog_sysfs_ops,
+	.release = &elog_release,
+	.default_attrs = elog_default_attrs,
+};
+
+/* Maximum size of a single log on FSP is 16KB */
+#define OPAL_MAX_ERRLOG_SIZE	16384
+
+static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj,
+			     struct bin_attribute *bin_attr,
+			     char *buffer, loff_t pos, size_t count)
+{
+	int opal_rc;
+
+	struct elog_obj *elog = to_elog_obj(kobj);
+
+	/* We may have had an error reading before, so let's retry */
+	if (!elog->buffer) {
+		elog->buffer = kzalloc(elog->size, GFP_KERNEL);
+		if (!elog->buffer)
+			return -EIO;
+
+		opal_rc = opal_read_elog(__pa(elog->buffer),
+					 elog->size, elog->id);
+		if (opal_rc != OPAL_SUCCESS) {
+			pr_err("ELOG: log read failed for log-id=%llx\n",
+			       elog->id);
+			kfree(elog->buffer);
+			elog->buffer = NULL;
+			return -EIO;
+		}
+	}
+
+	memcpy(buffer, elog->buffer + pos, count);
+
+	return count;
+}
+
+static void create_elog_obj(uint64_t id, size_t size, uint64_t type)
+{
+	struct elog_obj *elog;
+	int rc;
+
+	elog = kzalloc(sizeof(*elog), GFP_KERNEL);
+	if (!elog)
+		return;
+
+	elog->kobj.kset = elog_kset;
+
+	kobject_init(&elog->kobj, &elog_ktype);
+
+	sysfs_bin_attr_init(&elog->raw_attr);
+
+	elog->raw_attr.attr.name = "raw";
+	elog->raw_attr.attr.mode = 0400;
+	elog->raw_attr.size = size;
+	elog->raw_attr.read = raw_attr_read;
+
+	elog->id = id;
+	elog->size = size;
+	elog->type = type;
+
+	elog->buffer = kzalloc(elog->size, GFP_KERNEL);
+
+	if (elog->buffer) {
+		rc = opal_read_elog(__pa(elog->buffer),
+					 elog->size, elog->id);
+		if (rc != OPAL_SUCCESS) {
+			pr_err("ELOG: log read failed for log-id=%llx\n",
+			       elog->id);
+			kfree(elog->buffer);
+			elog->buffer = NULL;
+		}
+	}
+
+	rc = kobject_add(&elog->kobj, NULL, "0x%llx", id);
+	if (rc) {
+		kobject_put(&elog->kobj);
+		return;
+	}
+
+	/*
+	 * As soon as the sysfs file for this elog is created/activated there is
+	 * a chance the opal_errd daemon (or any userspace) might read and
+	 * acknowledge the elog before kobject_uevent() is called. If that
+	 * happens then there is a potential race between
+	 * elog_ack_store->kobject_put() and kobject_uevent() which leads to a
+	 * use-after-free of a kernfs object resulting in a kernel crash.
+	 *
+	 * To avoid that, we need to take a reference on behalf of the bin file,
+	 * so that our reference remains valid while we call kobject_uevent().
+	 * We then drop our reference before exiting the function, leaving the
+	 * bin file to drop the last reference (if it hasn't already).
+	 */
+
+	/* Take a reference for the bin file */
+	kobject_get(&elog->kobj);
+	rc = sysfs_create_bin_file(&elog->kobj, &elog->raw_attr);
+	if (rc == 0) {
+		kobject_uevent(&elog->kobj, KOBJ_ADD);
+	} else {
+		/* Drop the reference taken for the bin file */
+		kobject_put(&elog->kobj);
+	}
+
+	/* Drop our reference */
+	kobject_put(&elog->kobj);
+
+	return;
+}
+
+static irqreturn_t elog_event(int irq, void *data)
+{
+	__be64 size;
+	__be64 id;
+	__be64 type;
+	uint64_t elog_size;
+	uint64_t log_id;
+	uint64_t elog_type;
+	int rc;
+	char name[2+16+1];
+	struct kobject *kobj;
+
+	rc = opal_get_elog_size(&id, &size, &type);
+	if (rc != OPAL_SUCCESS) {
+		pr_err("ELOG: OPAL log info read failed\n");
+		return IRQ_HANDLED;
+	}
+
+	elog_size = be64_to_cpu(size);
+	log_id = be64_to_cpu(id);
+	elog_type = be64_to_cpu(type);
+
+	WARN_ON(elog_size > OPAL_MAX_ERRLOG_SIZE);
+
+	if (elog_size >= OPAL_MAX_ERRLOG_SIZE)
+		elog_size  =  OPAL_MAX_ERRLOG_SIZE;
+
+	sprintf(name, "0x%llx", log_id);
+
+	/* we may get notified twice, let's handle
+	 * that gracefully and not create two conflicting
+	 * entries.
+	 */
+	kobj = kset_find_obj(elog_kset, name);
+	if (kobj) {
+		/* Drop reference added by kset_find_obj() */
+		kobject_put(kobj);
+		return IRQ_HANDLED;
+	}
+
+	create_elog_obj(log_id, elog_size, elog_type);
+
+	return IRQ_HANDLED;
+}
+
+int __init opal_elog_init(void)
+{
+	int rc = 0, irq;
+
+	/* ELOG not supported by firmware */
+	if (!opal_check_token(OPAL_ELOG_READ))
+		return -1;
+
+	elog_kset = kset_create_and_add("elog", NULL, opal_kobj);
+	if (!elog_kset) {
+		pr_warn("%s: failed to create elog kset\n", __func__);
+		return -1;
+	}
+
+	irq = opal_event_request(ilog2(OPAL_EVENT_ERROR_LOG_AVAIL));
+	if (!irq) {
+		pr_err("%s: Can't register OPAL event irq (%d)\n",
+		       __func__, irq);
+		return irq;
+	}
+
+	rc = request_threaded_irq(irq, NULL, elog_event,
+			IRQF_TRIGGER_HIGH | IRQF_ONESHOT, "opal-elog", NULL);
+	if (rc) {
+		pr_err("%s: Can't request OPAL event irq (%d)\n",
+		       __func__, rc);
+		return rc;
+	}
+
+	/* We are now ready to pull error logs from opal. */
+	if (opal_check_token(OPAL_ELOG_RESEND))
+		opal_resend_pending_logs();
+
+	return 0;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
new file mode 100644
index 000000000..e23a51a05
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -0,0 +1,726 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Firmware-Assisted Dump support on POWER platform (OPAL).
+ *
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "opal fadump: " fmt
+
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
+#include <linux/mm.h>
+#include <linux/crash_dump.h>
+
+#include <asm/page.h>
+#include <asm/opal.h>
+#include <asm/fadump-internal.h>
+
+#include "opal-fadump.h"
+
+
+#ifdef CONFIG_PRESERVE_FA_DUMP
+/*
+ * When dump is active but PRESERVE_FA_DUMP is enabled on the kernel,
+ * ensure crash data is preserved in hope that the subsequent memory
+ * preserving kernel boot is going to process this crash data.
+ */
+void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
+{
+	const struct opal_fadump_mem_struct *opal_fdm_active;
+	const __be32 *prop;
+	unsigned long dn;
+	u64 addr = 0;
+	s64 ret;
+
+	dn = of_get_flat_dt_subnode_by_name(node, "dump");
+	if (dn == -FDT_ERR_NOTFOUND)
+		return;
+
+	/*
+	 * Check if dump has been initiated on last reboot.
+	 */
+	prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL);
+	if (!prop)
+		return;
+
+	ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr);
+	if ((ret != OPAL_SUCCESS) || !addr) {
+		pr_debug("Could not get Kernel metadata (%lld)\n", ret);
+		return;
+	}
+
+	/*
+	 * Preserve memory only if kernel memory regions are registered
+	 * with f/w for MPIPL.
+	 */
+	addr = be64_to_cpu(addr);
+	pr_debug("Kernel metadata addr: %llx\n", addr);
+	opal_fdm_active = (void *)addr;
+	if (be16_to_cpu(opal_fdm_active->registered_regions) == 0)
+		return;
+
+	ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_BOOT_MEM, &addr);
+	if ((ret != OPAL_SUCCESS) || !addr) {
+		pr_err("Failed to get boot memory tag (%lld)\n", ret);
+		return;
+	}
+
+	/*
+	 * Memory below this address can be used for booting a
+	 * capture kernel or petitboot kernel. Preserve everything
+	 * above this address for processing crashdump.
+	 */
+	fadump_conf->boot_mem_top = be64_to_cpu(addr);
+	pr_debug("Preserve everything above %llx\n", fadump_conf->boot_mem_top);
+
+	pr_info("Firmware-assisted dump is active.\n");
+	fadump_conf->dump_active = 1;
+}
+
+#else /* CONFIG_PRESERVE_FA_DUMP */
+static const struct opal_fadump_mem_struct *opal_fdm_active;
+static const struct opal_mpipl_fadump *opal_cpu_metadata;
+static struct opal_fadump_mem_struct *opal_fdm;
+
+#ifdef CONFIG_OPAL_CORE
+extern bool kernel_initiated;
+#endif
+
+static int opal_fadump_unregister(struct fw_dump *fadump_conf);
+
+static void opal_fadump_update_config(struct fw_dump *fadump_conf,
+				      const struct opal_fadump_mem_struct *fdm)
+{
+	pr_debug("Boot memory regions count: %d\n", be16_to_cpu(fdm->region_cnt));
+
+	/*
+	 * The destination address of the first boot memory region is the
+	 * destination address of boot memory regions.
+	 */
+	fadump_conf->boot_mem_dest_addr = be64_to_cpu(fdm->rgn[0].dest);
+	pr_debug("Destination address of boot memory regions: %#016llx\n",
+		 fadump_conf->boot_mem_dest_addr);
+
+	fadump_conf->fadumphdr_addr = be64_to_cpu(fdm->fadumphdr_addr);
+}
+
+/*
+ * This function is called in the capture kernel to get configuration details
+ * from metadata setup by the first kernel.
+ */
+static void opal_fadump_get_config(struct fw_dump *fadump_conf,
+				   const struct opal_fadump_mem_struct *fdm)
+{
+	unsigned long base, size, last_end, hole_size;
+	int i;
+
+	if (!fadump_conf->dump_active)
+		return;
+
+	last_end = 0;
+	hole_size = 0;
+	fadump_conf->boot_memory_size = 0;
+
+	pr_debug("Boot memory regions:\n");
+	for (i = 0; i < be16_to_cpu(fdm->region_cnt); i++) {
+		base = be64_to_cpu(fdm->rgn[i].src);
+		size = be64_to_cpu(fdm->rgn[i].size);
+		pr_debug("\t[%03d] base: 0x%lx, size: 0x%lx\n", i, base, size);
+
+		fadump_conf->boot_mem_addr[i] = base;
+		fadump_conf->boot_mem_sz[i] = size;
+		fadump_conf->boot_memory_size += size;
+		hole_size += (base - last_end);
+
+		last_end = base + size;
+	}
+
+	/*
+	 * Start address of reserve dump area (permanent reservation) for
+	 * re-registering FADump after dump capture.
+	 */
+	fadump_conf->reserve_dump_area_start = be64_to_cpu(fdm->rgn[0].dest);
+
+	/*
+	 * Rarely, but it can so happen that system crashes before all
+	 * boot memory regions are registered for MPIPL. In such
+	 * cases, warn that the vmcore may not be accurate and proceed
+	 * anyway as that is the best bet considering free pages, cache
+	 * pages, user pages, etc are usually filtered out.
+	 *
+	 * Hope the memory that could not be preserved only has pages
+	 * that are usually filtered out while saving the vmcore.
+	 */
+	if (be16_to_cpu(fdm->region_cnt) > be16_to_cpu(fdm->registered_regions)) {
+		pr_warn("Not all memory regions were saved!!!\n");
+		pr_warn("  Unsaved memory regions:\n");
+		i = be16_to_cpu(fdm->registered_regions);
+		while (i < be16_to_cpu(fdm->region_cnt)) {
+			pr_warn("\t[%03d] base: 0x%llx, size: 0x%llx\n",
+				i, be64_to_cpu(fdm->rgn[i].src),
+				be64_to_cpu(fdm->rgn[i].size));
+			i++;
+		}
+
+		pr_warn("If the unsaved regions only contain pages that are filtered out (eg. free/user pages), the vmcore should still be usable.\n");
+		pr_warn("WARNING: If the unsaved regions contain kernel pages, the vmcore will be corrupted.\n");
+	}
+
+	fadump_conf->boot_mem_top = (fadump_conf->boot_memory_size + hole_size);
+	fadump_conf->boot_mem_regs_cnt = be16_to_cpu(fdm->region_cnt);
+	opal_fadump_update_config(fadump_conf, fdm);
+}
+
+/* Initialize kernel metadata */
+static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm)
+{
+	fdm->version = OPAL_FADUMP_VERSION;
+	fdm->region_cnt = cpu_to_be16(0);
+	fdm->registered_regions = cpu_to_be16(0);
+	fdm->fadumphdr_addr = cpu_to_be64(0);
+}
+
+static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf)
+{
+	u64 addr = fadump_conf->reserve_dump_area_start;
+	u16 reg_cnt;
+	int i;
+
+	opal_fdm = __va(fadump_conf->kernel_metadata);
+	opal_fadump_init_metadata(opal_fdm);
+
+	/* Boot memory regions */
+	reg_cnt = be16_to_cpu(opal_fdm->region_cnt);
+	for (i = 0; i < fadump_conf->boot_mem_regs_cnt; i++) {
+		opal_fdm->rgn[i].src	= cpu_to_be64(fadump_conf->boot_mem_addr[i]);
+		opal_fdm->rgn[i].dest	= cpu_to_be64(addr);
+		opal_fdm->rgn[i].size	= cpu_to_be64(fadump_conf->boot_mem_sz[i]);
+
+		reg_cnt++;
+		addr += fadump_conf->boot_mem_sz[i];
+	}
+	opal_fdm->region_cnt = cpu_to_be16(reg_cnt);
+
+	/*
+	 * Kernel metadata is passed to f/w and retrieved in capture kerenl.
+	 * So, use it to save fadump header address instead of calculating it.
+	 */
+	opal_fdm->fadumphdr_addr = cpu_to_be64(be64_to_cpu(opal_fdm->rgn[0].dest) +
+					       fadump_conf->boot_memory_size);
+
+	opal_fadump_update_config(fadump_conf, opal_fdm);
+
+	return addr;
+}
+
+static u64 opal_fadump_get_metadata_size(void)
+{
+	return PAGE_ALIGN(sizeof(struct opal_fadump_mem_struct));
+}
+
+static int opal_fadump_setup_metadata(struct fw_dump *fadump_conf)
+{
+	int err = 0;
+	s64 ret;
+
+	/*
+	 * Use the last page(s) in FADump memory reservation for
+	 * kernel metadata.
+	 */
+	fadump_conf->kernel_metadata = (fadump_conf->reserve_dump_area_start +
+					fadump_conf->reserve_dump_area_size -
+					opal_fadump_get_metadata_size());
+	pr_info("Kernel metadata addr: %llx\n", fadump_conf->kernel_metadata);
+
+	/* Initialize kernel metadata before registering the address with f/w */
+	opal_fdm = __va(fadump_conf->kernel_metadata);
+	opal_fadump_init_metadata(opal_fdm);
+
+	/*
+	 * Register metadata address with f/w. Can be retrieved in
+	 * the capture kernel.
+	 */
+	ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL,
+				      fadump_conf->kernel_metadata);
+	if (ret != OPAL_SUCCESS) {
+		pr_err("Failed to set kernel metadata tag!\n");
+		err = -EPERM;
+	}
+
+	/*
+	 * Register boot memory top address with f/w. Should be retrieved
+	 * by a kernel that intends to preserve crash'ed kernel's memory.
+	 */
+	ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_BOOT_MEM,
+				      fadump_conf->boot_mem_top);
+	if (ret != OPAL_SUCCESS) {
+		pr_err("Failed to set boot memory tag!\n");
+		err = -EPERM;
+	}
+
+	return err;
+}
+
+static u64 opal_fadump_get_bootmem_min(void)
+{
+	return OPAL_FADUMP_MIN_BOOT_MEM;
+}
+
+static int opal_fadump_register(struct fw_dump *fadump_conf)
+{
+	s64 rc = OPAL_PARAMETER;
+	u16 registered_regs;
+	int i, err = -EIO;
+
+	registered_regs = be16_to_cpu(opal_fdm->registered_regions);
+	for (i = 0; i < be16_to_cpu(opal_fdm->region_cnt); i++) {
+		rc = opal_mpipl_update(OPAL_MPIPL_ADD_RANGE,
+				       be64_to_cpu(opal_fdm->rgn[i].src),
+				       be64_to_cpu(opal_fdm->rgn[i].dest),
+				       be64_to_cpu(opal_fdm->rgn[i].size));
+		if (rc != OPAL_SUCCESS)
+			break;
+
+		registered_regs++;
+	}
+	opal_fdm->registered_regions = cpu_to_be16(registered_regs);
+
+	switch (rc) {
+	case OPAL_SUCCESS:
+		pr_info("Registration is successful!\n");
+		fadump_conf->dump_registered = 1;
+		err = 0;
+		break;
+	case OPAL_RESOURCE:
+		/* If MAX regions limit in f/w is hit, warn and proceed. */
+		pr_warn("%d regions could not be registered for MPIPL as MAX limit is reached!\n",
+			(be16_to_cpu(opal_fdm->region_cnt) -
+			 be16_to_cpu(opal_fdm->registered_regions)));
+		fadump_conf->dump_registered = 1;
+		err = 0;
+		break;
+	case OPAL_PARAMETER:
+		pr_err("Failed to register. Parameter Error(%lld).\n", rc);
+		break;
+	case OPAL_HARDWARE:
+		pr_err("Support not available.\n");
+		fadump_conf->fadump_supported = 0;
+		fadump_conf->fadump_enabled = 0;
+		break;
+	default:
+		pr_err("Failed to register. Unknown Error(%lld).\n", rc);
+		break;
+	}
+
+	/*
+	 * If some regions were registered before OPAL_MPIPL_ADD_RANGE
+	 * OPAL call failed, unregister all regions.
+	 */
+	if ((err < 0) && (be16_to_cpu(opal_fdm->registered_regions) > 0))
+		opal_fadump_unregister(fadump_conf);
+
+	return err;
+}
+
+static int opal_fadump_unregister(struct fw_dump *fadump_conf)
+{
+	s64 rc;
+
+	rc = opal_mpipl_update(OPAL_MPIPL_REMOVE_ALL, 0, 0, 0);
+	if (rc) {
+		pr_err("Failed to un-register - unexpected Error(%lld).\n", rc);
+		return -EIO;
+	}
+
+	opal_fdm->registered_regions = cpu_to_be16(0);
+	fadump_conf->dump_registered = 0;
+	return 0;
+}
+
+static int opal_fadump_invalidate(struct fw_dump *fadump_conf)
+{
+	s64 rc;
+
+	rc = opal_mpipl_update(OPAL_MPIPL_FREE_PRESERVED_MEMORY, 0, 0, 0);
+	if (rc) {
+		pr_err("Failed to invalidate - unexpected Error(%lld).\n", rc);
+		return -EIO;
+	}
+
+	fadump_conf->dump_active = 0;
+	opal_fdm_active = NULL;
+	return 0;
+}
+
+static void opal_fadump_cleanup(struct fw_dump *fadump_conf)
+{
+	s64 ret;
+
+	ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL, 0);
+	if (ret != OPAL_SUCCESS)
+		pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret);
+}
+
+/*
+ * Verify if CPU state data is available. If available, do a bit of sanity
+ * checking before processing this data.
+ */
+static bool __init is_opal_fadump_cpu_data_valid(struct fw_dump *fadump_conf)
+{
+	if (!opal_cpu_metadata)
+		return false;
+
+	fadump_conf->cpu_state_data_version =
+		be32_to_cpu(opal_cpu_metadata->cpu_data_version);
+	fadump_conf->cpu_state_entry_size =
+		be32_to_cpu(opal_cpu_metadata->cpu_data_size);
+	fadump_conf->cpu_state_dest_vaddr =
+		(u64)__va(be64_to_cpu(opal_cpu_metadata->region[0].dest));
+	fadump_conf->cpu_state_data_size =
+		be64_to_cpu(opal_cpu_metadata->region[0].size);
+
+	if (fadump_conf->cpu_state_data_version != HDAT_FADUMP_CPU_DATA_VER) {
+		pr_warn("Supported CPU state data version: %u, found: %d!\n",
+			HDAT_FADUMP_CPU_DATA_VER,
+			fadump_conf->cpu_state_data_version);
+		pr_warn("WARNING: F/W using newer CPU state data format!!\n");
+	}
+
+	if ((fadump_conf->cpu_state_dest_vaddr == 0) ||
+	    (fadump_conf->cpu_state_entry_size == 0) ||
+	    (fadump_conf->cpu_state_entry_size >
+	     fadump_conf->cpu_state_data_size)) {
+		pr_err("CPU state data is invalid. Ignoring!\n");
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Convert CPU state data saved at the time of crash into ELF notes.
+ *
+ * While the crashing CPU's register data is saved by the kernel, CPU state
+ * data for all CPUs is saved by f/w. In CPU state data provided by f/w,
+ * each register entry is of 16 bytes, a numerical identifier along with
+ * a GPR/SPR flag in the first 8 bytes and the register value in the next
+ * 8 bytes. For more details refer to F/W documentation. If this data is
+ * missing or in unsupported format, append crashing CPU's register data
+ * saved by the kernel in the PT_NOTE, to have something to work with in
+ * the vmcore file.
+ */
+static int __init
+opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf,
+			    struct fadump_crash_info_header *fdh)
+{
+	u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize;
+	struct hdat_fadump_thread_hdr *thdr;
+	bool is_cpu_data_valid = false;
+	u32 num_cpus = 1, *note_buf;
+	struct pt_regs regs;
+	char *bufp;
+	int rc, i;
+
+	if (is_opal_fadump_cpu_data_valid(fadump_conf)) {
+		size_per_thread = fadump_conf->cpu_state_entry_size;
+		num_cpus = (fadump_conf->cpu_state_data_size / size_per_thread);
+		bufp = __va(fadump_conf->cpu_state_dest_vaddr);
+		is_cpu_data_valid = true;
+	}
+
+	rc = fadump_setup_cpu_notes_buf(num_cpus);
+	if (rc != 0)
+		return rc;
+
+	note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr;
+	if (!is_cpu_data_valid)
+		goto out;
+
+	/*
+	 * Offset for register entries, entry size and registers count is
+	 * duplicated in every thread header in keeping with HDAT format.
+	 * Use these values from the first thread header.
+	 */
+	thdr = (struct hdat_fadump_thread_hdr *)bufp;
+	regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) +
+		       be32_to_cpu(thdr->offset));
+	reg_esize = be32_to_cpu(thdr->esize);
+	regs_cnt  = be32_to_cpu(thdr->ecnt);
+
+	pr_debug("--------CPU State Data------------\n");
+	pr_debug("NumCpus     : %u\n", num_cpus);
+	pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n",
+		 regs_offset, reg_esize, regs_cnt);
+
+	for (i = 0; i < num_cpus; i++, bufp += size_per_thread) {
+		thdr = (struct hdat_fadump_thread_hdr *)bufp;
+
+		thread_pir = be32_to_cpu(thdr->pir);
+		pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n",
+			 i, thread_pir, thdr->core_state);
+
+		/*
+		 * If this is kernel initiated crash, crashing_cpu would be set
+		 * appropriately and register data of the crashing CPU saved by
+		 * crashing kernel. Add this saved register data of crashing CPU
+		 * to elf notes and populate the pt_regs for the remaining CPUs
+		 * from register state data provided by firmware.
+		 */
+		if (fdh->crashing_cpu == thread_pir) {
+			note_buf = fadump_regs_to_elf_notes(note_buf,
+							    &fdh->regs);
+			pr_debug("Crashing CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n",
+				 fdh->crashing_cpu, fdh->regs.gpr[1],
+				 fdh->regs.nip);
+			continue;
+		}
+
+		/*
+		 * Register state data of MAX cores is provided by firmware,
+		 * but some of this cores may not be active. So, while
+		 * processing register state data, check core state and
+		 * skip threads that belong to inactive cores.
+		 */
+		if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE)
+			continue;
+
+		opal_fadump_read_regs((bufp + regs_offset), regs_cnt,
+				      reg_esize, true, &regs);
+		note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+		pr_debug("CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n",
+			 thread_pir, regs.gpr[1], regs.nip);
+	}
+
+out:
+	/*
+	 * CPU state data is invalid/unsupported. Try appending crashing CPU's
+	 * register data, if it is saved by the kernel.
+	 */
+	if (fadump_conf->cpu_notes_buf_vaddr == (u64)note_buf) {
+		if (fdh->crashing_cpu == FADUMP_CPU_UNKNOWN) {
+			fadump_free_cpu_notes_buf();
+			return -ENODEV;
+		}
+
+		pr_warn("WARNING: appending only crashing CPU's register data\n");
+		note_buf = fadump_regs_to_elf_notes(note_buf, &(fdh->regs));
+	}
+
+	final_note(note_buf);
+
+	pr_debug("Updating elfcore header (%llx) with cpu notes\n",
+		 fdh->elfcorehdr_addr);
+	fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr));
+	return 0;
+}
+
+static int __init opal_fadump_process(struct fw_dump *fadump_conf)
+{
+	struct fadump_crash_info_header *fdh;
+	int rc = -EINVAL;
+
+	if (!opal_fdm_active || !fadump_conf->fadumphdr_addr)
+		return rc;
+
+	/* Validate the fadump crash info header */
+	fdh = __va(fadump_conf->fadumphdr_addr);
+	if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
+		pr_err("Crash info header is not valid.\n");
+		return rc;
+	}
+
+#ifdef CONFIG_OPAL_CORE
+	/*
+	 * If this is a kernel initiated crash, crashing_cpu would be set
+	 * appropriately and register data of the crashing CPU saved by
+	 * crashing kernel. Add this saved register data of crashing CPU
+	 * to elf notes and populate the pt_regs for the remaining CPUs
+	 * from register state data provided by firmware.
+	 */
+	if (fdh->crashing_cpu != FADUMP_CPU_UNKNOWN)
+		kernel_initiated = true;
+#endif
+
+	rc = opal_fadump_build_cpu_notes(fadump_conf, fdh);
+	if (rc)
+		return rc;
+
+	/*
+	 * We are done validating dump info and elfcore header is now ready
+	 * to be exported. set elfcorehdr_addr so that vmcore module will
+	 * export the elfcore header through '/proc/vmcore'.
+	 */
+	elfcorehdr_addr = fdh->elfcorehdr_addr;
+
+	return rc;
+}
+
+static void opal_fadump_region_show(struct fw_dump *fadump_conf,
+				    struct seq_file *m)
+{
+	const struct opal_fadump_mem_struct *fdm_ptr;
+	u64 dumped_bytes = 0;
+	int i;
+
+	if (fadump_conf->dump_active)
+		fdm_ptr = opal_fdm_active;
+	else
+		fdm_ptr = opal_fdm;
+
+	for (i = 0; i < be16_to_cpu(fdm_ptr->region_cnt); i++) {
+		/*
+		 * Only regions that are registered for MPIPL
+		 * would have dump data.
+		 */
+		if ((fadump_conf->dump_active) &&
+		    (i < be16_to_cpu(fdm_ptr->registered_regions)))
+			dumped_bytes = be64_to_cpu(fdm_ptr->rgn[i].size);
+
+		seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ",
+			   be64_to_cpu(fdm_ptr->rgn[i].src),
+			   be64_to_cpu(fdm_ptr->rgn[i].dest));
+		seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n",
+			   be64_to_cpu(fdm_ptr->rgn[i].size), dumped_bytes);
+	}
+
+	/* Dump is active. Show reserved area start address. */
+	if (fadump_conf->dump_active) {
+		seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n",
+			   fadump_conf->reserve_dump_area_start);
+	}
+}
+
+static void opal_fadump_trigger(struct fadump_crash_info_header *fdh,
+				const char *msg)
+{
+	int rc;
+
+	/*
+	 * Unlike on pSeries platform, logical CPU number is not provided
+	 * with architected register state data. So, store the crashing
+	 * CPU's PIR instead to plug the appropriate register data for
+	 * crashing CPU in the vmcore file.
+	 */
+	fdh->crashing_cpu = (u32)mfspr(SPRN_PIR);
+
+	rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, msg);
+	if (rc == OPAL_UNSUPPORTED) {
+		pr_emerg("Reboot type %d not supported.\n",
+			 OPAL_REBOOT_MPIPL);
+	} else if (rc == OPAL_HARDWARE)
+		pr_emerg("No backend support for MPIPL!\n");
+}
+
+static struct fadump_ops opal_fadump_ops = {
+	.fadump_init_mem_struct		= opal_fadump_init_mem_struct,
+	.fadump_get_metadata_size	= opal_fadump_get_metadata_size,
+	.fadump_setup_metadata		= opal_fadump_setup_metadata,
+	.fadump_get_bootmem_min		= opal_fadump_get_bootmem_min,
+	.fadump_register		= opal_fadump_register,
+	.fadump_unregister		= opal_fadump_unregister,
+	.fadump_invalidate		= opal_fadump_invalidate,
+	.fadump_cleanup			= opal_fadump_cleanup,
+	.fadump_process			= opal_fadump_process,
+	.fadump_region_show		= opal_fadump_region_show,
+	.fadump_trigger			= opal_fadump_trigger,
+};
+
+void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
+{
+	const __be32 *prop;
+	unsigned long dn;
+	__be64 be_addr;
+	u64 addr = 0;
+	int i, len;
+	s64 ret;
+
+	/*
+	 * Check if Firmware-Assisted Dump is supported. if yes, check
+	 * if dump has been initiated on last reboot.
+	 */
+	dn = of_get_flat_dt_subnode_by_name(node, "dump");
+	if (dn == -FDT_ERR_NOTFOUND) {
+		pr_debug("FADump support is missing!\n");
+		return;
+	}
+
+	if (!of_flat_dt_is_compatible(dn, "ibm,opal-dump")) {
+		pr_err("Support missing for this f/w version!\n");
+		return;
+	}
+
+	prop = of_get_flat_dt_prop(dn, "fw-load-area", &len);
+	if (prop) {
+		/*
+		 * Each f/w load area is an (address,size) pair,
+		 * 2 cells each, totalling 4 cells per range.
+		 */
+		for (i = 0; i < len / (sizeof(*prop) * 4); i++) {
+			u64 base, end;
+
+			base = of_read_number(prop + (i * 4) + 0, 2);
+			end = base;
+			end += of_read_number(prop + (i * 4) + 2, 2);
+			if (end > OPAL_FADUMP_MIN_BOOT_MEM) {
+				pr_err("F/W load area: 0x%llx-0x%llx\n",
+				       base, end);
+				pr_err("F/W version not supported!\n");
+				return;
+			}
+		}
+	}
+
+	fadump_conf->ops		= &opal_fadump_ops;
+	fadump_conf->fadump_supported	= 1;
+
+	/*
+	 * Firmware supports 32-bit field for size. Align it to PAGE_SIZE
+	 * and request firmware to copy multiple kernel boot memory regions.
+	 */
+	fadump_conf->max_copy_size = ALIGN_DOWN(U32_MAX, PAGE_SIZE);
+
+	/*
+	 * Check if dump has been initiated on last reboot.
+	 */
+	prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL);
+	if (!prop)
+		return;
+
+	ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &be_addr);
+	if ((ret != OPAL_SUCCESS) || !be_addr) {
+		pr_err("Failed to get Kernel metadata (%lld)\n", ret);
+		return;
+	}
+
+	addr = be64_to_cpu(be_addr);
+	pr_debug("Kernel metadata addr: %llx\n", addr);
+
+	opal_fdm_active = __va(addr);
+	if (opal_fdm_active->version != OPAL_FADUMP_VERSION) {
+		pr_warn("Supported kernel metadata version: %u, found: %d!\n",
+			OPAL_FADUMP_VERSION, opal_fdm_active->version);
+		pr_warn("WARNING: Kernel metadata format mismatch identified! Core file maybe corrupted..\n");
+	}
+
+	/* Kernel regions not registered with f/w for MPIPL */
+	if (be16_to_cpu(opal_fdm_active->registered_regions) == 0) {
+		opal_fdm_active = NULL;
+		return;
+	}
+
+	ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &be_addr);
+	if (be_addr) {
+		addr = be64_to_cpu(be_addr);
+		pr_debug("CPU metadata addr: %llx\n", addr);
+		opal_cpu_metadata = __va(addr);
+	}
+
+	pr_info("Firmware-assisted dump is active.\n");
+	fadump_conf->dump_active = 1;
+	opal_fadump_get_config(fadump_conf, opal_fdm_active);
+}
+#endif /* !CONFIG_PRESERVE_FA_DUMP */
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h
new file mode 100644
index 000000000..3f715efb0
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-fadump.h
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Firmware-Assisted Dump support on POWER platform (OPAL).
+ *
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#ifndef _POWERNV_OPAL_FADUMP_H
+#define _POWERNV_OPAL_FADUMP_H
+
+#include <asm/reg.h>
+
+/*
+ * With kernel & initrd loaded at 512MB (with 256MB size), enforce a minimum
+ * boot memory size of 768MB to ensure f/w loading kernel and initrd doesn't
+ * mess with crash'ed kernel's memory during MPIPL.
+ */
+#define OPAL_FADUMP_MIN_BOOT_MEM		(0x30000000UL)
+
+/*
+ * OPAL FADump metadata structure format version
+ *
+ * OPAL FADump kernel metadata structure stores kernel metadata needed to
+ * register-for/process crash dump. Format version is used to keep a tab on
+ * the changes in the structure format. The changes, if any, to the format
+ * are expected to be minimal and backward compatible.
+ */
+#define OPAL_FADUMP_VERSION			0x1
+
+/*
+ * OPAL FADump kernel metadata
+ *
+ * The address of this structure will be registered with f/w for retrieving
+ * in the capture kernel to process the crash dump.
+ */
+struct opal_fadump_mem_struct {
+	u8	version;
+	u8	reserved[3];
+	__be16	region_cnt;		/* number of regions */
+	__be16	registered_regions;	/* Regions registered for MPIPL */
+	__be64	fadumphdr_addr;
+	struct opal_mpipl_region	rgn[FADUMP_MAX_MEM_REGS];
+} __packed;
+
+/*
+ * CPU state data
+ *
+ * CPU state data information is provided by f/w. The format for this data
+ * is defined in the HDAT spec. Version is used to keep a tab on the changes
+ * in this CPU state data format. Changes to this format are unlikely, but
+ * if there are any changes, please refer to latest HDAT specification.
+ */
+#define HDAT_FADUMP_CPU_DATA_VER		1
+
+#define HDAT_FADUMP_CORE_INACTIVE		(0x0F)
+
+/* HDAT thread header for register entries */
+struct hdat_fadump_thread_hdr {
+	__be32  pir;
+	/* 0x00 - 0x0F - The corresponding stop state of the core */
+	u8      core_state;
+	u8      reserved[3];
+
+	__be32	offset;	/* Offset to Register Entries array */
+	__be32	ecnt;	/* Number of entries */
+	__be32	esize;	/* Alloc size of each array entry in bytes */
+	__be32	eactsz;	/* Actual size of each array entry in bytes */
+} __packed;
+
+/* Register types populated by f/w */
+#define HDAT_FADUMP_REG_TYPE_GPR		0x01
+#define HDAT_FADUMP_REG_TYPE_SPR		0x02
+
+/* ID numbers used by f/w while populating certain registers */
+#define HDAT_FADUMP_REG_ID_NIP			0x7D0
+#define HDAT_FADUMP_REG_ID_MSR			0x7D1
+#define HDAT_FADUMP_REG_ID_CCR			0x7D2
+
+/* HDAT register entry. */
+struct hdat_fadump_reg_entry {
+	__be32		reg_type;
+	__be32		reg_num;
+	__be64		reg_val;
+} __packed;
+
+static inline void opal_fadump_set_regval_regnum(struct pt_regs *regs,
+						 u32 reg_type, u32 reg_num,
+						 u64 reg_val)
+{
+	if (reg_type == HDAT_FADUMP_REG_TYPE_GPR) {
+		if (reg_num < 32)
+			regs->gpr[reg_num] = reg_val;
+		return;
+	}
+
+	switch (reg_num) {
+	case SPRN_CTR:
+		regs->ctr = reg_val;
+		break;
+	case SPRN_LR:
+		regs->link = reg_val;
+		break;
+	case SPRN_XER:
+		regs->xer = reg_val;
+		break;
+	case SPRN_DAR:
+		regs->dar = reg_val;
+		break;
+	case SPRN_DSISR:
+		regs->dsisr = reg_val;
+		break;
+	case HDAT_FADUMP_REG_ID_NIP:
+		regs->nip = reg_val;
+		break;
+	case HDAT_FADUMP_REG_ID_MSR:
+		regs->msr = reg_val;
+		break;
+	case HDAT_FADUMP_REG_ID_CCR:
+		regs->ccr = reg_val;
+		break;
+	}
+}
+
+static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt,
+					 unsigned int reg_entry_size,
+					 bool cpu_endian,
+					 struct pt_regs *regs)
+{
+	struct hdat_fadump_reg_entry *reg_entry;
+	u64 val;
+	int i;
+
+	memset(regs, 0, sizeof(struct pt_regs));
+
+	for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) {
+		reg_entry = (struct hdat_fadump_reg_entry *)bufp;
+		val = (cpu_endian ? be64_to_cpu(reg_entry->reg_val) :
+		       (u64)(reg_entry->reg_val));
+		opal_fadump_set_regval_regnum(regs,
+					      be32_to_cpu(reg_entry->reg_type),
+					      be32_to_cpu(reg_entry->reg_num),
+					      val);
+	}
+}
+
+#endif /* _POWERNV_OPAL_FADUMP_H */
diff --git a/arch/powerpc/platforms/powernv/opal-flash.c b/arch/powerpc/platforms/powernv/opal-flash.c
new file mode 100644
index 000000000..7e7d38b17
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-flash.c
@@ -0,0 +1,562 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerNV OPAL Firmware Update Interface
+ *
+ * Copyright 2013 IBM Corp.
+ */
+
+#define DEBUG
+
+#include <linux/kernel.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/delay.h>
+
+#include <asm/opal.h>
+
+/* FLASH status codes */
+#define FLASH_NO_OP		-1099	/* No operation initiated by user */
+#define FLASH_NO_AUTH		-9002	/* Not a service authority partition */
+
+/* Validate image status values */
+#define VALIDATE_IMG_READY	-1001	/* Image ready for validation */
+#define VALIDATE_IMG_INCOMPLETE	-1002	/* User copied < VALIDATE_BUF_SIZE */
+
+/* Manage image status values */
+#define MANAGE_ACTIVE_ERR	-9001	/* Cannot overwrite active img */
+
+/* Flash image status values */
+#define FLASH_IMG_READY		0	/* Img ready for flash on reboot */
+#define FLASH_INVALID_IMG	-1003	/* Flash image shorter than expected */
+#define FLASH_IMG_NULL_DATA	-1004	/* Bad data in sg list entry */
+#define FLASH_IMG_BAD_LEN	-1005	/* Bad length in sg list entry */
+
+/* Manage operation tokens */
+#define FLASH_REJECT_TMP_SIDE	0	/* Reject temporary fw image */
+#define FLASH_COMMIT_TMP_SIDE	1	/* Commit temporary fw image */
+
+/* Update tokens */
+#define FLASH_UPDATE_CANCEL	0	/* Cancel update request */
+#define FLASH_UPDATE_INIT	1	/* Initiate update */
+
+/* Validate image update result tokens */
+#define VALIDATE_TMP_UPDATE	0     /* T side will be updated */
+#define VALIDATE_FLASH_AUTH	1     /* Partition does not have authority */
+#define VALIDATE_INVALID_IMG	2     /* Candidate image is not valid */
+#define VALIDATE_CUR_UNKNOWN	3     /* Current fixpack level is unknown */
+/*
+ * Current T side will be committed to P side before being replace with new
+ * image, and the new image is downlevel from current image
+ */
+#define VALIDATE_TMP_COMMIT_DL	4
+/*
+ * Current T side will be committed to P side before being replaced with new
+ * image
+ */
+#define VALIDATE_TMP_COMMIT	5
+/*
+ * T side will be updated with a downlevel image
+ */
+#define VALIDATE_TMP_UPDATE_DL	6
+/*
+ * The candidate image's release date is later than the system's firmware
+ * service entitlement date - service warranty period has expired
+ */
+#define VALIDATE_OUT_OF_WRNTY	7
+
+/* Validate buffer size */
+#define VALIDATE_BUF_SIZE	4096
+
+/* XXX: Assume candidate image size is <= 1GB */
+#define MAX_IMAGE_SIZE	0x40000000
+
+/* Image status */
+enum {
+	IMAGE_INVALID,
+	IMAGE_LOADING,
+	IMAGE_READY,
+};
+
+/* Candidate image data */
+struct image_data_t {
+	int		status;
+	void		*data;
+	uint32_t	size;
+};
+
+/* Candidate image header */
+struct image_header_t {
+	uint16_t	magic;
+	uint16_t	version;
+	uint32_t	size;
+};
+
+struct validate_flash_t {
+	int		status;		/* Return status */
+	void		*buf;		/* Candidate image buffer */
+	uint32_t	buf_size;	/* Image size */
+	uint32_t	result;		/* Update results token */
+};
+
+struct manage_flash_t {
+	int status;		/* Return status */
+};
+
+struct update_flash_t {
+	int status;		/* Return status */
+};
+
+static struct image_header_t	image_header;
+static struct image_data_t	image_data;
+static struct validate_flash_t	validate_flash_data;
+static struct manage_flash_t	manage_flash_data;
+
+/* Initialize update_flash_data status to No Operation */
+static struct update_flash_t	update_flash_data = {
+	.status = FLASH_NO_OP,
+};
+
+static DEFINE_MUTEX(image_data_mutex);
+
+/*
+ * Validate candidate image
+ */
+static inline void opal_flash_validate(void)
+{
+	long ret;
+	void *buf = validate_flash_data.buf;
+	__be32 size = cpu_to_be32(validate_flash_data.buf_size);
+	__be32 result;
+
+	ret = opal_validate_flash(__pa(buf), &size, &result);
+
+	validate_flash_data.status = ret;
+	validate_flash_data.buf_size = be32_to_cpu(size);
+	validate_flash_data.result = be32_to_cpu(result);
+}
+
+/*
+ * Validate output format:
+ *     validate result token
+ *     current image version details
+ *     new image version details
+ */
+static ssize_t validate_show(struct kobject *kobj,
+			     struct kobj_attribute *attr, char *buf)
+{
+	struct validate_flash_t *args_buf = &validate_flash_data;
+	int len;
+
+	/* Candidate image is not validated */
+	if (args_buf->status < VALIDATE_TMP_UPDATE) {
+		len = sprintf(buf, "%d\n", args_buf->status);
+		goto out;
+	}
+
+	/* Result token */
+	len = sprintf(buf, "%d\n", args_buf->result);
+
+	/* Current and candidate image version details */
+	if ((args_buf->result != VALIDATE_TMP_UPDATE) &&
+	    (args_buf->result < VALIDATE_CUR_UNKNOWN))
+		goto out;
+
+	if (args_buf->buf_size > (VALIDATE_BUF_SIZE - len)) {
+		memcpy(buf + len, args_buf->buf, VALIDATE_BUF_SIZE - len);
+		len = VALIDATE_BUF_SIZE;
+	} else {
+		memcpy(buf + len, args_buf->buf, args_buf->buf_size);
+		len += args_buf->buf_size;
+	}
+out:
+	/* Set status to default */
+	args_buf->status = FLASH_NO_OP;
+	return len;
+}
+
+/*
+ * Validate candidate firmware image
+ *
+ * Note:
+ *   We are only interested in first 4K bytes of the
+ *   candidate image.
+ */
+static ssize_t validate_store(struct kobject *kobj,
+			      struct kobj_attribute *attr,
+			      const char *buf, size_t count)
+{
+	struct validate_flash_t *args_buf = &validate_flash_data;
+
+	if (buf[0] != '1')
+		return -EINVAL;
+
+	mutex_lock(&image_data_mutex);
+
+	if (image_data.status != IMAGE_READY ||
+	    image_data.size < VALIDATE_BUF_SIZE) {
+		args_buf->result = VALIDATE_INVALID_IMG;
+		args_buf->status = VALIDATE_IMG_INCOMPLETE;
+		goto out;
+	}
+
+	/* Copy first 4k bytes of candidate image */
+	memcpy(args_buf->buf, image_data.data, VALIDATE_BUF_SIZE);
+
+	args_buf->status = VALIDATE_IMG_READY;
+	args_buf->buf_size = VALIDATE_BUF_SIZE;
+
+	/* Validate candidate image */
+	opal_flash_validate();
+
+out:
+	mutex_unlock(&image_data_mutex);
+	return count;
+}
+
+/*
+ * Manage flash routine
+ */
+static inline void opal_flash_manage(uint8_t op)
+{
+	struct manage_flash_t *const args_buf = &manage_flash_data;
+
+	args_buf->status = opal_manage_flash(op);
+}
+
+/*
+ * Show manage flash status
+ */
+static ssize_t manage_show(struct kobject *kobj,
+			   struct kobj_attribute *attr, char *buf)
+{
+	struct manage_flash_t *const args_buf = &manage_flash_data;
+	int rc;
+
+	rc = sprintf(buf, "%d\n", args_buf->status);
+	/* Set status to default*/
+	args_buf->status = FLASH_NO_OP;
+	return rc;
+}
+
+/*
+ * Manage operations:
+ *   0 - Reject
+ *   1 - Commit
+ */
+static ssize_t manage_store(struct kobject *kobj,
+			    struct kobj_attribute *attr,
+			    const char *buf, size_t count)
+{
+	uint8_t op;
+	switch (buf[0]) {
+	case '0':
+		op = FLASH_REJECT_TMP_SIDE;
+		break;
+	case '1':
+		op = FLASH_COMMIT_TMP_SIDE;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* commit/reject temporary image */
+	opal_flash_manage(op);
+	return count;
+}
+
+/*
+ * OPAL update flash
+ */
+static int opal_flash_update(int op)
+{
+	struct opal_sg_list *list;
+	unsigned long addr;
+	int64_t rc = OPAL_PARAMETER;
+
+	if (op == FLASH_UPDATE_CANCEL) {
+		pr_alert("FLASH: Image update cancelled\n");
+		addr = '\0';
+		goto flash;
+	}
+
+	list = opal_vmalloc_to_sg_list(image_data.data, image_data.size);
+	if (!list)
+		goto invalid_img;
+
+	/* First entry address */
+	addr = __pa(list);
+
+flash:
+	rc = opal_update_flash(addr);
+
+invalid_img:
+	return rc;
+}
+
+/* This gets called just before system reboots */
+void opal_flash_update_print_message(void)
+{
+	if (update_flash_data.status != FLASH_IMG_READY)
+		return;
+
+	pr_alert("FLASH: Flashing new firmware\n");
+	pr_alert("FLASH: Image is %u bytes\n", image_data.size);
+	pr_alert("FLASH: Performing flash and reboot/shutdown\n");
+	pr_alert("FLASH: This will take several minutes. Do not power off!\n");
+
+	/* Small delay to help getting the above message out */
+	msleep(500);
+}
+
+/*
+ * Show candidate image status
+ */
+static ssize_t update_show(struct kobject *kobj,
+			   struct kobj_attribute *attr, char *buf)
+{
+	struct update_flash_t *const args_buf = &update_flash_data;
+	return sprintf(buf, "%d\n", args_buf->status);
+}
+
+/*
+ * Set update image flag
+ *  1 - Flash new image
+ *  0 - Cancel flash request
+ */
+static ssize_t update_store(struct kobject *kobj,
+			    struct kobj_attribute *attr,
+			    const char *buf, size_t count)
+{
+	struct update_flash_t *const args_buf = &update_flash_data;
+	int rc = count;
+
+	mutex_lock(&image_data_mutex);
+
+	switch (buf[0]) {
+	case '0':
+		if (args_buf->status == FLASH_IMG_READY)
+			opal_flash_update(FLASH_UPDATE_CANCEL);
+		args_buf->status = FLASH_NO_OP;
+		break;
+	case '1':
+		/* Image is loaded? */
+		if (image_data.status == IMAGE_READY)
+			args_buf->status =
+				opal_flash_update(FLASH_UPDATE_INIT);
+		else
+			args_buf->status = FLASH_INVALID_IMG;
+		break;
+	default:
+		rc = -EINVAL;
+	}
+
+	mutex_unlock(&image_data_mutex);
+	return rc;
+}
+
+/*
+ * Free image buffer
+ */
+static void free_image_buf(void)
+{
+	void *addr;
+	int size;
+
+	addr = image_data.data;
+	size = PAGE_ALIGN(image_data.size);
+	while (size > 0) {
+		ClearPageReserved(vmalloc_to_page(addr));
+		addr += PAGE_SIZE;
+		size -= PAGE_SIZE;
+	}
+	vfree(image_data.data);
+	image_data.data = NULL;
+	image_data.status = IMAGE_INVALID;
+}
+
+/*
+ * Allocate image buffer.
+ */
+static int alloc_image_buf(char *buffer, size_t count)
+{
+	void *addr;
+	int size;
+
+	if (count < sizeof(image_header)) {
+		pr_warn("FLASH: Invalid candidate image\n");
+		return -EINVAL;
+	}
+
+	memcpy(&image_header, (void *)buffer, sizeof(image_header));
+	image_data.size = be32_to_cpu(image_header.size);
+	pr_debug("FLASH: Candidate image size = %u\n", image_data.size);
+
+	if (image_data.size > MAX_IMAGE_SIZE) {
+		pr_warn("FLASH: Too large image\n");
+		return -EINVAL;
+	}
+	if (image_data.size < VALIDATE_BUF_SIZE) {
+		pr_warn("FLASH: Image is shorter than expected\n");
+		return -EINVAL;
+	}
+
+	image_data.data = vzalloc(PAGE_ALIGN(image_data.size));
+	if (!image_data.data) {
+		pr_err("%s : Failed to allocate memory\n", __func__);
+		return -ENOMEM;
+	}
+
+	/* Pin memory */
+	addr = image_data.data;
+	size = PAGE_ALIGN(image_data.size);
+	while (size > 0) {
+		SetPageReserved(vmalloc_to_page(addr));
+		addr += PAGE_SIZE;
+		size -= PAGE_SIZE;
+	}
+
+	image_data.status = IMAGE_LOADING;
+	return 0;
+}
+
+/*
+ * Copy candidate image
+ *
+ * Parse candidate image header to get total image size
+ * and pre-allocate required memory.
+ */
+static ssize_t image_data_write(struct file *filp, struct kobject *kobj,
+				struct bin_attribute *bin_attr,
+				char *buffer, loff_t pos, size_t count)
+{
+	int rc;
+
+	mutex_lock(&image_data_mutex);
+
+	/* New image ? */
+	if (pos == 0) {
+		/* Free memory, if already allocated */
+		if (image_data.data)
+			free_image_buf();
+
+		/* Cancel outstanding image update request */
+		if (update_flash_data.status == FLASH_IMG_READY)
+			opal_flash_update(FLASH_UPDATE_CANCEL);
+
+		/* Allocate memory */
+		rc = alloc_image_buf(buffer, count);
+		if (rc)
+			goto out;
+	}
+
+	if (image_data.status != IMAGE_LOADING) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	if ((pos + count) > image_data.size) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	memcpy(image_data.data + pos, (void *)buffer, count);
+	rc = count;
+
+	/* Set image status */
+	if ((pos + count) == image_data.size) {
+		pr_debug("FLASH: Candidate image loaded....\n");
+		image_data.status = IMAGE_READY;
+	}
+
+out:
+	mutex_unlock(&image_data_mutex);
+	return rc;
+}
+
+/*
+ * sysfs interface :
+ *  OPAL uses below sysfs files for code update.
+ *  We create these files under /sys/firmware/opal.
+ *
+ *   image		: Interface to load candidate firmware image
+ *   validate_flash	: Validate firmware image
+ *   manage_flash	: Commit/Reject firmware image
+ *   update_flash	: Flash new firmware image
+ *
+ */
+static const struct bin_attribute image_data_attr = {
+	.attr = {.name = "image", .mode = 0200},
+	.size = MAX_IMAGE_SIZE,	/* Limit image size */
+	.write = image_data_write,
+};
+
+static struct kobj_attribute validate_attribute =
+	__ATTR(validate_flash, 0600, validate_show, validate_store);
+
+static struct kobj_attribute manage_attribute =
+	__ATTR(manage_flash, 0600, manage_show, manage_store);
+
+static struct kobj_attribute update_attribute =
+	__ATTR(update_flash, 0600, update_show, update_store);
+
+static struct attribute *image_op_attrs[] = {
+	&validate_attribute.attr,
+	&manage_attribute.attr,
+	&update_attribute.attr,
+	NULL	/* need to NULL terminate the list of attributes */
+};
+
+static struct attribute_group image_op_attr_group = {
+	.attrs = image_op_attrs,
+};
+
+void __init opal_flash_update_init(void)
+{
+	int ret;
+
+	/* Allocate validate image buffer */
+	validate_flash_data.buf = kzalloc(VALIDATE_BUF_SIZE, GFP_KERNEL);
+	if (!validate_flash_data.buf) {
+		pr_err("%s : Failed to allocate memory\n", __func__);
+		return;
+	}
+
+	/* Make sure /sys/firmware/opal directory is created */
+	if (!opal_kobj) {
+		pr_warn("FLASH: opal kobject is not available\n");
+		goto nokobj;
+	}
+
+	/* Create the sysfs files */
+	ret = sysfs_create_group(opal_kobj, &image_op_attr_group);
+	if (ret) {
+		pr_warn("FLASH: Failed to create sysfs files\n");
+		goto nokobj;
+	}
+
+	ret = sysfs_create_bin_file(opal_kobj, &image_data_attr);
+	if (ret) {
+		pr_warn("FLASH: Failed to create sysfs files\n");
+		goto nosysfs_file;
+	}
+
+	/* Set default status */
+	validate_flash_data.status = FLASH_NO_OP;
+	manage_flash_data.status = FLASH_NO_OP;
+	update_flash_data.status = FLASH_NO_OP;
+	image_data.status = IMAGE_INVALID;
+	return;
+
+nosysfs_file:
+	sysfs_remove_group(opal_kobj, &image_op_attr_group);
+
+nokobj:
+	kfree(validate_flash_data.buf);
+	return;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c
new file mode 100644
index 000000000..3e1f064a1
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-hmi.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
+ *
+ * Copyright 2014 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include <asm/opal.h>
+#include <asm/cputable.h>
+#include <asm/machdep.h>
+
+#include "powernv.h"
+
+static int opal_hmi_handler_nb_init;
+struct OpalHmiEvtNode {
+	struct list_head list;
+	struct OpalHMIEvent hmi_evt;
+};
+
+struct xstop_reason {
+	uint32_t xstop_reason;
+	const char *unit_failed;
+	const char *description;
+};
+
+static LIST_HEAD(opal_hmi_evt_list);
+static DEFINE_SPINLOCK(opal_hmi_evt_lock);
+
+static void print_core_checkstop_reason(const char *level,
+					struct OpalHMIEvent *hmi_evt)
+{
+	int i;
+	static const struct xstop_reason xstop_reason[] = {
+		{ CORE_CHECKSTOP_IFU_REGFILE, "IFU",
+				"RegFile core check stop" },
+		{ CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
+		{ CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
+				"Core checkstop during recovery" },
+		{ CORE_CHECKSTOP_ISU_REGFILE, "ISU",
+				"RegFile core check stop (mapper error)" },
+		{ CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
+		{ CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
+		{ CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
+		{ CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
+				"Recovery in maintenance mode" },
+		{ CORE_CHECKSTOP_LSU_REGFILE, "LSU",
+				"RegFile core check stop" },
+		{ CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
+				"Forward Progress Error" },
+		{ CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
+		{ CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
+		{ CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
+				"Hypervisor Resource error - core check stop" },
+		{ CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
+				"Hang Recovery Failed (core check stop)" },
+		{ CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
+				"Ambiguous Hang Detected (unknown source)" },
+		{ CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
+				"Debug Trigger Error inject" },
+		{ CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
+				"Hypervisor check stop via SPRC/SPRD" },
+	};
+
+	/* Validity check */
+	if (!hmi_evt->u.xstop_error.xstop_reason) {
+		printk("%s	Unknown Core check stop.\n", level);
+		return;
+	}
+
+	printk("%s	CPU PIR: %08x\n", level,
+			be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
+	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
+		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
+					xstop_reason[i].xstop_reason)
+			printk("%s	[Unit: %-3s] %s\n", level,
+					xstop_reason[i].unit_failed,
+					xstop_reason[i].description);
+}
+
+static void print_nx_checkstop_reason(const char *level,
+					struct OpalHMIEvent *hmi_evt)
+{
+	int i;
+	static const struct xstop_reason xstop_reason[] = {
+		{ NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
+					"SHM invalid state error" },
+		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
+					"DMA invalid state error bit 15" },
+		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
+					"DMA invalid state error bit 16" },
+		{ NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
+					"Channel 0 invalid state error" },
+		{ NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
+					"Channel 1 invalid state error" },
+		{ NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
+					"Channel 2 invalid state error" },
+		{ NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
+					"Channel 3 invalid state error" },
+		{ NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
+					"Channel 4 invalid state error" },
+		{ NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
+					"Channel 5 invalid state error" },
+		{ NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
+					"Channel 6 invalid state error" },
+		{ NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
+					"Channel 7 invalid state error" },
+		{ NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
+					"UE error on CRB(CSB address, CCB)" },
+		{ NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
+					"SUE error on CRB(CSB address, CCB)" },
+		{ NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
+		"CRB Kill ISN received while holding ISN with UE error" },
+	};
+
+	/* Validity check */
+	if (!hmi_evt->u.xstop_error.xstop_reason) {
+		printk("%s	Unknown NX check stop.\n", level);
+		return;
+	}
+
+	printk("%s	NX checkstop on CHIP ID: %x\n", level,
+			be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
+	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
+		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
+					xstop_reason[i].xstop_reason)
+			printk("%s	[Unit: %-3s] %s\n", level,
+					xstop_reason[i].unit_failed,
+					xstop_reason[i].description);
+}
+
+static void print_npu_checkstop_reason(const char *level,
+					struct OpalHMIEvent *hmi_evt)
+{
+	uint8_t reason, reason_count, i;
+
+	/*
+	 * We may not have a checkstop reason on some combination of
+	 * hardware and/or skiboot version
+	 */
+	if (!hmi_evt->u.xstop_error.xstop_reason) {
+		printk("%s	NPU checkstop on chip %x\n", level,
+			be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
+		return;
+	}
+
+	/*
+	 * NPU2 has 3 FIRs. Reason encoded on a byte as:
+	 *   2 bits for the FIR number
+	 *   6 bits for the bit number
+	 * It may be possible to find several reasons.
+	 *
+	 * We don't display a specific message per FIR bit as there
+	 * are too many and most are meaningless without the workbook
+	 * and/or hw team help anyway.
+	 */
+	reason_count = sizeof(hmi_evt->u.xstop_error.xstop_reason) /
+		sizeof(reason);
+	for (i = 0; i < reason_count; i++) {
+		reason = (hmi_evt->u.xstop_error.xstop_reason >> (8 * i)) & 0xFF;
+		if (reason)
+			printk("%s	NPU checkstop on chip %x: FIR%d bit %d is set\n",
+				level,
+				be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id),
+				reason >> 6, reason & 0x3F);
+	}
+}
+
+static void print_checkstop_reason(const char *level,
+					struct OpalHMIEvent *hmi_evt)
+{
+	uint8_t type = hmi_evt->u.xstop_error.xstop_type;
+	switch (type) {
+	case CHECKSTOP_TYPE_CORE:
+		print_core_checkstop_reason(level, hmi_evt);
+		break;
+	case CHECKSTOP_TYPE_NX:
+		print_nx_checkstop_reason(level, hmi_evt);
+		break;
+	case CHECKSTOP_TYPE_NPU:
+		print_npu_checkstop_reason(level, hmi_evt);
+		break;
+	default:
+		printk("%s	Unknown Malfunction Alert of type %d\n",
+		       level, type);
+		break;
+	}
+}
+
+static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
+{
+	const char *level, *sevstr, *error_info;
+	static const char *hmi_error_types[] = {
+		"Malfunction Alert",
+		"Processor Recovery done",
+		"Processor recovery occurred again",
+		"Processor recovery occurred for masked error",
+		"Timer facility experienced an error",
+		"TFMR SPR is corrupted",
+		"UPS (Uninterrupted Power System) Overflow indication",
+		"An XSCOM operation failure",
+		"An XSCOM operation completed",
+		"SCOM has set a reserved FIR bit to cause recovery",
+		"Debug trigger has set a reserved FIR bit to cause recovery",
+		"A hypervisor resource error occurred",
+		"CAPP recovery process is in progress",
+	};
+
+	/* Print things out */
+	if (hmi_evt->version < OpalHMIEvt_V1) {
+		pr_err("HMI Interrupt, Unknown event version %d !\n",
+			hmi_evt->version);
+		return;
+	}
+	switch (hmi_evt->severity) {
+	case OpalHMI_SEV_NO_ERROR:
+		level = KERN_INFO;
+		sevstr = "Harmless";
+		break;
+	case OpalHMI_SEV_WARNING:
+		level = KERN_WARNING;
+		sevstr = "";
+		break;
+	case OpalHMI_SEV_ERROR_SYNC:
+		level = KERN_ERR;
+		sevstr = "Severe";
+		break;
+	case OpalHMI_SEV_FATAL:
+	default:
+		level = KERN_ERR;
+		sevstr = "Fatal";
+		break;
+	}
+
+	printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
+		level, sevstr,
+		hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
+		"Recovered" : "Not recovered");
+	error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
+			hmi_error_types[hmi_evt->type]
+			: "Unknown";
+	printk("%s Error detail: %s\n", level, error_info);
+	printk("%s	HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer));
+	if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
+		(hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
+		printk("%s	TFMR: %016llx\n", level,
+						be64_to_cpu(hmi_evt->tfmr));
+
+	if (hmi_evt->version < OpalHMIEvt_V2)
+		return;
+
+	/* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
+	if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
+		print_checkstop_reason(level, hmi_evt);
+}
+
+static void hmi_event_handler(struct work_struct *work)
+{
+	unsigned long flags;
+	struct OpalHMIEvent *hmi_evt;
+	struct OpalHmiEvtNode *msg_node;
+	uint8_t disposition;
+	struct opal_msg msg;
+	int unrecoverable = 0;
+
+	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
+	while (!list_empty(&opal_hmi_evt_list)) {
+		msg_node = list_entry(opal_hmi_evt_list.next,
+					   struct OpalHmiEvtNode, list);
+		list_del(&msg_node->list);
+		spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
+
+		hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
+		print_hmi_event_info(hmi_evt);
+		disposition = hmi_evt->disposition;
+		kfree(msg_node);
+
+		/*
+		 * Check if HMI event has been recovered or not. If not
+		 * then kernel can't continue, we need to panic.
+		 * But before we do that, display all the HMI event
+		 * available on the list and set unrecoverable flag to 1.
+		 */
+		if (disposition != OpalHMI_DISPOSITION_RECOVERED)
+			unrecoverable = 1;
+
+		spin_lock_irqsave(&opal_hmi_evt_lock, flags);
+	}
+	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
+
+	if (unrecoverable) {
+		/* Pull all HMI events from OPAL before we panic. */
+		while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
+			u32 type;
+
+			type = be32_to_cpu(msg.msg_type);
+
+			/* skip if not HMI event */
+			if (type != OPAL_MSG_HMI_EVT)
+				continue;
+
+			/* HMI event info starts from param[0] */
+			hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
+			print_hmi_event_info(hmi_evt);
+		}
+
+		pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception");
+	}
+}
+
+static DECLARE_WORK(hmi_event_work, hmi_event_handler);
+/*
+ * opal_handle_hmi_event - notifier handler that queues up HMI events
+ * to be preocessed later.
+ */
+static int opal_handle_hmi_event(struct notifier_block *nb,
+			  unsigned long msg_type, void *msg)
+{
+	unsigned long flags;
+	struct OpalHMIEvent *hmi_evt;
+	struct opal_msg *hmi_msg = msg;
+	struct OpalHmiEvtNode *msg_node;
+
+	/* Sanity Checks */
+	if (msg_type != OPAL_MSG_HMI_EVT)
+		return 0;
+
+	/* HMI event info starts from param[0] */
+	hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
+
+	/* Delay the logging of HMI events to workqueue. */
+	msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
+	if (!msg_node) {
+		pr_err("HMI: out of memory, Opal message event not handled\n");
+		return -ENOMEM;
+	}
+	memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(*hmi_evt));
+
+	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
+	list_add(&msg_node->list, &opal_hmi_evt_list);
+	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
+
+	schedule_work(&hmi_event_work);
+	return 0;
+}
+
+static struct notifier_block opal_hmi_handler_nb = {
+	.notifier_call	= opal_handle_hmi_event,
+	.next		= NULL,
+	.priority	= 0,
+};
+
+int __init opal_hmi_handler_init(void)
+{
+	int ret;
+
+	if (!opal_hmi_handler_nb_init) {
+		ret = opal_message_notifier_register(
+				OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
+		if (ret) {
+			pr_err("%s: Can't register OPAL event notifier (%d)\n",
+			       __func__, ret);
+			return ret;
+		}
+		opal_hmi_handler_nb_init = 1;
+	}
+	return 0;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
new file mode 100644
index 000000000..7824cc364
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -0,0 +1,325 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * OPAL IMC interface detection driver
+ * Supported on POWERNV platform
+ *
+ * Copyright	(C) 2017 Madhavan Srinivasan, IBM Corporation.
+ *		(C) 2017 Anju T Sudhakar, IBM Corporation.
+ *		(C) 2017 Hemant K Shaw, IBM Corporation.
+ */
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+#include <linux/crash_dump.h>
+#include <asm/opal.h>
+#include <asm/io.h>
+#include <asm/imc-pmu.h>
+#include <asm/cputhreads.h>
+#include <asm/debugfs.h>
+
+static struct dentry *imc_debugfs_parent;
+
+/* Helpers to export imc command and mode via debugfs */
+static int imc_mem_get(void *data, u64 *val)
+{
+	*val = cpu_to_be64(*(u64 *)data);
+	return 0;
+}
+
+static int imc_mem_set(void *data, u64 val)
+{
+	*(u64 *)data = cpu_to_be64(val);
+	return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(fops_imc_x64, imc_mem_get, imc_mem_set, "0x%016llx\n");
+
+static void imc_debugfs_create_x64(const char *name, umode_t mode,
+				   struct dentry *parent, u64  *value)
+{
+	debugfs_create_file_unsafe(name, mode, parent, value, &fops_imc_x64);
+}
+
+/*
+ * export_imc_mode_and_cmd: Create a debugfs interface
+ *                     for imc_cmd and imc_mode
+ *                     for each node in the system.
+ *  imc_mode and imc_cmd can be changed by echo into
+ *  this interface.
+ */
+static void export_imc_mode_and_cmd(struct device_node *node,
+				    struct imc_pmu *pmu_ptr)
+{
+	static u64 loc, *imc_mode_addr, *imc_cmd_addr;
+	char mode[16], cmd[16];
+	u32 cb_offset;
+	struct imc_mem_info *ptr = pmu_ptr->mem_info;
+
+	imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);
+
+	if (of_property_read_u32(node, "cb_offset", &cb_offset))
+		cb_offset = IMC_CNTL_BLK_OFFSET;
+
+	while (ptr->vbase != NULL) {
+		loc = (u64)(ptr->vbase) + cb_offset;
+		imc_mode_addr = (u64 *)(loc + IMC_CNTL_BLK_MODE_OFFSET);
+		sprintf(mode, "imc_mode_%d", (u32)(ptr->id));
+		imc_debugfs_create_x64(mode, 0600, imc_debugfs_parent,
+				       imc_mode_addr);
+
+		imc_cmd_addr = (u64 *)(loc + IMC_CNTL_BLK_CMD_OFFSET);
+		sprintf(cmd, "imc_cmd_%d", (u32)(ptr->id));
+		imc_debugfs_create_x64(cmd, 0600, imc_debugfs_parent,
+				       imc_cmd_addr);
+		ptr++;
+	}
+}
+
+/*
+ * imc_get_mem_addr_nest: Function to get nest counter memory region
+ * for each chip
+ */
+static int imc_get_mem_addr_nest(struct device_node *node,
+				 struct imc_pmu *pmu_ptr,
+				 u32 offset)
+{
+	int nr_chips = 0, i;
+	u64 *base_addr_arr, baddr;
+	u32 *chipid_arr;
+
+	nr_chips = of_property_count_u32_elems(node, "chip-id");
+	if (nr_chips <= 0)
+		return -ENODEV;
+
+	base_addr_arr = kcalloc(nr_chips, sizeof(*base_addr_arr), GFP_KERNEL);
+	if (!base_addr_arr)
+		return -ENOMEM;
+
+	chipid_arr = kcalloc(nr_chips, sizeof(*chipid_arr), GFP_KERNEL);
+	if (!chipid_arr) {
+		kfree(base_addr_arr);
+		return -ENOMEM;
+	}
+
+	if (of_property_read_u32_array(node, "chip-id", chipid_arr, nr_chips))
+		goto error;
+
+	if (of_property_read_u64_array(node, "base-addr", base_addr_arr,
+								nr_chips))
+		goto error;
+
+	pmu_ptr->mem_info = kcalloc(nr_chips + 1, sizeof(*pmu_ptr->mem_info),
+				    GFP_KERNEL);
+	if (!pmu_ptr->mem_info)
+		goto error;
+
+	for (i = 0; i < nr_chips; i++) {
+		pmu_ptr->mem_info[i].id = chipid_arr[i];
+		baddr = base_addr_arr[i] + offset;
+		pmu_ptr->mem_info[i].vbase = phys_to_virt(baddr);
+	}
+
+	pmu_ptr->imc_counter_mmaped = true;
+	kfree(base_addr_arr);
+	kfree(chipid_arr);
+	return 0;
+
+error:
+	kfree(base_addr_arr);
+	kfree(chipid_arr);
+	return -1;
+}
+
+/*
+ * imc_pmu_create : Takes the parent device which is the pmu unit, pmu_index
+ *		    and domain as the inputs.
+ * Allocates memory for the struct imc_pmu, sets up its domain, size and offsets
+ */
+static struct imc_pmu *imc_pmu_create(struct device_node *parent, int pmu_index, int domain)
+{
+	int ret = 0;
+	struct imc_pmu *pmu_ptr;
+	u32 offset;
+
+	/* Return for unknown domain */
+	if (domain < 0)
+		return NULL;
+
+	/* memory for pmu */
+	pmu_ptr = kzalloc(sizeof(*pmu_ptr), GFP_KERNEL);
+	if (!pmu_ptr)
+		return NULL;
+
+	/* Set the domain */
+	pmu_ptr->domain = domain;
+
+	ret = of_property_read_u32(parent, "size", &pmu_ptr->counter_mem_size);
+	if (ret)
+		goto free_pmu;
+
+	if (!of_property_read_u32(parent, "offset", &offset)) {
+		if (imc_get_mem_addr_nest(parent, pmu_ptr, offset))
+			goto free_pmu;
+	}
+
+	/* Function to register IMC pmu */
+	ret = init_imc_pmu(parent, pmu_ptr, pmu_index);
+	if (ret) {
+		pr_err("IMC PMU %s Register failed\n", pmu_ptr->pmu.name);
+		kfree(pmu_ptr->pmu.name);
+		if (pmu_ptr->domain == IMC_DOMAIN_NEST)
+			kfree(pmu_ptr->mem_info);
+		kfree(pmu_ptr);
+		return NULL;
+	}
+
+	return pmu_ptr;
+
+free_pmu:
+	kfree(pmu_ptr);
+	return NULL;
+}
+
+static void disable_nest_pmu_counters(void)
+{
+	int nid, cpu;
+	const struct cpumask *l_cpumask;
+
+	get_online_cpus();
+	for_each_node_with_cpus(nid) {
+		l_cpumask = cpumask_of_node(nid);
+		cpu = cpumask_first_and(l_cpumask, cpu_online_mask);
+		if (cpu >= nr_cpu_ids)
+			continue;
+		opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST,
+				       get_hard_smp_processor_id(cpu));
+	}
+	put_online_cpus();
+}
+
+static void disable_core_pmu_counters(void)
+{
+	cpumask_t cores_map;
+	int cpu, rc;
+
+	get_online_cpus();
+	/* Disable the IMC Core functions */
+	cores_map = cpu_online_cores_map();
+	for_each_cpu(cpu, &cores_map) {
+		rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE,
+					    get_hard_smp_processor_id(cpu));
+		if (rc)
+			pr_err("%s: Failed to stop Core (cpu = %d)\n",
+				__FUNCTION__, cpu);
+	}
+	put_online_cpus();
+}
+
+int get_max_nest_dev(void)
+{
+	struct device_node *node;
+	u32 pmu_units = 0, type;
+
+	for_each_compatible_node(node, NULL, IMC_DTB_UNIT_COMPAT) {
+		if (of_property_read_u32(node, "type", &type))
+			continue;
+
+		if (type == IMC_TYPE_CHIP)
+			pmu_units++;
+	}
+
+	return pmu_units;
+}
+
+static int opal_imc_counters_probe(struct platform_device *pdev)
+{
+	struct device_node *imc_dev = pdev->dev.of_node;
+	struct imc_pmu *pmu;
+	int pmu_count = 0, domain;
+	bool core_imc_reg = false, thread_imc_reg = false;
+	u32 type;
+
+	/*
+	 * Check whether this is kdump kernel. If yes, force the engines to
+	 * stop and return.
+	 */
+	if (is_kdump_kernel()) {
+		disable_nest_pmu_counters();
+		disable_core_pmu_counters();
+		return -ENODEV;
+	}
+
+	for_each_compatible_node(imc_dev, NULL, IMC_DTB_UNIT_COMPAT) {
+		pmu = NULL;
+		if (of_property_read_u32(imc_dev, "type", &type)) {
+			pr_warn("IMC Device without type property\n");
+			continue;
+		}
+
+		switch (type) {
+		case IMC_TYPE_CHIP:
+			domain = IMC_DOMAIN_NEST;
+			break;
+		case IMC_TYPE_CORE:
+			domain =IMC_DOMAIN_CORE;
+			break;
+		case IMC_TYPE_THREAD:
+			domain = IMC_DOMAIN_THREAD;
+			break;
+		case IMC_TYPE_TRACE:
+			domain = IMC_DOMAIN_TRACE;
+			break;
+		default:
+			pr_warn("IMC Unknown Device type \n");
+			domain = -1;
+			break;
+		}
+
+		pmu = imc_pmu_create(imc_dev, pmu_count, domain);
+		if (pmu != NULL) {
+			if (domain == IMC_DOMAIN_NEST) {
+				if (!imc_debugfs_parent)
+					export_imc_mode_and_cmd(imc_dev, pmu);
+				pmu_count++;
+			}
+			if (domain == IMC_DOMAIN_CORE)
+				core_imc_reg = true;
+			if (domain == IMC_DOMAIN_THREAD)
+				thread_imc_reg = true;
+		}
+	}
+
+	/* If core imc is not registered, unregister thread-imc */
+	if (!core_imc_reg && thread_imc_reg)
+		unregister_thread_imc();
+
+	return 0;
+}
+
+static void opal_imc_counters_shutdown(struct platform_device *pdev)
+{
+	/*
+	 * Function only stops the engines which is bare minimum.
+	 * TODO: Need to handle proper memory cleanup and pmu
+	 * unregister.
+	 */
+	disable_nest_pmu_counters();
+	disable_core_pmu_counters();
+}
+
+static const struct of_device_id opal_imc_match[] = {
+	{ .compatible = IMC_DTB_COMPAT },
+	{},
+};
+
+static struct platform_driver opal_imc_driver = {
+	.driver = {
+		.name = "opal-imc-counters",
+		.of_match_table = opal_imc_match,
+	},
+	.probe = opal_imc_counters_probe,
+	.shutdown = opal_imc_counters_shutdown,
+};
+
+builtin_platform_driver(opal_imc_driver);
diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c
new file mode 100644
index 000000000..dcec0f760
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-irqchip.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * This file implements an irqchip for OPAL events. Whenever there is
+ * an interrupt that is handled by OPAL we get passed a list of events
+ * that Linux needs to do something about. These basically look like
+ * interrupts to Linux so we implement an irqchip to handle them.
+ *
+ * Copyright Alistair Popple, IBM Corporation 2014.
+ */
+#include <linux/bitops.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqdomain.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/of_irq.h>
+
+#include <asm/machdep.h>
+#include <asm/opal.h>
+
+#include "powernv.h"
+
+/* Maximum number of events supported by OPAL firmware */
+#define MAX_NUM_EVENTS 64
+
+struct opal_event_irqchip {
+	struct irq_chip irqchip;
+	struct irq_domain *domain;
+	unsigned long mask;
+};
+static struct opal_event_irqchip opal_event_irqchip;
+static u64 last_outstanding_events;
+static int opal_irq_count;
+static struct resource *opal_irqs;
+
+void opal_handle_events(void)
+{
+	__be64 events = 0;
+	u64 e;
+
+	e = READ_ONCE(last_outstanding_events) & opal_event_irqchip.mask;
+again:
+	while (e) {
+		int virq, hwirq;
+
+		hwirq = fls64(e) - 1;
+		e &= ~BIT_ULL(hwirq);
+
+		local_irq_disable();
+		virq = irq_find_mapping(opal_event_irqchip.domain, hwirq);
+		if (virq) {
+			irq_enter();
+			generic_handle_irq(virq);
+			irq_exit();
+		}
+		local_irq_enable();
+
+		cond_resched();
+	}
+	last_outstanding_events = 0;
+	if (opal_poll_events(&events) != OPAL_SUCCESS)
+		return;
+	e = be64_to_cpu(events) & opal_event_irqchip.mask;
+	if (e)
+		goto again;
+}
+
+bool opal_have_pending_events(void)
+{
+	if (last_outstanding_events & opal_event_irqchip.mask)
+		return true;
+	return false;
+}
+
+static void opal_event_mask(struct irq_data *d)
+{
+	clear_bit(d->hwirq, &opal_event_irqchip.mask);
+}
+
+static void opal_event_unmask(struct irq_data *d)
+{
+	set_bit(d->hwirq, &opal_event_irqchip.mask);
+	if (opal_have_pending_events())
+		opal_wake_poller();
+}
+
+static int opal_event_set_type(struct irq_data *d, unsigned int flow_type)
+{
+	/*
+	 * For now we only support level triggered events. The irq
+	 * handler will be called continuously until the event has
+	 * been cleared in OPAL.
+	 */
+	if (flow_type != IRQ_TYPE_LEVEL_HIGH)
+		return -EINVAL;
+
+	return 0;
+}
+
+static struct opal_event_irqchip opal_event_irqchip = {
+	.irqchip = {
+		.name = "OPAL EVT",
+		.irq_mask = opal_event_mask,
+		.irq_unmask = opal_event_unmask,
+		.irq_set_type = opal_event_set_type,
+	},
+	.mask = 0,
+};
+
+static int opal_event_map(struct irq_domain *d, unsigned int irq,
+			irq_hw_number_t hwirq)
+{
+	irq_set_chip_data(irq, &opal_event_irqchip);
+	irq_set_chip_and_handler(irq, &opal_event_irqchip.irqchip,
+				handle_level_irq);
+
+	return 0;
+}
+
+static irqreturn_t opal_interrupt(int irq, void *data)
+{
+	__be64 events;
+
+	opal_handle_interrupt(virq_to_hw(irq), &events);
+	last_outstanding_events = be64_to_cpu(events);
+	if (opal_have_pending_events())
+		opal_wake_poller();
+
+	return IRQ_HANDLED;
+}
+
+static int opal_event_match(struct irq_domain *h, struct device_node *node,
+			    enum irq_domain_bus_token bus_token)
+{
+	return irq_domain_get_of_node(h) == node;
+}
+
+static int opal_event_xlate(struct irq_domain *h, struct device_node *np,
+			   const u32 *intspec, unsigned int intsize,
+			   irq_hw_number_t *out_hwirq, unsigned int *out_flags)
+{
+	*out_hwirq = intspec[0];
+	*out_flags = IRQ_TYPE_LEVEL_HIGH;
+
+	return 0;
+}
+
+static const struct irq_domain_ops opal_event_domain_ops = {
+	.match	= opal_event_match,
+	.map	= opal_event_map,
+	.xlate	= opal_event_xlate,
+};
+
+void opal_event_shutdown(void)
+{
+	unsigned int i;
+
+	/* First free interrupts, which will also mask them */
+	for (i = 0; i < opal_irq_count; i++) {
+		if (!opal_irqs || !opal_irqs[i].start)
+			continue;
+
+		if (in_interrupt() || irqs_disabled())
+			disable_irq_nosync(opal_irqs[i].start);
+		else
+			free_irq(opal_irqs[i].start, NULL);
+
+		opal_irqs[i].start = 0;
+	}
+}
+
+int __init opal_event_init(void)
+{
+	struct device_node *dn, *opal_node;
+	bool old_style = false;
+	int i, rc = 0;
+
+	opal_node = of_find_node_by_path("/ibm,opal");
+	if (!opal_node) {
+		pr_warn("opal: Node not found\n");
+		return -ENODEV;
+	}
+
+	/* If dn is NULL it means the domain won't be linked to a DT
+	 * node so therefore irq_of_parse_and_map(...) wont work. But
+	 * that shouldn't be problem because if we're running a
+	 * version of skiboot that doesn't have the dn then the
+	 * devices won't have the correct properties and will have to
+	 * fall back to the legacy method (opal_event_request(...))
+	 * anyway. */
+	dn = of_find_compatible_node(NULL, NULL, "ibm,opal-event");
+	opal_event_irqchip.domain = irq_domain_add_linear(dn, MAX_NUM_EVENTS,
+				&opal_event_domain_ops, &opal_event_irqchip);
+	of_node_put(dn);
+	if (!opal_event_irqchip.domain) {
+		pr_warn("opal: Unable to create irq domain\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* Look for new-style (standard) "interrupts" property */
+	opal_irq_count = of_irq_count(opal_node);
+
+	/* Absent ? Look for the old one */
+	if (opal_irq_count < 1) {
+		/* Get opal-interrupts property and names if present */
+		rc = of_property_count_u32_elems(opal_node, "opal-interrupts");
+		if (rc > 0)
+			opal_irq_count = rc;
+		old_style = true;
+	}
+
+	/* No interrupts ? Bail out */
+	if (!opal_irq_count)
+		goto out;
+
+	pr_debug("OPAL: Found %d interrupts reserved for OPAL using %s scheme\n",
+		 opal_irq_count, old_style ? "old" : "new");
+
+	/* Allocate an IRQ resources array */
+	opal_irqs = kcalloc(opal_irq_count, sizeof(struct resource), GFP_KERNEL);
+	if (WARN_ON(!opal_irqs)) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* Build the resources array */
+	if (old_style) {
+		/* Old style "opal-interrupts" property */
+		for (i = 0; i < opal_irq_count; i++) {
+			struct resource *r = &opal_irqs[i];
+			const char *name = NULL;
+			u32 hw_irq;
+			int virq;
+
+			rc = of_property_read_u32_index(opal_node, "opal-interrupts",
+							i, &hw_irq);
+			if (WARN_ON(rc < 0)) {
+				opal_irq_count = i;
+				break;
+			}
+			of_property_read_string_index(opal_node, "opal-interrupts-names",
+						      i, &name);
+			virq = irq_create_mapping(NULL, hw_irq);
+			if (!virq) {
+				pr_warn("Failed to map OPAL irq 0x%x\n", hw_irq);
+				continue;
+			}
+			r->start = r->end = virq;
+			r->flags = IORESOURCE_IRQ | IRQ_TYPE_LEVEL_LOW;
+			r->name = name;
+		}
+	} else {
+		/* new style standard "interrupts" property */
+		rc = of_irq_to_resource_table(opal_node, opal_irqs, opal_irq_count);
+		if (WARN_ON(rc < 0)) {
+			opal_irq_count = 0;
+			kfree(opal_irqs);
+			goto out;
+		}
+		if (WARN_ON(rc < opal_irq_count))
+			opal_irq_count = rc;
+	}
+
+	/* Install interrupt handlers */
+	for (i = 0; i < opal_irq_count; i++) {
+		struct resource *r = &opal_irqs[i];
+		const char *name;
+
+		/* Prefix name */
+		if (r->name && strlen(r->name))
+			name = kasprintf(GFP_KERNEL, "opal-%s", r->name);
+		else
+			name = kasprintf(GFP_KERNEL, "opal");
+
+		if (!name)
+			continue;
+		/* Install interrupt handler */
+		rc = request_irq(r->start, opal_interrupt, r->flags & IRQD_TRIGGER_MASK,
+				 name, NULL);
+		if (rc) {
+			pr_warn("Error %d requesting OPAL irq %d\n", rc, (int)r->start);
+			continue;
+		}
+	}
+	rc = 0;
+ out:
+	of_node_put(opal_node);
+	return rc;
+}
+machine_arch_initcall(powernv, opal_event_init);
+
+/**
+ * opal_event_request(unsigned int opal_event_nr) - Request an event
+ * @opal_event_nr: the opal event number to request
+ *
+ * This routine can be used to find the linux virq number which can
+ * then be passed to request_irq to assign a handler for a particular
+ * opal event. This should only be used by legacy devices which don't
+ * have proper device tree bindings. Most devices should use
+ * irq_of_parse_and_map() instead.
+ */
+int opal_event_request(unsigned int opal_event_nr)
+{
+	if (WARN_ON_ONCE(!opal_event_irqchip.domain))
+		return 0;
+
+	return irq_create_mapping(opal_event_irqchip.domain, opal_event_nr);
+}
+EXPORT_SYMBOL(opal_event_request);
diff --git a/arch/powerpc/platforms/powernv/opal-kmsg.c b/arch/powerpc/platforms/powernv/opal-kmsg.c
new file mode 100644
index 000000000..6c3bc4b4d
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-kmsg.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * kmsg dumper that ensures the OPAL console fully flushes panic messages
+ *
+ * Author: Russell Currey <ruscur@russell.cc>
+ *
+ * Copyright 2015 IBM Corporation.
+ */
+
+#include <linux/kmsg_dump.h>
+
+#include <asm/opal.h>
+#include <asm/opal-api.h>
+
+/*
+ * Console output is controlled by OPAL firmware.  The kernel regularly calls
+ * OPAL_POLL_EVENTS, which flushes some console output.  In a panic state,
+ * however, the kernel no longer calls OPAL_POLL_EVENTS and the panic message
+ * may not be completely printed.  This function does not actually dump the
+ * message, it just ensures that OPAL completely flushes the console buffer.
+ */
+static void kmsg_dump_opal_console_flush(struct kmsg_dumper *dumper,
+				     enum kmsg_dump_reason reason)
+{
+	/*
+	 * Outside of a panic context the pollers will continue to run,
+	 * so we don't need to do any special flushing.
+	 */
+	if (reason != KMSG_DUMP_PANIC)
+		return;
+
+	opal_flush_console(0);
+}
+
+static struct kmsg_dumper opal_kmsg_dumper = {
+	.dump = kmsg_dump_opal_console_flush
+};
+
+void __init opal_kmsg_init(void)
+{
+	int rc;
+
+	/* Add our dumper to the list */
+	rc = kmsg_dump_register(&opal_kmsg_dumper);
+	if (rc != 0)
+		pr_err("opal: kmsg_dump_register failed; returned %d\n", rc);
+}
diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c
new file mode 100644
index 000000000..123a0e799
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-lpc.c
@@ -0,0 +1,418 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerNV LPC bus handling.
+ *
+ * Copyright 2013 IBM Corp.
+ */
+
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/bug.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/opal.h>
+#include <asm/prom.h>
+#include <linux/uaccess.h>
+#include <asm/debugfs.h>
+#include <asm/isa-bridge.h>
+
+static int opal_lpc_chip_id = -1;
+
+static u8 opal_lpc_inb(unsigned long port)
+{
+	int64_t rc;
+	__be32 data;
+
+	if (opal_lpc_chip_id < 0 || port > 0xffff)
+		return 0xff;
+	rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 1);
+	return rc ? 0xff : be32_to_cpu(data);
+}
+
+static __le16 __opal_lpc_inw(unsigned long port)
+{
+	int64_t rc;
+	__be32 data;
+
+	if (opal_lpc_chip_id < 0 || port > 0xfffe)
+		return 0xffff;
+	if (port & 1)
+		return (__le16)opal_lpc_inb(port) << 8 | opal_lpc_inb(port + 1);
+	rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 2);
+	return rc ? 0xffff : be32_to_cpu(data);
+}
+static u16 opal_lpc_inw(unsigned long port)
+{
+	return le16_to_cpu(__opal_lpc_inw(port));
+}
+
+static __le32 __opal_lpc_inl(unsigned long port)
+{
+	int64_t rc;
+	__be32 data;
+
+	if (opal_lpc_chip_id < 0 || port > 0xfffc)
+		return 0xffffffff;
+	if (port & 3)
+		return (__le32)opal_lpc_inb(port    ) << 24 |
+		       (__le32)opal_lpc_inb(port + 1) << 16 |
+		       (__le32)opal_lpc_inb(port + 2) <<  8 |
+			       opal_lpc_inb(port + 3);
+	rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 4);
+	return rc ? 0xffffffff : be32_to_cpu(data);
+}
+
+static u32 opal_lpc_inl(unsigned long port)
+{
+	return le32_to_cpu(__opal_lpc_inl(port));
+}
+
+static void opal_lpc_outb(u8 val, unsigned long port)
+{
+	if (opal_lpc_chip_id < 0 || port > 0xffff)
+		return;
+	opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 1);
+}
+
+static void __opal_lpc_outw(__le16 val, unsigned long port)
+{
+	if (opal_lpc_chip_id < 0 || port > 0xfffe)
+		return;
+	if (port & 1) {
+		opal_lpc_outb(val >> 8, port);
+		opal_lpc_outb(val     , port + 1);
+		return;
+	}
+	opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 2);
+}
+
+static void opal_lpc_outw(u16 val, unsigned long port)
+{
+	__opal_lpc_outw(cpu_to_le16(val), port);
+}
+
+static void __opal_lpc_outl(__le32 val, unsigned long port)
+{
+	if (opal_lpc_chip_id < 0 || port > 0xfffc)
+		return;
+	if (port & 3) {
+		opal_lpc_outb(val >> 24, port);
+		opal_lpc_outb(val >> 16, port + 1);
+		opal_lpc_outb(val >>  8, port + 2);
+		opal_lpc_outb(val      , port + 3);
+		return;
+	}
+	opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 4);
+}
+
+static void opal_lpc_outl(u32 val, unsigned long port)
+{
+	__opal_lpc_outl(cpu_to_le32(val), port);
+}
+
+static void opal_lpc_insb(unsigned long p, void *b, unsigned long c)
+{
+	u8 *ptr = b;
+
+	while(c--)
+		*(ptr++) = opal_lpc_inb(p);
+}
+
+static void opal_lpc_insw(unsigned long p, void *b, unsigned long c)
+{
+	__le16 *ptr = b;
+
+	while(c--)
+		*(ptr++) = __opal_lpc_inw(p);
+}
+
+static void opal_lpc_insl(unsigned long p, void *b, unsigned long c)
+{
+	__le32 *ptr = b;
+
+	while(c--)
+		*(ptr++) = __opal_lpc_inl(p);
+}
+
+static void opal_lpc_outsb(unsigned long p, const void *b, unsigned long c)
+{
+	const u8 *ptr = b;
+
+	while(c--)
+		opal_lpc_outb(*(ptr++), p);
+}
+
+static void opal_lpc_outsw(unsigned long p, const void *b, unsigned long c)
+{
+	const __le16 *ptr = b;
+
+	while(c--)
+		__opal_lpc_outw(*(ptr++), p);
+}
+
+static void opal_lpc_outsl(unsigned long p, const void *b, unsigned long c)
+{
+	const __le32 *ptr = b;
+
+	while(c--)
+		__opal_lpc_outl(*(ptr++), p);
+}
+
+static const struct ppc_pci_io opal_lpc_io = {
+	.inb	= opal_lpc_inb,
+	.inw	= opal_lpc_inw,
+	.inl	= opal_lpc_inl,
+	.outb	= opal_lpc_outb,
+	.outw	= opal_lpc_outw,
+	.outl	= opal_lpc_outl,
+	.insb	= opal_lpc_insb,
+	.insw	= opal_lpc_insw,
+	.insl	= opal_lpc_insl,
+	.outsb	= opal_lpc_outsb,
+	.outsw	= opal_lpc_outsw,
+	.outsl	= opal_lpc_outsl,
+};
+
+#ifdef CONFIG_DEBUG_FS
+struct lpc_debugfs_entry {
+	enum OpalLPCAddressType lpc_type;
+};
+
+static ssize_t lpc_debug_read(struct file *filp, char __user *ubuf,
+			      size_t count, loff_t *ppos)
+{
+	struct lpc_debugfs_entry *lpc = filp->private_data;
+	u32 data, pos, len, todo;
+	int rc;
+
+	if (!access_ok(ubuf, count))
+		return -EFAULT;
+
+	todo = count;
+	while (todo) {
+		pos = *ppos;
+
+		/*
+		 * Select access size based on count and alignment and
+		 * access type. IO and MEM only support byte acceses,
+		 * FW supports all 3.
+		 */
+		len = 1;
+		if (lpc->lpc_type == OPAL_LPC_FW) {
+			if (todo > 3 && (pos & 3) == 0)
+				len = 4;
+			else if (todo > 1 && (pos & 1) == 0)
+				len = 2;
+		}
+		rc = opal_lpc_read(opal_lpc_chip_id, lpc->lpc_type, pos,
+				   &data, len);
+		if (rc)
+			return -ENXIO;
+
+		/*
+		 * Now there is some trickery with the data returned by OPAL
+		 * as it's the desired data right justified in a 32-bit BE
+		 * word.
+		 *
+		 * This is a very bad interface and I'm to blame for it :-(
+		 *
+		 * So we can't just apply a 32-bit swap to what comes from OPAL,
+		 * because user space expects the *bytes* to be in their proper
+		 * respective positions (ie, LPC position).
+		 *
+		 * So what we really want to do here is to shift data right
+		 * appropriately on a LE kernel.
+		 *
+		 * IE. If the LPC transaction has bytes B0, B1, B2 and B3 in that
+		 * order, we have in memory written to by OPAL at the "data"
+		 * pointer:
+		 *
+		 *               Bytes:      OPAL "data"   LE "data"
+		 *   32-bit:   B0 B1 B2 B3   B0B1B2B3      B3B2B1B0
+		 *   16-bit:   B0 B1         0000B0B1      B1B00000
+		 *    8-bit:   B0            000000B0      B0000000
+		 *
+		 * So a BE kernel will have the leftmost of the above in the MSB
+		 * and rightmost in the LSB and can just then "cast" the u32 "data"
+		 * down to the appropriate quantity and write it.
+		 *
+		 * However, an LE kernel can't. It doesn't need to swap because a
+		 * load from data followed by a store to user are going to preserve
+		 * the byte ordering which is the wire byte order which is what the
+		 * user wants, but in order to "crop" to the right size, we need to
+		 * shift right first.
+		 */
+		switch(len) {
+		case 4:
+			rc = __put_user((u32)data, (u32 __user *)ubuf);
+			break;
+		case 2:
+#ifdef __LITTLE_ENDIAN__
+			data >>= 16;
+#endif
+			rc = __put_user((u16)data, (u16 __user *)ubuf);
+			break;
+		default:
+#ifdef __LITTLE_ENDIAN__
+			data >>= 24;
+#endif
+			rc = __put_user((u8)data, (u8 __user *)ubuf);
+			break;
+		}
+		if (rc)
+			return -EFAULT;
+		*ppos += len;
+		ubuf += len;
+		todo -= len;
+	}
+
+	return count;
+}
+
+static ssize_t lpc_debug_write(struct file *filp, const char __user *ubuf,
+			       size_t count, loff_t *ppos)
+{
+	struct lpc_debugfs_entry *lpc = filp->private_data;
+	u32 data, pos, len, todo;
+	int rc;
+
+	if (!access_ok(ubuf, count))
+		return -EFAULT;
+
+	todo = count;
+	while (todo) {
+		pos = *ppos;
+
+		/*
+		 * Select access size based on count and alignment and
+		 * access type. IO and MEM only support byte acceses,
+		 * FW supports all 3.
+		 */
+		len = 1;
+		if (lpc->lpc_type == OPAL_LPC_FW) {
+			if (todo > 3 && (pos & 3) == 0)
+				len = 4;
+			else if (todo > 1 && (pos & 1) == 0)
+				len = 2;
+		}
+
+		/*
+		 * Similarly to the read case, we have some trickery here but
+		 * it's different to handle. We need to pass the value to OPAL in
+		 * a register whose layout depends on the access size. We want
+		 * to reproduce the memory layout of the user, however we aren't
+		 * doing a load from user and a store to another memory location
+		 * which would achieve that. Here we pass the value to OPAL via
+		 * a register which is expected to contain the "BE" interpretation
+		 * of the byte sequence. IE: for a 32-bit access, byte 0 should be
+		 * in the MSB. So here we *do* need to byteswap on LE.
+		 *
+		 *           User bytes:    LE "data"  OPAL "data"
+		 *  32-bit:  B0 B1 B2 B3    B3B2B1B0   B0B1B2B3
+		 *  16-bit:  B0 B1          0000B1B0   0000B0B1
+		 *   8-bit:  B0             000000B0   000000B0
+		 */
+		switch(len) {
+		case 4:
+			rc = __get_user(data, (u32 __user *)ubuf);
+			data = cpu_to_be32(data);
+			break;
+		case 2:
+			rc = __get_user(data, (u16 __user *)ubuf);
+			data = cpu_to_be16(data);
+			break;
+		default:
+			rc = __get_user(data, (u8 __user *)ubuf);
+			break;
+		}
+		if (rc)
+			return -EFAULT;
+
+		rc = opal_lpc_write(opal_lpc_chip_id, lpc->lpc_type, pos,
+				    data, len);
+		if (rc)
+			return -ENXIO;
+		*ppos += len;
+		ubuf += len;
+		todo -= len;
+	}
+
+	return count;
+}
+
+static const struct file_operations lpc_fops = {
+	.read =		lpc_debug_read,
+	.write =	lpc_debug_write,
+	.open =		simple_open,
+	.llseek =	default_llseek,
+};
+
+static int opal_lpc_debugfs_create_type(struct dentry *folder,
+					const char *fname,
+					enum OpalLPCAddressType type)
+{
+	struct lpc_debugfs_entry *entry;
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+	entry->lpc_type = type;
+	debugfs_create_file(fname, 0600, folder, entry, &lpc_fops);
+	return 0;
+}
+
+static int opal_lpc_init_debugfs(void)
+{
+	struct dentry *root;
+	int rc = 0;
+
+	if (opal_lpc_chip_id < 0)
+		return -ENODEV;
+
+	root = debugfs_create_dir("lpc", powerpc_debugfs_root);
+
+	rc |= opal_lpc_debugfs_create_type(root, "io", OPAL_LPC_IO);
+	rc |= opal_lpc_debugfs_create_type(root, "mem", OPAL_LPC_MEM);
+	rc |= opal_lpc_debugfs_create_type(root, "fw", OPAL_LPC_FW);
+	return rc;
+}
+machine_device_initcall(powernv, opal_lpc_init_debugfs);
+#endif  /* CONFIG_DEBUG_FS */
+
+void __init opal_lpc_init(void)
+{
+	struct device_node *np;
+
+	/*
+	 * Look for a Power8 LPC bus tagged as "primary",
+	 * we currently support only one though the OPAL APIs
+	 * support any number.
+	 */
+	for_each_compatible_node(np, NULL, "ibm,power8-lpc") {
+		if (!of_device_is_available(np))
+			continue;
+		if (!of_get_property(np, "primary", NULL))
+			continue;
+		opal_lpc_chip_id = of_get_ibm_chip_id(np);
+		of_node_put(np);
+		break;
+	}
+	if (opal_lpc_chip_id < 0)
+		return;
+
+	/* Does it support direct mapping ? */
+	if (of_get_property(np, "ranges", NULL)) {
+		pr_info("OPAL: Found memory mapped LPC bus on chip %d\n",
+			opal_lpc_chip_id);
+		isa_bridge_init_non_pci(np);
+	} else {
+		pr_info("OPAL: Found non-mapped LPC bus on chip %d\n",
+			opal_lpc_chip_id);
+
+		/* Setup special IO ops */
+		ppc_pci_io = opal_lpc_io;
+		isa_io_special = true;
+	}
+}
diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c
new file mode 100644
index 000000000..1e8e17df9
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * OPAL asynchronus Memory error handling support in PowerNV.
+ *
+ * Copyright 2013 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include <asm/machdep.h>
+#include <asm/opal.h>
+#include <asm/cputable.h>
+
+static int opal_mem_err_nb_init;
+static LIST_HEAD(opal_memory_err_list);
+static DEFINE_SPINLOCK(opal_mem_err_lock);
+
+struct OpalMsgNode {
+	struct list_head list;
+	struct opal_msg msg;
+};
+
+static void handle_memory_error_event(struct OpalMemoryErrorData *merr_evt)
+{
+	uint64_t paddr_start, paddr_end;
+
+	pr_debug("%s: Retrieved memory error event, type: 0x%x\n",
+		  __func__, merr_evt->type);
+	switch (merr_evt->type) {
+	case OPAL_MEM_ERR_TYPE_RESILIENCE:
+		paddr_start = be64_to_cpu(merr_evt->u.resilience.physical_address_start);
+		paddr_end = be64_to_cpu(merr_evt->u.resilience.physical_address_end);
+		break;
+	case OPAL_MEM_ERR_TYPE_DYN_DALLOC:
+		paddr_start = be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_start);
+		paddr_end = be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_end);
+		break;
+	default:
+		return;
+	}
+
+	for (; paddr_start < paddr_end; paddr_start += PAGE_SIZE) {
+		memory_failure(paddr_start >> PAGE_SHIFT, 0);
+	}
+}
+
+static void handle_memory_error(void)
+{
+	unsigned long flags;
+	struct OpalMemoryErrorData *merr_evt;
+	struct OpalMsgNode *msg_node;
+
+	spin_lock_irqsave(&opal_mem_err_lock, flags);
+	while (!list_empty(&opal_memory_err_list)) {
+		 msg_node = list_entry(opal_memory_err_list.next,
+					   struct OpalMsgNode, list);
+		list_del(&msg_node->list);
+		spin_unlock_irqrestore(&opal_mem_err_lock, flags);
+
+		merr_evt = (struct OpalMemoryErrorData *)
+					&msg_node->msg.params[0];
+		handle_memory_error_event(merr_evt);
+		kfree(msg_node);
+		spin_lock_irqsave(&opal_mem_err_lock, flags);
+	}
+	spin_unlock_irqrestore(&opal_mem_err_lock, flags);
+}
+
+static void mem_error_handler(struct work_struct *work)
+{
+	handle_memory_error();
+}
+
+static DECLARE_WORK(mem_error_work, mem_error_handler);
+
+/*
+ * opal_memory_err_event - notifier handler that queues up the opal message
+ * to be preocessed later.
+ */
+static int opal_memory_err_event(struct notifier_block *nb,
+			  unsigned long msg_type, void *msg)
+{
+	unsigned long flags;
+	struct OpalMsgNode *msg_node;
+
+	if (msg_type != OPAL_MSG_MEM_ERR)
+		return 0;
+
+	msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
+	if (!msg_node) {
+		pr_err("MEMORY_ERROR: out of memory, Opal message event not"
+		       "handled\n");
+		return -ENOMEM;
+	}
+	memcpy(&msg_node->msg, msg, sizeof(msg_node->msg));
+
+	spin_lock_irqsave(&opal_mem_err_lock, flags);
+	list_add(&msg_node->list, &opal_memory_err_list);
+	spin_unlock_irqrestore(&opal_mem_err_lock, flags);
+
+	schedule_work(&mem_error_work);
+	return 0;
+}
+
+static struct notifier_block opal_mem_err_nb = {
+	.notifier_call	= opal_memory_err_event,
+	.next		= NULL,
+	.priority	= 0,
+};
+
+static int __init opal_mem_err_init(void)
+{
+	int ret;
+
+	if (!opal_mem_err_nb_init) {
+		ret = opal_message_notifier_register(
+					OPAL_MSG_MEM_ERR, &opal_mem_err_nb);
+		if (ret) {
+			pr_err("%s: Can't register OPAL event notifier (%d)\n",
+			       __func__, ret);
+			return ret;
+		}
+		opal_mem_err_nb_init = 1;
+	}
+	return 0;
+}
+machine_device_initcall(powernv, opal_mem_err_init);
diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c
new file mode 100644
index 000000000..d3b6e135c
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-msglog.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerNV OPAL in-memory console interface
+ *
+ * Copyright 2014 IBM Corp.
+ */
+
+#include <asm/io.h>
+#include <asm/opal.h>
+#include <linux/debugfs.h>
+#include <linux/of.h>
+#include <linux/types.h>
+#include <asm/barrier.h>
+
+#include "powernv.h"
+
+/* OPAL in-memory console. Defined in OPAL source at core/console.c */
+struct memcons {
+	__be64 magic;
+#define MEMCONS_MAGIC	0x6630696567726173L
+	__be64 obuf_phys;
+	__be64 ibuf_phys;
+	__be32 obuf_size;
+	__be32 ibuf_size;
+	__be32 out_pos;
+#define MEMCONS_OUT_POS_WRAP	0x80000000u
+#define MEMCONS_OUT_POS_MASK	0x00ffffffu
+	__be32 in_prod;
+	__be32 in_cons;
+};
+
+static struct memcons *opal_memcons = NULL;
+
+ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count)
+{
+	const char *conbuf;
+	ssize_t ret;
+	size_t first_read = 0;
+	uint32_t out_pos, avail;
+
+	if (!mc)
+		return -ENODEV;
+
+	out_pos = be32_to_cpu(READ_ONCE(mc->out_pos));
+
+	/* Now we've read out_pos, put a barrier in before reading the new
+	 * data it points to in conbuf. */
+	smp_rmb();
+
+	conbuf = phys_to_virt(be64_to_cpu(mc->obuf_phys));
+
+	/* When the buffer has wrapped, read from the out_pos marker to the end
+	 * of the buffer, and then read the remaining data as in the un-wrapped
+	 * case. */
+	if (out_pos & MEMCONS_OUT_POS_WRAP) {
+
+		out_pos &= MEMCONS_OUT_POS_MASK;
+		avail = be32_to_cpu(mc->obuf_size) - out_pos;
+
+		ret = memory_read_from_buffer(to, count, &pos,
+				conbuf + out_pos, avail);
+
+		if (ret < 0)
+			goto out;
+
+		first_read = ret;
+		to += first_read;
+		count -= first_read;
+		pos -= avail;
+
+		if (count <= 0)
+			goto out;
+	}
+
+	/* Sanity check. The firmware should not do this to us. */
+	if (out_pos > be32_to_cpu(mc->obuf_size)) {
+		pr_err("OPAL: memory console corruption. Aborting read.\n");
+		return -EINVAL;
+	}
+
+	ret = memory_read_from_buffer(to, count, &pos, conbuf, out_pos);
+
+	if (ret < 0)
+		goto out;
+
+	ret += first_read;
+out:
+	return ret;
+}
+
+ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
+{
+	return memcons_copy(opal_memcons, to, pos, count);
+}
+
+static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj,
+				struct bin_attribute *bin_attr, char *to,
+				loff_t pos, size_t count)
+{
+	return opal_msglog_copy(to, pos, count);
+}
+
+static struct bin_attribute opal_msglog_attr = {
+	.attr = {.name = "msglog", .mode = 0400},
+	.read = opal_msglog_read
+};
+
+struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name)
+{
+	u64 mcaddr;
+	struct memcons *mc;
+
+	if (of_property_read_u64(node, mc_prop_name, &mcaddr)) {
+		pr_warn("%s property not found, no message log\n",
+			mc_prop_name);
+		goto out_err;
+	}
+
+	mc = phys_to_virt(mcaddr);
+	if (!mc) {
+		pr_warn("memory console address is invalid\n");
+		goto out_err;
+	}
+
+	if (be64_to_cpu(mc->magic) != MEMCONS_MAGIC) {
+		pr_warn("memory console version is invalid\n");
+		goto out_err;
+	}
+
+	return mc;
+
+out_err:
+	return NULL;
+}
+
+u32 memcons_get_size(struct memcons *mc)
+{
+	return be32_to_cpu(mc->ibuf_size) + be32_to_cpu(mc->obuf_size);
+}
+
+void __init opal_msglog_init(void)
+{
+	opal_memcons = memcons_init(opal_node, "ibm,opal-memcons");
+	if (!opal_memcons) {
+		pr_warn("OPAL: memcons failed to load from ibm,opal-memcons\n");
+		return;
+	}
+
+	opal_msglog_attr.size = memcons_get_size(opal_memcons);
+}
+
+void __init opal_msglog_sysfs_init(void)
+{
+	if (!opal_memcons) {
+		pr_warn("OPAL: message log initialisation failed, not creating sysfs entry\n");
+		return;
+	}
+
+	if (sysfs_create_bin_file(opal_kobj, &opal_msglog_attr) != 0)
+		pr_warn("OPAL: sysfs file creation failed\n");
+}
diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c
new file mode 100644
index 000000000..380bc2d7e
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-nvram.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerNV nvram code.
+ *
+ * Copyright 2011 IBM Corp.
+ */
+
+#define DEBUG
+
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/of.h>
+
+#include <asm/opal.h>
+#include <asm/nvram.h>
+#include <asm/machdep.h>
+
+static unsigned int nvram_size;
+
+static ssize_t opal_nvram_size(void)
+{
+	return nvram_size;
+}
+
+static ssize_t opal_nvram_read(char *buf, size_t count, loff_t *index)
+{
+	s64 rc;
+	int off;
+
+	if (*index >= nvram_size)
+		return 0;
+	off = *index;
+	if ((off + count) > nvram_size)
+		count = nvram_size - off;
+	rc = opal_read_nvram(__pa(buf), count, off);
+	if (rc != OPAL_SUCCESS)
+		return -EIO;
+	*index += count;
+	return count;
+}
+
+/*
+ * This can be called in the panic path with interrupts off, so use
+ * mdelay in that case.
+ */
+static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index)
+{
+	s64 rc = OPAL_BUSY;
+	int off;
+
+	if (*index >= nvram_size)
+		return 0;
+	off = *index;
+	if ((off + count) > nvram_size)
+		count = nvram_size - off;
+
+	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+		rc = opal_write_nvram(__pa(buf), count, off);
+		if (rc == OPAL_BUSY_EVENT) {
+			if (in_interrupt() || irqs_disabled())
+				mdelay(OPAL_BUSY_DELAY_MS);
+			else
+				msleep(OPAL_BUSY_DELAY_MS);
+			opal_poll_events(NULL);
+		} else if (rc == OPAL_BUSY) {
+			if (in_interrupt() || irqs_disabled())
+				mdelay(OPAL_BUSY_DELAY_MS);
+			else
+				msleep(OPAL_BUSY_DELAY_MS);
+		}
+	}
+
+	if (rc)
+		return -EIO;
+
+	*index += count;
+	return count;
+}
+
+static int __init opal_nvram_init_log_partitions(void)
+{
+	/* Scan nvram for partitions */
+	nvram_scan_partitions();
+	nvram_init_oops_partition(0);
+	return 0;
+}
+machine_arch_initcall(powernv, opal_nvram_init_log_partitions);
+
+void __init opal_nvram_init(void)
+{
+	struct device_node *np;
+	const __be32 *nbytes_p;
+
+	np = of_find_compatible_node(NULL, NULL, "ibm,opal-nvram");
+	if (np == NULL)
+		return;
+
+	nbytes_p = of_get_property(np, "#bytes", NULL);
+	if (!nbytes_p) {
+		of_node_put(np);
+		return;
+	}
+	nvram_size = be32_to_cpup(nbytes_p);
+
+	pr_info("OPAL nvram setup, %u bytes\n", nvram_size);
+	of_node_put(np);
+
+	ppc_md.nvram_read = opal_nvram_read;
+	ppc_md.nvram_write = opal_nvram_write;
+	ppc_md.nvram_size = opal_nvram_size;
+}
+
diff --git a/arch/powerpc/platforms/powernv/opal-power.c b/arch/powerpc/platforms/powernv/opal-power.c
new file mode 100644
index 000000000..2a3717fc2
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-power.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerNV OPAL power control for graceful shutdown handling
+ *
+ * Copyright 2015 IBM Corp.
+ */
+
+#define pr_fmt(fmt)	"opal-power: "	fmt
+
+#include <linux/kernel.h>
+#include <linux/reboot.h>
+#include <linux/notifier.h>
+#include <linux/of.h>
+
+#include <asm/opal.h>
+#include <asm/machdep.h>
+
+#define SOFT_OFF 0x00
+#define SOFT_REBOOT 0x01
+
+/* Detect EPOW event */
+static bool detect_epow(void)
+{
+	u16 epow;
+	int i, rc;
+	__be16 epow_classes;
+	__be16 opal_epow_status[OPAL_SYSEPOW_MAX] = {0};
+
+	/*
+	* Check for EPOW event. Kernel sends supported EPOW classes info
+	* to OPAL. OPAL returns EPOW info along with classes present.
+	*/
+	epow_classes = cpu_to_be16(OPAL_SYSEPOW_MAX);
+	rc = opal_get_epow_status(opal_epow_status, &epow_classes);
+	if (rc != OPAL_SUCCESS) {
+		pr_err("Failed to get EPOW event information\n");
+		return false;
+	}
+
+	/* Look for EPOW events present */
+	for (i = 0; i < be16_to_cpu(epow_classes); i++) {
+		epow = be16_to_cpu(opal_epow_status[i]);
+
+		/* Filter events which do not need shutdown. */
+		if (i == OPAL_SYSEPOW_POWER)
+			epow &= ~(OPAL_SYSPOWER_CHNG | OPAL_SYSPOWER_FAIL |
+					OPAL_SYSPOWER_INCL);
+		if (epow)
+			return true;
+	}
+
+	return false;
+}
+
+/* Check for existing EPOW, DPO events */
+static bool poweroff_pending(void)
+{
+	int rc;
+	__be64 opal_dpo_timeout;
+
+	/* Check for DPO event */
+	rc = opal_get_dpo_status(&opal_dpo_timeout);
+	if (rc == OPAL_SUCCESS) {
+		pr_info("Existing DPO event detected.\n");
+		return true;
+	}
+
+	/* Check for EPOW event */
+	if (detect_epow()) {
+		pr_info("Existing EPOW event detected.\n");
+		return true;
+	}
+
+	return false;
+}
+
+/* OPAL power-control events notifier */
+static int opal_power_control_event(struct notifier_block *nb,
+					unsigned long msg_type, void *msg)
+{
+	uint64_t type;
+
+	switch (msg_type) {
+	case OPAL_MSG_EPOW:
+		if (detect_epow()) {
+			pr_info("EPOW msg received. Powering off system\n");
+			orderly_poweroff(true);
+		}
+		break;
+	case OPAL_MSG_DPO:
+		pr_info("DPO msg received. Powering off system\n");
+		orderly_poweroff(true);
+		break;
+	case OPAL_MSG_SHUTDOWN:
+		type = be64_to_cpu(((struct opal_msg *)msg)->params[0]);
+		switch (type) {
+		case SOFT_REBOOT:
+			pr_info("Reboot requested\n");
+			orderly_reboot();
+			break;
+		case SOFT_OFF:
+			pr_info("Poweroff requested\n");
+			orderly_poweroff(true);
+			break;
+		default:
+			pr_err("Unknown power-control type %llu\n", type);
+		}
+		break;
+	default:
+		pr_err("Unknown OPAL message type %lu\n", msg_type);
+	}
+
+	return 0;
+}
+
+/* OPAL EPOW event notifier block */
+static struct notifier_block opal_epow_nb = {
+	.notifier_call	= opal_power_control_event,
+	.next		= NULL,
+	.priority	= 0,
+};
+
+/* OPAL DPO event notifier block */
+static struct notifier_block opal_dpo_nb = {
+	.notifier_call	= opal_power_control_event,
+	.next		= NULL,
+	.priority	= 0,
+};
+
+/* OPAL power-control event notifier block */
+static struct notifier_block opal_power_control_nb = {
+	.notifier_call	= opal_power_control_event,
+	.next		= NULL,
+	.priority	= 0,
+};
+
+int __init opal_power_control_init(void)
+{
+	int ret, supported = 0;
+	struct device_node *np;
+
+	/* Register OPAL power-control events notifier */
+	ret = opal_message_notifier_register(OPAL_MSG_SHUTDOWN,
+						&opal_power_control_nb);
+	if (ret)
+		pr_err("Failed to register SHUTDOWN notifier, ret = %d\n", ret);
+
+	/* Determine OPAL EPOW, DPO support */
+	np = of_find_node_by_path("/ibm,opal/epow");
+	if (np) {
+		supported = of_device_is_compatible(np, "ibm,opal-v3-epow");
+		of_node_put(np);
+	}
+
+	if (!supported)
+		return 0;
+	pr_info("OPAL EPOW, DPO support detected.\n");
+
+	/* Register EPOW event notifier */
+	ret = opal_message_notifier_register(OPAL_MSG_EPOW, &opal_epow_nb);
+	if (ret)
+		pr_err("Failed to register EPOW notifier, ret = %d\n", ret);
+
+	/* Register DPO event notifier */
+	ret = opal_message_notifier_register(OPAL_MSG_DPO, &opal_dpo_nb);
+	if (ret)
+		pr_err("Failed to register DPO notifier, ret = %d\n", ret);
+
+	/* Check for any pending EPOW or DPO events. */
+	if (poweroff_pending())
+		orderly_poweroff(true);
+
+	return 0;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-powercap.c b/arch/powerpc/platforms/powernv/opal-powercap.c
new file mode 100644
index 000000000..ce9ec3962
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-powercap.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerNV OPAL Powercap interface
+ *
+ * Copyright 2017 IBM Corp.
+ */
+
+#define pr_fmt(fmt)     "opal-powercap: " fmt
+
+#include <linux/of.h>
+#include <linux/kobject.h>
+#include <linux/slab.h>
+
+#include <asm/opal.h>
+
+static DEFINE_MUTEX(powercap_mutex);
+
+static struct kobject *powercap_kobj;
+
+struct powercap_attr {
+	u32 handle;
+	struct kobj_attribute attr;
+};
+
+static struct pcap {
+	struct attribute_group pg;
+	struct powercap_attr *pattrs;
+} *pcaps;
+
+static ssize_t powercap_show(struct kobject *kobj, struct kobj_attribute *attr,
+			     char *buf)
+{
+	struct powercap_attr *pcap_attr = container_of(attr,
+						struct powercap_attr, attr);
+	struct opal_msg msg;
+	u32 pcap;
+	int ret, token;
+
+	token = opal_async_get_token_interruptible();
+	if (token < 0) {
+		pr_devel("Failed to get token\n");
+		return token;
+	}
+
+	ret = mutex_lock_interruptible(&powercap_mutex);
+	if (ret)
+		goto out_token;
+
+	ret = opal_get_powercap(pcap_attr->handle, token, (u32 *)__pa(&pcap));
+	switch (ret) {
+	case OPAL_ASYNC_COMPLETION:
+		ret = opal_async_wait_response(token, &msg);
+		if (ret) {
+			pr_devel("Failed to wait for the async response\n");
+			ret = -EIO;
+			goto out;
+		}
+		ret = opal_error_code(opal_get_async_rc(msg));
+		if (!ret) {
+			ret = sprintf(buf, "%u\n", be32_to_cpu(pcap));
+			if (ret < 0)
+				ret = -EIO;
+		}
+		break;
+	case OPAL_SUCCESS:
+		ret = sprintf(buf, "%u\n", be32_to_cpu(pcap));
+		if (ret < 0)
+			ret = -EIO;
+		break;
+	default:
+		ret = opal_error_code(ret);
+	}
+
+out:
+	mutex_unlock(&powercap_mutex);
+out_token:
+	opal_async_release_token(token);
+	return ret;
+}
+
+static ssize_t powercap_store(struct kobject *kobj,
+			      struct kobj_attribute *attr, const char *buf,
+			      size_t count)
+{
+	struct powercap_attr *pcap_attr = container_of(attr,
+						struct powercap_attr, attr);
+	struct opal_msg msg;
+	u32 pcap;
+	int ret, token;
+
+	ret = kstrtoint(buf, 0, &pcap);
+	if (ret)
+		return ret;
+
+	token = opal_async_get_token_interruptible();
+	if (token < 0) {
+		pr_devel("Failed to get token\n");
+		return token;
+	}
+
+	ret = mutex_lock_interruptible(&powercap_mutex);
+	if (ret)
+		goto out_token;
+
+	ret = opal_set_powercap(pcap_attr->handle, token, pcap);
+	switch (ret) {
+	case OPAL_ASYNC_COMPLETION:
+		ret = opal_async_wait_response(token, &msg);
+		if (ret) {
+			pr_devel("Failed to wait for the async response\n");
+			ret = -EIO;
+			goto out;
+		}
+		ret = opal_error_code(opal_get_async_rc(msg));
+		if (!ret)
+			ret = count;
+		break;
+	case OPAL_SUCCESS:
+		ret = count;
+		break;
+	default:
+		ret = opal_error_code(ret);
+	}
+
+out:
+	mutex_unlock(&powercap_mutex);
+out_token:
+	opal_async_release_token(token);
+	return ret;
+}
+
+static void powercap_add_attr(int handle, const char *name,
+			      struct powercap_attr *attr)
+{
+	attr->handle = handle;
+	sysfs_attr_init(&attr->attr.attr);
+	attr->attr.attr.name = name;
+	attr->attr.attr.mode = 0444;
+	attr->attr.show = powercap_show;
+}
+
+void __init opal_powercap_init(void)
+{
+	struct device_node *powercap, *node;
+	int i = 0;
+
+	powercap = of_find_compatible_node(NULL, NULL, "ibm,opal-powercap");
+	if (!powercap) {
+		pr_devel("Powercap node not found\n");
+		return;
+	}
+
+	pcaps = kcalloc(of_get_child_count(powercap), sizeof(*pcaps),
+			GFP_KERNEL);
+	if (!pcaps)
+		return;
+
+	powercap_kobj = kobject_create_and_add("powercap", opal_kobj);
+	if (!powercap_kobj) {
+		pr_warn("Failed to create powercap kobject\n");
+		goto out_pcaps;
+	}
+
+	i = 0;
+	for_each_child_of_node(powercap, node) {
+		u32 cur, min, max;
+		int j = 0;
+		bool has_cur = false, has_min = false, has_max = false;
+
+		if (!of_property_read_u32(node, "powercap-min", &min)) {
+			j++;
+			has_min = true;
+		}
+
+		if (!of_property_read_u32(node, "powercap-max", &max)) {
+			j++;
+			has_max = true;
+		}
+
+		if (!of_property_read_u32(node, "powercap-current", &cur)) {
+			j++;
+			has_cur = true;
+		}
+
+		pcaps[i].pattrs = kcalloc(j, sizeof(struct powercap_attr),
+					  GFP_KERNEL);
+		if (!pcaps[i].pattrs)
+			goto out_pcaps_pattrs;
+
+		pcaps[i].pg.attrs = kcalloc(j + 1, sizeof(struct attribute *),
+					    GFP_KERNEL);
+		if (!pcaps[i].pg.attrs) {
+			kfree(pcaps[i].pattrs);
+			goto out_pcaps_pattrs;
+		}
+
+		j = 0;
+		pcaps[i].pg.name = kasprintf(GFP_KERNEL, "%pOFn", node);
+		if (!pcaps[i].pg.name) {
+			kfree(pcaps[i].pattrs);
+			kfree(pcaps[i].pg.attrs);
+			goto out_pcaps_pattrs;
+		}
+
+		if (has_min) {
+			powercap_add_attr(min, "powercap-min",
+					  &pcaps[i].pattrs[j]);
+			pcaps[i].pg.attrs[j] = &pcaps[i].pattrs[j].attr.attr;
+			j++;
+		}
+
+		if (has_max) {
+			powercap_add_attr(max, "powercap-max",
+					  &pcaps[i].pattrs[j]);
+			pcaps[i].pg.attrs[j] = &pcaps[i].pattrs[j].attr.attr;
+			j++;
+		}
+
+		if (has_cur) {
+			powercap_add_attr(cur, "powercap-current",
+					  &pcaps[i].pattrs[j]);
+			pcaps[i].pattrs[j].attr.attr.mode |= 0220;
+			pcaps[i].pattrs[j].attr.store = powercap_store;
+			pcaps[i].pg.attrs[j] = &pcaps[i].pattrs[j].attr.attr;
+			j++;
+		}
+
+		if (sysfs_create_group(powercap_kobj, &pcaps[i].pg)) {
+			pr_warn("Failed to create powercap attribute group %s\n",
+				pcaps[i].pg.name);
+			goto out_pcaps_pattrs;
+		}
+		i++;
+	}
+
+	return;
+
+out_pcaps_pattrs:
+	while (--i >= 0) {
+		kfree(pcaps[i].pattrs);
+		kfree(pcaps[i].pg.attrs);
+		kfree(pcaps[i].pg.name);
+	}
+	kobject_put(powercap_kobj);
+out_pcaps:
+	kfree(pcaps);
+}
diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c
new file mode 100644
index 000000000..17a2874d1
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-prd.c
@@ -0,0 +1,448 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * OPAL Runtime Diagnostics interface driver
+ * Supported on POWERNV platform
+ *
+ * Copyright IBM Corporation 2015
+ */
+
+#define pr_fmt(fmt) "opal-prd: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/poll.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <asm/opal-prd.h>
+#include <asm/opal.h>
+#include <asm/io.h>
+#include <linux/uaccess.h>
+
+
+/*
+ * The msg member must be at the end of the struct, as it's followed by the
+ * message data.
+ */
+struct opal_prd_msg_queue_item {
+	struct list_head		list;
+	struct opal_prd_msg_header	msg;
+};
+
+static struct device_node *prd_node;
+static LIST_HEAD(opal_prd_msg_queue);
+static DEFINE_SPINLOCK(opal_prd_msg_queue_lock);
+static DECLARE_WAIT_QUEUE_HEAD(opal_prd_msg_wait);
+static atomic_t prd_usage;
+
+static bool opal_prd_range_is_valid(uint64_t addr, uint64_t size)
+{
+	struct device_node *parent, *node;
+	bool found;
+
+	if (addr + size < addr)
+		return false;
+
+	parent = of_find_node_by_path("/reserved-memory");
+	if (!parent)
+		return false;
+
+	found = false;
+
+	for_each_child_of_node(parent, node) {
+		uint64_t range_addr, range_size, range_end;
+		const __be32 *addrp;
+		const char *label;
+
+		addrp = of_get_address(node, 0, &range_size, NULL);
+
+		range_addr = of_read_number(addrp, 2);
+		range_end = range_addr + range_size;
+
+		label = of_get_property(node, "ibm,prd-label", NULL);
+
+		/* PRD ranges need a label */
+		if (!label)
+			continue;
+
+		if (range_end <= range_addr)
+			continue;
+
+		if (addr >= range_addr && addr + size <= range_end) {
+			found = true;
+			of_node_put(node);
+			break;
+		}
+	}
+
+	of_node_put(parent);
+	return found;
+}
+
+static int opal_prd_open(struct inode *inode, struct file *file)
+{
+	/*
+	 * Prevent multiple (separate) processes from concurrent interactions
+	 * with the FW PRD channel
+	 */
+	if (atomic_xchg(&prd_usage, 1) == 1)
+		return -EBUSY;
+
+	return 0;
+}
+
+/*
+ * opal_prd_mmap - maps firmware-provided ranges into userspace
+ * @file: file structure for the device
+ * @vma: VMA to map the registers into
+ */
+
+static int opal_prd_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	size_t addr, size;
+	pgprot_t page_prot;
+	int rc;
+
+	pr_devel("opal_prd_mmap(0x%016lx, 0x%016lx, 0x%lx, 0x%lx)\n",
+			vma->vm_start, vma->vm_end, vma->vm_pgoff,
+			vma->vm_flags);
+
+	addr = vma->vm_pgoff << PAGE_SHIFT;
+	size = vma->vm_end - vma->vm_start;
+
+	/* ensure we're mapping within one of the allowable ranges */
+	if (!opal_prd_range_is_valid(addr, size))
+		return -EINVAL;
+
+	page_prot = phys_mem_access_prot(file, vma->vm_pgoff,
+					 size, vma->vm_page_prot);
+
+	rc = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size,
+				page_prot);
+
+	return rc;
+}
+
+static bool opal_msg_queue_empty(void)
+{
+	unsigned long flags;
+	bool ret;
+
+	spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
+	ret = list_empty(&opal_prd_msg_queue);
+	spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags);
+
+	return ret;
+}
+
+static __poll_t opal_prd_poll(struct file *file,
+		struct poll_table_struct *wait)
+{
+	poll_wait(file, &opal_prd_msg_wait, wait);
+
+	if (!opal_msg_queue_empty())
+		return EPOLLIN | EPOLLRDNORM;
+
+	return 0;
+}
+
+static ssize_t opal_prd_read(struct file *file, char __user *buf,
+		size_t count, loff_t *ppos)
+{
+	struct opal_prd_msg_queue_item *item;
+	unsigned long flags;
+	ssize_t size, err;
+	int rc;
+
+	/* we need at least a header's worth of data */
+	if (count < sizeof(item->msg))
+		return -EINVAL;
+
+	if (*ppos)
+		return -ESPIPE;
+
+	item = NULL;
+
+	for (;;) {
+
+		spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
+		if (!list_empty(&opal_prd_msg_queue)) {
+			item = list_first_entry(&opal_prd_msg_queue,
+					struct opal_prd_msg_queue_item, list);
+			list_del(&item->list);
+		}
+		spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags);
+
+		if (item)
+			break;
+
+		if (file->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		rc = wait_event_interruptible(opal_prd_msg_wait,
+				!opal_msg_queue_empty());
+		if (rc)
+			return -EINTR;
+	}
+
+	size = be16_to_cpu(item->msg.size);
+	if (size > count) {
+		err = -EINVAL;
+		goto err_requeue;
+	}
+
+	rc = copy_to_user(buf, &item->msg, size);
+	if (rc) {
+		err = -EFAULT;
+		goto err_requeue;
+	}
+
+	kfree(item);
+
+	return size;
+
+err_requeue:
+	/* eep! re-queue at the head of the list */
+	spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
+	list_add(&item->list, &opal_prd_msg_queue);
+	spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags);
+	return err;
+}
+
+static ssize_t opal_prd_write(struct file *file, const char __user *buf,
+		size_t count, loff_t *ppos)
+{
+	struct opal_prd_msg_header hdr;
+	ssize_t size;
+	void *msg;
+	int rc;
+
+	size = sizeof(hdr);
+
+	if (count < size)
+		return -EINVAL;
+
+	/* grab the header */
+	rc = copy_from_user(&hdr, buf, sizeof(hdr));
+	if (rc)
+		return -EFAULT;
+
+	size = be16_to_cpu(hdr.size);
+
+	msg = memdup_user(buf, size);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	rc = opal_prd_msg(msg);
+	if (rc) {
+		pr_warn("write: opal_prd_msg returned %d\n", rc);
+		size = -EIO;
+	}
+
+	kfree(msg);
+
+	return size;
+}
+
+static int opal_prd_release(struct inode *inode, struct file *file)
+{
+	struct opal_prd_msg_header msg;
+
+	msg.size = cpu_to_be16(sizeof(msg));
+	msg.type = OPAL_PRD_MSG_TYPE_FINI;
+
+	opal_prd_msg((struct opal_prd_msg *)&msg);
+
+	atomic_xchg(&prd_usage, 0);
+
+	return 0;
+}
+
+static long opal_prd_ioctl(struct file *file, unsigned int cmd,
+		unsigned long param)
+{
+	struct opal_prd_info info;
+	struct opal_prd_scom scom;
+	int rc = 0;
+
+	switch (cmd) {
+	case OPAL_PRD_GET_INFO:
+		memset(&info, 0, sizeof(info));
+		info.version = OPAL_PRD_KERNEL_VERSION;
+		rc = copy_to_user((void __user *)param, &info, sizeof(info));
+		if (rc)
+			return -EFAULT;
+		break;
+
+	case OPAL_PRD_SCOM_READ:
+		rc = copy_from_user(&scom, (void __user *)param, sizeof(scom));
+		if (rc)
+			return -EFAULT;
+
+		scom.rc = opal_xscom_read(scom.chip, scom.addr,
+				(__be64 *)&scom.data);
+		scom.data = be64_to_cpu(scom.data);
+		pr_devel("ioctl SCOM_READ: chip %llx addr %016llx data %016llx rc %lld\n",
+				scom.chip, scom.addr, scom.data, scom.rc);
+
+		rc = copy_to_user((void __user *)param, &scom, sizeof(scom));
+		if (rc)
+			return -EFAULT;
+		break;
+
+	case OPAL_PRD_SCOM_WRITE:
+		rc = copy_from_user(&scom, (void __user *)param, sizeof(scom));
+		if (rc)
+			return -EFAULT;
+
+		scom.rc = opal_xscom_write(scom.chip, scom.addr, scom.data);
+		pr_devel("ioctl SCOM_WRITE: chip %llx addr %016llx data %016llx rc %lld\n",
+				scom.chip, scom.addr, scom.data, scom.rc);
+
+		rc = copy_to_user((void __user *)param, &scom, sizeof(scom));
+		if (rc)
+			return -EFAULT;
+		break;
+
+	default:
+		rc = -EINVAL;
+	}
+
+	return rc;
+}
+
+static const struct file_operations opal_prd_fops = {
+	.open		= opal_prd_open,
+	.mmap		= opal_prd_mmap,
+	.poll		= opal_prd_poll,
+	.read		= opal_prd_read,
+	.write		= opal_prd_write,
+	.unlocked_ioctl	= opal_prd_ioctl,
+	.release	= opal_prd_release,
+	.owner		= THIS_MODULE,
+};
+
+static struct miscdevice opal_prd_dev = {
+	.minor		= MISC_DYNAMIC_MINOR,
+	.name		= "opal-prd",
+	.fops		= &opal_prd_fops,
+};
+
+/* opal interface */
+static int opal_prd_msg_notifier(struct notifier_block *nb,
+		unsigned long msg_type, void *_msg)
+{
+	struct opal_prd_msg_queue_item *item;
+	struct opal_prd_msg_header *hdr;
+	struct opal_msg *msg = _msg;
+	int msg_size, item_size;
+	unsigned long flags;
+
+	if (msg_type != OPAL_MSG_PRD && msg_type != OPAL_MSG_PRD2)
+		return 0;
+
+	/* Calculate total size of the message and item we need to store. The
+	 * 'size' field in the header includes the header itself. */
+	hdr = (void *)msg->params;
+	msg_size = be16_to_cpu(hdr->size);
+	item_size = msg_size + sizeof(*item) - sizeof(item->msg);
+
+	item = kzalloc(item_size, GFP_ATOMIC);
+	if (!item)
+		return -ENOMEM;
+
+	memcpy(&item->msg, msg->params, msg_size);
+
+	spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
+	list_add_tail(&item->list, &opal_prd_msg_queue);
+	spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags);
+
+	wake_up_interruptible(&opal_prd_msg_wait);
+
+	return 0;
+}
+
+static struct notifier_block opal_prd_event_nb = {
+	.notifier_call	= opal_prd_msg_notifier,
+	.next		= NULL,
+	.priority	= 0,
+};
+
+static struct notifier_block opal_prd_event_nb2 = {
+	.notifier_call	= opal_prd_msg_notifier,
+	.next		= NULL,
+	.priority	= 0,
+};
+
+static int opal_prd_probe(struct platform_device *pdev)
+{
+	int rc;
+
+	if (!pdev || !pdev->dev.of_node)
+		return -ENODEV;
+
+	/* We should only have one prd driver instance per machine; ensure
+	 * that we only get a valid probe on a single OF node.
+	 */
+	if (prd_node)
+		return -EBUSY;
+
+	prd_node = pdev->dev.of_node;
+
+	rc = opal_message_notifier_register(OPAL_MSG_PRD, &opal_prd_event_nb);
+	if (rc) {
+		pr_err("Couldn't register event notifier\n");
+		return rc;
+	}
+
+	rc = opal_message_notifier_register(OPAL_MSG_PRD2, &opal_prd_event_nb2);
+	if (rc) {
+		pr_err("Couldn't register PRD2 event notifier\n");
+		opal_message_notifier_unregister(OPAL_MSG_PRD, &opal_prd_event_nb);
+		return rc;
+	}
+
+	rc = misc_register(&opal_prd_dev);
+	if (rc) {
+		pr_err("failed to register miscdev\n");
+		opal_message_notifier_unregister(OPAL_MSG_PRD,
+				&opal_prd_event_nb);
+		opal_message_notifier_unregister(OPAL_MSG_PRD2,
+				&opal_prd_event_nb2);
+		return rc;
+	}
+
+	return 0;
+}
+
+static int opal_prd_remove(struct platform_device *pdev)
+{
+	misc_deregister(&opal_prd_dev);
+	opal_message_notifier_unregister(OPAL_MSG_PRD, &opal_prd_event_nb);
+	opal_message_notifier_unregister(OPAL_MSG_PRD2, &opal_prd_event_nb2);
+	return 0;
+}
+
+static const struct of_device_id opal_prd_match[] = {
+	{ .compatible = "ibm,opal-prd" },
+	{ },
+};
+
+static struct platform_driver opal_prd_driver = {
+	.driver = {
+		.name		= "opal-prd",
+		.of_match_table	= opal_prd_match,
+	},
+	.probe	= opal_prd_probe,
+	.remove	= opal_prd_remove,
+};
+
+module_platform_driver(opal_prd_driver);
+
+MODULE_DEVICE_TABLE(of, opal_prd_match);
+MODULE_DESCRIPTION("PowerNV OPAL runtime diagnostic driver");
+MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/platforms/powernv/opal-psr.c b/arch/powerpc/platforms/powernv/opal-psr.c
new file mode 100644
index 000000000..69d7e7595
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-psr.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerNV OPAL Power-Shift-Ratio interface
+ *
+ * Copyright 2017 IBM Corp.
+ */
+
+#define pr_fmt(fmt)     "opal-psr: " fmt
+
+#include <linux/of.h>
+#include <linux/kobject.h>
+#include <linux/slab.h>
+
+#include <asm/opal.h>
+
+static DEFINE_MUTEX(psr_mutex);
+
+static struct kobject *psr_kobj;
+
+static struct psr_attr {
+	u32 handle;
+	struct kobj_attribute attr;
+} *psr_attrs;
+
+static ssize_t psr_show(struct kobject *kobj, struct kobj_attribute *attr,
+			char *buf)
+{
+	struct psr_attr *psr_attr = container_of(attr, struct psr_attr, attr);
+	struct opal_msg msg;
+	int psr, ret, token;
+
+	token = opal_async_get_token_interruptible();
+	if (token < 0) {
+		pr_devel("Failed to get token\n");
+		return token;
+	}
+
+	ret = mutex_lock_interruptible(&psr_mutex);
+	if (ret)
+		goto out_token;
+
+	ret = opal_get_power_shift_ratio(psr_attr->handle, token,
+					    (u32 *)__pa(&psr));
+	switch (ret) {
+	case OPAL_ASYNC_COMPLETION:
+		ret = opal_async_wait_response(token, &msg);
+		if (ret) {
+			pr_devel("Failed to wait for the async response\n");
+			ret = -EIO;
+			goto out;
+		}
+		ret = opal_error_code(opal_get_async_rc(msg));
+		if (!ret) {
+			ret = sprintf(buf, "%u\n", be32_to_cpu(psr));
+			if (ret < 0)
+				ret = -EIO;
+		}
+		break;
+	case OPAL_SUCCESS:
+		ret = sprintf(buf, "%u\n", be32_to_cpu(psr));
+		if (ret < 0)
+			ret = -EIO;
+		break;
+	default:
+		ret = opal_error_code(ret);
+	}
+
+out:
+	mutex_unlock(&psr_mutex);
+out_token:
+	opal_async_release_token(token);
+	return ret;
+}
+
+static ssize_t psr_store(struct kobject *kobj, struct kobj_attribute *attr,
+			 const char *buf, size_t count)
+{
+	struct psr_attr *psr_attr = container_of(attr, struct psr_attr, attr);
+	struct opal_msg msg;
+	int psr, ret, token;
+
+	ret = kstrtoint(buf, 0, &psr);
+	if (ret)
+		return ret;
+
+	token = opal_async_get_token_interruptible();
+	if (token < 0) {
+		pr_devel("Failed to get token\n");
+		return token;
+	}
+
+	ret = mutex_lock_interruptible(&psr_mutex);
+	if (ret)
+		goto out_token;
+
+	ret = opal_set_power_shift_ratio(psr_attr->handle, token, psr);
+	switch (ret) {
+	case OPAL_ASYNC_COMPLETION:
+		ret = opal_async_wait_response(token, &msg);
+		if (ret) {
+			pr_devel("Failed to wait for the async response\n");
+			ret = -EIO;
+			goto out;
+		}
+		ret = opal_error_code(opal_get_async_rc(msg));
+		if (!ret)
+			ret = count;
+		break;
+	case OPAL_SUCCESS:
+		ret = count;
+		break;
+	default:
+		ret = opal_error_code(ret);
+	}
+
+out:
+	mutex_unlock(&psr_mutex);
+out_token:
+	opal_async_release_token(token);
+	return ret;
+}
+
+void __init opal_psr_init(void)
+{
+	struct device_node *psr, *node;
+	int i = 0;
+
+	psr = of_find_compatible_node(NULL, NULL,
+				      "ibm,opal-power-shift-ratio");
+	if (!psr) {
+		pr_devel("Power-shift-ratio node not found\n");
+		return;
+	}
+
+	psr_attrs = kcalloc(of_get_child_count(psr), sizeof(*psr_attrs),
+			    GFP_KERNEL);
+	if (!psr_attrs)
+		return;
+
+	psr_kobj = kobject_create_and_add("psr", opal_kobj);
+	if (!psr_kobj) {
+		pr_warn("Failed to create psr kobject\n");
+		goto out;
+	}
+
+	for_each_child_of_node(psr, node) {
+		if (of_property_read_u32(node, "handle",
+					 &psr_attrs[i].handle))
+			goto out_kobj;
+
+		sysfs_attr_init(&psr_attrs[i].attr.attr);
+		if (of_property_read_string(node, "label",
+					    &psr_attrs[i].attr.attr.name))
+			goto out_kobj;
+		psr_attrs[i].attr.attr.mode = 0664;
+		psr_attrs[i].attr.show = psr_show;
+		psr_attrs[i].attr.store = psr_store;
+		if (sysfs_create_file(psr_kobj, &psr_attrs[i].attr.attr)) {
+			pr_devel("Failed to create psr sysfs file %s\n",
+				 psr_attrs[i].attr.attr.name);
+			goto out_kobj;
+		}
+		i++;
+	}
+
+	return;
+out_kobj:
+	kobject_put(psr_kobj);
+out:
+	kfree(psr_attrs);
+}
diff --git a/arch/powerpc/platforms/powernv/opal-rtc.c b/arch/powerpc/platforms/powernv/opal-rtc.c
new file mode 100644
index 000000000..44d7dacb3
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-rtc.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerNV Real Time Clock.
+ *
+ * Copyright 2011 IBM Corp.
+ */
+
+
+#include <linux/kernel.h>
+#include <linux/time.h>
+#include <linux/bcd.h>
+#include <linux/rtc.h>
+#include <linux/delay.h>
+#include <linux/platform_device.h>
+#include <linux/of_platform.h>
+
+#include <asm/opal.h>
+#include <asm/firmware.h>
+#include <asm/machdep.h>
+
+static void opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm)
+{
+	tm->tm_year	= ((bcd2bin(y_m_d >> 24) * 100) +
+			   bcd2bin((y_m_d >> 16) & 0xff)) - 1900;
+	tm->tm_mon	= bcd2bin((y_m_d >> 8) & 0xff) - 1;
+	tm->tm_mday	= bcd2bin(y_m_d & 0xff);
+	tm->tm_hour	= bcd2bin((h_m_s_ms >> 56) & 0xff);
+	tm->tm_min	= bcd2bin((h_m_s_ms >> 48) & 0xff);
+	tm->tm_sec	= bcd2bin((h_m_s_ms >> 40) & 0xff);
+	tm->tm_wday     = -1;
+}
+
+time64_t __init opal_get_boot_time(void)
+{
+	struct rtc_time tm;
+	u32 y_m_d;
+	u64 h_m_s_ms;
+	__be32 __y_m_d;
+	__be64 __h_m_s_ms;
+	long rc = OPAL_BUSY;
+
+	if (!opal_check_token(OPAL_RTC_READ))
+		return 0;
+
+	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+		rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms);
+		if (rc == OPAL_BUSY_EVENT) {
+			mdelay(OPAL_BUSY_DELAY_MS);
+			opal_poll_events(NULL);
+		} else if (rc == OPAL_BUSY) {
+			mdelay(OPAL_BUSY_DELAY_MS);
+		}
+	}
+	if (rc != OPAL_SUCCESS)
+		return 0;
+
+	y_m_d = be32_to_cpu(__y_m_d);
+	h_m_s_ms = be64_to_cpu(__h_m_s_ms);
+	opal_to_tm(y_m_d, h_m_s_ms, &tm);
+	return rtc_tm_to_time64(&tm);
+}
+
+static __init int opal_time_init(void)
+{
+	struct platform_device *pdev;
+	struct device_node *rtc;
+
+	rtc = of_find_node_by_path("/ibm,opal/rtc");
+	if (rtc) {
+		pdev = of_platform_device_create(rtc, "opal-rtc", NULL);
+		of_node_put(rtc);
+	} else {
+		if (opal_check_token(OPAL_RTC_READ) ||
+		    opal_check_token(OPAL_READ_TPO))
+			pdev = platform_device_register_simple("opal-rtc", -1,
+							       NULL, 0);
+		else
+			return -ENODEV;
+	}
+
+	return PTR_ERR_OR_ZERO(pdev);
+}
+machine_subsys_initcall(powernv, opal_time_init);
diff --git a/arch/powerpc/platforms/powernv/opal-secvar.c b/arch/powerpc/platforms/powernv/opal-secvar.c
new file mode 100644
index 000000000..14133e120
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-secvar.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PowerNV code for secure variables
+ *
+ * Copyright (C) 2019 IBM Corporation
+ * Author: Claudio Carvalho
+ *         Nayna Jain
+ *
+ * APIs to access secure variables managed by OPAL.
+ */
+
+#define pr_fmt(fmt) "secvar: "fmt
+
+#include <linux/types.h>
+#include <linux/platform_device.h>
+#include <linux/of_platform.h>
+#include <asm/opal.h>
+#include <asm/secvar.h>
+#include <asm/secure_boot.h>
+
+static int opal_status_to_err(int rc)
+{
+	int err;
+
+	switch (rc) {
+	case OPAL_SUCCESS:
+		err = 0;
+		break;
+	case OPAL_UNSUPPORTED:
+		err = -ENXIO;
+		break;
+	case OPAL_PARAMETER:
+		err = -EINVAL;
+		break;
+	case OPAL_RESOURCE:
+		err = -ENOSPC;
+		break;
+	case OPAL_HARDWARE:
+		err = -EIO;
+		break;
+	case OPAL_NO_MEM:
+		err = -ENOMEM;
+		break;
+	case OPAL_EMPTY:
+		err = -ENOENT;
+		break;
+	case OPAL_PARTIAL:
+		err = -EFBIG;
+		break;
+	default:
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
+static int opal_get_variable(const char *key, uint64_t ksize,
+			     u8 *data, uint64_t *dsize)
+{
+	int rc;
+
+	if (!key || !dsize)
+		return -EINVAL;
+
+	*dsize = cpu_to_be64(*dsize);
+
+	rc = opal_secvar_get(key, ksize, data, dsize);
+
+	*dsize = be64_to_cpu(*dsize);
+
+	return opal_status_to_err(rc);
+}
+
+static int opal_get_next_variable(const char *key, uint64_t *keylen,
+				  uint64_t keybufsize)
+{
+	int rc;
+
+	if (!key || !keylen)
+		return -EINVAL;
+
+	*keylen = cpu_to_be64(*keylen);
+
+	rc = opal_secvar_get_next(key, keylen, keybufsize);
+
+	*keylen = be64_to_cpu(*keylen);
+
+	return opal_status_to_err(rc);
+}
+
+static int opal_set_variable(const char *key, uint64_t ksize, u8 *data,
+			     uint64_t dsize)
+{
+	int rc;
+
+	if (!key || !data)
+		return -EINVAL;
+
+	rc = opal_secvar_enqueue_update(key, ksize, data, dsize);
+
+	return opal_status_to_err(rc);
+}
+
+static const struct secvar_operations opal_secvar_ops = {
+	.get = opal_get_variable,
+	.get_next = opal_get_next_variable,
+	.set = opal_set_variable,
+};
+
+static int opal_secvar_probe(struct platform_device *pdev)
+{
+	if (!opal_check_token(OPAL_SECVAR_GET)
+			|| !opal_check_token(OPAL_SECVAR_GET_NEXT)
+			|| !opal_check_token(OPAL_SECVAR_ENQUEUE_UPDATE)) {
+		pr_err("OPAL doesn't support secure variables\n");
+		return -ENODEV;
+	}
+
+	set_secvar_ops(&opal_secvar_ops);
+
+	return 0;
+}
+
+static const struct of_device_id opal_secvar_match[] = {
+	{ .compatible = "ibm,secvar-backend",},
+	{},
+};
+
+static struct platform_driver opal_secvar_driver = {
+	.driver = {
+		.name = "secvar",
+		.of_match_table = opal_secvar_match,
+	},
+};
+
+static int __init opal_secvar_init(void)
+{
+	return platform_driver_probe(&opal_secvar_driver, opal_secvar_probe);
+}
+device_initcall(opal_secvar_init);
diff --git a/arch/powerpc/platforms/powernv/opal-sensor-groups.c b/arch/powerpc/platforms/powernv/opal-sensor-groups.c
new file mode 100644
index 000000000..f8ae1fb0c
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-sensor-groups.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerNV OPAL Sensor-groups interface
+ *
+ * Copyright 2017 IBM Corp.
+ */
+
+#define pr_fmt(fmt)     "opal-sensor-groups: " fmt
+
+#include <linux/of.h>
+#include <linux/kobject.h>
+#include <linux/slab.h>
+
+#include <asm/opal.h>
+
+static DEFINE_MUTEX(sg_mutex);
+
+static struct kobject *sg_kobj;
+
+struct sg_attr {
+	u32 handle;
+	struct kobj_attribute attr;
+};
+
+static struct sensor_group {
+	char name[20];
+	struct attribute_group sg;
+	struct sg_attr *sgattrs;
+} *sgs;
+
+int sensor_group_enable(u32 handle, bool enable)
+{
+	struct opal_msg msg;
+	int token, ret;
+
+	token = opal_async_get_token_interruptible();
+	if (token < 0)
+		return token;
+
+	ret = opal_sensor_group_enable(handle, token, enable);
+	if (ret == OPAL_ASYNC_COMPLETION) {
+		ret = opal_async_wait_response(token, &msg);
+		if (ret) {
+			pr_devel("Failed to wait for the async response\n");
+			ret = -EIO;
+			goto out;
+		}
+		ret = opal_error_code(opal_get_async_rc(msg));
+	} else {
+		ret = opal_error_code(ret);
+	}
+
+out:
+	opal_async_release_token(token);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sensor_group_enable);
+
+static ssize_t sg_store(struct kobject *kobj, struct kobj_attribute *attr,
+			const char *buf, size_t count)
+{
+	struct sg_attr *sattr = container_of(attr, struct sg_attr, attr);
+	struct opal_msg msg;
+	u32 data;
+	int ret, token;
+
+	ret = kstrtoint(buf, 0, &data);
+	if (ret)
+		return ret;
+
+	if (data != 1)
+		return -EINVAL;
+
+	token = opal_async_get_token_interruptible();
+	if (token < 0) {
+		pr_devel("Failed to get token\n");
+		return token;
+	}
+
+	ret = mutex_lock_interruptible(&sg_mutex);
+	if (ret)
+		goto out_token;
+
+	ret = opal_sensor_group_clear(sattr->handle, token);
+	switch (ret) {
+	case OPAL_ASYNC_COMPLETION:
+		ret = opal_async_wait_response(token, &msg);
+		if (ret) {
+			pr_devel("Failed to wait for the async response\n");
+			ret = -EIO;
+			goto out;
+		}
+		ret = opal_error_code(opal_get_async_rc(msg));
+		if (!ret)
+			ret = count;
+		break;
+	case OPAL_SUCCESS:
+		ret = count;
+		break;
+	default:
+		ret = opal_error_code(ret);
+	}
+
+out:
+	mutex_unlock(&sg_mutex);
+out_token:
+	opal_async_release_token(token);
+	return ret;
+}
+
+static struct sg_ops_info {
+	int opal_no;
+	const char *attr_name;
+	ssize_t (*store)(struct kobject *kobj, struct kobj_attribute *attr,
+			const char *buf, size_t count);
+} ops_info[] = {
+	{ OPAL_SENSOR_GROUP_CLEAR, "clear", sg_store },
+};
+
+static void add_attr(int handle, struct sg_attr *attr, int index)
+{
+	attr->handle = handle;
+	sysfs_attr_init(&attr->attr.attr);
+	attr->attr.attr.name = ops_info[index].attr_name;
+	attr->attr.attr.mode = 0220;
+	attr->attr.store = ops_info[index].store;
+}
+
+static int add_attr_group(const __be32 *ops, int len, struct sensor_group *sg,
+			   u32 handle)
+{
+	int i, j;
+	int count = 0;
+
+	for (i = 0; i < len; i++)
+		for (j = 0; j < ARRAY_SIZE(ops_info); j++)
+			if (be32_to_cpu(ops[i]) == ops_info[j].opal_no) {
+				add_attr(handle, &sg->sgattrs[count], j);
+				sg->sg.attrs[count] =
+					&sg->sgattrs[count].attr.attr;
+				count++;
+			}
+
+	return sysfs_create_group(sg_kobj, &sg->sg);
+}
+
+static int get_nr_attrs(const __be32 *ops, int len)
+{
+	int i, j;
+	int nr_attrs = 0;
+
+	for (i = 0; i < len; i++)
+		for (j = 0; j < ARRAY_SIZE(ops_info); j++)
+			if (be32_to_cpu(ops[i]) == ops_info[j].opal_no)
+				nr_attrs++;
+
+	return nr_attrs;
+}
+
+void __init opal_sensor_groups_init(void)
+{
+	struct device_node *sg, *node;
+	int i = 0;
+
+	sg = of_find_compatible_node(NULL, NULL, "ibm,opal-sensor-group");
+	if (!sg) {
+		pr_devel("Sensor groups node not found\n");
+		return;
+	}
+
+	sgs = kcalloc(of_get_child_count(sg), sizeof(*sgs), GFP_KERNEL);
+	if (!sgs)
+		return;
+
+	sg_kobj = kobject_create_and_add("sensor_groups", opal_kobj);
+	if (!sg_kobj) {
+		pr_warn("Failed to create sensor group kobject\n");
+		goto out_sgs;
+	}
+
+	for_each_child_of_node(sg, node) {
+		const __be32 *ops;
+		u32 sgid, len, nr_attrs, chipid;
+
+		ops = of_get_property(node, "ops", &len);
+		if (!ops)
+			continue;
+
+		nr_attrs = get_nr_attrs(ops, len);
+		if (!nr_attrs)
+			continue;
+
+		sgs[i].sgattrs = kcalloc(nr_attrs, sizeof(*sgs[i].sgattrs),
+					 GFP_KERNEL);
+		if (!sgs[i].sgattrs)
+			goto out_sgs_sgattrs;
+
+		sgs[i].sg.attrs = kcalloc(nr_attrs + 1,
+					  sizeof(*sgs[i].sg.attrs),
+					  GFP_KERNEL);
+
+		if (!sgs[i].sg.attrs) {
+			kfree(sgs[i].sgattrs);
+			goto out_sgs_sgattrs;
+		}
+
+		if (of_property_read_u32(node, "sensor-group-id", &sgid)) {
+			pr_warn("sensor-group-id property not found\n");
+			goto out_sgs_sgattrs;
+		}
+
+		if (!of_property_read_u32(node, "ibm,chip-id", &chipid))
+			sprintf(sgs[i].name, "%pOFn%d", node, chipid);
+		else
+			sprintf(sgs[i].name, "%pOFn", node);
+
+		sgs[i].sg.name = sgs[i].name;
+		if (add_attr_group(ops, len, &sgs[i], sgid)) {
+			pr_warn("Failed to create sensor attribute group %s\n",
+				sgs[i].sg.name);
+			goto out_sgs_sgattrs;
+		}
+		i++;
+	}
+
+	return;
+
+out_sgs_sgattrs:
+	while (--i >= 0) {
+		kfree(sgs[i].sgattrs);
+		kfree(sgs[i].sg.attrs);
+	}
+	kobject_put(sg_kobj);
+out_sgs:
+	kfree(sgs);
+}
diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c
new file mode 100644
index 000000000..3192c614a
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-sensor.c
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerNV sensor code
+ *
+ * Copyright (C) 2013 IBM
+ */
+
+#include <linux/delay.h>
+#include <linux/of_platform.h>
+#include <asm/opal.h>
+#include <asm/machdep.h>
+
+/*
+ * This will return sensor information to driver based on the requested sensor
+ * handle. A handle is an opaque id for the powernv, read by the driver from the
+ * device tree..
+ */
+int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data)
+{
+	int ret, token;
+	struct opal_msg msg;
+	__be32 data;
+
+	token = opal_async_get_token_interruptible();
+	if (token < 0)
+		return token;
+
+	ret = opal_sensor_read(sensor_hndl, token, &data);
+	switch (ret) {
+	case OPAL_ASYNC_COMPLETION:
+		ret = opal_async_wait_response(token, &msg);
+		if (ret) {
+			pr_err("%s: Failed to wait for the async response, %d\n",
+			       __func__, ret);
+			goto out;
+		}
+
+		ret = opal_error_code(opal_get_async_rc(msg));
+		*sensor_data = be32_to_cpu(data);
+		break;
+
+	case OPAL_SUCCESS:
+		ret = 0;
+		*sensor_data = be32_to_cpu(data);
+		break;
+
+	case OPAL_WRONG_STATE:
+		ret = -EIO;
+		break;
+
+	default:
+		ret = opal_error_code(ret);
+		break;
+	}
+
+out:
+	opal_async_release_token(token);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(opal_get_sensor_data);
+
+int opal_get_sensor_data_u64(u32 sensor_hndl, u64 *sensor_data)
+{
+	int ret, token;
+	struct opal_msg msg;
+	__be64 data;
+
+	if (!opal_check_token(OPAL_SENSOR_READ_U64)) {
+		u32 sdata;
+
+		ret = opal_get_sensor_data(sensor_hndl, &sdata);
+		if (!ret)
+			*sensor_data = sdata;
+		return ret;
+	}
+
+	token = opal_async_get_token_interruptible();
+	if (token < 0)
+		return token;
+
+	ret = opal_sensor_read_u64(sensor_hndl, token, &data);
+	switch (ret) {
+	case OPAL_ASYNC_COMPLETION:
+		ret = opal_async_wait_response(token, &msg);
+		if (ret) {
+			pr_err("%s: Failed to wait for the async response, %d\n",
+			       __func__, ret);
+			goto out_token;
+		}
+
+		ret = opal_error_code(opal_get_async_rc(msg));
+		*sensor_data = be64_to_cpu(data);
+		break;
+
+	case OPAL_SUCCESS:
+		ret = 0;
+		*sensor_data = be64_to_cpu(data);
+		break;
+
+	case OPAL_WRONG_STATE:
+		ret = -EIO;
+		break;
+
+	default:
+		ret = opal_error_code(ret);
+		break;
+	}
+
+out_token:
+	opal_async_release_token(token);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(opal_get_sensor_data_u64);
+
+int __init opal_sensor_init(void)
+{
+	struct platform_device *pdev;
+	struct device_node *sensor;
+
+	sensor = of_find_node_by_path("/ibm,opal/sensors");
+	if (!sensor) {
+		pr_err("Opal node 'sensors' not found\n");
+		return -ENODEV;
+	}
+
+	pdev = of_platform_device_create(sensor, "opal-sensor", NULL);
+	of_node_put(sensor);
+
+	return PTR_ERR_OR_ZERO(pdev);
+}
diff --git a/arch/powerpc/platforms/powernv/opal-sysparam.c b/arch/powerpc/platforms/powernv/opal-sysparam.c
new file mode 100644
index 000000000..a12312afe
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-sysparam.c
@@ -0,0 +1,294 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerNV system parameter code
+ *
+ * Copyright (C) 2013 IBM
+ */
+
+#include <linux/kobject.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+#include <linux/gfp.h>
+#include <linux/stat.h>
+#include <asm/opal.h>
+
+#define MAX_PARAM_DATA_LEN	64
+
+static DEFINE_MUTEX(opal_sysparam_mutex);
+static struct kobject *sysparam_kobj;
+static void *param_data_buf;
+
+struct param_attr {
+	struct list_head list;
+	u32 param_id;
+	u32 param_size;
+	struct kobj_attribute kobj_attr;
+};
+
+static ssize_t opal_get_sys_param(u32 param_id, u32 length, void *buffer)
+{
+	struct opal_msg msg;
+	ssize_t ret;
+	int token;
+
+	token = opal_async_get_token_interruptible();
+	if (token < 0) {
+		if (token != -ERESTARTSYS)
+			pr_err("%s: Couldn't get the token, returning\n",
+					__func__);
+		ret = token;
+		goto out;
+	}
+
+	ret = opal_get_param(token, param_id, (u64)buffer, length);
+	if (ret != OPAL_ASYNC_COMPLETION) {
+		ret = opal_error_code(ret);
+		goto out_token;
+	}
+
+	ret = opal_async_wait_response(token, &msg);
+	if (ret) {
+		pr_err("%s: Failed to wait for the async response, %zd\n",
+				__func__, ret);
+		goto out_token;
+	}
+
+	ret = opal_error_code(opal_get_async_rc(msg));
+
+out_token:
+	opal_async_release_token(token);
+out:
+	return ret;
+}
+
+static int opal_set_sys_param(u32 param_id, u32 length, void *buffer)
+{
+	struct opal_msg msg;
+	int ret, token;
+
+	token = opal_async_get_token_interruptible();
+	if (token < 0) {
+		if (token != -ERESTARTSYS)
+			pr_err("%s: Couldn't get the token, returning\n",
+					__func__);
+		ret = token;
+		goto out;
+	}
+
+	ret = opal_set_param(token, param_id, (u64)buffer, length);
+
+	if (ret != OPAL_ASYNC_COMPLETION) {
+		ret = opal_error_code(ret);
+		goto out_token;
+	}
+
+	ret = opal_async_wait_response(token, &msg);
+	if (ret) {
+		pr_err("%s: Failed to wait for the async response, %d\n",
+				__func__, ret);
+		goto out_token;
+	}
+
+	ret = opal_error_code(opal_get_async_rc(msg));
+
+out_token:
+	opal_async_release_token(token);
+out:
+	return ret;
+}
+
+static ssize_t sys_param_show(struct kobject *kobj,
+		struct kobj_attribute *kobj_attr, char *buf)
+{
+	struct param_attr *attr = container_of(kobj_attr, struct param_attr,
+			kobj_attr);
+	ssize_t ret;
+
+	mutex_lock(&opal_sysparam_mutex);
+	ret = opal_get_sys_param(attr->param_id, attr->param_size,
+			param_data_buf);
+	if (ret)
+		goto out;
+
+	memcpy(buf, param_data_buf, attr->param_size);
+
+	ret = attr->param_size;
+out:
+	mutex_unlock(&opal_sysparam_mutex);
+	return ret;
+}
+
+static ssize_t sys_param_store(struct kobject *kobj,
+		struct kobj_attribute *kobj_attr, const char *buf, size_t count)
+{
+	struct param_attr *attr = container_of(kobj_attr, struct param_attr,
+			kobj_attr);
+	ssize_t ret;
+
+        /* MAX_PARAM_DATA_LEN is sizeof(param_data_buf) */
+        if (count > MAX_PARAM_DATA_LEN)
+                count = MAX_PARAM_DATA_LEN;
+
+	mutex_lock(&opal_sysparam_mutex);
+	memcpy(param_data_buf, buf, count);
+	ret = opal_set_sys_param(attr->param_id, attr->param_size,
+			param_data_buf);
+	mutex_unlock(&opal_sysparam_mutex);
+	if (!ret)
+		ret = count;
+	return ret;
+}
+
+void __init opal_sys_param_init(void)
+{
+	struct device_node *sysparam;
+	struct param_attr *attr;
+	u32 *id, *size;
+	int count, i;
+	u8 *perm;
+
+	if (!opal_kobj) {
+		pr_warn("SYSPARAM: opal kobject is not available\n");
+		goto out;
+	}
+
+	/* Some systems do not use sysparams; this is not an error */
+	sysparam = of_find_node_by_path("/ibm,opal/sysparams");
+	if (!sysparam)
+		goto out;
+
+	if (!of_device_is_compatible(sysparam, "ibm,opal-sysparams")) {
+		pr_err("SYSPARAM: Opal sysparam node not compatible\n");
+		goto out_node_put;
+	}
+
+	sysparam_kobj = kobject_create_and_add("sysparams", opal_kobj);
+	if (!sysparam_kobj) {
+		pr_err("SYSPARAM: Failed to create sysparam kobject\n");
+		goto out_node_put;
+	}
+
+	/* Allocate big enough buffer for any get/set transactions */
+	param_data_buf = kzalloc(MAX_PARAM_DATA_LEN, GFP_KERNEL);
+	if (!param_data_buf) {
+		pr_err("SYSPARAM: Failed to allocate memory for param data "
+				"buf\n");
+		goto out_kobj_put;
+	}
+
+	/* Number of parameters exposed through DT */
+	count = of_property_count_strings(sysparam, "param-name");
+	if (count < 0) {
+		pr_err("SYSPARAM: No string found of property param-name in "
+				"the node %pOFn\n", sysparam);
+		goto out_param_buf;
+	}
+
+	id = kcalloc(count, sizeof(*id), GFP_KERNEL);
+	if (!id) {
+		pr_err("SYSPARAM: Failed to allocate memory to read parameter "
+				"id\n");
+		goto out_param_buf;
+	}
+
+	size = kcalloc(count, sizeof(*size), GFP_KERNEL);
+	if (!size) {
+		pr_err("SYSPARAM: Failed to allocate memory to read parameter "
+				"size\n");
+		goto out_free_id;
+	}
+
+	perm = kcalloc(count, sizeof(*perm), GFP_KERNEL);
+	if (!perm) {
+		pr_err("SYSPARAM: Failed to allocate memory to read supported "
+				"action on the parameter");
+		goto out_free_size;
+	}
+
+	if (of_property_read_u32_array(sysparam, "param-id", id, count)) {
+		pr_err("SYSPARAM: Missing property param-id in the DT\n");
+		goto out_free_perm;
+	}
+
+	if (of_property_read_u32_array(sysparam, "param-len", size, count)) {
+		pr_err("SYSPARAM: Missing property param-len in the DT\n");
+		goto out_free_perm;
+	}
+
+
+	if (of_property_read_u8_array(sysparam, "param-perm", perm, count)) {
+		pr_err("SYSPARAM: Missing property param-perm in the DT\n");
+		goto out_free_perm;
+	}
+
+	attr = kcalloc(count, sizeof(*attr), GFP_KERNEL);
+	if (!attr) {
+		pr_err("SYSPARAM: Failed to allocate memory for parameter "
+				"attributes\n");
+		goto out_free_perm;
+	}
+
+	/* For each of the parameters, populate the parameter attributes */
+	for (i = 0; i < count; i++) {
+		if (size[i] > MAX_PARAM_DATA_LEN) {
+			pr_warn("SYSPARAM: Not creating parameter %d as size "
+				"exceeds buffer length\n", i);
+			continue;
+		}
+
+		sysfs_attr_init(&attr[i].kobj_attr.attr);
+		attr[i].param_id = id[i];
+		attr[i].param_size = size[i];
+		if (of_property_read_string_index(sysparam, "param-name", i,
+				&attr[i].kobj_attr.attr.name))
+			continue;
+
+		/* If the parameter is read-only or read-write */
+		switch (perm[i] & 3) {
+		case OPAL_SYSPARAM_READ:
+			attr[i].kobj_attr.attr.mode = 0444;
+			break;
+		case OPAL_SYSPARAM_WRITE:
+			attr[i].kobj_attr.attr.mode = 0200;
+			break;
+		case OPAL_SYSPARAM_RW:
+			attr[i].kobj_attr.attr.mode = 0644;
+			break;
+		default:
+			break;
+		}
+
+		attr[i].kobj_attr.show = sys_param_show;
+		attr[i].kobj_attr.store = sys_param_store;
+
+		if (sysfs_create_file(sysparam_kobj, &attr[i].kobj_attr.attr)) {
+			pr_err("SYSPARAM: Failed to create sysfs file %s\n",
+					attr[i].kobj_attr.attr.name);
+			goto out_free_attr;
+		}
+	}
+
+	kfree(perm);
+	kfree(size);
+	kfree(id);
+	of_node_put(sysparam);
+	return;
+
+out_free_attr:
+	kfree(attr);
+out_free_perm:
+	kfree(perm);
+out_free_size:
+	kfree(size);
+out_free_id:
+	kfree(id);
+out_param_buf:
+	kfree(param_data_buf);
+out_kobj_put:
+	kobject_put(sysparam_kobj);
+out_node_put:
+	of_node_put(sysparam);
+out:
+	return;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-tracepoints.c b/arch/powerpc/platforms/powernv/opal-tracepoints.c
new file mode 100644
index 000000000..f16a43540
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-tracepoints.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/percpu.h>
+#include <linux/jump_label.h>
+#include <asm/trace.h>
+#include <asm/asm-prototypes.h>
+
+#ifdef CONFIG_JUMP_LABEL
+struct static_key opal_tracepoint_key = STATIC_KEY_INIT;
+
+int opal_tracepoint_regfunc(void)
+{
+	static_key_slow_inc(&opal_tracepoint_key);
+	return 0;
+}
+
+void opal_tracepoint_unregfunc(void)
+{
+	static_key_slow_dec(&opal_tracepoint_key);
+}
+#else
+/*
+ * We optimise OPAL calls by placing opal_tracepoint_refcount
+ * directly in the TOC so we can check if the opal tracepoints are
+ * enabled via a single load.
+ */
+
+/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
+extern long opal_tracepoint_refcount;
+
+int opal_tracepoint_regfunc(void)
+{
+	opal_tracepoint_refcount++;
+	return 0;
+}
+
+void opal_tracepoint_unregfunc(void)
+{
+	opal_tracepoint_refcount--;
+}
+#endif
+
+/*
+ * Since the tracing code might execute OPAL calls we need to guard against
+ * recursion.
+ */
+static DEFINE_PER_CPU(unsigned int, opal_trace_depth);
+
+void __trace_opal_entry(unsigned long opcode, unsigned long *args)
+{
+	unsigned long flags;
+	unsigned int *depth;
+
+	local_irq_save(flags);
+
+	depth = this_cpu_ptr(&opal_trace_depth);
+
+	if (*depth)
+		goto out;
+
+	(*depth)++;
+	preempt_disable();
+	trace_opal_entry(opcode, args);
+	(*depth)--;
+
+out:
+	local_irq_restore(flags);
+}
+
+void __trace_opal_exit(long opcode, unsigned long retval)
+{
+	unsigned long flags;
+	unsigned int *depth;
+
+	local_irq_save(flags);
+
+	depth = this_cpu_ptr(&opal_trace_depth);
+
+	if (*depth)
+		goto out;
+
+	(*depth)++;
+	trace_opal_exit(opcode, retval);
+	preempt_enable();
+	(*depth)--;
+
+out:
+	local_irq_restore(flags);
+}
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
new file mode 100644
index 000000000..e5acc33b3
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * PowerNV OPAL API wrappers
+ *
+ * Copyright 2011 IBM Corp.
+ */
+
+#include <linux/jump_label.h>
+#include <asm/ppc_asm.h>
+#include <asm/hvcall.h>
+#include <asm/asm-offsets.h>
+#include <asm/opal.h>
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
+
+	.section ".text"
+
+/*
+ * r3-r10		- OPAL call arguments
+ * STK_PARAM(R11)	- OPAL opcode
+ * STK_PARAM(R12)	- MSR to restore
+ */
+_GLOBAL_TOC(__opal_call)
+	mflr	r0
+	std	r0,PPC_LR_STKOFF(r1)
+	ld	r12,STK_PARAM(R12)(r1)
+	li	r0,MSR_IR|MSR_DR|MSR_LE
+	andc	r12,r12,r0
+	LOAD_REG_ADDR(r11, opal_return)
+	mtlr	r11
+	LOAD_REG_ADDR(r11, opal)
+	ld	r2,0(r11)
+	ld	r11,8(r11)
+	mtspr	SPRN_HSRR0,r11
+	mtspr	SPRN_HSRR1,r12
+	/* set token to r0 */
+	ld	r0,STK_PARAM(R11)(r1)
+	hrfid
+opal_return:
+	/*
+	 * Restore MSR on OPAL return. The MSR is set to big-endian.
+	 */
+#ifdef __BIG_ENDIAN__
+	ld	r11,STK_PARAM(R12)(r1)
+	mtmsrd	r11
+#else
+	/* Endian can only be switched with rfi, must byte reverse MSR load */
+	.short 0x4039	 /* li r10,STK_PARAM(R12)		*/
+	.byte (STK_PARAM(R12) >> 8) & 0xff
+	.byte STK_PARAM(R12) & 0xff
+
+	.long 0x280c6a7d /* ldbrx r11,r10,r1			*/
+	.long 0x05009f42 /* bcl 20,31,$+4			*/
+	.long 0xa602487d /* mflr r10				*/
+	.long 0x14004a39 /* addi r10,r10,20			*/
+	.long 0xa64b5a7d /* mthsrr0 r10				*/
+	.long 0xa64b7b7d /* mthsrr1 r11				*/
+	.long 0x2402004c /* hrfid				*/
+#endif
+	ld	r2,PACATOC(r13)
+	ld	r0,PPC_LR_STKOFF(r1)
+	mtlr	r0
+	blr
diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
new file mode 100644
index 000000000..d5814c504
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerNV SCOM bus debugfs interface
+ *
+ * Copyright 2010 Benjamin Herrenschmidt, IBM Corp
+ *                <benh@kernel.crashing.org>
+ *     and        David Gibson, IBM Corporation.
+ * Copyright 2013 IBM Corp.
+ */
+
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/bug.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/opal.h>
+#include <asm/debugfs.h>
+#include <asm/prom.h>
+
+static u64 opal_scom_unmangle(u64 addr)
+{
+	u64 tmp;
+
+	/*
+	 * XSCOM addresses use the top nibble to set indirect mode and
+	 * its form.  Bits 4-11 are always 0.
+	 *
+	 * Because the debugfs interface uses signed offsets and shifts
+	 * the address left by 3, we basically cannot use the top 4 bits
+	 * of the 64-bit address, and thus cannot use the indirect bit.
+	 *
+	 * To deal with that, we support the indirect bits being in
+	 * bits 4-7 (IBM notation) instead of bit 0-3 in this API, we
+	 * do the conversion here.
+	 *
+	 * For in-kernel use, we don't need to do this mangling.  In
+	 * kernel won't have bits 4-7 set.
+	 *
+	 * So:
+	 *   debugfs will always   set 0-3 = 0 and clear 4-7
+	 *    kernel will always clear 0-3 = 0 and   set 4-7
+	 */
+	tmp = addr;
+	tmp  &= 0x0f00000000000000;
+	addr &= 0xf0ffffffffffffff;
+	addr |= tmp << 4;
+
+	return addr;
+}
+
+static int opal_scom_read(uint32_t chip, uint64_t addr, u64 reg, u64 *value)
+{
+	int64_t rc;
+	__be64 v;
+
+	reg = opal_scom_unmangle(addr + reg);
+	rc = opal_xscom_read(chip, reg, (__be64 *)__pa(&v));
+	if (rc) {
+		*value = 0xfffffffffffffffful;
+		return -EIO;
+	}
+	*value = be64_to_cpu(v);
+	return 0;
+}
+
+static int opal_scom_write(uint32_t chip, uint64_t addr, u64 reg, u64 value)
+{
+	int64_t rc;
+
+	reg = opal_scom_unmangle(addr + reg);
+	rc = opal_xscom_write(chip, reg, value);
+	if (rc)
+		return -EIO;
+	return 0;
+}
+
+struct scom_debug_entry {
+	u32 chip;
+	struct debugfs_blob_wrapper path;
+	char name[16];
+};
+
+static ssize_t scom_debug_read(struct file *filp, char __user *ubuf,
+			       size_t count, loff_t *ppos)
+{
+	struct scom_debug_entry *ent = filp->private_data;
+	u64 __user *ubuf64 = (u64 __user *)ubuf;
+	loff_t off = *ppos;
+	ssize_t done = 0;
+	u64 reg, reg_base, reg_cnt, val;
+	int rc;
+
+	if (off < 0 || (off & 7) || (count & 7))
+		return -EINVAL;
+	reg_base = off >> 3;
+	reg_cnt = count >> 3;
+
+	for (reg = 0; reg < reg_cnt; reg++) {
+		rc = opal_scom_read(ent->chip, reg_base, reg, &val);
+		if (!rc)
+			rc = put_user(val, ubuf64);
+		if (rc) {
+			if (!done)
+				done = rc;
+			break;
+		}
+		ubuf64++;
+		*ppos += 8;
+		done += 8;
+	}
+	return done;
+}
+
+static ssize_t scom_debug_write(struct file *filp, const char __user *ubuf,
+				size_t count, loff_t *ppos)
+{
+	struct scom_debug_entry *ent = filp->private_data;
+	u64 __user *ubuf64 = (u64 __user *)ubuf;
+	loff_t off = *ppos;
+	ssize_t done = 0;
+	u64 reg, reg_base, reg_cnt, val;
+	int rc;
+
+	if (off < 0 || (off & 7) || (count & 7))
+		return -EINVAL;
+	reg_base = off >> 3;
+	reg_cnt = count >> 3;
+
+	for (reg = 0; reg < reg_cnt; reg++) {
+		rc = get_user(val, ubuf64);
+		if (!rc)
+			rc = opal_scom_write(ent->chip, reg_base, reg,  val);
+		if (rc) {
+			if (!done)
+				done = rc;
+			break;
+		}
+		ubuf64++;
+		done += 8;
+	}
+	return done;
+}
+
+static const struct file_operations scom_debug_fops = {
+	.read =		scom_debug_read,
+	.write =	scom_debug_write,
+	.open =		simple_open,
+	.llseek =	default_llseek,
+};
+
+static int scom_debug_init_one(struct dentry *root, struct device_node *dn,
+			       int chip)
+{
+	struct scom_debug_entry *ent;
+	struct dentry *dir;
+
+	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
+	if (!ent)
+		return -ENOMEM;
+
+	ent->chip = chip;
+	snprintf(ent->name, 16, "%08x", chip);
+	ent->path.data = (void *)kasprintf(GFP_KERNEL, "%pOF", dn);
+	if (!ent->path.data) {
+		kfree(ent);
+		return -ENOMEM;
+	}
+
+	ent->path.size = strlen((char *)ent->path.data);
+
+	dir = debugfs_create_dir(ent->name, root);
+	if (!dir) {
+		kfree(ent->path.data);
+		kfree(ent);
+		return -1;
+	}
+
+	debugfs_create_blob("devspec", 0400, dir, &ent->path);
+	debugfs_create_file("access", 0600, dir, ent, &scom_debug_fops);
+
+	return 0;
+}
+
+static int scom_debug_init(void)
+{
+	struct device_node *dn;
+	struct dentry *root;
+	int chip, rc;
+
+	if (!firmware_has_feature(FW_FEATURE_OPAL))
+		return 0;
+
+	root = debugfs_create_dir("scom", powerpc_debugfs_root);
+	if (!root)
+		return -1;
+
+	rc = 0;
+	for_each_node_with_property(dn, "scom-controller") {
+		chip = of_get_ibm_chip_id(dn);
+		WARN_ON(chip == -1);
+		rc |= scom_debug_init_one(root, dn, chip);
+	}
+
+	return rc;
+}
+device_initcall(scom_debug_init);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
new file mode 100644
index 000000000..1d05c168c
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -0,0 +1,1249 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerNV OPAL high level interfaces
+ *
+ * Copyright 2011 IBM Corp.
+ */
+
+#define pr_fmt(fmt)	"opal: " fmt
+
+#include <linux/printk.h>
+#include <linux/types.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_platform.h>
+#include <linux/of_address.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/kobject.h>
+#include <linux/delay.h>
+#include <linux/memblock.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/kmsg_dump.h>
+#include <linux/console.h>
+#include <linux/sched/debug.h>
+
+#include <asm/machdep.h>
+#include <asm/opal.h>
+#include <asm/firmware.h>
+#include <asm/mce.h>
+#include <asm/imc-pmu.h>
+#include <asm/bug.h>
+
+#include "powernv.h"
+
+#define OPAL_MSG_QUEUE_MAX 16
+
+struct opal_msg_node {
+	struct list_head	list;
+	struct opal_msg		msg;
+};
+
+static DEFINE_SPINLOCK(msg_list_lock);
+static LIST_HEAD(msg_list);
+
+/* /sys/firmware/opal */
+struct kobject *opal_kobj;
+
+struct opal {
+	u64 base;
+	u64 entry;
+	u64 size;
+} opal;
+
+struct mcheck_recoverable_range {
+	u64 start_addr;
+	u64 end_addr;
+	u64 recover_addr;
+};
+
+static int msg_list_size;
+
+static struct mcheck_recoverable_range *mc_recoverable_range;
+static int mc_recoverable_range_len;
+
+struct device_node *opal_node;
+static DEFINE_SPINLOCK(opal_write_lock);
+static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX];
+static uint32_t opal_heartbeat;
+static struct task_struct *kopald_tsk;
+static struct opal_msg *opal_msg;
+static u32 opal_msg_size __ro_after_init;
+
+void opal_configure_cores(void)
+{
+	u64 reinit_flags = 0;
+
+	/* Do the actual re-init, This will clobber all FPRs, VRs, etc...
+	 *
+	 * It will preserve non volatile GPRs and HSPRG0/1. It will
+	 * also restore HIDs and other SPRs to their original value
+	 * but it might clobber a bunch.
+	 */
+#ifdef __BIG_ENDIAN__
+	reinit_flags |= OPAL_REINIT_CPUS_HILE_BE;
+#else
+	reinit_flags |= OPAL_REINIT_CPUS_HILE_LE;
+#endif
+
+	/*
+	 * POWER9 always support running hash:
+	 *  ie. Host hash  supports  hash guests
+	 *      Host radix supports  hash/radix guests
+	 */
+	if (early_cpu_has_feature(CPU_FTR_ARCH_300)) {
+		reinit_flags |= OPAL_REINIT_CPUS_MMU_HASH;
+		if (early_radix_enabled())
+			reinit_flags |= OPAL_REINIT_CPUS_MMU_RADIX;
+	}
+
+	opal_reinit_cpus(reinit_flags);
+
+	/* Restore some bits */
+	if (cur_cpu_spec->cpu_restore)
+		cur_cpu_spec->cpu_restore();
+}
+
+int __init early_init_dt_scan_opal(unsigned long node,
+				   const char *uname, int depth, void *data)
+{
+	const void *basep, *entryp, *sizep;
+	int basesz, entrysz, runtimesz;
+
+	if (depth != 1 || strcmp(uname, "ibm,opal") != 0)
+		return 0;
+
+	basep  = of_get_flat_dt_prop(node, "opal-base-address", &basesz);
+	entryp = of_get_flat_dt_prop(node, "opal-entry-address", &entrysz);
+	sizep = of_get_flat_dt_prop(node, "opal-runtime-size", &runtimesz);
+
+	if (!basep || !entryp || !sizep)
+		return 1;
+
+	opal.base = of_read_number(basep, basesz/4);
+	opal.entry = of_read_number(entryp, entrysz/4);
+	opal.size = of_read_number(sizep, runtimesz/4);
+
+	pr_debug("OPAL Base  = 0x%llx (basep=%p basesz=%d)\n",
+		 opal.base, basep, basesz);
+	pr_debug("OPAL Entry = 0x%llx (entryp=%p basesz=%d)\n",
+		 opal.entry, entryp, entrysz);
+	pr_debug("OPAL Entry = 0x%llx (sizep=%p runtimesz=%d)\n",
+		 opal.size, sizep, runtimesz);
+
+	if (of_flat_dt_is_compatible(node, "ibm,opal-v3")) {
+		powerpc_firmware_features |= FW_FEATURE_OPAL;
+		pr_debug("OPAL detected !\n");
+	} else {
+		panic("OPAL != V3 detected, no longer supported.\n");
+	}
+
+	return 1;
+}
+
+int __init early_init_dt_scan_recoverable_ranges(unsigned long node,
+				   const char *uname, int depth, void *data)
+{
+	int i, psize, size;
+	const __be32 *prop;
+
+	if (depth != 1 || strcmp(uname, "ibm,opal") != 0)
+		return 0;
+
+	prop = of_get_flat_dt_prop(node, "mcheck-recoverable-ranges", &psize);
+
+	if (!prop)
+		return 1;
+
+	pr_debug("Found machine check recoverable ranges.\n");
+
+	/*
+	 * Calculate number of available entries.
+	 *
+	 * Each recoverable address range entry is (start address, len,
+	 * recovery address), 2 cells each for start and recovery address,
+	 * 1 cell for len, totalling 5 cells per entry.
+	 */
+	mc_recoverable_range_len = psize / (sizeof(*prop) * 5);
+
+	/* Sanity check */
+	if (!mc_recoverable_range_len)
+		return 1;
+
+	/* Size required to hold all the entries. */
+	size = mc_recoverable_range_len *
+			sizeof(struct mcheck_recoverable_range);
+
+	/*
+	 * Allocate a buffer to hold the MC recoverable ranges.
+	 */
+	mc_recoverable_range = memblock_alloc(size, __alignof__(u64));
+	if (!mc_recoverable_range)
+		panic("%s: Failed to allocate %u bytes align=0x%lx\n",
+		      __func__, size, __alignof__(u64));
+
+	for (i = 0; i < mc_recoverable_range_len; i++) {
+		mc_recoverable_range[i].start_addr =
+					of_read_number(prop + (i * 5) + 0, 2);
+		mc_recoverable_range[i].end_addr =
+					mc_recoverable_range[i].start_addr +
+					of_read_number(prop + (i * 5) + 2, 1);
+		mc_recoverable_range[i].recover_addr =
+					of_read_number(prop + (i * 5) + 3, 2);
+
+		pr_debug("Machine check recoverable range: %llx..%llx: %llx\n",
+				mc_recoverable_range[i].start_addr,
+				mc_recoverable_range[i].end_addr,
+				mc_recoverable_range[i].recover_addr);
+	}
+	return 1;
+}
+
+static int __init opal_register_exception_handlers(void)
+{
+#ifdef __BIG_ENDIAN__
+	u64 glue;
+
+	if (!(powerpc_firmware_features & FW_FEATURE_OPAL))
+		return -ENODEV;
+
+	/* Hookup some exception handlers except machine check. We use the
+	 * fwnmi area at 0x7000 to provide the glue space to OPAL
+	 */
+	glue = 0x7000;
+
+	/*
+	 * Only ancient OPAL firmware requires this.
+	 * Specifically, firmware from FW810.00 (released June 2014)
+	 * through FW810.20 (Released October 2014).
+	 *
+	 * Check if we are running on newer (post Oct 2014) firmware that
+	 * exports the OPAL_HANDLE_HMI token. If yes, then don't ask OPAL to
+	 * patch the HMI interrupt and we catch it directly in Linux.
+	 *
+	 * For older firmware (i.e < FW810.20), we fallback to old behavior and
+	 * let OPAL patch the HMI vector and handle it inside OPAL firmware.
+	 *
+	 * For newer firmware we catch/handle the HMI directly in Linux.
+	 */
+	if (!opal_check_token(OPAL_HANDLE_HMI)) {
+		pr_info("Old firmware detected, OPAL handles HMIs.\n");
+		opal_register_exception_handler(
+				OPAL_HYPERVISOR_MAINTENANCE_HANDLER,
+				0, glue);
+		glue += 128;
+	}
+
+	/*
+	 * Only applicable to ancient firmware, all modern
+	 * (post March 2015/skiboot 5.0) firmware will just return
+	 * OPAL_UNSUPPORTED.
+	 */
+	opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue);
+#endif
+
+	return 0;
+}
+machine_early_initcall(powernv, opal_register_exception_handlers);
+
+static void queue_replay_msg(void *msg)
+{
+	struct opal_msg_node *msg_node;
+
+	if (msg_list_size < OPAL_MSG_QUEUE_MAX) {
+		msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
+		if (msg_node) {
+			INIT_LIST_HEAD(&msg_node->list);
+			memcpy(&msg_node->msg, msg, sizeof(struct opal_msg));
+			list_add_tail(&msg_node->list, &msg_list);
+			msg_list_size++;
+		} else
+			pr_warn_once("message queue no memory\n");
+
+		if (msg_list_size >= OPAL_MSG_QUEUE_MAX)
+			pr_warn_once("message queue full\n");
+	}
+}
+
+static void dequeue_replay_msg(enum opal_msg_type msg_type)
+{
+	struct opal_msg_node *msg_node, *tmp;
+
+	list_for_each_entry_safe(msg_node, tmp, &msg_list, list) {
+		if (be32_to_cpu(msg_node->msg.msg_type) != msg_type)
+			continue;
+
+		atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type],
+					msg_type,
+					&msg_node->msg);
+
+		list_del(&msg_node->list);
+		kfree(msg_node);
+		msg_list_size--;
+	}
+}
+
+/*
+ * Opal message notifier based on message type. Allow subscribers to get
+ * notified for specific messgae type.
+ */
+int opal_message_notifier_register(enum opal_msg_type msg_type,
+					struct notifier_block *nb)
+{
+	int ret;
+	unsigned long flags;
+
+	if (!nb || msg_type >= OPAL_MSG_TYPE_MAX) {
+		pr_warn("%s: Invalid arguments, msg_type:%d\n",
+			__func__, msg_type);
+		return -EINVAL;
+	}
+
+	spin_lock_irqsave(&msg_list_lock, flags);
+	ret = atomic_notifier_chain_register(
+		&opal_msg_notifier_head[msg_type], nb);
+
+	/*
+	 * If the registration succeeded, replay any queued messages that came
+	 * in prior to the notifier chain registration. msg_list_lock held here
+	 * to ensure they're delivered prior to any subsequent messages.
+	 */
+	if (ret == 0)
+		dequeue_replay_msg(msg_type);
+
+	spin_unlock_irqrestore(&msg_list_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(opal_message_notifier_register);
+
+int opal_message_notifier_unregister(enum opal_msg_type msg_type,
+				     struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(
+			&opal_msg_notifier_head[msg_type], nb);
+}
+EXPORT_SYMBOL_GPL(opal_message_notifier_unregister);
+
+static void opal_message_do_notify(uint32_t msg_type, void *msg)
+{
+	unsigned long flags;
+	bool queued = false;
+
+	spin_lock_irqsave(&msg_list_lock, flags);
+	if (opal_msg_notifier_head[msg_type].head == NULL) {
+		/*
+		 * Queue up the msg since no notifiers have registered
+		 * yet for this msg_type.
+		 */
+		queue_replay_msg(msg);
+		queued = true;
+	}
+	spin_unlock_irqrestore(&msg_list_lock, flags);
+
+	if (queued)
+		return;
+
+	/* notify subscribers */
+	atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type],
+					msg_type, msg);
+}
+
+static void opal_handle_message(void)
+{
+	s64 ret;
+	u32 type;
+
+	ret = opal_get_msg(__pa(opal_msg), opal_msg_size);
+	/* No opal message pending. */
+	if (ret == OPAL_RESOURCE)
+		return;
+
+	/* check for errors. */
+	if (ret) {
+		pr_warn("%s: Failed to retrieve opal message, err=%lld\n",
+			__func__, ret);
+		return;
+	}
+
+	type = be32_to_cpu(opal_msg->msg_type);
+
+	/* Sanity check */
+	if (type >= OPAL_MSG_TYPE_MAX) {
+		pr_warn_once("%s: Unknown message type: %u\n", __func__, type);
+		return;
+	}
+	opal_message_do_notify(type, (void *)opal_msg);
+}
+
+static irqreturn_t opal_message_notify(int irq, void *data)
+{
+	opal_handle_message();
+	return IRQ_HANDLED;
+}
+
+static int __init opal_message_init(struct device_node *opal_node)
+{
+	int ret, i, irq;
+
+	ret = of_property_read_u32(opal_node, "opal-msg-size", &opal_msg_size);
+	if (ret) {
+		pr_notice("Failed to read opal-msg-size property\n");
+		opal_msg_size = sizeof(struct opal_msg);
+	}
+
+	opal_msg = kmalloc(opal_msg_size, GFP_KERNEL);
+	if (!opal_msg) {
+		opal_msg_size = sizeof(struct opal_msg);
+		/* Try to allocate fixed message size */
+		opal_msg = kmalloc(opal_msg_size, GFP_KERNEL);
+		BUG_ON(opal_msg == NULL);
+	}
+
+	for (i = 0; i < OPAL_MSG_TYPE_MAX; i++)
+		ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]);
+
+	irq = opal_event_request(ilog2(OPAL_EVENT_MSG_PENDING));
+	if (!irq) {
+		pr_err("%s: Can't register OPAL event irq (%d)\n",
+		       __func__, irq);
+		return irq;
+	}
+
+	ret = request_irq(irq, opal_message_notify,
+			IRQ_TYPE_LEVEL_HIGH, "opal-msg", NULL);
+	if (ret) {
+		pr_err("%s: Can't request OPAL event irq (%d)\n",
+		       __func__, ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int opal_get_chars(uint32_t vtermno, char *buf, int count)
+{
+	s64 rc;
+	__be64 evt, len;
+
+	if (!opal.entry)
+		return -ENODEV;
+	opal_poll_events(&evt);
+	if ((be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_INPUT) == 0)
+		return 0;
+	len = cpu_to_be64(count);
+	rc = opal_console_read(vtermno, &len, buf);
+	if (rc == OPAL_SUCCESS)
+		return be64_to_cpu(len);
+	return 0;
+}
+
+static int __opal_put_chars(uint32_t vtermno, const char *data, int total_len, bool atomic)
+{
+	unsigned long flags = 0 /* shut up gcc */;
+	int written;
+	__be64 olen;
+	s64 rc;
+
+	if (!opal.entry)
+		return -ENODEV;
+
+	if (atomic)
+		spin_lock_irqsave(&opal_write_lock, flags);
+	rc = opal_console_write_buffer_space(vtermno, &olen);
+	if (rc || be64_to_cpu(olen) < total_len) {
+		/* Closed -> drop characters */
+		if (rc)
+			written = total_len;
+		else
+			written = -EAGAIN;
+		goto out;
+	}
+
+	/* Should not get a partial write here because space is available. */
+	olen = cpu_to_be64(total_len);
+	rc = opal_console_write(vtermno, &olen, data);
+	if (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+		if (rc == OPAL_BUSY_EVENT)
+			opal_poll_events(NULL);
+		written = -EAGAIN;
+		goto out;
+	}
+
+	/* Closed or other error drop */
+	if (rc != OPAL_SUCCESS) {
+		written = opal_error_code(rc);
+		goto out;
+	}
+
+	written = be64_to_cpu(olen);
+	if (written < total_len) {
+		if (atomic) {
+			/* Should not happen */
+			pr_warn("atomic console write returned partial "
+				"len=%d written=%d\n", total_len, written);
+		}
+		if (!written)
+			written = -EAGAIN;
+	}
+
+out:
+	if (atomic)
+		spin_unlock_irqrestore(&opal_write_lock, flags);
+
+	return written;
+}
+
+int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
+{
+	return __opal_put_chars(vtermno, data, total_len, false);
+}
+
+/*
+ * opal_put_chars_atomic will not perform partial-writes. Data will be
+ * atomically written to the terminal or not at all. This is not strictly
+ * true at the moment because console space can race with OPAL's console
+ * writes.
+ */
+int opal_put_chars_atomic(uint32_t vtermno, const char *data, int total_len)
+{
+	return __opal_put_chars(vtermno, data, total_len, true);
+}
+
+static s64 __opal_flush_console(uint32_t vtermno)
+{
+	s64 rc;
+
+	if (!opal_check_token(OPAL_CONSOLE_FLUSH)) {
+		__be64 evt;
+
+		/*
+		 * If OPAL_CONSOLE_FLUSH is not implemented in the firmware,
+		 * the console can still be flushed by calling the polling
+		 * function while it has OPAL_EVENT_CONSOLE_OUTPUT events.
+		 */
+		WARN_ONCE(1, "opal: OPAL_CONSOLE_FLUSH missing.\n");
+
+		opal_poll_events(&evt);
+		if (!(be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT))
+			return OPAL_SUCCESS;
+		return OPAL_BUSY;
+
+	} else {
+		rc = opal_console_flush(vtermno);
+		if (rc == OPAL_BUSY_EVENT) {
+			opal_poll_events(NULL);
+			rc = OPAL_BUSY;
+		}
+		return rc;
+	}
+
+}
+
+/*
+ * opal_flush_console spins until the console is flushed
+ */
+int opal_flush_console(uint32_t vtermno)
+{
+	for (;;) {
+		s64 rc = __opal_flush_console(vtermno);
+
+		if (rc == OPAL_BUSY || rc == OPAL_PARTIAL) {
+			mdelay(1);
+			continue;
+		}
+
+		return opal_error_code(rc);
+	}
+}
+
+/*
+ * opal_flush_chars is an hvc interface that sleeps until the console is
+ * flushed if wait, otherwise it will return -EBUSY if the console has data,
+ * -EAGAIN if it has data and some of it was flushed.
+ */
+int opal_flush_chars(uint32_t vtermno, bool wait)
+{
+	for (;;) {
+		s64 rc = __opal_flush_console(vtermno);
+
+		if (rc == OPAL_BUSY || rc == OPAL_PARTIAL) {
+			if (wait) {
+				msleep(OPAL_BUSY_DELAY_MS);
+				continue;
+			}
+			if (rc == OPAL_PARTIAL)
+				return -EAGAIN;
+		}
+
+		return opal_error_code(rc);
+	}
+}
+
+static int opal_recover_mce(struct pt_regs *regs,
+					struct machine_check_event *evt)
+{
+	int recovered = 0;
+
+	if (!(regs->msr & MSR_RI)) {
+		/* If MSR_RI isn't set, we cannot recover */
+		pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
+		recovered = 0;
+	} else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
+		/* Platform corrected itself */
+		recovered = 1;
+	} else if (evt->severity == MCE_SEV_FATAL) {
+		/* Fatal machine check */
+		pr_err("Machine check interrupt is fatal\n");
+		recovered = 0;
+	}
+
+	if (!recovered && evt->sync_error) {
+		/*
+		 * Try to kill processes if we get a synchronous machine check
+		 * (e.g., one caused by execution of this instruction). This
+		 * will devolve into a panic if we try to kill init or are in
+		 * an interrupt etc.
+		 *
+		 * TODO: Queue up this address for hwpoisioning later.
+		 * TODO: This is not quite right for d-side machine
+		 *       checks ->nip is not necessarily the important
+		 *       address.
+		 */
+		if ((user_mode(regs))) {
+			_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+			recovered = 1;
+		} else if (die_will_crash()) {
+			/*
+			 * die() would kill the kernel, so better to go via
+			 * the platform reboot code that will log the
+			 * machine check.
+			 */
+			recovered = 0;
+		} else {
+			die("Machine check", regs, SIGBUS);
+			recovered = 1;
+		}
+	}
+
+	return recovered;
+}
+
+void __noreturn pnv_platform_error_reboot(struct pt_regs *regs, const char *msg)
+{
+	panic_flush_kmsg_start();
+
+	pr_emerg("Hardware platform error: %s\n", msg);
+	if (regs)
+		show_regs(regs);
+	smp_send_stop();
+
+	panic_flush_kmsg_end();
+
+	/*
+	 * Don't bother to shut things down because this will
+	 * xstop the system.
+	 */
+	if (opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, msg)
+						== OPAL_UNSUPPORTED) {
+		pr_emerg("Reboot type %d not supported for %s\n",
+				OPAL_REBOOT_PLATFORM_ERROR, msg);
+	}
+
+	/*
+	 * We reached here. There can be three possibilities:
+	 * 1. We are running on a firmware level that do not support
+	 *    opal_cec_reboot2()
+	 * 2. We are running on a firmware level that do not support
+	 *    OPAL_REBOOT_PLATFORM_ERROR reboot type.
+	 * 3. We are running on FSP based system that does not need
+	 *    opal to trigger checkstop explicitly for error analysis.
+	 *    The FSP PRD component would have already got notified
+	 *    about this error through other channels.
+	 * 4. We are running on a newer skiboot that by default does
+	 *    not cause a checkstop, drops us back to the kernel to
+	 *    extract context and state at the time of the error.
+	 */
+
+	panic(msg);
+}
+
+int opal_machine_check(struct pt_regs *regs)
+{
+	struct machine_check_event evt;
+
+	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
+		return 0;
+
+	/* Print things out */
+	if (evt.version != MCE_V1) {
+		pr_err("Machine Check Exception, Unknown event version %d !\n",
+		       evt.version);
+		return 0;
+	}
+	machine_check_print_event_info(&evt, user_mode(regs), false);
+
+	if (opal_recover_mce(regs, &evt))
+		return 1;
+
+	pnv_platform_error_reboot(regs, "Unrecoverable Machine Check exception");
+}
+
+/* Early hmi handler called in real mode. */
+int opal_hmi_exception_early(struct pt_regs *regs)
+{
+	s64 rc;
+
+	/*
+	 * call opal hmi handler. Pass paca address as token.
+	 * The return value OPAL_SUCCESS is an indication that there is
+	 * an HMI event generated waiting to pull by Linux.
+	 */
+	rc = opal_handle_hmi();
+	if (rc == OPAL_SUCCESS) {
+		local_paca->hmi_event_available = 1;
+		return 1;
+	}
+	return 0;
+}
+
+int opal_hmi_exception_early2(struct pt_regs *regs)
+{
+	s64 rc;
+	__be64 out_flags;
+
+	/*
+	 * call opal hmi handler.
+	 * Check 64-bit flag mask to find out if an event was generated,
+	 * and whether TB is still valid or not etc.
+	 */
+	rc = opal_handle_hmi2(&out_flags);
+	if (rc != OPAL_SUCCESS)
+		return 0;
+
+	if (be64_to_cpu(out_flags) & OPAL_HMI_FLAGS_NEW_EVENT)
+		local_paca->hmi_event_available = 1;
+	if (be64_to_cpu(out_flags) & OPAL_HMI_FLAGS_TOD_TB_FAIL)
+		tb_invalid = true;
+	return 1;
+}
+
+/* HMI exception handler called in virtual mode when irqs are next enabled. */
+int opal_handle_hmi_exception(struct pt_regs *regs)
+{
+	/*
+	 * Check if HMI event is available.
+	 * if Yes, then wake kopald to process them.
+	 */
+	if (!local_paca->hmi_event_available)
+		return 0;
+
+	local_paca->hmi_event_available = 0;
+	opal_wake_poller();
+
+	return 1;
+}
+
+static uint64_t find_recovery_address(uint64_t nip)
+{
+	int i;
+
+	for (i = 0; i < mc_recoverable_range_len; i++)
+		if ((nip >= mc_recoverable_range[i].start_addr) &&
+		    (nip < mc_recoverable_range[i].end_addr))
+		    return mc_recoverable_range[i].recover_addr;
+	return 0;
+}
+
+bool opal_mce_check_early_recovery(struct pt_regs *regs)
+{
+	uint64_t recover_addr = 0;
+
+	if (!opal.base || !opal.size)
+		goto out;
+
+	if ((regs->nip >= opal.base) &&
+			(regs->nip < (opal.base + opal.size)))
+		recover_addr = find_recovery_address(regs->nip);
+
+	/*
+	 * Setup regs->nip to rfi into fixup address.
+	 */
+	if (recover_addr)
+		regs->nip = recover_addr;
+
+out:
+	return !!recover_addr;
+}
+
+static int opal_sysfs_init(void)
+{
+	opal_kobj = kobject_create_and_add("opal", firmware_kobj);
+	if (!opal_kobj) {
+		pr_warn("kobject_create_and_add opal failed\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static ssize_t export_attr_read(struct file *fp, struct kobject *kobj,
+				struct bin_attribute *bin_attr, char *buf,
+				loff_t off, size_t count)
+{
+	return memory_read_from_buffer(buf, count, &off, bin_attr->private,
+				       bin_attr->size);
+}
+
+static int opal_add_one_export(struct kobject *parent, const char *export_name,
+			       struct device_node *np, const char *prop_name)
+{
+	struct bin_attribute *attr = NULL;
+	const char *name = NULL;
+	u64 vals[2];
+	int rc;
+
+	rc = of_property_read_u64_array(np, prop_name, &vals[0], 2);
+	if (rc)
+		goto out;
+
+	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+	if (!attr) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	name = kstrdup(export_name, GFP_KERNEL);
+	if (!name) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	sysfs_bin_attr_init(attr);
+	attr->attr.name = name;
+	attr->attr.mode = 0400;
+	attr->read = export_attr_read;
+	attr->private = __va(vals[0]);
+	attr->size = vals[1];
+
+	rc = sysfs_create_bin_file(parent, attr);
+out:
+	if (rc) {
+		kfree(name);
+		kfree(attr);
+	}
+
+	return rc;
+}
+
+static void opal_add_exported_attrs(struct device_node *np,
+				    struct kobject *kobj)
+{
+	struct device_node *child;
+	struct property *prop;
+
+	for_each_property_of_node(np, prop) {
+		int rc;
+
+		if (!strcmp(prop->name, "name") ||
+		    !strcmp(prop->name, "phandle"))
+			continue;
+
+		rc = opal_add_one_export(kobj, prop->name, np, prop->name);
+		if (rc) {
+			pr_warn("Unable to add export %pOF/%s, rc = %d!\n",
+				np, prop->name, rc);
+		}
+	}
+
+	for_each_child_of_node(np, child) {
+		struct kobject *child_kobj;
+
+		child_kobj = kobject_create_and_add(child->name, kobj);
+		if (!child_kobj) {
+			pr_err("Unable to create export dir for %pOF\n", child);
+			continue;
+		}
+
+		opal_add_exported_attrs(child, child_kobj);
+	}
+}
+
+/*
+ * opal_export_attrs: creates a sysfs node for each property listed in
+ * the device-tree under /ibm,opal/firmware/exports/
+ * All new sysfs nodes are created under /opal/exports/.
+ * This allows for reserved memory regions (e.g. HDAT) to be read.
+ * The new sysfs nodes are only readable by root.
+ */
+static void opal_export_attrs(void)
+{
+	struct device_node *np;
+	struct kobject *kobj;
+	int rc;
+
+	np = of_find_node_by_path("/ibm,opal/firmware/exports");
+	if (!np)
+		return;
+
+	/* Create new 'exports' directory - /sys/firmware/opal/exports */
+	kobj = kobject_create_and_add("exports", opal_kobj);
+	if (!kobj) {
+		pr_warn("kobject_create_and_add() of exports failed\n");
+		of_node_put(np);
+		return;
+	}
+
+	opal_add_exported_attrs(np, kobj);
+
+	/*
+	 * NB: symbol_map existed before the generic export interface so it
+	 * lives under the top level opal_kobj.
+	 */
+	rc = opal_add_one_export(opal_kobj, "symbol_map",
+				 np->parent, "symbol-map");
+	if (rc)
+		pr_warn("Error %d creating OPAL symbols file\n", rc);
+
+	of_node_put(np);
+}
+
+static void __init opal_dump_region_init(void)
+{
+	void *addr;
+	uint64_t size;
+	int rc;
+
+	if (!opal_check_token(OPAL_REGISTER_DUMP_REGION))
+		return;
+
+	/* Register kernel log buffer */
+	addr = log_buf_addr_get();
+	if (addr == NULL)
+		return;
+
+	size = log_buf_len_get();
+	if (size == 0)
+		return;
+
+	rc = opal_register_dump_region(OPAL_DUMP_REGION_LOG_BUF,
+				       __pa(addr), size);
+	/* Don't warn if this is just an older OPAL that doesn't
+	 * know about that call
+	 */
+	if (rc && rc != OPAL_UNSUPPORTED)
+		pr_warn("DUMP: Failed to register kernel log buffer. "
+			"rc = %d\n", rc);
+}
+
+static void opal_pdev_init(const char *compatible)
+{
+	struct device_node *np;
+
+	for_each_compatible_node(np, NULL, compatible)
+		of_platform_device_create(np, NULL, NULL);
+}
+
+static void __init opal_imc_init_dev(void)
+{
+	struct device_node *np;
+
+	np = of_find_compatible_node(NULL, NULL, IMC_DTB_COMPAT);
+	if (np)
+		of_platform_device_create(np, NULL, NULL);
+}
+
+static int kopald(void *unused)
+{
+	unsigned long timeout = msecs_to_jiffies(opal_heartbeat) + 1;
+
+	set_freezable();
+	do {
+		try_to_freeze();
+
+		opal_handle_events();
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (opal_have_pending_events())
+			__set_current_state(TASK_RUNNING);
+		else
+			schedule_timeout(timeout);
+
+	} while (!kthread_should_stop());
+
+	return 0;
+}
+
+void opal_wake_poller(void)
+{
+	if (kopald_tsk)
+		wake_up_process(kopald_tsk);
+}
+
+static void opal_init_heartbeat(void)
+{
+	/* Old firwmware, we assume the HVC heartbeat is sufficient */
+	if (of_property_read_u32(opal_node, "ibm,heartbeat-ms",
+				 &opal_heartbeat) != 0)
+		opal_heartbeat = 0;
+
+	if (opal_heartbeat)
+		kopald_tsk = kthread_run(kopald, NULL, "kopald");
+}
+
+static int __init opal_init(void)
+{
+	struct device_node *np, *consoles, *leds;
+	int rc;
+
+	opal_node = of_find_node_by_path("/ibm,opal");
+	if (!opal_node) {
+		pr_warn("Device node not found\n");
+		return -ENODEV;
+	}
+
+	/* Register OPAL consoles if any ports */
+	consoles = of_find_node_by_path("/ibm,opal/consoles");
+	if (consoles) {
+		for_each_child_of_node(consoles, np) {
+			if (!of_node_name_eq(np, "serial"))
+				continue;
+			of_platform_device_create(np, NULL, NULL);
+		}
+		of_node_put(consoles);
+	}
+
+	/* Initialise OPAL messaging system */
+	opal_message_init(opal_node);
+
+	/* Initialise OPAL asynchronous completion interface */
+	opal_async_comp_init();
+
+	/* Initialise OPAL sensor interface */
+	opal_sensor_init();
+
+	/* Initialise OPAL hypervisor maintainence interrupt handling */
+	opal_hmi_handler_init();
+
+	/* Create i2c platform devices */
+	opal_pdev_init("ibm,opal-i2c");
+
+	/* Handle non-volatile memory devices */
+	opal_pdev_init("pmem-region");
+
+	/* Setup a heatbeat thread if requested by OPAL */
+	opal_init_heartbeat();
+
+	/* Detect In-Memory Collection counters and create devices*/
+	opal_imc_init_dev();
+
+	/* Create leds platform devices */
+	leds = of_find_node_by_path("/ibm,opal/leds");
+	if (leds) {
+		of_platform_device_create(leds, "opal_leds", NULL);
+		of_node_put(leds);
+	}
+
+	/* Initialise OPAL message log interface */
+	opal_msglog_init();
+
+	/* Create "opal" kobject under /sys/firmware */
+	rc = opal_sysfs_init();
+	if (rc == 0) {
+		/* Setup dump region interface */
+		opal_dump_region_init();
+		/* Setup error log interface */
+		rc = opal_elog_init();
+		/* Setup code update interface */
+		opal_flash_update_init();
+		/* Setup platform dump extract interface */
+		opal_platform_dump_init();
+		/* Setup system parameters interface */
+		opal_sys_param_init();
+		/* Setup message log sysfs interface. */
+		opal_msglog_sysfs_init();
+		/* Add all export properties*/
+		opal_export_attrs();
+	}
+
+	/* Initialize platform devices: IPMI backend, PRD & flash interface */
+	opal_pdev_init("ibm,opal-ipmi");
+	opal_pdev_init("ibm,opal-flash");
+	opal_pdev_init("ibm,opal-prd");
+
+	/* Initialise platform device: oppanel interface */
+	opal_pdev_init("ibm,opal-oppanel");
+
+	/* Initialise OPAL kmsg dumper for flushing console on panic */
+	opal_kmsg_init();
+
+	/* Initialise OPAL powercap interface */
+	opal_powercap_init();
+
+	/* Initialise OPAL Power-Shifting-Ratio interface */
+	opal_psr_init();
+
+	/* Initialise OPAL sensor groups */
+	opal_sensor_groups_init();
+
+	/* Initialise OPAL Power control interface */
+	opal_power_control_init();
+
+	/* Initialize OPAL secure variables */
+	opal_pdev_init("ibm,secvar-backend");
+
+	return 0;
+}
+machine_subsys_initcall(powernv, opal_init);
+
+void opal_shutdown(void)
+{
+	long rc = OPAL_BUSY;
+
+	opal_event_shutdown();
+
+	/*
+	 * Then sync with OPAL which ensure anything that can
+	 * potentially write to our memory has completed such
+	 * as an ongoing dump retrieval
+	 */
+	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+		rc = opal_sync_host_reboot();
+		if (rc == OPAL_BUSY)
+			opal_poll_events(NULL);
+		else
+			mdelay(10);
+	}
+
+	/* Unregister memory dump region */
+	if (opal_check_token(OPAL_UNREGISTER_DUMP_REGION))
+		opal_unregister_dump_region(OPAL_DUMP_REGION_LOG_BUF);
+}
+
+/* Export this so that test modules can use it */
+EXPORT_SYMBOL_GPL(opal_invalid_call);
+EXPORT_SYMBOL_GPL(opal_xscom_read);
+EXPORT_SYMBOL_GPL(opal_xscom_write);
+EXPORT_SYMBOL_GPL(opal_ipmi_send);
+EXPORT_SYMBOL_GPL(opal_ipmi_recv);
+EXPORT_SYMBOL_GPL(opal_flash_read);
+EXPORT_SYMBOL_GPL(opal_flash_write);
+EXPORT_SYMBOL_GPL(opal_flash_erase);
+EXPORT_SYMBOL_GPL(opal_prd_msg);
+EXPORT_SYMBOL_GPL(opal_check_token);
+
+/* Convert a region of vmalloc memory to an opal sg list */
+struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr,
+					     unsigned long vmalloc_size)
+{
+	struct opal_sg_list *sg, *first = NULL;
+	unsigned long i = 0;
+
+	sg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!sg)
+		goto nomem;
+
+	first = sg;
+
+	while (vmalloc_size > 0) {
+		uint64_t data = vmalloc_to_pfn(vmalloc_addr) << PAGE_SHIFT;
+		uint64_t length = min(vmalloc_size, PAGE_SIZE);
+
+		sg->entry[i].data = cpu_to_be64(data);
+		sg->entry[i].length = cpu_to_be64(length);
+		i++;
+
+		if (i >= SG_ENTRIES_PER_NODE) {
+			struct opal_sg_list *next;
+
+			next = kzalloc(PAGE_SIZE, GFP_KERNEL);
+			if (!next)
+				goto nomem;
+
+			sg->length = cpu_to_be64(
+					i * sizeof(struct opal_sg_entry) + 16);
+			i = 0;
+			sg->next = cpu_to_be64(__pa(next));
+			sg = next;
+		}
+
+		vmalloc_addr += length;
+		vmalloc_size -= length;
+	}
+
+	sg->length = cpu_to_be64(i * sizeof(struct opal_sg_entry) + 16);
+
+	return first;
+
+nomem:
+	pr_err("%s : Failed to allocate memory\n", __func__);
+	opal_free_sg_list(first);
+	return NULL;
+}
+
+void opal_free_sg_list(struct opal_sg_list *sg)
+{
+	while (sg) {
+		uint64_t next = be64_to_cpu(sg->next);
+
+		kfree(sg);
+
+		if (next)
+			sg = __va(next);
+		else
+			sg = NULL;
+	}
+}
+
+int opal_error_code(int rc)
+{
+	switch (rc) {
+	case OPAL_SUCCESS:		return 0;
+
+	case OPAL_PARAMETER:		return -EINVAL;
+	case OPAL_ASYNC_COMPLETION:	return -EINPROGRESS;
+	case OPAL_BUSY:
+	case OPAL_BUSY_EVENT:		return -EBUSY;
+	case OPAL_NO_MEM:		return -ENOMEM;
+	case OPAL_PERMISSION:		return -EPERM;
+
+	case OPAL_UNSUPPORTED:		return -EIO;
+	case OPAL_HARDWARE:		return -EIO;
+	case OPAL_INTERNAL_ERROR:	return -EIO;
+	case OPAL_TIMEOUT:		return -ETIMEDOUT;
+	default:
+		pr_err("%s: unexpected OPAL error %d\n", __func__, rc);
+		return -EIO;
+	}
+}
+
+void powernv_set_nmmu_ptcr(unsigned long ptcr)
+{
+	int rc;
+
+	if (firmware_has_feature(FW_FEATURE_OPAL)) {
+		rc = opal_nmmu_set_ptcr(-1UL, ptcr);
+		if (rc != OPAL_SUCCESS && rc != OPAL_UNSUPPORTED)
+			pr_warn("%s: Unable to set nest mmu ptcr\n", __func__);
+	}
+}
+
+EXPORT_SYMBOL_GPL(opal_poll_events);
+EXPORT_SYMBOL_GPL(opal_rtc_read);
+EXPORT_SYMBOL_GPL(opal_rtc_write);
+EXPORT_SYMBOL_GPL(opal_tpo_read);
+EXPORT_SYMBOL_GPL(opal_tpo_write);
+EXPORT_SYMBOL_GPL(opal_i2c_request);
+/* Export these symbols for PowerNV LED class driver */
+EXPORT_SYMBOL_GPL(opal_leds_get_ind);
+EXPORT_SYMBOL_GPL(opal_leds_set_ind);
+/* Export this symbol for PowerNV Operator Panel class driver */
+EXPORT_SYMBOL_GPL(opal_write_oppanel_async);
+/* Export this for KVM */
+EXPORT_SYMBOL_GPL(opal_int_set_mfrr);
+EXPORT_SYMBOL_GPL(opal_int_eoi);
+EXPORT_SYMBOL_GPL(opal_error_code);
+/* Export the below symbol for NX compression */
+EXPORT_SYMBOL(opal_nx_coproc_init);
diff --git a/arch/powerpc/platforms/powernv/pci-cxl.c b/arch/powerpc/platforms/powernv/pci-cxl.c
new file mode 100644
index 000000000..8c739c94e
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-cxl.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2014-2016 IBM Corp.
+ */
+
+#include <linux/module.h>
+#include <asm/pnv-pci.h>
+#include <asm/opal.h>
+
+#include "pci.h"
+
+int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pnv_ioda_pe *pe;
+	int rc;
+
+	pe = pnv_ioda_get_pe(dev);
+	if (!pe)
+		return -ENODEV;
+
+	pe_info(pe, "Switching PHB to CXL\n");
+
+	rc = opal_pci_set_phb_cxl_mode(phb->opal_id, mode, pe->pe_number);
+	if (rc == OPAL_UNSUPPORTED)
+		dev_err(&dev->dev, "Required cxl mode not supported by firmware - update skiboot\n");
+	else if (rc)
+		dev_err(&dev->dev, "opal_pci_set_phb_cxl_mode failed: %i\n", rc);
+
+	return rc;
+}
+EXPORT_SYMBOL(pnv_phb_to_cxl_mode);
+
+/* Find PHB for cxl dev and allocate MSI hwirqs?
+ * Returns the absolute hardware IRQ number
+ */
+int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	int hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, num);
+
+	if (hwirq < 0) {
+		dev_warn(&dev->dev, "Failed to find a free MSI\n");
+		return -ENOSPC;
+	}
+
+	return phb->msi_base + hwirq;
+}
+EXPORT_SYMBOL(pnv_cxl_alloc_hwirqs);
+
+void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+
+	msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, num);
+}
+EXPORT_SYMBOL(pnv_cxl_release_hwirqs);
+
+void pnv_cxl_release_hwirq_ranges(struct cxl_irq_ranges *irqs,
+				  struct pci_dev *dev)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	int i, hwirq;
+
+	for (i = 1; i < CXL_IRQ_RANGES; i++) {
+		if (!irqs->range[i])
+			continue;
+		pr_devel("cxl release irq range 0x%x: offset: 0x%lx  limit: %ld\n",
+			 i, irqs->offset[i],
+			 irqs->range[i]);
+		hwirq = irqs->offset[i] - phb->msi_base;
+		msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq,
+				       irqs->range[i]);
+	}
+}
+EXPORT_SYMBOL(pnv_cxl_release_hwirq_ranges);
+
+int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs,
+			       struct pci_dev *dev, int num)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	int i, hwirq, try;
+
+	memset(irqs, 0, sizeof(struct cxl_irq_ranges));
+
+	/* 0 is reserved for the multiplexed PSL DSI interrupt */
+	for (i = 1; i < CXL_IRQ_RANGES && num; i++) {
+		try = num;
+		while (try) {
+			hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, try);
+			if (hwirq >= 0)
+				break;
+			try /= 2;
+		}
+		if (!try)
+			goto fail;
+
+		irqs->offset[i] = phb->msi_base + hwirq;
+		irqs->range[i] = try;
+		pr_devel("cxl alloc irq range 0x%x: offset: 0x%lx  limit: %li\n",
+			 i, irqs->offset[i], irqs->range[i]);
+		num -= try;
+	}
+	if (num)
+		goto fail;
+
+	return 0;
+fail:
+	pnv_cxl_release_hwirq_ranges(irqs, dev);
+	return -ENOSPC;
+}
+EXPORT_SYMBOL(pnv_cxl_alloc_hwirq_ranges);
+
+int pnv_cxl_get_irq_count(struct pci_dev *dev)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+
+	return phb->msi_bmp.irq_count;
+}
+EXPORT_SYMBOL(pnv_cxl_get_irq_count);
+
+int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
+			   unsigned int virq)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	unsigned int xive_num = hwirq - phb->msi_base;
+	struct pnv_ioda_pe *pe;
+	int rc;
+
+	if (!(pe = pnv_ioda_get_pe(dev)))
+		return -ENODEV;
+
+	/* Assign XIVE to PE */
+	rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
+	if (rc) {
+		pe_warn(pe, "%s: OPAL error %d setting msi_base 0x%x "
+			"hwirq 0x%x XIVE 0x%x PE\n",
+			pci_name(dev), rc, phb->msi_base, hwirq, xive_num);
+		return -EIO;
+	}
+	pnv_set_msi_irq_chip(phb, virq);
+
+	return 0;
+}
+EXPORT_SYMBOL(pnv_cxl_ioda_msi_setup);
+
+#if IS_MODULE(CONFIG_CXL)
+static inline int get_cxl_module(void)
+{
+	struct module *cxl_module;
+
+	mutex_lock(&module_mutex);
+
+	cxl_module = find_module("cxl");
+	if (cxl_module)
+		__module_get(cxl_module);
+
+	mutex_unlock(&module_mutex);
+
+	if (!cxl_module)
+		return -ENODEV;
+
+	return 0;
+}
+#else
+static inline int get_cxl_module(void) { return 0; }
+#endif
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
new file mode 100644
index 000000000..5218f5da2
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -0,0 +1,427 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * TCE helpers for IODA PCI/PCIe on PowerNV platforms
+ *
+ * Copyright 2018 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/iommu.h>
+
+#include <asm/iommu.h>
+#include <asm/tce.h>
+#include "pci.h"
+
+unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
+{
+	struct pci_controller *hose = phb->hose;
+	struct device_node *dn = hose->dn;
+	unsigned long mask = 0;
+	int i, rc, count;
+	u32 val;
+
+	count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes");
+	if (count <= 0) {
+		mask = SZ_4K | SZ_64K;
+		/* Add 16M for POWER8 by default */
+		if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+				!cpu_has_feature(CPU_FTR_ARCH_300))
+			mask |= SZ_16M | SZ_256M;
+		return mask;
+	}
+
+	for (i = 0; i < count; i++) {
+		rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes",
+						i, &val);
+		if (rc == 0)
+			mask |= 1ULL << val;
+	}
+
+	return mask;
+}
+
+void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
+		void *tce_mem, u64 tce_size,
+		u64 dma_offset, unsigned int page_shift)
+{
+	tbl->it_blocksize = 16;
+	tbl->it_base = (unsigned long)tce_mem;
+	tbl->it_page_shift = page_shift;
+	tbl->it_offset = dma_offset >> tbl->it_page_shift;
+	tbl->it_index = 0;
+	tbl->it_size = tce_size >> 3;
+	tbl->it_busno = 0;
+	tbl->it_type = TCE_PCI;
+}
+
+static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
+{
+	struct page *tce_mem = NULL;
+	__be64 *addr;
+
+	tce_mem = alloc_pages_node(nid, GFP_ATOMIC | __GFP_NOWARN,
+			shift - PAGE_SHIFT);
+	if (!tce_mem) {
+		pr_err("Failed to allocate a TCE memory, level shift=%d\n",
+				shift);
+		return NULL;
+	}
+	addr = page_address(tce_mem);
+	memset(addr, 0, 1UL << shift);
+
+	return addr;
+}
+
+static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
+		unsigned long size, unsigned int levels);
+
+static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
+{
+	__be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base;
+	int  level = tbl->it_indirect_levels;
+	const long shift = ilog2(tbl->it_level_size);
+	unsigned long mask = (tbl->it_level_size - 1) << (level * shift);
+
+	while (level) {
+		int n = (idx & mask) >> (level * shift);
+		unsigned long oldtce, tce = be64_to_cpu(READ_ONCE(tmp[n]));
+
+		if (!tce) {
+			__be64 *tmp2;
+
+			if (!alloc)
+				return NULL;
+
+			tmp2 = pnv_alloc_tce_level(tbl->it_nid,
+					ilog2(tbl->it_level_size) + 3);
+			if (!tmp2)
+				return NULL;
+
+			tce = __pa(tmp2) | TCE_PCI_READ | TCE_PCI_WRITE;
+			oldtce = be64_to_cpu(cmpxchg(&tmp[n], 0,
+					cpu_to_be64(tce)));
+			if (oldtce) {
+				pnv_pci_ioda2_table_do_free_pages(tmp2,
+					ilog2(tbl->it_level_size) + 3, 1);
+				tce = oldtce;
+			}
+		}
+
+		tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
+		idx &= ~mask;
+		mask >>= shift;
+		--level;
+	}
+
+	return tmp + idx;
+}
+
+int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
+		unsigned long uaddr, enum dma_data_direction direction,
+		unsigned long attrs)
+{
+	u64 proto_tce = iommu_direction_to_tce_perm(direction);
+	u64 rpn = __pa(uaddr) >> tbl->it_page_shift;
+	long i;
+
+	if (proto_tce & TCE_PCI_WRITE)
+		proto_tce |= TCE_PCI_READ;
+
+	for (i = 0; i < npages; i++) {
+		unsigned long newtce = proto_tce |
+			((rpn + i) << tbl->it_page_shift);
+		unsigned long idx = index - tbl->it_offset + i;
+
+		*(pnv_tce(tbl, false, idx, true)) = cpu_to_be64(newtce);
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_IOMMU_API
+int pnv_tce_xchg(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction,
+		bool alloc)
+{
+	u64 proto_tce = iommu_direction_to_tce_perm(*direction);
+	unsigned long newtce = *hpa | proto_tce, oldtce;
+	unsigned long idx = index - tbl->it_offset;
+	__be64 *ptce = NULL;
+
+	BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl));
+
+	if (*direction == DMA_NONE) {
+		ptce = pnv_tce(tbl, false, idx, false);
+		if (!ptce) {
+			*hpa = 0;
+			return 0;
+		}
+	}
+
+	if (!ptce) {
+		ptce = pnv_tce(tbl, false, idx, alloc);
+		if (!ptce)
+			return -ENOMEM;
+	}
+
+	if (newtce & TCE_PCI_WRITE)
+		newtce |= TCE_PCI_READ;
+
+	oldtce = be64_to_cpu(xchg(ptce, cpu_to_be64(newtce)));
+	*hpa = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
+	*direction = iommu_tce_direction(oldtce);
+
+	return 0;
+}
+
+__be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index, bool alloc)
+{
+	if (WARN_ON_ONCE(!tbl->it_userspace))
+		return NULL;
+
+	return pnv_tce(tbl, true, index - tbl->it_offset, alloc);
+}
+#endif
+
+void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
+{
+	long i;
+
+	for (i = 0; i < npages; i++) {
+		unsigned long idx = index - tbl->it_offset + i;
+		__be64 *ptce = pnv_tce(tbl, false, idx,	false);
+
+		if (ptce)
+			*ptce = cpu_to_be64(0);
+		else
+			/* Skip the rest of the level */
+			i |= tbl->it_level_size - 1;
+	}
+}
+
+unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
+{
+	__be64 *ptce = pnv_tce(tbl, false, index - tbl->it_offset, false);
+
+	if (!ptce)
+		return 0;
+
+	return be64_to_cpu(*ptce);
+}
+
+static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
+		unsigned long size, unsigned int levels)
+{
+	const unsigned long addr_ul = (unsigned long) addr &
+			~(TCE_PCI_READ | TCE_PCI_WRITE);
+
+	if (levels) {
+		long i;
+		u64 *tmp = (u64 *) addr_ul;
+
+		for (i = 0; i < size; ++i) {
+			unsigned long hpa = be64_to_cpu(tmp[i]);
+
+			if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE)))
+				continue;
+
+			pnv_pci_ioda2_table_do_free_pages(__va(hpa), size,
+					levels - 1);
+		}
+	}
+
+	free_pages(addr_ul, get_order(size << 3));
+}
+
+void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
+{
+	const unsigned long size = tbl->it_indirect_levels ?
+			tbl->it_level_size : tbl->it_size;
+
+	if (!tbl->it_size)
+		return;
+
+	pnv_pci_ioda2_table_do_free_pages((__be64 *)tbl->it_base, size,
+			tbl->it_indirect_levels);
+	if (tbl->it_userspace) {
+		pnv_pci_ioda2_table_do_free_pages(tbl->it_userspace, size,
+				tbl->it_indirect_levels);
+	}
+}
+
+static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned int shift,
+		unsigned int levels, unsigned long limit,
+		unsigned long *current_offset, unsigned long *total_allocated)
+{
+	__be64 *addr, *tmp;
+	unsigned long allocated = 1UL << shift;
+	unsigned int entries = 1UL << (shift - 3);
+	long i;
+
+	addr = pnv_alloc_tce_level(nid, shift);
+	*total_allocated += allocated;
+
+	--levels;
+	if (!levels) {
+		*current_offset += allocated;
+		return addr;
+	}
+
+	for (i = 0; i < entries; ++i) {
+		tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift,
+				levels, limit, current_offset, total_allocated);
+		if (!tmp)
+			break;
+
+		addr[i] = cpu_to_be64(__pa(tmp) |
+				TCE_PCI_READ | TCE_PCI_WRITE);
+
+		if (*current_offset >= limit)
+			break;
+	}
+
+	return addr;
+}
+
+long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
+		__u32 page_shift, __u64 window_size, __u32 levels,
+		bool alloc_userspace_copy, struct iommu_table *tbl)
+{
+	void *addr, *uas = NULL;
+	unsigned long offset = 0, level_shift, total_allocated = 0;
+	unsigned long total_allocated_uas = 0;
+	const unsigned int window_shift = ilog2(window_size);
+	unsigned int entries_shift = window_shift - page_shift;
+	unsigned int table_shift = max_t(unsigned int, entries_shift + 3,
+			PAGE_SHIFT);
+	const unsigned long tce_table_size = 1UL << table_shift;
+
+	if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
+		return -EINVAL;
+
+	if (!is_power_of_2(window_size))
+		return -EINVAL;
+
+	/* Adjust direct table size from window_size and levels */
+	entries_shift = (entries_shift + levels - 1) / levels;
+	level_shift = entries_shift + 3;
+	level_shift = max_t(unsigned int, level_shift, PAGE_SHIFT);
+
+	if ((level_shift - 3) * levels + page_shift >= 55)
+		return -EINVAL;
+
+	/* Allocate TCE table */
+	addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
+			1, tce_table_size, &offset, &total_allocated);
+
+	/* addr==NULL means that the first level allocation failed */
+	if (!addr)
+		return -ENOMEM;
+
+	/*
+	 * First level was allocated but some lower level failed as
+	 * we did not allocate as much as we wanted,
+	 * release partially allocated table.
+	 */
+	if (levels == 1 && offset < tce_table_size)
+		goto free_tces_exit;
+
+	/* Allocate userspace view of the TCE table */
+	if (alloc_userspace_copy) {
+		offset = 0;
+		uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
+				1, tce_table_size, &offset,
+				&total_allocated_uas);
+		if (!uas)
+			goto free_tces_exit;
+		if (levels == 1 && (offset < tce_table_size ||
+				total_allocated_uas != total_allocated))
+			goto free_uas_exit;
+	}
+
+	/* Setup linux iommu table */
+	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset,
+			page_shift);
+	tbl->it_level_size = 1ULL << (level_shift - 3);
+	tbl->it_indirect_levels = levels - 1;
+	tbl->it_userspace = uas;
+	tbl->it_nid = nid;
+
+	pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n",
+			window_size, tce_table_size, bus_offset, tbl->it_base,
+			tbl->it_userspace, 1, levels);
+
+	return 0;
+
+free_uas_exit:
+	pnv_pci_ioda2_table_do_free_pages(uas,
+			1ULL << (level_shift - 3), levels - 1);
+free_tces_exit:
+	pnv_pci_ioda2_table_do_free_pages(addr,
+			1ULL << (level_shift - 3), levels - 1);
+
+	return -ENOMEM;
+}
+
+void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
+		struct iommu_table_group *table_group)
+{
+	long i;
+	bool found;
+	struct iommu_table_group_link *tgl;
+
+	if (!tbl || !table_group)
+		return;
+
+	/* Remove link to a group from table's list of attached groups */
+	found = false;
+	list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
+		if (tgl->table_group == table_group) {
+			list_del_rcu(&tgl->next);
+			kfree_rcu(tgl, rcu);
+			found = true;
+			break;
+		}
+	}
+	if (WARN_ON(!found))
+		return;
+
+	/* Clean a pointer to iommu_table in iommu_table_group::tables[] */
+	found = false;
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		if (table_group->tables[i] == tbl) {
+			iommu_tce_table_put(tbl);
+			table_group->tables[i] = NULL;
+			found = true;
+			break;
+		}
+	}
+	WARN_ON(!found);
+}
+
+long pnv_pci_link_table_and_group(int node, int num,
+		struct iommu_table *tbl,
+		struct iommu_table_group *table_group)
+{
+	struct iommu_table_group_link *tgl = NULL;
+
+	if (WARN_ON(!tbl || !table_group))
+		return -EINVAL;
+
+	tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL,
+			node);
+	if (!tgl)
+		return -ENOMEM;
+
+	tgl->table_group = table_group;
+	list_add_rcu(&tgl->next, &tbl->it_group_list);
+
+	table_group->tables[num] = iommu_tce_table_get(tbl);
+
+	return 0;
+}
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
new file mode 100644
index 000000000..a1e6dd477
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -0,0 +1,3231 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Support PCI/PCIe on PowerNV platforms
+ *
+ * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/crash_dump.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/irq.h>
+#include <linux/io.h>
+#include <linux/msi.h>
+#include <linux/iommu.h>
+#include <linux/rculist.h>
+#include <linux/sizes.h>
+
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/msi_bitmap.h>
+#include <asm/ppc-pci.h>
+#include <asm/opal.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+#include <asm/xics.h>
+#include <asm/debugfs.h>
+#include <asm/firmware.h>
+#include <asm/pnv-pci.h>
+#include <asm/mmzone.h>
+
+#include <misc/cxl-base.h>
+
+#include "powernv.h"
+#include "pci.h"
+#include "../../../../drivers/pci/pci.h"
+
+#define PNV_IODA1_M64_NUM	16	/* Number of M64 BARs	*/
+#define PNV_IODA1_M64_SEGS	8	/* Segments per M64 BAR	*/
+#define PNV_IODA1_DMA32_SEGSIZE	0x10000000
+
+static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK",
+					      "NPU_OCAPI" };
+
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+static void pnv_pci_configure_bus(struct pci_bus *bus);
+
+void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
+			    const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+	char pfix[32];
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	if (pe->flags & PNV_IODA_PE_DEV)
+		strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
+	else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
+		sprintf(pfix, "%04x:%02x     ",
+			pci_domain_nr(pe->pbus), pe->pbus->number);
+#ifdef CONFIG_PCI_IOV
+	else if (pe->flags & PNV_IODA_PE_VF)
+		sprintf(pfix, "%04x:%02x:%2x.%d",
+			pci_domain_nr(pe->parent_dev->bus),
+			(pe->rid & 0xff00) >> 8,
+			PCI_SLOT(pe->rid), PCI_FUNC(pe->rid));
+#endif /* CONFIG_PCI_IOV*/
+
+	printk("%spci %s: [PE# %.2x] %pV",
+	       level, pfix, pe->pe_number, &vaf);
+
+	va_end(args);
+}
+
+static bool pnv_iommu_bypass_disabled __read_mostly;
+static bool pci_reset_phbs __read_mostly;
+
+static int __init iommu_setup(char *str)
+{
+	if (!str)
+		return -EINVAL;
+
+	while (*str) {
+		if (!strncmp(str, "nobypass", 8)) {
+			pnv_iommu_bypass_disabled = true;
+			pr_info("PowerNV: IOMMU bypass window disabled.\n");
+			break;
+		}
+		str += strcspn(str, ",");
+		if (*str == ',')
+			str++;
+	}
+
+	return 0;
+}
+early_param("iommu", iommu_setup);
+
+static int __init pci_reset_phbs_setup(char *str)
+{
+	pci_reset_phbs = true;
+	return 0;
+}
+
+early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup);
+
+static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
+{
+	s64 rc;
+
+	phb->ioda.pe_array[pe_no].phb = phb;
+	phb->ioda.pe_array[pe_no].pe_number = pe_no;
+	phb->ioda.pe_array[pe_no].dma_setup_done = false;
+
+	/*
+	 * Clear the PE frozen state as it might be put into frozen state
+	 * in the last PCI remove path. It's not harmful to do so when the
+	 * PE is already in unfrozen state.
+	 */
+	rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
+				       OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+	if (rc != OPAL_SUCCESS && rc != OPAL_UNSUPPORTED)
+		pr_warn("%s: Error %lld unfreezing PHB#%x-PE#%x\n",
+			__func__, rc, phb->hose->global_number, pe_no);
+
+	return &phb->ioda.pe_array[pe_no];
+}
+
+static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
+{
+	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe_num)) {
+		pr_warn("%s: Invalid PE %x on PHB#%x\n",
+			__func__, pe_no, phb->hose->global_number);
+		return;
+	}
+
+	mutex_lock(&phb->ioda.pe_alloc_mutex);
+	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc))
+		pr_debug("%s: PE %x was reserved on PHB#%x\n",
+			 __func__, pe_no, phb->hose->global_number);
+	mutex_unlock(&phb->ioda.pe_alloc_mutex);
+
+	pnv_ioda_init_pe(phb, pe_no);
+}
+
+struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count)
+{
+	struct pnv_ioda_pe *ret = NULL;
+	int run = 0, pe, i;
+
+	mutex_lock(&phb->ioda.pe_alloc_mutex);
+
+	/* scan backwards for a run of @count cleared bits */
+	for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) {
+		if (test_bit(pe, phb->ioda.pe_alloc)) {
+			run = 0;
+			continue;
+		}
+
+		run++;
+		if (run == count)
+			break;
+	}
+	if (run != count)
+		goto out;
+
+	for (i = pe; i < pe + count; i++) {
+		set_bit(i, phb->ioda.pe_alloc);
+		pnv_ioda_init_pe(phb, i);
+	}
+	ret = &phb->ioda.pe_array[pe];
+
+out:
+	mutex_unlock(&phb->ioda.pe_alloc_mutex);
+	return ret;
+}
+
+void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
+{
+	struct pnv_phb *phb = pe->phb;
+	unsigned int pe_num = pe->pe_number;
+
+	WARN_ON(pe->pdev);
+	WARN_ON(pe->npucomp); /* NPUs for nvlink are not supposed to be freed */
+	kfree(pe->npucomp);
+	memset(pe, 0, sizeof(struct pnv_ioda_pe));
+
+	mutex_lock(&phb->ioda.pe_alloc_mutex);
+	clear_bit(pe_num, phb->ioda.pe_alloc);
+	mutex_unlock(&phb->ioda.pe_alloc_mutex);
+}
+
+/* The default M64 BAR is shared by all PEs */
+static int pnv_ioda2_init_m64(struct pnv_phb *phb)
+{
+	const char *desc;
+	struct resource *r;
+	s64 rc;
+
+	/* Configure the default M64 BAR */
+	rc = opal_pci_set_phb_mem_window(phb->opal_id,
+					 OPAL_M64_WINDOW_TYPE,
+					 phb->ioda.m64_bar_idx,
+					 phb->ioda.m64_base,
+					 0, /* unused */
+					 phb->ioda.m64_size);
+	if (rc != OPAL_SUCCESS) {
+		desc = "configuring";
+		goto fail;
+	}
+
+	/* Enable the default M64 BAR */
+	rc = opal_pci_phb_mmio_enable(phb->opal_id,
+				      OPAL_M64_WINDOW_TYPE,
+				      phb->ioda.m64_bar_idx,
+				      OPAL_ENABLE_M64_SPLIT);
+	if (rc != OPAL_SUCCESS) {
+		desc = "enabling";
+		goto fail;
+	}
+
+	/*
+	 * Exclude the segments for reserved and root bus PE, which
+	 * are first or last two PEs.
+	 */
+	r = &phb->hose->mem_resources[1];
+	if (phb->ioda.reserved_pe_idx == 0)
+		r->start += (2 * phb->ioda.m64_segsize);
+	else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
+		r->end -= (2 * phb->ioda.m64_segsize);
+	else
+		pr_warn("  Cannot strip M64 segment for reserved PE#%x\n",
+			phb->ioda.reserved_pe_idx);
+
+	return 0;
+
+fail:
+	pr_warn("  Failure %lld %s M64 BAR#%d\n",
+		rc, desc, phb->ioda.m64_bar_idx);
+	opal_pci_phb_mmio_enable(phb->opal_id,
+				 OPAL_M64_WINDOW_TYPE,
+				 phb->ioda.m64_bar_idx,
+				 OPAL_DISABLE_M64);
+	return -EIO;
+}
+
+static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
+					 unsigned long *pe_bitmap)
+{
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
+	struct resource *r;
+	resource_size_t base, sgsz, start, end;
+	int segno, i;
+
+	base = phb->ioda.m64_base;
+	sgsz = phb->ioda.m64_segsize;
+	for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
+		r = &pdev->resource[i];
+		if (!r->parent || !pnv_pci_is_m64(phb, r))
+			continue;
+
+		start = ALIGN_DOWN(r->start - base, sgsz);
+		end = ALIGN(r->end - base, sgsz);
+		for (segno = start / sgsz; segno < end / sgsz; segno++) {
+			if (pe_bitmap)
+				set_bit(segno, pe_bitmap);
+			else
+				pnv_ioda_reserve_pe(phb, segno);
+		}
+	}
+}
+
+static int pnv_ioda1_init_m64(struct pnv_phb *phb)
+{
+	struct resource *r;
+	int index;
+
+	/*
+	 * There are 16 M64 BARs, each of which has 8 segments. So
+	 * there are as many M64 segments as the maximum number of
+	 * PEs, which is 128.
+	 */
+	for (index = 0; index < PNV_IODA1_M64_NUM; index++) {
+		unsigned long base, segsz = phb->ioda.m64_segsize;
+		int64_t rc;
+
+		base = phb->ioda.m64_base +
+		       index * PNV_IODA1_M64_SEGS * segsz;
+		rc = opal_pci_set_phb_mem_window(phb->opal_id,
+				OPAL_M64_WINDOW_TYPE, index, base, 0,
+				PNV_IODA1_M64_SEGS * segsz);
+		if (rc != OPAL_SUCCESS) {
+			pr_warn("  Error %lld setting M64 PHB#%x-BAR#%d\n",
+				rc, phb->hose->global_number, index);
+			goto fail;
+		}
+
+		rc = opal_pci_phb_mmio_enable(phb->opal_id,
+				OPAL_M64_WINDOW_TYPE, index,
+				OPAL_ENABLE_M64_SPLIT);
+		if (rc != OPAL_SUCCESS) {
+			pr_warn("  Error %lld enabling M64 PHB#%x-BAR#%d\n",
+				rc, phb->hose->global_number, index);
+			goto fail;
+		}
+	}
+
+	for (index = 0; index < phb->ioda.total_pe_num; index++) {
+		int64_t rc;
+
+		/*
+		 * P7IOC supports M64DT, which helps mapping M64 segment
+		 * to one particular PE#. However, PHB3 has fixed mapping
+		 * between M64 segment and PE#. In order to have same logic
+		 * for P7IOC and PHB3, we enforce fixed mapping between M64
+		 * segment and PE# on P7IOC.
+		 */
+		rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+				index, OPAL_M64_WINDOW_TYPE,
+				index / PNV_IODA1_M64_SEGS,
+				index % PNV_IODA1_M64_SEGS);
+		if (rc != OPAL_SUCCESS) {
+			pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n",
+				__func__, rc, phb->hose->global_number,
+				index);
+			goto fail;
+		}
+	}
+
+	/*
+	 * Exclude the segments for reserved and root bus PE, which
+	 * are first or last two PEs.
+	 */
+	r = &phb->hose->mem_resources[1];
+	if (phb->ioda.reserved_pe_idx == 0)
+		r->start += (2 * phb->ioda.m64_segsize);
+	else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
+		r->end -= (2 * phb->ioda.m64_segsize);
+	else
+		WARN(1, "Wrong reserved PE#%x on PHB#%x\n",
+		     phb->ioda.reserved_pe_idx, phb->hose->global_number);
+
+	return 0;
+
+fail:
+	for ( ; index >= 0; index--)
+		opal_pci_phb_mmio_enable(phb->opal_id,
+			OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64);
+
+	return -EIO;
+}
+
+static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
+				    unsigned long *pe_bitmap,
+				    bool all)
+{
+	struct pci_dev *pdev;
+
+	list_for_each_entry(pdev, &bus->devices, bus_list) {
+		pnv_ioda_reserve_dev_m64_pe(pdev, pe_bitmap);
+
+		if (all && pdev->subordinate)
+			pnv_ioda_reserve_m64_pe(pdev->subordinate,
+						pe_bitmap, all);
+	}
+}
+
+static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
+{
+	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
+	struct pnv_ioda_pe *master_pe, *pe;
+	unsigned long size, *pe_alloc;
+	int i;
+
+	/* Root bus shouldn't use M64 */
+	if (pci_is_root_bus(bus))
+		return NULL;
+
+	/* Allocate bitmap */
+	size = ALIGN(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
+	pe_alloc = kzalloc(size, GFP_KERNEL);
+	if (!pe_alloc) {
+		pr_warn("%s: Out of memory !\n",
+			__func__);
+		return NULL;
+	}
+
+	/* Figure out reserved PE numbers by the PE */
+	pnv_ioda_reserve_m64_pe(bus, pe_alloc, all);
+
+	/*
+	 * the current bus might not own M64 window and that's all
+	 * contributed by its child buses. For the case, we needn't
+	 * pick M64 dependent PE#.
+	 */
+	if (bitmap_empty(pe_alloc, phb->ioda.total_pe_num)) {
+		kfree(pe_alloc);
+		return NULL;
+	}
+
+	/*
+	 * Figure out the master PE and put all slave PEs to master
+	 * PE's list to form compound PE.
+	 */
+	master_pe = NULL;
+	i = -1;
+	while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe_num, i + 1)) <
+		phb->ioda.total_pe_num) {
+		pe = &phb->ioda.pe_array[i];
+
+		phb->ioda.m64_segmap[pe->pe_number] = pe->pe_number;
+		if (!master_pe) {
+			pe->flags |= PNV_IODA_PE_MASTER;
+			INIT_LIST_HEAD(&pe->slaves);
+			master_pe = pe;
+		} else {
+			pe->flags |= PNV_IODA_PE_SLAVE;
+			pe->master = master_pe;
+			list_add_tail(&pe->list, &master_pe->slaves);
+		}
+	}
+
+	kfree(pe_alloc);
+	return master_pe;
+}
+
+static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
+{
+	struct pci_controller *hose = phb->hose;
+	struct device_node *dn = hose->dn;
+	struct resource *res;
+	u32 m64_range[2], i;
+	const __be32 *r;
+	u64 pci_addr;
+
+	if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) {
+		pr_info("  Not support M64 window\n");
+		return;
+	}
+
+	if (!firmware_has_feature(FW_FEATURE_OPAL)) {
+		pr_info("  Firmware too old to support M64 window\n");
+		return;
+	}
+
+	r = of_get_property(dn, "ibm,opal-m64-window", NULL);
+	if (!r) {
+		pr_info("  No <ibm,opal-m64-window> on %pOF\n",
+			dn);
+		return;
+	}
+
+	/*
+	 * Find the available M64 BAR range and pickup the last one for
+	 * covering the whole 64-bits space. We support only one range.
+	 */
+	if (of_property_read_u32_array(dn, "ibm,opal-available-m64-ranges",
+				       m64_range, 2)) {
+		/* In absence of the property, assume 0..15 */
+		m64_range[0] = 0;
+		m64_range[1] = 16;
+	}
+	/* We only support 64 bits in our allocator */
+	if (m64_range[1] > 63) {
+		pr_warn("%s: Limiting M64 range to 63 (from %d) on PHB#%x\n",
+			__func__, m64_range[1], phb->hose->global_number);
+		m64_range[1] = 63;
+	}
+	/* Empty range, no m64 */
+	if (m64_range[1] <= m64_range[0]) {
+		pr_warn("%s: M64 empty, disabling M64 usage on PHB#%x\n",
+			__func__, phb->hose->global_number);
+		return;
+	}
+
+	/* Configure M64 informations */
+	res = &hose->mem_resources[1];
+	res->name = dn->full_name;
+	res->start = of_translate_address(dn, r + 2);
+	res->end = res->start + of_read_number(r + 4, 2) - 1;
+	res->flags = (IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
+	pci_addr = of_read_number(r, 2);
+	hose->mem_offset[1] = res->start - pci_addr;
+
+	phb->ioda.m64_size = resource_size(res);
+	phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe_num;
+	phb->ioda.m64_base = pci_addr;
+
+	/* This lines up nicely with the display from processing OF ranges */
+	pr_info(" MEM 0x%016llx..0x%016llx -> 0x%016llx (M64 #%d..%d)\n",
+		res->start, res->end, pci_addr, m64_range[0],
+		m64_range[0] + m64_range[1] - 1);
+
+	/* Mark all M64 used up by default */
+	phb->ioda.m64_bar_alloc = (unsigned long)-1;
+
+	/* Use last M64 BAR to cover M64 window */
+	m64_range[1]--;
+	phb->ioda.m64_bar_idx = m64_range[0] + m64_range[1];
+
+	pr_info(" Using M64 #%d as default window\n", phb->ioda.m64_bar_idx);
+
+	/* Mark remaining ones free */
+	for (i = m64_range[0]; i < m64_range[1]; i++)
+		clear_bit(i, &phb->ioda.m64_bar_alloc);
+
+	/*
+	 * Setup init functions for M64 based on IODA version, IODA3 uses
+	 * the IODA2 code.
+	 */
+	if (phb->type == PNV_PHB_IODA1)
+		phb->init_m64 = pnv_ioda1_init_m64;
+	else
+		phb->init_m64 = pnv_ioda2_init_m64;
+}
+
+static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
+{
+	struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
+	struct pnv_ioda_pe *slave;
+	s64 rc;
+
+	/* Fetch master PE */
+	if (pe->flags & PNV_IODA_PE_SLAVE) {
+		pe = pe->master;
+		if (WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)))
+			return;
+
+		pe_no = pe->pe_number;
+	}
+
+	/* Freeze master PE */
+	rc = opal_pci_eeh_freeze_set(phb->opal_id,
+				     pe_no,
+				     OPAL_EEH_ACTION_SET_FREEZE_ALL);
+	if (rc != OPAL_SUCCESS) {
+		pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
+			__func__, rc, phb->hose->global_number, pe_no);
+		return;
+	}
+
+	/* Freeze slave PEs */
+	if (!(pe->flags & PNV_IODA_PE_MASTER))
+		return;
+
+	list_for_each_entry(slave, &pe->slaves, list) {
+		rc = opal_pci_eeh_freeze_set(phb->opal_id,
+					     slave->pe_number,
+					     OPAL_EEH_ACTION_SET_FREEZE_ALL);
+		if (rc != OPAL_SUCCESS)
+			pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
+				__func__, rc, phb->hose->global_number,
+				slave->pe_number);
+	}
+}
+
+static int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt)
+{
+	struct pnv_ioda_pe *pe, *slave;
+	s64 rc;
+
+	/* Find master PE */
+	pe = &phb->ioda.pe_array[pe_no];
+	if (pe->flags & PNV_IODA_PE_SLAVE) {
+		pe = pe->master;
+		WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
+		pe_no = pe->pe_number;
+	}
+
+	/* Clear frozen state for master PE */
+	rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, opt);
+	if (rc != OPAL_SUCCESS) {
+		pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
+			__func__, rc, opt, phb->hose->global_number, pe_no);
+		return -EIO;
+	}
+
+	if (!(pe->flags & PNV_IODA_PE_MASTER))
+		return 0;
+
+	/* Clear frozen state for slave PEs */
+	list_for_each_entry(slave, &pe->slaves, list) {
+		rc = opal_pci_eeh_freeze_clear(phb->opal_id,
+					     slave->pe_number,
+					     opt);
+		if (rc != OPAL_SUCCESS) {
+			pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
+				__func__, rc, opt, phb->hose->global_number,
+				slave->pe_number);
+			return -EIO;
+		}
+	}
+
+	return 0;
+}
+
+static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
+{
+	struct pnv_ioda_pe *slave, *pe;
+	u8 fstate = 0, state;
+	__be16 pcierr = 0;
+	s64 rc;
+
+	/* Sanity check on PE number */
+	if (pe_no < 0 || pe_no >= phb->ioda.total_pe_num)
+		return OPAL_EEH_STOPPED_PERM_UNAVAIL;
+
+	/*
+	 * Fetch the master PE and the PE instance might be
+	 * not initialized yet.
+	 */
+	pe = &phb->ioda.pe_array[pe_no];
+	if (pe->flags & PNV_IODA_PE_SLAVE) {
+		pe = pe->master;
+		WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
+		pe_no = pe->pe_number;
+	}
+
+	/* Check the master PE */
+	rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
+					&state, &pcierr, NULL);
+	if (rc != OPAL_SUCCESS) {
+		pr_warn("%s: Failure %lld getting "
+			"PHB#%x-PE#%x state\n",
+			__func__, rc,
+			phb->hose->global_number, pe_no);
+		return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
+	}
+
+	/* Check the slave PE */
+	if (!(pe->flags & PNV_IODA_PE_MASTER))
+		return state;
+
+	list_for_each_entry(slave, &pe->slaves, list) {
+		rc = opal_pci_eeh_freeze_status(phb->opal_id,
+						slave->pe_number,
+						&fstate,
+						&pcierr,
+						NULL);
+		if (rc != OPAL_SUCCESS) {
+			pr_warn("%s: Failure %lld getting "
+				"PHB#%x-PE#%x state\n",
+				__func__, rc,
+				phb->hose->global_number, slave->pe_number);
+			return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
+		}
+
+		/*
+		 * Override the result based on the ascending
+		 * priority.
+		 */
+		if (fstate > state)
+			state = fstate;
+	}
+
+	return state;
+}
+
+struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn)
+{
+	int pe_number = phb->ioda.pe_rmap[bdfn];
+
+	if (pe_number == IODA_INVALID_PE)
+		return NULL;
+
+	return &phb->ioda.pe_array[pe_number];
+}
+
+struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
+{
+	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
+	struct pci_dn *pdn = pci_get_pdn(dev);
+
+	if (!pdn)
+		return NULL;
+	if (pdn->pe_number == IODA_INVALID_PE)
+		return NULL;
+	return &phb->ioda.pe_array[pdn->pe_number];
+}
+
+static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
+				  struct pnv_ioda_pe *parent,
+				  struct pnv_ioda_pe *child,
+				  bool is_add)
+{
+	const char *desc = is_add ? "adding" : "removing";
+	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
+			      OPAL_REMOVE_PE_FROM_DOMAIN;
+	struct pnv_ioda_pe *slave;
+	long rc;
+
+	/* Parent PE affects child PE */
+	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
+				child->pe_number, op);
+	if (rc != OPAL_SUCCESS) {
+		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
+			rc, desc);
+		return -ENXIO;
+	}
+
+	if (!(child->flags & PNV_IODA_PE_MASTER))
+		return 0;
+
+	/* Compound case: parent PE affects slave PEs */
+	list_for_each_entry(slave, &child->slaves, list) {
+		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
+					slave->pe_number, op);
+		if (rc != OPAL_SUCCESS) {
+			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
+				rc, desc);
+			return -ENXIO;
+		}
+	}
+
+	return 0;
+}
+
+static int pnv_ioda_set_peltv(struct pnv_phb *phb,
+			      struct pnv_ioda_pe *pe,
+			      bool is_add)
+{
+	struct pnv_ioda_pe *slave;
+	struct pci_dev *pdev = NULL;
+	int ret;
+
+	/*
+	 * Clear PE frozen state. If it's master PE, we need
+	 * clear slave PE frozen state as well.
+	 */
+	if (is_add) {
+		opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
+					  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+		if (pe->flags & PNV_IODA_PE_MASTER) {
+			list_for_each_entry(slave, &pe->slaves, list)
+				opal_pci_eeh_freeze_clear(phb->opal_id,
+							  slave->pe_number,
+							  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+		}
+	}
+
+	/*
+	 * Associate PE in PELT. We need add the PE into the
+	 * corresponding PELT-V as well. Otherwise, the error
+	 * originated from the PE might contribute to other
+	 * PEs.
+	 */
+	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
+	if (ret)
+		return ret;
+
+	/* For compound PEs, any one affects all of them */
+	if (pe->flags & PNV_IODA_PE_MASTER) {
+		list_for_each_entry(slave, &pe->slaves, list) {
+			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
+			if (ret)
+				return ret;
+		}
+	}
+
+	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
+		pdev = pe->pbus->self;
+	else if (pe->flags & PNV_IODA_PE_DEV)
+		pdev = pe->pdev->bus->self;
+#ifdef CONFIG_PCI_IOV
+	else if (pe->flags & PNV_IODA_PE_VF)
+		pdev = pe->parent_dev;
+#endif /* CONFIG_PCI_IOV */
+	while (pdev) {
+		struct pci_dn *pdn = pci_get_pdn(pdev);
+		struct pnv_ioda_pe *parent;
+
+		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
+			parent = &phb->ioda.pe_array[pdn->pe_number];
+			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
+			if (ret)
+				return ret;
+		}
+
+		pdev = pdev->bus->self;
+	}
+
+	return 0;
+}
+
+static void pnv_ioda_unset_peltv(struct pnv_phb *phb,
+				 struct pnv_ioda_pe *pe,
+				 struct pci_dev *parent)
+{
+	int64_t rc;
+
+	while (parent) {
+		struct pci_dn *pdn = pci_get_pdn(parent);
+
+		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
+			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
+						pe->pe_number,
+						OPAL_REMOVE_PE_FROM_DOMAIN);
+			/* XXX What to do in case of error ? */
+		}
+		parent = parent->bus->self;
+	}
+
+	opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
+				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+
+	/* Disassociate PE in PELT */
+	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
+				pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
+	if (rc)
+		pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc);
+}
+
+int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
+{
+	struct pci_dev *parent;
+	uint8_t bcomp, dcomp, fcomp;
+	int64_t rc;
+	long rid_end, rid;
+
+	/* Currently, we just deconfigure VF PE. Bus PE will always there.*/
+	if (pe->pbus) {
+		int count;
+
+		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
+		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
+		parent = pe->pbus->self;
+		if (pe->flags & PNV_IODA_PE_BUS_ALL)
+			count = resource_size(&pe->pbus->busn_res);
+		else
+			count = 1;
+
+		switch(count) {
+		case  1: bcomp = OpalPciBusAll;         break;
+		case  2: bcomp = OpalPciBus7Bits;       break;
+		case  4: bcomp = OpalPciBus6Bits;       break;
+		case  8: bcomp = OpalPciBus5Bits;       break;
+		case 16: bcomp = OpalPciBus4Bits;       break;
+		case 32: bcomp = OpalPciBus3Bits;       break;
+		default:
+			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
+			        count);
+			/* Do an exact match only */
+			bcomp = OpalPciBusAll;
+		}
+		rid_end = pe->rid + (count << 8);
+	} else {
+#ifdef CONFIG_PCI_IOV
+		if (pe->flags & PNV_IODA_PE_VF)
+			parent = pe->parent_dev;
+		else
+#endif
+			parent = pe->pdev->bus->self;
+		bcomp = OpalPciBusAll;
+		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
+		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
+		rid_end = pe->rid + 1;
+	}
+
+	/* Clear the reverse map */
+	for (rid = pe->rid; rid < rid_end; rid++)
+		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
+
+	/*
+	 * Release from all parents PELT-V. NPUs don't have a PELTV
+	 * table
+	 */
+	if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI)
+		pnv_ioda_unset_peltv(phb, pe, parent);
+
+	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
+			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
+	if (rc)
+		pe_err(pe, "OPAL error %lld trying to setup PELT table\n", rc);
+
+	pe->pbus = NULL;
+	pe->pdev = NULL;
+#ifdef CONFIG_PCI_IOV
+	pe->parent_dev = NULL;
+#endif
+
+	return 0;
+}
+
+int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
+{
+	uint8_t bcomp, dcomp, fcomp;
+	long rc, rid_end, rid;
+
+	/* Bus validation ? */
+	if (pe->pbus) {
+		int count;
+
+		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
+		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
+		if (pe->flags & PNV_IODA_PE_BUS_ALL)
+			count = resource_size(&pe->pbus->busn_res);
+		else
+			count = 1;
+
+		switch(count) {
+		case  1: bcomp = OpalPciBusAll;		break;
+		case  2: bcomp = OpalPciBus7Bits;	break;
+		case  4: bcomp = OpalPciBus6Bits;	break;
+		case  8: bcomp = OpalPciBus5Bits;	break;
+		case 16: bcomp = OpalPciBus4Bits;	break;
+		case 32: bcomp = OpalPciBus3Bits;	break;
+		default:
+			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
+			        count);
+			/* Do an exact match only */
+			bcomp = OpalPciBusAll;
+		}
+		rid_end = pe->rid + (count << 8);
+	} else {
+		bcomp = OpalPciBusAll;
+		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
+		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
+		rid_end = pe->rid + 1;
+	}
+
+	/*
+	 * Associate PE in PELT. We need add the PE into the
+	 * corresponding PELT-V as well. Otherwise, the error
+	 * originated from the PE might contribute to other
+	 * PEs.
+	 */
+	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
+			     bcomp, dcomp, fcomp, OPAL_MAP_PE);
+	if (rc) {
+		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
+		return -ENXIO;
+	}
+
+	/*
+	 * Configure PELTV. NPUs don't have a PELTV table so skip
+	 * configuration on them.
+	 */
+	if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI)
+		pnv_ioda_set_peltv(phb, pe, true);
+
+	/* Setup reverse map */
+	for (rid = pe->rid; rid < rid_end; rid++)
+		phb->ioda.pe_rmap[rid] = pe->pe_number;
+
+	/* Setup one MVTs on IODA1 */
+	if (phb->type != PNV_PHB_IODA1) {
+		pe->mve_number = 0;
+		goto out;
+	}
+
+	pe->mve_number = pe->pe_number;
+	rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number);
+	if (rc != OPAL_SUCCESS) {
+		pe_err(pe, "OPAL error %ld setting up MVE %x\n",
+		       rc, pe->mve_number);
+		pe->mve_number = -1;
+	} else {
+		rc = opal_pci_set_mve_enable(phb->opal_id,
+					     pe->mve_number, OPAL_ENABLE_MVE);
+		if (rc) {
+			pe_err(pe, "OPAL error %ld enabling MVE %x\n",
+			       rc, pe->mve_number);
+			pe->mve_number = -1;
+		}
+	}
+
+out:
+	return 0;
+}
+
+static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
+{
+	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
+	struct pci_dn *pdn = pci_get_pdn(dev);
+	struct pnv_ioda_pe *pe;
+
+	if (!pdn) {
+		pr_err("%s: Device tree node not associated properly\n",
+			   pci_name(dev));
+		return NULL;
+	}
+	if (pdn->pe_number != IODA_INVALID_PE)
+		return NULL;
+
+	pe = pnv_ioda_alloc_pe(phb, 1);
+	if (!pe) {
+		pr_warn("%s: Not enough PE# available, disabling device\n",
+			pci_name(dev));
+		return NULL;
+	}
+
+	/* NOTE: We don't get a reference for the pointer in the PE
+	 * data structure, both the device and PE structures should be
+	 * destroyed at the same time. However, removing nvlink
+	 * devices will need some work.
+	 *
+	 * At some point we want to remove the PDN completely anyways
+	 */
+	pdn->pe_number = pe->pe_number;
+	pe->flags = PNV_IODA_PE_DEV;
+	pe->pdev = dev;
+	pe->pbus = NULL;
+	pe->mve_number = -1;
+	pe->rid = dev->bus->number << 8 | pdn->devfn;
+	pe->device_count++;
+
+	pe_info(pe, "Associated device to PE\n");
+
+	if (pnv_ioda_configure_pe(phb, pe)) {
+		/* XXX What do we do here ? */
+		pnv_ioda_free_pe(pe);
+		pdn->pe_number = IODA_INVALID_PE;
+		pe->pdev = NULL;
+		return NULL;
+	}
+
+	/* Put PE to the list */
+	mutex_lock(&phb->ioda.pe_list_mutex);
+	list_add_tail(&pe->list, &phb->ioda.pe_list);
+	mutex_unlock(&phb->ioda.pe_list_mutex);
+	return pe;
+}
+
+/*
+ * There're 2 types of PCI bus sensitive PEs: One that is compromised of
+ * single PCI bus. Another one that contains the primary PCI bus and its
+ * subordinate PCI devices and buses. The second type of PE is normally
+ * orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports.
+ */
+static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
+{
+	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
+	struct pnv_ioda_pe *pe = NULL;
+	unsigned int pe_num;
+
+	/*
+	 * In partial hotplug case, the PE instance might be still alive.
+	 * We should reuse it instead of allocating a new one.
+	 */
+	pe_num = phb->ioda.pe_rmap[bus->number << 8];
+	if (WARN_ON(pe_num != IODA_INVALID_PE)) {
+		pe = &phb->ioda.pe_array[pe_num];
+		return NULL;
+	}
+
+	/* PE number for root bus should have been reserved */
+	if (pci_is_root_bus(bus))
+		pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx];
+
+	/* Check if PE is determined by M64 */
+	if (!pe)
+		pe = pnv_ioda_pick_m64_pe(bus, all);
+
+	/* The PE number isn't pinned by M64 */
+	if (!pe)
+		pe = pnv_ioda_alloc_pe(phb, 1);
+
+	if (!pe) {
+		pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n",
+			__func__, pci_domain_nr(bus), bus->number);
+		return NULL;
+	}
+
+	pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
+	pe->pbus = bus;
+	pe->pdev = NULL;
+	pe->mve_number = -1;
+	pe->rid = bus->busn_res.start << 8;
+
+	if (all)
+		pe_info(pe, "Secondary bus %pad..%pad associated with PE#%x\n",
+			&bus->busn_res.start, &bus->busn_res.end,
+			pe->pe_number);
+	else
+		pe_info(pe, "Secondary bus %pad associated with PE#%x\n",
+			&bus->busn_res.start, pe->pe_number);
+
+	if (pnv_ioda_configure_pe(phb, pe)) {
+		/* XXX What do we do here ? */
+		pnv_ioda_free_pe(pe);
+		pe->pbus = NULL;
+		return NULL;
+	}
+
+	/* Put PE to the list */
+	list_add_tail(&pe->list, &phb->ioda.pe_list);
+
+	return pe;
+}
+
+static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
+{
+	int pe_num, found_pe = false, rc;
+	long rid;
+	struct pnv_ioda_pe *pe;
+	struct pci_dev *gpu_pdev;
+	struct pci_dn *npu_pdn;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(npu_pdev->bus);
+
+	/*
+	 * Intentionally leak a reference on the npu device (for
+	 * nvlink only; this is not an opencapi path) to make sure it
+	 * never goes away, as it's been the case all along and some
+	 * work is needed otherwise.
+	 */
+	pci_dev_get(npu_pdev);
+
+	/*
+	 * Due to a hardware errata PE#0 on the NPU is reserved for
+	 * error handling. This means we only have three PEs remaining
+	 * which need to be assigned to four links, implying some
+	 * links must share PEs.
+	 *
+	 * To achieve this we assign PEs such that NPUs linking the
+	 * same GPU get assigned the same PE.
+	 */
+	gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev);
+	for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
+		pe = &phb->ioda.pe_array[pe_num];
+		if (!pe->pdev)
+			continue;
+
+		if (pnv_pci_get_gpu_dev(pe->pdev) == gpu_pdev) {
+			/*
+			 * This device has the same peer GPU so should
+			 * be assigned the same PE as the existing
+			 * peer NPU.
+			 */
+			dev_info(&npu_pdev->dev,
+				"Associating to existing PE %x\n", pe_num);
+			npu_pdn = pci_get_pdn(npu_pdev);
+			rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
+			npu_pdn->pe_number = pe_num;
+			phb->ioda.pe_rmap[rid] = pe->pe_number;
+			pe->device_count++;
+
+			/* Map the PE to this link */
+			rc = opal_pci_set_pe(phb->opal_id, pe_num, rid,
+					OpalPciBusAll,
+					OPAL_COMPARE_RID_DEVICE_NUMBER,
+					OPAL_COMPARE_RID_FUNCTION_NUMBER,
+					OPAL_MAP_PE);
+			WARN_ON(rc != OPAL_SUCCESS);
+			found_pe = true;
+			break;
+		}
+	}
+
+	if (!found_pe)
+		/*
+		 * Could not find an existing PE so allocate a new
+		 * one.
+		 */
+		return pnv_ioda_setup_dev_PE(npu_pdev);
+	else
+		return pe;
+}
+
+static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
+{
+	struct pci_dev *pdev;
+
+	list_for_each_entry(pdev, &bus->devices, bus_list)
+		pnv_ioda_setup_npu_PE(pdev);
+}
+
+static void pnv_pci_ioda_setup_nvlink(void)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	struct pnv_ioda_pe *pe;
+
+	list_for_each_entry(hose, &hose_list, list_node) {
+		phb = hose->private_data;
+		if (phb->type == PNV_PHB_NPU_NVLINK) {
+			/* PE#0 is needed for error reporting */
+			pnv_ioda_reserve_pe(phb, 0);
+			pnv_ioda_setup_npu_PEs(hose->bus);
+			if (phb->model == PNV_PHB_MODEL_NPU2)
+				WARN_ON_ONCE(pnv_npu2_init(hose));
+		}
+	}
+	list_for_each_entry(hose, &hose_list, list_node) {
+		phb = hose->private_data;
+		if (phb->type != PNV_PHB_IODA2)
+			continue;
+
+		list_for_each_entry(pe, &phb->ioda.pe_list, list)
+			pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV);
+	}
+
+#ifdef CONFIG_IOMMU_API
+	/* setup iommu groups so we can do nvlink pass-thru */
+	pnv_pci_npu_setup_iommu_groups();
+#endif
+}
+
+static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
+				       struct pnv_ioda_pe *pe);
+
+static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev)
+{
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
+	struct pci_dn *pdn = pci_get_pdn(pdev);
+	struct pnv_ioda_pe *pe;
+
+	/* Check if the BDFN for this device is associated with a PE yet */
+	pe = pnv_pci_bdfn_to_pe(phb, pdev->devfn | (pdev->bus->number << 8));
+	if (!pe) {
+		/* VF PEs should be pre-configured in pnv_pci_sriov_enable() */
+		if (WARN_ON(pdev->is_virtfn))
+			return;
+
+		pnv_pci_configure_bus(pdev->bus);
+		pe = pnv_pci_bdfn_to_pe(phb, pdev->devfn | (pdev->bus->number << 8));
+		pci_info(pdev, "Configured PE#%x\n", pe ? pe->pe_number : 0xfffff);
+
+
+		/*
+		 * If we can't setup the IODA PE something has gone horribly
+		 * wrong and we can't enable DMA for the device.
+		 */
+		if (WARN_ON(!pe))
+			return;
+	} else {
+		pci_info(pdev, "Added to existing PE#%x\n", pe->pe_number);
+	}
+
+	/*
+	 * We assume that bridges *probably* don't need to do any DMA so we can
+	 * skip allocating a TCE table, etc unless we get a non-bridge device.
+	 */
+	if (!pe->dma_setup_done && !pci_is_bridge(pdev)) {
+		switch (phb->type) {
+		case PNV_PHB_IODA1:
+			pnv_pci_ioda1_setup_dma_pe(phb, pe);
+			break;
+		case PNV_PHB_IODA2:
+			pnv_pci_ioda2_setup_dma_pe(phb, pe);
+			break;
+		default:
+			pr_warn("%s: No DMA for PHB#%x (type %d)\n",
+				__func__, phb->hose->global_number, phb->type);
+		}
+	}
+
+	if (pdn)
+		pdn->pe_number = pe->pe_number;
+	pe->device_count++;
+
+	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
+	pdev->dev.archdata.dma_offset = pe->tce_bypass_base;
+	set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
+
+	/* PEs with a DMA weight of zero won't have a group */
+	if (pe->table_group.group)
+		iommu_add_device(&pe->table_group, &pdev->dev);
+}
+
+/*
+ * Reconfigure TVE#0 to be usable as 64-bit DMA space.
+ *
+ * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses.
+ * Devices can only access more than that if bit 59 of the PCI address is set
+ * by hardware, which indicates TVE#1 should be used instead of TVE#0.
+ * Many PCI devices are not capable of addressing that many bits, and as a
+ * result are limited to the 4GB of virtual memory made available to 32-bit
+ * devices in TVE#0.
+ *
+ * In order to work around this, reconfigure TVE#0 to be suitable for 64-bit
+ * devices by configuring the virtual memory past the first 4GB inaccessible
+ * by 64-bit DMAs.  This should only be used by devices that want more than
+ * 4GB, and only on PEs that have no 32-bit devices.
+ *
+ * Currently this will only work on PHB3 (POWER8).
+ */
+static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
+{
+	u64 window_size, table_size, tce_count, addr;
+	struct page *table_pages;
+	u64 tce_order = 28; /* 256MB TCEs */
+	__be64 *tces;
+	s64 rc;
+
+	/*
+	 * Window size needs to be a power of two, but needs to account for
+	 * shifting memory by the 4GB offset required to skip 32bit space.
+	 */
+	window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32));
+	tce_count = window_size >> tce_order;
+	table_size = tce_count << 3;
+
+	if (table_size < PAGE_SIZE)
+		table_size = PAGE_SIZE;
+
+	table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL,
+				       get_order(table_size));
+	if (!table_pages)
+		goto err;
+
+	tces = page_address(table_pages);
+	if (!tces)
+		goto err;
+
+	memset(tces, 0, table_size);
+
+	for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) {
+		tces[(addr + (1ULL << 32)) >> tce_order] =
+			cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
+	}
+
+	rc = opal_pci_map_pe_dma_window(pe->phb->opal_id,
+					pe->pe_number,
+					/* reconfigure window 0 */
+					(pe->pe_number << 1) + 0,
+					1,
+					__pa(tces),
+					table_size,
+					1 << tce_order);
+	if (rc == OPAL_SUCCESS) {
+		pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
+		return 0;
+	}
+err:
+	pe_err(pe, "Error configuring 64-bit DMA bypass\n");
+	return -EIO;
+}
+
+static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
+		u64 dma_mask)
+{
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
+	struct pci_dn *pdn = pci_get_pdn(pdev);
+	struct pnv_ioda_pe *pe;
+
+	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+		return false;
+
+	pe = &phb->ioda.pe_array[pdn->pe_number];
+	if (pe->tce_bypass_enabled) {
+		u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
+		if (dma_mask >= top)
+			return true;
+	}
+
+	/*
+	 * If the device can't set the TCE bypass bit but still wants
+	 * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
+	 * bypass the 32-bit region and be usable for 64-bit DMAs.
+	 * The device needs to be able to address all of this space.
+	 */
+	if (dma_mask >> 32 &&
+	    dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
+	    /* pe->pdev should be set if it's a single device, pe->pbus if not */
+	    (pe->device_count == 1 || !pe->pbus) &&
+	    phb->model == PNV_PHB_MODEL_PHB3) {
+		/* Configure the bypass mode */
+		s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe);
+		if (rc)
+			return false;
+		/* 4GB offset bypasses 32-bit space */
+		pdev->dev.archdata.dma_offset = (1ULL << 32);
+		return true;
+	}
+
+	return false;
+}
+
+static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb,
+						     bool real_mode)
+{
+	return real_mode ? (__be64 __iomem *)(phb->regs_phys + 0x210) :
+		(phb->regs + 0x210);
+}
+
+static void pnv_pci_p7ioc_tce_invalidate(struct iommu_table *tbl,
+		unsigned long index, unsigned long npages, bool rm)
+{
+	struct iommu_table_group_link *tgl = list_first_entry_or_null(
+			&tbl->it_group_list, struct iommu_table_group_link,
+			next);
+	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
+			struct pnv_ioda_pe, table_group);
+	__be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
+	unsigned long start, end, inc;
+
+	start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
+	end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset +
+			npages - 1);
+
+	/* p7ioc-style invalidation, 2 TCEs per write */
+	start |= (1ull << 63);
+	end |= (1ull << 63);
+	inc = 16;
+        end |= inc - 1;	/* round up end to be different than start */
+
+        mb(); /* Ensure above stores are visible */
+        while (start <= end) {
+		if (rm)
+			__raw_rm_writeq_be(start, invalidate);
+		else
+			__raw_writeq_be(start, invalidate);
+
+                start += inc;
+        }
+
+	/*
+	 * The iommu layer will do another mb() for us on build()
+	 * and we don't care on free()
+	 */
+}
+
+static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
+		long npages, unsigned long uaddr,
+		enum dma_data_direction direction,
+		unsigned long attrs)
+{
+	int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
+			attrs);
+
+	if (!ret)
+		pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
+
+	return ret;
+}
+
+#ifdef CONFIG_IOMMU_API
+/* Common for IODA1 and IODA2 */
+static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction,
+		bool realmode)
+{
+	return pnv_tce_xchg(tbl, index, hpa, direction, !realmode);
+}
+#endif
+
+static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
+		long npages)
+{
+	pnv_tce_free(tbl, index, npages);
+
+	pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
+}
+
+static struct iommu_table_ops pnv_ioda1_iommu_ops = {
+	.set = pnv_ioda1_tce_build,
+#ifdef CONFIG_IOMMU_API
+	.xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
+	.tce_kill = pnv_pci_p7ioc_tce_invalidate,
+	.useraddrptr = pnv_tce_useraddrptr,
+#endif
+	.clear = pnv_ioda1_tce_free,
+	.get = pnv_tce_get,
+};
+
+#define PHB3_TCE_KILL_INVAL_ALL		PPC_BIT(0)
+#define PHB3_TCE_KILL_INVAL_PE		PPC_BIT(1)
+#define PHB3_TCE_KILL_INVAL_ONE		PPC_BIT(2)
+
+static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
+{
+	__be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm);
+	const unsigned long val = PHB3_TCE_KILL_INVAL_ALL;
+
+	mb(); /* Ensure previous TCE table stores are visible */
+	if (rm)
+		__raw_rm_writeq_be(val, invalidate);
+	else
+		__raw_writeq_be(val, invalidate);
+}
+
+static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe)
+{
+	/* 01xb - invalidate TCEs that match the specified PE# */
+	__be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, false);
+	unsigned long val = PHB3_TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF);
+
+	mb(); /* Ensure above stores are visible */
+	__raw_writeq_be(val, invalidate);
+}
+
+static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
+					unsigned shift, unsigned long index,
+					unsigned long npages)
+{
+	__be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
+	unsigned long start, end, inc;
+
+	/* We'll invalidate DMA address in PE scope */
+	start = PHB3_TCE_KILL_INVAL_ONE;
+	start |= (pe->pe_number & 0xFF);
+	end = start;
+
+	/* Figure out the start, end and step */
+	start |= (index << shift);
+	end |= ((index + npages - 1) << shift);
+	inc = (0x1ull << shift);
+	mb();
+
+	while (start <= end) {
+		if (rm)
+			__raw_rm_writeq_be(start, invalidate);
+		else
+			__raw_writeq_be(start, invalidate);
+		start += inc;
+	}
+}
+
+static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
+{
+	struct pnv_phb *phb = pe->phb;
+
+	if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
+		pnv_pci_phb3_tce_invalidate_pe(pe);
+	else
+		opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL_PE,
+				  pe->pe_number, 0, 0, 0);
+}
+
+static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
+		unsigned long index, unsigned long npages, bool rm)
+{
+	struct iommu_table_group_link *tgl;
+
+	list_for_each_entry_lockless(tgl, &tbl->it_group_list, next) {
+		struct pnv_ioda_pe *pe = container_of(tgl->table_group,
+				struct pnv_ioda_pe, table_group);
+		struct pnv_phb *phb = pe->phb;
+		unsigned int shift = tbl->it_page_shift;
+
+		/*
+		 * NVLink1 can use the TCE kill register directly as
+		 * it's the same as PHB3. NVLink2 is different and
+		 * should go via the OPAL call.
+		 */
+		if (phb->model == PNV_PHB_MODEL_NPU) {
+			/*
+			 * The NVLink hardware does not support TCE kill
+			 * per TCE entry so we have to invalidate
+			 * the entire cache for it.
+			 */
+			pnv_pci_phb3_tce_invalidate_entire(phb, rm);
+			continue;
+		}
+		if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
+			pnv_pci_phb3_tce_invalidate(pe, rm, shift,
+						    index, npages);
+		else
+			opal_pci_tce_kill(phb->opal_id,
+					  OPAL_PCI_TCE_KILL_PAGES,
+					  pe->pe_number, 1u << shift,
+					  index << shift, npages);
+	}
+}
+
+void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
+{
+	if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3)
+		pnv_pci_phb3_tce_invalidate_entire(phb, rm);
+	else
+		opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0);
+}
+
+static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
+		long npages, unsigned long uaddr,
+		enum dma_data_direction direction,
+		unsigned long attrs)
+{
+	int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
+			attrs);
+
+	if (!ret)
+		pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
+
+	return ret;
+}
+
+static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
+		long npages)
+{
+	pnv_tce_free(tbl, index, npages);
+
+	pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
+}
+
+static struct iommu_table_ops pnv_ioda2_iommu_ops = {
+	.set = pnv_ioda2_tce_build,
+#ifdef CONFIG_IOMMU_API
+	.xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
+	.tce_kill = pnv_pci_ioda2_tce_invalidate,
+	.useraddrptr = pnv_tce_useraddrptr,
+#endif
+	.clear = pnv_ioda2_tce_free,
+	.get = pnv_tce_get,
+	.free = pnv_pci_ioda2_table_free_pages,
+};
+
+static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data)
+{
+	unsigned int *weight = (unsigned int *)data;
+
+	/* This is quite simplistic. The "base" weight of a device
+	 * is 10. 0 means no DMA is to be accounted for it.
+	 */
+	if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
+		return 0;
+
+	if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
+	    dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
+	    dev->class == PCI_CLASS_SERIAL_USB_EHCI)
+		*weight += 3;
+	else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
+		*weight += 15;
+	else
+		*weight += 10;
+
+	return 0;
+}
+
+static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe)
+{
+	unsigned int weight = 0;
+
+	/* SRIOV VF has same DMA32 weight as its PF */
+#ifdef CONFIG_PCI_IOV
+	if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) {
+		pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight);
+		return weight;
+	}
+#endif
+
+	if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) {
+		pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight);
+	} else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) {
+		struct pci_dev *pdev;
+
+		list_for_each_entry(pdev, &pe->pbus->devices, bus_list)
+			pnv_pci_ioda_dev_dma_weight(pdev, &weight);
+	} else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) {
+		pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight);
+	}
+
+	return weight;
+}
+
+static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
+				       struct pnv_ioda_pe *pe)
+{
+
+	struct page *tce_mem = NULL;
+	struct iommu_table *tbl;
+	unsigned int weight, total_weight = 0;
+	unsigned int tce32_segsz, base, segs, avail, i;
+	int64_t rc;
+	void *addr;
+
+	/* XXX FIXME: Handle 64-bit only DMA devices */
+	/* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
+	/* XXX FIXME: Allocate multi-level tables on PHB3 */
+	weight = pnv_pci_ioda_pe_dma_weight(pe);
+	if (!weight)
+		return;
+
+	pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight,
+		     &total_weight);
+	segs = (weight * phb->ioda.dma32_count) / total_weight;
+	if (!segs)
+		segs = 1;
+
+	/*
+	 * Allocate contiguous DMA32 segments. We begin with the expected
+	 * number of segments. With one more attempt, the number of DMA32
+	 * segments to be allocated is decreased by one until one segment
+	 * is allocated successfully.
+	 */
+	do {
+		for (base = 0; base <= phb->ioda.dma32_count - segs; base++) {
+			for (avail = 0, i = base; i < base + segs; i++) {
+				if (phb->ioda.dma32_segmap[i] ==
+				    IODA_INVALID_PE)
+					avail++;
+			}
+
+			if (avail == segs)
+				goto found;
+		}
+	} while (--segs);
+
+	if (!segs) {
+		pe_warn(pe, "No available DMA32 segments\n");
+		return;
+	}
+
+found:
+	tbl = pnv_pci_table_alloc(phb->hose->node);
+	if (WARN_ON(!tbl))
+		return;
+
+	iommu_register_group(&pe->table_group, phb->hose->global_number,
+			pe->pe_number);
+	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
+
+	/* Grab a 32-bit TCE table */
+	pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n",
+		weight, total_weight, base, segs);
+	pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
+		base * PNV_IODA1_DMA32_SEGSIZE,
+		(base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1);
+
+	/* XXX Currently, we allocate one big contiguous table for the
+	 * TCEs. We only really need one chunk per 256M of TCE space
+	 * (ie per segment) but that's an optimization for later, it
+	 * requires some added smarts with our get/put_tce implementation
+	 *
+	 * Each TCE page is 4KB in size and each TCE entry occupies 8
+	 * bytes
+	 */
+	tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3);
+	tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
+				   get_order(tce32_segsz * segs));
+	if (!tce_mem) {
+		pe_err(pe, " Failed to allocate a 32-bit TCE memory\n");
+		goto fail;
+	}
+	addr = page_address(tce_mem);
+	memset(addr, 0, tce32_segsz * segs);
+
+	/* Configure HW */
+	for (i = 0; i < segs; i++) {
+		rc = opal_pci_map_pe_dma_window(phb->opal_id,
+					      pe->pe_number,
+					      base + i, 1,
+					      __pa(addr) + tce32_segsz * i,
+					      tce32_segsz, IOMMU_PAGE_SIZE_4K);
+		if (rc) {
+			pe_err(pe, " Failed to configure 32-bit TCE table, err %lld\n",
+			       rc);
+			goto fail;
+		}
+	}
+
+	/* Setup DMA32 segment mapping */
+	for (i = base; i < base + segs; i++)
+		phb->ioda.dma32_segmap[i] = pe->pe_number;
+
+	/* Setup linux iommu table */
+	pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs,
+				  base * PNV_IODA1_DMA32_SEGSIZE,
+				  IOMMU_PAGE_SHIFT_4K);
+
+	tbl->it_ops = &pnv_ioda1_iommu_ops;
+	pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
+	pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
+	iommu_init_table(tbl, phb->hose->node, 0, 0);
+
+	pe->dma_setup_done = true;
+	return;
+ fail:
+	/* XXX Failure: Try to fallback to 64-bit only ? */
+	if (tce_mem)
+		__free_pages(tce_mem, get_order(tce32_segsz * segs));
+	if (tbl) {
+		pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
+		iommu_tce_table_put(tbl);
+	}
+}
+
+static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
+		int num, struct iommu_table *tbl)
+{
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
+	struct pnv_phb *phb = pe->phb;
+	int64_t rc;
+	const unsigned long size = tbl->it_indirect_levels ?
+			tbl->it_level_size : tbl->it_size;
+	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
+	const __u64 win_size = tbl->it_size << tbl->it_page_shift;
+
+	pe_info(pe, "Setting up window#%d %llx..%llx pg=%lx\n",
+		num, start_addr, start_addr + win_size - 1,
+		IOMMU_PAGE_SIZE(tbl));
+
+	/*
+	 * Map TCE table through TVT. The TVE index is the PE number
+	 * shifted by 1 bit for 32-bits DMA space.
+	 */
+	rc = opal_pci_map_pe_dma_window(phb->opal_id,
+			pe->pe_number,
+			(pe->pe_number << 1) + num,
+			tbl->it_indirect_levels + 1,
+			__pa(tbl->it_base),
+			size << 3,
+			IOMMU_PAGE_SIZE(tbl));
+	if (rc) {
+		pe_err(pe, "Failed to configure TCE table, err %lld\n", rc);
+		return rc;
+	}
+
+	pnv_pci_link_table_and_group(phb->hose->node, num,
+			tbl, &pe->table_group);
+	pnv_pci_ioda2_tce_invalidate_pe(pe);
+
+	return 0;
+}
+
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
+{
+	uint16_t window_id = (pe->pe_number << 1 ) + 1;
+	int64_t rc;
+
+	pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis");
+	if (enable) {
+		phys_addr_t top = memblock_end_of_DRAM();
+
+		top = roundup_pow_of_two(top);
+		rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
+						     pe->pe_number,
+						     window_id,
+						     pe->tce_bypass_base,
+						     top);
+	} else {
+		rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
+						     pe->pe_number,
+						     window_id,
+						     pe->tce_bypass_base,
+						     0);
+	}
+	if (rc)
+		pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
+	else
+		pe->tce_bypass_enabled = enable;
+}
+
+static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
+		int num, __u32 page_shift, __u64 window_size, __u32 levels,
+		bool alloc_userspace_copy, struct iommu_table **ptbl)
+{
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
+	int nid = pe->phb->hose->node;
+	__u64 bus_offset = num ? pe->tce_bypass_base : table_group->tce32_start;
+	long ret;
+	struct iommu_table *tbl;
+
+	tbl = pnv_pci_table_alloc(nid);
+	if (!tbl)
+		return -ENOMEM;
+
+	tbl->it_ops = &pnv_ioda2_iommu_ops;
+
+	ret = pnv_pci_ioda2_table_alloc_pages(nid,
+			bus_offset, page_shift, window_size,
+			levels, alloc_userspace_copy, tbl);
+	if (ret) {
+		iommu_tce_table_put(tbl);
+		return ret;
+	}
+
+	*ptbl = tbl;
+
+	return 0;
+}
+
+static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
+{
+	struct iommu_table *tbl = NULL;
+	long rc;
+	unsigned long res_start, res_end;
+
+	/*
+	 * crashkernel= specifies the kdump kernel's maximum memory at
+	 * some offset and there is no guaranteed the result is a power
+	 * of 2, which will cause errors later.
+	 */
+	const u64 max_memory = __rounddown_pow_of_two(memory_hotplug_max());
+
+	/*
+	 * In memory constrained environments, e.g. kdump kernel, the
+	 * DMA window can be larger than available memory, which will
+	 * cause errors later.
+	 */
+	const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1);
+
+	/*
+	 * We create the default window as big as we can. The constraint is
+	 * the max order of allocation possible. The TCE table is likely to
+	 * end up being multilevel and with on-demand allocation in place,
+	 * the initial use is not going to be huge as the default window aims
+	 * to support crippled devices (i.e. not fully 64bit DMAble) only.
+	 */
+	/* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */
+	const u64 window_size = min((maxblock * 8) << PAGE_SHIFT, max_memory);
+	/* Each TCE level cannot exceed maxblock so go multilevel if needed */
+	unsigned long tces_order = ilog2(window_size >> PAGE_SHIFT);
+	unsigned long tcelevel_order = ilog2(maxblock >> 3);
+	unsigned int levels = tces_order / tcelevel_order;
+
+	if (tces_order % tcelevel_order)
+		levels += 1;
+	/*
+	 * We try to stick to default levels (which is >1 at the moment) in
+	 * order to save memory by relying on on-demain TCE level allocation.
+	 */
+	levels = max_t(unsigned int, levels, POWERNV_IOMMU_DEFAULT_LEVELS);
+
+	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, PAGE_SHIFT,
+			window_size, levels, false, &tbl);
+	if (rc) {
+		pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
+				rc);
+		return rc;
+	}
+
+	/* We use top part of 32bit space for MMIO so exclude it from DMA */
+	res_start = 0;
+	res_end = 0;
+	if (window_size > pe->phb->ioda.m32_pci_base) {
+		res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift;
+		res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
+	}
+	iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end);
+
+	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
+	if (rc) {
+		pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
+				rc);
+		iommu_tce_table_put(tbl);
+		return rc;
+	}
+
+	if (!pnv_iommu_bypass_disabled)
+		pnv_pci_ioda2_set_bypass(pe, true);
+
+	/*
+	 * Set table base for the case of IOMMU DMA use. Usually this is done
+	 * from dma_dev_setup() which is not called when a device is returned
+	 * from VFIO so do it here.
+	 */
+	if (pe->pdev)
+		set_iommu_table_base(&pe->pdev->dev, tbl);
+
+	return 0;
+}
+
+static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
+		int num)
+{
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
+	struct pnv_phb *phb = pe->phb;
+	long ret;
+
+	pe_info(pe, "Removing DMA window #%d\n", num);
+
+	ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
+			(pe->pe_number << 1) + num,
+			0/* levels */, 0/* table address */,
+			0/* table size */, 0/* page size */);
+	if (ret)
+		pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
+	else
+		pnv_pci_ioda2_tce_invalidate_pe(pe);
+
+	pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
+
+	return ret;
+}
+
+#ifdef CONFIG_IOMMU_API
+unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
+		__u64 window_size, __u32 levels)
+{
+	unsigned long bytes = 0;
+	const unsigned window_shift = ilog2(window_size);
+	unsigned entries_shift = window_shift - page_shift;
+	unsigned table_shift = entries_shift + 3;
+	unsigned long tce_table_size = max(0x1000UL, 1UL << table_shift);
+	unsigned long direct_table_size;
+
+	if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS) ||
+			!is_power_of_2(window_size))
+		return 0;
+
+	/* Calculate a direct table size from window_size and levels */
+	entries_shift = (entries_shift + levels - 1) / levels;
+	table_shift = entries_shift + 3;
+	table_shift = max_t(unsigned, table_shift, PAGE_SHIFT);
+	direct_table_size =  1UL << table_shift;
+
+	for ( ; levels; --levels) {
+		bytes += ALIGN(tce_table_size, direct_table_size);
+
+		tce_table_size /= direct_table_size;
+		tce_table_size <<= 3;
+		tce_table_size = max_t(unsigned long,
+				tce_table_size, direct_table_size);
+	}
+
+	return bytes + bytes; /* one for HW table, one for userspace copy */
+}
+
+static long pnv_pci_ioda2_create_table_userspace(
+		struct iommu_table_group *table_group,
+		int num, __u32 page_shift, __u64 window_size, __u32 levels,
+		struct iommu_table **ptbl)
+{
+	long ret = pnv_pci_ioda2_create_table(table_group,
+			num, page_shift, window_size, levels, true, ptbl);
+
+	if (!ret)
+		(*ptbl)->it_allocated_size = pnv_pci_ioda2_get_table_size(
+				page_shift, window_size, levels);
+	return ret;
+}
+
+static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
+		dev->dev.archdata.dma_offset = pe->tce_bypass_base;
+
+		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
+			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
+	}
+}
+
+static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
+{
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+						table_group);
+	/* Store @tbl as pnv_pci_ioda2_unset_window() resets it */
+	struct iommu_table *tbl = pe->table_group.tables[0];
+
+	pnv_pci_ioda2_set_bypass(pe, false);
+	pnv_pci_ioda2_unset_window(&pe->table_group, 0);
+	if (pe->pbus)
+		pnv_ioda_setup_bus_dma(pe, pe->pbus);
+	else if (pe->pdev)
+		set_iommu_table_base(&pe->pdev->dev, NULL);
+	iommu_tce_table_put(tbl);
+}
+
+static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
+{
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+						table_group);
+
+	pnv_pci_ioda2_setup_default_config(pe);
+	if (pe->pbus)
+		pnv_ioda_setup_bus_dma(pe, pe->pbus);
+}
+
+static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
+	.get_table_size = pnv_pci_ioda2_get_table_size,
+	.create_table = pnv_pci_ioda2_create_table_userspace,
+	.set_window = pnv_pci_ioda2_set_window,
+	.unset_window = pnv_pci_ioda2_unset_window,
+	.take_ownership = pnv_ioda2_take_ownership,
+	.release_ownership = pnv_ioda2_release_ownership,
+};
+#endif
+
+void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+				struct pnv_ioda_pe *pe)
+{
+	int64_t rc;
+
+	/* TVE #1 is selected by PCI address bit 59 */
+	pe->tce_bypass_base = 1ull << 59;
+
+	/* The PE will reserve all possible 32-bits space */
+	pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
+		phb->ioda.m32_pci_base);
+
+	/* Setup linux iommu table */
+	pe->table_group.tce32_start = 0;
+	pe->table_group.tce32_size = phb->ioda.m32_pci_base;
+	pe->table_group.max_dynamic_windows_supported =
+			IOMMU_TABLE_GROUP_MAX_TABLES;
+	pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
+	pe->table_group.pgsizes = pnv_ioda_parse_tce_sizes(phb);
+
+	rc = pnv_pci_ioda2_setup_default_config(pe);
+	if (rc)
+		return;
+
+#ifdef CONFIG_IOMMU_API
+	pe->table_group.ops = &pnv_pci_ioda2_ops;
+	iommu_register_group(&pe->table_group, phb->hose->global_number,
+			     pe->pe_number);
+#endif
+	pe->dma_setup_done = true;
+}
+
+int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
+{
+	struct pnv_phb *phb = container_of(chip, struct pnv_phb,
+					   ioda.irq_chip);
+
+	return opal_pci_msi_eoi(phb->opal_id, hw_irq);
+}
+
+static void pnv_ioda2_msi_eoi(struct irq_data *d)
+{
+	int64_t rc;
+	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+	struct irq_chip *chip = irq_data_get_irq_chip(d);
+
+	rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
+	WARN_ON_ONCE(rc);
+
+	icp_native_eoi(d);
+}
+
+
+void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
+{
+	struct irq_data *idata;
+	struct irq_chip *ichip;
+
+	/* The MSI EOI OPAL call is only needed on PHB3 */
+	if (phb->model != PNV_PHB_MODEL_PHB3)
+		return;
+
+	if (!phb->ioda.irq_chip_init) {
+		/*
+		 * First time we setup an MSI IRQ, we need to setup the
+		 * corresponding IRQ chip to route correctly.
+		 */
+		idata = irq_get_irq_data(virq);
+		ichip = irq_data_get_irq_chip(idata);
+		phb->ioda.irq_chip_init = 1;
+		phb->ioda.irq_chip = *ichip;
+		phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
+	}
+	irq_set_chip(virq, &phb->ioda.irq_chip);
+}
+
+/*
+ * Returns true iff chip is something that we could call
+ * pnv_opal_pci_msi_eoi for.
+ */
+bool is_pnv_opal_msi(struct irq_chip *chip)
+{
+	return chip->irq_eoi == pnv_ioda2_msi_eoi;
+}
+EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
+
+static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
+				  unsigned int hwirq, unsigned int virq,
+				  unsigned int is_64, struct msi_msg *msg)
+{
+	struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
+	unsigned int xive_num = hwirq - phb->msi_base;
+	__be32 data;
+	int rc;
+
+	/* No PE assigned ? bail out ... no MSI for you ! */
+	if (pe == NULL)
+		return -ENXIO;
+
+	/* Check if we have an MVE */
+	if (pe->mve_number < 0)
+		return -ENXIO;
+
+	/* Force 32-bit MSI on some broken devices */
+	if (dev->no_64bit_msi)
+		is_64 = 0;
+
+	/* Assign XIVE to PE */
+	rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
+	if (rc) {
+		pr_warn("%s: OPAL error %d setting XIVE %d PE\n",
+			pci_name(dev), rc, xive_num);
+		return -EIO;
+	}
+
+	if (is_64) {
+		__be64 addr64;
+
+		rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1,
+				     &addr64, &data);
+		if (rc) {
+			pr_warn("%s: OPAL error %d getting 64-bit MSI data\n",
+				pci_name(dev), rc);
+			return -EIO;
+		}
+		msg->address_hi = be64_to_cpu(addr64) >> 32;
+		msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful;
+	} else {
+		__be32 addr32;
+
+		rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1,
+				     &addr32, &data);
+		if (rc) {
+			pr_warn("%s: OPAL error %d getting 32-bit MSI data\n",
+				pci_name(dev), rc);
+			return -EIO;
+		}
+		msg->address_hi = 0;
+		msg->address_lo = be32_to_cpu(addr32);
+	}
+	msg->data = be32_to_cpu(data);
+
+	pnv_set_msi_irq_chip(phb, virq);
+
+	pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
+		 " address=%x_%08x data=%x PE# %x\n",
+		 pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
+		 msg->address_hi, msg->address_lo, data, pe->pe_number);
+
+	return 0;
+}
+
+static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
+{
+	unsigned int count;
+	const __be32 *prop = of_get_property(phb->hose->dn,
+					     "ibm,opal-msi-ranges", NULL);
+	if (!prop) {
+		/* BML Fallback */
+		prop = of_get_property(phb->hose->dn, "msi-ranges", NULL);
+	}
+	if (!prop)
+		return;
+
+	phb->msi_base = be32_to_cpup(prop);
+	count = be32_to_cpup(prop + 1);
+	if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) {
+		pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
+		       phb->hose->global_number);
+		return;
+	}
+
+	phb->msi_setup = pnv_pci_ioda_msi_setup;
+	phb->msi32_support = 1;
+	pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
+		count, phb->msi_base);
+}
+
+static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
+				  struct resource *res)
+{
+	struct pnv_phb *phb = pe->phb;
+	struct pci_bus_region region;
+	int index;
+	int64_t rc;
+
+	if (!res || !res->flags || res->start > res->end ||
+	    res->flags & IORESOURCE_UNSET)
+		return;
+
+	if (res->flags & IORESOURCE_IO) {
+		region.start = res->start - phb->ioda.io_pci_base;
+		region.end   = res->end - phb->ioda.io_pci_base;
+		index = region.start / phb->ioda.io_segsize;
+
+		while (index < phb->ioda.total_pe_num &&
+		       region.start <= region.end) {
+			phb->ioda.io_segmap[index] = pe->pe_number;
+			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+				pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
+			if (rc != OPAL_SUCCESS) {
+				pr_err("%s: Error %lld mapping IO segment#%d to PE#%x\n",
+				       __func__, rc, index, pe->pe_number);
+				break;
+			}
+
+			region.start += phb->ioda.io_segsize;
+			index++;
+		}
+	} else if ((res->flags & IORESOURCE_MEM) &&
+		   !pnv_pci_is_m64(phb, res)) {
+		region.start = res->start -
+			       phb->hose->mem_offset[0] -
+			       phb->ioda.m32_pci_base;
+		region.end   = res->end -
+			       phb->hose->mem_offset[0] -
+			       phb->ioda.m32_pci_base;
+		index = region.start / phb->ioda.m32_segsize;
+
+		while (index < phb->ioda.total_pe_num &&
+		       region.start <= region.end) {
+			phb->ioda.m32_segmap[index] = pe->pe_number;
+			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+				pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
+			if (rc != OPAL_SUCCESS) {
+				pr_err("%s: Error %lld mapping M32 segment#%d to PE#%x",
+				       __func__, rc, index, pe->pe_number);
+				break;
+			}
+
+			region.start += phb->ioda.m32_segsize;
+			index++;
+		}
+	}
+}
+
+/*
+ * This function is supposed to be called on basis of PE from top
+ * to bottom style. So the the I/O or MMIO segment assigned to
+ * parent PE could be overridden by its child PEs if necessary.
+ */
+static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe)
+{
+	struct pci_dev *pdev;
+	int i;
+
+	/*
+	 * NOTE: We only care PCI bus based PE for now. For PCI
+	 * device based PE, for example SRIOV sensitive VF should
+	 * be figured out later.
+	 */
+	BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)));
+
+	list_for_each_entry(pdev, &pe->pbus->devices, bus_list) {
+		for (i = 0; i <= PCI_ROM_RESOURCE; i++)
+			pnv_ioda_setup_pe_res(pe, &pdev->resource[i]);
+
+		/*
+		 * If the PE contains all subordinate PCI buses, the
+		 * windows of the child bridges should be mapped to
+		 * the PE as well.
+		 */
+		if (!(pe->flags & PNV_IODA_PE_BUS_ALL) || !pci_is_bridge(pdev))
+			continue;
+		for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++)
+			pnv_ioda_setup_pe_res(pe,
+				&pdev->resource[PCI_BRIDGE_RESOURCES + i]);
+	}
+}
+
+#ifdef CONFIG_DEBUG_FS
+static int pnv_pci_diag_data_set(void *data, u64 val)
+{
+	struct pnv_phb *phb = data;
+	s64 ret;
+
+	/* Retrieve the diag data from firmware */
+	ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data,
+					  phb->diag_data_size);
+	if (ret != OPAL_SUCCESS)
+		return -EIO;
+
+	/* Print the diag data to the kernel log */
+	pnv_pci_dump_phb_diag_data(phb->hose, phb->diag_data);
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_diag_data_fops, NULL, pnv_pci_diag_data_set,
+			 "%llu\n");
+
+static int pnv_pci_ioda_pe_dump(void *data, u64 val)
+{
+	struct pnv_phb *phb = data;
+	int pe_num;
+
+	for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
+		struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_num];
+
+		if (!test_bit(pe_num, phb->ioda.pe_alloc))
+			continue;
+
+		pe_warn(pe, "rid: %04x dev count: %2d flags: %s%s%s%s%s%s\n",
+			pe->rid, pe->device_count,
+			(pe->flags & PNV_IODA_PE_DEV) ? "dev " : "",
+			(pe->flags & PNV_IODA_PE_BUS) ? "bus " : "",
+			(pe->flags & PNV_IODA_PE_BUS_ALL) ? "all " : "",
+			(pe->flags & PNV_IODA_PE_MASTER) ? "master " : "",
+			(pe->flags & PNV_IODA_PE_SLAVE) ? "slave " : "",
+			(pe->flags & PNV_IODA_PE_VF) ? "vf " : "");
+	}
+
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_ioda_pe_dump_fops, NULL,
+			 pnv_pci_ioda_pe_dump, "%llu\n");
+
+#endif /* CONFIG_DEBUG_FS */
+
+static void pnv_pci_ioda_create_dbgfs(void)
+{
+#ifdef CONFIG_DEBUG_FS
+	struct pci_controller *hose, *tmp;
+	struct pnv_phb *phb;
+	char name[16];
+
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+		phb = hose->private_data;
+
+		/* Notify initialization of PHB done */
+		phb->initialized = 1;
+
+		sprintf(name, "PCI%04x", hose->global_number);
+		phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
+
+		debugfs_create_file_unsafe("dump_diag_regs", 0200, phb->dbgfs,
+					   phb, &pnv_pci_diag_data_fops);
+		debugfs_create_file_unsafe("dump_ioda_pe_state", 0200, phb->dbgfs,
+					   phb, &pnv_pci_ioda_pe_dump_fops);
+	}
+#endif /* CONFIG_DEBUG_FS */
+}
+
+static void pnv_pci_enable_bridge(struct pci_bus *bus)
+{
+	struct pci_dev *dev = bus->self;
+	struct pci_bus *child;
+
+	/* Empty bus ? bail */
+	if (list_empty(&bus->devices))
+		return;
+
+	/*
+	 * If there's a bridge associated with that bus enable it. This works
+	 * around races in the generic code if the enabling is done during
+	 * parallel probing. This can be removed once those races have been
+	 * fixed.
+	 */
+	if (dev) {
+		int rc = pci_enable_device(dev);
+		if (rc)
+			pci_err(dev, "Error enabling bridge (%d)\n", rc);
+		pci_set_master(dev);
+	}
+
+	/* Perform the same to child busses */
+	list_for_each_entry(child, &bus->children, node)
+		pnv_pci_enable_bridge(child);
+}
+
+static void pnv_pci_enable_bridges(void)
+{
+	struct pci_controller *hose;
+
+	list_for_each_entry(hose, &hose_list, list_node)
+		pnv_pci_enable_bridge(hose->bus);
+}
+
+static void pnv_pci_ioda_fixup(void)
+{
+	pnv_pci_ioda_setup_nvlink();
+	pnv_pci_ioda_create_dbgfs();
+
+	pnv_pci_enable_bridges();
+
+#ifdef CONFIG_EEH
+	pnv_eeh_post_init();
+#endif
+}
+
+/*
+ * Returns the alignment for I/O or memory windows for P2P
+ * bridges. That actually depends on how PEs are segmented.
+ * For now, we return I/O or M32 segment size for PE sensitive
+ * P2P bridges. Otherwise, the default values (4KiB for I/O,
+ * 1MiB for memory) will be returned.
+ *
+ * The current PCI bus might be put into one PE, which was
+ * create against the parent PCI bridge. For that case, we
+ * needn't enlarge the alignment so that we can save some
+ * resources.
+ */
+static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
+						unsigned long type)
+{
+	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
+	int num_pci_bridges = 0;
+	struct pci_dev *bridge;
+
+	bridge = bus->self;
+	while (bridge) {
+		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) {
+			num_pci_bridges++;
+			if (num_pci_bridges >= 2)
+				return 1;
+		}
+
+		bridge = bridge->bus->self;
+	}
+
+	/*
+	 * We fall back to M32 if M64 isn't supported. We enforce the M64
+	 * alignment for any 64-bit resource, PCIe doesn't care and
+	 * bridges only do 64-bit prefetchable anyway.
+	 */
+	if (phb->ioda.m64_segsize && pnv_pci_is_m64_flags(type))
+		return phb->ioda.m64_segsize;
+	if (type & IORESOURCE_MEM)
+		return phb->ioda.m32_segsize;
+
+	return phb->ioda.io_segsize;
+}
+
+/*
+ * We are updating root port or the upstream port of the
+ * bridge behind the root port with PHB's windows in order
+ * to accommodate the changes on required resources during
+ * PCI (slot) hotplug, which is connected to either root
+ * port or the downstream ports of PCIe switch behind the
+ * root port.
+ */
+static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
+					   unsigned long type)
+{
+	struct pci_controller *hose = pci_bus_to_host(bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pci_dev *bridge = bus->self;
+	struct resource *r, *w;
+	bool msi_region = false;
+	int i;
+
+	/* Check if we need apply fixup to the bridge's windows */
+	if (!pci_is_root_bus(bridge->bus) &&
+	    !pci_is_root_bus(bridge->bus->self->bus))
+		return;
+
+	/* Fixup the resources */
+	for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
+		r = &bridge->resource[PCI_BRIDGE_RESOURCES + i];
+		if (!r->flags || !r->parent)
+			continue;
+
+		w = NULL;
+		if (r->flags & type & IORESOURCE_IO)
+			w = &hose->io_resource;
+		else if (pnv_pci_is_m64(phb, r) &&
+			 (type & IORESOURCE_PREFETCH) &&
+			 phb->ioda.m64_segsize)
+			w = &hose->mem_resources[1];
+		else if (r->flags & type & IORESOURCE_MEM) {
+			w = &hose->mem_resources[0];
+			msi_region = true;
+		}
+
+		r->start = w->start;
+		r->end = w->end;
+
+		/* The 64KB 32-bits MSI region shouldn't be included in
+		 * the 32-bits bridge window. Otherwise, we can see strange
+		 * issues. One of them is EEH error observed on Garrison.
+		 *
+		 * Exclude top 1MB region which is the minimal alignment of
+		 * 32-bits bridge window.
+		 */
+		if (msi_region) {
+			r->end += 0x10000;
+			r->end -= 0x100000;
+		}
+	}
+}
+
+static void pnv_pci_configure_bus(struct pci_bus *bus)
+{
+	struct pci_dev *bridge = bus->self;
+	struct pnv_ioda_pe *pe;
+	bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
+
+	dev_info(&bus->dev, "Configuring PE for bus\n");
+
+	/* Don't assign PE to PCI bus, which doesn't have subordinate devices */
+	if (WARN_ON(list_empty(&bus->devices)))
+		return;
+
+	/* Reserve PEs according to used M64 resources */
+	pnv_ioda_reserve_m64_pe(bus, NULL, all);
+
+	/*
+	 * Assign PE. We might run here because of partial hotplug.
+	 * For the case, we just pick up the existing PE and should
+	 * not allocate resources again.
+	 */
+	pe = pnv_ioda_setup_bus_PE(bus, all);
+	if (!pe)
+		return;
+
+	pnv_ioda_setup_pe_seg(pe);
+}
+
+static resource_size_t pnv_pci_default_alignment(void)
+{
+	return PAGE_SIZE;
+}
+
+/* Prevent enabling devices for which we couldn't properly
+ * assign a PE
+ */
+static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
+{
+	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
+	struct pci_dn *pdn;
+
+	/* The function is probably called while the PEs have
+	 * not be created yet. For example, resource reassignment
+	 * during PCI probe period. We just skip the check if
+	 * PEs isn't ready.
+	 */
+	if (!phb->initialized)
+		return true;
+
+	pdn = pci_get_pdn(dev);
+	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+		return false;
+
+	return true;
+}
+
+static bool pnv_ocapi_enable_device_hook(struct pci_dev *dev)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pci_dn *pdn;
+	struct pnv_ioda_pe *pe;
+
+	if (!phb->initialized)
+		return true;
+
+	pdn = pci_get_pdn(dev);
+	if (!pdn)
+		return false;
+
+	if (pdn->pe_number == IODA_INVALID_PE) {
+		pe = pnv_ioda_setup_dev_PE(dev);
+		if (!pe)
+			return false;
+	}
+	return true;
+}
+
+static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group,
+				       int num)
+{
+	struct pnv_ioda_pe *pe = container_of(table_group,
+					      struct pnv_ioda_pe, table_group);
+	struct pnv_phb *phb = pe->phb;
+	unsigned int idx;
+	long rc;
+
+	pe_info(pe, "Removing DMA window #%d\n", num);
+	for (idx = 0; idx < phb->ioda.dma32_count; idx++) {
+		if (phb->ioda.dma32_segmap[idx] != pe->pe_number)
+			continue;
+
+		rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
+						idx, 0, 0ul, 0ul, 0ul);
+		if (rc != OPAL_SUCCESS) {
+			pe_warn(pe, "Failure %ld unmapping DMA32 segment#%d\n",
+				rc, idx);
+			return rc;
+		}
+
+		phb->ioda.dma32_segmap[idx] = IODA_INVALID_PE;
+	}
+
+	pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
+	return OPAL_SUCCESS;
+}
+
+static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
+{
+	struct iommu_table *tbl = pe->table_group.tables[0];
+	int64_t rc;
+
+	if (!pe->dma_setup_done)
+		return;
+
+	rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0);
+	if (rc != OPAL_SUCCESS)
+		return;
+
+	pnv_pci_p7ioc_tce_invalidate(tbl, tbl->it_offset, tbl->it_size, false);
+	if (pe->table_group.group) {
+		iommu_group_put(pe->table_group.group);
+		WARN_ON(pe->table_group.group);
+	}
+
+	free_pages(tbl->it_base, get_order(tbl->it_size << 3));
+	iommu_tce_table_put(tbl);
+}
+
+void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
+{
+	struct iommu_table *tbl = pe->table_group.tables[0];
+	int64_t rc;
+
+	if (!pe->dma_setup_done)
+		return;
+
+	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
+	if (rc)
+		pe_warn(pe, "OPAL error %lld release DMA window\n", rc);
+
+	pnv_pci_ioda2_set_bypass(pe, false);
+	if (pe->table_group.group) {
+		iommu_group_put(pe->table_group.group);
+		WARN_ON(pe->table_group.group);
+	}
+
+	iommu_tce_table_put(tbl);
+}
+
+static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
+				 unsigned short win,
+				 unsigned int *map)
+{
+	struct pnv_phb *phb = pe->phb;
+	int idx;
+	int64_t rc;
+
+	for (idx = 0; idx < phb->ioda.total_pe_num; idx++) {
+		if (map[idx] != pe->pe_number)
+			continue;
+
+		rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+				phb->ioda.reserved_pe_idx, win, 0, idx);
+
+		if (rc != OPAL_SUCCESS)
+			pe_warn(pe, "Error %lld unmapping (%d) segment#%d\n",
+				rc, win, idx);
+
+		map[idx] = IODA_INVALID_PE;
+	}
+}
+
+static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
+{
+	struct pnv_phb *phb = pe->phb;
+
+	if (phb->type == PNV_PHB_IODA1) {
+		pnv_ioda_free_pe_seg(pe, OPAL_IO_WINDOW_TYPE,
+				     phb->ioda.io_segmap);
+		pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
+				     phb->ioda.m32_segmap);
+		/* M64 is pre-configured by pnv_ioda1_init_m64() */
+	} else if (phb->type == PNV_PHB_IODA2) {
+		pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
+				     phb->ioda.m32_segmap);
+	}
+}
+
+static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
+{
+	struct pnv_phb *phb = pe->phb;
+	struct pnv_ioda_pe *slave, *tmp;
+
+	pe_info(pe, "Releasing PE\n");
+
+	mutex_lock(&phb->ioda.pe_list_mutex);
+	list_del(&pe->list);
+	mutex_unlock(&phb->ioda.pe_list_mutex);
+
+	switch (phb->type) {
+	case PNV_PHB_IODA1:
+		pnv_pci_ioda1_release_pe_dma(pe);
+		break;
+	case PNV_PHB_IODA2:
+		pnv_pci_ioda2_release_pe_dma(pe);
+		break;
+	case PNV_PHB_NPU_OCAPI:
+		break;
+	default:
+		WARN_ON(1);
+	}
+
+	pnv_ioda_release_pe_seg(pe);
+	pnv_ioda_deconfigure_pe(pe->phb, pe);
+
+	/* Release slave PEs in the compound PE */
+	if (pe->flags & PNV_IODA_PE_MASTER) {
+		list_for_each_entry_safe(slave, tmp, &pe->slaves, list) {
+			list_del(&slave->list);
+			pnv_ioda_free_pe(slave);
+		}
+	}
+
+	/*
+	 * The PE for root bus can be removed because of hotplug in EEH
+	 * recovery for fenced PHB error. We need to mark the PE dead so
+	 * that it can be populated again in PCI hot add path. The PE
+	 * shouldn't be destroyed as it's the global reserved resource.
+	 */
+	if (phb->ioda.root_pe_idx == pe->pe_number)
+		return;
+
+	pnv_ioda_free_pe(pe);
+}
+
+static void pnv_pci_release_device(struct pci_dev *pdev)
+{
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
+	struct pci_dn *pdn = pci_get_pdn(pdev);
+	struct pnv_ioda_pe *pe;
+
+	/* The VF PE state is torn down when sriov_disable() is called */
+	if (pdev->is_virtfn)
+		return;
+
+	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+		return;
+
+#ifdef CONFIG_PCI_IOV
+	/*
+	 * FIXME: Try move this to sriov_disable(). It's here since we allocate
+	 * the iov state at probe time since we need to fiddle with the IOV
+	 * resources.
+	 */
+	if (pdev->is_physfn)
+		kfree(pdev->dev.archdata.iov_data);
+#endif
+
+	/*
+	 * PCI hotplug can happen as part of EEH error recovery. The @pdn
+	 * isn't removed and added afterwards in this scenario. We should
+	 * set the PE number in @pdn to an invalid one. Otherwise, the PE's
+	 * device count is decreased on removing devices while failing to
+	 * be increased on adding devices. It leads to unbalanced PE's device
+	 * count and eventually make normal PCI hotplug path broken.
+	 */
+	pe = &phb->ioda.pe_array[pdn->pe_number];
+	pdn->pe_number = IODA_INVALID_PE;
+
+	WARN_ON(--pe->device_count < 0);
+	if (pe->device_count == 0)
+		pnv_ioda_release_pe(pe);
+}
+
+static void pnv_npu_disable_device(struct pci_dev *pdev)
+{
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
+	struct eeh_pe *eehpe = edev ? edev->pe : NULL;
+
+	if (eehpe && eeh_ops && eeh_ops->reset)
+		eeh_ops->reset(eehpe, EEH_RESET_HOT);
+}
+
+static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
+{
+	struct pnv_phb *phb = hose->private_data;
+
+	opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE,
+		       OPAL_ASSERT_RESET);
+}
+
+static void pnv_pci_ioda_dma_bus_setup(struct pci_bus *bus)
+{
+	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
+	struct pnv_ioda_pe *pe;
+
+	list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+		if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)))
+			continue;
+
+		if (!pe->pbus)
+			continue;
+
+		if (bus->number == ((pe->rid >> 8) & 0xFF)) {
+			pe->pbus = bus;
+			break;
+		}
+	}
+}
+
+static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
+	.dma_dev_setup		= pnv_pci_ioda_dma_dev_setup,
+	.dma_bus_setup		= pnv_pci_ioda_dma_bus_setup,
+	.iommu_bypass_supported	= pnv_pci_ioda_iommu_bypass_supported,
+	.setup_msi_irqs		= pnv_setup_msi_irqs,
+	.teardown_msi_irqs	= pnv_teardown_msi_irqs,
+	.enable_device_hook	= pnv_pci_enable_device_hook,
+	.release_device		= pnv_pci_release_device,
+	.window_alignment	= pnv_pci_window_alignment,
+	.setup_bridge		= pnv_pci_fixup_bridge_resources,
+	.reset_secondary_bus	= pnv_pci_reset_secondary_bus,
+	.shutdown		= pnv_pci_ioda_shutdown,
+};
+
+static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
+	.setup_msi_irqs		= pnv_setup_msi_irqs,
+	.teardown_msi_irqs	= pnv_teardown_msi_irqs,
+	.enable_device_hook	= pnv_pci_enable_device_hook,
+	.window_alignment	= pnv_pci_window_alignment,
+	.reset_secondary_bus	= pnv_pci_reset_secondary_bus,
+	.shutdown		= pnv_pci_ioda_shutdown,
+	.disable_device		= pnv_npu_disable_device,
+};
+
+static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
+	.enable_device_hook	= pnv_ocapi_enable_device_hook,
+	.release_device		= pnv_pci_release_device,
+	.window_alignment	= pnv_pci_window_alignment,
+	.reset_secondary_bus	= pnv_pci_reset_secondary_bus,
+	.shutdown		= pnv_pci_ioda_shutdown,
+};
+
+static void __init pnv_pci_init_ioda_phb(struct device_node *np,
+					 u64 hub_id, int ioda_type)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	unsigned long size, m64map_off, m32map_off, pemap_off;
+	unsigned long iomap_off = 0, dma32map_off = 0;
+	struct pnv_ioda_pe *root_pe;
+	struct resource r;
+	const __be64 *prop64;
+	const __be32 *prop32;
+	int len;
+	unsigned int segno;
+	u64 phb_id;
+	void *aux;
+	long rc;
+
+	if (!of_device_is_available(np))
+		return;
+
+	pr_info("Initializing %s PHB (%pOF)\n",	pnv_phb_names[ioda_type], np);
+
+	prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
+	if (!prop64) {
+		pr_err("  Missing \"ibm,opal-phbid\" property !\n");
+		return;
+	}
+	phb_id = be64_to_cpup(prop64);
+	pr_debug("  PHB-ID  : 0x%016llx\n", phb_id);
+
+	phb = memblock_alloc(sizeof(*phb), SMP_CACHE_BYTES);
+	if (!phb)
+		panic("%s: Failed to allocate %zu bytes\n", __func__,
+		      sizeof(*phb));
+
+	/* Allocate PCI controller */
+	phb->hose = hose = pcibios_alloc_controller(np);
+	if (!phb->hose) {
+		pr_err("  Can't allocate PCI controller for %pOF\n",
+		       np);
+		memblock_free(__pa(phb), sizeof(struct pnv_phb));
+		return;
+	}
+
+	spin_lock_init(&phb->lock);
+	prop32 = of_get_property(np, "bus-range", &len);
+	if (prop32 && len == 8) {
+		hose->first_busno = be32_to_cpu(prop32[0]);
+		hose->last_busno = be32_to_cpu(prop32[1]);
+	} else {
+		pr_warn("  Broken <bus-range> on %pOF\n", np);
+		hose->first_busno = 0;
+		hose->last_busno = 0xff;
+	}
+	hose->private_data = phb;
+	phb->hub_id = hub_id;
+	phb->opal_id = phb_id;
+	phb->type = ioda_type;
+	mutex_init(&phb->ioda.pe_alloc_mutex);
+
+	/* Detect specific models for error handling */
+	if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
+		phb->model = PNV_PHB_MODEL_P7IOC;
+	else if (of_device_is_compatible(np, "ibm,power8-pciex"))
+		phb->model = PNV_PHB_MODEL_PHB3;
+	else if (of_device_is_compatible(np, "ibm,power8-npu-pciex"))
+		phb->model = PNV_PHB_MODEL_NPU;
+	else if (of_device_is_compatible(np, "ibm,power9-npu-pciex"))
+		phb->model = PNV_PHB_MODEL_NPU2;
+	else
+		phb->model = PNV_PHB_MODEL_UNKNOWN;
+
+	/* Initialize diagnostic data buffer */
+	prop32 = of_get_property(np, "ibm,phb-diag-data-size", NULL);
+	if (prop32)
+		phb->diag_data_size = be32_to_cpup(prop32);
+	else
+		phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE;
+
+	phb->diag_data = memblock_alloc(phb->diag_data_size, SMP_CACHE_BYTES);
+	if (!phb->diag_data)
+		panic("%s: Failed to allocate %u bytes\n", __func__,
+		      phb->diag_data_size);
+
+	/* Parse 32-bit and IO ranges (if any) */
+	pci_process_bridge_OF_ranges(hose, np, !hose->global_number);
+
+	/* Get registers */
+	if (!of_address_to_resource(np, 0, &r)) {
+		phb->regs_phys = r.start;
+		phb->regs = ioremap(r.start, resource_size(&r));
+		if (phb->regs == NULL)
+			pr_err("  Failed to map registers !\n");
+	}
+
+	/* Initialize more IODA stuff */
+	phb->ioda.total_pe_num = 1;
+	prop32 = of_get_property(np, "ibm,opal-num-pes", NULL);
+	if (prop32)
+		phb->ioda.total_pe_num = be32_to_cpup(prop32);
+	prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL);
+	if (prop32)
+		phb->ioda.reserved_pe_idx = be32_to_cpup(prop32);
+
+	/* Invalidate RID to PE# mapping */
+	for (segno = 0; segno < ARRAY_SIZE(phb->ioda.pe_rmap); segno++)
+		phb->ioda.pe_rmap[segno] = IODA_INVALID_PE;
+
+	/* Parse 64-bit MMIO range */
+	pnv_ioda_parse_m64_window(phb);
+
+	phb->ioda.m32_size = resource_size(&hose->mem_resources[0]);
+	/* FW Has already off top 64k of M32 space (MSI space) */
+	phb->ioda.m32_size += 0x10000;
+
+	phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe_num;
+	phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
+	phb->ioda.io_size = hose->pci_io_size;
+	phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num;
+	phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
+
+	/* Calculate how many 32-bit TCE segments we have */
+	phb->ioda.dma32_count = phb->ioda.m32_pci_base /
+				PNV_IODA1_DMA32_SEGSIZE;
+
+	/* Allocate aux data & arrays. We don't have IO ports on PHB3 */
+	size = ALIGN(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
+			sizeof(unsigned long));
+	m64map_off = size;
+	size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]);
+	m32map_off = size;
+	size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]);
+	if (phb->type == PNV_PHB_IODA1) {
+		iomap_off = size;
+		size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]);
+		dma32map_off = size;
+		size += phb->ioda.dma32_count *
+			sizeof(phb->ioda.dma32_segmap[0]);
+	}
+	pemap_off = size;
+	size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe);
+	aux = memblock_alloc(size, SMP_CACHE_BYTES);
+	if (!aux)
+		panic("%s: Failed to allocate %lu bytes\n", __func__, size);
+	phb->ioda.pe_alloc = aux;
+	phb->ioda.m64_segmap = aux + m64map_off;
+	phb->ioda.m32_segmap = aux + m32map_off;
+	for (segno = 0; segno < phb->ioda.total_pe_num; segno++) {
+		phb->ioda.m64_segmap[segno] = IODA_INVALID_PE;
+		phb->ioda.m32_segmap[segno] = IODA_INVALID_PE;
+	}
+	if (phb->type == PNV_PHB_IODA1) {
+		phb->ioda.io_segmap = aux + iomap_off;
+		for (segno = 0; segno < phb->ioda.total_pe_num; segno++)
+			phb->ioda.io_segmap[segno] = IODA_INVALID_PE;
+
+		phb->ioda.dma32_segmap = aux + dma32map_off;
+		for (segno = 0; segno < phb->ioda.dma32_count; segno++)
+			phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE;
+	}
+	phb->ioda.pe_array = aux + pemap_off;
+
+	/*
+	 * Choose PE number for root bus, which shouldn't have
+	 * M64 resources consumed by its child devices. To pick
+	 * the PE number adjacent to the reserved one if possible.
+	 */
+	pnv_ioda_reserve_pe(phb, phb->ioda.reserved_pe_idx);
+	if (phb->ioda.reserved_pe_idx == 0) {
+		phb->ioda.root_pe_idx = 1;
+		pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
+	} else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) {
+		phb->ioda.root_pe_idx = phb->ioda.reserved_pe_idx - 1;
+		pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
+	} else {
+		/* otherwise just allocate one */
+		root_pe = pnv_ioda_alloc_pe(phb, 1);
+		phb->ioda.root_pe_idx = root_pe->pe_number;
+	}
+
+	INIT_LIST_HEAD(&phb->ioda.pe_list);
+	mutex_init(&phb->ioda.pe_list_mutex);
+
+	/* Calculate how many 32-bit TCE segments we have */
+	phb->ioda.dma32_count = phb->ioda.m32_pci_base /
+				PNV_IODA1_DMA32_SEGSIZE;
+
+#if 0 /* We should really do that ... */
+	rc = opal_pci_set_phb_mem_window(opal->phb_id,
+					 window_type,
+					 window_num,
+					 starting_real_address,
+					 starting_pci_address,
+					 segment_size);
+#endif
+
+	pr_info("  %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n",
+		phb->ioda.total_pe_num, phb->ioda.reserved_pe_idx,
+		phb->ioda.m32_size, phb->ioda.m32_segsize);
+	if (phb->ioda.m64_size)
+		pr_info("                 M64: 0x%lx [segment=0x%lx]\n",
+			phb->ioda.m64_size, phb->ioda.m64_segsize);
+	if (phb->ioda.io_size)
+		pr_info("                  IO: 0x%x [segment=0x%x]\n",
+			phb->ioda.io_size, phb->ioda.io_segsize);
+
+
+	phb->hose->ops = &pnv_pci_ops;
+	phb->get_pe_state = pnv_ioda_get_pe_state;
+	phb->freeze_pe = pnv_ioda_freeze_pe;
+	phb->unfreeze_pe = pnv_ioda_unfreeze_pe;
+
+	/* Setup MSI support */
+	pnv_pci_init_ioda_msis(phb);
+
+	/*
+	 * We pass the PCI probe flag PCI_REASSIGN_ALL_RSRC here
+	 * to let the PCI core do resource assignment. It's supposed
+	 * that the PCI core will do correct I/O and MMIO alignment
+	 * for the P2P bridge bars so that each PCI bus (excluding
+	 * the child P2P bridges) can form individual PE.
+	 */
+	ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
+
+	switch (phb->type) {
+	case PNV_PHB_NPU_NVLINK:
+		hose->controller_ops = pnv_npu_ioda_controller_ops;
+		break;
+	case PNV_PHB_NPU_OCAPI:
+		hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops;
+		break;
+	default:
+		hose->controller_ops = pnv_pci_ioda_controller_ops;
+	}
+
+	ppc_md.pcibios_default_alignment = pnv_pci_default_alignment;
+
+#ifdef CONFIG_PCI_IOV
+	ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov;
+	ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
+	ppc_md.pcibios_sriov_enable = pnv_pcibios_sriov_enable;
+	ppc_md.pcibios_sriov_disable = pnv_pcibios_sriov_disable;
+#endif
+
+	pci_add_flags(PCI_REASSIGN_ALL_RSRC);
+
+	/* Reset IODA tables to a clean state */
+	rc = opal_pci_reset(phb_id, OPAL_RESET_PCI_IODA_TABLE, OPAL_ASSERT_RESET);
+	if (rc)
+		pr_warn("  OPAL Error %ld performing IODA table reset !\n", rc);
+
+	/*
+	 * If we're running in kdump kernel, the previous kernel never
+	 * shutdown PCI devices correctly. We already got IODA table
+	 * cleaned out. So we have to issue PHB reset to stop all PCI
+	 * transactions from previous kernel. The ppc_pci_reset_phbs
+	 * kernel parameter will force this reset too. Additionally,
+	 * if the IODA reset above failed then use a bigger hammer.
+	 * This can happen if we get a PHB fatal error in very early
+	 * boot.
+	 */
+	if (is_kdump_kernel() || pci_reset_phbs || rc) {
+		pr_info("  Issue PHB reset ...\n");
+		pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
+		pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE);
+	}
+
+	/* Remove M64 resource if we can't configure it successfully */
+	if (!phb->init_m64 || phb->init_m64(phb))
+		hose->mem_resources[1].flags = 0;
+}
+
+void __init pnv_pci_init_ioda2_phb(struct device_node *np)
+{
+	pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
+}
+
+void __init pnv_pci_init_npu_phb(struct device_node *np)
+{
+	pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_NVLINK);
+}
+
+void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np)
+{
+	pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_OCAPI);
+}
+
+static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev)
+{
+	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
+
+	if (!machine_is(powernv))
+		return;
+
+	if (phb->type == PNV_PHB_NPU_OCAPI)
+		dev->cfg_size = PCI_CFG_SPACE_EXP_SIZE;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pnv_npu2_opencapi_cfg_size_fixup);
+
+void __init pnv_pci_init_ioda_hub(struct device_node *np)
+{
+	struct device_node *phbn;
+	const __be64 *prop64;
+	u64 hub_id;
+
+	pr_info("Probing IODA IO-Hub %pOF\n", np);
+
+	prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
+	if (!prop64) {
+		pr_err(" Missing \"ibm,opal-hubid\" property !\n");
+		return;
+	}
+	hub_id = be64_to_cpup(prop64);
+	pr_devel(" HUB-ID : 0x%016llx\n", hub_id);
+
+	/* Count child PHBs */
+	for_each_child_of_node(np, phbn) {
+		/* Look for IODA1 PHBs */
+		if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
+			pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
+	}
+}
diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
new file mode 100644
index 000000000..e3e52ff2c
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -0,0 +1,766 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/kernel.h>
+#include <linux/ioport.h>
+#include <linux/bitmap.h>
+#include <linux/pci.h>
+
+#include <asm/opal.h>
+
+#include "pci.h"
+
+/* for pci_dev_is_added() */
+#include "../../../../drivers/pci/pci.h"
+
+/*
+ * The majority of the complexity in supporting SR-IOV on PowerNV comes from
+ * the need to put the MMIO space for each VF into a separate PE. Internally
+ * the PHB maps MMIO addresses to a specific PE using the "Memory BAR Table".
+ * The MBT historically only applied to the 64bit MMIO window of the PHB
+ * so it's common to see it referred to as the "M64BT".
+ *
+ * An MBT entry stores the mapped range as an <base>,<mask> pair. This forces
+ * the address range that we want to map to be power-of-two sized and aligned.
+ * For conventional PCI devices this isn't really an issue since PCI device BARs
+ * have the same requirement.
+ *
+ * For a SR-IOV BAR things are a little more awkward since size and alignment
+ * are not coupled. The alignment is set based on the the per-VF BAR size, but
+ * the total BAR area is: number-of-vfs * per-vf-size. The number of VFs
+ * isn't necessarily a power of two, so neither is the total size. To fix that
+ * we need to finesse (read: hack) the Linux BAR allocator so that it will
+ * allocate the SR-IOV BARs in a way that lets us map them using the MBT.
+ *
+ * The changes to size and alignment that we need to do depend on the "mode"
+ * of MBT entry that we use. We only support SR-IOV on PHB3 (IODA2) and above,
+ * so as a baseline we can assume that we have the following BAR modes
+ * available:
+ *
+ *   NB: $PE_COUNT is the number of PEs that the PHB supports.
+ *
+ * a) A segmented BAR that splits the mapped range into $PE_COUNT equally sized
+ *    segments. The n'th segment is mapped to the n'th PE.
+ * b) An un-segmented BAR that maps the whole address range to a specific PE.
+ *
+ *
+ * We prefer to use mode a) since it only requires one MBT entry per SR-IOV BAR
+ * For comparison b) requires one entry per-VF per-BAR, or:
+ * (num-vfs * num-sriov-bars) in total. To use a) we need the size of each segment
+ * to equal the size of the per-VF BAR area. So:
+ *
+ *	new_size = per-vf-size * number-of-PEs
+ *
+ * The alignment for the SR-IOV BAR also needs to be changed from per-vf-size
+ * to "new_size", calculated above. Implementing this is a convoluted process
+ * which requires several hooks in the PCI core:
+ *
+ * 1. In pcibios_add_device() we call pnv_pci_ioda_fixup_iov().
+ *
+ *    At this point the device has been probed and the device's BARs are sized,
+ *    but no resource allocations have been done. The SR-IOV BARs are sized
+ *    based on the maximum number of VFs supported by the device and we need
+ *    to increase that to new_size.
+ *
+ * 2. Later, when Linux actually assigns resources it tries to make the resource
+ *    allocations for each PCI bus as compact as possible. As a part of that it
+ *    sorts the BARs on a bus by their required alignment, which is calculated
+ *    using pci_resource_alignment().
+ *
+ *    For IOV resources this goes:
+ *    pci_resource_alignment()
+ *        pci_sriov_resource_alignment()
+ *            pcibios_sriov_resource_alignment()
+ *                pnv_pci_iov_resource_alignment()
+ *
+ *    Our hook overrides the default alignment, equal to the per-vf-size, with
+ *    new_size computed above.
+ *
+ * 3. When userspace enables VFs for a device:
+ *
+ *    sriov_enable()
+ *       pcibios_sriov_enable()
+ *           pnv_pcibios_sriov_enable()
+ *
+ *    This is where we actually allocate PE numbers for each VF and setup the
+ *    MBT mapping for each SR-IOV BAR. In steps 1) and 2) we setup an "arena"
+ *    where each MBT segment is equal in size to the VF BAR so we can shift
+ *    around the actual SR-IOV BAR location within this arena. We need this
+ *    ability because the PE space is shared by all devices on the same PHB.
+ *    When using mode a) described above segment 0 in maps to PE#0 which might
+ *    be already being used by another device on the PHB.
+ *
+ *    As a result we need allocate a contigious range of PE numbers, then shift
+ *    the address programmed into the SR-IOV BAR of the PF so that the address
+ *    of VF0 matches up with the segment corresponding to the first allocated
+ *    PE number. This is handled in pnv_pci_vf_resource_shift().
+ *
+ *    Once all that is done we return to the PCI core which then enables VFs,
+ *    scans them and creates pci_devs for each. The init process for a VF is
+ *    largely the same as a normal device, but the VF is inserted into the IODA
+ *    PE that we allocated for it rather than the PE associated with the bus.
+ *
+ * 4. When userspace disables VFs we unwind the above in
+ *    pnv_pcibios_sriov_disable(). Fortunately this is relatively simple since
+ *    we don't need to validate anything, just tear down the mappings and
+ *    move SR-IOV resource back to its "proper" location.
+ *
+ * That's how mode a) works. In theory mode b) (single PE mapping) is less work
+ * since we can map each individual VF with a separate BAR. However, there's a
+ * few limitations:
+ *
+ * 1) For IODA2 mode b) has a minimum alignment requirement of 32MB. This makes
+ *    it only usable for devices with very large per-VF BARs. Such devices are
+ *    similar to Big Foot. They definitely exist, but I've never seen one.
+ *
+ * 2) The number of MBT entries that we have is limited. PHB3 and PHB4 only
+ *    16 total and some are needed for. Most SR-IOV capable network cards can support
+ *    more than 16 VFs on each port.
+ *
+ * We use b) when using a) would use more than 1/4 of the entire 64 bit MMIO
+ * window of the PHB.
+ *
+ *
+ *
+ * PHB4 (IODA3) added a few new features that would be useful for SR-IOV. It
+ * allowed the MBT to map 32bit MMIO space in addition to 64bit which allows
+ * us to support SR-IOV BARs in the 32bit MMIO window. This is useful since
+ * the Linux BAR allocation will place any BAR marked as non-prefetchable into
+ * the non-prefetchable bridge window, which is 32bit only. It also added two
+ * new modes:
+ *
+ * c) A segmented BAR similar to a), but each segment can be individually
+ *    mapped to any PE. This is matches how the 32bit MMIO window worked on
+ *    IODA1&2.
+ *
+ * d) A segmented BAR with 8, 64, or 128 segments. This works similarly to a),
+ *    but with fewer segments and configurable base PE.
+ *
+ *    i.e. The n'th segment maps to the (n + base)'th PE.
+ *
+ *    The base PE is also required to be a multiple of the window size.
+ *
+ * Unfortunately, the OPAL API doesn't currently (as of skiboot v6.6) allow us
+ * to exploit any of the IODA3 features.
+ */
+
+static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
+{
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
+	struct resource *res;
+	int i;
+	resource_size_t vf_bar_sz;
+	struct pnv_iov_data *iov;
+	int mul;
+
+	iov = kzalloc(sizeof(*iov), GFP_KERNEL);
+	if (!iov)
+		goto disable_iov;
+	pdev->dev.archdata.iov_data = iov;
+	mul = phb->ioda.total_pe_num;
+
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &pdev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || res->parent)
+			continue;
+		if (!pnv_pci_is_m64_flags(res->flags)) {
+			dev_warn(&pdev->dev, "Don't support SR-IOV with non M64 VF BAR%d: %pR. \n",
+				 i, res);
+			goto disable_iov;
+		}
+
+		vf_bar_sz = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
+
+		/*
+		 * Generally, one segmented M64 BAR maps one IOV BAR. However,
+		 * if a VF BAR is too large we end up wasting a lot of space.
+		 * If each VF needs more than 1/4 of the default m64 segment
+		 * then each VF BAR should be mapped in single-PE mode to reduce
+		 * the amount of space required. This does however limit the
+		 * number of VFs we can support.
+		 *
+		 * The 1/4 limit is arbitrary and can be tweaked.
+		 */
+		if (vf_bar_sz > (phb->ioda.m64_segsize >> 2)) {
+			/*
+			 * On PHB3, the minimum size alignment of M64 BAR in
+			 * single mode is 32MB. If this VF BAR is smaller than
+			 * 32MB, but still too large for a segmented window
+			 * then we can't map it and need to disable SR-IOV for
+			 * this device.
+			 */
+			if (vf_bar_sz < SZ_32M) {
+				pci_err(pdev, "VF BAR%d: %pR can't be mapped in single PE mode\n",
+					i, res);
+				goto disable_iov;
+			}
+
+			iov->m64_single_mode[i] = true;
+			continue;
+		}
+
+		/*
+		 * This BAR can be mapped with one segmented window, so adjust
+		 * te resource size to accommodate.
+		 */
+		pci_dbg(pdev, " Fixing VF BAR%d: %pR to\n", i, res);
+		res->end = res->start + vf_bar_sz * mul - 1;
+		pci_dbg(pdev, "                       %pR\n", res);
+
+		pci_info(pdev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
+			 i, res, mul);
+
+		iov->need_shift = true;
+	}
+
+	return;
+
+disable_iov:
+	/* Save ourselves some MMIO space by disabling the unusable BARs */
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &pdev->resource[i + PCI_IOV_RESOURCES];
+		res->flags = 0;
+		res->end = res->start - 1;
+	}
+
+	pdev->dev.archdata.iov_data = NULL;
+	kfree(iov);
+}
+
+void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev)
+{
+	if (WARN_ON(pci_dev_is_added(pdev)))
+		return;
+
+	if (pdev->is_virtfn) {
+		struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev);
+
+		/*
+		 * VF PEs are single-device PEs so their pdev pointer needs to
+		 * be set. The pdev doesn't exist when the PE is allocated (in
+		 * (pcibios_sriov_enable()) so we fix it up here.
+		 */
+		pe->pdev = pdev;
+		WARN_ON(!(pe->flags & PNV_IODA_PE_VF));
+	} else if (pdev->is_physfn) {
+		/*
+		 * For PFs adjust their allocated IOV resources to match what
+		 * the PHB can support using it's M64 BAR table.
+		 */
+		pnv_pci_ioda_fixup_iov_resources(pdev);
+	}
+}
+
+resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
+						      int resno)
+{
+	resource_size_t align = pci_iov_resource_size(pdev, resno);
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
+	struct pnv_iov_data *iov = pnv_iov_get(pdev);
+
+	/*
+	 * iov can be null if we have an SR-IOV device with IOV BAR that can't
+	 * be placed in the m64 space (i.e. The BAR is 32bit or non-prefetch).
+	 * In that case we don't allow VFs to be enabled since one of their
+	 * BARs would not be placed in the correct PE.
+	 */
+	if (!iov)
+		return align;
+
+	/*
+	 * If we're using single mode then we can just use the native VF BAR
+	 * alignment. We validated that it's possible to use a single PE
+	 * window above when we did the fixup.
+	 */
+	if (iov->m64_single_mode[resno - PCI_IOV_RESOURCES])
+		return align;
+
+	/*
+	 * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
+	 * SR-IOV. While from hardware perspective, the range mapped by M64
+	 * BAR should be size aligned.
+	 *
+	 * This function returns the total IOV BAR size if M64 BAR is in
+	 * Shared PE mode or just VF BAR size if not.
+	 * If the M64 BAR is in Single PE mode, return the VF BAR size or
+	 * M64 segment size if IOV BAR size is less.
+	 */
+	return phb->ioda.total_pe_num * align;
+}
+
+static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
+{
+	struct pnv_iov_data   *iov;
+	struct pnv_phb        *phb;
+	int window_id;
+
+	phb = pci_bus_to_pnvhb(pdev->bus);
+	iov = pnv_iov_get(pdev);
+
+	for_each_set_bit(window_id, iov->used_m64_bar_mask, MAX_M64_BARS) {
+		opal_pci_phb_mmio_enable(phb->opal_id,
+					 OPAL_M64_WINDOW_TYPE,
+					 window_id,
+					 0);
+
+		clear_bit(window_id, &phb->ioda.m64_bar_alloc);
+	}
+
+	return 0;
+}
+
+
+/*
+ * PHB3 and beyond support segmented windows. The window's address range
+ * is subdivided into phb->ioda.total_pe_num segments and there's a 1-1
+ * mapping between PEs and segments.
+ */
+static int64_t pnv_ioda_map_m64_segmented(struct pnv_phb *phb,
+					  int window_id,
+					  resource_size_t start,
+					  resource_size_t size)
+{
+	int64_t rc;
+
+	rc = opal_pci_set_phb_mem_window(phb->opal_id,
+					 OPAL_M64_WINDOW_TYPE,
+					 window_id,
+					 start,
+					 0, /* unused */
+					 size);
+	if (rc)
+		goto out;
+
+	rc = opal_pci_phb_mmio_enable(phb->opal_id,
+				      OPAL_M64_WINDOW_TYPE,
+				      window_id,
+				      OPAL_ENABLE_M64_SPLIT);
+out:
+	if (rc)
+		pr_err("Failed to map M64 window #%d: %lld\n", window_id, rc);
+
+	return rc;
+}
+
+static int64_t pnv_ioda_map_m64_single(struct pnv_phb *phb,
+				       int pe_num,
+				       int window_id,
+				       resource_size_t start,
+				       resource_size_t size)
+{
+	int64_t rc;
+
+	/*
+	 * The API for setting up m64 mmio windows seems to have been designed
+	 * with P7-IOC in mind. For that chip each M64 BAR (window) had a fixed
+	 * split of 8 equally sized segments each of which could individually
+	 * assigned to a PE.
+	 *
+	 * The problem with this is that the API doesn't have any way to
+	 * communicate the number of segments we want on a BAR. This wasn't
+	 * a problem for p7-ioc since you didn't have a choice, but the
+	 * single PE windows added in PHB3 don't map cleanly to this API.
+	 *
+	 * As a result we've got this slightly awkward process where we
+	 * call opal_pci_map_pe_mmio_window() to put the single in single
+	 * PE mode, and set the PE for the window before setting the address
+	 * bounds. We need to do it this way because the single PE windows
+	 * for PHB3 have different alignment requirements on PHB3.
+	 */
+	rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+					 pe_num,
+					 OPAL_M64_WINDOW_TYPE,
+					 window_id,
+					 0);
+	if (rc)
+		goto out;
+
+	/*
+	 * NB: In single PE mode the window needs to be aligned to 32MB
+	 */
+	rc = opal_pci_set_phb_mem_window(phb->opal_id,
+					 OPAL_M64_WINDOW_TYPE,
+					 window_id,
+					 start,
+					 0, /* ignored by FW, m64 is 1-1 */
+					 size);
+	if (rc)
+		goto out;
+
+	/*
+	 * Now actually enable it. We specified the BAR should be in "non-split"
+	 * mode so FW will validate that the BAR is in single PE mode.
+	 */
+	rc = opal_pci_phb_mmio_enable(phb->opal_id,
+				      OPAL_M64_WINDOW_TYPE,
+				      window_id,
+				      OPAL_ENABLE_M64_NON_SPLIT);
+out:
+	if (rc)
+		pr_err("Error mapping single PE BAR\n");
+
+	return rc;
+}
+
+static int pnv_pci_alloc_m64_bar(struct pnv_phb *phb, struct pnv_iov_data *iov)
+{
+	int win;
+
+	do {
+		win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
+				phb->ioda.m64_bar_idx + 1, 0);
+
+		if (win >= phb->ioda.m64_bar_idx + 1)
+			return -1;
+	} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
+
+	set_bit(win, iov->used_m64_bar_mask);
+
+	return win;
+}
+
+static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
+{
+	struct pnv_iov_data   *iov;
+	struct pnv_phb        *phb;
+	int                    win;
+	struct resource       *res;
+	int                    i, j;
+	int64_t                rc;
+	resource_size_t        size, start;
+	int                    base_pe_num;
+
+	phb = pci_bus_to_pnvhb(pdev->bus);
+	iov = pnv_iov_get(pdev);
+
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &pdev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || !res->parent)
+			continue;
+
+		/* don't need single mode? map everything in one go! */
+		if (!iov->m64_single_mode[i]) {
+			win = pnv_pci_alloc_m64_bar(phb, iov);
+			if (win < 0)
+				goto m64_failed;
+
+			size = resource_size(res);
+			start = res->start;
+
+			rc = pnv_ioda_map_m64_segmented(phb, win, start, size);
+			if (rc)
+				goto m64_failed;
+
+			continue;
+		}
+
+		/* otherwise map each VF with single PE BARs */
+		size = pci_iov_resource_size(pdev, PCI_IOV_RESOURCES + i);
+		base_pe_num = iov->vf_pe_arr[0].pe_number;
+
+		for (j = 0; j < num_vfs; j++) {
+			win = pnv_pci_alloc_m64_bar(phb, iov);
+			if (win < 0)
+				goto m64_failed;
+
+			start = res->start + size * j;
+			rc = pnv_ioda_map_m64_single(phb, win,
+						     base_pe_num + j,
+						     start,
+						     size);
+			if (rc)
+				goto m64_failed;
+		}
+	}
+	return 0;
+
+m64_failed:
+	pnv_pci_vf_release_m64(pdev, num_vfs);
+	return -EBUSY;
+}
+
+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
+{
+	struct pnv_phb        *phb;
+	struct pnv_ioda_pe    *pe, *pe_n;
+
+	phb = pci_bus_to_pnvhb(pdev->bus);
+
+	if (!pdev->is_physfn)
+		return;
+
+	/* FIXME: Use pnv_ioda_release_pe()? */
+	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
+		if (pe->parent_dev != pdev)
+			continue;
+
+		pnv_pci_ioda2_release_pe_dma(pe);
+
+		/* Remove from list */
+		mutex_lock(&phb->ioda.pe_list_mutex);
+		list_del(&pe->list);
+		mutex_unlock(&phb->ioda.pe_list_mutex);
+
+		pnv_ioda_deconfigure_pe(phb, pe);
+
+		pnv_ioda_free_pe(pe);
+	}
+}
+
+static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
+{
+	struct resource *res, res2;
+	struct pnv_iov_data *iov;
+	resource_size_t size;
+	u16 num_vfs;
+	int i;
+
+	if (!dev->is_physfn)
+		return -EINVAL;
+	iov = pnv_iov_get(dev);
+
+	/*
+	 * "offset" is in VFs.  The M64 windows are sized so that when they
+	 * are segmented, each segment is the same size as the IOV BAR.
+	 * Each segment is in a separate PE, and the high order bits of the
+	 * address are the PE number.  Therefore, each VF's BAR is in a
+	 * separate PE, and changing the IOV BAR start address changes the
+	 * range of PEs the VFs are in.
+	 */
+	num_vfs = iov->num_vfs;
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &dev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || !res->parent)
+			continue;
+		if (iov->m64_single_mode[i])
+			continue;
+
+		/*
+		 * The actual IOV BAR range is determined by the start address
+		 * and the actual size for num_vfs VFs BAR.  This check is to
+		 * make sure that after shifting, the range will not overlap
+		 * with another device.
+		 */
+		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+		res2.flags = res->flags;
+		res2.start = res->start + (size * offset);
+		res2.end = res2.start + (size * num_vfs) - 1;
+
+		if (res2.end > res->end) {
+			dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
+				i, &res2, res, num_vfs, offset);
+			return -EBUSY;
+		}
+	}
+
+	/*
+	 * Since M64 BAR shares segments among all possible 256 PEs,
+	 * we have to shift the beginning of PF IOV BAR to make it start from
+	 * the segment which belongs to the PE number assigned to the first VF.
+	 * This creates a "hole" in the /proc/iomem which could be used for
+	 * allocating other resources so we reserve this area below and
+	 * release when IOV is released.
+	 */
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &dev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || !res->parent)
+			continue;
+		if (iov->m64_single_mode[i])
+			continue;
+
+		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+		res2 = *res;
+		res->start += size * offset;
+
+		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
+			 i, &res2, res, (offset > 0) ? "En" : "Dis",
+			 num_vfs, offset);
+
+		if (offset < 0) {
+			devm_release_resource(&dev->dev, &iov->holes[i]);
+			memset(&iov->holes[i], 0, sizeof(iov->holes[i]));
+		}
+
+		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
+
+		if (offset > 0) {
+			iov->holes[i].start = res2.start;
+			iov->holes[i].end = res2.start + size * offset - 1;
+			iov->holes[i].flags = IORESOURCE_BUS;
+			iov->holes[i].name = "pnv_iov_reserved";
+			devm_request_resource(&dev->dev, res->parent,
+					&iov->holes[i]);
+		}
+	}
+	return 0;
+}
+
+static void pnv_pci_sriov_disable(struct pci_dev *pdev)
+{
+	u16                    num_vfs, base_pe;
+	struct pnv_iov_data   *iov;
+
+	iov = pnv_iov_get(pdev);
+	if (WARN_ON(!iov))
+		return;
+
+	num_vfs = iov->num_vfs;
+	base_pe = iov->vf_pe_arr[0].pe_number;
+
+	/* Release VF PEs */
+	pnv_ioda_release_vf_PE(pdev);
+
+	/* Un-shift the IOV BARs if we need to */
+	if (iov->need_shift)
+		pnv_pci_vf_resource_shift(pdev, -base_pe);
+
+	/* Release M64 windows */
+	pnv_pci_vf_release_m64(pdev, num_vfs);
+}
+
+static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
+{
+	struct pnv_phb        *phb;
+	struct pnv_ioda_pe    *pe;
+	int                    pe_num;
+	u16                    vf_index;
+	struct pnv_iov_data   *iov;
+	struct pci_dn         *pdn;
+
+	if (!pdev->is_physfn)
+		return;
+
+	phb = pci_bus_to_pnvhb(pdev->bus);
+	pdn = pci_get_pdn(pdev);
+	iov = pnv_iov_get(pdev);
+
+	/* Reserve PE for each VF */
+	for (vf_index = 0; vf_index < num_vfs; vf_index++) {
+		int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index);
+		int vf_bus = pci_iov_virtfn_bus(pdev, vf_index);
+		struct pci_dn *vf_pdn;
+
+		pe = &iov->vf_pe_arr[vf_index];
+		pe->phb = phb;
+		pe->flags = PNV_IODA_PE_VF;
+		pe->pbus = NULL;
+		pe->parent_dev = pdev;
+		pe->mve_number = -1;
+		pe->rid = (vf_bus << 8) | vf_devfn;
+
+		pe_num = pe->pe_number;
+		pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
+			pci_domain_nr(pdev->bus), pdev->bus->number,
+			PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num);
+
+		if (pnv_ioda_configure_pe(phb, pe)) {
+			/* XXX What do we do here ? */
+			pnv_ioda_free_pe(pe);
+			pe->pdev = NULL;
+			continue;
+		}
+
+		/* Put PE to the list */
+		mutex_lock(&phb->ioda.pe_list_mutex);
+		list_add_tail(&pe->list, &phb->ioda.pe_list);
+		mutex_unlock(&phb->ioda.pe_list_mutex);
+
+		/* associate this pe to it's pdn */
+		list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) {
+			if (vf_pdn->busno == vf_bus &&
+			    vf_pdn->devfn == vf_devfn) {
+				vf_pdn->pe_number = pe_num;
+				break;
+			}
+		}
+
+		pnv_pci_ioda2_setup_dma_pe(phb, pe);
+	}
+}
+
+static int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+	struct pnv_ioda_pe    *base_pe;
+	struct pnv_iov_data   *iov;
+	struct pnv_phb        *phb;
+	int                    ret;
+	u16                    i;
+
+	phb = pci_bus_to_pnvhb(pdev->bus);
+	iov = pnv_iov_get(pdev);
+
+	/*
+	 * There's a calls to IODA2 PE setup code littered throughout. We could
+	 * probably fix that, but we'd still have problems due to the
+	 * restriction inherent on IODA1 PHBs.
+	 *
+	 * NB: We class IODA3 as IODA2 since they're very similar.
+	 */
+	if (phb->type != PNV_PHB_IODA2) {
+		pci_err(pdev, "SR-IOV is not supported on this PHB\n");
+		return -ENXIO;
+	}
+
+	if (!iov) {
+		dev_info(&pdev->dev, "don't support this SRIOV device with non 64bit-prefetchable IOV BAR\n");
+		return -ENOSPC;
+	}
+
+	/* allocate a contigious block of PEs for our VFs */
+	base_pe = pnv_ioda_alloc_pe(phb, num_vfs);
+	if (!base_pe) {
+		pci_err(pdev, "Unable to allocate PEs for %d VFs\n", num_vfs);
+		return -EBUSY;
+	}
+
+	iov->vf_pe_arr = base_pe;
+	iov->num_vfs = num_vfs;
+
+	/* Assign M64 window accordingly */
+	ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
+	if (ret) {
+		dev_info(&pdev->dev, "Not enough M64 window resources\n");
+		goto m64_failed;
+	}
+
+	/*
+	 * When using one M64 BAR to map one IOV BAR, we need to shift
+	 * the IOV BAR according to the PE# allocated to the VFs.
+	 * Otherwise, the PE# for the VF will conflict with others.
+	 */
+	if (iov->need_shift) {
+		ret = pnv_pci_vf_resource_shift(pdev, base_pe->pe_number);
+		if (ret)
+			goto shift_failed;
+	}
+
+	/* Setup VF PEs */
+	pnv_ioda_setup_vf_PE(pdev, num_vfs);
+
+	return 0;
+
+shift_failed:
+	pnv_pci_vf_release_m64(pdev, num_vfs);
+
+m64_failed:
+	for (i = 0; i < num_vfs; i++)
+		pnv_ioda_free_pe(&iov->vf_pe_arr[i]);
+
+	return ret;
+}
+
+int pnv_pcibios_sriov_disable(struct pci_dev *pdev)
+{
+	pnv_pci_sriov_disable(pdev);
+
+	/* Release PCI data */
+	remove_sriov_vf_pdns(pdev);
+	return 0;
+}
+
+int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+	/* Allocate PCI data */
+	add_sriov_vf_pdns(pdev);
+
+	return pnv_pci_sriov_enable(pdev, num_vfs);
+}
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
new file mode 100644
index 000000000..9b9bca169
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -0,0 +1,971 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Support PCI/PCIe on PowerNV platforms
+ *
+ * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/io.h>
+#include <linux/msi.h>
+#include <linux/iommu.h>
+#include <linux/sched/mm.h>
+
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/msi_bitmap.h>
+#include <asm/ppc-pci.h>
+#include <asm/pnv-pci.h>
+#include <asm/opal.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+#include <asm/firmware.h>
+#include <asm/eeh_event.h>
+#include <asm/eeh.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+static DEFINE_MUTEX(tunnel_mutex);
+
+int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id)
+{
+	struct device_node *node = np;
+	u32 bdfn;
+	u64 phbid;
+	int ret;
+
+	ret = of_property_read_u32(np, "reg", &bdfn);
+	if (ret)
+		return -ENXIO;
+
+	bdfn = ((bdfn & 0x00ffff00) >> 8);
+	for (node = np; node; node = of_get_parent(node)) {
+		if (!PCI_DN(node)) {
+			of_node_put(node);
+			break;
+		}
+
+		if (!of_device_is_compatible(node, "ibm,ioda2-phb") &&
+		    !of_device_is_compatible(node, "ibm,ioda3-phb") &&
+		    !of_device_is_compatible(node, "ibm,ioda2-npu2-opencapi-phb")) {
+			of_node_put(node);
+			continue;
+		}
+
+		ret = of_property_read_u64(node, "ibm,opal-phbid", &phbid);
+		if (ret) {
+			of_node_put(node);
+			return -ENXIO;
+		}
+
+		if (of_device_is_compatible(node, "ibm,ioda2-npu2-opencapi-phb"))
+			*id = PCI_PHB_SLOT_ID(phbid);
+		else
+			*id = PCI_SLOT_ID(phbid, bdfn);
+		return 0;
+	}
+
+	return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(pnv_pci_get_slot_id);
+
+int pnv_pci_get_device_tree(uint32_t phandle, void *buf, uint64_t len)
+{
+	int64_t rc;
+
+	if (!opal_check_token(OPAL_GET_DEVICE_TREE))
+		return -ENXIO;
+
+	rc = opal_get_device_tree(phandle, (uint64_t)buf, len);
+	if (rc < OPAL_SUCCESS)
+		return -EIO;
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(pnv_pci_get_device_tree);
+
+int pnv_pci_get_presence_state(uint64_t id, uint8_t *state)
+{
+	int64_t rc;
+
+	if (!opal_check_token(OPAL_PCI_GET_PRESENCE_STATE))
+		return -ENXIO;
+
+	rc = opal_pci_get_presence_state(id, (uint64_t)state);
+	if (rc != OPAL_SUCCESS)
+		return -EIO;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_pci_get_presence_state);
+
+int pnv_pci_get_power_state(uint64_t id, uint8_t *state)
+{
+	int64_t rc;
+
+	if (!opal_check_token(OPAL_PCI_GET_POWER_STATE))
+		return -ENXIO;
+
+	rc = opal_pci_get_power_state(id, (uint64_t)state);
+	if (rc != OPAL_SUCCESS)
+		return -EIO;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_pci_get_power_state);
+
+int pnv_pci_set_power_state(uint64_t id, uint8_t state, struct opal_msg *msg)
+{
+	struct opal_msg m;
+	int token, ret;
+	int64_t rc;
+
+	if (!opal_check_token(OPAL_PCI_SET_POWER_STATE))
+		return -ENXIO;
+
+	token = opal_async_get_token_interruptible();
+	if (unlikely(token < 0))
+		return token;
+
+	rc = opal_pci_set_power_state(token, id, (uint64_t)&state);
+	if (rc == OPAL_SUCCESS) {
+		ret = 0;
+		goto exit;
+	} else if (rc != OPAL_ASYNC_COMPLETION) {
+		ret = -EIO;
+		goto exit;
+	}
+
+	ret = opal_async_wait_response(token, &m);
+	if (ret < 0)
+		goto exit;
+
+	if (msg) {
+		ret = 1;
+		memcpy(msg, &m, sizeof(m));
+	}
+
+exit:
+	opal_async_release_token(token);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pnv_pci_set_power_state);
+
+int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
+{
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
+	struct msi_desc *entry;
+	struct msi_msg msg;
+	int hwirq;
+	unsigned int virq;
+	int rc;
+
+	if (WARN_ON(!phb) || !phb->msi_bmp.bitmap)
+		return -ENODEV;
+
+	if (pdev->no_64bit_msi && !phb->msi32_support)
+		return -ENODEV;
+
+	for_each_pci_msi_entry(entry, pdev) {
+		if (!entry->msi_attrib.is_64 && !phb->msi32_support) {
+			pr_warn("%s: Supports only 64-bit MSIs\n",
+				pci_name(pdev));
+			return -ENXIO;
+		}
+		hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, 1);
+		if (hwirq < 0) {
+			pr_warn("%s: Failed to find a free MSI\n",
+				pci_name(pdev));
+			return -ENOSPC;
+		}
+		virq = irq_create_mapping(NULL, phb->msi_base + hwirq);
+		if (!virq) {
+			pr_warn("%s: Failed to map MSI to linux irq\n",
+				pci_name(pdev));
+			msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1);
+			return -ENOMEM;
+		}
+		rc = phb->msi_setup(phb, pdev, phb->msi_base + hwirq,
+				    virq, entry->msi_attrib.is_64, &msg);
+		if (rc) {
+			pr_warn("%s: Failed to setup MSI\n", pci_name(pdev));
+			irq_dispose_mapping(virq);
+			msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1);
+			return rc;
+		}
+		irq_set_msi_desc(virq, entry);
+		pci_write_msi_msg(virq, &msg);
+	}
+	return 0;
+}
+
+void pnv_teardown_msi_irqs(struct pci_dev *pdev)
+{
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
+	struct msi_desc *entry;
+	irq_hw_number_t hwirq;
+
+	if (WARN_ON(!phb))
+		return;
+
+	for_each_pci_msi_entry(entry, pdev) {
+		if (!entry->irq)
+			continue;
+		hwirq = virq_to_hw(entry->irq);
+		irq_set_msi_desc(entry->irq, NULL);
+		irq_dispose_mapping(entry->irq);
+		msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, 1);
+	}
+}
+
+/* Nicely print the contents of the PE State Tables (PEST). */
+static void pnv_pci_dump_pest(__be64 pestA[], __be64 pestB[], int pest_size)
+{
+	__be64 prevA = ULONG_MAX, prevB = ULONG_MAX;
+	bool dup = false;
+	int i;
+
+	for (i = 0; i < pest_size; i++) {
+		__be64 peA = be64_to_cpu(pestA[i]);
+		__be64 peB = be64_to_cpu(pestB[i]);
+
+		if (peA != prevA || peB != prevB) {
+			if (dup) {
+				pr_info("PE[..%03x] A/B: as above\n", i-1);
+				dup = false;
+			}
+			prevA = peA;
+			prevB = peB;
+			if (peA & PNV_IODA_STOPPED_STATE ||
+			    peB & PNV_IODA_STOPPED_STATE)
+				pr_info("PE[%03x] A/B: %016llx %016llx\n",
+					i, peA, peB);
+		} else if (!dup && (peA & PNV_IODA_STOPPED_STATE ||
+				    peB & PNV_IODA_STOPPED_STATE)) {
+			dup = true;
+		}
+	}
+}
+
+static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose,
+					 struct OpalIoPhbErrorCommon *common)
+{
+	struct OpalIoP7IOCPhbErrorData *data;
+
+	data = (struct OpalIoP7IOCPhbErrorData *)common;
+	pr_info("P7IOC PHB#%x Diag-data (Version: %d)\n",
+		hose->global_number, be32_to_cpu(common->version));
+
+	if (data->brdgCtl)
+		pr_info("brdgCtl:     %08x\n",
+			be32_to_cpu(data->brdgCtl));
+	if (data->portStatusReg || data->rootCmplxStatus ||
+	    data->busAgentStatus)
+		pr_info("UtlSts:      %08x %08x %08x\n",
+			be32_to_cpu(data->portStatusReg),
+			be32_to_cpu(data->rootCmplxStatus),
+			be32_to_cpu(data->busAgentStatus));
+	if (data->deviceStatus || data->slotStatus   ||
+	    data->linkStatus   || data->devCmdStatus ||
+	    data->devSecStatus)
+		pr_info("RootSts:     %08x %08x %08x %08x %08x\n",
+			be32_to_cpu(data->deviceStatus),
+			be32_to_cpu(data->slotStatus),
+			be32_to_cpu(data->linkStatus),
+			be32_to_cpu(data->devCmdStatus),
+			be32_to_cpu(data->devSecStatus));
+	if (data->rootErrorStatus   || data->uncorrErrorStatus ||
+	    data->corrErrorStatus)
+		pr_info("RootErrSts:  %08x %08x %08x\n",
+			be32_to_cpu(data->rootErrorStatus),
+			be32_to_cpu(data->uncorrErrorStatus),
+			be32_to_cpu(data->corrErrorStatus));
+	if (data->tlpHdr1 || data->tlpHdr2 ||
+	    data->tlpHdr3 || data->tlpHdr4)
+		pr_info("RootErrLog:  %08x %08x %08x %08x\n",
+			be32_to_cpu(data->tlpHdr1),
+			be32_to_cpu(data->tlpHdr2),
+			be32_to_cpu(data->tlpHdr3),
+			be32_to_cpu(data->tlpHdr4));
+	if (data->sourceId || data->errorClass ||
+	    data->correlator)
+		pr_info("RootErrLog1: %08x %016llx %016llx\n",
+			be32_to_cpu(data->sourceId),
+			be64_to_cpu(data->errorClass),
+			be64_to_cpu(data->correlator));
+	if (data->p7iocPlssr || data->p7iocCsr)
+		pr_info("PhbSts:      %016llx %016llx\n",
+			be64_to_cpu(data->p7iocPlssr),
+			be64_to_cpu(data->p7iocCsr));
+	if (data->lemFir)
+		pr_info("Lem:         %016llx %016llx %016llx\n",
+			be64_to_cpu(data->lemFir),
+			be64_to_cpu(data->lemErrorMask),
+			be64_to_cpu(data->lemWOF));
+	if (data->phbErrorStatus)
+		pr_info("PhbErr:      %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->phbErrorStatus),
+			be64_to_cpu(data->phbFirstErrorStatus),
+			be64_to_cpu(data->phbErrorLog0),
+			be64_to_cpu(data->phbErrorLog1));
+	if (data->mmioErrorStatus)
+		pr_info("OutErr:      %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->mmioErrorStatus),
+			be64_to_cpu(data->mmioFirstErrorStatus),
+			be64_to_cpu(data->mmioErrorLog0),
+			be64_to_cpu(data->mmioErrorLog1));
+	if (data->dma0ErrorStatus)
+		pr_info("InAErr:      %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->dma0ErrorStatus),
+			be64_to_cpu(data->dma0FirstErrorStatus),
+			be64_to_cpu(data->dma0ErrorLog0),
+			be64_to_cpu(data->dma0ErrorLog1));
+	if (data->dma1ErrorStatus)
+		pr_info("InBErr:      %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->dma1ErrorStatus),
+			be64_to_cpu(data->dma1FirstErrorStatus),
+			be64_to_cpu(data->dma1ErrorLog0),
+			be64_to_cpu(data->dma1ErrorLog1));
+
+	pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_P7IOC_NUM_PEST_REGS);
+}
+
+static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose,
+					struct OpalIoPhbErrorCommon *common)
+{
+	struct OpalIoPhb3ErrorData *data;
+
+	data = (struct OpalIoPhb3ErrorData*)common;
+	pr_info("PHB3 PHB#%x Diag-data (Version: %d)\n",
+		hose->global_number, be32_to_cpu(common->version));
+	if (data->brdgCtl)
+		pr_info("brdgCtl:     %08x\n",
+			be32_to_cpu(data->brdgCtl));
+	if (data->portStatusReg || data->rootCmplxStatus ||
+	    data->busAgentStatus)
+		pr_info("UtlSts:      %08x %08x %08x\n",
+			be32_to_cpu(data->portStatusReg),
+			be32_to_cpu(data->rootCmplxStatus),
+			be32_to_cpu(data->busAgentStatus));
+	if (data->deviceStatus || data->slotStatus   ||
+	    data->linkStatus   || data->devCmdStatus ||
+	    data->devSecStatus)
+		pr_info("RootSts:     %08x %08x %08x %08x %08x\n",
+			be32_to_cpu(data->deviceStatus),
+			be32_to_cpu(data->slotStatus),
+			be32_to_cpu(data->linkStatus),
+			be32_to_cpu(data->devCmdStatus),
+			be32_to_cpu(data->devSecStatus));
+	if (data->rootErrorStatus || data->uncorrErrorStatus ||
+	    data->corrErrorStatus)
+		pr_info("RootErrSts:  %08x %08x %08x\n",
+			be32_to_cpu(data->rootErrorStatus),
+			be32_to_cpu(data->uncorrErrorStatus),
+			be32_to_cpu(data->corrErrorStatus));
+	if (data->tlpHdr1 || data->tlpHdr2 ||
+	    data->tlpHdr3 || data->tlpHdr4)
+		pr_info("RootErrLog:  %08x %08x %08x %08x\n",
+			be32_to_cpu(data->tlpHdr1),
+			be32_to_cpu(data->tlpHdr2),
+			be32_to_cpu(data->tlpHdr3),
+			be32_to_cpu(data->tlpHdr4));
+	if (data->sourceId || data->errorClass ||
+	    data->correlator)
+		pr_info("RootErrLog1: %08x %016llx %016llx\n",
+			be32_to_cpu(data->sourceId),
+			be64_to_cpu(data->errorClass),
+			be64_to_cpu(data->correlator));
+	if (data->nFir)
+		pr_info("nFir:        %016llx %016llx %016llx\n",
+			be64_to_cpu(data->nFir),
+			be64_to_cpu(data->nFirMask),
+			be64_to_cpu(data->nFirWOF));
+	if (data->phbPlssr || data->phbCsr)
+		pr_info("PhbSts:      %016llx %016llx\n",
+			be64_to_cpu(data->phbPlssr),
+			be64_to_cpu(data->phbCsr));
+	if (data->lemFir)
+		pr_info("Lem:         %016llx %016llx %016llx\n",
+			be64_to_cpu(data->lemFir),
+			be64_to_cpu(data->lemErrorMask),
+			be64_to_cpu(data->lemWOF));
+	if (data->phbErrorStatus)
+		pr_info("PhbErr:      %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->phbErrorStatus),
+			be64_to_cpu(data->phbFirstErrorStatus),
+			be64_to_cpu(data->phbErrorLog0),
+			be64_to_cpu(data->phbErrorLog1));
+	if (data->mmioErrorStatus)
+		pr_info("OutErr:      %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->mmioErrorStatus),
+			be64_to_cpu(data->mmioFirstErrorStatus),
+			be64_to_cpu(data->mmioErrorLog0),
+			be64_to_cpu(data->mmioErrorLog1));
+	if (data->dma0ErrorStatus)
+		pr_info("InAErr:      %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->dma0ErrorStatus),
+			be64_to_cpu(data->dma0FirstErrorStatus),
+			be64_to_cpu(data->dma0ErrorLog0),
+			be64_to_cpu(data->dma0ErrorLog1));
+	if (data->dma1ErrorStatus)
+		pr_info("InBErr:      %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->dma1ErrorStatus),
+			be64_to_cpu(data->dma1FirstErrorStatus),
+			be64_to_cpu(data->dma1ErrorLog0),
+			be64_to_cpu(data->dma1ErrorLog1));
+
+	pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_PHB3_NUM_PEST_REGS);
+}
+
+static void pnv_pci_dump_phb4_diag_data(struct pci_controller *hose,
+					struct OpalIoPhbErrorCommon *common)
+{
+	struct OpalIoPhb4ErrorData *data;
+
+	data = (struct OpalIoPhb4ErrorData*)common;
+	pr_info("PHB4 PHB#%d Diag-data (Version: %d)\n",
+		hose->global_number, be32_to_cpu(common->version));
+	if (data->brdgCtl)
+		pr_info("brdgCtl:    %08x\n",
+			be32_to_cpu(data->brdgCtl));
+	if (data->deviceStatus || data->slotStatus   ||
+	    data->linkStatus   || data->devCmdStatus ||
+	    data->devSecStatus)
+		pr_info("RootSts:    %08x %08x %08x %08x %08x\n",
+			be32_to_cpu(data->deviceStatus),
+			be32_to_cpu(data->slotStatus),
+			be32_to_cpu(data->linkStatus),
+			be32_to_cpu(data->devCmdStatus),
+			be32_to_cpu(data->devSecStatus));
+	if (data->rootErrorStatus || data->uncorrErrorStatus ||
+	    data->corrErrorStatus)
+		pr_info("RootErrSts: %08x %08x %08x\n",
+			be32_to_cpu(data->rootErrorStatus),
+			be32_to_cpu(data->uncorrErrorStatus),
+			be32_to_cpu(data->corrErrorStatus));
+	if (data->tlpHdr1 || data->tlpHdr2 ||
+	    data->tlpHdr3 || data->tlpHdr4)
+		pr_info("RootErrLog: %08x %08x %08x %08x\n",
+			be32_to_cpu(data->tlpHdr1),
+			be32_to_cpu(data->tlpHdr2),
+			be32_to_cpu(data->tlpHdr3),
+			be32_to_cpu(data->tlpHdr4));
+	if (data->sourceId)
+		pr_info("sourceId:   %08x\n", be32_to_cpu(data->sourceId));
+	if (data->nFir)
+		pr_info("nFir:       %016llx %016llx %016llx\n",
+			be64_to_cpu(data->nFir),
+			be64_to_cpu(data->nFirMask),
+			be64_to_cpu(data->nFirWOF));
+	if (data->phbPlssr || data->phbCsr)
+		pr_info("PhbSts:     %016llx %016llx\n",
+			be64_to_cpu(data->phbPlssr),
+			be64_to_cpu(data->phbCsr));
+	if (data->lemFir)
+		pr_info("Lem:        %016llx %016llx %016llx\n",
+			be64_to_cpu(data->lemFir),
+			be64_to_cpu(data->lemErrorMask),
+			be64_to_cpu(data->lemWOF));
+	if (data->phbErrorStatus)
+		pr_info("PhbErr:     %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->phbErrorStatus),
+			be64_to_cpu(data->phbFirstErrorStatus),
+			be64_to_cpu(data->phbErrorLog0),
+			be64_to_cpu(data->phbErrorLog1));
+	if (data->phbTxeErrorStatus)
+		pr_info("PhbTxeErr:  %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->phbTxeErrorStatus),
+			be64_to_cpu(data->phbTxeFirstErrorStatus),
+			be64_to_cpu(data->phbTxeErrorLog0),
+			be64_to_cpu(data->phbTxeErrorLog1));
+	if (data->phbRxeArbErrorStatus)
+		pr_info("RxeArbErr:  %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->phbRxeArbErrorStatus),
+			be64_to_cpu(data->phbRxeArbFirstErrorStatus),
+			be64_to_cpu(data->phbRxeArbErrorLog0),
+			be64_to_cpu(data->phbRxeArbErrorLog1));
+	if (data->phbRxeMrgErrorStatus)
+		pr_info("RxeMrgErr:  %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->phbRxeMrgErrorStatus),
+			be64_to_cpu(data->phbRxeMrgFirstErrorStatus),
+			be64_to_cpu(data->phbRxeMrgErrorLog0),
+			be64_to_cpu(data->phbRxeMrgErrorLog1));
+	if (data->phbRxeTceErrorStatus)
+		pr_info("RxeTceErr:  %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->phbRxeTceErrorStatus),
+			be64_to_cpu(data->phbRxeTceFirstErrorStatus),
+			be64_to_cpu(data->phbRxeTceErrorLog0),
+			be64_to_cpu(data->phbRxeTceErrorLog1));
+
+	if (data->phbPblErrorStatus)
+		pr_info("PblErr:     %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->phbPblErrorStatus),
+			be64_to_cpu(data->phbPblFirstErrorStatus),
+			be64_to_cpu(data->phbPblErrorLog0),
+			be64_to_cpu(data->phbPblErrorLog1));
+	if (data->phbPcieDlpErrorStatus)
+		pr_info("PcieDlp:    %016llx %016llx %016llx\n",
+			be64_to_cpu(data->phbPcieDlpErrorLog1),
+			be64_to_cpu(data->phbPcieDlpErrorLog2),
+			be64_to_cpu(data->phbPcieDlpErrorStatus));
+	if (data->phbRegbErrorStatus)
+		pr_info("RegbErr:    %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->phbRegbErrorStatus),
+			be64_to_cpu(data->phbRegbFirstErrorStatus),
+			be64_to_cpu(data->phbRegbErrorLog0),
+			be64_to_cpu(data->phbRegbErrorLog1));
+
+
+	pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_PHB4_NUM_PEST_REGS);
+}
+
+void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
+				unsigned char *log_buff)
+{
+	struct OpalIoPhbErrorCommon *common;
+
+	if (!hose || !log_buff)
+		return;
+
+	common = (struct OpalIoPhbErrorCommon *)log_buff;
+	switch (be32_to_cpu(common->ioType)) {
+	case OPAL_PHB_ERROR_DATA_TYPE_P7IOC:
+		pnv_pci_dump_p7ioc_diag_data(hose, common);
+		break;
+	case OPAL_PHB_ERROR_DATA_TYPE_PHB3:
+		pnv_pci_dump_phb3_diag_data(hose, common);
+		break;
+	case OPAL_PHB_ERROR_DATA_TYPE_PHB4:
+		pnv_pci_dump_phb4_diag_data(hose, common);
+		break;
+	default:
+		pr_warn("%s: Unrecognized ioType %d\n",
+			__func__, be32_to_cpu(common->ioType));
+	}
+}
+
+static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
+{
+	unsigned long flags, rc;
+	int has_diag, ret = 0;
+
+	spin_lock_irqsave(&phb->lock, flags);
+
+	/* Fetch PHB diag-data */
+	rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data,
+					 phb->diag_data_size);
+	has_diag = (rc == OPAL_SUCCESS);
+
+	/* If PHB supports compound PE, to handle it */
+	if (phb->unfreeze_pe) {
+		ret = phb->unfreeze_pe(phb,
+				       pe_no,
+				       OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+	} else {
+		rc = opal_pci_eeh_freeze_clear(phb->opal_id,
+					     pe_no,
+					     OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+		if (rc) {
+			pr_warn("%s: Failure %ld clearing frozen "
+				"PHB#%x-PE#%x\n",
+				__func__, rc, phb->hose->global_number,
+				pe_no);
+			ret = -EIO;
+		}
+	}
+
+	/*
+	 * For now, let's only display the diag buffer when we fail to clear
+	 * the EEH status. We'll do more sensible things later when we have
+	 * proper EEH support. We need to make sure we don't pollute ourselves
+	 * with the normal errors generated when probing empty slots
+	 */
+	if (has_diag && ret)
+		pnv_pci_dump_phb_diag_data(phb->hose, phb->diag_data);
+
+	spin_unlock_irqrestore(&phb->lock, flags);
+}
+
+static void pnv_pci_config_check_eeh(struct pci_dn *pdn)
+{
+	struct pnv_phb *phb = pdn->phb->private_data;
+	u8	fstate = 0;
+	__be16	pcierr = 0;
+	unsigned int pe_no;
+	s64	rc;
+
+	/*
+	 * Get the PE#. During the PCI probe stage, we might not
+	 * setup that yet. So all ER errors should be mapped to
+	 * reserved PE.
+	 */
+	pe_no = pdn->pe_number;
+	if (pe_no == IODA_INVALID_PE) {
+		pe_no = phb->ioda.reserved_pe_idx;
+	}
+
+	/*
+	 * Fetch frozen state. If the PHB support compound PE,
+	 * we need handle that case.
+	 */
+	if (phb->get_pe_state) {
+		fstate = phb->get_pe_state(phb, pe_no);
+	} else {
+		rc = opal_pci_eeh_freeze_status(phb->opal_id,
+						pe_no,
+						&fstate,
+						&pcierr,
+						NULL);
+		if (rc) {
+			pr_warn("%s: Failure %lld getting PHB#%x-PE#%x state\n",
+				__func__, rc, phb->hose->global_number, pe_no);
+			return;
+		}
+	}
+
+	pr_devel(" -> EEH check, bdfn=%04x PE#%x fstate=%x\n",
+		 (pdn->busno << 8) | (pdn->devfn), pe_no, fstate);
+
+	/* Clear the frozen state if applicable */
+	if (fstate == OPAL_EEH_STOPPED_MMIO_FREEZE ||
+	    fstate == OPAL_EEH_STOPPED_DMA_FREEZE  ||
+	    fstate == OPAL_EEH_STOPPED_MMIO_DMA_FREEZE) {
+		/*
+		 * If PHB supports compound PE, freeze it for
+		 * consistency.
+		 */
+		if (phb->freeze_pe)
+			phb->freeze_pe(phb, pe_no);
+
+		pnv_pci_handle_eeh_config(phb, pe_no);
+	}
+}
+
+int pnv_pci_cfg_read(struct pci_dn *pdn,
+		     int where, int size, u32 *val)
+{
+	struct pnv_phb *phb = pdn->phb->private_data;
+	u32 bdfn = (pdn->busno << 8) | pdn->devfn;
+	s64 rc;
+
+	switch (size) {
+	case 1: {
+		u8 v8;
+		rc = opal_pci_config_read_byte(phb->opal_id, bdfn, where, &v8);
+		*val = (rc == OPAL_SUCCESS) ? v8 : 0xff;
+		break;
+	}
+	case 2: {
+		__be16 v16;
+		rc = opal_pci_config_read_half_word(phb->opal_id, bdfn, where,
+						   &v16);
+		*val = (rc == OPAL_SUCCESS) ? be16_to_cpu(v16) : 0xffff;
+		break;
+	}
+	case 4: {
+		__be32 v32;
+		rc = opal_pci_config_read_word(phb->opal_id, bdfn, where, &v32);
+		*val = (rc == OPAL_SUCCESS) ? be32_to_cpu(v32) : 0xffffffff;
+		break;
+	}
+	default:
+		return PCIBIOS_FUNC_NOT_SUPPORTED;
+	}
+
+	pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
+		 __func__, pdn->busno, pdn->devfn, where, size, *val);
+	return PCIBIOS_SUCCESSFUL;
+}
+
+int pnv_pci_cfg_write(struct pci_dn *pdn,
+		      int where, int size, u32 val)
+{
+	struct pnv_phb *phb = pdn->phb->private_data;
+	u32 bdfn = (pdn->busno << 8) | pdn->devfn;
+
+	pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
+		 __func__, pdn->busno, pdn->devfn, where, size, val);
+	switch (size) {
+	case 1:
+		opal_pci_config_write_byte(phb->opal_id, bdfn, where, val);
+		break;
+	case 2:
+		opal_pci_config_write_half_word(phb->opal_id, bdfn, where, val);
+		break;
+	case 4:
+		opal_pci_config_write_word(phb->opal_id, bdfn, where, val);
+		break;
+	default:
+		return PCIBIOS_FUNC_NOT_SUPPORTED;
+	}
+
+	return PCIBIOS_SUCCESSFUL;
+}
+
+#if CONFIG_EEH
+static bool pnv_pci_cfg_check(struct pci_dn *pdn)
+{
+	struct eeh_dev *edev = NULL;
+	struct pnv_phb *phb = pdn->phb->private_data;
+
+	/* EEH not enabled ? */
+	if (!(phb->flags & PNV_PHB_FLAG_EEH))
+		return true;
+
+	/* PE reset or device removed ? */
+	edev = pdn->edev;
+	if (edev) {
+		if (edev->pe &&
+		    (edev->pe->state & EEH_PE_CFG_BLOCKED))
+			return false;
+
+		if (edev->mode & EEH_DEV_REMOVED)
+			return false;
+	}
+
+	return true;
+}
+#else
+static inline pnv_pci_cfg_check(struct pci_dn *pdn)
+{
+	return true;
+}
+#endif /* CONFIG_EEH */
+
+static int pnv_pci_read_config(struct pci_bus *bus,
+			       unsigned int devfn,
+			       int where, int size, u32 *val)
+{
+	struct pci_dn *pdn;
+	struct pnv_phb *phb;
+	int ret;
+
+	*val = 0xFFFFFFFF;
+	pdn = pci_get_pdn_by_devfn(bus, devfn);
+	if (!pdn)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	if (!pnv_pci_cfg_check(pdn))
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	ret = pnv_pci_cfg_read(pdn, where, size, val);
+	phb = pdn->phb->private_data;
+	if (phb->flags & PNV_PHB_FLAG_EEH && pdn->edev) {
+		if (*val == EEH_IO_ERROR_VALUE(size) &&
+		    eeh_dev_check_failure(pdn->edev))
+                        return PCIBIOS_DEVICE_NOT_FOUND;
+	} else {
+		pnv_pci_config_check_eeh(pdn);
+	}
+
+	return ret;
+}
+
+static int pnv_pci_write_config(struct pci_bus *bus,
+				unsigned int devfn,
+				int where, int size, u32 val)
+{
+	struct pci_dn *pdn;
+	struct pnv_phb *phb;
+	int ret;
+
+	pdn = pci_get_pdn_by_devfn(bus, devfn);
+	if (!pdn)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	if (!pnv_pci_cfg_check(pdn))
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	ret = pnv_pci_cfg_write(pdn, where, size, val);
+	phb = pdn->phb->private_data;
+	if (!(phb->flags & PNV_PHB_FLAG_EEH))
+		pnv_pci_config_check_eeh(pdn);
+
+	return ret;
+}
+
+struct pci_ops pnv_pci_ops = {
+	.read  = pnv_pci_read_config,
+	.write = pnv_pci_write_config,
+};
+
+struct iommu_table *pnv_pci_table_alloc(int nid)
+{
+	struct iommu_table *tbl;
+
+	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid);
+	if (!tbl)
+		return NULL;
+
+	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+	kref_init(&tbl->it_kref);
+
+	return tbl;
+}
+
+struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+
+	return of_node_get(hose->dn);
+}
+EXPORT_SYMBOL(pnv_pci_get_phb_node);
+
+int pnv_pci_set_tunnel_bar(struct pci_dev *dev, u64 addr, int enable)
+{
+	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
+	u64 tunnel_bar;
+	__be64 val;
+	int rc;
+
+	if (!opal_check_token(OPAL_PCI_GET_PBCQ_TUNNEL_BAR))
+		return -ENXIO;
+	if (!opal_check_token(OPAL_PCI_SET_PBCQ_TUNNEL_BAR))
+		return -ENXIO;
+
+	mutex_lock(&tunnel_mutex);
+	rc = opal_pci_get_pbcq_tunnel_bar(phb->opal_id, &val);
+	if (rc != OPAL_SUCCESS) {
+		rc = -EIO;
+		goto out;
+	}
+	tunnel_bar = be64_to_cpu(val);
+	if (enable) {
+		/*
+		* Only one device per PHB can use atomics.
+		* Our policy is first-come, first-served.
+		*/
+		if (tunnel_bar) {
+			if (tunnel_bar != addr)
+				rc = -EBUSY;
+			else
+				rc = 0;	/* Setting same address twice is ok */
+			goto out;
+		}
+	} else {
+		/*
+		* The device that owns atomics and wants to release
+		* them must pass the same address with enable == 0.
+		*/
+		if (tunnel_bar != addr) {
+			rc = -EPERM;
+			goto out;
+		}
+		addr = 0x0ULL;
+	}
+	rc = opal_pci_set_pbcq_tunnel_bar(phb->opal_id, addr);
+	rc = opal_error_code(rc);
+out:
+	mutex_unlock(&tunnel_mutex);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(pnv_pci_set_tunnel_bar);
+
+void pnv_pci_shutdown(void)
+{
+	struct pci_controller *hose;
+
+	list_for_each_entry(hose, &hose_list, list_node)
+		if (hose->controller_ops.shutdown)
+			hose->controller_ops.shutdown(hose);
+}
+
+/* Fixup wrong class code in p7ioc and p8 root complex */
+static void pnv_p7ioc_rc_quirk(struct pci_dev *dev)
+{
+	dev->class = PCI_CLASS_BRIDGE_PCI << 8;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_IBM, 0x3b9, pnv_p7ioc_rc_quirk);
+
+void __init pnv_pci_init(void)
+{
+	struct device_node *np;
+
+	pci_add_flags(PCI_CAN_SKIP_ISA_ALIGN);
+
+	/* If we don't have OPAL, eg. in sim, just skip PCI probe */
+	if (!firmware_has_feature(FW_FEATURE_OPAL))
+		return;
+
+#ifdef CONFIG_PCIEPORTBUS
+	/*
+	 * On PowerNV PCIe devices are (currently) managed in cooperation
+	 * with firmware. This isn't *strictly* required, but there's enough
+	 * assumptions baked into both firmware and the platform code that
+	 * it's unwise to allow the portbus services to be used.
+	 *
+	 * We need to fix this eventually, but for now set this flag to disable
+	 * the portbus driver. The AER service isn't required since that AER
+	 * events are handled via EEH. The pciehp hotplug driver can't work
+	 * without kernel changes (and portbus binding breaks pnv_php). The
+	 * other services also require some thinking about how we're going
+	 * to integrate them.
+	 */
+	pcie_ports_disabled = true;
+#endif
+
+	/* Look for IODA IO-Hubs. */
+	for_each_compatible_node(np, NULL, "ibm,ioda-hub") {
+		pnv_pci_init_ioda_hub(np);
+	}
+
+	/* Look for ioda2 built-in PHB3's */
+	for_each_compatible_node(np, NULL, "ibm,ioda2-phb")
+		pnv_pci_init_ioda2_phb(np);
+
+	/* Look for ioda3 built-in PHB4's, we treat them as IODA2 */
+	for_each_compatible_node(np, NULL, "ibm,ioda3-phb")
+		pnv_pci_init_ioda2_phb(np);
+
+	/* Look for NPU PHBs */
+	for_each_compatible_node(np, NULL, "ibm,ioda2-npu-phb")
+		pnv_pci_init_npu_phb(np);
+
+	/*
+	 * Look for NPU2 PHBs which we treat mostly as NPU PHBs with
+	 * the exception of TCE kill which requires an OPAL call.
+	 */
+	for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-phb")
+		pnv_pci_init_npu_phb(np);
+
+	/* Look for NPU2 OpenCAPI PHBs */
+	for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-opencapi-phb")
+		pnv_pci_init_npu2_opencapi_phb(np);
+
+	/* Configure IOMMU DMA hooks */
+	set_pci_dma_ops(&dma_iommu_ops);
+}
+
+static int pnv_tce_iommu_bus_notifier(struct notifier_block *nb,
+		unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_DEL_DEVICE:
+		iommu_del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block pnv_tce_iommu_bus_nb = {
+	.notifier_call = pnv_tce_iommu_bus_notifier,
+};
+
+static int __init pnv_tce_iommu_bus_notifier_init(void)
+{
+	bus_register_notifier(&pci_bus_type, &pnv_tce_iommu_bus_nb);
+	return 0;
+}
+machine_subsys_initcall_sync(powernv, pnv_tce_iommu_bus_notifier_init);
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
new file mode 100644
index 000000000..739a0b3b7
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -0,0 +1,364 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __POWERNV_PCI_H
+#define __POWERNV_PCI_H
+
+#include <linux/compiler.h>		/* for __printf */
+#include <linux/iommu.h>
+#include <asm/iommu.h>
+#include <asm/msi_bitmap.h>
+
+struct pci_dn;
+
+enum pnv_phb_type {
+	PNV_PHB_IODA1		= 0,
+	PNV_PHB_IODA2		= 1,
+	PNV_PHB_NPU_NVLINK	= 2,
+	PNV_PHB_NPU_OCAPI	= 3,
+};
+
+/* Precise PHB model for error management */
+enum pnv_phb_model {
+	PNV_PHB_MODEL_UNKNOWN,
+	PNV_PHB_MODEL_P7IOC,
+	PNV_PHB_MODEL_PHB3,
+	PNV_PHB_MODEL_NPU,
+	PNV_PHB_MODEL_NPU2,
+};
+
+#define PNV_PCI_DIAG_BUF_SIZE	8192
+#define PNV_IODA_PE_DEV		(1 << 0)	/* PE has single PCI device	*/
+#define PNV_IODA_PE_BUS		(1 << 1)	/* PE has primary PCI bus	*/
+#define PNV_IODA_PE_BUS_ALL	(1 << 2)	/* PE has subordinate buses	*/
+#define PNV_IODA_PE_MASTER	(1 << 3)	/* Master PE in compound case	*/
+#define PNV_IODA_PE_SLAVE	(1 << 4)	/* Slave PE in compound case	*/
+#define PNV_IODA_PE_VF		(1 << 5)	/* PE for one VF 		*/
+
+/*
+ * A brief note on PNV_IODA_PE_BUS_ALL
+ *
+ * This is needed because of the behaviour of PCIe-to-PCI bridges. The PHB uses
+ * the Requester ID field of the PCIe request header to determine the device
+ * (and PE) that initiated a DMA. In legacy PCI individual memory read/write
+ * requests aren't tagged with the RID. To work around this the PCIe-to-PCI
+ * bridge will use (secondary_bus_no << 8) | 0x00 as the RID on the PCIe side.
+ *
+ * PCIe-to-X bridges have a similar issue even though PCI-X requests also have
+ * a RID in the transaction header. The PCIe-to-X bridge is permitted to "take
+ * ownership" of a transaction by a PCI-X device when forwarding it to the PCIe
+ * side of the bridge.
+ *
+ * To work around these problems we use the BUS_ALL flag since every subordinate
+ * bus of the bridge should go into the same PE.
+ */
+
+/* Indicates operations are frozen for a PE: MMIO in PESTA & DMA in PESTB. */
+#define PNV_IODA_STOPPED_STATE	0x8000000000000000
+
+/* Data associated with a PE, including IOMMU tracking etc.. */
+struct pnv_phb;
+struct pnv_ioda_pe {
+	unsigned long		flags;
+	struct pnv_phb		*phb;
+	int			device_count;
+
+	/* A PE can be associated with a single device or an
+	 * entire bus (& children). In the former case, pdev
+	 * is populated, in the later case, pbus is.
+	 */
+#ifdef CONFIG_PCI_IOV
+	struct pci_dev          *parent_dev;
+#endif
+	struct pci_dev		*pdev;
+	struct pci_bus		*pbus;
+
+	/* Effective RID (device RID for a device PE and base bus
+	 * RID with devfn 0 for a bus PE)
+	 */
+	unsigned int		rid;
+
+	/* PE number */
+	unsigned int		pe_number;
+
+	/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
+	struct iommu_table_group table_group;
+	struct npu_comp		*npucomp;
+
+	/* 64-bit TCE bypass region */
+	bool			tce_bypass_enabled;
+	uint64_t		tce_bypass_base;
+
+	/*
+	 * Used to track whether we've done DMA setup for this PE or not. We
+	 * want to defer allocating TCE tables, etc until we've added a
+	 * non-bridge device to the PE.
+	 */
+	bool			dma_setup_done;
+
+	/* MSIs. MVE index is identical for 32 and 64 bit MSI
+	 * and -1 if not supported. (It's actually identical to the
+	 * PE number)
+	 */
+	int			mve_number;
+
+	/* PEs in compound case */
+	struct pnv_ioda_pe	*master;
+	struct list_head	slaves;
+
+	/* Link in list of PE#s */
+	struct list_head	list;
+};
+
+#define PNV_PHB_FLAG_EEH	(1 << 0)
+
+struct pnv_phb {
+	struct pci_controller	*hose;
+	enum pnv_phb_type	type;
+	enum pnv_phb_model	model;
+	u64			hub_id;
+	u64			opal_id;
+	int			flags;
+	void __iomem		*regs;
+	u64			regs_phys;
+	int			initialized;
+	spinlock_t		lock;
+
+#ifdef CONFIG_DEBUG_FS
+	int			has_dbgfs;
+	struct dentry		*dbgfs;
+#endif
+
+	unsigned int		msi_base;
+	unsigned int		msi32_support;
+	struct msi_bitmap	msi_bmp;
+	int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev,
+			 unsigned int hwirq, unsigned int virq,
+			 unsigned int is_64, struct msi_msg *msg);
+	int (*init_m64)(struct pnv_phb *phb);
+	int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
+	void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
+	int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
+
+	struct {
+		/* Global bridge info */
+		unsigned int		total_pe_num;
+		unsigned int		reserved_pe_idx;
+		unsigned int		root_pe_idx;
+
+		/* 32-bit MMIO window */
+		unsigned int		m32_size;
+		unsigned int		m32_segsize;
+		unsigned int		m32_pci_base;
+
+		/* 64-bit MMIO window */
+		unsigned int		m64_bar_idx;
+		unsigned long		m64_size;
+		unsigned long		m64_segsize;
+		unsigned long		m64_base;
+#define MAX_M64_BARS 64
+		unsigned long		m64_bar_alloc;
+
+		/* IO ports */
+		unsigned int		io_size;
+		unsigned int		io_segsize;
+		unsigned int		io_pci_base;
+
+		/* PE allocation */
+		struct mutex		pe_alloc_mutex;
+		unsigned long		*pe_alloc;
+		struct pnv_ioda_pe	*pe_array;
+
+		/* M32 & IO segment maps */
+		unsigned int		*m64_segmap;
+		unsigned int		*m32_segmap;
+		unsigned int		*io_segmap;
+
+		/* DMA32 segment maps - IODA1 only */
+		unsigned int		dma32_count;
+		unsigned int		*dma32_segmap;
+
+		/* IRQ chip */
+		int			irq_chip_init;
+		struct irq_chip		irq_chip;
+
+		/* Sorted list of used PE's based
+		 * on the sequence of creation
+		 */
+		struct list_head	pe_list;
+		struct mutex            pe_list_mutex;
+
+		/* Reverse map of PEs, indexed by {bus, devfn} */
+		unsigned int		pe_rmap[0x10000];
+	} ioda;
+
+	/* PHB and hub diagnostics */
+	unsigned int		diag_data_size;
+	u8			*diag_data;
+};
+
+
+/* IODA PE management */
+
+static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r)
+{
+	/*
+	 * WARNING: We cannot rely on the resource flags. The Linux PCI
+	 * allocation code sometimes decides to put a 64-bit prefetchable
+	 * BAR in the 32-bit window, so we have to compare the addresses.
+	 *
+	 * For simplicity we only test resource start.
+	 */
+	return (r->start >= phb->ioda.m64_base &&
+		r->start < (phb->ioda.m64_base + phb->ioda.m64_size));
+}
+
+static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags)
+{
+	unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
+
+	return (resource_flags & flags) == flags;
+}
+
+int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe);
+int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe);
+
+void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe);
+void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe);
+
+struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count);
+void pnv_ioda_free_pe(struct pnv_ioda_pe *pe);
+
+#ifdef CONFIG_PCI_IOV
+/*
+ * For SR-IOV we want to put each VF's MMIO resource in to a separate PE.
+ * This requires a bit of acrobatics with the MMIO -> PE configuration
+ * and this structure is used to keep track of it all.
+ */
+struct pnv_iov_data {
+	/* number of VFs enabled */
+	u16     num_vfs;
+
+	/* pointer to the array of VF PEs. num_vfs long*/
+	struct pnv_ioda_pe *vf_pe_arr;
+
+	/* Did we map the VF BAR with single-PE IODA BARs? */
+	bool    m64_single_mode[PCI_SRIOV_NUM_BARS];
+
+	/*
+	 * True if we're using any segmented windows. In that case we need
+	 * shift the start of the IOV resource the segment corresponding to
+	 * the allocated PE.
+	 */
+	bool    need_shift;
+
+	/*
+	 * Bit mask used to track which m64 windows are used to map the
+	 * SR-IOV BARs for this device.
+	 */
+	DECLARE_BITMAP(used_m64_bar_mask, MAX_M64_BARS);
+
+	/*
+	 * If we map the SR-IOV BARs with a segmented window then
+	 * parts of that window will be "claimed" by other PEs.
+	 *
+	 * "holes" here is used to reserve the leading portion
+	 * of the window that is used by other (non VF) PEs.
+	 */
+	struct resource holes[PCI_SRIOV_NUM_BARS];
+};
+
+static inline struct pnv_iov_data *pnv_iov_get(struct pci_dev *pdev)
+{
+	return pdev->dev.archdata.iov_data;
+}
+
+void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev);
+resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, int resno);
+
+int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs);
+int pnv_pcibios_sriov_disable(struct pci_dev *pdev);
+#endif /* CONFIG_PCI_IOV */
+
+extern struct pci_ops pnv_pci_ops;
+
+void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
+				unsigned char *log_buff);
+int pnv_pci_cfg_read(struct pci_dn *pdn,
+		     int where, int size, u32 *val);
+int pnv_pci_cfg_write(struct pci_dn *pdn,
+		      int where, int size, u32 val);
+extern struct iommu_table *pnv_pci_table_alloc(int nid);
+
+extern void pnv_pci_init_ioda_hub(struct device_node *np);
+extern void pnv_pci_init_ioda2_phb(struct device_node *np);
+extern void pnv_pci_init_npu_phb(struct device_node *np);
+extern void pnv_pci_init_npu2_opencapi_phb(struct device_node *np);
+extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr);
+extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
+extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
+
+extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
+extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
+extern struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn);
+extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
+extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
+extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
+		__u64 window_size, __u32 levels);
+extern int pnv_eeh_post_init(void);
+
+__printf(3, 4)
+extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
+			    const char *fmt, ...);
+#define pe_err(pe, fmt, ...)					\
+	pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__)
+#define pe_warn(pe, fmt, ...)					\
+	pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__)
+#define pe_info(pe, fmt, ...)					\
+	pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
+
+/* Nvlink functions */
+extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
+extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
+extern void pnv_pci_npu_setup_iommu_groups(void);
+
+/* pci-ioda-tce.c */
+#define POWERNV_IOMMU_DEFAULT_LEVELS	2
+#define POWERNV_IOMMU_MAX_LEVELS	5
+
+extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
+		unsigned long uaddr, enum dma_data_direction direction,
+		unsigned long attrs);
+extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
+extern int pnv_tce_xchg(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction,
+		bool alloc);
+extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index,
+		bool alloc);
+extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
+
+extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
+		__u32 page_shift, __u64 window_size, __u32 levels,
+		bool alloc_userspace_copy, struct iommu_table *tbl);
+extern void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
+
+extern long pnv_pci_link_table_and_group(int node, int num,
+		struct iommu_table *tbl,
+		struct iommu_table_group *table_group);
+extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
+		struct iommu_table_group *table_group);
+extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
+		void *tce_mem, u64 tce_size,
+		u64 dma_offset, unsigned int page_shift);
+
+extern unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
+
+static inline struct pnv_phb *pci_bus_to_pnvhb(struct pci_bus *bus)
+{
+	struct pci_controller *hose = bus->sysdata;
+
+	if (hose)
+		return hose->private_data;
+
+	return NULL;
+}
+
+#endif /* __POWERNV_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
new file mode 100644
index 000000000..528946ee7
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _POWERNV_H
+#define _POWERNV_H
+
+/*
+ * There's various hacks scattered throughout the generic powerpc arch code
+ * that needs to call into powernv platform stuff. The prototypes for those
+ * functions are in asm/powernv.h
+ */
+#include <asm/powernv.h>
+
+#ifdef CONFIG_SMP
+extern void pnv_smp_init(void);
+#else
+static inline void pnv_smp_init(void) { }
+#endif
+
+extern void pnv_platform_error_reboot(struct pt_regs *regs, const char *msg) __noreturn;
+
+struct pci_dev;
+
+#ifdef CONFIG_PCI
+extern void pnv_pci_init(void);
+extern void pnv_pci_shutdown(void);
+#else
+static inline void pnv_pci_init(void) { }
+static inline void pnv_pci_shutdown(void) { }
+#endif
+
+extern u32 pnv_get_supported_cpuidle_states(void);
+
+extern void pnv_lpc_init(void);
+
+extern void opal_handle_events(void);
+extern bool opal_have_pending_events(void);
+extern void opal_event_shutdown(void);
+
+bool cpu_core_split_required(void);
+
+struct memcons;
+ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count);
+u32 memcons_get_size(struct memcons *mc);
+struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name);
+
+void pnv_rng_init(void);
+
+#endif /* _POWERNV_H */
diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c
new file mode 100644
index 000000000..a99033c3d
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/rng.c
@@ -0,0 +1,220 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2013, Michael Ellerman, IBM Corporation.
+ */
+
+#define pr_fmt(fmt)	"powernv-rng: " fmt
+
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <asm/archrandom.h>
+#include <asm/cputable.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/smp.h>
+#include "powernv.h"
+
+#define DARN_ERR 0xFFFFFFFFFFFFFFFFul
+
+struct powernv_rng {
+	void __iomem *regs;
+	void __iomem *regs_real;
+	unsigned long mask;
+};
+
+static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng);
+
+int powernv_hwrng_present(void)
+{
+	struct powernv_rng *rng;
+
+	rng = get_cpu_var(powernv_rng);
+	put_cpu_var(rng);
+	return rng != NULL;
+}
+
+static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
+{
+	unsigned long parity;
+
+	/* Calculate the parity of the value */
+	asm (".machine push;   \
+	      .machine power7; \
+	      popcntd %0,%1;   \
+	      .machine pop;"
+	     : "=r" (parity) : "r" (val));
+
+	/* xor our value with the previous mask */
+	val ^= rng->mask;
+
+	/* update the mask based on the parity of this value */
+	rng->mask = (rng->mask << 1) | (parity & 1);
+
+	return val;
+}
+
+int powernv_get_random_real_mode(unsigned long *v)
+{
+	struct powernv_rng *rng;
+
+	rng = raw_cpu_read(powernv_rng);
+	if (!rng)
+		return 0;
+
+	*v = rng_whiten(rng, __raw_rm_readq(rng->regs_real));
+
+	return 1;
+}
+
+static int powernv_get_random_darn(unsigned long *v)
+{
+	unsigned long val;
+
+	/* Using DARN with L=1 - 64-bit conditioned random number */
+	asm volatile(PPC_DARN(%0, 1) : "=r"(val));
+
+	if (val == DARN_ERR)
+		return 0;
+
+	*v = val;
+
+	return 1;
+}
+
+static int initialise_darn(void)
+{
+	unsigned long val;
+	int i;
+
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		return -ENODEV;
+
+	for (i = 0; i < 10; i++) {
+		if (powernv_get_random_darn(&val)) {
+			ppc_md.get_random_seed = powernv_get_random_darn;
+			return 0;
+		}
+	}
+	return -EIO;
+}
+
+int powernv_get_random_long(unsigned long *v)
+{
+	struct powernv_rng *rng;
+
+	rng = get_cpu_var(powernv_rng);
+
+	*v = rng_whiten(rng, in_be64(rng->regs));
+
+	put_cpu_var(rng);
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(powernv_get_random_long);
+
+static __init void rng_init_per_cpu(struct powernv_rng *rng,
+				    struct device_node *dn)
+{
+	int chip_id, cpu;
+
+	chip_id = of_get_ibm_chip_id(dn);
+	if (chip_id == -1)
+		pr_warn("No ibm,chip-id found for %pOF.\n", dn);
+
+	for_each_possible_cpu(cpu) {
+		if (per_cpu(powernv_rng, cpu) == NULL ||
+		    cpu_to_chip_id(cpu) == chip_id) {
+			per_cpu(powernv_rng, cpu) = rng;
+		}
+	}
+}
+
+static __init int rng_create(struct device_node *dn)
+{
+	struct powernv_rng *rng;
+	struct resource res;
+	unsigned long val;
+
+	rng = kzalloc(sizeof(*rng), GFP_KERNEL);
+	if (!rng)
+		return -ENOMEM;
+
+	if (of_address_to_resource(dn, 0, &res)) {
+		kfree(rng);
+		return -ENXIO;
+	}
+
+	rng->regs_real = (void __iomem *)res.start;
+
+	rng->regs = of_iomap(dn, 0);
+	if (!rng->regs) {
+		kfree(rng);
+		return -ENXIO;
+	}
+
+	val = in_be64(rng->regs);
+	rng->mask = val;
+
+	rng_init_per_cpu(rng, dn);
+
+	ppc_md.get_random_seed = powernv_get_random_long;
+
+	return 0;
+}
+
+static int __init pnv_get_random_long_early(unsigned long *v)
+{
+	struct device_node *dn;
+
+	if (!slab_is_available())
+		return 0;
+
+	if (cmpxchg(&ppc_md.get_random_seed, pnv_get_random_long_early,
+		    NULL) != pnv_get_random_long_early)
+		return 0;
+
+	for_each_compatible_node(dn, NULL, "ibm,power-rng")
+		rng_create(dn);
+
+	if (!ppc_md.get_random_seed)
+		return 0;
+	return ppc_md.get_random_seed(v);
+}
+
+void __init pnv_rng_init(void)
+{
+	struct device_node *dn;
+
+	/* Prefer darn over the rest. */
+	if (!initialise_darn())
+		return;
+
+	dn = of_find_compatible_node(NULL, NULL, "ibm,power-rng");
+	if (dn)
+		ppc_md.get_random_seed = pnv_get_random_long_early;
+
+	of_node_put(dn);
+}
+
+static int __init pnv_rng_late_init(void)
+{
+	struct device_node *dn;
+	unsigned long v;
+
+	/* In case it wasn't called during init for some other reason. */
+	if (ppc_md.get_random_seed == pnv_get_random_long_early)
+		pnv_get_random_long_early(&v);
+
+	if (ppc_md.get_random_seed == powernv_get_random_long) {
+		for_each_compatible_node(dn, NULL, "ibm,power-rng")
+			of_platform_device_create(dn, NULL, NULL);
+	}
+
+	return 0;
+}
+machine_subsys_initcall(powernv, pnv_rng_late_init);
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
new file mode 100644
index 000000000..1a2f12dc0
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -0,0 +1,563 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerNV setup code.
+ *
+ * Copyright 2011 IBM Corp.
+ */
+
+#undef DEBUG
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/tty.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/interrupt.h>
+#include <linux/bug.h>
+#include <linux/pci.h>
+#include <linux/cpufreq.h>
+#include <linux/memblock.h>
+
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/xics.h>
+#include <asm/xive.h>
+#include <asm/opal.h>
+#include <asm/kexec.h>
+#include <asm/smp.h>
+#include <asm/tm.h>
+#include <asm/setup.h>
+#include <asm/security_features.h>
+
+#include "powernv.h"
+
+
+static bool fw_feature_is(const char *state, const char *name,
+			  struct device_node *fw_features)
+{
+	struct device_node *np;
+	bool rc = false;
+
+	np = of_get_child_by_name(fw_features, name);
+	if (np) {
+		rc = of_property_read_bool(np, state);
+		of_node_put(np);
+	}
+
+	return rc;
+}
+
+static void init_fw_feat_flags(struct device_node *np)
+{
+	if (fw_feature_is("enabled", "inst-spec-barrier-ori31,31,0", np))
+		security_ftr_set(SEC_FTR_SPEC_BAR_ORI31);
+
+	if (fw_feature_is("enabled", "fw-bcctrl-serialized", np))
+		security_ftr_set(SEC_FTR_BCCTRL_SERIALISED);
+
+	if (fw_feature_is("enabled", "inst-l1d-flush-ori30,30,0", np))
+		security_ftr_set(SEC_FTR_L1D_FLUSH_ORI30);
+
+	if (fw_feature_is("enabled", "inst-l1d-flush-trig2", np))
+		security_ftr_set(SEC_FTR_L1D_FLUSH_TRIG2);
+
+	if (fw_feature_is("enabled", "fw-l1d-thread-split", np))
+		security_ftr_set(SEC_FTR_L1D_THREAD_PRIV);
+
+	if (fw_feature_is("enabled", "fw-count-cache-disabled", np))
+		security_ftr_set(SEC_FTR_COUNT_CACHE_DISABLED);
+
+	if (fw_feature_is("enabled", "fw-count-cache-flush-bcctr2,0,0", np))
+		security_ftr_set(SEC_FTR_BCCTR_FLUSH_ASSIST);
+
+	if (fw_feature_is("enabled", "needs-count-cache-flush-on-context-switch", np))
+		security_ftr_set(SEC_FTR_FLUSH_COUNT_CACHE);
+
+	/*
+	 * The features below are enabled by default, so we instead look to see
+	 * if firmware has *disabled* them, and clear them if so.
+	 */
+	if (fw_feature_is("disabled", "speculation-policy-favor-security", np))
+		security_ftr_clear(SEC_FTR_FAVOUR_SECURITY);
+
+	if (fw_feature_is("disabled", "needs-l1d-flush-msr-pr-0-to-1", np))
+		security_ftr_clear(SEC_FTR_L1D_FLUSH_PR);
+
+	if (fw_feature_is("disabled", "needs-l1d-flush-msr-hv-1-to-0", np))
+		security_ftr_clear(SEC_FTR_L1D_FLUSH_HV);
+
+	if (fw_feature_is("disabled", "needs-spec-barrier-for-bound-checks", np))
+		security_ftr_clear(SEC_FTR_BNDS_CHK_SPEC_BAR);
+}
+
+static void pnv_setup_security_mitigations(void)
+{
+	struct device_node *np, *fw_features;
+	enum l1d_flush_type type;
+	bool enable;
+
+	/* Default to fallback in case fw-features are not available */
+	type = L1D_FLUSH_FALLBACK;
+
+	np = of_find_node_by_name(NULL, "ibm,opal");
+	fw_features = of_get_child_by_name(np, "fw-features");
+	of_node_put(np);
+
+	if (fw_features) {
+		init_fw_feat_flags(fw_features);
+		of_node_put(fw_features);
+
+		if (security_ftr_enabled(SEC_FTR_L1D_FLUSH_TRIG2))
+			type = L1D_FLUSH_MTTRIG;
+
+		if (security_ftr_enabled(SEC_FTR_L1D_FLUSH_ORI30))
+			type = L1D_FLUSH_ORI;
+	}
+
+	/*
+	 * If we are non-Power9 bare metal, we don't need to flush on kernel
+	 * entry or after user access: they fix a P9 specific vulnerability.
+	 */
+	if (!pvr_version_is(PVR_POWER9)) {
+		security_ftr_clear(SEC_FTR_L1D_FLUSH_ENTRY);
+		security_ftr_clear(SEC_FTR_L1D_FLUSH_UACCESS);
+	}
+
+	enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && \
+		 (security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR)   || \
+		  security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV));
+
+	setup_rfi_flush(type, enable);
+	setup_count_cache_flush();
+
+	enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+		 security_ftr_enabled(SEC_FTR_L1D_FLUSH_ENTRY);
+	setup_entry_flush(enable);
+
+	enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+		 security_ftr_enabled(SEC_FTR_L1D_FLUSH_UACCESS);
+	setup_uaccess_flush(enable);
+
+	setup_stf_barrier();
+}
+
+static void __init pnv_check_guarded_cores(void)
+{
+	struct device_node *dn;
+	int bad_count = 0;
+
+	for_each_node_by_type(dn, "cpu") {
+		if (of_property_match_string(dn, "status", "bad") >= 0)
+			bad_count++;
+	};
+
+	if (bad_count) {
+		printk("  _     _______________\n");
+		pr_cont(" | |   /               \\\n");
+		pr_cont(" | |   |    WARNING!   |\n");
+		pr_cont(" | |   |               |\n");
+		pr_cont(" | |   | It looks like |\n");
+		pr_cont(" |_|   |  you have %*d |\n", 3, bad_count);
+		pr_cont("  _    | guarded cores |\n");
+		pr_cont(" (_)   \\_______________/\n");
+	}
+}
+
+static void __init pnv_setup_arch(void)
+{
+	set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
+
+	pnv_setup_security_mitigations();
+
+	/* Initialize SMP */
+	pnv_smp_init();
+
+	/* Setup PCI */
+	pnv_pci_init();
+
+	/* Setup RTC and NVRAM callbacks */
+	if (firmware_has_feature(FW_FEATURE_OPAL))
+		opal_nvram_init();
+
+	/* Enable NAP mode */
+	powersave_nap = 1;
+
+	pnv_check_guarded_cores();
+
+	/* XXX PMCS */
+
+	pnv_rng_init();
+}
+
+static void __init pnv_init(void)
+{
+	/*
+	 * Initialize the LPC bus now so that legacy serial
+	 * ports can be found on it
+	 */
+	opal_lpc_init();
+
+#ifdef CONFIG_HVC_OPAL
+	if (firmware_has_feature(FW_FEATURE_OPAL))
+		hvc_opal_init_early();
+	else
+#endif
+		add_preferred_console("hvc", 0, NULL);
+
+	if (!radix_enabled()) {
+		size_t size = sizeof(struct slb_entry) * mmu_slb_size;
+		int i;
+
+		/* Allocate per cpu area to save old slb contents during MCE */
+		for_each_possible_cpu(i) {
+			paca_ptrs[i]->mce_faulty_slbs =
+					memblock_alloc_node(size,
+						__alignof__(struct slb_entry),
+						cpu_to_node(i));
+		}
+	}
+}
+
+static void __init pnv_init_IRQ(void)
+{
+	/* Try using a XIVE if available, otherwise use a XICS */
+	if (!xive_native_init())
+		xics_init();
+
+	WARN_ON(!ppc_md.get_irq);
+}
+
+static void pnv_show_cpuinfo(struct seq_file *m)
+{
+	struct device_node *root;
+	const char *model = "";
+
+	root = of_find_node_by_path("/");
+	if (root)
+		model = of_get_property(root, "model", NULL);
+	seq_printf(m, "machine\t\t: PowerNV %s\n", model);
+	if (firmware_has_feature(FW_FEATURE_OPAL))
+		seq_printf(m, "firmware\t: OPAL\n");
+	else
+		seq_printf(m, "firmware\t: BML\n");
+	of_node_put(root);
+	if (radix_enabled())
+		seq_printf(m, "MMU\t\t: Radix\n");
+	else
+		seq_printf(m, "MMU\t\t: Hash\n");
+}
+
+static void pnv_prepare_going_down(void)
+{
+	/*
+	 * Disable all notifiers from OPAL, we can't
+	 * service interrupts anymore anyway
+	 */
+	opal_event_shutdown();
+
+	/* Print flash update message if one is scheduled. */
+	opal_flash_update_print_message();
+
+	smp_send_stop();
+
+	hard_irq_disable();
+}
+
+static void  __noreturn pnv_restart(char *cmd)
+{
+	long rc;
+
+	pnv_prepare_going_down();
+
+	do {
+		if (!cmd || !strlen(cmd))
+			rc = opal_cec_reboot();
+		else if (strcmp(cmd, "full") == 0)
+			rc = opal_cec_reboot2(OPAL_REBOOT_FULL_IPL, NULL);
+		else if (strcmp(cmd, "mpipl") == 0)
+			rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, NULL);
+		else if (strcmp(cmd, "error") == 0)
+			rc = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, NULL);
+		else if (strcmp(cmd, "fast") == 0)
+			rc = opal_cec_reboot2(OPAL_REBOOT_FAST, NULL);
+		else
+			rc = OPAL_UNSUPPORTED;
+
+		if (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+			/* Opal is busy wait for some time and retry */
+			opal_poll_events(NULL);
+			mdelay(10);
+
+		} else	if (cmd && rc) {
+			/* Unknown error while issuing reboot */
+			if (rc == OPAL_UNSUPPORTED)
+				pr_err("Unsupported '%s' reboot.\n", cmd);
+			else
+				pr_err("Unable to issue '%s' reboot. Err=%ld\n",
+				       cmd, rc);
+			pr_info("Forcing a cec-reboot\n");
+			cmd = NULL;
+			rc = OPAL_BUSY;
+
+		} else if (rc != OPAL_SUCCESS) {
+			/* Unknown error while issuing cec-reboot */
+			pr_err("Unable to reboot. Err=%ld\n", rc);
+		}
+
+	} while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT);
+
+	for (;;)
+		opal_poll_events(NULL);
+}
+
+static void __noreturn pnv_power_off(void)
+{
+	long rc = OPAL_BUSY;
+
+	pnv_prepare_going_down();
+
+	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+		rc = opal_cec_power_down(0);
+		if (rc == OPAL_BUSY_EVENT)
+			opal_poll_events(NULL);
+		else
+			mdelay(10);
+	}
+	for (;;)
+		opal_poll_events(NULL);
+}
+
+static void __noreturn pnv_halt(void)
+{
+	pnv_power_off();
+}
+
+static void pnv_progress(char *s, unsigned short hex)
+{
+}
+
+static void pnv_shutdown(void)
+{
+	/* Let the PCI code clear up IODA tables */
+	pnv_pci_shutdown();
+
+	/*
+	 * Stop OPAL activity: Unregister all OPAL interrupts so they
+	 * don't fire up while we kexec and make sure all potentially
+	 * DMA'ing ops are complete (such as dump retrieval).
+	 */
+	opal_shutdown();
+}
+
+#ifdef CONFIG_KEXEC_CORE
+static void pnv_kexec_wait_secondaries_down(void)
+{
+	int my_cpu, i, notified = -1;
+
+	my_cpu = get_cpu();
+
+	for_each_online_cpu(i) {
+		uint8_t status;
+		int64_t rc, timeout = 1000;
+
+		if (i == my_cpu)
+			continue;
+
+		for (;;) {
+			rc = opal_query_cpu_status(get_hard_smp_processor_id(i),
+						   &status);
+			if (rc != OPAL_SUCCESS || status != OPAL_THREAD_STARTED)
+				break;
+			barrier();
+			if (i != notified) {
+				printk(KERN_INFO "kexec: waiting for cpu %d "
+				       "(physical %d) to enter OPAL\n",
+				       i, paca_ptrs[i]->hw_cpu_id);
+				notified = i;
+			}
+
+			/*
+			 * On crash secondaries might be unreachable or hung,
+			 * so timeout if we've waited too long
+			 * */
+			mdelay(1);
+			if (timeout-- == 0) {
+				printk(KERN_ERR "kexec: timed out waiting for "
+				       "cpu %d (physical %d) to enter OPAL\n",
+				       i, paca_ptrs[i]->hw_cpu_id);
+				break;
+			}
+		}
+	}
+}
+
+static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
+{
+	u64 reinit_flags;
+
+	if (xive_enabled())
+		xive_teardown_cpu();
+	else
+		xics_kexec_teardown_cpu(secondary);
+
+	/* On OPAL, we return all CPUs to firmware */
+	if (!firmware_has_feature(FW_FEATURE_OPAL))
+		return;
+
+	if (secondary) {
+		/* Return secondary CPUs to firmware on OPAL v3 */
+		mb();
+		get_paca()->kexec_state = KEXEC_STATE_REAL_MODE;
+		mb();
+
+		/* Return the CPU to OPAL */
+		opal_return_cpu();
+	} else {
+		/* Primary waits for the secondaries to have reached OPAL */
+		pnv_kexec_wait_secondaries_down();
+
+		/* Switch XIVE back to emulation mode */
+		if (xive_enabled())
+			xive_shutdown();
+
+		/*
+		 * We might be running as little-endian - now that interrupts
+		 * are disabled, reset the HILE bit to big-endian so we don't
+		 * take interrupts in the wrong endian later
+		 *
+		 * We reinit to enable both radix and hash on P9 to ensure
+		 * the mode used by the next kernel is always supported.
+		 */
+		reinit_flags = OPAL_REINIT_CPUS_HILE_BE;
+		if (cpu_has_feature(CPU_FTR_ARCH_300))
+			reinit_flags |= OPAL_REINIT_CPUS_MMU_RADIX |
+				OPAL_REINIT_CPUS_MMU_HASH;
+		opal_reinit_cpus(reinit_flags);
+	}
+}
+#endif /* CONFIG_KEXEC_CORE */
+
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+static unsigned long pnv_memory_block_size(void)
+{
+	/*
+	 * We map the kernel linear region with 1GB large pages on radix. For
+	 * memory hot unplug to work our memory block size must be at least
+	 * this size.
+	 */
+	if (radix_enabled())
+		return radix_mem_block_size;
+	else
+		return 256UL * 1024 * 1024;
+}
+#endif
+
+static void __init pnv_setup_machdep_opal(void)
+{
+	ppc_md.get_boot_time = opal_get_boot_time;
+	ppc_md.restart = pnv_restart;
+	pm_power_off = pnv_power_off;
+	ppc_md.halt = pnv_halt;
+	/* ppc_md.system_reset_exception gets filled in by pnv_smp_init() */
+	ppc_md.machine_check_exception = opal_machine_check;
+	ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
+	if (opal_check_token(OPAL_HANDLE_HMI2))
+		ppc_md.hmi_exception_early = opal_hmi_exception_early2;
+	else
+		ppc_md.hmi_exception_early = opal_hmi_exception_early;
+	ppc_md.handle_hmi_exception = opal_handle_hmi_exception;
+}
+
+static int __init pnv_probe(void)
+{
+	if (!of_machine_is_compatible("ibm,powernv"))
+		return 0;
+
+	if (firmware_has_feature(FW_FEATURE_OPAL))
+		pnv_setup_machdep_opal();
+
+	pr_debug("PowerNV detected !\n");
+
+	pnv_init();
+
+	return 1;
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+void __init pnv_tm_init(void)
+{
+	if (!firmware_has_feature(FW_FEATURE_OPAL) ||
+	    !pvr_version_is(PVR_POWER9) ||
+	    early_cpu_has_feature(CPU_FTR_TM))
+		return;
+
+	if (opal_reinit_cpus(OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED) != OPAL_SUCCESS)
+		return;
+
+	pr_info("Enabling TM (Transactional Memory) with Suspend Disabled\n");
+	cur_cpu_spec->cpu_features |= CPU_FTR_TM;
+	/* Make sure "normal" HTM is off (it should be) */
+	cur_cpu_spec->cpu_user_features2 &= ~PPC_FEATURE2_HTM;
+	/* Turn on no suspend mode, and HTM no SC */
+	cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_HTM_NO_SUSPEND | \
+					    PPC_FEATURE2_HTM_NOSC;
+	tm_suspend_disabled = true;
+}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+/*
+ * Returns the cpu frequency for 'cpu' in Hz. This is used by
+ * /proc/cpuinfo
+ */
+static unsigned long pnv_get_proc_freq(unsigned int cpu)
+{
+	unsigned long ret_freq;
+
+	ret_freq = cpufreq_get(cpu) * 1000ul;
+
+	/*
+	 * If the backend cpufreq driver does not exist,
+         * then fallback to old way of reporting the clockrate.
+	 */
+	if (!ret_freq)
+		ret_freq = ppc_proc_freq;
+	return ret_freq;
+}
+
+static long pnv_machine_check_early(struct pt_regs *regs)
+{
+	long handled = 0;
+
+	if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
+		handled = cur_cpu_spec->machine_check_early(regs);
+
+	return handled;
+}
+
+define_machine(powernv) {
+	.name			= "PowerNV",
+	.probe			= pnv_probe,
+	.setup_arch		= pnv_setup_arch,
+	.init_IRQ		= pnv_init_IRQ,
+	.show_cpuinfo		= pnv_show_cpuinfo,
+	.get_proc_freq          = pnv_get_proc_freq,
+	.progress		= pnv_progress,
+	.machine_shutdown	= pnv_shutdown,
+	.power_save             = NULL,
+	.calibrate_decr		= generic_calibrate_decr,
+	.machine_check_early	= pnv_machine_check_early,
+#ifdef CONFIG_KEXEC_CORE
+	.kexec_cpu_down		= pnv_kexec_cpu_down,
+#endif
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+	.memory_block_size	= pnv_memory_block_size,
+#endif
+};
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
new file mode 100644
index 000000000..cbb67813c
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -0,0 +1,441 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SMP support for PowerNV machines.
+ *
+ * Copyright 2011 IBM Corp.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sched/hotplug.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+
+#include <asm/irq.h>
+#include <asm/smp.h>
+#include <asm/paca.h>
+#include <asm/machdep.h>
+#include <asm/cputable.h>
+#include <asm/firmware.h>
+#include <asm/vdso_datapage.h>
+#include <asm/cputhreads.h>
+#include <asm/xics.h>
+#include <asm/xive.h>
+#include <asm/opal.h>
+#include <asm/runlatch.h>
+#include <asm/code-patching.h>
+#include <asm/dbell.h>
+#include <asm/kvm_ppc.h>
+#include <asm/ppc-opcode.h>
+#include <asm/cpuidle.h>
+#include <asm/kexec.h>
+#include <asm/reg.h>
+#include <asm/powernv.h>
+
+#include "powernv.h"
+
+#ifdef DEBUG
+#include <asm/udbg.h>
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...) do { } while (0)
+#endif
+
+static void pnv_smp_setup_cpu(int cpu)
+{
+	/*
+	 * P9 workaround for CI vector load (see traps.c),
+	 * enable the corresponding HMI interrupt
+	 */
+	if (pvr_version_is(PVR_POWER9))
+		mtspr(SPRN_HMEER, mfspr(SPRN_HMEER) | PPC_BIT(17));
+
+	if (xive_enabled())
+		xive_smp_setup_cpu();
+	else if (cpu != boot_cpuid)
+		xics_setup_cpu();
+}
+
+static int pnv_smp_kick_cpu(int nr)
+{
+	unsigned int pcpu;
+	unsigned long start_here =
+			__pa(ppc_function_entry(generic_secondary_smp_init));
+	long rc;
+	uint8_t status;
+
+	if (nr < 0 || nr >= nr_cpu_ids)
+		return -EINVAL;
+
+	pcpu = get_hard_smp_processor_id(nr);
+	/*
+	 * If we already started or OPAL is not supported, we just
+	 * kick the CPU via the PACA
+	 */
+	if (paca_ptrs[nr]->cpu_start || !firmware_has_feature(FW_FEATURE_OPAL))
+		goto kick;
+
+	/*
+	 * At this point, the CPU can either be spinning on the way in
+	 * from kexec or be inside OPAL waiting to be started for the
+	 * first time. OPAL v3 allows us to query OPAL to know if it
+	 * has the CPUs, so we do that
+	 */
+	rc = opal_query_cpu_status(pcpu, &status);
+	if (rc != OPAL_SUCCESS) {
+		pr_warn("OPAL Error %ld querying CPU %d state\n", rc, nr);
+		return -ENODEV;
+	}
+
+	/*
+	 * Already started, just kick it, probably coming from
+	 * kexec and spinning
+	 */
+	if (status == OPAL_THREAD_STARTED)
+		goto kick;
+
+	/*
+	 * Available/inactive, let's kick it
+	 */
+	if (status == OPAL_THREAD_INACTIVE) {
+		pr_devel("OPAL: Starting CPU %d (HW 0x%x)...\n", nr, pcpu);
+		rc = opal_start_cpu(pcpu, start_here);
+		if (rc != OPAL_SUCCESS) {
+			pr_warn("OPAL Error %ld starting CPU %d\n", rc, nr);
+			return -ENODEV;
+		}
+	} else {
+		/*
+		 * An unavailable CPU (or any other unknown status)
+		 * shouldn't be started. It should also
+		 * not be in the possible map but currently it can
+		 * happen
+		 */
+		pr_devel("OPAL: CPU %d (HW 0x%x) is unavailable"
+			 " (status %d)...\n", nr, pcpu, status);
+		return -ENODEV;
+	}
+
+kick:
+	return smp_generic_kick_cpu(nr);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static int pnv_smp_cpu_disable(void)
+{
+	int cpu = smp_processor_id();
+
+	/* This is identical to pSeries... might consolidate by
+	 * moving migrate_irqs_away to a ppc_md with default to
+	 * the generic fixup_irqs. --BenH.
+	 */
+	set_cpu_online(cpu, false);
+	vdso_data->processorCount--;
+	if (cpu == boot_cpuid)
+		boot_cpuid = cpumask_any(cpu_online_mask);
+	if (xive_enabled())
+		xive_smp_disable_cpu();
+	else
+		xics_migrate_irqs_away();
+
+	cleanup_cpu_mmu_context();
+
+	return 0;
+}
+
+static void pnv_flush_interrupts(void)
+{
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		if (xive_enabled())
+			xive_flush_interrupt();
+		else
+			icp_opal_flush_interrupt();
+	} else {
+		icp_native_flush_interrupt();
+	}
+}
+
+static void pnv_cpu_offline_self(void)
+{
+	unsigned long srr1, unexpected_mask, wmask;
+	unsigned int cpu;
+	u64 lpcr_val;
+
+	/* Standard hot unplug procedure */
+
+	idle_task_exit();
+	cpu = smp_processor_id();
+	DBG("CPU%d offline\n", cpu);
+	generic_set_cpu_dead(cpu);
+	smp_wmb();
+
+	wmask = SRR1_WAKEMASK;
+	if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		wmask = SRR1_WAKEMASK_P8;
+
+	/*
+	 * This turns the irq soft-disabled state we're called with, into a
+	 * hard-disabled state with pending irq_happened interrupts cleared.
+	 *
+	 * PACA_IRQ_DEC   - Decrementer should be ignored.
+	 * PACA_IRQ_HMI   - Can be ignored, processing is done in real mode.
+	 * PACA_IRQ_DBELL, EE, PMI - Unexpected.
+	 */
+	hard_irq_disable();
+	if (generic_check_cpu_restart(cpu))
+		goto out;
+
+	unexpected_mask = ~(PACA_IRQ_DEC | PACA_IRQ_HMI | PACA_IRQ_HARD_DIS);
+	if (local_paca->irq_happened & unexpected_mask) {
+		if (local_paca->irq_happened & PACA_IRQ_EE)
+			pnv_flush_interrupts();
+		DBG("CPU%d Unexpected exit while offline irq_happened=%lx!\n",
+				cpu, local_paca->irq_happened);
+	}
+	local_paca->irq_happened = PACA_IRQ_HARD_DIS;
+
+	/*
+	 * We don't want to take decrementer interrupts while we are
+	 * offline, so clear LPCR:PECE1. We keep PECE2 (and
+	 * LPCR_PECE_HVEE on P9) enabled so as to let IPIs in.
+	 *
+	 * If the CPU gets woken up by a special wakeup, ensure that
+	 * the SLW engine sets LPCR with decrementer bit cleared, else
+	 * the CPU will come back to the kernel due to a spurious
+	 * wakeup.
+	 */
+	lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1;
+	pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val);
+
+	while (!generic_check_cpu_restart(cpu)) {
+		/*
+		 * Clear IPI flag, since we don't handle IPIs while
+		 * offline, except for those when changing micro-threading
+		 * mode, which are handled explicitly below, and those
+		 * for coming online, which are handled via
+		 * generic_check_cpu_restart() calls.
+		 */
+		kvmppc_clear_host_ipi(cpu);
+
+		srr1 = pnv_cpu_offline(cpu);
+
+		WARN_ON_ONCE(!irqs_disabled());
+		WARN_ON(lazy_irq_pending());
+
+		/*
+		 * If the SRR1 value indicates that we woke up due to
+		 * an external interrupt, then clear the interrupt.
+		 * We clear the interrupt before checking for the
+		 * reason, so as to avoid a race where we wake up for
+		 * some other reason, find nothing and clear the interrupt
+		 * just as some other cpu is sending us an interrupt.
+		 * If we returned from power7_nap as a result of
+		 * having finished executing in a KVM guest, then srr1
+		 * contains 0.
+		 */
+		if (((srr1 & wmask) == SRR1_WAKEEE) ||
+		    ((srr1 & wmask) == SRR1_WAKEHVI)) {
+			pnv_flush_interrupts();
+		} else if ((srr1 & wmask) == SRR1_WAKEHDBELL) {
+			unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+			asm volatile(PPC_MSGCLR(%0) : : "r" (msg));
+		} else if ((srr1 & wmask) == SRR1_WAKERESET) {
+			irq_set_pending_from_srr1(srr1);
+			/* Does not return */
+		}
+
+		smp_mb();
+
+		/*
+		 * For kdump kernels, we process the ipi and jump to
+		 * crash_ipi_callback
+		 */
+		if (kdump_in_progress()) {
+			/*
+			 * If we got to this point, we've not used
+			 * NMI's, otherwise we would have gone
+			 * via the SRR1_WAKERESET path. We are
+			 * using regular IPI's for waking up offline
+			 * threads.
+			 */
+			struct pt_regs regs;
+
+			ppc_save_regs(&regs);
+			crash_ipi_callback(&regs);
+			/* Does not return */
+		}
+
+		if (cpu_core_split_required())
+			continue;
+
+		if (srr1 && !generic_check_cpu_restart(cpu))
+			DBG("CPU%d Unexpected exit while offline srr1=%lx!\n",
+					cpu, srr1);
+
+	}
+
+	/*
+	 * Re-enable decrementer interrupts in LPCR.
+	 *
+	 * Further, we want stop states to be woken up by decrementer
+	 * for non-hotplug cases. So program the LPCR via stop api as
+	 * well.
+	 */
+	lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1;
+	pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val);
+out:
+	DBG("CPU%d coming online...\n", cpu);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static int pnv_cpu_bootable(unsigned int nr)
+{
+	/*
+	 * Starting with POWER8, the subcore logic relies on all threads of a
+	 * core being booted so that they can participate in split mode
+	 * switches. So on those machines we ignore the smt_enabled_at_boot
+	 * setting (smt-enabled on the kernel command line).
+	 */
+	if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		return 1;
+
+	return smp_generic_cpu_bootable(nr);
+}
+
+static int pnv_smp_prepare_cpu(int cpu)
+{
+	if (xive_enabled())
+		return xive_smp_prepare_cpu(cpu);
+	return 0;
+}
+
+/* Cause IPI as setup by the interrupt controller (xics or xive) */
+static void (*ic_cause_ipi)(int cpu);
+
+static void pnv_cause_ipi(int cpu)
+{
+	if (doorbell_try_core_ipi(cpu))
+		return;
+
+	ic_cause_ipi(cpu);
+}
+
+static void __init pnv_smp_probe(void)
+{
+	if (xive_enabled())
+		xive_smp_probe();
+	else
+		xics_smp_probe();
+
+	if (cpu_has_feature(CPU_FTR_DBELL)) {
+		ic_cause_ipi = smp_ops->cause_ipi;
+		WARN_ON(!ic_cause_ipi);
+
+		if (cpu_has_feature(CPU_FTR_ARCH_300))
+			smp_ops->cause_ipi = doorbell_global_ipi;
+		else
+			smp_ops->cause_ipi = pnv_cause_ipi;
+	}
+}
+
+static int pnv_system_reset_exception(struct pt_regs *regs)
+{
+	if (smp_handle_nmi_ipi(regs))
+		return 1;
+	return 0;
+}
+
+static int pnv_cause_nmi_ipi(int cpu)
+{
+	int64_t rc;
+
+	if (cpu >= 0) {
+		int h = get_hard_smp_processor_id(cpu);
+
+		if (opal_check_token(OPAL_QUIESCE))
+			opal_quiesce(QUIESCE_HOLD, h);
+
+		rc = opal_signal_system_reset(h);
+
+		if (opal_check_token(OPAL_QUIESCE))
+			opal_quiesce(QUIESCE_RESUME, h);
+
+		if (rc != OPAL_SUCCESS)
+			return 0;
+		return 1;
+
+	} else if (cpu == NMI_IPI_ALL_OTHERS) {
+		bool success = true;
+		int c;
+
+		if (opal_check_token(OPAL_QUIESCE))
+			opal_quiesce(QUIESCE_HOLD, -1);
+
+		/*
+		 * We do not use broadcasts (yet), because it's not clear
+		 * exactly what semantics Linux wants or the firmware should
+		 * provide.
+		 */
+		for_each_online_cpu(c) {
+			if (c == smp_processor_id())
+				continue;
+
+			rc = opal_signal_system_reset(
+						get_hard_smp_processor_id(c));
+			if (rc != OPAL_SUCCESS)
+				success = false;
+		}
+
+		if (opal_check_token(OPAL_QUIESCE))
+			opal_quiesce(QUIESCE_RESUME, -1);
+
+		if (success)
+			return 1;
+
+		/*
+		 * Caller will fall back to doorbells, which may pick
+		 * up the remainders.
+		 */
+	}
+
+	return 0;
+}
+
+static struct smp_ops_t pnv_smp_ops = {
+	.message_pass	= NULL, /* Use smp_muxed_ipi_message_pass */
+	.cause_ipi	= NULL,	/* Filled at runtime by pnv_smp_probe() */
+	.cause_nmi_ipi	= NULL,
+	.probe		= pnv_smp_probe,
+	.prepare_cpu	= pnv_smp_prepare_cpu,
+	.kick_cpu	= pnv_smp_kick_cpu,
+	.setup_cpu	= pnv_smp_setup_cpu,
+	.cpu_bootable	= pnv_cpu_bootable,
+#ifdef CONFIG_HOTPLUG_CPU
+	.cpu_disable	= pnv_smp_cpu_disable,
+	.cpu_die	= generic_cpu_die,
+	.cpu_offline_self = pnv_cpu_offline_self,
+#endif /* CONFIG_HOTPLUG_CPU */
+};
+
+/* This is called very early during platform setup_arch */
+void __init pnv_smp_init(void)
+{
+	if (opal_check_token(OPAL_SIGNAL_SYSTEM_RESET)) {
+		ppc_md.system_reset_exception = pnv_system_reset_exception;
+		pnv_smp_ops.cause_nmi_ipi = pnv_cause_nmi_ipi;
+	}
+	smp_ops = &pnv_smp_ops;
+
+#ifdef CONFIG_HOTPLUG_CPU
+#ifdef CONFIG_KEXEC_CORE
+	crash_wake_offline = 1;
+#endif
+#endif
+}
diff --git a/arch/powerpc/platforms/powernv/subcore-asm.S b/arch/powerpc/platforms/powernv/subcore-asm.S
new file mode 100644
index 000000000..e038f6761
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/subcore-asm.S
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation.
+ */
+
+#include <asm/asm-offsets.h>
+#include <asm/ppc_asm.h>
+#include <asm/reg.h>
+
+#include "subcore.h"
+
+
+_GLOBAL(split_core_secondary_loop)
+	/*
+	 * r3 = u8 *state, used throughout the routine
+	 * r4 = temp
+	 * r5 = temp
+	 * ..
+	 * r12 = MSR
+	 */
+	mfmsr	r12
+
+	/* Disable interrupts so SRR0/1 don't get trashed */
+	li	r4,0
+	ori	r4,r4,MSR_EE|MSR_SE|MSR_BE|MSR_RI
+	andc	r4,r12,r4
+	sync
+	mtmsrd	r4
+
+	/* Switch to real mode and leave interrupts off */
+	li	r5, MSR_IR|MSR_DR
+	andc	r5, r4, r5
+
+	LOAD_REG_ADDR(r4, real_mode)
+
+	mtspr	SPRN_SRR0,r4
+	mtspr	SPRN_SRR1,r5
+	rfid
+	b	.	/* prevent speculative execution */
+
+real_mode:
+	/* Grab values from unsplit SPRs */
+	mfspr	r6,  SPRN_LDBAR
+	mfspr	r7,  SPRN_PMMAR
+	mfspr	r8,  SPRN_PMCR
+	mfspr	r9,  SPRN_RPR
+	mfspr	r10, SPRN_SDR1
+
+	/* Order reading the SPRs vs telling the primary we are ready to split */
+	sync
+
+	/* Tell thread 0 we are in real mode */
+	li	r4, SYNC_STEP_REAL_MODE
+	stb	r4, 0(r3)
+
+	li	r5, (HID0_POWER8_4LPARMODE | HID0_POWER8_2LPARMODE)@highest
+	sldi	r5, r5, 48
+
+	/* Loop until we see the split happen in HID0 */
+1:	mfspr	r4, SPRN_HID0
+	and.	r4, r4, r5
+	beq	1b
+
+	/*
+	 * We only need to initialise the below regs once for each subcore,
+	 * but it's simpler and harmless to do it on each thread.
+	 */
+
+	/* Make sure various SPRS have sane values */
+	li	r4, 0
+	mtspr	SPRN_LPID, r4
+	mtspr	SPRN_PCR, r4
+	mtspr	SPRN_HDEC, r4
+
+	/* Restore SPR values now we are split */
+	mtspr	SPRN_LDBAR, r6
+	mtspr	SPRN_PMMAR, r7
+	mtspr	SPRN_PMCR, r8
+	mtspr	SPRN_RPR, r9
+	mtspr	SPRN_SDR1, r10
+
+	LOAD_REG_ADDR(r5, virtual_mode)
+
+	/* Get out of real mode */
+	mtspr	SPRN_SRR0,r5
+	mtspr	SPRN_SRR1,r12
+	rfid
+	b	.	/* prevent speculative execution */
+
+virtual_mode:
+	blr
diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c
new file mode 100644
index 000000000..73207b53d
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/subcore.c
@@ -0,0 +1,431 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation.
+ */
+
+#define pr_fmt(fmt)	"powernv: " fmt
+
+#include <linux/kernel.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/gfp.h>
+#include <linux/smp.h>
+#include <linux/stop_machine.h>
+
+#include <asm/cputhreads.h>
+#include <asm/cpuidle.h>
+#include <asm/kvm_ppc.h>
+#include <asm/machdep.h>
+#include <asm/opal.h>
+#include <asm/smp.h>
+
+#include "subcore.h"
+#include "powernv.h"
+
+
+/*
+ * Split/unsplit procedure:
+ *
+ * A core can be in one of three states, unsplit, 2-way split, and 4-way split.
+ *
+ * The mapping to subcores_per_core is simple:
+ *
+ *  State       | subcores_per_core
+ *  ------------|------------------
+ *  Unsplit     |        1
+ *  2-way split |        2
+ *  4-way split |        4
+ *
+ * The core is split along thread boundaries, the mapping between subcores and
+ * threads is as follows:
+ *
+ *  Unsplit:
+ *          ----------------------------
+ *  Subcore |            0             |
+ *          ----------------------------
+ *  Thread  |  0  1  2  3  4  5  6  7  |
+ *          ----------------------------
+ *
+ *  2-way split:
+ *          -------------------------------------
+ *  Subcore |        0        |        1        |
+ *          -------------------------------------
+ *  Thread  |  0   1   2   3  |  4   5   6   7  |
+ *          -------------------------------------
+ *
+ *  4-way split:
+ *          -----------------------------------------
+ *  Subcore |    0    |    1    |    2    |    3    |
+ *          -----------------------------------------
+ *  Thread  |  0   1  |  2   3  |  4   5  |  6   7  |
+ *          -----------------------------------------
+ *
+ *
+ * Transitions
+ * -----------
+ *
+ * It is not possible to transition between either of the split states, the
+ * core must first be unsplit. The legal transitions are:
+ *
+ *  -----------          ---------------
+ *  |         |  <---->  | 2-way split |
+ *  |         |          ---------------
+ *  | Unsplit |
+ *  |         |          ---------------
+ *  |         |  <---->  | 4-way split |
+ *  -----------          ---------------
+ *
+ * Unsplitting
+ * -----------
+ *
+ * Unsplitting is the simpler procedure. It requires thread 0 to request the
+ * unsplit while all other threads NAP.
+ *
+ * Thread 0 clears HID0_POWER8_DYNLPARDIS (Dynamic LPAR Disable). This tells
+ * the hardware that if all threads except 0 are napping, the hardware should
+ * unsplit the core.
+ *
+ * Non-zero threads are sent to a NAP loop, they don't exit the loop until they
+ * see the core unsplit.
+ *
+ * Core 0 spins waiting for the hardware to see all the other threads napping
+ * and perform the unsplit.
+ *
+ * Once thread 0 sees the unsplit, it IPIs the secondary threads to wake them
+ * out of NAP. They will then see the core unsplit and exit the NAP loop.
+ *
+ * Splitting
+ * ---------
+ *
+ * The basic splitting procedure is fairly straight forward. However it is
+ * complicated by the fact that after the split occurs, the newly created
+ * subcores are not in a fully initialised state.
+ *
+ * Most notably the subcores do not have the correct value for SDR1, which
+ * means they must not be running in virtual mode when the split occurs. The
+ * subcores have separate timebases SPRs but these are pre-synchronised by
+ * opal.
+ *
+ * To begin with secondary threads are sent to an assembly routine. There they
+ * switch to real mode, so they are immune to the uninitialised SDR1 value.
+ * Once in real mode they indicate that they are in real mode, and spin waiting
+ * to see the core split.
+ *
+ * Thread 0 waits to see that all secondaries are in real mode, and then begins
+ * the splitting procedure. It firstly sets HID0_POWER8_DYNLPARDIS, which
+ * prevents the hardware from unsplitting. Then it sets the appropriate HID bit
+ * to request the split, and spins waiting to see that the split has happened.
+ *
+ * Concurrently the secondaries will notice the split. When they do they set up
+ * their SPRs, notably SDR1, and then they can return to virtual mode and exit
+ * the procedure.
+ */
+
+/* Initialised at boot by subcore_init() */
+static int subcores_per_core;
+
+/*
+ * Used to communicate to offline cpus that we want them to pop out of the
+ * offline loop and do a split or unsplit.
+ *
+ * 0 - no split happening
+ * 1 - unsplit in progress
+ * 2 - split to 2 in progress
+ * 4 - split to 4 in progress
+ */
+static int new_split_mode;
+
+static cpumask_var_t cpu_offline_mask;
+
+struct split_state {
+	u8 step;
+	u8 master;
+};
+
+static DEFINE_PER_CPU(struct split_state, split_state);
+
+static void wait_for_sync_step(int step)
+{
+	int i, cpu = smp_processor_id();
+
+	for (i = cpu + 1; i < cpu + threads_per_core; i++)
+		while(per_cpu(split_state, i).step < step)
+			barrier();
+
+	/* Order the wait loop vs any subsequent loads/stores. */
+	mb();
+}
+
+static void update_hid_in_slw(u64 hid0)
+{
+	u64 idle_states = pnv_get_supported_cpuidle_states();
+
+	if (idle_states & OPAL_PM_WINKLE_ENABLED) {
+		/* OPAL call to patch slw with the new HID0 value */
+		u64 cpu_pir = hard_smp_processor_id();
+
+		opal_slw_set_reg(cpu_pir, SPRN_HID0, hid0);
+	}
+}
+
+static void unsplit_core(void)
+{
+	u64 hid0, mask;
+	int i, cpu;
+
+	mask = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE;
+
+	cpu = smp_processor_id();
+	if (cpu_thread_in_core(cpu) != 0) {
+		while (mfspr(SPRN_HID0) & mask)
+			power7_idle_type(PNV_THREAD_NAP);
+
+		per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT;
+		return;
+	}
+
+	hid0 = mfspr(SPRN_HID0);
+	hid0 &= ~HID0_POWER8_DYNLPARDIS;
+	update_power8_hid0(hid0);
+	update_hid_in_slw(hid0);
+
+	while (mfspr(SPRN_HID0) & mask)
+		cpu_relax();
+
+	/* Wake secondaries out of NAP */
+	for (i = cpu + 1; i < cpu + threads_per_core; i++)
+		smp_send_reschedule(i);
+
+	wait_for_sync_step(SYNC_STEP_UNSPLIT);
+}
+
+static void split_core(int new_mode)
+{
+	struct {  u64 value; u64 mask; } split_parms[2] = {
+		{ HID0_POWER8_1TO2LPAR, HID0_POWER8_2LPARMODE },
+		{ HID0_POWER8_1TO4LPAR, HID0_POWER8_4LPARMODE }
+	};
+	int i, cpu;
+	u64 hid0;
+
+	/* Convert new_mode (2 or 4) into an index into our parms array */
+	i = (new_mode >> 1) - 1;
+	BUG_ON(i < 0 || i > 1);
+
+	cpu = smp_processor_id();
+	if (cpu_thread_in_core(cpu) != 0) {
+		split_core_secondary_loop(&per_cpu(split_state, cpu).step);
+		return;
+	}
+
+	wait_for_sync_step(SYNC_STEP_REAL_MODE);
+
+	/* Write new mode */
+	hid0  = mfspr(SPRN_HID0);
+	hid0 |= HID0_POWER8_DYNLPARDIS | split_parms[i].value;
+	update_power8_hid0(hid0);
+	update_hid_in_slw(hid0);
+
+	/* Wait for it to happen */
+	while (!(mfspr(SPRN_HID0) & split_parms[i].mask))
+		cpu_relax();
+}
+
+static void cpu_do_split(int new_mode)
+{
+	/*
+	 * At boot subcores_per_core will be 0, so we will always unsplit at
+	 * boot. In the usual case where the core is already unsplit it's a
+	 * nop, and this just ensures the kernel's notion of the mode is
+	 * consistent with the hardware.
+	 */
+	if (subcores_per_core != 1)
+		unsplit_core();
+
+	if (new_mode != 1)
+		split_core(new_mode);
+
+	mb();
+	per_cpu(split_state, smp_processor_id()).step = SYNC_STEP_FINISHED;
+}
+
+bool cpu_core_split_required(void)
+{
+	smp_rmb();
+
+	if (!new_split_mode)
+		return false;
+
+	cpu_do_split(new_split_mode);
+
+	return true;
+}
+
+void update_subcore_sibling_mask(void)
+{
+	int cpu;
+	/*
+	 * sibling mask for the first cpu. Left shift this by required bits
+	 * to get sibling mask for the rest of the cpus.
+	 */
+	int sibling_mask_first_cpu =  (1 << threads_per_subcore) - 1;
+
+	for_each_possible_cpu(cpu) {
+		int tid = cpu_thread_in_core(cpu);
+		int offset = (tid / threads_per_subcore) * threads_per_subcore;
+		int mask = sibling_mask_first_cpu << offset;
+
+		paca_ptrs[cpu]->subcore_sibling_mask = mask;
+
+	}
+}
+
+static int cpu_update_split_mode(void *data)
+{
+	int cpu, new_mode = *(int *)data;
+
+	if (this_cpu_ptr(&split_state)->master) {
+		new_split_mode = new_mode;
+		smp_wmb();
+
+		cpumask_andnot(cpu_offline_mask, cpu_present_mask,
+			       cpu_online_mask);
+
+		/* This should work even though the cpu is offline */
+		for_each_cpu(cpu, cpu_offline_mask)
+			smp_send_reschedule(cpu);
+	}
+
+	cpu_do_split(new_mode);
+
+	if (this_cpu_ptr(&split_state)->master) {
+		/* Wait for all cpus to finish before we touch subcores_per_core */
+		for_each_present_cpu(cpu) {
+			if (cpu >= setup_max_cpus)
+				break;
+
+			while(per_cpu(split_state, cpu).step < SYNC_STEP_FINISHED)
+				barrier();
+		}
+
+		new_split_mode = 0;
+
+		/* Make the new mode public */
+		subcores_per_core = new_mode;
+		threads_per_subcore = threads_per_core / subcores_per_core;
+		update_subcore_sibling_mask();
+
+		/* Make sure the new mode is written before we exit */
+		mb();
+	}
+
+	return 0;
+}
+
+static int set_subcores_per_core(int new_mode)
+{
+	struct split_state *state;
+	int cpu;
+
+	if (kvm_hv_mode_active()) {
+		pr_err("Unable to change split core mode while KVM active.\n");
+		return -EBUSY;
+	}
+
+	/*
+	 * We are only called at boot, or from the sysfs write. If that ever
+	 * changes we'll need a lock here.
+	 */
+	BUG_ON(new_mode < 1 || new_mode > 4 || new_mode == 3);
+
+	for_each_present_cpu(cpu) {
+		state = &per_cpu(split_state, cpu);
+		state->step = SYNC_STEP_INITIAL;
+		state->master = 0;
+	}
+
+	cpus_read_lock();
+
+	/* This cpu will update the globals before exiting stop machine */
+	this_cpu_ptr(&split_state)->master = 1;
+
+	/* Ensure state is consistent before we call the other cpus */
+	mb();
+
+	stop_machine_cpuslocked(cpu_update_split_mode, &new_mode,
+				cpu_online_mask);
+
+	cpus_read_unlock();
+
+	return 0;
+}
+
+static ssize_t __used store_subcores_per_core(struct device *dev,
+		struct device_attribute *attr, const char *buf,
+		size_t count)
+{
+	unsigned long val;
+	int rc;
+
+	/* We are serialised by the attribute lock */
+
+	rc = sscanf(buf, "%lx", &val);
+	if (rc != 1)
+		return -EINVAL;
+
+	switch (val) {
+	case 1:
+	case 2:
+	case 4:
+		if (subcores_per_core == val)
+			/* Nothing to do */
+			goto out;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	rc = set_subcores_per_core(val);
+	if (rc)
+		return rc;
+
+out:
+	return count;
+}
+
+static ssize_t show_subcores_per_core(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%x\n", subcores_per_core);
+}
+
+static DEVICE_ATTR(subcores_per_core, 0644,
+		show_subcores_per_core, store_subcores_per_core);
+
+static int subcore_init(void)
+{
+	unsigned pvr_ver;
+
+	pvr_ver = PVR_VER(mfspr(SPRN_PVR));
+
+	if (pvr_ver != PVR_POWER8 &&
+	    pvr_ver != PVR_POWER8E &&
+	    pvr_ver != PVR_POWER8NVL)
+		return 0;
+
+	/*
+	 * We need all threads in a core to be present to split/unsplit so
+         * continue only if max_cpus are aligned to threads_per_core.
+	 */
+	if (setup_max_cpus % threads_per_core)
+		return 0;
+
+	BUG_ON(!alloc_cpumask_var(&cpu_offline_mask, GFP_KERNEL));
+
+	set_subcores_per_core(1);
+
+	return device_create_file(cpu_subsys.dev_root,
+				  &dev_attr_subcores_per_core);
+}
+machine_device_initcall(powernv, subcore_init);
diff --git a/arch/powerpc/platforms/powernv/subcore.h b/arch/powerpc/platforms/powernv/subcore.h
new file mode 100644
index 000000000..c8f574d1c
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/subcore.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2013, Michael Ellerman, IBM Corporation.
+ */
+
+/* These are ordered and tested with <= */
+#define SYNC_STEP_INITIAL	0
+#define SYNC_STEP_UNSPLIT	1	/* Set by secondary when it sees unsplit */
+#define SYNC_STEP_REAL_MODE	2	/* Set by secondary when in real mode  */
+#define SYNC_STEP_FINISHED	3	/* Set by secondary when split/unsplit is done */
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_SMP
+void split_core_secondary_loop(u8 *state);
+extern void update_subcore_sibling_mask(void);
+#else
+static inline void update_subcore_sibling_mask(void) { };
+#endif /* CONFIG_SMP */
+
+#endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/platforms/powernv/ultravisor.c b/arch/powerpc/platforms/powernv/ultravisor.c
new file mode 100644
index 000000000..67c8c4b2d
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/ultravisor.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Ultravisor high level interfaces
+ *
+ * Copyright 2019, IBM Corporation.
+ *
+ */
+#include <linux/init.h>
+#include <linux/printk.h>
+#include <linux/of_fdt.h>
+#include <linux/of.h>
+
+#include <asm/ultravisor.h>
+#include <asm/firmware.h>
+#include <asm/machdep.h>
+
+#include "powernv.h"
+
+static struct kobject *ultravisor_kobj;
+
+int __init early_init_dt_scan_ultravisor(unsigned long node, const char *uname,
+					 int depth, void *data)
+{
+	if (!of_flat_dt_is_compatible(node, "ibm,ultravisor"))
+		return 0;
+
+	powerpc_firmware_features |= FW_FEATURE_ULTRAVISOR;
+	pr_debug("Ultravisor detected!\n");
+	return 1;
+}
+
+static struct memcons *uv_memcons;
+
+static ssize_t uv_msglog_read(struct file *file, struct kobject *kobj,
+			      struct bin_attribute *bin_attr, char *to,
+			      loff_t pos, size_t count)
+{
+	return memcons_copy(uv_memcons, to, pos, count);
+}
+
+static struct bin_attribute uv_msglog_attr = {
+	.attr = {.name = "msglog", .mode = 0400},
+	.read = uv_msglog_read
+};
+
+static int __init uv_init(void)
+{
+	struct device_node *node;
+
+	if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+		return 0;
+
+	node = of_find_compatible_node(NULL, NULL, "ibm,uv-firmware");
+	if (!node)
+		return -ENODEV;
+
+	uv_memcons = memcons_init(node, "memcons");
+	of_node_put(node);
+	if (!uv_memcons)
+		return -ENOENT;
+
+	uv_msglog_attr.size = memcons_get_size(uv_memcons);
+
+	ultravisor_kobj = kobject_create_and_add("ultravisor", firmware_kobj);
+	if (!ultravisor_kobj)
+		return -ENOMEM;
+
+	return sysfs_create_bin_file(ultravisor_kobj, &uv_msglog_attr);
+}
+machine_subsys_initcall(powernv, uv_init);
diff --git a/arch/powerpc/platforms/powernv/vas-api.c b/arch/powerpc/platforms/powernv/vas-api.c
new file mode 100644
index 000000000..98ed5d8c5
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas-api.c
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * VAS user space API for its accelerators (Only NX-GZIP is supported now)
+ * Copyright (C) 2019 Haren Myneni, IBM Corp
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/cdev.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <asm/vas.h>
+#include <uapi/asm/vas-api.h>
+#include "vas.h"
+
+/*
+ * The driver creates the device node that can be used as follows:
+ * For NX-GZIP
+ *
+ *	fd = open("/dev/crypto/nx-gzip", O_RDWR);
+ *	rc = ioctl(fd, VAS_TX_WIN_OPEN, &attr);
+ *	paste_addr = mmap(NULL, PAGE_SIZE, prot, MAP_SHARED, fd, 0ULL).
+ *	vas_copy(&crb, 0, 1);
+ *	vas_paste(paste_addr, 0, 1);
+ *	close(fd) or exit process to close window.
+ *
+ * where "vas_copy" and "vas_paste" are defined in copy-paste.h.
+ * copy/paste returns to the user space directly. So refer NX hardware
+ * documententation for exact copy/paste usage and completion / error
+ * conditions.
+ */
+
+/*
+ * Wrapper object for the nx-gzip device - there is just one instance of
+ * this node for the whole system.
+ */
+static struct coproc_dev {
+	struct cdev cdev;
+	struct device *device;
+	char *name;
+	dev_t devt;
+	struct class *class;
+	enum vas_cop_type cop_type;
+} coproc_device;
+
+struct coproc_instance {
+	struct coproc_dev *coproc;
+	struct vas_window *txwin;
+};
+
+static char *coproc_devnode(struct device *dev, umode_t *mode)
+{
+	return kasprintf(GFP_KERNEL, "crypto/%s", dev_name(dev));
+}
+
+static int coproc_open(struct inode *inode, struct file *fp)
+{
+	struct coproc_instance *cp_inst;
+
+	cp_inst = kzalloc(sizeof(*cp_inst), GFP_KERNEL);
+	if (!cp_inst)
+		return -ENOMEM;
+
+	cp_inst->coproc = container_of(inode->i_cdev, struct coproc_dev,
+					cdev);
+	fp->private_data = cp_inst;
+
+	return 0;
+}
+
+static int coproc_ioc_tx_win_open(struct file *fp, unsigned long arg)
+{
+	void __user *uptr = (void __user *)arg;
+	struct vas_tx_win_attr txattr = {};
+	struct vas_tx_win_open_attr uattr;
+	struct coproc_instance *cp_inst;
+	struct vas_window *txwin;
+	int rc, vasid;
+
+	cp_inst = fp->private_data;
+
+	/*
+	 * One window for file descriptor
+	 */
+	if (cp_inst->txwin)
+		return -EEXIST;
+
+	rc = copy_from_user(&uattr, uptr, sizeof(uattr));
+	if (rc) {
+		pr_err("%s(): copy_from_user() returns %d\n", __func__, rc);
+		return -EFAULT;
+	}
+
+	if (uattr.version != 1) {
+		pr_err("Invalid version\n");
+		return -EINVAL;
+	}
+
+	vasid = uattr.vas_id;
+
+	vas_init_tx_win_attr(&txattr, cp_inst->coproc->cop_type);
+
+	txattr.lpid = mfspr(SPRN_LPID);
+	txattr.pidr = mfspr(SPRN_PID);
+	txattr.user_win = true;
+	txattr.rsvd_txbuf_count = false;
+	txattr.pswid = false;
+
+	pr_devel("Pid %d: Opening txwin, PIDR %ld\n", txattr.pidr,
+				mfspr(SPRN_PID));
+
+	txwin = vas_tx_win_open(vasid, cp_inst->coproc->cop_type, &txattr);
+	if (IS_ERR(txwin)) {
+		pr_err("%s() vas_tx_win_open() failed, %ld\n", __func__,
+					PTR_ERR(txwin));
+		return PTR_ERR(txwin);
+	}
+
+	cp_inst->txwin = txwin;
+
+	return 0;
+}
+
+static int coproc_release(struct inode *inode, struct file *fp)
+{
+	struct coproc_instance *cp_inst = fp->private_data;
+
+	if (cp_inst->txwin) {
+		vas_win_close(cp_inst->txwin);
+		cp_inst->txwin = NULL;
+	}
+
+	kfree(cp_inst);
+	fp->private_data = NULL;
+
+	/*
+	 * We don't know here if user has other receive windows
+	 * open, so we can't really call clear_thread_tidr().
+	 * So, once the process calls set_thread_tidr(), the
+	 * TIDR value sticks around until process exits, resulting
+	 * in an extra copy in restore_sprs().
+	 */
+
+	return 0;
+}
+
+static int coproc_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+	struct coproc_instance *cp_inst = fp->private_data;
+	struct vas_window *txwin;
+	unsigned long pfn;
+	u64 paste_addr;
+	pgprot_t prot;
+	int rc;
+
+	txwin = cp_inst->txwin;
+
+	if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) {
+		pr_debug("%s(): size 0x%zx, PAGE_SIZE 0x%zx\n", __func__,
+				(vma->vm_end - vma->vm_start), PAGE_SIZE);
+		return -EINVAL;
+	}
+
+	/* Ensure instance has an open send window */
+	if (!txwin) {
+		pr_err("%s(): No send window open?\n", __func__);
+		return -EINVAL;
+	}
+
+	vas_win_paste_addr(txwin, &paste_addr, NULL);
+	pfn = paste_addr >> PAGE_SHIFT;
+
+	/* flags, page_prot from cxl_mmap(), except we want cachable */
+	vma->vm_flags |= VM_IO | VM_PFNMAP;
+	vma->vm_page_prot = pgprot_cached(vma->vm_page_prot);
+
+	prot = __pgprot(pgprot_val(vma->vm_page_prot) | _PAGE_DIRTY);
+
+	rc = remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff,
+			vma->vm_end - vma->vm_start, prot);
+
+	pr_devel("%s(): paste addr %llx at %lx, rc %d\n", __func__,
+			paste_addr, vma->vm_start, rc);
+
+	return rc;
+}
+
+static long coproc_ioctl(struct file *fp, unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case VAS_TX_WIN_OPEN:
+		return coproc_ioc_tx_win_open(fp, arg);
+	default:
+		return -EINVAL;
+	}
+}
+
+static struct file_operations coproc_fops = {
+	.open = coproc_open,
+	.release = coproc_release,
+	.mmap = coproc_mmap,
+	.unlocked_ioctl = coproc_ioctl,
+};
+
+/*
+ * Supporting only nx-gzip coprocessor type now, but this API code
+ * extended to other coprocessor types later.
+ */
+int vas_register_coproc_api(struct module *mod, enum vas_cop_type cop_type,
+				const char *name)
+{
+	int rc = -EINVAL;
+	dev_t devno;
+
+	rc = alloc_chrdev_region(&coproc_device.devt, 1, 1, name);
+	if (rc) {
+		pr_err("Unable to allocate coproc major number: %i\n", rc);
+		return rc;
+	}
+
+	pr_devel("%s device allocated, dev [%i,%i]\n", name,
+			MAJOR(coproc_device.devt), MINOR(coproc_device.devt));
+
+	coproc_device.class = class_create(mod, name);
+	if (IS_ERR(coproc_device.class)) {
+		rc = PTR_ERR(coproc_device.class);
+		pr_err("Unable to create %s class %d\n", name, rc);
+		goto err_class;
+	}
+	coproc_device.class->devnode = coproc_devnode;
+	coproc_device.cop_type = cop_type;
+
+	coproc_fops.owner = mod;
+	cdev_init(&coproc_device.cdev, &coproc_fops);
+
+	devno = MKDEV(MAJOR(coproc_device.devt), 0);
+	rc = cdev_add(&coproc_device.cdev, devno, 1);
+	if (rc) {
+		pr_err("cdev_add() failed %d\n", rc);
+		goto err_cdev;
+	}
+
+	coproc_device.device = device_create(coproc_device.class, NULL,
+			devno, NULL, name, MINOR(devno));
+	if (IS_ERR(coproc_device.device)) {
+		rc = PTR_ERR(coproc_device.device);
+		pr_err("Unable to create coproc-%d %d\n", MINOR(devno), rc);
+		goto err;
+	}
+
+	pr_devel("%s: Added dev [%d,%d]\n", __func__, MAJOR(devno),
+			MINOR(devno));
+
+	return 0;
+
+err:
+	cdev_del(&coproc_device.cdev);
+err_cdev:
+	class_destroy(coproc_device.class);
+err_class:
+	unregister_chrdev_region(coproc_device.devt, 1);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(vas_register_coproc_api);
+
+void vas_unregister_coproc_api(void)
+{
+	dev_t devno;
+
+	cdev_del(&coproc_device.cdev);
+	devno = MKDEV(MAJOR(coproc_device.devt), 0);
+	device_destroy(coproc_device.class, devno);
+
+	class_destroy(coproc_device.class);
+	unregister_chrdev_region(coproc_device.devt, 1);
+}
+EXPORT_SYMBOL_GPL(vas_unregister_coproc_api);
diff --git a/arch/powerpc/platforms/powernv/vas-debug.c b/arch/powerpc/platforms/powernv/vas-debug.c
new file mode 100644
index 000000000..41fa90d2f
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas-debug.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2016-17 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "vas: " fmt
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include "vas.h"
+
+static struct dentry *vas_debugfs;
+
+static char *cop_to_str(int cop)
+{
+	switch (cop) {
+	case VAS_COP_TYPE_FAULT:	return "Fault";
+	case VAS_COP_TYPE_842:		return "NX-842 Normal Priority";
+	case VAS_COP_TYPE_842_HIPRI:	return "NX-842 High Priority";
+	case VAS_COP_TYPE_GZIP:		return "NX-GZIP Normal Priority";
+	case VAS_COP_TYPE_GZIP_HIPRI:	return "NX-GZIP High Priority";
+	case VAS_COP_TYPE_FTW:		return "Fast Thread-wakeup";
+	default:			return "Unknown";
+	}
+}
+
+static int info_show(struct seq_file *s, void *private)
+{
+	struct vas_window *window = s->private;
+
+	mutex_lock(&vas_mutex);
+
+	/* ensure window is not unmapped */
+	if (!window->hvwc_map)
+		goto unlock;
+
+	seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop),
+					window->tx_win ? "Send" : "Receive");
+	seq_printf(s, "Pid : %d\n", vas_window_pid(window));
+
+unlock:
+	mutex_unlock(&vas_mutex);
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(info);
+
+static inline void print_reg(struct seq_file *s, struct vas_window *win,
+			char *name, u32 reg)
+{
+	seq_printf(s, "0x%016llx %s\n", read_hvwc_reg(win, name, reg), name);
+}
+
+static int hvwc_show(struct seq_file *s, void *private)
+{
+	struct vas_window *window = s->private;
+
+	mutex_lock(&vas_mutex);
+
+	/* ensure window is not unmapped */
+	if (!window->hvwc_map)
+		goto unlock;
+
+	print_reg(s, window, VREG(LPID));
+	print_reg(s, window, VREG(PID));
+	print_reg(s, window, VREG(XLATE_MSR));
+	print_reg(s, window, VREG(XLATE_LPCR));
+	print_reg(s, window, VREG(XLATE_CTL));
+	print_reg(s, window, VREG(AMR));
+	print_reg(s, window, VREG(SEIDR));
+	print_reg(s, window, VREG(FAULT_TX_WIN));
+	print_reg(s, window, VREG(OSU_INTR_SRC_RA));
+	print_reg(s, window, VREG(HV_INTR_SRC_RA));
+	print_reg(s, window, VREG(PSWID));
+	print_reg(s, window, VREG(LFIFO_BAR));
+	print_reg(s, window, VREG(LDATA_STAMP_CTL));
+	print_reg(s, window, VREG(LDMA_CACHE_CTL));
+	print_reg(s, window, VREG(LRFIFO_PUSH));
+	print_reg(s, window, VREG(CURR_MSG_COUNT));
+	print_reg(s, window, VREG(LNOTIFY_AFTER_COUNT));
+	print_reg(s, window, VREG(LRX_WCRED));
+	print_reg(s, window, VREG(LRX_WCRED_ADDER));
+	print_reg(s, window, VREG(TX_WCRED));
+	print_reg(s, window, VREG(TX_WCRED_ADDER));
+	print_reg(s, window, VREG(LFIFO_SIZE));
+	print_reg(s, window, VREG(WINCTL));
+	print_reg(s, window, VREG(WIN_STATUS));
+	print_reg(s, window, VREG(WIN_CTX_CACHING_CTL));
+	print_reg(s, window, VREG(TX_RSVD_BUF_COUNT));
+	print_reg(s, window, VREG(LRFIFO_WIN_PTR));
+	print_reg(s, window, VREG(LNOTIFY_CTL));
+	print_reg(s, window, VREG(LNOTIFY_PID));
+	print_reg(s, window, VREG(LNOTIFY_LPID));
+	print_reg(s, window, VREG(LNOTIFY_TID));
+	print_reg(s, window, VREG(LNOTIFY_SCOPE));
+	print_reg(s, window, VREG(NX_UTIL_ADDER));
+unlock:
+	mutex_unlock(&vas_mutex);
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(hvwc);
+
+void vas_window_free_dbgdir(struct vas_window *window)
+{
+	if (window->dbgdir) {
+		debugfs_remove_recursive(window->dbgdir);
+		kfree(window->dbgname);
+		window->dbgdir = NULL;
+		window->dbgname = NULL;
+	}
+}
+
+void vas_window_init_dbgdir(struct vas_window *window)
+{
+	struct dentry *d;
+
+	if (!window->vinst->dbgdir)
+		return;
+
+	window->dbgname = kzalloc(16, GFP_KERNEL);
+	if (!window->dbgname)
+		return;
+
+	snprintf(window->dbgname, 16, "w%d", window->winid);
+
+	d = debugfs_create_dir(window->dbgname, window->vinst->dbgdir);
+	window->dbgdir = d;
+
+	debugfs_create_file("info", 0444, d, window, &info_fops);
+	debugfs_create_file("hvwc", 0444, d, window, &hvwc_fops);
+}
+
+void vas_instance_init_dbgdir(struct vas_instance *vinst)
+{
+	struct dentry *d;
+
+	vas_init_dbgdir();
+
+	vinst->dbgname = kzalloc(16, GFP_KERNEL);
+	if (!vinst->dbgname)
+		return;
+
+	snprintf(vinst->dbgname, 16, "v%d", vinst->vas_id);
+
+	d = debugfs_create_dir(vinst->dbgname, vas_debugfs);
+	vinst->dbgdir = d;
+}
+
+/*
+ * Set up the "root" VAS debugfs dir. Return if we already set it up
+ * (or failed to) in an earlier instance of VAS.
+ */
+void vas_init_dbgdir(void)
+{
+	static bool first_time = true;
+
+	if (!first_time)
+		return;
+
+	first_time = false;
+	vas_debugfs = debugfs_create_dir("vas", NULL);
+}
diff --git a/arch/powerpc/platforms/powernv/vas-fault.c b/arch/powerpc/platforms/powernv/vas-fault.c
new file mode 100644
index 000000000..dd9c23c09
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas-fault.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * VAS Fault handling.
+ * Copyright 2019, IBM Corporation
+ */
+
+#define pr_fmt(fmt) "vas: " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/kthread.h>
+#include <linux/sched/signal.h>
+#include <linux/mmu_context.h>
+#include <asm/icswx.h>
+
+#include "vas.h"
+
+/*
+ * The maximum FIFO size for fault window can be 8MB
+ * (VAS_RX_FIFO_SIZE_MAX). Using 4MB FIFO since each VAS
+ * instance will be having fault window.
+ * 8MB FIFO can be used if expects more faults for each VAS
+ * instance.
+ */
+#define VAS_FAULT_WIN_FIFO_SIZE	(4 << 20)
+
+static void dump_crb(struct coprocessor_request_block *crb)
+{
+	struct data_descriptor_entry *dde;
+	struct nx_fault_stamp *nx;
+
+	dde = &crb->source;
+	pr_devel("SrcDDE: addr 0x%llx, len %d, count %d, idx %d, flags %d\n",
+		be64_to_cpu(dde->address), be32_to_cpu(dde->length),
+		dde->count, dde->index, dde->flags);
+
+	dde = &crb->target;
+	pr_devel("TgtDDE: addr 0x%llx, len %d, count %d, idx %d, flags %d\n",
+		be64_to_cpu(dde->address), be32_to_cpu(dde->length),
+		dde->count, dde->index, dde->flags);
+
+	nx = &crb->stamp.nx;
+	pr_devel("NX Stamp: PSWID 0x%x, FSA 0x%llx, flags 0x%x, FS 0x%x\n",
+		be32_to_cpu(nx->pswid),
+		be64_to_cpu(crb->stamp.nx.fault_storage_addr),
+		nx->flags, nx->fault_status);
+}
+
+/*
+ * Update the CSB to indicate a translation error.
+ *
+ * User space will be polling on CSB after the request is issued.
+ * If NX can handle the request without any issues, it updates CSB.
+ * Whereas if NX encounters page fault, the kernel will handle the
+ * fault and update CSB with translation error.
+ *
+ * If we are unable to update the CSB means copy_to_user failed due to
+ * invalid csb_addr, send a signal to the process.
+ */
+static void update_csb(struct vas_window *window,
+			struct coprocessor_request_block *crb)
+{
+	struct coprocessor_status_block csb;
+	struct kernel_siginfo info;
+	struct task_struct *tsk;
+	void __user *csb_addr;
+	struct pid *pid;
+	int rc;
+
+	/*
+	 * NX user space windows can not be opened for task->mm=NULL
+	 * and faults will not be generated for kernel requests.
+	 */
+	if (WARN_ON_ONCE(!window->mm || !window->user_win))
+		return;
+
+	csb_addr = (void __user *)be64_to_cpu(crb->csb_addr);
+
+	memset(&csb, 0, sizeof(csb));
+	csb.cc = CSB_CC_FAULT_ADDRESS;
+	csb.ce = CSB_CE_TERMINATION;
+	csb.cs = 0;
+	csb.count = 0;
+
+	/*
+	 * NX operates and returns in BE format as defined CRB struct.
+	 * So saves fault_storage_addr in BE as NX pastes in FIFO and
+	 * expects user space to convert to CPU format.
+	 */
+	csb.address = crb->stamp.nx.fault_storage_addr;
+	csb.flags = 0;
+
+	pid = window->pid;
+	tsk = get_pid_task(pid, PIDTYPE_PID);
+	/*
+	 * Process closes send window after all pending NX requests are
+	 * completed. In multi-thread applications, a child thread can
+	 * open a window and can exit without closing it. May be some
+	 * requests are pending or this window can be used by other
+	 * threads later. We should handle faults if NX encounters
+	 * pages faults on these requests. Update CSB with translation
+	 * error and fault address. If csb_addr passed by user space is
+	 * invalid, send SEGV signal to pid saved in window. If the
+	 * child thread is not running, send the signal to tgid.
+	 * Parent thread (tgid) will close this window upon its exit.
+	 *
+	 * pid and mm references are taken when window is opened by
+	 * process (pid). So tgid is used only when child thread opens
+	 * a window and exits without closing it.
+	 */
+	if (!tsk) {
+		pid = window->tgid;
+		tsk = get_pid_task(pid, PIDTYPE_PID);
+		/*
+		 * Parent thread (tgid) will be closing window when it
+		 * exits. So should not get here.
+		 */
+		if (WARN_ON_ONCE(!tsk))
+			return;
+	}
+
+	/* Return if the task is exiting. */
+	if (tsk->flags & PF_EXITING) {
+		put_task_struct(tsk);
+		return;
+	}
+
+	kthread_use_mm(window->mm);
+	rc = copy_to_user(csb_addr, &csb, sizeof(csb));
+	/*
+	 * User space polls on csb.flags (first byte). So add barrier
+	 * then copy first byte with csb flags update.
+	 */
+	if (!rc) {
+		csb.flags = CSB_V;
+		/* Make sure update to csb.flags is visible now */
+		smp_mb();
+		rc = copy_to_user(csb_addr, &csb, sizeof(u8));
+	}
+	kthread_unuse_mm(window->mm);
+	put_task_struct(tsk);
+
+	/* Success */
+	if (!rc)
+		return;
+
+	pr_debug("Invalid CSB address 0x%p signalling pid(%d)\n",
+			csb_addr, pid_vnr(pid));
+
+	clear_siginfo(&info);
+	info.si_signo = SIGSEGV;
+	info.si_errno = EFAULT;
+	info.si_code = SEGV_MAPERR;
+	info.si_addr = csb_addr;
+
+	/*
+	 * process will be polling on csb.flags after request is sent to
+	 * NX. So generally CSB update should not fail except when an
+	 * application passes invalid csb_addr. So an error message will
+	 * be displayed and leave it to user space whether to ignore or
+	 * handle this signal.
+	 */
+	rcu_read_lock();
+	rc = kill_pid_info(SIGSEGV, &info, pid);
+	rcu_read_unlock();
+
+	pr_devel("%s(): pid %d kill_proc_info() rc %d\n", __func__,
+			pid_vnr(pid), rc);
+}
+
+static void dump_fifo(struct vas_instance *vinst, void *entry)
+{
+	unsigned long *end = vinst->fault_fifo + vinst->fault_fifo_size;
+	unsigned long *fifo = entry;
+	int i;
+
+	pr_err("Fault fifo size %d, Max crbs %d\n", vinst->fault_fifo_size,
+			vinst->fault_fifo_size / CRB_SIZE);
+
+	/* Dump 10 CRB entries or until end of FIFO */
+	pr_err("Fault FIFO Dump:\n");
+	for (i = 0; i < 10*(CRB_SIZE/8) && fifo < end; i += 4, fifo += 4) {
+		pr_err("[%.3d, %p]: 0x%.16lx 0x%.16lx 0x%.16lx 0x%.16lx\n",
+			i, fifo, *fifo, *(fifo+1), *(fifo+2), *(fifo+3));
+	}
+}
+
+/*
+ * Process valid CRBs in fault FIFO.
+ * NX process user space requests, return credit and update the status
+ * in CRB. If it encounters transalation error when accessing CRB or
+ * request buffers, raises interrupt on the CPU to handle the fault.
+ * It takes credit on fault window, updates nx_fault_stamp in CRB with
+ * the following information and pastes CRB in fault FIFO.
+ *
+ * pswid - window ID of the window on which the request is sent.
+ * fault_storage_addr - fault address
+ *
+ * It can raise a single interrupt for multiple faults. Expects OS to
+ * process all valid faults and return credit for each fault on user
+ * space and fault windows. This fault FIFO control will be done with
+ * credit mechanism. NX can continuously paste CRBs until credits are not
+ * available on fault window. Otherwise, returns with RMA_reject.
+ *
+ * Total credits available on fault window: FIFO_SIZE(4MB)/CRBS_SIZE(128)
+ *
+ */
+irqreturn_t vas_fault_thread_fn(int irq, void *data)
+{
+	struct vas_instance *vinst = data;
+	struct coprocessor_request_block *crb, *entry;
+	struct coprocessor_request_block buf;
+	struct vas_window *window;
+	unsigned long flags;
+	void *fifo;
+
+	crb = &buf;
+
+	/*
+	 * VAS can interrupt with multiple page faults. So process all
+	 * valid CRBs within fault FIFO until reaches invalid CRB.
+	 * We use CCW[0] and pswid to validate validate CRBs:
+	 *
+	 * CCW[0]	Reserved bit. When NX pastes CRB, CCW[0]=0
+	 *		OS sets this bit to 1 after reading CRB.
+	 * pswid	NX assigns window ID. Set pswid to -1 after
+	 *		reading CRB from fault FIFO.
+	 *
+	 * We exit this function if no valid CRBs are available to process.
+	 * So acquire fault_lock and reset fifo_in_progress to 0 before
+	 * exit.
+	 * In case kernel receives another interrupt with different page
+	 * fault, interrupt handler returns with IRQ_HANDLED if
+	 * fifo_in_progress is set. Means these new faults will be
+	 * handled by the current thread. Otherwise set fifo_in_progress
+	 * and return IRQ_WAKE_THREAD to wake up thread.
+	 */
+	while (true) {
+		spin_lock_irqsave(&vinst->fault_lock, flags);
+		/*
+		 * Advance the fault fifo pointer to next CRB.
+		 * Use CRB_SIZE rather than sizeof(*crb) since the latter is
+		 * aligned to CRB_ALIGN (256) but the CRB written to by VAS is
+		 * only CRB_SIZE in len.
+		 */
+		fifo = vinst->fault_fifo + (vinst->fault_crbs * CRB_SIZE);
+		entry = fifo;
+
+		if ((entry->stamp.nx.pswid == cpu_to_be32(FIFO_INVALID_ENTRY))
+			|| (entry->ccw & cpu_to_be32(CCW0_INVALID))) {
+			vinst->fifo_in_progress = 0;
+			spin_unlock_irqrestore(&vinst->fault_lock, flags);
+			return IRQ_HANDLED;
+		}
+
+		spin_unlock_irqrestore(&vinst->fault_lock, flags);
+		vinst->fault_crbs++;
+		if (vinst->fault_crbs == (vinst->fault_fifo_size / CRB_SIZE))
+			vinst->fault_crbs = 0;
+
+		memcpy(crb, fifo, CRB_SIZE);
+		entry->stamp.nx.pswid = cpu_to_be32(FIFO_INVALID_ENTRY);
+		entry->ccw |= cpu_to_be32(CCW0_INVALID);
+		/*
+		 * Return credit for the fault window.
+		 */
+		vas_return_credit(vinst->fault_win, false);
+
+		pr_devel("VAS[%d] fault_fifo %p, fifo %p, fault_crbs %d\n",
+				vinst->vas_id, vinst->fault_fifo, fifo,
+				vinst->fault_crbs);
+
+		dump_crb(crb);
+		window = vas_pswid_to_window(vinst,
+				be32_to_cpu(crb->stamp.nx.pswid));
+
+		if (IS_ERR(window)) {
+			/*
+			 * We got an interrupt about a specific send
+			 * window but we can't find that window and we can't
+			 * even clean it up (return credit on user space
+			 * window).
+			 * But we should not get here.
+			 * TODO: Disable IRQ.
+			 */
+			dump_fifo(vinst, (void *)entry);
+			pr_err("VAS[%d] fault_fifo %p, fifo %p, pswid 0x%x, fault_crbs %d bad CRB?\n",
+				vinst->vas_id, vinst->fault_fifo, fifo,
+				be32_to_cpu(crb->stamp.nx.pswid),
+				vinst->fault_crbs);
+
+			WARN_ON_ONCE(1);
+		} else {
+			update_csb(window, crb);
+			/*
+			 * Return credit for send window after processing
+			 * fault CRB.
+			 */
+			vas_return_credit(window, true);
+		}
+	}
+}
+
+irqreturn_t vas_fault_handler(int irq, void *dev_id)
+{
+	struct vas_instance *vinst = dev_id;
+	irqreturn_t ret = IRQ_WAKE_THREAD;
+	unsigned long flags;
+
+	/*
+	 * NX can generate an interrupt for multiple faults. So the
+	 * fault handler thread process all CRBs until finds invalid
+	 * entry. In case if NX sees continuous faults, it is possible
+	 * that the thread function entered with the first interrupt
+	 * can execute and process all valid CRBs.
+	 * So wake up thread only if the fault thread is not in progress.
+	 */
+	spin_lock_irqsave(&vinst->fault_lock, flags);
+
+	if (vinst->fifo_in_progress)
+		ret = IRQ_HANDLED;
+	else
+		vinst->fifo_in_progress = 1;
+
+	spin_unlock_irqrestore(&vinst->fault_lock, flags);
+
+	return ret;
+}
+
+/*
+ * Fault window is opened per VAS instance. NX pastes fault CRB in fault
+ * FIFO upon page faults.
+ */
+int vas_setup_fault_window(struct vas_instance *vinst)
+{
+	struct vas_rx_win_attr attr;
+
+	vinst->fault_fifo_size = VAS_FAULT_WIN_FIFO_SIZE;
+	vinst->fault_fifo = kzalloc(vinst->fault_fifo_size, GFP_KERNEL);
+	if (!vinst->fault_fifo) {
+		pr_err("Unable to alloc %d bytes for fault_fifo\n",
+				vinst->fault_fifo_size);
+		return -ENOMEM;
+	}
+
+	/*
+	 * Invalidate all CRB entries. NX pastes valid entry for each fault.
+	 */
+	memset(vinst->fault_fifo, FIFO_INVALID_ENTRY, vinst->fault_fifo_size);
+	vas_init_rx_win_attr(&attr, VAS_COP_TYPE_FAULT);
+
+	attr.rx_fifo_size = vinst->fault_fifo_size;
+	attr.rx_fifo = __pa(vinst->fault_fifo);
+
+	/*
+	 * Max creds is based on number of CRBs can fit in the FIFO.
+	 * (fault_fifo_size/CRB_SIZE). If 8MB FIFO is used, max creds
+	 * will be 0xffff since the receive creds field is 16bits wide.
+	 */
+	attr.wcreds_max = vinst->fault_fifo_size / CRB_SIZE;
+	attr.lnotify_lpid = 0;
+	attr.lnotify_pid = mfspr(SPRN_PID);
+	attr.lnotify_tid = mfspr(SPRN_PID);
+
+	vinst->fault_win = vas_rx_win_open(vinst->vas_id, VAS_COP_TYPE_FAULT,
+					&attr);
+
+	if (IS_ERR(vinst->fault_win)) {
+		pr_err("VAS: Error %ld opening FaultWin\n",
+			PTR_ERR(vinst->fault_win));
+		kfree(vinst->fault_fifo);
+		return PTR_ERR(vinst->fault_win);
+	}
+
+	pr_devel("VAS: Created FaultWin %d, LPID/PID/TID [%d/%d/%d]\n",
+			vinst->fault_win->winid, attr.lnotify_lpid,
+			attr.lnotify_pid, attr.lnotify_tid);
+
+	return 0;
+}
diff --git a/arch/powerpc/platforms/powernv/vas-trace.h b/arch/powerpc/platforms/powernv/vas-trace.h
new file mode 100644
index 000000000..a449b9f0c
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas-trace.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM	vas
+
+#if !defined(_VAS_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+
+#define _VAS_TRACE_H
+#include <linux/tracepoint.h>
+#include <linux/sched.h>
+#include <asm/vas.h>
+
+TRACE_EVENT(	vas_rx_win_open,
+
+		TP_PROTO(struct task_struct *tsk,
+			 int vasid,
+			 int cop,
+			 struct vas_rx_win_attr *rxattr),
+
+		TP_ARGS(tsk, vasid, cop, rxattr),
+
+		TP_STRUCT__entry(
+			__field(struct task_struct *, tsk)
+			__field(int, pid)
+			__field(int, cop)
+			__field(int, vasid)
+			__field(struct vas_rx_win_attr *, rxattr)
+			__field(int, lnotify_lpid)
+			__field(int, lnotify_pid)
+			__field(int, lnotify_tid)
+		),
+
+		TP_fast_assign(
+			__entry->pid = tsk->pid;
+			__entry->vasid = vasid;
+			__entry->cop = cop;
+			__entry->lnotify_lpid = rxattr->lnotify_lpid;
+			__entry->lnotify_pid = rxattr->lnotify_pid;
+			__entry->lnotify_tid = rxattr->lnotify_tid;
+		),
+
+		TP_printk("pid=%d, vasid=%d, cop=%d, lpid=%d, pid=%d, tid=%d",
+			__entry->pid, __entry->vasid, __entry->cop,
+			__entry->lnotify_lpid, __entry->lnotify_pid,
+			__entry->lnotify_tid)
+);
+
+TRACE_EVENT(	vas_tx_win_open,
+
+		TP_PROTO(struct task_struct *tsk,
+			 int vasid,
+			 int cop,
+			 struct vas_tx_win_attr *txattr),
+
+		TP_ARGS(tsk, vasid, cop, txattr),
+
+		TP_STRUCT__entry(
+			__field(struct task_struct *, tsk)
+			__field(int, pid)
+			__field(int, cop)
+			__field(int, vasid)
+			__field(struct vas_tx_win_attr *, txattr)
+			__field(int, lpid)
+			__field(int, pidr)
+		),
+
+		TP_fast_assign(
+			__entry->pid = tsk->pid;
+			__entry->vasid = vasid;
+			__entry->cop = cop;
+			__entry->lpid = txattr->lpid;
+			__entry->pidr = txattr->pidr;
+		),
+
+		TP_printk("pid=%d, vasid=%d, cop=%d, lpid=%d, pidr=%d",
+			__entry->pid, __entry->vasid, __entry->cop,
+			__entry->lpid, __entry->pidr)
+);
+
+TRACE_EVENT(	vas_paste_crb,
+
+		TP_PROTO(struct task_struct *tsk,
+			struct vas_window *win),
+
+		TP_ARGS(tsk, win),
+
+		TP_STRUCT__entry(
+			__field(struct task_struct *, tsk)
+			__field(struct vas_window *, win)
+			__field(int, pid)
+			__field(int, vasid)
+			__field(int, winid)
+			__field(unsigned long, paste_kaddr)
+		),
+
+		TP_fast_assign(
+			__entry->pid = tsk->pid;
+			__entry->vasid = win->vinst->vas_id;
+			__entry->winid = win->winid;
+			__entry->paste_kaddr = (unsigned long)win->paste_kaddr
+		),
+
+		TP_printk("pid=%d, vasid=%d, winid=%d, paste_kaddr=0x%016lx\n",
+			__entry->pid, __entry->vasid, __entry->winid,
+			__entry->paste_kaddr)
+);
+
+#endif /* _VAS_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../arch/powerpc/platforms/powernv
+#define TRACE_INCLUDE_FILE vas-trace
+#include <trace/define_trace.h>
diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
new file mode 100644
index 000000000..3a86cdd5a
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -0,0 +1,1444 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2016-17 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "vas: " fmt
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/log2.h>
+#include <linux/rcupdate.h>
+#include <linux/cred.h>
+#include <linux/sched/mm.h>
+#include <linux/mmu_context.h>
+#include <asm/switch_to.h>
+#include <asm/ppc-opcode.h>
+#include "vas.h"
+#include "copy-paste.h"
+
+#define CREATE_TRACE_POINTS
+#include "vas-trace.h"
+
+/*
+ * Compute the paste address region for the window @window using the
+ * ->paste_base_addr and ->paste_win_id_shift we got from device tree.
+ */
+void vas_win_paste_addr(struct vas_window *window, u64 *addr, int *len)
+{
+	int winid;
+	u64 base, shift;
+
+	base = window->vinst->paste_base_addr;
+	shift = window->vinst->paste_win_id_shift;
+	winid = window->winid;
+
+	*addr  = base + (winid << shift);
+	if (len)
+		*len = PAGE_SIZE;
+
+	pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr);
+}
+
+static inline void get_hvwc_mmio_bar(struct vas_window *window,
+			u64 *start, int *len)
+{
+	u64 pbaddr;
+
+	pbaddr = window->vinst->hvwc_bar_start;
+	*start = pbaddr + window->winid * VAS_HVWC_SIZE;
+	*len = VAS_HVWC_SIZE;
+}
+
+static inline void get_uwc_mmio_bar(struct vas_window *window,
+			u64 *start, int *len)
+{
+	u64 pbaddr;
+
+	pbaddr = window->vinst->uwc_bar_start;
+	*start = pbaddr + window->winid * VAS_UWC_SIZE;
+	*len = VAS_UWC_SIZE;
+}
+
+/*
+ * Map the paste bus address of the given send window into kernel address
+ * space. Unlike MMIO regions (map_mmio_region() below), paste region must
+ * be mapped cache-able and is only applicable to send windows.
+ */
+static void *map_paste_region(struct vas_window *txwin)
+{
+	int len;
+	void *map;
+	char *name;
+	u64 start;
+
+	name = kasprintf(GFP_KERNEL, "window-v%d-w%d", txwin->vinst->vas_id,
+				txwin->winid);
+	if (!name)
+		goto free_name;
+
+	txwin->paste_addr_name = name;
+	vas_win_paste_addr(txwin, &start, &len);
+
+	if (!request_mem_region(start, len, name)) {
+		pr_devel("%s(): request_mem_region(0x%llx, %d) failed\n",
+				__func__, start, len);
+		goto free_name;
+	}
+
+	map = ioremap_cache(start, len);
+	if (!map) {
+		pr_devel("%s(): ioremap_cache(0x%llx, %d) failed\n", __func__,
+				start, len);
+		goto free_name;
+	}
+
+	pr_devel("Mapped paste addr 0x%llx to kaddr 0x%p\n", start, map);
+	return map;
+
+free_name:
+	kfree(name);
+	return ERR_PTR(-ENOMEM);
+}
+
+static void *map_mmio_region(char *name, u64 start, int len)
+{
+	void *map;
+
+	if (!request_mem_region(start, len, name)) {
+		pr_devel("%s(): request_mem_region(0x%llx, %d) failed\n",
+				__func__, start, len);
+		return NULL;
+	}
+
+	map = ioremap(start, len);
+	if (!map) {
+		pr_devel("%s(): ioremap(0x%llx, %d) failed\n", __func__, start,
+				len);
+		return NULL;
+	}
+
+	return map;
+}
+
+static void unmap_region(void *addr, u64 start, int len)
+{
+	iounmap(addr);
+	release_mem_region((phys_addr_t)start, len);
+}
+
+/*
+ * Unmap the paste address region for a window.
+ */
+static void unmap_paste_region(struct vas_window *window)
+{
+	int len;
+	u64 busaddr_start;
+
+	if (window->paste_kaddr) {
+		vas_win_paste_addr(window, &busaddr_start, &len);
+		unmap_region(window->paste_kaddr, busaddr_start, len);
+		window->paste_kaddr = NULL;
+		kfree(window->paste_addr_name);
+		window->paste_addr_name = NULL;
+	}
+}
+
+/*
+ * Unmap the MMIO regions for a window. Hold the vas_mutex so we don't
+ * unmap when the window's debugfs dir is in use. This serializes close
+ * of a window even on another VAS instance but since its not a critical
+ * path, just minimize the time we hold the mutex for now. We can add
+ * a per-instance mutex later if necessary.
+ */
+static void unmap_winctx_mmio_bars(struct vas_window *window)
+{
+	int len;
+	void *uwc_map;
+	void *hvwc_map;
+	u64 busaddr_start;
+
+	mutex_lock(&vas_mutex);
+
+	hvwc_map = window->hvwc_map;
+	window->hvwc_map = NULL;
+
+	uwc_map = window->uwc_map;
+	window->uwc_map = NULL;
+
+	mutex_unlock(&vas_mutex);
+
+	if (hvwc_map) {
+		get_hvwc_mmio_bar(window, &busaddr_start, &len);
+		unmap_region(hvwc_map, busaddr_start, len);
+	}
+
+	if (uwc_map) {
+		get_uwc_mmio_bar(window, &busaddr_start, &len);
+		unmap_region(uwc_map, busaddr_start, len);
+	}
+}
+
+/*
+ * Find the Hypervisor Window Context (HVWC) MMIO Base Address Region and the
+ * OS/User Window Context (UWC) MMIO Base Address Region for the given window.
+ * Map these bus addresses and save the mapped kernel addresses in @window.
+ */
+static int map_winctx_mmio_bars(struct vas_window *window)
+{
+	int len;
+	u64 start;
+
+	get_hvwc_mmio_bar(window, &start, &len);
+	window->hvwc_map = map_mmio_region("HVWCM_Window", start, len);
+
+	get_uwc_mmio_bar(window, &start, &len);
+	window->uwc_map = map_mmio_region("UWCM_Window", start, len);
+
+	if (!window->hvwc_map || !window->uwc_map) {
+		unmap_winctx_mmio_bars(window);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Reset all valid registers in the HV and OS/User Window Contexts for
+ * the window identified by @window.
+ *
+ * NOTE: We cannot really use a for loop to reset window context. Not all
+ *	 offsets in a window context are valid registers and the valid
+ *	 registers are not sequential. And, we can only write to offsets
+ *	 with valid registers.
+ */
+static void reset_window_regs(struct vas_window *window)
+{
+	write_hvwc_reg(window, VREG(LPID), 0ULL);
+	write_hvwc_reg(window, VREG(PID), 0ULL);
+	write_hvwc_reg(window, VREG(XLATE_MSR), 0ULL);
+	write_hvwc_reg(window, VREG(XLATE_LPCR), 0ULL);
+	write_hvwc_reg(window, VREG(XLATE_CTL), 0ULL);
+	write_hvwc_reg(window, VREG(AMR), 0ULL);
+	write_hvwc_reg(window, VREG(SEIDR), 0ULL);
+	write_hvwc_reg(window, VREG(FAULT_TX_WIN), 0ULL);
+	write_hvwc_reg(window, VREG(OSU_INTR_SRC_RA), 0ULL);
+	write_hvwc_reg(window, VREG(HV_INTR_SRC_RA), 0ULL);
+	write_hvwc_reg(window, VREG(PSWID), 0ULL);
+	write_hvwc_reg(window, VREG(LFIFO_BAR), 0ULL);
+	write_hvwc_reg(window, VREG(LDATA_STAMP_CTL), 0ULL);
+	write_hvwc_reg(window, VREG(LDMA_CACHE_CTL), 0ULL);
+	write_hvwc_reg(window, VREG(LRFIFO_PUSH), 0ULL);
+	write_hvwc_reg(window, VREG(CURR_MSG_COUNT), 0ULL);
+	write_hvwc_reg(window, VREG(LNOTIFY_AFTER_COUNT), 0ULL);
+	write_hvwc_reg(window, VREG(LRX_WCRED), 0ULL);
+	write_hvwc_reg(window, VREG(LRX_WCRED_ADDER), 0ULL);
+	write_hvwc_reg(window, VREG(TX_WCRED), 0ULL);
+	write_hvwc_reg(window, VREG(TX_WCRED_ADDER), 0ULL);
+	write_hvwc_reg(window, VREG(LFIFO_SIZE), 0ULL);
+	write_hvwc_reg(window, VREG(WINCTL), 0ULL);
+	write_hvwc_reg(window, VREG(WIN_STATUS), 0ULL);
+	write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), 0ULL);
+	write_hvwc_reg(window, VREG(TX_RSVD_BUF_COUNT), 0ULL);
+	write_hvwc_reg(window, VREG(LRFIFO_WIN_PTR), 0ULL);
+	write_hvwc_reg(window, VREG(LNOTIFY_CTL), 0ULL);
+	write_hvwc_reg(window, VREG(LNOTIFY_PID), 0ULL);
+	write_hvwc_reg(window, VREG(LNOTIFY_LPID), 0ULL);
+	write_hvwc_reg(window, VREG(LNOTIFY_TID), 0ULL);
+	write_hvwc_reg(window, VREG(LNOTIFY_SCOPE), 0ULL);
+	write_hvwc_reg(window, VREG(NX_UTIL_ADDER), 0ULL);
+
+	/* Skip read-only registers: NX_UTIL and NX_UTIL_SE */
+
+	/*
+	 * The send and receive window credit adder registers are also
+	 * accessible from HVWC and have been initialized above. We don't
+	 * need to initialize from the OS/User Window Context, so skip
+	 * following calls:
+	 *
+	 *	write_uwc_reg(window, VREG(TX_WCRED_ADDER), 0ULL);
+	 *	write_uwc_reg(window, VREG(LRX_WCRED_ADDER), 0ULL);
+	 */
+}
+
+/*
+ * Initialize window context registers related to Address Translation.
+ * These registers are common to send/receive windows although they
+ * differ for user/kernel windows. As we resolve the TODOs we may
+ * want to add fields to vas_winctx and move the initialization to
+ * init_vas_winctx_regs().
+ */
+static void init_xlate_regs(struct vas_window *window, bool user_win)
+{
+	u64 lpcr, val;
+
+	/*
+	 * MSR_TA, MSR_US are false for both kernel and user.
+	 * MSR_DR and MSR_PR are false for kernel.
+	 */
+	val = 0ULL;
+	val = SET_FIELD(VAS_XLATE_MSR_HV, val, 1);
+	val = SET_FIELD(VAS_XLATE_MSR_SF, val, 1);
+	if (user_win) {
+		val = SET_FIELD(VAS_XLATE_MSR_DR, val, 1);
+		val = SET_FIELD(VAS_XLATE_MSR_PR, val, 1);
+	}
+	write_hvwc_reg(window, VREG(XLATE_MSR), val);
+
+	lpcr = mfspr(SPRN_LPCR);
+	val = 0ULL;
+	/*
+	 * NOTE: From Section 5.7.8.1 Segment Lookaside Buffer of the
+	 *	 Power ISA, v3.0B, Page size encoding is 0 = 4KB, 5 = 64KB.
+	 *
+	 * NOTE: From Section 1.3.1, Address Translation Context of the
+	 *	 Nest MMU Workbook, LPCR_SC should be 0 for Power9.
+	 */
+	val = SET_FIELD(VAS_XLATE_LPCR_PAGE_SIZE, val, 5);
+	val = SET_FIELD(VAS_XLATE_LPCR_ISL, val, lpcr & LPCR_ISL);
+	val = SET_FIELD(VAS_XLATE_LPCR_TC, val, lpcr & LPCR_TC);
+	val = SET_FIELD(VAS_XLATE_LPCR_SC, val, 0);
+	write_hvwc_reg(window, VREG(XLATE_LPCR), val);
+
+	/*
+	 * Section 1.3.1 (Address translation Context) of NMMU workbook.
+	 *	0b00	Hashed Page Table mode
+	 *	0b01	Reserved
+	 *	0b10	Radix on HPT
+	 *	0b11	Radix on Radix
+	 */
+	val = 0ULL;
+	val = SET_FIELD(VAS_XLATE_MODE, val, radix_enabled() ? 3 : 2);
+	write_hvwc_reg(window, VREG(XLATE_CTL), val);
+
+	/*
+	 * TODO: Can we mfspr(AMR) even for user windows?
+	 */
+	val = 0ULL;
+	val = SET_FIELD(VAS_AMR, val, mfspr(SPRN_AMR));
+	write_hvwc_reg(window, VREG(AMR), val);
+
+	val = 0ULL;
+	val = SET_FIELD(VAS_SEIDR, val, 0);
+	write_hvwc_reg(window, VREG(SEIDR), val);
+}
+
+/*
+ * Initialize Reserved Send Buffer Count for the send window. It involves
+ * writing to the register, reading it back to confirm that the hardware
+ * has enough buffers to reserve. See section 1.3.1.2.1 of VAS workbook.
+ *
+ * Since we can only make a best-effort attempt to fulfill the request,
+ * we don't return any errors if we cannot.
+ *
+ * TODO: Reserved (aka dedicated) send buffers are not supported yet.
+ */
+static void init_rsvd_tx_buf_count(struct vas_window *txwin,
+				struct vas_winctx *winctx)
+{
+	write_hvwc_reg(txwin, VREG(TX_RSVD_BUF_COUNT), 0ULL);
+}
+
+/*
+ * init_winctx_regs()
+ *	Initialize window context registers for a receive window.
+ *	Except for caching control and marking window open, the registers
+ *	are initialized in the order listed in Section 3.1.4 (Window Context
+ *	Cache Register Details) of the VAS workbook although they don't need
+ *	to be.
+ *
+ * Design note: For NX receive windows, NX allocates the FIFO buffer in OPAL
+ *	(so that it can get a large contiguous area) and passes that buffer
+ *	to kernel via device tree. We now write that buffer address to the
+ *	FIFO BAR. Would it make sense to do this all in OPAL? i.e have OPAL
+ *	write the per-chip RX FIFO addresses to the windows during boot-up
+ *	as a one-time task? That could work for NX but what about other
+ *	receivers?  Let the receivers tell us the rx-fifo buffers for now.
+ */
+static void init_winctx_regs(struct vas_window *window,
+			     struct vas_winctx *winctx)
+{
+	u64 val;
+	int fifo_size;
+
+	reset_window_regs(window);
+
+	val = 0ULL;
+	val = SET_FIELD(VAS_LPID, val, winctx->lpid);
+	write_hvwc_reg(window, VREG(LPID), val);
+
+	val = 0ULL;
+	val = SET_FIELD(VAS_PID_ID, val, winctx->pidr);
+	write_hvwc_reg(window, VREG(PID), val);
+
+	init_xlate_regs(window, winctx->user_win);
+
+	val = 0ULL;
+	val = SET_FIELD(VAS_FAULT_TX_WIN, val, winctx->fault_win_id);
+	write_hvwc_reg(window, VREG(FAULT_TX_WIN), val);
+
+	/* In PowerNV, interrupts go to HV. */
+	write_hvwc_reg(window, VREG(OSU_INTR_SRC_RA), 0ULL);
+
+	val = 0ULL;
+	val = SET_FIELD(VAS_HV_INTR_SRC_RA, val, winctx->irq_port);
+	write_hvwc_reg(window, VREG(HV_INTR_SRC_RA), val);
+
+	val = 0ULL;
+	val = SET_FIELD(VAS_PSWID_EA_HANDLE, val, winctx->pswid);
+	write_hvwc_reg(window, VREG(PSWID), val);
+
+	write_hvwc_reg(window, VREG(SPARE1), 0ULL);
+	write_hvwc_reg(window, VREG(SPARE2), 0ULL);
+	write_hvwc_reg(window, VREG(SPARE3), 0ULL);
+
+	/*
+	 * NOTE: VAS expects the FIFO address to be copied into the LFIFO_BAR
+	 *	 register as is - do NOT shift the address into VAS_LFIFO_BAR
+	 *	 bit fields! Ok to set the page migration select fields -
+	 *	 VAS ignores the lower 10+ bits in the address anyway, because
+	 *	 the minimum FIFO size is 1K?
+	 *
+	 * See also: Design note in function header.
+	 */
+	val = winctx->rx_fifo;
+	val = SET_FIELD(VAS_PAGE_MIGRATION_SELECT, val, 0);
+	write_hvwc_reg(window, VREG(LFIFO_BAR), val);
+
+	val = 0ULL;
+	val = SET_FIELD(VAS_LDATA_STAMP, val, winctx->data_stamp);
+	write_hvwc_reg(window, VREG(LDATA_STAMP_CTL), val);
+
+	val = 0ULL;
+	val = SET_FIELD(VAS_LDMA_TYPE, val, winctx->dma_type);
+	val = SET_FIELD(VAS_LDMA_FIFO_DISABLE, val, winctx->fifo_disable);
+	write_hvwc_reg(window, VREG(LDMA_CACHE_CTL), val);
+
+	write_hvwc_reg(window, VREG(LRFIFO_PUSH), 0ULL);
+	write_hvwc_reg(window, VREG(CURR_MSG_COUNT), 0ULL);
+	write_hvwc_reg(window, VREG(LNOTIFY_AFTER_COUNT), 0ULL);
+
+	val = 0ULL;
+	val = SET_FIELD(VAS_LRX_WCRED, val, winctx->wcreds_max);
+	write_hvwc_reg(window, VREG(LRX_WCRED), val);
+
+	val = 0ULL;
+	val = SET_FIELD(VAS_TX_WCRED, val, winctx->wcreds_max);
+	write_hvwc_reg(window, VREG(TX_WCRED), val);
+
+	write_hvwc_reg(window, VREG(LRX_WCRED_ADDER), 0ULL);
+	write_hvwc_reg(window, VREG(TX_WCRED_ADDER), 0ULL);
+
+	fifo_size = winctx->rx_fifo_size / 1024;
+
+	val = 0ULL;
+	val = SET_FIELD(VAS_LFIFO_SIZE, val, ilog2(fifo_size));
+	write_hvwc_reg(window, VREG(LFIFO_SIZE), val);
+
+	/* Update window control and caching control registers last so
+	 * we mark the window open only after fully initializing it and
+	 * pushing context to cache.
+	 */
+
+	write_hvwc_reg(window, VREG(WIN_STATUS), 0ULL);
+
+	init_rsvd_tx_buf_count(window, winctx);
+
+	/* for a send window, point to the matching receive window */
+	val = 0ULL;
+	val = SET_FIELD(VAS_LRX_WIN_ID, val, winctx->rx_win_id);
+	write_hvwc_reg(window, VREG(LRFIFO_WIN_PTR), val);
+
+	write_hvwc_reg(window, VREG(SPARE4), 0ULL);
+
+	val = 0ULL;
+	val = SET_FIELD(VAS_NOTIFY_DISABLE, val, winctx->notify_disable);
+	val = SET_FIELD(VAS_INTR_DISABLE, val, winctx->intr_disable);
+	val = SET_FIELD(VAS_NOTIFY_EARLY, val, winctx->notify_early);
+	val = SET_FIELD(VAS_NOTIFY_OSU_INTR, val, winctx->notify_os_intr_reg);
+	write_hvwc_reg(window, VREG(LNOTIFY_CTL), val);
+
+	val = 0ULL;
+	val = SET_FIELD(VAS_LNOTIFY_PID, val, winctx->lnotify_pid);
+	write_hvwc_reg(window, VREG(LNOTIFY_PID), val);
+
+	val = 0ULL;
+	val = SET_FIELD(VAS_LNOTIFY_LPID, val, winctx->lnotify_lpid);
+	write_hvwc_reg(window, VREG(LNOTIFY_LPID), val);
+
+	val = 0ULL;
+	val = SET_FIELD(VAS_LNOTIFY_TID, val, winctx->lnotify_tid);
+	write_hvwc_reg(window, VREG(LNOTIFY_TID), val);
+
+	val = 0ULL;
+	val = SET_FIELD(VAS_LNOTIFY_MIN_SCOPE, val, winctx->min_scope);
+	val = SET_FIELD(VAS_LNOTIFY_MAX_SCOPE, val, winctx->max_scope);
+	write_hvwc_reg(window, VREG(LNOTIFY_SCOPE), val);
+
+	/* Skip read-only registers NX_UTIL and NX_UTIL_SE */
+
+	write_hvwc_reg(window, VREG(SPARE5), 0ULL);
+	write_hvwc_reg(window, VREG(NX_UTIL_ADDER), 0ULL);
+	write_hvwc_reg(window, VREG(SPARE6), 0ULL);
+
+	/* Finally, push window context to memory and... */
+	val = 0ULL;
+	val = SET_FIELD(VAS_PUSH_TO_MEM, val, 1);
+	write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), val);
+
+	/* ... mark the window open for business */
+	val = 0ULL;
+	val = SET_FIELD(VAS_WINCTL_REJ_NO_CREDIT, val, winctx->rej_no_credit);
+	val = SET_FIELD(VAS_WINCTL_PIN, val, winctx->pin_win);
+	val = SET_FIELD(VAS_WINCTL_TX_WCRED_MODE, val, winctx->tx_wcred_mode);
+	val = SET_FIELD(VAS_WINCTL_RX_WCRED_MODE, val, winctx->rx_wcred_mode);
+	val = SET_FIELD(VAS_WINCTL_TX_WORD_MODE, val, winctx->tx_word_mode);
+	val = SET_FIELD(VAS_WINCTL_RX_WORD_MODE, val, winctx->rx_word_mode);
+	val = SET_FIELD(VAS_WINCTL_FAULT_WIN, val, winctx->fault_win);
+	val = SET_FIELD(VAS_WINCTL_NX_WIN, val, winctx->nx_win);
+	val = SET_FIELD(VAS_WINCTL_OPEN, val, 1);
+	write_hvwc_reg(window, VREG(WINCTL), val);
+}
+
+static void vas_release_window_id(struct ida *ida, int winid)
+{
+	ida_free(ida, winid);
+}
+
+static int vas_assign_window_id(struct ida *ida)
+{
+	int winid = ida_alloc_max(ida, VAS_WINDOWS_PER_CHIP - 1, GFP_KERNEL);
+
+	if (winid == -ENOSPC) {
+		pr_err("Too many (%d) open windows\n", VAS_WINDOWS_PER_CHIP);
+		return -EAGAIN;
+	}
+
+	return winid;
+}
+
+static void vas_window_free(struct vas_window *window)
+{
+	int winid = window->winid;
+	struct vas_instance *vinst = window->vinst;
+
+	unmap_winctx_mmio_bars(window);
+
+	vas_window_free_dbgdir(window);
+
+	kfree(window);
+
+	vas_release_window_id(&vinst->ida, winid);
+}
+
+static struct vas_window *vas_window_alloc(struct vas_instance *vinst)
+{
+	int winid;
+	struct vas_window *window;
+
+	winid = vas_assign_window_id(&vinst->ida);
+	if (winid < 0)
+		return ERR_PTR(winid);
+
+	window = kzalloc(sizeof(*window), GFP_KERNEL);
+	if (!window)
+		goto out_free;
+
+	window->vinst = vinst;
+	window->winid = winid;
+
+	if (map_winctx_mmio_bars(window))
+		goto out_free;
+
+	vas_window_init_dbgdir(window);
+
+	return window;
+
+out_free:
+	kfree(window);
+	vas_release_window_id(&vinst->ida, winid);
+	return ERR_PTR(-ENOMEM);
+}
+
+static void put_rx_win(struct vas_window *rxwin)
+{
+	/* Better not be a send window! */
+	WARN_ON_ONCE(rxwin->tx_win);
+
+	atomic_dec(&rxwin->num_txwins);
+}
+
+/*
+ * Find the user space receive window given the @pswid.
+ *      - We must have a valid vasid and it must belong to this instance.
+ *        (so both send and receive windows are on the same VAS instance)
+ *      - The window must refer to an OPEN, FTW, RECEIVE window.
+ *
+ * NOTE: We access ->windows[] table and assume that vinst->mutex is held.
+ */
+static struct vas_window *get_user_rxwin(struct vas_instance *vinst, u32 pswid)
+{
+	int vasid, winid;
+	struct vas_window *rxwin;
+
+	decode_pswid(pswid, &vasid, &winid);
+
+	if (vinst->vas_id != vasid)
+		return ERR_PTR(-EINVAL);
+
+	rxwin = vinst->windows[winid];
+
+	if (!rxwin || rxwin->tx_win || rxwin->cop != VAS_COP_TYPE_FTW)
+		return ERR_PTR(-EINVAL);
+
+	return rxwin;
+}
+
+/*
+ * Get the VAS receive window associated with NX engine identified
+ * by @cop and if applicable, @pswid.
+ *
+ * See also function header of set_vinst_win().
+ */
+static struct vas_window *get_vinst_rxwin(struct vas_instance *vinst,
+			enum vas_cop_type cop, u32 pswid)
+{
+	struct vas_window *rxwin;
+
+	mutex_lock(&vinst->mutex);
+
+	if (cop == VAS_COP_TYPE_FTW)
+		rxwin = get_user_rxwin(vinst, pswid);
+	else
+		rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL);
+
+	if (!IS_ERR(rxwin))
+		atomic_inc(&rxwin->num_txwins);
+
+	mutex_unlock(&vinst->mutex);
+
+	return rxwin;
+}
+
+/*
+ * We have two tables of windows in a VAS instance. The first one,
+ * ->windows[], contains all the windows in the instance and allows
+ * looking up a window by its id. It is used to look up send windows
+ * during fault handling and receive windows when pairing user space
+ * send/receive windows.
+ *
+ * The second table, ->rxwin[], contains receive windows that are
+ * associated with NX engines. This table has VAS_COP_TYPE_MAX
+ * entries and is used to look up a receive window by its
+ * coprocessor type.
+ *
+ * Here, we save @window in the ->windows[] table. If it is a receive
+ * window, we also save the window in the ->rxwin[] table.
+ */
+static void set_vinst_win(struct vas_instance *vinst,
+			struct vas_window *window)
+{
+	int id = window->winid;
+
+	mutex_lock(&vinst->mutex);
+
+	/*
+	 * There should only be one receive window for a coprocessor type
+	 * unless its a user (FTW) window.
+	 */
+	if (!window->user_win && !window->tx_win) {
+		WARN_ON_ONCE(vinst->rxwin[window->cop]);
+		vinst->rxwin[window->cop] = window;
+	}
+
+	WARN_ON_ONCE(vinst->windows[id] != NULL);
+	vinst->windows[id] = window;
+
+	mutex_unlock(&vinst->mutex);
+}
+
+/*
+ * Clear this window from the table(s) of windows for this VAS instance.
+ * See also function header of set_vinst_win().
+ */
+static void clear_vinst_win(struct vas_window *window)
+{
+	int id = window->winid;
+	struct vas_instance *vinst = window->vinst;
+
+	mutex_lock(&vinst->mutex);
+
+	if (!window->user_win && !window->tx_win) {
+		WARN_ON_ONCE(!vinst->rxwin[window->cop]);
+		vinst->rxwin[window->cop] = NULL;
+	}
+
+	WARN_ON_ONCE(vinst->windows[id] != window);
+	vinst->windows[id] = NULL;
+
+	mutex_unlock(&vinst->mutex);
+}
+
+static void init_winctx_for_rxwin(struct vas_window *rxwin,
+			struct vas_rx_win_attr *rxattr,
+			struct vas_winctx *winctx)
+{
+	/*
+	 * We first zero (memset()) all fields and only set non-zero fields.
+	 * Following fields are 0/false but maybe deserve a comment:
+	 *
+	 *	->notify_os_intr_reg	In powerNV, send intrs to HV
+	 *	->notify_disable	False for NX windows
+	 *	->intr_disable		False for Fault Windows
+	 *	->xtra_write		False for NX windows
+	 *	->notify_early		NA for NX windows
+	 *	->rsvd_txbuf_count	NA for Rx windows
+	 *	->lpid, ->pid, ->tid	NA for Rx windows
+	 */
+
+	memset(winctx, 0, sizeof(struct vas_winctx));
+
+	winctx->rx_fifo = rxattr->rx_fifo;
+	winctx->rx_fifo_size = rxattr->rx_fifo_size;
+	winctx->wcreds_max = rxwin->wcreds_max;
+	winctx->pin_win = rxattr->pin_win;
+
+	winctx->nx_win = rxattr->nx_win;
+	winctx->fault_win = rxattr->fault_win;
+	winctx->user_win = rxattr->user_win;
+	winctx->rej_no_credit = rxattr->rej_no_credit;
+	winctx->rx_word_mode = rxattr->rx_win_ord_mode;
+	winctx->tx_word_mode = rxattr->tx_win_ord_mode;
+	winctx->rx_wcred_mode = rxattr->rx_wcred_mode;
+	winctx->tx_wcred_mode = rxattr->tx_wcred_mode;
+	winctx->notify_early = rxattr->notify_early;
+
+	if (winctx->nx_win) {
+		winctx->data_stamp = true;
+		winctx->intr_disable = true;
+		winctx->pin_win = true;
+
+		WARN_ON_ONCE(winctx->fault_win);
+		WARN_ON_ONCE(!winctx->rx_word_mode);
+		WARN_ON_ONCE(!winctx->tx_word_mode);
+		WARN_ON_ONCE(winctx->notify_after_count);
+	} else if (winctx->fault_win) {
+		winctx->notify_disable = true;
+	} else if (winctx->user_win) {
+		/*
+		 * Section 1.8.1 Low Latency Core-Core Wake up of
+		 * the VAS workbook:
+		 *
+		 *      - disable credit checks ([tr]x_wcred_mode = false)
+		 *      - disable FIFO writes
+		 *      - enable ASB_Notify, disable interrupt
+		 */
+		winctx->fifo_disable = true;
+		winctx->intr_disable = true;
+		winctx->rx_fifo = 0;
+	}
+
+	winctx->lnotify_lpid = rxattr->lnotify_lpid;
+	winctx->lnotify_pid = rxattr->lnotify_pid;
+	winctx->lnotify_tid = rxattr->lnotify_tid;
+	winctx->pswid = rxattr->pswid;
+	winctx->dma_type = VAS_DMA_TYPE_INJECT;
+	winctx->tc_mode = rxattr->tc_mode;
+
+	winctx->min_scope = VAS_SCOPE_LOCAL;
+	winctx->max_scope = VAS_SCOPE_VECTORED_GROUP;
+	if (rxwin->vinst->virq)
+		winctx->irq_port = rxwin->vinst->irq_port;
+}
+
+static bool rx_win_args_valid(enum vas_cop_type cop,
+			struct vas_rx_win_attr *attr)
+{
+	pr_debug("Rxattr: fault %d, notify %d, intr %d, early %d, fifo %d\n",
+			attr->fault_win, attr->notify_disable,
+			attr->intr_disable, attr->notify_early,
+			attr->rx_fifo_size);
+
+	if (cop >= VAS_COP_TYPE_MAX)
+		return false;
+
+	if (cop != VAS_COP_TYPE_FTW &&
+				attr->rx_fifo_size < VAS_RX_FIFO_SIZE_MIN)
+		return false;
+
+	if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX)
+		return false;
+
+	if (!attr->wcreds_max)
+		return false;
+
+	if (attr->nx_win) {
+		/* cannot be fault or user window if it is nx */
+		if (attr->fault_win || attr->user_win)
+			return false;
+		/*
+		 * Section 3.1.4.32: NX Windows must not disable notification,
+		 *	and must not enable interrupts or early notification.
+		 */
+		if (attr->notify_disable || !attr->intr_disable ||
+				attr->notify_early)
+			return false;
+	} else if (attr->fault_win) {
+		/* cannot be both fault and user window */
+		if (attr->user_win)
+			return false;
+
+		/*
+		 * Section 3.1.4.32: Fault windows must disable notification
+		 *	but not interrupts.
+		 */
+		if (!attr->notify_disable || attr->intr_disable)
+			return false;
+
+	} else if (attr->user_win) {
+		/*
+		 * User receive windows are only for fast-thread-wakeup
+		 * (FTW). They don't need a FIFO and must disable interrupts
+		 */
+		if (attr->rx_fifo || attr->rx_fifo_size || !attr->intr_disable)
+			return false;
+	} else {
+		/* Rx window must be one of NX or Fault or User window. */
+		return false;
+	}
+
+	return true;
+}
+
+void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr, enum vas_cop_type cop)
+{
+	memset(rxattr, 0, sizeof(*rxattr));
+
+	if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI ||
+		cop == VAS_COP_TYPE_GZIP || cop == VAS_COP_TYPE_GZIP_HIPRI) {
+		rxattr->pin_win = true;
+		rxattr->nx_win = true;
+		rxattr->fault_win = false;
+		rxattr->intr_disable = true;
+		rxattr->rx_wcred_mode = true;
+		rxattr->tx_wcred_mode = true;
+		rxattr->rx_win_ord_mode = true;
+		rxattr->tx_win_ord_mode = true;
+	} else if (cop == VAS_COP_TYPE_FAULT) {
+		rxattr->pin_win = true;
+		rxattr->fault_win = true;
+		rxattr->notify_disable = true;
+		rxattr->rx_wcred_mode = true;
+		rxattr->rx_win_ord_mode = true;
+		rxattr->rej_no_credit = true;
+		rxattr->tc_mode = VAS_THRESH_DISABLED;
+	} else if (cop == VAS_COP_TYPE_FTW) {
+		rxattr->user_win = true;
+		rxattr->intr_disable = true;
+
+		/*
+		 * As noted in the VAS Workbook we disable credit checks.
+		 * If we enable credit checks in the future, we must also
+		 * implement a mechanism to return the user credits or new
+		 * paste operations will fail.
+		 */
+	}
+}
+EXPORT_SYMBOL_GPL(vas_init_rx_win_attr);
+
+struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop,
+			struct vas_rx_win_attr *rxattr)
+{
+	struct vas_window *rxwin;
+	struct vas_winctx winctx;
+	struct vas_instance *vinst;
+
+	trace_vas_rx_win_open(current, vasid, cop, rxattr);
+
+	if (!rx_win_args_valid(cop, rxattr))
+		return ERR_PTR(-EINVAL);
+
+	vinst = find_vas_instance(vasid);
+	if (!vinst) {
+		pr_devel("vasid %d not found!\n", vasid);
+		return ERR_PTR(-EINVAL);
+	}
+	pr_devel("Found instance %d\n", vasid);
+
+	rxwin = vas_window_alloc(vinst);
+	if (IS_ERR(rxwin)) {
+		pr_devel("Unable to allocate memory for Rx window\n");
+		return rxwin;
+	}
+
+	rxwin->tx_win = false;
+	rxwin->nx_win = rxattr->nx_win;
+	rxwin->user_win = rxattr->user_win;
+	rxwin->cop = cop;
+	rxwin->wcreds_max = rxattr->wcreds_max;
+
+	init_winctx_for_rxwin(rxwin, rxattr, &winctx);
+	init_winctx_regs(rxwin, &winctx);
+
+	set_vinst_win(vinst, rxwin);
+
+	return rxwin;
+}
+EXPORT_SYMBOL_GPL(vas_rx_win_open);
+
+void vas_init_tx_win_attr(struct vas_tx_win_attr *txattr, enum vas_cop_type cop)
+{
+	memset(txattr, 0, sizeof(*txattr));
+
+	if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI ||
+		cop == VAS_COP_TYPE_GZIP || cop == VAS_COP_TYPE_GZIP_HIPRI) {
+		txattr->rej_no_credit = false;
+		txattr->rx_wcred_mode = true;
+		txattr->tx_wcred_mode = true;
+		txattr->rx_win_ord_mode = true;
+		txattr->tx_win_ord_mode = true;
+	} else if (cop == VAS_COP_TYPE_FTW) {
+		txattr->user_win = true;
+	}
+}
+EXPORT_SYMBOL_GPL(vas_init_tx_win_attr);
+
+static void init_winctx_for_txwin(struct vas_window *txwin,
+			struct vas_tx_win_attr *txattr,
+			struct vas_winctx *winctx)
+{
+	/*
+	 * We first zero all fields and only set non-zero ones. Following
+	 * are some fields set to 0/false for the stated reason:
+	 *
+	 *	->notify_os_intr_reg	In powernv, send intrs to HV
+	 *	->rsvd_txbuf_count	Not supported yet.
+	 *	->notify_disable	False for NX windows
+	 *	->xtra_write		False for NX windows
+	 *	->notify_early		NA for NX windows
+	 *	->lnotify_lpid		NA for Tx windows
+	 *	->lnotify_pid		NA for Tx windows
+	 *	->lnotify_tid		NA for Tx windows
+	 *	->tx_win_cred_mode	Ignore for now for NX windows
+	 *	->rx_win_cred_mode	Ignore for now for NX windows
+	 */
+	memset(winctx, 0, sizeof(struct vas_winctx));
+
+	winctx->wcreds_max = txwin->wcreds_max;
+
+	winctx->user_win = txattr->user_win;
+	winctx->nx_win = txwin->rxwin->nx_win;
+	winctx->pin_win = txattr->pin_win;
+	winctx->rej_no_credit = txattr->rej_no_credit;
+	winctx->rsvd_txbuf_enable = txattr->rsvd_txbuf_enable;
+
+	winctx->rx_wcred_mode = txattr->rx_wcred_mode;
+	winctx->tx_wcred_mode = txattr->tx_wcred_mode;
+	winctx->rx_word_mode = txattr->rx_win_ord_mode;
+	winctx->tx_word_mode = txattr->tx_win_ord_mode;
+	winctx->rsvd_txbuf_count = txattr->rsvd_txbuf_count;
+
+	winctx->intr_disable = true;
+	if (winctx->nx_win)
+		winctx->data_stamp = true;
+
+	winctx->lpid = txattr->lpid;
+	winctx->pidr = txattr->pidr;
+	winctx->rx_win_id = txwin->rxwin->winid;
+	/*
+	 * IRQ and fault window setup is successful. Set fault window
+	 * for the send window so that ready to handle faults.
+	 */
+	if (txwin->vinst->virq)
+		winctx->fault_win_id = txwin->vinst->fault_win->winid;
+
+	winctx->dma_type = VAS_DMA_TYPE_INJECT;
+	winctx->tc_mode = txattr->tc_mode;
+	winctx->min_scope = VAS_SCOPE_LOCAL;
+	winctx->max_scope = VAS_SCOPE_VECTORED_GROUP;
+	if (txwin->vinst->virq)
+		winctx->irq_port = txwin->vinst->irq_port;
+
+	winctx->pswid = txattr->pswid ? txattr->pswid :
+			encode_pswid(txwin->vinst->vas_id, txwin->winid);
+}
+
+static bool tx_win_args_valid(enum vas_cop_type cop,
+			struct vas_tx_win_attr *attr)
+{
+	if (attr->tc_mode != VAS_THRESH_DISABLED)
+		return false;
+
+	if (cop > VAS_COP_TYPE_MAX)
+		return false;
+
+	if (attr->wcreds_max > VAS_TX_WCREDS_MAX)
+		return false;
+
+	if (attr->user_win) {
+		if (attr->rsvd_txbuf_count)
+			return false;
+
+		if (cop != VAS_COP_TYPE_FTW && cop != VAS_COP_TYPE_GZIP &&
+			cop != VAS_COP_TYPE_GZIP_HIPRI)
+			return false;
+	}
+
+	return true;
+}
+
+struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
+			struct vas_tx_win_attr *attr)
+{
+	int rc;
+	struct vas_window *txwin;
+	struct vas_window *rxwin;
+	struct vas_winctx winctx;
+	struct vas_instance *vinst;
+
+	trace_vas_tx_win_open(current, vasid, cop, attr);
+
+	if (!tx_win_args_valid(cop, attr))
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * If caller did not specify a vasid but specified the PSWID of a
+	 * receive window (applicable only to FTW windows), use the vasid
+	 * from that receive window.
+	 */
+	if (vasid == -1 && attr->pswid)
+		decode_pswid(attr->pswid, &vasid, NULL);
+
+	vinst = find_vas_instance(vasid);
+	if (!vinst) {
+		pr_devel("vasid %d not found!\n", vasid);
+		return ERR_PTR(-EINVAL);
+	}
+
+	rxwin = get_vinst_rxwin(vinst, cop, attr->pswid);
+	if (IS_ERR(rxwin)) {
+		pr_devel("No RxWin for vasid %d, cop %d\n", vasid, cop);
+		return rxwin;
+	}
+
+	txwin = vas_window_alloc(vinst);
+	if (IS_ERR(txwin)) {
+		rc = PTR_ERR(txwin);
+		goto put_rxwin;
+	}
+
+	txwin->cop = cop;
+	txwin->tx_win = 1;
+	txwin->rxwin = rxwin;
+	txwin->nx_win = txwin->rxwin->nx_win;
+	txwin->user_win = attr->user_win;
+	txwin->wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT;
+
+	init_winctx_for_txwin(txwin, attr, &winctx);
+
+	init_winctx_regs(txwin, &winctx);
+
+	/*
+	 * If its a kernel send window, map the window address into the
+	 * kernel's address space. For user windows, user must issue an
+	 * mmap() to map the window into their address space.
+	 *
+	 * NOTE: If kernel ever resubmits a user CRB after handling a page
+	 *	 fault, we will need to map this into kernel as well.
+	 */
+	if (!txwin->user_win) {
+		txwin->paste_kaddr = map_paste_region(txwin);
+		if (IS_ERR(txwin->paste_kaddr)) {
+			rc = PTR_ERR(txwin->paste_kaddr);
+			goto free_window;
+		}
+	} else {
+		/*
+		 * Interrupt hanlder or fault window setup failed. Means
+		 * NX can not generate fault for page fault. So not
+		 * opening for user space tx window.
+		 */
+		if (!vinst->virq) {
+			rc = -ENODEV;
+			goto free_window;
+		}
+
+		/*
+		 * Window opened by a child thread may not be closed when
+		 * it exits. So take reference to its pid and release it
+		 * when the window is free by parent thread.
+		 * Acquire a reference to the task's pid to make sure
+		 * pid will not be re-used - needed only for multithread
+		 * applications.
+		 */
+		txwin->pid = get_task_pid(current, PIDTYPE_PID);
+		/*
+		 * Acquire a reference to the task's mm.
+		 */
+		txwin->mm = get_task_mm(current);
+
+		if (!txwin->mm) {
+			put_pid(txwin->pid);
+			pr_err("VAS: pid(%d): mm_struct is not found\n",
+					current->pid);
+			rc = -EPERM;
+			goto free_window;
+		}
+
+		mmgrab(txwin->mm);
+		mmput(txwin->mm);
+		mm_context_add_vas_window(txwin->mm);
+		/*
+		 * Process closes window during exit. In the case of
+		 * multithread application, the child thread can open
+		 * window and can exit without closing it. so takes tgid
+		 * reference until window closed to make sure tgid is not
+		 * reused.
+		 */
+		txwin->tgid = find_get_pid(task_tgid_vnr(current));
+		/*
+		 * Even a process that has no foreign real address mapping can
+		 * use an unpaired COPY instruction (to no real effect). Issue
+		 * CP_ABORT to clear any pending COPY and prevent a covert
+		 * channel.
+		 *
+		 * __switch_to() will issue CP_ABORT on future context switches
+		 * if process / thread has any open VAS window (Use
+		 * current->mm->context.vas_windows).
+		 */
+		asm volatile(PPC_CP_ABORT);
+	}
+
+	set_vinst_win(vinst, txwin);
+
+	return txwin;
+
+free_window:
+	vas_window_free(txwin);
+
+put_rxwin:
+	put_rx_win(rxwin);
+	return ERR_PTR(rc);
+
+}
+EXPORT_SYMBOL_GPL(vas_tx_win_open);
+
+int vas_copy_crb(void *crb, int offset)
+{
+	return vas_copy(crb, offset);
+}
+EXPORT_SYMBOL_GPL(vas_copy_crb);
+
+#define RMA_LSMP_REPORT_ENABLE PPC_BIT(53)
+int vas_paste_crb(struct vas_window *txwin, int offset, bool re)
+{
+	int rc;
+	void *addr;
+	uint64_t val;
+
+	trace_vas_paste_crb(current, txwin);
+
+	/*
+	 * Only NX windows are supported for now and hardware assumes
+	 * report-enable flag is set for NX windows. Ensure software
+	 * complies too.
+	 */
+	WARN_ON_ONCE(txwin->nx_win && !re);
+
+	addr = txwin->paste_kaddr;
+	if (re) {
+		/*
+		 * Set the REPORT_ENABLE bit (equivalent to writing
+		 * to 1K offset of the paste address)
+		 */
+		val = SET_FIELD(RMA_LSMP_REPORT_ENABLE, 0ULL, 1);
+		addr += val;
+	}
+
+	/*
+	 * Map the raw CR value from vas_paste() to an error code (there
+	 * is just pass or fail for now though).
+	 */
+	rc = vas_paste(addr, offset);
+	if (rc == 2)
+		rc = 0;
+	else
+		rc = -EINVAL;
+
+	pr_debug("Txwin #%d: Msg count %llu\n", txwin->winid,
+			read_hvwc_reg(txwin, VREG(LRFIFO_PUSH)));
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(vas_paste_crb);
+
+/*
+ * If credit checking is enabled for this window, poll for the return
+ * of window credits (i.e for NX engines to process any outstanding CRBs).
+ * Since NX-842 waits for the CRBs to be processed before closing the
+ * window, we should not have to wait for too long.
+ *
+ * TODO: We retry in 10ms intervals now. We could/should probably peek at
+ *	the VAS_LRFIFO_PUSH_OFFSET register to get an estimate of pending
+ *	CRBs on the FIFO and compute the delay dynamically on each retry.
+ *	But that is not really needed until we support NX-GZIP access from
+ *	user space. (NX-842 driver waits for CSB and Fast thread-wakeup
+ *	doesn't use credit checking).
+ */
+static void poll_window_credits(struct vas_window *window)
+{
+	u64 val;
+	int creds, mode;
+	int count = 0;
+
+	val = read_hvwc_reg(window, VREG(WINCTL));
+	if (window->tx_win)
+		mode = GET_FIELD(VAS_WINCTL_TX_WCRED_MODE, val);
+	else
+		mode = GET_FIELD(VAS_WINCTL_RX_WCRED_MODE, val);
+
+	if (!mode)
+		return;
+retry:
+	if (window->tx_win) {
+		val = read_hvwc_reg(window, VREG(TX_WCRED));
+		creds = GET_FIELD(VAS_TX_WCRED, val);
+	} else {
+		val = read_hvwc_reg(window, VREG(LRX_WCRED));
+		creds = GET_FIELD(VAS_LRX_WCRED, val);
+	}
+
+	/*
+	 * Takes around few milliseconds to complete all pending requests
+	 * and return credits.
+	 * TODO: Scan fault FIFO and invalidate CRBs points to this window
+	 *       and issue CRB Kill to stop all pending requests. Need only
+	 *       if there is a bug in NX or fault handling in kernel.
+	 */
+	if (creds < window->wcreds_max) {
+		val = 0;
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(msecs_to_jiffies(10));
+		count++;
+		/*
+		 * Process can not close send window until all credits are
+		 * returned.
+		 */
+		if (!(count % 1000))
+			pr_warn_ratelimited("VAS: pid %d stuck. Waiting for credits returned for Window(%d). creds %d, Retries %d\n",
+				vas_window_pid(window), window->winid,
+				creds, count);
+
+		goto retry;
+	}
+}
+
+/*
+ * Wait for the window to go to "not-busy" state. It should only take a
+ * short time to queue a CRB, so window should not be busy for too long.
+ * Trying 5ms intervals.
+ */
+static void poll_window_busy_state(struct vas_window *window)
+{
+	int busy;
+	u64 val;
+	int count = 0;
+
+retry:
+	val = read_hvwc_reg(window, VREG(WIN_STATUS));
+	busy = GET_FIELD(VAS_WIN_BUSY, val);
+	if (busy) {
+		val = 0;
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(msecs_to_jiffies(10));
+		count++;
+		/*
+		 * Takes around few milliseconds to process all pending
+		 * requests.
+		 */
+		if (!(count % 1000))
+			pr_warn_ratelimited("VAS: pid %d stuck. Window (ID=%d) is in busy state. Retries %d\n",
+				vas_window_pid(window), window->winid, count);
+
+		goto retry;
+	}
+}
+
+/*
+ * Have the hardware cast a window out of cache and wait for it to
+ * be completed.
+ *
+ * NOTE: It can take a relatively long time to cast the window context
+ *	out of the cache. It is not strictly necessary to cast out if:
+ *
+ *	- we clear the "Pin Window" bit (so hardware is free to evict)
+ *
+ *	- we re-initialize the window context when it is reassigned.
+ *
+ *	We do the former in vas_win_close() and latter in vas_win_open().
+ *	So, ignoring the cast-out for now. We can add it as needed. If
+ *	casting out becomes necessary we should consider offloading the
+ *	job to a worker thread, so the window close can proceed quickly.
+ */
+static void poll_window_castout(struct vas_window *window)
+{
+	/* stub for now */
+}
+
+/*
+ * Unpin and close a window so no new requests are accepted and the
+ * hardware can evict this window from cache if necessary.
+ */
+static void unpin_close_window(struct vas_window *window)
+{
+	u64 val;
+
+	val = read_hvwc_reg(window, VREG(WINCTL));
+	val = SET_FIELD(VAS_WINCTL_PIN, val, 0);
+	val = SET_FIELD(VAS_WINCTL_OPEN, val, 0);
+	write_hvwc_reg(window, VREG(WINCTL), val);
+}
+
+/*
+ * Close a window.
+ *
+ * See Section 1.12.1 of VAS workbook v1.05 for details on closing window:
+ *	- Disable new paste operations (unmap paste address)
+ *	- Poll for the "Window Busy" bit to be cleared
+ *	- Clear the Open/Enable bit for the Window.
+ *	- Poll for return of window Credits (implies FIFO empty for Rx win?)
+ *	- Unpin and cast window context out of cache
+ *
+ * Besides the hardware, kernel has some bookkeeping of course.
+ */
+int vas_win_close(struct vas_window *window)
+{
+	if (!window)
+		return 0;
+
+	if (!window->tx_win && atomic_read(&window->num_txwins) != 0) {
+		pr_devel("Attempting to close an active Rx window!\n");
+		WARN_ON_ONCE(1);
+		return -EBUSY;
+	}
+
+	unmap_paste_region(window);
+
+	poll_window_busy_state(window);
+
+	unpin_close_window(window);
+
+	poll_window_credits(window);
+
+	clear_vinst_win(window);
+
+	poll_window_castout(window);
+
+	/* if send window, drop reference to matching receive window */
+	if (window->tx_win) {
+		if (window->user_win) {
+			/* Drop references to pid. tgid and mm */
+			put_pid(window->pid);
+			put_pid(window->tgid);
+			if (window->mm) {
+				mm_context_remove_vas_window(window->mm);
+				mmdrop(window->mm);
+			}
+		}
+		put_rx_win(window->rxwin);
+	}
+
+	vas_window_free(window);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vas_win_close);
+
+/*
+ * Return credit for the given window.
+ * Send windows and fault window uses credit mechanism as follows:
+ *
+ * Send windows:
+ * - The default number of credits available for each send window is
+ *   1024. It means 1024 requests can be issued asynchronously at the
+ *   same time. If the credit is not available, that request will be
+ *   returned with RMA_Busy.
+ * - One credit is taken when NX request is issued.
+ * - This credit is returned after NX processed that request.
+ * - If NX encounters translation error, kernel will return the
+ *   credit on the specific send window after processing the fault CRB.
+ *
+ * Fault window:
+ * - The total number credits available is FIFO_SIZE/CRB_SIZE.
+ *   Means 4MB/128 in the current implementation. If credit is not
+ *   available, RMA_Reject is returned.
+ * - A credit is taken when NX pastes CRB in fault FIFO.
+ * - The kernel with return credit on fault window after reading entry
+ *   from fault FIFO.
+ */
+void vas_return_credit(struct vas_window *window, bool tx)
+{
+	uint64_t val;
+
+	val = 0ULL;
+	if (tx) { /* send window */
+		val = SET_FIELD(VAS_TX_WCRED, val, 1);
+		write_hvwc_reg(window, VREG(TX_WCRED_ADDER), val);
+	} else {
+		val = SET_FIELD(VAS_LRX_WCRED, val, 1);
+		write_hvwc_reg(window, VREG(LRX_WCRED_ADDER), val);
+	}
+}
+
+struct vas_window *vas_pswid_to_window(struct vas_instance *vinst,
+		uint32_t pswid)
+{
+	struct vas_window *window;
+	int winid;
+
+	if (!pswid) {
+		pr_devel("%s: called for pswid 0!\n", __func__);
+		return ERR_PTR(-ESRCH);
+	}
+
+	decode_pswid(pswid, NULL, &winid);
+
+	if (winid >= VAS_WINDOWS_PER_CHIP)
+		return ERR_PTR(-ESRCH);
+
+	/*
+	 * If application closes the window before the hardware
+	 * returns the fault CRB, we should wait in vas_win_close()
+	 * for the pending requests. so the window must be active
+	 * and the process alive.
+	 *
+	 * If its a kernel process, we should not get any faults and
+	 * should not get here.
+	 */
+	window = vinst->windows[winid];
+
+	if (!window) {
+		pr_err("PSWID decode: Could not find window for winid %d pswid %d vinst 0x%p\n",
+			winid, pswid, vinst);
+		return NULL;
+	}
+
+	/*
+	 * Do some sanity checks on the decoded window.  Window should be
+	 * NX GZIP user send window. FTW windows should not incur faults
+	 * since their CRBs are ignored (not queued on FIFO or processed
+	 * by NX).
+	 */
+	if (!window->tx_win || !window->user_win || !window->nx_win ||
+			window->cop == VAS_COP_TYPE_FAULT ||
+			window->cop == VAS_COP_TYPE_FTW) {
+		pr_err("PSWID decode: id %d, tx %d, user %d, nx %d, cop %d\n",
+			winid, window->tx_win, window->user_win,
+			window->nx_win, window->cop);
+		WARN_ON(1);
+	}
+
+	return window;
+}
diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c
new file mode 100644
index 000000000..598e4cd56
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2016-17 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "vas: " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/platform_device.h>
+#include <linux/of_platform.h>
+#include <linux/of_address.h>
+#include <linux/of.h>
+#include <linux/irqdomain.h>
+#include <linux/interrupt.h>
+#include <asm/prom.h>
+#include <asm/xive.h>
+
+#include "vas.h"
+
+DEFINE_MUTEX(vas_mutex);
+static LIST_HEAD(vas_instances);
+
+static DEFINE_PER_CPU(int, cpu_vas_id);
+
+static int vas_irq_fault_window_setup(struct vas_instance *vinst)
+{
+	char devname[64];
+	int rc = 0;
+
+	snprintf(devname, sizeof(devname), "vas-%d", vinst->vas_id);
+	rc = request_threaded_irq(vinst->virq, vas_fault_handler,
+				vas_fault_thread_fn, 0, devname, vinst);
+
+	if (rc) {
+		pr_err("VAS[%d]: Request IRQ(%d) failed with %d\n",
+				vinst->vas_id, vinst->virq, rc);
+		goto out;
+	}
+
+	rc = vas_setup_fault_window(vinst);
+	if (rc)
+		free_irq(vinst->virq, vinst);
+
+out:
+	return rc;
+}
+
+static int init_vas_instance(struct platform_device *pdev)
+{
+	struct device_node *dn = pdev->dev.of_node;
+	struct vas_instance *vinst;
+	struct xive_irq_data *xd;
+	uint32_t chipid, hwirq;
+	struct resource *res;
+	int rc, cpu, vasid;
+
+	rc = of_property_read_u32(dn, "ibm,vas-id", &vasid);
+	if (rc) {
+		pr_err("No ibm,vas-id property for %s?\n", pdev->name);
+		return -ENODEV;
+	}
+
+	rc = of_property_read_u32(dn, "ibm,chip-id", &chipid);
+	if (rc) {
+		pr_err("No ibm,chip-id property for %s?\n", pdev->name);
+		return -ENODEV;
+	}
+
+	if (pdev->num_resources != 4) {
+		pr_err("Unexpected DT configuration for [%s, %d]\n",
+				pdev->name, vasid);
+		return -ENODEV;
+	}
+
+	vinst = kzalloc(sizeof(*vinst), GFP_KERNEL);
+	if (!vinst)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&vinst->node);
+	ida_init(&vinst->ida);
+	mutex_init(&vinst->mutex);
+	vinst->vas_id = vasid;
+	vinst->pdev = pdev;
+
+	res = &pdev->resource[0];
+	vinst->hvwc_bar_start = res->start;
+
+	res = &pdev->resource[1];
+	vinst->uwc_bar_start = res->start;
+
+	res = &pdev->resource[2];
+	vinst->paste_base_addr = res->start;
+
+	res = &pdev->resource[3];
+	if (res->end > 62) {
+		pr_err("Bad 'paste_win_id_shift' in DT, %llx\n", res->end);
+		goto free_vinst;
+	}
+
+	vinst->paste_win_id_shift = 63 - res->end;
+
+	hwirq = xive_native_alloc_irq_on_chip(chipid);
+	if (!hwirq) {
+		pr_err("Inst%d: Unable to allocate global irq for chip %d\n",
+				vinst->vas_id, chipid);
+		return -ENOENT;
+	}
+
+	vinst->virq = irq_create_mapping(NULL, hwirq);
+	if (!vinst->virq) {
+		pr_err("Inst%d: Unable to map global irq %d\n",
+				vinst->vas_id, hwirq);
+		return -EINVAL;
+	}
+
+	xd = irq_get_handler_data(vinst->virq);
+	if (!xd) {
+		pr_err("Inst%d: Invalid virq %d\n",
+				vinst->vas_id, vinst->virq);
+		return -EINVAL;
+	}
+
+	vinst->irq_port = xd->trig_page;
+	pr_devel("Initialized instance [%s, %d] paste_base 0x%llx paste_win_id_shift 0x%llx IRQ %d Port 0x%llx\n",
+			pdev->name, vasid, vinst->paste_base_addr,
+			vinst->paste_win_id_shift, vinst->virq,
+			vinst->irq_port);
+
+	for_each_possible_cpu(cpu) {
+		if (cpu_to_chip_id(cpu) == of_get_ibm_chip_id(dn))
+			per_cpu(cpu_vas_id, cpu) = vasid;
+	}
+
+	mutex_lock(&vas_mutex);
+	list_add(&vinst->node, &vas_instances);
+	mutex_unlock(&vas_mutex);
+
+	spin_lock_init(&vinst->fault_lock);
+	/*
+	 * IRQ and fault handling setup is needed only for user space
+	 * send windows.
+	 */
+	if (vinst->virq) {
+		rc = vas_irq_fault_window_setup(vinst);
+		/*
+		 * Fault window is used only for user space send windows.
+		 * So if vinst->virq is NULL, tx_win_open returns -ENODEV
+		 * for user space.
+		 */
+		if (rc)
+			vinst->virq = 0;
+	}
+
+	vas_instance_init_dbgdir(vinst);
+
+	dev_set_drvdata(&pdev->dev, vinst);
+
+	return 0;
+
+free_vinst:
+	kfree(vinst);
+	return -ENODEV;
+
+}
+
+/*
+ * Although this is read/used multiple times, it is written to only
+ * during initialization.
+ */
+struct vas_instance *find_vas_instance(int vasid)
+{
+	struct list_head *ent;
+	struct vas_instance *vinst;
+
+	mutex_lock(&vas_mutex);
+
+	if (vasid == -1)
+		vasid = per_cpu(cpu_vas_id, smp_processor_id());
+
+	list_for_each(ent, &vas_instances) {
+		vinst = list_entry(ent, struct vas_instance, node);
+		if (vinst->vas_id == vasid) {
+			mutex_unlock(&vas_mutex);
+			return vinst;
+		}
+	}
+	mutex_unlock(&vas_mutex);
+
+	pr_devel("Instance %d not found\n", vasid);
+	return NULL;
+}
+
+int chip_to_vas_id(int chipid)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		if (cpu_to_chip_id(cpu) == chipid)
+			return per_cpu(cpu_vas_id, cpu);
+	}
+	return -1;
+}
+EXPORT_SYMBOL(chip_to_vas_id);
+
+static int vas_probe(struct platform_device *pdev)
+{
+	return init_vas_instance(pdev);
+}
+
+static const struct of_device_id powernv_vas_match[] = {
+	{ .compatible = "ibm,vas",},
+	{},
+};
+
+static struct platform_driver vas_driver = {
+	.driver = {
+		.name = "vas",
+		.of_match_table = powernv_vas_match,
+	},
+	.probe = vas_probe,
+};
+
+static int __init vas_init(void)
+{
+	int found = 0;
+	struct device_node *dn;
+
+	platform_driver_register(&vas_driver);
+
+	for_each_compatible_node(dn, NULL, "ibm,vas") {
+		of_platform_device_create(dn, NULL, NULL);
+		found++;
+	}
+
+	if (!found) {
+		platform_driver_unregister(&vas_driver);
+		return -ENODEV;
+	}
+
+	pr_devel("Found %d instances\n", found);
+
+	return 0;
+}
+device_initcall(vas_init);
diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h
new file mode 100644
index 000000000..1f6e73809
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -0,0 +1,508 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2016-17 IBM Corp.
+ */
+
+#ifndef _VAS_H
+#define _VAS_H
+#include <linux/atomic.h>
+#include <linux/idr.h>
+#include <asm/vas.h>
+#include <linux/io.h>
+#include <linux/dcache.h>
+#include <linux/mutex.h>
+#include <linux/stringify.h>
+
+/*
+ * Overview of Virtual Accelerator Switchboard (VAS).
+ *
+ * VAS is a hardware "switchboard" that allows senders and receivers to
+ * exchange messages with _minimal_ kernel involvment. The receivers are
+ * typically NX coprocessor engines that perform compression or encryption
+ * in hardware, but receivers can also be other software threads.
+ *
+ * Senders are user/kernel threads that submit compression/encryption or
+ * other requests to the receivers. Senders must format their messages as
+ * Coprocessor Request Blocks (CRB)s and submit them using the "copy" and
+ * "paste" instructions which were introduced in Power9.
+ *
+ * A Power node can have (upto?) 8 Power chips. There is one instance of
+ * VAS in each Power9 chip. Each instance of VAS has 64K windows or ports,
+ * Senders and receivers must each connect to a separate window before they
+ * can exchange messages through the switchboard.
+ *
+ * Each window is described by two types of window contexts:
+ *
+ *	Hypervisor Window Context (HVWC) of size VAS_HVWC_SIZE bytes
+ *
+ *	OS/User Window Context (UWC) of size VAS_UWC_SIZE bytes.
+ *
+ * A window context can be viewed as a set of 64-bit registers. The settings
+ * in these registers configure/control/determine the behavior of the VAS
+ * hardware when messages are sent/received through the window. The registers
+ * in the HVWC are configured by the kernel while the registers in the UWC can
+ * be configured by the kernel or by the user space application that is using
+ * the window.
+ *
+ * The HVWCs for all windows on a specific instance of VAS are in a contiguous
+ * range of hardware addresses or Base address region (BAR) referred to as the
+ * HVWC BAR for the instance. Similarly the UWCs for all windows on an instance
+ * are referred to as the UWC BAR for the instance.
+ *
+ * The two BARs for each instance are defined Power9 MMIO Ranges spreadsheet
+ * and available to the kernel in the VAS node's "reg" property in the device
+ * tree:
+ *
+ *	/proc/device-tree/vasm@.../reg
+ *
+ * (see vas_probe() for details on the reg property).
+ *
+ * The kernel maps the HVWC and UWC BAR regions into the kernel address
+ * space (hvwc_map and uwc_map). The kernel can then access the window
+ * contexts of a specific window using:
+ *
+ *	 hvwc = hvwc_map + winid * VAS_HVWC_SIZE.
+ *	 uwc = uwc_map + winid * VAS_UWC_SIZE.
+ *
+ * where winid is the window index (0..64K).
+ *
+ * As mentioned, a window context is used to "configure" a window. Besides
+ * this configuration address, each _send_ window also has a unique hardware
+ * "paste" address that is used to submit requests/CRBs (see vas_paste_crb()).
+ *
+ * The hardware paste address for a window is computed using the "paste
+ * base address" and "paste win id shift" reg properties in the VAS device
+ * tree node using:
+ *
+ *	paste_addr = paste_base + ((winid << paste_win_id_shift))
+ *
+ * (again, see vas_probe() for ->paste_base_addr and ->paste_win_id_shift).
+ *
+ * The kernel maps this hardware address into the sender's address space
+ * after which they can use the 'paste' instruction (new in Power9) to
+ * send a message (submit a request aka CRB) to the coprocessor.
+ *
+ * NOTE: In the initial version, senders can only in-kernel drivers/threads.
+ *	 Support for user space threads will be added in follow-on patches.
+ *
+ * TODO: Do we need to map the UWC into user address space so they can return
+ *	 credits? Its NA for NX but may be needed for other receive windows.
+ *
+ */
+
+#define VAS_WINDOWS_PER_CHIP		(64 << 10)
+
+/*
+ * Hypervisor and OS/USer Window Context sizes
+ */
+#define VAS_HVWC_SIZE			512
+#define VAS_UWC_SIZE			PAGE_SIZE
+
+/*
+ * Initial per-process credits.
+ * Max send window credits:    4K-1 (12-bits in VAS_TX_WCRED)
+ *
+ * TODO: Needs tuning for per-process credits
+ */
+#define VAS_TX_WCREDS_MAX		((4 << 10) - 1)
+#define VAS_WCREDS_DEFAULT		(1 << 10)
+
+/*
+ * VAS Window Context Register Offsets and bitmasks.
+ * See Section 3.1.4 of VAS Work book
+ */
+#define VAS_LPID_OFFSET			0x010
+#define VAS_LPID			PPC_BITMASK(0, 11)
+
+#define VAS_PID_OFFSET			0x018
+#define VAS_PID_ID			PPC_BITMASK(0, 19)
+
+#define VAS_XLATE_MSR_OFFSET		0x020
+#define VAS_XLATE_MSR_DR		PPC_BIT(0)
+#define VAS_XLATE_MSR_TA		PPC_BIT(1)
+#define VAS_XLATE_MSR_PR		PPC_BIT(2)
+#define VAS_XLATE_MSR_US		PPC_BIT(3)
+#define VAS_XLATE_MSR_HV		PPC_BIT(4)
+#define VAS_XLATE_MSR_SF		PPC_BIT(5)
+
+#define VAS_XLATE_LPCR_OFFSET		0x028
+#define VAS_XLATE_LPCR_PAGE_SIZE	PPC_BITMASK(0, 2)
+#define VAS_XLATE_LPCR_ISL		PPC_BIT(3)
+#define VAS_XLATE_LPCR_TC		PPC_BIT(4)
+#define VAS_XLATE_LPCR_SC		PPC_BIT(5)
+
+#define VAS_XLATE_CTL_OFFSET		0x030
+#define VAS_XLATE_MODE			PPC_BITMASK(0, 1)
+
+#define VAS_AMR_OFFSET			0x040
+#define VAS_AMR				PPC_BITMASK(0, 63)
+
+#define VAS_SEIDR_OFFSET		0x048
+#define VAS_SEIDR			PPC_BITMASK(0, 63)
+
+#define VAS_FAULT_TX_WIN_OFFSET		0x050
+#define VAS_FAULT_TX_WIN		PPC_BITMASK(48, 63)
+
+#define VAS_OSU_INTR_SRC_RA_OFFSET	0x060
+#define VAS_OSU_INTR_SRC_RA		PPC_BITMASK(8, 63)
+
+#define VAS_HV_INTR_SRC_RA_OFFSET	0x070
+#define VAS_HV_INTR_SRC_RA		PPC_BITMASK(8, 63)
+
+#define VAS_PSWID_OFFSET		0x078
+#define VAS_PSWID_EA_HANDLE		PPC_BITMASK(0, 31)
+
+#define VAS_SPARE1_OFFSET		0x080
+#define VAS_SPARE2_OFFSET		0x088
+#define VAS_SPARE3_OFFSET		0x090
+#define VAS_SPARE4_OFFSET		0x130
+#define VAS_SPARE5_OFFSET		0x160
+#define VAS_SPARE6_OFFSET		0x188
+
+#define VAS_LFIFO_BAR_OFFSET		0x0A0
+#define VAS_LFIFO_BAR			PPC_BITMASK(8, 53)
+#define VAS_PAGE_MIGRATION_SELECT	PPC_BITMASK(54, 56)
+
+#define VAS_LDATA_STAMP_CTL_OFFSET	0x0A8
+#define VAS_LDATA_STAMP			PPC_BITMASK(0, 1)
+#define VAS_XTRA_WRITE			PPC_BIT(2)
+
+#define VAS_LDMA_CACHE_CTL_OFFSET	0x0B0
+#define VAS_LDMA_TYPE			PPC_BITMASK(0, 1)
+#define VAS_LDMA_FIFO_DISABLE		PPC_BIT(2)
+
+#define VAS_LRFIFO_PUSH_OFFSET		0x0B8
+#define VAS_LRFIFO_PUSH			PPC_BITMASK(0, 15)
+
+#define VAS_CURR_MSG_COUNT_OFFSET	0x0C0
+#define VAS_CURR_MSG_COUNT		PPC_BITMASK(0, 7)
+
+#define VAS_LNOTIFY_AFTER_COUNT_OFFSET	0x0C8
+#define VAS_LNOTIFY_AFTER_COUNT		PPC_BITMASK(0, 7)
+
+#define VAS_LRX_WCRED_OFFSET		0x0E0
+#define VAS_LRX_WCRED			PPC_BITMASK(0, 15)
+
+#define VAS_LRX_WCRED_ADDER_OFFSET	0x190
+#define VAS_LRX_WCRED_ADDER		PPC_BITMASK(0, 15)
+
+#define VAS_TX_WCRED_OFFSET		0x0F0
+#define VAS_TX_WCRED			PPC_BITMASK(4, 15)
+
+#define VAS_TX_WCRED_ADDER_OFFSET	0x1A0
+#define VAS_TX_WCRED_ADDER		PPC_BITMASK(4, 15)
+
+#define VAS_LFIFO_SIZE_OFFSET		0x100
+#define VAS_LFIFO_SIZE			PPC_BITMASK(0, 3)
+
+#define VAS_WINCTL_OFFSET		0x108
+#define VAS_WINCTL_OPEN			PPC_BIT(0)
+#define VAS_WINCTL_REJ_NO_CREDIT	PPC_BIT(1)
+#define VAS_WINCTL_PIN			PPC_BIT(2)
+#define VAS_WINCTL_TX_WCRED_MODE	PPC_BIT(3)
+#define VAS_WINCTL_RX_WCRED_MODE	PPC_BIT(4)
+#define VAS_WINCTL_TX_WORD_MODE		PPC_BIT(5)
+#define VAS_WINCTL_RX_WORD_MODE		PPC_BIT(6)
+#define VAS_WINCTL_RSVD_TXBUF		PPC_BIT(7)
+#define VAS_WINCTL_THRESH_CTL		PPC_BITMASK(8, 9)
+#define VAS_WINCTL_FAULT_WIN		PPC_BIT(10)
+#define VAS_WINCTL_NX_WIN		PPC_BIT(11)
+
+#define VAS_WIN_STATUS_OFFSET		0x110
+#define VAS_WIN_BUSY			PPC_BIT(1)
+
+#define VAS_WIN_CTX_CACHING_CTL_OFFSET	0x118
+#define VAS_CASTOUT_REQ			PPC_BIT(0)
+#define VAS_PUSH_TO_MEM			PPC_BIT(1)
+#define VAS_WIN_CACHE_STATUS		PPC_BIT(4)
+
+#define VAS_TX_RSVD_BUF_COUNT_OFFSET	0x120
+#define VAS_RXVD_BUF_COUNT		PPC_BITMASK(58, 63)
+
+#define VAS_LRFIFO_WIN_PTR_OFFSET	0x128
+#define VAS_LRX_WIN_ID			PPC_BITMASK(0, 15)
+
+/*
+ * Local Notification Control Register controls what happens in _response_
+ * to a paste command and hence applies only to receive windows.
+ */
+#define VAS_LNOTIFY_CTL_OFFSET		0x138
+#define VAS_NOTIFY_DISABLE		PPC_BIT(0)
+#define VAS_INTR_DISABLE		PPC_BIT(1)
+#define VAS_NOTIFY_EARLY		PPC_BIT(2)
+#define VAS_NOTIFY_OSU_INTR		PPC_BIT(3)
+
+#define VAS_LNOTIFY_PID_OFFSET		0x140
+#define VAS_LNOTIFY_PID			PPC_BITMASK(0, 19)
+
+#define VAS_LNOTIFY_LPID_OFFSET		0x148
+#define VAS_LNOTIFY_LPID		PPC_BITMASK(0, 11)
+
+#define VAS_LNOTIFY_TID_OFFSET		0x150
+#define VAS_LNOTIFY_TID			PPC_BITMASK(0, 15)
+
+#define VAS_LNOTIFY_SCOPE_OFFSET	0x158
+#define VAS_LNOTIFY_MIN_SCOPE		PPC_BITMASK(0, 1)
+#define VAS_LNOTIFY_MAX_SCOPE		PPC_BITMASK(2, 3)
+
+#define VAS_NX_UTIL_OFFSET		0x1B0
+#define VAS_NX_UTIL			PPC_BITMASK(0, 63)
+
+/* SE: Side effects */
+#define VAS_NX_UTIL_SE_OFFSET		0x1B8
+#define VAS_NX_UTIL_SE			PPC_BITMASK(0, 63)
+
+#define VAS_NX_UTIL_ADDER_OFFSET	0x180
+#define VAS_NX_UTIL_ADDER		PPC_BITMASK(32, 63)
+
+/*
+ * VREG(x):
+ * Expand a register's short name (eg: LPID) into two parameters:
+ *	- the register's short name in string form ("LPID"), and
+ *	- the name of the macro (eg: VAS_LPID_OFFSET), defining the
+ *	  register's offset in the window context
+ */
+#define VREG_SFX(n, s)	__stringify(n), VAS_##n##s
+#define VREG(r)		VREG_SFX(r, _OFFSET)
+
+/*
+ * Local Notify Scope Control Register. (Receive windows only).
+ */
+enum vas_notify_scope {
+	VAS_SCOPE_LOCAL,
+	VAS_SCOPE_GROUP,
+	VAS_SCOPE_VECTORED_GROUP,
+	VAS_SCOPE_UNUSED,
+};
+
+/*
+ * Local DMA Cache Control Register (Receive windows only).
+ */
+enum vas_dma_type {
+	VAS_DMA_TYPE_INJECT,
+	VAS_DMA_TYPE_WRITE,
+};
+
+/*
+ * Local Notify Scope Control Register. (Receive windows only).
+ * Not applicable to NX receive windows.
+ */
+enum vas_notify_after_count {
+	VAS_NOTIFY_AFTER_256 = 0,
+	VAS_NOTIFY_NONE,
+	VAS_NOTIFY_AFTER_2
+};
+
+/*
+ * NX can generate an interrupt for multiple faults and expects kernel
+ * to process all of them. So read all valid CRB entries until find the
+ * invalid one. So use pswid which is pasted by NX and ccw[0] (reserved
+ * bit in BE) to check valid CRB. CCW[0] will not be touched by user
+ * space. Application gets CRB formt error if it updates this bit.
+ *
+ * Invalidate FIFO during allocation and process all entries from last
+ * successful read until finds invalid pswid and ccw[0] values.
+ * After reading each CRB entry from fault FIFO, the kernel invalidate
+ * it by updating pswid with FIFO_INVALID_ENTRY and CCW[0] with
+ * CCW0_INVALID.
+ */
+#define FIFO_INVALID_ENTRY	0xffffffff
+#define CCW0_INVALID		1
+
+/*
+ * One per instance of VAS. Each instance will have a separate set of
+ * receive windows, one per coprocessor type.
+ *
+ * See also function header of set_vinst_win() for details on ->windows[]
+ * and ->rxwin[] tables.
+ */
+struct vas_instance {
+	int vas_id;
+	struct ida ida;
+	struct list_head node;
+	struct platform_device *pdev;
+
+	u64 hvwc_bar_start;
+	u64 uwc_bar_start;
+	u64 paste_base_addr;
+	u64 paste_win_id_shift;
+
+	u64 irq_port;
+	int virq;
+	int fault_crbs;
+	int fault_fifo_size;
+	int fifo_in_progress;	/* To wake up thread or return IRQ_HANDLED */
+	spinlock_t fault_lock;	/* Protects fifo_in_progress update */
+	void *fault_fifo;
+	struct vas_window *fault_win; /* Fault window */
+
+	struct mutex mutex;
+	struct vas_window *rxwin[VAS_COP_TYPE_MAX];
+	struct vas_window *windows[VAS_WINDOWS_PER_CHIP];
+
+	char *dbgname;
+	struct dentry *dbgdir;
+};
+
+/*
+ * In-kernel state a VAS window. One per window.
+ */
+struct vas_window {
+	/* Fields common to send and receive windows */
+	struct vas_instance *vinst;
+	int winid;
+	bool tx_win;		/* True if send window */
+	bool nx_win;		/* True if NX window */
+	bool user_win;		/* True if user space window */
+	void *hvwc_map;		/* HV window context */
+	void *uwc_map;		/* OS/User window context */
+	struct pid *pid;	/* Linux process id of owner */
+	struct pid *tgid;	/* Thread group ID of owner */
+	struct mm_struct *mm;	/* Linux process mm_struct */
+	int wcreds_max;		/* Window credits */
+
+	char *dbgname;
+	struct dentry *dbgdir;
+
+	/* Fields applicable only to send windows */
+	void *paste_kaddr;
+	char *paste_addr_name;
+	struct vas_window *rxwin;
+
+	/* Feilds applicable only to receive windows */
+	enum vas_cop_type cop;
+	atomic_t num_txwins;
+};
+
+/*
+ * Container for the hardware state of a window. One per-window.
+ *
+ * A VAS Window context is a 512-byte area in the hardware that contains
+ * a set of 64-bit registers. Individual bit-fields in these registers
+ * determine the configuration/operation of the hardware. struct vas_winctx
+ * is a container for the register fields in the window context.
+ */
+struct vas_winctx {
+	u64 rx_fifo;
+	int rx_fifo_size;
+	int wcreds_max;
+	int rsvd_txbuf_count;
+
+	bool user_win;
+	bool nx_win;
+	bool fault_win;
+	bool rsvd_txbuf_enable;
+	bool pin_win;
+	bool rej_no_credit;
+	bool tx_wcred_mode;
+	bool rx_wcred_mode;
+	bool tx_word_mode;
+	bool rx_word_mode;
+	bool data_stamp;
+	bool xtra_write;
+	bool notify_disable;
+	bool intr_disable;
+	bool fifo_disable;
+	bool notify_early;
+	bool notify_os_intr_reg;
+
+	int lpid;
+	int pidr;		/* value from SPRN_PID, not linux pid */
+	int lnotify_lpid;
+	int lnotify_pid;
+	int lnotify_tid;
+	u32 pswid;
+	int rx_win_id;
+	int fault_win_id;
+	int tc_mode;
+
+	u64 irq_port;
+
+	enum vas_dma_type dma_type;
+	enum vas_notify_scope min_scope;
+	enum vas_notify_scope max_scope;
+	enum vas_notify_after_count notify_after_count;
+};
+
+extern struct mutex vas_mutex;
+
+extern struct vas_instance *find_vas_instance(int vasid);
+extern void vas_init_dbgdir(void);
+extern void vas_instance_init_dbgdir(struct vas_instance *vinst);
+extern void vas_window_init_dbgdir(struct vas_window *win);
+extern void vas_window_free_dbgdir(struct vas_window *win);
+extern int vas_setup_fault_window(struct vas_instance *vinst);
+extern irqreturn_t vas_fault_thread_fn(int irq, void *data);
+extern irqreturn_t vas_fault_handler(int irq, void *dev_id);
+extern void vas_return_credit(struct vas_window *window, bool tx);
+extern struct vas_window *vas_pswid_to_window(struct vas_instance *vinst,
+						uint32_t pswid);
+extern void vas_win_paste_addr(struct vas_window *window, u64 *addr,
+					int *len);
+
+static inline int vas_window_pid(struct vas_window *window)
+{
+	return pid_vnr(window->pid);
+}
+
+static inline void vas_log_write(struct vas_window *win, char *name,
+			void *regptr, u64 val)
+{
+	if (val)
+		pr_debug("%swin #%d: %s reg %p, val 0x%016llx\n",
+				win->tx_win ? "Tx" : "Rx", win->winid, name,
+				regptr, val);
+}
+
+static inline void write_uwc_reg(struct vas_window *win, char *name,
+			s32 reg, u64 val)
+{
+	void *regptr;
+
+	regptr = win->uwc_map + reg;
+	vas_log_write(win, name, regptr, val);
+
+	out_be64(regptr, val);
+}
+
+static inline void write_hvwc_reg(struct vas_window *win, char *name,
+			s32 reg, u64 val)
+{
+	void *regptr;
+
+	regptr = win->hvwc_map + reg;
+	vas_log_write(win, name, regptr, val);
+
+	out_be64(regptr, val);
+}
+
+static inline u64 read_hvwc_reg(struct vas_window *win,
+			char *name __maybe_unused, s32 reg)
+{
+	return in_be64(win->hvwc_map+reg);
+}
+
+/*
+ * Encode/decode the Partition Send Window ID (PSWID) for a window in
+ * a way that we can uniquely identify any window in the system. i.e.
+ * we should be able to locate the 'struct vas_window' given the PSWID.
+ *
+ *	Bits	Usage
+ *	0:7	VAS id (8 bits)
+ *	8:15	Unused, 0 (3 bits)
+ *	16:31	Window id (16 bits)
+ */
+static inline u32 encode_pswid(int vasid, int winid)
+{
+	return ((u32)winid | (vasid << (31 - 7)));
+}
+
+static inline void decode_pswid(u32 pswid, int *vasid, int *winid)
+{
+	if (vasid)
+		*vasid = pswid >> (31 - 7) & 0xFF;
+
+	if (winid)
+		*winid = pswid & 0xFFFF;
+}
+#endif /* _VAS_H */