author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-06 01:02:30 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-06 01:02:30 +0000
commit    76cb841cb886eef6b3bee341a2266c76578724ad (patch)
tree      f5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /arch/powerpc/platforms/powernv
parent    Initial commit. (diff)
Adding upstream version 4.19.249.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/powerpc/platforms/powernv')
-rw-r--r--  arch/powerpc/platforms/powernv/Kconfig | 55
-rw-r--r--  arch/powerpc/platforms/powernv/Makefile | 19
-rw-r--r--  arch/powerpc/platforms/powernv/copy-paste.h | 46
-rw-r--r--  arch/powerpc/platforms/powernv/eeh-powernv.c | 1752
-rw-r--r--  arch/powerpc/platforms/powernv/idle.c | 847
-rw-r--r--  arch/powerpc/platforms/powernv/memtrace.c | 359
-rw-r--r--  arch/powerpc/platforms/powernv/npu-dma.c | 1012
-rw-r--r--  arch/powerpc/platforms/powernv/ocxl.c | 515
-rw-r--r--  arch/powerpc/platforms/powernv/opal-async.c | 294
-rw-r--r--  arch/powerpc/platforms/powernv/opal-dump.c | 457
-rw-r--r--  arch/powerpc/platforms/powernv/opal-elog.c | 338
-rw-r--r--  arch/powerpc/platforms/powernv/opal-flash.c | 566
-rw-r--r--  arch/powerpc/platforms/powernv/opal-hmi.c | 348
-rw-r--r--  arch/powerpc/platforms/powernv/opal-imc.c | 336
-rw-r--r--  arch/powerpc/platforms/powernv/opal-irqchip.c | 317
-rw-r--r--  arch/powerpc/platforms/powernv/opal-kmsg.c | 51
-rw-r--r--  arch/powerpc/platforms/powernv/opal-lpc.c | 422
-rw-r--r--  arch/powerpc/platforms/powernv/opal-memory-errors.c | 147
-rw-r--r--  arch/powerpc/platforms/powernv/opal-msglog.c | 142
-rw-r--r--  arch/powerpc/platforms/powernv/opal-nvram.c | 117
-rw-r--r--  arch/powerpc/platforms/powernv/opal-power.c | 179
-rw-r--r--  arch/powerpc/platforms/powernv/opal-powercap.c | 244
-rw-r--r--  arch/powerpc/platforms/powernv/opal-prd.c | 440
-rw-r--r--  arch/powerpc/platforms/powernv/opal-psr.c | 175
-rw-r--r--  arch/powerpc/platforms/powernv/opal-rtc.c | 87
-rw-r--r--  arch/powerpc/platforms/powernv/opal-sensor-groups.c | 240
-rw-r--r--  arch/powerpc/platforms/powernv/opal-sensor.c | 143
-rw-r--r--  arch/powerpc/platforms/powernv/opal-sysparam.c | 307
-rw-r--r--  arch/powerpc/platforms/powernv/opal-tracepoints.c | 88
-rw-r--r--  arch/powerpc/platforms/powernv/opal-wrappers.S | 333
-rw-r--r--  arch/powerpc/platforms/powernv/opal-xscom.c | 140
-rw-r--r--  arch/powerpc/platforms/powernv/opal.c | 1118
-rw-r--r--  arch/powerpc/platforms/powernv/pci-cxl.c | 178
-rw-r--r--  arch/powerpc/platforms/powernv/pci-ioda-tce.c | 406
-rw-r--r--  arch/powerpc/platforms/powernv/pci-ioda.c | 4090
-rw-r--r--  arch/powerpc/platforms/powernv/pci.c | 1129
-rw-r--r--  arch/powerpc/platforms/powernv/pci.h | 274
-rw-r--r--  arch/powerpc/platforms/powernv/powernv.h | 33
-rw-r--r--  arch/powerpc/platforms/powernv/rng.c | 198
-rw-r--r--  arch/powerpc/platforms/powernv/setup.c | 474
-rw-r--r--  arch/powerpc/platforms/powernv/smp.c | 442
-rw-r--r--  arch/powerpc/platforms/powernv/subcore-asm.S | 95
-rw-r--r--  arch/powerpc/platforms/powernv/subcore.c | 435
-rw-r--r--  arch/powerpc/platforms/powernv/subcore.h | 25
-rw-r--r--  arch/powerpc/platforms/powernv/vas-debug.c | 220
-rw-r--r--  arch/powerpc/platforms/powernv/vas-trace.h | 113
-rw-r--r--  arch/powerpc/platforms/powernv/vas-window.c | 1279
-rw-r--r--  arch/powerpc/platforms/powernv/vas.c | 179
-rw-r--r--  arch/powerpc/platforms/powernv/vas.h | 479
49 files changed, 21683 insertions(+), 0 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
new file mode 100644
index 000000000..f8dc98d3d
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: GPL-2.0
+config PPC_POWERNV
+ depends on PPC64 && PPC_BOOK3S
+ bool "IBM PowerNV (Non-Virtualized) platform support"
+ select PPC_NATIVE
+ select PPC_XICS
+ select PPC_ICP_NATIVE
+ select PPC_XIVE_NATIVE
+ select PPC_P7_NAP
+ select PCI
+ select PCI_MSI
+ select EPAPR_BOOT
+ select PPC_INDIRECT_PIO
+ select PPC_UDBG_16550
+ select PPC_SCOM
+ select ARCH_RANDOM
+ select CPU_FREQ
+ select CPU_FREQ_GOV_PERFORMANCE
+ select CPU_FREQ_GOV_POWERSAVE
+ select CPU_FREQ_GOV_USERSPACE
+ select CPU_FREQ_GOV_ONDEMAND
+ select CPU_FREQ_GOV_CONSERVATIVE
+ select PPC_DOORBELL
+ select MMU_NOTIFIER
+ select FORCE_SMP
+ default y
+
+config OPAL_PRD
+ tristate 'OPAL PRD driver'
+ depends on PPC_POWERNV
+ help
+ This enables the opal-prd driver, a facility to run processor
+ recovery diagnostics on OpenPower machines.
+
+config PPC_MEMTRACE
+ bool "Enable removal of RAM from kernel mappings for tracing"
+ depends on PPC_POWERNV && MEMORY_HOTREMOVE
+ default n
+ help
+ Enabling this option allows for the removal of memory (RAM)
+ from the kernel mappings to be used for hardware tracing.
+
+config PPC_VAS
+ bool "IBM Virtual Accelerator Switchboard (VAS)"
+ depends on PPC_POWERNV && PPC_64K_PAGES
+ default y
+ help
+ This enables support for IBM Virtual Accelerator Switchboard (VAS).
+
+ VAS allows accelerators in co-processors like NX-GZIP and NX-842
+ to be accessible to kernel subsystems and user processes.
+
+ VAS adapters are found in POWER9 based systems.
+
+ If unsure, say N.
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
new file mode 100644
index 000000000..b540ce8ee
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y += setup.o opal-wrappers.o opal.o opal-async.o idle.o
+obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
+obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
+obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
+obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o
+
+obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o
+obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
+obj-$(CONFIG_CXL_BASE) += pci-cxl.o
+obj-$(CONFIG_EEH) += eeh-powernv.o
+obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
+obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o
+obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o
+obj-$(CONFIG_OPAL_PRD) += opal-prd.o
+obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
+obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o
+obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o
+obj-$(CONFIG_OCXL_BASE) += ocxl.o
diff --git a/arch/powerpc/platforms/powernv/copy-paste.h b/arch/powerpc/platforms/powernv/copy-paste.h
new file mode 100644
index 000000000..cb36f9fbc
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/copy-paste.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2016-17 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/ppc-opcode.h>
+#include <asm/reg.h>
+
+/*
+ * Copy/paste instructions:
+ *
+ * copy RA,RB
+ * Copy contents of address (RA) + effective_address(RB)
+ * to internal copy-buffer.
+ *
+ * paste RA,RB
+ * Paste contents of internal copy-buffer to the address
+ * (RA) + effective_address(RB)
+ */
+static inline int vas_copy(void *crb, int offset)
+{
+ asm volatile(PPC_COPY(%0, %1)";"
+ :
+ : "b" (offset), "b" (crb)
+ : "memory");
+
+ return 0;
+}
+
+static inline int vas_paste(void *paste_address, int offset)
+{
+ u32 cr;
+
+ cr = 0;
+ asm volatile(PPC_PASTE(%1, %2)";"
+ "mfocrf %0, 0x80;"
+ : "=r" (cr)
+ : "b" (offset), "b" (paste_address)
+ : "memory", "cr0");
+
+ /* We mask with 0xE to ignore SO */
+ return (cr >> CR0_SHIFT) & 0xE;
+}
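For illustration, a minimal sketch (not from this patch) of how a caller such as the VAS window code might pair these two helpers to submit a coprocessor request block (CRB): vas_copy() stages the CRB in the core's copy buffer, and vas_paste() pushes it to a window's paste address, returning the masked CR0 field set by the paste instruction. The function name and the exact CR0 success value below are assumptions and should be checked against vas-window.c and the ISA.

    /* Illustrative only; assumes copy-paste.h above is included. */
    #include <linux/errno.h>

    static int example_submit_crb(void *paste_addr, void *crb)
    {
            int rc;

            vas_copy(crb, 0);               /* stage the CRB in the copy buffer */
            rc = vas_paste(paste_addr, 0);  /* push it; rc is the masked CR0 field */

            /* vas-window.c treats a masked CR0 value of 2 as success; verify there. */
            return (rc == 2) ? 0 : -EAGAIN;
    }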
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
new file mode 100644
index 000000000..9dd5b8909
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -0,0 +1,1752 @@
+/*
+ * This file implements the platform-dependent EEH operations on the
+ * powernv platform, where the kernel itself runs in hypervisor mode
+ * (i.e. non-virtualized, on bare metal).
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/atomic.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/msi.h>
+#include <linux/of.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/firmware.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/machdep.h>
+#include <asm/msi_bitmap.h>
+#include <asm/opal.h>
+#include <asm/ppc-pci.h>
+#include <asm/pnv-pci.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+static int eeh_event_irq = -EINVAL;
+
+void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
+{
+ struct pci_dn *pdn = pci_get_pdn(pdev);
+
+ if (!pdev->is_virtfn)
+ return;
+
+ /*
+ * The following operations will fail if VF's sysfs files
+ * aren't created or its resources aren't finalized.
+ */
+ eeh_add_device_early(pdn);
+ eeh_add_device_late(pdev);
+ eeh_sysfs_add_device(pdev);
+}
+
+static int pnv_eeh_init(void)
+{
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ int max_diag_size = PNV_PCI_DIAG_BUF_SIZE;
+
+ if (!firmware_has_feature(FW_FEATURE_OPAL)) {
+ pr_warn("%s: OPAL is required !\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ /* Set probe mode */
+ eeh_add_flag(EEH_PROBE_MODE_DEV);
+
+ /*
+ * P7IOC blocks PCI config access to frozen PE, but PHB3
+ * doesn't do that. So we have to selectively enable I/O
+ * prior to collecting error log.
+ */
+ list_for_each_entry(hose, &hose_list, list_node) {
+ phb = hose->private_data;
+
+ if (phb->model == PNV_PHB_MODEL_P7IOC)
+ eeh_add_flag(EEH_ENABLE_IO_FOR_LOG);
+
+ if (phb->diag_data_size > max_diag_size)
+ max_diag_size = phb->diag_data_size;
+
+ /*
+ * PE#0 should be regarded as valid by EEH core
+ * if it's not the reserved one. Currently, we
+ * have the reserved PE#255 and PE#127 for PHB3
+ * and P7IOC respectively. So we should regard
+ * PE#0 as valid for PHB3 and P7IOC.
+ */
+ if (phb->ioda.reserved_pe_idx != 0)
+ eeh_add_flag(EEH_VALID_PE_ZERO);
+
+ break;
+ }
+
+ eeh_set_pe_aux_size(max_diag_size);
+ ppc_md.pcibios_bus_add_device = pnv_pcibios_bus_add_device;
+
+ return 0;
+}
+
+static irqreturn_t pnv_eeh_event(int irq, void *data)
+{
+ /*
+ * We simply send a special EEH event if EEH has been
+ * enabled. We don't care about EEH events until we've
+ * finished processing the outstanding ones. Event processing
+ * gets unmasked in next_error() if EEH is enabled.
+ */
+ disable_irq_nosync(irq);
+
+ if (eeh_enabled())
+ eeh_send_failure_event(NULL);
+
+ return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static ssize_t pnv_eeh_ei_write(struct file *filp,
+ const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct pci_controller *hose = filp->private_data;
+ struct eeh_pe *pe;
+ int pe_no, type, func;
+ unsigned long addr, mask;
+ char buf[50];
+ int ret;
+
+ if (!eeh_ops || !eeh_ops->err_inject)
+ return -ENXIO;
+
+ /* Copy over argument buffer */
+ ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count);
+ if (!ret)
+ return -EFAULT;
+
+ /* Retrieve parameters */
+ ret = sscanf(buf, "%x:%x:%x:%lx:%lx",
+ &pe_no, &type, &func, &addr, &mask);
+ if (ret != 5)
+ return -EINVAL;
+
+ /* Retrieve PE */
+ pe = eeh_pe_get(hose, pe_no, 0);
+ if (!pe)
+ return -ENODEV;
+
+ /* Do error injection */
+ ret = eeh_ops->err_inject(pe, type, func, addr, mask);
+ return ret < 0 ? ret : count;
+}
+
+static const struct file_operations pnv_eeh_ei_fops = {
+ .open = simple_open,
+ .llseek = no_llseek,
+ .write = pnv_eeh_ei_write,
+};
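For reference, the err_injct debugfs file backed by the write handler above expects a single line of the form pe_no:type:func:addr:mask, all in hexadecimal, matching the sscanf() format string. A hedged userspace sketch follows; the per-PHB directory name (PCI0000 here) and the values written are only examples and must be looked up under /sys/kernel/debug/powerpc/ on the target system.

    /* Illustrative userspace sketch: write one injection request to err_injct. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            /* Hypothetical path; the real per-PHB directory name differs per system. */
            const char *path = "/sys/kernel/debug/powerpc/PCI0000/err_injct";
            const char *cmd = "1:0:0:0:0\n";  /* pe_no:type:func:addr:mask, example values */
            int fd = open(path, O_WRONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (write(fd, cmd, strlen(cmd)) < 0)
                    perror("write");
            close(fd);
            return 0;
    }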
+
+static int pnv_eeh_dbgfs_set(void *data, int offset, u64 val)
+{
+ struct pci_controller *hose = data;
+ struct pnv_phb *phb = hose->private_data;
+
+ out_be64(phb->regs + offset, val);
+ return 0;
+}
+
+static int pnv_eeh_dbgfs_get(void *data, int offset, u64 *val)
+{
+ struct pci_controller *hose = data;
+ struct pnv_phb *phb = hose->private_data;
+
+ *val = in_be64(phb->regs + offset);
+ return 0;
+}
+
+#define PNV_EEH_DBGFS_ENTRY(name, reg) \
+static int pnv_eeh_dbgfs_set_##name(void *data, u64 val) \
+{ \
+ return pnv_eeh_dbgfs_set(data, reg, val); \
+} \
+ \
+static int pnv_eeh_dbgfs_get_##name(void *data, u64 *val) \
+{ \
+ return pnv_eeh_dbgfs_get(data, reg, val); \
+} \
+ \
+DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_dbgfs_ops_##name, \
+ pnv_eeh_dbgfs_get_##name, \
+ pnv_eeh_dbgfs_set_##name, \
+ "0x%llx\n")
+
+PNV_EEH_DBGFS_ENTRY(outb, 0xD10);
+PNV_EEH_DBGFS_ENTRY(inbA, 0xD90);
+PNV_EEH_DBGFS_ENTRY(inbB, 0xE10);
+
+#endif /* CONFIG_DEBUG_FS */
+
+/**
+ * pnv_eeh_post_init - EEH platform dependent post initialization
+ *
+ * EEH platform dependent post initialization on powernv. When
+ * the function is called, the EEH PEs and devices should have
+ * been built. If the I/O address cache has been built, EEH is
+ * ready to provide service.
+ */
+int pnv_eeh_post_init(void)
+{
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ int ret = 0;
+
+ /* Probe devices & build address cache */
+ eeh_probe_devices();
+ eeh_addr_cache_build();
+
+ if (eeh_has_flag(EEH_POSTPONED_PROBE)) {
+ eeh_clear_flag(EEH_POSTPONED_PROBE);
+ if (eeh_enabled())
+ pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
+ else
+ pr_info("EEH: No capable adapters found\n");
+ }
+
+ /* Register OPAL event notifier */
+ eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR));
+ if (eeh_event_irq < 0) {
+ pr_err("%s: Can't register OPAL event interrupt (%d)\n",
+ __func__, eeh_event_irq);
+ return eeh_event_irq;
+ }
+
+ ret = request_irq(eeh_event_irq, pnv_eeh_event,
+ IRQ_TYPE_LEVEL_HIGH, "opal-eeh", NULL);
+ if (ret < 0) {
+ irq_dispose_mapping(eeh_event_irq);
+ pr_err("%s: Can't request OPAL event interrupt (%d)\n",
+ __func__, eeh_event_irq);
+ return ret;
+ }
+
+ if (!eeh_enabled())
+ disable_irq(eeh_event_irq);
+
+ list_for_each_entry(hose, &hose_list, list_node) {
+ phb = hose->private_data;
+
+ /*
+ * If EEH is enabled, we're going to rely on that.
+ * Otherwise, we resort to the conventional mechanism
+ * of clearing frozen PEs during PCI config access.
+ */
+ if (eeh_enabled())
+ phb->flags |= PNV_PHB_FLAG_EEH;
+ else
+ phb->flags &= ~PNV_PHB_FLAG_EEH;
+
+ /* Create debugfs entries */
+#ifdef CONFIG_DEBUG_FS
+ if (phb->has_dbgfs || !phb->dbgfs)
+ continue;
+
+ phb->has_dbgfs = 1;
+ debugfs_create_file("err_injct", 0200,
+ phb->dbgfs, hose,
+ &pnv_eeh_ei_fops);
+
+ debugfs_create_file("err_injct_outbound", 0600,
+ phb->dbgfs, hose,
+ &pnv_eeh_dbgfs_ops_outb);
+ debugfs_create_file("err_injct_inboundA", 0600,
+ phb->dbgfs, hose,
+ &pnv_eeh_dbgfs_ops_inbA);
+ debugfs_create_file("err_injct_inboundB", 0600,
+ phb->dbgfs, hose,
+ &pnv_eeh_dbgfs_ops_inbB);
+#endif /* CONFIG_DEBUG_FS */
+ }
+
+ return ret;
+}
+
+static int pnv_eeh_find_cap(struct pci_dn *pdn, int cap)
+{
+ int pos = PCI_CAPABILITY_LIST;
+ int cnt = 48; /* Maximal number of capabilities */
+ u32 status, id;
+
+ if (!pdn)
+ return 0;
+
+ /* Check if the device supports capabilities */
+ pnv_pci_cfg_read(pdn, PCI_STATUS, 2, &status);
+ if (!(status & PCI_STATUS_CAP_LIST))
+ return 0;
+
+ while (cnt--) {
+ pnv_pci_cfg_read(pdn, pos, 1, &pos);
+ if (pos < 0x40)
+ break;
+
+ pos &= ~3;
+ pnv_pci_cfg_read(pdn, pos + PCI_CAP_LIST_ID, 1, &id);
+ if (id == 0xff)
+ break;
+
+ /* Found */
+ if (id == cap)
+ return pos;
+
+ /* Next one */
+ pos += PCI_CAP_LIST_NEXT;
+ }
+
+ return 0;
+}
+
+static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap)
+{
+ struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+ u32 header;
+ int pos = 256, ttl = (4096 - 256) / 8;
+
+ if (!edev || !edev->pcie_cap)
+ return 0;
+ if (pnv_pci_cfg_read(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL)
+ return 0;
+ else if (!header)
+ return 0;
+
+ while (ttl-- > 0) {
+ if (PCI_EXT_CAP_ID(header) == cap && pos)
+ return pos;
+
+ pos = PCI_EXT_CAP_NEXT(header);
+ if (pos < 256)
+ break;
+
+ if (pnv_pci_cfg_read(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL)
+ break;
+ }
+
+ return 0;
+}
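These two walkers read the standard and extended capability lists through pnv_pci_cfg_read() on the pci_dn rather than through a struct pci_dev, because at probe time the pci_dev binding may not exist yet (see the note in pnv_eeh_probe() below). For comparison only, a sketch of the equivalent lookups with the generic PCI core helpers once a struct pci_dev is available; the function name here is hypothetical.

    #include <linux/pci.h>

    /* Comparison sketch: the generic helpers perform the same capability walks. */
    static void example_show_caps(struct pci_dev *pdev)
    {
            int pcie = pci_find_capability(pdev, PCI_CAP_ID_EXP);
            int aer  = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ERR);

            dev_info(&pdev->dev, "PCIe cap at %#x, AER cap at %#x\n", pcie, aer);
    }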
+
+/**
+ * pnv_eeh_probe - Do probe on PCI device
+ * @pdn: PCI device node
+ * @data: unused
+ *
+ * When the EEH module is installed during system boot, all PCI devices
+ * are checked one by one to see whether they support EEH. This function
+ * is introduced for that purpose. By default, EEH has been enabled
+ * on all PCI devices. That is to say, we only need to do the necessary
+ * initialization on the corresponding eeh device and create a PE
+ * accordingly.
+ *
+ * Note that it is unsafe to retrieve the EEH device through
+ * the corresponding PCI device: during PCI device hotplug, which
+ * is possibly triggered by the EEH core, the binding between the EEH device
+ * and the PCI device has not been built yet.
+ */
+static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
+{
+ struct pci_controller *hose = pdn->phb;
+ struct pnv_phb *phb = hose->private_data;
+ struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+ uint32_t pcie_flags;
+ int ret;
+ int config_addr = (pdn->busno << 8) | (pdn->devfn);
+
+ /*
+ * When probing the root bridge, which doesn't have any
+ * subordinate PCI devices, we don't have an OF node for
+ * the root bridge, so it's not reasonable to continue
+ * the probing.
+ */
+ if (!edev || edev->pe)
+ return NULL;
+
+ /* Skip for PCI-ISA bridge */
+ if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA)
+ return NULL;
+
+ /* Skip if we haven't probed yet */
+ if (phb->ioda.pe_rmap[config_addr] == IODA_INVALID_PE) {
+ eeh_add_flag(EEH_POSTPONED_PROBE);
+ return NULL;
+ }
+
+ /* Initialize eeh device */
+ edev->class_code = pdn->class_code;
+ edev->mode &= 0xFFFFFF00;
+ edev->pcix_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_PCIX);
+ edev->pcie_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_EXP);
+ edev->af_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_AF);
+ edev->aer_cap = pnv_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR);
+ if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) {
+ edev->mode |= EEH_DEV_BRIDGE;
+ if (edev->pcie_cap) {
+ pnv_pci_cfg_read(pdn, edev->pcie_cap + PCI_EXP_FLAGS,
+ 2, &pcie_flags);
+ pcie_flags = (pcie_flags & PCI_EXP_FLAGS_TYPE) >> 4;
+ if (pcie_flags == PCI_EXP_TYPE_ROOT_PORT)
+ edev->mode |= EEH_DEV_ROOT_PORT;
+ else if (pcie_flags == PCI_EXP_TYPE_DOWNSTREAM)
+ edev->mode |= EEH_DEV_DS_PORT;
+ }
+ }
+
+ edev->pe_config_addr = phb->ioda.pe_rmap[config_addr];
+
+ /* Create PE */
+ ret = eeh_add_to_parent_pe(edev);
+ if (ret) {
+ pr_warn("%s: Can't add PCI dev %04x:%02x:%02x.%01x to parent PE (%x)\n",
+ __func__, hose->global_number, pdn->busno,
+ PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn), ret);
+ return NULL;
+ }
+
+ /*
+ * If the PE contains any one of the following adapters, the
+ * PCI config space can't be accessed when dumping the EEH log.
+ * Otherwise, we will run into a fenced PHB caused by a shortage
+ * of outbound credits in the adapter. PCI config access
+ * should be blocked until PE reset; MMIO access is dropped
+ * by hardware anyway. In order to drop PCI config requests,
+ * one more flag (EEH_PE_CFG_RESTRICTED) is introduced, which
+ * will be checked in the backend for PE state retrieval. If
+ * the PE becomes frozen for the first time and the flag has
+ * been set for the PE, we will set EEH_PE_CFG_BLOCKED for
+ * that PE to block its config space.
+ *
+ * Broadcom BCM5718 2-port NICs (14e4:1656)
+ * Broadcom Austin 4-port NICs (14e4:1657)
+ * Broadcom Shiner 4-port 1G NICs (14e4:168a)
+ * Broadcom Shiner 2-port 10G NICs (14e4:168e)
+ */
+ if ((pdn->vendor_id == PCI_VENDOR_ID_BROADCOM &&
+ pdn->device_id == 0x1656) ||
+ (pdn->vendor_id == PCI_VENDOR_ID_BROADCOM &&
+ pdn->device_id == 0x1657) ||
+ (pdn->vendor_id == PCI_VENDOR_ID_BROADCOM &&
+ pdn->device_id == 0x168a) ||
+ (pdn->vendor_id == PCI_VENDOR_ID_BROADCOM &&
+ pdn->device_id == 0x168e))
+ edev->pe->state |= EEH_PE_CFG_RESTRICTED;
+
+ /*
+ * Cache the PE primary bus, which can't be fetched when
+ * full hotplug is in progress. In that case, all child
+ * PCI devices of the PE are expected to be removed prior
+ * to PE reset.
+ */
+ if (!(edev->pe->state & EEH_PE_PRI_BUS)) {
+ edev->pe->bus = pci_find_bus(hose->global_number,
+ pdn->busno);
+ if (edev->pe->bus)
+ edev->pe->state |= EEH_PE_PRI_BUS;
+ }
+
+ /*
+ * Enable EEH explicitly so that we will do EEH check
+ * while accessing I/O stuff
+ */
+ eeh_add_flag(EEH_ENABLED);
+
+ /* Save memory bars */
+ eeh_save_bars(edev);
+
+ return NULL;
+}
+
+/**
+ * pnv_eeh_set_option - Initialize EEH or MMIO/DMA reenable
+ * @pe: EEH PE
+ * @option: operation to be issued
+ *
+ * The function is used to control the EEH functionality globally.
+ * Currently, the following options are supported according to PAPR:
+ * Enable EEH, Disable EEH, Enable MMIO and Enable DMA
+ */
+static int pnv_eeh_set_option(struct eeh_pe *pe, int option)
+{
+ struct pci_controller *hose = pe->phb;
+ struct pnv_phb *phb = hose->private_data;
+ bool freeze_pe = false;
+ int opt;
+ s64 rc;
+
+ switch (option) {
+ case EEH_OPT_DISABLE:
+ return -EPERM;
+ case EEH_OPT_ENABLE:
+ return 0;
+ case EEH_OPT_THAW_MMIO:
+ opt = OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO;
+ break;
+ case EEH_OPT_THAW_DMA:
+ opt = OPAL_EEH_ACTION_CLEAR_FREEZE_DMA;
+ break;
+ case EEH_OPT_FREEZE_PE:
+ freeze_pe = true;
+ opt = OPAL_EEH_ACTION_SET_FREEZE_ALL;
+ break;
+ default:
+ pr_warn("%s: Invalid option %d\n", __func__, option);
+ return -EINVAL;
+ }
+
+ /* Freeze master and slave PEs if PHB supports compound PEs */
+ if (freeze_pe) {
+ if (phb->freeze_pe) {
+ phb->freeze_pe(phb, pe->addr);
+ return 0;
+ }
+
+ rc = opal_pci_eeh_freeze_set(phb->opal_id, pe->addr, opt);
+ if (rc != OPAL_SUCCESS) {
+ pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
+ __func__, rc, phb->hose->global_number,
+ pe->addr);
+ return -EIO;
+ }
+
+ return 0;
+ }
+
+ /* Unfreeze master and slave PEs if PHB supports */
+ if (phb->unfreeze_pe)
+ return phb->unfreeze_pe(phb, pe->addr, opt);
+
+ rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe->addr, opt);
+ if (rc != OPAL_SUCCESS) {
+ pr_warn("%s: Failure %lld enable %d for PHB#%x-PE#%x\n",
+ __func__, rc, option, phb->hose->global_number,
+ pe->addr);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/**
+ * pnv_eeh_get_pe_addr - Retrieve PE address
+ * @pe: EEH PE
+ *
+ * Retrieve the PE address according to the given traditional
+ * PCI BDF (Bus/Device/Function) address.
+ */
+static int pnv_eeh_get_pe_addr(struct eeh_pe *pe)
+{
+ return pe->addr;
+}
+
+static void pnv_eeh_get_phb_diag(struct eeh_pe *pe)
+{
+ struct pnv_phb *phb = pe->phb->private_data;
+ s64 rc;
+
+ rc = opal_pci_get_phb_diag_data2(phb->opal_id, pe->data,
+ phb->diag_data_size);
+ if (rc != OPAL_SUCCESS)
+ pr_warn("%s: Failure %lld getting PHB#%x diag-data\n",
+ __func__, rc, pe->phb->global_number);
+}
+
+static int pnv_eeh_get_phb_state(struct eeh_pe *pe)
+{
+ struct pnv_phb *phb = pe->phb->private_data;
+ u8 fstate = 0;
+ __be16 pcierr = 0;
+ s64 rc;
+ int result = 0;
+
+ rc = opal_pci_eeh_freeze_status(phb->opal_id,
+ pe->addr,
+ &fstate,
+ &pcierr,
+ NULL);
+ if (rc != OPAL_SUCCESS) {
+ pr_warn("%s: Failure %lld getting PHB#%x state\n",
+ __func__, rc, phb->hose->global_number);
+ return EEH_STATE_NOT_SUPPORT;
+ }
+
+ /*
+ * Check PHB state. If the PHB is frozen for the
+ * first time, dump the PHB diag-data.
+ */
+ if (be16_to_cpu(pcierr) != OPAL_EEH_PHB_ERROR) {
+ result = (EEH_STATE_MMIO_ACTIVE |
+ EEH_STATE_DMA_ACTIVE |
+ EEH_STATE_MMIO_ENABLED |
+ EEH_STATE_DMA_ENABLED);
+ } else if (!(pe->state & EEH_PE_ISOLATED)) {
+ eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
+ pnv_eeh_get_phb_diag(pe);
+
+ if (eeh_has_flag(EEH_EARLY_DUMP_LOG))
+ pnv_pci_dump_phb_diag_data(pe->phb, pe->data);
+ }
+
+ return result;
+}
+
+static int pnv_eeh_get_pe_state(struct eeh_pe *pe)
+{
+ struct pnv_phb *phb = pe->phb->private_data;
+ u8 fstate = 0;
+ __be16 pcierr = 0;
+ s64 rc;
+ int result;
+
+ /*
+ * We don't clobber hardware frozen state until PE
+ * reset is completed. In order to keep EEH core
+ * moving forward, we have to return operational
+ * state during PE reset.
+ */
+ if (pe->state & EEH_PE_RESET) {
+ result = (EEH_STATE_MMIO_ACTIVE |
+ EEH_STATE_DMA_ACTIVE |
+ EEH_STATE_MMIO_ENABLED |
+ EEH_STATE_DMA_ENABLED);
+ return result;
+ }
+
+ /*
+ * Fetch PE state from hardware. If the PHB
+ * supports compound PE, let it handle that.
+ */
+ if (phb->get_pe_state) {
+ fstate = phb->get_pe_state(phb, pe->addr);
+ } else {
+ rc = opal_pci_eeh_freeze_status(phb->opal_id,
+ pe->addr,
+ &fstate,
+ &pcierr,
+ NULL);
+ if (rc != OPAL_SUCCESS) {
+ pr_warn("%s: Failure %lld getting PHB#%x-PE%x state\n",
+ __func__, rc, phb->hose->global_number,
+ pe->addr);
+ return EEH_STATE_NOT_SUPPORT;
+ }
+ }
+
+ /* Figure out state */
+ switch (fstate) {
+ case OPAL_EEH_STOPPED_NOT_FROZEN:
+ result = (EEH_STATE_MMIO_ACTIVE |
+ EEH_STATE_DMA_ACTIVE |
+ EEH_STATE_MMIO_ENABLED |
+ EEH_STATE_DMA_ENABLED);
+ break;
+ case OPAL_EEH_STOPPED_MMIO_FREEZE:
+ result = (EEH_STATE_DMA_ACTIVE |
+ EEH_STATE_DMA_ENABLED);
+ break;
+ case OPAL_EEH_STOPPED_DMA_FREEZE:
+ result = (EEH_STATE_MMIO_ACTIVE |
+ EEH_STATE_MMIO_ENABLED);
+ break;
+ case OPAL_EEH_STOPPED_MMIO_DMA_FREEZE:
+ result = 0;
+ break;
+ case OPAL_EEH_STOPPED_RESET:
+ result = EEH_STATE_RESET_ACTIVE;
+ break;
+ case OPAL_EEH_STOPPED_TEMP_UNAVAIL:
+ result = EEH_STATE_UNAVAILABLE;
+ break;
+ case OPAL_EEH_STOPPED_PERM_UNAVAIL:
+ result = EEH_STATE_NOT_SUPPORT;
+ break;
+ default:
+ result = EEH_STATE_NOT_SUPPORT;
+ pr_warn("%s: Invalid PHB#%x-PE#%x state %x\n",
+ __func__, phb->hose->global_number,
+ pe->addr, fstate);
+ }
+
+ /*
+ * If the PHB supports compound PEs, freeze all
+ * slave PEs for consistency.
+ *
+ * If the PE is switching to the frozen state for the
+ * first time, dump the PHB diag-data.
+ */
+ if (!(result & EEH_STATE_NOT_SUPPORT) &&
+ !(result & EEH_STATE_UNAVAILABLE) &&
+ !(result & EEH_STATE_MMIO_ACTIVE) &&
+ !(result & EEH_STATE_DMA_ACTIVE) &&
+ !(pe->state & EEH_PE_ISOLATED)) {
+ if (phb->freeze_pe)
+ phb->freeze_pe(phb, pe->addr);
+
+ eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
+ pnv_eeh_get_phb_diag(pe);
+
+ if (eeh_has_flag(EEH_EARLY_DUMP_LOG))
+ pnv_pci_dump_phb_diag_data(pe->phb, pe->data);
+ }
+
+ return result;
+}
+
+/**
+ * pnv_eeh_get_state - Retrieve PE state
+ * @pe: EEH PE
+ * @delay: delay while PE state is temporarily unavailable
+ *
+ * Retrieve the state of the specified PE. For IODA-compatible
+ * platforms, it should be retrieved from the IODA table. Therefore,
+ * we prefer passing down to hardware implementation to handle
+ * it.
+ */
+static int pnv_eeh_get_state(struct eeh_pe *pe, int *delay)
+{
+ int ret;
+
+ if (pe->type & EEH_PE_PHB)
+ ret = pnv_eeh_get_phb_state(pe);
+ else
+ ret = pnv_eeh_get_pe_state(pe);
+
+ if (!delay)
+ return ret;
+
+ /*
+ * If the PE state is temporarily unavailable,
+ * inform the EEH core to delay for the default
+ * period (1 second).
+ */
+ *delay = 0;
+ if (ret & EEH_STATE_UNAVAILABLE)
+ *delay = 1000;
+
+ return ret;
+}
+
+static s64 pnv_eeh_poll(unsigned long id)
+{
+ s64 rc = OPAL_HARDWARE;
+
+ while (1) {
+ rc = opal_pci_poll(id);
+ if (rc <= 0)
+ break;
+
+ if (system_state < SYSTEM_RUNNING)
+ udelay(1000 * rc);
+ else
+ msleep(rc);
+ }
+
+ return rc;
+}
+
+int pnv_eeh_phb_reset(struct pci_controller *hose, int option)
+{
+ struct pnv_phb *phb = hose->private_data;
+ s64 rc = OPAL_HARDWARE;
+
+ pr_debug("%s: Reset PHB#%x, option=%d\n",
+ __func__, hose->global_number, option);
+
+ /* Issue PHB complete reset request */
+ if (option == EEH_RESET_FUNDAMENTAL ||
+ option == EEH_RESET_HOT)
+ rc = opal_pci_reset(phb->opal_id,
+ OPAL_RESET_PHB_COMPLETE,
+ OPAL_ASSERT_RESET);
+ else if (option == EEH_RESET_DEACTIVATE)
+ rc = opal_pci_reset(phb->opal_id,
+ OPAL_RESET_PHB_COMPLETE,
+ OPAL_DEASSERT_RESET);
+ if (rc < 0)
+ goto out;
+
+ /*
+ * Poll state of the PHB until the request is done
+ * successfully. The PHB reset is usually PHB complete
+ * reset followed by hot reset on root bus. So we also
+ * need the PCI bus settlement delay.
+ */
+ if (rc > 0)
+ rc = pnv_eeh_poll(phb->opal_id);
+ if (option == EEH_RESET_DEACTIVATE) {
+ if (system_state < SYSTEM_RUNNING)
+ udelay(1000 * EEH_PE_RST_SETTLE_TIME);
+ else
+ msleep(EEH_PE_RST_SETTLE_TIME);
+ }
+out:
+ if (rc != OPAL_SUCCESS)
+ return -EIO;
+
+ return 0;
+}
+
+static int pnv_eeh_root_reset(struct pci_controller *hose, int option)
+{
+ struct pnv_phb *phb = hose->private_data;
+ s64 rc = OPAL_HARDWARE;
+
+ pr_debug("%s: Reset PHB#%x, option=%d\n",
+ __func__, hose->global_number, option);
+
+ /*
+ * During the reset deassert time, we needn't care about
+ * the reset scope because the firmware does nothing
+ * for fundamental or hot reset during the deassert phase.
+ */
+ if (option == EEH_RESET_FUNDAMENTAL)
+ rc = opal_pci_reset(phb->opal_id,
+ OPAL_RESET_PCI_FUNDAMENTAL,
+ OPAL_ASSERT_RESET);
+ else if (option == EEH_RESET_HOT)
+ rc = opal_pci_reset(phb->opal_id,
+ OPAL_RESET_PCI_HOT,
+ OPAL_ASSERT_RESET);
+ else if (option == EEH_RESET_DEACTIVATE)
+ rc = opal_pci_reset(phb->opal_id,
+ OPAL_RESET_PCI_HOT,
+ OPAL_DEASSERT_RESET);
+ if (rc < 0)
+ goto out;
+
+ /* Poll state of the PHB until the request is done */
+ if (rc > 0)
+ rc = pnv_eeh_poll(phb->opal_id);
+ if (option == EEH_RESET_DEACTIVATE)
+ msleep(EEH_PE_RST_SETTLE_TIME);
+out:
+ if (rc != OPAL_SUCCESS)
+ return -EIO;
+
+ return 0;
+}
+
+static int __pnv_eeh_bridge_reset(struct pci_dev *dev, int option)
+{
+ struct pci_dn *pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
+ struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+ int aer = edev ? edev->aer_cap : 0;
+ u32 ctrl;
+
+ pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n",
+ __func__, pci_domain_nr(dev->bus),
+ dev->bus->number, option);
+
+ switch (option) {
+ case EEH_RESET_FUNDAMENTAL:
+ case EEH_RESET_HOT:
+ /* Don't report linkDown event */
+ if (aer) {
+ eeh_ops->read_config(pdn, aer + PCI_ERR_UNCOR_MASK,
+ 4, &ctrl);
+ ctrl |= PCI_ERR_UNC_SURPDN;
+ eeh_ops->write_config(pdn, aer + PCI_ERR_UNCOR_MASK,
+ 4, ctrl);
+ }
+
+ eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &ctrl);
+ ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
+ eeh_ops->write_config(pdn, PCI_BRIDGE_CONTROL, 2, ctrl);
+
+ msleep(EEH_PE_RST_HOLD_TIME);
+ break;
+ case EEH_RESET_DEACTIVATE:
+ eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &ctrl);
+ ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
+ eeh_ops->write_config(pdn, PCI_BRIDGE_CONTROL, 2, ctrl);
+
+ msleep(EEH_PE_RST_SETTLE_TIME);
+
+ /* Continue reporting linkDown event */
+ if (aer) {
+ eeh_ops->read_config(pdn, aer + PCI_ERR_UNCOR_MASK,
+ 4, &ctrl);
+ ctrl &= ~PCI_ERR_UNC_SURPDN;
+ eeh_ops->write_config(pdn, aer + PCI_ERR_UNCOR_MASK,
+ 4, ctrl);
+ }
+
+ break;
+ }
+
+ return 0;
+}
+
+static int pnv_eeh_bridge_reset(struct pci_dev *pdev, int option)
+{
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct device_node *dn = pci_device_to_OF_node(pdev);
+ uint64_t id = PCI_SLOT_ID(phb->opal_id,
+ (pdev->bus->number << 8) | pdev->devfn);
+ uint8_t scope;
+ int64_t rc;
+
+ /* Hot reset to the bus if firmware cannot handle */
+ if (!dn || !of_get_property(dn, "ibm,reset-by-firmware", NULL))
+ return __pnv_eeh_bridge_reset(pdev, option);
+
+ switch (option) {
+ case EEH_RESET_FUNDAMENTAL:
+ scope = OPAL_RESET_PCI_FUNDAMENTAL;
+ break;
+ case EEH_RESET_HOT:
+ scope = OPAL_RESET_PCI_HOT;
+ break;
+ case EEH_RESET_DEACTIVATE:
+ return 0;
+ default:
+ dev_dbg(&pdev->dev, "%s: Unsupported reset %d\n",
+ __func__, option);
+ return -EINVAL;
+ }
+
+ rc = opal_pci_reset(id, scope, OPAL_ASSERT_RESET);
+ if (rc <= OPAL_SUCCESS)
+ goto out;
+
+ rc = pnv_eeh_poll(id);
+out:
+ return (rc == OPAL_SUCCESS) ? 0 : -EIO;
+}
+
+void pnv_pci_reset_secondary_bus(struct pci_dev *dev)
+{
+ struct pci_controller *hose;
+
+ if (pci_is_root_bus(dev->bus)) {
+ hose = pci_bus_to_host(dev->bus);
+ pnv_eeh_root_reset(hose, EEH_RESET_HOT);
+ pnv_eeh_root_reset(hose, EEH_RESET_DEACTIVATE);
+ } else {
+ pnv_eeh_bridge_reset(dev, EEH_RESET_HOT);
+ pnv_eeh_bridge_reset(dev, EEH_RESET_DEACTIVATE);
+ }
+}
+
+static void pnv_eeh_wait_for_pending(struct pci_dn *pdn, const char *type,
+ int pos, u16 mask)
+{
+ int i, status = 0;
+
+ /* Wait for Transaction Pending bit to be cleared */
+ for (i = 0; i < 4; i++) {
+ eeh_ops->read_config(pdn, pos, 2, &status);
+ if (!(status & mask))
+ return;
+
+ msleep((1 << i) * 100);
+ }
+
+ pr_warn("%s: Pending transaction while issuing %sFLR to %04x:%02x:%02x.%01x\n",
+ __func__, type,
+ pdn->phb->global_number, pdn->busno,
+ PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+}
+
+static int pnv_eeh_do_flr(struct pci_dn *pdn, int option)
+{
+ struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+ u32 reg = 0;
+
+ if (WARN_ON(!edev->pcie_cap))
+ return -ENOTTY;
+
+ eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCAP, 4, &reg);
+ if (!(reg & PCI_EXP_DEVCAP_FLR))
+ return -ENOTTY;
+
+ switch (option) {
+ case EEH_RESET_HOT:
+ case EEH_RESET_FUNDAMENTAL:
+ pnv_eeh_wait_for_pending(pdn, "",
+ edev->pcie_cap + PCI_EXP_DEVSTA,
+ PCI_EXP_DEVSTA_TRPND);
+ eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ 4, &reg);
+ reg |= PCI_EXP_DEVCTL_BCR_FLR;
+ eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ 4, reg);
+ msleep(EEH_PE_RST_HOLD_TIME);
+ break;
+ case EEH_RESET_DEACTIVATE:
+ eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ 4, &reg);
+ reg &= ~PCI_EXP_DEVCTL_BCR_FLR;
+ eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ 4, reg);
+ msleep(EEH_PE_RST_SETTLE_TIME);
+ break;
+ }
+
+ return 0;
+}
+
+static int pnv_eeh_do_af_flr(struct pci_dn *pdn, int option)
+{
+ struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+ u32 cap = 0;
+
+ if (WARN_ON(!edev->af_cap))
+ return -ENOTTY;
+
+ eeh_ops->read_config(pdn, edev->af_cap + PCI_AF_CAP, 1, &cap);
+ if (!(cap & PCI_AF_CAP_TP) || !(cap & PCI_AF_CAP_FLR))
+ return -ENOTTY;
+
+ switch (option) {
+ case EEH_RESET_HOT:
+ case EEH_RESET_FUNDAMENTAL:
+ /*
+ * Wait for Transaction Pending bit to clear. A word-aligned
+ * test is used, so we use the control offset rather than status
+ * and shift the test bit to match.
+ */
+ pnv_eeh_wait_for_pending(pdn, "AF",
+ edev->af_cap + PCI_AF_CTRL,
+ PCI_AF_STATUS_TP << 8);
+ eeh_ops->write_config(pdn, edev->af_cap + PCI_AF_CTRL,
+ 1, PCI_AF_CTRL_FLR);
+ msleep(EEH_PE_RST_HOLD_TIME);
+ break;
+ case EEH_RESET_DEACTIVATE:
+ eeh_ops->write_config(pdn, edev->af_cap + PCI_AF_CTRL, 1, 0);
+ msleep(EEH_PE_RST_SETTLE_TIME);
+ break;
+ }
+
+ return 0;
+}
+
+static int pnv_eeh_reset_vf_pe(struct eeh_pe *pe, int option)
+{
+ struct eeh_dev *edev;
+ struct pci_dn *pdn;
+ int ret;
+
+ /* The VF PE should have only one child device */
+ edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, list);
+ pdn = eeh_dev_to_pdn(edev);
+ if (!pdn)
+ return -ENXIO;
+
+ ret = pnv_eeh_do_flr(pdn, option);
+ if (!ret)
+ return ret;
+
+ return pnv_eeh_do_af_flr(pdn, option);
+}
+
+/**
+ * pnv_eeh_reset - Reset the specified PE
+ * @pe: EEH PE
+ * @option: reset option
+ *
+ * Do reset on the indicated PE. For PCI bus sensitive PE,
+ * we need to reset the parent p2p bridge. The PHB has to
+ * be reinitialized if the p2p bridge is root bridge. For
+ * PCI device sensitive PE, we will try to reset the device
+ * through FLR. For now, we don't have OPAL APIs to do HARD
+ * reset yet, so all resets would be SOFT (HOT) resets.
+ */
+static int pnv_eeh_reset(struct eeh_pe *pe, int option)
+{
+ struct pci_controller *hose = pe->phb;
+ struct pnv_phb *phb;
+ struct pci_bus *bus;
+ int64_t rc;
+
+ /*
+ * For PHB reset, we always have complete reset. For those PEs whose
+ * primary bus derived from root complex (root bus) or root port
+ * (usually bus#1), we apply hot or fundamental reset on the root port.
+ * For other PEs, we always have hot reset on the PE primary bus.
+ *
+ * Here, we have a different design from pHyp, which always clears the
+ * frozen state during PE reset. However, the good idea here from
+ * benh is to keep the frozen state until the PE reset is done completely
+ * (until BAR restore). With the frozen state, HW drops illegal IO
+ * or MMIO accesses, which can incur recursive frozen PEs during PE
+ * reset. The side effect is that the EEH core has to clear the frozen
+ * state explicitly after BAR restore.
+ */
+ if (pe->type & EEH_PE_PHB)
+ return pnv_eeh_phb_reset(hose, option);
+
+ /*
+ * The frozen PE might be caused by PAPR error injection
+ * registers, which are expected to be cleared after hitting
+ * frozen PE as stated in the hardware spec. Unfortunately,
+ * that's not true on P7IOC. So we have to clear it manually
+ * to avoid recursive EEH errors during recovery.
+ */
+ phb = hose->private_data;
+ if (phb->model == PNV_PHB_MODEL_P7IOC &&
+ (option == EEH_RESET_HOT ||
+ option == EEH_RESET_FUNDAMENTAL)) {
+ rc = opal_pci_reset(phb->opal_id,
+ OPAL_RESET_PHB_ERROR,
+ OPAL_ASSERT_RESET);
+ if (rc != OPAL_SUCCESS) {
+ pr_warn("%s: Failure %lld clearing error injection registers\n",
+ __func__, rc);
+ return -EIO;
+ }
+ }
+
+ if (pe->type & EEH_PE_VF)
+ return pnv_eeh_reset_vf_pe(pe, option);
+
+ bus = eeh_pe_bus_get(pe);
+ if (!bus) {
+ pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
+ __func__, pe->phb->global_number, pe->addr);
+ return -EIO;
+ }
+
+ /*
+ * If dealing with the root bus (or the bus underneath the
+ * root port), we reset the bus underneath the root port.
+ *
+ * The cxl driver depends on this behaviour for bi-modal card
+ * switching.
+ */
+ if (pci_is_root_bus(bus) ||
+ pci_is_root_bus(bus->parent))
+ return pnv_eeh_root_reset(hose, option);
+
+ return pnv_eeh_bridge_reset(bus->self, option);
+}
+
+/**
+ * pnv_eeh_wait_state - Wait for PE state
+ * @pe: EEH PE
+ * @max_wait: maximal period in milliseconds
+ *
+ * Wait for the state of the associated PE. It might take some time
+ * to retrieve the PE's state.
+ */
+static int pnv_eeh_wait_state(struct eeh_pe *pe, int max_wait)
+{
+ int ret;
+ int mwait;
+
+ while (1) {
+ ret = pnv_eeh_get_state(pe, &mwait);
+
+ /*
+ * If the PE's state is temporarily unavailable,
+ * we have to wait for the specified time. Otherwise,
+ * the PE's state will be returned immediately.
+ */
+ if (ret != EEH_STATE_UNAVAILABLE)
+ return ret;
+
+ if (max_wait <= 0) {
+ pr_warn("%s: Timeout getting PE#%x's state (%d)\n",
+ __func__, pe->addr, max_wait);
+ return EEH_STATE_NOT_SUPPORT;
+ }
+
+ max_wait -= mwait;
+ msleep(mwait);
+ }
+
+ return EEH_STATE_NOT_SUPPORT;
+}
+
+/**
+ * pnv_eeh_get_log - Retrieve error log
+ * @pe: EEH PE
+ * @severity: temporary or permanent error log
+ * @drv_log: driver log to be combined with retrieved error log
+ * @len: length of driver log
+ *
+ * Retrieve the temporary or permanent error from the PE.
+ */
+static int pnv_eeh_get_log(struct eeh_pe *pe, int severity,
+ char *drv_log, unsigned long len)
+{
+ if (!eeh_has_flag(EEH_EARLY_DUMP_LOG))
+ pnv_pci_dump_phb_diag_data(pe->phb, pe->data);
+
+ return 0;
+}
+
+/**
+ * pnv_eeh_configure_bridge - Configure PCI bridges in the indicated PE
+ * @pe: EEH PE
+ *
+ * The function will be called to reconfigure the bridges included
+ * in the specified PE so that the malfunctioning PE can be recovered
+ * again.
+ */
+static int pnv_eeh_configure_bridge(struct eeh_pe *pe)
+{
+ return 0;
+}
+
+/**
+ * pnv_eeh_err_inject - Inject specified error to the indicated PE
+ * @pe: the indicated PE
+ * @type: error type
+ * @func: specific error type
+ * @addr: address
+ * @mask: address mask
+ *
+ * The routine is called to inject specified error, which is
+ * determined by @type and @func, to the indicated PE for
+ * testing purpose.
+ */
+static int pnv_eeh_err_inject(struct eeh_pe *pe, int type, int func,
+ unsigned long addr, unsigned long mask)
+{
+ struct pci_controller *hose = pe->phb;
+ struct pnv_phb *phb = hose->private_data;
+ s64 rc;
+
+ if (type != OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR &&
+ type != OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64) {
+ pr_warn("%s: Invalid error type %d\n",
+ __func__, type);
+ return -ERANGE;
+ }
+
+ if (func < OPAL_ERR_INJECT_FUNC_IOA_LD_MEM_ADDR ||
+ func > OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_TARGET) {
+ pr_warn("%s: Invalid error function %d\n",
+ __func__, func);
+ return -ERANGE;
+ }
+
+ /* Firmware supports error injection ? */
+ if (!opal_check_token(OPAL_PCI_ERR_INJECT)) {
+ pr_warn("%s: Firmware doesn't support error injection\n",
+ __func__);
+ return -ENXIO;
+ }
+
+ /* Do error injection */
+ rc = opal_pci_err_inject(phb->opal_id, pe->addr,
+ type, func, addr, mask);
+ if (rc != OPAL_SUCCESS) {
+ pr_warn("%s: Failure %lld injecting error "
+ "%d-%d to PHB#%x-PE#%x\n",
+ __func__, rc, type, func,
+ hose->global_number, pe->addr);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static inline bool pnv_eeh_cfg_blocked(struct pci_dn *pdn)
+{
+ struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+
+ if (!edev || !edev->pe)
+ return false;
+
+ /*
+ * We will issue FLR or AF FLR to all VFs, which are contained
+ * in VF PE. It relies on the EEH PCI config accessors. So we
+ * can't block them during the window.
+ */
+ if (edev->physfn && (edev->pe->state & EEH_PE_RESET))
+ return false;
+
+ if (edev->pe->state & EEH_PE_CFG_BLOCKED)
+ return true;
+
+ return false;
+}
+
+static int pnv_eeh_read_config(struct pci_dn *pdn,
+ int where, int size, u32 *val)
+{
+ if (!pdn)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ if (pnv_eeh_cfg_blocked(pdn)) {
+ *val = 0xFFFFFFFF;
+ return PCIBIOS_SET_FAILED;
+ }
+
+ return pnv_pci_cfg_read(pdn, where, size, val);
+}
+
+static int pnv_eeh_write_config(struct pci_dn *pdn,
+ int where, int size, u32 val)
+{
+ if (!pdn)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ if (pnv_eeh_cfg_blocked(pdn))
+ return PCIBIOS_SET_FAILED;
+
+ return pnv_pci_cfg_write(pdn, where, size, val);
+}
+
+static void pnv_eeh_dump_hub_diag_common(struct OpalIoP7IOCErrorData *data)
+{
+ /* GEM */
+ if (data->gemXfir || data->gemRfir ||
+ data->gemRirqfir || data->gemMask || data->gemRwof)
+ pr_info(" GEM: %016llx %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->gemXfir),
+ be64_to_cpu(data->gemRfir),
+ be64_to_cpu(data->gemRirqfir),
+ be64_to_cpu(data->gemMask),
+ be64_to_cpu(data->gemRwof));
+
+ /* LEM */
+ if (data->lemFir || data->lemErrMask ||
+ data->lemAction0 || data->lemAction1 || data->lemWof)
+ pr_info(" LEM: %016llx %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->lemFir),
+ be64_to_cpu(data->lemErrMask),
+ be64_to_cpu(data->lemAction0),
+ be64_to_cpu(data->lemAction1),
+ be64_to_cpu(data->lemWof));
+}
+
+static void pnv_eeh_get_and_dump_hub_diag(struct pci_controller *hose)
+{
+ struct pnv_phb *phb = hose->private_data;
+ struct OpalIoP7IOCErrorData *data =
+ (struct OpalIoP7IOCErrorData*)phb->diag_data;
+ long rc;
+
+ rc = opal_pci_get_hub_diag_data(phb->hub_id, data, sizeof(*data));
+ if (rc != OPAL_SUCCESS) {
+ pr_warn("%s: Failed to get HUB#%llx diag-data (%ld)\n",
+ __func__, phb->hub_id, rc);
+ return;
+ }
+
+ switch (be16_to_cpu(data->type)) {
+ case OPAL_P7IOC_DIAG_TYPE_RGC:
+ pr_info("P7IOC diag-data for RGC\n\n");
+ pnv_eeh_dump_hub_diag_common(data);
+ if (data->rgc.rgcStatus || data->rgc.rgcLdcp)
+ pr_info(" RGC: %016llx %016llx\n",
+ be64_to_cpu(data->rgc.rgcStatus),
+ be64_to_cpu(data->rgc.rgcLdcp));
+ break;
+ case OPAL_P7IOC_DIAG_TYPE_BI:
+ pr_info("P7IOC diag-data for BI %s\n\n",
+ data->bi.biDownbound ? "Downbound" : "Upbound");
+ pnv_eeh_dump_hub_diag_common(data);
+ if (data->bi.biLdcp0 || data->bi.biLdcp1 ||
+ data->bi.biLdcp2 || data->bi.biFenceStatus)
+ pr_info(" BI: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->bi.biLdcp0),
+ be64_to_cpu(data->bi.biLdcp1),
+ be64_to_cpu(data->bi.biLdcp2),
+ be64_to_cpu(data->bi.biFenceStatus));
+ break;
+ case OPAL_P7IOC_DIAG_TYPE_CI:
+ pr_info("P7IOC diag-data for CI Port %d\n\n",
+ data->ci.ciPort);
+ pnv_eeh_dump_hub_diag_common(data);
+ if (data->ci.ciPortStatus || data->ci.ciPortLdcp)
+ pr_info(" CI: %016llx %016llx\n",
+ be64_to_cpu(data->ci.ciPortStatus),
+ be64_to_cpu(data->ci.ciPortLdcp));
+ break;
+ case OPAL_P7IOC_DIAG_TYPE_MISC:
+ pr_info("P7IOC diag-data for MISC\n\n");
+ pnv_eeh_dump_hub_diag_common(data);
+ break;
+ case OPAL_P7IOC_DIAG_TYPE_I2C:
+ pr_info("P7IOC diag-data for I2C\n\n");
+ pnv_eeh_dump_hub_diag_common(data);
+ break;
+ default:
+ pr_warn("%s: Invalid type of HUB#%llx diag-data (%d)\n",
+ __func__, phb->hub_id, data->type);
+ }
+}
+
+static int pnv_eeh_get_pe(struct pci_controller *hose,
+ u16 pe_no, struct eeh_pe **pe)
+{
+ struct pnv_phb *phb = hose->private_data;
+ struct pnv_ioda_pe *pnv_pe;
+ struct eeh_pe *dev_pe;
+
+ /*
+ * If the PHB supports compound PEs, fetch
+ * the master PE because slave PEs are invisible
+ * to the EEH core.
+ */
+ pnv_pe = &phb->ioda.pe_array[pe_no];
+ if (pnv_pe->flags & PNV_IODA_PE_SLAVE) {
+ pnv_pe = pnv_pe->master;
+ WARN_ON(!pnv_pe ||
+ !(pnv_pe->flags & PNV_IODA_PE_MASTER));
+ pe_no = pnv_pe->pe_number;
+ }
+
+ /* Find the PE according to PE# */
+ dev_pe = eeh_pe_get(hose, pe_no, 0);
+ if (!dev_pe)
+ return -EEXIST;
+
+ /* Freeze the (compound) PE */
+ *pe = dev_pe;
+ if (!(dev_pe->state & EEH_PE_ISOLATED))
+ phb->freeze_pe(phb, pe_no);
+
+ /*
+ * At this point, we're sure the (compound) PE should
+ * have been frozen. However, we still need to poke upward
+ * until hitting the topmost frozen PE.
+ */
+ dev_pe = dev_pe->parent;
+ while (dev_pe && !(dev_pe->type & EEH_PE_PHB)) {
+ int ret;
+ ret = eeh_ops->get_state(dev_pe, NULL);
+ if (ret <= 0 || eeh_state_active(ret)) {
+ dev_pe = dev_pe->parent;
+ continue;
+ }
+
+ /* Frozen parent PE */
+ *pe = dev_pe;
+ if (!(dev_pe->state & EEH_PE_ISOLATED))
+ phb->freeze_pe(phb, dev_pe->addr);
+
+ /* Next one */
+ dev_pe = dev_pe->parent;
+ }
+
+ return 0;
+}
+
+/**
+ * pnv_eeh_next_error - Retrieve next EEH error to handle
+ * @pe: Affected PE
+ *
+ * The function is expected to be called by the EEH core when it gets a
+ * special EEH event (without a bound PE). The function calls
+ * OPAL APIs for the next error to handle. Informational errors are
+ * handled internally by the platform. However, dead IOC, dead PHB,
+ * fenced PHB and frozen PE errors should be handled by the EEH core eventually.
+ */
+static int pnv_eeh_next_error(struct eeh_pe **pe)
+{
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct eeh_pe *phb_pe, *parent_pe;
+ __be64 frozen_pe_no;
+ __be16 err_type, severity;
+ long rc;
+ int state, ret = EEH_NEXT_ERR_NONE;
+
+ /*
+ * While running here, it's safe to purge the event queue. The
+ * event should still be masked.
+ */
+ eeh_remove_event(NULL, false);
+
+ list_for_each_entry(hose, &hose_list, list_node) {
+ /*
+ * If the subordinate PCI buses of the PHB have been
+ * removed or are currently under error recovery, we
+ * needn't take care of them any more.
+ */
+ phb = hose->private_data;
+ phb_pe = eeh_phb_pe_get(hose);
+ if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED))
+ continue;
+
+ rc = opal_pci_next_error(phb->opal_id,
+ &frozen_pe_no, &err_type, &severity);
+ if (rc != OPAL_SUCCESS) {
+ pr_devel("%s: Invalid return value on "
+ "PHB#%x (0x%lx) from opal_pci_next_error",
+ __func__, hose->global_number, rc);
+ continue;
+ }
+
+ /* If the PHB doesn't have error, stop processing */
+ if (be16_to_cpu(err_type) == OPAL_EEH_NO_ERROR ||
+ be16_to_cpu(severity) == OPAL_EEH_SEV_NO_ERROR) {
+ pr_devel("%s: No error found on PHB#%x\n",
+ __func__, hose->global_number);
+ continue;
+ }
+
+ /*
+ * Processing the error. We're expecting the error with
+ * highest priority reported upon multiple errors on the
+ * specific PHB.
+ */
+ pr_devel("%s: Error (%d, %d, %llu) on PHB#%x\n",
+ __func__, be16_to_cpu(err_type),
+ be16_to_cpu(severity), be64_to_cpu(frozen_pe_no),
+ hose->global_number);
+ switch (be16_to_cpu(err_type)) {
+ case OPAL_EEH_IOC_ERROR:
+ if (be16_to_cpu(severity) == OPAL_EEH_SEV_IOC_DEAD) {
+ pr_err("EEH: dead IOC detected\n");
+ ret = EEH_NEXT_ERR_DEAD_IOC;
+ } else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) {
+ pr_info("EEH: IOC informative error "
+ "detected\n");
+ pnv_eeh_get_and_dump_hub_diag(hose);
+ ret = EEH_NEXT_ERR_NONE;
+ }
+
+ break;
+ case OPAL_EEH_PHB_ERROR:
+ if (be16_to_cpu(severity) == OPAL_EEH_SEV_PHB_DEAD) {
+ *pe = phb_pe;
+ pr_err("EEH: dead PHB#%x detected, "
+ "location: %s\n",
+ hose->global_number,
+ eeh_pe_loc_get(phb_pe));
+ ret = EEH_NEXT_ERR_DEAD_PHB;
+ } else if (be16_to_cpu(severity) ==
+ OPAL_EEH_SEV_PHB_FENCED) {
+ *pe = phb_pe;
+ pr_err("EEH: Fenced PHB#%x detected, "
+ "location: %s\n",
+ hose->global_number,
+ eeh_pe_loc_get(phb_pe));
+ ret = EEH_NEXT_ERR_FENCED_PHB;
+ } else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) {
+ pr_info("EEH: PHB#%x informative error "
+ "detected, location: %s\n",
+ hose->global_number,
+ eeh_pe_loc_get(phb_pe));
+ pnv_eeh_get_phb_diag(phb_pe);
+ pnv_pci_dump_phb_diag_data(hose, phb_pe->data);
+ ret = EEH_NEXT_ERR_NONE;
+ }
+
+ break;
+ case OPAL_EEH_PE_ERROR:
+ /*
+ * If we can't find the corresponding PE, we
+ * just try to unfreeze.
+ */
+ if (pnv_eeh_get_pe(hose,
+ be64_to_cpu(frozen_pe_no), pe)) {
+ pr_info("EEH: Clear non-existing PHB#%x-PE#%llx\n",
+ hose->global_number, be64_to_cpu(frozen_pe_no));
+ pr_info("EEH: PHB location: %s\n",
+ eeh_pe_loc_get(phb_pe));
+
+ /* Dump PHB diag-data */
+ rc = opal_pci_get_phb_diag_data2(phb->opal_id,
+ phb->diag_data, phb->diag_data_size);
+ if (rc == OPAL_SUCCESS)
+ pnv_pci_dump_phb_diag_data(hose,
+ phb->diag_data);
+
+ /* Try best to clear it */
+ opal_pci_eeh_freeze_clear(phb->opal_id,
+ be64_to_cpu(frozen_pe_no),
+ OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+ ret = EEH_NEXT_ERR_NONE;
+ } else if ((*pe)->state & EEH_PE_ISOLATED ||
+ eeh_pe_passed(*pe)) {
+ ret = EEH_NEXT_ERR_NONE;
+ } else {
+ pr_err("EEH: Frozen PE#%x "
+ "on PHB#%x detected\n",
+ (*pe)->addr,
+ (*pe)->phb->global_number);
+ pr_err("EEH: PE location: %s, "
+ "PHB location: %s\n",
+ eeh_pe_loc_get(*pe),
+ eeh_pe_loc_get(phb_pe));
+ ret = EEH_NEXT_ERR_FROZEN_PE;
+ }
+
+ break;
+ default:
+ pr_warn("%s: Unexpected error type %d\n",
+ __func__, be16_to_cpu(err_type));
+ }
+
+ /*
+ * The EEH core will try to recover from a fenced PHB or
+ * frozen PE. For a frozen PE, the EEH core
+ * enables the IO path before collecting logs,
+ * but that disturbs the error site. So we have to dump the
+ * log in advance here.
+ */
+ if ((ret == EEH_NEXT_ERR_FROZEN_PE ||
+ ret == EEH_NEXT_ERR_FENCED_PHB) &&
+ !((*pe)->state & EEH_PE_ISOLATED)) {
+ eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
+ pnv_eeh_get_phb_diag(*pe);
+
+ if (eeh_has_flag(EEH_EARLY_DUMP_LOG))
+ pnv_pci_dump_phb_diag_data((*pe)->phb,
+ (*pe)->data);
+ }
+
+ /*
+ * We probably have a frozen parent PE out there, and
+ * we have to handle the frozen parent PE first.
+ */
+ if (ret == EEH_NEXT_ERR_FROZEN_PE) {
+ parent_pe = (*pe)->parent;
+ while (parent_pe) {
+ /* Hit the ceiling ? */
+ if (parent_pe->type & EEH_PE_PHB)
+ break;
+
+ /* Frozen parent PE ? */
+ state = eeh_ops->get_state(parent_pe, NULL);
+ if (state > 0 && !eeh_state_active(state))
+ *pe = parent_pe;
+
+ /* Next parent level */
+ parent_pe = parent_pe->parent;
+ }
+
+ /* We possibly migrate to another PE */
+ eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
+ }
+
+ /*
+ * If we have no errors on the specific PHB or only
+ * informative errors there, we continue poking it.
+ * Otherwise, we need actions to be taken by the upper
+ * layer.
+ */
+ if (ret > EEH_NEXT_ERR_INF)
+ break;
+ }
+
+ /* Unmask the event */
+ if (ret == EEH_NEXT_ERR_NONE && eeh_enabled())
+ enable_irq(eeh_event_irq);
+
+ return ret;
+}
+
+static int pnv_eeh_restore_config(struct pci_dn *pdn)
+{
+ struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+ struct pnv_phb *phb;
+ s64 ret = 0;
+ int config_addr = (pdn->busno << 8) | (pdn->devfn);
+
+ if (!edev)
+ return -EEXIST;
+
+ /*
+ * We have to restore the PCI config space after reset since the
+ * firmware can't see SRIOV VFs.
+ *
+ * FIXME: The MPS, error routing rules and timeout settings are worth
+ * exporting by firmware in an extensible way.
+ */
+ if (edev->physfn) {
+ ret = eeh_restore_vf_config(pdn);
+ } else {
+ phb = pdn->phb->private_data;
+ ret = opal_pci_reinit(phb->opal_id,
+ OPAL_REINIT_PCI_DEV, config_addr);
+ }
+
+ if (ret) {
+ pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n",
+ __func__, config_addr, ret);
+ return -EIO;
+ }
+
+ return ret;
+}
+
+static struct eeh_ops pnv_eeh_ops = {
+ .name = "powernv",
+ .init = pnv_eeh_init,
+ .probe = pnv_eeh_probe,
+ .set_option = pnv_eeh_set_option,
+ .get_pe_addr = pnv_eeh_get_pe_addr,
+ .get_state = pnv_eeh_get_state,
+ .reset = pnv_eeh_reset,
+ .wait_state = pnv_eeh_wait_state,
+ .get_log = pnv_eeh_get_log,
+ .configure_bridge = pnv_eeh_configure_bridge,
+ .err_inject = pnv_eeh_err_inject,
+ .read_config = pnv_eeh_read_config,
+ .write_config = pnv_eeh_write_config,
+ .next_error = pnv_eeh_next_error,
+ .restore_config = pnv_eeh_restore_config,
+ .notify_resume = NULL
+};
+
+#ifdef CONFIG_PCI_IOV
+static void pnv_pci_fixup_vf_mps(struct pci_dev *pdev)
+{
+ struct pci_dn *pdn = pci_get_pdn(pdev);
+ int parent_mps;
+
+ if (!pdev->is_virtfn)
+ return;
+
+ /* Synchronize MPS for VF and PF */
+ parent_mps = pcie_get_mps(pdev->physfn);
+ if ((128 << pdev->pcie_mpss) >= parent_mps)
+ pcie_set_mps(pdev, parent_mps);
+ pdn->mps = pcie_get_mps(pdev);
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pnv_pci_fixup_vf_mps);
+#endif /* CONFIG_PCI_IOV */
+
+/**
+ * eeh_powernv_init - Register platform dependent EEH operations
+ *
+ * EEH initialization on powernv platform. This function should be
+ * called before any EEH related functions.
+ */
+static int __init eeh_powernv_init(void)
+{
+ int ret = -EINVAL;
+
+ ret = eeh_ops_register(&pnv_eeh_ops);
+ if (!ret)
+ pr_info("EEH: PowerNV platform initialized\n");
+ else
+ pr_info("EEH: Failed to initialize PowerNV platform (%d)\n", ret);
+
+ return ret;
+}
+machine_early_initcall(powernv, eeh_powernv_init);
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
new file mode 100644
index 000000000..e52f9b06d
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -0,0 +1,847 @@
+/*
+ * PowerNV cpuidle code
+ *
+ * Copyright 2015 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+
+#include <asm/firmware.h>
+#include <asm/machdep.h>
+#include <asm/opal.h>
+#include <asm/cputhreads.h>
+#include <asm/cpuidle.h>
+#include <asm/code-patching.h>
+#include <asm/smp.h>
+#include <asm/runlatch.h>
+#include <asm/dbell.h>
+
+#include "powernv.h"
+#include "subcore.h"
+
+/* Power ISA 3.0 allows for stop states 0x0 - 0xF */
+#define MAX_STOP_STATE 0xF
+
+#define P9_STOP_SPR_MSR 2000
+#define P9_STOP_SPR_PSSCR 855
+
+static u32 supported_cpuidle_states;
+struct pnv_idle_states_t *pnv_idle_states;
+int nr_pnv_idle_states;
+
+/*
+ * The default stop state that will be used by ppc_md.power_save
+ * function on platforms that support stop instruction.
+ */
+static u64 pnv_default_stop_val;
+static u64 pnv_default_stop_mask;
+static bool default_stop_found;
+
+/*
+ * First deep stop state. Used to figure out when to save/restore
+ * hypervisor context.
+ */
+u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
+
+/*
+ * psscr value and mask of the deepest stop idle state.
+ * Used when a cpu is offlined.
+ */
+static u64 pnv_deepest_stop_psscr_val;
+static u64 pnv_deepest_stop_psscr_mask;
+static u64 pnv_deepest_stop_flag;
+static bool deepest_stop_found;
+
+static int pnv_save_sprs_for_deep_states(void)
+{
+ int cpu;
+ int rc;
+
+ /*
+ * hid0, hid1, hid4, hid5, hmeer and lpcr values are symmetric across
+ * all cpus at boot. Get these reg values from the current cpu and use
+ * the same values across all cpus.
+ */
+ uint64_t lpcr_val = mfspr(SPRN_LPCR);
+ uint64_t hid0_val = mfspr(SPRN_HID0);
+ uint64_t hid1_val = mfspr(SPRN_HID1);
+ uint64_t hid4_val = mfspr(SPRN_HID4);
+ uint64_t hid5_val = mfspr(SPRN_HID5);
+ uint64_t hmeer_val = mfspr(SPRN_HMEER);
+ uint64_t msr_val = MSR_IDLE;
+ uint64_t psscr_val = pnv_deepest_stop_psscr_val;
+
+ for_each_present_cpu(cpu) {
+ uint64_t pir = get_hard_smp_processor_id(cpu);
+ uint64_t hsprg0_val = (uint64_t)paca_ptrs[cpu];
+
+ rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val);
+ if (rc != 0)
+ return rc;
+
+ rc = opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val);
+ if (rc != 0)
+ return rc;
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ rc = opal_slw_set_reg(pir, P9_STOP_SPR_MSR, msr_val);
+ if (rc)
+ return rc;
+
+ rc = opal_slw_set_reg(pir,
+ P9_STOP_SPR_PSSCR, psscr_val);
+
+ if (rc)
+ return rc;
+ }
+
+ /* HIDs are per core registers */
+ if (cpu_thread_in_core(cpu) == 0) {
+
+ rc = opal_slw_set_reg(pir, SPRN_HMEER, hmeer_val);
+ if (rc != 0)
+ return rc;
+
+ rc = opal_slw_set_reg(pir, SPRN_HID0, hid0_val);
+ if (rc != 0)
+ return rc;
+
+		/* Only p8 needs to set extra HID registers */
+ if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+
+ rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val);
+ if (rc != 0)
+ return rc;
+
+ rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val);
+ if (rc != 0)
+ return rc;
+
+ rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val);
+ if (rc != 0)
+ return rc;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static void pnv_alloc_idle_core_states(void)
+{
+ int i, j;
+ int nr_cores = cpu_nr_cores();
+ u32 *core_idle_state;
+
+ /*
+ * core_idle_state - The lower 8 bits track the idle state of
+ * each thread of the core.
+ *
+ * The most significant bit is the lock bit.
+ *
+ * Initially all the bits corresponding to threads_per_core
+ * are set. They are cleared when the thread enters deep idle
+ * state like sleep and winkle/stop.
+ *
+ * Initially the lock bit is cleared. The lock bit has 2
+ * purposes:
+ * a. While the first thread in the core waking up from
+ * idle is restoring core state, it prevents other
+ * threads in the core from switching to process
+ * context.
+ * b. While the last thread in the core is saving the
+ * core state, it prevents a different thread from
+ * waking up.
+ */
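+	/*
+	 * Worked example (illustrative): with threads_per_core = 8 the
+	 * initial value below is (1 << 8) - 1 = 0xff, i.e. all eight
+	 * thread bits set (threads running), and thread j of the core
+	 * owns bit (1 << j). A thread clears its bit when it enters a
+	 * deep idle state and sets it again on wakeup, taking the lock
+	 * bit around the save/restore of shared core state.
+	 */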
+ for (i = 0; i < nr_cores; i++) {
+ int first_cpu = i * threads_per_core;
+ int node = cpu_to_node(first_cpu);
+ size_t paca_ptr_array_size;
+
+ core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
+ *core_idle_state = (1 << threads_per_core) - 1;
+ paca_ptr_array_size = (threads_per_core *
+ sizeof(struct paca_struct *));
+
+ for (j = 0; j < threads_per_core; j++) {
+ int cpu = first_cpu + j;
+
+ paca_ptrs[cpu]->core_idle_state_ptr = core_idle_state;
+ paca_ptrs[cpu]->thread_idle_state = PNV_THREAD_RUNNING;
+ paca_ptrs[cpu]->thread_mask = 1 << j;
+ }
+ }
+
+ update_subcore_sibling_mask();
+
+ if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) {
+ int rc = pnv_save_sprs_for_deep_states();
+
+ if (likely(!rc))
+ return;
+
+ /*
+ * The stop-api is unable to restore hypervisor
+ * resources on wakeup from platform idle states which
+ * lose full context. So disable such states.
+ */
+ supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT;
+ pr_warn("cpuidle-powernv: Disabling idle states that lose full context\n");
+ pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n");
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+ (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) {
+ /*
+ * Use the default stop state for CPU-Hotplug
+ * if available.
+ */
+ if (default_stop_found) {
+ pnv_deepest_stop_psscr_val =
+ pnv_default_stop_val;
+ pnv_deepest_stop_psscr_mask =
+ pnv_default_stop_mask;
+ pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n",
+ pnv_deepest_stop_psscr_val);
+ } else { /* Fallback to snooze loop for CPU-Hotplug */
+ deepest_stop_found = false;
+ pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n");
+ }
+ }
+ }
+}
+
+u32 pnv_get_supported_cpuidle_states(void)
+{
+ return supported_cpuidle_states;
+}
+EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states);
+
+static void pnv_fastsleep_workaround_apply(void *info)
+{
+ int rc;
+ int *err = info;
+
+ rc = opal_config_cpu_idle_state(OPAL_CONFIG_IDLE_FASTSLEEP,
+ OPAL_CONFIG_IDLE_APPLY);
+ if (rc)
+ *err = 1;
+}
+
+/*
+ * Used to store fastsleep workaround state
+ * 0 - Workaround applied/undone at fastsleep entry/exit path (Default)
+ * 1 - Workaround applied once, never undone.
+ */
+static u8 fastsleep_workaround_applyonce;
+
+static ssize_t show_fastsleep_workaround_applyonce(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", fastsleep_workaround_applyonce);
+}
+
+static ssize_t store_fastsleep_workaround_applyonce(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ cpumask_t primary_thread_mask;
+ int err;
+ u8 val;
+
+ if (kstrtou8(buf, 0, &val) || val != 1)
+ return -EINVAL;
+
+ if (fastsleep_workaround_applyonce == 1)
+ return count;
+
+ /*
+ * fastsleep_workaround_applyonce = 1 implies
+ * fastsleep workaround needs to be left in 'applied' state on all
+	 * the cores. Do this by:
+ * 1. Patching out the call to 'undo' workaround in fastsleep exit path
+ * 2. Sending ipi to all the cores which have at least one online thread
+ * 3. Patching out the call to 'apply' workaround in fastsleep entry
+ * path
+ * There is no need to send ipi to cores which have all threads
+ * offlined, as last thread of the core entering fastsleep or deeper
+ * state would have applied workaround.
+ */
+ err = patch_instruction(
+ (unsigned int *)pnv_fastsleep_workaround_at_exit,
+ PPC_INST_NOP);
+ if (err) {
+ pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_exit");
+ goto fail;
+ }
+
+ get_online_cpus();
+ primary_thread_mask = cpu_online_cores_map();
+ on_each_cpu_mask(&primary_thread_mask,
+ pnv_fastsleep_workaround_apply,
+ &err, 1);
+ put_online_cpus();
+ if (err) {
+ pr_err("fastsleep_workaround_applyonce change failed while running pnv_fastsleep_workaround_apply");
+ goto fail;
+ }
+
+ err = patch_instruction(
+ (unsigned int *)pnv_fastsleep_workaround_at_entry,
+ PPC_INST_NOP);
+ if (err) {
+ pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_entry");
+ goto fail;
+ }
+
+ fastsleep_workaround_applyonce = 1;
+
+ return count;
+fail:
+ return -EIO;
+}
+
+static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600,
+ show_fastsleep_workaround_applyonce,
+ store_fastsleep_workaround_applyonce);
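+
+/*
+ * Illustrative usage (assuming the standard sysfs layout): writing "1" to
+ * /sys/devices/system/cpu/fastsleep_workaround_applyonce leaves the
+ * workaround permanently applied on all cores; any other value is rejected
+ * with -EINVAL, and the setting cannot be reverted through this interface.
+ */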
+
+static unsigned long __power7_idle_type(unsigned long type)
+{
+ unsigned long srr1;
+
+ if (!prep_irq_for_idle_irqsoff())
+ return 0;
+
+ __ppc64_runlatch_off();
+ srr1 = power7_idle_insn(type);
+ __ppc64_runlatch_on();
+
+ fini_irq_for_idle_irqsoff();
+
+ return srr1;
+}
+
+void power7_idle_type(unsigned long type)
+{
+ unsigned long srr1;
+
+ srr1 = __power7_idle_type(type);
+ irq_set_pending_from_srr1(srr1);
+}
+
+void power7_idle(void)
+{
+ if (!powersave_nap)
+ return;
+
+ power7_idle_type(PNV_THREAD_NAP);
+}
+
+static unsigned long __power9_idle_type(unsigned long stop_psscr_val,
+ unsigned long stop_psscr_mask)
+{
+ unsigned long psscr;
+ unsigned long srr1;
+
+ if (!prep_irq_for_idle_irqsoff())
+ return 0;
+
+ psscr = mfspr(SPRN_PSSCR);
+ psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;
+
+ __ppc64_runlatch_off();
+ srr1 = power9_idle_stop(psscr);
+ __ppc64_runlatch_on();
+
+ fini_irq_for_idle_irqsoff();
+
+ return srr1;
+}
+
+void power9_idle_type(unsigned long stop_psscr_val,
+ unsigned long stop_psscr_mask)
+{
+ unsigned long srr1;
+
+ srr1 = __power9_idle_type(stop_psscr_val, stop_psscr_mask);
+ irq_set_pending_from_srr1(srr1);
+}
+
+/*
+ * Used for ppc_md.power_save which needs a function with no parameters
+ */
+void power9_idle(void)
+{
+ power9_idle_type(pnv_default_stop_val, pnv_default_stop_mask);
+}
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+/*
+ * This is used in working around bugs in thread reconfiguration
+ * on POWER9 (at least up to Nimbus DD2.2) relating to transactional
+ * memory and the way that XER[SO] is checkpointed.
+ * This function forces the core into SMT4 by asking all other
+ * threads not to stop, and sending a message to any that are
+ * in a stop state.
+ * Must be called with preemption disabled.
+ */
+void pnv_power9_force_smt4_catch(void)
+{
+ int cpu, cpu0, thr;
+ int awake_threads = 1; /* this thread is awake */
+ int poke_threads = 0;
+ int need_awake = threads_per_core;
+
+ cpu = smp_processor_id();
+ cpu0 = cpu & ~(threads_per_core - 1);
+ for (thr = 0; thr < threads_per_core; ++thr) {
+ if (cpu != cpu0 + thr)
+ atomic_inc(&paca_ptrs[cpu0+thr]->dont_stop);
+ }
+ /* order setting dont_stop vs testing requested_psscr */
+ mb();
+ for (thr = 0; thr < threads_per_core; ++thr) {
+ if (!paca_ptrs[cpu0+thr]->requested_psscr)
+ ++awake_threads;
+ else
+ poke_threads |= (1 << thr);
+ }
+
+ /* If at least 3 threads are awake, the core is in SMT4 already */
+ if (awake_threads < need_awake) {
+ /* We have to wake some threads; we'll use msgsnd */
+ for (thr = 0; thr < threads_per_core; ++thr) {
+ if (poke_threads & (1 << thr)) {
+ ppc_msgsnd_sync();
+ ppc_msgsnd(PPC_DBELL_MSGTYPE, 0,
+ paca_ptrs[cpu0+thr]->hw_cpu_id);
+ }
+ }
+ /* now spin until at least 3 threads are awake */
+ do {
+ for (thr = 0; thr < threads_per_core; ++thr) {
+ if ((poke_threads & (1 << thr)) &&
+ !paca_ptrs[cpu0+thr]->requested_psscr) {
+ ++awake_threads;
+ poke_threads &= ~(1 << thr);
+ }
+ }
+ } while (awake_threads < need_awake);
+ }
+}
+EXPORT_SYMBOL_GPL(pnv_power9_force_smt4_catch);
+
+void pnv_power9_force_smt4_release(void)
+{
+ int cpu, cpu0, thr;
+
+ cpu = smp_processor_id();
+ cpu0 = cpu & ~(threads_per_core - 1);
+
+ /* clear all the dont_stop flags */
+ for (thr = 0; thr < threads_per_core; ++thr) {
+ if (cpu != cpu0 + thr)
+ atomic_dec(&paca_ptrs[cpu0+thr]->dont_stop);
+ }
+}
+EXPORT_SYMBOL_GPL(pnv_power9_force_smt4_release);
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val)
+{
+ u64 pir = get_hard_smp_processor_id(cpu);
+
+ mtspr(SPRN_LPCR, lpcr_val);
+
+ /*
+ * Program the LPCR via stop-api only if the deepest stop state
+ * can lose hypervisor context.
+ */
+ if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT)
+ opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val);
+}
+
+/*
+ * pnv_cpu_offline: A function that puts the CPU into the deepest
+ * available platform idle state when it is taken offline. Must be
+ * called with interrupts hard disabled and no lazy irq pending.
+ */
+unsigned long pnv_cpu_offline(unsigned int cpu)
+{
+ unsigned long srr1;
+ u32 idle_states = pnv_get_supported_cpuidle_states();
+
+ __ppc64_runlatch_off();
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) {
+ unsigned long psscr;
+
+ psscr = mfspr(SPRN_PSSCR);
+ psscr = (psscr & ~pnv_deepest_stop_psscr_mask) |
+ pnv_deepest_stop_psscr_val;
+ srr1 = power9_offline_stop(psscr);
+
+ } else if ((idle_states & OPAL_PM_WINKLE_ENABLED) &&
+ (idle_states & OPAL_PM_LOSE_FULL_CONTEXT)) {
+ srr1 = power7_idle_insn(PNV_THREAD_WINKLE);
+ } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
+ (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
+ srr1 = power7_idle_insn(PNV_THREAD_SLEEP);
+ } else if (idle_states & OPAL_PM_NAP_ENABLED) {
+ srr1 = power7_idle_insn(PNV_THREAD_NAP);
+ } else {
+ /* This is the fallback method. We emulate snooze */
+ while (!generic_check_cpu_restart(cpu)) {
+ HMT_low();
+ HMT_very_low();
+ }
+ srr1 = 0;
+ HMT_medium();
+ }
+
+ __ppc64_runlatch_on();
+
+ return srr1;
+}
+#endif
+
+/*
+ * Power ISA 3.0 idle initialization.
+ *
+ * POWER ISA 3.0 defines a new SPR Processor stop Status and Control
+ * Register (PSSCR) to control idle behavior.
+ *
+ * PSSCR layout:
+ * ----------------------------------------------------------
+ * | PLS | /// | SD | ESL | EC | PSLL | /// | TR | MTL | RL |
+ * ----------------------------------------------------------
+ * 0      4     41   42    43   44     48    54   56    60
+ *
+ * PSSCR key fields:
+ * Bits 0:3 - Power-Saving Level Status (PLS). This field indicates the
+ * lowest power-saving state the thread entered since stop instruction was
+ * last executed.
+ *
+ * Bit 41 - Status Disable(SD)
+ * 0 - Shows PLS entries
+ * 1 - PLS entries are all 0
+ *
+ * Bit 42 - Enable State Loss
+ * 0 - No state is lost irrespective of other fields
+ * 1 - Allows state loss
+ *
+ * Bit 43 - Exit Criterion
+ * 0 - Exit from power-save mode on any interrupt
+ * 1 - Exit from power-save mode controlled by LPCR's PECE bits
+ *
+ * Bits 44:47 - Power-Saving Level Limit
+ * This limits the power-saving level that can be entered into.
+ *
+ * Bits 60:63 - Requested Level
+ * Used to specify which power-saving level must be entered on executing
+ * stop instruction
+ */
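+
+/*
+ * Example (illustrative only, using the IBM bit numbering above in which
+ * bit 0 is the most significant bit of the 64-bit PSSCR):
+ *
+ *	rl   = psscr & 0xfULL;		  Requested Level (bits 60:63)
+ *	psll = (psscr >> 16) & 0xfULL;	  Power-Saving Level Limit (bits 44:47)
+ *	pls  = (psscr >> 60) & 0xfULL;	  Power-Saving Level Status (bits 0:3)
+ *
+ * which is consistent with the PSSCR_RL_MASK extraction used further below.
+ */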
+
+int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags)
+{
+ int err = 0;
+
+ /*
+ * psscr_mask == 0xf indicates an older firmware.
+ * Set remaining fields of psscr to the default values.
+ * See NOTE above definition of PSSCR_HV_DEFAULT_VAL
+ */
+ if (*psscr_mask == 0xf) {
+ *psscr_val = *psscr_val | PSSCR_HV_DEFAULT_VAL;
+ *psscr_mask = PSSCR_HV_DEFAULT_MASK;
+ return err;
+ }
+
+ /*
+ * New firmware is expected to set the psscr_val bits correctly.
+ * Validate that the following invariants are correctly maintained by
+ * the new firmware.
+ * - ESL bit value matches the EC bit value.
+ * - ESL bit is set for all the deep stop states.
+ */
+ if (GET_PSSCR_ESL(*psscr_val) != GET_PSSCR_EC(*psscr_val)) {
+ err = ERR_EC_ESL_MISMATCH;
+ } else if ((flags & OPAL_PM_LOSE_FULL_CONTEXT) &&
+ GET_PSSCR_ESL(*psscr_val) == 0) {
+ err = ERR_DEEP_STATE_ESL_MISMATCH;
+ }
+
+ return err;
+}
+
+/*
+ * pnv_power9_idle_init: Initializes the default idle state, the first
+ *                       deep idle state and the deepest idle state on
+ *                       ISA 3.0 CPUs, using the idle states discovered
+ *                       from the device tree (pnv_idle_states[]).
+ *
+ * Returns 0 on success
+ */
+static int __init pnv_power9_idle_init(void)
+{
+ u64 max_residency_ns = 0;
+ int i;
+
+ /*
+ * Set pnv_first_deep_stop_state, pnv_deepest_stop_psscr_{val,mask},
+ * and the pnv_default_stop_{val,mask}.
+ *
+ * pnv_first_deep_stop_state should be set to the first stop
+ * level to cause hypervisor state loss.
+ *
+ * pnv_deepest_stop_{val,mask} should be set to values corresponding to
+ * the deepest stop state.
+ *
+ * pnv_default_stop_{val,mask} should be set to values corresponding to
+ * the shallowest (OPAL_PM_STOP_INST_FAST) loss-less stop state.
+ */
+ pnv_first_deep_stop_state = MAX_STOP_STATE;
+ for (i = 0; i < nr_pnv_idle_states; i++) {
+ int err;
+ struct pnv_idle_states_t *state = &pnv_idle_states[i];
+ u64 psscr_rl = state->psscr_val & PSSCR_RL_MASK;
+
+ if ((state->flags & OPAL_PM_LOSE_FULL_CONTEXT) &&
+ pnv_first_deep_stop_state > psscr_rl)
+ pnv_first_deep_stop_state = psscr_rl;
+
+ err = validate_psscr_val_mask(&state->psscr_val,
+ &state->psscr_mask,
+ state->flags);
+ if (err) {
+ report_invalid_psscr_val(state->psscr_val, err);
+ continue;
+ }
+
+ state->valid = true;
+
+ if (max_residency_ns < state->residency_ns) {
+ max_residency_ns = state->residency_ns;
+ pnv_deepest_stop_psscr_val = state->psscr_val;
+ pnv_deepest_stop_psscr_mask = state->psscr_mask;
+ pnv_deepest_stop_flag = state->flags;
+ deepest_stop_found = true;
+ }
+
+ if (!default_stop_found &&
+ (state->flags & OPAL_PM_STOP_INST_FAST)) {
+ pnv_default_stop_val = state->psscr_val;
+ pnv_default_stop_mask = state->psscr_mask;
+ default_stop_found = true;
+ }
+ }
+
+ if (unlikely(!default_stop_found)) {
+ pr_warn("cpuidle-powernv: No suitable default stop state found. Disabling platform idle.\n");
+ } else {
+ ppc_md.power_save = power9_idle;
+ pr_info("cpuidle-powernv: Default stop: psscr = 0x%016llx,mask=0x%016llx\n",
+ pnv_default_stop_val, pnv_default_stop_mask);
+ }
+
+ if (unlikely(!deepest_stop_found)) {
+ pr_warn("cpuidle-powernv: No suitable stop state for CPU-Hotplug. Offlined CPUs will busy wait");
+ } else {
+ pr_info("cpuidle-powernv: Deepest stop: psscr = 0x%016llx,mask=0x%016llx\n",
+ pnv_deepest_stop_psscr_val,
+ pnv_deepest_stop_psscr_mask);
+ }
+
+ pr_info("cpuidle-powernv: Requested Level (RL) value of first deep stop = 0x%llx\n",
+ pnv_first_deep_stop_state);
+
+ return 0;
+}
+
+/*
+ * Probe device tree for supported idle states
+ */
+static void __init pnv_probe_idle_states(void)
+{
+ int i;
+
+ if (nr_pnv_idle_states < 0) {
+ pr_warn("cpuidle-powernv: no idle states found in the DT\n");
+ return;
+ }
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ if (pnv_power9_idle_init())
+ return;
+ }
+
+ for (i = 0; i < nr_pnv_idle_states; i++)
+ supported_cpuidle_states |= pnv_idle_states[i].flags;
+}
+
+/*
+ * This function parses device-tree and populates all the information
+ * into pnv_idle_states structure. It also sets up nr_pnv_idle_states
+ * which is the number of cpuidle states discovered through device-tree.
+ */
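+
+/*
+ * For reference, the properties consumed below live under the
+ * /ibm,opal/power-mgt node and are parallel arrays with one entry per
+ * idle state, roughly of the form (illustrative sketch only, not taken
+ * from real firmware):
+ *
+ *	ibm,cpu-idle-state-names	 = "state0", "state1";
+ *	ibm,cpu-idle-state-flags	 = <f0 f1>;
+ *	ibm,cpu-idle-state-latencies-ns	 = <l0 l1>;
+ *	ibm,cpu-idle-state-residency-ns	 = <r0 r1>;
+ *	ibm,cpu-idle-state-psscr	 = /bits/ 64 <p0 p1>;	(POWER9 only)
+ *	ibm,cpu-idle-state-psscr-mask	 = /bits/ 64 <m0 m1>;	(POWER9 only)
+ */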
+
+static int pnv_parse_cpuidle_dt(void)
+{
+ struct device_node *np;
+ int nr_idle_states, i;
+ int rc = 0;
+ u32 *temp_u32;
+ u64 *temp_u64;
+ const char **temp_string;
+
+ np = of_find_node_by_path("/ibm,opal/power-mgt");
+ if (!np) {
+ pr_warn("opal: PowerMgmt Node not found\n");
+ return -ENODEV;
+ }
+ nr_idle_states = of_property_count_u32_elems(np,
+ "ibm,cpu-idle-state-flags");
+
+ pnv_idle_states = kcalloc(nr_idle_states, sizeof(*pnv_idle_states),
+ GFP_KERNEL);
+ temp_u32 = kcalloc(nr_idle_states, sizeof(u32), GFP_KERNEL);
+ temp_u64 = kcalloc(nr_idle_states, sizeof(u64), GFP_KERNEL);
+ temp_string = kcalloc(nr_idle_states, sizeof(char *), GFP_KERNEL);
+
+ if (!(pnv_idle_states && temp_u32 && temp_u64 && temp_string)) {
+ pr_err("Could not allocate memory for dt parsing\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ /* Read flags */
+ if (of_property_read_u32_array(np, "ibm,cpu-idle-state-flags",
+ temp_u32, nr_idle_states)) {
+ pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-flags in DT\n");
+ rc = -EINVAL;
+ goto out;
+ }
+ for (i = 0; i < nr_idle_states; i++)
+ pnv_idle_states[i].flags = temp_u32[i];
+
+ /* Read latencies */
+ if (of_property_read_u32_array(np, "ibm,cpu-idle-state-latencies-ns",
+ temp_u32, nr_idle_states)) {
+ pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-latencies-ns in DT\n");
+ rc = -EINVAL;
+ goto out;
+ }
+ for (i = 0; i < nr_idle_states; i++)
+ pnv_idle_states[i].latency_ns = temp_u32[i];
+
+ /* Read residencies */
+ if (of_property_read_u32_array(np, "ibm,cpu-idle-state-residency-ns",
+ temp_u32, nr_idle_states)) {
+		pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-residency-ns in DT\n");
+ rc = -EINVAL;
+ goto out;
+ }
+ for (i = 0; i < nr_idle_states; i++)
+ pnv_idle_states[i].residency_ns = temp_u32[i];
+
+ /* For power9 */
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ /* Read pm_crtl_val */
+ if (of_property_read_u64_array(np, "ibm,cpu-idle-state-psscr",
+ temp_u64, nr_idle_states)) {
+ pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr in DT\n");
+ rc = -EINVAL;
+ goto out;
+ }
+ for (i = 0; i < nr_idle_states; i++)
+ pnv_idle_states[i].psscr_val = temp_u64[i];
+
+ /* Read pm_crtl_mask */
+ if (of_property_read_u64_array(np, "ibm,cpu-idle-state-psscr-mask",
+ temp_u64, nr_idle_states)) {
+ pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr-mask in DT\n");
+ rc = -EINVAL;
+ goto out;
+ }
+ for (i = 0; i < nr_idle_states; i++)
+ pnv_idle_states[i].psscr_mask = temp_u64[i];
+ }
+
+ /*
+	 * The power8 specific properties ibm,cpu-idle-state-pmicr-mask and
+	 * ibm,cpu-idle-state-pmicr-val were never used and there is no
+	 * plan to use them in the near future. Hence, they are not parsed here.
+ */
+
+ if (of_property_read_string_array(np, "ibm,cpu-idle-state-names",
+ temp_string, nr_idle_states) < 0) {
+ pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-names in DT\n");
+ rc = -EINVAL;
+ goto out;
+ }
+ for (i = 0; i < nr_idle_states; i++)
+ strlcpy(pnv_idle_states[i].name, temp_string[i],
+ PNV_IDLE_NAME_LEN);
+ nr_pnv_idle_states = nr_idle_states;
+ rc = 0;
+out:
+ kfree(temp_u32);
+ kfree(temp_u64);
+ kfree(temp_string);
+ return rc;
+}
+
+static int __init pnv_init_idle_states(void)
+{
+ int rc = 0;
+ supported_cpuidle_states = 0;
+
+ /* In case we error out nr_pnv_idle_states will be zero */
+ nr_pnv_idle_states = 0;
+ if (cpuidle_disable != IDLE_NO_OVERRIDE)
+ goto out;
+ rc = pnv_parse_cpuidle_dt();
+ if (rc)
+ return rc;
+ pnv_probe_idle_states();
+
+ if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
+ patch_instruction(
+ (unsigned int *)pnv_fastsleep_workaround_at_entry,
+ PPC_INST_NOP);
+ patch_instruction(
+ (unsigned int *)pnv_fastsleep_workaround_at_exit,
+ PPC_INST_NOP);
+ } else {
+ /*
+ * OPAL_PM_SLEEP_ENABLED_ER1 is set. It indicates that
+ * workaround is needed to use fastsleep. Provide sysfs
+ * control to choose how this workaround has to be applied.
+ */
+ device_create_file(cpu_subsys.dev_root,
+ &dev_attr_fastsleep_workaround_applyonce);
+ }
+
+ pnv_alloc_idle_core_states();
+
+ if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED)
+ ppc_md.power_save = power7_idle;
+
+out:
+ return 0;
+}
+machine_subsys_initcall(powernv, pnv_init_idle_states);
diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
new file mode 100644
index 000000000..ce6597a29
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -0,0 +1,359 @@
+/*
+ * Copyright (C) IBM Corporation, 2014, 2017
+ * Anton Blanchard, Rashmica Gupta.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "memtrace: " fmt
+
+#include <linux/bitops.h>
+#include <linux/string.h>
+#include <linux/memblock.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+#include <asm/machdep.h>
+#include <asm/debugfs.h>
+
+/* This enables us to keep track of the memory removed from each node. */
+struct memtrace_entry {
+ void *mem;
+ u64 start;
+ u64 size;
+ u32 nid;
+ struct dentry *dir;
+ char name[16];
+};
+
+static DEFINE_MUTEX(memtrace_mutex);
+static u64 memtrace_size;
+
+static struct memtrace_entry *memtrace_array;
+static unsigned int memtrace_array_nr;
+
+
+static ssize_t memtrace_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct memtrace_entry *ent = filp->private_data;
+
+ return simple_read_from_buffer(ubuf, count, ppos, ent->mem, ent->size);
+}
+
+static const struct file_operations memtrace_fops = {
+ .llseek = default_llseek,
+ .read = memtrace_read,
+ .open = simple_open,
+};
+
+static int check_memblock_online(struct memory_block *mem, void *arg)
+{
+ if (mem->state != MEM_ONLINE)
+ return -1;
+
+ return 0;
+}
+
+static int change_memblock_state(struct memory_block *mem, void *arg)
+{
+ unsigned long state = (unsigned long)arg;
+
+ mem->state = state;
+
+ return 0;
+}
+
+static void memtrace_clear_range(unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long pfn;
+
+ /*
+ * As pages are offline, we cannot trust the memmap anymore. As HIGHMEM
+	 * does not apply, avoid passing around "struct page" and use
+	 * clear_page() directly instead.
+ */
+ for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
+ if (IS_ALIGNED(pfn, PAGES_PER_SECTION))
+ cond_resched();
+ clear_page(__va(PFN_PHYS(pfn)));
+ }
+}
+
+/* called with device_hotplug_lock held */
+static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
+{
+ u64 end_pfn = start_pfn + nr_pages - 1;
+
+ if (walk_memory_range(start_pfn, end_pfn, NULL,
+ check_memblock_online))
+ return false;
+
+ walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE,
+ change_memblock_state);
+
+ if (offline_pages(start_pfn, nr_pages)) {
+ walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE,
+ change_memblock_state);
+ return false;
+ }
+
+ walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
+ change_memblock_state);
+
+
+ return true;
+}
+
+static u64 memtrace_alloc_node(u32 nid, u64 size)
+{
+ u64 start_pfn, end_pfn, nr_pages, pfn;
+ u64 base_pfn;
+ u64 bytes = memory_block_size_bytes();
+
+ if (!node_spanned_pages(nid))
+ return 0;
+
+ start_pfn = node_start_pfn(nid);
+ end_pfn = node_end_pfn(nid);
+ nr_pages = size >> PAGE_SHIFT;
+
+ /* Trace memory needs to be aligned to the size */
+ end_pfn = round_down(end_pfn - nr_pages, nr_pages);
+
+ lock_device_hotplug();
+ for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) {
+ if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) {
+ /*
+ * Clear the range while we still have a linear
+ * mapping.
+ */
+ memtrace_clear_range(base_pfn, nr_pages);
+ /*
+ * Remove memory in memory block size chunks so that
+ * iomem resources are always split to the same size and
+ * we never try to remove memory that spans two iomem
+ * resources.
+ */
+ end_pfn = base_pfn + nr_pages;
+			for (pfn = base_pfn; pfn < end_pfn; pfn += bytes >> PAGE_SHIFT) {
+ __remove_memory(nid, pfn << PAGE_SHIFT, bytes);
+ }
+ unlock_device_hotplug();
+ return base_pfn << PAGE_SHIFT;
+ }
+ }
+ unlock_device_hotplug();
+
+ return 0;
+}
+
+static int memtrace_init_regions_runtime(u64 size)
+{
+ u32 nid;
+ u64 m;
+
+ memtrace_array = kcalloc(num_online_nodes(),
+ sizeof(struct memtrace_entry), GFP_KERNEL);
+ if (!memtrace_array) {
+ pr_err("Failed to allocate memtrace_array\n");
+ return -EINVAL;
+ }
+
+ for_each_online_node(nid) {
+ m = memtrace_alloc_node(nid, size);
+
+ /*
+ * A node might not have any local memory, so warn but
+ * continue on.
+ */
+ if (!m) {
+ pr_err("Failed to allocate trace memory on node %d\n", nid);
+ continue;
+ }
+
+ pr_info("Allocated trace memory on node %d at 0x%016llx\n", nid, m);
+
+ memtrace_array[memtrace_array_nr].start = m;
+ memtrace_array[memtrace_array_nr].size = size;
+ memtrace_array[memtrace_array_nr].nid = nid;
+ memtrace_array_nr++;
+ }
+
+ return 0;
+}
+
+static struct dentry *memtrace_debugfs_dir;
+
+static int memtrace_init_debugfs(void)
+{
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < memtrace_array_nr; i++) {
+ struct dentry *dir;
+ struct memtrace_entry *ent = &memtrace_array[i];
+
+ ent->mem = ioremap(ent->start, ent->size);
+ /* Warn but continue on */
+ if (!ent->mem) {
+ pr_err("Failed to map trace memory at 0x%llx\n",
+ ent->start);
+ ret = -1;
+ continue;
+ }
+
+ snprintf(ent->name, 16, "%08x", ent->nid);
+ dir = debugfs_create_dir(ent->name, memtrace_debugfs_dir);
+ if (!dir) {
+ pr_err("Failed to create debugfs directory for node %d\n",
+ ent->nid);
+ return -1;
+ }
+
+ ent->dir = dir;
+ debugfs_create_file("trace", 0400, dir, ent, &memtrace_fops);
+ debugfs_create_x64("start", 0400, dir, &ent->start);
+ debugfs_create_x64("size", 0400, dir, &ent->size);
+ }
+
+ return ret;
+}
+
+static int online_mem_block(struct memory_block *mem, void *arg)
+{
+ return device_online(&mem->dev);
+}
+
+/*
+ * Iterate through the chunks of memory we have removed from the kernel
+ * and attempt to add them back to the kernel.
+ */
+static int memtrace_online(void)
+{
+ int i, ret = 0;
+ struct memtrace_entry *ent;
+
+ for (i = memtrace_array_nr - 1; i >= 0; i--) {
+ ent = &memtrace_array[i];
+
+ /* We have onlined this chunk previously */
+ if (ent->nid == -1)
+ continue;
+
+ /* Remove from io mappings */
+ if (ent->mem) {
+ iounmap(ent->mem);
+ ent->mem = 0;
+ }
+
+ if (add_memory(ent->nid, ent->start, ent->size)) {
+ pr_err("Failed to add trace memory to node %d\n",
+ ent->nid);
+ ret += 1;
+ continue;
+ }
+
+ /*
+		 * If the kernel isn't compiled with the auto-online option,
+		 * we need to online the memory ourselves.
+ */
+ if (!memhp_auto_online) {
+ lock_device_hotplug();
+ walk_memory_range(PFN_DOWN(ent->start),
+ PFN_UP(ent->start + ent->size - 1),
+ NULL, online_mem_block);
+ unlock_device_hotplug();
+ }
+
+ /*
+ * Memory was added successfully so clean up references to it
+ * so on reentry we can tell that this chunk was added.
+ */
+ debugfs_remove_recursive(ent->dir);
+ pr_info("Added trace memory back to node %d\n", ent->nid);
+ ent->size = ent->start = ent->nid = -1;
+ }
+ if (ret)
+ return ret;
+
+ /* If all chunks of memory were added successfully, reset globals */
+ kfree(memtrace_array);
+ memtrace_array = NULL;
+ memtrace_size = 0;
+ memtrace_array_nr = 0;
+ return 0;
+}
+
+static int memtrace_enable_set(void *data, u64 val)
+{
+ int rc = -EAGAIN;
+ u64 bytes;
+
+ /*
+ * Don't attempt to do anything if size isn't aligned to a memory
+ * block or equal to zero.
+ */
+ bytes = memory_block_size_bytes();
+ if (val & (bytes - 1)) {
+ pr_err("Value must be aligned with 0x%llx\n", bytes);
+ return -EINVAL;
+ }
+
+ mutex_lock(&memtrace_mutex);
+
+ /* Re-add/online previously removed/offlined memory */
+ if (memtrace_size) {
+ if (memtrace_online())
+ goto out_unlock;
+ }
+
+ if (!val) {
+ rc = 0;
+ goto out_unlock;
+ }
+
+ /* Offline and remove memory */
+ if (memtrace_init_regions_runtime(val))
+ goto out_unlock;
+
+ if (memtrace_init_debugfs())
+ goto out_unlock;
+
+ memtrace_size = val;
+ rc = 0;
+out_unlock:
+ mutex_unlock(&memtrace_mutex);
+ return rc;
+}
+
+static int memtrace_enable_get(void *data, u64 *val)
+{
+ *val = memtrace_size;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(memtrace_init_fops, memtrace_enable_get,
+ memtrace_enable_set, "0x%016llx\n");
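+
+/*
+ * Illustrative usage (assuming debugfs is mounted at /sys/kernel/debug):
+ * writing a memory-block-aligned size such as 0x40000000 to
+ * /sys/kernel/debug/powerpc/memtrace/enable removes that much memory from
+ * each online node where possible; the removed ranges then appear as
+ * per-node directories holding "trace", "start" and "size" files.
+ * Writing 0 gives any previously removed memory back to the kernel.
+ */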
+
+static int memtrace_init(void)
+{
+ memtrace_debugfs_dir = debugfs_create_dir("memtrace",
+ powerpc_debugfs_root);
+ if (!memtrace_debugfs_dir)
+ return -1;
+
+ debugfs_create_file("enable", 0600, memtrace_debugfs_dir,
+ NULL, &memtrace_init_fops);
+
+ return 0;
+}
+machine_device_initcall(powernv, memtrace_init);
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
new file mode 100644
index 000000000..fd8166ffb
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -0,0 +1,1012 @@
+/*
+ * This file implements the DMA operations for NVLink devices. The NPU
+ * devices all point to the same iommu table as the parent PCI device.
+ *
+ * Copyright Alistair Popple, IBM Corporation 2015.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+#include <linux/mmu_notifier.h>
+#include <linux/mmu_context.h>
+#include <linux/of.h>
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <linux/memblock.h>
+#include <linux/iommu.h>
+#include <linux/debugfs.h>
+
+#include <asm/debugfs.h>
+#include <asm/tlb.h>
+#include <asm/powernv.h>
+#include <asm/reg.h>
+#include <asm/opal.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/pnv-pci.h>
+#include <asm/msi_bitmap.h>
+#include <asm/opal.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+#define npu_to_phb(x) container_of(x, struct pnv_phb, npu)
+
+/*
+ * spinlock to protect initialisation of an npu_context for a particular
+ * mm_struct.
+ */
+static DEFINE_SPINLOCK(npu_context_lock);
+
+/*
+ * When an address shootdown range exceeds this threshold we invalidate the
+ * entire TLB on the GPU for the given PID rather than each specific address in
+ * the range.
+ */
+static uint64_t atsd_threshold = 2 * 1024 * 1024;
+static struct dentry *atsd_threshold_dentry;
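+
+/*
+ * For example, with the default 2MB threshold a 64K shootdown is issued as
+ * per-page ATSDs, while unmapping a 1GB range triggers a single whole-PID
+ * invalidation. The threshold is tunable at run time via the
+ * "atsd_threshold" debugfs entry created in pnv_npu2_init() below.
+ */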
+
+/*
+ * Other types of TCE cache invalidation are not functional in the
+ * hardware.
+ */
+static struct pci_dev *get_pci_dev(struct device_node *dn)
+{
+ struct pci_dn *pdn = PCI_DN(dn);
+ struct pci_dev *pdev;
+
+ pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
+ pdn->busno, pdn->devfn);
+
+ /*
+ * pci_get_domain_bus_and_slot() increased the reference count of
+ * the PCI device, but callers don't need that actually as the PE
+ * already holds a reference to the device. Since callers aren't
+ * aware of the reference count change, call pci_dev_put() now to
+ * avoid leaks.
+ */
+ if (pdev)
+ pci_dev_put(pdev);
+
+ return pdev;
+}
+
+/* Given a NPU device get the associated PCI device. */
+struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev)
+{
+ struct device_node *dn;
+ struct pci_dev *gpdev;
+
+ if (WARN_ON(!npdev))
+ return NULL;
+
+ if (WARN_ON(!npdev->dev.of_node))
+ return NULL;
+
+	/* Get associated PCI device */
+ dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0);
+ if (!dn)
+ return NULL;
+
+ gpdev = get_pci_dev(dn);
+ of_node_put(dn);
+
+ return gpdev;
+}
+EXPORT_SYMBOL(pnv_pci_get_gpu_dev);
+
+/* Given the real PCI device get a linked NPU device. */
+struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
+{
+ struct device_node *dn;
+ struct pci_dev *npdev;
+
+ if (WARN_ON(!gpdev))
+ return NULL;
+
+ /* Not all PCI devices have device-tree nodes */
+ if (!gpdev->dev.of_node)
+ return NULL;
+
+	/* Get associated PCI device */
+ dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index);
+ if (!dn)
+ return NULL;
+
+ npdev = get_pci_dev(dn);
+ of_node_put(dn);
+
+ return npdev;
+}
+EXPORT_SYMBOL(pnv_pci_get_npu_dev);
+
+#define NPU_DMA_OP_UNSUPPORTED() \
+ dev_err_once(dev, "%s operation unsupported for NVLink devices\n", \
+ __func__)
+
+static void *dma_npu_alloc(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flag,
+ unsigned long attrs)
+{
+ NPU_DMA_OP_UNSUPPORTED();
+ return NULL;
+}
+
+static void dma_npu_free(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t dma_handle,
+ unsigned long attrs)
+{
+ NPU_DMA_OP_UNSUPPORTED();
+}
+
+static dma_addr_t dma_npu_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction direction,
+ unsigned long attrs)
+{
+ NPU_DMA_OP_UNSUPPORTED();
+ return 0;
+}
+
+static int dma_npu_map_sg(struct device *dev, struct scatterlist *sglist,
+ int nelems, enum dma_data_direction direction,
+ unsigned long attrs)
+{
+ NPU_DMA_OP_UNSUPPORTED();
+ return 0;
+}
+
+static int dma_npu_dma_supported(struct device *dev, u64 mask)
+{
+ NPU_DMA_OP_UNSUPPORTED();
+ return 0;
+}
+
+static u64 dma_npu_get_required_mask(struct device *dev)
+{
+ NPU_DMA_OP_UNSUPPORTED();
+ return 0;
+}
+
+static const struct dma_map_ops dma_npu_ops = {
+ .map_page = dma_npu_map_page,
+ .map_sg = dma_npu_map_sg,
+ .alloc = dma_npu_alloc,
+ .free = dma_npu_free,
+ .dma_supported = dma_npu_dma_supported,
+ .get_required_mask = dma_npu_get_required_mask,
+};
+
+/*
+ * Returns the PE associated with the PCI device of the given
+ * NPU. Also returns the linked pci device via *gpdev when gpdev != NULL.
+ */
+static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
+ struct pci_dev **gpdev)
+{
+ struct pnv_phb *phb;
+ struct pci_controller *hose;
+ struct pci_dev *pdev;
+ struct pnv_ioda_pe *pe;
+ struct pci_dn *pdn;
+
+ pdev = pnv_pci_get_gpu_dev(npe->pdev);
+ if (!pdev)
+ return NULL;
+
+ pdn = pci_get_pdn(pdev);
+ if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+ return NULL;
+
+ hose = pci_bus_to_host(pdev->bus);
+ phb = hose->private_data;
+ pe = &phb->ioda.pe_array[pdn->pe_number];
+
+ if (gpdev)
+ *gpdev = pdev;
+
+ return pe;
+}
+
+long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
+ struct iommu_table *tbl)
+{
+ struct pnv_phb *phb = npe->phb;
+ int64_t rc;
+ const unsigned long size = tbl->it_indirect_levels ?
+ tbl->it_level_size : tbl->it_size;
+ const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
+ const __u64 win_size = tbl->it_size << tbl->it_page_shift;
+
+ pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
+ start_addr, start_addr + win_size - 1,
+ IOMMU_PAGE_SIZE(tbl));
+
+ rc = opal_pci_map_pe_dma_window(phb->opal_id,
+ npe->pe_number,
+ npe->pe_number,
+ tbl->it_indirect_levels + 1,
+ __pa(tbl->it_base),
+ size << 3,
+ IOMMU_PAGE_SIZE(tbl));
+ if (rc) {
+ pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
+ return rc;
+ }
+ pnv_pci_ioda2_tce_invalidate_entire(phb, false);
+
+ /* Add the table to the list so its TCE cache will get invalidated */
+ pnv_pci_link_table_and_group(phb->hose->node, num,
+ tbl, &npe->table_group);
+
+ return 0;
+}
+
+long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
+{
+ struct pnv_phb *phb = npe->phb;
+ int64_t rc;
+
+ pe_info(npe, "Removing DMA window\n");
+
+ rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
+ npe->pe_number,
+ 0/* levels */, 0/* table address */,
+ 0/* table size */, 0/* page size */);
+ if (rc) {
+ pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
+ return rc;
+ }
+ pnv_pci_ioda2_tce_invalidate_entire(phb, false);
+
+ pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
+ &npe->table_group);
+
+ return 0;
+}
+
+/*
+ * Enables 32 bit DMA on NPU.
+ */
+static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
+{
+ struct pci_dev *gpdev;
+ struct pnv_ioda_pe *gpe;
+ int64_t rc;
+
+ /*
+	 * Find the associated PCI devices and get the dma window
+ * information from there.
+ */
+ if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
+ return;
+
+ gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
+ if (!gpe)
+ return;
+
+ rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]);
+
+ /*
+ * We don't initialise npu_pe->tce32_table as we always use
+ * dma_npu_ops which are nops.
+ */
+ set_dma_ops(&npe->pdev->dev, &dma_npu_ops);
+}
+
+/*
+ * Enables bypass mode on the NPU. The NPU only supports one
+ * window per link, so bypass needs to be explicitly enabled or
+ * disabled. Unlike for a PHB3, bypass and non-bypass modes can't be
+ * active at the same time.
+ */
+static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
+{
+ struct pnv_phb *phb = npe->phb;
+ int64_t rc = 0;
+ phys_addr_t top = memblock_end_of_DRAM();
+
+ if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev)
+ return -EINVAL;
+
+ rc = pnv_npu_unset_window(npe, 0);
+ if (rc != OPAL_SUCCESS)
+ return rc;
+
+ /* Enable the bypass window */
+
+ top = roundup_pow_of_two(top);
+ dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n",
+ npe->pe_number);
+ rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
+ npe->pe_number, npe->pe_number,
+ 0 /* bypass base */, top);
+
+ if (rc == OPAL_SUCCESS)
+ pnv_pci_ioda2_tce_invalidate_entire(phb, false);
+
+ return rc;
+}
+
+void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
+{
+ int i;
+ struct pnv_phb *phb;
+ struct pci_dn *pdn;
+ struct pnv_ioda_pe *npe;
+ struct pci_dev *npdev;
+
+ for (i = 0; ; ++i) {
+ npdev = pnv_pci_get_npu_dev(gpdev, i);
+
+ if (!npdev)
+ break;
+
+ pdn = pci_get_pdn(npdev);
+ if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+ return;
+
+ phb = pci_bus_to_host(npdev->bus)->private_data;
+
+ /* We only do bypass if it's enabled on the linked device */
+ npe = &phb->ioda.pe_array[pdn->pe_number];
+
+ if (bypass) {
+ dev_info(&npdev->dev,
+ "Using 64-bit DMA iommu bypass\n");
+ pnv_npu_dma_set_bypass(npe);
+ } else {
+ dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n");
+ pnv_npu_dma_set_32(npe);
+ }
+ }
+}
+
+/* Switch ownership from platform code to external user (e.g. VFIO) */
+void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
+{
+ struct pnv_phb *phb = npe->phb;
+ int64_t rc;
+
+ /*
+ * Note: NPU has just a single TVE in the hardware which means that
+ * while used by the kernel, it can have either 32bit window or
+ * DMA bypass but never both. So we deconfigure 32bit window only
+ * if it was enabled at the moment of ownership change.
+ */
+ if (npe->table_group.tables[0]) {
+ pnv_npu_unset_window(npe, 0);
+ return;
+ }
+
+ /* Disable bypass */
+ rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
+ npe->pe_number, npe->pe_number,
+ 0 /* bypass base */, 0);
+ if (rc) {
+ pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
+ return;
+ }
+ pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
+}
+
+struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
+{
+ struct pnv_phb *phb = npe->phb;
+ struct pci_bus *pbus = phb->hose->bus;
+ struct pci_dev *npdev, *gpdev = NULL, *gptmp;
+ struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
+
+ if (!gpe || !gpdev)
+ return NULL;
+
+ list_for_each_entry(npdev, &pbus->devices, bus_list) {
+ gptmp = pnv_pci_get_gpu_dev(npdev);
+
+ if (gptmp != gpdev)
+ continue;
+
+ pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
+ iommu_group_add_device(gpe->table_group.group, &npdev->dev);
+ }
+
+ return gpe;
+}
+
+/* Maximum number of nvlinks per npu */
+#define NV_MAX_LINKS 6
+
+/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */
+static int max_npu2_index;
+
+struct npu_context {
+ struct mm_struct *mm;
+ struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS];
+ struct mmu_notifier mn;
+ struct kref kref;
+ bool nmmu_flush;
+
+ /* Callback to stop translation requests on a given GPU */
+ void (*release_cb)(struct npu_context *context, void *priv);
+
+ /*
+ * Private pointer passed to the above callback for usage by
+ * device drivers.
+ */
+ void *priv;
+};
+
+struct mmio_atsd_reg {
+ struct npu *npu;
+ int reg;
+};
+
+/*
+ * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
+ * if none are available.
+ */
+static int get_mmio_atsd_reg(struct npu *npu)
+{
+ int i;
+
+ for (i = 0; i < npu->mmio_atsd_count; i++) {
+ if (!test_bit(i, &npu->mmio_atsd_usage))
+ if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage))
+ return i;
+ }
+
+ return -ENOSPC;
+}
+
+static void put_mmio_atsd_reg(struct npu *npu, int reg)
+{
+ clear_bit_unlock(reg, &npu->mmio_atsd_usage);
+}
+
+/* MMIO ATSD register offsets */
+#define XTS_ATSD_AVA 1
+#define XTS_ATSD_STAT 2
+
+static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg,
+ unsigned long launch, unsigned long va)
+{
+ struct npu *npu = mmio_atsd_reg->npu;
+ int reg = mmio_atsd_reg->reg;
+
+ __raw_writeq_be(va, npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA);
+ eieio();
+ __raw_writeq_be(launch, npu->mmio_atsd_regs[reg]);
+}
+
+static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
+ unsigned long pid, bool flush)
+{
+ int i;
+ unsigned long launch;
+
+ for (i = 0; i <= max_npu2_index; i++) {
+ if (mmio_atsd_reg[i].reg < 0)
+ continue;
+
+ /* IS set to invalidate matching PID */
+ launch = PPC_BIT(12);
+
+ /* PRS set to process-scoped */
+ launch |= PPC_BIT(13);
+
+ /* AP */
+ launch |= (u64)
+ mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
+
+ /* PID */
+ launch |= pid << PPC_BITLSHIFT(38);
+
+ /* No flush */
+ launch |= !flush << PPC_BITLSHIFT(39);
+
+ /* Invalidating the entire process doesn't use a va */
+ mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0);
+ }
+}
+
+static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
+ unsigned long va, unsigned long pid, bool flush)
+{
+ int i;
+ unsigned long launch;
+
+ for (i = 0; i <= max_npu2_index; i++) {
+ if (mmio_atsd_reg[i].reg < 0)
+ continue;
+
+ /* IS set to invalidate target VA */
+ launch = 0;
+
+ /* PRS set to process scoped */
+ launch |= PPC_BIT(13);
+
+ /* AP */
+ launch |= (u64)
+ mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
+
+ /* PID */
+ launch |= pid << PPC_BITLSHIFT(38);
+
+ /* No flush */
+ launch |= !flush << PPC_BITLSHIFT(39);
+
+ mmio_launch_invalidate(&mmio_atsd_reg[i], launch, va);
+ }
+}
+
+#define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
+
+static void mmio_invalidate_wait(
+ struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
+{
+ struct npu *npu;
+ int i, reg;
+
+ /* Wait for all invalidations to complete */
+ for (i = 0; i <= max_npu2_index; i++) {
+ if (mmio_atsd_reg[i].reg < 0)
+ continue;
+
+ /* Wait for completion */
+ npu = mmio_atsd_reg[i].npu;
+ reg = mmio_atsd_reg[i].reg;
+ while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
+ cpu_relax();
+ }
+}
+
+/*
+ * Acquires all the address translation shootdown (ATSD) registers required to
+ * launch an ATSD on all links this npu_context is active on.
+ */
+static void acquire_atsd_reg(struct npu_context *npu_context,
+ struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
+{
+ int i, j;
+ struct npu *npu;
+ struct pci_dev *npdev;
+ struct pnv_phb *nphb;
+
+ for (i = 0; i <= max_npu2_index; i++) {
+ mmio_atsd_reg[i].reg = -1;
+ for (j = 0; j < NV_MAX_LINKS; j++) {
+ /*
+ * There are no ordering requirements with respect to
+ * the setup of struct npu_context, but to ensure
+ * consistent behaviour we need to ensure npdev[][] is
+ * only read once.
+ */
+ npdev = READ_ONCE(npu_context->npdev[i][j]);
+ if (!npdev)
+ continue;
+
+ nphb = pci_bus_to_host(npdev->bus)->private_data;
+ npu = &nphb->npu;
+ mmio_atsd_reg[i].npu = npu;
+ mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
+ while (mmio_atsd_reg[i].reg < 0) {
+ mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
+ cpu_relax();
+ }
+ break;
+ }
+ }
+}
+
+/*
+ * Release previously acquired ATSD registers. To avoid deadlocks the registers
+ * must be released in the same order they were acquired above in
+ * acquire_atsd_reg.
+ */
+static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
+{
+ int i;
+
+ for (i = 0; i <= max_npu2_index; i++) {
+ /*
+ * We can't rely on npu_context->npdev[][] being the same here
+ * as when acquire_atsd_reg() was called, hence we use the
+ * values stored in mmio_atsd_reg during the acquire phase
+ * rather than re-reading npdev[][].
+ */
+ if (mmio_atsd_reg[i].reg < 0)
+ continue;
+
+ put_mmio_atsd_reg(mmio_atsd_reg[i].npu, mmio_atsd_reg[i].reg);
+ }
+}
+
+/*
+ * Invalidate either a single address or an entire PID depending on
+ * the value of va.
+ */
+static void mmio_invalidate(struct npu_context *npu_context, int va,
+ unsigned long address, bool flush)
+{
+ struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
+ unsigned long pid = npu_context->mm->context.id;
+
+ if (npu_context->nmmu_flush)
+ /*
+ * Unfortunately the nest mmu does not support flushing specific
+ * addresses so we have to flush the whole mm once before
+ * shooting down the GPU translation.
+ */
+ flush_all_mm(npu_context->mm);
+
+ /*
+ * Loop over all the NPUs this process is active on and launch
+ * an invalidate.
+ */
+ acquire_atsd_reg(npu_context, mmio_atsd_reg);
+ if (va)
+ mmio_invalidate_va(mmio_atsd_reg, address, pid, flush);
+ else
+ mmio_invalidate_pid(mmio_atsd_reg, pid, flush);
+
+ mmio_invalidate_wait(mmio_atsd_reg);
+ if (flush) {
+ /*
+ * The GPU requires two flush ATSDs to ensure all entries have
+ * been flushed. We use PID 0 as it will never be used for a
+ * process on the GPU.
+ */
+ mmio_invalidate_pid(mmio_atsd_reg, 0, true);
+ mmio_invalidate_wait(mmio_atsd_reg);
+ mmio_invalidate_pid(mmio_atsd_reg, 0, true);
+ mmio_invalidate_wait(mmio_atsd_reg);
+ }
+ release_atsd_reg(mmio_atsd_reg);
+}
+
+static void pnv_npu2_mn_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct npu_context *npu_context = mn_to_npu_context(mn);
+
+ /* Call into device driver to stop requests to the NMMU */
+ if (npu_context->release_cb)
+ npu_context->release_cb(npu_context, npu_context->priv);
+
+ /*
+ * There should be no more translation requests for this PID, but we
+ * need to ensure any entries for it are removed from the TLB.
+ */
+ mmio_invalidate(npu_context, 0, 0, true);
+}
+
+static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address,
+ pte_t pte)
+{
+ struct npu_context *npu_context = mn_to_npu_context(mn);
+
+ mmio_invalidate(npu_context, 1, address, true);
+}
+
+static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct npu_context *npu_context = mn_to_npu_context(mn);
+ unsigned long address;
+
+ if (end - start > atsd_threshold) {
+ /*
+ * Just invalidate the entire PID if the address range is too
+ * large.
+ */
+ mmio_invalidate(npu_context, 0, 0, true);
+ } else {
+ for (address = start; address < end; address += PAGE_SIZE)
+ mmio_invalidate(npu_context, 1, address, false);
+
+		/* Do the flush only on the final address == end */
+ mmio_invalidate(npu_context, 1, address, true);
+ }
+}
+
+static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
+ .release = pnv_npu2_mn_release,
+ .change_pte = pnv_npu2_mn_change_pte,
+ .invalidate_range = pnv_npu2_mn_invalidate_range,
+};
+
+/*
+ * Call into OPAL to setup the nmmu context for the current task in
+ * the NPU. This must be called to setup the context tables before the
+ * GPU issues ATRs. gpdev should be a pointer to the PCIe GPU device.
+ *
+ * A release callback should be registered to allow a device driver to
+ * be notified that it should not launch any new translation requests
+ * as the final TLB invalidate is about to occur.
+ *
+ * Returns an error if no contexts are currently available, or an
+ * npu_context which should be passed to pnv_npu2_handle_fault().
+ *
+ * mmap_sem must be held in write mode, and this function must not be
+ * called from interrupt context.
+ */
+struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
+ unsigned long flags,
+ void (*cb)(struct npu_context *, void *),
+ void *priv)
+{
+ int rc;
+ u32 nvlink_index;
+ struct device_node *nvlink_dn;
+ struct mm_struct *mm = current->mm;
+ struct pnv_phb *nphb;
+ struct npu *npu;
+ struct npu_context *npu_context;
+
+ /*
+ * At present we don't support GPUs connected to multiple NPUs and I'm
+ * not sure the hardware does either.
+ */
+ struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+
+ if (!firmware_has_feature(FW_FEATURE_OPAL))
+ return ERR_PTR(-ENODEV);
+
+ if (!npdev)
+ /* No nvlink associated with this GPU device */
+ return ERR_PTR(-ENODEV);
+
+ nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
+ if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
+ &nvlink_index)))
+ return ERR_PTR(-ENODEV);
+
+ if (!mm || mm->context.id == 0) {
+ /*
+ * Kernel thread contexts are not supported and context id 0 is
+ * reserved on the GPU.
+ */
+ return ERR_PTR(-EINVAL);
+ }
+
+ nphb = pci_bus_to_host(npdev->bus)->private_data;
+ npu = &nphb->npu;
+
+ /*
+ * Setup the NPU context table for a particular GPU. These need to be
+ * per-GPU as we need the tables to filter ATSDs when there are no
+ * active contexts on a particular GPU. It is safe for these to be
+ * called concurrently with destroy as the OPAL call takes appropriate
+ * locks and refcounts on init/destroy.
+ */
+ rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
+ PCI_DEVID(gpdev->bus->number, gpdev->devfn));
+ if (rc < 0)
+ return ERR_PTR(-ENOSPC);
+
+ /*
+ * We store the npu pci device so we can more easily get at the
+ * associated npus.
+ */
+ spin_lock(&npu_context_lock);
+ npu_context = mm->context.npu_context;
+ if (npu_context) {
+ if (npu_context->release_cb != cb ||
+ npu_context->priv != priv) {
+ spin_unlock(&npu_context_lock);
+ opal_npu_destroy_context(nphb->opal_id, mm->context.id,
+ PCI_DEVID(gpdev->bus->number,
+ gpdev->devfn));
+ return ERR_PTR(-EINVAL);
+ }
+
+ WARN_ON(!kref_get_unless_zero(&npu_context->kref));
+ }
+ spin_unlock(&npu_context_lock);
+
+ if (!npu_context) {
+ /*
+ * We can set up these fields without holding the
+ * npu_context_lock as the npu_context hasn't been returned to
+ * the caller meaning it can't be destroyed. Parallel allocation
+ * is protected against by mmap_sem.
+ */
+ rc = -ENOMEM;
+ npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
+ if (npu_context) {
+ kref_init(&npu_context->kref);
+ npu_context->mm = mm;
+ npu_context->mn.ops = &nv_nmmu_notifier_ops;
+ rc = __mmu_notifier_register(&npu_context->mn, mm);
+ }
+
+ if (rc) {
+ kfree(npu_context);
+ opal_npu_destroy_context(nphb->opal_id, mm->context.id,
+ PCI_DEVID(gpdev->bus->number,
+ gpdev->devfn));
+ return ERR_PTR(rc);
+ }
+
+ mm->context.npu_context = npu_context;
+ }
+
+ npu_context->release_cb = cb;
+ npu_context->priv = priv;
+
+ /*
+ * npdev is a pci_dev pointer setup by the PCI code. We assign it to
+ * npdev[][] to indicate to the mmu notifiers that an invalidation
+ * should also be sent over this nvlink. The notifiers don't use any
+ * other fields in npu_context, so we just need to ensure that when they
+	 * dereference npu_context->npdev[][] it is either a valid pointer or
+ * NULL.
+ */
+ WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev);
+
+ if (!nphb->npu.nmmu_flush) {
+ /*
+ * If we're not explicitly flushing ourselves we need to mark
+ * the thread for global flushes
+ */
+ npu_context->nmmu_flush = false;
+ mm_context_add_copro(mm);
+ } else
+ npu_context->nmmu_flush = true;
+
+ return npu_context;
+}
+EXPORT_SYMBOL(pnv_npu2_init_context);
+
+static void pnv_npu2_release_context(struct kref *kref)
+{
+ struct npu_context *npu_context =
+ container_of(kref, struct npu_context, kref);
+
+ if (!npu_context->nmmu_flush)
+ mm_context_remove_copro(npu_context->mm);
+
+ npu_context->mm->context.npu_context = NULL;
+}
+
+/*
+ * Destroy a context on the given GPU. May free the npu_context if it is no
+ * longer active on any GPUs. Must not be called from interrupt context.
+ */
+void pnv_npu2_destroy_context(struct npu_context *npu_context,
+ struct pci_dev *gpdev)
+{
+ int removed;
+ struct pnv_phb *nphb;
+ struct npu *npu;
+ struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+ struct device_node *nvlink_dn;
+ u32 nvlink_index;
+
+ if (WARN_ON(!npdev))
+ return;
+
+ if (!firmware_has_feature(FW_FEATURE_OPAL))
+ return;
+
+ nphb = pci_bus_to_host(npdev->bus)->private_data;
+ npu = &nphb->npu;
+ nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
+ if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
+ &nvlink_index)))
+ return;
+ WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
+ opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id,
+ PCI_DEVID(gpdev->bus->number, gpdev->devfn));
+ spin_lock(&npu_context_lock);
+ removed = kref_put(&npu_context->kref, pnv_npu2_release_context);
+ spin_unlock(&npu_context_lock);
+
+ /*
+ * We need to do this outside of pnv_npu2_release_context so that it is
+ * outside the spinlock as mmu_notifier_destroy uses SRCU.
+ */
+ if (removed) {
+ mmu_notifier_unregister(&npu_context->mn,
+ npu_context->mm);
+
+ kfree(npu_context);
+ }
+
+}
+EXPORT_SYMBOL(pnv_npu2_destroy_context);
+
+/*
+ * Assumes mmap_sem is held for the context's associated mm.
+ */
+int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
+ unsigned long *flags, unsigned long *status, int count)
+{
+ u64 rc = 0, result = 0;
+ int i, is_write;
+ struct page *page[1];
+
+	/* mmap_sem should be held so the mm_struct must be present */
+ struct mm_struct *mm = context->mm;
+
+ if (!firmware_has_feature(FW_FEATURE_OPAL))
+ return -ENODEV;
+
+ WARN_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+ for (i = 0; i < count; i++) {
+ is_write = flags[i] & NPU2_WRITE;
+ rc = get_user_pages_remote(NULL, mm, ea[i], 1,
+ is_write ? FOLL_WRITE : 0,
+ page, NULL, NULL);
+
+ /*
+ * To support virtualised environments we will have to do an
+ * access to the page to ensure it gets faulted into the
+ * hypervisor. For the moment virtualisation is not supported in
+ * other areas so leave the access out.
+ */
+ if (rc != 1) {
+ status[i] = rc;
+ result = -EFAULT;
+ continue;
+ }
+
+ status[i] = 0;
+ put_page(page[0]);
+ }
+
+ return result;
+}
+EXPORT_SYMBOL(pnv_npu2_handle_fault);
+
+int pnv_npu2_init(struct pnv_phb *phb)
+{
+ unsigned int i;
+ u64 mmio_atsd;
+ struct device_node *dn;
+ struct pci_dev *gpdev;
+ static int npu_index;
+ uint64_t rc = 0;
+
+ if (!atsd_threshold_dentry) {
+ atsd_threshold_dentry = debugfs_create_x64("atsd_threshold",
+ 0600, powerpc_debugfs_root, &atsd_threshold);
+ }
+
+ phb->npu.nmmu_flush =
+ of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush");
+ for_each_child_of_node(phb->hose->dn, dn) {
+ gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn));
+ if (gpdev) {
+ rc = opal_npu_map_lpar(phb->opal_id,
+ PCI_DEVID(gpdev->bus->number, gpdev->devfn),
+ 0, 0);
+ if (rc)
+ dev_err(&gpdev->dev,
+ "Error %lld mapping device to LPAR\n",
+ rc);
+ }
+ }
+
+ for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd",
+ i, &mmio_atsd); i++)
+ phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);
+
+ pr_info("NPU%lld: Found %d MMIO ATSD registers", phb->opal_id, i);
+ phb->npu.mmio_atsd_count = i;
+ phb->npu.mmio_atsd_usage = 0;
+ npu_index++;
+ if (WARN_ON(npu_index >= NV_MAX_NPUS))
+ return -ENOSPC;
+ max_npu2_index = npu_index;
+ phb->npu.index = npu_index;
+
+ return 0;
+}
diff --git a/arch/powerpc/platforms/powernv/ocxl.c b/arch/powerpc/platforms/powernv/ocxl.c
new file mode 100644
index 000000000..8c65aacda
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/ocxl.c
@@ -0,0 +1,515 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright 2017 IBM Corp.
+#include <asm/pnv-ocxl.h>
+#include <asm/opal.h>
+#include <asm/xive.h>
+#include <misc/ocxl-config.h>
+#include "pci.h"
+
+#define PNV_OCXL_TL_P9_RECV_CAP 0x000000000000000Full
+#define PNV_OCXL_ACTAG_MAX 64
+/* PASIDs are 20-bit, but on P9, NPU can only handle 15 bits */
+#define PNV_OCXL_PASID_BITS 15
+#define PNV_OCXL_PASID_MAX ((1 << PNV_OCXL_PASID_BITS) - 1)
+
+#define AFU_PRESENT (1 << 31)
+#define AFU_INDEX_MASK 0x3F000000
+#define AFU_INDEX_SHIFT 24
+#define ACTAG_MASK 0xFFF
+
+
+struct actag_range {
+ u16 start;
+ u16 count;
+};
+
+struct npu_link {
+ struct list_head list;
+ int domain;
+ int bus;
+ int dev;
+ u16 fn_desired_actags[8];
+ struct actag_range fn_actags[8];
+ bool assignment_done;
+};
+static struct list_head links_list = LIST_HEAD_INIT(links_list);
+static DEFINE_MUTEX(links_list_lock);
+
+
+/*
+ * opencapi actags handling:
+ *
+ * When sending commands, the opencapi device references the memory
+ * context it's targeting with an 'actag', which is really an alias
+ * for a (BDF, pasid) combination. When it receives a command, the NPU
+ * must do a lookup of the actag to identify the memory context. The
+ * hardware supports a finite number of actags per link (64 for
+ * POWER9).
+ *
+ * The device can carry multiple functions, and each function can have
+ * multiple AFUs. Each AFU advertises in its config space the number
+ * of desired actags. The host must configure in the config space of
+ * the AFU how many actags the AFU is really allowed to use (which can
+ * be less than what the AFU desires).
+ *
+ * When a PCI function is probed by the driver, it has no visibility
+ * about the other PCI functions and how many actags they'd like,
+ * which makes it impossible to distribute actags fairly among AFUs.
+ *
+ * Unfortunately, the only way to know how many actags a function
+ * desires is by looking at the data for each AFU in the config space
+ * and adding them up. Similarly, the only way to know how many actags
+ * all the functions of the physical device desire is by adding the
+ * previously computed function counts. Then we can match that against
+ * what the hardware supports.
+ *
+ * To get a comprehensive view, we use a 'pci fixup': at the end of
+ * PCI enumeration, each function counts how many actags its AFUs
+ * desire and we save it in an 'npu_link' structure, shared between all
+ * the PCI functions of the same device. Therefore, when the first
+ * function is probed by the driver, we can get an idea of the total
+ * count of desired actags for the device, and assign the actags to
+ * the AFUs, by pro-rating if needed.
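+ *
+ * For example, if two functions desire 48 and 32 actags (80 in total,
+ * which is more than the 64 the link provides), the pro-rating in
+ * assign_fn_actags() below gives them 64 * 48 / 80 = 38 and
+ * 64 * 32 / 80 = 25 actags respectively.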
+ */
+
+static int find_dvsec_from_pos(struct pci_dev *dev, int dvsec_id, int pos)
+{
+ int vsec = pos;
+ u16 vendor, id;
+
+ while ((vsec = pci_find_next_ext_capability(dev, vsec,
+ OCXL_EXT_CAP_ID_DVSEC))) {
+ pci_read_config_word(dev, vsec + OCXL_DVSEC_VENDOR_OFFSET,
+ &vendor);
+ pci_read_config_word(dev, vsec + OCXL_DVSEC_ID_OFFSET, &id);
+ if (vendor == PCI_VENDOR_ID_IBM && id == dvsec_id)
+ return vsec;
+ }
+ return 0;
+}
+
+static int find_dvsec_afu_ctrl(struct pci_dev *dev, u8 afu_idx)
+{
+ int vsec = 0;
+ u8 idx;
+
+ while ((vsec = find_dvsec_from_pos(dev, OCXL_DVSEC_AFU_CTRL_ID,
+ vsec))) {
+ pci_read_config_byte(dev, vsec + OCXL_DVSEC_AFU_CTRL_AFU_IDX,
+ &idx);
+ if (idx == afu_idx)
+ return vsec;
+ }
+ return 0;
+}
+
+static int get_max_afu_index(struct pci_dev *dev, int *afu_idx)
+{
+ int pos;
+ u32 val;
+
+ pos = find_dvsec_from_pos(dev, OCXL_DVSEC_FUNC_ID, 0);
+ if (!pos)
+ return -ESRCH;
+
+ pci_read_config_dword(dev, pos + OCXL_DVSEC_FUNC_OFF_INDEX, &val);
+ if (val & AFU_PRESENT)
+ *afu_idx = (val & AFU_INDEX_MASK) >> AFU_INDEX_SHIFT;
+ else
+ *afu_idx = -1;
+ return 0;
+}
+
+static int get_actag_count(struct pci_dev *dev, int afu_idx, int *actag)
+{
+ int pos;
+ u16 actag_sup;
+
+ pos = find_dvsec_afu_ctrl(dev, afu_idx);
+ if (!pos)
+ return -ESRCH;
+
+ pci_read_config_word(dev, pos + OCXL_DVSEC_AFU_CTRL_ACTAG_SUP,
+ &actag_sup);
+ *actag = actag_sup & ACTAG_MASK;
+ return 0;
+}
+
+static struct npu_link *find_link(struct pci_dev *dev)
+{
+ struct npu_link *link;
+
+ list_for_each_entry(link, &links_list, list) {
+ /* The functions of a device all share the same link */
+ if (link->domain == pci_domain_nr(dev->bus) &&
+ link->bus == dev->bus->number &&
+ link->dev == PCI_SLOT(dev->devfn)) {
+ return link;
+ }
+ }
+
+ /* link doesn't exist yet. Allocate one */
+ link = kzalloc(sizeof(struct npu_link), GFP_KERNEL);
+ if (!link)
+ return NULL;
+ link->domain = pci_domain_nr(dev->bus);
+ link->bus = dev->bus->number;
+ link->dev = PCI_SLOT(dev->devfn);
+ list_add(&link->list, &links_list);
+ return link;
+}
+
+static void pnv_ocxl_fixup_actag(struct pci_dev *dev)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct npu_link *link;
+ int rc, afu_idx = -1, i, actag;
+
+ if (!machine_is(powernv))
+ return;
+
+ if (phb->type != PNV_PHB_NPU_OCAPI)
+ return;
+
+ mutex_lock(&links_list_lock);
+
+ link = find_link(dev);
+ if (!link) {
+ dev_warn(&dev->dev, "couldn't update actag information\n");
+ mutex_unlock(&links_list_lock);
+ return;
+ }
+
+ /*
+ * Check how many actags are desired for the AFUs under that
+ * function and add it to the count for the link
+ */
+ rc = get_max_afu_index(dev, &afu_idx);
+ if (rc) {
+ /* Most likely an invalid config space */
+ dev_dbg(&dev->dev, "couldn't find AFU information\n");
+ afu_idx = -1;
+ }
+
+ link->fn_desired_actags[PCI_FUNC(dev->devfn)] = 0;
+ for (i = 0; i <= afu_idx; i++) {
+ /*
+ * AFU index 'holes' are allowed. So don't fail if we
+ * can't read the actag info for an index
+ */
+ rc = get_actag_count(dev, i, &actag);
+ if (rc)
+ continue;
+ link->fn_desired_actags[PCI_FUNC(dev->devfn)] += actag;
+ }
+ dev_dbg(&dev->dev, "total actags for function: %d\n",
+ link->fn_desired_actags[PCI_FUNC(dev->devfn)]);
+
+ mutex_unlock(&links_list_lock);
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pnv_ocxl_fixup_actag);
+
+static u16 assign_fn_actags(u16 desired, u16 total)
+{
+ u16 count;
+
+ if (total <= PNV_OCXL_ACTAG_MAX)
+ count = desired;
+ else
+ count = PNV_OCXL_ACTAG_MAX * desired / total;
+
+ return count;
+}
+
+static void assign_actags(struct npu_link *link)
+{
+ u16 actag_count, range_start = 0, total_desired = 0;
+ int i;
+
+ for (i = 0; i < 8; i++)
+ total_desired += link->fn_desired_actags[i];
+
+ for (i = 0; i < 8; i++) {
+ if (link->fn_desired_actags[i]) {
+ actag_count = assign_fn_actags(
+ link->fn_desired_actags[i],
+ total_desired);
+ link->fn_actags[i].start = range_start;
+ link->fn_actags[i].count = actag_count;
+ range_start += actag_count;
+ WARN_ON(range_start >= PNV_OCXL_ACTAG_MAX);
+ }
+ pr_debug("link %x:%x:%x fct %d actags: start=%d count=%d (desired=%d)\n",
+ link->domain, link->bus, link->dev, i,
+ link->fn_actags[i].start, link->fn_actags[i].count,
+ link->fn_desired_actags[i]);
+ }
+ link->assignment_done = true;
+}
+
+int pnv_ocxl_get_actag(struct pci_dev *dev, u16 *base, u16 *enabled,
+ u16 *supported)
+{
+ struct npu_link *link;
+
+ mutex_lock(&links_list_lock);
+
+ link = find_link(dev);
+ if (!link) {
+ dev_err(&dev->dev, "actag information not found\n");
+ mutex_unlock(&links_list_lock);
+ return -ENODEV;
+ }
+ /*
+ * On P9, we only have 64 actags per link, so they must be
+ * shared by all the functions of the same adapter. We counted
+ * the desired actags during PCI enumeration, so that we can
+ * allocate a pro-rated number of actags to each function.
+ */
+ if (!link->assignment_done)
+ assign_actags(link);
+
+ *base = link->fn_actags[PCI_FUNC(dev->devfn)].start;
+ *enabled = link->fn_actags[PCI_FUNC(dev->devfn)].count;
+ *supported = link->fn_desired_actags[PCI_FUNC(dev->devfn)];
+
+ mutex_unlock(&links_list_lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_get_actag);
+
+int pnv_ocxl_get_pasid_count(struct pci_dev *dev, int *count)
+{
+ struct npu_link *link;
+ int i, rc = -EINVAL;
+
+ /*
+ * The number of PASIDs (process address space IDs) which can
+ * be used by a function depends on how many functions exist
+ * on the device. The NPU needs to be configured to know how
+ * many bits are available to PASIDs and how many are to be
+ * used by the function BDF identifier.
+ *
+ * We only support one AFU-carrying function for now.
+ */
+ mutex_lock(&links_list_lock);
+
+ link = find_link(dev);
+ if (!link) {
+ dev_err(&dev->dev, "actag information not found\n");
+ mutex_unlock(&links_list_lock);
+ return -ENODEV;
+ }
+
+ for (i = 0; i < 8; i++)
+ if (link->fn_desired_actags[i] && (i == PCI_FUNC(dev->devfn))) {
+ *count = PNV_OCXL_PASID_MAX;
+ rc = 0;
+ break;
+ }
+
+ mutex_unlock(&links_list_lock);
+ dev_dbg(&dev->dev, "%d PASIDs available for function\n",
+ rc ? 0 : *count);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_get_pasid_count);
+
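+/*
+ * Each byte of the rate buffer packs the 4-bit receiving rates of two
+ * templates, laid out from the highest-numbered template in the first
+ * byte down to templates 1 and 0 in the last used byte.
+ */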
+static void set_templ_rate(unsigned int templ, unsigned int rate, char *buf)
+{
+ int shift, idx;
+
+ WARN_ON(templ > PNV_OCXL_TL_MAX_TEMPLATE);
+ idx = (PNV_OCXL_TL_MAX_TEMPLATE - templ) / 2;
+ shift = 4 * (1 - ((PNV_OCXL_TL_MAX_TEMPLATE - templ) % 2));
+ buf[idx] |= rate << shift;
+}
+
+int pnv_ocxl_get_tl_cap(struct pci_dev *dev, long *cap,
+ char *rate_buf, int rate_buf_size)
+{
+ if (rate_buf_size != PNV_OCXL_TL_RATE_BUF_SIZE)
+ return -EINVAL;
+ /*
+ * The TL capabilities are a characteristic of the NPU, so
+ * we go with hard-coded values.
+ *
+ * The receiving rate of each template is encoded on 4 bits.
+ *
+ * On P9:
+ * - templates 0 -> 3 are supported
+ * - templates 0, 1 and 3 have a 0 receiving rate
+ * - template 2 has receiving rate of 1 (extra cycle)
+ */
+ memset(rate_buf, 0, rate_buf_size);
+ set_templ_rate(2, 1, rate_buf);
+ *cap = PNV_OCXL_TL_P9_RECV_CAP;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_get_tl_cap);
+
+int pnv_ocxl_set_tl_conf(struct pci_dev *dev, long cap,
+ uint64_t rate_buf_phys, int rate_buf_size)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ int rc;
+
+ if (rate_buf_size != PNV_OCXL_TL_RATE_BUF_SIZE)
+ return -EINVAL;
+
+ rc = opal_npu_tl_set(phb->opal_id, dev->devfn, cap,
+ rate_buf_phys, rate_buf_size);
+ if (rc) {
+ dev_err(&dev->dev, "Can't configure host TL: %d\n", rc);
+ return -EINVAL;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_set_tl_conf);
+
+int pnv_ocxl_get_xsl_irq(struct pci_dev *dev, int *hwirq)
+{
+ int rc;
+
+ rc = of_property_read_u32(dev->dev.of_node, "ibm,opal-xsl-irq", hwirq);
+ if (rc) {
+ dev_err(&dev->dev,
+ "Can't get translation interrupt for device\n");
+ return rc;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_get_xsl_irq);
+
+void pnv_ocxl_unmap_xsl_regs(void __iomem *dsisr, void __iomem *dar,
+ void __iomem *tfc, void __iomem *pe_handle)
+{
+ iounmap(dsisr);
+ iounmap(dar);
+ iounmap(tfc);
+ iounmap(pe_handle);
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_unmap_xsl_regs);
+
+int pnv_ocxl_map_xsl_regs(struct pci_dev *dev, void __iomem **dsisr,
+ void __iomem **dar, void __iomem **tfc,
+ void __iomem **pe_handle)
+{
+ u64 reg;
+ int i, j, rc = 0;
+ void __iomem *regs[4];
+
+ /*
+ * OPAL stores the MMIO addresses of the DSISR, DAR, TFC and
+ * PE_HANDLE registers in a device tree property, in that
+ * order.
+ */
+ for (i = 0; i < 4; i++) {
+ rc = of_property_read_u64_index(dev->dev.of_node,
+ "ibm,opal-xsl-mmio", i, &reg);
+ if (rc)
+ break;
+ regs[i] = ioremap(reg, 8);
+ if (!regs[i]) {
+ rc = -EINVAL;
+ break;
+ }
+ }
+ if (rc) {
+ dev_err(&dev->dev, "Can't map translation mmio registers\n");
+ for (j = i - 1; j >= 0; j--)
+ iounmap(regs[j]);
+ } else {
+ *dsisr = regs[0];
+ *dar = regs[1];
+ *tfc = regs[2];
+ *pe_handle = regs[3];
+ }
+ return rc;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_map_xsl_regs);
+
+struct spa_data {
+ u64 phb_opal_id;
+ u32 bdfn;
+};
+
+int pnv_ocxl_spa_setup(struct pci_dev *dev, void *spa_mem, int PE_mask,
+ void **platform_data)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct spa_data *data;
+ u32 bdfn;
+ int rc;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ bdfn = (dev->bus->number << 8) | dev->devfn;
+ rc = opal_npu_spa_setup(phb->opal_id, bdfn, virt_to_phys(spa_mem),
+ PE_mask);
+ if (rc) {
+ dev_err(&dev->dev, "Can't setup Shared Process Area: %d\n", rc);
+ kfree(data);
+ return rc;
+ }
+ data->phb_opal_id = phb->opal_id;
+ data->bdfn = bdfn;
+ *platform_data = (void *) data;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_spa_setup);
+
+void pnv_ocxl_spa_release(void *platform_data)
+{
+ struct spa_data *data = (struct spa_data *) platform_data;
+ int rc;
+
+ rc = opal_npu_spa_setup(data->phb_opal_id, data->bdfn, 0, 0);
+ WARN_ON(rc);
+ kfree(data);
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_spa_release);
+
+int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle)
+{
+ struct spa_data *data = (struct spa_data *) platform_data;
+ int rc;
+
+ rc = opal_npu_spa_clear_cache(data->phb_opal_id, data->bdfn, pe_handle);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_spa_remove_pe_from_cache);
+
+int pnv_ocxl_alloc_xive_irq(u32 *irq, u64 *trigger_addr)
+{
+ __be64 flags, trigger_page;
+ s64 rc;
+ u32 hwirq;
+
+ hwirq = xive_native_alloc_irq();
+ if (!hwirq)
+ return -ENOENT;
+
+ rc = opal_xive_get_irq_info(hwirq, &flags, NULL, &trigger_page, NULL,
+ NULL);
+ if (rc || !trigger_page) {
+ xive_native_free_irq(hwirq);
+ return -ENOENT;
+ }
+ *irq = hwirq;
+ *trigger_addr = be64_to_cpu(trigger_page);
+ return 0;
+
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_alloc_xive_irq);
+
+void pnv_ocxl_free_xive_irq(u32 irq)
+{
+ xive_native_free_irq(irq);
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_free_xive_irq);
diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c
new file mode 100644
index 000000000..18a355fa1
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-async.c
@@ -0,0 +1,294 @@
+/*
+ * PowerNV OPAL asynchronous completion interfaces
+ *
+ * Copyright 2013-2017 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <linux/gfp.h>
+#include <linux/of.h>
+#include <asm/machdep.h>
+#include <asm/opal.h>
+
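+/*
+ * Token lifecycle: a token moves from UNALLOCATED to ALLOCATED when
+ * handed out by opal_async_get_token_interruptible(), to DISPATCHED
+ * the first time a caller waits on it interruptibly, to COMPLETED
+ * when OPAL's response arrives, and back to UNALLOCATED on release.
+ * A token released while still DISPATCHED is marked ABANDONED and is
+ * freed by the completion handler once the response arrives.
+ */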
+enum opal_async_token_state {
+ ASYNC_TOKEN_UNALLOCATED = 0,
+ ASYNC_TOKEN_ALLOCATED,
+ ASYNC_TOKEN_DISPATCHED,
+ ASYNC_TOKEN_ABANDONED,
+ ASYNC_TOKEN_COMPLETED
+};
+
+struct opal_async_token {
+ enum opal_async_token_state state;
+ struct opal_msg response;
+};
+
+static DECLARE_WAIT_QUEUE_HEAD(opal_async_wait);
+static DEFINE_SPINLOCK(opal_async_comp_lock);
+static struct semaphore opal_async_sem;
+static unsigned int opal_max_async_tokens;
+static struct opal_async_token *opal_async_tokens;
+
+static int __opal_async_get_token(void)
+{
+ unsigned long flags;
+ int i, token = -EBUSY;
+
+ spin_lock_irqsave(&opal_async_comp_lock, flags);
+
+ for (i = 0; i < opal_max_async_tokens; i++) {
+ if (opal_async_tokens[i].state == ASYNC_TOKEN_UNALLOCATED) {
+ opal_async_tokens[i].state = ASYNC_TOKEN_ALLOCATED;
+ token = i;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&opal_async_comp_lock, flags);
+ return token;
+}
+
+/*
+ * Note: If the returned token is used in an opal call and opal returns
+ * OPAL_ASYNC_COMPLETION, you MUST call one of opal_async_wait_response()
+ * or opal_async_wait_response_interruptible() at least once before
+ * calling another opal_async_* function.
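+ *
+ * A typical caller sequence (with opal_xyz() standing in for any
+ * async-capable OPAL call) looks like:
+ *
+ *     token = opal_async_get_token_interruptible();
+ *     rc = opal_xyz(..., token);
+ *     if (rc == OPAL_ASYNC_COMPLETION)
+ *             rc = opal_async_wait_response(token, &msg);
+ *     opal_async_release_token(token);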
+ */
+int opal_async_get_token_interruptible(void)
+{
+ int token;
+
+ /* Wait until a token is available */
+ if (down_interruptible(&opal_async_sem))
+ return -ERESTARTSYS;
+
+ token = __opal_async_get_token();
+ if (token < 0)
+ up(&opal_async_sem);
+
+ return token;
+}
+EXPORT_SYMBOL_GPL(opal_async_get_token_interruptible);
+
+static int __opal_async_release_token(int token)
+{
+ unsigned long flags;
+ int rc;
+
+ if (token < 0 || token >= opal_max_async_tokens) {
+ pr_err("%s: Passed token is out of range, token %d\n",
+ __func__, token);
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&opal_async_comp_lock, flags);
+ switch (opal_async_tokens[token].state) {
+ case ASYNC_TOKEN_COMPLETED:
+ case ASYNC_TOKEN_ALLOCATED:
+ opal_async_tokens[token].state = ASYNC_TOKEN_UNALLOCATED;
+ rc = 0;
+ break;
+ /*
+ * DISPATCHED and ABANDONED tokens must wait for OPAL to respond.
+ * Mark a DISPATCHED token as ABANDONED so that the response handling
+ * code knows no one cares and can free it once the response arrives.
+ */
+ case ASYNC_TOKEN_DISPATCHED:
+ opal_async_tokens[token].state = ASYNC_TOKEN_ABANDONED;
+ /* Fall through */
+ default:
+ rc = 1;
+ }
+ spin_unlock_irqrestore(&opal_async_comp_lock, flags);
+
+ return rc;
+}
+
+int opal_async_release_token(int token)
+{
+ int ret;
+
+ ret = __opal_async_release_token(token);
+ if (!ret)
+ up(&opal_async_sem);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(opal_async_release_token);
+
+int opal_async_wait_response(uint64_t token, struct opal_msg *msg)
+{
+ if (token >= opal_max_async_tokens) {
+ pr_err("%s: Invalid token passed\n", __func__);
+ return -EINVAL;
+ }
+
+ if (!msg) {
+ pr_err("%s: Invalid message pointer passed\n", __func__);
+ return -EINVAL;
+ }
+
+ /*
+ * There is no need to mark the token as dispatched; wait_event()
+ * will block until the token completes.
+ *
+ * Wake up the poller before we wait for events to speed things
+ * up on platforms or simulators where the interrupts aren't
+ * functional.
+ */
+ opal_wake_poller();
+ wait_event(opal_async_wait, opal_async_tokens[token].state
+ == ASYNC_TOKEN_COMPLETED);
+ memcpy(msg, &opal_async_tokens[token].response, sizeof(*msg));
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(opal_async_wait_response);
+
+int opal_async_wait_response_interruptible(uint64_t token, struct opal_msg *msg)
+{
+ unsigned long flags;
+ int ret;
+
+ if (token >= opal_max_async_tokens) {
+ pr_err("%s: Invalid token passed\n", __func__);
+ return -EINVAL;
+ }
+
+ if (!msg) {
+ pr_err("%s: Invalid message pointer passed\n", __func__);
+ return -EINVAL;
+ }
+
+ /*
+ * The first time this gets called we mark the token as DISPATCHED
+ * so that if wait_event_interruptible() returns not zero and the
+ * caller frees the token, we know not to actually free the token
+ * until the response comes.
+ *
+ * Only change if the token is ALLOCATED - it may have been
+ * completed even before the caller gets around to calling this
+ * the first time.
+ *
+ * There is also a dirty great comment at the token allocation
+ * function that if the opal call returns OPAL_ASYNC_COMPLETION to
+ * the caller, then the caller *must* call this or the
+ * non-interruptible version before doing anything else with the
+ * token.
+ */
+ if (opal_async_tokens[token].state == ASYNC_TOKEN_ALLOCATED) {
+ spin_lock_irqsave(&opal_async_comp_lock, flags);
+ if (opal_async_tokens[token].state == ASYNC_TOKEN_ALLOCATED)
+ opal_async_tokens[token].state = ASYNC_TOKEN_DISPATCHED;
+ spin_unlock_irqrestore(&opal_async_comp_lock, flags);
+ }
+
+ /*
+ * Wake up the poller before we wait for events to speed things
+ * up on platforms or simulators where the interrupts aren't
+ * functional.
+ */
+ opal_wake_poller();
+ ret = wait_event_interruptible(opal_async_wait,
+ opal_async_tokens[token].state ==
+ ASYNC_TOKEN_COMPLETED);
+ if (!ret)
+ memcpy(msg, &opal_async_tokens[token].response, sizeof(*msg));
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(opal_async_wait_response_interruptible);
+
+/* Called from interrupt context */
+static int opal_async_comp_event(struct notifier_block *nb,
+ unsigned long msg_type, void *msg)
+{
+ struct opal_msg *comp_msg = msg;
+ enum opal_async_token_state state;
+ unsigned long flags;
+ uint64_t token;
+
+ if (msg_type != OPAL_MSG_ASYNC_COMP)
+ return 0;
+
+ token = be64_to_cpu(comp_msg->params[0]);
+ spin_lock_irqsave(&opal_async_comp_lock, flags);
+ state = opal_async_tokens[token].state;
+ opal_async_tokens[token].state = ASYNC_TOKEN_COMPLETED;
+ spin_unlock_irqrestore(&opal_async_comp_lock, flags);
+
+ if (state == ASYNC_TOKEN_ABANDONED) {
+ /* Free the token, no one else will */
+ opal_async_release_token(token);
+ return 0;
+ }
+ memcpy(&opal_async_tokens[token].response, comp_msg, sizeof(*comp_msg));
+ wake_up(&opal_async_wait);
+
+ return 0;
+}
+
+static struct notifier_block opal_async_comp_nb = {
+ .notifier_call = opal_async_comp_event,
+ .next = NULL,
+ .priority = 0,
+};
+
+int __init opal_async_comp_init(void)
+{
+ struct device_node *opal_node;
+ const __be32 *async;
+ int err;
+
+ opal_node = of_find_node_by_path("/ibm,opal");
+ if (!opal_node) {
+ pr_err("%s: Opal node not found\n", __func__);
+ err = -ENOENT;
+ goto out;
+ }
+
+ async = of_get_property(opal_node, "opal-msg-async-num", NULL);
+ if (!async) {
+ pr_err("%s: %pOF has no opal-msg-async-num\n",
+ __func__, opal_node);
+ err = -ENOENT;
+ goto out_opal_node;
+ }
+
+ opal_max_async_tokens = be32_to_cpup(async);
+ opal_async_tokens = kcalloc(opal_max_async_tokens,
+ sizeof(*opal_async_tokens), GFP_KERNEL);
+ if (!opal_async_tokens) {
+ err = -ENOMEM;
+ goto out_opal_node;
+ }
+
+ err = opal_message_notifier_register(OPAL_MSG_ASYNC_COMP,
+ &opal_async_comp_nb);
+ if (err) {
+ pr_err("%s: Can't register OPAL event notifier (%d)\n",
+ __func__, err);
+ kfree(opal_async_tokens);
+ goto out_opal_node;
+ }
+
+ sema_init(&opal_async_sem, opal_max_async_tokens);
+
+out_opal_node:
+ of_node_put(opal_node);
+out:
+ return err;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c
new file mode 100644
index 000000000..1dc2122a3
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-dump.c
@@ -0,0 +1,457 @@
+/*
+ * PowerNV OPAL Dump Interface
+ *
+ * Copyright 2013,2014 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+
+#include <asm/opal.h>
+
+#define DUMP_TYPE_FSP 0x01
+
+struct dump_obj {
+ struct kobject kobj;
+ struct bin_attribute dump_attr;
+ uint32_t id; /* becomes object name */
+ uint32_t type;
+ uint32_t size;
+ char *buffer;
+};
+#define to_dump_obj(x) container_of(x, struct dump_obj, kobj)
+
+struct dump_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct dump_obj *dump, struct dump_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct dump_obj *dump, struct dump_attribute *attr,
+ const char *buf, size_t count);
+};
+#define to_dump_attr(x) container_of(x, struct dump_attribute, attr)
+
+static ssize_t dump_id_show(struct dump_obj *dump_obj,
+ struct dump_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "0x%x\n", dump_obj->id);
+}
+
+static const char *dump_type_to_string(uint32_t type)
+{
+ switch (type) {
+ case 0x01: return "SP Dump";
+ case 0x02: return "System/Platform Dump";
+ case 0x03: return "SMA Dump";
+ default: return "unknown";
+ }
+}
+
+static ssize_t dump_type_show(struct dump_obj *dump_obj,
+ struct dump_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "0x%x %s\n", dump_obj->type,
+ dump_type_to_string(dump_obj->type));
+}
+
+static ssize_t dump_ack_show(struct dump_obj *dump_obj,
+ struct dump_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "ack - acknowledge dump\n");
+}
+
+/*
+ * Send acknowledgement to OPAL
+ */
+static int64_t dump_send_ack(uint32_t dump_id)
+{
+ int rc;
+
+ rc = opal_dump_ack(dump_id);
+ if (rc)
+ pr_warn("%s: Failed to send ack to Dump ID 0x%x (%d)\n",
+ __func__, dump_id, rc);
+ return rc;
+}
+
+static ssize_t dump_ack_store(struct dump_obj *dump_obj,
+ struct dump_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ dump_send_ack(dump_obj->id);
+ sysfs_remove_file_self(&dump_obj->kobj, &attr->attr);
+ kobject_put(&dump_obj->kobj);
+ return count;
+}
+
+/* Attributes of a dump
+ * The binary attribute of the dump itself is dynamic
+ * due to the dynamic size of the dump
+ */
+static struct dump_attribute id_attribute =
+ __ATTR(id, 0444, dump_id_show, NULL);
+static struct dump_attribute type_attribute =
+ __ATTR(type, 0444, dump_type_show, NULL);
+static struct dump_attribute ack_attribute =
+ __ATTR(acknowledge, 0660, dump_ack_show, dump_ack_store);
+
+static ssize_t init_dump_show(struct dump_obj *dump_obj,
+ struct dump_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "1 - initiate Service Processor (FSP) dump\n");
+}
+
+static int64_t dump_fips_init(uint8_t type)
+{
+ int rc;
+
+ rc = opal_dump_init(type);
+ if (rc)
+ pr_warn("%s: Failed to initiate FSP dump (%d)\n",
+ __func__, rc);
+ return rc;
+}
+
+static ssize_t init_dump_store(struct dump_obj *dump_obj,
+ struct dump_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ int rc;
+
+ rc = dump_fips_init(DUMP_TYPE_FSP);
+ if (rc == OPAL_SUCCESS)
+ pr_info("%s: Initiated FSP dump\n", __func__);
+
+ return count;
+}
+
+static struct dump_attribute initiate_attribute =
+ __ATTR(initiate_dump, 0600, init_dump_show, init_dump_store);
+
+static struct attribute *initiate_attrs[] = {
+ &initiate_attribute.attr,
+ NULL,
+};
+
+static struct attribute_group initiate_attr_group = {
+ .attrs = initiate_attrs,
+};
+
+static struct kset *dump_kset;
+
+static ssize_t dump_attr_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct dump_attribute *attribute;
+ struct dump_obj *dump;
+
+ attribute = to_dump_attr(attr);
+ dump = to_dump_obj(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ return attribute->show(dump, attribute, buf);
+}
+
+static ssize_t dump_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct dump_attribute *attribute;
+ struct dump_obj *dump;
+
+ attribute = to_dump_attr(attr);
+ dump = to_dump_obj(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ return attribute->store(dump, attribute, buf, len);
+}
+
+static const struct sysfs_ops dump_sysfs_ops = {
+ .show = dump_attr_show,
+ .store = dump_attr_store,
+};
+
+static void dump_release(struct kobject *kobj)
+{
+ struct dump_obj *dump;
+
+ dump = to_dump_obj(kobj);
+ vfree(dump->buffer);
+ kfree(dump);
+}
+
+static struct attribute *dump_default_attrs[] = {
+ &id_attribute.attr,
+ &type_attribute.attr,
+ &ack_attribute.attr,
+ NULL,
+};
+
+static struct kobj_type dump_ktype = {
+ .sysfs_ops = &dump_sysfs_ops,
+ .release = &dump_release,
+ .default_attrs = dump_default_attrs,
+};
+
+static int64_t dump_read_info(uint32_t *dump_id, uint32_t *dump_size, uint32_t *dump_type)
+{
+ __be32 id, size, type;
+ int rc;
+
+ type = cpu_to_be32(0xffffffff);
+
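+ /* Try the newer call first; fall back if firmware doesn't implement it */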
+ rc = opal_dump_info2(&id, &size, &type);
+ if (rc == OPAL_PARAMETER)
+ rc = opal_dump_info(&id, &size);
+
+ if (rc) {
+ pr_warn("%s: Failed to get dump info (%d)\n",
+ __func__, rc);
+ return rc;
+ }
+
+ *dump_id = be32_to_cpu(id);
+ *dump_size = be32_to_cpu(size);
+ *dump_type = be32_to_cpu(type);
+
+ return rc;
+}
+
+static int64_t dump_read_data(struct dump_obj *dump)
+{
+ struct opal_sg_list *list;
+ uint64_t addr;
+ int64_t rc;
+
+ /* Allocate memory */
+ dump->buffer = vzalloc(PAGE_ALIGN(dump->size));
+ if (!dump->buffer) {
+ pr_err("%s : Failed to allocate memory\n", __func__);
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ /* Generate SG list */
+ list = opal_vmalloc_to_sg_list(dump->buffer, dump->size);
+ if (!list) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ /* First entry address */
+ addr = __pa(list);
+
+ /* Fetch data */
+ rc = OPAL_BUSY_EVENT;
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_dump_read(dump->id, addr);
+ if (rc == OPAL_BUSY_EVENT) {
+ opal_poll_events(NULL);
+ msleep(20);
+ }
+ }
+
+ if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL)
+ pr_warn("%s: Extract dump failed for ID 0x%x\n",
+ __func__, dump->id);
+
+ /* Free SG list */
+ opal_free_sg_list(list);
+
+out:
+ return rc;
+}
+
+static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buffer, loff_t pos, size_t count)
+{
+ ssize_t rc;
+
+ struct dump_obj *dump = to_dump_obj(kobj);
+
+ if (!dump->buffer) {
+ rc = dump_read_data(dump);
+
+ if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL) {
+ vfree(dump->buffer);
+ dump->buffer = NULL;
+
+ return -EIO;
+ }
+ if (rc == OPAL_PARTIAL) {
+ /* On a partial read, we just return EIO
+ * and rely on userspace to ask us to try
+ * again.
+ */
+ pr_info("%s: Platform dump partially read. ID = 0x%x\n",
+ __func__, dump->id);
+ return -EIO;
+ }
+ }
+
+ memcpy(buffer, dump->buffer + pos, count);
+
+ /* You may think we could free the dump buffer now and retrieve
+ * it again later if needed, but due to a current firmware limitation,
+ * that's not the case. So, once read into userspace,
+ * we keep the dump around until it's acknowledged by userspace.
+ */
+
+ return count;
+}
+
+static void create_dump_obj(uint32_t id, size_t size, uint32_t type)
+{
+ struct dump_obj *dump;
+ int rc;
+
+ dump = kzalloc(sizeof(*dump), GFP_KERNEL);
+ if (!dump)
+ return;
+
+ dump->kobj.kset = dump_kset;
+
+ kobject_init(&dump->kobj, &dump_ktype);
+
+ sysfs_bin_attr_init(&dump->dump_attr);
+
+ dump->dump_attr.attr.name = "dump";
+ dump->dump_attr.attr.mode = 0400;
+ dump->dump_attr.size = size;
+ dump->dump_attr.read = dump_attr_read;
+
+ dump->id = id;
+ dump->size = size;
+ dump->type = type;
+
+ rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id);
+ if (rc) {
+ kobject_put(&dump->kobj);
+ return;
+ }
+
+ /*
+ * As soon as the sysfs file for this dump is created/activated there is
+ * a chance the opal_errd daemon (or any userspace) might read and
+ * acknowledge the dump before kobject_uevent() is called. If that
+ * happens then there is a potential race between
+ * dump_ack_store->kobject_put() and kobject_uevent() which leads to a
+ * use-after-free of a kernfs object resulting in a kernel crash.
+ *
+ * To avoid that, we need to take a reference on behalf of the bin file,
+ * so that our reference remains valid while we call kobject_uevent().
+ * We then drop our reference before exiting the function, leaving the
+ * bin file to drop the last reference (if it hasn't already).
+ */
+
+ /* Take a reference for the bin file */
+ kobject_get(&dump->kobj);
+ rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr);
+ if (rc == 0) {
+ kobject_uevent(&dump->kobj, KOBJ_ADD);
+
+ pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
+ __func__, dump->id, dump->size);
+ } else {
+ /* Drop reference count taken for bin file */
+ kobject_put(&dump->kobj);
+ }
+
+ /* Drop our reference */
+ kobject_put(&dump->kobj);
+ return;
+}
+
+static irqreturn_t process_dump(int irq, void *data)
+{
+ int rc;
+ uint32_t dump_id, dump_size, dump_type;
+ char name[22];
+ struct kobject *kobj;
+
+ rc = dump_read_info(&dump_id, &dump_size, &dump_type);
+ if (rc != OPAL_SUCCESS)
+ return IRQ_HANDLED;
+
+ sprintf(name, "0x%x-0x%x", dump_type, dump_id);
+
+ /* we may get notified twice, let's handle
+ * that gracefully and not create two conflicting
+ * entries.
+ */
+ kobj = kset_find_obj(dump_kset, name);
+ if (kobj) {
+ /* Drop reference added by kset_find_obj() */
+ kobject_put(kobj);
+ return IRQ_HANDLED;
+ }
+
+ create_dump_obj(dump_id, dump_size, dump_type);
+
+ return IRQ_HANDLED;
+}
+
+void __init opal_platform_dump_init(void)
+{
+ int rc;
+ int dump_irq;
+
+ /* Dump not supported by firmware */
+ if (!opal_check_token(OPAL_DUMP_READ))
+ return;
+
+ dump_kset = kset_create_and_add("dump", NULL, opal_kobj);
+ if (!dump_kset) {
+ pr_warn("%s: Failed to create dump kset\n", __func__);
+ return;
+ }
+
+ rc = sysfs_create_group(&dump_kset->kobj, &initiate_attr_group);
+ if (rc) {
+ pr_warn("%s: Failed to create initiate dump attr group\n",
+ __func__);
+ kobject_put(&dump_kset->kobj);
+ return;
+ }
+
+ dump_irq = opal_event_request(ilog2(OPAL_EVENT_DUMP_AVAIL));
+ if (!dump_irq) {
+ pr_err("%s: Can't register OPAL event irq (%d)\n",
+ __func__, dump_irq);
+ return;
+ }
+
+ rc = request_threaded_irq(dump_irq, NULL, process_dump,
+ IRQF_TRIGGER_HIGH | IRQF_ONESHOT,
+ "opal-dump", NULL);
+ if (rc) {
+ pr_err("%s: Can't request OPAL event irq (%d)\n",
+ __func__, rc);
+ return;
+ }
+
+ if (opal_check_token(OPAL_DUMP_RESEND))
+ opal_dump_resend_notification();
+}
diff --git a/arch/powerpc/platforms/powernv/opal-elog.c b/arch/powerpc/platforms/powernv/opal-elog.c
new file mode 100644
index 000000000..398a06631
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-elog.c
@@ -0,0 +1,338 @@
+/*
+ * Error log support on PowerNV.
+ *
+ * Copyright 2013,2014 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+#include <linux/fcntl.h>
+#include <linux/kobject.h>
+#include <linux/uaccess.h>
+#include <asm/opal.h>
+
+struct elog_obj {
+ struct kobject kobj;
+ struct bin_attribute raw_attr;
+ uint64_t id;
+ uint64_t type;
+ size_t size;
+ char *buffer;
+};
+#define to_elog_obj(x) container_of(x, struct elog_obj, kobj)
+
+struct elog_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct elog_obj *elog, struct elog_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct elog_obj *elog, struct elog_attribute *attr,
+ const char *buf, size_t count);
+};
+#define to_elog_attr(x) container_of(x, struct elog_attribute, attr)
+
+static ssize_t elog_id_show(struct elog_obj *elog_obj,
+ struct elog_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "0x%llx\n", elog_obj->id);
+}
+
+static const char *elog_type_to_string(uint64_t type)
+{
+ switch (type) {
+ case 0: return "PEL";
+ default: return "unknown";
+ }
+}
+
+static ssize_t elog_type_show(struct elog_obj *elog_obj,
+ struct elog_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "0x%llx %s\n",
+ elog_obj->type,
+ elog_type_to_string(elog_obj->type));
+}
+
+static ssize_t elog_ack_show(struct elog_obj *elog_obj,
+ struct elog_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "ack - acknowledge log message\n");
+}
+
+static ssize_t elog_ack_store(struct elog_obj *elog_obj,
+ struct elog_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ opal_send_ack_elog(elog_obj->id);
+ sysfs_remove_file_self(&elog_obj->kobj, &attr->attr);
+ kobject_put(&elog_obj->kobj);
+ return count;
+}
+
+static struct elog_attribute id_attribute =
+ __ATTR(id, 0444, elog_id_show, NULL);
+static struct elog_attribute type_attribute =
+ __ATTR(type, 0444, elog_type_show, NULL);
+static struct elog_attribute ack_attribute =
+ __ATTR(acknowledge, 0660, elog_ack_show, elog_ack_store);
+
+static struct kset *elog_kset;
+
+static ssize_t elog_attr_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct elog_attribute *attribute;
+ struct elog_obj *elog;
+
+ attribute = to_elog_attr(attr);
+ elog = to_elog_obj(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ return attribute->show(elog, attribute, buf);
+}
+
+static ssize_t elog_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct elog_attribute *attribute;
+ struct elog_obj *elog;
+
+ attribute = to_elog_attr(attr);
+ elog = to_elog_obj(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ return attribute->store(elog, attribute, buf, len);
+}
+
+static const struct sysfs_ops elog_sysfs_ops = {
+ .show = elog_attr_show,
+ .store = elog_attr_store,
+};
+
+static void elog_release(struct kobject *kobj)
+{
+ struct elog_obj *elog;
+
+ elog = to_elog_obj(kobj);
+ kfree(elog->buffer);
+ kfree(elog);
+}
+
+static struct attribute *elog_default_attrs[] = {
+ &id_attribute.attr,
+ &type_attribute.attr,
+ &ack_attribute.attr,
+ NULL,
+};
+
+static struct kobj_type elog_ktype = {
+ .sysfs_ops = &elog_sysfs_ops,
+ .release = &elog_release,
+ .default_attrs = elog_default_attrs,
+};
+
+/* Maximum size of a single log on FSP is 16KB */
+#define OPAL_MAX_ERRLOG_SIZE 16384
+
+static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buffer, loff_t pos, size_t count)
+{
+ int opal_rc;
+
+ struct elog_obj *elog = to_elog_obj(kobj);
+
+ /* We may have had an error reading before, so let's retry */
+ if (!elog->buffer) {
+ elog->buffer = kzalloc(elog->size, GFP_KERNEL);
+ if (!elog->buffer)
+ return -EIO;
+
+ opal_rc = opal_read_elog(__pa(elog->buffer),
+ elog->size, elog->id);
+ if (opal_rc != OPAL_SUCCESS) {
+ pr_err("ELOG: log read failed for log-id=%llx\n",
+ elog->id);
+ kfree(elog->buffer);
+ elog->buffer = NULL;
+ return -EIO;
+ }
+ }
+
+ memcpy(buffer, elog->buffer + pos, count);
+
+ return count;
+}
+
+static void create_elog_obj(uint64_t id, size_t size, uint64_t type)
+{
+ struct elog_obj *elog;
+ int rc;
+
+ elog = kzalloc(sizeof(*elog), GFP_KERNEL);
+ if (!elog)
+ return;
+
+ elog->kobj.kset = elog_kset;
+
+ kobject_init(&elog->kobj, &elog_ktype);
+
+ sysfs_bin_attr_init(&elog->raw_attr);
+
+ elog->raw_attr.attr.name = "raw";
+ elog->raw_attr.attr.mode = 0400;
+ elog->raw_attr.size = size;
+ elog->raw_attr.read = raw_attr_read;
+
+ elog->id = id;
+ elog->size = size;
+ elog->type = type;
+
+ elog->buffer = kzalloc(elog->size, GFP_KERNEL);
+
+ if (elog->buffer) {
+ rc = opal_read_elog(__pa(elog->buffer),
+ elog->size, elog->id);
+ if (rc != OPAL_SUCCESS) {
+ pr_err("ELOG: log read failed for log-id=%llx\n",
+ elog->id);
+ kfree(elog->buffer);
+ elog->buffer = NULL;
+ }
+ }
+
+ rc = kobject_add(&elog->kobj, NULL, "0x%llx", id);
+ if (rc) {
+ kobject_put(&elog->kobj);
+ return;
+ }
+
+ /*
+ * As soon as the sysfs file for this elog is created/activated there is
+ * a chance the opal_errd daemon (or any userspace) might read and
+ * acknowledge the elog before kobject_uevent() is called. If that
+ * happens then there is a potential race between
+ * elog_ack_store->kobject_put() and kobject_uevent() which leads to a
+ * use-after-free of a kernfs object resulting in a kernel crash.
+ *
+ * To avoid that, we need to take a reference on behalf of the bin file,
+ * so that our reference remains valid while we call kobject_uevent().
+ * We then drop our reference before exiting the function, leaving the
+ * bin file to drop the last reference (if it hasn't already).
+ */
+
+ /* Take a reference for the bin file */
+ kobject_get(&elog->kobj);
+ rc = sysfs_create_bin_file(&elog->kobj, &elog->raw_attr);
+ if (rc == 0) {
+ kobject_uevent(&elog->kobj, KOBJ_ADD);
+ } else {
+ /* Drop the reference taken for the bin file */
+ kobject_put(&elog->kobj);
+ }
+
+ /* Drop our reference */
+ kobject_put(&elog->kobj);
+
+ return;
+}
+
+static irqreturn_t elog_event(int irq, void *data)
+{
+ __be64 size;
+ __be64 id;
+ __be64 type;
+ uint64_t elog_size;
+ uint64_t log_id;
+ uint64_t elog_type;
+ int rc;
+ char name[2+16+1];
+ struct kobject *kobj;
+
+ rc = opal_get_elog_size(&id, &size, &type);
+ if (rc != OPAL_SUCCESS) {
+ pr_err("ELOG: OPAL log info read failed\n");
+ return IRQ_HANDLED;
+ }
+
+ elog_size = be64_to_cpu(size);
+ log_id = be64_to_cpu(id);
+ elog_type = be64_to_cpu(type);
+
+ WARN_ON(elog_size > OPAL_MAX_ERRLOG_SIZE);
+
+ if (elog_size >= OPAL_MAX_ERRLOG_SIZE)
+ elog_size = OPAL_MAX_ERRLOG_SIZE;
+
+ sprintf(name, "0x%llx", log_id);
+
+ /* we may get notified twice, let's handle
+ * that gracefully and not create two conflicting
+ * entries.
+ */
+ kobj = kset_find_obj(elog_kset, name);
+ if (kobj) {
+ /* Drop reference added by kset_find_obj() */
+ kobject_put(kobj);
+ return IRQ_HANDLED;
+ }
+
+ create_elog_obj(log_id, elog_size, elog_type);
+
+ return IRQ_HANDLED;
+}
+
+int __init opal_elog_init(void)
+{
+ int rc = 0, irq;
+
+ /* ELOG not supported by firmware */
+ if (!opal_check_token(OPAL_ELOG_READ))
+ return -1;
+
+ elog_kset = kset_create_and_add("elog", NULL, opal_kobj);
+ if (!elog_kset) {
+ pr_warn("%s: failed to create elog kset\n", __func__);
+ return -1;
+ }
+
+ irq = opal_event_request(ilog2(OPAL_EVENT_ERROR_LOG_AVAIL));
+ if (!irq) {
+ pr_err("%s: Can't register OPAL event irq (%d)\n",
+ __func__, irq);
+ return irq;
+ }
+
+ rc = request_threaded_irq(irq, NULL, elog_event,
+ IRQF_TRIGGER_HIGH | IRQF_ONESHOT, "opal-elog", NULL);
+ if (rc) {
+ pr_err("%s: Can't request OPAL event irq (%d)\n",
+ __func__, rc);
+ return rc;
+ }
+
+ /* We are now ready to pull error logs from opal. */
+ if (opal_check_token(OPAL_ELOG_RESEND))
+ opal_resend_pending_logs();
+
+ return 0;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-flash.c b/arch/powerpc/platforms/powernv/opal-flash.c
new file mode 100644
index 000000000..b37015101
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-flash.c
@@ -0,0 +1,566 @@
+/*
+ * PowerNV OPAL Firmware Update Interface
+ *
+ * Copyright 2013 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define DEBUG
+
+#include <linux/kernel.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/delay.h>
+
+#include <asm/opal.h>
+
+/* FLASH status codes */
+#define FLASH_NO_OP -1099 /* No operation initiated by user */
+#define FLASH_NO_AUTH -9002 /* Not a service authority partition */
+
+/* Validate image status values */
+#define VALIDATE_IMG_READY -1001 /* Image ready for validation */
+#define VALIDATE_IMG_INCOMPLETE -1002 /* User copied < VALIDATE_BUF_SIZE */
+
+/* Manage image status values */
+#define MANAGE_ACTIVE_ERR -9001 /* Cannot overwrite active img */
+
+/* Flash image status values */
+#define FLASH_IMG_READY 0 /* Img ready for flash on reboot */
+#define FLASH_INVALID_IMG -1003 /* Flash image shorter than expected */
+#define FLASH_IMG_NULL_DATA -1004 /* Bad data in sg list entry */
+#define FLASH_IMG_BAD_LEN -1005 /* Bad length in sg list entry */
+
+/* Manage operation tokens */
+#define FLASH_REJECT_TMP_SIDE 0 /* Reject temporary fw image */
+#define FLASH_COMMIT_TMP_SIDE 1 /* Commit temporary fw image */
+
+/* Update tokens */
+#define FLASH_UPDATE_CANCEL 0 /* Cancel update request */
+#define FLASH_UPDATE_INIT 1 /* Initiate update */
+
+/* Validate image update result tokens */
+#define VALIDATE_TMP_UPDATE 0 /* T side will be updated */
+#define VALIDATE_FLASH_AUTH 1 /* Partition does not have authority */
+#define VALIDATE_INVALID_IMG 2 /* Candidate image is not valid */
+#define VALIDATE_CUR_UNKNOWN 3 /* Current fixpack level is unknown */
+/*
+ * Current T side will be committed to P side before being replaced with the
+ * new image, and the new image is downlevel from the current image
+ */
+#define VALIDATE_TMP_COMMIT_DL 4
+/*
+ * Current T side will be committed to P side before being replaced with new
+ * image
+ */
+#define VALIDATE_TMP_COMMIT 5
+/*
+ * T side will be updated with a downlevel image
+ */
+#define VALIDATE_TMP_UPDATE_DL 6
+/*
+ * The candidate image's release date is later than the system's firmware
+ * service entitlement date - service warranty period has expired
+ */
+#define VALIDATE_OUT_OF_WRNTY 7
+
+/* Validate buffer size */
+#define VALIDATE_BUF_SIZE 4096
+
+/* XXX: Assume candidate image size is <= 1GB */
+#define MAX_IMAGE_SIZE 0x40000000
+
+/* Image status */
+enum {
+ IMAGE_INVALID,
+ IMAGE_LOADING,
+ IMAGE_READY,
+};
+
+/* Candidate image data */
+struct image_data_t {
+ int status;
+ void *data;
+ uint32_t size;
+};
+
+/* Candidate image header */
+struct image_header_t {
+ uint16_t magic;
+ uint16_t version;
+ uint32_t size;
+};
+
+struct validate_flash_t {
+ int status; /* Return status */
+ void *buf; /* Candidate image buffer */
+ uint32_t buf_size; /* Image size */
+ uint32_t result; /* Update results token */
+};
+
+struct manage_flash_t {
+ int status; /* Return status */
+};
+
+struct update_flash_t {
+ int status; /* Return status */
+};
+
+static struct image_header_t image_header;
+static struct image_data_t image_data;
+static struct validate_flash_t validate_flash_data;
+static struct manage_flash_t manage_flash_data;
+
+/* Initialize update_flash_data status to No Operation */
+static struct update_flash_t update_flash_data = {
+ .status = FLASH_NO_OP,
+};
+
+static DEFINE_MUTEX(image_data_mutex);
+
+/*
+ * Validate candidate image
+ */
+static inline void opal_flash_validate(void)
+{
+ long ret;
+ void *buf = validate_flash_data.buf;
+ __be32 size = cpu_to_be32(validate_flash_data.buf_size);
+ __be32 result;
+
+ ret = opal_validate_flash(__pa(buf), &size, &result);
+
+ validate_flash_data.status = ret;
+ validate_flash_data.buf_size = be32_to_cpu(size);
+ validate_flash_data.result = be32_to_cpu(result);
+}
+
+/*
+ * Validate output format:
+ * validate result token
+ * current image version details
+ * new image version details
+ */
+static ssize_t validate_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct validate_flash_t *args_buf = &validate_flash_data;
+ int len;
+
+ /* Candidate image is not validated */
+ if (args_buf->status < VALIDATE_TMP_UPDATE) {
+ len = sprintf(buf, "%d\n", args_buf->status);
+ goto out;
+ }
+
+ /* Result token */
+ len = sprintf(buf, "%d\n", args_buf->result);
+
+ /* Current and candidate image version details */
+ if ((args_buf->result != VALIDATE_TMP_UPDATE) &&
+ (args_buf->result < VALIDATE_CUR_UNKNOWN))
+ goto out;
+
+ if (args_buf->buf_size > (VALIDATE_BUF_SIZE - len)) {
+ memcpy(buf + len, args_buf->buf, VALIDATE_BUF_SIZE - len);
+ len = VALIDATE_BUF_SIZE;
+ } else {
+ memcpy(buf + len, args_buf->buf, args_buf->buf_size);
+ len += args_buf->buf_size;
+ }
+out:
+ /* Set status to default */
+ args_buf->status = FLASH_NO_OP;
+ return len;
+}
+
+/*
+ * Validate candidate firmware image
+ *
+ * Note:
+ * We are only interested in the first 4K bytes of the
+ * candidate image.
+ */
+static ssize_t validate_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct validate_flash_t *args_buf = &validate_flash_data;
+
+ if (buf[0] != '1')
+ return -EINVAL;
+
+ mutex_lock(&image_data_mutex);
+
+ if (image_data.status != IMAGE_READY ||
+ image_data.size < VALIDATE_BUF_SIZE) {
+ args_buf->result = VALIDATE_INVALID_IMG;
+ args_buf->status = VALIDATE_IMG_INCOMPLETE;
+ goto out;
+ }
+
+ /* Copy first 4k bytes of candidate image */
+ memcpy(args_buf->buf, image_data.data, VALIDATE_BUF_SIZE);
+
+ args_buf->status = VALIDATE_IMG_READY;
+ args_buf->buf_size = VALIDATE_BUF_SIZE;
+
+ /* Validate candidate image */
+ opal_flash_validate();
+
+out:
+ mutex_unlock(&image_data_mutex);
+ return count;
+}
+
+/*
+ * Manage flash routine
+ */
+static inline void opal_flash_manage(uint8_t op)
+{
+ struct manage_flash_t *const args_buf = &manage_flash_data;
+
+ args_buf->status = opal_manage_flash(op);
+}
+
+/*
+ * Show manage flash status
+ */
+static ssize_t manage_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct manage_flash_t *const args_buf = &manage_flash_data;
+ int rc;
+
+ rc = sprintf(buf, "%d\n", args_buf->status);
+ /* Set status to default */
+ args_buf->status = FLASH_NO_OP;
+ return rc;
+}
+
+/*
+ * Manage operations:
+ * 0 - Reject
+ * 1 - Commit
+ */
+static ssize_t manage_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ uint8_t op;
+ switch (buf[0]) {
+ case '0':
+ op = FLASH_REJECT_TMP_SIDE;
+ break;
+ case '1':
+ op = FLASH_COMMIT_TMP_SIDE;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* commit/reject temporary image */
+ opal_flash_manage(op);
+ return count;
+}
+
+/*
+ * OPAL update flash
+ */
+static int opal_flash_update(int op)
+{
+ struct opal_sg_list *list;
+ unsigned long addr;
+ int64_t rc = OPAL_PARAMETER;
+
+ if (op == FLASH_UPDATE_CANCEL) {
+ pr_alert("FLASH: Image update cancelled\n");
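+ /* A zero SG list address tells OPAL to cancel the update */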
+ addr = '\0';
+ goto flash;
+ }
+
+ list = opal_vmalloc_to_sg_list(image_data.data, image_data.size);
+ if (!list)
+ goto invalid_img;
+
+ /* First entry address */
+ addr = __pa(list);
+
+flash:
+ rc = opal_update_flash(addr);
+
+invalid_img:
+ return rc;
+}
+
+/* This gets called just before system reboots */
+void opal_flash_update_print_message(void)
+{
+ if (update_flash_data.status != FLASH_IMG_READY)
+ return;
+
+ pr_alert("FLASH: Flashing new firmware\n");
+ pr_alert("FLASH: Image is %u bytes\n", image_data.size);
+ pr_alert("FLASH: Performing flash and reboot/shutdown\n");
+ pr_alert("FLASH: This will take several minutes. Do not power off!\n");
+
+ /* Small delay to help getting the above message out */
+ msleep(500);
+}
+
+/*
+ * Show candidate image status
+ */
+static ssize_t update_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct update_flash_t *const args_buf = &update_flash_data;
+ return sprintf(buf, "%d\n", args_buf->status);
+}
+
+/*
+ * Set update image flag
+ * 1 - Flash new image
+ * 0 - Cancel flash request
+ */
+static ssize_t update_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct update_flash_t *const args_buf = &update_flash_data;
+ int rc = count;
+
+ mutex_lock(&image_data_mutex);
+
+ switch (buf[0]) {
+ case '0':
+ if (args_buf->status == FLASH_IMG_READY)
+ opal_flash_update(FLASH_UPDATE_CANCEL);
+ args_buf->status = FLASH_NO_OP;
+ break;
+ case '1':
+ /* Image is loaded? */
+ if (image_data.status == IMAGE_READY)
+ args_buf->status =
+ opal_flash_update(FLASH_UPDATE_INIT);
+ else
+ args_buf->status = FLASH_INVALID_IMG;
+ break;
+ default:
+ rc = -EINVAL;
+ }
+
+ mutex_unlock(&image_data_mutex);
+ return rc;
+}
+
+/*
+ * Free image buffer
+ */
+static void free_image_buf(void)
+{
+ void *addr;
+ int size;
+
+ addr = image_data.data;
+ size = PAGE_ALIGN(image_data.size);
+ while (size > 0) {
+ ClearPageReserved(vmalloc_to_page(addr));
+ addr += PAGE_SIZE;
+ size -= PAGE_SIZE;
+ }
+ vfree(image_data.data);
+ image_data.data = NULL;
+ image_data.status = IMAGE_INVALID;
+}
+
+/*
+ * Allocate image buffer.
+ */
+static int alloc_image_buf(char *buffer, size_t count)
+{
+ void *addr;
+ int size;
+
+ if (count < sizeof(image_header)) {
+ pr_warn("FLASH: Invalid candidate image\n");
+ return -EINVAL;
+ }
+
+ memcpy(&image_header, (void *)buffer, sizeof(image_header));
+ image_data.size = be32_to_cpu(image_header.size);
+ pr_debug("FLASH: Candidate image size = %u\n", image_data.size);
+
+ if (image_data.size > MAX_IMAGE_SIZE) {
+ pr_warn("FLASH: Too large image\n");
+ return -EINVAL;
+ }
+ if (image_data.size < VALIDATE_BUF_SIZE) {
+ pr_warn("FLASH: Image is shorter than expected\n");
+ return -EINVAL;
+ }
+
+ image_data.data = vzalloc(PAGE_ALIGN(image_data.size));
+ if (!image_data.data) {
+ pr_err("%s : Failed to allocate memory\n", __func__);
+ return -ENOMEM;
+ }
+
+ /* Pin memory */
+ addr = image_data.data;
+ size = PAGE_ALIGN(image_data.size);
+ while (size > 0) {
+ SetPageReserved(vmalloc_to_page(addr));
+ addr += PAGE_SIZE;
+ size -= PAGE_SIZE;
+ }
+
+ image_data.status = IMAGE_LOADING;
+ return 0;
+}
+
+/*
+ * Copy candidate image
+ *
+ * Parse candidate image header to get total image size
+ * and pre-allocate required memory.
+ */
+static ssize_t image_data_write(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buffer, loff_t pos, size_t count)
+{
+ int rc;
+
+ mutex_lock(&image_data_mutex);
+
+ /* New image ? */
+ if (pos == 0) {
+ /* Free memory, if already allocated */
+ if (image_data.data)
+ free_image_buf();
+
+ /* Cancel outstanding image update request */
+ if (update_flash_data.status == FLASH_IMG_READY)
+ opal_flash_update(FLASH_UPDATE_CANCEL);
+
+ /* Allocate memory */
+ rc = alloc_image_buf(buffer, count);
+ if (rc)
+ goto out;
+ }
+
+ if (image_data.status != IMAGE_LOADING) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ if ((pos + count) > image_data.size) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ memcpy(image_data.data + pos, (void *)buffer, count);
+ rc = count;
+
+ /* Set image status */
+ if ((pos + count) == image_data.size) {
+ pr_debug("FLASH: Candidate image loaded....\n");
+ image_data.status = IMAGE_READY;
+ }
+
+out:
+ mutex_unlock(&image_data_mutex);
+ return rc;
+}
+
+/*
+ * sysfs interface:
+ * OPAL code update uses the sysfs files below.
+ * We create these files under /sys/firmware/opal.
+ *
+ * image : Interface to load candidate firmware image
+ * validate_flash : Validate firmware image
+ * manage_flash : Commit/Reject firmware image
+ * update_flash : Flash new firmware image
+ *
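+ * A typical userspace update sequence (file names as created below,
+ * image.bin being the candidate image):
+ *     cat image.bin > /sys/firmware/opal/image
+ *     echo 1 > /sys/firmware/opal/validate_flash
+ *     echo 1 > /sys/firmware/opal/update_flash
+ *     reboot (the image is flashed during the reboot)
+ *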
+ */
+static const struct bin_attribute image_data_attr = {
+ .attr = {.name = "image", .mode = 0200},
+ .size = MAX_IMAGE_SIZE, /* Limit image size */
+ .write = image_data_write,
+};
+
+static struct kobj_attribute validate_attribute =
+ __ATTR(validate_flash, 0600, validate_show, validate_store);
+
+static struct kobj_attribute manage_attribute =
+ __ATTR(manage_flash, 0600, manage_show, manage_store);
+
+static struct kobj_attribute update_attribute =
+ __ATTR(update_flash, 0600, update_show, update_store);
+
+static struct attribute *image_op_attrs[] = {
+ &validate_attribute.attr,
+ &manage_attribute.attr,
+ &update_attribute.attr,
+ NULL /* need to NULL terminate the list of attributes */
+};
+
+static struct attribute_group image_op_attr_group = {
+ .attrs = image_op_attrs,
+};
+
+void __init opal_flash_update_init(void)
+{
+ int ret;
+
+ /* Allocate validate image buffer */
+ validate_flash_data.buf = kzalloc(VALIDATE_BUF_SIZE, GFP_KERNEL);
+ if (!validate_flash_data.buf) {
+ pr_err("%s : Failed to allocate memory\n", __func__);
+ return;
+ }
+
+ /* Make sure /sys/firmware/opal directory is created */
+ if (!opal_kobj) {
+ pr_warn("FLASH: opal kobject is not available\n");
+ goto nokobj;
+ }
+
+ /* Create the sysfs files */
+ ret = sysfs_create_group(opal_kobj, &image_op_attr_group);
+ if (ret) {
+ pr_warn("FLASH: Failed to create sysfs files\n");
+ goto nokobj;
+ }
+
+ ret = sysfs_create_bin_file(opal_kobj, &image_data_attr);
+ if (ret) {
+ pr_warn("FLASH: Failed to create sysfs files\n");
+ goto nosysfs_file;
+ }
+
+ /* Set default status */
+ validate_flash_data.status = FLASH_NO_OP;
+ manage_flash_data.status = FLASH_NO_OP;
+ update_flash_data.status = FLASH_NO_OP;
+ image_data.status = IMAGE_INVALID;
+ return;
+
+nosysfs_file:
+ sysfs_remove_group(opal_kobj, &image_op_attr_group);
+
+nokobj:
+ kfree(validate_flash_data.buf);
+ return;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c
new file mode 100644
index 000000000..586ec71a4
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-hmi.c
@@ -0,0 +1,348 @@
+/*
+ * OPAL Hypervisor Maintenance Interrupt (HMI) handling support in PowerNV.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright 2014 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include <asm/opal.h>
+#include <asm/cputable.h>
+#include <asm/machdep.h>
+
+#include "powernv.h"
+
+static int opal_hmi_handler_nb_init;
+struct OpalHmiEvtNode {
+ struct list_head list;
+ struct OpalHMIEvent hmi_evt;
+};
+
+struct xstop_reason {
+ uint32_t xstop_reason;
+ const char *unit_failed;
+ const char *description;
+};
+
+static LIST_HEAD(opal_hmi_evt_list);
+static DEFINE_SPINLOCK(opal_hmi_evt_lock);
+
+static void print_core_checkstop_reason(const char *level,
+ struct OpalHMIEvent *hmi_evt)
+{
+ int i;
+ static const struct xstop_reason xstop_reason[] = {
+ { CORE_CHECKSTOP_IFU_REGFILE, "IFU",
+ "RegFile core check stop" },
+ { CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
+ { CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
+ "Core checkstop during recovery" },
+ { CORE_CHECKSTOP_ISU_REGFILE, "ISU",
+ "RegFile core check stop (mapper error)" },
+ { CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
+ { CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
+ { CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
+ { CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
+ "Recovery in maintenance mode" },
+ { CORE_CHECKSTOP_LSU_REGFILE, "LSU",
+ "RegFile core check stop" },
+ { CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
+ "Forward Progress Error" },
+ { CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
+ { CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
+ { CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
+ "Hypervisor Resource error - core check stop" },
+ { CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
+ "Hang Recovery Failed (core check stop)" },
+ { CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
+ "Ambiguous Hang Detected (unknown source)" },
+ { CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
+ "Debug Trigger Error inject" },
+ { CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
+ "Hypervisor check stop via SPRC/SPRD" },
+ };
+
+ /* Validity check */
+ if (!hmi_evt->u.xstop_error.xstop_reason) {
+ printk("%s Unknown Core check stop.\n", level);
+ return;
+ }
+
+ printk("%s CPU PIR: %08x\n", level,
+ be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
+ for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
+ if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
+ xstop_reason[i].xstop_reason)
+ printk("%s [Unit: %-3s] %s\n", level,
+ xstop_reason[i].unit_failed,
+ xstop_reason[i].description);
+}
+
+static void print_nx_checkstop_reason(const char *level,
+ struct OpalHMIEvent *hmi_evt)
+{
+ int i;
+ static const struct xstop_reason xstop_reason[] = {
+ { NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
+ "SHM invalid state error" },
+ { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
+ "DMA invalid state error bit 15" },
+ { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
+ "DMA invalid state error bit 16" },
+ { NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
+ "Channel 0 invalid state error" },
+ { NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
+ "Channel 1 invalid state error" },
+ { NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
+ "Channel 2 invalid state error" },
+ { NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
+ "Channel 3 invalid state error" },
+ { NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
+ "Channel 4 invalid state error" },
+ { NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
+ "Channel 5 invalid state error" },
+ { NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
+ "Channel 6 invalid state error" },
+ { NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
+ "Channel 7 invalid state error" },
+ { NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
+ "UE error on CRB(CSB address, CCB)" },
+ { NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
+ "SUE error on CRB(CSB address, CCB)" },
+ { NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
+ "CRB Kill ISN received while holding ISN with UE error" },
+ };
+
+ /* Validity check */
+ if (!hmi_evt->u.xstop_error.xstop_reason) {
+ printk("%s Unknown NX check stop.\n", level);
+ return;
+ }
+
+ printk("%s NX checkstop on CHIP ID: %x\n", level,
+ be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
+ for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
+ if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
+ xstop_reason[i].xstop_reason)
+ printk("%s [Unit: %-3s] %s\n", level,
+ xstop_reason[i].unit_failed,
+ xstop_reason[i].description);
+}
+
+static void print_checkstop_reason(const char *level,
+ struct OpalHMIEvent *hmi_evt)
+{
+	uint8_t type = hmi_evt->u.xstop_error.xstop_type;
+
+	switch (type) {
+ case CHECKSTOP_TYPE_CORE:
+ print_core_checkstop_reason(level, hmi_evt);
+ break;
+ case CHECKSTOP_TYPE_NX:
+ print_nx_checkstop_reason(level, hmi_evt);
+ break;
+ default:
+ printk("%s Unknown Malfunction Alert of type %d\n",
+ level, type);
+ break;
+ }
+}
+
+static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
+{
+ const char *level, *sevstr, *error_info;
+ static const char *hmi_error_types[] = {
+ "Malfunction Alert",
+ "Processor Recovery done",
+ "Processor recovery occurred again",
+ "Processor recovery occurred for masked error",
+ "Timer facility experienced an error",
+ "TFMR SPR is corrupted",
+		"UPS (Uninterruptible Power System) Overflow indication",
+ "An XSCOM operation failure",
+ "An XSCOM operation completed",
+ "SCOM has set a reserved FIR bit to cause recovery",
+ "Debug trigger has set a reserved FIR bit to cause recovery",
+ "A hypervisor resource error occurred",
+ "CAPP recovery process is in progress",
+ };
+
+ /* Print things out */
+ if (hmi_evt->version < OpalHMIEvt_V1) {
+		pr_err("HMI Interrupt, Unknown event version %d!\n",
+ hmi_evt->version);
+ return;
+ }
+ switch (hmi_evt->severity) {
+ case OpalHMI_SEV_NO_ERROR:
+ level = KERN_INFO;
+ sevstr = "Harmless";
+ break;
+ case OpalHMI_SEV_WARNING:
+ level = KERN_WARNING;
+ sevstr = "";
+ break;
+ case OpalHMI_SEV_ERROR_SYNC:
+ level = KERN_ERR;
+ sevstr = "Severe";
+ break;
+ case OpalHMI_SEV_FATAL:
+ default:
+ level = KERN_ERR;
+ sevstr = "Fatal";
+ break;
+ }
+
+ printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
+ level, sevstr,
+ hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
+ "Recovered" : "Not recovered");
+ error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
+ hmi_error_types[hmi_evt->type]
+ : "Unknown";
+ printk("%s Error detail: %s\n", level, error_info);
+ printk("%s HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer));
+ if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
+ (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
+ printk("%s TFMR: %016llx\n", level,
+ be64_to_cpu(hmi_evt->tfmr));
+
+ if (hmi_evt->version < OpalHMIEvt_V2)
+ return;
+
+ /* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
+ if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
+ print_checkstop_reason(level, hmi_evt);
+}
+
+static void hmi_event_handler(struct work_struct *work)
+{
+ unsigned long flags;
+ struct OpalHMIEvent *hmi_evt;
+ struct OpalHmiEvtNode *msg_node;
+ uint8_t disposition;
+ struct opal_msg msg;
+ int unrecoverable = 0;
+
+ spin_lock_irqsave(&opal_hmi_evt_lock, flags);
+ while (!list_empty(&opal_hmi_evt_list)) {
+ msg_node = list_entry(opal_hmi_evt_list.next,
+ struct OpalHmiEvtNode, list);
+ list_del(&msg_node->list);
+ spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
+
+ hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
+ print_hmi_event_info(hmi_evt);
+ disposition = hmi_evt->disposition;
+ kfree(msg_node);
+
+		/*
+		 * Check whether the HMI event has been recovered. If not,
+		 * the kernel can't continue and we need to panic. But before
+		 * we do that, display all the HMI events available on the
+		 * list and set the unrecoverable flag.
+		 */
+ if (disposition != OpalHMI_DISPOSITION_RECOVERED)
+ unrecoverable = 1;
+
+ spin_lock_irqsave(&opal_hmi_evt_lock, flags);
+ }
+ spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
+
+ if (unrecoverable) {
+ /* Pull all HMI events from OPAL before we panic. */
+ while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
+ u32 type;
+
+ type = be32_to_cpu(msg.msg_type);
+
+ /* skip if not HMI event */
+ if (type != OPAL_MSG_HMI_EVT)
+ continue;
+
+ /* HMI event info starts from param[0] */
+ hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
+ print_hmi_event_info(hmi_evt);
+ }
+
+ pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception");
+ }
+}
+
+static DECLARE_WORK(hmi_event_work, hmi_event_handler);
+/*
+ * opal_handle_hmi_event - notifier handler that queues up HMI events
+ * to be processed later.
+ */
+static int opal_handle_hmi_event(struct notifier_block *nb,
+ unsigned long msg_type, void *msg)
+{
+ unsigned long flags;
+ struct OpalHMIEvent *hmi_evt;
+ struct opal_msg *hmi_msg = msg;
+ struct OpalHmiEvtNode *msg_node;
+
+ /* Sanity Checks */
+ if (msg_type != OPAL_MSG_HMI_EVT)
+ return 0;
+
+ /* HMI event info starts from param[0] */
+ hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
+
+ /* Delay the logging of HMI events to workqueue. */
+ msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
+ if (!msg_node) {
+ pr_err("HMI: out of memory, Opal message event not handled\n");
+ return -ENOMEM;
+ }
+ memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(*hmi_evt));
+
+ spin_lock_irqsave(&opal_hmi_evt_lock, flags);
+ list_add(&msg_node->list, &opal_hmi_evt_list);
+ spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
+
+ schedule_work(&hmi_event_work);
+ return 0;
+}
+
+static struct notifier_block opal_hmi_handler_nb = {
+ .notifier_call = opal_handle_hmi_event,
+ .next = NULL,
+ .priority = 0,
+};
+
+int __init opal_hmi_handler_init(void)
+{
+ int ret;
+
+ if (!opal_hmi_handler_nb_init) {
+ ret = opal_message_notifier_register(
+ OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
+ if (ret) {
+ pr_err("%s: Can't register OPAL event notifier (%d)\n",
+ __func__, ret);
+ return ret;
+ }
+ opal_hmi_handler_nb_init = 1;
+ }
+ return 0;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
new file mode 100644
index 000000000..539968279
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -0,0 +1,336 @@
+/*
+ * OPAL IMC interface detection driver
+ * Supported on POWERNV platform
+ *
+ * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation.
+ * (C) 2017 Anju T Sudhakar, IBM Corporation.
+ * (C) 2017 Hemant K Shaw, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or later version.
+ */
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+#include <linux/crash_dump.h>
+#include <asm/opal.h>
+#include <asm/io.h>
+#include <asm/imc-pmu.h>
+#include <asm/cputhreads.h>
+#include <asm/debugfs.h>
+
+static struct dentry *imc_debugfs_parent;
+
+/* Helpers to export imc command and mode via debugfs */
+static int imc_mem_get(void *data, u64 *val)
+{
+ *val = cpu_to_be64(*(u64 *)data);
+ return 0;
+}
+
+static int imc_mem_set(void *data, u64 val)
+{
+ *(u64 *)data = cpu_to_be64(val);
+ return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(fops_imc_x64, imc_mem_get, imc_mem_set, "0x%016llx\n");
+
+static struct dentry *imc_debugfs_create_x64(const char *name, umode_t mode,
+ struct dentry *parent, u64 *value)
+{
+ return debugfs_create_file_unsafe(name, mode, parent,
+ value, &fops_imc_x64);
+}
+
+/*
+ * export_imc_mode_and_cmd: Create a debugfs interface for imc_cmd and
+ * imc_mode for each node in the system. imc_mode and imc_cmd can be
+ * changed by echoing values into these files.
+ */
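+/*
+ * For example (sketch only; assumes debugfs is mounted at the usual
+ * /sys/kernel/debug), the per-chip control files created below could be
+ * used as:
+ *
+ *	cat /sys/kernel/debug/powerpc/imc/imc_mode_<chip-id>
+ *	echo 0x<value> > /sys/kernel/debug/powerpc/imc/imc_cmd_<chip-id>
+ */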
+static void export_imc_mode_and_cmd(struct device_node *node,
+ struct imc_pmu *pmu_ptr)
+{
+ static u64 loc, *imc_mode_addr, *imc_cmd_addr;
+ char mode[16], cmd[16];
+ u32 cb_offset;
+ struct imc_mem_info *ptr = pmu_ptr->mem_info;
+
+ imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);
+
+ if (!imc_debugfs_parent)
+ return;
+
+ if (of_property_read_u32(node, "cb_offset", &cb_offset))
+ cb_offset = IMC_CNTL_BLK_OFFSET;
+
+ while (ptr->vbase != NULL) {
+ loc = (u64)(ptr->vbase) + cb_offset;
+ imc_mode_addr = (u64 *)(loc + IMC_CNTL_BLK_MODE_OFFSET);
+ sprintf(mode, "imc_mode_%d", (u32)(ptr->id));
+ if (!imc_debugfs_create_x64(mode, 0600, imc_debugfs_parent,
+ imc_mode_addr))
+ goto err;
+
+ imc_cmd_addr = (u64 *)(loc + IMC_CNTL_BLK_CMD_OFFSET);
+ sprintf(cmd, "imc_cmd_%d", (u32)(ptr->id));
+ if (!imc_debugfs_create_x64(cmd, 0600, imc_debugfs_parent,
+ imc_cmd_addr))
+ goto err;
+ ptr++;
+ }
+ return;
+
+err:
+ debugfs_remove_recursive(imc_debugfs_parent);
+}
+
+/*
+ * imc_get_mem_addr_nest: Function to get nest counter memory region
+ * for each chip
+ */
+static int imc_get_mem_addr_nest(struct device_node *node,
+ struct imc_pmu *pmu_ptr,
+ u32 offset)
+{
+ int nr_chips = 0, i;
+ u64 *base_addr_arr, baddr;
+ u32 *chipid_arr;
+
+ nr_chips = of_property_count_u32_elems(node, "chip-id");
+ if (nr_chips <= 0)
+ return -ENODEV;
+
+ base_addr_arr = kcalloc(nr_chips, sizeof(*base_addr_arr), GFP_KERNEL);
+ if (!base_addr_arr)
+ return -ENOMEM;
+
+ chipid_arr = kcalloc(nr_chips, sizeof(*chipid_arr), GFP_KERNEL);
+ if (!chipid_arr) {
+ kfree(base_addr_arr);
+ return -ENOMEM;
+ }
+
+ if (of_property_read_u32_array(node, "chip-id", chipid_arr, nr_chips))
+ goto error;
+
+ if (of_property_read_u64_array(node, "base-addr", base_addr_arr,
+ nr_chips))
+ goto error;
+
+ pmu_ptr->mem_info = kcalloc(nr_chips + 1, sizeof(*pmu_ptr->mem_info),
+ GFP_KERNEL);
+ if (!pmu_ptr->mem_info)
+ goto error;
+
+ for (i = 0; i < nr_chips; i++) {
+ pmu_ptr->mem_info[i].id = chipid_arr[i];
+ baddr = base_addr_arr[i] + offset;
+ pmu_ptr->mem_info[i].vbase = phys_to_virt(baddr);
+ }
+
+ pmu_ptr->imc_counter_mmaped = true;
+ kfree(base_addr_arr);
+ kfree(chipid_arr);
+ return 0;
+
+error:
+ kfree(base_addr_arr);
+ kfree(chipid_arr);
+ return -1;
+}
+
+/*
+ * imc_pmu_create: Takes the parent device (the pmu unit), pmu_index and
+ * domain as inputs. Allocates memory for the struct imc_pmu and sets up
+ * its domain, size and offsets.
+ */
+static struct imc_pmu *imc_pmu_create(struct device_node *parent, int pmu_index, int domain)
+{
+ int ret = 0;
+ struct imc_pmu *pmu_ptr;
+ u32 offset;
+
+ /* Return for unknown domain */
+ if (domain < 0)
+ return NULL;
+
+ /* memory for pmu */
+ pmu_ptr = kzalloc(sizeof(*pmu_ptr), GFP_KERNEL);
+ if (!pmu_ptr)
+ return NULL;
+
+ /* Set the domain */
+ pmu_ptr->domain = domain;
+
+ ret = of_property_read_u32(parent, "size", &pmu_ptr->counter_mem_size);
+ if (ret)
+ goto free_pmu;
+
+ if (!of_property_read_u32(parent, "offset", &offset)) {
+ if (imc_get_mem_addr_nest(parent, pmu_ptr, offset))
+ goto free_pmu;
+ }
+
+ /* Function to register IMC pmu */
+ ret = init_imc_pmu(parent, pmu_ptr, pmu_index);
+ if (ret) {
+ pr_err("IMC PMU %s Register failed\n", pmu_ptr->pmu.name);
+ kfree(pmu_ptr->pmu.name);
+ if (pmu_ptr->domain == IMC_DOMAIN_NEST)
+ kfree(pmu_ptr->mem_info);
+ kfree(pmu_ptr);
+ return NULL;
+ }
+
+ return pmu_ptr;
+
+free_pmu:
+ kfree(pmu_ptr);
+ return NULL;
+}
+
+static void disable_nest_pmu_counters(void)
+{
+ int nid, cpu;
+ const struct cpumask *l_cpumask;
+
+ get_online_cpus();
+ for_each_node_with_cpus(nid) {
+ l_cpumask = cpumask_of_node(nid);
+ cpu = cpumask_first_and(l_cpumask, cpu_online_mask);
+ if (cpu >= nr_cpu_ids)
+ continue;
+ opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST,
+ get_hard_smp_processor_id(cpu));
+ }
+ put_online_cpus();
+}
+
+static void disable_core_pmu_counters(void)
+{
+ cpumask_t cores_map;
+ int cpu, rc;
+
+ get_online_cpus();
+ /* Disable the IMC Core functions */
+ cores_map = cpu_online_cores_map();
+ for_each_cpu(cpu, &cores_map) {
+ rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE,
+ get_hard_smp_processor_id(cpu));
+ if (rc)
+ pr_err("%s: Failed to stop Core (cpu = %d)\n",
+				__func__, cpu);
+ }
+ put_online_cpus();
+}
+
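+/* Count the nest (chip-scope) IMC units described in the device tree. */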
+int get_max_nest_dev(void)
+{
+ struct device_node *node;
+ u32 pmu_units = 0, type;
+
+ for_each_compatible_node(node, NULL, IMC_DTB_UNIT_COMPAT) {
+ if (of_property_read_u32(node, "type", &type))
+ continue;
+
+ if (type == IMC_TYPE_CHIP)
+ pmu_units++;
+ }
+
+ return pmu_units;
+}
+
+static int opal_imc_counters_probe(struct platform_device *pdev)
+{
+ struct device_node *imc_dev = pdev->dev.of_node;
+ struct imc_pmu *pmu;
+ int pmu_count = 0, domain;
+ bool core_imc_reg = false, thread_imc_reg = false;
+ u32 type;
+
+	/*
+	 * Check whether this is a kdump kernel. If yes, force the engines
+	 * to stop and return.
+	 */
+ if (is_kdump_kernel()) {
+ disable_nest_pmu_counters();
+ disable_core_pmu_counters();
+ return -ENODEV;
+ }
+
+ for_each_compatible_node(imc_dev, NULL, IMC_DTB_UNIT_COMPAT) {
+ pmu = NULL;
+ if (of_property_read_u32(imc_dev, "type", &type)) {
+ pr_warn("IMC Device without type property\n");
+ continue;
+ }
+
+ switch (type) {
+ case IMC_TYPE_CHIP:
+ domain = IMC_DOMAIN_NEST;
+ break;
+ case IMC_TYPE_CORE:
+			domain = IMC_DOMAIN_CORE;
+ break;
+ case IMC_TYPE_THREAD:
+ domain = IMC_DOMAIN_THREAD;
+ break;
+ default:
+			pr_warn("IMC Unknown Device type\n");
+ domain = -1;
+ break;
+ }
+
+ pmu = imc_pmu_create(imc_dev, pmu_count, domain);
+ if (pmu != NULL) {
+ if (domain == IMC_DOMAIN_NEST) {
+ if (!imc_debugfs_parent)
+ export_imc_mode_and_cmd(imc_dev, pmu);
+ pmu_count++;
+ }
+ if (domain == IMC_DOMAIN_CORE)
+ core_imc_reg = true;
+ if (domain == IMC_DOMAIN_THREAD)
+ thread_imc_reg = true;
+ }
+ }
+
+ /* If core imc is not registered, unregister thread-imc */
+ if (!core_imc_reg && thread_imc_reg)
+ unregister_thread_imc();
+
+ return 0;
+}
+
+static void opal_imc_counters_shutdown(struct platform_device *pdev)
+{
+	/*
+	 * This only stops the engines, which is the bare minimum.
+	 * TODO: handle proper memory cleanup and pmu unregistration.
+	 */
+ disable_nest_pmu_counters();
+ disable_core_pmu_counters();
+}
+
+static const struct of_device_id opal_imc_match[] = {
+ { .compatible = IMC_DTB_COMPAT },
+ {},
+};
+
+static struct platform_driver opal_imc_driver = {
+ .driver = {
+ .name = "opal-imc-counters",
+ .of_match_table = opal_imc_match,
+ },
+ .probe = opal_imc_counters_probe,
+ .shutdown = opal_imc_counters_shutdown,
+};
+
+builtin_platform_driver(opal_imc_driver);
diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c
new file mode 100644
index 000000000..bc97770a6
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-irqchip.c
@@ -0,0 +1,317 @@
+/*
+ * This file implements an irqchip for OPAL events. Whenever there is
+ * an interrupt that is handled by OPAL we get passed a list of events
+ * that Linux needs to do something about. These basically look like
+ * interrupts to Linux so we implement an irqchip to handle them.
+ *
+ * Copyright Alistair Popple, IBM Corporation 2014.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+#include <linux/bitops.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqdomain.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/of_irq.h>
+
+#include <asm/machdep.h>
+#include <asm/opal.h>
+
+#include "powernv.h"
+
+/* Maximum number of events supported by OPAL firmware */
+#define MAX_NUM_EVENTS 64
+
+struct opal_event_irqchip {
+ struct irq_chip irqchip;
+ struct irq_domain *domain;
+ unsigned long mask;
+};
+static struct opal_event_irqchip opal_event_irqchip;
+static u64 last_outstanding_events;
+static int opal_irq_count;
+static struct resource *opal_irqs;
+
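+/*
+ * Handle all pending OPAL events: first those cached in
+ * last_outstanding_events by the interrupt handler, then keep calling
+ * opal_poll_events() until no unmasked event remains.
+ */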
+void opal_handle_events(void)
+{
+ __be64 events = 0;
+ u64 e;
+
+ e = READ_ONCE(last_outstanding_events) & opal_event_irqchip.mask;
+again:
+ while (e) {
+ int virq, hwirq;
+
+ hwirq = fls64(e) - 1;
+ e &= ~BIT_ULL(hwirq);
+
+ local_irq_disable();
+ virq = irq_find_mapping(opal_event_irqchip.domain, hwirq);
+ if (virq) {
+ irq_enter();
+ generic_handle_irq(virq);
+ irq_exit();
+ }
+ local_irq_enable();
+
+ cond_resched();
+ }
+ last_outstanding_events = 0;
+ if (opal_poll_events(&events) != OPAL_SUCCESS)
+ return;
+ e = be64_to_cpu(events) & opal_event_irqchip.mask;
+ if (e)
+ goto again;
+}
+
+bool opal_have_pending_events(void)
+{
+ if (last_outstanding_events & opal_event_irqchip.mask)
+ return true;
+ return false;
+}
+
+static void opal_event_mask(struct irq_data *d)
+{
+ clear_bit(d->hwirq, &opal_event_irqchip.mask);
+}
+
+static void opal_event_unmask(struct irq_data *d)
+{
+ set_bit(d->hwirq, &opal_event_irqchip.mask);
+ if (opal_have_pending_events())
+ opal_wake_poller();
+}
+
+static int opal_event_set_type(struct irq_data *d, unsigned int flow_type)
+{
+ /*
+ * For now we only support level triggered events. The irq
+ * handler will be called continuously until the event has
+ * been cleared in OPAL.
+ */
+ if (flow_type != IRQ_TYPE_LEVEL_HIGH)
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct opal_event_irqchip opal_event_irqchip = {
+ .irqchip = {
+ .name = "OPAL EVT",
+ .irq_mask = opal_event_mask,
+ .irq_unmask = opal_event_unmask,
+ .irq_set_type = opal_event_set_type,
+ },
+ .mask = 0,
+};
+
+static int opal_event_map(struct irq_domain *d, unsigned int irq,
+ irq_hw_number_t hwirq)
+{
+ irq_set_chip_data(irq, &opal_event_irqchip);
+ irq_set_chip_and_handler(irq, &opal_event_irqchip.irqchip,
+ handle_level_irq);
+
+ return 0;
+}
+
+static irqreturn_t opal_interrupt(int irq, void *data)
+{
+ __be64 events;
+
+ opal_handle_interrupt(virq_to_hw(irq), &events);
+ last_outstanding_events = be64_to_cpu(events);
+ if (opal_have_pending_events())
+ opal_wake_poller();
+
+ return IRQ_HANDLED;
+}
+
+static int opal_event_match(struct irq_domain *h, struct device_node *node,
+ enum irq_domain_bus_token bus_token)
+{
+ return irq_domain_get_of_node(h) == node;
+}
+
+static int opal_event_xlate(struct irq_domain *h, struct device_node *np,
+ const u32 *intspec, unsigned int intsize,
+ irq_hw_number_t *out_hwirq, unsigned int *out_flags)
+{
+ *out_hwirq = intspec[0];
+ *out_flags = IRQ_TYPE_LEVEL_HIGH;
+
+ return 0;
+}
+
+static const struct irq_domain_ops opal_event_domain_ops = {
+ .match = opal_event_match,
+ .map = opal_event_map,
+ .xlate = opal_event_xlate,
+};
+
+void opal_event_shutdown(void)
+{
+ unsigned int i;
+
+ /* First free interrupts, which will also mask them */
+ for (i = 0; i < opal_irq_count; i++) {
+ if (!opal_irqs || !opal_irqs[i].start)
+ continue;
+
+ if (in_interrupt() || irqs_disabled())
+ disable_irq_nosync(opal_irqs[i].start);
+ else
+ free_irq(opal_irqs[i].start, NULL);
+
+ opal_irqs[i].start = 0;
+ }
+}
+
+int __init opal_event_init(void)
+{
+ struct device_node *dn, *opal_node;
+ bool old_style = false;
+ int i, rc = 0;
+
+ opal_node = of_find_node_by_path("/ibm,opal");
+ if (!opal_node) {
+ pr_warn("opal: Node not found\n");
+ return -ENODEV;
+ }
+
+	/* If dn is NULL it means the domain won't be linked to a DT
+	 * node, so irq_of_parse_and_map(...) won't work. But that
+	 * shouldn't be a problem because if we're running a version
+	 * of skiboot that doesn't have the dn then the devices won't
+	 * have the correct properties and will have to fall back to
+	 * the legacy method (opal_event_request(...)) anyway. */
+ dn = of_find_compatible_node(NULL, NULL, "ibm,opal-event");
+ opal_event_irqchip.domain = irq_domain_add_linear(dn, MAX_NUM_EVENTS,
+ &opal_event_domain_ops, &opal_event_irqchip);
+ of_node_put(dn);
+ if (!opal_event_irqchip.domain) {
+ pr_warn("opal: Unable to create irq domain\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ /* Look for new-style (standard) "interrupts" property */
+ opal_irq_count = of_irq_count(opal_node);
+
+ /* Absent ? Look for the old one */
+ if (opal_irq_count < 1) {
+ /* Get opal-interrupts property and names if present */
+ rc = of_property_count_u32_elems(opal_node, "opal-interrupts");
+ if (rc > 0)
+ opal_irq_count = rc;
+ old_style = true;
+ }
+
+ /* No interrupts ? Bail out */
+ if (!opal_irq_count)
+ goto out;
+
+ pr_debug("OPAL: Found %d interrupts reserved for OPAL using %s scheme\n",
+ opal_irq_count, old_style ? "old" : "new");
+
+ /* Allocate an IRQ resources array */
+ opal_irqs = kcalloc(opal_irq_count, sizeof(struct resource), GFP_KERNEL);
+ if (WARN_ON(!opal_irqs)) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ /* Build the resources array */
+ if (old_style) {
+ /* Old style "opal-interrupts" property */
+ for (i = 0; i < opal_irq_count; i++) {
+ struct resource *r = &opal_irqs[i];
+ const char *name = NULL;
+ u32 hw_irq;
+ int virq;
+
+ rc = of_property_read_u32_index(opal_node, "opal-interrupts",
+ i, &hw_irq);
+ if (WARN_ON(rc < 0)) {
+ opal_irq_count = i;
+ break;
+ }
+ of_property_read_string_index(opal_node, "opal-interrupts-names",
+ i, &name);
+ virq = irq_create_mapping(NULL, hw_irq);
+ if (!virq) {
+ pr_warn("Failed to map OPAL irq 0x%x\n", hw_irq);
+ continue;
+ }
+ r->start = r->end = virq;
+ r->flags = IORESOURCE_IRQ | IRQ_TYPE_LEVEL_LOW;
+ r->name = name;
+ }
+ } else {
+ /* new style standard "interrupts" property */
+ rc = of_irq_to_resource_table(opal_node, opal_irqs, opal_irq_count);
+ if (WARN_ON(rc < 0)) {
+ opal_irq_count = 0;
+ kfree(opal_irqs);
+ goto out;
+ }
+ if (WARN_ON(rc < opal_irq_count))
+ opal_irq_count = rc;
+ }
+
+ /* Install interrupt handlers */
+ for (i = 0; i < opal_irq_count; i++) {
+ struct resource *r = &opal_irqs[i];
+ const char *name;
+
+ /* Prefix name */
+ if (r->name && strlen(r->name))
+ name = kasprintf(GFP_KERNEL, "opal-%s", r->name);
+ else
+ name = kasprintf(GFP_KERNEL, "opal");
+
+ /* Install interrupt handler */
+ rc = request_irq(r->start, opal_interrupt, r->flags & IRQD_TRIGGER_MASK,
+ name, NULL);
+ if (rc) {
+ pr_warn("Error %d requesting OPAL irq %d\n", rc, (int)r->start);
+ continue;
+ }
+ }
+ rc = 0;
+ out:
+ of_node_put(opal_node);
+ return rc;
+}
+machine_arch_initcall(powernv, opal_event_init);
+
+/**
+ * opal_event_request(unsigned int opal_event_nr) - Request an event
+ * @opal_event_nr: the opal event number to request
+ *
+ * This routine can be used to find the linux virq number which can
+ * then be passed to request_irq to assign a handler for a particular
+ * opal event. This should only be used by legacy devices which don't
+ * have proper device tree bindings. Most devices should use
+ * irq_of_parse_and_map() instead.
+ */
+int opal_event_request(unsigned int opal_event_nr)
+{
+ if (WARN_ON_ONCE(!opal_event_irqchip.domain))
+ return 0;
+
+ return irq_create_mapping(opal_event_irqchip.domain, opal_event_nr);
+}
+EXPORT_SYMBOL(opal_event_request);
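+
+/*
+ * Example usage by a legacy consumer (sketch only; "my_handler" and
+ * "my-opal-evt" are placeholder names):
+ *
+ *	int virq = opal_event_request(ilog2(OPAL_EVENT_ERROR_LOG_AVAIL));
+ *
+ *	if (virq)
+ *		rc = request_irq(virq, my_handler, IRQF_TRIGGER_HIGH,
+ *				 "my-opal-evt", NULL);
+ */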
diff --git a/arch/powerpc/platforms/powernv/opal-kmsg.c b/arch/powerpc/platforms/powernv/opal-kmsg.c
new file mode 100644
index 000000000..55691950d
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-kmsg.c
@@ -0,0 +1,51 @@
+/*
+ * kmsg dumper that ensures the OPAL console fully flushes panic messages
+ *
+ * Author: Russell Currey <ruscur@russell.cc>
+ *
+ * Copyright 2015 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/kmsg_dump.h>
+
+#include <asm/opal.h>
+#include <asm/opal-api.h>
+
+/*
+ * Console output is controlled by OPAL firmware. The kernel regularly calls
+ * OPAL_POLL_EVENTS, which flushes some console output. In a panic state,
+ * however, the kernel no longer calls OPAL_POLL_EVENTS and the panic message
+ * may not be completely printed. This function does not actually dump the
+ * message, it just ensures that OPAL completely flushes the console buffer.
+ */
+static void kmsg_dump_opal_console_flush(struct kmsg_dumper *dumper,
+ enum kmsg_dump_reason reason)
+{
+ /*
+ * Outside of a panic context the pollers will continue to run,
+ * so we don't need to do any special flushing.
+ */
+ if (reason != KMSG_DUMP_PANIC)
+ return;
+
+ opal_flush_console(0);
+}
+
+static struct kmsg_dumper opal_kmsg_dumper = {
+ .dump = kmsg_dump_opal_console_flush
+};
+
+void __init opal_kmsg_init(void)
+{
+ int rc;
+
+ /* Add our dumper to the list */
+ rc = kmsg_dump_register(&opal_kmsg_dumper);
+ if (rc != 0)
+ pr_err("opal: kmsg_dump_register failed; returned %d\n", rc);
+}
diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c
new file mode 100644
index 000000000..21f0edcfb
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-lpc.c
@@ -0,0 +1,422 @@
+/*
+ * PowerNV LPC bus handling.
+ *
+ * Copyright 2013 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/bug.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/opal.h>
+#include <asm/prom.h>
+#include <linux/uaccess.h>
+#include <asm/debugfs.h>
+#include <asm/isa-bridge.h>
+
+static int opal_lpc_chip_id = -1;
+
+static u8 opal_lpc_inb(unsigned long port)
+{
+ int64_t rc;
+ __be32 data;
+
+ if (opal_lpc_chip_id < 0 || port > 0xffff)
+ return 0xff;
+ rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 1);
+ return rc ? 0xff : be32_to_cpu(data);
+}
+
+static __le16 __opal_lpc_inw(unsigned long port)
+{
+ int64_t rc;
+ __be32 data;
+
+ if (opal_lpc_chip_id < 0 || port > 0xfffe)
+ return 0xffff;
+ if (port & 1)
+ return (__le16)opal_lpc_inb(port) << 8 | opal_lpc_inb(port + 1);
+ rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 2);
+ return rc ? 0xffff : be32_to_cpu(data);
+}
+static u16 opal_lpc_inw(unsigned long port)
+{
+ return le16_to_cpu(__opal_lpc_inw(port));
+}
+
+static __le32 __opal_lpc_inl(unsigned long port)
+{
+ int64_t rc;
+ __be32 data;
+
+ if (opal_lpc_chip_id < 0 || port > 0xfffc)
+ return 0xffffffff;
+ if (port & 3)
+ return (__le32)opal_lpc_inb(port ) << 24 |
+ (__le32)opal_lpc_inb(port + 1) << 16 |
+ (__le32)opal_lpc_inb(port + 2) << 8 |
+ opal_lpc_inb(port + 3);
+ rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 4);
+ return rc ? 0xffffffff : be32_to_cpu(data);
+}
+
+static u32 opal_lpc_inl(unsigned long port)
+{
+ return le32_to_cpu(__opal_lpc_inl(port));
+}
+
+static void opal_lpc_outb(u8 val, unsigned long port)
+{
+ if (opal_lpc_chip_id < 0 || port > 0xffff)
+ return;
+ opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 1);
+}
+
+static void __opal_lpc_outw(__le16 val, unsigned long port)
+{
+ if (opal_lpc_chip_id < 0 || port > 0xfffe)
+ return;
+ if (port & 1) {
+ opal_lpc_outb(val >> 8, port);
+ opal_lpc_outb(val , port + 1);
+ return;
+ }
+ opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 2);
+}
+
+static void opal_lpc_outw(u16 val, unsigned long port)
+{
+ __opal_lpc_outw(cpu_to_le16(val), port);
+}
+
+static void __opal_lpc_outl(__le32 val, unsigned long port)
+{
+ if (opal_lpc_chip_id < 0 || port > 0xfffc)
+ return;
+ if (port & 3) {
+ opal_lpc_outb(val >> 24, port);
+ opal_lpc_outb(val >> 16, port + 1);
+ opal_lpc_outb(val >> 8, port + 2);
+ opal_lpc_outb(val , port + 3);
+ return;
+ }
+ opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 4);
+}
+
+static void opal_lpc_outl(u32 val, unsigned long port)
+{
+ __opal_lpc_outl(cpu_to_le32(val), port);
+}
+
+static void opal_lpc_insb(unsigned long p, void *b, unsigned long c)
+{
+ u8 *ptr = b;
+
+	while (c--)
+ *(ptr++) = opal_lpc_inb(p);
+}
+
+static void opal_lpc_insw(unsigned long p, void *b, unsigned long c)
+{
+ __le16 *ptr = b;
+
+	while (c--)
+ *(ptr++) = __opal_lpc_inw(p);
+}
+
+static void opal_lpc_insl(unsigned long p, void *b, unsigned long c)
+{
+ __le32 *ptr = b;
+
+	while (c--)
+ *(ptr++) = __opal_lpc_inl(p);
+}
+
+static void opal_lpc_outsb(unsigned long p, const void *b, unsigned long c)
+{
+ const u8 *ptr = b;
+
+	while (c--)
+ opal_lpc_outb(*(ptr++), p);
+}
+
+static void opal_lpc_outsw(unsigned long p, const void *b, unsigned long c)
+{
+ const __le16 *ptr = b;
+
+	while (c--)
+ __opal_lpc_outw(*(ptr++), p);
+}
+
+static void opal_lpc_outsl(unsigned long p, const void *b, unsigned long c)
+{
+ const __le32 *ptr = b;
+
+	while (c--)
+ __opal_lpc_outl(*(ptr++), p);
+}
+
+static const struct ppc_pci_io opal_lpc_io = {
+ .inb = opal_lpc_inb,
+ .inw = opal_lpc_inw,
+ .inl = opal_lpc_inl,
+ .outb = opal_lpc_outb,
+ .outw = opal_lpc_outw,
+ .outl = opal_lpc_outl,
+ .insb = opal_lpc_insb,
+ .insw = opal_lpc_insw,
+ .insl = opal_lpc_insl,
+ .outsb = opal_lpc_outsb,
+ .outsw = opal_lpc_outsw,
+ .outsl = opal_lpc_outsl,
+};
+
+#ifdef CONFIG_DEBUG_FS
+struct lpc_debugfs_entry {
+ enum OpalLPCAddressType lpc_type;
+};
+
+static ssize_t lpc_debug_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct lpc_debugfs_entry *lpc = filp->private_data;
+ u32 data, pos, len, todo;
+ int rc;
+
+ if (!access_ok(VERIFY_WRITE, ubuf, count))
+ return -EFAULT;
+
+ todo = count;
+ while (todo) {
+ pos = *ppos;
+
+ /*
+ * Select access size based on count and alignment and
+		 * access type. IO and MEM only support byte accesses,
+ * FW supports all 3.
+ */
+ len = 1;
+ if (lpc->lpc_type == OPAL_LPC_FW) {
+ if (todo > 3 && (pos & 3) == 0)
+ len = 4;
+ else if (todo > 1 && (pos & 1) == 0)
+ len = 2;
+ }
+ rc = opal_lpc_read(opal_lpc_chip_id, lpc->lpc_type, pos,
+ &data, len);
+ if (rc)
+ return -ENXIO;
+
+ /*
+ * Now there is some trickery with the data returned by OPAL
+ * as it's the desired data right justified in a 32-bit BE
+ * word.
+ *
+ * This is a very bad interface and I'm to blame for it :-(
+ *
+ * So we can't just apply a 32-bit swap to what comes from OPAL,
+ * because user space expects the *bytes* to be in their proper
+ * respective positions (ie, LPC position).
+ *
+ * So what we really want to do here is to shift data right
+ * appropriately on a LE kernel.
+ *
+ * IE. If the LPC transaction has bytes B0, B1, B2 and B3 in that
+ * order, we have in memory written to by OPAL at the "data"
+ * pointer:
+ *
+ * Bytes: OPAL "data" LE "data"
+ * 32-bit: B0 B1 B2 B3 B0B1B2B3 B3B2B1B0
+ * 16-bit: B0 B1 0000B0B1 B1B00000
+ * 8-bit: B0 000000B0 B0000000
+ *
+ * So a BE kernel will have the leftmost of the above in the MSB
+ * and rightmost in the LSB and can just then "cast" the u32 "data"
+ * down to the appropriate quantity and write it.
+ *
+ * However, an LE kernel can't. It doesn't need to swap because a
+ * load from data followed by a store to user are going to preserve
+ * the byte ordering which is the wire byte order which is what the
+ * user wants, but in order to "crop" to the right size, we need to
+ * shift right first.
+ */
+		switch (len) {
+ case 4:
+ rc = __put_user((u32)data, (u32 __user *)ubuf);
+ break;
+ case 2:
+#ifdef __LITTLE_ENDIAN__
+ data >>= 16;
+#endif
+ rc = __put_user((u16)data, (u16 __user *)ubuf);
+ break;
+ default:
+#ifdef __LITTLE_ENDIAN__
+ data >>= 24;
+#endif
+ rc = __put_user((u8)data, (u8 __user *)ubuf);
+ break;
+ }
+ if (rc)
+ return -EFAULT;
+ *ppos += len;
+ ubuf += len;
+ todo -= len;
+ }
+
+ return count;
+}
+
+static ssize_t lpc_debug_write(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct lpc_debugfs_entry *lpc = filp->private_data;
+ u32 data, pos, len, todo;
+ int rc;
+
+ if (!access_ok(VERIFY_READ, ubuf, count))
+ return -EFAULT;
+
+ todo = count;
+ while (todo) {
+ pos = *ppos;
+
+ /*
+ * Select access size based on count and alignment and
+		 * access type. IO and MEM only support byte accesses,
+ * FW supports all 3.
+ */
+ len = 1;
+ if (lpc->lpc_type == OPAL_LPC_FW) {
+ if (todo > 3 && (pos & 3) == 0)
+ len = 4;
+ else if (todo > 1 && (pos & 1) == 0)
+ len = 2;
+ }
+
+ /*
+ * Similarly to the read case, we have some trickery here but
+ * it's different to handle. We need to pass the value to OPAL in
+ * a register whose layout depends on the access size. We want
+ * to reproduce the memory layout of the user, however we aren't
+ * doing a load from user and a store to another memory location
+ * which would achieve that. Here we pass the value to OPAL via
+ * a register which is expected to contain the "BE" interpretation
+ * of the byte sequence. IE: for a 32-bit access, byte 0 should be
+ * in the MSB. So here we *do* need to byteswap on LE.
+ *
+ * User bytes: LE "data" OPAL "data"
+ * 32-bit: B0 B1 B2 B3 B3B2B1B0 B0B1B2B3
+ * 16-bit: B0 B1 0000B1B0 0000B0B1
+ * 8-bit: B0 000000B0 000000B0
+ */
+		switch (len) {
+ case 4:
+ rc = __get_user(data, (u32 __user *)ubuf);
+ data = cpu_to_be32(data);
+ break;
+ case 2:
+ rc = __get_user(data, (u16 __user *)ubuf);
+ data = cpu_to_be16(data);
+ break;
+ default:
+ rc = __get_user(data, (u8 __user *)ubuf);
+ break;
+ }
+ if (rc)
+ return -EFAULT;
+
+ rc = opal_lpc_write(opal_lpc_chip_id, lpc->lpc_type, pos,
+ data, len);
+ if (rc)
+ return -ENXIO;
+ *ppos += len;
+ ubuf += len;
+ todo -= len;
+ }
+
+ return count;
+}
+
+static const struct file_operations lpc_fops = {
+ .read = lpc_debug_read,
+ .write = lpc_debug_write,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static int opal_lpc_debugfs_create_type(struct dentry *folder,
+ const char *fname,
+ enum OpalLPCAddressType type)
+{
+	struct lpc_debugfs_entry *entry;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+ entry->lpc_type = type;
+ debugfs_create_file(fname, 0600, folder, entry, &lpc_fops);
+ return 0;
+}
+
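+/*
+ * Create one debugfs file per LPC address space ("io", "mem" and "fw")
+ * under the "lpc" directory, each backed by lpc_debug_read() and
+ * lpc_debug_write() above.
+ */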
+static int opal_lpc_init_debugfs(void)
+{
+ struct dentry *root;
+ int rc = 0;
+
+ if (opal_lpc_chip_id < 0)
+ return -ENODEV;
+
+ root = debugfs_create_dir("lpc", powerpc_debugfs_root);
+
+ rc |= opal_lpc_debugfs_create_type(root, "io", OPAL_LPC_IO);
+ rc |= opal_lpc_debugfs_create_type(root, "mem", OPAL_LPC_MEM);
+ rc |= opal_lpc_debugfs_create_type(root, "fw", OPAL_LPC_FW);
+ return rc;
+}
+machine_device_initcall(powernv, opal_lpc_init_debugfs);
+#endif /* CONFIG_DEBUG_FS */
+
+void __init opal_lpc_init(void)
+{
+ struct device_node *np;
+
+	/*
+	 * Look for a Power8 LPC bus tagged as "primary"; we currently
+	 * support only one, though the OPAL APIs support any number.
+	 */
+ for_each_compatible_node(np, NULL, "ibm,power8-lpc") {
+ if (!of_device_is_available(np))
+ continue;
+ if (!of_get_property(np, "primary", NULL))
+ continue;
+ opal_lpc_chip_id = of_get_ibm_chip_id(np);
+ of_node_put(np);
+ break;
+ }
+ if (opal_lpc_chip_id < 0)
+ return;
+
+ /* Does it support direct mapping ? */
+ if (of_get_property(np, "ranges", NULL)) {
+ pr_info("OPAL: Found memory mapped LPC bus on chip %d\n",
+ opal_lpc_chip_id);
+ isa_bridge_init_non_pci(np);
+ } else {
+ pr_info("OPAL: Found non-mapped LPC bus on chip %d\n",
+ opal_lpc_chip_id);
+
+ /* Setup special IO ops */
+ ppc_pci_io = opal_lpc_io;
+ isa_io_special = true;
+ }
+}
diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c
new file mode 100644
index 000000000..dcb42bcb5
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c
@@ -0,0 +1,147 @@
+/*
+ * OPAL asynchronous memory error handling support in PowerNV.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2013 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include <asm/machdep.h>
+#include <asm/opal.h>
+#include <asm/cputable.h>
+
+static int opal_mem_err_nb_init;
+static LIST_HEAD(opal_memory_err_list);
+static DEFINE_SPINLOCK(opal_mem_err_lock);
+
+struct OpalMsgNode {
+ struct list_head list;
+ struct opal_msg msg;
+};
+
+static void handle_memory_error_event(struct OpalMemoryErrorData *merr_evt)
+{
+ uint64_t paddr_start, paddr_end;
+
+ pr_debug("%s: Retrieved memory error event, type: 0x%x\n",
+ __func__, merr_evt->type);
+ switch (merr_evt->type) {
+ case OPAL_MEM_ERR_TYPE_RESILIENCE:
+ paddr_start = be64_to_cpu(merr_evt->u.resilience.physical_address_start);
+ paddr_end = be64_to_cpu(merr_evt->u.resilience.physical_address_end);
+ break;
+ case OPAL_MEM_ERR_TYPE_DYN_DALLOC:
+ paddr_start = be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_start);
+ paddr_end = be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_end);
+ break;
+ default:
+ return;
+ }
+
+ for (; paddr_start < paddr_end; paddr_start += PAGE_SIZE) {
+ memory_failure(paddr_start >> PAGE_SHIFT, 0);
+ }
+}
+
+static void handle_memory_error(void)
+{
+ unsigned long flags;
+ struct OpalMemoryErrorData *merr_evt;
+ struct OpalMsgNode *msg_node;
+
+ spin_lock_irqsave(&opal_mem_err_lock, flags);
+ while (!list_empty(&opal_memory_err_list)) {
+ msg_node = list_entry(opal_memory_err_list.next,
+ struct OpalMsgNode, list);
+ list_del(&msg_node->list);
+ spin_unlock_irqrestore(&opal_mem_err_lock, flags);
+
+ merr_evt = (struct OpalMemoryErrorData *)
+ &msg_node->msg.params[0];
+ handle_memory_error_event(merr_evt);
+ kfree(msg_node);
+ spin_lock_irqsave(&opal_mem_err_lock, flags);
+ }
+ spin_unlock_irqrestore(&opal_mem_err_lock, flags);
+}
+
+static void mem_error_handler(struct work_struct *work)
+{
+ handle_memory_error();
+}
+
+static DECLARE_WORK(mem_error_work, mem_error_handler);
+
+/*
+ * opal_memory_err_event - notifier handler that queues up the opal message
+ * to be processed later.
+ */
+static int opal_memory_err_event(struct notifier_block *nb,
+ unsigned long msg_type, void *msg)
+{
+ unsigned long flags;
+ struct OpalMsgNode *msg_node;
+
+ if (msg_type != OPAL_MSG_MEM_ERR)
+ return 0;
+
+ msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
+ if (!msg_node) {
+		pr_err("MEMORY_ERROR: out of memory, Opal message event not handled\n");
+ return -ENOMEM;
+ }
+ memcpy(&msg_node->msg, msg, sizeof(msg_node->msg));
+
+ spin_lock_irqsave(&opal_mem_err_lock, flags);
+ list_add(&msg_node->list, &opal_memory_err_list);
+ spin_unlock_irqrestore(&opal_mem_err_lock, flags);
+
+ schedule_work(&mem_error_work);
+ return 0;
+}
+
+static struct notifier_block opal_mem_err_nb = {
+ .notifier_call = opal_memory_err_event,
+ .next = NULL,
+ .priority = 0,
+};
+
+static int __init opal_mem_err_init(void)
+{
+ int ret;
+
+ if (!opal_mem_err_nb_init) {
+ ret = opal_message_notifier_register(
+ OPAL_MSG_MEM_ERR, &opal_mem_err_nb);
+ if (ret) {
+ pr_err("%s: Can't register OPAL event notifier (%d)\n",
+ __func__, ret);
+ return ret;
+ }
+ opal_mem_err_nb_init = 1;
+ }
+ return 0;
+}
+machine_device_initcall(powernv, opal_mem_err_init);
diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c
new file mode 100644
index 000000000..06628c71c
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-msglog.c
@@ -0,0 +1,142 @@
+/*
+ * PowerNV OPAL in-memory console interface
+ *
+ * Copyright 2014 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/io.h>
+#include <asm/opal.h>
+#include <linux/debugfs.h>
+#include <linux/of.h>
+#include <linux/types.h>
+#include <asm/barrier.h>
+
+/* OPAL in-memory console. Defined in OPAL source at core/console.c */
+struct memcons {
+ __be64 magic;
+#define MEMCONS_MAGIC 0x6630696567726173L
+ __be64 obuf_phys;
+ __be64 ibuf_phys;
+ __be32 obuf_size;
+ __be32 ibuf_size;
+ __be32 out_pos;
+#define MEMCONS_OUT_POS_WRAP 0x80000000u
+#define MEMCONS_OUT_POS_MASK 0x00ffffffu
+ __be32 in_prod;
+ __be32 in_cons;
+};
+
+static struct memcons *opal_memcons = NULL;
+
+ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
+{
+ const char *conbuf;
+ ssize_t ret;
+ size_t first_read = 0;
+ uint32_t out_pos, avail;
+
+ if (!opal_memcons)
+ return -ENODEV;
+
+ out_pos = be32_to_cpu(READ_ONCE(opal_memcons->out_pos));
+
+ /* Now we've read out_pos, put a barrier in before reading the new
+ * data it points to in conbuf. */
+ smp_rmb();
+
+ conbuf = phys_to_virt(be64_to_cpu(opal_memcons->obuf_phys));
+
+ /* When the buffer has wrapped, read from the out_pos marker to the end
+ * of the buffer, and then read the remaining data as in the un-wrapped
+ * case. */
+ if (out_pos & MEMCONS_OUT_POS_WRAP) {
+
+ out_pos &= MEMCONS_OUT_POS_MASK;
+ avail = be32_to_cpu(opal_memcons->obuf_size) - out_pos;
+
+ ret = memory_read_from_buffer(to, count, &pos,
+ conbuf + out_pos, avail);
+
+ if (ret < 0)
+ goto out;
+
+ first_read = ret;
+ to += first_read;
+ count -= first_read;
+ pos -= avail;
+
+ if (count <= 0)
+ goto out;
+ }
+
+ /* Sanity check. The firmware should not do this to us. */
+ if (out_pos > be32_to_cpu(opal_memcons->obuf_size)) {
+ pr_err("OPAL: memory console corruption. Aborting read.\n");
+ return -EINVAL;
+ }
+
+ ret = memory_read_from_buffer(to, count, &pos, conbuf, out_pos);
+
+ if (ret < 0)
+ goto out;
+
+ ret += first_read;
+out:
+ return ret;
+}
+
+static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *bin_attr, char *to,
+ loff_t pos, size_t count)
+{
+ return opal_msglog_copy(to, pos, count);
+}
+
+static struct bin_attribute opal_msglog_attr = {
+ .attr = {.name = "msglog", .mode = 0400},
+ .read = opal_msglog_read
+};
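+
+/*
+ * opal_msglog_sysfs_init() registers the attribute above against opal_kobj,
+ * so the log is normally exposed read-only at /sys/firmware/opal/msglog.
+ */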
+
+void __init opal_msglog_init(void)
+{
+ u64 mcaddr;
+ struct memcons *mc;
+
+ if (of_property_read_u64(opal_node, "ibm,opal-memcons", &mcaddr)) {
+ pr_warn("OPAL: Property ibm,opal-memcons not found, no message log\n");
+ return;
+ }
+
+ mc = phys_to_virt(mcaddr);
+ if (!mc) {
+ pr_warn("OPAL: memory console address is invalid\n");
+ return;
+ }
+
+ if (be64_to_cpu(mc->magic) != MEMCONS_MAGIC) {
+ pr_warn("OPAL: memory console version is invalid\n");
+ return;
+ }
+
+ /* Report maximum size */
+ opal_msglog_attr.size = be32_to_cpu(mc->ibuf_size) +
+ be32_to_cpu(mc->obuf_size);
+
+ opal_memcons = mc;
+}
+
+void __init opal_msglog_sysfs_init(void)
+{
+ if (!opal_memcons) {
+ pr_warn("OPAL: message log initialisation failed, not creating sysfs entry\n");
+ return;
+ }
+
+ if (sysfs_create_bin_file(opal_kobj, &opal_msglog_attr) != 0)
+ pr_warn("OPAL: sysfs file creation failed\n");
+}
diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c
new file mode 100644
index 000000000..5584247f5
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-nvram.c
@@ -0,0 +1,117 @@
+/*
+ * PowerNV nvram code.
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define DEBUG
+
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/of.h>
+
+#include <asm/opal.h>
+#include <asm/nvram.h>
+#include <asm/machdep.h>
+
+static unsigned int nvram_size;
+
+static ssize_t opal_nvram_size(void)
+{
+ return nvram_size;
+}
+
+static ssize_t opal_nvram_read(char *buf, size_t count, loff_t *index)
+{
+ s64 rc;
+ int off;
+
+ if (*index >= nvram_size)
+ return 0;
+ off = *index;
+ if ((off + count) > nvram_size)
+ count = nvram_size - off;
+ rc = opal_read_nvram(__pa(buf), count, off);
+ if (rc != OPAL_SUCCESS)
+ return -EIO;
+ *index += count;
+ return count;
+}
+
+/*
+ * This can be called in the panic path with interrupts off, so use
+ * mdelay in that case.
+ */
+static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index)
+{
+ s64 rc = OPAL_BUSY;
+ int off;
+
+ if (*index >= nvram_size)
+ return 0;
+ off = *index;
+ if ((off + count) > nvram_size)
+ count = nvram_size - off;
+
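+	/*
+	 * Retry while firmware reports the NVRAM as busy; for
+	 * OPAL_BUSY_EVENT also poll for events so OPAL can make progress
+	 * before the next attempt.
+	 */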
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_write_nvram(__pa(buf), count, off);
+ if (rc == OPAL_BUSY_EVENT) {
+ if (in_interrupt() || irqs_disabled())
+ mdelay(OPAL_BUSY_DELAY_MS);
+ else
+ msleep(OPAL_BUSY_DELAY_MS);
+ opal_poll_events(NULL);
+ } else if (rc == OPAL_BUSY) {
+ if (in_interrupt() || irqs_disabled())
+ mdelay(OPAL_BUSY_DELAY_MS);
+ else
+ msleep(OPAL_BUSY_DELAY_MS);
+ }
+ }
+
+ if (rc)
+ return -EIO;
+
+ *index += count;
+ return count;
+}
+
+static int __init opal_nvram_init_log_partitions(void)
+{
+ /* Scan nvram for partitions */
+ nvram_scan_partitions();
+ nvram_init_oops_partition(0);
+ return 0;
+}
+machine_arch_initcall(powernv, opal_nvram_init_log_partitions);
+
+void __init opal_nvram_init(void)
+{
+ struct device_node *np;
+ const __be32 *nbytes_p;
+
+ np = of_find_compatible_node(NULL, NULL, "ibm,opal-nvram");
+ if (np == NULL)
+ return;
+
+ nbytes_p = of_get_property(np, "#bytes", NULL);
+ if (!nbytes_p) {
+ of_node_put(np);
+ return;
+ }
+ nvram_size = be32_to_cpup(nbytes_p);
+
+ pr_info("OPAL nvram setup, %u bytes\n", nvram_size);
+ of_node_put(np);
+
+ ppc_md.nvram_read = opal_nvram_read;
+ ppc_md.nvram_write = opal_nvram_write;
+ ppc_md.nvram_size = opal_nvram_size;
+}
+
diff --git a/arch/powerpc/platforms/powernv/opal-power.c b/arch/powerpc/platforms/powernv/opal-power.c
new file mode 100644
index 000000000..58dc33082
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-power.c
@@ -0,0 +1,179 @@
+/*
+ * PowerNV OPAL power control for graceful shutdown handling
+ *
+ * Copyright 2015 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "opal-power: " fmt
+
+#include <linux/kernel.h>
+#include <linux/reboot.h>
+#include <linux/notifier.h>
+#include <linux/of.h>
+
+#include <asm/opal.h>
+#include <asm/machdep.h>
+
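+/*
+ * Shutdown types passed by firmware in params[0] of an OPAL_MSG_SHUTDOWN
+ * message (handled in opal_power_control_event() below).
+ */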
+#define SOFT_OFF 0x00
+#define SOFT_REBOOT 0x01
+
+/* Detect EPOW event */
+static bool detect_epow(void)
+{
+ u16 epow;
+ int i, rc;
+ __be16 epow_classes;
+ __be16 opal_epow_status[OPAL_SYSEPOW_MAX] = {0};
+
+ /*
+ * Check for EPOW event. Kernel sends supported EPOW classes info
+ * to OPAL. OPAL returns EPOW info along with classes present.
+ */
+ epow_classes = cpu_to_be16(OPAL_SYSEPOW_MAX);
+ rc = opal_get_epow_status(opal_epow_status, &epow_classes);
+ if (rc != OPAL_SUCCESS) {
+ pr_err("Failed to get EPOW event information\n");
+ return false;
+ }
+
+ /* Look for EPOW events present */
+ for (i = 0; i < be16_to_cpu(epow_classes); i++) {
+ epow = be16_to_cpu(opal_epow_status[i]);
+
+ /* Filter events which do not need shutdown. */
+ if (i == OPAL_SYSEPOW_POWER)
+ epow &= ~(OPAL_SYSPOWER_CHNG | OPAL_SYSPOWER_FAIL |
+ OPAL_SYSPOWER_INCL);
+ if (epow)
+ return true;
+ }
+
+ return false;
+}
+
+/* Check for existing EPOW, DPO events */
+static bool poweroff_pending(void)
+{
+ int rc;
+ __be64 opal_dpo_timeout;
+
+ /* Check for DPO event */
+ rc = opal_get_dpo_status(&opal_dpo_timeout);
+ if (rc == OPAL_SUCCESS) {
+ pr_info("Existing DPO event detected.\n");
+ return true;
+ }
+
+ /* Check for EPOW event */
+ if (detect_epow()) {
+ pr_info("Existing EPOW event detected.\n");
+ return true;
+ }
+
+ return false;
+}
+
+/* OPAL power-control events notifier */
+static int opal_power_control_event(struct notifier_block *nb,
+ unsigned long msg_type, void *msg)
+{
+ uint64_t type;
+
+ switch (msg_type) {
+ case OPAL_MSG_EPOW:
+ if (detect_epow()) {
+ pr_info("EPOW msg received. Powering off system\n");
+ orderly_poweroff(true);
+ }
+ break;
+ case OPAL_MSG_DPO:
+ pr_info("DPO msg received. Powering off system\n");
+ orderly_poweroff(true);
+ break;
+ case OPAL_MSG_SHUTDOWN:
+ type = be64_to_cpu(((struct opal_msg *)msg)->params[0]);
+ switch (type) {
+ case SOFT_REBOOT:
+ pr_info("Reboot requested\n");
+ orderly_reboot();
+ break;
+ case SOFT_OFF:
+ pr_info("Poweroff requested\n");
+ orderly_poweroff(true);
+ break;
+ default:
+ pr_err("Unknown power-control type %llu\n", type);
+ }
+ break;
+ default:
+ pr_err("Unknown OPAL message type %lu\n", msg_type);
+ }
+
+ return 0;
+}
+
+/* OPAL EPOW event notifier block */
+static struct notifier_block opal_epow_nb = {
+ .notifier_call = opal_power_control_event,
+ .next = NULL,
+ .priority = 0,
+};
+
+/* OPAL DPO event notifier block */
+static struct notifier_block opal_dpo_nb = {
+ .notifier_call = opal_power_control_event,
+ .next = NULL,
+ .priority = 0,
+};
+
+/* OPAL power-control event notifier block */
+static struct notifier_block opal_power_control_nb = {
+ .notifier_call = opal_power_control_event,
+ .next = NULL,
+ .priority = 0,
+};
+
+static int __init opal_power_control_init(void)
+{
+ int ret, supported = 0;
+ struct device_node *np;
+
+ /* Register OPAL power-control events notifier */
+ ret = opal_message_notifier_register(OPAL_MSG_SHUTDOWN,
+ &opal_power_control_nb);
+ if (ret)
+ pr_err("Failed to register SHUTDOWN notifier, ret = %d\n", ret);
+
+ /* Determine OPAL EPOW, DPO support */
+ np = of_find_node_by_path("/ibm,opal/epow");
+ if (np) {
+ supported = of_device_is_compatible(np, "ibm,opal-v3-epow");
+ of_node_put(np);
+ }
+
+ if (!supported)
+ return 0;
+ pr_info("OPAL EPOW, DPO support detected.\n");
+
+ /* Register EPOW event notifier */
+ ret = opal_message_notifier_register(OPAL_MSG_EPOW, &opal_epow_nb);
+ if (ret)
+ pr_err("Failed to register EPOW notifier, ret = %d\n", ret);
+
+ /* Register DPO event notifier */
+ ret = opal_message_notifier_register(OPAL_MSG_DPO, &opal_dpo_nb);
+ if (ret)
+ pr_err("Failed to register DPO notifier, ret = %d\n", ret);
+
+ /* Check for any pending EPOW or DPO events. */
+ if (poweroff_pending())
+ orderly_poweroff(true);
+
+ return 0;
+}
+machine_subsys_initcall(powernv, opal_power_control_init);
diff --git a/arch/powerpc/platforms/powernv/opal-powercap.c b/arch/powerpc/platforms/powernv/opal-powercap.c
new file mode 100644
index 000000000..badb29bde
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-powercap.c
@@ -0,0 +1,244 @@
+/*
+ * PowerNV OPAL Powercap interface
+ *
+ * Copyright 2017 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "opal-powercap: " fmt
+
+#include <linux/of.h>
+#include <linux/kobject.h>
+#include <linux/slab.h>
+
+#include <asm/opal.h>
+
+DEFINE_MUTEX(powercap_mutex);
+
+static struct kobject *powercap_kobj;
+
+struct powercap_attr {
+ u32 handle;
+ struct kobj_attribute attr;
+};
+
+static struct pcap {
+ struct attribute_group pg;
+ struct powercap_attr *pattrs;
+} *pcaps;
+
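+/*
+ * Both accessors below follow the usual OPAL async pattern: take an async
+ * token, issue the OPAL call, and if it returns OPAL_ASYNC_COMPLETION wait
+ * for the completion message before turning the result into an errno.
+ */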
+static ssize_t powercap_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct powercap_attr *pcap_attr = container_of(attr,
+ struct powercap_attr, attr);
+ struct opal_msg msg;
+ u32 pcap;
+ int ret, token;
+
+ token = opal_async_get_token_interruptible();
+ if (token < 0) {
+ pr_devel("Failed to get token\n");
+ return token;
+ }
+
+ ret = mutex_lock_interruptible(&powercap_mutex);
+ if (ret)
+ goto out_token;
+
+ ret = opal_get_powercap(pcap_attr->handle, token, (u32 *)__pa(&pcap));
+ switch (ret) {
+ case OPAL_ASYNC_COMPLETION:
+ ret = opal_async_wait_response(token, &msg);
+ if (ret) {
+ pr_devel("Failed to wait for the async response\n");
+ ret = -EIO;
+ goto out;
+ }
+ ret = opal_error_code(opal_get_async_rc(msg));
+ if (!ret) {
+ ret = sprintf(buf, "%u\n", be32_to_cpu(pcap));
+ if (ret < 0)
+ ret = -EIO;
+ }
+ break;
+ case OPAL_SUCCESS:
+ ret = sprintf(buf, "%u\n", be32_to_cpu(pcap));
+ if (ret < 0)
+ ret = -EIO;
+ break;
+ default:
+ ret = opal_error_code(ret);
+ }
+
+out:
+ mutex_unlock(&powercap_mutex);
+out_token:
+ opal_async_release_token(token);
+ return ret;
+}
+
+static ssize_t powercap_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf,
+ size_t count)
+{
+ struct powercap_attr *pcap_attr = container_of(attr,
+ struct powercap_attr, attr);
+ struct opal_msg msg;
+ u32 pcap;
+ int ret, token;
+
+	ret = kstrtou32(buf, 0, &pcap);
+ if (ret)
+ return ret;
+
+ token = opal_async_get_token_interruptible();
+ if (token < 0) {
+ pr_devel("Failed to get token\n");
+ return token;
+ }
+
+ ret = mutex_lock_interruptible(&powercap_mutex);
+ if (ret)
+ goto out_token;
+
+ ret = opal_set_powercap(pcap_attr->handle, token, pcap);
+ switch (ret) {
+ case OPAL_ASYNC_COMPLETION:
+ ret = opal_async_wait_response(token, &msg);
+ if (ret) {
+ pr_devel("Failed to wait for the async response\n");
+ ret = -EIO;
+ goto out;
+ }
+ ret = opal_error_code(opal_get_async_rc(msg));
+ if (!ret)
+ ret = count;
+ break;
+ case OPAL_SUCCESS:
+ ret = count;
+ break;
+ default:
+ ret = opal_error_code(ret);
+ }
+
+out:
+ mutex_unlock(&powercap_mutex);
+out_token:
+ opal_async_release_token(token);
+ return ret;
+}
+
+static void powercap_add_attr(int handle, const char *name,
+ struct powercap_attr *attr)
+{
+ attr->handle = handle;
+ sysfs_attr_init(&attr->attr.attr);
+ attr->attr.attr.name = name;
+ attr->attr.attr.mode = 0444;
+ attr->attr.show = powercap_show;
+}
+
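+/*
+ * For each child of the "ibm,opal-powercap" DT node, create a sysfs group
+ * with powercap-min, powercap-max and a writable powercap-current
+ * attribute, typically visible under /sys/firmware/opal/powercap/.
+ */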
+void __init opal_powercap_init(void)
+{
+ struct device_node *powercap, *node;
+ int i = 0;
+
+ powercap = of_find_compatible_node(NULL, NULL, "ibm,opal-powercap");
+ if (!powercap) {
+ pr_devel("Powercap node not found\n");
+ return;
+ }
+
+ pcaps = kcalloc(of_get_child_count(powercap), sizeof(*pcaps),
+ GFP_KERNEL);
+ if (!pcaps)
+ return;
+
+ powercap_kobj = kobject_create_and_add("powercap", opal_kobj);
+ if (!powercap_kobj) {
+ pr_warn("Failed to create powercap kobject\n");
+ goto out_pcaps;
+ }
+
+ i = 0;
+ for_each_child_of_node(powercap, node) {
+ u32 cur, min, max;
+ int j = 0;
+ bool has_cur = false, has_min = false, has_max = false;
+
+ if (!of_property_read_u32(node, "powercap-min", &min)) {
+ j++;
+ has_min = true;
+ }
+
+ if (!of_property_read_u32(node, "powercap-max", &max)) {
+ j++;
+ has_max = true;
+ }
+
+ if (!of_property_read_u32(node, "powercap-current", &cur)) {
+ j++;
+ has_cur = true;
+ }
+
+ pcaps[i].pattrs = kcalloc(j, sizeof(struct powercap_attr),
+ GFP_KERNEL);
+ if (!pcaps[i].pattrs)
+ goto out_pcaps_pattrs;
+
+ pcaps[i].pg.attrs = kcalloc(j + 1, sizeof(struct attribute *),
+ GFP_KERNEL);
+ if (!pcaps[i].pg.attrs) {
+ kfree(pcaps[i].pattrs);
+ goto out_pcaps_pattrs;
+ }
+
+ j = 0;
+ pcaps[i].pg.name = node->name;
+ if (has_min) {
+ powercap_add_attr(min, "powercap-min",
+ &pcaps[i].pattrs[j]);
+ pcaps[i].pg.attrs[j] = &pcaps[i].pattrs[j].attr.attr;
+ j++;
+ }
+
+ if (has_max) {
+ powercap_add_attr(max, "powercap-max",
+ &pcaps[i].pattrs[j]);
+ pcaps[i].pg.attrs[j] = &pcaps[i].pattrs[j].attr.attr;
+ j++;
+ }
+
+ if (has_cur) {
+ powercap_add_attr(cur, "powercap-current",
+ &pcaps[i].pattrs[j]);
+ pcaps[i].pattrs[j].attr.attr.mode |= 0220;
+ pcaps[i].pattrs[j].attr.store = powercap_store;
+ pcaps[i].pg.attrs[j] = &pcaps[i].pattrs[j].attr.attr;
+ j++;
+ }
+
+ if (sysfs_create_group(powercap_kobj, &pcaps[i].pg)) {
+ pr_warn("Failed to create powercap attribute group %s\n",
+ pcaps[i].pg.name);
+ goto out_pcaps_pattrs;
+ }
+ i++;
+ }
+
+ return;
+
+out_pcaps_pattrs:
+ while (--i >= 0) {
+ kfree(pcaps[i].pattrs);
+ kfree(pcaps[i].pg.attrs);
+ }
+ kobject_put(powercap_kobj);
+out_pcaps:
+ kfree(pcaps);
+}
diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c
new file mode 100644
index 000000000..4070bb4e9
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-prd.c
@@ -0,0 +1,440 @@
+/*
+ * OPAL Runtime Diagnostics interface driver
+ * Supported on POWERNV platform
+ *
+ * Copyright IBM Corporation 2015
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) "opal-prd: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/poll.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <asm/opal-prd.h>
+#include <asm/opal.h>
+#include <asm/io.h>
+#include <linux/uaccess.h>
+
+
+/*
+ * The msg member must be at the end of the struct, as it's followed by the
+ * message data.
+ */
+struct opal_prd_msg_queue_item {
+ struct list_head list;
+ struct opal_prd_msg_header msg;
+};
+
+static struct device_node *prd_node;
+static LIST_HEAD(opal_prd_msg_queue);
+static DEFINE_SPINLOCK(opal_prd_msg_queue_lock);
+static DECLARE_WAIT_QUEUE_HEAD(opal_prd_msg_wait);
+static atomic_t prd_usage;
+
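+/*
+ * Check that [addr, addr + size) lies entirely within one of the
+ * labelled PRD ranges (children of /reserved-memory carrying an
+ * "ibm,prd-label" property), so that only firmware-reserved PRD memory
+ * can be mapped by userspace.
+ */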
+static bool opal_prd_range_is_valid(uint64_t addr, uint64_t size)
+{
+ struct device_node *parent, *node;
+ bool found;
+
+ if (addr + size < addr)
+ return false;
+
+ parent = of_find_node_by_path("/reserved-memory");
+ if (!parent)
+ return false;
+
+ found = false;
+
+ for_each_child_of_node(parent, node) {
+ uint64_t range_addr, range_size, range_end;
+ const __be32 *addrp;
+ const char *label;
+
+ addrp = of_get_address(node, 0, &range_size, NULL);
+
+ range_addr = of_read_number(addrp, 2);
+ range_end = range_addr + range_size;
+
+ label = of_get_property(node, "ibm,prd-label", NULL);
+
+ /* PRD ranges need a label */
+ if (!label)
+ continue;
+
+ if (range_end <= range_addr)
+ continue;
+
+ if (addr >= range_addr && addr + size <= range_end) {
+ found = true;
+ of_node_put(node);
+ break;
+ }
+ }
+
+ of_node_put(parent);
+ return found;
+}
+
+static int opal_prd_open(struct inode *inode, struct file *file)
+{
+ /*
+ * Prevent multiple (separate) processes from concurrent interactions
+ * with the FW PRD channel
+ */
+ if (atomic_xchg(&prd_usage, 1) == 1)
+ return -EBUSY;
+
+ return 0;
+}
+
+/*
+ * opal_prd_mmap - maps firmware-provided ranges into userspace
+ * @file: file structure for the device
+ * @vma: VMA to map the registers into
+ */
+
+static int opal_prd_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ size_t addr, size;
+ pgprot_t page_prot;
+ int rc;
+
+ pr_devel("opal_prd_mmap(0x%016lx, 0x%016lx, 0x%lx, 0x%lx)\n",
+ vma->vm_start, vma->vm_end, vma->vm_pgoff,
+ vma->vm_flags);
+
+ addr = vma->vm_pgoff << PAGE_SHIFT;
+ size = vma->vm_end - vma->vm_start;
+
+ /* ensure we're mapping within one of the allowable ranges */
+ if (!opal_prd_range_is_valid(addr, size))
+ return -EINVAL;
+
+ page_prot = phys_mem_access_prot(file, vma->vm_pgoff,
+ size, vma->vm_page_prot);
+
+ rc = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size,
+ page_prot);
+
+ return rc;
+}
+
+static bool opal_msg_queue_empty(void)
+{
+ unsigned long flags;
+ bool ret;
+
+ spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
+ ret = list_empty(&opal_prd_msg_queue);
+ spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags);
+
+ return ret;
+}
+
+static __poll_t opal_prd_poll(struct file *file,
+ struct poll_table_struct *wait)
+{
+ poll_wait(file, &opal_prd_msg_wait, wait);
+
+ if (!opal_msg_queue_empty())
+ return EPOLLIN | EPOLLRDNORM;
+
+ return 0;
+}
+
+static ssize_t opal_prd_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct opal_prd_msg_queue_item *item;
+ unsigned long flags;
+ ssize_t size, err;
+ int rc;
+
+ /* we need at least a header's worth of data */
+ if (count < sizeof(item->msg))
+ return -EINVAL;
+
+ if (*ppos)
+ return -ESPIPE;
+
+ item = NULL;
+
+ for (;;) {
+
+ spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
+ if (!list_empty(&opal_prd_msg_queue)) {
+ item = list_first_entry(&opal_prd_msg_queue,
+ struct opal_prd_msg_queue_item, list);
+ list_del(&item->list);
+ }
+ spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags);
+
+ if (item)
+ break;
+
+ if (file->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ rc = wait_event_interruptible(opal_prd_msg_wait,
+ !opal_msg_queue_empty());
+ if (rc)
+ return -EINTR;
+ }
+
+ size = be16_to_cpu(item->msg.size);
+ if (size > count) {
+ err = -EINVAL;
+ goto err_requeue;
+ }
+
+ rc = copy_to_user(buf, &item->msg, size);
+ if (rc) {
+ err = -EFAULT;
+ goto err_requeue;
+ }
+
+ kfree(item);
+
+ return size;
+
+err_requeue:
+ /* eep! re-queue at the head of the list */
+ spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
+ list_add(&item->list, &opal_prd_msg_queue);
+ spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags);
+ return err;
+}
+
+static ssize_t opal_prd_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct opal_prd_msg_header hdr;
+ ssize_t size;
+ void *msg;
+ int rc;
+
+ size = sizeof(hdr);
+
+ if (count < size)
+ return -EINVAL;
+
+ /* grab the header */
+ rc = copy_from_user(&hdr, buf, sizeof(hdr));
+ if (rc)
+ return -EFAULT;
+
+ size = be16_to_cpu(hdr.size);
+
+ msg = memdup_user(buf, size);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+
+ rc = opal_prd_msg(msg);
+ if (rc) {
+ pr_warn("write: opal_prd_msg returned %d\n", rc);
+ size = -EIO;
+ }
+
+ kfree(msg);
+
+ return size;
+}
+
+static int opal_prd_release(struct inode *inode, struct file *file)
+{
+ struct opal_prd_msg_header msg;
+
+ msg.size = cpu_to_be16(sizeof(msg));
+ msg.type = OPAL_PRD_MSG_TYPE_FINI;
+
+ opal_prd_msg((struct opal_prd_msg *)&msg);
+
+ atomic_xchg(&prd_usage, 0);
+
+ return 0;
+}
+
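+/*
+ * ioctl interface for the PRD daemon: report the kernel interface
+ * version and perform XSCOM reads/writes on the daemon's behalf.
+ */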
+static long opal_prd_ioctl(struct file *file, unsigned int cmd,
+ unsigned long param)
+{
+ struct opal_prd_info info;
+ struct opal_prd_scom scom;
+ int rc = 0;
+
+ switch (cmd) {
+ case OPAL_PRD_GET_INFO:
+ memset(&info, 0, sizeof(info));
+ info.version = OPAL_PRD_KERNEL_VERSION;
+ rc = copy_to_user((void __user *)param, &info, sizeof(info));
+ if (rc)
+ return -EFAULT;
+ break;
+
+ case OPAL_PRD_SCOM_READ:
+ rc = copy_from_user(&scom, (void __user *)param, sizeof(scom));
+ if (rc)
+ return -EFAULT;
+
+ scom.rc = opal_xscom_read(scom.chip, scom.addr,
+ (__be64 *)&scom.data);
+ scom.data = be64_to_cpu(scom.data);
+ pr_devel("ioctl SCOM_READ: chip %llx addr %016llx data %016llx rc %lld\n",
+ scom.chip, scom.addr, scom.data, scom.rc);
+
+ rc = copy_to_user((void __user *)param, &scom, sizeof(scom));
+ if (rc)
+ return -EFAULT;
+ break;
+
+ case OPAL_PRD_SCOM_WRITE:
+ rc = copy_from_user(&scom, (void __user *)param, sizeof(scom));
+ if (rc)
+ return -EFAULT;
+
+ scom.rc = opal_xscom_write(scom.chip, scom.addr, scom.data);
+ pr_devel("ioctl SCOM_WRITE: chip %llx addr %016llx data %016llx rc %lld\n",
+ scom.chip, scom.addr, scom.data, scom.rc);
+
+ rc = copy_to_user((void __user *)param, &scom, sizeof(scom));
+ if (rc)
+ return -EFAULT;
+ break;
+
+ default:
+ rc = -EINVAL;
+ }
+
+ return rc;
+}
+
+static const struct file_operations opal_prd_fops = {
+ .open = opal_prd_open,
+ .mmap = opal_prd_mmap,
+ .poll = opal_prd_poll,
+ .read = opal_prd_read,
+ .write = opal_prd_write,
+ .unlocked_ioctl = opal_prd_ioctl,
+ .release = opal_prd_release,
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice opal_prd_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "opal-prd",
+ .fops = &opal_prd_fops,
+};
+
+/* opal interface */
+static int opal_prd_msg_notifier(struct notifier_block *nb,
+ unsigned long msg_type, void *_msg)
+{
+ struct opal_prd_msg_queue_item *item;
+ struct opal_prd_msg_header *hdr;
+ struct opal_msg *msg = _msg;
+ int msg_size, item_size;
+ unsigned long flags;
+
+ if (msg_type != OPAL_MSG_PRD)
+ return 0;
+
+ /*
+ * Calculate total size of the message and item we need to store. The
+ * 'size' field in the header includes the header itself.
+ */
+ hdr = (void *)msg->params;
+ msg_size = be16_to_cpu(hdr->size);
+ item_size = msg_size + sizeof(*item) - sizeof(item->msg);
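+ /*
+ * The extra sizeof(*item) - sizeof(item->msg) bytes cover the queue
+ * bookkeeping (list head and any padding) placed before the copied
+ * message data.
+ */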
+
+ item = kzalloc(item_size, GFP_ATOMIC);
+ if (!item)
+ return -ENOMEM;
+
+ memcpy(&item->msg, msg->params, msg_size);
+
+ spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
+ list_add_tail(&item->list, &opal_prd_msg_queue);
+ spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags);
+
+ wake_up_interruptible(&opal_prd_msg_wait);
+
+ return 0;
+}
+
+static struct notifier_block opal_prd_event_nb = {
+ .notifier_call = opal_prd_msg_notifier,
+ .next = NULL,
+ .priority = 0,
+};
+
+static int opal_prd_probe(struct platform_device *pdev)
+{
+ int rc;
+
+ if (!pdev || !pdev->dev.of_node)
+ return -ENODEV;
+
+ /* We should only have one prd driver instance per machine; ensure
+ * that we only get a valid probe on a single OF node.
+ */
+ if (prd_node)
+ return -EBUSY;
+
+ prd_node = pdev->dev.of_node;
+
+ rc = opal_message_notifier_register(OPAL_MSG_PRD, &opal_prd_event_nb);
+ if (rc) {
+ pr_err("Couldn't register event notifier\n");
+ return rc;
+ }
+
+ rc = misc_register(&opal_prd_dev);
+ if (rc) {
+ pr_err("failed to register miscdev\n");
+ opal_message_notifier_unregister(OPAL_MSG_PRD,
+ &opal_prd_event_nb);
+ return rc;
+ }
+
+ return 0;
+}
+
+static int opal_prd_remove(struct platform_device *pdev)
+{
+ misc_deregister(&opal_prd_dev);
+ opal_message_notifier_unregister(OPAL_MSG_PRD, &opal_prd_event_nb);
+ return 0;
+}
+
+static const struct of_device_id opal_prd_match[] = {
+ { .compatible = "ibm,opal-prd" },
+ { },
+};
+
+static struct platform_driver opal_prd_driver = {
+ .driver = {
+ .name = "opal-prd",
+ .of_match_table = opal_prd_match,
+ },
+ .probe = opal_prd_probe,
+ .remove = opal_prd_remove,
+};
+
+module_platform_driver(opal_prd_driver);
+
+MODULE_DEVICE_TABLE(of, opal_prd_match);
+MODULE_DESCRIPTION("PowerNV OPAL runtime diagnostic driver");
+MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/platforms/powernv/opal-psr.c b/arch/powerpc/platforms/powernv/opal-psr.c
new file mode 100644
index 000000000..74986b35c
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-psr.c
@@ -0,0 +1,175 @@
+/*
+ * PowerNV OPAL Power-Shift-Ratio interface
+ *
+ * Copyright 2017 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "opal-psr: " fmt
+
+#include <linux/of.h>
+#include <linux/kobject.h>
+#include <linux/slab.h>
+
+#include <asm/opal.h>
+
+DEFINE_MUTEX(psr_mutex);
+
+static struct kobject *psr_kobj;
+
+struct psr_attr {
+ u32 handle;
+ struct kobj_attribute attr;
+} *psr_attrs;
+
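+/*
+ * Show the power-shift-ratio value for this handle. As with the other
+ * OPAL sysfs interfaces, OPAL_GET_POWER_SHIFT_RATIO may complete
+ * synchronously or via an asynchronous response.
+ */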
+static ssize_t psr_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct psr_attr *psr_attr = container_of(attr, struct psr_attr, attr);
+ struct opal_msg msg;
+ int psr, ret, token;
+
+ token = opal_async_get_token_interruptible();
+ if (token < 0) {
+ pr_devel("Failed to get token\n");
+ return token;
+ }
+
+ ret = mutex_lock_interruptible(&psr_mutex);
+ if (ret)
+ goto out_token;
+
+ ret = opal_get_power_shift_ratio(psr_attr->handle, token,
+ (u32 *)__pa(&psr));
+ switch (ret) {
+ case OPAL_ASYNC_COMPLETION:
+ ret = opal_async_wait_response(token, &msg);
+ if (ret) {
+ pr_devel("Failed to wait for the async response\n");
+ ret = -EIO;
+ goto out;
+ }
+ ret = opal_error_code(opal_get_async_rc(msg));
+ if (!ret) {
+ ret = sprintf(buf, "%u\n", be32_to_cpu(psr));
+ if (ret < 0)
+ ret = -EIO;
+ }
+ break;
+ case OPAL_SUCCESS:
+ ret = sprintf(buf, "%u\n", be32_to_cpu(psr));
+ if (ret < 0)
+ ret = -EIO;
+ break;
+ default:
+ ret = opal_error_code(ret);
+ }
+
+out:
+ mutex_unlock(&psr_mutex);
+out_token:
+ opal_async_release_token(token);
+ return ret;
+}
+
+static ssize_t psr_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct psr_attr *psr_attr = container_of(attr, struct psr_attr, attr);
+ struct opal_msg msg;
+ int psr, ret, token;
+
+ ret = kstrtoint(buf, 0, &psr);
+ if (ret)
+ return ret;
+
+ token = opal_async_get_token_interruptible();
+ if (token < 0) {
+ pr_devel("Failed to get token\n");
+ return token;
+ }
+
+ ret = mutex_lock_interruptible(&psr_mutex);
+ if (ret)
+ goto out_token;
+
+ ret = opal_set_power_shift_ratio(psr_attr->handle, token, psr);
+ switch (ret) {
+ case OPAL_ASYNC_COMPLETION:
+ ret = opal_async_wait_response(token, &msg);
+ if (ret) {
+ pr_devel("Failed to wait for the async response\n");
+ ret = -EIO;
+ goto out;
+ }
+ ret = opal_error_code(opal_get_async_rc(msg));
+ if (!ret)
+ ret = count;
+ break;
+ case OPAL_SUCCESS:
+ ret = count;
+ break;
+ default:
+ ret = opal_error_code(ret);
+ }
+
+out:
+ mutex_unlock(&psr_mutex);
+out_token:
+ opal_async_release_token(token);
+ return ret;
+}
+
+void __init opal_psr_init(void)
+{
+ struct device_node *psr, *node;
+ int i = 0;
+
+ psr = of_find_compatible_node(NULL, NULL,
+ "ibm,opal-power-shift-ratio");
+ if (!psr) {
+ pr_devel("Power-shift-ratio node not found\n");
+ return;
+ }
+
+ psr_attrs = kcalloc(of_get_child_count(psr), sizeof(*psr_attrs),
+ GFP_KERNEL);
+ if (!psr_attrs)
+ return;
+
+ psr_kobj = kobject_create_and_add("psr", opal_kobj);
+ if (!psr_kobj) {
+ pr_warn("Failed to create psr kobject\n");
+ goto out;
+ }
+
+ for_each_child_of_node(psr, node) {
+ if (of_property_read_u32(node, "handle",
+ &psr_attrs[i].handle))
+ goto out_kobj;
+
+ sysfs_attr_init(&psr_attrs[i].attr.attr);
+ if (of_property_read_string(node, "label",
+ &psr_attrs[i].attr.attr.name))
+ goto out_kobj;
+ psr_attrs[i].attr.attr.mode = 0664;
+ psr_attrs[i].attr.show = psr_show;
+ psr_attrs[i].attr.store = psr_store;
+ if (sysfs_create_file(psr_kobj, &psr_attrs[i].attr.attr)) {
+ pr_devel("Failed to create psr sysfs file %s\n",
+ psr_attrs[i].attr.attr.name);
+ goto out_kobj;
+ }
+ i++;
+ }
+
+ return;
+out_kobj:
+ kobject_put(psr_kobj);
+out:
+ kfree(psr_attrs);
+}
diff --git a/arch/powerpc/platforms/powernv/opal-rtc.c b/arch/powerpc/platforms/powernv/opal-rtc.c
new file mode 100644
index 000000000..42ec642a3
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-rtc.c
@@ -0,0 +1,87 @@
+/*
+ * PowerNV Real Time Clock.
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/kernel.h>
+#include <linux/time.h>
+#include <linux/bcd.h>
+#include <linux/rtc.h>
+#include <linux/delay.h>
+#include <linux/platform_device.h>
+#include <linux/of_platform.h>
+
+#include <asm/opal.h>
+#include <asm/firmware.h>
+#include <asm/machdep.h>
+
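+/* Unpack the BCD-encoded OPAL date and time words into a struct rtc_time. */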
+static void opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm)
+{
+ tm->tm_year = ((bcd2bin(y_m_d >> 24) * 100) +
+ bcd2bin((y_m_d >> 16) & 0xff)) - 1900;
+ tm->tm_mon = bcd2bin((y_m_d >> 8) & 0xff) - 1;
+ tm->tm_mday = bcd2bin(y_m_d & 0xff);
+ tm->tm_hour = bcd2bin((h_m_s_ms >> 56) & 0xff);
+ tm->tm_min = bcd2bin((h_m_s_ms >> 48) & 0xff);
+ tm->tm_sec = bcd2bin((h_m_s_ms >> 40) & 0xff);
+ tm->tm_wday = -1;
+}
+
+time64_t __init opal_get_boot_time(void)
+{
+ struct rtc_time tm;
+ u32 y_m_d;
+ u64 h_m_s_ms;
+ __be32 __y_m_d;
+ __be64 __h_m_s_ms;
+ long rc = OPAL_BUSY;
+
+ if (!opal_check_token(OPAL_RTC_READ))
+ return 0;
+
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms);
+ if (rc == OPAL_BUSY_EVENT) {
+ mdelay(OPAL_BUSY_DELAY_MS);
+ opal_poll_events(NULL);
+ } else if (rc == OPAL_BUSY) {
+ mdelay(OPAL_BUSY_DELAY_MS);
+ }
+ }
+ if (rc != OPAL_SUCCESS)
+ return 0;
+
+ y_m_d = be32_to_cpu(__y_m_d);
+ h_m_s_ms = be64_to_cpu(__h_m_s_ms);
+ opal_to_tm(y_m_d, h_m_s_ms, &tm);
+ return rtc_tm_to_time64(&tm);
+}
+
+static __init int opal_time_init(void)
+{
+ struct platform_device *pdev;
+ struct device_node *rtc;
+
+ rtc = of_find_node_by_path("/ibm,opal/rtc");
+ if (rtc) {
+ pdev = of_platform_device_create(rtc, "opal-rtc", NULL);
+ of_node_put(rtc);
+ } else {
+ if (opal_check_token(OPAL_RTC_READ) ||
+ opal_check_token(OPAL_READ_TPO))
+ pdev = platform_device_register_simple("opal-rtc", -1,
+ NULL, 0);
+ else
+ return -ENODEV;
+ }
+
+ return PTR_ERR_OR_ZERO(pdev);
+}
+machine_subsys_initcall(powernv, opal_time_init);
diff --git a/arch/powerpc/platforms/powernv/opal-sensor-groups.c b/arch/powerpc/platforms/powernv/opal-sensor-groups.c
new file mode 100644
index 000000000..f7d04b6a2
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-sensor-groups.c
@@ -0,0 +1,240 @@
+/*
+ * PowerNV OPAL Sensor-groups interface
+ *
+ * Copyright 2017 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "opal-sensor-groups: " fmt
+
+#include <linux/of.h>
+#include <linux/kobject.h>
+#include <linux/slab.h>
+
+#include <asm/opal.h>
+
+DEFINE_MUTEX(sg_mutex);
+
+static struct kobject *sg_kobj;
+
+struct sg_attr {
+ u32 handle;
+ struct kobj_attribute attr;
+};
+
+static struct sensor_group {
+ char name[20];
+ struct attribute_group sg;
+ struct sg_attr *sgattrs;
+} *sgs;
+
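+/*
+ * Ask OPAL to enable or disable the sensor group identified by @handle,
+ * waiting for the asynchronous completion when firmware defers the
+ * request.
+ */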
+int sensor_group_enable(u32 handle, bool enable)
+{
+ struct opal_msg msg;
+ int token, ret;
+
+ token = opal_async_get_token_interruptible();
+ if (token < 0)
+ return token;
+
+ ret = opal_sensor_group_enable(handle, token, enable);
+ if (ret == OPAL_ASYNC_COMPLETION) {
+ ret = opal_async_wait_response(token, &msg);
+ if (ret) {
+ pr_devel("Failed to wait for the async response\n");
+ ret = -EIO;
+ goto out;
+ }
+ ret = opal_error_code(opal_get_async_rc(msg));
+ } else {
+ ret = opal_error_code(ret);
+ }
+
+out:
+ opal_async_release_token(token);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sensor_group_enable);
+
+static ssize_t sg_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct sg_attr *sattr = container_of(attr, struct sg_attr, attr);
+ struct opal_msg msg;
+ u32 data;
+ int ret, token;
+
+ ret = kstrtoint(buf, 0, &data);
+ if (ret)
+ return ret;
+
+ if (data != 1)
+ return -EINVAL;
+
+ token = opal_async_get_token_interruptible();
+ if (token < 0) {
+ pr_devel("Failed to get token\n");
+ return token;
+ }
+
+ ret = mutex_lock_interruptible(&sg_mutex);
+ if (ret)
+ goto out_token;
+
+ ret = opal_sensor_group_clear(sattr->handle, token);
+ switch (ret) {
+ case OPAL_ASYNC_COMPLETION:
+ ret = opal_async_wait_response(token, &msg);
+ if (ret) {
+ pr_devel("Failed to wait for the async response\n");
+ ret = -EIO;
+ goto out;
+ }
+ ret = opal_error_code(opal_get_async_rc(msg));
+ if (!ret)
+ ret = count;
+ break;
+ case OPAL_SUCCESS:
+ ret = count;
+ break;
+ default:
+ ret = opal_error_code(ret);
+ }
+
+out:
+ mutex_unlock(&sg_mutex);
+out_token:
+ opal_async_release_token(token);
+ return ret;
+}
+
+static struct sg_ops_info {
+ int opal_no;
+ const char *attr_name;
+ ssize_t (*store)(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count);
+} ops_info[] = {
+ { OPAL_SENSOR_GROUP_CLEAR, "clear", sg_store },
+};
+
+static void add_attr(int handle, struct sg_attr *attr, int index)
+{
+ attr->handle = handle;
+ sysfs_attr_init(&attr->attr.attr);
+ attr->attr.attr.name = ops_info[index].attr_name;
+ attr->attr.attr.mode = 0220;
+ attr->attr.store = ops_info[index].store;
+}
+
+static int add_attr_group(const __be32 *ops, int len, struct sensor_group *sg,
+ u32 handle)
+{
+ int i, j;
+ int count = 0;
+
+ for (i = 0; i < len; i++)
+ for (j = 0; j < ARRAY_SIZE(ops_info); j++)
+ if (be32_to_cpu(ops[i]) == ops_info[j].opal_no) {
+ add_attr(handle, &sg->sgattrs[count], j);
+ sg->sg.attrs[count] =
+ &sg->sgattrs[count].attr.attr;
+ count++;
+ }
+
+ return sysfs_create_group(sg_kobj, &sg->sg);
+}
+
+static int get_nr_attrs(const __be32 *ops, int len)
+{
+ int i, j;
+ int nr_attrs = 0;
+
+ for (i = 0; i < len; i++)
+ for (j = 0; j < ARRAY_SIZE(ops_info); j++)
+ if (be32_to_cpu(ops[i]) == ops_info[j].opal_no)
+ nr_attrs++;
+
+ return nr_attrs;
+}
+
+void __init opal_sensor_groups_init(void)
+{
+ struct device_node *sg, *node;
+ int i = 0;
+
+ sg = of_find_compatible_node(NULL, NULL, "ibm,opal-sensor-group");
+ if (!sg) {
+ pr_devel("Sensor groups node not found\n");
+ return;
+ }
+
+ sgs = kcalloc(of_get_child_count(sg), sizeof(*sgs), GFP_KERNEL);
+ if (!sgs)
+ return;
+
+ sg_kobj = kobject_create_and_add("sensor_groups", opal_kobj);
+ if (!sg_kobj) {
+ pr_warn("Failed to create sensor group kobject\n");
+ goto out_sgs;
+ }
+
+ for_each_child_of_node(sg, node) {
+ const __be32 *ops;
+ u32 sgid, len, nr_attrs, chipid;
+
+ ops = of_get_property(node, "ops", &len);
+ if (!ops)
+ continue;
+
+ nr_attrs = get_nr_attrs(ops, len);
+ if (!nr_attrs)
+ continue;
+
+ sgs[i].sgattrs = kcalloc(nr_attrs, sizeof(*sgs[i].sgattrs),
+ GFP_KERNEL);
+ if (!sgs[i].sgattrs)
+ goto out_sgs_sgattrs;
+
+ sgs[i].sg.attrs = kcalloc(nr_attrs + 1,
+ sizeof(*sgs[i].sg.attrs),
+ GFP_KERNEL);
+
+ if (!sgs[i].sg.attrs) {
+ kfree(sgs[i].sgattrs);
+ goto out_sgs_sgattrs;
+ }
+
+ if (of_property_read_u32(node, "sensor-group-id", &sgid)) {
+ pr_warn("sensor-group-id property not found\n");
+ goto out_sgs_sgattrs;
+ }
+
+ if (!of_property_read_u32(node, "ibm,chip-id", &chipid))
+ sprintf(sgs[i].name, "%s%d", node->name, chipid);
+ else
+ sprintf(sgs[i].name, "%s", node->name);
+
+ sgs[i].sg.name = sgs[i].name;
+ if (add_attr_group(ops, len, &sgs[i], sgid)) {
+ pr_warn("Failed to create sensor attribute group %s\n",
+ sgs[i].sg.name);
+ goto out_sgs_sgattrs;
+ }
+ i++;
+ }
+
+ return;
+
+out_sgs_sgattrs:
+ while (--i >= 0) {
+ kfree(sgs[i].sgattrs);
+ kfree(sgs[i].sg.attrs);
+ }
+ kobject_put(sg_kobj);
+out_sgs:
+ kfree(sgs);
+}
diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c
new file mode 100644
index 000000000..35a5f4b9a
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-sensor.c
@@ -0,0 +1,143 @@
+/*
+ * PowerNV sensor code
+ *
+ * Copyright (C) 2013 IBM
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/delay.h>
+#include <linux/of_platform.h>
+#include <asm/opal.h>
+#include <asm/machdep.h>
+
+/*
+ * This will return sensor information to the driver based on the requested
+ * sensor handle. A handle is an opaque id for the powernv platform, read by
+ * the driver from the device tree.
+ */
+int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data)
+{
+ int ret, token;
+ struct opal_msg msg;
+ __be32 data;
+
+ token = opal_async_get_token_interruptible();
+ if (token < 0)
+ return token;
+
+ ret = opal_sensor_read(sensor_hndl, token, &data);
+ switch (ret) {
+ case OPAL_ASYNC_COMPLETION:
+ ret = opal_async_wait_response(token, &msg);
+ if (ret) {
+ pr_err("%s: Failed to wait for the async response, %d\n",
+ __func__, ret);
+ goto out;
+ }
+
+ ret = opal_error_code(opal_get_async_rc(msg));
+ *sensor_data = be32_to_cpu(data);
+ break;
+
+ case OPAL_SUCCESS:
+ ret = 0;
+ *sensor_data = be32_to_cpu(data);
+ break;
+
+ case OPAL_WRONG_STATE:
+ ret = -EIO;
+ break;
+
+ default:
+ ret = opal_error_code(ret);
+ break;
+ }
+
+out:
+ opal_async_release_token(token);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(opal_get_sensor_data);
+
+int opal_get_sensor_data_u64(u32 sensor_hndl, u64 *sensor_data)
+{
+ int ret, token;
+ struct opal_msg msg;
+ __be64 data;
+
+ if (!opal_check_token(OPAL_SENSOR_READ_U64)) {
+ u32 sdata;
+
+ ret = opal_get_sensor_data(sensor_hndl, &sdata);
+ if (!ret)
+ *sensor_data = sdata;
+ return ret;
+ }
+
+ token = opal_async_get_token_interruptible();
+ if (token < 0)
+ return token;
+
+ ret = opal_sensor_read_u64(sensor_hndl, token, &data);
+ switch (ret) {
+ case OPAL_ASYNC_COMPLETION:
+ ret = opal_async_wait_response(token, &msg);
+ if (ret) {
+ pr_err("%s: Failed to wait for the async response, %d\n",
+ __func__, ret);
+ goto out_token;
+ }
+
+ ret = opal_error_code(opal_get_async_rc(msg));
+ *sensor_data = be64_to_cpu(data);
+ break;
+
+ case OPAL_SUCCESS:
+ ret = 0;
+ *sensor_data = be64_to_cpu(data);
+ break;
+
+ case OPAL_WRONG_STATE:
+ ret = -EIO;
+ break;
+
+ default:
+ ret = opal_error_code(ret);
+ break;
+ }
+
+out_token:
+ opal_async_release_token(token);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(opal_get_sensor_data_u64);
+
+int __init opal_sensor_init(void)
+{
+ struct platform_device *pdev;
+ struct device_node *sensor;
+
+ sensor = of_find_node_by_path("/ibm,opal/sensors");
+ if (!sensor) {
+ pr_err("Opal node 'sensors' not found\n");
+ return -ENODEV;
+ }
+
+ pdev = of_platform_device_create(sensor, "opal-sensor", NULL);
+ of_node_put(sensor);
+
+ return PTR_ERR_OR_ZERO(pdev);
+}
diff --git a/arch/powerpc/platforms/powernv/opal-sysparam.c b/arch/powerpc/platforms/powernv/opal-sysparam.c
new file mode 100644
index 000000000..9aa87df11
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-sysparam.c
@@ -0,0 +1,307 @@
+/*
+ * PowerNV system parameter code
+ *
+ * Copyright (C) 2013 IBM
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kobject.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+#include <linux/gfp.h>
+#include <linux/stat.h>
+#include <asm/opal.h>
+
+#define MAX_PARAM_DATA_LEN 64
+
+static DEFINE_MUTEX(opal_sysparam_mutex);
+static struct kobject *sysparam_kobj;
+static void *param_data_buf;
+
+struct param_attr {
+ struct list_head list;
+ u32 param_id;
+ u32 param_size;
+ struct kobj_attribute kobj_attr;
+};
+
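+/*
+ * Read a system parameter from OPAL into the supplied buffer.
+ * OPAL_GET_PARAM completes asynchronously, so take a token, issue the
+ * request and wait for the matching completion message.
+ */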
+static ssize_t opal_get_sys_param(u32 param_id, u32 length, void *buffer)
+{
+ struct opal_msg msg;
+ ssize_t ret;
+ int token;
+
+ token = opal_async_get_token_interruptible();
+ if (token < 0) {
+ if (token != -ERESTARTSYS)
+ pr_err("%s: Couldn't get the token, returning\n",
+ __func__);
+ ret = token;
+ goto out;
+ }
+
+ ret = opal_get_param(token, param_id, (u64)buffer, length);
+ if (ret != OPAL_ASYNC_COMPLETION) {
+ ret = opal_error_code(ret);
+ goto out_token;
+ }
+
+ ret = opal_async_wait_response(token, &msg);
+ if (ret) {
+ pr_err("%s: Failed to wait for the async response, %zd\n",
+ __func__, ret);
+ goto out_token;
+ }
+
+ ret = opal_error_code(opal_get_async_rc(msg));
+
+out_token:
+ opal_async_release_token(token);
+out:
+ return ret;
+}
+
+static int opal_set_sys_param(u32 param_id, u32 length, void *buffer)
+{
+ struct opal_msg msg;
+ int ret, token;
+
+ token = opal_async_get_token_interruptible();
+ if (token < 0) {
+ if (token != -ERESTARTSYS)
+ pr_err("%s: Couldn't get the token, returning\n",
+ __func__);
+ ret = token;
+ goto out;
+ }
+
+ ret = opal_set_param(token, param_id, (u64)buffer, length);
+
+ if (ret != OPAL_ASYNC_COMPLETION) {
+ ret = opal_error_code(ret);
+ goto out_token;
+ }
+
+ ret = opal_async_wait_response(token, &msg);
+ if (ret) {
+ pr_err("%s: Failed to wait for the async response, %d\n",
+ __func__, ret);
+ goto out_token;
+ }
+
+ ret = opal_error_code(opal_get_async_rc(msg));
+
+out_token:
+ opal_async_release_token(token);
+out:
+ return ret;
+}
+
+static ssize_t sys_param_show(struct kobject *kobj,
+ struct kobj_attribute *kobj_attr, char *buf)
+{
+ struct param_attr *attr = container_of(kobj_attr, struct param_attr,
+ kobj_attr);
+ ssize_t ret;
+
+ mutex_lock(&opal_sysparam_mutex);
+ ret = opal_get_sys_param(attr->param_id, attr->param_size,
+ param_data_buf);
+ if (ret)
+ goto out;
+
+ memcpy(buf, param_data_buf, attr->param_size);
+
+ ret = attr->param_size;
+out:
+ mutex_unlock(&opal_sysparam_mutex);
+ return ret;
+}
+
+static ssize_t sys_param_store(struct kobject *kobj,
+ struct kobj_attribute *kobj_attr, const char *buf, size_t count)
+{
+ struct param_attr *attr = container_of(kobj_attr, struct param_attr,
+ kobj_attr);
+ ssize_t ret;
+
+ /* MAX_PARAM_DATA_LEN is sizeof(param_data_buf) */
+ if (count > MAX_PARAM_DATA_LEN)
+ count = MAX_PARAM_DATA_LEN;
+
+ mutex_lock(&opal_sysparam_mutex);
+ memcpy(param_data_buf, buf, count);
+ ret = opal_set_sys_param(attr->param_id, attr->param_size,
+ param_data_buf);
+ mutex_unlock(&opal_sysparam_mutex);
+ if (!ret)
+ ret = count;
+ return ret;
+}
+
+void __init opal_sys_param_init(void)
+{
+ struct device_node *sysparam;
+ struct param_attr *attr;
+ u32 *id, *size;
+ int count, i;
+ u8 *perm;
+
+ if (!opal_kobj) {
+ pr_warn("SYSPARAM: opal kobject is not available\n");
+ goto out;
+ }
+
+ /* Some systems do not use sysparams; this is not an error */
+ sysparam = of_find_node_by_path("/ibm,opal/sysparams");
+ if (!sysparam)
+ goto out;
+
+ if (!of_device_is_compatible(sysparam, "ibm,opal-sysparams")) {
+ pr_err("SYSPARAM: Opal sysparam node not compatible\n");
+ goto out_node_put;
+ }
+
+ sysparam_kobj = kobject_create_and_add("sysparams", opal_kobj);
+ if (!sysparam_kobj) {
+ pr_err("SYSPARAM: Failed to create sysparam kobject\n");
+ goto out_node_put;
+ }
+
+ /* Allocate a buffer big enough for any get/set transactions */
+ param_data_buf = kzalloc(MAX_PARAM_DATA_LEN, GFP_KERNEL);
+ if (!param_data_buf) {
+ pr_err("SYSPARAM: Failed to allocate memory for param data "
+ "buf\n");
+ goto out_kobj_put;
+ }
+
+ /* Number of parameters exposed through DT */
+ count = of_property_count_strings(sysparam, "param-name");
+ if (count < 0) {
+ pr_err("SYSPARAM: No string found of property param-name in "
+ "the node %s\n", sysparam->name);
+ goto out_param_buf;
+ }
+
+ id = kcalloc(count, sizeof(*id), GFP_KERNEL);
+ if (!id) {
+ pr_err("SYSPARAM: Failed to allocate memory to read parameter "
+ "id\n");
+ goto out_param_buf;
+ }
+
+ size = kcalloc(count, sizeof(*size), GFP_KERNEL);
+ if (!size) {
+ pr_err("SYSPARAM: Failed to allocate memory to read parameter "
+ "size\n");
+ goto out_free_id;
+ }
+
+ perm = kcalloc(count, sizeof(*perm), GFP_KERNEL);
+ if (!perm) {
+ pr_err("SYSPARAM: Failed to allocate memory to read supported "
+ "action on the parameter");
+ goto out_free_size;
+ }
+
+ if (of_property_read_u32_array(sysparam, "param-id", id, count)) {
+ pr_err("SYSPARAM: Missing property param-id in the DT\n");
+ goto out_free_perm;
+ }
+
+ if (of_property_read_u32_array(sysparam, "param-len", size, count)) {
+ pr_err("SYSPARAM: Missing property param-len in the DT\n");
+ goto out_free_perm;
+ }
+
+
+ if (of_property_read_u8_array(sysparam, "param-perm", perm, count)) {
+ pr_err("SYSPARAM: Missing property param-perm in the DT\n");
+ goto out_free_perm;
+ }
+
+ attr = kcalloc(count, sizeof(*attr), GFP_KERNEL);
+ if (!attr) {
+ pr_err("SYSPARAM: Failed to allocate memory for parameter "
+ "attributes\n");
+ goto out_free_perm;
+ }
+
+ /* For each of the parameters, populate the parameter attributes */
+ for (i = 0; i < count; i++) {
+ if (size[i] > MAX_PARAM_DATA_LEN) {
+ pr_warn("SYSPARAM: Not creating parameter %d as size "
+ "exceeds buffer length\n", i);
+ continue;
+ }
+
+ sysfs_attr_init(&attr[i].kobj_attr.attr);
+ attr[i].param_id = id[i];
+ attr[i].param_size = size[i];
+ if (of_property_read_string_index(sysparam, "param-name", i,
+ &attr[i].kobj_attr.attr.name))
+ continue;
+
+ /* Set the file mode: read-only, write-only or read-write */
+ switch (perm[i] & 3) {
+ case OPAL_SYSPARAM_READ:
+ attr[i].kobj_attr.attr.mode = 0444;
+ break;
+ case OPAL_SYSPARAM_WRITE:
+ attr[i].kobj_attr.attr.mode = 0200;
+ break;
+ case OPAL_SYSPARAM_RW:
+ attr[i].kobj_attr.attr.mode = 0644;
+ break;
+ default:
+ break;
+ }
+
+ attr[i].kobj_attr.show = sys_param_show;
+ attr[i].kobj_attr.store = sys_param_store;
+
+ if (sysfs_create_file(sysparam_kobj, &attr[i].kobj_attr.attr)) {
+ pr_err("SYSPARAM: Failed to create sysfs file %s\n",
+ attr[i].kobj_attr.attr.name);
+ goto out_free_attr;
+ }
+ }
+
+ kfree(perm);
+ kfree(size);
+ kfree(id);
+ of_node_put(sysparam);
+ return;
+
+out_free_attr:
+ kfree(attr);
+out_free_perm:
+ kfree(perm);
+out_free_size:
+ kfree(size);
+out_free_id:
+ kfree(id);
+out_param_buf:
+ kfree(param_data_buf);
+out_kobj_put:
+ kobject_put(sysparam_kobj);
+out_node_put:
+ of_node_put(sysparam);
+out:
+ return;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-tracepoints.c b/arch/powerpc/platforms/powernv/opal-tracepoints.c
new file mode 100644
index 000000000..f16a43540
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-tracepoints.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/percpu.h>
+#include <linux/jump_label.h>
+#include <asm/trace.h>
+#include <asm/asm-prototypes.h>
+
+#ifdef CONFIG_JUMP_LABEL
+struct static_key opal_tracepoint_key = STATIC_KEY_INIT;
+
+int opal_tracepoint_regfunc(void)
+{
+ static_key_slow_inc(&opal_tracepoint_key);
+ return 0;
+}
+
+void opal_tracepoint_unregfunc(void)
+{
+ static_key_slow_dec(&opal_tracepoint_key);
+}
+#else
+/*
+ * We optimise OPAL calls by placing opal_tracepoint_refcount
+ * directly in the TOC so we can check if the opal tracepoints are
+ * enabled via a single load.
+ */
+
+/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
+extern long opal_tracepoint_refcount;
+
+int opal_tracepoint_regfunc(void)
+{
+ opal_tracepoint_refcount++;
+ return 0;
+}
+
+void opal_tracepoint_unregfunc(void)
+{
+ opal_tracepoint_refcount--;
+}
+#endif
+
+/*
+ * Since the tracing code might execute OPAL calls we need to guard against
+ * recursion.
+ */
+static DEFINE_PER_CPU(unsigned int, opal_trace_depth);
+
+void __trace_opal_entry(unsigned long opcode, unsigned long *args)
+{
+ unsigned long flags;
+ unsigned int *depth;
+
+ local_irq_save(flags);
+
+ depth = this_cpu_ptr(&opal_trace_depth);
+
+ if (*depth)
+ goto out;
+
+ (*depth)++;
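+ /*
+ * Not re-enabled here: the matching preempt_enable() is in
+ * __trace_opal_exit(), so the span between the entry and exit
+ * tracepoints (the OPAL call itself) runs with preemption disabled.
+ */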
+ preempt_disable();
+ trace_opal_entry(opcode, args);
+ (*depth)--;
+
+out:
+ local_irq_restore(flags);
+}
+
+void __trace_opal_exit(long opcode, unsigned long retval)
+{
+ unsigned long flags;
+ unsigned int *depth;
+
+ local_irq_save(flags);
+
+ depth = this_cpu_ptr(&opal_trace_depth);
+
+ if (*depth)
+ goto out;
+
+ (*depth)++;
+ trace_opal_exit(opcode, retval);
+ preempt_enable();
+ (*depth)--;
+
+out:
+ local_irq_restore(flags);
+}
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
new file mode 100644
index 000000000..74215ebda
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -0,0 +1,333 @@
+/*
+ * PowerNV OPAL API wrappers
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/jump_label.h>
+#include <asm/ppc_asm.h>
+#include <asm/hvcall.h>
+#include <asm/asm-offsets.h>
+#include <asm/opal.h>
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
+
+ .section ".text"
+
+#ifdef CONFIG_TRACEPOINTS
+#ifdef CONFIG_JUMP_LABEL
+#define OPAL_BRANCH(LABEL) \
+ ARCH_STATIC_BRANCH(LABEL, opal_tracepoint_key)
+#else
+
+ .section ".toc","aw"
+
+ .globl opal_tracepoint_refcount
+opal_tracepoint_refcount:
+ .8byte 0
+
+ .section ".text"
+
+/*
+ * We branch around this in early init by using an unconditional cpu
+ * feature.
+ */
+#define OPAL_BRANCH(LABEL) \
+BEGIN_FTR_SECTION; \
+ b 1f; \
+END_FTR_SECTION(0, 1); \
+ ld r11,opal_tracepoint_refcount@toc(r2); \
+ cmpdi r11,0; \
+ bne- LABEL; \
+1:
+
+#endif
+
+#else
+#define OPAL_BRANCH(LABEL)
+#endif
+
+/*
+ * DO_OPAL_CALL assumes:
+ * r0 = opal call token
+ * r12 = msr
+ * LR has been saved
+ */
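+/*
+ * It saves the CR and the caller's MSR, masks external interrupts,
+ * points the LR at opal_return, and hrfid's to the OPAL entry point
+ * with relocation (IR/DR) and little-endian mode turned off.
+ */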
+#define DO_OPAL_CALL() \
+ mfcr r11; \
+ stw r11,8(r1); \
+ li r11,0; \
+ ori r11,r11,MSR_EE; \
+ std r12,PACASAVEDMSR(r13); \
+ andc r12,r12,r11; \
+ mtmsrd r12,1; \
+ LOAD_REG_ADDR(r11,opal_return); \
+ mtlr r11; \
+ li r11,MSR_DR|MSR_IR|MSR_LE;\
+ andc r12,r12,r11; \
+ mtspr SPRN_HSRR1,r12; \
+ LOAD_REG_ADDR(r11,opal); \
+ ld r12,8(r11); \
+ ld r2,0(r11); \
+ mtspr SPRN_HSRR0,r12; \
+ hrfid
+
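+/*
+ * OPAL_CALL generates one entry stub per OPAL API. Each stub saves the
+ * LR and reads the MSR, loads the call token into r0, branches to
+ * opal_real_call if we are already in real mode, optionally detours
+ * through opal_tracepoint_entry, and otherwise falls into DO_OPAL_CALL.
+ */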
+#define OPAL_CALL(name, token) \
+ _GLOBAL_TOC(name); \
+ mfmsr r12; \
+ mflr r0; \
+ andi. r11,r12,MSR_IR|MSR_DR; \
+ std r0,PPC_LR_STKOFF(r1); \
+ li r0,token; \
+ beq opal_real_call; \
+ OPAL_BRANCH(opal_tracepoint_entry) \
+ DO_OPAL_CALL()
+
+
+opal_return:
+ /*
+ * Fixup endian on OPAL return... we should be able to simplify
+ * this by instead converting the below trampoline to a set of
+ * bytes (always BE) since MSR:LE will end up fixed up as a side
+ * effect of the rfid.
+ */
+ FIXUP_ENDIAN_HV
+ ld r2,PACATOC(r13);
+ lwz r4,8(r1);
+ ld r5,PPC_LR_STKOFF(r1);
+ ld r6,PACASAVEDMSR(r13);
+ mtcr r4;
+ mtspr SPRN_HSRR0,r5;
+ mtspr SPRN_HSRR1,r6;
+ hrfid
+
+opal_real_call:
+ mfcr r11
+ stw r11,8(r1)
+ /* Set opal return address */
+ LOAD_REG_ADDR(r11, opal_return_realmode)
+ mtlr r11
+ li r11,MSR_LE
+ andc r12,r12,r11
+ mtspr SPRN_HSRR1,r12
+ LOAD_REG_ADDR(r11,opal)
+ ld r12,8(r11)
+ ld r2,0(r11)
+ mtspr SPRN_HSRR0,r12
+ hrfid
+
+opal_return_realmode:
+ FIXUP_ENDIAN_HV
+ ld r2,PACATOC(r13);
+ lwz r11,8(r1);
+ ld r12,PPC_LR_STKOFF(r1)
+ mtcr r11;
+ mtlr r12
+ blr
+
+#ifdef CONFIG_TRACEPOINTS
+opal_tracepoint_entry:
+ stdu r1,-STACKFRAMESIZE(r1)
+ std r0,STK_REG(R23)(r1)
+ std r3,STK_REG(R24)(r1)
+ std r4,STK_REG(R25)(r1)
+ std r5,STK_REG(R26)(r1)
+ std r6,STK_REG(R27)(r1)
+ std r7,STK_REG(R28)(r1)
+ std r8,STK_REG(R29)(r1)
+ std r9,STK_REG(R30)(r1)
+ std r10,STK_REG(R31)(r1)
+ mr r3,r0
+ addi r4,r1,STK_REG(R24)
+ bl __trace_opal_entry
+ ld r0,STK_REG(R23)(r1)
+ ld r3,STK_REG(R24)(r1)
+ ld r4,STK_REG(R25)(r1)
+ ld r5,STK_REG(R26)(r1)
+ ld r6,STK_REG(R27)(r1)
+ ld r7,STK_REG(R28)(r1)
+ ld r8,STK_REG(R29)(r1)
+ ld r9,STK_REG(R30)(r1)
+ ld r10,STK_REG(R31)(r1)
+
+ /* set up the LR so we return via opal_tracepoint_return */
+ LOAD_REG_ADDR(r11,opal_tracepoint_return)
+ std r11,16(r1)
+
+ mfmsr r12
+ DO_OPAL_CALL()
+
+opal_tracepoint_return:
+ std r3,STK_REG(R31)(r1)
+ mr r4,r3
+ ld r3,STK_REG(R23)(r1)
+ bl __trace_opal_exit
+ ld r3,STK_REG(R31)(r1)
+ addi r1,r1,STACKFRAMESIZE
+ ld r0,16(r1)
+ mtlr r0
+ blr
+#endif
+
+
+OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL);
+OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE);
+OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ);
+OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE);
+OPAL_CALL(opal_rtc_read, OPAL_RTC_READ);
+OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE);
+OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN);
+OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT);
+OPAL_CALL(opal_cec_reboot2, OPAL_CEC_REBOOT2);
+OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM);
+OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM);
+OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT);
+OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS);
+OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY);
+OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY);
+OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE);
+OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD);
+OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD);
+OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE);
+OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD);
+OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD);
+OPAL_CALL(opal_set_xive, OPAL_SET_XIVE);
+OPAL_CALL(opal_get_xive, OPAL_GET_XIVE);
+OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER);
+OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS);
+OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR);
+OPAL_CALL(opal_pci_eeh_freeze_set, OPAL_PCI_EEH_FREEZE_SET);
+OPAL_CALL(opal_pci_err_inject, OPAL_PCI_ERR_INJECT);
+OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC);
+OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE);
+OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW);
+OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW);
+OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY);
+OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE);
+OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV);
+OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE);
+OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE);
+OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE);
+OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE);
+OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE);
+OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE);
+OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32);
+OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64);
+OPAL_CALL(opal_start_cpu, OPAL_START_CPU);
+OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS);
+OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL);
+OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW);
+OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL);
+OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET);
+OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA);
+OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA);
+OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB);
+OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT);
+OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR);
+OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS);
+OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS);
+OPAL_CALL(opal_get_dpo_status, OPAL_GET_DPO_STATUS);
+OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED);
+OPAL_CALL(opal_pci_next_error, OPAL_PCI_NEXT_ERROR);
+OPAL_CALL(opal_pci_poll, OPAL_PCI_POLL);
+OPAL_CALL(opal_pci_msi_eoi, OPAL_PCI_MSI_EOI);
+OPAL_CALL(opal_pci_get_phb_diag_data2, OPAL_PCI_GET_PHB_DIAG_DATA2);
+OPAL_CALL(opal_xscom_read, OPAL_XSCOM_READ);
+OPAL_CALL(opal_xscom_write, OPAL_XSCOM_WRITE);
+OPAL_CALL(opal_lpc_read, OPAL_LPC_READ);
+OPAL_CALL(opal_lpc_write, OPAL_LPC_WRITE);
+OPAL_CALL(opal_return_cpu, OPAL_RETURN_CPU);
+OPAL_CALL(opal_reinit_cpus, OPAL_REINIT_CPUS);
+OPAL_CALL(opal_read_elog, OPAL_ELOG_READ);
+OPAL_CALL(opal_send_ack_elog, OPAL_ELOG_ACK);
+OPAL_CALL(opal_get_elog_size, OPAL_ELOG_SIZE);
+OPAL_CALL(opal_resend_pending_logs, OPAL_ELOG_RESEND);
+OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE);
+OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE);
+OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE);
+OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE);
+OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE);
+OPAL_CALL(opal_check_token, OPAL_CHECK_TOKEN);
+OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT);
+OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO);
+OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2);
+OPAL_CALL(opal_dump_read, OPAL_DUMP_READ);
+OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK);
+OPAL_CALL(opal_get_msg, OPAL_GET_MSG);
+OPAL_CALL(opal_write_oppanel_async, OPAL_WRITE_OPPANEL_ASYNC);
+OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION);
+OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND);
+OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT);
+OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ);
+OPAL_CALL(opal_get_param, OPAL_GET_PARAM);
+OPAL_CALL(opal_set_param, OPAL_SET_PARAM);
+OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI);
+OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE);
+OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG);
+OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION);
+OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION);
+OPAL_CALL(opal_pci_set_phb_cxl_mode, OPAL_PCI_SET_PHB_CAPI_MODE);
+OPAL_CALL(opal_tpo_write, OPAL_WRITE_TPO);
+OPAL_CALL(opal_tpo_read, OPAL_READ_TPO);
+OPAL_CALL(opal_ipmi_send, OPAL_IPMI_SEND);
+OPAL_CALL(opal_ipmi_recv, OPAL_IPMI_RECV);
+OPAL_CALL(opal_i2c_request, OPAL_I2C_REQUEST);
+OPAL_CALL(opal_flash_read, OPAL_FLASH_READ);
+OPAL_CALL(opal_flash_write, OPAL_FLASH_WRITE);
+OPAL_CALL(opal_flash_erase, OPAL_FLASH_ERASE);
+OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG);
+OPAL_CALL(opal_leds_get_ind, OPAL_LEDS_GET_INDICATOR);
+OPAL_CALL(opal_leds_set_ind, OPAL_LEDS_SET_INDICATOR);
+OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH);
+OPAL_CALL(opal_get_device_tree, OPAL_GET_DEVICE_TREE);
+OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE);
+OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE);
+OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE);
+OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR);
+OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR);
+OPAL_CALL(opal_int_eoi, OPAL_INT_EOI);
+OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR);
+OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL);
+OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR);
+OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET);
+OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO);
+OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG);
+OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG);
+OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO);
+OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO);
+OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE);
+OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK);
+OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK);
+OPAL_CALL(opal_xive_allocate_irq_raw, OPAL_XIVE_ALLOCATE_IRQ);
+OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ);
+OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO);
+OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO);
+OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC);
+OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP);
+OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET);
+OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT);
+OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT);
+OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR);
+OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT);
+OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START);
+OPAL_CALL(opal_imc_counters_stop, OPAL_IMC_COUNTERS_STOP);
+OPAL_CALL(opal_pci_set_p2p, OPAL_PCI_SET_P2P);
+OPAL_CALL(opal_get_powercap, OPAL_GET_POWERCAP);
+OPAL_CALL(opal_set_powercap, OPAL_SET_POWERCAP);
+OPAL_CALL(opal_get_power_shift_ratio, OPAL_GET_POWER_SHIFT_RATIO);
+OPAL_CALL(opal_set_power_shift_ratio, OPAL_SET_POWER_SHIFT_RATIO);
+OPAL_CALL(opal_sensor_group_clear, OPAL_SENSOR_GROUP_CLEAR);
+OPAL_CALL(opal_quiesce, OPAL_QUIESCE);
+OPAL_CALL(opal_npu_spa_setup, OPAL_NPU_SPA_SETUP);
+OPAL_CALL(opal_npu_spa_clear_cache, OPAL_NPU_SPA_CLEAR_CACHE);
+OPAL_CALL(opal_npu_tl_set, OPAL_NPU_TL_SET);
+OPAL_CALL(opal_pci_get_pbcq_tunnel_bar, OPAL_PCI_GET_PBCQ_TUNNEL_BAR);
+OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR);
+OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64);
+OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE);
+OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT);
diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
new file mode 100644
index 000000000..22d5e1110
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -0,0 +1,140 @@
+/*
+ * PowerNV LPC bus handling.
+ *
+ * Copyright 2013 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/bug.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/opal.h>
+#include <asm/scom.h>
+
+/*
+ * We could probably fit this inside the scom_map_t, which is a void *
+ * after all, but it's really too ugly, so let's kmalloc it for now.
+ */
+struct opal_scom_map {
+ uint32_t chip;
+ uint64_t addr;
+};
+
+static scom_map_t opal_scom_map(struct device_node *dev, u64 reg, u64 count)
+{
+ struct opal_scom_map *m;
+ const __be32 *gcid;
+
+ if (!of_get_property(dev, "scom-controller", NULL)) {
+ pr_err("%s: device %pOF is not a SCOM controller\n",
+ __func__, dev);
+ return SCOM_MAP_INVALID;
+ }
+ gcid = of_get_property(dev, "ibm,chip-id", NULL);
+ if (!gcid) {
+ pr_err("%s: device %pOF has no ibm,chip-id\n",
+ __func__, dev);
+ return SCOM_MAP_INVALID;
+ }
+ m = kmalloc(sizeof(*m), GFP_KERNEL);
+ if (!m)
+ return NULL;
+ m->chip = be32_to_cpup(gcid);
+ m->addr = reg;
+
+ return (scom_map_t)m;
+}
+
+static void opal_scom_unmap(scom_map_t map)
+{
+ kfree(map);
+}
+
+static int opal_xscom_err_xlate(int64_t rc)
+{
+ switch(rc) {
+ case 0:
+ return 0;
+ /* Add more translations if necessary */
+ default:
+ return -EIO;
+ }
+}
+
+static u64 opal_scom_unmangle(u64 addr)
+{
+ u64 tmp;
+
+ /*
+ * XSCOM addresses use the top nibble to set indirect mode and
+ * its form. Bits 4-11 are always 0.
+ *
+ * Because the debugfs interface uses signed offsets and shifts
+ * the address left by 3, we basically cannot use the top 4 bits
+ * of the 64-bit address, and thus cannot use the indirect bit.
+ *
+ * To deal with that, we support the indirect bits being in
+ * bits 4-7 (IBM notation) instead of bits 0-3 in this API, and we
+ * do the conversion here.
+ *
+ * For in-kernel use, we don't need to do this mangling: in-kernel
+ * callers won't have bits 4-7 set.
+ *
+ * So:
+ * debugfs will always set 0-3 = 0 and clear 4-7
+ * kernel will always clear 0-3 = 0 and set 4-7
+ */
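+ /*
+ * For example, an address supplied with the indirect form in bits
+ * 4-7 (the debugfs convention above), such as 0x0800000000001234,
+ * is rewritten to 0x8000000000001234 below.
+ */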
+ tmp = addr;
+ tmp &= 0x0f00000000000000;
+ addr &= 0xf0ffffffffffffff;
+ addr |= tmp << 4;
+
+ return addr;
+}
+
+static int opal_scom_read(scom_map_t map, u64 reg, u64 *value)
+{
+ struct opal_scom_map *m = map;
+ int64_t rc;
+ __be64 v;
+
+ reg = opal_scom_unmangle(m->addr + reg);
+ rc = opal_xscom_read(m->chip, reg, (__be64 *)__pa(&v));
+ *value = be64_to_cpu(v);
+ return opal_xscom_err_xlate(rc);
+}
+
+static int opal_scom_write(scom_map_t map, u64 reg, u64 value)
+{
+ struct opal_scom_map *m = map;
+ int64_t rc;
+
+ reg = opal_scom_unmangle(m->addr + reg);
+ rc = opal_xscom_write(m->chip, reg, value);
+ return opal_xscom_err_xlate(rc);
+}
+
+static const struct scom_controller opal_scom_controller = {
+ .map = opal_scom_map,
+ .unmap = opal_scom_unmap,
+ .read = opal_scom_read,
+ .write = opal_scom_write
+};
+
+static int opal_xscom_init(void)
+{
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ scom_init(&opal_scom_controller);
+ return 0;
+}
+machine_arch_initcall(powernv, opal_xscom_init);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
new file mode 100644
index 000000000..edf9032e2
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -0,0 +1,1118 @@
+/*
+ * PowerNV OPAL high level interfaces
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "opal: " fmt
+
+#include <linux/printk.h>
+#include <linux/types.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_platform.h>
+#include <linux/of_address.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/kobject.h>
+#include <linux/delay.h>
+#include <linux/memblock.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/printk.h>
+#include <linux/kmsg_dump.h>
+#include <linux/console.h>
+#include <linux/sched/debug.h>
+
+#include <asm/machdep.h>
+#include <asm/opal.h>
+#include <asm/firmware.h>
+#include <asm/mce.h>
+#include <asm/imc-pmu.h>
+#include <asm/bug.h>
+
+#include "powernv.h"
+
+/* /sys/firmware/opal */
+struct kobject *opal_kobj;
+
+struct opal {
+ u64 base;
+ u64 entry;
+ u64 size;
+} opal;
+
+struct mcheck_recoverable_range {
+ u64 start_addr;
+ u64 end_addr;
+ u64 recover_addr;
+};
+
+static struct mcheck_recoverable_range *mc_recoverable_range;
+static int mc_recoverable_range_len;
+
+struct device_node *opal_node;
+static DEFINE_SPINLOCK(opal_write_lock);
+static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX];
+static uint32_t opal_heartbeat;
+static struct task_struct *kopald_tsk;
+
+void opal_configure_cores(void)
+{
+ u64 reinit_flags = 0;
+
+ /* Do the actual re-init. This will clobber all FPRs, VRs, etc...
+ *
+ * It will preserve non-volatile GPRs and HSPRG0/1. It will
+ * also restore HIDs and other SPRs to their original value
+ * but it might clobber a bunch.
+ */
+#ifdef __BIG_ENDIAN__
+ reinit_flags |= OPAL_REINIT_CPUS_HILE_BE;
+#else
+ reinit_flags |= OPAL_REINIT_CPUS_HILE_LE;
+#endif
+
+ /*
+ * POWER9 always supports running hash:
+ * i.e. Host hash supports hash guests
+ * Host radix supports hash/radix guests
+ */
+ if (early_cpu_has_feature(CPU_FTR_ARCH_300)) {
+ reinit_flags |= OPAL_REINIT_CPUS_MMU_HASH;
+ if (early_radix_enabled())
+ reinit_flags |= OPAL_REINIT_CPUS_MMU_RADIX;
+ }
+
+ opal_reinit_cpus(reinit_flags);
+
+ /* Restore some bits */
+ if (cur_cpu_spec->cpu_restore)
+ cur_cpu_spec->cpu_restore();
+}
+
+int __init early_init_dt_scan_opal(unsigned long node,
+ const char *uname, int depth, void *data)
+{
+ const void *basep, *entryp, *sizep;
+ int basesz, entrysz, runtimesz;
+
+ if (depth != 1 || strcmp(uname, "ibm,opal") != 0)
+ return 0;
+
+ basep = of_get_flat_dt_prop(node, "opal-base-address", &basesz);
+ entryp = of_get_flat_dt_prop(node, "opal-entry-address", &entrysz);
+ sizep = of_get_flat_dt_prop(node, "opal-runtime-size", &runtimesz);
+
+ if (!basep || !entryp || !sizep)
+ return 1;
+
+ opal.base = of_read_number(basep, basesz/4);
+ opal.entry = of_read_number(entryp, entrysz/4);
+ opal.size = of_read_number(sizep, runtimesz/4);
+
+ pr_debug("OPAL Base = 0x%llx (basep=%p basesz=%d)\n",
+ opal.base, basep, basesz);
+ pr_debug("OPAL Entry = 0x%llx (entryp=%p entrysz=%d)\n",
+ opal.entry, entryp, entrysz);
+ pr_debug("OPAL Size = 0x%llx (sizep=%p runtimesz=%d)\n",
+ opal.size, sizep, runtimesz);
+
+ if (of_flat_dt_is_compatible(node, "ibm,opal-v3")) {
+ powerpc_firmware_features |= FW_FEATURE_OPAL;
+ pr_debug("OPAL detected !\n");
+ } else {
+ panic("OPAL != V3 detected, no longer supported.\n");
+ }
+
+ return 1;
+}
+
+int __init early_init_dt_scan_recoverable_ranges(unsigned long node,
+ const char *uname, int depth, void *data)
+{
+ int i, psize, size;
+ const __be32 *prop;
+
+ if (depth != 1 || strcmp(uname, "ibm,opal") != 0)
+ return 0;
+
+ prop = of_get_flat_dt_prop(node, "mcheck-recoverable-ranges", &psize);
+
+ if (!prop)
+ return 1;
+
+ pr_debug("Found machine check recoverable ranges.\n");
+
+ /*
+ * Calculate number of available entries.
+ *
+ * Each recoverable address range entry is (start address, len,
+ * recovery address), 2 cells each for start and recovery address,
+ * 1 cell for len, totalling 5 cells per entry.
+ */
+ mc_recoverable_range_len = psize / (sizeof(*prop) * 5);
+
+ /* Sanity check */
+ if (!mc_recoverable_range_len)
+ return 1;
+
+ /* Size required to hold all the entries. */
+ size = mc_recoverable_range_len *
+ sizeof(struct mcheck_recoverable_range);
+
+ /*
+ * Allocate a buffer to hold the MC recoverable ranges.
+ */
+ mc_recoverable_range =__va(memblock_alloc(size, __alignof__(u64)));
+ memset(mc_recoverable_range, 0, size);
+
+ for (i = 0; i < mc_recoverable_range_len; i++) {
+ mc_recoverable_range[i].start_addr =
+ of_read_number(prop + (i * 5) + 0, 2);
+ mc_recoverable_range[i].end_addr =
+ mc_recoverable_range[i].start_addr +
+ of_read_number(prop + (i * 5) + 2, 1);
+ mc_recoverable_range[i].recover_addr =
+ of_read_number(prop + (i * 5) + 3, 2);
+
+ pr_debug("Machine check recoverable range: %llx..%llx: %llx\n",
+ mc_recoverable_range[i].start_addr,
+ mc_recoverable_range[i].end_addr,
+ mc_recoverable_range[i].recover_addr);
+ }
+ return 1;
+}
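As a worked illustration of the 5-cell entry layout described in the comment above, here is a small standalone sketch (plain userspace C, not part of this patch; the sample values are invented) that decodes one "mcheck-recoverable-ranges" entry the same way the loop does:

/* Hypothetical standalone sketch: decode one 5-cell recoverable-range entry. */
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>	/* ntohl()/htonl() stand in for be32 handling */

/* Read n big-endian 32-bit cells into one 64-bit value, like of_read_number(). */
static uint64_t read_cells(const uint32_t *p, int n)
{
	uint64_t v = 0;

	while (n--)
		v = (v << 32) | ntohl(*p++);
	return v;
}

int main(void)
{
	/* One invented entry: start=0x30000000, len=0x100, recover=0x30005000 */
	uint32_t prop[5] = { htonl(0), htonl(0x30000000), htonl(0x100),
			     htonl(0), htonl(0x30005000) };
	uint64_t start   = read_cells(prop + 0, 2);	/* 2 cells: start */
	uint64_t end     = start + read_cells(prop + 2, 1);	/* 1 cell: len */
	uint64_t recover = read_cells(prop + 3, 2);	/* 2 cells: recovery addr */

	printf("range %llx..%llx -> %llx\n",
	       (unsigned long long)start, (unsigned long long)end,
	       (unsigned long long)recover);
	return 0;
}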
+
+static int __init opal_register_exception_handlers(void)
+{
+#ifdef __BIG_ENDIAN__
+ u64 glue;
+
+ if (!(powerpc_firmware_features & FW_FEATURE_OPAL))
+ return -ENODEV;
+
+ /* Hook up some exception handlers, except machine check. We use the
+ * fwnmi area at 0x7000 to provide the glue space to OPAL.
+ */
+ glue = 0x7000;
+
+ /*
+ * Check if we are running on newer firmware that exports the
+ * OPAL_HANDLE_HMI token. If so, don't ask OPAL to patch the HMI
+ * interrupt; we catch it directly in Linux.
+ *
+ * For older firmware (i.e. currently released POWER8 System Firmware
+ * as of today, <= SV810_087), we fall back to the old behavior and let
+ * OPAL patch the HMI vector and handle it inside OPAL firmware.
+ *
+ * For newer firmware (in development/yet to be released) we will
+ * start catching/handling HMIs directly in Linux.
+ */
+ if (!opal_check_token(OPAL_HANDLE_HMI)) {
+ pr_info("Old firmware detected, OPAL handles HMIs.\n");
+ opal_register_exception_handler(
+ OPAL_HYPERVISOR_MAINTENANCE_HANDLER,
+ 0, glue);
+ glue += 128;
+ }
+
+ opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue);
+#endif
+
+ return 0;
+}
+machine_early_initcall(powernv, opal_register_exception_handlers);
+
+/*
+ * OPAL message notifier based on message type. Allows subscribers to get
+ * notified for a specific message type.
+ */
+int opal_message_notifier_register(enum opal_msg_type msg_type,
+ struct notifier_block *nb)
+{
+ if (!nb || msg_type >= OPAL_MSG_TYPE_MAX) {
+ pr_warn("%s: Invalid arguments, msg_type:%d\n",
+ __func__, msg_type);
+ return -EINVAL;
+ }
+
+ return atomic_notifier_chain_register(
+ &opal_msg_notifier_head[msg_type], nb);
+}
+EXPORT_SYMBOL_GPL(opal_message_notifier_register);
+
+int opal_message_notifier_unregister(enum opal_msg_type msg_type,
+ struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(
+ &opal_msg_notifier_head[msg_type], nb);
+}
+EXPORT_SYMBOL_GPL(opal_message_notifier_unregister);
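A minimal sketch of how a subscriber might use this interface (hypothetical module code, not part of this patch; it assumes OPAL_MSG_ASYNC_COMP is the message type of interest):

/* Hypothetical subscriber, shown for illustration only. */
#include <linux/notifier.h>
#include <linux/printk.h>
#include <asm/opal.h>

static int demo_opal_msg_cb(struct notifier_block *nb,
			    unsigned long msg_type, void *_msg)
{
	struct opal_msg *msg = _msg;

	pr_info("OPAL message type %lu, first param 0x%llx\n", msg_type,
		(unsigned long long)be64_to_cpu(msg->params[0]));
	return NOTIFY_OK;	/* let other subscribers see it too */
}

static struct notifier_block demo_opal_msg_nb = {
	.notifier_call = demo_opal_msg_cb,
};

static int __init demo_init(void)
{
	/* Ask to be called back for async-completion messages only. */
	return opal_message_notifier_register(OPAL_MSG_ASYNC_COMP,
					      &demo_opal_msg_nb);
}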
+
+static void opal_message_do_notify(uint32_t msg_type, void *msg)
+{
+ /* notify subscribers */
+ atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type],
+ msg_type, msg);
+}
+
+static void opal_handle_message(void)
+{
+ s64 ret;
+ /*
+ * TODO: pre-allocate a message buffer depending on opal-msg-size
+ * value in /proc/device-tree.
+ */
+ static struct opal_msg msg;
+ u32 type;
+
+ ret = opal_get_msg(__pa(&msg), sizeof(msg));
+ /* No opal message pending. */
+ if (ret == OPAL_RESOURCE)
+ return;
+
+ /* check for errors. */
+ if (ret) {
+ pr_warn("%s: Failed to retrieve opal message, err=%lld\n",
+ __func__, ret);
+ return;
+ }
+
+ type = be32_to_cpu(msg.msg_type);
+
+ /* Sanity check */
+ if (type >= OPAL_MSG_TYPE_MAX) {
+ pr_warn_once("%s: Unknown message type: %u\n", __func__, type);
+ return;
+ }
+ opal_message_do_notify(type, (void *)&msg);
+}
+
+static irqreturn_t opal_message_notify(int irq, void *data)
+{
+ opal_handle_message();
+ return IRQ_HANDLED;
+}
+
+static int __init opal_message_init(void)
+{
+ int ret, i, irq;
+
+ for (i = 0; i < OPAL_MSG_TYPE_MAX; i++)
+ ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]);
+
+ irq = opal_event_request(ilog2(OPAL_EVENT_MSG_PENDING));
+ if (!irq) {
+ pr_err("%s: Can't register OPAL event irq (%d)\n",
+ __func__, irq);
+ return irq;
+ }
+
+ ret = request_irq(irq, opal_message_notify,
+ IRQ_TYPE_LEVEL_HIGH, "opal-msg", NULL);
+ if (ret) {
+ pr_err("%s: Can't request OPAL event irq (%d)\n",
+ __func__, ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+int opal_get_chars(uint32_t vtermno, char *buf, int count)
+{
+ s64 rc;
+ __be64 evt, len;
+
+ if (!opal.entry)
+ return -ENODEV;
+ opal_poll_events(&evt);
+ if ((be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_INPUT) == 0)
+ return 0;
+ len = cpu_to_be64(count);
+ rc = opal_console_read(vtermno, &len, buf);
+ if (rc == OPAL_SUCCESS)
+ return be64_to_cpu(len);
+ return 0;
+}
+
+static int __opal_put_chars(uint32_t vtermno, const char *data, int total_len, bool atomic)
+{
+ unsigned long flags = 0 /* shut up gcc */;
+ int written;
+ __be64 olen;
+ s64 rc;
+
+ if (!opal.entry)
+ return -ENODEV;
+
+ if (atomic)
+ spin_lock_irqsave(&opal_write_lock, flags);
+ rc = opal_console_write_buffer_space(vtermno, &olen);
+ if (rc || be64_to_cpu(olen) < total_len) {
+ /* Closed -> drop characters */
+ if (rc)
+ written = total_len;
+ else
+ written = -EAGAIN;
+ goto out;
+ }
+
+ /* Should not get a partial write here because space is available. */
+ olen = cpu_to_be64(total_len);
+ rc = opal_console_write(vtermno, &olen, data);
+ if (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ if (rc == OPAL_BUSY_EVENT)
+ opal_poll_events(NULL);
+ written = -EAGAIN;
+ goto out;
+ }
+
+ /* Closed or other error drop */
+ if (rc != OPAL_SUCCESS) {
+ written = opal_error_code(rc);
+ goto out;
+ }
+
+ written = be64_to_cpu(olen);
+ if (written < total_len) {
+ if (atomic) {
+ /* Should not happen */
+ pr_warn("atomic console write returned partial "
+ "len=%d written=%d\n", total_len, written);
+ }
+ if (!written)
+ written = -EAGAIN;
+ }
+
+out:
+ if (atomic)
+ spin_unlock_irqrestore(&opal_write_lock, flags);
+
+ return written;
+}
+
+int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
+{
+ return __opal_put_chars(vtermno, data, total_len, false);
+}
+
+/*
+ * opal_put_chars_atomic will not perform partial-writes. Data will be
+ * atomically written to the terminal or not at all. This is not strictly
+ * true at the moment because console space can race with OPAL's console
+ * writes.
+ */
+int opal_put_chars_atomic(uint32_t vtermno, const char *data, int total_len)
+{
+ return __opal_put_chars(vtermno, data, total_len, true);
+}
+
+static s64 __opal_flush_console(uint32_t vtermno)
+{
+ s64 rc;
+
+ if (!opal_check_token(OPAL_CONSOLE_FLUSH)) {
+ __be64 evt;
+
+ /*
+ * If OPAL_CONSOLE_FLUSH is not implemented in the firmware,
+ * the console can still be flushed by calling the polling
+ * function while it has OPAL_EVENT_CONSOLE_OUTPUT events.
+ */
+ WARN_ONCE(1, "opal: OPAL_CONSOLE_FLUSH missing.\n");
+
+ opal_poll_events(&evt);
+ if (!(be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT))
+ return OPAL_SUCCESS;
+ return OPAL_BUSY;
+
+ } else {
+ rc = opal_console_flush(vtermno);
+ if (rc == OPAL_BUSY_EVENT) {
+ opal_poll_events(NULL);
+ rc = OPAL_BUSY;
+ }
+ return rc;
+ }
+
+}
+
+/*
+ * opal_flush_console spins until the console is flushed
+ */
+int opal_flush_console(uint32_t vtermno)
+{
+ for (;;) {
+ s64 rc = __opal_flush_console(vtermno);
+
+ if (rc == OPAL_BUSY || rc == OPAL_PARTIAL) {
+ mdelay(1);
+ continue;
+ }
+
+ return opal_error_code(rc);
+ }
+}
+
+/*
+ * opal_flush_chars is an hvc interface that, if wait is set, sleeps until
+ * the console is flushed; otherwise it returns -EBUSY if the console has
+ * data, or -EAGAIN if it has data and some of it was flushed.
+ */
+int opal_flush_chars(uint32_t vtermno, bool wait)
+{
+ for (;;) {
+ s64 rc = __opal_flush_console(vtermno);
+
+ if (rc == OPAL_BUSY || rc == OPAL_PARTIAL) {
+ if (wait) {
+ msleep(OPAL_BUSY_DELAY_MS);
+ continue;
+ }
+ if (rc == OPAL_PARTIAL)
+ return -EAGAIN;
+ }
+
+ return opal_error_code(rc);
+ }
+}
+
+static int opal_recover_mce(struct pt_regs *regs,
+ struct machine_check_event *evt)
+{
+ int recovered = 0;
+
+ if (!(regs->msr & MSR_RI)) {
+ /* If MSR_RI isn't set, we cannot recover */
+ pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
+ recovered = 0;
+ } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
+ /* Platform corrected itself */
+ recovered = 1;
+ } else if (evt->severity == MCE_SEV_FATAL) {
+ /* Fatal machine check */
+ pr_err("Machine check interrupt is fatal\n");
+ recovered = 0;
+ }
+
+ if (!recovered && evt->severity == MCE_SEV_ERROR_SYNC) {
+ /*
+ * Try to kill processes if we get a synchronous machine check
+ * (e.g., one caused by execution of this instruction). This
+ * will devolve into a panic if we try to kill init or are in
+ * an interrupt etc.
+ *
+ * TODO: Queue up this address for hwpoisoning later.
+ * TODO: This is not quite right for d-side machine
+ * checks; ->nip is not necessarily the important
+ * address.
+ */
+ if ((user_mode(regs))) {
+ _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+ recovered = 1;
+ } else if (die_will_crash()) {
+ /*
+ * die() would kill the kernel, so better to go via
+ * the platform reboot code that will log the
+ * machine check.
+ */
+ recovered = 0;
+ } else {
+ die("Machine check", regs, SIGBUS);
+ recovered = 1;
+ }
+ }
+
+ return recovered;
+}
+
+void pnv_platform_error_reboot(struct pt_regs *regs, const char *msg)
+{
+ panic_flush_kmsg_start();
+
+ pr_emerg("Hardware platform error: %s\n", msg);
+ if (regs)
+ show_regs(regs);
+ smp_send_stop();
+
+ panic_flush_kmsg_end();
+
+ /*
+ * Don't bother to shut things down because this will
+ * xstop the system.
+ */
+ if (opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, msg)
+ == OPAL_UNSUPPORTED) {
+ pr_emerg("Reboot type %d not supported for %s\n",
+ OPAL_REBOOT_PLATFORM_ERROR, msg);
+ }
+
+ /*
+ * We reached here. There can be four possibilities:
+ * 1. We are running on a firmware level that does not support
+ *    opal_cec_reboot2()
+ * 2. We are running on a firmware level that does not support
+ *    the OPAL_REBOOT_PLATFORM_ERROR reboot type.
+ * 3. We are running on an FSP-based system that does not need
+ *    opal to trigger a checkstop explicitly for error analysis.
+ *    The FSP PRD component would have already been notified
+ *    about this error through other channels.
+ * 4. We are running on a newer skiboot that by default does
+ *    not cause a checkstop; it drops us back to the kernel to
+ *    extract context and state at the time of the error.
+ */
+
+ panic(msg);
+}
+
+int opal_machine_check(struct pt_regs *regs)
+{
+ struct machine_check_event evt;
+
+ if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
+ return 0;
+
+ /* Print things out */
+ if (evt.version != MCE_V1) {
+ pr_err("Machine Check Exception, Unknown event version %d !\n",
+ evt.version);
+ return 0;
+ }
+ machine_check_print_event_info(&evt, user_mode(regs));
+
+ if (opal_recover_mce(regs, &evt))
+ return 1;
+
+ pnv_platform_error_reboot(regs, "Unrecoverable Machine Check exception");
+}
+
+/* Early hmi handler called in real mode. */
+int opal_hmi_exception_early(struct pt_regs *regs)
+{
+ s64 rc;
+
+ /*
+ * Call the OPAL HMI handler.
+ * A return value of OPAL_SUCCESS indicates that an HMI event was
+ * generated and is waiting to be pulled by Linux.
+ */
+ rc = opal_handle_hmi();
+ if (rc == OPAL_SUCCESS) {
+ local_paca->hmi_event_available = 1;
+ return 1;
+ }
+ return 0;
+}
+
+/* HMI exception handler called in virtual mode during check_irq_replay. */
+int opal_handle_hmi_exception(struct pt_regs *regs)
+{
+ /*
+ * Check if an HMI event is available.
+ * If so, wake kopald to process it.
+ */
+ if (!local_paca->hmi_event_available)
+ return 0;
+
+ local_paca->hmi_event_available = 0;
+ opal_wake_poller();
+
+ return 1;
+}
+
+static uint64_t find_recovery_address(uint64_t nip)
+{
+ int i;
+
+ for (i = 0; i < mc_recoverable_range_len; i++)
+ if ((nip >= mc_recoverable_range[i].start_addr) &&
+ (nip < mc_recoverable_range[i].end_addr))
+ return mc_recoverable_range[i].recover_addr;
+ return 0;
+}
+
+bool opal_mce_check_early_recovery(struct pt_regs *regs)
+{
+ uint64_t recover_addr = 0;
+
+ if (!opal.base || !opal.size)
+ goto out;
+
+ if ((regs->nip >= opal.base) &&
+ (regs->nip < (opal.base + opal.size)))
+ recover_addr = find_recovery_address(regs->nip);
+
+ /*
+ * Setup regs->nip to rfi into fixup address.
+ */
+ if (recover_addr)
+ regs->nip = recover_addr;
+
+out:
+ return !!recover_addr;
+}
+
+static int opal_sysfs_init(void)
+{
+ opal_kobj = kobject_create_and_add("opal", firmware_kobj);
+ if (!opal_kobj) {
+ pr_warn("kobject_create_and_add opal failed\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static ssize_t symbol_map_read(struct file *fp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t count)
+{
+ return memory_read_from_buffer(buf, count, &off, bin_attr->private,
+ bin_attr->size);
+}
+
+static struct bin_attribute symbol_map_attr = {
+ .attr = {.name = "symbol_map", .mode = 0400},
+ .read = symbol_map_read
+};
+
+static void opal_export_symmap(void)
+{
+ const __be64 *syms;
+ unsigned int size;
+ struct device_node *fw;
+ int rc;
+
+ fw = of_find_node_by_path("/ibm,opal/firmware");
+ if (!fw)
+ return;
+ syms = of_get_property(fw, "symbol-map", &size);
+ if (!syms || size != 2 * sizeof(__be64))
+ return;
+
+ /* Setup attributes */
+ symbol_map_attr.private = __va(be64_to_cpu(syms[0]));
+ symbol_map_attr.size = be64_to_cpu(syms[1]);
+
+ rc = sysfs_create_bin_file(opal_kobj, &symbol_map_attr);
+ if (rc)
+ pr_warn("Error %d creating OPAL symbols file\n", rc);
+}
+
+static ssize_t export_attr_read(struct file *fp, struct kobject *kobj,
+ struct bin_attribute *bin_attr, char *buf,
+ loff_t off, size_t count)
+{
+ return memory_read_from_buffer(buf, count, &off, bin_attr->private,
+ bin_attr->size);
+}
+
+/*
+ * opal_export_attrs: creates a sysfs node for each property listed in
+ * the device-tree under /ibm,opal/firmware/exports/
+ * All new sysfs nodes are created under /opal/exports/.
+ * This allows for reserved memory regions (e.g. HDAT) to be read.
+ * The new sysfs nodes are only readable by root.
+ */
+static void opal_export_attrs(void)
+{
+ struct bin_attribute *attr;
+ struct device_node *np;
+ struct property *prop;
+ struct kobject *kobj;
+ u64 vals[2];
+ int rc;
+
+ np = of_find_node_by_path("/ibm,opal/firmware/exports");
+ if (!np)
+ return;
+
+ /* Create new 'exports' directory - /sys/firmware/opal/exports */
+ kobj = kobject_create_and_add("exports", opal_kobj);
+ if (!kobj) {
+ pr_warn("kobject_create_and_add() of exports failed\n");
+ return;
+ }
+
+ for_each_property_of_node(np, prop) {
+ if (!strcmp(prop->name, "name") || !strcmp(prop->name, "phandle"))
+ continue;
+
+ if (of_property_read_u64_array(np, prop->name, &vals[0], 2))
+ continue;
+
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+
+ if (attr == NULL) {
+ pr_warn("Failed kmalloc for bin_attribute!");
+ continue;
+ }
+
+ sysfs_bin_attr_init(attr);
+ attr->attr.name = kstrdup(prop->name, GFP_KERNEL);
+ attr->attr.mode = 0400;
+ attr->read = export_attr_read;
+ attr->private = __va(vals[0]);
+ attr->size = vals[1];
+
+ if (attr->attr.name == NULL) {
+ pr_warn("Failed kstrdup for bin_attribute attr.name");
+ kfree(attr);
+ continue;
+ }
+
+ rc = sysfs_create_bin_file(kobj, attr);
+ if (rc) {
+ pr_warn("Error %d creating OPAL sysfs exports/%s file\n",
+ rc, prop->name);
+ kfree(attr->attr.name);
+ kfree(attr);
+ }
+ }
+
+ of_node_put(np);
+}
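A hedged userspace sketch (hypothetical tool, not part of this patch) showing how one of these exported regions could be read back through the sysfs nodes created above; it has to run as root, since the nodes are mode 0400:

/* Hypothetical userspace sketch: dump one exported region to stdout. */
#include <stdio.h>

int main(int argc, char **argv)
{
	char path[256], buf[4096];
	size_t n;
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <export-name>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/sys/firmware/opal/exports/%s", argv[1]);

	f = fopen(path, "rb");	/* needs root: exports are mode 0400 */
	if (!f) {
		perror(path);
		return 1;
	}
	while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
		fwrite(buf, 1, n, stdout);
	fclose(f);
	return 0;
}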
+
+static void __init opal_dump_region_init(void)
+{
+ void *addr;
+ uint64_t size;
+ int rc;
+
+ if (!opal_check_token(OPAL_REGISTER_DUMP_REGION))
+ return;
+
+ /* Register kernel log buffer */
+ addr = log_buf_addr_get();
+ if (addr == NULL)
+ return;
+
+ size = log_buf_len_get();
+ if (size == 0)
+ return;
+
+ rc = opal_register_dump_region(OPAL_DUMP_REGION_LOG_BUF,
+ __pa(addr), size);
+ /* Don't warn if this is just an older OPAL that doesn't
+ * know about that call
+ */
+ if (rc && rc != OPAL_UNSUPPORTED)
+ pr_warn("DUMP: Failed to register kernel log buffer. "
+ "rc = %d\n", rc);
+}
+
+static void opal_pdev_init(const char *compatible)
+{
+ struct device_node *np;
+
+ for_each_compatible_node(np, NULL, compatible)
+ of_platform_device_create(np, NULL, NULL);
+}
+
+static void __init opal_imc_init_dev(void)
+{
+ struct device_node *np;
+
+ np = of_find_compatible_node(NULL, NULL, IMC_DTB_COMPAT);
+ if (np)
+ of_platform_device_create(np, NULL, NULL);
+}
+
+static int kopald(void *unused)
+{
+ unsigned long timeout = msecs_to_jiffies(opal_heartbeat) + 1;
+
+ set_freezable();
+ do {
+ try_to_freeze();
+
+ opal_handle_events();
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (opal_have_pending_events())
+ __set_current_state(TASK_RUNNING);
+ else
+ schedule_timeout(timeout);
+
+ } while (!kthread_should_stop());
+
+ return 0;
+}
+
+void opal_wake_poller(void)
+{
+ if (kopald_tsk)
+ wake_up_process(kopald_tsk);
+}
+
+static void opal_init_heartbeat(void)
+{
+ /* Old firmware: assume the HVC heartbeat is sufficient */
+ if (of_property_read_u32(opal_node, "ibm,heartbeat-ms",
+ &opal_heartbeat) != 0)
+ opal_heartbeat = 0;
+
+ if (opal_heartbeat)
+ kopald_tsk = kthread_run(kopald, NULL, "kopald");
+}
+
+static int __init opal_init(void)
+{
+ struct device_node *np, *consoles, *leds;
+ int rc;
+
+ opal_node = of_find_node_by_path("/ibm,opal");
+ if (!opal_node) {
+ pr_warn("Device node not found\n");
+ return -ENODEV;
+ }
+
+ /* Register OPAL consoles if any ports */
+ consoles = of_find_node_by_path("/ibm,opal/consoles");
+ if (consoles) {
+ for_each_child_of_node(consoles, np) {
+ if (strcmp(np->name, "serial"))
+ continue;
+ of_platform_device_create(np, NULL, NULL);
+ }
+ of_node_put(consoles);
+ }
+
+ /* Initialise OPAL messaging system */
+ opal_message_init();
+
+ /* Initialise OPAL asynchronous completion interface */
+ opal_async_comp_init();
+
+ /* Initialise OPAL sensor interface */
+ opal_sensor_init();
+
+ /* Initialise OPAL hypervisor maintenance interrupt handling */
+ opal_hmi_handler_init();
+
+ /* Create i2c platform devices */
+ opal_pdev_init("ibm,opal-i2c");
+
+ /* Handle non-volatile memory devices */
+ opal_pdev_init("pmem-region");
+
+ /* Set up a heartbeat thread if requested by OPAL */
+ opal_init_heartbeat();
+
+ /* Detect In-Memory Collection counters and create devices */
+ opal_imc_init_dev();
+
+ /* Create leds platform devices */
+ leds = of_find_node_by_path("/ibm,opal/leds");
+ if (leds) {
+ of_platform_device_create(leds, "opal_leds", NULL);
+ of_node_put(leds);
+ }
+
+ /* Initialise OPAL message log interface */
+ opal_msglog_init();
+
+ /* Create "opal" kobject under /sys/firmware */
+ rc = opal_sysfs_init();
+ if (rc == 0) {
+ /* Export symbol map to userspace */
+ opal_export_symmap();
+ /* Setup dump region interface */
+ opal_dump_region_init();
+ /* Setup error log interface */
+ rc = opal_elog_init();
+ /* Setup code update interface */
+ opal_flash_update_init();
+ /* Setup platform dump extract interface */
+ opal_platform_dump_init();
+ /* Setup system parameters interface */
+ opal_sys_param_init();
+ /* Setup message log sysfs interface. */
+ opal_msglog_sysfs_init();
+ }
+
+ /* Export all properties */
+ opal_export_attrs();
+
+ /* Initialize platform devices: IPMI backend, PRD & flash interface */
+ opal_pdev_init("ibm,opal-ipmi");
+ opal_pdev_init("ibm,opal-flash");
+ opal_pdev_init("ibm,opal-prd");
+
+ /* Initialise platform device: oppanel interface */
+ opal_pdev_init("ibm,opal-oppanel");
+
+ /* Initialise OPAL kmsg dumper for flushing console on panic */
+ opal_kmsg_init();
+
+ /* Initialise OPAL powercap interface */
+ opal_powercap_init();
+
+ /* Initialise OPAL Power-Shifting-Ratio interface */
+ opal_psr_init();
+
+ /* Initialise OPAL sensor groups */
+ opal_sensor_groups_init();
+
+ return 0;
+}
+machine_subsys_initcall(powernv, opal_init);
+
+void opal_shutdown(void)
+{
+ long rc = OPAL_BUSY;
+
+ opal_event_shutdown();
+
+ /*
+ * Then sync with OPAL, which ensures anything that can
+ * potentially write to our memory (such as an ongoing
+ * dump retrieval) has completed.
+ */
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_sync_host_reboot();
+ if (rc == OPAL_BUSY)
+ opal_poll_events(NULL);
+ else
+ mdelay(10);
+ }
+
+ /* Unregister memory dump region */
+ if (opal_check_token(OPAL_UNREGISTER_DUMP_REGION))
+ opal_unregister_dump_region(OPAL_DUMP_REGION_LOG_BUF);
+}
+
+/* Export this so that test modules can use it */
+EXPORT_SYMBOL_GPL(opal_invalid_call);
+EXPORT_SYMBOL_GPL(opal_xscom_read);
+EXPORT_SYMBOL_GPL(opal_xscom_write);
+EXPORT_SYMBOL_GPL(opal_ipmi_send);
+EXPORT_SYMBOL_GPL(opal_ipmi_recv);
+EXPORT_SYMBOL_GPL(opal_flash_read);
+EXPORT_SYMBOL_GPL(opal_flash_write);
+EXPORT_SYMBOL_GPL(opal_flash_erase);
+EXPORT_SYMBOL_GPL(opal_prd_msg);
+EXPORT_SYMBOL_GPL(opal_check_token);
+
+/* Convert a region of vmalloc memory to an opal sg list */
+struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr,
+ unsigned long vmalloc_size)
+{
+ struct opal_sg_list *sg, *first = NULL;
+ unsigned long i = 0;
+
+ sg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!sg)
+ goto nomem;
+
+ first = sg;
+
+ while (vmalloc_size > 0) {
+ uint64_t data = vmalloc_to_pfn(vmalloc_addr) << PAGE_SHIFT;
+ uint64_t length = min(vmalloc_size, PAGE_SIZE);
+
+ sg->entry[i].data = cpu_to_be64(data);
+ sg->entry[i].length = cpu_to_be64(length);
+ i++;
+
+ if (i >= SG_ENTRIES_PER_NODE) {
+ struct opal_sg_list *next;
+
+ next = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!next)
+ goto nomem;
+
+ sg->length = cpu_to_be64(
+ i * sizeof(struct opal_sg_entry) + 16);
+ i = 0;
+ sg->next = cpu_to_be64(__pa(next));
+ sg = next;
+ }
+
+ vmalloc_addr += length;
+ vmalloc_size -= length;
+ }
+
+ sg->length = cpu_to_be64(i * sizeof(struct opal_sg_entry) + 16);
+
+ return first;
+
+nomem:
+ pr_err("%s : Failed to allocate memory\n", __func__);
+ opal_free_sg_list(first);
+ return NULL;
+}
+
+void opal_free_sg_list(struct opal_sg_list *sg)
+{
+ while (sg) {
+ uint64_t next = be64_to_cpu(sg->next);
+
+ kfree(sg);
+
+ if (next)
+ sg = __va(next);
+ else
+ sg = NULL;
+ }
+}
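A hedged usage sketch of this pair of helpers (hypothetical caller, not part of this patch; the flash update code in this series does something similar with image data, and the actual OPAL call is elided here):

/* Hypothetical caller: hand a vmalloc'd buffer to OPAL as a scatter-gather list. */
#include <linux/vmalloc.h>
#include <linux/errno.h>
#include <asm/opal.h>

static int demo_send_blob(size_t size)
{
	void *buf = vmalloc(size);
	struct opal_sg_list *sg;

	if (!buf)
		return -ENOMEM;

	/* ... fill buf with the payload ... */

	sg = opal_vmalloc_to_sg_list(buf, size);
	if (!sg) {
		vfree(buf);
		return -ENOMEM;
	}

	/* Pass __pa(sg), the first node, to the relevant OPAL call here. */

	opal_free_sg_list(sg);
	vfree(buf);
	return 0;
}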
+
+int opal_error_code(int rc)
+{
+ switch (rc) {
+ case OPAL_SUCCESS: return 0;
+
+ case OPAL_PARAMETER: return -EINVAL;
+ case OPAL_ASYNC_COMPLETION: return -EINPROGRESS;
+ case OPAL_BUSY:
+ case OPAL_BUSY_EVENT: return -EBUSY;
+ case OPAL_NO_MEM: return -ENOMEM;
+ case OPAL_PERMISSION: return -EPERM;
+
+ case OPAL_UNSUPPORTED: return -EIO;
+ case OPAL_HARDWARE: return -EIO;
+ case OPAL_INTERNAL_ERROR: return -EIO;
+ case OPAL_TIMEOUT: return -ETIMEDOUT;
+ default:
+ pr_err("%s: unexpected OPAL error %d\n", __func__, rc);
+ return -EIO;
+ }
+}
+
+void powernv_set_nmmu_ptcr(unsigned long ptcr)
+{
+ int rc;
+
+ if (firmware_has_feature(FW_FEATURE_OPAL)) {
+ rc = opal_nmmu_set_ptcr(-1UL, ptcr);
+ if (rc != OPAL_SUCCESS && rc != OPAL_UNSUPPORTED)
+ pr_warn("%s: Unable to set nest mmu ptcr\n", __func__);
+ }
+}
+
+EXPORT_SYMBOL_GPL(opal_poll_events);
+EXPORT_SYMBOL_GPL(opal_rtc_read);
+EXPORT_SYMBOL_GPL(opal_rtc_write);
+EXPORT_SYMBOL_GPL(opal_tpo_read);
+EXPORT_SYMBOL_GPL(opal_tpo_write);
+EXPORT_SYMBOL_GPL(opal_i2c_request);
+/* Export these symbols for PowerNV LED class driver */
+EXPORT_SYMBOL_GPL(opal_leds_get_ind);
+EXPORT_SYMBOL_GPL(opal_leds_set_ind);
+/* Export this symbol for PowerNV Operator Panel class driver */
+EXPORT_SYMBOL_GPL(opal_write_oppanel_async);
+/* Export this for KVM */
+EXPORT_SYMBOL_GPL(opal_int_set_mfrr);
+EXPORT_SYMBOL_GPL(opal_int_eoi);
+EXPORT_SYMBOL_GPL(opal_error_code);
+/* Export the below symbol for NX compression */
+EXPORT_SYMBOL(opal_nx_coproc_init);
diff --git a/arch/powerpc/platforms/powernv/pci-cxl.c b/arch/powerpc/platforms/powernv/pci-cxl.c
new file mode 100644
index 000000000..1b1811145
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-cxl.c
@@ -0,0 +1,178 @@
+/*
+ * Copyright 2014-2016 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <asm/pnv-pci.h>
+#include <asm/opal.h>
+
+#include "pci.h"
+
+int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pnv_ioda_pe *pe;
+ int rc;
+
+ pe = pnv_ioda_get_pe(dev);
+ if (!pe)
+ return -ENODEV;
+
+ pe_info(pe, "Switching PHB to CXL\n");
+
+ rc = opal_pci_set_phb_cxl_mode(phb->opal_id, mode, pe->pe_number);
+ if (rc == OPAL_UNSUPPORTED)
+ dev_err(&dev->dev, "Required cxl mode not supported by firmware - update skiboot\n");
+ else if (rc)
+ dev_err(&dev->dev, "opal_pci_set_phb_cxl_mode failed: %i\n", rc);
+
+ return rc;
+}
+EXPORT_SYMBOL(pnv_phb_to_cxl_mode);
+
+/* Find the PHB for the cxl dev and allocate MSI hwirqs.
+ * Returns the absolute hardware IRQ number.
+ */
+int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ int hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, num);
+
+ if (hwirq < 0) {
+ dev_warn(&dev->dev, "Failed to find a free MSI\n");
+ return -ENOSPC;
+ }
+
+ return phb->msi_base + hwirq;
+}
+EXPORT_SYMBOL(pnv_cxl_alloc_hwirqs);
+
+void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+
+ msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, num);
+}
+EXPORT_SYMBOL(pnv_cxl_release_hwirqs);
+
+void pnv_cxl_release_hwirq_ranges(struct cxl_irq_ranges *irqs,
+ struct pci_dev *dev)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ int i, hwirq;
+
+ for (i = 1; i < CXL_IRQ_RANGES; i++) {
+ if (!irqs->range[i])
+ continue;
+ pr_devel("cxl release irq range 0x%x: offset: 0x%lx limit: %ld\n",
+ i, irqs->offset[i],
+ irqs->range[i]);
+ hwirq = irqs->offset[i] - phb->msi_base;
+ msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq,
+ irqs->range[i]);
+ }
+}
+EXPORT_SYMBOL(pnv_cxl_release_hwirq_ranges);
+
+int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs,
+ struct pci_dev *dev, int num)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ int i, hwirq, try;
+
+ memset(irqs, 0, sizeof(struct cxl_irq_ranges));
+
+ /* 0 is reserved for the multiplexed PSL DSI interrupt */
+ for (i = 1; i < CXL_IRQ_RANGES && num; i++) {
+ try = num;
+ while (try) {
+ hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, try);
+ if (hwirq >= 0)
+ break;
+ try /= 2;
+ }
+ if (!try)
+ goto fail;
+
+ irqs->offset[i] = phb->msi_base + hwirq;
+ irqs->range[i] = try;
+ pr_devel("cxl alloc irq range 0x%x: offset: 0x%lx limit: %li\n",
+ i, irqs->offset[i], irqs->range[i]);
+ num -= try;
+ }
+ if (num)
+ goto fail;
+
+ return 0;
+fail:
+ pnv_cxl_release_hwirq_ranges(irqs, dev);
+ return -ENOSPC;
+}
+EXPORT_SYMBOL(pnv_cxl_alloc_hwirq_ranges);
+
+int pnv_cxl_get_irq_count(struct pci_dev *dev)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+
+ return phb->msi_bmp.irq_count;
+}
+EXPORT_SYMBOL(pnv_cxl_get_irq_count);
+
+int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
+ unsigned int virq)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ unsigned int xive_num = hwirq - phb->msi_base;
+ struct pnv_ioda_pe *pe;
+ int rc;
+
+ if (!(pe = pnv_ioda_get_pe(dev)))
+ return -ENODEV;
+
+ /* Assign XIVE to PE */
+ rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
+ if (rc) {
+ pe_warn(pe, "%s: OPAL error %d setting msi_base 0x%x "
+ "hwirq 0x%x XIVE 0x%x PE\n",
+ pci_name(dev), rc, phb->msi_base, hwirq, xive_num);
+ return -EIO;
+ }
+ pnv_set_msi_irq_chip(phb, virq);
+
+ return 0;
+}
+EXPORT_SYMBOL(pnv_cxl_ioda_msi_setup);
+
+#if IS_MODULE(CONFIG_CXL)
+static inline int get_cxl_module(void)
+{
+ struct module *cxl_module;
+
+ mutex_lock(&module_mutex);
+
+ cxl_module = find_module("cxl");
+ if (cxl_module)
+ __module_get(cxl_module);
+
+ mutex_unlock(&module_mutex);
+
+ if (!cxl_module)
+ return -ENODEV;
+
+ return 0;
+}
+#else
+static inline int get_cxl_module(void) { return 0; }
+#endif
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
new file mode 100644
index 000000000..15a567128
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * TCE helpers for IODA PCI/PCIe on PowerNV platforms
+ *
+ * Copyright 2018 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/iommu.h>
+
+#include <asm/iommu.h>
+#include <asm/tce.h>
+#include "pci.h"
+
+void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
+ void *tce_mem, u64 tce_size,
+ u64 dma_offset, unsigned int page_shift)
+{
+ tbl->it_blocksize = 16;
+ tbl->it_base = (unsigned long)tce_mem;
+ tbl->it_page_shift = page_shift;
+ tbl->it_offset = dma_offset >> tbl->it_page_shift;
+ tbl->it_index = 0;
+ tbl->it_size = tce_size >> 3;
+ tbl->it_busno = 0;
+ tbl->it_type = TCE_PCI;
+}
+
+static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
+{
+ struct page *tce_mem = NULL;
+ __be64 *addr;
+
+ tce_mem = alloc_pages_node(nid, GFP_ATOMIC | __GFP_NOWARN,
+ shift - PAGE_SHIFT);
+ if (!tce_mem) {
+ pr_err("Failed to allocate a TCE memory, level shift=%d\n",
+ shift);
+ return NULL;
+ }
+ addr = page_address(tce_mem);
+ memset(addr, 0, 1UL << shift);
+
+ return addr;
+}
+
+static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
+ unsigned long size, unsigned int levels);
+
+static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
+{
+ __be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base;
+ int level = tbl->it_indirect_levels;
+ const long shift = ilog2(tbl->it_level_size);
+ unsigned long mask = (tbl->it_level_size - 1) << (level * shift);
+
+ while (level) {
+ int n = (idx & mask) >> (level * shift);
+ unsigned long oldtce, tce = be64_to_cpu(READ_ONCE(tmp[n]));
+
+ if (!tce) {
+ __be64 *tmp2;
+
+ if (!alloc)
+ return NULL;
+
+ tmp2 = pnv_alloc_tce_level(tbl->it_nid,
+ ilog2(tbl->it_level_size) + 3);
+ if (!tmp2)
+ return NULL;
+
+ tce = __pa(tmp2) | TCE_PCI_READ | TCE_PCI_WRITE;
+ oldtce = be64_to_cpu(cmpxchg(&tmp[n], 0,
+ cpu_to_be64(tce)));
+ if (oldtce) {
+ pnv_pci_ioda2_table_do_free_pages(tmp2,
+ ilog2(tbl->it_level_size) + 3, 1);
+ tce = oldtce;
+ }
+ }
+
+ tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
+ idx &= ~mask;
+ mask >>= shift;
+ --level;
+ }
+
+ return tmp + idx;
+}
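To make the walk in pnv_tce() above concrete, here is a small standalone sketch (plain C, invented geometry, not part of this patch) that splits a TCE index into per-level slots using the same shift-and-mask arithmetic:

/* Hypothetical sketch of the multi-level TCE index decomposition. */
#include <stdio.h>

int main(void)
{
	/* Invented geometry: 512 entries per level, 2 indirect levels. */
	unsigned long level_size = 512;		/* it_level_size */
	int levels = 2;				/* it_indirect_levels */
	unsigned long idx = 1234567;		/* TCE index within the table */

	int shift = 9;				/* ilog2(level_size) */
	unsigned long mask = (level_size - 1) << (levels * shift);

	for (int level = levels; level; level--) {
		unsigned long n = (idx & mask) >> (level * shift);

		printf("level %d: slot %lu\n", level, n);
		idx &= ~mask;
		mask >>= shift;
	}
	printf("leaf: entry %lu\n", idx);
	/* Prints: level 2 slot 4, level 1 slot 363, leaf entry 135. */
	return 0;
}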
+
+int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
+ unsigned long uaddr, enum dma_data_direction direction,
+ unsigned long attrs)
+{
+ u64 proto_tce = iommu_direction_to_tce_perm(direction);
+ u64 rpn = __pa(uaddr) >> tbl->it_page_shift;
+ long i;
+
+ if (proto_tce & TCE_PCI_WRITE)
+ proto_tce |= TCE_PCI_READ;
+
+ for (i = 0; i < npages; i++) {
+ unsigned long newtce = proto_tce |
+ ((rpn + i) << tbl->it_page_shift);
+ unsigned long idx = index - tbl->it_offset + i;
+
+ *(pnv_tce(tbl, false, idx, true)) = cpu_to_be64(newtce);
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_IOMMU_API
+int pnv_tce_xchg(struct iommu_table *tbl, long index,
+ unsigned long *hpa, enum dma_data_direction *direction,
+ bool alloc)
+{
+ u64 proto_tce = iommu_direction_to_tce_perm(*direction);
+ unsigned long newtce = *hpa | proto_tce, oldtce;
+ unsigned long idx = index - tbl->it_offset;
+ __be64 *ptce = NULL;
+
+ BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl));
+
+ if (*direction == DMA_NONE) {
+ ptce = pnv_tce(tbl, false, idx, false);
+ if (!ptce) {
+ *hpa = 0;
+ return 0;
+ }
+ }
+
+ if (!ptce) {
+ ptce = pnv_tce(tbl, false, idx, alloc);
+ if (!ptce)
+ return alloc ? H_HARDWARE : H_TOO_HARD;
+ }
+
+ if (newtce & TCE_PCI_WRITE)
+ newtce |= TCE_PCI_READ;
+
+ oldtce = be64_to_cpu(xchg(ptce, cpu_to_be64(newtce)));
+ *hpa = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
+ *direction = iommu_tce_direction(oldtce);
+
+ return 0;
+}
+
+__be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index, bool alloc)
+{
+ if (WARN_ON_ONCE(!tbl->it_userspace))
+ return NULL;
+
+ return pnv_tce(tbl, true, index - tbl->it_offset, alloc);
+}
+#endif
+
+void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
+{
+ long i;
+
+ for (i = 0; i < npages; i++) {
+ unsigned long idx = index - tbl->it_offset + i;
+ __be64 *ptce = pnv_tce(tbl, false, idx, false);
+
+ if (ptce)
+ *ptce = cpu_to_be64(0);
+ else
+ /* Skip the rest of the level */
+ i |= tbl->it_level_size - 1;
+ }
+}
+
+unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
+{
+ __be64 *ptce = pnv_tce(tbl, false, index - tbl->it_offset, false);
+
+ if (!ptce)
+ return 0;
+
+ return be64_to_cpu(*ptce);
+}
+
+static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
+ unsigned long size, unsigned int levels)
+{
+ const unsigned long addr_ul = (unsigned long) addr &
+ ~(TCE_PCI_READ | TCE_PCI_WRITE);
+
+ if (levels) {
+ long i;
+ u64 *tmp = (u64 *) addr_ul;
+
+ for (i = 0; i < size; ++i) {
+ unsigned long hpa = be64_to_cpu(tmp[i]);
+
+ if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE)))
+ continue;
+
+ pnv_pci_ioda2_table_do_free_pages(__va(hpa), size,
+ levels - 1);
+ }
+ }
+
+ free_pages(addr_ul, get_order(size << 3));
+}
+
+void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
+{
+ const unsigned long size = tbl->it_indirect_levels ?
+ tbl->it_level_size : tbl->it_size;
+
+ if (!tbl->it_size)
+ return;
+
+ pnv_pci_ioda2_table_do_free_pages((__be64 *)tbl->it_base, size,
+ tbl->it_indirect_levels);
+ if (tbl->it_userspace) {
+ pnv_pci_ioda2_table_do_free_pages(tbl->it_userspace, size,
+ tbl->it_indirect_levels);
+ }
+}
+
+static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned int shift,
+ unsigned int levels, unsigned long limit,
+ unsigned long *current_offset, unsigned long *total_allocated)
+{
+ __be64 *addr, *tmp;
+ unsigned long allocated = 1UL << shift;
+ unsigned int entries = 1UL << (shift - 3);
+ long i;
+
+ addr = pnv_alloc_tce_level(nid, shift);
+ *total_allocated += allocated;
+
+ --levels;
+ if (!levels) {
+ *current_offset += allocated;
+ return addr;
+ }
+
+ for (i = 0; i < entries; ++i) {
+ tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift,
+ levels, limit, current_offset, total_allocated);
+ if (!tmp)
+ break;
+
+ addr[i] = cpu_to_be64(__pa(tmp) |
+ TCE_PCI_READ | TCE_PCI_WRITE);
+
+ if (*current_offset >= limit)
+ break;
+ }
+
+ return addr;
+}
+
+long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
+ __u32 page_shift, __u64 window_size, __u32 levels,
+ bool alloc_userspace_copy, struct iommu_table *tbl)
+{
+ void *addr, *uas = NULL;
+ unsigned long offset = 0, level_shift, total_allocated = 0;
+ unsigned long total_allocated_uas = 0;
+ const unsigned int window_shift = ilog2(window_size);
+ unsigned int entries_shift = window_shift - page_shift;
+ unsigned int table_shift = max_t(unsigned int, entries_shift + 3,
+ PAGE_SHIFT);
+ const unsigned long tce_table_size = 1UL << table_shift;
+
+ if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
+ return -EINVAL;
+
+ if (!is_power_of_2(window_size))
+ return -EINVAL;
+
+ /* Adjust direct table size from window_size and levels */
+ entries_shift = (entries_shift + levels - 1) / levels;
+ level_shift = entries_shift + 3;
+ level_shift = max_t(unsigned int, level_shift, PAGE_SHIFT);
+
+ if ((level_shift - 3) * levels + page_shift >= 55)
+ return -EINVAL;
+
+ /* Allocate TCE table */
+ addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
+ 1, tce_table_size, &offset, &total_allocated);
+
+ /* addr==NULL means that the first level allocation failed */
+ if (!addr)
+ return -ENOMEM;
+
+ /*
+ * The first level was allocated but some lower level failed,
+ * so we did not allocate as much as we wanted;
+ * release the partially allocated table.
+ */
+ if (levels == 1 && offset < tce_table_size)
+ goto free_tces_exit;
+
+ /* Allocate userspace view of the TCE table */
+ if (alloc_userspace_copy) {
+ offset = 0;
+ uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
+ 1, tce_table_size, &offset,
+ &total_allocated_uas);
+ if (!uas)
+ goto free_tces_exit;
+ if (levels == 1 && (offset < tce_table_size ||
+ total_allocated_uas != total_allocated))
+ goto free_uas_exit;
+ }
+
+ /* Setup linux iommu table */
+ pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset,
+ page_shift);
+ tbl->it_level_size = 1ULL << (level_shift - 3);
+ tbl->it_indirect_levels = levels - 1;
+ tbl->it_userspace = uas;
+ tbl->it_nid = nid;
+
+ pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n",
+ window_size, tce_table_size, bus_offset, tbl->it_base,
+ tbl->it_userspace, 1, levels);
+
+ return 0;
+
+free_uas_exit:
+ pnv_pci_ioda2_table_do_free_pages(uas,
+ 1ULL << (level_shift - 3), levels - 1);
+free_tces_exit:
+ pnv_pci_ioda2_table_do_free_pages(addr,
+ 1ULL << (level_shift - 3), levels - 1);
+
+ return -ENOMEM;
+}
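As a worked example of the sizing math above, a standalone sketch (plain C, invented window parameters, not part of this patch) that reproduces how table_shift and level_shift come out for one plausible configuration:

/* Hypothetical sketch: TCE table geometry for a 2 GiB window. */
#include <stdio.h>

int main(void)
{
	/* Invented inputs: 2 GiB DMA window, 64 KiB IOMMU pages, 2 levels,
	 * and 64 KiB kernel pages (PAGE_SHIFT = 16). */
	unsigned int window_shift = 31;		/* ilog2(2 GiB window) */
	unsigned int page_shift = 16, levels = 2, kernel_page_shift = 16;

	unsigned int entries_shift = window_shift - page_shift;	/* 15 */
	unsigned int table_shift = entries_shift + 3;
	if (table_shift < kernel_page_shift)
		table_shift = kernel_page_shift;			/* 18 */

	entries_shift = (entries_shift + levels - 1) / levels;		/* 8 */
	unsigned int level_shift = entries_shift + 3;			/* 11 */
	if (level_shift < kernel_page_shift)
		level_shift = kernel_page_shift;			/* 16 */

	printf("flat TCE table size: %u bytes\n", 1u << table_shift);	/* 262144 */
	printf("per-level allocation: %u bytes (%u entries)\n",
	       1u << level_shift, 1u << (level_shift - 3));		/* 65536, 8192 */
	return 0;
}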
+
+static void pnv_iommu_table_group_link_free(struct rcu_head *head)
+{
+ struct iommu_table_group_link *tgl = container_of(head,
+ struct iommu_table_group_link, rcu);
+
+ kfree(tgl);
+}
+
+void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
+ struct iommu_table_group *table_group)
+{
+ long i;
+ bool found;
+ struct iommu_table_group_link *tgl;
+
+ if (!tbl || !table_group)
+ return;
+
+ /* Remove link to a group from table's list of attached groups */
+ found = false;
+ list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
+ if (tgl->table_group == table_group) {
+ list_del_rcu(&tgl->next);
+ call_rcu(&tgl->rcu, pnv_iommu_table_group_link_free);
+ found = true;
+ break;
+ }
+ }
+ if (WARN_ON(!found))
+ return;
+
+ /* Clean a pointer to iommu_table in iommu_table_group::tables[] */
+ found = false;
+ for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+ if (table_group->tables[i] == tbl) {
+ table_group->tables[i] = NULL;
+ found = true;
+ break;
+ }
+ }
+ WARN_ON(!found);
+}
+
+long pnv_pci_link_table_and_group(int node, int num,
+ struct iommu_table *tbl,
+ struct iommu_table_group *table_group)
+{
+ struct iommu_table_group_link *tgl = NULL;
+
+ if (WARN_ON(!tbl || !table_group))
+ return -EINVAL;
+
+ tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL,
+ node);
+ if (!tgl)
+ return -ENOMEM;
+
+ tgl->table_group = table_group;
+ list_add_rcu(&tgl->next, &tbl->it_group_list);
+
+ table_group->tables[num] = tbl;
+
+ return 0;
+}
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
new file mode 100644
index 000000000..ecd211c5f
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -0,0 +1,4090 @@
+/*
+ * Support PCI/PCIe on PowerNV platforms
+ *
+ * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/crash_dump.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/irq.h>
+#include <linux/io.h>
+#include <linux/msi.h>
+#include <linux/memblock.h>
+#include <linux/iommu.h>
+#include <linux/rculist.h>
+#include <linux/sizes.h>
+
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/msi_bitmap.h>
+#include <asm/ppc-pci.h>
+#include <asm/opal.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+#include <asm/xics.h>
+#include <asm/debugfs.h>
+#include <asm/firmware.h>
+#include <asm/pnv-pci.h>
+#include <asm/mmzone.h>
+
+#include <misc/cxl-base.h>
+
+#include "powernv.h"
+#include "pci.h"
+#include "../../../../drivers/pci/pci.h"
+
+#define PNV_IODA1_M64_NUM 16 /* Number of M64 BARs */
+#define PNV_IODA1_M64_SEGS 8 /* Segments per M64 BAR */
+#define PNV_IODA1_DMA32_SEGSIZE 0x10000000
+
+static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK",
+ "NPU_OCAPI" };
+
+void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ char pfix[32];
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ if (pe->flags & PNV_IODA_PE_DEV)
+ strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
+ else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
+ sprintf(pfix, "%04x:%02x ",
+ pci_domain_nr(pe->pbus), pe->pbus->number);
+#ifdef CONFIG_PCI_IOV
+ else if (pe->flags & PNV_IODA_PE_VF)
+ sprintf(pfix, "%04x:%02x:%2x.%d",
+ pci_domain_nr(pe->parent_dev->bus),
+ (pe->rid & 0xff00) >> 8,
+ PCI_SLOT(pe->rid), PCI_FUNC(pe->rid));
+#endif /* CONFIG_PCI_IOV*/
+
+ printk("%spci %s: [PE# %.2x] %pV",
+ level, pfix, pe->pe_number, &vaf);
+
+ va_end(args);
+}
+
+static bool pnv_iommu_bypass_disabled __read_mostly;
+static bool pci_reset_phbs __read_mostly;
+
+static int __init iommu_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ while (*str) {
+ if (!strncmp(str, "nobypass", 8)) {
+ pnv_iommu_bypass_disabled = true;
+ pr_info("PowerNV: IOMMU bypass window disabled.\n");
+ break;
+ }
+ str += strcspn(str, ",");
+ if (*str == ',')
+ str++;
+ }
+
+ return 0;
+}
+early_param("iommu", iommu_setup);
+
+static int __init pci_reset_phbs_setup(char *str)
+{
+ pci_reset_phbs = true;
+ return 0;
+}
+
+early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup);
+
+static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r)
+{
+ /*
+ * WARNING: We cannot rely on the resource flags. The Linux PCI
+ * allocation code sometimes decides to put a 64-bit prefetchable
+ * BAR in the 32-bit window, so we have to compare the addresses.
+ *
+ * For simplicity we only test resource start.
+ */
+ return (r->start >= phb->ioda.m64_base &&
+ r->start < (phb->ioda.m64_base + phb->ioda.m64_size));
+}
+
+static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags)
+{
+ unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
+
+ return (resource_flags & flags) == flags;
+}
+
+static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
+{
+ s64 rc;
+
+ phb->ioda.pe_array[pe_no].phb = phb;
+ phb->ioda.pe_array[pe_no].pe_number = pe_no;
+
+ /*
+ * Clear the PE frozen state as it might be put into frozen state
+ * in the last PCI remove path. It's not harmful to do so when the
+ * PE is already in unfrozen state.
+ */
+ rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
+ OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+ if (rc != OPAL_SUCCESS && rc != OPAL_UNSUPPORTED)
+ pr_warn("%s: Error %lld unfreezing PHB#%x-PE#%x\n",
+ __func__, rc, phb->hose->global_number, pe_no);
+
+ return &phb->ioda.pe_array[pe_no];
+}
+
+static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
+{
+ if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe_num)) {
+ pr_warn("%s: Invalid PE %x on PHB#%x\n",
+ __func__, pe_no, phb->hose->global_number);
+ return;
+ }
+
+ if (test_and_set_bit(pe_no, phb->ioda.pe_alloc))
+ pr_debug("%s: PE %x was reserved on PHB#%x\n",
+ __func__, pe_no, phb->hose->global_number);
+
+ pnv_ioda_init_pe(phb, pe_no);
+}
+
+static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
+{
+ long pe;
+
+ for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) {
+ if (!test_and_set_bit(pe, phb->ioda.pe_alloc))
+ return pnv_ioda_init_pe(phb, pe);
+ }
+
+ return NULL;
+}
+
+static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
+{
+ struct pnv_phb *phb = pe->phb;
+ unsigned int pe_num = pe->pe_number;
+
+ WARN_ON(pe->pdev);
+
+ memset(pe, 0, sizeof(struct pnv_ioda_pe));
+ clear_bit(pe_num, phb->ioda.pe_alloc);
+}
+
+/* The default M64 BAR is shared by all PEs */
+static int pnv_ioda2_init_m64(struct pnv_phb *phb)
+{
+ const char *desc;
+ struct resource *r;
+ s64 rc;
+
+ /* Configure the default M64 BAR */
+ rc = opal_pci_set_phb_mem_window(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE,
+ phb->ioda.m64_bar_idx,
+ phb->ioda.m64_base,
+ 0, /* unused */
+ phb->ioda.m64_size);
+ if (rc != OPAL_SUCCESS) {
+ desc = "configuring";
+ goto fail;
+ }
+
+ /* Enable the default M64 BAR */
+ rc = opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE,
+ phb->ioda.m64_bar_idx,
+ OPAL_ENABLE_M64_SPLIT);
+ if (rc != OPAL_SUCCESS) {
+ desc = "enabling";
+ goto fail;
+ }
+
+ /*
+ * Exclude the segments for reserved and root bus PE, which
+ * are first or last two PEs.
+ */
+ r = &phb->hose->mem_resources[1];
+ if (phb->ioda.reserved_pe_idx == 0)
+ r->start += (2 * phb->ioda.m64_segsize);
+ else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
+ r->end -= (2 * phb->ioda.m64_segsize);
+ else
+ pr_warn(" Cannot strip M64 segment for reserved PE#%x\n",
+ phb->ioda.reserved_pe_idx);
+
+ return 0;
+
+fail:
+ pr_warn(" Failure %lld %s M64 BAR#%d\n",
+ rc, desc, phb->ioda.m64_bar_idx);
+ opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE,
+ phb->ioda.m64_bar_idx,
+ OPAL_DISABLE_M64);
+ return -EIO;
+}
+
+static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
+ unsigned long *pe_bitmap)
+{
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct resource *r;
+ resource_size_t base, sgsz, start, end;
+ int segno, i;
+
+ base = phb->ioda.m64_base;
+ sgsz = phb->ioda.m64_segsize;
+ for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
+ r = &pdev->resource[i];
+ if (!r->parent || !pnv_pci_is_m64(phb, r))
+ continue;
+
+ start = _ALIGN_DOWN(r->start - base, sgsz);
+ end = _ALIGN_UP(r->end - base, sgsz);
+ for (segno = start / sgsz; segno < end / sgsz; segno++) {
+ if (pe_bitmap)
+ set_bit(segno, pe_bitmap);
+ else
+ pnv_ioda_reserve_pe(phb, segno);
+ }
+ }
+}
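A standalone sketch (plain C, invented addresses, not part of this patch) of the segment arithmetic above: align the BAR's offset within the M64 window down and up to the segment size, then reserve each covered segment number:

/* Hypothetical sketch: which M64 segments does a BAR cover? */
#include <stdio.h>

int main(void)
{
	/* Invented numbers: M64 window split into 256 MiB segments. */
	unsigned long long base  = 0x200000000000ULL;	/* m64_base */
	unsigned long long sgsz  = 256ULL << 20;	/* m64_segsize */
	unsigned long long rstart = base + 0x30000000;	/* BAR start (768 MiB in) */
	unsigned long long rend   = base + 0x4fffffff;	/* BAR end */

	unsigned long long lo = (rstart - base) / sgsz * sgsz;		  /* ALIGN_DOWN */
	unsigned long long hi = ((rend - base) + sgsz - 1) / sgsz * sgsz; /* ALIGN_UP */

	for (unsigned long long seg = lo / sgsz; seg < hi / sgsz; seg++)
		printf("reserve PE/segment %llu\n", seg);	/* prints 3 and 4 */
	return 0;
}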
+
+static int pnv_ioda1_init_m64(struct pnv_phb *phb)
+{
+ struct resource *r;
+ int index;
+
+ /*
+ * There are 16 M64 BARs, each of which has 8 segments. So
+ * there are as many M64 segments as the maximum number of
+ * PEs, which is 128.
+ */
+ for (index = 0; index < PNV_IODA1_M64_NUM; index++) {
+ unsigned long base, segsz = phb->ioda.m64_segsize;
+ int64_t rc;
+
+ base = phb->ioda.m64_base +
+ index * PNV_IODA1_M64_SEGS * segsz;
+ rc = opal_pci_set_phb_mem_window(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE, index, base, 0,
+ PNV_IODA1_M64_SEGS * segsz);
+ if (rc != OPAL_SUCCESS) {
+ pr_warn(" Error %lld setting M64 PHB#%x-BAR#%d\n",
+ rc, phb->hose->global_number, index);
+ goto fail;
+ }
+
+ rc = opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE, index,
+ OPAL_ENABLE_M64_SPLIT);
+ if (rc != OPAL_SUCCESS) {
+ pr_warn(" Error %lld enabling M64 PHB#%x-BAR#%d\n",
+ rc, phb->hose->global_number, index);
+ goto fail;
+ }
+ }
+
+ /*
+ * Exclude the segments for reserved and root bus PE, which
+ * are first or last two PEs.
+ */
+ r = &phb->hose->mem_resources[1];
+ if (phb->ioda.reserved_pe_idx == 0)
+ r->start += (2 * phb->ioda.m64_segsize);
+ else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
+ r->end -= (2 * phb->ioda.m64_segsize);
+ else
+ WARN(1, "Wrong reserved PE#%x on PHB#%x\n",
+ phb->ioda.reserved_pe_idx, phb->hose->global_number);
+
+ return 0;
+
+fail:
+ for ( ; index >= 0; index--)
+ opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64);
+
+ return -EIO;
+}
+
+static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
+ unsigned long *pe_bitmap,
+ bool all)
+{
+ struct pci_dev *pdev;
+
+ list_for_each_entry(pdev, &bus->devices, bus_list) {
+ pnv_ioda_reserve_dev_m64_pe(pdev, pe_bitmap);
+
+ if (all && pdev->subordinate)
+ pnv_ioda_reserve_m64_pe(pdev->subordinate,
+ pe_bitmap, all);
+ }
+}
+
+static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
+{
+ struct pci_controller *hose = pci_bus_to_host(bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pnv_ioda_pe *master_pe, *pe;
+ unsigned long size, *pe_alloc;
+ int i;
+
+ /* Root bus shouldn't use M64 */
+ if (pci_is_root_bus(bus))
+ return NULL;
+
+ /* Allocate bitmap */
+ size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
+ pe_alloc = kzalloc(size, GFP_KERNEL);
+ if (!pe_alloc) {
+ pr_warn("%s: Out of memory !\n",
+ __func__);
+ return NULL;
+ }
+
+ /* Figure out reserved PE numbers by the PE */
+ pnv_ioda_reserve_m64_pe(bus, pe_alloc, all);
+
+ /*
+ * The current bus might not own an M64 window; it may all be
+ * contributed by its child buses. In that case, we needn't
+ * pick an M64-dependent PE#.
+ */
+ if (bitmap_empty(pe_alloc, phb->ioda.total_pe_num)) {
+ kfree(pe_alloc);
+ return NULL;
+ }
+
+ /*
+ * Figure out the master PE and put all slave PEs to master
+ * PE's list to form compound PE.
+ */
+ master_pe = NULL;
+ i = -1;
+ while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe_num, i + 1)) <
+ phb->ioda.total_pe_num) {
+ pe = &phb->ioda.pe_array[i];
+
+ phb->ioda.m64_segmap[pe->pe_number] = pe->pe_number;
+ if (!master_pe) {
+ pe->flags |= PNV_IODA_PE_MASTER;
+ INIT_LIST_HEAD(&pe->slaves);
+ master_pe = pe;
+ } else {
+ pe->flags |= PNV_IODA_PE_SLAVE;
+ pe->master = master_pe;
+ list_add_tail(&pe->list, &master_pe->slaves);
+ }
+
+ /*
+ * P7IOC supports M64DT, which helps map an M64 segment
+ * to one particular PE#. However, PHB3 has a fixed mapping
+ * between M64 segment and PE#. In order to have the same logic
+ * for P7IOC and PHB3, we enforce a fixed mapping between M64
+ * segment and PE# on P7IOC.
+ */
+ if (phb->type == PNV_PHB_IODA1) {
+ int64_t rc;
+
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+ pe->pe_number, OPAL_M64_WINDOW_TYPE,
+ pe->pe_number / PNV_IODA1_M64_SEGS,
+ pe->pe_number % PNV_IODA1_M64_SEGS);
+ if (rc != OPAL_SUCCESS)
+ pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n",
+ __func__, rc, phb->hose->global_number,
+ pe->pe_number);
+ }
+ }
+
+ kfree(pe_alloc);
+ return master_pe;
+}
+
+static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
+{
+ struct pci_controller *hose = phb->hose;
+ struct device_node *dn = hose->dn;
+ struct resource *res;
+ u32 m64_range[2], i;
+ const __be32 *r;
+ u64 pci_addr;
+
+ if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) {
+ pr_info(" Not support M64 window\n");
+ return;
+ }
+
+ if (!firmware_has_feature(FW_FEATURE_OPAL)) {
+ pr_info(" Firmware too old to support M64 window\n");
+ return;
+ }
+
+ r = of_get_property(dn, "ibm,opal-m64-window", NULL);
+ if (!r) {
+ pr_info(" No <ibm,opal-m64-window> on %pOF\n",
+ dn);
+ return;
+ }
+
+ /*
+ * Find the available M64 BAR range and pick the last one to
+ * cover the whole 64-bit space. We support only one range.
+ */
+ if (of_property_read_u32_array(dn, "ibm,opal-available-m64-ranges",
+ m64_range, 2)) {
+ /* In absence of the property, assume 0..15 */
+ m64_range[0] = 0;
+ m64_range[1] = 16;
+ }
+ /* We only support 64 bits in our allocator */
+ if (m64_range[1] > 63) {
+ pr_warn("%s: Limiting M64 range to 63 (from %d) on PHB#%x\n",
+ __func__, m64_range[1], phb->hose->global_number);
+ m64_range[1] = 63;
+ }
+ /* Empty range, no m64 */
+ if (m64_range[1] <= m64_range[0]) {
+ pr_warn("%s: M64 empty, disabling M64 usage on PHB#%x\n",
+ __func__, phb->hose->global_number);
+ return;
+ }
+
+ /* Configure M64 information */
+ res = &hose->mem_resources[1];
+ res->name = dn->full_name;
+ res->start = of_translate_address(dn, r + 2);
+ res->end = res->start + of_read_number(r + 4, 2) - 1;
+ res->flags = (IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
+ pci_addr = of_read_number(r, 2);
+ hose->mem_offset[1] = res->start - pci_addr;
+
+ phb->ioda.m64_size = resource_size(res);
+ phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe_num;
+ phb->ioda.m64_base = pci_addr;
+
+ /* This lines up nicely with the display from processing OF ranges */
+ pr_info(" MEM 0x%016llx..0x%016llx -> 0x%016llx (M64 #%d..%d)\n",
+ res->start, res->end, pci_addr, m64_range[0],
+ m64_range[0] + m64_range[1] - 1);
+
+ /* Mark all M64 used up by default */
+ phb->ioda.m64_bar_alloc = (unsigned long)-1;
+
+ /* Use last M64 BAR to cover M64 window */
+ m64_range[1]--;
+ phb->ioda.m64_bar_idx = m64_range[0] + m64_range[1];
+
+ pr_info(" Using M64 #%d as default window\n", phb->ioda.m64_bar_idx);
+
+ /* Mark remaining ones free */
+ for (i = m64_range[0]; i < m64_range[1]; i++)
+ clear_bit(i, &phb->ioda.m64_bar_alloc);
+
+ /*
+ * Set up init functions for M64 based on the IODA version; IODA3 uses
+ * the IODA2 code.
+ */
+ if (phb->type == PNV_PHB_IODA1)
+ phb->init_m64 = pnv_ioda1_init_m64;
+ else
+ phb->init_m64 = pnv_ioda2_init_m64;
+ phb->reserve_m64_pe = pnv_ioda_reserve_m64_pe;
+ phb->pick_m64_pe = pnv_ioda_pick_m64_pe;
+}
+
+static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
+{
+ struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
+ struct pnv_ioda_pe *slave;
+ s64 rc;
+
+ /* Fetch master PE */
+ if (pe->flags & PNV_IODA_PE_SLAVE) {
+ pe = pe->master;
+ if (WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)))
+ return;
+
+ pe_no = pe->pe_number;
+ }
+
+ /* Freeze master PE */
+ rc = opal_pci_eeh_freeze_set(phb->opal_id,
+ pe_no,
+ OPAL_EEH_ACTION_SET_FREEZE_ALL);
+ if (rc != OPAL_SUCCESS) {
+ pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
+ __func__, rc, phb->hose->global_number, pe_no);
+ return;
+ }
+
+ /* Freeze slave PEs */
+ if (!(pe->flags & PNV_IODA_PE_MASTER))
+ return;
+
+ list_for_each_entry(slave, &pe->slaves, list) {
+ rc = opal_pci_eeh_freeze_set(phb->opal_id,
+ slave->pe_number,
+ OPAL_EEH_ACTION_SET_FREEZE_ALL);
+ if (rc != OPAL_SUCCESS)
+ pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
+ __func__, rc, phb->hose->global_number,
+ slave->pe_number);
+ }
+}
+
+static int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt)
+{
+ struct pnv_ioda_pe *pe, *slave;
+ s64 rc;
+
+ /* Find master PE */
+ pe = &phb->ioda.pe_array[pe_no];
+ if (pe->flags & PNV_IODA_PE_SLAVE) {
+ pe = pe->master;
+ WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
+ pe_no = pe->pe_number;
+ }
+
+ /* Clear frozen state for master PE */
+ rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, opt);
+ if (rc != OPAL_SUCCESS) {
+ pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
+ __func__, rc, opt, phb->hose->global_number, pe_no);
+ return -EIO;
+ }
+
+ if (!(pe->flags & PNV_IODA_PE_MASTER))
+ return 0;
+
+ /* Clear frozen state for slave PEs */
+ list_for_each_entry(slave, &pe->slaves, list) {
+ rc = opal_pci_eeh_freeze_clear(phb->opal_id,
+ slave->pe_number,
+ opt);
+ if (rc != OPAL_SUCCESS) {
+ pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
+ __func__, rc, opt, phb->hose->global_number,
+ slave->pe_number);
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
+{
+ struct pnv_ioda_pe *slave, *pe;
+ u8 fstate = 0, state;
+ __be16 pcierr = 0;
+ s64 rc;
+
+ /* Sanity check on PE number */
+ if (pe_no < 0 || pe_no >= phb->ioda.total_pe_num)
+ return OPAL_EEH_STOPPED_PERM_UNAVAIL;
+
+ /*
+ * Fetch the master PE; the PE instance might not be
+ * initialized yet.
+ */
+ pe = &phb->ioda.pe_array[pe_no];
+ if (pe->flags & PNV_IODA_PE_SLAVE) {
+ pe = pe->master;
+ WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
+ pe_no = pe->pe_number;
+ }
+
+ /* Check the master PE */
+ rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
+ &state, &pcierr, NULL);
+ if (rc != OPAL_SUCCESS) {
+		pr_warn("%s: Failure %lld getting PHB#%x-PE#%x state\n",
+			__func__, rc,
+			phb->hose->global_number, pe_no);
+ return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
+ }
+
+ /* Check the slave PE */
+ if (!(pe->flags & PNV_IODA_PE_MASTER))
+ return state;
+
+ list_for_each_entry(slave, &pe->slaves, list) {
+ rc = opal_pci_eeh_freeze_status(phb->opal_id,
+ slave->pe_number,
+ &fstate,
+ &pcierr,
+ NULL);
+ if (rc != OPAL_SUCCESS) {
+			pr_warn("%s: Failure %lld getting PHB#%x-PE#%x state\n",
+				__func__, rc,
+				phb->hose->global_number, slave->pe_number);
+ return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
+ }
+
+ /*
+ * Override the result based on the ascending
+ * priority.
+ */
+ if (fstate > state)
+ state = fstate;
+ }
+
+ return state;
+}
+
+/* Currently these two are only used when MSIs are enabled. This will change,
+ * but in the meantime we need to protect them to avoid warnings.
+ */
+#ifdef CONFIG_PCI_MSI
+struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_dn *pdn = pci_get_pdn(dev);
+
+ if (!pdn)
+ return NULL;
+ if (pdn->pe_number == IODA_INVALID_PE)
+ return NULL;
+ return &phb->ioda.pe_array[pdn->pe_number];
+}
+#endif /* CONFIG_PCI_MSI */
+
+static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
+ struct pnv_ioda_pe *parent,
+ struct pnv_ioda_pe *child,
+ bool is_add)
+{
+ const char *desc = is_add ? "adding" : "removing";
+ uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
+ OPAL_REMOVE_PE_FROM_DOMAIN;
+ struct pnv_ioda_pe *slave;
+ long rc;
+
+ /* Parent PE affects child PE */
+ rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
+ child->pe_number, op);
+ if (rc != OPAL_SUCCESS) {
+ pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
+ rc, desc);
+ return -ENXIO;
+ }
+
+ if (!(child->flags & PNV_IODA_PE_MASTER))
+ return 0;
+
+ /* Compound case: parent PE affects slave PEs */
+ list_for_each_entry(slave, &child->slaves, list) {
+ rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
+ slave->pe_number, op);
+ if (rc != OPAL_SUCCESS) {
+ pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
+ rc, desc);
+ return -ENXIO;
+ }
+ }
+
+ return 0;
+}
+
+static int pnv_ioda_set_peltv(struct pnv_phb *phb,
+ struct pnv_ioda_pe *pe,
+ bool is_add)
+{
+ struct pnv_ioda_pe *slave;
+ struct pci_dev *pdev = NULL;
+ int ret;
+
+	/*
+	 * Clear the PE's frozen state. If it's a master PE, we need
+	 * to clear the frozen state of its slave PEs as well.
+	 */
+ if (is_add) {
+ opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
+ OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+ if (pe->flags & PNV_IODA_PE_MASTER) {
+ list_for_each_entry(slave, &pe->slaves, list)
+ opal_pci_eeh_freeze_clear(phb->opal_id,
+ slave->pe_number,
+ OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+ }
+ }
+
+	/*
+	 * Associate the PE in the PELT. We need to add the PE into the
+	 * corresponding PELT-V as well; otherwise, an error originating
+	 * from the PE might propagate to other PEs.
+	 */
+ ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
+ if (ret)
+ return ret;
+
+ /* For compound PEs, any one affects all of them */
+ if (pe->flags & PNV_IODA_PE_MASTER) {
+ list_for_each_entry(slave, &pe->slaves, list) {
+ ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
+ if (ret)
+ return ret;
+ }
+ }
+
+ if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
+ pdev = pe->pbus->self;
+ else if (pe->flags & PNV_IODA_PE_DEV)
+ pdev = pe->pdev->bus->self;
+#ifdef CONFIG_PCI_IOV
+ else if (pe->flags & PNV_IODA_PE_VF)
+ pdev = pe->parent_dev;
+#endif /* CONFIG_PCI_IOV */
+ while (pdev) {
+ struct pci_dn *pdn = pci_get_pdn(pdev);
+ struct pnv_ioda_pe *parent;
+
+ if (pdn && pdn->pe_number != IODA_INVALID_PE) {
+ parent = &phb->ioda.pe_array[pdn->pe_number];
+ ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
+ if (ret)
+ return ret;
+ }
+
+ pdev = pdev->bus->self;
+ }
+
+ return 0;
+}
+
+static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
+{
+ struct pci_dev *parent;
+ uint8_t bcomp, dcomp, fcomp;
+ int64_t rc;
+ long rid_end, rid;
+
+	/* Currently, we just deconfigure VF PEs. Bus PEs will always be there. */
+ if (pe->pbus) {
+ int count;
+
+ dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
+ fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
+ parent = pe->pbus->self;
+ if (pe->flags & PNV_IODA_PE_BUS_ALL)
+ count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
+ else
+ count = 1;
+
+ switch(count) {
+ case 1: bcomp = OpalPciBusAll; break;
+ case 2: bcomp = OpalPciBus7Bits; break;
+ case 4: bcomp = OpalPciBus6Bits; break;
+ case 8: bcomp = OpalPciBus5Bits; break;
+ case 16: bcomp = OpalPciBus4Bits; break;
+ case 32: bcomp = OpalPciBus3Bits; break;
+ default:
+ dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
+ count);
+ /* Do an exact match only */
+ bcomp = OpalPciBusAll;
+ }
+ rid_end = pe->rid + (count << 8);
+ } else {
+#ifdef CONFIG_PCI_IOV
+ if (pe->flags & PNV_IODA_PE_VF)
+ parent = pe->parent_dev;
+ else
+#endif
+ parent = pe->pdev->bus->self;
+ bcomp = OpalPciBusAll;
+ dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
+ fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
+ rid_end = pe->rid + 1;
+ }
+
+ /* Clear the reverse map */
+ for (rid = pe->rid; rid < rid_end; rid++)
+ phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
+
+ /* Release from all parents PELT-V */
+ while (parent) {
+ struct pci_dn *pdn = pci_get_pdn(parent);
+ if (pdn && pdn->pe_number != IODA_INVALID_PE) {
+ rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
+ pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
+ /* XXX What to do in case of error ? */
+ }
+ parent = parent->bus->self;
+ }
+
+ opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
+ OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+
+ /* Disassociate PE in PELT */
+ rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
+ pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
+ if (rc)
+ pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
+ rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
+ bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
+ if (rc)
+ pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
+
+ pe->pbus = NULL;
+ pe->pdev = NULL;
+#ifdef CONFIG_PCI_IOV
+ pe->parent_dev = NULL;
+#endif
+
+ return 0;
+}
+
+static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
+{
+ struct pci_dev *parent;
+ uint8_t bcomp, dcomp, fcomp;
+ long rc, rid_end, rid;
+
+ /* Bus validation ? */
+ if (pe->pbus) {
+ int count;
+
+ dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
+ fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
+ parent = pe->pbus->self;
+ if (pe->flags & PNV_IODA_PE_BUS_ALL)
+ count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
+ else
+ count = 1;
+
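+		/*
+		 * The bus "compare" mode tells the PHB how many high bits of
+		 * the bus number to match: e.g. OpalPciBus3Bits compares only
+		 * the top 3 bits and thus covers 32 buses, while OpalPciBusAll
+		 * matches the full bus number (a single bus).
+		 */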
+ switch(count) {
+ case 1: bcomp = OpalPciBusAll; break;
+ case 2: bcomp = OpalPciBus7Bits; break;
+ case 4: bcomp = OpalPciBus6Bits; break;
+ case 8: bcomp = OpalPciBus5Bits; break;
+ case 16: bcomp = OpalPciBus4Bits; break;
+ case 32: bcomp = OpalPciBus3Bits; break;
+ default:
+ dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
+ count);
+ /* Do an exact match only */
+ bcomp = OpalPciBusAll;
+ }
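+		/* Each bus spans 256 RIDs (8-bit devfn), hence count << 8 */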
+ rid_end = pe->rid + (count << 8);
+ } else {
+#ifdef CONFIG_PCI_IOV
+ if (pe->flags & PNV_IODA_PE_VF)
+ parent = pe->parent_dev;
+ else
+#endif /* CONFIG_PCI_IOV */
+ parent = pe->pdev->bus->self;
+ bcomp = OpalPciBusAll;
+ dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
+ fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
+ rid_end = pe->rid + 1;
+ }
+
+	/*
+	 * Associate the PE in the PELT. We need to add the PE into the
+	 * corresponding PELT-V as well; otherwise, an error originating
+	 * from the PE might propagate to other PEs.
+	 */
+ rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
+ bcomp, dcomp, fcomp, OPAL_MAP_PE);
+ if (rc) {
+ pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
+ return -ENXIO;
+ }
+
+ /*
+ * Configure PELTV. NPUs don't have a PELTV table so skip
+ * configuration on them.
+ */
+ if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI)
+ pnv_ioda_set_peltv(phb, pe, true);
+
+ /* Setup reverse map */
+ for (rid = pe->rid; rid < rid_end; rid++)
+ phb->ioda.pe_rmap[rid] = pe->pe_number;
+
+	/* Set up an MVE on IODA1 */
+ if (phb->type != PNV_PHB_IODA1) {
+ pe->mve_number = 0;
+ goto out;
+ }
+
+ pe->mve_number = pe->pe_number;
+ rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number);
+ if (rc != OPAL_SUCCESS) {
+ pe_err(pe, "OPAL error %ld setting up MVE %x\n",
+ rc, pe->mve_number);
+ pe->mve_number = -1;
+ } else {
+ rc = opal_pci_set_mve_enable(phb->opal_id,
+ pe->mve_number, OPAL_ENABLE_MVE);
+ if (rc) {
+ pe_err(pe, "OPAL error %ld enabling MVE %x\n",
+ rc, pe->mve_number);
+ pe->mve_number = -1;
+ }
+ }
+
+out:
+ return 0;
+}
+
+#ifdef CONFIG_PCI_IOV
+static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
+{
+ struct pci_dn *pdn = pci_get_pdn(dev);
+ int i;
+ struct resource *res, res2;
+ resource_size_t size;
+ u16 num_vfs;
+
+ if (!dev->is_physfn)
+ return -EINVAL;
+
+ /*
+ * "offset" is in VFs. The M64 windows are sized so that when they
+ * are segmented, each segment is the same size as the IOV BAR.
+ * Each segment is in a separate PE, and the high order bits of the
+ * address are the PE number. Therefore, each VF's BAR is in a
+ * separate PE, and changing the IOV BAR start address changes the
+ * range of PEs the VFs are in.
+ */
+ num_vfs = pdn->num_vfs;
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &dev->resource[i + PCI_IOV_RESOURCES];
+ if (!res->flags || !res->parent)
+ continue;
+
+		/*
+		 * The actual IOV BAR range is determined by the start address
+		 * and the size needed for num_vfs VF BARs. This check makes
+		 * sure that, after shifting, the range will not overlap with
+		 * another device.
+		 */
+ size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+ res2.flags = res->flags;
+ res2.start = res->start + (size * offset);
+ res2.end = res2.start + (size * num_vfs) - 1;
+
+ if (res2.end > res->end) {
+ dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
+ i, &res2, res, num_vfs, offset);
+ return -EBUSY;
+ }
+ }
+
+	/*
+	 * Since the M64 BAR shares segments among all possible 256 PEs,
+	 * we have to shift the beginning of the PF IOV BAR so that it starts
+	 * from the segment belonging to the PE number assigned to the first
+	 * VF. This creates a "hole" in /proc/iomem which could otherwise be
+	 * used for allocating other resources, so we reserve this area below
+	 * and release it when IOV is disabled.
+	 */
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &dev->resource[i + PCI_IOV_RESOURCES];
+ if (!res->flags || !res->parent)
+ continue;
+
+ size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+ res2 = *res;
+ res->start += size * offset;
+
+ dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
+ i, &res2, res, (offset > 0) ? "En" : "Dis",
+ num_vfs, offset);
+
+ if (offset < 0) {
+ devm_release_resource(&dev->dev, &pdn->holes[i]);
+ memset(&pdn->holes[i], 0, sizeof(pdn->holes[i]));
+ }
+
+ pci_update_resource(dev, i + PCI_IOV_RESOURCES);
+
+ if (offset > 0) {
+ pdn->holes[i].start = res2.start;
+ pdn->holes[i].end = res2.start + size * offset - 1;
+ pdn->holes[i].flags = IORESOURCE_BUS;
+ pdn->holes[i].name = "pnv_iov_reserved";
+ devm_request_resource(&dev->dev, res->parent,
+ &pdn->holes[i]);
+ }
+ }
+ return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+
+static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_dn *pdn = pci_get_pdn(dev);
+ struct pnv_ioda_pe *pe;
+
+ if (!pdn) {
+ pr_err("%s: Device tree node not associated properly\n",
+ pci_name(dev));
+ return NULL;
+ }
+ if (pdn->pe_number != IODA_INVALID_PE)
+ return NULL;
+
+ pe = pnv_ioda_alloc_pe(phb);
+ if (!pe) {
+ pr_warn("%s: Not enough PE# available, disabling device\n",
+ pci_name(dev));
+ return NULL;
+ }
+
+	/* NOTE: We take only one ref to the pci_dev, for the pdn, not for the
+	 * pointer in the PE data structure; both should be destroyed at the
+	 * same time. However, this needs to be looked at more closely again
+	 * once we actually start removing things (hotplug, SR-IOV, ...).
+	 *
+	 * At some point we want to remove the pdn completely anyway.
+	 */
+ pci_dev_get(dev);
+ pdn->pe_number = pe->pe_number;
+ pe->flags = PNV_IODA_PE_DEV;
+ pe->pdev = dev;
+ pe->pbus = NULL;
+ pe->mve_number = -1;
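+	/* The RID is the bus number in bits 15:8 and the devfn in bits 7:0 */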
+ pe->rid = dev->bus->number << 8 | pdn->devfn;
+
+ pe_info(pe, "Associated device to PE\n");
+
+ if (pnv_ioda_configure_pe(phb, pe)) {
+ /* XXX What do we do here ? */
+ pnv_ioda_free_pe(pe);
+ pdn->pe_number = IODA_INVALID_PE;
+ pe->pdev = NULL;
+ pci_dev_put(dev);
+ return NULL;
+ }
+
+ /* Put PE to the list */
+ list_add_tail(&pe->list, &phb->ioda.pe_list);
+
+ return pe;
+}
+
+static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
+{
+ struct pci_dev *dev;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ struct pci_dn *pdn = pci_get_pdn(dev);
+
+ if (pdn == NULL) {
+ pr_warn("%s: No device node associated with device !\n",
+ pci_name(dev));
+ continue;
+ }
+
+		/*
+		 * In the partial hotplug case, the PCI device might still be
+		 * associated with the PE and needn't be attached to it
+		 * again.
+		 */
+ if (pdn->pe_number != IODA_INVALID_PE)
+ continue;
+
+ pe->device_count++;
+ pdn->pe_number = pe->pe_number;
+ if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
+ pnv_ioda_setup_same_PE(dev->subordinate, pe);
+ }
+}
+
+/*
+ * There are two types of PCI-bus-sensitive PEs: one comprises a single
+ * PCI bus, the other contains the primary PCI bus plus its subordinate
+ * PCI devices and buses. The second type of PE is normally created by a
+ * PCIe-to-PCI bridge or a PLX switch downstream port.
+ */
+static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
+{
+ struct pci_controller *hose = pci_bus_to_host(bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pnv_ioda_pe *pe = NULL;
+ unsigned int pe_num;
+
+	/*
+	 * In the partial hotplug case, the PE instance might still be alive.
+	 * We should reuse it instead of allocating a new one.
+	 */
+ pe_num = phb->ioda.pe_rmap[bus->number << 8];
+ if (pe_num != IODA_INVALID_PE) {
+ pe = &phb->ioda.pe_array[pe_num];
+ pnv_ioda_setup_same_PE(bus, pe);
+ return NULL;
+ }
+
+ /* PE number for root bus should have been reserved */
+ if (pci_is_root_bus(bus) &&
+ phb->ioda.root_pe_idx != IODA_INVALID_PE)
+ pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx];
+
+ /* Check if PE is determined by M64 */
+ if (!pe && phb->pick_m64_pe)
+ pe = phb->pick_m64_pe(bus, all);
+
+ /* The PE number isn't pinned by M64 */
+ if (!pe)
+ pe = pnv_ioda_alloc_pe(phb);
+
+ if (!pe) {
+ pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n",
+ __func__, pci_domain_nr(bus), bus->number);
+ return NULL;
+ }
+
+ pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
+ pe->pbus = bus;
+ pe->pdev = NULL;
+ pe->mve_number = -1;
+ pe->rid = bus->busn_res.start << 8;
+
+ if (all)
+ pe_info(pe, "Secondary bus %d..%d associated with PE#%x\n",
+ bus->busn_res.start, bus->busn_res.end, pe->pe_number);
+ else
+ pe_info(pe, "Secondary bus %d associated with PE#%x\n",
+ bus->busn_res.start, pe->pe_number);
+
+ if (pnv_ioda_configure_pe(phb, pe)) {
+ /* XXX What do we do here ? */
+ pnv_ioda_free_pe(pe);
+ pe->pbus = NULL;
+ return NULL;
+ }
+
+ /* Associate it with all child devices */
+ pnv_ioda_setup_same_PE(bus, pe);
+
+ /* Put PE to the list */
+ list_add_tail(&pe->list, &phb->ioda.pe_list);
+
+ return pe;
+}
+
+static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
+{
+ int pe_num, found_pe = false, rc;
+ long rid;
+ struct pnv_ioda_pe *pe;
+ struct pci_dev *gpu_pdev;
+ struct pci_dn *npu_pdn;
+ struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+
+	/*
+	 * Due to a hardware erratum, PE#0 on the NPU is reserved for
+	 * error handling. This means we only have three PEs remaining,
+	 * which need to be assigned to four links, implying that some
+	 * links must share PEs.
+	 *
+	 * To achieve this we assign PEs such that NPUs linking the
+	 * same GPU get assigned the same PE.
+	 */
+ gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev);
+ for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
+ pe = &phb->ioda.pe_array[pe_num];
+ if (!pe->pdev)
+ continue;
+
+ if (pnv_pci_get_gpu_dev(pe->pdev) == gpu_pdev) {
+ /*
+ * This device has the same peer GPU so should
+ * be assigned the same PE as the existing
+ * peer NPU.
+ */
+ dev_info(&npu_pdev->dev,
+ "Associating to existing PE %x\n", pe_num);
+ pci_dev_get(npu_pdev);
+ npu_pdn = pci_get_pdn(npu_pdev);
+ rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
+ npu_pdn->pe_number = pe_num;
+ phb->ioda.pe_rmap[rid] = pe->pe_number;
+
+ /* Map the PE to this link */
+ rc = opal_pci_set_pe(phb->opal_id, pe_num, rid,
+ OpalPciBusAll,
+ OPAL_COMPARE_RID_DEVICE_NUMBER,
+ OPAL_COMPARE_RID_FUNCTION_NUMBER,
+ OPAL_MAP_PE);
+ WARN_ON(rc != OPAL_SUCCESS);
+ found_pe = true;
+ break;
+ }
+ }
+
+ if (!found_pe)
+ /*
+ * Could not find an existing PE so allocate a new
+ * one.
+ */
+ return pnv_ioda_setup_dev_PE(npu_pdev);
+ else
+ return pe;
+}
+
+static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
+{
+ struct pci_dev *pdev;
+
+ list_for_each_entry(pdev, &bus->devices, bus_list)
+ pnv_ioda_setup_npu_PE(pdev);
+}
+
+static void pnv_pci_ioda_setup_PEs(void)
+{
+ struct pci_controller *hose, *tmp;
+ struct pnv_phb *phb;
+ struct pci_bus *bus;
+ struct pci_dev *pdev;
+
+ list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+ phb = hose->private_data;
+ if (phb->type == PNV_PHB_NPU_NVLINK) {
+ /* PE#0 is needed for error reporting */
+ pnv_ioda_reserve_pe(phb, 0);
+ pnv_ioda_setup_npu_PEs(hose->bus);
+ if (phb->model == PNV_PHB_MODEL_NPU2)
+ pnv_npu2_init(phb);
+ }
+ if (phb->type == PNV_PHB_NPU_OCAPI) {
+ bus = hose->bus;
+ list_for_each_entry(pdev, &bus->devices, bus_list)
+ pnv_ioda_setup_dev_PE(pdev);
+ }
+ }
+}
+
+#ifdef CONFIG_PCI_IOV
+static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pci_dn *pdn;
+ int i, j;
+ int m64_bars;
+
+ bus = pdev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+ pdn = pci_get_pdn(pdev);
+
+ if (pdn->m64_single_mode)
+ m64_bars = num_vfs;
+ else
+ m64_bars = 1;
+
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
+ for (j = 0; j < m64_bars; j++) {
+ if (pdn->m64_map[j][i] == IODA_INVALID_M64)
+ continue;
+ opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
+ clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
+ pdn->m64_map[j][i] = IODA_INVALID_M64;
+ }
+
+ kfree(pdn->m64_map);
+ return 0;
+}
+
+static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pci_dn *pdn;
+ unsigned int win;
+ struct resource *res;
+ int i, j;
+ int64_t rc;
+ int total_vfs;
+ resource_size_t size, start;
+ int pe_num;
+ int m64_bars;
+
+ bus = pdev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+ pdn = pci_get_pdn(pdev);
+ total_vfs = pci_sriov_get_totalvfs(pdev);
+
+ if (pdn->m64_single_mode)
+ m64_bars = num_vfs;
+ else
+ m64_bars = 1;
+
+ pdn->m64_map = kmalloc_array(m64_bars,
+ sizeof(*pdn->m64_map),
+ GFP_KERNEL);
+ if (!pdn->m64_map)
+ return -ENOMEM;
+ /* Initialize the m64_map to IODA_INVALID_M64 */
+	for (i = 0; i < m64_bars; i++)
+ for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
+ pdn->m64_map[i][j] = IODA_INVALID_M64;
+
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &pdev->resource[i + PCI_IOV_RESOURCES];
+ if (!res->flags || !res->parent)
+ continue;
+
+ for (j = 0; j < m64_bars; j++) {
+ do {
+ win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
+ phb->ioda.m64_bar_idx + 1, 0);
+
+ if (win >= phb->ioda.m64_bar_idx + 1)
+ goto m64_failed;
+ } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
+
+ pdn->m64_map[j][i] = win;
+
+ if (pdn->m64_single_mode) {
+ size = pci_iov_resource_size(pdev,
+ PCI_IOV_RESOURCES + i);
+ start = res->start + size * j;
+ } else {
+ size = resource_size(res);
+ start = res->start;
+ }
+
+ /* Map the M64 here */
+ if (pdn->m64_single_mode) {
+ pe_num = pdn->pe_num_map[j];
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+ pe_num, OPAL_M64_WINDOW_TYPE,
+ pdn->m64_map[j][i], 0);
+ }
+
+ rc = opal_pci_set_phb_mem_window(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE,
+ pdn->m64_map[j][i],
+ start,
+ 0, /* unused */
+ size);
+
+ if (rc != OPAL_SUCCESS) {
+ dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
+ win, rc);
+ goto m64_failed;
+ }
+
+ if (pdn->m64_single_mode)
+ rc = opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
+ else
+ rc = opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);
+
+ if (rc != OPAL_SUCCESS) {
+ dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
+ win, rc);
+ goto m64_failed;
+ }
+ }
+ }
+ return 0;
+
+m64_failed:
+ pnv_pci_vf_release_m64(pdev, num_vfs);
+ return -EBUSY;
+}
+
+static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
+ int num);
+
+static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
+{
+ struct iommu_table *tbl;
+ int64_t rc;
+
+ tbl = pe->table_group.tables[0];
+ rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
+ if (rc)
+ pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
+
+ pnv_pci_ioda2_set_bypass(pe, false);
+ if (pe->table_group.group) {
+ iommu_group_put(pe->table_group.group);
+ BUG_ON(pe->table_group.group);
+ }
+ iommu_tce_table_put(tbl);
+}
+
+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pnv_ioda_pe *pe, *pe_n;
+ struct pci_dn *pdn;
+
+ bus = pdev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+ pdn = pci_get_pdn(pdev);
+
+ if (!pdev->is_physfn)
+ return;
+
+ list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
+ if (pe->parent_dev != pdev)
+ continue;
+
+ pnv_pci_ioda2_release_dma_pe(pdev, pe);
+
+ /* Remove from list */
+ mutex_lock(&phb->ioda.pe_list_mutex);
+ list_del(&pe->list);
+ mutex_unlock(&phb->ioda.pe_list_mutex);
+
+ pnv_ioda_deconfigure_pe(phb, pe);
+
+ pnv_ioda_free_pe(pe);
+ }
+}
+
+void pnv_pci_sriov_disable(struct pci_dev *pdev)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pnv_ioda_pe *pe;
+ struct pci_dn *pdn;
+ u16 num_vfs, i;
+
+ bus = pdev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+ pdn = pci_get_pdn(pdev);
+ num_vfs = pdn->num_vfs;
+
+ /* Release VF PEs */
+ pnv_ioda_release_vf_PE(pdev);
+
+ if (phb->type == PNV_PHB_IODA2) {
+ if (!pdn->m64_single_mode)
+ pnv_pci_vf_resource_shift(pdev, -*pdn->pe_num_map);
+
+ /* Release M64 windows */
+ pnv_pci_vf_release_m64(pdev, num_vfs);
+
+ /* Release PE numbers */
+ if (pdn->m64_single_mode) {
+ for (i = 0; i < num_vfs; i++) {
+ if (pdn->pe_num_map[i] == IODA_INVALID_PE)
+ continue;
+
+ pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
+ pnv_ioda_free_pe(pe);
+ }
+ } else
+ bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
+ /* Releasing pe_num_map */
+ kfree(pdn->pe_num_map);
+ }
+}
+
+static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+ struct pnv_ioda_pe *pe);
+static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pnv_ioda_pe *pe;
+ int pe_num;
+ u16 vf_index;
+ struct pci_dn *pdn;
+
+ bus = pdev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+ pdn = pci_get_pdn(pdev);
+
+ if (!pdev->is_physfn)
+ return;
+
+ /* Reserve PE for each VF */
+ for (vf_index = 0; vf_index < num_vfs; vf_index++) {
+ int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index);
+ int vf_bus = pci_iov_virtfn_bus(pdev, vf_index);
+ struct pci_dn *vf_pdn;
+
+ if (pdn->m64_single_mode)
+ pe_num = pdn->pe_num_map[vf_index];
+ else
+ pe_num = *pdn->pe_num_map + vf_index;
+
+ pe = &phb->ioda.pe_array[pe_num];
+ pe->pe_number = pe_num;
+ pe->phb = phb;
+ pe->flags = PNV_IODA_PE_VF;
+ pe->pbus = NULL;
+ pe->parent_dev = pdev;
+ pe->mve_number = -1;
+ pe->rid = (vf_bus << 8) | vf_devfn;
+
+ pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
+ hose->global_number, pdev->bus->number,
+ PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num);
+
+ if (pnv_ioda_configure_pe(phb, pe)) {
+ /* XXX What do we do here ? */
+ pnv_ioda_free_pe(pe);
+ pe->pdev = NULL;
+ continue;
+ }
+
+ /* Put PE to the list */
+ mutex_lock(&phb->ioda.pe_list_mutex);
+ list_add_tail(&pe->list, &phb->ioda.pe_list);
+ mutex_unlock(&phb->ioda.pe_list_mutex);
+
+		/* Associate this PE with its pdn */
+ list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) {
+ if (vf_pdn->busno == vf_bus &&
+ vf_pdn->devfn == vf_devfn) {
+ vf_pdn->pe_number = pe_num;
+ break;
+ }
+ }
+
+ pnv_pci_ioda2_setup_dma_pe(phb, pe);
+ }
+}
+
+int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pnv_ioda_pe *pe;
+ struct pci_dn *pdn;
+ int ret;
+ u16 i;
+
+ bus = pdev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+ pdn = pci_get_pdn(pdev);
+
+ if (phb->type == PNV_PHB_IODA2) {
+ if (!pdn->vfs_expanded) {
+			dev_info(&pdev->dev, "don't support this SRIOV device"
+				" with non-64bit-prefetchable IOV BAR\n");
+ return -ENOSPC;
+ }
+
+		/*
+		 * When the M64 BARs function in Single PE mode, the number of
+		 * VFs that can be enabled must be less than the number of M64
+		 * BARs.
+		 */
+ if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) {
+ dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
+ return -EBUSY;
+ }
+
+ /* Allocating pe_num_map */
+ if (pdn->m64_single_mode)
+ pdn->pe_num_map = kmalloc_array(num_vfs,
+ sizeof(*pdn->pe_num_map),
+ GFP_KERNEL);
+ else
+ pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL);
+
+ if (!pdn->pe_num_map)
+ return -ENOMEM;
+
+ if (pdn->m64_single_mode)
+ for (i = 0; i < num_vfs; i++)
+ pdn->pe_num_map[i] = IODA_INVALID_PE;
+
+ /* Calculate available PE for required VFs */
+ if (pdn->m64_single_mode) {
+ for (i = 0; i < num_vfs; i++) {
+ pe = pnv_ioda_alloc_pe(phb);
+ if (!pe) {
+ ret = -EBUSY;
+ goto m64_failed;
+ }
+
+ pdn->pe_num_map[i] = pe->pe_number;
+ }
+ } else {
+ mutex_lock(&phb->ioda.pe_alloc_mutex);
+ *pdn->pe_num_map = bitmap_find_next_zero_area(
+ phb->ioda.pe_alloc, phb->ioda.total_pe_num,
+ 0, num_vfs, 0);
+ if (*pdn->pe_num_map >= phb->ioda.total_pe_num) {
+ mutex_unlock(&phb->ioda.pe_alloc_mutex);
+ dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
+ kfree(pdn->pe_num_map);
+ return -EBUSY;
+ }
+ bitmap_set(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
+ mutex_unlock(&phb->ioda.pe_alloc_mutex);
+ }
+ pdn->num_vfs = num_vfs;
+
+ /* Assign M64 window accordingly */
+ ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
+ if (ret) {
+ dev_info(&pdev->dev, "Not enough M64 window resources\n");
+ goto m64_failed;
+ }
+
+		/*
+		 * When using one M64 BAR to map one IOV BAR, we need to shift
+		 * the IOV BAR according to the PE# allocated to the VFs.
+		 * Otherwise, the PE# for the VFs will conflict with other PEs.
+		 */
+ if (!pdn->m64_single_mode) {
+ ret = pnv_pci_vf_resource_shift(pdev, *pdn->pe_num_map);
+ if (ret)
+ goto m64_failed;
+ }
+ }
+
+ /* Setup VF PEs */
+ pnv_ioda_setup_vf_PE(pdev, num_vfs);
+
+ return 0;
+
+m64_failed:
+ if (pdn->m64_single_mode) {
+ for (i = 0; i < num_vfs; i++) {
+ if (pdn->pe_num_map[i] == IODA_INVALID_PE)
+ continue;
+
+ pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
+ pnv_ioda_free_pe(pe);
+ }
+ } else
+ bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
+
+ /* Releasing pe_num_map */
+ kfree(pdn->pe_num_map);
+
+ return ret;
+}
+
+int pnv_pcibios_sriov_disable(struct pci_dev *pdev)
+{
+ pnv_pci_sriov_disable(pdev);
+
+ /* Release PCI data */
+ remove_dev_pci_data(pdev);
+ return 0;
+}
+
+int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+ /* Allocate PCI data */
+ add_dev_pci_data(pdev);
+
+ return pnv_pci_sriov_enable(pdev, num_vfs);
+}
+#endif /* CONFIG_PCI_IOV */
+
+static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)
+{
+ struct pci_dn *pdn = pci_get_pdn(pdev);
+ struct pnv_ioda_pe *pe;
+
+	/*
+	 * This function can be called before the PE# has been
+	 * assigned. Do nothing in that case.
+	 */
+ if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+ return;
+
+ pe = &phb->ioda.pe_array[pdn->pe_number];
+ WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
+ set_dma_offset(&pdev->dev, pe->tce_bypass_base);
+ set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
+	/*
+	 * Note: iommu_add_device() would fail here:
+	 * for a physical PE, the device has already been added by now;
+	 * for a virtual PE, sysfs entries are not ready yet and
+	 * tce_iommu_bus_notifier will add the device to a group later.
+	 */
+}
+
+static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe)
+{
+ unsigned short vendor = 0;
+ struct pci_dev *pdev;
+
+ if (pe->device_count == 1)
+ return true;
+
+ /* pe->pdev should be set if it's a single device, pe->pbus if not */
+ if (!pe->pbus)
+ return true;
+
+ list_for_each_entry(pdev, &pe->pbus->devices, bus_list) {
+ if (!vendor) {
+ vendor = pdev->vendor;
+ continue;
+ }
+
+ if (pdev->vendor != vendor)
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Reconfigure TVE#0 to be usable as 64-bit DMA space.
+ *
+ * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses.
+ * Devices can only access more than that if bit 59 of the PCI address is set
+ * by hardware, which indicates TVE#1 should be used instead of TVE#0.
+ * Many PCI devices are not capable of addressing that many bits, and as a
+ * result are limited to the 4GB of virtual memory made available to 32-bit
+ * devices in TVE#0.
+ *
+ * In order to work around this, reconfigure TVE#0 to be suitable for 64-bit
+ * devices by configuring the virtual memory past the first 4GB inaccessible
+ * by 64-bit DMAs. This should only be used by devices that want more than
+ * 4GB, and only on PEs that have no 32-bit devices.
+ *
+ * Currently this will only work on PHB3 (POWER8).
+ */
+static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
+{
+ u64 window_size, table_size, tce_count, addr;
+ struct page *table_pages;
+ u64 tce_order = 28; /* 256MB TCEs */
+ __be64 *tces;
+ s64 rc;
+
+	/*
+	 * The window size needs to be a power of two, and it must account for
+	 * shifting memory by the 4GB offset required to skip the 32-bit space.
+	 */
+ window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32));
+ tce_count = window_size >> tce_order;
+ table_size = tce_count << 3;
+
+ if (table_size < PAGE_SIZE)
+ table_size = PAGE_SIZE;
+
+ table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL,
+ get_order(table_size));
+ if (!table_pages)
+ goto err;
+
+ tces = page_address(table_pages);
+ if (!tces)
+ goto err;
+
+ memset(tces, 0, table_size);
+
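+	/*
+	 * Direct-map all of memory starting at bus address 4GB: the TCE entry
+	 * for bus address (addr + 4GB) points at physical addr, with read
+	 * and write permission.
+	 */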
+ for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) {
+ tces[(addr + (1ULL << 32)) >> tce_order] =
+ cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
+ }
+
+ rc = opal_pci_map_pe_dma_window(pe->phb->opal_id,
+ pe->pe_number,
+ /* reconfigure window 0 */
+ (pe->pe_number << 1) + 0,
+ 1,
+ __pa(tces),
+ table_size,
+ 1 << tce_order);
+ if (rc == OPAL_SUCCESS) {
+ pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
+ return 0;
+ }
+err:
+ pe_err(pe, "Error configuring 64-bit DMA bypass\n");
+ return -EIO;
+}
+
+static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
+{
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_dn *pdn = pci_get_pdn(pdev);
+ struct pnv_ioda_pe *pe;
+ uint64_t top;
+ bool bypass = false;
+ s64 rc;
+
+ if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+ return -ENODEV;
+
+ pe = &phb->ioda.pe_array[pdn->pe_number];
+ if (pe->tce_bypass_enabled) {
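+		/*
+		 * Bypass is only usable if the device's DMA mask covers the
+		 * whole bypass window (bypass base + all of system RAM).
+		 */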
+ top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
+ bypass = (dma_mask >= top);
+ }
+
+ if (bypass) {
+ dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n");
+ set_dma_ops(&pdev->dev, &dma_nommu_ops);
+ } else {
+ /*
+ * If the device can't set the TCE bypass bit but still wants
+ * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
+ * bypass the 32-bit region and be usable for 64-bit DMAs.
+ * The device needs to be able to address all of this space.
+ */
+ if (dma_mask >> 32 &&
+ dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
+ pnv_pci_ioda_pe_single_vendor(pe) &&
+ phb->model == PNV_PHB_MODEL_PHB3) {
+ /* Configure the bypass mode */
+ rc = pnv_pci_ioda_dma_64bit_bypass(pe);
+ if (rc)
+ return rc;
+ /* 4GB offset bypasses 32-bit space */
+ set_dma_offset(&pdev->dev, (1ULL << 32));
+ set_dma_ops(&pdev->dev, &dma_nommu_ops);
+ } else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) {
+ /*
+ * Fail the request if a DMA mask between 32 and 64 bits
+ * was requested but couldn't be fulfilled. Ideally we
+ * would do this for 64-bits but historically we have
+ * always fallen back to 32-bits.
+ */
+ return -ENOMEM;
+ } else {
+ dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
+ set_dma_ops(&pdev->dev, &dma_iommu_ops);
+ }
+ }
+ *pdev->dev.dma_mask = dma_mask;
+
+ /* Update peer npu devices */
+ pnv_npu_try_dma_set_bypass(pdev, bypass);
+
+ return 0;
+}
+
+static u64 pnv_pci_ioda_dma_get_required_mask(struct pci_dev *pdev)
+{
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_dn *pdn = pci_get_pdn(pdev);
+ struct pnv_ioda_pe *pe;
+ u64 end, mask;
+
+ if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+ return 0;
+
+ pe = &phb->ioda.pe_array[pdn->pe_number];
+ if (!pe->tce_bypass_enabled)
+ return __dma_get_required_mask(&pdev->dev);
+
+
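+	/*
+	 * Return a mask that covers the highest bypass address: take the
+	 * top bit of (bypass base + end of RAM) and extend it into an
+	 * all-ones mask.
+	 */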
+ end = pe->tce_bypass_base + memblock_end_of_DRAM();
+ mask = 1ULL << (fls64(end) - 1);
+ mask += mask - 1;
+
+ return mask;
+}
+
+static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
+ struct pci_bus *bus,
+ bool add_to_group)
+{
+ struct pci_dev *dev;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
+ set_dma_offset(&dev->dev, pe->tce_bypass_base);
+ if (add_to_group)
+ iommu_add_device(&dev->dev);
+
+ if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
+ pnv_ioda_setup_bus_dma(pe, dev->subordinate,
+ add_to_group);
+ }
+}
+
+static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb,
+ bool real_mode)
+{
+ return real_mode ? (__be64 __iomem *)(phb->regs_phys + 0x210) :
+ (phb->regs + 0x210);
+}
+
+static void pnv_pci_p7ioc_tce_invalidate(struct iommu_table *tbl,
+ unsigned long index, unsigned long npages, bool rm)
+{
+ struct iommu_table_group_link *tgl = list_first_entry_or_null(
+ &tbl->it_group_list, struct iommu_table_group_link,
+ next);
+ struct pnv_ioda_pe *pe = container_of(tgl->table_group,
+ struct pnv_ioda_pe, table_group);
+ __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
+ unsigned long start, end, inc;
+
+ start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
+ end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset +
+ npages - 1);
+
+ /* p7ioc-style invalidation, 2 TCEs per write */
+ start |= (1ull << 63);
+ end |= (1ull << 63);
+ inc = 16;
+ end |= inc - 1; /* round up end to be different than start */
+
+ mb(); /* Ensure above stores are visible */
+ while (start <= end) {
+ if (rm)
+ __raw_rm_writeq_be(start, invalidate);
+ else
+ __raw_writeq_be(start, invalidate);
+
+ start += inc;
+ }
+
+	/*
+	 * The IOMMU layer will do another mb() for us on build(),
+	 * and we don't care about it on free().
+	 */
+}
+
+static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
+ long npages, unsigned long uaddr,
+ enum dma_data_direction direction,
+ unsigned long attrs)
+{
+ int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
+ attrs);
+
+ if (!ret)
+ pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
+
+ return ret;
+}
+
+#ifdef CONFIG_IOMMU_API
+static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
+ unsigned long *hpa, enum dma_data_direction *direction)
+{
+ long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
+
+ if (!ret)
+ pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false);
+
+ return ret;
+}
+
+static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index,
+ unsigned long *hpa, enum dma_data_direction *direction)
+{
+ long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
+
+ if (!ret)
+ pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true);
+
+ return ret;
+}
+#endif
+
+static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
+ long npages)
+{
+ pnv_tce_free(tbl, index, npages);
+
+ pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
+}
+
+static struct iommu_table_ops pnv_ioda1_iommu_ops = {
+ .set = pnv_ioda1_tce_build,
+#ifdef CONFIG_IOMMU_API
+ .exchange = pnv_ioda1_tce_xchg,
+ .exchange_rm = pnv_ioda1_tce_xchg_rm,
+ .useraddrptr = pnv_tce_useraddrptr,
+#endif
+ .clear = pnv_ioda1_tce_free,
+ .get = pnv_tce_get,
+};
+
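+/* TCE Kill register command bits (PPC_BIT uses MSB-0 bit numbering) */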
+#define PHB3_TCE_KILL_INVAL_ALL PPC_BIT(0)
+#define PHB3_TCE_KILL_INVAL_PE PPC_BIT(1)
+#define PHB3_TCE_KILL_INVAL_ONE PPC_BIT(2)
+
+static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
+{
+ __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm);
+ const unsigned long val = PHB3_TCE_KILL_INVAL_ALL;
+
+ mb(); /* Ensure previous TCE table stores are visible */
+ if (rm)
+ __raw_rm_writeq_be(val, invalidate);
+ else
+ __raw_writeq_be(val, invalidate);
+}
+
+static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe)
+{
+ /* 01xb - invalidate TCEs that match the specified PE# */
+ __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, false);
+ unsigned long val = PHB3_TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF);
+
+ mb(); /* Ensure above stores are visible */
+ __raw_writeq_be(val, invalidate);
+}
+
+static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
+ unsigned shift, unsigned long index,
+ unsigned long npages)
+{
+ __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
+ unsigned long start, end, inc;
+
+	/* We'll invalidate DMA addresses in PE scope */
+ start = PHB3_TCE_KILL_INVAL_ONE;
+ start |= (pe->pe_number & 0xFF);
+ end = start;
+
+ /* Figure out the start, end and step */
+ start |= (index << shift);
+ end |= ((index + npages - 1) << shift);
+ inc = (0x1ull << shift);
+ mb();
+
+ while (start <= end) {
+ if (rm)
+ __raw_rm_writeq_be(start, invalidate);
+ else
+ __raw_writeq_be(start, invalidate);
+ start += inc;
+ }
+}
+
+static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
+{
+ struct pnv_phb *phb = pe->phb;
+
+ if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
+ pnv_pci_phb3_tce_invalidate_pe(pe);
+ else
+ opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL_PE,
+ pe->pe_number, 0, 0, 0);
+}
+
+static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
+ unsigned long index, unsigned long npages, bool rm)
+{
+ struct iommu_table_group_link *tgl;
+
+ list_for_each_entry_lockless(tgl, &tbl->it_group_list, next) {
+ struct pnv_ioda_pe *pe = container_of(tgl->table_group,
+ struct pnv_ioda_pe, table_group);
+ struct pnv_phb *phb = pe->phb;
+ unsigned int shift = tbl->it_page_shift;
+
+ /*
+ * NVLink1 can use the TCE kill register directly as
+ * it's the same as PHB3. NVLink2 is different and
+ * should go via the OPAL call.
+ */
+ if (phb->model == PNV_PHB_MODEL_NPU) {
+ /*
+ * The NVLink hardware does not support TCE kill
+ * per TCE entry so we have to invalidate
+ * the entire cache for it.
+ */
+ pnv_pci_phb3_tce_invalidate_entire(phb, rm);
+ continue;
+ }
+ if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
+ pnv_pci_phb3_tce_invalidate(pe, rm, shift,
+ index, npages);
+ else
+ opal_pci_tce_kill(phb->opal_id,
+ OPAL_PCI_TCE_KILL_PAGES,
+ pe->pe_number, 1u << shift,
+ index << shift, npages);
+ }
+}
+
+void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
+{
+ if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3)
+ pnv_pci_phb3_tce_invalidate_entire(phb, rm);
+ else
+ opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0);
+}
+
+static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
+ long npages, unsigned long uaddr,
+ enum dma_data_direction direction,
+ unsigned long attrs)
+{
+ int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
+ attrs);
+
+ if (!ret)
+ pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
+
+ return ret;
+}
+
+#ifdef CONFIG_IOMMU_API
+static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
+ unsigned long *hpa, enum dma_data_direction *direction)
+{
+ long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
+
+ if (!ret)
+ pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
+
+ return ret;
+}
+
+static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index,
+ unsigned long *hpa, enum dma_data_direction *direction)
+{
+ long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
+
+ if (!ret)
+ pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true);
+
+ return ret;
+}
+#endif
+
+static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
+ long npages)
+{
+ pnv_tce_free(tbl, index, npages);
+
+ pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
+}
+
+static struct iommu_table_ops pnv_ioda2_iommu_ops = {
+ .set = pnv_ioda2_tce_build,
+#ifdef CONFIG_IOMMU_API
+ .exchange = pnv_ioda2_tce_xchg,
+ .exchange_rm = pnv_ioda2_tce_xchg_rm,
+ .useraddrptr = pnv_tce_useraddrptr,
+#endif
+ .clear = pnv_ioda2_tce_free,
+ .get = pnv_tce_get,
+ .free = pnv_pci_ioda2_table_free_pages,
+};
+
+static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data)
+{
+ unsigned int *weight = (unsigned int *)data;
+
+ /* This is quite simplistic. The "base" weight of a device
+ * is 10. 0 means no DMA is to be accounted for it.
+ */
+ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
+ return 0;
+
+ if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
+ dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
+ dev->class == PCI_CLASS_SERIAL_USB_EHCI)
+ *weight += 3;
+ else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
+ *weight += 15;
+ else
+ *weight += 10;
+
+ return 0;
+}
+
+static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe)
+{
+ unsigned int weight = 0;
+
+ /* SRIOV VF has same DMA32 weight as its PF */
+#ifdef CONFIG_PCI_IOV
+ if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) {
+ pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight);
+ return weight;
+ }
+#endif
+
+ if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) {
+ pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight);
+ } else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) {
+ struct pci_dev *pdev;
+
+ list_for_each_entry(pdev, &pe->pbus->devices, bus_list)
+ pnv_pci_ioda_dev_dma_weight(pdev, &weight);
+ } else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) {
+ pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight);
+ }
+
+ return weight;
+}
+
+static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
+ struct pnv_ioda_pe *pe)
+{
+
+ struct page *tce_mem = NULL;
+ struct iommu_table *tbl;
+ unsigned int weight, total_weight = 0;
+ unsigned int tce32_segsz, base, segs, avail, i;
+ int64_t rc;
+ void *addr;
+
+ /* XXX FIXME: Handle 64-bit only DMA devices */
+ /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
+ /* XXX FIXME: Allocate multi-level tables on PHB3 */
+ weight = pnv_pci_ioda_pe_dma_weight(pe);
+ if (!weight)
+ return;
+
+ pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight,
+ &total_weight);
+ segs = (weight * phb->ioda.dma32_count) / total_weight;
+ if (!segs)
+ segs = 1;
+
+	/*
+	 * Allocate contiguous DMA32 segments. We begin with the expected
+	 * number of segments; on each failed attempt the number of DMA32
+	 * segments to allocate is decreased by one, until a single segment
+	 * can be allocated successfully.
+	 */
+ do {
+ for (base = 0; base <= phb->ioda.dma32_count - segs; base++) {
+ for (avail = 0, i = base; i < base + segs; i++) {
+ if (phb->ioda.dma32_segmap[i] ==
+ IODA_INVALID_PE)
+ avail++;
+ }
+
+ if (avail == segs)
+ goto found;
+ }
+ } while (--segs);
+
+ if (!segs) {
+ pe_warn(pe, "No available DMA32 segments\n");
+ return;
+ }
+
+found:
+ tbl = pnv_pci_table_alloc(phb->hose->node);
+ if (WARN_ON(!tbl))
+ return;
+
+ iommu_register_group(&pe->table_group, phb->hose->global_number,
+ pe->pe_number);
+ pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
+
+ /* Grab a 32-bit TCE table */
+ pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n",
+ weight, total_weight, base, segs);
+ pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
+ base * PNV_IODA1_DMA32_SEGSIZE,
+ (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1);
+
+	/* XXX Currently, we allocate one big contiguous table for the
+	 * TCEs. We only really need one chunk per 256M of TCE space
+	 * (i.e. per segment), but that's an optimization for later; it
+	 * requires some added smarts in our get/put_tce implementation.
+	 *
+	 * Each TCE page is 4KB in size and each TCE entry occupies 8
+	 * bytes.
+	 */
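+	/* Table bytes per segment: (SEGSIZE / 4K pages) * 8 bytes per TCE */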
+ tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3);
+ tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
+ get_order(tce32_segsz * segs));
+ if (!tce_mem) {
+ pe_err(pe, " Failed to allocate a 32-bit TCE memory\n");
+ goto fail;
+ }
+ addr = page_address(tce_mem);
+ memset(addr, 0, tce32_segsz * segs);
+
+ /* Configure HW */
+ for (i = 0; i < segs; i++) {
+ rc = opal_pci_map_pe_dma_window(phb->opal_id,
+ pe->pe_number,
+ base + i, 1,
+ __pa(addr) + tce32_segsz * i,
+ tce32_segsz, IOMMU_PAGE_SIZE_4K);
+ if (rc) {
+			pe_err(pe, " Failed to configure 32-bit TCE table, err %ld\n",
+			       rc);
+ goto fail;
+ }
+ }
+
+ /* Setup DMA32 segment mapping */
+ for (i = base; i < base + segs; i++)
+ phb->ioda.dma32_segmap[i] = pe->pe_number;
+
+ /* Setup linux iommu table */
+ pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs,
+ base * PNV_IODA1_DMA32_SEGSIZE,
+ IOMMU_PAGE_SHIFT_4K);
+
+ tbl->it_ops = &pnv_ioda1_iommu_ops;
+ pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
+ pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
+ iommu_init_table(tbl, phb->hose->node);
+
+ if (pe->flags & PNV_IODA_PE_DEV) {
+ /*
+ * Setting table base here only for carrying iommu_group
+ * further down to let iommu_add_device() do the job.
+ * pnv_pci_ioda_dma_dev_setup will override it later anyway.
+ */
+ set_iommu_table_base(&pe->pdev->dev, tbl);
+ iommu_add_device(&pe->pdev->dev);
+ } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
+ pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+
+ return;
+ fail:
+ /* XXX Failure: Try to fallback to 64-bit only ? */
+ if (tce_mem)
+ __free_pages(tce_mem, get_order(tce32_segsz * segs));
+ if (tbl) {
+ pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
+ iommu_tce_table_put(tbl);
+ }
+}
+
+static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
+ int num, struct iommu_table *tbl)
+{
+ struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+ table_group);
+ struct pnv_phb *phb = pe->phb;
+ int64_t rc;
+ const unsigned long size = tbl->it_indirect_levels ?
+ tbl->it_level_size : tbl->it_size;
+ const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
+ const __u64 win_size = tbl->it_size << tbl->it_page_shift;
+
+ pe_info(pe, "Setting up window#%d %llx..%llx pg=%x\n", num,
+ start_addr, start_addr + win_size - 1,
+ IOMMU_PAGE_SIZE(tbl));
+
+	/*
+	 * Map the TCE table through the TVT. The TVE index is the PE number
+	 * shifted left by 1 bit, for the 32-bit DMA space.
+	 */
+ rc = opal_pci_map_pe_dma_window(phb->opal_id,
+ pe->pe_number,
+ (pe->pe_number << 1) + num,
+ tbl->it_indirect_levels + 1,
+ __pa(tbl->it_base),
+ size << 3,
+ IOMMU_PAGE_SIZE(tbl));
+ if (rc) {
+ pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
+ return rc;
+ }
+
+ pnv_pci_link_table_and_group(phb->hose->node, num,
+ tbl, &pe->table_group);
+ pnv_pci_ioda2_tce_invalidate_pe(pe);
+
+ return 0;
+}
+
+void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
+{
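+	/* The odd window, (PE# << 1) + 1, i.e. TVE#1, is the bypass window */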
+	uint16_t window_id = (pe->pe_number << 1) + 1;
+ int64_t rc;
+
+ pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis");
+ if (enable) {
+ phys_addr_t top = memblock_end_of_DRAM();
+
+ top = roundup_pow_of_two(top);
+ rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
+ pe->pe_number,
+ window_id,
+ pe->tce_bypass_base,
+ top);
+ } else {
+ rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
+ pe->pe_number,
+ window_id,
+ pe->tce_bypass_base,
+ 0);
+ }
+ if (rc)
+ pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
+ else
+ pe->tce_bypass_enabled = enable;
+}
+
+static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
+ int num, __u32 page_shift, __u64 window_size, __u32 levels,
+ bool alloc_userspace_copy, struct iommu_table **ptbl)
+{
+ struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+ table_group);
+ int nid = pe->phb->hose->node;
+ __u64 bus_offset = num ? pe->tce_bypass_base : table_group->tce32_start;
+ long ret;
+ struct iommu_table *tbl;
+
+ tbl = pnv_pci_table_alloc(nid);
+ if (!tbl)
+ return -ENOMEM;
+
+ tbl->it_ops = &pnv_ioda2_iommu_ops;
+
+ ret = pnv_pci_ioda2_table_alloc_pages(nid,
+ bus_offset, page_shift, window_size,
+ levels, alloc_userspace_copy, tbl);
+ if (ret) {
+ iommu_tce_table_put(tbl);
+ return ret;
+ }
+
+ *ptbl = tbl;
+
+ return 0;
+}
+
+static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
+{
+ struct iommu_table *tbl = NULL;
+ long rc;
+
+	/*
+	 * crashkernel= specifies the kdump kernel's maximum memory at
+	 * some offset, and there is no guarantee the result is a power
+	 * of 2, which will cause errors later.
+	 */
+ const u64 max_memory = __rounddown_pow_of_two(memory_hotplug_max());
+
+ /*
+ * In memory constrained environments, e.g. kdump kernel, the
+ * DMA window can be larger than available memory, which will
+ * cause errors later.
+ */
+ const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory);
+
+ rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
+ IOMMU_PAGE_SHIFT_4K,
+ window_size,
+ POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl);
+ if (rc) {
+ pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
+ rc);
+ return rc;
+ }
+
+ iommu_init_table(tbl, pe->phb->hose->node);
+
+ rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
+ if (rc) {
+ pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
+ rc);
+ iommu_tce_table_put(tbl);
+ return rc;
+ }
+
+ if (!pnv_iommu_bypass_disabled)
+ pnv_pci_ioda2_set_bypass(pe, true);
+
+ /*
+ * Setting table base here only for carrying iommu_group
+ * further down to let iommu_add_device() do the job.
+ * pnv_pci_ioda_dma_dev_setup will override it later anyway.
+ */
+ if (pe->flags & PNV_IODA_PE_DEV)
+ set_iommu_table_base(&pe->pdev->dev, tbl);
+
+ return 0;
+}
+
+#if defined(CONFIG_IOMMU_API) || defined(CONFIG_PCI_IOV)
+static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
+ int num)
+{
+ struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+ table_group);
+ struct pnv_phb *phb = pe->phb;
+ long ret;
+
+ pe_info(pe, "Removing DMA window #%d\n", num);
+
+ ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
+ (pe->pe_number << 1) + num,
+ 0/* levels */, 0/* table address */,
+ 0/* table size */, 0/* page size */);
+ if (ret)
+ pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
+ else
+ pnv_pci_ioda2_tce_invalidate_pe(pe);
+
+ pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
+
+ return ret;
+}
+#endif
+
+#ifdef CONFIG_IOMMU_API
+static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
+ __u64 window_size, __u32 levels)
+{
+ unsigned long bytes = 0;
+ const unsigned window_shift = ilog2(window_size);
+ unsigned entries_shift = window_shift - page_shift;
+ unsigned table_shift = entries_shift + 3;
+ unsigned long tce_table_size = max(0x1000UL, 1UL << table_shift);
+ unsigned long direct_table_size;
+
+ if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS) ||
+ !is_power_of_2(window_size))
+ return 0;
+
+ /* Calculate a direct table size from window_size and levels */
+ entries_shift = (entries_shift + levels - 1) / levels;
+ table_shift = entries_shift + 3;
+ table_shift = max_t(unsigned, table_shift, PAGE_SHIFT);
+ direct_table_size = 1UL << table_shift;
+
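+	/*
+	 * Sum the allocation for every level: each upper level holds 8-byte
+	 * descriptors pointing at chunks of the level below, while the lowest
+	 * level holds the TCEs themselves.
+	 */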
+ for ( ; levels; --levels) {
+ bytes += _ALIGN_UP(tce_table_size, direct_table_size);
+
+ tce_table_size /= direct_table_size;
+ tce_table_size <<= 3;
+ tce_table_size = max_t(unsigned long,
+ tce_table_size, direct_table_size);
+ }
+
+ return bytes + bytes; /* one for HW table, one for userspace copy */
+}
+
+static long pnv_pci_ioda2_create_table_userspace(
+ struct iommu_table_group *table_group,
+ int num, __u32 page_shift, __u64 window_size, __u32 levels,
+ struct iommu_table **ptbl)
+{
+ long ret = pnv_pci_ioda2_create_table(table_group,
+ num, page_shift, window_size, levels, true, ptbl);
+
+ if (!ret)
+ (*ptbl)->it_allocated_size = pnv_pci_ioda2_get_table_size(
+ page_shift, window_size, levels);
+ return ret;
+}
+
+static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
+{
+ struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+ table_group);
+ /* Store @tbl as pnv_pci_ioda2_unset_window() resets it */
+ struct iommu_table *tbl = pe->table_group.tables[0];
+
+ pnv_pci_ioda2_set_bypass(pe, false);
+ pnv_pci_ioda2_unset_window(&pe->table_group, 0);
+ if (pe->pbus)
+ pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
+ iommu_tce_table_put(tbl);
+}
+
+static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
+{
+ struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+ table_group);
+
+ pnv_pci_ioda2_setup_default_config(pe);
+ if (pe->pbus)
+ pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
+}
+
+static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
+ .get_table_size = pnv_pci_ioda2_get_table_size,
+ .create_table = pnv_pci_ioda2_create_table_userspace,
+ .set_window = pnv_pci_ioda2_set_window,
+ .unset_window = pnv_pci_ioda2_unset_window,
+ .take_ownership = pnv_ioda2_take_ownership,
+ .release_ownership = pnv_ioda2_release_ownership,
+};
+
+static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque)
+{
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pnv_ioda_pe **ptmppe = opaque;
+ struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
+ struct pci_dn *pdn = pci_get_pdn(pdev);
+
+ if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+ return 0;
+
+ hose = pci_bus_to_host(pdev->bus);
+ phb = hose->private_data;
+ if (phb->type != PNV_PHB_NPU_NVLINK)
+ return 0;
+
+ *ptmppe = &phb->ioda.pe_array[pdn->pe_number];
+
+ return 1;
+}
+
+/*
+ * This returns the PE of the associated NPU.
+ * It assumes that the NPU is in the same IOMMU group as the GPU and that
+ * there are no other PEs.
+ */
+static struct pnv_ioda_pe *gpe_table_group_to_npe(
+ struct iommu_table_group *table_group)
+{
+ struct pnv_ioda_pe *npe = NULL;
+ int ret = iommu_group_for_each_dev(table_group->group, &npe,
+ gpe_table_group_to_npe_cb);
+
+ BUG_ON(!ret || !npe);
+
+ return npe;
+}
+
+static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group,
+ int num, struct iommu_table *tbl)
+{
+ struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group);
+ int num2 = (num == 0) ? 1 : 0;
+ long ret = pnv_pci_ioda2_set_window(table_group, num, tbl);
+
+ if (ret)
+ return ret;
+
+ if (table_group->tables[num2])
+ pnv_npu_unset_window(npe, num2);
+
+ ret = pnv_npu_set_window(npe, num, tbl);
+ if (ret) {
+ pnv_pci_ioda2_unset_window(table_group, num);
+ if (table_group->tables[num2])
+ pnv_npu_set_window(npe, num2,
+ table_group->tables[num2]);
+ }
+
+ return ret;
+}
+
+static long pnv_pci_ioda2_npu_unset_window(
+ struct iommu_table_group *table_group,
+ int num)
+{
+ struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group);
+ int num2 = (num == 0) ? 1 : 0;
+ long ret = pnv_pci_ioda2_unset_window(table_group, num);
+
+ if (ret)
+ return ret;
+
+ if (!npe->table_group.tables[num])
+ return 0;
+
+ ret = pnv_npu_unset_window(npe, num);
+ if (ret)
+ return ret;
+
+ if (table_group->tables[num2])
+ ret = pnv_npu_set_window(npe, num2, table_group->tables[num2]);
+
+ return ret;
+}
+
+static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group)
+{
+ /*
+ * Detach NPU first as pnv_ioda2_take_ownership() will destroy
+ * the iommu_table if 32bit DMA is enabled.
+ */
+ pnv_npu_take_ownership(gpe_table_group_to_npe(table_group));
+ pnv_ioda2_take_ownership(table_group);
+}
+
+static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = {
+ .get_table_size = pnv_pci_ioda2_get_table_size,
+ .create_table = pnv_pci_ioda2_create_table_userspace,
+ .set_window = pnv_pci_ioda2_npu_set_window,
+ .unset_window = pnv_pci_ioda2_npu_unset_window,
+ .take_ownership = pnv_ioda2_npu_take_ownership,
+ .release_ownership = pnv_ioda2_release_ownership,
+};
+
+static void pnv_pci_ioda_setup_iommu_api(void)
+{
+ struct pci_controller *hose, *tmp;
+ struct pnv_phb *phb;
+ struct pnv_ioda_pe *pe, *gpe;
+
+	/*
+	 * Now that all PHBs have been discovered, it's time to add NPU
+	 * devices to the corresponding IOMMU groups.
+	 */
+ list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+ phb = hose->private_data;
+
+ if (phb->type != PNV_PHB_NPU_NVLINK)
+ continue;
+
+ list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+ gpe = pnv_pci_npu_setup_iommu(pe);
+ if (gpe)
+ gpe->table_group.ops = &pnv_pci_ioda2_npu_ops;
+ }
+ }
+}
+#else /* !CONFIG_IOMMU_API */
+static void pnv_pci_ioda_setup_iommu_api(void) { }
+#endif
+
+static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
+{
+ struct pci_controller *hose = phb->hose;
+ struct device_node *dn = hose->dn;
+ unsigned long mask = 0;
+ int i, rc, count;
+ u32 val;
+
+ count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes");
+ if (count <= 0) {
+ mask = SZ_4K | SZ_64K;
+		/* Add 16M and 256M for POWER8 by default */
+ if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+ !cpu_has_feature(CPU_FTR_ARCH_300))
+ mask |= SZ_16M | SZ_256M;
+ return mask;
+ }
+
+ for (i = 0; i < count; i++) {
+ rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes",
+ i, &val);
+ if (rc == 0)
+ mask |= 1ULL << val;
+ }
+
+ return mask;
+}
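+
+/*
+ * For illustration, with a hypothetical device tree node carrying
+ *
+ *	ibm,supported-tce-sizes = <12 16 24 28>;
+ *
+ * the loop above ORs in 1ULL << 12, 16, 24 and 28, so the returned mask
+ * advertises 4K, 64K, 16M and 256M TCE page sizes. Without the property,
+ * the fallback mask above is used instead.
+ */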
+
+static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+ struct pnv_ioda_pe *pe)
+{
+ int64_t rc;
+
+ if (!pnv_pci_ioda_pe_dma_weight(pe))
+ return;
+
+ /* TVE #1 is selected by PCI address bit 59 */
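+	/*
+	 * For illustration: DMA addresses at or above 1ull << 59
+	 * (0x0800000000000000) are routed to TVE#1 and bypass TCE
+	 * translation, while addresses below it go through the 32-bit
+	 * TCE table configured further down.
+	 */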
+ pe->tce_bypass_base = 1ull << 59;
+
+ iommu_register_group(&pe->table_group, phb->hose->global_number,
+ pe->pe_number);
+
+ /* The PE will reserve all possible 32-bits space */
+ pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
+ phb->ioda.m32_pci_base);
+
+ /* Setup linux iommu table */
+ pe->table_group.tce32_start = 0;
+ pe->table_group.tce32_size = phb->ioda.m32_pci_base;
+ pe->table_group.max_dynamic_windows_supported =
+ IOMMU_TABLE_GROUP_MAX_TABLES;
+ pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
+ pe->table_group.pgsizes = pnv_ioda_parse_tce_sizes(phb);
+#ifdef CONFIG_IOMMU_API
+ pe->table_group.ops = &pnv_pci_ioda2_ops;
+#endif
+
+ rc = pnv_pci_ioda2_setup_default_config(pe);
+ if (rc)
+ return;
+
+ if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
+ pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+}
+
+#ifdef CONFIG_PCI_MSI
+int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
+{
+ struct pnv_phb *phb = container_of(chip, struct pnv_phb,
+ ioda.irq_chip);
+
+ return opal_pci_msi_eoi(phb->opal_id, hw_irq);
+}
+
+static void pnv_ioda2_msi_eoi(struct irq_data *d)
+{
+ int64_t rc;
+ unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+ struct irq_chip *chip = irq_data_get_irq_chip(d);
+
+ rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
+ WARN_ON_ONCE(rc);
+
+ icp_native_eoi(d);
+}
+
+
+void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
+{
+ struct irq_data *idata;
+ struct irq_chip *ichip;
+
+ /* The MSI EOI OPAL call is only needed on PHB3 */
+ if (phb->model != PNV_PHB_MODEL_PHB3)
+ return;
+
+ if (!phb->ioda.irq_chip_init) {
+ /*
+ * First time we setup an MSI IRQ, we need to setup the
+ * corresponding IRQ chip to route correctly.
+ */
+ idata = irq_get_irq_data(virq);
+ ichip = irq_data_get_irq_chip(idata);
+ phb->ioda.irq_chip_init = 1;
+ phb->ioda.irq_chip = *ichip;
+ phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
+ }
+ irq_set_chip(virq, &phb->ioda.irq_chip);
+}
+
+/*
+ * Returns true iff chip is something that we could call
+ * pnv_opal_pci_msi_eoi for.
+ */
+bool is_pnv_opal_msi(struct irq_chip *chip)
+{
+ return chip->irq_eoi == pnv_ioda2_msi_eoi;
+}
+EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
+
+static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
+ unsigned int hwirq, unsigned int virq,
+ unsigned int is_64, struct msi_msg *msg)
+{
+ struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
+ unsigned int xive_num = hwirq - phb->msi_base;
+ __be32 data;
+ int rc;
+
+ /* No PE assigned ? bail out ... no MSI for you ! */
+ if (pe == NULL)
+ return -ENXIO;
+
+ /* Check if we have an MVE */
+ if (pe->mve_number < 0)
+ return -ENXIO;
+
+ /* Force 32-bit MSI on some broken devices */
+ if (dev->no_64bit_msi)
+ is_64 = 0;
+
+ /* Assign XIVE to PE */
+ rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
+ if (rc) {
+ pr_warn("%s: OPAL error %d setting XIVE %d PE\n",
+ pci_name(dev), rc, xive_num);
+ return -EIO;
+ }
+
+ if (is_64) {
+ __be64 addr64;
+
+ rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1,
+ &addr64, &data);
+ if (rc) {
+ pr_warn("%s: OPAL error %d getting 64-bit MSI data\n",
+ pci_name(dev), rc);
+ return -EIO;
+ }
+ msg->address_hi = be64_to_cpu(addr64) >> 32;
+ msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful;
+ } else {
+ __be32 addr32;
+
+ rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1,
+ &addr32, &data);
+ if (rc) {
+ pr_warn("%s: OPAL error %d getting 32-bit MSI data\n",
+ pci_name(dev), rc);
+ return -EIO;
+ }
+ msg->address_hi = 0;
+ msg->address_lo = be32_to_cpu(addr32);
+ }
+ msg->data = be32_to_cpu(data);
+
+ pnv_set_msi_irq_chip(phb, virq);
+
+ pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
+ " address=%x_%08x data=%x PE# %x\n",
+ pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
+ msg->address_hi, msg->address_lo, data, pe->pe_number);
+
+ return 0;
+}
+
+static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
+{
+ unsigned int count;
+ const __be32 *prop = of_get_property(phb->hose->dn,
+ "ibm,opal-msi-ranges", NULL);
+ if (!prop) {
+ /* BML Fallback */
+ prop = of_get_property(phb->hose->dn, "msi-ranges", NULL);
+ }
+ if (!prop)
+ return;
+
+ phb->msi_base = be32_to_cpup(prop);
+ count = be32_to_cpup(prop + 1);
+ if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) {
+ pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
+ phb->hose->global_number);
+ return;
+ }
+
+ phb->msi_setup = pnv_pci_ioda_msi_setup;
+ phb->msi32_support = 1;
+ pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
+ count, phb->msi_base);
+}
+#else
+static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { }
+#endif /* CONFIG_PCI_MSI */
+
+#ifdef CONFIG_PCI_IOV
+static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
+{
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ const resource_size_t gate = phb->ioda.m64_segsize >> 2;
+ struct resource *res;
+ int i;
+ resource_size_t size, total_vf_bar_sz;
+ struct pci_dn *pdn;
+ int mul, total_vfs;
+
+ pdn = pci_get_pdn(pdev);
+ pdn->vfs_expanded = 0;
+ pdn->m64_single_mode = false;
+
+ total_vfs = pci_sriov_get_totalvfs(pdev);
+ mul = phb->ioda.total_pe_num;
+ total_vf_bar_sz = 0;
+
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &pdev->resource[i + PCI_IOV_RESOURCES];
+ if (!res->flags || res->parent)
+ continue;
+ if (!pnv_pci_is_m64_flags(res->flags)) {
+			dev_warn(&pdev->dev, "Don't support SR-IOV with non-M64 VF BAR%d: %pR\n",
+				 i, res);
+ goto truncate_iov;
+ }
+
+ total_vf_bar_sz += pci_iov_resource_size(pdev,
+ i + PCI_IOV_RESOURCES);
+
+		/*
+		 * If bigger than a quarter of the M64 segment size, round
+		 * up to a power of two.
+		 *
+		 * Generally, one M64 BAR maps one IOV BAR. To avoid conflicts
+		 * with other devices, the IOV BAR size is expanded to
+		 * (total_pe * VF_BAR_size). When VF_BAR_size is half of the
+		 * M64 segment size, the expanded size would equal half of the
+		 * whole M64 space, which would exhaust the M64 space and
+		 * limit system flexibility. This is a design decision to set
+		 * the boundary at a quarter of the M64 segment size.
+		 */
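+		/*
+		 * For illustration, assuming a 256MB M64 segment: the gate
+		 * is 64MB. A device whose VF BARs total 32MB keeps
+		 * mul == total_pe_num, while one totalling 128MB trips the
+		 * gate below, switches to single PE mode and rounds mul up
+		 * from total_vfs.
+		 */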
+ if (total_vf_bar_sz > gate) {
+ mul = roundup_pow_of_two(total_vfs);
+ dev_info(&pdev->dev,
+ "VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n",
+ total_vf_bar_sz, gate, mul);
+ pdn->m64_single_mode = true;
+ break;
+ }
+ }
+
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &pdev->resource[i + PCI_IOV_RESOURCES];
+ if (!res->flags || res->parent)
+ continue;
+
+ size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
+ /*
+ * On PHB3, the minimum size alignment of M64 BAR in single
+ * mode is 32MB.
+ */
+ if (pdn->m64_single_mode && (size < SZ_32M))
+ goto truncate_iov;
+ dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
+ res->end = res->start + size * mul - 1;
+ dev_dbg(&pdev->dev, " %pR\n", res);
+ dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
+ i, res, mul);
+ }
+ pdn->vfs_expanded = mul;
+
+ return;
+
+truncate_iov:
+ /* To save MMIO space, IOV BAR is truncated. */
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &pdev->resource[i + PCI_IOV_RESOURCES];
+ res->flags = 0;
+ res->end = res->start - 1;
+ }
+}
+
+static void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev)
+{
+ if (WARN_ON(pci_dev_is_added(pdev)))
+ return;
+
+ if (pdev->is_virtfn) {
+ struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev);
+
+ /*
+ * VF PEs are single-device PEs so their pdev pointer needs to
+ * be set. The pdev doesn't exist when the PE is allocated (in
+		 * pcibios_sriov_enable()), so we fix it up here.
+ */
+ pe->pdev = pdev;
+ WARN_ON(!(pe->flags & PNV_IODA_PE_VF));
+ } else if (pdev->is_physfn) {
+ /*
+ * For PFs adjust their allocated IOV resources to match what
+		 * the PHB can support using its M64 BAR table.
+ */
+ pnv_pci_ioda_fixup_iov_resources(pdev);
+ }
+}
+#endif /* CONFIG_PCI_IOV */
+
+static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
+ struct resource *res)
+{
+ struct pnv_phb *phb = pe->phb;
+ struct pci_bus_region region;
+ int index;
+ int64_t rc;
+
+ if (!res || !res->flags || res->start > res->end)
+ return;
+
+ if (res->flags & IORESOURCE_IO) {
+ region.start = res->start - phb->ioda.io_pci_base;
+ region.end = res->end - phb->ioda.io_pci_base;
+ index = region.start / phb->ioda.io_segsize;
+
+ while (index < phb->ioda.total_pe_num &&
+ region.start <= region.end) {
+ phb->ioda.io_segmap[index] = pe->pe_number;
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+ pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
+ if (rc != OPAL_SUCCESS) {
+ pr_err("%s: Error %lld mapping IO segment#%d to PE#%x\n",
+ __func__, rc, index, pe->pe_number);
+ break;
+ }
+
+ region.start += phb->ioda.io_segsize;
+ index++;
+ }
+ } else if ((res->flags & IORESOURCE_MEM) &&
+ !pnv_pci_is_m64(phb, res)) {
+ region.start = res->start -
+ phb->hose->mem_offset[0] -
+ phb->ioda.m32_pci_base;
+ region.end = res->end -
+ phb->hose->mem_offset[0] -
+ phb->ioda.m32_pci_base;
+ index = region.start / phb->ioda.m32_segsize;
+
+ while (index < phb->ioda.total_pe_num &&
+ region.start <= region.end) {
+ phb->ioda.m32_segmap[index] = pe->pe_number;
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+ pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
+ if (rc != OPAL_SUCCESS) {
+ pr_err("%s: Error %lld mapping M32 segment#%d to PE#%x",
+ __func__, rc, index, pe->pe_number);
+ break;
+ }
+
+ region.start += phb->ioda.m32_segsize;
+ index++;
+ }
+ }
+}
+
+/*
+ * This function is supposed to be called on a per-PE basis, from top
+ * to bottom, so the I/O or MMIO segment assigned to a parent PE can be
+ * overridden by its child PEs if necessary.
+ */
+static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe)
+{
+ struct pci_dev *pdev;
+ int i;
+
+	/*
+	 * NOTE: We only care about PCI bus based PEs for now. PCI
+	 * device based PEs, for example SR-IOV VFs, should be figured
+	 * out later.
+	 */
+ BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)));
+
+ list_for_each_entry(pdev, &pe->pbus->devices, bus_list) {
+ for (i = 0; i <= PCI_ROM_RESOURCE; i++)
+ pnv_ioda_setup_pe_res(pe, &pdev->resource[i]);
+
+ /*
+ * If the PE contains all subordinate PCI buses, the
+ * windows of the child bridges should be mapped to
+ * the PE as well.
+ */
+ if (!(pe->flags & PNV_IODA_PE_BUS_ALL) || !pci_is_bridge(pdev))
+ continue;
+ for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++)
+ pnv_ioda_setup_pe_res(pe,
+ &pdev->resource[PCI_BRIDGE_RESOURCES + i]);
+ }
+}
+
+#ifdef CONFIG_DEBUG_FS
+static int pnv_pci_diag_data_set(void *data, u64 val)
+{
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ s64 ret;
+
+ if (val != 1ULL)
+ return -EINVAL;
+
+ hose = (struct pci_controller *)data;
+ if (!hose || !hose->private_data)
+ return -ENODEV;
+
+ phb = hose->private_data;
+
+ /* Retrieve the diag data from firmware */
+ ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data,
+ phb->diag_data_size);
+ if (ret != OPAL_SUCCESS)
+ return -EIO;
+
+ /* Print the diag data to the kernel log */
+ pnv_pci_dump_phb_diag_data(phb->hose, phb->diag_data);
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(pnv_pci_diag_data_fops, NULL,
+ pnv_pci_diag_data_set, "%llu\n");
+
+#endif /* CONFIG_DEBUG_FS */
+
+static void pnv_pci_ioda_create_dbgfs(void)
+{
+#ifdef CONFIG_DEBUG_FS
+ struct pci_controller *hose, *tmp;
+ struct pnv_phb *phb;
+ char name[16];
+
+ list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+ phb = hose->private_data;
+
+ /* Notify initialization of PHB done */
+ phb->initialized = 1;
+
+ sprintf(name, "PCI%04x", hose->global_number);
+ phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
+ if (!phb->dbgfs) {
+ pr_warn("%s: Error on creating debugfs on PHB#%x\n",
+ __func__, hose->global_number);
+ continue;
+ }
+
+ debugfs_create_file("dump_diag_regs", 0200, phb->dbgfs, hose,
+ &pnv_pci_diag_data_fops);
+ }
+#endif /* CONFIG_DEBUG_FS */
+}
+
+static void pnv_pci_enable_bridge(struct pci_bus *bus)
+{
+ struct pci_dev *dev = bus->self;
+ struct pci_bus *child;
+
+ /* Empty bus ? bail */
+ if (list_empty(&bus->devices))
+ return;
+
+ /*
+ * If there's a bridge associated with that bus enable it. This works
+ * around races in the generic code if the enabling is done during
+ * parallel probing. This can be removed once those races have been
+ * fixed.
+ */
+ if (dev) {
+ int rc = pci_enable_device(dev);
+ if (rc)
+ pci_err(dev, "Error enabling bridge (%d)\n", rc);
+ pci_set_master(dev);
+ }
+
+ /* Perform the same to child busses */
+ list_for_each_entry(child, &bus->children, node)
+ pnv_pci_enable_bridge(child);
+}
+
+static void pnv_pci_enable_bridges(void)
+{
+ struct pci_controller *hose;
+
+ list_for_each_entry(hose, &hose_list, list_node)
+ pnv_pci_enable_bridge(hose->bus);
+}
+
+static void pnv_pci_ioda_fixup(void)
+{
+ pnv_pci_ioda_setup_PEs();
+ pnv_pci_ioda_setup_iommu_api();
+ pnv_pci_ioda_create_dbgfs();
+
+ pnv_pci_enable_bridges();
+
+#ifdef CONFIG_EEH
+ pnv_eeh_post_init();
+#endif
+}
+
+/*
+ * Returns the alignment for I/O or memory windows of P2P bridges.
+ * That actually depends on how PEs are segmented. For now, we return
+ * the I/O or M32 segment size for PE-sensitive P2P bridges. Otherwise,
+ * the default values (4KiB for I/O, 1MiB for memory) are returned.
+ *
+ * The current PCI bus might be put into one PE, which was created
+ * against the parent PCI bridge. In that case, we needn't enlarge the
+ * alignment, which saves some resources.
+ */
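+/*
+ * For illustration, on a hypothetical PHB with a 256MB M64 segment size,
+ * a slot's 64-bit prefetchable window is aligned to 256MB so that it
+ * occupies whole M64 segments, whereas a bus sitting behind two levels
+ * of conventional PCI bridges gets alignment 1 (no enlargement at all).
+ */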
+static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
+ unsigned long type)
+{
+ struct pci_dev *bridge;
+ struct pci_controller *hose = pci_bus_to_host(bus);
+ struct pnv_phb *phb = hose->private_data;
+ int num_pci_bridges = 0;
+
+ bridge = bus->self;
+ while (bridge) {
+ if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) {
+ num_pci_bridges++;
+ if (num_pci_bridges >= 2)
+ return 1;
+ }
+
+ bridge = bridge->bus->self;
+ }
+
+ /*
+ * We fall back to M32 if M64 isn't supported. We enforce the M64
+ * alignment for any 64-bit resource, PCIe doesn't care and
+ * bridges only do 64-bit prefetchable anyway.
+ */
+ if (phb->ioda.m64_segsize && pnv_pci_is_m64_flags(type))
+ return phb->ioda.m64_segsize;
+ if (type & IORESOURCE_MEM)
+ return phb->ioda.m32_segsize;
+
+ return phb->ioda.io_segsize;
+}
+
+/*
+ * We update the root port, or the upstream port of the bridge behind
+ * the root port, with the PHB's windows in order to accommodate the
+ * resource changes required during PCI (slot) hotplug. The slot is
+ * connected either to the root port or to a downstream port of the
+ * PCIe switch behind the root port.
+ */
+static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
+ unsigned long type)
+{
+ struct pci_controller *hose = pci_bus_to_host(bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_dev *bridge = bus->self;
+ struct resource *r, *w;
+ bool msi_region = false;
+ int i;
+
+	/* Check if we need to apply a fixup to the bridge's windows */
+ if (!pci_is_root_bus(bridge->bus) &&
+ !pci_is_root_bus(bridge->bus->self->bus))
+ return;
+
+ /* Fixup the resources */
+ for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
+ r = &bridge->resource[PCI_BRIDGE_RESOURCES + i];
+ if (!r->flags || !r->parent)
+ continue;
+
+ w = NULL;
+ if (r->flags & type & IORESOURCE_IO)
+ w = &hose->io_resource;
+ else if (pnv_pci_is_m64(phb, r) &&
+ (type & IORESOURCE_PREFETCH) &&
+ phb->ioda.m64_segsize)
+ w = &hose->mem_resources[1];
+ else if (r->flags & type & IORESOURCE_MEM) {
+ w = &hose->mem_resources[0];
+ msi_region = true;
+ }
+
+ r->start = w->start;
+ r->end = w->end;
+
+		/* The 64KB 32-bit MSI region shouldn't be included in
+		 * the 32-bit bridge window. Otherwise, we can see strange
+		 * issues. One of them is an EEH error observed on Garrison.
+		 *
+		 * Exclude the top 1MB region, which is the minimal alignment
+		 * of the 32-bit bridge window.
+		 */
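+		/*
+		 * For illustration: hose->mem_resources[0] already has the
+		 * top 64KB (MSI region) clipped off by firmware, so adding
+		 * 0x10000 first restores the full M32 window and subtracting
+		 * 0x100000 then drops the whole top 1MB, keeping the bridge
+		 * window 1MB aligned while still excluding the MSI region.
+		 */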
+ if (msi_region) {
+ r->end += 0x10000;
+ r->end -= 0x100000;
+ }
+ }
+}
+
+static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type)
+{
+ struct pci_controller *hose = pci_bus_to_host(bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_dev *bridge = bus->self;
+ struct pnv_ioda_pe *pe;
+ bool all = (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
+
+ /* Extend bridge's windows if necessary */
+ pnv_pci_fixup_bridge_resources(bus, type);
+
+	/* The PE for the root bus should be realized before any other */
+ if (!phb->ioda.root_pe_populated) {
+ pe = pnv_ioda_setup_bus_PE(phb->hose->bus, false);
+ if (pe) {
+ phb->ioda.root_pe_idx = pe->pe_number;
+ phb->ioda.root_pe_populated = true;
+ }
+ }
+
+	/* Don't assign a PE to a PCI bus that has no subordinate devices */
+ if (list_empty(&bus->devices))
+ return;
+
+ /* Reserve PEs according to used M64 resources */
+ if (phb->reserve_m64_pe)
+ phb->reserve_m64_pe(bus, NULL, all);
+
+	/*
+	 * Assign a PE. We might run here because of partial hotplug.
+	 * In that case, we just pick up the existing PE and don't
+	 * allocate resources again.
+	 */
+ pe = pnv_ioda_setup_bus_PE(bus, all);
+ if (!pe)
+ return;
+
+ pnv_ioda_setup_pe_seg(pe);
+ switch (phb->type) {
+ case PNV_PHB_IODA1:
+ pnv_pci_ioda1_setup_dma_pe(phb, pe);
+ break;
+ case PNV_PHB_IODA2:
+ pnv_pci_ioda2_setup_dma_pe(phb, pe);
+ break;
+ default:
+ pr_warn("%s: No DMA for PHB#%x (type %d)\n",
+ __func__, phb->hose->global_number, phb->type);
+ }
+}
+
+static resource_size_t pnv_pci_default_alignment(void)
+{
+ return PAGE_SIZE;
+}
+
+#ifdef CONFIG_PCI_IOV
+static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
+ int resno)
+{
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_dn *pdn = pci_get_pdn(pdev);
+ resource_size_t align;
+
+	/*
+	 * On the PowerNV platform, the IOV BAR is mapped by an M64 BAR to
+	 * enable SR-IOV, and from the hardware perspective the range mapped
+	 * by an M64 BAR must be size aligned.
+	 *
+	 * When the IOV BAR is mapped with an M64 BAR in Single PE mode, that
+	 * powernv-specific hardware restriction is gone. But if we just used
+	 * the VF BAR size as the alignment, the PF BAR and a VF BAR might be
+	 * allocated within one segment of M64 #15, which would introduce a
+	 * PE conflict between PF and VF. Based on this, the minimum alignment
+	 * of an IOV BAR is m64_segsize.
+	 *
+	 * This function returns the total IOV BAR size if the M64 BAR is in
+	 * Shared PE mode, or just the VF BAR size if not.
+	 * If the M64 BAR is in Single PE mode, it returns the VF BAR size or
+	 * the M64 segment size if the IOV BAR size is less.
+	 */
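+	/*
+	 * For illustration, assuming a 16MB VF BAR and 256 PEs: in Shared
+	 * PE mode the alignment returned below is 256 * 16MB (the expanded
+	 * IOV BAR), while in Single PE mode it is max(16MB, m64_segsize).
+	 */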
+ align = pci_iov_resource_size(pdev, resno);
+ if (!pdn->vfs_expanded)
+ return align;
+ if (pdn->m64_single_mode)
+ return max(align, (resource_size_t)phb->ioda.m64_segsize);
+
+ return pdn->vfs_expanded * align;
+}
+#endif /* CONFIG_PCI_IOV */
+
+/* Prevent enabling devices for which we couldn't properly
+ * assign a PE
+ */
+static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_dn *pdn;
+
+	/* The function is probably called while the PEs have
+	 * not been created yet. For example, during resource
+	 * reassignment in the PCI probe period. We just skip
+	 * the check if the PEs aren't ready.
+	 */
+ if (!phb->initialized)
+ return true;
+
+ pdn = pci_get_pdn(dev);
+ if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+ return false;
+
+ return true;
+}
+
+static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group,
+ int num)
+{
+ struct pnv_ioda_pe *pe = container_of(table_group,
+ struct pnv_ioda_pe, table_group);
+ struct pnv_phb *phb = pe->phb;
+ unsigned int idx;
+ long rc;
+
+ pe_info(pe, "Removing DMA window #%d\n", num);
+ for (idx = 0; idx < phb->ioda.dma32_count; idx++) {
+ if (phb->ioda.dma32_segmap[idx] != pe->pe_number)
+ continue;
+
+ rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
+ idx, 0, 0ul, 0ul, 0ul);
+ if (rc != OPAL_SUCCESS) {
+ pe_warn(pe, "Failure %ld unmapping DMA32 segment#%d\n",
+ rc, idx);
+ return rc;
+ }
+
+ phb->ioda.dma32_segmap[idx] = IODA_INVALID_PE;
+ }
+
+ pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
+ return OPAL_SUCCESS;
+}
+
+static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
+{
+ unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
+ struct iommu_table *tbl = pe->table_group.tables[0];
+ int64_t rc;
+
+ if (!weight)
+ return;
+
+ rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0);
+ if (rc != OPAL_SUCCESS)
+ return;
+
+ pnv_pci_p7ioc_tce_invalidate(tbl, tbl->it_offset, tbl->it_size, false);
+ if (pe->table_group.group) {
+ iommu_group_put(pe->table_group.group);
+ WARN_ON(pe->table_group.group);
+ }
+
+ free_pages(tbl->it_base, get_order(tbl->it_size << 3));
+ iommu_tce_table_put(tbl);
+}
+
+static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
+{
+ struct iommu_table *tbl = pe->table_group.tables[0];
+ unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
+#ifdef CONFIG_IOMMU_API
+ int64_t rc;
+#endif
+
+ if (!weight)
+ return;
+
+#ifdef CONFIG_IOMMU_API
+ rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
+ if (rc)
+ pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
+#endif
+
+ pnv_pci_ioda2_set_bypass(pe, false);
+ if (pe->table_group.group) {
+ iommu_group_put(pe->table_group.group);
+ WARN_ON(pe->table_group.group);
+ }
+
+ iommu_tce_table_put(tbl);
+}
+
+static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
+ unsigned short win,
+ unsigned int *map)
+{
+ struct pnv_phb *phb = pe->phb;
+ int idx;
+ int64_t rc;
+
+ for (idx = 0; idx < phb->ioda.total_pe_num; idx++) {
+ if (map[idx] != pe->pe_number)
+ continue;
+
+ if (win == OPAL_M64_WINDOW_TYPE)
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+ phb->ioda.reserved_pe_idx, win,
+ idx / PNV_IODA1_M64_SEGS,
+ idx % PNV_IODA1_M64_SEGS);
+ else
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+ phb->ioda.reserved_pe_idx, win, 0, idx);
+
+ if (rc != OPAL_SUCCESS)
+ pe_warn(pe, "Error %ld unmapping (%d) segment#%d\n",
+ rc, win, idx);
+
+ map[idx] = IODA_INVALID_PE;
+ }
+}
+
+static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
+{
+ struct pnv_phb *phb = pe->phb;
+
+ if (phb->type == PNV_PHB_IODA1) {
+ pnv_ioda_free_pe_seg(pe, OPAL_IO_WINDOW_TYPE,
+ phb->ioda.io_segmap);
+ pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
+ phb->ioda.m32_segmap);
+ pnv_ioda_free_pe_seg(pe, OPAL_M64_WINDOW_TYPE,
+ phb->ioda.m64_segmap);
+ } else if (phb->type == PNV_PHB_IODA2) {
+ pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
+ phb->ioda.m32_segmap);
+ }
+}
+
+static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
+{
+ struct pnv_phb *phb = pe->phb;
+ struct pnv_ioda_pe *slave, *tmp;
+
+ list_del(&pe->list);
+ switch (phb->type) {
+ case PNV_PHB_IODA1:
+ pnv_pci_ioda1_release_pe_dma(pe);
+ break;
+ case PNV_PHB_IODA2:
+ pnv_pci_ioda2_release_pe_dma(pe);
+ break;
+ default:
+ WARN_ON(1);
+ }
+
+ pnv_ioda_release_pe_seg(pe);
+ pnv_ioda_deconfigure_pe(pe->phb, pe);
+
+ /* Release slave PEs in the compound PE */
+ if (pe->flags & PNV_IODA_PE_MASTER) {
+ list_for_each_entry_safe(slave, tmp, &pe->slaves, list) {
+ list_del(&slave->list);
+ pnv_ioda_free_pe(slave);
+ }
+ }
+
+ /*
+ * The PE for root bus can be removed because of hotplug in EEH
+ * recovery for fenced PHB error. We need to mark the PE dead so
+ * that it can be populated again in PCI hot add path. The PE
+ * shouldn't be destroyed as it's the global reserved resource.
+ */
+ if (phb->ioda.root_pe_populated &&
+ phb->ioda.root_pe_idx == pe->pe_number)
+ phb->ioda.root_pe_populated = false;
+ else
+ pnv_ioda_free_pe(pe);
+}
+
+static void pnv_pci_release_device(struct pci_dev *pdev)
+{
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_dn *pdn = pci_get_pdn(pdev);
+ struct pnv_ioda_pe *pe;
+
+ if (pdev->is_virtfn)
+ return;
+
+ if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+ return;
+
+	/*
+	 * PCI hotplug can happen as part of EEH error recovery. The @pdn
+	 * isn't removed and re-added in this scenario, so we must set the
+	 * PE number in @pdn to an invalid one. Otherwise, the PE's device
+	 * count is decreased on removing devices while failing to be
+	 * increased on adding devices. That leads to an unbalanced PE
+	 * device count and eventually breaks the normal PCI hotplug path.
+	 */
+ pe = &phb->ioda.pe_array[pdn->pe_number];
+ pdn->pe_number = IODA_INVALID_PE;
+
+ WARN_ON(--pe->device_count < 0);
+ if (pe->device_count == 0)
+ pnv_ioda_release_pe(pe);
+}
+
+static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
+{
+ struct pnv_phb *phb = hose->private_data;
+
+ opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE,
+ OPAL_ASSERT_RESET);
+}
+
+static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
+ .dma_dev_setup = pnv_pci_dma_dev_setup,
+ .dma_bus_setup = pnv_pci_dma_bus_setup,
+#ifdef CONFIG_PCI_MSI
+ .setup_msi_irqs = pnv_setup_msi_irqs,
+ .teardown_msi_irqs = pnv_teardown_msi_irqs,
+#endif
+ .enable_device_hook = pnv_pci_enable_device_hook,
+ .release_device = pnv_pci_release_device,
+ .window_alignment = pnv_pci_window_alignment,
+ .setup_bridge = pnv_pci_setup_bridge,
+ .reset_secondary_bus = pnv_pci_reset_secondary_bus,
+ .dma_set_mask = pnv_pci_ioda_dma_set_mask,
+ .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask,
+ .shutdown = pnv_pci_ioda_shutdown,
+};
+
+static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask)
+{
+ dev_err_once(&npdev->dev,
+ "%s operation unsupported for NVLink devices\n",
+ __func__);
+ return -EPERM;
+}
+
+static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
+ .dma_dev_setup = pnv_pci_dma_dev_setup,
+#ifdef CONFIG_PCI_MSI
+ .setup_msi_irqs = pnv_setup_msi_irqs,
+ .teardown_msi_irqs = pnv_teardown_msi_irqs,
+#endif
+ .enable_device_hook = pnv_pci_enable_device_hook,
+ .window_alignment = pnv_pci_window_alignment,
+ .reset_secondary_bus = pnv_pci_reset_secondary_bus,
+ .dma_set_mask = pnv_npu_dma_set_mask,
+ .shutdown = pnv_pci_ioda_shutdown,
+};
+
+static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
+ .enable_device_hook = pnv_pci_enable_device_hook,
+ .window_alignment = pnv_pci_window_alignment,
+ .reset_secondary_bus = pnv_pci_reset_secondary_bus,
+ .shutdown = pnv_pci_ioda_shutdown,
+};
+
+static void __init pnv_pci_init_ioda_phb(struct device_node *np,
+ u64 hub_id, int ioda_type)
+{
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ unsigned long size, m64map_off, m32map_off, pemap_off;
+ unsigned long iomap_off = 0, dma32map_off = 0;
+ struct resource r;
+ const __be64 *prop64;
+ const __be32 *prop32;
+ int len;
+ unsigned int segno;
+ u64 phb_id;
+ void *aux;
+ long rc;
+
+ if (!of_device_is_available(np))
+ return;
+
+ pr_info("Initializing %s PHB (%pOF)\n", pnv_phb_names[ioda_type], np);
+
+ prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
+ if (!prop64) {
+ pr_err(" Missing \"ibm,opal-phbid\" property !\n");
+ return;
+ }
+ phb_id = be64_to_cpup(prop64);
+ pr_debug(" PHB-ID : 0x%016llx\n", phb_id);
+
+ phb = memblock_virt_alloc(sizeof(*phb), 0);
+
+ /* Allocate PCI controller */
+ phb->hose = hose = pcibios_alloc_controller(np);
+ if (!phb->hose) {
+ pr_err(" Can't allocate PCI controller for %pOF\n",
+ np);
+ memblock_free(__pa(phb), sizeof(struct pnv_phb));
+ return;
+ }
+
+ spin_lock_init(&phb->lock);
+ prop32 = of_get_property(np, "bus-range", &len);
+ if (prop32 && len == 8) {
+ hose->first_busno = be32_to_cpu(prop32[0]);
+ hose->last_busno = be32_to_cpu(prop32[1]);
+ } else {
+ pr_warn(" Broken <bus-range> on %pOF\n", np);
+ hose->first_busno = 0;
+ hose->last_busno = 0xff;
+ }
+ hose->private_data = phb;
+ phb->hub_id = hub_id;
+ phb->opal_id = phb_id;
+ phb->type = ioda_type;
+ mutex_init(&phb->ioda.pe_alloc_mutex);
+
+ /* Detect specific models for error handling */
+ if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
+ phb->model = PNV_PHB_MODEL_P7IOC;
+ else if (of_device_is_compatible(np, "ibm,power8-pciex"))
+ phb->model = PNV_PHB_MODEL_PHB3;
+ else if (of_device_is_compatible(np, "ibm,power8-npu-pciex"))
+ phb->model = PNV_PHB_MODEL_NPU;
+ else if (of_device_is_compatible(np, "ibm,power9-npu-pciex"))
+ phb->model = PNV_PHB_MODEL_NPU2;
+ else
+ phb->model = PNV_PHB_MODEL_UNKNOWN;
+
+ /* Initialize diagnostic data buffer */
+ prop32 = of_get_property(np, "ibm,phb-diag-data-size", NULL);
+ if (prop32)
+ phb->diag_data_size = be32_to_cpup(prop32);
+ else
+ phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE;
+
+ phb->diag_data = memblock_virt_alloc(phb->diag_data_size, 0);
+
+ /* Parse 32-bit and IO ranges (if any) */
+ pci_process_bridge_OF_ranges(hose, np, !hose->global_number);
+
+ /* Get registers */
+ if (!of_address_to_resource(np, 0, &r)) {
+ phb->regs_phys = r.start;
+ phb->regs = ioremap(r.start, resource_size(&r));
+ if (phb->regs == NULL)
+ pr_err(" Failed to map registers !\n");
+ }
+
+ /* Initialize more IODA stuff */
+ phb->ioda.total_pe_num = 1;
+ prop32 = of_get_property(np, "ibm,opal-num-pes", NULL);
+ if (prop32)
+ phb->ioda.total_pe_num = be32_to_cpup(prop32);
+ prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL);
+ if (prop32)
+ phb->ioda.reserved_pe_idx = be32_to_cpup(prop32);
+
+ /* Invalidate RID to PE# mapping */
+ for (segno = 0; segno < ARRAY_SIZE(phb->ioda.pe_rmap); segno++)
+ phb->ioda.pe_rmap[segno] = IODA_INVALID_PE;
+
+ /* Parse 64-bit MMIO range */
+ pnv_ioda_parse_m64_window(phb);
+
+ phb->ioda.m32_size = resource_size(&hose->mem_resources[0]);
+	/* FW has already clipped off the top 64k of M32 space (MSI space) */
+ phb->ioda.m32_size += 0x10000;
+
+ phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe_num;
+ phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
+ phb->ioda.io_size = hose->pci_io_size;
+ phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num;
+ phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
+
+ /* Calculate how many 32-bit TCE segments we have */
+ phb->ioda.dma32_count = phb->ioda.m32_pci_base /
+ PNV_IODA1_DMA32_SEGSIZE;
+
+ /* Allocate aux data & arrays. We don't have IO ports on PHB3 */
+ size = _ALIGN_UP(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
+ sizeof(unsigned long));
+ m64map_off = size;
+ size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]);
+ m32map_off = size;
+ size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]);
+ if (phb->type == PNV_PHB_IODA1) {
+ iomap_off = size;
+ size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]);
+ dma32map_off = size;
+ size += phb->ioda.dma32_count *
+ sizeof(phb->ioda.dma32_segmap[0]);
+ }
+ pemap_off = size;
+ size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe);
+ aux = memblock_virt_alloc(size, 0);
+ phb->ioda.pe_alloc = aux;
+ phb->ioda.m64_segmap = aux + m64map_off;
+ phb->ioda.m32_segmap = aux + m32map_off;
+ for (segno = 0; segno < phb->ioda.total_pe_num; segno++) {
+ phb->ioda.m64_segmap[segno] = IODA_INVALID_PE;
+ phb->ioda.m32_segmap[segno] = IODA_INVALID_PE;
+ }
+ if (phb->type == PNV_PHB_IODA1) {
+ phb->ioda.io_segmap = aux + iomap_off;
+ for (segno = 0; segno < phb->ioda.total_pe_num; segno++)
+ phb->ioda.io_segmap[segno] = IODA_INVALID_PE;
+
+ phb->ioda.dma32_segmap = aux + dma32map_off;
+ for (segno = 0; segno < phb->ioda.dma32_count; segno++)
+ phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE;
+ }
+ phb->ioda.pe_array = aux + pemap_off;
+
+	/*
+	 * Choose a PE number for the root bus, which shouldn't have
+	 * M64 resources consumed by its child devices. We pick the PE
+	 * number adjacent to the reserved one if possible.
+	 */
+ pnv_ioda_reserve_pe(phb, phb->ioda.reserved_pe_idx);
+ if (phb->ioda.reserved_pe_idx == 0) {
+ phb->ioda.root_pe_idx = 1;
+ pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
+ } else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) {
+ phb->ioda.root_pe_idx = phb->ioda.reserved_pe_idx - 1;
+ pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
+ } else {
+ phb->ioda.root_pe_idx = IODA_INVALID_PE;
+ }
+
+ INIT_LIST_HEAD(&phb->ioda.pe_list);
+ mutex_init(&phb->ioda.pe_list_mutex);
+
+ /* Calculate how many 32-bit TCE segments we have */
+ phb->ioda.dma32_count = phb->ioda.m32_pci_base /
+ PNV_IODA1_DMA32_SEGSIZE;
+
+#if 0 /* We should really do that ... */
+ rc = opal_pci_set_phb_mem_window(opal->phb_id,
+ window_type,
+ window_num,
+ starting_real_address,
+ starting_pci_address,
+ segment_size);
+#endif
+
+ pr_info(" %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n",
+ phb->ioda.total_pe_num, phb->ioda.reserved_pe_idx,
+ phb->ioda.m32_size, phb->ioda.m32_segsize);
+ if (phb->ioda.m64_size)
+ pr_info(" M64: 0x%lx [segment=0x%lx]\n",
+ phb->ioda.m64_size, phb->ioda.m64_segsize);
+ if (phb->ioda.io_size)
+ pr_info(" IO: 0x%x [segment=0x%x]\n",
+ phb->ioda.io_size, phb->ioda.io_segsize);
+
+
+ phb->hose->ops = &pnv_pci_ops;
+ phb->get_pe_state = pnv_ioda_get_pe_state;
+ phb->freeze_pe = pnv_ioda_freeze_pe;
+ phb->unfreeze_pe = pnv_ioda_unfreeze_pe;
+
+ /* Setup MSI support */
+ pnv_pci_init_ioda_msis(phb);
+
+	/*
+	 * We pass the PCI probe flag PCI_REASSIGN_ALL_RSRC here
+	 * to let the PCI core do resource assignment. It's expected
+	 * that the PCI core will do correct I/O and MMIO alignment
+	 * for the P2P bridge BARs so that each PCI bus (excluding
+	 * the child P2P bridges) can form an individual PE.
+	 */
+ ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
+
+ switch (phb->type) {
+ case PNV_PHB_NPU_NVLINK:
+ hose->controller_ops = pnv_npu_ioda_controller_ops;
+ break;
+ case PNV_PHB_NPU_OCAPI:
+ hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops;
+ break;
+ default:
+ phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
+ hose->controller_ops = pnv_pci_ioda_controller_ops;
+ }
+
+ ppc_md.pcibios_default_alignment = pnv_pci_default_alignment;
+
+#ifdef CONFIG_PCI_IOV
+ ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov;
+ ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
+ ppc_md.pcibios_sriov_enable = pnv_pcibios_sriov_enable;
+ ppc_md.pcibios_sriov_disable = pnv_pcibios_sriov_disable;
+#endif
+
+ pci_add_flags(PCI_REASSIGN_ALL_RSRC);
+
+ /* Reset IODA tables to a clean state */
+ rc = opal_pci_reset(phb_id, OPAL_RESET_PCI_IODA_TABLE, OPAL_ASSERT_RESET);
+ if (rc)
+ pr_warn(" OPAL Error %ld performing IODA table reset !\n", rc);
+
+	/*
+	 * If we're running in a kdump kernel, the previous kernel never
+	 * shut down PCI devices correctly, and we already have the IODA
+	 * tables cleaned out. So we have to issue a PHB reset to stop all
+	 * PCI transactions from the previous kernel. The ppc_pci_reset_phbs
+	 * kernel parameter will force this reset too.
+	 */
+ if (is_kdump_kernel() || pci_reset_phbs) {
+ pr_info(" Issue PHB reset ...\n");
+ pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
+ pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE);
+ }
+
+ /* Remove M64 resource if we can't configure it successfully */
+ if (!phb->init_m64 || phb->init_m64(phb))
+ hose->mem_resources[1].flags = 0;
+}
+
+void __init pnv_pci_init_ioda2_phb(struct device_node *np)
+{
+ pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
+}
+
+void __init pnv_pci_init_npu_phb(struct device_node *np)
+{
+ pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_NVLINK);
+}
+
+void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np)
+{
+ pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_OCAPI);
+}
+
+static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+
+ if (!machine_is(powernv))
+ return;
+
+ if (phb->type == PNV_PHB_NPU_OCAPI)
+ dev->cfg_size = PCI_CFG_SPACE_EXP_SIZE;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pnv_npu2_opencapi_cfg_size_fixup);
+
+void __init pnv_pci_init_ioda_hub(struct device_node *np)
+{
+ struct device_node *phbn;
+ const __be64 *prop64;
+ u64 hub_id;
+
+ pr_info("Probing IODA IO-Hub %pOF\n", np);
+
+ prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
+ if (!prop64) {
+ pr_err(" Missing \"ibm,opal-hubid\" property !\n");
+ return;
+ }
+ hub_id = be64_to_cpup(prop64);
+ pr_devel(" HUB-ID : 0x%016llx\n", hub_id);
+
+ /* Count child PHBs */
+ for_each_child_of_node(np, phbn) {
+ /* Look for IODA1 PHBs */
+ if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
+ pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
+ }
+}
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
new file mode 100644
index 000000000..b6fa900af
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -0,0 +1,1129 @@
+/*
+ * Support PCI/PCIe on PowerNV platforms
+ *
+ * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/io.h>
+#include <linux/msi.h>
+#include <linux/iommu.h>
+#include <linux/sched/mm.h>
+
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/msi_bitmap.h>
+#include <asm/ppc-pci.h>
+#include <asm/pnv-pci.h>
+#include <asm/opal.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+#include <asm/firmware.h>
+#include <asm/eeh_event.h>
+#include <asm/eeh.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+static DEFINE_MUTEX(p2p_mutex);
+static DEFINE_MUTEX(tunnel_mutex);
+
+int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id)
+{
+ struct device_node *parent = np;
+ u32 bdfn;
+ u64 phbid;
+ int ret;
+
+ ret = of_property_read_u32(np, "reg", &bdfn);
+ if (ret)
+ return -ENXIO;
+
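+	/*
+	 * For illustration: the first "reg" cell follows the OF PCI binding
+	 * (bits 23:16 bus, 15:11 device, 10:8 function), so a hypothetical
+	 * device at 01:02.3 has a cell of 0x00011300 and the mask/shift
+	 * below leaves bdfn = 0x0113.
+	 */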
+ bdfn = ((bdfn & 0x00ffff00) >> 8);
+ while ((parent = of_get_parent(parent))) {
+ if (!PCI_DN(parent)) {
+ of_node_put(parent);
+ break;
+ }
+
+ if (!of_device_is_compatible(parent, "ibm,ioda2-phb")) {
+ of_node_put(parent);
+ continue;
+ }
+
+ ret = of_property_read_u64(parent, "ibm,opal-phbid", &phbid);
+ if (ret) {
+ of_node_put(parent);
+ return -ENXIO;
+ }
+
+ *id = PCI_SLOT_ID(phbid, bdfn);
+ return 0;
+ }
+
+ return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(pnv_pci_get_slot_id);
+
+int pnv_pci_get_device_tree(uint32_t phandle, void *buf, uint64_t len)
+{
+ int64_t rc;
+
+ if (!opal_check_token(OPAL_GET_DEVICE_TREE))
+ return -ENXIO;
+
+ rc = opal_get_device_tree(phandle, (uint64_t)buf, len);
+ if (rc < OPAL_SUCCESS)
+ return -EIO;
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(pnv_pci_get_device_tree);
+
+int pnv_pci_get_presence_state(uint64_t id, uint8_t *state)
+{
+ int64_t rc;
+
+ if (!opal_check_token(OPAL_PCI_GET_PRESENCE_STATE))
+ return -ENXIO;
+
+ rc = opal_pci_get_presence_state(id, (uint64_t)state);
+ if (rc != OPAL_SUCCESS)
+ return -EIO;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_pci_get_presence_state);
+
+int pnv_pci_get_power_state(uint64_t id, uint8_t *state)
+{
+ int64_t rc;
+
+ if (!opal_check_token(OPAL_PCI_GET_POWER_STATE))
+ return -ENXIO;
+
+ rc = opal_pci_get_power_state(id, (uint64_t)state);
+ if (rc != OPAL_SUCCESS)
+ return -EIO;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_pci_get_power_state);
+
+int pnv_pci_set_power_state(uint64_t id, uint8_t state, struct opal_msg *msg)
+{
+ struct opal_msg m;
+ int token, ret;
+ int64_t rc;
+
+ if (!opal_check_token(OPAL_PCI_SET_POWER_STATE))
+ return -ENXIO;
+
+ token = opal_async_get_token_interruptible();
+ if (unlikely(token < 0))
+ return token;
+
+ rc = opal_pci_set_power_state(token, id, (uint64_t)&state);
+ if (rc == OPAL_SUCCESS) {
+ ret = 0;
+ goto exit;
+ } else if (rc != OPAL_ASYNC_COMPLETION) {
+ ret = -EIO;
+ goto exit;
+ }
+
+ ret = opal_async_wait_response(token, &m);
+ if (ret < 0)
+ goto exit;
+
+ if (msg) {
+ ret = 1;
+ memcpy(msg, &m, sizeof(m));
+ }
+
+exit:
+ opal_async_release_token(token);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pnv_pci_set_power_state);
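+
+/*
+ * A minimal usage sketch (hypothetical caller): power on a slot and pick
+ * up the asynchronous OPAL completion, e.g.
+ *
+ *	struct opal_msg msg;
+ *	int ret = pnv_pci_set_power_state(id, OPAL_PCI_SLOT_POWER_ON, &msg);
+ *
+ * ret == 0 means the state changed synchronously, ret == 1 means @msg now
+ * holds the async completion message, and a negative value is an error.
+ */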
+
+#ifdef CONFIG_PCI_MSI
+int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
+{
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct msi_desc *entry;
+ struct msi_msg msg;
+ int hwirq;
+ unsigned int virq;
+ int rc;
+
+ if (WARN_ON(!phb) || !phb->msi_bmp.bitmap)
+ return -ENODEV;
+
+ if (pdev->no_64bit_msi && !phb->msi32_support)
+ return -ENODEV;
+
+ for_each_pci_msi_entry(entry, pdev) {
+ if (!entry->msi_attrib.is_64 && !phb->msi32_support) {
+ pr_warn("%s: Supports only 64-bit MSIs\n",
+ pci_name(pdev));
+ return -ENXIO;
+ }
+ hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, 1);
+ if (hwirq < 0) {
+ pr_warn("%s: Failed to find a free MSI\n",
+ pci_name(pdev));
+ return -ENOSPC;
+ }
+ virq = irq_create_mapping(NULL, phb->msi_base + hwirq);
+ if (!virq) {
+ pr_warn("%s: Failed to map MSI to linux irq\n",
+ pci_name(pdev));
+ msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1);
+ return -ENOMEM;
+ }
+ rc = phb->msi_setup(phb, pdev, phb->msi_base + hwirq,
+ virq, entry->msi_attrib.is_64, &msg);
+ if (rc) {
+ pr_warn("%s: Failed to setup MSI\n", pci_name(pdev));
+ irq_dispose_mapping(virq);
+ msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1);
+ return rc;
+ }
+ irq_set_msi_desc(virq, entry);
+ pci_write_msi_msg(virq, &msg);
+ }
+ return 0;
+}
+
+void pnv_teardown_msi_irqs(struct pci_dev *pdev)
+{
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct msi_desc *entry;
+ irq_hw_number_t hwirq;
+
+ if (WARN_ON(!phb))
+ return;
+
+ for_each_pci_msi_entry(entry, pdev) {
+ if (!entry->irq)
+ continue;
+ hwirq = virq_to_hw(entry->irq);
+ irq_set_msi_desc(entry->irq, NULL);
+ irq_dispose_mapping(entry->irq);
+ msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, 1);
+ }
+}
+#endif /* CONFIG_PCI_MSI */
+
+/* Nicely print the contents of the PE State Tables (PEST). */
+static void pnv_pci_dump_pest(__be64 pestA[], __be64 pestB[], int pest_size)
+{
+	u64 prevA = ULONG_MAX, prevB = ULONG_MAX;
+ bool dup = false;
+ int i;
+
+ for (i = 0; i < pest_size; i++) {
+		u64 peA = be64_to_cpu(pestA[i]);
+		u64 peB = be64_to_cpu(pestB[i]);
+
+ if (peA != prevA || peB != prevB) {
+ if (dup) {
+ pr_info("PE[..%03x] A/B: as above\n", i-1);
+ dup = false;
+ }
+ prevA = peA;
+ prevB = peB;
+ if (peA & PNV_IODA_STOPPED_STATE ||
+ peB & PNV_IODA_STOPPED_STATE)
+ pr_info("PE[%03x] A/B: %016llx %016llx\n",
+ i, peA, peB);
+ } else if (!dup && (peA & PNV_IODA_STOPPED_STATE ||
+ peB & PNV_IODA_STOPPED_STATE)) {
+ dup = true;
+ }
+ }
+}
+
+static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose,
+ struct OpalIoPhbErrorCommon *common)
+{
+ struct OpalIoP7IOCPhbErrorData *data;
+
+ data = (struct OpalIoP7IOCPhbErrorData *)common;
+ pr_info("P7IOC PHB#%x Diag-data (Version: %d)\n",
+ hose->global_number, be32_to_cpu(common->version));
+
+ if (data->brdgCtl)
+ pr_info("brdgCtl: %08x\n",
+ be32_to_cpu(data->brdgCtl));
+ if (data->portStatusReg || data->rootCmplxStatus ||
+ data->busAgentStatus)
+ pr_info("UtlSts: %08x %08x %08x\n",
+ be32_to_cpu(data->portStatusReg),
+ be32_to_cpu(data->rootCmplxStatus),
+ be32_to_cpu(data->busAgentStatus));
+ if (data->deviceStatus || data->slotStatus ||
+ data->linkStatus || data->devCmdStatus ||
+ data->devSecStatus)
+ pr_info("RootSts: %08x %08x %08x %08x %08x\n",
+ be32_to_cpu(data->deviceStatus),
+ be32_to_cpu(data->slotStatus),
+ be32_to_cpu(data->linkStatus),
+ be32_to_cpu(data->devCmdStatus),
+ be32_to_cpu(data->devSecStatus));
+ if (data->rootErrorStatus || data->uncorrErrorStatus ||
+ data->corrErrorStatus)
+ pr_info("RootErrSts: %08x %08x %08x\n",
+ be32_to_cpu(data->rootErrorStatus),
+ be32_to_cpu(data->uncorrErrorStatus),
+ be32_to_cpu(data->corrErrorStatus));
+ if (data->tlpHdr1 || data->tlpHdr2 ||
+ data->tlpHdr3 || data->tlpHdr4)
+ pr_info("RootErrLog: %08x %08x %08x %08x\n",
+ be32_to_cpu(data->tlpHdr1),
+ be32_to_cpu(data->tlpHdr2),
+ be32_to_cpu(data->tlpHdr3),
+ be32_to_cpu(data->tlpHdr4));
+ if (data->sourceId || data->errorClass ||
+ data->correlator)
+ pr_info("RootErrLog1: %08x %016llx %016llx\n",
+ be32_to_cpu(data->sourceId),
+ be64_to_cpu(data->errorClass),
+ be64_to_cpu(data->correlator));
+ if (data->p7iocPlssr || data->p7iocCsr)
+ pr_info("PhbSts: %016llx %016llx\n",
+ be64_to_cpu(data->p7iocPlssr),
+ be64_to_cpu(data->p7iocCsr));
+ if (data->lemFir)
+ pr_info("Lem: %016llx %016llx %016llx\n",
+ be64_to_cpu(data->lemFir),
+ be64_to_cpu(data->lemErrorMask),
+ be64_to_cpu(data->lemWOF));
+ if (data->phbErrorStatus)
+ pr_info("PhbErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->phbErrorStatus),
+ be64_to_cpu(data->phbFirstErrorStatus),
+ be64_to_cpu(data->phbErrorLog0),
+ be64_to_cpu(data->phbErrorLog1));
+ if (data->mmioErrorStatus)
+ pr_info("OutErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->mmioErrorStatus),
+ be64_to_cpu(data->mmioFirstErrorStatus),
+ be64_to_cpu(data->mmioErrorLog0),
+ be64_to_cpu(data->mmioErrorLog1));
+ if (data->dma0ErrorStatus)
+ pr_info("InAErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->dma0ErrorStatus),
+ be64_to_cpu(data->dma0FirstErrorStatus),
+ be64_to_cpu(data->dma0ErrorLog0),
+ be64_to_cpu(data->dma0ErrorLog1));
+ if (data->dma1ErrorStatus)
+ pr_info("InBErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->dma1ErrorStatus),
+ be64_to_cpu(data->dma1FirstErrorStatus),
+ be64_to_cpu(data->dma1ErrorLog0),
+ be64_to_cpu(data->dma1ErrorLog1));
+
+ pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_P7IOC_NUM_PEST_REGS);
+}
+
+static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose,
+ struct OpalIoPhbErrorCommon *common)
+{
+ struct OpalIoPhb3ErrorData *data;
+
+ data = (struct OpalIoPhb3ErrorData*)common;
+ pr_info("PHB3 PHB#%x Diag-data (Version: %d)\n",
+ hose->global_number, be32_to_cpu(common->version));
+ if (data->brdgCtl)
+ pr_info("brdgCtl: %08x\n",
+ be32_to_cpu(data->brdgCtl));
+ if (data->portStatusReg || data->rootCmplxStatus ||
+ data->busAgentStatus)
+ pr_info("UtlSts: %08x %08x %08x\n",
+ be32_to_cpu(data->portStatusReg),
+ be32_to_cpu(data->rootCmplxStatus),
+ be32_to_cpu(data->busAgentStatus));
+ if (data->deviceStatus || data->slotStatus ||
+ data->linkStatus || data->devCmdStatus ||
+ data->devSecStatus)
+ pr_info("RootSts: %08x %08x %08x %08x %08x\n",
+ be32_to_cpu(data->deviceStatus),
+ be32_to_cpu(data->slotStatus),
+ be32_to_cpu(data->linkStatus),
+ be32_to_cpu(data->devCmdStatus),
+ be32_to_cpu(data->devSecStatus));
+ if (data->rootErrorStatus || data->uncorrErrorStatus ||
+ data->corrErrorStatus)
+ pr_info("RootErrSts: %08x %08x %08x\n",
+ be32_to_cpu(data->rootErrorStatus),
+ be32_to_cpu(data->uncorrErrorStatus),
+ be32_to_cpu(data->corrErrorStatus));
+ if (data->tlpHdr1 || data->tlpHdr2 ||
+ data->tlpHdr3 || data->tlpHdr4)
+ pr_info("RootErrLog: %08x %08x %08x %08x\n",
+ be32_to_cpu(data->tlpHdr1),
+ be32_to_cpu(data->tlpHdr2),
+ be32_to_cpu(data->tlpHdr3),
+ be32_to_cpu(data->tlpHdr4));
+ if (data->sourceId || data->errorClass ||
+ data->correlator)
+ pr_info("RootErrLog1: %08x %016llx %016llx\n",
+ be32_to_cpu(data->sourceId),
+ be64_to_cpu(data->errorClass),
+ be64_to_cpu(data->correlator));
+ if (data->nFir)
+ pr_info("nFir: %016llx %016llx %016llx\n",
+ be64_to_cpu(data->nFir),
+ be64_to_cpu(data->nFirMask),
+ be64_to_cpu(data->nFirWOF));
+ if (data->phbPlssr || data->phbCsr)
+ pr_info("PhbSts: %016llx %016llx\n",
+ be64_to_cpu(data->phbPlssr),
+ be64_to_cpu(data->phbCsr));
+ if (data->lemFir)
+ pr_info("Lem: %016llx %016llx %016llx\n",
+ be64_to_cpu(data->lemFir),
+ be64_to_cpu(data->lemErrorMask),
+ be64_to_cpu(data->lemWOF));
+ if (data->phbErrorStatus)
+ pr_info("PhbErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->phbErrorStatus),
+ be64_to_cpu(data->phbFirstErrorStatus),
+ be64_to_cpu(data->phbErrorLog0),
+ be64_to_cpu(data->phbErrorLog1));
+ if (data->mmioErrorStatus)
+ pr_info("OutErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->mmioErrorStatus),
+ be64_to_cpu(data->mmioFirstErrorStatus),
+ be64_to_cpu(data->mmioErrorLog0),
+ be64_to_cpu(data->mmioErrorLog1));
+ if (data->dma0ErrorStatus)
+ pr_info("InAErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->dma0ErrorStatus),
+ be64_to_cpu(data->dma0FirstErrorStatus),
+ be64_to_cpu(data->dma0ErrorLog0),
+ be64_to_cpu(data->dma0ErrorLog1));
+ if (data->dma1ErrorStatus)
+ pr_info("InBErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->dma1ErrorStatus),
+ be64_to_cpu(data->dma1FirstErrorStatus),
+ be64_to_cpu(data->dma1ErrorLog0),
+ be64_to_cpu(data->dma1ErrorLog1));
+
+ pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_PHB3_NUM_PEST_REGS);
+}
+
+static void pnv_pci_dump_phb4_diag_data(struct pci_controller *hose,
+ struct OpalIoPhbErrorCommon *common)
+{
+ struct OpalIoPhb4ErrorData *data;
+
+ data = (struct OpalIoPhb4ErrorData*)common;
+ pr_info("PHB4 PHB#%d Diag-data (Version: %d)\n",
+ hose->global_number, be32_to_cpu(common->version));
+ if (data->brdgCtl)
+ pr_info("brdgCtl: %08x\n",
+ be32_to_cpu(data->brdgCtl));
+ if (data->deviceStatus || data->slotStatus ||
+ data->linkStatus || data->devCmdStatus ||
+ data->devSecStatus)
+ pr_info("RootSts: %08x %08x %08x %08x %08x\n",
+ be32_to_cpu(data->deviceStatus),
+ be32_to_cpu(data->slotStatus),
+ be32_to_cpu(data->linkStatus),
+ be32_to_cpu(data->devCmdStatus),
+ be32_to_cpu(data->devSecStatus));
+ if (data->rootErrorStatus || data->uncorrErrorStatus ||
+ data->corrErrorStatus)
+ pr_info("RootErrSts: %08x %08x %08x\n",
+ be32_to_cpu(data->rootErrorStatus),
+ be32_to_cpu(data->uncorrErrorStatus),
+ be32_to_cpu(data->corrErrorStatus));
+ if (data->tlpHdr1 || data->tlpHdr2 ||
+ data->tlpHdr3 || data->tlpHdr4)
+ pr_info("RootErrLog: %08x %08x %08x %08x\n",
+ be32_to_cpu(data->tlpHdr1),
+ be32_to_cpu(data->tlpHdr2),
+ be32_to_cpu(data->tlpHdr3),
+ be32_to_cpu(data->tlpHdr4));
+ if (data->sourceId)
+ pr_info("sourceId: %08x\n", be32_to_cpu(data->sourceId));
+ if (data->nFir)
+ pr_info("nFir: %016llx %016llx %016llx\n",
+ be64_to_cpu(data->nFir),
+ be64_to_cpu(data->nFirMask),
+ be64_to_cpu(data->nFirWOF));
+ if (data->phbPlssr || data->phbCsr)
+ pr_info("PhbSts: %016llx %016llx\n",
+ be64_to_cpu(data->phbPlssr),
+ be64_to_cpu(data->phbCsr));
+ if (data->lemFir)
+ pr_info("Lem: %016llx %016llx %016llx\n",
+ be64_to_cpu(data->lemFir),
+ be64_to_cpu(data->lemErrorMask),
+ be64_to_cpu(data->lemWOF));
+ if (data->phbErrorStatus)
+ pr_info("PhbErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->phbErrorStatus),
+ be64_to_cpu(data->phbFirstErrorStatus),
+ be64_to_cpu(data->phbErrorLog0),
+ be64_to_cpu(data->phbErrorLog1));
+ if (data->phbTxeErrorStatus)
+ pr_info("PhbTxeErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->phbTxeErrorStatus),
+ be64_to_cpu(data->phbTxeFirstErrorStatus),
+ be64_to_cpu(data->phbTxeErrorLog0),
+ be64_to_cpu(data->phbTxeErrorLog1));
+ if (data->phbRxeArbErrorStatus)
+ pr_info("RxeArbErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->phbRxeArbErrorStatus),
+ be64_to_cpu(data->phbRxeArbFirstErrorStatus),
+ be64_to_cpu(data->phbRxeArbErrorLog0),
+ be64_to_cpu(data->phbRxeArbErrorLog1));
+ if (data->phbRxeMrgErrorStatus)
+ pr_info("RxeMrgErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->phbRxeMrgErrorStatus),
+ be64_to_cpu(data->phbRxeMrgFirstErrorStatus),
+ be64_to_cpu(data->phbRxeMrgErrorLog0),
+ be64_to_cpu(data->phbRxeMrgErrorLog1));
+ if (data->phbRxeTceErrorStatus)
+ pr_info("RxeTceErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->phbRxeTceErrorStatus),
+ be64_to_cpu(data->phbRxeTceFirstErrorStatus),
+ be64_to_cpu(data->phbRxeTceErrorLog0),
+ be64_to_cpu(data->phbRxeTceErrorLog1));
+
+ if (data->phbPblErrorStatus)
+ pr_info("PblErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->phbPblErrorStatus),
+ be64_to_cpu(data->phbPblFirstErrorStatus),
+ be64_to_cpu(data->phbPblErrorLog0),
+ be64_to_cpu(data->phbPblErrorLog1));
+ if (data->phbPcieDlpErrorStatus)
+ pr_info("PcieDlp: %016llx %016llx %016llx\n",
+ be64_to_cpu(data->phbPcieDlpErrorLog1),
+ be64_to_cpu(data->phbPcieDlpErrorLog2),
+ be64_to_cpu(data->phbPcieDlpErrorStatus));
+ if (data->phbRegbErrorStatus)
+ pr_info("RegbErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->phbRegbErrorStatus),
+ be64_to_cpu(data->phbRegbFirstErrorStatus),
+ be64_to_cpu(data->phbRegbErrorLog0),
+ be64_to_cpu(data->phbRegbErrorLog1));
+
+
+ pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_PHB4_NUM_PEST_REGS);
+}
+
+void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
+ unsigned char *log_buff)
+{
+ struct OpalIoPhbErrorCommon *common;
+
+ if (!hose || !log_buff)
+ return;
+
+ common = (struct OpalIoPhbErrorCommon *)log_buff;
+ switch (be32_to_cpu(common->ioType)) {
+ case OPAL_PHB_ERROR_DATA_TYPE_P7IOC:
+ pnv_pci_dump_p7ioc_diag_data(hose, common);
+ break;
+ case OPAL_PHB_ERROR_DATA_TYPE_PHB3:
+ pnv_pci_dump_phb3_diag_data(hose, common);
+ break;
+ case OPAL_PHB_ERROR_DATA_TYPE_PHB4:
+ pnv_pci_dump_phb4_diag_data(hose, common);
+ break;
+ default:
+ pr_warn("%s: Unrecognized ioType %d\n",
+ __func__, be32_to_cpu(common->ioType));
+ }
+}
+
+static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
+{
+ unsigned long flags, rc;
+ int has_diag, ret = 0;
+
+ spin_lock_irqsave(&phb->lock, flags);
+
+ /* Fetch PHB diag-data */
+ rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data,
+ phb->diag_data_size);
+ has_diag = (rc == OPAL_SUCCESS);
+
+	/* If the PHB supports compound PEs, let it handle the unfreeze */
+ if (phb->unfreeze_pe) {
+ ret = phb->unfreeze_pe(phb,
+ pe_no,
+ OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+ } else {
+ rc = opal_pci_eeh_freeze_clear(phb->opal_id,
+ pe_no,
+ OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+ if (rc) {
+ pr_warn("%s: Failure %ld clearing frozen "
+ "PHB#%x-PE#%x\n",
+ __func__, rc, phb->hose->global_number,
+ pe_no);
+ ret = -EIO;
+ }
+ }
+
+ /*
+ * For now, let's only display the diag buffer when we fail to clear
+ * the EEH status. We'll do more sensible things later when we have
+ * proper EEH support. We need to make sure we don't pollute ourselves
+ * with the normal errors generated when probing empty slots
+ */
+ if (has_diag && ret)
+ pnv_pci_dump_phb_diag_data(phb->hose, phb->diag_data);
+
+ spin_unlock_irqrestore(&phb->lock, flags);
+}
+
+static void pnv_pci_config_check_eeh(struct pci_dn *pdn)
+{
+ struct pnv_phb *phb = pdn->phb->private_data;
+ u8 fstate = 0;
+ __be16 pcierr = 0;
+ unsigned int pe_no;
+ s64 rc;
+
+	/*
+	 * Get the PE#. During the PCI probe stage, we might not have
+	 * set that up yet, so all ER errors should be mapped to the
+	 * reserved PE.
+	 */
+ pe_no = pdn->pe_number;
+ if (pe_no == IODA_INVALID_PE) {
+ pe_no = phb->ioda.reserved_pe_idx;
+ }
+
+	/*
+	 * Fetch the frozen state. If the PHB supports compound PEs,
+	 * we need to handle that case.
+	 */
+ if (phb->get_pe_state) {
+ fstate = phb->get_pe_state(phb, pe_no);
+ } else {
+ rc = opal_pci_eeh_freeze_status(phb->opal_id,
+ pe_no,
+ &fstate,
+ &pcierr,
+ NULL);
+ if (rc) {
+ pr_warn("%s: Failure %lld getting PHB#%x-PE#%x state\n",
+ __func__, rc, phb->hose->global_number, pe_no);
+ return;
+ }
+ }
+
+ pr_devel(" -> EEH check, bdfn=%04x PE#%x fstate=%x\n",
+ (pdn->busno << 8) | (pdn->devfn), pe_no, fstate);
+
+ /* Clear the frozen state if applicable */
+ if (fstate == OPAL_EEH_STOPPED_MMIO_FREEZE ||
+ fstate == OPAL_EEH_STOPPED_DMA_FREEZE ||
+ fstate == OPAL_EEH_STOPPED_MMIO_DMA_FREEZE) {
+ /*
+ * If PHB supports compound PE, freeze it for
+ * consistency.
+ */
+ if (phb->freeze_pe)
+ phb->freeze_pe(phb, pe_no);
+
+ pnv_pci_handle_eeh_config(phb, pe_no);
+ }
+}
+
+int pnv_pci_cfg_read(struct pci_dn *pdn,
+ int where, int size, u32 *val)
+{
+ struct pnv_phb *phb = pdn->phb->private_data;
+ u32 bdfn = (pdn->busno << 8) | pdn->devfn;
+ s64 rc;
+
+ switch (size) {
+ case 1: {
+ u8 v8;
+ rc = opal_pci_config_read_byte(phb->opal_id, bdfn, where, &v8);
+ *val = (rc == OPAL_SUCCESS) ? v8 : 0xff;
+ break;
+ }
+ case 2: {
+ __be16 v16;
+ rc = opal_pci_config_read_half_word(phb->opal_id, bdfn, where,
+ &v16);
+ *val = (rc == OPAL_SUCCESS) ? be16_to_cpu(v16) : 0xffff;
+ break;
+ }
+ case 4: {
+ __be32 v32;
+ rc = opal_pci_config_read_word(phb->opal_id, bdfn, where, &v32);
+ *val = (rc == OPAL_SUCCESS) ? be32_to_cpu(v32) : 0xffffffff;
+ break;
+ }
+ default:
+ return PCIBIOS_FUNC_NOT_SUPPORTED;
+ }
+
+ pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
+ __func__, pdn->busno, pdn->devfn, where, size, *val);
+ return PCIBIOS_SUCCESSFUL;
+}
+
+int pnv_pci_cfg_write(struct pci_dn *pdn,
+ int where, int size, u32 val)
+{
+ struct pnv_phb *phb = pdn->phb->private_data;
+ u32 bdfn = (pdn->busno << 8) | pdn->devfn;
+
+ pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
+ __func__, pdn->busno, pdn->devfn, where, size, val);
+ switch (size) {
+ case 1:
+ opal_pci_config_write_byte(phb->opal_id, bdfn, where, val);
+ break;
+ case 2:
+ opal_pci_config_write_half_word(phb->opal_id, bdfn, where, val);
+ break;
+ case 4:
+ opal_pci_config_write_word(phb->opal_id, bdfn, where, val);
+ break;
+ default:
+ return PCIBIOS_FUNC_NOT_SUPPORTED;
+ }
+
+ return PCIBIOS_SUCCESSFUL;
+}
+
+#ifdef CONFIG_EEH
+static bool pnv_pci_cfg_check(struct pci_dn *pdn)
+{
+ struct eeh_dev *edev = NULL;
+ struct pnv_phb *phb = pdn->phb->private_data;
+
+ /* EEH not enabled ? */
+ if (!(phb->flags & PNV_PHB_FLAG_EEH))
+ return true;
+
+ /* PE reset or device removed ? */
+ edev = pdn->edev;
+ if (edev) {
+ if (edev->pe &&
+ (edev->pe->state & EEH_PE_CFG_BLOCKED))
+ return false;
+
+ if (edev->mode & EEH_DEV_REMOVED)
+ return false;
+ }
+
+ return true;
+}
+#else
+static inline bool pnv_pci_cfg_check(struct pci_dn *pdn)
+{
+ return true;
+}
+#endif /* CONFIG_EEH */
+
+static int pnv_pci_read_config(struct pci_bus *bus,
+ unsigned int devfn,
+ int where, int size, u32 *val)
+{
+ struct pci_dn *pdn;
+ struct pnv_phb *phb;
+ int ret;
+
+ *val = 0xFFFFFFFF;
+ pdn = pci_get_pdn_by_devfn(bus, devfn);
+ if (!pdn)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ if (!pnv_pci_cfg_check(pdn))
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ ret = pnv_pci_cfg_read(pdn, where, size, val);
+ phb = pdn->phb->private_data;
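+	/*
+	 * An all-ones read may just be an empty slot, or it may mean the
+	 * PE is frozen; when EEH is active, let eeh_dev_check_failure()
+	 * decide before reporting the device missing.
+	 */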
+ if (phb->flags & PNV_PHB_FLAG_EEH && pdn->edev) {
+ if (*val == EEH_IO_ERROR_VALUE(size) &&
+ eeh_dev_check_failure(pdn->edev))
+ return PCIBIOS_DEVICE_NOT_FOUND;
+ } else {
+ pnv_pci_config_check_eeh(pdn);
+ }
+
+ return ret;
+}
+
+static int pnv_pci_write_config(struct pci_bus *bus,
+ unsigned int devfn,
+ int where, int size, u32 val)
+{
+ struct pci_dn *pdn;
+ struct pnv_phb *phb;
+ int ret;
+
+ pdn = pci_get_pdn_by_devfn(bus, devfn);
+ if (!pdn)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ if (!pnv_pci_cfg_check(pdn))
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ ret = pnv_pci_cfg_write(pdn, where, size, val);
+ phb = pdn->phb->private_data;
+ if (!(phb->flags & PNV_PHB_FLAG_EEH))
+ pnv_pci_config_check_eeh(pdn);
+
+ return ret;
+}
+
+struct pci_ops pnv_pci_ops = {
+ .read = pnv_pci_read_config,
+ .write = pnv_pci_write_config,
+};
+
+struct iommu_table *pnv_pci_table_alloc(int nid)
+{
+ struct iommu_table *tbl;
+
+ tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid);
+ if (!tbl)
+ return NULL;
+
+ INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+ kref_init(&tbl->it_kref);
+
+ return tbl;
+}
+
+void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
+{
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+
+ if (phb && phb->dma_dev_setup)
+ phb->dma_dev_setup(phb, pdev);
+}
+
+void pnv_pci_dma_bus_setup(struct pci_bus *bus)
+{
+ struct pci_controller *hose = bus->sysdata;
+ struct pnv_phb *phb = hose->private_data;
+ struct pnv_ioda_pe *pe;
+
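+	/* Point bus-type PEs at this pci_bus when its number matches the PE's primary bus. */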
+ list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+ if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)))
+ continue;
+
+ if (!pe->pbus)
+ continue;
+
+ if (bus->number == ((pe->rid >> 8) & 0xFF)) {
+ pe->pbus = bus;
+ break;
+ }
+ }
+}
+
+int pnv_pci_set_p2p(struct pci_dev *initiator, struct pci_dev *target, u64 desc)
+{
+ struct pci_controller *hose;
+ struct pnv_phb *phb_init, *phb_target;
+ struct pnv_ioda_pe *pe_init;
+ int rc;
+
+ if (!opal_check_token(OPAL_PCI_SET_P2P))
+ return -ENXIO;
+
+ hose = pci_bus_to_host(initiator->bus);
+ phb_init = hose->private_data;
+
+ hose = pci_bus_to_host(target->bus);
+ phb_target = hose->private_data;
+
+ pe_init = pnv_ioda_get_pe(initiator);
+ if (!pe_init)
+ return -ENODEV;
+
+ /*
+	 * Configuring the initiator's PHB requires adjusting its
+ * TVE#1 setting. Since the same device can be an initiator
+ * several times for different target devices, we need to keep
+ * a reference count to know when we can restore the default
+ * bypass setting on its TVE#1 when disabling. Opal is not
+ * tracking PE states, so we add a reference count on the PE
+ * in linux.
+ *
+ * For the target, the configuration is per PHB, so we keep a
+ * target reference count on the PHB.
+ */
+ mutex_lock(&p2p_mutex);
+
+ if (desc & OPAL_PCI_P2P_ENABLE) {
+ /* always go to opal to validate the configuration */
+ rc = opal_pci_set_p2p(phb_init->opal_id, phb_target->opal_id,
+ desc, pe_init->pe_number);
+
+ if (rc != OPAL_SUCCESS) {
+ rc = -EIO;
+ goto out;
+ }
+
+ pe_init->p2p_initiator_count++;
+ phb_target->p2p_target_count++;
+ } else {
+ if (!pe_init->p2p_initiator_count ||
+ !phb_target->p2p_target_count) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (--pe_init->p2p_initiator_count == 0)
+ pnv_pci_ioda2_set_bypass(pe_init, true);
+
+ if (--phb_target->p2p_target_count == 0) {
+ rc = opal_pci_set_p2p(phb_init->opal_id,
+ phb_target->opal_id, desc,
+ pe_init->pe_number);
+ if (rc != OPAL_SUCCESS) {
+ rc = -EIO;
+ goto out;
+ }
+ }
+ }
+ rc = 0;
+out:
+ mutex_unlock(&p2p_mutex);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(pnv_pci_set_p2p);
+
+struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+
+ return of_node_get(hose->dn);
+}
+EXPORT_SYMBOL(pnv_pci_get_phb_node);
+
+int pnv_pci_enable_tunnel(struct pci_dev *dev, u64 *asnind)
+{
+ struct device_node *np;
+ const __be32 *prop;
+ struct pnv_ioda_pe *pe;
+ uint16_t window_id;
+ int rc;
+
+ if (!radix_enabled())
+ return -ENXIO;
+
+ if (!(np = pnv_pci_get_phb_node(dev)))
+ return -ENXIO;
+
+ prop = of_get_property(np, "ibm,phb-indications", NULL);
+ of_node_put(np);
+
+ if (!prop || !prop[1])
+ return -ENXIO;
+
+ *asnind = (u64)be32_to_cpu(prop[1]);
+ pe = pnv_ioda_get_pe(dev);
+ if (!pe)
+ return -ENODEV;
+
+ /* Increase real window size to accept as_notify messages. */
+	window_id = (pe->pe_number << 1) + 1;
+ rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, pe->pe_number,
+ window_id, pe->tce_bypass_base,
+ (uint64_t)1 << 48);
+ return opal_error_code(rc);
+}
+EXPORT_SYMBOL_GPL(pnv_pci_enable_tunnel);
+
+int pnv_pci_disable_tunnel(struct pci_dev *dev)
+{
+ struct pnv_ioda_pe *pe;
+
+ pe = pnv_ioda_get_pe(dev);
+ if (!pe)
+ return -ENODEV;
+
+ /* Restore default real window size. */
+ pnv_pci_ioda2_set_bypass(pe, true);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_pci_disable_tunnel);
+
+int pnv_pci_set_tunnel_bar(struct pci_dev *dev, u64 addr, int enable)
+{
+ __be64 val;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ u64 tunnel_bar;
+ int rc;
+
+ if (!opal_check_token(OPAL_PCI_GET_PBCQ_TUNNEL_BAR))
+ return -ENXIO;
+ if (!opal_check_token(OPAL_PCI_SET_PBCQ_TUNNEL_BAR))
+ return -ENXIO;
+
+ hose = pci_bus_to_host(dev->bus);
+ phb = hose->private_data;
+
+ mutex_lock(&tunnel_mutex);
+ rc = opal_pci_get_pbcq_tunnel_bar(phb->opal_id, &val);
+ if (rc != OPAL_SUCCESS) {
+ rc = -EIO;
+ goto out;
+ }
+ tunnel_bar = be64_to_cpu(val);
+ if (enable) {
+ /*
+ * Only one device per PHB can use atomics.
+ * Our policy is first-come, first-served.
+ */
+ if (tunnel_bar) {
+ if (tunnel_bar != addr)
+ rc = -EBUSY;
+ else
+ rc = 0; /* Setting same address twice is ok */
+ goto out;
+ }
+ } else {
+ /*
+ * The device that owns atomics and wants to release
+ * them must pass the same address with enable == 0.
+ */
+ if (tunnel_bar != addr) {
+ rc = -EPERM;
+ goto out;
+ }
+ addr = 0x0ULL;
+ }
+ rc = opal_pci_set_pbcq_tunnel_bar(phb->opal_id, addr);
+ rc = opal_error_code(rc);
+out:
+ mutex_unlock(&tunnel_mutex);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(pnv_pci_set_tunnel_bar);
+
+#ifdef CONFIG_PPC64 /* for thread.tidr */
+int pnv_pci_get_as_notify_info(struct task_struct *task, u32 *lpid, u32 *pid,
+ u32 *tid)
+{
+ struct mm_struct *mm = NULL;
+
+ if (task == NULL)
+ return -EINVAL;
+
+ mm = get_task_mm(task);
+ if (mm == NULL)
+ return -EINVAL;
+
+ *pid = mm->context.id;
+ mmput(mm);
+
+ *tid = task->thread.tidr;
+ *lpid = mfspr(SPRN_LPID);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_pci_get_as_notify_info);
+#endif
+
+void pnv_pci_shutdown(void)
+{
+ struct pci_controller *hose;
+
+ list_for_each_entry(hose, &hose_list, list_node)
+ if (hose->controller_ops.shutdown)
+ hose->controller_ops.shutdown(hose);
+}
+
+/* Fixup wrong class code in p7ioc and p8 root complex */
+static void pnv_p7ioc_rc_quirk(struct pci_dev *dev)
+{
+ dev->class = PCI_CLASS_BRIDGE_PCI << 8;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_IBM, 0x3b9, pnv_p7ioc_rc_quirk);
+
+void __init pnv_pci_init(void)
+{
+ struct device_node *np;
+
+ pci_add_flags(PCI_CAN_SKIP_ISA_ALIGN);
+
+ /* If we don't have OPAL, eg. in sim, just skip PCI probe */
+ if (!firmware_has_feature(FW_FEATURE_OPAL))
+ return;
+
+#ifdef CONFIG_PCIEPORTBUS
+ /*
+ * On PowerNV PCIe devices are (currently) managed in cooperation
+ * with firmware. This isn't *strictly* required, but there's enough
+ * assumptions baked into both firmware and the platform code that
+ * it's unwise to allow the portbus services to be used.
+ *
+ * We need to fix this eventually, but for now set this flag to disable
+	 * the portbus driver. The AER service isn't required since AER
+	 * events are handled via EEH. The pciehp hotplug driver can't work
+ * without kernel changes (and portbus binding breaks pnv_php). The
+ * other services also require some thinking about how we're going
+ * to integrate them.
+ */
+ pcie_ports_disabled = true;
+#endif
+
+ /* Look for IODA IO-Hubs. */
+ for_each_compatible_node(np, NULL, "ibm,ioda-hub") {
+ pnv_pci_init_ioda_hub(np);
+ }
+
+ /* Look for ioda2 built-in PHB3's */
+ for_each_compatible_node(np, NULL, "ibm,ioda2-phb")
+ pnv_pci_init_ioda2_phb(np);
+
+ /* Look for ioda3 built-in PHB4's, we treat them as IODA2 */
+ for_each_compatible_node(np, NULL, "ibm,ioda3-phb")
+ pnv_pci_init_ioda2_phb(np);
+
+ /* Look for NPU PHBs */
+ for_each_compatible_node(np, NULL, "ibm,ioda2-npu-phb")
+ pnv_pci_init_npu_phb(np);
+
+ /*
+ * Look for NPU2 PHBs which we treat mostly as NPU PHBs with
+ * the exception of TCE kill which requires an OPAL call.
+ */
+ for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-phb")
+ pnv_pci_init_npu_phb(np);
+
+ /* Look for NPU2 OpenCAPI PHBs */
+ for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-opencapi-phb")
+ pnv_pci_init_npu2_opencapi_phb(np);
+
+ /* Configure IOMMU DMA hooks */
+ set_pci_dma_ops(&dma_iommu_ops);
+}
+
+machine_subsys_initcall_sync(powernv, tce_iommu_bus_notifier_init);
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
new file mode 100644
index 000000000..e302aa092
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -0,0 +1,274 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __POWERNV_PCI_H
+#define __POWERNV_PCI_H
+
+#include <linux/iommu.h>
+#include <asm/iommu.h>
+#include <asm/msi_bitmap.h>
+
+struct pci_dn;
+
+/* Maximum possible number of ATSD MMIO registers per NPU */
+#define NV_NMMU_ATSD_REGS 8
+
+enum pnv_phb_type {
+ PNV_PHB_IODA1 = 0,
+ PNV_PHB_IODA2 = 1,
+ PNV_PHB_NPU_NVLINK = 2,
+ PNV_PHB_NPU_OCAPI = 3,
+};
+
+/* Precise PHB model for error management */
+enum pnv_phb_model {
+ PNV_PHB_MODEL_UNKNOWN,
+ PNV_PHB_MODEL_P7IOC,
+ PNV_PHB_MODEL_PHB3,
+ PNV_PHB_MODEL_NPU,
+ PNV_PHB_MODEL_NPU2,
+};
+
+#define PNV_PCI_DIAG_BUF_SIZE 8192
+#define PNV_IODA_PE_DEV (1 << 0) /* PE has single PCI device */
+#define PNV_IODA_PE_BUS (1 << 1) /* PE has primary PCI bus */
+#define PNV_IODA_PE_BUS_ALL (1 << 2) /* PE has subordinate buses */
+#define PNV_IODA_PE_MASTER (1 << 3) /* Master PE in compound case */
+#define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */
+#define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */
+
+/* Indicates operations are frozen for a PE: MMIO in PESTA & DMA in PESTB. */
+#define PNV_IODA_STOPPED_STATE 0x8000000000000000
+
+/* Data associated with a PE, including IOMMU tracking etc.. */
+struct pnv_phb;
+struct pnv_ioda_pe {
+ unsigned long flags;
+ struct pnv_phb *phb;
+ int device_count;
+
+ /* A PE can be associated with a single device or an
+ * entire bus (& children). In the former case, pdev
+	 * is populated, in the latter case, pbus is.
+ */
+#ifdef CONFIG_PCI_IOV
+ struct pci_dev *parent_dev;
+#endif
+ struct pci_dev *pdev;
+ struct pci_bus *pbus;
+
+ /* Effective RID (device RID for a device PE and base bus
+ * RID with devfn 0 for a bus PE)
+ */
+ unsigned int rid;
+
+ /* PE number */
+ unsigned int pe_number;
+
+ /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
+ struct iommu_table_group table_group;
+
+ /* 64-bit TCE bypass region */
+ bool tce_bypass_enabled;
+ uint64_t tce_bypass_base;
+
+	/* MSIs. MVE index is identical for 32 and 64 bit MSI
+ * and -1 if not supported. (It's actually identical to the
+ * PE number)
+ */
+ int mve_number;
+
+ /* PEs in compound case */
+ struct pnv_ioda_pe *master;
+ struct list_head slaves;
+
+	/* PCI peer-to-peer */
+ int p2p_initiator_count;
+
+ /* Link in list of PE#s */
+ struct list_head list;
+};
+
+#define PNV_PHB_FLAG_EEH (1 << 0)
+
+struct pnv_phb {
+ struct pci_controller *hose;
+ enum pnv_phb_type type;
+ enum pnv_phb_model model;
+ u64 hub_id;
+ u64 opal_id;
+ int flags;
+ void __iomem *regs;
+ u64 regs_phys;
+ int initialized;
+ spinlock_t lock;
+
+#ifdef CONFIG_DEBUG_FS
+ int has_dbgfs;
+ struct dentry *dbgfs;
+#endif
+
+#ifdef CONFIG_PCI_MSI
+ unsigned int msi_base;
+ unsigned int msi32_support;
+ struct msi_bitmap msi_bmp;
+#endif
+ int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev,
+ unsigned int hwirq, unsigned int virq,
+ unsigned int is_64, struct msi_msg *msg);
+ void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev);
+ void (*fixup_phb)(struct pci_controller *hose);
+ int (*init_m64)(struct pnv_phb *phb);
+ void (*reserve_m64_pe)(struct pci_bus *bus,
+ unsigned long *pe_bitmap, bool all);
+ struct pnv_ioda_pe *(*pick_m64_pe)(struct pci_bus *bus, bool all);
+ int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
+ void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
+ int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
+
+ struct {
+ /* Global bridge info */
+ unsigned int total_pe_num;
+ unsigned int reserved_pe_idx;
+ unsigned int root_pe_idx;
+ bool root_pe_populated;
+
+ /* 32-bit MMIO window */
+ unsigned int m32_size;
+ unsigned int m32_segsize;
+ unsigned int m32_pci_base;
+
+ /* 64-bit MMIO window */
+ unsigned int m64_bar_idx;
+ unsigned long m64_size;
+ unsigned long m64_segsize;
+ unsigned long m64_base;
+ unsigned long m64_bar_alloc;
+
+ /* IO ports */
+ unsigned int io_size;
+ unsigned int io_segsize;
+ unsigned int io_pci_base;
+
+ /* PE allocation */
+ struct mutex pe_alloc_mutex;
+ unsigned long *pe_alloc;
+ struct pnv_ioda_pe *pe_array;
+
+ /* M32 & IO segment maps */
+ unsigned int *m64_segmap;
+ unsigned int *m32_segmap;
+ unsigned int *io_segmap;
+
+ /* DMA32 segment maps - IODA1 only */
+ unsigned int dma32_count;
+ unsigned int *dma32_segmap;
+
+ /* IRQ chip */
+ int irq_chip_init;
+ struct irq_chip irq_chip;
+
+ /* Sorted list of used PE's based
+ * on the sequence of creation
+ */
+ struct list_head pe_list;
+ struct mutex pe_list_mutex;
+
+ /* Reverse map of PEs, indexed by {bus, devfn} */
+ unsigned int pe_rmap[0x10000];
+ } ioda;
+
+ /* PHB and hub diagnostics */
+ unsigned int diag_data_size;
+ u8 *diag_data;
+
+ /* Nvlink2 data */
+ struct npu {
+ int index;
+ __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS];
+ unsigned int mmio_atsd_count;
+
+ /* Bitmask for MMIO register usage */
+ unsigned long mmio_atsd_usage;
+
+ /* Do we need to explicitly flush the nest mmu? */
+ bool nmmu_flush;
+ } npu;
+
+ int p2p_target_count;
+};
+
+extern struct pci_ops pnv_pci_ops;
+
+void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
+ unsigned char *log_buff);
+int pnv_pci_cfg_read(struct pci_dn *pdn,
+ int where, int size, u32 *val);
+int pnv_pci_cfg_write(struct pci_dn *pdn,
+ int where, int size, u32 val);
+extern struct iommu_table *pnv_pci_table_alloc(int nid);
+
+extern void pnv_pci_init_ioda_hub(struct device_node *np);
+extern void pnv_pci_init_ioda2_phb(struct device_node *np);
+extern void pnv_pci_init_npu_phb(struct device_node *np);
+extern void pnv_pci_init_npu2_opencapi_phb(struct device_node *np);
+extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
+extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
+
+extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev);
+extern void pnv_pci_dma_bus_setup(struct pci_bus *bus);
+extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
+extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
+extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
+extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
+extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+extern int pnv_eeh_post_init(void);
+
+extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
+ const char *fmt, ...);
+#define pe_err(pe, fmt, ...) \
+ pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__)
+#define pe_warn(pe, fmt, ...) \
+ pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__)
+#define pe_info(pe, fmt, ...) \
+ pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
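+/* Example (illustrative only): pe_warn(pe, "TCE setup failed, rc=%lld\n", rc); */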
+
+/* Nvlink functions */
+extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
+extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
+extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
+extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
+ struct iommu_table *tbl);
+extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num);
+extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe);
+extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe);
+extern int pnv_npu2_init(struct pnv_phb *phb);
+
+/* pci-ioda-tce.c */
+#define POWERNV_IOMMU_DEFAULT_LEVELS 2
+#define POWERNV_IOMMU_MAX_LEVELS 5
+
+extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
+ unsigned long uaddr, enum dma_data_direction direction,
+ unsigned long attrs);
+extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
+extern int pnv_tce_xchg(struct iommu_table *tbl, long index,
+ unsigned long *hpa, enum dma_data_direction *direction,
+ bool alloc);
+extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index,
+ bool alloc);
+extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
+
+extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
+ __u32 page_shift, __u64 window_size, __u32 levels,
+ bool alloc_userspace_copy, struct iommu_table *tbl);
+extern void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
+
+extern long pnv_pci_link_table_and_group(int node, int num,
+ struct iommu_table *tbl,
+ struct iommu_table_group *table_group);
+extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
+ struct iommu_table_group *table_group);
+extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
+ void *tce_mem, u64 tce_size,
+ u64 dma_offset, unsigned int page_shift);
+
+#endif /* __POWERNV_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
new file mode 100644
index 000000000..fd4a1c5a6
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _POWERNV_H
+#define _POWERNV_H
+
+#ifdef CONFIG_SMP
+extern void pnv_smp_init(void);
+#else
+static inline void pnv_smp_init(void) { }
+#endif
+
+extern void pnv_platform_error_reboot(struct pt_regs *regs, const char *msg) __noreturn;
+
+struct pci_dev;
+
+#ifdef CONFIG_PCI
+extern void pnv_pci_init(void);
+extern void pnv_pci_shutdown(void);
+#else
+static inline void pnv_pci_init(void) { }
+static inline void pnv_pci_shutdown(void) { }
+#endif
+
+extern u32 pnv_get_supported_cpuidle_states(void);
+
+extern void pnv_lpc_init(void);
+
+extern void opal_handle_events(void);
+extern bool opal_have_pending_events(void);
+extern void opal_event_shutdown(void);
+
+bool cpu_core_split_required(void);
+
+#endif /* _POWERNV_H */
diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c
new file mode 100644
index 000000000..e15dd09f7
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/rng.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright 2013, Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "powernv-rng: " fmt
+
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <asm/archrandom.h>
+#include <asm/cputable.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/smp.h>
+
+#define DARN_ERR 0xFFFFFFFFFFFFFFFFul
+
+struct powernv_rng {
+ void __iomem *regs;
+ void __iomem *regs_real;
+ unsigned long mask;
+};
+
+static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng);
+
+int powernv_hwrng_present(void)
+{
+ struct powernv_rng *rng;
+
+ rng = get_cpu_var(powernv_rng);
+ put_cpu_var(rng);
+ return rng != NULL;
+}
+
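+/*
+ * Lightweight whitening: each raw sample is XORed with a running mask that
+ * shifts in one parity bit per read. For example, if the mask is currently
+ * 0b0101 and the new sample has odd parity, the sample is XORed with 0b0101
+ * and the mask then becomes 0b1011.
+ */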
+static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
+{
+ unsigned long parity;
+
+ /* Calculate the parity of the value */
+ asm (".machine push; \
+ .machine power7; \
+ popcntd %0,%1; \
+ .machine pop;"
+ : "=r" (parity) : "r" (val));
+
+ /* xor our value with the previous mask */
+ val ^= rng->mask;
+
+ /* update the mask based on the parity of this value */
+ rng->mask = (rng->mask << 1) | (parity & 1);
+
+ return val;
+}
+
+int powernv_get_random_real_mode(unsigned long *v)
+{
+ struct powernv_rng *rng;
+
+ rng = raw_cpu_read(powernv_rng);
+
+ *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real));
+
+ return 1;
+}
+
+int powernv_get_random_darn(unsigned long *v)
+{
+ unsigned long val;
+
+ /* Using DARN with L=1 - 64-bit conditioned random number */
+ asm volatile(PPC_DARN(%0, 1) : "=r"(val));
+
+ if (val == DARN_ERR)
+ return 0;
+
+ *v = val;
+
+ return 1;
+}
+
+static int initialise_darn(void)
+{
+ unsigned long val;
+ int i;
+
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ return -ENODEV;
+
+ for (i = 0; i < 10; i++) {
+ if (powernv_get_random_darn(&val)) {
+ ppc_md.get_random_seed = powernv_get_random_darn;
+ return 0;
+ }
+ }
+
+ pr_warn("Unable to use DARN for get_random_seed()\n");
+
+ return -EIO;
+}
+
+int powernv_get_random_long(unsigned long *v)
+{
+ struct powernv_rng *rng;
+
+ rng = get_cpu_var(powernv_rng);
+
+ *v = rng_whiten(rng, in_be64(rng->regs));
+
+ put_cpu_var(rng);
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(powernv_get_random_long);
+
+static __init void rng_init_per_cpu(struct powernv_rng *rng,
+ struct device_node *dn)
+{
+ int chip_id, cpu;
+
+ chip_id = of_get_ibm_chip_id(dn);
+ if (chip_id == -1)
+ pr_warn("No ibm,chip-id found for %pOF.\n", dn);
+
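+	/*
+	 * Hand this rng to every CPU on the same chip, and to any CPU that
+	 * has no rng assigned yet (as a fallback).
+	 */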
+ for_each_possible_cpu(cpu) {
+ if (per_cpu(powernv_rng, cpu) == NULL ||
+ cpu_to_chip_id(cpu) == chip_id) {
+ per_cpu(powernv_rng, cpu) = rng;
+ }
+ }
+}
+
+static __init int rng_create(struct device_node *dn)
+{
+ struct powernv_rng *rng;
+ struct resource res;
+ unsigned long val;
+
+ rng = kzalloc(sizeof(*rng), GFP_KERNEL);
+ if (!rng)
+ return -ENOMEM;
+
+ if (of_address_to_resource(dn, 0, &res)) {
+ kfree(rng);
+ return -ENXIO;
+ }
+
+ rng->regs_real = (void __iomem *)res.start;
+
+ rng->regs = of_iomap(dn, 0);
+ if (!rng->regs) {
+ kfree(rng);
+ return -ENXIO;
+ }
+
+ val = in_be64(rng->regs);
+ rng->mask = val;
+
+ rng_init_per_cpu(rng, dn);
+
+ pr_info_once("Registering arch random hook.\n");
+
+ ppc_md.get_random_seed = powernv_get_random_long;
+
+ return 0;
+}
+
+static __init int rng_init(void)
+{
+ struct device_node *dn;
+ int rc;
+
+ for_each_compatible_node(dn, NULL, "ibm,power-rng") {
+ rc = rng_create(dn);
+ if (rc) {
+ pr_err("Failed creating rng for %pOF (%d).\n",
+ dn, rc);
+ continue;
+ }
+
+ /* Create devices for hwrng driver */
+ of_platform_device_create(dn, NULL, NULL);
+ }
+
+ initialise_darn();
+
+ return 0;
+}
+machine_subsys_initcall(powernv, rng_init);
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
new file mode 100644
index 000000000..5068dd7f6
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -0,0 +1,474 @@
+/*
+ * PowerNV setup code.
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/tty.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/interrupt.h>
+#include <linux/bug.h>
+#include <linux/pci.h>
+#include <linux/cpufreq.h>
+
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/xics.h>
+#include <asm/xive.h>
+#include <asm/opal.h>
+#include <asm/kexec.h>
+#include <asm/smp.h>
+#include <asm/tm.h>
+#include <asm/setup.h>
+#include <asm/security_features.h>
+
+#include "powernv.h"
+
+
+static bool fw_feature_is(const char *state, const char *name,
+ struct device_node *fw_features)
+{
+ struct device_node *np;
+ bool rc = false;
+
+ np = of_get_child_by_name(fw_features, name);
+ if (np) {
+ rc = of_property_read_bool(np, state);
+ of_node_put(np);
+ }
+
+ return rc;
+}
+
+static void init_fw_feat_flags(struct device_node *np)
+{
+ if (fw_feature_is("enabled", "inst-spec-barrier-ori31,31,0", np))
+ security_ftr_set(SEC_FTR_SPEC_BAR_ORI31);
+
+ if (fw_feature_is("enabled", "fw-bcctrl-serialized", np))
+ security_ftr_set(SEC_FTR_BCCTRL_SERIALISED);
+
+ if (fw_feature_is("enabled", "inst-l1d-flush-ori30,30,0", np))
+ security_ftr_set(SEC_FTR_L1D_FLUSH_ORI30);
+
+ if (fw_feature_is("enabled", "inst-l1d-flush-trig2", np))
+ security_ftr_set(SEC_FTR_L1D_FLUSH_TRIG2);
+
+ if (fw_feature_is("enabled", "fw-l1d-thread-split", np))
+ security_ftr_set(SEC_FTR_L1D_THREAD_PRIV);
+
+ if (fw_feature_is("enabled", "fw-count-cache-disabled", np))
+ security_ftr_set(SEC_FTR_COUNT_CACHE_DISABLED);
+
+ if (fw_feature_is("enabled", "fw-count-cache-flush-bcctr2,0,0", np))
+ security_ftr_set(SEC_FTR_BCCTR_FLUSH_ASSIST);
+
+ if (fw_feature_is("enabled", "needs-count-cache-flush-on-context-switch", np))
+ security_ftr_set(SEC_FTR_FLUSH_COUNT_CACHE);
+
+ /*
+ * The features below are enabled by default, so we instead look to see
+ * if firmware has *disabled* them, and clear them if so.
+ */
+ if (fw_feature_is("disabled", "speculation-policy-favor-security", np))
+ security_ftr_clear(SEC_FTR_FAVOUR_SECURITY);
+
+ if (fw_feature_is("disabled", "needs-l1d-flush-msr-pr-0-to-1", np))
+ security_ftr_clear(SEC_FTR_L1D_FLUSH_PR);
+
+ if (fw_feature_is("disabled", "needs-l1d-flush-msr-hv-1-to-0", np))
+ security_ftr_clear(SEC_FTR_L1D_FLUSH_HV);
+
+ if (fw_feature_is("disabled", "needs-spec-barrier-for-bound-checks", np))
+ security_ftr_clear(SEC_FTR_BNDS_CHK_SPEC_BAR);
+}
+
+static void pnv_setup_rfi_flush(void)
+{
+ struct device_node *np, *fw_features;
+ enum l1d_flush_type type;
+ bool enable;
+
+ /* Default to fallback in case fw-features are not available */
+ type = L1D_FLUSH_FALLBACK;
+
+ np = of_find_node_by_name(NULL, "ibm,opal");
+ fw_features = of_get_child_by_name(np, "fw-features");
+ of_node_put(np);
+
+ if (fw_features) {
+ init_fw_feat_flags(fw_features);
+ of_node_put(fw_features);
+
+ if (security_ftr_enabled(SEC_FTR_L1D_FLUSH_TRIG2))
+ type = L1D_FLUSH_MTTRIG;
+
+ if (security_ftr_enabled(SEC_FTR_L1D_FLUSH_ORI30))
+ type = L1D_FLUSH_ORI;
+ }
+
+ /*
+ * If we are non-Power9 bare metal, we don't need to flush on kernel
+ * entry or after user access: they fix a P9 specific vulnerability.
+ */
+ if (!pvr_version_is(PVR_POWER9)) {
+ security_ftr_clear(SEC_FTR_L1D_FLUSH_ENTRY);
+ security_ftr_clear(SEC_FTR_L1D_FLUSH_UACCESS);
+ }
+
+ enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && \
+ (security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR) || \
+ security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV));
+
+ setup_rfi_flush(type, enable);
+ setup_count_cache_flush();
+
+ enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+ security_ftr_enabled(SEC_FTR_L1D_FLUSH_ENTRY);
+ setup_entry_flush(enable);
+
+ enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+ security_ftr_enabled(SEC_FTR_L1D_FLUSH_UACCESS);
+ setup_uaccess_flush(enable);
+}
+
+static void __init pnv_setup_arch(void)
+{
+ set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
+
+ pnv_setup_rfi_flush();
+ setup_stf_barrier();
+
+ /* Initialize SMP */
+ pnv_smp_init();
+
+ /* Setup PCI */
+ pnv_pci_init();
+
+ /* Setup RTC and NVRAM callbacks */
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ opal_nvram_init();
+
+ /* Enable NAP mode */
+ powersave_nap = 1;
+
+ /* XXX PMCS */
+}
+
+static void __init pnv_init(void)
+{
+ /*
+ * Initialize the LPC bus now so that legacy serial
+ * ports can be found on it
+ */
+ opal_lpc_init();
+
+#ifdef CONFIG_HVC_OPAL
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ hvc_opal_init_early();
+ else
+#endif
+ add_preferred_console("hvc", 0, NULL);
+}
+
+static void __init pnv_init_IRQ(void)
+{
+ /* Try using a XIVE if available, otherwise use a XICS */
+ if (!xive_native_init())
+ xics_init();
+
+ WARN_ON(!ppc_md.get_irq);
+}
+
+static void pnv_show_cpuinfo(struct seq_file *m)
+{
+ struct device_node *root;
+ const char *model = "";
+
+ root = of_find_node_by_path("/");
+ if (root)
+ model = of_get_property(root, "model", NULL);
+ seq_printf(m, "machine\t\t: PowerNV %s\n", model);
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ seq_printf(m, "firmware\t: OPAL\n");
+ else
+ seq_printf(m, "firmware\t: BML\n");
+ of_node_put(root);
+ if (radix_enabled())
+ seq_printf(m, "MMU\t\t: Radix\n");
+ else
+ seq_printf(m, "MMU\t\t: Hash\n");
+}
+
+static void pnv_prepare_going_down(void)
+{
+ /*
+ * Disable all notifiers from OPAL, we can't
+ * service interrupts anymore anyway
+ */
+ opal_event_shutdown();
+
+ /* Print flash update message if one is scheduled. */
+ opal_flash_update_print_message();
+
+ smp_send_stop();
+
+ hard_irq_disable();
+}
+
+static void __noreturn pnv_restart(char *cmd)
+{
+ long rc = OPAL_BUSY;
+
+ pnv_prepare_going_down();
+
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_cec_reboot();
+ if (rc == OPAL_BUSY_EVENT)
+ opal_poll_events(NULL);
+ else
+ mdelay(10);
+ }
+ for (;;)
+ opal_poll_events(NULL);
+}
+
+static void __noreturn pnv_power_off(void)
+{
+ long rc = OPAL_BUSY;
+
+ pnv_prepare_going_down();
+
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_cec_power_down(0);
+ if (rc == OPAL_BUSY_EVENT)
+ opal_poll_events(NULL);
+ else
+ mdelay(10);
+ }
+ for (;;)
+ opal_poll_events(NULL);
+}
+
+static void __noreturn pnv_halt(void)
+{
+ pnv_power_off();
+}
+
+static void pnv_progress(char *s, unsigned short hex)
+{
+}
+
+static void pnv_shutdown(void)
+{
+ /* Let the PCI code clear up IODA tables */
+ pnv_pci_shutdown();
+
+ /*
+ * Stop OPAL activity: Unregister all OPAL interrupts so they
+ * don't fire up while we kexec and make sure all potentially
+ * DMA'ing ops are complete (such as dump retrieval).
+ */
+ opal_shutdown();
+}
+
+#ifdef CONFIG_KEXEC_CORE
+static void pnv_kexec_wait_secondaries_down(void)
+{
+ int my_cpu, i, notified = -1;
+
+ my_cpu = get_cpu();
+
+ for_each_online_cpu(i) {
+ uint8_t status;
+ int64_t rc, timeout = 1000;
+
+ if (i == my_cpu)
+ continue;
+
+ for (;;) {
+ rc = opal_query_cpu_status(get_hard_smp_processor_id(i),
+ &status);
+ if (rc != OPAL_SUCCESS || status != OPAL_THREAD_STARTED)
+ break;
+ barrier();
+ if (i != notified) {
+ printk(KERN_INFO "kexec: waiting for cpu %d "
+ "(physical %d) to enter OPAL\n",
+ i, paca_ptrs[i]->hw_cpu_id);
+ notified = i;
+ }
+
+ /*
+			 * On a crash, secondaries might be unreachable or hung,
+			 * so time out if we've waited too long.
+			 */
+ mdelay(1);
+ if (timeout-- == 0) {
+ printk(KERN_ERR "kexec: timed out waiting for "
+ "cpu %d (physical %d) to enter OPAL\n",
+ i, paca_ptrs[i]->hw_cpu_id);
+ break;
+ }
+ }
+ }
+}
+
+static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
+{
+ u64 reinit_flags;
+
+ if (xive_enabled())
+ xive_teardown_cpu();
+ else
+ xics_kexec_teardown_cpu(secondary);
+
+ /* On OPAL, we return all CPUs to firmware */
+ if (!firmware_has_feature(FW_FEATURE_OPAL))
+ return;
+
+ if (secondary) {
+ /* Return secondary CPUs to firmware on OPAL v3 */
+ mb();
+ get_paca()->kexec_state = KEXEC_STATE_REAL_MODE;
+ mb();
+
+ /* Return the CPU to OPAL */
+ opal_return_cpu();
+ } else {
+ /* Primary waits for the secondaries to have reached OPAL */
+ pnv_kexec_wait_secondaries_down();
+
+ /* Switch XIVE back to emulation mode */
+ if (xive_enabled())
+ xive_shutdown();
+
+ /*
+ * We might be running as little-endian - now that interrupts
+ * are disabled, reset the HILE bit to big-endian so we don't
+ * take interrupts in the wrong endian later
+ *
+ * We reinit to enable both radix and hash on P9 to ensure
+ * the mode used by the next kernel is always supported.
+ */
+ reinit_flags = OPAL_REINIT_CPUS_HILE_BE;
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ reinit_flags |= OPAL_REINIT_CPUS_MMU_RADIX |
+ OPAL_REINIT_CPUS_MMU_HASH;
+ opal_reinit_cpus(reinit_flags);
+ }
+}
+#endif /* CONFIG_KEXEC_CORE */
+
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+static unsigned long pnv_memory_block_size(void)
+{
+ return 256UL * 1024 * 1024;
+}
+#endif
+
+static void __init pnv_setup_machdep_opal(void)
+{
+ ppc_md.get_boot_time = opal_get_boot_time;
+ ppc_md.restart = pnv_restart;
+ pm_power_off = pnv_power_off;
+ ppc_md.halt = pnv_halt;
+ /* ppc_md.system_reset_exception gets filled in by pnv_smp_init() */
+ ppc_md.machine_check_exception = opal_machine_check;
+ ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
+ ppc_md.hmi_exception_early = opal_hmi_exception_early;
+ ppc_md.handle_hmi_exception = opal_handle_hmi_exception;
+}
+
+static int __init pnv_probe(void)
+{
+ if (!of_machine_is_compatible("ibm,powernv"))
+ return 0;
+
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ pnv_setup_machdep_opal();
+
+ pr_debug("PowerNV detected !\n");
+
+ pnv_init();
+
+ return 1;
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+void __init pnv_tm_init(void)
+{
+ if (!firmware_has_feature(FW_FEATURE_OPAL) ||
+ !pvr_version_is(PVR_POWER9) ||
+ early_cpu_has_feature(CPU_FTR_TM))
+ return;
+
+ if (opal_reinit_cpus(OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED) != OPAL_SUCCESS)
+ return;
+
+ pr_info("Enabling TM (Transactional Memory) with Suspend Disabled\n");
+ cur_cpu_spec->cpu_features |= CPU_FTR_TM;
+ /* Make sure "normal" HTM is off (it should be) */
+ cur_cpu_spec->cpu_user_features2 &= ~PPC_FEATURE2_HTM;
+ /* Turn on no suspend mode, and HTM no SC */
+ cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_HTM_NO_SUSPEND | \
+ PPC_FEATURE2_HTM_NOSC;
+ tm_suspend_disabled = true;
+}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+/*
+ * Returns the cpu frequency for 'cpu' in Hz. This is used by
+ * /proc/cpuinfo
+ */
+static unsigned long pnv_get_proc_freq(unsigned int cpu)
+{
+ unsigned long ret_freq;
+
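+	/* cpufreq_get() reports kHz; convert to Hz. */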
+ ret_freq = cpufreq_get(cpu) * 1000ul;
+
+ /*
+ * If the backend cpufreq driver does not exist,
+	 * fall back to the old way of reporting the clock rate.
+ */
+ if (!ret_freq)
+ ret_freq = ppc_proc_freq;
+ return ret_freq;
+}
+
+define_machine(powernv) {
+ .name = "PowerNV",
+ .probe = pnv_probe,
+ .setup_arch = pnv_setup_arch,
+ .init_IRQ = pnv_init_IRQ,
+ .show_cpuinfo = pnv_show_cpuinfo,
+ .get_proc_freq = pnv_get_proc_freq,
+ .progress = pnv_progress,
+ .machine_shutdown = pnv_shutdown,
+ .power_save = NULL,
+ .calibrate_decr = generic_calibrate_decr,
+#ifdef CONFIG_KEXEC_CORE
+ .kexec_cpu_down = pnv_kexec_cpu_down,
+#endif
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+ .memory_block_size = pnv_memory_block_size,
+#endif
+};
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
new file mode 100644
index 000000000..889c3dbec
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -0,0 +1,442 @@
+/*
+ * SMP support for PowerNV machines.
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sched/hotplug.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+
+#include <asm/irq.h>
+#include <asm/smp.h>
+#include <asm/paca.h>
+#include <asm/machdep.h>
+#include <asm/cputable.h>
+#include <asm/firmware.h>
+#include <asm/vdso_datapage.h>
+#include <asm/cputhreads.h>
+#include <asm/xics.h>
+#include <asm/xive.h>
+#include <asm/opal.h>
+#include <asm/runlatch.h>
+#include <asm/code-patching.h>
+#include <asm/dbell.h>
+#include <asm/kvm_ppc.h>
+#include <asm/ppc-opcode.h>
+#include <asm/cpuidle.h>
+#include <asm/kexec.h>
+#include <asm/reg.h>
+#include <asm/powernv.h>
+
+#include "powernv.h"
+
+#ifdef DEBUG
+#include <asm/udbg.h>
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...) do { } while (0)
+#endif
+
+static void pnv_smp_setup_cpu(int cpu)
+{
+ /*
+ * P9 workaround for CI vector load (see traps.c),
+ * enable the corresponding HMI interrupt
+ */
+ if (pvr_version_is(PVR_POWER9))
+ mtspr(SPRN_HMEER, mfspr(SPRN_HMEER) | PPC_BIT(17));
+
+ if (xive_enabled())
+ xive_smp_setup_cpu();
+ else if (cpu != boot_cpuid)
+ xics_setup_cpu();
+}
+
+static int pnv_smp_kick_cpu(int nr)
+{
+ unsigned int pcpu;
+ unsigned long start_here =
+ __pa(ppc_function_entry(generic_secondary_smp_init));
+ long rc;
+ uint8_t status;
+
+ if (nr < 0 || nr >= nr_cpu_ids)
+ return -EINVAL;
+
+ pcpu = get_hard_smp_processor_id(nr);
+ /*
+ * If we already started or OPAL is not supported, we just
+ * kick the CPU via the PACA
+ */
+ if (paca_ptrs[nr]->cpu_start || !firmware_has_feature(FW_FEATURE_OPAL))
+ goto kick;
+
+ /*
+ * At this point, the CPU can either be spinning on the way in
+ * from kexec or be inside OPAL waiting to be started for the
+ * first time. OPAL v3 allows us to query OPAL to know if it
+ * has the CPUs, so we do that
+ */
+ rc = opal_query_cpu_status(pcpu, &status);
+ if (rc != OPAL_SUCCESS) {
+ pr_warn("OPAL Error %ld querying CPU %d state\n", rc, nr);
+ return -ENODEV;
+ }
+
+ /*
+ * Already started, just kick it, probably coming from
+ * kexec and spinning
+ */
+ if (status == OPAL_THREAD_STARTED)
+ goto kick;
+
+ /*
+ * Available/inactive, let's kick it
+ */
+ if (status == OPAL_THREAD_INACTIVE) {
+ pr_devel("OPAL: Starting CPU %d (HW 0x%x)...\n", nr, pcpu);
+ rc = opal_start_cpu(pcpu, start_here);
+ if (rc != OPAL_SUCCESS) {
+ pr_warn("OPAL Error %ld starting CPU %d\n", rc, nr);
+ return -ENODEV;
+ }
+ } else {
+ /*
+		 * An unavailable CPU (or any other unknown status)
+		 * shouldn't be started. It also shouldn't be in the
+		 * possible map, but currently that can happen.
+ */
+ pr_devel("OPAL: CPU %d (HW 0x%x) is unavailable"
+ " (status %d)...\n", nr, pcpu, status);
+ return -ENODEV;
+ }
+
+kick:
+ return smp_generic_kick_cpu(nr);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static int pnv_smp_cpu_disable(void)
+{
+ int cpu = smp_processor_id();
+
+ /* This is identical to pSeries... might consolidate by
+ * moving migrate_irqs_away to a ppc_md with default to
+ * the generic fixup_irqs. --BenH.
+ */
+ set_cpu_online(cpu, false);
+ vdso_data->processorCount--;
+ if (cpu == boot_cpuid)
+ boot_cpuid = cpumask_any(cpu_online_mask);
+ if (xive_enabled())
+ xive_smp_disable_cpu();
+ else
+ xics_migrate_irqs_away();
+ return 0;
+}
+
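+/*
+ * Drain any interrupt pending at this CPU's interrupt controller:
+ * XIVE or the OPAL ICP on POWER9, the native ICP on earlier CPUs.
+ */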
+static void pnv_flush_interrupts(void)
+{
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ if (xive_enabled())
+ xive_flush_interrupt();
+ else
+ icp_opal_flush_interrupt();
+ } else {
+ icp_native_flush_interrupt();
+ }
+}
+
+static void pnv_smp_cpu_kill_self(void)
+{
+ unsigned long srr1, unexpected_mask, wmask;
+ unsigned int cpu;
+ u64 lpcr_val;
+
+ /* Standard hot unplug procedure */
+
+ idle_task_exit();
+ cpu = smp_processor_id();
+ DBG("CPU%d offline\n", cpu);
+ generic_set_cpu_dead(cpu);
+ smp_wmb();
+
+ wmask = SRR1_WAKEMASK;
+ if (cpu_has_feature(CPU_FTR_ARCH_207S))
+ wmask = SRR1_WAKEMASK_P8;
+
+ /*
+ * This turns the irq soft-disabled state we're called with, into a
+ * hard-disabled state with pending irq_happened interrupts cleared.
+ *
+ * PACA_IRQ_DEC - Decrementer should be ignored.
+ * PACA_IRQ_HMI - Can be ignored, processing is done in real mode.
+ * PACA_IRQ_DBELL, EE, PMI - Unexpected.
+ */
+ hard_irq_disable();
+ if (generic_check_cpu_restart(cpu))
+ goto out;
+
+ unexpected_mask = ~(PACA_IRQ_DEC | PACA_IRQ_HMI | PACA_IRQ_HARD_DIS);
+ if (local_paca->irq_happened & unexpected_mask) {
+ if (local_paca->irq_happened & PACA_IRQ_EE)
+ pnv_flush_interrupts();
+ DBG("CPU%d Unexpected exit while offline irq_happened=%lx!\n",
+ cpu, local_paca->irq_happened);
+ }
+ local_paca->irq_happened = PACA_IRQ_HARD_DIS;
+
+ /*
+ * We don't want to take decrementer interrupts while we are
+ * offline, so clear LPCR:PECE1. We keep PECE2 (and
+ * LPCR_PECE_HVEE on P9) enabled so as to let IPIs in.
+ *
+ * If the CPU gets woken up by a special wakeup, ensure that
+ * the SLW engine sets LPCR with decrementer bit cleared, else
+ * the CPU will come back to the kernel due to a spurious
+ * wakeup.
+ */
+ lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1;
+ pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val);
+
+ while (!generic_check_cpu_restart(cpu)) {
+ /*
+ * Clear IPI flag, since we don't handle IPIs while
+ * offline, except for those when changing micro-threading
+ * mode, which are handled explicitly below, and those
+ * for coming online, which are handled via
+ * generic_check_cpu_restart() calls.
+ */
+ kvmppc_clear_host_ipi(cpu);
+
+ srr1 = pnv_cpu_offline(cpu);
+
+ WARN_ON_ONCE(!irqs_disabled());
+ WARN_ON(lazy_irq_pending());
+
+ /*
+ * If the SRR1 value indicates that we woke up due to
+ * an external interrupt, then clear the interrupt.
+ * We clear the interrupt before checking for the
+ * reason, so as to avoid a race where we wake up for
+ * some other reason, find nothing and clear the interrupt
+ * just as some other cpu is sending us an interrupt.
+ * If we returned from power7_nap as a result of
+ * having finished executing in a KVM guest, then srr1
+ * contains 0.
+ */
+ if (((srr1 & wmask) == SRR1_WAKEEE) ||
+ ((srr1 & wmask) == SRR1_WAKEHVI)) {
+ pnv_flush_interrupts();
+ } else if ((srr1 & wmask) == SRR1_WAKEHDBELL) {
+ unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+ asm volatile(PPC_MSGCLR(%0) : : "r" (msg));
+ } else if ((srr1 & wmask) == SRR1_WAKERESET) {
+ irq_set_pending_from_srr1(srr1);
+ /* Does not return */
+ }
+
+ smp_mb();
+
+ /*
+ * For kdump kernels, we process the ipi and jump to
+ * crash_ipi_callback
+ */
+ if (kdump_in_progress()) {
+ /*
+ * If we got to this point, we've not used
+ * NMI's, otherwise we would have gone
+ * via the SRR1_WAKERESET path. We are
+ * using regular IPI's for waking up offline
+ * threads.
+ */
+ struct pt_regs regs;
+
+ ppc_save_regs(&regs);
+ crash_ipi_callback(&regs);
+ /* Does not return */
+ }
+
+ if (cpu_core_split_required())
+ continue;
+
+ if (srr1 && !generic_check_cpu_restart(cpu))
+ DBG("CPU%d Unexpected exit while offline srr1=%lx!\n",
+ cpu, srr1);
+
+ }
+
+ /*
+ * Re-enable decrementer interrupts in LPCR.
+ *
+ * Further, we want stop states to be woken up by decrementer
+ * for non-hotplug cases. So program the LPCR via stop api as
+ * well.
+ */
+ lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1;
+ pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val);
+out:
+ DBG("CPU%d coming online...\n", cpu);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static int pnv_cpu_bootable(unsigned int nr)
+{
+ /*
+ * Starting with POWER8, the subcore logic relies on all threads of a
+ * core being booted so that they can participate in split mode
+ * switches. So on those machines we ignore the smt_enabled_at_boot
+ * setting (smt-enabled on the kernel command line).
+ */
+ if (cpu_has_feature(CPU_FTR_ARCH_207S))
+ return 1;
+
+ return smp_generic_cpu_bootable(nr);
+}
+
+static int pnv_smp_prepare_cpu(int cpu)
+{
+ if (xive_enabled())
+ return xive_smp_prepare_cpu(cpu);
+ return 0;
+}
+
+/* Cause IPI as setup by the interrupt controller (xics or xive) */
+static void (*ic_cause_ipi)(int cpu);
+
+static void pnv_cause_ipi(int cpu)
+{
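+	/*
+	 * Doorbells here only reach threads of the same core; fall back to
+	 * the interrupt controller (XICS/XIVE) IPI for remote cores.
+	 */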
+ if (doorbell_try_core_ipi(cpu))
+ return;
+
+ ic_cause_ipi(cpu);
+}
+
+static void __init pnv_smp_probe(void)
+{
+ if (xive_enabled())
+ xive_smp_probe();
+ else
+ xics_smp_probe();
+
+ if (cpu_has_feature(CPU_FTR_DBELL)) {
+ ic_cause_ipi = smp_ops->cause_ipi;
+ WARN_ON(!ic_cause_ipi);
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ smp_ops->cause_ipi = doorbell_global_ipi;
+ else
+ smp_ops->cause_ipi = pnv_cause_ipi;
+ }
+}
+
+static int pnv_system_reset_exception(struct pt_regs *regs)
+{
+ if (smp_handle_nmi_ipi(regs))
+ return 1;
+ return 0;
+}
+
+static int pnv_cause_nmi_ipi(int cpu)
+{
+ int64_t rc;
+
+ if (cpu >= 0) {
+ int h = get_hard_smp_processor_id(cpu);
+
+ if (opal_check_token(OPAL_QUIESCE))
+ opal_quiesce(QUIESCE_HOLD, h);
+
+ rc = opal_signal_system_reset(h);
+
+ if (opal_check_token(OPAL_QUIESCE))
+ opal_quiesce(QUIESCE_RESUME, h);
+
+ if (rc != OPAL_SUCCESS)
+ return 0;
+ return 1;
+
+ } else if (cpu == NMI_IPI_ALL_OTHERS) {
+ bool success = true;
+ int c;
+
+ if (opal_check_token(OPAL_QUIESCE))
+ opal_quiesce(QUIESCE_HOLD, -1);
+
+ /*
+ * We do not use broadcasts (yet), because it's not clear
+ * exactly what semantics Linux wants or the firmware should
+ * provide.
+ */
+ for_each_online_cpu(c) {
+ if (c == smp_processor_id())
+ continue;
+
+ rc = opal_signal_system_reset(
+ get_hard_smp_processor_id(c));
+ if (rc != OPAL_SUCCESS)
+ success = false;
+ }
+
+ if (opal_check_token(OPAL_QUIESCE))
+ opal_quiesce(QUIESCE_RESUME, -1);
+
+ if (success)
+ return 1;
+
+ /*
+ * Caller will fall back to doorbells, which may pick
+ * up the remainders.
+ */
+ }
+
+ return 0;
+}
+
+static struct smp_ops_t pnv_smp_ops = {
+ .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */
+ .cause_ipi = NULL, /* Filled at runtime by pnv_smp_probe() */
+ .cause_nmi_ipi = NULL,
+ .probe = pnv_smp_probe,
+ .prepare_cpu = pnv_smp_prepare_cpu,
+ .kick_cpu = pnv_smp_kick_cpu,
+ .setup_cpu = pnv_smp_setup_cpu,
+ .cpu_bootable = pnv_cpu_bootable,
+#ifdef CONFIG_HOTPLUG_CPU
+ .cpu_disable = pnv_smp_cpu_disable,
+ .cpu_die = generic_cpu_die,
+#endif /* CONFIG_HOTPLUG_CPU */
+};
+
+/* This is called very early during platform setup_arch */
+void __init pnv_smp_init(void)
+{
+ if (opal_check_token(OPAL_SIGNAL_SYSTEM_RESET)) {
+ ppc_md.system_reset_exception = pnv_system_reset_exception;
+ pnv_smp_ops.cause_nmi_ipi = pnv_cause_nmi_ipi;
+ }
+ smp_ops = &pnv_smp_ops;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ ppc_md.cpu_die = pnv_smp_cpu_kill_self;
+#ifdef CONFIG_KEXEC_CORE
+ crash_wake_offline = 1;
+#endif
+#endif
+}
diff --git a/arch/powerpc/platforms/powernv/subcore-asm.S b/arch/powerpc/platforms/powernv/subcore-asm.S
new file mode 100644
index 000000000..39bb24aa8
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/subcore-asm.S
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/asm-offsets.h>
+#include <asm/ppc_asm.h>
+#include <asm/reg.h>
+
+#include "subcore.h"
+
+
+_GLOBAL(split_core_secondary_loop)
+ /*
+ * r3 = u8 *state, used throughout the routine
+ * r4 = temp
+ * r5 = temp
+ * ..
+ * r12 = MSR
+ */
+ mfmsr r12
+
+ /* Disable interrupts so SRR0/1 don't get trashed */
+ li r4,0
+ ori r4,r4,MSR_EE|MSR_SE|MSR_BE|MSR_RI
+ andc r4,r12,r4
+ sync
+ mtmsrd r4
+
+ /* Switch to real mode and leave interrupts off */
+ li r5, MSR_IR|MSR_DR
+ andc r5, r4, r5
+
+ LOAD_REG_ADDR(r4, real_mode)
+
+ mtspr SPRN_SRR0,r4
+ mtspr SPRN_SRR1,r5
+ rfid
+ b . /* prevent speculative execution */
+
+real_mode:
+ /* Grab values from unsplit SPRs */
+ mfspr r6, SPRN_LDBAR
+ mfspr r7, SPRN_PMMAR
+ mfspr r8, SPRN_PMCR
+ mfspr r9, SPRN_RPR
+ mfspr r10, SPRN_SDR1
+
+ /* Order reading the SPRs vs telling the primary we are ready to split */
+ sync
+
+ /* Tell thread 0 we are in real mode */
+ li r4, SYNC_STEP_REAL_MODE
+ stb r4, 0(r3)
+
+ li r5, (HID0_POWER8_4LPARMODE | HID0_POWER8_2LPARMODE)@highest
+ sldi r5, r5, 48
+
+ /* Loop until we see the split happen in HID0 */
+1: mfspr r4, SPRN_HID0
+ and. r4, r4, r5
+ beq 1b
+
+ /*
+ * We only need to initialise the below regs once for each subcore,
+ * but it's simpler and harmless to do it on each thread.
+ */
+
+ /* Make sure various SPRS have sane values */
+ li r4, 0
+ mtspr SPRN_LPID, r4
+ mtspr SPRN_PCR, r4
+ mtspr SPRN_HDEC, r4
+
+ /* Restore SPR values now we are split */
+ mtspr SPRN_LDBAR, r6
+ mtspr SPRN_PMMAR, r7
+ mtspr SPRN_PMCR, r8
+ mtspr SPRN_RPR, r9
+ mtspr SPRN_SDR1, r10
+
+ LOAD_REG_ADDR(r5, virtual_mode)
+
+ /* Get out of real mode */
+ mtspr SPRN_SRR0,r5
+ mtspr SPRN_SRR1,r12
+ rfid
+ b . /* prevent speculative execution */
+
+virtual_mode:
+ blr
diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c
new file mode 100644
index 000000000..45563004f
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/subcore.c
@@ -0,0 +1,435 @@
+/*
+ * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "powernv: " fmt
+
+#include <linux/kernel.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/gfp.h>
+#include <linux/smp.h>
+#include <linux/stop_machine.h>
+
+#include <asm/cputhreads.h>
+#include <asm/cpuidle.h>
+#include <asm/kvm_ppc.h>
+#include <asm/machdep.h>
+#include <asm/opal.h>
+#include <asm/smp.h>
+
+#include "subcore.h"
+#include "powernv.h"
+
+
+/*
+ * Split/unsplit procedure:
+ *
+ * A core can be in one of three states, unsplit, 2-way split, and 4-way split.
+ *
+ * The mapping to subcores_per_core is simple:
+ *
+ * State | subcores_per_core
+ * ------------|------------------
+ * Unsplit | 1
+ * 2-way split | 2
+ * 4-way split | 4
+ *
+ * The core is split along thread boundaries, the mapping between subcores and
+ * threads is as follows:
+ *
+ * Unsplit:
+ * ----------------------------
+ * Subcore | 0 |
+ * ----------------------------
+ * Thread | 0 1 2 3 4 5 6 7 |
+ * ----------------------------
+ *
+ * 2-way split:
+ * -------------------------------------
+ * Subcore | 0 | 1 |
+ * -------------------------------------
+ * Thread | 0 1 2 3 | 4 5 6 7 |
+ * -------------------------------------
+ *
+ * 4-way split:
+ * -----------------------------------------
+ * Subcore | 0 | 1 | 2 | 3 |
+ * -----------------------------------------
+ * Thread | 0 1 | 2 3 | 4 5 | 6 7 |
+ * -----------------------------------------
+ *
+ *
+ * Transitions
+ * -----------
+ *
+ * It is not possible to transition between either of the split states, the
+ * core must first be unsplit. The legal transitions are:
+ *
+ * ----------- ---------------
+ * | | <----> | 2-way split |
+ * | | ---------------
+ * | Unsplit |
+ * | | ---------------
+ * | | <----> | 4-way split |
+ * ----------- ---------------
+ *
+ * Unsplitting
+ * -----------
+ *
+ * Unsplitting is the simpler procedure. It requires thread 0 to request the
+ * unsplit while all other threads NAP.
+ *
+ * Thread 0 clears HID0_POWER8_DYNLPARDIS (Dynamic LPAR Disable). This tells
+ * the hardware that if all threads except 0 are napping, the hardware should
+ * unsplit the core.
+ *
+ * Non-zero threads are sent to a NAP loop; they don't exit the loop until they
+ * see the core unsplit.
+ *
+ * Thread 0 spins waiting for the hardware to see all the other threads napping
+ * and perform the unsplit.
+ *
+ * Once thread 0 sees the unsplit, it IPIs the secondary threads to wake them
+ * out of NAP. They will then see the core unsplit and exit the NAP loop.
+ *
+ * Splitting
+ * ---------
+ *
+ * The basic splitting procedure is fairly straightforward. However it is
+ * complicated by the fact that after the split occurs, the newly created
+ * subcores are not in a fully initialised state.
+ *
+ * Most notably the subcores do not have the correct value for SDR1, which
+ * means they must not be running in virtual mode when the split occurs. The
+ * subcores have separate timebase SPRs, but these are pre-synchronised by
+ * OPAL.
+ *
+ * To begin with secondary threads are sent to an assembly routine. There they
+ * switch to real mode, so they are immune to the uninitialised SDR1 value.
+ * Once in real mode they indicate that they are in real mode, and spin waiting
+ * to see the core split.
+ *
+ * Thread 0 waits to see that all secondaries are in real mode, and then begins
+ * the splitting procedure. It firstly sets HID0_POWER8_DYNLPARDIS, which
+ * prevents the hardware from unsplitting. Then it sets the appropriate HID bit
+ * to request the split, and spins waiting to see that the split has happened.
+ *
+ * Concurrently the secondaries will notice the split. When they do they set up
+ * their SPRs, notably SDR1, and then they can return to virtual mode and exit
+ * the procedure.
+ */
+
+/* Initialised at boot by subcore_init() */
+static int subcores_per_core;
+
+/*
+ * Used to communicate to offline cpus that we want them to pop out of the
+ * offline loop and do a split or unsplit.
+ *
+ * 0 - no split happening
+ * 1 - unsplit in progress
+ * 2 - split to 2 in progress
+ * 4 - split to 4 in progress
+ */
+static int new_split_mode;
+
+static cpumask_var_t cpu_offline_mask;
+
+struct split_state {
+ u8 step;
+ u8 master;
+};
+
+static DEFINE_PER_CPU(struct split_state, split_state);
+
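+/*
+ * Run on thread 0 of a core: spin until every other thread in the core has
+ * advanced its split_state.step to at least @step.
+ */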
+static void wait_for_sync_step(int step)
+{
+ int i, cpu = smp_processor_id();
+
+ for (i = cpu + 1; i < cpu + threads_per_core; i++)
+		while (per_cpu(split_state, i).step < step)
+ barrier();
+
+ /* Order the wait loop vs any subsequent loads/stores. */
+ mb();
+}
+
+static void update_hid_in_slw(u64 hid0)
+{
+ u64 idle_states = pnv_get_supported_cpuidle_states();
+
+ if (idle_states & OPAL_PM_WINKLE_ENABLED) {
+ /* OPAL call to patch slw with the new HID0 value */
+ u64 cpu_pir = hard_smp_processor_id();
+
+ opal_slw_set_reg(cpu_pir, SPRN_HID0, hid0);
+ }
+}
+
+static void unsplit_core(void)
+{
+ u64 hid0, mask;
+ int i, cpu;
+
+ mask = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE;
+
+ cpu = smp_processor_id();
+ if (cpu_thread_in_core(cpu) != 0) {
+ while (mfspr(SPRN_HID0) & mask)
+ power7_idle_insn(PNV_THREAD_NAP);
+
+ per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT;
+ return;
+ }
+
+ hid0 = mfspr(SPRN_HID0);
+ hid0 &= ~HID0_POWER8_DYNLPARDIS;
+ update_power8_hid0(hid0);
+ update_hid_in_slw(hid0);
+
+ while (mfspr(SPRN_HID0) & mask)
+ cpu_relax();
+
+ /* Wake secondaries out of NAP */
+ for (i = cpu + 1; i < cpu + threads_per_core; i++)
+ smp_send_reschedule(i);
+
+ wait_for_sync_step(SYNC_STEP_UNSPLIT);
+}
+
+static void split_core(int new_mode)
+{
+ struct { u64 value; u64 mask; } split_parms[2] = {
+ { HID0_POWER8_1TO2LPAR, HID0_POWER8_2LPARMODE },
+ { HID0_POWER8_1TO4LPAR, HID0_POWER8_4LPARMODE }
+ };
+ int i, cpu;
+ u64 hid0;
+
+ /* Convert new_mode (2 or 4) into an index into our parms array */
+ i = (new_mode >> 1) - 1;
+ BUG_ON(i < 0 || i > 1);
+
+ cpu = smp_processor_id();
+ if (cpu_thread_in_core(cpu) != 0) {
+ split_core_secondary_loop(&per_cpu(split_state, cpu).step);
+ return;
+ }
+
+ wait_for_sync_step(SYNC_STEP_REAL_MODE);
+
+ /* Write new mode */
+ hid0 = mfspr(SPRN_HID0);
+ hid0 |= HID0_POWER8_DYNLPARDIS | split_parms[i].value;
+ update_power8_hid0(hid0);
+ update_hid_in_slw(hid0);
+
+ /* Wait for it to happen */
+ while (!(mfspr(SPRN_HID0) & split_parms[i].mask))
+ cpu_relax();
+}
+
+static void cpu_do_split(int new_mode)
+{
+ /*
+ * At boot subcores_per_core will be 0, so we will always unsplit at
+ * boot. In the usual case where the core is already unsplit it's a
+ * nop, and this just ensures the kernel's notion of the mode is
+ * consistent with the hardware.
+ */
+ if (subcores_per_core != 1)
+ unsplit_core();
+
+ if (new_mode != 1)
+ split_core(new_mode);
+
+ mb();
+ per_cpu(split_state, smp_processor_id()).step = SYNC_STEP_FINISHED;
+}
+
+bool cpu_core_split_required(void)
+{
+ smp_rmb();
+
+ if (!new_split_mode)
+ return false;
+
+ cpu_do_split(new_split_mode);
+
+ return true;
+}
+
+void update_subcore_sibling_mask(void)
+{
+ int cpu;
+ /*
+ * Sibling mask for the first cpu. Left shift this by the required
+ * number of bits to get the sibling mask for the rest of the cpus.
+ */
+ int sibling_mask_first_cpu = (1 << threads_per_subcore) - 1;
+
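+ /*
+ * E.g. with threads_per_core = 8 and threads_per_subcore = 2 (a 4-way
+ * split), sibling_mask_first_cpu is 0b11; a cpu that is thread 5 of
+ * its core gets offset = (5 / 2) * 2 = 4, so its subcore_sibling_mask
+ * is 0b11 << 4 = 0x30, i.e. threads 4 and 5.
+ */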
+ for_each_possible_cpu(cpu) {
+ int tid = cpu_thread_in_core(cpu);
+ int offset = (tid / threads_per_subcore) * threads_per_subcore;
+ int mask = sibling_mask_first_cpu << offset;
+
+ paca_ptrs[cpu]->subcore_sibling_mask = mask;
+
+ }
+}
+
+static int cpu_update_split_mode(void *data)
+{
+ int cpu, new_mode = *(int *)data;
+
+ if (this_cpu_ptr(&split_state)->master) {
+ new_split_mode = new_mode;
+ smp_wmb();
+
+ cpumask_andnot(cpu_offline_mask, cpu_present_mask,
+ cpu_online_mask);
+
+ /* This should work even though the cpu is offline */
+ for_each_cpu(cpu, cpu_offline_mask)
+ smp_send_reschedule(cpu);
+ }
+
+ cpu_do_split(new_mode);
+
+ if (this_cpu_ptr(&split_state)->master) {
+ /* Wait for all cpus to finish before we touch subcores_per_core */
+ for_each_present_cpu(cpu) {
+ if (cpu >= setup_max_cpus)
+ break;
+
+ while (per_cpu(split_state, cpu).step < SYNC_STEP_FINISHED)
+ barrier();
+ }
+
+ new_split_mode = 0;
+
+ /* Make the new mode public */
+ subcores_per_core = new_mode;
+ threads_per_subcore = threads_per_core / subcores_per_core;
+ update_subcore_sibling_mask();
+
+ /* Make sure the new mode is written before we exit */
+ mb();
+ }
+
+ return 0;
+}
+
+static int set_subcores_per_core(int new_mode)
+{
+ struct split_state *state;
+ int cpu;
+
+ if (kvm_hv_mode_active()) {
+ pr_err("Unable to change split core mode while KVM active.\n");
+ return -EBUSY;
+ }
+
+ /*
+ * We are only called at boot, or from the sysfs write. If that ever
+ * changes we'll need a lock here.
+ */
+ BUG_ON(new_mode < 1 || new_mode > 4 || new_mode == 3);
+
+ for_each_present_cpu(cpu) {
+ state = &per_cpu(split_state, cpu);
+ state->step = SYNC_STEP_INITIAL;
+ state->master = 0;
+ }
+
+ cpus_read_lock();
+
+ /* This cpu will update the globals before exiting stop machine */
+ this_cpu_ptr(&split_state)->master = 1;
+
+ /* Ensure state is consistent before we call the other cpus */
+ mb();
+
+ stop_machine_cpuslocked(cpu_update_split_mode, &new_mode,
+ cpu_online_mask);
+
+ cpus_read_unlock();
+
+ return 0;
+}
+
+static ssize_t __used store_subcores_per_core(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ unsigned long val;
+ int rc;
+
+ /* We are serialised by the attribute lock */
+
+ rc = sscanf(buf, "%lx", &val);
+ if (rc != 1)
+ return -EINVAL;
+
+ switch (val) {
+ case 1:
+ case 2:
+ case 4:
+ if (subcores_per_core == val)
+ /* Nothing to do */
+ goto out;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ rc = set_subcores_per_core(val);
+ if (rc)
+ return rc;
+
+out:
+ return count;
+}
+
+static ssize_t show_subcores_per_core(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%x\n", subcores_per_core);
+}
+
+static DEVICE_ATTR(subcores_per_core, 0644,
+ show_subcores_per_core, store_subcores_per_core);
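+
+/*
+ * The attribute is registered on the cpu subsystem in subcore_init(), so
+ * it normally appears as /sys/devices/system/cpu/subcores_per_core. For
+ * example, on an SMT8 POWER8 host, writing 4 to it requests a 4-way
+ * split (two threads per subcore), and writing 1 requests that the
+ * cores be unsplit again.
+ */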
+
+static int subcore_init(void)
+{
+ unsigned pvr_ver;
+
+ pvr_ver = PVR_VER(mfspr(SPRN_PVR));
+
+ if (pvr_ver != PVR_POWER8 &&
+ pvr_ver != PVR_POWER8E &&
+ pvr_ver != PVR_POWER8NVL)
+ return 0;
+
+ /*
+ * We need all threads in a core to be present to split/unsplit, so
+ * continue only if setup_max_cpus is a multiple of threads_per_core.
+ */
+ if (setup_max_cpus % threads_per_core)
+ return 0;
+
+ BUG_ON(!alloc_cpumask_var(&cpu_offline_mask, GFP_KERNEL));
+
+ set_subcores_per_core(1);
+
+ return device_create_file(cpu_subsys.dev_root,
+ &dev_attr_subcores_per_core);
+}
+machine_device_initcall(powernv, subcore_init);
diff --git a/arch/powerpc/platforms/powernv/subcore.h b/arch/powerpc/platforms/powernv/subcore.h
new file mode 100644
index 000000000..84e02ae52
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/subcore.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright 2013, Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* These are ordered and tested with <= */
+#define SYNC_STEP_INITIAL 0
+#define SYNC_STEP_UNSPLIT 1 /* Set by secondary when it sees unsplit */
+#define SYNC_STEP_REAL_MODE 2 /* Set by secondary when in real mode */
+#define SYNC_STEP_FINISHED 3 /* Set by secondary when split/unsplit is done */
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_SMP
+void split_core_secondary_loop(u8 *state);
+extern void update_subcore_sibling_mask(void);
+#else
+static inline void update_subcore_sibling_mask(void) { };
+#endif /* CONFIG_SMP */
+
+#endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/platforms/powernv/vas-debug.c b/arch/powerpc/platforms/powernv/vas-debug.c
new file mode 100644
index 000000000..4f7276ebd
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas-debug.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2016-17 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "vas: " fmt
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include "vas.h"
+
+static struct dentry *vas_debugfs;
+
+static char *cop_to_str(int cop)
+{
+ switch (cop) {
+ case VAS_COP_TYPE_FAULT: return "Fault";
+ case VAS_COP_TYPE_842: return "NX-842 Normal Priority";
+ case VAS_COP_TYPE_842_HIPRI: return "NX-842 High Priority";
+ case VAS_COP_TYPE_GZIP: return "NX-GZIP Normal Priority";
+ case VAS_COP_TYPE_GZIP_HIPRI: return "NX-GZIP High Priority";
+ case VAS_COP_TYPE_FTW: return "Fast Thread-wakeup";
+ default: return "Unknown";
+ }
+}
+
+static int info_dbg_show(struct seq_file *s, void *private)
+{
+ struct vas_window *window = s->private;
+
+ mutex_lock(&vas_mutex);
+
+ /* ensure window is not unmapped */
+ if (!window->hvwc_map)
+ goto unlock;
+
+ seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop),
+ window->tx_win ? "Send" : "Receive");
+ seq_printf(s, "Pid : %d\n", window->pid);
+
+unlock:
+ mutex_unlock(&vas_mutex);
+ return 0;
+}
+
+static int info_dbg_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, info_dbg_show, inode->i_private);
+}
+
+static const struct file_operations info_fops = {
+ .open = info_dbg_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static inline void print_reg(struct seq_file *s, struct vas_window *win,
+ char *name, u32 reg)
+{
+ seq_printf(s, "0x%016llx %s\n", read_hvwc_reg(win, name, reg), name);
+}
+
+static int hvwc_dbg_show(struct seq_file *s, void *private)
+{
+ struct vas_window *window = s->private;
+
+ mutex_lock(&vas_mutex);
+
+ /* ensure window is not unmapped */
+ if (!window->hvwc_map)
+ goto unlock;
+
+ print_reg(s, window, VREG(LPID));
+ print_reg(s, window, VREG(PID));
+ print_reg(s, window, VREG(XLATE_MSR));
+ print_reg(s, window, VREG(XLATE_LPCR));
+ print_reg(s, window, VREG(XLATE_CTL));
+ print_reg(s, window, VREG(AMR));
+ print_reg(s, window, VREG(SEIDR));
+ print_reg(s, window, VREG(FAULT_TX_WIN));
+ print_reg(s, window, VREG(OSU_INTR_SRC_RA));
+ print_reg(s, window, VREG(HV_INTR_SRC_RA));
+ print_reg(s, window, VREG(PSWID));
+ print_reg(s, window, VREG(LFIFO_BAR));
+ print_reg(s, window, VREG(LDATA_STAMP_CTL));
+ print_reg(s, window, VREG(LDMA_CACHE_CTL));
+ print_reg(s, window, VREG(LRFIFO_PUSH));
+ print_reg(s, window, VREG(CURR_MSG_COUNT));
+ print_reg(s, window, VREG(LNOTIFY_AFTER_COUNT));
+ print_reg(s, window, VREG(LRX_WCRED));
+ print_reg(s, window, VREG(LRX_WCRED_ADDER));
+ print_reg(s, window, VREG(TX_WCRED));
+ print_reg(s, window, VREG(TX_WCRED_ADDER));
+ print_reg(s, window, VREG(LFIFO_SIZE));
+ print_reg(s, window, VREG(WINCTL));
+ print_reg(s, window, VREG(WIN_STATUS));
+ print_reg(s, window, VREG(WIN_CTX_CACHING_CTL));
+ print_reg(s, window, VREG(TX_RSVD_BUF_COUNT));
+ print_reg(s, window, VREG(LRFIFO_WIN_PTR));
+ print_reg(s, window, VREG(LNOTIFY_CTL));
+ print_reg(s, window, VREG(LNOTIFY_PID));
+ print_reg(s, window, VREG(LNOTIFY_LPID));
+ print_reg(s, window, VREG(LNOTIFY_TID));
+ print_reg(s, window, VREG(LNOTIFY_SCOPE));
+ print_reg(s, window, VREG(NX_UTIL_ADDER));
+unlock:
+ mutex_unlock(&vas_mutex);
+ return 0;
+}
+
+static int hvwc_dbg_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, hvwc_dbg_show, inode->i_private);
+}
+
+static const struct file_operations hvwc_fops = {
+ .open = hvwc_dbg_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+void vas_window_free_dbgdir(struct vas_window *window)
+{
+ if (window->dbgdir) {
+ debugfs_remove_recursive(window->dbgdir);
+ kfree(window->dbgname);
+ window->dbgdir = NULL;
+ window->dbgname = NULL;
+ }
+}
+
+void vas_window_init_dbgdir(struct vas_window *window)
+{
+ struct dentry *f, *d;
+
+ if (!window->vinst->dbgdir)
+ return;
+
+ window->dbgname = kzalloc(16, GFP_KERNEL);
+ if (!window->dbgname)
+ return;
+
+ snprintf(window->dbgname, 16, "w%d", window->winid);
+
+ d = debugfs_create_dir(window->dbgname, window->vinst->dbgdir);
+ if (IS_ERR(d))
+ goto free_name;
+
+ window->dbgdir = d;
+
+ f = debugfs_create_file("info", 0444, d, window, &info_fops);
+ if (IS_ERR(f))
+ goto remove_dir;
+
+ f = debugfs_create_file("hvwc", 0444, d, window, &hvwc_fops);
+ if (IS_ERR(f))
+ goto remove_dir;
+
+ return;
+
+remove_dir:
+ debugfs_remove_recursive(window->dbgdir);
+ window->dbgdir = NULL;
+
+free_name:
+ kfree(window->dbgname);
+ window->dbgname = NULL;
+}
+
+void vas_instance_init_dbgdir(struct vas_instance *vinst)
+{
+ struct dentry *d;
+
+ vas_init_dbgdir();
+ if (!vas_debugfs)
+ return;
+
+ vinst->dbgname = kzalloc(16, GFP_KERNEL);
+ if (!vinst->dbgname)
+ return;
+
+ snprintf(vinst->dbgname, 16, "v%d", vinst->vas_id);
+
+ d = debugfs_create_dir(vinst->dbgname, vas_debugfs);
+ if (IS_ERR(d))
+ goto free_name;
+
+ vinst->dbgdir = d;
+ return;
+
+free_name:
+ kfree(vinst->dbgname);
+ vinst->dbgname = NULL;
+ vinst->dbgdir = NULL;
+}
+
+/*
+ * Set up the "root" VAS debugfs dir. Return if we already set it up
+ * (or failed to) in an earlier instance of VAS.
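+ *
+ * With debugfs mounted at its usual /sys/kernel/debug location, the
+ * resulting hierarchy is:
+ *
+ *     /sys/kernel/debug/vas/v<vas_id>/w<winid>/{info,hvwc}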
+ */
+void vas_init_dbgdir(void)
+{
+ static bool first_time = true;
+
+ if (!first_time)
+ return;
+
+ first_time = false;
+ vas_debugfs = debugfs_create_dir("vas", NULL);
+ if (IS_ERR(vas_debugfs))
+ vas_debugfs = NULL;
+}
diff --git a/arch/powerpc/platforms/powernv/vas-trace.h b/arch/powerpc/platforms/powernv/vas-trace.h
new file mode 100644
index 000000000..a449b9f0c
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas-trace.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM vas
+
+#if !defined(_VAS_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+
+#define _VAS_TRACE_H
+#include <linux/tracepoint.h>
+#include <linux/sched.h>
+#include <asm/vas.h>
+
+TRACE_EVENT( vas_rx_win_open,
+
+ TP_PROTO(struct task_struct *tsk,
+ int vasid,
+ int cop,
+ struct vas_rx_win_attr *rxattr),
+
+ TP_ARGS(tsk, vasid, cop, rxattr),
+
+ TP_STRUCT__entry(
+ __field(struct task_struct *, tsk)
+ __field(int, pid)
+ __field(int, cop)
+ __field(int, vasid)
+ __field(struct vas_rx_win_attr *, rxattr)
+ __field(int, lnotify_lpid)
+ __field(int, lnotify_pid)
+ __field(int, lnotify_tid)
+ ),
+
+ TP_fast_assign(
+ __entry->pid = tsk->pid;
+ __entry->vasid = vasid;
+ __entry->cop = cop;
+ __entry->lnotify_lpid = rxattr->lnotify_lpid;
+ __entry->lnotify_pid = rxattr->lnotify_pid;
+ __entry->lnotify_tid = rxattr->lnotify_tid;
+ ),
+
+ TP_printk("pid=%d, vasid=%d, cop=%d, lpid=%d, pid=%d, tid=%d",
+ __entry->pid, __entry->vasid, __entry->cop,
+ __entry->lnotify_lpid, __entry->lnotify_pid,
+ __entry->lnotify_tid)
+);
+
+TRACE_EVENT( vas_tx_win_open,
+
+ TP_PROTO(struct task_struct *tsk,
+ int vasid,
+ int cop,
+ struct vas_tx_win_attr *txattr),
+
+ TP_ARGS(tsk, vasid, cop, txattr),
+
+ TP_STRUCT__entry(
+ __field(struct task_struct *, tsk)
+ __field(int, pid)
+ __field(int, cop)
+ __field(int, vasid)
+ __field(struct vas_tx_win_attr *, txattr)
+ __field(int, lpid)
+ __field(int, pidr)
+ ),
+
+ TP_fast_assign(
+ __entry->pid = tsk->pid;
+ __entry->vasid = vasid;
+ __entry->cop = cop;
+ __entry->lpid = txattr->lpid;
+ __entry->pidr = txattr->pidr;
+ ),
+
+ TP_printk("pid=%d, vasid=%d, cop=%d, lpid=%d, pidr=%d",
+ __entry->pid, __entry->vasid, __entry->cop,
+ __entry->lpid, __entry->pidr)
+);
+
+TRACE_EVENT( vas_paste_crb,
+
+ TP_PROTO(struct task_struct *tsk,
+ struct vas_window *win),
+
+ TP_ARGS(tsk, win),
+
+ TP_STRUCT__entry(
+ __field(struct task_struct *, tsk)
+ __field(struct vas_window *, win)
+ __field(int, pid)
+ __field(int, vasid)
+ __field(int, winid)
+ __field(unsigned long, paste_kaddr)
+ ),
+
+ TP_fast_assign(
+ __entry->pid = tsk->pid;
+ __entry->vasid = win->vinst->vas_id;
+ __entry->winid = win->winid;
+ __entry->paste_kaddr = (unsigned long)win->paste_kaddr;
+ ),
+
+ TP_printk("pid=%d, vasid=%d, winid=%d, paste_kaddr=0x%016lx\n",
+ __entry->pid, __entry->vasid, __entry->winid,
+ __entry->paste_kaddr)
+);
+
+#endif /* _VAS_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../arch/powerpc/platforms/powernv
+#define TRACE_INCLUDE_FILE vas-trace
+#include <trace/define_trace.h>
diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
new file mode 100644
index 000000000..e59e0e60e
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -0,0 +1,1279 @@
+/*
+ * Copyright 2016-17 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "vas: " fmt
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/log2.h>
+#include <linux/rcupdate.h>
+#include <linux/cred.h>
+#include <asm/switch_to.h>
+#include <asm/ppc-opcode.h>
+#include "vas.h"
+#include "copy-paste.h"
+
+#define CREATE_TRACE_POINTS
+#include "vas-trace.h"
+
+/*
+ * Compute the paste address region for the window @window using the
+ * ->paste_base_addr and ->paste_win_id_shift we got from device tree.
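+ *
+ * For example, with an (illustrative) paste_win_id_shift of 16, window
+ * id 4 pastes at paste_base_addr + (4 << 16) = paste_base_addr + 0x40000.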
+ */
+static void compute_paste_address(struct vas_window *window, u64 *addr, int *len)
+{
+ int winid;
+ u64 base, shift;
+
+ base = window->vinst->paste_base_addr;
+ shift = window->vinst->paste_win_id_shift;
+ winid = window->winid;
+
+ *addr = base + (winid << shift);
+ if (len)
+ *len = PAGE_SIZE;
+
+ pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr);
+}
+
+u64 vas_win_paste_addr(struct vas_window *win)
+{
+ u64 addr;
+
+ compute_paste_address(win, &addr, NULL);
+
+ return addr;
+}
+EXPORT_SYMBOL(vas_win_paste_addr);
+
+static inline void get_hvwc_mmio_bar(struct vas_window *window,
+ u64 *start, int *len)
+{
+ u64 pbaddr;
+
+ pbaddr = window->vinst->hvwc_bar_start;
+ *start = pbaddr + window->winid * VAS_HVWC_SIZE;
+ *len = VAS_HVWC_SIZE;
+}
+
+static inline void get_uwc_mmio_bar(struct vas_window *window,
+ u64 *start, int *len)
+{
+ u64 pbaddr;
+
+ pbaddr = window->vinst->uwc_bar_start;
+ *start = pbaddr + window->winid * VAS_UWC_SIZE;
+ *len = VAS_UWC_SIZE;
+}
+
+/*
+ * Map the paste bus address of the given send window into kernel address
+ * space. Unlike MMIO regions (map_mmio_region() below), the paste region
+ * must be mapped cacheable and is only applicable to send windows.
+ */
+static void *map_paste_region(struct vas_window *txwin)
+{
+ int len;
+ void *map;
+ char *name;
+ u64 start;
+
+ name = kasprintf(GFP_KERNEL, "window-v%d-w%d", txwin->vinst->vas_id,
+ txwin->winid);
+ if (!name)
+ goto free_name;
+
+ txwin->paste_addr_name = name;
+ compute_paste_address(txwin, &start, &len);
+
+ if (!request_mem_region(start, len, name)) {
+ pr_devel("%s(): request_mem_region(0x%llx, %d) failed\n",
+ __func__, start, len);
+ goto free_name;
+ }
+
+ map = ioremap_cache(start, len);
+ if (!map) {
+ pr_devel("%s(): ioremap_cache(0x%llx, %d) failed\n", __func__,
+ start, len);
+ goto free_region;
+ }
+
+ pr_devel("Mapped paste addr 0x%llx to kaddr 0x%p\n", start, map);
+ return map;
+
+free_region:
+ /* Release the claimed region; its resource entry points at 'name'. */
+ release_mem_region(start, len);
+free_name:
+ kfree(name);
+ return ERR_PTR(-ENOMEM);
+}
+
+static void *map_mmio_region(char *name, u64 start, int len)
+{
+ void *map;
+
+ if (!request_mem_region(start, len, name)) {
+ pr_devel("%s(): request_mem_region(0x%llx, %d) failed\n",
+ __func__, start, len);
+ return NULL;
+ }
+
+ map = ioremap(start, len);
+ if (!map) {
+ pr_devel("%s(): ioremap(0x%llx, %d) failed\n", __func__, start,
+ len);
+ return NULL;
+ }
+
+ return map;
+}
+
+static void unmap_region(void *addr, u64 start, int len)
+{
+ iounmap(addr);
+ release_mem_region((phys_addr_t)start, len);
+}
+
+/*
+ * Unmap the paste address region for a window.
+ */
+static void unmap_paste_region(struct vas_window *window)
+{
+ int len;
+ u64 busaddr_start;
+
+ if (window->paste_kaddr) {
+ compute_paste_address(window, &busaddr_start, &len);
+ unmap_region(window->paste_kaddr, busaddr_start, len);
+ window->paste_kaddr = NULL;
+ kfree(window->paste_addr_name);
+ window->paste_addr_name = NULL;
+ }
+}
+
+/*
+ * Unmap the MMIO regions for a window. Hold the vas_mutex so we don't
+ * unmap when the window's debugfs dir is in use. This serializes close
+ * of a window even on another VAS instance, but since it's not a critical
+ * path, just minimize the time we hold the mutex for now. We can add
+ * a per-instance mutex later if necessary.
+ */
+static void unmap_winctx_mmio_bars(struct vas_window *window)
+{
+ int len;
+ void *uwc_map;
+ void *hvwc_map;
+ u64 busaddr_start;
+
+ mutex_lock(&vas_mutex);
+
+ hvwc_map = window->hvwc_map;
+ window->hvwc_map = NULL;
+
+ uwc_map = window->uwc_map;
+ window->uwc_map = NULL;
+
+ mutex_unlock(&vas_mutex);
+
+ if (hvwc_map) {
+ get_hvwc_mmio_bar(window, &busaddr_start, &len);
+ unmap_region(hvwc_map, busaddr_start, len);
+ }
+
+ if (uwc_map) {
+ get_uwc_mmio_bar(window, &busaddr_start, &len);
+ unmap_region(uwc_map, busaddr_start, len);
+ }
+}
+
+/*
+ * Find the Hypervisor Window Context (HVWC) MMIO Base Address Region and the
+ * OS/User Window Context (UWC) MMIO Base Address Region for the given window.
+ * Map these bus addresses and save the mapped kernel addresses in @window.
+ */
+int map_winctx_mmio_bars(struct vas_window *window)
+{
+ int len;
+ u64 start;
+
+ get_hvwc_mmio_bar(window, &start, &len);
+ window->hvwc_map = map_mmio_region("HVWCM_Window", start, len);
+
+ get_uwc_mmio_bar(window, &start, &len);
+ window->uwc_map = map_mmio_region("UWCM_Window", start, len);
+
+ if (!window->hvwc_map || !window->uwc_map) {
+ unmap_winctx_mmio_bars(window);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Reset all valid registers in the HV and OS/User Window Contexts for
+ * the window identified by @window.
+ *
+ * NOTE: We cannot really use a for loop to reset window context. Not all
+ * offsets in a window context are valid registers and the valid
+ * registers are not sequential. And, we can only write to offsets
+ * with valid registers.
+ */
+void reset_window_regs(struct vas_window *window)
+{
+ write_hvwc_reg(window, VREG(LPID), 0ULL);
+ write_hvwc_reg(window, VREG(PID), 0ULL);
+ write_hvwc_reg(window, VREG(XLATE_MSR), 0ULL);
+ write_hvwc_reg(window, VREG(XLATE_LPCR), 0ULL);
+ write_hvwc_reg(window, VREG(XLATE_CTL), 0ULL);
+ write_hvwc_reg(window, VREG(AMR), 0ULL);
+ write_hvwc_reg(window, VREG(SEIDR), 0ULL);
+ write_hvwc_reg(window, VREG(FAULT_TX_WIN), 0ULL);
+ write_hvwc_reg(window, VREG(OSU_INTR_SRC_RA), 0ULL);
+ write_hvwc_reg(window, VREG(HV_INTR_SRC_RA), 0ULL);
+ write_hvwc_reg(window, VREG(PSWID), 0ULL);
+ write_hvwc_reg(window, VREG(LFIFO_BAR), 0ULL);
+ write_hvwc_reg(window, VREG(LDATA_STAMP_CTL), 0ULL);
+ write_hvwc_reg(window, VREG(LDMA_CACHE_CTL), 0ULL);
+ write_hvwc_reg(window, VREG(LRFIFO_PUSH), 0ULL);
+ write_hvwc_reg(window, VREG(CURR_MSG_COUNT), 0ULL);
+ write_hvwc_reg(window, VREG(LNOTIFY_AFTER_COUNT), 0ULL);
+ write_hvwc_reg(window, VREG(LRX_WCRED), 0ULL);
+ write_hvwc_reg(window, VREG(LRX_WCRED_ADDER), 0ULL);
+ write_hvwc_reg(window, VREG(TX_WCRED), 0ULL);
+ write_hvwc_reg(window, VREG(TX_WCRED_ADDER), 0ULL);
+ write_hvwc_reg(window, VREG(LFIFO_SIZE), 0ULL);
+ write_hvwc_reg(window, VREG(WINCTL), 0ULL);
+ write_hvwc_reg(window, VREG(WIN_STATUS), 0ULL);
+ write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), 0ULL);
+ write_hvwc_reg(window, VREG(TX_RSVD_BUF_COUNT), 0ULL);
+ write_hvwc_reg(window, VREG(LRFIFO_WIN_PTR), 0ULL);
+ write_hvwc_reg(window, VREG(LNOTIFY_CTL), 0ULL);
+ write_hvwc_reg(window, VREG(LNOTIFY_PID), 0ULL);
+ write_hvwc_reg(window, VREG(LNOTIFY_LPID), 0ULL);
+ write_hvwc_reg(window, VREG(LNOTIFY_TID), 0ULL);
+ write_hvwc_reg(window, VREG(LNOTIFY_SCOPE), 0ULL);
+ write_hvwc_reg(window, VREG(NX_UTIL_ADDER), 0ULL);
+
+ /* Skip read-only registers: NX_UTIL and NX_UTIL_SE */
+
+ /*
+ * The send and receive window credit adder registers are also
+ * accessible from HVWC and have been initialized above. We don't
+ * need to initialize from the OS/User Window Context, so skip
+ * following calls:
+ *
+ * write_uwc_reg(window, VREG(TX_WCRED_ADDER), 0ULL);
+ * write_uwc_reg(window, VREG(LRX_WCRED_ADDER), 0ULL);
+ */
+}
+
+/*
+ * Initialize window context registers related to Address Translation.
+ * These registers are common to send/receive windows although they
+ * differ for user/kernel windows. As we resolve the TODOs we may
+ * want to add fields to vas_winctx and move the initialization to
+ * init_vas_winctx_regs().
+ */
+static void init_xlate_regs(struct vas_window *window, bool user_win)
+{
+ u64 lpcr, val;
+
+ /*
+ * MSR_TA, MSR_US are false for both kernel and user.
+ * MSR_DR and MSR_PR are false for kernel.
+ */
+ val = 0ULL;
+ val = SET_FIELD(VAS_XLATE_MSR_HV, val, 1);
+ val = SET_FIELD(VAS_XLATE_MSR_SF, val, 1);
+ if (user_win) {
+ val = SET_FIELD(VAS_XLATE_MSR_DR, val, 1);
+ val = SET_FIELD(VAS_XLATE_MSR_PR, val, 1);
+ }
+ write_hvwc_reg(window, VREG(XLATE_MSR), val);
+
+ lpcr = mfspr(SPRN_LPCR);
+ val = 0ULL;
+ /*
+ * NOTE: From Section 5.7.8.1 Segment Lookaside Buffer of the
+ * Power ISA v3.0B, the page size encoding is 0 = 4KB, 5 = 64KB.
+ *
+ * NOTE: From Section 1.3.1, Address Translation Context of the
+ * Nest MMU Workbook, LPCR_SC should be 0 for Power9.
+ */
+ val = SET_FIELD(VAS_XLATE_LPCR_PAGE_SIZE, val, 5);
+ val = SET_FIELD(VAS_XLATE_LPCR_ISL, val, lpcr & LPCR_ISL);
+ val = SET_FIELD(VAS_XLATE_LPCR_TC, val, lpcr & LPCR_TC);
+ val = SET_FIELD(VAS_XLATE_LPCR_SC, val, 0);
+ write_hvwc_reg(window, VREG(XLATE_LPCR), val);
+
+ /*
+ * Section 1.3.1 (Address translation Context) of NMMU workbook.
+ * 0b00 Hashed Page Table mode
+ * 0b01 Reserved
+ * 0b10 Radix on HPT
+ * 0b11 Radix on Radix
+ */
+ val = 0ULL;
+ val = SET_FIELD(VAS_XLATE_MODE, val, radix_enabled() ? 3 : 2);
+ write_hvwc_reg(window, VREG(XLATE_CTL), val);
+
+ /*
+ * TODO: Can we mfspr(AMR) even for user windows?
+ */
+ val = 0ULL;
+ val = SET_FIELD(VAS_AMR, val, mfspr(SPRN_AMR));
+ write_hvwc_reg(window, VREG(AMR), val);
+
+ val = 0ULL;
+ val = SET_FIELD(VAS_SEIDR, val, 0);
+ write_hvwc_reg(window, VREG(SEIDR), val);
+}
+
+/*
+ * Initialize Reserved Send Buffer Count for the send window. It involves
+ * writing to the register and reading it back to confirm that the hardware
+ * has enough buffers to reserve. See section 1.3.1.2.1 of VAS workbook.
+ *
+ * Since we can only make a best-effort attempt to fulfill the request,
+ * we don't return any errors if we cannot.
+ *
+ * TODO: Reserved (aka dedicated) send buffers are not supported yet.
+ */
+static void init_rsvd_tx_buf_count(struct vas_window *txwin,
+ struct vas_winctx *winctx)
+{
+ write_hvwc_reg(txwin, VREG(TX_RSVD_BUF_COUNT), 0ULL);
+}
+
+/*
+ * init_winctx_regs()
+ * Initialize window context registers for a receive window.
+ * Except for caching control and marking window open, the registers
+ * are initialized in the order listed in Section 3.1.4 (Window Context
+ * Cache Register Details) of the VAS workbook although they don't need
+ * to be.
+ *
+ * Design note: For NX receive windows, NX allocates the FIFO buffer in OPAL
+ * (so that it can get a large contiguous area) and passes that buffer
+ * to the kernel via the device tree. We now write that buffer address to the
+ * FIFO BAR. Would it make sense to do this all in OPAL? i.e. have OPAL
+ * write the per-chip RX FIFO addresses to the windows during boot-up
+ * as a one-time task? That could work for NX but what about other
+ * receivers? Let the receivers tell us the rx-fifo buffers for now.
+ */
+int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx)
+{
+ u64 val;
+ int fifo_size;
+
+ reset_window_regs(window);
+
+ val = 0ULL;
+ val = SET_FIELD(VAS_LPID, val, winctx->lpid);
+ write_hvwc_reg(window, VREG(LPID), val);
+
+ val = 0ULL;
+ val = SET_FIELD(VAS_PID_ID, val, winctx->pidr);
+ write_hvwc_reg(window, VREG(PID), val);
+
+ init_xlate_regs(window, winctx->user_win);
+
+ val = 0ULL;
+ val = SET_FIELD(VAS_FAULT_TX_WIN, val, 0);
+ write_hvwc_reg(window, VREG(FAULT_TX_WIN), val);
+
+ /* In PowerNV, interrupts go to HV. */
+ write_hvwc_reg(window, VREG(OSU_INTR_SRC_RA), 0ULL);
+
+ val = 0ULL;
+ val = SET_FIELD(VAS_HV_INTR_SRC_RA, val, winctx->irq_port);
+ write_hvwc_reg(window, VREG(HV_INTR_SRC_RA), val);
+
+ val = 0ULL;
+ val = SET_FIELD(VAS_PSWID_EA_HANDLE, val, winctx->pswid);
+ write_hvwc_reg(window, VREG(PSWID), val);
+
+ write_hvwc_reg(window, VREG(SPARE1), 0ULL);
+ write_hvwc_reg(window, VREG(SPARE2), 0ULL);
+ write_hvwc_reg(window, VREG(SPARE3), 0ULL);
+
+ /*
+ * NOTE: VAS expects the FIFO address to be copied into the LFIFO_BAR
+ * register as is - do NOT shift the address into VAS_LFIFO_BAR
+ * bit fields! Ok to set the page migration select fields -
+ * VAS ignores the lower 10+ bits in the address anyway, because
+ * the minimum FIFO size is 1K?
+ *
+ * See also: Design note in function header.
+ */
+ val = __pa(winctx->rx_fifo);
+ val = SET_FIELD(VAS_PAGE_MIGRATION_SELECT, val, 0);
+ write_hvwc_reg(window, VREG(LFIFO_BAR), val);
+
+ val = 0ULL;
+ val = SET_FIELD(VAS_LDATA_STAMP, val, winctx->data_stamp);
+ write_hvwc_reg(window, VREG(LDATA_STAMP_CTL), val);
+
+ val = 0ULL;
+ val = SET_FIELD(VAS_LDMA_TYPE, val, winctx->dma_type);
+ val = SET_FIELD(VAS_LDMA_FIFO_DISABLE, val, winctx->fifo_disable);
+ write_hvwc_reg(window, VREG(LDMA_CACHE_CTL), val);
+
+ write_hvwc_reg(window, VREG(LRFIFO_PUSH), 0ULL);
+ write_hvwc_reg(window, VREG(CURR_MSG_COUNT), 0ULL);
+ write_hvwc_reg(window, VREG(LNOTIFY_AFTER_COUNT), 0ULL);
+
+ val = 0ULL;
+ val = SET_FIELD(VAS_LRX_WCRED, val, winctx->wcreds_max);
+ write_hvwc_reg(window, VREG(LRX_WCRED), val);
+
+ val = 0ULL;
+ val = SET_FIELD(VAS_TX_WCRED, val, winctx->wcreds_max);
+ write_hvwc_reg(window, VREG(TX_WCRED), val);
+
+ write_hvwc_reg(window, VREG(LRX_WCRED_ADDER), 0ULL);
+ write_hvwc_reg(window, VREG(TX_WCRED_ADDER), 0ULL);
+
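+ /*
+ * LFIFO_SIZE is written as the log2 of the FIFO size in KB; e.g. an
+ * (illustrative) 32KB rx_fifo_size is written as ilog2(32) = 5.
+ */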
+ fifo_size = winctx->rx_fifo_size / 1024;
+
+ val = 0ULL;
+ val = SET_FIELD(VAS_LFIFO_SIZE, val, ilog2(fifo_size));
+ write_hvwc_reg(window, VREG(LFIFO_SIZE), val);
+
+ /* Update window control and caching control registers last so
+ * we mark the window open only after fully initializing it and
+ * pushing context to cache.
+ */
+
+ write_hvwc_reg(window, VREG(WIN_STATUS), 0ULL);
+
+ init_rsvd_tx_buf_count(window, winctx);
+
+ /* for a send window, point to the matching receive window */
+ val = 0ULL;
+ val = SET_FIELD(VAS_LRX_WIN_ID, val, winctx->rx_win_id);
+ write_hvwc_reg(window, VREG(LRFIFO_WIN_PTR), val);
+
+ write_hvwc_reg(window, VREG(SPARE4), 0ULL);
+
+ val = 0ULL;
+ val = SET_FIELD(VAS_NOTIFY_DISABLE, val, winctx->notify_disable);
+ val = SET_FIELD(VAS_INTR_DISABLE, val, winctx->intr_disable);
+ val = SET_FIELD(VAS_NOTIFY_EARLY, val, winctx->notify_early);
+ val = SET_FIELD(VAS_NOTIFY_OSU_INTR, val, winctx->notify_os_intr_reg);
+ write_hvwc_reg(window, VREG(LNOTIFY_CTL), val);
+
+ val = 0ULL;
+ val = SET_FIELD(VAS_LNOTIFY_PID, val, winctx->lnotify_pid);
+ write_hvwc_reg(window, VREG(LNOTIFY_PID), val);
+
+ val = 0ULL;
+ val = SET_FIELD(VAS_LNOTIFY_LPID, val, winctx->lnotify_lpid);
+ write_hvwc_reg(window, VREG(LNOTIFY_LPID), val);
+
+ val = 0ULL;
+ val = SET_FIELD(VAS_LNOTIFY_TID, val, winctx->lnotify_tid);
+ write_hvwc_reg(window, VREG(LNOTIFY_TID), val);
+
+ val = 0ULL;
+ val = SET_FIELD(VAS_LNOTIFY_MIN_SCOPE, val, winctx->min_scope);
+ val = SET_FIELD(VAS_LNOTIFY_MAX_SCOPE, val, winctx->max_scope);
+ write_hvwc_reg(window, VREG(LNOTIFY_SCOPE), val);
+
+ /* Skip read-only registers NX_UTIL and NX_UTIL_SE */
+
+ write_hvwc_reg(window, VREG(SPARE5), 0ULL);
+ write_hvwc_reg(window, VREG(NX_UTIL_ADDER), 0ULL);
+ write_hvwc_reg(window, VREG(SPARE6), 0ULL);
+
+ /* Finally, push window context to memory and... */
+ val = 0ULL;
+ val = SET_FIELD(VAS_PUSH_TO_MEM, val, 1);
+ write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), val);
+
+ /* ... mark the window open for business */
+ val = 0ULL;
+ val = SET_FIELD(VAS_WINCTL_REJ_NO_CREDIT, val, winctx->rej_no_credit);
+ val = SET_FIELD(VAS_WINCTL_PIN, val, winctx->pin_win);
+ val = SET_FIELD(VAS_WINCTL_TX_WCRED_MODE, val, winctx->tx_wcred_mode);
+ val = SET_FIELD(VAS_WINCTL_RX_WCRED_MODE, val, winctx->rx_wcred_mode);
+ val = SET_FIELD(VAS_WINCTL_TX_WORD_MODE, val, winctx->tx_word_mode);
+ val = SET_FIELD(VAS_WINCTL_RX_WORD_MODE, val, winctx->rx_word_mode);
+ val = SET_FIELD(VAS_WINCTL_FAULT_WIN, val, winctx->fault_win);
+ val = SET_FIELD(VAS_WINCTL_NX_WIN, val, winctx->nx_win);
+ val = SET_FIELD(VAS_WINCTL_OPEN, val, 1);
+ write_hvwc_reg(window, VREG(WINCTL), val);
+
+ return 0;
+}
+
+static void vas_release_window_id(struct ida *ida, int winid)
+{
+ ida_free(ida, winid);
+}
+
+static int vas_assign_window_id(struct ida *ida)
+{
+ int winid = ida_alloc_max(ida, VAS_WINDOWS_PER_CHIP - 1, GFP_KERNEL);
+
+ if (winid == -ENOSPC) {
+ pr_err("Too many (%d) open windows\n", VAS_WINDOWS_PER_CHIP);
+ return -EAGAIN;
+ }
+
+ return winid;
+}
+
+static void vas_window_free(struct vas_window *window)
+{
+ int winid = window->winid;
+ struct vas_instance *vinst = window->vinst;
+
+ unmap_winctx_mmio_bars(window);
+
+ vas_window_free_dbgdir(window);
+
+ kfree(window);
+
+ vas_release_window_id(&vinst->ida, winid);
+}
+
+static struct vas_window *vas_window_alloc(struct vas_instance *vinst)
+{
+ int winid;
+ struct vas_window *window;
+
+ winid = vas_assign_window_id(&vinst->ida);
+ if (winid < 0)
+ return ERR_PTR(winid);
+
+ window = kzalloc(sizeof(*window), GFP_KERNEL);
+ if (!window)
+ goto out_free;
+
+ window->vinst = vinst;
+ window->winid = winid;
+
+ if (map_winctx_mmio_bars(window))
+ goto out_free;
+
+ vas_window_init_dbgdir(window);
+
+ return window;
+
+out_free:
+ kfree(window);
+ vas_release_window_id(&vinst->ida, winid);
+ return ERR_PTR(-ENOMEM);
+}
+
+static void put_rx_win(struct vas_window *rxwin)
+{
+ /* Better not be a send window! */
+ WARN_ON_ONCE(rxwin->tx_win);
+
+ atomic_dec(&rxwin->num_txwins);
+}
+
+/*
+ * Find the user space receive window given the @pswid.
+ * - We must have a valid vasid and it must belong to this instance.
+ * (so both send and receive windows are on the same VAS instance)
+ * - The window must refer to an OPEN, FTW, RECEIVE window.
+ *
+ * NOTE: We access ->windows[] table and assume that vinst->mutex is held.
+ */
+static struct vas_window *get_user_rxwin(struct vas_instance *vinst, u32 pswid)
+{
+ int vasid, winid;
+ struct vas_window *rxwin;
+
+ decode_pswid(pswid, &vasid, &winid);
+
+ if (vinst->vas_id != vasid)
+ return ERR_PTR(-EINVAL);
+
+ rxwin = vinst->windows[winid];
+
+ if (!rxwin || rxwin->tx_win || rxwin->cop != VAS_COP_TYPE_FTW)
+ return ERR_PTR(-EINVAL);
+
+ return rxwin;
+}
+
+/*
+ * Get the VAS receive window associated with NX engine identified
+ * by @cop and if applicable, @pswid.
+ *
+ * See also function header of set_vinst_win().
+ */
+static struct vas_window *get_vinst_rxwin(struct vas_instance *vinst,
+ enum vas_cop_type cop, u32 pswid)
+{
+ struct vas_window *rxwin;
+
+ mutex_lock(&vinst->mutex);
+
+ if (cop == VAS_COP_TYPE_FTW)
+ rxwin = get_user_rxwin(vinst, pswid);
+ else
+ rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL);
+
+ if (!IS_ERR(rxwin))
+ atomic_inc(&rxwin->num_txwins);
+
+ mutex_unlock(&vinst->mutex);
+
+ return rxwin;
+}
+
+/*
+ * We have two tables of windows in a VAS instance. The first one,
+ * ->windows[], contains all the windows in the instance and allows
+ * looking up a window by its id. It is used to look up send windows
+ * during fault handling and receive windows when pairing user space
+ * send/receive windows.
+ *
+ * The second table, ->rxwin[], contains receive windows that are
+ * associated with NX engines. This table has VAS_COP_TYPE_MAX
+ * entries and is used to look up a receive window by its
+ * coprocessor type.
+ *
+ * Here, we save @window in the ->windows[] table. If it is a receive
+ * window, we also save the window in the ->rxwin[] table.
+ */
+static void set_vinst_win(struct vas_instance *vinst,
+ struct vas_window *window)
+{
+ int id = window->winid;
+
+ mutex_lock(&vinst->mutex);
+
+ /*
+ * There should only be one receive window for a coprocessor type
+ * unless it's a user (FTW) window.
+ */
+ if (!window->user_win && !window->tx_win) {
+ WARN_ON_ONCE(vinst->rxwin[window->cop]);
+ vinst->rxwin[window->cop] = window;
+ }
+
+ WARN_ON_ONCE(vinst->windows[id] != NULL);
+ vinst->windows[id] = window;
+
+ mutex_unlock(&vinst->mutex);
+}
+
+/*
+ * Clear this window from the table(s) of windows for this VAS instance.
+ * See also function header of set_vinst_win().
+ */
+static void clear_vinst_win(struct vas_window *window)
+{
+ int id = window->winid;
+ struct vas_instance *vinst = window->vinst;
+
+ mutex_lock(&vinst->mutex);
+
+ if (!window->user_win && !window->tx_win) {
+ WARN_ON_ONCE(!vinst->rxwin[window->cop]);
+ vinst->rxwin[window->cop] = NULL;
+ }
+
+ WARN_ON_ONCE(vinst->windows[id] != window);
+ vinst->windows[id] = NULL;
+
+ mutex_unlock(&vinst->mutex);
+}
+
+static void init_winctx_for_rxwin(struct vas_window *rxwin,
+ struct vas_rx_win_attr *rxattr,
+ struct vas_winctx *winctx)
+{
+ /*
+ * We first zero (memset()) all fields and only set non-zero fields.
+ * The following fields are 0/false but may deserve a comment:
+ *
+ * ->notify_os_intr_reg In powerNV, send intrs to HV
+ * ->notify_disable False for NX windows
+ * ->intr_disable False for Fault Windows
+ * ->xtra_write False for NX windows
+ * ->notify_early NA for NX windows
+ * ->rsvd_txbuf_count NA for Rx windows
+ * ->lpid, ->pid, ->tid NA for Rx windows
+ */
+
+ memset(winctx, 0, sizeof(struct vas_winctx));
+
+ winctx->rx_fifo = rxattr->rx_fifo;
+ winctx->rx_fifo_size = rxattr->rx_fifo_size;
+ winctx->wcreds_max = rxwin->wcreds_max;
+ winctx->pin_win = rxattr->pin_win;
+
+ winctx->nx_win = rxattr->nx_win;
+ winctx->fault_win = rxattr->fault_win;
+ winctx->user_win = rxattr->user_win;
+ winctx->rej_no_credit = rxattr->rej_no_credit;
+ winctx->rx_word_mode = rxattr->rx_win_ord_mode;
+ winctx->tx_word_mode = rxattr->tx_win_ord_mode;
+ winctx->rx_wcred_mode = rxattr->rx_wcred_mode;
+ winctx->tx_wcred_mode = rxattr->tx_wcred_mode;
+ winctx->notify_early = rxattr->notify_early;
+
+ if (winctx->nx_win) {
+ winctx->data_stamp = true;
+ winctx->intr_disable = true;
+ winctx->pin_win = true;
+
+ WARN_ON_ONCE(winctx->fault_win);
+ WARN_ON_ONCE(!winctx->rx_word_mode);
+ WARN_ON_ONCE(!winctx->tx_word_mode);
+ WARN_ON_ONCE(winctx->notify_after_count);
+ } else if (winctx->fault_win) {
+ winctx->notify_disable = true;
+ } else if (winctx->user_win) {
+ /*
+ * Section 1.8.1 Low Latency Core-Core Wake up of
+ * the VAS workbook:
+ *
+ * - disable credit checks ([tr]x_wcred_mode = false)
+ * - disable FIFO writes
+ * - enable ASB_Notify, disable interrupt
+ */
+ winctx->fifo_disable = true;
+ winctx->intr_disable = true;
+ winctx->rx_fifo = NULL;
+ }
+
+ winctx->lnotify_lpid = rxattr->lnotify_lpid;
+ winctx->lnotify_pid = rxattr->lnotify_pid;
+ winctx->lnotify_tid = rxattr->lnotify_tid;
+ winctx->pswid = rxattr->pswid;
+ winctx->dma_type = VAS_DMA_TYPE_INJECT;
+ winctx->tc_mode = rxattr->tc_mode;
+
+ winctx->min_scope = VAS_SCOPE_LOCAL;
+ winctx->max_scope = VAS_SCOPE_VECTORED_GROUP;
+}
+
+static bool rx_win_args_valid(enum vas_cop_type cop,
+ struct vas_rx_win_attr *attr)
+{
+ pr_debug("Rxattr: fault %d, notify %d, intr %d, early %d, fifo %d\n",
+ attr->fault_win, attr->notify_disable,
+ attr->intr_disable, attr->notify_early,
+ attr->rx_fifo_size);
+
+ if (cop >= VAS_COP_TYPE_MAX)
+ return false;
+
+ if (cop != VAS_COP_TYPE_FTW &&
+ attr->rx_fifo_size < VAS_RX_FIFO_SIZE_MIN)
+ return false;
+
+ if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX)
+ return false;
+
+ if (attr->wcreds_max > VAS_RX_WCREDS_MAX)
+ return false;
+
+ if (attr->nx_win) {
+ /* cannot be fault or user window if it is nx */
+ if (attr->fault_win || attr->user_win)
+ return false;
+ /*
+ * Section 3.1.4.32: NX Windows must not disable notification,
+ * and must not enable interrupts or early notification.
+ */
+ if (attr->notify_disable || !attr->intr_disable ||
+ attr->notify_early)
+ return false;
+ } else if (attr->fault_win) {
+ /* cannot be both fault and user window */
+ if (attr->user_win)
+ return false;
+
+ /*
+ * Section 3.1.4.32: Fault windows must disable notification
+ * but not interrupts.
+ */
+ if (!attr->notify_disable || attr->intr_disable)
+ return false;
+
+ } else if (attr->user_win) {
+ /*
+ * User receive windows are only for fast-thread-wakeup
+ * (FTW). They don't need a FIFO and must disable interrupts.
+ */
+ if (attr->rx_fifo || attr->rx_fifo_size || !attr->intr_disable)
+ return false;
+ } else {
+ /* Rx window must be one of NX or Fault or User window. */
+ return false;
+ }
+
+ return true;
+}
+
+void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr, enum vas_cop_type cop)
+{
+ memset(rxattr, 0, sizeof(*rxattr));
+
+ if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) {
+ rxattr->pin_win = true;
+ rxattr->nx_win = true;
+ rxattr->fault_win = false;
+ rxattr->intr_disable = true;
+ rxattr->rx_wcred_mode = true;
+ rxattr->tx_wcred_mode = true;
+ rxattr->rx_win_ord_mode = true;
+ rxattr->tx_win_ord_mode = true;
+ } else if (cop == VAS_COP_TYPE_FAULT) {
+ rxattr->pin_win = true;
+ rxattr->fault_win = true;
+ rxattr->notify_disable = true;
+ rxattr->rx_wcred_mode = true;
+ rxattr->tx_wcred_mode = true;
+ rxattr->rx_win_ord_mode = true;
+ rxattr->tx_win_ord_mode = true;
+ } else if (cop == VAS_COP_TYPE_FTW) {
+ rxattr->user_win = true;
+ rxattr->intr_disable = true;
+
+ /*
+ * As noted in the VAS Workbook we disable credit checks.
+ * If we enable credit checks in the future, we must also
+ * implement a mechanism to return the user credits or new
+ * paste operations will fail.
+ */
+ }
+}
+EXPORT_SYMBOL_GPL(vas_init_rx_win_attr);
+
+struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop,
+ struct vas_rx_win_attr *rxattr)
+{
+ struct vas_window *rxwin;
+ struct vas_winctx winctx;
+ struct vas_instance *vinst;
+
+ trace_vas_rx_win_open(current, vasid, cop, rxattr);
+
+ if (!rx_win_args_valid(cop, rxattr))
+ return ERR_PTR(-EINVAL);
+
+ vinst = find_vas_instance(vasid);
+ if (!vinst) {
+ pr_devel("vasid %d not found!\n", vasid);
+ return ERR_PTR(-EINVAL);
+ }
+ pr_devel("Found instance %d\n", vasid);
+
+ rxwin = vas_window_alloc(vinst);
+ if (IS_ERR(rxwin)) {
+ pr_devel("Unable to allocate memory for Rx window\n");
+ return rxwin;
+ }
+
+ rxwin->tx_win = false;
+ rxwin->nx_win = rxattr->nx_win;
+ rxwin->user_win = rxattr->user_win;
+ rxwin->cop = cop;
+ rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT;
+ if (rxattr->user_win)
+ rxwin->pid = task_pid_vnr(current);
+
+ init_winctx_for_rxwin(rxwin, rxattr, &winctx);
+ init_winctx_regs(rxwin, &winctx);
+
+ set_vinst_win(vinst, rxwin);
+
+ return rxwin;
+}
+EXPORT_SYMBOL_GPL(vas_rx_win_open);
+
+void vas_init_tx_win_attr(struct vas_tx_win_attr *txattr, enum vas_cop_type cop)
+{
+ memset(txattr, 0, sizeof(*txattr));
+
+ if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) {
+ txattr->rej_no_credit = false;
+ txattr->rx_wcred_mode = true;
+ txattr->tx_wcred_mode = true;
+ txattr->rx_win_ord_mode = true;
+ txattr->tx_win_ord_mode = true;
+ } else if (cop == VAS_COP_TYPE_FTW) {
+ txattr->user_win = true;
+ }
+}
+EXPORT_SYMBOL_GPL(vas_init_tx_win_attr);
+
+static void init_winctx_for_txwin(struct vas_window *txwin,
+ struct vas_tx_win_attr *txattr,
+ struct vas_winctx *winctx)
+{
+ /*
+ * We first zero all fields and only set non-zero ones. The following
+ * fields are set to 0/false for the stated reasons:
+ *
+ * ->notify_os_intr_reg In powernv, send intrs to HV
+ * ->rsvd_txbuf_count Not supported yet.
+ * ->notify_disable False for NX windows
+ * ->xtra_write False for NX windows
+ * ->notify_early NA for NX windows
+ * ->lnotify_lpid NA for Tx windows
+ * ->lnotify_pid NA for Tx windows
+ * ->lnotify_tid NA for Tx windows
+ * ->tx_win_cred_mode Ignore for now for NX windows
+ * ->rx_win_cred_mode Ignore for now for NX windows
+ */
+ memset(winctx, 0, sizeof(struct vas_winctx));
+
+ winctx->wcreds_max = txwin->wcreds_max;
+
+ winctx->user_win = txattr->user_win;
+ winctx->nx_win = txwin->rxwin->nx_win;
+ winctx->pin_win = txattr->pin_win;
+ winctx->rej_no_credit = txattr->rej_no_credit;
+ winctx->rsvd_txbuf_enable = txattr->rsvd_txbuf_enable;
+
+ winctx->rx_wcred_mode = txattr->rx_wcred_mode;
+ winctx->tx_wcred_mode = txattr->tx_wcred_mode;
+ winctx->rx_word_mode = txattr->rx_win_ord_mode;
+ winctx->tx_word_mode = txattr->tx_win_ord_mode;
+ winctx->rsvd_txbuf_count = txattr->rsvd_txbuf_count;
+
+ winctx->intr_disable = true;
+ if (winctx->nx_win)
+ winctx->data_stamp = true;
+
+ winctx->lpid = txattr->lpid;
+ winctx->pidr = txattr->pidr;
+ winctx->rx_win_id = txwin->rxwin->winid;
+
+ winctx->dma_type = VAS_DMA_TYPE_INJECT;
+ winctx->tc_mode = txattr->tc_mode;
+ winctx->min_scope = VAS_SCOPE_LOCAL;
+ winctx->max_scope = VAS_SCOPE_VECTORED_GROUP;
+
+ winctx->pswid = 0;
+}
+
+static bool tx_win_args_valid(enum vas_cop_type cop,
+ struct vas_tx_win_attr *attr)
+{
+ if (attr->tc_mode != VAS_THRESH_DISABLED)
+ return false;
+
+ if (cop > VAS_COP_TYPE_MAX)
+ return false;
+
+ if (attr->wcreds_max > VAS_TX_WCREDS_MAX)
+ return false;
+
+ if (attr->user_win &&
+ (cop != VAS_COP_TYPE_FTW || attr->rsvd_txbuf_count))
+ return false;
+
+ return true;
+}
+
+struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
+ struct vas_tx_win_attr *attr)
+{
+ int rc;
+ struct vas_window *txwin;
+ struct vas_window *rxwin;
+ struct vas_winctx winctx;
+ struct vas_instance *vinst;
+
+ trace_vas_tx_win_open(current, vasid, cop, attr);
+
+ if (!tx_win_args_valid(cop, attr))
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * If caller did not specify a vasid but specified the PSWID of a
+ * receive window (applicable only to FTW windows), use the vasid
+ * from that receive window.
+ */
+ if (vasid == -1 && attr->pswid)
+ decode_pswid(attr->pswid, &vasid, NULL);
+
+ vinst = find_vas_instance(vasid);
+ if (!vinst) {
+ pr_devel("vasid %d not found!\n", vasid);
+ return ERR_PTR(-EINVAL);
+ }
+
+ rxwin = get_vinst_rxwin(vinst, cop, attr->pswid);
+ if (IS_ERR(rxwin)) {
+ pr_devel("No RxWin for vasid %d, cop %d\n", vasid, cop);
+ return rxwin;
+ }
+
+ txwin = vas_window_alloc(vinst);
+ if (IS_ERR(txwin)) {
+ rc = PTR_ERR(txwin);
+ goto put_rxwin;
+ }
+
+ txwin->cop = cop;
+ txwin->tx_win = 1;
+ txwin->rxwin = rxwin;
+ txwin->nx_win = txwin->rxwin->nx_win;
+ txwin->pid = attr->pid;
+ txwin->user_win = attr->user_win;
+ txwin->wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT;
+
+ init_winctx_for_txwin(txwin, attr, &winctx);
+
+ init_winctx_regs(txwin, &winctx);
+
+ /*
+ * If its a kernel send window, map the window address into the
+ * kernel's address space. For user windows, user must issue an
+ * mmap() to map the window into their address space.
+ *
+ * NOTE: If kernel ever resubmits a user CRB after handling a page
+ * fault, we will need to map this into kernel as well.
+ */
+ if (!txwin->user_win) {
+ txwin->paste_kaddr = map_paste_region(txwin);
+ if (IS_ERR(txwin->paste_kaddr)) {
+ rc = PTR_ERR(txwin->paste_kaddr);
+ goto free_window;
+ }
+ } else {
+ /*
+ * A user mapping must ensure that context switch issues
+ * CP_ABORT for this thread.
+ */
+ rc = set_thread_uses_vas();
+ if (rc)
+ goto free_window;
+ }
+
+ set_vinst_win(vinst, txwin);
+
+ return txwin;
+
+free_window:
+ vas_window_free(txwin);
+
+put_rxwin:
+ put_rx_win(rxwin);
+ return ERR_PTR(rc);
+
+}
+EXPORT_SYMBOL_GPL(vas_tx_win_open);
+
+int vas_copy_crb(void *crb, int offset)
+{
+ return vas_copy(crb, offset);
+}
+EXPORT_SYMBOL_GPL(vas_copy_crb);
+
+#define RMA_LSMP_REPORT_ENABLE PPC_BIT(53)
+int vas_paste_crb(struct vas_window *txwin, int offset, bool re)
+{
+ int rc;
+ void *addr;
+ uint64_t val;
+
+ trace_vas_paste_crb(current, txwin);
+
+ /*
+ * Only NX windows are supported for now and the hardware assumes the
+ * report-enable flag is set for NX windows. Ensure software
+ * complies too.
+ */
+ WARN_ON_ONCE(txwin->nx_win && !re);
+
+ addr = txwin->paste_kaddr;
+ if (re) {
+ /*
+ * Set the REPORT_ENABLE bit (equivalent to writing
+ * to 1K offset of the paste address)
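+ * since RMA_LSMP_REPORT_ENABLE is PPC_BIT(53), i.e. 1ULL << (63 - 53)
+ * = 0x400.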
+ */
+ val = SET_FIELD(RMA_LSMP_REPORT_ENABLE, 0ULL, 1);
+ addr += val;
+ }
+
+ /*
+ * Map the raw CR value from vas_paste() to an error code (there
+ * is just pass or fail for now though).
+ */
+ rc = vas_paste(addr, offset);
+ if (rc == 2)
+ rc = 0;
+ else
+ rc = -EINVAL;
+
+ pr_debug("Txwin #%d: Msg count %llu\n", txwin->winid,
+ read_hvwc_reg(txwin, VREG(LRFIFO_PUSH)));
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(vas_paste_crb);
+
+/*
+ * If credit checking is enabled for this window, poll for the return
+ * of window credits (i.e. for NX engines to process any outstanding CRBs).
+ * Since NX-842 waits for the CRBs to be processed before closing the
+ * window, we should not have to wait for too long.
+ *
+ * TODO: We retry in 10ms intervals now. We could/should probably peek at
+ * the VAS_LRFIFO_PUSH_OFFSET register to get an estimate of pending
+ * CRBs on the FIFO and compute the delay dynamically on each retry.
+ * But that is not really needed until we support NX-GZIP access from
+ * user space. (NX-842 driver waits for CSB and Fast thread-wakeup
+ * doesn't use credit checking).
+ */
+static void poll_window_credits(struct vas_window *window)
+{
+ u64 val;
+ int creds, mode;
+
+ val = read_hvwc_reg(window, VREG(WINCTL));
+ if (window->tx_win)
+ mode = GET_FIELD(VAS_WINCTL_TX_WCRED_MODE, val);
+ else
+ mode = GET_FIELD(VAS_WINCTL_RX_WCRED_MODE, val);
+
+ if (!mode)
+ return;
+retry:
+ if (window->tx_win) {
+ val = read_hvwc_reg(window, VREG(TX_WCRED));
+ creds = GET_FIELD(VAS_TX_WCRED, val);
+ } else {
+ val = read_hvwc_reg(window, VREG(LRX_WCRED));
+ creds = GET_FIELD(VAS_LRX_WCRED, val);
+ }
+
+ if (creds < window->wcreds_max) {
+ val = 0;
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(msecs_to_jiffies(10));
+ goto retry;
+ }
+}
+
+/*
+ * Wait for the window to go to "not-busy" state. It should only take a
+ * short time to queue a CRB, so the window should not be busy for too
+ * long. We retry at 5ms intervals.
+ */
+static void poll_window_busy_state(struct vas_window *window)
+{
+ int busy;
+ u64 val;
+
+retry:
+ val = read_hvwc_reg(window, VREG(WIN_STATUS));
+ busy = GET_FIELD(VAS_WIN_BUSY, val);
+ if (busy) {
+ val = 0;
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(msecs_to_jiffies(5));
+ goto retry;
+ }
+}
+
+/*
+ * Have the hardware cast a window out of cache and wait for it to
+ * be completed.
+ *
+ * NOTE: It can take a relatively long time to cast the window context
+ * out of the cache. It is not strictly necessary to cast out if:
+ *
+ * - we clear the "Pin Window" bit (so hardware is free to evict)
+ *
+ * - we re-initialize the window context when it is reassigned.
+ *
+ * We do the former in vas_win_close() and the latter in vas_win_open().
+ * So, ignoring the cast-out for now. We can add it as needed. If
+ * casting out becomes necessary we should consider offloading the
+ * job to a worker thread, so the window close can proceed quickly.
+ */
+static void poll_window_castout(struct vas_window *window)
+{
+ /* stub for now */
+}
+
+/*
+ * Unpin and close a window so no new requests are accepted and the
+ * hardware can evict this window from cache if necessary.
+ */
+static void unpin_close_window(struct vas_window *window)
+{
+ u64 val;
+
+ val = read_hvwc_reg(window, VREG(WINCTL));
+ val = SET_FIELD(VAS_WINCTL_PIN, val, 0);
+ val = SET_FIELD(VAS_WINCTL_OPEN, val, 0);
+ write_hvwc_reg(window, VREG(WINCTL), val);
+}
+
+/*
+ * Close a window.
+ *
+ * See Section 1.12.1 of VAS workbook v1.05 for details on closing a window:
+ * - Disable new paste operations (unmap paste address)
+ * - Poll for the "Window Busy" bit to be cleared
+ * - Clear the Open/Enable bit for the Window.
+ * - Poll for return of window Credits (implies FIFO empty for Rx win?)
+ * - Unpin and cast window context out of cache
+ *
+ * Besides the hardware, the kernel has some bookkeeping to do, of course.
+ */
+int vas_win_close(struct vas_window *window)
+{
+ if (!window)
+ return 0;
+
+ if (!window->tx_win && atomic_read(&window->num_txwins) != 0) {
+ pr_devel("Attempting to close an active Rx window!\n");
+ WARN_ON_ONCE(1);
+ return -EBUSY;
+ }
+
+ unmap_paste_region(window);
+
+ clear_vinst_win(window);
+
+ poll_window_busy_state(window);
+
+ unpin_close_window(window);
+
+ poll_window_credits(window);
+
+ poll_window_castout(window);
+
+ /* if send window, drop reference to matching receive window */
+ if (window->tx_win)
+ put_rx_win(window->rxwin);
+
+ vas_window_free(window);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vas_win_close);
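+
+/*
+ * Example usage (a minimal sketch, not taken from any in-tree driver;
+ * error handling is elided, and the rx_fifo buffer, its size and the
+ * crb pointer are placeholders supplied by the caller): an in-kernel
+ * user such as the NX-842 driver pairs a receive window with a send
+ * window on the same chip and then submits CRBs through the send window.
+ *
+ *     vasid = chip_to_vas_id(chip_id);
+ *
+ *     vas_init_rx_win_attr(&rxattr, VAS_COP_TYPE_842);
+ *     rxattr.rx_fifo = rx_fifo_buf;
+ *     rxattr.rx_fifo_size = rx_fifo_size;
+ *     rxwin = vas_rx_win_open(vasid, VAS_COP_TYPE_842, &rxattr);
+ *
+ *     vas_init_tx_win_attr(&txattr, VAS_COP_TYPE_842);
+ *     txwin = vas_tx_win_open(vasid, VAS_COP_TYPE_842, &txattr);
+ *
+ *     vas_copy_crb(crb, 0);
+ *     vas_paste_crb(txwin, 0, true);
+ *
+ *     vas_win_close(txwin);
+ *     vas_win_close(rxwin);
+ */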
+
+/*
+ * Return a system-wide unique window id for the window @win.
+ */
+u32 vas_win_id(struct vas_window *win)
+{
+ return encode_pswid(win->vinst->vas_id, win->winid);
+}
+EXPORT_SYMBOL_GPL(vas_win_id);
diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c
new file mode 100644
index 000000000..5a2b24cbb
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright 2016-17 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "vas: " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/platform_device.h>
+#include <linux/of_platform.h>
+#include <linux/of_address.h>
+#include <linux/of.h>
+#include <asm/prom.h>
+
+#include "vas.h"
+
+DEFINE_MUTEX(vas_mutex);
+static LIST_HEAD(vas_instances);
+
+static DEFINE_PER_CPU(int, cpu_vas_id);
+
+static int init_vas_instance(struct platform_device *pdev)
+{
+ int rc, cpu, vasid;
+ struct resource *res;
+ struct vas_instance *vinst;
+ struct device_node *dn = pdev->dev.of_node;
+
+ rc = of_property_read_u32(dn, "ibm,vas-id", &vasid);
+ if (rc) {
+ pr_err("No ibm,vas-id property for %s?\n", pdev->name);
+ return -ENODEV;
+ }
+
+ if (pdev->num_resources != 4) {
+ pr_err("Unexpected DT configuration for [%s, %d]\n",
+ pdev->name, vasid);
+ return -ENODEV;
+ }
+
+ vinst = kzalloc(sizeof(*vinst), GFP_KERNEL);
+ if (!vinst)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&vinst->node);
+ ida_init(&vinst->ida);
+ mutex_init(&vinst->mutex);
+ vinst->vas_id = vasid;
+ vinst->pdev = pdev;
+
+ res = &pdev->resource[0];
+ vinst->hvwc_bar_start = res->start;
+
+ res = &pdev->resource[1];
+ vinst->uwc_bar_start = res->start;
+
+ res = &pdev->resource[2];
+ vinst->paste_base_addr = res->start;
+
+ res = &pdev->resource[3];
+ if (res->end > 62) {
+ pr_err("Bad 'paste_win_id_shift' in DT, %llx\n", res->end);
+ goto free_vinst;
+ }
+
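+ /*
+ * E.g. an (illustrative) res->end of 47 yields
+ * paste_win_id_shift = 63 - 47 = 16.
+ */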
+ vinst->paste_win_id_shift = 63 - res->end;
+
+ pr_devel("Initialized instance [%s, %d], paste_base 0x%llx, "
+ "paste_win_id_shift 0x%llx\n", pdev->name, vasid,
+ vinst->paste_base_addr, vinst->paste_win_id_shift);
+
+ for_each_possible_cpu(cpu) {
+ if (cpu_to_chip_id(cpu) == of_get_ibm_chip_id(dn))
+ per_cpu(cpu_vas_id, cpu) = vasid;
+ }
+
+ mutex_lock(&vas_mutex);
+ list_add(&vinst->node, &vas_instances);
+ mutex_unlock(&vas_mutex);
+
+ vas_instance_init_dbgdir(vinst);
+
+ dev_set_drvdata(&pdev->dev, vinst);
+
+ return 0;
+
+free_vinst:
+ kfree(vinst);
+ return -ENODEV;
+
+}
+
+/*
+ * Although this is read/used multiple times, it is written to only
+ * during initialization.
+ */
+struct vas_instance *find_vas_instance(int vasid)
+{
+ struct list_head *ent;
+ struct vas_instance *vinst;
+
+ mutex_lock(&vas_mutex);
+
+ if (vasid == -1)
+ vasid = per_cpu(cpu_vas_id, smp_processor_id());
+
+ list_for_each(ent, &vas_instances) {
+ vinst = list_entry(ent, struct vas_instance, node);
+ if (vinst->vas_id == vasid) {
+ mutex_unlock(&vas_mutex);
+ return vinst;
+ }
+ }
+ mutex_unlock(&vas_mutex);
+
+ pr_devel("Instance %d not found\n", vasid);
+ return NULL;
+}
+
+int chip_to_vas_id(int chipid)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu_to_chip_id(cpu) == chipid)
+ return per_cpu(cpu_vas_id, cpu);
+ }
+ return -1;
+}
+EXPORT_SYMBOL(chip_to_vas_id);
+
+static int vas_probe(struct platform_device *pdev)
+{
+ return init_vas_instance(pdev);
+}
+
+static const struct of_device_id powernv_vas_match[] = {
+ { .compatible = "ibm,vas",},
+ {},
+};
+
+static struct platform_driver vas_driver = {
+ .driver = {
+ .name = "vas",
+ .of_match_table = powernv_vas_match,
+ },
+ .probe = vas_probe,
+};
+
+static int __init vas_init(void)
+{
+ int found = 0;
+ struct device_node *dn;
+
+ platform_driver_register(&vas_driver);
+
+ for_each_compatible_node(dn, NULL, "ibm,vas") {
+ of_platform_device_create(dn, NULL, NULL);
+ found++;
+ }
+
+ if (!found) {
+ platform_driver_unregister(&vas_driver);
+ return -ENODEV;
+ }
+
+ pr_devel("Found %d instances\n", found);
+
+ return 0;
+}
+device_initcall(vas_init);
diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h
new file mode 100644
index 000000000..f5493dbdd
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -0,0 +1,479 @@
+/*
+ * Copyright 2016-17 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _VAS_H
+#define _VAS_H
+#include <linux/atomic.h>
+#include <linux/idr.h>
+#include <asm/vas.h>
+#include <linux/io.h>
+#include <linux/dcache.h>
+#include <linux/mutex.h>
+#include <linux/stringify.h>
+
+/*
+ * Overview of Virtual Accelerator Switchboard (VAS).
+ *
+ * VAS is a hardware "switchboard" that allows senders and receivers to
+ * exchange messages with _minimal_ kernel involvement. The receivers are
+ * typically NX coprocessor engines that perform compression or encryption
+ * in hardware, but receivers can also be other software threads.
+ *
+ * Senders are user/kernel threads that submit compression/encryption or
+ * other requests to the receivers. Senders must format their messages as
+ * Coprocessor Request Blocks (CRBs) and submit them using the "copy" and
+ * "paste" instructions, which were introduced in Power9.
+ *
+ * A Power node can have (up to?) 8 Power chips. There is one instance of
+ * VAS in each Power9 chip. Each instance of VAS has 64K windows or ports.
+ * Senders and receivers must each connect to a separate window before they
+ * can exchange messages through the switchboard.
+ *
+ * Each window is described by two types of window contexts:
+ *
+ * Hypervisor Window Context (HVWC) of size VAS_HVWC_SIZE bytes
+ *
+ * OS/User Window Context (UWC) of size VAS_UWC_SIZE bytes.
+ *
+ * A window context can be viewed as a set of 64-bit registers. The settings
+ * in these registers configure/control/determine the behavior of the VAS
+ * hardware when messages are sent/received through the window. The registers
+ * in the HVWC are configured by the kernel while the registers in the UWC can
+ * be configured by the kernel or by the user space application that is using
+ * the window.
+ *
+ * The HVWCs for all windows on a specific instance of VAS are in a contiguous
+ * range of hardware addresses or Base address region (BAR) referred to as the
+ * HVWC BAR for the instance. Similarly, the region containing the UWCs for
+ * all windows on an instance is referred to as the UWC BAR for the instance.
+ *
+ * The two BARs for each instance are defined in the Power9 MMIO Ranges
+ * spreadsheet and are available to the kernel in the VAS node's "reg"
+ * property in the device tree:
+ *
+ * /proc/device-tree/vasm@.../reg
+ *
+ * (see vas_probe() for details on the reg property).
+ *
+ * The kernel maps the HVWC and UWC BAR regions into the kernel address
+ * space (hvwc_map and uwc_map). The kernel can then access the window
+ * contexts of a specific window using:
+ *
+ * hvwc = hvwc_map + winid * VAS_HVWC_SIZE.
+ * uwc = uwc_map + winid * VAS_UWC_SIZE.
+ *
+ * where winid is the window index (0 .. 64K-1).
+ *
+ * As mentioned, a window context is used to "configure" a window. Besides
+ * this configuration address, each _send_ window also has a unique hardware
+ * "paste" address that is used to submit requests/CRBs (see vas_paste_crb()).
+ *
+ * The hardware paste address for a window is computed using the "paste
+ * base address" and "paste win id shift" reg properties in the VAS device
+ * tree node using:
+ *
+ * paste_addr = paste_base + ((winid << paste_win_id_shift))
+ *
+ * (again, see vas_probe() for ->paste_base_addr and ->paste_win_id_shift).
+ *
+ * The kernel maps this hardware address into the sender's address space,
+ * after which the sender can use the 'paste' instruction (new in Power9)
+ * to send a message (submit a request, aka a CRB) to the coprocessor.
+ *
+ * NOTE: In the initial version, senders can only be in-kernel drivers/threads.
+ * Support for user space threads will be added in follow-on patches.
+ *
+ * TODO: Do we need to map the UWC into user address space so user space can
+ * return credits? It's N/A for NX but may be needed for other receive windows.
+ *
+ */
+
+#define VAS_WINDOWS_PER_CHIP (64 << 10)
+
+/*
+ * Hypervisor and OS/User Window Context sizes
+ */
+#define VAS_HVWC_SIZE 512
+#define VAS_UWC_SIZE PAGE_SIZE
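+
+/*
+ * Illustrative sketch only (not part of the driver): a worked form of the
+ * address formulas in the overview comment above. The parameter names mirror
+ * the vas_instance fields filled in by init_vas_instance(); these helpers are
+ * examples of the computation, not part of the VAS API.
+ */
+static inline u64 example_hvwc_addr(u64 hvwc_bar_start, int winid)
+{
+	/* Each window's HVWC is a fixed-size slot within the HVWC BAR */
+	return hvwc_bar_start + (u64)winid * VAS_HVWC_SIZE;
+}
+
+static inline u64 example_paste_addr(u64 paste_base_addr,
+				     u64 paste_win_id_shift, int winid)
+{
+	/* The window id is shifted into the hardware paste address */
+	return paste_base_addr + ((u64)winid << paste_win_id_shift);
+}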
+
+/*
+ * Initial per-process credits.
+ * Max send window credits: 4K-1 (12 bits in VAS_TX_WCRED)
+ * Max receive window credits: 64K-1 (16 bits in VAS_LRX_WCRED)
+ *
+ * TODO: Needs tuning for per-process credits
+ */
+#define VAS_RX_WCREDS_MAX ((64 << 10) - 1)
+#define VAS_TX_WCREDS_MAX ((4 << 10) - 1)
+#define VAS_WCREDS_DEFAULT (1 << 10)
+
+/*
+ * VAS Window Context Register Offsets and bitmasks.
+ * See Section 3.1.4 of VAS Work book
+ */
+#define VAS_LPID_OFFSET 0x010
+#define VAS_LPID PPC_BITMASK(0, 11)
+
+#define VAS_PID_OFFSET 0x018
+#define VAS_PID_ID PPC_BITMASK(0, 19)
+
+#define VAS_XLATE_MSR_OFFSET 0x020
+#define VAS_XLATE_MSR_DR PPC_BIT(0)
+#define VAS_XLATE_MSR_TA PPC_BIT(1)
+#define VAS_XLATE_MSR_PR PPC_BIT(2)
+#define VAS_XLATE_MSR_US PPC_BIT(3)
+#define VAS_XLATE_MSR_HV PPC_BIT(4)
+#define VAS_XLATE_MSR_SF PPC_BIT(5)
+
+#define VAS_XLATE_LPCR_OFFSET 0x028
+#define VAS_XLATE_LPCR_PAGE_SIZE PPC_BITMASK(0, 2)
+#define VAS_XLATE_LPCR_ISL PPC_BIT(3)
+#define VAS_XLATE_LPCR_TC PPC_BIT(4)
+#define VAS_XLATE_LPCR_SC PPC_BIT(5)
+
+#define VAS_XLATE_CTL_OFFSET 0x030
+#define VAS_XLATE_MODE PPC_BITMASK(0, 1)
+
+#define VAS_AMR_OFFSET 0x040
+#define VAS_AMR PPC_BITMASK(0, 63)
+
+#define VAS_SEIDR_OFFSET 0x048
+#define VAS_SEIDR PPC_BITMASK(0, 63)
+
+#define VAS_FAULT_TX_WIN_OFFSET 0x050
+#define VAS_FAULT_TX_WIN PPC_BITMASK(48, 63)
+
+#define VAS_OSU_INTR_SRC_RA_OFFSET 0x060
+#define VAS_OSU_INTR_SRC_RA PPC_BITMASK(8, 63)
+
+#define VAS_HV_INTR_SRC_RA_OFFSET 0x070
+#define VAS_HV_INTR_SRC_RA PPC_BITMASK(8, 63)
+
+#define VAS_PSWID_OFFSET 0x078
+#define VAS_PSWID_EA_HANDLE PPC_BITMASK(0, 31)
+
+#define VAS_SPARE1_OFFSET 0x080
+#define VAS_SPARE2_OFFSET 0x088
+#define VAS_SPARE3_OFFSET 0x090
+#define VAS_SPARE4_OFFSET 0x130
+#define VAS_SPARE5_OFFSET 0x160
+#define VAS_SPARE6_OFFSET 0x188
+
+#define VAS_LFIFO_BAR_OFFSET 0x0A0
+#define VAS_LFIFO_BAR PPC_BITMASK(8, 53)
+#define VAS_PAGE_MIGRATION_SELECT PPC_BITMASK(54, 56)
+
+#define VAS_LDATA_STAMP_CTL_OFFSET 0x0A8
+#define VAS_LDATA_STAMP PPC_BITMASK(0, 1)
+#define VAS_XTRA_WRITE PPC_BIT(2)
+
+#define VAS_LDMA_CACHE_CTL_OFFSET 0x0B0
+#define VAS_LDMA_TYPE PPC_BITMASK(0, 1)
+#define VAS_LDMA_FIFO_DISABLE PPC_BIT(2)
+
+#define VAS_LRFIFO_PUSH_OFFSET 0x0B8
+#define VAS_LRFIFO_PUSH PPC_BITMASK(0, 15)
+
+#define VAS_CURR_MSG_COUNT_OFFSET 0x0C0
+#define VAS_CURR_MSG_COUNT PPC_BITMASK(0, 7)
+
+#define VAS_LNOTIFY_AFTER_COUNT_OFFSET 0x0C8
+#define VAS_LNOTIFY_AFTER_COUNT PPC_BITMASK(0, 7)
+
+#define VAS_LRX_WCRED_OFFSET 0x0E0
+#define VAS_LRX_WCRED PPC_BITMASK(0, 15)
+
+#define VAS_LRX_WCRED_ADDER_OFFSET 0x190
+#define VAS_LRX_WCRED_ADDER PPC_BITMASK(0, 15)
+
+#define VAS_TX_WCRED_OFFSET 0x0F0
+#define VAS_TX_WCRED PPC_BITMASK(4, 15)
+
+#define VAS_TX_WCRED_ADDER_OFFSET 0x1A0
+#define VAS_TX_WCRED_ADDER PPC_BITMASK(4, 15)
+
+#define VAS_LFIFO_SIZE_OFFSET 0x100
+#define VAS_LFIFO_SIZE PPC_BITMASK(0, 3)
+
+#define VAS_WINCTL_OFFSET 0x108
+#define VAS_WINCTL_OPEN PPC_BIT(0)
+#define VAS_WINCTL_REJ_NO_CREDIT PPC_BIT(1)
+#define VAS_WINCTL_PIN PPC_BIT(2)
+#define VAS_WINCTL_TX_WCRED_MODE PPC_BIT(3)
+#define VAS_WINCTL_RX_WCRED_MODE PPC_BIT(4)
+#define VAS_WINCTL_TX_WORD_MODE PPC_BIT(5)
+#define VAS_WINCTL_RX_WORD_MODE PPC_BIT(6)
+#define VAS_WINCTL_RSVD_TXBUF PPC_BIT(7)
+#define VAS_WINCTL_THRESH_CTL PPC_BITMASK(8, 9)
+#define VAS_WINCTL_FAULT_WIN PPC_BIT(10)
+#define VAS_WINCTL_NX_WIN PPC_BIT(11)
+
+#define VAS_WIN_STATUS_OFFSET 0x110
+#define VAS_WIN_BUSY PPC_BIT(1)
+
+#define VAS_WIN_CTX_CACHING_CTL_OFFSET 0x118
+#define VAS_CASTOUT_REQ PPC_BIT(0)
+#define VAS_PUSH_TO_MEM PPC_BIT(1)
+#define VAS_WIN_CACHE_STATUS PPC_BIT(4)
+
+#define VAS_TX_RSVD_BUF_COUNT_OFFSET 0x120
+#define VAS_RXVD_BUF_COUNT PPC_BITMASK(58, 63)
+
+#define VAS_LRFIFO_WIN_PTR_OFFSET 0x128
+#define VAS_LRX_WIN_ID PPC_BITMASK(0, 15)
+
+/*
+ * Local Notification Control Register controls what happens in _response_
+ * to a paste command and hence applies only to receive windows.
+ */
+#define VAS_LNOTIFY_CTL_OFFSET 0x138
+#define VAS_NOTIFY_DISABLE PPC_BIT(0)
+#define VAS_INTR_DISABLE PPC_BIT(1)
+#define VAS_NOTIFY_EARLY PPC_BIT(2)
+#define VAS_NOTIFY_OSU_INTR PPC_BIT(3)
+
+#define VAS_LNOTIFY_PID_OFFSET 0x140
+#define VAS_LNOTIFY_PID PPC_BITMASK(0, 19)
+
+#define VAS_LNOTIFY_LPID_OFFSET 0x148
+#define VAS_LNOTIFY_LPID PPC_BITMASK(0, 11)
+
+#define VAS_LNOTIFY_TID_OFFSET 0x150
+#define VAS_LNOTIFY_TID PPC_BITMASK(0, 15)
+
+#define VAS_LNOTIFY_SCOPE_OFFSET 0x158
+#define VAS_LNOTIFY_MIN_SCOPE PPC_BITMASK(0, 1)
+#define VAS_LNOTIFY_MAX_SCOPE PPC_BITMASK(2, 3)
+
+#define VAS_NX_UTIL_OFFSET 0x1B0
+#define VAS_NX_UTIL PPC_BITMASK(0, 63)
+
+/* SE: Side effects */
+#define VAS_NX_UTIL_SE_OFFSET 0x1B8
+#define VAS_NX_UTIL_SE PPC_BITMASK(0, 63)
+
+#define VAS_NX_UTIL_ADDER_OFFSET 0x180
+#define VAS_NX_UTIL_ADDER PPC_BITMASK(32, 63)
+
+/*
+ * VREG(x):
+ * Expand a register's short name (eg: LPID) into two parameters:
+ * - the register's short name in string form ("LPID"), and
+ * - the name of the macro (eg: VAS_LPID_OFFSET), defining the
+ * register's offset in the window context
+ */
+#define VREG_SFX(n, s) __stringify(n), VAS_##n##s
+#define VREG(r) VREG_SFX(r, _OFFSET)
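+
+/*
+ * For example, VREG(LPID) expands to:
+ *
+ *	"LPID", VAS_LPID_OFFSET
+ *
+ * so a call such as write_hvwc_reg(win, VREG(LPID), val) passes both the
+ * register's name (for logging) and its offset in the window context.
+ */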
+
+/*
+ * Local Notify Scope Control Register. (Receive windows only).
+ */
+enum vas_notify_scope {
+ VAS_SCOPE_LOCAL,
+ VAS_SCOPE_GROUP,
+ VAS_SCOPE_VECTORED_GROUP,
+ VAS_SCOPE_UNUSED,
+};
+
+/*
+ * Local DMA Cache Control Register (Receive windows only).
+ */
+enum vas_dma_type {
+ VAS_DMA_TYPE_INJECT,
+ VAS_DMA_TYPE_WRITE,
+};
+
+/*
+ * Local Notify After Count Register. (Receive windows only).
+ * Not applicable to NX receive windows.
+ */
+enum vas_notify_after_count {
+ VAS_NOTIFY_AFTER_256 = 0,
+ VAS_NOTIFY_NONE,
+ VAS_NOTIFY_AFTER_2
+};
+
+/*
+ * One per instance of VAS. Each instance will have a separate set of
+ * receive windows, one per coprocessor type.
+ *
+ * See also function header of set_vinst_win() for details on ->windows[]
+ * and ->rxwin[] tables.
+ */
+struct vas_instance {
+ int vas_id;
+ struct ida ida;
+ struct list_head node;
+ struct platform_device *pdev;
+
+ u64 hvwc_bar_start;
+ u64 uwc_bar_start;
+ u64 paste_base_addr;
+ u64 paste_win_id_shift;
+
+ struct mutex mutex;
+ struct vas_window *rxwin[VAS_COP_TYPE_MAX];
+ struct vas_window *windows[VAS_WINDOWS_PER_CHIP];
+
+ char *dbgname;
+ struct dentry *dbgdir;
+};
+
+/*
+ * In-kernel state of a VAS window. One per window.
+ */
+struct vas_window {
+ /* Fields common to send and receive windows */
+ struct vas_instance *vinst;
+ int winid;
+ bool tx_win; /* True if send window */
+ bool nx_win; /* True if NX window */
+ bool user_win; /* True if user space window */
+ void *hvwc_map; /* HV window context */
+ void *uwc_map; /* OS/User window context */
+ pid_t pid; /* Linux process id of owner */
+ int wcreds_max; /* Window credits */
+
+ char *dbgname;
+ struct dentry *dbgdir;
+
+ /* Fields applicable only to send windows */
+ void *paste_kaddr;
+ char *paste_addr_name;
+ struct vas_window *rxwin;
+
+	/* Fields applicable only to receive windows */
+ enum vas_cop_type cop;
+ atomic_t num_txwins;
+};
+
+/*
+ * Container for the hardware state of a window. One per window.
+ *
+ * A VAS Window context is a 512-byte area in the hardware that contains
+ * a set of 64-bit registers. Individual bit-fields in these registers
+ * determine the configuration/operation of the hardware. struct vas_winctx
+ * is a container for the register fields in the window context.
+ */
+struct vas_winctx {
+ void *rx_fifo;
+ int rx_fifo_size;
+ int wcreds_max;
+ int rsvd_txbuf_count;
+
+ bool user_win;
+ bool nx_win;
+ bool fault_win;
+ bool rsvd_txbuf_enable;
+ bool pin_win;
+ bool rej_no_credit;
+ bool tx_wcred_mode;
+ bool rx_wcred_mode;
+ bool tx_word_mode;
+ bool rx_word_mode;
+ bool data_stamp;
+ bool xtra_write;
+ bool notify_disable;
+ bool intr_disable;
+ bool fifo_disable;
+ bool notify_early;
+ bool notify_os_intr_reg;
+
+ int lpid;
+ int pidr; /* value from SPRN_PID, not linux pid */
+ int lnotify_lpid;
+ int lnotify_pid;
+ int lnotify_tid;
+ u32 pswid;
+ int rx_win_id;
+ int fault_win_id;
+ int tc_mode;
+
+ u64 irq_port;
+
+ enum vas_dma_type dma_type;
+ enum vas_notify_scope min_scope;
+ enum vas_notify_scope max_scope;
+ enum vas_notify_after_count notify_after_count;
+};
+
+extern struct mutex vas_mutex;
+
+extern struct vas_instance *find_vas_instance(int vasid);
+extern void vas_init_dbgdir(void);
+extern void vas_instance_init_dbgdir(struct vas_instance *vinst);
+extern void vas_window_init_dbgdir(struct vas_window *win);
+extern void vas_window_free_dbgdir(struct vas_window *win);
+
+static inline void vas_log_write(struct vas_window *win, char *name,
+ void *regptr, u64 val)
+{
+ if (val)
+ pr_debug("%swin #%d: %s reg %p, val 0x%016llx\n",
+ win->tx_win ? "Tx" : "Rx", win->winid, name,
+ regptr, val);
+}
+
+static inline void write_uwc_reg(struct vas_window *win, char *name,
+ s32 reg, u64 val)
+{
+ void *regptr;
+
+ regptr = win->uwc_map + reg;
+ vas_log_write(win, name, regptr, val);
+
+ out_be64(regptr, val);
+}
+
+static inline void write_hvwc_reg(struct vas_window *win, char *name,
+ s32 reg, u64 val)
+{
+ void *regptr;
+
+ regptr = win->hvwc_map + reg;
+ vas_log_write(win, name, regptr, val);
+
+ out_be64(regptr, val);
+}
+
+static inline u64 read_hvwc_reg(struct vas_window *win,
+ char *name __maybe_unused, s32 reg)
+{
+ return in_be64(win->hvwc_map+reg);
+}
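+
+/*
+ * Illustrative sketch only (not used by the driver): programming a single
+ * bit-field of a hypervisor window context register with the accessors
+ * above. The open-coded shift follows the PPC_BITMASK() convention used by
+ * VAS_LPID (bits 0..11); real window setup code may use its own
+ * field-manipulation macros instead.
+ */
+static inline void example_set_lpid(struct vas_window *win, u64 lpid)
+{
+	u64 val;
+
+	/* Read-modify-write the LPID field of the HVWC LPID register */
+	val = read_hvwc_reg(win, VREG(LPID));
+	val &= ~VAS_LPID;
+	val |= (lpid << (63 - 11)) & VAS_LPID;
+	write_hvwc_reg(win, VREG(LPID), val);
+}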
+
+/*
+ * Encode/decode the Partition Send Window ID (PSWID) for a window in
+ * a way that we can uniquely identify any window in the system. i.e.
+ * we should be able to locate the 'struct vas_window' given the PSWID.
+ *
+ * Bits Usage
+ * 0:7 VAS id (8 bits)
+ * 8:15  Unused, 0 (8 bits)
+ * 16:31 Window id (16 bits)
+ */
+static inline u32 encode_pswid(int vasid, int winid)
+{
+ u32 pswid = 0;
+
+ pswid |= vasid << (31 - 7);
+ pswid |= winid;
+
+ return pswid;
+}
+
+static inline void decode_pswid(u32 pswid, int *vasid, int *winid)
+{
+ if (vasid)
+ *vasid = pswid >> (31 - 7) & 0xFF;
+
+ if (winid)
+ *winid = pswid & 0xFFFF;
+}
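+
+/*
+ * Worked example: encode_pswid(3, 0x1234) returns 0x03001234, and
+ * decode_pswid(0x03001234, &vasid, &winid) recovers vasid == 3 and
+ * winid == 0x1234.
+ */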
+#endif /* _VAS_H */