Diffstat (limited to 'arch/powerpc/platforms/pseries')
-rw-r--r--  arch/powerpc/platforms/pseries/Kconfig            |  142
-rw-r--r--  arch/powerpc/platforms/pseries/Makefile           |   30
-rw-r--r--  arch/powerpc/platforms/pseries/cmm.c              |  778
-rw-r--r--  arch/powerpc/platforms/pseries/dlpar.c            |  604
-rw-r--r--  arch/powerpc/platforms/pseries/dtl.c              |  394
-rw-r--r--  arch/powerpc/platforms/pseries/eeh_pseries.c      |  884
-rw-r--r--  arch/powerpc/platforms/pseries/event_sources.c    |   57
-rw-r--r--  arch/powerpc/platforms/pseries/firmware.c         |  180
-rw-r--r--  arch/powerpc/platforms/pseries/hotplug-cpu.c      |  975
-rw-r--r--  arch/powerpc/platforms/pseries/hotplug-memory.c   | 1061
-rw-r--r--  arch/powerpc/platforms/pseries/hvCall.S           |  339
-rw-r--r--  arch/powerpc/platforms/pseries/hvCall_inst.c      |  176
-rw-r--r--  arch/powerpc/platforms/pseries/hvconsole.c        |   88
-rw-r--r--  arch/powerpc/platforms/pseries/hvcserver.c        |  252
-rw-r--r--  arch/powerpc/platforms/pseries/ibmebus.c          |  469
-rw-r--r--  arch/powerpc/platforms/pseries/io_event_irq.c     |  165
-rw-r--r--  arch/powerpc/platforms/pseries/iommu.c            | 1413
-rw-r--r--  arch/powerpc/platforms/pseries/kexec.c            |   67
-rw-r--r--  arch/powerpc/platforms/pseries/lpar.c             | 1088
-rw-r--r--  arch/powerpc/platforms/pseries/lparcfg.c          |  714
-rw-r--r--  arch/powerpc/platforms/pseries/mobility.c         |  439
-rw-r--r--  arch/powerpc/platforms/pseries/msi.c              |  531
-rw-r--r--  arch/powerpc/platforms/pseries/nvram.c            |  247
-rw-r--r--  arch/powerpc/platforms/pseries/of_helpers.c       |   99
-rw-r--r--  arch/powerpc/platforms/pseries/of_helpers.h       |    9
-rw-r--r--  arch/powerpc/platforms/pseries/offline_states.h   |   38
-rw-r--r--  arch/powerpc/platforms/pseries/pci.c              |  345
-rw-r--r--  arch/powerpc/platforms/pseries/pci_dlpar.c        |  119
-rw-r--r--  arch/powerpc/platforms/pseries/power.c            |   84
-rw-r--r--  arch/powerpc/platforms/pseries/pseries.h          |  113
-rw-r--r--  arch/powerpc/platforms/pseries/pseries_energy.c   |  368
-rw-r--r--  arch/powerpc/platforms/pseries/ras.c              |  513
-rw-r--r--  arch/powerpc/platforms/pseries/reconfig.c         |  414
-rw-r--r--  arch/powerpc/platforms/pseries/rng.c              |   46
-rw-r--r--  arch/powerpc/platforms/pseries/scanlog.c          |  200
-rw-r--r--  arch/powerpc/platforms/pseries/setup.c            | 1046
-rw-r--r--  arch/powerpc/platforms/pseries/smp.c              |  292
-rw-r--r--  arch/powerpc/platforms/pseries/suspend.c          |  278
-rw-r--r--  arch/powerpc/platforms/pseries/vio.c              | 1724
39 files changed, 16781 insertions, 0 deletions
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
new file mode 100644
index 000000000..0c698fd6d
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -0,0 +1,142 @@
+# SPDX-License-Identifier: GPL-2.0
+config PPC_PSERIES
+ depends on PPC64 && PPC_BOOK3S
+ bool "IBM pSeries & new (POWER5-based) iSeries"
+ select HAVE_PCSPKR_PLATFORM
+ select MPIC
+ select OF_DYNAMIC
+ select PCI
+ select PCI_MSI
+ select PPC_XICS
+ select PPC_XIVE_SPAPR
+ select PPC_ICP_NATIVE
+ select PPC_ICP_HV
+ select PPC_ICS_RTAS
+ select PPC_I8259
+ select PPC_RTAS
+ select PPC_RTAS_DAEMON
+ select RTAS_ERROR_LOGGING
+ select PPC_UDBG_16550
+ select PPC_NATIVE
+ select PPC_DOORBELL
+ select HOTPLUG_CPU
+ select ARCH_RANDOM
+ select PPC_DOORBELL
+ select FORCE_SMP
+ default y
+
+config PPC_SPLPAR
+ depends on PPC_PSERIES
+ bool "Support for shared-processor logical partitions"
+ default n
+ help
+ Enabling this option will make the kernel run more efficiently
+ on logically-partitioned pSeries systems which use shared
+ processors, that is, which share physical processors between
+ two or more partitions.
+
+config DTL
+ bool "Dispatch Trace Log"
+ depends on PPC_SPLPAR && DEBUG_FS
+ help
+ SPLPAR machines can log hypervisor preempt & dispatch events to a
+ kernel buffer. Saying Y here will enable logging these events,
+ which are accessible through a debugfs file.
+
+ Say N if you are unsure.
+
+config PSERIES_ENERGY
+ tristate "pSeries energy management capabilities driver"
+ depends on PPC_PSERIES
+ default y
+ help
+ Provides interface to platform energy management capabilities
+ on supported PSERIES platforms.
+ Provides: /sys/devices/system/cpu/pseries_(de)activation_hint_list
+ and /sys/devices/system/cpu/cpuN/pseries_(de)activation_hint
+
+config SCANLOG
+ tristate "Scanlog dump interface"
+ depends on RTAS_PROC && PPC_PSERIES
+
+config IO_EVENT_IRQ
+ bool "IO Event Interrupt support"
+ depends on PPC_PSERIES
+ default y
+ help
+ Select this option if you want to enable support for IO Event
+ interrupts. IO event interrupts are a mechanism provided by RTAS
+ to return information about hardware error and non-error events
+ which may need OS attention. RTAS returns events for multiple
+ event types and scopes. Device drivers can register their handlers
+ to receive events.
+
+ This option will only enable the IO event platform code. You
+ will still need to enable or compile the actual drivers
+ that use this infrastructure to handle IO event interrupts.
+
+ Say Y if you are unsure.
+
+config LPARCFG
+ bool "LPAR Configuration Data"
+ depends on PPC_PSERIES
+ help
+ Provide system capacity information via human readable
+ <key word>=<value> pairs through a /proc/ppc64/lparcfg interface.
+
+config PPC_PSERIES_DEBUG
+ depends on PPC_PSERIES && PPC_EARLY_DEBUG
+ bool "Enable extra debug logging in platforms/pseries"
+ help
+ Say Y here if you want the pseries core to produce a bunch of
+ debug messages to the system log. Select this if you are having a
+ problem with the pseries core and want to see more of what is
+ going on. This does not enable debugging in lpar.c, which must
+ be manually done due to its verbosity.
+ default y
+
+config PPC_SMLPAR
+ bool "Support for shared-memory logical partitions"
+ depends on PPC_PSERIES
+ select LPARCFG
+ default n
+ help
+ Select this option to enable shared memory partition support.
+ With this option a system running in an LPAR can be given more
+ memory than is physically available, allowing firmware to
+ balance memory across many LPARs.
+
+config CMM
+ tristate "Collaborative memory management"
+ depends on PPC_SMLPAR
+ default y
+ help
+ Select this option if you want to enable the kernel interface
+ to reduce the memory size of the system. This is accomplished
+ by allocating pages of memory and putting them "on hold". This only
+ makes sense for a system running in an LPAR where the unused pages
+ will be reused for other LPARs. The interface allows firmware to
+ balance memory across many LPARs.
+
+config HV_PERF_CTRS
+ bool "Hypervisor supplied PMU events (24x7 & GPCI)"
+ default y
+ depends on PERF_EVENTS && PPC_PSERIES
+ help
+ Enable access to hypervisor supplied counters in perf. Currently,
+ this enables code that uses the hcall GetPerfCounterInfo and 24x7
+ interfaces to retrieve counters. GPCI exists on Power 6 and later
+ systems. 24x7 is available on Power 8 and later systems.
+
+ If unsure, select Y.
+
+config IBMVIO
+ depends on PPC_PSERIES
+ bool
+ default y
+
+config IBMEBUS
+ depends on PPC_PSERIES && !CPU_LITTLE_ENDIAN
+ bool "Support for GX bus based adapters"
+ help
+ Bus device driver for GX bus based adapters.
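The LPARCFG entry above exposes partition capacity data as <key word>=<value> pairs through /proc/ppc64/lparcfg. A minimal user-space sketch that dumps those pairs; the key named in the comment (partition_entitled_capacity) is only an illustrative example of what the file may contain:

#include <stdio.h>

int main(void)
{
        /* /proc/ppc64/lparcfg is created when CONFIG_LPARCFG is enabled */
        FILE *f = fopen("/proc/ppc64/lparcfg", "r");
        char line[256];

        if (!f) {
                perror("lparcfg");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);    /* e.g. partition_entitled_capacity=100 (example key) */
        fclose(f);
        return 0;
}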
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
new file mode 100644
index 000000000..7e89d5c47
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0
+ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
+ccflags-$(CONFIG_PPC_PSERIES_DEBUG) += -DDEBUG
+
+obj-y := lpar.o hvCall.o nvram.o reconfig.o \
+ of_helpers.o \
+ setup.o iommu.o event_sources.o ras.o \
+ firmware.o power.o dlpar.o mobility.o rng.o \
+ pci.o pci_dlpar.o eeh_pseries.o msi.o
+obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SCANLOG) += scanlog.o
+obj-$(CONFIG_KEXEC_CORE) += kexec.o
+obj-$(CONFIG_PSERIES_ENERGY) += pseries_energy.o
+
+obj-$(CONFIG_HOTPLUG_CPU) += hotplug-cpu.o
+obj-$(CONFIG_MEMORY_HOTPLUG) += hotplug-memory.o
+
+obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o
+obj-$(CONFIG_HVCS) += hvcserver.o
+obj-$(CONFIG_HCALL_STATS) += hvCall_inst.o
+obj-$(CONFIG_CMM) += cmm.o
+obj-$(CONFIG_DTL) += dtl.o
+obj-$(CONFIG_IO_EVENT_IRQ) += io_event_irq.o
+obj-$(CONFIG_LPARCFG) += lparcfg.o
+obj-$(CONFIG_IBMVIO) += vio.o
+obj-$(CONFIG_IBMEBUS) += ibmebus.o
+
+ifdef CONFIG_PPC_PSERIES
+obj-$(CONFIG_SUSPEND) += suspend.o
+endif
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
new file mode 100644
index 000000000..502ebcc6c
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -0,0 +1,778 @@
+/*
+ * Collaborative memory management interface.
+ *
+ * Copyright (C) 2008 IBM Corporation
+ * Author(s): Brian King (brking@linux.vnet.ibm.com),
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/ctype.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/oom.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/stringify.h>
+#include <linux/swap.h>
+#include <linux/device.h>
+#include <asm/firmware.h>
+#include <asm/hvcall.h>
+#include <asm/mmu.h>
+#include <asm/pgalloc.h>
+#include <linux/uaccess.h>
+#include <linux/memory.h>
+#include <asm/plpar_wrappers.h>
+
+#include "pseries.h"
+
+#define CMM_DRIVER_VERSION "1.0.0"
+#define CMM_DEFAULT_DELAY 1
+#define CMM_HOTPLUG_DELAY 5
+#define CMM_DEBUG 0
+#define CMM_DISABLE 0
+#define CMM_OOM_KB 1024
+#define CMM_MIN_MEM_MB 256
+#define KB2PAGES(_p) ((_p)>>(PAGE_SHIFT-10))
+#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
+/*
+ * The priority level tries to ensure that this notifier is called as
+ * late as possible to reduce thrashing in the shared memory pool.
+ */
+#define CMM_MEM_HOTPLUG_PRI 1
+#define CMM_MEM_ISOLATE_PRI 15
+
+static unsigned int delay = CMM_DEFAULT_DELAY;
+static unsigned int hotplug_delay = CMM_HOTPLUG_DELAY;
+static unsigned int oom_kb = CMM_OOM_KB;
+static unsigned int cmm_debug = CMM_DEBUG;
+static unsigned int cmm_disabled = CMM_DISABLE;
+static unsigned long min_mem_mb = CMM_MIN_MEM_MB;
+static struct device cmm_dev;
+
+MODULE_AUTHOR("Brian King <brking@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("IBM System p Collaborative Memory Manager");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(CMM_DRIVER_VERSION);
+
+module_param_named(delay, delay, uint, 0644);
+MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. "
+ "[Default=" __stringify(CMM_DEFAULT_DELAY) "]");
+module_param_named(hotplug_delay, hotplug_delay, uint, 0644);
+MODULE_PARM_DESC(hotplug_delay, "Delay (in seconds) after memory hotplug remove "
+ "before loaning resumes. "
+ "[Default=" __stringify(CMM_HOTPLUG_DELAY) "]");
+module_param_named(oom_kb, oom_kb, uint, 0644);
+MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. "
+ "[Default=" __stringify(CMM_OOM_KB) "]");
+module_param_named(min_mem_mb, min_mem_mb, ulong, 0644);
+MODULE_PARM_DESC(min_mem_mb, "Minimum amount of memory (in MB) to not balloon. "
+ "[Default=" __stringify(CMM_MIN_MEM_MB) "]");
+module_param_named(debug, cmm_debug, uint, 0644);
+MODULE_PARM_DESC(debug, "Enable module debugging logging. Set to 1 to enable. "
+ "[Default=" __stringify(CMM_DEBUG) "]");
+
+#define CMM_NR_PAGES ((PAGE_SIZE - sizeof(void *) - sizeof(unsigned long)) / sizeof(unsigned long))
+
+#define cmm_dbg(...) if (cmm_debug) { printk(KERN_INFO "cmm: "__VA_ARGS__); }
+
+struct cmm_page_array {
+ struct cmm_page_array *next;
+ unsigned long index;
+ unsigned long page[CMM_NR_PAGES];
+};
+
+static unsigned long loaned_pages;
+static unsigned long loaned_pages_target;
+static unsigned long oom_freed_pages;
+
+static struct cmm_page_array *cmm_page_list;
+static DEFINE_SPINLOCK(cmm_lock);
+
+static DEFINE_MUTEX(hotplug_mutex);
+static int hotplug_occurred; /* protected by the hotplug mutex */
+
+static struct task_struct *cmm_thread_ptr;
+
+static long plpar_page_set_loaned(unsigned long vpa)
+{
+ unsigned long cmo_page_sz = cmo_get_page_size();
+ long rc = 0;
+ int i;
+
+ for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz)
+ rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, vpa + i, 0);
+
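+ /* Roll back on failure: return any chunks already marked loaned to the active state. */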
+ for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz)
+ plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE,
+ vpa + i - cmo_page_sz, 0);
+
+ return rc;
+}
+
+static long plpar_page_set_active(unsigned long vpa)
+{
+ unsigned long cmo_page_sz = cmo_get_page_size();
+ long rc = 0;
+ int i;
+
+ for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz)
+ rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, vpa + i, 0);
+
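+ /* Roll back on failure: return any chunks already marked active to the loaned state. */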
+ for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz)
+ plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED,
+ vpa + i - cmo_page_sz, 0);
+
+ return rc;
+}
+
+/**
+ * cmm_alloc_pages - Allocate pages and mark them as loaned
+ * @nr: number of pages to allocate
+ *
+ * Return value:
+ * number of pages requested to be allocated which were not
+ **/
+static long cmm_alloc_pages(long nr)
+{
+ struct cmm_page_array *pa, *npa;
+ unsigned long addr;
+ long rc;
+
+ cmm_dbg("Begin request for %ld pages\n", nr);
+
+ while (nr) {
+ /* Exit if a hotplug operation is in progress or occurred */
+ if (mutex_trylock(&hotplug_mutex)) {
+ if (hotplug_occurred) {
+ mutex_unlock(&hotplug_mutex);
+ break;
+ }
+ mutex_unlock(&hotplug_mutex);
+ } else {
+ break;
+ }
+
+ addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
+ __GFP_NORETRY | __GFP_NOMEMALLOC);
+ if (!addr)
+ break;
+ spin_lock(&cmm_lock);
+ pa = cmm_page_list;
+ if (!pa || pa->index >= CMM_NR_PAGES) {
+ /* Need a new page for the page list. */
+ spin_unlock(&cmm_lock);
+ npa = (struct cmm_page_array *)__get_free_page(
+ GFP_NOIO | __GFP_NOWARN |
+ __GFP_NORETRY | __GFP_NOMEMALLOC);
+ if (!npa) {
+ pr_info("%s: Can not allocate new page list\n", __func__);
+ free_page(addr);
+ break;
+ }
+ spin_lock(&cmm_lock);
+ pa = cmm_page_list;
+
+ if (!pa || pa->index >= CMM_NR_PAGES) {
+ npa->next = pa;
+ npa->index = 0;
+ pa = npa;
+ cmm_page_list = pa;
+ } else
+ free_page((unsigned long) npa);
+ }
+
+ if ((rc = plpar_page_set_loaned(__pa(addr)))) {
+ pr_err("%s: Can not set page to loaned. rc=%ld\n", __func__, rc);
+ spin_unlock(&cmm_lock);
+ free_page(addr);
+ break;
+ }
+
+ pa->page[pa->index++] = addr;
+ loaned_pages++;
+ totalram_pages--;
+ spin_unlock(&cmm_lock);
+ nr--;
+ }
+
+ cmm_dbg("End request with %ld pages unfulfilled\n", nr);
+ return nr;
+}
+
+/**
+ * cmm_free_pages - Free pages and mark them as active
+ * @nr: number of pages to free
+ *
+ * Return value:
+ * number of pages requested to be freed which were not
+ **/
+static long cmm_free_pages(long nr)
+{
+ struct cmm_page_array *pa;
+ unsigned long addr;
+
+ cmm_dbg("Begin free of %ld pages.\n", nr);
+ spin_lock(&cmm_lock);
+ pa = cmm_page_list;
+ while (nr) {
+ if (!pa || pa->index <= 0)
+ break;
+ addr = pa->page[--pa->index];
+
+ if (pa->index == 0) {
+ pa = pa->next;
+ free_page((unsigned long) cmm_page_list);
+ cmm_page_list = pa;
+ }
+
+ plpar_page_set_active(__pa(addr));
+ free_page(addr);
+ loaned_pages--;
+ nr--;
+ totalram_pages++;
+ }
+ spin_unlock(&cmm_lock);
+ cmm_dbg("End request with %ld pages unfulfilled\n", nr);
+ return nr;
+}
+
+/**
+ * cmm_oom_notify - OOM notifier
+ * @self: notifier block struct
+ * @dummy: not used
+ * @parm: returned - number of pages freed
+ *
+ * Return value:
+ * NOTIFY_OK
+ **/
+static int cmm_oom_notify(struct notifier_block *self,
+ unsigned long dummy, void *parm)
+{
+ unsigned long *freed = parm;
+ long nr = KB2PAGES(oom_kb);
+
+ cmm_dbg("OOM processing started\n");
+ nr = cmm_free_pages(nr);
+ loaned_pages_target = loaned_pages;
+ *freed += KB2PAGES(oom_kb) - nr;
+ oom_freed_pages += KB2PAGES(oom_kb) - nr;
+ cmm_dbg("OOM processing complete\n");
+ return NOTIFY_OK;
+}
+
+/**
+ * cmm_get_mpp - Read memory performance parameters
+ *
+ * Makes hcall to query the current page loan request from the hypervisor.
+ *
+ * Return value:
+ * nothing
+ **/
+static void cmm_get_mpp(void)
+{
+ int rc;
+ struct hvcall_mpp_data mpp_data;
+ signed long active_pages_target, page_loan_request, target;
+ signed long total_pages = totalram_pages + loaned_pages;
+ signed long min_mem_pages = (min_mem_mb * 1024 * 1024) / PAGE_SIZE;
+
+ rc = h_get_mpp(&mpp_data);
+
+ if (rc != H_SUCCESS)
+ return;
+
+ page_loan_request = div_s64((s64)mpp_data.loan_request, PAGE_SIZE);
+ target = page_loan_request + (signed long)loaned_pages;
+
+ if (target < 0 || total_pages < min_mem_pages)
+ target = 0;
+
+ if (target > oom_freed_pages)
+ target -= oom_freed_pages;
+ else
+ target = 0;
+
+ active_pages_target = total_pages - target;
+
+ if (min_mem_pages > active_pages_target)
+ target = total_pages - min_mem_pages;
+
+ if (target < 0)
+ target = 0;
+
+ loaned_pages_target = target;
+
+ cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n",
+ page_loan_request, loaned_pages, loaned_pages_target,
+ oom_freed_pages, totalram_pages);
+}
+
+static struct notifier_block cmm_oom_nb = {
+ .notifier_call = cmm_oom_notify
+};
+
+/**
+ * cmm_thread - CMM task thread
+ * @dummy: not used
+ *
+ * Return value:
+ * 0
+ **/
+static int cmm_thread(void *dummy)
+{
+ unsigned long timeleft;
+
+ while (1) {
+ timeleft = msleep_interruptible(delay * 1000);
+
+ if (kthread_should_stop() || timeleft)
+ break;
+
+ if (mutex_trylock(&hotplug_mutex)) {
+ if (hotplug_occurred) {
+ hotplug_occurred = 0;
+ mutex_unlock(&hotplug_mutex);
+ cmm_dbg("Hotplug operation has occurred, "
+ "loaning activity suspended "
+ "for %d seconds.\n",
+ hotplug_delay);
+ timeleft = msleep_interruptible(hotplug_delay *
+ 1000);
+ if (kthread_should_stop() || timeleft)
+ break;
+ continue;
+ }
+ mutex_unlock(&hotplug_mutex);
+ } else {
+ cmm_dbg("Hotplug operation in progress, activity "
+ "suspended\n");
+ continue;
+ }
+
+ cmm_get_mpp();
+
+ if (loaned_pages_target > loaned_pages) {
+ if (cmm_alloc_pages(loaned_pages_target - loaned_pages))
+ loaned_pages_target = loaned_pages;
+ } else if (loaned_pages_target < loaned_pages)
+ cmm_free_pages(loaned_pages - loaned_pages_target);
+ }
+ return 0;
+}
+
+#define CMM_SHOW(name, format, args...) \
+ static ssize_t show_##name(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+ { \
+ return sprintf(buf, format, ##args); \
+ } \
+ static DEVICE_ATTR(name, 0444, show_##name, NULL)
+
+CMM_SHOW(loaned_kb, "%lu\n", PAGES2KB(loaned_pages));
+CMM_SHOW(loaned_target_kb, "%lu\n", PAGES2KB(loaned_pages_target));
+
+static ssize_t show_oom_pages(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", PAGES2KB(oom_freed_pages));
+}
+
+static ssize_t store_oom_pages(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long val = simple_strtoul (buf, NULL, 10);
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (val != 0)
+ return -EBADMSG;
+
+ oom_freed_pages = 0;
+ return count;
+}
+
+static DEVICE_ATTR(oom_freed_kb, 0644,
+ show_oom_pages, store_oom_pages);
+
+static struct device_attribute *cmm_attrs[] = {
+ &dev_attr_loaned_kb,
+ &dev_attr_loaned_target_kb,
+ &dev_attr_oom_freed_kb,
+};
+
+static struct bus_type cmm_subsys = {
+ .name = "cmm",
+ .dev_name = "cmm",
+};
+
+static void cmm_release_device(struct device *dev)
+{
+}
+
+/**
+ * cmm_sysfs_register - Register with sysfs
+ *
+ * Return value:
+ * 0 on success / other on failure
+ **/
+static int cmm_sysfs_register(struct device *dev)
+{
+ int i, rc;
+
+ if ((rc = subsys_system_register(&cmm_subsys, NULL)))
+ return rc;
+
+ dev->id = 0;
+ dev->bus = &cmm_subsys;
+ dev->release = cmm_release_device;
+
+ if ((rc = device_register(dev)))
+ goto subsys_unregister;
+
+ for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++) {
+ if ((rc = device_create_file(dev, cmm_attrs[i])))
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ while (--i >= 0)
+ device_remove_file(dev, cmm_attrs[i]);
+ device_unregister(dev);
+subsys_unregister:
+ bus_unregister(&cmm_subsys);
+ return rc;
+}
+
+/**
+ * cmm_unregister_sysfs - Unregister from sysfs
+ *
+ **/
+static void cmm_unregister_sysfs(struct device *dev)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++)
+ device_remove_file(dev, cmm_attrs[i]);
+ device_unregister(dev);
+ bus_unregister(&cmm_subsys);
+}
+
+/**
+ * cmm_reboot_notifier - Make sure pages are not still marked as "loaned"
+ *
+ **/
+static int cmm_reboot_notifier(struct notifier_block *nb,
+ unsigned long action, void *unused)
+{
+ if (action == SYS_RESTART) {
+ if (cmm_thread_ptr)
+ kthread_stop(cmm_thread_ptr);
+ cmm_thread_ptr = NULL;
+ cmm_free_pages(loaned_pages);
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block cmm_reboot_nb = {
+ .notifier_call = cmm_reboot_notifier,
+};
+
+/**
+ * cmm_count_pages - Count the number of pages loaned in a particular range.
+ *
+ * @arg: memory_isolate_notify structure with address range and count
+ *
+ * Return value:
+ * 0 on success
+ **/
+static unsigned long cmm_count_pages(void *arg)
+{
+ struct memory_isolate_notify *marg = arg;
+ struct cmm_page_array *pa;
+ unsigned long start = (unsigned long)pfn_to_kaddr(marg->start_pfn);
+ unsigned long end = start + (marg->nr_pages << PAGE_SHIFT);
+ unsigned long idx;
+
+ spin_lock(&cmm_lock);
+ pa = cmm_page_list;
+ while (pa) {
+ if ((unsigned long)pa >= start && (unsigned long)pa < end)
+ marg->pages_found++;
+ for (idx = 0; idx < pa->index; idx++)
+ if (pa->page[idx] >= start && pa->page[idx] < end)
+ marg->pages_found++;
+ pa = pa->next;
+ }
+ spin_unlock(&cmm_lock);
+ return 0;
+}
+
+/**
+ * cmm_memory_isolate_cb - Handle memory isolation notifier calls
+ * @self: notifier block struct
+ * @action: action to take
+ * @arg: struct memory_isolate_notify data for handler
+ *
+ * Return value:
+ * NOTIFY_OK or notifier error based on subfunction return value
+ **/
+static int cmm_memory_isolate_cb(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ int ret = 0;
+
+ if (action == MEM_ISOLATE_COUNT)
+ ret = cmm_count_pages(arg);
+
+ return notifier_from_errno(ret);
+}
+
+static struct notifier_block cmm_mem_isolate_nb = {
+ .notifier_call = cmm_memory_isolate_cb,
+ .priority = CMM_MEM_ISOLATE_PRI
+};
+
+/**
+ * cmm_mem_going_offline - Unloan pages where memory is to be removed
+ * @arg: memory_notify structure with page range to be offlined
+ *
+ * Return value:
+ * 0 on success
+ **/
+static int cmm_mem_going_offline(void *arg)
+{
+ struct memory_notify *marg = arg;
+ unsigned long start_page = (unsigned long)pfn_to_kaddr(marg->start_pfn);
+ unsigned long end_page = start_page + (marg->nr_pages << PAGE_SHIFT);
+ struct cmm_page_array *pa_curr, *pa_last, *npa;
+ unsigned long idx;
+ unsigned long freed = 0;
+
+ cmm_dbg("Memory going offline, searching 0x%lx (%ld pages).\n",
+ start_page, marg->nr_pages);
+ spin_lock(&cmm_lock);
+
+ /* Search the page list for pages in the range to be offlined */
+ pa_last = pa_curr = cmm_page_list;
+ while (pa_curr) {
+ for (idx = (pa_curr->index - 1); (idx + 1) > 0; idx--) {
+ if ((pa_curr->page[idx] < start_page) ||
+ (pa_curr->page[idx] >= end_page))
+ continue;
+
+ plpar_page_set_active(__pa(pa_curr->page[idx]));
+ free_page(pa_curr->page[idx]);
+ freed++;
+ loaned_pages--;
+ totalram_pages++;
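+ /* Backfill the freed slot with another tracked page so the page arrays stay densely packed. */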
+ pa_curr->page[idx] = pa_last->page[--pa_last->index];
+ if (pa_last->index == 0) {
+ if (pa_curr == pa_last)
+ pa_curr = pa_last->next;
+ pa_last = pa_last->next;
+ free_page((unsigned long)cmm_page_list);
+ cmm_page_list = pa_last;
+ }
+ }
+ pa_curr = pa_curr->next;
+ }
+
+ /* Search for page list structures in the range to be offlined */
+ pa_last = NULL;
+ pa_curr = cmm_page_list;
+ while (pa_curr) {
+ if (((unsigned long)pa_curr >= start_page) &&
+ ((unsigned long)pa_curr < end_page)) {
+ npa = (struct cmm_page_array *)__get_free_page(
+ GFP_NOIO | __GFP_NOWARN |
+ __GFP_NORETRY | __GFP_NOMEMALLOC);
+ if (!npa) {
+ spin_unlock(&cmm_lock);
+ cmm_dbg("Failed to allocate memory for list "
+ "management. Memory hotplug "
+ "failed.\n");
+ return -ENOMEM;
+ }
+ memcpy(npa, pa_curr, PAGE_SIZE);
+ if (pa_curr == cmm_page_list)
+ cmm_page_list = npa;
+ if (pa_last)
+ pa_last->next = npa;
+ free_page((unsigned long) pa_curr);
+ freed++;
+ pa_curr = npa;
+ }
+
+ pa_last = pa_curr;
+ pa_curr = pa_curr->next;
+ }
+
+ spin_unlock(&cmm_lock);
+ cmm_dbg("Released %ld pages in the search range.\n", freed);
+
+ return 0;
+}
+
+/**
+ * cmm_memory_cb - Handle memory hotplug notifier calls
+ * @self: notifier block struct
+ * @action: action to take
+ * @arg: struct memory_notify data for handler
+ *
+ * Return value:
+ * NOTIFY_OK or notifier error based on subfunction return value
+ *
+ **/
+static int cmm_memory_cb(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ int ret = 0;
+
+ switch (action) {
+ case MEM_GOING_OFFLINE:
+ mutex_lock(&hotplug_mutex);
+ hotplug_occurred = 1;
+ ret = cmm_mem_going_offline(arg);
+ break;
+ case MEM_OFFLINE:
+ case MEM_CANCEL_OFFLINE:
+ mutex_unlock(&hotplug_mutex);
+ cmm_dbg("Memory offline operation complete.\n");
+ break;
+ case MEM_GOING_ONLINE:
+ case MEM_ONLINE:
+ case MEM_CANCEL_ONLINE:
+ break;
+ }
+
+ return notifier_from_errno(ret);
+}
+
+static struct notifier_block cmm_mem_nb = {
+ .notifier_call = cmm_memory_cb,
+ .priority = CMM_MEM_HOTPLUG_PRI
+};
+
+/**
+ * cmm_init - Module initialization
+ *
+ * Return value:
+ * 0 on success / other on failure
+ **/
+static int cmm_init(void)
+{
+ int rc = -ENOMEM;
+
+ if (!firmware_has_feature(FW_FEATURE_CMO))
+ return -EOPNOTSUPP;
+
+ if ((rc = register_oom_notifier(&cmm_oom_nb)) < 0)
+ return rc;
+
+ if ((rc = register_reboot_notifier(&cmm_reboot_nb)))
+ goto out_oom_notifier;
+
+ if ((rc = cmm_sysfs_register(&cmm_dev)))
+ goto out_reboot_notifier;
+
+ if (register_memory_notifier(&cmm_mem_nb) ||
+ register_memory_isolate_notifier(&cmm_mem_isolate_nb))
+ goto out_unregister_notifier;
+
+ if (cmm_disabled)
+ return rc;
+
+ cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
+ if (IS_ERR(cmm_thread_ptr)) {
+ rc = PTR_ERR(cmm_thread_ptr);
+ goto out_unregister_notifier;
+ }
+
+ return rc;
+
+out_unregister_notifier:
+ unregister_memory_notifier(&cmm_mem_nb);
+ unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
+ cmm_unregister_sysfs(&cmm_dev);
+out_reboot_notifier:
+ unregister_reboot_notifier(&cmm_reboot_nb);
+out_oom_notifier:
+ unregister_oom_notifier(&cmm_oom_nb);
+ return rc;
+}
+
+/**
+ * cmm_exit - Module exit
+ *
+ * Return value:
+ * nothing
+ **/
+static void cmm_exit(void)
+{
+ if (cmm_thread_ptr)
+ kthread_stop(cmm_thread_ptr);
+ unregister_oom_notifier(&cmm_oom_nb);
+ unregister_reboot_notifier(&cmm_reboot_nb);
+ unregister_memory_notifier(&cmm_mem_nb);
+ unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
+ cmm_free_pages(loaned_pages);
+ cmm_unregister_sysfs(&cmm_dev);
+}
+
+/**
+ * cmm_set_disable - Disable/Enable CMM
+ *
+ * Return value:
+ * 0 on success / other on failure
+ **/
+static int cmm_set_disable(const char *val, const struct kernel_param *kp)
+{
+ int disable = simple_strtoul(val, NULL, 10);
+
+ if (disable != 0 && disable != 1)
+ return -EINVAL;
+
+ if (disable && !cmm_disabled) {
+ if (cmm_thread_ptr)
+ kthread_stop(cmm_thread_ptr);
+ cmm_thread_ptr = NULL;
+ cmm_free_pages(loaned_pages);
+ } else if (!disable && cmm_disabled) {
+ cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
+ if (IS_ERR(cmm_thread_ptr))
+ return PTR_ERR(cmm_thread_ptr);
+ }
+
+ cmm_disabled = disable;
+ return 0;
+}
+
+module_param_call(disable, cmm_set_disable, param_get_uint,
+ &cmm_disabled, 0644);
+MODULE_PARM_DESC(disable, "Disable CMM. Set to 1 to disable. "
+ "[Default=" __stringify(CMM_DISABLE) "]");
+
+module_init(cmm_init);
+module_exit(cmm_exit);
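cmm.c registers a "cmm" subsystem device with three attributes (loaned_kb, loaned_target_kb, oom_freed_kb). A minimal user-space sketch that polls them, assuming the usual subsys_system_register() sysfs layout of /sys/devices/system/cmm/cmm0/ (the path is an assumption inferred from the registration code, not stated in this patch):

#include <stdio.h>

static long read_kb(const char *path)
{
        FILE *f = fopen(path, "r");
        long val = -1;

        if (f) {
                if (fscanf(f, "%ld", &val) != 1)
                        val = -1;
                fclose(f);
        }
        return val;
}

int main(void)
{
        printf("loaned:        %ld kB\n",
               read_kb("/sys/devices/system/cmm/cmm0/loaned_kb"));
        printf("loaned target: %ld kB\n",
               read_kb("/sys/devices/system/cmm/cmm0/loaned_target_kb"));
        printf("freed on OOM:  %ld kB\n",
               read_kb("/sys/devices/system/cmm/cmm0/oom_freed_kb"));
        return 0;
}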
diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
new file mode 100644
index 000000000..90fd03b9d
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -0,0 +1,604 @@
+/*
+ * Support for dynamic reconfiguration for PCI, Memory, and CPU
+ * Hotplug and Dynamic Logical Partitioning on RPA platforms.
+ *
+ * Copyright (C) 2009 Nathan Fontenot
+ * Copyright (C) 2009 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) "dlpar: " fmt
+
+#include <linux/kernel.h>
+#include <linux/notifier.h>
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+
+#include "of_helpers.h"
+#include "pseries.h"
+
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <linux/uaccess.h>
+#include <asm/rtas.h>
+
+static struct workqueue_struct *pseries_hp_wq;
+
+struct pseries_hp_work {
+ struct work_struct work;
+ struct pseries_hp_errorlog *errlog;
+ struct completion *hp_completion;
+ int *rc;
+};
+
+struct cc_workarea {
+ __be32 drc_index;
+ __be32 zero;
+ __be32 name_offset;
+ __be32 prop_length;
+ __be32 prop_offset;
+};
+
+void dlpar_free_cc_property(struct property *prop)
+{
+ kfree(prop->name);
+ kfree(prop->value);
+ kfree(prop);
+}
+
+static struct property *dlpar_parse_cc_property(struct cc_workarea *ccwa)
+{
+ struct property *prop;
+ char *name;
+ char *value;
+
+ prop = kzalloc(sizeof(*prop), GFP_KERNEL);
+ if (!prop)
+ return NULL;
+
+ name = (char *)ccwa + be32_to_cpu(ccwa->name_offset);
+ prop->name = kstrdup(name, GFP_KERNEL);
+ if (!prop->name) {
+ dlpar_free_cc_property(prop);
+ return NULL;
+ }
+
+ prop->length = be32_to_cpu(ccwa->prop_length);
+ value = (char *)ccwa + be32_to_cpu(ccwa->prop_offset);
+ prop->value = kmemdup(value, prop->length, GFP_KERNEL);
+ if (!prop->value) {
+ dlpar_free_cc_property(prop);
+ return NULL;
+ }
+
+ return prop;
+}
+
+static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa)
+{
+ struct device_node *dn;
+ const char *name;
+
+ dn = kzalloc(sizeof(*dn), GFP_KERNEL);
+ if (!dn)
+ return NULL;
+
+ name = (const char *)ccwa + be32_to_cpu(ccwa->name_offset);
+ dn->full_name = kstrdup(name, GFP_KERNEL);
+ if (!dn->full_name) {
+ kfree(dn);
+ return NULL;
+ }
+
+ of_node_set_flag(dn, OF_DYNAMIC);
+ of_node_init(dn);
+
+ return dn;
+}
+
+static void dlpar_free_one_cc_node(struct device_node *dn)
+{
+ struct property *prop;
+
+ while (dn->properties) {
+ prop = dn->properties;
+ dn->properties = prop->next;
+ dlpar_free_cc_property(prop);
+ }
+
+ kfree(dn->full_name);
+ kfree(dn);
+}
+
+void dlpar_free_cc_nodes(struct device_node *dn)
+{
+ if (dn->child)
+ dlpar_free_cc_nodes(dn->child);
+
+ if (dn->sibling)
+ dlpar_free_cc_nodes(dn->sibling);
+
+ dlpar_free_one_cc_node(dn);
+}
+
+#define COMPLETE 0
+#define NEXT_SIBLING 1
+#define NEXT_CHILD 2
+#define NEXT_PROPERTY 3
+#define PREV_PARENT 4
+#define MORE_MEMORY 5
+#define ERR_CFG_USE -9003
+
+struct device_node *dlpar_configure_connector(__be32 drc_index,
+ struct device_node *parent)
+{
+ struct device_node *dn;
+ struct device_node *first_dn = NULL;
+ struct device_node *last_dn = NULL;
+ struct property *property;
+ struct property *last_property = NULL;
+ struct cc_workarea *ccwa;
+ char *data_buf;
+ int cc_token;
+ int rc = -1;
+
+ cc_token = rtas_token("ibm,configure-connector");
+ if (cc_token == RTAS_UNKNOWN_SERVICE)
+ return NULL;
+
+ data_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+ if (!data_buf)
+ return NULL;
+
+ ccwa = (struct cc_workarea *)&data_buf[0];
+ ccwa->drc_index = drc_index;
+ ccwa->zero = 0;
+
+ do {
+ /* Since we release the rtas_data_buf lock between configure-
+ * connector calls, we want to re-populate rtas_data_buf with
+ * the contents of the previous call.
+ */
+ spin_lock(&rtas_data_buf_lock);
+
+ memcpy(rtas_data_buf, data_buf, RTAS_DATA_BUF_SIZE);
+ rc = rtas_call(cc_token, 2, 1, NULL, rtas_data_buf, NULL);
+ memcpy(data_buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
+
+ spin_unlock(&rtas_data_buf_lock);
+
+ if (rtas_busy_delay(rc))
+ continue;
+
+ switch (rc) {
+ case COMPLETE:
+ break;
+
+ case NEXT_SIBLING:
+ dn = dlpar_parse_cc_node(ccwa);
+ if (!dn)
+ goto cc_error;
+
+ dn->parent = last_dn->parent;
+ last_dn->sibling = dn;
+ last_dn = dn;
+ break;
+
+ case NEXT_CHILD:
+ dn = dlpar_parse_cc_node(ccwa);
+ if (!dn)
+ goto cc_error;
+
+ if (!first_dn) {
+ dn->parent = parent;
+ first_dn = dn;
+ } else {
+ dn->parent = last_dn;
+ if (last_dn)
+ last_dn->child = dn;
+ }
+
+ last_dn = dn;
+ break;
+
+ case NEXT_PROPERTY:
+ property = dlpar_parse_cc_property(ccwa);
+ if (!property)
+ goto cc_error;
+
+ if (!last_dn->properties)
+ last_dn->properties = property;
+ else
+ last_property->next = property;
+
+ last_property = property;
+ break;
+
+ case PREV_PARENT:
+ last_dn = last_dn->parent;
+ break;
+
+ case MORE_MEMORY:
+ case ERR_CFG_USE:
+ default:
+ printk(KERN_ERR "Unexpected Error (%d) "
+ "returned from configure-connector\n", rc);
+ goto cc_error;
+ }
+ } while (rc);
+
+cc_error:
+ kfree(data_buf);
+
+ if (rc) {
+ if (first_dn)
+ dlpar_free_cc_nodes(first_dn);
+
+ return NULL;
+ }
+
+ return first_dn;
+}
+
+int dlpar_attach_node(struct device_node *dn, struct device_node *parent)
+{
+ int rc;
+
+ dn->parent = parent;
+
+ rc = of_attach_node(dn);
+ if (rc) {
+ printk(KERN_ERR "Failed to add device node %pOF\n", dn);
+ return rc;
+ }
+
+ return 0;
+}
+
+int dlpar_detach_node(struct device_node *dn)
+{
+ struct device_node *child;
+ int rc;
+
+ child = of_get_next_child(dn, NULL);
+ while (child) {
+ dlpar_detach_node(child);
+ child = of_get_next_child(dn, child);
+ }
+
+ rc = of_detach_node(dn);
+ if (rc)
+ return rc;
+
+ of_node_put(dn);
+
+ return 0;
+}
+
+#define DR_ENTITY_SENSE 9003
+#define DR_ENTITY_PRESENT 1
+#define DR_ENTITY_UNUSABLE 2
+#define ALLOCATION_STATE 9003
+#define ALLOC_UNUSABLE 0
+#define ALLOC_USABLE 1
+#define ISOLATION_STATE 9001
+#define ISOLATE 0
+#define UNISOLATE 1
+
+int dlpar_acquire_drc(u32 drc_index)
+{
+ int dr_status, rc;
+
+ rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
+ DR_ENTITY_SENSE, drc_index);
+ if (rc || dr_status != DR_ENTITY_UNUSABLE)
+ return -1;
+
+ rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_USABLE);
+ if (rc)
+ return rc;
+
+ rc = rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
+ if (rc) {
+ rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE);
+ return rc;
+ }
+
+ return 0;
+}
+
+int dlpar_release_drc(u32 drc_index)
+{
+ int dr_status, rc;
+
+ rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
+ DR_ENTITY_SENSE, drc_index);
+ if (rc || dr_status != DR_ENTITY_PRESENT)
+ return -1;
+
+ rc = rtas_set_indicator(ISOLATION_STATE, drc_index, ISOLATE);
+ if (rc)
+ return rc;
+
+ rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE);
+ if (rc) {
+ rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
+ return rc;
+ }
+
+ return 0;
+}
+
+static int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_elog)
+{
+ int rc;
+
+ /* pseries error logs are in BE format, convert to cpu type */
+ switch (hp_elog->id_type) {
+ case PSERIES_HP_ELOG_ID_DRC_COUNT:
+ hp_elog->_drc_u.drc_count =
+ be32_to_cpu(hp_elog->_drc_u.drc_count);
+ break;
+ case PSERIES_HP_ELOG_ID_DRC_INDEX:
+ hp_elog->_drc_u.drc_index =
+ be32_to_cpu(hp_elog->_drc_u.drc_index);
+ break;
+ case PSERIES_HP_ELOG_ID_DRC_IC:
+ hp_elog->_drc_u.ic.count =
+ be32_to_cpu(hp_elog->_drc_u.ic.count);
+ hp_elog->_drc_u.ic.index =
+ be32_to_cpu(hp_elog->_drc_u.ic.index);
+ }
+
+ switch (hp_elog->resource) {
+ case PSERIES_HP_ELOG_RESOURCE_MEM:
+ rc = dlpar_memory(hp_elog);
+ break;
+ case PSERIES_HP_ELOG_RESOURCE_CPU:
+ rc = dlpar_cpu(hp_elog);
+ break;
+ default:
+ pr_warn_ratelimited("Invalid resource (%d) specified\n",
+ hp_elog->resource);
+ rc = -EINVAL;
+ }
+
+ return rc;
+}
+
+static void pseries_hp_work_fn(struct work_struct *work)
+{
+ struct pseries_hp_work *hp_work =
+ container_of(work, struct pseries_hp_work, work);
+
+ if (hp_work->rc)
+ *(hp_work->rc) = handle_dlpar_errorlog(hp_work->errlog);
+ else
+ handle_dlpar_errorlog(hp_work->errlog);
+
+ if (hp_work->hp_completion)
+ complete(hp_work->hp_completion);
+
+ kfree(hp_work->errlog);
+ kfree((void *)work);
+}
+
+void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog,
+ struct completion *hotplug_done, int *rc)
+{
+ struct pseries_hp_work *work;
+ struct pseries_hp_errorlog *hp_errlog_copy;
+
+ hp_errlog_copy = kmalloc(sizeof(struct pseries_hp_errorlog),
+ GFP_KERNEL);
+ memcpy(hp_errlog_copy, hp_errlog, sizeof(struct pseries_hp_errorlog));
+
+ work = kmalloc(sizeof(struct pseries_hp_work), GFP_KERNEL);
+ if (work) {
+ INIT_WORK((struct work_struct *)work, pseries_hp_work_fn);
+ work->errlog = hp_errlog_copy;
+ work->hp_completion = hotplug_done;
+ work->rc = rc;
+ queue_work(pseries_hp_wq, (struct work_struct *)work);
+ } else {
+ *rc = -ENOMEM;
+ kfree(hp_errlog_copy);
+ complete(hotplug_done);
+ }
+}
+
+static int dlpar_parse_resource(char **cmd, struct pseries_hp_errorlog *hp_elog)
+{
+ char *arg;
+
+ arg = strsep(cmd, " ");
+ if (!arg)
+ return -EINVAL;
+
+ if (sysfs_streq(arg, "memory")) {
+ hp_elog->resource = PSERIES_HP_ELOG_RESOURCE_MEM;
+ } else if (sysfs_streq(arg, "cpu")) {
+ hp_elog->resource = PSERIES_HP_ELOG_RESOURCE_CPU;
+ } else {
+ pr_err("Invalid resource specified.\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int dlpar_parse_action(char **cmd, struct pseries_hp_errorlog *hp_elog)
+{
+ char *arg;
+
+ arg = strsep(cmd, " ");
+ if (!arg)
+ return -EINVAL;
+
+ if (sysfs_streq(arg, "add")) {
+ hp_elog->action = PSERIES_HP_ELOG_ACTION_ADD;
+ } else if (sysfs_streq(arg, "remove")) {
+ hp_elog->action = PSERIES_HP_ELOG_ACTION_REMOVE;
+ } else {
+ pr_err("Invalid action specified.\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int dlpar_parse_id_type(char **cmd, struct pseries_hp_errorlog *hp_elog)
+{
+ char *arg;
+ u32 count, index;
+
+ arg = strsep(cmd, " ");
+ if (!arg)
+ return -EINVAL;
+
+ if (sysfs_streq(arg, "indexed-count")) {
+ hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_IC;
+ arg = strsep(cmd, " ");
+ if (!arg) {
+ pr_err("No DRC count specified.\n");
+ return -EINVAL;
+ }
+
+ if (kstrtou32(arg, 0, &count)) {
+ pr_err("Invalid DRC count specified.\n");
+ return -EINVAL;
+ }
+
+ arg = strsep(cmd, " ");
+ if (!arg) {
+ pr_err("No DRC Index specified.\n");
+ return -EINVAL;
+ }
+
+ if (kstrtou32(arg, 0, &index)) {
+ pr_err("Invalid DRC Index specified.\n");
+ return -EINVAL;
+ }
+
+ hp_elog->_drc_u.ic.count = cpu_to_be32(count);
+ hp_elog->_drc_u.ic.index = cpu_to_be32(index);
+ } else if (sysfs_streq(arg, "index")) {
+ hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_INDEX;
+ arg = strsep(cmd, " ");
+ if (!arg) {
+ pr_err("No DRC Index specified.\n");
+ return -EINVAL;
+ }
+
+ if (kstrtou32(arg, 0, &index)) {
+ pr_err("Invalid DRC Index specified.\n");
+ return -EINVAL;
+ }
+
+ hp_elog->_drc_u.drc_index = cpu_to_be32(index);
+ } else if (sysfs_streq(arg, "count")) {
+ hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_COUNT;
+ arg = strsep(cmd, " ");
+ if (!arg) {
+ pr_err("No DRC count specified.\n");
+ return -EINVAL;
+ }
+
+ if (kstrtou32(arg, 0, &count)) {
+ pr_err("Invalid DRC count specified.\n");
+ return -EINVAL;
+ }
+
+ hp_elog->_drc_u.drc_count = cpu_to_be32(count);
+ } else {
+ pr_err("Invalid id_type specified.\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static ssize_t dlpar_store(struct class *class, struct class_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct pseries_hp_errorlog *hp_elog;
+ struct completion hotplug_done;
+ char *argbuf;
+ char *args;
+ int rc;
+
+ args = argbuf = kstrdup(buf, GFP_KERNEL);
+ hp_elog = kzalloc(sizeof(*hp_elog), GFP_KERNEL);
+ if (!hp_elog || !argbuf) {
+ pr_info("Could not allocate resources for DLPAR operation\n");
+ kfree(argbuf);
+ kfree(hp_elog);
+ return -ENOMEM;
+ }
+
+ /*
+ * Parse out the request from the user, this will be in the form:
+ * <resource> <action> <id_type> <id>
+ */
+ rc = dlpar_parse_resource(&args, hp_elog);
+ if (rc)
+ goto dlpar_store_out;
+
+ rc = dlpar_parse_action(&args, hp_elog);
+ if (rc)
+ goto dlpar_store_out;
+
+ rc = dlpar_parse_id_type(&args, hp_elog);
+ if (rc)
+ goto dlpar_store_out;
+
+ init_completion(&hotplug_done);
+ queue_hotplug_event(hp_elog, &hotplug_done, &rc);
+ wait_for_completion(&hotplug_done);
+
+dlpar_store_out:
+ kfree(argbuf);
+ kfree(hp_elog);
+
+ if (rc)
+ pr_err("Could not handle DLPAR request \"%s\"\n", buf);
+
+ return rc ? rc : count;
+}
+
+static ssize_t dlpar_show(struct class *class, struct class_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%s\n", "memory,cpu");
+}
+
+static CLASS_ATTR_RW(dlpar);
+
+int __init dlpar_workqueue_init(void)
+{
+ if (pseries_hp_wq)
+ return 0;
+
+ pseries_hp_wq = alloc_workqueue("pseries hotplug workqueue",
+ WQ_UNBOUND, 1);
+
+ return pseries_hp_wq ? 0 : -ENOMEM;
+}
+
+static int __init dlpar_sysfs_init(void)
+{
+ int rc;
+
+ rc = dlpar_workqueue_init();
+ if (rc)
+ return rc;
+
+ return sysfs_create_file(kernel_kobj, &class_attr_dlpar.attr);
+}
+machine_device_initcall(pseries, dlpar_sysfs_init);
+
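dlpar_store() above parses requests of the form <resource> <action> <id_type> <id> from the dlpar attribute created on kernel_kobj, i.e. /sys/kernel/dlpar. A hedged user-space sketch that submits one such request; "memory add count 1" is only an example and will succeed only on a platform with an available DRC:

#include <stdio.h>
#include <string.h>

int main(void)
{
        /* Request format parsed by dlpar_store(): <resource> <action> <id_type> <id> */
        const char *req = "memory add count 1";  /* example request */
        FILE *f = fopen("/sys/kernel/dlpar", "w");

        if (!f) {
                perror("/sys/kernel/dlpar");
                return 1;
        }
        if (fwrite(req, 1, strlen(req), f) != strlen(req))
                perror("write");
        fclose(f);      /* flushes the request to the sysfs store handler */
        return 0;
}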
diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c
new file mode 100644
index 000000000..ef6595153
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/dtl.c
@@ -0,0 +1,394 @@
+/*
+ * Virtual Processor Dispatch Trace Log
+ *
+ * (C) Copyright IBM Corporation 2009
+ *
+ * Author: Jeremy Kerr <jk@ozlabs.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <asm/smp.h>
+#include <linux/uaccess.h>
+#include <asm/firmware.h>
+#include <asm/lppaca.h>
+#include <asm/debugfs.h>
+#include <asm/plpar_wrappers.h>
+#include <asm/machdep.h>
+
+struct dtl {
+ struct dtl_entry *buf;
+ struct dentry *file;
+ int cpu;
+ int buf_entries;
+ u64 last_idx;
+ spinlock_t lock;
+};
+static DEFINE_PER_CPU(struct dtl, cpu_dtl);
+
+/*
+ * Dispatch trace log event mask:
+ * 0x7: 0x1: voluntary virtual processor waits
+ * 0x2: time-slice preempts
+ * 0x4: virtual partition memory page faults
+ */
+static u8 dtl_event_mask = 0x7;
+
+
+/*
+ * Size of per-cpu log buffers. Firmware requires that the buffer does
+ * not cross a 4k boundary.
+ */
+static int dtl_buf_entries = N_DISPATCH_LOG;
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+struct dtl_ring {
+ u64 write_index;
+ struct dtl_entry *write_ptr;
+ struct dtl_entry *buf;
+ struct dtl_entry *buf_end;
+ u8 saved_dtl_mask;
+};
+
+static DEFINE_PER_CPU(struct dtl_ring, dtl_rings);
+
+static atomic_t dtl_count;
+
+/*
+ * The cpu accounting code controls the DTL ring buffer, and we get
+ * given entries as they are processed.
+ */
+static void consume_dtle(struct dtl_entry *dtle, u64 index)
+{
+ struct dtl_ring *dtlr = this_cpu_ptr(&dtl_rings);
+ struct dtl_entry *wp = dtlr->write_ptr;
+ struct lppaca *vpa = local_paca->lppaca_ptr;
+
+ if (!wp)
+ return;
+
+ *wp = *dtle;
+ barrier();
+
+ /* check for hypervisor ring buffer overflow, ignore this entry if so */
+ if (index + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx))
+ return;
+
+ ++wp;
+ if (wp == dtlr->buf_end)
+ wp = dtlr->buf;
+ dtlr->write_ptr = wp;
+
+ /* incrementing write_index makes the new entry visible */
+ smp_wmb();
+ ++dtlr->write_index;
+}
+
+static int dtl_start(struct dtl *dtl)
+{
+ struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu);
+
+ dtlr->buf = dtl->buf;
+ dtlr->buf_end = dtl->buf + dtl->buf_entries;
+ dtlr->write_index = 0;
+
+ /* setting write_ptr enables logging into our buffer */
+ smp_wmb();
+ dtlr->write_ptr = dtl->buf;
+
+ /* enable event logging */
+ dtlr->saved_dtl_mask = lppaca_of(dtl->cpu).dtl_enable_mask;
+ lppaca_of(dtl->cpu).dtl_enable_mask |= dtl_event_mask;
+
+ dtl_consumer = consume_dtle;
+ atomic_inc(&dtl_count);
+ return 0;
+}
+
+static void dtl_stop(struct dtl *dtl)
+{
+ struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu);
+
+ dtlr->write_ptr = NULL;
+ smp_wmb();
+
+ dtlr->buf = NULL;
+
+ /* restore dtl_enable_mask */
+ lppaca_of(dtl->cpu).dtl_enable_mask = dtlr->saved_dtl_mask;
+
+ if (atomic_dec_and_test(&dtl_count))
+ dtl_consumer = NULL;
+}
+
+static u64 dtl_current_index(struct dtl *dtl)
+{
+ return per_cpu(dtl_rings, dtl->cpu).write_index;
+}
+
+#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+
+static int dtl_start(struct dtl *dtl)
+{
+ unsigned long addr;
+ int ret, hwcpu;
+
+ /* Register our dtl buffer with the hypervisor. The HV expects the
+ * buffer size to be passed in the second word of the buffer */
+ ((u32 *)dtl->buf)[1] = cpu_to_be32(DISPATCH_LOG_BYTES);
+
+ hwcpu = get_hard_smp_processor_id(dtl->cpu);
+ addr = __pa(dtl->buf);
+ ret = register_dtl(hwcpu, addr);
+ if (ret) {
+ printk(KERN_WARNING "%s: DTL registration for cpu %d (hw %d) "
+ "failed with %d\n", __func__, dtl->cpu, hwcpu, ret);
+ return -EIO;
+ }
+
+ /* set our initial buffer indices */
+ lppaca_of(dtl->cpu).dtl_idx = 0;
+
+ /* ensure that our updates to the lppaca fields have occurred before
+ * we actually enable the logging */
+ smp_wmb();
+
+ /* enable event logging */
+ lppaca_of(dtl->cpu).dtl_enable_mask = dtl_event_mask;
+
+ return 0;
+}
+
+static void dtl_stop(struct dtl *dtl)
+{
+ int hwcpu = get_hard_smp_processor_id(dtl->cpu);
+
+ lppaca_of(dtl->cpu).dtl_enable_mask = 0x0;
+
+ unregister_dtl(hwcpu);
+}
+
+static u64 dtl_current_index(struct dtl *dtl)
+{
+ return be64_to_cpu(lppaca_of(dtl->cpu).dtl_idx);
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+
+static int dtl_enable(struct dtl *dtl)
+{
+ long int n_entries;
+ long int rc;
+ struct dtl_entry *buf = NULL;
+
+ if (!dtl_cache)
+ return -ENOMEM;
+
+ /* only allow one reader */
+ if (dtl->buf)
+ return -EBUSY;
+
+ n_entries = dtl_buf_entries;
+ buf = kmem_cache_alloc_node(dtl_cache, GFP_KERNEL, cpu_to_node(dtl->cpu));
+ if (!buf) {
+ printk(KERN_WARNING "%s: buffer alloc failed for cpu %d\n",
+ __func__, dtl->cpu);
+ return -ENOMEM;
+ }
+
+ spin_lock(&dtl->lock);
+ rc = -EBUSY;
+ if (!dtl->buf) {
+ /* store the original allocation size for use during read */
+ dtl->buf_entries = n_entries;
+ dtl->buf = buf;
+ dtl->last_idx = 0;
+ rc = dtl_start(dtl);
+ if (rc)
+ dtl->buf = NULL;
+ }
+ spin_unlock(&dtl->lock);
+
+ if (rc)
+ kmem_cache_free(dtl_cache, buf);
+ return rc;
+}
+
+static void dtl_disable(struct dtl *dtl)
+{
+ spin_lock(&dtl->lock);
+ dtl_stop(dtl);
+ kmem_cache_free(dtl_cache, dtl->buf);
+ dtl->buf = NULL;
+ dtl->buf_entries = 0;
+ spin_unlock(&dtl->lock);
+}
+
+/* file interface */
+
+static int dtl_file_open(struct inode *inode, struct file *filp)
+{
+ struct dtl *dtl = inode->i_private;
+ int rc;
+
+ rc = dtl_enable(dtl);
+ if (rc)
+ return rc;
+
+ filp->private_data = dtl;
+ return 0;
+}
+
+static int dtl_file_release(struct inode *inode, struct file *filp)
+{
+ struct dtl *dtl = inode->i_private;
+ dtl_disable(dtl);
+ return 0;
+}
+
+static ssize_t dtl_file_read(struct file *filp, char __user *buf, size_t len,
+ loff_t *pos)
+{
+ long int rc, n_read, n_req, read_size;
+ struct dtl *dtl;
+ u64 cur_idx, last_idx, i;
+
+ if ((len % sizeof(struct dtl_entry)) != 0)
+ return -EINVAL;
+
+ dtl = filp->private_data;
+
+ /* requested number of entries to read */
+ n_req = len / sizeof(struct dtl_entry);
+
+ /* actual number of entries read */
+ n_read = 0;
+
+ spin_lock(&dtl->lock);
+
+ cur_idx = dtl_current_index(dtl);
+ last_idx = dtl->last_idx;
+
+ if (last_idx + dtl->buf_entries <= cur_idx)
+ last_idx = cur_idx - dtl->buf_entries + 1;
+
+ if (last_idx + n_req > cur_idx)
+ n_req = cur_idx - last_idx;
+
+ if (n_req > 0)
+ dtl->last_idx = last_idx + n_req;
+
+ spin_unlock(&dtl->lock);
+
+ if (n_req <= 0)
+ return 0;
+
+ i = last_idx % dtl->buf_entries;
+
+ /* read the tail of the buffer if we've wrapped */
+ if (i + n_req > dtl->buf_entries) {
+ read_size = dtl->buf_entries - i;
+
+ rc = copy_to_user(buf, &dtl->buf[i],
+ read_size * sizeof(struct dtl_entry));
+ if (rc)
+ return -EFAULT;
+
+ i = 0;
+ n_req -= read_size;
+ n_read += read_size;
+ buf += read_size * sizeof(struct dtl_entry);
+ }
+
+ /* .. and now the head */
+ rc = copy_to_user(buf, &dtl->buf[i], n_req * sizeof(struct dtl_entry));
+ if (rc)
+ return -EFAULT;
+
+ n_read += n_req;
+
+ return n_read * sizeof(struct dtl_entry);
+}
+
+static const struct file_operations dtl_fops = {
+ .open = dtl_file_open,
+ .release = dtl_file_release,
+ .read = dtl_file_read,
+ .llseek = no_llseek,
+};
+
+static struct dentry *dtl_dir;
+
+static int dtl_setup_file(struct dtl *dtl)
+{
+ char name[10];
+
+ sprintf(name, "cpu-%d", dtl->cpu);
+
+ dtl->file = debugfs_create_file(name, 0400, dtl_dir, dtl, &dtl_fops);
+ if (!dtl->file)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int dtl_init(void)
+{
+ struct dentry *event_mask_file, *buf_entries_file;
+ int rc, i;
+
+ if (!firmware_has_feature(FW_FEATURE_SPLPAR))
+ return -ENODEV;
+
+ /* set up common debugfs structure */
+
+ rc = -ENOMEM;
+ dtl_dir = debugfs_create_dir("dtl", powerpc_debugfs_root);
+ if (!dtl_dir) {
+ printk(KERN_WARNING "%s: can't create dtl root dir\n",
+ __func__);
+ goto err;
+ }
+
+ event_mask_file = debugfs_create_x8("dtl_event_mask", 0600,
+ dtl_dir, &dtl_event_mask);
+ buf_entries_file = debugfs_create_u32("dtl_buf_entries", 0400,
+ dtl_dir, &dtl_buf_entries);
+
+ if (!event_mask_file || !buf_entries_file) {
+ printk(KERN_WARNING "%s: can't create dtl files\n", __func__);
+ goto err_remove_dir;
+ }
+
+ /* set up the per-cpu log structures */
+ for_each_possible_cpu(i) {
+ struct dtl *dtl = &per_cpu(cpu_dtl, i);
+ spin_lock_init(&dtl->lock);
+ dtl->cpu = i;
+
+ rc = dtl_setup_file(dtl);
+ if (rc)
+ goto err_remove_dir;
+ }
+
+ return 0;
+
+err_remove_dir:
+ debugfs_remove_recursive(dtl_dir);
+err:
+ return rc;
+}
+machine_arch_initcall(pseries, dtl_init);
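dtl.c exposes one debugfs file per possible CPU, named cpu-N, under the dtl directory in the powerpc debugfs root; opening a file enables logging, and dtl_file_read() only accepts read lengths that are a multiple of sizeof(struct dtl_entry). A user-space sketch under two assumptions: debugfs is mounted at /sys/kernel/debug, and the dispatch trace log entry size is 48 bytes:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define DTL_ENTRY_SIZE 48       /* assumed sizeof(struct dtl_entry); read sizes must be a multiple of it */

int main(void)
{
        /* Opening the per-cpu file allocates the buffer and enables logging */
        int fd = open("/sys/kernel/debug/powerpc/dtl/cpu-0", O_RDONLY);
        unsigned char buf[16 * DTL_ENTRY_SIZE];
        ssize_t n;
        long entries = 0;

        if (fd < 0) {
                perror("dtl cpu-0");
                return 1;
        }
        /* read() returns 0 once no new entries are available */
        while ((n = read(fd, buf, sizeof(buf))) > 0)
                entries += n / DTL_ENTRY_SIZE;
        close(fd);
        printf("read %ld entries\n", entries);
        return 0;
}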
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
new file mode 100644
index 000000000..823cb27ef
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -0,0 +1,884 @@
+/*
+ * This file implements the platform-dependent EEH operations on pseries.
+ * The pseries platform is built heavily on RTAS, so its platform-dependent
+ * EEH operations are implemented as RTAS calls. The functions are derived
+ * from arch/powerpc/platforms/pseries/eeh.c, with the necessary cleanup
+ * applied.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2011.
+ * Copyright IBM Corporation 2001, 2005, 2006
+ * Copyright Dave Engebretsen & Todd Inglett 2001
+ * Copyright Linas Vepstas 2005, 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/atomic.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/of.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/io.h>
+#include <asm/machdep.h>
+#include <asm/ppc-pci.h>
+#include <asm/rtas.h>
+
+/* RTAS tokens */
+static int ibm_set_eeh_option;
+static int ibm_set_slot_reset;
+static int ibm_read_slot_reset_state;
+static int ibm_read_slot_reset_state2;
+static int ibm_slot_error_detail;
+static int ibm_get_config_addr_info;
+static int ibm_get_config_addr_info2;
+static int ibm_configure_pe;
+
+#ifdef CONFIG_PCI_IOV
+void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
+{
+ struct pci_dn *pdn = pci_get_pdn(pdev);
+ struct pci_dn *physfn_pdn;
+ struct eeh_dev *edev;
+
+ if (!pdev->is_virtfn)
+ return;
+
+ pdn->device_id = pdev->device;
+ pdn->vendor_id = pdev->vendor;
+ pdn->class_code = pdev->class;
+ /*
+ * The last allow-unfreeze return code is kept here so that user
+ * space can retrieve it through eeh-sysfs and see how the last
+ * command completed on the platform.
+ */
+ pdn->last_allow_rc = 0;
+ physfn_pdn = pci_get_pdn(pdev->physfn);
+ pdn->pe_number = physfn_pdn->pe_num_map[pdn->vf_index];
+ edev = pdn_to_eeh_dev(pdn);
+
+ /*
+ * The following operations will fail if VF's sysfs files
+ * aren't created or its resources aren't finalized.
+ */
+ eeh_add_device_early(pdn);
+ eeh_add_device_late(pdev);
+ edev->pe_config_addr = (pdn->busno << 16) | (pdn->devfn << 8);
+ eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */
+ eeh_add_to_parent_pe(edev); /* Add as VF PE type */
+ eeh_sysfs_add_device(pdev);
+
+}
+#endif
+
+/*
+ * Buffer for reporting slot-error-detail rtas calls. It's here
+ * in BSS, and not dynamically allocated, so that it ends up in
+ * RMO where RTAS can access it.
+ */
+static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
+static DEFINE_SPINLOCK(slot_errbuf_lock);
+static int eeh_error_buf_size;
+
+/**
+ * pseries_eeh_init - EEH platform dependent initialization
+ *
+ * EEH platform dependent initialization on pseries.
+ */
+static int pseries_eeh_init(void)
+{
+ /* figure out EEH RTAS function call tokens */
+ ibm_set_eeh_option = rtas_token("ibm,set-eeh-option");
+ ibm_set_slot_reset = rtas_token("ibm,set-slot-reset");
+ ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2");
+ ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state");
+ ibm_slot_error_detail = rtas_token("ibm,slot-error-detail");
+ ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2");
+ ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info");
+ ibm_configure_pe = rtas_token("ibm,configure-pe");
+
+ /*
+ * ibm,configure-pe and ibm,configure-bridge have the same semantics,
+ * however ibm,configure-pe can be faster. If we can't find
+ * ibm,configure-pe then fall back to using ibm,configure-bridge.
+ */
+ if (ibm_configure_pe == RTAS_UNKNOWN_SERVICE)
+ ibm_configure_pe = rtas_token("ibm,configure-bridge");
+
+ /*
+ * Necessary sanity check. We needn't check "get-config-addr-info"
+ * and its variant since old firmware probably supports addressing
+ * by domain/bus/slot/function for EEH RTAS operations.
+ */
+ if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE ||
+ ibm_set_slot_reset == RTAS_UNKNOWN_SERVICE ||
+ (ibm_read_slot_reset_state2 == RTAS_UNKNOWN_SERVICE &&
+ ibm_read_slot_reset_state == RTAS_UNKNOWN_SERVICE) ||
+ ibm_slot_error_detail == RTAS_UNKNOWN_SERVICE ||
+ ibm_configure_pe == RTAS_UNKNOWN_SERVICE) {
+ pr_info("EEH functionality not supported\n");
+ return -EINVAL;
+ }
+
+ /* Initialize error log lock and size */
+ spin_lock_init(&slot_errbuf_lock);
+ eeh_error_buf_size = rtas_token("rtas-error-log-max");
+ if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) {
+ pr_info("%s: unknown EEH error log size\n",
+ __func__);
+ eeh_error_buf_size = 1024;
+ } else if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) {
+ pr_info("%s: EEH error log size %d exceeds the maximal %d\n",
+ __func__, eeh_error_buf_size, RTAS_ERROR_LOG_MAX);
+ eeh_error_buf_size = RTAS_ERROR_LOG_MAX;
+ }
+
+ /* Set EEH probe mode */
+ eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG);
+
+#ifdef CONFIG_PCI_IOV
+ /* Set EEH machine dependent code */
+ ppc_md.pcibios_bus_add_device = pseries_pcibios_bus_add_device;
+#endif
+
+ return 0;
+}
+
+static int pseries_eeh_cap_start(struct pci_dn *pdn)
+{
+ u32 status;
+
+ if (!pdn)
+ return 0;
+
+ rtas_read_config(pdn, PCI_STATUS, 2, &status);
+ if (!(status & PCI_STATUS_CAP_LIST))
+ return 0;
+
+ return PCI_CAPABILITY_LIST;
+}
+
+
+static int pseries_eeh_find_cap(struct pci_dn *pdn, int cap)
+{
+ int pos = pseries_eeh_cap_start(pdn);
+ int cnt = 48; /* Maximal number of capabilities */
+ u32 id;
+
+ if (!pos)
+ return 0;
+
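+ /*
+ * Walk the standard capability list via RTAS config reads: each
+ * step reads the next-capability pointer, masks off the low bits,
+ * then checks the capability ID, stopping on out-of-range
+ * pointers (< 0x40) or an ID of 0xff.
+ */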
+ while (cnt--) {
+ rtas_read_config(pdn, pos, 1, &pos);
+ if (pos < 0x40)
+ break;
+ pos &= ~3;
+ rtas_read_config(pdn, pos + PCI_CAP_LIST_ID, 1, &id);
+ if (id == 0xff)
+ break;
+ if (id == cap)
+ return pos;
+ pos += PCI_CAP_LIST_NEXT;
+ }
+
+ return 0;
+}
+
+static int pseries_eeh_find_ecap(struct pci_dn *pdn, int cap)
+{
+ struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+ u32 header;
+ int pos = 256;
+ int ttl = (4096 - 256) / 8;
+
+ if (!edev || !edev->pcie_cap)
+ return 0;
+ if (rtas_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL)
+ return 0;
+ else if (!header)
+ return 0;
+
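+ /*
+ * Walk the PCIe extended capability chain starting at offset
+ * 0x100: each 4-byte header yields the capability ID and the
+ * offset of the next entry, and ttl bounds the walk so a
+ * corrupted chain cannot loop forever.
+ */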
+ while (ttl-- > 0) {
+ if (PCI_EXT_CAP_ID(header) == cap && pos)
+ return pos;
+
+ pos = PCI_EXT_CAP_NEXT(header);
+ if (pos < 256)
+ break;
+
+ if (rtas_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL)
+ break;
+ }
+
+ return 0;
+}
+
+/**
+ * pseries_eeh_probe - EEH probe on the given device
+ * @pdn: PCI device node
+ * @data: Unused
+ *
+ * When the EEH module is installed during system boot, all PCI
+ * devices are checked one by one to see if they support EEH. This
+ * function is introduced for that purpose.
+ */
+static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
+{
+ struct eeh_dev *edev;
+ struct eeh_pe pe;
+ u32 pcie_flags;
+ int enable = 0;
+ int ret;
+
+ /* Retrieve OF node and eeh device */
+ edev = pdn_to_eeh_dev(pdn);
+ if (!edev || edev->pe)
+ return NULL;
+
+ /* Check class/vendor/device IDs */
+ if (!pdn->vendor_id || !pdn->device_id || !pdn->class_code)
+ return NULL;
+
+ /* Skip for PCI-ISA bridge */
+ if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA)
+ return NULL;
+
+ /*
+ * Update the class code and mode of the eeh device. We need
+ * to correctly reflect whether the current device is a root
+ * port or a PCIe switch downstream port.
+ */
+ edev->class_code = pdn->class_code;
+ edev->pcix_cap = pseries_eeh_find_cap(pdn, PCI_CAP_ID_PCIX);
+ edev->pcie_cap = pseries_eeh_find_cap(pdn, PCI_CAP_ID_EXP);
+ edev->aer_cap = pseries_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR);
+ edev->mode &= 0xFFFFFF00;
+ if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) {
+ edev->mode |= EEH_DEV_BRIDGE;
+ if (edev->pcie_cap) {
+ rtas_read_config(pdn, edev->pcie_cap + PCI_EXP_FLAGS,
+ 2, &pcie_flags);
+ pcie_flags = (pcie_flags & PCI_EXP_FLAGS_TYPE) >> 4;
+ if (pcie_flags == PCI_EXP_TYPE_ROOT_PORT)
+ edev->mode |= EEH_DEV_ROOT_PORT;
+ else if (pcie_flags == PCI_EXP_TYPE_DOWNSTREAM)
+ edev->mode |= EEH_DEV_DS_PORT;
+ }
+ }
+
+ /* Initialize the fake PE */
+ memset(&pe, 0, sizeof(struct eeh_pe));
+ pe.phb = pdn->phb;
+ pe.config_addr = (pdn->busno << 16) | (pdn->devfn << 8);
+
+ /* Enable EEH on the device */
+ ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE);
+ if (!ret) {
+ /* Retrieve PE address */
+ edev->pe_config_addr = eeh_ops->get_pe_addr(&pe);
+ pe.addr = edev->pe_config_addr;
+
+ /* Some older systems (Power4) allow the ibm,set-eeh-option
+ * call to succeed even on nodes where EEH is not supported.
+ * Verify support explicitly.
+ */
+ ret = eeh_ops->get_state(&pe, NULL);
+ if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT)
+ enable = 1;
+
+ if (enable) {
+ eeh_add_flag(EEH_ENABLED);
+ eeh_add_to_parent_pe(edev);
+
+ pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%x-PE#%x\n",
+ __func__, pdn->busno, PCI_SLOT(pdn->devfn),
+ PCI_FUNC(pdn->devfn), pe.phb->global_number,
+ pe.addr);
+ } else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) &&
+ (pdn_to_eeh_dev(pdn->parent))->pe) {
+ /* This device doesn't support EEH, but it may have an
+ * EEH parent, in which case we mark it as supported.
+ */
+ edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr;
+ eeh_add_to_parent_pe(edev);
+ }
+ }
+
+ /* Save memory bars */
+ eeh_save_bars(edev);
+
+ return NULL;
+}
+
+/**
+ * pseries_eeh_set_option - Initialize EEH or MMIO/DMA reenable
+ * @pe: EEH PE
+ * @option: operation to be issued
+ *
+ * The function is used to control the EEH functionality globally.
+ * Currently, the following options are supported according to PAPR:
+ * Enable EEH, Disable EEH, Enable MMIO and Enable DMA
+ */
+static int pseries_eeh_set_option(struct eeh_pe *pe, int option)
+{
+ int ret = 0;
+ int config_addr;
+
+ /*
+ * When we're enabling or disabling EEH functionality on
+ * a particular PE, the PE config address is possibly
+ * unavailable. Therefore, we have to figure it out from
+ * the FDT node.
+ */
+ switch (option) {
+ case EEH_OPT_DISABLE:
+ case EEH_OPT_ENABLE:
+ case EEH_OPT_THAW_MMIO:
+ case EEH_OPT_THAW_DMA:
+ config_addr = pe->config_addr;
+ if (pe->addr)
+ config_addr = pe->addr;
+ break;
+ case EEH_OPT_FREEZE_PE:
+ /* Not supported */
+ return 0;
+ default:
+ pr_err("%s: Invalid option %d\n",
+ __func__, option);
+ return -EINVAL;
+ }
+
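+ /*
+ * ibm,set-eeh-option takes the PE (or config) address, the PHB
+ * BUID split into its high and low 32-bit halves, and the
+ * requested option.
+ */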
+ ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL,
+ config_addr, BUID_HI(pe->phb->buid),
+ BUID_LO(pe->phb->buid), option);
+
+ return ret;
+}
+
+/**
+ * pseries_eeh_get_pe_addr - Retrieve PE address
+ * @pe: EEH PE
+ *
+ * Retrieve the associated PE address. There are two RTAS calls
+ * dedicated to this purpose: we try the new one first and then
+ * fall back to the old one. Besides, make sure the config address
+ * has been figured out from the FDT node before calling the
+ * function.
+ *
+ * Note that a return value of zero means an invalid PE config
+ * address.
+ */
+static int pseries_eeh_get_pe_addr(struct eeh_pe *pe)
+{
+ int ret = 0;
+ int rets[3];
+
+ if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
+ /*
+ * First of all, we need to make sure there is one PE
+ * associated with the device. Otherwise, the PE address
+ * is meaningless.
+ */
+ ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
+ pe->config_addr, BUID_HI(pe->phb->buid),
+ BUID_LO(pe->phb->buid), 1);
+ if (ret || (rets[0] == 0))
+ return 0;
+
+ /* Retrieve the associated PE config address */
+ ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
+ pe->config_addr, BUID_HI(pe->phb->buid),
+ BUID_LO(pe->phb->buid), 0);
+ if (ret) {
+ pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n",
+ __func__, pe->phb->global_number, pe->config_addr);
+ return 0;
+ }
+
+ return rets[0];
+ }
+
+ if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) {
+ ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets,
+ pe->config_addr, BUID_HI(pe->phb->buid),
+ BUID_LO(pe->phb->buid), 0);
+ if (ret) {
+ pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n",
+ __func__, pe->phb->global_number, pe->config_addr);
+ return 0;
+ }
+
+ return rets[0];
+ }
+
+ return ret;
+}
+
+/**
+ * pseries_eeh_get_state - Retrieve PE state
+ * @pe: EEH PE
+ * @state: return value
+ *
+ * Retrieve the state of the specified PE. On RTAS-compliant
+ * pseries platforms there is a dedicated RTAS function for the
+ * purpose. Note that the associated PE address might already be
+ * available when the function is called, so use it in preference
+ * to the config address where possible. Furthermore, there are two
+ * RTAS calls for the purpose; try the new one first and fall back
+ * to the old one if the new one doesn't work properly.
+ */
+static int pseries_eeh_get_state(struct eeh_pe *pe, int *state)
+{
+ int config_addr;
+ int ret;
+ int rets[4];
+ int result;
+
+ /* Figure out PE config address if possible */
+ config_addr = pe->config_addr;
+ if (pe->addr)
+ config_addr = pe->addr;
+
+ if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) {
+ ret = rtas_call(ibm_read_slot_reset_state2, 3, 4, rets,
+ config_addr, BUID_HI(pe->phb->buid),
+ BUID_LO(pe->phb->buid));
+ } else if (ibm_read_slot_reset_state != RTAS_UNKNOWN_SERVICE) {
+ /* Fake PE unavailable info */
+ rets[2] = 0;
+ ret = rtas_call(ibm_read_slot_reset_state, 3, 3, rets,
+ config_addr, BUID_HI(pe->phb->buid),
+ BUID_LO(pe->phb->buid));
+ } else {
+ return EEH_STATE_NOT_SUPPORT;
+ }
+
+ if (ret)
+ return ret;
+
+ /* Parse the result out */
+ if (!rets[1])
+ return EEH_STATE_NOT_SUPPORT;
+
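+ /*
+ * rets[0] carries the PE state reported by firmware: 0 - MMIO and
+ * DMA active, 1 - reset active, 2 - neither MMIO nor DMA active,
+ * 4 - MMIO enabled only, 5 - temporarily unavailable, with rets[2]
+ * holding the suggested wait time used by pseries_eeh_wait_state().
+ */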
+ switch(rets[0]) {
+ case 0:
+ result = EEH_STATE_MMIO_ACTIVE |
+ EEH_STATE_DMA_ACTIVE;
+ break;
+ case 1:
+ result = EEH_STATE_RESET_ACTIVE |
+ EEH_STATE_MMIO_ACTIVE |
+ EEH_STATE_DMA_ACTIVE;
+ break;
+ case 2:
+ result = 0;
+ break;
+ case 4:
+ result = EEH_STATE_MMIO_ENABLED;
+ break;
+ case 5:
+ if (rets[2]) {
+ if (state) *state = rets[2];
+ result = EEH_STATE_UNAVAILABLE;
+ } else {
+ result = EEH_STATE_NOT_SUPPORT;
+ }
+ break;
+ default:
+ result = EEH_STATE_NOT_SUPPORT;
+ }
+
+ return result;
+}
+
+/**
+ * pseries_eeh_reset - Reset the specified PE
+ * @pe: EEH PE
+ * @option: reset option
+ *
+ * Reset the specified PE
+ */
+static int pseries_eeh_reset(struct eeh_pe *pe, int option)
+{
+ int config_addr;
+ int ret;
+
+ /* Figure out PE address */
+ config_addr = pe->config_addr;
+ if (pe->addr)
+ config_addr = pe->addr;
+
+ /* Reset PE through RTAS call */
+ ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
+ config_addr, BUID_HI(pe->phb->buid),
+ BUID_LO(pe->phb->buid), option);
+
+ /* If fundamental-reset not supported, try hot-reset */
+ if (option == EEH_RESET_FUNDAMENTAL &&
+ ret == -8) {
+ option = EEH_RESET_HOT;
+ ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
+ config_addr, BUID_HI(pe->phb->buid),
+ BUID_LO(pe->phb->buid), option);
+ }
+
+ /* We need reset hold or settlement delay */
+ if (option == EEH_RESET_FUNDAMENTAL ||
+ option == EEH_RESET_HOT)
+ msleep(EEH_PE_RST_HOLD_TIME);
+ else
+ msleep(EEH_PE_RST_SETTLE_TIME);
+
+ return ret;
+}
+
+/**
+ * pseries_eeh_wait_state - Wait for PE state
+ * @pe: EEH PE
+ * @max_wait: maximal period in milliseconds
+ *
+ * Wait for the state of the associated PE. It might take some
+ * time to retrieve the PE's state.
+ */
+static int pseries_eeh_wait_state(struct eeh_pe *pe, int max_wait)
+{
+ int ret;
+ int mwait;
+
+ /*
+ * According to PAPR, the state of a PE might be temporarily
+ * unavailable. In that case we have to wait for the interval
+ * indicated by firmware. The maximum wait time is 5 minutes,
+ * which is taken from the original EEH implementation. The
+ * original implementation also defined the minimum wait time
+ * as 1 second.
+ */
+#define EEH_STATE_MIN_WAIT_TIME (1000)
+#define EEH_STATE_MAX_WAIT_TIME (300 * 1000)
+
+ while (1) {
+ ret = pseries_eeh_get_state(pe, &mwait);
+
+ /*
+ * If the PE's state is temporarily unavailable,
+ * we have to wait for the specified time. Otherwise,
+ * the PE's state will be returned immediately.
+ */
+ if (ret != EEH_STATE_UNAVAILABLE)
+ return ret;
+
+ if (max_wait <= 0) {
+ pr_warn("%s: Timeout when getting PE's state (%d)\n",
+ __func__, max_wait);
+ return EEH_STATE_NOT_SUPPORT;
+ }
+
+ if (mwait <= 0) {
+ pr_warn("%s: Firmware returned bad wait value %d\n",
+ __func__, mwait);
+ mwait = EEH_STATE_MIN_WAIT_TIME;
+ } else if (mwait > EEH_STATE_MAX_WAIT_TIME) {
+ pr_warn("%s: Firmware returned too long wait value %d\n",
+ __func__, mwait);
+ mwait = EEH_STATE_MAX_WAIT_TIME;
+ }
+
+ max_wait -= mwait;
+ msleep(mwait);
+ }
+
+ return EEH_STATE_NOT_SUPPORT;
+}
+
+/**
+ * pseries_eeh_get_log - Retrieve error log
+ * @pe: EEH PE
+ * @severity: temporary or permanent error log
+ * @drv_log: driver log to be combined with retrieved error log
+ * @len: length of driver log
+ *
+ * Retrieve the temporary or permanent error log from the PE.
+ * The log is actually retrieved through the dedicated RTAS
+ * call.
+ */
+static int pseries_eeh_get_log(struct eeh_pe *pe, int severity, char *drv_log, unsigned long len)
+{
+ int config_addr;
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&slot_errbuf_lock, flags);
+ memset(slot_errbuf, 0, eeh_error_buf_size);
+
+ /* Figure out the PE address */
+ config_addr = pe->config_addr;
+ if (pe->addr)
+ config_addr = pe->addr;
+
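+ /*
+ * ibm,slot-error-detail writes the platform error log into
+ * slot_errbuf (kept in the RMO so RTAS can reach it) and is
+ * passed the caller's driver log so firmware can combine the two.
+ */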
+ ret = rtas_call(ibm_slot_error_detail, 8, 1, NULL, config_addr,
+ BUID_HI(pe->phb->buid), BUID_LO(pe->phb->buid),
+ virt_to_phys(drv_log), len,
+ virt_to_phys(slot_errbuf), eeh_error_buf_size,
+ severity);
+ if (!ret)
+ log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
+ spin_unlock_irqrestore(&slot_errbuf_lock, flags);
+
+ return ret;
+}
+
+/**
+ * pseries_eeh_configure_bridge - Configure PCI bridges in the indicated PE
+ * @pe: EEH PE
+ *
+ * The function will be called to reconfigure the bridges included
+ * in the specified PE so that the malfunctioning PE can be
+ * recovered.
+ */
+static int pseries_eeh_configure_bridge(struct eeh_pe *pe)
+{
+ int config_addr;
+ int ret;
+ /* Wait at most 0.2s before skipping configuration */
+ int max_wait = 200;
+
+ /* Figure out the PE address */
+ config_addr = pe->config_addr;
+ if (pe->addr)
+ config_addr = pe->addr;
+
+ while (max_wait > 0) {
+ ret = rtas_call(ibm_configure_pe, 3, 1, NULL,
+ config_addr, BUID_HI(pe->phb->buid),
+ BUID_LO(pe->phb->buid));
+
+ if (!ret)
+ return ret;
+
+ /*
+ * If RTAS returns a delay value that's above 100ms, cut it
+ * down to 100ms in case firmware made a mistake. For more
+ * on how these delay values work see rtas_busy_delay_time
+ */
+ if (ret > RTAS_EXTENDED_DELAY_MIN+2 &&
+ ret <= RTAS_EXTENDED_DELAY_MAX)
+ ret = RTAS_EXTENDED_DELAY_MIN+2;
+
+ max_wait -= rtas_busy_delay_time(ret);
+
+ if (max_wait < 0)
+ break;
+
+ rtas_busy_delay(ret);
+ }
+
+ pr_warn("%s: Unable to configure bridge PHB#%x-PE#%x (%d)\n",
+ __func__, pe->phb->global_number, pe->addr, ret);
+ return ret;
+}
+
+/**
+ * pseries_eeh_read_config - Read PCI config space
+ * @pdn: PCI device node
+ * @where: PCI address
+ * @size: size to read
+ * @val: return value
+ *
+ * Read config space from the specified device
+ */
+static int pseries_eeh_read_config(struct pci_dn *pdn, int where, int size, u32 *val)
+{
+ return rtas_read_config(pdn, where, size, val);
+}
+
+/**
+ * pseries_eeh_write_config - Write PCI config space
+ * @pdn: PCI device node
+ * @where: PCI address
+ * @size: size to write
+ * @val: value to be written
+ *
+ * Write config space to the specified device
+ */
+static int pseries_eeh_write_config(struct pci_dn *pdn, int where, int size, u32 val)
+{
+ return rtas_write_config(pdn, where, size, val);
+}
+
+static int pseries_eeh_restore_config(struct pci_dn *pdn)
+{
+ struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+ s64 ret = 0;
+
+ if (!edev)
+ return -EEXIST;
+
+ /*
+ * FIXME: The MPS, error routing rules and timeout settings are
+ * worth exporting by firmware in an extensible way.
+ */
+ if (edev->physfn)
+ ret = eeh_restore_vf_config(pdn);
+
+ if (ret) {
+ pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n",
+ __func__, edev->pe_config_addr, ret);
+ return -EIO;
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_PCI_IOV
+int pseries_send_allow_unfreeze(struct pci_dn *pdn,
+ u16 *vf_pe_array, int cur_vfs)
+{
+ int rc;
+ int ibm_allow_unfreeze = rtas_token("ibm,open-sriov-allow-unfreeze");
+ unsigned long buid, addr;
+
+ addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
+ buid = pdn->phb->buid;
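+ /*
+ * rtas_data_buf is a shared buffer that RTAS can address; hold
+ * rtas_data_buf_lock while the VF PE array is staged there and
+ * the call is made.
+ */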
+ spin_lock(&rtas_data_buf_lock);
+ memcpy(rtas_data_buf, vf_pe_array, RTAS_DATA_BUF_SIZE);
+ rc = rtas_call(ibm_allow_unfreeze, 5, 1, NULL,
+ addr,
+ BUID_HI(buid),
+ BUID_LO(buid),
+ rtas_data_buf, cur_vfs * sizeof(u16));
+ spin_unlock(&rtas_data_buf_lock);
+ if (rc)
+ pr_warn("%s: Failed to allow unfreeze for PHB#%x-PE#%lx, rc=%x\n",
+ __func__,
+ pdn->phb->global_number, addr, rc);
+ return rc;
+}
+
+static int pseries_call_allow_unfreeze(struct eeh_dev *edev)
+{
+ struct pci_dn *pdn, *tmp, *parent, *physfn_pdn;
+ int cur_vfs = 0, rc = 0, vf_index, bus, devfn;
+ u16 *vf_pe_array;
+
+ vf_pe_array = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+ if (!vf_pe_array)
+ return -ENOMEM;
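+ /*
+ * For a PF, report every VF PE number in a single call; for a
+ * VF, send only its own PE number via the parent PF's pci_dn.
+ */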
+ if (pci_num_vf(edev->physfn ? edev->physfn : edev->pdev)) {
+ if (edev->pdev->is_physfn) {
+ cur_vfs = pci_num_vf(edev->pdev);
+ pdn = eeh_dev_to_pdn(edev);
+ parent = pdn->parent;
+ for (vf_index = 0; vf_index < cur_vfs; vf_index++)
+ vf_pe_array[vf_index] =
+ cpu_to_be16(pdn->pe_num_map[vf_index]);
+ rc = pseries_send_allow_unfreeze(pdn, vf_pe_array,
+ cur_vfs);
+ pdn->last_allow_rc = rc;
+ for (vf_index = 0; vf_index < cur_vfs; vf_index++) {
+ list_for_each_entry_safe(pdn, tmp,
+ &parent->child_list,
+ list) {
+ bus = pci_iov_virtfn_bus(edev->pdev,
+ vf_index);
+ devfn = pci_iov_virtfn_devfn(edev->pdev,
+ vf_index);
+ if (pdn->busno != bus ||
+ pdn->devfn != devfn)
+ continue;
+ pdn->last_allow_rc = rc;
+ }
+ }
+ } else {
+ pdn = pci_get_pdn(edev->pdev);
+ vf_pe_array[0] = cpu_to_be16(pdn->pe_number);
+ physfn_pdn = pci_get_pdn(edev->physfn);
+ rc = pseries_send_allow_unfreeze(physfn_pdn,
+ vf_pe_array, 1);
+ pdn->last_allow_rc = rc;
+ }
+ }
+
+ kfree(vf_pe_array);
+ return rc;
+}
+
+static int pseries_notify_resume(struct pci_dn *pdn)
+{
+ struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+
+ if (!edev)
+ return -EEXIST;
+
+ if (rtas_token("ibm,open-sriov-allow-unfreeze")
+ == RTAS_UNKNOWN_SERVICE)
+ return -EINVAL;
+
+ if (edev->pdev->is_physfn || edev->pdev->is_virtfn)
+ return pseries_call_allow_unfreeze(edev);
+
+ return 0;
+}
+#endif
+
+static struct eeh_ops pseries_eeh_ops = {
+ .name = "pseries",
+ .init = pseries_eeh_init,
+ .probe = pseries_eeh_probe,
+ .set_option = pseries_eeh_set_option,
+ .get_pe_addr = pseries_eeh_get_pe_addr,
+ .get_state = pseries_eeh_get_state,
+ .reset = pseries_eeh_reset,
+ .wait_state = pseries_eeh_wait_state,
+ .get_log = pseries_eeh_get_log,
+ .configure_bridge = pseries_eeh_configure_bridge,
+ .err_inject = NULL,
+ .read_config = pseries_eeh_read_config,
+ .write_config = pseries_eeh_write_config,
+ .next_error = NULL,
+ .restore_config = pseries_eeh_restore_config,
+#ifdef CONFIG_PCI_IOV
+ .notify_resume = pseries_notify_resume
+#endif
+};
+
+/**
+ * eeh_pseries_init - Register platform dependent EEH operations
+ *
+ * EEH initialization on the pseries platform. This function should
+ * be called before any EEH-related functions.
+ */
+static int __init eeh_pseries_init(void)
+{
+ int ret;
+
+ ret = eeh_ops_register(&pseries_eeh_ops);
+ if (!ret)
+ pr_info("EEH: pSeries platform initialized\n");
+ else
+ pr_info("EEH: pSeries platform initialization failure (%d)\n",
+ ret);
+
+ return ret;
+}
+machine_early_initcall(pseries, eeh_pseries_init);
diff --git a/arch/powerpc/platforms/pseries/event_sources.c b/arch/powerpc/platforms/pseries/event_sources.c
new file mode 100644
index 000000000..6eeb0d4ba
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/event_sources.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2001 Dave Engebretsen IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <asm/prom.h>
+
+#include "pseries.h"
+
+void request_event_sources_irqs(struct device_node *np,
+ irq_handler_t handler,
+ const char *name)
+{
+ int i, index, count = 0;
+ struct of_phandle_args oirq;
+ unsigned int virqs[16];
+
+ /* First try to do a proper OF tree parsing */
+ for (index = 0; of_irq_parse_one(np, index, &oirq) == 0;
+ index++) {
+ if (count > 15)
+ break;
+ virqs[count] = irq_create_of_mapping(&oirq);
+ if (!virqs[count]) {
+ pr_err("event-sources: Unable to allocate "
+ "interrupt number for %pOF\n",
+ np);
+ WARN_ON(1);
+ } else {
+ count++;
+ }
+ }
+
+ /* Now request them */
+ for (i = 0; i < count; i++) {
+ if (request_irq(virqs[i], handler, 0, name, NULL)) {
+ pr_err("event-sources: Unable to request interrupt "
+ "%d for %pOF\n", virqs[i], np);
+ WARN_ON(1);
+ return;
+ }
+ }
+}
+
diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c
new file mode 100644
index 000000000..a3bbeb436
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/firmware.c
@@ -0,0 +1,180 @@
+/*
+ * pSeries firmware setup code.
+ *
+ * Portions from arch/powerpc/platforms/pseries/setup.c:
+ * Copyright (C) 1995 Linus Torvalds
+ * Adapted from 'alpha' version by Gary Thomas
+ * Modified by Cort Dougan (cort@cs.nmt.edu)
+ * Modified by PPC64 Team, IBM Corp
+ *
+ * Portions from arch/powerpc/kernel/firmware.c
+ * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org)
+ * Modifications for ppc64:
+ * Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com>
+ * Copyright (C) 2005 Stephen Rothwell, IBM Corporation
+ *
+ * Copyright 2006 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/of_fdt.h>
+#include <asm/firmware.h>
+#include <asm/prom.h>
+#include <asm/udbg.h>
+
+#include "pseries.h"
+
+struct hypertas_fw_feature {
+ unsigned long val;
+ char * name;
+};
+
+/*
+ * The names in this table match names in rtas/ibm,hypertas-functions. If the
+ * entry ends in a '*', only up to the '*' is matched. Otherwise the entire
+ * string must match.
+ */
+static __initdata struct hypertas_fw_feature
+hypertas_fw_features_table[] = {
+ {FW_FEATURE_PFT, "hcall-pft"},
+ {FW_FEATURE_TCE, "hcall-tce"},
+ {FW_FEATURE_SPRG0, "hcall-sprg0"},
+ {FW_FEATURE_DABR, "hcall-dabr"},
+ {FW_FEATURE_COPY, "hcall-copy"},
+ {FW_FEATURE_ASR, "hcall-asr"},
+ {FW_FEATURE_DEBUG, "hcall-debug"},
+ {FW_FEATURE_PERF, "hcall-perf"},
+ {FW_FEATURE_DUMP, "hcall-dump"},
+ {FW_FEATURE_INTERRUPT, "hcall-interrupt"},
+ {FW_FEATURE_MIGRATE, "hcall-migrate"},
+ {FW_FEATURE_PERFMON, "hcall-perfmon"},
+ {FW_FEATURE_CRQ, "hcall-crq"},
+ {FW_FEATURE_VIO, "hcall-vio"},
+ {FW_FEATURE_RDMA, "hcall-rdma"},
+ {FW_FEATURE_LLAN, "hcall-lLAN"},
+ {FW_FEATURE_BULK_REMOVE, "hcall-bulk"},
+ {FW_FEATURE_XDABR, "hcall-xdabr"},
+ {FW_FEATURE_MULTITCE, "hcall-multi-tce"},
+ {FW_FEATURE_SPLPAR, "hcall-splpar"},
+ {FW_FEATURE_VPHN, "hcall-vphn"},
+ {FW_FEATURE_SET_MODE, "hcall-set-mode"},
+ {FW_FEATURE_BEST_ENERGY, "hcall-best-energy-1*"},
+ {FW_FEATURE_HPT_RESIZE, "hcall-hpt-resize"},
+};
+
+/* Build up the firmware features bitmask using the contents of
+ * device-tree/ibm,hypertas-functions. Ultimately this functionality may
+ * be moved into prom.c prom_init().
+ */
+static void __init fw_hypertas_feature_init(const char *hypertas,
+ unsigned long len)
+{
+ const char *s;
+ int i;
+
+ pr_debug(" -> fw_hypertas_feature_init()\n");
+
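+ /*
+ * The hypertas property is a list of NUL-terminated strings; step
+ * through them one string at a time and match each against the
+ * feature table.
+ */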
+ for (s = hypertas; s < hypertas + len; s += strlen(s) + 1) {
+ for (i = 0; i < ARRAY_SIZE(hypertas_fw_features_table); i++) {
+ const char *name = hypertas_fw_features_table[i].name;
+ size_t size;
+
+ /*
+ * If there is a '*' at the end of the name, only check
+ * up to there.
+ */
+ size = strlen(name);
+ if (size && name[size - 1] == '*') {
+ if (strncmp(name, s, size - 1))
+ continue;
+ } else if (strcmp(name, s))
+ continue;
+
+ /* we have a match */
+ powerpc_firmware_features |=
+ hypertas_fw_features_table[i].val;
+ break;
+ }
+ }
+
+ pr_debug(" <- fw_hypertas_feature_init()\n");
+}
+
+struct vec5_fw_feature {
+ unsigned long val;
+ unsigned int feature;
+};
+
+static __initdata struct vec5_fw_feature
+vec5_fw_features_table[] = {
+ {FW_FEATURE_TYPE1_AFFINITY, OV5_TYPE1_AFFINITY},
+ {FW_FEATURE_PRRN, OV5_PRRN},
+ {FW_FEATURE_DRMEM_V2, OV5_DRMEM_V2},
+ {FW_FEATURE_DRC_INFO, OV5_DRC_INFO},
+};
+
+static void __init fw_vec5_feature_init(const char *vec5, unsigned long len)
+{
+ unsigned int index, feat;
+ int i;
+
+ pr_debug(" -> fw_vec5_feature_init()\n");
+
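+ /*
+ * Each table entry encodes a byte index and bit mask into the
+ * ibm,architecture-vec-5 blob; set the corresponding firmware
+ * feature when the byte is present and the bit is set.
+ */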
+ for (i = 0; i < ARRAY_SIZE(vec5_fw_features_table); i++) {
+ index = OV5_INDX(vec5_fw_features_table[i].feature);
+ feat = OV5_FEAT(vec5_fw_features_table[i].feature);
+
+ if (index < len && (vec5[index] & feat))
+ powerpc_firmware_features |=
+ vec5_fw_features_table[i].val;
+ }
+
+ pr_debug(" <- fw_vec5_feature_init()\n");
+}
+
+/*
+ * Called very early, MMU is off, device-tree isn't unflattened
+ */
+static int __init probe_fw_features(unsigned long node, const char *uname, int
+ depth, void *data)
+{
+ const char *prop;
+ int len;
+ static int hypertas_found;
+ static int vec5_found;
+
+ if (depth != 1)
+ return 0;
+
+ if (!strcmp(uname, "rtas") || !strcmp(uname, "rtas@0")) {
+ prop = of_get_flat_dt_prop(node, "ibm,hypertas-functions",
+ &len);
+ if (prop) {
+ powerpc_firmware_features |= FW_FEATURE_LPAR;
+ fw_hypertas_feature_init(prop, len);
+ }
+
+ hypertas_found = 1;
+ }
+
+ if (!strcmp(uname, "chosen")) {
+ prop = of_get_flat_dt_prop(node, "ibm,architecture-vec-5",
+ &len);
+ if (prop)
+ fw_vec5_feature_init(prop, len);
+
+ vec5_found = 1;
+ }
+
+ return hypertas_found && vec5_found;
+}
+
+void __init pseries_probe_fw_features(void)
+{
+ of_scan_flat_dt(probe_fw_features, NULL);
+}
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
new file mode 100644
index 000000000..8bfb97d07
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -0,0 +1,975 @@
+/*
+ * pseries CPU Hotplug infrastructure.
+ *
+ * Split out from arch/powerpc/platforms/pseries/setup.c
+ * arch/powerpc/kernel/rtas.c, and arch/powerpc/platforms/pseries/smp.c
+ *
+ * Peter Bergner, IBM March 2001.
+ * Copyright (C) 2001 IBM.
+ * Dave Engebretsen, Peter Bergner, and
+ * Mike Corrigan {engebret|bergner|mikec}@us.ibm.com
+ * Plus various changes from other IBM teams...
+ *
+ * Copyright (C) 2006 Michael Ellerman, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "pseries-hotplug-cpu: " fmt
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/sched.h> /* for idle_task_exit */
+#include <linux/sched/hotplug.h>
+#include <linux/cpu.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/firmware.h>
+#include <asm/machdep.h>
+#include <asm/vdso_datapage.h>
+#include <asm/xics.h>
+#include <asm/xive.h>
+#include <asm/plpar_wrappers.h>
+#include <asm/topology.h>
+
+#include "pseries.h"
+#include "offline_states.h"
+
+/* This version can't take the spinlock, because it never returns */
+static int rtas_stop_self_token = RTAS_UNKNOWN_SERVICE;
+
+static DEFINE_PER_CPU(enum cpu_state_vals, preferred_offline_state) =
+ CPU_STATE_OFFLINE;
+static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE;
+
+static enum cpu_state_vals default_offline_state = CPU_STATE_OFFLINE;
+
+static bool cede_offline_enabled __read_mostly = true;
+
+/*
+ * Enable/disable cede_offline when available.
+ */
+static int __init setup_cede_offline(char *str)
+{
+ return (kstrtobool(str, &cede_offline_enabled) == 0);
+}
+
+__setup("cede_offline=", setup_cede_offline);
+
+enum cpu_state_vals get_cpu_current_state(int cpu)
+{
+ return per_cpu(current_state, cpu);
+}
+
+void set_cpu_current_state(int cpu, enum cpu_state_vals state)
+{
+ per_cpu(current_state, cpu) = state;
+}
+
+enum cpu_state_vals get_preferred_offline_state(int cpu)
+{
+ return per_cpu(preferred_offline_state, cpu);
+}
+
+void set_preferred_offline_state(int cpu, enum cpu_state_vals state)
+{
+ per_cpu(preferred_offline_state, cpu) = state;
+}
+
+void set_default_offline_state(int cpu)
+{
+ per_cpu(preferred_offline_state, cpu) = default_offline_state;
+}
+
+static void rtas_stop_self(void)
+{
+ static struct rtas_args args;
+
+ local_irq_disable();
+
+ BUG_ON(rtas_stop_self_token == RTAS_UNKNOWN_SERVICE);
+
+ rtas_call_unlocked(&args, rtas_stop_self_token, 0, 1, NULL);
+
+ panic("Alas, I survived.\n");
+}
+
+static void pseries_mach_cpu_die(void)
+{
+ unsigned int cpu = smp_processor_id();
+ unsigned int hwcpu = hard_smp_processor_id();
+ u8 cede_latency_hint = 0;
+
+ local_irq_disable();
+ idle_task_exit();
+ if (xive_enabled())
+ xive_teardown_cpu();
+ else
+ xics_teardown_cpu();
+
+ if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
+ set_cpu_current_state(cpu, CPU_STATE_INACTIVE);
+ if (ppc_md.suspend_disable_cpu)
+ ppc_md.suspend_disable_cpu();
+
+ cede_latency_hint = 2;
+
+ get_lppaca()->idle = 1;
+ if (!lppaca_shared_proc(get_lppaca()))
+ get_lppaca()->donate_dedicated_cpu = 1;
+
+ while (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
+ while (!prep_irq_for_idle()) {
+ local_irq_enable();
+ local_irq_disable();
+ }
+
+ extended_cede_processor(cede_latency_hint);
+ }
+
+ local_irq_disable();
+
+ if (!lppaca_shared_proc(get_lppaca()))
+ get_lppaca()->donate_dedicated_cpu = 0;
+ get_lppaca()->idle = 0;
+
+ if (get_preferred_offline_state(cpu) == CPU_STATE_ONLINE) {
+ unregister_slb_shadow(hwcpu);
+
+ hard_irq_disable();
+ /*
+ * Call to start_secondary_resume() will not return.
+ * Kernel stack will be reset and start_secondary()
+ * will be called to continue the online operation.
+ */
+ start_secondary_resume();
+ }
+ }
+
+ /* Requested state is CPU_STATE_OFFLINE at this point */
+ WARN_ON(get_preferred_offline_state(cpu) != CPU_STATE_OFFLINE);
+
+ set_cpu_current_state(cpu, CPU_STATE_OFFLINE);
+ unregister_slb_shadow(hwcpu);
+ rtas_stop_self();
+
+ /* Should never get here... */
+ BUG();
+ for(;;);
+}
+
+static int pseries_cpu_disable(void)
+{
+ int cpu = smp_processor_id();
+
+ set_cpu_online(cpu, false);
+ vdso_data->processorCount--;
+
+ /*fix boot_cpuid here*/
+ if (cpu == boot_cpuid)
+ boot_cpuid = cpumask_any(cpu_online_mask);
+
+ /* FIXME: abstract this to not be platform specific later on */
+ if (xive_enabled())
+ xive_smp_disable_cpu();
+ else
+ xics_migrate_irqs_away();
+ return 0;
+}
+
+/*
+ * pseries_cpu_die: Wait for the cpu to die.
+ * @cpu: logical processor id of the CPU whose death we're awaiting.
+ *
+ * This function is called from the context of the thread which is performing
+ * the cpu-offline. Here we wait for long enough to allow the cpu in question
+ * to self-destroy so that the cpu-offline thread can send the CPU_DEAD
+ * notifications.
+ *
+ * OTOH, pseries_mach_cpu_die() is called by the @cpu when it wants to
+ * self-destruct.
+ */
+static void pseries_cpu_die(unsigned int cpu)
+{
+ int tries;
+ int cpu_status = 1;
+ unsigned int pcpu = get_hard_smp_processor_id(cpu);
+
+ if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
+ cpu_status = 1;
+ for (tries = 0; tries < 5000; tries++) {
+ if (get_cpu_current_state(cpu) == CPU_STATE_INACTIVE) {
+ cpu_status = 0;
+ break;
+ }
+ msleep(1);
+ }
+ } else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) {
+
+ for (tries = 0; tries < 25; tries++) {
+ cpu_status = smp_query_cpu_stopped(pcpu);
+ if (cpu_status == QCSS_STOPPED ||
+ cpu_status == QCSS_HARDWARE_ERROR)
+ break;
+ cpu_relax();
+ }
+ }
+
+ if (cpu_status != 0) {
+ printk("Querying DEAD? cpu %i (%i) shows %i\n",
+ cpu, pcpu, cpu_status);
+ }
+
+ /* Isolation and deallocation are definitely done by
+ * drslot_chrp_cpu. If they were not, they would be
+ * done here. Change isolate state to Isolate and
+ * change allocation-state to Unusable.
+ */
+ paca_ptrs[cpu]->cpu_start = 0;
+}
+
+/*
+ * Update cpu_present_mask and paca(s) for a new cpu node. The wrinkle
+ * here is that a cpu device node may represent up to two logical cpus
+ * in the SMT case. We must honor the assumption in other code that
+ * the logical ids for sibling SMT threads x and y are adjacent, such
+ * that x^1 == y and y^1 == x.
+ */
+static int pseries_add_processor(struct device_node *np)
+{
+ unsigned int cpu;
+ cpumask_var_t candidate_mask, tmp;
+ int err = -ENOSPC, len, nthreads, i;
+ const __be32 *intserv;
+
+ intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len);
+ if (!intserv)
+ return 0;
+
+ zalloc_cpumask_var(&candidate_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&tmp, GFP_KERNEL);
+
+ nthreads = len / sizeof(u32);
+ for (i = 0; i < nthreads; i++)
+ cpumask_set_cpu(i, tmp);
+
+ cpu_maps_update_begin();
+
+ BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask));
+
+ /* Get a bitmap of unoccupied slots. */
+ cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask);
+ if (cpumask_empty(candidate_mask)) {
+ /* If we get here, it most likely means that NR_CPUS is
+ * less than the partition's max processors setting.
+ */
+ printk(KERN_ERR "Cannot add cpu %pOF; this system configuration"
+ " supports %d logical cpus.\n", np,
+ num_possible_cpus());
+ goto out_unlock;
+ }
+
+ while (!cpumask_empty(tmp))
+ if (cpumask_subset(tmp, candidate_mask))
+ /* Found a range where we can insert the new cpu(s) */
+ break;
+ else
+ cpumask_shift_left(tmp, tmp, nthreads);
+
+ if (cpumask_empty(tmp)) {
+ printk(KERN_ERR "Unable to find space in cpu_present_mask for"
+ " processor %s with %d thread(s)\n", np->name,
+ nthreads);
+ goto out_unlock;
+ }
+
+ for_each_cpu(cpu, tmp) {
+ BUG_ON(cpu_present(cpu));
+ set_cpu_present(cpu, true);
+ set_hard_smp_processor_id(cpu, be32_to_cpu(*intserv++));
+ }
+ err = 0;
+out_unlock:
+ cpu_maps_update_done();
+ free_cpumask_var(candidate_mask);
+ free_cpumask_var(tmp);
+ return err;
+}
+
+/*
+ * Update the present map for a cpu node which is going away, and set
+ * the hard id in the paca(s) to -1 to be consistent with boot time
+ * convention for non-present cpus.
+ */
+static void pseries_remove_processor(struct device_node *np)
+{
+ unsigned int cpu;
+ int len, nthreads, i;
+ const __be32 *intserv;
+ u32 thread;
+
+ intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len);
+ if (!intserv)
+ return;
+
+ nthreads = len / sizeof(u32);
+
+ cpu_maps_update_begin();
+ for (i = 0; i < nthreads; i++) {
+ thread = be32_to_cpu(intserv[i]);
+ for_each_present_cpu(cpu) {
+ if (get_hard_smp_processor_id(cpu) != thread)
+ continue;
+ BUG_ON(cpu_online(cpu));
+ set_cpu_present(cpu, false);
+ set_hard_smp_processor_id(cpu, -1);
+ update_numa_cpu_lookup_table(cpu, -1);
+ break;
+ }
+ if (cpu >= nr_cpu_ids)
+ printk(KERN_WARNING "Could not find cpu to remove "
+ "with physical id 0x%x\n", thread);
+ }
+ cpu_maps_update_done();
+}
+
+static int dlpar_online_cpu(struct device_node *dn)
+{
+ int rc = 0;
+ unsigned int cpu;
+ int len, nthreads, i;
+ const __be32 *intserv;
+ u32 thread;
+
+ intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len);
+ if (!intserv)
+ return -EINVAL;
+
+ nthreads = len / sizeof(u32);
+
+ cpu_maps_update_begin();
+ for (i = 0; i < nthreads; i++) {
+ thread = be32_to_cpu(intserv[i]);
+ for_each_present_cpu(cpu) {
+ if (get_hard_smp_processor_id(cpu) != thread)
+ continue;
+ BUG_ON(get_cpu_current_state(cpu)
+ != CPU_STATE_OFFLINE);
+ cpu_maps_update_done();
+ timed_topology_update(1);
+ find_and_online_cpu_nid(cpu);
+ rc = device_online(get_cpu_device(cpu));
+ if (rc)
+ goto out;
+ cpu_maps_update_begin();
+
+ break;
+ }
+ if (cpu == num_possible_cpus())
+ printk(KERN_WARNING "Could not find cpu to online "
+ "with physical id 0x%x\n", thread);
+ }
+ cpu_maps_update_done();
+
+out:
+ return rc;
+
+}
+
+static bool dlpar_cpu_exists(struct device_node *parent, u32 drc_index)
+{
+ struct device_node *child = NULL;
+ u32 my_drc_index;
+ bool found;
+ int rc;
+
+ /* Assume cpu doesn't exist */
+ found = false;
+
+ for_each_child_of_node(parent, child) {
+ rc = of_property_read_u32(child, "ibm,my-drc-index",
+ &my_drc_index);
+ if (rc)
+ continue;
+
+ if (my_drc_index == drc_index) {
+ of_node_put(child);
+ found = true;
+ break;
+ }
+ }
+
+ return found;
+}
+
+static bool valid_cpu_drc_index(struct device_node *parent, u32 drc_index)
+{
+ bool found = false;
+ int rc, index;
+
+ index = 0;
+ while (!found) {
+ u32 drc;
+
+ rc = of_property_read_u32_index(parent, "ibm,drc-indexes",
+ index++, &drc);
+ if (rc)
+ break;
+
+ if (drc == drc_index)
+ found = true;
+ }
+
+ return found;
+}
+
+static ssize_t dlpar_cpu_add(u32 drc_index)
+{
+ struct device_node *dn, *parent;
+ int rc, saved_rc;
+
+ pr_debug("Attempting to add CPU, drc index: %x\n", drc_index);
+
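+ /*
+ * Hot-add sequence: validate the DRC index, acquire the DRC,
+ * fetch the new node via configure-connector, attach it to the
+ * device tree and finally online the threads, unwinding on any
+ * failure.
+ */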
+ parent = of_find_node_by_path("/cpus");
+ if (!parent) {
+ pr_warn("Failed to find CPU root node \"/cpus\"\n");
+ return -ENODEV;
+ }
+
+ if (dlpar_cpu_exists(parent, drc_index)) {
+ of_node_put(parent);
+ pr_warn("CPU with drc index %x already exists\n", drc_index);
+ return -EINVAL;
+ }
+
+ if (!valid_cpu_drc_index(parent, drc_index)) {
+ of_node_put(parent);
+ pr_warn("Cannot find CPU (drc index %x) to add.\n", drc_index);
+ return -EINVAL;
+ }
+
+ rc = dlpar_acquire_drc(drc_index);
+ if (rc) {
+ pr_warn("Failed to acquire DRC, rc: %d, drc index: %x\n",
+ rc, drc_index);
+ of_node_put(parent);
+ return -EINVAL;
+ }
+
+ dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent);
+ if (!dn) {
+ pr_warn("Failed call to configure-connector, drc index: %x\n",
+ drc_index);
+ dlpar_release_drc(drc_index);
+ of_node_put(parent);
+ return -EINVAL;
+ }
+
+ rc = dlpar_attach_node(dn, parent);
+
+ /* Regardless we are done with parent now */
+ of_node_put(parent);
+
+ if (rc) {
+ saved_rc = rc;
+ pr_warn("Failed to attach node %s, rc: %d, drc index: %x\n",
+ dn->name, rc, drc_index);
+
+ rc = dlpar_release_drc(drc_index);
+ if (!rc)
+ dlpar_free_cc_nodes(dn);
+
+ return saved_rc;
+ }
+
+ rc = dlpar_online_cpu(dn);
+ if (rc) {
+ saved_rc = rc;
+ pr_warn("Failed to online cpu %s, rc: %d, drc index: %x\n",
+ dn->name, rc, drc_index);
+
+ rc = dlpar_detach_node(dn);
+ if (!rc)
+ dlpar_release_drc(drc_index);
+
+ return saved_rc;
+ }
+
+ pr_debug("Successfully added CPU %s, drc index: %x\n", dn->name,
+ drc_index);
+ return rc;
+}
+
+static int dlpar_offline_cpu(struct device_node *dn)
+{
+ int rc = 0;
+ unsigned int cpu;
+ int len, nthreads, i;
+ const __be32 *intserv;
+ u32 thread;
+
+ intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len);
+ if (!intserv)
+ return -EINVAL;
+
+ nthreads = len / sizeof(u32);
+
+ cpu_maps_update_begin();
+ for (i = 0; i < nthreads; i++) {
+ thread = be32_to_cpu(intserv[i]);
+ for_each_present_cpu(cpu) {
+ if (get_hard_smp_processor_id(cpu) != thread)
+ continue;
+
+ if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE)
+ break;
+
+ if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) {
+ set_preferred_offline_state(cpu,
+ CPU_STATE_OFFLINE);
+ cpu_maps_update_done();
+ timed_topology_update(1);
+ rc = device_offline(get_cpu_device(cpu));
+ if (rc)
+ goto out;
+ cpu_maps_update_begin();
+ break;
+
+ }
+
+ /*
+ * The cpu is in CPU_STATE_INACTIVE.
+ * Upgrade its state to CPU_STATE_OFFLINE.
+ */
+ set_preferred_offline_state(cpu, CPU_STATE_OFFLINE);
+ BUG_ON(plpar_hcall_norets(H_PROD, thread)
+ != H_SUCCESS);
+ __cpu_die(cpu);
+ break;
+ }
+ if (cpu == num_possible_cpus())
+ printk(KERN_WARNING "Could not find cpu to offline with physical id 0x%x\n", thread);
+ }
+ cpu_maps_update_done();
+
+out:
+ return rc;
+
+}
+
+static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index)
+{
+ int rc;
+
+ pr_debug("Attempting to remove CPU %s, drc index: %x\n",
+ dn->name, drc_index);
+
+ rc = dlpar_offline_cpu(dn);
+ if (rc) {
+ pr_warn("Failed to offline CPU %s, rc: %d\n", dn->name, rc);
+ return -EINVAL;
+ }
+
+ rc = dlpar_release_drc(drc_index);
+ if (rc) {
+ pr_warn("Failed to release drc (%x) for CPU %s, rc: %d\n",
+ drc_index, dn->name, rc);
+ dlpar_online_cpu(dn);
+ return rc;
+ }
+
+ rc = dlpar_detach_node(dn);
+ if (rc) {
+ int saved_rc = rc;
+
+ pr_warn("Failed to detach CPU %s, rc: %d", dn->name, rc);
+
+ rc = dlpar_acquire_drc(drc_index);
+ if (!rc)
+ dlpar_online_cpu(dn);
+
+ return saved_rc;
+ }
+
+ pr_debug("Successfully removed CPU, drc index: %x\n", drc_index);
+ return 0;
+}
+
+static struct device_node *cpu_drc_index_to_dn(u32 drc_index)
+{
+ struct device_node *dn;
+ u32 my_index;
+ int rc;
+
+ for_each_node_by_type(dn, "cpu") {
+ rc = of_property_read_u32(dn, "ibm,my-drc-index", &my_index);
+ if (rc)
+ continue;
+
+ if (my_index == drc_index)
+ break;
+ }
+
+ return dn;
+}
+
+static int dlpar_cpu_remove_by_index(u32 drc_index)
+{
+ struct device_node *dn;
+ int rc;
+
+ dn = cpu_drc_index_to_dn(drc_index);
+ if (!dn) {
+ pr_warn("Cannot find CPU (drc index %x) to remove\n",
+ drc_index);
+ return -ENODEV;
+ }
+
+ rc = dlpar_cpu_remove(dn, drc_index);
+ of_node_put(dn);
+ return rc;
+}
+
+static int find_dlpar_cpus_to_remove(u32 *cpu_drcs, int cpus_to_remove)
+{
+ struct device_node *dn;
+ int cpus_found = 0;
+ int rc;
+
+ /* We want to find cpus_to_remove + 1 CPUs to ensure we do not
+ * remove the last CPU.
+ */
+ for_each_node_by_type(dn, "cpu") {
+ cpus_found++;
+
+ if (cpus_found > cpus_to_remove) {
+ of_node_put(dn);
+ break;
+ }
+
+ /* Note that cpus_found is always 1 ahead of the index
+ * into the cpu_drcs array, so we use cpus_found - 1
+ */
+ rc = of_property_read_u32(dn, "ibm,my-drc-index",
+ &cpu_drcs[cpus_found - 1]);
+ if (rc) {
+ pr_warn("Error occurred getting drc-index for %s\n",
+ dn->name);
+ of_node_put(dn);
+ return -1;
+ }
+ }
+
+ if (cpus_found < cpus_to_remove) {
+ pr_warn("Failed to find enough CPUs (%d of %d) to remove\n",
+ cpus_found, cpus_to_remove);
+ } else if (cpus_found == cpus_to_remove) {
+ pr_warn("Cannot remove all CPUs\n");
+ }
+
+ return cpus_found;
+}
+
+static int dlpar_cpu_remove_by_count(u32 cpus_to_remove)
+{
+ u32 *cpu_drcs;
+ int cpus_found;
+ int cpus_removed = 0;
+ int i, rc;
+
+ pr_debug("Attempting to hot-remove %d CPUs\n", cpus_to_remove);
+
+ cpu_drcs = kcalloc(cpus_to_remove, sizeof(*cpu_drcs), GFP_KERNEL);
+ if (!cpu_drcs)
+ return -EINVAL;
+
+ cpus_found = find_dlpar_cpus_to_remove(cpu_drcs, cpus_to_remove);
+ if (cpus_found <= cpus_to_remove) {
+ kfree(cpu_drcs);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < cpus_to_remove; i++) {
+ rc = dlpar_cpu_remove_by_index(cpu_drcs[i]);
+ if (rc)
+ break;
+
+ cpus_removed++;
+ }
+
+ if (cpus_removed != cpus_to_remove) {
+ pr_warn("CPU hot-remove failed, adding back removed CPUs\n");
+
+ for (i = 0; i < cpus_removed; i++)
+ dlpar_cpu_add(cpu_drcs[i]);
+
+ rc = -EINVAL;
+ } else {
+ rc = 0;
+ }
+
+ kfree(cpu_drcs);
+ return rc;
+}
+
+static int find_dlpar_cpus_to_add(u32 *cpu_drcs, u32 cpus_to_add)
+{
+ struct device_node *parent;
+ int cpus_found = 0;
+ int index, rc;
+
+ parent = of_find_node_by_path("/cpus");
+ if (!parent) {
+ pr_warn("Could not find CPU root node in device tree\n");
+ kfree(cpu_drcs);
+ return -1;
+ }
+
+ /* Search the ibm,drc-indexes array for possible CPU drcs to
+ * add. Note that the format of the ibm,drc-indexes array is
+ * the number of entries in the array followed by the array
+ * of drc values so we start looking at index = 1.
+ */
+ index = 1;
+ while (cpus_found < cpus_to_add) {
+ u32 drc;
+
+ rc = of_property_read_u32_index(parent, "ibm,drc-indexes",
+ index++, &drc);
+ if (rc)
+ break;
+
+ if (dlpar_cpu_exists(parent, drc))
+ continue;
+
+ cpu_drcs[cpus_found++] = drc;
+ }
+
+ of_node_put(parent);
+ return cpus_found;
+}
+
+static int dlpar_cpu_add_by_count(u32 cpus_to_add)
+{
+ u32 *cpu_drcs;
+ int cpus_added = 0;
+ int cpus_found;
+ int i, rc;
+
+ pr_debug("Attempting to hot-add %d CPUs\n", cpus_to_add);
+
+ cpu_drcs = kcalloc(cpus_to_add, sizeof(*cpu_drcs), GFP_KERNEL);
+ if (!cpu_drcs)
+ return -EINVAL;
+
+ cpus_found = find_dlpar_cpus_to_add(cpu_drcs, cpus_to_add);
+ if (cpus_found < cpus_to_add) {
+ pr_warn("Failed to find enough CPUs (%d of %d) to add\n",
+ cpus_found, cpus_to_add);
+ kfree(cpu_drcs);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < cpus_to_add; i++) {
+ rc = dlpar_cpu_add(cpu_drcs[i]);
+ if (rc)
+ break;
+
+ cpus_added++;
+ }
+
+ if (cpus_added < cpus_to_add) {
+ pr_warn("CPU hot-add failed, removing any added CPUs\n");
+
+ for (i = 0; i < cpus_added; i++)
+ dlpar_cpu_remove_by_index(cpu_drcs[i]);
+
+ rc = -EINVAL;
+ } else {
+ rc = 0;
+ }
+
+ kfree(cpu_drcs);
+ return rc;
+}
+
+int dlpar_cpu_readd(int cpu)
+{
+ struct device_node *dn;
+ struct device *dev;
+ u32 drc_index;
+ int rc;
+
+ dev = get_cpu_device(cpu);
+ dn = dev->of_node;
+
+ rc = of_property_read_u32(dn, "ibm,my-drc-index", &drc_index);
+
+ rc = dlpar_cpu_remove_by_index(drc_index);
+ if (!rc)
+ rc = dlpar_cpu_add(drc_index);
+
+ return rc;
+}
+
+int dlpar_cpu(struct pseries_hp_errorlog *hp_elog)
+{
+ u32 count, drc_index;
+ int rc;
+
+ count = hp_elog->_drc_u.drc_count;
+ drc_index = hp_elog->_drc_u.drc_index;
+
+ lock_device_hotplug();
+
+ switch (hp_elog->action) {
+ case PSERIES_HP_ELOG_ACTION_REMOVE:
+ if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT)
+ rc = dlpar_cpu_remove_by_count(count);
+ else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX)
+ rc = dlpar_cpu_remove_by_index(drc_index);
+ else
+ rc = -EINVAL;
+ break;
+ case PSERIES_HP_ELOG_ACTION_ADD:
+ if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT)
+ rc = dlpar_cpu_add_by_count(count);
+ else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX)
+ rc = dlpar_cpu_add(drc_index);
+ else
+ rc = -EINVAL;
+ break;
+ default:
+ pr_err("Invalid action (%d) specified\n", hp_elog->action);
+ rc = -EINVAL;
+ break;
+ }
+
+ unlock_device_hotplug();
+ return rc;
+}
+
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+
+static ssize_t dlpar_cpu_probe(const char *buf, size_t count)
+{
+ u32 drc_index;
+ int rc;
+
+ rc = kstrtou32(buf, 0, &drc_index);
+ if (rc)
+ return -EINVAL;
+
+ rc = dlpar_cpu_add(drc_index);
+
+ return rc ? rc : count;
+}
+
+static ssize_t dlpar_cpu_release(const char *buf, size_t count)
+{
+ struct device_node *dn;
+ u32 drc_index;
+ int rc;
+
+ dn = of_find_node_by_path(buf);
+ if (!dn)
+ return -EINVAL;
+
+ rc = of_property_read_u32(dn, "ibm,my-drc-index", &drc_index);
+ if (rc) {
+ of_node_put(dn);
+ return -EINVAL;
+ }
+
+ rc = dlpar_cpu_remove(dn, drc_index);
+ of_node_put(dn);
+
+ return rc ? rc : count;
+}
+
+#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
+
+static int pseries_smp_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct of_reconfig_data *rd = data;
+ int err = 0;
+
+ switch (action) {
+ case OF_RECONFIG_ATTACH_NODE:
+ err = pseries_add_processor(rd->dn);
+ break;
+ case OF_RECONFIG_DETACH_NODE:
+ pseries_remove_processor(rd->dn);
+ break;
+ }
+ return notifier_from_errno(err);
+}
+
+static struct notifier_block pseries_smp_nb = {
+ .notifier_call = pseries_smp_notifier,
+};
+
+#define MAX_CEDE_LATENCY_LEVELS 4
+#define CEDE_LATENCY_PARAM_LENGTH 10
+#define CEDE_LATENCY_PARAM_MAX_LENGTH \
+ (MAX_CEDE_LATENCY_LEVELS * CEDE_LATENCY_PARAM_LENGTH * sizeof(char))
+#define CEDE_LATENCY_TOKEN 45
+
+static char cede_parameters[CEDE_LATENCY_PARAM_MAX_LENGTH];
+
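+ /*
+ * Fetch the cede latency settings (system parameter token
+ * CEDE_LATENCY_TOKEN) into cede_parameters via the
+ * ibm,get-system-parameter RTAS call.
+ */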
+static int parse_cede_parameters(void)
+{
+ memset(cede_parameters, 0, CEDE_LATENCY_PARAM_MAX_LENGTH);
+ return rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+ NULL,
+ CEDE_LATENCY_TOKEN,
+ __pa(cede_parameters),
+ CEDE_LATENCY_PARAM_MAX_LENGTH);
+}
+
+static int __init pseries_cpu_hotplug_init(void)
+{
+ int cpu;
+ int qcss_tok;
+
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+ ppc_md.cpu_probe = dlpar_cpu_probe;
+ ppc_md.cpu_release = dlpar_cpu_release;
+#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
+
+ rtas_stop_self_token = rtas_token("stop-self");
+ qcss_tok = rtas_token("query-cpu-stopped-state");
+
+ if (rtas_stop_self_token == RTAS_UNKNOWN_SERVICE ||
+ qcss_tok == RTAS_UNKNOWN_SERVICE) {
+ printk(KERN_INFO "CPU Hotplug not supported by firmware "
+ "- disabling.\n");
+ return 0;
+ }
+
+ ppc_md.cpu_die = pseries_mach_cpu_die;
+ smp_ops->cpu_disable = pseries_cpu_disable;
+ smp_ops->cpu_die = pseries_cpu_die;
+
+ /* Processors can be added/removed only on LPAR */
+ if (firmware_has_feature(FW_FEATURE_LPAR)) {
+ of_reconfig_notifier_register(&pseries_smp_nb);
+ cpu_maps_update_begin();
+ if (cede_offline_enabled && parse_cede_parameters() == 0) {
+ default_offline_state = CPU_STATE_INACTIVE;
+ for_each_online_cpu(cpu)
+ set_default_offline_state(cpu);
+ }
+ cpu_maps_update_done();
+ }
+
+ return 0;
+}
+machine_arch_initcall(pseries, pseries_cpu_hotplug_init);
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
new file mode 100644
index 000000000..afabe6918
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -0,0 +1,1061 @@
+/*
+ * pseries Memory Hotplug infrastructure.
+ *
+ * Copyright (C) 2008 Badari Pulavarty, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "pseries-hotplug-mem: " fmt
+
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/memblock.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+#include <linux/slab.h>
+
+#include <asm/firmware.h>
+#include <asm/machdep.h>
+#include <asm/prom.h>
+#include <asm/sparsemem.h>
+#include <asm/fadump.h>
+#include <asm/drmem.h>
+#include "pseries.h"
+
+static bool rtas_hp_event;
+
+unsigned long pseries_memory_block_size(void)
+{
+ struct device_node *np;
+ u64 memblock_size = MIN_MEMORY_BLOCK_SIZE;
+ struct resource r;
+
+ np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+ if (np) {
+ const __be64 *size;
+
+ size = of_get_property(np, "ibm,lmb-size", NULL);
+ if (size)
+ memblock_size = be64_to_cpup(size);
+ of_node_put(np);
+ } else if (machine_is(pseries)) {
+ /* This fallback really only applies to pseries */
+ unsigned int memzero_size = 0;
+
+ np = of_find_node_by_path("/memory@0");
+ if (np) {
+ if (!of_address_to_resource(np, 0, &r))
+ memzero_size = resource_size(&r);
+ of_node_put(np);
+ }
+
+ if (memzero_size) {
+ /* We now know the size of memory@0; use this to find
+ * the first memory block and get its size.
+ */
+ char buf[64];
+
+ sprintf(buf, "/memory@%x", memzero_size);
+ np = of_find_node_by_path(buf);
+ if (np) {
+ if (!of_address_to_resource(np, 0, &r))
+ memblock_size = resource_size(&r);
+ of_node_put(np);
+ }
+ }
+ }
+ return memblock_size;
+}
+
+static void dlpar_free_property(struct property *prop)
+{
+ kfree(prop->name);
+ kfree(prop->value);
+ kfree(prop);
+}
+
+static struct property *dlpar_clone_property(struct property *prop,
+ u32 prop_size)
+{
+ struct property *new_prop;
+
+ new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
+ if (!new_prop)
+ return NULL;
+
+ new_prop->name = kstrdup(prop->name, GFP_KERNEL);
+ new_prop->value = kzalloc(prop_size, GFP_KERNEL);
+ if (!new_prop->name || !new_prop->value) {
+ dlpar_free_property(new_prop);
+ return NULL;
+ }
+
+ memcpy(new_prop->value, prop->value, prop->length);
+ new_prop->length = prop_size;
+
+ of_property_set_flag(new_prop, OF_DYNAMIC);
+ return new_prop;
+}
+
+static bool find_aa_index(struct device_node *dr_node,
+ struct property *ala_prop,
+ const u32 *lmb_assoc, u32 *aa_index)
+{
+ u32 *assoc_arrays, new_prop_size;
+ struct property *new_prop;
+ int aa_arrays, aa_array_entries, aa_array_sz;
+ int i, index;
+
+ /*
+ * The ibm,associativity-lookup-arrays property is defined to be
+ * a 32-bit value specifying the number of associativity arrays
+ * followed by a 32-bit value specifying the number of entries per
+ * array, followed by the associativity arrays.
+ */
+ assoc_arrays = ala_prop->value;
+
+ aa_arrays = be32_to_cpu(assoc_arrays[0]);
+ aa_array_entries = be32_to_cpu(assoc_arrays[1]);
+ aa_array_sz = aa_array_entries * sizeof(u32);
+
+ for (i = 0; i < aa_arrays; i++) {
+ index = (i * aa_array_entries) + 2;
+
+ if (memcmp(&assoc_arrays[index], &lmb_assoc[1], aa_array_sz))
+ continue;
+
+ *aa_index = i;
+ return true;
+ }
+
+ new_prop_size = ala_prop->length + aa_array_sz;
+ new_prop = dlpar_clone_property(ala_prop, new_prop_size);
+ if (!new_prop)
+ return false;
+
+ assoc_arrays = new_prop->value;
+
+ /* increment the number of entries in the lookup array */
+ assoc_arrays[0] = cpu_to_be32(aa_arrays + 1);
+
+ /* copy the new associativity into the lookup array */
+ index = aa_arrays * aa_array_entries + 2;
+ memcpy(&assoc_arrays[index], &lmb_assoc[1], aa_array_sz);
+
+ of_update_property(dr_node, new_prop);
+
+ /*
+ * The associativity lookup array index for this lmb is
+ * the number of entries - 1, since we added its associativity
+ * to the end of the lookup array.
+ */
+ *aa_index = be32_to_cpu(assoc_arrays[0]) - 1;
+ return true;
+}
+
+static int update_lmb_associativity_index(struct drmem_lmb *lmb)
+{
+ struct device_node *parent, *lmb_node, *dr_node;
+ struct property *ala_prop;
+ const u32 *lmb_assoc;
+ u32 aa_index;
+ bool found;
+
+ parent = of_find_node_by_path("/");
+ if (!parent)
+ return -ENODEV;
+
+ lmb_node = dlpar_configure_connector(cpu_to_be32(lmb->drc_index),
+ parent);
+ of_node_put(parent);
+ if (!lmb_node)
+ return -EINVAL;
+
+ lmb_assoc = of_get_property(lmb_node, "ibm,associativity", NULL);
+ if (!lmb_assoc) {
+ dlpar_free_cc_nodes(lmb_node);
+ return -ENODEV;
+ }
+
+ dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+ if (!dr_node) {
+ dlpar_free_cc_nodes(lmb_node);
+ return -ENODEV;
+ }
+
+ ala_prop = of_find_property(dr_node, "ibm,associativity-lookup-arrays",
+ NULL);
+ if (!ala_prop) {
+ of_node_put(dr_node);
+ dlpar_free_cc_nodes(lmb_node);
+ return -ENODEV;
+ }
+
+ found = find_aa_index(dr_node, ala_prop, lmb_assoc, &aa_index);
+
+ of_node_put(dr_node);
+ dlpar_free_cc_nodes(lmb_node);
+
+ if (!found) {
+ pr_err("Could not find LMB associativity\n");
+ return -1;
+ }
+
+ lmb->aa_index = aa_index;
+ return 0;
+}
+
+static struct memory_block *lmb_to_memblock(struct drmem_lmb *lmb)
+{
+ unsigned long section_nr;
+ struct mem_section *mem_sect;
+ struct memory_block *mem_block;
+
+ section_nr = pfn_to_section_nr(PFN_DOWN(lmb->base_addr));
+ mem_sect = __nr_to_section(section_nr);
+
+ mem_block = find_memory_block(mem_sect);
+ return mem_block;
+}
+
+static int get_lmb_range(u32 drc_index, int n_lmbs,
+ struct drmem_lmb **start_lmb,
+ struct drmem_lmb **end_lmb)
+{
+ struct drmem_lmb *lmb, *start, *end;
+ struct drmem_lmb *limit;
+
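+ /*
+ * Find the LMB with the requested DRC index, then make sure the
+ * span of n_lmbs LMBs starting there stays inside the drmem LMB
+ * array.
+ */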
+ start = NULL;
+ for_each_drmem_lmb(lmb) {
+ if (lmb->drc_index == drc_index) {
+ start = lmb;
+ break;
+ }
+ }
+
+ if (!start)
+ return -EINVAL;
+
+ end = &start[n_lmbs];
+
+ limit = &drmem_info->lmbs[drmem_info->n_lmbs];
+ if (end > limit)
+ return -EINVAL;
+
+ *start_lmb = start;
+ *end_lmb = end;
+ return 0;
+}
+
+static int dlpar_change_lmb_state(struct drmem_lmb *lmb, bool online)
+{
+ struct memory_block *mem_block;
+ int rc;
+
+ mem_block = lmb_to_memblock(lmb);
+ if (!mem_block)
+ return -EINVAL;
+
+ if (online && mem_block->dev.offline)
+ rc = device_online(&mem_block->dev);
+ else if (!online && !mem_block->dev.offline)
+ rc = device_offline(&mem_block->dev);
+ else
+ rc = 0;
+
+ put_device(&mem_block->dev);
+
+ return rc;
+}
+
+static int dlpar_online_lmb(struct drmem_lmb *lmb)
+{
+ return dlpar_change_lmb_state(lmb, true);
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static int dlpar_offline_lmb(struct drmem_lmb *lmb)
+{
+ return dlpar_change_lmb_state(lmb, false);
+}
+
+static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size)
+{
+ unsigned long block_sz, start_pfn;
+ int sections_per_block;
+ int i, nid;
+
+ start_pfn = base >> PAGE_SHIFT;
+
+ lock_device_hotplug();
+
+ if (!pfn_valid(start_pfn))
+ goto out;
+
+ block_sz = pseries_memory_block_size();
+ sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
+ nid = memory_add_physaddr_to_nid(base);
+
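+ /*
+ * Remove the block one memory section at a time; each iteration
+ * takes out MIN_MEMORY_BLOCK_SIZE bytes and advances the base.
+ */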
+ for (i = 0; i < sections_per_block; i++) {
+ __remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE);
+ base += MIN_MEMORY_BLOCK_SIZE;
+ }
+
+out:
+ /* Update memory regions for memory remove */
+ memblock_remove(base, memblock_size);
+ unlock_device_hotplug();
+ return 0;
+}
+
+static int pseries_remove_mem_node(struct device_node *np)
+{
+ const char *type;
+ const __be32 *regs;
+ unsigned long base;
+ unsigned int lmb_size;
+ int ret = -EINVAL;
+
+ /*
+ * Check to see if we are actually removing memory
+ */
+ type = of_get_property(np, "device_type", NULL);
+ if (type == NULL || strcmp(type, "memory") != 0)
+ return 0;
+
+ /*
+ * Find the base address and size of the memblock
+ */
+ regs = of_get_property(np, "reg", NULL);
+ if (!regs)
+ return ret;
+
+ base = be64_to_cpu(*(unsigned long *)regs);
+ lmb_size = be32_to_cpu(regs[3]);
+
+ pseries_remove_memblock(base, lmb_size);
+ return 0;
+}
+
+static bool lmb_is_removable(struct drmem_lmb *lmb)
+{
+ int i, scns_per_block;
+ int rc = 1;
+ unsigned long pfn, block_sz;
+ u64 phys_addr;
+
+ if (!(lmb->flags & DRCONF_MEM_ASSIGNED))
+ return false;
+
+ block_sz = memory_block_size_bytes();
+ scns_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
+ phys_addr = lmb->base_addr;
+
+#ifdef CONFIG_FA_DUMP
+ /*
+ * Don't hot-remove memory that falls in fadump boot memory area
+ * and memory that is reserved for capturing old kernel memory.
+ */
+ if (is_fadump_memory_area(phys_addr, block_sz))
+ return false;
+#endif
+
+ for (i = 0; i < scns_per_block; i++) {
+ pfn = PFN_DOWN(phys_addr);
+ if (!pfn_present(pfn)) {
+ phys_addr += MIN_MEMORY_BLOCK_SIZE;
+ continue;
+ }
+
+ rc &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
+ phys_addr += MIN_MEMORY_BLOCK_SIZE;
+ }
+
+ return rc ? true : false;
+}
+
+static int dlpar_add_lmb(struct drmem_lmb *);
+
+static int dlpar_remove_lmb(struct drmem_lmb *lmb)
+{
+ unsigned long block_sz;
+ int nid, rc;
+
+ if (!lmb_is_removable(lmb))
+ return -EINVAL;
+
+ rc = dlpar_offline_lmb(lmb);
+ if (rc)
+ return rc;
+
+ block_sz = pseries_memory_block_size();
+ nid = memory_add_physaddr_to_nid(lmb->base_addr);
+
+ __remove_memory(nid, lmb->base_addr, block_sz);
+
+ /* Update memory regions for memory remove */
+ memblock_remove(lmb->base_addr, block_sz);
+
+ invalidate_lmb_associativity_index(lmb);
+ lmb->flags &= ~DRCONF_MEM_ASSIGNED;
+
+ return 0;
+}
+
+static int dlpar_memory_remove_by_count(u32 lmbs_to_remove)
+{
+ struct drmem_lmb *lmb;
+ int lmbs_removed = 0;
+ int lmbs_available = 0;
+ int rc;
+
+ pr_info("Attempting to hot-remove %d LMB(s)\n", lmbs_to_remove);
+
+ if (lmbs_to_remove == 0)
+ return -EINVAL;
+
+ /* Validate that there are enough LMBs to satisfy the request */
+ for_each_drmem_lmb(lmb) {
+ if (lmb_is_removable(lmb))
+ lmbs_available++;
+
+ if (lmbs_available == lmbs_to_remove)
+ break;
+ }
+
+ if (lmbs_available < lmbs_to_remove) {
+ pr_info("Not enough LMBs available (%d of %d) to satisfy request\n",
+ lmbs_available, lmbs_to_remove);
+ return -EINVAL;
+ }
+
+ for_each_drmem_lmb(lmb) {
+ rc = dlpar_remove_lmb(lmb);
+ if (rc)
+ continue;
+
+ /* Mark this lmb so we can add it later if all of the
+ * requested LMBs cannot be removed.
+ */
+ drmem_mark_lmb_reserved(lmb);
+
+ lmbs_removed++;
+ if (lmbs_removed == lmbs_to_remove)
+ break;
+ }
+
+ if (lmbs_removed != lmbs_to_remove) {
+ pr_err("Memory hot-remove failed, adding LMBs back\n");
+
+ for_each_drmem_lmb(lmb) {
+ if (!drmem_lmb_reserved(lmb))
+ continue;
+
+ rc = dlpar_add_lmb(lmb);
+ if (rc)
+ pr_err("Failed to add LMB back, drc index %x\n",
+ lmb->drc_index);
+
+ drmem_remove_lmb_reservation(lmb);
+ }
+
+ rc = -EINVAL;
+ } else {
+ for_each_drmem_lmb(lmb) {
+ if (!drmem_lmb_reserved(lmb))
+ continue;
+
+ dlpar_release_drc(lmb->drc_index);
+ pr_info("Memory at %llx was hot-removed\n",
+ lmb->base_addr);
+
+ drmem_remove_lmb_reservation(lmb);
+ }
+ rc = 0;
+ }
+
+ return rc;
+}
+
+static int dlpar_memory_remove_by_index(u32 drc_index)
+{
+ struct drmem_lmb *lmb;
+ int lmb_found;
+ int rc;
+
+ pr_info("Attempting to hot-remove LMB, drc index %x\n", drc_index);
+
+ lmb_found = 0;
+ for_each_drmem_lmb(lmb) {
+ if (lmb->drc_index == drc_index) {
+ lmb_found = 1;
+ rc = dlpar_remove_lmb(lmb);
+ if (!rc)
+ dlpar_release_drc(lmb->drc_index);
+
+ break;
+ }
+ }
+
+ if (!lmb_found)
+ rc = -EINVAL;
+
+ if (rc)
+ pr_info("Failed to hot-remove memory at %llx\n",
+ lmb->base_addr);
+ else
+ pr_info("Memory at %llx was hot-removed\n", lmb->base_addr);
+
+ return rc;
+}
+
+static int dlpar_memory_readd_by_index(u32 drc_index)
+{
+ struct drmem_lmb *lmb;
+ int lmb_found;
+ int rc;
+
+ pr_info("Attempting to update LMB, drc index %x\n", drc_index);
+
+ lmb_found = 0;
+ for_each_drmem_lmb(lmb) {
+ if (lmb->drc_index == drc_index) {
+ lmb_found = 1;
+ rc = dlpar_remove_lmb(lmb);
+ if (!rc) {
+ rc = dlpar_add_lmb(lmb);
+ if (rc)
+ dlpar_release_drc(lmb->drc_index);
+ }
+ break;
+ }
+ }
+
+ if (!lmb_found)
+ rc = -EINVAL;
+
+ if (rc)
+ pr_info("Failed to update memory at %llx\n",
+ lmb->base_addr);
+ else
+ pr_info("Memory at %llx was updated\n", lmb->base_addr);
+
+ return rc;
+}
+
+static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
+{
+ struct drmem_lmb *lmb, *start_lmb, *end_lmb;
+ int lmbs_available = 0;
+ int rc;
+
+ pr_info("Attempting to hot-remove %u LMB(s) at %x\n",
+ lmbs_to_remove, drc_index);
+
+ if (lmbs_to_remove == 0)
+ return -EINVAL;
+
+ rc = get_lmb_range(drc_index, lmbs_to_remove, &start_lmb, &end_lmb);
+ if (rc)
+ return -EINVAL;
+
+ /* Validate that there are enough LMBs to satisfy the request */
+ for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
+ if (lmb->flags & DRCONF_MEM_RESERVED)
+ break;
+
+ lmbs_available++;
+ }
+
+ if (lmbs_available < lmbs_to_remove)
+ return -EINVAL;
+
+ for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
+ if (!(lmb->flags & DRCONF_MEM_ASSIGNED))
+ continue;
+
+ rc = dlpar_remove_lmb(lmb);
+ if (rc)
+ break;
+
+ drmem_mark_lmb_reserved(lmb);
+ }
+
+ if (rc) {
+ pr_err("Memory indexed-count-remove failed, adding any removed LMBs\n");
+
+ for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
+ if (!drmem_lmb_reserved(lmb))
+ continue;
+
+ rc = dlpar_add_lmb(lmb);
+ if (rc)
+ pr_err("Failed to add LMB, drc index %x\n",
+ lmb->drc_index);
+
+ drmem_remove_lmb_reservation(lmb);
+ }
+ rc = -EINVAL;
+ } else {
+ for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
+ if (!drmem_lmb_reserved(lmb))
+ continue;
+
+ dlpar_release_drc(lmb->drc_index);
+ pr_info("Memory at %llx (drc index %x) was hot-removed\n",
+ lmb->base_addr, lmb->drc_index);
+
+ drmem_remove_lmb_reservation(lmb);
+ }
+ }
+
+ return rc;
+}
+
+#else
+static inline int pseries_remove_memblock(unsigned long base,
+ unsigned int memblock_size)
+{
+ return -EOPNOTSUPP;
+}
+static inline int pseries_remove_mem_node(struct device_node *np)
+{
+ return 0;
+}
+static inline int dlpar_memory_remove(struct pseries_hp_errorlog *hp_elog)
+{
+ return -EOPNOTSUPP;
+}
+static int dlpar_remove_lmb(struct drmem_lmb *lmb)
+{
+ return -EOPNOTSUPP;
+}
+static int dlpar_memory_remove_by_count(u32 lmbs_to_remove)
+{
+ return -EOPNOTSUPP;
+}
+static int dlpar_memory_remove_by_index(u32 drc_index)
+{
+ return -EOPNOTSUPP;
+}
+static int dlpar_memory_readd_by_index(u32 drc_index)
+{
+ return -EOPNOTSUPP;
+}
+
+static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
+static int dlpar_add_lmb(struct drmem_lmb *lmb)
+{
+ unsigned long block_sz;
+ int nid, rc;
+
+ if (lmb->flags & DRCONF_MEM_ASSIGNED)
+ return -EINVAL;
+
+ rc = update_lmb_associativity_index(lmb);
+ if (rc) {
+ dlpar_release_drc(lmb->drc_index);
+ return rc;
+ }
+
+ block_sz = memory_block_size_bytes();
+
+ /* Find the node id for this address */
+ nid = memory_add_physaddr_to_nid(lmb->base_addr);
+
+ /* Add the memory */
+ rc = __add_memory(nid, lmb->base_addr, block_sz);
+ if (rc) {
+ invalidate_lmb_associativity_index(lmb);
+ return rc;
+ }
+
+ rc = dlpar_online_lmb(lmb);
+ if (rc) {
+ __remove_memory(nid, lmb->base_addr, block_sz);
+ invalidate_lmb_associativity_index(lmb);
+ } else {
+ lmb->flags |= DRCONF_MEM_ASSIGNED;
+ }
+
+ return rc;
+}
+
+static int dlpar_memory_add_by_count(u32 lmbs_to_add)
+{
+ struct drmem_lmb *lmb;
+ int lmbs_available = 0;
+ int lmbs_added = 0;
+ int rc;
+
+ pr_info("Attempting to hot-add %d LMB(s)\n", lmbs_to_add);
+
+ if (lmbs_to_add == 0)
+ return -EINVAL;
+
+ /* Validate that there are enough LMBs to satisfy the request */
+ for_each_drmem_lmb(lmb) {
+ if (!(lmb->flags & DRCONF_MEM_ASSIGNED))
+ lmbs_available++;
+
+ if (lmbs_available == lmbs_to_add)
+ break;
+ }
+
+ if (lmbs_available < lmbs_to_add)
+ return -EINVAL;
+
+ for_each_drmem_lmb(lmb) {
+ if (lmb->flags & DRCONF_MEM_ASSIGNED)
+ continue;
+
+ rc = dlpar_acquire_drc(lmb->drc_index);
+ if (rc)
+ continue;
+
+ rc = dlpar_add_lmb(lmb);
+ if (rc) {
+ dlpar_release_drc(lmb->drc_index);
+ continue;
+ }
+
+ /* Mark this lmb so we can remove it later if all of the
+ * requested LMBs cannot be added.
+ */
+ drmem_mark_lmb_reserved(lmb);
+
+ lmbs_added++;
+ if (lmbs_added == lmbs_to_add)
+ break;
+ }
+
+ if (lmbs_added != lmbs_to_add) {
+ pr_err("Memory hot-add failed, removing any added LMBs\n");
+
+ for_each_drmem_lmb(lmb) {
+ if (!drmem_lmb_reserved(lmb))
+ continue;
+
+ rc = dlpar_remove_lmb(lmb);
+ if (rc)
+ pr_err("Failed to remove LMB, drc index %x\n",
+ lmb->drc_index);
+ else
+ dlpar_release_drc(lmb->drc_index);
+
+ drmem_remove_lmb_reservation(lmb);
+ }
+ rc = -EINVAL;
+ } else {
+ for_each_drmem_lmb(lmb) {
+ if (!drmem_lmb_reserved(lmb))
+ continue;
+
+ pr_info("Memory at %llx (drc index %x) was hot-added\n",
+ lmb->base_addr, lmb->drc_index);
+ drmem_remove_lmb_reservation(lmb);
+ }
+ rc = 0;
+ }
+
+ return rc;
+}
+
+static int dlpar_memory_add_by_index(u32 drc_index)
+{
+ struct drmem_lmb *lmb;
+ int rc, lmb_found;
+
+ pr_info("Attempting to hot-add LMB, drc index %x\n", drc_index);
+
+ lmb_found = 0;
+ for_each_drmem_lmb(lmb) {
+ if (lmb->drc_index == drc_index) {
+ lmb_found = 1;
+ rc = dlpar_acquire_drc(lmb->drc_index);
+ if (!rc) {
+ rc = dlpar_add_lmb(lmb);
+ if (rc)
+ dlpar_release_drc(lmb->drc_index);
+ }
+
+ break;
+ }
+ }
+
+ if (!lmb_found)
+ rc = -EINVAL;
+
+ if (rc)
+ pr_info("Failed to hot-add memory, drc index %x\n", drc_index);
+ else
+ pr_info("Memory at %llx (drc index %x) was hot-added\n",
+ lmb->base_addr, drc_index);
+
+ return rc;
+}
+
+static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 drc_index)
+{
+ struct drmem_lmb *lmb, *start_lmb, *end_lmb;
+ int lmbs_available = 0;
+ int rc;
+
+ pr_info("Attempting to hot-add %u LMB(s) at index %x\n",
+ lmbs_to_add, drc_index);
+
+ if (lmbs_to_add == 0)
+ return -EINVAL;
+
+ rc = get_lmb_range(drc_index, lmbs_to_add, &start_lmb, &end_lmb);
+ if (rc)
+ return -EINVAL;
+
+ /* Validate that the LMBs in this range are not reserved */
+ for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
+ if (lmb->flags & DRCONF_MEM_RESERVED)
+ break;
+
+ lmbs_available++;
+ }
+
+ if (lmbs_available < lmbs_to_add)
+ return -EINVAL;
+
+ for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
+ if (lmb->flags & DRCONF_MEM_ASSIGNED)
+ continue;
+
+ rc = dlpar_acquire_drc(lmb->drc_index);
+ if (rc)
+ break;
+
+ rc = dlpar_add_lmb(lmb);
+ if (rc) {
+ dlpar_release_drc(lmb->drc_index);
+ break;
+ }
+
+ drmem_mark_lmb_reserved(lmb);
+ }
+
+ if (rc) {
+ pr_err("Memory indexed-count-add failed, removing any added LMBs\n");
+
+ for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
+ if (!drmem_lmb_reserved(lmb))
+ continue;
+
+ rc = dlpar_remove_lmb(lmb);
+ if (rc)
+ pr_err("Failed to remove LMB, drc index %x\n",
+ lmb->drc_index);
+ else
+ dlpar_release_drc(lmb->drc_index);
+
+ drmem_remove_lmb_reservation(lmb);
+ }
+ rc = -EINVAL;
+ } else {
+ for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
+ if (!drmem_lmb_reserved(lmb))
+ continue;
+
+ pr_info("Memory at %llx (drc index %x) was hot-added\n",
+ lmb->base_addr, lmb->drc_index);
+ drmem_remove_lmb_reservation(lmb);
+ }
+ }
+
+ return rc;
+}
+
+int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
+{
+ u32 count, drc_index;
+ int rc;
+
+ lock_device_hotplug();
+
+ switch (hp_elog->action) {
+ case PSERIES_HP_ELOG_ACTION_ADD:
+ if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) {
+ count = hp_elog->_drc_u.drc_count;
+ rc = dlpar_memory_add_by_count(count);
+ } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) {
+ drc_index = hp_elog->_drc_u.drc_index;
+ rc = dlpar_memory_add_by_index(drc_index);
+ } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) {
+ count = hp_elog->_drc_u.ic.count;
+ drc_index = hp_elog->_drc_u.ic.index;
+ rc = dlpar_memory_add_by_ic(count, drc_index);
+ } else {
+ rc = -EINVAL;
+ }
+
+ break;
+ case PSERIES_HP_ELOG_ACTION_REMOVE:
+ if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) {
+ count = hp_elog->_drc_u.drc_count;
+ rc = dlpar_memory_remove_by_count(count);
+ } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) {
+ drc_index = hp_elog->_drc_u.drc_index;
+ rc = dlpar_memory_remove_by_index(drc_index);
+ } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) {
+ count = hp_elog->_drc_u.ic.count;
+ drc_index = hp_elog->_drc_u.ic.index;
+ rc = dlpar_memory_remove_by_ic(count, drc_index);
+ } else {
+ rc = -EINVAL;
+ }
+
+ break;
+ case PSERIES_HP_ELOG_ACTION_READD:
+ drc_index = hp_elog->_drc_u.drc_index;
+ rc = dlpar_memory_readd_by_index(drc_index);
+ break;
+ default:
+ pr_err("Invalid action (%d) specified\n", hp_elog->action);
+ rc = -EINVAL;
+ break;
+ }
+
+ if (!rc) {
+ rtas_hp_event = true;
+ rc = drmem_update_dt();
+ rtas_hp_event = false;
+ }
+
+ unlock_device_hotplug();
+ return rc;
+}
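+
+/*
+ * Illustrative sketch (not from the original source): how a caller such as
+ * the DLPAR error-log handler elsewhere in this platform code might request
+ * a two-LMB hot-add through the dispatcher above. The surrounding handler
+ * and local variable names are assumed.
+ *
+ *	struct pseries_hp_errorlog hp_elog = { };
+ *	int rc;
+ *
+ *	hp_elog.action = PSERIES_HP_ELOG_ACTION_ADD;
+ *	hp_elog.id_type = PSERIES_HP_ELOG_ID_DRC_COUNT;
+ *	hp_elog._drc_u.drc_count = 2;
+ *	rc = dlpar_memory(&hp_elog);	/* takes the device hotplug lock */
+ */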
+
+static int pseries_add_mem_node(struct device_node *np)
+{
+ const char *type;
+ const __be32 *regs;
+ unsigned long base;
+ unsigned int lmb_size;
+ int ret = -EINVAL;
+
+ /*
+ * Check to see if we are actually adding memory
+ */
+ type = of_get_property(np, "device_type", NULL);
+ if (type == NULL || strcmp(type, "memory") != 0)
+ return 0;
+
+ /*
+ * Find the base and size of the memblock
+ */
+ regs = of_get_property(np, "reg", NULL);
+ if (!regs)
+ return ret;
+
+ base = be64_to_cpu(*(unsigned long *)regs);
+ lmb_size = be32_to_cpu(regs[3]);
+
+ /*
+ * Update memory region to represent the memory add
+ */
+ ret = memblock_add(base, lmb_size);
+ return (ret < 0) ? -EINVAL : 0;
+}
+
+static int pseries_update_drconf_memory(struct of_reconfig_data *pr)
+{
+ struct of_drconf_cell_v1 *new_drmem, *old_drmem;
+ unsigned long memblock_size;
+ u32 entries;
+ __be32 *p;
+ int i, rc = -EINVAL;
+
+ if (rtas_hp_event)
+ return 0;
+
+ memblock_size = pseries_memory_block_size();
+ if (!memblock_size)
+ return -EINVAL;
+
+ if (!pr->old_prop)
+ return 0;
+
+ p = (__be32 *) pr->old_prop->value;
+ if (!p)
+ return -EINVAL;
+
+ /* The first int of the property is the number of LMBs described
+ * by the property. This is followed by an array of of_drconf_cell
+ * entries. Get the number of entries and skip to the array of
+ * of_drconf_cell entries.
+ */
+ entries = be32_to_cpu(*p++);
+ old_drmem = (struct of_drconf_cell_v1 *)p;
+
+ p = (__be32 *)pr->prop->value;
+ p++;
+ new_drmem = (struct of_drconf_cell_v1 *)p;
+
+ for (i = 0; i < entries; i++) {
+ if ((be32_to_cpu(old_drmem[i].flags) & DRCONF_MEM_ASSIGNED) &&
+ (!(be32_to_cpu(new_drmem[i].flags) & DRCONF_MEM_ASSIGNED))) {
+ rc = pseries_remove_memblock(
+ be64_to_cpu(old_drmem[i].base_addr),
+ memblock_size);
+ break;
+ } else if ((!(be32_to_cpu(old_drmem[i].flags) &
+ DRCONF_MEM_ASSIGNED)) &&
+ (be32_to_cpu(new_drmem[i].flags) &
+ DRCONF_MEM_ASSIGNED)) {
+ rc = memblock_add(be64_to_cpu(old_drmem[i].base_addr),
+ memblock_size);
+ rc = (rc < 0) ? -EINVAL : 0;
+ break;
+ }
+ }
+ return rc;
+}
+
+static int pseries_memory_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct of_reconfig_data *rd = data;
+ int err = 0;
+
+ switch (action) {
+ case OF_RECONFIG_ATTACH_NODE:
+ err = pseries_add_mem_node(rd->dn);
+ break;
+ case OF_RECONFIG_DETACH_NODE:
+ err = pseries_remove_mem_node(rd->dn);
+ break;
+ case OF_RECONFIG_UPDATE_PROPERTY:
+ if (!strcmp(rd->prop->name, "ibm,dynamic-memory"))
+ err = pseries_update_drconf_memory(rd);
+ break;
+ }
+ return notifier_from_errno(err);
+}
+
+static struct notifier_block pseries_mem_nb = {
+ .notifier_call = pseries_memory_notifier,
+};
+
+static int __init pseries_memory_hotplug_init(void)
+{
+ if (firmware_has_feature(FW_FEATURE_LPAR))
+ of_reconfig_notifier_register(&pseries_mem_nb);
+
+ return 0;
+}
+machine_device_initcall(pseries, pseries_memory_hotplug_init);
diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S
new file mode 100644
index 000000000..50dc9426d
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/hvCall.S
@@ -0,0 +1,339 @@
+/*
+ * This file contains the generic code to perform a call to the
+ * pSeries LPAR hypervisor.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/jump_label.h>
+#include <asm/hvcall.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ptrace.h>
+#include <asm/feature-fixups.h>
+
+ .section ".text"
+
+#ifdef CONFIG_TRACEPOINTS
+
+#ifndef CONFIG_JUMP_LABEL
+ .section ".toc","aw"
+
+ .globl hcall_tracepoint_refcount
+hcall_tracepoint_refcount:
+ .8byte 0
+
+ .section ".text"
+#endif
+
+/*
+ * The precall must preserve all registers. Use the unused STK_PARAM()
+ * areas to save snapshots and the opcode.
+ */
+#define HCALL_INST_PRECALL(FIRST_REG) \
+ mflr r0; \
+ std r3,STK_PARAM(R3)(r1); \
+ std r4,STK_PARAM(R4)(r1); \
+ std r5,STK_PARAM(R5)(r1); \
+ std r6,STK_PARAM(R6)(r1); \
+ std r7,STK_PARAM(R7)(r1); \
+ std r8,STK_PARAM(R8)(r1); \
+ std r9,STK_PARAM(R9)(r1); \
+ std r10,STK_PARAM(R10)(r1); \
+ std r0,16(r1); \
+ addi r4,r1,STK_PARAM(FIRST_REG); \
+ stdu r1,-STACK_FRAME_OVERHEAD(r1); \
+ bl __trace_hcall_entry; \
+ ld r3,STACK_FRAME_OVERHEAD+STK_PARAM(R3)(r1); \
+ ld r4,STACK_FRAME_OVERHEAD+STK_PARAM(R4)(r1); \
+ ld r5,STACK_FRAME_OVERHEAD+STK_PARAM(R5)(r1); \
+ ld r6,STACK_FRAME_OVERHEAD+STK_PARAM(R6)(r1); \
+ ld r7,STACK_FRAME_OVERHEAD+STK_PARAM(R7)(r1); \
+ ld r8,STACK_FRAME_OVERHEAD+STK_PARAM(R8)(r1); \
+ ld r9,STACK_FRAME_OVERHEAD+STK_PARAM(R9)(r1); \
+ ld r10,STACK_FRAME_OVERHEAD+STK_PARAM(R10)(r1)
+
+/*
+ * postcall is performed immediately before function return which
+ * allows liberal use of volatile registers.
+ */
+#define __HCALL_INST_POSTCALL \
+ ld r0,STACK_FRAME_OVERHEAD+STK_PARAM(R3)(r1); \
+ std r3,STACK_FRAME_OVERHEAD+STK_PARAM(R3)(r1); \
+ mr r4,r3; \
+ mr r3,r0; \
+ bl __trace_hcall_exit; \
+ ld r0,STACK_FRAME_OVERHEAD+16(r1); \
+ addi r1,r1,STACK_FRAME_OVERHEAD; \
+ ld r3,STK_PARAM(R3)(r1); \
+ mtlr r0
+
+#define HCALL_INST_POSTCALL_NORETS \
+ li r5,0; \
+ __HCALL_INST_POSTCALL
+
+#define HCALL_INST_POSTCALL(BUFREG) \
+ mr r5,BUFREG; \
+ __HCALL_INST_POSTCALL
+
+#ifdef CONFIG_JUMP_LABEL
+#define HCALL_BRANCH(LABEL) \
+ ARCH_STATIC_BRANCH(LABEL, hcall_tracepoint_key)
+#else
+
+/*
+ * We branch around this in early init (eg when populating the MMU
+ * hashtable) by using an unconditional cpu feature.
+ */
+#define HCALL_BRANCH(LABEL) \
+BEGIN_FTR_SECTION; \
+ b 1f; \
+END_FTR_SECTION(0, 1); \
+ ld r12,hcall_tracepoint_refcount@toc(r2); \
+ std r12,32(r1); \
+ cmpdi r12,0; \
+ bne- LABEL; \
+1:
+#endif
+
+#else
+#define HCALL_INST_PRECALL(FIRST_ARG)
+#define HCALL_INST_POSTCALL_NORETS
+#define HCALL_INST_POSTCALL(BUFREG)
+#define HCALL_BRANCH(LABEL)
+#endif
+
+_GLOBAL_TOC(plpar_hcall_norets)
+ HMT_MEDIUM
+
+ mfcr r0
+ stw r0,8(r1)
+ HCALL_BRANCH(plpar_hcall_norets_trace)
+ HVSC /* invoke the hypervisor */
+
+ lwz r0,8(r1)
+ mtcrf 0xff,r0
+ blr /* return r3 = status */
+
+#ifdef CONFIG_TRACEPOINTS
+plpar_hcall_norets_trace:
+ HCALL_INST_PRECALL(R4)
+ HVSC
+ HCALL_INST_POSTCALL_NORETS
+ lwz r0,8(r1)
+ mtcrf 0xff,r0
+ blr
+#endif
+
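+/*
+ * plpar_hcall: r3 = hcall opcode, r4 = pointer to a 4-doubleword return
+ * buffer, r5..r10 = hcall arguments. The arguments are shuffled down into
+ * r4..r9 before HVSC, and the hypervisor's r4..r7 are stored back into the
+ * caller's buffer; the status is returned in r3.
+ */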
+_GLOBAL_TOC(plpar_hcall)
+ HMT_MEDIUM
+
+ mfcr r0
+ stw r0,8(r1)
+
+ HCALL_BRANCH(plpar_hcall_trace)
+
+ std r4,STK_PARAM(R4)(r1) /* Save ret buffer */
+
+ mr r4,r5
+ mr r5,r6
+ mr r6,r7
+ mr r7,r8
+ mr r8,r9
+ mr r9,r10
+
+ HVSC /* invoke the hypervisor */
+
+ ld r12,STK_PARAM(R4)(r1)
+ std r4, 0(r12)
+ std r5, 8(r12)
+ std r6, 16(r12)
+ std r7, 24(r12)
+
+ lwz r0,8(r1)
+ mtcrf 0xff,r0
+
+ blr /* return r3 = status */
+
+#ifdef CONFIG_TRACEPOINTS
+plpar_hcall_trace:
+ HCALL_INST_PRECALL(R5)
+
+ std r4,STK_PARAM(R4)(r1)
+ mr r0,r4
+
+ mr r4,r5
+ mr r5,r6
+ mr r6,r7
+ mr r7,r8
+ mr r8,r9
+ mr r9,r10
+
+ HVSC
+
+ ld r12,STK_PARAM(R4)(r1)
+ std r4,0(r12)
+ std r5,8(r12)
+ std r6,16(r12)
+ std r7,24(r12)
+
+ HCALL_INST_POSTCALL(r12)
+
+ lwz r0,8(r1)
+ mtcrf 0xff,r0
+
+ blr
+#endif
+
+/*
+ * plpar_hcall_raw can be called in real mode. kexec/kdump need some
+ * hypervisor calls to be executed in real mode. So plpar_hcall_raw
+ * does not access the per cpu hypervisor call statistics variables,
+ * since these variables may not be present in the RMO region.
+ */
+_GLOBAL(plpar_hcall_raw)
+ HMT_MEDIUM
+
+ mfcr r0
+ stw r0,8(r1)
+
+ std r4,STK_PARAM(R4)(r1) /* Save ret buffer */
+
+ mr r4,r5
+ mr r5,r6
+ mr r6,r7
+ mr r7,r8
+ mr r8,r9
+ mr r9,r10
+
+ HVSC /* invoke the hypervisor */
+
+ ld r12,STK_PARAM(R4)(r1)
+ std r4, 0(r12)
+ std r5, 8(r12)
+ std r6, 16(r12)
+ std r7, 24(r12)
+
+ lwz r0,8(r1)
+ mtcrf 0xff,r0
+
+ blr /* return r3 = status */
+
+_GLOBAL_TOC(plpar_hcall9)
+ HMT_MEDIUM
+
+ mfcr r0
+ stw r0,8(r1)
+
+ HCALL_BRANCH(plpar_hcall9_trace)
+
+ std r4,STK_PARAM(R4)(r1) /* Save ret buffer */
+
+ mr r4,r5
+ mr r5,r6
+ mr r6,r7
+ mr r7,r8
+ mr r8,r9
+ mr r9,r10
+ ld r10,STK_PARAM(R11)(r1) /* put arg7 in R10 */
+ ld r11,STK_PARAM(R12)(r1) /* put arg8 in R11 */
+ ld r12,STK_PARAM(R13)(r1) /* put arg9 in R12 */
+
+ HVSC /* invoke the hypervisor */
+
+ mr r0,r12
+ ld r12,STK_PARAM(R4)(r1)
+ std r4, 0(r12)
+ std r5, 8(r12)
+ std r6, 16(r12)
+ std r7, 24(r12)
+ std r8, 32(r12)
+ std r9, 40(r12)
+ std r10,48(r12)
+ std r11,56(r12)
+ std r0, 64(r12)
+
+ lwz r0,8(r1)
+ mtcrf 0xff,r0
+
+ blr /* return r3 = status */
+
+#ifdef CONFIG_TRACEPOINTS
+plpar_hcall9_trace:
+ HCALL_INST_PRECALL(R5)
+
+ std r4,STK_PARAM(R4)(r1)
+ mr r0,r4
+
+ mr r4,r5
+ mr r5,r6
+ mr r6,r7
+ mr r7,r8
+ mr r8,r9
+ mr r9,r10
+ ld r10,STACK_FRAME_OVERHEAD+STK_PARAM(R11)(r1)
+ ld r11,STACK_FRAME_OVERHEAD+STK_PARAM(R12)(r1)
+ ld r12,STACK_FRAME_OVERHEAD+STK_PARAM(R13)(r1)
+
+ HVSC
+
+ mr r0,r12
+ ld r12,STACK_FRAME_OVERHEAD+STK_PARAM(R4)(r1)
+ std r4,0(r12)
+ std r5,8(r12)
+ std r6,16(r12)
+ std r7,24(r12)
+ std r8,32(r12)
+ std r9,40(r12)
+ std r10,48(r12)
+ std r11,56(r12)
+ std r0,64(r12)
+
+ HCALL_INST_POSTCALL(r12)
+
+ lwz r0,8(r1)
+ mtcrf 0xff,r0
+
+ blr
+#endif
+
+/* See plpar_hcall_raw to see why this is needed */
+_GLOBAL(plpar_hcall9_raw)
+ HMT_MEDIUM
+
+ mfcr r0
+ stw r0,8(r1)
+
+ std r4,STK_PARAM(R4)(r1) /* Save ret buffer */
+
+ mr r4,r5
+ mr r5,r6
+ mr r6,r7
+ mr r7,r8
+ mr r8,r9
+ mr r9,r10
+ ld r10,STK_PARAM(R11)(r1) /* put arg7 in R10 */
+ ld r11,STK_PARAM(R12)(r1) /* put arg8 in R11 */
+ ld r12,STK_PARAM(R13)(r1) /* put arg9 in R12 */
+
+ HVSC /* invoke the hypervisor */
+
+ mr r0,r12
+ ld r12,STK_PARAM(R4)(r1)
+ std r4, 0(r12)
+ std r5, 8(r12)
+ std r6, 16(r12)
+ std r7, 24(r12)
+ std r8, 32(r12)
+ std r9, 40(r12)
+ std r10,48(r12)
+ std r11,56(r12)
+ std r0, 64(r12)
+
+ lwz r0,8(r1)
+ mtcrf 0xff,r0
+
+ blr /* return r3 = status */
diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c
new file mode 100644
index 000000000..6da320c78
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/hvCall_inst.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright (C) 2006 Mike Kravetz IBM Corporation
+ *
+ * Hypervisor Call Instrumentation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/cpumask.h>
+#include <asm/hvcall.h>
+#include <asm/firmware.h>
+#include <asm/cputable.h>
+#include <asm/trace.h>
+#include <asm/machdep.h>
+
+/* For hcall instrumentation. One structure per-hcall, per-CPU */
+struct hcall_stats {
+ unsigned long num_calls; /* number of calls (on this CPU) */
+ unsigned long tb_total; /* total wall time (mftb) of calls. */
+ unsigned long purr_total; /* total cpu time (PURR) of calls. */
+ unsigned long tb_start;
+ unsigned long purr_start;
+};
+#define HCALL_STAT_ARRAY_SIZE ((MAX_HCALL_OPCODE >> 2) + 1)
+
+DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats);
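+
+/*
+ * hcall opcodes are multiples of 4, so opcode >> 2 gives a dense index into
+ * this array; hc_show() below recovers the opcode from the index with the
+ * reverse shift (h_num << 2).
+ */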
+
+/*
+ * Routines for displaying the statistics in debugfs
+ */
+static void *hc_start(struct seq_file *m, loff_t *pos)
+{
+ if ((int)*pos < (HCALL_STAT_ARRAY_SIZE-1))
+ return (void *)(unsigned long)(*pos + 1);
+
+ return NULL;
+}
+
+static void *hc_next(struct seq_file *m, void *p, loff_t * pos)
+{
+ ++*pos;
+
+ return hc_start(m, pos);
+}
+
+static void hc_stop(struct seq_file *m, void *p)
+{
+}
+
+static int hc_show(struct seq_file *m, void *p)
+{
+ unsigned long h_num = (unsigned long)p;
+ struct hcall_stats *hs = m->private;
+
+ if (hs[h_num].num_calls) {
+ if (cpu_has_feature(CPU_FTR_PURR))
+ seq_printf(m, "%lu %lu %lu %lu\n", h_num<<2,
+ hs[h_num].num_calls,
+ hs[h_num].tb_total,
+ hs[h_num].purr_total);
+ else
+ seq_printf(m, "%lu %lu %lu\n", h_num<<2,
+ hs[h_num].num_calls,
+ hs[h_num].tb_total);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations hcall_inst_seq_ops = {
+ .start = hc_start,
+ .next = hc_next,
+ .stop = hc_stop,
+ .show = hc_show
+};
+
+static int hcall_inst_seq_open(struct inode *inode, struct file *file)
+{
+ int rc;
+ struct seq_file *seq;
+
+ rc = seq_open(file, &hcall_inst_seq_ops);
+ seq = file->private_data;
+ seq->private = file_inode(file)->i_private;
+
+ return rc;
+}
+
+static const struct file_operations hcall_inst_seq_fops = {
+ .open = hcall_inst_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#define HCALL_ROOT_DIR "hcall_inst"
+#define CPU_NAME_BUF_SIZE 32
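+
+/*
+ * hcall_inst_init() below creates one file per possible CPU, e.g.
+ * <debugfs>/hcall_inst/cpu0. hc_show() prints one line per opcode used on
+ * that CPU: "<opcode> <num_calls> <tb_total>", plus "<purr_total>" when the
+ * CPU has a PURR.
+ */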
+
+static void probe_hcall_entry(void *ignored, unsigned long opcode, unsigned long *args)
+{
+ struct hcall_stats *h;
+
+ if (opcode > MAX_HCALL_OPCODE)
+ return;
+
+ h = this_cpu_ptr(&hcall_stats[opcode / 4]);
+ h->tb_start = mftb();
+ h->purr_start = mfspr(SPRN_PURR);
+}
+
+static void probe_hcall_exit(void *ignored, unsigned long opcode, long retval,
+ unsigned long *retbuf)
+{
+ struct hcall_stats *h;
+
+ if (opcode > MAX_HCALL_OPCODE)
+ return;
+
+ h = this_cpu_ptr(&hcall_stats[opcode / 4]);
+ h->num_calls++;
+ h->tb_total += mftb() - h->tb_start;
+ h->purr_total += mfspr(SPRN_PURR) - h->purr_start;
+}
+
+static int __init hcall_inst_init(void)
+{
+ struct dentry *hcall_root;
+ struct dentry *hcall_file;
+ char cpu_name_buf[CPU_NAME_BUF_SIZE];
+ int cpu;
+
+ if (!firmware_has_feature(FW_FEATURE_LPAR))
+ return 0;
+
+ if (register_trace_hcall_entry(probe_hcall_entry, NULL))
+ return -EINVAL;
+
+ if (register_trace_hcall_exit(probe_hcall_exit, NULL)) {
+ unregister_trace_hcall_entry(probe_hcall_entry, NULL);
+ return -EINVAL;
+ }
+
+ hcall_root = debugfs_create_dir(HCALL_ROOT_DIR, NULL);
+ if (!hcall_root)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ snprintf(cpu_name_buf, CPU_NAME_BUF_SIZE, "cpu%d", cpu);
+ hcall_file = debugfs_create_file(cpu_name_buf, 0444,
+ hcall_root,
+ per_cpu(hcall_stats, cpu),
+ &hcall_inst_seq_fops);
+ if (!hcall_file)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+machine_device_initcall(pseries, hcall_inst_init);
diff --git a/arch/powerpc/platforms/pseries/hvconsole.c b/arch/powerpc/platforms/pseries/hvconsole.c
new file mode 100644
index 000000000..73ec15cd2
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/hvconsole.c
@@ -0,0 +1,88 @@
+/*
+ * hvconsole.c
+ * Copyright (C) 2004 Hollis Blanchard, IBM Corporation
+ * Copyright (C) 2004 IBM Corporation
+ *
+ * Additional Author(s):
+ * Ryan S. Arnold <rsa@us.ibm.com>
+ *
+ * LPAR console support.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/errno.h>
+#include <asm/hvcall.h>
+#include <asm/hvconsole.h>
+#include <asm/plpar_wrappers.h>
+
+/**
+ * hvc_get_chars - retrieve characters from firmware for denoted vterm adapter
+ * @vtermno: The vtermno or unit_address of the adapter from which to fetch the
+ * data.
+ * @buf: The character buffer into which to put the character data fetched from
+ * firmware.
+ * @count: not used; the firmware call always returns up to 16 bytes,
+ *	which are copied into @buf.
+ */
+int hvc_get_chars(uint32_t vtermno, char *buf, int count)
+{
+ long ret;
+ unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+ unsigned long *lbuf = (unsigned long *)buf;
+
+ ret = plpar_hcall(H_GET_TERM_CHAR, retbuf, vtermno);
+ lbuf[0] = be64_to_cpu(retbuf[1]);
+ lbuf[1] = be64_to_cpu(retbuf[2]);
+
+ if (ret == H_SUCCESS)
+ return retbuf[0];
+
+ return 0;
+}
+
+EXPORT_SYMBOL(hvc_get_chars);
+
+/**
+ * hvc_put_chars: send characters to firmware for denoted vterm adapter
+ * @vtermno: The vtermno or unit_address of the adapter from which the data
+ * originated.
+ * @buf: The character buffer that contains the character data to send to
+ * firmware. Must be at least 16 bytes, even if count is less than 16.
+ * @count: Send this number of characters.
+ */
+int hvc_put_chars(uint32_t vtermno, const char *buf, int count)
+{
+ unsigned long *lbuf = (unsigned long *) buf;
+ long ret;
+
+ /* hcall will ret H_PARAMETER if 'count' exceeds firmware max. */
+ if (count > MAX_VIO_PUT_CHARS)
+ count = MAX_VIO_PUT_CHARS;
+
+ ret = plpar_hcall_norets(H_PUT_TERM_CHAR, vtermno, count,
+ cpu_to_be64(lbuf[0]),
+ cpu_to_be64(lbuf[1]));
+ if (ret == H_SUCCESS)
+ return count;
+ if (ret == H_BUSY)
+ return -EAGAIN;
+ return -EIO;
+}
+
+EXPORT_SYMBOL(hvc_put_chars);
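+
+/*
+ * Illustrative sketch (not from the original source): because the hcall
+ * always consumes two longs, a caller with fewer than 16 bytes of data is
+ * expected to hand in a padded 16-byte buffer, e.g.
+ *
+ *	char padded[16] = { 0 };
+ *	int n = min(len, 16);
+ *
+ *	memcpy(padded, src, n);
+ *	sent = hvc_put_chars(vtermno, padded, n);
+ */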
diff --git a/arch/powerpc/platforms/pseries/hvcserver.c b/arch/powerpc/platforms/pseries/hvcserver.c
new file mode 100644
index 000000000..94a6e5612
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/hvcserver.c
@@ -0,0 +1,252 @@
+/*
+ * hvcserver.c
+ * Copyright (C) 2004 Ryan S Arnold, IBM Corporation
+ *
+ * PPC64 virtual I/O console server support.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include <asm/hvcall.h>
+#include <asm/hvcserver.h>
+#include <asm/io.h>
+
+#define HVCS_ARCH_VERSION "1.0.0"
+
+MODULE_AUTHOR("Ryan S. Arnold <rsa@us.ibm.com>");
+MODULE_DESCRIPTION("IBM hvcs ppc64 API");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(HVCS_ARCH_VERSION);
+
+/*
+ * Convert arch specific return codes into relevant errnos. The hvcs
+ * functions aren't performance sensitive, so this conversion isn't an
+ * issue.
+ */
+static int hvcs_convert(long to_convert)
+{
+ switch (to_convert) {
+ case H_SUCCESS:
+ return 0;
+ case H_PARAMETER:
+ return -EINVAL;
+ case H_HARDWARE:
+ return -EIO;
+ case H_BUSY:
+ case H_LONG_BUSY_ORDER_1_MSEC:
+ case H_LONG_BUSY_ORDER_10_MSEC:
+ case H_LONG_BUSY_ORDER_100_MSEC:
+ case H_LONG_BUSY_ORDER_1_SEC:
+ case H_LONG_BUSY_ORDER_10_SEC:
+ case H_LONG_BUSY_ORDER_100_SEC:
+ return -EBUSY;
+ case H_FUNCTION: /* fall through */
+ default:
+ return -EPERM;
+ }
+}
+
+/**
+ * hvcs_free_partner_info - free pi allocated by hvcs_get_partner_info
+ * @head: list_head pointer for an allocated list of partner info structs to
+ * free.
+ *
+ * This function is used to free the partner info list that was returned by
+ * calling hvcs_get_partner_info().
+ */
+int hvcs_free_partner_info(struct list_head *head)
+{
+ struct hvcs_partner_info *pi;
+ struct list_head *element;
+
+ if (!head)
+ return -EINVAL;
+
+ while (!list_empty(head)) {
+ element = head->next;
+ pi = list_entry(element, struct hvcs_partner_info, node);
+ list_del(element);
+ kfree(pi);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(hvcs_free_partner_info);
+
+/* Helper function for hvcs_get_partner_info */
+static int hvcs_next_partner(uint32_t unit_address,
+ unsigned long last_p_partition_ID,
+ unsigned long last_p_unit_address, unsigned long *pi_buff)
+{
+ long retval;
+ retval = plpar_hcall_norets(H_VTERM_PARTNER_INFO, unit_address,
+ last_p_partition_ID,
+ last_p_unit_address, virt_to_phys(pi_buff));
+ return hvcs_convert(retval);
+}
+
+/**
+ * hvcs_get_partner_info - Get all of the partner info for a vty-server adapter
+ * @unit_address: The unit_address of the vty-server adapter for which this
+ * function is fetching partner info.
+ * @head: An initialized list_head pointer to an empty list to use to return the
+ * list of partner info fetched from the hypervisor to the caller.
+ * @pi_buff: A page sized buffer pre-allocated prior to calling this function
+ * that is to be used by firmware as an iterator to keep track
+ * of the partner info retrieval.
+ *
+ * This function returns 0 on success. It returns a converted hcall error
+ * if the firmware query fails before any partner info has been gathered.
+ *
+ * The pi_buff is pre-allocated prior to calling this function because this
+ * function may be called with a spin_lock held and kmalloc of a page is not
+ * recommended as GFP_ATOMIC.
+ *
+ * The first long of this buffer is used to store a partner unit address. The
+ * second long is used to store a partner partition ID and starting at
+ * pi_buff[2] is the 79 character Converged Location Code (a different size
+ * than the unsigned longs, hence the casting mumbo jumbo you see later).
+ *
+ * Invocation of this function should always be followed by an invocation of
+ * hvcs_free_partner_info() using a pointer to the SAME list head instance
+ * that was passed as a parameter to this function.
+ */
+int hvcs_get_partner_info(uint32_t unit_address, struct list_head *head,
+ unsigned long *pi_buff)
+{
+ /*
+ * Dealt with as longs because of the hcall interface even though the
+ * values are uint32_t.
+ */
+ unsigned long last_p_partition_ID;
+ unsigned long last_p_unit_address;
+ struct hvcs_partner_info *next_partner_info = NULL;
+ int more = 1;
+ int retval;
+
+ /* invalid parameters */
+ if (!head || !pi_buff)
+ return -EINVAL;
+
+ memset(pi_buff, 0x00, PAGE_SIZE);
+ last_p_partition_ID = last_p_unit_address = ~0UL;
+ INIT_LIST_HEAD(head);
+
+ do {
+ retval = hvcs_next_partner(unit_address, last_p_partition_ID,
+ last_p_unit_address, pi_buff);
+ if (retval) {
+ /*
+ * Don't indicate that we've failed if we have
+ * any list elements.
+ */
+ if (!list_empty(head))
+ return 0;
+ return retval;
+ }
+
+ last_p_partition_ID = be64_to_cpu(pi_buff[0]);
+ last_p_unit_address = be64_to_cpu(pi_buff[1]);
+
+ /* This indicates that there are no further partners */
+ if (last_p_partition_ID == ~0UL
+ && last_p_unit_address == ~0UL)
+ break;
+
+ /* This is a very small struct and will be freed soon in
+ * hvcs_free_partner_info(). */
+ next_partner_info = kmalloc(sizeof(struct hvcs_partner_info),
+ GFP_ATOMIC);
+
+ if (!next_partner_info) {
+ printk(KERN_WARNING "HVCONSOLE: kmalloc() failed to"
+ " allocate partner info struct.\n");
+ hvcs_free_partner_info(head);
+ return -ENOMEM;
+ }
+
+ next_partner_info->unit_address
+ = (unsigned int)last_p_unit_address;
+ next_partner_info->partition_ID
+ = (unsigned int)last_p_partition_ID;
+
+ /* copy the Null-term char too */
+ strlcpy(&next_partner_info->location_code[0],
+ (char *)&pi_buff[2],
+ sizeof(next_partner_info->location_code));
+
+ list_add_tail(&(next_partner_info->node), head);
+ next_partner_info = NULL;
+
+ } while (more);
+
+ return 0;
+}
+EXPORT_SYMBOL(hvcs_get_partner_info);
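+
+/*
+ * Illustrative sketch (not from the original source) of the call pattern
+ * described in the kernel-doc above; the surrounding vty-server driver
+ * context is assumed:
+ *
+ *	LIST_HEAD(head);
+ *	unsigned long *pi_buff = (unsigned long *)__get_free_page(GFP_KERNEL);
+ *
+ *	if (pi_buff && !hvcs_get_partner_info(unit_address, &head, pi_buff)) {
+ *		struct hvcs_partner_info *pi;
+ *
+ *		list_for_each_entry(pi, &head, node)
+ *			pr_debug("partner %x:%x %s\n", pi->partition_ID,
+ *				 pi->unit_address, pi->location_code);
+ *		hvcs_free_partner_info(&head);
+ *	}
+ *	free_page((unsigned long)pi_buff);
+ */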
+
+/**
+ * hvcs_register_connection - establish a connection between this vty-server and
+ * a vty.
+ * @unit_address: The unit address of the vty-server adapter that is to
+ * establish a connection.
+ * @p_partition_ID: The partition ID of the vty adapter that is to be connected.
+ * @p_unit_address: The unit address of the vty adapter to which the vty-server
+ * is to be connected.
+ *
+ * If this function is called once and -EINVAL is returned it may
+ * indicate that the partner info needs to be refreshed for the
+ * target unit address at which point the caller must invoke
+ * hvcs_get_partner_info() and then call this function again. If,
+ * for a second time, -EINVAL is returned then it indicates that
+ * there is probably already a partner connection registered to a
+ * different vty-server adapter. It is also possible that a second
+ * -EINVAL may indicate that one of the parms is not valid, for
+ * instance if the link was removed between the vty-server adapter
+ * and the vty adapter that you are trying to open. Don't shoot the
+ * messenger. Firmware implemented it this way.
+ */
+int hvcs_register_connection( uint32_t unit_address,
+ uint32_t p_partition_ID, uint32_t p_unit_address)
+{
+ long retval;
+ retval = plpar_hcall_norets(H_REGISTER_VTERM, unit_address,
+ p_partition_ID, p_unit_address);
+ return hvcs_convert(retval);
+}
+EXPORT_SYMBOL(hvcs_register_connection);
+
+/**
+ * hvcs_free_connection - free the connection between a vty-server and vty
+ * @unit_address: The unit address of the vty-server that is to have its
+ * connection severed.
+ *
+ * This function is used to free the partner connection between a vty-server
+ * adapter and a vty adapter.
+ *
+ * If -EBUSY is returned continue to call this function until 0 is returned.
+ */
+int hvcs_free_connection(uint32_t unit_address)
+{
+ long retval;
+ retval = plpar_hcall_norets(H_FREE_VTERM, unit_address);
+ return hvcs_convert(retval);
+}
+EXPORT_SYMBOL(hvcs_free_connection);
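+
+/*
+ * Illustrative sketch (not from the original source): the kernel-doc above
+ * asks callers to retry on -EBUSY, e.g.
+ *
+ *	do {
+ *		rc = hvcs_free_connection(unit_address);
+ *	} while (rc == -EBUSY);
+ */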
diff --git a/arch/powerpc/platforms/pseries/ibmebus.c b/arch/powerpc/platforms/pseries/ibmebus.c
new file mode 100644
index 000000000..c7c1140c1
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/ibmebus.c
@@ -0,0 +1,469 @@
+/*
+ * IBM PowerPC IBM eBus Infrastructure Support.
+ *
+ * Copyright (c) 2005 IBM Corporation
+ * Joachim Fenkes <fenkes@de.ibm.com>
+ * Heiko J Schick <schickhj@de.ibm.com>
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/console.h>
+#include <linux/kobject.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/of_platform.h>
+#include <asm/ibmebus.h>
+
+static struct device ibmebus_bus_device = { /* fake "parent" device */
+ .init_name = "ibmebus",
+};
+
+struct bus_type ibmebus_bus_type;
+
+/* These devices will automatically be added to the bus during init */
+static const struct of_device_id ibmebus_matches[] __initconst = {
+ { .compatible = "IBM,lhca" },
+ { .compatible = "IBM,lhea" },
+ {},
+};
+
+static void *ibmebus_alloc_coherent(struct device *dev,
+ size_t size,
+ dma_addr_t *dma_handle,
+ gfp_t flag,
+ unsigned long attrs)
+{
+ void *mem;
+
+ mem = kmalloc(size, flag);
+ *dma_handle = (dma_addr_t)mem;
+
+ return mem;
+}
+
+static void ibmebus_free_coherent(struct device *dev,
+ size_t size, void *vaddr,
+ dma_addr_t dma_handle,
+ unsigned long attrs)
+{
+ kfree(vaddr);
+}
+
+static dma_addr_t ibmebus_map_page(struct device *dev,
+ struct page *page,
+ unsigned long offset,
+ size_t size,
+ enum dma_data_direction direction,
+ unsigned long attrs)
+{
+ return (dma_addr_t)(page_address(page) + offset);
+}
+
+static void ibmebus_unmap_page(struct device *dev,
+ dma_addr_t dma_addr,
+ size_t size,
+ enum dma_data_direction direction,
+ unsigned long attrs)
+{
+ return;
+}
+
+static int ibmebus_map_sg(struct device *dev,
+ struct scatterlist *sgl,
+ int nents, enum dma_data_direction direction,
+ unsigned long attrs)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nents, i) {
+ sg->dma_address = (dma_addr_t) sg_virt(sg);
+ sg->dma_length = sg->length;
+ }
+
+ return nents;
+}
+
+static void ibmebus_unmap_sg(struct device *dev,
+ struct scatterlist *sg,
+ int nents, enum dma_data_direction direction,
+ unsigned long attrs)
+{
+ return;
+}
+
+static int ibmebus_dma_supported(struct device *dev, u64 mask)
+{
+ return mask == DMA_BIT_MASK(64);
+}
+
+static u64 ibmebus_dma_get_required_mask(struct device *dev)
+{
+ return DMA_BIT_MASK(64);
+}
+
+static const struct dma_map_ops ibmebus_dma_ops = {
+ .alloc = ibmebus_alloc_coherent,
+ .free = ibmebus_free_coherent,
+ .map_sg = ibmebus_map_sg,
+ .unmap_sg = ibmebus_unmap_sg,
+ .dma_supported = ibmebus_dma_supported,
+ .get_required_mask = ibmebus_dma_get_required_mask,
+ .map_page = ibmebus_map_page,
+ .unmap_page = ibmebus_unmap_page,
+};
+
+static int ibmebus_match_path(struct device *dev, void *data)
+{
+ struct device_node *dn = to_platform_device(dev)->dev.of_node;
+ return (of_find_node_by_path(data) == dn);
+}
+
+static int ibmebus_match_node(struct device *dev, void *data)
+{
+ return to_platform_device(dev)->dev.of_node == data;
+}
+
+static int ibmebus_create_device(struct device_node *dn)
+{
+ struct platform_device *dev;
+ int ret;
+
+ dev = of_device_alloc(dn, NULL, &ibmebus_bus_device);
+ if (!dev)
+ return -ENOMEM;
+
+ dev->dev.bus = &ibmebus_bus_type;
+ dev->dev.dma_ops = &ibmebus_dma_ops;
+
+ ret = of_device_add(dev);
+ if (ret)
+ platform_device_put(dev);
+ return ret;
+}
+
+static int ibmebus_create_devices(const struct of_device_id *matches)
+{
+ struct device_node *root, *child;
+ struct device *dev;
+ int ret = 0;
+
+ root = of_find_node_by_path("/");
+
+ for_each_child_of_node(root, child) {
+ if (!of_match_node(matches, child))
+ continue;
+
+ dev = bus_find_device(&ibmebus_bus_type, NULL, child,
+ ibmebus_match_node);
+ if (dev) {
+ put_device(dev);
+ continue;
+ }
+
+ ret = ibmebus_create_device(child);
+ if (ret) {
+ printk(KERN_ERR "%s: failed to create device (%i)\n",
+ __func__, ret);
+ of_node_put(child);
+ break;
+ }
+ }
+
+ of_node_put(root);
+ return ret;
+}
+
+int ibmebus_register_driver(struct platform_driver *drv)
+{
+ /* If the driver uses devices that ibmebus doesn't know, add them */
+ ibmebus_create_devices(drv->driver.of_match_table);
+
+ drv->driver.bus = &ibmebus_bus_type;
+ return driver_register(&drv->driver);
+}
+EXPORT_SYMBOL(ibmebus_register_driver);
+
+void ibmebus_unregister_driver(struct platform_driver *drv)
+{
+ driver_unregister(&drv->driver);
+}
+EXPORT_SYMBOL(ibmebus_unregister_driver);
+
+int ibmebus_request_irq(u32 ist, irq_handler_t handler,
+ unsigned long irq_flags, const char *devname,
+ void *dev_id)
+{
+ unsigned int irq = irq_create_mapping(NULL, ist);
+
+ if (!irq)
+ return -EINVAL;
+
+ return request_irq(irq, handler, irq_flags, devname, dev_id);
+}
+EXPORT_SYMBOL(ibmebus_request_irq);
+
+void ibmebus_free_irq(u32 ist, void *dev_id)
+{
+ unsigned int irq = irq_find_mapping(NULL, ist);
+
+ free_irq(irq, dev_id);
+ irq_dispose_mapping(irq);
+}
+EXPORT_SYMBOL(ibmebus_free_irq);
+
+static char *ibmebus_chomp(const char *in, size_t count)
+{
+ char *out = kmalloc(count + 1, GFP_KERNEL);
+
+ if (!out)
+ return NULL;
+
+ memcpy(out, in, count);
+ out[count] = '\0';
+ if (out[count - 1] == '\n')
+ out[count - 1] = '\0';
+
+ return out;
+}
+
+static ssize_t ibmebus_store_probe(struct bus_type *bus,
+ const char *buf, size_t count)
+{
+ struct device_node *dn = NULL;
+ struct device *dev;
+ char *path;
+ ssize_t rc = 0;
+
+ path = ibmebus_chomp(buf, count);
+ if (!path)
+ return -ENOMEM;
+
+ dev = bus_find_device(&ibmebus_bus_type, NULL, path,
+ ibmebus_match_path);
+ if (dev) {
+ put_device(dev);
+ printk(KERN_WARNING "%s: %s has already been probed\n",
+ __func__, path);
+ rc = -EEXIST;
+ goto out;
+ }
+
+ if ((dn = of_find_node_by_path(path))) {
+ rc = ibmebus_create_device(dn);
+ of_node_put(dn);
+ } else {
+ printk(KERN_WARNING "%s: no such device node: %s\n",
+ __func__, path);
+ rc = -ENODEV;
+ }
+
+out:
+ kfree(path);
+ if (rc)
+ return rc;
+ return count;
+}
+static BUS_ATTR(probe, 0200, NULL, ibmebus_store_probe);
+
+static ssize_t ibmebus_store_remove(struct bus_type *bus,
+ const char *buf, size_t count)
+{
+ struct device *dev;
+ char *path;
+
+ path = ibmebus_chomp(buf, count);
+ if (!path)
+ return -ENOMEM;
+
+ if ((dev = bus_find_device(&ibmebus_bus_type, NULL, path,
+ ibmebus_match_path))) {
+ of_device_unregister(to_platform_device(dev));
+ put_device(dev);
+
+ kfree(path);
+ return count;
+ } else {
+ printk(KERN_WARNING "%s: %s not on the bus\n",
+ __func__, path);
+
+ kfree(path);
+ return -ENODEV;
+ }
+}
+static BUS_ATTR(remove, 0200, NULL, ibmebus_store_remove);
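+
+/*
+ * Illustrative sketch (not from the original source): both attributes take
+ * a device tree path; the node name below is hypothetical.
+ *
+ *	echo /ibm,lhea@23c00100 > /sys/bus/ibmebus/probe
+ *	echo /ibm,lhea@23c00100 > /sys/bus/ibmebus/remove
+ */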
+
+static struct attribute *ibmbus_bus_attrs[] = {
+ &bus_attr_probe.attr,
+ &bus_attr_remove.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(ibmbus_bus);
+
+static int ibmebus_bus_bus_match(struct device *dev, struct device_driver *drv)
+{
+ const struct of_device_id *matches = drv->of_match_table;
+
+ if (!matches)
+ return 0;
+
+ return of_match_device(matches, dev) != NULL;
+}
+
+static int ibmebus_bus_device_probe(struct device *dev)
+{
+ int error = -ENODEV;
+ struct platform_driver *drv;
+ struct platform_device *of_dev;
+
+ drv = to_platform_driver(dev->driver);
+ of_dev = to_platform_device(dev);
+
+ if (!drv->probe)
+ return error;
+
+ of_dev_get(of_dev);
+
+ if (of_driver_match_device(dev, dev->driver))
+ error = drv->probe(of_dev);
+ if (error)
+ of_dev_put(of_dev);
+
+ return error;
+}
+
+static int ibmebus_bus_device_remove(struct device *dev)
+{
+ struct platform_device *of_dev = to_platform_device(dev);
+ struct platform_driver *drv = to_platform_driver(dev->driver);
+
+ if (dev->driver && drv->remove)
+ drv->remove(of_dev);
+ return 0;
+}
+
+static void ibmebus_bus_device_shutdown(struct device *dev)
+{
+ struct platform_device *of_dev = to_platform_device(dev);
+ struct platform_driver *drv = to_platform_driver(dev->driver);
+
+ if (dev->driver && drv->shutdown)
+ drv->shutdown(of_dev);
+}
+
+/*
+ * ibmebus_bus_device_attrs
+ */
+static ssize_t devspec_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct platform_device *ofdev;
+
+ ofdev = to_platform_device(dev);
+ return sprintf(buf, "%pOF\n", ofdev->dev.of_node);
+}
+static DEVICE_ATTR_RO(devspec);
+
+static ssize_t name_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct platform_device *ofdev;
+
+ ofdev = to_platform_device(dev);
+ return sprintf(buf, "%s\n", ofdev->dev.of_node->name);
+}
+static DEVICE_ATTR_RO(name);
+
+static ssize_t modalias_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return of_device_modalias(dev, buf, PAGE_SIZE);
+}
+static DEVICE_ATTR_RO(modalias);
+
+static struct attribute *ibmebus_bus_device_attrs[] = {
+ &dev_attr_devspec.attr,
+ &dev_attr_name.attr,
+ &dev_attr_modalias.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(ibmebus_bus_device);
+
+struct bus_type ibmebus_bus_type = {
+ .name = "ibmebus",
+ .uevent = of_device_uevent_modalias,
+ .bus_groups = ibmbus_bus_groups,
+ .match = ibmebus_bus_bus_match,
+ .probe = ibmebus_bus_device_probe,
+ .remove = ibmebus_bus_device_remove,
+ .shutdown = ibmebus_bus_device_shutdown,
+ .dev_groups = ibmebus_bus_device_groups,
+};
+EXPORT_SYMBOL(ibmebus_bus_type);
+
+static int __init ibmebus_bus_init(void)
+{
+ int err;
+
+ printk(KERN_INFO "IBM eBus Device Driver\n");
+
+ err = bus_register(&ibmebus_bus_type);
+ if (err) {
+ printk(KERN_ERR "%s: failed to register IBM eBus.\n",
+ __func__);
+ return err;
+ }
+
+ err = device_register(&ibmebus_bus_device);
+ if (err) {
+ printk(KERN_WARNING "%s: device_register returned %i\n",
+ __func__, err);
+ bus_unregister(&ibmebus_bus_type);
+
+ return err;
+ }
+
+ err = ibmebus_create_devices(ibmebus_matches);
+ if (err) {
+ device_unregister(&ibmebus_bus_device);
+ bus_unregister(&ibmebus_bus_type);
+ return err;
+ }
+
+ return 0;
+}
+postcore_initcall(ibmebus_bus_init);
diff --git a/arch/powerpc/platforms/pseries/io_event_irq.c b/arch/powerpc/platforms/pseries/io_event_irq.c
new file mode 100644
index 000000000..f053bda64
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/io_event_irq.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright 2010 2011 Mark Nelson and Tseng-Hui (Frank) Lin, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/of.h>
+#include <linux/list.h>
+#include <linux/notifier.h>
+
+#include <asm/machdep.h>
+#include <asm/rtas.h>
+#include <asm/irq.h>
+#include <asm/io_event_irq.h>
+
+#include "pseries.h"
+
+/*
+ * IO event interrupt is a mechanism provided by RTAS to return
+ * information about hardware error and non-error events. Device
+ * drivers can register their event handlers to receive events.
+ * Device drivers are expected to use atomic_notifier_chain_register()
+ * and atomic_notifier_chain_unregister() to register and unregister
+ * their event handlers. Since multiple IO event types and scopes
+ * share an IO event interrupt, the event handlers are called one
+ * by one until the IO event is claimed by one of the handlers.
+ * The event handlers are expected to return NOTIFY_OK if the
+ * event is handled by the event handler or NOTIFY_DONE if the
+ * event does not belong to the handler.
+ *
+ * Usage:
+ *
+ * Notifier function:
+ * #include <asm/io_event_irq.h>
+ * int event_handler(struct notifier_block *nb, unsigned long val, void *data) {
+ * p = (struct pseries_io_event_sect_data *) data;
+ * if (! is_my_event(p->scope, p->event_type)) return NOTIFY_DONE;
+ * :
+ * :
+ * return NOTIFY_OK;
+ * }
+ * struct notifier_block event_nb = {
+ * .notifier_call = event_handler,
+ * }
+ *
+ * Registration:
+ * atomic_notifier_chain_register(&pseries_ioei_notifier_list, &event_nb);
+ *
+ * Unregistration:
+ * atomic_notifier_chain_unregister(&pseries_ioei_notifier_list, &event_nb);
+ */
+
+ATOMIC_NOTIFIER_HEAD(pseries_ioei_notifier_list);
+EXPORT_SYMBOL_GPL(pseries_ioei_notifier_list);
+
+static int ioei_check_exception_token;
+
+static char ioei_rtas_buf[RTAS_DATA_BUF_SIZE] __cacheline_aligned;
+
+/**
+ * Find the data portion of an IO Event section from event log.
+ * @elog: RTAS error/event log.
+ *
+ * Return:
+ * pointer to a valid IO event section data. NULL if not found.
+ */
+static struct pseries_io_event * ioei_find_event(struct rtas_error_log *elog)
+{
+ struct pseries_errorlog *sect;
+
+ /* We should only ever get called for io-event interrupts, but if
+ * we do get called for another type then something went wrong so
+ * make some noise about it.
+ * RTAS_TYPE_IO only exists in extended event log version 6 or later.
+ * No need to check event log version.
+ */
+ if (unlikely(rtas_error_type(elog) != RTAS_TYPE_IO)) {
+ printk_once(KERN_WARNING "io_event_irq: Unexpected event type %d\n",
+ rtas_error_type(elog));
+ return NULL;
+ }
+
+ sect = get_pseries_errorlog(elog, PSERIES_ELOG_SECT_ID_IO_EVENT);
+ if (unlikely(!sect)) {
+ printk_once(KERN_WARNING "io_event_irq: RTAS extended event "
+ "log does not contain an IO Event section. "
+ "Could be a bug in system firmware!\n");
+ return NULL;
+ }
+ return (struct pseries_io_event *) &sect->data;
+}
+
+/*
+ * PAPR:
+ * - check-exception returns the first error or event found and clears it
+ *   so that it is reported only once.
+ * - Each interrupt returns one event. If a platform chooses to report
+ *   multiple events through a single interrupt, it must ensure that the
+ *   interrupt remains asserted until check-exception has been used to
+ *   process all outstanding events for that interrupt.
+ *
+ * Implementation notes:
+ * - Events must be processed in the order they are returned; processing
+ *   is therefore sequential in nature.
+ * - The owner of an event is determined by combinations of scope,
+ * event type, and sub-type. There is no easy way to pre-sort clients
+ * by scope or event type alone. For example, Torrent ISR route change
+ * event is reported with scope 0x00 (Not Applicable) rather than
+ * 0x3B (Torrent-hub). It is better to let the clients to identify
+ *   0x3B (Torrent-hub). It is better to let the clients identify
+ */
+
+static irqreturn_t ioei_interrupt(int irq, void *dev_id)
+{
+ struct pseries_io_event *event;
+ int rtas_rc;
+
+ for (;;) {
+ rtas_rc = rtas_call(ioei_check_exception_token, 6, 1, NULL,
+ RTAS_VECTOR_EXTERNAL_INTERRUPT,
+ virq_to_hw(irq),
+ RTAS_IO_EVENTS, 1 /* Time Critical */,
+ __pa(ioei_rtas_buf),
+ RTAS_DATA_BUF_SIZE);
+ if (rtas_rc != 0)
+ break;
+
+ event = ioei_find_event((struct rtas_error_log *)ioei_rtas_buf);
+ if (!event)
+ continue;
+
+ atomic_notifier_call_chain(&pseries_ioei_notifier_list,
+ 0, event);
+ }
+ return IRQ_HANDLED;
+}
+
+static int __init ioei_init(void)
+{
+ struct device_node *np;
+
+ ioei_check_exception_token = rtas_token("check-exception");
+ if (ioei_check_exception_token == RTAS_UNKNOWN_SERVICE)
+ return -ENODEV;
+
+ np = of_find_node_by_path("/event-sources/ibm,io-events");
+ if (np) {
+ request_event_sources_irqs(np, ioei_interrupt, "IO_EVENT");
+ pr_info("IBM I/O event interrupts enabled\n");
+ of_node_put(np);
+ } else {
+ return -ENODEV;
+ }
+ return 0;
+}
+machine_subsys_initcall(pseries, ioei_init);
+
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
new file mode 100644
index 000000000..b1a08cb76
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -0,0 +1,1413 @@
+/*
+ * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
+ *
+ * Rewrite, cleanup:
+ *
+ * Copyright (C) 2004 Olof Johansson <olof@lixom.net>, IBM Corporation
+ * Copyright (C) 2006 Olof Johansson <olof@lixom.net>
+ *
+ * Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR.
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/memblock.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/crash_dump.h>
+#include <linux/memory.h>
+#include <linux/of.h>
+#include <linux/iommu.h>
+#include <linux/rculist.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/iommu.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/tce.h>
+#include <asm/ppc-pci.h>
+#include <asm/udbg.h>
+#include <asm/mmzone.h>
+#include <asm/plpar_wrappers.h>
+
+#include "pseries.h"
+
+static struct iommu_table_group *iommu_pseries_alloc_group(int node)
+{
+ struct iommu_table_group *table_group;
+ struct iommu_table *tbl;
+ struct iommu_table_group_link *tgl;
+
+ table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
+ node);
+ if (!table_group)
+ return NULL;
+
+ tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
+ if (!tbl)
+ goto free_group;
+
+ tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL,
+ node);
+ if (!tgl)
+ goto free_table;
+
+ INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+ kref_init(&tbl->it_kref);
+ tgl->table_group = table_group;
+ list_add_rcu(&tgl->next, &tbl->it_group_list);
+
+ table_group->tables[0] = tbl;
+
+ return table_group;
+
+free_table:
+ kfree(tbl);
+free_group:
+ kfree(table_group);
+ return NULL;
+}
+
+static void iommu_pseries_free_group(struct iommu_table_group *table_group,
+ const char *node_name)
+{
+ struct iommu_table *tbl;
+#ifdef CONFIG_IOMMU_API
+ struct iommu_table_group_link *tgl;
+#endif
+
+ if (!table_group)
+ return;
+
+ tbl = table_group->tables[0];
+#ifdef CONFIG_IOMMU_API
+ tgl = list_first_entry_or_null(&tbl->it_group_list,
+ struct iommu_table_group_link, next);
+
+ WARN_ON_ONCE(!tgl);
+ if (tgl) {
+ list_del_rcu(&tgl->next);
+ kfree(tgl);
+ }
+ if (table_group->group) {
+ iommu_group_put(table_group->group);
+ BUG_ON(table_group->group);
+ }
+#endif
+ iommu_tce_table_put(tbl);
+
+ kfree(table_group);
+}
+
+static int tce_build_pSeries(struct iommu_table *tbl, long index,
+ long npages, unsigned long uaddr,
+ enum dma_data_direction direction,
+ unsigned long attrs)
+{
+ u64 proto_tce;
+ __be64 *tcep, *tces;
+ u64 rpn;
+
+ proto_tce = TCE_PCI_READ; // Read allowed
+
+ if (direction != DMA_TO_DEVICE)
+ proto_tce |= TCE_PCI_WRITE;
+
+ tces = tcep = ((__be64 *)tbl->it_base) + index;
+
+ while (npages--) {
+ /* can't move this out since we might cross MEMBLOCK boundary */
+ rpn = __pa(uaddr) >> TCE_SHIFT;
+ *tcep = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT);
+
+ uaddr += TCE_PAGE_SIZE;
+ tcep++;
+ }
+ return 0;
+}
+
+
+static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages)
+{
+ __be64 *tcep, *tces;
+
+ tces = tcep = ((__be64 *)tbl->it_base) + index;
+
+ while (npages--)
+ *(tcep++) = 0;
+}
+
+static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
+{
+ __be64 *tcep;
+
+ tcep = ((__be64 *)tbl->it_base) + index;
+
+ return be64_to_cpu(*tcep);
+}
+
+static void tce_free_pSeriesLP(unsigned long liobn, long, long);
+static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);
+
+static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
+ long npages, unsigned long uaddr,
+ enum dma_data_direction direction,
+ unsigned long attrs)
+{
+ u64 rc = 0;
+ u64 proto_tce, tce;
+ u64 rpn;
+ int ret = 0;
+ long tcenum_start = tcenum, npages_start = npages;
+
+ rpn = __pa(uaddr) >> tceshift;
+ proto_tce = TCE_PCI_READ;
+ if (direction != DMA_TO_DEVICE)
+ proto_tce |= TCE_PCI_WRITE;
+
+ while (npages--) {
+ tce = proto_tce | (rpn & TCE_RPN_MASK) << tceshift;
+ rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce);
+
+ if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
+ ret = (int)rc;
+ tce_free_pSeriesLP(liobn, tcenum_start,
+ (npages_start - (npages + 1)));
+ break;
+ }
+
+ if (rc && printk_ratelimit()) {
+ printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
+ printk("\tindex = 0x%llx\n", (u64)liobn);
+ printk("\ttcenum = 0x%llx\n", (u64)tcenum);
+ printk("\ttce val = 0x%llx\n", tce );
+ dump_stack();
+ }
+
+ tcenum++;
+ rpn++;
+ }
+ return ret;
+}
+
+static DEFINE_PER_CPU(__be64 *, tce_page);
+
+static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
+ long npages, unsigned long uaddr,
+ enum dma_data_direction direction,
+ unsigned long attrs)
+{
+ u64 rc = 0;
+ u64 proto_tce;
+ __be64 *tcep;
+ u64 rpn;
+ long l, limit;
+ long tcenum_start = tcenum, npages_start = npages;
+ int ret = 0;
+ unsigned long flags;
+
+ if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) {
+ return tce_build_pSeriesLP(tbl->it_index, tcenum,
+ tbl->it_page_shift, npages, uaddr,
+ direction, attrs);
+ }
+
+ local_irq_save(flags); /* to protect tcep and the page behind it */
+
+ tcep = __this_cpu_read(tce_page);
+
+ /* This is safe to do since interrupts are off when we're called
+ * from iommu_alloc{,_sg}()
+ */
+ if (!tcep) {
+ tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
+ /* If allocation fails, fall back to the loop implementation */
+ if (!tcep) {
+ local_irq_restore(flags);
+ return tce_build_pSeriesLP(tbl->it_index, tcenum,
+ tbl->it_page_shift,
+ npages, uaddr, direction, attrs);
+ }
+ __this_cpu_write(tce_page, tcep);
+ }
+
+ rpn = __pa(uaddr) >> TCE_SHIFT;
+ proto_tce = TCE_PCI_READ;
+ if (direction != DMA_TO_DEVICE)
+ proto_tce |= TCE_PCI_WRITE;
+
+ /* We can map max one pageful of TCEs at a time */
+ do {
+ /*
+ * Set up the page with TCE data, looping through and setting
+ * the values.
+ */
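+		/* one 4K scratch page holds 4096 / TCE_ENTRY_SIZE = 512 TCEs */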
+ limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE);
+
+ for (l = 0; l < limit; l++) {
+ tcep[l] = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT);
+ rpn++;
+ }
+
+ rc = plpar_tce_put_indirect((u64)tbl->it_index,
+ (u64)tcenum << 12,
+ (u64)__pa(tcep),
+ limit);
+
+ npages -= limit;
+ tcenum += limit;
+ } while (npages > 0 && !rc);
+
+ local_irq_restore(flags);
+
+ if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
+ ret = (int)rc;
+ tce_freemulti_pSeriesLP(tbl, tcenum_start,
+ (npages_start - (npages + limit)));
+ return ret;
+ }
+
+ if (rc && printk_ratelimit()) {
+ printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
+ printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
+ printk("\tnpages = 0x%llx\n", (u64)npages);
+ printk("\ttce[0] val = 0x%llx\n", tcep[0]);
+ dump_stack();
+ }
+ return ret;
+}
+
+static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long npages)
+{
+ u64 rc;
+
+ while (npages--) {
+ rc = plpar_tce_put((u64)liobn, (u64)tcenum << 12, 0);
+
+ if (rc && printk_ratelimit()) {
+ printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
+ printk("\tindex = 0x%llx\n", (u64)liobn);
+ printk("\ttcenum = 0x%llx\n", (u64)tcenum);
+ dump_stack();
+ }
+
+ tcenum++;
+ }
+}
+
+
+static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
+{
+ u64 rc;
+
+ if (!firmware_has_feature(FW_FEATURE_MULTITCE))
+ return tce_free_pSeriesLP(tbl->it_index, tcenum, npages);
+
+ rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages);
+
+ if (rc && printk_ratelimit()) {
+ printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");
+ printk("\trc = %lld\n", rc);
+ printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
+ printk("\tnpages = 0x%llx\n", (u64)npages);
+ dump_stack();
+ }
+}
+
+static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)
+{
+ u64 rc;
+ unsigned long tce_ret;
+
+ rc = plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, &tce_ret);
+
+ if (rc && printk_ratelimit()) {
+ printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc);
+ printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
+ printk("\ttcenum = 0x%llx\n", (u64)tcenum);
+ dump_stack();
+ }
+
+ return tce_ret;
+}
+
+/* This layout is compatible with the cells of the device tree property. */
+struct dynamic_dma_window_prop {
+ __be32 liobn; /* tce table number */
+ __be64 dma_base; /* address hi,lo */
+ __be32 tce_shift; /* ilog2(tce_page_size) */
+ __be32 window_shift; /* ilog2(tce_window_size) */
+};
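+
+/*
+ * Example encoding (illustrative values only): a 1TB direct window backed
+ * by 16MB TCE pages would carry tce_shift = 24 and window_shift = 40;
+ * dma_base holds the 64-bit starting address of the window.
+ */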
+
+struct direct_window {
+ struct device_node *device;
+ const struct dynamic_dma_window_prop *prop;
+ struct list_head list;
+};
+
+/* Dynamic DMA Window support */
+struct ddw_query_response {
+ u32 windows_available;
+ u32 largest_available_block;
+ u32 page_size;
+ u32 migration_capable;
+};
+
+struct ddw_create_response {
+ u32 liobn;
+ u32 addr_hi;
+ u32 addr_lo;
+};
+
+static LIST_HEAD(direct_window_list);
+/* prevents races between memory on/offline and window creation */
+static DEFINE_SPINLOCK(direct_window_list_lock);
+/* protects initializing window twice for same device */
+static DEFINE_MUTEX(direct_window_init_mutex);
+#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
+
+static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
+ unsigned long num_pfn, const void *arg)
+{
+ const struct dynamic_dma_window_prop *maprange = arg;
+ int rc;
+ u64 tce_size, num_tce, dma_offset, next;
+ u32 tce_shift;
+ long limit;
+
+ tce_shift = be32_to_cpu(maprange->tce_shift);
+ tce_size = 1ULL << tce_shift;
+ next = start_pfn << PAGE_SHIFT;
+ num_tce = num_pfn << PAGE_SHIFT;
+
+ /* round back to the beginning of the tce page size */
+ num_tce += next & (tce_size - 1);
+ next &= ~(tce_size - 1);
+
+	/* convert to the number of TCEs */
+ num_tce |= tce_size - 1;
+ num_tce >>= tce_shift;
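+	/*
+	 * Example: with 16MB TCEs (tce_shift = 24), a range starting 4MB
+	 * into a TCE page is first extended back to the 16MB boundary and
+	 * then rounded up to a whole number of TCE entries.
+	 */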
+
+ do {
+ /*
+ * Set up the page with TCE data, looping through and setting
+ * the values.
+ */
+ limit = min_t(long, num_tce, 512);
+ dma_offset = next + be64_to_cpu(maprange->dma_base);
+
+ rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn),
+ dma_offset,
+ 0, limit);
+ next += limit * tce_size;
+ num_tce -= limit;
+ } while (num_tce > 0 && !rc);
+
+ return rc;
+}
+
+static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
+ unsigned long num_pfn, const void *arg)
+{
+ const struct dynamic_dma_window_prop *maprange = arg;
+ u64 tce_size, num_tce, dma_offset, next, proto_tce, liobn;
+ __be64 *tcep;
+ u32 tce_shift;
+ u64 rc = 0;
+ long l, limit;
+
+ if (!firmware_has_feature(FW_FEATURE_MULTITCE)) {
+ unsigned long tceshift = be32_to_cpu(maprange->tce_shift);
+ unsigned long dmastart = (start_pfn << PAGE_SHIFT) +
+ be64_to_cpu(maprange->dma_base);
+ unsigned long tcenum = dmastart >> tceshift;
+ unsigned long npages = num_pfn << PAGE_SHIFT >> tceshift;
+ void *uaddr = __va(start_pfn << PAGE_SHIFT);
+
+ return tce_build_pSeriesLP(be32_to_cpu(maprange->liobn),
+ tcenum, tceshift, npages, (unsigned long) uaddr,
+ DMA_BIDIRECTIONAL, 0);
+ }
+
+ local_irq_disable(); /* to protect tcep and the page behind it */
+ tcep = __this_cpu_read(tce_page);
+
+ if (!tcep) {
+ tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
+ if (!tcep) {
+ local_irq_enable();
+ return -ENOMEM;
+ }
+ __this_cpu_write(tce_page, tcep);
+ }
+
+ proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;
+
+ liobn = (u64)be32_to_cpu(maprange->liobn);
+ tce_shift = be32_to_cpu(maprange->tce_shift);
+ tce_size = 1ULL << tce_shift;
+ next = start_pfn << PAGE_SHIFT;
+ num_tce = num_pfn << PAGE_SHIFT;
+
+ /* round back to the beginning of the tce page size */
+ num_tce += next & (tce_size - 1);
+ next &= ~(tce_size - 1);
+
+	/* convert to the number of TCEs */
+ num_tce |= tce_size - 1;
+ num_tce >>= tce_shift;
+
+ /* We can map max one pageful of TCEs at a time */
+ do {
+ /*
+ * Set up the page with TCE data, looping through and setting
+ * the values.
+ */
+ limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE);
+ dma_offset = next + be64_to_cpu(maprange->dma_base);
+
+ for (l = 0; l < limit; l++) {
+ tcep[l] = cpu_to_be64(proto_tce | next);
+ next += tce_size;
+ }
+
+ rc = plpar_tce_put_indirect(liobn,
+ dma_offset,
+ (u64)__pa(tcep),
+ limit);
+
+ num_tce -= limit;
+ } while (num_tce > 0 && !rc);
+
+ /* error cleanup: caller will clear whole range */
+
+ local_irq_enable();
+ return rc;
+}
+
+static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
+ unsigned long num_pfn, void *arg)
+{
+ return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
+}
+
+static void iommu_table_setparms(struct pci_controller *phb,
+ struct device_node *dn,
+ struct iommu_table *tbl)
+{
+ struct device_node *node;
+ const unsigned long *basep;
+ const u32 *sizep;
+
+ node = phb->dn;
+
+ basep = of_get_property(node, "linux,tce-base", NULL);
+ sizep = of_get_property(node, "linux,tce-size", NULL);
+ if (basep == NULL || sizep == NULL) {
+ printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %pOF has "
+ "missing tce entries !\n", dn);
+ return;
+ }
+
+ tbl->it_base = (unsigned long)__va(*basep);
+
+ if (!is_kdump_kernel())
+ memset((void *)tbl->it_base, 0, *sizep);
+
+ tbl->it_busno = phb->bus->number;
+ tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
+
+ /* Units of tce entries */
+ tbl->it_offset = phb->dma_window_base_cur >> tbl->it_page_shift;
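+	/*
+	 * e.g. a window base of 0x8000000 (128MB) with 4K IOMMU pages
+	 * gives it_offset = 0x8000000 >> 12 = 0x8000 entries.
+	 */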
+
+ /* Test if we are going over 2GB of DMA space */
+ if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
+ udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
+ panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
+ }
+
+ phb->dma_window_base_cur += phb->dma_window_size;
+
+ /* Set the tce table size - measured in entries */
+ tbl->it_size = phb->dma_window_size >> tbl->it_page_shift;
+
+ tbl->it_index = 0;
+ tbl->it_blocksize = 16;
+ tbl->it_type = TCE_PCI;
+}
+
+/*
+ * iommu_table_setparms_lpar
+ *
+ * Function: On pSeries LPAR systems, return TCE table info, given a pci bus.
+ */
+static void iommu_table_setparms_lpar(struct pci_controller *phb,
+ struct device_node *dn,
+ struct iommu_table *tbl,
+ struct iommu_table_group *table_group,
+ const __be32 *dma_window)
+{
+ unsigned long offset, size;
+
+ of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size);
+
+ tbl->it_busno = phb->bus->number;
+ tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
+ tbl->it_base = 0;
+ tbl->it_blocksize = 16;
+ tbl->it_type = TCE_PCI;
+ tbl->it_offset = offset >> tbl->it_page_shift;
+ tbl->it_size = size >> tbl->it_page_shift;
+
+ table_group->tce32_start = offset;
+ table_group->tce32_size = size;
+}
+
+struct iommu_table_ops iommu_table_pseries_ops = {
+ .set = tce_build_pSeries,
+ .clear = tce_free_pSeries,
+ .get = tce_get_pseries
+};
+
+static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
+{
+ struct device_node *dn;
+ struct iommu_table *tbl;
+ struct device_node *isa_dn, *isa_dn_orig;
+ struct device_node *tmp;
+ struct pci_dn *pci;
+ int children;
+
+ dn = pci_bus_to_OF_node(bus);
+
+ pr_debug("pci_dma_bus_setup_pSeries: setting up bus %pOF\n", dn);
+
+ if (bus->self) {
+ /* This is not a root bus, any setup will be done for the
+ * device-side of the bridge in iommu_dev_setup_pSeries().
+ */
+ return;
+ }
+ pci = PCI_DN(dn);
+
+ /* Check if the ISA bus on the system is under
+ * this PHB.
+ */
+ isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa");
+
+ while (isa_dn && isa_dn != dn)
+ isa_dn = isa_dn->parent;
+
+ of_node_put(isa_dn_orig);
+
+ /* Count number of direct PCI children of the PHB. */
+ for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling)
+ children++;
+
+ pr_debug("Children: %d\n", children);
+
+ /* Calculate amount of DMA window per slot. Each window must be
+ * a power of two (due to pci_alloc_consistent requirements).
+ *
+ * Keep 256MB aside for PHBs with ISA.
+ */
+
+ if (!isa_dn) {
+ /* No ISA/IDE - just set window size and return */
+ pci->phb->dma_window_size = 0x80000000ul; /* To be divided */
+
+ while (pci->phb->dma_window_size * children > 0x80000000ul)
+ pci->phb->dma_window_size >>= 1;
+ pr_debug("No ISA/IDE, window size is 0x%llx\n",
+ pci->phb->dma_window_size);
+ pci->phb->dma_window_base_cur = 0;
+
+ return;
+ }
+
+ /* If we have ISA, then we probably have an IDE
+ * controller too. Allocate a 128MB table but
+ * skip the first 128MB to avoid stepping on ISA
+ * space.
+ */
+ pci->phb->dma_window_size = 0x8000000ul;
+ pci->phb->dma_window_base_cur = 0x8000000ul;
+
+ pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
+ tbl = pci->table_group->tables[0];
+
+ iommu_table_setparms(pci->phb, dn, tbl);
+ tbl->it_ops = &iommu_table_pseries_ops;
+ iommu_init_table(tbl, pci->phb->node);
+ iommu_register_group(pci->table_group, pci_domain_nr(bus), 0);
+
+ /* Divide the rest (1.75GB) among the children */
+ pci->phb->dma_window_size = 0x80000000ul;
+ while (pci->phb->dma_window_size * children > 0x70000000ul)
+ pci->phb->dma_window_size >>= 1;
+
+ pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
+}
+
+#ifdef CONFIG_IOMMU_API
+static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned
+ long *tce, enum dma_data_direction *direction)
+{
+ long rc;
+ unsigned long ioba = (unsigned long) index << tbl->it_page_shift;
+ unsigned long flags, oldtce = 0;
+ u64 proto_tce = iommu_direction_to_tce_perm(*direction);
+ unsigned long newtce = *tce | proto_tce;
+
+ spin_lock_irqsave(&tbl->large_pool.lock, flags);
+
+ rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce);
+ if (!rc)
+ rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce);
+
+ if (!rc) {
+ *direction = iommu_tce_direction(oldtce);
+ *tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
+ }
+
+ spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
+
+ return rc;
+}
+#endif
+
+struct iommu_table_ops iommu_table_lpar_multi_ops = {
+ .set = tce_buildmulti_pSeriesLP,
+#ifdef CONFIG_IOMMU_API
+ .exchange = tce_exchange_pseries,
+#endif
+ .clear = tce_freemulti_pSeriesLP,
+ .get = tce_get_pSeriesLP
+};
+
+static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
+{
+ struct iommu_table *tbl;
+ struct device_node *dn, *pdn;
+ struct pci_dn *ppci;
+ const __be32 *dma_window = NULL;
+
+ dn = pci_bus_to_OF_node(bus);
+
+ pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
+ dn);
+
+ /* Find nearest ibm,dma-window, walking up the device tree */
+ for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
+ dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+ if (dma_window != NULL)
+ break;
+ }
+
+ if (dma_window == NULL) {
+ pr_debug(" no ibm,dma-window property !\n");
+ return;
+ }
+
+ ppci = PCI_DN(pdn);
+
+ pr_debug(" parent is %pOF, iommu_table: 0x%p\n",
+ pdn, ppci->table_group);
+
+ if (!ppci->table_group) {
+ ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
+ tbl = ppci->table_group->tables[0];
+ iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
+ ppci->table_group, dma_window);
+ tbl->it_ops = &iommu_table_lpar_multi_ops;
+ iommu_init_table(tbl, ppci->phb->node);
+ iommu_register_group(ppci->table_group,
+ pci_domain_nr(bus), 0);
+ pr_debug(" created table: %p\n", ppci->table_group);
+ }
+}
+
+
+static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
+{
+ struct device_node *dn;
+ struct iommu_table *tbl;
+
+ pr_debug("pci_dma_dev_setup_pSeries: %s\n", pci_name(dev));
+
+ dn = dev->dev.of_node;
+
+ /* If we're the direct child of a root bus, then we need to allocate
+	 * an iommu table ourselves. The bus setup code should have set up
+ * the window sizes already.
+ */
+ if (!dev->bus->self) {
+ struct pci_controller *phb = PCI_DN(dn)->phb;
+
+ pr_debug(" --> first child, no bridge. Allocating iommu table.\n");
+ PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node);
+ tbl = PCI_DN(dn)->table_group->tables[0];
+ iommu_table_setparms(phb, dn, tbl);
+ tbl->it_ops = &iommu_table_pseries_ops;
+ iommu_init_table(tbl, phb->node);
+ iommu_register_group(PCI_DN(dn)->table_group,
+ pci_domain_nr(phb->bus), 0);
+ set_iommu_table_base(&dev->dev, tbl);
+ iommu_add_device(&dev->dev);
+ return;
+ }
+
+ /* If this device is further down the bus tree, search upwards until
+ * an already allocated iommu table is found and use that.
+ */
+
+ while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL)
+ dn = dn->parent;
+
+ if (dn && PCI_DN(dn)) {
+ set_iommu_table_base(&dev->dev,
+ PCI_DN(dn)->table_group->tables[0]);
+ iommu_add_device(&dev->dev);
+ } else
+ printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
+ pci_name(dev));
+}
+
+static int __read_mostly disable_ddw;
+
+static int __init disable_ddw_setup(char *str)
+{
+ disable_ddw = 1;
+ printk(KERN_INFO "ppc iommu: disabling ddw.\n");
+
+ return 0;
+}
+
+early_param("disable_ddw", disable_ddw_setup);
+
+static void remove_ddw(struct device_node *np, bool remove_prop)
+{
+ struct dynamic_dma_window_prop *dwp;
+ struct property *win64;
+ u32 ddw_avail[3];
+ u64 liobn;
+ int ret = 0;
+
+ ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
+ &ddw_avail[0], 3);
+
+ win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
+ if (!win64)
+ return;
+
+ if (ret || win64->length < sizeof(*dwp))
+ goto delprop;
+
+ dwp = win64->value;
+ liobn = (u64)be32_to_cpu(dwp->liobn);
+
+ /* clear the whole window, note the arg is in kernel pages */
+ ret = tce_clearrange_multi_pSeriesLP(0,
+ 1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);
+ if (ret)
+ pr_warn("%pOF failed to clear tces in window.\n",
+ np);
+ else
+ pr_debug("%pOF successfully cleared tces in window.\n",
+ np);
+
+ ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn);
+ if (ret)
+ pr_warn("%pOF: failed to remove direct window: rtas returned "
+ "%d to ibm,remove-pe-dma-window(%x) %llx\n",
+ np, ret, ddw_avail[2], liobn);
+ else
+ pr_debug("%pOF: successfully removed direct window: rtas returned "
+ "%d to ibm,remove-pe-dma-window(%x) %llx\n",
+ np, ret, ddw_avail[2], liobn);
+
+delprop:
+ if (remove_prop)
+ ret = of_remove_property(np, win64);
+ if (ret)
+ pr_warn("%pOF: failed to remove direct window property: %d\n",
+ np, ret);
+}
+
+static u64 find_existing_ddw(struct device_node *pdn)
+{
+ struct direct_window *window;
+ const struct dynamic_dma_window_prop *direct64;
+ u64 dma_addr = 0;
+
+ spin_lock(&direct_window_list_lock);
+ /* check if we already created a window and dupe that config if so */
+ list_for_each_entry(window, &direct_window_list, list) {
+ if (window->device == pdn) {
+ direct64 = window->prop;
+ dma_addr = be64_to_cpu(direct64->dma_base);
+ break;
+ }
+ }
+ spin_unlock(&direct_window_list_lock);
+
+ return dma_addr;
+}
+
+static int find_existing_ddw_windows(void)
+{
+ int len;
+ struct device_node *pdn;
+ struct direct_window *window;
+ const struct dynamic_dma_window_prop *direct64;
+
+ if (!firmware_has_feature(FW_FEATURE_LPAR))
+ return 0;
+
+ for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
+ direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
+ if (!direct64)
+ continue;
+
+ window = kzalloc(sizeof(*window), GFP_KERNEL);
+ if (!window || len < sizeof(struct dynamic_dma_window_prop)) {
+ kfree(window);
+ remove_ddw(pdn, true);
+ continue;
+ }
+
+ window->device = pdn;
+ window->prop = direct64;
+ spin_lock(&direct_window_list_lock);
+ list_add(&window->list, &direct_window_list);
+ spin_unlock(&direct_window_list_lock);
+ }
+
+ return 0;
+}
+machine_arch_initcall(pseries, find_existing_ddw_windows);
+
+static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
+ struct ddw_query_response *query)
+{
+ struct device_node *dn;
+ struct pci_dn *pdn;
+ u32 cfg_addr;
+ u64 buid;
+ int ret;
+
+ /*
+ * Get the config address and phb buid of the PE window.
+ * Rely on eeh to retrieve this for us.
+ * Retrieve them from the pci device, not the node with the
+ * dma-window property
+ */
+ dn = pci_device_to_OF_node(dev);
+ pdn = PCI_DN(dn);
+ buid = pdn->phb->buid;
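+	/* RTAS config address: bus number in bits 23:16, devfn in bits 15:8 */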
+ cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
+
+ ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query,
+ cfg_addr, BUID_HI(buid), BUID_LO(buid));
+ dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
+ " returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid),
+ BUID_LO(buid), ret);
+ return ret;
+}
+
+static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
+ struct ddw_create_response *create, int page_shift,
+ int window_shift)
+{
+ struct device_node *dn;
+ struct pci_dn *pdn;
+ u32 cfg_addr;
+ u64 buid;
+ int ret;
+
+ /*
+ * Get the config address and phb buid of the PE window.
+ * Rely on eeh to retrieve this for us.
+ * Retrieve them from the pci device, not the node with the
+ * dma-window property
+ */
+ dn = pci_device_to_OF_node(dev);
+ pdn = PCI_DN(dn);
+ buid = pdn->phb->buid;
+ cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
+
+ do {
+ /* extra outputs are LIOBN and dma-addr (hi, lo) */
+ ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create,
+ cfg_addr, BUID_HI(buid), BUID_LO(buid),
+ page_shift, window_shift);
+ } while (rtas_busy_delay(ret));
+ dev_info(&dev->dev,
+ "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
+ "(liobn = 0x%x starting addr = %x %x)\n", ddw_avail[1],
+ cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift,
+ window_shift, ret, create->liobn, create->addr_hi, create->addr_lo);
+
+ return ret;
+}
+
+struct failed_ddw_pdn {
+ struct device_node *pdn;
+ struct list_head list;
+};
+
+static LIST_HEAD(failed_ddw_pdn_list);
+
+/*
+ * If the PE supports dynamic dma windows, and there is space for a table
+ * that can map all pages in a linear offset, then set up such a table,
+ * and record the dma-offset in the struct device.
+ *
+ * dev: the pci device we are checking
+ * pdn: the parent pe node with the ibm,dma-window property
+ * Future: also check if we can remap the base window for our base page size
+ *
+ * returns the dma offset for use by dma_set_mask
+ */
+static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
+{
+ int len, ret;
+ struct ddw_query_response query;
+ struct ddw_create_response create;
+ int page_shift;
+ u64 dma_addr, max_addr;
+ struct device_node *dn;
+ u32 ddw_avail[3];
+ struct direct_window *window;
+ struct property *win64;
+ struct dynamic_dma_window_prop *ddwprop;
+ struct failed_ddw_pdn *fpdn;
+
+ mutex_lock(&direct_window_init_mutex);
+
+ dma_addr = find_existing_ddw(pdn);
+ if (dma_addr != 0)
+ goto out_unlock;
+
+ /*
+ * If we already went through this for a previous function of
+ * the same device and failed, we don't want to muck with the
+ * DMA window again, as it will race with in-flight operations
+ * and can lead to EEHs. The above mutex protects access to the
+ * list.
+ */
+ list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) {
+ if (fpdn->pdn == pdn)
+ goto out_unlock;
+ }
+
+ /*
+ * the ibm,ddw-applicable property holds the tokens for:
+ * ibm,query-pe-dma-window
+ * ibm,create-pe-dma-window
+ * ibm,remove-pe-dma-window
+ * for the given node in that order.
+ * the property is actually in the parent, not the PE
+ */
+ ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",
+ &ddw_avail[0], 3);
+ if (ret)
+ goto out_failed;
+
+ /*
+ * Query if there is a second window of size to map the
+ * whole partition. Query returns number of windows, largest
+ * block assigned to PE (partition endpoint), and two bitmasks
+ * of page sizes: supported and supported for migrate-dma.
+ */
+ dn = pci_device_to_OF_node(dev);
+ ret = query_ddw(dev, ddw_avail, &query);
+ if (ret != 0)
+ goto out_failed;
+
+ if (query.windows_available == 0) {
+ /*
+ * no additional windows are available for this device.
+ * We might be able to reallocate the existing window,
+ * trading in for a larger page size.
+ */
+ dev_dbg(&dev->dev, "no free dynamic windows");
+ goto out_failed;
+ }
+ if (query.page_size & 4) {
+ page_shift = 24; /* 16MB */
+ } else if (query.page_size & 2) {
+ page_shift = 16; /* 64kB */
+ } else if (query.page_size & 1) {
+ page_shift = 12; /* 4kB */
+ } else {
+ dev_dbg(&dev->dev, "no supported direct page size in mask %x",
+ query.page_size);
+ goto out_failed;
+ }
+ /* verify the window * number of ptes will map the partition */
+ /* check largest block * page size > max memory hotplug addr */
+ max_addr = memory_hotplug_max();
+ if (query.largest_available_block < (max_addr >> page_shift)) {
+ dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u "
+ "%llu-sized pages\n", max_addr, query.largest_available_block,
+ 1ULL << page_shift);
+ goto out_failed;
+ }
+ len = order_base_2(max_addr);
+ win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
+ if (!win64) {
+ dev_info(&dev->dev,
+ "couldn't allocate property for 64bit dma window\n");
+ goto out_failed;
+ }
+ win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
+ win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
+ win64->length = sizeof(*ddwprop);
+ if (!win64->name || !win64->value) {
+ dev_info(&dev->dev,
+ "couldn't allocate property name and value\n");
+ goto out_free_prop;
+ }
+
+ ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
+ if (ret != 0)
+ goto out_free_prop;
+
+ ddwprop->liobn = cpu_to_be32(create.liobn);
+ ddwprop->dma_base = cpu_to_be64(((u64)create.addr_hi << 32) |
+ create.addr_lo);
+ ddwprop->tce_shift = cpu_to_be32(page_shift);
+ ddwprop->window_shift = cpu_to_be32(len);
+
+ dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
+ create.liobn, dn);
+
+ window = kzalloc(sizeof(*window), GFP_KERNEL);
+ if (!window)
+ goto out_clear_window;
+
+ ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
+ win64->value, tce_setrange_multi_pSeriesLP_walk);
+ if (ret) {
+ dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n",
+ dn, ret);
+ goto out_free_window;
+ }
+
+ ret = of_add_property(pdn, win64);
+ if (ret) {
+ dev_err(&dev->dev, "unable to add dma window property for %pOF: %d",
+ pdn, ret);
+ goto out_free_window;
+ }
+
+ window->device = pdn;
+ window->prop = ddwprop;
+ spin_lock(&direct_window_list_lock);
+ list_add(&window->list, &direct_window_list);
+ spin_unlock(&direct_window_list_lock);
+
+ dma_addr = be64_to_cpu(ddwprop->dma_base);
+ goto out_unlock;
+
+out_free_window:
+ kfree(window);
+
+out_clear_window:
+ remove_ddw(pdn, true);
+
+out_free_prop:
+ kfree(win64->name);
+ kfree(win64->value);
+ kfree(win64);
+
+out_failed:
+
+ fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
+ if (!fpdn)
+ goto out_unlock;
+ fpdn->pdn = pdn;
+ list_add(&fpdn->list, &failed_ddw_pdn_list);
+
+out_unlock:
+ mutex_unlock(&direct_window_init_mutex);
+ return dma_addr;
+}
+
+static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
+{
+ struct device_node *pdn, *dn;
+ struct iommu_table *tbl;
+ const __be32 *dma_window = NULL;
+ struct pci_dn *pci;
+
+ pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));
+
+ /* dev setup for LPAR is a little tricky, since the device tree might
+ * contain the dma-window properties per-device and not necessarily
+ * for the bus. So we need to search upwards in the tree until we
+ * either hit a dma-window property, OR find a parent with a table
+ * already allocated.
+ */
+ dn = pci_device_to_OF_node(dev);
+ pr_debug(" node is %pOF\n", dn);
+
+ for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
+ pdn = pdn->parent) {
+ dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+ if (dma_window)
+ break;
+ }
+
+ if (!pdn || !PCI_DN(pdn)) {
+ printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
+ "no DMA window found for pci dev=%s dn=%pOF\n",
+ pci_name(dev), dn);
+ return;
+ }
+ pr_debug(" parent is %pOF\n", pdn);
+
+ pci = PCI_DN(pdn);
+ if (!pci->table_group) {
+ pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
+ tbl = pci->table_group->tables[0];
+ iommu_table_setparms_lpar(pci->phb, pdn, tbl,
+ pci->table_group, dma_window);
+ tbl->it_ops = &iommu_table_lpar_multi_ops;
+ iommu_init_table(tbl, pci->phb->node);
+ iommu_register_group(pci->table_group,
+ pci_domain_nr(pci->phb->bus), 0);
+ pr_debug(" created table: %p\n", pci->table_group);
+ } else {
+ pr_debug(" found DMA window, table: %p\n", pci->table_group);
+ }
+
+ set_iommu_table_base(&dev->dev, pci->table_group->tables[0]);
+ iommu_add_device(&dev->dev);
+}
+
+static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
+{
+ bool ddw_enabled = false;
+ struct device_node *pdn, *dn;
+ struct pci_dev *pdev;
+ const __be32 *dma_window = NULL;
+ u64 dma_offset;
+
+ if (!dev->dma_mask)
+ return -EIO;
+
+ if (!dev_is_pci(dev))
+ goto check_mask;
+
+ pdev = to_pci_dev(dev);
+
+ /* only attempt to use a new window if 64-bit DMA is requested */
+ if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) {
+ dn = pci_device_to_OF_node(pdev);
+ dev_dbg(dev, "node is %pOF\n", dn);
+
+ /*
+ * the device tree might contain the dma-window properties
+ * per-device and not necessarily for the bus. So we need to
+ * search upwards in the tree until we either hit a dma-window
+ * property, OR find a parent with a table already allocated.
+ */
+ for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
+ pdn = pdn->parent) {
+ dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+ if (dma_window)
+ break;
+ }
+ if (pdn && PCI_DN(pdn)) {
+ dma_offset = enable_ddw(pdev, pdn);
+ if (dma_offset != 0) {
+ dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset);
+ set_dma_offset(dev, dma_offset);
+ set_dma_ops(dev, &dma_nommu_ops);
+ ddw_enabled = true;
+ }
+ }
+ }
+
+ /* fall back on iommu ops */
+ if (!ddw_enabled && get_dma_ops(dev) != &dma_iommu_ops) {
+ dev_info(dev, "Restoring 32-bit DMA via iommu\n");
+ set_dma_ops(dev, &dma_iommu_ops);
+ }
+
+check_mask:
+ if (!dma_supported(dev, dma_mask))
+ return -EIO;
+
+ *dev->dma_mask = dma_mask;
+ return 0;
+}
+
+static u64 dma_get_required_mask_pSeriesLP(struct device *dev)
+{
+ if (!dev->dma_mask)
+ return 0;
+
+ if (!disable_ddw && dev_is_pci(dev)) {
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct device_node *dn;
+
+ dn = pci_device_to_OF_node(pdev);
+
+ /* search upwards for ibm,dma-window */
+ for (; dn && PCI_DN(dn) && !PCI_DN(dn)->table_group;
+ dn = dn->parent)
+ if (of_get_property(dn, "ibm,dma-window", NULL))
+ break;
+ /* if there is a ibm,ddw-applicable property require 64 bits */
+ if (dn && PCI_DN(dn) &&
+ of_get_property(dn, "ibm,ddw-applicable", NULL))
+ return DMA_BIT_MASK(64);
+ }
+
+ return dma_iommu_ops.get_required_mask(dev);
+}
+
+static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
+ void *data)
+{
+ struct direct_window *window;
+ struct memory_notify *arg = data;
+ int ret = 0;
+
+ switch (action) {
+ case MEM_GOING_ONLINE:
+ spin_lock(&direct_window_list_lock);
+ list_for_each_entry(window, &direct_window_list, list) {
+ ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
+ arg->nr_pages, window->prop);
+ /* XXX log error */
+ }
+ spin_unlock(&direct_window_list_lock);
+ break;
+ case MEM_CANCEL_ONLINE:
+ case MEM_OFFLINE:
+ spin_lock(&direct_window_list_lock);
+ list_for_each_entry(window, &direct_window_list, list) {
+ ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
+ arg->nr_pages, window->prop);
+ /* XXX log error */
+ }
+ spin_unlock(&direct_window_list_lock);
+ break;
+ default:
+ break;
+ }
+ if (ret && action != MEM_CANCEL_ONLINE)
+ return NOTIFY_BAD;
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block iommu_mem_nb = {
+ .notifier_call = iommu_mem_notifier,
+};
+
+static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *data)
+{
+ int err = NOTIFY_OK;
+ struct of_reconfig_data *rd = data;
+ struct device_node *np = rd->dn;
+ struct pci_dn *pci = PCI_DN(np);
+ struct direct_window *window;
+
+ switch (action) {
+ case OF_RECONFIG_DETACH_NODE:
+ /*
+ * Removing the property will invoke the reconfig
+		 * notifier again, which causes a deadlock on the
+ * read-write semaphore of the notifier chain. So
+ * we have to remove the property when releasing
+ * the device node.
+ */
+ remove_ddw(np, false);
+ if (pci && pci->table_group)
+ iommu_pseries_free_group(pci->table_group,
+ np->full_name);
+
+ spin_lock(&direct_window_list_lock);
+ list_for_each_entry(window, &direct_window_list, list) {
+ if (window->device == np) {
+ list_del(&window->list);
+ kfree(window);
+ break;
+ }
+ }
+ spin_unlock(&direct_window_list_lock);
+ break;
+ default:
+ err = NOTIFY_DONE;
+ break;
+ }
+ return err;
+}
+
+static struct notifier_block iommu_reconfig_nb = {
+ .notifier_call = iommu_reconfig_notifier,
+};
+
+/* These are called very early. */
+void iommu_init_early_pSeries(void)
+{
+ if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL))
+ return;
+
+ if (firmware_has_feature(FW_FEATURE_LPAR)) {
+ pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
+ pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
+ ppc_md.dma_set_mask = dma_set_mask_pSeriesLP;
+ ppc_md.dma_get_required_mask = dma_get_required_mask_pSeriesLP;
+ } else {
+ pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries;
+ pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries;
+ }
+
+
+ of_reconfig_notifier_register(&iommu_reconfig_nb);
+ register_memory_notifier(&iommu_mem_nb);
+
+ set_pci_dma_ops(&dma_iommu_ops);
+}
+
+static int __init disable_multitce(char *str)
+{
+ if (strcmp(str, "off") == 0 &&
+ firmware_has_feature(FW_FEATURE_LPAR) &&
+ firmware_has_feature(FW_FEATURE_MULTITCE)) {
+ printk(KERN_INFO "Disabling MULTITCE firmware feature\n");
+ powerpc_firmware_features &= ~FW_FEATURE_MULTITCE;
+ }
+ return 1;
+}
+
+__setup("multitce=", disable_multitce);
+
+machine_subsys_initcall_sync(pseries, tce_iommu_bus_notifier_init);
diff --git a/arch/powerpc/platforms/pseries/kexec.c b/arch/powerpc/platforms/pseries/kexec.c
new file mode 100644
index 000000000..23f54223e
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/kexec.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2006 Michael Ellerman, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+
+#include <asm/machdep.h>
+#include <asm/page.h>
+#include <asm/firmware.h>
+#include <asm/kexec.h>
+#include <asm/xics.h>
+#include <asm/xive.h>
+#include <asm/smp.h>
+#include <asm/plpar_wrappers.h>
+
+#include "pseries.h"
+
+void pseries_kexec_cpu_down(int crash_shutdown, int secondary)
+{
+ /*
+ * Don't risk a hypervisor call if we're crashing
+ * XXX: Why? The hypervisor is not crashing. It might be better
+	 * to at least attempt to unregister to avoid the hypervisor stepping
+ * on our memory.
+ */
+ if (firmware_has_feature(FW_FEATURE_SPLPAR) && !crash_shutdown) {
+ int ret;
+ int cpu = smp_processor_id();
+ int hwcpu = hard_smp_processor_id();
+
+ if (get_lppaca()->dtl_enable_mask) {
+ ret = unregister_dtl(hwcpu);
+ if (ret) {
+ pr_err("WARNING: DTL deregistration for cpu "
+ "%d (hw %d) failed with %d\n",
+ cpu, hwcpu, ret);
+ }
+ }
+
+ ret = unregister_slb_shadow(hwcpu);
+ if (ret) {
+ pr_err("WARNING: SLB shadow buffer deregistration "
+ "for cpu %d (hw %d) failed with %d\n",
+ cpu, hwcpu, ret);
+ }
+
+ ret = unregister_vpa(hwcpu);
+ if (ret) {
+ pr_err("WARNING: VPA deregistration for cpu %d "
+ "(hw %d) failed with %d\n", cpu, hwcpu, ret);
+ }
+ }
+
+ if (xive_enabled()) {
+ xive_teardown_cpu();
+
+ if (!secondary)
+ xive_shutdown();
+ } else
+ xics_kexec_teardown_cpu(secondary);
+}
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
new file mode 100644
index 000000000..d660a9061
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -0,0 +1,1088 @@
+/*
+ * pSeries_lpar.c
+ * Copyright (C) 2001 Todd Inglett, IBM Corporation
+ *
+ * pSeries LPAR support.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* Enables debugging of low-level hash table routines - careful! */
+#undef DEBUG
+#define pr_fmt(fmt) "lpar: " fmt
+
+#include <linux/kernel.h>
+#include <linux/dma-mapping.h>
+#include <linux/console.h>
+#include <linux/export.h>
+#include <linux/jump_label.h>
+#include <linux/delay.h>
+#include <linux/stop_machine.h>
+#include <asm/processor.h>
+#include <asm/mmu.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/machdep.h>
+#include <asm/mmu_context.h>
+#include <asm/iommu.h>
+#include <asm/tlb.h>
+#include <asm/prom.h>
+#include <asm/cputable.h>
+#include <asm/udbg.h>
+#include <asm/smp.h>
+#include <asm/trace.h>
+#include <asm/firmware.h>
+#include <asm/plpar_wrappers.h>
+#include <asm/kexec.h>
+#include <asm/fadump.h>
+#include <asm/asm-prototypes.h>
+#include <asm/debugfs.h>
+
+#include "pseries.h"
+
+/* Flag bits for H_BULK_REMOVE */
+#define HBR_REQUEST 0x4000000000000000UL
+#define HBR_RESPONSE 0x8000000000000000UL
+#define HBR_END 0xc000000000000000UL
+#define HBR_AVPN 0x0200000000000000UL
+#define HBR_ANDCOND 0x0100000000000000UL
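+
+/*
+ * H_BULK_REMOVE consumes (request, AVPN) parameter pairs, so each
+ * plpar_hcall9() invocation below removes up to four translations per
+ * hypervisor call (pix == 8 means four pairs are queued).
+ */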
+
+
+/* in hvCall.S */
+EXPORT_SYMBOL(plpar_hcall);
+EXPORT_SYMBOL(plpar_hcall9);
+EXPORT_SYMBOL(plpar_hcall_norets);
+
+void vpa_init(int cpu)
+{
+ int hwcpu = get_hard_smp_processor_id(cpu);
+ unsigned long addr;
+ long ret;
+ struct paca_struct *pp;
+ struct dtl_entry *dtl;
+
+ /*
+ * The spec says it "may be problematic" if CPU x registers the VPA of
+ * CPU y. We should never do that, but wail if we ever do.
+ */
+ WARN_ON(cpu != smp_processor_id());
+
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ lppaca_of(cpu).vmxregs_in_use = 1;
+
+ if (cpu_has_feature(CPU_FTR_ARCH_207S))
+ lppaca_of(cpu).ebb_regs_in_use = 1;
+
+ addr = __pa(&lppaca_of(cpu));
+ ret = register_vpa(hwcpu, addr);
+
+ if (ret) {
+ pr_err("WARNING: VPA registration for cpu %d (hw %d) of area "
+ "%lx failed with %ld\n", cpu, hwcpu, addr, ret);
+ return;
+ }
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ /*
+ * PAPR says this feature is SLB-Buffer but firmware never
+	 * reports that. All SPLPARs support the SLB shadow buffer.
+ */
+ if (!radix_enabled() && firmware_has_feature(FW_FEATURE_SPLPAR)) {
+ addr = __pa(paca_ptrs[cpu]->slb_shadow_ptr);
+ ret = register_slb_shadow(hwcpu, addr);
+ if (ret)
+ pr_err("WARNING: SLB shadow buffer registration for "
+ "cpu %d (hw %d) of area %lx failed with %ld\n",
+ cpu, hwcpu, addr, ret);
+ }
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+ /*
+ * Register dispatch trace log, if one has been allocated.
+ */
+ pp = paca_ptrs[cpu];
+ dtl = pp->dispatch_log;
+ if (dtl) {
+ pp->dtl_ridx = 0;
+ pp->dtl_curr = dtl;
+ lppaca_of(cpu).dtl_idx = 0;
+
+ /* hypervisor reads buffer length from this field */
+ dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES);
+ ret = register_dtl(hwcpu, __pa(dtl));
+ if (ret)
+ pr_err("WARNING: DTL registration of cpu %d (hw %d) "
+ "failed with %ld\n", smp_processor_id(),
+ hwcpu, ret);
+ lppaca_of(cpu).dtl_enable_mask = 2;
+ }
+}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+
+static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
+ unsigned long vpn, unsigned long pa,
+ unsigned long rflags, unsigned long vflags,
+ int psize, int apsize, int ssize)
+{
+ unsigned long lpar_rc;
+ unsigned long flags;
+ unsigned long slot;
+ unsigned long hpte_v, hpte_r;
+
+ if (!(vflags & HPTE_V_BOLTED))
+ pr_devel("hpte_insert(group=%lx, vpn=%016lx, "
+ "pa=%016lx, rflags=%lx, vflags=%lx, psize=%d)\n",
+ hpte_group, vpn, pa, rflags, vflags, psize);
+
+ hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
+ hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
+
+ if (!(vflags & HPTE_V_BOLTED))
+ pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
+
+ /* Now fill in the actual HPTE */
+ /* Set CEC cookie to 0 */
+ /* Zero page = 0 */
+ /* I-cache Invalidate = 0 */
+ /* I-cache synchronize = 0 */
+ /* Exact = 0 */
+ flags = 0;
+
+ if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N))
+ flags |= H_COALESCE_CAND;
+
+ lpar_rc = plpar_pte_enter(flags, hpte_group, hpte_v, hpte_r, &slot);
+ if (unlikely(lpar_rc == H_PTEG_FULL)) {
+ pr_devel("Hash table group is full\n");
+ return -1;
+ }
+
+ /*
+ * Since we try and ioremap PHBs we don't own, the pte insert
+ * will fail. However we must catch the failure in hash_page
+ * or we will loop forever, so return -2 in this case.
+ */
+ if (unlikely(lpar_rc != H_SUCCESS)) {
+ pr_err("Failed hash pte insert with error %ld\n", lpar_rc);
+ return -2;
+ }
+ if (!(vflags & HPTE_V_BOLTED))
+ pr_devel(" -> slot: %lu\n", slot & 7);
+
+ /* Because of iSeries, we have to pass down the secondary
+ * bucket bit here as well
+ */
+ return (slot & 7) | (!!(vflags & HPTE_V_SECONDARY) << 3);
+}
+
+static DEFINE_SPINLOCK(pSeries_lpar_tlbie_lock);
+
+static long pSeries_lpar_hpte_remove(unsigned long hpte_group)
+{
+ unsigned long slot_offset;
+ unsigned long lpar_rc;
+ int i;
+ unsigned long dummy1, dummy2;
+
+ /* pick a random slot to start at */
+ slot_offset = mftb() & 0x7;
+
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+
+ /* don't remove a bolted entry */
+ lpar_rc = plpar_pte_remove(H_ANDCOND, hpte_group + slot_offset,
+ (0x1UL << 4), &dummy1, &dummy2);
+ if (lpar_rc == H_SUCCESS)
+ return i;
+
+ /*
+ * The test for adjunct partition is performed before the
+ * ANDCOND test. H_RESOURCE may be returned, so we need to
+ * check for that as well.
+ */
+ BUG_ON(lpar_rc != H_NOT_FOUND && lpar_rc != H_RESOURCE);
+
+ slot_offset++;
+ slot_offset &= 0x7;
+ }
+
+ return -1;
+}
+
+static void manual_hpte_clear_all(void)
+{
+ unsigned long size_bytes = 1UL << ppc64_pft_size;
+ unsigned long hpte_count = size_bytes >> 4;
+ struct {
+ unsigned long pteh;
+ unsigned long ptel;
+ } ptes[4];
+ long lpar_rc;
+ unsigned long i, j;
+
+ /* Read in batches of 4,
+	 * invalidate only valid entries not in the VRMA.
+	 * hpte_count will be a multiple of 4.
+ */
+ for (i = 0; i < hpte_count; i += 4) {
+ lpar_rc = plpar_pte_read_4_raw(0, i, (void *)ptes);
+ if (lpar_rc != H_SUCCESS) {
+ pr_info("Failed to read hash page table at %ld err %ld\n",
+ i, lpar_rc);
+ continue;
+ }
+		for (j = 0; j < 4; j++) {
+ if ((ptes[j].pteh & HPTE_V_VRMA_MASK) ==
+ HPTE_V_VRMA_MASK)
+ continue;
+ if (ptes[j].pteh & HPTE_V_VALID)
+ plpar_pte_remove_raw(0, i + j, 0,
+ &(ptes[j].pteh), &(ptes[j].ptel));
+ }
+ }
+}
+
+static int hcall_hpte_clear_all(void)
+{
+ int rc;
+
+ do {
+ rc = plpar_hcall_norets(H_CLEAR_HPT);
+ } while (rc == H_CONTINUE);
+
+ return rc;
+}
+
+static void pseries_hpte_clear_all(void)
+{
+ int rc;
+
+ rc = hcall_hpte_clear_all();
+ if (rc != H_SUCCESS)
+ manual_hpte_clear_all();
+
+#ifdef __LITTLE_ENDIAN__
+ /*
+ * Reset exceptions to big endian.
+ *
+ * FIXME this is a hack for kexec, we need to reset the exception
+ * endian before starting the new kernel and this is a convenient place
+ * to do it.
+ *
+ * This is also called on boot when a fadump happens. In that case we
+ * must not change the exception endian mode.
+ */
+ if (firmware_has_feature(FW_FEATURE_SET_MODE) && !is_fadump_active())
+ pseries_big_endian_exceptions();
+#endif
+}
+
+/*
+ * NOTE: for updatepp ops we are fortunate that the linux "newpp" bits and
+ * the low 3 bits of flags happen to line up. So no transform is needed.
+ * We can probably optimize here and assume the high bits of newpp are
+ * already zero. For now I am paranoid.
+ */
+static long pSeries_lpar_hpte_updatepp(unsigned long slot,
+ unsigned long newpp,
+ unsigned long vpn,
+ int psize, int apsize,
+ int ssize, unsigned long inv_flags)
+{
+ unsigned long lpar_rc;
+ unsigned long flags;
+ unsigned long want_v;
+
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
+
+ flags = (newpp & 7) | H_AVPN;
+ if (mmu_has_feature(MMU_FTR_KERNEL_RO))
+ /* Move pp0 into bit 8 (IBM 55) */
+ flags |= (newpp & HPTE_R_PP0) >> 55;
+
+ pr_devel(" update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...",
+ want_v, slot, flags, psize);
+
+ lpar_rc = plpar_pte_protect(flags, slot, want_v);
+
+ if (lpar_rc == H_NOT_FOUND) {
+ pr_devel("not found !\n");
+ return -1;
+ }
+
+ pr_devel("ok\n");
+
+ BUG_ON(lpar_rc != H_SUCCESS);
+
+ return 0;
+}
+
+static long __pSeries_lpar_hpte_find(unsigned long want_v, unsigned long hpte_group)
+{
+ long lpar_rc;
+ unsigned long i, j;
+ struct {
+ unsigned long pteh;
+ unsigned long ptel;
+ } ptes[4];
+
+ for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) {
+
+ lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes);
+ if (lpar_rc != H_SUCCESS) {
+ pr_info("Failed to read hash page table at %ld err %ld\n",
+ hpte_group, lpar_rc);
+ continue;
+ }
+
+ for (j = 0; j < 4; j++) {
+ if (HPTE_V_COMPARE(ptes[j].pteh, want_v) &&
+ (ptes[j].pteh & HPTE_V_VALID))
+ return i + j;
+ }
+ }
+
+ return -1;
+}
+
+static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize)
+{
+ long slot;
+ unsigned long hash;
+ unsigned long want_v;
+ unsigned long hpte_group;
+
+ hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
+
+ /* Bolted entries are always in the primary group */
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
+ if (slot < 0)
+ return -1;
+ return hpte_group + slot;
+}
+
+static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
+ unsigned long ea,
+ int psize, int ssize)
+{
+ unsigned long vpn;
+ unsigned long lpar_rc, slot, vsid, flags;
+
+ vsid = get_kernel_vsid(ea, ssize);
+ vpn = hpt_vpn(ea, vsid, ssize);
+
+ slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
+ BUG_ON(slot == -1);
+
+ flags = newpp & 7;
+ if (mmu_has_feature(MMU_FTR_KERNEL_RO))
+ /* Move pp0 into bit 8 (IBM 55) */
+ flags |= (newpp & HPTE_R_PP0) >> 55;
+
+ lpar_rc = plpar_pte_protect(flags, slot, 0);
+
+ BUG_ON(lpar_rc != H_SUCCESS);
+}
+
+static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
+ int psize, int apsize,
+ int ssize, int local)
+{
+ unsigned long want_v;
+ unsigned long lpar_rc;
+ unsigned long dummy1, dummy2;
+
+ pr_devel(" inval : slot=%lx, vpn=%016lx, psize: %d, local: %d\n",
+ slot, vpn, psize, local);
+
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
+ lpar_rc = plpar_pte_remove(H_AVPN, slot, want_v, &dummy1, &dummy2);
+ if (lpar_rc == H_NOT_FOUND)
+ return;
+
+ BUG_ON(lpar_rc != H_SUCCESS);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need
+ * to make sure that we avoid bouncing the hypervisor tlbie lock.
+ */
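+/* 12 translations = 3 H_BULK_REMOVE calls of four (request, AVPN) pairs each */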
+#define PPC64_HUGE_HPTE_BATCH 12
+
+static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
+ unsigned long *vpn, int count,
+ int psize, int ssize)
+{
+ unsigned long param[PLPAR_HCALL9_BUFSIZE];
+ int i = 0, pix = 0, rc;
+ unsigned long flags = 0;
+ int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+
+ if (lock_tlbie)
+ spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
+
+ for (i = 0; i < count; i++) {
+
+ if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
+ pSeries_lpar_hpte_invalidate(slot[i], vpn[i], psize, 0,
+ ssize, 0);
+ } else {
+ param[pix] = HBR_REQUEST | HBR_AVPN | slot[i];
+ param[pix+1] = hpte_encode_avpn(vpn[i], psize, ssize);
+ pix += 2;
+ if (pix == 8) {
+ rc = plpar_hcall9(H_BULK_REMOVE, param,
+ param[0], param[1], param[2],
+ param[3], param[4], param[5],
+ param[6], param[7]);
+ BUG_ON(rc != H_SUCCESS);
+ pix = 0;
+ }
+ }
+ }
+ if (pix) {
+ param[pix] = HBR_END;
+ rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
+ param[2], param[3], param[4], param[5],
+ param[6], param[7]);
+ BUG_ON(rc != H_SUCCESS);
+ }
+
+ if (lock_tlbie)
+ spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
+}
+
+static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
+ unsigned long addr,
+ unsigned char *hpte_slot_array,
+ int psize, int ssize, int local)
+{
+ int i, index = 0;
+ unsigned long s_addr = addr;
+ unsigned int max_hpte_count, valid;
+ unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
+ unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
+ unsigned long shift, hidx, vpn = 0, hash, slot;
+
+ shift = mmu_psize_defs[psize].shift;
+ max_hpte_count = 1U << (PMD_SHIFT - shift);
+
+ for (i = 0; i < max_hpte_count; i++) {
+ valid = hpte_valid(hpte_slot_array, i);
+ if (!valid)
+ continue;
+ hidx = hpte_hash_index(hpte_slot_array, i);
+
+ /* get the vpn */
+ addr = s_addr + (i * (1ul << shift));
+ vpn = hpt_vpn(addr, vsid, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+
+ slot_array[index] = slot;
+ vpn_array[index] = vpn;
+ if (index == PPC64_HUGE_HPTE_BATCH - 1) {
+ /*
+			 * Now do a bulk invalidate
+ */
+ __pSeries_lpar_hugepage_invalidate(slot_array,
+ vpn_array,
+ PPC64_HUGE_HPTE_BATCH,
+ psize, ssize);
+ index = 0;
+ } else
+ index++;
+ }
+ if (index)
+ __pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
+ index, psize, ssize);
+}
+#else
+static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
+ unsigned long addr,
+ unsigned char *hpte_slot_array,
+ int psize, int ssize, int local)
+{
+ WARN(1, "%s called without THP support\n", __func__);
+}
+#endif
+
+static int pSeries_lpar_hpte_removebolted(unsigned long ea,
+ int psize, int ssize)
+{
+ unsigned long vpn;
+ unsigned long slot, vsid;
+
+ vsid = get_kernel_vsid(ea, ssize);
+ vpn = hpt_vpn(ea, vsid, ssize);
+
+ slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
+ if (slot == -1)
+ return -ENOENT;
+
+ /*
+ * lpar doesn't use the passed actual page size
+ */
+ pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0);
+ return 0;
+}
+
+/*
+ * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
+ * lock.
+ */
+static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
+{
+ unsigned long vpn;
+ unsigned long i, pix, rc;
+ unsigned long flags = 0;
+ struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
+ int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+ unsigned long param[PLPAR_HCALL9_BUFSIZE];
+ unsigned long hash, index, shift, hidx, slot;
+ real_pte_t pte;
+ int psize, ssize;
+
+ if (lock_tlbie)
+ spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
+
+ psize = batch->psize;
+ ssize = batch->ssize;
+ pix = 0;
+ for (i = 0; i < number; i++) {
+ vpn = batch->vpn[i];
+ pte = batch->pte[i];
+ pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
+ hash = hpt_hash(vpn, shift, ssize);
+ hidx = __rpte_to_hidx(pte, index);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+ if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
+ /*
+ * lpar doesn't use the passed actual page size
+ */
+ pSeries_lpar_hpte_invalidate(slot, vpn, psize,
+ 0, ssize, local);
+ } else {
+ param[pix] = HBR_REQUEST | HBR_AVPN | slot;
+ param[pix+1] = hpte_encode_avpn(vpn, psize,
+ ssize);
+ pix += 2;
+ if (pix == 8) {
+ rc = plpar_hcall9(H_BULK_REMOVE, param,
+ param[0], param[1], param[2],
+ param[3], param[4], param[5],
+ param[6], param[7]);
+ BUG_ON(rc != H_SUCCESS);
+ pix = 0;
+ }
+ }
+ } pte_iterate_hashed_end();
+ }
+ if (pix) {
+ param[pix] = HBR_END;
+ rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
+ param[2], param[3], param[4], param[5],
+ param[6], param[7]);
+ BUG_ON(rc != H_SUCCESS);
+ }
+
+ if (lock_tlbie)
+ spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
+}
+
+static int __init disable_bulk_remove(char *str)
+{
+ if (strcmp(str, "off") == 0 &&
+ firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
+ pr_info("Disabling BULK_REMOVE firmware feature");
+ powerpc_firmware_features &= ~FW_FEATURE_BULK_REMOVE;
+ }
+ return 1;
+}
+
+__setup("bulk_remove=", disable_bulk_remove);
+
+#define HPT_RESIZE_TIMEOUT 10000 /* ms */
+
+struct hpt_resize_state {
+ unsigned long shift;
+ int commit_rc;
+};
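+
+/*
+ * 'shift' is log2 of the requested hash page table size in bytes, e.g.
+ * shift = 30 asks the hypervisor for a 1GB HPT; it matches the encoding
+ * of ppc64_pft_size updated in the commit step below.
+ */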
+
+static int pseries_lpar_resize_hpt_commit(void *data)
+{
+ struct hpt_resize_state *state = data;
+
+ state->commit_rc = plpar_resize_hpt_commit(0, state->shift);
+ if (state->commit_rc != H_SUCCESS)
+ return -EIO;
+
+ /* Hypervisor has transitioned the HTAB, update our globals */
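+	/* each HPTE group is 8 entries * 16 bytes = 128 bytes, hence the >> 7 */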
+ ppc64_pft_size = state->shift;
+ htab_size_bytes = 1UL << ppc64_pft_size;
+ htab_hash_mask = (htab_size_bytes >> 7) - 1;
+
+ return 0;
+}
+
+/*
+ * Must be called in process context. The caller must hold the
+ * cpus_lock.
+ */
+static int pseries_lpar_resize_hpt(unsigned long shift)
+{
+ struct hpt_resize_state state = {
+ .shift = shift,
+ .commit_rc = H_FUNCTION,
+ };
+ unsigned int delay, total_delay = 0;
+ int rc;
+ ktime_t t0, t1, t2;
+
+ might_sleep();
+
+ if (!firmware_has_feature(FW_FEATURE_HPT_RESIZE))
+ return -ENODEV;
+
+ pr_info("Attempting to resize HPT to shift %lu\n", shift);
+
+ t0 = ktime_get();
+
+ rc = plpar_resize_hpt_prepare(0, shift);
+ while (H_IS_LONG_BUSY(rc)) {
+ delay = get_longbusy_msecs(rc);
+ total_delay += delay;
+ if (total_delay > HPT_RESIZE_TIMEOUT) {
+ /* prepare with shift==0 cancels an in-progress resize */
+ rc = plpar_resize_hpt_prepare(0, 0);
+ if (rc != H_SUCCESS)
+ pr_warn("Unexpected error %d cancelling timed out HPT resize\n",
+ rc);
+ return -ETIMEDOUT;
+ }
+ msleep(delay);
+ rc = plpar_resize_hpt_prepare(0, shift);
+ }
+
+ switch (rc) {
+ case H_SUCCESS:
+ /* Continue on */
+ break;
+
+ case H_PARAMETER:
+ return -EINVAL;
+ case H_RESOURCE:
+ return -EPERM;
+ default:
+ pr_warn("Unexpected error %d from H_RESIZE_HPT_PREPARE\n", rc);
+ return -EIO;
+ }
+
+ t1 = ktime_get();
+
+ rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit,
+ &state, NULL);
+
+ t2 = ktime_get();
+
+ if (rc != 0) {
+ switch (state.commit_rc) {
+ case H_PTEG_FULL:
+ pr_warn("Hash collision while resizing HPT\n");
+ return -ENOSPC;
+
+ default:
+ pr_warn("Unexpected error %d from H_RESIZE_HPT_COMMIT\n",
+ state.commit_rc);
+ return -EIO;
+ }
+ }
+
+ pr_info("HPT resize to shift %lu complete (%lld ms / %lld ms)\n",
+ shift, (long long) ktime_ms_delta(t1, t0),
+ (long long) ktime_ms_delta(t2, t1));
+
+ return 0;
+}
+
+static int pseries_lpar_register_process_table(unsigned long base,
+ unsigned long page_size, unsigned long table_size)
+{
+ long rc;
+ unsigned long flags = 0;
+
+ if (table_size)
+ flags |= PROC_TABLE_NEW;
+ if (radix_enabled())
+ flags |= PROC_TABLE_RADIX | PROC_TABLE_GTSE;
+ else
+ flags |= PROC_TABLE_HPT_SLB;
+ for (;;) {
+ rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base,
+ page_size, table_size);
+ if (!H_IS_LONG_BUSY(rc))
+ break;
+ mdelay(get_longbusy_msecs(rc));
+ }
+ if (rc != H_SUCCESS) {
+ pr_err("Failed to register process table (rc=%ld)\n", rc);
+ BUG();
+ }
+ return rc;
+}
+
+void __init hpte_init_pseries(void)
+{
+ mmu_hash_ops.hpte_invalidate = pSeries_lpar_hpte_invalidate;
+ mmu_hash_ops.hpte_updatepp = pSeries_lpar_hpte_updatepp;
+ mmu_hash_ops.hpte_updateboltedpp = pSeries_lpar_hpte_updateboltedpp;
+ mmu_hash_ops.hpte_insert = pSeries_lpar_hpte_insert;
+ mmu_hash_ops.hpte_remove = pSeries_lpar_hpte_remove;
+ mmu_hash_ops.hpte_removebolted = pSeries_lpar_hpte_removebolted;
+ mmu_hash_ops.flush_hash_range = pSeries_lpar_flush_hash_range;
+ mmu_hash_ops.hpte_clear_all = pseries_hpte_clear_all;
+ mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
+ register_process_table = pseries_lpar_register_process_table;
+
+ if (firmware_has_feature(FW_FEATURE_HPT_RESIZE))
+ mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt;
+}
+
+void radix_init_pseries(void)
+{
+ pr_info("Using radix MMU under hypervisor\n");
+ register_process_table = pseries_lpar_register_process_table;
+}
+
+#ifdef CONFIG_PPC_SMLPAR
+#define CMO_FREE_HINT_DEFAULT 1
+static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT;
+
+static int __init cmo_free_hint(char *str)
+{
+ char *parm;
+ parm = strstrip(str);
+
+ if (strcasecmp(parm, "no") == 0 || strcasecmp(parm, "off") == 0) {
+ pr_info("%s: CMO free page hinting is not active.\n", __func__);
+ cmo_free_hint_flag = 0;
+ return 1;
+ }
+
+ cmo_free_hint_flag = 1;
+ pr_info("%s: CMO free page hinting is active.\n", __func__);
+
+ if (strcasecmp(parm, "yes") == 0 || strcasecmp(parm, "on") == 0)
+ return 1;
+
+ return 0;
+}
+
+__setup("cmo_free_hint=", cmo_free_hint);
+
+static void pSeries_set_page_state(struct page *page, int order,
+ unsigned long state)
+{
+ int i, j;
+ unsigned long cmo_page_sz, addr;
+
+ cmo_page_sz = cmo_get_page_size();
+ addr = __pa((unsigned long)page_address(page));
+
+ for (i = 0; i < (1 << order); i++, addr += PAGE_SIZE) {
+ for (j = 0; j < PAGE_SIZE; j += cmo_page_sz)
+ plpar_hcall_norets(H_PAGE_INIT, state, addr + j, 0);
+ }
+}
+
+void arch_free_page(struct page *page, int order)
+{
+ if (radix_enabled())
+ return;
+ if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO))
+ return;
+
+ pSeries_set_page_state(page, order, H_PAGE_SET_UNUSED);
+}
+EXPORT_SYMBOL(arch_free_page);
+
+#endif /* CONFIG_PPC_SMLPAR */
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_TRACEPOINTS
+#ifdef CONFIG_JUMP_LABEL
+struct static_key hcall_tracepoint_key = STATIC_KEY_INIT;
+
+int hcall_tracepoint_regfunc(void)
+{
+ static_key_slow_inc(&hcall_tracepoint_key);
+ return 0;
+}
+
+void hcall_tracepoint_unregfunc(void)
+{
+ static_key_slow_dec(&hcall_tracepoint_key);
+}
+#else
+/*
+ * We optimise our hcall path by placing hcall_tracepoint_refcount
+ * directly in the TOC so we can check if the hcall tracepoints are
+ * enabled via a single load.
+ */
+
+/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
+extern long hcall_tracepoint_refcount;
+
+int hcall_tracepoint_regfunc(void)
+{
+ hcall_tracepoint_refcount++;
+ return 0;
+}
+
+void hcall_tracepoint_unregfunc(void)
+{
+ hcall_tracepoint_refcount--;
+}
+#endif
+
+/*
+ * Since the tracing code might execute hcalls we need to guard against
+ * recursion. One example of this is spinlocks calling H_YIELD on
+ * shared processor partitions.
+ */
+static DEFINE_PER_CPU(unsigned int, hcall_trace_depth);
+
+
+void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
+{
+ unsigned long flags;
+ unsigned int *depth;
+
+ /*
+ * We cannot call tracepoints inside RCU idle regions which
+ * means we must not trace H_CEDE.
+ */
+ if (opcode == H_CEDE)
+ return;
+
+ local_irq_save(flags);
+
+ depth = this_cpu_ptr(&hcall_trace_depth);
+
+ if (*depth)
+ goto out;
+
+ (*depth)++;
+ preempt_disable();
+ trace_hcall_entry(opcode, args);
+ (*depth)--;
+
+out:
+ local_irq_restore(flags);
+}
+
+void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf)
+{
+ unsigned long flags;
+ unsigned int *depth;
+
+ if (opcode == H_CEDE)
+ return;
+
+ local_irq_save(flags);
+
+ depth = this_cpu_ptr(&hcall_trace_depth);
+
+ if (*depth)
+ goto out;
+
+ (*depth)++;
+ trace_hcall_exit(opcode, retval, retbuf);
+ preempt_enable();
+ (*depth)--;
+
+out:
+ local_irq_restore(flags);
+}
+#endif
+
+/**
+ * h_get_mpp
+ * H_GET_MPP hcall returns info in 7 parms
+ */
+int h_get_mpp(struct hvcall_mpp_data *mpp_data)
+{
+ int rc;
+ unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+
+ rc = plpar_hcall9(H_GET_MPP, retbuf);
+
+ mpp_data->entitled_mem = retbuf[0];
+ mpp_data->mapped_mem = retbuf[1];
+
+ mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
+ mpp_data->pool_num = retbuf[2] & 0xffff;
+
+ mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
+ mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
+ mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffffUL;
+
+ mpp_data->pool_size = retbuf[4];
+ mpp_data->loan_request = retbuf[5];
+ mpp_data->backing_mem = retbuf[6];
+
+ return rc;
+}
+EXPORT_SYMBOL(h_get_mpp);
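+/*
+ * Layout implied by the unpacking above: retbuf[2] carries the group
+ * number in bits 16-31 and the pool number in bits 0-15, while retbuf[3]
+ * packs the memory weight in its most significant byte, the unallocated
+ * weight in the next byte and the unallocated entitlement in the low
+ * 48 bits.
+ */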
+
+int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data)
+{
+ int rc;
+ unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = { 0 };
+
+ rc = plpar_hcall9(H_GET_MPP_X, retbuf);
+
+ mpp_x_data->coalesced_bytes = retbuf[0];
+ mpp_x_data->pool_coalesced_bytes = retbuf[1];
+ mpp_x_data->pool_purr_cycles = retbuf[2];
+ mpp_x_data->pool_spurr_cycles = retbuf[3];
+
+ return rc;
+}
+
+static unsigned long vsid_unscramble(unsigned long vsid, int ssize)
+{
+ unsigned long protovsid;
+ unsigned long va_bits = VA_BITS;
+ unsigned long modinv, vsid_modulus;
+ unsigned long max_mod_inv, tmp_modinv;
+
+ if (!mmu_has_feature(MMU_FTR_68_BIT_VA))
+ va_bits = 65;
+
+ if (ssize == MMU_SEGSIZE_256M) {
+ modinv = VSID_MULINV_256M;
+ vsid_modulus = ((1UL << (va_bits - SID_SHIFT)) - 1);
+ } else {
+ modinv = VSID_MULINV_1T;
+ vsid_modulus = ((1UL << (va_bits - SID_SHIFT_1T)) - 1);
+ }
+
+ /*
+ * vsid outside our range.
+ */
+ if (vsid >= vsid_modulus)
+ return 0;
+
+ /*
+ * If modinv is the modular multiplicative inverse of (x % vsid_modulus)
+ * and vsid = (protovsid * x) % vsid_modulus, then we say:
+ * protovsid = (vsid * modinv) % vsid_modulus
+ */
+
+ /* Check if (vsid * modinv) overflows (63 bits) */
+ max_mod_inv = 0x7fffffffffffffffull / vsid;
+ if (modinv < max_mod_inv)
+ return (vsid * modinv) % vsid_modulus;
+
+ tmp_modinv = modinv/max_mod_inv;
+ modinv %= max_mod_inv;
+
+ protovsid = (((vsid * max_mod_inv) % vsid_modulus) * tmp_modinv) % vsid_modulus;
+ protovsid = (protovsid + vsid * modinv) % vsid_modulus;
+
+ return protovsid;
+}
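+/*
+ * The split multiplication above avoids 64-bit overflow. Writing the
+ * original inverse as tmp_modinv * max_mod_inv + modinv after the split,
+ * it relies on the identity
+ *   (vsid * inverse) % m ==
+ *     ((((vsid * max_mod_inv) % m) * tmp_modinv) % m + vsid * modinv) % m
+ * with max_mod_inv chosen so that vsid * max_mod_inv fits in 63 bits.
+ */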
+
+static int __init reserve_vrma_context_id(void)
+{
+ unsigned long protovsid;
+
+ /*
+ * Reserve context ids which map to reserved virtual addresses. For now
+ * we only reserve the context id which maps to the VRMA VSID. We ignore
+ * the addresses in "ibm,adjunct-virtual-addresses" because we don't
+ * enable adjunct support via the "ibm,client-architecture-support"
+ * interface.
+ */
+ protovsid = vsid_unscramble(VRMA_VSID, MMU_SEGSIZE_1T);
+ hash__reserve_context_id(protovsid >> ESID_BITS_1T);
+ return 0;
+}
+machine_device_initcall(pseries, reserve_vrma_context_id);
+
+#ifdef CONFIG_DEBUG_FS
+/* debugfs file interface for vpa data */
+static ssize_t vpa_file_read(struct file *filp, char __user *buf, size_t len,
+ loff_t *pos)
+{
+ int cpu = (long)filp->private_data;
+ struct lppaca *lppaca = &lppaca_of(cpu);
+
+ return simple_read_from_buffer(buf, len, pos, lppaca,
+ sizeof(struct lppaca));
+}
+
+static const struct file_operations vpa_fops = {
+ .open = simple_open,
+ .read = vpa_file_read,
+ .llseek = default_llseek,
+};
+
+static int __init vpa_debugfs_init(void)
+{
+ char name[16];
+ long i;
+ struct dentry *vpa_dir;
+
+ if (!firmware_has_feature(FW_FEATURE_SPLPAR))
+ return 0;
+
+ vpa_dir = debugfs_create_dir("vpa", powerpc_debugfs_root);
+ if (!vpa_dir) {
+ pr_warn("%s: can't create vpa root dir\n", __func__);
+ return -ENOMEM;
+ }
+
+ /* set up the per-cpu vpa files */
+ for_each_possible_cpu(i) {
+ struct dentry *d;
+
+ sprintf(name, "cpu-%ld", i);
+
+ d = debugfs_create_file(name, 0400, vpa_dir, (void *)i,
+ &vpa_fops);
+ if (!d) {
+ pr_warn("%s: can't create per-cpu vpa file\n",
+ __func__);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+machine_arch_initcall(pseries, vpa_debugfs_init);
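+/*
+ * With debugfs mounted this typically appears as one file per possible
+ * CPU, e.g. <debugfs>/powerpc/vpa/cpu-0, each returning that CPU's raw
+ * lppaca contents.
+ */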
+#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c
new file mode 100644
index 000000000..7c872dc01
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/lparcfg.c
@@ -0,0 +1,714 @@
+/*
+ * PowerPC64 LPAR Configuration Information Driver
+ *
+ * Dave Engebretsen engebret@us.ibm.com
+ * Copyright (c) 2003 Dave Engebretsen
+ * Will Schmidt willschm@us.ibm.com
+ * SPLPAR updates, Copyright (c) 2003 Will Schmidt IBM Corporation.
+ * seq_file updates, Copyright (c) 2004 Will Schmidt IBM Corporation.
+ * Nathan Lynch nathanl@austin.ibm.com
+ * Added lparcfg_write, Copyright (C) 2004 Nathan Lynch IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This driver creates a proc file at /proc/ppc64/lparcfg which contains
+ * keyword - value pairs that specify the configuration of the partition.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <asm/lppaca.h>
+#include <asm/hvcall.h>
+#include <asm/firmware.h>
+#include <asm/rtas.h>
+#include <asm/time.h>
+#include <asm/prom.h>
+#include <asm/vdso_datapage.h>
+#include <asm/vio.h>
+#include <asm/mmu.h>
+#include <asm/machdep.h>
+
+#include "pseries.h"
+
+/*
+ * This isn't a module, but we expose a name and version to
+ * userspace via /proc, so leave the definitions here
+ */
+#define MODULE_VERS "1.9"
+#define MODULE_NAME "lparcfg"
+
+/* #define LPARCFG_DEBUG */
+
+/*
+ * Track sum of all purrs across all processors. This is used to further
+ * calculate usage values by different applications
+ */
+static void cpu_get_purr(void *arg)
+{
+ atomic64_t *sum = arg;
+
+ atomic64_add(mfspr(SPRN_PURR), sum);
+}
+
+static unsigned long get_purr(void)
+{
+ atomic64_t purr = ATOMIC64_INIT(0);
+
+ on_each_cpu(cpu_get_purr, &purr, 1);
+
+ return atomic64_read(&purr);
+}
+
+/*
+ * Methods used to fetch LPAR data when running on a pSeries platform.
+ */
+
+struct hvcall_ppp_data {
+ u64 entitlement;
+ u64 unallocated_entitlement;
+ u16 group_num;
+ u16 pool_num;
+ u8 capped;
+ u8 weight;
+ u8 unallocated_weight;
+ u16 active_procs_in_pool;
+ u16 active_system_procs;
+ u16 phys_platform_procs;
+ u32 max_proc_cap_avail;
+ u32 entitled_proc_cap_avail;
+};
+
+/*
+ * H_GET_PPP hcall returns info in 4 parms
+ * (entitled_capacity, unallocated_capacity,
+ *  aggregation, resource_capability).
+ *
+ * R4 = Entitled Processor Capacity Percentage.
+ * R5 = Unallocated Processor Capacity Percentage.
+ * R6 (AABBCCDDEEFFGGHH).
+ * XXXX - reserved (0)
+ * XXXX - reserved (0)
+ * XXXX - Group Number
+ * XXXX - Pool Number.
+ * R7 (IIJJKKLLMMNNOOPP).
+ * XX - reserved. (0)
+ * XX - bit 0-6 reserved (0). bit 7 is Capped indicator.
+ * XX - variable processor Capacity Weight
+ * XX - Unallocated Variable Processor Capacity Weight.
+ * XXXX - Active processors in Physical Processor Pool.
+ * XXXX - Processors active on platform.
+ * R8 (QQQQRRRRRRSSSSSS). if ibm,partition-performance-parameters-level >= 1
+ * XXXX - Physical platform procs allocated to virtualization.
+ *   XXXXXX - Max procs capacity % available to the partition's pool.
+ *   XXXXXX - Entitled procs capacity % available to the
+ *            partition's pool.
+ */
+static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
+{
+ unsigned long rc;
+ unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+
+ rc = plpar_hcall9(H_GET_PPP, retbuf);
+
+ ppp_data->entitlement = retbuf[0];
+ ppp_data->unallocated_entitlement = retbuf[1];
+
+ ppp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
+ ppp_data->pool_num = retbuf[2] & 0xffff;
+
+ ppp_data->capped = (retbuf[3] >> 6 * 8) & 0x01;
+ ppp_data->weight = (retbuf[3] >> 5 * 8) & 0xff;
+ ppp_data->unallocated_weight = (retbuf[3] >> 4 * 8) & 0xff;
+ ppp_data->active_procs_in_pool = (retbuf[3] >> 2 * 8) & 0xffff;
+ ppp_data->active_system_procs = retbuf[3] & 0xffff;
+
+ ppp_data->phys_platform_procs = retbuf[4] >> 6 * 8;
+ ppp_data->max_proc_cap_avail = (retbuf[4] >> 3 * 8) & 0xffffff;
+ ppp_data->entitled_proc_cap_avail = retbuf[4] & 0xffffff;
+
+ return rc;
+}
+
+static unsigned h_pic(unsigned long *pool_idle_time,
+ unsigned long *num_procs)
+{
+ unsigned long rc;
+ unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+ rc = plpar_hcall(H_PIC, retbuf);
+
+ *pool_idle_time = retbuf[0];
+ *num_procs = retbuf[1];
+
+ return rc;
+}
+
+/*
+ * parse_ppp_data
+ * Parse out the data returned from h_get_ppp and h_pic
+ */
+static void parse_ppp_data(struct seq_file *m)
+{
+ struct hvcall_ppp_data ppp_data;
+ struct device_node *root;
+ const __be32 *perf_level;
+ int rc;
+
+ rc = h_get_ppp(&ppp_data);
+ if (rc)
+ return;
+
+ seq_printf(m, "partition_entitled_capacity=%lld\n",
+ ppp_data.entitlement);
+ seq_printf(m, "group=%d\n", ppp_data.group_num);
+ seq_printf(m, "system_active_processors=%d\n",
+ ppp_data.active_system_procs);
+
+ /* pool related entries are appropriate for shared configs */
+ if (lppaca_shared_proc(get_lppaca())) {
+ unsigned long pool_idle_time, pool_procs;
+
+ seq_printf(m, "pool=%d\n", ppp_data.pool_num);
+
+ /* report pool_capacity in percentage */
+ seq_printf(m, "pool_capacity=%d\n",
+ ppp_data.active_procs_in_pool * 100);
+
+ h_pic(&pool_idle_time, &pool_procs);
+ seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time);
+ seq_printf(m, "pool_num_procs=%ld\n", pool_procs);
+ }
+
+ seq_printf(m, "unallocated_capacity_weight=%d\n",
+ ppp_data.unallocated_weight);
+ seq_printf(m, "capacity_weight=%d\n", ppp_data.weight);
+ seq_printf(m, "capped=%d\n", ppp_data.capped);
+ seq_printf(m, "unallocated_capacity=%lld\n",
+ ppp_data.unallocated_entitlement);
+
+ /* The last bits of information returned from h_get_ppp are only
+ * valid if the ibm,partition-performance-parameters-level
+ * property is >= 1.
+ */
+ root = of_find_node_by_path("/");
+ if (root) {
+ perf_level = of_get_property(root,
+ "ibm,partition-performance-parameters-level",
+ NULL);
+ if (perf_level && (be32_to_cpup(perf_level) >= 1)) {
+ seq_printf(m,
+ "physical_procs_allocated_to_virtualization=%d\n",
+ ppp_data.phys_platform_procs);
+ seq_printf(m, "max_proc_capacity_available=%d\n",
+ ppp_data.max_proc_cap_avail);
+ seq_printf(m, "entitled_proc_capacity_available=%d\n",
+ ppp_data.entitled_proc_cap_avail);
+ }
+
+ of_node_put(root);
+ }
+}
+
+/**
+ * parse_mpp_data
+ * Parse out data returned from h_get_mpp
+ */
+static void parse_mpp_data(struct seq_file *m)
+{
+ struct hvcall_mpp_data mpp_data;
+ int rc;
+
+ rc = h_get_mpp(&mpp_data);
+ if (rc)
+ return;
+
+ seq_printf(m, "entitled_memory=%ld\n", mpp_data.entitled_mem);
+
+ if (mpp_data.mapped_mem != -1)
+ seq_printf(m, "mapped_entitled_memory=%ld\n",
+ mpp_data.mapped_mem);
+
+ seq_printf(m, "entitled_memory_group_number=%d\n", mpp_data.group_num);
+ seq_printf(m, "entitled_memory_pool_number=%d\n", mpp_data.pool_num);
+
+ seq_printf(m, "entitled_memory_weight=%d\n", mpp_data.mem_weight);
+ seq_printf(m, "unallocated_entitled_memory_weight=%d\n",
+ mpp_data.unallocated_mem_weight);
+ seq_printf(m, "unallocated_io_mapping_entitlement=%ld\n",
+ mpp_data.unallocated_entitlement);
+
+ if (mpp_data.pool_size != -1)
+ seq_printf(m, "entitled_memory_pool_size=%ld bytes\n",
+ mpp_data.pool_size);
+
+ seq_printf(m, "entitled_memory_loan_request=%ld\n",
+ mpp_data.loan_request);
+
+ seq_printf(m, "backing_memory=%ld bytes\n", mpp_data.backing_mem);
+}
+
+/**
+ * parse_mpp_x_data
+ * Parse out data returned from h_get_mpp_x
+ */
+static void parse_mpp_x_data(struct seq_file *m)
+{
+ struct hvcall_mpp_x_data mpp_x_data;
+
+ if (!firmware_has_feature(FW_FEATURE_XCMO))
+ return;
+ if (h_get_mpp_x(&mpp_x_data))
+ return;
+
+ seq_printf(m, "coalesced_bytes=%ld\n", mpp_x_data.coalesced_bytes);
+
+ if (mpp_x_data.pool_coalesced_bytes)
+ seq_printf(m, "pool_coalesced_bytes=%ld\n",
+ mpp_x_data.pool_coalesced_bytes);
+ if (mpp_x_data.pool_purr_cycles)
+ seq_printf(m, "coalesce_pool_purr=%ld\n", mpp_x_data.pool_purr_cycles);
+ if (mpp_x_data.pool_spurr_cycles)
+ seq_printf(m, "coalesce_pool_spurr=%ld\n", mpp_x_data.pool_spurr_cycles);
+}
+
+#define SPLPAR_CHARACTERISTICS_TOKEN 20
+#define SPLPAR_MAXLENGTH 1026*(sizeof(char))
+
+/*
+ * parse_system_parameter_string()
+ * Retrieve the potential_processors, max_entitled_capacity and friends
+ * through the get-system-parameter rtas call. Replace keyword strings as
+ * necessary.
+ */
+static void parse_system_parameter_string(struct seq_file *m)
+{
+ int call_status;
+
+ unsigned char *local_buffer = kmalloc(SPLPAR_MAXLENGTH, GFP_KERNEL);
+ if (!local_buffer) {
+ printk(KERN_ERR "%s %s kmalloc failure at line %d\n",
+ __FILE__, __func__, __LINE__);
+ return;
+ }
+
+ spin_lock(&rtas_data_buf_lock);
+ memset(rtas_data_buf, 0, SPLPAR_MAXLENGTH);
+ call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+ NULL,
+ SPLPAR_CHARACTERISTICS_TOKEN,
+ __pa(rtas_data_buf),
+ RTAS_DATA_BUF_SIZE);
+ memcpy(local_buffer, rtas_data_buf, SPLPAR_MAXLENGTH);
+ local_buffer[SPLPAR_MAXLENGTH - 1] = '\0';
+ spin_unlock(&rtas_data_buf_lock);
+
+ if (call_status != 0) {
+ printk(KERN_INFO
+ "%s %s Error calling get-system-parameter (0x%x)\n",
+ __FILE__, __func__, call_status);
+ } else {
+ int splpar_strlen;
+ int idx, w_idx;
+ char *workbuffer = kzalloc(SPLPAR_MAXLENGTH, GFP_KERNEL);
+ if (!workbuffer) {
+ printk(KERN_ERR "%s %s kmalloc failure at line %d\n",
+ __FILE__, __func__, __LINE__);
+ kfree(local_buffer);
+ return;
+ }
+#ifdef LPARCFG_DEBUG
+ printk(KERN_INFO "success calling get-system-parameter\n");
+#endif
+ splpar_strlen = local_buffer[0] * 256 + local_buffer[1];
+ local_buffer += 2; /* step over strlen value */
+
+ w_idx = 0;
+ idx = 0;
+ while ((*local_buffer) && (idx < splpar_strlen)) {
+ workbuffer[w_idx++] = local_buffer[idx++];
+ if ((local_buffer[idx] == ',')
+ || (local_buffer[idx] == '\0')) {
+ workbuffer[w_idx] = '\0';
+ if (w_idx) {
+ /* avoid the empty string */
+ seq_printf(m, "%s\n", workbuffer);
+ }
+ memset(workbuffer, 0, SPLPAR_MAXLENGTH);
+ idx++; /* skip the comma */
+ w_idx = 0;
+ } else if (local_buffer[idx] == '=') {
+ /* code here to replace workbuffer contents
+ with different keyword strings */
+ if (0 == strcmp(workbuffer, "MaxEntCap")) {
+ strcpy(workbuffer,
+ "partition_max_entitled_capacity");
+ w_idx = strlen(workbuffer);
+ }
+ if (0 == strcmp(workbuffer, "MaxPlatProcs")) {
+ strcpy(workbuffer,
+ "system_potential_processors");
+ w_idx = strlen(workbuffer);
+ }
+ }
+ }
+ kfree(workbuffer);
+ local_buffer -= 2; /* back up over strlen value */
+ }
+ kfree(local_buffer);
+}
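+/*
+ * The characteristics string is a comma-separated list of keyword=value
+ * pairs, e.g. (illustrative values only):
+ *   MaxEntCap=400,MaxPlatProcs=32,...
+ * Each pair is emitted on its own line, with MaxEntCap and MaxPlatProcs
+ * renamed as above.
+ */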
+
+/* Return the number of processors in the system.
+ * This function reads through the device tree and counts
+ * the virtual processors; this does not include threads.
+ */
+static int lparcfg_count_active_processors(void)
+{
+ struct device_node *cpus_dn;
+ int count = 0;
+
+ for_each_node_by_type(cpus_dn, "cpu") {
+#ifdef LPARCFG_DEBUG
+ printk(KERN_ERR "cpus_dn %p\n", cpus_dn);
+#endif
+ count++;
+ }
+ return count;
+}
+
+static void pseries_cmo_data(struct seq_file *m)
+{
+ int cpu;
+ unsigned long cmo_faults = 0;
+ unsigned long cmo_fault_time = 0;
+
+ seq_printf(m, "cmo_enabled=%d\n", firmware_has_feature(FW_FEATURE_CMO));
+
+ if (!firmware_has_feature(FW_FEATURE_CMO))
+ return;
+
+ for_each_possible_cpu(cpu) {
+ cmo_faults += be64_to_cpu(lppaca_of(cpu).cmo_faults);
+ cmo_fault_time += be64_to_cpu(lppaca_of(cpu).cmo_fault_time);
+ }
+
+ seq_printf(m, "cmo_faults=%lu\n", cmo_faults);
+ seq_printf(m, "cmo_fault_time_usec=%lu\n",
+ cmo_fault_time / tb_ticks_per_usec);
+ seq_printf(m, "cmo_primary_psp=%d\n", cmo_get_primary_psp());
+ seq_printf(m, "cmo_secondary_psp=%d\n", cmo_get_secondary_psp());
+ seq_printf(m, "cmo_page_size=%lu\n", cmo_get_page_size());
+}
+
+static void splpar_dispatch_data(struct seq_file *m)
+{
+ int cpu;
+ unsigned long dispatches = 0;
+ unsigned long dispatch_dispersions = 0;
+
+ for_each_possible_cpu(cpu) {
+ dispatches += be32_to_cpu(lppaca_of(cpu).yield_count);
+ dispatch_dispersions +=
+ be32_to_cpu(lppaca_of(cpu).dispersion_count);
+ }
+
+ seq_printf(m, "dispatches=%lu\n", dispatches);
+ seq_printf(m, "dispatch_dispersions=%lu\n", dispatch_dispersions);
+}
+
+static void parse_em_data(struct seq_file *m)
+{
+ unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+ if (firmware_has_feature(FW_FEATURE_LPAR) &&
+ plpar_hcall(H_GET_EM_PARMS, retbuf) == H_SUCCESS)
+ seq_printf(m, "power_mode_data=%016lx\n", retbuf[0]);
+}
+
+static int pseries_lparcfg_data(struct seq_file *m, void *v)
+{
+ int partition_potential_processors;
+ int partition_active_processors;
+ struct device_node *rtas_node;
+ const __be32 *lrdrp = NULL;
+
+ rtas_node = of_find_node_by_path("/rtas");
+ if (rtas_node)
+ lrdrp = of_get_property(rtas_node, "ibm,lrdr-capacity", NULL);
+
+ if (lrdrp == NULL) {
+ partition_potential_processors = vdso_data->processorCount;
+ } else {
+ partition_potential_processors = be32_to_cpup(lrdrp + 4);
+ }
+ of_node_put(rtas_node);
+
+ partition_active_processors = lparcfg_count_active_processors();
+
+ if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+ /* this call handles the ibm,get-system-parameter contents */
+ parse_system_parameter_string(m);
+ parse_ppp_data(m);
+ parse_mpp_data(m);
+ parse_mpp_x_data(m);
+ pseries_cmo_data(m);
+ splpar_dispatch_data(m);
+
+ seq_printf(m, "purr=%ld\n", get_purr());
+ } else { /* non SPLPAR case */
+
+ seq_printf(m, "system_active_processors=%d\n",
+ partition_potential_processors);
+
+ seq_printf(m, "system_potential_processors=%d\n",
+ partition_potential_processors);
+
+ seq_printf(m, "partition_max_entitled_capacity=%d\n",
+ partition_potential_processors * 100);
+
+ seq_printf(m, "partition_entitled_capacity=%d\n",
+ partition_active_processors * 100);
+ }
+
+ seq_printf(m, "partition_active_processors=%d\n",
+ partition_active_processors);
+
+ seq_printf(m, "partition_potential_processors=%d\n",
+ partition_potential_processors);
+
+ seq_printf(m, "shared_processor_mode=%d\n",
+ lppaca_shared_proc(get_lppaca()));
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ seq_printf(m, "slb_size=%d\n", mmu_slb_size);
+#endif
+ parse_em_data(m);
+
+ return 0;
+}
+
+static ssize_t update_ppp(u64 *entitlement, u8 *weight)
+{
+ struct hvcall_ppp_data ppp_data;
+ u8 new_weight;
+ u64 new_entitled;
+ ssize_t retval;
+
+ /* Get our current parameters */
+ retval = h_get_ppp(&ppp_data);
+ if (retval)
+ return retval;
+
+ if (entitlement) {
+ new_weight = ppp_data.weight;
+ new_entitled = *entitlement;
+ } else if (weight) {
+ new_weight = *weight;
+ new_entitled = ppp_data.entitlement;
+ } else
+ return -EINVAL;
+
+ pr_debug("%s: current_entitled = %llu, current_weight = %u\n",
+ __func__, ppp_data.entitlement, ppp_data.weight);
+
+ pr_debug("%s: new_entitled = %llu, new_weight = %u\n",
+ __func__, new_entitled, new_weight);
+
+ retval = plpar_hcall_norets(H_SET_PPP, new_entitled, new_weight);
+ return retval;
+}
+
+/**
+ * update_mpp
+ *
+ * Update the memory entitlement and weight for the partition. Caller must
+ * specify either a new entitlement or weight, not both, to be updated
+ * since the h_set_mpp call takes both entitlement and weight as parameters.
+ */
+static ssize_t update_mpp(u64 *entitlement, u8 *weight)
+{
+ struct hvcall_mpp_data mpp_data;
+ u64 new_entitled;
+ u8 new_weight;
+ ssize_t rc;
+
+ if (entitlement) {
+ /* Check with vio to ensure the new memory entitlement
+ * can be handled.
+ */
+ rc = vio_cmo_entitlement_update(*entitlement);
+ if (rc)
+ return rc;
+ }
+
+ rc = h_get_mpp(&mpp_data);
+ if (rc)
+ return rc;
+
+ if (entitlement) {
+ new_weight = mpp_data.mem_weight;
+ new_entitled = *entitlement;
+ } else if (weight) {
+ new_weight = *weight;
+ new_entitled = mpp_data.entitled_mem;
+ } else
+ return -EINVAL;
+
+ pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
+ __func__, mpp_data.entitled_mem, mpp_data.mem_weight);
+
+ pr_debug("%s: new_entitled = %llu, new_weight = %u\n",
+ __func__, new_entitled, new_weight);
+
+ rc = plpar_hcall_norets(H_SET_MPP, new_entitled, new_weight);
+ return rc;
+}
+
+/*
+ * Interface for changing system parameters (variable capacity weight
+ * and entitled capacity). Format of input is "param_name=value";
+ * anything after value is ignored. Valid parameters at this time are
+ * "partition_entitled_capacity" and "capacity_weight". We use
+ * H_SET_PPP to alter parameters.
+ *
+ * This function should be invoked only on systems with
+ * FW_FEATURE_SPLPAR.
+ */
+static ssize_t lparcfg_write(struct file *file, const char __user * buf,
+ size_t count, loff_t * off)
+{
+ int kbuf_sz = 64;
+ char kbuf[kbuf_sz];
+ char *tmp;
+ u64 new_entitled, *new_entitled_ptr = &new_entitled;
+ u8 new_weight, *new_weight_ptr = &new_weight;
+ ssize_t retval;
+
+ if (!firmware_has_feature(FW_FEATURE_SPLPAR))
+ return -EINVAL;
+
+ if (count > kbuf_sz)
+ return -EINVAL;
+
+ if (copy_from_user(kbuf, buf, count))
+ return -EFAULT;
+
+ kbuf[count - 1] = '\0';
+ tmp = strchr(kbuf, '=');
+ if (!tmp)
+ return -EINVAL;
+
+ *tmp++ = '\0';
+
+ if (!strcmp(kbuf, "partition_entitled_capacity")) {
+ char *endp;
+ *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
+ if (endp == tmp)
+ return -EINVAL;
+
+ retval = update_ppp(new_entitled_ptr, NULL);
+ } else if (!strcmp(kbuf, "capacity_weight")) {
+ char *endp;
+ *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
+ if (endp == tmp)
+ return -EINVAL;
+
+ retval = update_ppp(NULL, new_weight_ptr);
+ } else if (!strcmp(kbuf, "entitled_memory")) {
+ char *endp;
+ *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
+ if (endp == tmp)
+ return -EINVAL;
+
+ retval = update_mpp(new_entitled_ptr, NULL);
+ } else if (!strcmp(kbuf, "entitled_memory_weight")) {
+ char *endp;
+ *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
+ if (endp == tmp)
+ return -EINVAL;
+
+ retval = update_mpp(NULL, new_weight_ptr);
+ } else
+ return -EINVAL;
+
+ if (retval == H_SUCCESS || retval == H_CONSTRAINED) {
+ retval = count;
+ } else if (retval == H_BUSY) {
+ retval = -EBUSY;
+ } else if (retval == H_HARDWARE) {
+ retval = -EIO;
+ } else if (retval == H_PARAMETER) {
+ retval = -EINVAL;
+ }
+
+ return retval;
+}
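+/*
+ * Example (on an SPLPAR partition, via the file created by lparcfg_init()
+ * below):
+ *   echo "partition_entitled_capacity=100" > /proc/powerpc/lparcfg
+ * adjusts the processor entitlement through H_SET_PPP, while
+ * "entitled_memory=..." and "entitled_memory_weight=..." go through
+ * H_SET_MPP.
+ */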
+
+static int lparcfg_data(struct seq_file *m, void *v)
+{
+ struct device_node *rootdn;
+ const char *model = "";
+ const char *system_id = "";
+ const char *tmp;
+ const __be32 *lp_index_ptr;
+ unsigned int lp_index = 0;
+
+ seq_printf(m, "%s %s\n", MODULE_NAME, MODULE_VERS);
+
+ rootdn = of_find_node_by_path("/");
+ if (rootdn) {
+ tmp = of_get_property(rootdn, "model", NULL);
+ if (tmp)
+ model = tmp;
+ tmp = of_get_property(rootdn, "system-id", NULL);
+ if (tmp)
+ system_id = tmp;
+ lp_index_ptr = of_get_property(rootdn, "ibm,partition-no",
+ NULL);
+ if (lp_index_ptr)
+ lp_index = be32_to_cpup(lp_index_ptr);
+ of_node_put(rootdn);
+ }
+ seq_printf(m, "serial_number=%s\n", system_id);
+ seq_printf(m, "system_type=%s\n", model);
+ seq_printf(m, "partition_id=%d\n", (int)lp_index);
+
+ return pseries_lparcfg_data(m, v);
+}
+
+static int lparcfg_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, lparcfg_data, NULL);
+}
+
+static const struct file_operations lparcfg_fops = {
+ .read = seq_read,
+ .write = lparcfg_write,
+ .open = lparcfg_open,
+ .release = single_release,
+ .llseek = seq_lseek,
+};
+
+static int __init lparcfg_init(void)
+{
+ umode_t mode = 0444;
+
+ /* Allow writing if we have FW_FEATURE_SPLPAR */
+ if (firmware_has_feature(FW_FEATURE_SPLPAR))
+ mode |= 0200;
+
+ if (!proc_create("powerpc/lparcfg", mode, NULL, &lparcfg_fops)) {
+ printk(KERN_ERR "Failed to create powerpc/lparcfg\n");
+ return -EIO;
+ }
+ return 0;
+}
+machine_device_initcall(pseries, lparcfg_init);
diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
new file mode 100644
index 000000000..70744b4fb
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -0,0 +1,439 @@
+/*
+ * Support for Partition Mobility/Migration
+ *
+ * Copyright (C) 2010 Nathan Fontenot
+ * Copyright (C) 2010 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/cpu.h>
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/stat.h>
+#include <linux/completion.h>
+#include <linux/device.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/stringify.h>
+
+#include <asm/machdep.h>
+#include <asm/rtas.h>
+#include "pseries.h"
+#include "../../kernel/cacheinfo.h"
+
+static struct kobject *mobility_kobj;
+
+struct update_props_workarea {
+ __be32 phandle;
+ __be32 state;
+ __be64 reserved;
+ __be32 nprops;
+} __packed;
+
+#define NODE_ACTION_MASK 0xff000000
+#define NODE_COUNT_MASK 0x00ffffff
+
+#define DELETE_DT_NODE 0x01000000
+#define UPDATE_DT_NODE 0x02000000
+#define ADD_DT_NODE 0x03000000
+
+#define MIGRATION_SCOPE (1)
+#define PRRN_SCOPE -2
+
+static int mobility_rtas_call(int token, char *buf, s32 scope)
+{
+ int rc;
+
+ spin_lock(&rtas_data_buf_lock);
+
+ memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE);
+ rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, scope);
+ memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
+
+ spin_unlock(&rtas_data_buf_lock);
+ return rc;
+}
+
+static int delete_dt_node(__be32 phandle)
+{
+ struct device_node *dn;
+
+ dn = of_find_node_by_phandle(be32_to_cpu(phandle));
+ if (!dn)
+ return -ENOENT;
+
+ dlpar_detach_node(dn);
+ of_node_put(dn);
+ return 0;
+}
+
+static int update_dt_property(struct device_node *dn, struct property **prop,
+ const char *name, u32 vd, char *value)
+{
+ struct property *new_prop = *prop;
+ int more = 0;
+
+ /* A negative 'vd' value indicates that only part of the new property
+ * value is contained in the buffer and we need to call
+ * ibm,update-properties again to get the rest of the value.
+ *
+ * A negative value is also the two's complement of the actual value.
+ */
+ if (vd & 0x80000000) {
+ vd = ~vd + 1;
+ more = 1;
+ }
+
+ if (new_prop) {
+ /* partial property fixup */
+ char *new_data = kzalloc(new_prop->length + vd, GFP_KERNEL);
+ if (!new_data)
+ return -ENOMEM;
+
+ memcpy(new_data, new_prop->value, new_prop->length);
+ memcpy(new_data + new_prop->length, value, vd);
+
+ kfree(new_prop->value);
+ new_prop->value = new_data;
+ new_prop->length += vd;
+ } else {
+ new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
+ if (!new_prop)
+ return -ENOMEM;
+
+ new_prop->name = kstrdup(name, GFP_KERNEL);
+ if (!new_prop->name) {
+ kfree(new_prop);
+ return -ENOMEM;
+ }
+
+ new_prop->length = vd;
+ new_prop->value = kzalloc(new_prop->length, GFP_KERNEL);
+ if (!new_prop->value) {
+ kfree(new_prop->name);
+ kfree(new_prop);
+ return -ENOMEM;
+ }
+
+ memcpy(new_prop->value, value, vd);
+ *prop = new_prop;
+ }
+
+ if (!more) {
+ of_update_property(dn, new_prop);
+ *prop = NULL;
+ }
+
+ return 0;
+}
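+/*
+ * For example, an (illustrative) descriptor length of 0xffffff00 negates
+ * to 0x100, meaning 256 more bytes of this property value follow in a
+ * later ibm,update-properties call, so the value keeps accumulating in
+ * new_prop until a non-negative length completes it.
+ */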
+
+static int update_dt_node(__be32 phandle, s32 scope)
+{
+ struct update_props_workarea *upwa;
+ struct device_node *dn;
+ struct property *prop = NULL;
+ int i, rc, rtas_rc;
+ char *prop_data;
+ char *rtas_buf;
+ int update_properties_token;
+ u32 nprops;
+ u32 vd;
+
+ update_properties_token = rtas_token("ibm,update-properties");
+ if (update_properties_token == RTAS_UNKNOWN_SERVICE)
+ return -EINVAL;
+
+ rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+ if (!rtas_buf)
+ return -ENOMEM;
+
+ dn = of_find_node_by_phandle(be32_to_cpu(phandle));
+ if (!dn) {
+ kfree(rtas_buf);
+ return -ENOENT;
+ }
+
+ upwa = (struct update_props_workarea *)&rtas_buf[0];
+ upwa->phandle = phandle;
+
+ do {
+ rtas_rc = mobility_rtas_call(update_properties_token, rtas_buf,
+ scope);
+ if (rtas_rc < 0)
+ break;
+
+ prop_data = rtas_buf + sizeof(*upwa);
+ nprops = be32_to_cpu(upwa->nprops);
+
+ /* On the first call to ibm,update-properties for a node, the
+ * first property value descriptor contains an empty
+ * property name, the property value length encoded as u32,
+ * and the property value is the node path being updated.
+ */
+ if (*prop_data == 0) {
+ prop_data++;
+ vd = be32_to_cpu(*(__be32 *)prop_data);
+ prop_data += vd + sizeof(vd);
+ nprops--;
+ }
+
+ for (i = 0; i < nprops; i++) {
+ char *prop_name;
+
+ prop_name = prop_data;
+ prop_data += strlen(prop_name) + 1;
+ vd = be32_to_cpu(*(__be32 *)prop_data);
+ prop_data += sizeof(vd);
+
+ switch (vd) {
+ case 0x00000000:
+ /* name only property, nothing to do */
+ break;
+
+ case 0x80000000:
+ of_remove_property(dn, of_find_property(dn,
+ prop_name, NULL));
+ prop = NULL;
+ break;
+
+ default:
+ rc = update_dt_property(dn, &prop, prop_name,
+ vd, prop_data);
+ if (rc) {
+ printk(KERN_ERR "Could not update %s"
+ " property\n", prop_name);
+ }
+
+ prop_data += vd;
+ }
+
+ cond_resched();
+ }
+
+ cond_resched();
+ } while (rtas_rc == 1);
+
+ of_node_put(dn);
+ kfree(rtas_buf);
+ return 0;
+}
+
+static int add_dt_node(__be32 parent_phandle, __be32 drc_index)
+{
+ struct device_node *dn;
+ struct device_node *parent_dn;
+ int rc;
+
+ parent_dn = of_find_node_by_phandle(be32_to_cpu(parent_phandle));
+ if (!parent_dn)
+ return -ENOENT;
+
+ dn = dlpar_configure_connector(drc_index, parent_dn);
+ if (!dn) {
+ of_node_put(parent_dn);
+ return -ENOENT;
+ }
+
+ rc = dlpar_attach_node(dn, parent_dn);
+ if (rc)
+ dlpar_free_cc_nodes(dn);
+
+ of_node_put(parent_dn);
+ return rc;
+}
+
+static void prrn_update_node(__be32 phandle)
+{
+ struct pseries_hp_errorlog *hp_elog;
+ struct device_node *dn;
+
+ /*
+ * If a node is found for the given phandle, the phandle does not
+ * represent the drc index of an LMB and we can ignore it.
+ */
+ dn = of_find_node_by_phandle(be32_to_cpu(phandle));
+ if (dn) {
+ of_node_put(dn);
+ return;
+ }
+
+ hp_elog = kzalloc(sizeof(*hp_elog), GFP_KERNEL);
+ if (!hp_elog)
+ return;
+
+ hp_elog->resource = PSERIES_HP_ELOG_RESOURCE_MEM;
+ hp_elog->action = PSERIES_HP_ELOG_ACTION_READD;
+ hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_INDEX;
+ hp_elog->_drc_u.drc_index = phandle;
+
+ queue_hotplug_event(hp_elog, NULL, NULL);
+
+ kfree(hp_elog);
+}
+
+int pseries_devicetree_update(s32 scope)
+{
+ char *rtas_buf;
+ __be32 *data;
+ int update_nodes_token;
+ int rc;
+
+ update_nodes_token = rtas_token("ibm,update-nodes");
+ if (update_nodes_token == RTAS_UNKNOWN_SERVICE)
+ return -EINVAL;
+
+ rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+ if (!rtas_buf)
+ return -ENOMEM;
+
+ do {
+ rc = mobility_rtas_call(update_nodes_token, rtas_buf, scope);
+ if (rc && rc != 1)
+ break;
+
+ data = (__be32 *)rtas_buf + 4;
+ while (be32_to_cpu(*data) & NODE_ACTION_MASK) {
+ int i;
+ u32 action = be32_to_cpu(*data) & NODE_ACTION_MASK;
+ u32 node_count = be32_to_cpu(*data) & NODE_COUNT_MASK;
+
+ data++;
+
+ for (i = 0; i < node_count; i++) {
+ __be32 phandle = *data++;
+ __be32 drc_index;
+
+ switch (action) {
+ case DELETE_DT_NODE:
+ delete_dt_node(phandle);
+ break;
+ case UPDATE_DT_NODE:
+ update_dt_node(phandle, scope);
+
+ if (scope == PRRN_SCOPE)
+ prrn_update_node(phandle);
+
+ break;
+ case ADD_DT_NODE:
+ drc_index = *data++;
+ add_dt_node(phandle, drc_index);
+ break;
+ }
+
+ cond_resched();
+ }
+ }
+
+ cond_resched();
+ } while (rc == 1);
+
+ kfree(rtas_buf);
+ return rc;
+}
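+/*
+ * Work area layout as consumed above: the first four 32-bit words are
+ * skipped, then each entry word encodes an action in its top byte
+ * (delete/update/add) and a node count in the low 24 bits, followed by
+ * that many phandles (plus a drc_index for each added node).
+ */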
+
+void post_mobility_fixup(void)
+{
+ int rc;
+ int activate_fw_token;
+
+ activate_fw_token = rtas_token("ibm,activate-firmware");
+ if (activate_fw_token == RTAS_UNKNOWN_SERVICE) {
+ printk(KERN_ERR "Could not make post-mobility "
+ "activate-fw call.\n");
+ return;
+ }
+
+ do {
+ rc = rtas_call(activate_fw_token, 0, 1, NULL);
+ } while (rtas_busy_delay(rc));
+
+ if (rc)
+ printk(KERN_ERR "Post-mobility activate-fw failed: %d\n", rc);
+
+ /*
+ * We don't want CPUs to go online/offline while the device
+ * tree is being updated.
+ */
+ cpus_read_lock();
+
+ /*
+ * It's common for the destination firmware to replace cache
+ * nodes. Release all of the cacheinfo hierarchy's references
+ * before updating the device tree.
+ */
+ cacheinfo_teardown();
+
+ rc = pseries_devicetree_update(MIGRATION_SCOPE);
+ if (rc)
+ printk(KERN_ERR "Post-mobility device tree update "
+ "failed: %d\n", rc);
+
+ cacheinfo_rebuild();
+
+ cpus_read_unlock();
+
+ /* Possibly switch to a new RFI flush type */
+ pseries_setup_rfi_flush();
+
+ return;
+}
+
+static ssize_t migration_store(struct class *class,
+ struct class_attribute *attr, const char *buf,
+ size_t count)
+{
+ u64 streamid;
+ int rc;
+
+ rc = kstrtou64(buf, 0, &streamid);
+ if (rc)
+ return rc;
+
+ do {
+ rc = rtas_ibm_suspend_me(streamid);
+ if (rc == -EAGAIN)
+ ssleep(1);
+ } while (rc == -EAGAIN);
+
+ if (rc)
+ return rc;
+
+ post_mobility_fixup();
+ return count;
+}
+
+/*
+ * Used by drmgr to determine the kernel behavior of the migration interface.
+ *
+ * Version 1: Performs all PAPR requirements for migration including
+ * firmware activation and device tree update.
+ */
+#define MIGRATION_API_VERSION 1
+
+static CLASS_ATTR_WO(migration);
+static CLASS_ATTR_STRING(api_version, 0444, __stringify(MIGRATION_API_VERSION));
+
+static int __init mobility_sysfs_init(void)
+{
+ int rc;
+
+ mobility_kobj = kobject_create_and_add("mobility", kernel_kobj);
+ if (!mobility_kobj)
+ return -ENOMEM;
+
+ rc = sysfs_create_file(mobility_kobj, &class_attr_migration.attr);
+ if (rc)
+ pr_err("mobility: unable to create migration sysfs file (%d)\n", rc);
+
+ rc = sysfs_create_file(mobility_kobj, &class_attr_api_version.attr.attr);
+ if (rc)
+ pr_err("mobility: unable to create api_version sysfs file (%d)\n", rc);
+
+ return 0;
+}
+machine_device_initcall(pseries, mobility_sysfs_init);
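+/*
+ * This exposes /sys/kernel/mobility/migration (write a stream id to start
+ * a migration via rtas_ibm_suspend_me()) and /sys/kernel/mobility/api_version
+ * (reads back MIGRATION_API_VERSION).
+ */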
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
new file mode 100644
index 000000000..b74969481
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -0,0 +1,531 @@
+/*
+ * Copyright 2006 Jake Moilanen <moilanen@austin.ibm.com>, IBM Corp.
+ * Copyright 2006-2007 Michael Ellerman, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2 of the
+ * License.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/irq.h>
+#include <linux/msi.h>
+
+#include <asm/rtas.h>
+#include <asm/hw_irq.h>
+#include <asm/ppc-pci.h>
+#include <asm/machdep.h>
+
+#include "pseries.h"
+
+static int query_token, change_token;
+
+#define RTAS_QUERY_FN 0
+#define RTAS_CHANGE_FN 1
+#define RTAS_RESET_FN 2
+#define RTAS_CHANGE_MSI_FN 3
+#define RTAS_CHANGE_MSIX_FN 4
+#define RTAS_CHANGE_32MSI_FN 5
+
+/* RTAS Helpers */
+
+static int rtas_change_msi(struct pci_dn *pdn, u32 func, u32 num_irqs)
+{
+ u32 addr, seq_num, rtas_ret[3];
+ unsigned long buid;
+ int rc;
+
+ addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
+ buid = pdn->phb->buid;
+
+ seq_num = 1;
+ do {
+ if (func == RTAS_CHANGE_MSI_FN || func == RTAS_CHANGE_MSIX_FN ||
+ func == RTAS_CHANGE_32MSI_FN)
+ rc = rtas_call(change_token, 6, 4, rtas_ret, addr,
+ BUID_HI(buid), BUID_LO(buid),
+ func, num_irqs, seq_num);
+ else
+ rc = rtas_call(change_token, 6, 3, rtas_ret, addr,
+ BUID_HI(buid), BUID_LO(buid),
+ func, num_irqs, seq_num);
+
+ seq_num = rtas_ret[1];
+ } while (rtas_busy_delay(rc));
+
+ /*
+ * If the RTAS call succeeded, return the number of irqs allocated.
+ * If not, make sure we return a negative error code.
+ */
+ if (rc == 0)
+ rc = rtas_ret[0];
+ else if (rc > 0)
+ rc = -rc;
+
+ pr_debug("rtas_msi: ibm,change_msi(func=%d,num=%d), got %d rc = %d\n",
+ func, num_irqs, rtas_ret[0], rc);
+
+ return rc;
+}
+
+static void rtas_disable_msi(struct pci_dev *pdev)
+{
+ struct pci_dn *pdn;
+
+ pdn = pci_get_pdn(pdev);
+ if (!pdn)
+ return;
+
+ /*
+ * disabling MSI with the explicit interface also disables MSI-X
+ */
+ if (rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, 0) != 0) {
+ /*
+ * may have failed because explicit interface is not
+ * present
+ */
+ if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0) {
+ pr_debug("rtas_msi: Setting MSIs to 0 failed!\n");
+ }
+ }
+}
+
+static int rtas_query_irq_number(struct pci_dn *pdn, int offset)
+{
+ u32 addr, rtas_ret[2];
+ unsigned long buid;
+ int rc;
+
+ addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
+ buid = pdn->phb->buid;
+
+ do {
+ rc = rtas_call(query_token, 4, 3, rtas_ret, addr,
+ BUID_HI(buid), BUID_LO(buid), offset);
+ } while (rtas_busy_delay(rc));
+
+ if (rc) {
+ pr_debug("rtas_msi: error (%d) querying source number\n", rc);
+ return rc;
+ }
+
+ return rtas_ret[0];
+}
+
+static void rtas_teardown_msi_irqs(struct pci_dev *pdev)
+{
+ struct msi_desc *entry;
+
+ for_each_pci_msi_entry(entry, pdev) {
+ if (!entry->irq)
+ continue;
+
+ irq_set_msi_desc(entry->irq, NULL);
+ irq_dispose_mapping(entry->irq);
+ }
+
+ rtas_disable_msi(pdev);
+}
+
+static int check_req(struct pci_dev *pdev, int nvec, char *prop_name)
+{
+ struct device_node *dn;
+ const __be32 *p;
+ u32 req_msi;
+
+ dn = pci_device_to_OF_node(pdev);
+
+ p = of_get_property(dn, prop_name, NULL);
+ if (!p) {
+ pr_debug("rtas_msi: No %s on %pOF\n", prop_name, dn);
+ return -ENOENT;
+ }
+
+ req_msi = be32_to_cpup(p);
+ if (req_msi < nvec) {
+ pr_debug("rtas_msi: %s requests < %d MSIs\n", prop_name, nvec);
+
+ if (req_msi == 0) /* Be paranoid */
+ return -ENOSPC;
+
+ return req_msi;
+ }
+
+ return 0;
+}
+
+static int check_req_msi(struct pci_dev *pdev, int nvec)
+{
+ return check_req(pdev, nvec, "ibm,req#msi");
+}
+
+static int check_req_msix(struct pci_dev *pdev, int nvec)
+{
+ return check_req(pdev, nvec, "ibm,req#msi-x");
+}
+
+/* Quota calculation */
+
+static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
+{
+ struct device_node *dn;
+ const __be32 *p;
+
+ dn = of_node_get(pci_device_to_OF_node(dev));
+ while (dn) {
+ p = of_get_property(dn, "ibm,pe-total-#msi", NULL);
+ if (p) {
+ pr_debug("rtas_msi: found prop on dn %pOF\n",
+ dn);
+ *total = be32_to_cpup(p);
+ return dn;
+ }
+
+ dn = of_get_next_parent(dn);
+ }
+
+ return NULL;
+}
+
+static struct device_node *find_pe_dn(struct pci_dev *dev, int *total)
+{
+ struct device_node *dn;
+ struct eeh_dev *edev;
+
+ /* Find our PE; for old firmwares we assume a total of 8 MSIs. */
+
+ dn = pci_device_to_OF_node(dev);
+ if (!dn)
+ return NULL;
+
+ /* Get the top level device in the PE */
+ edev = pdn_to_eeh_dev(PCI_DN(dn));
+ if (edev->pe)
+ edev = list_first_entry(&edev->pe->edevs, struct eeh_dev, list);
+ dn = pci_device_to_OF_node(edev->pdev);
+ if (!dn)
+ return NULL;
+
+ /* We actually want the parent */
+ dn = of_get_parent(dn);
+ if (!dn)
+ return NULL;
+
+ /* Hardcode of 8 for old firmwares */
+ *total = 8;
+ pr_debug("rtas_msi: using PE dn %pOF\n", dn);
+
+ return dn;
+}
+
+struct msi_counts {
+ struct device_node *requestor;
+ int num_devices;
+ int request;
+ int quota;
+ int spare;
+ int over_quota;
+};
+
+static void *count_non_bridge_devices(struct device_node *dn, void *data)
+{
+ struct msi_counts *counts = data;
+ const __be32 *p;
+ u32 class;
+
+ pr_debug("rtas_msi: counting %pOF\n", dn);
+
+ p = of_get_property(dn, "class-code", NULL);
+ class = p ? be32_to_cpup(p) : 0;
+
+ if ((class >> 8) != PCI_CLASS_BRIDGE_PCI)
+ counts->num_devices++;
+
+ return NULL;
+}
+
+static void *count_spare_msis(struct device_node *dn, void *data)
+{
+ struct msi_counts *counts = data;
+ const __be32 *p;
+ int req;
+
+ if (dn == counts->requestor)
+ req = counts->request;
+ else {
+ /* We don't know if a driver will try to use MSI or MSI-X,
+ * so we just have to punt and use the larger of the two. */
+ req = 0;
+ p = of_get_property(dn, "ibm,req#msi", NULL);
+ if (p)
+ req = be32_to_cpup(p);
+
+ p = of_get_property(dn, "ibm,req#msi-x", NULL);
+ if (p)
+ req = max(req, (int)be32_to_cpup(p));
+ }
+
+ if (req < counts->quota)
+ counts->spare += counts->quota - req;
+ else if (req > counts->quota)
+ counts->over_quota++;
+
+ return NULL;
+}
+
+static int msi_quota_for_device(struct pci_dev *dev, int request)
+{
+ struct device_node *pe_dn;
+ struct msi_counts counts;
+ int total;
+
+ pr_debug("rtas_msi: calc quota for %s, request %d\n", pci_name(dev),
+ request);
+
+ pe_dn = find_pe_total_msi(dev, &total);
+ if (!pe_dn)
+ pe_dn = find_pe_dn(dev, &total);
+
+ if (!pe_dn) {
+ pr_err("rtas_msi: couldn't find PE for %s\n", pci_name(dev));
+ goto out;
+ }
+
+ pr_debug("rtas_msi: found PE %pOF\n", pe_dn);
+
+ memset(&counts, 0, sizeof(struct msi_counts));
+
+ /* Work out how many devices we have below this PE */
+ pci_traverse_device_nodes(pe_dn, count_non_bridge_devices, &counts);
+
+ if (counts.num_devices == 0) {
+ pr_err("rtas_msi: found 0 devices under PE for %s\n",
+ pci_name(dev));
+ goto out;
+ }
+
+ counts.quota = total / counts.num_devices;
+ if (request <= counts.quota)
+ goto out;
+
+ /* else, we have some more calculating to do */
+ counts.requestor = pci_device_to_OF_node(dev);
+ counts.request = request;
+ pci_traverse_device_nodes(pe_dn, count_spare_msis, &counts);
+
+ /* If the total doesn't divide evenly among the devices, we can
+ * use the remainder as spare MSIs for anyone that wants them. */
+ counts.spare += total % counts.num_devices;
+
+ /* Divide any spare by the number of over-quota requestors */
+ if (counts.over_quota)
+ counts.quota += counts.spare / counts.over_quota;
+
+ /* And finally clamp the request to the possibly adjusted quota */
+ request = min(counts.quota, request);
+
+ pr_debug("rtas_msi: request clamped to quota %d\n", request);
+out:
+ of_node_put(pe_dn);
+
+ return request;
+}
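+/*
+ * Worked example with illustrative numbers: ibm,pe-total-#msi = 16 and
+ * four non-bridge devices under the PE give a base quota of 4. A device
+ * asking for 8 only gets more than 4 if the other devices' ibm,req#msi /
+ * ibm,req#msi-x values leave spare vectors, which are shared among the
+ * over-quota requestors.
+ */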
+
+static int check_msix_entries(struct pci_dev *pdev)
+{
+ struct msi_desc *entry;
+ int expected;
+
+ /* There's no way for us to express to firmware that we want
+ * a discontiguous, or non-zero based, range of MSI-X entries.
+ * So we must reject such requests. */
+
+ expected = 0;
+ for_each_pci_msi_entry(entry, pdev) {
+ if (entry->msi_attrib.entry_nr != expected) {
+ pr_debug("rtas_msi: bad MSI-X entries.\n");
+ return -EINVAL;
+ }
+ expected++;
+ }
+
+ return 0;
+}
+
+static void rtas_hack_32bit_msi_gen2(struct pci_dev *pdev)
+{
+ u32 addr_hi, addr_lo;
+
+ /*
+ * We should only get in here for IODA1 configs. This is based on the
+ * fact that we are using RTAS for MSIs, we don't have the 32 bit MSI
+ * RTAS support, and we are in a PCIe Gen2 slot.
+ */
+ dev_info(&pdev->dev,
+ "rtas_msi: No 32 bit MSI firmware support, forcing 32 bit MSI\n");
+ pci_read_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, &addr_hi);
+ addr_lo = 0xffff0000 | ((addr_hi >> (48 - 32)) << 4);
+ pci_write_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, addr_lo);
+ pci_write_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, 0);
+}
+
+static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type)
+{
+ struct pci_dn *pdn;
+ int hwirq, virq, i, quota, rc;
+ struct msi_desc *entry;
+ struct msi_msg msg;
+ int nvec = nvec_in;
+ int use_32bit_msi_hack = 0;
+
+ if (type == PCI_CAP_ID_MSIX)
+ rc = check_req_msix(pdev, nvec);
+ else
+ rc = check_req_msi(pdev, nvec);
+
+ if (rc)
+ return rc;
+
+ quota = msi_quota_for_device(pdev, nvec);
+
+ if (quota && quota < nvec)
+ return quota;
+
+ if (type == PCI_CAP_ID_MSIX && check_msix_entries(pdev))
+ return -EINVAL;
+
+ /*
+ * Firmware currently refuses any non-power-of-two allocation,
+ * so we round up if the quota will allow it.
+ */
+ if (type == PCI_CAP_ID_MSIX) {
+ int m = roundup_pow_of_two(nvec);
+ quota = msi_quota_for_device(pdev, m);
+
+ if (quota >= m)
+ nvec = m;
+ }
+
+ pdn = pci_get_pdn(pdev);
+
+ /*
+ * Try the new, more explicit firmware interface; if that fails, fall
+ * back to the old interface. The old interface is known to never
+ * return MSI-Xs.
+ */
+again:
+ if (type == PCI_CAP_ID_MSI) {
+ if (pdev->no_64bit_msi) {
+ rc = rtas_change_msi(pdn, RTAS_CHANGE_32MSI_FN, nvec);
+ if (rc < 0) {
+ /*
+ * We only want to run the 32 bit MSI hack below if
+ * the max bus speed is Gen2 speed
+ */
+ if (pdev->bus->max_bus_speed != PCIE_SPEED_5_0GT)
+ return rc;
+
+ use_32bit_msi_hack = 1;
+ }
+ } else
+ rc = -1;
+
+ if (rc < 0)
+ rc = rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, nvec);
+
+ if (rc < 0) {
+ pr_debug("rtas_msi: trying the old firmware call.\n");
+ rc = rtas_change_msi(pdn, RTAS_CHANGE_FN, nvec);
+ }
+
+ if (use_32bit_msi_hack && rc > 0)
+ rtas_hack_32bit_msi_gen2(pdev);
+ } else
+ rc = rtas_change_msi(pdn, RTAS_CHANGE_MSIX_FN, nvec);
+
+ if (rc != nvec) {
+ if (nvec != nvec_in) {
+ nvec = nvec_in;
+ goto again;
+ }
+ pr_debug("rtas_msi: rtas_change_msi() failed\n");
+ return rc;
+ }
+
+ i = 0;
+ for_each_pci_msi_entry(entry, pdev) {
+ hwirq = rtas_query_irq_number(pdn, i++);
+ if (hwirq < 0) {
+ pr_debug("rtas_msi: error (%d) getting hwirq\n", rc);
+ return hwirq;
+ }
+
+ virq = irq_create_mapping(NULL, hwirq);
+
+ if (!virq) {
+ pr_debug("rtas_msi: Failed mapping hwirq %d\n", hwirq);
+ return -ENOSPC;
+ }
+
+ dev_dbg(&pdev->dev, "rtas_msi: allocated virq %d\n", virq);
+ irq_set_msi_desc(virq, entry);
+
+ /* Read config space back so we can restore after reset */
+ __pci_read_msi_msg(entry, &msg);
+ entry->msg = msg;
+ }
+
+ return 0;
+}
+
+static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev)
+{
+ /* No LSI -> leave MSIs (if any) configured */
+ if (!pdev->irq) {
+ dev_dbg(&pdev->dev, "rtas_msi: no LSI, nothing to do.\n");
+ return;
+ }
+
+ /* No MSI -> MSIs can't have been assigned by fw, leave LSI */
+ if (check_req_msi(pdev, 1) && check_req_msix(pdev, 1)) {
+ dev_dbg(&pdev->dev, "rtas_msi: no req#msi/x, nothing to do.\n");
+ return;
+ }
+
+ dev_dbg(&pdev->dev, "rtas_msi: disabling existing MSI.\n");
+ rtas_disable_msi(pdev);
+}
+
+static int rtas_msi_init(void)
+{
+ struct pci_controller *phb;
+
+ query_token = rtas_token("ibm,query-interrupt-source-number");
+ change_token = rtas_token("ibm,change-msi");
+
+ if ((query_token == RTAS_UNKNOWN_SERVICE) ||
+ (change_token == RTAS_UNKNOWN_SERVICE)) {
+ pr_debug("rtas_msi: no RTAS tokens, no MSI support.\n");
+ return -1;
+ }
+
+ pr_debug("rtas_msi: Registering RTAS MSI callbacks.\n");
+
+ WARN_ON(pseries_pci_controller_ops.setup_msi_irqs);
+ pseries_pci_controller_ops.setup_msi_irqs = rtas_setup_msi_irqs;
+ pseries_pci_controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs;
+
+ list_for_each_entry(phb, &hose_list, list_node) {
+ WARN_ON(phb->controller_ops.setup_msi_irqs);
+ phb->controller_ops.setup_msi_irqs = rtas_setup_msi_irqs;
+ phb->controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs;
+ }
+
+ WARN_ON(ppc_md.pci_irq_fixup);
+ ppc_md.pci_irq_fixup = rtas_msi_pci_irq_fixup;
+
+ return 0;
+}
+machine_arch_initcall(pseries, rtas_msi_init);
diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
new file mode 100644
index 000000000..69cedc1b3
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -0,0 +1,247 @@
+/*
+ * c 2001 PPC 64 Team, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * /dev/nvram driver for PPC64
+ *
+ * This perhaps should live in drivers/char
+ */
+
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/uaccess.h>
+#include <asm/nvram.h>
+#include <asm/rtas.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+
+/* Max bytes to read/write in one go */
+#define NVRW_CNT 0x20
+
+static unsigned int nvram_size;
+static int nvram_fetch, nvram_store;
+static char nvram_buf[NVRW_CNT]; /* assume this is in the first 4GB */
+static DEFINE_SPINLOCK(nvram_lock);
+
+/* See clobbering_unread_rtas_event() */
+#define NVRAM_RTAS_READ_TIMEOUT 5 /* seconds */
+static time64_t last_unread_rtas_event; /* timestamp */
+
+#ifdef CONFIG_PSTORE
+time64_t last_rtas_event;
+#endif
+
+static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index)
+{
+ unsigned int i;
+ unsigned long len;
+ int done;
+ unsigned long flags;
+ char *p = buf;
+
+
+ if (nvram_size == 0 || nvram_fetch == RTAS_UNKNOWN_SERVICE)
+ return -ENODEV;
+
+ if (*index >= nvram_size)
+ return 0;
+
+ i = *index;
+ if (i + count > nvram_size)
+ count = nvram_size - i;
+
+ spin_lock_irqsave(&nvram_lock, flags);
+
+ for (; count != 0; count -= len) {
+ len = count;
+ if (len > NVRW_CNT)
+ len = NVRW_CNT;
+
+ if ((rtas_call(nvram_fetch, 3, 2, &done, i, __pa(nvram_buf),
+ len) != 0) || len != done) {
+ spin_unlock_irqrestore(&nvram_lock, flags);
+ return -EIO;
+ }
+
+ memcpy(p, nvram_buf, len);
+
+ p += len;
+ i += len;
+ }
+
+ spin_unlock_irqrestore(&nvram_lock, flags);
+
+ *index = i;
+ return p - buf;
+}
+
+static ssize_t pSeries_nvram_write(char *buf, size_t count, loff_t *index)
+{
+ unsigned int i;
+ unsigned long len;
+ int done;
+ unsigned long flags;
+ const char *p = buf;
+
+ if (nvram_size == 0 || nvram_store == RTAS_UNKNOWN_SERVICE)
+ return -ENODEV;
+
+ if (*index >= nvram_size)
+ return 0;
+
+ i = *index;
+ if (i + count > nvram_size)
+ count = nvram_size - i;
+
+ spin_lock_irqsave(&nvram_lock, flags);
+
+ for (; count != 0; count -= len) {
+ len = count;
+ if (len > NVRW_CNT)
+ len = NVRW_CNT;
+
+ memcpy(nvram_buf, p, len);
+
+ if ((rtas_call(nvram_store, 3, 2, &done, i, __pa(nvram_buf),
+ len) != 0) || len != done) {
+ spin_unlock_irqrestore(&nvram_lock, flags);
+ return -EIO;
+ }
+
+ p += len;
+ i += len;
+ }
+ spin_unlock_irqrestore(&nvram_lock, flags);
+
+ *index = i;
+ return p - buf;
+}
+
+static ssize_t pSeries_nvram_get_size(void)
+{
+ return nvram_size ? nvram_size : -ENODEV;
+}
+
+/* nvram_write_error_log
+ *
+ * We need to buffer the error logs into nvram to ensure that we have
+ * the failure information to decode.
+ */
+int nvram_write_error_log(char * buff, int length,
+ unsigned int err_type, unsigned int error_log_cnt)
+{
+ int rc = nvram_write_os_partition(&rtas_log_partition, buff, length,
+ err_type, error_log_cnt);
+ if (!rc) {
+ last_unread_rtas_event = ktime_get_real_seconds();
+#ifdef CONFIG_PSTORE
+ last_rtas_event = ktime_get_real_seconds();
+#endif
+ }
+
+ return rc;
+}
+
+/* nvram_read_error_log
+ *
+ * Reads the error log from nvram, up to 'length' bytes.
+ */
+int nvram_read_error_log(char *buff, int length,
+ unsigned int *err_type, unsigned int *error_log_cnt)
+{
+ return nvram_read_partition(&rtas_log_partition, buff, length,
+ err_type, error_log_cnt);
+}
+
+/* This doesn't actually zero anything, but it sets the event_logged
+ * word to indicate that this event is safely in syslog.
+ */
+int nvram_clear_error_log(void)
+{
+ loff_t tmp_index;
+ int clear_word = ERR_FLAG_ALREADY_LOGGED;
+ int rc;
+
+ if (rtas_log_partition.index == -1)
+ return -1;
+
+ tmp_index = rtas_log_partition.index;
+
+ rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index);
+ if (rc <= 0) {
+ printk(KERN_ERR "nvram_clear_error_log: Failed nvram_write (%d)\n", rc);
+ return rc;
+ }
+ last_unread_rtas_event = 0;
+
+ return 0;
+}
+
+/*
+ * Are we using the ibm,rtas-log for oops/panic reports? And if so,
+ * would logging this oops/panic overwrite an RTAS event that rtas_errd
+ * hasn't had a chance to read and process? Return 1 if so, else 0.
+ *
+ * We assume that if rtas_errd hasn't read the RTAS event in
+ * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to.
+ */
+int clobbering_unread_rtas_event(void)
+{
+ return (oops_log_partition.index == rtas_log_partition.index
+ && last_unread_rtas_event
+ && ktime_get_real_seconds() - last_unread_rtas_event <=
+ NVRAM_RTAS_READ_TIMEOUT);
+}
+
+static int __init pseries_nvram_init_log_partitions(void)
+{
+ int rc;
+
+ /* Scan nvram for partitions */
+ nvram_scan_partitions();
+
+ rc = nvram_init_os_partition(&rtas_log_partition);
+ nvram_init_oops_partition(rc == 0);
+ return 0;
+}
+machine_arch_initcall(pseries, pseries_nvram_init_log_partitions);
+
+int __init pSeries_nvram_init(void)
+{
+ struct device_node *nvram;
+ const __be32 *nbytes_p;
+ unsigned int proplen;
+
+ nvram = of_find_node_by_type(NULL, "nvram");
+ if (nvram == NULL)
+ return -ENODEV;
+
+ nbytes_p = of_get_property(nvram, "#bytes", &proplen);
+ if (nbytes_p == NULL || proplen != sizeof(unsigned int)) {
+ of_node_put(nvram);
+ return -EIO;
+ }
+
+ nvram_size = be32_to_cpup(nbytes_p);
+
+ nvram_fetch = rtas_token("nvram-fetch");
+ nvram_store = rtas_token("nvram-store");
+ printk(KERN_INFO "PPC64 nvram contains %d bytes\n", nvram_size);
+ of_node_put(nvram);
+
+ ppc_md.nvram_read = pSeries_nvram_read;
+ ppc_md.nvram_write = pSeries_nvram_write;
+ ppc_md.nvram_size = pSeries_nvram_get_size;
+
+ return 0;
+}
+
diff --git a/arch/powerpc/platforms/pseries/of_helpers.c b/arch/powerpc/platforms/pseries/of_helpers.c
new file mode 100644
index 000000000..6df192f38
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/of_helpers.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/string.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+#include <asm/prom.h>
+
+#include "of_helpers.h"
+
+/**
+ * pseries_of_derive_parent - basically like dirname(1)
+ * @path: the full_name of a node to be added to the tree
+ *
+ * Returns the node which should be the parent of the node
+ * described by path. E.g., for path = "/foo/bar", returns
+ * the node with full_name = "/foo".
+ */
+struct device_node *pseries_of_derive_parent(const char *path)
+{
+ struct device_node *parent;
+ char *parent_path = "/";
+ const char *tail;
+
+ /* We do not want the trailing '/' character */
+ tail = kbasename(path) - 1;
+
+ /* reject if path is "/" */
+ if (!strcmp(path, "/"))
+ return ERR_PTR(-EINVAL);
+
+ if (tail > path) {
+ parent_path = kstrndup(path, tail - path, GFP_KERNEL);
+ if (!parent_path)
+ return ERR_PTR(-ENOMEM);
+ }
+ parent = of_find_node_by_path(parent_path);
+ if (strcmp(parent_path, "/"))
+ kfree(parent_path);
+ return parent ? parent : ERR_PTR(-EINVAL);
+}
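+
+/*
+ * Hypothetical usage sketch (illustration only, not part of this file):
+ * callers are expected to handle ERR_PTR() returns and drop the reference
+ * once they are done with the parent node.
+ */
+#if 0
+static int example_find_parent(const char *path)
+{
+	struct device_node *parent = pseries_of_derive_parent(path);
+
+	if (IS_ERR(parent))
+		return PTR_ERR(parent);
+	/* e.g. for path "/foo/bar", parent is the "/foo" node */
+	of_node_put(parent);
+	return 0;
+}
+#endif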
+
+
+/* Helper routines to convert between drc_index and cpu numbers */
+
+int of_read_drc_info_cell(struct property **prop, const __be32 **curval,
+ struct of_drc_info *data)
+{
+ const char *p;
+ const __be32 *p2;
+
+ if (!data)
+ return -EINVAL;
+
+ /* Get drc-type:encode-string */
+ p = data->drc_type = (char*) (*curval);
+ p = of_prop_next_string(*prop, p);
+ if (!p)
+ return -EINVAL;
+
+ /* Get drc-name-prefix:encode-string */
+ data->drc_name_prefix = (char *)p;
+ p = of_prop_next_string(*prop, p);
+ if (!p)
+ return -EINVAL;
+
+ /* Get drc-index-start:encode-int */
+ p2 = (const __be32 *)p;
+ p2 = of_prop_next_u32(*prop, p2, &data->drc_index_start);
+ if (!p2)
+ return -EINVAL;
+
+ /* Get drc-name-suffix-start:encode-int */
+ p2 = of_prop_next_u32(*prop, p2, &data->drc_name_suffix_start);
+ if (!p2)
+ return -EINVAL;
+
+ /* Get number-sequential-elements:encode-int */
+ p2 = of_prop_next_u32(*prop, p2, &data->num_sequential_elems);
+ if (!p2)
+ return -EINVAL;
+
+ /* Get sequential-increment:encode-int */
+ p2 = of_prop_next_u32(*prop, p2, &data->sequential_inc);
+ if (!p2)
+ return -EINVAL;
+
+ /* Get drc-power-domain:encode-int */
+ p2 = of_prop_next_u32(*prop, p2, &data->drc_power_domain);
+ if (!p2)
+ return -EINVAL;
+
+ /* Should now know end of current entry */
+ (*curval) = (void *)p2;
+ data->last_drc_index = data->drc_index_start +
+ ((data->num_sequential_elems - 1) * data->sequential_inc);
+
+ return 0;
+}
+EXPORT_SYMBOL(of_read_drc_info_cell);
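+
+/*
+ * Illustrative sketch (not part of this file) of how callers walk an
+ * "ibm,drc-info" property with of_read_drc_info_cell(); the pattern
+ * mirrors the pseries_energy and hotplug users.
+ */
+#if 0
+static void example_walk_drc_info(struct device_node *dn)
+{
+	struct property *info;
+	struct of_drc_info drc;
+	const __be32 *value;
+	u32 entries, i;
+
+	info = of_find_property(dn, "ibm,drc-info", NULL);
+	if (!info)
+		return;
+
+	/* The first cell is the number of drc-info entries that follow */
+	value = of_prop_next_u32(info, NULL, &entries);
+	if (!value)
+		return;
+
+	for (i = 0; i < entries; i++) {
+		if (of_read_drc_info_cell(&info, &value, &drc))
+			break;
+		pr_debug("drc type %s, first index 0x%x, last index 0x%x\n",
+			 drc.drc_type, drc.drc_index_start,
+			 drc.last_drc_index);
+	}
+}
+#endif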
diff --git a/arch/powerpc/platforms/pseries/of_helpers.h b/arch/powerpc/platforms/pseries/of_helpers.h
new file mode 100644
index 000000000..decad6553
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/of_helpers.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PSERIES_OF_HELPERS_H
+#define _PSERIES_OF_HELPERS_H
+
+#include <linux/of.h>
+
+struct device_node *pseries_of_derive_parent(const char *path);
+
+#endif /* _PSERIES_OF_HELPERS_H */
diff --git a/arch/powerpc/platforms/pseries/offline_states.h b/arch/powerpc/platforms/pseries/offline_states.h
new file mode 100644
index 000000000..51414aee2
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/offline_states.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _OFFLINE_STATES_H_
+#define _OFFLINE_STATES_H_
+
+/* CPU offline states go here */
+enum cpu_state_vals {
+ CPU_STATE_OFFLINE,
+ CPU_STATE_INACTIVE,
+ CPU_STATE_ONLINE,
+ CPU_MAX_OFFLINE_STATES
+};
+
+#ifdef CONFIG_HOTPLUG_CPU
+extern enum cpu_state_vals get_cpu_current_state(int cpu);
+extern void set_cpu_current_state(int cpu, enum cpu_state_vals state);
+extern void set_preferred_offline_state(int cpu, enum cpu_state_vals state);
+extern void set_default_offline_state(int cpu);
+#else
+static inline enum cpu_state_vals get_cpu_current_state(int cpu)
+{
+ return CPU_STATE_ONLINE;
+}
+
+static inline void set_cpu_current_state(int cpu, enum cpu_state_vals state)
+{
+}
+
+static inline void set_preferred_offline_state(int cpu, enum cpu_state_vals state)
+{
+}
+
+static inline void set_default_offline_state(int cpu)
+{
+}
+#endif
+
+extern enum cpu_state_vals get_preferred_offline_state(int cpu);
+#endif
diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
new file mode 100644
index 000000000..eab96637d
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -0,0 +1,345 @@
+/*
+ * Copyright (C) 2001 Dave Engebretsen, IBM Corporation
+ * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * pSeries specific routines for PCI.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/eeh.h>
+#include <asm/pci-bridge.h>
+#include <asm/prom.h>
+#include <asm/ppc-pci.h>
+#include "pseries.h"
+
+#if 0
+void pcibios_name_device(struct pci_dev *dev)
+{
+ struct device_node *dn;
+
+ /*
+ * Add IBM loc code (slot) as a prefix to the device names for service
+ */
+ dn = pci_device_to_OF_node(dev);
+ if (dn) {
+ const char *loc_code = of_get_property(dn, "ibm,loc-code",
+ NULL);
+ if (loc_code) {
+ int loc_len = strlen(loc_code);
+ if (loc_len < sizeof(dev->dev.name)) {
+ memmove(dev->dev.name+loc_len+1, dev->dev.name,
+ sizeof(dev->dev.name)-loc_len-1);
+ memcpy(dev->dev.name, loc_code, loc_len);
+ dev->dev.name[loc_len] = ' ';
+ dev->dev.name[sizeof(dev->dev.name)-1] = '\0';
+ }
+ }
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pcibios_name_device);
+#endif
+
+#ifdef CONFIG_PCI_IOV
+#define MAX_VFS_FOR_MAP_PE 256
+struct pe_map_bar_entry {
+ __be64 bar; /* Input: Virtual Function BAR */
+ __be16 rid; /* Input: Virtual Function Router ID */
+ __be16 pe_num; /* Output: Virtual Function PE Number */
+ __be32 reserved; /* Reserved Space */
+};
+
+int pseries_send_map_pe(struct pci_dev *pdev,
+ u16 num_vfs,
+ struct pe_map_bar_entry *vf_pe_array)
+{
+ struct pci_dn *pdn;
+ int rc;
+ unsigned long buid, addr;
+ int ibm_map_pes = rtas_token("ibm,open-sriov-map-pe-number");
+
+ if (ibm_map_pes == RTAS_UNKNOWN_SERVICE)
+ return -EINVAL;
+
+ pdn = pci_get_pdn(pdev);
+ addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
+ buid = pdn->phb->buid;
+ spin_lock(&rtas_data_buf_lock);
+ memcpy(rtas_data_buf, vf_pe_array,
+ RTAS_DATA_BUF_SIZE);
+ rc = rtas_call(ibm_map_pes, 5, 1, NULL, addr,
+ BUID_HI(buid), BUID_LO(buid),
+ rtas_data_buf,
+ num_vfs * sizeof(struct pe_map_bar_entry));
+ memcpy(vf_pe_array, rtas_data_buf, RTAS_DATA_BUF_SIZE);
+ spin_unlock(&rtas_data_buf_lock);
+
+ if (rc)
+ dev_err(&pdev->dev,
+ "%s: Failed to associate pes PE#%lx, rc=%x\n",
+ __func__, addr, rc);
+
+ return rc;
+}
+
+void pseries_set_pe_num(struct pci_dev *pdev, u16 vf_index, __be16 pe_num)
+{
+ struct pci_dn *pdn;
+
+ pdn = pci_get_pdn(pdev);
+ pdn->pe_num_map[vf_index] = be16_to_cpu(pe_num);
+ dev_dbg(&pdev->dev, "VF %04x:%02x:%02x.%x associated with PE#%x\n",
+ pci_domain_nr(pdev->bus),
+ pdev->bus->number,
+ PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)),
+ PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)),
+ pdn->pe_num_map[vf_index]);
+}
+
+int pseries_associate_pes(struct pci_dev *pdev, u16 num_vfs)
+{
+ struct pci_dn *pdn;
+ int i, rc, vf_index;
+ struct pe_map_bar_entry *vf_pe_array;
+ struct resource *res;
+ u64 size;
+
+ vf_pe_array = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+ if (!vf_pe_array)
+ return -ENOMEM;
+
+ pdn = pci_get_pdn(pdev);
+ /* create firmware structure to associate pes */
+ for (vf_index = 0; vf_index < num_vfs; vf_index++) {
+ pdn->pe_num_map[vf_index] = IODA_INVALID_PE;
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &pdev->resource[i + PCI_IOV_RESOURCES];
+ if (!res->parent)
+ continue;
+ size = pcibios_iov_resource_alignment(pdev, i +
+ PCI_IOV_RESOURCES);
+ vf_pe_array[vf_index].bar =
+ cpu_to_be64(res->start + size * vf_index);
+ vf_pe_array[vf_index].rid =
+ cpu_to_be16((pci_iov_virtfn_bus(pdev, vf_index)
+ << 8) | pci_iov_virtfn_devfn(pdev,
+ vf_index));
+ vf_pe_array[vf_index].pe_num =
+ cpu_to_be16(IODA_INVALID_PE);
+ }
+ }
+
+ rc = pseries_send_map_pe(pdev, num_vfs, vf_pe_array);
+ /* Only zero is success */
+ if (!rc)
+ for (vf_index = 0; vf_index < num_vfs; vf_index++)
+ pseries_set_pe_num(pdev, vf_index,
+ vf_pe_array[vf_index].pe_num);
+
+ kfree(vf_pe_array);
+ return rc;
+}
+
+int pseries_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+ struct pci_dn *pdn;
+ int rc;
+ const int *max_vfs;
+ int max_config_vfs;
+ struct device_node *dn = pci_device_to_OF_node(pdev);
+
+ max_vfs = of_get_property(dn, "ibm,number-of-configurable-vfs", NULL);
+
+ if (!max_vfs)
+ return -EINVAL;
+
+ /* First integer stores max config */
+ max_config_vfs = of_read_number(&max_vfs[0], 1);
+ if (max_config_vfs < num_vfs && num_vfs > MAX_VFS_FOR_MAP_PE) {
+ dev_err(&pdev->dev,
+ "Num VFs %x > %x Configurable VFs\n",
+ num_vfs, (num_vfs > MAX_VFS_FOR_MAP_PE) ?
+ MAX_VFS_FOR_MAP_PE : max_config_vfs);
+ return -EINVAL;
+ }
+
+ pdn = pci_get_pdn(pdev);
+ pdn->pe_num_map = kmalloc_array(num_vfs,
+ sizeof(*pdn->pe_num_map),
+ GFP_KERNEL);
+ if (!pdn->pe_num_map)
+ return -ENOMEM;
+
+ rc = pseries_associate_pes(pdev, num_vfs);
+
+ /* Anything other than zero is failure */
+ if (rc) {
+ dev_err(&pdev->dev, "Failure to enable sriov: %x\n", rc);
+ kfree(pdn->pe_num_map);
+ } else {
+ pci_vf_drivers_autoprobe(pdev, false);
+ }
+
+ return rc;
+}
+
+int pseries_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+ /* Allocate PCI data */
+ add_dev_pci_data(pdev);
+ return pseries_pci_sriov_enable(pdev, num_vfs);
+}
+
+int pseries_pcibios_sriov_disable(struct pci_dev *pdev)
+{
+ struct pci_dn *pdn;
+
+ pdn = pci_get_pdn(pdev);
+ /* Releasing pe_num_map */
+ kfree(pdn->pe_num_map);
+ /* Release PCI data */
+ remove_dev_pci_data(pdev);
+ pci_vf_drivers_autoprobe(pdev, true);
+ return 0;
+}
+#endif
+
+static void __init pSeries_request_regions(void)
+{
+ if (!isa_io_base)
+ return;
+
+ request_region(0x20,0x20,"pic1");
+ request_region(0xa0,0x20,"pic2");
+ request_region(0x00,0x20,"dma1");
+ request_region(0x40,0x20,"timer");
+ request_region(0x80,0x10,"dma page reg");
+ request_region(0xc0,0x20,"dma2");
+}
+
+void __init pSeries_final_fixup(void)
+{
+ pSeries_request_regions();
+
+ eeh_addr_cache_build();
+
+#ifdef CONFIG_PCI_IOV
+ ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable;
+ ppc_md.pcibios_sriov_disable = pseries_pcibios_sriov_disable;
+#endif
+}
+
+/*
+ * Assume the winbond 82c105 is the IDE controller on a
+ * p610/p615/p630. We should probably be more careful in case
+ * someone tries to plug in a similar adapter.
+ */
+static void fixup_winbond_82c105(struct pci_dev* dev)
+{
+ int i;
+ unsigned int reg;
+
+ if (!machine_is(pseries))
+ return;
+
+ printk("Using INTC for W82c105 IDE controller.\n");
+ pci_read_config_dword(dev, 0x40, &reg);
+ /* Enable LEGIRQ to use INTC instead of ISA interrupts */
+ pci_write_config_dword(dev, 0x40, reg | (1<<11));
+
+ for (i = 0; i < DEVICE_COUNT_RESOURCE; ++i) {
+ /* zap the 2nd function of the winbond chip */
+ if (dev->resource[i].flags & IORESOURCE_IO
+ && dev->bus->number == 0 && dev->devfn == 0x81)
+ dev->resource[i].flags &= ~IORESOURCE_IO;
+ if (dev->resource[i].start == 0 && dev->resource[i].end) {
+ dev->resource[i].flags = 0;
+ dev->resource[i].end = 0;
+ }
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_WINBOND, PCI_DEVICE_ID_WINBOND_82C105,
+ fixup_winbond_82c105);
+
+int pseries_root_bridge_prepare(struct pci_host_bridge *bridge)
+{
+ struct device_node *dn, *pdn;
+ struct pci_bus *bus;
+ u32 pcie_link_speed_stats[2];
+ int rc;
+
+ bus = bridge->bus;
+
+ /* Rely on the pcibios_free_controller_deferred() callback. */
+ pci_set_host_bridge_release(bridge, pcibios_free_controller_deferred,
+ (void *) pci_bus_to_host(bus));
+
+ dn = pcibios_get_phb_of_node(bus);
+ if (!dn)
+ return 0;
+
+ for (pdn = dn; pdn != NULL; pdn = of_get_next_parent(pdn)) {
+ rc = of_property_read_u32_array(pdn,
+ "ibm,pcie-link-speed-stats",
+ &pcie_link_speed_stats[0], 2);
+ if (!rc)
+ break;
+ }
+
+ of_node_put(pdn);
+
+ if (rc) {
+ pr_debug("no ibm,pcie-link-speed-stats property\n");
+ return 0;
+ }
+
+ switch (pcie_link_speed_stats[0]) {
+ case 0x01:
+ bus->max_bus_speed = PCIE_SPEED_2_5GT;
+ break;
+ case 0x02:
+ bus->max_bus_speed = PCIE_SPEED_5_0GT;
+ break;
+ case 0x04:
+ bus->max_bus_speed = PCIE_SPEED_8_0GT;
+ break;
+ default:
+ bus->max_bus_speed = PCI_SPEED_UNKNOWN;
+ break;
+ }
+
+ switch (pcie_link_speed_stats[1]) {
+ case 0x01:
+ bus->cur_bus_speed = PCIE_SPEED_2_5GT;
+ break;
+ case 0x02:
+ bus->cur_bus_speed = PCIE_SPEED_5_0GT;
+ break;
+ case 0x04:
+ bus->cur_bus_speed = PCIE_SPEED_8_0GT;
+ break;
+ default:
+ bus->cur_bus_speed = PCI_SPEED_UNKNOWN;
+ break;
+ }
+
+ return 0;
+}
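+
+/*
+ * For reference (derived from the switches above): the two cells of
+ * "ibm,pcie-link-speed-stats" encode the maximum and the current link
+ * speed, with 0x01 = 2.5 GT/s, 0x02 = 5.0 GT/s and 0x04 = 8.0 GT/s.
+ */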
diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c
new file mode 100644
index 000000000..afca4b737
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
@@ -0,0 +1,119 @@
+/*
+ * PCI Dynamic LPAR, PCI Hot Plug and PCI EEH recovery code
+ * for RPA-compliant PPC64 platform.
+ * Copyright (C) 2003 Linda Xie <lxie@us.ibm.com>
+ * Copyright (C) 2005 International Business Machines
+ *
+ * Updates, 2005, John Rose <johnrose@austin.ibm.com>
+ * Updates, 2005, Linas Vepstas <linas@austin.ibm.com>
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/pci.h>
+#include <linux/export.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/firmware.h>
+#include <asm/eeh.h>
+
+#include "pseries.h"
+
+struct pci_controller *init_phb_dynamic(struct device_node *dn)
+{
+ struct pci_controller *phb;
+
+ pr_debug("PCI: Initializing new hotplug PHB %pOF\n", dn);
+
+ phb = pcibios_alloc_controller(dn);
+ if (!phb)
+ return NULL;
+ rtas_setup_phb(phb);
+ pci_process_bridge_OF_ranges(phb, dn, 0);
+ phb->controller_ops = pseries_pci_controller_ops;
+
+ pci_devs_phb_init_dynamic(phb);
+
+ /* Create EEH devices for the PHB */
+ eeh_dev_phb_init_dynamic(phb);
+
+ if (dn->child)
+ eeh_add_device_tree_early(PCI_DN(dn));
+
+ pcibios_scan_phb(phb);
+ pcibios_finish_adding_to_bus(phb->bus);
+
+ return phb;
+}
+EXPORT_SYMBOL_GPL(init_phb_dynamic);
+
+/* RPA-specific bits for removing PHBs */
+int remove_phb_dynamic(struct pci_controller *phb)
+{
+ struct pci_bus *b = phb->bus;
+ struct pci_host_bridge *host_bridge = to_pci_host_bridge(b->bridge);
+ struct resource *res;
+ int rc, i;
+
+ pr_debug("PCI: Removing PHB %04x:%02x...\n",
+ pci_domain_nr(b), b->number);
+
+ /* We cannot remove a root bus that has children */
+ if (!(list_empty(&b->children) && list_empty(&b->devices)))
+ return -EBUSY;
+
+ /* We -know- there aren't any child devices anymore at this stage
+ * and thus, we can safely unmap the IO space as it's not in use
+ */
+ res = &phb->io_resource;
+ if (res->flags & IORESOURCE_IO) {
+ rc = pcibios_unmap_io_space(b);
+ if (rc) {
+ printk(KERN_ERR "%s: failed to unmap IO on bus %s\n",
+ __func__, b->name);
+ return 1;
+ }
+ }
+
+ /* Remove the PCI bus and unregister the bridge device from sysfs */
+ phb->bus = NULL;
+ pci_remove_bus(b);
+ host_bridge->bus = NULL;
+ device_unregister(&host_bridge->dev);
+
+ /* Now release the IO resource */
+ if (res->flags & IORESOURCE_IO)
+ release_resource(res);
+
+ /* Release memory resources */
+ for (i = 0; i < 3; ++i) {
+ res = &phb->mem_resources[i];
+ if (!(res->flags & IORESOURCE_MEM))
+ continue;
+ release_resource(res);
+ }
+
+ /*
+ * The pci_controller data structure is freed by
+ * the pcibios_free_controller_deferred() callback;
+ * see pseries_root_bridge_prepare().
+ */
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(remove_phb_dynamic);
diff --git a/arch/powerpc/platforms/pseries/power.c b/arch/powerpc/platforms/pseries/power.c
new file mode 100644
index 000000000..a4a0b57d1
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/power.c
@@ -0,0 +1,84 @@
+/*
+ * Interface for power-management for ppc64 compliant platform
+ *
+ * Manish Ahuja <mahuja@us.ibm.com>
+ *
+ * Feb 2007
+ *
+ * Copyright (C) 2007 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <asm/machdep.h>
+
+#include "pseries.h"
+
+unsigned long rtas_poweron_auto; /* default and normal state is 0 */
+
+static ssize_t auto_poweron_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", rtas_poweron_auto);
+}
+
+static ssize_t auto_poweron_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ int ret;
+ unsigned long ups_restart;
+ ret = sscanf(buf, "%lu", &ups_restart);
+
+ if ((ret == 1) && ((ups_restart == 1) || (ups_restart == 0))){
+ rtas_poweron_auto = ups_restart;
+ return n;
+ }
+ return -EINVAL;
+}
+
+static struct kobj_attribute auto_poweron_attr =
+ __ATTR(auto_poweron, 0644, auto_poweron_show, auto_poweron_store);
+
+#ifndef CONFIG_PM
+struct kobject *power_kobj;
+
+static struct attribute *g[] = {
+ &auto_poweron_attr.attr,
+ NULL,
+};
+
+static struct attribute_group attr_group = {
+ .attrs = g,
+};
+
+static int __init pm_init(void)
+{
+ power_kobj = kobject_create_and_add("power", NULL);
+ if (!power_kobj)
+ return -ENOMEM;
+ return sysfs_create_group(power_kobj, &attr_group);
+}
+machine_core_initcall(pseries, pm_init);
+#else
+static int __init apo_pm_init(void)
+{
+ return (sysfs_create_file(power_kobj, &auto_poweron_attr.attr));
+}
+machine_device_initcall(pseries, apo_pm_init);
+#endif
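+
+/*
+ * Illustrative note (not part of this file): the attribute appears as
+ * /sys/power/auto_poweron.  Writing "1" sets rtas_poweron_auto, which the
+ * power-off path consults to request automatic restart when power returns;
+ * writing "0" (the default) disables it.
+ */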
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
new file mode 100644
index 000000000..60db2ee51
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2006 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _PSERIES_PSERIES_H
+#define _PSERIES_PSERIES_H
+
+#include <linux/interrupt.h>
+#include <asm/rtas.h>
+
+struct device_node;
+
+extern void request_event_sources_irqs(struct device_node *np,
+ irq_handler_t handler, const char *name);
+
+#include <linux/of.h>
+
+struct pt_regs;
+
+extern int pSeries_system_reset_exception(struct pt_regs *regs);
+extern int pSeries_machine_check_exception(struct pt_regs *regs);
+
+#ifdef CONFIG_SMP
+extern void smp_init_pseries(void);
+
+/* Get state of physical CPU from query_cpu_stopped */
+int smp_query_cpu_stopped(unsigned int pcpu);
+#define QCSS_STOPPED 0
+#define QCSS_STOPPING 1
+#define QCSS_NOT_STOPPED 2
+#define QCSS_HARDWARE_ERROR -1
+#define QCSS_HARDWARE_BUSY -2
+#else
+static inline void smp_init_pseries(void) { };
+#endif
+
+extern void pseries_kexec_cpu_down(int crash_shutdown, int secondary);
+
+extern void pSeries_final_fixup(void);
+
+/* Poweron flag used for enabling auto ups restart */
+extern unsigned long rtas_poweron_auto;
+
+/* Provided by HVC VIO */
+extern void hvc_vio_init_early(void);
+
+/* Dynamic logical Partitioning/Mobility */
+extern void dlpar_free_cc_nodes(struct device_node *);
+extern void dlpar_free_cc_property(struct property *);
+extern struct device_node *dlpar_configure_connector(__be32,
+ struct device_node *);
+extern int dlpar_attach_node(struct device_node *, struct device_node *);
+extern int dlpar_detach_node(struct device_node *);
+extern int dlpar_acquire_drc(u32 drc_index);
+extern int dlpar_release_drc(u32 drc_index);
+
+void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog,
+ struct completion *hotplug_done, int *rc);
+#ifdef CONFIG_MEMORY_HOTPLUG
+int dlpar_memory(struct pseries_hp_errorlog *hp_elog);
+#else
+static inline int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+int dlpar_cpu(struct pseries_hp_errorlog *hp_elog);
+#else
+static inline int dlpar_cpu(struct pseries_hp_errorlog *hp_elog)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+/* PCI root bridge prepare function override for pseries */
+struct pci_host_bridge;
+int pseries_root_bridge_prepare(struct pci_host_bridge *bridge);
+
+extern struct pci_controller_ops pseries_pci_controller_ops;
+
+unsigned long pseries_memory_block_size(void);
+
+extern int CMO_PrPSP;
+extern int CMO_SecPSP;
+extern unsigned long CMO_PageSize;
+
+static inline int cmo_get_primary_psp(void)
+{
+ return CMO_PrPSP;
+}
+
+static inline int cmo_get_secondary_psp(void)
+{
+ return CMO_SecPSP;
+}
+
+static inline unsigned long cmo_get_page_size(void)
+{
+ return CMO_PageSize;
+}
+
+int dlpar_workqueue_init(void);
+
+void pseries_setup_rfi_flush(void);
+
+#endif /* _PSERIES_PSERIES_H */
diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c b/arch/powerpc/platforms/pseries/pseries_energy.c
new file mode 100644
index 000000000..921f12182
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/pseries_energy.c
@@ -0,0 +1,368 @@
+/*
+ * POWER platform energy management driver
+ * Copyright (C) 2010 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This pseries platform device driver provides access to
+ * platform energy management capabilities.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/of.h>
+#include <asm/cputhreads.h>
+#include <asm/page.h>
+#include <asm/hvcall.h>
+#include <asm/firmware.h>
+#include <asm/prom.h>
+
+
+#define MODULE_VERS "1.0"
+#define MODULE_NAME "pseries_energy"
+
+/* Driver flags */
+
+static int sysfs_entries;
+
+/* Helper routines */
+
+/* Helper routines to convert between drc_index and cpu numbers */
+
+static u32 cpu_to_drc_index(int cpu)
+{
+ struct device_node *dn = NULL;
+ int thread_index;
+ int rc = 1;
+ u32 ret = 0;
+
+ dn = of_find_node_by_path("/cpus");
+ if (dn == NULL)
+ goto err;
+
+ /* Convert logical cpu number to core number */
+ thread_index = cpu_core_index_of_thread(cpu);
+
+ if (firmware_has_feature(FW_FEATURE_DRC_INFO)) {
+ struct property *info = NULL;
+ struct of_drc_info drc;
+ int j;
+ u32 num_set_entries;
+ const __be32 *value;
+
+ info = of_find_property(dn, "ibm,drc-info", NULL);
+ if (info == NULL)
+ goto err_of_node_put;
+
+ value = of_prop_next_u32(info, NULL, &num_set_entries);
+ if (!value)
+ goto err_of_node_put;
+
+ for (j = 0; j < num_set_entries; j++) {
+
+ of_read_drc_info_cell(&info, &value, &drc);
+ if (strncmp(drc.drc_type, "CPU", 3))
+ goto err;
+
+ if (thread_index < drc.last_drc_index)
+ break;
+ }
+
+ ret = drc.drc_index_start + (thread_index * drc.sequential_inc);
+ } else {
+ u32 nr_drc_indexes, thread_drc_index;
+
+ /*
+ * The first element of ibm,drc-indexes array is the
+ * number of drc_indexes returned in the list. Hence
+ * thread_index+1 will get the drc_index corresponding
+ * to core number thread_index.
+ */
+ rc = of_property_read_u32_index(dn, "ibm,drc-indexes",
+ 0, &nr_drc_indexes);
+ if (rc)
+ goto err_of_node_put;
+
+ WARN_ON_ONCE(thread_index > nr_drc_indexes);
+ rc = of_property_read_u32_index(dn, "ibm,drc-indexes",
+ thread_index + 1,
+ &thread_drc_index);
+ if (rc)
+ goto err_of_node_put;
+
+ ret = thread_drc_index;
+ }
+
+ rc = 0;
+
+err_of_node_put:
+ of_node_put(dn);
+err:
+ if (rc)
+ printk(KERN_WARNING "cpu_to_drc_index(%d) failed", cpu);
+ return ret;
+}
+
+static int drc_index_to_cpu(u32 drc_index)
+{
+ struct device_node *dn = NULL;
+ const int *indexes;
+ int thread_index = 0, cpu = 0;
+ int rc = 1;
+
+ dn = of_find_node_by_path("/cpus");
+ if (dn == NULL)
+ goto err;
+
+ if (firmware_has_feature(FW_FEATURE_DRC_INFO)) {
+ struct property *info = NULL;
+ struct of_drc_info drc;
+ int j;
+ u32 num_set_entries;
+ const __be32 *value;
+
+ info = of_find_property(dn, "ibm,drc-info", NULL);
+ if (info == NULL)
+ goto err_of_node_put;
+
+ value = of_prop_next_u32(info, NULL, &num_set_entries);
+ if (!value)
+ goto err_of_node_put;
+
+ for (j = 0; j < num_set_entries; j++) {
+
+ of_read_drc_info_cell(&info, &value, &drc);
+ if (strncmp(drc.drc_type, "CPU", 3))
+ goto err;
+
+ if (drc_index > drc.last_drc_index) {
+ cpu += drc.num_sequential_elems;
+ continue;
+ }
+ cpu += ((drc_index - drc.drc_index_start) /
+ drc.sequential_inc);
+
+ thread_index = cpu_first_thread_of_core(cpu);
+ rc = 0;
+ break;
+ }
+ } else {
+ unsigned long int i;
+
+ indexes = of_get_property(dn, "ibm,drc-indexes", NULL);
+ if (indexes == NULL)
+ goto err_of_node_put;
+ /*
+ * First element in the array is the number of drc_indexes
+ * returned. Search through the list to find the matching
+ * drc_index and get the core number
+ */
+ for (i = 0; i < indexes[0]; i++) {
+ if (indexes[i + 1] == drc_index)
+ break;
+ }
+ /* Convert core number to logical cpu number */
+ thread_index = cpu_first_thread_of_core(i);
+ rc = 0;
+ }
+
+err_of_node_put:
+ of_node_put(dn);
+err:
+ if (rc)
+ printk(KERN_WARNING "drc_index_to_cpu(%d) failed", drc_index);
+ return thread_index;
+}
+
+/*
+ * The pseries hypervisor call H_BEST_ENERGY provides hints to the OS on
+ * preferred logical cpus to activate or deactivate for optimized
+ * energy consumption.
+ */
+
+#define FLAGS_MODE1 0x004E200000080E01UL
+#define FLAGS_MODE2 0x004E200000080401UL
+#define FLAGS_ACTIVATE 0x100
+
+static ssize_t get_best_energy_list(char *page, int activate)
+{
+ int rc, cnt, i, cpu;
+ unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+ unsigned long flags = 0;
+ u32 *buf_page;
+ char *s = page;
+
+ buf_page = (u32 *) get_zeroed_page(GFP_KERNEL);
+ if (!buf_page)
+ return -ENOMEM;
+
+ flags = FLAGS_MODE1;
+ if (activate)
+ flags |= FLAGS_ACTIVATE;
+
+ rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags, 0, __pa(buf_page),
+ 0, 0, 0, 0, 0, 0);
+ if (rc != H_SUCCESS) {
+ free_page((unsigned long) buf_page);
+ return -EINVAL;
+ }
+
+ cnt = retbuf[0];
+ for (i = 0; i < cnt; i++) {
+ cpu = drc_index_to_cpu(buf_page[2*i+1]);
+ if ((cpu_online(cpu) && !activate) ||
+ (!cpu_online(cpu) && activate))
+ s += sprintf(s, "%d,", cpu);
+ }
+ if (s > page) { /* Something to show */
+ s--; /* Suppress last comma */
+ s += sprintf(s, "\n");
+ }
+
+ free_page((unsigned long) buf_page);
+ return s-page;
+}
+
+static ssize_t get_best_energy_data(struct device *dev,
+ char *page, int activate)
+{
+ int rc;
+ unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+ unsigned long flags = 0;
+
+ flags = FLAGS_MODE2;
+ if (activate)
+ flags |= FLAGS_ACTIVATE;
+
+ rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags,
+ cpu_to_drc_index(dev->id),
+ 0, 0, 0, 0, 0, 0, 0);
+
+ if (rc != H_SUCCESS)
+ return -EINVAL;
+
+ return sprintf(page, "%lu\n", retbuf[1] >> 32);
+}
+
+/* Wrapper functions */
+
+static ssize_t cpu_activate_hint_list_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ return get_best_energy_list(page, 1);
+}
+
+static ssize_t cpu_deactivate_hint_list_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ return get_best_energy_list(page, 0);
+}
+
+static ssize_t percpu_activate_hint_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ return get_best_energy_data(dev, page, 1);
+}
+
+static ssize_t percpu_deactivate_hint_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ return get_best_energy_data(dev, page, 0);
+}
+
+/*
+ * Create sysfs interface:
+ * /sys/devices/system/cpu/pseries_activate_hint_list
+ * /sys/devices/system/cpu/pseries_deactivate_hint_list
+ * Comma-separated list of cpus to activate or deactivate
+ * /sys/devices/system/cpu/cpuN/pseries_activate_hint
+ * /sys/devices/system/cpu/cpuN/pseries_deactivate_hint
+ * Per-cpu value of the hint
+ */
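+/*
+ * Example (illustrative only): the list files return a comma-separated
+ * set of logical CPU numbers followed by a newline, e.g.
+ *
+ *   pseries_activate_hint_list: "4,5,6,7"
+ *
+ * while the per-cpu files report the raw hint value for that CPU.
+ */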
+
+static struct device_attribute attr_cpu_activate_hint_list =
+ __ATTR(pseries_activate_hint_list, 0444,
+ cpu_activate_hint_list_show, NULL);
+
+static struct device_attribute attr_cpu_deactivate_hint_list =
+ __ATTR(pseries_deactivate_hint_list, 0444,
+ cpu_deactivate_hint_list_show, NULL);
+
+static struct device_attribute attr_percpu_activate_hint =
+ __ATTR(pseries_activate_hint, 0444,
+ percpu_activate_hint_show, NULL);
+
+static struct device_attribute attr_percpu_deactivate_hint =
+ __ATTR(pseries_deactivate_hint, 0444,
+ percpu_deactivate_hint_show, NULL);
+
+static int __init pseries_energy_init(void)
+{
+ int cpu, err;
+ struct device *cpu_dev;
+
+ if (!firmware_has_feature(FW_FEATURE_BEST_ENERGY))
+ return 0; /* H_BEST_ENERGY hcall not supported */
+
+ /* Create the sysfs files */
+ err = device_create_file(cpu_subsys.dev_root,
+ &attr_cpu_activate_hint_list);
+ if (!err)
+ err = device_create_file(cpu_subsys.dev_root,
+ &attr_cpu_deactivate_hint_list);
+
+ if (err)
+ return err;
+ for_each_possible_cpu(cpu) {
+ cpu_dev = get_cpu_device(cpu);
+ err = device_create_file(cpu_dev,
+ &attr_percpu_activate_hint);
+ if (err)
+ break;
+ err = device_create_file(cpu_dev,
+ &attr_percpu_deactivate_hint);
+ if (err)
+ break;
+ }
+
+ if (err)
+ return err;
+
+ sysfs_entries = 1; /* Remove entries on cleanup */
+ return 0;
+
+}
+
+static void __exit pseries_energy_cleanup(void)
+{
+ int cpu;
+ struct device *cpu_dev;
+
+ if (!sysfs_entries)
+ return;
+
+ /* Remove the sysfs files */
+ device_remove_file(cpu_subsys.dev_root, &attr_cpu_activate_hint_list);
+ device_remove_file(cpu_subsys.dev_root, &attr_cpu_deactivate_hint_list);
+
+ for_each_possible_cpu(cpu) {
+ cpu_dev = get_cpu_device(cpu);
+ sysfs_remove_file(&cpu_dev->kobj,
+ &attr_percpu_activate_hint.attr);
+ sysfs_remove_file(&cpu_dev->kobj,
+ &attr_percpu_deactivate_hint.attr);
+ }
+}
+
+module_init(pseries_energy_init);
+module_exit(pseries_energy_cleanup);
+MODULE_DESCRIPTION("Driver for pSeries platform energy management");
+MODULE_AUTHOR("Vaidyanathan Srinivasan");
+MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
new file mode 100644
index 000000000..e82710868
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -0,0 +1,513 @@
+/*
+ * Copyright (C) 2001 Dave Engebretsen IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/of.h>
+#include <linux/fs.h>
+#include <linux/reboot.h>
+#include <linux/irq_work.h>
+
+#include <asm/machdep.h>
+#include <asm/rtas.h>
+#include <asm/firmware.h>
+
+#include "pseries.h"
+
+static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
+static DEFINE_SPINLOCK(ras_log_buf_lock);
+
+static int ras_check_exception_token;
+
+static void mce_process_errlog_event(struct irq_work *work);
+static struct irq_work mce_errlog_process_work = {
+ .func = mce_process_errlog_event,
+};
+
+#define EPOW_SENSOR_TOKEN 9
+#define EPOW_SENSOR_INDEX 0
+
+/* EPOW events counter variable */
+static int num_epow_events;
+
+static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id);
+static irqreturn_t ras_epow_interrupt(int irq, void *dev_id);
+static irqreturn_t ras_error_interrupt(int irq, void *dev_id);
+
+
+/*
+ * Enable the hotplug interrupt late because handling these events may touch other
+ * devices or systems (e.g. hugepages) that have not been initialized at the
+ * subsys stage.
+ */
+int __init init_ras_hotplug_IRQ(void)
+{
+ struct device_node *np;
+
+ /* Hotplug Events */
+ np = of_find_node_by_path("/event-sources/hot-plug-events");
+ if (np != NULL) {
+ if (dlpar_workqueue_init() == 0)
+ request_event_sources_irqs(np, ras_hotplug_interrupt,
+ "RAS_HOTPLUG");
+ of_node_put(np);
+ }
+
+ return 0;
+}
+machine_late_initcall(pseries, init_ras_hotplug_IRQ);
+
+/*
+ * Initialize handlers for the set of interrupts caused by hardware errors
+ * and power system events.
+ */
+static int __init init_ras_IRQ(void)
+{
+ struct device_node *np;
+
+ ras_check_exception_token = rtas_token("check-exception");
+
+ /* Internal Errors */
+ np = of_find_node_by_path("/event-sources/internal-errors");
+ if (np != NULL) {
+ request_event_sources_irqs(np, ras_error_interrupt,
+ "RAS_ERROR");
+ of_node_put(np);
+ }
+
+ /* EPOW Events */
+ np = of_find_node_by_path("/event-sources/epow-events");
+ if (np != NULL) {
+ request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW");
+ of_node_put(np);
+ }
+
+ return 0;
+}
+machine_subsys_initcall(pseries, init_ras_IRQ);
+
+#define EPOW_SHUTDOWN_NORMAL 1
+#define EPOW_SHUTDOWN_ON_UPS 2
+#define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3
+#define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4
+
+static void handle_system_shutdown(char event_modifier)
+{
+ switch (event_modifier) {
+ case EPOW_SHUTDOWN_NORMAL:
+ pr_emerg("Power off requested\n");
+ orderly_poweroff(true);
+ break;
+
+ case EPOW_SHUTDOWN_ON_UPS:
+ pr_emerg("Loss of system power detected. System is running on"
+ " UPS/battery. Check RTAS error log for details\n");
+ break;
+
+ case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS:
+ pr_emerg("Loss of system critical functions detected. Check"
+ " RTAS error log for details\n");
+ orderly_poweroff(true);
+ break;
+
+ case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH:
+ pr_emerg("High ambient temperature detected. Check RTAS"
+ " error log for details\n");
+ orderly_poweroff(true);
+ break;
+
+ default:
+ pr_err("Unknown power/cooling shutdown event (modifier = %d)\n",
+ event_modifier);
+ }
+}
+
+struct epow_errorlog {
+ unsigned char sensor_value;
+ unsigned char event_modifier;
+ unsigned char extended_modifier;
+ unsigned char reserved;
+ unsigned char platform_reason;
+};
+
+#define EPOW_RESET 0
+#define EPOW_WARN_COOLING 1
+#define EPOW_WARN_POWER 2
+#define EPOW_SYSTEM_SHUTDOWN 3
+#define EPOW_SYSTEM_HALT 4
+#define EPOW_MAIN_ENCLOSURE 5
+#define EPOW_POWER_OFF 7
+
+static void rtas_parse_epow_errlog(struct rtas_error_log *log)
+{
+ struct pseries_errorlog *pseries_log;
+ struct epow_errorlog *epow_log;
+ char action_code;
+ char modifier;
+
+ pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW);
+ if (pseries_log == NULL)
+ return;
+
+ epow_log = (struct epow_errorlog *)pseries_log->data;
+ action_code = epow_log->sensor_value & 0xF; /* bottom 4 bits */
+ modifier = epow_log->event_modifier & 0xF; /* bottom 4 bits */
+
+ switch (action_code) {
+ case EPOW_RESET:
+ if (num_epow_events) {
+ pr_info("Non critical power/cooling issue cleared\n");
+ num_epow_events--;
+ }
+ break;
+
+ case EPOW_WARN_COOLING:
+ pr_info("Non-critical cooling issue detected. Check RTAS error"
+ " log for details\n");
+ break;
+
+ case EPOW_WARN_POWER:
+ pr_info("Non-critical power issue detected. Check RTAS error"
+ " log for details\n");
+ break;
+
+ case EPOW_SYSTEM_SHUTDOWN:
+ handle_system_shutdown(epow_log->event_modifier);
+ break;
+
+ case EPOW_SYSTEM_HALT:
+ pr_emerg("Critical power/cooling issue detected. Check RTAS"
+ " error log for details. Powering off.\n");
+ orderly_poweroff(true);
+ break;
+
+ case EPOW_MAIN_ENCLOSURE:
+ case EPOW_POWER_OFF:
+ pr_emerg("System about to lose power. Check RTAS error log "
+ "for details. Powering off immediately.\n");
+ emergency_sync();
+ kernel_power_off();
+ break;
+
+ default:
+ pr_err("Unknown power/cooling event (action code = %d)\n",
+ action_code);
+ }
+
+ /* Increment epow events counter variable */
+ if (action_code != EPOW_RESET)
+ num_epow_events++;
+}
+
+static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id)
+{
+ struct pseries_errorlog *pseries_log;
+ struct pseries_hp_errorlog *hp_elog;
+
+ spin_lock(&ras_log_buf_lock);
+
+ rtas_call(ras_check_exception_token, 6, 1, NULL,
+ RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq),
+ RTAS_HOTPLUG_EVENTS, 0, __pa(&ras_log_buf),
+ rtas_get_error_log_max());
+
+ pseries_log = get_pseries_errorlog((struct rtas_error_log *)ras_log_buf,
+ PSERIES_ELOG_SECT_ID_HOTPLUG);
+ hp_elog = (struct pseries_hp_errorlog *)pseries_log->data;
+
+ /*
+ * Since PCI hotplug is not currently supported on pseries, put PCI
+ * hotplug events on the ras_log_buf to be handled by rtas_errd.
+ */
+ if (hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_MEM ||
+ hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU)
+ queue_hotplug_event(hp_elog, NULL, NULL);
+ else
+ log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0);
+
+ spin_unlock(&ras_log_buf_lock);
+ return IRQ_HANDLED;
+}
+
+/* Handle environmental and power warning (EPOW) interrupts. */
+static irqreturn_t ras_epow_interrupt(int irq, void *dev_id)
+{
+ int status;
+ int state;
+ int critical;
+
+ status = rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX,
+ &state);
+
+ if (state > 3)
+ critical = 1; /* Time Critical */
+ else
+ critical = 0;
+
+ spin_lock(&ras_log_buf_lock);
+
+ status = rtas_call(ras_check_exception_token, 6, 1, NULL,
+ RTAS_VECTOR_EXTERNAL_INTERRUPT,
+ virq_to_hw(irq),
+ RTAS_EPOW_WARNING,
+ critical, __pa(&ras_log_buf),
+ rtas_get_error_log_max());
+
+ log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0);
+
+ rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf);
+
+ spin_unlock(&ras_log_buf_lock);
+ return IRQ_HANDLED;
+}
+
+/*
+ * Handle hardware error interrupts.
+ *
+ * RTAS check-exception is called to collect data on the exception. If
+ * the error is deemed recoverable, we log a warning and return.
+ * For nonrecoverable errors, an error is logged and we stop all processing
+ * as quickly as possible in order to prevent propagation of the failure.
+ */
+static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
+{
+ struct rtas_error_log *rtas_elog;
+ int status;
+ int fatal;
+
+ spin_lock(&ras_log_buf_lock);
+
+ status = rtas_call(ras_check_exception_token, 6, 1, NULL,
+ RTAS_VECTOR_EXTERNAL_INTERRUPT,
+ virq_to_hw(irq),
+ RTAS_INTERNAL_ERROR, 1 /* Time Critical */,
+ __pa(&ras_log_buf),
+ rtas_get_error_log_max());
+
+ rtas_elog = (struct rtas_error_log *)ras_log_buf;
+
+ if (status == 0 &&
+ rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC)
+ fatal = 1;
+ else
+ fatal = 0;
+
+ /* format and print the extended information */
+ log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal);
+
+ if (fatal) {
+ pr_emerg("Fatal hardware error detected. Check RTAS error"
+ " log for details. Powering off immediately\n");
+ emergency_sync();
+ kernel_power_off();
+ } else {
+ pr_err("Recoverable hardware error detected\n");
+ }
+
+ spin_unlock(&ras_log_buf_lock);
+ return IRQ_HANDLED;
+}
+
+/*
+ * Some versions of FWNMI place the buffer inside the 4kB page starting at
+ * 0x7000. Other versions place it inside the rtas buffer. We check both.
+ * Minimum size of the buffer is 16 bytes.
+ */
+#define VALID_FWNMI_BUFFER(A) \
+ ((((A) >= 0x7000) && ((A) <= 0x8000 - 16)) || \
+ (((A) >= rtas.base) && ((A) <= (rtas.base + rtas.size - 16))))
+
+static inline struct rtas_error_log *fwnmi_get_errlog(void)
+{
+ return (struct rtas_error_log *)local_paca->mce_data_buf;
+}
+
+/*
+ * Get the error information for errors coming through the
+ * FWNMI vectors. The pt_regs' r3 will be updated to reflect
+ * the actual r3 if possible, and a ptr to the error log entry
+ * will be returned if found.
+ *
+ * Use one buffer mce_data_buf per cpu to store RTAS error.
+ *
+ * The mce_data_buf does not have any locks or protection around it,
+ * if a second machine check comes in, or a system reset is done
+ * before we have logged the error, then we will get corruption in the
+ * error log. This is preferable over holding off on calling
+ * ibm,nmi-interlock which would result in us checkstopping if a
+ * second machine check did come in.
+ */
+static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
+{
+ unsigned long *savep;
+ struct rtas_error_log *h;
+
+ /* Mask top two bits */
+ regs->gpr[3] &= ~(0x3UL << 62);
+
+ if (!VALID_FWNMI_BUFFER(regs->gpr[3])) {
+ printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]);
+ return NULL;
+ }
+
+ savep = __va(regs->gpr[3]);
+ regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */
+
+ h = (struct rtas_error_log *)&savep[1];
+ /* Use the per cpu buffer from paca to store rtas error log */
+ memset(local_paca->mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
+ if (!rtas_error_extended(h)) {
+ memcpy(local_paca->mce_data_buf, h, sizeof(__u64));
+ } else {
+ int len, error_log_length;
+
+ error_log_length = 8 + rtas_error_extended_log_length(h);
+ len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX);
+ memcpy(local_paca->mce_data_buf, h, len);
+ }
+
+ return (struct rtas_error_log *)local_paca->mce_data_buf;
+}
+
+/* Call this when done with the data returned by fwnmi_get_errinfo().
+ * It will release the saved data area for other CPUs in the
+ * partition to receive FWNMI errors.
+ */
+static void fwnmi_release_errinfo(void)
+{
+ int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL);
+ if (ret != 0)
+ printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret);
+}
+
+int pSeries_system_reset_exception(struct pt_regs *regs)
+{
+#ifdef __LITTLE_ENDIAN__
+ /*
+ * Some firmware byteswaps SRR registers and gives incorrect SRR1. Try
+ * to detect the bad SRR1 pattern here. Flip the NIP back to correct
+ * endian for reporting purposes. Unfortunately the MSR can't be fixed,
+ * so clear it. It will be missing MSR_RI so we won't try to recover.
+ */
+ if ((be64_to_cpu(regs->msr) &
+ (MSR_LE|MSR_RI|MSR_DR|MSR_IR|MSR_ME|MSR_PR|
+ MSR_ILE|MSR_HV|MSR_SF)) == (MSR_DR|MSR_SF)) {
+ regs->nip = be64_to_cpu((__be64)regs->nip);
+ regs->msr = 0;
+ }
+#endif
+
+ if (fwnmi_active) {
+ struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs);
+ if (errhdr) {
+ /* XXX Should look at FWNMI information */
+ }
+ fwnmi_release_errinfo();
+ }
+
+ if (smp_handle_nmi_ipi(regs))
+ return 1;
+
+ return 0; /* need to perform reset */
+}
+
+/*
+ * Process MCE rtas errlog event.
+ */
+static void mce_process_errlog_event(struct irq_work *work)
+{
+ struct rtas_error_log *err;
+
+ err = fwnmi_get_errlog();
+ log_error((char *)err, ERR_TYPE_RTAS_LOG, 0);
+}
+
+/*
+ * See if we can recover from a machine check exception.
+ * This is only called on power4 (or above) and only via
+ * the Firmware Non-Maskable Interrupts (fwnmi) handler
+ * which provides the error analysis for us.
+ *
+ * Return 1 if corrected (or delivered a signal).
+ * Return 0 if there is nothing we can do.
+ */
+static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
+{
+ int recovered = 0;
+ int disposition = rtas_error_disposition(err);
+
+ if (!(regs->msr & MSR_RI)) {
+ /* If MSR_RI isn't set, we cannot recover */
+ recovered = 0;
+
+ } else if (disposition == RTAS_DISP_FULLY_RECOVERED) {
+ /* Platform corrected itself */
+ recovered = 1;
+
+ } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
+ /* Platform corrected itself but could be degraded */
+ printk(KERN_ERR "MCE: limited recovery, system may "
+ "be degraded\n");
+ recovered = 1;
+
+ } else if (user_mode(regs) && !is_global_init(current) &&
+ rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) {
+
+ /*
+ * If we received a synchronous error when in userspace,
+ * kill the task. Firmware may report details of the failure
+ * asynchronously, so we can't rely on the target and type
+ * fields being valid here.
+ */
+ printk(KERN_ERR "MCE: uncorrectable error, killing task "
+ "%s:%d\n", current->comm, current->pid);
+
+ _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+ recovered = 1;
+ }
+
+ /* Queue irq work to log this rtas event later. */
+ irq_work_queue(&mce_errlog_process_work);
+
+ return recovered;
+}
+
+/*
+ * Handle a machine check.
+ *
+ * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi)
+ * should be present. If so the handler which called us tells us if the
+ * error was recovered (never true if RI=0).
+ *
+ * On hardware prior to Power 4 these exceptions were asynchronous which
+ * means we can't tell exactly where it occurred and so we can't recover.
+ */
+int pSeries_machine_check_exception(struct pt_regs *regs)
+{
+ struct rtas_error_log *errp;
+
+ if (fwnmi_active) {
+ errp = fwnmi_get_errinfo(regs);
+ fwnmi_release_errinfo();
+ if (errp && recover_mce(regs, errp))
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c
new file mode 100644
index 000000000..0e0208117
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/reconfig.c
@@ -0,0 +1,414 @@
+/*
+ * pSeries_reconfig.c - support for dynamic reconfiguration (including PCI
+ * Hotplug and Dynamic Logical Partitioning on RPA platforms).
+ *
+ * Copyright (C) 2005 Nathan Lynch
+ * Copyright (C) 2005 IBM Corporation
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/notifier.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <linux/uaccess.h>
+#include <asm/mmu.h>
+
+#include "of_helpers.h"
+
+static int pSeries_reconfig_add_node(const char *path, struct property *proplist)
+{
+ struct device_node *np;
+ int err = -ENOMEM;
+
+ np = kzalloc(sizeof(*np), GFP_KERNEL);
+ if (!np)
+ goto out_err;
+
+ np->full_name = kstrdup(kbasename(path), GFP_KERNEL);
+ if (!np->full_name)
+ goto out_err;
+
+ np->properties = proplist;
+ of_node_set_flag(np, OF_DYNAMIC);
+ of_node_init(np);
+
+ np->parent = pseries_of_derive_parent(path);
+ if (IS_ERR(np->parent)) {
+ err = PTR_ERR(np->parent);
+ goto out_err;
+ }
+
+ err = of_attach_node(np);
+ if (err) {
+ printk(KERN_ERR "Failed to add device node %s\n", path);
+ goto out_err;
+ }
+
+ of_node_put(np->parent);
+
+ return 0;
+
+out_err:
+ if (np) {
+ of_node_put(np->parent);
+ kfree(np->full_name);
+ kfree(np);
+ }
+ return err;
+}
+
+static int pSeries_reconfig_remove_node(struct device_node *np)
+{
+ struct device_node *parent, *child;
+
+ parent = of_get_parent(np);
+ if (!parent)
+ return -EINVAL;
+
+ if ((child = of_get_next_child(np, NULL))) {
+ of_node_put(child);
+ of_node_put(parent);
+ return -EBUSY;
+ }
+
+ of_detach_node(np);
+ of_node_put(parent);
+ return 0;
+}
+
+/*
+ * /proc/powerpc/ofdt - yucky binary interface for adding and removing
+ * OF device nodes. Should be deprecated as soon as we get an
+ * in-kernel wrapper for the RTAS ibm,configure-connector call.
+ */
+
+static void release_prop_list(const struct property *prop)
+{
+ struct property *next;
+ for (; prop; prop = next) {
+ next = prop->next;
+ kfree(prop->name);
+ kfree(prop->value);
+ kfree(prop);
+ }
+}
+
+/**
+ * parse_next_property - process the next property from raw input buffer
+ * @buf: input buffer, must be nul-terminated
+ * @end: end of the input buffer + 1, for validation
+ * @name: return value; set to property name in buf
+ * @length: return value; set to length of value
+ * @value: return value; set to the property value in buf
+ *
+ * Note that the caller must make copies of the name and value returned,
+ * this function does no allocation or copying of the data. Return value
+ * is set to the next name in buf, or NULL on error.
+ */
+static char * parse_next_property(char *buf, char *end, char **name, int *length,
+ unsigned char **value)
+{
+ char *tmp;
+
+ *name = buf;
+
+ tmp = strchr(buf, ' ');
+ if (!tmp) {
+ printk(KERN_ERR "property parse failed in %s at line %d\n",
+ __func__, __LINE__);
+ return NULL;
+ }
+ *tmp = '\0';
+
+ if (++tmp >= end) {
+ printk(KERN_ERR "property parse failed in %s at line %d\n",
+ __func__, __LINE__);
+ return NULL;
+ }
+
+ /* now we're on the length */
+ *length = -1;
+ *length = simple_strtoul(tmp, &tmp, 10);
+ if (*length == -1) {
+ printk(KERN_ERR "property parse failed in %s at line %d\n",
+ __func__, __LINE__);
+ return NULL;
+ }
+ if (*tmp != ' ' || ++tmp >= end) {
+ printk(KERN_ERR "property parse failed in %s at line %d\n",
+ __func__, __LINE__);
+ return NULL;
+ }
+
+ /* now we're on the value */
+ *value = tmp;
+ tmp += *length;
+ if (tmp > end) {
+ printk(KERN_ERR "property parse failed in %s at line %d\n",
+ __func__, __LINE__);
+ return NULL;
+ }
+ else if (tmp < end && *tmp != ' ' && *tmp != '\0') {
+ printk(KERN_ERR "property parse failed in %s at line %d\n",
+ __func__, __LINE__);
+ return NULL;
+ }
+ tmp++;
+
+ /* and now we should be on the next name, or the end */
+ return tmp;
+}
+
+static struct property *new_property(const char *name, const int length,
+ const unsigned char *value, struct property *last)
+{
+ struct property *new = kzalloc(sizeof(*new), GFP_KERNEL);
+
+ if (!new)
+ return NULL;
+
+ if (!(new->name = kstrdup(name, GFP_KERNEL)))
+ goto cleanup;
+ if (!(new->value = kmalloc(length + 1, GFP_KERNEL)))
+ goto cleanup;
+
+ memcpy(new->value, value, length);
+ *(((char *)new->value) + length) = 0;
+ new->length = length;
+ new->next = last;
+ return new;
+
+cleanup:
+ kfree(new->name);
+ kfree(new->value);
+ kfree(new);
+ return NULL;
+}
+
+static int do_add_node(char *buf, size_t bufsize)
+{
+ char *path, *end, *name;
+ struct device_node *np;
+ struct property *prop = NULL;
+ unsigned char* value;
+ int length, rv = 0;
+
+ end = buf + bufsize;
+ path = buf;
+ buf = strchr(buf, ' ');
+ if (!buf)
+ return -EINVAL;
+ *buf = '\0';
+ buf++;
+
+ if ((np = of_find_node_by_path(path))) {
+ of_node_put(np);
+ return -EINVAL;
+ }
+
+ /* rv = build_prop_list(tmp, bufsize - (tmp - buf), &proplist); */
+ while (buf < end &&
+ (buf = parse_next_property(buf, end, &name, &length, &value))) {
+ struct property *last = prop;
+
+ prop = new_property(name, length, value, last);
+ if (!prop) {
+ rv = -ENOMEM;
+ prop = last;
+ goto out;
+ }
+ }
+ if (!buf) {
+ rv = -EINVAL;
+ goto out;
+ }
+
+ rv = pSeries_reconfig_add_node(path, prop);
+
+out:
+ if (rv)
+ release_prop_list(prop);
+ return rv;
+}
+
+static int do_remove_node(char *buf)
+{
+ struct device_node *node;
+ int rv = -ENODEV;
+
+ if ((node = of_find_node_by_path(buf)))
+ rv = pSeries_reconfig_remove_node(node);
+
+ of_node_put(node);
+ return rv;
+}
+
+static char *parse_node(char *buf, size_t bufsize, struct device_node **npp)
+{
+ char *handle_str;
+ phandle handle;
+ *npp = NULL;
+
+ handle_str = buf;
+
+ buf = strchr(buf, ' ');
+ if (!buf)
+ return NULL;
+ *buf = '\0';
+ buf++;
+
+ handle = simple_strtoul(handle_str, NULL, 0);
+
+ *npp = of_find_node_by_phandle(handle);
+ return buf;
+}
+
+static int do_add_property(char *buf, size_t bufsize)
+{
+ struct property *prop = NULL;
+ struct device_node *np;
+ unsigned char *value;
+ char *name, *end;
+ int length;
+ end = buf + bufsize;
+ buf = parse_node(buf, bufsize, &np);
+
+ if (!np)
+ return -ENODEV;
+
+ if (parse_next_property(buf, end, &name, &length, &value) == NULL)
+ return -EINVAL;
+
+ prop = new_property(name, length, value, NULL);
+ if (!prop)
+ return -ENOMEM;
+
+ of_add_property(np, prop);
+
+ return 0;
+}
+
+static int do_remove_property(char *buf, size_t bufsize)
+{
+ struct device_node *np;
+ char *tmp;
+ buf = parse_node(buf, bufsize, &np);
+
+ if (!np)
+ return -ENODEV;
+
+ tmp = strchr(buf,' ');
+ if (tmp)
+ *tmp = '\0';
+
+ if (strlen(buf) == 0)
+ return -EINVAL;
+
+ return of_remove_property(np, of_find_property(np, buf, NULL));
+}
+
+static int do_update_property(char *buf, size_t bufsize)
+{
+ struct device_node *np;
+ unsigned char *value;
+ char *name, *end, *next_prop;
+ int length;
+ struct property *newprop;
+ buf = parse_node(buf, bufsize, &np);
+ end = buf + bufsize;
+
+ if (!np)
+ return -ENODEV;
+
+ next_prop = parse_next_property(buf, end, &name, &length, &value);
+ if (!next_prop)
+ return -EINVAL;
+
+ if (!strlen(name))
+ return -ENODEV;
+
+ newprop = new_property(name, length, value, NULL);
+ if (!newprop)
+ return -ENOMEM;
+
+ if (!strcmp(name, "slb-size") || !strcmp(name, "ibm,slb-size"))
+ slb_set_size(*(int *)value);
+
+ return of_update_property(np, newprop);
+}
+
+/**
+ * ofdt_write - perform operations on the Open Firmware device tree
+ *
+ * @file: not used
+ * @buf: command and arguments
+ * @count: size of the command buffer
+ * @off: not used
+ *
+ * Operations supported at this time are the addition and removal of
+ * whole nodes along with their properties, as well as the addition,
+ * removal and update of individual properties on an existing node.
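+ *
+ * For example (illustrative only), a node could be removed with:
+ *   echo "remove_node /path/to/node" > /proc/powerpc/ofdt
+ * remove_node and add_node take a full device-tree path; the property
+ * commands identify the target node by phandle, as parsed by parse_node(),
+ * followed by property data in the form expected by parse_next_property().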
+ */
+static ssize_t ofdt_write(struct file *file, const char __user *buf, size_t count,
+ loff_t *off)
+{
+ int rv;
+ char *kbuf;
+ char *tmp;
+
+ kbuf = memdup_user_nul(buf, count);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+
+ tmp = strchr(kbuf, ' ');
+ if (!tmp) {
+ rv = -EINVAL;
+ goto out;
+ }
+ *tmp = '\0';
+ tmp++;
+
+ if (!strcmp(kbuf, "add_node"))
+ rv = do_add_node(tmp, count - (tmp - kbuf));
+ else if (!strcmp(kbuf, "remove_node"))
+ rv = do_remove_node(tmp);
+ else if (!strcmp(kbuf, "add_property"))
+ rv = do_add_property(tmp, count - (tmp - kbuf));
+ else if (!strcmp(kbuf, "remove_property"))
+ rv = do_remove_property(tmp, count - (tmp - kbuf));
+ else if (!strcmp(kbuf, "update_property"))
+ rv = do_update_property(tmp, count - (tmp - kbuf));
+ else
+ rv = -EINVAL;
+out:
+ kfree(kbuf);
+ return rv ? rv : count;
+}
+
+static const struct file_operations ofdt_fops = {
+ .write = ofdt_write,
+ .llseek = noop_llseek,
+};
+
+/* create /proc/powerpc/ofdt write-only by root */
+static int proc_ppc64_create_ofdt(void)
+{
+ struct proc_dir_entry *ent;
+
+ ent = proc_create("powerpc/ofdt", 0200, NULL, &ofdt_fops);
+ if (ent)
+ proc_set_size(ent, 0);
+
+ return 0;
+}
+machine_device_initcall(pseries, proc_ppc64_create_ofdt);
diff --git a/arch/powerpc/platforms/pseries/rng.c b/arch/powerpc/platforms/pseries/rng.c
new file mode 100644
index 000000000..262b8c5e1
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/rng.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2013, Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "pseries-rng: " fmt
+
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <asm/archrandom.h>
+#include <asm/machdep.h>
+#include <asm/plpar_wrappers.h>
+
+
+static int pseries_get_random_long(unsigned long *v)
+{
+ unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+ if (plpar_hcall(H_RANDOM, retbuf) == H_SUCCESS) {
+ *v = retbuf[0];
+ return 1;
+ }
+
+ return 0;
+}
+
+static __init int rng_init(void)
+{
+ struct device_node *dn;
+
+ dn = of_find_compatible_node(NULL, NULL, "ibm,random");
+ if (!dn)
+ return -ENODEV;
+
+ pr_info("Registering arch random hook.\n");
+
+ ppc_md.get_random_seed = pseries_get_random_long;
+
+ of_node_put(dn);
+ return 0;
+}
+machine_subsys_initcall(pseries, rng_init);
diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c
new file mode 100644
index 000000000..054ce7a16
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/scanlog.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2001 PPC 64 Team, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * scan-log-data driver for PPC64 Todd Inglett <tinglett@vnet.ibm.com>
+ *
+ * When ppc64 hardware fails the service processor dumps internal state
+ * of the system. After a reboot the operating system can access a dump
+ * of this data using this driver. A dump exists if the device-tree
+ * /chosen/ibm,scan-log-data property exists.
+ *
+ * This driver exports /proc/powerpc/rtas/scan-log-dump, which can be read.
+ * The driver supports only sequential reads.
+ *
+ * The driver watches for a write of the single word "reset"; if it is
+ * given, the driver resets the scanlog so the platform can free it.
+ */
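+
+/*
+ * Illustrative usage (a sketch; scanlog_init() below creates the entry with
+ * mode 0400, so both operations need appropriate privileges):
+ *   cat /proc/powerpc/rtas/scan-log-dump > scanlog.dump
+ *   echo reset > /proc/powerpc/rtas/scan-log-dump
+ */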
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <asm/rtas.h>
+#include <asm/prom.h>
+
+#define MODULE_VERS "1.0"
+#define MODULE_NAME "scanlog"
+
+/* Status returns from ibm,scan-log-dump */
+#define SCANLOG_COMPLETE 0
+#define SCANLOG_HWERROR -1
+#define SCANLOG_CONTINUE 1
+
+
+static unsigned int ibm_scan_log_dump; /* RTAS token */
+static unsigned int *scanlog_buffer; /* The data buffer */
+
+static ssize_t scanlog_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned int *data = scanlog_buffer;
+ int status;
+ unsigned long len, off;
+ unsigned int wait_time;
+
+ if (count > RTAS_DATA_BUF_SIZE)
+ count = RTAS_DATA_BUF_SIZE;
+
+ if (count < 1024) {
+		/* This is the minimum supported by this RTAS call. Rather
+		 * than do all the buffering here, we insist that user code
+		 * issue larger reads. As long as cp works... :)
+ */
+ printk(KERN_ERR "scanlog: cannot perform a small read (%ld)\n", count);
+ return -EINVAL;
+ }
+
+ if (!access_ok(VERIFY_WRITE, buf, count))
+ return -EFAULT;
+
+ for (;;) {
+ wait_time = 500; /* default wait if no data */
+ spin_lock(&rtas_data_buf_lock);
+ memcpy(rtas_data_buf, data, RTAS_DATA_BUF_SIZE);
+ status = rtas_call(ibm_scan_log_dump, 2, 1, NULL,
+ (u32) __pa(rtas_data_buf), (u32) count);
+ memcpy(data, rtas_data_buf, RTAS_DATA_BUF_SIZE);
+ spin_unlock(&rtas_data_buf_lock);
+
+ pr_debug("scanlog: status=%d, data[0]=%x, data[1]=%x, " \
+ "data[2]=%x\n", status, data[0], data[1], data[2]);
+ switch (status) {
+ case SCANLOG_COMPLETE:
+ pr_debug("scanlog: hit eof\n");
+ return 0;
+ case SCANLOG_HWERROR:
+ pr_debug("scanlog: hardware error reading data\n");
+ return -EIO;
+ case SCANLOG_CONTINUE:
+ /* We may or may not have data yet */
+ len = data[1];
+ off = data[2];
+ if (len > 0) {
+ if (copy_to_user(buf, ((char *)data)+off, len))
+ return -EFAULT;
+ return len;
+ }
+ /* Break to sleep default time */
+ break;
+ default:
+ /* Assume extended busy */
+ wait_time = rtas_busy_delay_time(status);
+ if (!wait_time) {
+ printk(KERN_ERR "scanlog: unknown error " \
+ "from rtas: %d\n", status);
+ return -EIO;
+ }
+ }
+ /* Apparently no data yet. Wait and try again. */
+ msleep_interruptible(wait_time);
+ }
+ /*NOTREACHED*/
+}
+
+static ssize_t scanlog_write(struct file * file, const char __user * buf,
+ size_t count, loff_t *ppos)
+{
+ char stkbuf[20];
+ int status;
+
+ if (count > 19) count = 19;
+ if (copy_from_user (stkbuf, buf, count)) {
+ return -EFAULT;
+ }
+ stkbuf[count] = 0;
+
+ if (buf) {
+ if (strncmp(stkbuf, "reset", 5) == 0) {
+ pr_debug("scanlog: reset scanlog\n");
+ status = rtas_call(ibm_scan_log_dump, 2, 1, NULL, 0, 0);
+ pr_debug("scanlog: rtas returns %d\n", status);
+ }
+ }
+ return count;
+}
+
+static int scanlog_open(struct inode * inode, struct file * file)
+{
+ unsigned int *data = scanlog_buffer;
+
+ if (data[0] != 0) {
+ /* This imperfect test stops a second copy of the
+ * data (or a reset while data is being copied)
+ */
+ return -EBUSY;
+ }
+
+ data[0] = 0; /* re-init so we restart the scan */
+
+ return 0;
+}
+
+static int scanlog_release(struct inode * inode, struct file * file)
+{
+ unsigned int *data = scanlog_buffer;
+
+ data[0] = 0;
+ return 0;
+}
+
+static const struct file_operations scanlog_fops = {
+ .owner = THIS_MODULE,
+ .read = scanlog_read,
+ .write = scanlog_write,
+ .open = scanlog_open,
+ .release = scanlog_release,
+ .llseek = noop_llseek,
+};
+
+static int __init scanlog_init(void)
+{
+ struct proc_dir_entry *ent;
+ int err = -ENOMEM;
+
+ ibm_scan_log_dump = rtas_token("ibm,scan-log-dump");
+ if (ibm_scan_log_dump == RTAS_UNKNOWN_SERVICE)
+ return -ENODEV;
+
+ /* Ideally we could allocate a buffer < 4G */
+ scanlog_buffer = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+ if (!scanlog_buffer)
+ goto err;
+
+ ent = proc_create("powerpc/rtas/scan-log-dump", 0400, NULL,
+ &scanlog_fops);
+ if (!ent)
+ goto err;
+ return 0;
+err:
+ kfree(scanlog_buffer);
+ return err;
+}
+
+static void __exit scanlog_cleanup(void)
+{
+ remove_proc_entry("powerpc/rtas/scan-log-dump", NULL);
+ kfree(scanlog_buffer);
+}
+
+module_init(scanlog_init);
+module_exit(scanlog_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
new file mode 100644
index 000000000..885d910bf
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -0,0 +1,1046 @@
+/*
+ * 64-bit pSeries and RS/6000 setup code.
+ *
+ * Copyright (C) 1995 Linus Torvalds
+ * Adapted from 'alpha' version by Gary Thomas
+ * Modified by Cort Dougan (cort@cs.nmt.edu)
+ * Modified by PPC64 Team, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * bootup setup stuff..
+ */
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/user.h>
+#include <linux/tty.h>
+#include <linux/major.h>
+#include <linux/interrupt.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/console.h>
+#include <linux/pci.h>
+#include <linux/utsname.h>
+#include <linux/adb.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/root_dev.h>
+#include <linux/of.h>
+#include <linux/of_pci.h>
+#include <linux/memblock.h>
+
+#include <asm/mmu.h>
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/pci-bridge.h>
+#include <asm/iommu.h>
+#include <asm/dma.h>
+#include <asm/machdep.h>
+#include <asm/irq.h>
+#include <asm/time.h>
+#include <asm/nvram.h>
+#include <asm/pmc.h>
+#include <asm/xics.h>
+#include <asm/xive.h>
+#include <asm/ppc-pci.h>
+#include <asm/i8259.h>
+#include <asm/udbg.h>
+#include <asm/smp.h>
+#include <asm/firmware.h>
+#include <asm/eeh.h>
+#include <asm/reg.h>
+#include <asm/plpar_wrappers.h>
+#include <asm/kexec.h>
+#include <asm/isa-bridge.h>
+#include <asm/security_features.h>
+#include <asm/asm-const.h>
+
+#include "pseries.h"
+#include "../../../../drivers/pci/pci.h"
+
+DEFINE_STATIC_KEY_FALSE(shared_processor);
+EXPORT_SYMBOL(shared_processor);
+
+int CMO_PrPSP = -1;
+int CMO_SecPSP = -1;
+unsigned long CMO_PageSize = (ASM_CONST(1) << IOMMU_PAGE_SHIFT_4K);
+EXPORT_SYMBOL(CMO_PageSize);
+
+int fwnmi_active; /* TRUE if an FWNMI handler is present */
+
+static void pSeries_show_cpuinfo(struct seq_file *m)
+{
+ struct device_node *root;
+ const char *model = "";
+
+ root = of_find_node_by_path("/");
+ if (root)
+ model = of_get_property(root, "model", NULL);
+ seq_printf(m, "machine\t\t: CHRP %s\n", model);
+ of_node_put(root);
+ if (radix_enabled())
+ seq_printf(m, "MMU\t\t: Radix\n");
+ else
+ seq_printf(m, "MMU\t\t: Hash\n");
+}
+
+/* Initialize firmware assisted non-maskable interrupts if
+ * the firmware supports this feature.
+ */
+static void __init fwnmi_init(void)
+{
+ unsigned long system_reset_addr, machine_check_addr;
+ u8 *mce_data_buf;
+ unsigned int i;
+ int nr_cpus = num_possible_cpus();
+
+ int ibm_nmi_register = rtas_token("ibm,nmi-register");
+ if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE)
+ return;
+
+ /* If the kernel's not linked at zero we point the firmware at low
+ * addresses anyway, and use a trampoline to get to the real code. */
+ system_reset_addr = __pa(system_reset_fwnmi) - PHYSICAL_START;
+ machine_check_addr = __pa(machine_check_fwnmi) - PHYSICAL_START;
+
+ if (0 == rtas_call(ibm_nmi_register, 2, 1, NULL, system_reset_addr,
+ machine_check_addr))
+ fwnmi_active = 1;
+
+ /*
+ * Allocate a chunk for per cpu buffer to hold rtas errorlog.
+ * It will be used in real mode mce handler, hence it needs to be
+ * below RMA.
+ */
+ mce_data_buf = __va(memblock_alloc_base(RTAS_ERROR_LOG_MAX * nr_cpus,
+ RTAS_ERROR_LOG_MAX, ppc64_rma_size));
+ for_each_possible_cpu(i) {
+ paca_ptrs[i]->mce_data_buf = mce_data_buf +
+ (RTAS_ERROR_LOG_MAX * i);
+ }
+}
+
+static void pseries_8259_cascade(struct irq_desc *desc)
+{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ unsigned int cascade_irq = i8259_irq();
+
+ if (cascade_irq)
+ generic_handle_irq(cascade_irq);
+
+ chip->irq_eoi(&desc->irq_data);
+}
+
+static void __init pseries_setup_i8259_cascade(void)
+{
+ struct device_node *np, *old, *found = NULL;
+ unsigned int cascade;
+ const u32 *addrp;
+ unsigned long intack = 0;
+ int naddr;
+
+ for_each_node_by_type(np, "interrupt-controller") {
+ if (of_device_is_compatible(np, "chrp,iic")) {
+ found = np;
+ break;
+ }
+ }
+
+ if (found == NULL) {
+ printk(KERN_DEBUG "pic: no ISA interrupt controller\n");
+ return;
+ }
+
+ cascade = irq_of_parse_and_map(found, 0);
+ if (!cascade) {
+ printk(KERN_ERR "pic: failed to map cascade interrupt");
+ return;
+ }
+ pr_debug("pic: cascade mapped to irq %d\n", cascade);
+
+ for (old = of_node_get(found); old != NULL ; old = np) {
+ np = of_get_parent(old);
+ of_node_put(old);
+ if (np == NULL)
+ break;
+ if (strcmp(np->name, "pci") != 0)
+ continue;
+ addrp = of_get_property(np, "8259-interrupt-acknowledge", NULL);
+ if (addrp == NULL)
+ continue;
+ naddr = of_n_addr_cells(np);
+ intack = addrp[naddr-1];
+ if (naddr > 1)
+ intack |= ((unsigned long)addrp[naddr-2]) << 32;
+ }
+ if (intack)
+ printk(KERN_DEBUG "pic: PCI 8259 intack at 0x%016lx\n", intack);
+ i8259_init(found, intack);
+ of_node_put(found);
+ irq_set_chained_handler(cascade, pseries_8259_cascade);
+}
+
+static void __init pseries_init_irq(void)
+{
+ /* Try using a XIVE if available, otherwise use a XICS */
+ if (!xive_spapr_init()) {
+ xics_init();
+ pseries_setup_i8259_cascade();
+ }
+}
+
+static void pseries_lpar_enable_pmcs(void)
+{
+ unsigned long set, reset;
+
+ set = 1UL << 63;
+ reset = 0;
+ plpar_hcall_norets(H_PERFMON, set, reset);
+}
+
+static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *data)
+{
+ struct of_reconfig_data *rd = data;
+ struct device_node *parent, *np = rd->dn;
+ struct pci_dn *pdn;
+ int err = NOTIFY_OK;
+
+ switch (action) {
+ case OF_RECONFIG_ATTACH_NODE:
+ parent = of_get_parent(np);
+ pdn = parent ? PCI_DN(parent) : NULL;
+ if (pdn)
+ pci_add_device_node_info(pdn->phb, np);
+
+ of_node_put(parent);
+ break;
+ case OF_RECONFIG_DETACH_NODE:
+ pdn = PCI_DN(np);
+ if (pdn)
+ list_del(&pdn->list);
+ break;
+ default:
+ err = NOTIFY_DONE;
+ break;
+ }
+ return err;
+}
+
+static struct notifier_block pci_dn_reconfig_nb = {
+ .notifier_call = pci_dn_reconfig_notifier,
+};
+
+struct kmem_cache *dtl_cache;
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+/*
+ * Allocate space for the dispatch trace log for all possible cpus
+ * and register the buffers with the hypervisor. This is used for
+ * computing time stolen by the hypervisor.
+ */
+static int alloc_dispatch_logs(void)
+{
+ int cpu, ret;
+ struct paca_struct *pp;
+ struct dtl_entry *dtl;
+
+ if (!firmware_has_feature(FW_FEATURE_SPLPAR))
+ return 0;
+
+ if (!dtl_cache)
+ return 0;
+
+ for_each_possible_cpu(cpu) {
+ pp = paca_ptrs[cpu];
+ dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL);
+ if (!dtl) {
+ pr_warn("Failed to allocate dispatch trace log for cpu %d\n",
+ cpu);
+ pr_warn("Stolen time statistics will be unreliable\n");
+ break;
+ }
+
+ pp->dtl_ridx = 0;
+ pp->dispatch_log = dtl;
+ pp->dispatch_log_end = dtl + N_DISPATCH_LOG;
+ pp->dtl_curr = dtl;
+ }
+
+ /* Register the DTL for the current (boot) cpu */
+ dtl = get_paca()->dispatch_log;
+ get_paca()->dtl_ridx = 0;
+ get_paca()->dtl_curr = dtl;
+ get_paca()->lppaca_ptr->dtl_idx = 0;
+
+ /* hypervisor reads buffer length from this field */
+ dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES);
+ ret = register_dtl(hard_smp_processor_id(), __pa(dtl));
+ if (ret)
+ pr_err("WARNING: DTL registration of cpu %d (hw %d) failed "
+ "with %d\n", smp_processor_id(),
+ hard_smp_processor_id(), ret);
+ get_paca()->lppaca_ptr->dtl_enable_mask = 2;
+
+ return 0;
+}
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+static inline int alloc_dispatch_logs(void)
+{
+ return 0;
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+
+static int alloc_dispatch_log_kmem_cache(void)
+{
+ dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES,
+ DISPATCH_LOG_BYTES, 0, NULL);
+ if (!dtl_cache) {
+ pr_warn("Failed to create dispatch trace log buffer cache\n");
+ pr_warn("Stolen time statistics will be unreliable\n");
+ return 0;
+ }
+
+ return alloc_dispatch_logs();
+}
+machine_early_initcall(pseries, alloc_dispatch_log_kmem_cache);
+
+static void pseries_lpar_idle(void)
+{
+ /*
+ * Default handler to go into low thread priority and possibly
+ * low power mode by ceding processor to hypervisor
+ */
+
+ if (!prep_irq_for_idle())
+ return;
+
+ /* Indicate to hypervisor that we are idle. */
+ get_lppaca()->idle = 1;
+
+ /*
+ * Yield the processor to the hypervisor. We return if
+ * an external interrupt occurs (which are driven prior
+ * to returning here) or if a prod occurs from another
+ * processor. When returning here, external interrupts
+ * are enabled.
+ */
+ cede_processor();
+
+ get_lppaca()->idle = 0;
+}
+
+/*
+ * Enable relocation on during exceptions. This has partition wide scope and
+ * may take a while to complete, if it takes longer than one second we will
+ * just give up rather than wasting any more time on this - if that turns out
+ * to ever be a problem in practice we can move this into a kernel thread to
+ * finish off the process later in boot.
+ */
+void pseries_enable_reloc_on_exc(void)
+{
+ long rc;
+ unsigned int delay, total_delay = 0;
+
+ while (1) {
+ rc = enable_reloc_on_exceptions();
+ if (!H_IS_LONG_BUSY(rc)) {
+ if (rc == H_P2) {
+ pr_info("Relocation on exceptions not"
+ " supported\n");
+ } else if (rc != H_SUCCESS) {
+ pr_warn("Unable to enable relocation"
+ " on exceptions: %ld\n", rc);
+ }
+ break;
+ }
+
+ delay = get_longbusy_msecs(rc);
+ total_delay += delay;
+ if (total_delay > 1000) {
+ pr_warn("Warning: Giving up waiting to enable "
+ "relocation on exceptions (%u msec)!\n",
+ total_delay);
+ return;
+ }
+
+ mdelay(delay);
+ }
+}
+EXPORT_SYMBOL(pseries_enable_reloc_on_exc);
+
+void pseries_disable_reloc_on_exc(void)
+{
+ long rc;
+
+ while (1) {
+ rc = disable_reloc_on_exceptions();
+ if (!H_IS_LONG_BUSY(rc))
+ break;
+ mdelay(get_longbusy_msecs(rc));
+ }
+ if (rc != H_SUCCESS)
+ pr_warn("Warning: Failed to disable relocation on exceptions: %ld\n",
+ rc);
+}
+EXPORT_SYMBOL(pseries_disable_reloc_on_exc);
+
+#ifdef CONFIG_KEXEC_CORE
+static void pSeries_machine_kexec(struct kimage *image)
+{
+ if (firmware_has_feature(FW_FEATURE_SET_MODE))
+ pseries_disable_reloc_on_exc();
+
+ default_machine_kexec(image);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+void pseries_big_endian_exceptions(void)
+{
+ long rc;
+
+ while (1) {
+ rc = enable_big_endian_exceptions();
+ if (!H_IS_LONG_BUSY(rc))
+ break;
+ mdelay(get_longbusy_msecs(rc));
+ }
+
+ /*
+ * At this point it is unlikely panic() will get anything
+ * out to the user, since this is called very late in kexec
+ * but at least this will stop us from continuing on further
+ * and creating an even more difficult to debug situation.
+ *
+	 * There is a known problem when kdump'ing: if cpus are offline,
+	 * the above call will fail. Rather than panicking again, keep
+ * going and hope the kdump kernel is also little endian, which
+ * it usually is.
+ */
+ if (rc && !kdump_in_progress())
+ panic("Could not enable big endian exceptions");
+}
+
+void pseries_little_endian_exceptions(void)
+{
+ long rc;
+
+ while (1) {
+ rc = enable_little_endian_exceptions();
+ if (!H_IS_LONG_BUSY(rc))
+ break;
+ mdelay(get_longbusy_msecs(rc));
+ }
+ if (rc) {
+ ppc_md.progress("H_SET_MODE LE exception fail", 0);
+ panic("Could not enable little endian exceptions");
+ }
+}
+#endif
+
+static void __init find_and_init_phbs(void)
+{
+ struct device_node *node;
+ struct pci_controller *phb;
+ struct device_node *root = of_find_node_by_path("/");
+
+ for_each_child_of_node(root, node) {
+ if (node->type == NULL || (strcmp(node->type, "pci") != 0 &&
+ strcmp(node->type, "pciex") != 0))
+ continue;
+
+ phb = pcibios_alloc_controller(node);
+ if (!phb)
+ continue;
+ rtas_setup_phb(phb);
+ pci_process_bridge_OF_ranges(phb, node, 0);
+ isa_bridge_find_early(phb);
+ phb->controller_ops = pseries_pci_controller_ops;
+ }
+
+ of_node_put(root);
+
+ /*
+ * PCI_PROBE_ONLY and PCI_REASSIGN_ALL_BUS can be set via properties
+ * in chosen.
+ */
+ of_pci_check_probe_only();
+}
+
+static void init_cpu_char_feature_flags(struct h_cpu_char_result *result)
+{
+ /*
+ * The features below are disabled by default, so we instead look to see
+ * if firmware has *enabled* them, and set them if so.
+ */
+ if (result->character & H_CPU_CHAR_SPEC_BAR_ORI31)
+ security_ftr_set(SEC_FTR_SPEC_BAR_ORI31);
+
+ if (result->character & H_CPU_CHAR_BCCTRL_SERIALISED)
+ security_ftr_set(SEC_FTR_BCCTRL_SERIALISED);
+
+ if (result->character & H_CPU_CHAR_L1D_FLUSH_ORI30)
+ security_ftr_set(SEC_FTR_L1D_FLUSH_ORI30);
+
+ if (result->character & H_CPU_CHAR_L1D_FLUSH_TRIG2)
+ security_ftr_set(SEC_FTR_L1D_FLUSH_TRIG2);
+
+ if (result->character & H_CPU_CHAR_L1D_THREAD_PRIV)
+ security_ftr_set(SEC_FTR_L1D_THREAD_PRIV);
+
+ if (result->character & H_CPU_CHAR_COUNT_CACHE_DISABLED)
+ security_ftr_set(SEC_FTR_COUNT_CACHE_DISABLED);
+
+ if (result->character & H_CPU_CHAR_BCCTR_FLUSH_ASSIST)
+ security_ftr_set(SEC_FTR_BCCTR_FLUSH_ASSIST);
+
+ if (result->behaviour & H_CPU_BEHAV_FLUSH_COUNT_CACHE)
+ security_ftr_set(SEC_FTR_FLUSH_COUNT_CACHE);
+
+ /*
+ * The features below are enabled by default, so we instead look to see
+ * if firmware has *disabled* them, and clear them if so.
+ */
+ if (!(result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY))
+ security_ftr_clear(SEC_FTR_FAVOUR_SECURITY);
+
+ if (!(result->behaviour & H_CPU_BEHAV_L1D_FLUSH_PR))
+ security_ftr_clear(SEC_FTR_L1D_FLUSH_PR);
+
+ if (!(result->behaviour & H_CPU_BEHAV_BNDS_CHK_SPEC_BAR))
+ security_ftr_clear(SEC_FTR_BNDS_CHK_SPEC_BAR);
+}
+
+void pseries_setup_rfi_flush(void)
+{
+ struct h_cpu_char_result result;
+ enum l1d_flush_type types;
+ bool enable;
+ long rc;
+
+ /*
+ * Set features to the defaults assumed by init_cpu_char_feature_flags()
+ * so it can set/clear again any features that might have changed after
+ * migration, and in case the hypercall fails and it is not even called.
+ */
+ powerpc_security_features = SEC_FTR_DEFAULT;
+
+ rc = plpar_get_cpu_characteristics(&result);
+ if (rc == H_SUCCESS)
+ init_cpu_char_feature_flags(&result);
+
+ /*
+ * We're the guest so this doesn't apply to us, clear it to simplify
+ * handling of it elsewhere.
+ */
+ security_ftr_clear(SEC_FTR_L1D_FLUSH_HV);
+
+ types = L1D_FLUSH_FALLBACK;
+
+ if (security_ftr_enabled(SEC_FTR_L1D_FLUSH_TRIG2))
+ types |= L1D_FLUSH_MTTRIG;
+
+ if (security_ftr_enabled(SEC_FTR_L1D_FLUSH_ORI30))
+ types |= L1D_FLUSH_ORI;
+
+ enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && \
+ security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR);
+
+ setup_rfi_flush(types, enable);
+ setup_count_cache_flush();
+
+ enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+ security_ftr_enabled(SEC_FTR_L1D_FLUSH_ENTRY);
+ setup_entry_flush(enable);
+
+ enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+ security_ftr_enabled(SEC_FTR_L1D_FLUSH_UACCESS);
+ setup_uaccess_flush(enable);
+}
+
+#ifdef CONFIG_PCI_IOV
+enum rtas_iov_fw_value_map {
+ NUM_RES_PROPERTY = 0, /* Number of Resources */
+ LOW_INT = 1, /* Lowest 32 bits of Address */
+ START_OF_ENTRIES = 2, /* Always start of entry */
+ APERTURE_PROPERTY = 2, /* Start of entry+ to Aperture Size */
+ WDW_SIZE_PROPERTY = 4, /* Start of entry+ to Window Size */
+ NEXT_ENTRY = 7 /* Go to next entry on array */
+};
+
+enum get_iov_fw_value_index {
+ BAR_ADDRS = 1, /* Get Bar Address */
+ APERTURE_SIZE = 2, /* Get Aperture Size */
+ WDW_SIZE = 3 /* Get Window Size */
+};
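+
+/*
+ * As used by the helpers below, the entry for VF BAR n starts at index
+ * START_OF_ENTRIES + n * NEXT_ENTRY: a two-cell address at offset 0, a
+ * two-cell aperture size at APERTURE_PROPERTY and a two-cell window size
+ * at WDW_SIZE_PROPERTY; of_pci_set_vf_bar_size() derives the resource
+ * flags from the single cell at LOW_INT.
+ */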
+
+resource_size_t pseries_get_iov_fw_value(struct pci_dev *dev, int resno,
+ enum get_iov_fw_value_index value)
+{
+ const int *indexes;
+ struct device_node *dn = pci_device_to_OF_node(dev);
+ int i, num_res, ret = 0;
+
+ indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
+ if (!indexes)
+ return 0;
+
+ /*
+	 * The first element in the array is the number of BARs
+	 * returned. Search through the list to find the matching
+	 * BAR.
+ */
+ num_res = of_read_number(&indexes[NUM_RES_PROPERTY], 1);
+ if (resno >= num_res)
+		return 0; /* or an error */
+
+ i = START_OF_ENTRIES + NEXT_ENTRY * resno;
+ switch (value) {
+ case BAR_ADDRS:
+ ret = of_read_number(&indexes[i], 2);
+ break;
+ case APERTURE_SIZE:
+ ret = of_read_number(&indexes[i + APERTURE_PROPERTY], 2);
+ break;
+ case WDW_SIZE:
+ ret = of_read_number(&indexes[i + WDW_SIZE_PROPERTY], 2);
+ break;
+ }
+
+ return ret;
+}
+
+void of_pci_set_vf_bar_size(struct pci_dev *dev, const int *indexes)
+{
+ struct resource *res;
+ resource_size_t base, size;
+ int i, r, num_res;
+
+ num_res = of_read_number(&indexes[NUM_RES_PROPERTY], 1);
+ num_res = min_t(int, num_res, PCI_SRIOV_NUM_BARS);
+ for (i = START_OF_ENTRIES, r = 0; r < num_res && r < PCI_SRIOV_NUM_BARS;
+ i += NEXT_ENTRY, r++) {
+ res = &dev->resource[r + PCI_IOV_RESOURCES];
+ base = of_read_number(&indexes[i], 2);
+ size = of_read_number(&indexes[i + APERTURE_PROPERTY], 2);
+ res->flags = pci_parse_of_flags(of_read_number
+ (&indexes[i + LOW_INT], 1), 0);
+ res->flags |= (IORESOURCE_MEM_64 | IORESOURCE_PCI_FIXED);
+ res->name = pci_name(dev);
+ res->start = base;
+ res->end = base + size - 1;
+ }
+}
+
+void of_pci_parse_iov_addrs(struct pci_dev *dev, const int *indexes)
+{
+ struct resource *res, *root, *conflict;
+ resource_size_t base, size;
+ int i, r, num_res;
+
+ /*
+	 * The first element in the array is the number of BARs
+	 * returned. Search through the list to find the matching
+	 * BARs and assign them from firmware into the resource structures.
+ */
+ num_res = of_read_number(&indexes[NUM_RES_PROPERTY], 1);
+ for (i = START_OF_ENTRIES, r = 0; r < num_res && r < PCI_SRIOV_NUM_BARS;
+ i += NEXT_ENTRY, r++) {
+ res = &dev->resource[r + PCI_IOV_RESOURCES];
+ base = of_read_number(&indexes[i], 2);
+ size = of_read_number(&indexes[i + WDW_SIZE_PROPERTY], 2);
+ res->name = pci_name(dev);
+ res->start = base;
+ res->end = base + size - 1;
+ root = &iomem_resource;
+ dev_dbg(&dev->dev,
+ "pSeries IOV BAR %d: trying firmware assignment %pR\n",
+ r + PCI_IOV_RESOURCES, res);
+ conflict = request_resource_conflict(root, res);
+ if (conflict) {
+ dev_info(&dev->dev,
+ "BAR %d: %pR conflicts with %s %pR\n",
+ r + PCI_IOV_RESOURCES, res,
+ conflict->name, conflict);
+ res->flags |= IORESOURCE_UNSET;
+ }
+ }
+}
+
+static void pseries_disable_sriov_resources(struct pci_dev *pdev)
+{
+ int i;
+
+ pci_warn(pdev, "No hypervisor support for SR-IOV on this device, IOV BARs disabled.\n");
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
+ pdev->resource[i + PCI_IOV_RESOURCES].flags = 0;
+}
+
+static void pseries_pci_fixup_resources(struct pci_dev *pdev)
+{
+ const int *indexes;
+ struct device_node *dn = pci_device_to_OF_node(pdev);
+
+	/* Firmware must support open SR-IOV, otherwise don't configure */
+ indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
+ if (indexes)
+ of_pci_set_vf_bar_size(pdev, indexes);
+ else
+ pseries_disable_sriov_resources(pdev);
+}
+
+static void pseries_pci_fixup_iov_resources(struct pci_dev *pdev)
+{
+ const int *indexes;
+ struct device_node *dn = pci_device_to_OF_node(pdev);
+
+ if (!pdev->is_physfn || pci_dev_is_added(pdev))
+ return;
+	/* Firmware must support open SR-IOV, otherwise don't configure */
+ indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
+ if (indexes)
+ of_pci_parse_iov_addrs(pdev, indexes);
+ else
+ pseries_disable_sriov_resources(pdev);
+}
+
+static resource_size_t pseries_pci_iov_resource_alignment(struct pci_dev *pdev,
+ int resno)
+{
+ const __be32 *reg;
+ struct device_node *dn = pci_device_to_OF_node(pdev);
+
+	/* Firmware must support open SR-IOV, otherwise report regular alignment */
+ reg = of_get_property(dn, "ibm,is-open-sriov-pf", NULL);
+ if (!reg)
+ return pci_iov_resource_size(pdev, resno);
+
+ if (!pdev->is_physfn)
+ return 0;
+ return pseries_get_iov_fw_value(pdev,
+ resno - PCI_IOV_RESOURCES,
+ APERTURE_SIZE);
+}
+#endif
+
+static void __init pSeries_setup_arch(void)
+{
+ set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
+
+ /* Discover PIC type and setup ppc_md accordingly */
+ smp_init_pseries();
+
+
+ /* openpic global configuration register (64-bit format). */
+ /* openpic Interrupt Source Unit pointer (64-bit format). */
+ /* python0 facility area (mmio) (64-bit format) REAL address. */
+
+ /* init to some ~sane value until calibrate_delay() runs */
+ loops_per_jiffy = 50000000;
+
+ fwnmi_init();
+
+ pseries_setup_rfi_flush();
+ setup_stf_barrier();
+
+ /* By default, only probe PCI (can be overridden by rtas_pci) */
+ pci_add_flags(PCI_PROBE_ONLY);
+
+ /* Find and initialize PCI host bridges */
+ init_pci_config_tokens();
+ find_and_init_phbs();
+ of_reconfig_notifier_register(&pci_dn_reconfig_nb);
+
+ pSeries_nvram_init();
+
+ if (firmware_has_feature(FW_FEATURE_LPAR)) {
+ vpa_init(boot_cpuid);
+
+ if (lppaca_shared_proc(get_lppaca()))
+ static_branch_enable(&shared_processor);
+
+ ppc_md.power_save = pseries_lpar_idle;
+ ppc_md.enable_pmcs = pseries_lpar_enable_pmcs;
+#ifdef CONFIG_PCI_IOV
+ ppc_md.pcibios_fixup_resources =
+ pseries_pci_fixup_resources;
+ ppc_md.pcibios_fixup_sriov =
+ pseries_pci_fixup_iov_resources;
+ ppc_md.pcibios_iov_resource_alignment =
+ pseries_pci_iov_resource_alignment;
+#endif
+ } else {
+ /* No special idle routine */
+ ppc_md.enable_pmcs = power4_enable_pmcs;
+ }
+
+ ppc_md.pcibios_root_bridge_prepare = pseries_root_bridge_prepare;
+}
+
+static void pseries_panic(char *str)
+{
+ panic_flush_kmsg_end();
+ rtas_os_term(str);
+}
+
+static int __init pSeries_init_panel(void)
+{
+ /* Manually leave the kernel version on the panel. */
+#ifdef __BIG_ENDIAN__
+ ppc_md.progress("Linux ppc64\n", 0);
+#else
+ ppc_md.progress("Linux ppc64le\n", 0);
+#endif
+ ppc_md.progress(init_utsname()->version, 0);
+
+ return 0;
+}
+machine_arch_initcall(pseries, pSeries_init_panel);
+
+static int pseries_set_dabr(unsigned long dabr, unsigned long dabrx)
+{
+ return plpar_hcall_norets(H_SET_DABR, dabr);
+}
+
+static int pseries_set_xdabr(unsigned long dabr, unsigned long dabrx)
+{
+ /* Have to set at least one bit in the DABRX according to PAPR */
+ if (dabrx == 0 && dabr == 0)
+ dabrx = DABRX_USER;
+ /* PAPR says we can only set kernel and user bits */
+ dabrx &= DABRX_KERNEL | DABRX_USER;
+
+ return plpar_hcall_norets(H_SET_XDABR, dabr, dabrx);
+}
+
+static int pseries_set_dawr(unsigned long dawr, unsigned long dawrx)
+{
+ /* PAPR says we can't set HYP */
+ dawrx &= ~DAWRX_HYP;
+
+ return plpar_set_watchpoint0(dawr, dawrx);
+}
+
+#define CMO_CHARACTERISTICS_TOKEN 44
+#define CMO_MAXLENGTH 1026
+
+void pSeries_coalesce_init(void)
+{
+ struct hvcall_mpp_x_data mpp_x_data;
+
+ if (firmware_has_feature(FW_FEATURE_CMO) && !h_get_mpp_x(&mpp_x_data))
+ powerpc_firmware_features |= FW_FEATURE_XCMO;
+ else
+ powerpc_firmware_features &= ~FW_FEATURE_XCMO;
+}
+
+/**
+ * pSeries_cmo_feature_init - FW_FEATURE_CMO is not stored in
+ * ibm,hypertas-functions, so handle that here.
+ * (Stolen from parse_system_parameter_string)
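+ *
+ * The returned parameter string is a comma-separated list of key=value
+ * pairs, e.g. "CMOPageSize=12,PrPSP=50,SecPSP=5" (the values here are
+ * only illustrative); CMOPageSize is the log2 of the CMO page size.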
+ */
+static void pSeries_cmo_feature_init(void)
+{
+ char *ptr, *key, *value, *end;
+ int call_status;
+ int page_order = IOMMU_PAGE_SHIFT_4K;
+
+ pr_debug(" -> fw_cmo_feature_init()\n");
+ spin_lock(&rtas_data_buf_lock);
+ memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
+ call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+ NULL,
+ CMO_CHARACTERISTICS_TOKEN,
+ __pa(rtas_data_buf),
+ RTAS_DATA_BUF_SIZE);
+
+ if (call_status != 0) {
+ spin_unlock(&rtas_data_buf_lock);
+ pr_debug("CMO not available\n");
+ pr_debug(" <- fw_cmo_feature_init()\n");
+ return;
+ }
+
+ end = rtas_data_buf + CMO_MAXLENGTH - 2;
+ ptr = rtas_data_buf + 2; /* step over strlen value */
+ key = value = ptr;
+
+ while (*ptr && (ptr <= end)) {
+ /* Separate the key and value by replacing '=' with '\0' and
+ * point the value at the string after the '='
+ */
+ if (ptr[0] == '=') {
+ ptr[0] = '\0';
+ value = ptr + 1;
+ } else if (ptr[0] == '\0' || ptr[0] == ',') {
+ /* Terminate the string containing the key/value pair */
+ ptr[0] = '\0';
+
+ if (key == value) {
+ pr_debug("Malformed key/value pair\n");
+ /* Never found a '=', end processing */
+ break;
+ }
+
+ if (0 == strcmp(key, "CMOPageSize"))
+ page_order = simple_strtol(value, NULL, 10);
+ else if (0 == strcmp(key, "PrPSP"))
+ CMO_PrPSP = simple_strtol(value, NULL, 10);
+ else if (0 == strcmp(key, "SecPSP"))
+ CMO_SecPSP = simple_strtol(value, NULL, 10);
+ value = key = ptr + 1;
+ }
+ ptr++;
+ }
+
+	/* The page size is returned as a power of two; convert it to
+	 * the page size in bytes before storing it in CMO_PageSize.
+ */
+ CMO_PageSize = 1 << page_order;
+ pr_debug("CMO_PageSize = %lu\n", CMO_PageSize);
+
+ if (CMO_PrPSP != -1 || CMO_SecPSP != -1) {
+ pr_info("CMO enabled\n");
+ pr_debug("CMO enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP,
+ CMO_SecPSP);
+ powerpc_firmware_features |= FW_FEATURE_CMO;
+ pSeries_coalesce_init();
+ } else
+ pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP,
+ CMO_SecPSP);
+ spin_unlock(&rtas_data_buf_lock);
+ pr_debug(" <- fw_cmo_feature_init()\n");
+}
+
+/*
+ * Early initialization. Relocation is on but do not reference unbolted pages
+ */
+static void __init pseries_init(void)
+{
+ pr_debug(" -> pseries_init()\n");
+
+#ifdef CONFIG_HVC_CONSOLE
+ if (firmware_has_feature(FW_FEATURE_LPAR))
+ hvc_vio_init_early();
+#endif
+ if (firmware_has_feature(FW_FEATURE_XDABR))
+ ppc_md.set_dabr = pseries_set_xdabr;
+ else if (firmware_has_feature(FW_FEATURE_DABR))
+ ppc_md.set_dabr = pseries_set_dabr;
+
+ if (firmware_has_feature(FW_FEATURE_SET_MODE))
+ ppc_md.set_dawr = pseries_set_dawr;
+
+ pSeries_cmo_feature_init();
+ iommu_init_early_pSeries();
+
+ pr_debug(" <- pseries_init()\n");
+}
+
+/**
+ * pseries_power_off - tell firmware about how to power off the system.
+ *
+ * This function calls either the power-off rtas token in normal cases
+ * or the ibm,power-off-ups token (if present & requested) in case of
+ * a power failure. If power-off token is used, power on will only be
+ * possible with power button press. If ibm,power-off-ups token is used
+ * it will allow auto poweron after power is restored.
+ */
+static void pseries_power_off(void)
+{
+ int rc;
+ int rtas_poweroff_ups_token = rtas_token("ibm,power-off-ups");
+
+ if (rtas_flash_term_hook)
+ rtas_flash_term_hook(SYS_POWER_OFF);
+
+ if (rtas_poweron_auto == 0 ||
+ rtas_poweroff_ups_token == RTAS_UNKNOWN_SERVICE) {
+ rc = rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1);
+ printk(KERN_INFO "RTAS power-off returned %d\n", rc);
+ } else {
+ rc = rtas_call(rtas_poweroff_ups_token, 0, 1, NULL);
+ printk(KERN_INFO "RTAS ibm,power-off-ups returned %d\n", rc);
+ }
+ for (;;);
+}
+
+static int __init pSeries_probe(void)
+{
+ const char *dtype = of_get_property(of_root, "device_type", NULL);
+
+ if (dtype == NULL)
+ return 0;
+ if (strcmp(dtype, "chrp"))
+ return 0;
+
+ /* Cell blades firmware claims to be chrp while it's not. Until this
+ * is fixed, we need to avoid those here.
+ */
+ if (of_machine_is_compatible("IBM,CPBW-1.0") ||
+ of_machine_is_compatible("IBM,CBEA"))
+ return 0;
+
+ pm_power_off = pseries_power_off;
+
+ pr_debug("Machine is%s LPAR !\n",
+ (powerpc_firmware_features & FW_FEATURE_LPAR) ? "" : " not");
+
+ pseries_init();
+
+ return 1;
+}
+
+static int pSeries_pci_probe_mode(struct pci_bus *bus)
+{
+ if (firmware_has_feature(FW_FEATURE_LPAR))
+ return PCI_PROBE_DEVTREE;
+ return PCI_PROBE_NORMAL;
+}
+
+struct pci_controller_ops pseries_pci_controller_ops = {
+ .probe_mode = pSeries_pci_probe_mode,
+};
+
+define_machine(pseries) {
+ .name = "pSeries",
+ .probe = pSeries_probe,
+ .setup_arch = pSeries_setup_arch,
+ .init_IRQ = pseries_init_irq,
+ .show_cpuinfo = pSeries_show_cpuinfo,
+ .log_error = pSeries_log_error,
+ .pcibios_fixup = pSeries_final_fixup,
+ .restart = rtas_restart,
+ .halt = rtas_halt,
+ .panic = pseries_panic,
+ .get_boot_time = rtas_get_boot_time,
+ .get_rtc_time = rtas_get_rtc_time,
+ .set_rtc_time = rtas_set_rtc_time,
+ .calibrate_decr = generic_calibrate_decr,
+ .progress = rtas_progress,
+ .system_reset_exception = pSeries_system_reset_exception,
+ .machine_check_exception = pSeries_machine_check_exception,
+#ifdef CONFIG_KEXEC_CORE
+ .machine_kexec = pSeries_machine_kexec,
+ .kexec_cpu_down = pseries_kexec_cpu_down,
+#endif
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+ .memory_block_size = pseries_memory_block_size,
+#endif
+};
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
new file mode 100644
index 000000000..3df46123c
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -0,0 +1,292 @@
+/*
+ * SMP support for pSeries machines.
+ *
+ * Dave Engebretsen, Peter Bergner, and
+ * Mike Corrigan {engebret|bergner|mikec}@us.ibm.com
+ *
+ * Plus various changes from other IBM teams...
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/cache.h>
+#include <linux/err.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+
+#include <asm/ptrace.h>
+#include <linux/atomic.h>
+#include <asm/irq.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/smp.h>
+#include <asm/paca.h>
+#include <asm/machdep.h>
+#include <asm/cputable.h>
+#include <asm/firmware.h>
+#include <asm/rtas.h>
+#include <asm/vdso_datapage.h>
+#include <asm/cputhreads.h>
+#include <asm/xics.h>
+#include <asm/xive.h>
+#include <asm/dbell.h>
+#include <asm/plpar_wrappers.h>
+#include <asm/code-patching.h>
+
+#include "pseries.h"
+#include "offline_states.h"
+
+
+/*
+ * The Primary thread of each non-boot processor was started from the OF client
+ * interface by prom_hold_cpus and is spinning on secondary_hold_spinloop.
+ */
+static cpumask_var_t of_spin_mask;
+
+/* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */
+int smp_query_cpu_stopped(unsigned int pcpu)
+{
+ int cpu_status, status;
+ int qcss_tok = rtas_token("query-cpu-stopped-state");
+
+ if (qcss_tok == RTAS_UNKNOWN_SERVICE) {
+ printk_once(KERN_INFO
+ "Firmware doesn't support query-cpu-stopped-state\n");
+ return QCSS_HARDWARE_ERROR;
+ }
+
+ status = rtas_call(qcss_tok, 1, 2, &cpu_status, pcpu);
+ if (status != 0) {
+ printk(KERN_ERR
+ "RTAS query-cpu-stopped-state failed: %i\n", status);
+ return status;
+ }
+
+ return cpu_status;
+}
+
+/**
+ * smp_startup_cpu() - start the given cpu
+ *
+ * At boot time, there is nothing to do for primary threads which were
+ * started from Open Firmware. For anything else, call RTAS with the
+ * appropriate start location.
+ *
+ * Returns:
+ * 0 - failure
+ * 1 - success
+ */
+static inline int smp_startup_cpu(unsigned int lcpu)
+{
+ int status;
+ unsigned long start_here =
+ __pa(ppc_function_entry(generic_secondary_smp_init));
+ unsigned int pcpu;
+ int start_cpu;
+
+ if (cpumask_test_cpu(lcpu, of_spin_mask))
+ /* Already started by OF and sitting in spin loop */
+ return 1;
+
+ pcpu = get_hard_smp_processor_id(lcpu);
+
+	/* Check to see if the CPU is already out of firmware (e.g. for kexec) */
+ if (smp_query_cpu_stopped(pcpu) == QCSS_NOT_STOPPED){
+ cpumask_set_cpu(lcpu, of_spin_mask);
+ return 1;
+ }
+
+ /* Fixup atomic count: it exited inside IRQ handler. */
+ task_thread_info(paca_ptrs[lcpu]->__current)->preempt_count = 0;
+#ifdef CONFIG_HOTPLUG_CPU
+ if (get_cpu_current_state(lcpu) == CPU_STATE_INACTIVE)
+ goto out;
+#endif
+ /*
+ * If the RTAS start-cpu token does not exist then presume the
+ * cpu is already spinning.
+ */
+ start_cpu = rtas_token("start-cpu");
+ if (start_cpu == RTAS_UNKNOWN_SERVICE)
+ return 1;
+
+ status = rtas_call(start_cpu, 3, 1, NULL, pcpu, start_here, pcpu);
+ if (status != 0) {
+ printk(KERN_ERR "start-cpu failed: %i\n", status);
+ return 0;
+ }
+
+#ifdef CONFIG_HOTPLUG_CPU
+out:
+#endif
+ return 1;
+}
+
+static void smp_setup_cpu(int cpu)
+{
+ if (xive_enabled())
+ xive_smp_setup_cpu();
+ else if (cpu != boot_cpuid)
+ xics_setup_cpu();
+
+ if (firmware_has_feature(FW_FEATURE_SPLPAR))
+ vpa_init(cpu);
+
+ cpumask_clear_cpu(cpu, of_spin_mask);
+#ifdef CONFIG_HOTPLUG_CPU
+ set_cpu_current_state(cpu, CPU_STATE_ONLINE);
+ set_default_offline_state(cpu);
+#endif
+}
+
+static int smp_pSeries_kick_cpu(int nr)
+{
+ if (nr < 0 || nr >= nr_cpu_ids)
+ return -EINVAL;
+
+ if (!smp_startup_cpu(nr))
+ return -ENOENT;
+
+ /*
+ * The processor is currently spinning, waiting for the
+	 * cpu_start field to become non-zero. After we set cpu_start,
+	 * the processor will continue on to secondary_start.
+ */
+ paca_ptrs[nr]->cpu_start = 1;
+#ifdef CONFIG_HOTPLUG_CPU
+ set_preferred_offline_state(nr, CPU_STATE_ONLINE);
+
+ if (get_cpu_current_state(nr) == CPU_STATE_INACTIVE) {
+ long rc;
+ unsigned long hcpuid;
+
+ hcpuid = get_hard_smp_processor_id(nr);
+ rc = plpar_hcall_norets(H_PROD, hcpuid);
+ if (rc != H_SUCCESS)
+ printk(KERN_ERR "Error: Prod to wake up processor %d "
+ "Ret= %ld\n", nr, rc);
+ }
+#endif
+
+ return 0;
+}
+
+static int pseries_smp_prepare_cpu(int cpu)
+{
+ if (xive_enabled())
+ return xive_smp_prepare_cpu(cpu);
+ return 0;
+}
+
+static void smp_pseries_cause_ipi(int cpu)
+{
+ /* POWER9 should not use this handler */
+ if (doorbell_try_core_ipi(cpu))
+ return;
+
+ icp_ops->cause_ipi(cpu);
+}
+
+static int pseries_cause_nmi_ipi(int cpu)
+{
+ int hwcpu;
+
+ if (cpu == NMI_IPI_ALL_OTHERS) {
+ hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS;
+ } else {
+ if (cpu < 0) {
+ WARN_ONCE(true, "incorrect cpu parameter %d", cpu);
+ return 0;
+ }
+
+ hwcpu = get_hard_smp_processor_id(cpu);
+ }
+
+ if (plpar_signal_sys_reset(hwcpu) == H_SUCCESS)
+ return 1;
+
+ return 0;
+}
+
+static __init void pSeries_smp_probe_xics(void)
+{
+ xics_smp_probe();
+
+ if (cpu_has_feature(CPU_FTR_DBELL))
+ smp_ops->cause_ipi = smp_pseries_cause_ipi;
+ else
+ smp_ops->cause_ipi = icp_ops->cause_ipi;
+}
+
+static __init void pSeries_smp_probe(void)
+{
+ if (xive_enabled())
+ /*
+ * Don't use P9 doorbells when XIVE is enabled. IPIs
+ * using MMIOs should be faster
+ */
+ xive_smp_probe();
+ else
+ pSeries_smp_probe_xics();
+}
+
+static struct smp_ops_t pseries_smp_ops = {
+ .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */
+ .cause_ipi = NULL, /* Filled at runtime by pSeries_smp_probe() */
+ .cause_nmi_ipi = pseries_cause_nmi_ipi,
+ .probe = pSeries_smp_probe,
+ .prepare_cpu = pseries_smp_prepare_cpu,
+ .kick_cpu = smp_pSeries_kick_cpu,
+ .setup_cpu = smp_setup_cpu,
+ .cpu_bootable = smp_generic_cpu_bootable,
+};
+
+/* This is called very early */
+void __init smp_init_pseries(void)
+{
+ int i;
+
+ pr_debug(" -> smp_init_pSeries()\n");
+ smp_ops = &pseries_smp_ops;
+
+ alloc_bootmem_cpumask_var(&of_spin_mask);
+
+ /*
+ * Mark threads which are still spinning in hold loops
+ *
+ * We know prom_init will not have started them if RTAS supports
+ * query-cpu-stopped-state.
+ */
+ if (rtas_token("query-cpu-stopped-state") == RTAS_UNKNOWN_SERVICE) {
+ if (cpu_has_feature(CPU_FTR_SMT)) {
+ for_each_present_cpu(i) {
+ if (cpu_thread_in_core(i) == 0)
+ cpumask_set_cpu(i, of_spin_mask);
+ }
+ } else
+ cpumask_copy(of_spin_mask, cpu_present_mask);
+
+ cpumask_clear_cpu(boot_cpuid, of_spin_mask);
+ }
+
+ /* Non-lpar has additional take/give timebase */
+ if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) {
+ smp_ops->give_timebase = rtas_give_timebase;
+ smp_ops->take_timebase = rtas_take_timebase;
+ }
+
+ pr_debug(" <- smp_init_pSeries()\n");
+}
diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c
new file mode 100644
index 000000000..5414d3295
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/suspend.c
@@ -0,0 +1,278 @@
+/*
+ * Copyright (C) 2010 Brian King IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/suspend.h>
+#include <linux/stat.h>
+#include <asm/firmware.h>
+#include <asm/hvcall.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+#include <asm/rtas.h>
+#include <asm/topology.h>
+
+static u64 stream_id;
+static struct device suspend_dev;
+static DECLARE_COMPLETION(suspend_work);
+static struct rtas_suspend_me_data suspend_data;
+static atomic_t suspending;
+
+/**
+ * pseries_suspend_begin - First phase of hibernation
+ *
+ * Check to ensure we are in a valid state to hibernate
+ *
+ * Return value:
+ * 0 on success / other on failure
+ **/
+static int pseries_suspend_begin(suspend_state_t state)
+{
+ long vasi_state, rc;
+ unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+ /* Make sure the state is valid */
+ rc = plpar_hcall(H_VASI_STATE, retbuf, stream_id);
+
+ vasi_state = retbuf[0];
+
+ if (rc) {
+ pr_err("pseries_suspend_begin: vasi_state returned %ld\n",rc);
+ return rc;
+ } else if (vasi_state == H_VASI_ENABLED) {
+ return -EAGAIN;
+ } else if (vasi_state != H_VASI_SUSPENDING) {
+ pr_err("pseries_suspend_begin: vasi_state returned state %ld\n",
+ vasi_state);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/**
+ * pseries_suspend_cpu - Suspend a single CPU
+ *
+ * Makes the H_JOIN call to suspend the CPU
+ *
+ **/
+static int pseries_suspend_cpu(void)
+{
+ if (atomic_read(&suspending))
+ return rtas_suspend_cpu(&suspend_data);
+ return 0;
+}
+
+/**
+ * pseries_suspend_enable_irqs
+ *
+ * Post suspend configuration updates
+ *
+ **/
+static void pseries_suspend_enable_irqs(void)
+{
+ /*
+ * Update configuration which can be modified based on device tree
+ * changes during resume.
+ */
+ post_mobility_fixup();
+}
+
+/**
+ * pseries_suspend_enter - Final phase of hibernation
+ *
+ * Return value:
+ * 0 on success / other on failure
+ **/
+static int pseries_suspend_enter(suspend_state_t state)
+{
+ int rc = rtas_suspend_last_cpu(&suspend_data);
+
+ atomic_set(&suspending, 0);
+ atomic_set(&suspend_data.done, 1);
+ return rc;
+}
+
+/**
+ * pseries_prepare_late - Prepare to suspend all other CPUs
+ *
+ * Return value:
+ * 0 on success / other on failure
+ **/
+static int pseries_prepare_late(void)
+{
+ atomic_set(&suspending, 1);
+ atomic_set(&suspend_data.working, 0);
+ atomic_set(&suspend_data.done, 0);
+ atomic_set(&suspend_data.error, 0);
+ suspend_data.complete = &suspend_work;
+ reinit_completion(&suspend_work);
+ return 0;
+}
+
+/**
+ * store_hibernate - Initiate partition hibernation
+ * @dev: subsys root device
+ * @attr: device attribute struct
+ * @buf: buffer
+ * @count: buffer size
+ *
+ * Write the stream ID received from the HMC to this file
+ * to trigger hibernating the partition
+ *
+ * Return value:
+ * number of bytes written on success / other on failure
+ **/
+static ssize_t store_hibernate(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ cpumask_var_t offline_mask;
+ int rc;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!alloc_cpumask_var(&offline_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ stream_id = simple_strtoul(buf, NULL, 16);
+
+ do {
+ rc = pseries_suspend_begin(PM_SUSPEND_MEM);
+ if (rc == -EAGAIN)
+ ssleep(1);
+ } while (rc == -EAGAIN);
+
+ if (!rc) {
+ /* All present CPUs must be online */
+ cpumask_andnot(offline_mask, cpu_present_mask,
+ cpu_online_mask);
+ rc = rtas_online_cpus_mask(offline_mask);
+ if (rc) {
+ pr_err("%s: Could not bring present CPUs online.\n",
+ __func__);
+ goto out;
+ }
+
+ stop_topology_update();
+ rc = pm_suspend(PM_SUSPEND_MEM);
+ start_topology_update();
+
+ /* Take down CPUs not online prior to suspend */
+ if (!rtas_offline_cpus_mask(offline_mask))
+ pr_warn("%s: Could not restore CPUs to offline "
+ "state.\n", __func__);
+ }
+
+ stream_id = 0;
+
+ if (!rc)
+ rc = count;
+out:
+ free_cpumask_var(offline_mask);
+ return rc;
+}
+
+#define USER_DT_UPDATE 0
+#define KERN_DT_UPDATE 1
+
+/**
+ * show_hibernate - Report device tree update responsibility
+ * @dev: subsys root device
+ * @attr: device attribute struct
+ * @buf: buffer
+ *
+ * Report whether a device tree update is performed by the kernel after a
+ * resume, or if drmgr must coordinate the update from user space.
+ *
+ * Return value:
+ * 0 if drmgr is to initiate update, and 1 otherwise
+ **/
+static ssize_t show_hibernate(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", KERN_DT_UPDATE);
+}
+
+static DEVICE_ATTR(hibernate, 0644, show_hibernate, store_hibernate);
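+
+/*
+ * With the "power" subsystem registered below this attribute is expected
+ * to show up as /sys/devices/system/power/hibernate (path assumed from
+ * subsys_system_register()); writing the HMC stream ID to it triggers
+ * store_hibernate().
+ */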
+
+static struct bus_type suspend_subsys = {
+ .name = "power",
+ .dev_name = "power",
+};
+
+static const struct platform_suspend_ops pseries_suspend_ops = {
+ .valid = suspend_valid_only_mem,
+ .prepare_late = pseries_prepare_late,
+ .enter = pseries_suspend_enter,
+};
+
+/**
+ * pseries_suspend_sysfs_register - Register with sysfs
+ *
+ * Return value:
+ * 0 on success / other on failure
+ **/
+static int pseries_suspend_sysfs_register(struct device *dev)
+{
+ int rc;
+
+ if ((rc = subsys_system_register(&suspend_subsys, NULL)))
+ return rc;
+
+ dev->id = 0;
+ dev->bus = &suspend_subsys;
+
+ if ((rc = device_create_file(suspend_subsys.dev_root, &dev_attr_hibernate)))
+ goto subsys_unregister;
+
+ return 0;
+
+subsys_unregister:
+ bus_unregister(&suspend_subsys);
+ return rc;
+}
+
+/**
+ * pseries_suspend_init - initcall for pSeries suspend
+ *
+ * Return value:
+ * 0 on success / other on failure
+ **/
+static int __init pseries_suspend_init(void)
+{
+ int rc;
+
+ if (!firmware_has_feature(FW_FEATURE_LPAR))
+ return 0;
+
+ suspend_data.token = rtas_token("ibm,suspend-me");
+ if (suspend_data.token == RTAS_UNKNOWN_SERVICE)
+ return 0;
+
+ if ((rc = pseries_suspend_sysfs_register(&suspend_dev)))
+ return rc;
+
+ ppc_md.suspend_disable_cpu = pseries_suspend_cpu;
+ ppc_md.suspend_enable_irqs = pseries_suspend_enable_irqs;
+ suspend_set_ops(&pseries_suspend_ops);
+ return 0;
+}
+machine_device_initcall(pseries, pseries_suspend_init);
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
new file mode 100644
index 000000000..0e7778be4
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -0,0 +1,1724 @@
+/*
+ * IBM PowerPC Virtual I/O Infrastructure Support.
+ *
+ * Copyright (c) 2003,2008 IBM Corp.
+ * Dave Engebretsen engebret@us.ibm.com
+ * Santiago Leon santil@us.ibm.com
+ * Hollis Blanchard <hollisb@us.ibm.com>
+ * Stephen Rothwell
+ * Robert Jennings <rcjenn@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/cpu.h>
+#include <linux/types.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/console.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/kobject.h>
+
+#include <asm/iommu.h>
+#include <asm/dma.h>
+#include <asm/vio.h>
+#include <asm/prom.h>
+#include <asm/firmware.h>
+#include <asm/tce.h>
+#include <asm/page.h>
+#include <asm/hvcall.h>
+
+static struct vio_dev vio_bus_device = { /* fake "parent" device */
+ .name = "vio",
+ .type = "",
+ .dev.init_name = "vio",
+ .dev.bus = &vio_bus_type,
+};
+
+#ifdef CONFIG_PPC_SMLPAR
+/**
+ * vio_cmo_pool - A pool of IO memory for CMO use
+ *
+ * @size: The size of the pool in bytes
+ * @free: The amount of free memory in the pool
+ */
+struct vio_cmo_pool {
+ size_t size;
+ size_t free;
+};
+
+/* How many ms to delay queued balance work */
+#define VIO_CMO_BALANCE_DELAY 100
+
+/* Portion out IO memory to CMO devices by this chunk size */
+#define VIO_CMO_BALANCE_CHUNK 131072
+
+/**
+ * vio_cmo_dev_entry - A device that is CMO-enabled and requires entitlement
+ *
+ * @vio_dev: struct vio_dev pointer
+ * @list: pointer to other devices on bus that are being tracked
+ */
+struct vio_cmo_dev_entry {
+ struct vio_dev *viodev;
+ struct list_head list;
+};
+
+/**
+ * vio_cmo - VIO bus accounting structure for CMO entitlement
+ *
+ * @lock: spinlock for entire structure
+ * @balance_q: work queue for balancing system entitlement
+ * @device_list: list of CMO-enabled devices requiring entitlement
+ * @entitled: total system entitlement in bytes
+ * @reserve: pool of memory from which devices reserve entitlement, incl. spare
+ * @excess: pool of excess entitlement not needed for device reserves or spare
+ * @spare: IO memory for device hotplug functionality
+ * @min: minimum necessary for system operation
+ * @desired: desired memory for system operation
+ * @curr: bytes currently allocated
+ * @high: high water mark for IO data usage
+ */
+static struct vio_cmo {
+ spinlock_t lock;
+ struct delayed_work balance_q;
+ struct list_head device_list;
+ size_t entitled;
+ struct vio_cmo_pool reserve;
+ struct vio_cmo_pool excess;
+ size_t spare;
+ size_t min;
+ size_t desired;
+ size_t curr;
+ size_t high;
+} vio_cmo;
+
+/**
+ * vio_cmo_num_OF_devs - Count the number of OF devices that have DMA windows
+ */
+static int vio_cmo_num_OF_devs(void)
+{
+ struct device_node *node_vroot;
+ int count = 0;
+
+ /*
+ * Count the number of vdevice entries with an
+ * ibm,my-dma-window OF property
+ */
+ node_vroot = of_find_node_by_name(NULL, "vdevice");
+ if (node_vroot) {
+ struct device_node *of_node;
+ struct property *prop;
+
+ for_each_child_of_node(node_vroot, of_node) {
+ prop = of_find_property(of_node, "ibm,my-dma-window",
+ NULL);
+ if (prop)
+ count++;
+ }
+ }
+ of_node_put(node_vroot);
+ return count;
+}
+
+/**
+ * vio_cmo_alloc - allocate IO memory for CMO-enabled devices
+ *
+ * @viodev: VIO device requesting IO memory
+ * @size: size of allocation requested
+ *
+ * Allocations come from memory reserved for the devices and any excess
+ * IO memory available to all devices. The spare pool used to service
+ * hotplug must be equal to %VIO_CMO_MIN_ENT for the excess pool to be
+ * made available.
+ *
+ * Return codes:
+ * 0 for successful allocation and -ENOMEM for a failure
+ */
+static inline int vio_cmo_alloc(struct vio_dev *viodev, size_t size)
+{
+ unsigned long flags;
+ size_t reserve_free = 0;
+ size_t excess_free = 0;
+ int ret = -ENOMEM;
+
+ spin_lock_irqsave(&vio_cmo.lock, flags);
+
+ /* Determine the amount of free entitlement available in reserve */
+ if (viodev->cmo.entitled > viodev->cmo.allocated)
+ reserve_free = viodev->cmo.entitled - viodev->cmo.allocated;
+
+ /* If spare is not fulfilled, the excess pool can not be used. */
+ if (vio_cmo.spare >= VIO_CMO_MIN_ENT)
+ excess_free = vio_cmo.excess.free;
+
+ /* The request can be satisfied */
+ if ((reserve_free + excess_free) >= size) {
+ vio_cmo.curr += size;
+ if (vio_cmo.curr > vio_cmo.high)
+ vio_cmo.high = vio_cmo.curr;
+ viodev->cmo.allocated += size;
+ size -= min(reserve_free, size);
+ vio_cmo.excess.free -= size;
+ ret = 0;
+ }
+
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+ return ret;
+}
+
+/**
+ * vio_cmo_dealloc - deallocate IO memory from CMO-enabled devices
+ * @viodev: VIO device freeing IO memory
+ * @size: size of deallocation
+ *
+ * IO memory is freed by the device back to the correct memory pools.
+ * The spare pool is replenished first from either memory pool, then
+ * the reserve pool is used to reduce device entitlement, the excess
+ * pool is used to increase the reserve pool toward the desired entitlement
+ * target, and then the remaining memory is returned to the pools.
+ *
+ */
+static inline void vio_cmo_dealloc(struct vio_dev *viodev, size_t size)
+{
+ unsigned long flags;
+ size_t spare_needed = 0;
+ size_t excess_freed = 0;
+ size_t reserve_freed = size;
+ size_t tmp;
+ int balance = 0;
+
+ spin_lock_irqsave(&vio_cmo.lock, flags);
+ vio_cmo.curr -= size;
+
+ /* Amount of memory freed from the excess pool */
+ if (viodev->cmo.allocated > viodev->cmo.entitled) {
+ excess_freed = min(reserve_freed, (viodev->cmo.allocated -
+ viodev->cmo.entitled));
+ reserve_freed -= excess_freed;
+ }
+
+ /* Remove allocation from device */
+ viodev->cmo.allocated -= (reserve_freed + excess_freed);
+
+ /* Spare is a subset of the reserve pool, replenish it first. */
+ spare_needed = VIO_CMO_MIN_ENT - vio_cmo.spare;
+
+ /*
+ * Replenish the spare in the reserve pool from the excess pool.
+ * This moves entitlement into the reserve pool.
+ */
+ if (spare_needed && excess_freed) {
+ tmp = min(excess_freed, spare_needed);
+ vio_cmo.excess.size -= tmp;
+ vio_cmo.reserve.size += tmp;
+ vio_cmo.spare += tmp;
+ excess_freed -= tmp;
+ spare_needed -= tmp;
+ balance = 1;
+ }
+
+ /*
+ * Replenish the spare in the reserve pool from the reserve pool.
+ * This removes entitlement from the device down to VIO_CMO_MIN_ENT,
+ * if needed, and gives it to the spare pool. The amount of used
+ * memory in this pool does not change.
+ */
+ if (spare_needed && reserve_freed) {
+ tmp = min3(spare_needed, reserve_freed, (viodev->cmo.entitled - VIO_CMO_MIN_ENT));
+
+ vio_cmo.spare += tmp;
+ viodev->cmo.entitled -= tmp;
+ reserve_freed -= tmp;
+ spare_needed -= tmp;
+ balance = 1;
+ }
+
+ /*
+ * Increase the reserve pool until the desired allocation is met.
+ * Move an allocation freed from the excess pool into the reserve
+ * pool and schedule a balance operation.
+ */
+ if (excess_freed && (vio_cmo.desired > vio_cmo.reserve.size)) {
+ tmp = min(excess_freed, (vio_cmo.desired - vio_cmo.reserve.size));
+
+ vio_cmo.excess.size -= tmp;
+ vio_cmo.reserve.size += tmp;
+ excess_freed -= tmp;
+ balance = 1;
+ }
+
+	/* Return any remaining freed excess memory to the excess pool */
+ if (excess_freed)
+ vio_cmo.excess.free += excess_freed;
+
+ if (balance)
+ schedule_delayed_work(&vio_cmo.balance_q, VIO_CMO_BALANCE_DELAY);
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+/**
+ * vio_cmo_entitlement_update - Manage system entitlement changes
+ *
+ * @new_entitlement: new system entitlement to attempt to accommodate
+ *
+ * Increases in entitlement will be used to fulfill the spare entitlement
+ * and the rest is given to the excess pool. Decreases, if they are
+ * possible, come from the excess pool and from unused device entitlement.
+ *
+ * Returns: 0 on success, -ENOMEM when the change cannot be made
+ */
+int vio_cmo_entitlement_update(size_t new_entitlement)
+{
+ struct vio_dev *viodev;
+ struct vio_cmo_dev_entry *dev_ent;
+ unsigned long flags;
+ size_t avail, delta, tmp;
+
+ spin_lock_irqsave(&vio_cmo.lock, flags);
+
+ /* Entitlement increases */
+ if (new_entitlement > vio_cmo.entitled) {
+ delta = new_entitlement - vio_cmo.entitled;
+
+ /* Fulfill spare allocation */
+ if (vio_cmo.spare < VIO_CMO_MIN_ENT) {
+ tmp = min(delta, (VIO_CMO_MIN_ENT - vio_cmo.spare));
+ vio_cmo.spare += tmp;
+ vio_cmo.reserve.size += tmp;
+ delta -= tmp;
+ }
+
+ /* Remaining new allocation goes to the excess pool */
+ vio_cmo.entitled += delta;
+ vio_cmo.excess.size += delta;
+ vio_cmo.excess.free += delta;
+
+ goto out;
+ }
+
+ /* Entitlement decreases */
+ delta = vio_cmo.entitled - new_entitlement;
+ avail = vio_cmo.excess.free;
+
+ /*
+ * Need to check how much unused entitlement each device can
+ * sacrifice to fulfill entitlement change.
+ */
+ list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+ if (avail >= delta)
+ break;
+
+ viodev = dev_ent->viodev;
+ if ((viodev->cmo.entitled > viodev->cmo.allocated) &&
+ (viodev->cmo.entitled > VIO_CMO_MIN_ENT))
+ avail += viodev->cmo.entitled -
+ max_t(size_t, viodev->cmo.allocated,
+ VIO_CMO_MIN_ENT);
+ }
+
+ if (delta <= avail) {
+ vio_cmo.entitled -= delta;
+
+ /* Take entitlement from the excess pool first */
+ tmp = min(vio_cmo.excess.free, delta);
+ vio_cmo.excess.size -= tmp;
+ vio_cmo.excess.free -= tmp;
+ delta -= tmp;
+
+ /*
+ * Remove all but VIO_CMO_MIN_ENT bytes from devices
+ * until entitlement change is served
+ */
+ list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+ if (!delta)
+ break;
+
+ viodev = dev_ent->viodev;
+ tmp = 0;
+ if ((viodev->cmo.entitled > viodev->cmo.allocated) &&
+ (viodev->cmo.entitled > VIO_CMO_MIN_ENT))
+ tmp = viodev->cmo.entitled -
+ max_t(size_t, viodev->cmo.allocated,
+ VIO_CMO_MIN_ENT);
+ viodev->cmo.entitled -= min(tmp, delta);
+ delta -= min(tmp, delta);
+ }
+ } else {
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+ return -ENOMEM;
+ }
+
+out:
+ schedule_delayed_work(&vio_cmo.balance_q, 0);
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+ return 0;
+}
+
+/**
+ * vio_cmo_balance - Balance entitlement among devices
+ *
+ * @work: work queue structure for this operation
+ *
+ * Any system entitlement above the minimum needed for devices, or
+ * already allocated to devices, can be distributed to the devices.
+ * The list of devices is iterated through to recalculate the desired
+ * entitlement level and to determine how much entitlement above the
+ * minimum entitlement is allocated to devices.
+ *
+ * Small chunks of the available entitlement are given to devices until
+ * their requirements are fulfilled or there is no entitlement left to give.
+ * Upon completion sizes of the reserve and excess pools are calculated.
+ *
+ * The system minimum entitlement level is also recalculated here.
+ * Entitlement will be reserved for devices even after vio_bus_remove to
+ * accommodate reloading the driver. The OF tree is walked to count the
+ * number of devices present and this will remove entitlement for devices
+ * that have actually left the system after having vio_bus_remove called.
+ */
+static void vio_cmo_balance(struct work_struct *work)
+{
+ struct vio_cmo *cmo;
+ struct vio_dev *viodev;
+ struct vio_cmo_dev_entry *dev_ent;
+ unsigned long flags;
+ size_t avail = 0, level, chunk, need;
+ int devcount = 0, fulfilled;
+
+ cmo = container_of(work, struct vio_cmo, balance_q.work);
+
+ spin_lock_irqsave(&vio_cmo.lock, flags);
+
+ /* Calculate minimum entitlement and fulfill spare */
+ cmo->min = vio_cmo_num_OF_devs() * VIO_CMO_MIN_ENT;
+ BUG_ON(cmo->min > cmo->entitled);
+ cmo->spare = min_t(size_t, VIO_CMO_MIN_ENT, (cmo->entitled - cmo->min));
+ cmo->min += cmo->spare;
+ cmo->desired = cmo->min;
+
+ /*
+ * Determine how much entitlement is available and reset device
+ * entitlements
+ */
+ avail = cmo->entitled - cmo->spare;
+ list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+ viodev = dev_ent->viodev;
+ devcount++;
+ viodev->cmo.entitled = VIO_CMO_MIN_ENT;
+ cmo->desired += (viodev->cmo.desired - VIO_CMO_MIN_ENT);
+ avail -= max_t(size_t, viodev->cmo.allocated, VIO_CMO_MIN_ENT);
+ }
+
+ /*
+ * Having provided each device with the minimum entitlement, loop
+ * over the devices portioning out the remaining entitlement
+ * until there is nothing left.
+ */
+ level = VIO_CMO_MIN_ENT;
+ while (avail) {
+ fulfilled = 0;
+ list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+ viodev = dev_ent->viodev;
+
+ if (viodev->cmo.desired <= level) {
+ fulfilled++;
+ continue;
+ }
+
+ /*
+ * Give the device up to VIO_CMO_BALANCE_CHUNK
+ * bytes of entitlement, but do not exceed the
+ * desired level of entitlement for the device.
+ */
+ chunk = min_t(size_t, avail, VIO_CMO_BALANCE_CHUNK);
+ chunk = min(chunk, (viodev->cmo.desired -
+ viodev->cmo.entitled));
+ viodev->cmo.entitled += chunk;
+
+ /*
+ * If the memory for this entitlement increase was
+ * already allocated to the device it does not come
+ * from the available pool being portioned out.
+ */
+			need = max(viodev->cmo.allocated, viodev->cmo.entitled) -
+			       max(viodev->cmo.allocated, level);
+ avail -= need;
+
+ }
+ if (fulfilled == devcount)
+ break;
+ level += VIO_CMO_BALANCE_CHUNK;
+ }
+
+ /* Calculate new reserve and excess pool sizes */
+ cmo->reserve.size = cmo->min;
+ cmo->excess.free = 0;
+ cmo->excess.size = 0;
+ need = 0;
+ list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+ viodev = dev_ent->viodev;
+ /* Calculated reserve size above the minimum entitlement */
+ if (viodev->cmo.entitled)
+ cmo->reserve.size += (viodev->cmo.entitled -
+ VIO_CMO_MIN_ENT);
+ /* Calculated used excess entitlement */
+ if (viodev->cmo.allocated > viodev->cmo.entitled)
+ need += viodev->cmo.allocated - viodev->cmo.entitled;
+ }
+ cmo->excess.size = cmo->entitled - cmo->reserve.size;
+ cmo->excess.free = cmo->excess.size - need;
+
+ cancel_delayed_work(to_delayed_work(work));
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flag,
+ unsigned long attrs)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+ void *ret;
+
+ if (vio_cmo_alloc(viodev, roundup(size, PAGE_SIZE))) {
+ atomic_inc(&viodev->cmo.allocs_failed);
+ return NULL;
+ }
+
+ ret = dma_iommu_ops.alloc(dev, size, dma_handle, flag, attrs);
+ if (unlikely(ret == NULL)) {
+ vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE));
+ atomic_inc(&viodev->cmo.allocs_failed);
+ }
+
+ return ret;
+}
+
+static void vio_dma_iommu_free_coherent(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t dma_handle,
+ unsigned long attrs)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+
+ dma_iommu_ops.free(dev, size, vaddr, dma_handle, attrs);
+
+ vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE));
+}
+
+static dma_addr_t vio_dma_iommu_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction direction,
+ unsigned long attrs)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+ struct iommu_table *tbl;
+ dma_addr_t ret = IOMMU_MAPPING_ERROR;
+
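+	/*
+	 * Reserve CMO entitlement for the full IOMMU pages covering the
+	 * mapping before attempting the DMA map itself.
+	 */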
+ tbl = get_iommu_table_base(dev);
+ if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)))) {
+ atomic_inc(&viodev->cmo.allocs_failed);
+ return ret;
+ }
+
+ ret = dma_iommu_ops.map_page(dev, page, offset, size, direction, attrs);
+ if (unlikely(dma_mapping_error(dev, ret))) {
+ vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
+ atomic_inc(&viodev->cmo.allocs_failed);
+ }
+
+ return ret;
+}
+
+static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle,
+ size_t size,
+ enum dma_data_direction direction,
+ unsigned long attrs)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+ struct iommu_table *tbl;
+
+ tbl = get_iommu_table_base(dev);
+ dma_iommu_ops.unmap_page(dev, dma_handle, size, direction, attrs);
+
+ vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
+}
+
+static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
+ int nelems, enum dma_data_direction direction,
+ unsigned long attrs)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+ struct iommu_table *tbl;
+ struct scatterlist *sgl;
+ int ret, count;
+ size_t alloc_size = 0;
+
+ tbl = get_iommu_table_base(dev);
+ for_each_sg(sglist, sgl, nelems, count)
+ alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl));
+
+ if (vio_cmo_alloc(viodev, alloc_size)) {
+ atomic_inc(&viodev->cmo.allocs_failed);
+ return 0;
+ }
+
+ ret = dma_iommu_ops.map_sg(dev, sglist, nelems, direction, attrs);
+
+ if (unlikely(!ret)) {
+ vio_cmo_dealloc(viodev, alloc_size);
+ atomic_inc(&viodev->cmo.allocs_failed);
+ return ret;
+ }
+
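+	/*
+	 * The IOMMU may have coalesced entries; release the entitlement
+	 * reserved for any pages that were not actually mapped.
+	 */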
+ for_each_sg(sglist, sgl, ret, count)
+ alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
+ if (alloc_size)
+ vio_cmo_dealloc(viodev, alloc_size);
+
+ return ret;
+}
+
+static void vio_dma_iommu_unmap_sg(struct device *dev,
+ struct scatterlist *sglist, int nelems,
+ enum dma_data_direction direction,
+ unsigned long attrs)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+ struct iommu_table *tbl;
+ struct scatterlist *sgl;
+ size_t alloc_size = 0;
+ int count;
+
+ tbl = get_iommu_table_base(dev);
+ for_each_sg(sglist, sgl, nelems, count)
+ alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
+
+ dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs);
+
+ vio_cmo_dealloc(viodev, alloc_size);
+}
+
+static int vio_dma_iommu_dma_supported(struct device *dev, u64 mask)
+{
+ return dma_iommu_ops.dma_supported(dev, mask);
+}
+
+static u64 vio_dma_get_required_mask(struct device *dev)
+{
+ return dma_iommu_ops.get_required_mask(dev);
+}
+
+static const struct dma_map_ops vio_dma_mapping_ops = {
+ .alloc = vio_dma_iommu_alloc_coherent,
+ .free = vio_dma_iommu_free_coherent,
+ .mmap = dma_nommu_mmap_coherent,
+ .map_sg = vio_dma_iommu_map_sg,
+ .unmap_sg = vio_dma_iommu_unmap_sg,
+ .map_page = vio_dma_iommu_map_page,
+ .unmap_page = vio_dma_iommu_unmap_page,
+ .dma_supported = vio_dma_iommu_dma_supported,
+ .get_required_mask = vio_dma_get_required_mask,
+ .mapping_error = dma_iommu_mapping_error,
+};
+
+/**
+ * vio_cmo_set_dev_desired - Set desired entitlement for a device
+ *
+ * @viodev: struct vio_dev for device to alter
+ * @desired: new desired entitlement level in bytes
+ *
+ * For use by devices to request a change to their entitlement at runtime or
+ * through sysfs. The desired entitlement level is changed and a balancing
+ * of system resources is scheduled to run in the future.
+ */
+void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired)
+{
+ unsigned long flags;
+ struct vio_cmo_dev_entry *dev_ent;
+ int found = 0;
+
+ if (!firmware_has_feature(FW_FEATURE_CMO))
+ return;
+
+ spin_lock_irqsave(&vio_cmo.lock, flags);
+ if (desired < VIO_CMO_MIN_ENT)
+ desired = VIO_CMO_MIN_ENT;
+
+ /*
+ * Changes will not be made for devices not in the device list.
+ * If it is not in the device list, then no driver is loaded
+ * for the device and it can not receive entitlement.
+ */
+ list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
+ if (viodev == dev_ent->viodev) {
+ found = 1;
+ break;
+ }
+ if (!found) {
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+ return;
+ }
+
+ /* Increase/decrease in desired device entitlement */
+ if (desired >= viodev->cmo.desired) {
+		/* Just bump the bus and device values prior to a balance */
+ vio_cmo.desired += desired - viodev->cmo.desired;
+ viodev->cmo.desired = desired;
+ } else {
+ /* Decrease bus and device values for desired entitlement */
+ vio_cmo.desired -= viodev->cmo.desired - desired;
+ viodev->cmo.desired = desired;
+ /*
+ * If less entitlement is desired than current entitlement, move
+ * any reserve memory in the change region to the excess pool.
+ */
+ if (viodev->cmo.entitled > desired) {
+ vio_cmo.reserve.size -= viodev->cmo.entitled - desired;
+ vio_cmo.excess.size += viodev->cmo.entitled - desired;
+ /*
+ * If entitlement moving from the reserve pool to the
+ * excess pool is currently unused, add to the excess
+ * free counter.
+ */
+ if (viodev->cmo.allocated < viodev->cmo.entitled)
+ vio_cmo.excess.free += viodev->cmo.entitled -
+ max(viodev->cmo.allocated, desired);
+ viodev->cmo.entitled = desired;
+ }
+ }
+ schedule_delayed_work(&vio_cmo.balance_q, 0);
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+/**
+ * vio_cmo_bus_probe - Handle CMO specific bus probe activities
+ *
+ * @viodev: Pointer to struct vio_dev for device
+ *
+ * Determine the device's IO memory entitlement needs, attempting
+ * to satisfy the system minimum entitlement at first and scheduling
+ * a balance operation to take care of the rest at a later time.
+ *
+ * Returns: 0 on success, -EINVAL when device doesn't support CMO, and
+ * -ENOMEM when entitlement is not available for device or
+ * device entry.
+ *
+ */
+static int vio_cmo_bus_probe(struct vio_dev *viodev)
+{
+ struct vio_cmo_dev_entry *dev_ent;
+ struct device *dev = &viodev->dev;
+ struct iommu_table *tbl;
+ struct vio_driver *viodrv = to_vio_driver(dev->driver);
+ unsigned long flags;
+ size_t size;
+ bool dma_capable = false;
+
+ tbl = get_iommu_table_base(dev);
+
+ /* A device requires entitlement if it has a DMA window property */
+ switch (viodev->family) {
+ case VDEVICE:
+ if (of_get_property(viodev->dev.of_node,
+ "ibm,my-dma-window", NULL))
+ dma_capable = true;
+ break;
+ case PFO:
+ dma_capable = false;
+ break;
+ default:
+ dev_warn(dev, "unknown device family: %d\n", viodev->family);
+ BUG();
+ break;
+ }
+
+ /* Configure entitlement for the device. */
+ if (dma_capable) {
+ /* Check that the driver is CMO enabled and get desired DMA */
+ if (!viodrv->get_desired_dma) {
+ dev_err(dev, "%s: device driver does not support CMO\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ viodev->cmo.desired =
+ IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev), tbl);
+ if (viodev->cmo.desired < VIO_CMO_MIN_ENT)
+ viodev->cmo.desired = VIO_CMO_MIN_ENT;
+ size = VIO_CMO_MIN_ENT;
+
+ dev_ent = kmalloc(sizeof(struct vio_cmo_dev_entry),
+ GFP_KERNEL);
+ if (!dev_ent)
+ return -ENOMEM;
+
+ dev_ent->viodev = viodev;
+ spin_lock_irqsave(&vio_cmo.lock, flags);
+ list_add(&dev_ent->list, &vio_cmo.device_list);
+ } else {
+ viodev->cmo.desired = 0;
+ size = 0;
+ spin_lock_irqsave(&vio_cmo.lock, flags);
+ }
+
+ /*
+ * If the needs for vio_cmo.min have not changed since they
+ * were last set, the number of devices in the OF tree has
+ * been constant and the IO memory for this is already in
+ * the reserve pool.
+ */
+ if (vio_cmo.min == ((vio_cmo_num_OF_devs() + 1) *
+ VIO_CMO_MIN_ENT)) {
+		/* Update desired entitlement if the device requires it */
+ if (size)
+ vio_cmo.desired += (viodev->cmo.desired -
+ VIO_CMO_MIN_ENT);
+ } else {
+ size_t tmp;
+
+ tmp = vio_cmo.spare + vio_cmo.excess.free;
+ if (tmp < size) {
+ dev_err(dev, "%s: insufficient free "
+ "entitlement to add device. "
+ "Need %lu, have %lu\n", __func__,
+				size, tmp);
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+ return -ENOMEM;
+ }
+
+ /* Use excess pool first to fulfill request */
+ tmp = min(size, vio_cmo.excess.free);
+ vio_cmo.excess.free -= tmp;
+ vio_cmo.excess.size -= tmp;
+ vio_cmo.reserve.size += tmp;
+
+ /* Use spare if excess pool was insufficient */
+ vio_cmo.spare -= size - tmp;
+
+ /* Update bus accounting */
+ vio_cmo.min += size;
+ vio_cmo.desired += viodev->cmo.desired;
+ }
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+ return 0;
+}
+
+/**
+ * vio_cmo_bus_remove - Handle CMO specific bus removal activities
+ *
+ * @viodev: Pointer to struct vio_dev for device
+ *
+ * Remove the device from the cmo device list. The minimum entitlement
+ * will be reserved for the device as long as it is in the system. The
+ * rest of the entitlement allocated to the device will be returned
+ * to the system.
+ */
+static void vio_cmo_bus_remove(struct vio_dev *viodev)
+{
+ struct vio_cmo_dev_entry *dev_ent;
+ unsigned long flags;
+ size_t tmp;
+
+ spin_lock_irqsave(&vio_cmo.lock, flags);
+ if (viodev->cmo.allocated) {
+ dev_err(&viodev->dev, "%s: device had %lu bytes of IO "
+ "allocated after remove operation.\n",
+ __func__, viodev->cmo.allocated);
+ BUG();
+ }
+
+ /*
+ * Remove the device from the device list being maintained for
+ * CMO enabled devices.
+ */
+ list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
+ if (viodev == dev_ent->viodev) {
+ list_del(&dev_ent->list);
+ kfree(dev_ent);
+ break;
+ }
+
+ /*
+ * Devices may not require any entitlement and they do not need
+ * to be processed. Otherwise, return the device's entitlement
+ * back to the pools.
+ */
+ if (viodev->cmo.entitled) {
+ /*
+		 * This device has not yet left the OF tree; its
+ * minimum entitlement remains in vio_cmo.min and
+ * vio_cmo.desired
+ */
+ vio_cmo.desired -= (viodev->cmo.desired - VIO_CMO_MIN_ENT);
+
+ /*
+ * Save min allocation for device in reserve as long
+ * as it exists in OF tree as determined by later
+ * balance operation
+ */
+ viodev->cmo.entitled -= VIO_CMO_MIN_ENT;
+
+ /* Replenish spare from freed reserve pool */
+ if (viodev->cmo.entitled && (vio_cmo.spare < VIO_CMO_MIN_ENT)) {
+ tmp = min(viodev->cmo.entitled, (VIO_CMO_MIN_ENT -
+ vio_cmo.spare));
+ vio_cmo.spare += tmp;
+ viodev->cmo.entitled -= tmp;
+ }
+
+ /* Remaining reserve goes to excess pool */
+ vio_cmo.excess.size += viodev->cmo.entitled;
+ vio_cmo.excess.free += viodev->cmo.entitled;
+ vio_cmo.reserve.size -= viodev->cmo.entitled;
+
+ /*
+ * Until the device is removed it will keep a
+ * minimum entitlement; this will guarantee that
+		 * a module unload/load cycle will succeed.
+ */
+ viodev->cmo.entitled = VIO_CMO_MIN_ENT;
+ viodev->cmo.desired = VIO_CMO_MIN_ENT;
+ atomic_set(&viodev->cmo.allocs_failed, 0);
+ }
+
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+static void vio_cmo_set_dma_ops(struct vio_dev *viodev)
+{
+ set_dma_ops(&viodev->dev, &vio_dma_mapping_ops);
+}
+
+/**
+ * vio_cmo_bus_init - CMO entitlement initialization at bus init time
+ *
+ * Set up the reserve and excess entitlement pools based on available
+ * system entitlement and the number of devices in the OF tree that
+ * require entitlement in the reserve pool.
+ */
+static void vio_cmo_bus_init(void)
+{
+ struct hvcall_mpp_data mpp_data;
+ int err;
+
+ memset(&vio_cmo, 0, sizeof(struct vio_cmo));
+ spin_lock_init(&vio_cmo.lock);
+ INIT_LIST_HEAD(&vio_cmo.device_list);
+ INIT_DELAYED_WORK(&vio_cmo.balance_q, vio_cmo_balance);
+
+ /* Get current system entitlement */
+ err = h_get_mpp(&mpp_data);
+
+ /*
+ * On failure, continue with entitlement set to 0, will panic()
+ * later when spare is reserved.
+ */
+ if (err != H_SUCCESS) {
+		printk(KERN_ERR "%s: unable to determine system IO "
+ "entitlement. (%d)\n", __func__, err);
+ vio_cmo.entitled = 0;
+ } else {
+ vio_cmo.entitled = mpp_data.entitled_mem;
+ }
+
+ /* Set reservation and check against entitlement */
+ vio_cmo.spare = VIO_CMO_MIN_ENT;
+ vio_cmo.reserve.size = vio_cmo.spare;
+ vio_cmo.reserve.size += (vio_cmo_num_OF_devs() *
+ VIO_CMO_MIN_ENT);
+ if (vio_cmo.reserve.size > vio_cmo.entitled) {
+ printk(KERN_ERR "%s: insufficient system entitlement\n",
+ __func__);
+ panic("%s: Insufficient system entitlement", __func__);
+ }
+
+ /* Set the remaining accounting variables */
+ vio_cmo.excess.size = vio_cmo.entitled - vio_cmo.reserve.size;
+ vio_cmo.excess.free = vio_cmo.excess.size;
+ vio_cmo.min = vio_cmo.reserve.size;
+ vio_cmo.desired = vio_cmo.reserve.size;
+}
+
+/* sysfs device functions and data structures for CMO */
+
+#define viodev_cmo_rd_attr(name) \
+static ssize_t cmo_##name##_show(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+{ \
+ return sprintf(buf, "%lu\n", to_vio_dev(dev)->cmo.name); \
+}
+
+static ssize_t cmo_allocs_failed_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+ return sprintf(buf, "%d\n", atomic_read(&viodev->cmo.allocs_failed));
+}
+
+static ssize_t cmo_allocs_failed_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+ atomic_set(&viodev->cmo.allocs_failed, 0);
+ return count;
+}
+
+static ssize_t cmo_desired_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+ size_t new_desired;
+ int ret;
+
+ ret = kstrtoul(buf, 10, &new_desired);
+ if (ret)
+ return ret;
+
+ vio_cmo_set_dev_desired(viodev, new_desired);
+ return count;
+}
+
+viodev_cmo_rd_attr(desired);
+viodev_cmo_rd_attr(entitled);
+viodev_cmo_rd_attr(allocated);
+
+static ssize_t name_show(struct device *, struct device_attribute *, char *);
+static ssize_t devspec_show(struct device *, struct device_attribute *, char *);
+static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
+ char *buf);
+
+static struct device_attribute dev_attr_name;
+static struct device_attribute dev_attr_devspec;
+static struct device_attribute dev_attr_modalias;
+
+static DEVICE_ATTR_RO(cmo_entitled);
+static DEVICE_ATTR_RO(cmo_allocated);
+static DEVICE_ATTR_RW(cmo_desired);
+static DEVICE_ATTR_RW(cmo_allocs_failed);
+
+static struct attribute *vio_cmo_dev_attrs[] = {
+ &dev_attr_name.attr,
+ &dev_attr_devspec.attr,
+ &dev_attr_modalias.attr,
+ &dev_attr_cmo_entitled.attr,
+ &dev_attr_cmo_allocated.attr,
+ &dev_attr_cmo_desired.attr,
+ &dev_attr_cmo_allocs_failed.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(vio_cmo_dev);
+
+/* sysfs bus functions and data structures for CMO */
+
+#define viobus_cmo_rd_attr(name) \
+static ssize_t cmo_bus_##name##_show(struct bus_type *bt, char *buf) \
+{ \
+ return sprintf(buf, "%lu\n", vio_cmo.name); \
+} \
+static struct bus_attribute bus_attr_cmo_bus_##name = \
+ __ATTR(cmo_##name, S_IRUGO, cmo_bus_##name##_show, NULL)
+
+#define viobus_cmo_pool_rd_attr(name, var) \
+static ssize_t \
+cmo_##name##_##var##_show(struct bus_type *bt, char *buf) \
+{ \
+ return sprintf(buf, "%lu\n", vio_cmo.name.var); \
+} \
+static BUS_ATTR_RO(cmo_##name##_##var)
+
+viobus_cmo_rd_attr(entitled);
+viobus_cmo_rd_attr(spare);
+viobus_cmo_rd_attr(min);
+viobus_cmo_rd_attr(desired);
+viobus_cmo_rd_attr(curr);
+viobus_cmo_pool_rd_attr(reserve, size);
+viobus_cmo_pool_rd_attr(excess, size);
+viobus_cmo_pool_rd_attr(excess, free);
+
+static ssize_t cmo_high_show(struct bus_type *bt, char *buf)
+{
+ return sprintf(buf, "%lu\n", vio_cmo.high);
+}
+
+static ssize_t cmo_high_store(struct bus_type *bt, const char *buf,
+ size_t count)
+{
+ unsigned long flags;
+
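+	/* Any write resets the high-water mark to the current usage level. */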
+ spin_lock_irqsave(&vio_cmo.lock, flags);
+ vio_cmo.high = vio_cmo.curr;
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+
+ return count;
+}
+static BUS_ATTR_RW(cmo_high);
+
+static struct attribute *vio_bus_attrs[] = {
+ &bus_attr_cmo_bus_entitled.attr,
+ &bus_attr_cmo_bus_spare.attr,
+ &bus_attr_cmo_bus_min.attr,
+ &bus_attr_cmo_bus_desired.attr,
+ &bus_attr_cmo_bus_curr.attr,
+ &bus_attr_cmo_high.attr,
+ &bus_attr_cmo_reserve_size.attr,
+ &bus_attr_cmo_excess_size.attr,
+ &bus_attr_cmo_excess_free.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(vio_bus);
+
+static void vio_cmo_sysfs_init(void)
+{
+ vio_bus_type.dev_groups = vio_cmo_dev_groups;
+ vio_bus_type.bus_groups = vio_bus_groups;
+}
+#else /* CONFIG_PPC_SMLPAR */
+int vio_cmo_entitlement_update(size_t new_entitlement) { return 0; }
+void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired) {}
+static int vio_cmo_bus_probe(struct vio_dev *viodev) { return 0; }
+static void vio_cmo_bus_remove(struct vio_dev *viodev) {}
+static void vio_cmo_set_dma_ops(struct vio_dev *viodev) {}
+static void vio_cmo_bus_init(void) {}
+static void vio_cmo_sysfs_init(void) { }
+#endif /* CONFIG_PPC_SMLPAR */
+EXPORT_SYMBOL(vio_cmo_entitlement_update);
+EXPORT_SYMBOL(vio_cmo_set_dev_desired);
+
+
+/*
+ * Platform Facilities Option (PFO) support
+ */
+
+/**
+ * vio_h_cop_sync - Perform a synchronous PFO co-processor operation
+ *
+ * @vdev: Pointer to a struct vio_dev for device
+ * @op: Pointer to a struct vio_pfo_op for the operation parameters
+ *
+ * Calls the hypervisor to synchronously perform the PFO operation
+ * described in @op. In the case of a busy response from the hypervisor,
+ * the operation will be re-submitted indefinitely unless a non-zero timeout
+ * is specified or an error occurs. The timeout places a limit on when to
+ * stop re-submitting an operation; the total time can be exceeded if an
+ * operation is in progress.
+ *
+ * On return, op->hcall_err holds the result of the last H_COP hcall, or 0
+ * if an error not involving the hcall was encountered.
+ *
+ * Returns:
+ * 0 on success,
+ * -EINVAL if the h_call fails due to an invalid parameter,
+ * -E2BIG if the h_call can not be performed synchronously,
+ * -EBUSY if a timeout is specified and has elapsed,
+ * -EACCES if the memory area for data/status has been rescinded, or
+ * -EPERM if a hardware fault has been indicated
+ */
+int vio_h_cop_sync(struct vio_dev *vdev, struct vio_pfo_op *op)
+{
+ struct device *dev = &vdev->dev;
+ unsigned long deadline = 0;
+ long hret = 0;
+ int ret = 0;
+
+ if (op->timeout)
+ deadline = jiffies + msecs_to_jiffies(op->timeout);
+
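+	/*
+	 * Retry while the hypervisor reports a transient busy condition,
+	 * until the optional timeout expires.
+	 */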
+ while (true) {
+ hret = plpar_hcall_norets(H_COP, op->flags,
+ vdev->resource_id,
+ op->in, op->inlen, op->out,
+ op->outlen, op->csbcpb);
+
+ if (hret == H_SUCCESS ||
+ (hret != H_NOT_ENOUGH_RESOURCES &&
+ hret != H_BUSY && hret != H_RESOURCE) ||
+		    (op->timeout && time_after(jiffies, deadline)))
+ break;
+
+ dev_dbg(dev, "%s: hcall ret(%ld), retrying.\n", __func__, hret);
+ }
+
+ switch (hret) {
+ case H_SUCCESS:
+ ret = 0;
+ break;
+ case H_OP_MODE:
+ case H_TOO_BIG:
+ ret = -E2BIG;
+ break;
+ case H_RESCINDED:
+ ret = -EACCES;
+ break;
+ case H_HARDWARE:
+ ret = -EPERM;
+ break;
+ case H_NOT_ENOUGH_RESOURCES:
+ case H_RESOURCE:
+ case H_BUSY:
+ ret = -EBUSY;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ if (ret)
+ dev_dbg(dev, "%s: Sync h_cop_op failure (ret:%d) (hret:%ld)\n",
+ __func__, ret, hret);
+
+ op->hcall_err = hret;
+ return ret;
+}
+EXPORT_SYMBOL(vio_h_cop_sync);
+
+static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
+{
+ const __be32 *dma_window;
+ struct iommu_table *tbl;
+ unsigned long offset, size;
+
+ dma_window = of_get_property(dev->dev.of_node,
+ "ibm,my-dma-window", NULL);
+ if (!dma_window)
+ return NULL;
+
+ tbl = kzalloc(sizeof(*tbl), GFP_KERNEL);
+ if (tbl == NULL)
+ return NULL;
+
+ kref_init(&tbl->it_kref);
+
+ of_parse_dma_window(dev->dev.of_node, dma_window,
+ &tbl->it_index, &offset, &size);
+
+ /* TCE table size - measured in tce entries */
+ tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
+ tbl->it_size = size >> tbl->it_page_shift;
+ /* offset for VIO should always be 0 */
+ tbl->it_offset = offset >> tbl->it_page_shift;
+ tbl->it_busno = 0;
+ tbl->it_type = TCE_VB;
+ tbl->it_blocksize = 16;
+
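+	/* Pick the TCE management ops that match the underlying platform. */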
+ if (firmware_has_feature(FW_FEATURE_LPAR))
+ tbl->it_ops = &iommu_table_lpar_multi_ops;
+ else
+ tbl->it_ops = &iommu_table_pseries_ops;
+
+ return iommu_init_table(tbl, -1);
+}
+
+/**
+ * vio_match_device: - Tell if a VIO device has a matching
+ * VIO device id structure.
+ * @ids: array of VIO device id structures to search in
+ * @dev: the VIO device structure to match against
+ *
+ * Used by a driver to check whether a VIO device present in the
+ * system is in its list of supported devices. Returns the matching
+ * vio_device_id structure or NULL if there is no match.
+ */
+static const struct vio_device_id *vio_match_device(
+ const struct vio_device_id *ids, const struct vio_dev *dev)
+{
+ while (ids->type[0] != '\0') {
+ if ((strncmp(dev->type, ids->type, strlen(ids->type)) == 0) &&
+ of_device_is_compatible(dev->dev.of_node,
+ ids->compat))
+ return ids;
+ ids++;
+ }
+ return NULL;
+}
+
+/*
+ * Convert from struct device to struct vio_dev and pass to driver.
+ * dev->driver has already been set by generic code because vio_bus_match
+ * succeeded.
+ */
+static int vio_bus_probe(struct device *dev)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+ struct vio_driver *viodrv = to_vio_driver(dev->driver);
+ const struct vio_device_id *id;
+ int error = -ENODEV;
+
+ if (!viodrv->probe)
+ return error;
+
+ id = vio_match_device(viodrv->id_table, viodev);
+ if (id) {
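+		/* Reset per-device CMO accounting before the driver probes. */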
+ memset(&viodev->cmo, 0, sizeof(viodev->cmo));
+ if (firmware_has_feature(FW_FEATURE_CMO)) {
+ error = vio_cmo_bus_probe(viodev);
+ if (error)
+ return error;
+ }
+ error = viodrv->probe(viodev, id);
+ if (error && firmware_has_feature(FW_FEATURE_CMO))
+ vio_cmo_bus_remove(viodev);
+ }
+
+ return error;
+}
+
+/* convert from struct device to struct vio_dev and pass to driver. */
+static int vio_bus_remove(struct device *dev)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+ struct vio_driver *viodrv = to_vio_driver(dev->driver);
+ struct device *devptr;
+ int ret = 1;
+
+ /*
+ * Hold a reference to the device after the remove function is called
+ * to allow for CMO accounting cleanup for the device.
+ */
+ devptr = get_device(dev);
+
+ if (viodrv->remove)
+ ret = viodrv->remove(viodev);
+
+ if (!ret && firmware_has_feature(FW_FEATURE_CMO))
+ vio_cmo_bus_remove(viodev);
+
+ put_device(devptr);
+ return ret;
+}
+
+/**
+ * vio_register_driver: - Register a new vio driver
+ * @viodrv: The vio_driver structure to be registered.
+ */
+int __vio_register_driver(struct vio_driver *viodrv, struct module *owner,
+ const char *mod_name)
+{
+ pr_debug("%s: driver %s registering\n", __func__, viodrv->name);
+
+ /* fill in 'struct driver' fields */
+ viodrv->driver.name = viodrv->name;
+ viodrv->driver.pm = viodrv->pm;
+ viodrv->driver.bus = &vio_bus_type;
+ viodrv->driver.owner = owner;
+ viodrv->driver.mod_name = mod_name;
+
+ return driver_register(&viodrv->driver);
+}
+EXPORT_SYMBOL(__vio_register_driver);
+
+/**
+ * vio_unregister_driver - Remove registration of vio driver.
+ * @viodrv: The vio_driver struct to be removed form registration
+ */
+void vio_unregister_driver(struct vio_driver *viodrv)
+{
+ driver_unregister(&viodrv->driver);
+}
+EXPORT_SYMBOL(vio_unregister_driver);
+
+/* vio_dev refcount hit 0 */
+static void vio_dev_release(struct device *dev)
+{
+ struct iommu_table *tbl = get_iommu_table_base(dev);
+
+ if (tbl)
+ iommu_tce_table_put(tbl);
+ of_node_put(dev->of_node);
+ kfree(to_vio_dev(dev));
+}
+
+/**
+ * vio_register_device_node: - Register a new vio device.
+ * @of_node: The OF node for this device.
+ *
+ * Creates and initializes a vio_dev structure from the data in
+ * of_node and adds it to the list of virtual devices.
+ * Returns a pointer to the created vio_dev or NULL if node has
+ * NULL device_type or compatible fields.
+ */
+struct vio_dev *vio_register_device_node(struct device_node *of_node)
+{
+ struct vio_dev *viodev;
+ struct device_node *parent_node;
+ const __be32 *prop;
+ enum vio_dev_family family;
+ const char *of_node_name = of_node->name ? of_node->name : "<unknown>";
+
+ /*
+	 * Determine if this node is under the /vdevice node or under the
+ * /ibm,platform-facilities node. This decides the device's family.
+ */
+ parent_node = of_get_parent(of_node);
+ if (parent_node) {
+ if (!strcmp(parent_node->type, "ibm,platform-facilities"))
+ family = PFO;
+ else if (!strcmp(parent_node->type, "vdevice"))
+ family = VDEVICE;
+ else {
+ pr_warn("%s: parent(%pOF) of %s not recognized.\n",
+ __func__,
+ parent_node,
+ of_node_name);
+ of_node_put(parent_node);
+ return NULL;
+ }
+ of_node_put(parent_node);
+ } else {
+ pr_warn("%s: could not determine the parent of node %s.\n",
+ __func__, of_node_name);
+ return NULL;
+ }
+
+ if (family == PFO) {
+ if (of_get_property(of_node, "interrupt-controller", NULL)) {
+ pr_debug("%s: Skipping the interrupt controller %s.\n",
+ __func__, of_node_name);
+ return NULL;
+ }
+ }
+
+ /* allocate a vio_dev for this node */
+ viodev = kzalloc(sizeof(struct vio_dev), GFP_KERNEL);
+ if (viodev == NULL) {
+ pr_warn("%s: allocation failure for VIO device.\n", __func__);
+ return NULL;
+ }
+
+ /* we need the 'device_type' property, in order to match with drivers */
+ viodev->family = family;
+ if (viodev->family == VDEVICE) {
+ unsigned int unit_address;
+
+ if (of_node->type != NULL)
+ viodev->type = of_node->type;
+ else {
+ pr_warn("%s: node %s is missing the 'device_type' "
+ "property.\n", __func__, of_node_name);
+ goto out;
+ }
+
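+		/* The "reg" property holds the unit address used to name the device. */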
+ prop = of_get_property(of_node, "reg", NULL);
+ if (prop == NULL) {
+ pr_warn("%s: node %s missing 'reg'\n",
+ __func__, of_node_name);
+ goto out;
+ }
+ unit_address = of_read_number(prop, 1);
+ dev_set_name(&viodev->dev, "%x", unit_address);
+ viodev->irq = irq_of_parse_and_map(of_node, 0);
+ viodev->unit_address = unit_address;
+ } else {
+ /* PFO devices need their resource_id for submitting COP_OPs
+ * This is an optional field for devices, but is required when
+ * performing synchronous ops */
+ prop = of_get_property(of_node, "ibm,resource-id", NULL);
+ if (prop != NULL)
+ viodev->resource_id = of_read_number(prop, 1);
+
+ dev_set_name(&viodev->dev, "%s", of_node_name);
+ viodev->type = of_node_name;
+ viodev->irq = 0;
+ }
+
+ viodev->name = of_node->name;
+ viodev->dev.of_node = of_node_get(of_node);
+
+ set_dev_node(&viodev->dev, of_node_to_nid(of_node));
+
+ /* init generic 'struct device' fields: */
+ viodev->dev.parent = &vio_bus_device.dev;
+ viodev->dev.bus = &vio_bus_type;
+ viodev->dev.release = vio_dev_release;
+
+ if (of_get_property(viodev->dev.of_node, "ibm,my-dma-window", NULL)) {
+ if (firmware_has_feature(FW_FEATURE_CMO))
+ vio_cmo_set_dma_ops(viodev);
+ else
+ set_dma_ops(&viodev->dev, &dma_iommu_ops);
+
+ set_iommu_table_base(&viodev->dev,
+ vio_build_iommu_table(viodev));
+
+ /* needed to ensure proper operation of coherent allocations
+ * later, in case driver doesn't set it explicitly */
+ viodev->dev.coherent_dma_mask = DMA_BIT_MASK(64);
+ viodev->dev.dma_mask = &viodev->dev.coherent_dma_mask;
+ }
+
+ /* register with generic device framework */
+ if (device_register(&viodev->dev)) {
+ printk(KERN_ERR "%s: failed to register device %s\n",
+ __func__, dev_name(&viodev->dev));
+ put_device(&viodev->dev);
+ return NULL;
+ }
+
+ return viodev;
+
+out: /* Use this exit point for any return prior to device_register */
+ kfree(viodev);
+
+ return NULL;
+}
+EXPORT_SYMBOL(vio_register_device_node);
+
+/*
+ * vio_bus_scan_register_devices - Scan OF and register each child device
+ * @root_name: OF node name for the root of the subtree to search.
+ *             This must be non-NULL.
+ *
+ * Starting from the root node provided, register the device node for
+ * each child beneath the root.
+ */
+static void vio_bus_scan_register_devices(char *root_name)
+{
+ struct device_node *node_root, *node_child;
+
+ if (!root_name)
+ return;
+
+ node_root = of_find_node_by_name(NULL, root_name);
+ if (node_root) {
+
+ /*
+ * Create struct vio_devices for each virtual device in
+ * the device tree. Drivers will associate with them later.
+ */
+ node_child = of_get_next_child(node_root, NULL);
+ while (node_child) {
+ vio_register_device_node(node_child);
+ node_child = of_get_next_child(node_root, node_child);
+ }
+ of_node_put(node_root);
+ }
+}
+
+/**
+ * vio_bus_init: - Initialize the virtual IO bus
+ */
+static int __init vio_bus_init(void)
+{
+ int err;
+
+ if (firmware_has_feature(FW_FEATURE_CMO))
+ vio_cmo_sysfs_init();
+
+ err = bus_register(&vio_bus_type);
+ if (err) {
+ printk(KERN_ERR "failed to register VIO bus\n");
+ return err;
+ }
+
+ /*
+ * The fake parent of all vio devices, just to give us
+ * a nice directory
+ */
+ err = device_register(&vio_bus_device.dev);
+ if (err) {
+ printk(KERN_WARNING "%s: device_register returned %i\n",
+ __func__, err);
+ return err;
+ }
+
+ if (firmware_has_feature(FW_FEATURE_CMO))
+ vio_cmo_bus_init();
+
+ return 0;
+}
+postcore_initcall(vio_bus_init);
+
+static int __init vio_device_init(void)
+{
+ vio_bus_scan_register_devices("vdevice");
+ vio_bus_scan_register_devices("ibm,platform-facilities");
+
+ return 0;
+}
+device_initcall(vio_device_init);
+
+static ssize_t name_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%s\n", to_vio_dev(dev)->name);
+}
+static DEVICE_ATTR_RO(name);
+
+static ssize_t devspec_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct device_node *of_node = dev->of_node;
+
+ return sprintf(buf, "%pOF\n", of_node);
+}
+static DEVICE_ATTR_RO(devspec);
+
+static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ const struct vio_dev *vio_dev = to_vio_dev(dev);
+ struct device_node *dn;
+ const char *cp;
+
+ dn = dev->of_node;
+ if (!dn) {
+ strcpy(buf, "\n");
+ return strlen(buf);
+ }
+ cp = of_get_property(dn, "compatible", NULL);
+ if (!cp) {
+ strcpy(buf, "\n");
+ return strlen(buf);
+ }
+
+ return sprintf(buf, "vio:T%sS%s\n", vio_dev->type, cp);
+}
+static DEVICE_ATTR_RO(modalias);
+
+static struct attribute *vio_dev_attrs[] = {
+ &dev_attr_name.attr,
+ &dev_attr_devspec.attr,
+ &dev_attr_modalias.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(vio_dev);
+
+void vio_unregister_device(struct vio_dev *viodev)
+{
+ device_unregister(&viodev->dev);
+ if (viodev->family == VDEVICE)
+ irq_dispose_mapping(viodev->irq);
+}
+EXPORT_SYMBOL(vio_unregister_device);
+
+static int vio_bus_match(struct device *dev, struct device_driver *drv)
+{
+ const struct vio_dev *vio_dev = to_vio_dev(dev);
+ struct vio_driver *vio_drv = to_vio_driver(drv);
+ const struct vio_device_id *ids = vio_drv->id_table;
+
+ return (ids != NULL) && (vio_match_device(ids, vio_dev) != NULL);
+}
+
+static int vio_hotplug(struct device *dev, struct kobj_uevent_env *env)
+{
+ const struct vio_dev *vio_dev = to_vio_dev(dev);
+ struct device_node *dn;
+ const char *cp;
+
+ dn = dev->of_node;
+ if (!dn)
+ return -ENODEV;
+ cp = of_get_property(dn, "compatible", NULL);
+ if (!cp)
+ return -ENODEV;
+
+ add_uevent_var(env, "MODALIAS=vio:T%sS%s", vio_dev->type, cp);
+ return 0;
+}
+
+struct bus_type vio_bus_type = {
+ .name = "vio",
+ .dev_groups = vio_dev_groups,
+ .uevent = vio_hotplug,
+ .match = vio_bus_match,
+ .probe = vio_bus_probe,
+ .remove = vio_bus_remove,
+};
+
+/**
+ * vio_get_attribute: - get attribute for virtual device
+ * @vdev: The vio device to get property.
+ * @which: The property/attribute to be extracted.
+ * @length: Pointer to length of returned data size (unused if NULL).
+ *
+ * Calls of_get_property() to return the value of the
+ * attribute specified by @which.
+ */
+const void *vio_get_attribute(struct vio_dev *vdev, char *which, int *length)
+{
+ return of_get_property(vdev->dev.of_node, which, length);
+}
+EXPORT_SYMBOL(vio_get_attribute);
+
+#ifdef CONFIG_PPC_PSERIES
+/* vio_find_name() - internal because only vio.c knows how we formatted the
+ * kobject name
+ */
+static struct vio_dev *vio_find_name(const char *name)
+{
+ struct device *found;
+
+ found = bus_find_device_by_name(&vio_bus_type, NULL, name);
+ if (!found)
+ return NULL;
+
+ return to_vio_dev(found);
+}
+
+/**
+ * vio_find_node - find an already-registered vio_dev
+ * @vnode: device_node of the virtual device we're looking for
+ *
+ * Takes a reference to the embedded struct device which needs to be dropped
+ * after use.
+ */
+struct vio_dev *vio_find_node(struct device_node *vnode)
+{
+ char kobj_name[20];
+ struct device_node *vnode_parent;
+ const char *dev_type;
+
+ vnode_parent = of_get_parent(vnode);
+ if (!vnode_parent)
+ return NULL;
+
+ dev_type = of_get_property(vnode_parent, "device_type", NULL);
+ of_node_put(vnode_parent);
+ if (!dev_type)
+ return NULL;
+
+ /* construct the kobject name from the device node */
+ if (!strcmp(dev_type, "vdevice")) {
+ const __be32 *prop;
+
+ prop = of_get_property(vnode, "reg", NULL);
+ if (!prop)
+ return NULL;
+ snprintf(kobj_name, sizeof(kobj_name), "%x",
+ (uint32_t)of_read_number(prop, 1));
+ } else if (!strcmp(dev_type, "ibm,platform-facilities"))
+ snprintf(kobj_name, sizeof(kobj_name), "%s", vnode->name);
+ else
+ return NULL;
+
+ return vio_find_name(kobj_name);
+}
+EXPORT_SYMBOL(vio_find_node);
+
+int vio_enable_interrupts(struct vio_dev *dev)
+{
+ int rc = h_vio_signal(dev->unit_address, VIO_IRQ_ENABLE);
+ if (rc != H_SUCCESS)
+ printk(KERN_ERR "vio: Error 0x%x enabling interrupts\n", rc);
+ return rc;
+}
+EXPORT_SYMBOL(vio_enable_interrupts);
+
+int vio_disable_interrupts(struct vio_dev *dev)
+{
+ int rc = h_vio_signal(dev->unit_address, VIO_IRQ_DISABLE);
+ if (rc != H_SUCCESS)
+ printk(KERN_ERR "vio: Error 0x%x disabling interrupts\n", rc);
+ return rc;
+}
+EXPORT_SYMBOL(vio_disable_interrupts);
+#endif /* CONFIG_PPC_PSERIES */