Diffstat (limited to 'arch/powerpc/platforms/pseries')
39 files changed, 16781 insertions, 0 deletions
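A note on usage before the diff body: dlpar.c in this diff creates a "dlpar" sysfs attribute on kernel_kobj (i.e. /sys/kernel/dlpar), and dlpar_store() parses each written line as "<resource> <action> <id_type> <id>" before queueing the request and waiting on its completion. The following user-space sketch is hypothetical (it is not part of this diff) and only illustrates that interface; the specific request string and the assumption that the caller has write access are mine.

/* Hypothetical user-space sketch: submit a DLPAR request through the
 * /sys/kernel/dlpar attribute created by dlpar_sysfs_init() in the diff
 * below. dlpar_store() parses "<resource> <action> <id_type> <id>",
 * e.g. "memory add count 1", and the write returns only after
 * queue_hotplug_event() signals completion of the request.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *req = "memory add count 1\n";
	FILE *f = fopen("/sys/kernel/dlpar", "w");

	if (!f) {
		perror("fopen /sys/kernel/dlpar");
		return 1;
	}
	if (fwrite(req, 1, strlen(req), f) != strlen(req))
		perror("write");
	/* A failed request surfaces as a write error carrying the handler's rc. */
	return fclose(f) ? 1 : 0;
}

If the request cannot be parsed or the hotplug handler fails, dlpar_store() returns the error code, so the write above fails with the corresponding errno.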
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig new file mode 100644 index 000000000..0c698fd6d --- /dev/null +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -0,0 +1,142 @@ +# SPDX-License-Identifier: GPL-2.0 +config PPC_PSERIES + depends on PPC64 && PPC_BOOK3S + bool "IBM pSeries & new (POWER5-based) iSeries" + select HAVE_PCSPKR_PLATFORM + select MPIC + select OF_DYNAMIC + select PCI + select PCI_MSI + select PPC_XICS + select PPC_XIVE_SPAPR + select PPC_ICP_NATIVE + select PPC_ICP_HV + select PPC_ICS_RTAS + select PPC_I8259 + select PPC_RTAS + select PPC_RTAS_DAEMON + select RTAS_ERROR_LOGGING + select PPC_UDBG_16550 + select PPC_NATIVE + select PPC_DOORBELL + select HOTPLUG_CPU + select ARCH_RANDOM + select PPC_DOORBELL + select FORCE_SMP + default y + +config PPC_SPLPAR + depends on PPC_PSERIES + bool "Support for shared-processor logical partitions" + default n + help + Enabling this option will make the kernel run more efficiently + on logically-partitioned pSeries systems which use shared + processors, that is, which share physical processors between + two or more partitions. + +config DTL + bool "Dispatch Trace Log" + depends on PPC_SPLPAR && DEBUG_FS + help + SPLPAR machines can log hypervisor preempt & dispatch events to a + kernel buffer. Saying Y here will enable logging these events, + which are accessible through a debugfs file. + + Say N if you are unsure. + +config PSERIES_ENERGY + tristate "pSeries energy management capabilities driver" + depends on PPC_PSERIES + default y + help + Provides interface to platform energy management capabilities + on supported PSERIES platforms. + Provides: /sys/devices/system/cpu/pseries_(de)activation_hint_list + and /sys/devices/system/cpu/cpuN/pseries_(de)activation_hint + +config SCANLOG + tristate "Scanlog dump interface" + depends on RTAS_PROC && PPC_PSERIES + +config IO_EVENT_IRQ + bool "IO Event Interrupt support" + depends on PPC_PSERIES + default y + help + Select this option, if you want to enable support for IO Event + interrupts. IO event interrupt is a mechanism provided by RTAS + to return information about hardware error and non-error events + which may need OS attention. RTAS returns events for multiple + event types and scopes. Device drivers can register their handlers + to receive events. + + This option will only enable the IO event platform code. You + will still need to enable or compile the actual drivers + that use this infrastructure to handle IO event interrupts. + + Say Y if you are unsure. + +config LPARCFG + bool "LPAR Configuration Data" + depends on PPC_PSERIES + help + Provide system capacity information via human readable + <key word>=<value> pairs through a /proc/ppc64/lparcfg interface. + +config PPC_PSERIES_DEBUG + depends on PPC_PSERIES && PPC_EARLY_DEBUG + bool "Enable extra debug logging in platforms/pseries" + help + Say Y here if you want the pseries core to produce a bunch of + debug messages to the system log. Select this if you are having a + problem with the pseries core and want to see more of what is + going on. This does not enable debugging in lpar.c, which must + be manually done due to its verbosity. + default y + +config PPC_SMLPAR + bool "Support for shared-memory logical partitions" + depends on PPC_PSERIES + select LPARCFG + default n + help + Select this option to enable shared memory partition support. 
+ With this option a system running in an LPAR can be given more + memory than physically available and will allow firmware to + balance memory across many LPARs. + +config CMM + tristate "Collaborative memory management" + depends on PPC_SMLPAR + default y + help + Select this option, if you want to enable the kernel interface + to reduce the memory size of the system. This is accomplished + by allocating pages of memory and put them "on hold". This only + makes sense for a system running in an LPAR where the unused pages + will be reused for other LPARs. The interface allows firmware to + balance memory across many LPARs. + +config HV_PERF_CTRS + bool "Hypervisor supplied PMU events (24x7 & GPCI)" + default y + depends on PERF_EVENTS && PPC_PSERIES + help + Enable access to hypervisor supplied counters in perf. Currently, + this enables code that uses the hcall GetPerfCounterInfo and 24x7 + interfaces to retrieve counters. GPCI exists on Power 6 and later + systems. 24x7 is available on Power 8 and later systems. + + If unsure, select Y. + +config IBMVIO + depends on PPC_PSERIES + bool + default y + +config IBMEBUS + depends on PPC_PSERIES && !CPU_LITTLE_ENDIAN + bool "Support for GX bus based adapters" + help + Bus device driver for GX bus based adapters. diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile new file mode 100644 index 000000000..7e89d5c47 --- /dev/null +++ b/arch/powerpc/platforms/pseries/Makefile @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: GPL-2.0 +ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) +ccflags-$(CONFIG_PPC_PSERIES_DEBUG) += -DDEBUG + +obj-y := lpar.o hvCall.o nvram.o reconfig.o \ + of_helpers.o \ + setup.o iommu.o event_sources.o ras.o \ + firmware.o power.o dlpar.o mobility.o rng.o \ + pci.o pci_dlpar.o eeh_pseries.o msi.o +obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_SCANLOG) += scanlog.o +obj-$(CONFIG_KEXEC_CORE) += kexec.o +obj-$(CONFIG_PSERIES_ENERGY) += pseries_energy.o + +obj-$(CONFIG_HOTPLUG_CPU) += hotplug-cpu.o +obj-$(CONFIG_MEMORY_HOTPLUG) += hotplug-memory.o + +obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o +obj-$(CONFIG_HVCS) += hvcserver.o +obj-$(CONFIG_HCALL_STATS) += hvCall_inst.o +obj-$(CONFIG_CMM) += cmm.o +obj-$(CONFIG_DTL) += dtl.o +obj-$(CONFIG_IO_EVENT_IRQ) += io_event_irq.o +obj-$(CONFIG_LPARCFG) += lparcfg.o +obj-$(CONFIG_IBMVIO) += vio.o +obj-$(CONFIG_IBMEBUS) += ibmebus.o + +ifdef CONFIG_PPC_PSERIES +obj-$(CONFIG_SUSPEND) += suspend.o +endif diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c new file mode 100644 index 000000000..502ebcc6c --- /dev/null +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -0,0 +1,778 @@ +/* + * Collaborative memory management interface. + * + * Copyright (C) 2008 IBM Corporation + * Author(s): Brian King (brking@linux.vnet.ibm.com), + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include <linux/ctype.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/gfp.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/oom.h> +#include <linux/reboot.h> +#include <linux/sched.h> +#include <linux/stringify.h> +#include <linux/swap.h> +#include <linux/device.h> +#include <asm/firmware.h> +#include <asm/hvcall.h> +#include <asm/mmu.h> +#include <asm/pgalloc.h> +#include <linux/uaccess.h> +#include <linux/memory.h> +#include <asm/plpar_wrappers.h> + +#include "pseries.h" + +#define CMM_DRIVER_VERSION "1.0.0" +#define CMM_DEFAULT_DELAY 1 +#define CMM_HOTPLUG_DELAY 5 +#define CMM_DEBUG 0 +#define CMM_DISABLE 0 +#define CMM_OOM_KB 1024 +#define CMM_MIN_MEM_MB 256 +#define KB2PAGES(_p) ((_p)>>(PAGE_SHIFT-10)) +#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) +/* + * The priority level tries to ensure that this notifier is called as + * late as possible to reduce thrashing in the shared memory pool. + */ +#define CMM_MEM_HOTPLUG_PRI 1 +#define CMM_MEM_ISOLATE_PRI 15 + +static unsigned int delay = CMM_DEFAULT_DELAY; +static unsigned int hotplug_delay = CMM_HOTPLUG_DELAY; +static unsigned int oom_kb = CMM_OOM_KB; +static unsigned int cmm_debug = CMM_DEBUG; +static unsigned int cmm_disabled = CMM_DISABLE; +static unsigned long min_mem_mb = CMM_MIN_MEM_MB; +static struct device cmm_dev; + +MODULE_AUTHOR("Brian King <brking@linux.vnet.ibm.com>"); +MODULE_DESCRIPTION("IBM System p Collaborative Memory Manager"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(CMM_DRIVER_VERSION); + +module_param_named(delay, delay, uint, 0644); +MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. " + "[Default=" __stringify(CMM_DEFAULT_DELAY) "]"); +module_param_named(hotplug_delay, hotplug_delay, uint, 0644); +MODULE_PARM_DESC(hotplug_delay, "Delay (in seconds) after memory hotplug remove " + "before loaning resumes. " + "[Default=" __stringify(CMM_HOTPLUG_DELAY) "]"); +module_param_named(oom_kb, oom_kb, uint, 0644); +MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. " + "[Default=" __stringify(CMM_OOM_KB) "]"); +module_param_named(min_mem_mb, min_mem_mb, ulong, 0644); +MODULE_PARM_DESC(min_mem_mb, "Minimum amount of memory (in MB) to not balloon. " + "[Default=" __stringify(CMM_MIN_MEM_MB) "]"); +module_param_named(debug, cmm_debug, uint, 0644); +MODULE_PARM_DESC(debug, "Enable module debugging logging. Set to 1 to enable. " + "[Default=" __stringify(CMM_DEBUG) "]"); + +#define CMM_NR_PAGES ((PAGE_SIZE - sizeof(void *) - sizeof(unsigned long)) / sizeof(unsigned long)) + +#define cmm_dbg(...) 
if (cmm_debug) { printk(KERN_INFO "cmm: "__VA_ARGS__); } + +struct cmm_page_array { + struct cmm_page_array *next; + unsigned long index; + unsigned long page[CMM_NR_PAGES]; +}; + +static unsigned long loaned_pages; +static unsigned long loaned_pages_target; +static unsigned long oom_freed_pages; + +static struct cmm_page_array *cmm_page_list; +static DEFINE_SPINLOCK(cmm_lock); + +static DEFINE_MUTEX(hotplug_mutex); +static int hotplug_occurred; /* protected by the hotplug mutex */ + +static struct task_struct *cmm_thread_ptr; + +static long plpar_page_set_loaned(unsigned long vpa) +{ + unsigned long cmo_page_sz = cmo_get_page_size(); + long rc = 0; + int i; + + for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz) + rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, vpa + i, 0); + + for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz) + plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, + vpa + i - cmo_page_sz, 0); + + return rc; +} + +static long plpar_page_set_active(unsigned long vpa) +{ + unsigned long cmo_page_sz = cmo_get_page_size(); + long rc = 0; + int i; + + for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz) + rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, vpa + i, 0); + + for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz) + plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, + vpa + i - cmo_page_sz, 0); + + return rc; +} + +/** + * cmm_alloc_pages - Allocate pages and mark them as loaned + * @nr: number of pages to allocate + * + * Return value: + * number of pages requested to be allocated which were not + **/ +static long cmm_alloc_pages(long nr) +{ + struct cmm_page_array *pa, *npa; + unsigned long addr; + long rc; + + cmm_dbg("Begin request for %ld pages\n", nr); + + while (nr) { + /* Exit if a hotplug operation is in progress or occurred */ + if (mutex_trylock(&hotplug_mutex)) { + if (hotplug_occurred) { + mutex_unlock(&hotplug_mutex); + break; + } + mutex_unlock(&hotplug_mutex); + } else { + break; + } + + addr = __get_free_page(GFP_NOIO | __GFP_NOWARN | + __GFP_NORETRY | __GFP_NOMEMALLOC); + if (!addr) + break; + spin_lock(&cmm_lock); + pa = cmm_page_list; + if (!pa || pa->index >= CMM_NR_PAGES) { + /* Need a new page for the page list. */ + spin_unlock(&cmm_lock); + npa = (struct cmm_page_array *)__get_free_page( + GFP_NOIO | __GFP_NOWARN | + __GFP_NORETRY | __GFP_NOMEMALLOC); + if (!npa) { + pr_info("%s: Can not allocate new page list\n", __func__); + free_page(addr); + break; + } + spin_lock(&cmm_lock); + pa = cmm_page_list; + + if (!pa || pa->index >= CMM_NR_PAGES) { + npa->next = pa; + npa->index = 0; + pa = npa; + cmm_page_list = pa; + } else + free_page((unsigned long) npa); + } + + if ((rc = plpar_page_set_loaned(__pa(addr)))) { + pr_err("%s: Can not set page to loaned. 
rc=%ld\n", __func__, rc); + spin_unlock(&cmm_lock); + free_page(addr); + break; + } + + pa->page[pa->index++] = addr; + loaned_pages++; + totalram_pages--; + spin_unlock(&cmm_lock); + nr--; + } + + cmm_dbg("End request with %ld pages unfulfilled\n", nr); + return nr; +} + +/** + * cmm_free_pages - Free pages and mark them as active + * @nr: number of pages to free + * + * Return value: + * number of pages requested to be freed which were not + **/ +static long cmm_free_pages(long nr) +{ + struct cmm_page_array *pa; + unsigned long addr; + + cmm_dbg("Begin free of %ld pages.\n", nr); + spin_lock(&cmm_lock); + pa = cmm_page_list; + while (nr) { + if (!pa || pa->index <= 0) + break; + addr = pa->page[--pa->index]; + + if (pa->index == 0) { + pa = pa->next; + free_page((unsigned long) cmm_page_list); + cmm_page_list = pa; + } + + plpar_page_set_active(__pa(addr)); + free_page(addr); + loaned_pages--; + nr--; + totalram_pages++; + } + spin_unlock(&cmm_lock); + cmm_dbg("End request with %ld pages unfulfilled\n", nr); + return nr; +} + +/** + * cmm_oom_notify - OOM notifier + * @self: notifier block struct + * @dummy: not used + * @parm: returned - number of pages freed + * + * Return value: + * NOTIFY_OK + **/ +static int cmm_oom_notify(struct notifier_block *self, + unsigned long dummy, void *parm) +{ + unsigned long *freed = parm; + long nr = KB2PAGES(oom_kb); + + cmm_dbg("OOM processing started\n"); + nr = cmm_free_pages(nr); + loaned_pages_target = loaned_pages; + *freed += KB2PAGES(oom_kb) - nr; + oom_freed_pages += KB2PAGES(oom_kb) - nr; + cmm_dbg("OOM processing complete\n"); + return NOTIFY_OK; +} + +/** + * cmm_get_mpp - Read memory performance parameters + * + * Makes hcall to query the current page loan request from the hypervisor. + * + * Return value: + * nothing + **/ +static void cmm_get_mpp(void) +{ + int rc; + struct hvcall_mpp_data mpp_data; + signed long active_pages_target, page_loan_request, target; + signed long total_pages = totalram_pages + loaned_pages; + signed long min_mem_pages = (min_mem_mb * 1024 * 1024) / PAGE_SIZE; + + rc = h_get_mpp(&mpp_data); + + if (rc != H_SUCCESS) + return; + + page_loan_request = div_s64((s64)mpp_data.loan_request, PAGE_SIZE); + target = page_loan_request + (signed long)loaned_pages; + + if (target < 0 || total_pages < min_mem_pages) + target = 0; + + if (target > oom_freed_pages) + target -= oom_freed_pages; + else + target = 0; + + active_pages_target = total_pages - target; + + if (min_mem_pages > active_pages_target) + target = total_pages - min_mem_pages; + + if (target < 0) + target = 0; + + loaned_pages_target = target; + + cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n", + page_loan_request, loaned_pages, loaned_pages_target, + oom_freed_pages, totalram_pages); +} + +static struct notifier_block cmm_oom_nb = { + .notifier_call = cmm_oom_notify +}; + +/** + * cmm_thread - CMM task thread + * @dummy: not used + * + * Return value: + * 0 + **/ +static int cmm_thread(void *dummy) +{ + unsigned long timeleft; + + while (1) { + timeleft = msleep_interruptible(delay * 1000); + + if (kthread_should_stop() || timeleft) + break; + + if (mutex_trylock(&hotplug_mutex)) { + if (hotplug_occurred) { + hotplug_occurred = 0; + mutex_unlock(&hotplug_mutex); + cmm_dbg("Hotplug operation has occurred, " + "loaning activity suspended " + "for %d seconds.\n", + hotplug_delay); + timeleft = msleep_interruptible(hotplug_delay * + 1000); + if (kthread_should_stop() || timeleft) + break; + continue; + } + 
mutex_unlock(&hotplug_mutex); + } else { + cmm_dbg("Hotplug operation in progress, activity " + "suspended\n"); + continue; + } + + cmm_get_mpp(); + + if (loaned_pages_target > loaned_pages) { + if (cmm_alloc_pages(loaned_pages_target - loaned_pages)) + loaned_pages_target = loaned_pages; + } else if (loaned_pages_target < loaned_pages) + cmm_free_pages(loaned_pages - loaned_pages_target); + } + return 0; +} + +#define CMM_SHOW(name, format, args...) \ + static ssize_t show_##name(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + return sprintf(buf, format, ##args); \ + } \ + static DEVICE_ATTR(name, 0444, show_##name, NULL) + +CMM_SHOW(loaned_kb, "%lu\n", PAGES2KB(loaned_pages)); +CMM_SHOW(loaned_target_kb, "%lu\n", PAGES2KB(loaned_pages_target)); + +static ssize_t show_oom_pages(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", PAGES2KB(oom_freed_pages)); +} + +static ssize_t store_oom_pages(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val = simple_strtoul (buf, NULL, 10); + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (val != 0) + return -EBADMSG; + + oom_freed_pages = 0; + return count; +} + +static DEVICE_ATTR(oom_freed_kb, 0644, + show_oom_pages, store_oom_pages); + +static struct device_attribute *cmm_attrs[] = { + &dev_attr_loaned_kb, + &dev_attr_loaned_target_kb, + &dev_attr_oom_freed_kb, +}; + +static struct bus_type cmm_subsys = { + .name = "cmm", + .dev_name = "cmm", +}; + +static void cmm_release_device(struct device *dev) +{ +} + +/** + * cmm_sysfs_register - Register with sysfs + * + * Return value: + * 0 on success / other on failure + **/ +static int cmm_sysfs_register(struct device *dev) +{ + int i, rc; + + if ((rc = subsys_system_register(&cmm_subsys, NULL))) + return rc; + + dev->id = 0; + dev->bus = &cmm_subsys; + dev->release = cmm_release_device; + + if ((rc = device_register(dev))) + goto subsys_unregister; + + for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++) { + if ((rc = device_create_file(dev, cmm_attrs[i]))) + goto fail; + } + + return 0; + +fail: + while (--i >= 0) + device_remove_file(dev, cmm_attrs[i]); + device_unregister(dev); +subsys_unregister: + bus_unregister(&cmm_subsys); + return rc; +} + +/** + * cmm_unregister_sysfs - Unregister from sysfs + * + **/ +static void cmm_unregister_sysfs(struct device *dev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++) + device_remove_file(dev, cmm_attrs[i]); + device_unregister(dev); + bus_unregister(&cmm_subsys); +} + +/** + * cmm_reboot_notifier - Make sure pages are not still marked as "loaned" + * + **/ +static int cmm_reboot_notifier(struct notifier_block *nb, + unsigned long action, void *unused) +{ + if (action == SYS_RESTART) { + if (cmm_thread_ptr) + kthread_stop(cmm_thread_ptr); + cmm_thread_ptr = NULL; + cmm_free_pages(loaned_pages); + } + return NOTIFY_DONE; +} + +static struct notifier_block cmm_reboot_nb = { + .notifier_call = cmm_reboot_notifier, +}; + +/** + * cmm_count_pages - Count the number of pages loaned in a particular range. 
+ * + * @arg: memory_isolate_notify structure with address range and count + * + * Return value: + * 0 on success + **/ +static unsigned long cmm_count_pages(void *arg) +{ + struct memory_isolate_notify *marg = arg; + struct cmm_page_array *pa; + unsigned long start = (unsigned long)pfn_to_kaddr(marg->start_pfn); + unsigned long end = start + (marg->nr_pages << PAGE_SHIFT); + unsigned long idx; + + spin_lock(&cmm_lock); + pa = cmm_page_list; + while (pa) { + if ((unsigned long)pa >= start && (unsigned long)pa < end) + marg->pages_found++; + for (idx = 0; idx < pa->index; idx++) + if (pa->page[idx] >= start && pa->page[idx] < end) + marg->pages_found++; + pa = pa->next; + } + spin_unlock(&cmm_lock); + return 0; +} + +/** + * cmm_memory_isolate_cb - Handle memory isolation notifier calls + * @self: notifier block struct + * @action: action to take + * @arg: struct memory_isolate_notify data for handler + * + * Return value: + * NOTIFY_OK or notifier error based on subfunction return value + **/ +static int cmm_memory_isolate_cb(struct notifier_block *self, + unsigned long action, void *arg) +{ + int ret = 0; + + if (action == MEM_ISOLATE_COUNT) + ret = cmm_count_pages(arg); + + return notifier_from_errno(ret); +} + +static struct notifier_block cmm_mem_isolate_nb = { + .notifier_call = cmm_memory_isolate_cb, + .priority = CMM_MEM_ISOLATE_PRI +}; + +/** + * cmm_mem_going_offline - Unloan pages where memory is to be removed + * @arg: memory_notify structure with page range to be offlined + * + * Return value: + * 0 on success + **/ +static int cmm_mem_going_offline(void *arg) +{ + struct memory_notify *marg = arg; + unsigned long start_page = (unsigned long)pfn_to_kaddr(marg->start_pfn); + unsigned long end_page = start_page + (marg->nr_pages << PAGE_SHIFT); + struct cmm_page_array *pa_curr, *pa_last, *npa; + unsigned long idx; + unsigned long freed = 0; + + cmm_dbg("Memory going offline, searching 0x%lx (%ld pages).\n", + start_page, marg->nr_pages); + spin_lock(&cmm_lock); + + /* Search the page list for pages in the range to be offlined */ + pa_last = pa_curr = cmm_page_list; + while (pa_curr) { + for (idx = (pa_curr->index - 1); (idx + 1) > 0; idx--) { + if ((pa_curr->page[idx] < start_page) || + (pa_curr->page[idx] >= end_page)) + continue; + + plpar_page_set_active(__pa(pa_curr->page[idx])); + free_page(pa_curr->page[idx]); + freed++; + loaned_pages--; + totalram_pages++; + pa_curr->page[idx] = pa_last->page[--pa_last->index]; + if (pa_last->index == 0) { + if (pa_curr == pa_last) + pa_curr = pa_last->next; + pa_last = pa_last->next; + free_page((unsigned long)cmm_page_list); + cmm_page_list = pa_last; + } + } + pa_curr = pa_curr->next; + } + + /* Search for page list structures in the range to be offlined */ + pa_last = NULL; + pa_curr = cmm_page_list; + while (pa_curr) { + if (((unsigned long)pa_curr >= start_page) && + ((unsigned long)pa_curr < end_page)) { + npa = (struct cmm_page_array *)__get_free_page( + GFP_NOIO | __GFP_NOWARN | + __GFP_NORETRY | __GFP_NOMEMALLOC); + if (!npa) { + spin_unlock(&cmm_lock); + cmm_dbg("Failed to allocate memory for list " + "management. 
Memory hotplug " + "failed.\n"); + return -ENOMEM; + } + memcpy(npa, pa_curr, PAGE_SIZE); + if (pa_curr == cmm_page_list) + cmm_page_list = npa; + if (pa_last) + pa_last->next = npa; + free_page((unsigned long) pa_curr); + freed++; + pa_curr = npa; + } + + pa_last = pa_curr; + pa_curr = pa_curr->next; + } + + spin_unlock(&cmm_lock); + cmm_dbg("Released %ld pages in the search range.\n", freed); + + return 0; +} + +/** + * cmm_memory_cb - Handle memory hotplug notifier calls + * @self: notifier block struct + * @action: action to take + * @arg: struct memory_notify data for handler + * + * Return value: + * NOTIFY_OK or notifier error based on subfunction return value + * + **/ +static int cmm_memory_cb(struct notifier_block *self, + unsigned long action, void *arg) +{ + int ret = 0; + + switch (action) { + case MEM_GOING_OFFLINE: + mutex_lock(&hotplug_mutex); + hotplug_occurred = 1; + ret = cmm_mem_going_offline(arg); + break; + case MEM_OFFLINE: + case MEM_CANCEL_OFFLINE: + mutex_unlock(&hotplug_mutex); + cmm_dbg("Memory offline operation complete.\n"); + break; + case MEM_GOING_ONLINE: + case MEM_ONLINE: + case MEM_CANCEL_ONLINE: + break; + } + + return notifier_from_errno(ret); +} + +static struct notifier_block cmm_mem_nb = { + .notifier_call = cmm_memory_cb, + .priority = CMM_MEM_HOTPLUG_PRI +}; + +/** + * cmm_init - Module initialization + * + * Return value: + * 0 on success / other on failure + **/ +static int cmm_init(void) +{ + int rc = -ENOMEM; + + if (!firmware_has_feature(FW_FEATURE_CMO)) + return -EOPNOTSUPP; + + if ((rc = register_oom_notifier(&cmm_oom_nb)) < 0) + return rc; + + if ((rc = register_reboot_notifier(&cmm_reboot_nb))) + goto out_oom_notifier; + + if ((rc = cmm_sysfs_register(&cmm_dev))) + goto out_reboot_notifier; + + if (register_memory_notifier(&cmm_mem_nb) || + register_memory_isolate_notifier(&cmm_mem_isolate_nb)) + goto out_unregister_notifier; + + if (cmm_disabled) + return rc; + + cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread"); + if (IS_ERR(cmm_thread_ptr)) { + rc = PTR_ERR(cmm_thread_ptr); + goto out_unregister_notifier; + } + + return rc; + +out_unregister_notifier: + unregister_memory_notifier(&cmm_mem_nb); + unregister_memory_isolate_notifier(&cmm_mem_isolate_nb); + cmm_unregister_sysfs(&cmm_dev); +out_reboot_notifier: + unregister_reboot_notifier(&cmm_reboot_nb); +out_oom_notifier: + unregister_oom_notifier(&cmm_oom_nb); + return rc; +} + +/** + * cmm_exit - Module exit + * + * Return value: + * nothing + **/ +static void cmm_exit(void) +{ + if (cmm_thread_ptr) + kthread_stop(cmm_thread_ptr); + unregister_oom_notifier(&cmm_oom_nb); + unregister_reboot_notifier(&cmm_reboot_nb); + unregister_memory_notifier(&cmm_mem_nb); + unregister_memory_isolate_notifier(&cmm_mem_isolate_nb); + cmm_free_pages(loaned_pages); + cmm_unregister_sysfs(&cmm_dev); +} + +/** + * cmm_set_disable - Disable/Enable CMM + * + * Return value: + * 0 on success / other on failure + **/ +static int cmm_set_disable(const char *val, const struct kernel_param *kp) +{ + int disable = simple_strtoul(val, NULL, 10); + + if (disable != 0 && disable != 1) + return -EINVAL; + + if (disable && !cmm_disabled) { + if (cmm_thread_ptr) + kthread_stop(cmm_thread_ptr); + cmm_thread_ptr = NULL; + cmm_free_pages(loaned_pages); + } else if (!disable && cmm_disabled) { + cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread"); + if (IS_ERR(cmm_thread_ptr)) + return PTR_ERR(cmm_thread_ptr); + } + + cmm_disabled = disable; + return 0; +} + +module_param_call(disable, cmm_set_disable, 
param_get_uint, + &cmm_disabled, 0644); +MODULE_PARM_DESC(disable, "Disable CMM. Set to 1 to disable. " + "[Default=" __stringify(CMM_DISABLE) "]"); + +module_init(cmm_init); +module_exit(cmm_exit); diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c new file mode 100644 index 000000000..90fd03b9d --- /dev/null +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -0,0 +1,604 @@ +/* + * Support for dynamic reconfiguration for PCI, Memory, and CPU + * Hotplug and Dynamic Logical Partitioning on RPA platforms. + * + * Copyright (C) 2009 Nathan Fontenot + * Copyright (C) 2009 IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) "dlpar: " fmt + +#include <linux/kernel.h> +#include <linux/notifier.h> +#include <linux/spinlock.h> +#include <linux/cpu.h> +#include <linux/slab.h> +#include <linux/of.h> + +#include "of_helpers.h" +#include "pseries.h" + +#include <asm/prom.h> +#include <asm/machdep.h> +#include <linux/uaccess.h> +#include <asm/rtas.h> + +static struct workqueue_struct *pseries_hp_wq; + +struct pseries_hp_work { + struct work_struct work; + struct pseries_hp_errorlog *errlog; + struct completion *hp_completion; + int *rc; +}; + +struct cc_workarea { + __be32 drc_index; + __be32 zero; + __be32 name_offset; + __be32 prop_length; + __be32 prop_offset; +}; + +void dlpar_free_cc_property(struct property *prop) +{ + kfree(prop->name); + kfree(prop->value); + kfree(prop); +} + +static struct property *dlpar_parse_cc_property(struct cc_workarea *ccwa) +{ + struct property *prop; + char *name; + char *value; + + prop = kzalloc(sizeof(*prop), GFP_KERNEL); + if (!prop) + return NULL; + + name = (char *)ccwa + be32_to_cpu(ccwa->name_offset); + prop->name = kstrdup(name, GFP_KERNEL); + if (!prop->name) { + dlpar_free_cc_property(prop); + return NULL; + } + + prop->length = be32_to_cpu(ccwa->prop_length); + value = (char *)ccwa + be32_to_cpu(ccwa->prop_offset); + prop->value = kmemdup(value, prop->length, GFP_KERNEL); + if (!prop->value) { + dlpar_free_cc_property(prop); + return NULL; + } + + return prop; +} + +static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa) +{ + struct device_node *dn; + const char *name; + + dn = kzalloc(sizeof(*dn), GFP_KERNEL); + if (!dn) + return NULL; + + name = (const char *)ccwa + be32_to_cpu(ccwa->name_offset); + dn->full_name = kstrdup(name, GFP_KERNEL); + if (!dn->full_name) { + kfree(dn); + return NULL; + } + + of_node_set_flag(dn, OF_DYNAMIC); + of_node_init(dn); + + return dn; +} + +static void dlpar_free_one_cc_node(struct device_node *dn) +{ + struct property *prop; + + while (dn->properties) { + prop = dn->properties; + dn->properties = prop->next; + dlpar_free_cc_property(prop); + } + + kfree(dn->full_name); + kfree(dn); +} + +void dlpar_free_cc_nodes(struct device_node *dn) +{ + if (dn->child) + dlpar_free_cc_nodes(dn->child); + + if (dn->sibling) + dlpar_free_cc_nodes(dn->sibling); + + dlpar_free_one_cc_node(dn); +} + +#define COMPLETE 0 +#define NEXT_SIBLING 1 +#define NEXT_CHILD 2 +#define NEXT_PROPERTY 3 +#define PREV_PARENT 4 +#define MORE_MEMORY 5 +#define ERR_CFG_USE -9003 + +struct device_node *dlpar_configure_connector(__be32 drc_index, + struct device_node *parent) +{ + struct device_node *dn; + struct device_node *first_dn = NULL; + struct device_node *last_dn = NULL; + struct property *property; + struct 
property *last_property = NULL; + struct cc_workarea *ccwa; + char *data_buf; + int cc_token; + int rc = -1; + + cc_token = rtas_token("ibm,configure-connector"); + if (cc_token == RTAS_UNKNOWN_SERVICE) + return NULL; + + data_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); + if (!data_buf) + return NULL; + + ccwa = (struct cc_workarea *)&data_buf[0]; + ccwa->drc_index = drc_index; + ccwa->zero = 0; + + do { + /* Since we release the rtas_data_buf lock between configure + * connector calls we want to re-populate the rtas_data_buffer + * with the contents of the previous call. + */ + spin_lock(&rtas_data_buf_lock); + + memcpy(rtas_data_buf, data_buf, RTAS_DATA_BUF_SIZE); + rc = rtas_call(cc_token, 2, 1, NULL, rtas_data_buf, NULL); + memcpy(data_buf, rtas_data_buf, RTAS_DATA_BUF_SIZE); + + spin_unlock(&rtas_data_buf_lock); + + if (rtas_busy_delay(rc)) + continue; + + switch (rc) { + case COMPLETE: + break; + + case NEXT_SIBLING: + dn = dlpar_parse_cc_node(ccwa); + if (!dn) + goto cc_error; + + dn->parent = last_dn->parent; + last_dn->sibling = dn; + last_dn = dn; + break; + + case NEXT_CHILD: + dn = dlpar_parse_cc_node(ccwa); + if (!dn) + goto cc_error; + + if (!first_dn) { + dn->parent = parent; + first_dn = dn; + } else { + dn->parent = last_dn; + if (last_dn) + last_dn->child = dn; + } + + last_dn = dn; + break; + + case NEXT_PROPERTY: + property = dlpar_parse_cc_property(ccwa); + if (!property) + goto cc_error; + + if (!last_dn->properties) + last_dn->properties = property; + else + last_property->next = property; + + last_property = property; + break; + + case PREV_PARENT: + last_dn = last_dn->parent; + break; + + case MORE_MEMORY: + case ERR_CFG_USE: + default: + printk(KERN_ERR "Unexpected Error (%d) " + "returned from configure-connector\n", rc); + goto cc_error; + } + } while (rc); + +cc_error: + kfree(data_buf); + + if (rc) { + if (first_dn) + dlpar_free_cc_nodes(first_dn); + + return NULL; + } + + return first_dn; +} + +int dlpar_attach_node(struct device_node *dn, struct device_node *parent) +{ + int rc; + + dn->parent = parent; + + rc = of_attach_node(dn); + if (rc) { + printk(KERN_ERR "Failed to add device node %pOF\n", dn); + return rc; + } + + return 0; +} + +int dlpar_detach_node(struct device_node *dn) +{ + struct device_node *child; + int rc; + + child = of_get_next_child(dn, NULL); + while (child) { + dlpar_detach_node(child); + child = of_get_next_child(dn, child); + } + + rc = of_detach_node(dn); + if (rc) + return rc; + + of_node_put(dn); + + return 0; +} + +#define DR_ENTITY_SENSE 9003 +#define DR_ENTITY_PRESENT 1 +#define DR_ENTITY_UNUSABLE 2 +#define ALLOCATION_STATE 9003 +#define ALLOC_UNUSABLE 0 +#define ALLOC_USABLE 1 +#define ISOLATION_STATE 9001 +#define ISOLATE 0 +#define UNISOLATE 1 + +int dlpar_acquire_drc(u32 drc_index) +{ + int dr_status, rc; + + rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status, + DR_ENTITY_SENSE, drc_index); + if (rc || dr_status != DR_ENTITY_UNUSABLE) + return -1; + + rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_USABLE); + if (rc) + return rc; + + rc = rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE); + if (rc) { + rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE); + return rc; + } + + return 0; +} + +int dlpar_release_drc(u32 drc_index) +{ + int dr_status, rc; + + rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status, + DR_ENTITY_SENSE, drc_index); + if (rc || dr_status != DR_ENTITY_PRESENT) + return -1; + + rc = rtas_set_indicator(ISOLATION_STATE, drc_index, ISOLATE); + if 
(rc) + return rc; + + rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE); + if (rc) { + rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE); + return rc; + } + + return 0; +} + +static int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_elog) +{ + int rc; + + /* pseries error logs are in BE format, convert to cpu type */ + switch (hp_elog->id_type) { + case PSERIES_HP_ELOG_ID_DRC_COUNT: + hp_elog->_drc_u.drc_count = + be32_to_cpu(hp_elog->_drc_u.drc_count); + break; + case PSERIES_HP_ELOG_ID_DRC_INDEX: + hp_elog->_drc_u.drc_index = + be32_to_cpu(hp_elog->_drc_u.drc_index); + break; + case PSERIES_HP_ELOG_ID_DRC_IC: + hp_elog->_drc_u.ic.count = + be32_to_cpu(hp_elog->_drc_u.ic.count); + hp_elog->_drc_u.ic.index = + be32_to_cpu(hp_elog->_drc_u.ic.index); + } + + switch (hp_elog->resource) { + case PSERIES_HP_ELOG_RESOURCE_MEM: + rc = dlpar_memory(hp_elog); + break; + case PSERIES_HP_ELOG_RESOURCE_CPU: + rc = dlpar_cpu(hp_elog); + break; + default: + pr_warn_ratelimited("Invalid resource (%d) specified\n", + hp_elog->resource); + rc = -EINVAL; + } + + return rc; +} + +static void pseries_hp_work_fn(struct work_struct *work) +{ + struct pseries_hp_work *hp_work = + container_of(work, struct pseries_hp_work, work); + + if (hp_work->rc) + *(hp_work->rc) = handle_dlpar_errorlog(hp_work->errlog); + else + handle_dlpar_errorlog(hp_work->errlog); + + if (hp_work->hp_completion) + complete(hp_work->hp_completion); + + kfree(hp_work->errlog); + kfree((void *)work); +} + +void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog, + struct completion *hotplug_done, int *rc) +{ + struct pseries_hp_work *work; + struct pseries_hp_errorlog *hp_errlog_copy; + + hp_errlog_copy = kmalloc(sizeof(struct pseries_hp_errorlog), + GFP_KERNEL); + memcpy(hp_errlog_copy, hp_errlog, sizeof(struct pseries_hp_errorlog)); + + work = kmalloc(sizeof(struct pseries_hp_work), GFP_KERNEL); + if (work) { + INIT_WORK((struct work_struct *)work, pseries_hp_work_fn); + work->errlog = hp_errlog_copy; + work->hp_completion = hotplug_done; + work->rc = rc; + queue_work(pseries_hp_wq, (struct work_struct *)work); + } else { + *rc = -ENOMEM; + kfree(hp_errlog_copy); + complete(hotplug_done); + } +} + +static int dlpar_parse_resource(char **cmd, struct pseries_hp_errorlog *hp_elog) +{ + char *arg; + + arg = strsep(cmd, " "); + if (!arg) + return -EINVAL; + + if (sysfs_streq(arg, "memory")) { + hp_elog->resource = PSERIES_HP_ELOG_RESOURCE_MEM; + } else if (sysfs_streq(arg, "cpu")) { + hp_elog->resource = PSERIES_HP_ELOG_RESOURCE_CPU; + } else { + pr_err("Invalid resource specified.\n"); + return -EINVAL; + } + + return 0; +} + +static int dlpar_parse_action(char **cmd, struct pseries_hp_errorlog *hp_elog) +{ + char *arg; + + arg = strsep(cmd, " "); + if (!arg) + return -EINVAL; + + if (sysfs_streq(arg, "add")) { + hp_elog->action = PSERIES_HP_ELOG_ACTION_ADD; + } else if (sysfs_streq(arg, "remove")) { + hp_elog->action = PSERIES_HP_ELOG_ACTION_REMOVE; + } else { + pr_err("Invalid action specified.\n"); + return -EINVAL; + } + + return 0; +} + +static int dlpar_parse_id_type(char **cmd, struct pseries_hp_errorlog *hp_elog) +{ + char *arg; + u32 count, index; + + arg = strsep(cmd, " "); + if (!arg) + return -EINVAL; + + if (sysfs_streq(arg, "indexed-count")) { + hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_IC; + arg = strsep(cmd, " "); + if (!arg) { + pr_err("No DRC count specified.\n"); + return -EINVAL; + } + + if (kstrtou32(arg, 0, &count)) { + pr_err("Invalid DRC count specified.\n"); + return -EINVAL; + 
} + + arg = strsep(cmd, " "); + if (!arg) { + pr_err("No DRC Index specified.\n"); + return -EINVAL; + } + + if (kstrtou32(arg, 0, &index)) { + pr_err("Invalid DRC Index specified.\n"); + return -EINVAL; + } + + hp_elog->_drc_u.ic.count = cpu_to_be32(count); + hp_elog->_drc_u.ic.index = cpu_to_be32(index); + } else if (sysfs_streq(arg, "index")) { + hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_INDEX; + arg = strsep(cmd, " "); + if (!arg) { + pr_err("No DRC Index specified.\n"); + return -EINVAL; + } + + if (kstrtou32(arg, 0, &index)) { + pr_err("Invalid DRC Index specified.\n"); + return -EINVAL; + } + + hp_elog->_drc_u.drc_index = cpu_to_be32(index); + } else if (sysfs_streq(arg, "count")) { + hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_COUNT; + arg = strsep(cmd, " "); + if (!arg) { + pr_err("No DRC count specified.\n"); + return -EINVAL; + } + + if (kstrtou32(arg, 0, &count)) { + pr_err("Invalid DRC count specified.\n"); + return -EINVAL; + } + + hp_elog->_drc_u.drc_count = cpu_to_be32(count); + } else { + pr_err("Invalid id_type specified.\n"); + return -EINVAL; + } + + return 0; +} + +static ssize_t dlpar_store(struct class *class, struct class_attribute *attr, + const char *buf, size_t count) +{ + struct pseries_hp_errorlog *hp_elog; + struct completion hotplug_done; + char *argbuf; + char *args; + int rc; + + args = argbuf = kstrdup(buf, GFP_KERNEL); + hp_elog = kzalloc(sizeof(*hp_elog), GFP_KERNEL); + if (!hp_elog || !argbuf) { + pr_info("Could not allocate resources for DLPAR operation\n"); + kfree(argbuf); + kfree(hp_elog); + return -ENOMEM; + } + + /* + * Parse out the request from the user, this will be in the form: + * <resource> <action> <id_type> <id> + */ + rc = dlpar_parse_resource(&args, hp_elog); + if (rc) + goto dlpar_store_out; + + rc = dlpar_parse_action(&args, hp_elog); + if (rc) + goto dlpar_store_out; + + rc = dlpar_parse_id_type(&args, hp_elog); + if (rc) + goto dlpar_store_out; + + init_completion(&hotplug_done); + queue_hotplug_event(hp_elog, &hotplug_done, &rc); + wait_for_completion(&hotplug_done); + +dlpar_store_out: + kfree(argbuf); + kfree(hp_elog); + + if (rc) + pr_err("Could not handle DLPAR request \"%s\"\n", buf); + + return rc ? rc : count; +} + +static ssize_t dlpar_show(struct class *class, struct class_attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", "memory,cpu"); +} + +static CLASS_ATTR_RW(dlpar); + +int __init dlpar_workqueue_init(void) +{ + if (pseries_hp_wq) + return 0; + + pseries_hp_wq = alloc_workqueue("pseries hotplug workqueue", + WQ_UNBOUND, 1); + + return pseries_hp_wq ? 0 : -ENOMEM; +} + +static int __init dlpar_sysfs_init(void) +{ + int rc; + + rc = dlpar_workqueue_init(); + if (rc) + return rc; + + return sysfs_create_file(kernel_kobj, &class_attr_dlpar.attr); +} +machine_device_initcall(pseries, dlpar_sysfs_init); + diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c new file mode 100644 index 000000000..ef6595153 --- /dev/null +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -0,0 +1,394 @@ +/* + * Virtual Processor Dispatch Trace Log + * + * (C) Copyright IBM Corporation 2009 + * + * Author: Jeremy Kerr <jk@ozlabs.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <asm/smp.h> +#include <linux/uaccess.h> +#include <asm/firmware.h> +#include <asm/lppaca.h> +#include <asm/debugfs.h> +#include <asm/plpar_wrappers.h> +#include <asm/machdep.h> + +struct dtl { + struct dtl_entry *buf; + struct dentry *file; + int cpu; + int buf_entries; + u64 last_idx; + spinlock_t lock; +}; +static DEFINE_PER_CPU(struct dtl, cpu_dtl); + +/* + * Dispatch trace log event mask: + * 0x7: 0x1: voluntary virtual processor waits + * 0x2: time-slice preempts + * 0x4: virtual partition memory page faults + */ +static u8 dtl_event_mask = 0x7; + + +/* + * Size of per-cpu log buffers. Firmware requires that the buffer does + * not cross a 4k boundary. + */ +static int dtl_buf_entries = N_DISPATCH_LOG; + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +struct dtl_ring { + u64 write_index; + struct dtl_entry *write_ptr; + struct dtl_entry *buf; + struct dtl_entry *buf_end; + u8 saved_dtl_mask; +}; + +static DEFINE_PER_CPU(struct dtl_ring, dtl_rings); + +static atomic_t dtl_count; + +/* + * The cpu accounting code controls the DTL ring buffer, and we get + * given entries as they are processed. + */ +static void consume_dtle(struct dtl_entry *dtle, u64 index) +{ + struct dtl_ring *dtlr = this_cpu_ptr(&dtl_rings); + struct dtl_entry *wp = dtlr->write_ptr; + struct lppaca *vpa = local_paca->lppaca_ptr; + + if (!wp) + return; + + *wp = *dtle; + barrier(); + + /* check for hypervisor ring buffer overflow, ignore this entry if so */ + if (index + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) + return; + + ++wp; + if (wp == dtlr->buf_end) + wp = dtlr->buf; + dtlr->write_ptr = wp; + + /* incrementing write_index makes the new entry visible */ + smp_wmb(); + ++dtlr->write_index; +} + +static int dtl_start(struct dtl *dtl) +{ + struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu); + + dtlr->buf = dtl->buf; + dtlr->buf_end = dtl->buf + dtl->buf_entries; + dtlr->write_index = 0; + + /* setting write_ptr enables logging into our buffer */ + smp_wmb(); + dtlr->write_ptr = dtl->buf; + + /* enable event logging */ + dtlr->saved_dtl_mask = lppaca_of(dtl->cpu).dtl_enable_mask; + lppaca_of(dtl->cpu).dtl_enable_mask |= dtl_event_mask; + + dtl_consumer = consume_dtle; + atomic_inc(&dtl_count); + return 0; +} + +static void dtl_stop(struct dtl *dtl) +{ + struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu); + + dtlr->write_ptr = NULL; + smp_wmb(); + + dtlr->buf = NULL; + + /* restore dtl_enable_mask */ + lppaca_of(dtl->cpu).dtl_enable_mask = dtlr->saved_dtl_mask; + + if (atomic_dec_and_test(&dtl_count)) + dtl_consumer = NULL; +} + +static u64 dtl_current_index(struct dtl *dtl) +{ + return per_cpu(dtl_rings, dtl->cpu).write_index; +} + +#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ + +static int dtl_start(struct dtl *dtl) +{ + unsigned long addr; + int ret, hwcpu; + + /* Register our dtl buffer with the hypervisor. 
The HV expects the + * buffer size to be passed in the second word of the buffer */ + ((u32 *)dtl->buf)[1] = cpu_to_be32(DISPATCH_LOG_BYTES); + + hwcpu = get_hard_smp_processor_id(dtl->cpu); + addr = __pa(dtl->buf); + ret = register_dtl(hwcpu, addr); + if (ret) { + printk(KERN_WARNING "%s: DTL registration for cpu %d (hw %d) " + "failed with %d\n", __func__, dtl->cpu, hwcpu, ret); + return -EIO; + } + + /* set our initial buffer indices */ + lppaca_of(dtl->cpu).dtl_idx = 0; + + /* ensure that our updates to the lppaca fields have occurred before + * we actually enable the logging */ + smp_wmb(); + + /* enable event logging */ + lppaca_of(dtl->cpu).dtl_enable_mask = dtl_event_mask; + + return 0; +} + +static void dtl_stop(struct dtl *dtl) +{ + int hwcpu = get_hard_smp_processor_id(dtl->cpu); + + lppaca_of(dtl->cpu).dtl_enable_mask = 0x0; + + unregister_dtl(hwcpu); +} + +static u64 dtl_current_index(struct dtl *dtl) +{ + return be64_to_cpu(lppaca_of(dtl->cpu).dtl_idx); +} +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ + +static int dtl_enable(struct dtl *dtl) +{ + long int n_entries; + long int rc; + struct dtl_entry *buf = NULL; + + if (!dtl_cache) + return -ENOMEM; + + /* only allow one reader */ + if (dtl->buf) + return -EBUSY; + + n_entries = dtl_buf_entries; + buf = kmem_cache_alloc_node(dtl_cache, GFP_KERNEL, cpu_to_node(dtl->cpu)); + if (!buf) { + printk(KERN_WARNING "%s: buffer alloc failed for cpu %d\n", + __func__, dtl->cpu); + return -ENOMEM; + } + + spin_lock(&dtl->lock); + rc = -EBUSY; + if (!dtl->buf) { + /* store the original allocation size for use during read */ + dtl->buf_entries = n_entries; + dtl->buf = buf; + dtl->last_idx = 0; + rc = dtl_start(dtl); + if (rc) + dtl->buf = NULL; + } + spin_unlock(&dtl->lock); + + if (rc) + kmem_cache_free(dtl_cache, buf); + return rc; +} + +static void dtl_disable(struct dtl *dtl) +{ + spin_lock(&dtl->lock); + dtl_stop(dtl); + kmem_cache_free(dtl_cache, dtl->buf); + dtl->buf = NULL; + dtl->buf_entries = 0; + spin_unlock(&dtl->lock); +} + +/* file interface */ + +static int dtl_file_open(struct inode *inode, struct file *filp) +{ + struct dtl *dtl = inode->i_private; + int rc; + + rc = dtl_enable(dtl); + if (rc) + return rc; + + filp->private_data = dtl; + return 0; +} + +static int dtl_file_release(struct inode *inode, struct file *filp) +{ + struct dtl *dtl = inode->i_private; + dtl_disable(dtl); + return 0; +} + +static ssize_t dtl_file_read(struct file *filp, char __user *buf, size_t len, + loff_t *pos) +{ + long int rc, n_read, n_req, read_size; + struct dtl *dtl; + u64 cur_idx, last_idx, i; + + if ((len % sizeof(struct dtl_entry)) != 0) + return -EINVAL; + + dtl = filp->private_data; + + /* requested number of entries to read */ + n_req = len / sizeof(struct dtl_entry); + + /* actual number of entries read */ + n_read = 0; + + spin_lock(&dtl->lock); + + cur_idx = dtl_current_index(dtl); + last_idx = dtl->last_idx; + + if (last_idx + dtl->buf_entries <= cur_idx) + last_idx = cur_idx - dtl->buf_entries + 1; + + if (last_idx + n_req > cur_idx) + n_req = cur_idx - last_idx; + + if (n_req > 0) + dtl->last_idx = last_idx + n_req; + + spin_unlock(&dtl->lock); + + if (n_req <= 0) + return 0; + + i = last_idx % dtl->buf_entries; + + /* read the tail of the buffer if we've wrapped */ + if (i + n_req > dtl->buf_entries) { + read_size = dtl->buf_entries - i; + + rc = copy_to_user(buf, &dtl->buf[i], + read_size * sizeof(struct dtl_entry)); + if (rc) + return -EFAULT; + + i = 0; + n_req -= read_size; + n_read += read_size; + buf += read_size * 
sizeof(struct dtl_entry); + } + + /* .. and now the head */ + rc = copy_to_user(buf, &dtl->buf[i], n_req * sizeof(struct dtl_entry)); + if (rc) + return -EFAULT; + + n_read += n_req; + + return n_read * sizeof(struct dtl_entry); +} + +static const struct file_operations dtl_fops = { + .open = dtl_file_open, + .release = dtl_file_release, + .read = dtl_file_read, + .llseek = no_llseek, +}; + +static struct dentry *dtl_dir; + +static int dtl_setup_file(struct dtl *dtl) +{ + char name[10]; + + sprintf(name, "cpu-%d", dtl->cpu); + + dtl->file = debugfs_create_file(name, 0400, dtl_dir, dtl, &dtl_fops); + if (!dtl->file) + return -ENOMEM; + + return 0; +} + +static int dtl_init(void) +{ + struct dentry *event_mask_file, *buf_entries_file; + int rc, i; + + if (!firmware_has_feature(FW_FEATURE_SPLPAR)) + return -ENODEV; + + /* set up common debugfs structure */ + + rc = -ENOMEM; + dtl_dir = debugfs_create_dir("dtl", powerpc_debugfs_root); + if (!dtl_dir) { + printk(KERN_WARNING "%s: can't create dtl root dir\n", + __func__); + goto err; + } + + event_mask_file = debugfs_create_x8("dtl_event_mask", 0600, + dtl_dir, &dtl_event_mask); + buf_entries_file = debugfs_create_u32("dtl_buf_entries", 0400, + dtl_dir, &dtl_buf_entries); + + if (!event_mask_file || !buf_entries_file) { + printk(KERN_WARNING "%s: can't create dtl files\n", __func__); + goto err_remove_dir; + } + + /* set up the per-cpu log structures */ + for_each_possible_cpu(i) { + struct dtl *dtl = &per_cpu(cpu_dtl, i); + spin_lock_init(&dtl->lock); + dtl->cpu = i; + + rc = dtl_setup_file(dtl); + if (rc) + goto err_remove_dir; + } + + return 0; + +err_remove_dir: + debugfs_remove_recursive(dtl_dir); +err: + return rc; +} +machine_arch_initcall(pseries, dtl_init); diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c new file mode 100644 index 000000000..823cb27ef --- /dev/null +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -0,0 +1,884 @@ +/* + * The file intends to implement the platform dependent EEH operations on pseries. + * Actually, the pseries platform is built based on RTAS heavily. That means the + * pseries platform dependent EEH operations will be built on RTAS calls. The functions + * are derived from arch/powerpc/platforms/pseries/eeh.c and necessary cleanup has + * been done. + * + * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2011. + * Copyright IBM Corporation 2001, 2005, 2006 + * Copyright Dave Engebretsen & Todd Inglett 2001 + * Copyright Linas Vepstas 2005, 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/atomic.h> +#include <linux/delay.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/of.h> +#include <linux/pci.h> +#include <linux/proc_fs.h> +#include <linux/rbtree.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> + +#include <asm/eeh.h> +#include <asm/eeh_event.h> +#include <asm/io.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> +#include <asm/rtas.h> + +/* RTAS tokens */ +static int ibm_set_eeh_option; +static int ibm_set_slot_reset; +static int ibm_read_slot_reset_state; +static int ibm_read_slot_reset_state2; +static int ibm_slot_error_detail; +static int ibm_get_config_addr_info; +static int ibm_get_config_addr_info2; +static int ibm_configure_pe; + +#ifdef CONFIG_PCI_IOV +void pseries_pcibios_bus_add_device(struct pci_dev *pdev) +{ + struct pci_dn *pdn = pci_get_pdn(pdev); + struct pci_dn *physfn_pdn; + struct eeh_dev *edev; + + if (!pdev->is_virtfn) + return; + + pdn->device_id = pdev->device; + pdn->vendor_id = pdev->vendor; + pdn->class_code = pdev->class; + /* + * Last allow unfreeze return code used for retrieval + * by user space in eeh-sysfs to show the last command + * completion from platform. + */ + pdn->last_allow_rc = 0; + physfn_pdn = pci_get_pdn(pdev->physfn); + pdn->pe_number = physfn_pdn->pe_num_map[pdn->vf_index]; + edev = pdn_to_eeh_dev(pdn); + + /* + * The following operations will fail if VF's sysfs files + * aren't created or its resources aren't finalized. + */ + eeh_add_device_early(pdn); + eeh_add_device_late(pdev); + edev->pe_config_addr = (pdn->busno << 16) | (pdn->devfn << 8); + eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */ + eeh_add_to_parent_pe(edev); /* Add as VF PE type */ + eeh_sysfs_add_device(pdev); + +} +#endif + +/* + * Buffer for reporting slot-error-detail rtas calls. Its here + * in BSS, and not dynamically alloced, so that it ends up in + * RMO where RTAS can access it. + */ +static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX]; +static DEFINE_SPINLOCK(slot_errbuf_lock); +static int eeh_error_buf_size; + +/** + * pseries_eeh_init - EEH platform dependent initialization + * + * EEH platform dependent initialization on pseries. + */ +static int pseries_eeh_init(void) +{ + /* figure out EEH RTAS function call tokens */ + ibm_set_eeh_option = rtas_token("ibm,set-eeh-option"); + ibm_set_slot_reset = rtas_token("ibm,set-slot-reset"); + ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2"); + ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state"); + ibm_slot_error_detail = rtas_token("ibm,slot-error-detail"); + ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2"); + ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info"); + ibm_configure_pe = rtas_token("ibm,configure-pe"); + + /* + * ibm,configure-pe and ibm,configure-bridge have the same semantics, + * however ibm,configure-pe can be faster. If we can't find + * ibm,configure-pe then fall back to using ibm,configure-bridge. + */ + if (ibm_configure_pe == RTAS_UNKNOWN_SERVICE) + ibm_configure_pe = rtas_token("ibm,configure-bridge"); + + /* + * Necessary sanity check. 
We needn't check "get-config-addr-info" + * and its variant since the old firmware probably support address + * of domain/bus/slot/function for EEH RTAS operations. + */ + if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE || + ibm_set_slot_reset == RTAS_UNKNOWN_SERVICE || + (ibm_read_slot_reset_state2 == RTAS_UNKNOWN_SERVICE && + ibm_read_slot_reset_state == RTAS_UNKNOWN_SERVICE) || + ibm_slot_error_detail == RTAS_UNKNOWN_SERVICE || + ibm_configure_pe == RTAS_UNKNOWN_SERVICE) { + pr_info("EEH functionality not supported\n"); + return -EINVAL; + } + + /* Initialize error log lock and size */ + spin_lock_init(&slot_errbuf_lock); + eeh_error_buf_size = rtas_token("rtas-error-log-max"); + if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) { + pr_info("%s: unknown EEH error log size\n", + __func__); + eeh_error_buf_size = 1024; + } else if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) { + pr_info("%s: EEH error log size %d exceeds the maximal %d\n", + __func__, eeh_error_buf_size, RTAS_ERROR_LOG_MAX); + eeh_error_buf_size = RTAS_ERROR_LOG_MAX; + } + + /* Set EEH probe mode */ + eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG); + +#ifdef CONFIG_PCI_IOV + /* Set EEH machine dependent code */ + ppc_md.pcibios_bus_add_device = pseries_pcibios_bus_add_device; +#endif + + return 0; +} + +static int pseries_eeh_cap_start(struct pci_dn *pdn) +{ + u32 status; + + if (!pdn) + return 0; + + rtas_read_config(pdn, PCI_STATUS, 2, &status); + if (!(status & PCI_STATUS_CAP_LIST)) + return 0; + + return PCI_CAPABILITY_LIST; +} + + +static int pseries_eeh_find_cap(struct pci_dn *pdn, int cap) +{ + int pos = pseries_eeh_cap_start(pdn); + int cnt = 48; /* Maximal number of capabilities */ + u32 id; + + if (!pos) + return 0; + + while (cnt--) { + rtas_read_config(pdn, pos, 1, &pos); + if (pos < 0x40) + break; + pos &= ~3; + rtas_read_config(pdn, pos + PCI_CAP_LIST_ID, 1, &id); + if (id == 0xff) + break; + if (id == cap) + return pos; + pos += PCI_CAP_LIST_NEXT; + } + + return 0; +} + +static int pseries_eeh_find_ecap(struct pci_dn *pdn, int cap) +{ + struct eeh_dev *edev = pdn_to_eeh_dev(pdn); + u32 header; + int pos = 256; + int ttl = (4096 - 256) / 8; + + if (!edev || !edev->pcie_cap) + return 0; + if (rtas_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL) + return 0; + else if (!header) + return 0; + + while (ttl-- > 0) { + if (PCI_EXT_CAP_ID(header) == cap && pos) + return pos; + + pos = PCI_EXT_CAP_NEXT(header); + if (pos < 256) + break; + + if (rtas_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL) + break; + } + + return 0; +} + +/** + * pseries_eeh_probe - EEH probe on the given device + * @pdn: PCI device node + * @data: Unused + * + * When EEH module is installed during system boot, all PCI devices + * are checked one by one to see if it supports EEH. The function + * is introduced for the purpose. + */ +static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) +{ + struct eeh_dev *edev; + struct eeh_pe pe; + u32 pcie_flags; + int enable = 0; + int ret; + + /* Retrieve OF node and eeh device */ + edev = pdn_to_eeh_dev(pdn); + if (!edev || edev->pe) + return NULL; + + /* Check class/vendor/device IDs */ + if (!pdn->vendor_id || !pdn->device_id || !pdn->class_code) + return NULL; + + /* Skip for PCI-ISA bridge */ + if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) + return NULL; + + /* + * Update class code and mode of eeh device. We need + * correctly reflects that current device is root port + * or PCIe switch downstream port. 
+ */ + edev->class_code = pdn->class_code; + edev->pcix_cap = pseries_eeh_find_cap(pdn, PCI_CAP_ID_PCIX); + edev->pcie_cap = pseries_eeh_find_cap(pdn, PCI_CAP_ID_EXP); + edev->aer_cap = pseries_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR); + edev->mode &= 0xFFFFFF00; + if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) { + edev->mode |= EEH_DEV_BRIDGE; + if (edev->pcie_cap) { + rtas_read_config(pdn, edev->pcie_cap + PCI_EXP_FLAGS, + 2, &pcie_flags); + pcie_flags = (pcie_flags & PCI_EXP_FLAGS_TYPE) >> 4; + if (pcie_flags == PCI_EXP_TYPE_ROOT_PORT) + edev->mode |= EEH_DEV_ROOT_PORT; + else if (pcie_flags == PCI_EXP_TYPE_DOWNSTREAM) + edev->mode |= EEH_DEV_DS_PORT; + } + } + + /* Initialize the fake PE */ + memset(&pe, 0, sizeof(struct eeh_pe)); + pe.phb = pdn->phb; + pe.config_addr = (pdn->busno << 16) | (pdn->devfn << 8); + + /* Enable EEH on the device */ + ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE); + if (!ret) { + /* Retrieve PE address */ + edev->pe_config_addr = eeh_ops->get_pe_addr(&pe); + pe.addr = edev->pe_config_addr; + + /* Some older systems (Power4) allow the ibm,set-eeh-option + * call to succeed even on nodes where EEH is not supported. + * Verify support explicitly. + */ + ret = eeh_ops->get_state(&pe, NULL); + if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT) + enable = 1; + + if (enable) { + eeh_add_flag(EEH_ENABLED); + eeh_add_to_parent_pe(edev); + + pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%x-PE#%x\n", + __func__, pdn->busno, PCI_SLOT(pdn->devfn), + PCI_FUNC(pdn->devfn), pe.phb->global_number, + pe.addr); + } else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) && + (pdn_to_eeh_dev(pdn->parent))->pe) { + /* This device doesn't support EEH, but it may have an + * EEH parent, in which case we mark it as supported. + */ + edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr; + eeh_add_to_parent_pe(edev); + } + } + + /* Save memory bars */ + eeh_save_bars(edev); + + return NULL; +} + +/** + * pseries_eeh_set_option - Initialize EEH or MMIO/DMA reenable + * @pe: EEH PE + * @option: operation to be issued + * + * The function is used to control the EEH functionality globally. + * Currently, following options are support according to PAPR: + * Enable EEH, Disable EEH, Enable MMIO and Enable DMA + */ +static int pseries_eeh_set_option(struct eeh_pe *pe, int option) +{ + int ret = 0; + int config_addr; + + /* + * When we're enabling or disabling EEH functioality on + * the particular PE, the PE config address is possibly + * unavailable. Therefore, we have to figure it out from + * the FDT node. + */ + switch (option) { + case EEH_OPT_DISABLE: + case EEH_OPT_ENABLE: + case EEH_OPT_THAW_MMIO: + case EEH_OPT_THAW_DMA: + config_addr = pe->config_addr; + if (pe->addr) + config_addr = pe->addr; + break; + case EEH_OPT_FREEZE_PE: + /* Not support */ + return 0; + default: + pr_err("%s: Invalid option %d\n", + __func__, option); + return -EINVAL; + } + + ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL, + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid), option); + + return ret; +} + +/** + * pseries_eeh_get_pe_addr - Retrieve PE address + * @pe: EEH PE + * + * Retrieve the assocated PE address. Actually, there're 2 RTAS + * function calls dedicated for the purpose. We need implement + * it through the new function and then the old one. Besides, + * you should make sure the config address is figured out from + * FDT node before calling the function. + * + * It's notable that zero'ed return value means invalid PE config + * address. 
+ */
+static int pseries_eeh_get_pe_addr(struct eeh_pe *pe)
+{
+	int ret = 0;
+	int rets[3];
+
+	if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
+		/*
+		 * First of all, we need to make sure there is one PE
+		 * associated with the device. Otherwise, the PE address
+		 * is meaningless.
+		 */
+		ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
+				pe->config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid), 1);
+		if (ret || (rets[0] == 0))
+			return 0;
+
+		/* Retrieve the associated PE config address */
+		ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
+				pe->config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid), 0);
+		if (ret) {
+			pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n",
+				__func__, pe->phb->global_number, pe->config_addr);
+			return 0;
+		}
+
+		return rets[0];
+	}
+
+	if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) {
+		ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets,
+				pe->config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid), 0);
+		if (ret) {
+			pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n",
+				__func__, pe->phb->global_number, pe->config_addr);
+			return 0;
+		}
+
+		return rets[0];
+	}
+
+	return ret;
+}
+
+/**
+ * pseries_eeh_get_state - Retrieve PE state
+ * @pe: EEH PE
+ * @state: return value
+ *
+ * Retrieve the state of the specified PE. On RTAS-compliant pseries
+ * platforms there is a dedicated RTAS function for this purpose. Note
+ * that the associated PE config address might already be available
+ * when this function is called, so use the PE config address when
+ * possible. Furthermore, there are two RTAS calls for this purpose;
+ * we try the new one first and fall back to the old one if the new
+ * one doesn't work properly.
+ */
+static int pseries_eeh_get_state(struct eeh_pe *pe, int *state)
+{
+	int config_addr;
+	int ret;
+	int rets[4];
+	int result;
+
+	/* Figure out PE config address if possible */
+	config_addr = pe->config_addr;
+	if (pe->addr)
+		config_addr = pe->addr;
+
+	if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) {
+		ret = rtas_call(ibm_read_slot_reset_state2, 3, 4, rets,
+				config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid));
+	} else if (ibm_read_slot_reset_state != RTAS_UNKNOWN_SERVICE) {
+		/* Fake PE unavailable info */
+		rets[2] = 0;
+		ret = rtas_call(ibm_read_slot_reset_state, 3, 3, rets,
+				config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid));
+	} else {
+		return EEH_STATE_NOT_SUPPORT;
+	}
+
+	if (ret)
+		return ret;
+
+	/* Parse the result out */
+	if (!rets[1])
+		return EEH_STATE_NOT_SUPPORT;
+
+	switch (rets[0]) {
+	case 0:
+		result = EEH_STATE_MMIO_ACTIVE |
+			 EEH_STATE_DMA_ACTIVE;
+		break;
+	case 1:
+		result = EEH_STATE_RESET_ACTIVE |
+			 EEH_STATE_MMIO_ACTIVE |
+			 EEH_STATE_DMA_ACTIVE;
+		break;
+	case 2:
+		result = 0;
+		break;
+	case 4:
+		result = EEH_STATE_MMIO_ENABLED;
+		break;
+	case 5:
+		if (rets[2]) {
+			if (state)
+				*state = rets[2];
+			result = EEH_STATE_UNAVAILABLE;
+		} else {
+			result = EEH_STATE_NOT_SUPPORT;
+		}
+		break;
+	default:
+		result = EEH_STATE_NOT_SUPPORT;
+	}
+
+	return result;
+}
+
+/**
+ * pseries_eeh_reset - Reset the specified PE
+ * @pe: EEH PE
+ * @option: reset option
+ *
+ * Reset the specified PE.
+ */
+static int pseries_eeh_reset(struct eeh_pe *pe, int option)
+{
+	int config_addr;
+	int ret;
+
+	/* Figure out PE address */
+	config_addr = pe->config_addr;
+	if (pe->addr)
+		config_addr = pe->addr;
+
+	/* Reset PE through RTAS call */
+	ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
+			config_addr, BUID_HI(pe->phb->buid),
+			BUID_LO(pe->phb->buid), option);
+
+	/* If fundamental-reset not supported, try hot-reset */
+	if (option == EEH_RESET_FUNDAMENTAL &&
+	    ret == -8) {
+		option = EEH_RESET_HOT;
+		ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
+				config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid), option);
+	}
+
+	/* We need reset hold or settlement delay */
+	if (option == EEH_RESET_FUNDAMENTAL ||
+	    option == EEH_RESET_HOT)
+		msleep(EEH_PE_RST_HOLD_TIME);
+	else
+		msleep(EEH_PE_RST_SETTLE_TIME);
+
+	return ret;
+}
+
+/**
+ * pseries_eeh_wait_state - Wait for PE state
+ * @pe: EEH PE
+ * @max_wait: maximum wait period in milliseconds
+ *
+ * Wait for the state of the associated PE. It might take some time
+ * to retrieve the PE's state.
+ */
+static int pseries_eeh_wait_state(struct eeh_pe *pe, int max_wait)
+{
+	int ret;
+	int mwait;
+
+	/*
+	 * According to PAPR, the state of a PE might be temporarily
+	 * unavailable. Under that circumstance, we have to wait for a
+	 * period indicated by the firmware. The maximum wait time is
+	 * 5 minutes and the minimum wait time is 1 second, both taken
+	 * from the original EEH implementation.
+	 */
+#define EEH_STATE_MIN_WAIT_TIME	(1000)
+#define EEH_STATE_MAX_WAIT_TIME	(300 * 1000)
+
+	while (1) {
+		ret = pseries_eeh_get_state(pe, &mwait);
+
+		/*
+		 * If the PE's state is temporarily unavailable,
+		 * we have to wait for the specified time. Otherwise,
+		 * the PE's state will be returned immediately.
+		 */
+		if (ret != EEH_STATE_UNAVAILABLE)
+			return ret;
+
+		if (max_wait <= 0) {
+			pr_warn("%s: Timeout when getting PE's state (%d)\n",
+				__func__, max_wait);
+			return EEH_STATE_NOT_SUPPORT;
+		}
+
+		if (mwait <= 0) {
+			pr_warn("%s: Firmware returned bad wait value %d\n",
+				__func__, mwait);
+			mwait = EEH_STATE_MIN_WAIT_TIME;
+		} else if (mwait > EEH_STATE_MAX_WAIT_TIME) {
+			pr_warn("%s: Firmware returned too long wait value %d\n",
+				__func__, mwait);
+			mwait = EEH_STATE_MAX_WAIT_TIME;
+		}
+
+		max_wait -= mwait;
+		msleep(mwait);
+	}
+
+	return EEH_STATE_NOT_SUPPORT;
+}
+
+/**
+ * pseries_eeh_get_log - Retrieve error log
+ * @pe: EEH PE
+ * @severity: temporary or permanent error log
+ * @drv_log: driver log to be combined with retrieved error log
+ * @len: length of driver log
+ *
+ * Retrieve the temporary or permanent error from the PE.
+ * The error is retrieved through a dedicated RTAS call.
+ */
+static int pseries_eeh_get_log(struct eeh_pe *pe, int severity, char *drv_log, unsigned long len)
+{
+	int config_addr;
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&slot_errbuf_lock, flags);
+	memset(slot_errbuf, 0, eeh_error_buf_size);
+
+	/* Figure out the PE address */
+	config_addr = pe->config_addr;
+	if (pe->addr)
+		config_addr = pe->addr;
+
+	ret = rtas_call(ibm_slot_error_detail, 8, 1, NULL, config_addr,
+			BUID_HI(pe->phb->buid), BUID_LO(pe->phb->buid),
+			virt_to_phys(drv_log), len,
+			virt_to_phys(slot_errbuf), eeh_error_buf_size,
+			severity);
+	if (!ret)
+		log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
+	spin_unlock_irqrestore(&slot_errbuf_lock, flags);
+
+	return ret;
+}
+
+/**
+ * pseries_eeh_configure_bridge - Configure PCI bridges in the indicated PE
+ * @pe: EEH PE
+ *
+ * This function is called to reconfigure the bridges included in the
+ * specified PE so that the malfunctioning PE can be recovered.
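+ *
+ * For example, an RTAS busy status of RTAS_EXTENDED_DELAY_MIN + 2
+ * (9902) maps to a 100ms delay via rtas_busy_delay_time(); the retry
+ * loop below also caps any longer firmware-requested delay at that
+ * value.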
+ */
+static int pseries_eeh_configure_bridge(struct eeh_pe *pe)
+{
+	int config_addr;
+	int ret;
+	/* Waiting 0.2s maximum before skipping configuration */
+	int max_wait = 200;
+
+	/* Figure out the PE address */
+	config_addr = pe->config_addr;
+	if (pe->addr)
+		config_addr = pe->addr;
+
+	while (max_wait > 0) {
+		ret = rtas_call(ibm_configure_pe, 3, 1, NULL,
+				config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid));
+
+		if (!ret)
+			return ret;
+
+		/*
+		 * If RTAS returns a delay value that's above 100ms, cut it
+		 * down to 100ms in case firmware made a mistake. For more
+		 * on how these delay values work see rtas_busy_delay_time().
+		 */
+		if (ret > RTAS_EXTENDED_DELAY_MIN+2 &&
+		    ret <= RTAS_EXTENDED_DELAY_MAX)
+			ret = RTAS_EXTENDED_DELAY_MIN+2;
+
+		max_wait -= rtas_busy_delay_time(ret);
+
+		if (max_wait < 0)
+			break;
+
+		rtas_busy_delay(ret);
+	}
+
+	pr_warn("%s: Unable to configure bridge PHB#%x-PE#%x (%d)\n",
+		__func__, pe->phb->global_number, pe->addr, ret);
+	return ret;
+}
+
+/**
+ * pseries_eeh_read_config - Read PCI config space
+ * @pdn: PCI device node
+ * @where: PCI address
+ * @size: size to read
+ * @val: return value
+ *
+ * Read config space from the specified device.
+ */
+static int pseries_eeh_read_config(struct pci_dn *pdn, int where, int size, u32 *val)
+{
+	return rtas_read_config(pdn, where, size, val);
+}
+
+/**
+ * pseries_eeh_write_config - Write PCI config space
+ * @pdn: PCI device node
+ * @where: PCI address
+ * @size: size to write
+ * @val: value to be written
+ *
+ * Write config space to the specified device.
+ */
+static int pseries_eeh_write_config(struct pci_dn *pdn, int where, int size, u32 val)
+{
+	return rtas_write_config(pdn, where, size, val);
+}
+
+static int pseries_eeh_restore_config(struct pci_dn *pdn)
+{
+	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+	s64 ret = 0;
+
+	if (!edev)
+		return -EEXIST;
+
+	/*
+	 * FIXME: The MPS, error routing rules and timeout settings are
+	 * worth exporting from firmware in an extensible way.
+	 */
+	if (edev->physfn)
+		ret = eeh_restore_vf_config(pdn);
+
+	if (ret) {
+		pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n",
+			__func__, edev->pe_config_addr, ret);
+		return -EIO;
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_PCI_IOV
+int pseries_send_allow_unfreeze(struct pci_dn *pdn,
+				u16 *vf_pe_array, int cur_vfs)
+{
+	int rc;
+	int ibm_allow_unfreeze = rtas_token("ibm,open-sriov-allow-unfreeze");
+	unsigned long buid, addr;
+
+	addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
+	buid = pdn->phb->buid;
+	spin_lock(&rtas_data_buf_lock);
+	memcpy(rtas_data_buf, vf_pe_array, RTAS_DATA_BUF_SIZE);
+	rc = rtas_call(ibm_allow_unfreeze, 5, 1, NULL,
+		       addr,
+		       BUID_HI(buid),
+		       BUID_LO(buid),
+		       rtas_data_buf, cur_vfs * sizeof(u16));
+	spin_unlock(&rtas_data_buf_lock);
+	if (rc)
+		pr_warn("%s: Failed to allow unfreeze for PHB#%x-PE#%lx, rc=%x\n",
+			__func__,
+			pdn->phb->global_number, addr, rc);
+	return rc;
+}
+
+static int pseries_call_allow_unfreeze(struct eeh_dev *edev)
+{
+	struct pci_dn *pdn, *tmp, *parent, *physfn_pdn;
+	int cur_vfs = 0, rc = 0, vf_index, bus, devfn;
+	u16 *vf_pe_array;
+
+	vf_pe_array = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+	if (!vf_pe_array)
+		return -ENOMEM;
+	if (pci_num_vf(edev->physfn ?
edev->physfn : edev->pdev)) { + if (edev->pdev->is_physfn) { + cur_vfs = pci_num_vf(edev->pdev); + pdn = eeh_dev_to_pdn(edev); + parent = pdn->parent; + for (vf_index = 0; vf_index < cur_vfs; vf_index++) + vf_pe_array[vf_index] = + cpu_to_be16(pdn->pe_num_map[vf_index]); + rc = pseries_send_allow_unfreeze(pdn, vf_pe_array, + cur_vfs); + pdn->last_allow_rc = rc; + for (vf_index = 0; vf_index < cur_vfs; vf_index++) { + list_for_each_entry_safe(pdn, tmp, + &parent->child_list, + list) { + bus = pci_iov_virtfn_bus(edev->pdev, + vf_index); + devfn = pci_iov_virtfn_devfn(edev->pdev, + vf_index); + if (pdn->busno != bus || + pdn->devfn != devfn) + continue; + pdn->last_allow_rc = rc; + } + } + } else { + pdn = pci_get_pdn(edev->pdev); + vf_pe_array[0] = cpu_to_be16(pdn->pe_number); + physfn_pdn = pci_get_pdn(edev->physfn); + rc = pseries_send_allow_unfreeze(physfn_pdn, + vf_pe_array, 1); + pdn->last_allow_rc = rc; + } + } + + kfree(vf_pe_array); + return rc; +} + +static int pseries_notify_resume(struct pci_dn *pdn) +{ + struct eeh_dev *edev = pdn_to_eeh_dev(pdn); + + if (!edev) + return -EEXIST; + + if (rtas_token("ibm,open-sriov-allow-unfreeze") + == RTAS_UNKNOWN_SERVICE) + return -EINVAL; + + if (edev->pdev->is_physfn || edev->pdev->is_virtfn) + return pseries_call_allow_unfreeze(edev); + + return 0; +} +#endif + +static struct eeh_ops pseries_eeh_ops = { + .name = "pseries", + .init = pseries_eeh_init, + .probe = pseries_eeh_probe, + .set_option = pseries_eeh_set_option, + .get_pe_addr = pseries_eeh_get_pe_addr, + .get_state = pseries_eeh_get_state, + .reset = pseries_eeh_reset, + .wait_state = pseries_eeh_wait_state, + .get_log = pseries_eeh_get_log, + .configure_bridge = pseries_eeh_configure_bridge, + .err_inject = NULL, + .read_config = pseries_eeh_read_config, + .write_config = pseries_eeh_write_config, + .next_error = NULL, + .restore_config = pseries_eeh_restore_config, +#ifdef CONFIG_PCI_IOV + .notify_resume = pseries_notify_resume +#endif +}; + +/** + * eeh_pseries_init - Register platform dependent EEH operations + * + * EEH initialization on pseries platform. This function should be + * called before any EEH related functions. + */ +static int __init eeh_pseries_init(void) +{ + int ret; + + ret = eeh_ops_register(&pseries_eeh_ops); + if (!ret) + pr_info("EEH: pSeries platform initialized\n"); + else + pr_info("EEH: pSeries platform initialization failure (%d)\n", + ret); + + return ret; +} +machine_early_initcall(pseries, eeh_pseries_init); diff --git a/arch/powerpc/platforms/pseries/event_sources.c b/arch/powerpc/platforms/pseries/event_sources.c new file mode 100644 index 000000000..6eeb0d4ba --- /dev/null +++ b/arch/powerpc/platforms/pseries/event_sources.c @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2001 Dave Engebretsen IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <asm/prom.h> + +#include "pseries.h" + +void request_event_sources_irqs(struct device_node *np, + irq_handler_t handler, + const char *name) +{ + int i, index, count = 0; + struct of_phandle_args oirq; + unsigned int virqs[16]; + + /* First try to do a proper OF tree parsing */ + for (index = 0; of_irq_parse_one(np, index, &oirq) == 0; + index++) { + if (count > 15) + break; + virqs[count] = irq_create_of_mapping(&oirq); + if (!virqs[count]) { + pr_err("event-sources: Unable to allocate " + "interrupt number for %pOF\n", + np); + WARN_ON(1); + } else { + count++; + } + } + + /* Now request them */ + for (i = 0; i < count; i++) { + if (request_irq(virqs[i], handler, 0, name, NULL)) { + pr_err("event-sources: Unable to request interrupt " + "%d for %pOF\n", virqs[i], np); + WARN_ON(1); + return; + } + } +} + diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c new file mode 100644 index 000000000..a3bbeb436 --- /dev/null +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -0,0 +1,180 @@ +/* + * pSeries firmware setup code. + * + * Portions from arch/powerpc/platforms/pseries/setup.c: + * Copyright (C) 1995 Linus Torvalds + * Adapted from 'alpha' version by Gary Thomas + * Modified by Cort Dougan (cort@cs.nmt.edu) + * Modified by PPC64 Team, IBM Corp + * + * Portions from arch/powerpc/kernel/firmware.c + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + * Modifications for ppc64: + * Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com> + * Copyright (C) 2005 Stephen Rothwell, IBM Corporation + * + * Copyright 2006 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +#include <linux/of_fdt.h> +#include <asm/firmware.h> +#include <asm/prom.h> +#include <asm/udbg.h> + +#include "pseries.h" + +struct hypertas_fw_feature { + unsigned long val; + char * name; +}; + +/* + * The names in this table match names in rtas/ibm,hypertas-functions. If the + * entry ends in a '*', only upto the '*' is matched. Otherwise the entire + * string must match. 
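+ *
+ * For example, the "hcall-best-energy-1*" entry below matches any
+ * reported name that begins with "hcall-best-energy-1" (such as a
+ * hypothetical "hcall-best-energy-1.1").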
+ */ +static __initdata struct hypertas_fw_feature +hypertas_fw_features_table[] = { + {FW_FEATURE_PFT, "hcall-pft"}, + {FW_FEATURE_TCE, "hcall-tce"}, + {FW_FEATURE_SPRG0, "hcall-sprg0"}, + {FW_FEATURE_DABR, "hcall-dabr"}, + {FW_FEATURE_COPY, "hcall-copy"}, + {FW_FEATURE_ASR, "hcall-asr"}, + {FW_FEATURE_DEBUG, "hcall-debug"}, + {FW_FEATURE_PERF, "hcall-perf"}, + {FW_FEATURE_DUMP, "hcall-dump"}, + {FW_FEATURE_INTERRUPT, "hcall-interrupt"}, + {FW_FEATURE_MIGRATE, "hcall-migrate"}, + {FW_FEATURE_PERFMON, "hcall-perfmon"}, + {FW_FEATURE_CRQ, "hcall-crq"}, + {FW_FEATURE_VIO, "hcall-vio"}, + {FW_FEATURE_RDMA, "hcall-rdma"}, + {FW_FEATURE_LLAN, "hcall-lLAN"}, + {FW_FEATURE_BULK_REMOVE, "hcall-bulk"}, + {FW_FEATURE_XDABR, "hcall-xdabr"}, + {FW_FEATURE_MULTITCE, "hcall-multi-tce"}, + {FW_FEATURE_SPLPAR, "hcall-splpar"}, + {FW_FEATURE_VPHN, "hcall-vphn"}, + {FW_FEATURE_SET_MODE, "hcall-set-mode"}, + {FW_FEATURE_BEST_ENERGY, "hcall-best-energy-1*"}, + {FW_FEATURE_HPT_RESIZE, "hcall-hpt-resize"}, +}; + +/* Build up the firmware features bitmask using the contents of + * device-tree/ibm,hypertas-functions. Ultimately this functionality may + * be moved into prom.c prom_init(). + */ +static void __init fw_hypertas_feature_init(const char *hypertas, + unsigned long len) +{ + const char *s; + int i; + + pr_debug(" -> fw_hypertas_feature_init()\n"); + + for (s = hypertas; s < hypertas + len; s += strlen(s) + 1) { + for (i = 0; i < ARRAY_SIZE(hypertas_fw_features_table); i++) { + const char *name = hypertas_fw_features_table[i].name; + size_t size; + + /* + * If there is a '*' at the end of name, only check + * upto there + */ + size = strlen(name); + if (size && name[size - 1] == '*') { + if (strncmp(name, s, size - 1)) + continue; + } else if (strcmp(name, s)) + continue; + + /* we have a match */ + powerpc_firmware_features |= + hypertas_fw_features_table[i].val; + break; + } + } + + pr_debug(" <- fw_hypertas_feature_init()\n"); +} + +struct vec5_fw_feature { + unsigned long val; + unsigned int feature; +}; + +static __initdata struct vec5_fw_feature +vec5_fw_features_table[] = { + {FW_FEATURE_TYPE1_AFFINITY, OV5_TYPE1_AFFINITY}, + {FW_FEATURE_PRRN, OV5_PRRN}, + {FW_FEATURE_DRMEM_V2, OV5_DRMEM_V2}, + {FW_FEATURE_DRC_INFO, OV5_DRC_INFO}, +}; + +static void __init fw_vec5_feature_init(const char *vec5, unsigned long len) +{ + unsigned int index, feat; + int i; + + pr_debug(" -> fw_vec5_feature_init()\n"); + + for (i = 0; i < ARRAY_SIZE(vec5_fw_features_table); i++) { + index = OV5_INDX(vec5_fw_features_table[i].feature); + feat = OV5_FEAT(vec5_fw_features_table[i].feature); + + if (index < len && (vec5[index] & feat)) + powerpc_firmware_features |= + vec5_fw_features_table[i].val; + } + + pr_debug(" <- fw_vec5_feature_init()\n"); +} + +/* + * Called very early, MMU is off, device-tree isn't unflattened + */ +static int __init probe_fw_features(unsigned long node, const char *uname, int + depth, void *data) +{ + const char *prop; + int len; + static int hypertas_found; + static int vec5_found; + + if (depth != 1) + return 0; + + if (!strcmp(uname, "rtas") || !strcmp(uname, "rtas@0")) { + prop = of_get_flat_dt_prop(node, "ibm,hypertas-functions", + &len); + if (prop) { + powerpc_firmware_features |= FW_FEATURE_LPAR; + fw_hypertas_feature_init(prop, len); + } + + hypertas_found = 1; + } + + if (!strcmp(uname, "chosen")) { + prop = of_get_flat_dt_prop(node, "ibm,architecture-vec-5", + &len); + if (prop) + fw_vec5_feature_init(prop, len); + + vec5_found = 1; + } + + return hypertas_found && vec5_found; +} + 
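+/*
+ * Illustrative device tree fragment (example values only, not taken
+ * from any particular machine) showing the two nodes the scan above
+ * consumes:
+ *
+ *	rtas {
+ *		ibm,hypertas-functions = "hcall-pft", "hcall-tce",
+ *					 "hcall-splpar";
+ *	};
+ *	chosen {
+ *		ibm,architecture-vec-5 = [ ... ];
+ *	};
+ *
+ * Each matched hypertas string sets the corresponding FW_FEATURE_*
+ * bit in powerpc_firmware_features.
+ */
+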
+void __init pseries_probe_fw_features(void) +{ + of_scan_flat_dt(probe_fw_features, NULL); +} diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c new file mode 100644 index 000000000..8bfb97d07 --- /dev/null +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -0,0 +1,975 @@ +/* + * pseries CPU Hotplug infrastructure. + * + * Split out from arch/powerpc/platforms/pseries/setup.c + * arch/powerpc/kernel/rtas.c, and arch/powerpc/platforms/pseries/smp.c + * + * Peter Bergner, IBM March 2001. + * Copyright (C) 2001 IBM. + * Dave Engebretsen, Peter Bergner, and + * Mike Corrigan {engebret|bergner|mikec}@us.ibm.com + * Plus various changes from other IBM teams... + * + * Copyright (C) 2006 Michael Ellerman, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "pseries-hotplug-cpu: " fmt + +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/sched.h> /* for idle_task_exit */ +#include <linux/sched/hotplug.h> +#include <linux/cpu.h> +#include <linux/of.h> +#include <linux/slab.h> +#include <asm/prom.h> +#include <asm/rtas.h> +#include <asm/firmware.h> +#include <asm/machdep.h> +#include <asm/vdso_datapage.h> +#include <asm/xics.h> +#include <asm/xive.h> +#include <asm/plpar_wrappers.h> +#include <asm/topology.h> + +#include "pseries.h" +#include "offline_states.h" + +/* This version can't take the spinlock, because it never returns */ +static int rtas_stop_self_token = RTAS_UNKNOWN_SERVICE; + +static DEFINE_PER_CPU(enum cpu_state_vals, preferred_offline_state) = + CPU_STATE_OFFLINE; +static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE; + +static enum cpu_state_vals default_offline_state = CPU_STATE_OFFLINE; + +static bool cede_offline_enabled __read_mostly = true; + +/* + * Enable/disable cede_offline when available. 
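+ * For example, booting with "cede_offline=off" on the kernel command
+ * line disables it; the kstrtobool() parser below also accepts the
+ * 0/1, y/n and on/off forms.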
+ */ +static int __init setup_cede_offline(char *str) +{ + return (kstrtobool(str, &cede_offline_enabled) == 0); +} + +__setup("cede_offline=", setup_cede_offline); + +enum cpu_state_vals get_cpu_current_state(int cpu) +{ + return per_cpu(current_state, cpu); +} + +void set_cpu_current_state(int cpu, enum cpu_state_vals state) +{ + per_cpu(current_state, cpu) = state; +} + +enum cpu_state_vals get_preferred_offline_state(int cpu) +{ + return per_cpu(preferred_offline_state, cpu); +} + +void set_preferred_offline_state(int cpu, enum cpu_state_vals state) +{ + per_cpu(preferred_offline_state, cpu) = state; +} + +void set_default_offline_state(int cpu) +{ + per_cpu(preferred_offline_state, cpu) = default_offline_state; +} + +static void rtas_stop_self(void) +{ + static struct rtas_args args; + + local_irq_disable(); + + BUG_ON(rtas_stop_self_token == RTAS_UNKNOWN_SERVICE); + + rtas_call_unlocked(&args, rtas_stop_self_token, 0, 1, NULL); + + panic("Alas, I survived.\n"); +} + +static void pseries_mach_cpu_die(void) +{ + unsigned int cpu = smp_processor_id(); + unsigned int hwcpu = hard_smp_processor_id(); + u8 cede_latency_hint = 0; + + local_irq_disable(); + idle_task_exit(); + if (xive_enabled()) + xive_teardown_cpu(); + else + xics_teardown_cpu(); + + if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { + set_cpu_current_state(cpu, CPU_STATE_INACTIVE); + if (ppc_md.suspend_disable_cpu) + ppc_md.suspend_disable_cpu(); + + cede_latency_hint = 2; + + get_lppaca()->idle = 1; + if (!lppaca_shared_proc(get_lppaca())) + get_lppaca()->donate_dedicated_cpu = 1; + + while (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { + while (!prep_irq_for_idle()) { + local_irq_enable(); + local_irq_disable(); + } + + extended_cede_processor(cede_latency_hint); + } + + local_irq_disable(); + + if (!lppaca_shared_proc(get_lppaca())) + get_lppaca()->donate_dedicated_cpu = 0; + get_lppaca()->idle = 0; + + if (get_preferred_offline_state(cpu) == CPU_STATE_ONLINE) { + unregister_slb_shadow(hwcpu); + + hard_irq_disable(); + /* + * Call to start_secondary_resume() will not return. + * Kernel stack will be reset and start_secondary() + * will be called to continue the online operation. + */ + start_secondary_resume(); + } + } + + /* Requested state is CPU_STATE_OFFLINE at this point */ + WARN_ON(get_preferred_offline_state(cpu) != CPU_STATE_OFFLINE); + + set_cpu_current_state(cpu, CPU_STATE_OFFLINE); + unregister_slb_shadow(hwcpu); + rtas_stop_self(); + + /* Should never get here... */ + BUG(); + for(;;); +} + +static int pseries_cpu_disable(void) +{ + int cpu = smp_processor_id(); + + set_cpu_online(cpu, false); + vdso_data->processorCount--; + + /*fix boot_cpuid here*/ + if (cpu == boot_cpuid) + boot_cpuid = cpumask_any(cpu_online_mask); + + /* FIXME: abstract this to not be platform specific later on */ + if (xive_enabled()) + xive_smp_disable_cpu(); + else + xics_migrate_irqs_away(); + return 0; +} + +/* + * pseries_cpu_die: Wait for the cpu to die. + * @cpu: logical processor id of the CPU whose death we're awaiting. + * + * This function is called from the context of the thread which is performing + * the cpu-offline. Here we wait for long enough to allow the cpu in question + * to self-destroy so that the cpu-offline thread can send the CPU_DEAD + * notifications. + * + * OTOH, pseries_mach_cpu_die() is called by the @cpu when it wants to + * self-destruct. 
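+ *
+ * Concretely, the wait below either polls the per-cpu state (for a
+ * CPU_STATE_INACTIVE target) or the "query-cpu-stopped-state" RTAS
+ * call (for CPU_STATE_OFFLINE) until the CPU is gone.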
+ */ +static void pseries_cpu_die(unsigned int cpu) +{ + int tries; + int cpu_status = 1; + unsigned int pcpu = get_hard_smp_processor_id(cpu); + + if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { + cpu_status = 1; + for (tries = 0; tries < 5000; tries++) { + if (get_cpu_current_state(cpu) == CPU_STATE_INACTIVE) { + cpu_status = 0; + break; + } + msleep(1); + } + } else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) { + + for (tries = 0; tries < 25; tries++) { + cpu_status = smp_query_cpu_stopped(pcpu); + if (cpu_status == QCSS_STOPPED || + cpu_status == QCSS_HARDWARE_ERROR) + break; + cpu_relax(); + } + } + + if (cpu_status != 0) { + printk("Querying DEAD? cpu %i (%i) shows %i\n", + cpu, pcpu, cpu_status); + } + + /* Isolation and deallocation are definitely done by + * drslot_chrp_cpu. If they were not they would be + * done here. Change isolate state to Isolate and + * change allocation-state to Unusable. + */ + paca_ptrs[cpu]->cpu_start = 0; +} + +/* + * Update cpu_present_mask and paca(s) for a new cpu node. The wrinkle + * here is that a cpu device node may represent up to two logical cpus + * in the SMT case. We must honor the assumption in other code that + * the logical ids for sibling SMT threads x and y are adjacent, such + * that x^1 == y and y^1 == x. + */ +static int pseries_add_processor(struct device_node *np) +{ + unsigned int cpu; + cpumask_var_t candidate_mask, tmp; + int err = -ENOSPC, len, nthreads, i; + const __be32 *intserv; + + intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len); + if (!intserv) + return 0; + + zalloc_cpumask_var(&candidate_mask, GFP_KERNEL); + zalloc_cpumask_var(&tmp, GFP_KERNEL); + + nthreads = len / sizeof(u32); + for (i = 0; i < nthreads; i++) + cpumask_set_cpu(i, tmp); + + cpu_maps_update_begin(); + + BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask)); + + /* Get a bitmap of unoccupied slots. */ + cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask); + if (cpumask_empty(candidate_mask)) { + /* If we get here, it most likely means that NR_CPUS is + * less than the partition's max processors setting. + */ + printk(KERN_ERR "Cannot add cpu %pOF; this system configuration" + " supports %d logical cpus.\n", np, + num_possible_cpus()); + goto out_unlock; + } + + while (!cpumask_empty(tmp)) + if (cpumask_subset(tmp, candidate_mask)) + /* Found a range where we can insert the new cpu(s) */ + break; + else + cpumask_shift_left(tmp, tmp, nthreads); + + if (cpumask_empty(tmp)) { + printk(KERN_ERR "Unable to find space in cpu_present_mask for" + " processor %s with %d thread(s)\n", np->name, + nthreads); + goto out_unlock; + } + + for_each_cpu(cpu, tmp) { + BUG_ON(cpu_present(cpu)); + set_cpu_present(cpu, true); + set_hard_smp_processor_id(cpu, be32_to_cpu(*intserv++)); + } + err = 0; +out_unlock: + cpu_maps_update_done(); + free_cpumask_var(candidate_mask); + free_cpumask_var(tmp); + return err; +} + +/* + * Update the present map for a cpu node which is going away, and set + * the hard id in the paca(s) to -1 to be consistent with boot time + * convention for non-present cpus. 
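+ *
+ * As in pseries_add_processor(), the hardware thread ids come from
+ * the node's "ibm,ppc-interrupt-server#s" property; an SMT4 core
+ * might carry, for example, <0x20 0x21 0x22 0x23> (example values
+ * only).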
+ */ +static void pseries_remove_processor(struct device_node *np) +{ + unsigned int cpu; + int len, nthreads, i; + const __be32 *intserv; + u32 thread; + + intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len); + if (!intserv) + return; + + nthreads = len / sizeof(u32); + + cpu_maps_update_begin(); + for (i = 0; i < nthreads; i++) { + thread = be32_to_cpu(intserv[i]); + for_each_present_cpu(cpu) { + if (get_hard_smp_processor_id(cpu) != thread) + continue; + BUG_ON(cpu_online(cpu)); + set_cpu_present(cpu, false); + set_hard_smp_processor_id(cpu, -1); + update_numa_cpu_lookup_table(cpu, -1); + break; + } + if (cpu >= nr_cpu_ids) + printk(KERN_WARNING "Could not find cpu to remove " + "with physical id 0x%x\n", thread); + } + cpu_maps_update_done(); +} + +static int dlpar_online_cpu(struct device_node *dn) +{ + int rc = 0; + unsigned int cpu; + int len, nthreads, i; + const __be32 *intserv; + u32 thread; + + intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len); + if (!intserv) + return -EINVAL; + + nthreads = len / sizeof(u32); + + cpu_maps_update_begin(); + for (i = 0; i < nthreads; i++) { + thread = be32_to_cpu(intserv[i]); + for_each_present_cpu(cpu) { + if (get_hard_smp_processor_id(cpu) != thread) + continue; + BUG_ON(get_cpu_current_state(cpu) + != CPU_STATE_OFFLINE); + cpu_maps_update_done(); + timed_topology_update(1); + find_and_online_cpu_nid(cpu); + rc = device_online(get_cpu_device(cpu)); + if (rc) + goto out; + cpu_maps_update_begin(); + + break; + } + if (cpu == num_possible_cpus()) + printk(KERN_WARNING "Could not find cpu to online " + "with physical id 0x%x\n", thread); + } + cpu_maps_update_done(); + +out: + return rc; + +} + +static bool dlpar_cpu_exists(struct device_node *parent, u32 drc_index) +{ + struct device_node *child = NULL; + u32 my_drc_index; + bool found; + int rc; + + /* Assume cpu doesn't exist */ + found = false; + + for_each_child_of_node(parent, child) { + rc = of_property_read_u32(child, "ibm,my-drc-index", + &my_drc_index); + if (rc) + continue; + + if (my_drc_index == drc_index) { + of_node_put(child); + found = true; + break; + } + } + + return found; +} + +static bool valid_cpu_drc_index(struct device_node *parent, u32 drc_index) +{ + bool found = false; + int rc, index; + + index = 0; + while (!found) { + u32 drc; + + rc = of_property_read_u32_index(parent, "ibm,drc-indexes", + index++, &drc); + if (rc) + break; + + if (drc == drc_index) + found = true; + } + + return found; +} + +static ssize_t dlpar_cpu_add(u32 drc_index) +{ + struct device_node *dn, *parent; + int rc, saved_rc; + + pr_debug("Attempting to add CPU, drc index: %x\n", drc_index); + + parent = of_find_node_by_path("/cpus"); + if (!parent) { + pr_warn("Failed to find CPU root node \"/cpus\"\n"); + return -ENODEV; + } + + if (dlpar_cpu_exists(parent, drc_index)) { + of_node_put(parent); + pr_warn("CPU with drc index %x already exists\n", drc_index); + return -EINVAL; + } + + if (!valid_cpu_drc_index(parent, drc_index)) { + of_node_put(parent); + pr_warn("Cannot find CPU (drc index %x) to add.\n", drc_index); + return -EINVAL; + } + + rc = dlpar_acquire_drc(drc_index); + if (rc) { + pr_warn("Failed to acquire DRC, rc: %d, drc index: %x\n", + rc, drc_index); + of_node_put(parent); + return -EINVAL; + } + + dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent); + if (!dn) { + pr_warn("Failed call to configure-connector, drc index: %x\n", + drc_index); + dlpar_release_drc(drc_index); + of_node_put(parent); + return -EINVAL; + } + + rc = 
dlpar_attach_node(dn, parent); + + /* Regardless we are done with parent now */ + of_node_put(parent); + + if (rc) { + saved_rc = rc; + pr_warn("Failed to attach node %s, rc: %d, drc index: %x\n", + dn->name, rc, drc_index); + + rc = dlpar_release_drc(drc_index); + if (!rc) + dlpar_free_cc_nodes(dn); + + return saved_rc; + } + + rc = dlpar_online_cpu(dn); + if (rc) { + saved_rc = rc; + pr_warn("Failed to online cpu %s, rc: %d, drc index: %x\n", + dn->name, rc, drc_index); + + rc = dlpar_detach_node(dn); + if (!rc) + dlpar_release_drc(drc_index); + + return saved_rc; + } + + pr_debug("Successfully added CPU %s, drc index: %x\n", dn->name, + drc_index); + return rc; +} + +static int dlpar_offline_cpu(struct device_node *dn) +{ + int rc = 0; + unsigned int cpu; + int len, nthreads, i; + const __be32 *intserv; + u32 thread; + + intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len); + if (!intserv) + return -EINVAL; + + nthreads = len / sizeof(u32); + + cpu_maps_update_begin(); + for (i = 0; i < nthreads; i++) { + thread = be32_to_cpu(intserv[i]); + for_each_present_cpu(cpu) { + if (get_hard_smp_processor_id(cpu) != thread) + continue; + + if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE) + break; + + if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) { + set_preferred_offline_state(cpu, + CPU_STATE_OFFLINE); + cpu_maps_update_done(); + timed_topology_update(1); + rc = device_offline(get_cpu_device(cpu)); + if (rc) + goto out; + cpu_maps_update_begin(); + break; + + } + + /* + * The cpu is in CPU_STATE_INACTIVE. + * Upgrade it's state to CPU_STATE_OFFLINE. + */ + set_preferred_offline_state(cpu, CPU_STATE_OFFLINE); + BUG_ON(plpar_hcall_norets(H_PROD, thread) + != H_SUCCESS); + __cpu_die(cpu); + break; + } + if (cpu == num_possible_cpus()) + printk(KERN_WARNING "Could not find cpu to offline with physical id 0x%x\n", thread); + } + cpu_maps_update_done(); + +out: + return rc; + +} + +static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index) +{ + int rc; + + pr_debug("Attempting to remove CPU %s, drc index: %x\n", + dn->name, drc_index); + + rc = dlpar_offline_cpu(dn); + if (rc) { + pr_warn("Failed to offline CPU %s, rc: %d\n", dn->name, rc); + return -EINVAL; + } + + rc = dlpar_release_drc(drc_index); + if (rc) { + pr_warn("Failed to release drc (%x) for CPU %s, rc: %d\n", + drc_index, dn->name, rc); + dlpar_online_cpu(dn); + return rc; + } + + rc = dlpar_detach_node(dn); + if (rc) { + int saved_rc = rc; + + pr_warn("Failed to detach CPU %s, rc: %d", dn->name, rc); + + rc = dlpar_acquire_drc(drc_index); + if (!rc) + dlpar_online_cpu(dn); + + return saved_rc; + } + + pr_debug("Successfully removed CPU, drc index: %x\n", drc_index); + return 0; +} + +static struct device_node *cpu_drc_index_to_dn(u32 drc_index) +{ + struct device_node *dn; + u32 my_index; + int rc; + + for_each_node_by_type(dn, "cpu") { + rc = of_property_read_u32(dn, "ibm,my-drc-index", &my_index); + if (rc) + continue; + + if (my_index == drc_index) + break; + } + + return dn; +} + +static int dlpar_cpu_remove_by_index(u32 drc_index) +{ + struct device_node *dn; + int rc; + + dn = cpu_drc_index_to_dn(drc_index); + if (!dn) { + pr_warn("Cannot find CPU (drc index %x) to remove\n", + drc_index); + return -ENODEV; + } + + rc = dlpar_cpu_remove(dn, drc_index); + of_node_put(dn); + return rc; +} + +static int find_dlpar_cpus_to_remove(u32 *cpu_drcs, int cpus_to_remove) +{ + struct device_node *dn; + int cpus_found = 0; + int rc; + + /* We want to find cpus_to_remove + 1 CPUs to ensure we do not + * remove 
the last CPU. + */ + for_each_node_by_type(dn, "cpu") { + cpus_found++; + + if (cpus_found > cpus_to_remove) { + of_node_put(dn); + break; + } + + /* Note that cpus_found is always 1 ahead of the index + * into the cpu_drcs array, so we use cpus_found - 1 + */ + rc = of_property_read_u32(dn, "ibm,my-drc-index", + &cpu_drcs[cpus_found - 1]); + if (rc) { + pr_warn("Error occurred getting drc-index for %s\n", + dn->name); + of_node_put(dn); + return -1; + } + } + + if (cpus_found < cpus_to_remove) { + pr_warn("Failed to find enough CPUs (%d of %d) to remove\n", + cpus_found, cpus_to_remove); + } else if (cpus_found == cpus_to_remove) { + pr_warn("Cannot remove all CPUs\n"); + } + + return cpus_found; +} + +static int dlpar_cpu_remove_by_count(u32 cpus_to_remove) +{ + u32 *cpu_drcs; + int cpus_found; + int cpus_removed = 0; + int i, rc; + + pr_debug("Attempting to hot-remove %d CPUs\n", cpus_to_remove); + + cpu_drcs = kcalloc(cpus_to_remove, sizeof(*cpu_drcs), GFP_KERNEL); + if (!cpu_drcs) + return -EINVAL; + + cpus_found = find_dlpar_cpus_to_remove(cpu_drcs, cpus_to_remove); + if (cpus_found <= cpus_to_remove) { + kfree(cpu_drcs); + return -EINVAL; + } + + for (i = 0; i < cpus_to_remove; i++) { + rc = dlpar_cpu_remove_by_index(cpu_drcs[i]); + if (rc) + break; + + cpus_removed++; + } + + if (cpus_removed != cpus_to_remove) { + pr_warn("CPU hot-remove failed, adding back removed CPUs\n"); + + for (i = 0; i < cpus_removed; i++) + dlpar_cpu_add(cpu_drcs[i]); + + rc = -EINVAL; + } else { + rc = 0; + } + + kfree(cpu_drcs); + return rc; +} + +static int find_dlpar_cpus_to_add(u32 *cpu_drcs, u32 cpus_to_add) +{ + struct device_node *parent; + int cpus_found = 0; + int index, rc; + + parent = of_find_node_by_path("/cpus"); + if (!parent) { + pr_warn("Could not find CPU root node in device tree\n"); + kfree(cpu_drcs); + return -1; + } + + /* Search the ibm,drc-indexes array for possible CPU drcs to + * add. Note that the format of the ibm,drc-indexes array is + * the number of entries in the array followed by the array + * of drc values so we start looking at index = 1. 
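+ *
+ * For example (illustrative values only), ibm,drc-indexes =
+ * <3 0x10000001 0x10000002 0x10000003> describes three candidate
+ * drc indexes, read here starting at array index 1.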
+ */ + index = 1; + while (cpus_found < cpus_to_add) { + u32 drc; + + rc = of_property_read_u32_index(parent, "ibm,drc-indexes", + index++, &drc); + if (rc) + break; + + if (dlpar_cpu_exists(parent, drc)) + continue; + + cpu_drcs[cpus_found++] = drc; + } + + of_node_put(parent); + return cpus_found; +} + +static int dlpar_cpu_add_by_count(u32 cpus_to_add) +{ + u32 *cpu_drcs; + int cpus_added = 0; + int cpus_found; + int i, rc; + + pr_debug("Attempting to hot-add %d CPUs\n", cpus_to_add); + + cpu_drcs = kcalloc(cpus_to_add, sizeof(*cpu_drcs), GFP_KERNEL); + if (!cpu_drcs) + return -EINVAL; + + cpus_found = find_dlpar_cpus_to_add(cpu_drcs, cpus_to_add); + if (cpus_found < cpus_to_add) { + pr_warn("Failed to find enough CPUs (%d of %d) to add\n", + cpus_found, cpus_to_add); + kfree(cpu_drcs); + return -EINVAL; + } + + for (i = 0; i < cpus_to_add; i++) { + rc = dlpar_cpu_add(cpu_drcs[i]); + if (rc) + break; + + cpus_added++; + } + + if (cpus_added < cpus_to_add) { + pr_warn("CPU hot-add failed, removing any added CPUs\n"); + + for (i = 0; i < cpus_added; i++) + dlpar_cpu_remove_by_index(cpu_drcs[i]); + + rc = -EINVAL; + } else { + rc = 0; + } + + kfree(cpu_drcs); + return rc; +} + +int dlpar_cpu_readd(int cpu) +{ + struct device_node *dn; + struct device *dev; + u32 drc_index; + int rc; + + dev = get_cpu_device(cpu); + dn = dev->of_node; + + rc = of_property_read_u32(dn, "ibm,my-drc-index", &drc_index); + + rc = dlpar_cpu_remove_by_index(drc_index); + if (!rc) + rc = dlpar_cpu_add(drc_index); + + return rc; +} + +int dlpar_cpu(struct pseries_hp_errorlog *hp_elog) +{ + u32 count, drc_index; + int rc; + + count = hp_elog->_drc_u.drc_count; + drc_index = hp_elog->_drc_u.drc_index; + + lock_device_hotplug(); + + switch (hp_elog->action) { + case PSERIES_HP_ELOG_ACTION_REMOVE: + if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) + rc = dlpar_cpu_remove_by_count(count); + else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) + rc = dlpar_cpu_remove_by_index(drc_index); + else + rc = -EINVAL; + break; + case PSERIES_HP_ELOG_ACTION_ADD: + if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) + rc = dlpar_cpu_add_by_count(count); + else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) + rc = dlpar_cpu_add(drc_index); + else + rc = -EINVAL; + break; + default: + pr_err("Invalid action (%d) specified\n", hp_elog->action); + rc = -EINVAL; + break; + } + + unlock_device_hotplug(); + return rc; +} + +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE + +static ssize_t dlpar_cpu_probe(const char *buf, size_t count) +{ + u32 drc_index; + int rc; + + rc = kstrtou32(buf, 0, &drc_index); + if (rc) + return -EINVAL; + + rc = dlpar_cpu_add(drc_index); + + return rc ? rc : count; +} + +static ssize_t dlpar_cpu_release(const char *buf, size_t count) +{ + struct device_node *dn; + u32 drc_index; + int rc; + + dn = of_find_node_by_path(buf); + if (!dn) + return -EINVAL; + + rc = of_property_read_u32(dn, "ibm,my-drc-index", &drc_index); + if (rc) { + of_node_put(dn); + return -EINVAL; + } + + rc = dlpar_cpu_remove(dn, drc_index); + of_node_put(dn); + + return rc ? 
rc : count; +} + +#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ + +static int pseries_smp_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct of_reconfig_data *rd = data; + int err = 0; + + switch (action) { + case OF_RECONFIG_ATTACH_NODE: + err = pseries_add_processor(rd->dn); + break; + case OF_RECONFIG_DETACH_NODE: + pseries_remove_processor(rd->dn); + break; + } + return notifier_from_errno(err); +} + +static struct notifier_block pseries_smp_nb = { + .notifier_call = pseries_smp_notifier, +}; + +#define MAX_CEDE_LATENCY_LEVELS 4 +#define CEDE_LATENCY_PARAM_LENGTH 10 +#define CEDE_LATENCY_PARAM_MAX_LENGTH \ + (MAX_CEDE_LATENCY_LEVELS * CEDE_LATENCY_PARAM_LENGTH * sizeof(char)) +#define CEDE_LATENCY_TOKEN 45 + +static char cede_parameters[CEDE_LATENCY_PARAM_MAX_LENGTH]; + +static int parse_cede_parameters(void) +{ + memset(cede_parameters, 0, CEDE_LATENCY_PARAM_MAX_LENGTH); + return rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, + NULL, + CEDE_LATENCY_TOKEN, + __pa(cede_parameters), + CEDE_LATENCY_PARAM_MAX_LENGTH); +} + +static int __init pseries_cpu_hotplug_init(void) +{ + int cpu; + int qcss_tok; + +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE + ppc_md.cpu_probe = dlpar_cpu_probe; + ppc_md.cpu_release = dlpar_cpu_release; +#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ + + rtas_stop_self_token = rtas_token("stop-self"); + qcss_tok = rtas_token("query-cpu-stopped-state"); + + if (rtas_stop_self_token == RTAS_UNKNOWN_SERVICE || + qcss_tok == RTAS_UNKNOWN_SERVICE) { + printk(KERN_INFO "CPU Hotplug not supported by firmware " + "- disabling.\n"); + return 0; + } + + ppc_md.cpu_die = pseries_mach_cpu_die; + smp_ops->cpu_disable = pseries_cpu_disable; + smp_ops->cpu_die = pseries_cpu_die; + + /* Processors can be added/removed only on LPAR */ + if (firmware_has_feature(FW_FEATURE_LPAR)) { + of_reconfig_notifier_register(&pseries_smp_nb); + cpu_maps_update_begin(); + if (cede_offline_enabled && parse_cede_parameters() == 0) { + default_offline_state = CPU_STATE_INACTIVE; + for_each_online_cpu(cpu) + set_default_offline_state(cpu); + } + cpu_maps_update_done(); + } + + return 0; +} +machine_arch_initcall(pseries, pseries_cpu_hotplug_init); diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c new file mode 100644 index 000000000..afabe6918 --- /dev/null +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -0,0 +1,1061 @@ +/* + * pseries Memory Hotplug infrastructure. + * + * Copyright (C) 2008 Badari Pulavarty, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#define pr_fmt(fmt) "pseries-hotplug-mem: " fmt + +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/memblock.h> +#include <linux/memory.h> +#include <linux/memory_hotplug.h> +#include <linux/slab.h> + +#include <asm/firmware.h> +#include <asm/machdep.h> +#include <asm/prom.h> +#include <asm/sparsemem.h> +#include <asm/fadump.h> +#include <asm/drmem.h> +#include "pseries.h" + +static bool rtas_hp_event; + +unsigned long pseries_memory_block_size(void) +{ + struct device_node *np; + u64 memblock_size = MIN_MEMORY_BLOCK_SIZE; + struct resource r; + + np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (np) { + const __be64 *size; + + size = of_get_property(np, "ibm,lmb-size", NULL); + if (size) + memblock_size = be64_to_cpup(size); + of_node_put(np); + } else if (machine_is(pseries)) { + /* This fallback really only applies to pseries */ + unsigned int memzero_size = 0; + + np = of_find_node_by_path("/memory@0"); + if (np) { + if (!of_address_to_resource(np, 0, &r)) + memzero_size = resource_size(&r); + of_node_put(np); + } + + if (memzero_size) { + /* We now know the size of memory@0, use this to find + * the first memoryblock and get its size. + */ + char buf[64]; + + sprintf(buf, "/memory@%x", memzero_size); + np = of_find_node_by_path(buf); + if (np) { + if (!of_address_to_resource(np, 0, &r)) + memblock_size = resource_size(&r); + of_node_put(np); + } + } + } + return memblock_size; +} + +static void dlpar_free_property(struct property *prop) +{ + kfree(prop->name); + kfree(prop->value); + kfree(prop); +} + +static struct property *dlpar_clone_property(struct property *prop, + u32 prop_size) +{ + struct property *new_prop; + + new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL); + if (!new_prop) + return NULL; + + new_prop->name = kstrdup(prop->name, GFP_KERNEL); + new_prop->value = kzalloc(prop_size, GFP_KERNEL); + if (!new_prop->name || !new_prop->value) { + dlpar_free_property(new_prop); + return NULL; + } + + memcpy(new_prop->value, prop->value, prop->length); + new_prop->length = prop_size; + + of_property_set_flag(new_prop, OF_DYNAMIC); + return new_prop; +} + +static bool find_aa_index(struct device_node *dr_node, + struct property *ala_prop, + const u32 *lmb_assoc, u32 *aa_index) +{ + u32 *assoc_arrays, new_prop_size; + struct property *new_prop; + int aa_arrays, aa_array_entries, aa_array_sz; + int i, index; + + /* + * The ibm,associativity-lookup-arrays property is defined to be + * a 32-bit value specifying the number of associativity arrays + * followed by a 32-bitvalue specifying the number of entries per + * array, followed by the associativity arrays. 
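+	 *
+	 * For example (illustrative), <2 4 a0 a1 a2 a3 b0 b1 b2 b3>
+	 * encodes two arrays of four entries each; array i begins at
+	 * cell i * 4 + 2, matching the indexing below.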
+ */ + assoc_arrays = ala_prop->value; + + aa_arrays = be32_to_cpu(assoc_arrays[0]); + aa_array_entries = be32_to_cpu(assoc_arrays[1]); + aa_array_sz = aa_array_entries * sizeof(u32); + + for (i = 0; i < aa_arrays; i++) { + index = (i * aa_array_entries) + 2; + + if (memcmp(&assoc_arrays[index], &lmb_assoc[1], aa_array_sz)) + continue; + + *aa_index = i; + return true; + } + + new_prop_size = ala_prop->length + aa_array_sz; + new_prop = dlpar_clone_property(ala_prop, new_prop_size); + if (!new_prop) + return false; + + assoc_arrays = new_prop->value; + + /* increment the number of entries in the lookup array */ + assoc_arrays[0] = cpu_to_be32(aa_arrays + 1); + + /* copy the new associativity into the lookup array */ + index = aa_arrays * aa_array_entries + 2; + memcpy(&assoc_arrays[index], &lmb_assoc[1], aa_array_sz); + + of_update_property(dr_node, new_prop); + + /* + * The associativity lookup array index for this lmb is + * number of entries - 1 since we added its associativity + * to the end of the lookup array. + */ + *aa_index = be32_to_cpu(assoc_arrays[0]) - 1; + return true; +} + +static int update_lmb_associativity_index(struct drmem_lmb *lmb) +{ + struct device_node *parent, *lmb_node, *dr_node; + struct property *ala_prop; + const u32 *lmb_assoc; + u32 aa_index; + bool found; + + parent = of_find_node_by_path("/"); + if (!parent) + return -ENODEV; + + lmb_node = dlpar_configure_connector(cpu_to_be32(lmb->drc_index), + parent); + of_node_put(parent); + if (!lmb_node) + return -EINVAL; + + lmb_assoc = of_get_property(lmb_node, "ibm,associativity", NULL); + if (!lmb_assoc) { + dlpar_free_cc_nodes(lmb_node); + return -ENODEV; + } + + dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (!dr_node) { + dlpar_free_cc_nodes(lmb_node); + return -ENODEV; + } + + ala_prop = of_find_property(dr_node, "ibm,associativity-lookup-arrays", + NULL); + if (!ala_prop) { + of_node_put(dr_node); + dlpar_free_cc_nodes(lmb_node); + return -ENODEV; + } + + found = find_aa_index(dr_node, ala_prop, lmb_assoc, &aa_index); + + of_node_put(dr_node); + dlpar_free_cc_nodes(lmb_node); + + if (!found) { + pr_err("Could not find LMB associativity\n"); + return -1; + } + + lmb->aa_index = aa_index; + return 0; +} + +static struct memory_block *lmb_to_memblock(struct drmem_lmb *lmb) +{ + unsigned long section_nr; + struct mem_section *mem_sect; + struct memory_block *mem_block; + + section_nr = pfn_to_section_nr(PFN_DOWN(lmb->base_addr)); + mem_sect = __nr_to_section(section_nr); + + mem_block = find_memory_block(mem_sect); + return mem_block; +} + +static int get_lmb_range(u32 drc_index, int n_lmbs, + struct drmem_lmb **start_lmb, + struct drmem_lmb **end_lmb) +{ + struct drmem_lmb *lmb, *start, *end; + struct drmem_lmb *limit; + + start = NULL; + for_each_drmem_lmb(lmb) { + if (lmb->drc_index == drc_index) { + start = lmb; + break; + } + } + + if (!start) + return -EINVAL; + + end = &start[n_lmbs]; + + limit = &drmem_info->lmbs[drmem_info->n_lmbs]; + if (end > limit) + return -EINVAL; + + *start_lmb = start; + *end_lmb = end; + return 0; +} + +static int dlpar_change_lmb_state(struct drmem_lmb *lmb, bool online) +{ + struct memory_block *mem_block; + int rc; + + mem_block = lmb_to_memblock(lmb); + if (!mem_block) + return -EINVAL; + + if (online && mem_block->dev.offline) + rc = device_online(&mem_block->dev); + else if (!online && !mem_block->dev.offline) + rc = device_offline(&mem_block->dev); + else + rc = 0; + + put_device(&mem_block->dev); + + return rc; +} + +static int 
dlpar_online_lmb(struct drmem_lmb *lmb) +{ + return dlpar_change_lmb_state(lmb, true); +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +static int dlpar_offline_lmb(struct drmem_lmb *lmb) +{ + return dlpar_change_lmb_state(lmb, false); +} + +static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size) +{ + unsigned long block_sz, start_pfn; + int sections_per_block; + int i, nid; + + start_pfn = base >> PAGE_SHIFT; + + lock_device_hotplug(); + + if (!pfn_valid(start_pfn)) + goto out; + + block_sz = pseries_memory_block_size(); + sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; + nid = memory_add_physaddr_to_nid(base); + + for (i = 0; i < sections_per_block; i++) { + __remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE); + base += MIN_MEMORY_BLOCK_SIZE; + } + +out: + /* Update memory regions for memory remove */ + memblock_remove(base, memblock_size); + unlock_device_hotplug(); + return 0; +} + +static int pseries_remove_mem_node(struct device_node *np) +{ + const char *type; + const __be32 *regs; + unsigned long base; + unsigned int lmb_size; + int ret = -EINVAL; + + /* + * Check to see if we are actually removing memory + */ + type = of_get_property(np, "device_type", NULL); + if (type == NULL || strcmp(type, "memory") != 0) + return 0; + + /* + * Find the base address and size of the memblock + */ + regs = of_get_property(np, "reg", NULL); + if (!regs) + return ret; + + base = be64_to_cpu(*(unsigned long *)regs); + lmb_size = be32_to_cpu(regs[3]); + + pseries_remove_memblock(base, lmb_size); + return 0; +} + +static bool lmb_is_removable(struct drmem_lmb *lmb) +{ + int i, scns_per_block; + int rc = 1; + unsigned long pfn, block_sz; + u64 phys_addr; + + if (!(lmb->flags & DRCONF_MEM_ASSIGNED)) + return false; + + block_sz = memory_block_size_bytes(); + scns_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; + phys_addr = lmb->base_addr; + +#ifdef CONFIG_FA_DUMP + /* + * Don't hot-remove memory that falls in fadump boot memory area + * and memory that is reserved for capturing old kernel memory. + */ + if (is_fadump_memory_area(phys_addr, block_sz)) + return false; +#endif + + for (i = 0; i < scns_per_block; i++) { + pfn = PFN_DOWN(phys_addr); + if (!pfn_present(pfn)) { + phys_addr += MIN_MEMORY_BLOCK_SIZE; + continue; + } + + rc &= is_mem_section_removable(pfn, PAGES_PER_SECTION); + phys_addr += MIN_MEMORY_BLOCK_SIZE; + } + + return rc ? 
true : false; +} + +static int dlpar_add_lmb(struct drmem_lmb *); + +static int dlpar_remove_lmb(struct drmem_lmb *lmb) +{ + unsigned long block_sz; + int nid, rc; + + if (!lmb_is_removable(lmb)) + return -EINVAL; + + rc = dlpar_offline_lmb(lmb); + if (rc) + return rc; + + block_sz = pseries_memory_block_size(); + nid = memory_add_physaddr_to_nid(lmb->base_addr); + + __remove_memory(nid, lmb->base_addr, block_sz); + + /* Update memory regions for memory remove */ + memblock_remove(lmb->base_addr, block_sz); + + invalidate_lmb_associativity_index(lmb); + lmb->flags &= ~DRCONF_MEM_ASSIGNED; + + return 0; +} + +static int dlpar_memory_remove_by_count(u32 lmbs_to_remove) +{ + struct drmem_lmb *lmb; + int lmbs_removed = 0; + int lmbs_available = 0; + int rc; + + pr_info("Attempting to hot-remove %d LMB(s)\n", lmbs_to_remove); + + if (lmbs_to_remove == 0) + return -EINVAL; + + /* Validate that there are enough LMBs to satisfy the request */ + for_each_drmem_lmb(lmb) { + if (lmb_is_removable(lmb)) + lmbs_available++; + + if (lmbs_available == lmbs_to_remove) + break; + } + + if (lmbs_available < lmbs_to_remove) { + pr_info("Not enough LMBs available (%d of %d) to satisfy request\n", + lmbs_available, lmbs_to_remove); + return -EINVAL; + } + + for_each_drmem_lmb(lmb) { + rc = dlpar_remove_lmb(lmb); + if (rc) + continue; + + /* Mark this lmb so we can add it later if all of the + * requested LMBs cannot be removed. + */ + drmem_mark_lmb_reserved(lmb); + + lmbs_removed++; + if (lmbs_removed == lmbs_to_remove) + break; + } + + if (lmbs_removed != lmbs_to_remove) { + pr_err("Memory hot-remove failed, adding LMB's back\n"); + + for_each_drmem_lmb(lmb) { + if (!drmem_lmb_reserved(lmb)) + continue; + + rc = dlpar_add_lmb(lmb); + if (rc) + pr_err("Failed to add LMB back, drc index %x\n", + lmb->drc_index); + + drmem_remove_lmb_reservation(lmb); + } + + rc = -EINVAL; + } else { + for_each_drmem_lmb(lmb) { + if (!drmem_lmb_reserved(lmb)) + continue; + + dlpar_release_drc(lmb->drc_index); + pr_info("Memory at %llx was hot-removed\n", + lmb->base_addr); + + drmem_remove_lmb_reservation(lmb); + } + rc = 0; + } + + return rc; +} + +static int dlpar_memory_remove_by_index(u32 drc_index) +{ + struct drmem_lmb *lmb; + int lmb_found; + int rc; + + pr_info("Attempting to hot-remove LMB, drc index %x\n", drc_index); + + lmb_found = 0; + for_each_drmem_lmb(lmb) { + if (lmb->drc_index == drc_index) { + lmb_found = 1; + rc = dlpar_remove_lmb(lmb); + if (!rc) + dlpar_release_drc(lmb->drc_index); + + break; + } + } + + if (!lmb_found) + rc = -EINVAL; + + if (rc) + pr_info("Failed to hot-remove memory at %llx\n", + lmb->base_addr); + else + pr_info("Memory at %llx was hot-removed\n", lmb->base_addr); + + return rc; +} + +static int dlpar_memory_readd_by_index(u32 drc_index) +{ + struct drmem_lmb *lmb; + int lmb_found; + int rc; + + pr_info("Attempting to update LMB, drc index %x\n", drc_index); + + lmb_found = 0; + for_each_drmem_lmb(lmb) { + if (lmb->drc_index == drc_index) { + lmb_found = 1; + rc = dlpar_remove_lmb(lmb); + if (!rc) { + rc = dlpar_add_lmb(lmb); + if (rc) + dlpar_release_drc(lmb->drc_index); + } + break; + } + } + + if (!lmb_found) + rc = -EINVAL; + + if (rc) + pr_info("Failed to update memory at %llx\n", + lmb->base_addr); + else + pr_info("Memory at %llx was updated\n", lmb->base_addr); + + return rc; +} + +static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index) +{ + struct drmem_lmb *lmb, *start_lmb, *end_lmb; + int lmbs_available = 0; + int rc; + + pr_info("Attempting to hot-remove %u 
LMB(s) at %x\n", + lmbs_to_remove, drc_index); + + if (lmbs_to_remove == 0) + return -EINVAL; + + rc = get_lmb_range(drc_index, lmbs_to_remove, &start_lmb, &end_lmb); + if (rc) + return -EINVAL; + + /* Validate that there are enough LMBs to satisfy the request */ + for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) { + if (lmb->flags & DRCONF_MEM_RESERVED) + break; + + lmbs_available++; + } + + if (lmbs_available < lmbs_to_remove) + return -EINVAL; + + for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) { + if (!(lmb->flags & DRCONF_MEM_ASSIGNED)) + continue; + + rc = dlpar_remove_lmb(lmb); + if (rc) + break; + + drmem_mark_lmb_reserved(lmb); + } + + if (rc) { + pr_err("Memory indexed-count-remove failed, adding any removed LMBs\n"); + + + for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) { + if (!drmem_lmb_reserved(lmb)) + continue; + + rc = dlpar_add_lmb(lmb); + if (rc) + pr_err("Failed to add LMB, drc index %x\n", + lmb->drc_index); + + drmem_remove_lmb_reservation(lmb); + } + rc = -EINVAL; + } else { + for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) { + if (!drmem_lmb_reserved(lmb)) + continue; + + dlpar_release_drc(lmb->drc_index); + pr_info("Memory at %llx (drc index %x) was hot-removed\n", + lmb->base_addr, lmb->drc_index); + + drmem_remove_lmb_reservation(lmb); + } + } + + return rc; +} + +#else +static inline int pseries_remove_memblock(unsigned long base, + unsigned int memblock_size) +{ + return -EOPNOTSUPP; +} +static inline int pseries_remove_mem_node(struct device_node *np) +{ + return 0; +} +static inline int dlpar_memory_remove(struct pseries_hp_errorlog *hp_elog) +{ + return -EOPNOTSUPP; +} +static int dlpar_remove_lmb(struct drmem_lmb *lmb) +{ + return -EOPNOTSUPP; +} +static int dlpar_memory_remove_by_count(u32 lmbs_to_remove) +{ + return -EOPNOTSUPP; +} +static int dlpar_memory_remove_by_index(u32 drc_index) +{ + return -EOPNOTSUPP; +} +static int dlpar_memory_readd_by_index(u32 drc_index) +{ + return -EOPNOTSUPP; +} + +static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + +static int dlpar_add_lmb(struct drmem_lmb *lmb) +{ + unsigned long block_sz; + int nid, rc; + + if (lmb->flags & DRCONF_MEM_ASSIGNED) + return -EINVAL; + + rc = update_lmb_associativity_index(lmb); + if (rc) { + dlpar_release_drc(lmb->drc_index); + return rc; + } + + block_sz = memory_block_size_bytes(); + + /* Find the node id for this address */ + nid = memory_add_physaddr_to_nid(lmb->base_addr); + + /* Add the memory */ + rc = __add_memory(nid, lmb->base_addr, block_sz); + if (rc) { + invalidate_lmb_associativity_index(lmb); + return rc; + } + + rc = dlpar_online_lmb(lmb); + if (rc) { + __remove_memory(nid, lmb->base_addr, block_sz); + invalidate_lmb_associativity_index(lmb); + } else { + lmb->flags |= DRCONF_MEM_ASSIGNED; + } + + return rc; +} + +static int dlpar_memory_add_by_count(u32 lmbs_to_add) +{ + struct drmem_lmb *lmb; + int lmbs_available = 0; + int lmbs_added = 0; + int rc; + + pr_info("Attempting to hot-add %d LMB(s)\n", lmbs_to_add); + + if (lmbs_to_add == 0) + return -EINVAL; + + /* Validate that there are enough LMBs to satisfy the request */ + for_each_drmem_lmb(lmb) { + if (!(lmb->flags & DRCONF_MEM_ASSIGNED)) + lmbs_available++; + + if (lmbs_available == lmbs_to_add) + break; + } + + if (lmbs_available < lmbs_to_add) + return -EINVAL; + + for_each_drmem_lmb(lmb) { + if (lmb->flags & DRCONF_MEM_ASSIGNED) + continue; + + rc = dlpar_acquire_drc(lmb->drc_index); + if (rc) + continue; + + 
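+		/*
+		 * DRC acquired; try to add and online the LMB, releasing
+		 * the DRC again if that fails.
+		 */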
rc = dlpar_add_lmb(lmb); + if (rc) { + dlpar_release_drc(lmb->drc_index); + continue; + } + + /* Mark this lmb so we can remove it later if all of the + * requested LMBs cannot be added. + */ + drmem_mark_lmb_reserved(lmb); + + lmbs_added++; + if (lmbs_added == lmbs_to_add) + break; + } + + if (lmbs_added != lmbs_to_add) { + pr_err("Memory hot-add failed, removing any added LMBs\n"); + + for_each_drmem_lmb(lmb) { + if (!drmem_lmb_reserved(lmb)) + continue; + + rc = dlpar_remove_lmb(lmb); + if (rc) + pr_err("Failed to remove LMB, drc index %x\n", + lmb->drc_index); + else + dlpar_release_drc(lmb->drc_index); + + drmem_remove_lmb_reservation(lmb); + } + rc = -EINVAL; + } else { + for_each_drmem_lmb(lmb) { + if (!drmem_lmb_reserved(lmb)) + continue; + + pr_info("Memory at %llx (drc index %x) was hot-added\n", + lmb->base_addr, lmb->drc_index); + drmem_remove_lmb_reservation(lmb); + } + rc = 0; + } + + return rc; +} + +static int dlpar_memory_add_by_index(u32 drc_index) +{ + struct drmem_lmb *lmb; + int rc, lmb_found; + + pr_info("Attempting to hot-add LMB, drc index %x\n", drc_index); + + lmb_found = 0; + for_each_drmem_lmb(lmb) { + if (lmb->drc_index == drc_index) { + lmb_found = 1; + rc = dlpar_acquire_drc(lmb->drc_index); + if (!rc) { + rc = dlpar_add_lmb(lmb); + if (rc) + dlpar_release_drc(lmb->drc_index); + } + + break; + } + } + + if (!lmb_found) + rc = -EINVAL; + + if (rc) + pr_info("Failed to hot-add memory, drc index %x\n", drc_index); + else + pr_info("Memory at %llx (drc index %x) was hot-added\n", + lmb->base_addr, drc_index); + + return rc; +} + +static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 drc_index) +{ + struct drmem_lmb *lmb, *start_lmb, *end_lmb; + int lmbs_available = 0; + int rc; + + pr_info("Attempting to hot-add %u LMB(s) at index %x\n", + lmbs_to_add, drc_index); + + if (lmbs_to_add == 0) + return -EINVAL; + + rc = get_lmb_range(drc_index, lmbs_to_add, &start_lmb, &end_lmb); + if (rc) + return -EINVAL; + + /* Validate that the LMBs in this range are not reserved */ + for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) { + if (lmb->flags & DRCONF_MEM_RESERVED) + break; + + lmbs_available++; + } + + if (lmbs_available < lmbs_to_add) + return -EINVAL; + + for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) { + if (lmb->flags & DRCONF_MEM_ASSIGNED) + continue; + + rc = dlpar_acquire_drc(lmb->drc_index); + if (rc) + break; + + rc = dlpar_add_lmb(lmb); + if (rc) { + dlpar_release_drc(lmb->drc_index); + break; + } + + drmem_mark_lmb_reserved(lmb); + } + + if (rc) { + pr_err("Memory indexed-count-add failed, removing any added LMBs\n"); + + for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) { + if (!drmem_lmb_reserved(lmb)) + continue; + + rc = dlpar_remove_lmb(lmb); + if (rc) + pr_err("Failed to remove LMB, drc index %x\n", + lmb->drc_index); + else + dlpar_release_drc(lmb->drc_index); + + drmem_remove_lmb_reservation(lmb); + } + rc = -EINVAL; + } else { + for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) { + if (!drmem_lmb_reserved(lmb)) + continue; + + pr_info("Memory at %llx (drc index %x) was hot-added\n", + lmb->base_addr, lmb->drc_index); + drmem_remove_lmb_reservation(lmb); + } + } + + return rc; +} + +int dlpar_memory(struct pseries_hp_errorlog *hp_elog) +{ + u32 count, drc_index; + int rc; + + lock_device_hotplug(); + + switch (hp_elog->action) { + case PSERIES_HP_ELOG_ACTION_ADD: + if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) { + count = hp_elog->_drc_u.drc_count; + rc = dlpar_memory_add_by_count(count); + } else if (hp_elog->id_type == 
PSERIES_HP_ELOG_ID_DRC_INDEX) { + drc_index = hp_elog->_drc_u.drc_index; + rc = dlpar_memory_add_by_index(drc_index); + } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) { + count = hp_elog->_drc_u.ic.count; + drc_index = hp_elog->_drc_u.ic.index; + rc = dlpar_memory_add_by_ic(count, drc_index); + } else { + rc = -EINVAL; + } + + break; + case PSERIES_HP_ELOG_ACTION_REMOVE: + if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) { + count = hp_elog->_drc_u.drc_count; + rc = dlpar_memory_remove_by_count(count); + } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) { + drc_index = hp_elog->_drc_u.drc_index; + rc = dlpar_memory_remove_by_index(drc_index); + } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) { + count = hp_elog->_drc_u.ic.count; + drc_index = hp_elog->_drc_u.ic.index; + rc = dlpar_memory_remove_by_ic(count, drc_index); + } else { + rc = -EINVAL; + } + + break; + case PSERIES_HP_ELOG_ACTION_READD: + drc_index = hp_elog->_drc_u.drc_index; + rc = dlpar_memory_readd_by_index(drc_index); + break; + default: + pr_err("Invalid action (%d) specified\n", hp_elog->action); + rc = -EINVAL; + break; + } + + if (!rc) { + rtas_hp_event = true; + rc = drmem_update_dt(); + rtas_hp_event = false; + } + + unlock_device_hotplug(); + return rc; +} + +static int pseries_add_mem_node(struct device_node *np) +{ + const char *type; + const __be32 *regs; + unsigned long base; + unsigned int lmb_size; + int ret = -EINVAL; + + /* + * Check to see if we are actually adding memory + */ + type = of_get_property(np, "device_type", NULL); + if (type == NULL || strcmp(type, "memory") != 0) + return 0; + + /* + * Find the base and size of the memblock + */ + regs = of_get_property(np, "reg", NULL); + if (!regs) + return ret; + + base = be64_to_cpu(*(unsigned long *)regs); + lmb_size = be32_to_cpu(regs[3]); + + /* + * Update memory region to represent the memory add + */ + ret = memblock_add(base, lmb_size); + return (ret < 0) ? -EINVAL : 0; +} + +static int pseries_update_drconf_memory(struct of_reconfig_data *pr) +{ + struct of_drconf_cell_v1 *new_drmem, *old_drmem; + unsigned long memblock_size; + u32 entries; + __be32 *p; + int i, rc = -EINVAL; + + if (rtas_hp_event) + return 0; + + memblock_size = pseries_memory_block_size(); + if (!memblock_size) + return -EINVAL; + + if (!pr->old_prop) + return 0; + + p = (__be32 *) pr->old_prop->value; + if (!p) + return -EINVAL; + + /* The first int of the property is the number of lmb's described + * by the property. This is followed by an array of of_drconf_cell + * entries. Get the number of entries and skip to the array of + * of_drconf_cell's. + */ + entries = be32_to_cpu(*p++); + old_drmem = (struct of_drconf_cell_v1 *)p; + + p = (__be32 *)pr->prop->value; + p++; + new_drmem = (struct of_drconf_cell_v1 *)p; + + for (i = 0; i < entries; i++) { + if ((be32_to_cpu(old_drmem[i].flags) & DRCONF_MEM_ASSIGNED) && + (!(be32_to_cpu(new_drmem[i].flags) & DRCONF_MEM_ASSIGNED))) { + rc = pseries_remove_memblock( + be64_to_cpu(old_drmem[i].base_addr), + memblock_size); + break; + } else if ((!(be32_to_cpu(old_drmem[i].flags) & + DRCONF_MEM_ASSIGNED)) && + (be32_to_cpu(new_drmem[i].flags) & + DRCONF_MEM_ASSIGNED)) { + rc = memblock_add(be64_to_cpu(old_drmem[i].base_addr), + memblock_size); + rc = (rc < 0) ? 
-EINVAL : 0; + break; + } + } + return rc; +} + +static int pseries_memory_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct of_reconfig_data *rd = data; + int err = 0; + + switch (action) { + case OF_RECONFIG_ATTACH_NODE: + err = pseries_add_mem_node(rd->dn); + break; + case OF_RECONFIG_DETACH_NODE: + err = pseries_remove_mem_node(rd->dn); + break; + case OF_RECONFIG_UPDATE_PROPERTY: + if (!strcmp(rd->prop->name, "ibm,dynamic-memory")) + err = pseries_update_drconf_memory(rd); + break; + } + return notifier_from_errno(err); +} + +static struct notifier_block pseries_mem_nb = { + .notifier_call = pseries_memory_notifier, +}; + +static int __init pseries_memory_hotplug_init(void) +{ + if (firmware_has_feature(FW_FEATURE_LPAR)) + of_reconfig_notifier_register(&pseries_mem_nb); + + return 0; +} +machine_device_initcall(pseries, pseries_memory_hotplug_init); diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S new file mode 100644 index 000000000..50dc9426d --- /dev/null +++ b/arch/powerpc/platforms/pseries/hvCall.S @@ -0,0 +1,339 @@ +/* + * This file contains the generic code to perform a call to the + * pSeries LPAR hypervisor. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/jump_label.h> +#include <asm/hvcall.h> +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/ptrace.h> +#include <asm/feature-fixups.h> + + .section ".text" + +#ifdef CONFIG_TRACEPOINTS + +#ifndef CONFIG_JUMP_LABEL + .section ".toc","aw" + + .globl hcall_tracepoint_refcount +hcall_tracepoint_refcount: + .8byte 0 + + .section ".text" +#endif + +/* + * precall must preserve all registers. use unused STK_PARAM() + * areas to save snapshots and opcode. + */ +#define HCALL_INST_PRECALL(FIRST_REG) \ + mflr r0; \ + std r3,STK_PARAM(R3)(r1); \ + std r4,STK_PARAM(R4)(r1); \ + std r5,STK_PARAM(R5)(r1); \ + std r6,STK_PARAM(R6)(r1); \ + std r7,STK_PARAM(R7)(r1); \ + std r8,STK_PARAM(R8)(r1); \ + std r9,STK_PARAM(R9)(r1); \ + std r10,STK_PARAM(R10)(r1); \ + std r0,16(r1); \ + addi r4,r1,STK_PARAM(FIRST_REG); \ + stdu r1,-STACK_FRAME_OVERHEAD(r1); \ + bl __trace_hcall_entry; \ + ld r3,STACK_FRAME_OVERHEAD+STK_PARAM(R3)(r1); \ + ld r4,STACK_FRAME_OVERHEAD+STK_PARAM(R4)(r1); \ + ld r5,STACK_FRAME_OVERHEAD+STK_PARAM(R5)(r1); \ + ld r6,STACK_FRAME_OVERHEAD+STK_PARAM(R6)(r1); \ + ld r7,STACK_FRAME_OVERHEAD+STK_PARAM(R7)(r1); \ + ld r8,STACK_FRAME_OVERHEAD+STK_PARAM(R8)(r1); \ + ld r9,STACK_FRAME_OVERHEAD+STK_PARAM(R9)(r1); \ + ld r10,STACK_FRAME_OVERHEAD+STK_PARAM(R10)(r1) + +/* + * postcall is performed immediately before function return which + * allows liberal use of volatile registers. 
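+ *
+ * Only r3 (the hcall status) must survive the trace call; it is
+ * saved around __trace_hcall_exit and reloaded before returning.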
+ */ +#define __HCALL_INST_POSTCALL \ + ld r0,STACK_FRAME_OVERHEAD+STK_PARAM(R3)(r1); \ + std r3,STACK_FRAME_OVERHEAD+STK_PARAM(R3)(r1); \ + mr r4,r3; \ + mr r3,r0; \ + bl __trace_hcall_exit; \ + ld r0,STACK_FRAME_OVERHEAD+16(r1); \ + addi r1,r1,STACK_FRAME_OVERHEAD; \ + ld r3,STK_PARAM(R3)(r1); \ + mtlr r0 + +#define HCALL_INST_POSTCALL_NORETS \ + li r5,0; \ + __HCALL_INST_POSTCALL + +#define HCALL_INST_POSTCALL(BUFREG) \ + mr r5,BUFREG; \ + __HCALL_INST_POSTCALL + +#ifdef CONFIG_JUMP_LABEL +#define HCALL_BRANCH(LABEL) \ + ARCH_STATIC_BRANCH(LABEL, hcall_tracepoint_key) +#else + +/* + * We branch around this in early init (eg when populating the MMU + * hashtable) by using an unconditional cpu feature. + */ +#define HCALL_BRANCH(LABEL) \ +BEGIN_FTR_SECTION; \ + b 1f; \ +END_FTR_SECTION(0, 1); \ + ld r12,hcall_tracepoint_refcount@toc(r2); \ + std r12,32(r1); \ + cmpdi r12,0; \ + bne- LABEL; \ +1: +#endif + +#else +#define HCALL_INST_PRECALL(FIRST_ARG) +#define HCALL_INST_POSTCALL_NORETS +#define HCALL_INST_POSTCALL(BUFREG) +#define HCALL_BRANCH(LABEL) +#endif + +_GLOBAL_TOC(plpar_hcall_norets) + HMT_MEDIUM + + mfcr r0 + stw r0,8(r1) + HCALL_BRANCH(plpar_hcall_norets_trace) + HVSC /* invoke the hypervisor */ + + lwz r0,8(r1) + mtcrf 0xff,r0 + blr /* return r3 = status */ + +#ifdef CONFIG_TRACEPOINTS +plpar_hcall_norets_trace: + HCALL_INST_PRECALL(R4) + HVSC + HCALL_INST_POSTCALL_NORETS + lwz r0,8(r1) + mtcrf 0xff,r0 + blr +#endif + +_GLOBAL_TOC(plpar_hcall) + HMT_MEDIUM + + mfcr r0 + stw r0,8(r1) + + HCALL_BRANCH(plpar_hcall_trace) + + std r4,STK_PARAM(R4)(r1) /* Save ret buffer */ + + mr r4,r5 + mr r5,r6 + mr r6,r7 + mr r7,r8 + mr r8,r9 + mr r9,r10 + + HVSC /* invoke the hypervisor */ + + ld r12,STK_PARAM(R4)(r1) + std r4, 0(r12) + std r5, 8(r12) + std r6, 16(r12) + std r7, 24(r12) + + lwz r0,8(r1) + mtcrf 0xff,r0 + + blr /* return r3 = status */ + +#ifdef CONFIG_TRACEPOINTS +plpar_hcall_trace: + HCALL_INST_PRECALL(R5) + + std r4,STK_PARAM(R4)(r1) + mr r0,r4 + + mr r4,r5 + mr r5,r6 + mr r6,r7 + mr r7,r8 + mr r8,r9 + mr r9,r10 + + HVSC + + ld r12,STK_PARAM(R4)(r1) + std r4,0(r12) + std r5,8(r12) + std r6,16(r12) + std r7,24(r12) + + HCALL_INST_POSTCALL(r12) + + lwz r0,8(r1) + mtcrf 0xff,r0 + + blr +#endif + +/* + * plpar_hcall_raw can be called in real mode. kexec/kdump need some + * hypervisor calls to be executed in real mode. So plpar_hcall_raw + * does not access the per cpu hypervisor call statistics variables, + * since these variables may not be present in the RMO region. 
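+ *
+ * For the same reason the _raw variants skip the HCALL_BRANCH()
+ * tracepoint check that the traced entry points perform.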
+ */ +_GLOBAL(plpar_hcall_raw) + HMT_MEDIUM + + mfcr r0 + stw r0,8(r1) + + std r4,STK_PARAM(R4)(r1) /* Save ret buffer */ + + mr r4,r5 + mr r5,r6 + mr r6,r7 + mr r7,r8 + mr r8,r9 + mr r9,r10 + + HVSC /* invoke the hypervisor */ + + ld r12,STK_PARAM(R4)(r1) + std r4, 0(r12) + std r5, 8(r12) + std r6, 16(r12) + std r7, 24(r12) + + lwz r0,8(r1) + mtcrf 0xff,r0 + + blr /* return r3 = status */ + +_GLOBAL_TOC(plpar_hcall9) + HMT_MEDIUM + + mfcr r0 + stw r0,8(r1) + + HCALL_BRANCH(plpar_hcall9_trace) + + std r4,STK_PARAM(R4)(r1) /* Save ret buffer */ + + mr r4,r5 + mr r5,r6 + mr r6,r7 + mr r7,r8 + mr r8,r9 + mr r9,r10 + ld r10,STK_PARAM(R11)(r1) /* put arg7 in R10 */ + ld r11,STK_PARAM(R12)(r1) /* put arg8 in R11 */ + ld r12,STK_PARAM(R13)(r1) /* put arg9 in R12 */ + + HVSC /* invoke the hypervisor */ + + mr r0,r12 + ld r12,STK_PARAM(R4)(r1) + std r4, 0(r12) + std r5, 8(r12) + std r6, 16(r12) + std r7, 24(r12) + std r8, 32(r12) + std r9, 40(r12) + std r10,48(r12) + std r11,56(r12) + std r0, 64(r12) + + lwz r0,8(r1) + mtcrf 0xff,r0 + + blr /* return r3 = status */ + +#ifdef CONFIG_TRACEPOINTS +plpar_hcall9_trace: + HCALL_INST_PRECALL(R5) + + std r4,STK_PARAM(R4)(r1) + mr r0,r4 + + mr r4,r5 + mr r5,r6 + mr r6,r7 + mr r7,r8 + mr r8,r9 + mr r9,r10 + ld r10,STACK_FRAME_OVERHEAD+STK_PARAM(R11)(r1) + ld r11,STACK_FRAME_OVERHEAD+STK_PARAM(R12)(r1) + ld r12,STACK_FRAME_OVERHEAD+STK_PARAM(R13)(r1) + + HVSC + + mr r0,r12 + ld r12,STACK_FRAME_OVERHEAD+STK_PARAM(R4)(r1) + std r4,0(r12) + std r5,8(r12) + std r6,16(r12) + std r7,24(r12) + std r8,32(r12) + std r9,40(r12) + std r10,48(r12) + std r11,56(r12) + std r0,64(r12) + + HCALL_INST_POSTCALL(r12) + + lwz r0,8(r1) + mtcrf 0xff,r0 + + blr +#endif + +/* See plpar_hcall_raw to see why this is needed */ +_GLOBAL(plpar_hcall9_raw) + HMT_MEDIUM + + mfcr r0 + stw r0,8(r1) + + std r4,STK_PARAM(R4)(r1) /* Save ret buffer */ + + mr r4,r5 + mr r5,r6 + mr r6,r7 + mr r7,r8 + mr r8,r9 + mr r9,r10 + ld r10,STK_PARAM(R11)(r1) /* put arg7 in R10 */ + ld r11,STK_PARAM(R12)(r1) /* put arg8 in R11 */ + ld r12,STK_PARAM(R13)(r1) /* put arg9 in R12 */ + + HVSC /* invoke the hypervisor */ + + mr r0,r12 + ld r12,STK_PARAM(R4)(r1) + std r4, 0(r12) + std r5, 8(r12) + std r6, 16(r12) + std r7, 24(r12) + std r8, 32(r12) + std r9, 40(r12) + std r10,48(r12) + std r11,56(r12) + std r0, 64(r12) + + lwz r0,8(r1) + mtcrf 0xff,r0 + + blr /* return r3 = status */ diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c new file mode 100644 index 000000000..6da320c78 --- /dev/null +++ b/arch/powerpc/platforms/pseries/hvCall_inst.c @@ -0,0 +1,176 @@ +/* + * Copyright (C) 2006 Mike Kravetz IBM Corporation + * + * Hypervisor Call Instrumentation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/cpumask.h> +#include <asm/hvcall.h> +#include <asm/firmware.h> +#include <asm/cputable.h> +#include <asm/trace.h> +#include <asm/machdep.h> + +/* For hcall instrumentation. One structure per-hcall, per-CPU */ +struct hcall_stats { + unsigned long num_calls; /* number of calls (on this CPU) */ + unsigned long tb_total; /* total wall time (mftb) of calls. */ + unsigned long purr_total; /* total cpu time (PURR) of calls. */ + unsigned long tb_start; + unsigned long purr_start; +}; +#define HCALL_STAT_ARRAY_SIZE ((MAX_HCALL_OPCODE >> 2) + 1) + +DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats); + +/* + * Routines for displaying the statistics in debugfs + */ +static void *hc_start(struct seq_file *m, loff_t *pos) +{ + if ((int)*pos < (HCALL_STAT_ARRAY_SIZE-1)) + return (void *)(unsigned long)(*pos + 1); + + return NULL; +} + +static void *hc_next(struct seq_file *m, void *p, loff_t * pos) +{ + ++*pos; + + return hc_start(m, pos); +} + +static void hc_stop(struct seq_file *m, void *p) +{ +} + +static int hc_show(struct seq_file *m, void *p) +{ + unsigned long h_num = (unsigned long)p; + struct hcall_stats *hs = m->private; + + if (hs[h_num].num_calls) { + if (cpu_has_feature(CPU_FTR_PURR)) + seq_printf(m, "%lu %lu %lu %lu\n", h_num<<2, + hs[h_num].num_calls, + hs[h_num].tb_total, + hs[h_num].purr_total); + else + seq_printf(m, "%lu %lu %lu\n", h_num<<2, + hs[h_num].num_calls, + hs[h_num].tb_total); + } + + return 0; +} + +static const struct seq_operations hcall_inst_seq_ops = { + .start = hc_start, + .next = hc_next, + .stop = hc_stop, + .show = hc_show +}; + +static int hcall_inst_seq_open(struct inode *inode, struct file *file) +{ + int rc; + struct seq_file *seq; + + rc = seq_open(file, &hcall_inst_seq_ops); + seq = file->private_data; + seq->private = file_inode(file)->i_private; + + return rc; +} + +static const struct file_operations hcall_inst_seq_fops = { + .open = hcall_inst_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#define HCALL_ROOT_DIR "hcall_inst" +#define CPU_NAME_BUF_SIZE 32 + + +static void probe_hcall_entry(void *ignored, unsigned long opcode, unsigned long *args) +{ + struct hcall_stats *h; + + if (opcode > MAX_HCALL_OPCODE) + return; + + h = this_cpu_ptr(&hcall_stats[opcode / 4]); + h->tb_start = mftb(); + h->purr_start = mfspr(SPRN_PURR); +} + +static void probe_hcall_exit(void *ignored, unsigned long opcode, long retval, + unsigned long *retbuf) +{ + struct hcall_stats *h; + + if (opcode > MAX_HCALL_OPCODE) + return; + + h = this_cpu_ptr(&hcall_stats[opcode / 4]); + h->num_calls++; + h->tb_total += mftb() - h->tb_start; + h->purr_total += mfspr(SPRN_PURR) - h->purr_start; +} + +static int __init hcall_inst_init(void) +{ + struct dentry *hcall_root; + struct dentry *hcall_file; + char cpu_name_buf[CPU_NAME_BUF_SIZE]; + int cpu; + + if (!firmware_has_feature(FW_FEATURE_LPAR)) + return 0; + + if (register_trace_hcall_entry(probe_hcall_entry, NULL)) + return -EINVAL; + + if (register_trace_hcall_exit(probe_hcall_exit, NULL)) { + unregister_trace_hcall_entry(probe_hcall_entry, NULL); + return -EINVAL; + } + + hcall_root = debugfs_create_dir(HCALL_ROOT_DIR, NULL); + if 
(!hcall_root) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + snprintf(cpu_name_buf, CPU_NAME_BUF_SIZE, "cpu%d", cpu); + hcall_file = debugfs_create_file(cpu_name_buf, 0444, + hcall_root, + per_cpu(hcall_stats, cpu), + &hcall_inst_seq_fops); + if (!hcall_file) + return -ENOMEM; + } + + return 0; +} +machine_device_initcall(pseries, hcall_inst_init); diff --git a/arch/powerpc/platforms/pseries/hvconsole.c b/arch/powerpc/platforms/pseries/hvconsole.c new file mode 100644 index 000000000..73ec15cd2 --- /dev/null +++ b/arch/powerpc/platforms/pseries/hvconsole.c @@ -0,0 +1,88 @@ +/* + * hvconsole.c + * Copyright (C) 2004 Hollis Blanchard, IBM Corporation + * Copyright (C) 2004 IBM Corporation + * + * Additional Author(s): + * Ryan S. Arnold <rsa@us.ibm.com> + * + * LPAR console support. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/errno.h> +#include <asm/hvcall.h> +#include <asm/hvconsole.h> +#include <asm/plpar_wrappers.h> + +/** + * hvc_get_chars - retrieve characters from firmware for denoted vterm adapter + * @vtermno: The vtermno or unit_address of the adapter from which to fetch the + * data. + * @buf: The character buffer into which to put the character data fetched from + * firmware. + * @count: not used? + */ +int hvc_get_chars(uint32_t vtermno, char *buf, int count) +{ + long ret; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + unsigned long *lbuf = (unsigned long *)buf; + + ret = plpar_hcall(H_GET_TERM_CHAR, retbuf, vtermno); + lbuf[0] = be64_to_cpu(retbuf[1]); + lbuf[1] = be64_to_cpu(retbuf[2]); + + if (ret == H_SUCCESS) + return retbuf[0]; + + return 0; +} + +EXPORT_SYMBOL(hvc_get_chars); + + +/** + * hvc_put_chars: send characters to firmware for denoted vterm adapter + * @vtermno: The vtermno or unit_address of the adapter from which the data + * originated. + * @buf: The character buffer that contains the character data to send to + * firmware. Must be at least 16 bytes, even if count is less than 16. + * @count: Send this number of characters. 
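+ *
+ * Returns the number of characters actually sent (at most
+ * MAX_VIO_PUT_CHARS) on success, -EAGAIN if the hypervisor returned
+ * H_BUSY, or -EIO on any other failure.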
+ */ +int hvc_put_chars(uint32_t vtermno, const char *buf, int count) +{ + unsigned long *lbuf = (unsigned long *) buf; + long ret; + + + /* hcall will ret H_PARAMETER if 'count' exceeds firmware max.*/ + if (count > MAX_VIO_PUT_CHARS) + count = MAX_VIO_PUT_CHARS; + + ret = plpar_hcall_norets(H_PUT_TERM_CHAR, vtermno, count, + cpu_to_be64(lbuf[0]), + cpu_to_be64(lbuf[1])); + if (ret == H_SUCCESS) + return count; + if (ret == H_BUSY) + return -EAGAIN; + return -EIO; +} + +EXPORT_SYMBOL(hvc_put_chars); diff --git a/arch/powerpc/platforms/pseries/hvcserver.c b/arch/powerpc/platforms/pseries/hvcserver.c new file mode 100644 index 000000000..94a6e5612 --- /dev/null +++ b/arch/powerpc/platforms/pseries/hvcserver.c @@ -0,0 +1,252 @@ +/* + * hvcserver.c + * Copyright (C) 2004 Ryan S Arnold, IBM Corporation + * + * PPC64 virtual I/O console server support. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/string.h> + +#include <asm/hvcall.h> +#include <asm/hvcserver.h> +#include <asm/io.h> + +#define HVCS_ARCH_VERSION "1.0.0" + +MODULE_AUTHOR("Ryan S. Arnold <rsa@us.ibm.com>"); +MODULE_DESCRIPTION("IBM hvcs ppc64 API"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(HVCS_ARCH_VERSION); + +/* + * Convert arch specific return codes into relevant errnos. The hvcs + * functions aren't performance sensitive, so this conversion isn't an + * issue. + */ +static int hvcs_convert(long to_convert) +{ + switch (to_convert) { + case H_SUCCESS: + return 0; + case H_PARAMETER: + return -EINVAL; + case H_HARDWARE: + return -EIO; + case H_BUSY: + case H_LONG_BUSY_ORDER_1_MSEC: + case H_LONG_BUSY_ORDER_10_MSEC: + case H_LONG_BUSY_ORDER_100_MSEC: + case H_LONG_BUSY_ORDER_1_SEC: + case H_LONG_BUSY_ORDER_10_SEC: + case H_LONG_BUSY_ORDER_100_SEC: + return -EBUSY; + case H_FUNCTION: /* fall through */ + default: + return -EPERM; + } +} + +/** + * hvcs_free_partner_info - free pi allocated by hvcs_get_partner_info + * @head: list_head pointer for an allocated list of partner info structs to + * free. + * + * This function is used to free the partner info list that was returned by + * calling hvcs_get_partner_info(). 
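+ *
+ * Returns 0 on success, or -EINVAL if @head is NULL.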
+ */
+int hvcs_free_partner_info(struct list_head *head)
+{
+	struct hvcs_partner_info *pi;
+	struct list_head *element;
+
+	if (!head)
+		return -EINVAL;
+
+	while (!list_empty(head)) {
+		element = head->next;
+		pi = list_entry(element, struct hvcs_partner_info, node);
+		list_del(element);
+		kfree(pi);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(hvcs_free_partner_info);
+
+/* Helper function for hvcs_get_partner_info */
+static int hvcs_next_partner(uint32_t unit_address,
+		unsigned long last_p_partition_ID,
+		unsigned long last_p_unit_address, unsigned long *pi_buff)
+{
+	long retval;
+	retval = plpar_hcall_norets(H_VTERM_PARTNER_INFO, unit_address,
+			last_p_partition_ID,
+			last_p_unit_address, virt_to_phys(pi_buff));
+	return hvcs_convert(retval);
+}
+
+/**
+ * hvcs_get_partner_info - Get all of the partner info for a vty-server adapter
+ * @unit_address: The unit_address of the vty-server adapter for which this
+ *	function is fetching partner info.
+ * @head: An initialized list_head pointer to an empty list, used to return
+ *	the list of partner info fetched from the hypervisor to the caller.
+ * @pi_buff: A page sized buffer pre-allocated prior to calling this function
+ *	that is to be used by firmware as an iterator to keep track of the
+ *	partner info retrieval.
+ *
+ * This function returns zero on success (even when there is no partner
+ * info) and a converted errno on failure.
+ *
+ * The pi_buff is pre-allocated prior to calling this function because this
+ * function may be called with a spin_lock held, and a page-sized kmalloc
+ * with GFP_ATOMIC is not recommended.
+ *
+ * The first long of this buffer is used to store a partner unit address.
+ * The second long is used to store a partner partition ID, and starting at
+ * pi_buff[2] is the 79 character Converged Location Code (a different size
+ * than the unsigned longs, hence the casting mumbo jumbo you see later).
+ *
+ * Invocation of this function should always be followed by an invocation
+ * of hvcs_free_partner_info() using a pointer to the SAME list head
+ * instance that was passed as a parameter to this function.
+ */
+int hvcs_get_partner_info(uint32_t unit_address, struct list_head *head,
+		unsigned long *pi_buff)
+{
+	/*
+	 * Dealt with as longs because of the hcall interface even though the
+	 * values are uint32_t.
+	 */
+	unsigned long last_p_partition_ID;
+	unsigned long last_p_unit_address;
+	struct hvcs_partner_info *next_partner_info = NULL;
+	int more = 1;
+	int retval;
+
+	/* invalid parameters */
+	if (!head || !pi_buff)
+		return -EINVAL;
+
+	memset(pi_buff, 0x00, PAGE_SIZE);
+	last_p_partition_ID = last_p_unit_address = ~0UL;
+	INIT_LIST_HEAD(head);
+
+	do {
+		retval = hvcs_next_partner(unit_address, last_p_partition_ID,
+				last_p_unit_address, pi_buff);
+		if (retval) {
+			/*
+			 * Don't indicate that we've failed if we have
+			 * any list elements.
+			 */
+			if (!list_empty(head))
+				return 0;
+			return retval;
+		}
+
+		last_p_partition_ID = be64_to_cpu(pi_buff[0]);
+		last_p_unit_address = be64_to_cpu(pi_buff[1]);
+
+		/* This indicates that there are no further partners */
+		if (last_p_partition_ID == ~0UL
+				&& last_p_unit_address == ~0UL)
+			break;
+
+		/* This is a very small struct and will be freed soon in
+		 * hvcs_free_partner_info().
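+		 * Unlike the page-sized iterator buffer, this small
+		 * allocation can safely use GFP_ATOMIC even under a
+		 * caller's spinlock.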
*/ + next_partner_info = kmalloc(sizeof(struct hvcs_partner_info), + GFP_ATOMIC); + + if (!next_partner_info) { + printk(KERN_WARNING "HVCONSOLE: kmalloc() failed to" + " allocate partner info struct.\n"); + hvcs_free_partner_info(head); + return -ENOMEM; + } + + next_partner_info->unit_address + = (unsigned int)last_p_unit_address; + next_partner_info->partition_ID + = (unsigned int)last_p_partition_ID; + + /* copy the Null-term char too */ + strlcpy(&next_partner_info->location_code[0], + (char *)&pi_buff[2], + sizeof(next_partner_info->location_code)); + + list_add_tail(&(next_partner_info->node), head); + next_partner_info = NULL; + + } while (more); + + return 0; +} +EXPORT_SYMBOL(hvcs_get_partner_info); + +/** + * hvcs_register_connection - establish a connection between this vty-server and + * a vty. + * @unit_address: The unit address of the vty-server adapter that is to be + * establish a connection. + * @p_partition_ID: The partition ID of the vty adapter that is to be connected. + * @p_unit_address: The unit address of the vty adapter to which the vty-server + * is to be connected. + * + * If this function is called once and -EINVAL is returned it may + * indicate that the partner info needs to be refreshed for the + * target unit address at which point the caller must invoke + * hvcs_get_partner_info() and then call this function again. If, + * for a second time, -EINVAL is returned then it indicates that + * there is probably already a partner connection registered to a + * different vty-server adapter. It is also possible that a second + * -EINVAL may indicate that one of the parms is not valid, for + * instance if the link was removed between the vty-server adapter + * and the vty adapter that you are trying to open. Don't shoot the + * messenger. Firmware implemented it this way. + */ +int hvcs_register_connection( uint32_t unit_address, + uint32_t p_partition_ID, uint32_t p_unit_address) +{ + long retval; + retval = plpar_hcall_norets(H_REGISTER_VTERM, unit_address, + p_partition_ID, p_unit_address); + return hvcs_convert(retval); +} +EXPORT_SYMBOL(hvcs_register_connection); + +/** + * hvcs_free_connection - free the connection between a vty-server and vty + * @unit_address: The unit address of the vty-server that is to have its + * connection severed. + * + * This function is used to free the partner connection between a vty-server + * adapter and a vty adapter. + * + * If -EBUSY is returned continue to call this function until 0 is returned. + */ +int hvcs_free_connection(uint32_t unit_address) +{ + long retval; + retval = plpar_hcall_norets(H_FREE_VTERM, unit_address); + return hvcs_convert(retval); +} +EXPORT_SYMBOL(hvcs_free_connection); diff --git a/arch/powerpc/platforms/pseries/ibmebus.c b/arch/powerpc/platforms/pseries/ibmebus.c new file mode 100644 index 000000000..c7c1140c1 --- /dev/null +++ b/arch/powerpc/platforms/pseries/ibmebus.c @@ -0,0 +1,469 @@ +/* + * IBM PowerPC IBM eBus Infrastructure Support. + * + * Copyright (c) 2005 IBM Corporation + * Joachim Fenkes <fenkes@de.ibm.com> + * Heiko J Schick <schickhj@de.ibm.com> + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. 
+ * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/init.h> +#include <linux/export.h> +#include <linux/console.h> +#include <linux/kobject.h> +#include <linux/dma-mapping.h> +#include <linux/interrupt.h> +#include <linux/of.h> +#include <linux/slab.h> +#include <linux/stat.h> +#include <linux/of_platform.h> +#include <asm/ibmebus.h> + +static struct device ibmebus_bus_device = { /* fake "parent" device */ + .init_name = "ibmebus", +}; + +struct bus_type ibmebus_bus_type; + +/* These devices will automatically be added to the bus during init */ +static const struct of_device_id ibmebus_matches[] __initconst = { + { .compatible = "IBM,lhca" }, + { .compatible = "IBM,lhea" }, + {}, +}; + +static void *ibmebus_alloc_coherent(struct device *dev, + size_t size, + dma_addr_t *dma_handle, + gfp_t flag, + unsigned long attrs) +{ + void *mem; + + mem = kmalloc(size, flag); + *dma_handle = (dma_addr_t)mem; + + return mem; +} + +static void ibmebus_free_coherent(struct device *dev, + size_t size, void *vaddr, + dma_addr_t dma_handle, + unsigned long attrs) +{ + kfree(vaddr); +} + +static dma_addr_t ibmebus_map_page(struct device *dev, + struct page *page, + unsigned long offset, + size_t size, + enum dma_data_direction direction, + unsigned long attrs) +{ + return (dma_addr_t)(page_address(page) + offset); +} + +static void ibmebus_unmap_page(struct device *dev, + dma_addr_t dma_addr, + size_t size, + enum dma_data_direction direction, + unsigned long attrs) +{ + return; +} + +static int ibmebus_map_sg(struct device *dev, + struct scatterlist *sgl, + int nents, enum dma_data_direction direction, + unsigned long attrs) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nents, i) { + sg->dma_address = (dma_addr_t) sg_virt(sg); + sg->dma_length = sg->length; + } + + return nents; +} + +static void ibmebus_unmap_sg(struct device *dev, + struct scatterlist *sg, + int nents, enum dma_data_direction direction, + unsigned long attrs) +{ + return; +} + +static int ibmebus_dma_supported(struct device *dev, u64 mask) +{ + return mask == DMA_BIT_MASK(64); +} + +static u64 ibmebus_dma_get_required_mask(struct device *dev) +{ + return DMA_BIT_MASK(64); +} + +static const struct dma_map_ops ibmebus_dma_ops = { + .alloc = ibmebus_alloc_coherent, + .free = ibmebus_free_coherent, + .map_sg = ibmebus_map_sg, + .unmap_sg = ibmebus_unmap_sg, + .dma_supported = ibmebus_dma_supported, + .get_required_mask = ibmebus_dma_get_required_mask, + .map_page = ibmebus_map_page, + .unmap_page = ibmebus_unmap_page, +}; + +static int 
ibmebus_match_path(struct device *dev, void *data) +{ + struct device_node *dn = to_platform_device(dev)->dev.of_node; + return (of_find_node_by_path(data) == dn); +} + +static int ibmebus_match_node(struct device *dev, void *data) +{ + return to_platform_device(dev)->dev.of_node == data; +} + +static int ibmebus_create_device(struct device_node *dn) +{ + struct platform_device *dev; + int ret; + + dev = of_device_alloc(dn, NULL, &ibmebus_bus_device); + if (!dev) + return -ENOMEM; + + dev->dev.bus = &ibmebus_bus_type; + dev->dev.dma_ops = &ibmebus_dma_ops; + + ret = of_device_add(dev); + if (ret) + platform_device_put(dev); + return ret; +} + +static int ibmebus_create_devices(const struct of_device_id *matches) +{ + struct device_node *root, *child; + struct device *dev; + int ret = 0; + + root = of_find_node_by_path("/"); + + for_each_child_of_node(root, child) { + if (!of_match_node(matches, child)) + continue; + + dev = bus_find_device(&ibmebus_bus_type, NULL, child, + ibmebus_match_node); + if (dev) { + put_device(dev); + continue; + } + + ret = ibmebus_create_device(child); + if (ret) { + printk(KERN_ERR "%s: failed to create device (%i)", + __func__, ret); + of_node_put(child); + break; + } + } + + of_node_put(root); + return ret; +} + +int ibmebus_register_driver(struct platform_driver *drv) +{ + /* If the driver uses devices that ibmebus doesn't know, add them */ + ibmebus_create_devices(drv->driver.of_match_table); + + drv->driver.bus = &ibmebus_bus_type; + return driver_register(&drv->driver); +} +EXPORT_SYMBOL(ibmebus_register_driver); + +void ibmebus_unregister_driver(struct platform_driver *drv) +{ + driver_unregister(&drv->driver); +} +EXPORT_SYMBOL(ibmebus_unregister_driver); + +int ibmebus_request_irq(u32 ist, irq_handler_t handler, + unsigned long irq_flags, const char *devname, + void *dev_id) +{ + unsigned int irq = irq_create_mapping(NULL, ist); + + if (!irq) + return -EINVAL; + + return request_irq(irq, handler, irq_flags, devname, dev_id); +} +EXPORT_SYMBOL(ibmebus_request_irq); + +void ibmebus_free_irq(u32 ist, void *dev_id) +{ + unsigned int irq = irq_find_mapping(NULL, ist); + + free_irq(irq, dev_id); + irq_dispose_mapping(irq); +} +EXPORT_SYMBOL(ibmebus_free_irq); + +static char *ibmebus_chomp(const char *in, size_t count) +{ + char *out = kmalloc(count + 1, GFP_KERNEL); + + if (!out) + return NULL; + + memcpy(out, in, count); + out[count] = '\0'; + if (out[count - 1] == '\n') + out[count - 1] = '\0'; + + return out; +} + +static ssize_t ibmebus_store_probe(struct bus_type *bus, + const char *buf, size_t count) +{ + struct device_node *dn = NULL; + struct device *dev; + char *path; + ssize_t rc = 0; + + path = ibmebus_chomp(buf, count); + if (!path) + return -ENOMEM; + + dev = bus_find_device(&ibmebus_bus_type, NULL, path, + ibmebus_match_path); + if (dev) { + put_device(dev); + printk(KERN_WARNING "%s: %s has already been probed\n", + __func__, path); + rc = -EEXIST; + goto out; + } + + if ((dn = of_find_node_by_path(path))) { + rc = ibmebus_create_device(dn); + of_node_put(dn); + } else { + printk(KERN_WARNING "%s: no such device node: %s\n", + __func__, path); + rc = -ENODEV; + } + +out: + kfree(path); + if (rc) + return rc; + return count; +} +static BUS_ATTR(probe, 0200, NULL, ibmebus_store_probe); + +static ssize_t ibmebus_store_remove(struct bus_type *bus, + const char *buf, size_t count) +{ + struct device *dev; + char *path; + + path = ibmebus_chomp(buf, count); + if (!path) + return -ENOMEM; + + if ((dev = bus_find_device(&ibmebus_bus_type, NULL, path, 
+ ibmebus_match_path))) { + of_device_unregister(to_platform_device(dev)); + put_device(dev); + + kfree(path); + return count; + } else { + printk(KERN_WARNING "%s: %s not on the bus\n", + __func__, path); + + kfree(path); + return -ENODEV; + } +} +static BUS_ATTR(remove, 0200, NULL, ibmebus_store_remove); + +static struct attribute *ibmbus_bus_attrs[] = { + &bus_attr_probe.attr, + &bus_attr_remove.attr, + NULL, +}; +ATTRIBUTE_GROUPS(ibmbus_bus); + +static int ibmebus_bus_bus_match(struct device *dev, struct device_driver *drv) +{ + const struct of_device_id *matches = drv->of_match_table; + + if (!matches) + return 0; + + return of_match_device(matches, dev) != NULL; +} + +static int ibmebus_bus_device_probe(struct device *dev) +{ + int error = -ENODEV; + struct platform_driver *drv; + struct platform_device *of_dev; + + drv = to_platform_driver(dev->driver); + of_dev = to_platform_device(dev); + + if (!drv->probe) + return error; + + of_dev_get(of_dev); + + if (of_driver_match_device(dev, dev->driver)) + error = drv->probe(of_dev); + if (error) + of_dev_put(of_dev); + + return error; +} + +static int ibmebus_bus_device_remove(struct device *dev) +{ + struct platform_device *of_dev = to_platform_device(dev); + struct platform_driver *drv = to_platform_driver(dev->driver); + + if (dev->driver && drv->remove) + drv->remove(of_dev); + return 0; +} + +static void ibmebus_bus_device_shutdown(struct device *dev) +{ + struct platform_device *of_dev = to_platform_device(dev); + struct platform_driver *drv = to_platform_driver(dev->driver); + + if (dev->driver && drv->shutdown) + drv->shutdown(of_dev); +} + +/* + * ibmebus_bus_device_attrs + */ +static ssize_t devspec_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct platform_device *ofdev; + + ofdev = to_platform_device(dev); + return sprintf(buf, "%pOF\n", ofdev->dev.of_node); +} +static DEVICE_ATTR_RO(devspec); + +static ssize_t name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct platform_device *ofdev; + + ofdev = to_platform_device(dev); + return sprintf(buf, "%s\n", ofdev->dev.of_node->name); +} +static DEVICE_ATTR_RO(name); + +static ssize_t modalias_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return of_device_modalias(dev, buf, PAGE_SIZE); +} +static DEVICE_ATTR_RO(modalias); + +static struct attribute *ibmebus_bus_device_attrs[] = { + &dev_attr_devspec.attr, + &dev_attr_name.attr, + &dev_attr_modalias.attr, + NULL, +}; +ATTRIBUTE_GROUPS(ibmebus_bus_device); + +struct bus_type ibmebus_bus_type = { + .name = "ibmebus", + .uevent = of_device_uevent_modalias, + .bus_groups = ibmbus_bus_groups, + .match = ibmebus_bus_bus_match, + .probe = ibmebus_bus_device_probe, + .remove = ibmebus_bus_device_remove, + .shutdown = ibmebus_bus_device_shutdown, + .dev_groups = ibmebus_bus_device_groups, +}; +EXPORT_SYMBOL(ibmebus_bus_type); + +static int __init ibmebus_bus_init(void) +{ + int err; + + printk(KERN_INFO "IBM eBus Device Driver\n"); + + err = bus_register(&ibmebus_bus_type); + if (err) { + printk(KERN_ERR "%s: failed to register IBM eBus.\n", + __func__); + return err; + } + + err = device_register(&ibmebus_bus_device); + if (err) { + printk(KERN_WARNING "%s: device_register returned %i\n", + __func__, err); + bus_unregister(&ibmebus_bus_type); + + return err; + } + + err = ibmebus_create_devices(ibmebus_matches); + if (err) { + device_unregister(&ibmebus_bus_device); + bus_unregister(&ibmebus_bus_type); + return err; + } + + return 0; +} 
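The init routine above is wired up immediately below as a postcore initcall, so the bus exists before ordinary driver initcalls run. For reference, a client binds to this bus through ibmebus_register_driver(); the following is a minimal sketch only, with an invented compatible string, names, and probe body that are not part of this commit:

#include <linux/module.h>
#include <linux/of.h>
#include <linux/platform_device.h>
#include <asm/ibmebus.h>

/* Hypothetical match table; "IBM,example" is a placeholder. */
static const struct of_device_id example_ebus_ids[] = {
	{ .compatible = "IBM,example" },
	{},
};

static int example_ebus_probe(struct platform_device *pdev)
{
	dev_info(&pdev->dev, "bound to %pOF\n", pdev->dev.of_node);
	return 0;
}

static struct platform_driver example_ebus_driver = {
	.driver = {
		.name		= "example-ebus",
		.of_match_table	= example_ebus_ids,
	},
	.probe = example_ebus_probe,
};

static int __init example_ebus_init(void)
{
	/* Re-targets the driver to ibmebus_bus_type and creates devices
	 * for any matching nodes the bus has not seen yet. */
	return ibmebus_register_driver(&example_ebus_driver);
}
module_init(example_ebus_init);

static void __exit example_ebus_exit(void)
{
	ibmebus_unregister_driver(&example_ebus_driver);
}
module_exit(example_ebus_exit);

MODULE_LICENSE("GPL");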
+postcore_initcall(ibmebus_bus_init); diff --git a/arch/powerpc/platforms/pseries/io_event_irq.c b/arch/powerpc/platforms/pseries/io_event_irq.c new file mode 100644 index 000000000..f053bda64 --- /dev/null +++ b/arch/powerpc/platforms/pseries/io_event_irq.c @@ -0,0 +1,165 @@ +/* + * Copyright 2010 2011 Mark Nelson and Tseng-Hui (Frank) Lin, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/of.h> +#include <linux/list.h> +#include <linux/notifier.h> + +#include <asm/machdep.h> +#include <asm/rtas.h> +#include <asm/irq.h> +#include <asm/io_event_irq.h> + +#include "pseries.h" + +/* + * IO event interrupt is a mechanism provided by RTAS to return + * information about hardware error and non-error events. Device + * drivers can register their event handlers to receive events. + * Device drivers are expected to use atomic_notifier_chain_register() + * and atomic_notifier_chain_unregister() to register and unregister + * their event handlers. Since multiple IO event types and scopes + * share an IO event interrupt, the event handlers are called one + * by one until the IO event is claimed by one of the handlers. + * The event handlers are expected to return NOTIFY_OK if the + * event is handled by the event handler or NOTIFY_DONE if the + * event does not belong to the handler. + * + * Usage: + * + * Notifier function: + * #include <asm/io_event_irq.h> + * int event_handler(struct notifier_block *nb, unsigned long val, void *data) { + * p = (struct pseries_io_event_sect_data *) data; + * if (! is_my_event(p->scope, p->event_type)) return NOTIFY_DONE; + * : + * : + * return NOTIFY_OK; + * } + * struct notifier_block event_nb = { + * .notifier_call = event_handler, + * } + * + * Registration: + * atomic_notifier_chain_register(&pseries_ioei_notifier_list, &event_nb); + * + * Unregistration: + * atomic_notifier_chain_unregister(&pseries_ioei_notifier_list, &event_nb); + */ + +ATOMIC_NOTIFIER_HEAD(pseries_ioei_notifier_list); +EXPORT_SYMBOL_GPL(pseries_ioei_notifier_list); + +static int ioei_check_exception_token; + +static char ioei_rtas_buf[RTAS_DATA_BUF_SIZE] __cacheline_aligned; + +/** + * Find the data portion of an IO Event section from event log. + * @elog: RTAS error/event log. + * + * Return: + * pointer to a valid IO event section data. NULL if not found. + */ +static struct pseries_io_event * ioei_find_event(struct rtas_error_log *elog) +{ + struct pseries_errorlog *sect; + + /* We should only ever get called for io-event interrupts, but if + * we do get called for another type then something went wrong so + * make some noise about it. + * RTAS_TYPE_IO only exists in extended event log version 6 or later. + * No need to check event log version. + */ + if (unlikely(rtas_error_type(elog) != RTAS_TYPE_IO)) { + printk_once(KERN_WARNING"io_event_irq: Unexpected event type %d", + rtas_error_type(elog)); + return NULL; + } + + sect = get_pseries_errorlog(elog, PSERIES_ELOG_SECT_ID_IO_EVENT); + if (unlikely(!sect)) { + printk_once(KERN_WARNING "io_event_irq: RTAS extended event " + "log does not contain an IO Event section. 
" + "Could be a bug in system firmware!\n"); + return NULL; + } + return (struct pseries_io_event *) §->data; +} + +/* + * PAPR: + * - check-exception returns the first found error or event and clear that + * error or event so it is reported once. + * - Each interrupt returns one event. If a plateform chooses to report + * multiple events through a single interrupt, it must ensure that the + * interrupt remains asserted until check-exception has been used to + * process all out-standing events for that interrupt. + * + * Implementation notes: + * - Events must be processed in the order they are returned. Hence, + * sequential in nature. + * - The owner of an event is determined by combinations of scope, + * event type, and sub-type. There is no easy way to pre-sort clients + * by scope or event type alone. For example, Torrent ISR route change + * event is reported with scope 0x00 (Not Applicable) rather than + * 0x3B (Torrent-hub). It is better to let the clients to identify + * who owns the event. + */ + +static irqreturn_t ioei_interrupt(int irq, void *dev_id) +{ + struct pseries_io_event *event; + int rtas_rc; + + for (;;) { + rtas_rc = rtas_call(ioei_check_exception_token, 6, 1, NULL, + RTAS_VECTOR_EXTERNAL_INTERRUPT, + virq_to_hw(irq), + RTAS_IO_EVENTS, 1 /* Time Critical */, + __pa(ioei_rtas_buf), + RTAS_DATA_BUF_SIZE); + if (rtas_rc != 0) + break; + + event = ioei_find_event((struct rtas_error_log *)ioei_rtas_buf); + if (!event) + continue; + + atomic_notifier_call_chain(&pseries_ioei_notifier_list, + 0, event); + } + return IRQ_HANDLED; +} + +static int __init ioei_init(void) +{ + struct device_node *np; + + ioei_check_exception_token = rtas_token("check-exception"); + if (ioei_check_exception_token == RTAS_UNKNOWN_SERVICE) + return -ENODEV; + + np = of_find_node_by_path("/event-sources/ibm,io-events"); + if (np) { + request_event_sources_irqs(np, ioei_interrupt, "IO_EVENT"); + pr_info("IBM I/O event interrupts enabled\n"); + of_node_put(np); + } else { + return -ENODEV; + } + return 0; +} +machine_subsys_initcall(pseries, ioei_init); + diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c new file mode 100644 index 000000000..b1a08cb76 --- /dev/null +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -0,0 +1,1413 @@ +/* + * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation + * + * Rewrite, cleanup: + * + * Copyright (C) 2004 Olof Johansson <olof@lixom.net>, IBM Corporation + * Copyright (C) 2006 Olof Johansson <olof@lixom.net> + * + * Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR. + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/memblock.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <linux/crash_dump.h> +#include <linux/memory.h> +#include <linux/of.h> +#include <linux/iommu.h> +#include <linux/rculist.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/rtas.h> +#include <asm/iommu.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/firmware.h> +#include <asm/tce.h> +#include <asm/ppc-pci.h> +#include <asm/udbg.h> +#include <asm/mmzone.h> +#include <asm/plpar_wrappers.h> + +#include "pseries.h" + +static struct iommu_table_group *iommu_pseries_alloc_group(int node) +{ + struct iommu_table_group *table_group; + struct iommu_table *tbl; + struct iommu_table_group_link *tgl; + + table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL, + node); + if (!table_group) + return NULL; + + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node); + if (!tbl) + goto free_group; + + tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL, + node); + if (!tgl) + goto free_table; + + INIT_LIST_HEAD_RCU(&tbl->it_group_list); + kref_init(&tbl->it_kref); + tgl->table_group = table_group; + list_add_rcu(&tgl->next, &tbl->it_group_list); + + table_group->tables[0] = tbl; + + return table_group; + +free_table: + kfree(tbl); +free_group: + kfree(table_group); + return NULL; +} + +static void iommu_pseries_free_group(struct iommu_table_group *table_group, + const char *node_name) +{ + struct iommu_table *tbl; +#ifdef CONFIG_IOMMU_API + struct iommu_table_group_link *tgl; +#endif + + if (!table_group) + return; + + tbl = table_group->tables[0]; +#ifdef CONFIG_IOMMU_API + tgl = list_first_entry_or_null(&tbl->it_group_list, + struct iommu_table_group_link, next); + + WARN_ON_ONCE(!tgl); + if (tgl) { + list_del_rcu(&tgl->next); + kfree(tgl); + } + if (table_group->group) { + iommu_group_put(table_group->group); + BUG_ON(table_group->group); + } +#endif + iommu_tce_table_put(tbl); + + kfree(table_group); +} + +static int tce_build_pSeries(struct iommu_table *tbl, long index, + long npages, unsigned long uaddr, + enum dma_data_direction direction, + unsigned long attrs) +{ + u64 proto_tce; + __be64 *tcep, *tces; + u64 rpn; + + proto_tce = TCE_PCI_READ; // Read allowed + + if (direction != DMA_TO_DEVICE) + proto_tce |= TCE_PCI_WRITE; + + tces = tcep = ((__be64 *)tbl->it_base) + index; + + while (npages--) { + /* can't move this out since we might cross MEMBLOCK boundary */ + rpn = __pa(uaddr) >> TCE_SHIFT; + *tcep = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT); + + uaddr += TCE_PAGE_SIZE; + tcep++; + } + return 0; +} + + +static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages) +{ + __be64 *tcep, *tces; + + tces = tcep = ((__be64 *)tbl->it_base) + index; + + while (npages--) + *(tcep++) = 0; +} + +static unsigned long tce_get_pseries(struct iommu_table *tbl, long index) +{ + __be64 *tcep; + + tcep = ((__be64 *)tbl->it_base) + index; + + return be64_to_cpu(*tcep); +} + +static void tce_free_pSeriesLP(unsigned long liobn, long, long); +static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long); + +static int 
tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, + long npages, unsigned long uaddr, + enum dma_data_direction direction, + unsigned long attrs) +{ + u64 rc = 0; + u64 proto_tce, tce; + u64 rpn; + int ret = 0; + long tcenum_start = tcenum, npages_start = npages; + + rpn = __pa(uaddr) >> tceshift; + proto_tce = TCE_PCI_READ; + if (direction != DMA_TO_DEVICE) + proto_tce |= TCE_PCI_WRITE; + + while (npages--) { + tce = proto_tce | (rpn & TCE_RPN_MASK) << tceshift; + rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce); + + if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { + ret = (int)rc; + tce_free_pSeriesLP(liobn, tcenum_start, + (npages_start - (npages + 1))); + break; + } + + if (rc && printk_ratelimit()) { + printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); + printk("\tindex = 0x%llx\n", (u64)liobn); + printk("\ttcenum = 0x%llx\n", (u64)tcenum); + printk("\ttce val = 0x%llx\n", tce ); + dump_stack(); + } + + tcenum++; + rpn++; + } + return ret; +} + +static DEFINE_PER_CPU(__be64 *, tce_page); + +static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + long npages, unsigned long uaddr, + enum dma_data_direction direction, + unsigned long attrs) +{ + u64 rc = 0; + u64 proto_tce; + __be64 *tcep; + u64 rpn; + long l, limit; + long tcenum_start = tcenum, npages_start = npages; + int ret = 0; + unsigned long flags; + + if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) { + return tce_build_pSeriesLP(tbl->it_index, tcenum, + tbl->it_page_shift, npages, uaddr, + direction, attrs); + } + + local_irq_save(flags); /* to protect tcep and the page behind it */ + + tcep = __this_cpu_read(tce_page); + + /* This is safe to do since interrupts are off when we're called + * from iommu_alloc{,_sg}() + */ + if (!tcep) { + tcep = (__be64 *)__get_free_page(GFP_ATOMIC); + /* If allocation fails, fall back to the loop implementation */ + if (!tcep) { + local_irq_restore(flags); + return tce_build_pSeriesLP(tbl->it_index, tcenum, + tbl->it_page_shift, + npages, uaddr, direction, attrs); + } + __this_cpu_write(tce_page, tcep); + } + + rpn = __pa(uaddr) >> TCE_SHIFT; + proto_tce = TCE_PCI_READ; + if (direction != DMA_TO_DEVICE) + proto_tce |= TCE_PCI_WRITE; + + /* We can map max one pageful of TCEs at a time */ + do { + /* + * Set up the page with TCE data, looping through and setting + * the values. + */ + limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE); + + for (l = 0; l < limit; l++) { + tcep[l] = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT); + rpn++; + } + + rc = plpar_tce_put_indirect((u64)tbl->it_index, + (u64)tcenum << 12, + (u64)__pa(tcep), + limit); + + npages -= limit; + tcenum += limit; + } while (npages > 0 && !rc); + + local_irq_restore(flags); + + if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { + ret = (int)rc; + tce_freemulti_pSeriesLP(tbl, tcenum_start, + (npages_start - (npages + limit))); + return ret; + } + + if (rc && printk_ratelimit()) { + printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); + printk("\tindex = 0x%llx\n", (u64)tbl->it_index); + printk("\tnpages = 0x%llx\n", (u64)npages); + printk("\ttce[0] val = 0x%llx\n", tcep[0]); + dump_stack(); + } + return ret; +} + +static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long npages) +{ + u64 rc; + + while (npages--) { + rc = plpar_tce_put((u64)liobn, (u64)tcenum << 12, 0); + + if (rc && printk_ratelimit()) { + printk("tce_free_pSeriesLP: plpar_tce_put failed. 
rc=%lld\n", rc);
+			printk("\tindex = 0x%llx\n", (u64)liobn);
+			printk("\ttcenum = 0x%llx\n", (u64)tcenum);
+			dump_stack();
+		}
+
+		tcenum++;
+	}
+}
+
+static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
+{
+	u64 rc;
+
+	if (!firmware_has_feature(FW_FEATURE_MULTITCE))
+		return tce_free_pSeriesLP(tbl->it_index, tcenum, npages);
+
+	rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages);
+
+	if (rc && printk_ratelimit()) {
+		printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");
+		printk("\trc = %lld\n", rc);
+		printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
+		printk("\tnpages = 0x%llx\n", (u64)npages);
+		dump_stack();
+	}
+}
+
+static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)
+{
+	u64 rc;
+	unsigned long tce_ret;
+
+	rc = plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, &tce_ret);
+
+	if (rc && printk_ratelimit()) {
+		printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc);
+		printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
+		printk("\ttcenum = 0x%llx\n", (u64)tcenum);
+		dump_stack();
+	}
+
+	return tce_ret;
+}
+
+/* this is compatible with cells for the device tree property */
+struct dynamic_dma_window_prop {
+	__be32	liobn;		/* tce table number */
+	__be64	dma_base;	/* address hi,lo */
+	__be32	tce_shift;	/* ilog2(tce_page_size) */
+	__be32	window_shift;	/* ilog2(tce_window_size) */
+};
+
+struct direct_window {
+	struct device_node *device;
+	const struct dynamic_dma_window_prop *prop;
+	struct list_head list;
+};
+
+/* Dynamic DMA Window support */
+struct ddw_query_response {
+	u32 windows_available;
+	u32 largest_available_block;
+	u32 page_size;
+	u32 migration_capable;
+};
+
+struct ddw_create_response {
+	u32 liobn;
+	u32 addr_hi;
+	u32 addr_lo;
+};
+
+static LIST_HEAD(direct_window_list);
+/* prevents races between memory on/offline and window creation */
+static DEFINE_SPINLOCK(direct_window_list_lock);
+/* protects initializing window twice for same device */
+static DEFINE_MUTEX(direct_window_init_mutex);
+#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
+
+static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
+					unsigned long num_pfn, const void *arg)
+{
+	const struct dynamic_dma_window_prop *maprange = arg;
+	int rc;
+	u64 tce_size, num_tce, dma_offset, next;
+	u32 tce_shift;
+	long limit;
+
+	tce_shift = be32_to_cpu(maprange->tce_shift);
+	tce_size = 1ULL << tce_shift;
+	next = start_pfn << PAGE_SHIFT;
+	num_tce = num_pfn << PAGE_SHIFT;
+
+	/* round back to the beginning of the tce page size */
+	num_tce += next & (tce_size - 1);
+	next &= ~(tce_size - 1);
+
+	/* convert to number of tces */
+	num_tce |= tce_size - 1;
+	num_tce >>= tce_shift;
+
+	do {
+		/*
+		 * Clear the TCEs in batches; each plpar_tce_stuff() call
+		 * below stuffs the same (zero) TCE value into at most
+		 * 512 entries.
+ */
+ limit = min_t(long, num_tce, 512);
+ dma_offset = next + be64_to_cpu(maprange->dma_base);
+
+ rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn),
+ dma_offset,
+ 0, limit);
+ next += limit * tce_size;
+ num_tce -= limit;
+ } while (num_tce > 0 && !rc);
+
+ return rc;
+}
+
+static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
+ unsigned long num_pfn, const void *arg)
+{
+ const struct dynamic_dma_window_prop *maprange = arg;
+ u64 tce_size, num_tce, dma_offset, next, proto_tce, liobn;
+ __be64 *tcep;
+ u32 tce_shift;
+ u64 rc = 0;
+ long l, limit;
+
+ if (!firmware_has_feature(FW_FEATURE_MULTITCE)) {
+ unsigned long tceshift = be32_to_cpu(maprange->tce_shift);
+ unsigned long dmastart = (start_pfn << PAGE_SHIFT) +
+ be64_to_cpu(maprange->dma_base);
+ unsigned long tcenum = dmastart >> tceshift;
+ unsigned long npages = num_pfn << PAGE_SHIFT >> tceshift;
+ void *uaddr = __va(start_pfn << PAGE_SHIFT);
+
+ return tce_build_pSeriesLP(be32_to_cpu(maprange->liobn),
+ tcenum, tceshift, npages, (unsigned long) uaddr,
+ DMA_BIDIRECTIONAL, 0);
+ }
+
+ local_irq_disable(); /* to protect tcep and the page behind it */
+ tcep = __this_cpu_read(tce_page);
+
+ if (!tcep) {
+ tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
+ if (!tcep) {
+ local_irq_enable();
+ return -ENOMEM;
+ }
+ __this_cpu_write(tce_page, tcep);
+ }
+
+ proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;
+
+ liobn = (u64)be32_to_cpu(maprange->liobn);
+ tce_shift = be32_to_cpu(maprange->tce_shift);
+ tce_size = 1ULL << tce_shift;
+ next = start_pfn << PAGE_SHIFT;
+ num_tce = num_pfn << PAGE_SHIFT;
+
+ /* round back to the beginning of the tce page size */
+ num_tce += next & (tce_size - 1);
+ next &= ~(tce_size - 1);
+
+ /* convert to number of tces */
+ num_tce |= tce_size - 1;
+ num_tce >>= tce_shift;
+
+ /* We can map max one pageful of TCEs at a time */
+ do {
+ /*
+ * Set up the page with TCE data, looping through and setting
+ * the values.
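The batching that continues below hands the hypervisor at most one 4K page of 8-byte TCEs, i.e. 512 entries, per H_PUT_TCE_INDIRECT call. A hedged sketch of that chunking shape, with a printing stub standing in for the hcall and toy sizes:

#include <stdint.h>
#include <stdio.h>

#define TCE_ENTRY_SIZE 8
#define TCES_PER_PAGE (4096 / TCE_ENTRY_SIZE)   /* 512 */

/* Stand-in for plpar_tce_put_indirect(); always succeeds here. */
static long put_indirect_stub(uint64_t dma_offset, long count)
{
        printf("hcall: %ld TCEs at dma offset 0x%llx\n", count,
               (unsigned long long)dma_offset);
        return 0;
}

int main(void)
{
        long num_tce = 1300;            /* toy total */
        uint64_t tce_size = 1ULL << 12; /* 4K I/O pages */
        uint64_t next = 0;
        long rc = 0;

        /* at most one pageful of TCEs per call, as above */
        do {
                long limit = num_tce < TCES_PER_PAGE ? num_tce : TCES_PER_PAGE;

                rc = put_indirect_stub(next, limit);
                next += (uint64_t)limit * tce_size;
                num_tce -= limit;
        } while (num_tce > 0 && !rc);

        return rc ? 1 : 0;
}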
+ */ + limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE); + dma_offset = next + be64_to_cpu(maprange->dma_base); + + for (l = 0; l < limit; l++) { + tcep[l] = cpu_to_be64(proto_tce | next); + next += tce_size; + } + + rc = plpar_tce_put_indirect(liobn, + dma_offset, + (u64)__pa(tcep), + limit); + + num_tce -= limit; + } while (num_tce > 0 && !rc); + + /* error cleanup: caller will clear whole range */ + + local_irq_enable(); + return rc; +} + +static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn, + unsigned long num_pfn, void *arg) +{ + return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg); +} + +static void iommu_table_setparms(struct pci_controller *phb, + struct device_node *dn, + struct iommu_table *tbl) +{ + struct device_node *node; + const unsigned long *basep; + const u32 *sizep; + + node = phb->dn; + + basep = of_get_property(node, "linux,tce-base", NULL); + sizep = of_get_property(node, "linux,tce-size", NULL); + if (basep == NULL || sizep == NULL) { + printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %pOF has " + "missing tce entries !\n", dn); + return; + } + + tbl->it_base = (unsigned long)__va(*basep); + + if (!is_kdump_kernel()) + memset((void *)tbl->it_base, 0, *sizep); + + tbl->it_busno = phb->bus->number; + tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K; + + /* Units of tce entries */ + tbl->it_offset = phb->dma_window_base_cur >> tbl->it_page_shift; + + /* Test if we are going over 2GB of DMA space */ + if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) { + udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); + panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); + } + + phb->dma_window_base_cur += phb->dma_window_size; + + /* Set the tce table size - measured in entries */ + tbl->it_size = phb->dma_window_size >> tbl->it_page_shift; + + tbl->it_index = 0; + tbl->it_blocksize = 16; + tbl->it_type = TCE_PCI; +} + +/* + * iommu_table_setparms_lpar + * + * Function: On pSeries LPAR systems, return TCE table info, given a pci bus. + */ +static void iommu_table_setparms_lpar(struct pci_controller *phb, + struct device_node *dn, + struct iommu_table *tbl, + struct iommu_table_group *table_group, + const __be32 *dma_window) +{ + unsigned long offset, size; + + of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size); + + tbl->it_busno = phb->bus->number; + tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K; + tbl->it_base = 0; + tbl->it_blocksize = 16; + tbl->it_type = TCE_PCI; + tbl->it_offset = offset >> tbl->it_page_shift; + tbl->it_size = size >> tbl->it_page_shift; + + table_group->tce32_start = offset; + table_group->tce32_size = size; +} + +struct iommu_table_ops iommu_table_pseries_ops = { + .set = tce_build_pSeries, + .clear = tce_free_pSeries, + .get = tce_get_pseries +}; + +static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) +{ + struct device_node *dn; + struct iommu_table *tbl; + struct device_node *isa_dn, *isa_dn_orig; + struct device_node *tmp; + struct pci_dn *pci; + int children; + + dn = pci_bus_to_OF_node(bus); + + pr_debug("pci_dma_bus_setup_pSeries: setting up bus %pOF\n", dn); + + if (bus->self) { + /* This is not a root bus, any setup will be done for the + * device-side of the bridge in iommu_dev_setup_pSeries(). + */ + return; + } + pci = PCI_DN(dn); + + /* Check if the ISA bus on the system is under + * this PHB. 
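The window-sizing loop that follows halves the per-child window until all children fit in the available space, keeping each window a power of two (a pci_alloc_consistent requirement). A standalone sketch with a made-up child count:

#include <stdio.h>

int main(void)
{
        unsigned long long budget = 0x80000000ull;  /* 2GB to divide */
        unsigned long long window = 0x80000000ull;
        int children = 6;                           /* toy PHB with 6 slots */

        /* halve until all children fit, window stays a power of two */
        while (window * (unsigned long long)children > budget)
                window >>= 1;

        /* 6 children: 6 * 512MB = 3GB > 2GB, 6 * 256MB = 1.5GB fits */
        printf("%d children -> 0x%llx per window\n", children, window);
        return 0;
}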
+ */ + isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa"); + + while (isa_dn && isa_dn != dn) + isa_dn = isa_dn->parent; + + of_node_put(isa_dn_orig); + + /* Count number of direct PCI children of the PHB. */ + for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling) + children++; + + pr_debug("Children: %d\n", children); + + /* Calculate amount of DMA window per slot. Each window must be + * a power of two (due to pci_alloc_consistent requirements). + * + * Keep 256MB aside for PHBs with ISA. + */ + + if (!isa_dn) { + /* No ISA/IDE - just set window size and return */ + pci->phb->dma_window_size = 0x80000000ul; /* To be divided */ + + while (pci->phb->dma_window_size * children > 0x80000000ul) + pci->phb->dma_window_size >>= 1; + pr_debug("No ISA/IDE, window size is 0x%llx\n", + pci->phb->dma_window_size); + pci->phb->dma_window_base_cur = 0; + + return; + } + + /* If we have ISA, then we probably have an IDE + * controller too. Allocate a 128MB table but + * skip the first 128MB to avoid stepping on ISA + * space. + */ + pci->phb->dma_window_size = 0x8000000ul; + pci->phb->dma_window_base_cur = 0x8000000ul; + + pci->table_group = iommu_pseries_alloc_group(pci->phb->node); + tbl = pci->table_group->tables[0]; + + iommu_table_setparms(pci->phb, dn, tbl); + tbl->it_ops = &iommu_table_pseries_ops; + iommu_init_table(tbl, pci->phb->node); + iommu_register_group(pci->table_group, pci_domain_nr(bus), 0); + + /* Divide the rest (1.75GB) among the children */ + pci->phb->dma_window_size = 0x80000000ul; + while (pci->phb->dma_window_size * children > 0x70000000ul) + pci->phb->dma_window_size >>= 1; + + pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size); +} + +#ifdef CONFIG_IOMMU_API +static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned + long *tce, enum dma_data_direction *direction) +{ + long rc; + unsigned long ioba = (unsigned long) index << tbl->it_page_shift; + unsigned long flags, oldtce = 0; + u64 proto_tce = iommu_direction_to_tce_perm(*direction); + unsigned long newtce = *tce | proto_tce; + + spin_lock_irqsave(&tbl->large_pool.lock, flags); + + rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce); + if (!rc) + rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce); + + if (!rc) { + *direction = iommu_tce_direction(oldtce); + *tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE); + } + + spin_unlock_irqrestore(&tbl->large_pool.lock, flags); + + return rc; +} +#endif + +struct iommu_table_ops iommu_table_lpar_multi_ops = { + .set = tce_buildmulti_pSeriesLP, +#ifdef CONFIG_IOMMU_API + .exchange = tce_exchange_pseries, +#endif + .clear = tce_freemulti_pSeriesLP, + .get = tce_get_pSeriesLP +}; + +static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) +{ + struct iommu_table *tbl; + struct device_node *dn, *pdn; + struct pci_dn *ppci; + const __be32 *dma_window = NULL; + + dn = pci_bus_to_OF_node(bus); + + pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n", + dn); + + /* Find nearest ibm,dma-window, walking up the device tree */ + for (pdn = dn; pdn != NULL; pdn = pdn->parent) { + dma_window = of_get_property(pdn, "ibm,dma-window", NULL); + if (dma_window != NULL) + break; + } + + if (dma_window == NULL) { + pr_debug(" no ibm,dma-window property !\n"); + return; + } + + ppci = PCI_DN(pdn); + + pr_debug(" parent is %pOF, iommu_table: 0x%p\n", + pdn, ppci->table_group); + + if (!ppci->table_group) { + ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node); + tbl = ppci->table_group->tables[0]; + 
iommu_table_setparms_lpar(ppci->phb, pdn, tbl, + ppci->table_group, dma_window); + tbl->it_ops = &iommu_table_lpar_multi_ops; + iommu_init_table(tbl, ppci->phb->node); + iommu_register_group(ppci->table_group, + pci_domain_nr(bus), 0); + pr_debug(" created table: %p\n", ppci->table_group); + } +} + + +static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) +{ + struct device_node *dn; + struct iommu_table *tbl; + + pr_debug("pci_dma_dev_setup_pSeries: %s\n", pci_name(dev)); + + dn = dev->dev.of_node; + + /* If we're the direct child of a root bus, then we need to allocate + * an iommu table ourselves. The bus setup code should have setup + * the window sizes already. + */ + if (!dev->bus->self) { + struct pci_controller *phb = PCI_DN(dn)->phb; + + pr_debug(" --> first child, no bridge. Allocating iommu table.\n"); + PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node); + tbl = PCI_DN(dn)->table_group->tables[0]; + iommu_table_setparms(phb, dn, tbl); + tbl->it_ops = &iommu_table_pseries_ops; + iommu_init_table(tbl, phb->node); + iommu_register_group(PCI_DN(dn)->table_group, + pci_domain_nr(phb->bus), 0); + set_iommu_table_base(&dev->dev, tbl); + iommu_add_device(&dev->dev); + return; + } + + /* If this device is further down the bus tree, search upwards until + * an already allocated iommu table is found and use that. + */ + + while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL) + dn = dn->parent; + + if (dn && PCI_DN(dn)) { + set_iommu_table_base(&dev->dev, + PCI_DN(dn)->table_group->tables[0]); + iommu_add_device(&dev->dev); + } else + printk(KERN_WARNING "iommu: Device %s has no iommu table\n", + pci_name(dev)); +} + +static int __read_mostly disable_ddw; + +static int __init disable_ddw_setup(char *str) +{ + disable_ddw = 1; + printk(KERN_INFO "ppc iommu: disabling ddw.\n"); + + return 0; +} + +early_param("disable_ddw", disable_ddw_setup); + +static void remove_ddw(struct device_node *np, bool remove_prop) +{ + struct dynamic_dma_window_prop *dwp; + struct property *win64; + u32 ddw_avail[3]; + u64 liobn; + int ret = 0; + + ret = of_property_read_u32_array(np, "ibm,ddw-applicable", + &ddw_avail[0], 3); + + win64 = of_find_property(np, DIRECT64_PROPNAME, NULL); + if (!win64) + return; + + if (ret || win64->length < sizeof(*dwp)) + goto delprop; + + dwp = win64->value; + liobn = (u64)be32_to_cpu(dwp->liobn); + + /* clear the whole window, note the arg is in kernel pages */ + ret = tce_clearrange_multi_pSeriesLP(0, + 1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp); + if (ret) + pr_warn("%pOF failed to clear tces in window.\n", + np); + else + pr_debug("%pOF successfully cleared tces in window.\n", + np); + + ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn); + if (ret) + pr_warn("%pOF: failed to remove direct window: rtas returned " + "%d to ibm,remove-pe-dma-window(%x) %llx\n", + np, ret, ddw_avail[2], liobn); + else + pr_debug("%pOF: successfully removed direct window: rtas returned " + "%d to ibm,remove-pe-dma-window(%x) %llx\n", + np, ret, ddw_avail[2], liobn); + +delprop: + if (remove_prop) + ret = of_remove_property(np, win64); + if (ret) + pr_warn("%pOF: failed to remove direct window property: %d\n", + np, ret); +} + +static u64 find_existing_ddw(struct device_node *pdn) +{ + struct direct_window *window; + const struct dynamic_dma_window_prop *direct64; + u64 dma_addr = 0; + + spin_lock(&direct_window_list_lock); + /* check if we already created a window and dupe that config if so */ + list_for_each_entry(window, &direct_window_list, list) { + if 
(window->device == pdn) { + direct64 = window->prop; + dma_addr = be64_to_cpu(direct64->dma_base); + break; + } + } + spin_unlock(&direct_window_list_lock); + + return dma_addr; +} + +static int find_existing_ddw_windows(void) +{ + int len; + struct device_node *pdn; + struct direct_window *window; + const struct dynamic_dma_window_prop *direct64; + + if (!firmware_has_feature(FW_FEATURE_LPAR)) + return 0; + + for_each_node_with_property(pdn, DIRECT64_PROPNAME) { + direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len); + if (!direct64) + continue; + + window = kzalloc(sizeof(*window), GFP_KERNEL); + if (!window || len < sizeof(struct dynamic_dma_window_prop)) { + kfree(window); + remove_ddw(pdn, true); + continue; + } + + window->device = pdn; + window->prop = direct64; + spin_lock(&direct_window_list_lock); + list_add(&window->list, &direct_window_list); + spin_unlock(&direct_window_list_lock); + } + + return 0; +} +machine_arch_initcall(pseries, find_existing_ddw_windows); + +static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, + struct ddw_query_response *query) +{ + struct device_node *dn; + struct pci_dn *pdn; + u32 cfg_addr; + u64 buid; + int ret; + + /* + * Get the config address and phb buid of the PE window. + * Rely on eeh to retrieve this for us. + * Retrieve them from the pci device, not the node with the + * dma-window property + */ + dn = pci_device_to_OF_node(dev); + pdn = PCI_DN(dn); + buid = pdn->phb->buid; + cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8)); + + ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query, + cfg_addr, BUID_HI(buid), BUID_LO(buid)); + dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x" + " returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid), + BUID_LO(buid), ret); + return ret; +} + +static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, + struct ddw_create_response *create, int page_shift, + int window_shift) +{ + struct device_node *dn; + struct pci_dn *pdn; + u32 cfg_addr; + u64 buid; + int ret; + + /* + * Get the config address and phb buid of the PE window. + * Rely on eeh to retrieve this for us. + * Retrieve them from the pci device, not the node with the + * dma-window property + */ + dn = pci_device_to_OF_node(dev); + pdn = PCI_DN(dn); + buid = pdn->phb->buid; + cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8)); + + do { + /* extra outputs are LIOBN and dma-addr (hi, lo) */ + ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create, + cfg_addr, BUID_HI(buid), BUID_LO(buid), + page_shift, window_shift); + } while (rtas_busy_delay(ret)); + dev_info(&dev->dev, + "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d " + "(liobn = 0x%x starting addr = %x %x)\n", ddw_avail[1], + cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift, + window_shift, ret, create->liobn, create->addr_hi, create->addr_lo); + + return ret; +} + +struct failed_ddw_pdn { + struct device_node *pdn; + struct list_head list; +}; + +static LIST_HEAD(failed_ddw_pdn_list); + +/* + * If the PE supports dynamic dma windows, and there is space for a table + * that can map all pages in a linear offset, then setup such a table, + * and record the dma-offset in the struct device. 
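query_ddw() and create_ddw() above both pack the PE's config address from busno/devfn and split the 64-bit PHB BUID into two 32-bit RTAS arguments. A self-contained illustration with hypothetical values (the BUID_HI/BUID_LO macros are re-defined locally for the sketch):

#include <stdint.h>
#include <stdio.h>

#define BUID_HI(buid) ((uint32_t)((buid) >> 32))
#define BUID_LO(buid) ((uint32_t)(buid))

int main(void)
{
        uint64_t buid = 0x800000020000201ull;   /* hypothetical PHB BUID */
        unsigned int busno = 0x42, devfn = 0x08;

        /* bus number in bits 16-23, devfn in bits 8-15, as above */
        uint32_t cfg_addr = (busno << 16) | (devfn << 8);

        printf("cfg_addr 0x%x, buid hi 0x%x lo 0x%x\n",
               cfg_addr, BUID_HI(buid), BUID_LO(buid));
        return 0;
}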
+ * + * dev: the pci device we are checking + * pdn: the parent pe node with the ibm,dma_window property + * Future: also check if we can remap the base window for our base page size + * + * returns the dma offset for use by dma_set_mask + */ +static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) +{ + int len, ret; + struct ddw_query_response query; + struct ddw_create_response create; + int page_shift; + u64 dma_addr, max_addr; + struct device_node *dn; + u32 ddw_avail[3]; + struct direct_window *window; + struct property *win64; + struct dynamic_dma_window_prop *ddwprop; + struct failed_ddw_pdn *fpdn; + + mutex_lock(&direct_window_init_mutex); + + dma_addr = find_existing_ddw(pdn); + if (dma_addr != 0) + goto out_unlock; + + /* + * If we already went through this for a previous function of + * the same device and failed, we don't want to muck with the + * DMA window again, as it will race with in-flight operations + * and can lead to EEHs. The above mutex protects access to the + * list. + */ + list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) { + if (fpdn->pdn == pdn) + goto out_unlock; + } + + /* + * the ibm,ddw-applicable property holds the tokens for: + * ibm,query-pe-dma-window + * ibm,create-pe-dma-window + * ibm,remove-pe-dma-window + * for the given node in that order. + * the property is actually in the parent, not the PE + */ + ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable", + &ddw_avail[0], 3); + if (ret) + goto out_failed; + + /* + * Query if there is a second window of size to map the + * whole partition. Query returns number of windows, largest + * block assigned to PE (partition endpoint), and two bitmasks + * of page sizes: supported and supported for migrate-dma. + */ + dn = pci_device_to_OF_node(dev); + ret = query_ddw(dev, ddw_avail, &query); + if (ret != 0) + goto out_failed; + + if (query.windows_available == 0) { + /* + * no additional windows are available for this device. + * We might be able to reallocate the existing window, + * trading in for a larger page size. 
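The page-size selection and coverage check that follow can be exercised in isolation. This sketch uses invented query results and mirrors the mask convention in the code below (bit 2 = 16MB, bit 1 = 64K, bit 0 = 4K):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t page_size_mask = 5;              /* toy: 16MB and 4K offered */
        uint32_t largest_available_block = 1u << 21;
        uint64_t max_addr = 1ull << 44;           /* toy: 16TB of RAM */
        int page_shift;

        /* prefer the largest I/O page the firmware offers */
        if (page_size_mask & 4)
                page_shift = 24;                  /* 16MB */
        else if (page_size_mask & 2)
                page_shift = 16;                  /* 64kB */
        else if (page_size_mask & 1)
                page_shift = 12;                  /* 4kB */
        else
                return 1;                         /* nothing usable */

        /* the window's largest block must reach max memory */
        if ((uint64_t)largest_available_block < (max_addr >> page_shift)) {
                printf("window cannot cover 0x%llx\n",
                       (unsigned long long)max_addr);
                return 1;
        }
        printf("using %d-bit pages, need %llu TCEs\n", page_shift,
               (unsigned long long)(max_addr >> page_shift));
        return 0;
}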
+ */ + dev_dbg(&dev->dev, "no free dynamic windows"); + goto out_failed; + } + if (query.page_size & 4) { + page_shift = 24; /* 16MB */ + } else if (query.page_size & 2) { + page_shift = 16; /* 64kB */ + } else if (query.page_size & 1) { + page_shift = 12; /* 4kB */ + } else { + dev_dbg(&dev->dev, "no supported direct page size in mask %x", + query.page_size); + goto out_failed; + } + /* verify the window * number of ptes will map the partition */ + /* check largest block * page size > max memory hotplug addr */ + max_addr = memory_hotplug_max(); + if (query.largest_available_block < (max_addr >> page_shift)) { + dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u " + "%llu-sized pages\n", max_addr, query.largest_available_block, + 1ULL << page_shift); + goto out_failed; + } + len = order_base_2(max_addr); + win64 = kzalloc(sizeof(struct property), GFP_KERNEL); + if (!win64) { + dev_info(&dev->dev, + "couldn't allocate property for 64bit dma window\n"); + goto out_failed; + } + win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL); + win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL); + win64->length = sizeof(*ddwprop); + if (!win64->name || !win64->value) { + dev_info(&dev->dev, + "couldn't allocate property name and value\n"); + goto out_free_prop; + } + + ret = create_ddw(dev, ddw_avail, &create, page_shift, len); + if (ret != 0) + goto out_free_prop; + + ddwprop->liobn = cpu_to_be32(create.liobn); + ddwprop->dma_base = cpu_to_be64(((u64)create.addr_hi << 32) | + create.addr_lo); + ddwprop->tce_shift = cpu_to_be32(page_shift); + ddwprop->window_shift = cpu_to_be32(len); + + dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n", + create.liobn, dn); + + window = kzalloc(sizeof(*window), GFP_KERNEL); + if (!window) + goto out_clear_window; + + ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT, + win64->value, tce_setrange_multi_pSeriesLP_walk); + if (ret) { + dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n", + dn, ret); + goto out_free_window; + } + + ret = of_add_property(pdn, win64); + if (ret) { + dev_err(&dev->dev, "unable to add dma window property for %pOF: %d", + pdn, ret); + goto out_free_window; + } + + window->device = pdn; + window->prop = ddwprop; + spin_lock(&direct_window_list_lock); + list_add(&window->list, &direct_window_list); + spin_unlock(&direct_window_list_lock); + + dma_addr = be64_to_cpu(ddwprop->dma_base); + goto out_unlock; + +out_free_window: + kfree(window); + +out_clear_window: + remove_ddw(pdn, true); + +out_free_prop: + kfree(win64->name); + kfree(win64->value); + kfree(win64); + +out_failed: + + fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL); + if (!fpdn) + goto out_unlock; + fpdn->pdn = pdn; + list_add(&fpdn->list, &failed_ddw_pdn_list); + +out_unlock: + mutex_unlock(&direct_window_init_mutex); + return dma_addr; +} + +static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) +{ + struct device_node *pdn, *dn; + struct iommu_table *tbl; + const __be32 *dma_window = NULL; + struct pci_dn *pci; + + pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev)); + + /* dev setup for LPAR is a little tricky, since the device tree might + * contain the dma-window properties per-device and not necessarily + * for the bus. So we need to search upwards in the tree until we + * either hit a dma-window property, OR find a parent with a table + * already allocated. 
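The upward walk described in this comment is a plain parent-chain search. A toy model (struct toy_node is invented for the sketch; it only mimics the parent link and the presence of ibm,dma-window):

#include <stdio.h>

/* Toy stand-in for struct device_node: a parent link and a flag. */
struct toy_node {
        const char *name;
        struct toy_node *parent;
        int has_dma_window;      /* models the ibm,dma-window property */
};

/* Climb towards the root until a node carrying the property is found. */
static struct toy_node *find_dma_window(struct toy_node *dn)
{
        for (; dn != NULL; dn = dn->parent)
                if (dn->has_dma_window)
                        return dn;
        return NULL;
}

int main(void)
{
        struct toy_node phb = { "phb", NULL, 1 };
        struct toy_node bridge = { "bridge", &phb, 0 };
        struct toy_node dev = { "dev", &bridge, 0 };
        struct toy_node *found = find_dma_window(&dev);

        printf("window on: %s\n", found ? found->name : "(none)");
        return 0;
}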
+ */ + dn = pci_device_to_OF_node(dev); + pr_debug(" node is %pOF\n", dn); + + for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group; + pdn = pdn->parent) { + dma_window = of_get_property(pdn, "ibm,dma-window", NULL); + if (dma_window) + break; + } + + if (!pdn || !PCI_DN(pdn)) { + printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: " + "no DMA window found for pci dev=%s dn=%pOF\n", + pci_name(dev), dn); + return; + } + pr_debug(" parent is %pOF\n", pdn); + + pci = PCI_DN(pdn); + if (!pci->table_group) { + pci->table_group = iommu_pseries_alloc_group(pci->phb->node); + tbl = pci->table_group->tables[0]; + iommu_table_setparms_lpar(pci->phb, pdn, tbl, + pci->table_group, dma_window); + tbl->it_ops = &iommu_table_lpar_multi_ops; + iommu_init_table(tbl, pci->phb->node); + iommu_register_group(pci->table_group, + pci_domain_nr(pci->phb->bus), 0); + pr_debug(" created table: %p\n", pci->table_group); + } else { + pr_debug(" found DMA window, table: %p\n", pci->table_group); + } + + set_iommu_table_base(&dev->dev, pci->table_group->tables[0]); + iommu_add_device(&dev->dev); +} + +static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) +{ + bool ddw_enabled = false; + struct device_node *pdn, *dn; + struct pci_dev *pdev; + const __be32 *dma_window = NULL; + u64 dma_offset; + + if (!dev->dma_mask) + return -EIO; + + if (!dev_is_pci(dev)) + goto check_mask; + + pdev = to_pci_dev(dev); + + /* only attempt to use a new window if 64-bit DMA is requested */ + if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) { + dn = pci_device_to_OF_node(pdev); + dev_dbg(dev, "node is %pOF\n", dn); + + /* + * the device tree might contain the dma-window properties + * per-device and not necessarily for the bus. So we need to + * search upwards in the tree until we either hit a dma-window + * property, OR find a parent with a table already allocated. 
+ */ + for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group; + pdn = pdn->parent) { + dma_window = of_get_property(pdn, "ibm,dma-window", NULL); + if (dma_window) + break; + } + if (pdn && PCI_DN(pdn)) { + dma_offset = enable_ddw(pdev, pdn); + if (dma_offset != 0) { + dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset); + set_dma_offset(dev, dma_offset); + set_dma_ops(dev, &dma_nommu_ops); + ddw_enabled = true; + } + } + } + + /* fall back on iommu ops */ + if (!ddw_enabled && get_dma_ops(dev) != &dma_iommu_ops) { + dev_info(dev, "Restoring 32-bit DMA via iommu\n"); + set_dma_ops(dev, &dma_iommu_ops); + } + +check_mask: + if (!dma_supported(dev, dma_mask)) + return -EIO; + + *dev->dma_mask = dma_mask; + return 0; +} + +static u64 dma_get_required_mask_pSeriesLP(struct device *dev) +{ + if (!dev->dma_mask) + return 0; + + if (!disable_ddw && dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(dev); + struct device_node *dn; + + dn = pci_device_to_OF_node(pdev); + + /* search upwards for ibm,dma-window */ + for (; dn && PCI_DN(dn) && !PCI_DN(dn)->table_group; + dn = dn->parent) + if (of_get_property(dn, "ibm,dma-window", NULL)) + break; + /* if there is a ibm,ddw-applicable property require 64 bits */ + if (dn && PCI_DN(dn) && + of_get_property(dn, "ibm,ddw-applicable", NULL)) + return DMA_BIT_MASK(64); + } + + return dma_iommu_ops.get_required_mask(dev); +} + +static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct direct_window *window; + struct memory_notify *arg = data; + int ret = 0; + + switch (action) { + case MEM_GOING_ONLINE: + spin_lock(&direct_window_list_lock); + list_for_each_entry(window, &direct_window_list, list) { + ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn, + arg->nr_pages, window->prop); + /* XXX log error */ + } + spin_unlock(&direct_window_list_lock); + break; + case MEM_CANCEL_ONLINE: + case MEM_OFFLINE: + spin_lock(&direct_window_list_lock); + list_for_each_entry(window, &direct_window_list, list) { + ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn, + arg->nr_pages, window->prop); + /* XXX log error */ + } + spin_unlock(&direct_window_list_lock); + break; + default: + break; + } + if (ret && action != MEM_CANCEL_ONLINE) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + +static struct notifier_block iommu_mem_nb = { + .notifier_call = iommu_mem_notifier, +}; + +static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *data) +{ + int err = NOTIFY_OK; + struct of_reconfig_data *rd = data; + struct device_node *np = rd->dn; + struct pci_dn *pci = PCI_DN(np); + struct direct_window *window; + + switch (action) { + case OF_RECONFIG_DETACH_NODE: + /* + * Removing the property will invoke the reconfig + * notifier again, which causes dead-lock on the + * read-write semaphore of the notifier chain. So + * we have to remove the property when releasing + * the device node. + */ + remove_ddw(np, false); + if (pci && pci->table_group) + iommu_pseries_free_group(pci->table_group, + np->full_name); + + spin_lock(&direct_window_list_lock); + list_for_each_entry(window, &direct_window_list, list) { + if (window->device == np) { + list_del(&window->list); + kfree(window); + break; + } + } + spin_unlock(&direct_window_list_lock); + break; + default: + err = NOTIFY_DONE; + break; + } + return err; +} + +static struct notifier_block iommu_reconfig_nb = { + .notifier_call = iommu_reconfig_notifier, +}; + +/* These are called very early. 
*/ +void iommu_init_early_pSeries(void) +{ + if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL)) + return; + + if (firmware_has_feature(FW_FEATURE_LPAR)) { + pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP; + pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP; + ppc_md.dma_set_mask = dma_set_mask_pSeriesLP; + ppc_md.dma_get_required_mask = dma_get_required_mask_pSeriesLP; + } else { + pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries; + pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries; + } + + + of_reconfig_notifier_register(&iommu_reconfig_nb); + register_memory_notifier(&iommu_mem_nb); + + set_pci_dma_ops(&dma_iommu_ops); +} + +static int __init disable_multitce(char *str) +{ + if (strcmp(str, "off") == 0 && + firmware_has_feature(FW_FEATURE_LPAR) && + firmware_has_feature(FW_FEATURE_MULTITCE)) { + printk(KERN_INFO "Disabling MULTITCE firmware feature\n"); + powerpc_firmware_features &= ~FW_FEATURE_MULTITCE; + } + return 1; +} + +__setup("multitce=", disable_multitce); + +machine_subsys_initcall_sync(pseries, tce_iommu_bus_notifier_init); diff --git a/arch/powerpc/platforms/pseries/kexec.c b/arch/powerpc/platforms/pseries/kexec.c new file mode 100644 index 000000000..23f54223e --- /dev/null +++ b/arch/powerpc/platforms/pseries/kexec.c @@ -0,0 +1,67 @@ +/* + * Copyright 2006 Michael Ellerman, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/interrupt.h> + +#include <asm/machdep.h> +#include <asm/page.h> +#include <asm/firmware.h> +#include <asm/kexec.h> +#include <asm/xics.h> +#include <asm/xive.h> +#include <asm/smp.h> +#include <asm/plpar_wrappers.h> + +#include "pseries.h" + +void pseries_kexec_cpu_down(int crash_shutdown, int secondary) +{ + /* + * Don't risk a hypervisor call if we're crashing + * XXX: Why? The hypervisor is not crashing. It might be better + * to at least attempt unregister to avoid the hypervisor stepping + * on our memory. + */ + if (firmware_has_feature(FW_FEATURE_SPLPAR) && !crash_shutdown) { + int ret; + int cpu = smp_processor_id(); + int hwcpu = hard_smp_processor_id(); + + if (get_lppaca()->dtl_enable_mask) { + ret = unregister_dtl(hwcpu); + if (ret) { + pr_err("WARNING: DTL deregistration for cpu " + "%d (hw %d) failed with %d\n", + cpu, hwcpu, ret); + } + } + + ret = unregister_slb_shadow(hwcpu); + if (ret) { + pr_err("WARNING: SLB shadow buffer deregistration " + "for cpu %d (hw %d) failed with %d\n", + cpu, hwcpu, ret); + } + + ret = unregister_vpa(hwcpu); + if (ret) { + pr_err("WARNING: VPA deregistration for cpu %d " + "(hw %d) failed with %d\n", cpu, hwcpu, ret); + } + } + + if (xive_enabled()) { + xive_teardown_cpu(); + + if (!secondary) + xive_shutdown(); + } else + xics_kexec_teardown_cpu(secondary); +} diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c new file mode 100644 index 000000000..d660a9061 --- /dev/null +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -0,0 +1,1088 @@ +/* + * pSeries_lpar.c + * Copyright (C) 2001 Todd Inglett, IBM Corporation + * + * pSeries LPAR support. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* Enables debugging of low-level hash table routines - careful! */ +#undef DEBUG +#define pr_fmt(fmt) "lpar: " fmt + +#include <linux/kernel.h> +#include <linux/dma-mapping.h> +#include <linux/console.h> +#include <linux/export.h> +#include <linux/jump_label.h> +#include <linux/delay.h> +#include <linux/stop_machine.h> +#include <asm/processor.h> +#include <asm/mmu.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/machdep.h> +#include <asm/mmu_context.h> +#include <asm/iommu.h> +#include <asm/tlb.h> +#include <asm/prom.h> +#include <asm/cputable.h> +#include <asm/udbg.h> +#include <asm/smp.h> +#include <asm/trace.h> +#include <asm/firmware.h> +#include <asm/plpar_wrappers.h> +#include <asm/kexec.h> +#include <asm/fadump.h> +#include <asm/asm-prototypes.h> +#include <asm/debugfs.h> + +#include "pseries.h" + +/* Flag bits for H_BULK_REMOVE */ +#define HBR_REQUEST 0x4000000000000000UL +#define HBR_RESPONSE 0x8000000000000000UL +#define HBR_END 0xc000000000000000UL +#define HBR_AVPN 0x0200000000000000UL +#define HBR_ANDCOND 0x0100000000000000UL + + +/* in hvCall.S */ +EXPORT_SYMBOL(plpar_hcall); +EXPORT_SYMBOL(plpar_hcall9); +EXPORT_SYMBOL(plpar_hcall_norets); + +void vpa_init(int cpu) +{ + int hwcpu = get_hard_smp_processor_id(cpu); + unsigned long addr; + long ret; + struct paca_struct *pp; + struct dtl_entry *dtl; + + /* + * The spec says it "may be problematic" if CPU x registers the VPA of + * CPU y. We should never do that, but wail if we ever do. + */ + WARN_ON(cpu != smp_processor_id()); + + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + lppaca_of(cpu).vmxregs_in_use = 1; + + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + lppaca_of(cpu).ebb_regs_in_use = 1; + + addr = __pa(&lppaca_of(cpu)); + ret = register_vpa(hwcpu, addr); + + if (ret) { + pr_err("WARNING: VPA registration for cpu %d (hw %d) of area " + "%lx failed with %ld\n", cpu, hwcpu, addr, ret); + return; + } + +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * PAPR says this feature is SLB-Buffer but firmware never + * reports that. All SPLPAR support SLB shadow buffer. + */ + if (!radix_enabled() && firmware_has_feature(FW_FEATURE_SPLPAR)) { + addr = __pa(paca_ptrs[cpu]->slb_shadow_ptr); + ret = register_slb_shadow(hwcpu, addr); + if (ret) + pr_err("WARNING: SLB shadow buffer registration for " + "cpu %d (hw %d) of area %lx failed with %ld\n", + cpu, hwcpu, addr, ret); + } +#endif /* CONFIG_PPC_BOOK3S_64 */ + + /* + * Register dispatch trace log, if one has been allocated. 
+ */ + pp = paca_ptrs[cpu]; + dtl = pp->dispatch_log; + if (dtl) { + pp->dtl_ridx = 0; + pp->dtl_curr = dtl; + lppaca_of(cpu).dtl_idx = 0; + + /* hypervisor reads buffer length from this field */ + dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES); + ret = register_dtl(hwcpu, __pa(dtl)); + if (ret) + pr_err("WARNING: DTL registration of cpu %d (hw %d) " + "failed with %ld\n", smp_processor_id(), + hwcpu, ret); + lppaca_of(cpu).dtl_enable_mask = 2; + } +} + +#ifdef CONFIG_PPC_BOOK3S_64 + +static long pSeries_lpar_hpte_insert(unsigned long hpte_group, + unsigned long vpn, unsigned long pa, + unsigned long rflags, unsigned long vflags, + int psize, int apsize, int ssize) +{ + unsigned long lpar_rc; + unsigned long flags; + unsigned long slot; + unsigned long hpte_v, hpte_r; + + if (!(vflags & HPTE_V_BOLTED)) + pr_devel("hpte_insert(group=%lx, vpn=%016lx, " + "pa=%016lx, rflags=%lx, vflags=%lx, psize=%d)\n", + hpte_group, vpn, pa, rflags, vflags, psize); + + hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; + hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; + + if (!(vflags & HPTE_V_BOLTED)) + pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r); + + /* Now fill in the actual HPTE */ + /* Set CEC cookie to 0 */ + /* Zero page = 0 */ + /* I-cache Invalidate = 0 */ + /* I-cache synchronize = 0 */ + /* Exact = 0 */ + flags = 0; + + if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N)) + flags |= H_COALESCE_CAND; + + lpar_rc = plpar_pte_enter(flags, hpte_group, hpte_v, hpte_r, &slot); + if (unlikely(lpar_rc == H_PTEG_FULL)) { + pr_devel("Hash table group is full\n"); + return -1; + } + + /* + * Since we try and ioremap PHBs we don't own, the pte insert + * will fail. However we must catch the failure in hash_page + * or we will loop forever, so return -2 in this case. + */ + if (unlikely(lpar_rc != H_SUCCESS)) { + pr_err("Failed hash pte insert with error %ld\n", lpar_rc); + return -2; + } + if (!(vflags & HPTE_V_BOLTED)) + pr_devel(" -> slot: %lu\n", slot & 7); + + /* Because of iSeries, we have to pass down the secondary + * bucket bit here as well + */ + return (slot & 7) | (!!(vflags & HPTE_V_SECONDARY) << 3); +} + +static DEFINE_SPINLOCK(pSeries_lpar_tlbie_lock); + +static long pSeries_lpar_hpte_remove(unsigned long hpte_group) +{ + unsigned long slot_offset; + unsigned long lpar_rc; + int i; + unsigned long dummy1, dummy2; + + /* pick a random slot to start at */ + slot_offset = mftb() & 0x7; + + for (i = 0; i < HPTES_PER_GROUP; i++) { + + /* don't remove a bolted entry */ + lpar_rc = plpar_pte_remove(H_ANDCOND, hpte_group + slot_offset, + (0x1UL << 4), &dummy1, &dummy2); + if (lpar_rc == H_SUCCESS) + return i; + + /* + * The test for adjunct partition is performed before the + * ANDCOND test. H_RESOURCE may be returned, so we need to + * check for that as well. 
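The eviction loop in pSeries_lpar_hpte_remove() starts at a pseudo-random slot and walks all eight slots of the group, wrapping with a mask. A standalone sketch of just that scan order, with clock() standing in for mftb():

#include <stdio.h>
#include <time.h>

#define HPTES_PER_GROUP 8

int main(void)
{
        unsigned long slot_offset = (unsigned long)clock() & 0x7;
        int i;

        for (i = 0; i < HPTES_PER_GROUP; i++) {
                printf("try slot %lu\n", slot_offset);
                /* a real caller would attempt the eviction here */
                slot_offset++;
                slot_offset &= 0x7;
        }
        return 0;
}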
+ */ + BUG_ON(lpar_rc != H_NOT_FOUND && lpar_rc != H_RESOURCE); + + slot_offset++; + slot_offset &= 0x7; + } + + return -1; +} + +static void manual_hpte_clear_all(void) +{ + unsigned long size_bytes = 1UL << ppc64_pft_size; + unsigned long hpte_count = size_bytes >> 4; + struct { + unsigned long pteh; + unsigned long ptel; + } ptes[4]; + long lpar_rc; + unsigned long i, j; + + /* Read in batches of 4, + * invalidate only valid entries not in the VRMA + * hpte_count will be a multiple of 4 + */ + for (i = 0; i < hpte_count; i += 4) { + lpar_rc = plpar_pte_read_4_raw(0, i, (void *)ptes); + if (lpar_rc != H_SUCCESS) { + pr_info("Failed to read hash page table at %ld err %ld\n", + i, lpar_rc); + continue; + } + for (j = 0; j < 4; j++){ + if ((ptes[j].pteh & HPTE_V_VRMA_MASK) == + HPTE_V_VRMA_MASK) + continue; + if (ptes[j].pteh & HPTE_V_VALID) + plpar_pte_remove_raw(0, i + j, 0, + &(ptes[j].pteh), &(ptes[j].ptel)); + } + } +} + +static int hcall_hpte_clear_all(void) +{ + int rc; + + do { + rc = plpar_hcall_norets(H_CLEAR_HPT); + } while (rc == H_CONTINUE); + + return rc; +} + +static void pseries_hpte_clear_all(void) +{ + int rc; + + rc = hcall_hpte_clear_all(); + if (rc != H_SUCCESS) + manual_hpte_clear_all(); + +#ifdef __LITTLE_ENDIAN__ + /* + * Reset exceptions to big endian. + * + * FIXME this is a hack for kexec, we need to reset the exception + * endian before starting the new kernel and this is a convenient place + * to do it. + * + * This is also called on boot when a fadump happens. In that case we + * must not change the exception endian mode. + */ + if (firmware_has_feature(FW_FEATURE_SET_MODE) && !is_fadump_active()) + pseries_big_endian_exceptions(); +#endif +} + +/* + * NOTE: for updatepp ops we are fortunate that the linux "newpp" bits and + * the low 3 bits of flags happen to line up. So no transform is needed. + * We can probably optimize here and assume the high bits of newpp are + * already zero. For now I am paranoid. 
+ */ +static long pSeries_lpar_hpte_updatepp(unsigned long slot, + unsigned long newpp, + unsigned long vpn, + int psize, int apsize, + int ssize, unsigned long inv_flags) +{ + unsigned long lpar_rc; + unsigned long flags; + unsigned long want_v; + + want_v = hpte_encode_avpn(vpn, psize, ssize); + + flags = (newpp & 7) | H_AVPN; + if (mmu_has_feature(MMU_FTR_KERNEL_RO)) + /* Move pp0 into bit 8 (IBM 55) */ + flags |= (newpp & HPTE_R_PP0) >> 55; + + pr_devel(" update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...", + want_v, slot, flags, psize); + + lpar_rc = plpar_pte_protect(flags, slot, want_v); + + if (lpar_rc == H_NOT_FOUND) { + pr_devel("not found !\n"); + return -1; + } + + pr_devel("ok\n"); + + BUG_ON(lpar_rc != H_SUCCESS); + + return 0; +} + +static long __pSeries_lpar_hpte_find(unsigned long want_v, unsigned long hpte_group) +{ + long lpar_rc; + unsigned long i, j; + struct { + unsigned long pteh; + unsigned long ptel; + } ptes[4]; + + for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) { + + lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes); + if (lpar_rc != H_SUCCESS) { + pr_info("Failed to read hash page table at %ld err %ld\n", + hpte_group, lpar_rc); + continue; + } + + for (j = 0; j < 4; j++) { + if (HPTE_V_COMPARE(ptes[j].pteh, want_v) && + (ptes[j].pteh & HPTE_V_VALID)) + return i + j; + } + } + + return -1; +} + +static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize) +{ + long slot; + unsigned long hash; + unsigned long want_v; + unsigned long hpte_group; + + hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); + + /* Bolted entries are always in the primary group */ + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = __pSeries_lpar_hpte_find(want_v, hpte_group); + if (slot < 0) + return -1; + return hpte_group + slot; +} + +static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, + unsigned long ea, + int psize, int ssize) +{ + unsigned long vpn; + unsigned long lpar_rc, slot, vsid, flags; + + vsid = get_kernel_vsid(ea, ssize); + vpn = hpt_vpn(ea, vsid, ssize); + + slot = pSeries_lpar_hpte_find(vpn, psize, ssize); + BUG_ON(slot == -1); + + flags = newpp & 7; + if (mmu_has_feature(MMU_FTR_KERNEL_RO)) + /* Move pp0 into bit 8 (IBM 55) */ + flags |= (newpp & HPTE_R_PP0) >> 55; + + lpar_rc = plpar_pte_protect(flags, slot, 0); + + BUG_ON(lpar_rc != H_SUCCESS); +} + +static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn, + int psize, int apsize, + int ssize, int local) +{ + unsigned long want_v; + unsigned long lpar_rc; + unsigned long dummy1, dummy2; + + pr_devel(" inval : slot=%lx, vpn=%016lx, psize: %d, local: %d\n", + slot, vpn, psize, local); + + want_v = hpte_encode_avpn(vpn, psize, ssize); + lpar_rc = plpar_pte_remove(H_AVPN, slot, want_v, &dummy1, &dummy2); + if (lpar_rc == H_NOT_FOUND) + return; + + BUG_ON(lpar_rc != H_SUCCESS); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need + * to make sure that we avoid bouncing the hypervisor tlbie lock. 
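Both the hugepage path below and pSeries_lpar_flush_hash_range() later in this file accumulate H_BULK_REMOVE requests two parameter words at a time, flush when eight words are full, and close a partial batch with HBR_END. A hedged sketch of that packing, with a printing stub instead of plpar_hcall9() and a made-up AVPN:

#include <stdint.h>
#include <stdio.h>

#define HBR_REQUEST 0x4000000000000000ull
#define HBR_AVPN    0x0200000000000000ull
#define HBR_END     0xc000000000000000ull

/* Stand-in for the H_BULK_REMOVE hcall; just reports the batch. */
static void bulk_remove_stub(const uint64_t *param, int nwords)
{
        printf("flush batch of %d words\n", nwords);
        (void)param;
}

int main(void)
{
        uint64_t param[9];
        int pix = 0;
        unsigned long slot;

        for (slot = 100; slot < 110; slot++) {     /* 10 toy entries */
                /* two words per HPTE: request+slot, then the AVPN */
                param[pix] = HBR_REQUEST | HBR_AVPN | slot;
                param[pix + 1] = 0xdeadbeefull;    /* toy encoded AVPN */
                pix += 2;
                if (pix == 8) {                    /* full batch of 4 HPTEs */
                        bulk_remove_stub(param, pix);
                        pix = 0;
                }
        }
        if (pix) {                                 /* terminate partial batch */
                param[pix] = HBR_END;
                bulk_remove_stub(param, pix + 1);
        }
        return 0;
}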
+ */
+#define PPC64_HUGE_HPTE_BATCH 12
+
+static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
+ unsigned long *vpn, int count,
+ int psize, int ssize)
+{
+ unsigned long param[PLPAR_HCALL9_BUFSIZE];
+ int i = 0, pix = 0, rc;
+ unsigned long flags = 0;
+ int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+
+ if (lock_tlbie)
+ spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
+
+ for (i = 0; i < count; i++) {
+
+ if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
+ pSeries_lpar_hpte_invalidate(slot[i], vpn[i], psize, 0,
+ ssize, 0);
+ } else {
+ param[pix] = HBR_REQUEST | HBR_AVPN | slot[i];
+ param[pix+1] = hpte_encode_avpn(vpn[i], psize, ssize);
+ pix += 2;
+ if (pix == 8) {
+ rc = plpar_hcall9(H_BULK_REMOVE, param,
+ param[0], param[1], param[2],
+ param[3], param[4], param[5],
+ param[6], param[7]);
+ BUG_ON(rc != H_SUCCESS);
+ pix = 0;
+ }
+ }
+ }
+ if (pix) {
+ param[pix] = HBR_END;
+ rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
+ param[2], param[3], param[4], param[5],
+ param[6], param[7]);
+ BUG_ON(rc != H_SUCCESS);
+ }
+
+ if (lock_tlbie)
+ spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
+}
+
+static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
+ unsigned long addr,
+ unsigned char *hpte_slot_array,
+ int psize, int ssize, int local)
+{
+ int i, index = 0;
+ unsigned long s_addr = addr;
+ unsigned int max_hpte_count, valid;
+ unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
+ unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
+ unsigned long shift, hidx, vpn = 0, hash, slot;
+
+ shift = mmu_psize_defs[psize].shift;
+ max_hpte_count = 1U << (PMD_SHIFT - shift);
+
+ for (i = 0; i < max_hpte_count; i++) {
+ valid = hpte_valid(hpte_slot_array, i);
+ if (!valid)
+ continue;
+ hidx = hpte_hash_index(hpte_slot_array, i);
+
+ /* get the vpn */
+ addr = s_addr + (i * (1ul << shift));
+ vpn = hpt_vpn(addr, vsid, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+
+ slot_array[index] = slot;
+ vpn_array[index] = vpn;
+ if (index == PPC64_HUGE_HPTE_BATCH - 1) {
+ /*
+ * Now do a bulk invalidate
+ */
+ __pSeries_lpar_hugepage_invalidate(slot_array,
+ vpn_array,
+ PPC64_HUGE_HPTE_BATCH,
+ psize, ssize);
+ index = 0;
+ } else
+ index++;
+ }
+ if (index)
+ __pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
+ index, psize, ssize);
+}
+#else
+static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
+ unsigned long addr,
+ unsigned char *hpte_slot_array,
+ int psize, int ssize, int local)
+{
+ WARN(1, "%s called without THP support\n", __func__);
+}
+#endif
+
+static int pSeries_lpar_hpte_removebolted(unsigned long ea,
+ int psize, int ssize)
+{
+ unsigned long vpn;
+ unsigned long slot, vsid;
+
+ vsid = get_kernel_vsid(ea, ssize);
+ vpn = hpt_vpn(ea, vsid, ssize);
+
+ slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
+ if (slot == -1)
+ return -ENOENT;
+
+ /*
+ * lpar doesn't use the passed actual page size
+ */
+ pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0);
+ return 0;
+}
+
+/*
+ * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
+ * lock.
+ */ +static void pSeries_lpar_flush_hash_range(unsigned long number, int local) +{ + unsigned long vpn; + unsigned long i, pix, rc; + unsigned long flags = 0; + struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + unsigned long param[PLPAR_HCALL9_BUFSIZE]; + unsigned long hash, index, shift, hidx, slot; + real_pte_t pte; + int psize, ssize; + + if (lock_tlbie) + spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); + + psize = batch->psize; + ssize = batch->ssize; + pix = 0; + for (i = 0; i < number; i++) { + vpn = batch->vpn[i]; + pte = batch->pte[i]; + pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { + hash = hpt_hash(vpn, shift, ssize); + hidx = __rpte_to_hidx(pte, index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) { + /* + * lpar doesn't use the passed actual page size + */ + pSeries_lpar_hpte_invalidate(slot, vpn, psize, + 0, ssize, local); + } else { + param[pix] = HBR_REQUEST | HBR_AVPN | slot; + param[pix+1] = hpte_encode_avpn(vpn, psize, + ssize); + pix += 2; + if (pix == 8) { + rc = plpar_hcall9(H_BULK_REMOVE, param, + param[0], param[1], param[2], + param[3], param[4], param[5], + param[6], param[7]); + BUG_ON(rc != H_SUCCESS); + pix = 0; + } + } + } pte_iterate_hashed_end(); + } + if (pix) { + param[pix] = HBR_END; + rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1], + param[2], param[3], param[4], param[5], + param[6], param[7]); + BUG_ON(rc != H_SUCCESS); + } + + if (lock_tlbie) + spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags); +} + +static int __init disable_bulk_remove(char *str) +{ + if (strcmp(str, "off") == 0 && + firmware_has_feature(FW_FEATURE_BULK_REMOVE)) { + pr_info("Disabling BULK_REMOVE firmware feature"); + powerpc_firmware_features &= ~FW_FEATURE_BULK_REMOVE; + } + return 1; +} + +__setup("bulk_remove=", disable_bulk_remove); + +#define HPT_RESIZE_TIMEOUT 10000 /* ms */ + +struct hpt_resize_state { + unsigned long shift; + int commit_rc; +}; + +static int pseries_lpar_resize_hpt_commit(void *data) +{ + struct hpt_resize_state *state = data; + + state->commit_rc = plpar_resize_hpt_commit(0, state->shift); + if (state->commit_rc != H_SUCCESS) + return -EIO; + + /* Hypervisor has transitioned the HTAB, update our globals */ + ppc64_pft_size = state->shift; + htab_size_bytes = 1UL << ppc64_pft_size; + htab_hash_mask = (htab_size_bytes >> 7) - 1; + + return 0; +} + +/* + * Must be called in process context. The caller must hold the + * cpus_lock. 
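The resize path below retries H_RESIZE_HPT_PREPARE while the hypervisor reports long-busy, accumulating the suggested delays until a timeout. A sketch of that loop shape with a toy hcall stub (the -2 return code and the fixed 100ms delay are invented stand-ins for H_LONG_BUSY and get_longbusy_msecs()):

#include <stdio.h>

#define HPT_RESIZE_TIMEOUT 10000   /* ms, as below */

/* Toy hcall: report "long busy" a few times, then succeed. */
static int calls;
static int prepare_stub(void)
{
        return ++calls < 5 ? -2 : 0;   /* -2 models an H_LONG_BUSY code */
}

int main(void)
{
        unsigned int delay, total_delay = 0;
        int rc = prepare_stub();

        while (rc == -2) {
                delay = 100;                     /* get_longbusy_msecs(rc) */
                total_delay += delay;
                if (total_delay > HPT_RESIZE_TIMEOUT) {
                        puts("timed out, cancelling resize");
                        return 1;
                }
                /* the kernel would msleep(delay) here */
                rc = prepare_stub();
        }
        printf("prepare done after %u ms of waiting\n", total_delay);
        return 0;
}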
+ */ +static int pseries_lpar_resize_hpt(unsigned long shift) +{ + struct hpt_resize_state state = { + .shift = shift, + .commit_rc = H_FUNCTION, + }; + unsigned int delay, total_delay = 0; + int rc; + ktime_t t0, t1, t2; + + might_sleep(); + + if (!firmware_has_feature(FW_FEATURE_HPT_RESIZE)) + return -ENODEV; + + pr_info("Attempting to resize HPT to shift %lu\n", shift); + + t0 = ktime_get(); + + rc = plpar_resize_hpt_prepare(0, shift); + while (H_IS_LONG_BUSY(rc)) { + delay = get_longbusy_msecs(rc); + total_delay += delay; + if (total_delay > HPT_RESIZE_TIMEOUT) { + /* prepare with shift==0 cancels an in-progress resize */ + rc = plpar_resize_hpt_prepare(0, 0); + if (rc != H_SUCCESS) + pr_warn("Unexpected error %d cancelling timed out HPT resize\n", + rc); + return -ETIMEDOUT; + } + msleep(delay); + rc = plpar_resize_hpt_prepare(0, shift); + }; + + switch (rc) { + case H_SUCCESS: + /* Continue on */ + break; + + case H_PARAMETER: + return -EINVAL; + case H_RESOURCE: + return -EPERM; + default: + pr_warn("Unexpected error %d from H_RESIZE_HPT_PREPARE\n", rc); + return -EIO; + } + + t1 = ktime_get(); + + rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit, + &state, NULL); + + t2 = ktime_get(); + + if (rc != 0) { + switch (state.commit_rc) { + case H_PTEG_FULL: + pr_warn("Hash collision while resizing HPT\n"); + return -ENOSPC; + + default: + pr_warn("Unexpected error %d from H_RESIZE_HPT_COMMIT\n", + state.commit_rc); + return -EIO; + }; + } + + pr_info("HPT resize to shift %lu complete (%lld ms / %lld ms)\n", + shift, (long long) ktime_ms_delta(t1, t0), + (long long) ktime_ms_delta(t2, t1)); + + return 0; +} + +static int pseries_lpar_register_process_table(unsigned long base, + unsigned long page_size, unsigned long table_size) +{ + long rc; + unsigned long flags = 0; + + if (table_size) + flags |= PROC_TABLE_NEW; + if (radix_enabled()) + flags |= PROC_TABLE_RADIX | PROC_TABLE_GTSE; + else + flags |= PROC_TABLE_HPT_SLB; + for (;;) { + rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base, + page_size, table_size); + if (!H_IS_LONG_BUSY(rc)) + break; + mdelay(get_longbusy_msecs(rc)); + } + if (rc != H_SUCCESS) { + pr_err("Failed to register process table (rc=%ld)\n", rc); + BUG(); + } + return rc; +} + +void __init hpte_init_pseries(void) +{ + mmu_hash_ops.hpte_invalidate = pSeries_lpar_hpte_invalidate; + mmu_hash_ops.hpte_updatepp = pSeries_lpar_hpte_updatepp; + mmu_hash_ops.hpte_updateboltedpp = pSeries_lpar_hpte_updateboltedpp; + mmu_hash_ops.hpte_insert = pSeries_lpar_hpte_insert; + mmu_hash_ops.hpte_remove = pSeries_lpar_hpte_remove; + mmu_hash_ops.hpte_removebolted = pSeries_lpar_hpte_removebolted; + mmu_hash_ops.flush_hash_range = pSeries_lpar_flush_hash_range; + mmu_hash_ops.hpte_clear_all = pseries_hpte_clear_all; + mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate; + register_process_table = pseries_lpar_register_process_table; + + if (firmware_has_feature(FW_FEATURE_HPT_RESIZE)) + mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt; +} + +void radix_init_pseries(void) +{ + pr_info("Using radix MMU under hypervisor\n"); + register_process_table = pseries_lpar_register_process_table; +} + +#ifdef CONFIG_PPC_SMLPAR +#define CMO_FREE_HINT_DEFAULT 1 +static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT; + +static int __init cmo_free_hint(char *str) +{ + char *parm; + parm = strstrip(str); + + if (strcasecmp(parm, "no") == 0 || strcasecmp(parm, "off") == 0) { + pr_info("%s: CMO free page hinting is not active.\n", __func__); + cmo_free_hint_flag = 0; + 
return 1; + } + + cmo_free_hint_flag = 1; + pr_info("%s: CMO free page hinting is active.\n", __func__); + + if (strcasecmp(parm, "yes") == 0 || strcasecmp(parm, "on") == 0) + return 1; + + return 0; +} + +__setup("cmo_free_hint=", cmo_free_hint); + +static void pSeries_set_page_state(struct page *page, int order, + unsigned long state) +{ + int i, j; + unsigned long cmo_page_sz, addr; + + cmo_page_sz = cmo_get_page_size(); + addr = __pa((unsigned long)page_address(page)); + + for (i = 0; i < (1 << order); i++, addr += PAGE_SIZE) { + for (j = 0; j < PAGE_SIZE; j += cmo_page_sz) + plpar_hcall_norets(H_PAGE_INIT, state, addr + j, 0); + } +} + +void arch_free_page(struct page *page, int order) +{ + if (radix_enabled()) + return; + if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO)) + return; + + pSeries_set_page_state(page, order, H_PAGE_SET_UNUSED); +} +EXPORT_SYMBOL(arch_free_page); + +#endif /* CONFIG_PPC_SMLPAR */ +#endif /* CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_TRACEPOINTS +#ifdef CONFIG_JUMP_LABEL +struct static_key hcall_tracepoint_key = STATIC_KEY_INIT; + +int hcall_tracepoint_regfunc(void) +{ + static_key_slow_inc(&hcall_tracepoint_key); + return 0; +} + +void hcall_tracepoint_unregfunc(void) +{ + static_key_slow_dec(&hcall_tracepoint_key); +} +#else +/* + * We optimise our hcall path by placing hcall_tracepoint_refcount + * directly in the TOC so we can check if the hcall tracepoints are + * enabled via a single load. + */ + +/* NB: reg/unreg are called while guarded with the tracepoints_mutex */ +extern long hcall_tracepoint_refcount; + +int hcall_tracepoint_regfunc(void) +{ + hcall_tracepoint_refcount++; + return 0; +} + +void hcall_tracepoint_unregfunc(void) +{ + hcall_tracepoint_refcount--; +} +#endif + +/* + * Since the tracing code might execute hcalls we need to guard against + * recursion. One example of this are spinlocks calling H_YIELD on + * shared processor partitions. + */ +static DEFINE_PER_CPU(unsigned int, hcall_trace_depth); + + +void __trace_hcall_entry(unsigned long opcode, unsigned long *args) +{ + unsigned long flags; + unsigned int *depth; + + /* + * We cannot call tracepoints inside RCU idle regions which + * means we must not trace H_CEDE. 
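The per-cpu depth counter used below keeps the trace handler from recursing if emitting a trace record itself performs a traced hcall. A single-threaded sketch of the guard (a plain static replaces the per-cpu, interrupts-off counter):

#include <stdio.h>

static unsigned int trace_depth;

static void trace_event(const char *what);

static void emit(const char *what)
{
        /* pretend emitting a trace record performs another traced call */
        printf("trace: %s\n", what);
        trace_event("nested");
}

static void trace_event(const char *what)
{
        if (trace_depth)
                return;           /* already tracing: avoid recursion */
        trace_depth++;
        emit(what);
        trace_depth--;
}

int main(void)
{
        /* prints exactly once; the nested call backs off */
        trace_event("H_GET_TERM_CHAR");
        return 0;
}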
+ */
+ if (opcode == H_CEDE)
+ return;
+
+ local_irq_save(flags);
+
+ depth = this_cpu_ptr(&hcall_trace_depth);
+
+ if (*depth)
+ goto out;
+
+ (*depth)++;
+ preempt_disable();
+ trace_hcall_entry(opcode, args);
+ (*depth)--;
+
+out:
+ local_irq_restore(flags);
+}
+
+void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf)
+{
+ unsigned long flags;
+ unsigned int *depth;
+
+ if (opcode == H_CEDE)
+ return;
+
+ local_irq_save(flags);
+
+ depth = this_cpu_ptr(&hcall_trace_depth);
+
+ if (*depth)
+ goto out;
+
+ (*depth)++;
+ trace_hcall_exit(opcode, retval, retbuf);
+ preempt_enable();
+ (*depth)--;
+
+out:
+ local_irq_restore(flags);
+}
+#endif
+
+/**
+ * h_get_mpp
+ * H_GET_MPP hcall returns info in 7 parms
+ */
+int h_get_mpp(struct hvcall_mpp_data *mpp_data)
+{
+ int rc;
+ unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+
+ rc = plpar_hcall9(H_GET_MPP, retbuf);
+
+ mpp_data->entitled_mem = retbuf[0];
+ mpp_data->mapped_mem = retbuf[1];
+
+ mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
+ mpp_data->pool_num = retbuf[2] & 0xffff;
+
+ mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
+ mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
+ mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffffUL;
+
+ mpp_data->pool_size = retbuf[4];
+ mpp_data->loan_request = retbuf[5];
+ mpp_data->backing_mem = retbuf[6];
+
+ return rc;
+}
+EXPORT_SYMBOL(h_get_mpp);
+
+int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data)
+{
+ int rc;
+ unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = { 0 };
+
+ rc = plpar_hcall9(H_GET_MPP_X, retbuf);
+
+ mpp_x_data->coalesced_bytes = retbuf[0];
+ mpp_x_data->pool_coalesced_bytes = retbuf[1];
+ mpp_x_data->pool_purr_cycles = retbuf[2];
+ mpp_x_data->pool_spurr_cycles = retbuf[3];
+
+ return rc;
+}
+
+static unsigned long vsid_unscramble(unsigned long vsid, int ssize)
+{
+ unsigned long protovsid;
+ unsigned long va_bits = VA_BITS;
+ unsigned long modinv, vsid_modulus;
+ unsigned long max_mod_inv, tmp_modinv;
+
+ if (!mmu_has_feature(MMU_FTR_68_BIT_VA))
+ va_bits = 65;
+
+ if (ssize == MMU_SEGSIZE_256M) {
+ modinv = VSID_MULINV_256M;
+ vsid_modulus = ((1UL << (va_bits - SID_SHIFT)) - 1);
+ } else {
+ modinv = VSID_MULINV_1T;
+ vsid_modulus = ((1UL << (va_bits - SID_SHIFT_1T)) - 1);
+ }
+
+ /*
+ * vsid outside our range.
+ */
+ if (vsid >= vsid_modulus)
+ return 0;
+
+ /*
+ * If modinv is the modular multiplicative inverse of (x % vsid_modulus)
+ * and vsid = (protovsid * x) % vsid_modulus, then we say:
+ * protovsid = (vsid * modinv) % vsid_modulus
+ */
+
+ /* Check if (vsid * modinv) overflows (63 bits) */
+ max_mod_inv = 0x7fffffffffffffffull / vsid;
+ if (modinv < max_mod_inv)
+ return (vsid * modinv) % vsid_modulus;
+
+ tmp_modinv = modinv/max_mod_inv;
+ modinv %= max_mod_inv;
+
+ protovsid = (((vsid * max_mod_inv) % vsid_modulus) * tmp_modinv) % vsid_modulus;
+ protovsid = (protovsid + vsid * modinv) % vsid_modulus;
+
+ return protovsid;
+}
+
+static int __init reserve_vrma_context_id(void)
+{
+ unsigned long protovsid;
+
+ /*
+ * Reserve context ids which map to reserved virtual addresses. For now
+ * we only reserve the context id which maps to the VRMA VSID. We ignore
+ * the addresses in "ibm,adjunct-virtual-addresses" because we don't
+ * enable adjunct support via the "ibm,client-architecture-support"
+ * interface.
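vsid_unscramble() above inverts the VSID scramble by multiplying with a precomputed modular multiplicative inverse. A self-contained round trip with toy parameters: the multiplier, modulus and extended-Euclid helper are all invented for the demo (the kernel's moduli are of the form 2^n - 1 and its inverses are the precomputed VSID_MULINV_* constants):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MULTIPLIER 12345ull
#define MODULUS    1000003ull   /* toy prime; coprime to MULTIPLIER */

/* Extended Euclid; assumes gcd(a, m) == 1 so the inverse exists. */
static uint64_t mod_inverse(uint64_t a, uint64_t m)
{
        int64_t t = 0, newt = 1, r = (int64_t)m, newr = (int64_t)(a % m);

        while (newr) {
                int64_t q = r / newr, tmp;

                tmp = t - q * newt; t = newt; newt = tmp;
                tmp = r - q * newr; r = newr; newr = tmp;
        }
        return (uint64_t)(t < 0 ? t + (int64_t)m : t);
}

int main(void)
{
        uint64_t protovsid = 424242;
        /* scramble: multiply mod the modulus ... */
        uint64_t vsid = (protovsid * MULTIPLIER) % MODULUS;
        /* ... unscramble: multiply by the inverse */
        uint64_t modinv = mod_inverse(MULTIPLIER, MODULUS);
        uint64_t recovered = (vsid * modinv) % MODULUS;

        assert(recovered == protovsid);
        printf("vsid 0x%llx unscrambles to %llu\n",
               (unsigned long long)vsid, (unsigned long long)recovered);
        return 0;
}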
+ */ + protovsid = vsid_unscramble(VRMA_VSID, MMU_SEGSIZE_1T); + hash__reserve_context_id(protovsid >> ESID_BITS_1T); + return 0; +} +machine_device_initcall(pseries, reserve_vrma_context_id); + +#ifdef CONFIG_DEBUG_FS +/* debugfs file interface for vpa data */ +static ssize_t vpa_file_read(struct file *filp, char __user *buf, size_t len, + loff_t *pos) +{ + int cpu = (long)filp->private_data; + struct lppaca *lppaca = &lppaca_of(cpu); + + return simple_read_from_buffer(buf, len, pos, lppaca, + sizeof(struct lppaca)); +} + +static const struct file_operations vpa_fops = { + .open = simple_open, + .read = vpa_file_read, + .llseek = default_llseek, +}; + +static int __init vpa_debugfs_init(void) +{ + char name[16]; + long i; + struct dentry *vpa_dir; + + if (!firmware_has_feature(FW_FEATURE_SPLPAR)) + return 0; + + vpa_dir = debugfs_create_dir("vpa", powerpc_debugfs_root); + if (!vpa_dir) { + pr_warn("%s: can't create vpa root dir\n", __func__); + return -ENOMEM; + } + + /* set up the per-cpu vpa file*/ + for_each_possible_cpu(i) { + struct dentry *d; + + sprintf(name, "cpu-%ld", i); + + d = debugfs_create_file(name, 0400, vpa_dir, (void *)i, + &vpa_fops); + if (!d) { + pr_warn("%s: can't create per-cpu vpa file\n", + __func__); + return -ENOMEM; + } + } + + return 0; +} +machine_arch_initcall(pseries, vpa_debugfs_init); +#endif /* CONFIG_DEBUG_FS */ diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c new file mode 100644 index 000000000..7c872dc01 --- /dev/null +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -0,0 +1,714 @@ +/* + * PowerPC64 LPAR Configuration Information Driver + * + * Dave Engebretsen engebret@us.ibm.com + * Copyright (c) 2003 Dave Engebretsen + * Will Schmidt willschm@us.ibm.com + * SPLPAR updates, Copyright (c) 2003 Will Schmidt IBM Corporation. + * seq_file updates, Copyright (c) 2004 Will Schmidt IBM Corporation. + * Nathan Lynch nathanl@austin.ibm.com + * Added lparcfg_write, Copyright (C) 2004 Nathan Lynch IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * This driver creates a proc file at /proc/ppc64/lparcfg which contains + * keyword - value pairs that specify the configuration of the partition. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <asm/lppaca.h> +#include <asm/hvcall.h> +#include <asm/firmware.h> +#include <asm/rtas.h> +#include <asm/time.h> +#include <asm/prom.h> +#include <asm/vdso_datapage.h> +#include <asm/vio.h> +#include <asm/mmu.h> +#include <asm/machdep.h> + +#include "pseries.h" + +/* + * This isn't a module but we expose that to userspace + * via /proc so leave the definitions here + */ +#define MODULE_VERS "1.9" +#define MODULE_NAME "lparcfg" + +/* #define LPARCFG_DEBUG */ + +/* + * Track sum of all purrs across all processors. 
This is used to further + * calculate usage values by different applications + */ +static void cpu_get_purr(void *arg) +{ + atomic64_t *sum = arg; + + atomic64_add(mfspr(SPRN_PURR), sum); +} + +static unsigned long get_purr(void) +{ + atomic64_t purr = ATOMIC64_INIT(0); + + on_each_cpu(cpu_get_purr, &purr, 1); + + return atomic64_read(&purr); +} + +/* + * Methods used to fetch LPAR data when running on a pSeries platform. + */ + +struct hvcall_ppp_data { + u64 entitlement; + u64 unallocated_entitlement; + u16 group_num; + u16 pool_num; + u8 capped; + u8 weight; + u8 unallocated_weight; + u16 active_procs_in_pool; + u16 active_system_procs; + u16 phys_platform_procs; + u32 max_proc_cap_avail; + u32 entitled_proc_cap_avail; +}; + +/* + * H_GET_PPP hcall returns info in 4 parms. + * entitled_capacity,unallocated_capacity, + * aggregation, resource_capability). + * + * R4 = Entitled Processor Capacity Percentage. + * R5 = Unallocated Processor Capacity Percentage. + * R6 (AABBCCDDEEFFGGHH). + * XXXX - reserved (0) + * XXXX - reserved (0) + * XXXX - Group Number + * XXXX - Pool Number. + * R7 (IIJJKKLLMMNNOOPP). + * XX - reserved. (0) + * XX - bit 0-6 reserved (0). bit 7 is Capped indicator. + * XX - variable processor Capacity Weight + * XX - Unallocated Variable Processor Capacity Weight. + * XXXX - Active processors in Physical Processor Pool. + * XXXX - Processors active on platform. + * R8 (QQQQRRRRRRSSSSSS). if ibm,partition-performance-parameters-level >= 1 + * XXXX - Physical platform procs allocated to virtualization. + * XXXXXX - Max procs capacity % available to the partitions pool. + * XXXXXX - Entitled procs capacity % available to the + * partitions pool. + */ +static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data) +{ + unsigned long rc; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + + rc = plpar_hcall9(H_GET_PPP, retbuf); + + ppp_data->entitlement = retbuf[0]; + ppp_data->unallocated_entitlement = retbuf[1]; + + ppp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff; + ppp_data->pool_num = retbuf[2] & 0xffff; + + ppp_data->capped = (retbuf[3] >> 6 * 8) & 0x01; + ppp_data->weight = (retbuf[3] >> 5 * 8) & 0xff; + ppp_data->unallocated_weight = (retbuf[3] >> 4 * 8) & 0xff; + ppp_data->active_procs_in_pool = (retbuf[3] >> 2 * 8) & 0xffff; + ppp_data->active_system_procs = retbuf[3] & 0xffff; + + ppp_data->phys_platform_procs = retbuf[4] >> 6 * 8; + ppp_data->max_proc_cap_avail = (retbuf[4] >> 3 * 8) & 0xffffff; + ppp_data->entitled_proc_cap_avail = retbuf[4] & 0xffffff; + + return rc; +} + +static unsigned h_pic(unsigned long *pool_idle_time, + unsigned long *num_procs) +{ + unsigned long rc; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + rc = plpar_hcall(H_PIC, retbuf); + + *pool_idle_time = retbuf[0]; + *num_procs = retbuf[1]; + + return rc; +} + +/* + * parse_ppp_data + * Parse out the data returned from h_get_ppp and h_pic + */ +static void parse_ppp_data(struct seq_file *m) +{ + struct hvcall_ppp_data ppp_data; + struct device_node *root; + const __be32 *perf_level; + int rc; + + rc = h_get_ppp(&ppp_data); + if (rc) + return; + + seq_printf(m, "partition_entitled_capacity=%lld\n", + ppp_data.entitlement); + seq_printf(m, "group=%d\n", ppp_data.group_num); + seq_printf(m, "system_active_processors=%d\n", + ppp_data.active_system_procs); + + /* pool related entries are appropriate for shared configs */ + if (lppaca_shared_proc(get_lppaca())) { + unsigned long pool_idle_time, pool_procs; + + seq_printf(m, "pool=%d\n", ppp_data.pool_num); + + /* report pool_capacity in 
percentage */ + seq_printf(m, "pool_capacity=%d\n", + ppp_data.active_procs_in_pool * 100); + + h_pic(&pool_idle_time, &pool_procs); + seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time); + seq_printf(m, "pool_num_procs=%ld\n", pool_procs); + } + + seq_printf(m, "unallocated_capacity_weight=%d\n", + ppp_data.unallocated_weight); + seq_printf(m, "capacity_weight=%d\n", ppp_data.weight); + seq_printf(m, "capped=%d\n", ppp_data.capped); + seq_printf(m, "unallocated_capacity=%lld\n", + ppp_data.unallocated_entitlement); + + /* The last bits of information returned from h_get_ppp are only + * valid if the ibm,partition-performance-parameters-level + * property is >= 1. + */ + root = of_find_node_by_path("/"); + if (root) { + perf_level = of_get_property(root, + "ibm,partition-performance-parameters-level", + NULL); + if (perf_level && (be32_to_cpup(perf_level) >= 1)) { + seq_printf(m, + "physical_procs_allocated_to_virtualization=%d\n", + ppp_data.phys_platform_procs); + seq_printf(m, "max_proc_capacity_available=%d\n", + ppp_data.max_proc_cap_avail); + seq_printf(m, "entitled_proc_capacity_available=%d\n", + ppp_data.entitled_proc_cap_avail); + } + + of_node_put(root); + } +} + +/** + * parse_mpp_data + * Parse out data returned from h_get_mpp + */ +static void parse_mpp_data(struct seq_file *m) +{ + struct hvcall_mpp_data mpp_data; + int rc; + + rc = h_get_mpp(&mpp_data); + if (rc) + return; + + seq_printf(m, "entitled_memory=%ld\n", mpp_data.entitled_mem); + + if (mpp_data.mapped_mem != -1) + seq_printf(m, "mapped_entitled_memory=%ld\n", + mpp_data.mapped_mem); + + seq_printf(m, "entitled_memory_group_number=%d\n", mpp_data.group_num); + seq_printf(m, "entitled_memory_pool_number=%d\n", mpp_data.pool_num); + + seq_printf(m, "entitled_memory_weight=%d\n", mpp_data.mem_weight); + seq_printf(m, "unallocated_entitled_memory_weight=%d\n", + mpp_data.unallocated_mem_weight); + seq_printf(m, "unallocated_io_mapping_entitlement=%ld\n", + mpp_data.unallocated_entitlement); + + if (mpp_data.pool_size != -1) + seq_printf(m, "entitled_memory_pool_size=%ld bytes\n", + mpp_data.pool_size); + + seq_printf(m, "entitled_memory_loan_request=%ld\n", + mpp_data.loan_request); + + seq_printf(m, "backing_memory=%ld bytes\n", mpp_data.backing_mem); +} + +/** + * parse_mpp_x_data + * Parse out data returned from h_get_mpp_x + */ +static void parse_mpp_x_data(struct seq_file *m) +{ + struct hvcall_mpp_x_data mpp_x_data; + + if (!firmware_has_feature(FW_FEATURE_XCMO)) + return; + if (h_get_mpp_x(&mpp_x_data)) + return; + + seq_printf(m, "coalesced_bytes=%ld\n", mpp_x_data.coalesced_bytes); + + if (mpp_x_data.pool_coalesced_bytes) + seq_printf(m, "pool_coalesced_bytes=%ld\n", + mpp_x_data.pool_coalesced_bytes); + if (mpp_x_data.pool_purr_cycles) + seq_printf(m, "coalesce_pool_purr=%ld\n", mpp_x_data.pool_purr_cycles); + if (mpp_x_data.pool_spurr_cycles) + seq_printf(m, "coalesce_pool_spurr=%ld\n", mpp_x_data.pool_spurr_cycles); +} + +#define SPLPAR_CHARACTERISTICS_TOKEN 20 +#define SPLPAR_MAXLENGTH 1026*(sizeof(char)) + +/* + * parse_system_parameter_string() + * Retrieve the potential_processors, max_entitled_capacity and friends + * through the get-system-parameter rtas call. Replace keyword strings as + * necessary. 
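+ * + * For example (string contents are hypothetical), a returned + * characteristics buffer such as "MaxEntCap=400,MaxPlatProcs=16" would be + * printed one key=value pair per line, with MaxEntCap rewritten as + * partition_max_entitled_capacity and MaxPlatProcs rewritten as + * system_potential_processors by the loop below.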
+ */ +static void parse_system_parameter_string(struct seq_file *m) +{ + int call_status; + + unsigned char *local_buffer = kmalloc(SPLPAR_MAXLENGTH, GFP_KERNEL); + if (!local_buffer) { + printk(KERN_ERR "%s %s kmalloc failure at line %d\n", + __FILE__, __func__, __LINE__); + return; + } + + spin_lock(&rtas_data_buf_lock); + memset(rtas_data_buf, 0, SPLPAR_MAXLENGTH); + call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, + NULL, + SPLPAR_CHARACTERISTICS_TOKEN, + __pa(rtas_data_buf), + RTAS_DATA_BUF_SIZE); + memcpy(local_buffer, rtas_data_buf, SPLPAR_MAXLENGTH); + local_buffer[SPLPAR_MAXLENGTH - 1] = '\0'; + spin_unlock(&rtas_data_buf_lock); + + if (call_status != 0) { + printk(KERN_INFO + "%s %s Error calling get-system-parameter (0x%x)\n", + __FILE__, __func__, call_status); + } else { + int splpar_strlen; + int idx, w_idx; + char *workbuffer = kzalloc(SPLPAR_MAXLENGTH, GFP_KERNEL); + if (!workbuffer) { + printk(KERN_ERR "%s %s kmalloc failure at line %d\n", + __FILE__, __func__, __LINE__); + kfree(local_buffer); + return; + } +#ifdef LPARCFG_DEBUG + printk(KERN_INFO "success calling get-system-parameter\n"); +#endif + splpar_strlen = local_buffer[0] * 256 + local_buffer[1]; + local_buffer += 2; /* step over strlen value */ + + w_idx = 0; + idx = 0; + while ((*local_buffer) && (idx < splpar_strlen)) { + workbuffer[w_idx++] = local_buffer[idx++]; + if ((local_buffer[idx] == ',') + || (local_buffer[idx] == '\0')) { + workbuffer[w_idx] = '\0'; + if (w_idx) { + /* avoid the empty string */ + seq_printf(m, "%s\n", workbuffer); + } + memset(workbuffer, 0, SPLPAR_MAXLENGTH); + idx++; /* skip the comma */ + w_idx = 0; + } else if (local_buffer[idx] == '=') { + /* code here to replace workbuffer contents + with different keyword strings */ + if (0 == strcmp(workbuffer, "MaxEntCap")) { + strcpy(workbuffer, + "partition_max_entitled_capacity"); + w_idx = strlen(workbuffer); + } + if (0 == strcmp(workbuffer, "MaxPlatProcs")) { + strcpy(workbuffer, + "system_potential_processors"); + w_idx = strlen(workbuffer); + } + } + } + kfree(workbuffer); + local_buffer -= 2; /* back up over strlen value */ + } + kfree(local_buffer); +} + +/* Return the number of processors in the system. + * This function reads through the device tree and counts + * the virtual processors, this does not include threads. 
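+ * + * E.g. (illustrative) a partition with 4 virtual processors each running + * 4 SMT threads has 4 device tree nodes of type "cpu" and reports 4 here, + * not 16.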
+ */ +static int lparcfg_count_active_processors(void) +{ + struct device_node *cpus_dn; + int count = 0; + + for_each_node_by_type(cpus_dn, "cpu") { +#ifdef LPARCFG_DEBUG + printk(KERN_ERR "cpus_dn %p\n", cpus_dn); +#endif + count++; + } + return count; +} + +static void pseries_cmo_data(struct seq_file *m) +{ + int cpu; + unsigned long cmo_faults = 0; + unsigned long cmo_fault_time = 0; + + seq_printf(m, "cmo_enabled=%d\n", firmware_has_feature(FW_FEATURE_CMO)); + + if (!firmware_has_feature(FW_FEATURE_CMO)) + return; + + for_each_possible_cpu(cpu) { + cmo_faults += be64_to_cpu(lppaca_of(cpu).cmo_faults); + cmo_fault_time += be64_to_cpu(lppaca_of(cpu).cmo_fault_time); + } + + seq_printf(m, "cmo_faults=%lu\n", cmo_faults); + seq_printf(m, "cmo_fault_time_usec=%lu\n", + cmo_fault_time / tb_ticks_per_usec); + seq_printf(m, "cmo_primary_psp=%d\n", cmo_get_primary_psp()); + seq_printf(m, "cmo_secondary_psp=%d\n", cmo_get_secondary_psp()); + seq_printf(m, "cmo_page_size=%lu\n", cmo_get_page_size()); +} + +static void splpar_dispatch_data(struct seq_file *m) +{ + int cpu; + unsigned long dispatches = 0; + unsigned long dispatch_dispersions = 0; + + for_each_possible_cpu(cpu) { + dispatches += be32_to_cpu(lppaca_of(cpu).yield_count); + dispatch_dispersions += + be32_to_cpu(lppaca_of(cpu).dispersion_count); + } + + seq_printf(m, "dispatches=%lu\n", dispatches); + seq_printf(m, "dispatch_dispersions=%lu\n", dispatch_dispersions); +} + +static void parse_em_data(struct seq_file *m) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + if (firmware_has_feature(FW_FEATURE_LPAR) && + plpar_hcall(H_GET_EM_PARMS, retbuf) == H_SUCCESS) + seq_printf(m, "power_mode_data=%016lx\n", retbuf[0]); +} + +static int pseries_lparcfg_data(struct seq_file *m, void *v) +{ + int partition_potential_processors; + int partition_active_processors; + struct device_node *rtas_node; + const __be32 *lrdrp = NULL; + + rtas_node = of_find_node_by_path("/rtas"); + if (rtas_node) + lrdrp = of_get_property(rtas_node, "ibm,lrdr-capacity", NULL); + + if (lrdrp == NULL) { + partition_potential_processors = vdso_data->processorCount; + } else { + partition_potential_processors = be32_to_cpup(lrdrp + 4); + } + of_node_put(rtas_node); + + partition_active_processors = lparcfg_count_active_processors(); + + if (firmware_has_feature(FW_FEATURE_SPLPAR)) { + /* this call handles the ibm,get-system-parameter contents */ + parse_system_parameter_string(m); + parse_ppp_data(m); + parse_mpp_data(m); + parse_mpp_x_data(m); + pseries_cmo_data(m); + splpar_dispatch_data(m); + + seq_printf(m, "purr=%ld\n", get_purr()); + } else { /* non SPLPAR case */ + + seq_printf(m, "system_active_processors=%d\n", + partition_potential_processors); + + seq_printf(m, "system_potential_processors=%d\n", + partition_potential_processors); + + seq_printf(m, "partition_max_entitled_capacity=%d\n", + partition_potential_processors * 100); + + seq_printf(m, "partition_entitled_capacity=%d\n", + partition_active_processors * 100); + } + + seq_printf(m, "partition_active_processors=%d\n", + partition_active_processors); + + seq_printf(m, "partition_potential_processors=%d\n", + partition_potential_processors); + + seq_printf(m, "shared_processor_mode=%d\n", + lppaca_shared_proc(get_lppaca())); + +#ifdef CONFIG_PPC_BOOK3S_64 + seq_printf(m, "slb_size=%d\n", mmu_slb_size); +#endif + parse_em_data(m); + + return 0; +} + +static ssize_t update_ppp(u64 *entitlement, u8 *weight) +{ + struct hvcall_ppp_data ppp_data; + u8 new_weight; + u64 new_entitled; + ssize_t retval; + + /* 
Get our current parameters */ + retval = h_get_ppp(&ppp_data); + if (retval) + return retval; + + if (entitlement) { + new_weight = ppp_data.weight; + new_entitled = *entitlement; + } else if (weight) { + new_weight = *weight; + new_entitled = ppp_data.entitlement; + } else + return -EINVAL; + + pr_debug("%s: current_entitled = %llu, current_weight = %u\n", + __func__, ppp_data.entitlement, ppp_data.weight); + + pr_debug("%s: new_entitled = %llu, new_weight = %u\n", + __func__, new_entitled, new_weight); + + retval = plpar_hcall_norets(H_SET_PPP, new_entitled, new_weight); + return retval; +} + +/** + * update_mpp + * + * Update the memory entitlement and weight for the partition. Caller must + * specify either a new entitlement or weight, not both, to be updated + * since the h_set_mpp call takes both entitlement and weight as parameters. + */ +static ssize_t update_mpp(u64 *entitlement, u8 *weight) +{ + struct hvcall_mpp_data mpp_data; + u64 new_entitled; + u8 new_weight; + ssize_t rc; + + if (entitlement) { + /* Check with vio to ensure the new memory entitlement + * can be handled. + */ + rc = vio_cmo_entitlement_update(*entitlement); + if (rc) + return rc; + } + + rc = h_get_mpp(&mpp_data); + if (rc) + return rc; + + if (entitlement) { + new_weight = mpp_data.mem_weight; + new_entitled = *entitlement; + } else if (weight) { + new_weight = *weight; + new_entitled = mpp_data.entitled_mem; + } else + return -EINVAL; + + pr_debug("%s: current_entitled = %lu, current_weight = %u\n", + __func__, mpp_data.entitled_mem, mpp_data.mem_weight); + + pr_debug("%s: new_entitled = %llu, new_weight = %u\n", + __func__, new_entitled, new_weight); + + rc = plpar_hcall_norets(H_SET_MPP, new_entitled, new_weight); + return rc; +} + +/* + * Interface for changing system parameters (variable capacity weight + * and entitled capacity). Format of input is "param_name=value"; + * anything after value is ignored. Valid parameters at this time are + * "partition_entitled_capacity" and "capacity_weight". We use + * H_SET_PPP to alter parameters. + * + * This function should be invoked only on systems with + * FW_FEATURE_SPLPAR. 
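+ * + * Illustrative usage from userspace (values are hypothetical): + * echo "partition_entitled_capacity=400" > /proc/ppc64/lparcfg + * echo "capacity_weight=128" > /proc/ppc64/lparcfg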
+ */ +static ssize_t lparcfg_write(struct file *file, const char __user * buf, + size_t count, loff_t * off) +{ + int kbuf_sz = 64; + char kbuf[kbuf_sz]; + char *tmp; + u64 new_entitled, *new_entitled_ptr = &new_entitled; + u8 new_weight, *new_weight_ptr = &new_weight; + ssize_t retval; + + if (!firmware_has_feature(FW_FEATURE_SPLPAR)) + return -EINVAL; + + if (count > kbuf_sz) + return -EINVAL; + + if (copy_from_user(kbuf, buf, count)) + return -EFAULT; + + kbuf[count - 1] = '\0'; + tmp = strchr(kbuf, '='); + if (!tmp) + return -EINVAL; + + *tmp++ = '\0'; + + if (!strcmp(kbuf, "partition_entitled_capacity")) { + char *endp; + *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10); + if (endp == tmp) + return -EINVAL; + + retval = update_ppp(new_entitled_ptr, NULL); + } else if (!strcmp(kbuf, "capacity_weight")) { + char *endp; + *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10); + if (endp == tmp) + return -EINVAL; + + retval = update_ppp(NULL, new_weight_ptr); + } else if (!strcmp(kbuf, "entitled_memory")) { + char *endp; + *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10); + if (endp == tmp) + return -EINVAL; + + retval = update_mpp(new_entitled_ptr, NULL); + } else if (!strcmp(kbuf, "entitled_memory_weight")) { + char *endp; + *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10); + if (endp == tmp) + return -EINVAL; + + retval = update_mpp(NULL, new_weight_ptr); + } else + return -EINVAL; + + if (retval == H_SUCCESS || retval == H_CONSTRAINED) { + retval = count; + } else if (retval == H_BUSY) { + retval = -EBUSY; + } else if (retval == H_HARDWARE) { + retval = -EIO; + } else if (retval == H_PARAMETER) { + retval = -EINVAL; + } + + return retval; +} + +static int lparcfg_data(struct seq_file *m, void *v) +{ + struct device_node *rootdn; + const char *model = ""; + const char *system_id = ""; + const char *tmp; + const __be32 *lp_index_ptr; + unsigned int lp_index = 0; + + seq_printf(m, "%s %s\n", MODULE_NAME, MODULE_VERS); + + rootdn = of_find_node_by_path("/"); + if (rootdn) { + tmp = of_get_property(rootdn, "model", NULL); + if (tmp) + model = tmp; + tmp = of_get_property(rootdn, "system-id", NULL); + if (tmp) + system_id = tmp; + lp_index_ptr = of_get_property(rootdn, "ibm,partition-no", + NULL); + if (lp_index_ptr) + lp_index = be32_to_cpup(lp_index_ptr); + of_node_put(rootdn); + } + seq_printf(m, "serial_number=%s\n", system_id); + seq_printf(m, "system_type=%s\n", model); + seq_printf(m, "partition_id=%d\n", (int)lp_index); + + return pseries_lparcfg_data(m, v); +} + +static int lparcfg_open(struct inode *inode, struct file *file) +{ + return single_open(file, lparcfg_data, NULL); +} + +static const struct file_operations lparcfg_fops = { + .read = seq_read, + .write = lparcfg_write, + .open = lparcfg_open, + .release = single_release, + .llseek = seq_lseek, +}; + +static int __init lparcfg_init(void) +{ + umode_t mode = 0444; + + /* Allow writing if we have FW_FEATURE_SPLPAR */ + if (firmware_has_feature(FW_FEATURE_SPLPAR)) + mode |= 0200; + + if (!proc_create("powerpc/lparcfg", mode, NULL, &lparcfg_fops)) { + printk(KERN_ERR "Failed to create powerpc/lparcfg\n"); + return -EIO; + } + return 0; +} +machine_device_initcall(pseries, lparcfg_init); diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c new file mode 100644 index 000000000..70744b4fb --- /dev/null +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -0,0 +1,439 @@ +/* + * Support for Partition Mobility/Migration + * + * Copyright (C) 2010 Nathan Fontenot + * 
Copyright (C) 2010 IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + */ + +#include <linux/cpu.h> +#include <linux/kernel.h> +#include <linux/kobject.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/stat.h> +#include <linux/completion.h> +#include <linux/device.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include <linux/stringify.h> + +#include <asm/machdep.h> +#include <asm/rtas.h> +#include "pseries.h" +#include "../../kernel/cacheinfo.h" + +static struct kobject *mobility_kobj; + +struct update_props_workarea { + __be32 phandle; + __be32 state; + __be64 reserved; + __be32 nprops; +} __packed; + +#define NODE_ACTION_MASK 0xff000000 +#define NODE_COUNT_MASK 0x00ffffff + +#define DELETE_DT_NODE 0x01000000 +#define UPDATE_DT_NODE 0x02000000 +#define ADD_DT_NODE 0x03000000 + +#define MIGRATION_SCOPE (1) +#define PRRN_SCOPE -2 + +static int mobility_rtas_call(int token, char *buf, s32 scope) +{ + int rc; + + spin_lock(&rtas_data_buf_lock); + + memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE); + rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, scope); + memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE); + + spin_unlock(&rtas_data_buf_lock); + return rc; +} + +static int delete_dt_node(__be32 phandle) +{ + struct device_node *dn; + + dn = of_find_node_by_phandle(be32_to_cpu(phandle)); + if (!dn) + return -ENOENT; + + dlpar_detach_node(dn); + of_node_put(dn); + return 0; +} + +static int update_dt_property(struct device_node *dn, struct property **prop, + const char *name, u32 vd, char *value) +{ + struct property *new_prop = *prop; + int more = 0; + + /* A negative 'vd' value indicates that only part of the new property + * value is contained in the buffer and we need to call + * ibm,update-properties again to get the rest of the value. + * + * A negative value is also the two's complement of the actual value.
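+ * + * For example (an illustrative value): vd = 0xfffff000 has the top bit + * set, so the code below recovers the chunk length as ~0xfffff000 + 1 = + * 0x1000 bytes and sets 'more' so that ibm,update-properties is called + * again for the remainder of the value.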
+ */ + if (vd & 0x80000000) { + vd = ~vd + 1; + more = 1; + } + + if (new_prop) { + /* partial property fixup */ + char *new_data = kzalloc(new_prop->length + vd, GFP_KERNEL); + if (!new_data) + return -ENOMEM; + + memcpy(new_data, new_prop->value, new_prop->length); + memcpy(new_data + new_prop->length, value, vd); + + kfree(new_prop->value); + new_prop->value = new_data; + new_prop->length += vd; + } else { + new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL); + if (!new_prop) + return -ENOMEM; + + new_prop->name = kstrdup(name, GFP_KERNEL); + if (!new_prop->name) { + kfree(new_prop); + return -ENOMEM; + } + + new_prop->length = vd; + new_prop->value = kzalloc(new_prop->length, GFP_KERNEL); + if (!new_prop->value) { + kfree(new_prop->name); + kfree(new_prop); + return -ENOMEM; + } + + memcpy(new_prop->value, value, vd); + *prop = new_prop; + } + + if (!more) { + of_update_property(dn, new_prop); + *prop = NULL; + } + + return 0; +} + +static int update_dt_node(__be32 phandle, s32 scope) +{ + struct update_props_workarea *upwa; + struct device_node *dn; + struct property *prop = NULL; + int i, rc, rtas_rc; + char *prop_data; + char *rtas_buf; + int update_properties_token; + u32 nprops; + u32 vd; + + update_properties_token = rtas_token("ibm,update-properties"); + if (update_properties_token == RTAS_UNKNOWN_SERVICE) + return -EINVAL; + + rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); + if (!rtas_buf) + return -ENOMEM; + + dn = of_find_node_by_phandle(be32_to_cpu(phandle)); + if (!dn) { + kfree(rtas_buf); + return -ENOENT; + } + + upwa = (struct update_props_workarea *)&rtas_buf[0]; + upwa->phandle = phandle; + + do { + rtas_rc = mobility_rtas_call(update_properties_token, rtas_buf, + scope); + if (rtas_rc < 0) + break; + + prop_data = rtas_buf + sizeof(*upwa); + nprops = be32_to_cpu(upwa->nprops); + + /* On the first call to ibm,update-properties for a node, the + * first property value descriptor contains an empty + * property name, the property value length encoded as u32, + * and the property value is the node path being updated.
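+ * + * A sketch of that first descriptor (example bytes, not from the + * source): 0x00, a u32 length, then a path such as "/cpus/cpu@0"; + * the code below skips it (and decrements nprops) before parsing the + * remaining name/length/value triples.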
+ */ + if (*prop_data == 0) { + prop_data++; + vd = be32_to_cpu(*(__be32 *)prop_data); + prop_data += vd + sizeof(vd); + nprops--; + } + + for (i = 0; i < nprops; i++) { + char *prop_name; + + prop_name = prop_data; + prop_data += strlen(prop_name) + 1; + vd = be32_to_cpu(*(__be32 *)prop_data); + prop_data += sizeof(vd); + + switch (vd) { + case 0x00000000: + /* name only property, nothing to do */ + break; + + case 0x80000000: + of_remove_property(dn, of_find_property(dn, + prop_name, NULL)); + prop = NULL; + break; + + default: + rc = update_dt_property(dn, &prop, prop_name, + vd, prop_data); + if (rc) { + printk(KERN_ERR "Could not update %s" + " property\n", prop_name); + } + + prop_data += vd; + } + + cond_resched(); + } + + cond_resched(); + } while (rtas_rc == 1); + + of_node_put(dn); + kfree(rtas_buf); + return 0; +} + +static int add_dt_node(__be32 parent_phandle, __be32 drc_index) +{ + struct device_node *dn; + struct device_node *parent_dn; + int rc; + + parent_dn = of_find_node_by_phandle(be32_to_cpu(parent_phandle)); + if (!parent_dn) + return -ENOENT; + + dn = dlpar_configure_connector(drc_index, parent_dn); + if (!dn) { + of_node_put(parent_dn); + return -ENOENT; + } + + rc = dlpar_attach_node(dn, parent_dn); + if (rc) + dlpar_free_cc_nodes(dn); + + of_node_put(parent_dn); + return rc; +} + +static void prrn_update_node(__be32 phandle) +{ + struct pseries_hp_errorlog *hp_elog; + struct device_node *dn; + + /* + * If a node is found for the given phandle, the phandle does not + * represent the drc index of an LMB and we can ignore it. + */ + dn = of_find_node_by_phandle(be32_to_cpu(phandle)); + if (dn) { + of_node_put(dn); + return; + } + + hp_elog = kzalloc(sizeof(*hp_elog), GFP_KERNEL); + if (!hp_elog) + return; + + hp_elog->resource = PSERIES_HP_ELOG_RESOURCE_MEM; + hp_elog->action = PSERIES_HP_ELOG_ACTION_READD; + hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_INDEX; + hp_elog->_drc_u.drc_index = phandle; + + queue_hotplug_event(hp_elog, NULL, NULL); + + kfree(hp_elog); +} + +int pseries_devicetree_update(s32 scope) +{ + char *rtas_buf; + __be32 *data; + int update_nodes_token; + int rc; + + update_nodes_token = rtas_token("ibm,update-nodes"); + if (update_nodes_token == RTAS_UNKNOWN_SERVICE) + return -EINVAL; + + rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); + if (!rtas_buf) + return -ENOMEM; + + do { + rc = mobility_rtas_call(update_nodes_token, rtas_buf, scope); + if (rc && rc != 1) + break; + + data = (__be32 *)rtas_buf + 4; + while (be32_to_cpu(*data) & NODE_ACTION_MASK) { + int i; + u32 action = be32_to_cpu(*data) & NODE_ACTION_MASK; + u32 node_count = be32_to_cpu(*data) & NODE_COUNT_MASK; + + data++; + + for (i = 0; i < node_count; i++) { + __be32 phandle = *data++; + __be32 drc_index; + + switch (action) { + case DELETE_DT_NODE: + delete_dt_node(phandle); + break; + case UPDATE_DT_NODE: + update_dt_node(phandle, scope); + + if (scope == PRRN_SCOPE) + prrn_update_node(phandle); + + break; + case ADD_DT_NODE: + drc_index = *data++; + add_dt_node(phandle, drc_index); + break; + } + + cond_resched(); + } + } + + cond_resched(); + } while (rc == 1); + + kfree(rtas_buf); + return rc; +} + +void post_mobility_fixup(void) +{ + int rc; + int activate_fw_token; + + activate_fw_token = rtas_token("ibm,activate-firmware"); + if (activate_fw_token == RTAS_UNKNOWN_SERVICE) { + printk(KERN_ERR "Could not make post-mobility " + "activate-fw call.\n"); + return; + } + + do { + rc = rtas_call(activate_fw_token, 0, 1, NULL); + } while (rtas_busy_delay(rc)); + + if (rc) +
printk(KERN_ERR "Post-mobility activate-fw failed: %d\n", rc); + + /* + * We don't want CPUs to go online/offline while the device + * tree is being updated. + */ + cpus_read_lock(); + + /* + * It's common for the destination firmware to replace cache + * nodes. Release all of the cacheinfo hierarchy's references + * before updating the device tree. + */ + cacheinfo_teardown(); + + rc = pseries_devicetree_update(MIGRATION_SCOPE); + if (rc) + printk(KERN_ERR "Post-mobility device tree update " + "failed: %d\n", rc); + + cacheinfo_rebuild(); + + cpus_read_unlock(); + + /* Possibly switch to a new RFI flush type */ + pseries_setup_rfi_flush(); + + return; +} + +static ssize_t migration_store(struct class *class, + struct class_attribute *attr, const char *buf, + size_t count) +{ + u64 streamid; + int rc; + + rc = kstrtou64(buf, 0, &streamid); + if (rc) + return rc; + + do { + rc = rtas_ibm_suspend_me(streamid); + if (rc == -EAGAIN) + ssleep(1); + } while (rc == -EAGAIN); + + if (rc) + return rc; + + post_mobility_fixup(); + return count; +} + +/* + * Used by drmgr to determine the kernel behavior of the migration interface. + * + * Version 1: Performs all PAPR requirements for migration including + * firmware activation and device tree update. + */ +#define MIGRATION_API_VERSION 1 + +static CLASS_ATTR_WO(migration); +static CLASS_ATTR_STRING(api_version, 0444, __stringify(MIGRATION_API_VERSION)); + +static int __init mobility_sysfs_init(void) +{ + int rc; + + mobility_kobj = kobject_create_and_add("mobility", kernel_kobj); + if (!mobility_kobj) + return -ENOMEM; + + rc = sysfs_create_file(mobility_kobj, &class_attr_migration.attr); + if (rc) + pr_err("mobility: unable to create migration sysfs file (%d)\n", rc); + + rc = sysfs_create_file(mobility_kobj, &class_attr_api_version.attr.attr); + if (rc) + pr_err("mobility: unable to create api_version sysfs file (%d)\n", rc); + + return 0; +} +machine_device_initcall(pseries, mobility_sysfs_init); diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c new file mode 100644 index 000000000..b74969481 --- /dev/null +++ b/arch/powerpc/platforms/pseries/msi.c @@ -0,0 +1,531 @@ +/* + * Copyright 2006 Jake Moilanen <moilanen@austin.ibm.com>, IBM Corp. + * Copyright 2006-2007 Michael Ellerman, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 of the + * License. 
+ * + */ + +#include <linux/device.h> +#include <linux/irq.h> +#include <linux/msi.h> + +#include <asm/rtas.h> +#include <asm/hw_irq.h> +#include <asm/ppc-pci.h> +#include <asm/machdep.h> + +#include "pseries.h" + +static int query_token, change_token; + +#define RTAS_QUERY_FN 0 +#define RTAS_CHANGE_FN 1 +#define RTAS_RESET_FN 2 +#define RTAS_CHANGE_MSI_FN 3 +#define RTAS_CHANGE_MSIX_FN 4 +#define RTAS_CHANGE_32MSI_FN 5 + +/* RTAS Helpers */ + +static int rtas_change_msi(struct pci_dn *pdn, u32 func, u32 num_irqs) +{ + u32 addr, seq_num, rtas_ret[3]; + unsigned long buid; + int rc; + + addr = rtas_config_addr(pdn->busno, pdn->devfn, 0); + buid = pdn->phb->buid; + + seq_num = 1; + do { + if (func == RTAS_CHANGE_MSI_FN || func == RTAS_CHANGE_MSIX_FN || + func == RTAS_CHANGE_32MSI_FN) + rc = rtas_call(change_token, 6, 4, rtas_ret, addr, + BUID_HI(buid), BUID_LO(buid), + func, num_irqs, seq_num); + else + rc = rtas_call(change_token, 6, 3, rtas_ret, addr, + BUID_HI(buid), BUID_LO(buid), + func, num_irqs, seq_num); + + seq_num = rtas_ret[1]; + } while (rtas_busy_delay(rc)); + + /* + * If the RTAS call succeeded, return the number of irqs allocated. + * If not, make sure we return a negative error code. + */ + if (rc == 0) + rc = rtas_ret[0]; + else if (rc > 0) + rc = -rc; + + pr_debug("rtas_msi: ibm,change_msi(func=%d,num=%d), got %d rc = %d\n", + func, num_irqs, rtas_ret[0], rc); + + return rc; +} + +static void rtas_disable_msi(struct pci_dev *pdev) +{ + struct pci_dn *pdn; + + pdn = pci_get_pdn(pdev); + if (!pdn) + return; + + /* + * disabling MSI with the explicit interface also disables MSI-X + */ + if (rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, 0) != 0) { + /* + * may have failed because explicit interface is not + * present + */ + if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0) { + pr_debug("rtas_msi: Setting MSIs to 0 failed!\n"); + } + } +} + +static int rtas_query_irq_number(struct pci_dn *pdn, int offset) +{ + u32 addr, rtas_ret[2]; + unsigned long buid; + int rc; + + addr = rtas_config_addr(pdn->busno, pdn->devfn, 0); + buid = pdn->phb->buid; + + do { + rc = rtas_call(query_token, 4, 3, rtas_ret, addr, + BUID_HI(buid), BUID_LO(buid), offset); + } while (rtas_busy_delay(rc)); + + if (rc) { + pr_debug("rtas_msi: error (%d) querying source number\n", rc); + return rc; + } + + return rtas_ret[0]; +} + +static void rtas_teardown_msi_irqs(struct pci_dev *pdev) +{ + struct msi_desc *entry; + + for_each_pci_msi_entry(entry, pdev) { + if (!entry->irq) + continue; + + irq_set_msi_desc(entry->irq, NULL); + irq_dispose_mapping(entry->irq); + } + + rtas_disable_msi(pdev); +} + +static int check_req(struct pci_dev *pdev, int nvec, char *prop_name) +{ + struct device_node *dn; + const __be32 *p; + u32 req_msi; + + dn = pci_device_to_OF_node(pdev); + + p = of_get_property(dn, prop_name, NULL); + if (!p) { + pr_debug("rtas_msi: No %s on %pOF\n", prop_name, dn); + return -ENOENT; + } + + req_msi = be32_to_cpup(p); + if (req_msi < nvec) { + pr_debug("rtas_msi: %s requests < %d MSIs\n", prop_name, nvec); + + if (req_msi == 0) /* Be paranoid */ + return -ENOSPC; + + return req_msi; + } + + return 0; +} + +static int check_req_msi(struct pci_dev *pdev, int nvec) +{ + return check_req(pdev, nvec, "ibm,req#msi"); +} + +static int check_req_msix(struct pci_dev *pdev, int nvec) +{ + return check_req(pdev, nvec, "ibm,req#msi-x"); +} + +/* Quota calculation */ + +static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) +{ + struct device_node *dn; + const __be32 *p; + + dn = 
of_node_get(pci_device_to_OF_node(dev)); + while (dn) { + p = of_get_property(dn, "ibm,pe-total-#msi", NULL); + if (p) { + pr_debug("rtas_msi: found prop on dn %pOF\n", + dn); + *total = be32_to_cpup(p); + return dn; + } + + dn = of_get_next_parent(dn); + } + + return NULL; +} + +static struct device_node *find_pe_dn(struct pci_dev *dev, int *total) +{ + struct device_node *dn; + struct eeh_dev *edev; + + /* Found our PE and assume 8 at that point. */ + + dn = pci_device_to_OF_node(dev); + if (!dn) + return NULL; + + /* Get the top level device in the PE */ + edev = pdn_to_eeh_dev(PCI_DN(dn)); + if (edev->pe) + edev = list_first_entry(&edev->pe->edevs, struct eeh_dev, list); + dn = pci_device_to_OF_node(edev->pdev); + if (!dn) + return NULL; + + /* We actually want the parent */ + dn = of_get_parent(dn); + if (!dn) + return NULL; + + /* Hardcode of 8 for old firmwares */ + *total = 8; + pr_debug("rtas_msi: using PE dn %pOF\n", dn); + + return dn; +} + +struct msi_counts { + struct device_node *requestor; + int num_devices; + int request; + int quota; + int spare; + int over_quota; +}; + +static void *count_non_bridge_devices(struct device_node *dn, void *data) +{ + struct msi_counts *counts = data; + const __be32 *p; + u32 class; + + pr_debug("rtas_msi: counting %pOF\n", dn); + + p = of_get_property(dn, "class-code", NULL); + class = p ? be32_to_cpup(p) : 0; + + if ((class >> 8) != PCI_CLASS_BRIDGE_PCI) + counts->num_devices++; + + return NULL; +} + +static void *count_spare_msis(struct device_node *dn, void *data) +{ + struct msi_counts *counts = data; + const __be32 *p; + int req; + + if (dn == counts->requestor) + req = counts->request; + else { + /* We don't know if a driver will try to use MSI or MSI-X, + * so we just have to punt and use the larger of the two. */ + req = 0; + p = of_get_property(dn, "ibm,req#msi", NULL); + if (p) + req = be32_to_cpup(p); + + p = of_get_property(dn, "ibm,req#msi-x", NULL); + if (p) + req = max(req, (int)be32_to_cpup(p)); + } + + if (req < counts->quota) + counts->spare += counts->quota - req; + else if (req > counts->quota) + counts->over_quota++; + + return NULL; +} + +static int msi_quota_for_device(struct pci_dev *dev, int request) +{ + struct device_node *pe_dn; + struct msi_counts counts; + int total; + + pr_debug("rtas_msi: calc quota for %s, request %d\n", pci_name(dev), + request); + + pe_dn = find_pe_total_msi(dev, &total); + if (!pe_dn) + pe_dn = find_pe_dn(dev, &total); + + if (!pe_dn) { + pr_err("rtas_msi: couldn't find PE for %s\n", pci_name(dev)); + goto out; + } + + pr_debug("rtas_msi: found PE %pOF\n", pe_dn); + + memset(&counts, 0, sizeof(struct msi_counts)); + + /* Work out how many devices we have below this PE */ + pci_traverse_device_nodes(pe_dn, count_non_bridge_devices, &counts); + + if (counts.num_devices == 0) { + pr_err("rtas_msi: found 0 devices under PE for %s\n", + pci_name(dev)); + goto out; + } + + counts.quota = total / counts.num_devices; + if (request <= counts.quota) + goto out; + + /* else, we have some more calculating to do */ + counts.requestor = pci_device_to_OF_node(dev); + counts.request = request; + pci_traverse_device_nodes(pe_dn, count_spare_msis, &counts); + + /* If the quota isn't an integer multiple of the total, we can + * use the remainder as spare MSIs for anyone that wants them. 
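+ * + * Worked example (hypothetical numbers): with total = 16 MSIs across + * num_devices = 3, quota = 16 / 3 = 5 and 16 % 3 = 1 is added to the + * spare pool below, then divided among any over-quota requestors.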
*/ + counts.spare += total % counts.num_devices; + + /* Divide any spare by the number of over-quota requestors */ + if (counts.over_quota) + counts.quota += counts.spare / counts.over_quota; + + /* And finally clamp the request to the possibly adjusted quota */ + request = min(counts.quota, request); + + pr_debug("rtas_msi: request clamped to quota %d\n", request); +out: + of_node_put(pe_dn); + + return request; +} + +static int check_msix_entries(struct pci_dev *pdev) +{ + struct msi_desc *entry; + int expected; + + /* There's no way for us to express to firmware that we want + * a discontiguous, or non-zero based, range of MSI-X entries. + * So we must reject such requests. */ + + expected = 0; + for_each_pci_msi_entry(entry, pdev) { + if (entry->msi_attrib.entry_nr != expected) { + pr_debug("rtas_msi: bad MSI-X entries.\n"); + return -EINVAL; + } + expected++; + } + + return 0; +} + +static void rtas_hack_32bit_msi_gen2(struct pci_dev *pdev) +{ + u32 addr_hi, addr_lo; + + /* + * We should only get in here for IODA1 configs. This is based on the + * fact that we are using RTAS for MSIs, we don't have the 32 bit MSI RTAS + * support, and we are in a PCIe Gen2 slot. + */ + dev_info(&pdev->dev, + "rtas_msi: No 32 bit MSI firmware support, forcing 32 bit MSI\n"); + pci_read_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, &addr_hi); + addr_lo = 0xffff0000 | ((addr_hi >> (48 - 32)) << 4); + pci_write_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, addr_lo); + pci_write_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, 0); +} + +static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type) +{ + struct pci_dn *pdn; + int hwirq, virq, i, quota, rc; + struct msi_desc *entry; + struct msi_msg msg; + int nvec = nvec_in; + int use_32bit_msi_hack = 0; + + if (type == PCI_CAP_ID_MSIX) + rc = check_req_msix(pdev, nvec); + else + rc = check_req_msi(pdev, nvec); + + if (rc) + return rc; + + quota = msi_quota_for_device(pdev, nvec); + + if (quota && quota < nvec) + return quota; + + if (type == PCI_CAP_ID_MSIX && check_msix_entries(pdev)) + return -EINVAL; + + /* + * Firmware currently refuses any non-power-of-two allocation + * so we round up if the quota will allow it. + */ + if (type == PCI_CAP_ID_MSIX) { + int m = roundup_pow_of_two(nvec); + quota = msi_quota_for_device(pdev, m); + + if (quota >= m) + nvec = m; + } + + pdn = pci_get_pdn(pdev); + + /* + * Try the new, more explicit firmware interface; if that fails, fall + * back to the old interface. The old interface is known to never + * return MSI-Xs.
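+ * + * The resulting order of attempts for MSI (per the code below) is: + * RTAS_CHANGE_32MSI_FN (only when pdev->no_64bit_msi is set), then + * RTAS_CHANGE_MSI_FN, then the old RTAS_CHANGE_FN; MSI-X goes straight + * to RTAS_CHANGE_MSIX_FN.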
+ */ +again: + if (type == PCI_CAP_ID_MSI) { + if (pdev->no_64bit_msi) { + rc = rtas_change_msi(pdn, RTAS_CHANGE_32MSI_FN, nvec); + if (rc < 0) { + /* + * We only want to run the 32 bit MSI hack below if + * the max bus speed is Gen2 speed + */ + if (pdev->bus->max_bus_speed != PCIE_SPEED_5_0GT) + return rc; + + use_32bit_msi_hack = 1; + } + } else + rc = -1; + + if (rc < 0) + rc = rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, nvec); + + if (rc < 0) { + pr_debug("rtas_msi: trying the old firmware call.\n"); + rc = rtas_change_msi(pdn, RTAS_CHANGE_FN, nvec); + } + + if (use_32bit_msi_hack && rc > 0) + rtas_hack_32bit_msi_gen2(pdev); + } else + rc = rtas_change_msi(pdn, RTAS_CHANGE_MSIX_FN, nvec); + + if (rc != nvec) { + if (nvec != nvec_in) { + nvec = nvec_in; + goto again; + } + pr_debug("rtas_msi: rtas_change_msi() failed\n"); + return rc; + } + + i = 0; + for_each_pci_msi_entry(entry, pdev) { + hwirq = rtas_query_irq_number(pdn, i++); + if (hwirq < 0) { + pr_debug("rtas_msi: error (%d) getting hwirq\n", rc); + return hwirq; + } + + virq = irq_create_mapping(NULL, hwirq); + + if (!virq) { + pr_debug("rtas_msi: Failed mapping hwirq %d\n", hwirq); + return -ENOSPC; + } + + dev_dbg(&pdev->dev, "rtas_msi: allocated virq %d\n", virq); + irq_set_msi_desc(virq, entry); + + /* Read config space back so we can restore after reset */ + __pci_read_msi_msg(entry, &msg); + entry->msg = msg; + } + + return 0; +} + +static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev) +{ + /* No LSI -> leave MSIs (if any) configured */ + if (!pdev->irq) { + dev_dbg(&pdev->dev, "rtas_msi: no LSI, nothing to do.\n"); + return; + } + + /* No MSI -> MSIs can't have been assigned by fw, leave LSI */ + if (check_req_msi(pdev, 1) && check_req_msix(pdev, 1)) { + dev_dbg(&pdev->dev, "rtas_msi: no req#msi/x, nothing to do.\n"); + return; + } + + dev_dbg(&pdev->dev, "rtas_msi: disabling existing MSI.\n"); + rtas_disable_msi(pdev); +} + +static int rtas_msi_init(void) +{ + struct pci_controller *phb; + + query_token = rtas_token("ibm,query-interrupt-source-number"); + change_token = rtas_token("ibm,change-msi"); + + if ((query_token == RTAS_UNKNOWN_SERVICE) || + (change_token == RTAS_UNKNOWN_SERVICE)) { + pr_debug("rtas_msi: no RTAS tokens, no MSI support.\n"); + return -1; + } + + pr_debug("rtas_msi: Registering RTAS MSI callbacks.\n"); + + WARN_ON(pseries_pci_controller_ops.setup_msi_irqs); + pseries_pci_controller_ops.setup_msi_irqs = rtas_setup_msi_irqs; + pseries_pci_controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs; + + list_for_each_entry(phb, &hose_list, list_node) { + WARN_ON(phb->controller_ops.setup_msi_irqs); + phb->controller_ops.setup_msi_irqs = rtas_setup_msi_irqs; + phb->controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs; + } + + WARN_ON(ppc_md.pci_irq_fixup); + ppc_md.pci_irq_fixup = rtas_msi_pci_irq_fixup; + + return 0; +} +machine_arch_initcall(pseries, rtas_msi_init); diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c new file mode 100644 index 000000000..69cedc1b3 --- /dev/null +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -0,0 +1,247 @@ +/* + * c 2001 PPC 64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * /dev/nvram driver for PPC64 + * + * This perhaps should live in drivers/char + */ + + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/ctype.h> +#include <linux/uaccess.h> +#include <asm/nvram.h> +#include <asm/rtas.h> +#include <asm/prom.h> +#include <asm/machdep.h> + +/* Max bytes to read/write in one go */ +#define NVRW_CNT 0x20 + +static unsigned int nvram_size; +static int nvram_fetch, nvram_store; +static char nvram_buf[NVRW_CNT]; /* assume this is in the first 4GB */ +static DEFINE_SPINLOCK(nvram_lock); + +/* See clobbering_unread_rtas_event() */ +#define NVRAM_RTAS_READ_TIMEOUT 5 /* seconds */ +static time64_t last_unread_rtas_event; /* timestamp */ + +#ifdef CONFIG_PSTORE +time64_t last_rtas_event; +#endif + +static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index) +{ + unsigned int i; + unsigned long len; + int done; + unsigned long flags; + char *p = buf; + + + if (nvram_size == 0 || nvram_fetch == RTAS_UNKNOWN_SERVICE) + return -ENODEV; + + if (*index >= nvram_size) + return 0; + + i = *index; + if (i + count > nvram_size) + count = nvram_size - i; + + spin_lock_irqsave(&nvram_lock, flags); + + for (; count != 0; count -= len) { + len = count; + if (len > NVRW_CNT) + len = NVRW_CNT; + + if ((rtas_call(nvram_fetch, 3, 2, &done, i, __pa(nvram_buf), + len) != 0) || len != done) { + spin_unlock_irqrestore(&nvram_lock, flags); + return -EIO; + } + + memcpy(p, nvram_buf, len); + + p += len; + i += len; + } + + spin_unlock_irqrestore(&nvram_lock, flags); + + *index = i; + return p - buf; +} + +static ssize_t pSeries_nvram_write(char *buf, size_t count, loff_t *index) +{ + unsigned int i; + unsigned long len; + int done; + unsigned long flags; + const char *p = buf; + + if (nvram_size == 0 || nvram_store == RTAS_UNKNOWN_SERVICE) + return -ENODEV; + + if (*index >= nvram_size) + return 0; + + i = *index; + if (i + count > nvram_size) + count = nvram_size - i; + + spin_lock_irqsave(&nvram_lock, flags); + + for (; count != 0; count -= len) { + len = count; + if (len > NVRW_CNT) + len = NVRW_CNT; + + memcpy(nvram_buf, p, len); + + if ((rtas_call(nvram_store, 3, 2, &done, i, __pa(nvram_buf), + len) != 0) || len != done) { + spin_unlock_irqrestore(&nvram_lock, flags); + return -EIO; + } + + p += len; + i += len; + } + spin_unlock_irqrestore(&nvram_lock, flags); + + *index = i; + return p - buf; +} + +static ssize_t pSeries_nvram_get_size(void) +{ + return nvram_size ? nvram_size : -ENODEV; +} + +/* nvram_write_error_log + * + * We need to buffer the error logs into nvram to ensure that we have + * the failure information to decode. + */ +int nvram_write_error_log(char * buff, int length, + unsigned int err_type, unsigned int error_log_cnt) +{ + int rc = nvram_write_os_partition(&rtas_log_partition, buff, length, + err_type, error_log_cnt); + if (!rc) { + last_unread_rtas_event = ktime_get_real_seconds(); +#ifdef CONFIG_PSTORE + last_rtas_event = ktime_get_real_seconds(); +#endif + } + + return rc; +} + +/* nvram_read_error_log + * + * Reads nvram for error log for at most 'length' + */ +int nvram_read_error_log(char *buff, int length, + unsigned int *err_type, unsigned int *error_log_cnt) +{ + return nvram_read_partition(&rtas_log_partition, buff, length, + err_type, error_log_cnt); +} + +/* This doesn't actually zero anything, but it sets the event_logged + * word to tell that this event is safely in syslog. 
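+ * + * Concretely, the code below overwrites only the first word of the RTAS + * log partition with ERR_FLAG_ALREADY_LOGGED; the log data itself is + * left in place.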
+ */ +int nvram_clear_error_log(void) +{ + loff_t tmp_index; + int clear_word = ERR_FLAG_ALREADY_LOGGED; + int rc; + + if (rtas_log_partition.index == -1) + return -1; + + tmp_index = rtas_log_partition.index; + + rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index); + if (rc <= 0) { + printk(KERN_ERR "nvram_clear_error_log: Failed nvram_write (%d)\n", rc); + return rc; + } + last_unread_rtas_event = 0; + + return 0; +} + +/* + * Are we using the ibm,rtas-log for oops/panic reports? And if so, + * would logging this oops/panic overwrite an RTAS event that rtas_errd + * hasn't had a chance to read and process? Return 1 if so, else 0. + * + * We assume that if rtas_errd hasn't read the RTAS event in + * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to. + */ +int clobbering_unread_rtas_event(void) +{ + return (oops_log_partition.index == rtas_log_partition.index + && last_unread_rtas_event + && ktime_get_real_seconds() - last_unread_rtas_event <= + NVRAM_RTAS_READ_TIMEOUT); +} + +static int __init pseries_nvram_init_log_partitions(void) +{ + int rc; + + /* Scan nvram for partitions */ + nvram_scan_partitions(); + + rc = nvram_init_os_partition(&rtas_log_partition); + nvram_init_oops_partition(rc == 0); + return 0; +} +machine_arch_initcall(pseries, pseries_nvram_init_log_partitions); + +int __init pSeries_nvram_init(void) +{ + struct device_node *nvram; + const __be32 *nbytes_p; + unsigned int proplen; + + nvram = of_find_node_by_type(NULL, "nvram"); + if (nvram == NULL) + return -ENODEV; + + nbytes_p = of_get_property(nvram, "#bytes", &proplen); + if (nbytes_p == NULL || proplen != sizeof(unsigned int)) { + of_node_put(nvram); + return -EIO; + } + + nvram_size = be32_to_cpup(nbytes_p); + + nvram_fetch = rtas_token("nvram-fetch"); + nvram_store = rtas_token("nvram-store"); + printk(KERN_INFO "PPC64 nvram contains %d bytes\n", nvram_size); + of_node_put(nvram); + + ppc_md.nvram_read = pSeries_nvram_read; + ppc_md.nvram_write = pSeries_nvram_write; + ppc_md.nvram_size = pSeries_nvram_get_size; + + return 0; +} + diff --git a/arch/powerpc/platforms/pseries/of_helpers.c b/arch/powerpc/platforms/pseries/of_helpers.c new file mode 100644 index 000000000..6df192f38 --- /dev/null +++ b/arch/powerpc/platforms/pseries/of_helpers.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/string.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/of.h> +#include <asm/prom.h> + +#include "of_helpers.h" + +/** + * pseries_of_derive_parent - basically like dirname(1) + * @path: the full_name of a node to be added to the tree + * + * Returns the node which should be the parent of the node + * described by path. E.g., for path = "/foo/bar", returns + * the node with full_name = "/foo". + */ +struct device_node *pseries_of_derive_parent(const char *path) +{ + struct device_node *parent; + char *parent_path = "/"; + const char *tail; + + /* We do not want the trailing '/' character */ + tail = kbasename(path) - 1; + + /* reject if path is "/" */ + if (!strcmp(path, "/")) + return ERR_PTR(-EINVAL); + + if (tail > path) { + parent_path = kstrndup(path, tail - path, GFP_KERNEL); + if (!parent_path) + return ERR_PTR(-ENOMEM); + } + parent = of_find_node_by_path(parent_path); + if (strcmp(parent_path, "/")) + kfree(parent_path); + return parent ? 
parent : ERR_PTR(-EINVAL); +} + + +/* Helper Routines to convert between drc_index to cpu numbers */ + +int of_read_drc_info_cell(struct property **prop, const __be32 **curval, + struct of_drc_info *data) +{ + const char *p; + const __be32 *p2; + + if (!data) + return -EINVAL; + + /* Get drc-type:encode-string */ + p = data->drc_type = (char*) (*curval); + p = of_prop_next_string(*prop, p); + if (!p) + return -EINVAL; + + /* Get drc-name-prefix:encode-string */ + data->drc_name_prefix = (char *)p; + p = of_prop_next_string(*prop, p); + if (!p) + return -EINVAL; + + /* Get drc-index-start:encode-int */ + p2 = (const __be32 *)p; + p2 = of_prop_next_u32(*prop, p2, &data->drc_index_start); + if (!p2) + return -EINVAL; + + /* Get drc-name-suffix-start:encode-int */ + p2 = of_prop_next_u32(*prop, p2, &data->drc_name_suffix_start); + if (!p2) + return -EINVAL; + + /* Get number-sequential-elements:encode-int */ + p2 = of_prop_next_u32(*prop, p2, &data->num_sequential_elems); + if (!p2) + return -EINVAL; + + /* Get sequential-increment:encode-int */ + p2 = of_prop_next_u32(*prop, p2, &data->sequential_inc); + if (!p2) + return -EINVAL; + + /* Get drc-power-domain:encode-int */ + p2 = of_prop_next_u32(*prop, p2, &data->drc_power_domain); + if (!p2) + return -EINVAL; + + /* Should now know end of current entry */ + (*curval) = (void *)p2; + data->last_drc_index = data->drc_index_start + + ((data->num_sequential_elems - 1) * data->sequential_inc); + + return 0; +} +EXPORT_SYMBOL(of_read_drc_info_cell); diff --git a/arch/powerpc/platforms/pseries/of_helpers.h b/arch/powerpc/platforms/pseries/of_helpers.h new file mode 100644 index 000000000..decad6553 --- /dev/null +++ b/arch/powerpc/platforms/pseries/of_helpers.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PSERIES_OF_HELPERS_H +#define _PSERIES_OF_HELPERS_H + +#include <linux/of.h> + +struct device_node *pseries_of_derive_parent(const char *path); + +#endif /* _PSERIES_OF_HELPERS_H */ diff --git a/arch/powerpc/platforms/pseries/offline_states.h b/arch/powerpc/platforms/pseries/offline_states.h new file mode 100644 index 000000000..51414aee2 --- /dev/null +++ b/arch/powerpc/platforms/pseries/offline_states.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _OFFLINE_STATES_H_ +#define _OFFLINE_STATES_H_ + +/* Cpu offline states go here */ +enum cpu_state_vals { + CPU_STATE_OFFLINE, + CPU_STATE_INACTIVE, + CPU_STATE_ONLINE, + CPU_MAX_OFFLINE_STATES +}; + +#ifdef CONFIG_HOTPLUG_CPU +extern enum cpu_state_vals get_cpu_current_state(int cpu); +extern void set_cpu_current_state(int cpu, enum cpu_state_vals state); +extern void set_preferred_offline_state(int cpu, enum cpu_state_vals state); +extern void set_default_offline_state(int cpu); +#else +static inline enum cpu_state_vals get_cpu_current_state(int cpu) +{ + return CPU_STATE_ONLINE; +} + +static inline void set_cpu_current_state(int cpu, enum cpu_state_vals state) +{ +} + +static inline void set_preferred_offline_state(int cpu, enum cpu_state_vals state) +{ +} + +static inline void set_default_offline_state(int cpu) +{ +} +#endif + +extern enum cpu_state_vals get_preferred_offline_state(int cpu); +#endif diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c new file mode 100644 index 000000000..eab96637d --- /dev/null +++ b/arch/powerpc/platforms/pseries/pci.c @@ -0,0 +1,345 @@ +/* + * Copyright (C) 2001 Dave Engebretsen, IBM Corporation + * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM + * + * pSeries specific 
routines for PCI. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/init.h> +#include <linux/ioport.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/string.h> + +#include <asm/eeh.h> +#include <asm/pci-bridge.h> +#include <asm/prom.h> +#include <asm/ppc-pci.h> +#include "pseries.h" + +#if 0 +void pcibios_name_device(struct pci_dev *dev) +{ + struct device_node *dn; + + /* + * Add IBM loc code (slot) as a prefix to the device names for service + */ + dn = pci_device_to_OF_node(dev); + if (dn) { + const char *loc_code = of_get_property(dn, "ibm,loc-code", + NULL); + if (loc_code) { + int loc_len = strlen(loc_code); + if (loc_len < sizeof(dev->dev.name)) { + memmove(dev->dev.name+loc_len+1, dev->dev.name, + sizeof(dev->dev.name)-loc_len-1); + memcpy(dev->dev.name, loc_code, loc_len); + dev->dev.name[loc_len] = ' '; + dev->dev.name[sizeof(dev->dev.name)-1] = '\0'; + } + } + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pcibios_name_device); +#endif + +#ifdef CONFIG_PCI_IOV +#define MAX_VFS_FOR_MAP_PE 256 +struct pe_map_bar_entry { + __be64 bar; /* Input: Virtual Function BAR */ + __be16 rid; /* Input: Virtual Function Router ID */ + __be16 pe_num; /* Output: Virtual Function PE Number */ + __be32 reserved; /* Reserved Space */ +}; + +int pseries_send_map_pe(struct pci_dev *pdev, + u16 num_vfs, + struct pe_map_bar_entry *vf_pe_array) +{ + struct pci_dn *pdn; + int rc; + unsigned long buid, addr; + int ibm_map_pes = rtas_token("ibm,open-sriov-map-pe-number"); + + if (ibm_map_pes == RTAS_UNKNOWN_SERVICE) + return -EINVAL; + + pdn = pci_get_pdn(pdev); + addr = rtas_config_addr(pdn->busno, pdn->devfn, 0); + buid = pdn->phb->buid; + spin_lock(&rtas_data_buf_lock); + memcpy(rtas_data_buf, vf_pe_array, + RTAS_DATA_BUF_SIZE); + rc = rtas_call(ibm_map_pes, 5, 1, NULL, addr, + BUID_HI(buid), BUID_LO(buid), + rtas_data_buf, + num_vfs * sizeof(struct pe_map_bar_entry)); + memcpy(vf_pe_array, rtas_data_buf, RTAS_DATA_BUF_SIZE); + spin_unlock(&rtas_data_buf_lock); + + if (rc) + dev_err(&pdev->dev, + "%s: Failed to associate pes PE#%lx, rc=%x\n", + __func__, addr, rc); + + return rc; +} + +void pseries_set_pe_num(struct pci_dev *pdev, u16 vf_index, __be16 pe_num) +{ + struct pci_dn *pdn; + + pdn = pci_get_pdn(pdev); + pdn->pe_num_map[vf_index] = be16_to_cpu(pe_num); + dev_dbg(&pdev->dev, "VF %04x:%02x:%02x.%x associated with PE#%x\n", + pci_domain_nr(pdev->bus), + pdev->bus->number, + PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)), + PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), + pdn->pe_num_map[vf_index]); +} + +int pseries_associate_pes(struct pci_dev *pdev, u16 num_vfs) +{ + struct pci_dn *pdn; + int i, rc, vf_index; + struct pe_map_bar_entry *vf_pe_array; + struct resource *res; + u64 size; + + vf_pe_array = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); + if (!vf_pe_array) + return 
-ENOMEM; + + pdn = pci_get_pdn(pdev); + /* create firmware structure to associate pes */ + for (vf_index = 0; vf_index < num_vfs; vf_index++) { + pdn->pe_num_map[vf_index] = IODA_INVALID_PE; + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &pdev->resource[i + PCI_IOV_RESOURCES]; + if (!res->parent) + continue; + size = pcibios_iov_resource_alignment(pdev, i + + PCI_IOV_RESOURCES); + vf_pe_array[vf_index].bar = + cpu_to_be64(res->start + size * vf_index); + vf_pe_array[vf_index].rid = + cpu_to_be16((pci_iov_virtfn_bus(pdev, vf_index) + << 8) | pci_iov_virtfn_devfn(pdev, + vf_index)); + vf_pe_array[vf_index].pe_num = + cpu_to_be16(IODA_INVALID_PE); + } + } + + rc = pseries_send_map_pe(pdev, num_vfs, vf_pe_array); + /* Only zero is success */ + if (!rc) + for (vf_index = 0; vf_index < num_vfs; vf_index++) + pseries_set_pe_num(pdev, vf_index, + vf_pe_array[vf_index].pe_num); + + kfree(vf_pe_array); + return rc; +} + +int pseries_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) +{ + struct pci_dn *pdn; + int rc; + const int *max_vfs; + int max_config_vfs; + struct device_node *dn = pci_device_to_OF_node(pdev); + + max_vfs = of_get_property(dn, "ibm,number-of-configurable-vfs", NULL); + + if (!max_vfs) + return -EINVAL; + + /* First integer stores max config */ + max_config_vfs = of_read_number(&max_vfs[0], 1); + if (max_config_vfs < num_vfs && num_vfs > MAX_VFS_FOR_MAP_PE) { + dev_err(&pdev->dev, + "Num VFs %x > %x Configurable VFs\n", + num_vfs, (num_vfs > MAX_VFS_FOR_MAP_PE) ? + MAX_VFS_FOR_MAP_PE : max_config_vfs); + return -EINVAL; + } + + pdn = pci_get_pdn(pdev); + pdn->pe_num_map = kmalloc_array(num_vfs, + sizeof(*pdn->pe_num_map), + GFP_KERNEL); + if (!pdn->pe_num_map) + return -ENOMEM; + + rc = pseries_associate_pes(pdev, num_vfs); + + /* Anything other than zero is failure */ + if (rc) { + dev_err(&pdev->dev, "Failure to enable sriov: %x\n", rc); + kfree(pdn->pe_num_map); + } else { + pci_vf_drivers_autoprobe(pdev, false); + } + + return rc; +} + +int pseries_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) +{ + /* Allocate PCI data */ + add_dev_pci_data(pdev); + return pseries_pci_sriov_enable(pdev, num_vfs); +} + +int pseries_pcibios_sriov_disable(struct pci_dev *pdev) +{ + struct pci_dn *pdn; + + pdn = pci_get_pdn(pdev); + /* Releasing pe_num_map */ + kfree(pdn->pe_num_map); + /* Release PCI data */ + remove_dev_pci_data(pdev); + pci_vf_drivers_autoprobe(pdev, true); + return 0; +} +#endif + +static void __init pSeries_request_regions(void) +{ + if (!isa_io_base) + return; + + request_region(0x20,0x20,"pic1"); + request_region(0xa0,0x20,"pic2"); + request_region(0x00,0x20,"dma1"); + request_region(0x40,0x20,"timer"); + request_region(0x80,0x10,"dma page reg"); + request_region(0xc0,0x20,"dma2"); +} + +void __init pSeries_final_fixup(void) +{ + pSeries_request_regions(); + + eeh_addr_cache_build(); + +#ifdef CONFIG_PCI_IOV + ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable; + ppc_md.pcibios_sriov_disable = pseries_pcibios_sriov_disable; +#endif +} + +/* + * Assume the winbond 82c105 is the IDE controller on a + * p610/p615/p630. We should probably be more careful in case + * someone tries to plug in a similar adapter. 
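+ * + * The fixup below sets bit 11 (LEGIRQ) of config dword 0x40 so the chip + * signals interrupts through the interrupt controller instead of legacy + * ISA IRQs, and zaps the stale I/O resources of the second function + * (devfn 0x81).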
+ */ +static void fixup_winbond_82c105(struct pci_dev* dev) +{ + int i; + unsigned int reg; + + if (!machine_is(pseries)) + return; + + printk("Using INTC for W82c105 IDE controller.\n"); + pci_read_config_dword(dev, 0x40, &reg); + /* Enable LEGIRQ to use INTC instead of ISA interrupts */ + pci_write_config_dword(dev, 0x40, reg | (1<<11)); + + for (i = 0; i < DEVICE_COUNT_RESOURCE; ++i) { + /* zap the 2nd function of the winbond chip */ + if (dev->resource[i].flags & IORESOURCE_IO + && dev->bus->number == 0 && dev->devfn == 0x81) + dev->resource[i].flags &= ~IORESOURCE_IO; + if (dev->resource[i].start == 0 && dev->resource[i].end) { + dev->resource[i].flags = 0; + dev->resource[i].end = 0; + } + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_WINBOND, PCI_DEVICE_ID_WINBOND_82C105, + fixup_winbond_82c105); + +int pseries_root_bridge_prepare(struct pci_host_bridge *bridge) +{ + struct device_node *dn, *pdn; + struct pci_bus *bus; + u32 pcie_link_speed_stats[2]; + int rc; + + bus = bridge->bus; + + /* Rely on the pcibios_free_controller_deferred() callback. */ + pci_set_host_bridge_release(bridge, pcibios_free_controller_deferred, + (void *) pci_bus_to_host(bus)); + + dn = pcibios_get_phb_of_node(bus); + if (!dn) + return 0; + + for (pdn = dn; pdn != NULL; pdn = of_get_next_parent(pdn)) { + rc = of_property_read_u32_array(pdn, + "ibm,pcie-link-speed-stats", + &pcie_link_speed_stats[0], 2); + if (!rc) + break; + } + + of_node_put(pdn); + + if (rc) { + pr_debug("no ibm,pcie-link-speed-stats property\n"); + return 0; + } + + switch (pcie_link_speed_stats[0]) { + case 0x01: + bus->max_bus_speed = PCIE_SPEED_2_5GT; + break; + case 0x02: + bus->max_bus_speed = PCIE_SPEED_5_0GT; + break; + case 0x04: + bus->max_bus_speed = PCIE_SPEED_8_0GT; + break; + default: + bus->max_bus_speed = PCI_SPEED_UNKNOWN; + break; + } + + switch (pcie_link_speed_stats[1]) { + case 0x01: + bus->cur_bus_speed = PCIE_SPEED_2_5GT; + break; + case 0x02: + bus->cur_bus_speed = PCIE_SPEED_5_0GT; + break; + case 0x04: + bus->cur_bus_speed = PCIE_SPEED_8_0GT; + break; + default: + bus->cur_bus_speed = PCI_SPEED_UNKNOWN; + break; + } + + return 0; +} diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c new file mode 100644 index 000000000..afca4b737 --- /dev/null +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -0,0 +1,119 @@ +/* + * PCI Dynamic LPAR, PCI Hot Plug and PCI EEH recovery code + * for RPA-compliant PPC64 platform. + * Copyright (C) 2003 Linda Xie <lxie@us.ibm.com> + * Copyright (C) 2005 International Business Machines + * + * Updates, 2005, John Rose <johnrose@austin.ibm.com> + * Updates, 2005, Linas Vepstas <linas@austin.ibm.com> + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
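
(Editorial sketch, not part of the patch.) pseries_root_bridge_prepare() above walks up from the PHB node until an ibm,pcie-link-speed-stats property is found; cell 0 encodes the maximum trained link speed and cell 1 the current one, using the 0x01/0x02/0x04 codes decoded by the two switch statements. With a hypothetical property value:

	/* Hypothetical: <0x04 0x02>, a Gen3-capable link currently trained to Gen2 */
	u32 stats[2] = { 0x04, 0x02 };
	/* stats[0] == 0x04 -> max_bus_speed = PCIE_SPEED_8_0GT
	 * stats[1] == 0x02 -> cur_bus_speed = PCIE_SPEED_5_0GT */
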
+ */ + +#include <linux/pci.h> +#include <linux/export.h> +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> +#include <asm/firmware.h> +#include <asm/eeh.h> + +#include "pseries.h" + +struct pci_controller *init_phb_dynamic(struct device_node *dn) +{ + struct pci_controller *phb; + + pr_debug("PCI: Initializing new hotplug PHB %pOF\n", dn); + + phb = pcibios_alloc_controller(dn); + if (!phb) + return NULL; + rtas_setup_phb(phb); + pci_process_bridge_OF_ranges(phb, dn, 0); + phb->controller_ops = pseries_pci_controller_ops; + + pci_devs_phb_init_dynamic(phb); + + /* Create EEH devices for the PHB */ + eeh_dev_phb_init_dynamic(phb); + + if (dn->child) + eeh_add_device_tree_early(PCI_DN(dn)); + + pcibios_scan_phb(phb); + pcibios_finish_adding_to_bus(phb->bus); + + return phb; +} +EXPORT_SYMBOL_GPL(init_phb_dynamic); + +/* RPA-specific bits for removing PHBs */ +int remove_phb_dynamic(struct pci_controller *phb) +{ + struct pci_bus *b = phb->bus; + struct pci_host_bridge *host_bridge = to_pci_host_bridge(b->bridge); + struct resource *res; + int rc, i; + + pr_debug("PCI: Removing PHB %04x:%02x...\n", + pci_domain_nr(b), b->number); + + /* We cannot remove a root bus that has children */ + if (!(list_empty(&b->children) && list_empty(&b->devices))) + return -EBUSY; + + /* We -know- there aren't any child devices anymore at this stage + * and thus, we can safely unmap the IO space as it's not in use + */ + res = &phb->io_resource; + if (res->flags & IORESOURCE_IO) { + rc = pcibios_unmap_io_space(b); + if (rc) { + printk(KERN_ERR "%s: failed to unmap IO on bus %s\n", + __func__, b->name); + return 1; + } + } + + /* Remove the PCI bus and unregister the bridge device from sysfs */ + phb->bus = NULL; + pci_remove_bus(b); + host_bridge->bus = NULL; + device_unregister(&host_bridge->dev); + + /* Now release the IO resource */ + if (res->flags & IORESOURCE_IO) + release_resource(res); + + /* Release memory resources */ + for (i = 0; i < 3; ++i) { + res = &phb->mem_resources[i]; + if (!(res->flags & IORESOURCE_MEM)) + continue; + release_resource(res); + } + + /* + * The pci_controller data structure is freed by + * the pcibios_free_controller_deferred() callback; + * see pseries_root_bridge_prepare(). + */ + + return 0; +} +EXPORT_SYMBOL_GPL(remove_phb_dynamic); diff --git a/arch/powerpc/platforms/pseries/power.c b/arch/powerpc/platforms/pseries/power.c new file mode 100644 index 000000000..a4a0b57d1 --- /dev/null +++ b/arch/powerpc/platforms/pseries/power.c @@ -0,0 +1,84 @@ +/* + * Interface for power-management for ppc64 compliant platform + * + * Manish Ahuja <mahuja@us.ibm.com> + * + * Feb 2007 + * + * Copyright (C) 2007 IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
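
(Editorial sketch, not part of the patch.) remove_phb_dynamic() above refuses to touch a root bus that still has children, so a DLPAR remove is expected to hot-unplug every device first and only then drop the PHB. Roughly:

	/* Sketch of the assumed tear-down order for a PHB "phb" */
	/* 1. hot-remove all devices under phb->bus, so both the bus's
	 *    children and devices lists are empty */
	/* 2. then drop the PHB itself: */
	int rc = remove_phb_dynamic(phb);	/* -EBUSY if anything remains */
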
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kobject.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <asm/machdep.h> + +#include "pseries.h" + +unsigned long rtas_poweron_auto; /* default and normal state is 0 */ + +static ssize_t auto_poweron_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", rtas_poweron_auto); +} + +static ssize_t auto_poweron_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int ret; + unsigned long ups_restart; + ret = sscanf(buf, "%lu", &ups_restart); + + if ((ret == 1) && ((ups_restart == 1) || (ups_restart == 0))){ + rtas_poweron_auto = ups_restart; + return n; + } + return -EINVAL; +} + +static struct kobj_attribute auto_poweron_attr = + __ATTR(auto_poweron, 0644, auto_poweron_show, auto_poweron_store); + +#ifndef CONFIG_PM +struct kobject *power_kobj; + +static struct attribute *g[] = { + &auto_poweron_attr.attr, + NULL, +}; + +static struct attribute_group attr_group = { + .attrs = g, +}; + +static int __init pm_init(void) +{ + power_kobj = kobject_create_and_add("power", NULL); + if (!power_kobj) + return -ENOMEM; + return sysfs_create_group(power_kobj, &attr_group); +} +machine_core_initcall(pseries, pm_init); +#else +static int __init apo_pm_init(void) +{ + return (sysfs_create_file(power_kobj, &auto_poweron_attr.attr)); +} +machine_device_initcall(pseries, apo_pm_init); +#endif diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h new file mode 100644 index 000000000..60db2ee51 --- /dev/null +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -0,0 +1,113 @@ +/* + * Copyright 2006 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
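
(Editorial sketch, not part of the patch.) auto_poweron_store() above accepts exactly "0" or "1" written to the auto_poweron attribute. The sscanf()-based validation could equally be written with kstrtoul(), which also rejects trailing junk:

	unsigned long val;

	if (kstrtoul(buf, 10, &val) || val > 1)
		return -EINVAL;
	rtas_poweron_auto = val;
	return n;
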
+ */ + +#ifndef _PSERIES_PSERIES_H +#define _PSERIES_PSERIES_H + +#include <linux/interrupt.h> +#include <asm/rtas.h> + +struct device_node; + +extern void request_event_sources_irqs(struct device_node *np, + irq_handler_t handler, const char *name); + +#include <linux/of.h> + +struct pt_regs; + +extern int pSeries_system_reset_exception(struct pt_regs *regs); +extern int pSeries_machine_check_exception(struct pt_regs *regs); + +#ifdef CONFIG_SMP +extern void smp_init_pseries(void); + +/* Get state of physical CPU from query_cpu_stopped */ +int smp_query_cpu_stopped(unsigned int pcpu); +#define QCSS_STOPPED 0 +#define QCSS_STOPPING 1 +#define QCSS_NOT_STOPPED 2 +#define QCSS_HARDWARE_ERROR -1 +#define QCSS_HARDWARE_BUSY -2 +#else +static inline void smp_init_pseries(void) { }; +#endif + +extern void pseries_kexec_cpu_down(int crash_shutdown, int secondary); + +extern void pSeries_final_fixup(void); + +/* Poweron flag used for enabling auto ups restart */ +extern unsigned long rtas_poweron_auto; + +/* Provided by HVC VIO */ +extern void hvc_vio_init_early(void); + +/* Dynamic logical Partitioning/Mobility */ +extern void dlpar_free_cc_nodes(struct device_node *); +extern void dlpar_free_cc_property(struct property *); +extern struct device_node *dlpar_configure_connector(__be32, + struct device_node *); +extern int dlpar_attach_node(struct device_node *, struct device_node *); +extern int dlpar_detach_node(struct device_node *); +extern int dlpar_acquire_drc(u32 drc_index); +extern int dlpar_release_drc(u32 drc_index); + +void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog, + struct completion *hotplug_done, int *rc); +#ifdef CONFIG_MEMORY_HOTPLUG +int dlpar_memory(struct pseries_hp_errorlog *hp_elog); +#else +static inline int dlpar_memory(struct pseries_hp_errorlog *hp_elog) +{ + return -EOPNOTSUPP; +} +#endif + +#ifdef CONFIG_HOTPLUG_CPU +int dlpar_cpu(struct pseries_hp_errorlog *hp_elog); +#else +static inline int dlpar_cpu(struct pseries_hp_errorlog *hp_elog) +{ + return -EOPNOTSUPP; +} +#endif + +/* PCI root bridge prepare function override for pseries */ +struct pci_host_bridge; +int pseries_root_bridge_prepare(struct pci_host_bridge *bridge); + +extern struct pci_controller_ops pseries_pci_controller_ops; + +unsigned long pseries_memory_block_size(void); + +extern int CMO_PrPSP; +extern int CMO_SecPSP; +extern unsigned long CMO_PageSize; + +static inline int cmo_get_primary_psp(void) +{ + return CMO_PrPSP; +} + +static inline int cmo_get_secondary_psp(void) +{ + return CMO_SecPSP; +} + +static inline unsigned long cmo_get_page_size(void) +{ + return CMO_PageSize; +} + +int dlpar_workqueue_init(void); + +void pseries_setup_rfi_flush(void); + +#endif /* _PSERIES_PSERIES_H */ diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c b/arch/powerpc/platforms/pseries/pseries_energy.c new file mode 100644 index 000000000..921f12182 --- /dev/null +++ b/arch/powerpc/platforms/pseries/pseries_energy.c @@ -0,0 +1,368 @@ +/* + * POWER platform energy management driver + * Copyright (C) 2010 IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This pseries platform device driver provides access to + * platform energy management capabilities. 
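
(Editorial sketch, not part of the patch.) smp_query_cpu_stopped() declared above wraps the RTAS query-cpu-stopped-state call, and the QCSS_* constants are its possible answers. A minimal caller, assuming pcpu is a valid physical CPU id:

	int status = smp_query_cpu_stopped(pcpu);

	if (status == QCSS_HARDWARE_ERROR || status == QCSS_HARDWARE_BUSY)
		return -EIO;	/* the query itself failed */
	if (status != QCSS_STOPPED)
		return -EBUSY;	/* still running, or stop in progress */
	/* safe to (re)start the stopped CPU here */
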
+ */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/seq_file.h> +#include <linux/device.h> +#include <linux/cpu.h> +#include <linux/of.h> +#include <asm/cputhreads.h> +#include <asm/page.h> +#include <asm/hvcall.h> +#include <asm/firmware.h> +#include <asm/prom.h> + + +#define MODULE_VERS "1.0" +#define MODULE_NAME "pseries_energy" + +/* Driver flags */ + +static int sysfs_entries; + +/* Helper routines */ + +/* Helper Routines to convert between drc_index to cpu numbers */ + +static u32 cpu_to_drc_index(int cpu) +{ + struct device_node *dn = NULL; + int thread_index; + int rc = 1; + u32 ret = 0; + + dn = of_find_node_by_path("/cpus"); + if (dn == NULL) + goto err; + + /* Convert logical cpu number to core number */ + thread_index = cpu_core_index_of_thread(cpu); + + if (firmware_has_feature(FW_FEATURE_DRC_INFO)) { + struct property *info = NULL; + struct of_drc_info drc; + int j; + u32 num_set_entries; + const __be32 *value; + + info = of_find_property(dn, "ibm,drc-info", NULL); + if (info == NULL) + goto err_of_node_put; + + value = of_prop_next_u32(info, NULL, &num_set_entries); + if (!value) + goto err_of_node_put; + + for (j = 0; j < num_set_entries; j++) { + + of_read_drc_info_cell(&info, &value, &drc); + if (strncmp(drc.drc_type, "CPU", 3)) + goto err; + + if (thread_index < drc.last_drc_index) + break; + } + + ret = drc.drc_index_start + (thread_index * drc.sequential_inc); + } else { + u32 nr_drc_indexes, thread_drc_index; + + /* + * The first element of ibm,drc-indexes array is the + * number of drc_indexes returned in the list. Hence + * thread_index+1 will get the drc_index corresponding + * to core number thread_index. + */ + rc = of_property_read_u32_index(dn, "ibm,drc-indexes", + 0, &nr_drc_indexes); + if (rc) + goto err_of_node_put; + + WARN_ON_ONCE(thread_index > nr_drc_indexes); + rc = of_property_read_u32_index(dn, "ibm,drc-indexes", + thread_index + 1, + &thread_drc_index); + if (rc) + goto err_of_node_put; + + ret = thread_drc_index; + } + + rc = 0; + +err_of_node_put: + of_node_put(dn); +err: + if (rc) + printk(KERN_WARNING "cpu_to_drc_index(%d) failed", cpu); + return ret; +} + +static int drc_index_to_cpu(u32 drc_index) +{ + struct device_node *dn = NULL; + const int *indexes; + int thread_index = 0, cpu = 0; + int rc = 1; + + dn = of_find_node_by_path("/cpus"); + if (dn == NULL) + goto err; + + if (firmware_has_feature(FW_FEATURE_DRC_INFO)) { + struct property *info = NULL; + struct of_drc_info drc; + int j; + u32 num_set_entries; + const __be32 *value; + + info = of_find_property(dn, "ibm,drc-info", NULL); + if (info == NULL) + goto err_of_node_put; + + value = of_prop_next_u32(info, NULL, &num_set_entries); + if (!value) + goto err_of_node_put; + + for (j = 0; j < num_set_entries; j++) { + + of_read_drc_info_cell(&info, &value, &drc); + if (strncmp(drc.drc_type, "CPU", 3)) + goto err; + + if (drc_index > drc.last_drc_index) { + cpu += drc.num_sequential_elems; + continue; + } + cpu += ((drc_index - drc.drc_index_start) / + drc.sequential_inc); + + thread_index = cpu_first_thread_of_core(cpu); + rc = 0; + break; + } + } else { + unsigned long int i; + + indexes = of_get_property(dn, "ibm,drc-indexes", NULL); + if (indexes == NULL) + goto err_of_node_put; + /* + * First element in the array is the number of drc_indexes + * returned. 
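
(Editorial worked example, values hypothetical.) On the ibm,drc-info path in cpu_to_drc_index() above, a core's DRC index is the set's start index plus the core number times a fixed stride:

	u32 drc_index_start = 0x10000008;	/* from the drc-info cells */
	u32 sequential_inc = 8;			/* from the drc-info cells */
	int core = 3;				/* cpu_core_index_of_thread(cpu) */
	u32 drc = drc_index_start + core * sequential_inc;	/* 0x10000020 */
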
Search through the list to find the matching + * drc_index and get the core number + */ + for (i = 0; i < indexes[0]; i++) { + if (indexes[i + 1] == drc_index) + break; + } + /* Convert core number to logical cpu number */ + thread_index = cpu_first_thread_of_core(i); + rc = 0; + } + +err_of_node_put: + of_node_put(dn); +err: + if (rc) + printk(KERN_WARNING "drc_index_to_cpu(%d) failed", drc_index); + return thread_index; +} + +/* + * pseries hypervisor call H_BEST_ENERGY provides hints to OS on + * preferred logical cpus to activate or deactivate for optimized + * energy consumption. + */ + +#define FLAGS_MODE1 0x004E200000080E01UL +#define FLAGS_MODE2 0x004E200000080401UL +#define FLAGS_ACTIVATE 0x100 + +static ssize_t get_best_energy_list(char *page, int activate) +{ + int rc, cnt, i, cpu; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + unsigned long flags = 0; + u32 *buf_page; + char *s = page; + + buf_page = (u32 *) get_zeroed_page(GFP_KERNEL); + if (!buf_page) + return -ENOMEM; + + flags = FLAGS_MODE1; + if (activate) + flags |= FLAGS_ACTIVATE; + + rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags, 0, __pa(buf_page), + 0, 0, 0, 0, 0, 0); + if (rc != H_SUCCESS) { + free_page((unsigned long) buf_page); + return -EINVAL; + } + + cnt = retbuf[0]; + for (i = 0; i < cnt; i++) { + cpu = drc_index_to_cpu(buf_page[2*i+1]); + if ((cpu_online(cpu) && !activate) || + (!cpu_online(cpu) && activate)) + s += sprintf(s, "%d,", cpu); + } + if (s > page) { /* Something to show */ + s--; /* Suppress last comma */ + s += sprintf(s, "\n"); + } + + free_page((unsigned long) buf_page); + return s-page; +} + +static ssize_t get_best_energy_data(struct device *dev, + char *page, int activate) +{ + int rc; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + unsigned long flags = 0; + + flags = FLAGS_MODE2; + if (activate) + flags |= FLAGS_ACTIVATE; + + rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags, + cpu_to_drc_index(dev->id), + 0, 0, 0, 0, 0, 0, 0); + + if (rc != H_SUCCESS) + return -EINVAL; + + return sprintf(page, "%lu\n", retbuf[1] >> 32); +} + +/* Wrapper functions */ + +static ssize_t cpu_activate_hint_list_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + return get_best_energy_list(page, 1); +} + +static ssize_t cpu_deactivate_hint_list_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + return get_best_energy_list(page, 0); +} + +static ssize_t percpu_activate_hint_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + return get_best_energy_data(dev, page, 1); +} + +static ssize_t percpu_deactivate_hint_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + return get_best_energy_data(dev, page, 0); +} + +/* + * Create sysfs interface: + * /sys/devices/system/cpu/pseries_activate_hint_list + * /sys/devices/system/cpu/pseries_deactivate_hint_list + * Comma separated list of cpus to activate or deactivate + * /sys/devices/system/cpu/cpuN/pseries_activate_hint + * /sys/devices/system/cpu/cpuN/pseries_deactivate_hint + * Per-cpu value of the hint + */ + +static struct device_attribute attr_cpu_activate_hint_list = + __ATTR(pseries_activate_hint_list, 0444, + cpu_activate_hint_list_show, NULL); + +static struct device_attribute attr_cpu_deactivate_hint_list = + __ATTR(pseries_deactivate_hint_list, 0444, + cpu_deactivate_hint_list_show, NULL); + +static struct device_attribute attr_percpu_activate_hint = + __ATTR(pseries_activate_hint, 0444, + percpu_activate_hint_show, NULL); + +static struct device_attribute 
attr_percpu_deactivate_hint = + __ATTR(pseries_deactivate_hint, 0444, + percpu_deactivate_hint_show, NULL); + +static int __init pseries_energy_init(void) +{ + int cpu, err; + struct device *cpu_dev; + + if (!firmware_has_feature(FW_FEATURE_BEST_ENERGY)) + return 0; /* H_BEST_ENERGY hcall not supported */ + + /* Create the sysfs files */ + err = device_create_file(cpu_subsys.dev_root, + &attr_cpu_activate_hint_list); + if (!err) + err = device_create_file(cpu_subsys.dev_root, + &attr_cpu_deactivate_hint_list); + + if (err) + return err; + for_each_possible_cpu(cpu) { + cpu_dev = get_cpu_device(cpu); + err = device_create_file(cpu_dev, + &attr_percpu_activate_hint); + if (err) + break; + err = device_create_file(cpu_dev, + &attr_percpu_deactivate_hint); + if (err) + break; + } + + if (err) + return err; + + sysfs_entries = 1; /* Removed entries on cleanup */ + return 0; + +} + +static void __exit pseries_energy_cleanup(void) +{ + int cpu; + struct device *cpu_dev; + + if (!sysfs_entries) + return; + + /* Remove the sysfs files */ + device_remove_file(cpu_subsys.dev_root, &attr_cpu_activate_hint_list); + device_remove_file(cpu_subsys.dev_root, &attr_cpu_deactivate_hint_list); + + for_each_possible_cpu(cpu) { + cpu_dev = get_cpu_device(cpu); + sysfs_remove_file(&cpu_dev->kobj, + &attr_percpu_activate_hint.attr); + sysfs_remove_file(&cpu_dev->kobj, + &attr_percpu_deactivate_hint.attr); + } +} + +module_init(pseries_energy_init); +module_exit(pseries_energy_cleanup); +MODULE_DESCRIPTION("Driver for pSeries platform energy management"); +MODULE_AUTHOR("Vaidyanathan Srinivasan"); +MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c new file mode 100644 index 000000000..e82710868 --- /dev/null +++ b/arch/powerpc/platforms/pseries/ras.c @@ -0,0 +1,513 @@ +/* + * Copyright (C) 2001 Dave Engebretsen IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/sched.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/of.h> +#include <linux/fs.h> +#include <linux/reboot.h> +#include <linux/irq_work.h> + +#include <asm/machdep.h> +#include <asm/rtas.h> +#include <asm/firmware.h> + +#include "pseries.h" + +static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; +static DEFINE_SPINLOCK(ras_log_buf_lock); + +static int ras_check_exception_token; + +static void mce_process_errlog_event(struct irq_work *work); +static struct irq_work mce_errlog_process_work = { + .func = mce_process_errlog_event, +}; + +#define EPOW_SENSOR_TOKEN 9 +#define EPOW_SENSOR_INDEX 0 + +/* EPOW events counter variable */ +static int num_epow_events; + +static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id); +static irqreturn_t ras_epow_interrupt(int irq, void *dev_id); +static irqreturn_t ras_error_interrupt(int irq, void *dev_id); + + +/* + * Enable the hotplug interrupt late because processing them may touch other + * devices or systems (e.g. hugepages) that have not been initialized at the + * subsys stage. + */ +int __init init_ras_hotplug_IRQ(void) +{ + struct device_node *np; + + /* Hotplug Events */ + np = of_find_node_by_path("/event-sources/hot-plug-events"); + if (np != NULL) { + if (dlpar_workqueue_init() == 0) + request_event_sources_irqs(np, ras_hotplug_interrupt, + "RAS_HOTPLUG"); + of_node_put(np); + } + + return 0; +} +machine_late_initcall(pseries, init_ras_hotplug_IRQ); + +/* + * Initialize handlers for the set of interrupts caused by hardware errors + * and power system events. + */ +static int __init init_ras_IRQ(void) +{ + struct device_node *np; + + ras_check_exception_token = rtas_token("check-exception"); + + /* Internal Errors */ + np = of_find_node_by_path("/event-sources/internal-errors"); + if (np != NULL) { + request_event_sources_irqs(np, ras_error_interrupt, + "RAS_ERROR"); + of_node_put(np); + } + + /* EPOW Events */ + np = of_find_node_by_path("/event-sources/epow-events"); + if (np != NULL) { + request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW"); + of_node_put(np); + } + + return 0; +} +machine_subsys_initcall(pseries, init_ras_IRQ); + +#define EPOW_SHUTDOWN_NORMAL 1 +#define EPOW_SHUTDOWN_ON_UPS 2 +#define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3 +#define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4 + +static void handle_system_shutdown(char event_modifier) +{ + switch (event_modifier) { + case EPOW_SHUTDOWN_NORMAL: + pr_emerg("Power off requested\n"); + orderly_poweroff(true); + break; + + case EPOW_SHUTDOWN_ON_UPS: + pr_emerg("Loss of system power detected. System is running on" + " UPS/battery. Check RTAS error log for details\n"); + break; + + case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS: + pr_emerg("Loss of system critical functions detected. Check" + " RTAS error log for details\n"); + orderly_poweroff(true); + break; + + case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH: + pr_emerg("High ambient temperature detected. 
Check RTAS" + " error log for details\n"); + orderly_poweroff(true); + break; + + default: + pr_err("Unknown power/cooling shutdown event (modifier = %d)\n", + event_modifier); + } +} + +struct epow_errorlog { + unsigned char sensor_value; + unsigned char event_modifier; + unsigned char extended_modifier; + unsigned char reserved; + unsigned char platform_reason; +}; + +#define EPOW_RESET 0 +#define EPOW_WARN_COOLING 1 +#define EPOW_WARN_POWER 2 +#define EPOW_SYSTEM_SHUTDOWN 3 +#define EPOW_SYSTEM_HALT 4 +#define EPOW_MAIN_ENCLOSURE 5 +#define EPOW_POWER_OFF 7 + +static void rtas_parse_epow_errlog(struct rtas_error_log *log) +{ + struct pseries_errorlog *pseries_log; + struct epow_errorlog *epow_log; + char action_code; + char modifier; + + pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW); + if (pseries_log == NULL) + return; + + epow_log = (struct epow_errorlog *)pseries_log->data; + action_code = epow_log->sensor_value & 0xF; /* bottom 4 bits */ + modifier = epow_log->event_modifier & 0xF; /* bottom 4 bits */ + + switch (action_code) { + case EPOW_RESET: + if (num_epow_events) { + pr_info("Non critical power/cooling issue cleared\n"); + num_epow_events--; + } + break; + + case EPOW_WARN_COOLING: + pr_info("Non-critical cooling issue detected. Check RTAS error" + " log for details\n"); + break; + + case EPOW_WARN_POWER: + pr_info("Non-critical power issue detected. Check RTAS error" + " log for details\n"); + break; + + case EPOW_SYSTEM_SHUTDOWN: + handle_system_shutdown(epow_log->event_modifier); + break; + + case EPOW_SYSTEM_HALT: + pr_emerg("Critical power/cooling issue detected. Check RTAS" + " error log for details. Powering off.\n"); + orderly_poweroff(true); + break; + + case EPOW_MAIN_ENCLOSURE: + case EPOW_POWER_OFF: + pr_emerg("System about to lose power. Check RTAS error log " + " for details. Powering off immediately.\n"); + emergency_sync(); + kernel_power_off(); + break; + + default: + pr_err("Unknown power/cooling event (action code = %d)\n", + action_code); + } + + /* Increment epow events counter variable */ + if (action_code != EPOW_RESET) + num_epow_events++; +} + +static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id) +{ + struct pseries_errorlog *pseries_log; + struct pseries_hp_errorlog *hp_elog; + + spin_lock(&ras_log_buf_lock); + + rtas_call(ras_check_exception_token, 6, 1, NULL, + RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq), + RTAS_HOTPLUG_EVENTS, 0, __pa(&ras_log_buf), + rtas_get_error_log_max()); + + pseries_log = get_pseries_errorlog((struct rtas_error_log *)ras_log_buf, + PSERIES_ELOG_SECT_ID_HOTPLUG); + hp_elog = (struct pseries_hp_errorlog *)pseries_log->data; + + /* + * Since PCI hotplug is not currently supported on pseries, put PCI + * hotplug events on the ras_log_buf to be handled by rtas_errd. + */ + if (hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_MEM || + hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU) + queue_hotplug_event(hp_elog, NULL, NULL); + else + log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); + + spin_unlock(&ras_log_buf_lock); + return IRQ_HANDLED; +} + +/* Handle environmental and power warning (EPOW) interrupts. 
*/ +static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) +{ + int status; + int state; + int critical; + + status = rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, + &state); + + if (state > 3) + critical = 1; /* Time Critical */ + else + critical = 0; + + spin_lock(&ras_log_buf_lock); + + status = rtas_call(ras_check_exception_token, 6, 1, NULL, + RTAS_VECTOR_EXTERNAL_INTERRUPT, + virq_to_hw(irq), + RTAS_EPOW_WARNING, + critical, __pa(&ras_log_buf), + rtas_get_error_log_max()); + + log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); + + rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf); + + spin_unlock(&ras_log_buf_lock); + return IRQ_HANDLED; +} + +/* + * Handle hardware error interrupts. + * + * RTAS check-exception is called to collect data on the exception. If + * the error is deemed recoverable, we log a warning and return. + * For nonrecoverable errors, an error is logged and we stop all processing + * as quickly as possible in order to prevent propagation of the failure. + */ +static irqreturn_t ras_error_interrupt(int irq, void *dev_id) +{ + struct rtas_error_log *rtas_elog; + int status; + int fatal; + + spin_lock(&ras_log_buf_lock); + + status = rtas_call(ras_check_exception_token, 6, 1, NULL, + RTAS_VECTOR_EXTERNAL_INTERRUPT, + virq_to_hw(irq), + RTAS_INTERNAL_ERROR, 1 /* Time Critical */, + __pa(&ras_log_buf), + rtas_get_error_log_max()); + + rtas_elog = (struct rtas_error_log *)ras_log_buf; + + if (status == 0 && + rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC) + fatal = 1; + else + fatal = 0; + + /* format and print the extended information */ + log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal); + + if (fatal) { + pr_emerg("Fatal hardware error detected. Check RTAS error" + " log for details. Powering off immediately\n"); + emergency_sync(); + kernel_power_off(); + } else { + pr_err("Recoverable hardware error detected\n"); + } + + spin_unlock(&ras_log_buf_lock); + return IRQ_HANDLED; +} + +/* + * Some versions of FWNMI place the buffer inside the 4kB page starting at + * 0x7000. Other versions place it inside the rtas buffer. We check both. + * Minimum size of the buffer is 16 bytes. + */ +#define VALID_FWNMI_BUFFER(A) \ + ((((A) >= 0x7000) && ((A) <= 0x8000 - 16)) || \ + (((A) >= rtas.base) && ((A) <= (rtas.base + rtas.size - 16)))) + +static inline struct rtas_error_log *fwnmi_get_errlog(void) +{ + return (struct rtas_error_log *)local_paca->mce_data_buf; +} + +/* + * Get the error information for errors coming through the + * FWNMI vectors. The pt_regs' r3 will be updated to reflect + * the actual r3 if possible, and a ptr to the error log entry + * will be returned if found. + * + * Use one buffer mce_data_buf per cpu to store RTAS error. + * + * The mce_data_buf does not have any locks or protection around it, + * if a second machine check comes in, or a system reset is done + * before we have logged the error, then we will get corruption in the + * error log. This is preferable over holding off on calling + * ibm,nmi-interlock which would result in us checkstopping if a + * second machine check did come in. 
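
(Editorial illustration, addresses hypothetical.) VALID_FWNMI_BUFFER() above accepts two windows and requires at least 16 readable bytes before the window's end:

	unsigned long r3 = 0x7100;
	bool ok = VALID_FWNMI_BUFFER(r3);	/* true: in the fixed page, 16 bytes of room */
	/* 0x7ff8 fails the fixed-page test (only 8 bytes left before 0x8000)
	 * and is accepted only if it falls inside the RTAS buffer window. */
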
+ */ +static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) +{ + unsigned long *savep; + struct rtas_error_log *h; + + /* Mask top two bits */ + regs->gpr[3] &= ~(0x3UL << 62); + + if (!VALID_FWNMI_BUFFER(regs->gpr[3])) { + printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]); + return NULL; + } + + savep = __va(regs->gpr[3]); + regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */ + + h = (struct rtas_error_log *)&savep[1]; + /* Use the per cpu buffer from paca to store rtas error log */ + memset(local_paca->mce_data_buf, 0, RTAS_ERROR_LOG_MAX); + if (!rtas_error_extended(h)) { + memcpy(local_paca->mce_data_buf, h, sizeof(__u64)); + } else { + int len, error_log_length; + + error_log_length = 8 + rtas_error_extended_log_length(h); + len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX); + memcpy(local_paca->mce_data_buf, h, len); + } + + return (struct rtas_error_log *)local_paca->mce_data_buf; +} + +/* Call this when done with the data returned by FWNMI_get_errinfo. + * It will release the saved data area for other CPUs in the + * partition to receive FWNMI errors. + */ +static void fwnmi_release_errinfo(void) +{ + int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL); + if (ret != 0) + printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret); +} + +int pSeries_system_reset_exception(struct pt_regs *regs) +{ +#ifdef __LITTLE_ENDIAN__ + /* + * Some firmware byteswaps SRR registers and gives incorrect SRR1. Try + * to detect the bad SRR1 pattern here. Flip the NIP back to correct + * endian for reporting purposes. Unfortunately the MSR can't be fixed, + * so clear it. It will be missing MSR_RI so we won't try to recover. + */ + if ((be64_to_cpu(regs->msr) & + (MSR_LE|MSR_RI|MSR_DR|MSR_IR|MSR_ME|MSR_PR| + MSR_ILE|MSR_HV|MSR_SF)) == (MSR_DR|MSR_SF)) { + regs->nip = be64_to_cpu((__be64)regs->nip); + regs->msr = 0; + } +#endif + + if (fwnmi_active) { + struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs); + if (errhdr) { + /* XXX Should look at FWNMI information */ + } + fwnmi_release_errinfo(); + } + + if (smp_handle_nmi_ipi(regs)) + return 1; + + return 0; /* need to perform reset */ +} + +/* + * Process MCE rtas errlog event. + */ +static void mce_process_errlog_event(struct irq_work *work) +{ + struct rtas_error_log *err; + + err = fwnmi_get_errlog(); + log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); +} + +/* + * See if we can recover from a machine check exception. + * This is only called on power4 (or above) and only via + * the Firmware Non-Maskable Interrupts (fwnmi) handler + * which provides the error analysis for us. + * + * Return 1 if corrected (or delivered a signal). + * Return 0 if there is nothing we can do. + */ +static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) +{ + int recovered = 0; + int disposition = rtas_error_disposition(err); + + if (!(regs->msr & MSR_RI)) { + /* If MSR_RI isn't set, we cannot recover */ + recovered = 0; + + } else if (disposition == RTAS_DISP_FULLY_RECOVERED) { + /* Platform corrected itself */ + recovered = 1; + + } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { + /* Platform corrected itself but could be degraded */ + printk(KERN_ERR "MCE: limited recovery, system may " + "be degraded\n"); + recovered = 1; + + } else if (user_mode(regs) && !is_global_init(current) && + rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) { + + /* + * If we received a synchronous error when in userspace + * kill the task. 
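
(Editorial worked example, length hypothetical.) In fwnmi_get_errinfo() above, an extended RTAS log is copied as an 8-byte header plus the extended length, clamped to the per-cpu buffer size:

	int ext_len = 40;	/* rtas_error_extended_log_length(h), hypothetical */
	int error_log_length = 8 + ext_len;
	int len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX);	/* 48 here */
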
Firmware may report details of the fail + * asynchronously, so we can't rely on the target and type + * fields being valid here. + */ + printk(KERN_ERR "MCE: uncorrectable error, killing task " + "%s:%d\n", current->comm, current->pid); + + _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); + recovered = 1; + } + + /* Queue irq work to log this rtas event later. */ + irq_work_queue(&mce_errlog_process_work); + + return recovered; +} + +/* + * Handle a machine check. + * + * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi) + * should be present. If so the handler which called us tells us if the + * error was recovered (never true if RI=0). + * + * On hardware prior to Power 4 these exceptions were asynchronous which + * means we can't tell exactly where it occurred and so we can't recover. + */ +int pSeries_machine_check_exception(struct pt_regs *regs) +{ + struct rtas_error_log *errp; + + if (fwnmi_active) { + errp = fwnmi_get_errinfo(regs); + fwnmi_release_errinfo(); + if (errp && recover_mce(regs, errp)) + return 1; + } + + return 0; +} diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c new file mode 100644 index 000000000..0e0208117 --- /dev/null +++ b/arch/powerpc/platforms/pseries/reconfig.c @@ -0,0 +1,414 @@ +/* + * pSeries_reconfig.c - support for dynamic reconfiguration (including PCI + * Hotplug and Dynamic Logical Partitioning on RPA platforms). + * + * Copyright (C) 2005 Nathan Lynch + * Copyright (C) 2005 IBM Corporation + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/notifier.h> +#include <linux/proc_fs.h> +#include <linux/slab.h> +#include <linux/of.h> + +#include <asm/prom.h> +#include <asm/machdep.h> +#include <linux/uaccess.h> +#include <asm/mmu.h> + +#include "of_helpers.h" + +static int pSeries_reconfig_add_node(const char *path, struct property *proplist) +{ + struct device_node *np; + int err = -ENOMEM; + + np = kzalloc(sizeof(*np), GFP_KERNEL); + if (!np) + goto out_err; + + np->full_name = kstrdup(kbasename(path), GFP_KERNEL); + if (!np->full_name) + goto out_err; + + np->properties = proplist; + of_node_set_flag(np, OF_DYNAMIC); + of_node_init(np); + + np->parent = pseries_of_derive_parent(path); + if (IS_ERR(np->parent)) { + err = PTR_ERR(np->parent); + goto out_err; + } + + err = of_attach_node(np); + if (err) { + printk(KERN_ERR "Failed to add device node %s\n", path); + goto out_err; + } + + of_node_put(np->parent); + + return 0; + +out_err: + if (np) { + of_node_put(np->parent); + kfree(np->full_name); + kfree(np); + } + return err; +} + +static int pSeries_reconfig_remove_node(struct device_node *np) +{ + struct device_node *parent, *child; + + parent = of_get_parent(np); + if (!parent) + return -EINVAL; + + if ((child = of_get_next_child(np, NULL))) { + of_node_put(child); + of_node_put(parent); + return -EBUSY; + } + + of_detach_node(np); + of_node_put(parent); + return 0; +} + +/* + * /proc/powerpc/ofdt - yucky binary interface for adding and removing + * OF device nodes. Should be deprecated as soon as we get an + * in-kernel wrapper for the RTAS ibm,configure-connector call. 
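
(Editorial sketch; node path and property are hypothetical.) The ofdt command grammar handled below is one line: an operation, a node path, then zero or more "<name> <length> <value>" property triples, where <value> is exactly <length> raw bytes:

	/* Hypothetical transactions written to /proc/powerpc/ofdt */
	const char *add = "add_node /memory@10000000 device_type 6 memory";
	const char *del = "remove_node /memory@10000000";
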
+ */ + +static void release_prop_list(const struct property *prop) +{ + struct property *next; + for (; prop; prop = next) { + next = prop->next; + kfree(prop->name); + kfree(prop->value); + kfree(prop); + } + +} + +/** + * parse_next_property - process the next property from raw input buffer + * @buf: input buffer, must be nul-terminated + * @end: end of the input buffer + 1, for validation + * @name: return value; set to property name in buf + * @length: return value; set to length of value + * @value: return value; set to the property value in buf + * + * Note that the caller must make copies of the name and value returned, + * this function does no allocation or copying of the data. Return value + * is set to the next name in buf, or NULL on error. + */ +static char * parse_next_property(char *buf, char *end, char **name, int *length, + unsigned char **value) +{ + char *tmp; + + *name = buf; + + tmp = strchr(buf, ' '); + if (!tmp) { + printk(KERN_ERR "property parse failed in %s at line %d\n", + __func__, __LINE__); + return NULL; + } + *tmp = '\0'; + + if (++tmp >= end) { + printk(KERN_ERR "property parse failed in %s at line %d\n", + __func__, __LINE__); + return NULL; + } + + /* now we're on the length */ + *length = -1; + *length = simple_strtoul(tmp, &tmp, 10); + if (*length == -1) { + printk(KERN_ERR "property parse failed in %s at line %d\n", + __func__, __LINE__); + return NULL; + } + if (*tmp != ' ' || ++tmp >= end) { + printk(KERN_ERR "property parse failed in %s at line %d\n", + __func__, __LINE__); + return NULL; + } + + /* now we're on the value */ + *value = tmp; + tmp += *length; + if (tmp > end) { + printk(KERN_ERR "property parse failed in %s at line %d\n", + __func__, __LINE__); + return NULL; + } + else if (tmp < end && *tmp != ' ' && *tmp != '\0') { + printk(KERN_ERR "property parse failed in %s at line %d\n", + __func__, __LINE__); + return NULL; + } + tmp++; + + /* and now we should be on the next name, or the end */ + return tmp; +} + +static struct property *new_property(const char *name, const int length, + const unsigned char *value, struct property *last) +{ + struct property *new = kzalloc(sizeof(*new), GFP_KERNEL); + + if (!new) + return NULL; + + if (!(new->name = kstrdup(name, GFP_KERNEL))) + goto cleanup; + if (!(new->value = kmalloc(length + 1, GFP_KERNEL))) + goto cleanup; + + memcpy(new->value, value, length); + *(((char *)new->value) + length) = 0; + new->length = length; + new->next = last; + return new; + +cleanup: + kfree(new->name); + kfree(new->value); + kfree(new); + return NULL; +} + +static int do_add_node(char *buf, size_t bufsize) +{ + char *path, *end, *name; + struct device_node *np; + struct property *prop = NULL; + unsigned char* value; + int length, rv = 0; + + end = buf + bufsize; + path = buf; + buf = strchr(buf, ' '); + if (!buf) + return -EINVAL; + *buf = '\0'; + buf++; + + if ((np = of_find_node_by_path(path))) { + of_node_put(np); + return -EINVAL; + } + + /* rv = build_prop_list(tmp, bufsize - (tmp - buf), &proplist); */ + while (buf < end && + (buf = parse_next_property(buf, end, &name, &length, &value))) { + struct property *last = prop; + + prop = new_property(name, length, value, last); + if (!prop) { + rv = -ENOMEM; + prop = last; + goto out; + } + } + if (!buf) { + rv = -EINVAL; + goto out; + } + + rv = pSeries_reconfig_add_node(path, prop); + +out: + if (rv) + release_prop_list(prop); + return rv; +} + +static int do_remove_node(char *buf) +{ + struct device_node *node; + int rv = -ENODEV; + + if ((node = 
of_find_node_by_path(buf))) + rv = pSeries_reconfig_remove_node(node); + + of_node_put(node); + return rv; +} + +static char *parse_node(char *buf, size_t bufsize, struct device_node **npp) +{ + char *handle_str; + phandle handle; + *npp = NULL; + + handle_str = buf; + + buf = strchr(buf, ' '); + if (!buf) + return NULL; + *buf = '\0'; + buf++; + + handle = simple_strtoul(handle_str, NULL, 0); + + *npp = of_find_node_by_phandle(handle); + return buf; +} + +static int do_add_property(char *buf, size_t bufsize) +{ + struct property *prop = NULL; + struct device_node *np; + unsigned char *value; + char *name, *end; + int length; + end = buf + bufsize; + buf = parse_node(buf, bufsize, &np); + + if (!np) + return -ENODEV; + + if (parse_next_property(buf, end, &name, &length, &value) == NULL) + return -EINVAL; + + prop = new_property(name, length, value, NULL); + if (!prop) + return -ENOMEM; + + of_add_property(np, prop); + + return 0; +} + +static int do_remove_property(char *buf, size_t bufsize) +{ + struct device_node *np; + char *tmp; + buf = parse_node(buf, bufsize, &np); + + if (!np) + return -ENODEV; + + tmp = strchr(buf,' '); + if (tmp) + *tmp = '\0'; + + if (strlen(buf) == 0) + return -EINVAL; + + return of_remove_property(np, of_find_property(np, buf, NULL)); +} + +static int do_update_property(char *buf, size_t bufsize) +{ + struct device_node *np; + unsigned char *value; + char *name, *end, *next_prop; + int length; + struct property *newprop; + buf = parse_node(buf, bufsize, &np); + end = buf + bufsize; + + if (!np) + return -ENODEV; + + next_prop = parse_next_property(buf, end, &name, &length, &value); + if (!next_prop) + return -EINVAL; + + if (!strlen(name)) + return -ENODEV; + + newprop = new_property(name, length, value, NULL); + if (!newprop) + return -ENOMEM; + + if (!strcmp(name, "slb-size") || !strcmp(name, "ibm,slb-size")) + slb_set_size(*(int *)value); + + return of_update_property(np, newprop); +} + +/** + * ofdt_write - perform operations on the Open Firmware device tree + * + * @file: not used + * @buf: command and arguments + * @count: size of the command buffer + * @off: not used + * + * Operations supported at this time are addition and removal of + * whole nodes along with their properties. Operations on individual + * properties are not implemented (yet). + */ +static ssize_t ofdt_write(struct file *file, const char __user *buf, size_t count, + loff_t *off) +{ + int rv; + char *kbuf; + char *tmp; + + kbuf = memdup_user_nul(buf, count); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + tmp = strchr(kbuf, ' '); + if (!tmp) { + rv = -EINVAL; + goto out; + } + *tmp = '\0'; + tmp++; + + if (!strcmp(kbuf, "add_node")) + rv = do_add_node(tmp, count - (tmp - kbuf)); + else if (!strcmp(kbuf, "remove_node")) + rv = do_remove_node(tmp); + else if (!strcmp(kbuf, "add_property")) + rv = do_add_property(tmp, count - (tmp - kbuf)); + else if (!strcmp(kbuf, "remove_property")) + rv = do_remove_property(tmp, count - (tmp - kbuf)); + else if (!strcmp(kbuf, "update_property")) + rv = do_update_property(tmp, count - (tmp - kbuf)); + else + rv = -EINVAL; +out: + kfree(kbuf); + return rv ? 
rv : count; +} + +static const struct file_operations ofdt_fops = { + .write = ofdt_write, + .llseek = noop_llseek, +}; + +/* create /proc/powerpc/ofdt write-only by root */ +static int proc_ppc64_create_ofdt(void) +{ + struct proc_dir_entry *ent; + + ent = proc_create("powerpc/ofdt", 0200, NULL, &ofdt_fops); + if (ent) + proc_set_size(ent, 0); + + return 0; +} +machine_device_initcall(pseries, proc_ppc64_create_ofdt); diff --git a/arch/powerpc/platforms/pseries/rng.c b/arch/powerpc/platforms/pseries/rng.c new file mode 100644 index 000000000..262b8c5e1 --- /dev/null +++ b/arch/powerpc/platforms/pseries/rng.c @@ -0,0 +1,46 @@ +/* + * Copyright 2013, Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "pseries-rng: " fmt + +#include <linux/kernel.h> +#include <linux/of.h> +#include <asm/archrandom.h> +#include <asm/machdep.h> +#include <asm/plpar_wrappers.h> + + +static int pseries_get_random_long(unsigned long *v) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + if (plpar_hcall(H_RANDOM, retbuf) == H_SUCCESS) { + *v = retbuf[0]; + return 1; + } + + return 0; +} + +static __init int rng_init(void) +{ + struct device_node *dn; + + dn = of_find_compatible_node(NULL, NULL, "ibm,random"); + if (!dn) + return -ENODEV; + + pr_info("Registering arch random hook.\n"); + + ppc_md.get_random_seed = pseries_get_random_long; + + of_node_put(dn); + return 0; +} +machine_subsys_initcall(pseries, rng_init); diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c new file mode 100644 index 000000000..054ce7a16 --- /dev/null +++ b/arch/powerpc/platforms/pseries/scanlog.c @@ -0,0 +1,200 @@ +/* + * c 2001 PPC 64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * scan-log-data driver for PPC64 Todd Inglett <tinglett@vnet.ibm.com> + * + * When ppc64 hardware fails the service processor dumps internal state + * of the system. After a reboot the operating system can access a dump + * of this data using this driver. A dump exists if the device-tree + * /chosen/ibm,scan-log-data property exists. + * + * This driver exports /proc/powerpc/scan-log-dump which can be read. + * The driver supports only sequential reads. + * + * The driver looks at a write to the driver for the single word "reset". + * If given, the driver will reset the scanlog so the platform can free it. 
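
(Editorial sketch of the consumer side, per this era's powerpc archrandom glue.) pseries_get_random_long() above is published through ppc_md.get_random_seed and returns 1 on success:

	unsigned long seed;

	if (ppc_md.get_random_seed && ppc_md.get_random_seed(&seed)) {
		/* seed now holds 64 bits obtained via the H_RANDOM hcall */
	}
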
+ */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <asm/rtas.h> +#include <asm/prom.h> + +#define MODULE_VERS "1.0" +#define MODULE_NAME "scanlog" + +/* Status returns from ibm,scan-log-dump */ +#define SCANLOG_COMPLETE 0 +#define SCANLOG_HWERROR -1 +#define SCANLOG_CONTINUE 1 + + +static unsigned int ibm_scan_log_dump; /* RTAS token */ +static unsigned int *scanlog_buffer; /* The data buffer */ + +static ssize_t scanlog_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned int *data = scanlog_buffer; + int status; + unsigned long len, off; + unsigned int wait_time; + + if (count > RTAS_DATA_BUF_SIZE) + count = RTAS_DATA_BUF_SIZE; + + if (count < 1024) { + /* This is the min supported by this RTAS call. Rather + * than do all the buffering we insist the user code handle + * larger reads. As long as cp works... :) + */ + printk(KERN_ERR "scanlog: cannot perform a small read (%ld)\n", count); + return -EINVAL; + } + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + for (;;) { + wait_time = 500; /* default wait if no data */ + spin_lock(&rtas_data_buf_lock); + memcpy(rtas_data_buf, data, RTAS_DATA_BUF_SIZE); + status = rtas_call(ibm_scan_log_dump, 2, 1, NULL, + (u32) __pa(rtas_data_buf), (u32) count); + memcpy(data, rtas_data_buf, RTAS_DATA_BUF_SIZE); + spin_unlock(&rtas_data_buf_lock); + + pr_debug("scanlog: status=%d, data[0]=%x, data[1]=%x, " \ + "data[2]=%x\n", status, data[0], data[1], data[2]); + switch (status) { + case SCANLOG_COMPLETE: + pr_debug("scanlog: hit eof\n"); + return 0; + case SCANLOG_HWERROR: + pr_debug("scanlog: hardware error reading data\n"); + return -EIO; + case SCANLOG_CONTINUE: + /* We may or may not have data yet */ + len = data[1]; + off = data[2]; + if (len > 0) { + if (copy_to_user(buf, ((char *)data)+off, len)) + return -EFAULT; + return len; + } + /* Break to sleep default time */ + break; + default: + /* Assume extended busy */ + wait_time = rtas_busy_delay_time(status); + if (!wait_time) { + printk(KERN_ERR "scanlog: unknown error " \ + "from rtas: %d\n", status); + return -EIO; + } + } + /* Apparently no data yet. Wait and try again. 
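
(Editorial note, status value hypothetical.) The default branch above treats unrecognized statuses as RTAS extended-delay codes, where 9900 + x requests a wait of 10^x milliseconds:

	unsigned int ms = rtas_busy_delay_time(9902);	/* 100 ms */
	/* rtas_busy_delay_time() returns 0 for statuses it cannot map,
	 * which the loop above reports as an unknown error */
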
*/ + msleep_interruptible(wait_time); + } + /*NOTREACHED*/ +} + +static ssize_t scanlog_write(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + char stkbuf[20]; + int status; + + if (count > 19) count = 19; + if (copy_from_user (stkbuf, buf, count)) { + return -EFAULT; + } + stkbuf[count] = 0; + + if (buf) { + if (strncmp(stkbuf, "reset", 5) == 0) { + pr_debug("scanlog: reset scanlog\n"); + status = rtas_call(ibm_scan_log_dump, 2, 1, NULL, 0, 0); + pr_debug("scanlog: rtas returns %d\n", status); + } + } + return count; +} + +static int scanlog_open(struct inode * inode, struct file * file) +{ + unsigned int *data = scanlog_buffer; + + if (data[0] != 0) { + /* This imperfect test stops a second copy of the + * data (or a reset while data is being copied) + */ + return -EBUSY; + } + + data[0] = 0; /* re-init so we restart the scan */ + + return 0; +} + +static int scanlog_release(struct inode * inode, struct file * file) +{ + unsigned int *data = scanlog_buffer; + + data[0] = 0; + return 0; +} + +static const struct file_operations scanlog_fops = { + .owner = THIS_MODULE, + .read = scanlog_read, + .write = scanlog_write, + .open = scanlog_open, + .release = scanlog_release, + .llseek = noop_llseek, +}; + +static int __init scanlog_init(void) +{ + struct proc_dir_entry *ent; + int err = -ENOMEM; + + ibm_scan_log_dump = rtas_token("ibm,scan-log-dump"); + if (ibm_scan_log_dump == RTAS_UNKNOWN_SERVICE) + return -ENODEV; + + /* Ideally we could allocate a buffer < 4G */ + scanlog_buffer = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); + if (!scanlog_buffer) + goto err; + + ent = proc_create("powerpc/rtas/scan-log-dump", 0400, NULL, + &scanlog_fops); + if (!ent) + goto err; + return 0; +err: + kfree(scanlog_buffer); + return err; +} + +static void __exit scanlog_cleanup(void) +{ + remove_proc_entry("powerpc/rtas/scan-log-dump", NULL); + kfree(scanlog_buffer); +} + +module_init(scanlog_init); +module_exit(scanlog_cleanup); +MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c new file mode 100644 index 000000000..885d910bf --- /dev/null +++ b/arch/powerpc/platforms/pseries/setup.c @@ -0,0 +1,1046 @@ +/* + * 64-bit pSeries and RS/6000 setup code. + * + * Copyright (C) 1995 Linus Torvalds + * Adapted from 'alpha' version by Gary Thomas + * Modified by Cort Dougan (cort@cs.nmt.edu) + * Modified by PPC64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * bootup setup stuff.. 
+ */ + +#include <linux/cpu.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/stddef.h> +#include <linux/unistd.h> +#include <linux/user.h> +#include <linux/tty.h> +#include <linux/major.h> +#include <linux/interrupt.h> +#include <linux/reboot.h> +#include <linux/init.h> +#include <linux/ioport.h> +#include <linux/console.h> +#include <linux/pci.h> +#include <linux/utsname.h> +#include <linux/adb.h> +#include <linux/export.h> +#include <linux/delay.h> +#include <linux/irq.h> +#include <linux/seq_file.h> +#include <linux/root_dev.h> +#include <linux/of.h> +#include <linux/of_pci.h> +#include <linux/memblock.h> + +#include <asm/mmu.h> +#include <asm/processor.h> +#include <asm/io.h> +#include <asm/pgtable.h> +#include <asm/prom.h> +#include <asm/rtas.h> +#include <asm/pci-bridge.h> +#include <asm/iommu.h> +#include <asm/dma.h> +#include <asm/machdep.h> +#include <asm/irq.h> +#include <asm/time.h> +#include <asm/nvram.h> +#include <asm/pmc.h> +#include <asm/xics.h> +#include <asm/xive.h> +#include <asm/ppc-pci.h> +#include <asm/i8259.h> +#include <asm/udbg.h> +#include <asm/smp.h> +#include <asm/firmware.h> +#include <asm/eeh.h> +#include <asm/reg.h> +#include <asm/plpar_wrappers.h> +#include <asm/kexec.h> +#include <asm/isa-bridge.h> +#include <asm/security_features.h> +#include <asm/asm-const.h> + +#include "pseries.h" +#include "../../../../drivers/pci/pci.h" + +DEFINE_STATIC_KEY_FALSE(shared_processor); +EXPORT_SYMBOL(shared_processor); + +int CMO_PrPSP = -1; +int CMO_SecPSP = -1; +unsigned long CMO_PageSize = (ASM_CONST(1) << IOMMU_PAGE_SHIFT_4K); +EXPORT_SYMBOL(CMO_PageSize); + +int fwnmi_active; /* TRUE if an FWNMI handler is present */ + +static void pSeries_show_cpuinfo(struct seq_file *m) +{ + struct device_node *root; + const char *model = ""; + + root = of_find_node_by_path("/"); + if (root) + model = of_get_property(root, "model", NULL); + seq_printf(m, "machine\t\t: CHRP %s\n", model); + of_node_put(root); + if (radix_enabled()) + seq_printf(m, "MMU\t\t: Radix\n"); + else + seq_printf(m, "MMU\t\t: Hash\n"); +} + +/* Initialize firmware assisted non-maskable interrupts if + * the firmware supports this feature. + */ +static void __init fwnmi_init(void) +{ + unsigned long system_reset_addr, machine_check_addr; + u8 *mce_data_buf; + unsigned int i; + int nr_cpus = num_possible_cpus(); + + int ibm_nmi_register = rtas_token("ibm,nmi-register"); + if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE) + return; + + /* If the kernel's not linked at zero we point the firmware at low + * addresses anyway, and use a trampoline to get to the real code. */ + system_reset_addr = __pa(system_reset_fwnmi) - PHYSICAL_START; + machine_check_addr = __pa(machine_check_fwnmi) - PHYSICAL_START; + + if (0 == rtas_call(ibm_nmi_register, 2, 1, NULL, system_reset_addr, + machine_check_addr)) + fwnmi_active = 1; + + /* + * Allocate a chunk for per cpu buffer to hold rtas errorlog. + * It will be used in real mode mce handler, hence it needs to be + * below RMA. 
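
(Editorial worked example; RTAS_ERROR_LOG_MAX is 2048 in this tree.) The loop below hands each CPU its own slot of the single allocation, so real-mode machine-check handling never has to allocate:

	/* e.g. with 4 possible CPUs the block is 8 KB; CPU 2's slot: */
	u8 *slot = mce_data_buf + (RTAS_ERROR_LOG_MAX * 2);	/* offset 4096 */
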
+ */ + mce_data_buf = __va(memblock_alloc_base(RTAS_ERROR_LOG_MAX * nr_cpus, + RTAS_ERROR_LOG_MAX, ppc64_rma_size)); + for_each_possible_cpu(i) { + paca_ptrs[i]->mce_data_buf = mce_data_buf + + (RTAS_ERROR_LOG_MAX * i); + } +} + +static void pseries_8259_cascade(struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + unsigned int cascade_irq = i8259_irq(); + + if (cascade_irq) + generic_handle_irq(cascade_irq); + + chip->irq_eoi(&desc->irq_data); +} + +static void __init pseries_setup_i8259_cascade(void) +{ + struct device_node *np, *old, *found = NULL; + unsigned int cascade; + const u32 *addrp; + unsigned long intack = 0; + int naddr; + + for_each_node_by_type(np, "interrupt-controller") { + if (of_device_is_compatible(np, "chrp,iic")) { + found = np; + break; + } + } + + if (found == NULL) { + printk(KERN_DEBUG "pic: no ISA interrupt controller\n"); + return; + } + + cascade = irq_of_parse_and_map(found, 0); + if (!cascade) { + printk(KERN_ERR "pic: failed to map cascade interrupt"); + return; + } + pr_debug("pic: cascade mapped to irq %d\n", cascade); + + for (old = of_node_get(found); old != NULL ; old = np) { + np = of_get_parent(old); + of_node_put(old); + if (np == NULL) + break; + if (strcmp(np->name, "pci") != 0) + continue; + addrp = of_get_property(np, "8259-interrupt-acknowledge", NULL); + if (addrp == NULL) + continue; + naddr = of_n_addr_cells(np); + intack = addrp[naddr-1]; + if (naddr > 1) + intack |= ((unsigned long)addrp[naddr-2]) << 32; + } + if (intack) + printk(KERN_DEBUG "pic: PCI 8259 intack at 0x%016lx\n", intack); + i8259_init(found, intack); + of_node_put(found); + irq_set_chained_handler(cascade, pseries_8259_cascade); +} + +static void __init pseries_init_irq(void) +{ + /* Try using a XIVE if available, otherwise use a XICS */ + if (!xive_spapr_init()) { + xics_init(); + pseries_setup_i8259_cascade(); + } +} + +static void pseries_lpar_enable_pmcs(void) +{ + unsigned long set, reset; + + set = 1UL << 63; + reset = 0; + plpar_hcall_norets(H_PERFMON, set, reset); +} + +static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *data) +{ + struct of_reconfig_data *rd = data; + struct device_node *parent, *np = rd->dn; + struct pci_dn *pdn; + int err = NOTIFY_OK; + + switch (action) { + case OF_RECONFIG_ATTACH_NODE: + parent = of_get_parent(np); + pdn = parent ? PCI_DN(parent) : NULL; + if (pdn) + pci_add_device_node_info(pdn->phb, np); + + of_node_put(parent); + break; + case OF_RECONFIG_DETACH_NODE: + pdn = PCI_DN(np); + if (pdn) + list_del(&pdn->list); + break; + default: + err = NOTIFY_DONE; + break; + } + return err; +} + +static struct notifier_block pci_dn_reconfig_nb = { + .notifier_call = pci_dn_reconfig_notifier, +}; + +struct kmem_cache *dtl_cache; + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +/* + * Allocate space for the dispatch trace log for all possible cpus + * and register the buffers with the hypervisor. This is used for + * computing time stolen by the hypervisor. 
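
(Editorial note; bit names hedged from this era's asm/lppaca.h.) Each buffer registered in alloc_dispatch_logs() below holds N_DISPATCH_LOG entries, and the lppaca dtl_enable_mask selects which events the hypervisor records:

	/* 0x1 cede, 0x2 preempt/dispatch, 0x4 fault; the code below sets 2,
	 * i.e. dispatch/preempt events only */
	get_paca()->lppaca_ptr->dtl_enable_mask = 0x2;
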
+ */ +static int alloc_dispatch_logs(void) +{ + int cpu, ret; + struct paca_struct *pp; + struct dtl_entry *dtl; + + if (!firmware_has_feature(FW_FEATURE_SPLPAR)) + return 0; + + if (!dtl_cache) + return 0; + + for_each_possible_cpu(cpu) { + pp = paca_ptrs[cpu]; + dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL); + if (!dtl) { + pr_warn("Failed to allocate dispatch trace log for cpu %d\n", + cpu); + pr_warn("Stolen time statistics will be unreliable\n"); + break; + } + + pp->dtl_ridx = 0; + pp->dispatch_log = dtl; + pp->dispatch_log_end = dtl + N_DISPATCH_LOG; + pp->dtl_curr = dtl; + } + + /* Register the DTL for the current (boot) cpu */ + dtl = get_paca()->dispatch_log; + get_paca()->dtl_ridx = 0; + get_paca()->dtl_curr = dtl; + get_paca()->lppaca_ptr->dtl_idx = 0; + + /* hypervisor reads buffer length from this field */ + dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES); + ret = register_dtl(hard_smp_processor_id(), __pa(dtl)); + if (ret) + pr_err("WARNING: DTL registration of cpu %d (hw %d) failed " + "with %d\n", smp_processor_id(), + hard_smp_processor_id(), ret); + get_paca()->lppaca_ptr->dtl_enable_mask = 2; + + return 0; +} +#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ +static inline int alloc_dispatch_logs(void) +{ + return 0; +} +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ + +static int alloc_dispatch_log_kmem_cache(void) +{ + dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES, + DISPATCH_LOG_BYTES, 0, NULL); + if (!dtl_cache) { + pr_warn("Failed to create dispatch trace log buffer cache\n"); + pr_warn("Stolen time statistics will be unreliable\n"); + return 0; + } + + return alloc_dispatch_logs(); +} +machine_early_initcall(pseries, alloc_dispatch_log_kmem_cache); + +static void pseries_lpar_idle(void) +{ + /* + * Default handler to go into low thread priority and possibly + * low power mode by ceding processor to hypervisor + */ + + if (!prep_irq_for_idle()) + return; + + /* Indicate to hypervisor that we are idle. */ + get_lppaca()->idle = 1; + + /* + * Yield the processor to the hypervisor. We return if + * an external interrupt occurs (which are driven prior + * to returning here) or if a prod occurs from another + * processor. When returning here, external interrupts + * are enabled. + */ + cede_processor(); + + get_lppaca()->idle = 0; +} + +/* + * Enable relocation on during exceptions. This has partition wide scope and + * may take a while to complete, if it takes longer than one second we will + * just give up rather than wasting any more time on this - if that turns out + * to ever be a problem in practice we can move this into a kernel thread to + * finish off the process later in boot. 
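+ * H_LONG_BUSY return codes carry a suggested retry interval, which
+ * get_longbusy_msecs() extracts; we sleep for that long between
+ * attempts, up to the one second cap below.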
+ */ +void pseries_enable_reloc_on_exc(void) +{ + long rc; + unsigned int delay, total_delay = 0; + + while (1) { + rc = enable_reloc_on_exceptions(); + if (!H_IS_LONG_BUSY(rc)) { + if (rc == H_P2) { + pr_info("Relocation on exceptions not" + " supported\n"); + } else if (rc != H_SUCCESS) { + pr_warn("Unable to enable relocation" + " on exceptions: %ld\n", rc); + } + break; + } + + delay = get_longbusy_msecs(rc); + total_delay += delay; + if (total_delay > 1000) { + pr_warn("Warning: Giving up waiting to enable " + "relocation on exceptions (%u msec)!\n", + total_delay); + return; + } + + mdelay(delay); + } +} +EXPORT_SYMBOL(pseries_enable_reloc_on_exc); + +void pseries_disable_reloc_on_exc(void) +{ + long rc; + + while (1) { + rc = disable_reloc_on_exceptions(); + if (!H_IS_LONG_BUSY(rc)) + break; + mdelay(get_longbusy_msecs(rc)); + } + if (rc != H_SUCCESS) + pr_warn("Warning: Failed to disable relocation on exceptions: %ld\n", + rc); +} +EXPORT_SYMBOL(pseries_disable_reloc_on_exc); + +#ifdef CONFIG_KEXEC_CORE +static void pSeries_machine_kexec(struct kimage *image) +{ + if (firmware_has_feature(FW_FEATURE_SET_MODE)) + pseries_disable_reloc_on_exc(); + + default_machine_kexec(image); +} +#endif + +#ifdef __LITTLE_ENDIAN__ +void pseries_big_endian_exceptions(void) +{ + long rc; + + while (1) { + rc = enable_big_endian_exceptions(); + if (!H_IS_LONG_BUSY(rc)) + break; + mdelay(get_longbusy_msecs(rc)); + } + + /* + * At this point it is unlikely panic() will get anything + * out to the user, since this is called very late in kexec + * but at least this will stop us from continuing on further + * and creating an even more difficult to debug situation. + * + * There is a known problem when kdump'ing, if cpus are offline + * the above call will fail. Rather than panicking again, keep + * going and hope the kdump kernel is also little endian, which + * it usually is. + */ + if (rc && !kdump_in_progress()) + panic("Could not enable big endian exceptions"); +} + +void pseries_little_endian_exceptions(void) +{ + long rc; + + while (1) { + rc = enable_little_endian_exceptions(); + if (!H_IS_LONG_BUSY(rc)) + break; + mdelay(get_longbusy_msecs(rc)); + } + if (rc) { + ppc_md.progress("H_SET_MODE LE exception fail", 0); + panic("Could not enable little endian exceptions"); + } +} +#endif + +static void __init find_and_init_phbs(void) +{ + struct device_node *node; + struct pci_controller *phb; + struct device_node *root = of_find_node_by_path("/"); + + for_each_child_of_node(root, node) { + if (node->type == NULL || (strcmp(node->type, "pci") != 0 && + strcmp(node->type, "pciex") != 0)) + continue; + + phb = pcibios_alloc_controller(node); + if (!phb) + continue; + rtas_setup_phb(phb); + pci_process_bridge_OF_ranges(phb, node, 0); + isa_bridge_find_early(phb); + phb->controller_ops = pseries_pci_controller_ops; + } + + of_node_put(root); + + /* + * PCI_PROBE_ONLY and PCI_REASSIGN_ALL_BUS can be set via properties + * in chosen. + */ + of_pci_check_probe_only(); +} + +static void init_cpu_char_feature_flags(struct h_cpu_char_result *result) +{ + /* + * The features below are disabled by default, so we instead look to see + * if firmware has *enabled* them, and set them if so. 
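+ * (The "character" word reports CPU capabilities, while the
+ * "behaviour" word reports the firmware's recommended default
+ * behaviours.)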
+ */ + if (result->character & H_CPU_CHAR_SPEC_BAR_ORI31) + security_ftr_set(SEC_FTR_SPEC_BAR_ORI31); + + if (result->character & H_CPU_CHAR_BCCTRL_SERIALISED) + security_ftr_set(SEC_FTR_BCCTRL_SERIALISED); + + if (result->character & H_CPU_CHAR_L1D_FLUSH_ORI30) + security_ftr_set(SEC_FTR_L1D_FLUSH_ORI30); + + if (result->character & H_CPU_CHAR_L1D_FLUSH_TRIG2) + security_ftr_set(SEC_FTR_L1D_FLUSH_TRIG2); + + if (result->character & H_CPU_CHAR_L1D_THREAD_PRIV) + security_ftr_set(SEC_FTR_L1D_THREAD_PRIV); + + if (result->character & H_CPU_CHAR_COUNT_CACHE_DISABLED) + security_ftr_set(SEC_FTR_COUNT_CACHE_DISABLED); + + if (result->character & H_CPU_CHAR_BCCTR_FLUSH_ASSIST) + security_ftr_set(SEC_FTR_BCCTR_FLUSH_ASSIST); + + if (result->behaviour & H_CPU_BEHAV_FLUSH_COUNT_CACHE) + security_ftr_set(SEC_FTR_FLUSH_COUNT_CACHE); + + /* + * The features below are enabled by default, so we instead look to see + * if firmware has *disabled* them, and clear them if so. + */ + if (!(result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY)) + security_ftr_clear(SEC_FTR_FAVOUR_SECURITY); + + if (!(result->behaviour & H_CPU_BEHAV_L1D_FLUSH_PR)) + security_ftr_clear(SEC_FTR_L1D_FLUSH_PR); + + if (!(result->behaviour & H_CPU_BEHAV_BNDS_CHK_SPEC_BAR)) + security_ftr_clear(SEC_FTR_BNDS_CHK_SPEC_BAR); +} + +void pseries_setup_rfi_flush(void) +{ + struct h_cpu_char_result result; + enum l1d_flush_type types; + bool enable; + long rc; + + /* + * Set features to the defaults assumed by init_cpu_char_feature_flags() + * so it can set/clear again any features that might have changed after + * migration, and in case the hypercall fails and it is not even called. + */ + powerpc_security_features = SEC_FTR_DEFAULT; + + rc = plpar_get_cpu_characteristics(&result); + if (rc == H_SUCCESS) + init_cpu_char_feature_flags(&result); + + /* + * We're the guest so this doesn't apply to us, clear it to simplify + * handling of it elsewhere. 
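+ * (SEC_FTR_L1D_FLUSH_HV is only meaningful when acting as the
+ * hypervisor; on pseries we always run as a guest.)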
+ */
+ security_ftr_clear(SEC_FTR_L1D_FLUSH_HV);
+
+ types = L1D_FLUSH_FALLBACK;
+
+ if (security_ftr_enabled(SEC_FTR_L1D_FLUSH_TRIG2))
+ types |= L1D_FLUSH_MTTRIG;
+
+ if (security_ftr_enabled(SEC_FTR_L1D_FLUSH_ORI30))
+ types |= L1D_FLUSH_ORI;
+
+ enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+ security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR);
+
+ setup_rfi_flush(types, enable);
+ setup_count_cache_flush();
+
+ enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+ security_ftr_enabled(SEC_FTR_L1D_FLUSH_ENTRY);
+ setup_entry_flush(enable);
+
+ enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+ security_ftr_enabled(SEC_FTR_L1D_FLUSH_UACCESS);
+ setup_uaccess_flush(enable);
+}
+
+#ifdef CONFIG_PCI_IOV
+enum rtas_iov_fw_value_map {
+ NUM_RES_PROPERTY = 0, /* Number of Resources */
+ LOW_INT = 1, /* Lowest 32 bits of Address */
+ START_OF_ENTRIES = 2, /* Always start of entry */
+ APERTURE_PROPERTY = 2, /* Start of entry+ to Aperture Size */
+ WDW_SIZE_PROPERTY = 4, /* Start of entry+ to Window Size */
+ NEXT_ENTRY = 7 /* Go to next entry on array */
+};
+
+enum get_iov_fw_value_index {
+ BAR_ADDRS = 1, /* Get BAR Address */
+ APERTURE_SIZE = 2, /* Get Aperture Size */
+ WDW_SIZE = 3 /* Get Window Size */
+};
+
+resource_size_t pseries_get_iov_fw_value(struct pci_dev *dev, int resno,
+ enum get_iov_fw_value_index value)
+{
+ const int *indexes;
+ struct device_node *dn = pci_device_to_OF_node(dev);
+ int i, num_res, ret = 0;
+
+ indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
+ if (!indexes)
+ return 0;
+
+ /*
+ * First element in the array is the number of BARs
+ * returned. Search through the list to find the matching
+ * BAR.
+ */
+ num_res = of_read_number(&indexes[NUM_RES_PROPERTY], 1);
+ if (resno >= num_res)
+ return 0; /* or an error */
+
+ i = START_OF_ENTRIES + NEXT_ENTRY * resno;
+ switch (value) {
+ case BAR_ADDRS:
+ ret = of_read_number(&indexes[i], 2);
+ break;
+ case APERTURE_SIZE:
+ ret = of_read_number(&indexes[i + APERTURE_PROPERTY], 2);
+ break;
+ case WDW_SIZE:
+ ret = of_read_number(&indexes[i + WDW_SIZE_PROPERTY], 2);
+ break;
+ }
+
+ return ret;
+}
+
+void of_pci_set_vf_bar_size(struct pci_dev *dev, const int *indexes)
+{
+ struct resource *res;
+ resource_size_t base, size;
+ int i, r, num_res;
+
+ num_res = of_read_number(&indexes[NUM_RES_PROPERTY], 1);
+ num_res = min_t(int, num_res, PCI_SRIOV_NUM_BARS);
+ for (i = START_OF_ENTRIES, r = 0; r < num_res && r < PCI_SRIOV_NUM_BARS;
+ i += NEXT_ENTRY, r++) {
+ res = &dev->resource[r + PCI_IOV_RESOURCES];
+ base = of_read_number(&indexes[i], 2);
+ size = of_read_number(&indexes[i + APERTURE_PROPERTY], 2);
+ res->flags = pci_parse_of_flags(of_read_number
+ (&indexes[i + LOW_INT], 1), 0);
+ res->flags |= (IORESOURCE_MEM_64 | IORESOURCE_PCI_FIXED);
+ res->name = pci_name(dev);
+ res->start = base;
+ res->end = base + size - 1;
+ }
+}
+
+void of_pci_parse_iov_addrs(struct pci_dev *dev, const int *indexes)
+{
+ struct resource *res, *root, *conflict;
+ resource_size_t base, size;
+ int i, r, num_res;
+
+ /*
+ * First element in the array is the number of BARs
+ * returned. Step through the list, match each BAR, and
+ * assign the firmware-provided addresses to the resource
+ * structures.
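+ * Each BAR entry spans NEXT_ENTRY cells: a two-cell base address,
+ * then the aperture size at APERTURE_PROPERTY and the window size at
+ * WDW_SIZE_PROPERTY, per the rtas_iov_fw_value_map enum above.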
+ */
+ num_res = of_read_number(&indexes[NUM_RES_PROPERTY], 1);
+ for (i = START_OF_ENTRIES, r = 0; r < num_res && r < PCI_SRIOV_NUM_BARS;
+ i += NEXT_ENTRY, r++) {
+ res = &dev->resource[r + PCI_IOV_RESOURCES];
+ base = of_read_number(&indexes[i], 2);
+ size = of_read_number(&indexes[i + WDW_SIZE_PROPERTY], 2);
+ res->name = pci_name(dev);
+ res->start = base;
+ res->end = base + size - 1;
+ root = &iomem_resource;
+ dev_dbg(&dev->dev,
+ "pSeries IOV BAR %d: trying firmware assignment %pR\n",
+ r + PCI_IOV_RESOURCES, res);
+ conflict = request_resource_conflict(root, res);
+ if (conflict) {
+ dev_info(&dev->dev,
+ "BAR %d: %pR conflicts with %s %pR\n",
+ r + PCI_IOV_RESOURCES, res,
+ conflict->name, conflict);
+ res->flags |= IORESOURCE_UNSET;
+ }
+ }
+}
+
+static void pseries_disable_sriov_resources(struct pci_dev *pdev)
+{
+ int i;
+
+ pci_warn(pdev, "No hypervisor support for SR-IOV on this device, IOV BARs disabled.\n");
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
+ pdev->resource[i + PCI_IOV_RESOURCES].flags = 0;
+}
+
+static void pseries_pci_fixup_resources(struct pci_dev *pdev)
+{
+ const int *indexes;
+ struct device_node *dn = pci_device_to_OF_node(pdev);
+
+ /* Firmware must support open SR-IOV, otherwise don't configure */
+ indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
+ if (indexes)
+ of_pci_set_vf_bar_size(pdev, indexes);
+ else
+ pseries_disable_sriov_resources(pdev);
+}
+
+static void pseries_pci_fixup_iov_resources(struct pci_dev *pdev)
+{
+ const int *indexes;
+ struct device_node *dn = pci_device_to_OF_node(pdev);
+
+ if (!pdev->is_physfn || pci_dev_is_added(pdev))
+ return;
+ /* Firmware must support open SR-IOV, otherwise don't configure */
+ indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
+ if (indexes)
+ of_pci_parse_iov_addrs(pdev, indexes);
+ else
+ pseries_disable_sriov_resources(pdev);
+}
+
+static resource_size_t pseries_pci_iov_resource_alignment(struct pci_dev *pdev,
+ int resno)
+{
+ const __be32 *reg;
+ struct device_node *dn = pci_device_to_OF_node(pdev);
+
+ /* Firmware must support open SR-IOV, otherwise report regular alignment */
+ reg = of_get_property(dn, "ibm,is-open-sriov-pf", NULL);
+ if (!reg)
+ return pci_iov_resource_size(pdev, resno);
+
+ if (!pdev->is_physfn)
+ return 0;
+ return pseries_get_iov_fw_value(pdev,
+ resno - PCI_IOV_RESOURCES,
+ APERTURE_SIZE);
+}
+#endif
+
+static void __init pSeries_setup_arch(void)
+{
+ set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
+
+ /* Discover PIC type and setup ppc_md accordingly */
+ smp_init_pseries();
+
+ /* openpic global configuration register (64-bit format). */
+ /* openpic Interrupt Source Unit pointer (64-bit format). */
+ /* python0 facility area (mmio) (64-bit format) REAL address.
*/ + + /* init to some ~sane value until calibrate_delay() runs */ + loops_per_jiffy = 50000000; + + fwnmi_init(); + + pseries_setup_rfi_flush(); + setup_stf_barrier(); + + /* By default, only probe PCI (can be overridden by rtas_pci) */ + pci_add_flags(PCI_PROBE_ONLY); + + /* Find and initialize PCI host bridges */ + init_pci_config_tokens(); + find_and_init_phbs(); + of_reconfig_notifier_register(&pci_dn_reconfig_nb); + + pSeries_nvram_init(); + + if (firmware_has_feature(FW_FEATURE_LPAR)) { + vpa_init(boot_cpuid); + + if (lppaca_shared_proc(get_lppaca())) + static_branch_enable(&shared_processor); + + ppc_md.power_save = pseries_lpar_idle; + ppc_md.enable_pmcs = pseries_lpar_enable_pmcs; +#ifdef CONFIG_PCI_IOV + ppc_md.pcibios_fixup_resources = + pseries_pci_fixup_resources; + ppc_md.pcibios_fixup_sriov = + pseries_pci_fixup_iov_resources; + ppc_md.pcibios_iov_resource_alignment = + pseries_pci_iov_resource_alignment; +#endif + } else { + /* No special idle routine */ + ppc_md.enable_pmcs = power4_enable_pmcs; + } + + ppc_md.pcibios_root_bridge_prepare = pseries_root_bridge_prepare; +} + +static void pseries_panic(char *str) +{ + panic_flush_kmsg_end(); + rtas_os_term(str); +} + +static int __init pSeries_init_panel(void) +{ + /* Manually leave the kernel version on the panel. */ +#ifdef __BIG_ENDIAN__ + ppc_md.progress("Linux ppc64\n", 0); +#else + ppc_md.progress("Linux ppc64le\n", 0); +#endif + ppc_md.progress(init_utsname()->version, 0); + + return 0; +} +machine_arch_initcall(pseries, pSeries_init_panel); + +static int pseries_set_dabr(unsigned long dabr, unsigned long dabrx) +{ + return plpar_hcall_norets(H_SET_DABR, dabr); +} + +static int pseries_set_xdabr(unsigned long dabr, unsigned long dabrx) +{ + /* Have to set at least one bit in the DABRX according to PAPR */ + if (dabrx == 0 && dabr == 0) + dabrx = DABRX_USER; + /* PAPR says we can only set kernel and user bits */ + dabrx &= DABRX_KERNEL | DABRX_USER; + + return plpar_hcall_norets(H_SET_XDABR, dabr, dabrx); +} + +static int pseries_set_dawr(unsigned long dawr, unsigned long dawrx) +{ + /* PAPR says we can't set HYP */ + dawrx &= ~DAWRX_HYP; + + return plpar_set_watchpoint0(dawr, dawrx); +} + +#define CMO_CHARACTERISTICS_TOKEN 44 +#define CMO_MAXLENGTH 1026 + +void pSeries_coalesce_init(void) +{ + struct hvcall_mpp_x_data mpp_x_data; + + if (firmware_has_feature(FW_FEATURE_CMO) && !h_get_mpp_x(&mpp_x_data)) + powerpc_firmware_features |= FW_FEATURE_XCMO; + else + powerpc_firmware_features &= ~FW_FEATURE_XCMO; +} + +/** + * fw_cmo_feature_init - FW_FEATURE_CMO is not stored in ibm,hypertas-functions, + * handle that here. 
(Stolen from parse_system_parameter_string) + */ +static void pSeries_cmo_feature_init(void) +{ + char *ptr, *key, *value, *end; + int call_status; + int page_order = IOMMU_PAGE_SHIFT_4K; + + pr_debug(" -> fw_cmo_feature_init()\n"); + spin_lock(&rtas_data_buf_lock); + memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE); + call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, + NULL, + CMO_CHARACTERISTICS_TOKEN, + __pa(rtas_data_buf), + RTAS_DATA_BUF_SIZE); + + if (call_status != 0) { + spin_unlock(&rtas_data_buf_lock); + pr_debug("CMO not available\n"); + pr_debug(" <- fw_cmo_feature_init()\n"); + return; + } + + end = rtas_data_buf + CMO_MAXLENGTH - 2; + ptr = rtas_data_buf + 2; /* step over strlen value */ + key = value = ptr; + + while (*ptr && (ptr <= end)) { + /* Separate the key and value by replacing '=' with '\0' and + * point the value at the string after the '=' + */ + if (ptr[0] == '=') { + ptr[0] = '\0'; + value = ptr + 1; + } else if (ptr[0] == '\0' || ptr[0] == ',') { + /* Terminate the string containing the key/value pair */ + ptr[0] = '\0'; + + if (key == value) { + pr_debug("Malformed key/value pair\n"); + /* Never found a '=', end processing */ + break; + } + + if (0 == strcmp(key, "CMOPageSize")) + page_order = simple_strtol(value, NULL, 10); + else if (0 == strcmp(key, "PrPSP")) + CMO_PrPSP = simple_strtol(value, NULL, 10); + else if (0 == strcmp(key, "SecPSP")) + CMO_SecPSP = simple_strtol(value, NULL, 10); + value = key = ptr + 1; + } + ptr++; + } + + /* Page size is returned as the power of 2 of the page size, + * convert to the page size in bytes before returning + */ + CMO_PageSize = 1 << page_order; + pr_debug("CMO_PageSize = %lu\n", CMO_PageSize); + + if (CMO_PrPSP != -1 || CMO_SecPSP != -1) { + pr_info("CMO enabled\n"); + pr_debug("CMO enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP, + CMO_SecPSP); + powerpc_firmware_features |= FW_FEATURE_CMO; + pSeries_coalesce_init(); + } else + pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP, + CMO_SecPSP); + spin_unlock(&rtas_data_buf_lock); + pr_debug(" <- fw_cmo_feature_init()\n"); +} + +/* + * Early initialization. Relocation is on but do not reference unbolted pages + */ +static void __init pseries_init(void) +{ + pr_debug(" -> pseries_init()\n"); + +#ifdef CONFIG_HVC_CONSOLE + if (firmware_has_feature(FW_FEATURE_LPAR)) + hvc_vio_init_early(); +#endif + if (firmware_has_feature(FW_FEATURE_XDABR)) + ppc_md.set_dabr = pseries_set_xdabr; + else if (firmware_has_feature(FW_FEATURE_DABR)) + ppc_md.set_dabr = pseries_set_dabr; + + if (firmware_has_feature(FW_FEATURE_SET_MODE)) + ppc_md.set_dawr = pseries_set_dawr; + + pSeries_cmo_feature_init(); + iommu_init_early_pSeries(); + + pr_debug(" <- pseries_init()\n"); +} + +/** + * pseries_power_off - tell firmware about how to power off the system. + * + * This function calls either the power-off rtas token in normal cases + * or the ibm,power-off-ups token (if present & requested) in case of + * a power failure. If power-off token is used, power on will only be + * possible with power button press. If ibm,power-off-ups token is used + * it will allow auto poweron after power is restored. 
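+ * Which path is taken depends on the rtas_poweron_auto flag and on
+ * whether firmware implements the ibm,power-off-ups token, as the
+ * checks below show.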
+ */ +static void pseries_power_off(void) +{ + int rc; + int rtas_poweroff_ups_token = rtas_token("ibm,power-off-ups"); + + if (rtas_flash_term_hook) + rtas_flash_term_hook(SYS_POWER_OFF); + + if (rtas_poweron_auto == 0 || + rtas_poweroff_ups_token == RTAS_UNKNOWN_SERVICE) { + rc = rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1); + printk(KERN_INFO "RTAS power-off returned %d\n", rc); + } else { + rc = rtas_call(rtas_poweroff_ups_token, 0, 1, NULL); + printk(KERN_INFO "RTAS ibm,power-off-ups returned %d\n", rc); + } + for (;;); +} + +static int __init pSeries_probe(void) +{ + const char *dtype = of_get_property(of_root, "device_type", NULL); + + if (dtype == NULL) + return 0; + if (strcmp(dtype, "chrp")) + return 0; + + /* Cell blades firmware claims to be chrp while it's not. Until this + * is fixed, we need to avoid those here. + */ + if (of_machine_is_compatible("IBM,CPBW-1.0") || + of_machine_is_compatible("IBM,CBEA")) + return 0; + + pm_power_off = pseries_power_off; + + pr_debug("Machine is%s LPAR !\n", + (powerpc_firmware_features & FW_FEATURE_LPAR) ? "" : " not"); + + pseries_init(); + + return 1; +} + +static int pSeries_pci_probe_mode(struct pci_bus *bus) +{ + if (firmware_has_feature(FW_FEATURE_LPAR)) + return PCI_PROBE_DEVTREE; + return PCI_PROBE_NORMAL; +} + +struct pci_controller_ops pseries_pci_controller_ops = { + .probe_mode = pSeries_pci_probe_mode, +}; + +define_machine(pseries) { + .name = "pSeries", + .probe = pSeries_probe, + .setup_arch = pSeries_setup_arch, + .init_IRQ = pseries_init_irq, + .show_cpuinfo = pSeries_show_cpuinfo, + .log_error = pSeries_log_error, + .pcibios_fixup = pSeries_final_fixup, + .restart = rtas_restart, + .halt = rtas_halt, + .panic = pseries_panic, + .get_boot_time = rtas_get_boot_time, + .get_rtc_time = rtas_get_rtc_time, + .set_rtc_time = rtas_set_rtc_time, + .calibrate_decr = generic_calibrate_decr, + .progress = rtas_progress, + .system_reset_exception = pSeries_system_reset_exception, + .machine_check_exception = pSeries_machine_check_exception, +#ifdef CONFIG_KEXEC_CORE + .machine_kexec = pSeries_machine_kexec, + .kexec_cpu_down = pseries_kexec_cpu_down, +#endif +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE + .memory_block_size = pseries_memory_block_size, +#endif +}; diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c new file mode 100644 index 000000000..3df46123c --- /dev/null +++ b/arch/powerpc/platforms/pseries/smp.c @@ -0,0 +1,292 @@ +/* + * SMP support for pSeries machines. + * + * Dave Engebretsen, Peter Bergner, and + * Mike Corrigan {engebret|bergner|mikec}@us.ibm.com + * + * Plus various changes from other IBM teams... + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/cache.h> +#include <linux/err.h> +#include <linux/device.h> +#include <linux/cpu.h> + +#include <asm/ptrace.h> +#include <linux/atomic.h> +#include <asm/irq.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/smp.h> +#include <asm/paca.h> +#include <asm/machdep.h> +#include <asm/cputable.h> +#include <asm/firmware.h> +#include <asm/rtas.h> +#include <asm/vdso_datapage.h> +#include <asm/cputhreads.h> +#include <asm/xics.h> +#include <asm/xive.h> +#include <asm/dbell.h> +#include <asm/plpar_wrappers.h> +#include <asm/code-patching.h> + +#include "pseries.h" +#include "offline_states.h" + + +/* + * The Primary thread of each non-boot processor was started from the OF client + * interface by prom_hold_cpus and is spinning on secondary_hold_spinloop. + */ +static cpumask_var_t of_spin_mask; + +/* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */ +int smp_query_cpu_stopped(unsigned int pcpu) +{ + int cpu_status, status; + int qcss_tok = rtas_token("query-cpu-stopped-state"); + + if (qcss_tok == RTAS_UNKNOWN_SERVICE) { + printk_once(KERN_INFO + "Firmware doesn't support query-cpu-stopped-state\n"); + return QCSS_HARDWARE_ERROR; + } + + status = rtas_call(qcss_tok, 1, 2, &cpu_status, pcpu); + if (status != 0) { + printk(KERN_ERR + "RTAS query-cpu-stopped-state failed: %i\n", status); + return status; + } + + return cpu_status; +} + +/** + * smp_startup_cpu() - start the given cpu + * + * At boot time, there is nothing to do for primary threads which were + * started from Open Firmware. For anything else, call RTAS with the + * appropriate start location. + * + * Returns: + * 0 - failure + * 1 - success + */ +static inline int smp_startup_cpu(unsigned int lcpu) +{ + int status; + unsigned long start_here = + __pa(ppc_function_entry(generic_secondary_smp_init)); + unsigned int pcpu; + int start_cpu; + + if (cpumask_test_cpu(lcpu, of_spin_mask)) + /* Already started by OF and sitting in spin loop */ + return 1; + + pcpu = get_hard_smp_processor_id(lcpu); + + /* Check to see if the CPU out of FW already for kexec */ + if (smp_query_cpu_stopped(pcpu) == QCSS_NOT_STOPPED){ + cpumask_set_cpu(lcpu, of_spin_mask); + return 1; + } + + /* Fixup atomic count: it exited inside IRQ handler. */ + task_thread_info(paca_ptrs[lcpu]->__current)->preempt_count = 0; +#ifdef CONFIG_HOTPLUG_CPU + if (get_cpu_current_state(lcpu) == CPU_STATE_INACTIVE) + goto out; +#endif + /* + * If the RTAS start-cpu token does not exist then presume the + * cpu is already spinning. 
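+ * (Returning 1 for a spinning cpu is fine: it is released once
+ * smp_pSeries_kick_cpu() sets paca->cpu_start.)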
+ */ + start_cpu = rtas_token("start-cpu"); + if (start_cpu == RTAS_UNKNOWN_SERVICE) + return 1; + + status = rtas_call(start_cpu, 3, 1, NULL, pcpu, start_here, pcpu); + if (status != 0) { + printk(KERN_ERR "start-cpu failed: %i\n", status); + return 0; + } + +#ifdef CONFIG_HOTPLUG_CPU +out: +#endif + return 1; +} + +static void smp_setup_cpu(int cpu) +{ + if (xive_enabled()) + xive_smp_setup_cpu(); + else if (cpu != boot_cpuid) + xics_setup_cpu(); + + if (firmware_has_feature(FW_FEATURE_SPLPAR)) + vpa_init(cpu); + + cpumask_clear_cpu(cpu, of_spin_mask); +#ifdef CONFIG_HOTPLUG_CPU + set_cpu_current_state(cpu, CPU_STATE_ONLINE); + set_default_offline_state(cpu); +#endif +} + +static int smp_pSeries_kick_cpu(int nr) +{ + if (nr < 0 || nr >= nr_cpu_ids) + return -EINVAL; + + if (!smp_startup_cpu(nr)) + return -ENOENT; + + /* + * The processor is currently spinning, waiting for the + * cpu_start field to become non-zero After we set cpu_start, + * the processor will continue on to secondary_start + */ + paca_ptrs[nr]->cpu_start = 1; +#ifdef CONFIG_HOTPLUG_CPU + set_preferred_offline_state(nr, CPU_STATE_ONLINE); + + if (get_cpu_current_state(nr) == CPU_STATE_INACTIVE) { + long rc; + unsigned long hcpuid; + + hcpuid = get_hard_smp_processor_id(nr); + rc = plpar_hcall_norets(H_PROD, hcpuid); + if (rc != H_SUCCESS) + printk(KERN_ERR "Error: Prod to wake up processor %d " + "Ret= %ld\n", nr, rc); + } +#endif + + return 0; +} + +static int pseries_smp_prepare_cpu(int cpu) +{ + if (xive_enabled()) + return xive_smp_prepare_cpu(cpu); + return 0; +} + +static void smp_pseries_cause_ipi(int cpu) +{ + /* POWER9 should not use this handler */ + if (doorbell_try_core_ipi(cpu)) + return; + + icp_ops->cause_ipi(cpu); +} + +static int pseries_cause_nmi_ipi(int cpu) +{ + int hwcpu; + + if (cpu == NMI_IPI_ALL_OTHERS) { + hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS; + } else { + if (cpu < 0) { + WARN_ONCE(true, "incorrect cpu parameter %d", cpu); + return 0; + } + + hwcpu = get_hard_smp_processor_id(cpu); + } + + if (plpar_signal_sys_reset(hwcpu) == H_SUCCESS) + return 1; + + return 0; +} + +static __init void pSeries_smp_probe_xics(void) +{ + xics_smp_probe(); + + if (cpu_has_feature(CPU_FTR_DBELL)) + smp_ops->cause_ipi = smp_pseries_cause_ipi; + else + smp_ops->cause_ipi = icp_ops->cause_ipi; +} + +static __init void pSeries_smp_probe(void) +{ + if (xive_enabled()) + /* + * Don't use P9 doorbells when XIVE is enabled. IPIs + * using MMIOs should be faster + */ + xive_smp_probe(); + else + pSeries_smp_probe_xics(); +} + +static struct smp_ops_t pseries_smp_ops = { + .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */ + .cause_ipi = NULL, /* Filled at runtime by pSeries_smp_probe() */ + .cause_nmi_ipi = pseries_cause_nmi_ipi, + .probe = pSeries_smp_probe, + .prepare_cpu = pseries_smp_prepare_cpu, + .kick_cpu = smp_pSeries_kick_cpu, + .setup_cpu = smp_setup_cpu, + .cpu_bootable = smp_generic_cpu_bootable, +}; + +/* This is called very early */ +void __init smp_init_pseries(void) +{ + int i; + + pr_debug(" -> smp_init_pSeries()\n"); + smp_ops = &pseries_smp_ops; + + alloc_bootmem_cpumask_var(&of_spin_mask); + + /* + * Mark threads which are still spinning in hold loops + * + * We know prom_init will not have started them if RTAS supports + * query-cpu-stopped-state. 
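+ * Otherwise, conservatively mark every present cpu (or every primary
+ * thread when SMT is available) except the boot cpu as possibly
+ * spinning.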
+ */ + if (rtas_token("query-cpu-stopped-state") == RTAS_UNKNOWN_SERVICE) { + if (cpu_has_feature(CPU_FTR_SMT)) { + for_each_present_cpu(i) { + if (cpu_thread_in_core(i) == 0) + cpumask_set_cpu(i, of_spin_mask); + } + } else + cpumask_copy(of_spin_mask, cpu_present_mask); + + cpumask_clear_cpu(boot_cpuid, of_spin_mask); + } + + /* Non-lpar has additional take/give timebase */ + if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) { + smp_ops->give_timebase = rtas_give_timebase; + smp_ops->take_timebase = rtas_take_timebase; + } + + pr_debug(" <- smp_init_pSeries()\n"); +} diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c new file mode 100644 index 000000000..5414d3295 --- /dev/null +++ b/arch/powerpc/platforms/pseries/suspend.c @@ -0,0 +1,278 @@ +/* + * Copyright (C) 2010 Brian King IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/suspend.h> +#include <linux/stat.h> +#include <asm/firmware.h> +#include <asm/hvcall.h> +#include <asm/machdep.h> +#include <asm/mmu.h> +#include <asm/rtas.h> +#include <asm/topology.h> + +static u64 stream_id; +static struct device suspend_dev; +static DECLARE_COMPLETION(suspend_work); +static struct rtas_suspend_me_data suspend_data; +static atomic_t suspending; + +/** + * pseries_suspend_begin - First phase of hibernation + * + * Check to ensure we are in a valid state to hibernate + * + * Return value: + * 0 on success / other on failure + **/ +static int pseries_suspend_begin(suspend_state_t state) +{ + long vasi_state, rc; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + /* Make sure the state is valid */ + rc = plpar_hcall(H_VASI_STATE, retbuf, stream_id); + + vasi_state = retbuf[0]; + + if (rc) { + pr_err("pseries_suspend_begin: vasi_state returned %ld\n",rc); + return rc; + } else if (vasi_state == H_VASI_ENABLED) { + return -EAGAIN; + } else if (vasi_state != H_VASI_SUSPENDING) { + pr_err("pseries_suspend_begin: vasi_state returned state %ld\n", + vasi_state); + return -EIO; + } + + return 0; +} + +/** + * pseries_suspend_cpu - Suspend a single CPU + * + * Makes the H_JOIN call to suspend the CPU + * + **/ +static int pseries_suspend_cpu(void) +{ + if (atomic_read(&suspending)) + return rtas_suspend_cpu(&suspend_data); + return 0; +} + +/** + * pseries_suspend_enable_irqs + * + * Post suspend configuration updates + * + **/ +static void pseries_suspend_enable_irqs(void) +{ + /* + * Update configuration which can be modified based on device tree + * changes during resume. 
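+ * post_mobility_fixup() pulls updated device tree properties from
+ * firmware, the same path used after a partition migration.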
+ */
+ post_mobility_fixup();
+}
+
+/**
+ * pseries_suspend_enter - Final phase of hibernation
+ *
+ * Return value:
+ * 0 on success / other on failure
+ **/
+static int pseries_suspend_enter(suspend_state_t state)
+{
+ int rc = rtas_suspend_last_cpu(&suspend_data);
+
+ atomic_set(&suspending, 0);
+ atomic_set(&suspend_data.done, 1);
+ return rc;
+}
+
+/**
+ * pseries_prepare_late - Prepare to suspend all other CPUs
+ *
+ * Return value:
+ * 0 on success / other on failure
+ **/
+static int pseries_prepare_late(void)
+{
+ atomic_set(&suspending, 1);
+ atomic_set(&suspend_data.working, 0);
+ atomic_set(&suspend_data.done, 0);
+ atomic_set(&suspend_data.error, 0);
+ suspend_data.complete = &suspend_work;
+ reinit_completion(&suspend_work);
+ return 0;
+}
+
+/**
+ * store_hibernate - Initiate partition hibernation
+ * @dev: subsys root device
+ * @attr: device attribute struct
+ * @buf: buffer
+ * @count: buffer size
+ *
+ * Write the stream ID received from the HMC to this file
+ * to trigger hibernating the partition
+ *
+ * Return value:
+ * number of bytes written / other on failure
+ **/
+static ssize_t store_hibernate(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ cpumask_var_t offline_mask;
+ int rc;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!alloc_cpumask_var(&offline_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ stream_id = simple_strtoul(buf, NULL, 16);
+
+ do {
+ rc = pseries_suspend_begin(PM_SUSPEND_MEM);
+ if (rc == -EAGAIN)
+ ssleep(1);
+ } while (rc == -EAGAIN);
+
+ if (!rc) {
+ /* All present CPUs must be online */
+ cpumask_andnot(offline_mask, cpu_present_mask,
+ cpu_online_mask);
+ rc = rtas_online_cpus_mask(offline_mask);
+ if (rc) {
+ pr_err("%s: Could not bring present CPUs online.\n",
+ __func__);
+ goto out;
+ }
+
+ stop_topology_update();
+ rc = pm_suspend(PM_SUSPEND_MEM);
+ start_topology_update();
+
+ /* Take down CPUs not online prior to suspend */
+ if (!rtas_offline_cpus_mask(offline_mask))
+ pr_warn("%s: Could not restore CPUs to offline "
+ "state.\n", __func__);
+ }
+
+ stream_id = 0;
+
+ if (!rc)
+ rc = count;
+out:
+ free_cpumask_var(offline_mask);
+ return rc;
+}
+
+#define USER_DT_UPDATE 0
+#define KERN_DT_UPDATE 1
+
+/**
+ * show_hibernate - Report device tree update responsibility
+ * @dev: subsys root device
+ * @attr: device attribute struct
+ * @buf: buffer
+ *
+ * Report whether a device tree update is performed by the kernel after a
+ * resume, or if drmgr must coordinate the update from user space.
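+ * This kernel always performs the update itself, so KERN_DT_UPDATE is
+ * reported unconditionally. (The attribute typically surfaces as
+ * /sys/devices/system/power/hibernate; the exact path is inferred from
+ * the subsystem registration below, not guaranteed.)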
+ * + * Return value: + * 0 if drmgr is to initiate update, and 1 otherwise + **/ +static ssize_t show_hibernate(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", KERN_DT_UPDATE); +} + +static DEVICE_ATTR(hibernate, 0644, show_hibernate, store_hibernate); + +static struct bus_type suspend_subsys = { + .name = "power", + .dev_name = "power", +}; + +static const struct platform_suspend_ops pseries_suspend_ops = { + .valid = suspend_valid_only_mem, + .prepare_late = pseries_prepare_late, + .enter = pseries_suspend_enter, +}; + +/** + * pseries_suspend_sysfs_register - Register with sysfs + * + * Return value: + * 0 on success / other on failure + **/ +static int pseries_suspend_sysfs_register(struct device *dev) +{ + int rc; + + if ((rc = subsys_system_register(&suspend_subsys, NULL))) + return rc; + + dev->id = 0; + dev->bus = &suspend_subsys; + + if ((rc = device_create_file(suspend_subsys.dev_root, &dev_attr_hibernate))) + goto subsys_unregister; + + return 0; + +subsys_unregister: + bus_unregister(&suspend_subsys); + return rc; +} + +/** + * pseries_suspend_init - initcall for pSeries suspend + * + * Return value: + * 0 on success / other on failure + **/ +static int __init pseries_suspend_init(void) +{ + int rc; + + if (!firmware_has_feature(FW_FEATURE_LPAR)) + return 0; + + suspend_data.token = rtas_token("ibm,suspend-me"); + if (suspend_data.token == RTAS_UNKNOWN_SERVICE) + return 0; + + if ((rc = pseries_suspend_sysfs_register(&suspend_dev))) + return rc; + + ppc_md.suspend_disable_cpu = pseries_suspend_cpu; + ppc_md.suspend_enable_irqs = pseries_suspend_enable_irqs; + suspend_set_ops(&pseries_suspend_ops); + return 0; +} +machine_device_initcall(pseries, pseries_suspend_init); diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c new file mode 100644 index 000000000..0e7778be4 --- /dev/null +++ b/arch/powerpc/platforms/pseries/vio.c @@ -0,0 +1,1724 @@ +/* + * IBM PowerPC Virtual I/O Infrastructure Support. + * + * Copyright (c) 2003,2008 IBM Corp. + * Dave Engebretsen engebret@us.ibm.com + * Santiago Leon santil@us.ibm.com + * Hollis Blanchard <hollisb@us.ibm.com> + * Stephen Rothwell + * Robert Jennings <rcjenn@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/cpu.h> +#include <linux/types.h> +#include <linux/delay.h> +#include <linux/stat.h> +#include <linux/device.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/console.h> +#include <linux/export.h> +#include <linux/mm.h> +#include <linux/dma-mapping.h> +#include <linux/kobject.h> + +#include <asm/iommu.h> +#include <asm/dma.h> +#include <asm/vio.h> +#include <asm/prom.h> +#include <asm/firmware.h> +#include <asm/tce.h> +#include <asm/page.h> +#include <asm/hvcall.h> + +static struct vio_dev vio_bus_device = { /* fake "parent" device */ + .name = "vio", + .type = "", + .dev.init_name = "vio", + .dev.bus = &vio_bus_type, +}; + +#ifdef CONFIG_PPC_SMLPAR +/** + * vio_cmo_pool - A pool of IO memory for CMO use + * + * @size: The size of the pool in bytes + * @free: The amount of free memory in the pool + */ +struct vio_cmo_pool { + size_t size; + size_t free; +}; + +/* How many ms to delay queued balance work */ +#define VIO_CMO_BALANCE_DELAY 100 + +/* Portion out IO memory to CMO devices by this chunk size */ +#define VIO_CMO_BALANCE_CHUNK 131072 + +/** + * vio_cmo_dev_entry - A device that is CMO-enabled and requires entitlement + * + * @vio_dev: struct vio_dev pointer + * @list: pointer to other devices on bus that are being tracked + */ +struct vio_cmo_dev_entry { + struct vio_dev *viodev; + struct list_head list; +}; + +/** + * vio_cmo - VIO bus accounting structure for CMO entitlement + * + * @lock: spinlock for entire structure + * @balance_q: work queue for balancing system entitlement + * @device_list: list of CMO-enabled devices requiring entitlement + * @entitled: total system entitlement in bytes + * @reserve: pool of memory from which devices reserve entitlement, incl. spare + * @excess: pool of excess entitlement not needed for device reserves or spare + * @spare: IO memory for device hotplug functionality + * @min: minimum necessary for system operation + * @desired: desired memory for system operation + * @curr: bytes currently allocated + * @high: high water mark for IO data usage + */ +static struct vio_cmo { + spinlock_t lock; + struct delayed_work balance_q; + struct list_head device_list; + size_t entitled; + struct vio_cmo_pool reserve; + struct vio_cmo_pool excess; + size_t spare; + size_t min; + size_t desired; + size_t curr; + size_t high; +} vio_cmo; + +/** + * vio_cmo_OF_devices - Count the number of OF devices that have DMA windows + */ +static int vio_cmo_num_OF_devs(void) +{ + struct device_node *node_vroot; + int count = 0; + + /* + * Count the number of vdevice entries with an + * ibm,my-dma-window OF property + */ + node_vroot = of_find_node_by_name(NULL, "vdevice"); + if (node_vroot) { + struct device_node *of_node; + struct property *prop; + + for_each_child_of_node(node_vroot, of_node) { + prop = of_find_property(of_node, "ibm,my-dma-window", + NULL); + if (prop) + count++; + } + } + of_node_put(node_vroot); + return count; +} + +/** + * vio_cmo_alloc - allocate IO memory for CMO-enable devices + * + * @viodev: VIO device requesting IO memory + * @size: size of allocation requested + * + * Allocations come from memory reserved for the devices and any excess + * IO memory available to all devices. The spare pool used to service + * hotplug must be equal to %VIO_CMO_MIN_ENT for the excess pool to be + * made available. 
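+ * A request is charged against the device's unused entitlement first;
+ * only the remainder is drawn from the excess pool. For example
+ * (illustrative numbers): with 64KB entitled, 48KB allocated and a
+ * 24KB request, 16KB comes from the device reserve and 8KB from the
+ * excess pool.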
+ * + * Return codes: + * 0 for successful allocation and -ENOMEM for a failure + */ +static inline int vio_cmo_alloc(struct vio_dev *viodev, size_t size) +{ + unsigned long flags; + size_t reserve_free = 0; + size_t excess_free = 0; + int ret = -ENOMEM; + + spin_lock_irqsave(&vio_cmo.lock, flags); + + /* Determine the amount of free entitlement available in reserve */ + if (viodev->cmo.entitled > viodev->cmo.allocated) + reserve_free = viodev->cmo.entitled - viodev->cmo.allocated; + + /* If spare is not fulfilled, the excess pool can not be used. */ + if (vio_cmo.spare >= VIO_CMO_MIN_ENT) + excess_free = vio_cmo.excess.free; + + /* The request can be satisfied */ + if ((reserve_free + excess_free) >= size) { + vio_cmo.curr += size; + if (vio_cmo.curr > vio_cmo.high) + vio_cmo.high = vio_cmo.curr; + viodev->cmo.allocated += size; + size -= min(reserve_free, size); + vio_cmo.excess.free -= size; + ret = 0; + } + + spin_unlock_irqrestore(&vio_cmo.lock, flags); + return ret; +} + +/** + * vio_cmo_dealloc - deallocate IO memory from CMO-enable devices + * @viodev: VIO device freeing IO memory + * @size: size of deallocation + * + * IO memory is freed by the device back to the correct memory pools. + * The spare pool is replenished first from either memory pool, then + * the reserve pool is used to reduce device entitlement, the excess + * pool is used to increase the reserve pool toward the desired entitlement + * target, and then the remaining memory is returned to the pools. + * + */ +static inline void vio_cmo_dealloc(struct vio_dev *viodev, size_t size) +{ + unsigned long flags; + size_t spare_needed = 0; + size_t excess_freed = 0; + size_t reserve_freed = size; + size_t tmp; + int balance = 0; + + spin_lock_irqsave(&vio_cmo.lock, flags); + vio_cmo.curr -= size; + + /* Amount of memory freed from the excess pool */ + if (viodev->cmo.allocated > viodev->cmo.entitled) { + excess_freed = min(reserve_freed, (viodev->cmo.allocated - + viodev->cmo.entitled)); + reserve_freed -= excess_freed; + } + + /* Remove allocation from device */ + viodev->cmo.allocated -= (reserve_freed + excess_freed); + + /* Spare is a subset of the reserve pool, replenish it first. */ + spare_needed = VIO_CMO_MIN_ENT - vio_cmo.spare; + + /* + * Replenish the spare in the reserve pool from the excess pool. + * This moves entitlement into the reserve pool. + */ + if (spare_needed && excess_freed) { + tmp = min(excess_freed, spare_needed); + vio_cmo.excess.size -= tmp; + vio_cmo.reserve.size += tmp; + vio_cmo.spare += tmp; + excess_freed -= tmp; + spare_needed -= tmp; + balance = 1; + } + + /* + * Replenish the spare in the reserve pool from the reserve pool. + * This removes entitlement from the device down to VIO_CMO_MIN_ENT, + * if needed, and gives it to the spare pool. The amount of used + * memory in this pool does not change. + */ + if (spare_needed && reserve_freed) { + tmp = min3(spare_needed, reserve_freed, (viodev->cmo.entitled - VIO_CMO_MIN_ENT)); + + vio_cmo.spare += tmp; + viodev->cmo.entitled -= tmp; + reserve_freed -= tmp; + spare_needed -= tmp; + balance = 1; + } + + /* + * Increase the reserve pool until the desired allocation is met. + * Move an allocation freed from the excess pool into the reserve + * pool and schedule a balance operation. 
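+ * (Growth beyond vio_cmo.desired is not attempted here; the queued
+ * balance work redistributes anything left over.)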
+ */ + if (excess_freed && (vio_cmo.desired > vio_cmo.reserve.size)) { + tmp = min(excess_freed, (vio_cmo.desired - vio_cmo.reserve.size)); + + vio_cmo.excess.size -= tmp; + vio_cmo.reserve.size += tmp; + excess_freed -= tmp; + balance = 1; + } + + /* Return memory from the excess pool to that pool */ + if (excess_freed) + vio_cmo.excess.free += excess_freed; + + if (balance) + schedule_delayed_work(&vio_cmo.balance_q, VIO_CMO_BALANCE_DELAY); + spin_unlock_irqrestore(&vio_cmo.lock, flags); +} + +/** + * vio_cmo_entitlement_update - Manage system entitlement changes + * + * @new_entitlement: new system entitlement to attempt to accommodate + * + * Increases in entitlement will be used to fulfill the spare entitlement + * and the rest is given to the excess pool. Decreases, if they are + * possible, come from the excess pool and from unused device entitlement + * + * Returns: 0 on success, -ENOMEM when change can not be made + */ +int vio_cmo_entitlement_update(size_t new_entitlement) +{ + struct vio_dev *viodev; + struct vio_cmo_dev_entry *dev_ent; + unsigned long flags; + size_t avail, delta, tmp; + + spin_lock_irqsave(&vio_cmo.lock, flags); + + /* Entitlement increases */ + if (new_entitlement > vio_cmo.entitled) { + delta = new_entitlement - vio_cmo.entitled; + + /* Fulfill spare allocation */ + if (vio_cmo.spare < VIO_CMO_MIN_ENT) { + tmp = min(delta, (VIO_CMO_MIN_ENT - vio_cmo.spare)); + vio_cmo.spare += tmp; + vio_cmo.reserve.size += tmp; + delta -= tmp; + } + + /* Remaining new allocation goes to the excess pool */ + vio_cmo.entitled += delta; + vio_cmo.excess.size += delta; + vio_cmo.excess.free += delta; + + goto out; + } + + /* Entitlement decreases */ + delta = vio_cmo.entitled - new_entitlement; + avail = vio_cmo.excess.free; + + /* + * Need to check how much unused entitlement each device can + * sacrifice to fulfill entitlement change. + */ + list_for_each_entry(dev_ent, &vio_cmo.device_list, list) { + if (avail >= delta) + break; + + viodev = dev_ent->viodev; + if ((viodev->cmo.entitled > viodev->cmo.allocated) && + (viodev->cmo.entitled > VIO_CMO_MIN_ENT)) + avail += viodev->cmo.entitled - + max_t(size_t, viodev->cmo.allocated, + VIO_CMO_MIN_ENT); + } + + if (delta <= avail) { + vio_cmo.entitled -= delta; + + /* Take entitlement from the excess pool first */ + tmp = min(vio_cmo.excess.free, delta); + vio_cmo.excess.size -= tmp; + vio_cmo.excess.free -= tmp; + delta -= tmp; + + /* + * Remove all but VIO_CMO_MIN_ENT bytes from devices + * until entitlement change is served + */ + list_for_each_entry(dev_ent, &vio_cmo.device_list, list) { + if (!delta) + break; + + viodev = dev_ent->viodev; + tmp = 0; + if ((viodev->cmo.entitled > viodev->cmo.allocated) && + (viodev->cmo.entitled > VIO_CMO_MIN_ENT)) + tmp = viodev->cmo.entitled - + max_t(size_t, viodev->cmo.allocated, + VIO_CMO_MIN_ENT); + viodev->cmo.entitled -= min(tmp, delta); + delta -= min(tmp, delta); + } + } else { + spin_unlock_irqrestore(&vio_cmo.lock, flags); + return -ENOMEM; + } + +out: + schedule_delayed_work(&vio_cmo.balance_q, 0); + spin_unlock_irqrestore(&vio_cmo.lock, flags); + return 0; +} + +/** + * vio_cmo_balance - Balance entitlement among devices + * + * @work: work queue structure for this operation + * + * Any system entitlement above the minimum needed for devices, or + * already allocated to devices, can be distributed to the devices. 
+ * The list of devices is iterated through to recalculate the desired + * entitlement level and to determine how much entitlement above the + * minimum entitlement is allocated to devices. + * + * Small chunks of the available entitlement are given to devices until + * their requirements are fulfilled or there is no entitlement left to give. + * Upon completion sizes of the reserve and excess pools are calculated. + * + * The system minimum entitlement level is also recalculated here. + * Entitlement will be reserved for devices even after vio_bus_remove to + * accommodate reloading the driver. The OF tree is walked to count the + * number of devices present and this will remove entitlement for devices + * that have actually left the system after having vio_bus_remove called. + */ +static void vio_cmo_balance(struct work_struct *work) +{ + struct vio_cmo *cmo; + struct vio_dev *viodev; + struct vio_cmo_dev_entry *dev_ent; + unsigned long flags; + size_t avail = 0, level, chunk, need; + int devcount = 0, fulfilled; + + cmo = container_of(work, struct vio_cmo, balance_q.work); + + spin_lock_irqsave(&vio_cmo.lock, flags); + + /* Calculate minimum entitlement and fulfill spare */ + cmo->min = vio_cmo_num_OF_devs() * VIO_CMO_MIN_ENT; + BUG_ON(cmo->min > cmo->entitled); + cmo->spare = min_t(size_t, VIO_CMO_MIN_ENT, (cmo->entitled - cmo->min)); + cmo->min += cmo->spare; + cmo->desired = cmo->min; + + /* + * Determine how much entitlement is available and reset device + * entitlements + */ + avail = cmo->entitled - cmo->spare; + list_for_each_entry(dev_ent, &vio_cmo.device_list, list) { + viodev = dev_ent->viodev; + devcount++; + viodev->cmo.entitled = VIO_CMO_MIN_ENT; + cmo->desired += (viodev->cmo.desired - VIO_CMO_MIN_ENT); + avail -= max_t(size_t, viodev->cmo.allocated, VIO_CMO_MIN_ENT); + } + + /* + * Having provided each device with the minimum entitlement, loop + * over the devices portioning out the remaining entitlement + * until there is nothing left. + */ + level = VIO_CMO_MIN_ENT; + while (avail) { + fulfilled = 0; + list_for_each_entry(dev_ent, &vio_cmo.device_list, list) { + viodev = dev_ent->viodev; + + if (viodev->cmo.desired <= level) { + fulfilled++; + continue; + } + + /* + * Give the device up to VIO_CMO_BALANCE_CHUNK + * bytes of entitlement, but do not exceed the + * desired level of entitlement for the device. + */ + chunk = min_t(size_t, avail, VIO_CMO_BALANCE_CHUNK); + chunk = min(chunk, (viodev->cmo.desired - + viodev->cmo.entitled)); + viodev->cmo.entitled += chunk; + + /* + * If the memory for this entitlement increase was + * already allocated to the device it does not come + * from the available pool being portioned out. 
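+ * The max() expressions below charge only the portion of the increase
+ * that lies above both the device's current allocation and the
+ * previous level.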
+ */ + need = max(viodev->cmo.allocated, viodev->cmo.entitled)- + max(viodev->cmo.allocated, level); + avail -= need; + + } + if (fulfilled == devcount) + break; + level += VIO_CMO_BALANCE_CHUNK; + } + + /* Calculate new reserve and excess pool sizes */ + cmo->reserve.size = cmo->min; + cmo->excess.free = 0; + cmo->excess.size = 0; + need = 0; + list_for_each_entry(dev_ent, &vio_cmo.device_list, list) { + viodev = dev_ent->viodev; + /* Calculated reserve size above the minimum entitlement */ + if (viodev->cmo.entitled) + cmo->reserve.size += (viodev->cmo.entitled - + VIO_CMO_MIN_ENT); + /* Calculated used excess entitlement */ + if (viodev->cmo.allocated > viodev->cmo.entitled) + need += viodev->cmo.allocated - viodev->cmo.entitled; + } + cmo->excess.size = cmo->entitled - cmo->reserve.size; + cmo->excess.free = cmo->excess.size - need; + + cancel_delayed_work(to_delayed_work(work)); + spin_unlock_irqrestore(&vio_cmo.lock, flags); +} + +static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag, + unsigned long attrs) +{ + struct vio_dev *viodev = to_vio_dev(dev); + void *ret; + + if (vio_cmo_alloc(viodev, roundup(size, PAGE_SIZE))) { + atomic_inc(&viodev->cmo.allocs_failed); + return NULL; + } + + ret = dma_iommu_ops.alloc(dev, size, dma_handle, flag, attrs); + if (unlikely(ret == NULL)) { + vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE)); + atomic_inc(&viodev->cmo.allocs_failed); + } + + return ret; +} + +static void vio_dma_iommu_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle, + unsigned long attrs) +{ + struct vio_dev *viodev = to_vio_dev(dev); + + dma_iommu_ops.free(dev, size, vaddr, dma_handle, attrs); + + vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE)); +} + +static dma_addr_t vio_dma_iommu_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction, + unsigned long attrs) +{ + struct vio_dev *viodev = to_vio_dev(dev); + struct iommu_table *tbl; + dma_addr_t ret = IOMMU_MAPPING_ERROR; + + tbl = get_iommu_table_base(dev); + if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)))) { + atomic_inc(&viodev->cmo.allocs_failed); + return ret; + } + + ret = dma_iommu_ops.map_page(dev, page, offset, size, direction, attrs); + if (unlikely(dma_mapping_error(dev, ret))) { + vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))); + atomic_inc(&viodev->cmo.allocs_failed); + } + + return ret; +} + +static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle, + size_t size, + enum dma_data_direction direction, + unsigned long attrs) +{ + struct vio_dev *viodev = to_vio_dev(dev); + struct iommu_table *tbl; + + tbl = get_iommu_table_base(dev); + dma_iommu_ops.unmap_page(dev, dma_handle, size, direction, attrs); + + vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))); +} + +static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, + int nelems, enum dma_data_direction direction, + unsigned long attrs) +{ + struct vio_dev *viodev = to_vio_dev(dev); + struct iommu_table *tbl; + struct scatterlist *sgl; + int ret, count; + size_t alloc_size = 0; + + tbl = get_iommu_table_base(dev); + for_each_sg(sglist, sgl, nelems, count) + alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl)); + + if (vio_cmo_alloc(viodev, alloc_size)) { + atomic_inc(&viodev->cmo.allocs_failed); + return 0; + } + + ret = dma_iommu_ops.map_sg(dev, sglist, nelems, direction, attrs); + + if (unlikely(!ret)) { + 
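/* map_sg failed: return the entitlement charged above */ +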
vio_cmo_dealloc(viodev, alloc_size); + atomic_inc(&viodev->cmo.allocs_failed); + return ret; + } + + for_each_sg(sglist, sgl, ret, count) + alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); + if (alloc_size) + vio_cmo_dealloc(viodev, alloc_size); + + return ret; +} + +static void vio_dma_iommu_unmap_sg(struct device *dev, + struct scatterlist *sglist, int nelems, + enum dma_data_direction direction, + unsigned long attrs) +{ + struct vio_dev *viodev = to_vio_dev(dev); + struct iommu_table *tbl; + struct scatterlist *sgl; + size_t alloc_size = 0; + int count; + + tbl = get_iommu_table_base(dev); + for_each_sg(sglist, sgl, nelems, count) + alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); + + dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs); + + vio_cmo_dealloc(viodev, alloc_size); +} + +static int vio_dma_iommu_dma_supported(struct device *dev, u64 mask) +{ + return dma_iommu_ops.dma_supported(dev, mask); +} + +static u64 vio_dma_get_required_mask(struct device *dev) +{ + return dma_iommu_ops.get_required_mask(dev); +} + +static const struct dma_map_ops vio_dma_mapping_ops = { + .alloc = vio_dma_iommu_alloc_coherent, + .free = vio_dma_iommu_free_coherent, + .mmap = dma_nommu_mmap_coherent, + .map_sg = vio_dma_iommu_map_sg, + .unmap_sg = vio_dma_iommu_unmap_sg, + .map_page = vio_dma_iommu_map_page, + .unmap_page = vio_dma_iommu_unmap_page, + .dma_supported = vio_dma_iommu_dma_supported, + .get_required_mask = vio_dma_get_required_mask, + .mapping_error = dma_iommu_mapping_error, +}; + +/** + * vio_cmo_set_dev_desired - Set desired entitlement for a device + * + * @viodev: struct vio_dev for device to alter + * @desired: new desired entitlement level in bytes + * + * For use by devices to request a change to their entitlement at runtime or + * through sysfs. The desired entitlement level is changed and a balancing + * of system resources is scheduled to run in the future. + */ +void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired) +{ + unsigned long flags; + struct vio_cmo_dev_entry *dev_ent; + int found = 0; + + if (!firmware_has_feature(FW_FEATURE_CMO)) + return; + + spin_lock_irqsave(&vio_cmo.lock, flags); + if (desired < VIO_CMO_MIN_ENT) + desired = VIO_CMO_MIN_ENT; + + /* + * Changes will not be made for devices not in the device list. + * If it is not in the device list, then no driver is loaded + * for the device and it can not receive entitlement. + */ + list_for_each_entry(dev_ent, &vio_cmo.device_list, list) + if (viodev == dev_ent->viodev) { + found = 1; + break; + } + if (!found) { + spin_unlock_irqrestore(&vio_cmo.lock, flags); + return; + } + + /* Increase/decrease in desired device entitlement */ + if (desired >= viodev->cmo.desired) { + /* Just bump the bus and device values prior to a balance*/ + vio_cmo.desired += desired - viodev->cmo.desired; + viodev->cmo.desired = desired; + } else { + /* Decrease bus and device values for desired entitlement */ + vio_cmo.desired -= viodev->cmo.desired - desired; + viodev->cmo.desired = desired; + /* + * If less entitlement is desired than current entitlement, move + * any reserve memory in the change region to the excess pool. + */ + if (viodev->cmo.entitled > desired) { + vio_cmo.reserve.size -= viodev->cmo.entitled - desired; + vio_cmo.excess.size += viodev->cmo.entitled - desired; + /* + * If entitlement moving from the reserve pool to the + * excess pool is currently unused, add to the excess + * free counter. 
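+ * (The amount freed equals the entitlement above both the current
+ * allocation and the new desired level, as computed below. User space
+ * can trigger this path via the device's cmo_desired sysfs attribute.)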
+ */ + if (viodev->cmo.allocated < viodev->cmo.entitled) + vio_cmo.excess.free += viodev->cmo.entitled - + max(viodev->cmo.allocated, desired); + viodev->cmo.entitled = desired; + } + } + schedule_delayed_work(&vio_cmo.balance_q, 0); + spin_unlock_irqrestore(&vio_cmo.lock, flags); +} + +/** + * vio_cmo_bus_probe - Handle CMO specific bus probe activities + * + * @viodev - Pointer to struct vio_dev for device + * + * Determine the devices IO memory entitlement needs, attempting + * to satisfy the system minimum entitlement at first and scheduling + * a balance operation to take care of the rest at a later time. + * + * Returns: 0 on success, -EINVAL when device doesn't support CMO, and + * -ENOMEM when entitlement is not available for device or + * device entry. + * + */ +static int vio_cmo_bus_probe(struct vio_dev *viodev) +{ + struct vio_cmo_dev_entry *dev_ent; + struct device *dev = &viodev->dev; + struct iommu_table *tbl; + struct vio_driver *viodrv = to_vio_driver(dev->driver); + unsigned long flags; + size_t size; + bool dma_capable = false; + + tbl = get_iommu_table_base(dev); + + /* A device requires entitlement if it has a DMA window property */ + switch (viodev->family) { + case VDEVICE: + if (of_get_property(viodev->dev.of_node, + "ibm,my-dma-window", NULL)) + dma_capable = true; + break; + case PFO: + dma_capable = false; + break; + default: + dev_warn(dev, "unknown device family: %d\n", viodev->family); + BUG(); + break; + } + + /* Configure entitlement for the device. */ + if (dma_capable) { + /* Check that the driver is CMO enabled and get desired DMA */ + if (!viodrv->get_desired_dma) { + dev_err(dev, "%s: device driver does not support CMO\n", + __func__); + return -EINVAL; + } + + viodev->cmo.desired = + IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev), tbl); + if (viodev->cmo.desired < VIO_CMO_MIN_ENT) + viodev->cmo.desired = VIO_CMO_MIN_ENT; + size = VIO_CMO_MIN_ENT; + + dev_ent = kmalloc(sizeof(struct vio_cmo_dev_entry), + GFP_KERNEL); + if (!dev_ent) + return -ENOMEM; + + dev_ent->viodev = viodev; + spin_lock_irqsave(&vio_cmo.lock, flags); + list_add(&dev_ent->list, &vio_cmo.device_list); + } else { + viodev->cmo.desired = 0; + size = 0; + spin_lock_irqsave(&vio_cmo.lock, flags); + } + + /* + * If the needs for vio_cmo.min have not changed since they + * were last set, the number of devices in the OF tree has + * been constant and the IO memory for this is already in + * the reserve pool. + */ + if (vio_cmo.min == ((vio_cmo_num_OF_devs() + 1) * + VIO_CMO_MIN_ENT)) { + /* Updated desired entitlement if device requires it */ + if (size) + vio_cmo.desired += (viodev->cmo.desired - + VIO_CMO_MIN_ENT); + } else { + size_t tmp; + + tmp = vio_cmo.spare + vio_cmo.excess.free; + if (tmp < size) { + dev_err(dev, "%s: insufficient free " + "entitlement to add device. 
" + "Need %lu, have %lu\n", __func__, + size, (vio_cmo.spare + tmp)); + spin_unlock_irqrestore(&vio_cmo.lock, flags); + return -ENOMEM; + } + + /* Use excess pool first to fulfill request */ + tmp = min(size, vio_cmo.excess.free); + vio_cmo.excess.free -= tmp; + vio_cmo.excess.size -= tmp; + vio_cmo.reserve.size += tmp; + + /* Use spare if excess pool was insufficient */ + vio_cmo.spare -= size - tmp; + + /* Update bus accounting */ + vio_cmo.min += size; + vio_cmo.desired += viodev->cmo.desired; + } + spin_unlock_irqrestore(&vio_cmo.lock, flags); + return 0; +} + +/** + * vio_cmo_bus_remove - Handle CMO specific bus removal activities + * + * @viodev - Pointer to struct vio_dev for device + * + * Remove the device from the cmo device list. The minimum entitlement + * will be reserved for the device as long as it is in the system. The + * rest of the entitlement the device had been allocated will be returned + * to the system. + */ +static void vio_cmo_bus_remove(struct vio_dev *viodev) +{ + struct vio_cmo_dev_entry *dev_ent; + unsigned long flags; + size_t tmp; + + spin_lock_irqsave(&vio_cmo.lock, flags); + if (viodev->cmo.allocated) { + dev_err(&viodev->dev, "%s: device had %lu bytes of IO " + "allocated after remove operation.\n", + __func__, viodev->cmo.allocated); + BUG(); + } + + /* + * Remove the device from the device list being maintained for + * CMO enabled devices. + */ + list_for_each_entry(dev_ent, &vio_cmo.device_list, list) + if (viodev == dev_ent->viodev) { + list_del(&dev_ent->list); + kfree(dev_ent); + break; + } + + /* + * Devices may not require any entitlement and they do not need + * to be processed. Otherwise, return the device's entitlement + * back to the pools. + */ + if (viodev->cmo.entitled) { + /* + * This device has not yet left the OF tree, it's + * minimum entitlement remains in vio_cmo.min and + * vio_cmo.desired + */ + vio_cmo.desired -= (viodev->cmo.desired - VIO_CMO_MIN_ENT); + + /* + * Save min allocation for device in reserve as long + * as it exists in OF tree as determined by later + * balance operation + */ + viodev->cmo.entitled -= VIO_CMO_MIN_ENT; + + /* Replenish spare from freed reserve pool */ + if (viodev->cmo.entitled && (vio_cmo.spare < VIO_CMO_MIN_ENT)) { + tmp = min(viodev->cmo.entitled, (VIO_CMO_MIN_ENT - + vio_cmo.spare)); + vio_cmo.spare += tmp; + viodev->cmo.entitled -= tmp; + } + + /* Remaining reserve goes to excess pool */ + vio_cmo.excess.size += viodev->cmo.entitled; + vio_cmo.excess.free += viodev->cmo.entitled; + vio_cmo.reserve.size -= viodev->cmo.entitled; + + /* + * Until the device is removed it will keep a + * minimum entitlement; this will guarantee that + * a module unload/load will result in a success. + */ + viodev->cmo.entitled = VIO_CMO_MIN_ENT; + viodev->cmo.desired = VIO_CMO_MIN_ENT; + atomic_set(&viodev->cmo.allocs_failed, 0); + } + + spin_unlock_irqrestore(&vio_cmo.lock, flags); +} + +static void vio_cmo_set_dma_ops(struct vio_dev *viodev) +{ + set_dma_ops(&viodev->dev, &vio_dma_mapping_ops); +} + +/** + * vio_cmo_bus_init - CMO entitlement initialization at bus init time + * + * Set up the reserve and excess entitlement pools based on available + * system entitlement and the number of devices in the OF tree that + * require entitlement in the reserve pool. 
+ +static void vio_cmo_set_dma_ops(struct vio_dev *viodev) +{ + set_dma_ops(&viodev->dev, &vio_dma_mapping_ops); +} + +/** + * vio_cmo_bus_init - CMO entitlement initialization at bus init time + * + * Set up the reserve and excess entitlement pools based on available + * system entitlement and the number of devices in the OF tree that + * require entitlement in the reserve pool. + */ +static void vio_cmo_bus_init(void) +{ + struct hvcall_mpp_data mpp_data; + int err; + + memset(&vio_cmo, 0, sizeof(struct vio_cmo)); + spin_lock_init(&vio_cmo.lock); + INIT_LIST_HEAD(&vio_cmo.device_list); + INIT_DELAYED_WORK(&vio_cmo.balance_q, vio_cmo_balance); + + /* Get current system entitlement */ + err = h_get_mpp(&mpp_data); + + /* + * On failure, continue with entitlement set to 0, will panic() + * later when spare is reserved. + */ + if (err != H_SUCCESS) { + printk(KERN_ERR "%s: unable to determine system IO " + "entitlement. (%d)\n", __func__, err); + vio_cmo.entitled = 0; + } else { + vio_cmo.entitled = mpp_data.entitled_mem; + } + + /* Set reservation and check against entitlement */ + vio_cmo.spare = VIO_CMO_MIN_ENT; + vio_cmo.reserve.size = vio_cmo.spare; + vio_cmo.reserve.size += (vio_cmo_num_OF_devs() * + VIO_CMO_MIN_ENT); + if (vio_cmo.reserve.size > vio_cmo.entitled) { + printk(KERN_ERR "%s: insufficient system entitlement\n", + __func__); + panic("%s: Insufficient system entitlement", __func__); + } + + /* Set the remaining accounting variables */ + vio_cmo.excess.size = vio_cmo.entitled - vio_cmo.reserve.size; + vio_cmo.excess.free = vio_cmo.excess.size; + vio_cmo.min = vio_cmo.reserve.size; + vio_cmo.desired = vio_cmo.reserve.size; +} + +/* sysfs device functions and data structures for CMO */ + +#define viodev_cmo_rd_attr(name) \ +static ssize_t cmo_##name##_show(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + return sprintf(buf, "%lu\n", to_vio_dev(dev)->cmo.name); \ +} + +static ssize_t cmo_allocs_failed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct vio_dev *viodev = to_vio_dev(dev); + return sprintf(buf, "%d\n", atomic_read(&viodev->cmo.allocs_failed)); +} + +static ssize_t cmo_allocs_failed_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct vio_dev *viodev = to_vio_dev(dev); + atomic_set(&viodev->cmo.allocs_failed, 0); + return count; +} + +static ssize_t cmo_desired_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct vio_dev *viodev = to_vio_dev(dev); + size_t new_desired; + int ret; + + ret = kstrtoul(buf, 10, &new_desired); + if (ret) + return ret; + + vio_cmo_set_dev_desired(viodev, new_desired); + return count; +} + +viodev_cmo_rd_attr(desired); +viodev_cmo_rd_attr(entitled); +viodev_cmo_rd_attr(allocated); + +static ssize_t name_show(struct device *, struct device_attribute *, char *); +static ssize_t devspec_show(struct device *, struct device_attribute *, char *); +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf); + +static struct device_attribute dev_attr_name; +static struct device_attribute dev_attr_devspec; +static struct device_attribute dev_attr_modalias; + +static DEVICE_ATTR_RO(cmo_entitled); +static DEVICE_ATTR_RO(cmo_allocated); +static DEVICE_ATTR_RW(cmo_desired); +static DEVICE_ATTR_RW(cmo_allocs_failed); + +static struct attribute *vio_cmo_dev_attrs[] = { + &dev_attr_name.attr, + &dev_attr_devspec.attr, + &dev_attr_modalias.attr, + &dev_attr_cmo_entitled.attr, + &dev_attr_cmo_allocated.attr, + &dev_attr_cmo_desired.attr, + &dev_attr_cmo_allocs_failed.attr, + NULL, +}; +ATTRIBUTE_GROUPS(vio_cmo_dev);
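Once a CMO-capable device is bound, the attributes above appear under sysfs. A hedged userspace sketch for reading one of them (the unit address in the path is made up; real names vary per system):

#include <stdio.h>

int main(void)
{
	unsigned long entitled;
	/* hypothetical device path; list /sys/bus/vio/devices/ for real ones */
	FILE *f = fopen("/sys/bus/vio/devices/30000002/cmo_entitled", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%lu", &entitled) == 1)
		printf("cmo_entitled: %lu bytes\n", entitled);
	fclose(f);
	return 0;
}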
+ +/* sysfs bus functions and data structures for CMO */ + +#define viobus_cmo_rd_attr(name) \ +static ssize_t cmo_bus_##name##_show(struct bus_type *bt, char *buf) \ +{ \ + return sprintf(buf, "%lu\n", vio_cmo.name); \ +} \ +static struct bus_attribute bus_attr_cmo_bus_##name = \ + __ATTR(cmo_##name, S_IRUGO, cmo_bus_##name##_show, NULL) + +#define viobus_cmo_pool_rd_attr(name, var) \ +static ssize_t \ +cmo_##name##_##var##_show(struct bus_type *bt, char *buf) \ +{ \ + return sprintf(buf, "%lu\n", vio_cmo.name.var); \ +} \ +static BUS_ATTR_RO(cmo_##name##_##var) + +viobus_cmo_rd_attr(entitled); +viobus_cmo_rd_attr(spare); +viobus_cmo_rd_attr(min); +viobus_cmo_rd_attr(desired); +viobus_cmo_rd_attr(curr); +viobus_cmo_pool_rd_attr(reserve, size); +viobus_cmo_pool_rd_attr(excess, size); +viobus_cmo_pool_rd_attr(excess, free); + +static ssize_t cmo_high_show(struct bus_type *bt, char *buf) +{ + return sprintf(buf, "%lu\n", vio_cmo.high); +} + +static ssize_t cmo_high_store(struct bus_type *bt, const char *buf, + size_t count) +{ + unsigned long flags; + + spin_lock_irqsave(&vio_cmo.lock, flags); + vio_cmo.high = vio_cmo.curr; + spin_unlock_irqrestore(&vio_cmo.lock, flags); + + return count; +} +static BUS_ATTR_RW(cmo_high); + +static struct attribute *vio_bus_attrs[] = { + &bus_attr_cmo_bus_entitled.attr, + &bus_attr_cmo_bus_spare.attr, + &bus_attr_cmo_bus_min.attr, + &bus_attr_cmo_bus_desired.attr, + &bus_attr_cmo_bus_curr.attr, + &bus_attr_cmo_high.attr, + &bus_attr_cmo_reserve_size.attr, + &bus_attr_cmo_excess_size.attr, + &bus_attr_cmo_excess_free.attr, + NULL, +}; +ATTRIBUTE_GROUPS(vio_bus); + +static void vio_cmo_sysfs_init(void) +{ + vio_bus_type.dev_groups = vio_cmo_dev_groups; + vio_bus_type.bus_groups = vio_bus_groups; +} +#else /* CONFIG_PPC_SMLPAR */ +int vio_cmo_entitlement_update(size_t new_entitlement) { return 0; } +void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired) {} +static int vio_cmo_bus_probe(struct vio_dev *viodev) { return 0; } +static void vio_cmo_bus_remove(struct vio_dev *viodev) {} +static void vio_cmo_set_dma_ops(struct vio_dev *viodev) {} +static void vio_cmo_bus_init(void) {} +static void vio_cmo_sysfs_init(void) { } +#endif /* CONFIG_PPC_SMLPAR */ +EXPORT_SYMBOL(vio_cmo_entitlement_update); +EXPORT_SYMBOL(vio_cmo_set_dev_desired);
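On the bus side, cmo_high tracks a high-water mark of vio_cmo.curr, and its store handler deliberately ignores the written value, so any write resets the mark. A hedged sketch (the /sys/bus/vio path is assumed from the bus attribute names above):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/bus/vio/cmo_high", "w");

	if (!f)
		return 1;
	fputs("0\n", f);	/* value is ignored; the write itself resets */
	fclose(f);
	return 0;
}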
+ + +/* + * Platform Facilities Option (PFO) support + */ + +/** + * vio_h_cop_sync - Perform a synchronous PFO co-processor operation + * + * @vdev - Pointer to a struct vio_dev for device + * @op - Pointer to a struct vio_pfo_op for the operation parameters + * + * Calls the hypervisor to synchronously perform the PFO operation + * described in @op. In the case of a busy response from the hypervisor, + * the operation will be re-submitted indefinitely unless a non-zero timeout + * is specified or an error occurs. The timeout places a limit on when to + * stop re-submitting an operation; the total time can be exceeded if an + * operation is in progress. + * + * On return, op->hcall_err is set to the return code of the last H_COP + * hcall, or to 0 if an error not involving the hcall was encountered. + * + * Returns: + * 0 on success, + * -EINVAL if the h_call fails due to an invalid parameter, + * -E2BIG if the h_call cannot be performed synchronously, + * -EBUSY if a timeout is specified and has elapsed, + * -EACCES if the memory area for data/status has been rescinded, or + * -EPERM if a hardware fault has been indicated + */ +int vio_h_cop_sync(struct vio_dev *vdev, struct vio_pfo_op *op) +{ + struct device *dev = &vdev->dev; + unsigned long deadline = 0; + long hret = 0; + int ret = 0; + + if (op->timeout) + deadline = jiffies + msecs_to_jiffies(op->timeout); + + while (true) { + hret = plpar_hcall_norets(H_COP, op->flags, + vdev->resource_id, + op->in, op->inlen, op->out, + op->outlen, op->csbcpb); + + /* Stop on success or a hard error; retry busy responses + * until the optional deadline has passed. */ + if (hret == H_SUCCESS || + (hret != H_NOT_ENOUGH_RESOURCES && + hret != H_BUSY && hret != H_RESOURCE) || + (op->timeout && time_after(jiffies, deadline))) + break; + + dev_dbg(dev, "%s: hcall ret(%ld), retrying.\n", __func__, hret); + } + + switch (hret) { + case H_SUCCESS: + ret = 0; + break; + case H_OP_MODE: + case H_TOO_BIG: + ret = -E2BIG; + break; + case H_RESCINDED: + ret = -EACCES; + break; + case H_HARDWARE: + ret = -EPERM; + break; + case H_NOT_ENOUGH_RESOURCES: + case H_RESOURCE: + case H_BUSY: + ret = -EBUSY; + break; + default: + ret = -EINVAL; + break; + } + + if (ret) + dev_dbg(dev, "%s: Sync h_cop_op failure (ret:%d) (hret:%ld)\n", + __func__, ret, hret); + + op->hcall_err = hret; + return ret; +} +EXPORT_SYMBOL(vio_h_cop_sync); + +static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev) +{ + const __be32 *dma_window; + struct iommu_table *tbl; + unsigned long offset, size; + + dma_window = of_get_property(dev->dev.of_node, + "ibm,my-dma-window", NULL); + if (!dma_window) + return NULL; + + tbl = kzalloc(sizeof(*tbl), GFP_KERNEL); + if (tbl == NULL) + return NULL; + + kref_init(&tbl->it_kref); + + of_parse_dma_window(dev->dev.of_node, dma_window, + &tbl->it_index, &offset, &size); + + /* TCE table size - measured in tce entries */ + tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K; + tbl->it_size = size >> tbl->it_page_shift; + /* offset for VIO should always be 0 */ + tbl->it_offset = offset >> tbl->it_page_shift; + tbl->it_busno = 0; + tbl->it_type = TCE_VB; + tbl->it_blocksize = 16; + + if (firmware_has_feature(FW_FEATURE_LPAR)) + tbl->it_ops = &iommu_table_lpar_multi_ops; + else + tbl->it_ops = &iommu_table_pseries_ops; + + return iommu_init_table(tbl, -1); +} + +/** + * vio_match_device: - Tell if a VIO device has a matching + * VIO device id structure. + * @ids: array of VIO device id structures to search in + * @dev: the VIO device structure to match against + * + * Used by a driver to check whether a VIO device present in the + * system is in its list of supported devices. Returns the matching + * vio_device_id structure or NULL if there is no match. + */ +static const struct vio_device_id *vio_match_device( + const struct vio_device_id *ids, const struct vio_dev *dev) +{ + while (ids->type[0] != '\0') { + if ((strncmp(dev->type, ids->type, strlen(ids->type)) == 0) && + of_device_is_compatible(dev->dev.of_node, + ids->compat)) + return ids; + ids++; + } + return NULL; +}
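vio_match_device() walks a driver's table until it hits an entry whose type is the empty string, matching type as a string prefix and compat through of_device_is_compatible(). A sketch of such a table (the strings are modeled on the virtual-ethernet binding; any real driver supplies its own):

/* Illustrative id table; the empty final entry terminates the scan. */
static const struct vio_device_id example_vio_ids[] = {
	{ "network", "IBM,l-lan" },
	{ "", "" },
};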
+ +/* + * Convert from struct device to struct vio_dev and pass to driver. + * dev->driver has already been set by generic code because vio_bus_match + * succeeded. + */ +static int vio_bus_probe(struct device *dev) +{ + struct vio_dev *viodev = to_vio_dev(dev); + struct vio_driver *viodrv = to_vio_driver(dev->driver); + const struct vio_device_id *id; + int error = -ENODEV; + + if (!viodrv->probe) + return error; + + id = vio_match_device(viodrv->id_table, viodev); + if (id) { + memset(&viodev->cmo, 0, sizeof(viodev->cmo)); + if (firmware_has_feature(FW_FEATURE_CMO)) { + error = vio_cmo_bus_probe(viodev); + if (error) + return error; + } + error = viodrv->probe(viodev, id); + if (error && firmware_has_feature(FW_FEATURE_CMO)) + vio_cmo_bus_remove(viodev); + } + + return error; +} + +/* convert from struct device to struct vio_dev and pass to driver. */ +static int vio_bus_remove(struct device *dev) +{ + struct vio_dev *viodev = to_vio_dev(dev); + struct vio_driver *viodrv = to_vio_driver(dev->driver); + struct device *devptr; + int ret = 1; + + /* + * Hold a reference to the device after the remove function is called + * to allow for CMO accounting cleanup for the device. + */ + devptr = get_device(dev); + + if (viodrv->remove) + ret = viodrv->remove(viodev); + + if (!ret && firmware_has_feature(FW_FEATURE_CMO)) + vio_cmo_bus_remove(viodev); + + put_device(devptr); + return ret; +} + +/** + * vio_register_driver: - Register a new vio driver + * @viodrv: The vio_driver structure to be registered. + */ +int __vio_register_driver(struct vio_driver *viodrv, struct module *owner, + const char *mod_name) +{ + pr_debug("%s: driver %s registering\n", __func__, viodrv->name); + + /* fill in 'struct driver' fields */ + viodrv->driver.name = viodrv->name; + viodrv->driver.pm = viodrv->pm; + viodrv->driver.bus = &vio_bus_type; + viodrv->driver.owner = owner; + viodrv->driver.mod_name = mod_name; + + return driver_register(&viodrv->driver); +} +EXPORT_SYMBOL(__vio_register_driver); + +/** + * vio_unregister_driver - Remove registration of vio driver. + * @viodrv: The vio_driver struct to be removed from registration + */ +void vio_unregister_driver(struct vio_driver *viodrv) +{ + driver_unregister(&viodrv->driver); +} +EXPORT_SYMBOL(vio_unregister_driver); + +/* vio_dev refcount hit 0 */ +static void vio_dev_release(struct device *dev) +{ + struct iommu_table *tbl = get_iommu_table_base(dev); + + if (tbl) + iommu_tce_table_put(tbl); + of_node_put(dev->of_node); + kfree(to_vio_dev(dev)); +}
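Tying the probe/remove plumbing together, a minimal hedged driver skeleton (names invented; vio_register_driver() is assumed to be the usual wrapper that passes THIS_MODULE and KBUILD_MODNAME to __vio_register_driver(); the id table is the one sketched after vio_match_device()):

static int example_probe(struct vio_dev *vdev,
			 const struct vio_device_id *id)
{
	dev_info(&vdev->dev, "bound via vio_bus_probe()\n");
	return 0;	/* a real driver would set up its queues here */
}

static int example_remove(struct vio_dev *vdev)
{
	return 0;	/* CMO cleanup then runs in vio_bus_remove() */
}

static struct vio_driver example_driver = {
	.name		= "example-vio",
	.id_table	= example_vio_ids,
	.probe		= example_probe,
	.remove		= example_remove,
};

static int __init example_init(void)
{
	return vio_register_driver(&example_driver);
}

static void __exit example_exit(void)
{
	vio_unregister_driver(&example_driver);
}
module_init(example_init);
module_exit(example_exit);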
+ +/** + * vio_register_device_node: - Register a new vio device. + * @of_node: The OF node for this device. + * + * Creates and initializes a vio_dev structure from the data in + * of_node and adds it to the list of virtual devices. + * Returns a pointer to the created vio_dev or NULL if node has + * NULL device_type or compatible fields. + */ +struct vio_dev *vio_register_device_node(struct device_node *of_node) +{ + struct vio_dev *viodev; + struct device_node *parent_node; + const __be32 *prop; + enum vio_dev_family family; + const char *of_node_name = of_node->name ? of_node->name : "<unknown>"; + + /* + * Determine if this node is under the /vdevice node or under the + * /ibm,platform-facilities node. This decides the device's family. + */ + parent_node = of_get_parent(of_node); + if (parent_node) { + if (!strcmp(parent_node->type, "ibm,platform-facilities")) + family = PFO; + else if (!strcmp(parent_node->type, "vdevice")) + family = VDEVICE; + else { + pr_warn("%s: parent(%pOF) of %s not recognized.\n", + __func__, + parent_node, + of_node_name); + of_node_put(parent_node); + return NULL; + } + of_node_put(parent_node); + } else { + pr_warn("%s: could not determine the parent of node %s.\n", + __func__, of_node_name); + return NULL; + } + + if (family == PFO) { + if (of_get_property(of_node, "interrupt-controller", NULL)) { + pr_debug("%s: Skipping the interrupt controller %s.\n", + __func__, of_node_name); + return NULL; + } + } + + /* allocate a vio_dev for this node */ + viodev = kzalloc(sizeof(struct vio_dev), GFP_KERNEL); + if (viodev == NULL) { + pr_warn("%s: allocation failure for VIO device.\n", __func__); + return NULL; + } + + /* we need the 'device_type' property, in order to match with drivers */ + viodev->family = family; + if (viodev->family == VDEVICE) { + unsigned int unit_address; + + if (of_node->type != NULL) + viodev->type = of_node->type; + else { + pr_warn("%s: node %s is missing the 'device_type' " + "property.\n", __func__, of_node_name); + goto out; + } + + prop = of_get_property(of_node, "reg", NULL); + if (prop == NULL) { + pr_warn("%s: node %s missing 'reg'\n", + __func__, of_node_name); + goto out; + } + unit_address = of_read_number(prop, 1); + dev_set_name(&viodev->dev, "%x", unit_address); + viodev->irq = irq_of_parse_and_map(of_node, 0); + viodev->unit_address = unit_address; + } else { + /* PFO devices need their resource_id for submitting COP_OPs + * This is an optional field for devices, but is required when + * performing synchronous ops */ + prop = of_get_property(of_node, "ibm,resource-id", NULL); + if (prop != NULL) + viodev->resource_id = of_read_number(prop, 1); + + dev_set_name(&viodev->dev, "%s", of_node_name); + viodev->type = of_node_name; + viodev->irq = 0; + } + + viodev->name = of_node->name; + viodev->dev.of_node = of_node_get(of_node); + + set_dev_node(&viodev->dev, of_node_to_nid(of_node)); + + /* init generic 'struct device' fields: */ + viodev->dev.parent = &vio_bus_device.dev; + viodev->dev.bus = &vio_bus_type; + viodev->dev.release = vio_dev_release; + + if (of_get_property(viodev->dev.of_node, "ibm,my-dma-window", NULL)) { + if (firmware_has_feature(FW_FEATURE_CMO)) + vio_cmo_set_dma_ops(viodev); + else + set_dma_ops(&viodev->dev, &dma_iommu_ops); + + set_iommu_table_base(&viodev->dev, + vio_build_iommu_table(viodev)); + + /* needed to ensure proper operation of coherent allocations + * later, in case driver doesn't set it explicitly */ + viodev->dev.coherent_dma_mask = DMA_BIT_MASK(64); + viodev->dev.dma_mask = &viodev->dev.coherent_dma_mask; + } + + /* register with generic device framework */ + if (device_register(&viodev->dev)) { + printk(KERN_ERR "%s: failed to register device %s\n", + __func__, dev_name(&viodev->dev)); + put_device(&viodev->dev); + return NULL; + } + + return viodev; + +out: /* Use this exit point for any return prior to device_register */ + kfree(viodev); + + return NULL; +} +EXPORT_SYMBOL(vio_register_device_node); + +/* + * vio_bus_scan_register_devices - Scan OF and register each child device + * @root_name - OF node name for the root of the subtree to search. + * This must be non-NULL + * + * Starting from the root node provided, register the device node for + * each child beneath the root.
+ */ +static void vio_bus_scan_register_devices(char *root_name) +{ + struct device_node *node_root, *node_child; + + if (!root_name) + return; + + node_root = of_find_node_by_name(NULL, root_name); + if (node_root) { + + /* + * Create struct vio_devices for each virtual device in + * the device tree. Drivers will associate with them later. + */ + node_child = of_get_next_child(node_root, NULL); + while (node_child) { + vio_register_device_node(node_child); + node_child = of_get_next_child(node_root, node_child); + } + of_node_put(node_root); + } +} + +/** + * vio_bus_init: - Initialize the virtual IO bus + */ +static int __init vio_bus_init(void) +{ + int err; + + if (firmware_has_feature(FW_FEATURE_CMO)) + vio_cmo_sysfs_init(); + + err = bus_register(&vio_bus_type); + if (err) { + printk(KERN_ERR "failed to register VIO bus\n"); + return err; + } + + /* + * The fake parent of all vio devices, just to give us + * a nice directory + */ + err = device_register(&vio_bus_device.dev); + if (err) { + printk(KERN_WARNING "%s: device_register returned %i\n", + __func__, err); + return err; + } + + if (firmware_has_feature(FW_FEATURE_CMO)) + vio_cmo_bus_init(); + + return 0; +} +postcore_initcall(vio_bus_init); + +static int __init vio_device_init(void) +{ + vio_bus_scan_register_devices("vdevice"); + vio_bus_scan_register_devices("ibm,platform-facilities"); + + return 0; +} +device_initcall(vio_device_init); + +static ssize_t name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", to_vio_dev(dev)->name); +} +static DEVICE_ATTR_RO(name); + +static ssize_t devspec_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct device_node *of_node = dev->of_node; + + return sprintf(buf, "%pOF\n", of_node); +} +static DEVICE_ATTR_RO(devspec); + +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + const struct vio_dev *vio_dev = to_vio_dev(dev); + struct device_node *dn; + const char *cp; + + dn = dev->of_node; + if (!dn) { + strcpy(buf, "\n"); + return strlen(buf); + } + cp = of_get_property(dn, "compatible", NULL); + if (!cp) { + strcpy(buf, "\n"); + return strlen(buf); + } + + return sprintf(buf, "vio:T%sS%s\n", vio_dev->type, cp); +} +static DEVICE_ATTR_RO(modalias); + +static struct attribute *vio_dev_attrs[] = { + &dev_attr_name.attr, + &dev_attr_devspec.attr, + &dev_attr_modalias.attr, + NULL, +}; +ATTRIBUTE_GROUPS(vio_dev); + +void vio_unregister_device(struct vio_dev *viodev) +{ + device_unregister(&viodev->dev); + if (viodev->family == VDEVICE) + irq_dispose_mapping(viodev->irq); +} +EXPORT_SYMBOL(vio_unregister_device); + +static int vio_bus_match(struct device *dev, struct device_driver *drv) +{ + const struct vio_dev *vio_dev = to_vio_dev(dev); + struct vio_driver *vio_drv = to_vio_driver(drv); + const struct vio_device_id *ids = vio_drv->id_table; + + return (ids != NULL) && (vio_match_device(ids, vio_dev) != NULL); +} + +static int vio_hotplug(struct device *dev, struct kobj_uevent_env *env) +{ + const struct vio_dev *vio_dev = to_vio_dev(dev); + struct device_node *dn; + const char *cp; + + dn = dev->of_node; + if (!dn) + return -ENODEV; + cp = of_get_property(dn, "compatible", NULL); + if (!cp) + return -ENODEV; + + add_uevent_var(env, "MODALIAS=vio:T%sS%s", vio_dev->type, cp); + return 0; +} + +struct bus_type vio_bus_type = { + .name = "vio", + .dev_groups = vio_dev_groups, + .uevent = vio_hotplug, + .match = vio_bus_match, + .probe = vio_bus_probe, + .remove = 
vio_bus_remove, +}; + +/** + * vio_get_attribute: - get attribute for virtual device + * @vdev: The vio device whose property is returned. + * @which: The property/attribute to be extracted. + * @length: Pointer to length of returned data size (unused if NULL). + * + * Calls prom.c's of_get_property() to return the value of the + * attribute specified by @which. + */ +const void *vio_get_attribute(struct vio_dev *vdev, char *which, int *length) +{ + return of_get_property(vdev->dev.of_node, which, length); +} +EXPORT_SYMBOL(vio_get_attribute); + +#ifdef CONFIG_PPC_PSERIES +/* vio_find_name() - internal because only vio.c knows how we formatted the + * kobject name + */ +static struct vio_dev *vio_find_name(const char *name) +{ + struct device *found; + + found = bus_find_device_by_name(&vio_bus_type, NULL, name); + if (!found) + return NULL; + + return to_vio_dev(found); +} + +/** + * vio_find_node - find an already-registered vio_dev + * @vnode: device_node of the virtual device we're looking for + * + * Takes a reference to the embedded struct device which needs to be dropped + * after use. + */ +struct vio_dev *vio_find_node(struct device_node *vnode) +{ + char kobj_name[20]; + struct device_node *vnode_parent; + const char *dev_type; + + vnode_parent = of_get_parent(vnode); + if (!vnode_parent) + return NULL; + + dev_type = of_get_property(vnode_parent, "device_type", NULL); + of_node_put(vnode_parent); + if (!dev_type) + return NULL; + + /* construct the kobject name from the device node */ + if (!strcmp(dev_type, "vdevice")) { + const __be32 *prop; + + prop = of_get_property(vnode, "reg", NULL); + if (!prop) + return NULL; + snprintf(kobj_name, sizeof(kobj_name), "%x", + (uint32_t)of_read_number(prop, 1)); + } else if (!strcmp(dev_type, "ibm,platform-facilities")) + snprintf(kobj_name, sizeof(kobj_name), "%s", vnode->name); + else + return NULL; + + return vio_find_name(kobj_name); +} +EXPORT_SYMBOL(vio_find_node); + +int vio_enable_interrupts(struct vio_dev *dev) +{ + int rc = h_vio_signal(dev->unit_address, VIO_IRQ_ENABLE); + if (rc != H_SUCCESS) + printk(KERN_ERR "vio: Error 0x%x enabling interrupts\n", rc); + return rc; +} +EXPORT_SYMBOL(vio_enable_interrupts); + +int vio_disable_interrupts(struct vio_dev *dev) +{ + int rc = h_vio_signal(dev->unit_address, VIO_IRQ_DISABLE); + if (rc != H_SUCCESS) + printk(KERN_ERR "vio: Error 0x%x disabling interrupts\n", rc); + return rc; +} +EXPORT_SYMBOL(vio_disable_interrupts); +#endif /* CONFIG_PPC_PSERIES */
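To close, a hedged sketch of the lookup helpers exported above (the function name is invented and the property read is only an example; the reference taken by vio_find_node() must be dropped with put_device()):

static int example_query(struct device_node *vnode)
{
	struct vio_dev *vdev;
	const __be32 *reg;
	int len;

	vdev = vio_find_node(vnode);
	if (!vdev)
		return -ENODEV;

	reg = vio_get_attribute(vdev, "reg", &len);	/* example property */
	if (reg)
		dev_info(&vdev->dev, "'reg' is %d bytes\n", len);

	put_device(&vdev->dev);	/* drop vio_find_node()'s reference */
	return 0;
}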